{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.46803435666036,
"eval_steps": 500,
"global_step": 900000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 3.8533715074003996e-06,
"grad_norm": 33.24064254760742,
"learning_rate": 5e-05,
"loss": 10.4552,
"step": 1
},
{
"epoch": 0.0038533715074004,
"grad_norm": 2.1120877265930176,
"learning_rate": 4.99445e-05,
"loss": 4.0146,
"step": 1000
},
{
"epoch": 0.0077067430148008,
"grad_norm": 1.7379038333892822,
"learning_rate": 4.9888944444444445e-05,
"loss": 2.9699,
"step": 2000
},
{
"epoch": 0.0115601145222012,
"grad_norm": 1.6360819339752197,
"learning_rate": 4.9833388888888894e-05,
"loss": 2.6552,
"step": 3000
},
{
"epoch": 0.0154134860296016,
"grad_norm": 1.456438660621643,
"learning_rate": 4.9777833333333336e-05,
"loss": 2.4387,
"step": 4000
},
{
"epoch": 0.019266857537002,
"grad_norm": 1.4404393434524536,
"learning_rate": 4.972227777777778e-05,
"loss": 2.288,
"step": 5000
},
{
"epoch": 0.0231202290444024,
"grad_norm": 1.476318359375,
"learning_rate": 4.966672222222222e-05,
"loss": 2.1816,
"step": 6000
},
{
"epoch": 0.026973600551802798,
"grad_norm": 1.4118831157684326,
"learning_rate": 4.961116666666667e-05,
"loss": 2.1048,
"step": 7000
},
{
"epoch": 0.0308269720592032,
"grad_norm": 1.3684428930282593,
"learning_rate": 4.955561111111111e-05,
"loss": 2.0433,
"step": 8000
},
{
"epoch": 0.0346803435666036,
"grad_norm": 1.32038414478302,
"learning_rate": 4.9500055555555555e-05,
"loss": 1.9921,
"step": 9000
},
{
"epoch": 0.038533715074004,
"grad_norm": 1.3200163841247559,
"learning_rate": 4.9444500000000004e-05,
"loss": 1.9502,
"step": 10000
},
{
"epoch": 0.0423870865814044,
"grad_norm": 1.2274246215820312,
"learning_rate": 4.9388944444444446e-05,
"loss": 1.9121,
"step": 11000
},
{
"epoch": 0.0462404580888048,
"grad_norm": 1.2220534086227417,
"learning_rate": 4.9333388888888896e-05,
"loss": 1.8812,
"step": 12000
},
{
"epoch": 0.0500938295962052,
"grad_norm": 1.186949610710144,
"learning_rate": 4.927783333333333e-05,
"loss": 1.8519,
"step": 13000
},
{
"epoch": 0.053947201103605597,
"grad_norm": 1.1999038457870483,
"learning_rate": 4.922227777777778e-05,
"loss": 1.8266,
"step": 14000
},
{
"epoch": 0.057800572611006,
"grad_norm": 1.150551676750183,
"learning_rate": 4.916672222222222e-05,
"loss": 1.8038,
"step": 15000
},
{
"epoch": 0.0616539441184064,
"grad_norm": 1.1635456085205078,
"learning_rate": 4.911116666666667e-05,
"loss": 1.7846,
"step": 16000
},
{
"epoch": 0.0655073156258068,
"grad_norm": 1.1746997833251953,
"learning_rate": 4.9055611111111114e-05,
"loss": 1.7659,
"step": 17000
},
{
"epoch": 0.0693606871332072,
"grad_norm": 1.1516313552856445,
"learning_rate": 4.9000055555555556e-05,
"loss": 1.7487,
"step": 18000
},
{
"epoch": 0.0732140586406076,
"grad_norm": 1.1180574893951416,
"learning_rate": 4.8944500000000005e-05,
"loss": 1.7329,
"step": 19000
},
{
"epoch": 0.077067430148008,
"grad_norm": 1.0941097736358643,
"learning_rate": 4.888894444444445e-05,
"loss": 1.7186,
"step": 20000
},
{
"epoch": 0.0809208016554084,
"grad_norm": 1.0867908000946045,
"learning_rate": 4.883338888888889e-05,
"loss": 1.7059,
"step": 21000
},
{
"epoch": 0.0847741731628088,
"grad_norm": 1.1522151231765747,
"learning_rate": 4.877783333333333e-05,
"loss": 1.6922,
"step": 22000
},
{
"epoch": 0.0886275446702092,
"grad_norm": 1.0744513273239136,
"learning_rate": 4.872227777777778e-05,
"loss": 1.6789,
"step": 23000
},
{
"epoch": 0.0924809161776096,
"grad_norm": 1.0637524127960205,
"learning_rate": 4.8666722222222224e-05,
"loss": 1.6685,
"step": 24000
},
{
"epoch": 0.09633428768501,
"grad_norm": 1.1220489740371704,
"learning_rate": 4.8611166666666666e-05,
"loss": 1.6582,
"step": 25000
},
{
"epoch": 0.1001876591924104,
"grad_norm": 1.0609642267227173,
"learning_rate": 4.8555611111111115e-05,
"loss": 1.6472,
"step": 26000
},
{
"epoch": 0.1040410306998108,
"grad_norm": 1.0400673151016235,
"learning_rate": 4.850005555555556e-05,
"loss": 1.6383,
"step": 27000
},
{
"epoch": 0.10789440220721119,
"grad_norm": 0.982431948184967,
"learning_rate": 4.844450000000001e-05,
"loss": 1.6292,
"step": 28000
},
{
"epoch": 0.1117477737146116,
"grad_norm": 0.9695896506309509,
"learning_rate": 4.838894444444444e-05,
"loss": 1.6205,
"step": 29000
},
{
"epoch": 0.115601145222012,
"grad_norm": 1.0017755031585693,
"learning_rate": 4.833338888888889e-05,
"loss": 1.6121,
"step": 30000
},
{
"epoch": 0.1194545167294124,
"grad_norm": 0.9817500114440918,
"learning_rate": 4.8277833333333334e-05,
"loss": 1.6047,
"step": 31000
},
{
"epoch": 0.1233078882368128,
"grad_norm": 0.9635987877845764,
"learning_rate": 4.822227777777778e-05,
"loss": 1.597,
"step": 32000
},
{
"epoch": 0.1271612597442132,
"grad_norm": 1.0124236345291138,
"learning_rate": 4.8166722222222225e-05,
"loss": 1.5886,
"step": 33000
},
{
"epoch": 0.1310146312516136,
"grad_norm": 0.9595442414283752,
"learning_rate": 4.811116666666667e-05,
"loss": 1.5812,
"step": 34000
},
{
"epoch": 0.134868002759014,
"grad_norm": 0.9507773518562317,
"learning_rate": 4.805561111111112e-05,
"loss": 1.5757,
"step": 35000
},
{
"epoch": 0.1387213742664144,
"grad_norm": 1.0198057889938354,
"learning_rate": 4.800005555555556e-05,
"loss": 1.5693,
"step": 36000
},
{
"epoch": 0.1425747457738148,
"grad_norm": 0.9612113833427429,
"learning_rate": 4.79445e-05,
"loss": 1.5633,
"step": 37000
},
{
"epoch": 0.1464281172812152,
"grad_norm": 0.9176149368286133,
"learning_rate": 4.7888944444444444e-05,
"loss": 1.5563,
"step": 38000
},
{
"epoch": 0.1502814887886156,
"grad_norm": 0.9220606684684753,
"learning_rate": 4.783338888888889e-05,
"loss": 1.551,
"step": 39000
},
{
"epoch": 0.154134860296016,
"grad_norm": 0.9286023378372192,
"learning_rate": 4.7777833333333335e-05,
"loss": 1.5454,
"step": 40000
},
{
"epoch": 0.1579882318034164,
"grad_norm": 0.9021902680397034,
"learning_rate": 4.772227777777778e-05,
"loss": 1.5416,
"step": 41000
},
{
"epoch": 0.1618416033108168,
"grad_norm": 0.961746871471405,
"learning_rate": 4.766672222222223e-05,
"loss": 1.5355,
"step": 42000
},
{
"epoch": 0.1656949748182172,
"grad_norm": 0.8932749629020691,
"learning_rate": 4.761116666666667e-05,
"loss": 1.531,
"step": 43000
},
{
"epoch": 0.1695483463256176,
"grad_norm": 0.8928858041763306,
"learning_rate": 4.755561111111111e-05,
"loss": 1.5252,
"step": 44000
},
{
"epoch": 0.173401717833018,
"grad_norm": 0.9956013560295105,
"learning_rate": 4.7500055555555554e-05,
"loss": 1.5208,
"step": 45000
},
{
"epoch": 0.1772550893404184,
"grad_norm": 0.8892582058906555,
"learning_rate": 4.74445e-05,
"loss": 1.5159,
"step": 46000
},
{
"epoch": 0.1811084608478188,
"grad_norm": 0.9004553556442261,
"learning_rate": 4.7388944444444445e-05,
"loss": 1.5119,
"step": 47000
},
{
"epoch": 0.1849618323552192,
"grad_norm": 0.884730875492096,
"learning_rate": 4.7333388888888894e-05,
"loss": 1.507,
"step": 48000
},
{
"epoch": 0.1888152038626196,
"grad_norm": 0.866369903087616,
"learning_rate": 4.727783333333334e-05,
"loss": 1.5036,
"step": 49000
},
{
"epoch": 0.19266857537002,
"grad_norm": 0.8980478048324585,
"learning_rate": 4.722227777777778e-05,
"loss": 1.4993,
"step": 50000
},
{
"epoch": 0.1965219468774204,
"grad_norm": 0.9032998085021973,
"learning_rate": 4.716672222222223e-05,
"loss": 1.4949,
"step": 51000
},
{
"epoch": 0.2003753183848208,
"grad_norm": 0.8740929961204529,
"learning_rate": 4.711116666666667e-05,
"loss": 1.4919,
"step": 52000
},
{
"epoch": 0.2042286898922212,
"grad_norm": 0.8779985308647156,
"learning_rate": 4.705561111111111e-05,
"loss": 1.4876,
"step": 53000
},
{
"epoch": 0.2080820613996216,
"grad_norm": 0.8598712086677551,
"learning_rate": 4.7000055555555555e-05,
"loss": 1.4847,
"step": 54000
},
{
"epoch": 0.211935432907022,
"grad_norm": 0.8621186017990112,
"learning_rate": 4.6944500000000004e-05,
"loss": 1.4805,
"step": 55000
},
{
"epoch": 0.21578880441442239,
"grad_norm": 0.8625257015228271,
"learning_rate": 4.688894444444445e-05,
"loss": 1.4771,
"step": 56000
},
{
"epoch": 0.2196421759218228,
"grad_norm": 0.83707195520401,
"learning_rate": 4.683338888888889e-05,
"loss": 1.473,
"step": 57000
},
{
"epoch": 0.2234955474292232,
"grad_norm": 0.8472415804862976,
"learning_rate": 4.677783333333334e-05,
"loss": 1.4703,
"step": 58000
},
{
"epoch": 0.2273489189366236,
"grad_norm": 0.8501800298690796,
"learning_rate": 4.672227777777778e-05,
"loss": 1.4677,
"step": 59000
},
{
"epoch": 0.231202290444024,
"grad_norm": 0.8390816450119019,
"learning_rate": 4.666672222222222e-05,
"loss": 1.4632,
"step": 60000
},
{
"epoch": 0.2350556619514244,
"grad_norm": 0.8502111434936523,
"learning_rate": 4.6611166666666665e-05,
"loss": 1.4607,
"step": 61000
},
{
"epoch": 0.2389090334588248,
"grad_norm": 0.8513786196708679,
"learning_rate": 4.6555611111111114e-05,
"loss": 1.4569,
"step": 62000
},
{
"epoch": 0.2427624049662252,
"grad_norm": 0.8665297031402588,
"learning_rate": 4.6500055555555557e-05,
"loss": 1.4541,
"step": 63000
},
{
"epoch": 0.2466157764736256,
"grad_norm": 0.8247693777084351,
"learning_rate": 4.64445e-05,
"loss": 1.4519,
"step": 64000
},
{
"epoch": 0.250469147981026,
"grad_norm": 0.8859081268310547,
"learning_rate": 4.638894444444445e-05,
"loss": 1.4494,
"step": 65000
},
{
"epoch": 0.2543225194884264,
"grad_norm": 0.816459059715271,
"learning_rate": 4.633338888888889e-05,
"loss": 1.4457,
"step": 66000
},
{
"epoch": 0.2581758909958268,
"grad_norm": 0.8117705583572388,
"learning_rate": 4.627783333333334e-05,
"loss": 1.4426,
"step": 67000
},
{
"epoch": 0.2620292625032272,
"grad_norm": 0.8296411633491516,
"learning_rate": 4.6222277777777775e-05,
"loss": 1.4401,
"step": 68000
},
{
"epoch": 0.2658826340106276,
"grad_norm": 0.8232107758522034,
"learning_rate": 4.6166722222222224e-05,
"loss": 1.4377,
"step": 69000
},
{
"epoch": 0.269736005518028,
"grad_norm": 0.7909438014030457,
"learning_rate": 4.6111166666666667e-05,
"loss": 1.4357,
"step": 70000
},
{
"epoch": 0.2735893770254284,
"grad_norm": 0.8350186347961426,
"learning_rate": 4.6055611111111116e-05,
"loss": 1.4328,
"step": 71000
},
{
"epoch": 0.2774427485328288,
"grad_norm": 0.8087278604507446,
"learning_rate": 4.600005555555556e-05,
"loss": 1.4311,
"step": 72000
},
{
"epoch": 0.2812961200402292,
"grad_norm": 0.8941106200218201,
"learning_rate": 4.59445e-05,
"loss": 1.4278,
"step": 73000
},
{
"epoch": 0.2851494915476296,
"grad_norm": 0.8181429505348206,
"learning_rate": 4.588894444444445e-05,
"loss": 1.4262,
"step": 74000
},
{
"epoch": 0.28900286305503,
"grad_norm": 0.8224203586578369,
"learning_rate": 4.583338888888889e-05,
"loss": 1.4239,
"step": 75000
},
{
"epoch": 0.2928562345624304,
"grad_norm": 0.7924049496650696,
"learning_rate": 4.5777833333333334e-05,
"loss": 1.421,
"step": 76000
},
{
"epoch": 0.2967096060698308,
"grad_norm": 0.7774991393089294,
"learning_rate": 4.5722277777777776e-05,
"loss": 1.4189,
"step": 77000
},
{
"epoch": 0.3005629775772312,
"grad_norm": 0.8097211122512817,
"learning_rate": 4.5666722222222226e-05,
"loss": 1.4154,
"step": 78000
},
{
"epoch": 0.3044163490846316,
"grad_norm": 0.7798463702201843,
"learning_rate": 4.5611166666666675e-05,
"loss": 1.4146,
"step": 79000
},
{
"epoch": 0.308269720592032,
"grad_norm": 0.8168286681175232,
"learning_rate": 4.555561111111111e-05,
"loss": 1.4126,
"step": 80000
},
{
"epoch": 0.3121230920994324,
"grad_norm": 0.7852460145950317,
"learning_rate": 4.550005555555556e-05,
"loss": 1.4089,
"step": 81000
},
{
"epoch": 0.3159764636068328,
"grad_norm": 0.779107928276062,
"learning_rate": 4.54445e-05,
"loss": 1.4072,
"step": 82000
},
{
"epoch": 0.3198298351142332,
"grad_norm": 0.7704175710678101,
"learning_rate": 4.538894444444445e-05,
"loss": 1.4056,
"step": 83000
},
{
"epoch": 0.3236832066216336,
"grad_norm": 0.7774575352668762,
"learning_rate": 4.5333388888888886e-05,
"loss": 1.4044,
"step": 84000
},
{
"epoch": 0.327536578129034,
"grad_norm": 0.7762672305107117,
"learning_rate": 4.5277833333333336e-05,
"loss": 1.4018,
"step": 85000
},
{
"epoch": 0.3313899496364344,
"grad_norm": 0.7741659283638,
"learning_rate": 4.522227777777778e-05,
"loss": 1.3992,
"step": 86000
},
{
"epoch": 0.3352433211438348,
"grad_norm": 0.8049792051315308,
"learning_rate": 4.516672222222223e-05,
"loss": 1.3978,
"step": 87000
},
{
"epoch": 0.3390966926512352,
"grad_norm": 0.7718026041984558,
"learning_rate": 4.511116666666667e-05,
"loss": 1.3955,
"step": 88000
},
{
"epoch": 0.3429500641586356,
"grad_norm": 0.7472023367881775,
"learning_rate": 4.505561111111111e-05,
"loss": 1.394,
"step": 89000
},
{
"epoch": 0.346803435666036,
"grad_norm": 0.7536051869392395,
"learning_rate": 4.500005555555556e-05,
"loss": 1.3924,
"step": 90000
},
{
"epoch": 0.3506568071734364,
"grad_norm": 0.774494469165802,
"learning_rate": 4.49445e-05,
"loss": 1.3913,
"step": 91000
},
{
"epoch": 0.3545101786808368,
"grad_norm": 0.7755584120750427,
"learning_rate": 4.4888944444444445e-05,
"loss": 1.3888,
"step": 92000
},
{
"epoch": 0.3583635501882372,
"grad_norm": 0.748931348323822,
"learning_rate": 4.483338888888889e-05,
"loss": 1.3875,
"step": 93000
},
{
"epoch": 0.3622169216956376,
"grad_norm": 0.7655521035194397,
"learning_rate": 4.477783333333334e-05,
"loss": 1.3857,
"step": 94000
},
{
"epoch": 0.366070293203038,
"grad_norm": 0.7640711665153503,
"learning_rate": 4.472227777777778e-05,
"loss": 1.3838,
"step": 95000
},
{
"epoch": 0.3699236647104384,
"grad_norm": 0.7506535649299622,
"learning_rate": 4.466672222222222e-05,
"loss": 1.3816,
"step": 96000
},
{
"epoch": 0.3737770362178388,
"grad_norm": 0.7430715560913086,
"learning_rate": 4.461116666666667e-05,
"loss": 1.3796,
"step": 97000
},
{
"epoch": 0.3776304077252392,
"grad_norm": 0.733686089515686,
"learning_rate": 4.455561111111111e-05,
"loss": 1.3782,
"step": 98000
},
{
"epoch": 0.3814837792326396,
"grad_norm": 0.7562544941902161,
"learning_rate": 4.4500055555555555e-05,
"loss": 1.3782,
"step": 99000
},
{
"epoch": 0.38533715074004,
"grad_norm": 0.7897418141365051,
"learning_rate": 4.44445e-05,
"loss": 1.3758,
"step": 100000
},
{
"epoch": 0.3891905222474404,
"grad_norm": 0.7322382926940918,
"learning_rate": 4.438894444444445e-05,
"loss": 1.3746,
"step": 101000
},
{
"epoch": 0.3930438937548408,
"grad_norm": 0.7251117825508118,
"learning_rate": 4.4333388888888896e-05,
"loss": 1.3727,
"step": 102000
},
{
"epoch": 0.3968972652622412,
"grad_norm": 0.7524704337120056,
"learning_rate": 4.427783333333334e-05,
"loss": 1.3704,
"step": 103000
},
{
"epoch": 0.4007506367696416,
"grad_norm": 0.7829206585884094,
"learning_rate": 4.422227777777778e-05,
"loss": 1.3695,
"step": 104000
},
{
"epoch": 0.404604008277042,
"grad_norm": 0.7486142516136169,
"learning_rate": 4.416672222222222e-05,
"loss": 1.3685,
"step": 105000
},
{
"epoch": 0.4084573797844424,
"grad_norm": 0.7729934453964233,
"learning_rate": 4.411116666666667e-05,
"loss": 1.3664,
"step": 106000
},
{
"epoch": 0.4123107512918428,
"grad_norm": 0.7546641826629639,
"learning_rate": 4.4055611111111114e-05,
"loss": 1.3652,
"step": 107000
},
{
"epoch": 0.4161641227992432,
"grad_norm": 0.7382177710533142,
"learning_rate": 4.400005555555556e-05,
"loss": 1.3644,
"step": 108000
},
{
"epoch": 0.4200174943066436,
"grad_norm": 0.7239982485771179,
"learning_rate": 4.39445e-05,
"loss": 1.3622,
"step": 109000
},
{
"epoch": 0.423870865814044,
"grad_norm": 0.7373159527778625,
"learning_rate": 4.388894444444445e-05,
"loss": 1.361,
"step": 110000
},
{
"epoch": 0.4277242373214444,
"grad_norm": 0.7275413274765015,
"learning_rate": 4.383338888888889e-05,
"loss": 1.3599,
"step": 111000
},
{
"epoch": 0.43157760882884477,
"grad_norm": 0.7439739108085632,
"learning_rate": 4.377783333333333e-05,
"loss": 1.3578,
"step": 112000
},
{
"epoch": 0.4354309803362452,
"grad_norm": 0.7235158681869507,
"learning_rate": 4.372227777777778e-05,
"loss": 1.3571,
"step": 113000
},
{
"epoch": 0.4392843518436456,
"grad_norm": 0.7328953742980957,
"learning_rate": 4.3666722222222224e-05,
"loss": 1.3551,
"step": 114000
},
{
"epoch": 0.443137723351046,
"grad_norm": 0.7176284790039062,
"learning_rate": 4.361116666666667e-05,
"loss": 1.3541,
"step": 115000
},
{
"epoch": 0.4469910948584464,
"grad_norm": 0.7536628842353821,
"learning_rate": 4.355561111111111e-05,
"loss": 1.3514,
"step": 116000
},
{
"epoch": 0.4508444663658468,
"grad_norm": 0.7354797124862671,
"learning_rate": 4.350005555555556e-05,
"loss": 1.3516,
"step": 117000
},
{
"epoch": 0.4546978378732472,
"grad_norm": 0.7122487425804138,
"learning_rate": 4.344450000000001e-05,
"loss": 1.3512,
"step": 118000
},
{
"epoch": 0.4585512093806476,
"grad_norm": 0.7279472947120667,
"learning_rate": 4.338894444444444e-05,
"loss": 1.3491,
"step": 119000
},
{
"epoch": 0.462404580888048,
"grad_norm": 0.7182700037956238,
"learning_rate": 4.333338888888889e-05,
"loss": 1.3476,
"step": 120000
},
{
"epoch": 0.46625795239544837,
"grad_norm": 0.7295483350753784,
"learning_rate": 4.3277833333333334e-05,
"loss": 1.3474,
"step": 121000
},
{
"epoch": 0.4701113239028488,
"grad_norm": 0.7142743468284607,
"learning_rate": 4.3222277777777783e-05,
"loss": 1.3453,
"step": 122000
},
{
"epoch": 0.4739646954102492,
"grad_norm": 0.7233024835586548,
"learning_rate": 4.316672222222222e-05,
"loss": 1.3441,
"step": 123000
},
{
"epoch": 0.4778180669176496,
"grad_norm": 0.7175471186637878,
"learning_rate": 4.311116666666667e-05,
"loss": 1.3435,
"step": 124000
},
{
"epoch": 0.48167143842505,
"grad_norm": 0.7073889970779419,
"learning_rate": 4.305561111111111e-05,
"loss": 1.3427,
"step": 125000
},
{
"epoch": 0.4855248099324504,
"grad_norm": 0.7192471027374268,
"learning_rate": 4.300005555555556e-05,
"loss": 1.3414,
"step": 126000
},
{
"epoch": 0.4893781814398508,
"grad_norm": 0.7008840441703796,
"learning_rate": 4.29445e-05,
"loss": 1.3397,
"step": 127000
},
{
"epoch": 0.4932315529472512,
"grad_norm": 0.7169083952903748,
"learning_rate": 4.2888944444444444e-05,
"loss": 1.3386,
"step": 128000
},
{
"epoch": 0.4970849244546516,
"grad_norm": 0.7027848958969116,
"learning_rate": 4.2833388888888893e-05,
"loss": 1.3376,
"step": 129000
},
{
"epoch": 0.500938295962052,
"grad_norm": 0.7080409526824951,
"learning_rate": 4.2777833333333336e-05,
"loss": 1.3372,
"step": 130000
},
{
"epoch": 0.5047916674694524,
"grad_norm": 0.7177674770355225,
"learning_rate": 4.272227777777778e-05,
"loss": 1.3348,
"step": 131000
},
{
"epoch": 0.5086450389768528,
"grad_norm": 0.7196437120437622,
"learning_rate": 4.266672222222222e-05,
"loss": 1.3348,
"step": 132000
},
{
"epoch": 0.5124984104842532,
"grad_norm": 0.7128574848175049,
"learning_rate": 4.261116666666667e-05,
"loss": 1.3324,
"step": 133000
},
{
"epoch": 0.5163517819916535,
"grad_norm": 0.7088640332221985,
"learning_rate": 4.255561111111112e-05,
"loss": 1.3325,
"step": 134000
},
{
"epoch": 0.520205153499054,
"grad_norm": 0.7082544565200806,
"learning_rate": 4.2500055555555554e-05,
"loss": 1.3314,
"step": 135000
},
{
"epoch": 0.5240585250064544,
"grad_norm": 0.7220800518989563,
"learning_rate": 4.24445e-05,
"loss": 1.3306,
"step": 136000
},
{
"epoch": 0.5279118965138548,
"grad_norm": 0.6928138136863708,
"learning_rate": 4.2388944444444446e-05,
"loss": 1.3296,
"step": 137000
},
{
"epoch": 0.5317652680212552,
"grad_norm": 0.7121208310127258,
"learning_rate": 4.2333388888888895e-05,
"loss": 1.328,
"step": 138000
},
{
"epoch": 0.5356186395286556,
"grad_norm": 0.7346400022506714,
"learning_rate": 4.227783333333333e-05,
"loss": 1.3262,
"step": 139000
},
{
"epoch": 0.539472011036056,
"grad_norm": 0.7152061462402344,
"learning_rate": 4.222227777777778e-05,
"loss": 1.3264,
"step": 140000
},
{
"epoch": 0.5433253825434564,
"grad_norm": 0.7062528133392334,
"learning_rate": 4.216672222222223e-05,
"loss": 1.3246,
"step": 141000
},
{
"epoch": 0.5471787540508568,
"grad_norm": 0.7030431628227234,
"learning_rate": 4.211116666666667e-05,
"loss": 1.3245,
"step": 142000
},
{
"epoch": 0.5510321255582572,
"grad_norm": 0.706847071647644,
"learning_rate": 4.205561111111111e-05,
"loss": 1.3226,
"step": 143000
},
{
"epoch": 0.5548854970656576,
      "grad_norm": null,
"learning_rate": 4.2000055555555556e-05,
"loss": 1.3223,
"step": 144000
},
{
"epoch": 0.558738868573058,
"grad_norm": 0.6956000328063965,
"learning_rate": 4.1944500000000005e-05,
"loss": 1.3212,
"step": 145000
},
{
"epoch": 0.5625922400804584,
"grad_norm": 0.7229527235031128,
"learning_rate": 4.188894444444445e-05,
"loss": 1.3204,
"step": 146000
},
{
"epoch": 0.5664456115878588,
"grad_norm": 0.6998220682144165,
"learning_rate": 4.183338888888889e-05,
"loss": 1.3199,
"step": 147000
},
{
"epoch": 0.5702989830952592,
"grad_norm": 0.7195952534675598,
"learning_rate": 4.177783333333333e-05,
"loss": 1.3179,
"step": 148000
},
{
"epoch": 0.5741523546026596,
"grad_norm": 0.6813680529594421,
"learning_rate": 4.172227777777778e-05,
"loss": 1.3174,
"step": 149000
},
{
"epoch": 0.57800572611006,
"grad_norm": 0.7080066204071045,
"learning_rate": 4.166672222222222e-05,
"loss": 1.3166,
"step": 150000
},
{
"epoch": 0.5818590976174604,
"grad_norm": 0.7254391312599182,
"learning_rate": 4.1611166666666666e-05,
"loss": 1.3154,
"step": 151000
},
{
"epoch": 0.5857124691248607,
"grad_norm": 0.717185378074646,
"learning_rate": 4.1555611111111115e-05,
"loss": 1.3148,
"step": 152000
},
{
"epoch": 0.5895658406322613,
"grad_norm": 0.7029238343238831,
"learning_rate": 4.150005555555556e-05,
"loss": 1.3137,
"step": 153000
},
{
"epoch": 0.5934192121396616,
"grad_norm": 0.7082277536392212,
"learning_rate": 4.1444500000000006e-05,
"loss": 1.3132,
"step": 154000
},
{
"epoch": 0.597272583647062,
"grad_norm": 0.6892799139022827,
"learning_rate": 4.138894444444444e-05,
"loss": 1.3117,
"step": 155000
},
{
"epoch": 0.6011259551544624,
"grad_norm": 0.6990786790847778,
"learning_rate": 4.133338888888889e-05,
"loss": 1.3108,
"step": 156000
},
{
"epoch": 0.6049793266618628,
"grad_norm": 0.6933837532997131,
"learning_rate": 4.127783333333334e-05,
"loss": 1.3108,
"step": 157000
},
{
"epoch": 0.6088326981692632,
"grad_norm": 0.7177742123603821,
"learning_rate": 4.122227777777778e-05,
"loss": 1.3091,
"step": 158000
},
{
"epoch": 0.6126860696766636,
"grad_norm": 0.724977970123291,
"learning_rate": 4.1166722222222225e-05,
"loss": 1.3085,
"step": 159000
},
{
"epoch": 0.616539441184064,
"grad_norm": 0.707911491394043,
"learning_rate": 4.111116666666667e-05,
"loss": 1.3084,
"step": 160000
},
{
"epoch": 0.6203928126914644,
"grad_norm": 0.7061564326286316,
"learning_rate": 4.1055611111111116e-05,
"loss": 1.3072,
"step": 161000
},
{
"epoch": 0.6242461841988648,
"grad_norm": 0.676784873008728,
"learning_rate": 4.100005555555556e-05,
"loss": 1.3061,
"step": 162000
},
{
"epoch": 0.6280995557062652,
"grad_norm": 0.6791040897369385,
"learning_rate": 4.09445e-05,
"loss": 1.3058,
"step": 163000
},
{
"epoch": 0.6319529272136656,
"grad_norm": 0.6959836483001709,
"learning_rate": 4.088894444444445e-05,
"loss": 1.3049,
"step": 164000
},
{
"epoch": 0.635806298721066,
"grad_norm": 0.7067059874534607,
"learning_rate": 4.083338888888889e-05,
"loss": 1.3043,
"step": 165000
},
{
"epoch": 0.6396596702284664,
"grad_norm": 0.6933940052986145,
"learning_rate": 4.0777833333333335e-05,
"loss": 1.3019,
"step": 166000
},
{
"epoch": 0.6435130417358668,
"grad_norm": 0.7712944149971008,
"learning_rate": 4.072227777777778e-05,
"loss": 1.3022,
"step": 167000
},
{
"epoch": 0.6473664132432672,
"grad_norm": 0.6971937417984009,
"learning_rate": 4.0666722222222226e-05,
"loss": 1.3005,
"step": 168000
},
{
"epoch": 0.6512197847506676,
"grad_norm": 0.6904628276824951,
"learning_rate": 4.061116666666667e-05,
"loss": 1.2999,
"step": 169000
},
{
"epoch": 0.655073156258068,
"grad_norm": 0.6890471577644348,
"learning_rate": 4.055561111111111e-05,
"loss": 1.2996,
"step": 170000
},
{
"epoch": 0.6589265277654685,
"grad_norm": 0.6851339340209961,
"learning_rate": 4.050005555555555e-05,
"loss": 1.3,
"step": 171000
},
{
"epoch": 0.6627798992728688,
"grad_norm": 0.719916820526123,
"learning_rate": 4.04445e-05,
"loss": 1.2993,
"step": 172000
},
{
"epoch": 0.6666332707802692,
"grad_norm": 0.6848444938659668,
"learning_rate": 4.038894444444445e-05,
"loss": 1.2981,
"step": 173000
},
{
"epoch": 0.6704866422876696,
"grad_norm": 0.6885384321212769,
"learning_rate": 4.033338888888889e-05,
"loss": 1.2964,
"step": 174000
},
{
"epoch": 0.67434001379507,
"grad_norm": 0.7302813529968262,
"learning_rate": 4.0277833333333336e-05,
"loss": 1.2965,
"step": 175000
},
{
"epoch": 0.6781933853024704,
"grad_norm": 0.7206672430038452,
"learning_rate": 4.022227777777778e-05,
"loss": 1.2954,
"step": 176000
},
{
"epoch": 0.6820467568098708,
"grad_norm": 0.7295191884040833,
"learning_rate": 4.016672222222223e-05,
"loss": 1.2946,
"step": 177000
},
{
"epoch": 0.6859001283172712,
"grad_norm": 0.697117269039154,
"learning_rate": 4.011116666666666e-05,
"loss": 1.2947,
"step": 178000
},
{
"epoch": 0.6897534998246716,
"grad_norm": 0.6886401176452637,
"learning_rate": 4.005561111111111e-05,
"loss": 1.2932,
"step": 179000
},
{
"epoch": 0.693606871332072,
"grad_norm": 0.6862413883209229,
"learning_rate": 4.000005555555556e-05,
"loss": 1.2922,
"step": 180000
},
{
"epoch": 0.6974602428394724,
"grad_norm": 0.685055673122406,
"learning_rate": 3.9944500000000004e-05,
"loss": 1.292,
"step": 181000
},
{
"epoch": 0.7013136143468728,
"grad_norm": 0.6738216876983643,
"learning_rate": 3.9888944444444446e-05,
"loss": 1.2914,
"step": 182000
},
{
"epoch": 0.7051669858542732,
"grad_norm": 0.6774701476097107,
"learning_rate": 3.983338888888889e-05,
"loss": 1.2899,
"step": 183000
},
{
"epoch": 0.7090203573616736,
"grad_norm": 0.6937932968139648,
"learning_rate": 3.977783333333334e-05,
"loss": 1.2901,
"step": 184000
},
{
"epoch": 0.712873728869074,
"grad_norm": 0.6844605803489685,
"learning_rate": 3.972227777777778e-05,
"loss": 1.2881,
"step": 185000
},
{
"epoch": 0.7167271003764744,
"grad_norm": 0.7089695334434509,
"learning_rate": 3.966672222222222e-05,
"loss": 1.2874,
"step": 186000
},
{
"epoch": 0.7205804718838748,
"grad_norm": 0.7017620205879211,
"learning_rate": 3.9611166666666664e-05,
"loss": 1.288,
"step": 187000
},
{
"epoch": 0.7244338433912751,
"grad_norm": 0.685483455657959,
"learning_rate": 3.9555611111111113e-05,
"loss": 1.2874,
"step": 188000
},
{
"epoch": 0.7282872148986756,
"grad_norm": 0.7270601987838745,
"learning_rate": 3.950005555555556e-05,
"loss": 1.286,
"step": 189000
},
{
"epoch": 0.732140586406076,
"grad_norm": 0.6981102824211121,
"learning_rate": 3.94445e-05,
"loss": 1.2863,
"step": 190000
},
{
"epoch": 0.7359939579134764,
"grad_norm": 0.6715162992477417,
"learning_rate": 3.938894444444445e-05,
"loss": 1.2858,
"step": 191000
},
{
"epoch": 0.7398473294208768,
"grad_norm": 0.687854528427124,
"learning_rate": 3.933338888888889e-05,
"loss": 1.2853,
"step": 192000
},
{
"epoch": 0.7437007009282772,
"grad_norm": 0.7099502682685852,
"learning_rate": 3.927783333333334e-05,
"loss": 1.2835,
"step": 193000
},
{
"epoch": 0.7475540724356776,
"grad_norm": 0.6917000412940979,
"learning_rate": 3.9222277777777774e-05,
"loss": 1.2836,
"step": 194000
},
{
"epoch": 0.751407443943078,
"grad_norm": 0.6732981204986572,
"learning_rate": 3.9166722222222223e-05,
"loss": 1.2823,
"step": 195000
},
{
"epoch": 0.7552608154504784,
"grad_norm": 0.6816644668579102,
"learning_rate": 3.911116666666667e-05,
"loss": 1.2823,
"step": 196000
},
{
"epoch": 0.7591141869578788,
"grad_norm": 0.6732121109962463,
"learning_rate": 3.9055611111111115e-05,
"loss": 1.2814,
"step": 197000
},
{
"epoch": 0.7629675584652792,
"grad_norm": 0.6966871619224548,
"learning_rate": 3.900005555555556e-05,
"loss": 1.2799,
"step": 198000
},
{
"epoch": 0.7668209299726796,
"grad_norm": 0.6813514232635498,
"learning_rate": 3.89445e-05,
"loss": 1.2796,
"step": 199000
},
{
"epoch": 0.77067430148008,
"grad_norm": 0.6650587916374207,
"learning_rate": 3.888894444444445e-05,
"loss": 1.2798,
"step": 200000
},
{
"epoch": 0.7745276729874804,
"grad_norm": 0.674834668636322,
"learning_rate": 3.883338888888889e-05,
"loss": 1.2781,
"step": 201000
},
{
"epoch": 0.7783810444948808,
"grad_norm": 0.6968523859977722,
"learning_rate": 3.877783333333333e-05,
"loss": 1.2778,
"step": 202000
},
{
"epoch": 0.7822344160022812,
"grad_norm": 0.6971069574356079,
"learning_rate": 3.872227777777778e-05,
"loss": 1.2778,
"step": 203000
},
{
"epoch": 0.7860877875096816,
"grad_norm": 0.6771474480628967,
"learning_rate": 3.8666722222222225e-05,
"loss": 1.2767,
"step": 204000
},
{
"epoch": 0.789941159017082,
"grad_norm": 0.6738539338111877,
"learning_rate": 3.861116666666667e-05,
"loss": 1.2765,
"step": 205000
},
{
"epoch": 0.7937945305244823,
"grad_norm": 0.6804343461990356,
"learning_rate": 3.855561111111111e-05,
"loss": 1.2761,
"step": 206000
},
{
"epoch": 0.7976479020318828,
"grad_norm": 0.6916020512580872,
"learning_rate": 3.850005555555556e-05,
"loss": 1.2764,
"step": 207000
},
{
"epoch": 0.8015012735392832,
"grad_norm": 0.6735371947288513,
"learning_rate": 3.84445e-05,
"loss": 1.2755,
"step": 208000
},
{
"epoch": 0.8053546450466836,
"grad_norm": 0.6745339035987854,
"learning_rate": 3.838894444444445e-05,
"loss": 1.2744,
"step": 209000
},
{
"epoch": 0.809208016554084,
"grad_norm": 0.6855958700180054,
"learning_rate": 3.8333388888888886e-05,
"loss": 1.2735,
"step": 210000
},
{
"epoch": 0.8130613880614844,
"grad_norm": 0.6521448493003845,
"learning_rate": 3.8277833333333335e-05,
"loss": 1.2729,
"step": 211000
},
{
"epoch": 0.8169147595688848,
"grad_norm": 0.7009506821632385,
"learning_rate": 3.8222277777777784e-05,
"loss": 1.2722,
"step": 212000
},
{
"epoch": 0.8207681310762852,
"grad_norm": 0.7052969336509705,
"learning_rate": 3.8166722222222226e-05,
"loss": 1.2722,
"step": 213000
},
{
"epoch": 0.8246215025836856,
"grad_norm": 0.6950345635414124,
"learning_rate": 3.811116666666667e-05,
"loss": 1.2718,
"step": 214000
},
{
"epoch": 0.828474874091086,
"grad_norm": 0.6897072196006775,
"learning_rate": 3.805561111111111e-05,
"loss": 1.2708,
"step": 215000
},
{
"epoch": 0.8323282455984864,
"grad_norm": 0.6870512962341309,
"learning_rate": 3.800005555555556e-05,
"loss": 1.2706,
"step": 216000
},
{
"epoch": 0.8361816171058868,
"grad_norm": 0.6739286780357361,
"learning_rate": 3.79445e-05,
"loss": 1.2695,
"step": 217000
},
{
"epoch": 0.8400349886132872,
"grad_norm": 0.691164493560791,
"learning_rate": 3.7888944444444445e-05,
"loss": 1.2696,
"step": 218000
},
{
"epoch": 0.8438883601206876,
"grad_norm": 0.6866764426231384,
"learning_rate": 3.7833388888888894e-05,
"loss": 1.2684,
"step": 219000
},
{
"epoch": 0.847741731628088,
"grad_norm": 0.6938662528991699,
"learning_rate": 3.7777833333333336e-05,
"loss": 1.268,
"step": 220000
},
{
"epoch": 0.8515951031354884,
"grad_norm": 0.7062351107597351,
"learning_rate": 3.772227777777778e-05,
"loss": 1.267,
"step": 221000
},
{
"epoch": 0.8554484746428888,
"grad_norm": 0.6679728031158447,
"learning_rate": 3.766672222222222e-05,
"loss": 1.2674,
"step": 222000
},
{
"epoch": 0.8593018461502892,
"grad_norm": 0.6871834397315979,
"learning_rate": 3.761116666666667e-05,
"loss": 1.2662,
"step": 223000
},
{
"epoch": 0.8631552176576895,
"grad_norm": 0.652167797088623,
"learning_rate": 3.755561111111111e-05,
"loss": 1.267,
"step": 224000
},
{
"epoch": 0.86700858916509,
"grad_norm": 0.708121657371521,
"learning_rate": 3.7500055555555555e-05,
"loss": 1.2662,
"step": 225000
},
{
"epoch": 0.8708619606724904,
"grad_norm": 0.6835631728172302,
"learning_rate": 3.74445e-05,
"loss": 1.2651,
"step": 226000
},
{
"epoch": 0.8747153321798908,
"grad_norm": 0.67769455909729,
"learning_rate": 3.7388944444444446e-05,
"loss": 1.2645,
"step": 227000
},
{
"epoch": 0.8785687036872912,
"grad_norm": 0.6746647357940674,
"learning_rate": 3.7333388888888895e-05,
"loss": 1.264,
"step": 228000
},
{
"epoch": 0.8824220751946916,
"grad_norm": 0.6745488047599792,
"learning_rate": 3.727783333333333e-05,
"loss": 1.2631,
"step": 229000
},
{
"epoch": 0.886275446702092,
"grad_norm": 0.665640115737915,
"learning_rate": 3.722227777777778e-05,
"loss": 1.2628,
"step": 230000
},
{
"epoch": 0.8901288182094924,
"grad_norm": 0.6739605069160461,
"learning_rate": 3.716672222222222e-05,
"loss": 1.2627,
"step": 231000
},
{
"epoch": 0.8939821897168928,
"grad_norm": 0.7083284258842468,
"learning_rate": 3.711116666666667e-05,
"loss": 1.2612,
"step": 232000
},
{
"epoch": 0.8978355612242932,
"grad_norm": 0.6816121935844421,
"learning_rate": 3.7055611111111114e-05,
"loss": 1.2619,
"step": 233000
},
{
"epoch": 0.9016889327316936,
"grad_norm": 0.6729110479354858,
"learning_rate": 3.7000055555555556e-05,
"loss": 1.2615,
"step": 234000
},
{
"epoch": 0.905542304239094,
"grad_norm": 0.6974055171012878,
"learning_rate": 3.6944500000000005e-05,
"loss": 1.2596,
"step": 235000
},
{
"epoch": 0.9093956757464944,
"grad_norm": 0.6852896213531494,
"learning_rate": 3.688894444444445e-05,
"loss": 1.2601,
"step": 236000
},
{
"epoch": 0.9132490472538948,
"grad_norm": 0.6774199604988098,
"learning_rate": 3.683338888888889e-05,
"loss": 1.2595,
"step": 237000
},
{
"epoch": 0.9171024187612952,
"grad_norm": 0.672041118144989,
"learning_rate": 3.677783333333333e-05,
"loss": 1.2591,
"step": 238000
},
{
"epoch": 0.9209557902686956,
"grad_norm": 0.6697712540626526,
"learning_rate": 3.672227777777778e-05,
"loss": 1.2571,
"step": 239000
},
{
"epoch": 0.924809161776096,
"grad_norm": 0.6848810911178589,
"learning_rate": 3.6666722222222224e-05,
"loss": 1.2578,
"step": 240000
},
{
"epoch": 0.9286625332834963,
"grad_norm": 0.6790698766708374,
"learning_rate": 3.6611166666666666e-05,
"loss": 1.2565,
"step": 241000
},
{
"epoch": 0.9325159047908967,
"grad_norm": 0.6708704233169556,
"learning_rate": 3.6555611111111115e-05,
"loss": 1.2574,
"step": 242000
},
{
"epoch": 0.9363692762982972,
"grad_norm": 0.6941115260124207,
"learning_rate": 3.650005555555556e-05,
"loss": 1.2562,
"step": 243000
},
{
"epoch": 0.9402226478056976,
"grad_norm": 0.6749645471572876,
"learning_rate": 3.6444500000000007e-05,
"loss": 1.2567,
"step": 244000
},
{
"epoch": 0.944076019313098,
"grad_norm": 0.655571460723877,
"learning_rate": 3.638894444444444e-05,
"loss": 1.2557,
"step": 245000
},
{
"epoch": 0.9479293908204984,
"grad_norm": 0.6796839237213135,
"learning_rate": 3.633338888888889e-05,
"loss": 1.2563,
"step": 246000
},
{
"epoch": 0.9517827623278988,
"grad_norm": 0.6742174029350281,
"learning_rate": 3.6277833333333334e-05,
"loss": 1.2548,
"step": 247000
},
{
"epoch": 0.9556361338352992,
"grad_norm": 0.6875942349433899,
"learning_rate": 3.622227777777778e-05,
"loss": 1.254,
"step": 248000
},
{
"epoch": 0.9594895053426996,
"grad_norm": 0.6719071865081787,
"learning_rate": 3.616672222222222e-05,
"loss": 1.2533,
"step": 249000
},
{
"epoch": 0.9633428768501,
"grad_norm": 0.6660245060920715,
"learning_rate": 3.611116666666667e-05,
"loss": 1.2536,
"step": 250000
},
{
"epoch": 0.9671962483575004,
"grad_norm": 0.6870962977409363,
"learning_rate": 3.6055611111111117e-05,
"loss": 1.2527,
"step": 251000
},
{
"epoch": 0.9710496198649008,
"grad_norm": 0.6905462145805359,
"learning_rate": 3.600005555555556e-05,
"loss": 1.2531,
"step": 252000
},
{
"epoch": 0.9749029913723012,
"grad_norm": 0.6646074056625366,
"learning_rate": 3.59445e-05,
"loss": 1.2531,
"step": 253000
},
{
"epoch": 0.9787563628797016,
"grad_norm": 0.6946249008178711,
"learning_rate": 3.5888944444444444e-05,
"loss": 1.2515,
"step": 254000
},
{
"epoch": 0.982609734387102,
"grad_norm": 0.6882653832435608,
"learning_rate": 3.583338888888889e-05,
"loss": 1.2513,
"step": 255000
},
{
"epoch": 0.9864631058945024,
"grad_norm": 0.6676469445228577,
"learning_rate": 3.5777833333333335e-05,
"loss": 1.2507,
"step": 256000
},
{
"epoch": 0.9903164774019028,
"grad_norm": 0.6981261968612671,
"learning_rate": 3.572227777777778e-05,
"loss": 1.2504,
"step": 257000
},
{
"epoch": 0.9941698489093032,
"grad_norm": 0.6620067358016968,
"learning_rate": 3.5666722222222226e-05,
"loss": 1.2507,
"step": 258000
},
{
"epoch": 0.9980232204167035,
"grad_norm": 0.6728119850158691,
"learning_rate": 3.561116666666667e-05,
"loss": 1.249,
"step": 259000
},
{
"epoch": 1.001876591924104,
"grad_norm": 0.6715940833091736,
"learning_rate": 3.555561111111112e-05,
"loss": 1.2498,
"step": 260000
},
{
"epoch": 1.0057299634315044,
"grad_norm": 0.6744341254234314,
"learning_rate": 3.5500055555555553e-05,
"loss": 1.2488,
"step": 261000
},
{
"epoch": 1.0095833349389047,
"grad_norm": 0.6996214985847473,
"learning_rate": 3.54445e-05,
"loss": 1.2484,
"step": 262000
},
{
"epoch": 1.0134367064463052,
"grad_norm": 0.6556364893913269,
"learning_rate": 3.5388944444444445e-05,
"loss": 1.248,
"step": 263000
},
{
"epoch": 1.0172900779537055,
"grad_norm": 0.692175567150116,
"learning_rate": 3.5333388888888894e-05,
"loss": 1.2466,
"step": 264000
},
{
"epoch": 1.021143449461106,
"grad_norm": 0.6721535921096802,
"learning_rate": 3.5277833333333336e-05,
"loss": 1.2461,
"step": 265000
},
{
"epoch": 1.0249968209685063,
"grad_norm": 0.6992902159690857,
"learning_rate": 3.522227777777778e-05,
"loss": 1.2457,
"step": 266000
},
{
"epoch": 1.0288501924759068,
"grad_norm": 0.6894251108169556,
"learning_rate": 3.516672222222223e-05,
"loss": 1.246,
"step": 267000
},
{
"epoch": 1.032703563983307,
"grad_norm": 0.6852269172668457,
"learning_rate": 3.511116666666667e-05,
"loss": 1.2459,
"step": 268000
},
{
"epoch": 1.0365569354907076,
"grad_norm": 0.6719028949737549,
"learning_rate": 3.505561111111111e-05,
"loss": 1.2452,
"step": 269000
},
{
"epoch": 1.0404103069981079,
"grad_norm": 0.6796379089355469,
"learning_rate": 3.5000055555555555e-05,
"loss": 1.2439,
"step": 270000
},
{
"epoch": 1.0442636785055084,
"grad_norm": 0.6743236184120178,
"learning_rate": 3.4944500000000004e-05,
"loss": 1.2441,
"step": 271000
},
{
"epoch": 1.048117050012909,
"grad_norm": 0.6704487800598145,
"learning_rate": 3.4888944444444446e-05,
"loss": 1.2438,
"step": 272000
},
{
"epoch": 1.0519704215203092,
"grad_norm": 0.6776983141899109,
"learning_rate": 3.483338888888889e-05,
"loss": 1.2443,
"step": 273000
},
{
"epoch": 1.0558237930277097,
"grad_norm": 0.6701886057853699,
"learning_rate": 3.477783333333334e-05,
"loss": 1.2428,
"step": 274000
},
{
"epoch": 1.05967716453511,
"grad_norm": 0.6820278167724609,
"learning_rate": 3.472227777777778e-05,
"loss": 1.2426,
"step": 275000
},
{
"epoch": 1.0635305360425105,
"grad_norm": 0.7262411117553711,
"learning_rate": 3.466672222222222e-05,
"loss": 1.2429,
"step": 276000
},
{
"epoch": 1.0673839075499107,
"grad_norm": 0.6728771328926086,
"learning_rate": 3.4611166666666665e-05,
"loss": 1.2421,
"step": 277000
},
{
"epoch": 1.0712372790573113,
"grad_norm": 0.66309654712677,
"learning_rate": 3.4555611111111114e-05,
"loss": 1.2416,
"step": 278000
},
{
"epoch": 1.0750906505647115,
"grad_norm": 0.6764417886734009,
"learning_rate": 3.4500055555555556e-05,
"loss": 1.2413,
"step": 279000
},
{
"epoch": 1.078944022072112,
"grad_norm": 0.6755089163780212,
"learning_rate": 3.44445e-05,
"loss": 1.2411,
"step": 280000
},
{
"epoch": 1.0827973935795123,
"grad_norm": 0.669450581073761,
"learning_rate": 3.438894444444445e-05,
"loss": 1.2406,
"step": 281000
},
{
"epoch": 1.0866507650869128,
"grad_norm": 0.6609264612197876,
"learning_rate": 3.433338888888889e-05,
"loss": 1.2395,
"step": 282000
},
{
"epoch": 1.0905041365943131,
"grad_norm": 0.6697176694869995,
"learning_rate": 3.427783333333334e-05,
"loss": 1.2403,
"step": 283000
},
{
"epoch": 1.0943575081017136,
"grad_norm": 0.6523563861846924,
"learning_rate": 3.4222277777777775e-05,
"loss": 1.2392,
"step": 284000
},
{
"epoch": 1.098210879609114,
"grad_norm": 0.6608708500862122,
"learning_rate": 3.4166722222222224e-05,
"loss": 1.239,
"step": 285000
},
{
"epoch": 1.1020642511165144,
"grad_norm": 0.6721755862236023,
"learning_rate": 3.4111166666666666e-05,
"loss": 1.2394,
"step": 286000
},
{
"epoch": 1.105917622623915,
"grad_norm": 0.6699149012565613,
"learning_rate": 3.4055611111111115e-05,
"loss": 1.2389,
"step": 287000
},
{
"epoch": 1.1097709941313152,
"grad_norm": 0.6876478791236877,
"learning_rate": 3.400005555555556e-05,
"loss": 1.2384,
"step": 288000
},
{
"epoch": 1.1136243656387157,
"grad_norm": 0.6746466755867004,
"learning_rate": 3.39445e-05,
"loss": 1.2372,
"step": 289000
},
{
"epoch": 1.117477737146116,
"grad_norm": 0.6752446889877319,
"learning_rate": 3.388894444444445e-05,
"loss": 1.2367,
"step": 290000
},
{
"epoch": 1.1213311086535165,
"grad_norm": 0.6689814329147339,
"learning_rate": 3.383338888888889e-05,
"loss": 1.2373,
"step": 291000
},
{
"epoch": 1.1251844801609168,
"grad_norm": 0.6524012684822083,
"learning_rate": 3.3777833333333334e-05,
"loss": 1.2364,
"step": 292000
},
{
"epoch": 1.1290378516683173,
"grad_norm": 0.6835392713546753,
"learning_rate": 3.3722277777777776e-05,
"loss": 1.2367,
"step": 293000
},
{
"epoch": 1.1328912231757176,
"grad_norm": 0.663935124874115,
"learning_rate": 3.3666722222222225e-05,
"loss": 1.2354,
"step": 294000
},
{
"epoch": 1.136744594683118,
"grad_norm": 0.681470513343811,
"learning_rate": 3.361116666666667e-05,
"loss": 1.2341,
"step": 295000
},
{
"epoch": 1.1405979661905183,
"grad_norm": 0.6814187169075012,
"learning_rate": 3.355561111111111e-05,
"loss": 1.2347,
"step": 296000
},
{
"epoch": 1.1444513376979188,
"grad_norm": 0.6692870259284973,
"learning_rate": 3.350005555555556e-05,
"loss": 1.234,
"step": 297000
},
{
"epoch": 1.1483047092053191,
"grad_norm": 0.6934278011322021,
"learning_rate": 3.34445e-05,
"loss": 1.2349,
"step": 298000
},
{
"epoch": 1.1521580807127196,
"grad_norm": 0.6900179982185364,
"learning_rate": 3.338894444444445e-05,
"loss": 1.234,
"step": 299000
},
{
"epoch": 1.15601145222012,
"grad_norm": 0.6840701699256897,
"learning_rate": 3.3333388888888886e-05,
"loss": 1.2332,
"step": 300000
},
{
"epoch": 1.1598648237275204,
"grad_norm": 0.7003931403160095,
"learning_rate": 3.3277833333333335e-05,
"loss": 1.2329,
"step": 301000
},
{
"epoch": 1.1637181952349207,
"grad_norm": 0.6828613877296448,
"learning_rate": 3.322227777777778e-05,
"loss": 1.232,
"step": 302000
},
{
"epoch": 1.1675715667423212,
"grad_norm": 0.7264192700386047,
"learning_rate": 3.316672222222223e-05,
"loss": 1.2322,
"step": 303000
},
{
"epoch": 1.1714249382497215,
"grad_norm": 0.6868515014648438,
"learning_rate": 3.311116666666667e-05,
"loss": 1.2322,
"step": 304000
},
{
"epoch": 1.175278309757122,
"grad_norm": 0.6805739402770996,
"learning_rate": 3.305561111111111e-05,
"loss": 1.2307,
"step": 305000
},
{
"epoch": 1.1791316812645225,
"grad_norm": 0.6556283831596375,
"learning_rate": 3.300005555555556e-05,
"loss": 1.2308,
"step": 306000
},
{
"epoch": 1.1829850527719228,
"grad_norm": 0.6635182499885559,
"learning_rate": 3.29445e-05,
"loss": 1.2311,
"step": 307000
},
{
"epoch": 1.1868384242793233,
"grad_norm": 0.6593520641326904,
"learning_rate": 3.2888944444444445e-05,
"loss": 1.2307,
"step": 308000
},
{
"epoch": 1.1906917957867236,
"grad_norm": 0.6719244122505188,
"learning_rate": 3.283338888888889e-05,
"loss": 1.2302,
"step": 309000
},
{
"epoch": 1.194545167294124,
"grad_norm": 0.663469135761261,
"learning_rate": 3.2777833333333337e-05,
"loss": 1.2302,
"step": 310000
},
{
"epoch": 1.1983985388015244,
"grad_norm": 0.679842472076416,
"learning_rate": 3.272227777777778e-05,
"loss": 1.23,
"step": 311000
},
{
"epoch": 1.2022519103089249,
"grad_norm": 0.6602251529693604,
"learning_rate": 3.266672222222222e-05,
"loss": 1.229,
"step": 312000
},
{
"epoch": 1.2061052818163251,
"grad_norm": 0.6897211670875549,
"learning_rate": 3.261116666666667e-05,
"loss": 1.2293,
"step": 313000
},
{
"epoch": 1.2099586533237257,
"grad_norm": 0.6772252321243286,
"learning_rate": 3.255561111111111e-05,
"loss": 1.2287,
"step": 314000
},
{
"epoch": 1.213812024831126,
"grad_norm": 0.6991803646087646,
"learning_rate": 3.250005555555556e-05,
"loss": 1.2289,
"step": 315000
},
{
"epoch": 1.2176653963385264,
"grad_norm": 0.6778867840766907,
"learning_rate": 3.24445e-05,
"loss": 1.2273,
"step": 316000
},
{
"epoch": 1.2215187678459267,
"grad_norm": 0.6776384115219116,
"learning_rate": 3.2388944444444447e-05,
"loss": 1.2282,
"step": 317000
},
{
"epoch": 1.2253721393533272,
"grad_norm": 0.6960573196411133,
"learning_rate": 3.233338888888889e-05,
"loss": 1.227,
"step": 318000
},
{
"epoch": 1.2292255108607275,
"grad_norm": 0.682949423789978,
"learning_rate": 3.227783333333334e-05,
"loss": 1.2268,
"step": 319000
},
{
"epoch": 1.233078882368128,
"grad_norm": 0.6904979348182678,
"learning_rate": 3.222227777777778e-05,
"loss": 1.2273,
"step": 320000
},
{
"epoch": 1.2369322538755283,
"grad_norm": 0.6834551692008972,
"learning_rate": 3.216672222222222e-05,
"loss": 1.2265,
"step": 321000
},
{
"epoch": 1.2407856253829288,
"grad_norm": 0.6686312556266785,
"learning_rate": 3.211116666666667e-05,
"loss": 1.2258,
"step": 322000
},
{
"epoch": 1.244638996890329,
"grad_norm": 0.6807515025138855,
"learning_rate": 3.2055611111111114e-05,
"loss": 1.2264,
"step": 323000
},
{
"epoch": 1.2484923683977296,
"grad_norm": 0.6831598877906799,
"learning_rate": 3.2000055555555556e-05,
"loss": 1.2254,
"step": 324000
},
{
"epoch": 1.25234573990513,
"grad_norm": 0.6734605431556702,
"learning_rate": 3.19445e-05,
"loss": 1.2257,
"step": 325000
},
{
"epoch": 1.2561991114125304,
"grad_norm": 0.7259578704833984,
"learning_rate": 3.188894444444445e-05,
"loss": 1.2246,
"step": 326000
},
{
"epoch": 1.2600524829199307,
"grad_norm": 0.6729893684387207,
"learning_rate": 3.183338888888889e-05,
"loss": 1.2247,
"step": 327000
},
{
"epoch": 1.2639058544273312,
"grad_norm": 0.6584126353263855,
"learning_rate": 3.177783333333333e-05,
"loss": 1.2233,
"step": 328000
},
{
"epoch": 1.2677592259347317,
"grad_norm": 0.6399083733558655,
"learning_rate": 3.172227777777778e-05,
"loss": 1.224,
"step": 329000
},
{
"epoch": 1.271612597442132,
"grad_norm": 0.6504297852516174,
"learning_rate": 3.1666722222222224e-05,
"loss": 1.2235,
"step": 330000
},
{
"epoch": 1.2754659689495325,
"grad_norm": 0.6758235692977905,
"learning_rate": 3.1611166666666666e-05,
"loss": 1.2233,
"step": 331000
},
{
"epoch": 1.2793193404569327,
"grad_norm": 0.6571764945983887,
"learning_rate": 3.155561111111111e-05,
"loss": 1.2225,
"step": 332000
},
{
"epoch": 1.2831727119643332,
"grad_norm": 0.6841081976890564,
"learning_rate": 3.150005555555556e-05,
"loss": 1.2234,
"step": 333000
},
{
"epoch": 1.2870260834717335,
"grad_norm": 0.6634018421173096,
"learning_rate": 3.14445e-05,
"loss": 1.2223,
"step": 334000
},
{
"epoch": 1.290879454979134,
"grad_norm": 0.693499743938446,
"learning_rate": 3.138894444444444e-05,
"loss": 1.223,
"step": 335000
},
{
"epoch": 1.2947328264865343,
"grad_norm": 0.6660721898078918,
"learning_rate": 3.133338888888889e-05,
"loss": 1.2216,
"step": 336000
},
{
"epoch": 1.2985861979939348,
"grad_norm": 0.6657261252403259,
"learning_rate": 3.1277833333333334e-05,
"loss": 1.2209,
"step": 337000
},
{
"epoch": 1.302439569501335,
"grad_norm": 0.6683467626571655,
"learning_rate": 3.122227777777778e-05,
"loss": 1.2214,
"step": 338000
},
{
"epoch": 1.3062929410087356,
"grad_norm": 0.6729023456573486,
"learning_rate": 3.1166722222222225e-05,
"loss": 1.2207,
"step": 339000
},
{
"epoch": 1.3101463125161361,
"grad_norm": 0.6579126715660095,
"learning_rate": 3.111116666666667e-05,
"loss": 1.2208,
"step": 340000
},
{
"epoch": 1.3139996840235364,
"grad_norm": 0.7024135589599609,
"learning_rate": 3.105561111111111e-05,
"loss": 1.2197,
"step": 341000
},
{
"epoch": 1.3178530555309367,
"grad_norm": 0.6785723567008972,
"learning_rate": 3.100005555555556e-05,
"loss": 1.2197,
"step": 342000
},
{
"epoch": 1.3217064270383372,
"grad_norm": 0.6838181018829346,
"learning_rate": 3.09445e-05,
"loss": 1.2196,
"step": 343000
},
{
"epoch": 1.3255597985457377,
"grad_norm": 0.663948655128479,
"learning_rate": 3.0888944444444444e-05,
"loss": 1.2201,
"step": 344000
},
{
"epoch": 1.329413170053138,
"grad_norm": 0.6804963946342468,
"learning_rate": 3.083338888888889e-05,
"loss": 1.2183,
"step": 345000
},
{
"epoch": 1.3332665415605385,
"grad_norm": 0.6698565483093262,
"learning_rate": 3.0777833333333335e-05,
"loss": 1.2195,
"step": 346000
},
{
"epoch": 1.3371199130679388,
"grad_norm": 0.66984623670578,
"learning_rate": 3.072227777777778e-05,
"loss": 1.2177,
"step": 347000
},
{
"epoch": 1.3409732845753393,
"grad_norm": 0.6739431023597717,
"learning_rate": 3.066672222222222e-05,
"loss": 1.2187,
"step": 348000
},
{
"epoch": 1.3448266560827395,
"grad_norm": 0.6695194244384766,
"learning_rate": 3.061116666666667e-05,
"loss": 1.2172,
"step": 349000
},
{
"epoch": 1.34868002759014,
"grad_norm": 0.669137179851532,
"learning_rate": 3.055561111111111e-05,
"loss": 1.218,
"step": 350000
},
{
"epoch": 1.3525333990975403,
"grad_norm": 0.6813784241676331,
"learning_rate": 3.0500055555555557e-05,
"loss": 1.218,
"step": 351000
},
{
"epoch": 1.3563867706049408,
"grad_norm": 0.6733546853065491,
"learning_rate": 3.0444500000000003e-05,
"loss": 1.2168,
"step": 352000
},
{
"epoch": 1.3602401421123411,
"grad_norm": 0.6998477578163147,
"learning_rate": 3.0388944444444445e-05,
"loss": 1.2172,
"step": 353000
},
{
"epoch": 1.3640935136197416,
"grad_norm": 0.6859351396560669,
"learning_rate": 3.033338888888889e-05,
"loss": 1.2162,
"step": 354000
},
{
"epoch": 1.3679468851271421,
"grad_norm": 0.6735222339630127,
"learning_rate": 3.0277833333333333e-05,
"loss": 1.2165,
"step": 355000
},
{
"epoch": 1.3718002566345424,
"grad_norm": 0.6935626864433289,
"learning_rate": 3.022227777777778e-05,
"loss": 1.2164,
"step": 356000
},
{
"epoch": 1.3756536281419427,
"grad_norm": 0.695367693901062,
"learning_rate": 3.016672222222222e-05,
"loss": 1.2157,
"step": 357000
},
{
"epoch": 1.3795069996493432,
"grad_norm": 0.6997620463371277,
"learning_rate": 3.0111166666666667e-05,
"loss": 1.2151,
"step": 358000
},
{
"epoch": 1.3833603711567437,
"grad_norm": 0.6904904246330261,
"learning_rate": 3.0055611111111116e-05,
"loss": 1.2156,
"step": 359000
},
{
"epoch": 1.387213742664144,
"grad_norm": 0.6987177729606628,
"learning_rate": 3.0000055555555555e-05,
"loss": 1.2155,
"step": 360000
},
{
"epoch": 1.3910671141715443,
"grad_norm": 0.6865441203117371,
"learning_rate": 2.9944500000000004e-05,
"loss": 1.2139,
"step": 361000
},
{
"epoch": 1.3949204856789448,
"grad_norm": 0.6597044467926025,
"learning_rate": 2.9888944444444443e-05,
"loss": 1.2149,
"step": 362000
},
{
"epoch": 1.3987738571863453,
"grad_norm": 0.6782757639884949,
"learning_rate": 2.9833388888888892e-05,
"loss": 1.2141,
"step": 363000
},
{
"epoch": 1.4026272286937456,
"grad_norm": 0.6776473522186279,
"learning_rate": 2.977783333333333e-05,
"loss": 1.2139,
"step": 364000
},
{
"epoch": 1.406480600201146,
"grad_norm": 0.6748208403587341,
"learning_rate": 2.972227777777778e-05,
"loss": 1.2126,
"step": 365000
},
{
"epoch": 1.4103339717085464,
"grad_norm": 0.6814528107643127,
"learning_rate": 2.9666722222222226e-05,
"loss": 1.2133,
"step": 366000
},
{
"epoch": 1.4141873432159469,
"grad_norm": 0.6742803454399109,
"learning_rate": 2.961116666666667e-05,
"loss": 1.2131,
"step": 367000
},
{
"epoch": 1.4180407147233471,
"grad_norm": 0.6685371398925781,
"learning_rate": 2.9555611111111114e-05,
"loss": 1.2132,
"step": 368000
},
{
"epoch": 1.4218940862307476,
"grad_norm": 0.6587190628051758,
"learning_rate": 2.9500055555555557e-05,
"loss": 1.2126,
"step": 369000
},
{
"epoch": 1.425747457738148,
"grad_norm": 0.7063204050064087,
"learning_rate": 2.9444500000000002e-05,
"loss": 1.2133,
"step": 370000
},
{
"epoch": 1.4296008292455484,
"grad_norm": 0.6783314943313599,
"learning_rate": 2.9388944444444445e-05,
"loss": 1.2114,
"step": 371000
},
{
"epoch": 1.4334542007529487,
"grad_norm": 0.6613739132881165,
"learning_rate": 2.933338888888889e-05,
"loss": 1.2109,
"step": 372000
},
{
"epoch": 1.4373075722603492,
"grad_norm": 0.7002771496772766,
"learning_rate": 2.9277833333333333e-05,
"loss": 1.2115,
"step": 373000
},
{
"epoch": 1.4411609437677497,
"grad_norm": 0.69558185338974,
"learning_rate": 2.922227777777778e-05,
"loss": 1.2112,
"step": 374000
},
{
"epoch": 1.44501431527515,
"grad_norm": 0.6645965576171875,
"learning_rate": 2.9166722222222224e-05,
"loss": 1.2103,
"step": 375000
},
{
"epoch": 1.4488676867825503,
"grad_norm": 0.6624684929847717,
"learning_rate": 2.9111166666666667e-05,
"loss": 1.2096,
"step": 376000
},
{
"epoch": 1.4527210582899508,
"grad_norm": 0.6573096513748169,
"learning_rate": 2.9055611111111112e-05,
"loss": 1.2102,
"step": 377000
},
{
"epoch": 1.4565744297973513,
"grad_norm": 0.6568763852119446,
"learning_rate": 2.9000055555555555e-05,
"loss": 1.2109,
"step": 378000
},
{
"epoch": 1.4604278013047516,
"grad_norm": 0.6698375940322876,
"learning_rate": 2.8944500000000004e-05,
"loss": 1.2102,
"step": 379000
},
{
"epoch": 1.4642811728121519,
"grad_norm": 0.6893269419670105,
"learning_rate": 2.8888944444444443e-05,
"loss": 1.2096,
"step": 380000
},
{
"epoch": 1.4681345443195524,
"grad_norm": 0.6947731375694275,
"learning_rate": 2.8833388888888892e-05,
"loss": 1.2089,
"step": 381000
},
{
"epoch": 1.4719879158269529,
"grad_norm": 0.6623468399047852,
"learning_rate": 2.8777833333333338e-05,
"loss": 1.2093,
"step": 382000
},
{
"epoch": 1.4758412873343532,
"grad_norm": 0.6745172739028931,
"learning_rate": 2.872227777777778e-05,
"loss": 1.2092,
"step": 383000
},
{
"epoch": 1.4796946588417537,
"grad_norm": 0.6809899806976318,
"learning_rate": 2.8666722222222226e-05,
"loss": 1.2083,
"step": 384000
},
{
"epoch": 1.483548030349154,
"grad_norm": 0.6865934729576111,
"learning_rate": 2.8611166666666668e-05,
"loss": 1.2088,
"step": 385000
},
{
"epoch": 1.4874014018565545,
"grad_norm": 0.6749284863471985,
"learning_rate": 2.8555611111111114e-05,
"loss": 1.2085,
"step": 386000
},
{
"epoch": 1.4912547733639547,
"grad_norm": 0.6682766079902649,
"learning_rate": 2.8500055555555556e-05,
"loss": 1.2075,
"step": 387000
},
{
"epoch": 1.4951081448713552,
"grad_norm": 0.7050167322158813,
"learning_rate": 2.8444500000000002e-05,
"loss": 1.208,
"step": 388000
},
{
"epoch": 1.4989615163787555,
"grad_norm": 0.6618677377700806,
"learning_rate": 2.8388944444444448e-05,
"loss": 1.2073,
"step": 389000
},
{
"epoch": 1.502814887886156,
"grad_norm": 0.6752711534500122,
"learning_rate": 2.833338888888889e-05,
"loss": 1.2071,
"step": 390000
},
{
"epoch": 1.5066682593935563,
"grad_norm": 0.6783143877983093,
"learning_rate": 2.8277833333333336e-05,
"loss": 1.207,
"step": 391000
},
{
"epoch": 1.5105216309009568,
"grad_norm": 0.6858145594596863,
"learning_rate": 2.8222277777777778e-05,
"loss": 1.206,
"step": 392000
},
{
"epoch": 1.5143750024083573,
"grad_norm": 0.6630164384841919,
"learning_rate": 2.8166722222222224e-05,
"loss": 1.2066,
"step": 393000
},
{
"epoch": 1.5182283739157576,
"grad_norm": 0.6869551539421082,
"learning_rate": 2.8111166666666666e-05,
"loss": 1.2065,
"step": 394000
},
{
"epoch": 1.5220817454231579,
"grad_norm": 0.6730819344520569,
"learning_rate": 2.8055611111111112e-05,
"loss": 1.2053,
"step": 395000
},
{
"epoch": 1.5259351169305584,
"grad_norm": 0.6799289584159851,
"learning_rate": 2.8000055555555554e-05,
"loss": 1.2053,
"step": 396000
},
{
"epoch": 1.529788488437959,
"grad_norm": 0.6745020747184753,
"learning_rate": 2.79445e-05,
"loss": 1.2053,
"step": 397000
},
{
"epoch": 1.5336418599453592,
"grad_norm": 0.6658075451850891,
"learning_rate": 2.788894444444445e-05,
"loss": 1.2051,
"step": 398000
},
{
"epoch": 1.5374952314527595,
"grad_norm": 0.685326874256134,
"learning_rate": 2.7833388888888888e-05,
"loss": 1.2047,
"step": 399000
},
{
"epoch": 1.54134860296016,
"grad_norm": 0.6764355301856995,
"learning_rate": 2.7777833333333337e-05,
"loss": 1.2051,
"step": 400000
},
{
"epoch": 1.5452019744675605,
"grad_norm": 0.6723695993423462,
"learning_rate": 2.7722277777777776e-05,
"loss": 1.2047,
"step": 401000
},
{
"epoch": 1.5490553459749608,
"grad_norm": 0.6721011996269226,
"learning_rate": 2.7666722222222225e-05,
"loss": 1.2035,
"step": 402000
},
{
"epoch": 1.552908717482361,
"grad_norm": 0.6737053394317627,
"learning_rate": 2.7611166666666664e-05,
"loss": 1.2047,
"step": 403000
},
{
"epoch": 1.5567620889897615,
"grad_norm": 0.6559922695159912,
"learning_rate": 2.7555611111111113e-05,
"loss": 1.2037,
"step": 404000
},
{
"epoch": 1.560615460497162,
"grad_norm": 0.6571487188339233,
"learning_rate": 2.750005555555556e-05,
"loss": 1.2034,
"step": 405000
},
{
"epoch": 1.5644688320045623,
"grad_norm": 0.6738882660865784,
"learning_rate": 2.74445e-05,
"loss": 1.203,
"step": 406000
},
{
"epoch": 1.5683222035119628,
"grad_norm": 0.6922580599784851,
"learning_rate": 2.7388944444444447e-05,
"loss": 1.2028,
"step": 407000
},
{
"epoch": 1.5721755750193633,
"grad_norm": 0.6696702837944031,
"learning_rate": 2.733338888888889e-05,
"loss": 1.2021,
"step": 408000
},
{
"epoch": 1.5760289465267636,
"grad_norm": 0.688118577003479,
"learning_rate": 2.7277833333333335e-05,
"loss": 1.203,
"step": 409000
},
{
"epoch": 1.579882318034164,
"grad_norm": 0.6660063862800598,
"learning_rate": 2.7222277777777777e-05,
"loss": 1.2018,
"step": 410000
},
{
"epoch": 1.5837356895415644,
"grad_norm": 0.7178686857223511,
"learning_rate": 2.7166722222222223e-05,
"loss": 1.2018,
"step": 411000
},
{
"epoch": 1.587589061048965,
"grad_norm": 0.7126618027687073,
"learning_rate": 2.7111166666666665e-05,
"loss": 1.2019,
"step": 412000
},
{
"epoch": 1.5914424325563652,
"grad_norm": 0.7018870711326599,
"learning_rate": 2.705561111111111e-05,
"loss": 1.2018,
"step": 413000
},
{
"epoch": 1.5952958040637655,
"grad_norm": 0.6731059551239014,
"learning_rate": 2.700005555555556e-05,
"loss": 1.2012,
"step": 414000
},
{
"epoch": 1.599149175571166,
"grad_norm": 0.6750038862228394,
"learning_rate": 2.69445e-05,
"loss": 1.1995,
"step": 415000
},
{
"epoch": 1.6030025470785665,
"grad_norm": 0.661834180355072,
"learning_rate": 2.688894444444445e-05,
"loss": 1.1999,
"step": 416000
},
{
"epoch": 1.6068559185859668,
"grad_norm": 0.6862068176269531,
"learning_rate": 2.6833388888888887e-05,
"loss": 1.2,
"step": 417000
},
{
"epoch": 1.610709290093367,
"grad_norm": 0.6672124862670898,
"learning_rate": 2.6777833333333336e-05,
"loss": 1.1995,
"step": 418000
},
{
"epoch": 1.6145626616007676,
"grad_norm": 0.6680454015731812,
"learning_rate": 2.6722277777777775e-05,
"loss": 1.2003,
"step": 419000
},
{
"epoch": 1.618416033108168,
"grad_norm": 0.6725075244903564,
"learning_rate": 2.6666722222222225e-05,
"loss": 1.2002,
"step": 420000
},
{
"epoch": 1.6222694046155683,
"grad_norm": 0.6725237369537354,
"learning_rate": 2.661116666666667e-05,
"loss": 1.1989,
"step": 421000
},
{
"epoch": 1.6261227761229686,
"grad_norm": 0.6713998317718506,
"learning_rate": 2.6555611111111113e-05,
"loss": 1.1992,
"step": 422000
},
{
"epoch": 1.6299761476303694,
"grad_norm": 0.6976920366287231,
"learning_rate": 2.650005555555556e-05,
"loss": 1.1996,
"step": 423000
},
{
"epoch": 1.6338295191377696,
"grad_norm": 0.6603657007217407,
"learning_rate": 2.64445e-05,
"loss": 1.1988,
"step": 424000
},
{
"epoch": 1.63768289064517,
"grad_norm": 0.6840860843658447,
"learning_rate": 2.6388944444444446e-05,
"loss": 1.1986,
"step": 425000
},
{
"epoch": 1.6415362621525704,
"grad_norm": 0.6827540397644043,
"learning_rate": 2.633338888888889e-05,
"loss": 1.1985,
"step": 426000
},
{
"epoch": 1.645389633659971,
"grad_norm": 0.6934226155281067,
"learning_rate": 2.6277833333333334e-05,
"loss": 1.1986,
"step": 427000
},
{
"epoch": 1.6492430051673712,
"grad_norm": 0.6878825426101685,
"learning_rate": 2.622227777777778e-05,
"loss": 1.1977,
"step": 428000
},
{
"epoch": 1.6530963766747715,
"grad_norm": 0.6928458213806152,
"learning_rate": 2.6166722222222223e-05,
"loss": 1.1979,
"step": 429000
},
{
"epoch": 1.656949748182172,
"grad_norm": 0.6635681986808777,
"learning_rate": 2.6111166666666668e-05,
"loss": 1.1974,
"step": 430000
},
{
"epoch": 1.6608031196895725,
"grad_norm": 0.753787636756897,
"learning_rate": 2.605561111111111e-05,
"loss": 1.1975,
"step": 431000
},
{
"epoch": 1.6646564911969728,
"grad_norm": 0.6675045490264893,
"learning_rate": 2.600005555555556e-05,
"loss": 1.1972,
"step": 432000
},
{
"epoch": 1.668509862704373,
"grad_norm": 0.6703343987464905,
"learning_rate": 2.59445e-05,
"loss": 1.1969,
"step": 433000
},
{
"epoch": 1.6723632342117736,
"grad_norm": 0.6867698431015015,
"learning_rate": 2.5888944444444448e-05,
"loss": 1.1966,
"step": 434000
},
{
"epoch": 1.676216605719174,
"grad_norm": 0.6581012606620789,
"learning_rate": 2.5833388888888887e-05,
"loss": 1.1966,
"step": 435000
},
{
"epoch": 1.6800699772265744,
"grad_norm": 0.7066845297813416,
"learning_rate": 2.5777833333333336e-05,
"loss": 1.197,
"step": 436000
},
{
"epoch": 1.6839233487339746,
"grad_norm": 0.7115961313247681,
"learning_rate": 2.572227777777778e-05,
"loss": 1.196,
"step": 437000
},
{
"epoch": 1.6877767202413752,
"grad_norm": 0.7107385993003845,
"learning_rate": 2.5666722222222224e-05,
"loss": 1.1959,
"step": 438000
},
{
"epoch": 1.6916300917487757,
"grad_norm": 0.6925193667411804,
"learning_rate": 2.561116666666667e-05,
"loss": 1.1956,
"step": 439000
},
{
"epoch": 1.695483463256176,
"grad_norm": 0.6661742329597473,
"learning_rate": 2.5555611111111112e-05,
"loss": 1.1951,
"step": 440000
},
{
"epoch": 1.6993368347635764,
"grad_norm": 0.6755145788192749,
"learning_rate": 2.5500055555555558e-05,
"loss": 1.1951,
"step": 441000
},
{
"epoch": 1.703190206270977,
"grad_norm": 0.7076007127761841,
"learning_rate": 2.54445e-05,
"loss": 1.1953,
"step": 442000
},
{
"epoch": 1.7070435777783772,
"grad_norm": 0.6796631813049316,
"learning_rate": 2.5388944444444446e-05,
"loss": 1.1945,
"step": 443000
},
{
"epoch": 1.7108969492857775,
"grad_norm": 0.6803722381591797,
"learning_rate": 2.533338888888889e-05,
"loss": 1.1947,
"step": 444000
},
{
"epoch": 1.714750320793178,
"grad_norm": 0.6730965971946716,
"learning_rate": 2.5277833333333334e-05,
"loss": 1.1942,
"step": 445000
},
{
"epoch": 1.7186036923005785,
"grad_norm": 0.6800934672355652,
"learning_rate": 2.522227777777778e-05,
"loss": 1.1941,
"step": 446000
},
{
"epoch": 1.7224570638079788,
"grad_norm": 0.6878598928451538,
"learning_rate": 2.5166722222222222e-05,
"loss": 1.1941,
"step": 447000
},
{
"epoch": 1.726310435315379,
"grad_norm": 0.6674512624740601,
"learning_rate": 2.5111166666666668e-05,
"loss": 1.1935,
"step": 448000
},
{
"epoch": 1.7301638068227796,
"grad_norm": 0.6966185569763184,
"learning_rate": 2.505561111111111e-05,
"loss": 1.1936,
"step": 449000
},
{
"epoch": 1.73401717833018,
"grad_norm": 0.708171546459198,
"learning_rate": 2.5000055555555556e-05,
"loss": 1.1932,
"step": 450000
},
{
"epoch": 1.7378705498375804,
"grad_norm": 0.6620480418205261,
"learning_rate": 2.49445e-05,
"loss": 1.1929,
"step": 451000
},
{
"epoch": 1.7417239213449807,
"grad_norm": 0.6783220767974854,
"learning_rate": 2.4888944444444444e-05,
"loss": 1.1934,
"step": 452000
},
{
"epoch": 1.7455772928523812,
"grad_norm": 0.7003952264785767,
"learning_rate": 2.483338888888889e-05,
"loss": 1.1926,
"step": 453000
},
{
"epoch": 1.7494306643597817,
"grad_norm": 0.6729221343994141,
"learning_rate": 2.4777833333333332e-05,
"loss": 1.1925,
"step": 454000
},
{
"epoch": 1.753284035867182,
"grad_norm": 0.7178687453269958,
"learning_rate": 2.472227777777778e-05,
"loss": 1.193,
"step": 455000
},
{
"epoch": 1.7571374073745822,
"grad_norm": 0.6935052275657654,
"learning_rate": 2.4666722222222223e-05,
"loss": 1.1921,
"step": 456000
},
{
"epoch": 1.7609907788819827,
"grad_norm": 0.6983472108840942,
"learning_rate": 2.461116666666667e-05,
"loss": 1.1922,
"step": 457000
},
{
"epoch": 1.7648441503893832,
"grad_norm": 0.6847233176231384,
"learning_rate": 2.455561111111111e-05,
"loss": 1.192,
"step": 458000
},
{
"epoch": 1.7686975218967835,
"grad_norm": 0.6784983277320862,
"learning_rate": 2.4500055555555557e-05,
"loss": 1.1914,
"step": 459000
},
{
"epoch": 1.772550893404184,
"grad_norm": 0.6867578625679016,
"learning_rate": 2.44445e-05,
"loss": 1.1916,
"step": 460000
},
{
"epoch": 1.7764042649115845,
"grad_norm": 0.681760311126709,
"learning_rate": 2.4388944444444445e-05,
"loss": 1.1909,
"step": 461000
},
{
"epoch": 1.7802576364189848,
"grad_norm": 0.6820582151412964,
"learning_rate": 2.4333388888888888e-05,
"loss": 1.1913,
"step": 462000
},
{
"epoch": 1.784111007926385,
"grad_norm": 0.6804444193840027,
"learning_rate": 2.4277833333333337e-05,
"loss": 1.1905,
"step": 463000
},
{
"epoch": 1.7879643794337856,
"grad_norm": 0.6828032732009888,
"learning_rate": 2.422227777777778e-05,
"loss": 1.1906,
"step": 464000
},
{
"epoch": 1.7918177509411861,
"grad_norm": 0.675101101398468,
"learning_rate": 2.4166722222222225e-05,
"loss": 1.1902,
"step": 465000
},
{
"epoch": 1.7956711224485864,
"grad_norm": 0.6670296788215637,
"learning_rate": 2.4111166666666667e-05,
"loss": 1.1902,
"step": 466000
},
{
"epoch": 1.7995244939559867,
"grad_norm": 0.730213463306427,
"learning_rate": 2.4055611111111113e-05,
"loss": 1.1903,
"step": 467000
},
{
"epoch": 1.8033778654633872,
"grad_norm": 0.728732705116272,
"learning_rate": 2.4000055555555555e-05,
"loss": 1.1898,
"step": 468000
},
{
"epoch": 1.8072312369707877,
"grad_norm": 0.7038969993591309,
"learning_rate": 2.39445e-05,
"loss": 1.1901,
"step": 469000
},
{
"epoch": 1.811084608478188,
"grad_norm": 0.7080554366111755,
"learning_rate": 2.3888944444444443e-05,
"loss": 1.1902,
"step": 470000
},
{
"epoch": 1.8149379799855883,
"grad_norm": 0.6914920806884766,
"learning_rate": 2.3833388888888892e-05,
"loss": 1.1893,
"step": 471000
},
{
"epoch": 1.8187913514929888,
"grad_norm": 0.6858305931091309,
"learning_rate": 2.3777833333333335e-05,
"loss": 1.1886,
"step": 472000
},
{
"epoch": 1.8226447230003893,
"grad_norm": 0.7036804556846619,
"learning_rate": 2.372227777777778e-05,
"loss": 1.1891,
"step": 473000
},
{
"epoch": 1.8264980945077895,
"grad_norm": 0.7008316516876221,
"learning_rate": 2.3666722222222223e-05,
"loss": 1.1883,
"step": 474000
},
{
"epoch": 1.8303514660151898,
"grad_norm": 0.7048190236091614,
"learning_rate": 2.361116666666667e-05,
"loss": 1.189,
"step": 475000
},
{
"epoch": 1.8342048375225903,
"grad_norm": 0.7033438086509705,
"learning_rate": 2.355561111111111e-05,
"loss": 1.1871,
"step": 476000
},
{
"epoch": 1.8380582090299908,
"grad_norm": 0.7042247653007507,
"learning_rate": 2.3500055555555557e-05,
"loss": 1.1878,
"step": 477000
},
{
"epoch": 1.8419115805373911,
"grad_norm": 0.7107093334197998,
"learning_rate": 2.34445e-05,
"loss": 1.1877,
"step": 478000
},
{
"epoch": 1.8457649520447916,
"grad_norm": 0.6645656228065491,
"learning_rate": 2.3388944444444448e-05,
"loss": 1.1872,
"step": 479000
},
{
"epoch": 1.8496183235521921,
"grad_norm": 0.676702618598938,
"learning_rate": 2.333338888888889e-05,
"loss": 1.1867,
"step": 480000
},
{
"epoch": 1.8534716950595924,
"grad_norm": 0.7022708654403687,
"learning_rate": 2.3277833333333336e-05,
"loss": 1.1878,
"step": 481000
},
{
"epoch": 1.8573250665669927,
"grad_norm": 0.6802497506141663,
"learning_rate": 2.322227777777778e-05,
"loss": 1.1866,
"step": 482000
},
{
"epoch": 1.8611784380743932,
"grad_norm": 0.6969447135925293,
"learning_rate": 2.3166722222222224e-05,
"loss": 1.187,
"step": 483000
},
{
"epoch": 1.8650318095817937,
"grad_norm": 0.6957907676696777,
"learning_rate": 2.3111166666666666e-05,
"loss": 1.1869,
"step": 484000
},
{
"epoch": 1.868885181089194,
"grad_norm": 0.70342618227005,
"learning_rate": 2.3055611111111112e-05,
"loss": 1.1861,
"step": 485000
},
{
"epoch": 1.8727385525965943,
"grad_norm": 0.6961056590080261,
"learning_rate": 2.3000055555555558e-05,
"loss": 1.1852,
"step": 486000
},
{
"epoch": 1.8765919241039948,
"grad_norm": 0.6739860773086548,
"learning_rate": 2.2944500000000004e-05,
"loss": 1.1856,
"step": 487000
},
{
"epoch": 1.8804452956113953,
"grad_norm": 0.7163240909576416,
"learning_rate": 2.2888944444444446e-05,
"loss": 1.1852,
"step": 488000
},
{
"epoch": 1.8842986671187956,
"grad_norm": 0.6964929103851318,
"learning_rate": 2.2833388888888892e-05,
"loss": 1.1853,
"step": 489000
},
{
"epoch": 1.8881520386261958,
"grad_norm": 0.6857479214668274,
"learning_rate": 2.2777833333333334e-05,
"loss": 1.1858,
"step": 490000
},
{
"epoch": 1.8920054101335964,
"grad_norm": 0.6846323013305664,
"learning_rate": 2.272227777777778e-05,
"loss": 1.185,
"step": 491000
},
{
"epoch": 1.8958587816409969,
"grad_norm": 0.6962786316871643,
"learning_rate": 2.2666722222222222e-05,
"loss": 1.1845,
"step": 492000
},
{
"epoch": 1.8997121531483971,
"grad_norm": 0.7047570943832397,
"learning_rate": 2.2611166666666668e-05,
"loss": 1.185,
"step": 493000
},
{
"epoch": 1.9035655246557974,
"grad_norm": 0.699938952922821,
"learning_rate": 2.2555611111111114e-05,
"loss": 1.1845,
"step": 494000
},
{
"epoch": 1.9074188961631982,
"grad_norm": 0.6817954182624817,
"learning_rate": 2.2500055555555556e-05,
"loss": 1.1844,
"step": 495000
},
{
"epoch": 1.9112722676705984,
"grad_norm": 0.7055638432502747,
"learning_rate": 2.2444500000000002e-05,
"loss": 1.1848,
"step": 496000
},
{
"epoch": 1.9151256391779987,
"grad_norm": 0.688340425491333,
"learning_rate": 2.2388944444444444e-05,
"loss": 1.1842,
"step": 497000
},
{
"epoch": 1.9189790106853992,
"grad_norm": 0.667679488658905,
"learning_rate": 2.233338888888889e-05,
"loss": 1.1832,
"step": 498000
},
{
"epoch": 1.9228323821927997,
"grad_norm": 0.7045871019363403,
"learning_rate": 2.2277833333333335e-05,
"loss": 1.1823,
"step": 499000
},
{
"epoch": 1.9266857537002,
"grad_norm": 0.7178110480308533,
"learning_rate": 2.2222277777777778e-05,
"loss": 1.1834,
"step": 500000
},
{
"epoch": 1.9305391252076003,
"grad_norm": 0.7002114057540894,
"learning_rate": 2.2166722222222224e-05,
"loss": 1.1842,
"step": 501000
},
{
"epoch": 1.9343924967150008,
"grad_norm": 0.6976704001426697,
"learning_rate": 2.211116666666667e-05,
"loss": 1.1827,
"step": 502000
},
{
"epoch": 1.9382458682224013,
"grad_norm": 0.6923725008964539,
"learning_rate": 2.205561111111111e-05,
"loss": 1.1828,
"step": 503000
},
{
"epoch": 1.9420992397298016,
"grad_norm": 0.6811366081237793,
"learning_rate": 2.2000055555555557e-05,
"loss": 1.1831,
"step": 504000
},
{
"epoch": 1.9459526112372019,
"grad_norm": 0.6838697195053101,
"learning_rate": 2.19445e-05,
"loss": 1.1835,
"step": 505000
},
{
"epoch": 1.9498059827446024,
"grad_norm": 0.7210490703582764,
"learning_rate": 2.1888944444444445e-05,
"loss": 1.1827,
"step": 506000
},
{
"epoch": 1.9536593542520029,
"grad_norm": 0.7091962695121765,
"learning_rate": 2.1833388888888888e-05,
"loss": 1.1826,
"step": 507000
},
{
"epoch": 1.9575127257594032,
"grad_norm": 0.7018596529960632,
"learning_rate": 2.1777833333333334e-05,
"loss": 1.1817,
"step": 508000
},
{
"epoch": 1.9613660972668034,
"grad_norm": 0.6678944826126099,
"learning_rate": 2.1722277777777776e-05,
"loss": 1.1828,
"step": 509000
},
{
"epoch": 1.965219468774204,
"grad_norm": 0.7078109979629517,
"learning_rate": 2.1666722222222225e-05,
"loss": 1.1817,
"step": 510000
},
{
"epoch": 1.9690728402816045,
"grad_norm": 0.7082613110542297,
"learning_rate": 2.1611166666666667e-05,
"loss": 1.1816,
"step": 511000
},
{
"epoch": 1.9729262117890047,
"grad_norm": 0.692307710647583,
"learning_rate": 2.1555611111111113e-05,
"loss": 1.1818,
"step": 512000
},
{
"epoch": 1.9767795832964052,
"grad_norm": 0.6972088813781738,
"learning_rate": 2.1500055555555555e-05,
"loss": 1.1817,
"step": 513000
},
{
"epoch": 1.9806329548038057,
"grad_norm": 0.7020753622055054,
"learning_rate": 2.14445e-05,
"loss": 1.1815,
"step": 514000
},
{
"epoch": 1.984486326311206,
"grad_norm": 0.7120492458343506,
"learning_rate": 2.1388944444444443e-05,
"loss": 1.1814,
"step": 515000
},
{
"epoch": 1.9883396978186063,
"grad_norm": 0.6771794557571411,
"learning_rate": 2.133338888888889e-05,
"loss": 1.1813,
"step": 516000
},
{
"epoch": 1.9921930693260068,
"grad_norm": 0.6771290898323059,
"learning_rate": 2.1277833333333335e-05,
"loss": 1.1799,
"step": 517000
},
{
"epoch": 1.9960464408334073,
"grad_norm": 0.694908082485199,
"learning_rate": 2.122227777777778e-05,
"loss": 1.1807,
"step": 518000
},
{
"epoch": 1.9998998123408076,
"grad_norm": 0.6758216023445129,
"learning_rate": 2.1166722222222223e-05,
"loss": 1.1808,
"step": 519000
},
{
"epoch": 2.003753183848208,
"grad_norm": 0.6828746795654297,
"learning_rate": 2.111116666666667e-05,
"loss": 1.1799,
"step": 520000
},
{
"epoch": 2.007606555355608,
"grad_norm": 0.7079645991325378,
"learning_rate": 2.105561111111111e-05,
"loss": 1.1788,
"step": 521000
},
{
"epoch": 2.011459926863009,
"grad_norm": 0.7006625533103943,
"learning_rate": 2.1000055555555557e-05,
"loss": 1.1796,
"step": 522000
},
{
"epoch": 2.015313298370409,
"grad_norm": 0.7358622550964355,
"learning_rate": 2.09445e-05,
"loss": 1.1794,
"step": 523000
},
{
"epoch": 2.0191666698778095,
"grad_norm": 0.7054480910301208,
"learning_rate": 2.0888944444444445e-05,
"loss": 1.1797,
"step": 524000
},
{
"epoch": 2.02302004138521,
"grad_norm": 0.6997362971305847,
"learning_rate": 2.083338888888889e-05,
"loss": 1.1794,
"step": 525000
},
{
"epoch": 2.0268734128926105,
"grad_norm": 0.7154229879379272,
"learning_rate": 2.0777833333333336e-05,
"loss": 1.1795,
"step": 526000
},
{
"epoch": 2.0307267844000108,
"grad_norm": 0.7037490010261536,
"learning_rate": 2.072227777777778e-05,
"loss": 1.177,
"step": 527000
},
{
"epoch": 2.034580155907411,
"grad_norm": 0.6936154365539551,
"learning_rate": 2.0666722222222224e-05,
"loss": 1.1782,
"step": 528000
},
{
"epoch": 2.0384335274148118,
"grad_norm": 0.6885814070701599,
"learning_rate": 2.0611166666666667e-05,
"loss": 1.1787,
"step": 529000
},
{
"epoch": 2.042286898922212,
"grad_norm": 0.7303010821342468,
"learning_rate": 2.0555611111111112e-05,
"loss": 1.1779,
"step": 530000
},
{
"epoch": 2.0461402704296123,
"grad_norm": 0.7179313898086548,
"learning_rate": 2.0500055555555555e-05,
"loss": 1.1774,
"step": 531000
},
{
"epoch": 2.0499936419370126,
"grad_norm": 0.70022052526474,
"learning_rate": 2.04445e-05,
"loss": 1.1781,
"step": 532000
},
{
"epoch": 2.0538470134444133,
"grad_norm": 0.6912038326263428,
"learning_rate": 2.0388944444444446e-05,
"loss": 1.1772,
"step": 533000
},
{
"epoch": 2.0577003849518136,
"grad_norm": 0.6987645030021667,
"learning_rate": 2.0333388888888892e-05,
"loss": 1.1774,
"step": 534000
},
{
"epoch": 2.061553756459214,
"grad_norm": 0.7104570269584656,
"learning_rate": 2.0277833333333334e-05,
"loss": 1.1772,
"step": 535000
},
{
"epoch": 2.065407127966614,
"grad_norm": 0.6777941584587097,
"learning_rate": 2.022227777777778e-05,
"loss": 1.1771,
"step": 536000
},
{
"epoch": 2.069260499474015,
"grad_norm": 0.7318848967552185,
"learning_rate": 2.0166722222222222e-05,
"loss": 1.1767,
"step": 537000
},
{
"epoch": 2.073113870981415,
"grad_norm": 0.6985452175140381,
"learning_rate": 2.0111166666666668e-05,
"loss": 1.1777,
"step": 538000
},
{
"epoch": 2.0769672424888155,
"grad_norm": 0.6963249444961548,
"learning_rate": 2.005561111111111e-05,
"loss": 1.1771,
"step": 539000
},
{
"epoch": 2.0808206139962158,
"grad_norm": 0.7212308049201965,
"learning_rate": 2.0000055555555556e-05,
"loss": 1.1763,
"step": 540000
},
{
"epoch": 2.0846739855036165,
"grad_norm": 0.7275625467300415,
"learning_rate": 1.9944500000000002e-05,
"loss": 1.1766,
"step": 541000
},
{
"epoch": 2.0885273570110168,
"grad_norm": 0.6992737054824829,
"learning_rate": 1.9888944444444448e-05,
"loss": 1.1768,
"step": 542000
},
{
"epoch": 2.092380728518417,
"grad_norm": 0.736659049987793,
"learning_rate": 1.983338888888889e-05,
"loss": 1.1757,
"step": 543000
},
{
"epoch": 2.096234100025818,
"grad_norm": 0.6867293119430542,
"learning_rate": 1.9777833333333336e-05,
"loss": 1.1771,
"step": 544000
},
{
"epoch": 2.100087471533218,
"grad_norm": 0.7108346819877625,
"learning_rate": 1.9722277777777778e-05,
"loss": 1.1763,
"step": 545000
},
{
"epoch": 2.1039408430406183,
"grad_norm": 0.7174147367477417,
"learning_rate": 1.9666722222222224e-05,
"loss": 1.1741,
"step": 546000
},
{
"epoch": 2.1077942145480186,
"grad_norm": 0.6843900680541992,
"learning_rate": 1.9611166666666666e-05,
"loss": 1.1761,
"step": 547000
},
{
"epoch": 2.1116475860554194,
"grad_norm": 0.7024357318878174,
"learning_rate": 1.9555611111111112e-05,
"loss": 1.1747,
"step": 548000
},
{
"epoch": 2.1155009575628196,
"grad_norm": 0.7079586386680603,
"learning_rate": 1.9500055555555558e-05,
"loss": 1.1756,
"step": 549000
},
{
"epoch": 2.11935432907022,
"grad_norm": 0.7204769253730774,
"learning_rate": 1.94445e-05,
"loss": 1.1751,
"step": 550000
},
{
"epoch": 2.12320770057762,
"grad_norm": 0.6855219006538391,
"learning_rate": 1.9388944444444446e-05,
"loss": 1.1743,
"step": 551000
},
{
"epoch": 2.127061072085021,
"grad_norm": 0.7142133712768555,
"learning_rate": 1.933338888888889e-05,
"loss": 1.1749,
"step": 552000
},
{
"epoch": 2.130914443592421,
"grad_norm": 0.716454803943634,
"learning_rate": 1.9277833333333334e-05,
"loss": 1.1742,
"step": 553000
},
{
"epoch": 2.1347678150998215,
"grad_norm": 0.734761118888855,
"learning_rate": 1.922227777777778e-05,
"loss": 1.175,
"step": 554000
},
{
"epoch": 2.138621186607222,
"grad_norm": 0.7150977849960327,
"learning_rate": 1.9166722222222222e-05,
"loss": 1.1741,
"step": 555000
},
{
"epoch": 2.1424745581146225,
"grad_norm": 0.7086408734321594,
"learning_rate": 1.9111166666666668e-05,
"loss": 1.1733,
"step": 556000
},
{
"epoch": 2.146327929622023,
"grad_norm": 0.689511775970459,
"learning_rate": 1.9055611111111113e-05,
"loss": 1.1736,
"step": 557000
},
{
"epoch": 2.150181301129423,
"grad_norm": 0.6908608078956604,
"learning_rate": 1.9000055555555556e-05,
"loss": 1.1734,
"step": 558000
},
{
"epoch": 2.154034672636824,
"grad_norm": 0.7414750456809998,
"learning_rate": 1.89445e-05,
"loss": 1.1735,
"step": 559000
},
{
"epoch": 2.157888044144224,
"grad_norm": 0.6861109137535095,
"learning_rate": 1.8888944444444444e-05,
"loss": 1.1739,
"step": 560000
},
{
"epoch": 2.1617414156516244,
"grad_norm": 0.6974884271621704,
"learning_rate": 1.883338888888889e-05,
"loss": 1.1732,
"step": 561000
},
{
"epoch": 2.1655947871590246,
"grad_norm": 0.7133603692054749,
"learning_rate": 1.8777833333333332e-05,
"loss": 1.1729,
"step": 562000
},
{
"epoch": 2.1694481586664254,
"grad_norm": 0.7137395143508911,
"learning_rate": 1.8722277777777777e-05,
"loss": 1.1733,
"step": 563000
},
{
"epoch": 2.1733015301738257,
"grad_norm": 0.724099338054657,
"learning_rate": 1.8666722222222223e-05,
"loss": 1.1725,
"step": 564000
},
{
"epoch": 2.177154901681226,
"grad_norm": 0.7194118499755859,
"learning_rate": 1.861116666666667e-05,
"loss": 1.1725,
"step": 565000
},
{
"epoch": 2.1810082731886262,
"grad_norm": 0.7033548355102539,
"learning_rate": 1.855561111111111e-05,
"loss": 1.172,
"step": 566000
},
{
"epoch": 2.184861644696027,
"grad_norm": 0.6971196532249451,
"learning_rate": 1.8500055555555557e-05,
"loss": 1.1714,
"step": 567000
},
{
"epoch": 2.1887150162034272,
"grad_norm": 0.7256486415863037,
"learning_rate": 1.84445e-05,
"loss": 1.1723,
"step": 568000
},
{
"epoch": 2.1925683877108275,
"grad_norm": 0.6980853080749512,
"learning_rate": 1.8388944444444445e-05,
"loss": 1.1714,
"step": 569000
},
{
"epoch": 2.196421759218228,
"grad_norm": 0.7225170731544495,
"learning_rate": 1.8333388888888887e-05,
"loss": 1.1721,
"step": 570000
},
{
"epoch": 2.2002751307256285,
"grad_norm": 0.7157464623451233,
"learning_rate": 1.8277833333333333e-05,
"loss": 1.1718,
"step": 571000
},
{
"epoch": 2.204128502233029,
"grad_norm": 0.6965727806091309,
"learning_rate": 1.822227777777778e-05,
"loss": 1.1719,
"step": 572000
},
{
"epoch": 2.207981873740429,
"grad_norm": 0.6944066882133484,
"learning_rate": 1.8166722222222225e-05,
"loss": 1.1716,
"step": 573000
},
{
"epoch": 2.21183524524783,
"grad_norm": 0.709082841873169,
"learning_rate": 1.8111166666666667e-05,
"loss": 1.1719,
"step": 574000
},
{
"epoch": 2.21568861675523,
"grad_norm": 0.7087362408638,
"learning_rate": 1.8055611111111113e-05,
"loss": 1.1707,
"step": 575000
},
{
"epoch": 2.2195419882626304,
"grad_norm": 0.7397356033325195,
"learning_rate": 1.8000055555555555e-05,
"loss": 1.1713,
"step": 576000
},
{
"epoch": 2.2233953597700307,
"grad_norm": 0.7147518396377563,
"learning_rate": 1.79445e-05,
"loss": 1.1705,
"step": 577000
},
{
"epoch": 2.2272487312774314,
"grad_norm": 0.7075289487838745,
"learning_rate": 1.7888944444444443e-05,
"loss": 1.1707,
"step": 578000
},
{
"epoch": 2.2311021027848317,
"grad_norm": 0.7047394514083862,
"learning_rate": 1.783338888888889e-05,
"loss": 1.1696,
"step": 579000
},
{
"epoch": 2.234955474292232,
"grad_norm": 0.7000110149383545,
"learning_rate": 1.7777833333333335e-05,
"loss": 1.1711,
"step": 580000
},
{
"epoch": 2.2388088457996322,
"grad_norm": 0.7090974450111389,
"learning_rate": 1.772227777777778e-05,
"loss": 1.1701,
"step": 581000
},
{
"epoch": 2.242662217307033,
"grad_norm": 0.7133679389953613,
"learning_rate": 1.7666722222222223e-05,
"loss": 1.1701,
"step": 582000
},
{
"epoch": 2.2465155888144333,
"grad_norm": 0.69938063621521,
"learning_rate": 1.761116666666667e-05,
"loss": 1.1692,
"step": 583000
},
{
"epoch": 2.2503689603218335,
"grad_norm": 0.7067489624023438,
"learning_rate": 1.755561111111111e-05,
"loss": 1.1703,
"step": 584000
},
{
"epoch": 2.254222331829234,
"grad_norm": 0.7081299424171448,
"learning_rate": 1.7500055555555556e-05,
"loss": 1.1689,
"step": 585000
},
{
"epoch": 2.2580757033366345,
"grad_norm": 0.71224445104599,
"learning_rate": 1.74445e-05,
"loss": 1.169,
"step": 586000
},
{
"epoch": 2.261929074844035,
"grad_norm": 0.7073362469673157,
"learning_rate": 1.7388944444444448e-05,
"loss": 1.1691,
"step": 587000
},
{
"epoch": 2.265782446351435,
"grad_norm": 0.7043183445930481,
"learning_rate": 1.733338888888889e-05,
"loss": 1.1703,
"step": 588000
},
{
"epoch": 2.269635817858836,
"grad_norm": 0.6992381811141968,
"learning_rate": 1.7277833333333336e-05,
"loss": 1.1689,
"step": 589000
},
{
"epoch": 2.273489189366236,
"grad_norm": 0.6977965235710144,
"learning_rate": 1.7222277777777778e-05,
"loss": 1.1687,
"step": 590000
},
{
"epoch": 2.2773425608736364,
"grad_norm": 0.7121700644493103,
"learning_rate": 1.7166722222222224e-05,
"loss": 1.168,
"step": 591000
},
{
"epoch": 2.2811959323810367,
"grad_norm": 0.7015835642814636,
"learning_rate": 1.7111166666666666e-05,
"loss": 1.1687,
"step": 592000
},
{
"epoch": 2.285049303888437,
"grad_norm": 0.6966621279716492,
"learning_rate": 1.7055611111111112e-05,
"loss": 1.1675,
"step": 593000
},
{
"epoch": 2.2889026753958377,
"grad_norm": 0.7022546529769897,
"learning_rate": 1.7000055555555554e-05,
"loss": 1.1685,
"step": 594000
},
{
"epoch": 2.292756046903238,
"grad_norm": 0.7172012329101562,
"learning_rate": 1.6944500000000004e-05,
"loss": 1.1675,
"step": 595000
},
{
"epoch": 2.2966094184106383,
"grad_norm": 0.7011358141899109,
"learning_rate": 1.6888944444444446e-05,
"loss": 1.1673,
"step": 596000
},
{
"epoch": 2.300462789918039,
"grad_norm": 0.684775710105896,
"learning_rate": 1.683338888888889e-05,
"loss": 1.1682,
"step": 597000
},
{
"epoch": 2.3043161614254393,
"grad_norm": 0.7131794095039368,
"learning_rate": 1.6777833333333334e-05,
"loss": 1.1677,
"step": 598000
},
{
"epoch": 2.3081695329328396,
"grad_norm": 0.7285271883010864,
"learning_rate": 1.672227777777778e-05,
"loss": 1.1672,
"step": 599000
},
{
"epoch": 2.31202290444024,
"grad_norm": 0.7372791171073914,
"learning_rate": 1.6666722222222222e-05,
"loss": 1.1671,
"step": 600000
},
{
"epoch": 2.3158762759476406,
"grad_norm": 0.7125265002250671,
"learning_rate": 1.6611166666666668e-05,
"loss": 1.1678,
"step": 601000
},
{
"epoch": 2.319729647455041,
"grad_norm": 0.7177942395210266,
"learning_rate": 1.655561111111111e-05,
"loss": 1.1663,
"step": 602000
},
{
"epoch": 2.323583018962441,
"grad_norm": 0.7199053168296814,
"learning_rate": 1.6500055555555556e-05,
"loss": 1.1658,
"step": 603000
},
{
"epoch": 2.3274363904698414,
"grad_norm": 0.7532017827033997,
"learning_rate": 1.64445e-05,
"loss": 1.1664,
"step": 604000
},
{
"epoch": 2.331289761977242,
"grad_norm": 0.7073883414268494,
"learning_rate": 1.6388944444444447e-05,
"loss": 1.1658,
"step": 605000
},
{
"epoch": 2.3351431334846424,
"grad_norm": 0.748778223991394,
"learning_rate": 1.633338888888889e-05,
"loss": 1.1666,
"step": 606000
},
{
"epoch": 2.3389965049920427,
"grad_norm": 0.722709596157074,
"learning_rate": 1.6277833333333335e-05,
"loss": 1.1663,
"step": 607000
},
{
"epoch": 2.342849876499443,
"grad_norm": 0.7432075142860413,
"learning_rate": 1.6222277777777778e-05,
"loss": 1.1664,
"step": 608000
},
{
"epoch": 2.3467032480068437,
"grad_norm": 0.7249975204467773,
"learning_rate": 1.6166722222222223e-05,
"loss": 1.1662,
"step": 609000
},
{
"epoch": 2.350556619514244,
"grad_norm": 0.7158792614936829,
"learning_rate": 1.6111166666666666e-05,
"loss": 1.1652,
"step": 610000
},
{
"epoch": 2.3544099910216443,
"grad_norm": 0.703758180141449,
"learning_rate": 1.605561111111111e-05,
"loss": 1.1647,
"step": 611000
},
{
"epoch": 2.358263362529045,
"grad_norm": 0.7096312046051025,
"learning_rate": 1.6000055555555557e-05,
"loss": 1.1649,
"step": 612000
},
{
"epoch": 2.3621167340364453,
"grad_norm": 0.7235798239707947,
"learning_rate": 1.59445e-05,
"loss": 1.166,
"step": 613000
},
{
"epoch": 2.3659701055438456,
"grad_norm": 0.7283411026000977,
"learning_rate": 1.5888944444444445e-05,
"loss": 1.1656,
"step": 614000
},
{
"epoch": 2.369823477051246,
"grad_norm": 0.6973391175270081,
"learning_rate": 1.5833388888888888e-05,
"loss": 1.1662,
"step": 615000
},
{
"epoch": 2.3736768485586466,
"grad_norm": 0.6956027150154114,
"learning_rate": 1.5777833333333333e-05,
"loss": 1.1642,
"step": 616000
},
{
"epoch": 2.377530220066047,
"grad_norm": 0.7057496309280396,
"learning_rate": 1.572227777777778e-05,
"loss": 1.165,
"step": 617000
},
{
"epoch": 2.381383591573447,
"grad_norm": 0.7290093302726746,
"learning_rate": 1.5666722222222225e-05,
"loss": 1.1643,
"step": 618000
},
{
"epoch": 2.3852369630808474,
"grad_norm": 0.7349111437797546,
"learning_rate": 1.5611166666666667e-05,
"loss": 1.1639,
"step": 619000
},
{
"epoch": 2.389090334588248,
"grad_norm": 0.6929183006286621,
"learning_rate": 1.5555611111111113e-05,
"loss": 1.1647,
"step": 620000
},
{
"epoch": 2.3929437060956484,
"grad_norm": 0.7300617694854736,
"learning_rate": 1.5500055555555555e-05,
"loss": 1.1643,
"step": 621000
},
{
"epoch": 2.3967970776030487,
"grad_norm": 0.7160629034042358,
"learning_rate": 1.54445e-05,
"loss": 1.1639,
"step": 622000
},
{
"epoch": 2.400650449110449,
"grad_norm": 0.7036823034286499,
"learning_rate": 1.5388944444444443e-05,
"loss": 1.1641,
"step": 623000
},
{
"epoch": 2.4045038206178497,
"grad_norm": 0.7350702285766602,
"learning_rate": 1.533338888888889e-05,
"loss": 1.1636,
"step": 624000
},
{
"epoch": 2.40835719212525,
"grad_norm": 0.7381563186645508,
"learning_rate": 1.527783333333333e-05,
"loss": 1.1649,
"step": 625000
},
{
"epoch": 2.4122105636326503,
"grad_norm": 0.7308299541473389,
"learning_rate": 1.522227777777778e-05,
"loss": 1.1638,
"step": 626000
},
{
"epoch": 2.416063935140051,
"grad_norm": 0.7187788486480713,
"learning_rate": 1.5166722222222225e-05,
"loss": 1.1637,
"step": 627000
},
{
"epoch": 2.4199173066474513,
"grad_norm": 0.7224980592727661,
"learning_rate": 1.5111166666666669e-05,
"loss": 1.1623,
"step": 628000
},
{
"epoch": 2.4237706781548516,
"grad_norm": 0.7195943593978882,
"learning_rate": 1.5055611111111113e-05,
"loss": 1.1635,
"step": 629000
},
{
"epoch": 2.427624049662252,
"grad_norm": 0.7227275371551514,
"learning_rate": 1.5000055555555557e-05,
"loss": 1.1625,
"step": 630000
},
{
"epoch": 2.431477421169652,
"grad_norm": 0.7128928899765015,
"learning_rate": 1.49445e-05,
"loss": 1.1627,
"step": 631000
},
{
"epoch": 2.435330792677053,
"grad_norm": 0.6905311346054077,
"learning_rate": 1.4888944444444445e-05,
"loss": 1.1627,
"step": 632000
},
{
"epoch": 2.439184164184453,
"grad_norm": 0.7287958860397339,
"learning_rate": 1.4833388888888889e-05,
"loss": 1.1635,
"step": 633000
},
{
"epoch": 2.4430375356918534,
"grad_norm": 0.7288080453872681,
"learning_rate": 1.4777833333333334e-05,
"loss": 1.1615,
"step": 634000
},
{
"epoch": 2.446890907199254,
"grad_norm": 0.7119095921516418,
"learning_rate": 1.4722277777777778e-05,
"loss": 1.1626,
"step": 635000
},
{
"epoch": 2.4507442787066545,
"grad_norm": 0.7402783036231995,
"learning_rate": 1.4666722222222223e-05,
"loss": 1.1625,
"step": 636000
},
{
"epoch": 2.4545976502140547,
"grad_norm": 0.7251101732254028,
"learning_rate": 1.4611166666666668e-05,
"loss": 1.162,
"step": 637000
},
{
"epoch": 2.458451021721455,
"grad_norm": 0.7154144644737244,
"learning_rate": 1.4555611111111112e-05,
"loss": 1.1629,
"step": 638000
},
{
"epoch": 2.4623043932288557,
"grad_norm": 0.7547957301139832,
"learning_rate": 1.4500055555555556e-05,
"loss": 1.1618,
"step": 639000
},
{
"epoch": 2.466157764736256,
"grad_norm": 0.7051396369934082,
"learning_rate": 1.44445e-05,
"loss": 1.1625,
"step": 640000
},
{
"epoch": 2.4700111362436563,
"grad_norm": 0.7302169799804688,
"learning_rate": 1.4388944444444444e-05,
"loss": 1.1619,
"step": 641000
},
{
"epoch": 2.4738645077510566,
"grad_norm": 0.7116556167602539,
"learning_rate": 1.433338888888889e-05,
"loss": 1.1615,
"step": 642000
},
{
"epoch": 2.4777178792584573,
"grad_norm": 0.6997031569480896,
"learning_rate": 1.4277833333333334e-05,
"loss": 1.1609,
"step": 643000
},
{
"epoch": 2.4815712507658576,
"grad_norm": 0.7330045700073242,
"learning_rate": 1.4222277777777778e-05,
"loss": 1.1608,
"step": 644000
},
{
"epoch": 2.485424622273258,
"grad_norm": 0.7158441543579102,
"learning_rate": 1.4166722222222222e-05,
"loss": 1.1612,
"step": 645000
},
{
"epoch": 2.489277993780658,
"grad_norm": 0.7308253049850464,
"learning_rate": 1.4111166666666666e-05,
"loss": 1.1608,
"step": 646000
},
{
"epoch": 2.493131365288059,
"grad_norm": 0.746481716632843,
"learning_rate": 1.405561111111111e-05,
"loss": 1.1614,
"step": 647000
},
{
"epoch": 2.496984736795459,
"grad_norm": 0.740993320941925,
"learning_rate": 1.4000055555555554e-05,
"loss": 1.1613,
"step": 648000
},
{
"epoch": 2.5008381083028595,
"grad_norm": 0.7341724038124084,
"learning_rate": 1.3944500000000002e-05,
"loss": 1.1603,
"step": 649000
},
{
"epoch": 2.50469147981026,
"grad_norm": 0.7422592639923096,
"learning_rate": 1.3888944444444446e-05,
"loss": 1.1598,
"step": 650000
},
{
"epoch": 2.5085448513176605,
"grad_norm": 0.7050088047981262,
"learning_rate": 1.383338888888889e-05,
"loss": 1.1599,
"step": 651000
},
{
"epoch": 2.5123982228250608,
"grad_norm": 0.7259443998336792,
"learning_rate": 1.3777833333333334e-05,
"loss": 1.1595,
"step": 652000
},
{
"epoch": 2.516251594332461,
"grad_norm": 0.7026506066322327,
"learning_rate": 1.3722277777777778e-05,
"loss": 1.16,
"step": 653000
},
{
"epoch": 2.5201049658398613,
"grad_norm": 0.7095285058021545,
"learning_rate": 1.3666722222222222e-05,
"loss": 1.1597,
"step": 654000
},
{
"epoch": 2.523958337347262,
"grad_norm": 0.7308704853057861,
"learning_rate": 1.3611166666666666e-05,
"loss": 1.1602,
"step": 655000
},
{
"epoch": 2.5278117088546623,
"grad_norm": 0.7496655583381653,
"learning_rate": 1.355561111111111e-05,
"loss": 1.1587,
"step": 656000
},
{
"epoch": 2.531665080362063,
"grad_norm": 0.7287865877151489,
"learning_rate": 1.3500055555555557e-05,
"loss": 1.1595,
"step": 657000
},
{
"epoch": 2.5355184518694633,
"grad_norm": 0.740280032157898,
"learning_rate": 1.3444500000000001e-05,
"loss": 1.1586,
"step": 658000
},
{
"epoch": 2.5393718233768636,
"grad_norm": 0.7381448149681091,
"learning_rate": 1.3388944444444446e-05,
"loss": 1.1599,
"step": 659000
},
{
"epoch": 2.543225194884264,
"grad_norm": 0.7290709614753723,
"learning_rate": 1.333338888888889e-05,
"loss": 1.1589,
"step": 660000
},
{
"epoch": 2.547078566391664,
"grad_norm": 0.7309290766716003,
"learning_rate": 1.3277833333333334e-05,
"loss": 1.1583,
"step": 661000
},
{
"epoch": 2.550931937899065,
"grad_norm": 0.7135186195373535,
"learning_rate": 1.3222277777777778e-05,
"loss": 1.1586,
"step": 662000
},
{
"epoch": 2.554785309406465,
"grad_norm": 0.7129361033439636,
"learning_rate": 1.3166722222222222e-05,
"loss": 1.1583,
"step": 663000
},
{
"epoch": 2.5586386809138655,
"grad_norm": 0.7506452798843384,
"learning_rate": 1.3111166666666666e-05,
"loss": 1.159,
"step": 664000
},
{
"epoch": 2.562492052421266,
"grad_norm": 0.7159491181373596,
"learning_rate": 1.3055611111111113e-05,
"loss": 1.158,
"step": 665000
},
{
"epoch": 2.5663454239286665,
"grad_norm": 0.7607939839363098,
"learning_rate": 1.3000055555555557e-05,
"loss": 1.1584,
"step": 666000
},
{
"epoch": 2.5701987954360668,
"grad_norm": 0.737775444984436,
"learning_rate": 1.2944500000000001e-05,
"loss": 1.1576,
"step": 667000
},
{
"epoch": 2.574052166943467,
"grad_norm": 0.7362163662910461,
"learning_rate": 1.2888944444444445e-05,
"loss": 1.158,
"step": 668000
},
{
"epoch": 2.5779055384508673,
"grad_norm": 0.7213948369026184,
"learning_rate": 1.283338888888889e-05,
"loss": 1.1582,
"step": 669000
},
{
"epoch": 2.581758909958268,
"grad_norm": 0.7307848334312439,
"learning_rate": 1.2777833333333333e-05,
"loss": 1.158,
"step": 670000
},
{
"epoch": 2.5856122814656683,
"grad_norm": 0.7397758960723877,
"learning_rate": 1.2722277777777777e-05,
"loss": 1.1583,
"step": 671000
},
{
"epoch": 2.5894656529730686,
"grad_norm": 0.730469822883606,
"learning_rate": 1.2666722222222221e-05,
"loss": 1.1574,
"step": 672000
},
{
"epoch": 2.5933190244804694,
"grad_norm": 0.7288331985473633,
"learning_rate": 1.2611166666666669e-05,
"loss": 1.1568,
"step": 673000
},
{
"epoch": 2.5971723959878696,
"grad_norm": 0.7394465208053589,
"learning_rate": 1.2555611111111113e-05,
"loss": 1.1573,
"step": 674000
},
{
"epoch": 2.60102576749527,
"grad_norm": 0.7209343314170837,
"learning_rate": 1.2500055555555557e-05,
"loss": 1.1566,
"step": 675000
},
{
"epoch": 2.60487913900267,
"grad_norm": 0.7466188669204712,
"learning_rate": 1.2444500000000001e-05,
"loss": 1.1567,
"step": 676000
},
{
"epoch": 2.608732510510071,
"grad_norm": 0.7297884821891785,
"learning_rate": 1.2388944444444445e-05,
"loss": 1.1562,
"step": 677000
},
{
"epoch": 2.612585882017471,
"grad_norm": 0.7409054040908813,
"learning_rate": 1.233338888888889e-05,
"loss": 1.1563,
"step": 678000
},
{
"epoch": 2.6164392535248715,
"grad_norm": 0.7495500445365906,
"learning_rate": 1.2277833333333335e-05,
"loss": 1.1568,
"step": 679000
},
{
"epoch": 2.6202926250322722,
"grad_norm": 0.7482118606567383,
"learning_rate": 1.2222277777777779e-05,
"loss": 1.1559,
"step": 680000
},
{
"epoch": 2.6241459965396725,
"grad_norm": 0.7128192782402039,
"learning_rate": 1.2166722222222223e-05,
"loss": 1.1564,
"step": 681000
},
{
"epoch": 2.627999368047073,
"grad_norm": 0.7315691709518433,
"learning_rate": 1.2111166666666668e-05,
"loss": 1.1555,
"step": 682000
},
{
"epoch": 2.631852739554473,
"grad_norm": 0.7355465292930603,
"learning_rate": 1.2055611111111113e-05,
"loss": 1.1555,
"step": 683000
},
{
"epoch": 2.6357061110618734,
"grad_norm": 0.7146631479263306,
"learning_rate": 1.2000055555555557e-05,
"loss": 1.1553,
"step": 684000
},
{
"epoch": 2.639559482569274,
"grad_norm": 0.7412270903587341,
"learning_rate": 1.19445e-05,
"loss": 1.1554,
"step": 685000
},
{
"epoch": 2.6434128540766744,
"grad_norm": 0.7790284752845764,
"learning_rate": 1.1888944444444446e-05,
"loss": 1.1557,
"step": 686000
},
{
"epoch": 2.6472662255840747,
"grad_norm": 0.730482280254364,
"learning_rate": 1.183338888888889e-05,
"loss": 1.1553,
"step": 687000
},
{
"epoch": 2.6511195970914754,
"grad_norm": 0.7502373456954956,
"learning_rate": 1.1777833333333334e-05,
"loss": 1.1556,
"step": 688000
},
{
"epoch": 2.6549729685988757,
"grad_norm": 0.7493919730186462,
"learning_rate": 1.1722277777777778e-05,
"loss": 1.1549,
"step": 689000
},
{
"epoch": 2.658826340106276,
"grad_norm": 0.7542137503623962,
"learning_rate": 1.1666722222222224e-05,
"loss": 1.1555,
"step": 690000
},
{
"epoch": 2.6626797116136762,
"grad_norm": 0.7587451338768005,
"learning_rate": 1.1611166666666668e-05,
"loss": 1.1552,
"step": 691000
},
{
"epoch": 2.666533083121077,
"grad_norm": 0.7310791611671448,
"learning_rate": 1.1555611111111112e-05,
"loss": 1.1546,
"step": 692000
},
{
"epoch": 2.6703864546284772,
"grad_norm": 0.7424149513244629,
"learning_rate": 1.1500055555555556e-05,
"loss": 1.1546,
"step": 693000
},
{
"epoch": 2.6742398261358775,
"grad_norm": 0.7413462996482849,
"learning_rate": 1.14445e-05,
"loss": 1.1544,
"step": 694000
},
{
"epoch": 2.6780931976432782,
"grad_norm": 0.76901775598526,
"learning_rate": 1.1388944444444444e-05,
"loss": 1.1544,
"step": 695000
},
{
"epoch": 2.6819465691506785,
"grad_norm": 0.736726701259613,
"learning_rate": 1.133338888888889e-05,
"loss": 1.154,
"step": 696000
},
{
"epoch": 2.685799940658079,
"grad_norm": 0.7254749536514282,
"learning_rate": 1.1277833333333334e-05,
"loss": 1.1545,
"step": 697000
},
{
"epoch": 2.689653312165479,
"grad_norm": 0.7280982136726379,
"learning_rate": 1.1222277777777778e-05,
"loss": 1.1545,
"step": 698000
},
{
"epoch": 2.6935066836728794,
"grad_norm": 0.722722053527832,
"learning_rate": 1.1166722222222222e-05,
"loss": 1.1538,
"step": 699000
},
{
"epoch": 2.69736005518028,
"grad_norm": 0.7584505081176758,
"learning_rate": 1.1111166666666666e-05,
"loss": 1.1535,
"step": 700000
},
{
"epoch": 2.7012134266876804,
"grad_norm": 0.7559113502502441,
"learning_rate": 1.105561111111111e-05,
"loss": 1.1539,
"step": 701000
},
{
"epoch": 2.7050667981950807,
"grad_norm": 0.735378086566925,
"learning_rate": 1.1000055555555556e-05,
"loss": 1.1538,
"step": 702000
},
{
"epoch": 2.7089201697024814,
"grad_norm": 0.7334083318710327,
"learning_rate": 1.09445e-05,
"loss": 1.1538,
"step": 703000
},
{
"epoch": 2.7127735412098817,
"grad_norm": 0.7132292985916138,
"learning_rate": 1.0888944444444444e-05,
"loss": 1.1529,
"step": 704000
},
{
"epoch": 2.716626912717282,
"grad_norm": 0.7504042983055115,
"learning_rate": 1.0833388888888888e-05,
"loss": 1.1522,
"step": 705000
},
{
"epoch": 2.7204802842246822,
"grad_norm": 0.7536928057670593,
"learning_rate": 1.0777833333333334e-05,
"loss": 1.1532,
"step": 706000
},
{
"epoch": 2.7243336557320825,
"grad_norm": 0.7525560259819031,
"learning_rate": 1.0722277777777778e-05,
"loss": 1.1531,
"step": 707000
},
{
"epoch": 2.7281870272394833,
"grad_norm": 0.7648515701293945,
"learning_rate": 1.0666722222222222e-05,
"loss": 1.1537,
"step": 708000
},
{
"epoch": 2.7320403987468835,
"grad_norm": 0.7191064953804016,
"learning_rate": 1.0611166666666668e-05,
"loss": 1.1532,
"step": 709000
},
{
"epoch": 2.7358937702542843,
"grad_norm": 0.7417696714401245,
"learning_rate": 1.0555611111111112e-05,
"loss": 1.1536,
"step": 710000
},
{
"epoch": 2.7397471417616845,
"grad_norm": 0.7305838465690613,
"learning_rate": 1.0500055555555556e-05,
"loss": 1.153,
"step": 711000
},
{
"epoch": 2.743600513269085,
"grad_norm": 0.7491075992584229,
"learning_rate": 1.04445e-05,
"loss": 1.1526,
"step": 712000
},
{
"epoch": 2.747453884776485,
"grad_norm": 0.7386889457702637,
"learning_rate": 1.0388944444444445e-05,
"loss": 1.1523,
"step": 713000
},
{
"epoch": 2.7513072562838854,
"grad_norm": 0.7188745737075806,
"learning_rate": 1.033338888888889e-05,
"loss": 1.1524,
"step": 714000
},
{
"epoch": 2.755160627791286,
"grad_norm": 0.7587524652481079,
"learning_rate": 1.0277833333333333e-05,
"loss": 1.1523,
"step": 715000
},
{
"epoch": 2.7590139992986864,
"grad_norm": 0.74712073802948,
"learning_rate": 1.0222277777777778e-05,
"loss": 1.1523,
"step": 716000
},
{
"epoch": 2.7628673708060867,
"grad_norm": 0.700933039188385,
"learning_rate": 1.0166722222222223e-05,
"loss": 1.1517,
"step": 717000
},
{
"epoch": 2.7667207423134874,
"grad_norm": 0.7546241879463196,
"learning_rate": 1.0111166666666667e-05,
"loss": 1.1515,
"step": 718000
},
{
"epoch": 2.7705741138208877,
"grad_norm": 0.7323986291885376,
"learning_rate": 1.0055611111111111e-05,
"loss": 1.1522,
"step": 719000
},
{
"epoch": 2.774427485328288,
"grad_norm": 0.7574964761734009,
"learning_rate": 1.0000055555555555e-05,
"loss": 1.1516,
"step": 720000
},
{
"epoch": 2.7782808568356883,
"grad_norm": 0.7652174234390259,
"learning_rate": 9.944500000000001e-06,
"loss": 1.1512,
"step": 721000
},
{
"epoch": 2.7821342283430885,
"grad_norm": 0.7431154251098633,
"learning_rate": 9.888944444444445e-06,
"loss": 1.1515,
"step": 722000
},
{
"epoch": 2.7859875998504893,
"grad_norm": 0.7359040975570679,
"learning_rate": 9.83338888888889e-06,
"loss": 1.1507,
"step": 723000
},
{
"epoch": 2.7898409713578896,
"grad_norm": 0.7209459543228149,
"learning_rate": 9.777833333333333e-06,
"loss": 1.1515,
"step": 724000
},
{
"epoch": 2.79369434286529,
"grad_norm": 0.7616684436798096,
"learning_rate": 9.722277777777779e-06,
"loss": 1.1509,
"step": 725000
},
{
"epoch": 2.7975477143726906,
"grad_norm": 0.7581999897956848,
"learning_rate": 9.666722222222223e-06,
"loss": 1.1512,
"step": 726000
},
{
"epoch": 2.801401085880091,
"grad_norm": 0.7473781704902649,
"learning_rate": 9.611166666666667e-06,
"loss": 1.1516,
"step": 727000
},
{
"epoch": 2.805254457387491,
"grad_norm": 0.7421666383743286,
"learning_rate": 9.555611111111111e-06,
"loss": 1.1503,
"step": 728000
},
{
"epoch": 2.8091078288948914,
"grad_norm": 0.7370326519012451,
"learning_rate": 9.500055555555557e-06,
"loss": 1.1512,
"step": 729000
},
{
"epoch": 2.812961200402292,
"grad_norm": 0.7482893466949463,
"learning_rate": 9.4445e-06,
"loss": 1.1502,
"step": 730000
},
{
"epoch": 2.8168145719096924,
"grad_norm": 0.7539274096488953,
"learning_rate": 9.388944444444445e-06,
"loss": 1.1499,
"step": 731000
},
{
"epoch": 2.8206679434170927,
"grad_norm": 0.7609211802482605,
"learning_rate": 9.333388888888889e-06,
"loss": 1.1502,
"step": 732000
},
{
"epoch": 2.8245213149244934,
"grad_norm": 0.7571685314178467,
"learning_rate": 9.277833333333335e-06,
"loss": 1.1501,
"step": 733000
},
{
"epoch": 2.8283746864318937,
"grad_norm": 0.7442651987075806,
"learning_rate": 9.222277777777779e-06,
"loss": 1.1505,
"step": 734000
},
{
"epoch": 2.832228057939294,
"grad_norm": 0.7343342304229736,
"learning_rate": 9.166722222222223e-06,
"loss": 1.1502,
"step": 735000
},
{
"epoch": 2.8360814294466943,
"grad_norm": 0.7149516940116882,
"learning_rate": 9.111166666666667e-06,
"loss": 1.1498,
"step": 736000
},
{
"epoch": 2.8399348009540946,
"grad_norm": 0.7458763718605042,
"learning_rate": 9.055611111111112e-06,
"loss": 1.1494,
"step": 737000
},
{
"epoch": 2.8437881724614953,
"grad_norm": 0.7562174797058105,
"learning_rate": 9.000055555555556e-06,
"loss": 1.1497,
"step": 738000
},
{
"epoch": 2.8476415439688956,
"grad_norm": 0.7606706619262695,
"learning_rate": 8.9445e-06,
"loss": 1.1494,
"step": 739000
},
{
"epoch": 2.851494915476296,
"grad_norm": 0.7609912753105164,
"learning_rate": 8.888944444444445e-06,
"loss": 1.149,
"step": 740000
},
{
"epoch": 2.8553482869836966,
"grad_norm": 0.7673037052154541,
"learning_rate": 8.83338888888889e-06,
"loss": 1.1499,
"step": 741000
},
{
"epoch": 2.859201658491097,
"grad_norm": 0.7546699643135071,
"learning_rate": 8.777833333333334e-06,
"loss": 1.1491,
"step": 742000
},
{
"epoch": 2.863055029998497,
"grad_norm": 0.7696357369422913,
"learning_rate": 8.722277777777778e-06,
"loss": 1.1494,
"step": 743000
},
{
"epoch": 2.8669084015058974,
"grad_norm": 0.7652831673622131,
"learning_rate": 8.666722222222224e-06,
"loss": 1.1492,
"step": 744000
},
{
"epoch": 2.8707617730132977,
"grad_norm": 0.7520629167556763,
"learning_rate": 8.611166666666668e-06,
"loss": 1.1485,
"step": 745000
},
{
"epoch": 2.8746151445206984,
"grad_norm": 0.7434529066085815,
"learning_rate": 8.555611111111112e-06,
"loss": 1.1483,
"step": 746000
},
{
"epoch": 2.8784685160280987,
"grad_norm": 0.7456247210502625,
"learning_rate": 8.500055555555556e-06,
"loss": 1.1486,
"step": 747000
},
{
"epoch": 2.8823218875354994,
"grad_norm": 0.7586479187011719,
"learning_rate": 8.4445e-06,
"loss": 1.1478,
"step": 748000
},
{
"epoch": 2.8861752590428997,
"grad_norm": 0.7417891025543213,
"learning_rate": 8.388944444444446e-06,
"loss": 1.1473,
"step": 749000
},
{
"epoch": 2.8900286305503,
"grad_norm": 0.7475762367248535,
"learning_rate": 8.33338888888889e-06,
"loss": 1.1484,
"step": 750000
},
{
"epoch": 2.8938820020577003,
"grad_norm": 0.7549064755439758,
"learning_rate": 8.277833333333334e-06,
"loss": 1.1483,
"step": 751000
},
{
"epoch": 2.8977353735651006,
"grad_norm": 0.7571744322776794,
"learning_rate": 8.222277777777778e-06,
"loss": 1.148,
"step": 752000
},
{
"epoch": 2.9015887450725013,
"grad_norm": 0.7129827737808228,
"learning_rate": 8.166722222222222e-06,
"loss": 1.1486,
"step": 753000
},
{
"epoch": 2.9054421165799016,
"grad_norm": 0.7594377994537354,
"learning_rate": 8.111166666666666e-06,
"loss": 1.1478,
"step": 754000
},
{
"epoch": 2.909295488087302,
"grad_norm": 0.7634985446929932,
"learning_rate": 8.05561111111111e-06,
"loss": 1.1473,
"step": 755000
},
{
"epoch": 2.9131488595947026,
"grad_norm": 0.7703734636306763,
"learning_rate": 8.000055555555556e-06,
"loss": 1.148,
"step": 756000
},
{
"epoch": 2.917002231102103,
"grad_norm": 0.7532411813735962,
"learning_rate": 7.9445e-06,
"loss": 1.1475,
"step": 757000
},
{
"epoch": 2.920855602609503,
"grad_norm": 0.7445677518844604,
"learning_rate": 7.888944444444444e-06,
"loss": 1.1462,
"step": 758000
},
{
"epoch": 2.9247089741169034,
"grad_norm": 0.7641280293464661,
"learning_rate": 7.833388888888888e-06,
"loss": 1.1463,
"step": 759000
},
{
"epoch": 2.9285623456243037,
"grad_norm": 0.7298674583435059,
"learning_rate": 7.777833333333334e-06,
"loss": 1.1475,
"step": 760000
},
{
"epoch": 2.9324157171317045,
"grad_norm": 0.7470799684524536,
"learning_rate": 7.722277777777778e-06,
"loss": 1.1478,
"step": 761000
},
{
"epoch": 2.9362690886391047,
"grad_norm": 0.7401430010795593,
"learning_rate": 7.666722222222222e-06,
"loss": 1.1471,
"step": 762000
},
{
"epoch": 2.940122460146505,
"grad_norm": 0.7350935935974121,
"learning_rate": 7.611166666666667e-06,
"loss": 1.1461,
"step": 763000
},
{
"epoch": 2.9439758316539058,
"grad_norm": 0.7517477869987488,
"learning_rate": 7.5556111111111115e-06,
"loss": 1.1474,
"step": 764000
},
{
"epoch": 2.947829203161306,
"grad_norm": 0.7496416568756104,
"learning_rate": 7.500055555555556e-06,
"loss": 1.1469,
"step": 765000
},
{
"epoch": 2.9516825746687063,
"grad_norm": 0.7540420889854431,
"learning_rate": 7.4445000000000005e-06,
"loss": 1.1468,
"step": 766000
},
{
"epoch": 2.9555359461761066,
"grad_norm": 0.7611315846443176,
"learning_rate": 7.3889444444444445e-06,
"loss": 1.1471,
"step": 767000
},
{
"epoch": 2.9593893176835073,
"grad_norm": 0.7721447944641113,
"learning_rate": 7.333388888888889e-06,
"loss": 1.1462,
"step": 768000
},
{
"epoch": 2.9632426891909076,
"grad_norm": 0.7783055305480957,
"learning_rate": 7.277833333333333e-06,
"loss": 1.1464,
"step": 769000
},
{
"epoch": 2.967096060698308,
"grad_norm": 0.7445676326751709,
"learning_rate": 7.2222777777777775e-06,
"loss": 1.146,
"step": 770000
},
{
"epoch": 2.9709494322057086,
"grad_norm": 0.7596333622932434,
"learning_rate": 7.1667222222222215e-06,
"loss": 1.1466,
"step": 771000
},
{
"epoch": 2.974802803713109,
"grad_norm": 0.7583540081977844,
"learning_rate": 7.111166666666667e-06,
"loss": 1.1466,
"step": 772000
},
{
"epoch": 2.978656175220509,
"grad_norm": 0.7816259860992432,
"learning_rate": 7.055611111111111e-06,
"loss": 1.1461,
"step": 773000
},
{
"epoch": 2.9825095467279095,
"grad_norm": 0.757046639919281,
"learning_rate": 7.000055555555555e-06,
"loss": 1.146,
"step": 774000
},
{
"epoch": 2.9863629182353097,
"grad_norm": 0.7732102870941162,
"learning_rate": 6.944500000000001e-06,
"loss": 1.1456,
"step": 775000
},
{
"epoch": 2.9902162897427105,
"grad_norm": 0.7478325963020325,
"learning_rate": 6.888944444444445e-06,
"loss": 1.1448,
"step": 776000
},
{
"epoch": 2.9940696612501108,
"grad_norm": 0.7565175294876099,
"learning_rate": 6.833388888888889e-06,
"loss": 1.1461,
"step": 777000
},
{
"epoch": 2.997923032757511,
"grad_norm": 0.7752691507339478,
"learning_rate": 6.777833333333333e-06,
"loss": 1.1451,
"step": 778000
},
{
"epoch": 3.0017764042649118,
"grad_norm": 0.7705357670783997,
"learning_rate": 6.722277777777779e-06,
"loss": 1.1455,
"step": 779000
},
{
"epoch": 3.005629775772312,
"grad_norm": 0.729576587677002,
"learning_rate": 6.666722222222223e-06,
"loss": 1.145,
"step": 780000
},
{
"epoch": 3.0094831472797123,
"grad_norm": 0.7434535026550293,
"learning_rate": 6.611166666666667e-06,
"loss": 1.1451,
"step": 781000
},
{
"epoch": 3.0133365187871126,
"grad_norm": 0.7545701861381531,
"learning_rate": 6.555611111111111e-06,
"loss": 1.1449,
"step": 782000
},
{
"epoch": 3.0171898902945133,
"grad_norm": 0.7772207856178284,
"learning_rate": 6.500055555555557e-06,
"loss": 1.1456,
"step": 783000
},
{
"epoch": 3.0210432618019136,
"grad_norm": 0.7683879137039185,
"learning_rate": 6.444500000000001e-06,
"loss": 1.1446,
"step": 784000
},
{
"epoch": 3.024896633309314,
"grad_norm": 0.8129323720932007,
"learning_rate": 6.388944444444445e-06,
"loss": 1.1453,
"step": 785000
},
{
"epoch": 3.028750004816714,
"grad_norm": 0.7712565660476685,
"learning_rate": 6.333388888888889e-06,
"loss": 1.1438,
"step": 786000
},
{
"epoch": 3.032603376324115,
"grad_norm": 0.7380220293998718,
"learning_rate": 6.2778333333333345e-06,
"loss": 1.1445,
"step": 787000
},
{
"epoch": 3.036456747831515,
"grad_norm": 0.7839773297309875,
"learning_rate": 6.2222777777777786e-06,
"loss": 1.1445,
"step": 788000
},
{
"epoch": 3.0403101193389155,
"grad_norm": 0.7741373181343079,
"learning_rate": 6.166722222222223e-06,
"loss": 1.144,
"step": 789000
},
{
"epoch": 3.0441634908463158,
"grad_norm": 0.7754467129707336,
"learning_rate": 6.1111666666666675e-06,
"loss": 1.1442,
"step": 790000
},
{
"epoch": 3.0480168623537165,
"grad_norm": 0.7377423644065857,
"learning_rate": 6.0556111111111115e-06,
"loss": 1.1448,
"step": 791000
},
{
"epoch": 3.0518702338611168,
"grad_norm": 0.7510783672332764,
"learning_rate": 6.000055555555556e-06,
"loss": 1.1441,
"step": 792000
},
{
"epoch": 3.055723605368517,
"grad_norm": 0.7558544278144836,
"learning_rate": 5.9445000000000004e-06,
"loss": 1.1439,
"step": 793000
},
{
"epoch": 3.059576976875918,
"grad_norm": 0.763209342956543,
"learning_rate": 5.8889444444444445e-06,
"loss": 1.143,
"step": 794000
},
{
"epoch": 3.063430348383318,
"grad_norm": 0.7716565132141113,
"learning_rate": 5.833388888888889e-06,
"loss": 1.1439,
"step": 795000
},
{
"epoch": 3.0672837198907184,
"grad_norm": 0.8171836137771606,
"learning_rate": 5.777833333333333e-06,
"loss": 1.1431,
"step": 796000
},
{
"epoch": 3.0711370913981186,
"grad_norm": 0.7793330550193787,
"learning_rate": 5.722277777777777e-06,
"loss": 1.1435,
"step": 797000
},
{
"epoch": 3.0749904629055194,
"grad_norm": 0.7747001051902771,
"learning_rate": 5.666722222222222e-06,
"loss": 1.1438,
"step": 798000
},
{
"epoch": 3.0788438344129196,
"grad_norm": 0.7851794958114624,
"learning_rate": 5.611166666666666e-06,
"loss": 1.1431,
"step": 799000
},
{
"epoch": 3.08269720592032,
"grad_norm": 0.7496747970581055,
"learning_rate": 5.555611111111111e-06,
"loss": 1.143,
"step": 800000
},
{
"epoch": 3.08655057742772,
"grad_norm": 0.7908812761306763,
"learning_rate": 5.500055555555555e-06,
"loss": 1.1435,
"step": 801000
},
{
"epoch": 3.090403948935121,
"grad_norm": 0.7809085249900818,
"learning_rate": 5.4445e-06,
"loss": 1.1435,
"step": 802000
},
{
"epoch": 3.094257320442521,
"grad_norm": 0.7660035490989685,
"learning_rate": 5.388944444444444e-06,
"loss": 1.1421,
"step": 803000
},
{
"epoch": 3.0981106919499215,
"grad_norm": 0.7446494102478027,
"learning_rate": 5.333388888888889e-06,
"loss": 1.1431,
"step": 804000
},
{
"epoch": 3.101964063457322,
"grad_norm": 0.7707033753395081,
"learning_rate": 5.277833333333333e-06,
"loss": 1.143,
"step": 805000
},
{
"epoch": 3.1058174349647225,
"grad_norm": 0.7882303595542908,
"learning_rate": 5.222277777777778e-06,
"loss": 1.1425,
"step": 806000
},
{
"epoch": 3.109670806472123,
"grad_norm": 0.7790716290473938,
"learning_rate": 5.166722222222223e-06,
"loss": 1.1435,
"step": 807000
},
{
"epoch": 3.113524177979523,
"grad_norm": 0.7655811905860901,
"learning_rate": 5.111166666666667e-06,
"loss": 1.1428,
"step": 808000
},
{
"epoch": 3.117377549486924,
"grad_norm": 0.7627564072608948,
"learning_rate": 5.055611111111112e-06,
"loss": 1.1431,
"step": 809000
},
{
"epoch": 3.121230920994324,
"grad_norm": 0.7675108909606934,
"learning_rate": 5.000055555555556e-06,
"loss": 1.1417,
"step": 810000
},
{
"epoch": 3.1250842925017244,
"grad_norm": 0.7635100483894348,
"learning_rate": 4.944500000000001e-06,
"loss": 1.142,
"step": 811000
},
{
"epoch": 3.1289376640091247,
"grad_norm": 0.774726927280426,
"learning_rate": 4.888944444444445e-06,
"loss": 1.142,
"step": 812000
},
{
"epoch": 3.1327910355165254,
"grad_norm": 0.7784895300865173,
"learning_rate": 4.83338888888889e-06,
"loss": 1.1416,
"step": 813000
},
{
"epoch": 3.1366444070239257,
"grad_norm": 0.7654526233673096,
"learning_rate": 4.777833333333334e-06,
"loss": 1.1418,
"step": 814000
},
{
"epoch": 3.140497778531326,
"grad_norm": 0.7536936402320862,
"learning_rate": 4.7222777777777785e-06,
"loss": 1.1416,
"step": 815000
},
{
"epoch": 3.1443511500387262,
"grad_norm": 0.7567889094352722,
"learning_rate": 4.6667222222222226e-06,
"loss": 1.1407,
"step": 816000
},
{
"epoch": 3.148204521546127,
"grad_norm": 0.7922675609588623,
"learning_rate": 4.6111666666666674e-06,
"loss": 1.141,
"step": 817000
},
{
"epoch": 3.1520578930535272,
"grad_norm": 0.7856247425079346,
"learning_rate": 4.5556111111111115e-06,
"loss": 1.1418,
"step": 818000
},
{
"epoch": 3.1559112645609275,
"grad_norm": 0.7786199450492859,
"learning_rate": 4.500055555555556e-06,
"loss": 1.1411,
"step": 819000
},
{
"epoch": 3.159764636068328,
"grad_norm": 0.7758617997169495,
"learning_rate": 4.4445e-06,
"loss": 1.1415,
"step": 820000
},
{
"epoch": 3.1636180075757285,
"grad_norm": 0.769377589225769,
"learning_rate": 4.3889444444444444e-06,
"loss": 1.1415,
"step": 821000
},
{
"epoch": 3.167471379083129,
"grad_norm": 0.752196729183197,
"learning_rate": 4.333388888888889e-06,
"loss": 1.1409,
"step": 822000
},
{
"epoch": 3.171324750590529,
"grad_norm": 0.7932141423225403,
"learning_rate": 4.277833333333333e-06,
"loss": 1.1409,
"step": 823000
},
{
"epoch": 3.1751781220979294,
"grad_norm": 0.7658106684684753,
"learning_rate": 4.222277777777777e-06,
"loss": 1.1411,
"step": 824000
},
{
"epoch": 3.17903149360533,
"grad_norm": 0.7765457630157471,
"learning_rate": 4.166722222222222e-06,
"loss": 1.1412,
"step": 825000
},
{
"epoch": 3.1828848651127304,
"grad_norm": 0.7698619365692139,
"learning_rate": 4.111166666666666e-06,
"loss": 1.1416,
"step": 826000
},
{
"epoch": 3.1867382366201307,
"grad_norm": 0.7630689144134521,
"learning_rate": 4.055611111111111e-06,
"loss": 1.1412,
"step": 827000
},
{
"epoch": 3.190591608127531,
"grad_norm": 0.7751487493515015,
"learning_rate": 4.000055555555555e-06,
"loss": 1.1405,
"step": 828000
},
{
"epoch": 3.1944449796349317,
"grad_norm": 0.7852933406829834,
"learning_rate": 3.9445e-06,
"loss": 1.1416,
"step": 829000
},
{
"epoch": 3.198298351142332,
"grad_norm": 0.7691949605941772,
"learning_rate": 3.888944444444444e-06,
"loss": 1.1398,
"step": 830000
},
{
"epoch": 3.2021517226497322,
"grad_norm": 0.7742173671722412,
"learning_rate": 3.833388888888889e-06,
"loss": 1.1397,
"step": 831000
},
{
"epoch": 3.206005094157133,
"grad_norm": 0.7698484063148499,
"learning_rate": 3.777833333333333e-06,
"loss": 1.1397,
"step": 832000
},
{
"epoch": 3.2098584656645333,
"grad_norm": 0.7852274775505066,
"learning_rate": 3.722277777777778e-06,
"loss": 1.1394,
"step": 833000
},
{
"epoch": 3.2137118371719335,
"grad_norm": 0.7859106063842773,
"learning_rate": 3.666722222222222e-06,
"loss": 1.1407,
"step": 834000
},
{
"epoch": 3.217565208679334,
"grad_norm": 0.7774125337600708,
"learning_rate": 3.611166666666667e-06,
"loss": 1.1401,
"step": 835000
},
{
"epoch": 3.2214185801867345,
"grad_norm": 0.7690660357475281,
"learning_rate": 3.555611111111111e-06,
"loss": 1.1401,
"step": 836000
},
{
"epoch": 3.225271951694135,
"grad_norm": 0.7808369994163513,
"learning_rate": 3.5000555555555558e-06,
"loss": 1.1404,
"step": 837000
},
{
"epoch": 3.229125323201535,
"grad_norm": 0.7723608613014221,
"learning_rate": 3.4445000000000006e-06,
"loss": 1.1391,
"step": 838000
},
{
"epoch": 3.2329786947089354,
"grad_norm": 0.7605135440826416,
"learning_rate": 3.3889444444444447e-06,
"loss": 1.1398,
"step": 839000
},
{
"epoch": 3.236832066216336,
"grad_norm": 0.7743427753448486,
"learning_rate": 3.3333888888888896e-06,
"loss": 1.1398,
"step": 840000
},
{
"epoch": 3.2406854377237364,
"grad_norm": 0.783277690410614,
"learning_rate": 3.2778333333333336e-06,
"loss": 1.1404,
"step": 841000
},
{
"epoch": 3.2445388092311367,
"grad_norm": 0.7651547789573669,
"learning_rate": 3.222277777777778e-06,
"loss": 1.1396,
"step": 842000
},
{
"epoch": 3.248392180738537,
"grad_norm": 0.7884653806686401,
"learning_rate": 3.1667222222222225e-06,
"loss": 1.1398,
"step": 843000
},
{
"epoch": 3.2522455522459377,
"grad_norm": 0.7730636596679688,
"learning_rate": 3.1111666666666666e-06,
"loss": 1.1393,
"step": 844000
},
{
"epoch": 3.256098923753338,
"grad_norm": 0.7686559557914734,
"learning_rate": 3.055611111111111e-06,
"loss": 1.1402,
"step": 845000
},
{
"epoch": 3.2599522952607383,
"grad_norm": 0.7743884921073914,
"learning_rate": 3.0000555555555555e-06,
"loss": 1.1396,
"step": 846000
},
{
"epoch": 3.263805666768139,
"grad_norm": 0.7998344302177429,
"learning_rate": 2.9445e-06,
"loss": 1.1398,
"step": 847000
},
{
"epoch": 3.2676590382755393,
"grad_norm": 0.7782961130142212,
"learning_rate": 2.8889444444444444e-06,
"loss": 1.1397,
"step": 848000
},
{
"epoch": 3.2715124097829396,
"grad_norm": 0.788406252861023,
"learning_rate": 2.833388888888889e-06,
"loss": 1.1388,
"step": 849000
},
{
"epoch": 3.27536578129034,
"grad_norm": 0.7676372528076172,
"learning_rate": 2.7778333333333333e-06,
"loss": 1.1389,
"step": 850000
},
{
"epoch": 3.2792191527977406,
"grad_norm": 0.7785215377807617,
"learning_rate": 2.7222777777777778e-06,
"loss": 1.138,
"step": 851000
},
{
"epoch": 3.283072524305141,
"grad_norm": 0.7590740919113159,
"learning_rate": 2.6667222222222222e-06,
"loss": 1.1392,
"step": 852000
},
{
"epoch": 3.286925895812541,
"grad_norm": 0.7940697073936462,
"learning_rate": 2.6111666666666667e-06,
"loss": 1.1378,
"step": 853000
},
{
"epoch": 3.2907792673199414,
"grad_norm": 0.7704636454582214,
"learning_rate": 2.5556111111111116e-06,
"loss": 1.1381,
"step": 854000
},
{
"epoch": 3.294632638827342,
"grad_norm": 0.7758104801177979,
"learning_rate": 2.500055555555556e-06,
"loss": 1.1381,
"step": 855000
},
{
"epoch": 3.2984860103347424,
"grad_norm": 0.7805718183517456,
"learning_rate": 2.4445e-06,
"loss": 1.1378,
"step": 856000
},
{
"epoch": 3.3023393818421427,
"grad_norm": 0.7670098543167114,
"learning_rate": 2.3889444444444445e-06,
"loss": 1.1386,
"step": 857000
},
{
"epoch": 3.306192753349543,
"grad_norm": 0.7733041048049927,
"learning_rate": 2.333388888888889e-06,
"loss": 1.139,
"step": 858000
},
{
"epoch": 3.3100461248569437,
"grad_norm": 0.7957353591918945,
"learning_rate": 2.2778333333333334e-06,
"loss": 1.1383,
"step": 859000
},
{
"epoch": 3.313899496364344,
"grad_norm": 0.7921308875083923,
"learning_rate": 2.222277777777778e-06,
"loss": 1.1385,
"step": 860000
},
{
"epoch": 3.3177528678717443,
"grad_norm": 0.7467139363288879,
"learning_rate": 2.1667222222222224e-06,
"loss": 1.1373,
"step": 861000
},
{
"epoch": 3.321606239379145,
"grad_norm": 0.8030253648757935,
"learning_rate": 2.111166666666667e-06,
"loss": 1.1384,
"step": 862000
},
{
"epoch": 3.3254596108865453,
"grad_norm": 0.778984546661377,
"learning_rate": 2.0556111111111113e-06,
"loss": 1.1383,
"step": 863000
},
{
"epoch": 3.3293129823939456,
"grad_norm": 0.7732436656951904,
"learning_rate": 2.0000555555555557e-06,
"loss": 1.1385,
"step": 864000
},
{
"epoch": 3.333166353901346,
"grad_norm": 0.7700003981590271,
"learning_rate": 1.9445e-06,
"loss": 1.1378,
"step": 865000
},
{
"epoch": 3.337019725408746,
"grad_norm": 0.7778324484825134,
"learning_rate": 1.8889444444444446e-06,
"loss": 1.1373,
"step": 866000
},
{
"epoch": 3.340873096916147,
"grad_norm": 0.784168004989624,
"learning_rate": 1.833388888888889e-06,
"loss": 1.1378,
"step": 867000
},
{
"epoch": 3.344726468423547,
"grad_norm": 0.7781540155410767,
"learning_rate": 1.7778333333333334e-06,
"loss": 1.1376,
"step": 868000
},
{
"epoch": 3.3485798399309474,
"grad_norm": 0.770268440246582,
"learning_rate": 1.7222777777777778e-06,
"loss": 1.1382,
"step": 869000
},
{
"epoch": 3.352433211438348,
"grad_norm": 0.7837746143341064,
"learning_rate": 1.6667222222222223e-06,
"loss": 1.1373,
"step": 870000
},
{
"epoch": 3.3562865829457484,
"grad_norm": 0.7703538537025452,
"learning_rate": 1.6111666666666667e-06,
"loss": 1.1383,
"step": 871000
},
{
"epoch": 3.3601399544531487,
"grad_norm": 0.7656373977661133,
"learning_rate": 1.5556111111111112e-06,
"loss": 1.1374,
"step": 872000
},
{
"epoch": 3.363993325960549,
"grad_norm": 0.7768437266349792,
"learning_rate": 1.5000555555555556e-06,
"loss": 1.1362,
"step": 873000
},
{
"epoch": 3.3678466974679497,
"grad_norm": 0.7731209993362427,
"learning_rate": 1.4445e-06,
"loss": 1.1368,
"step": 874000
},
{
"epoch": 3.37170006897535,
"grad_norm": 0.7907932996749878,
"learning_rate": 1.3889444444444444e-06,
"loss": 1.1367,
"step": 875000
},
{
"epoch": 3.3755534404827503,
"grad_norm": 0.7951443791389465,
"learning_rate": 1.3333888888888888e-06,
"loss": 1.1371,
"step": 876000
},
{
"epoch": 3.379406811990151,
"grad_norm": 0.7773862481117249,
"learning_rate": 1.2778333333333333e-06,
"loss": 1.1374,
"step": 877000
},
{
"epoch": 3.3832601834975513,
"grad_norm": 0.7822207808494568,
"learning_rate": 1.222277777777778e-06,
"loss": 1.1373,
"step": 878000
},
{
"epoch": 3.3871135550049516,
"grad_norm": 0.790253221988678,
"learning_rate": 1.1667222222222224e-06,
"loss": 1.1369,
"step": 879000
},
{
"epoch": 3.390966926512352,
"grad_norm": 0.7918968200683594,
"learning_rate": 1.1111666666666669e-06,
"loss": 1.1372,
"step": 880000
},
{
"epoch": 3.394820298019752,
"grad_norm": 0.7610453963279724,
"learning_rate": 1.0556111111111113e-06,
"loss": 1.137,
"step": 881000
},
{
"epoch": 3.398673669527153,
"grad_norm": 0.7577848434448242,
"learning_rate": 1.0000555555555556e-06,
"loss": 1.1376,
"step": 882000
},
{
"epoch": 3.402527041034553,
"grad_norm": 0.766459584236145,
"learning_rate": 9.445e-07,
"loss": 1.1362,
"step": 883000
},
{
"epoch": 3.4063804125419535,
"grad_norm": 0.7742135524749756,
"learning_rate": 8.889444444444445e-07,
"loss": 1.1363,
"step": 884000
},
{
"epoch": 3.410233784049354,
"grad_norm": 0.7687368988990784,
"learning_rate": 8.333888888888889e-07,
"loss": 1.1367,
"step": 885000
},
{
"epoch": 3.4140871555567545,
"grad_norm": 0.7918124794960022,
"learning_rate": 7.778333333333334e-07,
"loss": 1.1371,
"step": 886000
},
{
"epoch": 3.4179405270641547,
"grad_norm": 0.7830091118812561,
"learning_rate": 7.222777777777777e-07,
"loss": 1.1377,
"step": 887000
},
{
"epoch": 3.421793898571555,
"grad_norm": 0.786780059337616,
"learning_rate": 6.667222222222222e-07,
"loss": 1.1364,
"step": 888000
},
{
"epoch": 3.4256472700789558,
"grad_norm": 0.7944617867469788,
"learning_rate": 6.111666666666667e-07,
"loss": 1.1365,
"step": 889000
},
{
"epoch": 3.429500641586356,
"grad_norm": 0.7738235592842102,
"learning_rate": 5.556111111111111e-07,
"loss": 1.1363,
"step": 890000
},
{
"epoch": 3.4333540130937563,
"grad_norm": 0.7691417336463928,
"learning_rate": 5.000555555555556e-07,
"loss": 1.1373,
"step": 891000
},
{
"epoch": 3.4372073846011566,
"grad_norm": 0.7909073829650879,
"learning_rate": 4.4450000000000004e-07,
"loss": 1.1363,
"step": 892000
},
{
"epoch": 3.4410607561085573,
"grad_norm": 0.7703380584716797,
"learning_rate": 3.8894444444444445e-07,
"loss": 1.1357,
"step": 893000
},
{
"epoch": 3.4449141276159576,
"grad_norm": 0.7877383232116699,
"learning_rate": 3.333888888888889e-07,
"loss": 1.1365,
"step": 894000
},
{
"epoch": 3.448767499123358,
"grad_norm": 0.7765257358551025,
"learning_rate": 2.778333333333333e-07,
"loss": 1.1359,
"step": 895000
},
{
"epoch": 3.452620870630758,
"grad_norm": 0.7824655175209045,
"learning_rate": 2.222777777777778e-07,
"loss": 1.1365,
"step": 896000
},
{
"epoch": 3.456474242138159,
"grad_norm": 0.7947019934654236,
"learning_rate": 1.6672222222222223e-07,
"loss": 1.1362,
"step": 897000
},
{
"epoch": 3.460327613645559,
"grad_norm": 0.7733472585678101,
"learning_rate": 1.1116666666666666e-07,
"loss": 1.1363,
"step": 898000
},
{
"epoch": 3.4641809851529595,
"grad_norm": 0.7852097749710083,
"learning_rate": 5.561111111111111e-08,
"loss": 1.1357,
"step": 899000
},
{
"epoch": 3.46803435666036,
"grad_norm": 0.7882575392723083,
"learning_rate": 5.5555555555555553e-11,
"loss": 1.1358,
"step": 900000
}
],
"logging_steps": 1000,
"max_steps": 900000,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.376157208460059e+20,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}