{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 5.0,
"eval_steps": 500,
"global_step": 81965,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.018301052310507854,
"grad_norm": 0.5670120716094971,
"learning_rate": 0.00019999343324994024,
"loss": 1.7725,
"step": 300
},
{
"epoch": 0.03660210462101571,
"grad_norm": 0.5167950987815857,
"learning_rate": 0.00019997364594600192,
"loss": 1.5681,
"step": 600
},
{
"epoch": 0.05490315693152356,
"grad_norm": 0.46956542134284973,
"learning_rate": 0.00019994064063090904,
"loss": 1.5176,
"step": 900
},
{
"epoch": 0.07320420924203141,
"grad_norm": 0.5100629329681396,
"learning_rate": 0.00019989442166845785,
"loss": 1.4843,
"step": 1200
},
{
"epoch": 0.09150526155253927,
"grad_norm": 0.4842604398727417,
"learning_rate": 0.0001998349951694864,
"loss": 1.4558,
"step": 1500
},
{
"epoch": 0.10980631386304712,
"grad_norm": 0.4316524863243103,
"learning_rate": 0.00019976236899106663,
"loss": 1.4411,
"step": 1800
},
{
"epoch": 0.12810736617355498,
"grad_norm": 0.4475869834423065,
"learning_rate": 0.0001996765527354655,
"loss": 1.4211,
"step": 2100
},
{
"epoch": 0.14640841848406283,
"grad_norm": 0.4860438406467438,
"learning_rate": 0.00019957755774887542,
"loss": 1.4096,
"step": 2400
},
{
"epoch": 0.16470947079457068,
"grad_norm": 0.4574320316314697,
"learning_rate": 0.00019946539711991412,
"loss": 1.3946,
"step": 2700
},
{
"epoch": 0.18301052310507854,
"grad_norm": 0.44823363423347473,
"learning_rate": 0.0001993400856778942,
"loss": 1.3861,
"step": 3000
},
{
"epoch": 0.2013115754155864,
"grad_norm": 0.4297441840171814,
"learning_rate": 0.00019920163999086237,
"loss": 1.3743,
"step": 3300
},
{
"epoch": 0.21961262772609424,
"grad_norm": 0.4372228980064392,
"learning_rate": 0.00019905007836340904,
"loss": 1.3683,
"step": 3600
},
{
"epoch": 0.2379136800366021,
"grad_norm": 0.43532341718673706,
"learning_rate": 0.00019888542083424797,
"loss": 1.36,
"step": 3900
},
{
"epoch": 0.25621473234710995,
"grad_norm": 0.4318517744541168,
"learning_rate": 0.00019870768917356713,
"loss": 1.353,
"step": 4200
},
{
"epoch": 0.27451578465761783,
"grad_norm": 0.41758111119270325,
"learning_rate": 0.00019851690688015006,
"loss": 1.3469,
"step": 4500
},
{
"epoch": 0.29281683696812566,
"grad_norm": 0.46511879563331604,
"learning_rate": 0.00019831309917826929,
"loss": 1.3392,
"step": 4800
},
{
"epoch": 0.31111788927863354,
"grad_norm": 0.4342522621154785,
"learning_rate": 0.00019809629301435096,
"loss": 1.3298,
"step": 5100
},
{
"epoch": 0.32941894158914137,
"grad_norm": 0.4393858313560486,
"learning_rate": 0.00019786651705341253,
"loss": 1.3247,
"step": 5400
},
{
"epoch": 0.34771999389964925,
"grad_norm": 0.4274674654006958,
"learning_rate": 0.00019762380167527238,
"loss": 1.3227,
"step": 5700
},
{
"epoch": 0.3660210462101571,
"grad_norm": 0.4522903561592102,
"learning_rate": 0.0001973681789705334,
"loss": 1.3152,
"step": 6000
},
{
"epoch": 0.38432209852066496,
"grad_norm": 0.4059693515300751,
"learning_rate": 0.00019709968273634018,
"loss": 1.3169,
"step": 6300
},
{
"epoch": 0.4026231508311728,
"grad_norm": 0.39100533723831177,
"learning_rate": 0.00019681834847191042,
"loss": 1.3093,
"step": 6600
},
{
"epoch": 0.42092420314168066,
"grad_norm": 0.4144134819507599,
"learning_rate": 0.00019652421337384136,
"loss": 1.3032,
"step": 6900
},
{
"epoch": 0.4392252554521885,
"grad_norm": 0.4192708432674408,
"learning_rate": 0.00019621731633119192,
"loss": 1.2983,
"step": 7200
},
{
"epoch": 0.45752630776269637,
"grad_norm": 0.4011496305465698,
"learning_rate": 0.0001958976979203409,
"loss": 1.2906,
"step": 7500
},
{
"epoch": 0.4758273600732042,
"grad_norm": 0.4050943851470947,
"learning_rate": 0.00019556540039962234,
"loss": 1.2897,
"step": 7800
},
{
"epoch": 0.4941284123837121,
"grad_norm": 0.40949124097824097,
"learning_rate": 0.0001952204677037382,
"loss": 1.2891,
"step": 8100
},
{
"epoch": 0.5124294646942199,
"grad_norm": 0.42695119976997375,
"learning_rate": 0.0001948629454379494,
"loss": 1.2822,
"step": 8400
},
{
"epoch": 0.5307305170047277,
"grad_norm": 0.37948158383369446,
"learning_rate": 0.00019449288087204658,
"loss": 1.2793,
"step": 8700
},
{
"epoch": 0.5490315693152357,
"grad_norm": 0.40617385506629944,
"learning_rate": 0.0001941103229340998,
"loss": 1.2742,
"step": 9000
},
{
"epoch": 0.5673326216257435,
"grad_norm": 0.43047332763671875,
"learning_rate": 0.00019371532220398976,
"loss": 1.2726,
"step": 9300
},
{
"epoch": 0.5856336739362513,
"grad_norm": 0.38262712955474854,
"learning_rate": 0.0001933079309067205,
"loss": 1.2728,
"step": 9600
},
{
"epoch": 0.6039347262467591,
"grad_norm": 0.4016256630420685,
"learning_rate": 0.0001928882029055143,
"loss": 1.2647,
"step": 9900
},
{
"epoch": 0.6222357785572671,
"grad_norm": 0.3772490918636322,
"learning_rate": 0.00019245619369469012,
"loss": 1.2606,
"step": 10200
},
{
"epoch": 0.6405368308677749,
"grad_norm": 0.3964807689189911,
"learning_rate": 0.0001920119603923266,
"loss": 1.2595,
"step": 10500
},
{
"epoch": 0.6588378831782827,
"grad_norm": 0.3914461135864258,
"learning_rate": 0.00019155556173271006,
"loss": 1.2549,
"step": 10800
},
{
"epoch": 0.6771389354887906,
"grad_norm": 0.4277031123638153,
"learning_rate": 0.000191087058058569,
"loss": 1.2555,
"step": 11100
},
{
"epoch": 0.6954399877992985,
"grad_norm": 0.3928915560245514,
"learning_rate": 0.00019060651131309582,
"loss": 1.2481,
"step": 11400
},
{
"epoch": 0.7137410401098063,
"grad_norm": 0.3531055450439453,
"learning_rate": 0.00019011398503175723,
"loss": 1.2525,
"step": 11700
},
{
"epoch": 0.7320420924203141,
"grad_norm": 0.37548962235450745,
"learning_rate": 0.00018960954433389345,
"loss": 1.2425,
"step": 12000
},
{
"epoch": 0.750343144730822,
"grad_norm": 0.3959772288799286,
"learning_rate": 0.00018909325591410906,
"loss": 1.2407,
"step": 12300
},
{
"epoch": 0.7686441970413299,
"grad_norm": 0.36781710386276245,
"learning_rate": 0.0001885651880334545,
"loss": 1.2417,
"step": 12600
},
{
"epoch": 0.7869452493518377,
"grad_norm": 0.3856872022151947,
"learning_rate": 0.00018802541051040112,
"loss": 1.2391,
"step": 12900
},
{
"epoch": 0.8052463016623456,
"grad_norm": 0.3833692669868469,
"learning_rate": 0.0001874739947116101,
"loss": 1.2377,
"step": 13200
},
{
"epoch": 0.8235473539728534,
"grad_norm": 0.40677550435066223,
"learning_rate": 0.00018691101354249688,
"loss": 1.2286,
"step": 13500
},
{
"epoch": 0.8418484062833613,
"grad_norm": 0.39465758204460144,
"learning_rate": 0.0001863365414375916,
"loss": 1.2324,
"step": 13800
},
{
"epoch": 0.8601494585938692,
"grad_norm": 0.3750082552433014,
"learning_rate": 0.00018575065435069807,
"loss": 1.2285,
"step": 14100
},
{
"epoch": 0.878450510904377,
"grad_norm": 0.3967496454715729,
"learning_rate": 0.0001851534297448515,
"loss": 1.2264,
"step": 14400
},
{
"epoch": 0.8967515632148848,
"grad_norm": 0.376779705286026,
"learning_rate": 0.00018454494658207658,
"loss": 1.2224,
"step": 14700
},
{
"epoch": 0.9150526155253927,
"grad_norm": 0.41239818930625916,
"learning_rate": 0.00018392528531294762,
"loss": 1.2209,
"step": 15000
},
{
"epoch": 0.9333536678359006,
"grad_norm": 0.41411134600639343,
"learning_rate": 0.00018329452786595177,
"loss": 1.2207,
"step": 15300
},
{
"epoch": 0.9516547201464084,
"grad_norm": 0.3956342339515686,
"learning_rate": 0.00018265275763665683,
"loss": 1.2169,
"step": 15600
},
{
"epoch": 0.9699557724569162,
"grad_norm": 0.4099271893501282,
"learning_rate": 0.00018200005947668522,
"loss": 1.2138,
"step": 15900
},
{
"epoch": 0.9882568247674242,
"grad_norm": 0.37904131412506104,
"learning_rate": 0.00018133651968249502,
"loss": 1.2126,
"step": 16200
},
{
"epoch": 1.006527375324081,
"grad_norm": 0.37347930669784546,
"learning_rate": 0.00018066222598397077,
"loss": 1.2007,
"step": 16500
},
{
"epoch": 1.024828427634589,
"grad_norm": 0.35225772857666016,
"learning_rate": 0.0001799772675328237,
"loss": 1.1819,
"step": 16800
},
{
"epoch": 1.0431294799450967,
"grad_norm": 0.3745623528957367,
"learning_rate": 0.00017928173489080505,
"loss": 1.1873,
"step": 17100
},
{
"epoch": 1.0614305322556048,
"grad_norm": 0.36279329657554626,
"learning_rate": 0.00017857572001773214,
"loss": 1.1799,
"step": 17400
},
{
"epoch": 1.0797315845661126,
"grad_norm": 0.3755429685115814,
"learning_rate": 0.00017785931625933008,
"loss": 1.1822,
"step": 17700
},
{
"epoch": 1.0980326368766204,
"grad_norm": 0.3589393198490143,
"learning_rate": 0.0001771326183348899,
"loss": 1.1854,
"step": 18000
},
{
"epoch": 1.1163336891871283,
"grad_norm": 0.3789694309234619,
"learning_rate": 0.00017639572232474538,
"loss": 1.1815,
"step": 18300
},
{
"epoch": 1.134634741497636,
"grad_norm": 0.41688501834869385,
"learning_rate": 0.00017564872565756977,
"loss": 1.1809,
"step": 18600
},
{
"epoch": 1.152935793808144,
"grad_norm": 0.3584393560886383,
"learning_rate": 0.00017489172709749413,
"loss": 1.1768,
"step": 18900
},
{
"epoch": 1.1712368461186518,
"grad_norm": 0.3694947063922882,
"learning_rate": 0.00017412482673104953,
"loss": 1.1799,
"step": 19200
},
{
"epoch": 1.1895378984291596,
"grad_norm": 0.356913298368454,
"learning_rate": 0.0001733481259539338,
"loss": 1.1714,
"step": 19500
},
{
"epoch": 1.2078389507396676,
"grad_norm": 0.3630887567996979,
"learning_rate": 0.00017256172745760563,
"loss": 1.1733,
"step": 19800
},
{
"epoch": 1.2261400030501755,
"grad_norm": 0.3733890950679779,
"learning_rate": 0.00017176573521570726,
"loss": 1.1756,
"step": 20100
},
{
"epoch": 1.2444410553606833,
"grad_norm": 0.37580791115760803,
"learning_rate": 0.00017096025447031757,
"loss": 1.1697,
"step": 20400
},
{
"epoch": 1.262742107671191,
"grad_norm": 0.3889394700527191,
"learning_rate": 0.00017014539171803756,
"loss": 1.172,
"step": 20700
},
{
"epoch": 1.281043159981699,
"grad_norm": 0.3594827950000763,
"learning_rate": 0.00016932125469590976,
"loss": 1.1716,
"step": 21000
},
{
"epoch": 1.2993442122922068,
"grad_norm": 0.3551196753978729,
"learning_rate": 0.00016848795236717385,
"loss": 1.1666,
"step": 21300
},
{
"epoch": 1.3176452646027146,
"grad_norm": 0.3702225387096405,
"learning_rate": 0.00016764559490686028,
"loss": 1.1725,
"step": 21600
},
{
"epoch": 1.3359463169132226,
"grad_norm": 0.37975847721099854,
"learning_rate": 0.00016679429368722317,
"loss": 1.1658,
"step": 21900
},
{
"epoch": 1.3542473692237302,
"grad_norm": 0.3720037341117859,
"learning_rate": 0.00016593416126301543,
"loss": 1.1665,
"step": 22200
},
{
"epoch": 1.3725484215342383,
"grad_norm": 0.3632305860519409,
"learning_rate": 0.00016506531135660732,
"loss": 1.1651,
"step": 22500
},
{
"epoch": 1.390849473844746,
"grad_norm": 0.3983381688594818,
"learning_rate": 0.00016418785884295055,
"loss": 1.1633,
"step": 22800
},
{
"epoch": 1.409150526155254,
"grad_norm": 0.35853487253189087,
"learning_rate": 0.00016330191973439027,
"loss": 1.1624,
"step": 23100
},
{
"epoch": 1.4274515784657618,
"grad_norm": 0.3824533224105835,
"learning_rate": 0.00016240761116532624,
"loss": 1.1624,
"step": 23400
},
{
"epoch": 1.4457526307762696,
"grad_norm": 0.38290977478027344,
"learning_rate": 0.00016150505137672624,
"loss": 1.158,
"step": 23700
},
{
"epoch": 1.4640536830867774,
"grad_norm": 0.34818851947784424,
"learning_rate": 0.0001605943597004926,
"loss": 1.1637,
"step": 24000
},
{
"epoch": 1.4823547353972852,
"grad_norm": 0.3592260181903839,
"learning_rate": 0.00015967565654368478,
"loss": 1.1571,
"step": 24300
},
{
"epoch": 1.5006557877077933,
"grad_norm": 0.36368805170059204,
"learning_rate": 0.00015874906337259978,
"loss": 1.1514,
"step": 24600
},
{
"epoch": 1.518956840018301,
"grad_norm": 0.3715674877166748,
"learning_rate": 0.00015781470269671263,
"loss": 1.154,
"step": 24900
},
{
"epoch": 1.537257892328809,
"grad_norm": 0.38933447003364563,
"learning_rate": 0.0001568726980524785,
"loss": 1.1521,
"step": 25200
},
{
"epoch": 1.5555589446393168,
"grad_norm": 0.4079601466655731,
"learning_rate": 0.00015592317398699948,
"loss": 1.1548,
"step": 25500
},
{
"epoch": 1.5738599969498246,
"grad_norm": 0.40320438146591187,
"learning_rate": 0.0001549662560415578,
"loss": 1.152,
"step": 25800
},
{
"epoch": 1.5921610492603324,
"grad_norm": 0.3639344573020935,
"learning_rate": 0.00015400207073501703,
"loss": 1.1452,
"step": 26100
},
{
"epoch": 1.6104621015708402,
"grad_norm": 0.3776426315307617,
"learning_rate": 0.0001530307455470946,
"loss": 1.1494,
"step": 26400
},
{
"epoch": 1.6287631538813483,
"grad_norm": 0.3482027053833008,
"learning_rate": 0.00015205240890150701,
"loss": 1.148,
"step": 26700
},
{
"epoch": 1.647064206191856,
"grad_norm": 0.380161851644516,
"learning_rate": 0.0001510671901489905,
"loss": 1.1487,
"step": 27000
},
{
"epoch": 1.665365258502364,
"grad_norm": 0.3611465394496918,
"learning_rate": 0.00015007521955019847,
"loss": 1.1437,
"step": 27300
},
{
"epoch": 1.6836663108128718,
"grad_norm": 0.3532879650592804,
"learning_rate": 0.00014907662825847968,
"loss": 1.146,
"step": 27600
},
{
"epoch": 1.7019673631233796,
"grad_norm": 0.37363260984420776,
"learning_rate": 0.00014807154830253732,
"loss": 1.1423,
"step": 27900
},
{
"epoch": 1.7202684154338874,
"grad_norm": 0.35999348759651184,
"learning_rate": 0.00014706011256897304,
"loss": 1.1439,
"step": 28200
},
{
"epoch": 1.7385694677443952,
"grad_norm": 0.35193225741386414,
"learning_rate": 0.0001460424547847175,
"loss": 1.1422,
"step": 28500
},
{
"epoch": 1.7568705200549033,
"grad_norm": 0.35045188665390015,
"learning_rate": 0.0001450187094993493,
"loss": 1.1405,
"step": 28800
},
{
"epoch": 1.775171572365411,
"grad_norm": 0.36350497603416443,
"learning_rate": 0.00014398901206730591,
"loss": 1.1395,
"step": 29100
},
{
"epoch": 1.793472624675919,
"grad_norm": 0.3509625792503357,
"learning_rate": 0.0001429534986299875,
"loss": 1.1366,
"step": 29400
},
{
"epoch": 1.8117736769864268,
"grad_norm": 0.38187241554260254,
"learning_rate": 0.00014191230609775715,
"loss": 1.1386,
"step": 29700
},
{
"epoch": 1.8300747292969346,
"grad_norm": 0.3755728304386139,
"learning_rate": 0.0001408655721318392,
"loss": 1.136,
"step": 30000
},
{
"epoch": 1.8483757816074424,
"grad_norm": 0.37514305114746094,
"learning_rate": 0.00013981343512611848,
"loss": 1.1395,
"step": 30300
},
{
"epoch": 1.8666768339179503,
"grad_norm": 0.38384050130844116,
"learning_rate": 0.00013875603418884243,
"loss": 1.1337,
"step": 30600
},
{
"epoch": 1.8849778862284583,
"grad_norm": 0.38143786787986755,
"learning_rate": 0.00013769350912422893,
"loss": 1.1325,
"step": 30900
},
{
"epoch": 1.903278938538966,
"grad_norm": 0.3801436424255371,
"learning_rate": 0.00013662600041398215,
"loss": 1.1358,
"step": 31200
},
{
"epoch": 1.921579990849474,
"grad_norm": 0.37413644790649414,
"learning_rate": 0.00013555364919871877,
"loss": 1.1319,
"step": 31500
},
{
"epoch": 1.9398810431599816,
"grad_norm": 0.36143454909324646,
"learning_rate": 0.00013447659725930713,
"loss": 1.1303,
"step": 31800
},
{
"epoch": 1.9581820954704896,
"grad_norm": 0.3567048907279968,
"learning_rate": 0.00013339498699812142,
"loss": 1.1292,
"step": 32100
},
{
"epoch": 1.9764831477809974,
"grad_norm": 0.4003380835056305,
"learning_rate": 0.00013230896142021425,
"loss": 1.1278,
"step": 32400
},
{
"epoch": 1.9947842000915053,
"grad_norm": 0.37288859486579895,
"learning_rate": 0.00013121866411440917,
"loss": 1.1252,
"step": 32700
},
{
"epoch": 2.013054750648162,
"grad_norm": 0.37710532546043396,
"learning_rate": 0.000130124239234316,
"loss": 1.1003,
"step": 33000
},
{
"epoch": 2.0313558029586702,
"grad_norm": 0.36639609932899475,
"learning_rate": 0.0001290258314792716,
"loss": 1.0967,
"step": 33300
},
{
"epoch": 2.049656855269178,
"grad_norm": 0.3679460883140564,
"learning_rate": 0.0001279235860752084,
"loss": 1.1011,
"step": 33600
},
{
"epoch": 2.067957907579686,
"grad_norm": 0.40204325318336487,
"learning_rate": 0.00012681764875545362,
"loss": 1.0981,
"step": 33900
},
{
"epoch": 2.0862589598901935,
"grad_norm": 0.37128251791000366,
"learning_rate": 0.0001257081657414608,
"loss": 1.0963,
"step": 34200
},
{
"epoch": 2.1045600122007015,
"grad_norm": 0.35975438356399536,
"learning_rate": 0.00012459528372347722,
"loss": 1.0977,
"step": 34500
},
{
"epoch": 2.1228610645112096,
"grad_norm": 0.3734683394432068,
"learning_rate": 0.0001234791498411495,
"loss": 1.0962,
"step": 34800
},
{
"epoch": 2.141162116821717,
"grad_norm": 0.36970898509025574,
"learning_rate": 0.0001223599116640693,
"loss": 1.0969,
"step": 35100
},
{
"epoch": 2.1594631691322252,
"grad_norm": 0.41932040452957153,
"learning_rate": 0.00012123771717226238,
"loss": 1.0945,
"step": 35400
},
{
"epoch": 2.177764221442733,
"grad_norm": 0.38500991463661194,
"learning_rate": 0.00012011271473662365,
"loss": 1.0975,
"step": 35700
},
{
"epoch": 2.196065273753241,
"grad_norm": 0.36856573820114136,
"learning_rate": 0.00011898505309930006,
"loss": 1.095,
"step": 36000
},
{
"epoch": 2.2143663260637485,
"grad_norm": 0.38829776644706726,
"learning_rate": 0.00011785488135402492,
"loss": 1.0961,
"step": 36300
},
{
"epoch": 2.2326673783742566,
"grad_norm": 0.3822610080242157,
"learning_rate": 0.00011672234892640525,
"loss": 1.0971,
"step": 36600
},
{
"epoch": 2.2509684306847646,
"grad_norm": 0.3844328820705414,
"learning_rate": 0.00011558760555416565,
"loss": 1.0945,
"step": 36900
},
{
"epoch": 2.269269482995272,
"grad_norm": 0.3492465317249298,
"learning_rate": 0.00011445080126735061,
"loss": 1.0938,
"step": 37200
},
{
"epoch": 2.2875705353057803,
"grad_norm": 0.4076479971408844,
"learning_rate": 0.00011331208636848844,
"loss": 1.09,
"step": 37500
},
{
"epoch": 2.305871587616288,
"grad_norm": 0.36309176683425903,
"learning_rate": 0.00011217161141271879,
"loss": 1.0918,
"step": 37800
},
{
"epoch": 2.324172639926796,
"grad_norm": 0.36070308089256287,
"learning_rate": 0.00011102952718788731,
"loss": 1.0947,
"step": 38100
},
{
"epoch": 2.3424736922373035,
"grad_norm": 0.36666443943977356,
"learning_rate": 0.00010988598469460896,
"loss": 1.0904,
"step": 38400
},
{
"epoch": 2.3607747445478116,
"grad_norm": 0.3801022171974182,
"learning_rate": 0.00010874113512630368,
"loss": 1.0907,
"step": 38700
},
{
"epoch": 2.379075796858319,
"grad_norm": 0.3615148067474365,
"learning_rate": 0.00010759512984920626,
"loss": 1.0893,
"step": 39000
},
{
"epoch": 2.397376849168827,
"grad_norm": 0.3498951196670532,
"learning_rate": 0.00010644812038235344,
"loss": 1.0907,
"step": 39300
},
{
"epoch": 2.4156779014793353,
"grad_norm": 0.38336920738220215,
"learning_rate": 0.0001053002583775509,
"loss": 1.0945,
"step": 39600
},
{
"epoch": 2.433978953789843,
"grad_norm": 0.3785784840583801,
"learning_rate": 0.00010415169559932263,
"loss": 1.091,
"step": 39900
},
{
"epoch": 2.452280006100351,
"grad_norm": 0.3536151945590973,
"learning_rate": 0.00010300258390484522,
"loss": 1.0857,
"step": 40200
},
{
"epoch": 2.4705810584108585,
"grad_norm": 0.35834431648254395,
"learning_rate": 0.00010185307522387033,
"loss": 1.0873,
"step": 40500
},
{
"epoch": 2.4888821107213666,
"grad_norm": 0.36664196848869324,
"learning_rate": 0.00010070332153863707,
"loss": 1.0886,
"step": 40800
},
{
"epoch": 2.5071831630318746,
"grad_norm": 0.35749050974845886,
"learning_rate": 9.955347486377786e-05,
"loss": 1.0866,
"step": 41100
},
{
"epoch": 2.525484215342382,
"grad_norm": 0.40087947249412537,
"learning_rate": 9.840368722621967e-05,
"loss": 1.0797,
"step": 41400
},
{
"epoch": 2.54378526765289,
"grad_norm": 0.3594193160533905,
"learning_rate": 9.725411064508392e-05,
"loss": 1.0879,
"step": 41700
},
{
"epoch": 2.562086319963398,
"grad_norm": 0.3470621705055237,
"learning_rate": 9.610489711158714e-05,
"loss": 1.0832,
"step": 42000
},
{
"epoch": 2.580387372273906,
"grad_norm": 0.37977200746536255,
"learning_rate": 9.495619856894563e-05,
"loss": 1.0842,
"step": 42300
},
{
"epoch": 2.5986884245844135,
"grad_norm": 0.3905890882015228,
"learning_rate": 9.380816689228608e-05,
"loss": 1.0811,
"step": 42600
},
{
"epoch": 2.6169894768949216,
"grad_norm": 0.3852068781852722,
"learning_rate": 9.266095386856565e-05,
"loss": 1.0832,
"step": 42900
},
{
"epoch": 2.635290529205429,
"grad_norm": 0.40047767758369446,
"learning_rate": 9.15147111765032e-05,
"loss": 1.0792,
"step": 43200
},
{
"epoch": 2.653591581515937,
"grad_norm": 0.40440091490745544,
"learning_rate": 9.03695903665254e-05,
"loss": 1.0802,
"step": 43500
},
{
"epoch": 2.6718926338264453,
"grad_norm": 0.3432023823261261,
"learning_rate": 8.922574284072916e-05,
"loss": 1.08,
"step": 43800
},
{
"epoch": 2.690193686136953,
"grad_norm": 0.3821187913417816,
"learning_rate": 8.80833198328643e-05,
"loss": 1.0775,
"step": 44100
},
{
"epoch": 2.7084947384474605,
"grad_norm": 0.3733247220516205,
"learning_rate": 8.694247238833797e-05,
"loss": 1.0762,
"step": 44400
},
{
"epoch": 2.7267957907579685,
"grad_norm": 0.35147610306739807,
"learning_rate": 8.58033513442444e-05,
"loss": 1.0747,
"step": 44700
},
{
"epoch": 2.7450968430684766,
"grad_norm": 0.37748807668685913,
"learning_rate": 8.466610730942169e-05,
"loss": 1.0738,
"step": 45000
},
{
"epoch": 2.763397895378984,
"grad_norm": 0.39242950081825256,
"learning_rate": 8.353089064453948e-05,
"loss": 1.0734,
"step": 45300
},
{
"epoch": 2.781698947689492,
"grad_norm": 0.36078205704689026,
"learning_rate": 8.239785144221857e-05,
"loss": 1.0727,
"step": 45600
},
{
"epoch": 2.8,
"grad_norm": 0.36267706751823425,
"learning_rate": 8.126713950718679e-05,
"loss": 1.0705,
"step": 45900
},
{
"epoch": 2.818301052310508,
"grad_norm": 0.38176020979881287,
"learning_rate": 8.013890433647228e-05,
"loss": 1.0718,
"step": 46200
},
{
"epoch": 2.836602104621016,
"grad_norm": 0.37327638268470764,
"learning_rate": 7.901329509963807e-05,
"loss": 1.0708,
"step": 46500
},
{
"epoch": 2.8549031569315235,
"grad_norm": 0.38468560576438904,
"learning_rate": 7.78904606190593e-05,
"loss": 1.0717,
"step": 46800
},
{
"epoch": 2.8732042092420316,
"grad_norm": 0.3524203896522522,
"learning_rate": 7.677054935024696e-05,
"loss": 1.0714,
"step": 47100
},
{
"epoch": 2.891505261552539,
"grad_norm": 0.3709242641925812,
"learning_rate": 7.565370936221959e-05,
"loss": 1.0725,
"step": 47400
},
{
"epoch": 2.9098063138630472,
"grad_norm": 0.3623937666416168,
"learning_rate": 7.454008831792668e-05,
"loss": 1.0665,
"step": 47700
},
{
"epoch": 2.928107366173555,
"grad_norm": 0.3631395995616913,
"learning_rate": 7.3429833454725e-05,
"loss": 1.0725,
"step": 48000
},
{
"epoch": 2.946408418484063,
"grad_norm": 0.38803574442863464,
"learning_rate": 7.2323091564912e-05,
"loss": 1.0679,
"step": 48300
},
{
"epoch": 2.9647094707945705,
"grad_norm": 0.3702128529548645,
"learning_rate": 7.122000897631738e-05,
"loss": 1.0687,
"step": 48600
},
{
"epoch": 2.9830105231050785,
"grad_norm": 0.3717169165611267,
"learning_rate": 7.012073153295662e-05,
"loss": 1.0668,
"step": 48900
},
{
"epoch": 3.0012810736617355,
"grad_norm": 0.40531572699546814,
"learning_rate": 6.902540457574798e-05,
"loss": 1.0633,
"step": 49200
},
{
"epoch": 3.0195821259722435,
"grad_norm": 0.38881915807724,
"learning_rate": 6.793417292329643e-05,
"loss": 1.0336,
"step": 49500
},
{
"epoch": 3.037883178282751,
"grad_norm": 0.368335485458374,
"learning_rate": 6.684718085274634e-05,
"loss": 1.0392,
"step": 49800
},
{
"epoch": 3.056184230593259,
"grad_norm": 0.3770321309566498,
"learning_rate": 6.576457208070615e-05,
"loss": 1.0399,
"step": 50100
},
{
"epoch": 3.0744852829037668,
"grad_norm": 0.3865681290626526,
"learning_rate": 6.468648974424633e-05,
"loss": 1.0398,
"step": 50400
},
{
"epoch": 3.092786335214275,
"grad_norm": 0.3610497713088989,
"learning_rate": 6.361307638197522e-05,
"loss": 1.0418,
"step": 50700
},
{
"epoch": 3.111087387524783,
"grad_norm": 0.3791823089122772,
"learning_rate": 6.254447391519271e-05,
"loss": 1.0391,
"step": 51000
},
{
"epoch": 3.1293884398352905,
"grad_norm": 0.3856153190135956,
"learning_rate": 6.148082362912652e-05,
"loss": 1.0363,
"step": 51300
},
{
"epoch": 3.1476894921457985,
"grad_norm": 0.37801018357276917,
"learning_rate": 6.042226615425186e-05,
"loss": 1.0412,
"step": 51600
},
{
"epoch": 3.165990544456306,
"grad_norm": 0.3763614594936371,
"learning_rate": 5.936894144769827e-05,
"loss": 1.0403,
"step": 51900
},
{
"epoch": 3.184291596766814,
"grad_norm": 0.36206382513046265,
"learning_rate": 5.8320988774744946e-05,
"loss": 1.0377,
"step": 52200
},
{
"epoch": 3.2025926490773218,
"grad_norm": 0.4072185456752777,
"learning_rate": 5.727854669040803e-05,
"loss": 1.0417,
"step": 52500
},
{
"epoch": 3.22089370138783,
"grad_norm": 0.39182448387145996,
"learning_rate": 5.6241753021121335e-05,
"loss": 1.0394,
"step": 52800
},
{
"epoch": 3.239194753698338,
"grad_norm": 0.41504260897636414,
"learning_rate": 5.521074484651374e-05,
"loss": 1.0366,
"step": 53100
},
{
"epoch": 3.2574958060088455,
"grad_norm": 0.3729825019836426,
"learning_rate": 5.418565848128518e-05,
"loss": 1.0385,
"step": 53400
},
{
"epoch": 3.2757968583193535,
"grad_norm": 0.36718717217445374,
"learning_rate": 5.316662945718396e-05,
"loss": 1.0376,
"step": 53700
},
{
"epoch": 3.294097910629861,
"grad_norm": 0.36935436725616455,
"learning_rate": 5.2153792505087115e-05,
"loss": 1.0365,
"step": 54000
},
{
"epoch": 3.312398962940369,
"grad_norm": 0.38036707043647766,
"learning_rate": 5.114728153718731e-05,
"loss": 1.0333,
"step": 54300
},
{
"epoch": 3.3307000152508768,
"grad_norm": 0.3772546350955963,
"learning_rate": 5.0147229629287406e-05,
"loss": 1.0384,
"step": 54600
},
{
"epoch": 3.349001067561385,
"grad_norm": 0.38388633728027344,
"learning_rate": 4.9153769003205944e-05,
"loss": 1.0385,
"step": 54900
},
{
"epoch": 3.3673021198718924,
"grad_norm": 0.3748055696487427,
"learning_rate": 4.816703100929538e-05,
"loss": 1.0354,
"step": 55200
},
{
"epoch": 3.3856031721824005,
"grad_norm": 0.40391644835472107,
"learning_rate": 4.718714610907581e-05,
"loss": 1.0394,
"step": 55500
},
{
"epoch": 3.4039042244929085,
"grad_norm": 0.3871014416217804,
"learning_rate": 4.6214243857985686e-05,
"loss": 1.0348,
"step": 55800
},
{
"epoch": 3.422205276803416,
"grad_norm": 0.3747030198574066,
"learning_rate": 4.524845288825298e-05,
"loss": 1.031,
"step": 56100
},
{
"epoch": 3.440506329113924,
"grad_norm": 0.38700729608535767,
"learning_rate": 4.428990089188769e-05,
"loss": 1.0341,
"step": 56400
},
{
"epoch": 3.458807381424432,
"grad_norm": 0.37710437178611755,
"learning_rate": 4.333871460379951e-05,
"loss": 1.0351,
"step": 56700
},
{
"epoch": 3.47710843373494,
"grad_norm": 0.3593015968799591,
"learning_rate": 4.239501978504117e-05,
"loss": 1.0369,
"step": 57000
},
{
"epoch": 3.4954094860454474,
"grad_norm": 0.386823445558548,
"learning_rate": 4.145894120618123e-05,
"loss": 1.0336,
"step": 57300
},
{
"epoch": 3.5137105383559555,
"grad_norm": 0.3797110915184021,
"learning_rate": 4.053060263080738e-05,
"loss": 1.0304,
"step": 57600
},
{
"epoch": 3.532011590666463,
"grad_norm": 0.35242992639541626,
"learning_rate": 3.961012679916307e-05,
"loss": 1.0276,
"step": 57900
},
{
"epoch": 3.550312642976971,
"grad_norm": 0.36182209849357605,
"learning_rate": 3.869763541191944e-05,
"loss": 1.0287,
"step": 58200
},
{
"epoch": 3.568613695287479,
"grad_norm": 0.3675349950790405,
"learning_rate": 3.779324911408475e-05,
"loss": 1.0321,
"step": 58500
},
{
"epoch": 3.586914747597987,
"grad_norm": 0.38415876030921936,
"learning_rate": 3.6897087479053104e-05,
"loss": 1.0336,
"step": 58800
},
{
"epoch": 3.605215799908495,
"grad_norm": 0.3762219548225403,
"learning_rate": 3.600926899279532e-05,
"loss": 1.0326,
"step": 59100
},
{
"epoch": 3.6235168522190024,
"grad_norm": 0.3666684031486511,
"learning_rate": 3.512991103819303e-05,
"loss": 1.0288,
"step": 59400
},
{
"epoch": 3.6418179045295105,
"grad_norm": 0.39124396443367004,
"learning_rate": 3.425912987951914e-05,
"loss": 1.0279,
"step": 59700
},
{
"epoch": 3.6601189568400185,
"grad_norm": 0.3636305332183838,
"learning_rate": 3.339704064706577e-05,
"loss": 1.0327,
"step": 60000
},
{
"epoch": 3.678420009150526,
"grad_norm": 0.40503907203674316,
"learning_rate": 3.2543757321922563e-05,
"loss": 1.0313,
"step": 60300
},
{
"epoch": 3.6967210614610337,
"grad_norm": 0.3835071623325348,
"learning_rate": 3.169939272090634e-05,
"loss": 1.0306,
"step": 60600
},
{
"epoch": 3.715022113771542,
"grad_norm": 0.4188929796218872,
"learning_rate": 3.08640584816454e-05,
"loss": 1.0305,
"step": 60900
},
{
"epoch": 3.73332316608205,
"grad_norm": 0.3759515583515167,
"learning_rate": 3.0037865047818903e-05,
"loss": 1.0295,
"step": 61200
},
{
"epoch": 3.7516242183925574,
"grad_norm": 0.36246317625045776,
"learning_rate": 2.922092165455508e-05,
"loss": 1.027,
"step": 61500
},
{
"epoch": 3.7699252707030655,
"grad_norm": 0.3859165608882904,
"learning_rate": 2.8413336313988303e-05,
"loss": 1.026,
"step": 61800
},
{
"epoch": 3.788226323013573,
"grad_norm": 0.3533560037612915,
"learning_rate": 2.7615215800978523e-05,
"loss": 1.0293,
"step": 62100
},
{
"epoch": 3.806527375324081,
"grad_norm": 0.3984517455101013,
"learning_rate": 2.6826665638993876e-05,
"loss": 1.0265,
"step": 62400
},
{
"epoch": 3.824828427634589,
"grad_norm": 0.3726932108402252,
"learning_rate": 2.6047790086158952e-05,
"loss": 1.0249,
"step": 62700
},
{
"epoch": 3.843129479945097,
"grad_norm": 0.3885975778102875,
"learning_rate": 2.5278692121470326e-05,
"loss": 1.0242,
"step": 63000
},
{
"epoch": 3.861430532255605,
"grad_norm": 0.3631696105003357,
"learning_rate": 2.4519473431181272e-05,
"loss": 1.026,
"step": 63300
},
{
"epoch": 3.8797315845661124,
"grad_norm": 0.38549143075942993,
"learning_rate": 2.3770234395357115e-05,
"loss": 1.0255,
"step": 63600
},
{
"epoch": 3.8980326368766205,
"grad_norm": 0.3577282428741455,
"learning_rate": 2.303107407460373e-05,
"loss": 1.0255,
"step": 63900
},
{
"epoch": 3.916333689187128,
"grad_norm": 0.36772647500038147,
"learning_rate": 2.2302090196970048e-05,
"loss": 1.0257,
"step": 64200
},
{
"epoch": 3.934634741497636,
"grad_norm": 0.38493648171424866,
"learning_rate": 2.1583379145027016e-05,
"loss": 1.0258,
"step": 64500
},
{
"epoch": 3.9529357938081437,
"grad_norm": 0.3904314339160919,
"learning_rate": 2.0875035943124422e-05,
"loss": 1.0267,
"step": 64800
},
{
"epoch": 3.971236846118652,
"grad_norm": 0.40153247117996216,
"learning_rate": 2.0177154244827255e-05,
"loss": 1.0263,
"step": 65100
},
{
"epoch": 3.98953789842916,
"grad_norm": 0.35689347982406616,
"learning_rate": 1.94898263205332e-05,
"loss": 1.0265,
"step": 65400
},
{
"epoch": 4.007808448985816,
"grad_norm": 0.37626588344573975,
"learning_rate": 1.881314304527323e-05,
"loss": 1.0123,
"step": 65700
},
{
"epoch": 4.026109501296324,
"grad_norm": 0.39857444167137146,
"learning_rate": 1.8147193886696467e-05,
"loss": 1.0024,
"step": 66000
},
{
"epoch": 4.044410553606832,
"grad_norm": 0.38053014874458313,
"learning_rate": 1.7492066893241354e-05,
"loss": 1.0055,
"step": 66300
},
{
"epoch": 4.0627116059173405,
"grad_norm": 0.3763318359851837,
"learning_rate": 1.684784868249417e-05,
"loss": 1.0008,
"step": 66600
},
{
"epoch": 4.0810126582278485,
"grad_norm": 0.35103437304496765,
"learning_rate": 1.6214624429737058e-05,
"loss": 1.0064,
"step": 66900
},
{
"epoch": 4.099313710538356,
"grad_norm": 0.349678099155426,
"learning_rate": 1.5592477856686416e-05,
"loss": 1.0025,
"step": 67200
},
{
"epoch": 4.117614762848864,
"grad_norm": 0.3956117033958435,
"learning_rate": 1.4981491220423727e-05,
"loss": 1.0055,
"step": 67500
},
{
"epoch": 4.135915815159372,
"grad_norm": 0.36628103256225586,
"learning_rate": 1.4381745302519978e-05,
"loss": 1.0037,
"step": 67800
},
{
"epoch": 4.15421686746988,
"grad_norm": 0.3932549059391022,
"learning_rate": 1.379331939835502e-05,
"loss": 1.0023,
"step": 68100
},
{
"epoch": 4.172517919780387,
"grad_norm": 0.3831135332584381,
"learning_rate": 1.3216291306633643e-05,
"loss": 1.0042,
"step": 68400
},
{
"epoch": 4.190818972090895,
"grad_norm": 0.37050631642341614,
"learning_rate": 1.2650737319099316e-05,
"loss": 1.0035,
"step": 68700
},
{
"epoch": 4.209120024401403,
"grad_norm": 0.3810383081436157,
"learning_rate": 1.2096732210447382e-05,
"loss": 1.0034,
"step": 69000
},
{
"epoch": 4.227421076711911,
"grad_norm": 0.37233009934425354,
"learning_rate": 1.155434922843861e-05,
"loss": 1.0029,
"step": 69300
},
{
"epoch": 4.245722129022419,
"grad_norm": 0.36925238370895386,
"learning_rate": 1.1023660084214916e-05,
"loss": 1.0032,
"step": 69600
},
{
"epoch": 4.264023181332926,
"grad_norm": 0.37379172444343567,
"learning_rate": 1.0504734942817851e-05,
"loss": 1.001,
"step": 69900
},
{
"epoch": 4.282324233643434,
"grad_norm": 0.39091187715530396,
"learning_rate": 9.997642413912001e-06,
"loss": 1.0,
"step": 70200
},
{
"epoch": 4.3006252859539424,
"grad_norm": 0.39929676055908203,
"learning_rate": 9.502449542713431e-06,
"loss": 1.0039,
"step": 70500
},
{
"epoch": 4.3189263382644505,
"grad_norm": 0.38398900628089905,
"learning_rate": 9.019221801125677e-06,
"loss": 1.005,
"step": 70800
},
{
"epoch": 4.337227390574958,
"grad_norm": 0.37324637174606323,
"learning_rate": 8.548023079083045e-06,
"loss": 1.0001,
"step": 71100
},
{
"epoch": 4.355528442885466,
"grad_norm": 0.3978380262851715,
"learning_rate": 8.088915676103681e-06,
"loss": 1.0033,
"step": 71400
},
{
"epoch": 4.373829495195974,
"grad_norm": 0.3560939431190491,
"learning_rate": 7.641960293052442e-06,
"loss": 1.004,
"step": 71700
},
{
"epoch": 4.392130547506482,
"grad_norm": 0.3844592571258545,
"learning_rate": 7.207216024115415e-06,
"loss": 1.001,
"step": 72000
},
{
"epoch": 4.41043159981699,
"grad_norm": 0.3786323070526123,
"learning_rate": 6.784740348986785e-06,
"loss": 1.0043,
"step": 72300
},
{
"epoch": 4.428732652127497,
"grad_norm": 0.3794030249118805,
"learning_rate": 6.374589125269203e-06,
"loss": 1.0005,
"step": 72600
},
{
"epoch": 4.447033704438005,
"grad_norm": 0.376708984375,
"learning_rate": 5.976816581088418e-06,
"loss": 1.0042,
"step": 72900
},
{
"epoch": 4.465334756748513,
"grad_norm": 0.3753119707107544,
"learning_rate": 5.591475307923744e-06,
"loss": 0.9998,
"step": 73200
},
{
"epoch": 4.483635809059021,
"grad_norm": 0.39294296503067017,
"learning_rate": 5.21861625365444e-06,
"loss": 1.002,
"step": 73500
},
{
"epoch": 4.501936861369529,
"grad_norm": 0.386958509683609,
"learning_rate": 4.8582887158238e-06,
"loss": 1.0032,
"step": 73800
},
{
"epoch": 4.520237913680036,
"grad_norm": 0.3635695278644562,
"learning_rate": 4.510540335121205e-06,
"loss": 1.0002,
"step": 74100
},
{
"epoch": 4.538538965990544,
"grad_norm": 0.40328043699264526,
"learning_rate": 4.175417089083378e-06,
"loss": 0.9995,
"step": 74400
},
{
"epoch": 4.5568400183010525,
"grad_norm": 0.39885058999061584,
"learning_rate": 3.852963286015376e-06,
"loss": 1.0049,
"step": 74700
},
{
"epoch": 4.5751410706115605,
"grad_norm": 0.3917011022567749,
"learning_rate": 3.5432215591324702e-06,
"loss": 0.9994,
"step": 75000
},
{
"epoch": 4.593442122922068,
"grad_norm": 0.36823955178260803,
"learning_rate": 3.2462328609232707e-06,
"loss": 0.998,
"step": 75300
},
{
"epoch": 4.611743175232576,
"grad_norm": 0.3946107029914856,
"learning_rate": 2.962036457735329e-06,
"loss": 1.0011,
"step": 75600
},
{
"epoch": 4.630044227543084,
"grad_norm": 0.37494683265686035,
"learning_rate": 2.6906699245834554e-06,
"loss": 1.0024,
"step": 75900
},
{
"epoch": 4.648345279853592,
"grad_norm": 0.3627372980117798,
"learning_rate": 2.4321691401817725e-06,
"loss": 1.0021,
"step": 76200
},
{
"epoch": 4.666646332164099,
"grad_norm": 0.3681999742984772,
"learning_rate": 2.1865682821999966e-06,
"loss": 1.0038,
"step": 76500
},
{
"epoch": 4.684947384474607,
"grad_norm": 0.38293156027793884,
"learning_rate": 1.9538998227446424e-06,
"loss": 0.9986,
"step": 76800
},
{
"epoch": 4.703248436785115,
"grad_norm": 0.3759300410747528,
"learning_rate": 1.7341945240657243e-06,
"loss": 1.0026,
"step": 77100
},
{
"epoch": 4.721549489095623,
"grad_norm": 0.4103819727897644,
"learning_rate": 1.5274814344895749e-06,
"loss": 0.9956,
"step": 77400
},
{
"epoch": 4.739850541406131,
"grad_norm": 0.38952457904815674,
"learning_rate": 1.3337878845781082e-06,
"loss": 0.9961,
"step": 77700
},
{
"epoch": 4.758151593716638,
"grad_norm": 0.370206356048584,
"learning_rate": 1.1531394835154308e-06,
"loss": 1.0005,
"step": 78000
},
{
"epoch": 4.776452646027146,
"grad_norm": 0.3750290274620056,
"learning_rate": 9.85560115721862e-07,
"loss": 0.9981,
"step": 78300
},
{
"epoch": 4.794753698337654,
"grad_norm": 0.3763698637485504,
"learning_rate": 8.310719376960596e-07,
"loss": 1.0021,
"step": 78600
},
{
"epoch": 4.8130547506481625,
"grad_norm": 0.3856047987937927,
"learning_rate": 6.896953750856083e-07,
"loss": 0.9989,
"step": 78900
},
{
"epoch": 4.8313558029586705,
"grad_norm": 0.3992738425731659,
"learning_rate": 5.61449119986468e-07,
"loss": 1.0031,
"step": 79200
},
{
"epoch": 4.849656855269178,
"grad_norm": 0.39547818899154663,
"learning_rate": 4.463501284715399e-07,
"loss": 1.0039,
"step": 79500
},
{
"epoch": 4.867957907579686,
"grad_norm": 0.39359623193740845,
"learning_rate": 3.4441361834886e-07,
"loss": 0.9978,
"step": 79800
},
{
"epoch": 4.886258959890194,
"grad_norm": 0.3566698431968689,
"learning_rate": 2.5565306714960825e-07,
"loss": 0.9969,
"step": 80100
},
{
"epoch": 4.904560012200702,
"grad_norm": 0.38897261023521423,
"learning_rate": 1.800802103461008e-07,
"loss": 1.0021,
"step": 80400
},
{
"epoch": 4.92286106451121,
"grad_norm": 0.38005581498146057,
"learning_rate": 1.1770503980024216e-07,
"loss": 1.0041,
"step": 80700
},
{
"epoch": 4.941162116821717,
"grad_norm": 0.39922693371772766,
"learning_rate": 6.853580244243762e-08,
"loss": 1.0011,
"step": 81000
},
{
"epoch": 4.959463169132225,
"grad_norm": 0.37051552534103394,
"learning_rate": 3.257899918120977e-08,
"loss": 1.002,
"step": 81300
},
{
"epoch": 4.977764221442733,
"grad_norm": 0.3841174840927124,
"learning_rate": 9.839384043719423e-09,
"loss": 1.0027,
"step": 81600
},
{
"epoch": 4.996065273753241,
"grad_norm": 0.3595673441886902,
"learning_rate": 3.1996354714625144e-10,
"loss": 1.0007,
"step": 81900
},
{
"epoch": 5.0,
"step": 81965,
"total_flos": 1.7441017798508093e+19,
"train_loss": 1.117615111251009,
"train_runtime": 146190.8611,
"train_samples_per_second": 143.527,
"train_steps_per_second": 0.561
}
],
"logging_steps": 300,
"max_steps": 81965,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7441017798508093e+19,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}