{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.15151515151515152,
"eval_steps": 50000,
"global_step": 90000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003367003367003367,
"grad_norm": 0.3848826289176941,
"learning_rate": 4.998316498316499e-05,
"loss": 0.483,
"step": 200
},
{
"epoch": 0.0006734006734006734,
"grad_norm": 0.28108373284339905,
"learning_rate": 4.9966329966329964e-05,
"loss": 0.4907,
"step": 400
},
{
"epoch": 0.00101010101010101,
"grad_norm": 1.1632909774780273,
"learning_rate": 4.994949494949495e-05,
"loss": 0.4656,
"step": 600
},
{
"epoch": 0.0013468013468013469,
"grad_norm": 0.38611674308776855,
"learning_rate": 4.993265993265993e-05,
"loss": 0.4801,
"step": 800
},
{
"epoch": 0.0016835016835016834,
"grad_norm": 0.44287604093551636,
"learning_rate": 4.991582491582492e-05,
"loss": 0.5212,
"step": 1000
},
{
"epoch": 0.00202020202020202,
"grad_norm": 0.3867124617099762,
"learning_rate": 4.9899074074074075e-05,
"loss": 0.4678,
"step": 1200
},
{
"epoch": 0.0023569023569023568,
"grad_norm": 0.5485501885414124,
"learning_rate": 4.988223905723906e-05,
"loss": 0.4865,
"step": 1400
},
{
"epoch": 0.0026936026936026937,
"grad_norm": 0.20481592416763306,
"learning_rate": 4.986548821548822e-05,
"loss": 0.5055,
"step": 1600
},
{
"epoch": 0.0030303030303030303,
"grad_norm": 0.2336668223142624,
"learning_rate": 4.98486531986532e-05,
"loss": 0.4695,
"step": 1800
},
{
"epoch": 0.003367003367003367,
"grad_norm": 0.5381026268005371,
"learning_rate": 4.9831818181818186e-05,
"loss": 0.4399,
"step": 2000
},
{
"epoch": 0.003703703703703704,
"grad_norm": 0.7313366532325745,
"learning_rate": 4.981498316498317e-05,
"loss": 0.4459,
"step": 2200
},
{
"epoch": 0.00404040404040404,
"grad_norm": 0.36844003200531006,
"learning_rate": 4.979814814814815e-05,
"loss": 0.4606,
"step": 2400
},
{
"epoch": 0.004377104377104377,
"grad_norm": 0.3787059187889099,
"learning_rate": 4.978131313131313e-05,
"loss": 0.5135,
"step": 2600
},
{
"epoch": 0.0047138047138047135,
"grad_norm": 0.1680217981338501,
"learning_rate": 4.9764478114478116e-05,
"loss": 0.4913,
"step": 2800
},
{
"epoch": 0.005050505050505051,
"grad_norm": 0.2730850875377655,
"learning_rate": 4.97476430976431e-05,
"loss": 0.4262,
"step": 3000
},
{
"epoch": 0.0053872053872053875,
"grad_norm": 0.9977230429649353,
"learning_rate": 4.9730808080808085e-05,
"loss": 0.4774,
"step": 3200
},
{
"epoch": 0.005723905723905724,
"grad_norm": 0.38187840580940247,
"learning_rate": 4.971397306397307e-05,
"loss": 0.4642,
"step": 3400
},
{
"epoch": 0.006060606060606061,
"grad_norm": 0.33152151107788086,
"learning_rate": 4.969713804713805e-05,
"loss": 0.4542,
"step": 3600
},
{
"epoch": 0.006397306397306397,
"grad_norm": 0.4819263815879822,
"learning_rate": 4.968030303030303e-05,
"loss": 0.456,
"step": 3800
},
{
"epoch": 0.006734006734006734,
"grad_norm": 0.7936732172966003,
"learning_rate": 4.9663468013468016e-05,
"loss": 0.5043,
"step": 4000
},
{
"epoch": 0.007070707070707071,
"grad_norm": 0.15946243703365326,
"learning_rate": 4.9646632996632993e-05,
"loss": 0.4722,
"step": 4200
},
{
"epoch": 0.007407407407407408,
"grad_norm": 0.32496747374534607,
"learning_rate": 4.9629797979797985e-05,
"loss": 0.5135,
"step": 4400
},
{
"epoch": 0.007744107744107744,
"grad_norm": 0.4165472388267517,
"learning_rate": 4.961304713804714e-05,
"loss": 0.5593,
"step": 4600
},
{
"epoch": 0.00808080808080808,
"grad_norm": 0.25771287083625793,
"learning_rate": 4.959621212121212e-05,
"loss": 0.4484,
"step": 4800
},
{
"epoch": 0.008417508417508417,
"grad_norm": 0.3406078815460205,
"learning_rate": 4.9579377104377104e-05,
"loss": 0.4823,
"step": 5000
},
{
"epoch": 0.008754208754208754,
"grad_norm": 8.295658111572266,
"learning_rate": 4.956254208754209e-05,
"loss": 0.4635,
"step": 5200
},
{
"epoch": 0.00909090909090909,
"grad_norm": 0.44405078887939453,
"learning_rate": 4.954570707070707e-05,
"loss": 0.4985,
"step": 5400
},
{
"epoch": 0.009427609427609427,
"grad_norm": 0.4546993374824524,
"learning_rate": 4.952887205387206e-05,
"loss": 0.5113,
"step": 5600
},
{
"epoch": 0.009764309764309764,
"grad_norm": 0.5952478647232056,
"learning_rate": 4.951203703703704e-05,
"loss": 0.514,
"step": 5800
},
{
"epoch": 0.010101010101010102,
"grad_norm": 0.27716225385665894,
"learning_rate": 4.9495202020202026e-05,
"loss": 0.5375,
"step": 6000
},
{
"epoch": 0.010437710437710438,
"grad_norm": 0.7038645148277283,
"learning_rate": 4.9478367003367004e-05,
"loss": 0.4905,
"step": 6200
},
{
"epoch": 0.010774410774410775,
"grad_norm": 0.48266687989234924,
"learning_rate": 4.946153198653199e-05,
"loss": 0.4924,
"step": 6400
},
{
"epoch": 0.011111111111111112,
"grad_norm": 0.5230734944343567,
"learning_rate": 4.944469696969697e-05,
"loss": 0.533,
"step": 6600
},
{
"epoch": 0.011447811447811448,
"grad_norm": 1.7186241149902344,
"learning_rate": 4.942794612794613e-05,
"loss": 0.4852,
"step": 6800
},
{
"epoch": 0.011784511784511785,
"grad_norm": 0.5641151666641235,
"learning_rate": 4.9411111111111114e-05,
"loss": 0.492,
"step": 7000
},
{
"epoch": 0.012121212121212121,
"grad_norm": 0.18955977261066437,
"learning_rate": 4.93942760942761e-05,
"loss": 0.4994,
"step": 7200
},
{
"epoch": 0.012457912457912458,
"grad_norm": 0.4500541687011719,
"learning_rate": 4.9377441077441076e-05,
"loss": 0.4761,
"step": 7400
},
{
"epoch": 0.012794612794612794,
"grad_norm": 0.5736501812934875,
"learning_rate": 4.936060606060606e-05,
"loss": 0.4835,
"step": 7600
},
{
"epoch": 0.013131313131313131,
"grad_norm": 0.12210117280483246,
"learning_rate": 4.9343771043771045e-05,
"loss": 0.4753,
"step": 7800
},
{
"epoch": 0.013468013468013467,
"grad_norm": 0.6027535796165466,
"learning_rate": 4.932693602693603e-05,
"loss": 0.442,
"step": 8000
},
{
"epoch": 0.013804713804713804,
"grad_norm": 0.6002740859985352,
"learning_rate": 4.9310101010101014e-05,
"loss": 0.4668,
"step": 8200
},
{
"epoch": 0.014141414141414142,
"grad_norm": 0.45654693245887756,
"learning_rate": 4.9293265993266e-05,
"loss": 0.4792,
"step": 8400
},
{
"epoch": 0.014478114478114479,
"grad_norm": 0.5119714736938477,
"learning_rate": 4.9276430976430976e-05,
"loss": 0.4767,
"step": 8600
},
{
"epoch": 0.014814814814814815,
"grad_norm": 0.22278934717178345,
"learning_rate": 4.925959595959596e-05,
"loss": 0.4826,
"step": 8800
},
{
"epoch": 0.015151515151515152,
"grad_norm": 0.265720933675766,
"learning_rate": 4.9242845117845124e-05,
"loss": 0.5267,
"step": 9000
},
{
"epoch": 0.015488215488215488,
"grad_norm": 0.4445594847202301,
"learning_rate": 4.922601010101011e-05,
"loss": 0.507,
"step": 9200
},
{
"epoch": 0.015824915824915825,
"grad_norm": 0.6113983988761902,
"learning_rate": 4.9209175084175086e-05,
"loss": 0.5192,
"step": 9400
},
{
"epoch": 0.01616161616161616,
"grad_norm": 0.2967589497566223,
"learning_rate": 4.919234006734007e-05,
"loss": 0.4728,
"step": 9600
},
{
"epoch": 0.016498316498316498,
"grad_norm": 1.5599192380905151,
"learning_rate": 4.9175505050505055e-05,
"loss": 0.5145,
"step": 9800
},
{
"epoch": 0.016835016835016835,
"grad_norm": 0.6804638504981995,
"learning_rate": 4.915867003367003e-05,
"loss": 0.4293,
"step": 10000
},
{
"epoch": 0.01717171717171717,
"grad_norm": 0.5212819576263428,
"learning_rate": 4.914183501683502e-05,
"loss": 0.5484,
"step": 10200
},
{
"epoch": 0.017508417508417508,
"grad_norm": 0.5872311592102051,
"learning_rate": 4.9125e-05,
"loss": 0.502,
"step": 10400
},
{
"epoch": 0.017845117845117844,
"grad_norm": 0.2819989323616028,
"learning_rate": 4.9108164983164986e-05,
"loss": 0.474,
"step": 10600
},
{
"epoch": 0.01818181818181818,
"grad_norm": 0.2001451998949051,
"learning_rate": 4.909132996632997e-05,
"loss": 0.4348,
"step": 10800
},
{
"epoch": 0.018518518518518517,
"grad_norm": 2.0528833866119385,
"learning_rate": 4.9074494949494955e-05,
"loss": 0.4569,
"step": 11000
},
{
"epoch": 0.018855218855218854,
"grad_norm": 0.7602177858352661,
"learning_rate": 4.905765993265993e-05,
"loss": 0.4319,
"step": 11200
},
{
"epoch": 0.01919191919191919,
"grad_norm": 0.2091585099697113,
"learning_rate": 4.904090909090909e-05,
"loss": 0.4193,
"step": 11400
},
{
"epoch": 0.019528619528619527,
"grad_norm": 0.20664581656455994,
"learning_rate": 4.902407407407408e-05,
"loss": 0.493,
"step": 11600
},
{
"epoch": 0.019865319865319864,
"grad_norm": 0.40701425075531006,
"learning_rate": 4.900723905723906e-05,
"loss": 0.5205,
"step": 11800
},
{
"epoch": 0.020202020202020204,
"grad_norm": 1.0059823989868164,
"learning_rate": 4.899040404040404e-05,
"loss": 0.4822,
"step": 12000
},
{
"epoch": 0.02053872053872054,
"grad_norm": 0.2437160164117813,
"learning_rate": 4.897356902356903e-05,
"loss": 0.4796,
"step": 12200
},
{
"epoch": 0.020875420875420877,
"grad_norm": 0.3996870219707489,
"learning_rate": 4.8956734006734005e-05,
"loss": 0.4243,
"step": 12400
},
{
"epoch": 0.021212121212121213,
"grad_norm": 1.2844949960708618,
"learning_rate": 4.893989898989899e-05,
"loss": 0.4628,
"step": 12600
},
{
"epoch": 0.02154882154882155,
"grad_norm": 0.5382771492004395,
"learning_rate": 4.8923063973063974e-05,
"loss": 0.5122,
"step": 12800
},
{
"epoch": 0.021885521885521887,
"grad_norm": 1.4352107048034668,
"learning_rate": 4.890622895622896e-05,
"loss": 0.4959,
"step": 13000
},
{
"epoch": 0.022222222222222223,
"grad_norm": 0.3689097464084625,
"learning_rate": 4.888939393939394e-05,
"loss": 0.4711,
"step": 13200
},
{
"epoch": 0.02255892255892256,
"grad_norm": 0.13188982009887695,
"learning_rate": 4.887255892255893e-05,
"loss": 0.4761,
"step": 13400
},
{
"epoch": 0.022895622895622896,
"grad_norm": 0.18019753694534302,
"learning_rate": 4.885572390572391e-05,
"loss": 0.4604,
"step": 13600
},
{
"epoch": 0.023232323232323233,
"grad_norm": 0.4621998071670532,
"learning_rate": 4.883888888888889e-05,
"loss": 0.5366,
"step": 13800
},
{
"epoch": 0.02356902356902357,
"grad_norm": 0.21382929384708405,
"learning_rate": 4.882205387205387e-05,
"loss": 0.4864,
"step": 14000
},
{
"epoch": 0.023905723905723906,
"grad_norm": 0.2051325887441635,
"learning_rate": 4.880521885521886e-05,
"loss": 0.458,
"step": 14200
},
{
"epoch": 0.024242424242424242,
"grad_norm": 0.3620564341545105,
"learning_rate": 4.8788468013468015e-05,
"loss": 0.499,
"step": 14400
},
{
"epoch": 0.02457912457912458,
"grad_norm": 0.9814438223838806,
"learning_rate": 4.8771632996633e-05,
"loss": 0.442,
"step": 14600
},
{
"epoch": 0.024915824915824916,
"grad_norm": 0.46618032455444336,
"learning_rate": 4.8754797979797984e-05,
"loss": 0.4635,
"step": 14800
},
{
"epoch": 0.025252525252525252,
"grad_norm": 0.2610645592212677,
"learning_rate": 4.873796296296296e-05,
"loss": 0.458,
"step": 15000
},
{
"epoch": 0.02558922558922559,
"grad_norm": 0.3188152611255646,
"learning_rate": 4.8721127946127946e-05,
"loss": 0.4726,
"step": 15200
},
{
"epoch": 0.025925925925925925,
"grad_norm": 0.3566981852054596,
"learning_rate": 4.870429292929293e-05,
"loss": 0.4847,
"step": 15400
},
{
"epoch": 0.026262626262626262,
"grad_norm": 1.0731638669967651,
"learning_rate": 4.8687457912457914e-05,
"loss": 0.4912,
"step": 15600
},
{
"epoch": 0.0265993265993266,
"grad_norm": 0.4343542456626892,
"learning_rate": 4.86706228956229e-05,
"loss": 0.471,
"step": 15800
},
{
"epoch": 0.026936026936026935,
"grad_norm": 0.37956860661506653,
"learning_rate": 4.865378787878788e-05,
"loss": 0.4797,
"step": 16000
},
{
"epoch": 0.02727272727272727,
"grad_norm": 0.49000558257102966,
"learning_rate": 4.863695286195287e-05,
"loss": 0.5084,
"step": 16200
},
{
"epoch": 0.027609427609427608,
"grad_norm": 0.28972625732421875,
"learning_rate": 4.8620117845117845e-05,
"loss": 0.493,
"step": 16400
},
{
"epoch": 0.027946127946127945,
"grad_norm": 0.5928806662559509,
"learning_rate": 4.860328282828283e-05,
"loss": 0.5001,
"step": 16600
},
{
"epoch": 0.028282828282828285,
"grad_norm": 0.4121922552585602,
"learning_rate": 4.8586447811447814e-05,
"loss": 0.448,
"step": 16800
},
{
"epoch": 0.02861952861952862,
"grad_norm": 0.3214101195335388,
"learning_rate": 4.85696127946128e-05,
"loss": 0.5098,
"step": 17000
},
{
"epoch": 0.028956228956228958,
"grad_norm": 2.317594289779663,
"learning_rate": 4.855277777777778e-05,
"loss": 0.4628,
"step": 17200
},
{
"epoch": 0.029292929292929294,
"grad_norm": 0.46101972460746765,
"learning_rate": 4.853594276094277e-05,
"loss": 0.4556,
"step": 17400
},
{
"epoch": 0.02962962962962963,
"grad_norm": 0.24499452114105225,
"learning_rate": 4.8519107744107745e-05,
"loss": 0.4103,
"step": 17600
},
{
"epoch": 0.029966329966329967,
"grad_norm": 0.21861068904399872,
"learning_rate": 4.850227272727273e-05,
"loss": 0.4887,
"step": 17800
},
{
"epoch": 0.030303030303030304,
"grad_norm": 0.6664220094680786,
"learning_rate": 4.848543771043771e-05,
"loss": 0.4822,
"step": 18000
},
{
"epoch": 0.03063973063973064,
"grad_norm": 0.5134005546569824,
"learning_rate": 4.846860269360269e-05,
"loss": 0.4432,
"step": 18200
},
{
"epoch": 0.030976430976430977,
"grad_norm": 0.31726887822151184,
"learning_rate": 4.8451851851851855e-05,
"loss": 0.4757,
"step": 18400
},
{
"epoch": 0.031313131313131314,
"grad_norm": 0.5236911177635193,
"learning_rate": 4.843501683501684e-05,
"loss": 0.4522,
"step": 18600
},
{
"epoch": 0.03164983164983165,
"grad_norm": 0.359935998916626,
"learning_rate": 4.841818181818182e-05,
"loss": 0.4901,
"step": 18800
},
{
"epoch": 0.03198653198653199,
"grad_norm": 0.5292563438415527,
"learning_rate": 4.84013468013468e-05,
"loss": 0.4962,
"step": 19000
},
{
"epoch": 0.03232323232323232,
"grad_norm": 0.5163784623146057,
"learning_rate": 4.8384511784511786e-05,
"loss": 0.4427,
"step": 19200
},
{
"epoch": 0.03265993265993266,
"grad_norm": 0.19916895031929016,
"learning_rate": 4.836767676767677e-05,
"loss": 0.4778,
"step": 19400
},
{
"epoch": 0.032996632996632996,
"grad_norm": 0.16879796981811523,
"learning_rate": 4.8350841750841755e-05,
"loss": 0.4561,
"step": 19600
},
{
"epoch": 0.03333333333333333,
"grad_norm": 0.40591439604759216,
"learning_rate": 4.833400673400674e-05,
"loss": 0.532,
"step": 19800
},
{
"epoch": 0.03367003367003367,
"grad_norm": 0.27528542280197144,
"learning_rate": 4.8317171717171723e-05,
"loss": 0.5181,
"step": 20000
},
{
"epoch": 0.034006734006734006,
"grad_norm": 0.46540895104408264,
"learning_rate": 4.83003367003367e-05,
"loss": 0.5005,
"step": 20200
},
{
"epoch": 0.03434343434343434,
"grad_norm": 0.4676566421985626,
"learning_rate": 4.8283501683501685e-05,
"loss": 0.4752,
"step": 20400
},
{
"epoch": 0.03468013468013468,
"grad_norm": 0.5396921038627625,
"learning_rate": 4.826666666666667e-05,
"loss": 0.4566,
"step": 20600
},
{
"epoch": 0.035016835016835016,
"grad_norm": 0.1875556856393814,
"learning_rate": 4.824983164983165e-05,
"loss": 0.4705,
"step": 20800
},
{
"epoch": 0.03535353535353535,
"grad_norm": 0.5470389723777771,
"learning_rate": 4.823299663299664e-05,
"loss": 0.5035,
"step": 21000
},
{
"epoch": 0.03569023569023569,
"grad_norm": 0.2772787809371948,
"learning_rate": 4.821616161616162e-05,
"loss": 0.4857,
"step": 21200
},
{
"epoch": 0.036026936026936025,
"grad_norm": 0.43938860297203064,
"learning_rate": 4.81993265993266e-05,
"loss": 0.5107,
"step": 21400
},
{
"epoch": 0.03636363636363636,
"grad_norm": 0.2839397192001343,
"learning_rate": 4.818257575757576e-05,
"loss": 0.451,
"step": 21600
},
{
"epoch": 0.0367003367003367,
"grad_norm": 0.46151599287986755,
"learning_rate": 4.816574074074074e-05,
"loss": 0.4984,
"step": 21800
},
{
"epoch": 0.037037037037037035,
"grad_norm": 0.4271756410598755,
"learning_rate": 4.814890572390573e-05,
"loss": 0.4462,
"step": 22000
},
{
"epoch": 0.03737373737373737,
"grad_norm": 0.20119212567806244,
"learning_rate": 4.813207070707071e-05,
"loss": 0.4914,
"step": 22200
},
{
"epoch": 0.03771043771043771,
"grad_norm": 0.7174796462059021,
"learning_rate": 4.8115235690235696e-05,
"loss": 0.4468,
"step": 22400
},
{
"epoch": 0.038047138047138045,
"grad_norm": 0.25557178258895874,
"learning_rate": 4.809840067340067e-05,
"loss": 0.4969,
"step": 22600
},
{
"epoch": 0.03838383838383838,
"grad_norm": 0.2094777226448059,
"learning_rate": 4.808156565656566e-05,
"loss": 0.4552,
"step": 22800
},
{
"epoch": 0.03872053872053872,
"grad_norm": 0.2642809748649597,
"learning_rate": 4.806473063973064e-05,
"loss": 0.4741,
"step": 23000
},
{
"epoch": 0.039057239057239054,
"grad_norm": 0.9128819704055786,
"learning_rate": 4.8047895622895626e-05,
"loss": 0.4964,
"step": 23200
},
{
"epoch": 0.03939393939393939,
"grad_norm": 0.6326189637184143,
"learning_rate": 4.8031060606060604e-05,
"loss": 0.4749,
"step": 23400
},
{
"epoch": 0.03973063973063973,
"grad_norm": 0.6259990334510803,
"learning_rate": 4.8014225589225595e-05,
"loss": 0.4915,
"step": 23600
},
{
"epoch": 0.040067340067340064,
"grad_norm": 0.22890082001686096,
"learning_rate": 4.799739057239058e-05,
"loss": 0.4941,
"step": 23800
},
{
"epoch": 0.04040404040404041,
"grad_norm": 0.28990963101387024,
"learning_rate": 4.798063973063973e-05,
"loss": 0.4874,
"step": 24000
},
{
"epoch": 0.040740740740740744,
"grad_norm": 0.21455387771129608,
"learning_rate": 4.7963804713804715e-05,
"loss": 0.5352,
"step": 24200
},
{
"epoch": 0.04107744107744108,
"grad_norm": 0.17958062887191772,
"learning_rate": 4.79469696969697e-05,
"loss": 0.421,
"step": 24400
},
{
"epoch": 0.04141414141414142,
"grad_norm": 0.36474618315696716,
"learning_rate": 4.793013468013468e-05,
"loss": 0.4513,
"step": 24600
},
{
"epoch": 0.041750841750841754,
"grad_norm": 0.4638340175151825,
"learning_rate": 4.791329966329967e-05,
"loss": 0.5075,
"step": 24800
},
{
"epoch": 0.04208754208754209,
"grad_norm": 0.2026415318250656,
"learning_rate": 4.789646464646465e-05,
"loss": 0.4807,
"step": 25000
},
{
"epoch": 0.04242424242424243,
"grad_norm": 0.3414445221424103,
"learning_rate": 4.787962962962963e-05,
"loss": 0.4731,
"step": 25200
},
{
"epoch": 0.04276094276094276,
"grad_norm": 0.20735152065753937,
"learning_rate": 4.7862794612794614e-05,
"loss": 0.5116,
"step": 25400
},
{
"epoch": 0.0430976430976431,
"grad_norm": 0.3263112008571625,
"learning_rate": 4.78459595959596e-05,
"loss": 0.4675,
"step": 25600
},
{
"epoch": 0.043434343434343436,
"grad_norm": 1.1240352392196655,
"learning_rate": 4.7829124579124576e-05,
"loss": 0.4651,
"step": 25800
},
{
"epoch": 0.04377104377104377,
"grad_norm": 0.47903600335121155,
"learning_rate": 4.781228956228957e-05,
"loss": 0.4696,
"step": 26000
},
{
"epoch": 0.04410774410774411,
"grad_norm": 2.3555760383605957,
"learning_rate": 4.779545454545455e-05,
"loss": 0.4826,
"step": 26200
},
{
"epoch": 0.044444444444444446,
"grad_norm": 0.22116072475910187,
"learning_rate": 4.777861952861953e-05,
"loss": 0.483,
"step": 26400
},
{
"epoch": 0.04478114478114478,
"grad_norm": 0.7030754685401917,
"learning_rate": 4.7761784511784514e-05,
"loss": 0.4317,
"step": 26600
},
{
"epoch": 0.04511784511784512,
"grad_norm": 0.3827281594276428,
"learning_rate": 4.774503367003367e-05,
"loss": 0.4292,
"step": 26800
},
{
"epoch": 0.045454545454545456,
"grad_norm": 0.8839394450187683,
"learning_rate": 4.7728198653198655e-05,
"loss": 0.4646,
"step": 27000
},
{
"epoch": 0.04579124579124579,
"grad_norm": 0.5600021481513977,
"learning_rate": 4.771136363636364e-05,
"loss": 0.5081,
"step": 27200
},
{
"epoch": 0.04612794612794613,
"grad_norm": 0.23018187284469604,
"learning_rate": 4.7694528619528624e-05,
"loss": 0.4908,
"step": 27400
},
{
"epoch": 0.046464646464646465,
"grad_norm": 0.456559419631958,
"learning_rate": 4.767769360269361e-05,
"loss": 0.4872,
"step": 27600
},
{
"epoch": 0.0468013468013468,
"grad_norm": 0.12121502310037613,
"learning_rate": 4.7660858585858586e-05,
"loss": 0.4979,
"step": 27800
},
{
"epoch": 0.04713804713804714,
"grad_norm": 0.38946759700775146,
"learning_rate": 4.764402356902357e-05,
"loss": 0.4651,
"step": 28000
},
{
"epoch": 0.047474747474747475,
"grad_norm": 2.142538547515869,
"learning_rate": 4.7627188552188555e-05,
"loss": 0.4288,
"step": 28200
},
{
"epoch": 0.04781144781144781,
"grad_norm": 0.15298590064048767,
"learning_rate": 4.761035353535353e-05,
"loss": 0.4365,
"step": 28400
},
{
"epoch": 0.04814814814814815,
"grad_norm": 0.6363445520401001,
"learning_rate": 4.7593518518518524e-05,
"loss": 0.4428,
"step": 28600
},
{
"epoch": 0.048484848484848485,
"grad_norm": 0.5169795751571655,
"learning_rate": 4.757668350168351e-05,
"loss": 0.4666,
"step": 28800
},
{
"epoch": 0.04882154882154882,
"grad_norm": 0.2661610245704651,
"learning_rate": 4.7559848484848486e-05,
"loss": 0.4492,
"step": 29000
},
{
"epoch": 0.04915824915824916,
"grad_norm": 0.4399496018886566,
"learning_rate": 4.754301346801347e-05,
"loss": 0.4504,
"step": 29200
},
{
"epoch": 0.049494949494949494,
"grad_norm": 0.31160300970077515,
"learning_rate": 4.7526178451178454e-05,
"loss": 0.4527,
"step": 29400
},
{
"epoch": 0.04983164983164983,
"grad_norm": 0.6169541478157043,
"learning_rate": 4.750934343434343e-05,
"loss": 0.4995,
"step": 29600
},
{
"epoch": 0.05016835016835017,
"grad_norm": 0.43009576201438904,
"learning_rate": 4.7492508417508416e-05,
"loss": 0.4681,
"step": 29800
},
{
"epoch": 0.050505050505050504,
"grad_norm": 0.3901723623275757,
"learning_rate": 4.74756734006734e-05,
"loss": 0.4509,
"step": 30000
},
{
"epoch": 0.05084175084175084,
"grad_norm": 0.2859044373035431,
"learning_rate": 4.745883838383839e-05,
"loss": 0.4262,
"step": 30200
},
{
"epoch": 0.05117845117845118,
"grad_norm": 0.29008200764656067,
"learning_rate": 4.744208754208754e-05,
"loss": 0.434,
"step": 30400
},
{
"epoch": 0.051515151515151514,
"grad_norm": 0.8030261397361755,
"learning_rate": 4.742525252525253e-05,
"loss": 0.5066,
"step": 30600
},
{
"epoch": 0.05185185185185185,
"grad_norm": 0.6552255153656006,
"learning_rate": 4.740841750841751e-05,
"loss": 0.4888,
"step": 30800
},
{
"epoch": 0.05218855218855219,
"grad_norm": 0.47479531168937683,
"learning_rate": 4.7391582491582496e-05,
"loss": 0.4886,
"step": 31000
},
{
"epoch": 0.052525252525252523,
"grad_norm": 0.6294977068901062,
"learning_rate": 4.737474747474748e-05,
"loss": 0.4144,
"step": 31200
},
{
"epoch": 0.05286195286195286,
"grad_norm": 0.346327006816864,
"learning_rate": 4.7357912457912465e-05,
"loss": 0.517,
"step": 31400
},
{
"epoch": 0.0531986531986532,
"grad_norm": 0.4449813961982727,
"learning_rate": 4.734107744107744e-05,
"loss": 0.4681,
"step": 31600
},
{
"epoch": 0.05353535353535353,
"grad_norm": 0.4756247401237488,
"learning_rate": 4.7324242424242426e-05,
"loss": 0.4772,
"step": 31800
},
{
"epoch": 0.05387205387205387,
"grad_norm": 0.5156170129776001,
"learning_rate": 4.730740740740741e-05,
"loss": 0.456,
"step": 32000
},
{
"epoch": 0.054208754208754206,
"grad_norm": 1.6009584665298462,
"learning_rate": 4.729057239057239e-05,
"loss": 0.4483,
"step": 32200
},
{
"epoch": 0.05454545454545454,
"grad_norm": 0.4179598391056061,
"learning_rate": 4.727373737373737e-05,
"loss": 0.4438,
"step": 32400
},
{
"epoch": 0.05488215488215488,
"grad_norm": 0.09701373428106308,
"learning_rate": 4.725690235690236e-05,
"loss": 0.4746,
"step": 32600
},
{
"epoch": 0.055218855218855216,
"grad_norm": 0.4205819368362427,
"learning_rate": 4.724006734006734e-05,
"loss": 0.4459,
"step": 32800
},
{
"epoch": 0.05555555555555555,
"grad_norm": 0.5670439004898071,
"learning_rate": 4.7223232323232326e-05,
"loss": 0.5025,
"step": 33000
},
{
"epoch": 0.05589225589225589,
"grad_norm": 0.47377878427505493,
"learning_rate": 4.720639730639731e-05,
"loss": 0.4746,
"step": 33200
},
{
"epoch": 0.056228956228956226,
"grad_norm": 0.254245400428772,
"learning_rate": 4.718956228956229e-05,
"loss": 0.4926,
"step": 33400
},
{
"epoch": 0.05656565656565657,
"grad_norm": 0.299713671207428,
"learning_rate": 4.717272727272727e-05,
"loss": 0.4748,
"step": 33600
},
{
"epoch": 0.056902356902356906,
"grad_norm": 0.16089321672916412,
"learning_rate": 4.715589225589226e-05,
"loss": 0.4813,
"step": 33800
},
{
"epoch": 0.05723905723905724,
"grad_norm": 0.27492621541023254,
"learning_rate": 4.713914141414142e-05,
"loss": 0.473,
"step": 34000
},
{
"epoch": 0.05757575757575758,
"grad_norm": 0.8170735239982605,
"learning_rate": 4.71223063973064e-05,
"loss": 0.4251,
"step": 34200
},
{
"epoch": 0.057912457912457915,
"grad_norm": 0.41072168946266174,
"learning_rate": 4.710547138047138e-05,
"loss": 0.4692,
"step": 34400
},
{
"epoch": 0.05824915824915825,
"grad_norm": 0.37332773208618164,
"learning_rate": 4.708863636363637e-05,
"loss": 0.4289,
"step": 34600
},
{
"epoch": 0.05858585858585859,
"grad_norm": 0.3257604241371155,
"learning_rate": 4.7071801346801345e-05,
"loss": 0.4623,
"step": 34800
},
{
"epoch": 0.058922558922558925,
"grad_norm": 0.23426009714603424,
"learning_rate": 4.705496632996633e-05,
"loss": 0.5082,
"step": 35000
},
{
"epoch": 0.05925925925925926,
"grad_norm": 0.28719109296798706,
"learning_rate": 4.703813131313132e-05,
"loss": 0.4767,
"step": 35200
},
{
"epoch": 0.0595959595959596,
"grad_norm": 0.35480618476867676,
"learning_rate": 4.70212962962963e-05,
"loss": 0.5006,
"step": 35400
},
{
"epoch": 0.059932659932659935,
"grad_norm": 0.5050226449966431,
"learning_rate": 4.700446127946128e-05,
"loss": 0.4809,
"step": 35600
},
{
"epoch": 0.06026936026936027,
"grad_norm": 0.2631937265396118,
"learning_rate": 4.698762626262627e-05,
"loss": 0.4508,
"step": 35800
},
{
"epoch": 0.06060606060606061,
"grad_norm": 0.32295939326286316,
"learning_rate": 4.6970791245791244e-05,
"loss": 0.4697,
"step": 36000
},
{
"epoch": 0.060942760942760944,
"grad_norm": 0.34667742252349854,
"learning_rate": 4.695404040404041e-05,
"loss": 0.4692,
"step": 36200
},
{
"epoch": 0.06127946127946128,
"grad_norm": 0.6987492442131042,
"learning_rate": 4.693720538720539e-05,
"loss": 0.4596,
"step": 36400
},
{
"epoch": 0.06161616161616162,
"grad_norm": 0.4795779883861542,
"learning_rate": 4.692037037037037e-05,
"loss": 0.4361,
"step": 36600
},
{
"epoch": 0.061952861952861954,
"grad_norm": 0.5291064381599426,
"learning_rate": 4.6903535353535355e-05,
"loss": 0.4408,
"step": 36800
},
{
"epoch": 0.06228956228956229,
"grad_norm": 0.46040576696395874,
"learning_rate": 4.688670033670034e-05,
"loss": 0.4976,
"step": 37000
},
{
"epoch": 0.06262626262626263,
"grad_norm": 2.0511856079101562,
"learning_rate": 4.6869865319865324e-05,
"loss": 0.4747,
"step": 37200
},
{
"epoch": 0.06296296296296296,
"grad_norm": 0.6845996975898743,
"learning_rate": 4.68530303030303e-05,
"loss": 0.4145,
"step": 37400
},
{
"epoch": 0.0632996632996633,
"grad_norm": 0.2830463945865631,
"learning_rate": 4.6836195286195286e-05,
"loss": 0.4888,
"step": 37600
},
{
"epoch": 0.06363636363636363,
"grad_norm": 0.4033803343772888,
"learning_rate": 4.681936026936028e-05,
"loss": 0.4584,
"step": 37800
},
{
"epoch": 0.06397306397306397,
"grad_norm": 0.26968395709991455,
"learning_rate": 4.6802525252525255e-05,
"loss": 0.4246,
"step": 38000
},
{
"epoch": 0.0643097643097643,
"grad_norm": 0.14037840068340302,
"learning_rate": 4.678569023569024e-05,
"loss": 0.4047,
"step": 38200
},
{
"epoch": 0.06464646464646465,
"grad_norm": 0.2874729335308075,
"learning_rate": 4.676885521885522e-05,
"loss": 0.4445,
"step": 38400
},
{
"epoch": 0.06498316498316499,
"grad_norm": 0.2633935213088989,
"learning_rate": 4.67520202020202e-05,
"loss": 0.4435,
"step": 38600
},
{
"epoch": 0.06531986531986532,
"grad_norm": 0.4510101079940796,
"learning_rate": 4.6735185185185185e-05,
"loss": 0.4724,
"step": 38800
},
{
"epoch": 0.06565656565656566,
"grad_norm": 0.20095351338386536,
"learning_rate": 4.671835016835017e-05,
"loss": 0.5128,
"step": 39000
},
{
"epoch": 0.06599326599326599,
"grad_norm": 0.4433535635471344,
"learning_rate": 4.670159932659933e-05,
"loss": 0.4581,
"step": 39200
},
{
"epoch": 0.06632996632996634,
"grad_norm": 0.5821954607963562,
"learning_rate": 4.668476430976431e-05,
"loss": 0.4985,
"step": 39400
},
{
"epoch": 0.06666666666666667,
"grad_norm": 1.3577245473861694,
"learning_rate": 4.6667929292929296e-05,
"loss": 0.5283,
"step": 39600
},
{
"epoch": 0.06700336700336701,
"grad_norm": 0.37699806690216064,
"learning_rate": 4.6651094276094274e-05,
"loss": 0.4825,
"step": 39800
},
{
"epoch": 0.06734006734006734,
"grad_norm": 0.41804903745651245,
"learning_rate": 4.663425925925926e-05,
"loss": 0.4907,
"step": 40000
},
{
"epoch": 0.06767676767676768,
"grad_norm": 0.243534654378891,
"learning_rate": 4.661742424242425e-05,
"loss": 0.5059,
"step": 40200
},
{
"epoch": 0.06801346801346801,
"grad_norm": 0.1186649277806282,
"learning_rate": 4.660058922558923e-05,
"loss": 0.4808,
"step": 40400
},
{
"epoch": 0.06835016835016836,
"grad_norm": 0.30161020159721375,
"learning_rate": 4.658375420875421e-05,
"loss": 0.4816,
"step": 40600
},
{
"epoch": 0.06868686868686869,
"grad_norm": 0.2418268918991089,
"learning_rate": 4.6566919191919195e-05,
"loss": 0.4371,
"step": 40800
},
{
"epoch": 0.06902356902356903,
"grad_norm": 0.2833971381187439,
"learning_rate": 4.655008417508418e-05,
"loss": 0.4686,
"step": 41000
},
{
"epoch": 0.06936026936026936,
"grad_norm": 0.2797035276889801,
"learning_rate": 4.653324915824916e-05,
"loss": 0.4599,
"step": 41200
},
{
"epoch": 0.0696969696969697,
"grad_norm": 0.3841836154460907,
"learning_rate": 4.651641414141414e-05,
"loss": 0.4111,
"step": 41400
},
{
"epoch": 0.07003367003367003,
"grad_norm": 0.6590111255645752,
"learning_rate": 4.6499579124579126e-05,
"loss": 0.4631,
"step": 41600
},
{
"epoch": 0.07037037037037037,
"grad_norm": 0.29527220129966736,
"learning_rate": 4.648274410774411e-05,
"loss": 0.4963,
"step": 41800
},
{
"epoch": 0.0707070707070707,
"grad_norm": 0.7178300619125366,
"learning_rate": 4.6465909090909095e-05,
"loss": 0.4694,
"step": 42000
},
{
"epoch": 0.07104377104377105,
"grad_norm": 0.38491058349609375,
"learning_rate": 4.644907407407408e-05,
"loss": 0.4437,
"step": 42200
},
{
"epoch": 0.07138047138047138,
"grad_norm": 0.3037305772304535,
"learning_rate": 4.643223905723906e-05,
"loss": 0.4635,
"step": 42400
},
{
"epoch": 0.07171717171717172,
"grad_norm": 0.4430043697357178,
"learning_rate": 4.641540404040404e-05,
"loss": 0.4623,
"step": 42600
},
{
"epoch": 0.07205387205387205,
"grad_norm": 0.3590750992298126,
"learning_rate": 4.6398569023569026e-05,
"loss": 0.4827,
"step": 42800
},
{
"epoch": 0.0723905723905724,
"grad_norm": 0.6161913275718689,
"learning_rate": 4.638181818181818e-05,
"loss": 0.4796,
"step": 43000
},
{
"epoch": 0.07272727272727272,
"grad_norm": 0.3607730269432068,
"learning_rate": 4.636498316498317e-05,
"loss": 0.4801,
"step": 43200
},
{
"epoch": 0.07306397306397307,
"grad_norm": 0.07056716829538345,
"learning_rate": 4.634814814814815e-05,
"loss": 0.4738,
"step": 43400
},
{
"epoch": 0.0734006734006734,
"grad_norm": 0.09327512234449387,
"learning_rate": 4.633131313131313e-05,
"loss": 0.4885,
"step": 43600
},
{
"epoch": 0.07373737373737374,
"grad_norm": 0.2519952952861786,
"learning_rate": 4.6314478114478114e-05,
"loss": 0.4541,
"step": 43800
},
{
"epoch": 0.07407407407407407,
"grad_norm": 0.4618964195251465,
"learning_rate": 4.62976430976431e-05,
"loss": 0.4593,
"step": 44000
},
{
"epoch": 0.07441077441077441,
"grad_norm": 0.4683738946914673,
"learning_rate": 4.628080808080808e-05,
"loss": 0.4536,
"step": 44200
},
{
"epoch": 0.07474747474747474,
"grad_norm": 0.2552854120731354,
"learning_rate": 4.626397306397307e-05,
"loss": 0.4702,
"step": 44400
},
{
"epoch": 0.07508417508417509,
"grad_norm": 0.33385610580444336,
"learning_rate": 4.624713804713805e-05,
"loss": 0.4623,
"step": 44600
},
{
"epoch": 0.07542087542087542,
"grad_norm": 0.17833998799324036,
"learning_rate": 4.6230303030303036e-05,
"loss": 0.419,
"step": 44800
},
{
"epoch": 0.07575757575757576,
"grad_norm": 0.14885468780994415,
"learning_rate": 4.621346801346801e-05,
"loss": 0.5132,
"step": 45000
},
{
"epoch": 0.07609427609427609,
"grad_norm": 0.4861992597579956,
"learning_rate": 4.6196632996633e-05,
"loss": 0.4914,
"step": 45200
},
{
"epoch": 0.07643097643097643,
"grad_norm": 0.20314612984657288,
"learning_rate": 4.617979797979798e-05,
"loss": 0.4535,
"step": 45400
},
{
"epoch": 0.07676767676767676,
"grad_norm": 0.7097423076629639,
"learning_rate": 4.616304713804714e-05,
"loss": 0.4343,
"step": 45600
},
{
"epoch": 0.0771043771043771,
"grad_norm": 0.23547014594078064,
"learning_rate": 4.6146212121212124e-05,
"loss": 0.449,
"step": 45800
},
{
"epoch": 0.07744107744107744,
"grad_norm": 0.25944817066192627,
"learning_rate": 4.612937710437711e-05,
"loss": 0.4535,
"step": 46000
},
{
"epoch": 0.07777777777777778,
"grad_norm": 0.6145304441452026,
"learning_rate": 4.6112542087542086e-05,
"loss": 0.4536,
"step": 46200
},
{
"epoch": 0.07811447811447811,
"grad_norm": 1.2527995109558105,
"learning_rate": 4.609570707070707e-05,
"loss": 0.4616,
"step": 46400
},
{
"epoch": 0.07845117845117845,
"grad_norm": 0.9534751772880554,
"learning_rate": 4.6078872053872055e-05,
"loss": 0.5009,
"step": 46600
},
{
"epoch": 0.07878787878787878,
"grad_norm": 0.552191436290741,
"learning_rate": 4.606203703703704e-05,
"loss": 0.4738,
"step": 46800
},
{
"epoch": 0.07912457912457913,
"grad_norm": 0.28889888525009155,
"learning_rate": 4.6045202020202023e-05,
"loss": 0.4721,
"step": 47000
},
{
"epoch": 0.07946127946127945,
"grad_norm": 0.4266869127750397,
"learning_rate": 4.602845117845118e-05,
"loss": 0.4695,
"step": 47200
},
{
"epoch": 0.0797979797979798,
"grad_norm": 0.5105581879615784,
"learning_rate": 4.601161616161616e-05,
"loss": 0.4739,
"step": 47400
},
{
"epoch": 0.08013468013468013,
"grad_norm": 0.4175490736961365,
"learning_rate": 4.599478114478114e-05,
"loss": 0.4456,
"step": 47600
},
{
"epoch": 0.08047138047138047,
"grad_norm": 0.3257778584957123,
"learning_rate": 4.5977946127946134e-05,
"loss": 0.4808,
"step": 47800
},
{
"epoch": 0.08080808080808081,
"grad_norm": 0.3000372648239136,
"learning_rate": 4.596111111111112e-05,
"loss": 0.4635,
"step": 48000
},
{
"epoch": 0.08114478114478114,
"grad_norm": 0.32268643379211426,
"learning_rate": 4.5944276094276096e-05,
"loss": 0.4925,
"step": 48200
},
{
"epoch": 0.08148148148148149,
"grad_norm": 0.5290645956993103,
"learning_rate": 4.592744107744108e-05,
"loss": 0.4711,
"step": 48400
},
{
"epoch": 0.08181818181818182,
"grad_norm": 0.29082873463630676,
"learning_rate": 4.5910606060606065e-05,
"loss": 0.4728,
"step": 48600
},
{
"epoch": 0.08215488215488216,
"grad_norm": 0.6704333424568176,
"learning_rate": 4.589377104377104e-05,
"loss": 0.4779,
"step": 48800
},
{
"epoch": 0.08249158249158249,
"grad_norm": 0.27797549962997437,
"learning_rate": 4.587693602693603e-05,
"loss": 0.4529,
"step": 49000
},
{
"epoch": 0.08282828282828283,
"grad_norm": 0.1398361176252365,
"learning_rate": 4.586010101010101e-05,
"loss": 0.4379,
"step": 49200
},
{
"epoch": 0.08316498316498316,
"grad_norm": 0.31926196813583374,
"learning_rate": 4.5843265993265996e-05,
"loss": 0.457,
"step": 49400
},
{
"epoch": 0.08350168350168351,
"grad_norm": 0.17603324353694916,
"learning_rate": 4.582643097643098e-05,
"loss": 0.452,
"step": 49600
},
{
"epoch": 0.08383838383838384,
"grad_norm": 0.4734348654747009,
"learning_rate": 4.5809595959595964e-05,
"loss": 0.4489,
"step": 49800
},
{
"epoch": 0.08417508417508418,
"grad_norm": 0.2849540710449219,
"learning_rate": 4.579276094276094e-05,
"loss": 0.4448,
"step": 50000
},
{
"epoch": 0.08417508417508418,
"eval_loss": 0.4674188494682312,
"eval_runtime": 400.4812,
"eval_samples_per_second": 14.982,
"eval_steps_per_second": 14.982,
"step": 50000
},
{
"epoch": 0.08451178451178451,
"grad_norm": 0.3861866295337677,
"learning_rate": 4.5775925925925926e-05,
"loss": 0.4691,
"step": 50200
},
{
"epoch": 0.08484848484848485,
"grad_norm": 0.23927472531795502,
"learning_rate": 4.575909090909091e-05,
"loss": 0.519,
"step": 50400
},
{
"epoch": 0.08518518518518518,
"grad_norm": 0.2663820683956146,
"learning_rate": 4.5742255892255895e-05,
"loss": 0.4394,
"step": 50600
},
{
"epoch": 0.08552188552188553,
"grad_norm": 0.0782080739736557,
"learning_rate": 4.572542087542088e-05,
"loss": 0.4181,
"step": 50800
},
{
"epoch": 0.08585858585858586,
"grad_norm": 0.32413387298583984,
"learning_rate": 4.5708585858585864e-05,
"loss": 0.4643,
"step": 51000
},
{
"epoch": 0.0861952861952862,
"grad_norm": 0.18156534433364868,
"learning_rate": 4.569175084175085e-05,
"loss": 0.4511,
"step": 51200
},
{
"epoch": 0.08653198653198653,
"grad_norm": 0.298673152923584,
"learning_rate": 4.5674915824915826e-05,
"loss": 0.4424,
"step": 51400
},
{
"epoch": 0.08686868686868687,
"grad_norm": 0.24452580511569977,
"learning_rate": 4.565808080808081e-05,
"loss": 0.4556,
"step": 51600
},
{
"epoch": 0.0872053872053872,
"grad_norm": 0.2959561049938202,
"learning_rate": 4.564132996632997e-05,
"loss": 0.4543,
"step": 51800
},
{
"epoch": 0.08754208754208755,
"grad_norm": 0.6213822960853577,
"learning_rate": 4.5624579124579125e-05,
"loss": 0.5136,
"step": 52000
},
{
"epoch": 0.08787878787878788,
"grad_norm": 0.5385012030601501,
"learning_rate": 4.560774410774411e-05,
"loss": 0.4602,
"step": 52200
},
{
"epoch": 0.08821548821548822,
"grad_norm": 0.6063356995582581,
"learning_rate": 4.5590909090909094e-05,
"loss": 0.4367,
"step": 52400
},
{
"epoch": 0.08855218855218855,
"grad_norm": 0.43720120191574097,
"learning_rate": 4.557407407407407e-05,
"loss": 0.4902,
"step": 52600
},
{
"epoch": 0.08888888888888889,
"grad_norm": 0.4334559738636017,
"learning_rate": 4.555723905723906e-05,
"loss": 0.4585,
"step": 52800
},
{
"epoch": 0.08922558922558922,
"grad_norm": 0.2874049246311188,
"learning_rate": 4.554040404040405e-05,
"loss": 0.4907,
"step": 53000
},
{
"epoch": 0.08956228956228957,
"grad_norm": 0.19016990065574646,
"learning_rate": 4.5523569023569025e-05,
"loss": 0.4561,
"step": 53200
},
{
"epoch": 0.0898989898989899,
"grad_norm": 0.7278497815132141,
"learning_rate": 4.550673400673401e-05,
"loss": 0.4702,
"step": 53400
},
{
"epoch": 0.09023569023569024,
"grad_norm": 0.28533700108528137,
"learning_rate": 4.5489898989898993e-05,
"loss": 0.5139,
"step": 53600
},
{
"epoch": 0.09057239057239057,
"grad_norm": 0.6488041281700134,
"learning_rate": 4.547306397306397e-05,
"loss": 0.4553,
"step": 53800
},
{
"epoch": 0.09090909090909091,
"grad_norm": 0.3091227412223816,
"learning_rate": 4.5456228956228955e-05,
"loss": 0.4779,
"step": 54000
},
{
"epoch": 0.09124579124579124,
"grad_norm": 0.3282964825630188,
"learning_rate": 4.543939393939394e-05,
"loss": 0.5117,
"step": 54200
},
{
"epoch": 0.09158249158249158,
"grad_norm": 0.473143070936203,
"learning_rate": 4.5422558922558924e-05,
"loss": 0.4791,
"step": 54400
},
{
"epoch": 0.09191919191919191,
"grad_norm": 0.5263796448707581,
"learning_rate": 4.540572390572391e-05,
"loss": 0.4686,
"step": 54600
},
{
"epoch": 0.09225589225589226,
"grad_norm": 0.4568365216255188,
"learning_rate": 4.538888888888889e-05,
"loss": 0.4936,
"step": 54800
},
{
"epoch": 0.09259259259259259,
"grad_norm": 0.9846563935279846,
"learning_rate": 4.537205387205388e-05,
"loss": 0.4968,
"step": 55000
},
{
"epoch": 0.09292929292929293,
"grad_norm": 0.15145862102508545,
"learning_rate": 4.5355218855218855e-05,
"loss": 0.5163,
"step": 55200
},
{
"epoch": 0.09326599326599326,
"grad_norm": 0.428117573261261,
"learning_rate": 4.533838383838384e-05,
"loss": 0.4646,
"step": 55400
},
{
"epoch": 0.0936026936026936,
"grad_norm": 0.3261561393737793,
"learning_rate": 4.5321548821548824e-05,
"loss": 0.4697,
"step": 55600
},
{
"epoch": 0.09393939393939393,
"grad_norm": 0.34254854917526245,
"learning_rate": 4.530471380471381e-05,
"loss": 0.4331,
"step": 55800
},
{
"epoch": 0.09427609427609428,
"grad_norm": 0.15681512653827667,
"learning_rate": 4.528787878787879e-05,
"loss": 0.4765,
"step": 56000
},
{
"epoch": 0.0946127946127946,
"grad_norm": 0.16159813106060028,
"learning_rate": 4.527104377104378e-05,
"loss": 0.4775,
"step": 56200
},
{
"epoch": 0.09494949494949495,
"grad_norm": 0.6212481260299683,
"learning_rate": 4.5254208754208754e-05,
"loss": 0.5314,
"step": 56400
},
{
"epoch": 0.09528619528619528,
"grad_norm": 0.3396393656730652,
"learning_rate": 4.523737373737374e-05,
"loss": 0.4898,
"step": 56600
},
{
"epoch": 0.09562289562289562,
"grad_norm": 0.32701626420021057,
"learning_rate": 4.5220622895622896e-05,
"loss": 0.4421,
"step": 56800
},
{
"epoch": 0.09595959595959595,
"grad_norm": 0.15523914992809296,
"learning_rate": 4.520378787878788e-05,
"loss": 0.4415,
"step": 57000
},
{
"epoch": 0.0962962962962963,
"grad_norm": 0.5103595852851868,
"learning_rate": 4.5186952861952865e-05,
"loss": 0.453,
"step": 57200
},
{
"epoch": 0.09663299663299663,
"grad_norm": 0.42163121700286865,
"learning_rate": 4.517011784511785e-05,
"loss": 0.4738,
"step": 57400
},
{
"epoch": 0.09696969696969697,
"grad_norm": 0.9396620392799377,
"learning_rate": 4.515328282828283e-05,
"loss": 0.473,
"step": 57600
},
{
"epoch": 0.09730639730639731,
"grad_norm": 0.4714924991130829,
"learning_rate": 4.513644781144781e-05,
"loss": 0.5031,
"step": 57800
},
{
"epoch": 0.09764309764309764,
"grad_norm": 0.24718382954597473,
"learning_rate": 4.5119612794612796e-05,
"loss": 0.4592,
"step": 58000
},
{
"epoch": 0.09797979797979799,
"grad_norm": 0.3186817169189453,
"learning_rate": 4.510277777777778e-05,
"loss": 0.4451,
"step": 58200
},
{
"epoch": 0.09831649831649832,
"grad_norm": 0.34213390946388245,
"learning_rate": 4.5085942760942764e-05,
"loss": 0.4646,
"step": 58400
},
{
"epoch": 0.09865319865319866,
"grad_norm": 0.29326021671295166,
"learning_rate": 4.506910774410775e-05,
"loss": 0.4825,
"step": 58600
},
{
"epoch": 0.09898989898989899,
"grad_norm": 0.8425318598747253,
"learning_rate": 4.505227272727273e-05,
"loss": 0.4899,
"step": 58800
},
{
"epoch": 0.09932659932659933,
"grad_norm": 0.23540657758712769,
"learning_rate": 4.503543771043771e-05,
"loss": 0.4654,
"step": 59000
},
{
"epoch": 0.09966329966329966,
"grad_norm": 0.44379663467407227,
"learning_rate": 4.5018602693602695e-05,
"loss": 0.4722,
"step": 59200
},
{
"epoch": 0.1,
"grad_norm": 0.23975303769111633,
"learning_rate": 4.500176767676768e-05,
"loss": 0.4589,
"step": 59400
},
{
"epoch": 0.10033670033670034,
"grad_norm": 0.29341402649879456,
"learning_rate": 4.498501683501684e-05,
"loss": 0.5041,
"step": 59600
},
{
"epoch": 0.10067340067340068,
"grad_norm": 0.34747016429901123,
"learning_rate": 4.496818181818182e-05,
"loss": 0.4647,
"step": 59800
},
{
"epoch": 0.10101010101010101,
"grad_norm": 0.8880186080932617,
"learning_rate": 4.4951346801346806e-05,
"loss": 0.4497,
"step": 60000
},
{
"epoch": 0.10134680134680135,
"grad_norm": 0.6821927428245544,
"learning_rate": 4.4934511784511783e-05,
"loss": 0.4716,
"step": 60200
},
{
"epoch": 0.10168350168350168,
"grad_norm": 0.4481610953807831,
"learning_rate": 4.491767676767677e-05,
"loss": 0.4576,
"step": 60400
},
{
"epoch": 0.10202020202020202,
"grad_norm": 0.6106315851211548,
"learning_rate": 4.490084175084175e-05,
"loss": 0.4386,
"step": 60600
},
{
"epoch": 0.10235690235690235,
"grad_norm": 0.3058004081249237,
"learning_rate": 4.4884006734006737e-05,
"loss": 0.4914,
"step": 60800
},
{
"epoch": 0.1026936026936027,
"grad_norm": 0.4685909152030945,
"learning_rate": 4.486717171717172e-05,
"loss": 0.4707,
"step": 61000
},
{
"epoch": 0.10303030303030303,
"grad_norm": 0.643690288066864,
"learning_rate": 4.4850336700336705e-05,
"loss": 0.5428,
"step": 61200
},
{
"epoch": 0.10336700336700337,
"grad_norm": 0.44329872727394104,
"learning_rate": 4.483350168350168e-05,
"loss": 0.4279,
"step": 61400
},
{
"epoch": 0.1037037037037037,
"grad_norm": 0.21025417745113373,
"learning_rate": 4.481666666666667e-05,
"loss": 0.458,
"step": 61600
},
{
"epoch": 0.10404040404040404,
"grad_norm": 0.24184706807136536,
"learning_rate": 4.479983164983165e-05,
"loss": 0.4964,
"step": 61800
},
{
"epoch": 0.10437710437710437,
"grad_norm": 0.4054918587207794,
"learning_rate": 4.4782996632996636e-05,
"loss": 0.4427,
"step": 62000
},
{
"epoch": 0.10471380471380472,
"grad_norm": 0.4844823181629181,
"learning_rate": 4.476616161616162e-05,
"loss": 0.4529,
"step": 62200
},
{
"epoch": 0.10505050505050505,
"grad_norm": 0.39470815658569336,
"learning_rate": 4.4749326599326605e-05,
"loss": 0.4441,
"step": 62400
},
{
"epoch": 0.10538720538720539,
"grad_norm": 0.47343677282333374,
"learning_rate": 4.473249158249159e-05,
"loss": 0.4253,
"step": 62600
},
{
"epoch": 0.10572390572390572,
"grad_norm": 0.21908357739448547,
"learning_rate": 4.471565656565657e-05,
"loss": 0.4445,
"step": 62800
},
{
"epoch": 0.10606060606060606,
"grad_norm": 0.4983006715774536,
"learning_rate": 4.469882154882155e-05,
"loss": 0.4567,
"step": 63000
},
{
"epoch": 0.1063973063973064,
"grad_norm": 0.4615258276462555,
"learning_rate": 4.4681986531986536e-05,
"loss": 0.489,
"step": 63200
},
{
"epoch": 0.10673400673400674,
"grad_norm": 0.30304470658302307,
"learning_rate": 4.466523569023569e-05,
"loss": 0.4748,
"step": 63400
},
{
"epoch": 0.10707070707070707,
"grad_norm": 0.1221388503909111,
"learning_rate": 4.464840067340068e-05,
"loss": 0.4529,
"step": 63600
},
{
"epoch": 0.10740740740740741,
"grad_norm": 0.36816734075546265,
"learning_rate": 4.463156565656566e-05,
"loss": 0.4518,
"step": 63800
},
{
"epoch": 0.10774410774410774,
"grad_norm": 0.13719257712364197,
"learning_rate": 4.461473063973064e-05,
"loss": 0.4314,
"step": 64000
},
{
"epoch": 0.10808080808080808,
"grad_norm": 0.45440635085105896,
"learning_rate": 4.4597895622895624e-05,
"loss": 0.5005,
"step": 64200
},
{
"epoch": 0.10841750841750841,
"grad_norm": 0.4109625816345215,
"learning_rate": 4.458106060606061e-05,
"loss": 0.4847,
"step": 64400
},
{
"epoch": 0.10875420875420876,
"grad_norm": 0.43106183409690857,
"learning_rate": 4.4564225589225586e-05,
"loss": 0.4607,
"step": 64600
},
{
"epoch": 0.10909090909090909,
"grad_norm": 0.3796352744102478,
"learning_rate": 4.454739057239058e-05,
"loss": 0.4793,
"step": 64800
},
{
"epoch": 0.10942760942760943,
"grad_norm": 0.5014599561691284,
"learning_rate": 4.453055555555556e-05,
"loss": 0.4848,
"step": 65000
},
{
"epoch": 0.10976430976430976,
"grad_norm": 0.3484991192817688,
"learning_rate": 4.4513720538720546e-05,
"loss": 0.5129,
"step": 65200
},
{
"epoch": 0.1101010101010101,
"grad_norm": 0.2991756200790405,
"learning_rate": 4.449688552188552e-05,
"loss": 0.4927,
"step": 65400
},
{
"epoch": 0.11043771043771043,
"grad_norm": 0.37985363602638245,
"learning_rate": 4.448005050505051e-05,
"loss": 0.4372,
"step": 65600
},
{
"epoch": 0.11077441077441078,
"grad_norm": 0.4508950114250183,
"learning_rate": 4.446321548821549e-05,
"loss": 0.4182,
"step": 65800
},
{
"epoch": 0.1111111111111111,
"grad_norm": 0.7306589484214783,
"learning_rate": 4.444646464646465e-05,
"loss": 0.5217,
"step": 66000
},
{
"epoch": 0.11144781144781145,
"grad_norm": 0.17345421016216278,
"learning_rate": 4.4429629629629634e-05,
"loss": 0.4653,
"step": 66200
},
{
"epoch": 0.11178451178451178,
"grad_norm": 0.36997854709625244,
"learning_rate": 4.441279461279462e-05,
"loss": 0.4631,
"step": 66400
},
{
"epoch": 0.11212121212121212,
"grad_norm": 0.31563735008239746,
"learning_rate": 4.4395959595959596e-05,
"loss": 0.4606,
"step": 66600
},
{
"epoch": 0.11245791245791245,
"grad_norm": 0.23173430562019348,
"learning_rate": 4.437912457912458e-05,
"loss": 0.4868,
"step": 66800
},
{
"epoch": 0.1127946127946128,
"grad_norm": 0.336233526468277,
"learning_rate": 4.4362289562289565e-05,
"loss": 0.4809,
"step": 67000
},
{
"epoch": 0.11313131313131314,
"grad_norm": 0.3722301423549652,
"learning_rate": 4.434545454545454e-05,
"loss": 0.4396,
"step": 67200
},
{
"epoch": 0.11346801346801347,
"grad_norm": 0.5491744875907898,
"learning_rate": 4.432861952861953e-05,
"loss": 0.3938,
"step": 67400
},
{
"epoch": 0.11380471380471381,
"grad_norm": 0.2742317020893097,
"learning_rate": 4.431178451178452e-05,
"loss": 0.4782,
"step": 67600
},
{
"epoch": 0.11414141414141414,
"grad_norm": 0.22197793424129486,
"learning_rate": 4.4294949494949495e-05,
"loss": 0.4233,
"step": 67800
},
{
"epoch": 0.11447811447811448,
"grad_norm": 0.2449079006910324,
"learning_rate": 4.427811447811448e-05,
"loss": 0.4889,
"step": 68000
},
{
"epoch": 0.11481481481481481,
"grad_norm": 0.25308141112327576,
"learning_rate": 4.4261279461279464e-05,
"loss": 0.4275,
"step": 68200
},
{
"epoch": 0.11515151515151516,
"grad_norm": 0.22254426777362823,
"learning_rate": 4.424444444444444e-05,
"loss": 0.487,
"step": 68400
},
{
"epoch": 0.11548821548821549,
"grad_norm": 0.4978940784931183,
"learning_rate": 4.4227609427609426e-05,
"loss": 0.4709,
"step": 68600
},
{
"epoch": 0.11582491582491583,
"grad_norm": 0.2366330623626709,
"learning_rate": 4.421077441077441e-05,
"loss": 0.487,
"step": 68800
},
{
"epoch": 0.11616161616161616,
"grad_norm": 0.38192349672317505,
"learning_rate": 4.41939393939394e-05,
"loss": 0.4751,
"step": 69000
},
{
"epoch": 0.1164983164983165,
"grad_norm": 0.4711579382419586,
"learning_rate": 4.417710437710438e-05,
"loss": 0.4694,
"step": 69200
},
{
"epoch": 0.11683501683501683,
"grad_norm": 0.7776811122894287,
"learning_rate": 4.4160269360269364e-05,
"loss": 0.505,
"step": 69400
},
{
"epoch": 0.11717171717171718,
"grad_norm": 0.22125215828418732,
"learning_rate": 4.414343434343435e-05,
"loss": 0.4599,
"step": 69600
},
{
"epoch": 0.1175084175084175,
"grad_norm": 0.3384982943534851,
"learning_rate": 4.4126683501683505e-05,
"loss": 0.4825,
"step": 69800
},
{
"epoch": 0.11784511784511785,
"grad_norm": 0.35308724641799927,
"learning_rate": 4.410984848484849e-05,
"loss": 0.4698,
"step": 70000
},
{
"epoch": 0.11818181818181818,
"grad_norm": 0.3890261650085449,
"learning_rate": 4.4093013468013474e-05,
"loss": 0.4414,
"step": 70200
},
{
"epoch": 0.11851851851851852,
"grad_norm": 0.6729969382286072,
"learning_rate": 4.407617845117845e-05,
"loss": 0.5287,
"step": 70400
},
{
"epoch": 0.11885521885521885,
"grad_norm": 0.18775266408920288,
"learning_rate": 4.4059343434343436e-05,
"loss": 0.4809,
"step": 70600
},
{
"epoch": 0.1191919191919192,
"grad_norm": 0.13612866401672363,
"learning_rate": 4.404250841750842e-05,
"loss": 0.4591,
"step": 70800
},
{
"epoch": 0.11952861952861953,
"grad_norm": 0.25239524245262146,
"learning_rate": 4.40256734006734e-05,
"loss": 0.4514,
"step": 71000
},
{
"epoch": 0.11986531986531987,
"grad_norm": 0.5928908586502075,
"learning_rate": 4.400883838383838e-05,
"loss": 0.4437,
"step": 71200
},
{
"epoch": 0.1202020202020202,
"grad_norm": 0.3206656575202942,
"learning_rate": 4.3992003367003374e-05,
"loss": 0.4193,
"step": 71400
},
{
"epoch": 0.12053872053872054,
"grad_norm": 0.1375039964914322,
"learning_rate": 4.397516835016835e-05,
"loss": 0.4642,
"step": 71600
},
{
"epoch": 0.12087542087542087,
"grad_norm": 0.31986692547798157,
"learning_rate": 4.3958333333333336e-05,
"loss": 0.4771,
"step": 71800
},
{
"epoch": 0.12121212121212122,
"grad_norm": 0.3976145088672638,
"learning_rate": 4.394158249158249e-05,
"loss": 0.5199,
"step": 72000
},
{
"epoch": 0.12154882154882155,
"grad_norm": 0.5115092992782593,
"learning_rate": 4.392474747474747e-05,
"loss": 0.4639,
"step": 72200
},
{
"epoch": 0.12188552188552189,
"grad_norm": 0.6884472370147705,
"learning_rate": 4.390791245791246e-05,
"loss": 0.5004,
"step": 72400
},
{
"epoch": 0.12222222222222222,
"grad_norm": 0.35599565505981445,
"learning_rate": 4.3891077441077446e-05,
"loss": 0.4125,
"step": 72600
},
{
"epoch": 0.12255892255892256,
"grad_norm": 0.31085407733917236,
"learning_rate": 4.387424242424243e-05,
"loss": 0.4276,
"step": 72800
},
{
"epoch": 0.12289562289562289,
"grad_norm": 0.4173491299152374,
"learning_rate": 4.385740740740741e-05,
"loss": 0.5051,
"step": 73000
},
{
"epoch": 0.12323232323232323,
"grad_norm": 0.857441782951355,
"learning_rate": 4.384057239057239e-05,
"loss": 0.4494,
"step": 73200
},
{
"epoch": 0.12356902356902356,
"grad_norm": 0.2960607707500458,
"learning_rate": 4.382382154882155e-05,
"loss": 0.5233,
"step": 73400
},
{
"epoch": 0.12390572390572391,
"grad_norm": 0.38231462240219116,
"learning_rate": 4.3806986531986535e-05,
"loss": 0.5003,
"step": 73600
},
{
"epoch": 0.12424242424242424,
"grad_norm": 0.20170505344867706,
"learning_rate": 4.379015151515152e-05,
"loss": 0.4543,
"step": 73800
},
{
"epoch": 0.12457912457912458,
"grad_norm": 0.49870565533638,
"learning_rate": 4.37733164983165e-05,
"loss": 0.4931,
"step": 74000
},
{
"epoch": 0.12491582491582491,
"grad_norm": 0.21166172623634338,
"learning_rate": 4.375648148148148e-05,
"loss": 0.4956,
"step": 74200
},
{
"epoch": 0.12525252525252525,
"grad_norm": 1.7191145420074463,
"learning_rate": 4.3739646464646465e-05,
"loss": 0.5209,
"step": 74400
},
{
"epoch": 0.12558922558922558,
"grad_norm": 0.15442191064357758,
"learning_rate": 4.372281144781145e-05,
"loss": 0.4723,
"step": 74600
},
{
"epoch": 0.1259259259259259,
"grad_norm": 0.6088646650314331,
"learning_rate": 4.3705976430976434e-05,
"loss": 0.4756,
"step": 74800
},
{
"epoch": 0.12626262626262627,
"grad_norm": 0.20357204973697662,
"learning_rate": 4.368914141414142e-05,
"loss": 0.4358,
"step": 75000
},
{
"epoch": 0.1265993265993266,
"grad_norm": 0.23374512791633606,
"learning_rate": 4.36723063973064e-05,
"loss": 0.4873,
"step": 75200
},
{
"epoch": 0.12693602693602693,
"grad_norm": 0.28036201000213623,
"learning_rate": 4.365547138047138e-05,
"loss": 0.4627,
"step": 75400
},
{
"epoch": 0.12727272727272726,
"grad_norm": 0.4876658320426941,
"learning_rate": 4.3638636363636365e-05,
"loss": 0.4865,
"step": 75600
},
{
"epoch": 0.12760942760942762,
"grad_norm": 0.24980993568897247,
"learning_rate": 4.362180134680135e-05,
"loss": 0.4715,
"step": 75800
},
{
"epoch": 0.12794612794612795,
"grad_norm": 0.5056689977645874,
"learning_rate": 4.3604966329966334e-05,
"loss": 0.5066,
"step": 76000
},
{
"epoch": 0.12828282828282828,
"grad_norm": 0.3562251329421997,
"learning_rate": 4.358813131313131e-05,
"loss": 0.4561,
"step": 76200
},
{
"epoch": 0.1286195286195286,
"grad_norm": 0.5188980102539062,
"learning_rate": 4.35712962962963e-05,
"loss": 0.4451,
"step": 76400
},
{
"epoch": 0.12895622895622896,
"grad_norm": 0.24979503452777863,
"learning_rate": 4.355446127946129e-05,
"loss": 0.4977,
"step": 76600
},
{
"epoch": 0.1292929292929293,
"grad_norm": 0.2918744385242462,
"learning_rate": 4.3537626262626264e-05,
"loss": 0.4465,
"step": 76800
},
{
"epoch": 0.12962962962962962,
"grad_norm": 0.11484523117542267,
"learning_rate": 4.352079124579125e-05,
"loss": 0.5008,
"step": 77000
},
{
"epoch": 0.12996632996632998,
"grad_norm": 0.13835379481315613,
"learning_rate": 4.350395622895623e-05,
"loss": 0.4469,
"step": 77200
},
{
"epoch": 0.1303030303030303,
"grad_norm": 0.17499831318855286,
"learning_rate": 4.348712121212121e-05,
"loss": 0.4558,
"step": 77400
},
{
"epoch": 0.13063973063973064,
"grad_norm": 0.14153020083904266,
"learning_rate": 4.3470286195286195e-05,
"loss": 0.4764,
"step": 77600
},
{
"epoch": 0.13097643097643097,
"grad_norm": 0.32110026478767395,
"learning_rate": 4.345353535353536e-05,
"loss": 0.4496,
"step": 77800
},
{
"epoch": 0.13131313131313133,
"grad_norm": 0.347741961479187,
"learning_rate": 4.343670033670034e-05,
"loss": 0.4365,
"step": 78000
},
{
"epoch": 0.13164983164983166,
"grad_norm": 1.8142364025115967,
"learning_rate": 4.341986531986532e-05,
"loss": 0.4963,
"step": 78200
},
{
"epoch": 0.13198653198653199,
"grad_norm": 0.18966235220432281,
"learning_rate": 4.3403030303030306e-05,
"loss": 0.4641,
"step": 78400
},
{
"epoch": 0.13232323232323231,
"grad_norm": 0.7899078726768494,
"learning_rate": 4.338619528619528e-05,
"loss": 0.42,
"step": 78600
},
{
"epoch": 0.13265993265993267,
"grad_norm": 0.11921744793653488,
"learning_rate": 4.336936026936027e-05,
"loss": 0.4489,
"step": 78800
},
{
"epoch": 0.132996632996633,
"grad_norm": 0.3183203935623169,
"learning_rate": 4.335252525252526e-05,
"loss": 0.4128,
"step": 79000
},
{
"epoch": 0.13333333333333333,
"grad_norm": 0.9126468896865845,
"learning_rate": 4.3335690235690236e-05,
"loss": 0.4063,
"step": 79200
},
{
"epoch": 0.13367003367003366,
"grad_norm": 0.30544015765190125,
"learning_rate": 4.331885521885522e-05,
"loss": 0.4363,
"step": 79400
},
{
"epoch": 0.13400673400673402,
"grad_norm": 0.32495343685150146,
"learning_rate": 4.3302020202020205e-05,
"loss": 0.4311,
"step": 79600
},
{
"epoch": 0.13434343434343435,
"grad_norm": 0.5995136499404907,
"learning_rate": 4.328518518518519e-05,
"loss": 0.4304,
"step": 79800
},
{
"epoch": 0.13468013468013468,
"grad_norm": 0.09273191541433334,
"learning_rate": 4.326835016835017e-05,
"loss": 0.44,
"step": 80000
},
{
"epoch": 0.135016835016835,
"grad_norm": 0.5083215832710266,
"learning_rate": 4.325151515151515e-05,
"loss": 0.4484,
"step": 80200
},
{
"epoch": 0.13535353535353536,
"grad_norm": 0.4086732566356659,
"learning_rate": 4.3234680134680136e-05,
"loss": 0.4431,
"step": 80400
},
{
"epoch": 0.1356902356902357,
"grad_norm": 0.25825831294059753,
"learning_rate": 4.321784511784512e-05,
"loss": 0.4903,
"step": 80600
},
{
"epoch": 0.13602693602693602,
"grad_norm": 0.22938190400600433,
"learning_rate": 4.3201010101010105e-05,
"loss": 0.446,
"step": 80800
},
{
"epoch": 0.13636363636363635,
"grad_norm": 4.104939937591553,
"learning_rate": 4.318417508417509e-05,
"loss": 0.5158,
"step": 81000
},
{
"epoch": 0.1367003367003367,
"grad_norm": 0.6533791422843933,
"learning_rate": 4.3167340067340067e-05,
"loss": 0.4411,
"step": 81200
},
{
"epoch": 0.13703703703703704,
"grad_norm": 0.3710763156414032,
"learning_rate": 4.315050505050505e-05,
"loss": 0.4504,
"step": 81400
},
{
"epoch": 0.13737373737373737,
"grad_norm": 0.32174888253211975,
"learning_rate": 4.3133670033670035e-05,
"loss": 0.4494,
"step": 81600
},
{
"epoch": 0.1377104377104377,
"grad_norm": 0.47536543011665344,
"learning_rate": 4.311683501683502e-05,
"loss": 0.4547,
"step": 81800
},
{
"epoch": 0.13804713804713806,
"grad_norm": 0.5591254830360413,
"learning_rate": 4.310008417508418e-05,
"loss": 0.4306,
"step": 82000
},
{
"epoch": 0.1383838383838384,
"grad_norm": 0.26102516055107117,
"learning_rate": 4.308324915824916e-05,
"loss": 0.4377,
"step": 82200
},
{
"epoch": 0.13872053872053872,
"grad_norm": 0.540073812007904,
"learning_rate": 4.306641414141414e-05,
"loss": 0.4913,
"step": 82400
},
{
"epoch": 0.13905723905723905,
"grad_norm": 0.8017529249191284,
"learning_rate": 4.3049579124579124e-05,
"loss": 0.4382,
"step": 82600
},
{
"epoch": 0.1393939393939394,
"grad_norm": 0.6620075106620789,
"learning_rate": 4.303274410774411e-05,
"loss": 0.3944,
"step": 82800
},
{
"epoch": 0.13973063973063973,
"grad_norm": 0.42346033453941345,
"learning_rate": 4.301590909090909e-05,
"loss": 0.4149,
"step": 83000
},
{
"epoch": 0.14006734006734006,
"grad_norm": 0.259355366230011,
"learning_rate": 4.299907407407408e-05,
"loss": 0.4737,
"step": 83200
},
{
"epoch": 0.1404040404040404,
"grad_norm": 0.10005613416433334,
"learning_rate": 4.298223905723906e-05,
"loss": 0.4642,
"step": 83400
},
{
"epoch": 0.14074074074074075,
"grad_norm": 0.40637290477752686,
"learning_rate": 4.2965404040404045e-05,
"loss": 0.4251,
"step": 83600
},
{
"epoch": 0.14107744107744108,
"grad_norm": 0.5488855838775635,
"learning_rate": 4.294856902356902e-05,
"loss": 0.4669,
"step": 83800
},
{
"epoch": 0.1414141414141414,
"grad_norm": 0.36019712686538696,
"learning_rate": 4.293173400673401e-05,
"loss": 0.4493,
"step": 84000
},
{
"epoch": 0.14175084175084174,
"grad_norm": 0.20330995321273804,
"learning_rate": 4.291498316498317e-05,
"loss": 0.4969,
"step": 84200
},
{
"epoch": 0.1420875420875421,
"grad_norm": 0.23681996762752533,
"learning_rate": 4.289814814814815e-05,
"loss": 0.4774,
"step": 84400
},
{
"epoch": 0.14242424242424243,
"grad_norm": 0.1740342527627945,
"learning_rate": 4.2881313131313134e-05,
"loss": 0.4915,
"step": 84600
},
{
"epoch": 0.14276094276094276,
"grad_norm": 0.4355227053165436,
"learning_rate": 4.286447811447812e-05,
"loss": 0.4681,
"step": 84800
},
{
"epoch": 0.14309764309764308,
"grad_norm": 0.295913964509964,
"learning_rate": 4.2847643097643096e-05,
"loss": 0.4818,
"step": 85000
},
{
"epoch": 0.14343434343434344,
"grad_norm": 0.17617417871952057,
"learning_rate": 4.283080808080808e-05,
"loss": 0.4661,
"step": 85200
},
{
"epoch": 0.14377104377104377,
"grad_norm": 0.36346206068992615,
"learning_rate": 4.2813973063973064e-05,
"loss": 0.456,
"step": 85400
},
{
"epoch": 0.1441077441077441,
"grad_norm": 0.5108135342597961,
"learning_rate": 4.279713804713805e-05,
"loss": 0.4311,
"step": 85600
},
{
"epoch": 0.14444444444444443,
"grad_norm": 0.6109139919281006,
"learning_rate": 4.278030303030303e-05,
"loss": 0.4398,
"step": 85800
},
{
"epoch": 0.1447811447811448,
"grad_norm": 0.35451897978782654,
"learning_rate": 4.276346801346802e-05,
"loss": 0.4674,
"step": 86000
},
{
"epoch": 0.14511784511784512,
"grad_norm": 0.4733004570007324,
"learning_rate": 4.2746632996632995e-05,
"loss": 0.4275,
"step": 86200
},
{
"epoch": 0.14545454545454545,
"grad_norm": 0.30051594972610474,
"learning_rate": 4.272979797979798e-05,
"loss": 0.4493,
"step": 86400
},
{
"epoch": 0.1457912457912458,
"grad_norm": 0.5513753294944763,
"learning_rate": 4.2712962962962964e-05,
"loss": 0.4803,
"step": 86600
},
{
"epoch": 0.14612794612794613,
"grad_norm": 0.5906115770339966,
"learning_rate": 4.269612794612795e-05,
"loss": 0.4502,
"step": 86800
},
{
"epoch": 0.14646464646464646,
"grad_norm": 0.662507176399231,
"learning_rate": 4.267929292929293e-05,
"loss": 0.4247,
"step": 87000
},
{
"epoch": 0.1468013468013468,
"grad_norm": 0.5167519450187683,
"learning_rate": 4.266254208754209e-05,
"loss": 0.477,
"step": 87200
},
{
"epoch": 0.14713804713804715,
"grad_norm": 0.3927953243255615,
"learning_rate": 4.2645707070707075e-05,
"loss": 0.461,
"step": 87400
},
{
"epoch": 0.14747474747474748,
"grad_norm": 0.3797866106033325,
"learning_rate": 4.262895622895623e-05,
"loss": 0.4923,
"step": 87600
},
{
"epoch": 0.1478114478114478,
"grad_norm": 0.3852689862251282,
"learning_rate": 4.2612121212121216e-05,
"loss": 0.4254,
"step": 87800
},
{
"epoch": 0.14814814814814814,
"grad_norm": 0.4846220016479492,
"learning_rate": 4.25952861952862e-05,
"loss": 0.4837,
"step": 88000
},
{
"epoch": 0.1484848484848485,
"grad_norm": 0.7036873698234558,
"learning_rate": 4.257845117845118e-05,
"loss": 0.4854,
"step": 88200
},
{
"epoch": 0.14882154882154883,
"grad_norm": 0.5932942032814026,
"learning_rate": 4.256161616161616e-05,
"loss": 0.4509,
"step": 88400
},
{
"epoch": 0.14915824915824916,
"grad_norm": 0.2926032841205597,
"learning_rate": 4.254478114478115e-05,
"loss": 0.4805,
"step": 88600
},
{
"epoch": 0.1494949494949495,
"grad_norm": 0.16851143538951874,
"learning_rate": 4.2527946127946125e-05,
"loss": 0.419,
"step": 88800
},
{
"epoch": 0.14983164983164984,
"grad_norm": 0.1768457293510437,
"learning_rate": 4.2511111111111116e-05,
"loss": 0.4432,
"step": 89000
},
{
"epoch": 0.15016835016835017,
"grad_norm": 1.1971328258514404,
"learning_rate": 4.24942760942761e-05,
"loss": 0.542,
"step": 89200
},
{
"epoch": 0.1505050505050505,
"grad_norm": 0.19443285465240479,
"learning_rate": 4.247744107744108e-05,
"loss": 0.4485,
"step": 89400
},
{
"epoch": 0.15084175084175083,
"grad_norm": 0.2966189682483673,
"learning_rate": 4.246060606060606e-05,
"loss": 0.4376,
"step": 89600
},
{
"epoch": 0.1511784511784512,
"grad_norm": 0.3715890645980835,
"learning_rate": 4.244385521885522e-05,
"loss": 0.4862,
"step": 89800
},
{
"epoch": 0.15151515151515152,
"grad_norm": 0.2697141468524933,
"learning_rate": 4.2427020202020204e-05,
"loss": 0.4746,
"step": 90000
}
],
"logging_steps": 200,
"max_steps": 594000,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 6.8214306373632e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}