{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.994675460468109,
"eval_steps": 500,
"global_step": 62000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016086222150727903,
"grad_norm": 0.5078127384185791,
"learning_rate": 4.959864230101023e-05,
"loss": 2.1432,
"step": 500
},
{
"epoch": 0.032172444301455806,
"grad_norm": 0.4508506655693054,
"learning_rate": 4.9196480277974395e-05,
"loss": 1.9093,
"step": 1000
},
{
"epoch": 0.048258666452183706,
"grad_norm": 0.4430558979511261,
"learning_rate": 4.879431825493855e-05,
"loss": 1.8418,
"step": 1500
},
{
"epoch": 0.06434488860291161,
"grad_norm": 0.4775325059890747,
"learning_rate": 4.8392156231902713e-05,
"loss": 1.7771,
"step": 2000
},
{
"epoch": 0.08043111075363951,
"grad_norm": 0.49685001373291016,
"learning_rate": 4.7989994208866876e-05,
"loss": 1.7226,
"step": 2500
},
{
"epoch": 0.09651733290436741,
"grad_norm": 0.5552434325218201,
"learning_rate": 4.7587832185831025e-05,
"loss": 1.6767,
"step": 3000
},
{
"epoch": 0.11260355505509531,
"grad_norm": 0.6779139637947083,
"learning_rate": 4.718567016279519e-05,
"loss": 1.6588,
"step": 3500
},
{
"epoch": 0.12868977720582322,
"grad_norm": 0.5552022457122803,
"learning_rate": 4.6783508139759344e-05,
"loss": 1.603,
"step": 4000
},
{
"epoch": 0.1447759993565511,
"grad_norm": 0.5302042365074158,
"learning_rate": 4.638134611672351e-05,
"loss": 1.5776,
"step": 4500
},
{
"epoch": 0.16086222150727902,
"grad_norm": 0.5810815691947937,
"learning_rate": 4.597918409368766e-05,
"loss": 1.5333,
"step": 5000
},
{
"epoch": 0.1769484436580069,
"grad_norm": 0.5819700956344604,
"learning_rate": 4.5577022070651826e-05,
"loss": 1.5168,
"step": 5500
},
{
"epoch": 0.19303466580873482,
"grad_norm": 0.6134072542190552,
"learning_rate": 4.517486004761599e-05,
"loss": 1.4748,
"step": 6000
},
{
"epoch": 0.2091208879594627,
"grad_norm": 0.5746152400970459,
"learning_rate": 4.4772698024580144e-05,
"loss": 1.4622,
"step": 6500
},
{
"epoch": 0.22520711011019062,
"grad_norm": 0.7663710713386536,
"learning_rate": 4.437053600154431e-05,
"loss": 1.4767,
"step": 7000
},
{
"epoch": 0.24129333226091854,
"grad_norm": 0.7993176579475403,
"learning_rate": 4.396837397850846e-05,
"loss": 1.4527,
"step": 7500
},
{
"epoch": 0.25737955441164645,
"grad_norm": 0.6892676949501038,
"learning_rate": 4.3566211955472626e-05,
"loss": 1.4325,
"step": 8000
},
{
"epoch": 0.2734657765623743,
"grad_norm": 0.6928556561470032,
"learning_rate": 4.316404993243678e-05,
"loss": 1.4038,
"step": 8500
},
{
"epoch": 0.2895519987131022,
"grad_norm": 0.7578593492507935,
"learning_rate": 4.2761887909400944e-05,
"loss": 1.3945,
"step": 9000
},
{
"epoch": 0.30563822086383013,
"grad_norm": 0.7504703402519226,
"learning_rate": 4.23597258863651e-05,
"loss": 1.3644,
"step": 9500
},
{
"epoch": 0.32172444301455805,
"grad_norm": 0.8370710611343384,
"learning_rate": 4.1957563863329256e-05,
"loss": 1.3619,
"step": 10000
},
{
"epoch": 0.3378106651652859,
"grad_norm": 0.8501142263412476,
"learning_rate": 4.155540184029342e-05,
"loss": 1.3448,
"step": 10500
},
{
"epoch": 0.3538968873160138,
"grad_norm": 0.9001900553703308,
"learning_rate": 4.1153239817257575e-05,
"loss": 1.3004,
"step": 11000
},
{
"epoch": 0.36998310946674173,
"grad_norm": 1.0658681392669678,
"learning_rate": 4.075107779422174e-05,
"loss": 1.2789,
"step": 11500
},
{
"epoch": 0.38606933161746965,
"grad_norm": 1.1038371324539185,
"learning_rate": 4.0348915771185894e-05,
"loss": 1.2651,
"step": 12000
},
{
"epoch": 0.40215555376819756,
"grad_norm": 1.2004213333129883,
"learning_rate": 3.994755807219613e-05,
"loss": 1.2216,
"step": 12500
},
{
"epoch": 0.4182417759189254,
"grad_norm": 1.235543966293335,
"learning_rate": 3.9545396049160286e-05,
"loss": 1.1955,
"step": 13000
},
{
"epoch": 0.43432799806965333,
"grad_norm": 1.5088828802108765,
"learning_rate": 3.914323402612445e-05,
"loss": 1.1836,
"step": 13500
},
{
"epoch": 0.45041422022038125,
"grad_norm": 1.264153242111206,
"learning_rate": 3.8741072003088605e-05,
"loss": 1.1658,
"step": 14000
},
{
"epoch": 0.46650044237110916,
"grad_norm": 1.3023343086242676,
"learning_rate": 3.833971430409884e-05,
"loss": 1.1481,
"step": 14500
},
{
"epoch": 0.48258666452183707,
"grad_norm": 1.3824670314788818,
"learning_rate": 3.7938356605109064e-05,
"loss": 1.1221,
"step": 15000
},
{
"epoch": 0.49867288667256493,
"grad_norm": 1.4364969730377197,
"learning_rate": 3.75369989061193e-05,
"loss": 1.1057,
"step": 15500
},
{
"epoch": 0.5147591088232929,
"grad_norm": 2.051701545715332,
"learning_rate": 3.7134836883083456e-05,
"loss": 1.0873,
"step": 16000
},
{
"epoch": 0.5308453309740208,
"grad_norm": 1.4329720735549927,
"learning_rate": 3.673267486004762e-05,
"loss": 1.0607,
"step": 16500
},
{
"epoch": 0.5469315531247486,
"grad_norm": 1.4981014728546143,
"learning_rate": 3.6330512837011775e-05,
"loss": 1.0516,
"step": 17000
},
{
"epoch": 0.5630177752754766,
"grad_norm": 1.3012079000473022,
"learning_rate": 3.592835081397594e-05,
"loss": 1.0317,
"step": 17500
},
{
"epoch": 0.5791039974262044,
"grad_norm": 1.401825189590454,
"learning_rate": 3.552699311498617e-05,
"loss": 1.0183,
"step": 18000
},
{
"epoch": 0.5951902195769324,
"grad_norm": 2.0783369541168213,
"learning_rate": 3.512483109195033e-05,
"loss": 0.9985,
"step": 18500
},
{
"epoch": 0.6112764417276603,
"grad_norm": 2.3940794467926025,
"learning_rate": 3.4722669068914486e-05,
"loss": 0.9698,
"step": 19000
},
{
"epoch": 0.6273626638783881,
"grad_norm": 1.4747998714447021,
"learning_rate": 3.432050704587865e-05,
"loss": 0.9657,
"step": 19500
},
{
"epoch": 0.6434488860291161,
"grad_norm": 3.0782012939453125,
"learning_rate": 3.391914934688888e-05,
"loss": 0.9379,
"step": 20000
},
{
"epoch": 0.659535108179844,
"grad_norm": 2.4914307594299316,
"learning_rate": 3.3516987323853034e-05,
"loss": 0.915,
"step": 20500
},
{
"epoch": 0.6756213303305718,
"grad_norm": 2.772120237350464,
"learning_rate": 3.3115629624863264e-05,
"loss": 0.9047,
"step": 21000
},
{
"epoch": 0.6917075524812998,
"grad_norm": 2.519575595855713,
"learning_rate": 3.271346760182743e-05,
"loss": 0.8688,
"step": 21500
},
{
"epoch": 0.7077937746320276,
"grad_norm": 4.085098743438721,
"learning_rate": 3.231130557879158e-05,
"loss": 0.8581,
"step": 22000
},
{
"epoch": 0.7238799967827556,
"grad_norm": 1.4670002460479736,
"learning_rate": 3.1909143555755745e-05,
"loss": 0.8354,
"step": 22500
},
{
"epoch": 0.7399662189334835,
"grad_norm": 2.4749488830566406,
"learning_rate": 3.1507785856765975e-05,
"loss": 0.8108,
"step": 23000
},
{
"epoch": 0.7560524410842113,
"grad_norm": 1.8635029792785645,
"learning_rate": 3.110562383373014e-05,
"loss": 0.7773,
"step": 23500
},
{
"epoch": 0.7721386632349393,
"grad_norm": 3.5713748931884766,
"learning_rate": 3.0703461810694294e-05,
"loss": 0.756,
"step": 24000
},
{
"epoch": 0.7882248853856672,
"grad_norm": 1.8903526067733765,
"learning_rate": 3.0301299787658456e-05,
"loss": 0.7326,
"step": 24500
},
{
"epoch": 0.8043111075363951,
"grad_norm": 8.286703109741211,
"learning_rate": 2.9899942088668686e-05,
"loss": 0.6948,
"step": 25000
},
{
"epoch": 0.820397329687123,
"grad_norm": 2.2209272384643555,
"learning_rate": 2.9497780065632845e-05,
"loss": 0.6914,
"step": 25500
},
{
"epoch": 0.8364835518378508,
"grad_norm": 2.2284536361694336,
"learning_rate": 2.9095618042597e-05,
"loss": 0.6585,
"step": 26000
},
{
"epoch": 0.8525697739885788,
"grad_norm": 3.4615938663482666,
"learning_rate": 2.869345601956116e-05,
"loss": 0.633,
"step": 26500
},
{
"epoch": 0.8686559961393067,
"grad_norm": 3.1158838272094727,
"learning_rate": 2.829209832057139e-05,
"loss": 0.6181,
"step": 27000
},
{
"epoch": 0.8847422182900346,
"grad_norm": 2.3320417404174805,
"learning_rate": 2.7889936297535553e-05,
"loss": 0.5993,
"step": 27500
},
{
"epoch": 0.9008284404407625,
"grad_norm": 1.8331427574157715,
"learning_rate": 2.7487774274499712e-05,
"loss": 0.5839,
"step": 28000
},
{
"epoch": 0.9169146625914903,
"grad_norm": 3.2398369312286377,
"learning_rate": 2.708561225146387e-05,
"loss": 0.562,
"step": 28500
},
{
"epoch": 0.9330008847422183,
"grad_norm": 1.6575061082839966,
"learning_rate": 2.66842545524741e-05,
"loss": 0.5313,
"step": 29000
},
{
"epoch": 0.9490871068929462,
"grad_norm": 2.1604230403900146,
"learning_rate": 2.6282092529438264e-05,
"loss": 0.5203,
"step": 29500
},
{
"epoch": 0.9651733290436741,
"grad_norm": 3.3743808269500732,
"learning_rate": 2.5879930506402423e-05,
"loss": 0.4938,
"step": 30000
},
{
"epoch": 0.981259551194402,
"grad_norm": 3.766514301300049,
"learning_rate": 2.5477768483366583e-05,
"loss": 0.4724,
"step": 30500
},
{
"epoch": 0.9973457733451299,
"grad_norm": 2.26712703704834,
"learning_rate": 2.5075606460330742e-05,
"loss": 0.4656,
"step": 31000
},
{
"epoch": 1.0,
"eval_loss": 0.26554691791534424,
"eval_runtime": 1917.4803,
"eval_samples_per_second": 345.81,
"eval_steps_per_second": 43.227,
"step": 31083
},
{
"epoch": 1.0134159092737072,
"grad_norm": 2.1041958332061768,
"learning_rate": 2.467424876134097e-05,
"loss": 0.4381,
"step": 31500
},
{
"epoch": 1.029502131424435,
"grad_norm": 1.7629106044769287,
"learning_rate": 2.427208673830513e-05,
"loss": 0.4298,
"step": 32000
},
{
"epoch": 1.0455883535751629,
"grad_norm": 2.5032904148101807,
"learning_rate": 2.386992471526929e-05,
"loss": 0.4188,
"step": 32500
},
{
"epoch": 1.0616745757258907,
"grad_norm": 1.6467881202697754,
"learning_rate": 2.3467762692233446e-05,
"loss": 0.3986,
"step": 33000
},
{
"epoch": 1.0777607978766186,
"grad_norm": 1.957220435142517,
"learning_rate": 2.3065600669197606e-05,
"loss": 0.382,
"step": 33500
},
{
"epoch": 1.0938470200273467,
"grad_norm": 1.6566946506500244,
"learning_rate": 2.2663438646161765e-05,
"loss": 0.3689,
"step": 34000
},
{
"epoch": 1.1099332421780745,
"grad_norm": 2.081613540649414,
"learning_rate": 2.2261276623125928e-05,
"loss": 0.3603,
"step": 34500
},
{
"epoch": 1.1260194643288024,
"grad_norm": 2.155226945877075,
"learning_rate": 2.1859918924136157e-05,
"loss": 0.3478,
"step": 35000
},
{
"epoch": 1.1421056864795303,
"grad_norm": 1.9459590911865234,
"learning_rate": 2.1457756901100317e-05,
"loss": 0.3315,
"step": 35500
},
{
"epoch": 1.1581919086302581,
"grad_norm": 2.3381567001342773,
"learning_rate": 2.1055594878064476e-05,
"loss": 0.3259,
"step": 36000
},
{
"epoch": 1.1742781307809862,
"grad_norm": 1.4302254915237427,
"learning_rate": 2.0653432855028635e-05,
"loss": 0.3168,
"step": 36500
},
{
"epoch": 1.190364352931714,
"grad_norm": 1.1770597696304321,
"learning_rate": 2.0251270831992795e-05,
"loss": 0.3082,
"step": 37000
},
{
"epoch": 1.206450575082442,
"grad_norm": 1.7475298643112183,
"learning_rate": 1.9849913133003024e-05,
"loss": 0.3014,
"step": 37500
},
{
"epoch": 1.2225367972331698,
"grad_norm": 1.2397468090057373,
"learning_rate": 1.9447751109967187e-05,
"loss": 0.288,
"step": 38000
},
{
"epoch": 1.2386230193838976,
"grad_norm": 1.6603740453720093,
"learning_rate": 1.9045589086931343e-05,
"loss": 0.2797,
"step": 38500
},
{
"epoch": 1.2547092415346257,
"grad_norm": 1.7009538412094116,
"learning_rate": 1.8643427063895502e-05,
"loss": 0.275,
"step": 39000
},
{
"epoch": 1.2707954636853536,
"grad_norm": 1.4941717386245728,
"learning_rate": 1.8241265040859662e-05,
"loss": 0.2623,
"step": 39500
},
{
"epoch": 1.2868816858360814,
"grad_norm": 1.941115140914917,
"learning_rate": 1.7839907341869895e-05,
"loss": 0.2572,
"step": 40000
},
{
"epoch": 1.3029679079868093,
"grad_norm": 1.487726092338562,
"learning_rate": 1.7437745318834054e-05,
"loss": 0.2502,
"step": 40500
},
{
"epoch": 1.3190541301375371,
"grad_norm": 1.4628674983978271,
"learning_rate": 1.7035583295798213e-05,
"loss": 0.2437,
"step": 41000
},
{
"epoch": 1.3351403522882652,
"grad_norm": 1.401607632637024,
"learning_rate": 1.663342127276237e-05,
"loss": 0.2421,
"step": 41500
},
{
"epoch": 1.351226574438993,
"grad_norm": 1.1497563123703003,
"learning_rate": 1.623125924972653e-05,
"loss": 0.231,
"step": 42000
},
{
"epoch": 1.367312796589721,
"grad_norm": 1.322836995124817,
"learning_rate": 1.5829097226690688e-05,
"loss": 0.2261,
"step": 42500
},
{
"epoch": 1.3833990187404488,
"grad_norm": 1.5328525304794312,
"learning_rate": 1.542773952770092e-05,
"loss": 0.2177,
"step": 43000
},
{
"epoch": 1.3994852408911767,
"grad_norm": 1.7748241424560547,
"learning_rate": 1.502557750466508e-05,
"loss": 0.2186,
"step": 43500
},
{
"epoch": 1.4155714630419047,
"grad_norm": 1.6542141437530518,
"learning_rate": 1.4623415481629241e-05,
"loss": 0.2138,
"step": 44000
},
{
"epoch": 1.4316576851926326,
"grad_norm": 1.3098843097686768,
"learning_rate": 1.4221253458593397e-05,
"loss": 0.211,
"step": 44500
},
{
"epoch": 1.4477439073433604,
"grad_norm": 1.345651626586914,
"learning_rate": 1.3819091435557557e-05,
"loss": 0.2027,
"step": 45000
},
{
"epoch": 1.4638301294940883,
"grad_norm": 1.4520297050476074,
"learning_rate": 1.3416929412521718e-05,
"loss": 0.2039,
"step": 45500
},
{
"epoch": 1.4799163516448162,
"grad_norm": 1.5913499593734741,
"learning_rate": 1.3014767389485877e-05,
"loss": 0.1939,
"step": 46000
},
{
"epoch": 1.4960025737955442,
"grad_norm": 1.1803226470947266,
"learning_rate": 1.2612605366450037e-05,
"loss": 0.1887,
"step": 46500
},
{
"epoch": 1.5120887959462719,
"grad_norm": 1.1462236642837524,
"learning_rate": 1.2210443343414194e-05,
"loss": 0.1883,
"step": 47000
},
{
"epoch": 1.528175018097,
"grad_norm": 0.8483968377113342,
"learning_rate": 1.1808281320378355e-05,
"loss": 0.1809,
"step": 47500
},
{
"epoch": 1.5442612402477278,
"grad_norm": 1.1205823421478271,
"learning_rate": 1.1406119297342515e-05,
"loss": 0.1813,
"step": 48000
},
{
"epoch": 1.5603474623984557,
"grad_norm": 1.417622447013855,
"learning_rate": 1.1003957274306672e-05,
"loss": 0.1788,
"step": 48500
},
{
"epoch": 1.5764336845491838,
"grad_norm": 1.179103970527649,
"learning_rate": 1.0602599575316904e-05,
"loss": 0.1809,
"step": 49000
},
{
"epoch": 1.5925199066999114,
"grad_norm": 1.1092889308929443,
"learning_rate": 1.0200437552281065e-05,
"loss": 0.1734,
"step": 49500
},
{
"epoch": 1.6086061288506395,
"grad_norm": 1.0196574926376343,
"learning_rate": 9.798275529245222e-06,
"loss": 0.1688,
"step": 50000
},
{
"epoch": 1.6246923510013673,
"grad_norm": 1.1376862525939941,
"learning_rate": 9.396113506209382e-06,
"loss": 0.1703,
"step": 50500
},
{
"epoch": 1.6407785731520952,
"grad_norm": 0.8885149955749512,
"learning_rate": 8.995560131265685e-06,
"loss": 0.1691,
"step": 51000
},
{
"epoch": 1.6568647953028233,
"grad_norm": 1.2574944496154785,
"learning_rate": 8.593398108229844e-06,
"loss": 0.1615,
"step": 51500
},
{
"epoch": 1.672951017453551,
"grad_norm": 1.2620723247528076,
"learning_rate": 8.191236085194004e-06,
"loss": 0.1593,
"step": 52000
},
{
"epoch": 1.689037239604279,
"grad_norm": 1.551480770111084,
"learning_rate": 7.789074062158163e-06,
"loss": 0.1639,
"step": 52500
},
{
"epoch": 1.7051234617550068,
"grad_norm": 1.5938962697982788,
"learning_rate": 7.386912039122322e-06,
"loss": 0.1587,
"step": 53000
},
{
"epoch": 1.7212096839057347,
"grad_norm": 1.0503953695297241,
"learning_rate": 6.984750016086482e-06,
"loss": 0.1599,
"step": 53500
},
{
"epoch": 1.7372959060564628,
"grad_norm": 1.1205036640167236,
"learning_rate": 6.583392317096712e-06,
"loss": 0.1541,
"step": 54000
},
{
"epoch": 1.7533821282071904,
"grad_norm": 0.7524433732032776,
"learning_rate": 6.181230294060872e-06,
"loss": 0.1521,
"step": 54500
},
{
"epoch": 1.7694683503579185,
"grad_norm": 0.9619775414466858,
"learning_rate": 5.779068271025031e-06,
"loss": 0.1521,
"step": 55000
},
{
"epoch": 1.7855545725086464,
"grad_norm": 0.9406844973564148,
"learning_rate": 5.37690624798919e-06,
"loss": 0.1509,
"step": 55500
},
{
"epoch": 1.8016407946593742,
"grad_norm": 0.9363726377487183,
"learning_rate": 4.975548548999421e-06,
"loss": 0.1513,
"step": 56000
},
{
"epoch": 1.8177270168101023,
"grad_norm": 0.9941402673721313,
"learning_rate": 4.573386525963581e-06,
"loss": 0.1484,
"step": 56500
},
{
"epoch": 1.83381323896083,
"grad_norm": 1.3756345510482788,
"learning_rate": 4.17122450292774e-06,
"loss": 0.1509,
"step": 57000
},
{
"epoch": 1.849899461111558,
"grad_norm": 1.0644595623016357,
"learning_rate": 3.7690624798918986e-06,
"loss": 0.1486,
"step": 57500
},
{
"epoch": 1.8659856832622859,
"grad_norm": 1.070890188217163,
"learning_rate": 3.3669004568560584e-06,
"loss": 0.1462,
"step": 58000
},
{
"epoch": 1.8820719054130137,
"grad_norm": 1.3034768104553223,
"learning_rate": 2.9647384338202173e-06,
"loss": 0.1481,
"step": 58500
},
{
"epoch": 1.8981581275637418,
"grad_norm": 1.127517580986023,
"learning_rate": 2.5625764107843767e-06,
"loss": 0.1451,
"step": 59000
},
{
"epoch": 1.9142443497144694,
"grad_norm": 0.9431403279304504,
"learning_rate": 2.1604143877485364e-06,
"loss": 0.1458,
"step": 59500
},
{
"epoch": 1.9303305718651975,
"grad_norm": 1.271483302116394,
"learning_rate": 1.7590566887587673e-06,
"loss": 0.1463,
"step": 60000
},
{
"epoch": 1.9464167940159254,
"grad_norm": 0.7327952980995178,
"learning_rate": 1.3568946657229264e-06,
"loss": 0.1434,
"step": 60500
},
{
"epoch": 1.9625030161666532,
"grad_norm": 1.0670543909072876,
"learning_rate": 9.547326426870858e-07,
"loss": 0.1424,
"step": 61000
},
{
"epoch": 1.9785892383173813,
"grad_norm": 1.2705425024032593,
"learning_rate": 5.525706196512451e-07,
"loss": 0.1431,
"step": 61500
},
{
"epoch": 1.994675460468109,
"grad_norm": 0.9267213344573975,
"learning_rate": 1.5040859661540443e-07,
"loss": 0.1418,
"step": 62000
}
],
"logging_steps": 500,
"max_steps": 62164,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3425693542685082e+17,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}