phi4_fullrouter / trainer_state.json
gchaves-99's picture
Add all files
077d109
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.9983431239644522,
"eval_steps": 500,
"global_step": 4977,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.030125018828136767,
"grad_norm": 0.30684176087379456,
"learning_rate": 4.94976893711071e-05,
"loss": 1.2767,
"mean_token_accuracy": 0.7496056535840034,
"step": 50
},
{
"epoch": 0.060250037656273535,
"grad_norm": 0.25573843717575073,
"learning_rate": 4.899537874221419e-05,
"loss": 0.7198,
"mean_token_accuracy": 0.8821907821297645,
"step": 100
},
{
"epoch": 0.0903750564844103,
"grad_norm": 0.30719634890556335,
"learning_rate": 4.849306811332128e-05,
"loss": 0.7046,
"mean_token_accuracy": 0.886384769231081,
"step": 150
},
{
"epoch": 0.12050007531254707,
"grad_norm": 0.28959545493125916,
"learning_rate": 4.799075748442837e-05,
"loss": 0.7079,
"mean_token_accuracy": 0.885994749814272,
"step": 200
},
{
"epoch": 0.15062509414068384,
"grad_norm": 0.23001542687416077,
"learning_rate": 4.748844685553547e-05,
"loss": 0.6882,
"mean_token_accuracy": 0.8865358050167561,
"step": 250
},
{
"epoch": 0.1807501129688206,
"grad_norm": 0.28763362765312195,
"learning_rate": 4.6986136226642556e-05,
"loss": 0.6663,
"mean_token_accuracy": 0.8903915384411811,
"step": 300
},
{
"epoch": 0.21087513179695738,
"grad_norm": 0.2092406004667282,
"learning_rate": 4.648382559774965e-05,
"loss": 0.6441,
"mean_token_accuracy": 0.8928172151744366,
"step": 350
},
{
"epoch": 0.24100015062509414,
"grad_norm": 0.2575877606868744,
"learning_rate": 4.598151496885674e-05,
"loss": 0.6986,
"mean_token_accuracy": 0.8838813950121402,
"step": 400
},
{
"epoch": 0.2711251694532309,
"grad_norm": 0.25107163190841675,
"learning_rate": 4.547920433996384e-05,
"loss": 0.6658,
"mean_token_accuracy": 0.8893497291207314,
"step": 450
},
{
"epoch": 0.3012501882813677,
"grad_norm": 0.2437737137079239,
"learning_rate": 4.4976893711070926e-05,
"loss": 0.6684,
"mean_token_accuracy": 0.8893143194913864,
"step": 500
},
{
"epoch": 0.33137520710950447,
"grad_norm": 0.2163419872522354,
"learning_rate": 4.447458308217802e-05,
"loss": 0.7015,
"mean_token_accuracy": 0.8857556004822255,
"step": 550
},
{
"epoch": 0.3615002259376412,
"grad_norm": 0.39833882451057434,
"learning_rate": 4.397227245328511e-05,
"loss": 0.6729,
"mean_token_accuracy": 0.8865716621279717,
"step": 600
},
{
"epoch": 0.391625244765778,
"grad_norm": 0.3186735212802887,
"learning_rate": 4.3469961824392206e-05,
"loss": 0.6451,
"mean_token_accuracy": 0.892935143262148,
"step": 650
},
{
"epoch": 0.42175026359391476,
"grad_norm": 0.2682092487812042,
"learning_rate": 4.2967651195499295e-05,
"loss": 0.704,
"mean_token_accuracy": 0.8814965118467808,
"step": 700
},
{
"epoch": 0.4518752824220515,
"grad_norm": 0.32946068048477173,
"learning_rate": 4.246534056660639e-05,
"loss": 0.6565,
"mean_token_accuracy": 0.8922205206751823,
"step": 750
},
{
"epoch": 0.4820003012501883,
"grad_norm": 0.28554585576057434,
"learning_rate": 4.196302993771348e-05,
"loss": 0.6471,
"mean_token_accuracy": 0.8931228183209896,
"step": 800
},
{
"epoch": 0.512125320078325,
"grad_norm": 0.19599439203739166,
"learning_rate": 4.1460719308820575e-05,
"loss": 0.6864,
"mean_token_accuracy": 0.8856179165840149,
"step": 850
},
{
"epoch": 0.5422503389064618,
"grad_norm": 0.30608075857162476,
"learning_rate": 4.095840867992767e-05,
"loss": 0.6508,
"mean_token_accuracy": 0.8878815796971321,
"step": 900
},
{
"epoch": 0.5723753577345986,
"grad_norm": 0.254626989364624,
"learning_rate": 4.045609805103476e-05,
"loss": 0.6196,
"mean_token_accuracy": 0.8950139920413495,
"step": 950
},
{
"epoch": 0.6025003765627354,
"grad_norm": 0.42999160289764404,
"learning_rate": 3.9953787422141856e-05,
"loss": 0.6342,
"mean_token_accuracy": 0.8938413085043431,
"step": 1000
},
{
"epoch": 0.6326253953908721,
"grad_norm": 0.23657967150211334,
"learning_rate": 3.945147679324895e-05,
"loss": 0.6389,
"mean_token_accuracy": 0.894249224960804,
"step": 1050
},
{
"epoch": 0.6627504142190089,
"grad_norm": 0.3286744952201843,
"learning_rate": 3.894916616435604e-05,
"loss": 0.6349,
"mean_token_accuracy": 0.8949852520227433,
"step": 1100
},
{
"epoch": 0.6928754330471456,
"grad_norm": 0.3509972393512726,
"learning_rate": 3.8446855535463136e-05,
"loss": 0.6118,
"mean_token_accuracy": 0.8998459935188293,
"step": 1150
},
{
"epoch": 0.7230004518752824,
"grad_norm": 0.3571523129940033,
"learning_rate": 3.7944544906570225e-05,
"loss": 0.6381,
"mean_token_accuracy": 0.8936076226830483,
"step": 1200
},
{
"epoch": 0.7531254707034192,
"grad_norm": 0.3348468244075775,
"learning_rate": 3.744223427767732e-05,
"loss": 0.6522,
"mean_token_accuracy": 0.8882578992843628,
"step": 1250
},
{
"epoch": 0.783250489531556,
"grad_norm": 0.28266018629074097,
"learning_rate": 3.693992364878441e-05,
"loss": 0.6246,
"mean_token_accuracy": 0.8964510107040405,
"step": 1300
},
{
"epoch": 0.8133755083596927,
"grad_norm": 0.4280668795108795,
"learning_rate": 3.6437613019891505e-05,
"loss": 0.648,
"mean_token_accuracy": 0.8886529618501663,
"step": 1350
},
{
"epoch": 0.8435005271878295,
"grad_norm": 0.3760441839694977,
"learning_rate": 3.5935302390998594e-05,
"loss": 0.6051,
"mean_token_accuracy": 0.897853167951107,
"step": 1400
},
{
"epoch": 0.8736255460159663,
"grad_norm": 0.4479055106639862,
"learning_rate": 3.543299176210569e-05,
"loss": 0.5927,
"mean_token_accuracy": 0.9007090017199516,
"step": 1450
},
{
"epoch": 0.903750564844103,
"grad_norm": 0.28697535395622253,
"learning_rate": 3.493068113321278e-05,
"loss": 0.7065,
"mean_token_accuracy": 0.8828328484296799,
"step": 1500
},
{
"epoch": 0.9338755836722398,
"grad_norm": 0.2910836338996887,
"learning_rate": 3.4428370504319875e-05,
"loss": 0.672,
"mean_token_accuracy": 0.8896669654548168,
"step": 1550
},
{
"epoch": 0.9640006025003766,
"grad_norm": 0.39928898215293884,
"learning_rate": 3.3926059875426964e-05,
"loss": 0.6337,
"mean_token_accuracy": 0.89401711165905,
"step": 1600
},
{
"epoch": 0.9941256213285133,
"grad_norm": 0.23171083629131317,
"learning_rate": 3.342374924653406e-05,
"loss": 0.6376,
"mean_token_accuracy": 0.8934089505672455,
"step": 1650
},
{
"epoch": 1.0,
"eval_loss": 0.632087230682373,
"eval_mean_token_accuracy": 0.8758415237579056,
"eval_runtime": 77.2141,
"eval_samples_per_second": 19.116,
"eval_steps_per_second": 2.396,
"step": 1660
},
{
"epoch": 1.0241000150625095,
"grad_norm": 0.2895660102367401,
"learning_rate": 3.293148483021901e-05,
"loss": 0.6191,
"mean_token_accuracy": 0.8999803757295013,
"step": 1700
},
{
"epoch": 1.0542250338906463,
"grad_norm": 0.3923441171646118,
"learning_rate": 3.2429174201326105e-05,
"loss": 0.6278,
"mean_token_accuracy": 0.8943565684556961,
"step": 1750
},
{
"epoch": 1.084350052718783,
"grad_norm": 0.3033309876918793,
"learning_rate": 3.1926863572433193e-05,
"loss": 0.6631,
"mean_token_accuracy": 0.892311205714941,
"step": 1800
},
{
"epoch": 1.1144750715469196,
"grad_norm": 0.26226454973220825,
"learning_rate": 3.142455294354029e-05,
"loss": 0.6549,
"mean_token_accuracy": 0.8898773008584976,
"step": 1850
},
{
"epoch": 1.1446000903750564,
"grad_norm": 0.36343246698379517,
"learning_rate": 3.0922242314647385e-05,
"loss": 0.6303,
"mean_token_accuracy": 0.893774523884058,
"step": 1900
},
{
"epoch": 1.1747251092031932,
"grad_norm": 0.3890613615512848,
"learning_rate": 3.041993168575447e-05,
"loss": 0.609,
"mean_token_accuracy": 0.8982135467231274,
"step": 1950
},
{
"epoch": 1.20485012803133,
"grad_norm": 0.3525061011314392,
"learning_rate": 2.9917621056861566e-05,
"loss": 0.5923,
"mean_token_accuracy": 0.9009903834760189,
"step": 2000
},
{
"epoch": 1.2349751468594667,
"grad_norm": 0.45349597930908203,
"learning_rate": 2.9415310427968655e-05,
"loss": 0.6066,
"mean_token_accuracy": 0.9004408088326454,
"step": 2050
},
{
"epoch": 1.2651001656876035,
"grad_norm": 0.32030248641967773,
"learning_rate": 2.891299979907575e-05,
"loss": 0.6295,
"mean_token_accuracy": 0.8940686418116093,
"step": 2100
},
{
"epoch": 1.2952251845157403,
"grad_norm": 0.3644977807998657,
"learning_rate": 2.8410689170182843e-05,
"loss": 0.6594,
"mean_token_accuracy": 0.8896569818258285,
"step": 2150
},
{
"epoch": 1.325350203343877,
"grad_norm": 0.3809216022491455,
"learning_rate": 2.7908378541289935e-05,
"loss": 0.5941,
"mean_token_accuracy": 0.898374630510807,
"step": 2200
},
{
"epoch": 1.3554752221720139,
"grad_norm": 0.42949002981185913,
"learning_rate": 2.7406067912397028e-05,
"loss": 0.6087,
"mean_token_accuracy": 0.8992364549636841,
"step": 2250
},
{
"epoch": 1.3856002410001507,
"grad_norm": 0.47053080797195435,
"learning_rate": 2.6903757283504123e-05,
"loss": 0.5955,
"mean_token_accuracy": 0.8992098160088062,
"step": 2300
},
{
"epoch": 1.4157252598282875,
"grad_norm": 0.21600554883480072,
"learning_rate": 2.6401446654611212e-05,
"loss": 0.5848,
"mean_token_accuracy": 0.8993302121758461,
"step": 2350
},
{
"epoch": 1.4458502786564242,
"grad_norm": 0.3977588713169098,
"learning_rate": 2.5899136025718308e-05,
"loss": 0.5728,
"mean_token_accuracy": 0.9013423874974251,
"step": 2400
},
{
"epoch": 1.475975297484561,
"grad_norm": 0.3258291184902191,
"learning_rate": 2.5396825396825397e-05,
"loss": 0.5936,
"mean_token_accuracy": 0.9022154864668847,
"step": 2450
},
{
"epoch": 1.5061003163126978,
"grad_norm": 0.5135733485221863,
"learning_rate": 2.4894514767932493e-05,
"loss": 0.5831,
"mean_token_accuracy": 0.8982969619333744,
"step": 2500
},
{
"epoch": 1.5362253351408346,
"grad_norm": 0.4302254915237427,
"learning_rate": 2.4392204139039585e-05,
"loss": 0.5975,
"mean_token_accuracy": 0.8992070508003235,
"step": 2550
},
{
"epoch": 1.5663503539689714,
"grad_norm": 0.8697525858879089,
"learning_rate": 2.3889893510146677e-05,
"loss": 0.629,
"mean_token_accuracy": 0.8953581416606903,
"step": 2600
},
{
"epoch": 1.5964753727971082,
"grad_norm": 0.37328246235847473,
"learning_rate": 2.338758288125377e-05,
"loss": 0.5771,
"mean_token_accuracy": 0.9028278756141662,
"step": 2650
},
{
"epoch": 1.6266003916252447,
"grad_norm": 0.42918869853019714,
"learning_rate": 2.2885272252360862e-05,
"loss": 0.6585,
"mean_token_accuracy": 0.8911288838088512,
"step": 2700
},
{
"epoch": 1.6567254104533815,
"grad_norm": 0.39805442094802856,
"learning_rate": 2.2382961623467954e-05,
"loss": 0.5669,
"mean_token_accuracy": 0.9028179155290127,
"step": 2750
},
{
"epoch": 1.6868504292815183,
"grad_norm": 0.2861442565917969,
"learning_rate": 2.1880650994575047e-05,
"loss": 0.6342,
"mean_token_accuracy": 0.897926286906004,
"step": 2800
},
{
"epoch": 1.716975448109655,
"grad_norm": 0.36629295349121094,
"learning_rate": 2.137834036568214e-05,
"loss": 0.615,
"mean_token_accuracy": 0.8986844432353973,
"step": 2850
},
{
"epoch": 1.7471004669377919,
"grad_norm": 0.22408436238765717,
"learning_rate": 2.087602973678923e-05,
"loss": 0.6102,
"mean_token_accuracy": 0.8989599145203829,
"step": 2900
},
{
"epoch": 1.7772254857659286,
"grad_norm": 0.533674955368042,
"learning_rate": 2.0373719107896324e-05,
"loss": 0.6184,
"mean_token_accuracy": 0.8953662586212158,
"step": 2950
},
{
"epoch": 1.8073505045940652,
"grad_norm": 0.49870041012763977,
"learning_rate": 1.9871408479003416e-05,
"loss": 0.58,
"mean_token_accuracy": 0.8989548328518867,
"step": 3000
},
{
"epoch": 1.837475523422202,
"grad_norm": 0.5503713488578796,
"learning_rate": 1.936909785011051e-05,
"loss": 0.5872,
"mean_token_accuracy": 0.9030785009264946,
"step": 3050
},
{
"epoch": 1.8676005422503388,
"grad_norm": 0.2998668849468231,
"learning_rate": 1.88667872212176e-05,
"loss": 0.6009,
"mean_token_accuracy": 0.8998409834504127,
"step": 3100
},
{
"epoch": 1.8977255610784756,
"grad_norm": 0.43862882256507874,
"learning_rate": 1.8364476592324696e-05,
"loss": 0.5873,
"mean_token_accuracy": 0.9020068399608135,
"step": 3150
},
{
"epoch": 1.9278505799066123,
"grad_norm": 0.34258952736854553,
"learning_rate": 1.786216596343179e-05,
"loss": 0.6118,
"mean_token_accuracy": 0.8974671520292758,
"step": 3200
},
{
"epoch": 1.9579755987347491,
"grad_norm": 0.42319709062576294,
"learning_rate": 1.735985533453888e-05,
"loss": 0.6068,
"mean_token_accuracy": 0.8985735175013542,
"step": 3250
},
{
"epoch": 1.988100617562886,
"grad_norm": 0.4495251774787903,
"learning_rate": 1.6857544705645973e-05,
"loss": 0.5651,
"mean_token_accuracy": 0.9044052864611148,
"step": 3300
},
{
"epoch": 2.0,
"eval_loss": 0.6136223077774048,
"eval_mean_token_accuracy": 0.8889741445769374,
"eval_runtime": 77.9377,
"eval_samples_per_second": 18.938,
"eval_steps_per_second": 2.374,
"step": 3320
},
{
"epoch": 2.018075011296882,
"grad_norm": 0.4041847288608551,
"learning_rate": 1.6355234076753066e-05,
"loss": 0.6079,
"mean_token_accuracy": 0.8902337176104387,
"step": 3350
},
{
"epoch": 2.048200030125019,
"grad_norm": 0.34565427899360657,
"learning_rate": 1.5852923447860158e-05,
"loss": 0.552,
"mean_token_accuracy": 0.907501307874918,
"step": 3400
},
{
"epoch": 2.0783250489531557,
"grad_norm": 0.39507198333740234,
"learning_rate": 1.535061281896725e-05,
"loss": 0.5902,
"mean_token_accuracy": 0.8986614851653576,
"step": 3450
},
{
"epoch": 2.1084500677812925,
"grad_norm": 0.35742080211639404,
"learning_rate": 1.4848302190074343e-05,
"loss": 0.5765,
"mean_token_accuracy": 0.9004091265797615,
"step": 3500
},
{
"epoch": 2.1385750866094293,
"grad_norm": 0.4014514684677124,
"learning_rate": 1.4345991561181435e-05,
"loss": 0.6075,
"mean_token_accuracy": 0.8991987191140651,
"step": 3550
},
{
"epoch": 2.168700105437566,
"grad_norm": 0.2786984443664551,
"learning_rate": 1.3843680932288527e-05,
"loss": 0.5772,
"mean_token_accuracy": 0.9038310977816582,
"step": 3600
},
{
"epoch": 2.198825124265703,
"grad_norm": 0.4025174081325531,
"learning_rate": 1.334137030339562e-05,
"loss": 0.5891,
"mean_token_accuracy": 0.8991677206754685,
"step": 3650
},
{
"epoch": 2.228950143093839,
"grad_norm": 0.4690361022949219,
"learning_rate": 1.2839059674502712e-05,
"loss": 0.5261,
"mean_token_accuracy": 0.9109566512703896,
"step": 3700
},
{
"epoch": 2.2590751619219764,
"grad_norm": 0.3918741047382355,
"learning_rate": 1.2336749045609804e-05,
"loss": 0.5686,
"mean_token_accuracy": 0.9028687690198421,
"step": 3750
},
{
"epoch": 2.289200180750113,
"grad_norm": 0.4912905991077423,
"learning_rate": 1.1834438416716898e-05,
"loss": 0.5102,
"mean_token_accuracy": 0.9118883027136326,
"step": 3800
},
{
"epoch": 2.3193251995782496,
"grad_norm": 0.7354199886322021,
"learning_rate": 1.133212778782399e-05,
"loss": 0.5748,
"mean_token_accuracy": 0.90053293466568,
"step": 3850
},
{
"epoch": 2.3494502184063863,
"grad_norm": 0.49514544010162354,
"learning_rate": 1.0829817158931085e-05,
"loss": 0.5867,
"mean_token_accuracy": 0.9003237128257752,
"step": 3900
},
{
"epoch": 2.379575237234523,
"grad_norm": 0.5507615804672241,
"learning_rate": 1.0327506530038177e-05,
"loss": 0.5952,
"mean_token_accuracy": 0.9011906269192695,
"step": 3950
},
{
"epoch": 2.40970025606266,
"grad_norm": 0.9635323286056519,
"learning_rate": 9.82519590114527e-06,
"loss": 0.5871,
"mean_token_accuracy": 0.9019941617548466,
"step": 4000
},
{
"epoch": 2.4398252748907967,
"grad_norm": 0.306292325258255,
"learning_rate": 9.322885272252362e-06,
"loss": 0.5418,
"mean_token_accuracy": 0.907406060397625,
"step": 4050
},
{
"epoch": 2.4699502937189335,
"grad_norm": 0.36833733320236206,
"learning_rate": 8.820574643359454e-06,
"loss": 0.5374,
"mean_token_accuracy": 0.9102728597819805,
"step": 4100
},
{
"epoch": 2.5000753125470703,
"grad_norm": 0.4845290780067444,
"learning_rate": 8.318264014466546e-06,
"loss": 0.6115,
"mean_token_accuracy": 0.9012929057329893,
"step": 4150
},
{
"epoch": 2.530200331375207,
"grad_norm": 0.4215283691883087,
"learning_rate": 7.815953385573639e-06,
"loss": 0.5214,
"mean_token_accuracy": 0.909003015756607,
"step": 4200
},
{
"epoch": 2.560325350203344,
"grad_norm": 0.4454072415828705,
"learning_rate": 7.313642756680732e-06,
"loss": 0.6277,
"mean_token_accuracy": 0.8945660217106343,
"step": 4250
},
{
"epoch": 2.5904503690314806,
"grad_norm": 0.7070040106773376,
"learning_rate": 6.811332127787824e-06,
"loss": 0.5678,
"mean_token_accuracy": 0.9047226509451867,
"step": 4300
},
{
"epoch": 2.6205753878596174,
"grad_norm": 0.545863687992096,
"learning_rate": 6.3090214988949165e-06,
"loss": 0.5955,
"mean_token_accuracy": 0.901444385945797,
"step": 4350
},
{
"epoch": 2.650700406687754,
"grad_norm": 0.4422617554664612,
"learning_rate": 5.80671087000201e-06,
"loss": 0.5588,
"mean_token_accuracy": 0.9066709437966347,
"step": 4400
},
{
"epoch": 2.680825425515891,
"grad_norm": 0.7092880010604858,
"learning_rate": 5.304400241109103e-06,
"loss": 0.602,
"mean_token_accuracy": 0.900201300829649,
"step": 4450
},
{
"epoch": 2.7109504443440278,
"grad_norm": 0.33753281831741333,
"learning_rate": 4.802089612216195e-06,
"loss": 0.5353,
"mean_token_accuracy": 0.9101526521146297,
"step": 4500
},
{
"epoch": 2.7410754631721646,
"grad_norm": 0.49151691794395447,
"learning_rate": 4.2997789833232875e-06,
"loss": 0.6013,
"mean_token_accuracy": 0.8975072601437568,
"step": 4550
},
{
"epoch": 2.7712004820003013,
"grad_norm": 0.5509622693061829,
"learning_rate": 3.7974683544303802e-06,
"loss": 0.5806,
"mean_token_accuracy": 0.9017076626420021,
"step": 4600
},
{
"epoch": 2.801325500828438,
"grad_norm": 0.46273571252822876,
"learning_rate": 3.2951577255374726e-06,
"loss": 0.6097,
"mean_token_accuracy": 0.8960529206693173,
"step": 4650
},
{
"epoch": 2.831450519656575,
"grad_norm": 0.4964665174484253,
"learning_rate": 2.792847096644565e-06,
"loss": 0.546,
"mean_token_accuracy": 0.9039208325743675,
"step": 4700
},
{
"epoch": 2.8615755384847117,
"grad_norm": 0.5726104378700256,
"learning_rate": 2.2905364677516576e-06,
"loss": 0.5698,
"mean_token_accuracy": 0.9045622007548809,
"step": 4750
},
{
"epoch": 2.8917005573128485,
"grad_norm": 0.47625041007995605,
"learning_rate": 1.7882258388587504e-06,
"loss": 0.6098,
"mean_token_accuracy": 0.8977401655912399,
"step": 4800
},
{
"epoch": 2.921825576140985,
"grad_norm": 0.8348466157913208,
"learning_rate": 1.285915209965843e-06,
"loss": 0.5718,
"mean_token_accuracy": 0.9037941220402718,
"step": 4850
},
{
"epoch": 2.951950594969122,
"grad_norm": 0.6317358016967773,
"learning_rate": 7.836045810729356e-07,
"loss": 0.5573,
"mean_token_accuracy": 0.9056886151432991,
"step": 4900
},
{
"epoch": 2.9820756137972584,
"grad_norm": 0.5102740526199341,
"learning_rate": 2.8129395218002816e-07,
"loss": 0.5308,
"mean_token_accuracy": 0.9070908261835575,
"step": 4950
},
{
"epoch": 2.9983431239644522,
"eval_loss": 0.6088222861289978,
"eval_mean_token_accuracy": 0.8896377841730665,
"eval_runtime": 76.8211,
"eval_samples_per_second": 19.213,
"eval_steps_per_second": 2.408,
"step": 4977
},
{
"epoch": 2.9983431239644522,
"step": 4977,
"total_flos": 1017098040639488.0,
"train_loss": 0.6195706200211647,
"train_runtime": 35359.1981,
"train_samples_per_second": 1.127,
"train_steps_per_second": 0.141
}
],
"logging_steps": 50,
"max_steps": 4977,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1017098040639488.0,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}