{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.9983431239644522, "eval_steps": 500, "global_step": 4977, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.030125018828136767, "grad_norm": 0.30684176087379456, "learning_rate": 4.94976893711071e-05, "loss": 1.2767, "mean_token_accuracy": 0.7496056535840034, "step": 50 }, { "epoch": 0.060250037656273535, "grad_norm": 0.25573843717575073, "learning_rate": 4.899537874221419e-05, "loss": 0.7198, "mean_token_accuracy": 0.8821907821297645, "step": 100 }, { "epoch": 0.0903750564844103, "grad_norm": 0.30719634890556335, "learning_rate": 4.849306811332128e-05, "loss": 0.7046, "mean_token_accuracy": 0.886384769231081, "step": 150 }, { "epoch": 0.12050007531254707, "grad_norm": 0.28959545493125916, "learning_rate": 4.799075748442837e-05, "loss": 0.7079, "mean_token_accuracy": 0.885994749814272, "step": 200 }, { "epoch": 0.15062509414068384, "grad_norm": 0.23001542687416077, "learning_rate": 4.748844685553547e-05, "loss": 0.6882, "mean_token_accuracy": 0.8865358050167561, "step": 250 }, { "epoch": 0.1807501129688206, "grad_norm": 0.28763362765312195, "learning_rate": 4.6986136226642556e-05, "loss": 0.6663, "mean_token_accuracy": 0.8903915384411811, "step": 300 }, { "epoch": 0.21087513179695738, "grad_norm": 0.2092406004667282, "learning_rate": 4.648382559774965e-05, "loss": 0.6441, "mean_token_accuracy": 0.8928172151744366, "step": 350 }, { "epoch": 0.24100015062509414, "grad_norm": 0.2575877606868744, "learning_rate": 4.598151496885674e-05, "loss": 0.6986, "mean_token_accuracy": 0.8838813950121402, "step": 400 }, { "epoch": 0.2711251694532309, "grad_norm": 0.25107163190841675, "learning_rate": 4.547920433996384e-05, "loss": 0.6658, "mean_token_accuracy": 0.8893497291207314, "step": 450 }, { "epoch": 0.3012501882813677, "grad_norm": 0.2437737137079239, "learning_rate": 4.4976893711070926e-05, "loss": 0.6684, "mean_token_accuracy": 0.8893143194913864, "step": 500 }, { "epoch": 0.33137520710950447, "grad_norm": 0.2163419872522354, "learning_rate": 4.447458308217802e-05, "loss": 0.7015, "mean_token_accuracy": 0.8857556004822255, "step": 550 }, { "epoch": 0.3615002259376412, "grad_norm": 0.39833882451057434, "learning_rate": 4.397227245328511e-05, "loss": 0.6729, "mean_token_accuracy": 0.8865716621279717, "step": 600 }, { "epoch": 0.391625244765778, "grad_norm": 0.3186735212802887, "learning_rate": 4.3469961824392206e-05, "loss": 0.6451, "mean_token_accuracy": 0.892935143262148, "step": 650 }, { "epoch": 0.42175026359391476, "grad_norm": 0.2682092487812042, "learning_rate": 4.2967651195499295e-05, "loss": 0.704, "mean_token_accuracy": 0.8814965118467808, "step": 700 }, { "epoch": 0.4518752824220515, "grad_norm": 0.32946068048477173, "learning_rate": 4.246534056660639e-05, "loss": 0.6565, "mean_token_accuracy": 0.8922205206751823, "step": 750 }, { "epoch": 0.4820003012501883, "grad_norm": 0.28554585576057434, "learning_rate": 4.196302993771348e-05, "loss": 0.6471, "mean_token_accuracy": 0.8931228183209896, "step": 800 }, { "epoch": 0.512125320078325, "grad_norm": 0.19599439203739166, "learning_rate": 4.1460719308820575e-05, "loss": 0.6864, "mean_token_accuracy": 0.8856179165840149, "step": 850 }, { "epoch": 0.5422503389064618, "grad_norm": 0.30608075857162476, "learning_rate": 4.095840867992767e-05, "loss": 0.6508, "mean_token_accuracy": 0.8878815796971321, "step": 900 }, { "epoch": 0.5723753577345986, "grad_norm": 0.254626989364624, "learning_rate": 4.045609805103476e-05, "loss": 0.6196, "mean_token_accuracy": 0.8950139920413495, "step": 950 }, { "epoch": 0.6025003765627354, "grad_norm": 0.42999160289764404, "learning_rate": 3.9953787422141856e-05, "loss": 0.6342, "mean_token_accuracy": 0.8938413085043431, "step": 1000 }, { "epoch": 0.6326253953908721, "grad_norm": 0.23657967150211334, "learning_rate": 3.945147679324895e-05, "loss": 0.6389, "mean_token_accuracy": 0.894249224960804, "step": 1050 }, { "epoch": 0.6627504142190089, "grad_norm": 0.3286744952201843, "learning_rate": 3.894916616435604e-05, "loss": 0.6349, "mean_token_accuracy": 0.8949852520227433, "step": 1100 }, { "epoch": 0.6928754330471456, "grad_norm": 0.3509972393512726, "learning_rate": 3.8446855535463136e-05, "loss": 0.6118, "mean_token_accuracy": 0.8998459935188293, "step": 1150 }, { "epoch": 0.7230004518752824, "grad_norm": 0.3571523129940033, "learning_rate": 3.7944544906570225e-05, "loss": 0.6381, "mean_token_accuracy": 0.8936076226830483, "step": 1200 }, { "epoch": 0.7531254707034192, "grad_norm": 0.3348468244075775, "learning_rate": 3.744223427767732e-05, "loss": 0.6522, "mean_token_accuracy": 0.8882578992843628, "step": 1250 }, { "epoch": 0.783250489531556, "grad_norm": 0.28266018629074097, "learning_rate": 3.693992364878441e-05, "loss": 0.6246, "mean_token_accuracy": 0.8964510107040405, "step": 1300 }, { "epoch": 0.8133755083596927, "grad_norm": 0.4280668795108795, "learning_rate": 3.6437613019891505e-05, "loss": 0.648, "mean_token_accuracy": 0.8886529618501663, "step": 1350 }, { "epoch": 0.8435005271878295, "grad_norm": 0.3760441839694977, "learning_rate": 3.5935302390998594e-05, "loss": 0.6051, "mean_token_accuracy": 0.897853167951107, "step": 1400 }, { "epoch": 0.8736255460159663, "grad_norm": 0.4479055106639862, "learning_rate": 3.543299176210569e-05, "loss": 0.5927, "mean_token_accuracy": 0.9007090017199516, "step": 1450 }, { "epoch": 0.903750564844103, "grad_norm": 0.28697535395622253, "learning_rate": 3.493068113321278e-05, "loss": 0.7065, "mean_token_accuracy": 0.8828328484296799, "step": 1500 }, { "epoch": 0.9338755836722398, "grad_norm": 0.2910836338996887, "learning_rate": 3.4428370504319875e-05, "loss": 0.672, "mean_token_accuracy": 0.8896669654548168, "step": 1550 }, { "epoch": 0.9640006025003766, "grad_norm": 0.39928898215293884, "learning_rate": 3.3926059875426964e-05, "loss": 0.6337, "mean_token_accuracy": 0.89401711165905, "step": 1600 }, { "epoch": 0.9941256213285133, "grad_norm": 0.23171083629131317, "learning_rate": 3.342374924653406e-05, "loss": 0.6376, "mean_token_accuracy": 0.8934089505672455, "step": 1650 }, { "epoch": 1.0, "eval_loss": 0.632087230682373, "eval_mean_token_accuracy": 0.8758415237579056, "eval_runtime": 77.2141, "eval_samples_per_second": 19.116, "eval_steps_per_second": 2.396, "step": 1660 }, { "epoch": 1.0241000150625095, "grad_norm": 0.2895660102367401, "learning_rate": 3.293148483021901e-05, "loss": 0.6191, "mean_token_accuracy": 0.8999803757295013, "step": 1700 }, { "epoch": 1.0542250338906463, "grad_norm": 0.3923441171646118, "learning_rate": 3.2429174201326105e-05, "loss": 0.6278, "mean_token_accuracy": 0.8943565684556961, "step": 1750 }, { "epoch": 1.084350052718783, "grad_norm": 0.3033309876918793, "learning_rate": 3.1926863572433193e-05, "loss": 0.6631, "mean_token_accuracy": 0.892311205714941, "step": 1800 }, { "epoch": 1.1144750715469196, "grad_norm": 0.26226454973220825, "learning_rate": 3.142455294354029e-05, "loss": 0.6549, "mean_token_accuracy": 0.8898773008584976, "step": 1850 }, { "epoch": 1.1446000903750564, "grad_norm": 0.36343246698379517, "learning_rate": 3.0922242314647385e-05, "loss": 0.6303, "mean_token_accuracy": 0.893774523884058, "step": 1900 }, { "epoch": 1.1747251092031932, "grad_norm": 0.3890613615512848, "learning_rate": 3.041993168575447e-05, "loss": 0.609, "mean_token_accuracy": 0.8982135467231274, "step": 1950 }, { "epoch": 1.20485012803133, "grad_norm": 0.3525061011314392, "learning_rate": 2.9917621056861566e-05, "loss": 0.5923, "mean_token_accuracy": 0.9009903834760189, "step": 2000 }, { "epoch": 1.2349751468594667, "grad_norm": 0.45349597930908203, "learning_rate": 2.9415310427968655e-05, "loss": 0.6066, "mean_token_accuracy": 0.9004408088326454, "step": 2050 }, { "epoch": 1.2651001656876035, "grad_norm": 0.32030248641967773, "learning_rate": 2.891299979907575e-05, "loss": 0.6295, "mean_token_accuracy": 0.8940686418116093, "step": 2100 }, { "epoch": 1.2952251845157403, "grad_norm": 0.3644977807998657, "learning_rate": 2.8410689170182843e-05, "loss": 0.6594, "mean_token_accuracy": 0.8896569818258285, "step": 2150 }, { "epoch": 1.325350203343877, "grad_norm": 0.3809216022491455, "learning_rate": 2.7908378541289935e-05, "loss": 0.5941, "mean_token_accuracy": 0.898374630510807, "step": 2200 }, { "epoch": 1.3554752221720139, "grad_norm": 0.42949002981185913, "learning_rate": 2.7406067912397028e-05, "loss": 0.6087, "mean_token_accuracy": 0.8992364549636841, "step": 2250 }, { "epoch": 1.3856002410001507, "grad_norm": 0.47053080797195435, "learning_rate": 2.6903757283504123e-05, "loss": 0.5955, "mean_token_accuracy": 0.8992098160088062, "step": 2300 }, { "epoch": 1.4157252598282875, "grad_norm": 0.21600554883480072, "learning_rate": 2.6401446654611212e-05, "loss": 0.5848, "mean_token_accuracy": 0.8993302121758461, "step": 2350 }, { "epoch": 1.4458502786564242, "grad_norm": 0.3977588713169098, "learning_rate": 2.5899136025718308e-05, "loss": 0.5728, "mean_token_accuracy": 0.9013423874974251, "step": 2400 }, { "epoch": 1.475975297484561, "grad_norm": 0.3258291184902191, "learning_rate": 2.5396825396825397e-05, "loss": 0.5936, "mean_token_accuracy": 0.9022154864668847, "step": 2450 }, { "epoch": 1.5061003163126978, "grad_norm": 0.5135733485221863, "learning_rate": 2.4894514767932493e-05, "loss": 0.5831, "mean_token_accuracy": 0.8982969619333744, "step": 2500 }, { "epoch": 1.5362253351408346, "grad_norm": 0.4302254915237427, "learning_rate": 2.4392204139039585e-05, "loss": 0.5975, "mean_token_accuracy": 0.8992070508003235, "step": 2550 }, { "epoch": 1.5663503539689714, "grad_norm": 0.8697525858879089, "learning_rate": 2.3889893510146677e-05, "loss": 0.629, "mean_token_accuracy": 0.8953581416606903, "step": 2600 }, { "epoch": 1.5964753727971082, "grad_norm": 0.37328246235847473, "learning_rate": 2.338758288125377e-05, "loss": 0.5771, "mean_token_accuracy": 0.9028278756141662, "step": 2650 }, { "epoch": 1.6266003916252447, "grad_norm": 0.42918869853019714, "learning_rate": 2.2885272252360862e-05, "loss": 0.6585, "mean_token_accuracy": 0.8911288838088512, "step": 2700 }, { "epoch": 1.6567254104533815, "grad_norm": 0.39805442094802856, "learning_rate": 2.2382961623467954e-05, "loss": 0.5669, "mean_token_accuracy": 0.9028179155290127, "step": 2750 }, { "epoch": 1.6868504292815183, "grad_norm": 0.2861442565917969, "learning_rate": 2.1880650994575047e-05, "loss": 0.6342, "mean_token_accuracy": 0.897926286906004, "step": 2800 }, { "epoch": 1.716975448109655, "grad_norm": 0.36629295349121094, "learning_rate": 2.137834036568214e-05, "loss": 0.615, "mean_token_accuracy": 0.8986844432353973, "step": 2850 }, { "epoch": 1.7471004669377919, "grad_norm": 0.22408436238765717, "learning_rate": 2.087602973678923e-05, "loss": 0.6102, "mean_token_accuracy": 0.8989599145203829, "step": 2900 }, { "epoch": 1.7772254857659286, "grad_norm": 0.533674955368042, "learning_rate": 2.0373719107896324e-05, "loss": 0.6184, "mean_token_accuracy": 0.8953662586212158, "step": 2950 }, { "epoch": 1.8073505045940652, "grad_norm": 0.49870041012763977, "learning_rate": 1.9871408479003416e-05, "loss": 0.58, "mean_token_accuracy": 0.8989548328518867, "step": 3000 }, { "epoch": 1.837475523422202, "grad_norm": 0.5503713488578796, "learning_rate": 1.936909785011051e-05, "loss": 0.5872, "mean_token_accuracy": 0.9030785009264946, "step": 3050 }, { "epoch": 1.8676005422503388, "grad_norm": 0.2998668849468231, "learning_rate": 1.88667872212176e-05, "loss": 0.6009, "mean_token_accuracy": 0.8998409834504127, "step": 3100 }, { "epoch": 1.8977255610784756, "grad_norm": 0.43862882256507874, "learning_rate": 1.8364476592324696e-05, "loss": 0.5873, "mean_token_accuracy": 0.9020068399608135, "step": 3150 }, { "epoch": 1.9278505799066123, "grad_norm": 0.34258952736854553, "learning_rate": 1.786216596343179e-05, "loss": 0.6118, "mean_token_accuracy": 0.8974671520292758, "step": 3200 }, { "epoch": 1.9579755987347491, "grad_norm": 0.42319709062576294, "learning_rate": 1.735985533453888e-05, "loss": 0.6068, "mean_token_accuracy": 0.8985735175013542, "step": 3250 }, { "epoch": 1.988100617562886, "grad_norm": 0.4495251774787903, "learning_rate": 1.6857544705645973e-05, "loss": 0.5651, "mean_token_accuracy": 0.9044052864611148, "step": 3300 }, { "epoch": 2.0, "eval_loss": 0.6136223077774048, "eval_mean_token_accuracy": 0.8889741445769374, "eval_runtime": 77.9377, "eval_samples_per_second": 18.938, "eval_steps_per_second": 2.374, "step": 3320 }, { "epoch": 2.018075011296882, "grad_norm": 0.4041847288608551, "learning_rate": 1.6355234076753066e-05, "loss": 0.6079, "mean_token_accuracy": 0.8902337176104387, "step": 3350 }, { "epoch": 2.048200030125019, "grad_norm": 0.34565427899360657, "learning_rate": 1.5852923447860158e-05, "loss": 0.552, "mean_token_accuracy": 0.907501307874918, "step": 3400 }, { "epoch": 2.0783250489531557, "grad_norm": 0.39507198333740234, "learning_rate": 1.535061281896725e-05, "loss": 0.5902, "mean_token_accuracy": 0.8986614851653576, "step": 3450 }, { "epoch": 2.1084500677812925, "grad_norm": 0.35742080211639404, "learning_rate": 1.4848302190074343e-05, "loss": 0.5765, "mean_token_accuracy": 0.9004091265797615, "step": 3500 }, { "epoch": 2.1385750866094293, "grad_norm": 0.4014514684677124, "learning_rate": 1.4345991561181435e-05, "loss": 0.6075, "mean_token_accuracy": 0.8991987191140651, "step": 3550 }, { "epoch": 2.168700105437566, "grad_norm": 0.2786984443664551, "learning_rate": 1.3843680932288527e-05, "loss": 0.5772, "mean_token_accuracy": 0.9038310977816582, "step": 3600 }, { "epoch": 2.198825124265703, "grad_norm": 0.4025174081325531, "learning_rate": 1.334137030339562e-05, "loss": 0.5891, "mean_token_accuracy": 0.8991677206754685, "step": 3650 }, { "epoch": 2.228950143093839, "grad_norm": 0.4690361022949219, "learning_rate": 1.2839059674502712e-05, "loss": 0.5261, "mean_token_accuracy": 0.9109566512703896, "step": 3700 }, { "epoch": 2.2590751619219764, "grad_norm": 0.3918741047382355, "learning_rate": 1.2336749045609804e-05, "loss": 0.5686, "mean_token_accuracy": 0.9028687690198421, "step": 3750 }, { "epoch": 2.289200180750113, "grad_norm": 0.4912905991077423, "learning_rate": 1.1834438416716898e-05, "loss": 0.5102, "mean_token_accuracy": 0.9118883027136326, "step": 3800 }, { "epoch": 2.3193251995782496, "grad_norm": 0.7354199886322021, "learning_rate": 1.133212778782399e-05, "loss": 0.5748, "mean_token_accuracy": 0.90053293466568, "step": 3850 }, { "epoch": 2.3494502184063863, "grad_norm": 0.49514544010162354, "learning_rate": 1.0829817158931085e-05, "loss": 0.5867, "mean_token_accuracy": 0.9003237128257752, "step": 3900 }, { "epoch": 2.379575237234523, "grad_norm": 0.5507615804672241, "learning_rate": 1.0327506530038177e-05, "loss": 0.5952, "mean_token_accuracy": 0.9011906269192695, "step": 3950 }, { "epoch": 2.40970025606266, "grad_norm": 0.9635323286056519, "learning_rate": 9.82519590114527e-06, "loss": 0.5871, "mean_token_accuracy": 0.9019941617548466, "step": 4000 }, { "epoch": 2.4398252748907967, "grad_norm": 0.306292325258255, "learning_rate": 9.322885272252362e-06, "loss": 0.5418, "mean_token_accuracy": 0.907406060397625, "step": 4050 }, { "epoch": 2.4699502937189335, "grad_norm": 0.36833733320236206, "learning_rate": 8.820574643359454e-06, "loss": 0.5374, "mean_token_accuracy": 0.9102728597819805, "step": 4100 }, { "epoch": 2.5000753125470703, "grad_norm": 0.4845290780067444, "learning_rate": 8.318264014466546e-06, "loss": 0.6115, "mean_token_accuracy": 0.9012929057329893, "step": 4150 }, { "epoch": 2.530200331375207, "grad_norm": 0.4215283691883087, "learning_rate": 7.815953385573639e-06, "loss": 0.5214, "mean_token_accuracy": 0.909003015756607, "step": 4200 }, { "epoch": 2.560325350203344, "grad_norm": 0.4454072415828705, "learning_rate": 7.313642756680732e-06, "loss": 0.6277, "mean_token_accuracy": 0.8945660217106343, "step": 4250 }, { "epoch": 2.5904503690314806, "grad_norm": 0.7070040106773376, "learning_rate": 6.811332127787824e-06, "loss": 0.5678, "mean_token_accuracy": 0.9047226509451867, "step": 4300 }, { "epoch": 2.6205753878596174, "grad_norm": 0.545863687992096, "learning_rate": 6.3090214988949165e-06, "loss": 0.5955, "mean_token_accuracy": 0.901444385945797, "step": 4350 }, { "epoch": 2.650700406687754, "grad_norm": 0.4422617554664612, "learning_rate": 5.80671087000201e-06, "loss": 0.5588, "mean_token_accuracy": 0.9066709437966347, "step": 4400 }, { "epoch": 2.680825425515891, "grad_norm": 0.7092880010604858, "learning_rate": 5.304400241109103e-06, "loss": 0.602, "mean_token_accuracy": 0.900201300829649, "step": 4450 }, { "epoch": 2.7109504443440278, "grad_norm": 0.33753281831741333, "learning_rate": 4.802089612216195e-06, "loss": 0.5353, "mean_token_accuracy": 0.9101526521146297, "step": 4500 }, { "epoch": 2.7410754631721646, "grad_norm": 0.49151691794395447, "learning_rate": 4.2997789833232875e-06, "loss": 0.6013, "mean_token_accuracy": 0.8975072601437568, "step": 4550 }, { "epoch": 2.7712004820003013, "grad_norm": 0.5509622693061829, "learning_rate": 3.7974683544303802e-06, "loss": 0.5806, "mean_token_accuracy": 0.9017076626420021, "step": 4600 }, { "epoch": 2.801325500828438, "grad_norm": 0.46273571252822876, "learning_rate": 3.2951577255374726e-06, "loss": 0.6097, "mean_token_accuracy": 0.8960529206693173, "step": 4650 }, { "epoch": 2.831450519656575, "grad_norm": 0.4964665174484253, "learning_rate": 2.792847096644565e-06, "loss": 0.546, "mean_token_accuracy": 0.9039208325743675, "step": 4700 }, { "epoch": 2.8615755384847117, "grad_norm": 0.5726104378700256, "learning_rate": 2.2905364677516576e-06, "loss": 0.5698, "mean_token_accuracy": 0.9045622007548809, "step": 4750 }, { "epoch": 2.8917005573128485, "grad_norm": 0.47625041007995605, "learning_rate": 1.7882258388587504e-06, "loss": 0.6098, "mean_token_accuracy": 0.8977401655912399, "step": 4800 }, { "epoch": 2.921825576140985, "grad_norm": 0.8348466157913208, "learning_rate": 1.285915209965843e-06, "loss": 0.5718, "mean_token_accuracy": 0.9037941220402718, "step": 4850 }, { "epoch": 2.951950594969122, "grad_norm": 0.6317358016967773, "learning_rate": 7.836045810729356e-07, "loss": 0.5573, "mean_token_accuracy": 0.9056886151432991, "step": 4900 }, { "epoch": 2.9820756137972584, "grad_norm": 0.5102740526199341, "learning_rate": 2.8129395218002816e-07, "loss": 0.5308, "mean_token_accuracy": 0.9070908261835575, "step": 4950 }, { "epoch": 2.9983431239644522, "eval_loss": 0.6088222861289978, "eval_mean_token_accuracy": 0.8896377841730665, "eval_runtime": 76.8211, "eval_samples_per_second": 19.213, "eval_steps_per_second": 2.408, "step": 4977 }, { "epoch": 2.9983431239644522, "step": 4977, "total_flos": 1017098040639488.0, "train_loss": 0.6195706200211647, "train_runtime": 35359.1981, "train_samples_per_second": 1.127, "train_steps_per_second": 0.141 } ], "logging_steps": 50, "max_steps": 4977, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1017098040639488.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }