{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.9983431239644522,
  "eval_steps": 500,
  "global_step": 4977,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.030125018828136767,
      "grad_norm": 0.30684176087379456,
      "learning_rate": 4.94976893711071e-05,
      "loss": 1.2767,
      "mean_token_accuracy": 0.7496056535840034,
      "step": 50
    },
    {
      "epoch": 0.060250037656273535,
      "grad_norm": 0.25573843717575073,
      "learning_rate": 4.899537874221419e-05,
      "loss": 0.7198,
      "mean_token_accuracy": 0.8821907821297645,
      "step": 100
    },
    {
      "epoch": 0.0903750564844103,
      "grad_norm": 0.30719634890556335,
      "learning_rate": 4.849306811332128e-05,
      "loss": 0.7046,
      "mean_token_accuracy": 0.886384769231081,
      "step": 150
    },
    {
      "epoch": 0.12050007531254707,
      "grad_norm": 0.28959545493125916,
      "learning_rate": 4.799075748442837e-05,
      "loss": 0.7079,
      "mean_token_accuracy": 0.885994749814272,
      "step": 200
    },
    {
      "epoch": 0.15062509414068384,
      "grad_norm": 0.23001542687416077,
      "learning_rate": 4.748844685553547e-05,
      "loss": 0.6882,
      "mean_token_accuracy": 0.8865358050167561,
      "step": 250
    },
    {
      "epoch": 0.1807501129688206,
      "grad_norm": 0.28763362765312195,
      "learning_rate": 4.6986136226642556e-05,
      "loss": 0.6663,
      "mean_token_accuracy": 0.8903915384411811,
      "step": 300
    },
    {
      "epoch": 0.21087513179695738,
      "grad_norm": 0.2092406004667282,
      "learning_rate": 4.648382559774965e-05,
      "loss": 0.6441,
      "mean_token_accuracy": 0.8928172151744366,
      "step": 350
    },
    {
      "epoch": 0.24100015062509414,
      "grad_norm": 0.2575877606868744,
      "learning_rate": 4.598151496885674e-05,
      "loss": 0.6986,
      "mean_token_accuracy": 0.8838813950121402,
      "step": 400
    },
    {
      "epoch": 0.2711251694532309,
      "grad_norm": 0.25107163190841675,
      "learning_rate": 4.547920433996384e-05,
      "loss": 0.6658,
      "mean_token_accuracy": 0.8893497291207314,
      "step": 450
    },
    {
      "epoch": 0.3012501882813677,
      "grad_norm": 0.2437737137079239,
      "learning_rate": 4.4976893711070926e-05,
      "loss": 0.6684,
      "mean_token_accuracy": 0.8893143194913864,
      "step": 500
    },
    {
      "epoch": 0.33137520710950447,
      "grad_norm": 0.2163419872522354,
      "learning_rate": 4.447458308217802e-05,
      "loss": 0.7015,
      "mean_token_accuracy": 0.8857556004822255,
      "step": 550
    },
    {
      "epoch": 0.3615002259376412,
      "grad_norm": 0.39833882451057434,
      "learning_rate": 4.397227245328511e-05,
      "loss": 0.6729,
      "mean_token_accuracy": 0.8865716621279717,
      "step": 600
    },
    {
      "epoch": 0.391625244765778,
      "grad_norm": 0.3186735212802887,
      "learning_rate": 4.3469961824392206e-05,
      "loss": 0.6451,
      "mean_token_accuracy": 0.892935143262148,
      "step": 650
    },
    {
      "epoch": 0.42175026359391476,
      "grad_norm": 0.2682092487812042,
      "learning_rate": 4.2967651195499295e-05,
      "loss": 0.704,
      "mean_token_accuracy": 0.8814965118467808,
      "step": 700
    },
    {
      "epoch": 0.4518752824220515,
      "grad_norm": 0.32946068048477173,
      "learning_rate": 4.246534056660639e-05,
      "loss": 0.6565,
      "mean_token_accuracy": 0.8922205206751823,
      "step": 750
    },
    {
      "epoch": 0.4820003012501883,
      "grad_norm": 0.28554585576057434,
      "learning_rate": 4.196302993771348e-05,
      "loss": 0.6471,
      "mean_token_accuracy": 0.8931228183209896,
      "step": 800
    },
    {
      "epoch": 0.512125320078325,
      "grad_norm": 0.19599439203739166,
      "learning_rate": 4.1460719308820575e-05,
      "loss": 0.6864,
      "mean_token_accuracy": 0.8856179165840149,
      "step": 850
    },
    {
      "epoch": 0.5422503389064618,
      "grad_norm": 0.30608075857162476,
      "learning_rate": 4.095840867992767e-05,
      "loss": 0.6508,
      "mean_token_accuracy": 0.8878815796971321,
      "step": 900
    },
    {
      "epoch": 0.5723753577345986,
      "grad_norm": 0.254626989364624,
      "learning_rate": 4.045609805103476e-05,
      "loss": 0.6196,
      "mean_token_accuracy": 0.8950139920413495,
      "step": 950
    },
    {
      "epoch": 0.6025003765627354,
      "grad_norm": 0.42999160289764404,
      "learning_rate": 3.9953787422141856e-05,
      "loss": 0.6342,
      "mean_token_accuracy": 0.8938413085043431,
      "step": 1000
    },
    {
      "epoch": 0.6326253953908721,
      "grad_norm": 0.23657967150211334,
      "learning_rate": 3.945147679324895e-05,
      "loss": 0.6389,
      "mean_token_accuracy": 0.894249224960804,
      "step": 1050
    },
    {
      "epoch": 0.6627504142190089,
      "grad_norm": 0.3286744952201843,
      "learning_rate": 3.894916616435604e-05,
      "loss": 0.6349,
      "mean_token_accuracy": 0.8949852520227433,
      "step": 1100
    },
    {
      "epoch": 0.6928754330471456,
      "grad_norm": 0.3509972393512726,
      "learning_rate": 3.8446855535463136e-05,
      "loss": 0.6118,
      "mean_token_accuracy": 0.8998459935188293,
      "step": 1150
    },
    {
      "epoch": 0.7230004518752824,
      "grad_norm": 0.3571523129940033,
      "learning_rate": 3.7944544906570225e-05,
      "loss": 0.6381,
      "mean_token_accuracy": 0.8936076226830483,
      "step": 1200
    },
    {
      "epoch": 0.7531254707034192,
      "grad_norm": 0.3348468244075775,
      "learning_rate": 3.744223427767732e-05,
      "loss": 0.6522,
      "mean_token_accuracy": 0.8882578992843628,
      "step": 1250
    },
    {
      "epoch": 0.783250489531556,
      "grad_norm": 0.28266018629074097,
      "learning_rate": 3.693992364878441e-05,
      "loss": 0.6246,
      "mean_token_accuracy": 0.8964510107040405,
      "step": 1300
    },
    {
      "epoch": 0.8133755083596927,
      "grad_norm": 0.4280668795108795,
      "learning_rate": 3.6437613019891505e-05,
      "loss": 0.648,
      "mean_token_accuracy": 0.8886529618501663,
      "step": 1350
    },
    {
      "epoch": 0.8435005271878295,
      "grad_norm": 0.3760441839694977,
      "learning_rate": 3.5935302390998594e-05,
      "loss": 0.6051,
      "mean_token_accuracy": 0.897853167951107,
      "step": 1400
    },
    {
      "epoch": 0.8736255460159663,
      "grad_norm": 0.4479055106639862,
      "learning_rate": 3.543299176210569e-05,
      "loss": 0.5927,
      "mean_token_accuracy": 0.9007090017199516,
      "step": 1450
    },
    {
      "epoch": 0.903750564844103,
      "grad_norm": 0.28697535395622253,
      "learning_rate": 3.493068113321278e-05,
      "loss": 0.7065,
      "mean_token_accuracy": 0.8828328484296799,
      "step": 1500
    },
    {
      "epoch": 0.9338755836722398,
      "grad_norm": 0.2910836338996887,
      "learning_rate": 3.4428370504319875e-05,
      "loss": 0.672,
      "mean_token_accuracy": 0.8896669654548168,
      "step": 1550
    },
    {
      "epoch": 0.9640006025003766,
      "grad_norm": 0.39928898215293884,
      "learning_rate": 3.3926059875426964e-05,
      "loss": 0.6337,
      "mean_token_accuracy": 0.89401711165905,
      "step": 1600
    },
    {
      "epoch": 0.9941256213285133,
      "grad_norm": 0.23171083629131317,
      "learning_rate": 3.342374924653406e-05,
      "loss": 0.6376,
      "mean_token_accuracy": 0.8934089505672455,
      "step": 1650
    },
    {
      "epoch": 1.0,
      "eval_loss": 0.632087230682373,
      "eval_mean_token_accuracy": 0.8758415237579056,
      "eval_runtime": 77.2141,
      "eval_samples_per_second": 19.116,
      "eval_steps_per_second": 2.396,
      "step": 1660
    },
    {
      "epoch": 1.0241000150625095,
      "grad_norm": 0.2895660102367401,
      "learning_rate": 3.293148483021901e-05,
      "loss": 0.6191,
      "mean_token_accuracy": 0.8999803757295013,
      "step": 1700
    },
    {
      "epoch": 1.0542250338906463,
      "grad_norm": 0.3923441171646118,
      "learning_rate": 3.2429174201326105e-05,
      "loss": 0.6278,
      "mean_token_accuracy": 0.8943565684556961,
      "step": 1750
    },
    {
      "epoch": 1.084350052718783,
      "grad_norm": 0.3033309876918793,
      "learning_rate": 3.1926863572433193e-05,
      "loss": 0.6631,
      "mean_token_accuracy": 0.892311205714941,
      "step": 1800
    },
    {
      "epoch": 1.1144750715469196,
      "grad_norm": 0.26226454973220825,
      "learning_rate": 3.142455294354029e-05,
      "loss": 0.6549,
      "mean_token_accuracy": 0.8898773008584976,
      "step": 1850
    },
    {
      "epoch": 1.1446000903750564,
      "grad_norm": 0.36343246698379517,
      "learning_rate": 3.0922242314647385e-05,
      "loss": 0.6303,
      "mean_token_accuracy": 0.893774523884058,
      "step": 1900
    },
    {
      "epoch": 1.1747251092031932,
      "grad_norm": 0.3890613615512848,
      "learning_rate": 3.041993168575447e-05,
      "loss": 0.609,
      "mean_token_accuracy": 0.8982135467231274,
      "step": 1950
    },
    {
      "epoch": 1.20485012803133,
      "grad_norm": 0.3525061011314392,
      "learning_rate": 2.9917621056861566e-05,
      "loss": 0.5923,
      "mean_token_accuracy": 0.9009903834760189,
      "step": 2000
    },
    {
      "epoch": 1.2349751468594667,
      "grad_norm": 0.45349597930908203,
      "learning_rate": 2.9415310427968655e-05,
      "loss": 0.6066,
      "mean_token_accuracy": 0.9004408088326454,
      "step": 2050
    },
    {
      "epoch": 1.2651001656876035,
      "grad_norm": 0.32030248641967773,
      "learning_rate": 2.891299979907575e-05,
      "loss": 0.6295,
      "mean_token_accuracy": 0.8940686418116093,
      "step": 2100
    },
    {
      "epoch": 1.2952251845157403,
      "grad_norm": 0.3644977807998657,
      "learning_rate": 2.8410689170182843e-05,
      "loss": 0.6594,
      "mean_token_accuracy": 0.8896569818258285,
      "step": 2150
    },
    {
      "epoch": 1.325350203343877,
      "grad_norm": 0.3809216022491455,
      "learning_rate": 2.7908378541289935e-05,
      "loss": 0.5941,
      "mean_token_accuracy": 0.898374630510807,
      "step": 2200
    },
    {
      "epoch": 1.3554752221720139,
      "grad_norm": 0.42949002981185913,
      "learning_rate": 2.7406067912397028e-05,
      "loss": 0.6087,
      "mean_token_accuracy": 0.8992364549636841,
      "step": 2250
    },
    {
      "epoch": 1.3856002410001507,
      "grad_norm": 0.47053080797195435,
      "learning_rate": 2.6903757283504123e-05,
      "loss": 0.5955,
      "mean_token_accuracy": 0.8992098160088062,
      "step": 2300
    },
    {
      "epoch": 1.4157252598282875,
      "grad_norm": 0.21600554883480072,
      "learning_rate": 2.6401446654611212e-05,
      "loss": 0.5848,
      "mean_token_accuracy": 0.8993302121758461,
      "step": 2350
    },
    {
      "epoch": 1.4458502786564242,
      "grad_norm": 0.3977588713169098,
      "learning_rate": 2.5899136025718308e-05,
      "loss": 0.5728,
      "mean_token_accuracy": 0.9013423874974251,
      "step": 2400
    },
    {
      "epoch": 1.475975297484561,
      "grad_norm": 0.3258291184902191,
      "learning_rate": 2.5396825396825397e-05,
      "loss": 0.5936,
      "mean_token_accuracy": 0.9022154864668847,
      "step": 2450
    },
    {
      "epoch": 1.5061003163126978,
      "grad_norm": 0.5135733485221863,
      "learning_rate": 2.4894514767932493e-05,
      "loss": 0.5831,
      "mean_token_accuracy": 0.8982969619333744,
      "step": 2500
    },
    {
      "epoch": 1.5362253351408346,
      "grad_norm": 0.4302254915237427,
      "learning_rate": 2.4392204139039585e-05,
      "loss": 0.5975,
      "mean_token_accuracy": 0.8992070508003235,
      "step": 2550
    },
    {
      "epoch": 1.5663503539689714,
      "grad_norm": 0.8697525858879089,
      "learning_rate": 2.3889893510146677e-05,
      "loss": 0.629,
      "mean_token_accuracy": 0.8953581416606903,
      "step": 2600
    },
    {
      "epoch": 1.5964753727971082,
      "grad_norm": 0.37328246235847473,
      "learning_rate": 2.338758288125377e-05,
      "loss": 0.5771,
      "mean_token_accuracy": 0.9028278756141662,
      "step": 2650
    },
    {
      "epoch": 1.6266003916252447,
      "grad_norm": 0.42918869853019714,
      "learning_rate": 2.2885272252360862e-05,
      "loss": 0.6585,
      "mean_token_accuracy": 0.8911288838088512,
      "step": 2700
    },
    {
      "epoch": 1.6567254104533815,
      "grad_norm": 0.39805442094802856,
      "learning_rate": 2.2382961623467954e-05,
      "loss": 0.5669,
      "mean_token_accuracy": 0.9028179155290127,
      "step": 2750
    },
    {
      "epoch": 1.6868504292815183,
      "grad_norm": 0.2861442565917969,
      "learning_rate": 2.1880650994575047e-05,
      "loss": 0.6342,
      "mean_token_accuracy": 0.897926286906004,
      "step": 2800
    },
    {
      "epoch": 1.716975448109655,
      "grad_norm": 0.36629295349121094,
      "learning_rate": 2.137834036568214e-05,
      "loss": 0.615,
      "mean_token_accuracy": 0.8986844432353973,
      "step": 2850
    },
    {
      "epoch": 1.7471004669377919,
      "grad_norm": 0.22408436238765717,
      "learning_rate": 2.087602973678923e-05,
      "loss": 0.6102,
      "mean_token_accuracy": 0.8989599145203829,
      "step": 2900
    },
    {
      "epoch": 1.7772254857659286,
      "grad_norm": 0.533674955368042,
      "learning_rate": 2.0373719107896324e-05,
      "loss": 0.6184,
      "mean_token_accuracy": 0.8953662586212158,
      "step": 2950
    },
    {
      "epoch": 1.8073505045940652,
      "grad_norm": 0.49870041012763977,
      "learning_rate": 1.9871408479003416e-05,
      "loss": 0.58,
      "mean_token_accuracy": 0.8989548328518867,
      "step": 3000
    },
    {
      "epoch": 1.837475523422202,
      "grad_norm": 0.5503713488578796,
      "learning_rate": 1.936909785011051e-05,
      "loss": 0.5872,
      "mean_token_accuracy": 0.9030785009264946,
      "step": 3050
    },
    {
      "epoch": 1.8676005422503388,
      "grad_norm": 0.2998668849468231,
      "learning_rate": 1.88667872212176e-05,
      "loss": 0.6009,
      "mean_token_accuracy": 0.8998409834504127,
      "step": 3100
    },
    {
      "epoch": 1.8977255610784756,
      "grad_norm": 0.43862882256507874,
      "learning_rate": 1.8364476592324696e-05,
      "loss": 0.5873,
      "mean_token_accuracy": 0.9020068399608135,
      "step": 3150
    },
    {
      "epoch": 1.9278505799066123,
      "grad_norm": 0.34258952736854553,
      "learning_rate": 1.786216596343179e-05,
      "loss": 0.6118,
      "mean_token_accuracy": 0.8974671520292758,
      "step": 3200
    },
    {
      "epoch": 1.9579755987347491,
      "grad_norm": 0.42319709062576294,
      "learning_rate": 1.735985533453888e-05,
      "loss": 0.6068,
      "mean_token_accuracy": 0.8985735175013542,
      "step": 3250
    },
    {
      "epoch": 1.988100617562886,
      "grad_norm": 0.4495251774787903,
      "learning_rate": 1.6857544705645973e-05,
      "loss": 0.5651,
      "mean_token_accuracy": 0.9044052864611148,
      "step": 3300
    },
    {
      "epoch": 2.0,
      "eval_loss": 0.6136223077774048,
      "eval_mean_token_accuracy": 0.8889741445769374,
      "eval_runtime": 77.9377,
      "eval_samples_per_second": 18.938,
      "eval_steps_per_second": 2.374,
      "step": 3320
    },
    {
      "epoch": 2.018075011296882,
      "grad_norm": 0.4041847288608551,
      "learning_rate": 1.6355234076753066e-05,
      "loss": 0.6079,
      "mean_token_accuracy": 0.8902337176104387,
      "step": 3350
    },
    {
      "epoch": 2.048200030125019,
      "grad_norm": 0.34565427899360657,
      "learning_rate": 1.5852923447860158e-05,
      "loss": 0.552,
      "mean_token_accuracy": 0.907501307874918,
      "step": 3400
    },
    {
      "epoch": 2.0783250489531557,
      "grad_norm": 0.39507198333740234,
      "learning_rate": 1.535061281896725e-05,
      "loss": 0.5902,
      "mean_token_accuracy": 0.8986614851653576,
      "step": 3450
    },
    {
      "epoch": 2.1084500677812925,
      "grad_norm": 0.35742080211639404,
      "learning_rate": 1.4848302190074343e-05,
      "loss": 0.5765,
      "mean_token_accuracy": 0.9004091265797615,
      "step": 3500
    },
    {
      "epoch": 2.1385750866094293,
      "grad_norm": 0.4014514684677124,
      "learning_rate": 1.4345991561181435e-05,
      "loss": 0.6075,
      "mean_token_accuracy": 0.8991987191140651,
      "step": 3550
    },
    {
      "epoch": 2.168700105437566,
      "grad_norm": 0.2786984443664551,
      "learning_rate": 1.3843680932288527e-05,
      "loss": 0.5772,
      "mean_token_accuracy": 0.9038310977816582,
      "step": 3600
    },
    {
      "epoch": 2.198825124265703,
      "grad_norm": 0.4025174081325531,
      "learning_rate": 1.334137030339562e-05,
      "loss": 0.5891,
      "mean_token_accuracy": 0.8991677206754685,
      "step": 3650
    },
    {
      "epoch": 2.228950143093839,
      "grad_norm": 0.4690361022949219,
      "learning_rate": 1.2839059674502712e-05,
      "loss": 0.5261,
      "mean_token_accuracy": 0.9109566512703896,
      "step": 3700
    },
    {
      "epoch": 2.2590751619219764,
      "grad_norm": 0.3918741047382355,
      "learning_rate": 1.2336749045609804e-05,
      "loss": 0.5686,
      "mean_token_accuracy": 0.9028687690198421,
      "step": 3750
    },
    {
      "epoch": 2.289200180750113,
      "grad_norm": 0.4912905991077423,
      "learning_rate": 1.1834438416716898e-05,
      "loss": 0.5102,
      "mean_token_accuracy": 0.9118883027136326,
      "step": 3800
    },
    {
      "epoch": 2.3193251995782496,
      "grad_norm": 0.7354199886322021,
      "learning_rate": 1.133212778782399e-05,
      "loss": 0.5748,
      "mean_token_accuracy": 0.90053293466568,
      "step": 3850
    },
    {
      "epoch": 2.3494502184063863,
      "grad_norm": 0.49514544010162354,
      "learning_rate": 1.0829817158931085e-05,
      "loss": 0.5867,
      "mean_token_accuracy": 0.9003237128257752,
      "step": 3900
    },
    {
      "epoch": 2.379575237234523,
      "grad_norm": 0.5507615804672241,
      "learning_rate": 1.0327506530038177e-05,
      "loss": 0.5952,
      "mean_token_accuracy": 0.9011906269192695,
      "step": 3950
    },
    {
      "epoch": 2.40970025606266,
      "grad_norm": 0.9635323286056519,
      "learning_rate": 9.82519590114527e-06,
      "loss": 0.5871,
      "mean_token_accuracy": 0.9019941617548466,
      "step": 4000
    },
    {
      "epoch": 2.4398252748907967,
      "grad_norm": 0.306292325258255,
      "learning_rate": 9.322885272252362e-06,
      "loss": 0.5418,
      "mean_token_accuracy": 0.907406060397625,
      "step": 4050
    },
    {
      "epoch": 2.4699502937189335,
      "grad_norm": 0.36833733320236206,
      "learning_rate": 8.820574643359454e-06,
      "loss": 0.5374,
      "mean_token_accuracy": 0.9102728597819805,
      "step": 4100
    },
    {
      "epoch": 2.5000753125470703,
      "grad_norm": 0.4845290780067444,
      "learning_rate": 8.318264014466546e-06,
      "loss": 0.6115,
      "mean_token_accuracy": 0.9012929057329893,
      "step": 4150
    },
    {
      "epoch": 2.530200331375207,
      "grad_norm": 0.4215283691883087,
      "learning_rate": 7.815953385573639e-06,
      "loss": 0.5214,
      "mean_token_accuracy": 0.909003015756607,
      "step": 4200
    },
    {
      "epoch": 2.560325350203344,
      "grad_norm": 0.4454072415828705,
      "learning_rate": 7.313642756680732e-06,
      "loss": 0.6277,
      "mean_token_accuracy": 0.8945660217106343,
      "step": 4250
    },
    {
      "epoch": 2.5904503690314806,
      "grad_norm": 0.7070040106773376,
      "learning_rate": 6.811332127787824e-06,
      "loss": 0.5678,
      "mean_token_accuracy": 0.9047226509451867,
      "step": 4300
    },
    {
      "epoch": 2.6205753878596174,
      "grad_norm": 0.545863687992096,
      "learning_rate": 6.3090214988949165e-06,
      "loss": 0.5955,
      "mean_token_accuracy": 0.901444385945797,
      "step": 4350
    },
    {
      "epoch": 2.650700406687754,
      "grad_norm": 0.4422617554664612,
      "learning_rate": 5.80671087000201e-06,
      "loss": 0.5588,
      "mean_token_accuracy": 0.9066709437966347,
      "step": 4400
    },
    {
      "epoch": 2.680825425515891,
      "grad_norm": 0.7092880010604858,
      "learning_rate": 5.304400241109103e-06,
      "loss": 0.602,
      "mean_token_accuracy": 0.900201300829649,
      "step": 4450
    },
    {
      "epoch": 2.7109504443440278,
      "grad_norm": 0.33753281831741333,
      "learning_rate": 4.802089612216195e-06,
      "loss": 0.5353,
      "mean_token_accuracy": 0.9101526521146297,
      "step": 4500
    },
    {
      "epoch": 2.7410754631721646,
      "grad_norm": 0.49151691794395447,
      "learning_rate": 4.2997789833232875e-06,
      "loss": 0.6013,
      "mean_token_accuracy": 0.8975072601437568,
      "step": 4550
    },
    {
      "epoch": 2.7712004820003013,
      "grad_norm": 0.5509622693061829,
      "learning_rate": 3.7974683544303802e-06,
      "loss": 0.5806,
      "mean_token_accuracy": 0.9017076626420021,
      "step": 4600
    },
    {
      "epoch": 2.801325500828438,
      "grad_norm": 0.46273571252822876,
      "learning_rate": 3.2951577255374726e-06,
      "loss": 0.6097,
      "mean_token_accuracy": 0.8960529206693173,
      "step": 4650
    },
    {
      "epoch": 2.831450519656575,
      "grad_norm": 0.4964665174484253,
      "learning_rate": 2.792847096644565e-06,
      "loss": 0.546,
      "mean_token_accuracy": 0.9039208325743675,
      "step": 4700
    },
    {
      "epoch": 2.8615755384847117,
      "grad_norm": 0.5726104378700256,
      "learning_rate": 2.2905364677516576e-06,
      "loss": 0.5698,
      "mean_token_accuracy": 0.9045622007548809,
      "step": 4750
    },
    {
      "epoch": 2.8917005573128485,
      "grad_norm": 0.47625041007995605,
      "learning_rate": 1.7882258388587504e-06,
      "loss": 0.6098,
      "mean_token_accuracy": 0.8977401655912399,
      "step": 4800
    },
    {
      "epoch": 2.921825576140985,
      "grad_norm": 0.8348466157913208,
      "learning_rate": 1.285915209965843e-06,
      "loss": 0.5718,
      "mean_token_accuracy": 0.9037941220402718,
      "step": 4850
    },
    {
      "epoch": 2.951950594969122,
      "grad_norm": 0.6317358016967773,
      "learning_rate": 7.836045810729356e-07,
      "loss": 0.5573,
      "mean_token_accuracy": 0.9056886151432991,
      "step": 4900
    },
    {
      "epoch": 2.9820756137972584,
      "grad_norm": 0.5102740526199341,
      "learning_rate": 2.8129395218002816e-07,
      "loss": 0.5308,
      "mean_token_accuracy": 0.9070908261835575,
      "step": 4950
    },
    {
      "epoch": 2.9983431239644522,
      "eval_loss": 0.6088222861289978,
      "eval_mean_token_accuracy": 0.8896377841730665,
      "eval_runtime": 76.8211,
      "eval_samples_per_second": 19.213,
      "eval_steps_per_second": 2.408,
      "step": 4977
    },
    {
      "epoch": 2.9983431239644522,
      "step": 4977,
      "total_flos": 1017098040639488.0,
      "train_loss": 0.6195706200211647,
      "train_runtime": 35359.1981,
      "train_samples_per_second": 1.127,
      "train_steps_per_second": 0.141
    }
  ],
  "logging_steps": 50,
  "max_steps": 4977,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1017098040639488.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}