{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9975031210986267, "eval_steps": 500, "global_step": 12800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007802746566791511, "grad_norm": 0.09780355542898178, "learning_rate": 4.9923533083645446e-05, "loss": 2.1517, "step": 50 }, { "epoch": 0.015605493133583021, "grad_norm": 0.1110800951719284, "learning_rate": 4.984550561797753e-05, "loss": 2.0733, "step": 100 }, { "epoch": 0.023408239700374533, "grad_norm": 0.11351309716701508, "learning_rate": 4.9767478152309616e-05, "loss": 1.9999, "step": 150 }, { "epoch": 0.031210986267166042, "grad_norm": 0.12184558063745499, "learning_rate": 4.96894506866417e-05, "loss": 1.911, "step": 200 }, { "epoch": 0.03901373283395755, "grad_norm": 0.12001396715641022, "learning_rate": 4.9611423220973786e-05, "loss": 1.8957, "step": 250 }, { "epoch": 0.04681647940074907, "grad_norm": 0.13359545171260834, "learning_rate": 4.9533395755305875e-05, "loss": 1.881, "step": 300 }, { "epoch": 0.054619225967540576, "grad_norm": 0.177729532122612, "learning_rate": 4.9455368289637956e-05, "loss": 1.8426, "step": 350 }, { "epoch": 0.062421972534332085, "grad_norm": 0.15538156032562256, "learning_rate": 4.9377340823970044e-05, "loss": 1.8528, "step": 400 }, { "epoch": 0.0702247191011236, "grad_norm": 0.1490369588136673, "learning_rate": 4.9299313358302126e-05, "loss": 1.8604, "step": 450 }, { "epoch": 0.0780274656679151, "grad_norm": 0.15821540355682373, "learning_rate": 4.9221285892634214e-05, "loss": 1.8454, "step": 500 }, { "epoch": 0.08583021223470662, "grad_norm": 0.15961939096450806, "learning_rate": 4.9143258426966296e-05, "loss": 1.8235, "step": 550 }, { "epoch": 0.09363295880149813, "grad_norm": 0.18454231321811676, "learning_rate": 4.906523096129838e-05, "loss": 1.8245, "step": 600 }, { "epoch": 0.10143570536828964, "grad_norm": 0.19588346779346466, "learning_rate": 4.8987203495630466e-05, "loss": 1.8332, "step": 650 }, { "epoch": 0.10923845193508115, "grad_norm": 0.20523308217525482, "learning_rate": 4.890917602996255e-05, "loss": 1.8176, "step": 700 }, { "epoch": 0.11704119850187265, "grad_norm": 0.2048981934785843, "learning_rate": 4.883114856429463e-05, "loss": 1.7751, "step": 750 }, { "epoch": 0.12484394506866417, "grad_norm": 0.19311609864234924, "learning_rate": 4.875312109862672e-05, "loss": 1.8034, "step": 800 }, { "epoch": 0.13264669163545567, "grad_norm": 0.21123754978179932, "learning_rate": 4.86750936329588e-05, "loss": 1.7859, "step": 850 }, { "epoch": 0.1404494382022472, "grad_norm": 0.270113080739975, "learning_rate": 4.859706616729089e-05, "loss": 1.8121, "step": 900 }, { "epoch": 0.1482521847690387, "grad_norm": 0.23597899079322815, "learning_rate": 4.8519038701622975e-05, "loss": 1.8213, "step": 950 }, { "epoch": 0.1560549313358302, "grad_norm": 0.2787221372127533, "learning_rate": 4.844101123595506e-05, "loss": 1.8048, "step": 1000 }, { "epoch": 0.16385767790262173, "grad_norm": 0.2505972981452942, "learning_rate": 4.8362983770287145e-05, "loss": 1.7799, "step": 1050 }, { "epoch": 0.17166042446941324, "grad_norm": 0.2511955797672272, "learning_rate": 4.828495630461923e-05, "loss": 1.7758, "step": 1100 }, { "epoch": 0.17946317103620474, "grad_norm": 0.2740708291530609, "learning_rate": 4.8206928838951315e-05, "loss": 1.7817, "step": 1150 }, { "epoch": 0.18726591760299627, "grad_norm": 0.30172184109687805, "learning_rate": 4.81289013732834e-05, "loss": 1.7665, "step": 1200 }, { "epoch": 0.19506866416978777, "grad_norm": 0.2734954059123993, "learning_rate": 4.8050873907615485e-05, "loss": 1.7673, "step": 1250 }, { "epoch": 0.20287141073657927, "grad_norm": 0.2837677597999573, "learning_rate": 4.797284644194757e-05, "loss": 1.7539, "step": 1300 }, { "epoch": 0.21067415730337077, "grad_norm": 0.287616491317749, "learning_rate": 4.7894818976279655e-05, "loss": 1.7845, "step": 1350 }, { "epoch": 0.2184769038701623, "grad_norm": 0.35649779438972473, "learning_rate": 4.7816791510611737e-05, "loss": 1.7629, "step": 1400 }, { "epoch": 0.2262796504369538, "grad_norm": 0.3466143012046814, "learning_rate": 4.7738764044943825e-05, "loss": 1.7637, "step": 1450 }, { "epoch": 0.2340823970037453, "grad_norm": 0.29961591958999634, "learning_rate": 4.7660736579275906e-05, "loss": 1.7775, "step": 1500 }, { "epoch": 0.24188514357053684, "grad_norm": 0.3261962831020355, "learning_rate": 4.7582709113607995e-05, "loss": 1.7651, "step": 1550 }, { "epoch": 0.24968789013732834, "grad_norm": 0.31850898265838623, "learning_rate": 4.7504681647940076e-05, "loss": 1.7601, "step": 1600 }, { "epoch": 0.25749063670411987, "grad_norm": 0.3334502577781677, "learning_rate": 4.7426654182272165e-05, "loss": 1.7783, "step": 1650 }, { "epoch": 0.26529338327091134, "grad_norm": 0.3199051320552826, "learning_rate": 4.7348626716604246e-05, "loss": 1.7492, "step": 1700 }, { "epoch": 0.2730961298377029, "grad_norm": 0.3381502628326416, "learning_rate": 4.7270599250936335e-05, "loss": 1.7592, "step": 1750 }, { "epoch": 0.2808988764044944, "grad_norm": 0.33800897002220154, "learning_rate": 4.7192571785268416e-05, "loss": 1.7403, "step": 1800 }, { "epoch": 0.2887016229712859, "grad_norm": 0.3942790925502777, "learning_rate": 4.71145443196005e-05, "loss": 1.7955, "step": 1850 }, { "epoch": 0.2965043695380774, "grad_norm": 0.3514516353607178, "learning_rate": 4.7036516853932586e-05, "loss": 1.7453, "step": 1900 }, { "epoch": 0.30430711610486894, "grad_norm": 0.3649444878101349, "learning_rate": 4.695848938826467e-05, "loss": 1.7584, "step": 1950 }, { "epoch": 0.3121098626716604, "grad_norm": 0.33589330315589905, "learning_rate": 4.6880461922596756e-05, "loss": 1.7369, "step": 2000 }, { "epoch": 0.31991260923845194, "grad_norm": 0.3481082320213318, "learning_rate": 4.680243445692884e-05, "loss": 1.7281, "step": 2050 }, { "epoch": 0.32771535580524347, "grad_norm": 0.34206053614616394, "learning_rate": 4.6724406991260926e-05, "loss": 1.7266, "step": 2100 }, { "epoch": 0.33551810237203494, "grad_norm": 0.3519572615623474, "learning_rate": 4.664637952559301e-05, "loss": 1.7365, "step": 2150 }, { "epoch": 0.3433208489388265, "grad_norm": 0.35082659125328064, "learning_rate": 4.6568352059925096e-05, "loss": 1.7411, "step": 2200 }, { "epoch": 0.351123595505618, "grad_norm": 0.3652777373790741, "learning_rate": 4.649032459425718e-05, "loss": 1.7466, "step": 2250 }, { "epoch": 0.3589263420724095, "grad_norm": 0.39173486828804016, "learning_rate": 4.6412297128589266e-05, "loss": 1.7443, "step": 2300 }, { "epoch": 0.366729088639201, "grad_norm": 0.47386667132377625, "learning_rate": 4.633426966292135e-05, "loss": 1.7276, "step": 2350 }, { "epoch": 0.37453183520599254, "grad_norm": 0.3906112015247345, "learning_rate": 4.6256242197253436e-05, "loss": 1.7194, "step": 2400 }, { "epoch": 0.382334581772784, "grad_norm": 0.4129493832588196, "learning_rate": 4.6178214731585524e-05, "loss": 1.7203, "step": 2450 }, { "epoch": 0.39013732833957554, "grad_norm": 0.4499260485172272, "learning_rate": 4.6100187265917605e-05, "loss": 1.7268, "step": 2500 }, { "epoch": 0.397940074906367, "grad_norm": 0.4021989703178406, "learning_rate": 4.6022159800249694e-05, "loss": 1.7532, "step": 2550 }, { "epoch": 0.40574282147315854, "grad_norm": 0.3842841386795044, "learning_rate": 4.5944132334581775e-05, "loss": 1.7195, "step": 2600 }, { "epoch": 0.4135455680399501, "grad_norm": 0.3462275266647339, "learning_rate": 4.5866104868913864e-05, "loss": 1.7269, "step": 2650 }, { "epoch": 0.42134831460674155, "grad_norm": 0.39410606026649475, "learning_rate": 4.5788077403245945e-05, "loss": 1.7192, "step": 2700 }, { "epoch": 0.4291510611735331, "grad_norm": 0.4126410484313965, "learning_rate": 4.5710049937578034e-05, "loss": 1.7421, "step": 2750 }, { "epoch": 0.4369538077403246, "grad_norm": 0.36984720826148987, "learning_rate": 4.5632022471910115e-05, "loss": 1.7101, "step": 2800 }, { "epoch": 0.4447565543071161, "grad_norm": 0.3527930974960327, "learning_rate": 4.5553995006242203e-05, "loss": 1.7276, "step": 2850 }, { "epoch": 0.4525593008739076, "grad_norm": 0.4303431510925293, "learning_rate": 4.5475967540574285e-05, "loss": 1.7292, "step": 2900 }, { "epoch": 0.46036204744069914, "grad_norm": 0.43609175086021423, "learning_rate": 4.5397940074906367e-05, "loss": 1.7167, "step": 2950 }, { "epoch": 0.4681647940074906, "grad_norm": 0.43081963062286377, "learning_rate": 4.5319912609238455e-05, "loss": 1.7214, "step": 3000 }, { "epoch": 0.47596754057428214, "grad_norm": 0.4468071162700653, "learning_rate": 4.5241885143570536e-05, "loss": 1.7375, "step": 3050 }, { "epoch": 0.4837702871410737, "grad_norm": 0.4700297713279724, "learning_rate": 4.516385767790262e-05, "loss": 1.7245, "step": 3100 }, { "epoch": 0.49157303370786515, "grad_norm": 0.4510380029678345, "learning_rate": 4.5085830212234706e-05, "loss": 1.7303, "step": 3150 }, { "epoch": 0.4993757802746567, "grad_norm": 0.4726191461086273, "learning_rate": 4.5007802746566795e-05, "loss": 1.7146, "step": 3200 }, { "epoch": 0.5071785268414482, "grad_norm": 0.4493762254714966, "learning_rate": 4.4929775280898876e-05, "loss": 1.7214, "step": 3250 }, { "epoch": 0.5149812734082397, "grad_norm": 0.425413578748703, "learning_rate": 4.4851747815230965e-05, "loss": 1.6912, "step": 3300 }, { "epoch": 0.5227840199750312, "grad_norm": 0.49669891595840454, "learning_rate": 4.4773720349563046e-05, "loss": 1.7082, "step": 3350 }, { "epoch": 0.5305867665418227, "grad_norm": 0.5002058148384094, "learning_rate": 4.4695692883895134e-05, "loss": 1.7119, "step": 3400 }, { "epoch": 0.5383895131086143, "grad_norm": 0.5210412740707397, "learning_rate": 4.4617665418227216e-05, "loss": 1.7139, "step": 3450 }, { "epoch": 0.5461922596754057, "grad_norm": 0.42913952469825745, "learning_rate": 4.4539637952559304e-05, "loss": 1.7083, "step": 3500 }, { "epoch": 0.5539950062421972, "grad_norm": 0.45524224638938904, "learning_rate": 4.4461610486891386e-05, "loss": 1.715, "step": 3550 }, { "epoch": 0.5617977528089888, "grad_norm": 0.47845831513404846, "learning_rate": 4.4383583021223474e-05, "loss": 1.7194, "step": 3600 }, { "epoch": 0.5696004993757803, "grad_norm": 0.4836772680282593, "learning_rate": 4.4305555555555556e-05, "loss": 1.7009, "step": 3650 }, { "epoch": 0.5774032459425718, "grad_norm": 0.4728386700153351, "learning_rate": 4.4227528089887644e-05, "loss": 1.7128, "step": 3700 }, { "epoch": 0.5852059925093633, "grad_norm": 0.4861275255680084, "learning_rate": 4.4149500624219726e-05, "loss": 1.7089, "step": 3750 }, { "epoch": 0.5930087390761548, "grad_norm": 0.5425216555595398, "learning_rate": 4.4071473158551814e-05, "loss": 1.7238, "step": 3800 }, { "epoch": 0.6008114856429463, "grad_norm": 0.4091911315917969, "learning_rate": 4.3993445692883896e-05, "loss": 1.683, "step": 3850 }, { "epoch": 0.6086142322097379, "grad_norm": 0.38465380668640137, "learning_rate": 4.3915418227215984e-05, "loss": 1.6991, "step": 3900 }, { "epoch": 0.6164169787765293, "grad_norm": 0.4543341398239136, "learning_rate": 4.383739076154807e-05, "loss": 1.7105, "step": 3950 }, { "epoch": 0.6242197253433208, "grad_norm": 0.46507373452186584, "learning_rate": 4.3759363295880154e-05, "loss": 1.6935, "step": 4000 }, { "epoch": 0.6320224719101124, "grad_norm": 0.4509834349155426, "learning_rate": 4.368133583021224e-05, "loss": 1.6869, "step": 4050 }, { "epoch": 0.6398252184769039, "grad_norm": 0.5607530474662781, "learning_rate": 4.3603308364544324e-05, "loss": 1.7125, "step": 4100 }, { "epoch": 0.6476279650436954, "grad_norm": 0.4719931483268738, "learning_rate": 4.3525280898876405e-05, "loss": 1.677, "step": 4150 }, { "epoch": 0.6554307116104869, "grad_norm": 0.4882090091705322, "learning_rate": 4.344725343320849e-05, "loss": 1.6981, "step": 4200 }, { "epoch": 0.6632334581772784, "grad_norm": 0.5204262733459473, "learning_rate": 4.3369225967540575e-05, "loss": 1.6905, "step": 4250 }, { "epoch": 0.6710362047440699, "grad_norm": 0.5416198372840881, "learning_rate": 4.329119850187266e-05, "loss": 1.6874, "step": 4300 }, { "epoch": 0.6788389513108615, "grad_norm": 0.484465628862381, "learning_rate": 4.3213171036204745e-05, "loss": 1.6967, "step": 4350 }, { "epoch": 0.686641697877653, "grad_norm": 0.4895637333393097, "learning_rate": 4.313514357053683e-05, "loss": 1.7088, "step": 4400 }, { "epoch": 0.6944444444444444, "grad_norm": 0.5524686574935913, "learning_rate": 4.3057116104868915e-05, "loss": 1.6826, "step": 4450 }, { "epoch": 0.702247191011236, "grad_norm": 0.5131920576095581, "learning_rate": 4.2979088639200997e-05, "loss": 1.6932, "step": 4500 }, { "epoch": 0.7100499375780275, "grad_norm": 0.5526320934295654, "learning_rate": 4.2901061173533085e-05, "loss": 1.7017, "step": 4550 }, { "epoch": 0.717852684144819, "grad_norm": 0.5150460600852966, "learning_rate": 4.282303370786517e-05, "loss": 1.6903, "step": 4600 }, { "epoch": 0.7256554307116105, "grad_norm": 0.5039018988609314, "learning_rate": 4.2745006242197255e-05, "loss": 1.7134, "step": 4650 }, { "epoch": 0.733458177278402, "grad_norm": 0.48572325706481934, "learning_rate": 4.266697877652934e-05, "loss": 1.6976, "step": 4700 }, { "epoch": 0.7412609238451935, "grad_norm": 0.5142014026641846, "learning_rate": 4.2588951310861425e-05, "loss": 1.6834, "step": 4750 }, { "epoch": 0.7490636704119851, "grad_norm": 0.4606572985649109, "learning_rate": 4.251092384519351e-05, "loss": 1.6793, "step": 4800 }, { "epoch": 0.7568664169787765, "grad_norm": 0.46960020065307617, "learning_rate": 4.2432896379525595e-05, "loss": 1.6828, "step": 4850 }, { "epoch": 0.764669163545568, "grad_norm": 0.4922361671924591, "learning_rate": 4.235486891385768e-05, "loss": 1.7022, "step": 4900 }, { "epoch": 0.7724719101123596, "grad_norm": 0.5289677381515503, "learning_rate": 4.2276841448189764e-05, "loss": 1.6903, "step": 4950 }, { "epoch": 0.7802746566791511, "grad_norm": 0.5616611838340759, "learning_rate": 4.219881398252185e-05, "loss": 1.6915, "step": 5000 }, { "epoch": 0.7880774032459426, "grad_norm": 0.4942910075187683, "learning_rate": 4.2120786516853934e-05, "loss": 1.6813, "step": 5050 }, { "epoch": 0.795880149812734, "grad_norm": 0.5219690203666687, "learning_rate": 4.204275905118602e-05, "loss": 1.6731, "step": 5100 }, { "epoch": 0.8036828963795256, "grad_norm": 0.4913477897644043, "learning_rate": 4.1964731585518104e-05, "loss": 1.7001, "step": 5150 }, { "epoch": 0.8114856429463171, "grad_norm": 0.49623045325279236, "learning_rate": 4.188670411985019e-05, "loss": 1.686, "step": 5200 }, { "epoch": 0.8192883895131086, "grad_norm": 0.5660040974617004, "learning_rate": 4.1808676654182274e-05, "loss": 1.6794, "step": 5250 }, { "epoch": 0.8270911360799001, "grad_norm": 0.5747349262237549, "learning_rate": 4.173064918851436e-05, "loss": 1.6847, "step": 5300 }, { "epoch": 0.8348938826466916, "grad_norm": 0.5206372737884521, "learning_rate": 4.1652621722846444e-05, "loss": 1.6719, "step": 5350 }, { "epoch": 0.8426966292134831, "grad_norm": 0.5366120934486389, "learning_rate": 4.1574594257178526e-05, "loss": 1.6696, "step": 5400 }, { "epoch": 0.8504993757802747, "grad_norm": 0.6354568600654602, "learning_rate": 4.1496566791510614e-05, "loss": 1.6657, "step": 5450 }, { "epoch": 0.8583021223470662, "grad_norm": 0.5194640159606934, "learning_rate": 4.1418539325842695e-05, "loss": 1.6731, "step": 5500 }, { "epoch": 0.8661048689138576, "grad_norm": 0.43448275327682495, "learning_rate": 4.1340511860174784e-05, "loss": 1.6639, "step": 5550 }, { "epoch": 0.8739076154806492, "grad_norm": 0.5286650657653809, "learning_rate": 4.1262484394506865e-05, "loss": 1.677, "step": 5600 }, { "epoch": 0.8817103620474407, "grad_norm": 0.49659380316734314, "learning_rate": 4.1184456928838954e-05, "loss": 1.6758, "step": 5650 }, { "epoch": 0.8895131086142322, "grad_norm": 0.5224044322967529, "learning_rate": 4.1106429463171035e-05, "loss": 1.6528, "step": 5700 }, { "epoch": 0.8973158551810237, "grad_norm": 0.510977566242218, "learning_rate": 4.1028401997503124e-05, "loss": 1.6869, "step": 5750 }, { "epoch": 0.9051186017478152, "grad_norm": 0.5862101912498474, "learning_rate": 4.0950374531835205e-05, "loss": 1.68, "step": 5800 }, { "epoch": 0.9129213483146067, "grad_norm": 0.5646480321884155, "learning_rate": 4.0872347066167293e-05, "loss": 1.6749, "step": 5850 }, { "epoch": 0.9207240948813983, "grad_norm": 0.5872883200645447, "learning_rate": 4.0794319600499375e-05, "loss": 1.6661, "step": 5900 }, { "epoch": 0.9285268414481898, "grad_norm": 0.5308676958084106, "learning_rate": 4.0716292134831463e-05, "loss": 1.6747, "step": 5950 }, { "epoch": 0.9363295880149812, "grad_norm": 0.5872898101806641, "learning_rate": 4.0638264669163545e-05, "loss": 1.6719, "step": 6000 }, { "epoch": 0.9441323345817728, "grad_norm": 0.6066872477531433, "learning_rate": 4.056023720349563e-05, "loss": 1.6746, "step": 6050 }, { "epoch": 0.9519350811485643, "grad_norm": 0.5329908132553101, "learning_rate": 4.048220973782772e-05, "loss": 1.6789, "step": 6100 }, { "epoch": 0.9597378277153558, "grad_norm": 0.4528316855430603, "learning_rate": 4.04041822721598e-05, "loss": 1.6767, "step": 6150 }, { "epoch": 0.9675405742821473, "grad_norm": 0.5394971966743469, "learning_rate": 4.032615480649189e-05, "loss": 1.6797, "step": 6200 }, { "epoch": 0.9753433208489388, "grad_norm": 0.6735771298408508, "learning_rate": 4.024812734082397e-05, "loss": 1.6842, "step": 6250 }, { "epoch": 0.9831460674157303, "grad_norm": 0.48064225912094116, "learning_rate": 4.017009987515606e-05, "loss": 1.6767, "step": 6300 }, { "epoch": 0.9909488139825219, "grad_norm": 0.49285510182380676, "learning_rate": 4.009207240948814e-05, "loss": 1.666, "step": 6350 }, { "epoch": 0.9987515605493134, "grad_norm": 0.5762596726417542, "learning_rate": 4.001404494382023e-05, "loss": 1.6766, "step": 6400 }, { "epoch": 1.006554307116105, "grad_norm": 0.4971337616443634, "learning_rate": 3.993601747815231e-05, "loss": 1.6727, "step": 6450 }, { "epoch": 1.0143570536828963, "grad_norm": 0.5485156178474426, "learning_rate": 3.9857990012484394e-05, "loss": 1.6842, "step": 6500 }, { "epoch": 1.0221598002496879, "grad_norm": 0.4900757968425751, "learning_rate": 3.977996254681648e-05, "loss": 1.6661, "step": 6550 }, { "epoch": 1.0299625468164795, "grad_norm": 0.4844594895839691, "learning_rate": 3.9701935081148564e-05, "loss": 1.6601, "step": 6600 }, { "epoch": 1.0377652933832708, "grad_norm": 0.6316475868225098, "learning_rate": 3.9623907615480646e-05, "loss": 1.6481, "step": 6650 }, { "epoch": 1.0455680399500624, "grad_norm": 0.6341625452041626, "learning_rate": 3.9545880149812734e-05, "loss": 1.6656, "step": 6700 }, { "epoch": 1.053370786516854, "grad_norm": 0.6208726763725281, "learning_rate": 3.9467852684144816e-05, "loss": 1.6379, "step": 6750 }, { "epoch": 1.0611735330836454, "grad_norm": 0.605767011642456, "learning_rate": 3.9389825218476904e-05, "loss": 1.6537, "step": 6800 }, { "epoch": 1.068976279650437, "grad_norm": 0.5422509908676147, "learning_rate": 3.931179775280899e-05, "loss": 1.6689, "step": 6850 }, { "epoch": 1.0767790262172285, "grad_norm": 0.6556758880615234, "learning_rate": 3.9233770287141074e-05, "loss": 1.6619, "step": 6900 }, { "epoch": 1.08458177278402, "grad_norm": 0.5463916659355164, "learning_rate": 3.915574282147316e-05, "loss": 1.6584, "step": 6950 }, { "epoch": 1.0923845193508115, "grad_norm": 0.660961925983429, "learning_rate": 3.9077715355805244e-05, "loss": 1.6813, "step": 7000 }, { "epoch": 1.100187265917603, "grad_norm": 0.5504911541938782, "learning_rate": 3.899968789013733e-05, "loss": 1.6482, "step": 7050 }, { "epoch": 1.1079900124843944, "grad_norm": 0.5077877044677734, "learning_rate": 3.8921660424469414e-05, "loss": 1.6508, "step": 7100 }, { "epoch": 1.115792759051186, "grad_norm": 0.5340275764465332, "learning_rate": 3.88436329588015e-05, "loss": 1.6685, "step": 7150 }, { "epoch": 1.1235955056179776, "grad_norm": 0.615564227104187, "learning_rate": 3.8765605493133584e-05, "loss": 1.6617, "step": 7200 }, { "epoch": 1.131398252184769, "grad_norm": 0.600592315196991, "learning_rate": 3.868757802746567e-05, "loss": 1.6478, "step": 7250 }, { "epoch": 1.1392009987515606, "grad_norm": 0.606829047203064, "learning_rate": 3.8609550561797754e-05, "loss": 1.6627, "step": 7300 }, { "epoch": 1.1470037453183521, "grad_norm": 0.5715992450714111, "learning_rate": 3.853152309612984e-05, "loss": 1.6724, "step": 7350 }, { "epoch": 1.1548064918851435, "grad_norm": 0.5475966334342957, "learning_rate": 3.8453495630461923e-05, "loss": 1.6762, "step": 7400 }, { "epoch": 1.162609238451935, "grad_norm": 0.5486684441566467, "learning_rate": 3.837546816479401e-05, "loss": 1.6418, "step": 7450 }, { "epoch": 1.1704119850187267, "grad_norm": 0.5656526684761047, "learning_rate": 3.829744069912609e-05, "loss": 1.6623, "step": 7500 }, { "epoch": 1.178214731585518, "grad_norm": 0.471967875957489, "learning_rate": 3.821941323345818e-05, "loss": 1.6339, "step": 7550 }, { "epoch": 1.1860174781523096, "grad_norm": 0.5814192891120911, "learning_rate": 3.814138576779026e-05, "loss": 1.6626, "step": 7600 }, { "epoch": 1.1938202247191012, "grad_norm": 0.5809513926506042, "learning_rate": 3.806335830212235e-05, "loss": 1.6521, "step": 7650 }, { "epoch": 1.2016229712858926, "grad_norm": 0.564431369304657, "learning_rate": 3.798533083645443e-05, "loss": 1.633, "step": 7700 }, { "epoch": 1.2094257178526842, "grad_norm": 0.5864349007606506, "learning_rate": 3.7907303370786515e-05, "loss": 1.6724, "step": 7750 }, { "epoch": 1.2172284644194757, "grad_norm": 0.5883368849754333, "learning_rate": 3.78292759051186e-05, "loss": 1.6661, "step": 7800 }, { "epoch": 1.225031210986267, "grad_norm": 0.5318378806114197, "learning_rate": 3.7751248439450685e-05, "loss": 1.6703, "step": 7850 }, { "epoch": 1.2328339575530587, "grad_norm": 0.6735764741897583, "learning_rate": 3.767322097378277e-05, "loss": 1.6558, "step": 7900 }, { "epoch": 1.2406367041198503, "grad_norm": 0.5900487303733826, "learning_rate": 3.7595193508114855e-05, "loss": 1.6472, "step": 7950 }, { "epoch": 1.2484394506866416, "grad_norm": 0.4971151649951935, "learning_rate": 3.751716604244694e-05, "loss": 1.6357, "step": 8000 }, { "epoch": 1.2562421972534332, "grad_norm": 0.6045508980751038, "learning_rate": 3.7439138576779024e-05, "loss": 1.6556, "step": 8050 }, { "epoch": 1.2640449438202248, "grad_norm": 0.5860553979873657, "learning_rate": 3.736111111111111e-05, "loss": 1.6659, "step": 8100 }, { "epoch": 1.2718476903870162, "grad_norm": 0.5339462161064148, "learning_rate": 3.7283083645443194e-05, "loss": 1.6707, "step": 8150 }, { "epoch": 1.2796504369538078, "grad_norm": 0.5763932466506958, "learning_rate": 3.720505617977528e-05, "loss": 1.6694, "step": 8200 }, { "epoch": 1.2874531835205993, "grad_norm": 0.5927013754844666, "learning_rate": 3.712702871410737e-05, "loss": 1.6258, "step": 8250 }, { "epoch": 1.2952559300873907, "grad_norm": 0.6203845739364624, "learning_rate": 3.704900124843945e-05, "loss": 1.6459, "step": 8300 }, { "epoch": 1.3030586766541823, "grad_norm": 0.5438473224639893, "learning_rate": 3.697097378277154e-05, "loss": 1.6352, "step": 8350 }, { "epoch": 1.3108614232209739, "grad_norm": 0.6493474245071411, "learning_rate": 3.689294631710362e-05, "loss": 1.6643, "step": 8400 }, { "epoch": 1.3186641697877652, "grad_norm": 0.5788607597351074, "learning_rate": 3.681491885143571e-05, "loss": 1.6753, "step": 8450 }, { "epoch": 1.3264669163545568, "grad_norm": 0.5591830015182495, "learning_rate": 3.673689138576779e-05, "loss": 1.6316, "step": 8500 }, { "epoch": 1.3342696629213484, "grad_norm": 0.4842735230922699, "learning_rate": 3.665886392009988e-05, "loss": 1.6699, "step": 8550 }, { "epoch": 1.3420724094881398, "grad_norm": 0.6692916750907898, "learning_rate": 3.658083645443196e-05, "loss": 1.6249, "step": 8600 }, { "epoch": 1.3498751560549314, "grad_norm": 0.5876237154006958, "learning_rate": 3.650280898876405e-05, "loss": 1.6576, "step": 8650 }, { "epoch": 1.357677902621723, "grad_norm": 0.6215786933898926, "learning_rate": 3.642478152309613e-05, "loss": 1.6332, "step": 8700 }, { "epoch": 1.3654806491885143, "grad_norm": 0.54453444480896, "learning_rate": 3.634675405742822e-05, "loss": 1.6456, "step": 8750 }, { "epoch": 1.373283395755306, "grad_norm": 0.6348562836647034, "learning_rate": 3.62687265917603e-05, "loss": 1.6476, "step": 8800 }, { "epoch": 1.3810861423220975, "grad_norm": 0.6236295700073242, "learning_rate": 3.6190699126092384e-05, "loss": 1.6676, "step": 8850 }, { "epoch": 1.3888888888888888, "grad_norm": 0.5054611563682556, "learning_rate": 3.611267166042447e-05, "loss": 1.6506, "step": 8900 }, { "epoch": 1.3966916354556804, "grad_norm": 0.582488477230072, "learning_rate": 3.6034644194756553e-05, "loss": 1.6413, "step": 8950 }, { "epoch": 1.404494382022472, "grad_norm": 0.6589245796203613, "learning_rate": 3.595661672908864e-05, "loss": 1.6824, "step": 9000 }, { "epoch": 1.4122971285892634, "grad_norm": 0.5584797859191895, "learning_rate": 3.587858926342072e-05, "loss": 1.6685, "step": 9050 }, { "epoch": 1.420099875156055, "grad_norm": 0.6652534604072571, "learning_rate": 3.580056179775281e-05, "loss": 1.6362, "step": 9100 }, { "epoch": 1.4279026217228465, "grad_norm": 0.6449257731437683, "learning_rate": 3.572253433208489e-05, "loss": 1.6543, "step": 9150 }, { "epoch": 1.435705368289638, "grad_norm": 0.6358399391174316, "learning_rate": 3.564450686641698e-05, "loss": 1.6464, "step": 9200 }, { "epoch": 1.4435081148564295, "grad_norm": 0.6031101942062378, "learning_rate": 3.556647940074906e-05, "loss": 1.6435, "step": 9250 }, { "epoch": 1.451310861423221, "grad_norm": 0.5363774299621582, "learning_rate": 3.548845193508115e-05, "loss": 1.6728, "step": 9300 }, { "epoch": 1.4591136079900124, "grad_norm": 0.6634340286254883, "learning_rate": 3.541042446941323e-05, "loss": 1.6638, "step": 9350 }, { "epoch": 1.466916354556804, "grad_norm": 0.6200147867202759, "learning_rate": 3.533239700374532e-05, "loss": 1.6537, "step": 9400 }, { "epoch": 1.4747191011235956, "grad_norm": 0.5800793766975403, "learning_rate": 3.52543695380774e-05, "loss": 1.6549, "step": 9450 }, { "epoch": 1.482521847690387, "grad_norm": 0.5839795470237732, "learning_rate": 3.517634207240949e-05, "loss": 1.6571, "step": 9500 }, { "epoch": 1.4903245942571786, "grad_norm": 0.5768577456474304, "learning_rate": 3.509831460674157e-05, "loss": 1.679, "step": 9550 }, { "epoch": 1.4981273408239701, "grad_norm": 0.5268595218658447, "learning_rate": 3.502028714107366e-05, "loss": 1.6461, "step": 9600 }, { "epoch": 1.5059300873907615, "grad_norm": 0.6356619000434875, "learning_rate": 3.494225967540574e-05, "loss": 1.6312, "step": 9650 }, { "epoch": 1.513732833957553, "grad_norm": 0.5722295641899109, "learning_rate": 3.486423220973783e-05, "loss": 1.6418, "step": 9700 }, { "epoch": 1.5215355805243447, "grad_norm": 0.5974990129470825, "learning_rate": 3.478620474406992e-05, "loss": 1.6526, "step": 9750 }, { "epoch": 1.529338327091136, "grad_norm": 0.6584164500236511, "learning_rate": 3.4708177278402e-05, "loss": 1.6749, "step": 9800 }, { "epoch": 1.5371410736579276, "grad_norm": 0.6195454001426697, "learning_rate": 3.463014981273409e-05, "loss": 1.6406, "step": 9850 }, { "epoch": 1.5449438202247192, "grad_norm": 0.5923195481300354, "learning_rate": 3.455212234706617e-05, "loss": 1.6471, "step": 9900 }, { "epoch": 1.5527465667915106, "grad_norm": 0.59232097864151, "learning_rate": 3.447409488139825e-05, "loss": 1.6595, "step": 9950 }, { "epoch": 1.5605493133583022, "grad_norm": 0.5838867425918579, "learning_rate": 3.439606741573034e-05, "loss": 1.6449, "step": 10000 }, { "epoch": 1.5683520599250937, "grad_norm": 0.6070720553398132, "learning_rate": 3.431803995006242e-05, "loss": 1.6376, "step": 10050 }, { "epoch": 1.576154806491885, "grad_norm": 0.5864161849021912, "learning_rate": 3.4240012484394504e-05, "loss": 1.6555, "step": 10100 }, { "epoch": 1.5839575530586767, "grad_norm": 0.6388084292411804, "learning_rate": 3.416198501872659e-05, "loss": 1.6522, "step": 10150 }, { "epoch": 1.5917602996254683, "grad_norm": 0.5700277090072632, "learning_rate": 3.4083957553058674e-05, "loss": 1.6347, "step": 10200 }, { "epoch": 1.5995630461922596, "grad_norm": 0.6094324588775635, "learning_rate": 3.400593008739076e-05, "loss": 1.6439, "step": 10250 }, { "epoch": 1.6073657927590512, "grad_norm": 0.6227761507034302, "learning_rate": 3.3927902621722844e-05, "loss": 1.6567, "step": 10300 }, { "epoch": 1.6151685393258428, "grad_norm": 0.6303547024726868, "learning_rate": 3.384987515605493e-05, "loss": 1.6221, "step": 10350 }, { "epoch": 1.6229712858926342, "grad_norm": 0.6025314927101135, "learning_rate": 3.3771847690387014e-05, "loss": 1.6371, "step": 10400 }, { "epoch": 1.6307740324594258, "grad_norm": 0.683813214302063, "learning_rate": 3.36938202247191e-05, "loss": 1.6364, "step": 10450 }, { "epoch": 1.6385767790262173, "grad_norm": 0.763845682144165, "learning_rate": 3.361579275905119e-05, "loss": 1.6274, "step": 10500 }, { "epoch": 1.6463795255930087, "grad_norm": 0.6550936698913574, "learning_rate": 3.353776529338327e-05, "loss": 1.6433, "step": 10550 }, { "epoch": 1.6541822721598003, "grad_norm": 0.6213018894195557, "learning_rate": 3.345973782771536e-05, "loss": 1.6405, "step": 10600 }, { "epoch": 1.6619850187265919, "grad_norm": 0.6199821829795837, "learning_rate": 3.338171036204744e-05, "loss": 1.6221, "step": 10650 }, { "epoch": 1.6697877652933832, "grad_norm": 0.6940792798995972, "learning_rate": 3.330368289637953e-05, "loss": 1.6064, "step": 10700 }, { "epoch": 1.6775905118601748, "grad_norm": 0.5895411968231201, "learning_rate": 3.322565543071161e-05, "loss": 1.6246, "step": 10750 }, { "epoch": 1.6853932584269664, "grad_norm": 0.584697425365448, "learning_rate": 3.31476279650437e-05, "loss": 1.6164, "step": 10800 }, { "epoch": 1.6931960049937578, "grad_norm": 0.5995935201644897, "learning_rate": 3.306960049937578e-05, "loss": 1.6303, "step": 10850 }, { "epoch": 1.7009987515605494, "grad_norm": 0.6184208989143372, "learning_rate": 3.299157303370787e-05, "loss": 1.6226, "step": 10900 }, { "epoch": 1.708801498127341, "grad_norm": 0.6035963892936707, "learning_rate": 3.291354556803995e-05, "loss": 1.6519, "step": 10950 }, { "epoch": 1.7166042446941323, "grad_norm": 0.6514495611190796, "learning_rate": 3.283551810237204e-05, "loss": 1.6311, "step": 11000 }, { "epoch": 1.724406991260924, "grad_norm": 0.681898832321167, "learning_rate": 3.275749063670412e-05, "loss": 1.6493, "step": 11050 }, { "epoch": 1.7322097378277155, "grad_norm": 0.641394317150116, "learning_rate": 3.267946317103621e-05, "loss": 1.6562, "step": 11100 }, { "epoch": 1.7400124843945068, "grad_norm": 0.6565654277801514, "learning_rate": 3.260143570536829e-05, "loss": 1.6193, "step": 11150 }, { "epoch": 1.7478152309612984, "grad_norm": 0.6501032114028931, "learning_rate": 3.252340823970037e-05, "loss": 1.6294, "step": 11200 }, { "epoch": 1.75561797752809, "grad_norm": 0.5766665935516357, "learning_rate": 3.244538077403246e-05, "loss": 1.6236, "step": 11250 }, { "epoch": 1.7634207240948814, "grad_norm": 0.6376942992210388, "learning_rate": 3.236735330836454e-05, "loss": 1.6592, "step": 11300 }, { "epoch": 1.771223470661673, "grad_norm": 0.6972282528877258, "learning_rate": 3.228932584269663e-05, "loss": 1.6173, "step": 11350 }, { "epoch": 1.7790262172284645, "grad_norm": 0.6171312928199768, "learning_rate": 3.221129837702871e-05, "loss": 1.6414, "step": 11400 }, { "epoch": 1.786828963795256, "grad_norm": 0.715184211730957, "learning_rate": 3.21332709113608e-05, "loss": 1.637, "step": 11450 }, { "epoch": 1.7946317103620475, "grad_norm": 0.6628192663192749, "learning_rate": 3.205524344569288e-05, "loss": 1.6102, "step": 11500 }, { "epoch": 1.802434456928839, "grad_norm": 0.6058287024497986, "learning_rate": 3.197721598002497e-05, "loss": 1.6512, "step": 11550 }, { "epoch": 1.8102372034956304, "grad_norm": 0.6275887489318848, "learning_rate": 3.189918851435705e-05, "loss": 1.6435, "step": 11600 }, { "epoch": 1.818039950062422, "grad_norm": 0.7389242053031921, "learning_rate": 3.182116104868914e-05, "loss": 1.6234, "step": 11650 }, { "epoch": 1.8258426966292136, "grad_norm": 0.649131715297699, "learning_rate": 3.174313358302122e-05, "loss": 1.6332, "step": 11700 }, { "epoch": 1.833645443196005, "grad_norm": 0.5898476839065552, "learning_rate": 3.166510611735331e-05, "loss": 1.6299, "step": 11750 }, { "epoch": 1.8414481897627963, "grad_norm": 0.617365837097168, "learning_rate": 3.158707865168539e-05, "loss": 1.608, "step": 11800 }, { "epoch": 1.8492509363295881, "grad_norm": 0.6347021460533142, "learning_rate": 3.150905118601748e-05, "loss": 1.6168, "step": 11850 }, { "epoch": 1.8570536828963795, "grad_norm": 0.6479565501213074, "learning_rate": 3.143102372034957e-05, "loss": 1.6089, "step": 11900 }, { "epoch": 1.8648564294631709, "grad_norm": 0.6168213486671448, "learning_rate": 3.135299625468165e-05, "loss": 1.63, "step": 11950 }, { "epoch": 1.8726591760299627, "grad_norm": 0.5773766040802002, "learning_rate": 3.127496878901374e-05, "loss": 1.6183, "step": 12000 }, { "epoch": 1.880461922596754, "grad_norm": 0.5600804686546326, "learning_rate": 3.119694132334582e-05, "loss": 1.6393, "step": 12050 }, { "epoch": 1.8882646691635454, "grad_norm": 0.623058557510376, "learning_rate": 3.111891385767791e-05, "loss": 1.6112, "step": 12100 }, { "epoch": 1.8960674157303372, "grad_norm": 0.5952323079109192, "learning_rate": 3.104088639200999e-05, "loss": 1.6319, "step": 12150 }, { "epoch": 1.9038701622971286, "grad_norm": 0.7055174112319946, "learning_rate": 3.096285892634208e-05, "loss": 1.6275, "step": 12200 }, { "epoch": 1.91167290886392, "grad_norm": 0.5625096559524536, "learning_rate": 3.088483146067416e-05, "loss": 1.6345, "step": 12250 }, { "epoch": 1.9194756554307117, "grad_norm": 0.5937293767929077, "learning_rate": 3.080680399500624e-05, "loss": 1.6197, "step": 12300 }, { "epoch": 1.927278401997503, "grad_norm": 0.6606655120849609, "learning_rate": 3.072877652933833e-05, "loss": 1.6201, "step": 12350 }, { "epoch": 1.9350811485642945, "grad_norm": 0.6392807960510254, "learning_rate": 3.065074906367041e-05, "loss": 1.6309, "step": 12400 }, { "epoch": 1.9428838951310863, "grad_norm": 0.7471784353256226, "learning_rate": 3.057272159800249e-05, "loss": 1.6131, "step": 12450 }, { "epoch": 1.9506866416978776, "grad_norm": 0.6735255718231201, "learning_rate": 3.0494694132334585e-05, "loss": 1.6514, "step": 12500 }, { "epoch": 1.958489388264669, "grad_norm": 0.6417968273162842, "learning_rate": 3.0416666666666666e-05, "loss": 1.6245, "step": 12550 }, { "epoch": 1.9662921348314608, "grad_norm": 0.6633313894271851, "learning_rate": 3.0338639200998755e-05, "loss": 1.6267, "step": 12600 }, { "epoch": 1.9740948813982522, "grad_norm": 0.6218631267547607, "learning_rate": 3.026061173533084e-05, "loss": 1.6472, "step": 12650 }, { "epoch": 1.9818976279650435, "grad_norm": 0.594956636428833, "learning_rate": 3.018258426966292e-05, "loss": 1.6298, "step": 12700 }, { "epoch": 1.9897003745318353, "grad_norm": 0.5888795852661133, "learning_rate": 3.010455680399501e-05, "loss": 1.626, "step": 12750 }, { "epoch": 1.9975031210986267, "grad_norm": 0.6327818632125854, "learning_rate": 3.002652933832709e-05, "loss": 1.625, "step": 12800 } ], "logging_steps": 50, "max_steps": 32040, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.7153416265151283e+17, "train_batch_size": 10, "trial_name": null, "trial_params": null }