{ "best_global_step": 203490, "best_metric": 0.6936652741069145, "best_model_checkpoint": "modernbert-heritage-category/checkpoint-203490", "epoch": 3.0, "eval_steps": 500, "global_step": 203490, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007371369600471768, "grad_norm": 1.2525012493133545, "learning_rate": 1.995105410585287e-05, "loss": 0.1758, "step": 500 }, { "epoch": 0.014742739200943536, "grad_norm": 0.9046939611434937, "learning_rate": 1.990200992677773e-05, "loss": 0.1536, "step": 1000 }, { "epoch": 0.022114108801415303, "grad_norm": 1.5520201921463013, "learning_rate": 1.9852867462774585e-05, "loss": 0.1429, "step": 1500 }, { "epoch": 0.02948547840188707, "grad_norm": 2.7046384811401367, "learning_rate": 1.980372499877144e-05, "loss": 0.1292, "step": 2000 }, { "epoch": 0.03685684800235884, "grad_norm": 0.7337467670440674, "learning_rate": 1.9754582534768295e-05, "loss": 0.1229, "step": 2500 }, { "epoch": 0.044228217602830605, "grad_norm": 0.6729874610900879, "learning_rate": 1.970544007076515e-05, "loss": 0.1193, "step": 3000 }, { "epoch": 0.05159958720330237, "grad_norm": 6.235721111297607, "learning_rate": 1.9656592461546026e-05, "loss": 0.1182, "step": 3500 }, { "epoch": 0.05897095680377414, "grad_norm": 0.02936830371618271, "learning_rate": 1.9607449997542877e-05, "loss": 0.1183, "step": 4000 }, { "epoch": 0.06634232640424591, "grad_norm": 0.028409462422132492, "learning_rate": 1.9558307533539732e-05, "loss": 0.1164, "step": 4500 }, { "epoch": 0.07371369600471768, "grad_norm": 4.09944486618042, "learning_rate": 1.9509165069536587e-05, "loss": 0.1053, "step": 5000 }, { "epoch": 0.08108506560518944, "grad_norm": 3.7119829654693604, "learning_rate": 1.9460022605533442e-05, "loss": 0.1118, "step": 5500 }, { "epoch": 0.08845643520566121, "grad_norm": 1.3968170881271362, "learning_rate": 1.9410880141530297e-05, "loss": 0.1092, "step": 6000 }, { "epoch": 0.09582780480613298, 
"grad_norm": 1.2682095766067505, "learning_rate": 1.9361737677527152e-05, "loss": 0.107, "step": 6500 }, { "epoch": 0.10319917440660474, "grad_norm": 1.898970127105713, "learning_rate": 1.9312595213524007e-05, "loss": 0.106, "step": 7000 }, { "epoch": 0.11057054400707651, "grad_norm": 1.1229842901229858, "learning_rate": 1.9263452749520862e-05, "loss": 0.1066, "step": 7500 }, { "epoch": 0.11794191360754828, "grad_norm": 1.5770803689956665, "learning_rate": 1.9214310285517717e-05, "loss": 0.1008, "step": 8000 }, { "epoch": 0.12531328320802004, "grad_norm": 2.80481219291687, "learning_rate": 1.9165167821514572e-05, "loss": 0.1115, "step": 8500 }, { "epoch": 0.13268465280849182, "grad_norm": 0.8099410533905029, "learning_rate": 1.9116025357511427e-05, "loss": 0.0979, "step": 9000 }, { "epoch": 0.1400560224089636, "grad_norm": 2.5966243743896484, "learning_rate": 1.9066882893508282e-05, "loss": 0.1111, "step": 9500 }, { "epoch": 0.14742739200943536, "grad_norm": 0.39728572964668274, "learning_rate": 1.9017740429505137e-05, "loss": 0.1014, "step": 10000 }, { "epoch": 0.15479876160990713, "grad_norm": 0.4020085334777832, "learning_rate": 1.896869625043e-05, "loss": 0.0956, "step": 10500 }, { "epoch": 0.16217013121037888, "grad_norm": 1.247157335281372, "learning_rate": 1.891955378642685e-05, "loss": 0.1054, "step": 11000 }, { "epoch": 0.16954150081085065, "grad_norm": 0.4768887758255005, "learning_rate": 1.8870411322423706e-05, "loss": 0.0981, "step": 11500 }, { "epoch": 0.17691287041132242, "grad_norm": 2.418336868286133, "learning_rate": 1.882126885842056e-05, "loss": 0.0936, "step": 12000 }, { "epoch": 0.1842842400117942, "grad_norm": 0.27637165784835815, "learning_rate": 1.877212639441742e-05, "loss": 0.0975, "step": 12500 }, { "epoch": 0.19165560961226596, "grad_norm": 0.23639962077140808, "learning_rate": 1.8722983930414275e-05, "loss": 0.0915, "step": 13000 }, { "epoch": 0.19902697921273774, "grad_norm": 0.8892920017242432, "learning_rate": 1.8673841466411126e-05, 
"loss": 0.1, "step": 13500 }, { "epoch": 0.20639834881320948, "grad_norm": 3.2099547386169434, "learning_rate": 1.862469900240798e-05, "loss": 0.0965, "step": 14000 }, { "epoch": 0.21376971841368125, "grad_norm": 0.9265658259391785, "learning_rate": 1.8575556538404836e-05, "loss": 0.0988, "step": 14500 }, { "epoch": 0.22114108801415303, "grad_norm": 1.8521679639816284, "learning_rate": 1.8526414074401695e-05, "loss": 0.0951, "step": 15000 }, { "epoch": 0.2285124576146248, "grad_norm": 2.191715717315674, "learning_rate": 1.8477369895326553e-05, "loss": 0.1037, "step": 15500 }, { "epoch": 0.23588382721509657, "grad_norm": 0.10625209659337997, "learning_rate": 1.8428227431323408e-05, "loss": 0.093, "step": 16000 }, { "epoch": 0.24325519681556834, "grad_norm": 4.059609413146973, "learning_rate": 1.8379084967320263e-05, "loss": 0.0931, "step": 16500 }, { "epoch": 0.2506265664160401, "grad_norm": 0.341007798910141, "learning_rate": 1.8329942503317118e-05, "loss": 0.0877, "step": 17000 }, { "epoch": 0.2579979360165119, "grad_norm": 0.16973993182182312, "learning_rate": 1.8280800039313973e-05, "loss": 0.0981, "step": 17500 }, { "epoch": 0.26536930561698363, "grad_norm": 0.20512279868125916, "learning_rate": 1.8231657575310828e-05, "loss": 0.0944, "step": 18000 }, { "epoch": 0.27274067521745543, "grad_norm": 5.859679222106934, "learning_rate": 1.818251511130768e-05, "loss": 0.0952, "step": 18500 }, { "epoch": 0.2801120448179272, "grad_norm": 0.06838594377040863, "learning_rate": 1.8133470932232545e-05, "loss": 0.0891, "step": 19000 }, { "epoch": 0.2874834144183989, "grad_norm": 1.6491619348526, "learning_rate": 1.80843284682294e-05, "loss": 0.095, "step": 19500 }, { "epoch": 0.2948547840188707, "grad_norm": 0.21653395891189575, "learning_rate": 1.8035186004226252e-05, "loss": 0.0925, "step": 20000 }, { "epoch": 0.30222615361934246, "grad_norm": 0.1839127540588379, "learning_rate": 1.7986043540223107e-05, "loss": 0.0874, "step": 20500 }, { "epoch": 0.30959752321981426, 
"grad_norm": 1.7096320390701294, "learning_rate": 1.7936901076219962e-05, "loss": 0.0921, "step": 21000 }, { "epoch": 0.316968892820286, "grad_norm": 0.013913823291659355, "learning_rate": 1.788775861221682e-05, "loss": 0.0879, "step": 21500 }, { "epoch": 0.32434026242075775, "grad_norm": 2.2390196323394775, "learning_rate": 1.7838616148213672e-05, "loss": 0.0988, "step": 22000 }, { "epoch": 0.33171163202122955, "grad_norm": 1.1112462282180786, "learning_rate": 1.7789473684210527e-05, "loss": 0.0906, "step": 22500 }, { "epoch": 0.3390830016217013, "grad_norm": 2.3240630626678467, "learning_rate": 1.774042950513539e-05, "loss": 0.0919, "step": 23000 }, { "epoch": 0.3464543712221731, "grad_norm": 0.31203529238700867, "learning_rate": 1.7691287041132244e-05, "loss": 0.0886, "step": 23500 }, { "epoch": 0.35382574082264484, "grad_norm": 0.002407611347734928, "learning_rate": 1.76421445771291e-05, "loss": 0.0892, "step": 24000 }, { "epoch": 0.36119711042311664, "grad_norm": 0.23297803103923798, "learning_rate": 1.7593002113125954e-05, "loss": 0.0894, "step": 24500 }, { "epoch": 0.3685684800235884, "grad_norm": 0.5540401339530945, "learning_rate": 1.754385964912281e-05, "loss": 0.0905, "step": 25000 }, { "epoch": 0.37593984962406013, "grad_norm": 1.9130643606185913, "learning_rate": 1.749481547004767e-05, "loss": 0.0906, "step": 25500 }, { "epoch": 0.38331121922453193, "grad_norm": 0.23371170461177826, "learning_rate": 1.7445673006044523e-05, "loss": 0.0925, "step": 26000 }, { "epoch": 0.3906825888250037, "grad_norm": 0.16552847623825073, "learning_rate": 1.7396530542041378e-05, "loss": 0.088, "step": 26500 }, { "epoch": 0.3980539584254755, "grad_norm": 0.008411018177866936, "learning_rate": 1.7347388078038233e-05, "loss": 0.0946, "step": 27000 }, { "epoch": 0.4054253280259472, "grad_norm": 0.6356103420257568, "learning_rate": 1.729824561403509e-05, "loss": 0.0869, "step": 27500 }, { "epoch": 0.41279669762641896, "grad_norm": 0.8396435379981995, "learning_rate": 
1.724920143495995e-05, "loss": 0.0933, "step": 28000 }, { "epoch": 0.42016806722689076, "grad_norm": 5.201042652130127, "learning_rate": 1.7200157255884812e-05, "loss": 0.0878, "step": 28500 }, { "epoch": 0.4275394368273625, "grad_norm": 1.2198799848556519, "learning_rate": 1.7151014791881667e-05, "loss": 0.0837, "step": 29000 }, { "epoch": 0.4349108064278343, "grad_norm": 1.1107237339019775, "learning_rate": 1.7101872327878522e-05, "loss": 0.0898, "step": 29500 }, { "epoch": 0.44228217602830605, "grad_norm": 0.47166362404823303, "learning_rate": 1.705282814880338e-05, "loss": 0.083, "step": 30000 }, { "epoch": 0.44965354562877785, "grad_norm": 0.9816909432411194, "learning_rate": 1.7003685684800236e-05, "loss": 0.0861, "step": 30500 }, { "epoch": 0.4570249152292496, "grad_norm": 0.10324009507894516, "learning_rate": 1.6954543220797094e-05, "loss": 0.0942, "step": 31000 }, { "epoch": 0.46439628482972134, "grad_norm": 0.42705604434013367, "learning_rate": 1.6905400756793946e-05, "loss": 0.0826, "step": 31500 }, { "epoch": 0.47176765443019314, "grad_norm": 1.8074253797531128, "learning_rate": 1.68562582927908e-05, "loss": 0.0853, "step": 32000 }, { "epoch": 0.4791390240306649, "grad_norm": 1.1949777603149414, "learning_rate": 1.6807115828787656e-05, "loss": 0.0936, "step": 32500 }, { "epoch": 0.4865103936311367, "grad_norm": 1.8849105834960938, "learning_rate": 1.675797336478451e-05, "loss": 0.0849, "step": 33000 }, { "epoch": 0.49388176323160843, "grad_norm": 2.1948788166046143, "learning_rate": 1.670883090078137e-05, "loss": 0.083, "step": 33500 }, { "epoch": 0.5012531328320802, "grad_norm": 1.5681918859481812, "learning_rate": 1.665968843677822e-05, "loss": 0.0845, "step": 34000 }, { "epoch": 0.508624502432552, "grad_norm": 1.447178840637207, "learning_rate": 1.6610545972775076e-05, "loss": 0.0883, "step": 34500 }, { "epoch": 0.5159958720330238, "grad_norm": 0.678683876991272, "learning_rate": 1.6561501793699938e-05, "loss": 0.0901, "step": 35000 }, { "epoch": 
0.5233672416334955, "grad_norm": 1.585949420928955, "learning_rate": 1.6512359329696793e-05, "loss": 0.0893, "step": 35500 }, { "epoch": 0.5307386112339673, "grad_norm": 2.8461952209472656, "learning_rate": 1.6463216865693648e-05, "loss": 0.0846, "step": 36000 }, { "epoch": 0.5381099808344391, "grad_norm": 0.08873996883630753, "learning_rate": 1.6414074401690503e-05, "loss": 0.0838, "step": 36500 }, { "epoch": 0.5454813504349109, "grad_norm": 0.08909033238887787, "learning_rate": 1.6364931937687354e-05, "loss": 0.0835, "step": 37000 }, { "epoch": 0.5528527200353825, "grad_norm": 0.01679537631571293, "learning_rate": 1.6315789473684213e-05, "loss": 0.0826, "step": 37500 }, { "epoch": 0.5602240896358543, "grad_norm": 0.018643999472260475, "learning_rate": 1.6266647009681068e-05, "loss": 0.0891, "step": 38000 }, { "epoch": 0.5675954592363261, "grad_norm": 3.226288080215454, "learning_rate": 1.6217504545677923e-05, "loss": 0.0927, "step": 38500 }, { "epoch": 0.5749668288367978, "grad_norm": 6.410881042480469, "learning_rate": 1.6168362081674774e-05, "loss": 0.0826, "step": 39000 }, { "epoch": 0.5823381984372696, "grad_norm": 2.421131134033203, "learning_rate": 1.611921961767163e-05, "loss": 0.0854, "step": 39500 }, { "epoch": 0.5897095680377414, "grad_norm": 0.012708733789622784, "learning_rate": 1.6070077153668488e-05, "loss": 0.0841, "step": 40000 }, { "epoch": 0.5970809376382131, "grad_norm": 5.636229515075684, "learning_rate": 1.6021032974593347e-05, "loss": 0.0794, "step": 40500 }, { "epoch": 0.6044523072386849, "grad_norm": 1.866571307182312, "learning_rate": 1.59718905105902e-05, "loss": 0.0836, "step": 41000 }, { "epoch": 0.6118236768391567, "grad_norm": 1.0771315097808838, "learning_rate": 1.5922748046587057e-05, "loss": 0.0782, "step": 41500 }, { "epoch": 0.6191950464396285, "grad_norm": 0.09344267845153809, "learning_rate": 1.587360558258391e-05, "loss": 0.0891, "step": 42000 }, { "epoch": 0.6265664160401002, "grad_norm": 2.4186413288116455, "learning_rate": 
1.5824463118580767e-05, "loss": 0.0907, "step": 42500 }, { "epoch": 0.633937785640572, "grad_norm": 0.09242186695337296, "learning_rate": 1.577532065457762e-05, "loss": 0.081, "step": 43000 }, { "epoch": 0.6413091552410438, "grad_norm": 0.09285570681095123, "learning_rate": 1.572637476043049e-05, "loss": 0.0902, "step": 43500 }, { "epoch": 0.6486805248415155, "grad_norm": 1.1049730777740479, "learning_rate": 1.5677232296427346e-05, "loss": 0.081, "step": 44000 }, { "epoch": 0.6560518944419873, "grad_norm": 0.1485988050699234, "learning_rate": 1.5628089832424197e-05, "loss": 0.0834, "step": 44500 }, { "epoch": 0.6634232640424591, "grad_norm": 1.4170334339141846, "learning_rate": 1.5578947368421052e-05, "loss": 0.08, "step": 45000 }, { "epoch": 0.6707946336429309, "grad_norm": 2.51129150390625, "learning_rate": 1.5529903189345914e-05, "loss": 0.0881, "step": 45500 }, { "epoch": 0.6781660032434026, "grad_norm": 0.0491604208946228, "learning_rate": 1.548076072534277e-05, "loss": 0.0845, "step": 46000 }, { "epoch": 0.6855373728438744, "grad_norm": 0.09064287692308426, "learning_rate": 1.5431618261339624e-05, "loss": 0.0838, "step": 46500 }, { "epoch": 0.6929087424443462, "grad_norm": 1.6225173473358154, "learning_rate": 1.538247579733648e-05, "loss": 0.0806, "step": 47000 }, { "epoch": 0.7002801120448179, "grad_norm": 0.025229327380657196, "learning_rate": 1.5333333333333334e-05, "loss": 0.0852, "step": 47500 }, { "epoch": 0.7076514816452897, "grad_norm": 1.561880350112915, "learning_rate": 1.528419086933019e-05, "loss": 0.0819, "step": 48000 }, { "epoch": 0.7150228512457615, "grad_norm": 0.024792635813355446, "learning_rate": 1.5235048405327044e-05, "loss": 0.0844, "step": 48500 }, { "epoch": 0.7223942208462333, "grad_norm": 0.2325647473335266, "learning_rate": 1.51859059413239e-05, "loss": 0.0786, "step": 49000 }, { "epoch": 0.729765590446705, "grad_norm": 1.0401220321655273, "learning_rate": 1.5136763477320754e-05, "loss": 0.0753, "step": 49500 }, { "epoch": 
0.7371369600471768, "grad_norm": 2.6318838596343994, "learning_rate": 1.5087621013317608e-05, "loss": 0.0793, "step": 50000 }, { "epoch": 0.7445083296476486, "grad_norm": 0.15695439279079437, "learning_rate": 1.5038478549314463e-05, "loss": 0.085, "step": 50500 }, { "epoch": 0.7518796992481203, "grad_norm": 0.006814942229539156, "learning_rate": 1.498933608531132e-05, "loss": 0.0819, "step": 51000 }, { "epoch": 0.7592510688485921, "grad_norm": 2.0822718143463135, "learning_rate": 1.4940193621308174e-05, "loss": 0.0798, "step": 51500 }, { "epoch": 0.7666224384490639, "grad_norm": 0.06762377172708511, "learning_rate": 1.4891149442233035e-05, "loss": 0.0793, "step": 52000 }, { "epoch": 0.7739938080495357, "grad_norm": 1.627299189567566, "learning_rate": 1.484200697822989e-05, "loss": 0.08, "step": 52500 }, { "epoch": 0.7813651776500073, "grad_norm": 0.8819578289985657, "learning_rate": 1.4792864514226743e-05, "loss": 0.0897, "step": 53000 }, { "epoch": 0.7887365472504791, "grad_norm": 3.7201988697052, "learning_rate": 1.4743722050223598e-05, "loss": 0.0787, "step": 53500 }, { "epoch": 0.796107916850951, "grad_norm": 2.0705556869506836, "learning_rate": 1.4694677871148462e-05, "loss": 0.0897, "step": 54000 }, { "epoch": 0.8034792864514226, "grad_norm": 0.08984575420618057, "learning_rate": 1.4645535407145315e-05, "loss": 0.0875, "step": 54500 }, { "epoch": 0.8108506560518944, "grad_norm": 0.5264925956726074, "learning_rate": 1.459639294314217e-05, "loss": 0.081, "step": 55000 }, { "epoch": 0.8182220256523662, "grad_norm": 0.7385400533676147, "learning_rate": 1.4547250479139025e-05, "loss": 0.0804, "step": 55500 }, { "epoch": 0.8255933952528379, "grad_norm": 0.04134887456893921, "learning_rate": 1.449810801513588e-05, "loss": 0.0797, "step": 56000 }, { "epoch": 0.8329647648533097, "grad_norm": 0.03769136965274811, "learning_rate": 1.4448965551132734e-05, "loss": 0.084, "step": 56500 }, { "epoch": 0.8403361344537815, "grad_norm": 0.2623615264892578, "learning_rate": 
1.4399921372057597e-05, "loss": 0.0821, "step": 57000 }, { "epoch": 0.8477075040542533, "grad_norm": 2.0235373973846436, "learning_rate": 1.435077890805445e-05, "loss": 0.0806, "step": 57500 }, { "epoch": 0.855078873654725, "grad_norm": 0.32753029465675354, "learning_rate": 1.4301636444051306e-05, "loss": 0.0828, "step": 58000 }, { "epoch": 0.8624502432551968, "grad_norm": 2.255500316619873, "learning_rate": 1.425249398004816e-05, "loss": 0.0804, "step": 58500 }, { "epoch": 0.8698216128556686, "grad_norm": 2.8162291049957275, "learning_rate": 1.4203449800973021e-05, "loss": 0.0827, "step": 59000 }, { "epoch": 0.8771929824561403, "grad_norm": 0.41316208243370056, "learning_rate": 1.4154307336969876e-05, "loss": 0.0753, "step": 59500 }, { "epoch": 0.8845643520566121, "grad_norm": 1.9982844591140747, "learning_rate": 1.4105164872966733e-05, "loss": 0.0854, "step": 60000 }, { "epoch": 0.8919357216570839, "grad_norm": 1.8432923555374146, "learning_rate": 1.4056022408963586e-05, "loss": 0.077, "step": 60500 }, { "epoch": 0.8993070912575557, "grad_norm": 3.33919620513916, "learning_rate": 1.4006879944960441e-05, "loss": 0.082, "step": 61000 }, { "epoch": 0.9066784608580274, "grad_norm": 3.227517604827881, "learning_rate": 1.3957737480957296e-05, "loss": 0.0785, "step": 61500 }, { "epoch": 0.9140498304584992, "grad_norm": 0.014868408441543579, "learning_rate": 1.3908595016954151e-05, "loss": 0.0741, "step": 62000 }, { "epoch": 0.921421200058971, "grad_norm": 0.06270582973957062, "learning_rate": 1.3859452552951008e-05, "loss": 0.0786, "step": 62500 }, { "epoch": 0.9287925696594427, "grad_norm": 0.06437293440103531, "learning_rate": 1.3810310088947861e-05, "loss": 0.076, "step": 63000 }, { "epoch": 0.9361639392599145, "grad_norm": 0.9199370741844177, "learning_rate": 1.3761167624944716e-05, "loss": 0.0815, "step": 63500 }, { "epoch": 0.9435353088603863, "grad_norm": 0.020321089774370193, "learning_rate": 1.3712123445869577e-05, "loss": 0.0837, "step": 64000 }, { "epoch": 
0.9509066784608581, "grad_norm": 2.2705533504486084, "learning_rate": 1.3662980981866432e-05, "loss": 0.0871, "step": 64500 }, { "epoch": 0.9582780480613298, "grad_norm": 0.027517901733517647, "learning_rate": 1.3613838517863287e-05, "loss": 0.0755, "step": 65000 }, { "epoch": 0.9656494176618016, "grad_norm": 1.307394027709961, "learning_rate": 1.3564696053860143e-05, "loss": 0.0764, "step": 65500 }, { "epoch": 0.9730207872622734, "grad_norm": 2.4579734802246094, "learning_rate": 1.3515553589856995e-05, "loss": 0.0782, "step": 66000 }, { "epoch": 0.9803921568627451, "grad_norm": 13.52622127532959, "learning_rate": 1.3466411125853852e-05, "loss": 0.0704, "step": 66500 }, { "epoch": 0.9877635264632169, "grad_norm": 0.32894331216812134, "learning_rate": 1.3417268661850707e-05, "loss": 0.0838, "step": 67000 }, { "epoch": 0.9951348960636887, "grad_norm": 0.08820515871047974, "learning_rate": 1.3368126197847562e-05, "loss": 0.0792, "step": 67500 }, { "epoch": 1.0, "eval_accuracy": 0.6063492063492063, "eval_f1": 0.6658804318243672, "eval_loss": 0.07707133144140244, "eval_roc_auc": 0.8010158538939403, "eval_runtime": 92.2448, "eval_samples_per_second": 64.882, "eval_steps_per_second": 64.882, "step": 67830 }, { "epoch": 1.0025062656641603, "grad_norm": 1.2780104875564575, "learning_rate": 1.3318983733844415e-05, "loss": 0.0722, "step": 68000 }, { "epoch": 1.0098776352646321, "grad_norm": 0.016919715330004692, "learning_rate": 1.3269939554769277e-05, "loss": 0.0544, "step": 68500 }, { "epoch": 1.017249004865104, "grad_norm": 0.0008094881195574999, "learning_rate": 1.3220797090766134e-05, "loss": 0.0589, "step": 69000 }, { "epoch": 1.0246203744655757, "grad_norm": 0.06712741404771805, "learning_rate": 1.3171752911690994e-05, "loss": 0.0692, "step": 69500 }, { "epoch": 1.0319917440660475, "grad_norm": 1.4024405479431152, "learning_rate": 1.3122610447687849e-05, "loss": 0.0577, "step": 70000 }, { "epoch": 1.0393631136665193, "grad_norm": 3.811220407485962, "learning_rate": 
1.3073467983684702e-05, "loss": 0.0598, "step": 70500 }, { "epoch": 1.046734483266991, "grad_norm": 0.061250410974025726, "learning_rate": 1.3024325519681557e-05, "loss": 0.064, "step": 71000 }, { "epoch": 1.0541058528674627, "grad_norm": 2.3042991161346436, "learning_rate": 1.2975183055678412e-05, "loss": 0.0646, "step": 71500 }, { "epoch": 1.0614772224679345, "grad_norm": 0.32951870560646057, "learning_rate": 1.2926040591675269e-05, "loss": 0.0668, "step": 72000 }, { "epoch": 1.0688485920684063, "grad_norm": 0.013675130903720856, "learning_rate": 1.2876898127672122e-05, "loss": 0.0576, "step": 72500 }, { "epoch": 1.0762199616688781, "grad_norm": 2.3298966884613037, "learning_rate": 1.2827755663668977e-05, "loss": 0.0583, "step": 73000 }, { "epoch": 1.08359133126935, "grad_norm": 0.04673844203352928, "learning_rate": 1.2778809769521845e-05, "loss": 0.0697, "step": 73500 }, { "epoch": 1.0909627008698215, "grad_norm": 1.1629608869552612, "learning_rate": 1.27296673055187e-05, "loss": 0.0621, "step": 74000 }, { "epoch": 1.0983340704702933, "grad_norm": 0.06381271779537201, "learning_rate": 1.2680524841515553e-05, "loss": 0.0629, "step": 74500 }, { "epoch": 1.105705440070765, "grad_norm": 0.00508810393512249, "learning_rate": 1.2631382377512408e-05, "loss": 0.065, "step": 75000 }, { "epoch": 1.113076809671237, "grad_norm": 4.200405597686768, "learning_rate": 1.2582239913509265e-05, "loss": 0.0704, "step": 75500 }, { "epoch": 1.1204481792717087, "grad_norm": 0.18736723065376282, "learning_rate": 1.253309744950612e-05, "loss": 0.0683, "step": 76000 }, { "epoch": 1.1278195488721805, "grad_norm": 0.09223194420337677, "learning_rate": 1.2483954985502975e-05, "loss": 0.0557, "step": 76500 }, { "epoch": 1.1351909184726523, "grad_norm": 5.287250518798828, "learning_rate": 1.2434812521499828e-05, "loss": 0.0643, "step": 77000 }, { "epoch": 1.1425622880731239, "grad_norm": 1.283521294593811, "learning_rate": 1.2385670057496683e-05, "loss": 0.0584, "step": 77500 }, { "epoch": 
1.1499336576735957, "grad_norm": 3.34344220161438, "learning_rate": 1.233652759349354e-05, "loss": 0.0673, "step": 78000 }, { "epoch": 1.1573050272740675, "grad_norm": 0.23046046495437622, "learning_rate": 1.2287385129490395e-05, "loss": 0.0605, "step": 78500 }, { "epoch": 1.1646763968745393, "grad_norm": 0.0487230159342289, "learning_rate": 1.2238242665487248e-05, "loss": 0.0672, "step": 79000 }, { "epoch": 1.172047766475011, "grad_norm": 0.0587400384247303, "learning_rate": 1.2189100201484103e-05, "loss": 0.0635, "step": 79500 }, { "epoch": 1.1794191360754829, "grad_norm": 0.3049776256084442, "learning_rate": 1.2140056022408964e-05, "loss": 0.0587, "step": 80000 }, { "epoch": 1.1867905056759547, "grad_norm": 0.5761535167694092, "learning_rate": 1.2091011843333826e-05, "loss": 0.0706, "step": 80500 }, { "epoch": 1.1941618752764263, "grad_norm": 2.524258852005005, "learning_rate": 1.2041967664258686e-05, "loss": 0.0607, "step": 81000 }, { "epoch": 1.201533244876898, "grad_norm": 0.026634838432073593, "learning_rate": 1.1992825200255543e-05, "loss": 0.0581, "step": 81500 }, { "epoch": 1.2089046144773699, "grad_norm": 0.39337214827537537, "learning_rate": 1.1943682736252398e-05, "loss": 0.0652, "step": 82000 }, { "epoch": 1.2162759840778417, "grad_norm": 1.8906174898147583, "learning_rate": 1.1894540272249251e-05, "loss": 0.0659, "step": 82500 }, { "epoch": 1.2236473536783135, "grad_norm": 0.011290138587355614, "learning_rate": 1.1845397808246106e-05, "loss": 0.0615, "step": 83000 }, { "epoch": 1.2310187232787853, "grad_norm": 0.5536847114562988, "learning_rate": 1.1796255344242961e-05, "loss": 0.0666, "step": 83500 }, { "epoch": 1.238390092879257, "grad_norm": 0.0035450158175081015, "learning_rate": 1.1747112880239818e-05, "loss": 0.0597, "step": 84000 }, { "epoch": 1.2457614624797286, "grad_norm": 0.005579414777457714, "learning_rate": 1.1697970416236671e-05, "loss": 0.0545, "step": 84500 }, { "epoch": 1.2531328320802004, "grad_norm": 0.10251569747924805, 
"learning_rate": 1.1648926237161533e-05, "loss": 0.0652, "step": 85000 }, { "epoch": 1.2605042016806722, "grad_norm": 3.2004494667053223, "learning_rate": 1.1599783773158386e-05, "loss": 0.0546, "step": 85500 }, { "epoch": 1.267875571281144, "grad_norm": 1.5647473335266113, "learning_rate": 1.1550641309155241e-05, "loss": 0.0661, "step": 86000 }, { "epoch": 1.2752469408816158, "grad_norm": 2.5646321773529053, "learning_rate": 1.1501498845152096e-05, "loss": 0.0616, "step": 86500 }, { "epoch": 1.2826183104820876, "grad_norm": 0.008838827721774578, "learning_rate": 1.1452356381148953e-05, "loss": 0.0643, "step": 87000 }, { "epoch": 1.2899896800825594, "grad_norm": 0.27586570382118225, "learning_rate": 1.1403213917145805e-05, "loss": 0.0651, "step": 87500 }, { "epoch": 1.297361049683031, "grad_norm": 2.2683589458465576, "learning_rate": 1.1354071453142661e-05, "loss": 0.0613, "step": 88000 }, { "epoch": 1.3047324192835028, "grad_norm": 0.0017950567416846752, "learning_rate": 1.1305027274067524e-05, "loss": 0.0616, "step": 88500 }, { "epoch": 1.3121037888839746, "grad_norm": 0.03913048282265663, "learning_rate": 1.1255884810064377e-05, "loss": 0.0627, "step": 89000 }, { "epoch": 1.3194751584844464, "grad_norm": 5.085097312927246, "learning_rate": 1.1206742346061232e-05, "loss": 0.0648, "step": 89500 }, { "epoch": 1.3268465280849182, "grad_norm": 0.04779289662837982, "learning_rate": 1.1157599882058089e-05, "loss": 0.0666, "step": 90000 }, { "epoch": 1.33421789768539, "grad_norm": 0.01123060006648302, "learning_rate": 1.1108457418054944e-05, "loss": 0.0618, "step": 90500 }, { "epoch": 1.3415892672858618, "grad_norm": 1.5869191884994507, "learning_rate": 1.1059314954051797e-05, "loss": 0.066, "step": 91000 }, { "epoch": 1.3489606368863334, "grad_norm": 0.00517408037558198, "learning_rate": 1.1010172490048652e-05, "loss": 0.062, "step": 91500 }, { "epoch": 1.3563320064868052, "grad_norm": 0.09691867977380753, "learning_rate": 1.0961030026045507e-05, "loss": 0.0606, 
"step": 92000 }, { "epoch": 1.363703376087277, "grad_norm": 3.3921549320220947, "learning_rate": 1.0911887562042362e-05, "loss": 0.0643, "step": 92500 }, { "epoch": 1.3710747456877488, "grad_norm": 2.874007225036621, "learning_rate": 1.0862843382967222e-05, "loss": 0.0622, "step": 93000 }, { "epoch": 1.3784461152882206, "grad_norm": 0.03864584490656853, "learning_rate": 1.0813897488820091e-05, "loss": 0.0582, "step": 93500 }, { "epoch": 1.3858174848886924, "grad_norm": 0.12044321745634079, "learning_rate": 1.0764755024816945e-05, "loss": 0.0672, "step": 94000 }, { "epoch": 1.3931888544891642, "grad_norm": 3.673576593399048, "learning_rate": 1.07156125608138e-05, "loss": 0.0619, "step": 94500 }, { "epoch": 1.4005602240896358, "grad_norm": 0.023468611761927605, "learning_rate": 1.0666470096810655e-05, "loss": 0.0567, "step": 95000 }, { "epoch": 1.4079315936901076, "grad_norm": 1.5474720001220703, "learning_rate": 1.061732763280751e-05, "loss": 0.057, "step": 95500 }, { "epoch": 1.4153029632905794, "grad_norm": 0.4061996340751648, "learning_rate": 1.0568185168804366e-05, "loss": 0.0551, "step": 96000 }, { "epoch": 1.4226743328910512, "grad_norm": 1.828468918800354, "learning_rate": 1.0519042704801218e-05, "loss": 0.0642, "step": 96500 }, { "epoch": 1.430045702491523, "grad_norm": 0.006184196099638939, "learning_rate": 1.0469900240798075e-05, "loss": 0.0562, "step": 97000 }, { "epoch": 1.4374170720919948, "grad_norm": 0.8913156390190125, "learning_rate": 1.0420856061722935e-05, "loss": 0.0626, "step": 97500 }, { "epoch": 1.4447884416924666, "grad_norm": 0.08904910832643509, "learning_rate": 1.037171359771979e-05, "loss": 0.0558, "step": 98000 }, { "epoch": 1.4521598112929381, "grad_norm": 0.1683080941438675, "learning_rate": 1.0322571133716645e-05, "loss": 0.0644, "step": 98500 }, { "epoch": 1.45953118089341, "grad_norm": 0.4399701654911041, "learning_rate": 1.0273428669713502e-05, "loss": 0.0608, "step": 99000 }, { "epoch": 1.4669025504938817, "grad_norm": 
9.481819152832031, "learning_rate": 1.0224286205710354e-05, "loss": 0.0597, "step": 99500 }, { "epoch": 1.4742739200943535, "grad_norm": 0.10929368436336517, "learning_rate": 1.017514374170721e-05, "loss": 0.0616, "step": 100000 }, { "epoch": 1.4816452896948253, "grad_norm": 0.0035255183465778828, "learning_rate": 1.0126001277704065e-05, "loss": 0.0593, "step": 100500 }, { "epoch": 1.4890166592952971, "grad_norm": 0.07139890640974045, "learning_rate": 1.007685881370092e-05, "loss": 0.0635, "step": 101000 }, { "epoch": 1.496388028895769, "grad_norm": 3.2497317790985107, "learning_rate": 1.0027716349697774e-05, "loss": 0.055, "step": 101500 }, { "epoch": 1.5037593984962405, "grad_norm": 0.16377945244312286, "learning_rate": 9.978573885694629e-06, "loss": 0.0602, "step": 102000 }, { "epoch": 1.5111307680967123, "grad_norm": 0.29184427857398987, "learning_rate": 9.929431421691485e-06, "loss": 0.0596, "step": 102500 }, { "epoch": 1.5185021376971841, "grad_norm": 0.14543047547340393, "learning_rate": 9.88028895768834e-06, "loss": 0.0593, "step": 103000 }, { "epoch": 1.525873507297656, "grad_norm": 4.776684284210205, "learning_rate": 9.831146493685194e-06, "loss": 0.0604, "step": 103500 }, { "epoch": 1.5332448768981277, "grad_norm": 3.4175798892974854, "learning_rate": 9.782102314610056e-06, "loss": 0.0622, "step": 104000 }, { "epoch": 1.5406162464985993, "grad_norm": 5.478698253631592, "learning_rate": 9.73295985060691e-06, "loss": 0.0582, "step": 104500 }, { "epoch": 1.5479876160990713, "grad_norm": 0.09877605736255646, "learning_rate": 9.683817386603766e-06, "loss": 0.0644, "step": 105000 }, { "epoch": 1.555358985699543, "grad_norm": 3.169551134109497, "learning_rate": 9.634674922600619e-06, "loss": 0.0672, "step": 105500 }, { "epoch": 1.5627303553000147, "grad_norm": 0.0030992806423455477, "learning_rate": 9.585532458597476e-06, "loss": 0.0654, "step": 106000 }, { "epoch": 1.5701017249004865, "grad_norm": 1.8882814645767212, "learning_rate": 9.536488279522336e-06, 
"loss": 0.0643, "step": 106500 }, { "epoch": 1.5774730945009583, "grad_norm": 0.02398967184126377, "learning_rate": 9.487345815519191e-06, "loss": 0.0677, "step": 107000 }, { "epoch": 1.58484446410143, "grad_norm": 0.010672827251255512, "learning_rate": 9.438203351516046e-06, "loss": 0.0637, "step": 107500 }, { "epoch": 1.5922158337019017, "grad_norm": 0.018269941210746765, "learning_rate": 9.389159172440906e-06, "loss": 0.0624, "step": 108000 }, { "epoch": 1.5995872033023737, "grad_norm": 1.7238303422927856, "learning_rate": 9.340016708437761e-06, "loss": 0.0595, "step": 108500 }, { "epoch": 1.6069585729028453, "grad_norm": 1.6856399774551392, "learning_rate": 9.290874244434616e-06, "loss": 0.0572, "step": 109000 }, { "epoch": 1.614329942503317, "grad_norm": 0.08445548266172409, "learning_rate": 9.241731780431471e-06, "loss": 0.0617, "step": 109500 }, { "epoch": 1.6217013121037889, "grad_norm": 2.7674472332000732, "learning_rate": 9.192589316428326e-06, "loss": 0.061, "step": 110000 }, { "epoch": 1.6290726817042607, "grad_norm": 6.365856647491455, "learning_rate": 9.143446852425181e-06, "loss": 0.0548, "step": 110500 }, { "epoch": 1.6364440513047325, "grad_norm": 1.224268913269043, "learning_rate": 9.094304388422036e-06, "loss": 0.0621, "step": 111000 }, { "epoch": 1.643815420905204, "grad_norm": 0.021649343892931938, "learning_rate": 9.045161924418891e-06, "loss": 0.0659, "step": 111500 }, { "epoch": 1.651186790505676, "grad_norm": 0.012330977246165276, "learning_rate": 8.996117745343752e-06, "loss": 0.0602, "step": 112000 }, { "epoch": 1.6585581601061476, "grad_norm": 0.6574206948280334, "learning_rate": 8.946975281340607e-06, "loss": 0.0617, "step": 112500 }, { "epoch": 1.6659295297066194, "grad_norm": 0.018540961667895317, "learning_rate": 8.897832817337462e-06, "loss": 0.0602, "step": 113000 }, { "epoch": 1.6733008993070912, "grad_norm": 5.6425557136535645, "learning_rate": 8.848690353334317e-06, "loss": 0.0597, "step": 113500 }, { "epoch": 1.680672268907563, 
"grad_norm": 2.435633420944214, "learning_rate": 8.799646174259177e-06, "loss": 0.0624, "step": 114000 }, { "epoch": 1.6880436385080348, "grad_norm": 1.5483721494674683, "learning_rate": 8.750503710256032e-06, "loss": 0.0614, "step": 114500 }, { "epoch": 1.6954150081085064, "grad_norm": 0.06437569856643677, "learning_rate": 8.701361246252887e-06, "loss": 0.0594, "step": 115000 }, { "epoch": 1.7027863777089784, "grad_norm": 0.29250073432922363, "learning_rate": 8.652218782249742e-06, "loss": 0.0559, "step": 115500 }, { "epoch": 1.71015774730945, "grad_norm": 0.5888819098472595, "learning_rate": 8.603076318246597e-06, "loss": 0.0659, "step": 116000 }, { "epoch": 1.7175291169099218, "grad_norm": 0.2926543354988098, "learning_rate": 8.55403213917146e-06, "loss": 0.063, "step": 116500 }, { "epoch": 1.7249004865103936, "grad_norm": 2.316805601119995, "learning_rate": 8.50498796009632e-06, "loss": 0.0544, "step": 117000 }, { "epoch": 1.7322718561108654, "grad_norm": 0.018379326909780502, "learning_rate": 8.455845496093175e-06, "loss": 0.0663, "step": 117500 }, { "epoch": 1.7396432257113372, "grad_norm": 0.014781077392399311, "learning_rate": 8.40670303209003e-06, "loss": 0.0573, "step": 118000 }, { "epoch": 1.7470145953118088, "grad_norm": 2.3919591903686523, "learning_rate": 8.357560568086885e-06, "loss": 0.0592, "step": 118500 }, { "epoch": 1.7543859649122808, "grad_norm": 2.7669119834899902, "learning_rate": 8.30841810408374e-06, "loss": 0.0644, "step": 119000 }, { "epoch": 1.7617573345127524, "grad_norm": 3.5739755630493164, "learning_rate": 8.259275640080595e-06, "loss": 0.0572, "step": 119500 }, { "epoch": 1.7691287041132242, "grad_norm": 0.4299847483634949, "learning_rate": 8.21013317607745e-06, "loss": 0.0664, "step": 120000 }, { "epoch": 1.776500073713696, "grad_norm": 0.9990677833557129, "learning_rate": 8.16108899700231e-06, "loss": 0.0638, "step": 120500 }, { "epoch": 1.7838714433141678, "grad_norm": 2.1424782276153564, "learning_rate": 8.111946532999165e-06, 
"loss": 0.0572, "step": 121000 }, { "epoch": 1.7912428129146396, "grad_norm": 4.301726341247559, "learning_rate": 8.06280406899602e-06, "loss": 0.0596, "step": 121500 }, { "epoch": 1.7986141825151112, "grad_norm": 8.399239540100098, "learning_rate": 8.013661604992875e-06, "loss": 0.0667, "step": 122000 }, { "epoch": 1.8059855521155832, "grad_norm": 0.00977667048573494, "learning_rate": 7.96451914098973e-06, "loss": 0.0596, "step": 122500 }, { "epoch": 1.8133569217160548, "grad_norm": 0.019315605983138084, "learning_rate": 7.915376676986585e-06, "loss": 0.0607, "step": 123000 }, { "epoch": 1.8207282913165266, "grad_norm": 0.011183898895978928, "learning_rate": 7.866332497911446e-06, "loss": 0.0656, "step": 123500 }, { "epoch": 1.8280996609169984, "grad_norm": 0.1689341515302658, "learning_rate": 7.8171900339083e-06, "loss": 0.0571, "step": 124000 }, { "epoch": 1.8354710305174702, "grad_norm": 7.370288848876953, "learning_rate": 7.768047569905156e-06, "loss": 0.0627, "step": 124500 }, { "epoch": 1.842842400117942, "grad_norm": 0.058736398816108704, "learning_rate": 7.719003390830016e-06, "loss": 0.0511, "step": 125000 }, { "epoch": 1.8502137697184136, "grad_norm": 0.014665275812149048, "learning_rate": 7.669860926826873e-06, "loss": 0.0624, "step": 125500 }, { "epoch": 1.8575851393188856, "grad_norm": 0.572428286075592, "learning_rate": 7.620718462823726e-06, "loss": 0.0607, "step": 126000 }, { "epoch": 1.8649565089193572, "grad_norm": 0.4777454733848572, "learning_rate": 7.571575998820582e-06, "loss": 0.0606, "step": 126500 }, { "epoch": 1.872327878519829, "grad_norm": 0.020159974694252014, "learning_rate": 7.522433534817436e-06, "loss": 0.0539, "step": 127000 }, { "epoch": 1.8796992481203008, "grad_norm": 0.010219153016805649, "learning_rate": 7.473291070814291e-06, "loss": 0.0593, "step": 127500 }, { "epoch": 1.8870706177207726, "grad_norm": 1.854982614517212, "learning_rate": 7.424148606811146e-06, "loss": 0.0636, "step": 128000 }, { "epoch": 1.8944419873212444, 
"grad_norm": 0.6811599135398865, "learning_rate": 7.375006142808001e-06, "loss": 0.0598, "step": 128500 }, { "epoch": 1.901813356921716, "grad_norm": 0.2864709496498108, "learning_rate": 7.325863678804855e-06, "loss": 0.0604, "step": 129000 }, { "epoch": 1.909184726522188, "grad_norm": 0.030090967193245888, "learning_rate": 7.276721214801711e-06, "loss": 0.0602, "step": 129500 }, { "epoch": 1.9165560961226595, "grad_norm": 4.568465232849121, "learning_rate": 7.227578750798565e-06, "loss": 0.0571, "step": 130000 }, { "epoch": 1.9239274657231313, "grad_norm": 2.8999075889587402, "learning_rate": 7.17843628679542e-06, "loss": 0.0556, "step": 130500 }, { "epoch": 1.9312988353236031, "grad_norm": 2.021425485610962, "learning_rate": 7.129392107720282e-06, "loss": 0.0593, "step": 131000 }, { "epoch": 1.938670204924075, "grad_norm": 5.252723217010498, "learning_rate": 7.080249643717136e-06, "loss": 0.054, "step": 131500 }, { "epoch": 1.9460415745245467, "grad_norm": 8.669822692871094, "learning_rate": 7.031107179713991e-06, "loss": 0.0622, "step": 132000 }, { "epoch": 1.9534129441250183, "grad_norm": 2.219619035720825, "learning_rate": 6.981964715710846e-06, "loss": 0.0656, "step": 132500 }, { "epoch": 1.9607843137254903, "grad_norm": 0.023053865879774094, "learning_rate": 6.932822251707701e-06, "loss": 0.0628, "step": 133000 }, { "epoch": 1.968155683325962, "grad_norm": 0.3438442647457123, "learning_rate": 6.8836797877045555e-06, "loss": 0.0593, "step": 133500 }, { "epoch": 1.9755270529264337, "grad_norm": 0.0067783379927277565, "learning_rate": 6.834537323701411e-06, "loss": 0.0639, "step": 134000 }, { "epoch": 1.9828984225269055, "grad_norm": 1.7491209506988525, "learning_rate": 6.7853948596982655e-06, "loss": 0.0518, "step": 134500 }, { "epoch": 1.9902697921273773, "grad_norm": 0.0214830469340086, "learning_rate": 6.736448965551133e-06, "loss": 0.0499, "step": 135000 }, { "epoch": 1.9976411617278491, "grad_norm": 2.5241074562072754, "learning_rate": 
6.687306501547988e-06, "loss": 0.0649, "step": 135500 }, { "epoch": 2.0, "eval_accuracy": 0.6497911445279866, "eval_f1": 0.6859684311502573, "eval_loss": 0.08295563608407974, "eval_roc_auc": 0.8222563704452553, "eval_runtime": 89.7963, "eval_samples_per_second": 66.651, "eval_steps_per_second": 66.651, "step": 135660 }, { "epoch": 2.0050125313283207, "grad_norm": 3.550609588623047, "learning_rate": 6.638164037544843e-06, "loss": 0.0348, "step": 136000 }, { "epoch": 2.0123839009287927, "grad_norm": 0.014567219652235508, "learning_rate": 6.589021573541698e-06, "loss": 0.034, "step": 136500 }, { "epoch": 2.0197552705292643, "grad_norm": 4.3217291831970215, "learning_rate": 6.539977394466559e-06, "loss": 0.0346, "step": 137000 }, { "epoch": 2.0271266401297363, "grad_norm": 0.12837082147598267, "learning_rate": 6.490834930463414e-06, "loss": 0.0402, "step": 137500 }, { "epoch": 2.034498009730208, "grad_norm": 0.08076170086860657, "learning_rate": 6.4416924664602685e-06, "loss": 0.0269, "step": 138000 }, { "epoch": 2.0418693793306795, "grad_norm": 0.004311998374760151, "learning_rate": 6.392550002457124e-06, "loss": 0.0273, "step": 138500 }, { "epoch": 2.0492407489311515, "grad_norm": 0.0007999803638085723, "learning_rate": 6.3434075384539785e-06, "loss": 0.0263, "step": 139000 }, { "epoch": 2.056612118531623, "grad_norm": 0.004331584554165602, "learning_rate": 6.294461644306846e-06, "loss": 0.0301, "step": 139500 }, { "epoch": 2.063983488132095, "grad_norm": 0.013063879683613777, "learning_rate": 6.245417465231706e-06, "loss": 0.03, "step": 140000 }, { "epoch": 2.0713548577325667, "grad_norm": 0.01823696680366993, "learning_rate": 6.196275001228562e-06, "loss": 0.0343, "step": 140500 }, { "epoch": 2.0787262273330387, "grad_norm": 7.280787944793701, "learning_rate": 6.147132537225416e-06, "loss": 0.0261, "step": 141000 }, { "epoch": 2.0860975969335103, "grad_norm": 0.0030333856120705605, "learning_rate": 6.097990073222272e-06, "loss": 0.0298, "step": 141500 }, { "epoch": 
2.093468966533982, "grad_norm": null, "learning_rate": 6.0489458941471335e-06, "loss": 0.0241, "step": 142000 }, { "epoch": 2.100840336134454, "grad_norm": 46.87717056274414, "learning_rate": 5.999803430143988e-06, "loss": 0.0352, "step": 142500 }, { "epoch": 2.1082117057349254, "grad_norm": 0.008375998586416245, "learning_rate": 5.9506609661408435e-06, "loss": 0.0283, "step": 143000 }, { "epoch": 2.1155830753353975, "grad_norm": 0.0009784572757780552, "learning_rate": 5.901518502137698e-06, "loss": 0.0271, "step": 143500 }, { "epoch": 2.122954444935869, "grad_norm": 0.0016769421054050326, "learning_rate": 5.852376038134553e-06, "loss": 0.0233, "step": 144000 }, { "epoch": 2.1303258145363406, "grad_norm": 0.006961928680539131, "learning_rate": 5.803233574131407e-06, "loss": 0.0294, "step": 144500 }, { "epoch": 2.1376971841368126, "grad_norm": 0.005140836350619793, "learning_rate": 5.754091110128263e-06, "loss": 0.039, "step": 145000 }, { "epoch": 2.1450685537372842, "grad_norm": 0.001053107320331037, "learning_rate": 5.704948646125117e-06, "loss": 0.0309, "step": 145500 }, { "epoch": 2.1524399233377562, "grad_norm": 0.0007632673368789256, "learning_rate": 5.655806182121973e-06, "loss": 0.0363, "step": 146000 }, { "epoch": 2.159811292938228, "grad_norm": 0.4370366036891937, "learning_rate": 5.606762003046833e-06, "loss": 0.0317, "step": 146500 }, { "epoch": 2.1671826625387, "grad_norm": 0.00013254112855065614, "learning_rate": 5.557619539043688e-06, "loss": 0.0284, "step": 147000 }, { "epoch": 2.1745540321391714, "grad_norm": 6.598239421844482, "learning_rate": 5.508477075040542e-06, "loss": 0.0269, "step": 147500 }, { "epoch": 2.181925401739643, "grad_norm": 0.021487107500433922, "learning_rate": 5.459334611037398e-06, "loss": 0.0382, "step": 148000 }, { "epoch": 2.189296771340115, "grad_norm": 11.90185546875, "learning_rate": 5.410192147034252e-06, "loss": 0.0372, "step": 148500 }, { "epoch": 2.1966681409405866, "grad_norm": 0.0010807571234181523, 
"learning_rate": 5.361049683031107e-06, "loss": 0.0294, "step": 149000 }, { "epoch": 2.2040395105410586, "grad_norm": 0.024999860674142838, "learning_rate": 5.311907219027963e-06, "loss": 0.031, "step": 149500 }, { "epoch": 2.21141088014153, "grad_norm": 0.011472758837044239, "learning_rate": 5.2628630399528235e-06, "loss": 0.0253, "step": 150000 }, { "epoch": 2.218782249742002, "grad_norm": 0.0003523350169416517, "learning_rate": 5.213720575949679e-06, "loss": 0.0312, "step": 150500 }, { "epoch": 2.226153619342474, "grad_norm": 0.00018138001905754209, "learning_rate": 5.1645781119465335e-06, "loss": 0.0306, "step": 151000 }, { "epoch": 2.2335249889429454, "grad_norm": 27.588563919067383, "learning_rate": 5.1154356479433885e-06, "loss": 0.0257, "step": 151500 }, { "epoch": 2.2408963585434174, "grad_norm": 0.0001788044028216973, "learning_rate": 5.066293183940243e-06, "loss": 0.0364, "step": 152000 }, { "epoch": 2.248267728143889, "grad_norm": 0.025098439306020737, "learning_rate": 5.017347289793111e-06, "loss": 0.033, "step": 152500 }, { "epoch": 2.255639097744361, "grad_norm": 0.028794238343834877, "learning_rate": 4.968204825789966e-06, "loss": 0.0268, "step": 153000 }, { "epoch": 2.2630104673448326, "grad_norm": 0.0443144217133522, "learning_rate": 4.91906236178682e-06, "loss": 0.0277, "step": 153500 }, { "epoch": 2.2703818369453046, "grad_norm": 5.745356559753418, "learning_rate": 4.869919897783675e-06, "loss": 0.0319, "step": 154000 }, { "epoch": 2.277753206545776, "grad_norm": 0.01881660334765911, "learning_rate": 4.82077743378053e-06, "loss": 0.0321, "step": 154500 }, { "epoch": 2.2851245761462478, "grad_norm": 0.0014673862606287003, "learning_rate": 4.771634969777385e-06, "loss": 0.0325, "step": 155000 }, { "epoch": 2.2924959457467198, "grad_norm": 0.001017951057292521, "learning_rate": 4.72249250577424e-06, "loss": 0.0369, "step": 155500 }, { "epoch": 2.2998673153471914, "grad_norm": 0.0014965501613914967, "learning_rate": 4.673350041771095e-06, "loss": 
0.0349, "step": 156000 }, { "epoch": 2.3072386849476634, "grad_norm": 0.0005304542719386518, "learning_rate": 4.6243058626959555e-06, "loss": 0.0303, "step": 156500 }, { "epoch": 2.314610054548135, "grad_norm": 0.0002412071480648592, "learning_rate": 4.575261683620817e-06, "loss": 0.0331, "step": 157000 }, { "epoch": 2.321981424148607, "grad_norm": 0.0006970348185859621, "learning_rate": 4.526119219617672e-06, "loss": 0.0343, "step": 157500 }, { "epoch": 2.3293527937490786, "grad_norm": 0.011668604798614979, "learning_rate": 4.476976755614527e-06, "loss": 0.0279, "step": 158000 }, { "epoch": 2.33672416334955, "grad_norm": 0.005698871333152056, "learning_rate": 4.427834291611382e-06, "loss": 0.0322, "step": 158500 }, { "epoch": 2.344095532950022, "grad_norm": 0.0057191732339560986, "learning_rate": 4.378691827608237e-06, "loss": 0.0314, "step": 159000 }, { "epoch": 2.3514669025504937, "grad_norm": 0.00243947422131896, "learning_rate": 4.329549363605092e-06, "loss": 0.0267, "step": 159500 }, { "epoch": 2.3588382721509658, "grad_norm": 0.00033369645825587213, "learning_rate": 4.280406899601947e-06, "loss": 0.0244, "step": 160000 }, { "epoch": 2.3662096417514373, "grad_norm": 0.023830311372876167, "learning_rate": 4.231264435598802e-06, "loss": 0.0283, "step": 160500 }, { "epoch": 2.3735810113519094, "grad_norm": 0.000461634888779372, "learning_rate": 4.182121971595656e-06, "loss": 0.0285, "step": 161000 }, { "epoch": 2.380952380952381, "grad_norm": 0.001816113363020122, "learning_rate": 4.132979507592511e-06, "loss": 0.03, "step": 161500 }, { "epoch": 2.3883237505528525, "grad_norm": 0.001825949759222567, "learning_rate": 4.083837043589366e-06, "loss": 0.0294, "step": 162000 }, { "epoch": 2.3956951201533245, "grad_norm": 9.502708435058594, "learning_rate": 4.034792864514227e-06, "loss": 0.0255, "step": 162500 }, { "epoch": 2.403066489753796, "grad_norm": 0.0001642414426896721, "learning_rate": 3.985650400511082e-06, "loss": 0.0255, "step": 163000 }, { "epoch": 
2.410437859354268, "grad_norm": 5.004094123840332, "learning_rate": 3.936507936507936e-06, "loss": 0.0304, "step": 163500 }, { "epoch": 2.4178092289547397, "grad_norm": 0.1878451257944107, "learning_rate": 3.887365472504791e-06, "loss": 0.0236, "step": 164000 }, { "epoch": 2.4251805985552117, "grad_norm": 0.0004031589487567544, "learning_rate": 3.838223008501646e-06, "loss": 0.0316, "step": 164500 }, { "epoch": 2.4325519681556833, "grad_norm": 0.0004982321988791227, "learning_rate": 3.7890805444985013e-06, "loss": 0.0277, "step": 165000 }, { "epoch": 2.439923337756155, "grad_norm": 0.0037121805362403393, "learning_rate": 3.739938080495356e-06, "loss": 0.0303, "step": 165500 }, { "epoch": 2.447294707356627, "grad_norm": 0.0101530272513628, "learning_rate": 3.690795616492211e-06, "loss": 0.0371, "step": 166000 }, { "epoch": 2.4546660769570985, "grad_norm": 0.00018950540106743574, "learning_rate": 3.6416531524890663e-06, "loss": 0.0319, "step": 166500 }, { "epoch": 2.4620374465575705, "grad_norm": 0.8489145040512085, "learning_rate": 3.5925106884859213e-06, "loss": 0.0261, "step": 167000 }, { "epoch": 2.469408816158042, "grad_norm": 0.0013942194636911154, "learning_rate": 3.543368224482776e-06, "loss": 0.0283, "step": 167500 }, { "epoch": 2.476780185758514, "grad_norm": 0.4112614691257477, "learning_rate": 3.494225760479631e-06, "loss": 0.0332, "step": 168000 }, { "epoch": 2.4841515553589857, "grad_norm": 0.0010579220252111554, "learning_rate": 3.445181581404492e-06, "loss": 0.0288, "step": 168500 }, { "epoch": 2.4915229249594573, "grad_norm": 0.0012302091345191002, "learning_rate": 3.396137402329353e-06, "loss": 0.0272, "step": 169000 }, { "epoch": 2.4988942945599293, "grad_norm": 0.0008321187924593687, "learning_rate": 3.3470932232542143e-06, "loss": 0.0251, "step": 169500 }, { "epoch": 2.506265664160401, "grad_norm": 2.113279342651367, "learning_rate": 3.297950759251069e-06, "loss": 0.0306, "step": 170000 }, { "epoch": 2.513637033760873, "grad_norm": 
0.030419809743762016, "learning_rate": 3.248808295247924e-06, "loss": 0.033, "step": 170500 }, { "epoch": 2.5210084033613445, "grad_norm": 0.002908308058977127, "learning_rate": 3.199665831244779e-06, "loss": 0.0333, "step": 171000 }, { "epoch": 2.5283797729618165, "grad_norm": 0.01623060740530491, "learning_rate": 3.1505233672416334e-06, "loss": 0.0347, "step": 171500 }, { "epoch": 2.535751142562288, "grad_norm": 0.0023743058554828167, "learning_rate": 3.1013809032384884e-06, "loss": 0.029, "step": 172000 }, { "epoch": 2.5431225121627596, "grad_norm": 0.0017329910770058632, "learning_rate": 3.0522384392353434e-06, "loss": 0.0399, "step": 172500 }, { "epoch": 2.5504938817632317, "grad_norm": 0.003035512287169695, "learning_rate": 3.003095975232199e-06, "loss": 0.0292, "step": 173000 }, { "epoch": 2.5578652513637032, "grad_norm": 0.016472771763801575, "learning_rate": 2.9539535112290534e-06, "loss": 0.0295, "step": 173500 }, { "epoch": 2.5652366209641753, "grad_norm": 0.19270442426204681, "learning_rate": 2.9048110472259084e-06, "loss": 0.0237, "step": 174000 }, { "epoch": 2.572607990564647, "grad_norm": 14.15371322631836, "learning_rate": 2.855668583222763e-06, "loss": 0.031, "step": 174500 }, { "epoch": 2.579979360165119, "grad_norm": 0.0006798787508159876, "learning_rate": 2.806526119219618e-06, "loss": 0.0348, "step": 175000 }, { "epoch": 2.5873507297655904, "grad_norm": 0.0007228073664009571, "learning_rate": 2.757383655216473e-06, "loss": 0.0266, "step": 175500 }, { "epoch": 2.594722099366062, "grad_norm": 0.003529267618432641, "learning_rate": 2.708339476141334e-06, "loss": 0.025, "step": 176000 }, { "epoch": 2.602093468966534, "grad_norm": 0.004369991831481457, "learning_rate": 2.659197012138189e-06, "loss": 0.0343, "step": 176500 }, { "epoch": 2.6094648385670056, "grad_norm": 0.0011527182068675756, "learning_rate": 2.610054548135044e-06, "loss": 0.0344, "step": 177000 }, { "epoch": 2.6168362081674776, "grad_norm": 0.0012939295265823603, "learning_rate": 
2.5609120841318984e-06, "loss": 0.023, "step": 177500 }, { "epoch": 2.624207577767949, "grad_norm": 0.004880073014646769, "learning_rate": 2.5117696201287534e-06, "loss": 0.0226, "step": 178000 }, { "epoch": 2.6315789473684212, "grad_norm": 0.004119515884667635, "learning_rate": 2.4626271561256084e-06, "loss": 0.0217, "step": 178500 }, { "epoch": 2.638950316968893, "grad_norm": 0.0001360880269203335, "learning_rate": 2.4135829770504697e-06, "loss": 0.0318, "step": 179000 }, { "epoch": 2.6463216865693644, "grad_norm": 0.09521844983100891, "learning_rate": 2.3644405130473242e-06, "loss": 0.0382, "step": 179500 }, { "epoch": 2.6536930561698364, "grad_norm": 0.0037165977992117405, "learning_rate": 2.3152980490441792e-06, "loss": 0.0277, "step": 180000 }, { "epoch": 2.661064425770308, "grad_norm": 0.0008691260009072721, "learning_rate": 2.2661555850410342e-06, "loss": 0.0322, "step": 180500 }, { "epoch": 2.66843579537078, "grad_norm": 1.0185267925262451, "learning_rate": 2.217111405965895e-06, "loss": 0.0287, "step": 181000 }, { "epoch": 2.6758071649712516, "grad_norm": 11.761468887329102, "learning_rate": 2.1679689419627505e-06, "loss": 0.0269, "step": 181500 }, { "epoch": 2.6831785345717236, "grad_norm": 0.00046698356163688004, "learning_rate": 2.118826477959605e-06, "loss": 0.0324, "step": 182000 }, { "epoch": 2.690549904172195, "grad_norm": 0.010263352654874325, "learning_rate": 2.0698805838124726e-06, "loss": 0.0239, "step": 182500 }, { "epoch": 2.6979212737726668, "grad_norm": 3.434927463531494, "learning_rate": 2.020738119809327e-06, "loss": 0.0335, "step": 183000 }, { "epoch": 2.705292643373139, "grad_norm": 0.00016977268387563527, "learning_rate": 1.971595655806182e-06, "loss": 0.0275, "step": 183500 }, { "epoch": 2.7126640129736104, "grad_norm": 0.0012544383062049747, "learning_rate": 1.922453191803037e-06, "loss": 0.0312, "step": 184000 }, { "epoch": 2.7200353825740824, "grad_norm": 0.0017481895629316568, "learning_rate": 1.8733107277998921e-06, "loss": 
0.0244, "step": 184500 }, { "epoch": 2.727406752174554, "grad_norm": 0.001362023875117302, "learning_rate": 1.824168263796747e-06, "loss": 0.0242, "step": 185000 }, { "epoch": 2.734778121775026, "grad_norm": 0.0014405859401449561, "learning_rate": 1.775025799793602e-06, "loss": 0.0304, "step": 185500 }, { "epoch": 2.7421494913754976, "grad_norm": 0.0008861696696840227, "learning_rate": 1.7258833357904567e-06, "loss": 0.0278, "step": 186000 }, { "epoch": 2.749520860975969, "grad_norm": 0.12847045063972473, "learning_rate": 1.6767408717873115e-06, "loss": 0.0244, "step": 186500 }, { "epoch": 2.756892230576441, "grad_norm": 0.001703253947198391, "learning_rate": 1.6275984077841663e-06, "loss": 0.0356, "step": 187000 }, { "epoch": 2.7642636001769127, "grad_norm": 0.003184770466759801, "learning_rate": 1.5784559437810213e-06, "loss": 0.0234, "step": 187500 }, { "epoch": 2.7716349697773848, "grad_norm": 5.178366661071777, "learning_rate": 1.529313479777876e-06, "loss": 0.0272, "step": 188000 }, { "epoch": 2.7790063393778563, "grad_norm": 0.0035832468420267105, "learning_rate": 1.4801710157747309e-06, "loss": 0.0227, "step": 188500 }, { "epoch": 2.7863777089783284, "grad_norm": 0.007811425253748894, "learning_rate": 1.4311268366995923e-06, "loss": 0.03, "step": 189000 }, { "epoch": 2.7937490785788, "grad_norm": 12.170905113220215, "learning_rate": 1.3819843726964471e-06, "loss": 0.028, "step": 189500 }, { "epoch": 2.8011204481792715, "grad_norm": 0.0028726314194500446, "learning_rate": 1.3329401936213082e-06, "loss": 0.0303, "step": 190000 }, { "epoch": 2.8084918177797435, "grad_norm": 30.53835678100586, "learning_rate": 1.283797729618163e-06, "loss": 0.0321, "step": 190500 }, { "epoch": 2.815863187380215, "grad_norm": 0.006513836327940226, "learning_rate": 1.234655265615018e-06, "loss": 0.0235, "step": 191000 }, { "epoch": 2.823234556980687, "grad_norm": 7.619091510772705, "learning_rate": 1.185512801611873e-06, "loss": 0.0307, "step": 191500 }, { "epoch": 
2.8306059265811587, "grad_norm": 0.005691774655133486, "learning_rate": 1.1363703376087278e-06, "loss": 0.0257, "step": 192000 }, { "epoch": 2.8379772961816307, "grad_norm": 2.5761618614196777, "learning_rate": 1.0872278736055828e-06, "loss": 0.0226, "step": 192500 }, { "epoch": 2.8453486657821023, "grad_norm": 0.00010429321264382452, "learning_rate": 1.0380854096024375e-06, "loss": 0.0242, "step": 193000 }, { "epoch": 2.852720035382574, "grad_norm": 0.02838067337870598, "learning_rate": 9.889429455992925e-07, "loss": 0.0287, "step": 193500 }, { "epoch": 2.860091404983046, "grad_norm": 0.12736959755420685, "learning_rate": 9.398004815961473e-07, "loss": 0.0256, "step": 194000 }, { "epoch": 2.8674627745835175, "grad_norm": 0.32672008872032166, "learning_rate": 8.907563025210085e-07, "loss": 0.0265, "step": 194500 }, { "epoch": 2.8748341441839895, "grad_norm": 0.000785826297942549, "learning_rate": 8.416138385178634e-07, "loss": 0.0339, "step": 195000 }, { "epoch": 2.882205513784461, "grad_norm": 6.065654754638672, "learning_rate": 7.924713745147182e-07, "loss": 0.0337, "step": 195500 }, { "epoch": 2.889576883384933, "grad_norm": 0.012824644334614277, "learning_rate": 7.433289105115731e-07, "loss": 0.0278, "step": 196000 }, { "epoch": 2.8969482529854047, "grad_norm": 0.011444471776485443, "learning_rate": 6.941864465084281e-07, "loss": 0.0368, "step": 196500 }, { "epoch": 2.9043196225858763, "grad_norm": 0.0061572156846523285, "learning_rate": 6.450439825052828e-07, "loss": 0.0289, "step": 197000 }, { "epoch": 2.9116909921863483, "grad_norm": 0.010506005957722664, "learning_rate": 5.95999803430144e-07, "loss": 0.022, "step": 197500 }, { "epoch": 2.91906236178682, "grad_norm": 0.00013557464990299195, "learning_rate": 5.468573394269989e-07, "loss": 0.0313, "step": 198000 }, { "epoch": 2.926433731387292, "grad_norm": 0.0008798382477834821, "learning_rate": 4.977148754238538e-07, "loss": 0.0302, "step": 198500 }, { "epoch": 2.9338051009877635, "grad_norm": 
0.11164344847202301, "learning_rate": 4.485724114207086e-07, "loss": 0.0234, "step": 199000 }, { "epoch": 2.9411764705882355, "grad_norm": 17.183870315551758, "learning_rate": 3.9942994741756357e-07, "loss": 0.0294, "step": 199500 }, { "epoch": 2.948547840188707, "grad_norm": 14.767284393310547, "learning_rate": 3.502874834144184e-07, "loss": 0.0277, "step": 200000 }, { "epoch": 2.9559192097891787, "grad_norm": 0.0102895712479949, "learning_rate": 3.012433043392796e-07, "loss": 0.0307, "step": 200500 }, { "epoch": 2.9632905793896507, "grad_norm": 0.0019515061285346746, "learning_rate": 2.5210084033613445e-07, "loss": 0.0345, "step": 201000 }, { "epoch": 2.9706619489901223, "grad_norm": 0.5170195698738098, "learning_rate": 2.0295837633298937e-07, "loss": 0.0282, "step": 201500 }, { "epoch": 2.9780333185905943, "grad_norm": 0.00022552709560841322, "learning_rate": 1.5381591232984424e-07, "loss": 0.0294, "step": 202000 }, { "epoch": 2.985404688191066, "grad_norm": 0.004490272141993046, "learning_rate": 1.0467344832669911e-07, "loss": 0.0223, "step": 202500 }, { "epoch": 2.992776057791538, "grad_norm": 0.0005161833250895143, "learning_rate": 5.562926925156028e-08, "loss": 0.0247, "step": 203000 }, { "epoch": 3.0, "eval_accuracy": 0.6753550543024227, "eval_f1": 0.6936652741069145, "eval_loss": 0.12206839770078659, "eval_roc_auc": 0.833023672143383, "eval_runtime": 94.2338, "eval_samples_per_second": 63.512, "eval_steps_per_second": 63.512, "step": 203490 } ], "logging_steps": 500, "max_steps": 203490, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.1521488766625792e+17, "train_batch_size": 1, "trial_name": null, "trial_params": null }