{ "best_metric": 0.28622403740882874, "best_model_checkpoint": "embed/MP_modbert_embed_voc1_0_100_MP/checkpoint-15728", "epoch": 100.0, "eval_steps": 500, "global_step": 98300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.508646998982706, "grad_norm": 0.3106672167778015, "learning_rate": 1.5259409969481182e-06, "loss": 1.541, "step": 500 }, { "epoch": 1.0, "eval_loss": 1.2760541439056396, "eval_mae": 0.8460478782653809, "eval_r2": -7.616851806640625, "eval_rmse": 1.1296252012252808, "eval_runtime": 20.7046, "eval_samples_per_second": 482.984, "eval_steps_per_second": 3.816, "step": 983 }, { "epoch": 1.017293997965412, "grad_norm": 0.6856330037117004, "learning_rate": 3.0518819938962364e-06, "loss": 1.3676, "step": 1000 }, { "epoch": 1.5259409969481181, "grad_norm": 1.0226482152938843, "learning_rate": 4.577822990844354e-06, "loss": 1.09, "step": 1500 }, { "epoch": 2.0, "eval_loss": 0.7750646471977234, "eval_mae": 0.6696741580963135, "eval_r2": -0.1967395395040512, "eval_rmse": 0.8803772330284119, "eval_runtime": 20.7332, "eval_samples_per_second": 482.319, "eval_steps_per_second": 3.81, "step": 1966 }, { "epoch": 2.034587995930824, "grad_norm": 1.0323240756988525, "learning_rate": 6.103763987792473e-06, "loss": 0.8467, "step": 2000 }, { "epoch": 2.5432349949135302, "grad_norm": 1.000579833984375, "learning_rate": 7.62970498474059e-06, "loss": 0.6796, "step": 2500 }, { "epoch": 3.0, "eval_loss": 0.5403278470039368, "eval_mae": 0.5509415864944458, "eval_r2": 0.3976656198501587, "eval_rmse": 0.7350696325302124, "eval_runtime": 19.1568, "eval_samples_per_second": 522.008, "eval_steps_per_second": 4.124, "step": 2949 }, { "epoch": 3.051881993896236, "grad_norm": 1.153286099433899, "learning_rate": 9.155645981688708e-06, "loss": 0.5626, "step": 3000 }, { "epoch": 3.560528992878942, "grad_norm": 1.2380833625793457, "learning_rate": 1.0681586978636825e-05, "loss": 0.4871, "step": 3500 }, { "epoch": 4.0, "eval_loss": 0.4411344826221466, "eval_mae": 0.4877315163612366, "eval_r2": 0.5509063005447388, "eval_rmse": 0.6641793847084045, "eval_runtime": 17.1276, "eval_samples_per_second": 583.854, "eval_steps_per_second": 4.612, "step": 3932 }, { "epoch": 4.069175991861648, "grad_norm": 1.0829737186431885, "learning_rate": 1.2207527975584946e-05, "loss": 0.4344, "step": 4000 }, { "epoch": 4.577822990844354, "grad_norm": 0.9966434240341187, "learning_rate": 1.3733468972533063e-05, "loss": 0.3966, "step": 4500 }, { "epoch": 5.0, "eval_loss": 0.391430139541626, "eval_mae": 0.44986167550086975, "eval_r2": 0.6214621067047119, "eval_rmse": 0.6256435513496399, "eval_runtime": 19.311, "eval_samples_per_second": 517.839, "eval_steps_per_second": 4.091, "step": 4915 }, { "epoch": 5.0864699898270604, "grad_norm": 1.2099027633666992, "learning_rate": 1.525940996948118e-05, "loss": 0.3747, "step": 5000 }, { "epoch": 5.595116988809766, "grad_norm": 1.3580559492111206, "learning_rate": 1.67853509664293e-05, "loss": 0.3501, "step": 5500 }, { "epoch": 6.0, "eval_loss": 0.35625067353248596, "eval_mae": 0.42443710565567017, "eval_r2": 0.6606752872467041, "eval_rmse": 0.5968672037124634, "eval_runtime": 19.6114, "eval_samples_per_second": 509.909, "eval_steps_per_second": 4.028, "step": 5898 }, { "epoch": 6.103763987792472, "grad_norm": 1.0308208465576172, "learning_rate": 1.8311291963377416e-05, "loss": 0.3332, "step": 6000 }, { "epoch": 6.612410986775178, "grad_norm": 1.0148053169250488, "learning_rate": 1.9837232960325533e-05, "loss": 0.3184, "step": 6500 }, { "epoch": 7.0, "eval_loss": 0.3325399160385132, "eval_mae": 0.4055442214012146, "eval_r2": 0.714428186416626, "eval_rmse": 0.576662540435791, "eval_runtime": 19.3745, "eval_samples_per_second": 516.143, "eval_steps_per_second": 4.078, "step": 6881 }, { "epoch": 7.121057985757884, "grad_norm": 1.2479807138442993, "learning_rate": 2.136317395727365e-05, "loss": 0.3061, "step": 7000 }, { "epoch": 7.62970498474059, "grad_norm": 0.9779444932937622, "learning_rate": 2.288911495422177e-05, "loss": 0.2946, "step": 7500 }, { "epoch": 8.0, "eval_loss": 0.3256845772266388, "eval_mae": 0.39594584703445435, "eval_r2": 0.7166282534599304, "eval_rmse": 0.5706875920295715, "eval_runtime": 20.9579, "eval_samples_per_second": 477.147, "eval_steps_per_second": 3.769, "step": 7864 }, { "epoch": 8.138351983723297, "grad_norm": 0.9078311324119568, "learning_rate": 2.441505595116989e-05, "loss": 0.2871, "step": 8000 }, { "epoch": 8.646998982706002, "grad_norm": 0.9875624775886536, "learning_rate": 2.594099694811801e-05, "loss": 0.276, "step": 8500 }, { "epoch": 9.0, "eval_loss": 0.31457769870758057, "eval_mae": 0.3851270079612732, "eval_r2": 0.7076315879821777, "eval_rmse": 0.5608720779418945, "eval_runtime": 18.1766, "eval_samples_per_second": 550.158, "eval_steps_per_second": 4.346, "step": 8847 }, { "epoch": 9.155645981688709, "grad_norm": 0.8023911714553833, "learning_rate": 2.7466937945066126e-05, "loss": 0.2669, "step": 9000 }, { "epoch": 9.664292980671414, "grad_norm": 0.846149206161499, "learning_rate": 2.8992878942014243e-05, "loss": 0.26, "step": 9500 }, { "epoch": 10.0, "eval_loss": 0.3037531077861786, "eval_mae": 0.37509840726852417, "eval_r2": 0.7389528155326843, "eval_rmse": 0.5511377453804016, "eval_runtime": 19.169, "eval_samples_per_second": 521.676, "eval_steps_per_second": 4.121, "step": 9830 }, { "epoch": 10.172939979654121, "grad_norm": 1.080586552619934, "learning_rate": 2.9942353340115293e-05, "loss": 0.2499, "step": 10000 }, { "epoch": 10.681586978636826, "grad_norm": 0.7065219879150391, "learning_rate": 2.9772804340454395e-05, "loss": 0.2452, "step": 10500 }, { "epoch": 11.0, "eval_loss": 0.2963975965976715, "eval_mae": 0.36469894647598267, "eval_r2": 0.7409036159515381, "eval_rmse": 0.5444238781929016, "eval_runtime": 19.1182, "eval_samples_per_second": 523.063, "eval_steps_per_second": 4.132, "step": 10813 }, { "epoch": 11.190233977619531, "grad_norm": 0.7577599287033081, "learning_rate": 2.960325534079349e-05, "loss": 0.2346, "step": 11000 }, { "epoch": 11.698880976602238, "grad_norm": 0.9281987547874451, "learning_rate": 2.9433706341132588e-05, "loss": 0.2283, "step": 11500 }, { "epoch": 12.0, "eval_loss": 0.2902641296386719, "eval_mae": 0.3583831191062927, "eval_r2": 0.7628405690193176, "eval_rmse": 0.5387614965438843, "eval_runtime": 20.6847, "eval_samples_per_second": 483.45, "eval_steps_per_second": 3.819, "step": 11796 }, { "epoch": 12.207527975584943, "grad_norm": 0.7825191617012024, "learning_rate": 2.9264157341471686e-05, "loss": 0.2218, "step": 12000 }, { "epoch": 12.71617497456765, "grad_norm": 0.7279273867607117, "learning_rate": 2.9094608341810784e-05, "loss": 0.2145, "step": 12500 }, { "epoch": 13.0, "eval_loss": 0.2926931083202362, "eval_mae": 0.35721156001091003, "eval_r2": 0.7634265422821045, "eval_rmse": 0.5410109758377075, "eval_runtime": 19.8102, "eval_samples_per_second": 504.789, "eval_steps_per_second": 3.988, "step": 12779 }, { "epoch": 13.224821973550355, "grad_norm": 0.7883431911468506, "learning_rate": 2.8925059342149882e-05, "loss": 0.2086, "step": 13000 }, { "epoch": 13.733468972533062, "grad_norm": 0.7794452905654907, "learning_rate": 2.875551034248898e-05, "loss": 0.2023, "step": 13500 }, { "epoch": 14.0, "eval_loss": 0.2874827980995178, "eval_mae": 0.3508231043815613, "eval_r2": 0.7408859133720398, "eval_rmse": 0.5361739993095398, "eval_runtime": 19.5699, "eval_samples_per_second": 510.99, "eval_steps_per_second": 4.037, "step": 13762 }, { "epoch": 14.242115971515767, "grad_norm": 0.6752628684043884, "learning_rate": 2.8585961342828075e-05, "loss": 0.1954, "step": 14000 }, { "epoch": 14.750762970498474, "grad_norm": 0.7037671804428101, "learning_rate": 2.8416412343167176e-05, "loss": 0.1927, "step": 14500 }, { "epoch": 15.0, "eval_loss": 0.2897137701511383, "eval_mae": 0.34881192445755005, "eval_r2": 0.7613890171051025, "eval_rmse": 0.5382503867149353, "eval_runtime": 20.9472, "eval_samples_per_second": 477.39, "eval_steps_per_second": 3.771, "step": 14745 }, { "epoch": 15.25940996948118, "grad_norm": 0.7775920033454895, "learning_rate": 2.8246863343506274e-05, "loss": 0.1852, "step": 15000 }, { "epoch": 15.768056968463886, "grad_norm": 0.6624333262443542, "learning_rate": 2.8077314343845372e-05, "loss": 0.1846, "step": 15500 }, { "epoch": 16.0, "eval_loss": 0.28622403740882874, "eval_mae": 0.34425806999206543, "eval_r2": 0.7593017816543579, "eval_rmse": 0.5349988341331482, "eval_runtime": 19.8037, "eval_samples_per_second": 504.956, "eval_steps_per_second": 3.989, "step": 15728 }, { "epoch": 16.276703967446593, "grad_norm": 0.6602976322174072, "learning_rate": 2.790776534418447e-05, "loss": 0.1776, "step": 16000 }, { "epoch": 16.7853509664293, "grad_norm": 0.6582776308059692, "learning_rate": 2.773821634452357e-05, "loss": 0.1762, "step": 16500 }, { "epoch": 17.0, "eval_loss": 0.28853750228881836, "eval_mae": 0.3421522378921509, "eval_r2": 0.7615672945976257, "eval_rmse": 0.5371565818786621, "eval_runtime": 20.8644, "eval_samples_per_second": 479.286, "eval_steps_per_second": 3.786, "step": 16711 }, { "epoch": 17.293997965412004, "grad_norm": 0.7847622632980347, "learning_rate": 2.7568667344862667e-05, "loss": 0.1677, "step": 17000 }, { "epoch": 17.80264496439471, "grad_norm": 0.7339199185371399, "learning_rate": 2.739911834520176e-05, "loss": 0.1694, "step": 17500 }, { "epoch": 18.0, "eval_loss": 0.29071176052093506, "eval_mae": 0.34257909655570984, "eval_r2": 0.7714438438415527, "eval_rmse": 0.5391767024993896, "eval_runtime": 19.1023, "eval_samples_per_second": 523.496, "eval_steps_per_second": 4.136, "step": 17694 }, { "epoch": 18.311291963377418, "grad_norm": 0.7355333566665649, "learning_rate": 2.7229569345540863e-05, "loss": 0.164, "step": 18000 }, { "epoch": 18.819938962360123, "grad_norm": 0.6125873923301697, "learning_rate": 2.7060020345879958e-05, "loss": 0.1621, "step": 18500 }, { "epoch": 19.0, "eval_loss": 0.2921219766139984, "eval_mae": 0.34166020154953003, "eval_r2": 0.771050214767456, "eval_rmse": 0.5404828786849976, "eval_runtime": 18.7584, "eval_samples_per_second": 533.096, "eval_steps_per_second": 4.211, "step": 18677 }, { "epoch": 19.328585961342828, "grad_norm": 0.533932626247406, "learning_rate": 2.689047134621906e-05, "loss": 0.1574, "step": 19000 }, { "epoch": 19.837232960325533, "grad_norm": 0.5610256791114807, "learning_rate": 2.6720922346558154e-05, "loss": 0.1585, "step": 19500 }, { "epoch": 20.0, "eval_loss": 0.2904634177684784, "eval_mae": 0.3388892710208893, "eval_r2": 0.759547233581543, "eval_rmse": 0.5389463305473328, "eval_runtime": 18.6352, "eval_samples_per_second": 536.618, "eval_steps_per_second": 4.239, "step": 19660 }, { "epoch": 20.345879959308242, "grad_norm": 0.639903724193573, "learning_rate": 2.6551373346897255e-05, "loss": 0.1518, "step": 20000 }, { "epoch": 20.854526958290947, "grad_norm": 0.6409148573875427, "learning_rate": 2.6381824347236353e-05, "loss": 0.1538, "step": 20500 }, { "epoch": 21.0, "eval_loss": 0.28870195150375366, "eval_mae": 0.3359060287475586, "eval_r2": 0.7662752866744995, "eval_rmse": 0.5373095870018005, "eval_runtime": 19.483, "eval_samples_per_second": 513.267, "eval_steps_per_second": 4.055, "step": 20643 }, { "epoch": 21.363173957273652, "grad_norm": 0.6030653715133667, "learning_rate": 2.6212275347575448e-05, "loss": 0.1454, "step": 21000 }, { "epoch": 21.871820956256357, "grad_norm": 0.5733378529548645, "learning_rate": 2.604272634791455e-05, "loss": 0.1512, "step": 21500 }, { "epoch": 22.0, "eval_loss": 0.2888459265232086, "eval_mae": 0.336772620677948, "eval_r2": 0.7591472268104553, "eval_rmse": 0.5374436974525452, "eval_runtime": 19.7086, "eval_samples_per_second": 507.392, "eval_steps_per_second": 4.008, "step": 21626 }, { "epoch": 22.380467955239062, "grad_norm": 0.5780584812164307, "learning_rate": 2.5873177348253644e-05, "loss": 0.1426, "step": 22000 }, { "epoch": 22.88911495422177, "grad_norm": 0.6120862364768982, "learning_rate": 2.5703628348592746e-05, "loss": 0.1462, "step": 22500 }, { "epoch": 23.0, "eval_loss": 0.28724002838134766, "eval_mae": 0.3324105143547058, "eval_r2": 0.7722494602203369, "eval_rmse": 0.5359475016593933, "eval_runtime": 19.302, "eval_samples_per_second": 518.08, "eval_steps_per_second": 4.093, "step": 22609 }, { "epoch": 23.397761953204476, "grad_norm": 0.5332029461860657, "learning_rate": 2.553407934893184e-05, "loss": 0.1386, "step": 23000 }, { "epoch": 23.90640895218718, "grad_norm": 0.5000107884407043, "learning_rate": 2.5364530349270942e-05, "loss": 0.1438, "step": 23500 }, { "epoch": 24.0, "eval_loss": 0.2948116064071655, "eval_mae": 0.33423200249671936, "eval_r2": 0.771849513053894, "eval_rmse": 0.542965292930603, "eval_runtime": 16.3008, "eval_samples_per_second": 613.466, "eval_steps_per_second": 4.846, "step": 23592 }, { "epoch": 24.415055951169887, "grad_norm": 0.4723941385746002, "learning_rate": 2.5194981349610037e-05, "loss": 0.1351, "step": 24000 }, { "epoch": 24.923702950152595, "grad_norm": 0.5572139024734497, "learning_rate": 2.5025432349949135e-05, "loss": 0.1408, "step": 24500 }, { "epoch": 25.0, "eval_loss": 0.28881922364234924, "eval_mae": 0.3297838866710663, "eval_r2": 0.7694551944732666, "eval_rmse": 0.5374186635017395, "eval_runtime": 16.1445, "eval_samples_per_second": 619.406, "eval_steps_per_second": 4.893, "step": 24575 }, { "epoch": 25.4323499491353, "grad_norm": 0.5389395952224731, "learning_rate": 2.4855883350288233e-05, "loss": 0.1332, "step": 25000 }, { "epoch": 25.940996948118006, "grad_norm": 0.7782149910926819, "learning_rate": 2.468633435062733e-05, "loss": 0.1374, "step": 25500 }, { "epoch": 26.0, "eval_loss": 0.2942332625389099, "eval_mae": 0.3317820429801941, "eval_r2": 0.7754645347595215, "eval_rmse": 0.5424323081970215, "eval_runtime": 15.3961, "eval_samples_per_second": 649.517, "eval_steps_per_second": 5.131, "step": 25558 }, { "epoch": 26.44964394710071, "grad_norm": 0.462462455034256, "learning_rate": 2.4516785350966432e-05, "loss": 0.1295, "step": 26000 }, { "epoch": 26.95829094608342, "grad_norm": 0.5143025517463684, "learning_rate": 2.4347236351305527e-05, "loss": 0.137, "step": 26500 }, { "epoch": 27.0, "eval_loss": 0.2935540974140167, "eval_mae": 0.330686092376709, "eval_r2": 0.767002284526825, "eval_rmse": 0.5418060421943665, "eval_runtime": 15.7155, "eval_samples_per_second": 636.314, "eval_steps_per_second": 5.027, "step": 26541 }, { "epoch": 27.466937945066125, "grad_norm": 0.4412664473056793, "learning_rate": 2.417768735164463e-05, "loss": 0.1266, "step": 27000 }, { "epoch": 27.97558494404883, "grad_norm": 0.5299028754234314, "learning_rate": 2.4008138351983723e-05, "loss": 0.1345, "step": 27500 }, { "epoch": 28.0, "eval_loss": 0.29448285698890686, "eval_mae": 0.33093565702438354, "eval_r2": 0.7710368633270264, "eval_rmse": 0.5426624417304993, "eval_runtime": 15.225, "eval_samples_per_second": 656.813, "eval_steps_per_second": 5.189, "step": 27524 }, { "epoch": 28.484231943031535, "grad_norm": 0.3932570219039917, "learning_rate": 2.383858935232282e-05, "loss": 0.126, "step": 28000 }, { "epoch": 28.992878942014244, "grad_norm": 0.5242863893508911, "learning_rate": 2.366904035266192e-05, "loss": 0.1295, "step": 28500 }, { "epoch": 29.0, "eval_loss": 0.29432228207588196, "eval_mae": 0.3288809657096863, "eval_r2": 0.7685636878013611, "eval_rmse": 0.542514443397522, "eval_runtime": 15.4546, "eval_samples_per_second": 647.058, "eval_steps_per_second": 5.112, "step": 28507 }, { "epoch": 29.50152594099695, "grad_norm": 0.4471481144428253, "learning_rate": 2.3499491353001018e-05, "loss": 0.1237, "step": 29000 }, { "epoch": 30.0, "eval_loss": 0.2978579103946686, "eval_mae": 0.3290804624557495, "eval_r2": 0.7655062675476074, "eval_rmse": 0.5457631945610046, "eval_runtime": 15.0561, "eval_samples_per_second": 664.183, "eval_steps_per_second": 5.247, "step": 29490 }, { "epoch": 30.010172939979654, "grad_norm": 0.5170900821685791, "learning_rate": 2.3329942353340116e-05, "loss": 0.1289, "step": 29500 }, { "epoch": 30.51881993896236, "grad_norm": 0.4907204806804657, "learning_rate": 2.3160393353679214e-05, "loss": 0.1207, "step": 30000 }, { "epoch": 31.0, "eval_loss": 0.294801265001297, "eval_mae": 0.32844942808151245, "eval_r2": 0.7698482275009155, "eval_rmse": 0.542955756187439, "eval_runtime": 15.6533, "eval_samples_per_second": 638.844, "eval_steps_per_second": 5.047, "step": 30473 }, { "epoch": 31.027466937945068, "grad_norm": 0.4819788932800293, "learning_rate": 2.2990844354018312e-05, "loss": 0.1285, "step": 30500 }, { "epoch": 31.536113936927773, "grad_norm": 0.48075565695762634, "learning_rate": 2.282129535435741e-05, "loss": 0.1194, "step": 31000 }, { "epoch": 32.0, "eval_loss": 0.2976591885089874, "eval_mae": 0.3284505605697632, "eval_r2": 0.770287275314331, "eval_rmse": 0.5455812811851501, "eval_runtime": 16.4511, "eval_samples_per_second": 607.864, "eval_steps_per_second": 4.802, "step": 31456 }, { "epoch": 32.044760935910475, "grad_norm": 0.4494258165359497, "learning_rate": 2.2651746354696508e-05, "loss": 0.1254, "step": 31500 }, { "epoch": 32.55340793489319, "grad_norm": 0.4671981632709503, "learning_rate": 2.2482197355035606e-05, "loss": 0.1193, "step": 32000 }, { "epoch": 33.0, "eval_loss": 0.29970449209213257, "eval_mae": 0.32844069600105286, "eval_r2": 0.7684862613677979, "eval_rmse": 0.5474523305892944, "eval_runtime": 16.4076, "eval_samples_per_second": 609.473, "eval_steps_per_second": 4.815, "step": 32439 }, { "epoch": 33.06205493387589, "grad_norm": 0.37166231870651245, "learning_rate": 2.2312648355374704e-05, "loss": 0.1237, "step": 32500 }, { "epoch": 33.5707019328586, "grad_norm": 0.34825435280799866, "learning_rate": 2.2143099355713802e-05, "loss": 0.1178, "step": 33000 }, { "epoch": 34.0, "eval_loss": 0.30024948716163635, "eval_mae": 0.3275962471961975, "eval_r2": 0.7728116512298584, "eval_rmse": 0.5479499697685242, "eval_runtime": 18.1242, "eval_samples_per_second": 551.747, "eval_steps_per_second": 4.359, "step": 33422 }, { "epoch": 34.0793489318413, "grad_norm": 0.41295087337493896, "learning_rate": 2.19735503560529e-05, "loss": 0.1216, "step": 33500 }, { "epoch": 34.58799593082401, "grad_norm": 0.4062606692314148, "learning_rate": 2.1804001356392e-05, "loss": 0.1172, "step": 34000 }, { "epoch": 35.0, "eval_loss": 0.2986757755279541, "eval_mae": 0.3264056444168091, "eval_r2": 0.7658240795135498, "eval_rmse": 0.5465120077133179, "eval_runtime": 18.6496, "eval_samples_per_second": 536.206, "eval_steps_per_second": 4.236, "step": 34405 }, { "epoch": 35.09664292980671, "grad_norm": 0.4476313889026642, "learning_rate": 2.1634452356731097e-05, "loss": 0.1198, "step": 34500 }, { "epoch": 35.60528992878942, "grad_norm": 0.29763707518577576, "learning_rate": 2.146490335707019e-05, "loss": 0.1165, "step": 35000 }, { "epoch": 36.0, "eval_loss": 0.30164480209350586, "eval_mae": 0.3274855315685272, "eval_r2": 0.7613018751144409, "eval_rmse": 0.5492217540740967, "eval_runtime": 18.3514, "eval_samples_per_second": 544.919, "eval_steps_per_second": 4.305, "step": 35388 }, { "epoch": 36.11393692777212, "grad_norm": 0.3980378806591034, "learning_rate": 2.1295354357409293e-05, "loss": 0.1178, "step": 35500 }, { "epoch": 36.622583926754835, "grad_norm": 0.48255112767219543, "learning_rate": 2.1125805357748388e-05, "loss": 0.1135, "step": 36000 }, { "epoch": 37.0, "eval_loss": 0.30013135075569153, "eval_mae": 0.3258868455886841, "eval_r2": 0.7733784914016724, "eval_rmse": 0.5478420853614807, "eval_runtime": 20.577, "eval_samples_per_second": 485.98, "eval_steps_per_second": 3.839, "step": 36371 }, { "epoch": 37.13123092573754, "grad_norm": 0.3691520392894745, "learning_rate": 2.095625635808749e-05, "loss": 0.1176, "step": 36500 }, { "epoch": 37.639877924720246, "grad_norm": 0.36071911454200745, "learning_rate": 2.0786707358426584e-05, "loss": 0.1146, "step": 37000 }, { "epoch": 38.0, "eval_loss": 0.30118992924690247, "eval_mae": 0.327684223651886, "eval_r2": 0.7721153497695923, "eval_rmse": 0.5488075017929077, "eval_runtime": 19.8739, "eval_samples_per_second": 503.173, "eval_steps_per_second": 3.975, "step": 37354 }, { "epoch": 38.14852492370295, "grad_norm": 0.3069157898426056, "learning_rate": 2.0617158358765685e-05, "loss": 0.1145, "step": 37500 }, { "epoch": 38.657171922685656, "grad_norm": 0.36026087403297424, "learning_rate": 2.0447609359104783e-05, "loss": 0.1139, "step": 38000 }, { "epoch": 39.0, "eval_loss": 0.2999809980392456, "eval_mae": 0.3254566788673401, "eval_r2": 0.7663213610649109, "eval_rmse": 0.5477048754692078, "eval_runtime": 18.5952, "eval_samples_per_second": 537.773, "eval_steps_per_second": 4.248, "step": 38337 }, { "epoch": 39.16581892166836, "grad_norm": 0.35690009593963623, "learning_rate": 2.0278060359443878e-05, "loss": 0.1149, "step": 38500 }, { "epoch": 39.674465920651066, "grad_norm": 0.34920957684516907, "learning_rate": 2.010851135978298e-05, "loss": 0.1136, "step": 39000 }, { "epoch": 40.0, "eval_loss": 0.3011326789855957, "eval_mae": 0.3251156210899353, "eval_r2": 0.7704006433486938, "eval_rmse": 0.5487552285194397, "eval_runtime": 17.6999, "eval_samples_per_second": 564.975, "eval_steps_per_second": 4.463, "step": 39320 }, { "epoch": 40.18311291963377, "grad_norm": 0.3468838036060333, "learning_rate": 1.9938962360122074e-05, "loss": 0.1121, "step": 39500 }, { "epoch": 40.691759918616484, "grad_norm": 0.4196014702320099, "learning_rate": 1.9769413360461176e-05, "loss": 0.1122, "step": 40000 }, { "epoch": 41.0, "eval_loss": 0.30130186676979065, "eval_mae": 0.324237585067749, "eval_r2": 0.7646907567977905, "eval_rmse": 0.5489093661308289, "eval_runtime": 17.2647, "eval_samples_per_second": 579.217, "eval_steps_per_second": 4.576, "step": 40303 }, { "epoch": 41.20040691759919, "grad_norm": 0.3917727470397949, "learning_rate": 1.959986436080027e-05, "loss": 0.1134, "step": 40500 }, { "epoch": 41.709053916581894, "grad_norm": 0.3163571059703827, "learning_rate": 1.9430315361139372e-05, "loss": 0.1093, "step": 41000 }, { "epoch": 42.0, "eval_loss": 0.30112671852111816, "eval_mae": 0.3250886797904968, "eval_r2": 0.7695274353027344, "eval_rmse": 0.5487497448921204, "eval_runtime": 17.9807, "eval_samples_per_second": 556.152, "eval_steps_per_second": 4.394, "step": 41286 }, { "epoch": 42.2177009155646, "grad_norm": 0.34252750873565674, "learning_rate": 1.9260766361478467e-05, "loss": 0.112, "step": 41500 }, { "epoch": 42.726347914547304, "grad_norm": 0.39978763461112976, "learning_rate": 1.9091217361817565e-05, "loss": 0.1105, "step": 42000 }, { "epoch": 43.0, "eval_loss": 0.30118075013160706, "eval_mae": 0.32392618060112, "eval_r2": 0.7685805559158325, "eval_rmse": 0.5487990379333496, "eval_runtime": 19.3392, "eval_samples_per_second": 517.085, "eval_steps_per_second": 4.085, "step": 42269 }, { "epoch": 43.23499491353001, "grad_norm": 0.38831090927124023, "learning_rate": 1.8921668362156663e-05, "loss": 0.1105, "step": 42500 }, { "epoch": 43.743641912512714, "grad_norm": 0.3529709577560425, "learning_rate": 1.875211936249576e-05, "loss": 0.111, "step": 43000 }, { "epoch": 44.0, "eval_loss": 0.3047633171081543, "eval_mae": 0.3260849118232727, "eval_r2": 0.7712223529815674, "eval_rmse": 0.5520533323287964, "eval_runtime": 17.5503, "eval_samples_per_second": 569.792, "eval_steps_per_second": 4.501, "step": 43252 }, { "epoch": 44.25228891149542, "grad_norm": 0.33766278624534607, "learning_rate": 1.8582570362834862e-05, "loss": 0.1084, "step": 43500 }, { "epoch": 44.760935910478125, "grad_norm": 0.4114132225513458, "learning_rate": 1.8413021363173957e-05, "loss": 0.1108, "step": 44000 }, { "epoch": 45.0, "eval_loss": 0.3017372488975525, "eval_mae": 0.32358628511428833, "eval_r2": 0.7659143209457397, "eval_rmse": 0.5493058562278748, "eval_runtime": 19.7823, "eval_samples_per_second": 505.501, "eval_steps_per_second": 3.993, "step": 44235 }, { "epoch": 45.26958290946084, "grad_norm": 0.37686386704444885, "learning_rate": 1.824347236351306e-05, "loss": 0.1068, "step": 44500 }, { "epoch": 45.77822990844354, "grad_norm": 0.32547298073768616, "learning_rate": 1.8073923363852153e-05, "loss": 0.1091, "step": 45000 }, { "epoch": 46.0, "eval_loss": 0.30219876766204834, "eval_mae": 0.3234938383102417, "eval_r2": 0.7667394280433655, "eval_rmse": 0.5497257709503174, "eval_runtime": 21.0895, "eval_samples_per_second": 474.169, "eval_steps_per_second": 3.746, "step": 45218 }, { "epoch": 46.28687690742625, "grad_norm": 0.3565451502799988, "learning_rate": 1.790437436419125e-05, "loss": 0.1074, "step": 45500 }, { "epoch": 46.79552390640895, "grad_norm": 0.3059617877006531, "learning_rate": 1.773482536453035e-05, "loss": 0.1085, "step": 46000 }, { "epoch": 47.0, "eval_loss": 0.300829142332077, "eval_mae": 0.32321953773498535, "eval_r2": 0.7664545178413391, "eval_rmse": 0.5484786033630371, "eval_runtime": 26.1836, "eval_samples_per_second": 381.918, "eval_steps_per_second": 3.017, "step": 46201 }, { "epoch": 47.30417090539166, "grad_norm": 0.37008407711982727, "learning_rate": 1.7565276364869448e-05, "loss": 0.1069, "step": 46500 }, { "epoch": 47.81281790437436, "grad_norm": 0.36229032278060913, "learning_rate": 1.7395727365208546e-05, "loss": 0.1082, "step": 47000 }, { "epoch": 48.0, "eval_loss": 0.30168837308883667, "eval_mae": 0.32336822152137756, "eval_r2": 0.7664652466773987, "eval_rmse": 0.5492613315582275, "eval_runtime": 23.5481, "eval_samples_per_second": 424.663, "eval_steps_per_second": 3.355, "step": 47184 }, { "epoch": 48.32146490335707, "grad_norm": 0.2914319932460785, "learning_rate": 1.7226178365547644e-05, "loss": 0.1041, "step": 47500 }, { "epoch": 48.83011190233977, "grad_norm": 0.3244343101978302, "learning_rate": 1.705662936588674e-05, "loss": 0.1084, "step": 48000 }, { "epoch": 49.0, "eval_loss": 0.3029985725879669, "eval_mae": 0.32335370779037476, "eval_r2": 0.76764976978302, "eval_rmse": 0.550452709197998, "eval_runtime": 21.4309, "eval_samples_per_second": 466.616, "eval_steps_per_second": 3.686, "step": 48167 }, { "epoch": 49.338758901322485, "grad_norm": 0.3769769072532654, "learning_rate": 1.688708036622584e-05, "loss": 0.1051, "step": 48500 }, { "epoch": 49.84740590030519, "grad_norm": 0.3235186040401459, "learning_rate": 1.6717531366564938e-05, "loss": 0.108, "step": 49000 }, { "epoch": 50.0, "eval_loss": 0.3056350648403168, "eval_mae": 0.3234601318836212, "eval_r2": 0.7742241621017456, "eval_rmse": 0.552842378616333, "eval_runtime": 21.2669, "eval_samples_per_second": 470.213, "eval_steps_per_second": 3.715, "step": 49150 }, { "epoch": 50.356052899287896, "grad_norm": 0.32111743092536926, "learning_rate": 1.6547982366904036e-05, "loss": 0.1023, "step": 49500 }, { "epoch": 50.8646998982706, "grad_norm": 0.35004922747612, "learning_rate": 1.6378433367243134e-05, "loss": 0.1065, "step": 50000 }, { "epoch": 51.0, "eval_loss": 0.3032062351703644, "eval_mae": 0.3231772184371948, "eval_r2": 0.7666732668876648, "eval_rmse": 0.5506412982940674, "eval_runtime": 22.5263, "eval_samples_per_second": 443.925, "eval_steps_per_second": 3.507, "step": 50133 }, { "epoch": 51.373346897253306, "grad_norm": 0.3384553790092468, "learning_rate": 1.6208884367582232e-05, "loss": 0.105, "step": 50500 }, { "epoch": 51.88199389623601, "grad_norm": 0.30764466524124146, "learning_rate": 1.603933536792133e-05, "loss": 0.1061, "step": 51000 }, { "epoch": 52.0, "eval_loss": 0.3021974265575409, "eval_mae": 0.32215458154678345, "eval_r2": 0.7661508321762085, "eval_rmse": 0.5497245192527771, "eval_runtime": 21.0716, "eval_samples_per_second": 474.572, "eval_steps_per_second": 3.749, "step": 51116 }, { "epoch": 52.390640895218716, "grad_norm": 0.32773980498313904, "learning_rate": 1.5869786368260425e-05, "loss": 0.1022, "step": 51500 }, { "epoch": 52.89928789420142, "grad_norm": 0.3055395185947418, "learning_rate": 1.5700237368599527e-05, "loss": 0.1065, "step": 52000 }, { "epoch": 53.0, "eval_loss": 0.3047163188457489, "eval_mae": 0.32322755455970764, "eval_r2": 0.7697006464004517, "eval_rmse": 0.552010715007782, "eval_runtime": 20.1406, "eval_samples_per_second": 496.509, "eval_steps_per_second": 3.922, "step": 52099 }, { "epoch": 53.407934893184134, "grad_norm": 0.25924962759017944, "learning_rate": 1.553068836893862e-05, "loss": 0.1022, "step": 52500 }, { "epoch": 53.91658189216684, "grad_norm": 0.29001641273498535, "learning_rate": 1.5361139369277723e-05, "loss": 0.1062, "step": 53000 }, { "epoch": 54.0, "eval_loss": 0.3048439025878906, "eval_mae": 0.32225465774536133, "eval_r2": 0.7680661678314209, "eval_rmse": 0.5521263480186462, "eval_runtime": 22.1784, "eval_samples_per_second": 450.889, "eval_steps_per_second": 3.562, "step": 53082 }, { "epoch": 54.425228891149544, "grad_norm": 0.2913689911365509, "learning_rate": 1.519159036961682e-05, "loss": 0.1012, "step": 53500 }, { "epoch": 54.93387589013225, "grad_norm": 0.27742624282836914, "learning_rate": 1.5022041369955919e-05, "loss": 0.1046, "step": 54000 }, { "epoch": 55.0, "eval_loss": 0.30284586548805237, "eval_mae": 0.3212890028953552, "eval_r2": 0.765329122543335, "eval_rmse": 0.5503140091896057, "eval_runtime": 21.5879, "eval_samples_per_second": 463.223, "eval_steps_per_second": 3.659, "step": 54065 }, { "epoch": 55.442522889114954, "grad_norm": 0.24787943065166473, "learning_rate": 1.4852492370295015e-05, "loss": 0.1032, "step": 54500 }, { "epoch": 55.95116988809766, "grad_norm": 0.3004557192325592, "learning_rate": 1.4682943370634113e-05, "loss": 0.1034, "step": 55000 }, { "epoch": 56.0, "eval_loss": 0.30553364753723145, "eval_mae": 0.3226454555988312, "eval_r2": 0.7709420919418335, "eval_rmse": 0.5527505874633789, "eval_runtime": 20.8948, "eval_samples_per_second": 478.589, "eval_steps_per_second": 3.781, "step": 55048 }, { "epoch": 56.459816887080365, "grad_norm": 0.290238618850708, "learning_rate": 1.4513394370973212e-05, "loss": 0.1011, "step": 55500 }, { "epoch": 56.96846388606307, "grad_norm": 0.34532827138900757, "learning_rate": 1.434384537131231e-05, "loss": 0.1047, "step": 56000 }, { "epoch": 57.0, "eval_loss": 0.3043980598449707, "eval_mae": 0.32213300466537476, "eval_r2": 0.764991283416748, "eval_rmse": 0.5517224073410034, "eval_runtime": 21.1266, "eval_samples_per_second": 473.337, "eval_steps_per_second": 3.739, "step": 56031 }, { "epoch": 57.477110885045775, "grad_norm": 0.2498069405555725, "learning_rate": 1.4174296371651406e-05, "loss": 0.0988, "step": 56500 }, { "epoch": 57.98575788402849, "grad_norm": 0.34196507930755615, "learning_rate": 1.4004747371990506e-05, "loss": 0.1048, "step": 57000 }, { "epoch": 58.0, "eval_loss": 0.3069715201854706, "eval_mae": 0.3227607011795044, "eval_r2": 0.7700543403625488, "eval_rmse": 0.5540497303009033, "eval_runtime": 22.6147, "eval_samples_per_second": 442.191, "eval_steps_per_second": 3.493, "step": 57014 }, { "epoch": 58.49440488301119, "grad_norm": 0.2622218132019043, "learning_rate": 1.3835198372329604e-05, "loss": 0.0996, "step": 57500 }, { "epoch": 59.0, "eval_loss": 0.30414652824401855, "eval_mae": 0.3215589225292206, "eval_r2": 0.769301176071167, "eval_rmse": 0.5514944791793823, "eval_runtime": 21.6832, "eval_samples_per_second": 461.187, "eval_steps_per_second": 3.643, "step": 57997 }, { "epoch": 59.0030518819939, "grad_norm": 0.2528667151927948, "learning_rate": 1.3665649372668702e-05, "loss": 0.104, "step": 58000 }, { "epoch": 59.5116988809766, "grad_norm": 0.3110768795013428, "learning_rate": 1.34961003730078e-05, "loss": 0.1009, "step": 58500 }, { "epoch": 60.0, "eval_loss": 0.30486321449279785, "eval_mae": 0.3213992714881897, "eval_r2": 0.7690669298171997, "eval_rmse": 0.5521438121795654, "eval_runtime": 22.881, "eval_samples_per_second": 437.043, "eval_steps_per_second": 3.453, "step": 58980 }, { "epoch": 60.02034587995931, "grad_norm": 0.23977969586849213, "learning_rate": 1.3326551373346898e-05, "loss": 0.1014, "step": 59000 }, { "epoch": 60.52899287894201, "grad_norm": 0.28639206290245056, "learning_rate": 1.3157002373685996e-05, "loss": 0.0985, "step": 59500 }, { "epoch": 61.0, "eval_loss": 0.30517521500587463, "eval_mae": 0.3211789131164551, "eval_r2": 0.7687903642654419, "eval_rmse": 0.552426278591156, "eval_runtime": 22.942, "eval_samples_per_second": 435.882, "eval_steps_per_second": 3.443, "step": 59963 }, { "epoch": 61.03763987792472, "grad_norm": 0.25792455673217773, "learning_rate": 1.2987453374025093e-05, "loss": 0.1033, "step": 60000 }, { "epoch": 61.54628687690742, "grad_norm": 0.2544417977333069, "learning_rate": 1.281790437436419e-05, "loss": 0.0986, "step": 60500 }, { "epoch": 62.0, "eval_loss": 0.3043474555015564, "eval_mae": 0.320587694644928, "eval_r2": 0.7671276330947876, "eval_rmse": 0.5516765713691711, "eval_runtime": 22.599, "eval_samples_per_second": 442.498, "eval_steps_per_second": 3.496, "step": 60946 }, { "epoch": 62.054933875890136, "grad_norm": 0.23652108013629913, "learning_rate": 1.2648355374703289e-05, "loss": 0.1016, "step": 61000 }, { "epoch": 62.56358087487284, "grad_norm": 0.2460223287343979, "learning_rate": 1.2478806375042387e-05, "loss": 0.0974, "step": 61500 }, { "epoch": 63.0, "eval_loss": 0.3064461946487427, "eval_mae": 0.3217301368713379, "eval_r2": 0.7658741474151611, "eval_rmse": 0.5535755157470703, "eval_runtime": 21.5823, "eval_samples_per_second": 463.343, "eval_steps_per_second": 3.66, "step": 61929 }, { "epoch": 63.072227873855546, "grad_norm": 0.263189435005188, "learning_rate": 1.2309257375381485e-05, "loss": 0.102, "step": 62000 }, { "epoch": 63.58087487283825, "grad_norm": 0.28210920095443726, "learning_rate": 1.2139708375720583e-05, "loss": 0.1002, "step": 62500 }, { "epoch": 64.0, "eval_loss": 0.30489081144332886, "eval_mae": 0.3209025263786316, "eval_r2": 0.7646889686584473, "eval_rmse": 0.5521687865257263, "eval_runtime": 22.5558, "eval_samples_per_second": 443.346, "eval_steps_per_second": 3.502, "step": 62912 }, { "epoch": 64.08952187182095, "grad_norm": 0.28177881240844727, "learning_rate": 1.1970159376059683e-05, "loss": 0.1, "step": 63000 }, { "epoch": 64.59816887080366, "grad_norm": 0.2723771035671234, "learning_rate": 1.180061037639878e-05, "loss": 0.0984, "step": 63500 }, { "epoch": 65.0, "eval_loss": 0.3055484890937805, "eval_mae": 0.3205299973487854, "eval_r2": 0.7690364718437195, "eval_rmse": 0.5527639985084534, "eval_runtime": 23.8261, "eval_samples_per_second": 419.707, "eval_steps_per_second": 3.316, "step": 63895 }, { "epoch": 65.10681586978637, "grad_norm": 0.2683132588863373, "learning_rate": 1.1631061376737877e-05, "loss": 0.0996, "step": 64000 }, { "epoch": 65.61546286876907, "grad_norm": 0.2997967004776001, "learning_rate": 1.1461512377076976e-05, "loss": 0.0985, "step": 64500 }, { "epoch": 66.0, "eval_loss": 0.30802544951438904, "eval_mae": 0.3224177062511444, "eval_r2": 0.7694234848022461, "eval_rmse": 0.5550000071525574, "eval_runtime": 22.5937, "eval_samples_per_second": 442.602, "eval_steps_per_second": 3.497, "step": 64878 }, { "epoch": 66.12410986775178, "grad_norm": 0.2792266309261322, "learning_rate": 1.1291963377416074e-05, "loss": 0.1004, "step": 65000 }, { "epoch": 66.63275686673448, "grad_norm": 0.2717229425907135, "learning_rate": 1.1122414377755172e-05, "loss": 0.0987, "step": 65500 }, { "epoch": 67.0, "eval_loss": 0.3065015971660614, "eval_mae": 0.3211498260498047, "eval_r2": 0.76569664478302, "eval_rmse": 0.5536254644393921, "eval_runtime": 21.705, "eval_samples_per_second": 460.722, "eval_steps_per_second": 3.64, "step": 65861 }, { "epoch": 67.1414038657172, "grad_norm": 0.2361566722393036, "learning_rate": 1.095286537809427e-05, "loss": 0.0983, "step": 66000 }, { "epoch": 67.65005086469989, "grad_norm": 0.2546670436859131, "learning_rate": 1.0783316378433368e-05, "loss": 0.0977, "step": 66500 }, { "epoch": 68.0, "eval_loss": 0.30562883615493774, "eval_mae": 0.32023167610168457, "eval_r2": 0.7649646997451782, "eval_rmse": 0.5528367161750793, "eval_runtime": 20.8457, "eval_samples_per_second": 479.716, "eval_steps_per_second": 3.79, "step": 66844 }, { "epoch": 68.1586978636826, "grad_norm": 0.24419383704662323, "learning_rate": 1.0613767378772464e-05, "loss": 0.0982, "step": 67000 }, { "epoch": 68.66734486266532, "grad_norm": 0.27351826429367065, "learning_rate": 1.0444218379111562e-05, "loss": 0.0986, "step": 67500 }, { "epoch": 69.0, "eval_loss": 0.3047651946544647, "eval_mae": 0.32021564245224, "eval_r2": 0.7680791020393372, "eval_rmse": 0.5520549416542053, "eval_runtime": 21.4713, "eval_samples_per_second": 465.738, "eval_steps_per_second": 3.679, "step": 67827 }, { "epoch": 69.17599186164801, "grad_norm": 0.2680034339427948, "learning_rate": 1.027466937945066e-05, "loss": 0.0986, "step": 68000 }, { "epoch": 69.68463886063073, "grad_norm": 0.2814819812774658, "learning_rate": 1.010512037978976e-05, "loss": 0.0984, "step": 68500 }, { "epoch": 70.0, "eval_loss": 0.307124525308609, "eval_mae": 0.3204113841056824, "eval_r2": 0.7679520845413208, "eval_rmse": 0.5541877150535583, "eval_runtime": 22.1583, "eval_samples_per_second": 451.298, "eval_steps_per_second": 3.565, "step": 68810 }, { "epoch": 70.19328585961343, "grad_norm": 0.27025556564331055, "learning_rate": 9.935571380128858e-06, "loss": 0.0964, "step": 69000 }, { "epoch": 70.70193285859614, "grad_norm": 0.2899576723575592, "learning_rate": 9.766022380467957e-06, "loss": 0.0966, "step": 69500 }, { "epoch": 71.0, "eval_loss": 0.3074840307235718, "eval_mae": 0.3207957148551941, "eval_r2": 0.7695941925048828, "eval_rmse": 0.5545119643211365, "eval_runtime": 23.0374, "eval_samples_per_second": 434.077, "eval_steps_per_second": 3.429, "step": 69793 }, { "epoch": 71.21057985757884, "grad_norm": 0.2534237205982208, "learning_rate": 9.596473380807055e-06, "loss": 0.0985, "step": 70000 }, { "epoch": 71.71922685656155, "grad_norm": 0.2650779187679291, "learning_rate": 9.426924381146151e-06, "loss": 0.0978, "step": 70500 }, { "epoch": 72.0, "eval_loss": 0.3064354956150055, "eval_mae": 0.3201001286506653, "eval_r2": 0.7672066688537598, "eval_rmse": 0.5535657405853271, "eval_runtime": 23.7676, "eval_samples_per_second": 420.74, "eval_steps_per_second": 3.324, "step": 70776 }, { "epoch": 72.22787385554425, "grad_norm": 0.2299046665430069, "learning_rate": 9.257375381485249e-06, "loss": 0.0966, "step": 71000 }, { "epoch": 72.73652085452696, "grad_norm": 0.25810402631759644, "learning_rate": 9.087826381824347e-06, "loss": 0.0962, "step": 71500 }, { "epoch": 73.0, "eval_loss": 0.3067697286605835, "eval_mae": 0.32059645652770996, "eval_r2": 0.770045280456543, "eval_rmse": 0.553867518901825, "eval_runtime": 23.7289, "eval_samples_per_second": 421.427, "eval_steps_per_second": 3.329, "step": 71759 }, { "epoch": 73.24516785350967, "grad_norm": 0.2593117356300354, "learning_rate": 8.918277382163445e-06, "loss": 0.097, "step": 72000 }, { "epoch": 73.75381485249237, "grad_norm": 0.28874120116233826, "learning_rate": 8.748728382502543e-06, "loss": 0.0967, "step": 72500 }, { "epoch": 74.0, "eval_loss": 0.30635932087898254, "eval_mae": 0.31995031237602234, "eval_r2": 0.7648576498031616, "eval_rmse": 0.5534969568252563, "eval_runtime": 22.8055, "eval_samples_per_second": 438.49, "eval_steps_per_second": 3.464, "step": 72742 }, { "epoch": 74.26246185147508, "grad_norm": 0.2876887917518616, "learning_rate": 8.579179382841641e-06, "loss": 0.0955, "step": 73000 }, { "epoch": 74.77110885045778, "grad_norm": 0.2901298701763153, "learning_rate": 8.409630383180738e-06, "loss": 0.0982, "step": 73500 }, { "epoch": 75.0, "eval_loss": 0.3065277338027954, "eval_mae": 0.3198009133338928, "eval_r2": 0.7686657905578613, "eval_rmse": 0.5536490678787231, "eval_runtime": 21.8575, "eval_samples_per_second": 457.509, "eval_steps_per_second": 3.614, "step": 73725 }, { "epoch": 75.27975584944049, "grad_norm": 0.23049291968345642, "learning_rate": 8.240081383519838e-06, "loss": 0.096, "step": 74000 }, { "epoch": 75.78840284842319, "grad_norm": 0.2709825932979584, "learning_rate": 8.070532383858936e-06, "loss": 0.0954, "step": 74500 }, { "epoch": 76.0, "eval_loss": 0.30666035413742065, "eval_mae": 0.32015907764434814, "eval_r2": 0.7675079107284546, "eval_rmse": 0.5537688136100769, "eval_runtime": 23.6719, "eval_samples_per_second": 422.441, "eval_steps_per_second": 3.337, "step": 74708 }, { "epoch": 76.2970498474059, "grad_norm": 0.21834221482276917, "learning_rate": 7.900983384198034e-06, "loss": 0.0954, "step": 75000 }, { "epoch": 76.8056968463886, "grad_norm": 0.20948007702827454, "learning_rate": 7.731434384537132e-06, "loss": 0.097, "step": 75500 }, { "epoch": 77.0, "eval_loss": 0.30686530470848083, "eval_mae": 0.3204137682914734, "eval_r2": 0.7677739858627319, "eval_rmse": 0.5539537668228149, "eval_runtime": 21.4205, "eval_samples_per_second": 466.843, "eval_steps_per_second": 3.688, "step": 75691 }, { "epoch": 77.31434384537131, "grad_norm": 0.23167894780635834, "learning_rate": 7.56188538487623e-06, "loss": 0.0932, "step": 76000 }, { "epoch": 77.82299084435402, "grad_norm": 0.24640928208827972, "learning_rate": 7.392336385215327e-06, "loss": 0.0977, "step": 76500 }, { "epoch": 78.0, "eval_loss": 0.3072910010814667, "eval_mae": 0.32037079334259033, "eval_r2": 0.7697309255599976, "eval_rmse": 0.5543379187583923, "eval_runtime": 19.5509, "eval_samples_per_second": 511.487, "eval_steps_per_second": 4.041, "step": 76674 }, { "epoch": 78.33163784333672, "grad_norm": 0.2685850262641907, "learning_rate": 7.222787385554425e-06, "loss": 0.0942, "step": 77000 }, { "epoch": 78.84028484231943, "grad_norm": 0.2667245864868164, "learning_rate": 7.0532383858935235e-06, "loss": 0.0965, "step": 77500 }, { "epoch": 79.0, "eval_loss": 0.30803820490837097, "eval_mae": 0.32020050287246704, "eval_r2": 0.7687854170799255, "eval_rmse": 0.555011510848999, "eval_runtime": 23.1481, "eval_samples_per_second": 432.0, "eval_steps_per_second": 3.413, "step": 77657 }, { "epoch": 79.34893184130213, "grad_norm": 0.20989225804805756, "learning_rate": 6.883689386232621e-06, "loss": 0.0959, "step": 78000 }, { "epoch": 79.85757884028484, "grad_norm": 0.2710280418395996, "learning_rate": 6.71414038657172e-06, "loss": 0.0945, "step": 78500 }, { "epoch": 80.0, "eval_loss": 0.30728211998939514, "eval_mae": 0.3196803033351898, "eval_r2": 0.7670853137969971, "eval_rmse": 0.5543298125267029, "eval_runtime": 22.6974, "eval_samples_per_second": 440.579, "eval_steps_per_second": 3.481, "step": 78640 }, { "epoch": 80.36622583926754, "grad_norm": 0.23559938371181488, "learning_rate": 6.544591386910818e-06, "loss": 0.0933, "step": 79000 }, { "epoch": 80.87487283825025, "grad_norm": 0.2247001826763153, "learning_rate": 6.375042387249915e-06, "loss": 0.0955, "step": 79500 }, { "epoch": 81.0, "eval_loss": 0.3074042499065399, "eval_mae": 0.31953954696655273, "eval_r2": 0.7664982676506042, "eval_rmse": 0.5544400215148926, "eval_runtime": 21.5373, "eval_samples_per_second": 464.311, "eval_steps_per_second": 3.668, "step": 79623 }, { "epoch": 81.38351983723297, "grad_norm": 0.23154723644256592, "learning_rate": 6.205493387589013e-06, "loss": 0.0928, "step": 80000 }, { "epoch": 81.89216683621567, "grad_norm": 0.2460862100124359, "learning_rate": 6.035944387928111e-06, "loss": 0.0957, "step": 80500 }, { "epoch": 82.0, "eval_loss": 0.30816981196403503, "eval_mae": 0.31992191076278687, "eval_r2": 0.7665845155715942, "eval_rmse": 0.5551300644874573, "eval_runtime": 20.0656, "eval_samples_per_second": 498.366, "eval_steps_per_second": 3.937, "step": 80606 }, { "epoch": 82.40081383519838, "grad_norm": 0.22507379949092865, "learning_rate": 5.866395388267209e-06, "loss": 0.0937, "step": 81000 }, { "epoch": 82.90946083418108, "grad_norm": 0.26125577092170715, "learning_rate": 5.696846388606307e-06, "loss": 0.095, "step": 81500 }, { "epoch": 83.0, "eval_loss": 0.3074418008327484, "eval_mae": 0.31970179080963135, "eval_r2": 0.7669422626495361, "eval_rmse": 0.5544738173484802, "eval_runtime": 23.0334, "eval_samples_per_second": 434.153, "eval_steps_per_second": 3.43, "step": 81589 }, { "epoch": 83.41810783316379, "grad_norm": 0.2263702154159546, "learning_rate": 5.5272973889454055e-06, "loss": 0.0938, "step": 82000 }, { "epoch": 83.92675483214649, "grad_norm": 0.2645528018474579, "learning_rate": 5.357748389284504e-06, "loss": 0.094, "step": 82500 }, { "epoch": 84.0, "eval_loss": 0.30633336305618286, "eval_mae": 0.3188663721084595, "eval_r2": 0.7666647434234619, "eval_rmse": 0.5534735321998596, "eval_runtime": 23.049, "eval_samples_per_second": 433.859, "eval_steps_per_second": 3.427, "step": 82572 }, { "epoch": 84.4354018311292, "grad_norm": 0.21599704027175903, "learning_rate": 5.188199389623601e-06, "loss": 0.0943, "step": 83000 }, { "epoch": 84.9440488301119, "grad_norm": 0.21557337045669556, "learning_rate": 5.018650389962699e-06, "loss": 0.0932, "step": 83500 }, { "epoch": 85.0, "eval_loss": 0.30662301182746887, "eval_mae": 0.3191204369068146, "eval_r2": 0.7680181264877319, "eval_rmse": 0.5537351369857788, "eval_runtime": 22.4327, "eval_samples_per_second": 445.777, "eval_steps_per_second": 3.522, "step": 83555 }, { "epoch": 85.45269582909461, "grad_norm": 0.2520500123500824, "learning_rate": 4.849101390301798e-06, "loss": 0.0936, "step": 84000 }, { "epoch": 85.96134282807732, "grad_norm": 0.24406184256076813, "learning_rate": 4.679552390640896e-06, "loss": 0.0934, "step": 84500 }, { "epoch": 86.0, "eval_loss": 0.3075953722000122, "eval_mae": 0.31922364234924316, "eval_r2": 0.7682535648345947, "eval_rmse": 0.554612398147583, "eval_runtime": 24.2058, "eval_samples_per_second": 413.124, "eval_steps_per_second": 3.264, "step": 84538 }, { "epoch": 86.46998982706002, "grad_norm": 0.29262858629226685, "learning_rate": 4.510003390979993e-06, "loss": 0.0925, "step": 85000 }, { "epoch": 86.97863682604273, "grad_norm": 0.23719151318073273, "learning_rate": 4.340454391319091e-06, "loss": 0.0946, "step": 85500 }, { "epoch": 87.0, "eval_loss": 0.30780571699142456, "eval_mae": 0.3192029595375061, "eval_r2": 0.767227292060852, "eval_rmse": 0.5548020005226135, "eval_runtime": 21.1225, "eval_samples_per_second": 473.43, "eval_steps_per_second": 3.74, "step": 85521 }, { "epoch": 87.48728382502543, "grad_norm": 0.2712896168231964, "learning_rate": 4.170905391658189e-06, "loss": 0.0922, "step": 86000 }, { "epoch": 87.99593082400814, "grad_norm": 0.22555504739284515, "learning_rate": 4.001356391997287e-06, "loss": 0.0939, "step": 86500 }, { "epoch": 88.0, "eval_loss": 0.3082311153411865, "eval_mae": 0.3195423483848572, "eval_r2": 0.7683557271957397, "eval_rmse": 0.5551851987838745, "eval_runtime": 21.258, "eval_samples_per_second": 470.412, "eval_steps_per_second": 3.716, "step": 86504 }, { "epoch": 88.50457782299084, "grad_norm": 0.25731492042541504, "learning_rate": 3.831807392336386e-06, "loss": 0.0917, "step": 87000 }, { "epoch": 89.0, "eval_loss": 0.30769726634025574, "eval_mae": 0.3188689947128296, "eval_r2": 0.7664185166358948, "eval_rmse": 0.5547041893005371, "eval_runtime": 22.8134, "eval_samples_per_second": 438.338, "eval_steps_per_second": 3.463, "step": 87487 }, { "epoch": 89.01322482197355, "grad_norm": 0.21790558099746704, "learning_rate": 3.6622583926754837e-06, "loss": 0.0933, "step": 87500 }, { "epoch": 89.52187182095625, "grad_norm": 0.20840144157409668, "learning_rate": 3.4927093930145814e-06, "loss": 0.0913, "step": 88000 }, { "epoch": 90.0, "eval_loss": 0.3072252869606018, "eval_mae": 0.31897908449172974, "eval_r2": 0.7661517858505249, "eval_rmse": 0.5542786717414856, "eval_runtime": 22.7075, "eval_samples_per_second": 440.383, "eval_steps_per_second": 3.479, "step": 88470 }, { "epoch": 90.03051881993896, "grad_norm": 0.23747815191745758, "learning_rate": 3.323160393353679e-06, "loss": 0.0941, "step": 88500 }, { "epoch": 90.53916581892167, "grad_norm": 0.2666161358356476, "learning_rate": 3.1536113936927776e-06, "loss": 0.0913, "step": 89000 }, { "epoch": 91.0, "eval_loss": 0.30819734930992126, "eval_mae": 0.3193185329437256, "eval_r2": 0.7667792439460754, "eval_rmse": 0.5551548600196838, "eval_runtime": 23.0912, "eval_samples_per_second": 433.065, "eval_steps_per_second": 3.421, "step": 89453 }, { "epoch": 91.04781281790437, "grad_norm": 0.2510707676410675, "learning_rate": 2.9840623940318752e-06, "loss": 0.0932, "step": 89500 }, { "epoch": 91.55645981688708, "grad_norm": 0.2492397129535675, "learning_rate": 2.8145133943709733e-06, "loss": 0.0915, "step": 90000 }, { "epoch": 92.0, "eval_loss": 0.3087303936481476, "eval_mae": 0.3195701241493225, "eval_r2": 0.767082929611206, "eval_rmse": 0.5556347370147705, "eval_runtime": 21.4213, "eval_samples_per_second": 466.825, "eval_steps_per_second": 3.688, "step": 90436 }, { "epoch": 92.06510681586978, "grad_norm": 0.2138824164867401, "learning_rate": 2.6449643947100714e-06, "loss": 0.0921, "step": 90500 }, { "epoch": 92.5737538148525, "grad_norm": 0.2821875810623169, "learning_rate": 2.475415395049169e-06, "loss": 0.091, "step": 91000 }, { "epoch": 93.0, "eval_loss": 0.30839282274246216, "eval_mae": 0.319232702255249, "eval_r2": 0.7667563557624817, "eval_rmse": 0.5553308725357056, "eval_runtime": 21.2783, "eval_samples_per_second": 469.961, "eval_steps_per_second": 3.713, "step": 91419 }, { "epoch": 93.08240081383519, "grad_norm": 0.2118423730134964, "learning_rate": 2.305866395388267e-06, "loss": 0.0929, "step": 91500 }, { "epoch": 93.5910478128179, "grad_norm": 0.22796212136745453, "learning_rate": 2.1363173957273653e-06, "loss": 0.0913, "step": 92000 }, { "epoch": 94.0, "eval_loss": 0.3086921274662018, "eval_mae": 0.3193605840206146, "eval_r2": 0.766891598701477, "eval_rmse": 0.5556001663208008, "eval_runtime": 23.1152, "eval_samples_per_second": 432.616, "eval_steps_per_second": 3.418, "step": 92402 }, { "epoch": 94.09969481180062, "grad_norm": 0.2285338044166565, "learning_rate": 1.9667683960664634e-06, "loss": 0.0917, "step": 92500 }, { "epoch": 94.60834181078332, "grad_norm": 0.20946064591407776, "learning_rate": 1.7972193964055613e-06, "loss": 0.0911, "step": 93000 }, { "epoch": 95.0, "eval_loss": 0.3085222542285919, "eval_mae": 0.3191923499107361, "eval_r2": 0.7661963701248169, "eval_rmse": 0.5554474592208862, "eval_runtime": 23.4962, "eval_samples_per_second": 425.601, "eval_steps_per_second": 3.362, "step": 93385 }, { "epoch": 95.11698880976603, "grad_norm": 0.269754558801651, "learning_rate": 1.6276703967446594e-06, "loss": 0.0923, "step": 93500 }, { "epoch": 95.62563580874873, "grad_norm": 0.2517547309398651, "learning_rate": 1.4581213970837572e-06, "loss": 0.0912, "step": 94000 }, { "epoch": 96.0, "eval_loss": 0.3087967038154602, "eval_mae": 0.3190614581108093, "eval_r2": 0.7664559483528137, "eval_rmse": 0.5556943416595459, "eval_runtime": 21.3044, "eval_samples_per_second": 469.387, "eval_steps_per_second": 3.708, "step": 94368 }, { "epoch": 96.13428280773144, "grad_norm": 0.200937882065773, "learning_rate": 1.2885723974228553e-06, "loss": 0.091, "step": 94500 }, { "epoch": 96.64292980671414, "grad_norm": 0.2715360224246979, "learning_rate": 1.1190233977619532e-06, "loss": 0.0904, "step": 95000 }, { "epoch": 97.0, "eval_loss": 0.30910882353782654, "eval_mae": 0.3192349076271057, "eval_r2": 0.7662383317947388, "eval_rmse": 0.5559751987457275, "eval_runtime": 21.1316, "eval_samples_per_second": 473.224, "eval_steps_per_second": 3.738, "step": 95351 }, { "epoch": 97.15157680569685, "grad_norm": 0.24004267156124115, "learning_rate": 9.494743981010512e-07, "loss": 0.09, "step": 95500 }, { "epoch": 97.66022380467955, "grad_norm": 0.28319135308265686, "learning_rate": 7.799253984401492e-07, "loss": 0.0914, "step": 96000 }, { "epoch": 98.0, "eval_loss": 0.3087212145328522, "eval_mae": 0.3190605640411377, "eval_r2": 0.7665444016456604, "eval_rmse": 0.5556263327598572, "eval_runtime": 18.6312, "eval_samples_per_second": 536.734, "eval_steps_per_second": 4.24, "step": 96334 }, { "epoch": 98.16887080366226, "grad_norm": 0.3092040419578552, "learning_rate": 6.103763987792473e-07, "loss": 0.0906, "step": 96500 }, { "epoch": 98.67751780264497, "grad_norm": 0.27111703157424927, "learning_rate": 4.408273991183452e-07, "loss": 0.0894, "step": 97000 }, { "epoch": 99.0, "eval_loss": 0.3089219331741333, "eval_mae": 0.3191257417201996, "eval_r2": 0.7667442560195923, "eval_rmse": 0.5558070540428162, "eval_runtime": 18.7902, "eval_samples_per_second": 532.191, "eval_steps_per_second": 4.204, "step": 97317 }, { "epoch": 99.18616480162767, "grad_norm": 0.2768039405345917, "learning_rate": 2.712783994574432e-07, "loss": 0.0919, "step": 97500 }, { "epoch": 99.69481180061038, "grad_norm": 0.19111211597919464, "learning_rate": 1.0172939979654121e-07, "loss": 0.0896, "step": 98000 }, { "epoch": 100.0, "eval_loss": 0.30905383825302124, "eval_mae": 0.3191269636154175, "eval_r2": 0.7666448354721069, "eval_rmse": 0.5559256076812744, "eval_runtime": 18.8185, "eval_samples_per_second": 531.391, "eval_steps_per_second": 4.198, "step": 98300 } ], "logging_steps": 500, "max_steps": 98300, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.3115044487280154e+17, "train_batch_size": 128, "trial_name": null, "trial_params": null }