multimat-modernbert / trainer_state.json
korolewadim's picture
Upload folder using huggingface_hub
972a6b3 verified
{
"best_metric": 0.28622403740882874,
"best_model_checkpoint": "embed/MP_modbert_embed_voc1_0_100_MP/checkpoint-15728",
"epoch": 100.0,
"eval_steps": 500,
"global_step": 98300,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.508646998982706,
"grad_norm": 0.3106672167778015,
"learning_rate": 1.5259409969481182e-06,
"loss": 1.541,
"step": 500
},
{
"epoch": 1.0,
"eval_loss": 1.2760541439056396,
"eval_mae": 0.8460478782653809,
"eval_r2": -7.616851806640625,
"eval_rmse": 1.1296252012252808,
"eval_runtime": 20.7046,
"eval_samples_per_second": 482.984,
"eval_steps_per_second": 3.816,
"step": 983
},
{
"epoch": 1.017293997965412,
"grad_norm": 0.6856330037117004,
"learning_rate": 3.0518819938962364e-06,
"loss": 1.3676,
"step": 1000
},
{
"epoch": 1.5259409969481181,
"grad_norm": 1.0226482152938843,
"learning_rate": 4.577822990844354e-06,
"loss": 1.09,
"step": 1500
},
{
"epoch": 2.0,
"eval_loss": 0.7750646471977234,
"eval_mae": 0.6696741580963135,
"eval_r2": -0.1967395395040512,
"eval_rmse": 0.8803772330284119,
"eval_runtime": 20.7332,
"eval_samples_per_second": 482.319,
"eval_steps_per_second": 3.81,
"step": 1966
},
{
"epoch": 2.034587995930824,
"grad_norm": 1.0323240756988525,
"learning_rate": 6.103763987792473e-06,
"loss": 0.8467,
"step": 2000
},
{
"epoch": 2.5432349949135302,
"grad_norm": 1.000579833984375,
"learning_rate": 7.62970498474059e-06,
"loss": 0.6796,
"step": 2500
},
{
"epoch": 3.0,
"eval_loss": 0.5403278470039368,
"eval_mae": 0.5509415864944458,
"eval_r2": 0.3976656198501587,
"eval_rmse": 0.7350696325302124,
"eval_runtime": 19.1568,
"eval_samples_per_second": 522.008,
"eval_steps_per_second": 4.124,
"step": 2949
},
{
"epoch": 3.051881993896236,
"grad_norm": 1.153286099433899,
"learning_rate": 9.155645981688708e-06,
"loss": 0.5626,
"step": 3000
},
{
"epoch": 3.560528992878942,
"grad_norm": 1.2380833625793457,
"learning_rate": 1.0681586978636825e-05,
"loss": 0.4871,
"step": 3500
},
{
"epoch": 4.0,
"eval_loss": 0.4411344826221466,
"eval_mae": 0.4877315163612366,
"eval_r2": 0.5509063005447388,
"eval_rmse": 0.6641793847084045,
"eval_runtime": 17.1276,
"eval_samples_per_second": 583.854,
"eval_steps_per_second": 4.612,
"step": 3932
},
{
"epoch": 4.069175991861648,
"grad_norm": 1.0829737186431885,
"learning_rate": 1.2207527975584946e-05,
"loss": 0.4344,
"step": 4000
},
{
"epoch": 4.577822990844354,
"grad_norm": 0.9966434240341187,
"learning_rate": 1.3733468972533063e-05,
"loss": 0.3966,
"step": 4500
},
{
"epoch": 5.0,
"eval_loss": 0.391430139541626,
"eval_mae": 0.44986167550086975,
"eval_r2": 0.6214621067047119,
"eval_rmse": 0.6256435513496399,
"eval_runtime": 19.311,
"eval_samples_per_second": 517.839,
"eval_steps_per_second": 4.091,
"step": 4915
},
{
"epoch": 5.0864699898270604,
"grad_norm": 1.2099027633666992,
"learning_rate": 1.525940996948118e-05,
"loss": 0.3747,
"step": 5000
},
{
"epoch": 5.595116988809766,
"grad_norm": 1.3580559492111206,
"learning_rate": 1.67853509664293e-05,
"loss": 0.3501,
"step": 5500
},
{
"epoch": 6.0,
"eval_loss": 0.35625067353248596,
"eval_mae": 0.42443710565567017,
"eval_r2": 0.6606752872467041,
"eval_rmse": 0.5968672037124634,
"eval_runtime": 19.6114,
"eval_samples_per_second": 509.909,
"eval_steps_per_second": 4.028,
"step": 5898
},
{
"epoch": 6.103763987792472,
"grad_norm": 1.0308208465576172,
"learning_rate": 1.8311291963377416e-05,
"loss": 0.3332,
"step": 6000
},
{
"epoch": 6.612410986775178,
"grad_norm": 1.0148053169250488,
"learning_rate": 1.9837232960325533e-05,
"loss": 0.3184,
"step": 6500
},
{
"epoch": 7.0,
"eval_loss": 0.3325399160385132,
"eval_mae": 0.4055442214012146,
"eval_r2": 0.714428186416626,
"eval_rmse": 0.576662540435791,
"eval_runtime": 19.3745,
"eval_samples_per_second": 516.143,
"eval_steps_per_second": 4.078,
"step": 6881
},
{
"epoch": 7.121057985757884,
"grad_norm": 1.2479807138442993,
"learning_rate": 2.136317395727365e-05,
"loss": 0.3061,
"step": 7000
},
{
"epoch": 7.62970498474059,
"grad_norm": 0.9779444932937622,
"learning_rate": 2.288911495422177e-05,
"loss": 0.2946,
"step": 7500
},
{
"epoch": 8.0,
"eval_loss": 0.3256845772266388,
"eval_mae": 0.39594584703445435,
"eval_r2": 0.7166282534599304,
"eval_rmse": 0.5706875920295715,
"eval_runtime": 20.9579,
"eval_samples_per_second": 477.147,
"eval_steps_per_second": 3.769,
"step": 7864
},
{
"epoch": 8.138351983723297,
"grad_norm": 0.9078311324119568,
"learning_rate": 2.441505595116989e-05,
"loss": 0.2871,
"step": 8000
},
{
"epoch": 8.646998982706002,
"grad_norm": 0.9875624775886536,
"learning_rate": 2.594099694811801e-05,
"loss": 0.276,
"step": 8500
},
{
"epoch": 9.0,
"eval_loss": 0.31457769870758057,
"eval_mae": 0.3851270079612732,
"eval_r2": 0.7076315879821777,
"eval_rmse": 0.5608720779418945,
"eval_runtime": 18.1766,
"eval_samples_per_second": 550.158,
"eval_steps_per_second": 4.346,
"step": 8847
},
{
"epoch": 9.155645981688709,
"grad_norm": 0.8023911714553833,
"learning_rate": 2.7466937945066126e-05,
"loss": 0.2669,
"step": 9000
},
{
"epoch": 9.664292980671414,
"grad_norm": 0.846149206161499,
"learning_rate": 2.8992878942014243e-05,
"loss": 0.26,
"step": 9500
},
{
"epoch": 10.0,
"eval_loss": 0.3037531077861786,
"eval_mae": 0.37509840726852417,
"eval_r2": 0.7389528155326843,
"eval_rmse": 0.5511377453804016,
"eval_runtime": 19.169,
"eval_samples_per_second": 521.676,
"eval_steps_per_second": 4.121,
"step": 9830
},
{
"epoch": 10.172939979654121,
"grad_norm": 1.080586552619934,
"learning_rate": 2.9942353340115293e-05,
"loss": 0.2499,
"step": 10000
},
{
"epoch": 10.681586978636826,
"grad_norm": 0.7065219879150391,
"learning_rate": 2.9772804340454395e-05,
"loss": 0.2452,
"step": 10500
},
{
"epoch": 11.0,
"eval_loss": 0.2963975965976715,
"eval_mae": 0.36469894647598267,
"eval_r2": 0.7409036159515381,
"eval_rmse": 0.5444238781929016,
"eval_runtime": 19.1182,
"eval_samples_per_second": 523.063,
"eval_steps_per_second": 4.132,
"step": 10813
},
{
"epoch": 11.190233977619531,
"grad_norm": 0.7577599287033081,
"learning_rate": 2.960325534079349e-05,
"loss": 0.2346,
"step": 11000
},
{
"epoch": 11.698880976602238,
"grad_norm": 0.9281987547874451,
"learning_rate": 2.9433706341132588e-05,
"loss": 0.2283,
"step": 11500
},
{
"epoch": 12.0,
"eval_loss": 0.2902641296386719,
"eval_mae": 0.3583831191062927,
"eval_r2": 0.7628405690193176,
"eval_rmse": 0.5387614965438843,
"eval_runtime": 20.6847,
"eval_samples_per_second": 483.45,
"eval_steps_per_second": 3.819,
"step": 11796
},
{
"epoch": 12.207527975584943,
"grad_norm": 0.7825191617012024,
"learning_rate": 2.9264157341471686e-05,
"loss": 0.2218,
"step": 12000
},
{
"epoch": 12.71617497456765,
"grad_norm": 0.7279273867607117,
"learning_rate": 2.9094608341810784e-05,
"loss": 0.2145,
"step": 12500
},
{
"epoch": 13.0,
"eval_loss": 0.2926931083202362,
"eval_mae": 0.35721156001091003,
"eval_r2": 0.7634265422821045,
"eval_rmse": 0.5410109758377075,
"eval_runtime": 19.8102,
"eval_samples_per_second": 504.789,
"eval_steps_per_second": 3.988,
"step": 12779
},
{
"epoch": 13.224821973550355,
"grad_norm": 0.7883431911468506,
"learning_rate": 2.8925059342149882e-05,
"loss": 0.2086,
"step": 13000
},
{
"epoch": 13.733468972533062,
"grad_norm": 0.7794452905654907,
"learning_rate": 2.875551034248898e-05,
"loss": 0.2023,
"step": 13500
},
{
"epoch": 14.0,
"eval_loss": 0.2874827980995178,
"eval_mae": 0.3508231043815613,
"eval_r2": 0.7408859133720398,
"eval_rmse": 0.5361739993095398,
"eval_runtime": 19.5699,
"eval_samples_per_second": 510.99,
"eval_steps_per_second": 4.037,
"step": 13762
},
{
"epoch": 14.242115971515767,
"grad_norm": 0.6752628684043884,
"learning_rate": 2.8585961342828075e-05,
"loss": 0.1954,
"step": 14000
},
{
"epoch": 14.750762970498474,
"grad_norm": 0.7037671804428101,
"learning_rate": 2.8416412343167176e-05,
"loss": 0.1927,
"step": 14500
},
{
"epoch": 15.0,
"eval_loss": 0.2897137701511383,
"eval_mae": 0.34881192445755005,
"eval_r2": 0.7613890171051025,
"eval_rmse": 0.5382503867149353,
"eval_runtime": 20.9472,
"eval_samples_per_second": 477.39,
"eval_steps_per_second": 3.771,
"step": 14745
},
{
"epoch": 15.25940996948118,
"grad_norm": 0.7775920033454895,
"learning_rate": 2.8246863343506274e-05,
"loss": 0.1852,
"step": 15000
},
{
"epoch": 15.768056968463886,
"grad_norm": 0.6624333262443542,
"learning_rate": 2.8077314343845372e-05,
"loss": 0.1846,
"step": 15500
},
{
"epoch": 16.0,
"eval_loss": 0.28622403740882874,
"eval_mae": 0.34425806999206543,
"eval_r2": 0.7593017816543579,
"eval_rmse": 0.5349988341331482,
"eval_runtime": 19.8037,
"eval_samples_per_second": 504.956,
"eval_steps_per_second": 3.989,
"step": 15728
},
{
"epoch": 16.276703967446593,
"grad_norm": 0.6602976322174072,
"learning_rate": 2.790776534418447e-05,
"loss": 0.1776,
"step": 16000
},
{
"epoch": 16.7853509664293,
"grad_norm": 0.6582776308059692,
"learning_rate": 2.773821634452357e-05,
"loss": 0.1762,
"step": 16500
},
{
"epoch": 17.0,
"eval_loss": 0.28853750228881836,
"eval_mae": 0.3421522378921509,
"eval_r2": 0.7615672945976257,
"eval_rmse": 0.5371565818786621,
"eval_runtime": 20.8644,
"eval_samples_per_second": 479.286,
"eval_steps_per_second": 3.786,
"step": 16711
},
{
"epoch": 17.293997965412004,
"grad_norm": 0.7847622632980347,
"learning_rate": 2.7568667344862667e-05,
"loss": 0.1677,
"step": 17000
},
{
"epoch": 17.80264496439471,
"grad_norm": 0.7339199185371399,
"learning_rate": 2.739911834520176e-05,
"loss": 0.1694,
"step": 17500
},
{
"epoch": 18.0,
"eval_loss": 0.29071176052093506,
"eval_mae": 0.34257909655570984,
"eval_r2": 0.7714438438415527,
"eval_rmse": 0.5391767024993896,
"eval_runtime": 19.1023,
"eval_samples_per_second": 523.496,
"eval_steps_per_second": 4.136,
"step": 17694
},
{
"epoch": 18.311291963377418,
"grad_norm": 0.7355333566665649,
"learning_rate": 2.7229569345540863e-05,
"loss": 0.164,
"step": 18000
},
{
"epoch": 18.819938962360123,
"grad_norm": 0.6125873923301697,
"learning_rate": 2.7060020345879958e-05,
"loss": 0.1621,
"step": 18500
},
{
"epoch": 19.0,
"eval_loss": 0.2921219766139984,
"eval_mae": 0.34166020154953003,
"eval_r2": 0.771050214767456,
"eval_rmse": 0.5404828786849976,
"eval_runtime": 18.7584,
"eval_samples_per_second": 533.096,
"eval_steps_per_second": 4.211,
"step": 18677
},
{
"epoch": 19.328585961342828,
"grad_norm": 0.533932626247406,
"learning_rate": 2.689047134621906e-05,
"loss": 0.1574,
"step": 19000
},
{
"epoch": 19.837232960325533,
"grad_norm": 0.5610256791114807,
"learning_rate": 2.6720922346558154e-05,
"loss": 0.1585,
"step": 19500
},
{
"epoch": 20.0,
"eval_loss": 0.2904634177684784,
"eval_mae": 0.3388892710208893,
"eval_r2": 0.759547233581543,
"eval_rmse": 0.5389463305473328,
"eval_runtime": 18.6352,
"eval_samples_per_second": 536.618,
"eval_steps_per_second": 4.239,
"step": 19660
},
{
"epoch": 20.345879959308242,
"grad_norm": 0.639903724193573,
"learning_rate": 2.6551373346897255e-05,
"loss": 0.1518,
"step": 20000
},
{
"epoch": 20.854526958290947,
"grad_norm": 0.6409148573875427,
"learning_rate": 2.6381824347236353e-05,
"loss": 0.1538,
"step": 20500
},
{
"epoch": 21.0,
"eval_loss": 0.28870195150375366,
"eval_mae": 0.3359060287475586,
"eval_r2": 0.7662752866744995,
"eval_rmse": 0.5373095870018005,
"eval_runtime": 19.483,
"eval_samples_per_second": 513.267,
"eval_steps_per_second": 4.055,
"step": 20643
},
{
"epoch": 21.363173957273652,
"grad_norm": 0.6030653715133667,
"learning_rate": 2.6212275347575448e-05,
"loss": 0.1454,
"step": 21000
},
{
"epoch": 21.871820956256357,
"grad_norm": 0.5733378529548645,
"learning_rate": 2.604272634791455e-05,
"loss": 0.1512,
"step": 21500
},
{
"epoch": 22.0,
"eval_loss": 0.2888459265232086,
"eval_mae": 0.336772620677948,
"eval_r2": 0.7591472268104553,
"eval_rmse": 0.5374436974525452,
"eval_runtime": 19.7086,
"eval_samples_per_second": 507.392,
"eval_steps_per_second": 4.008,
"step": 21626
},
{
"epoch": 22.380467955239062,
"grad_norm": 0.5780584812164307,
"learning_rate": 2.5873177348253644e-05,
"loss": 0.1426,
"step": 22000
},
{
"epoch": 22.88911495422177,
"grad_norm": 0.6120862364768982,
"learning_rate": 2.5703628348592746e-05,
"loss": 0.1462,
"step": 22500
},
{
"epoch": 23.0,
"eval_loss": 0.28724002838134766,
"eval_mae": 0.3324105143547058,
"eval_r2": 0.7722494602203369,
"eval_rmse": 0.5359475016593933,
"eval_runtime": 19.302,
"eval_samples_per_second": 518.08,
"eval_steps_per_second": 4.093,
"step": 22609
},
{
"epoch": 23.397761953204476,
"grad_norm": 0.5332029461860657,
"learning_rate": 2.553407934893184e-05,
"loss": 0.1386,
"step": 23000
},
{
"epoch": 23.90640895218718,
"grad_norm": 0.5000107884407043,
"learning_rate": 2.5364530349270942e-05,
"loss": 0.1438,
"step": 23500
},
{
"epoch": 24.0,
"eval_loss": 0.2948116064071655,
"eval_mae": 0.33423200249671936,
"eval_r2": 0.771849513053894,
"eval_rmse": 0.542965292930603,
"eval_runtime": 16.3008,
"eval_samples_per_second": 613.466,
"eval_steps_per_second": 4.846,
"step": 23592
},
{
"epoch": 24.415055951169887,
"grad_norm": 0.4723941385746002,
"learning_rate": 2.5194981349610037e-05,
"loss": 0.1351,
"step": 24000
},
{
"epoch": 24.923702950152595,
"grad_norm": 0.5572139024734497,
"learning_rate": 2.5025432349949135e-05,
"loss": 0.1408,
"step": 24500
},
{
"epoch": 25.0,
"eval_loss": 0.28881922364234924,
"eval_mae": 0.3297838866710663,
"eval_r2": 0.7694551944732666,
"eval_rmse": 0.5374186635017395,
"eval_runtime": 16.1445,
"eval_samples_per_second": 619.406,
"eval_steps_per_second": 4.893,
"step": 24575
},
{
"epoch": 25.4323499491353,
"grad_norm": 0.5389395952224731,
"learning_rate": 2.4855883350288233e-05,
"loss": 0.1332,
"step": 25000
},
{
"epoch": 25.940996948118006,
"grad_norm": 0.7782149910926819,
"learning_rate": 2.468633435062733e-05,
"loss": 0.1374,
"step": 25500
},
{
"epoch": 26.0,
"eval_loss": 0.2942332625389099,
"eval_mae": 0.3317820429801941,
"eval_r2": 0.7754645347595215,
"eval_rmse": 0.5424323081970215,
"eval_runtime": 15.3961,
"eval_samples_per_second": 649.517,
"eval_steps_per_second": 5.131,
"step": 25558
},
{
"epoch": 26.44964394710071,
"grad_norm": 0.462462455034256,
"learning_rate": 2.4516785350966432e-05,
"loss": 0.1295,
"step": 26000
},
{
"epoch": 26.95829094608342,
"grad_norm": 0.5143025517463684,
"learning_rate": 2.4347236351305527e-05,
"loss": 0.137,
"step": 26500
},
{
"epoch": 27.0,
"eval_loss": 0.2935540974140167,
"eval_mae": 0.330686092376709,
"eval_r2": 0.767002284526825,
"eval_rmse": 0.5418060421943665,
"eval_runtime": 15.7155,
"eval_samples_per_second": 636.314,
"eval_steps_per_second": 5.027,
"step": 26541
},
{
"epoch": 27.466937945066125,
"grad_norm": 0.4412664473056793,
"learning_rate": 2.417768735164463e-05,
"loss": 0.1266,
"step": 27000
},
{
"epoch": 27.97558494404883,
"grad_norm": 0.5299028754234314,
"learning_rate": 2.4008138351983723e-05,
"loss": 0.1345,
"step": 27500
},
{
"epoch": 28.0,
"eval_loss": 0.29448285698890686,
"eval_mae": 0.33093565702438354,
"eval_r2": 0.7710368633270264,
"eval_rmse": 0.5426624417304993,
"eval_runtime": 15.225,
"eval_samples_per_second": 656.813,
"eval_steps_per_second": 5.189,
"step": 27524
},
{
"epoch": 28.484231943031535,
"grad_norm": 0.3932570219039917,
"learning_rate": 2.383858935232282e-05,
"loss": 0.126,
"step": 28000
},
{
"epoch": 28.992878942014244,
"grad_norm": 0.5242863893508911,
"learning_rate": 2.366904035266192e-05,
"loss": 0.1295,
"step": 28500
},
{
"epoch": 29.0,
"eval_loss": 0.29432228207588196,
"eval_mae": 0.3288809657096863,
"eval_r2": 0.7685636878013611,
"eval_rmse": 0.542514443397522,
"eval_runtime": 15.4546,
"eval_samples_per_second": 647.058,
"eval_steps_per_second": 5.112,
"step": 28507
},
{
"epoch": 29.50152594099695,
"grad_norm": 0.4471481144428253,
"learning_rate": 2.3499491353001018e-05,
"loss": 0.1237,
"step": 29000
},
{
"epoch": 30.0,
"eval_loss": 0.2978579103946686,
"eval_mae": 0.3290804624557495,
"eval_r2": 0.7655062675476074,
"eval_rmse": 0.5457631945610046,
"eval_runtime": 15.0561,
"eval_samples_per_second": 664.183,
"eval_steps_per_second": 5.247,
"step": 29490
},
{
"epoch": 30.010172939979654,
"grad_norm": 0.5170900821685791,
"learning_rate": 2.3329942353340116e-05,
"loss": 0.1289,
"step": 29500
},
{
"epoch": 30.51881993896236,
"grad_norm": 0.4907204806804657,
"learning_rate": 2.3160393353679214e-05,
"loss": 0.1207,
"step": 30000
},
{
"epoch": 31.0,
"eval_loss": 0.294801265001297,
"eval_mae": 0.32844942808151245,
"eval_r2": 0.7698482275009155,
"eval_rmse": 0.542955756187439,
"eval_runtime": 15.6533,
"eval_samples_per_second": 638.844,
"eval_steps_per_second": 5.047,
"step": 30473
},
{
"epoch": 31.027466937945068,
"grad_norm": 0.4819788932800293,
"learning_rate": 2.2990844354018312e-05,
"loss": 0.1285,
"step": 30500
},
{
"epoch": 31.536113936927773,
"grad_norm": 0.48075565695762634,
"learning_rate": 2.282129535435741e-05,
"loss": 0.1194,
"step": 31000
},
{
"epoch": 32.0,
"eval_loss": 0.2976591885089874,
"eval_mae": 0.3284505605697632,
"eval_r2": 0.770287275314331,
"eval_rmse": 0.5455812811851501,
"eval_runtime": 16.4511,
"eval_samples_per_second": 607.864,
"eval_steps_per_second": 4.802,
"step": 31456
},
{
"epoch": 32.044760935910475,
"grad_norm": 0.4494258165359497,
"learning_rate": 2.2651746354696508e-05,
"loss": 0.1254,
"step": 31500
},
{
"epoch": 32.55340793489319,
"grad_norm": 0.4671981632709503,
"learning_rate": 2.2482197355035606e-05,
"loss": 0.1193,
"step": 32000
},
{
"epoch": 33.0,
"eval_loss": 0.29970449209213257,
"eval_mae": 0.32844069600105286,
"eval_r2": 0.7684862613677979,
"eval_rmse": 0.5474523305892944,
"eval_runtime": 16.4076,
"eval_samples_per_second": 609.473,
"eval_steps_per_second": 4.815,
"step": 32439
},
{
"epoch": 33.06205493387589,
"grad_norm": 0.37166231870651245,
"learning_rate": 2.2312648355374704e-05,
"loss": 0.1237,
"step": 32500
},
{
"epoch": 33.5707019328586,
"grad_norm": 0.34825435280799866,
"learning_rate": 2.2143099355713802e-05,
"loss": 0.1178,
"step": 33000
},
{
"epoch": 34.0,
"eval_loss": 0.30024948716163635,
"eval_mae": 0.3275962471961975,
"eval_r2": 0.7728116512298584,
"eval_rmse": 0.5479499697685242,
"eval_runtime": 18.1242,
"eval_samples_per_second": 551.747,
"eval_steps_per_second": 4.359,
"step": 33422
},
{
"epoch": 34.0793489318413,
"grad_norm": 0.41295087337493896,
"learning_rate": 2.19735503560529e-05,
"loss": 0.1216,
"step": 33500
},
{
"epoch": 34.58799593082401,
"grad_norm": 0.4062606692314148,
"learning_rate": 2.1804001356392e-05,
"loss": 0.1172,
"step": 34000
},
{
"epoch": 35.0,
"eval_loss": 0.2986757755279541,
"eval_mae": 0.3264056444168091,
"eval_r2": 0.7658240795135498,
"eval_rmse": 0.5465120077133179,
"eval_runtime": 18.6496,
"eval_samples_per_second": 536.206,
"eval_steps_per_second": 4.236,
"step": 34405
},
{
"epoch": 35.09664292980671,
"grad_norm": 0.4476313889026642,
"learning_rate": 2.1634452356731097e-05,
"loss": 0.1198,
"step": 34500
},
{
"epoch": 35.60528992878942,
"grad_norm": 0.29763707518577576,
"learning_rate": 2.146490335707019e-05,
"loss": 0.1165,
"step": 35000
},
{
"epoch": 36.0,
"eval_loss": 0.30164480209350586,
"eval_mae": 0.3274855315685272,
"eval_r2": 0.7613018751144409,
"eval_rmse": 0.5492217540740967,
"eval_runtime": 18.3514,
"eval_samples_per_second": 544.919,
"eval_steps_per_second": 4.305,
"step": 35388
},
{
"epoch": 36.11393692777212,
"grad_norm": 0.3980378806591034,
"learning_rate": 2.1295354357409293e-05,
"loss": 0.1178,
"step": 35500
},
{
"epoch": 36.622583926754835,
"grad_norm": 0.48255112767219543,
"learning_rate": 2.1125805357748388e-05,
"loss": 0.1135,
"step": 36000
},
{
"epoch": 37.0,
"eval_loss": 0.30013135075569153,
"eval_mae": 0.3258868455886841,
"eval_r2": 0.7733784914016724,
"eval_rmse": 0.5478420853614807,
"eval_runtime": 20.577,
"eval_samples_per_second": 485.98,
"eval_steps_per_second": 3.839,
"step": 36371
},
{
"epoch": 37.13123092573754,
"grad_norm": 0.3691520392894745,
"learning_rate": 2.095625635808749e-05,
"loss": 0.1176,
"step": 36500
},
{
"epoch": 37.639877924720246,
"grad_norm": 0.36071911454200745,
"learning_rate": 2.0786707358426584e-05,
"loss": 0.1146,
"step": 37000
},
{
"epoch": 38.0,
"eval_loss": 0.30118992924690247,
"eval_mae": 0.327684223651886,
"eval_r2": 0.7721153497695923,
"eval_rmse": 0.5488075017929077,
"eval_runtime": 19.8739,
"eval_samples_per_second": 503.173,
"eval_steps_per_second": 3.975,
"step": 37354
},
{
"epoch": 38.14852492370295,
"grad_norm": 0.3069157898426056,
"learning_rate": 2.0617158358765685e-05,
"loss": 0.1145,
"step": 37500
},
{
"epoch": 38.657171922685656,
"grad_norm": 0.36026087403297424,
"learning_rate": 2.0447609359104783e-05,
"loss": 0.1139,
"step": 38000
},
{
"epoch": 39.0,
"eval_loss": 0.2999809980392456,
"eval_mae": 0.3254566788673401,
"eval_r2": 0.7663213610649109,
"eval_rmse": 0.5477048754692078,
"eval_runtime": 18.5952,
"eval_samples_per_second": 537.773,
"eval_steps_per_second": 4.248,
"step": 38337
},
{
"epoch": 39.16581892166836,
"grad_norm": 0.35690009593963623,
"learning_rate": 2.0278060359443878e-05,
"loss": 0.1149,
"step": 38500
},
{
"epoch": 39.674465920651066,
"grad_norm": 0.34920957684516907,
"learning_rate": 2.010851135978298e-05,
"loss": 0.1136,
"step": 39000
},
{
"epoch": 40.0,
"eval_loss": 0.3011326789855957,
"eval_mae": 0.3251156210899353,
"eval_r2": 0.7704006433486938,
"eval_rmse": 0.5487552285194397,
"eval_runtime": 17.6999,
"eval_samples_per_second": 564.975,
"eval_steps_per_second": 4.463,
"step": 39320
},
{
"epoch": 40.18311291963377,
"grad_norm": 0.3468838036060333,
"learning_rate": 1.9938962360122074e-05,
"loss": 0.1121,
"step": 39500
},
{
"epoch": 40.691759918616484,
"grad_norm": 0.4196014702320099,
"learning_rate": 1.9769413360461176e-05,
"loss": 0.1122,
"step": 40000
},
{
"epoch": 41.0,
"eval_loss": 0.30130186676979065,
"eval_mae": 0.324237585067749,
"eval_r2": 0.7646907567977905,
"eval_rmse": 0.5489093661308289,
"eval_runtime": 17.2647,
"eval_samples_per_second": 579.217,
"eval_steps_per_second": 4.576,
"step": 40303
},
{
"epoch": 41.20040691759919,
"grad_norm": 0.3917727470397949,
"learning_rate": 1.959986436080027e-05,
"loss": 0.1134,
"step": 40500
},
{
"epoch": 41.709053916581894,
"grad_norm": 0.3163571059703827,
"learning_rate": 1.9430315361139372e-05,
"loss": 0.1093,
"step": 41000
},
{
"epoch": 42.0,
"eval_loss": 0.30112671852111816,
"eval_mae": 0.3250886797904968,
"eval_r2": 0.7695274353027344,
"eval_rmse": 0.5487497448921204,
"eval_runtime": 17.9807,
"eval_samples_per_second": 556.152,
"eval_steps_per_second": 4.394,
"step": 41286
},
{
"epoch": 42.2177009155646,
"grad_norm": 0.34252750873565674,
"learning_rate": 1.9260766361478467e-05,
"loss": 0.112,
"step": 41500
},
{
"epoch": 42.726347914547304,
"grad_norm": 0.39978763461112976,
"learning_rate": 1.9091217361817565e-05,
"loss": 0.1105,
"step": 42000
},
{
"epoch": 43.0,
"eval_loss": 0.30118075013160706,
"eval_mae": 0.32392618060112,
"eval_r2": 0.7685805559158325,
"eval_rmse": 0.5487990379333496,
"eval_runtime": 19.3392,
"eval_samples_per_second": 517.085,
"eval_steps_per_second": 4.085,
"step": 42269
},
{
"epoch": 43.23499491353001,
"grad_norm": 0.38831090927124023,
"learning_rate": 1.8921668362156663e-05,
"loss": 0.1105,
"step": 42500
},
{
"epoch": 43.743641912512714,
"grad_norm": 0.3529709577560425,
"learning_rate": 1.875211936249576e-05,
"loss": 0.111,
"step": 43000
},
{
"epoch": 44.0,
"eval_loss": 0.3047633171081543,
"eval_mae": 0.3260849118232727,
"eval_r2": 0.7712223529815674,
"eval_rmse": 0.5520533323287964,
"eval_runtime": 17.5503,
"eval_samples_per_second": 569.792,
"eval_steps_per_second": 4.501,
"step": 43252
},
{
"epoch": 44.25228891149542,
"grad_norm": 0.33766278624534607,
"learning_rate": 1.8582570362834862e-05,
"loss": 0.1084,
"step": 43500
},
{
"epoch": 44.760935910478125,
"grad_norm": 0.4114132225513458,
"learning_rate": 1.8413021363173957e-05,
"loss": 0.1108,
"step": 44000
},
{
"epoch": 45.0,
"eval_loss": 0.3017372488975525,
"eval_mae": 0.32358628511428833,
"eval_r2": 0.7659143209457397,
"eval_rmse": 0.5493058562278748,
"eval_runtime": 19.7823,
"eval_samples_per_second": 505.501,
"eval_steps_per_second": 3.993,
"step": 44235
},
{
"epoch": 45.26958290946084,
"grad_norm": 0.37686386704444885,
"learning_rate": 1.824347236351306e-05,
"loss": 0.1068,
"step": 44500
},
{
"epoch": 45.77822990844354,
"grad_norm": 0.32547298073768616,
"learning_rate": 1.8073923363852153e-05,
"loss": 0.1091,
"step": 45000
},
{
"epoch": 46.0,
"eval_loss": 0.30219876766204834,
"eval_mae": 0.3234938383102417,
"eval_r2": 0.7667394280433655,
"eval_rmse": 0.5497257709503174,
"eval_runtime": 21.0895,
"eval_samples_per_second": 474.169,
"eval_steps_per_second": 3.746,
"step": 45218
},
{
"epoch": 46.28687690742625,
"grad_norm": 0.3565451502799988,
"learning_rate": 1.790437436419125e-05,
"loss": 0.1074,
"step": 45500
},
{
"epoch": 46.79552390640895,
"grad_norm": 0.3059617877006531,
"learning_rate": 1.773482536453035e-05,
"loss": 0.1085,
"step": 46000
},
{
"epoch": 47.0,
"eval_loss": 0.300829142332077,
"eval_mae": 0.32321953773498535,
"eval_r2": 0.7664545178413391,
"eval_rmse": 0.5484786033630371,
"eval_runtime": 26.1836,
"eval_samples_per_second": 381.918,
"eval_steps_per_second": 3.017,
"step": 46201
},
{
"epoch": 47.30417090539166,
"grad_norm": 0.37008407711982727,
"learning_rate": 1.7565276364869448e-05,
"loss": 0.1069,
"step": 46500
},
{
"epoch": 47.81281790437436,
"grad_norm": 0.36229032278060913,
"learning_rate": 1.7395727365208546e-05,
"loss": 0.1082,
"step": 47000
},
{
"epoch": 48.0,
"eval_loss": 0.30168837308883667,
"eval_mae": 0.32336822152137756,
"eval_r2": 0.7664652466773987,
"eval_rmse": 0.5492613315582275,
"eval_runtime": 23.5481,
"eval_samples_per_second": 424.663,
"eval_steps_per_second": 3.355,
"step": 47184
},
{
"epoch": 48.32146490335707,
"grad_norm": 0.2914319932460785,
"learning_rate": 1.7226178365547644e-05,
"loss": 0.1041,
"step": 47500
},
{
"epoch": 48.83011190233977,
"grad_norm": 0.3244343101978302,
"learning_rate": 1.705662936588674e-05,
"loss": 0.1084,
"step": 48000
},
{
"epoch": 49.0,
"eval_loss": 0.3029985725879669,
"eval_mae": 0.32335370779037476,
"eval_r2": 0.76764976978302,
"eval_rmse": 0.550452709197998,
"eval_runtime": 21.4309,
"eval_samples_per_second": 466.616,
"eval_steps_per_second": 3.686,
"step": 48167
},
{
"epoch": 49.338758901322485,
"grad_norm": 0.3769769072532654,
"learning_rate": 1.688708036622584e-05,
"loss": 0.1051,
"step": 48500
},
{
"epoch": 49.84740590030519,
"grad_norm": 0.3235186040401459,
"learning_rate": 1.6717531366564938e-05,
"loss": 0.108,
"step": 49000
},
{
"epoch": 50.0,
"eval_loss": 0.3056350648403168,
"eval_mae": 0.3234601318836212,
"eval_r2": 0.7742241621017456,
"eval_rmse": 0.552842378616333,
"eval_runtime": 21.2669,
"eval_samples_per_second": 470.213,
"eval_steps_per_second": 3.715,
"step": 49150
},
{
"epoch": 50.356052899287896,
"grad_norm": 0.32111743092536926,
"learning_rate": 1.6547982366904036e-05,
"loss": 0.1023,
"step": 49500
},
{
"epoch": 50.8646998982706,
"grad_norm": 0.35004922747612,
"learning_rate": 1.6378433367243134e-05,
"loss": 0.1065,
"step": 50000
},
{
"epoch": 51.0,
"eval_loss": 0.3032062351703644,
"eval_mae": 0.3231772184371948,
"eval_r2": 0.7666732668876648,
"eval_rmse": 0.5506412982940674,
"eval_runtime": 22.5263,
"eval_samples_per_second": 443.925,
"eval_steps_per_second": 3.507,
"step": 50133
},
{
"epoch": 51.373346897253306,
"grad_norm": 0.3384553790092468,
"learning_rate": 1.6208884367582232e-05,
"loss": 0.105,
"step": 50500
},
{
"epoch": 51.88199389623601,
"grad_norm": 0.30764466524124146,
"learning_rate": 1.603933536792133e-05,
"loss": 0.1061,
"step": 51000
},
{
"epoch": 52.0,
"eval_loss": 0.3021974265575409,
"eval_mae": 0.32215458154678345,
"eval_r2": 0.7661508321762085,
"eval_rmse": 0.5497245192527771,
"eval_runtime": 21.0716,
"eval_samples_per_second": 474.572,
"eval_steps_per_second": 3.749,
"step": 51116
},
{
"epoch": 52.390640895218716,
"grad_norm": 0.32773980498313904,
"learning_rate": 1.5869786368260425e-05,
"loss": 0.1022,
"step": 51500
},
{
"epoch": 52.89928789420142,
"grad_norm": 0.3055395185947418,
"learning_rate": 1.5700237368599527e-05,
"loss": 0.1065,
"step": 52000
},
{
"epoch": 53.0,
"eval_loss": 0.3047163188457489,
"eval_mae": 0.32322755455970764,
"eval_r2": 0.7697006464004517,
"eval_rmse": 0.552010715007782,
"eval_runtime": 20.1406,
"eval_samples_per_second": 496.509,
"eval_steps_per_second": 3.922,
"step": 52099
},
{
"epoch": 53.407934893184134,
"grad_norm": 0.25924962759017944,
"learning_rate": 1.553068836893862e-05,
"loss": 0.1022,
"step": 52500
},
{
"epoch": 53.91658189216684,
"grad_norm": 0.29001641273498535,
"learning_rate": 1.5361139369277723e-05,
"loss": 0.1062,
"step": 53000
},
{
"epoch": 54.0,
"eval_loss": 0.3048439025878906,
"eval_mae": 0.32225465774536133,
"eval_r2": 0.7680661678314209,
"eval_rmse": 0.5521263480186462,
"eval_runtime": 22.1784,
"eval_samples_per_second": 450.889,
"eval_steps_per_second": 3.562,
"step": 53082
},
{
"epoch": 54.425228891149544,
"grad_norm": 0.2913689911365509,
"learning_rate": 1.519159036961682e-05,
"loss": 0.1012,
"step": 53500
},
{
"epoch": 54.93387589013225,
"grad_norm": 0.27742624282836914,
"learning_rate": 1.5022041369955919e-05,
"loss": 0.1046,
"step": 54000
},
{
"epoch": 55.0,
"eval_loss": 0.30284586548805237,
"eval_mae": 0.3212890028953552,
"eval_r2": 0.765329122543335,
"eval_rmse": 0.5503140091896057,
"eval_runtime": 21.5879,
"eval_samples_per_second": 463.223,
"eval_steps_per_second": 3.659,
"step": 54065
},
{
"epoch": 55.442522889114954,
"grad_norm": 0.24787943065166473,
"learning_rate": 1.4852492370295015e-05,
"loss": 0.1032,
"step": 54500
},
{
"epoch": 55.95116988809766,
"grad_norm": 0.3004557192325592,
"learning_rate": 1.4682943370634113e-05,
"loss": 0.1034,
"step": 55000
},
{
"epoch": 56.0,
"eval_loss": 0.30553364753723145,
"eval_mae": 0.3226454555988312,
"eval_r2": 0.7709420919418335,
"eval_rmse": 0.5527505874633789,
"eval_runtime": 20.8948,
"eval_samples_per_second": 478.589,
"eval_steps_per_second": 3.781,
"step": 55048
},
{
"epoch": 56.459816887080365,
"grad_norm": 0.290238618850708,
"learning_rate": 1.4513394370973212e-05,
"loss": 0.1011,
"step": 55500
},
{
"epoch": 56.96846388606307,
"grad_norm": 0.34532827138900757,
"learning_rate": 1.434384537131231e-05,
"loss": 0.1047,
"step": 56000
},
{
"epoch": 57.0,
"eval_loss": 0.3043980598449707,
"eval_mae": 0.32213300466537476,
"eval_r2": 0.764991283416748,
"eval_rmse": 0.5517224073410034,
"eval_runtime": 21.1266,
"eval_samples_per_second": 473.337,
"eval_steps_per_second": 3.739,
"step": 56031
},
{
"epoch": 57.477110885045775,
"grad_norm": 0.2498069405555725,
"learning_rate": 1.4174296371651406e-05,
"loss": 0.0988,
"step": 56500
},
{
"epoch": 57.98575788402849,
"grad_norm": 0.34196507930755615,
"learning_rate": 1.4004747371990506e-05,
"loss": 0.1048,
"step": 57000
},
{
"epoch": 58.0,
"eval_loss": 0.3069715201854706,
"eval_mae": 0.3227607011795044,
"eval_r2": 0.7700543403625488,
"eval_rmse": 0.5540497303009033,
"eval_runtime": 22.6147,
"eval_samples_per_second": 442.191,
"eval_steps_per_second": 3.493,
"step": 57014
},
{
"epoch": 58.49440488301119,
"grad_norm": 0.2622218132019043,
"learning_rate": 1.3835198372329604e-05,
"loss": 0.0996,
"step": 57500
},
{
"epoch": 59.0,
"eval_loss": 0.30414652824401855,
"eval_mae": 0.3215589225292206,
"eval_r2": 0.769301176071167,
"eval_rmse": 0.5514944791793823,
"eval_runtime": 21.6832,
"eval_samples_per_second": 461.187,
"eval_steps_per_second": 3.643,
"step": 57997
},
{
"epoch": 59.0030518819939,
"grad_norm": 0.2528667151927948,
"learning_rate": 1.3665649372668702e-05,
"loss": 0.104,
"step": 58000
},
{
"epoch": 59.5116988809766,
"grad_norm": 0.3110768795013428,
"learning_rate": 1.34961003730078e-05,
"loss": 0.1009,
"step": 58500
},
{
"epoch": 60.0,
"eval_loss": 0.30486321449279785,
"eval_mae": 0.3213992714881897,
"eval_r2": 0.7690669298171997,
"eval_rmse": 0.5521438121795654,
"eval_runtime": 22.881,
"eval_samples_per_second": 437.043,
"eval_steps_per_second": 3.453,
"step": 58980
},
{
"epoch": 60.02034587995931,
"grad_norm": 0.23977969586849213,
"learning_rate": 1.3326551373346898e-05,
"loss": 0.1014,
"step": 59000
},
{
"epoch": 60.52899287894201,
"grad_norm": 0.28639206290245056,
"learning_rate": 1.3157002373685996e-05,
"loss": 0.0985,
"step": 59500
},
{
"epoch": 61.0,
"eval_loss": 0.30517521500587463,
"eval_mae": 0.3211789131164551,
"eval_r2": 0.7687903642654419,
"eval_rmse": 0.552426278591156,
"eval_runtime": 22.942,
"eval_samples_per_second": 435.882,
"eval_steps_per_second": 3.443,
"step": 59963
},
{
"epoch": 61.03763987792472,
"grad_norm": 0.25792455673217773,
"learning_rate": 1.2987453374025093e-05,
"loss": 0.1033,
"step": 60000
},
{
"epoch": 61.54628687690742,
"grad_norm": 0.2544417977333069,
"learning_rate": 1.281790437436419e-05,
"loss": 0.0986,
"step": 60500
},
{
"epoch": 62.0,
"eval_loss": 0.3043474555015564,
"eval_mae": 0.320587694644928,
"eval_r2": 0.7671276330947876,
"eval_rmse": 0.5516765713691711,
"eval_runtime": 22.599,
"eval_samples_per_second": 442.498,
"eval_steps_per_second": 3.496,
"step": 60946
},
{
"epoch": 62.054933875890136,
"grad_norm": 0.23652108013629913,
"learning_rate": 1.2648355374703289e-05,
"loss": 0.1016,
"step": 61000
},
{
"epoch": 62.56358087487284,
"grad_norm": 0.2460223287343979,
"learning_rate": 1.2478806375042387e-05,
"loss": 0.0974,
"step": 61500
},
{
"epoch": 63.0,
"eval_loss": 0.3064461946487427,
"eval_mae": 0.3217301368713379,
"eval_r2": 0.7658741474151611,
"eval_rmse": 0.5535755157470703,
"eval_runtime": 21.5823,
"eval_samples_per_second": 463.343,
"eval_steps_per_second": 3.66,
"step": 61929
},
{
"epoch": 63.072227873855546,
"grad_norm": 0.263189435005188,
"learning_rate": 1.2309257375381485e-05,
"loss": 0.102,
"step": 62000
},
{
"epoch": 63.58087487283825,
"grad_norm": 0.28210920095443726,
"learning_rate": 1.2139708375720583e-05,
"loss": 0.1002,
"step": 62500
},
{
"epoch": 64.0,
"eval_loss": 0.30489081144332886,
"eval_mae": 0.3209025263786316,
"eval_r2": 0.7646889686584473,
"eval_rmse": 0.5521687865257263,
"eval_runtime": 22.5558,
"eval_samples_per_second": 443.346,
"eval_steps_per_second": 3.502,
"step": 62912
},
{
"epoch": 64.08952187182095,
"grad_norm": 0.28177881240844727,
"learning_rate": 1.1970159376059683e-05,
"loss": 0.1,
"step": 63000
},
{
"epoch": 64.59816887080366,
"grad_norm": 0.2723771035671234,
"learning_rate": 1.180061037639878e-05,
"loss": 0.0984,
"step": 63500
},
{
"epoch": 65.0,
"eval_loss": 0.3055484890937805,
"eval_mae": 0.3205299973487854,
"eval_r2": 0.7690364718437195,
"eval_rmse": 0.5527639985084534,
"eval_runtime": 23.8261,
"eval_samples_per_second": 419.707,
"eval_steps_per_second": 3.316,
"step": 63895
},
{
"epoch": 65.10681586978637,
"grad_norm": 0.2683132588863373,
"learning_rate": 1.1631061376737877e-05,
"loss": 0.0996,
"step": 64000
},
{
"epoch": 65.61546286876907,
"grad_norm": 0.2997967004776001,
"learning_rate": 1.1461512377076976e-05,
"loss": 0.0985,
"step": 64500
},
{
"epoch": 66.0,
"eval_loss": 0.30802544951438904,
"eval_mae": 0.3224177062511444,
"eval_r2": 0.7694234848022461,
"eval_rmse": 0.5550000071525574,
"eval_runtime": 22.5937,
"eval_samples_per_second": 442.602,
"eval_steps_per_second": 3.497,
"step": 64878
},
{
"epoch": 66.12410986775178,
"grad_norm": 0.2792266309261322,
"learning_rate": 1.1291963377416074e-05,
"loss": 0.1004,
"step": 65000
},
{
"epoch": 66.63275686673448,
"grad_norm": 0.2717229425907135,
"learning_rate": 1.1122414377755172e-05,
"loss": 0.0987,
"step": 65500
},
{
"epoch": 67.0,
"eval_loss": 0.3065015971660614,
"eval_mae": 0.3211498260498047,
"eval_r2": 0.76569664478302,
"eval_rmse": 0.5536254644393921,
"eval_runtime": 21.705,
"eval_samples_per_second": 460.722,
"eval_steps_per_second": 3.64,
"step": 65861
},
{
"epoch": 67.1414038657172,
"grad_norm": 0.2361566722393036,
"learning_rate": 1.095286537809427e-05,
"loss": 0.0983,
"step": 66000
},
{
"epoch": 67.65005086469989,
"grad_norm": 0.2546670436859131,
"learning_rate": 1.0783316378433368e-05,
"loss": 0.0977,
"step": 66500
},
{
"epoch": 68.0,
"eval_loss": 0.30562883615493774,
"eval_mae": 0.32023167610168457,
"eval_r2": 0.7649646997451782,
"eval_rmse": 0.5528367161750793,
"eval_runtime": 20.8457,
"eval_samples_per_second": 479.716,
"eval_steps_per_second": 3.79,
"step": 66844
},
{
"epoch": 68.1586978636826,
"grad_norm": 0.24419383704662323,
"learning_rate": 1.0613767378772464e-05,
"loss": 0.0982,
"step": 67000
},
{
"epoch": 68.66734486266532,
"grad_norm": 0.27351826429367065,
"learning_rate": 1.0444218379111562e-05,
"loss": 0.0986,
"step": 67500
},
{
"epoch": 69.0,
"eval_loss": 0.3047651946544647,
"eval_mae": 0.32021564245224,
"eval_r2": 0.7680791020393372,
"eval_rmse": 0.5520549416542053,
"eval_runtime": 21.4713,
"eval_samples_per_second": 465.738,
"eval_steps_per_second": 3.679,
"step": 67827
},
{
"epoch": 69.17599186164801,
"grad_norm": 0.2680034339427948,
"learning_rate": 1.027466937945066e-05,
"loss": 0.0986,
"step": 68000
},
{
"epoch": 69.68463886063073,
"grad_norm": 0.2814819812774658,
"learning_rate": 1.010512037978976e-05,
"loss": 0.0984,
"step": 68500
},
{
"epoch": 70.0,
"eval_loss": 0.307124525308609,
"eval_mae": 0.3204113841056824,
"eval_r2": 0.7679520845413208,
"eval_rmse": 0.5541877150535583,
"eval_runtime": 22.1583,
"eval_samples_per_second": 451.298,
"eval_steps_per_second": 3.565,
"step": 68810
},
{
"epoch": 70.19328585961343,
"grad_norm": 0.27025556564331055,
"learning_rate": 9.935571380128858e-06,
"loss": 0.0964,
"step": 69000
},
{
"epoch": 70.70193285859614,
"grad_norm": 0.2899576723575592,
"learning_rate": 9.766022380467957e-06,
"loss": 0.0966,
"step": 69500
},
{
"epoch": 71.0,
"eval_loss": 0.3074840307235718,
"eval_mae": 0.3207957148551941,
"eval_r2": 0.7695941925048828,
"eval_rmse": 0.5545119643211365,
"eval_runtime": 23.0374,
"eval_samples_per_second": 434.077,
"eval_steps_per_second": 3.429,
"step": 69793
},
{
"epoch": 71.21057985757884,
"grad_norm": 0.2534237205982208,
"learning_rate": 9.596473380807055e-06,
"loss": 0.0985,
"step": 70000
},
{
"epoch": 71.71922685656155,
"grad_norm": 0.2650779187679291,
"learning_rate": 9.426924381146151e-06,
"loss": 0.0978,
"step": 70500
},
{
"epoch": 72.0,
"eval_loss": 0.3064354956150055,
"eval_mae": 0.3201001286506653,
"eval_r2": 0.7672066688537598,
"eval_rmse": 0.5535657405853271,
"eval_runtime": 23.7676,
"eval_samples_per_second": 420.74,
"eval_steps_per_second": 3.324,
"step": 70776
},
{
"epoch": 72.22787385554425,
"grad_norm": 0.2299046665430069,
"learning_rate": 9.257375381485249e-06,
"loss": 0.0966,
"step": 71000
},
{
"epoch": 72.73652085452696,
"grad_norm": 0.25810402631759644,
"learning_rate": 9.087826381824347e-06,
"loss": 0.0962,
"step": 71500
},
{
"epoch": 73.0,
"eval_loss": 0.3067697286605835,
"eval_mae": 0.32059645652770996,
"eval_r2": 0.770045280456543,
"eval_rmse": 0.553867518901825,
"eval_runtime": 23.7289,
"eval_samples_per_second": 421.427,
"eval_steps_per_second": 3.329,
"step": 71759
},
{
"epoch": 73.24516785350967,
"grad_norm": 0.2593117356300354,
"learning_rate": 8.918277382163445e-06,
"loss": 0.097,
"step": 72000
},
{
"epoch": 73.75381485249237,
"grad_norm": 0.28874120116233826,
"learning_rate": 8.748728382502543e-06,
"loss": 0.0967,
"step": 72500
},
{
"epoch": 74.0,
"eval_loss": 0.30635932087898254,
"eval_mae": 0.31995031237602234,
"eval_r2": 0.7648576498031616,
"eval_rmse": 0.5534969568252563,
"eval_runtime": 22.8055,
"eval_samples_per_second": 438.49,
"eval_steps_per_second": 3.464,
"step": 72742
},
{
"epoch": 74.26246185147508,
"grad_norm": 0.2876887917518616,
"learning_rate": 8.579179382841641e-06,
"loss": 0.0955,
"step": 73000
},
{
"epoch": 74.77110885045778,
"grad_norm": 0.2901298701763153,
"learning_rate": 8.409630383180738e-06,
"loss": 0.0982,
"step": 73500
},
{
"epoch": 75.0,
"eval_loss": 0.3065277338027954,
"eval_mae": 0.3198009133338928,
"eval_r2": 0.7686657905578613,
"eval_rmse": 0.5536490678787231,
"eval_runtime": 21.8575,
"eval_samples_per_second": 457.509,
"eval_steps_per_second": 3.614,
"step": 73725
},
{
"epoch": 75.27975584944049,
"grad_norm": 0.23049291968345642,
"learning_rate": 8.240081383519838e-06,
"loss": 0.096,
"step": 74000
},
{
"epoch": 75.78840284842319,
"grad_norm": 0.2709825932979584,
"learning_rate": 8.070532383858936e-06,
"loss": 0.0954,
"step": 74500
},
{
"epoch": 76.0,
"eval_loss": 0.30666035413742065,
"eval_mae": 0.32015907764434814,
"eval_r2": 0.7675079107284546,
"eval_rmse": 0.5537688136100769,
"eval_runtime": 23.6719,
"eval_samples_per_second": 422.441,
"eval_steps_per_second": 3.337,
"step": 74708
},
{
"epoch": 76.2970498474059,
"grad_norm": 0.21834221482276917,
"learning_rate": 7.900983384198034e-06,
"loss": 0.0954,
"step": 75000
},
{
"epoch": 76.8056968463886,
"grad_norm": 0.20948007702827454,
"learning_rate": 7.731434384537132e-06,
"loss": 0.097,
"step": 75500
},
{
"epoch": 77.0,
"eval_loss": 0.30686530470848083,
"eval_mae": 0.3204137682914734,
"eval_r2": 0.7677739858627319,
"eval_rmse": 0.5539537668228149,
"eval_runtime": 21.4205,
"eval_samples_per_second": 466.843,
"eval_steps_per_second": 3.688,
"step": 75691
},
{
"epoch": 77.31434384537131,
"grad_norm": 0.23167894780635834,
"learning_rate": 7.56188538487623e-06,
"loss": 0.0932,
"step": 76000
},
{
"epoch": 77.82299084435402,
"grad_norm": 0.24640928208827972,
"learning_rate": 7.392336385215327e-06,
"loss": 0.0977,
"step": 76500
},
{
"epoch": 78.0,
"eval_loss": 0.3072910010814667,
"eval_mae": 0.32037079334259033,
"eval_r2": 0.7697309255599976,
"eval_rmse": 0.5543379187583923,
"eval_runtime": 19.5509,
"eval_samples_per_second": 511.487,
"eval_steps_per_second": 4.041,
"step": 76674
},
{
"epoch": 78.33163784333672,
"grad_norm": 0.2685850262641907,
"learning_rate": 7.222787385554425e-06,
"loss": 0.0942,
"step": 77000
},
{
"epoch": 78.84028484231943,
"grad_norm": 0.2667245864868164,
"learning_rate": 7.0532383858935235e-06,
"loss": 0.0965,
"step": 77500
},
{
"epoch": 79.0,
"eval_loss": 0.30803820490837097,
"eval_mae": 0.32020050287246704,
"eval_r2": 0.7687854170799255,
"eval_rmse": 0.555011510848999,
"eval_runtime": 23.1481,
"eval_samples_per_second": 432.0,
"eval_steps_per_second": 3.413,
"step": 77657
},
{
"epoch": 79.34893184130213,
"grad_norm": 0.20989225804805756,
"learning_rate": 6.883689386232621e-06,
"loss": 0.0959,
"step": 78000
},
{
"epoch": 79.85757884028484,
"grad_norm": 0.2710280418395996,
"learning_rate": 6.71414038657172e-06,
"loss": 0.0945,
"step": 78500
},
{
"epoch": 80.0,
"eval_loss": 0.30728211998939514,
"eval_mae": 0.3196803033351898,
"eval_r2": 0.7670853137969971,
"eval_rmse": 0.5543298125267029,
"eval_runtime": 22.6974,
"eval_samples_per_second": 440.579,
"eval_steps_per_second": 3.481,
"step": 78640
},
{
"epoch": 80.36622583926754,
"grad_norm": 0.23559938371181488,
"learning_rate": 6.544591386910818e-06,
"loss": 0.0933,
"step": 79000
},
{
"epoch": 80.87487283825025,
"grad_norm": 0.2247001826763153,
"learning_rate": 6.375042387249915e-06,
"loss": 0.0955,
"step": 79500
},
{
"epoch": 81.0,
"eval_loss": 0.3074042499065399,
"eval_mae": 0.31953954696655273,
"eval_r2": 0.7664982676506042,
"eval_rmse": 0.5544400215148926,
"eval_runtime": 21.5373,
"eval_samples_per_second": 464.311,
"eval_steps_per_second": 3.668,
"step": 79623
},
{
"epoch": 81.38351983723297,
"grad_norm": 0.23154723644256592,
"learning_rate": 6.205493387589013e-06,
"loss": 0.0928,
"step": 80000
},
{
"epoch": 81.89216683621567,
"grad_norm": 0.2460862100124359,
"learning_rate": 6.035944387928111e-06,
"loss": 0.0957,
"step": 80500
},
{
"epoch": 82.0,
"eval_loss": 0.30816981196403503,
"eval_mae": 0.31992191076278687,
"eval_r2": 0.7665845155715942,
"eval_rmse": 0.5551300644874573,
"eval_runtime": 20.0656,
"eval_samples_per_second": 498.366,
"eval_steps_per_second": 3.937,
"step": 80606
},
{
"epoch": 82.40081383519838,
"grad_norm": 0.22507379949092865,
"learning_rate": 5.866395388267209e-06,
"loss": 0.0937,
"step": 81000
},
{
"epoch": 82.90946083418108,
"grad_norm": 0.26125577092170715,
"learning_rate": 5.696846388606307e-06,
"loss": 0.095,
"step": 81500
},
{
"epoch": 83.0,
"eval_loss": 0.3074418008327484,
"eval_mae": 0.31970179080963135,
"eval_r2": 0.7669422626495361,
"eval_rmse": 0.5544738173484802,
"eval_runtime": 23.0334,
"eval_samples_per_second": 434.153,
"eval_steps_per_second": 3.43,
"step": 81589
},
{
"epoch": 83.41810783316379,
"grad_norm": 0.2263702154159546,
"learning_rate": 5.5272973889454055e-06,
"loss": 0.0938,
"step": 82000
},
{
"epoch": 83.92675483214649,
"grad_norm": 0.2645528018474579,
"learning_rate": 5.357748389284504e-06,
"loss": 0.094,
"step": 82500
},
{
"epoch": 84.0,
"eval_loss": 0.30633336305618286,
"eval_mae": 0.3188663721084595,
"eval_r2": 0.7666647434234619,
"eval_rmse": 0.5534735321998596,
"eval_runtime": 23.049,
"eval_samples_per_second": 433.859,
"eval_steps_per_second": 3.427,
"step": 82572
},
{
"epoch": 84.4354018311292,
"grad_norm": 0.21599704027175903,
"learning_rate": 5.188199389623601e-06,
"loss": 0.0943,
"step": 83000
},
{
"epoch": 84.9440488301119,
"grad_norm": 0.21557337045669556,
"learning_rate": 5.018650389962699e-06,
"loss": 0.0932,
"step": 83500
},
{
"epoch": 85.0,
"eval_loss": 0.30662301182746887,
"eval_mae": 0.3191204369068146,
"eval_r2": 0.7680181264877319,
"eval_rmse": 0.5537351369857788,
"eval_runtime": 22.4327,
"eval_samples_per_second": 445.777,
"eval_steps_per_second": 3.522,
"step": 83555
},
{
"epoch": 85.45269582909461,
"grad_norm": 0.2520500123500824,
"learning_rate": 4.849101390301798e-06,
"loss": 0.0936,
"step": 84000
},
{
"epoch": 85.96134282807732,
"grad_norm": 0.24406184256076813,
"learning_rate": 4.679552390640896e-06,
"loss": 0.0934,
"step": 84500
},
{
"epoch": 86.0,
"eval_loss": 0.3075953722000122,
"eval_mae": 0.31922364234924316,
"eval_r2": 0.7682535648345947,
"eval_rmse": 0.554612398147583,
"eval_runtime": 24.2058,
"eval_samples_per_second": 413.124,
"eval_steps_per_second": 3.264,
"step": 84538
},
{
"epoch": 86.46998982706002,
"grad_norm": 0.29262858629226685,
"learning_rate": 4.510003390979993e-06,
"loss": 0.0925,
"step": 85000
},
{
"epoch": 86.97863682604273,
"grad_norm": 0.23719151318073273,
"learning_rate": 4.340454391319091e-06,
"loss": 0.0946,
"step": 85500
},
{
"epoch": 87.0,
"eval_loss": 0.30780571699142456,
"eval_mae": 0.3192029595375061,
"eval_r2": 0.767227292060852,
"eval_rmse": 0.5548020005226135,
"eval_runtime": 21.1225,
"eval_samples_per_second": 473.43,
"eval_steps_per_second": 3.74,
"step": 85521
},
{
"epoch": 87.48728382502543,
"grad_norm": 0.2712896168231964,
"learning_rate": 4.170905391658189e-06,
"loss": 0.0922,
"step": 86000
},
{
"epoch": 87.99593082400814,
"grad_norm": 0.22555504739284515,
"learning_rate": 4.001356391997287e-06,
"loss": 0.0939,
"step": 86500
},
{
"epoch": 88.0,
"eval_loss": 0.3082311153411865,
"eval_mae": 0.3195423483848572,
"eval_r2": 0.7683557271957397,
"eval_rmse": 0.5551851987838745,
"eval_runtime": 21.258,
"eval_samples_per_second": 470.412,
"eval_steps_per_second": 3.716,
"step": 86504
},
{
"epoch": 88.50457782299084,
"grad_norm": 0.25731492042541504,
"learning_rate": 3.831807392336386e-06,
"loss": 0.0917,
"step": 87000
},
{
"epoch": 89.0,
"eval_loss": 0.30769726634025574,
"eval_mae": 0.3188689947128296,
"eval_r2": 0.7664185166358948,
"eval_rmse": 0.5547041893005371,
"eval_runtime": 22.8134,
"eval_samples_per_second": 438.338,
"eval_steps_per_second": 3.463,
"step": 87487
},
{
"epoch": 89.01322482197355,
"grad_norm": 0.21790558099746704,
"learning_rate": 3.6622583926754837e-06,
"loss": 0.0933,
"step": 87500
},
{
"epoch": 89.52187182095625,
"grad_norm": 0.20840144157409668,
"learning_rate": 3.4927093930145814e-06,
"loss": 0.0913,
"step": 88000
},
{
"epoch": 90.0,
"eval_loss": 0.3072252869606018,
"eval_mae": 0.31897908449172974,
"eval_r2": 0.7661517858505249,
"eval_rmse": 0.5542786717414856,
"eval_runtime": 22.7075,
"eval_samples_per_second": 440.383,
"eval_steps_per_second": 3.479,
"step": 88470
},
{
"epoch": 90.03051881993896,
"grad_norm": 0.23747815191745758,
"learning_rate": 3.323160393353679e-06,
"loss": 0.0941,
"step": 88500
},
{
"epoch": 90.53916581892167,
"grad_norm": 0.2666161358356476,
"learning_rate": 3.1536113936927776e-06,
"loss": 0.0913,
"step": 89000
},
{
"epoch": 91.0,
"eval_loss": 0.30819734930992126,
"eval_mae": 0.3193185329437256,
"eval_r2": 0.7667792439460754,
"eval_rmse": 0.5551548600196838,
"eval_runtime": 23.0912,
"eval_samples_per_second": 433.065,
"eval_steps_per_second": 3.421,
"step": 89453
},
{
"epoch": 91.04781281790437,
"grad_norm": 0.2510707676410675,
"learning_rate": 2.9840623940318752e-06,
"loss": 0.0932,
"step": 89500
},
{
"epoch": 91.55645981688708,
"grad_norm": 0.2492397129535675,
"learning_rate": 2.8145133943709733e-06,
"loss": 0.0915,
"step": 90000
},
{
"epoch": 92.0,
"eval_loss": 0.3087303936481476,
"eval_mae": 0.3195701241493225,
"eval_r2": 0.767082929611206,
"eval_rmse": 0.5556347370147705,
"eval_runtime": 21.4213,
"eval_samples_per_second": 466.825,
"eval_steps_per_second": 3.688,
"step": 90436
},
{
"epoch": 92.06510681586978,
"grad_norm": 0.2138824164867401,
"learning_rate": 2.6449643947100714e-06,
"loss": 0.0921,
"step": 90500
},
{
"epoch": 92.5737538148525,
"grad_norm": 0.2821875810623169,
"learning_rate": 2.475415395049169e-06,
"loss": 0.091,
"step": 91000
},
{
"epoch": 93.0,
"eval_loss": 0.30839282274246216,
"eval_mae": 0.319232702255249,
"eval_r2": 0.7667563557624817,
"eval_rmse": 0.5553308725357056,
"eval_runtime": 21.2783,
"eval_samples_per_second": 469.961,
"eval_steps_per_second": 3.713,
"step": 91419
},
{
"epoch": 93.08240081383519,
"grad_norm": 0.2118423730134964,
"learning_rate": 2.305866395388267e-06,
"loss": 0.0929,
"step": 91500
},
{
"epoch": 93.5910478128179,
"grad_norm": 0.22796212136745453,
"learning_rate": 2.1363173957273653e-06,
"loss": 0.0913,
"step": 92000
},
{
"epoch": 94.0,
"eval_loss": 0.3086921274662018,
"eval_mae": 0.3193605840206146,
"eval_r2": 0.766891598701477,
"eval_rmse": 0.5556001663208008,
"eval_runtime": 23.1152,
"eval_samples_per_second": 432.616,
"eval_steps_per_second": 3.418,
"step": 92402
},
{
"epoch": 94.09969481180062,
"grad_norm": 0.2285338044166565,
"learning_rate": 1.9667683960664634e-06,
"loss": 0.0917,
"step": 92500
},
{
"epoch": 94.60834181078332,
"grad_norm": 0.20946064591407776,
"learning_rate": 1.7972193964055613e-06,
"loss": 0.0911,
"step": 93000
},
{
"epoch": 95.0,
"eval_loss": 0.3085222542285919,
"eval_mae": 0.3191923499107361,
"eval_r2": 0.7661963701248169,
"eval_rmse": 0.5554474592208862,
"eval_runtime": 23.4962,
"eval_samples_per_second": 425.601,
"eval_steps_per_second": 3.362,
"step": 93385
},
{
"epoch": 95.11698880976603,
"grad_norm": 0.269754558801651,
"learning_rate": 1.6276703967446594e-06,
"loss": 0.0923,
"step": 93500
},
{
"epoch": 95.62563580874873,
"grad_norm": 0.2517547309398651,
"learning_rate": 1.4581213970837572e-06,
"loss": 0.0912,
"step": 94000
},
{
"epoch": 96.0,
"eval_loss": 0.3087967038154602,
"eval_mae": 0.3190614581108093,
"eval_r2": 0.7664559483528137,
"eval_rmse": 0.5556943416595459,
"eval_runtime": 21.3044,
"eval_samples_per_second": 469.387,
"eval_steps_per_second": 3.708,
"step": 94368
},
{
"epoch": 96.13428280773144,
"grad_norm": 0.200937882065773,
"learning_rate": 1.2885723974228553e-06,
"loss": 0.091,
"step": 94500
},
{
"epoch": 96.64292980671414,
"grad_norm": 0.2715360224246979,
"learning_rate": 1.1190233977619532e-06,
"loss": 0.0904,
"step": 95000
},
{
"epoch": 97.0,
"eval_loss": 0.30910882353782654,
"eval_mae": 0.3192349076271057,
"eval_r2": 0.7662383317947388,
"eval_rmse": 0.5559751987457275,
"eval_runtime": 21.1316,
"eval_samples_per_second": 473.224,
"eval_steps_per_second": 3.738,
"step": 95351
},
{
"epoch": 97.15157680569685,
"grad_norm": 0.24004267156124115,
"learning_rate": 9.494743981010512e-07,
"loss": 0.09,
"step": 95500
},
{
"epoch": 97.66022380467955,
"grad_norm": 0.28319135308265686,
"learning_rate": 7.799253984401492e-07,
"loss": 0.0914,
"step": 96000
},
{
"epoch": 98.0,
"eval_loss": 0.3087212145328522,
"eval_mae": 0.3190605640411377,
"eval_r2": 0.7665444016456604,
"eval_rmse": 0.5556263327598572,
"eval_runtime": 18.6312,
"eval_samples_per_second": 536.734,
"eval_steps_per_second": 4.24,
"step": 96334
},
{
"epoch": 98.16887080366226,
"grad_norm": 0.3092040419578552,
"learning_rate": 6.103763987792473e-07,
"loss": 0.0906,
"step": 96500
},
{
"epoch": 98.67751780264497,
"grad_norm": 0.27111703157424927,
"learning_rate": 4.408273991183452e-07,
"loss": 0.0894,
"step": 97000
},
{
"epoch": 99.0,
"eval_loss": 0.3089219331741333,
"eval_mae": 0.3191257417201996,
"eval_r2": 0.7667442560195923,
"eval_rmse": 0.5558070540428162,
"eval_runtime": 18.7902,
"eval_samples_per_second": 532.191,
"eval_steps_per_second": 4.204,
"step": 97317
},
{
"epoch": 99.18616480162767,
"grad_norm": 0.2768039405345917,
"learning_rate": 2.712783994574432e-07,
"loss": 0.0919,
"step": 97500
},
{
"epoch": 99.69481180061038,
"grad_norm": 0.19111211597919464,
"learning_rate": 1.0172939979654121e-07,
"loss": 0.0896,
"step": 98000
},
{
"epoch": 100.0,
"eval_loss": 0.30905383825302124,
"eval_mae": 0.3191269636154175,
"eval_r2": 0.7666448354721069,
"eval_rmse": 0.5559256076812744,
"eval_runtime": 18.8185,
"eval_samples_per_second": 531.391,
"eval_steps_per_second": 4.198,
"step": 98300
}
],
"logging_steps": 500,
"max_steps": 98300,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3115044487280154e+17,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}