n0w0f's picture
Upload folder using huggingface_hub
46f94cb verified
{
"best_global_step": 57250,
"best_metric": 0.3188753128051758,
"best_model_checkpoint": "/shared/nalampara/mattext_ckpt_pretrain/300k/2026-02-13/17-18-37/pretrain/checkpoints/robocrys_rep_test-pretrain/checkpoint-45000",
"epoch": 50.0,
"eval_steps": 50,
"global_step": 58050,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.04306632213608958,
"grad_norm": 3.4339823722839355,
"learning_rate": 0.00019983118001722653,
"loss": 23.69191650390625,
"step": 50
},
{
"epoch": 0.04306632213608958,
"eval_loss": 16.00394630432129,
"eval_runtime": 16.8675,
"eval_samples_per_second": 1126.664,
"eval_steps_per_second": 35.216,
"step": 50
},
{
"epoch": 0.08613264427217916,
"grad_norm": 4.098213195800781,
"learning_rate": 0.0001996589147286822,
"loss": 15.080333251953125,
"step": 100
},
{
"epoch": 0.08613264427217916,
"eval_loss": 14.168619155883789,
"eval_runtime": 17.0695,
"eval_samples_per_second": 1113.33,
"eval_steps_per_second": 34.799,
"step": 100
},
{
"epoch": 0.12919896640826872,
"grad_norm": 2.9972188472747803,
"learning_rate": 0.0001994866494401378,
"loss": 13.9048046875,
"step": 150
},
{
"epoch": 0.12919896640826872,
"eval_loss": 13.282241821289062,
"eval_runtime": 17.0503,
"eval_samples_per_second": 1114.583,
"eval_steps_per_second": 34.838,
"step": 150
},
{
"epoch": 0.17226528854435832,
"grad_norm": 3.62282657623291,
"learning_rate": 0.00019931438415159348,
"loss": 13.15665283203125,
"step": 200
},
{
"epoch": 0.17226528854435832,
"eval_loss": 12.615082740783691,
"eval_runtime": 17.091,
"eval_samples_per_second": 1111.932,
"eval_steps_per_second": 34.755,
"step": 200
},
{
"epoch": 0.2153316106804479,
"grad_norm": 4.212780952453613,
"learning_rate": 0.0001991421188630491,
"loss": 12.563475341796876,
"step": 250
},
{
"epoch": 0.2153316106804479,
"eval_loss": 12.107625007629395,
"eval_runtime": 17.3804,
"eval_samples_per_second": 1093.413,
"eval_steps_per_second": 34.176,
"step": 250
},
{
"epoch": 0.25839793281653745,
"grad_norm": 5.472993850708008,
"learning_rate": 0.00019896985357450474,
"loss": 12.01086181640625,
"step": 300
},
{
"epoch": 0.25839793281653745,
"eval_loss": 11.567380905151367,
"eval_runtime": 17.2108,
"eval_samples_per_second": 1104.188,
"eval_steps_per_second": 34.513,
"step": 300
},
{
"epoch": 0.301464254952627,
"grad_norm": 6.175601959228516,
"learning_rate": 0.00019879758828596038,
"loss": 11.541044921875,
"step": 350
},
{
"epoch": 0.301464254952627,
"eval_loss": 10.8231840133667,
"eval_runtime": 16.0675,
"eval_samples_per_second": 1182.757,
"eval_steps_per_second": 36.969,
"step": 350
},
{
"epoch": 0.34453057708871665,
"grad_norm": 7.172627925872803,
"learning_rate": 0.00019862532299741602,
"loss": 10.729189453125,
"step": 400
},
{
"epoch": 0.34453057708871665,
"eval_loss": 9.469819068908691,
"eval_runtime": 17.0542,
"eval_samples_per_second": 1114.331,
"eval_steps_per_second": 34.83,
"step": 400
},
{
"epoch": 0.3875968992248062,
"grad_norm": 6.581453800201416,
"learning_rate": 0.0001984530577088717,
"loss": 8.910851440429688,
"step": 450
},
{
"epoch": 0.3875968992248062,
"eval_loss": 6.483331680297852,
"eval_runtime": 17.0895,
"eval_samples_per_second": 1112.028,
"eval_steps_per_second": 34.758,
"step": 450
},
{
"epoch": 0.4306632213608958,
"grad_norm": 4.437243938446045,
"learning_rate": 0.0001982807924203273,
"loss": 6.259661254882812,
"step": 500
},
{
"epoch": 0.4306632213608958,
"eval_loss": 4.317221641540527,
"eval_runtime": 17.0015,
"eval_samples_per_second": 1117.783,
"eval_steps_per_second": 34.938,
"step": 500
},
{
"epoch": 0.4737295434969854,
"grad_norm": 3.040710687637329,
"learning_rate": 0.00019810852713178297,
"loss": 4.48141845703125,
"step": 550
},
{
"epoch": 0.4737295434969854,
"eval_loss": 3.2465999126434326,
"eval_runtime": 17.1896,
"eval_samples_per_second": 1105.55,
"eval_steps_per_second": 34.556,
"step": 550
},
{
"epoch": 0.5167958656330749,
"grad_norm": 2.620871067047119,
"learning_rate": 0.0001979362618432386,
"loss": 3.5127716064453125,
"step": 600
},
{
"epoch": 0.5167958656330749,
"eval_loss": 2.8293402194976807,
"eval_runtime": 17.5816,
"eval_samples_per_second": 1080.901,
"eval_steps_per_second": 33.785,
"step": 600
},
{
"epoch": 0.5598621877691645,
"grad_norm": 2.405362606048584,
"learning_rate": 0.00019776399655469423,
"loss": 3.18022216796875,
"step": 650
},
{
"epoch": 0.5598621877691645,
"eval_loss": 2.5354790687561035,
"eval_runtime": 16.9547,
"eval_samples_per_second": 1120.867,
"eval_steps_per_second": 35.034,
"step": 650
},
{
"epoch": 0.602928509905254,
"grad_norm": 2.6026864051818848,
"learning_rate": 0.00019759173126614987,
"loss": 2.8754867553710937,
"step": 700
},
{
"epoch": 0.602928509905254,
"eval_loss": 2.357602119445801,
"eval_runtime": 16.2184,
"eval_samples_per_second": 1171.756,
"eval_steps_per_second": 36.625,
"step": 700
},
{
"epoch": 0.6459948320413437,
"grad_norm": 2.574624538421631,
"learning_rate": 0.00019741946597760552,
"loss": 2.6319024658203123,
"step": 750
},
{
"epoch": 0.6459948320413437,
"eval_loss": 2.1693274974823,
"eval_runtime": 17.0307,
"eval_samples_per_second": 1115.867,
"eval_steps_per_second": 34.878,
"step": 750
},
{
"epoch": 0.6890611541774333,
"grad_norm": 1.7991482019424438,
"learning_rate": 0.00019724720068906118,
"loss": 2.439267578125,
"step": 800
},
{
"epoch": 0.6890611541774333,
"eval_loss": 2.030348300933838,
"eval_runtime": 16.7562,
"eval_samples_per_second": 1134.147,
"eval_steps_per_second": 35.45,
"step": 800
},
{
"epoch": 0.7321274763135228,
"grad_norm": 1.7477670907974243,
"learning_rate": 0.0001970749354005168,
"loss": 2.2564727783203127,
"step": 850
},
{
"epoch": 0.7321274763135228,
"eval_loss": 1.9149311780929565,
"eval_runtime": 16.688,
"eval_samples_per_second": 1138.78,
"eval_steps_per_second": 35.594,
"step": 850
},
{
"epoch": 0.7751937984496124,
"grad_norm": 1.7949120998382568,
"learning_rate": 0.00019690267011197244,
"loss": 2.1781280517578123,
"step": 900
},
{
"epoch": 0.7751937984496124,
"eval_loss": 1.8148161172866821,
"eval_runtime": 17.0557,
"eval_samples_per_second": 1114.231,
"eval_steps_per_second": 34.827,
"step": 900
},
{
"epoch": 0.818260120585702,
"grad_norm": 2.2657155990600586,
"learning_rate": 0.00019673040482342808,
"loss": 1.9842291259765625,
"step": 950
},
{
"epoch": 0.818260120585702,
"eval_loss": 1.6966851949691772,
"eval_runtime": 17.1068,
"eval_samples_per_second": 1110.902,
"eval_steps_per_second": 34.723,
"step": 950
},
{
"epoch": 0.8613264427217916,
"grad_norm": 2.0038652420043945,
"learning_rate": 0.00019655813953488373,
"loss": 1.9545573425292968,
"step": 1000
},
{
"epoch": 0.8613264427217916,
"eval_loss": 1.6101728677749634,
"eval_runtime": 17.0809,
"eval_samples_per_second": 1112.589,
"eval_steps_per_second": 34.776,
"step": 1000
},
{
"epoch": 0.9043927648578811,
"grad_norm": 1.9319428205490112,
"learning_rate": 0.00019638587424633937,
"loss": 1.7500227355957032,
"step": 1050
},
{
"epoch": 0.9043927648578811,
"eval_loss": 1.504974603652954,
"eval_runtime": 17.1529,
"eval_samples_per_second": 1107.92,
"eval_steps_per_second": 34.63,
"step": 1050
},
{
"epoch": 0.9474590869939707,
"grad_norm": 1.970742106437683,
"learning_rate": 0.000196213608957795,
"loss": 1.6700083923339843,
"step": 1100
},
{
"epoch": 0.9474590869939707,
"eval_loss": 1.3932300806045532,
"eval_runtime": 17.4272,
"eval_samples_per_second": 1090.48,
"eval_steps_per_second": 34.085,
"step": 1100
},
{
"epoch": 0.9905254091300603,
"grad_norm": 1.5904282331466675,
"learning_rate": 0.00019604134366925065,
"loss": 1.5361054992675782,
"step": 1150
},
{
"epoch": 0.9905254091300603,
"eval_loss": 1.304945945739746,
"eval_runtime": 16.8173,
"eval_samples_per_second": 1130.028,
"eval_steps_per_second": 35.321,
"step": 1150
},
{
"epoch": 1.0335917312661498,
"grad_norm": 1.6059073209762573,
"learning_rate": 0.0001958690783807063,
"loss": 1.4490776062011719,
"step": 1200
},
{
"epoch": 1.0335917312661498,
"eval_loss": 1.2493284940719604,
"eval_runtime": 16.9616,
"eval_samples_per_second": 1120.415,
"eval_steps_per_second": 35.02,
"step": 1200
},
{
"epoch": 1.0766580534022394,
"grad_norm": 1.632856011390686,
"learning_rate": 0.00019569681309216194,
"loss": 1.3666287231445313,
"step": 1250
},
{
"epoch": 1.0766580534022394,
"eval_loss": 1.1869384050369263,
"eval_runtime": 17.0434,
"eval_samples_per_second": 1115.035,
"eval_steps_per_second": 34.852,
"step": 1250
},
{
"epoch": 1.119724375538329,
"grad_norm": 1.5437358617782593,
"learning_rate": 0.00019552454780361758,
"loss": 1.3059585571289063,
"step": 1300
},
{
"epoch": 1.119724375538329,
"eval_loss": 1.0582355260849,
"eval_runtime": 16.5221,
"eval_samples_per_second": 1150.216,
"eval_steps_per_second": 35.952,
"step": 1300
},
{
"epoch": 1.1627906976744187,
"grad_norm": 1.649256944656372,
"learning_rate": 0.00019535228251507322,
"loss": 1.0846002197265625,
"step": 1350
},
{
"epoch": 1.1627906976744187,
"eval_loss": 0.9497116804122925,
"eval_runtime": 17.1019,
"eval_samples_per_second": 1111.221,
"eval_steps_per_second": 34.733,
"step": 1350
},
{
"epoch": 1.2058570198105083,
"grad_norm": 1.5769813060760498,
"learning_rate": 0.00019518001722652886,
"loss": 0.9779405212402343,
"step": 1400
},
{
"epoch": 1.2058570198105083,
"eval_loss": 0.9034150838851929,
"eval_runtime": 17.0701,
"eval_samples_per_second": 1113.289,
"eval_steps_per_second": 34.798,
"step": 1400
},
{
"epoch": 1.2489233419465977,
"grad_norm": 1.503847599029541,
"learning_rate": 0.0001950077519379845,
"loss": 0.9594493103027344,
"step": 1450
},
{
"epoch": 1.2489233419465977,
"eval_loss": 0.8880420923233032,
"eval_runtime": 17.1136,
"eval_samples_per_second": 1110.461,
"eval_steps_per_second": 34.709,
"step": 1450
},
{
"epoch": 1.2919896640826873,
"grad_norm": 1.324310302734375,
"learning_rate": 0.00019483548664944015,
"loss": 0.9324958801269532,
"step": 1500
},
{
"epoch": 1.2919896640826873,
"eval_loss": 0.8554781079292297,
"eval_runtime": 17.1878,
"eval_samples_per_second": 1105.667,
"eval_steps_per_second": 34.559,
"step": 1500
},
{
"epoch": 1.335055986218777,
"grad_norm": 1.2940914630889893,
"learning_rate": 0.0001946632213608958,
"loss": 0.9000779724121094,
"step": 1550
},
{
"epoch": 1.335055986218777,
"eval_loss": 0.8320924639701843,
"eval_runtime": 16.9956,
"eval_samples_per_second": 1118.173,
"eval_steps_per_second": 34.95,
"step": 1550
},
{
"epoch": 1.3781223083548664,
"grad_norm": 1.3323431015014648,
"learning_rate": 0.00019449095607235143,
"loss": 0.8746331787109375,
"step": 1600
},
{
"epoch": 1.3781223083548664,
"eval_loss": 0.8127567172050476,
"eval_runtime": 16.9374,
"eval_samples_per_second": 1122.017,
"eval_steps_per_second": 35.07,
"step": 1600
},
{
"epoch": 1.421188630490956,
"grad_norm": 1.3652969598770142,
"learning_rate": 0.00019431869078380707,
"loss": 0.8706568145751953,
"step": 1650
},
{
"epoch": 1.421188630490956,
"eval_loss": 0.8145562410354614,
"eval_runtime": 16.5749,
"eval_samples_per_second": 1146.55,
"eval_steps_per_second": 35.837,
"step": 1650
},
{
"epoch": 1.4642549526270456,
"grad_norm": 1.3932275772094727,
"learning_rate": 0.00019414642549526272,
"loss": 0.8283468627929688,
"step": 1700
},
{
"epoch": 1.4642549526270456,
"eval_loss": 0.7912552356719971,
"eval_runtime": 17.1373,
"eval_samples_per_second": 1108.929,
"eval_steps_per_second": 34.661,
"step": 1700
},
{
"epoch": 1.5073212747631353,
"grad_norm": 1.182525873184204,
"learning_rate": 0.00019397416020671836,
"loss": 0.8269770050048828,
"step": 1750
},
{
"epoch": 1.5073212747631353,
"eval_loss": 0.7707593441009521,
"eval_runtime": 16.8899,
"eval_samples_per_second": 1125.167,
"eval_steps_per_second": 35.169,
"step": 1750
},
{
"epoch": 1.550387596899225,
"grad_norm": 1.1756706237792969,
"learning_rate": 0.000193801894918174,
"loss": 0.7841999816894532,
"step": 1800
},
{
"epoch": 1.550387596899225,
"eval_loss": 0.7642788887023926,
"eval_runtime": 17.2573,
"eval_samples_per_second": 1101.218,
"eval_steps_per_second": 34.42,
"step": 1800
},
{
"epoch": 1.5934539190353143,
"grad_norm": 1.1239345073699951,
"learning_rate": 0.00019362962962962964,
"loss": 0.778316650390625,
"step": 1850
},
{
"epoch": 1.5934539190353143,
"eval_loss": 0.7528692483901978,
"eval_runtime": 16.8188,
"eval_samples_per_second": 1129.926,
"eval_steps_per_second": 35.318,
"step": 1850
},
{
"epoch": 1.636520241171404,
"grad_norm": 1.2261638641357422,
"learning_rate": 0.00019345736434108528,
"loss": 0.7577320098876953,
"step": 1900
},
{
"epoch": 1.636520241171404,
"eval_loss": 0.7464138269424438,
"eval_runtime": 16.7312,
"eval_samples_per_second": 1135.845,
"eval_steps_per_second": 35.503,
"step": 1900
},
{
"epoch": 1.6795865633074936,
"grad_norm": 1.094499111175537,
"learning_rate": 0.00019328509905254093,
"loss": 0.7711544799804687,
"step": 1950
},
{
"epoch": 1.6795865633074936,
"eval_loss": 0.7377104163169861,
"eval_runtime": 16.7677,
"eval_samples_per_second": 1133.373,
"eval_steps_per_second": 35.425,
"step": 1950
},
{
"epoch": 1.722652885443583,
"grad_norm": 1.166304349899292,
"learning_rate": 0.00019311283376399657,
"loss": 0.7610466003417968,
"step": 2000
},
{
"epoch": 1.722652885443583,
"eval_loss": 0.732494056224823,
"eval_runtime": 16.6682,
"eval_samples_per_second": 1140.132,
"eval_steps_per_second": 35.637,
"step": 2000
},
{
"epoch": 1.7657192075796728,
"grad_norm": 1.171218991279602,
"learning_rate": 0.0001929405684754522,
"loss": 0.7330171203613282,
"step": 2050
},
{
"epoch": 1.7657192075796728,
"eval_loss": 0.7134495377540588,
"eval_runtime": 16.8525,
"eval_samples_per_second": 1127.667,
"eval_steps_per_second": 35.247,
"step": 2050
},
{
"epoch": 1.8087855297157622,
"grad_norm": 1.0657861232757568,
"learning_rate": 0.00019276830318690785,
"loss": 0.7187896728515625,
"step": 2100
},
{
"epoch": 1.8087855297157622,
"eval_loss": 0.6970002055168152,
"eval_runtime": 16.3557,
"eval_samples_per_second": 1161.916,
"eval_steps_per_second": 36.318,
"step": 2100
},
{
"epoch": 1.8518518518518519,
"grad_norm": 0.9776778817176819,
"learning_rate": 0.0001925960378983635,
"loss": 0.7035723114013672,
"step": 2150
},
{
"epoch": 1.8518518518518519,
"eval_loss": 0.7010369300842285,
"eval_runtime": 16.6178,
"eval_samples_per_second": 1143.593,
"eval_steps_per_second": 35.745,
"step": 2150
},
{
"epoch": 1.8949181739879415,
"grad_norm": 1.1066110134124756,
"learning_rate": 0.0001924237726098191,
"loss": 0.7039942169189453,
"step": 2200
},
{
"epoch": 1.8949181739879415,
"eval_loss": 0.7116236090660095,
"eval_runtime": 16.6683,
"eval_samples_per_second": 1140.126,
"eval_steps_per_second": 35.636,
"step": 2200
},
{
"epoch": 1.937984496124031,
"grad_norm": 1.0685100555419922,
"learning_rate": 0.00019225150732127478,
"loss": 0.70912109375,
"step": 2250
},
{
"epoch": 1.937984496124031,
"eval_loss": 0.6799043416976929,
"eval_runtime": 16.1106,
"eval_samples_per_second": 1179.597,
"eval_steps_per_second": 36.87,
"step": 2250
},
{
"epoch": 1.9810508182601207,
"grad_norm": 1.0897897481918335,
"learning_rate": 0.00019207924203273042,
"loss": 0.691868896484375,
"step": 2300
},
{
"epoch": 1.9810508182601207,
"eval_loss": 0.6700846552848816,
"eval_runtime": 16.1839,
"eval_samples_per_second": 1174.254,
"eval_steps_per_second": 36.703,
"step": 2300
},
{
"epoch": 2.02411714039621,
"grad_norm": 0.9901228547096252,
"learning_rate": 0.00019190697674418606,
"loss": 0.6789529418945313,
"step": 2350
},
{
"epoch": 2.02411714039621,
"eval_loss": 0.6777063012123108,
"eval_runtime": 16.6687,
"eval_samples_per_second": 1140.099,
"eval_steps_per_second": 35.636,
"step": 2350
},
{
"epoch": 2.0671834625322996,
"grad_norm": 1.12906014919281,
"learning_rate": 0.0001917347114556417,
"loss": 0.664688720703125,
"step": 2400
},
{
"epoch": 2.0671834625322996,
"eval_loss": 0.6748037338256836,
"eval_runtime": 16.8591,
"eval_samples_per_second": 1127.228,
"eval_steps_per_second": 35.233,
"step": 2400
},
{
"epoch": 2.1102497846683894,
"grad_norm": 1.0208185911178589,
"learning_rate": 0.00019156244616709732,
"loss": 0.6541690063476563,
"step": 2450
},
{
"epoch": 2.1102497846683894,
"eval_loss": 0.6536487936973572,
"eval_runtime": 16.5497,
"eval_samples_per_second": 1148.299,
"eval_steps_per_second": 35.892,
"step": 2450
},
{
"epoch": 2.153316106804479,
"grad_norm": 1.067574381828308,
"learning_rate": 0.000191390180878553,
"loss": 0.6565885162353515,
"step": 2500
},
{
"epoch": 2.153316106804479,
"eval_loss": 0.6487522125244141,
"eval_runtime": 16.7171,
"eval_samples_per_second": 1136.798,
"eval_steps_per_second": 35.532,
"step": 2500
},
{
"epoch": 2.1963824289405687,
"grad_norm": 1.065882682800293,
"learning_rate": 0.0001912179155900086,
"loss": 0.651094970703125,
"step": 2550
},
{
"epoch": 2.1963824289405687,
"eval_loss": 0.6502730250358582,
"eval_runtime": 17.1893,
"eval_samples_per_second": 1105.573,
"eval_steps_per_second": 34.556,
"step": 2550
},
{
"epoch": 2.239448751076658,
"grad_norm": 0.9784948825836182,
"learning_rate": 0.00019104565030146427,
"loss": 0.6197249603271484,
"step": 2600
},
{
"epoch": 2.239448751076658,
"eval_loss": 0.6479949355125427,
"eval_runtime": 16.9796,
"eval_samples_per_second": 1119.223,
"eval_steps_per_second": 34.983,
"step": 2600
},
{
"epoch": 2.2825150732127475,
"grad_norm": 0.925772488117218,
"learning_rate": 0.00019087338501291992,
"loss": 0.6295633697509766,
"step": 2650
},
{
"epoch": 2.2825150732127475,
"eval_loss": 0.6387772560119629,
"eval_runtime": 16.5664,
"eval_samples_per_second": 1147.14,
"eval_steps_per_second": 35.856,
"step": 2650
},
{
"epoch": 2.3255813953488373,
"grad_norm": 1.0654851198196411,
"learning_rate": 0.00019070111972437553,
"loss": 0.631210823059082,
"step": 2700
},
{
"epoch": 2.3255813953488373,
"eval_loss": 0.636771559715271,
"eval_runtime": 17.4711,
"eval_samples_per_second": 1087.738,
"eval_steps_per_second": 33.999,
"step": 2700
},
{
"epoch": 2.3686477174849268,
"grad_norm": 1.0032150745391846,
"learning_rate": 0.0001905288544358312,
"loss": 0.6157208633422852,
"step": 2750
},
{
"epoch": 2.3686477174849268,
"eval_loss": 0.6328519582748413,
"eval_runtime": 17.1571,
"eval_samples_per_second": 1107.646,
"eval_steps_per_second": 34.621,
"step": 2750
},
{
"epoch": 2.4117140396210166,
"grad_norm": 0.869613528251648,
"learning_rate": 0.00019035658914728681,
"loss": 0.6231210327148438,
"step": 2800
},
{
"epoch": 2.4117140396210166,
"eval_loss": 0.6197053790092468,
"eval_runtime": 17.5434,
"eval_samples_per_second": 1083.258,
"eval_steps_per_second": 33.859,
"step": 2800
},
{
"epoch": 2.454780361757106,
"grad_norm": 1.018100619316101,
"learning_rate": 0.00019018432385874248,
"loss": 0.6205888748168945,
"step": 2850
},
{
"epoch": 2.454780361757106,
"eval_loss": 0.6245771050453186,
"eval_runtime": 17.4136,
"eval_samples_per_second": 1091.331,
"eval_steps_per_second": 34.111,
"step": 2850
},
{
"epoch": 2.4978466838931954,
"grad_norm": 0.9806316494941711,
"learning_rate": 0.0001900120585701981,
"loss": 0.6005197906494141,
"step": 2900
},
{
"epoch": 2.4978466838931954,
"eval_loss": 0.6213578581809998,
"eval_runtime": 17.1385,
"eval_samples_per_second": 1108.851,
"eval_steps_per_second": 34.659,
"step": 2900
},
{
"epoch": 2.5409130060292853,
"grad_norm": 0.8172425627708435,
"learning_rate": 0.00018983979328165377,
"loss": 0.606026611328125,
"step": 2950
},
{
"epoch": 2.5409130060292853,
"eval_loss": 0.6219125986099243,
"eval_runtime": 16.9845,
"eval_samples_per_second": 1118.902,
"eval_steps_per_second": 34.973,
"step": 2950
},
{
"epoch": 2.5839793281653747,
"grad_norm": 0.9219424724578857,
"learning_rate": 0.0001896675279931094,
"loss": 0.6011330032348633,
"step": 3000
},
{
"epoch": 2.5839793281653747,
"eval_loss": 0.6213803887367249,
"eval_runtime": 16.7255,
"eval_samples_per_second": 1136.232,
"eval_steps_per_second": 35.515,
"step": 3000
},
{
"epoch": 2.627045650301464,
"grad_norm": 0.9950935244560242,
"learning_rate": 0.00018949526270456503,
"loss": 0.5897697448730469,
"step": 3050
},
{
"epoch": 2.627045650301464,
"eval_loss": 0.5991339087486267,
"eval_runtime": 17.2627,
"eval_samples_per_second": 1100.871,
"eval_steps_per_second": 34.409,
"step": 3050
},
{
"epoch": 2.670111972437554,
"grad_norm": 0.8786163926124573,
"learning_rate": 0.0001893229974160207,
"loss": 0.5946865844726562,
"step": 3100
},
{
"epoch": 2.670111972437554,
"eval_loss": 0.6077448725700378,
"eval_runtime": 15.8056,
"eval_samples_per_second": 1202.358,
"eval_steps_per_second": 37.582,
"step": 3100
},
{
"epoch": 2.7131782945736433,
"grad_norm": 0.7736066579818726,
"learning_rate": 0.0001891507321274763,
"loss": 0.5817356872558593,
"step": 3150
},
{
"epoch": 2.7131782945736433,
"eval_loss": 0.5996423959732056,
"eval_runtime": 17.1554,
"eval_samples_per_second": 1107.754,
"eval_steps_per_second": 34.625,
"step": 3150
},
{
"epoch": 2.7562446167097328,
"grad_norm": 0.9255921244621277,
"learning_rate": 0.00018897846683893198,
"loss": 0.5700679016113281,
"step": 3200
},
{
"epoch": 2.7562446167097328,
"eval_loss": 0.5959638357162476,
"eval_runtime": 17.7994,
"eval_samples_per_second": 1067.678,
"eval_steps_per_second": 33.372,
"step": 3200
},
{
"epoch": 2.7993109388458226,
"grad_norm": 0.8310986757278442,
"learning_rate": 0.0001888062015503876,
"loss": 0.5805090713500977,
"step": 3250
},
{
"epoch": 2.7993109388458226,
"eval_loss": 0.5928221344947815,
"eval_runtime": 17.5479,
"eval_samples_per_second": 1082.982,
"eval_steps_per_second": 33.85,
"step": 3250
},
{
"epoch": 2.842377260981912,
"grad_norm": 0.9678044319152832,
"learning_rate": 0.00018863393626184324,
"loss": 0.5692436599731445,
"step": 3300
},
{
"epoch": 2.842377260981912,
"eval_loss": 0.5916841626167297,
"eval_runtime": 17.1141,
"eval_samples_per_second": 1110.431,
"eval_steps_per_second": 34.708,
"step": 3300
},
{
"epoch": 2.885443583118002,
"grad_norm": 0.9506468772888184,
"learning_rate": 0.0001884616709732989,
"loss": 0.5596051406860352,
"step": 3350
},
{
"epoch": 2.885443583118002,
"eval_loss": 0.5949987173080444,
"eval_runtime": 17.6707,
"eval_samples_per_second": 1075.45,
"eval_steps_per_second": 33.615,
"step": 3350
},
{
"epoch": 2.9285099052540913,
"grad_norm": 0.8617449998855591,
"learning_rate": 0.00018828940568475452,
"loss": 0.5663171768188476,
"step": 3400
},
{
"epoch": 2.9285099052540913,
"eval_loss": 0.5809502601623535,
"eval_runtime": 17.0812,
"eval_samples_per_second": 1112.571,
"eval_steps_per_second": 34.775,
"step": 3400
},
{
"epoch": 2.971576227390181,
"grad_norm": 1.022600531578064,
"learning_rate": 0.0001881171403962102,
"loss": 0.5594380187988282,
"step": 3450
},
{
"epoch": 2.971576227390181,
"eval_loss": 0.5832746028900146,
"eval_runtime": 15.7435,
"eval_samples_per_second": 1207.102,
"eval_steps_per_second": 37.73,
"step": 3450
},
{
"epoch": 3.0146425495262705,
"grad_norm": 0.8227512240409851,
"learning_rate": 0.0001879448751076658,
"loss": 0.5425720977783203,
"step": 3500
},
{
"epoch": 3.0146425495262705,
"eval_loss": 0.5764813423156738,
"eval_runtime": 17.1751,
"eval_samples_per_second": 1106.485,
"eval_steps_per_second": 34.585,
"step": 3500
},
{
"epoch": 3.05770887166236,
"grad_norm": 0.9377761483192444,
"learning_rate": 0.00018777260981912145,
"loss": 0.5348855590820313,
"step": 3550
},
{
"epoch": 3.05770887166236,
"eval_loss": 0.570465624332428,
"eval_runtime": 16.6757,
"eval_samples_per_second": 1139.621,
"eval_steps_per_second": 35.621,
"step": 3550
},
{
"epoch": 3.10077519379845,
"grad_norm": 0.854451596736908,
"learning_rate": 0.0001876003445305771,
"loss": 0.5510664367675782,
"step": 3600
},
{
"epoch": 3.10077519379845,
"eval_loss": 0.575946569442749,
"eval_runtime": 16.7449,
"eval_samples_per_second": 1134.91,
"eval_steps_per_second": 35.473,
"step": 3600
},
{
"epoch": 3.143841515934539,
"grad_norm": 0.8335583806037903,
"learning_rate": 0.00018742807924203273,
"loss": 0.5319256591796875,
"step": 3650
},
{
"epoch": 3.143841515934539,
"eval_loss": 0.5813077688217163,
"eval_runtime": 17.3381,
"eval_samples_per_second": 1096.084,
"eval_steps_per_second": 34.26,
"step": 3650
},
{
"epoch": 3.1869078380706286,
"grad_norm": 0.8312974572181702,
"learning_rate": 0.0001872558139534884,
"loss": 0.5365386199951172,
"step": 3700
},
{
"epoch": 3.1869078380706286,
"eval_loss": 0.5719412565231323,
"eval_runtime": 17.729,
"eval_samples_per_second": 1071.917,
"eval_steps_per_second": 33.504,
"step": 3700
},
{
"epoch": 3.2299741602067185,
"grad_norm": 0.7145699858665466,
"learning_rate": 0.00018708354866494401,
"loss": 0.5286837768554687,
"step": 3750
},
{
"epoch": 3.2299741602067185,
"eval_loss": 0.563346266746521,
"eval_runtime": 17.1979,
"eval_samples_per_second": 1105.021,
"eval_steps_per_second": 34.539,
"step": 3750
},
{
"epoch": 3.273040482342808,
"grad_norm": 0.7736471891403198,
"learning_rate": 0.00018691128337639968,
"loss": 0.5384387969970703,
"step": 3800
},
{
"epoch": 3.273040482342808,
"eval_loss": 0.562952995300293,
"eval_runtime": 17.2225,
"eval_samples_per_second": 1103.44,
"eval_steps_per_second": 34.49,
"step": 3800
},
{
"epoch": 3.3161068044788973,
"grad_norm": 0.8005329966545105,
"learning_rate": 0.0001867390180878553,
"loss": 0.5317356491088867,
"step": 3850
},
{
"epoch": 3.3161068044788973,
"eval_loss": 0.5772661566734314,
"eval_runtime": 17.4171,
"eval_samples_per_second": 1091.109,
"eval_steps_per_second": 34.104,
"step": 3850
},
{
"epoch": 3.359173126614987,
"grad_norm": 0.9179701805114746,
"learning_rate": 0.00018656675279931094,
"loss": 0.5409442138671875,
"step": 3900
},
{
"epoch": 3.359173126614987,
"eval_loss": 0.5530401468276978,
"eval_runtime": 17.0967,
"eval_samples_per_second": 1111.562,
"eval_steps_per_second": 34.744,
"step": 3900
},
{
"epoch": 3.4022394487510765,
"grad_norm": 0.7085617780685425,
"learning_rate": 0.00018639448751076658,
"loss": 0.5294640350341797,
"step": 3950
},
{
"epoch": 3.4022394487510765,
"eval_loss": 0.5698192119598389,
"eval_runtime": 17.0564,
"eval_samples_per_second": 1114.183,
"eval_steps_per_second": 34.826,
"step": 3950
},
{
"epoch": 3.4453057708871664,
"grad_norm": 0.7268469333648682,
"learning_rate": 0.00018622222222222223,
"loss": 0.5306741333007813,
"step": 4000
},
{
"epoch": 3.4453057708871664,
"eval_loss": 0.5615823268890381,
"eval_runtime": 17.2492,
"eval_samples_per_second": 1101.734,
"eval_steps_per_second": 34.436,
"step": 4000
},
{
"epoch": 3.488372093023256,
"grad_norm": 0.6962466239929199,
"learning_rate": 0.0001860499569336779,
"loss": 0.5262623977661133,
"step": 4050
},
{
"epoch": 3.488372093023256,
"eval_loss": 0.5616022944450378,
"eval_runtime": 16.9577,
"eval_samples_per_second": 1120.674,
"eval_steps_per_second": 35.028,
"step": 4050
},
{
"epoch": 3.5314384151593456,
"grad_norm": 0.8463277816772461,
"learning_rate": 0.0001858776916451335,
"loss": 0.5265095138549805,
"step": 4100
},
{
"epoch": 3.5314384151593456,
"eval_loss": 0.5528528690338135,
"eval_runtime": 17.9044,
"eval_samples_per_second": 1061.414,
"eval_steps_per_second": 33.176,
"step": 4100
},
{
"epoch": 3.574504737295435,
"grad_norm": 0.841475784778595,
"learning_rate": 0.00018570542635658915,
"loss": 0.5126543426513672,
"step": 4150
},
{
"epoch": 3.574504737295435,
"eval_loss": 0.5555074214935303,
"eval_runtime": 17.3706,
"eval_samples_per_second": 1094.032,
"eval_steps_per_second": 34.196,
"step": 4150
},
{
"epoch": 3.6175710594315245,
"grad_norm": 0.925631046295166,
"learning_rate": 0.0001855331610680448,
"loss": 0.518094711303711,
"step": 4200
},
{
"epoch": 3.6175710594315245,
"eval_loss": 0.5519886612892151,
"eval_runtime": 17.2437,
"eval_samples_per_second": 1102.083,
"eval_steps_per_second": 34.447,
"step": 4200
},
{
"epoch": 3.6606373815676143,
"grad_norm": 0.770258903503418,
"learning_rate": 0.00018536089577950044,
"loss": 0.5245172119140625,
"step": 4250
},
{
"epoch": 3.6606373815676143,
"eval_loss": 0.5562922954559326,
"eval_runtime": 17.3784,
"eval_samples_per_second": 1093.54,
"eval_steps_per_second": 34.18,
"step": 4250
},
{
"epoch": 3.7037037037037037,
"grad_norm": 0.790610134601593,
"learning_rate": 0.00018518863049095608,
"loss": 0.5096866989135742,
"step": 4300
},
{
"epoch": 3.7037037037037037,
"eval_loss": 0.5537944436073303,
"eval_runtime": 17.0179,
"eval_samples_per_second": 1116.704,
"eval_steps_per_second": 34.904,
"step": 4300
},
{
"epoch": 3.746770025839793,
"grad_norm": 0.8743278384208679,
"learning_rate": 0.00018501636520241172,
"loss": 0.5054009246826172,
"step": 4350
},
{
"epoch": 3.746770025839793,
"eval_loss": 0.5489095449447632,
"eval_runtime": 17.5664,
"eval_samples_per_second": 1081.839,
"eval_steps_per_second": 33.815,
"step": 4350
},
{
"epoch": 3.789836347975883,
"grad_norm": 1.0607808828353882,
"learning_rate": 0.00018484409991386736,
"loss": 0.5099769973754883,
"step": 4400
},
{
"epoch": 3.789836347975883,
"eval_loss": 0.5474193692207336,
"eval_runtime": 16.9832,
"eval_samples_per_second": 1118.988,
"eval_steps_per_second": 34.976,
"step": 4400
},
{
"epoch": 3.8329026701119724,
"grad_norm": 0.8405239582061768,
"learning_rate": 0.000184671834625323,
"loss": 0.5094646453857422,
"step": 4450
},
{
"epoch": 3.8329026701119724,
"eval_loss": 0.539194643497467,
"eval_runtime": 17.3908,
"eval_samples_per_second": 1092.763,
"eval_steps_per_second": 34.156,
"step": 4450
},
{
"epoch": 3.875968992248062,
"grad_norm": 0.7805312275886536,
"learning_rate": 0.00018449956933677865,
"loss": 0.506949462890625,
"step": 4500
},
{
"epoch": 3.875968992248062,
"eval_loss": 0.542855441570282,
"eval_runtime": 16.5024,
"eval_samples_per_second": 1151.593,
"eval_steps_per_second": 35.995,
"step": 4500
},
{
"epoch": 3.9190353143841516,
"grad_norm": 0.7860807180404663,
"learning_rate": 0.0001843273040482343,
"loss": 0.5034315872192383,
"step": 4550
},
{
"epoch": 3.9190353143841516,
"eval_loss": 0.5419240593910217,
"eval_runtime": 17.33,
"eval_samples_per_second": 1096.598,
"eval_steps_per_second": 34.276,
"step": 4550
},
{
"epoch": 3.962101636520241,
"grad_norm": 0.667561411857605,
"learning_rate": 0.00018415503875968993,
"loss": 0.48937828063964844,
"step": 4600
},
{
"epoch": 3.962101636520241,
"eval_loss": 0.5298347473144531,
"eval_runtime": 17.6483,
"eval_samples_per_second": 1076.82,
"eval_steps_per_second": 33.658,
"step": 4600
},
{
"epoch": 4.0051679586563305,
"grad_norm": 0.8568994998931885,
"learning_rate": 0.00018398277347114557,
"loss": 0.5055955505371094,
"step": 4650
},
{
"epoch": 4.0051679586563305,
"eval_loss": 0.5280942320823669,
"eval_runtime": 17.4227,
"eval_samples_per_second": 1090.759,
"eval_steps_per_second": 34.093,
"step": 4650
},
{
"epoch": 4.04823428079242,
"grad_norm": 0.9342105984687805,
"learning_rate": 0.00018381050818260121,
"loss": 0.48168006896972654,
"step": 4700
},
{
"epoch": 4.04823428079242,
"eval_loss": 0.5394223928451538,
"eval_runtime": 17.1425,
"eval_samples_per_second": 1108.592,
"eval_steps_per_second": 34.651,
"step": 4700
},
{
"epoch": 4.09130060292851,
"grad_norm": 0.7910040616989136,
"learning_rate": 0.00018363824289405686,
"loss": 0.48789196014404296,
"step": 4750
},
{
"epoch": 4.09130060292851,
"eval_loss": 0.534257173538208,
"eval_runtime": 17.3471,
"eval_samples_per_second": 1095.516,
"eval_steps_per_second": 34.242,
"step": 4750
},
{
"epoch": 4.134366925064599,
"grad_norm": 0.7728812098503113,
"learning_rate": 0.0001834659776055125,
"loss": 0.47792926788330076,
"step": 4800
},
{
"epoch": 4.134366925064599,
"eval_loss": 0.5376848578453064,
"eval_runtime": 17.163,
"eval_samples_per_second": 1107.267,
"eval_steps_per_second": 34.609,
"step": 4800
},
{
"epoch": 4.177433247200689,
"grad_norm": 0.9220482110977173,
"learning_rate": 0.00018329371231696814,
"loss": 0.48800254821777345,
"step": 4850
},
{
"epoch": 4.177433247200689,
"eval_loss": 0.5448176860809326,
"eval_runtime": 16.7685,
"eval_samples_per_second": 1133.315,
"eval_steps_per_second": 35.424,
"step": 4850
},
{
"epoch": 4.220499569336779,
"grad_norm": 0.7572962641716003,
"learning_rate": 0.00018312144702842378,
"loss": 0.48915714263916016,
"step": 4900
},
{
"epoch": 4.220499569336779,
"eval_loss": 0.5347938537597656,
"eval_runtime": 19.5201,
"eval_samples_per_second": 973.561,
"eval_steps_per_second": 30.43,
"step": 4900
},
{
"epoch": 4.263565891472869,
"grad_norm": 0.7922815680503845,
"learning_rate": 0.00018294918173987943,
"loss": 0.4717945861816406,
"step": 4950
},
{
"epoch": 4.263565891472869,
"eval_loss": 0.5262234807014465,
"eval_runtime": 18.6404,
"eval_samples_per_second": 1019.509,
"eval_steps_per_second": 31.866,
"step": 4950
},
{
"epoch": 4.306632213608958,
"grad_norm": 0.8557692766189575,
"learning_rate": 0.00018277691645133507,
"loss": 0.48916732788085937,
"step": 5000
},
{
"epoch": 4.306632213608958,
"eval_loss": 0.5336983799934387,
"eval_runtime": 16.6117,
"eval_samples_per_second": 1144.015,
"eval_steps_per_second": 35.758,
"step": 5000
},
{
"epoch": 4.3496985357450475,
"grad_norm": 0.8177461624145508,
"learning_rate": 0.0001826046511627907,
"loss": 0.4893519592285156,
"step": 5050
},
{
"epoch": 4.3496985357450475,
"eval_loss": 0.5244340300559998,
"eval_runtime": 18.6024,
"eval_samples_per_second": 1021.589,
"eval_steps_per_second": 31.931,
"step": 5050
},
{
"epoch": 4.392764857881137,
"grad_norm": 0.9127139449119568,
"learning_rate": 0.00018243238587424635,
"loss": 0.4808861923217773,
"step": 5100
},
{
"epoch": 4.392764857881137,
"eval_loss": 0.5295798778533936,
"eval_runtime": 17.282,
"eval_samples_per_second": 1099.64,
"eval_steps_per_second": 34.371,
"step": 5100
},
{
"epoch": 4.435831180017226,
"grad_norm": 0.7107005715370178,
"learning_rate": 0.000182260120585702,
"loss": 0.4700811767578125,
"step": 5150
},
{
"epoch": 4.435831180017226,
"eval_loss": 0.5272426009178162,
"eval_runtime": 17.251,
"eval_samples_per_second": 1101.619,
"eval_steps_per_second": 34.433,
"step": 5150
},
{
"epoch": 4.478897502153316,
"grad_norm": 0.7757647633552551,
"learning_rate": 0.00018208785529715764,
"loss": 0.48522254943847654,
"step": 5200
},
{
"epoch": 4.478897502153316,
"eval_loss": 0.5197687745094299,
"eval_runtime": 17.2491,
"eval_samples_per_second": 1101.737,
"eval_steps_per_second": 34.437,
"step": 5200
},
{
"epoch": 4.521963824289406,
"grad_norm": 0.7408074140548706,
"learning_rate": 0.00018191559000861328,
"loss": 0.47749095916748047,
"step": 5250
},
{
"epoch": 4.521963824289406,
"eval_loss": 0.5270209312438965,
"eval_runtime": 17.2465,
"eval_samples_per_second": 1101.905,
"eval_steps_per_second": 34.442,
"step": 5250
},
{
"epoch": 4.565030146425495,
"grad_norm": 0.7746986150741577,
"learning_rate": 0.00018174332472006892,
"loss": 0.4729361343383789,
"step": 5300
},
{
"epoch": 4.565030146425495,
"eval_loss": 0.5280850529670715,
"eval_runtime": 16.8871,
"eval_samples_per_second": 1125.354,
"eval_steps_per_second": 35.175,
"step": 5300
},
{
"epoch": 4.608096468561585,
"grad_norm": 0.733068585395813,
"learning_rate": 0.00018157105943152456,
"loss": 0.475450439453125,
"step": 5350
},
{
"epoch": 4.608096468561585,
"eval_loss": 0.5167151689529419,
"eval_runtime": 17.6109,
"eval_samples_per_second": 1079.104,
"eval_steps_per_second": 33.729,
"step": 5350
},
{
"epoch": 4.651162790697675,
"grad_norm": 0.6393090486526489,
"learning_rate": 0.0001813987941429802,
"loss": 0.46974494934082034,
"step": 5400
},
{
"epoch": 4.651162790697675,
"eval_loss": 0.5263372659683228,
"eval_runtime": 17.1455,
"eval_samples_per_second": 1108.395,
"eval_steps_per_second": 34.645,
"step": 5400
},
{
"epoch": 4.694229112833764,
"grad_norm": 0.7762560844421387,
"learning_rate": 0.00018122652885443585,
"loss": 0.4681483459472656,
"step": 5450
},
{
"epoch": 4.694229112833764,
"eval_loss": 0.5209926962852478,
"eval_runtime": 16.9614,
"eval_samples_per_second": 1120.423,
"eval_steps_per_second": 35.021,
"step": 5450
},
{
"epoch": 4.7372954349698535,
"grad_norm": 0.6850082278251648,
"learning_rate": 0.0001810542635658915,
"loss": 0.47497440338134767,
"step": 5500
},
{
"epoch": 4.7372954349698535,
"eval_loss": 0.5202682614326477,
"eval_runtime": 17.7827,
"eval_samples_per_second": 1068.681,
"eval_steps_per_second": 33.403,
"step": 5500
},
{
"epoch": 4.780361757105943,
"grad_norm": 0.5083145499229431,
"learning_rate": 0.00018088199827734713,
"loss": 0.4617390441894531,
"step": 5550
},
{
"epoch": 4.780361757105943,
"eval_loss": 0.5157390832901001,
"eval_runtime": 17.164,
"eval_samples_per_second": 1107.204,
"eval_steps_per_second": 34.607,
"step": 5550
},
{
"epoch": 4.823428079242033,
"grad_norm": 0.6421205401420593,
"learning_rate": 0.00018070973298880277,
"loss": 0.46330387115478516,
"step": 5600
},
{
"epoch": 4.823428079242033,
"eval_loss": 0.5214293003082275,
"eval_runtime": 17.3236,
"eval_samples_per_second": 1097.001,
"eval_steps_per_second": 34.288,
"step": 5600
},
{
"epoch": 4.866494401378122,
"grad_norm": 0.811347246170044,
"learning_rate": 0.00018053746770025841,
"loss": 0.44483318328857424,
"step": 5650
},
{
"epoch": 4.866494401378122,
"eval_loss": 0.5206965208053589,
"eval_runtime": 18.0419,
"eval_samples_per_second": 1053.327,
"eval_steps_per_second": 32.923,
"step": 5650
},
{
"epoch": 4.909560723514212,
"grad_norm": 0.723167359828949,
"learning_rate": 0.00018036520241171403,
"loss": 0.45687950134277344,
"step": 5700
},
{
"epoch": 4.909560723514212,
"eval_loss": 0.5181661248207092,
"eval_runtime": 17.6599,
"eval_samples_per_second": 1076.113,
"eval_steps_per_second": 33.636,
"step": 5700
},
{
"epoch": 4.952627045650301,
"grad_norm": 0.821702241897583,
"learning_rate": 0.0001801929371231697,
"loss": 0.4651851272583008,
"step": 5750
},
{
"epoch": 4.952627045650301,
"eval_loss": 0.5162400007247925,
"eval_runtime": 16.6915,
"eval_samples_per_second": 1138.542,
"eval_steps_per_second": 35.587,
"step": 5750
},
{
"epoch": 4.995693367786391,
"grad_norm": 0.9089387059211731,
"learning_rate": 0.00018002067183462531,
"loss": 0.46469066619873045,
"step": 5800
},
{
"epoch": 4.995693367786391,
"eval_loss": 0.5052764415740967,
"eval_runtime": 17.7442,
"eval_samples_per_second": 1071.0,
"eval_steps_per_second": 33.476,
"step": 5800
},
{
"epoch": 5.038759689922481,
"grad_norm": 0.6017876863479614,
"learning_rate": 0.00017984840654608098,
"loss": 0.44865966796875,
"step": 5850
},
{
"epoch": 5.038759689922481,
"eval_loss": 0.5109136700630188,
"eval_runtime": 16.853,
"eval_samples_per_second": 1127.636,
"eval_steps_per_second": 35.246,
"step": 5850
},
{
"epoch": 5.0818260120585705,
"grad_norm": 0.7853453159332275,
"learning_rate": 0.00017967614125753662,
"loss": 0.46314373016357424,
"step": 5900
},
{
"epoch": 5.0818260120585705,
"eval_loss": 0.513229250907898,
"eval_runtime": 16.6481,
"eval_samples_per_second": 1141.512,
"eval_steps_per_second": 35.68,
"step": 5900
},
{
"epoch": 5.1248923341946595,
"grad_norm": 0.7348341941833496,
"learning_rate": 0.00017950387596899224,
"loss": 0.4561290740966797,
"step": 5950
},
{
"epoch": 5.1248923341946595,
"eval_loss": 0.5073517560958862,
"eval_runtime": 17.3322,
"eval_samples_per_second": 1096.454,
"eval_steps_per_second": 34.271,
"step": 5950
},
{
"epoch": 5.167958656330749,
"grad_norm": 0.902717113494873,
"learning_rate": 0.0001793316106804479,
"loss": 0.4551318359375,
"step": 6000
},
{
"epoch": 5.167958656330749,
"eval_loss": 0.5088045001029968,
"eval_runtime": 17.2698,
"eval_samples_per_second": 1100.421,
"eval_steps_per_second": 34.395,
"step": 6000
},
{
"epoch": 5.211024978466839,
"grad_norm": 0.7658097743988037,
"learning_rate": 0.00017915934539190352,
"loss": 0.4525523376464844,
"step": 6050
},
{
"epoch": 5.211024978466839,
"eval_loss": 0.5058748722076416,
"eval_runtime": 17.313,
"eval_samples_per_second": 1097.669,
"eval_steps_per_second": 34.309,
"step": 6050
},
{
"epoch": 5.254091300602928,
"grad_norm": 0.7149024605751038,
"learning_rate": 0.0001789870801033592,
"loss": 0.45574371337890623,
"step": 6100
},
{
"epoch": 5.254091300602928,
"eval_loss": 0.4991587996482849,
"eval_runtime": 17.203,
"eval_samples_per_second": 1104.694,
"eval_steps_per_second": 34.529,
"step": 6100
},
{
"epoch": 5.297157622739018,
"grad_norm": 0.6680011749267578,
"learning_rate": 0.0001788148148148148,
"loss": 0.4474559020996094,
"step": 6150
},
{
"epoch": 5.297157622739018,
"eval_loss": 0.5035756230354309,
"eval_runtime": 17.0604,
"eval_samples_per_second": 1113.928,
"eval_steps_per_second": 34.818,
"step": 6150
},
{
"epoch": 5.340223944875108,
"grad_norm": 0.6084044575691223,
"learning_rate": 0.00017864254952627045,
"loss": 0.4532883834838867,
"step": 6200
},
{
"epoch": 5.340223944875108,
"eval_loss": 0.5061535239219666,
"eval_runtime": 17.2044,
"eval_samples_per_second": 1104.599,
"eval_steps_per_second": 34.526,
"step": 6200
},
{
"epoch": 5.383290267011197,
"grad_norm": 0.6588345766067505,
"learning_rate": 0.00017847028423772612,
"loss": 0.44038040161132813,
"step": 6250
},
{
"epoch": 5.383290267011197,
"eval_loss": 0.5017246603965759,
"eval_runtime": 17.2744,
"eval_samples_per_second": 1100.123,
"eval_steps_per_second": 34.386,
"step": 6250
},
{
"epoch": 5.426356589147287,
"grad_norm": 0.7204316258430481,
"learning_rate": 0.00017829801894918173,
"loss": 0.45370059967041015,
"step": 6300
},
{
"epoch": 5.426356589147287,
"eval_loss": 0.513796329498291,
"eval_runtime": 17.368,
"eval_samples_per_second": 1094.195,
"eval_steps_per_second": 34.201,
"step": 6300
},
{
"epoch": 5.4694229112833765,
"grad_norm": 0.6742197871208191,
"learning_rate": 0.0001781257536606374,
"loss": 0.4497013473510742,
"step": 6350
},
{
"epoch": 5.4694229112833765,
"eval_loss": 0.4986371695995331,
"eval_runtime": 16.7527,
"eval_samples_per_second": 1134.381,
"eval_steps_per_second": 35.457,
"step": 6350
},
{
"epoch": 5.5124892334194655,
"grad_norm": 0.6411893963813782,
"learning_rate": 0.00017795348837209302,
"loss": 0.4475288009643555,
"step": 6400
},
{
"epoch": 5.5124892334194655,
"eval_loss": 0.4998282492160797,
"eval_runtime": 17.2908,
"eval_samples_per_second": 1099.081,
"eval_steps_per_second": 34.354,
"step": 6400
},
{
"epoch": 5.555555555555555,
"grad_norm": 0.7940775156021118,
"learning_rate": 0.0001777812230835487,
"loss": 0.43942253112792967,
"step": 6450
},
{
"epoch": 5.555555555555555,
"eval_loss": 0.5093288421630859,
"eval_runtime": 17.0928,
"eval_samples_per_second": 1111.813,
"eval_steps_per_second": 34.751,
"step": 6450
},
{
"epoch": 5.598621877691645,
"grad_norm": 0.6469661593437195,
"learning_rate": 0.0001776089577950043,
"loss": 0.44519527435302736,
"step": 6500
},
{
"epoch": 5.598621877691645,
"eval_loss": 0.5019457340240479,
"eval_runtime": 17.1997,
"eval_samples_per_second": 1104.904,
"eval_steps_per_second": 34.536,
"step": 6500
},
{
"epoch": 5.641688199827735,
"grad_norm": 0.6471430063247681,
"learning_rate": 0.00017743669250645995,
"loss": 0.4439122772216797,
"step": 6550
},
{
"epoch": 5.641688199827735,
"eval_loss": 0.4955105185508728,
"eval_runtime": 17.2722,
"eval_samples_per_second": 1100.265,
"eval_steps_per_second": 34.391,
"step": 6550
},
{
"epoch": 5.684754521963824,
"grad_norm": 0.7212668061256409,
"learning_rate": 0.00017726442721791561,
"loss": 0.4470480346679688,
"step": 6600
},
{
"epoch": 5.684754521963824,
"eval_loss": 0.5024229288101196,
"eval_runtime": 17.7954,
"eval_samples_per_second": 1067.916,
"eval_steps_per_second": 33.379,
"step": 6600
},
{
"epoch": 5.727820844099914,
"grad_norm": 0.717725932598114,
"learning_rate": 0.00017709216192937123,
"loss": 0.44252021789550783,
"step": 6650
},
{
"epoch": 5.727820844099914,
"eval_loss": 0.49507880210876465,
"eval_runtime": 17.1214,
"eval_samples_per_second": 1109.957,
"eval_steps_per_second": 34.693,
"step": 6650
},
{
"epoch": 5.770887166236004,
"grad_norm": 0.6818066835403442,
"learning_rate": 0.0001769198966408269,
"loss": 0.44801937103271483,
"step": 6700
},
{
"epoch": 5.770887166236004,
"eval_loss": 0.49282634258270264,
"eval_runtime": 17.2449,
"eval_samples_per_second": 1102.007,
"eval_steps_per_second": 34.445,
"step": 6700
},
{
"epoch": 5.813953488372093,
"grad_norm": 0.7912653684616089,
"learning_rate": 0.00017674763135228251,
"loss": 0.4390088653564453,
"step": 6750
},
{
"epoch": 5.813953488372093,
"eval_loss": 0.49278008937835693,
"eval_runtime": 17.1369,
"eval_samples_per_second": 1108.954,
"eval_steps_per_second": 34.662,
"step": 6750
},
{
"epoch": 5.8570198105081825,
"grad_norm": 0.7058820128440857,
"learning_rate": 0.00017657536606373816,
"loss": 0.4471379089355469,
"step": 6800
},
{
"epoch": 5.8570198105081825,
"eval_loss": 0.4846435487270355,
"eval_runtime": 16.2701,
"eval_samples_per_second": 1168.029,
"eval_steps_per_second": 36.509,
"step": 6800
},
{
"epoch": 5.900086132644272,
"grad_norm": 0.6577419638633728,
"learning_rate": 0.0001764031007751938,
"loss": 0.4329195404052734,
"step": 6850
},
{
"epoch": 5.900086132644272,
"eval_loss": 0.4891928732395172,
"eval_runtime": 17.5298,
"eval_samples_per_second": 1084.097,
"eval_steps_per_second": 33.885,
"step": 6850
},
{
"epoch": 5.943152454780362,
"grad_norm": 0.6675143241882324,
"learning_rate": 0.00017623083548664944,
"loss": 0.4451116180419922,
"step": 6900
},
{
"epoch": 5.943152454780362,
"eval_loss": 0.49225810170173645,
"eval_runtime": 17.2842,
"eval_samples_per_second": 1099.503,
"eval_steps_per_second": 34.367,
"step": 6900
},
{
"epoch": 5.986218776916451,
"grad_norm": 0.637526273727417,
"learning_rate": 0.0001760585701981051,
"loss": 0.4396438217163086,
"step": 6950
},
{
"epoch": 5.986218776916451,
"eval_loss": 0.4892192780971527,
"eval_runtime": 17.4708,
"eval_samples_per_second": 1087.757,
"eval_steps_per_second": 34.0,
"step": 6950
},
{
"epoch": 6.029285099052541,
"grad_norm": 0.6243614554405212,
"learning_rate": 0.00017588630490956072,
"loss": 0.43088115692138673,
"step": 7000
},
{
"epoch": 6.029285099052541,
"eval_loss": 0.4946584403514862,
"eval_runtime": 17.2236,
"eval_samples_per_second": 1103.37,
"eval_steps_per_second": 34.488,
"step": 7000
},
{
"epoch": 6.072351421188631,
"grad_norm": 0.6974560618400574,
"learning_rate": 0.00017571403962101637,
"loss": 0.43844024658203123,
"step": 7050
},
{
"epoch": 6.072351421188631,
"eval_loss": 0.4866611063480377,
"eval_runtime": 17.3366,
"eval_samples_per_second": 1096.177,
"eval_steps_per_second": 34.263,
"step": 7050
},
{
"epoch": 6.11541774332472,
"grad_norm": 0.7083775401115417,
"learning_rate": 0.000175541774332472,
"loss": 0.4352645111083984,
"step": 7100
},
{
"epoch": 6.11541774332472,
"eval_loss": 0.48218822479248047,
"eval_runtime": 17.1233,
"eval_samples_per_second": 1109.831,
"eval_steps_per_second": 34.69,
"step": 7100
},
{
"epoch": 6.15848406546081,
"grad_norm": 0.7368115782737732,
"learning_rate": 0.00017536950904392765,
"loss": 0.43624774932861327,
"step": 7150
},
{
"epoch": 6.15848406546081,
"eval_loss": 0.48245272040367126,
"eval_runtime": 16.2557,
"eval_samples_per_second": 1169.065,
"eval_steps_per_second": 36.541,
"step": 7150
},
{
"epoch": 6.2015503875969,
"grad_norm": 0.6461877226829529,
"learning_rate": 0.0001751972437553833,
"loss": 0.4340004348754883,
"step": 7200
},
{
"epoch": 6.2015503875969,
"eval_loss": 0.48431915044784546,
"eval_runtime": 17.0182,
"eval_samples_per_second": 1116.689,
"eval_steps_per_second": 34.904,
"step": 7200
},
{
"epoch": 6.2446167097329885,
"grad_norm": 0.6845267415046692,
"learning_rate": 0.00017502497846683893,
"loss": 0.41942180633544923,
"step": 7250
},
{
"epoch": 6.2446167097329885,
"eval_loss": 0.48767024278640747,
"eval_runtime": 15.9606,
"eval_samples_per_second": 1190.681,
"eval_steps_per_second": 37.217,
"step": 7250
},
{
"epoch": 6.287683031869078,
"grad_norm": 0.6504621505737305,
"learning_rate": 0.0001748527131782946,
"loss": 0.4262746429443359,
"step": 7300
},
{
"epoch": 6.287683031869078,
"eval_loss": 0.485770046710968,
"eval_runtime": 17.3977,
"eval_samples_per_second": 1092.326,
"eval_steps_per_second": 34.142,
"step": 7300
},
{
"epoch": 6.330749354005168,
"grad_norm": 0.6602944731712341,
"learning_rate": 0.00017468044788975022,
"loss": 0.4175703048706055,
"step": 7350
},
{
"epoch": 6.330749354005168,
"eval_loss": 0.48184552788734436,
"eval_runtime": 17.2587,
"eval_samples_per_second": 1101.126,
"eval_steps_per_second": 34.417,
"step": 7350
},
{
"epoch": 6.373815676141257,
"grad_norm": 0.686592161655426,
"learning_rate": 0.00017450818260120586,
"loss": 0.4337629699707031,
"step": 7400
},
{
"epoch": 6.373815676141257,
"eval_loss": 0.4817441403865814,
"eval_runtime": 17.4563,
"eval_samples_per_second": 1088.659,
"eval_steps_per_second": 34.028,
"step": 7400
},
{
"epoch": 6.416881998277347,
"grad_norm": 0.7309668064117432,
"learning_rate": 0.0001743359173126615,
"loss": 0.42674171447753906,
"step": 7450
},
{
"epoch": 6.416881998277347,
"eval_loss": 0.4860132336616516,
"eval_runtime": 17.4504,
"eval_samples_per_second": 1089.031,
"eval_steps_per_second": 34.039,
"step": 7450
},
{
"epoch": 6.459948320413437,
"grad_norm": 0.7980537414550781,
"learning_rate": 0.00017416365202411715,
"loss": 0.4265460205078125,
"step": 7500
},
{
"epoch": 6.459948320413437,
"eval_loss": 0.48347485065460205,
"eval_runtime": 17.2725,
"eval_samples_per_second": 1100.245,
"eval_steps_per_second": 34.39,
"step": 7500
},
{
"epoch": 6.503014642549527,
"grad_norm": 0.7159664034843445,
"learning_rate": 0.0001739913867355728,
"loss": 0.43710147857666015,
"step": 7550
},
{
"epoch": 6.503014642549527,
"eval_loss": 0.47490188479423523,
"eval_runtime": 17.4854,
"eval_samples_per_second": 1086.852,
"eval_steps_per_second": 33.971,
"step": 7550
},
{
"epoch": 6.546080964685616,
"grad_norm": 0.8943643569946289,
"learning_rate": 0.00017381912144702843,
"loss": 0.42655269622802733,
"step": 7600
},
{
"epoch": 6.546080964685616,
"eval_loss": 0.46805503964424133,
"eval_runtime": 16.7689,
"eval_samples_per_second": 1133.289,
"eval_steps_per_second": 35.423,
"step": 7600
},
{
"epoch": 6.589147286821706,
"grad_norm": 0.6106852889060974,
"learning_rate": 0.00017364685615848407,
"loss": 0.4123693466186523,
"step": 7650
},
{
"epoch": 6.589147286821706,
"eval_loss": 0.47157835960388184,
"eval_runtime": 17.4834,
"eval_samples_per_second": 1086.973,
"eval_steps_per_second": 33.975,
"step": 7650
},
{
"epoch": 6.6322136089577945,
"grad_norm": 0.5885831117630005,
"learning_rate": 0.0001734745908699397,
"loss": 0.41778255462646485,
"step": 7700
},
{
"epoch": 6.6322136089577945,
"eval_loss": 0.4761877954006195,
"eval_runtime": 16.9739,
"eval_samples_per_second": 1119.601,
"eval_steps_per_second": 34.995,
"step": 7700
},
{
"epoch": 6.675279931093884,
"grad_norm": 0.5975524187088013,
"learning_rate": 0.00017330232558139536,
"loss": 0.4212100219726562,
"step": 7750
},
{
"epoch": 6.675279931093884,
"eval_loss": 0.468214750289917,
"eval_runtime": 17.1264,
"eval_samples_per_second": 1109.633,
"eval_steps_per_second": 34.683,
"step": 7750
},
{
"epoch": 6.718346253229974,
"grad_norm": 0.6737027168273926,
"learning_rate": 0.000173130060292851,
"loss": 0.423582763671875,
"step": 7800
},
{
"epoch": 6.718346253229974,
"eval_loss": 0.47411462664604187,
"eval_runtime": 17.3271,
"eval_samples_per_second": 1096.78,
"eval_steps_per_second": 34.282,
"step": 7800
},
{
"epoch": 6.761412575366064,
"grad_norm": 0.6910893321037292,
"learning_rate": 0.00017295779500430664,
"loss": 0.4238215637207031,
"step": 7850
},
{
"epoch": 6.761412575366064,
"eval_loss": 0.4713309705257416,
"eval_runtime": 17.2882,
"eval_samples_per_second": 1099.25,
"eval_steps_per_second": 34.359,
"step": 7850
},
{
"epoch": 6.804478897502153,
"grad_norm": 0.7054369449615479,
"learning_rate": 0.00017278552971576228,
"loss": 0.4182852172851563,
"step": 7900
},
{
"epoch": 6.804478897502153,
"eval_loss": 0.4671083986759186,
"eval_runtime": 17.3277,
"eval_samples_per_second": 1096.743,
"eval_steps_per_second": 34.28,
"step": 7900
},
{
"epoch": 6.847545219638243,
"grad_norm": 0.6247032880783081,
"learning_rate": 0.00017261326442721792,
"loss": 0.41578125,
"step": 7950
},
{
"epoch": 6.847545219638243,
"eval_loss": 0.47773998975753784,
"eval_runtime": 17.4983,
"eval_samples_per_second": 1086.046,
"eval_steps_per_second": 33.946,
"step": 7950
},
{
"epoch": 6.890611541774333,
"grad_norm": 0.8400962352752686,
"learning_rate": 0.00017244099913867357,
"loss": 0.41641212463378907,
"step": 8000
},
{
"epoch": 6.890611541774333,
"eval_loss": 0.4788215756416321,
"eval_runtime": 17.0811,
"eval_samples_per_second": 1112.574,
"eval_steps_per_second": 34.775,
"step": 8000
},
{
"epoch": 6.933677863910422,
"grad_norm": 0.6829719543457031,
"learning_rate": 0.0001722687338501292,
"loss": 0.41895538330078125,
"step": 8050
},
{
"epoch": 6.933677863910422,
"eval_loss": 0.4660840332508087,
"eval_runtime": 16.6677,
"eval_samples_per_second": 1140.173,
"eval_steps_per_second": 35.638,
"step": 8050
},
{
"epoch": 6.976744186046512,
"grad_norm": 0.7007145881652832,
"learning_rate": 0.00017209646856158485,
"loss": 0.42030097961425783,
"step": 8100
},
{
"epoch": 6.976744186046512,
"eval_loss": 0.4828939139842987,
"eval_runtime": 17.0682,
"eval_samples_per_second": 1113.413,
"eval_steps_per_second": 34.801,
"step": 8100
},
{
"epoch": 7.019810508182601,
"grad_norm": 0.5981889963150024,
"learning_rate": 0.0001719242032730405,
"loss": 0.41966373443603516,
"step": 8150
},
{
"epoch": 7.019810508182601,
"eval_loss": 0.4712368845939636,
"eval_runtime": 17.6419,
"eval_samples_per_second": 1077.205,
"eval_steps_per_second": 33.67,
"step": 8150
},
{
"epoch": 7.06287683031869,
"grad_norm": 0.5835450887680054,
"learning_rate": 0.00017175193798449613,
"loss": 0.40881683349609377,
"step": 8200
},
{
"epoch": 7.06287683031869,
"eval_loss": 0.47391384840011597,
"eval_runtime": 16.389,
"eval_samples_per_second": 1159.556,
"eval_steps_per_second": 36.244,
"step": 8200
},
{
"epoch": 7.10594315245478,
"grad_norm": 0.7046903967857361,
"learning_rate": 0.00017157967269595178,
"loss": 0.40905754089355467,
"step": 8250
},
{
"epoch": 7.10594315245478,
"eval_loss": 0.46922409534454346,
"eval_runtime": 17.5686,
"eval_samples_per_second": 1081.705,
"eval_steps_per_second": 33.81,
"step": 8250
},
{
"epoch": 7.14900947459087,
"grad_norm": 0.6183799505233765,
"learning_rate": 0.00017140740740740742,
"loss": 0.4082827377319336,
"step": 8300
},
{
"epoch": 7.14900947459087,
"eval_loss": 0.460680216550827,
"eval_runtime": 17.3062,
"eval_samples_per_second": 1098.102,
"eval_steps_per_second": 34.323,
"step": 8300
},
{
"epoch": 7.192075796726959,
"grad_norm": 0.591304361820221,
"learning_rate": 0.00017123514211886306,
"loss": 0.40852947235107423,
"step": 8350
},
{
"epoch": 7.192075796726959,
"eval_loss": 0.4756697118282318,
"eval_runtime": 17.2275,
"eval_samples_per_second": 1103.119,
"eval_steps_per_second": 34.48,
"step": 8350
},
{
"epoch": 7.235142118863049,
"grad_norm": 0.6740310192108154,
"learning_rate": 0.0001710628768303187,
"loss": 0.4172178268432617,
"step": 8400
},
{
"epoch": 7.235142118863049,
"eval_loss": 0.4743621051311493,
"eval_runtime": 17.3123,
"eval_samples_per_second": 1097.717,
"eval_steps_per_second": 34.311,
"step": 8400
},
{
"epoch": 7.278208440999139,
"grad_norm": 0.5578835010528564,
"learning_rate": 0.00017089061154177434,
"loss": 0.4068143081665039,
"step": 8450
},
{
"epoch": 7.278208440999139,
"eval_loss": 0.47389987111091614,
"eval_runtime": 17.3109,
"eval_samples_per_second": 1097.803,
"eval_steps_per_second": 34.314,
"step": 8450
},
{
"epoch": 7.321274763135229,
"grad_norm": 0.6933954358100891,
"learning_rate": 0.00017071834625323,
"loss": 0.4053021240234375,
"step": 8500
},
{
"epoch": 7.321274763135229,
"eval_loss": 0.47286462783813477,
"eval_runtime": 17.2635,
"eval_samples_per_second": 1100.82,
"eval_steps_per_second": 34.408,
"step": 8500
},
{
"epoch": 7.364341085271318,
"grad_norm": 0.7197252511978149,
"learning_rate": 0.00017054608096468563,
"loss": 0.41045791625976563,
"step": 8550
},
{
"epoch": 7.364341085271318,
"eval_loss": 0.46601465344429016,
"eval_runtime": 16.1238,
"eval_samples_per_second": 1178.628,
"eval_steps_per_second": 36.84,
"step": 8550
},
{
"epoch": 7.407407407407407,
"grad_norm": 0.6437709331512451,
"learning_rate": 0.00017037381567614124,
"loss": 0.4072903823852539,
"step": 8600
},
{
"epoch": 7.407407407407407,
"eval_loss": 0.47019290924072266,
"eval_runtime": 17.3946,
"eval_samples_per_second": 1092.521,
"eval_steps_per_second": 34.148,
"step": 8600
},
{
"epoch": 7.450473729543497,
"grad_norm": 0.5939879417419434,
"learning_rate": 0.0001702015503875969,
"loss": 0.40440605163574217,
"step": 8650
},
{
"epoch": 7.450473729543497,
"eval_loss": 0.4719381630420685,
"eval_runtime": 16.8308,
"eval_samples_per_second": 1129.122,
"eval_steps_per_second": 35.292,
"step": 8650
},
{
"epoch": 7.493540051679586,
"grad_norm": 0.8631351590156555,
"learning_rate": 0.00017002928509905256,
"loss": 0.4140526580810547,
"step": 8700
},
{
"epoch": 7.493540051679586,
"eval_loss": 0.46160271763801575,
"eval_runtime": 17.1488,
"eval_samples_per_second": 1108.18,
"eval_steps_per_second": 34.638,
"step": 8700
},
{
"epoch": 7.536606373815676,
"grad_norm": 0.6466717720031738,
"learning_rate": 0.0001698570198105082,
"loss": 0.40803627014160154,
"step": 8750
},
{
"epoch": 7.536606373815676,
"eval_loss": 0.47316986322402954,
"eval_runtime": 17.3823,
"eval_samples_per_second": 1093.294,
"eval_steps_per_second": 34.173,
"step": 8750
},
{
"epoch": 7.579672695951766,
"grad_norm": 0.7510745525360107,
"learning_rate": 0.00016968475452196384,
"loss": 0.40083221435546873,
"step": 8800
},
{
"epoch": 7.579672695951766,
"eval_loss": 0.472391813993454,
"eval_runtime": 17.7188,
"eval_samples_per_second": 1072.532,
"eval_steps_per_second": 33.524,
"step": 8800
},
{
"epoch": 7.622739018087855,
"grad_norm": 0.7287890911102295,
"learning_rate": 0.00016951248923341948,
"loss": 0.40442337036132814,
"step": 8850
},
{
"epoch": 7.622739018087855,
"eval_loss": 0.46329817175865173,
"eval_runtime": 17.365,
"eval_samples_per_second": 1094.385,
"eval_steps_per_second": 34.207,
"step": 8850
},
{
"epoch": 7.665805340223945,
"grad_norm": 0.6469999551773071,
"learning_rate": 0.00016934022394487512,
"loss": 0.4089640045166016,
"step": 8900
},
{
"epoch": 7.665805340223945,
"eval_loss": 0.4669649004936218,
"eval_runtime": 17.1577,
"eval_samples_per_second": 1107.607,
"eval_steps_per_second": 34.62,
"step": 8900
},
{
"epoch": 7.708871662360035,
"grad_norm": 0.6616798639297485,
"learning_rate": 0.00016916795865633074,
"loss": 0.40827705383300783,
"step": 8950
},
{
"epoch": 7.708871662360035,
"eval_loss": 0.4632900655269623,
"eval_runtime": 17.1597,
"eval_samples_per_second": 1107.478,
"eval_steps_per_second": 34.616,
"step": 8950
},
{
"epoch": 7.751937984496124,
"grad_norm": 0.8429326415061951,
"learning_rate": 0.0001689956933677864,
"loss": 0.40378345489501954,
"step": 9000
},
{
"epoch": 7.751937984496124,
"eval_loss": 0.46200963854789734,
"eval_runtime": 16.748,
"eval_samples_per_second": 1134.703,
"eval_steps_per_second": 35.467,
"step": 9000
},
{
"epoch": 7.795004306632213,
"grad_norm": 0.6626513004302979,
"learning_rate": 0.00016882342807924202,
"loss": 0.3880492401123047,
"step": 9050
},
{
"epoch": 7.795004306632213,
"eval_loss": 0.4620370864868164,
"eval_runtime": 17.4679,
"eval_samples_per_second": 1087.941,
"eval_steps_per_second": 34.005,
"step": 9050
},
{
"epoch": 7.838070628768303,
"grad_norm": 0.588159441947937,
"learning_rate": 0.0001686511627906977,
"loss": 0.3987490844726562,
"step": 9100
},
{
"epoch": 7.838070628768303,
"eval_loss": 0.46975627541542053,
"eval_runtime": 16.9538,
"eval_samples_per_second": 1120.932,
"eval_steps_per_second": 35.036,
"step": 9100
},
{
"epoch": 7.881136950904393,
"grad_norm": 0.632615327835083,
"learning_rate": 0.00016847889750215333,
"loss": 0.3987492370605469,
"step": 9150
},
{
"epoch": 7.881136950904393,
"eval_loss": 0.4580506682395935,
"eval_runtime": 16.9917,
"eval_samples_per_second": 1118.427,
"eval_steps_per_second": 34.958,
"step": 9150
},
{
"epoch": 7.924203273040482,
"grad_norm": 0.521962583065033,
"learning_rate": 0.00016830663221360895,
"loss": 0.3979851531982422,
"step": 9200
},
{
"epoch": 7.924203273040482,
"eval_loss": 0.455099493265152,
"eval_runtime": 17.8257,
"eval_samples_per_second": 1066.101,
"eval_steps_per_second": 33.323,
"step": 9200
},
{
"epoch": 7.967269595176572,
"grad_norm": 0.5939018726348877,
"learning_rate": 0.00016813436692506462,
"loss": 0.40017799377441404,
"step": 9250
},
{
"epoch": 7.967269595176572,
"eval_loss": 0.4639855921268463,
"eval_runtime": 16.0253,
"eval_samples_per_second": 1185.872,
"eval_steps_per_second": 37.066,
"step": 9250
},
{
"epoch": 8.010335917312661,
"grad_norm": 0.6152825951576233,
"learning_rate": 0.00016796210163652023,
"loss": 0.40499732971191404,
"step": 9300
},
{
"epoch": 8.010335917312661,
"eval_loss": 0.4626195430755615,
"eval_runtime": 17.1638,
"eval_samples_per_second": 1107.212,
"eval_steps_per_second": 34.608,
"step": 9300
},
{
"epoch": 8.05340223944875,
"grad_norm": 0.7398512959480286,
"learning_rate": 0.0001677898363479759,
"loss": 0.38915348052978516,
"step": 9350
},
{
"epoch": 8.05340223944875,
"eval_loss": 0.45817697048187256,
"eval_runtime": 16.8467,
"eval_samples_per_second": 1128.053,
"eval_steps_per_second": 35.259,
"step": 9350
},
{
"epoch": 8.09646856158484,
"grad_norm": 0.7731131911277771,
"learning_rate": 0.00016761757105943152,
"loss": 0.3941301727294922,
"step": 9400
},
{
"epoch": 8.09646856158484,
"eval_loss": 0.44625988602638245,
"eval_runtime": 17.3837,
"eval_samples_per_second": 1093.21,
"eval_steps_per_second": 34.17,
"step": 9400
},
{
"epoch": 8.13953488372093,
"grad_norm": 0.5277883410453796,
"learning_rate": 0.00016744530577088716,
"loss": 0.4036808776855469,
"step": 9450
},
{
"epoch": 8.13953488372093,
"eval_loss": 0.4603153467178345,
"eval_runtime": 17.0968,
"eval_samples_per_second": 1111.554,
"eval_steps_per_second": 34.743,
"step": 9450
},
{
"epoch": 8.18260120585702,
"grad_norm": 0.6296549439430237,
"learning_rate": 0.00016727304048234283,
"loss": 0.391630744934082,
"step": 9500
},
{
"epoch": 8.18260120585702,
"eval_loss": 0.46081680059432983,
"eval_runtime": 17.0294,
"eval_samples_per_second": 1115.951,
"eval_steps_per_second": 34.881,
"step": 9500
},
{
"epoch": 8.22566752799311,
"grad_norm": 0.75502610206604,
"learning_rate": 0.00016710077519379844,
"loss": 0.39266380310058596,
"step": 9550
},
{
"epoch": 8.22566752799311,
"eval_loss": 0.46628537774086,
"eval_runtime": 17.2988,
"eval_samples_per_second": 1098.574,
"eval_steps_per_second": 34.338,
"step": 9550
},
{
"epoch": 8.268733850129198,
"grad_norm": 0.7355108261108398,
"learning_rate": 0.0001669285099052541,
"loss": 0.38500797271728515,
"step": 9600
},
{
"epoch": 8.268733850129198,
"eval_loss": 0.45412328839302063,
"eval_runtime": 16.0582,
"eval_samples_per_second": 1183.444,
"eval_steps_per_second": 36.99,
"step": 9600
},
{
"epoch": 8.311800172265288,
"grad_norm": 0.5990964770317078,
"learning_rate": 0.00016675624461670973,
"loss": 0.3953593444824219,
"step": 9650
},
{
"epoch": 8.311800172265288,
"eval_loss": 0.460112065076828,
"eval_runtime": 17.2327,
"eval_samples_per_second": 1102.785,
"eval_steps_per_second": 34.469,
"step": 9650
},
{
"epoch": 8.354866494401378,
"grad_norm": 0.5800876021385193,
"learning_rate": 0.0001665839793281654,
"loss": 0.39939281463623044,
"step": 9700
},
{
"epoch": 8.354866494401378,
"eval_loss": 0.45985108613967896,
"eval_runtime": 16.8147,
"eval_samples_per_second": 1130.204,
"eval_steps_per_second": 35.326,
"step": 9700
},
{
"epoch": 8.397932816537468,
"grad_norm": 0.6307799816131592,
"learning_rate": 0.000166411714039621,
"loss": 0.38937881469726565,
"step": 9750
},
{
"epoch": 8.397932816537468,
"eval_loss": 0.45991334319114685,
"eval_runtime": 19.4581,
"eval_samples_per_second": 976.665,
"eval_steps_per_second": 30.527,
"step": 9750
},
{
"epoch": 8.440999138673558,
"grad_norm": 0.5629838109016418,
"learning_rate": 0.00016623944875107665,
"loss": 0.40010757446289064,
"step": 9800
},
{
"epoch": 8.440999138673558,
"eval_loss": 0.4581594467163086,
"eval_runtime": 18.8434,
"eval_samples_per_second": 1008.521,
"eval_steps_per_second": 31.523,
"step": 9800
},
{
"epoch": 8.484065460809648,
"grad_norm": 0.7436449527740479,
"learning_rate": 0.00016606718346253232,
"loss": 0.3864497375488281,
"step": 9850
},
{
"epoch": 8.484065460809648,
"eval_loss": 0.4545927345752716,
"eval_runtime": 17.6423,
"eval_samples_per_second": 1077.184,
"eval_steps_per_second": 33.669,
"step": 9850
},
{
"epoch": 8.527131782945737,
"grad_norm": 0.6741047501564026,
"learning_rate": 0.00016589491817398794,
"loss": 0.39211551666259764,
"step": 9900
},
{
"epoch": 8.527131782945737,
"eval_loss": 0.45410382747650146,
"eval_runtime": 19.733,
"eval_samples_per_second": 963.059,
"eval_steps_per_second": 30.102,
"step": 9900
},
{
"epoch": 8.570198105081825,
"grad_norm": 0.6780059337615967,
"learning_rate": 0.0001657226528854436,
"loss": 0.3939036178588867,
"step": 9950
},
{
"epoch": 8.570198105081825,
"eval_loss": 0.4540647566318512,
"eval_runtime": 17.4929,
"eval_samples_per_second": 1086.381,
"eval_steps_per_second": 33.957,
"step": 9950
},
{
"epoch": 8.613264427217915,
"grad_norm": 0.6214281916618347,
"learning_rate": 0.00016555038759689922,
"loss": 0.3902143096923828,
"step": 10000
},
{
"epoch": 8.613264427217915,
"eval_loss": 0.45562663674354553,
"eval_runtime": 17.1617,
"eval_samples_per_second": 1107.35,
"eval_steps_per_second": 34.612,
"step": 10000
},
{
"epoch": 8.656330749354005,
"grad_norm": 0.6021105647087097,
"learning_rate": 0.00016537812230835487,
"loss": 0.3947802734375,
"step": 10050
},
{
"epoch": 8.656330749354005,
"eval_loss": 0.45046406984329224,
"eval_runtime": 16.3866,
"eval_samples_per_second": 1159.725,
"eval_steps_per_second": 36.249,
"step": 10050
},
{
"epoch": 8.699397071490095,
"grad_norm": 0.6217834949493408,
"learning_rate": 0.0001652058570198105,
"loss": 0.39479156494140627,
"step": 10100
},
{
"epoch": 8.699397071490095,
"eval_loss": 0.4492938220500946,
"eval_runtime": 16.9921,
"eval_samples_per_second": 1118.405,
"eval_steps_per_second": 34.958,
"step": 10100
},
{
"epoch": 8.742463393626185,
"grad_norm": 0.5626874566078186,
"learning_rate": 0.00016503359173126615,
"loss": 0.39408538818359373,
"step": 10150
},
{
"epoch": 8.742463393626185,
"eval_loss": 0.44171395897865295,
"eval_runtime": 16.0412,
"eval_samples_per_second": 1184.701,
"eval_steps_per_second": 37.03,
"step": 10150
},
{
"epoch": 8.785529715762275,
"grad_norm": 0.582612931728363,
"learning_rate": 0.00016486132644272182,
"loss": 0.38269500732421874,
"step": 10200
},
{
"epoch": 8.785529715762275,
"eval_loss": 0.4445774555206299,
"eval_runtime": 17.3218,
"eval_samples_per_second": 1097.115,
"eval_steps_per_second": 34.292,
"step": 10200
},
{
"epoch": 8.828596037898363,
"grad_norm": 0.5538569092750549,
"learning_rate": 0.00016468906115417743,
"loss": 0.3785874938964844,
"step": 10250
},
{
"epoch": 8.828596037898363,
"eval_loss": 0.44508177042007446,
"eval_runtime": 16.4479,
"eval_samples_per_second": 1155.404,
"eval_steps_per_second": 36.114,
"step": 10250
},
{
"epoch": 8.871662360034453,
"grad_norm": 0.6597520112991333,
"learning_rate": 0.00016451679586563308,
"loss": 0.38223503112792967,
"step": 10300
},
{
"epoch": 8.871662360034453,
"eval_loss": 0.4491458833217621,
"eval_runtime": 17.5632,
"eval_samples_per_second": 1082.038,
"eval_steps_per_second": 33.821,
"step": 10300
},
{
"epoch": 8.914728682170542,
"grad_norm": 0.6764523983001709,
"learning_rate": 0.00016434453057708872,
"loss": 0.3907370376586914,
"step": 10350
},
{
"epoch": 8.914728682170542,
"eval_loss": 0.45313188433647156,
"eval_runtime": 17.1402,
"eval_samples_per_second": 1108.736,
"eval_steps_per_second": 34.655,
"step": 10350
},
{
"epoch": 8.957795004306632,
"grad_norm": 0.6652244329452515,
"learning_rate": 0.00016417226528854436,
"loss": 0.3858833312988281,
"step": 10400
},
{
"epoch": 8.957795004306632,
"eval_loss": 0.43872758746147156,
"eval_runtime": 16.8242,
"eval_samples_per_second": 1129.565,
"eval_steps_per_second": 35.306,
"step": 10400
},
{
"epoch": 9.000861326442722,
"grad_norm": 0.6380756497383118,
"learning_rate": 0.000164,
"loss": 0.3900408935546875,
"step": 10450
},
{
"epoch": 9.000861326442722,
"eval_loss": 0.4445935785770416,
"eval_runtime": 17.3745,
"eval_samples_per_second": 1093.789,
"eval_steps_per_second": 34.188,
"step": 10450
},
{
"epoch": 9.043927648578812,
"grad_norm": 0.5553951859474182,
"learning_rate": 0.00016382773471145564,
"loss": 0.3827755355834961,
"step": 10500
},
{
"epoch": 9.043927648578812,
"eval_loss": 0.43815913796424866,
"eval_runtime": 16.5732,
"eval_samples_per_second": 1146.672,
"eval_steps_per_second": 35.841,
"step": 10500
},
{
"epoch": 9.0869939707149,
"grad_norm": 0.6225996613502502,
"learning_rate": 0.0001636554694229113,
"loss": 0.3869587326049805,
"step": 10550
},
{
"epoch": 9.0869939707149,
"eval_loss": 0.4400031864643097,
"eval_runtime": 17.3172,
"eval_samples_per_second": 1097.409,
"eval_steps_per_second": 34.301,
"step": 10550
},
{
"epoch": 9.13006029285099,
"grad_norm": 0.6141314506530762,
"learning_rate": 0.00016348320413436693,
"loss": 0.38014041900634765,
"step": 10600
},
{
"epoch": 9.13006029285099,
"eval_loss": 0.4501992166042328,
"eval_runtime": 16.1985,
"eval_samples_per_second": 1173.191,
"eval_steps_per_second": 36.67,
"step": 10600
},
{
"epoch": 9.17312661498708,
"grad_norm": 0.6114876866340637,
"learning_rate": 0.00016331093884582257,
"loss": 0.3823957824707031,
"step": 10650
},
{
"epoch": 9.17312661498708,
"eval_loss": 0.44051340222358704,
"eval_runtime": 17.9518,
"eval_samples_per_second": 1058.615,
"eval_steps_per_second": 33.089,
"step": 10650
},
{
"epoch": 9.21619293712317,
"grad_norm": 0.7262207865715027,
"learning_rate": 0.0001631386735572782,
"loss": 0.38280406951904294,
"step": 10700
},
{
"epoch": 9.21619293712317,
"eval_loss": 0.44237202405929565,
"eval_runtime": 17.3535,
"eval_samples_per_second": 1095.109,
"eval_steps_per_second": 34.229,
"step": 10700
},
{
"epoch": 9.25925925925926,
"grad_norm": 0.8027353286743164,
"learning_rate": 0.00016296640826873385,
"loss": 0.38072364807128906,
"step": 10750
},
{
"epoch": 9.25925925925926,
"eval_loss": 0.4526488482952118,
"eval_runtime": 17.7263,
"eval_samples_per_second": 1072.078,
"eval_steps_per_second": 33.509,
"step": 10750
},
{
"epoch": 9.30232558139535,
"grad_norm": 0.6305018067359924,
"learning_rate": 0.0001627941429801895,
"loss": 0.37516212463378906,
"step": 10800
},
{
"epoch": 9.30232558139535,
"eval_loss": 0.4463886618614197,
"eval_runtime": 17.2375,
"eval_samples_per_second": 1102.483,
"eval_steps_per_second": 34.46,
"step": 10800
},
{
"epoch": 9.34539190353144,
"grad_norm": 0.5047479867935181,
"learning_rate": 0.00016262187769164514,
"loss": 0.3838955307006836,
"step": 10850
},
{
"epoch": 9.34539190353144,
"eval_loss": 0.44005444645881653,
"eval_runtime": 17.4853,
"eval_samples_per_second": 1086.859,
"eval_steps_per_second": 33.971,
"step": 10850
},
{
"epoch": 9.388458225667527,
"grad_norm": 0.5472151041030884,
"learning_rate": 0.00016244961240310078,
"loss": 0.3777126693725586,
"step": 10900
},
{
"epoch": 9.388458225667527,
"eval_loss": 0.4376201033592224,
"eval_runtime": 17.3113,
"eval_samples_per_second": 1097.783,
"eval_steps_per_second": 34.313,
"step": 10900
},
{
"epoch": 9.431524547803617,
"grad_norm": 0.5795921087265015,
"learning_rate": 0.00016227734711455642,
"loss": 0.3782763671875,
"step": 10950
},
{
"epoch": 9.431524547803617,
"eval_loss": 0.448537677526474,
"eval_runtime": 16.4602,
"eval_samples_per_second": 1154.545,
"eval_steps_per_second": 36.087,
"step": 10950
},
{
"epoch": 9.474590869939707,
"grad_norm": 0.5140406489372253,
"learning_rate": 0.00016210508182601206,
"loss": 0.3784123611450195,
"step": 11000
},
{
"epoch": 9.474590869939707,
"eval_loss": 0.43698176741600037,
"eval_runtime": 18.2099,
"eval_samples_per_second": 1043.611,
"eval_steps_per_second": 32.62,
"step": 11000
},
{
"epoch": 9.517657192075797,
"grad_norm": 0.7354308366775513,
"learning_rate": 0.0001619328165374677,
"loss": 0.3827821731567383,
"step": 11050
},
{
"epoch": 9.517657192075797,
"eval_loss": 0.4404692053794861,
"eval_runtime": 16.5291,
"eval_samples_per_second": 1149.728,
"eval_steps_per_second": 35.937,
"step": 11050
},
{
"epoch": 9.560723514211887,
"grad_norm": 0.5452671051025391,
"learning_rate": 0.00016176055124892335,
"loss": 0.3785516357421875,
"step": 11100
},
{
"epoch": 9.560723514211887,
"eval_loss": 0.43846017122268677,
"eval_runtime": 17.2509,
"eval_samples_per_second": 1101.623,
"eval_steps_per_second": 34.433,
"step": 11100
},
{
"epoch": 9.603789836347977,
"grad_norm": 0.5908451080322266,
"learning_rate": 0.000161588285960379,
"loss": 0.3758753967285156,
"step": 11150
},
{
"epoch": 9.603789836347977,
"eval_loss": 0.4382474720478058,
"eval_runtime": 16.8649,
"eval_samples_per_second": 1126.835,
"eval_steps_per_second": 35.221,
"step": 11150
},
{
"epoch": 9.646856158484065,
"grad_norm": 0.5804340243339539,
"learning_rate": 0.00016141602067183463,
"loss": 0.361547966003418,
"step": 11200
},
{
"epoch": 9.646856158484065,
"eval_loss": 0.4456506669521332,
"eval_runtime": 17.7288,
"eval_samples_per_second": 1071.926,
"eval_steps_per_second": 33.505,
"step": 11200
},
{
"epoch": 9.689922480620154,
"grad_norm": 0.6554870009422302,
"learning_rate": 0.00016124375538329028,
"loss": 0.3801045608520508,
"step": 11250
},
{
"epoch": 9.689922480620154,
"eval_loss": 0.43516239523887634,
"eval_runtime": 17.3981,
"eval_samples_per_second": 1092.305,
"eval_steps_per_second": 34.142,
"step": 11250
},
{
"epoch": 9.732988802756244,
"grad_norm": 0.7969627976417542,
"learning_rate": 0.00016107149009474592,
"loss": 0.3737542724609375,
"step": 11300
},
{
"epoch": 9.732988802756244,
"eval_loss": 0.44730818271636963,
"eval_runtime": 17.3437,
"eval_samples_per_second": 1095.73,
"eval_steps_per_second": 34.249,
"step": 11300
},
{
"epoch": 9.776055124892334,
"grad_norm": 0.7298914790153503,
"learning_rate": 0.00016089922480620156,
"loss": 0.3751395797729492,
"step": 11350
},
{
"epoch": 9.776055124892334,
"eval_loss": 0.43920770287513733,
"eval_runtime": 17.3811,
"eval_samples_per_second": 1093.369,
"eval_steps_per_second": 34.175,
"step": 11350
},
{
"epoch": 9.819121447028424,
"grad_norm": 0.6960113048553467,
"learning_rate": 0.0001607269595176572,
"loss": 0.38081275939941406,
"step": 11400
},
{
"epoch": 9.819121447028424,
"eval_loss": 0.4437948167324066,
"eval_runtime": 16.202,
"eval_samples_per_second": 1172.942,
"eval_steps_per_second": 36.662,
"step": 11400
},
{
"epoch": 9.862187769164514,
"grad_norm": 0.541384756565094,
"learning_rate": 0.00016055469422911284,
"loss": 0.3859746170043945,
"step": 11450
},
{
"epoch": 9.862187769164514,
"eval_loss": 0.44062063097953796,
"eval_runtime": 17.3324,
"eval_samples_per_second": 1096.441,
"eval_steps_per_second": 34.271,
"step": 11450
},
{
"epoch": 9.905254091300604,
"grad_norm": 0.651509165763855,
"learning_rate": 0.00016038242894056849,
"loss": 0.37168853759765624,
"step": 11500
},
{
"epoch": 9.905254091300604,
"eval_loss": 0.4349174201488495,
"eval_runtime": 16.8433,
"eval_samples_per_second": 1128.285,
"eval_steps_per_second": 35.266,
"step": 11500
},
{
"epoch": 9.948320413436692,
"grad_norm": 0.7806121706962585,
"learning_rate": 0.00016021016365202413,
"loss": 0.37619327545166015,
"step": 11550
},
{
"epoch": 9.948320413436692,
"eval_loss": 0.4412415623664856,
"eval_runtime": 17.4266,
"eval_samples_per_second": 1090.517,
"eval_steps_per_second": 34.086,
"step": 11550
},
{
"epoch": 9.991386735572782,
"grad_norm": 0.550398051738739,
"learning_rate": 0.00016003789836347977,
"loss": 0.37089550018310546,
"step": 11600
},
{
"epoch": 9.991386735572782,
"eval_loss": 0.4356813430786133,
"eval_runtime": 16.8274,
"eval_samples_per_second": 1129.35,
"eval_steps_per_second": 35.3,
"step": 11600
},
{
"epoch": 10.034453057708872,
"grad_norm": 0.6004517674446106,
"learning_rate": 0.0001598656330749354,
"loss": 0.3658511352539062,
"step": 11650
},
{
"epoch": 10.034453057708872,
"eval_loss": 0.4357646405696869,
"eval_runtime": 17.4756,
"eval_samples_per_second": 1087.456,
"eval_steps_per_second": 33.99,
"step": 11650
},
{
"epoch": 10.077519379844961,
"grad_norm": 0.6227307915687561,
"learning_rate": 0.00015969336778639105,
"loss": 0.36172927856445314,
"step": 11700
},
{
"epoch": 10.077519379844961,
"eval_loss": 0.44107383489608765,
"eval_runtime": 17.4553,
"eval_samples_per_second": 1088.724,
"eval_steps_per_second": 34.03,
"step": 11700
},
{
"epoch": 10.120585701981051,
"grad_norm": 0.6746264696121216,
"learning_rate": 0.0001595211024978467,
"loss": 0.3724021911621094,
"step": 11750
},
{
"epoch": 10.120585701981051,
"eval_loss": 0.44043871760368347,
"eval_runtime": 16.1534,
"eval_samples_per_second": 1176.469,
"eval_steps_per_second": 36.772,
"step": 11750
},
{
"epoch": 10.163652024117141,
"grad_norm": 0.7006517648696899,
"learning_rate": 0.00015934883720930234,
"loss": 0.37673473358154297,
"step": 11800
},
{
"epoch": 10.163652024117141,
"eval_loss": 0.4389665424823761,
"eval_runtime": 17.4499,
"eval_samples_per_second": 1089.063,
"eval_steps_per_second": 34.04,
"step": 11800
},
{
"epoch": 10.20671834625323,
"grad_norm": 0.5790155529975891,
"learning_rate": 0.00015917657192075795,
"loss": 0.3703031158447266,
"step": 11850
},
{
"epoch": 10.20671834625323,
"eval_loss": 0.44572848081588745,
"eval_runtime": 16.6654,
"eval_samples_per_second": 1140.325,
"eval_steps_per_second": 35.643,
"step": 11850
},
{
"epoch": 10.249784668389319,
"grad_norm": 0.6631970405578613,
"learning_rate": 0.00015900430663221362,
"loss": 0.374197998046875,
"step": 11900
},
{
"epoch": 10.249784668389319,
"eval_loss": 0.4412307143211365,
"eval_runtime": 17.3364,
"eval_samples_per_second": 1096.193,
"eval_steps_per_second": 34.263,
"step": 11900
},
{
"epoch": 10.292850990525409,
"grad_norm": 0.5273671746253967,
"learning_rate": 0.00015883204134366926,
"loss": 0.37083282470703127,
"step": 11950
},
{
"epoch": 10.292850990525409,
"eval_loss": 0.43490493297576904,
"eval_runtime": 17.5978,
"eval_samples_per_second": 1079.907,
"eval_steps_per_second": 33.754,
"step": 11950
},
{
"epoch": 10.335917312661499,
"grad_norm": 0.528450608253479,
"learning_rate": 0.0001586597760551249,
"loss": 0.3701524353027344,
"step": 12000
},
{
"epoch": 10.335917312661499,
"eval_loss": 0.4375080168247223,
"eval_runtime": 17.927,
"eval_samples_per_second": 1060.078,
"eval_steps_per_second": 33.134,
"step": 12000
},
{
"epoch": 10.378983634797589,
"grad_norm": 0.5573973059654236,
"learning_rate": 0.00015848751076658055,
"loss": 0.3601241683959961,
"step": 12050
},
{
"epoch": 10.378983634797589,
"eval_loss": 0.4350755512714386,
"eval_runtime": 17.0222,
"eval_samples_per_second": 1116.424,
"eval_steps_per_second": 34.896,
"step": 12050
},
{
"epoch": 10.422049956933678,
"grad_norm": 0.6054695844650269,
"learning_rate": 0.0001583152454780362,
"loss": 0.3784658050537109,
"step": 12100
},
{
"epoch": 10.422049956933678,
"eval_loss": 0.43380206823349,
"eval_runtime": 17.5629,
"eval_samples_per_second": 1082.056,
"eval_steps_per_second": 33.821,
"step": 12100
},
{
"epoch": 10.465116279069768,
"grad_norm": 0.6000425815582275,
"learning_rate": 0.00015814298018949183,
"loss": 0.3644602966308594,
"step": 12150
},
{
"epoch": 10.465116279069768,
"eval_loss": 0.43688303232192993,
"eval_runtime": 16.9392,
"eval_samples_per_second": 1121.897,
"eval_steps_per_second": 35.067,
"step": 12150
},
{
"epoch": 10.508182601205856,
"grad_norm": 0.6011264324188232,
"learning_rate": 0.00015797071490094745,
"loss": 0.36014255523681643,
"step": 12200
},
{
"epoch": 10.508182601205856,
"eval_loss": 0.43164920806884766,
"eval_runtime": 16.5719,
"eval_samples_per_second": 1146.76,
"eval_steps_per_second": 35.844,
"step": 12200
},
{
"epoch": 10.551248923341946,
"grad_norm": 0.5600745677947998,
"learning_rate": 0.00015779844961240312,
"loss": 0.3694350433349609,
"step": 12250
},
{
"epoch": 10.551248923341946,
"eval_loss": 0.43914440274238586,
"eval_runtime": 15.8135,
"eval_samples_per_second": 1201.757,
"eval_steps_per_second": 37.563,
"step": 12250
},
{
"epoch": 10.594315245478036,
"grad_norm": 0.7852029800415039,
"learning_rate": 0.00015762618432385876,
"loss": 0.3673841094970703,
"step": 12300
},
{
"epoch": 10.594315245478036,
"eval_loss": 0.4305252432823181,
"eval_runtime": 16.0232,
"eval_samples_per_second": 1186.031,
"eval_steps_per_second": 37.071,
"step": 12300
},
{
"epoch": 10.637381567614126,
"grad_norm": 0.4954133629798889,
"learning_rate": 0.0001574539190353144,
"loss": 0.37212295532226564,
"step": 12350
},
{
"epoch": 10.637381567614126,
"eval_loss": 0.42601069808006287,
"eval_runtime": 15.7868,
"eval_samples_per_second": 1203.79,
"eval_steps_per_second": 37.626,
"step": 12350
},
{
"epoch": 10.680447889750216,
"grad_norm": 0.5571798086166382,
"learning_rate": 0.00015728165374677004,
"loss": 0.36096405029296874,
"step": 12400
},
{
"epoch": 10.680447889750216,
"eval_loss": 0.4234640598297119,
"eval_runtime": 16.6133,
"eval_samples_per_second": 1143.9,
"eval_steps_per_second": 35.754,
"step": 12400
},
{
"epoch": 10.723514211886306,
"grad_norm": 0.6863204836845398,
"learning_rate": 0.00015710938845822566,
"loss": 0.370234375,
"step": 12450
},
{
"epoch": 10.723514211886306,
"eval_loss": 0.4254721999168396,
"eval_runtime": 17.0259,
"eval_samples_per_second": 1116.184,
"eval_steps_per_second": 34.888,
"step": 12450
},
{
"epoch": 10.766580534022394,
"grad_norm": 0.6626876592636108,
"learning_rate": 0.00015693712316968133,
"loss": 0.36537841796875,
"step": 12500
},
{
"epoch": 10.766580534022394,
"eval_loss": 0.4343000054359436,
"eval_runtime": 16.9232,
"eval_samples_per_second": 1122.957,
"eval_steps_per_second": 35.1,
"step": 12500
},
{
"epoch": 10.809646856158484,
"grad_norm": 0.509812593460083,
"learning_rate": 0.00015676485788113694,
"loss": 0.3594136047363281,
"step": 12550
},
{
"epoch": 10.809646856158484,
"eval_loss": 0.430789589881897,
"eval_runtime": 16.2233,
"eval_samples_per_second": 1171.401,
"eval_steps_per_second": 36.614,
"step": 12550
},
{
"epoch": 10.852713178294573,
"grad_norm": 0.786090075969696,
"learning_rate": 0.0001565925925925926,
"loss": 0.35622817993164063,
"step": 12600
},
{
"epoch": 10.852713178294573,
"eval_loss": 0.4235756993293762,
"eval_runtime": 16.8862,
"eval_samples_per_second": 1125.416,
"eval_steps_per_second": 35.177,
"step": 12600
},
{
"epoch": 10.895779500430663,
"grad_norm": 0.6675612330436707,
"learning_rate": 0.00015642032730404823,
"loss": 0.36076969146728516,
"step": 12650
},
{
"epoch": 10.895779500430663,
"eval_loss": 0.432416170835495,
"eval_runtime": 16.6454,
"eval_samples_per_second": 1141.698,
"eval_steps_per_second": 35.686,
"step": 12650
},
{
"epoch": 10.938845822566753,
"grad_norm": 0.6346985697746277,
"learning_rate": 0.00015624806201550387,
"loss": 0.36723545074462893,
"step": 12700
},
{
"epoch": 10.938845822566753,
"eval_loss": 0.42092081904411316,
"eval_runtime": 17.2276,
"eval_samples_per_second": 1103.111,
"eval_steps_per_second": 34.479,
"step": 12700
},
{
"epoch": 10.981912144702843,
"grad_norm": 0.6402779817581177,
"learning_rate": 0.00015607579672695954,
"loss": 0.3656714630126953,
"step": 12750
},
{
"epoch": 10.981912144702843,
"eval_loss": 0.4239721894264221,
"eval_runtime": 16.6403,
"eval_samples_per_second": 1142.05,
"eval_steps_per_second": 35.697,
"step": 12750
},
{
"epoch": 11.024978466838933,
"grad_norm": 0.5922400951385498,
"learning_rate": 0.00015590353143841515,
"loss": 0.3640504455566406,
"step": 12800
},
{
"epoch": 11.024978466838933,
"eval_loss": 0.41864249110221863,
"eval_runtime": 17.2535,
"eval_samples_per_second": 1101.46,
"eval_steps_per_second": 34.428,
"step": 12800
},
{
"epoch": 11.06804478897502,
"grad_norm": 0.5507603287696838,
"learning_rate": 0.00015573126614987082,
"loss": 0.3542987060546875,
"step": 12850
},
{
"epoch": 11.06804478897502,
"eval_loss": 0.4256451725959778,
"eval_runtime": 17.2905,
"eval_samples_per_second": 1099.1,
"eval_steps_per_second": 34.354,
"step": 12850
},
{
"epoch": 11.11111111111111,
"grad_norm": 0.548511266708374,
"learning_rate": 0.00015555900086132644,
"loss": 0.3623085021972656,
"step": 12900
},
{
"epoch": 11.11111111111111,
"eval_loss": 0.42673003673553467,
"eval_runtime": 17.4446,
"eval_samples_per_second": 1089.39,
"eval_steps_per_second": 34.051,
"step": 12900
},
{
"epoch": 11.1541774332472,
"grad_norm": 0.6575692296028137,
"learning_rate": 0.0001553867355727821,
"loss": 0.3683247375488281,
"step": 12950
},
{
"epoch": 11.1541774332472,
"eval_loss": 0.4245961606502533,
"eval_runtime": 17.3578,
"eval_samples_per_second": 1094.837,
"eval_steps_per_second": 34.221,
"step": 12950
},
{
"epoch": 11.19724375538329,
"grad_norm": 0.5141576528549194,
"learning_rate": 0.00015521447028423772,
"loss": 0.3595258331298828,
"step": 13000
},
{
"epoch": 11.19724375538329,
"eval_loss": 0.4224443733692169,
"eval_runtime": 17.488,
"eval_samples_per_second": 1086.686,
"eval_steps_per_second": 33.966,
"step": 13000
},
{
"epoch": 11.24031007751938,
"grad_norm": 0.4834965169429779,
"learning_rate": 0.00015504220499569336,
"loss": 0.3647360992431641,
"step": 13050
},
{
"epoch": 11.24031007751938,
"eval_loss": 0.42276498675346375,
"eval_runtime": 17.1192,
"eval_samples_per_second": 1110.098,
"eval_steps_per_second": 34.698,
"step": 13050
},
{
"epoch": 11.28337639965547,
"grad_norm": 0.6599516272544861,
"learning_rate": 0.00015486993970714903,
"loss": 0.3615505599975586,
"step": 13100
},
{
"epoch": 11.28337639965547,
"eval_loss": 0.4229472577571869,
"eval_runtime": 16.418,
"eval_samples_per_second": 1157.508,
"eval_steps_per_second": 36.18,
"step": 13100
},
{
"epoch": 11.326442721791558,
"grad_norm": 0.6309078931808472,
"learning_rate": 0.00015469767441860465,
"loss": 0.36210174560546876,
"step": 13150
},
{
"epoch": 11.326442721791558,
"eval_loss": 0.42000076174736023,
"eval_runtime": 17.3299,
"eval_samples_per_second": 1096.604,
"eval_steps_per_second": 34.276,
"step": 13150
},
{
"epoch": 11.369509043927648,
"grad_norm": 0.5616578459739685,
"learning_rate": 0.00015452540913006032,
"loss": 0.3634178161621094,
"step": 13200
},
{
"epoch": 11.369509043927648,
"eval_loss": 0.4281771779060364,
"eval_runtime": 17.3687,
"eval_samples_per_second": 1094.15,
"eval_steps_per_second": 34.199,
"step": 13200
},
{
"epoch": 11.412575366063738,
"grad_norm": 0.6235555410385132,
"learning_rate": 0.00015435314384151593,
"loss": 0.35823890686035154,
"step": 13250
},
{
"epoch": 11.412575366063738,
"eval_loss": 0.4235421121120453,
"eval_runtime": 16.7581,
"eval_samples_per_second": 1134.016,
"eval_steps_per_second": 35.445,
"step": 13250
},
{
"epoch": 11.455641688199828,
"grad_norm": 0.5683552026748657,
"learning_rate": 0.00015418087855297157,
"loss": 0.3551355743408203,
"step": 13300
},
{
"epoch": 11.455641688199828,
"eval_loss": 0.40868687629699707,
"eval_runtime": 17.5985,
"eval_samples_per_second": 1079.862,
"eval_steps_per_second": 33.753,
"step": 13300
},
{
"epoch": 11.498708010335918,
"grad_norm": 0.5393732786178589,
"learning_rate": 0.00015400861326442722,
"loss": 0.35280082702636717,
"step": 13350
},
{
"epoch": 11.498708010335918,
"eval_loss": 0.4148114025592804,
"eval_runtime": 17.5185,
"eval_samples_per_second": 1084.798,
"eval_steps_per_second": 33.907,
"step": 13350
},
{
"epoch": 11.541774332472007,
"grad_norm": 0.579129159450531,
"learning_rate": 0.00015383634797588286,
"loss": 0.36021167755126954,
"step": 13400
},
{
"epoch": 11.541774332472007,
"eval_loss": 0.42251133918762207,
"eval_runtime": 17.6195,
"eval_samples_per_second": 1078.579,
"eval_steps_per_second": 33.713,
"step": 13400
},
{
"epoch": 11.584840654608097,
"grad_norm": 0.7097395062446594,
"learning_rate": 0.00015366408268733853,
"loss": 0.3502839660644531,
"step": 13450
},
{
"epoch": 11.584840654608097,
"eval_loss": 0.42374807596206665,
"eval_runtime": 17.2164,
"eval_samples_per_second": 1103.831,
"eval_steps_per_second": 34.502,
"step": 13450
},
{
"epoch": 11.627906976744185,
"grad_norm": 0.766302227973938,
"learning_rate": 0.00015349181739879414,
"loss": 0.3584626770019531,
"step": 13500
},
{
"epoch": 11.627906976744185,
"eval_loss": 0.4234767556190491,
"eval_runtime": 17.3125,
"eval_samples_per_second": 1097.703,
"eval_steps_per_second": 34.31,
"step": 13500
},
{
"epoch": 11.670973298880275,
"grad_norm": 0.7396245002746582,
"learning_rate": 0.00015331955211024979,
"loss": 0.3636350250244141,
"step": 13550
},
{
"epoch": 11.670973298880275,
"eval_loss": 0.42209184169769287,
"eval_runtime": 17.216,
"eval_samples_per_second": 1103.857,
"eval_steps_per_second": 34.503,
"step": 13550
},
{
"epoch": 11.714039621016365,
"grad_norm": 0.6891668438911438,
"learning_rate": 0.00015314728682170543,
"loss": 0.35415069580078123,
"step": 13600
},
{
"epoch": 11.714039621016365,
"eval_loss": 0.42032742500305176,
"eval_runtime": 16.795,
"eval_samples_per_second": 1131.529,
"eval_steps_per_second": 35.368,
"step": 13600
},
{
"epoch": 11.757105943152455,
"grad_norm": 0.6603532433509827,
"learning_rate": 0.00015297502153316107,
"loss": 0.3615140533447266,
"step": 13650
},
{
"epoch": 11.757105943152455,
"eval_loss": 0.4193870723247528,
"eval_runtime": 17.3607,
"eval_samples_per_second": 1094.655,
"eval_steps_per_second": 34.215,
"step": 13650
},
{
"epoch": 11.800172265288545,
"grad_norm": 0.6098962426185608,
"learning_rate": 0.0001528027562446167,
"loss": 0.36224365234375,
"step": 13700
},
{
"epoch": 11.800172265288545,
"eval_loss": 0.4110799729824066,
"eval_runtime": 16.6706,
"eval_samples_per_second": 1139.972,
"eval_steps_per_second": 35.632,
"step": 13700
},
{
"epoch": 11.843238587424635,
"grad_norm": 0.5804024338722229,
"learning_rate": 0.00015263049095607235,
"loss": 0.35254592895507814,
"step": 13750
},
{
"epoch": 11.843238587424635,
"eval_loss": 0.42059120535850525,
"eval_runtime": 17.1304,
"eval_samples_per_second": 1109.375,
"eval_steps_per_second": 34.675,
"step": 13750
},
{
"epoch": 11.886304909560723,
"grad_norm": 0.6594691872596741,
"learning_rate": 0.00015245822566752802,
"loss": 0.35712413787841796,
"step": 13800
},
{
"epoch": 11.886304909560723,
"eval_loss": 0.42418330907821655,
"eval_runtime": 17.2047,
"eval_samples_per_second": 1104.579,
"eval_steps_per_second": 34.525,
"step": 13800
},
{
"epoch": 11.929371231696813,
"grad_norm": 0.5514585971832275,
"learning_rate": 0.00015228596037898364,
"loss": 0.35963973999023435,
"step": 13850
},
{
"epoch": 11.929371231696813,
"eval_loss": 0.41717028617858887,
"eval_runtime": 17.6749,
"eval_samples_per_second": 1075.2,
"eval_steps_per_second": 33.607,
"step": 13850
},
{
"epoch": 11.972437553832902,
"grad_norm": 0.5450137257575989,
"learning_rate": 0.00015211369509043928,
"loss": 0.34988128662109375,
"step": 13900
},
{
"epoch": 11.972437553832902,
"eval_loss": 0.4247604310512543,
"eval_runtime": 17.676,
"eval_samples_per_second": 1075.127,
"eval_steps_per_second": 33.605,
"step": 13900
},
{
"epoch": 12.015503875968992,
"grad_norm": 0.5637671947479248,
"learning_rate": 0.00015194142980189492,
"loss": 0.351929931640625,
"step": 13950
},
{
"epoch": 12.015503875968992,
"eval_loss": 0.42415958642959595,
"eval_runtime": 17.3608,
"eval_samples_per_second": 1094.649,
"eval_steps_per_second": 34.215,
"step": 13950
},
{
"epoch": 12.058570198105082,
"grad_norm": 0.7126480937004089,
"learning_rate": 0.00015176916451335056,
"loss": 0.3480724334716797,
"step": 14000
},
{
"epoch": 12.058570198105082,
"eval_loss": 0.4246508479118347,
"eval_runtime": 17.3426,
"eval_samples_per_second": 1095.8,
"eval_steps_per_second": 34.251,
"step": 14000
},
{
"epoch": 12.101636520241172,
"grad_norm": 0.5521771907806396,
"learning_rate": 0.0001515968992248062,
"loss": 0.3527442169189453,
"step": 14050
},
{
"epoch": 12.101636520241172,
"eval_loss": 0.42050713300704956,
"eval_runtime": 16.8859,
"eval_samples_per_second": 1125.438,
"eval_steps_per_second": 35.177,
"step": 14050
},
{
"epoch": 12.144702842377262,
"grad_norm": 0.626928985118866,
"learning_rate": 0.00015142463393626185,
"loss": 0.3487448883056641,
"step": 14100
},
{
"epoch": 12.144702842377262,
"eval_loss": 0.4141220152378082,
"eval_runtime": 17.5821,
"eval_samples_per_second": 1080.873,
"eval_steps_per_second": 33.784,
"step": 14100
},
{
"epoch": 12.18776916451335,
"grad_norm": 0.6588882803916931,
"learning_rate": 0.0001512523686477175,
"loss": 0.3564021301269531,
"step": 14150
},
{
"epoch": 12.18776916451335,
"eval_loss": 0.421312153339386,
"eval_runtime": 16.615,
"eval_samples_per_second": 1143.784,
"eval_steps_per_second": 35.751,
"step": 14150
},
{
"epoch": 12.23083548664944,
"grad_norm": 0.5131123065948486,
"learning_rate": 0.00015108010335917313,
"loss": 0.349322509765625,
"step": 14200
},
{
"epoch": 12.23083548664944,
"eval_loss": 0.4248192310333252,
"eval_runtime": 17.8653,
"eval_samples_per_second": 1063.736,
"eval_steps_per_second": 33.249,
"step": 14200
},
{
"epoch": 12.27390180878553,
"grad_norm": 0.6044149994850159,
"learning_rate": 0.00015090783807062877,
"loss": 0.34683589935302733,
"step": 14250
},
{
"epoch": 12.27390180878553,
"eval_loss": 0.4210798144340515,
"eval_runtime": 17.3506,
"eval_samples_per_second": 1095.293,
"eval_steps_per_second": 34.235,
"step": 14250
},
{
"epoch": 12.31696813092162,
"grad_norm": 0.5438874363899231,
"learning_rate": 0.00015073557278208442,
"loss": 0.3560785675048828,
"step": 14300
},
{
"epoch": 12.31696813092162,
"eval_loss": 0.4185738265514374,
"eval_runtime": 17.1799,
"eval_samples_per_second": 1106.179,
"eval_steps_per_second": 34.575,
"step": 14300
},
{
"epoch": 12.36003445305771,
"grad_norm": 0.8494352102279663,
"learning_rate": 0.00015056330749354006,
"loss": 0.34643798828125,
"step": 14350
},
{
"epoch": 12.36003445305771,
"eval_loss": 0.41647806763648987,
"eval_runtime": 17.4394,
"eval_samples_per_second": 1089.717,
"eval_steps_per_second": 34.061,
"step": 14350
},
{
"epoch": 12.4031007751938,
"grad_norm": 0.5796271562576294,
"learning_rate": 0.0001503910422049957,
"loss": 0.34534194946289065,
"step": 14400
},
{
"epoch": 12.4031007751938,
"eval_loss": 0.41010603308677673,
"eval_runtime": 17.4261,
"eval_samples_per_second": 1090.549,
"eval_steps_per_second": 34.087,
"step": 14400
},
{
"epoch": 12.446167097329887,
"grad_norm": 0.5342833399772644,
"learning_rate": 0.00015021877691645134,
"loss": 0.34199066162109376,
"step": 14450
},
{
"epoch": 12.446167097329887,
"eval_loss": 0.4103580415248871,
"eval_runtime": 17.3016,
"eval_samples_per_second": 1098.398,
"eval_steps_per_second": 34.332,
"step": 14450
},
{
"epoch": 12.489233419465977,
"grad_norm": 0.5282058715820312,
"learning_rate": 0.00015004651162790698,
"loss": 0.34900962829589843,
"step": 14500
},
{
"epoch": 12.489233419465977,
"eval_loss": 0.4202011227607727,
"eval_runtime": 16.822,
"eval_samples_per_second": 1129.713,
"eval_steps_per_second": 35.311,
"step": 14500
},
{
"epoch": 12.532299741602067,
"grad_norm": 0.5700145959854126,
"learning_rate": 0.00014987424633936263,
"loss": 0.34178398132324217,
"step": 14550
},
{
"epoch": 12.532299741602067,
"eval_loss": 0.42386436462402344,
"eval_runtime": 19.712,
"eval_samples_per_second": 964.082,
"eval_steps_per_second": 30.134,
"step": 14550
},
{
"epoch": 12.575366063738157,
"grad_norm": 0.5954127311706543,
"learning_rate": 0.00014970198105081827,
"loss": 0.3540873718261719,
"step": 14600
},
{
"epoch": 12.575366063738157,
"eval_loss": 0.41673940420150757,
"eval_runtime": 18.1461,
"eval_samples_per_second": 1047.278,
"eval_steps_per_second": 32.734,
"step": 14600
},
{
"epoch": 12.618432385874247,
"grad_norm": 0.5680475831031799,
"learning_rate": 0.0001495297157622739,
"loss": 0.3487066650390625,
"step": 14650
},
{
"epoch": 12.618432385874247,
"eval_loss": 0.41556957364082336,
"eval_runtime": 17.4336,
"eval_samples_per_second": 1090.081,
"eval_steps_per_second": 34.072,
"step": 14650
},
{
"epoch": 12.661498708010337,
"grad_norm": 0.5987432599067688,
"learning_rate": 0.00014935745047372955,
"loss": 0.35428131103515625,
"step": 14700
},
{
"epoch": 12.661498708010337,
"eval_loss": 0.41658732295036316,
"eval_runtime": 19.2492,
"eval_samples_per_second": 987.26,
"eval_steps_per_second": 30.858,
"step": 14700
},
{
"epoch": 12.704565030146426,
"grad_norm": 0.5099753737449646,
"learning_rate": 0.0001491851851851852,
"loss": 0.3417974853515625,
"step": 14750
},
{
"epoch": 12.704565030146426,
"eval_loss": 0.4173789918422699,
"eval_runtime": 17.2326,
"eval_samples_per_second": 1102.796,
"eval_steps_per_second": 34.47,
"step": 14750
},
{
"epoch": 12.747631352282514,
"grad_norm": 0.5181341767311096,
"learning_rate": 0.00014901291989664084,
"loss": 0.3451295471191406,
"step": 14800
},
{
"epoch": 12.747631352282514,
"eval_loss": 0.41771939396858215,
"eval_runtime": 17.5448,
"eval_samples_per_second": 1083.171,
"eval_steps_per_second": 33.856,
"step": 14800
},
{
"epoch": 12.790697674418604,
"grad_norm": 0.7171940207481384,
"learning_rate": 0.00014884065460809648,
"loss": 0.3461411285400391,
"step": 14850
},
{
"epoch": 12.790697674418604,
"eval_loss": 0.41813889145851135,
"eval_runtime": 17.4502,
"eval_samples_per_second": 1089.039,
"eval_steps_per_second": 34.04,
"step": 14850
},
{
"epoch": 12.833763996554694,
"grad_norm": 0.5442430377006531,
"learning_rate": 0.00014866838931955212,
"loss": 0.35834976196289064,
"step": 14900
},
{
"epoch": 12.833763996554694,
"eval_loss": 0.4198405146598816,
"eval_runtime": 17.152,
"eval_samples_per_second": 1107.977,
"eval_steps_per_second": 34.632,
"step": 14900
},
{
"epoch": 12.876830318690784,
"grad_norm": 0.7742429375648499,
"learning_rate": 0.00014849612403100776,
"loss": 0.35129867553710936,
"step": 14950
},
{
"epoch": 12.876830318690784,
"eval_loss": 0.4203258156776428,
"eval_runtime": 16.187,
"eval_samples_per_second": 1174.029,
"eval_steps_per_second": 36.696,
"step": 14950
},
{
"epoch": 12.919896640826874,
"grad_norm": 0.49093976616859436,
"learning_rate": 0.0001483238587424634,
"loss": 0.35147186279296877,
"step": 15000
},
{
"epoch": 12.919896640826874,
"eval_loss": 0.41292715072631836,
"eval_runtime": 18.151,
"eval_samples_per_second": 1046.997,
"eval_steps_per_second": 32.726,
"step": 15000
},
{
"epoch": 12.962962962962964,
"grad_norm": 0.5578615069389343,
"learning_rate": 0.00014815159345391905,
"loss": 0.34760341644287107,
"step": 15050
},
{
"epoch": 12.962962962962964,
"eval_loss": 0.4126093089580536,
"eval_runtime": 16.4589,
"eval_samples_per_second": 1154.636,
"eval_steps_per_second": 36.09,
"step": 15050
},
{
"epoch": 13.006029285099052,
"grad_norm": 0.6248014569282532,
"learning_rate": 0.00014797932816537466,
"loss": 0.34215831756591797,
"step": 15100
},
{
"epoch": 13.006029285099052,
"eval_loss": 0.4089677631855011,
"eval_runtime": 17.221,
"eval_samples_per_second": 1103.538,
"eval_steps_per_second": 34.493,
"step": 15100
},
{
"epoch": 13.049095607235142,
"grad_norm": 0.4811834990978241,
"learning_rate": 0.00014780706287683033,
"loss": 0.3458440399169922,
"step": 15150
},
{
"epoch": 13.049095607235142,
"eval_loss": 0.4163960814476013,
"eval_runtime": 17.1986,
"eval_samples_per_second": 1104.976,
"eval_steps_per_second": 34.538,
"step": 15150
},
{
"epoch": 13.092161929371231,
"grad_norm": 0.5372201204299927,
"learning_rate": 0.00014763479758828597,
"loss": 0.343751220703125,
"step": 15200
},
{
"epoch": 13.092161929371231,
"eval_loss": 0.41129928827285767,
"eval_runtime": 17.3258,
"eval_samples_per_second": 1096.86,
"eval_steps_per_second": 34.284,
"step": 15200
},
{
"epoch": 13.135228251507321,
"grad_norm": 0.48968204855918884,
"learning_rate": 0.00014746253229974162,
"loss": 0.35028865814208987,
"step": 15250
},
{
"epoch": 13.135228251507321,
"eval_loss": 0.41533833742141724,
"eval_runtime": 17.18,
"eval_samples_per_second": 1106.172,
"eval_steps_per_second": 34.575,
"step": 15250
},
{
"epoch": 13.178294573643411,
"grad_norm": 0.6101740002632141,
"learning_rate": 0.00014729026701119726,
"loss": 0.3286838912963867,
"step": 15300
},
{
"epoch": 13.178294573643411,
"eval_loss": 0.41756105422973633,
"eval_runtime": 17.1329,
"eval_samples_per_second": 1109.213,
"eval_steps_per_second": 34.67,
"step": 15300
},
{
"epoch": 13.221360895779501,
"grad_norm": 0.5691545009613037,
"learning_rate": 0.0001471180017226529,
"loss": 0.3424900817871094,
"step": 15350
},
{
"epoch": 13.221360895779501,
"eval_loss": 0.41093096137046814,
"eval_runtime": 17.2687,
"eval_samples_per_second": 1100.491,
"eval_steps_per_second": 34.398,
"step": 15350
},
{
"epoch": 13.264427217915589,
"grad_norm": 0.5297402143478394,
"learning_rate": 0.00014694573643410854,
"loss": 0.34250846862792966,
"step": 15400
},
{
"epoch": 13.264427217915589,
"eval_loss": 0.40691322088241577,
"eval_runtime": 16.9163,
"eval_samples_per_second": 1123.415,
"eval_steps_per_second": 35.114,
"step": 15400
},
{
"epoch": 13.307493540051679,
"grad_norm": 0.555321455001831,
"learning_rate": 0.00014677347114556416,
"loss": 0.3355692291259766,
"step": 15450
},
{
"epoch": 13.307493540051679,
"eval_loss": 0.4075530171394348,
"eval_runtime": 17.4202,
"eval_samples_per_second": 1090.92,
"eval_steps_per_second": 34.098,
"step": 15450
},
{
"epoch": 13.350559862187769,
"grad_norm": 0.5421215295791626,
"learning_rate": 0.00014660120585701983,
"loss": 0.3451258850097656,
"step": 15500
},
{
"epoch": 13.350559862187769,
"eval_loss": 0.40354102849960327,
"eval_runtime": 16.8807,
"eval_samples_per_second": 1125.781,
"eval_steps_per_second": 35.188,
"step": 15500
},
{
"epoch": 13.393626184323859,
"grad_norm": 0.5563757419586182,
"learning_rate": 0.00014642894056847547,
"loss": 0.3422290802001953,
"step": 15550
},
{
"epoch": 13.393626184323859,
"eval_loss": 0.40993013978004456,
"eval_runtime": 17.5299,
"eval_samples_per_second": 1084.091,
"eval_steps_per_second": 33.885,
"step": 15550
},
{
"epoch": 13.436692506459949,
"grad_norm": 0.4965982437133789,
"learning_rate": 0.0001462566752799311,
"loss": 0.34627342224121094,
"step": 15600
},
{
"epoch": 13.436692506459949,
"eval_loss": 0.40034791827201843,
"eval_runtime": 17.5168,
"eval_samples_per_second": 1084.903,
"eval_steps_per_second": 33.91,
"step": 15600
},
{
"epoch": 13.479758828596038,
"grad_norm": 0.5663852691650391,
"learning_rate": 0.00014608440999138675,
"loss": 0.3398374938964844,
"step": 15650
},
{
"epoch": 13.479758828596038,
"eval_loss": 0.41277310252189636,
"eval_runtime": 17.3753,
"eval_samples_per_second": 1093.739,
"eval_steps_per_second": 34.187,
"step": 15650
},
{
"epoch": 13.522825150732128,
"grad_norm": 0.5163738131523132,
"learning_rate": 0.00014591214470284237,
"loss": 0.34459991455078126,
"step": 15700
},
{
"epoch": 13.522825150732128,
"eval_loss": 0.40066856145858765,
"eval_runtime": 17.3668,
"eval_samples_per_second": 1094.271,
"eval_steps_per_second": 34.203,
"step": 15700
},
{
"epoch": 13.565891472868216,
"grad_norm": 0.6366977691650391,
"learning_rate": 0.00014573987941429804,
"loss": 0.34204261779785156,
"step": 15750
},
{
"epoch": 13.565891472868216,
"eval_loss": 0.4138205647468567,
"eval_runtime": 17.4203,
"eval_samples_per_second": 1090.911,
"eval_steps_per_second": 34.098,
"step": 15750
},
{
"epoch": 13.608957795004306,
"grad_norm": 0.6208717226982117,
"learning_rate": 0.00014556761412575365,
"loss": 0.34886016845703127,
"step": 15800
},
{
"epoch": 13.608957795004306,
"eval_loss": 0.4113547205924988,
"eval_runtime": 17.3668,
"eval_samples_per_second": 1094.27,
"eval_steps_per_second": 34.203,
"step": 15800
},
{
"epoch": 13.652024117140396,
"grad_norm": 0.5838480591773987,
"learning_rate": 0.00014539534883720932,
"loss": 0.340546875,
"step": 15850
},
{
"epoch": 13.652024117140396,
"eval_loss": 0.40785887837409973,
"eval_runtime": 16.4475,
"eval_samples_per_second": 1155.434,
"eval_steps_per_second": 36.115,
"step": 15850
},
{
"epoch": 13.695090439276486,
"grad_norm": 0.6009616851806641,
"learning_rate": 0.00014522308354866494,
"loss": 0.3441506958007812,
"step": 15900
},
{
"epoch": 13.695090439276486,
"eval_loss": 0.40609824657440186,
"eval_runtime": 17.4361,
"eval_samples_per_second": 1089.926,
"eval_steps_per_second": 34.067,
"step": 15900
},
{
"epoch": 13.738156761412576,
"grad_norm": 0.7058950066566467,
"learning_rate": 0.00014505081826012058,
"loss": 0.34030899047851565,
"step": 15950
},
{
"epoch": 13.738156761412576,
"eval_loss": 0.4107040464878082,
"eval_runtime": 17.0979,
"eval_samples_per_second": 1111.482,
"eval_steps_per_second": 34.741,
"step": 15950
},
{
"epoch": 13.781223083548666,
"grad_norm": 0.49852386116981506,
"learning_rate": 0.00014487855297157625,
"loss": 0.3450359344482422,
"step": 16000
},
{
"epoch": 13.781223083548666,
"eval_loss": 0.39904460310935974,
"eval_runtime": 17.2069,
"eval_samples_per_second": 1104.44,
"eval_steps_per_second": 34.521,
"step": 16000
},
{
"epoch": 13.824289405684755,
"grad_norm": 0.5838021636009216,
"learning_rate": 0.00014470628768303186,
"loss": 0.33926872253417967,
"step": 16050
},
{
"epoch": 13.824289405684755,
"eval_loss": 0.40694746375083923,
"eval_runtime": 17.3414,
"eval_samples_per_second": 1095.873,
"eval_steps_per_second": 34.253,
"step": 16050
},
{
"epoch": 13.867355727820843,
"grad_norm": 0.8022839426994324,
"learning_rate": 0.00014453402239448753,
"loss": 0.33638832092285154,
"step": 16100
},
{
"epoch": 13.867355727820843,
"eval_loss": 0.41297706961631775,
"eval_runtime": 17.5437,
"eval_samples_per_second": 1083.237,
"eval_steps_per_second": 33.858,
"step": 16100
},
{
"epoch": 13.910422049956933,
"grad_norm": 0.6754311919212341,
"learning_rate": 0.00014436175710594315,
"loss": 0.33161933898925783,
"step": 16150
},
{
"epoch": 13.910422049956933,
"eval_loss": 0.40214139223098755,
"eval_runtime": 17.4008,
"eval_samples_per_second": 1092.133,
"eval_steps_per_second": 34.136,
"step": 16150
},
{
"epoch": 13.953488372093023,
"grad_norm": 0.6820469498634338,
"learning_rate": 0.00014418949181739882,
"loss": 0.34790390014648437,
"step": 16200
},
{
"epoch": 13.953488372093023,
"eval_loss": 0.4014909863471985,
"eval_runtime": 17.2923,
"eval_samples_per_second": 1098.984,
"eval_steps_per_second": 34.35,
"step": 16200
},
{
"epoch": 13.996554694229113,
"grad_norm": 0.5553178787231445,
"learning_rate": 0.00014401722652885443,
"loss": 0.33886314392089845,
"step": 16250
},
{
"epoch": 13.996554694229113,
"eval_loss": 0.40084779262542725,
"eval_runtime": 17.3503,
"eval_samples_per_second": 1095.31,
"eval_steps_per_second": 34.236,
"step": 16250
},
{
"epoch": 14.039621016365203,
"grad_norm": 0.5039793252944946,
"learning_rate": 0.00014384496124031007,
"loss": 0.33563224792480467,
"step": 16300
},
{
"epoch": 14.039621016365203,
"eval_loss": 0.41018620133399963,
"eval_runtime": 16.7245,
"eval_samples_per_second": 1136.298,
"eval_steps_per_second": 35.517,
"step": 16300
},
{
"epoch": 14.082687338501293,
"grad_norm": 0.49403050541877747,
"learning_rate": 0.00014367269595176574,
"loss": 0.34347259521484375,
"step": 16350
},
{
"epoch": 14.082687338501293,
"eval_loss": 0.40463119745254517,
"eval_runtime": 17.326,
"eval_samples_per_second": 1096.846,
"eval_steps_per_second": 34.284,
"step": 16350
},
{
"epoch": 14.12575366063738,
"grad_norm": 0.6298602223396301,
"learning_rate": 0.00014350043066322136,
"loss": 0.33865066528320314,
"step": 16400
},
{
"epoch": 14.12575366063738,
"eval_loss": 0.40046370029449463,
"eval_runtime": 17.0738,
"eval_samples_per_second": 1113.053,
"eval_steps_per_second": 34.79,
"step": 16400
},
{
"epoch": 14.16881998277347,
"grad_norm": 0.611960232257843,
"learning_rate": 0.00014332816537467703,
"loss": 0.3326186370849609,
"step": 16450
},
{
"epoch": 14.16881998277347,
"eval_loss": 0.4022998511791229,
"eval_runtime": 17.3832,
"eval_samples_per_second": 1093.241,
"eval_steps_per_second": 34.171,
"step": 16450
},
{
"epoch": 14.21188630490956,
"grad_norm": 0.6012845039367676,
"learning_rate": 0.00014315590008613264,
"loss": 0.3416895294189453,
"step": 16500
},
{
"epoch": 14.21188630490956,
"eval_loss": 0.40356630086898804,
"eval_runtime": 17.3881,
"eval_samples_per_second": 1092.933,
"eval_steps_per_second": 34.161,
"step": 16500
},
{
"epoch": 14.25495262704565,
"grad_norm": 0.5431012511253357,
"learning_rate": 0.00014298363479758828,
"loss": 0.34037681579589846,
"step": 16550
},
{
"epoch": 14.25495262704565,
"eval_loss": 0.3986068665981293,
"eval_runtime": 17.3416,
"eval_samples_per_second": 1095.86,
"eval_steps_per_second": 34.253,
"step": 16550
},
{
"epoch": 14.29801894918174,
"grad_norm": 0.6223941445350647,
"learning_rate": 0.00014281136950904393,
"loss": 0.3397150802612305,
"step": 16600
},
{
"epoch": 14.29801894918174,
"eval_loss": 0.40138301253318787,
"eval_runtime": 17.4634,
"eval_samples_per_second": 1088.219,
"eval_steps_per_second": 34.014,
"step": 16600
},
{
"epoch": 14.34108527131783,
"grad_norm": 0.5051292181015015,
"learning_rate": 0.00014263910422049957,
"loss": 0.33477035522460935,
"step": 16650
},
{
"epoch": 14.34108527131783,
"eval_loss": 0.39406636357307434,
"eval_runtime": 17.4451,
"eval_samples_per_second": 1089.36,
"eval_steps_per_second": 34.05,
"step": 16650
},
{
"epoch": 14.384151593453918,
"grad_norm": 0.5401943922042847,
"learning_rate": 0.00014246683893195524,
"loss": 0.33589164733886717,
"step": 16700
},
{
"epoch": 14.384151593453918,
"eval_loss": 0.4036507308483124,
"eval_runtime": 17.2724,
"eval_samples_per_second": 1100.252,
"eval_steps_per_second": 34.39,
"step": 16700
},
{
"epoch": 14.427217915590008,
"grad_norm": 0.48140889406204224,
"learning_rate": 0.00014229457364341085,
"loss": 0.34063690185546874,
"step": 16750
},
{
"epoch": 14.427217915590008,
"eval_loss": 0.3962832987308502,
"eval_runtime": 17.325,
"eval_samples_per_second": 1096.914,
"eval_steps_per_second": 34.286,
"step": 16750
},
{
"epoch": 14.470284237726098,
"grad_norm": 0.48055461049079895,
"learning_rate": 0.0001421223083548665,
"loss": 0.33268959045410157,
"step": 16800
},
{
"epoch": 14.470284237726098,
"eval_loss": 0.40976482629776,
"eval_runtime": 17.4337,
"eval_samples_per_second": 1090.07,
"eval_steps_per_second": 34.072,
"step": 16800
},
{
"epoch": 14.513350559862188,
"grad_norm": 0.6199280023574829,
"learning_rate": 0.00014195004306632214,
"loss": 0.336192626953125,
"step": 16850
},
{
"epoch": 14.513350559862188,
"eval_loss": 0.4026069641113281,
"eval_runtime": 17.0752,
"eval_samples_per_second": 1112.96,
"eval_steps_per_second": 34.787,
"step": 16850
},
{
"epoch": 14.556416881998278,
"grad_norm": 0.5128330588340759,
"learning_rate": 0.00014177777777777778,
"loss": 0.3429254913330078,
"step": 16900
},
{
"epoch": 14.556416881998278,
"eval_loss": 0.40689241886138916,
"eval_runtime": 16.679,
"eval_samples_per_second": 1139.395,
"eval_steps_per_second": 35.614,
"step": 16900
},
{
"epoch": 14.599483204134367,
"grad_norm": 0.5781823992729187,
"learning_rate": 0.00014160551248923342,
"loss": 0.3337020111083984,
"step": 16950
},
{
"epoch": 14.599483204134367,
"eval_loss": 0.4006907641887665,
"eval_runtime": 17.6406,
"eval_samples_per_second": 1077.287,
"eval_steps_per_second": 33.672,
"step": 16950
},
{
"epoch": 14.642549526270457,
"grad_norm": 0.6032127737998962,
"learning_rate": 0.00014143324720068906,
"loss": 0.33814579010009765,
"step": 17000
},
{
"epoch": 14.642549526270457,
"eval_loss": 0.39685383439064026,
"eval_runtime": 17.4922,
"eval_samples_per_second": 1086.429,
"eval_steps_per_second": 33.958,
"step": 17000
},
{
"epoch": 14.685615848406545,
"grad_norm": 0.5460181832313538,
"learning_rate": 0.00014126098191214473,
"loss": 0.33991954803466795,
"step": 17050
},
{
"epoch": 14.685615848406545,
"eval_loss": 0.3965746760368347,
"eval_runtime": 17.49,
"eval_samples_per_second": 1086.562,
"eval_steps_per_second": 33.962,
"step": 17050
},
{
"epoch": 14.728682170542635,
"grad_norm": 0.5615427494049072,
"learning_rate": 0.00014108871662360035,
"loss": 0.3355008316040039,
"step": 17100
},
{
"epoch": 14.728682170542635,
"eval_loss": 0.4105238914489746,
"eval_runtime": 17.4493,
"eval_samples_per_second": 1089.1,
"eval_steps_per_second": 34.042,
"step": 17100
},
{
"epoch": 14.771748492678725,
"grad_norm": 0.5080145597457886,
"learning_rate": 0.000140916451335056,
"loss": 0.3328333282470703,
"step": 17150
},
{
"epoch": 14.771748492678725,
"eval_loss": 0.4033212959766388,
"eval_runtime": 17.5468,
"eval_samples_per_second": 1083.044,
"eval_steps_per_second": 33.852,
"step": 17150
},
{
"epoch": 14.814814814814815,
"grad_norm": 0.6350198984146118,
"learning_rate": 0.00014074418604651163,
"loss": 0.333209228515625,
"step": 17200
},
{
"epoch": 14.814814814814815,
"eval_loss": 0.40370309352874756,
"eval_runtime": 16.7941,
"eval_samples_per_second": 1131.585,
"eval_steps_per_second": 35.369,
"step": 17200
},
{
"epoch": 14.857881136950905,
"grad_norm": 0.49065691232681274,
"learning_rate": 0.00014057192075796727,
"loss": 0.3375306701660156,
"step": 17250
},
{
"epoch": 14.857881136950905,
"eval_loss": 0.40457409620285034,
"eval_runtime": 17.1664,
"eval_samples_per_second": 1107.045,
"eval_steps_per_second": 34.602,
"step": 17250
},
{
"epoch": 14.900947459086995,
"grad_norm": 0.4902515709400177,
"learning_rate": 0.00014039965546942292,
"loss": 0.34255237579345704,
"step": 17300
},
{
"epoch": 14.900947459086995,
"eval_loss": 0.4081343412399292,
"eval_runtime": 17.5525,
"eval_samples_per_second": 1082.697,
"eval_steps_per_second": 33.841,
"step": 17300
},
{
"epoch": 14.944013781223084,
"grad_norm": 0.5764937400817871,
"learning_rate": 0.00014022739018087856,
"loss": 0.34118736267089844,
"step": 17350
},
{
"epoch": 14.944013781223084,
"eval_loss": 0.40441158413887024,
"eval_runtime": 17.0335,
"eval_samples_per_second": 1115.685,
"eval_steps_per_second": 34.872,
"step": 17350
},
{
"epoch": 14.987080103359173,
"grad_norm": 0.6214373707771301,
"learning_rate": 0.0001400551248923342,
"loss": 0.33193634033203123,
"step": 17400
},
{
"epoch": 14.987080103359173,
"eval_loss": 0.4039756655693054,
"eval_runtime": 17.4751,
"eval_samples_per_second": 1087.49,
"eval_steps_per_second": 33.991,
"step": 17400
},
{
"epoch": 15.030146425495262,
"grad_norm": 0.5778855681419373,
"learning_rate": 0.00013988285960378984,
"loss": 0.3388556671142578,
"step": 17450
},
{
"epoch": 15.030146425495262,
"eval_loss": 0.39234092831611633,
"eval_runtime": 17.4139,
"eval_samples_per_second": 1091.31,
"eval_steps_per_second": 34.111,
"step": 17450
},
{
"epoch": 15.073212747631352,
"grad_norm": 0.5316303968429565,
"learning_rate": 0.00013971059431524548,
"loss": 0.33368736267089844,
"step": 17500
},
{
"epoch": 15.073212747631352,
"eval_loss": 0.4007318615913391,
"eval_runtime": 17.4159,
"eval_samples_per_second": 1091.187,
"eval_steps_per_second": 34.107,
"step": 17500
},
{
"epoch": 15.116279069767442,
"grad_norm": 0.5068018436431885,
"learning_rate": 0.00013953832902670113,
"loss": 0.32500762939453126,
"step": 17550
},
{
"epoch": 15.116279069767442,
"eval_loss": 0.39454713463783264,
"eval_runtime": 17.3134,
"eval_samples_per_second": 1097.646,
"eval_steps_per_second": 34.309,
"step": 17550
},
{
"epoch": 15.159345391903532,
"grad_norm": 0.6217418909072876,
"learning_rate": 0.00013936606373815677,
"loss": 0.33881416320800783,
"step": 17600
},
{
"epoch": 15.159345391903532,
"eval_loss": 0.3999788463115692,
"eval_runtime": 17.5144,
"eval_samples_per_second": 1085.049,
"eval_steps_per_second": 33.915,
"step": 17600
},
{
"epoch": 15.202411714039622,
"grad_norm": 0.6280364394187927,
"learning_rate": 0.0001391937984496124,
"loss": 0.33057609558105466,
"step": 17650
},
{
"epoch": 15.202411714039622,
"eval_loss": 0.40249961614608765,
"eval_runtime": 17.3541,
"eval_samples_per_second": 1095.072,
"eval_steps_per_second": 34.228,
"step": 17650
},
{
"epoch": 15.24547803617571,
"grad_norm": 0.5168995261192322,
"learning_rate": 0.00013902153316106805,
"loss": 0.3267534255981445,
"step": 17700
},
{
"epoch": 15.24547803617571,
"eval_loss": 0.4028068780899048,
"eval_runtime": 16.6035,
"eval_samples_per_second": 1144.58,
"eval_steps_per_second": 35.776,
"step": 17700
},
{
"epoch": 15.2885443583118,
"grad_norm": 0.6022219061851501,
"learning_rate": 0.0001388492678725237,
"loss": 0.33923439025878904,
"step": 17750
},
{
"epoch": 15.2885443583118,
"eval_loss": 0.3993185758590698,
"eval_runtime": 17.489,
"eval_samples_per_second": 1086.625,
"eval_steps_per_second": 33.964,
"step": 17750
},
{
"epoch": 15.33161068044789,
"grad_norm": 0.5371029376983643,
"learning_rate": 0.00013867700258397934,
"loss": 0.328874397277832,
"step": 17800
},
{
"epoch": 15.33161068044789,
"eval_loss": 0.3954143822193146,
"eval_runtime": 17.0921,
"eval_samples_per_second": 1111.859,
"eval_steps_per_second": 34.753,
"step": 17800
},
{
"epoch": 15.37467700258398,
"grad_norm": 0.6205661296844482,
"learning_rate": 0.00013850473729543498,
"loss": 0.33194568634033206,
"step": 17850
},
{
"epoch": 15.37467700258398,
"eval_loss": 0.39704281091690063,
"eval_runtime": 17.3119,
"eval_samples_per_second": 1097.74,
"eval_steps_per_second": 34.312,
"step": 17850
},
{
"epoch": 15.41774332472007,
"grad_norm": 0.571426272392273,
"learning_rate": 0.00013833247200689062,
"loss": 0.3300050735473633,
"step": 17900
},
{
"epoch": 15.41774332472007,
"eval_loss": 0.39621701836586,
"eval_runtime": 17.2981,
"eval_samples_per_second": 1098.619,
"eval_steps_per_second": 34.339,
"step": 17900
},
{
"epoch": 15.460809646856159,
"grad_norm": 0.7013274431228638,
"learning_rate": 0.00013816020671834626,
"loss": 0.3340178680419922,
"step": 17950
},
{
"epoch": 15.460809646856159,
"eval_loss": 0.39800071716308594,
"eval_runtime": 17.282,
"eval_samples_per_second": 1099.64,
"eval_steps_per_second": 34.371,
"step": 17950
},
{
"epoch": 15.503875968992247,
"grad_norm": 0.6281255483627319,
"learning_rate": 0.0001379879414298019,
"loss": 0.3254861068725586,
"step": 18000
},
{
"epoch": 15.503875968992247,
"eval_loss": 0.3988846242427826,
"eval_runtime": 17.3939,
"eval_samples_per_second": 1092.567,
"eval_steps_per_second": 34.15,
"step": 18000
},
{
"epoch": 15.546942291128337,
"grad_norm": 0.6308976411819458,
"learning_rate": 0.00013781567614125755,
"loss": 0.3320703887939453,
"step": 18050
},
{
"epoch": 15.546942291128337,
"eval_loss": 0.4004760682582855,
"eval_runtime": 17.4506,
"eval_samples_per_second": 1089.017,
"eval_steps_per_second": 34.039,
"step": 18050
},
{
"epoch": 15.590008613264427,
"grad_norm": 0.6034271121025085,
"learning_rate": 0.0001376434108527132,
"loss": 0.33000999450683594,
"step": 18100
},
{
"epoch": 15.590008613264427,
"eval_loss": 0.3943745791912079,
"eval_runtime": 17.2953,
"eval_samples_per_second": 1098.795,
"eval_steps_per_second": 34.345,
"step": 18100
},
{
"epoch": 15.633074935400517,
"grad_norm": 0.6983925104141235,
"learning_rate": 0.00013747114556416883,
"loss": 0.3292758178710937,
"step": 18150
},
{
"epoch": 15.633074935400517,
"eval_loss": 0.3915829658508301,
"eval_runtime": 16.4769,
"eval_samples_per_second": 1153.371,
"eval_steps_per_second": 36.05,
"step": 18150
},
{
"epoch": 15.676141257536607,
"grad_norm": 0.6047272682189941,
"learning_rate": 0.00013729888027562447,
"loss": 0.3290716171264648,
"step": 18200
},
{
"epoch": 15.676141257536607,
"eval_loss": 0.3990899324417114,
"eval_runtime": 18.1527,
"eval_samples_per_second": 1046.899,
"eval_steps_per_second": 32.722,
"step": 18200
},
{
"epoch": 15.719207579672696,
"grad_norm": 0.7523520588874817,
"learning_rate": 0.00013712661498708012,
"loss": 0.32717063903808596,
"step": 18250
},
{
"epoch": 15.719207579672696,
"eval_loss": 0.3920466899871826,
"eval_runtime": 17.4929,
"eval_samples_per_second": 1086.385,
"eval_steps_per_second": 33.957,
"step": 18250
},
{
"epoch": 15.762273901808786,
"grad_norm": 0.5970990061759949,
"learning_rate": 0.00013695434969853576,
"loss": 0.3242684936523437,
"step": 18300
},
{
"epoch": 15.762273901808786,
"eval_loss": 0.4024035930633545,
"eval_runtime": 16.4655,
"eval_samples_per_second": 1154.169,
"eval_steps_per_second": 36.075,
"step": 18300
},
{
"epoch": 15.805340223944874,
"grad_norm": 0.480525404214859,
"learning_rate": 0.00013678208440999137,
"loss": 0.3314341354370117,
"step": 18350
},
{
"epoch": 15.805340223944874,
"eval_loss": 0.3887929320335388,
"eval_runtime": 17.5894,
"eval_samples_per_second": 1080.422,
"eval_steps_per_second": 33.77,
"step": 18350
},
{
"epoch": 15.848406546080964,
"grad_norm": 0.5142746567726135,
"learning_rate": 0.00013660981912144704,
"loss": 0.32969032287597655,
"step": 18400
},
{
"epoch": 15.848406546080964,
"eval_loss": 0.3979225158691406,
"eval_runtime": 17.5068,
"eval_samples_per_second": 1085.522,
"eval_steps_per_second": 33.93,
"step": 18400
},
{
"epoch": 15.891472868217054,
"grad_norm": 0.5760953426361084,
"learning_rate": 0.00013643755383290268,
"loss": 0.3342143630981445,
"step": 18450
},
{
"epoch": 15.891472868217054,
"eval_loss": 0.3954464793205261,
"eval_runtime": 17.5214,
"eval_samples_per_second": 1084.614,
"eval_steps_per_second": 33.901,
"step": 18450
},
{
"epoch": 15.934539190353144,
"grad_norm": 0.5239692330360413,
"learning_rate": 0.00013626528854435833,
"loss": 0.33184822082519533,
"step": 18500
},
{
"epoch": 15.934539190353144,
"eval_loss": 0.39176249504089355,
"eval_runtime": 17.4381,
"eval_samples_per_second": 1089.798,
"eval_steps_per_second": 34.063,
"step": 18500
},
{
"epoch": 15.977605512489234,
"grad_norm": 0.5742284655570984,
"learning_rate": 0.00013609302325581397,
"loss": 0.33209632873535155,
"step": 18550
},
{
"epoch": 15.977605512489234,
"eval_loss": 0.4047396779060364,
"eval_runtime": 17.1881,
"eval_samples_per_second": 1105.65,
"eval_steps_per_second": 34.559,
"step": 18550
},
{
"epoch": 16.020671834625322,
"grad_norm": 0.5419143438339233,
"learning_rate": 0.00013592075796726958,
"loss": 0.3236275863647461,
"step": 18600
},
{
"epoch": 16.020671834625322,
"eval_loss": 0.3944310247898102,
"eval_runtime": 16.3521,
"eval_samples_per_second": 1162.172,
"eval_steps_per_second": 36.326,
"step": 18600
},
{
"epoch": 16.063738156761413,
"grad_norm": 0.4870966374874115,
"learning_rate": 0.00013574849267872525,
"loss": 0.3301403045654297,
"step": 18650
},
{
"epoch": 16.063738156761413,
"eval_loss": 0.39082440733909607,
"eval_runtime": 17.0017,
"eval_samples_per_second": 1117.773,
"eval_steps_per_second": 34.938,
"step": 18650
},
{
"epoch": 16.1068044788975,
"grad_norm": 0.51682448387146,
"learning_rate": 0.00013557622739018087,
"loss": 0.32326805114746093,
"step": 18700
},
{
"epoch": 16.1068044788975,
"eval_loss": 0.3971370756626129,
"eval_runtime": 17.09,
"eval_samples_per_second": 1111.997,
"eval_steps_per_second": 34.757,
"step": 18700
},
{
"epoch": 16.149870801033593,
"grad_norm": 0.5596668720245361,
"learning_rate": 0.00013540396210163654,
"loss": 0.3247249603271484,
"step": 18750
},
{
"epoch": 16.149870801033593,
"eval_loss": 0.39040490984916687,
"eval_runtime": 16.4002,
"eval_samples_per_second": 1158.763,
"eval_steps_per_second": 36.219,
"step": 18750
},
{
"epoch": 16.19293712316968,
"grad_norm": 0.5393404960632324,
"learning_rate": 0.00013523169681309218,
"loss": 0.3202825164794922,
"step": 18800
},
{
"epoch": 16.19293712316968,
"eval_loss": 0.38830479979515076,
"eval_runtime": 17.5879,
"eval_samples_per_second": 1080.518,
"eval_steps_per_second": 33.773,
"step": 18800
},
{
"epoch": 16.23600344530577,
"grad_norm": 0.5143821239471436,
"learning_rate": 0.00013505943152454782,
"loss": 0.3251231384277344,
"step": 18850
},
{
"epoch": 16.23600344530577,
"eval_loss": 0.3889082670211792,
"eval_runtime": 17.6982,
"eval_samples_per_second": 1073.781,
"eval_steps_per_second": 33.563,
"step": 18850
},
{
"epoch": 16.27906976744186,
"grad_norm": 0.49142956733703613,
"learning_rate": 0.00013488716623600346,
"loss": 0.3229839324951172,
"step": 18900
},
{
"epoch": 16.27906976744186,
"eval_loss": 0.3905869722366333,
"eval_runtime": 17.3362,
"eval_samples_per_second": 1096.206,
"eval_steps_per_second": 34.264,
"step": 18900
},
{
"epoch": 16.32213608957795,
"grad_norm": 0.5397304892539978,
"learning_rate": 0.00013471490094745908,
"loss": 0.33159187316894534,
"step": 18950
},
{
"epoch": 16.32213608957795,
"eval_loss": 0.39496833086013794,
"eval_runtime": 17.2737,
"eval_samples_per_second": 1100.169,
"eval_steps_per_second": 34.388,
"step": 18950
},
{
"epoch": 16.36520241171404,
"grad_norm": 0.5198789834976196,
"learning_rate": 0.00013454263565891475,
"loss": 0.3233955383300781,
"step": 19000
},
{
"epoch": 16.36520241171404,
"eval_loss": 0.3942340910434723,
"eval_runtime": 17.4115,
"eval_samples_per_second": 1091.465,
"eval_steps_per_second": 34.115,
"step": 19000
},
{
"epoch": 16.40826873385013,
"grad_norm": 0.6628881692886353,
"learning_rate": 0.00013437037037037036,
"loss": 0.3342235565185547,
"step": 19050
},
{
"epoch": 16.40826873385013,
"eval_loss": 0.3900792598724365,
"eval_runtime": 17.3181,
"eval_samples_per_second": 1097.349,
"eval_steps_per_second": 34.299,
"step": 19050
},
{
"epoch": 16.45133505598622,
"grad_norm": 0.44677937030792236,
"learning_rate": 0.00013419810508182603,
"loss": 0.3183754348754883,
"step": 19100
},
{
"epoch": 16.45133505598622,
"eval_loss": 0.39026835560798645,
"eval_runtime": 16.6126,
"eval_samples_per_second": 1143.954,
"eval_steps_per_second": 35.756,
"step": 19100
},
{
"epoch": 16.49440137812231,
"grad_norm": 0.7109096050262451,
"learning_rate": 0.00013402583979328167,
"loss": 0.33458770751953126,
"step": 19150
},
{
"epoch": 16.49440137812231,
"eval_loss": 0.39101794362068176,
"eval_runtime": 17.4189,
"eval_samples_per_second": 1090.998,
"eval_steps_per_second": 34.101,
"step": 19150
},
{
"epoch": 16.537467700258397,
"grad_norm": 0.5609311461448669,
"learning_rate": 0.0001338535745047373,
"loss": 0.32187355041503907,
"step": 19200
},
{
"epoch": 16.537467700258397,
"eval_loss": 0.3992527425289154,
"eval_runtime": 17.3996,
"eval_samples_per_second": 1092.209,
"eval_steps_per_second": 34.139,
"step": 19200
},
{
"epoch": 16.580534022394488,
"grad_norm": 0.545501708984375,
"learning_rate": 0.00013368130921619296,
"loss": 0.329481315612793,
"step": 19250
},
{
"epoch": 16.580534022394488,
"eval_loss": 0.39770665764808655,
"eval_runtime": 17.1122,
"eval_samples_per_second": 1110.554,
"eval_steps_per_second": 34.712,
"step": 19250
},
{
"epoch": 16.623600344530576,
"grad_norm": 0.5404015779495239,
"learning_rate": 0.00013350904392764857,
"loss": 0.3238807678222656,
"step": 19300
},
{
"epoch": 16.623600344530576,
"eval_loss": 0.39125141501426697,
"eval_runtime": 17.1581,
"eval_samples_per_second": 1107.581,
"eval_steps_per_second": 34.619,
"step": 19300
},
{
"epoch": 16.666666666666668,
"grad_norm": 0.5957887768745422,
"learning_rate": 0.00013333677863910424,
"loss": 0.3259474945068359,
"step": 19350
},
{
"epoch": 16.666666666666668,
"eval_loss": 0.392182856798172,
"eval_runtime": 17.339,
"eval_samples_per_second": 1096.028,
"eval_steps_per_second": 34.258,
"step": 19350
},
{
"epoch": 16.709732988802756,
"grad_norm": 0.5888888239860535,
"learning_rate": 0.00013316451335055986,
"loss": 0.32502166748046873,
"step": 19400
},
{
"epoch": 16.709732988802756,
"eval_loss": 0.38831356167793274,
"eval_runtime": 19.0354,
"eval_samples_per_second": 998.352,
"eval_steps_per_second": 31.205,
"step": 19400
},
{
"epoch": 16.752799310938848,
"grad_norm": 0.5550042986869812,
"learning_rate": 0.0001329922480620155,
"loss": 0.3259937286376953,
"step": 19450
},
{
"epoch": 16.752799310938848,
"eval_loss": 0.39002206921577454,
"eval_runtime": 18.2055,
"eval_samples_per_second": 1043.861,
"eval_steps_per_second": 32.628,
"step": 19450
},
{
"epoch": 16.795865633074936,
"grad_norm": 0.5127618908882141,
"learning_rate": 0.00013281998277347114,
"loss": 0.32699764251708985,
"step": 19500
},
{
"epoch": 16.795865633074936,
"eval_loss": 0.3960529863834381,
"eval_runtime": 17.4398,
"eval_samples_per_second": 1089.69,
"eval_steps_per_second": 34.06,
"step": 19500
},
{
"epoch": 16.838931955211024,
"grad_norm": 0.4416603446006775,
"learning_rate": 0.00013264771748492678,
"loss": 0.32539680480957034,
"step": 19550
},
{
"epoch": 16.838931955211024,
"eval_loss": 0.39588266611099243,
"eval_runtime": 18.3991,
"eval_samples_per_second": 1032.874,
"eval_steps_per_second": 32.284,
"step": 19550
},
{
"epoch": 16.881998277347115,
"grad_norm": 0.5327856540679932,
"learning_rate": 0.00013247545219638245,
"loss": 0.3237502288818359,
"step": 19600
},
{
"epoch": 16.881998277347115,
"eval_loss": 0.39336878061294556,
"eval_runtime": 17.153,
"eval_samples_per_second": 1107.913,
"eval_steps_per_second": 34.63,
"step": 19600
},
{
"epoch": 16.925064599483203,
"grad_norm": 0.5136561989784241,
"learning_rate": 0.00013230318690783807,
"loss": 0.33170738220214846,
"step": 19650
},
{
"epoch": 16.925064599483203,
"eval_loss": 0.39044713973999023,
"eval_runtime": 16.8605,
"eval_samples_per_second": 1127.13,
"eval_steps_per_second": 35.23,
"step": 19650
},
{
"epoch": 16.968130921619295,
"grad_norm": 0.6515547037124634,
"learning_rate": 0.00013213092161929374,
"loss": 0.3308847427368164,
"step": 19700
},
{
"epoch": 16.968130921619295,
"eval_loss": 0.39422011375427246,
"eval_runtime": 17.4151,
"eval_samples_per_second": 1091.239,
"eval_steps_per_second": 34.108,
"step": 19700
},
{
"epoch": 17.011197243755383,
"grad_norm": 0.6061251163482666,
"learning_rate": 0.00013195865633074935,
"loss": 0.31996837615966794,
"step": 19750
},
{
"epoch": 17.011197243755383,
"eval_loss": 0.38462579250335693,
"eval_runtime": 17.2169,
"eval_samples_per_second": 1103.799,
"eval_steps_per_second": 34.501,
"step": 19750
},
{
"epoch": 17.05426356589147,
"grad_norm": 0.5053262114524841,
"learning_rate": 0.000131786391042205,
"loss": 0.32310577392578127,
"step": 19800
},
{
"epoch": 17.05426356589147,
"eval_loss": 0.38237592577934265,
"eval_runtime": 16.7211,
"eval_samples_per_second": 1136.529,
"eval_steps_per_second": 35.524,
"step": 19800
},
{
"epoch": 17.097329888027563,
"grad_norm": 0.5430858731269836,
"learning_rate": 0.00013161412575366064,
"loss": 0.3208709716796875,
"step": 19850
},
{
"epoch": 17.097329888027563,
"eval_loss": 0.3853854537010193,
"eval_runtime": 17.5253,
"eval_samples_per_second": 1084.373,
"eval_steps_per_second": 33.894,
"step": 19850
},
{
"epoch": 17.14039621016365,
"grad_norm": 0.6029484868049622,
"learning_rate": 0.00013144186046511628,
"loss": 0.33539260864257814,
"step": 19900
},
{
"epoch": 17.14039621016365,
"eval_loss": 0.3901008367538452,
"eval_runtime": 17.3659,
"eval_samples_per_second": 1094.326,
"eval_steps_per_second": 34.205,
"step": 19900
},
{
"epoch": 17.183462532299743,
"grad_norm": 0.5201014876365662,
"learning_rate": 0.00013126959517657195,
"loss": 0.3294208908081055,
"step": 19950
},
{
"epoch": 17.183462532299743,
"eval_loss": 0.38815146684646606,
"eval_runtime": 17.2059,
"eval_samples_per_second": 1104.504,
"eval_steps_per_second": 34.523,
"step": 19950
},
{
"epoch": 17.22652885443583,
"grad_norm": 0.501994788646698,
"learning_rate": 0.00013109732988802756,
"loss": 0.32511749267578127,
"step": 20000
},
{
"epoch": 17.22652885443583,
"eval_loss": 0.38529202342033386,
"eval_runtime": 17.3501,
"eval_samples_per_second": 1095.327,
"eval_steps_per_second": 34.236,
"step": 20000
},
{
"epoch": 17.269595176571922,
"grad_norm": 0.5027745366096497,
"learning_rate": 0.0001309250645994832,
"loss": 0.31986518859863283,
"step": 20050
},
{
"epoch": 17.269595176571922,
"eval_loss": 0.391549289226532,
"eval_runtime": 17.1948,
"eval_samples_per_second": 1105.216,
"eval_steps_per_second": 34.545,
"step": 20050
},
{
"epoch": 17.31266149870801,
"grad_norm": 0.6108536720275879,
"learning_rate": 0.00013075279931093885,
"loss": 0.3184634971618652,
"step": 20100
},
{
"epoch": 17.31266149870801,
"eval_loss": 0.38223689794540405,
"eval_runtime": 16.5133,
"eval_samples_per_second": 1150.827,
"eval_steps_per_second": 35.971,
"step": 20100
},
{
"epoch": 17.3557278208441,
"grad_norm": 0.5198239088058472,
"learning_rate": 0.0001305805340223945,
"loss": 0.3237266540527344,
"step": 20150
},
{
"epoch": 17.3557278208441,
"eval_loss": 0.38378673791885376,
"eval_runtime": 17.0237,
"eval_samples_per_second": 1116.326,
"eval_steps_per_second": 34.893,
"step": 20150
},
{
"epoch": 17.39879414298019,
"grad_norm": 0.5638048052787781,
"learning_rate": 0.00013040826873385013,
"loss": 0.32793220520019534,
"step": 20200
},
{
"epoch": 17.39879414298019,
"eval_loss": 0.3887383043766022,
"eval_runtime": 17.368,
"eval_samples_per_second": 1094.196,
"eval_steps_per_second": 34.201,
"step": 20200
},
{
"epoch": 17.441860465116278,
"grad_norm": 0.5240247845649719,
"learning_rate": 0.00013023600344530577,
"loss": 0.3263576889038086,
"step": 20250
},
{
"epoch": 17.441860465116278,
"eval_loss": 0.3850667476654053,
"eval_runtime": 16.5853,
"eval_samples_per_second": 1145.832,
"eval_steps_per_second": 35.815,
"step": 20250
},
{
"epoch": 17.48492678725237,
"grad_norm": 0.4876612424850464,
"learning_rate": 0.00013006373815676141,
"loss": 0.321444091796875,
"step": 20300
},
{
"epoch": 17.48492678725237,
"eval_loss": 0.3869178295135498,
"eval_runtime": 17.3704,
"eval_samples_per_second": 1094.042,
"eval_steps_per_second": 34.196,
"step": 20300
},
{
"epoch": 17.527993109388458,
"grad_norm": 0.4462520182132721,
"learning_rate": 0.00012989147286821706,
"loss": 0.31568416595458987,
"step": 20350
},
{
"epoch": 17.527993109388458,
"eval_loss": 0.38926127552986145,
"eval_runtime": 17.5271,
"eval_samples_per_second": 1084.261,
"eval_steps_per_second": 33.89,
"step": 20350
},
{
"epoch": 17.57105943152455,
"grad_norm": 0.44649818539619446,
"learning_rate": 0.0001297192075796727,
"loss": 0.31538238525390627,
"step": 20400
},
{
"epoch": 17.57105943152455,
"eval_loss": 0.38832584023475647,
"eval_runtime": 17.429,
"eval_samples_per_second": 1090.364,
"eval_steps_per_second": 34.081,
"step": 20400
},
{
"epoch": 17.614125753660637,
"grad_norm": 0.5380814671516418,
"learning_rate": 0.00012954694229112834,
"loss": 0.3133953857421875,
"step": 20450
},
{
"epoch": 17.614125753660637,
"eval_loss": 0.3826683461666107,
"eval_runtime": 17.128,
"eval_samples_per_second": 1109.526,
"eval_steps_per_second": 34.68,
"step": 20450
},
{
"epoch": 17.657192075796726,
"grad_norm": 0.529960036277771,
"learning_rate": 0.00012937467700258398,
"loss": 0.32620044708251955,
"step": 20500
},
{
"epoch": 17.657192075796726,
"eval_loss": 0.392609566450119,
"eval_runtime": 17.057,
"eval_samples_per_second": 1114.15,
"eval_steps_per_second": 34.825,
"step": 20500
},
{
"epoch": 17.700258397932817,
"grad_norm": 0.5545978546142578,
"learning_rate": 0.00012920241171403962,
"loss": 0.3222107696533203,
"step": 20550
},
{
"epoch": 17.700258397932817,
"eval_loss": 0.39162397384643555,
"eval_runtime": 17.1676,
"eval_samples_per_second": 1106.969,
"eval_steps_per_second": 34.6,
"step": 20550
},
{
"epoch": 17.743324720068905,
"grad_norm": 0.4985523819923401,
"learning_rate": 0.00012903014642549527,
"loss": 0.3192410469055176,
"step": 20600
},
{
"epoch": 17.743324720068905,
"eval_loss": 0.3959045708179474,
"eval_runtime": 16.4739,
"eval_samples_per_second": 1153.579,
"eval_steps_per_second": 36.057,
"step": 20600
},
{
"epoch": 17.786391042204997,
"grad_norm": 0.5570954084396362,
"learning_rate": 0.0001288578811369509,
"loss": 0.3183399200439453,
"step": 20650
},
{
"epoch": 17.786391042204997,
"eval_loss": 0.3895127773284912,
"eval_runtime": 17.7317,
"eval_samples_per_second": 1071.752,
"eval_steps_per_second": 33.499,
"step": 20650
},
{
"epoch": 17.829457364341085,
"grad_norm": 0.5547237396240234,
"learning_rate": 0.00012868561584840655,
"loss": 0.32538810729980466,
"step": 20700
},
{
"epoch": 17.829457364341085,
"eval_loss": 0.3861739933490753,
"eval_runtime": 17.6484,
"eval_samples_per_second": 1076.813,
"eval_steps_per_second": 33.657,
"step": 20700
},
{
"epoch": 17.872523686477173,
"grad_norm": 0.447329044342041,
"learning_rate": 0.0001285133505598622,
"loss": 0.3192084121704102,
"step": 20750
},
{
"epoch": 17.872523686477173,
"eval_loss": 0.3876189887523651,
"eval_runtime": 17.7022,
"eval_samples_per_second": 1073.537,
"eval_steps_per_second": 33.555,
"step": 20750
},
{
"epoch": 17.915590008613265,
"grad_norm": 0.5152373313903809,
"learning_rate": 0.00012834108527131784,
"loss": 0.3196234130859375,
"step": 20800
},
{
"epoch": 17.915590008613265,
"eval_loss": 0.3957338035106659,
"eval_runtime": 17.166,
"eval_samples_per_second": 1107.071,
"eval_steps_per_second": 34.603,
"step": 20800
},
{
"epoch": 17.958656330749353,
"grad_norm": 0.6933789253234863,
"learning_rate": 0.00012816881998277348,
"loss": 0.32067276000976563,
"step": 20850
},
{
"epoch": 17.958656330749353,
"eval_loss": 0.3911714255809784,
"eval_runtime": 17.1329,
"eval_samples_per_second": 1109.209,
"eval_steps_per_second": 34.67,
"step": 20850
},
{
"epoch": 18.001722652885444,
"grad_norm": 0.5589037537574768,
"learning_rate": 0.00012799655469422912,
"loss": 0.3197665214538574,
"step": 20900
},
{
"epoch": 18.001722652885444,
"eval_loss": 0.3835658133029938,
"eval_runtime": 17.9174,
"eval_samples_per_second": 1060.645,
"eval_steps_per_second": 33.152,
"step": 20900
},
{
"epoch": 18.044788975021532,
"grad_norm": 0.506934642791748,
"learning_rate": 0.00012782428940568476,
"loss": 0.30830619812011717,
"step": 20950
},
{
"epoch": 18.044788975021532,
"eval_loss": 0.386690229177475,
"eval_runtime": 16.7282,
"eval_samples_per_second": 1136.048,
"eval_steps_per_second": 35.509,
"step": 20950
},
{
"epoch": 18.087855297157624,
"grad_norm": 0.5666943788528442,
"learning_rate": 0.0001276520241171404,
"loss": 0.31840274810791014,
"step": 21000
},
{
"epoch": 18.087855297157624,
"eval_loss": 0.3852052390575409,
"eval_runtime": 17.2738,
"eval_samples_per_second": 1100.166,
"eval_steps_per_second": 34.387,
"step": 21000
},
{
"epoch": 18.130921619293712,
"grad_norm": 0.4481973648071289,
"learning_rate": 0.00012747975882859605,
"loss": 0.31186540603637697,
"step": 21050
},
{
"epoch": 18.130921619293712,
"eval_loss": 0.3922789394855499,
"eval_runtime": 17.4131,
"eval_samples_per_second": 1091.36,
"eval_steps_per_second": 34.112,
"step": 21050
},
{
"epoch": 18.1739879414298,
"grad_norm": 0.4946073889732361,
"learning_rate": 0.0001273074935400517,
"loss": 0.31672819137573244,
"step": 21100
},
{
"epoch": 18.1739879414298,
"eval_loss": 0.38604193925857544,
"eval_runtime": 16.9427,
"eval_samples_per_second": 1121.666,
"eval_steps_per_second": 35.059,
"step": 21100
},
{
"epoch": 18.217054263565892,
"grad_norm": 0.6280601024627686,
"learning_rate": 0.00012713522825150733,
"loss": 0.31234973907470703,
"step": 21150
},
{
"epoch": 18.217054263565892,
"eval_loss": 0.3931697905063629,
"eval_runtime": 17.1136,
"eval_samples_per_second": 1110.463,
"eval_steps_per_second": 34.709,
"step": 21150
},
{
"epoch": 18.26012058570198,
"grad_norm": 0.6457147598266602,
"learning_rate": 0.00012696296296296297,
"loss": 0.3120527648925781,
"step": 21200
},
{
"epoch": 18.26012058570198,
"eval_loss": 0.3861483931541443,
"eval_runtime": 17.1111,
"eval_samples_per_second": 1110.622,
"eval_steps_per_second": 34.714,
"step": 21200
},
{
"epoch": 18.30318690783807,
"grad_norm": 0.6117618680000305,
"learning_rate": 0.00012679069767441861,
"loss": 0.3122466850280762,
"step": 21250
},
{
"epoch": 18.30318690783807,
"eval_loss": 0.38948336243629456,
"eval_runtime": 16.7751,
"eval_samples_per_second": 1132.872,
"eval_steps_per_second": 35.41,
"step": 21250
},
{
"epoch": 18.34625322997416,
"grad_norm": 0.5237821340560913,
"learning_rate": 0.00012661843238587426,
"loss": 0.3228919219970703,
"step": 21300
},
{
"epoch": 18.34625322997416,
"eval_loss": 0.3906412124633789,
"eval_runtime": 16.684,
"eval_samples_per_second": 1139.054,
"eval_steps_per_second": 35.603,
"step": 21300
},
{
"epoch": 18.38931955211025,
"grad_norm": 0.5655460953712463,
"learning_rate": 0.0001264461670973299,
"loss": 0.3129468154907227,
"step": 21350
},
{
"epoch": 18.38931955211025,
"eval_loss": 0.38534438610076904,
"eval_runtime": 16.7814,
"eval_samples_per_second": 1132.447,
"eval_steps_per_second": 35.396,
"step": 21350
},
{
"epoch": 18.43238587424634,
"grad_norm": 0.5534801483154297,
"learning_rate": 0.00012627390180878554,
"loss": 0.3185391616821289,
"step": 21400
},
{
"epoch": 18.43238587424634,
"eval_loss": 0.391643762588501,
"eval_runtime": 16.3245,
"eval_samples_per_second": 1164.138,
"eval_steps_per_second": 36.387,
"step": 21400
},
{
"epoch": 18.475452196382427,
"grad_norm": 0.5594217777252197,
"learning_rate": 0.00012610163652024118,
"loss": 0.321013298034668,
"step": 21450
},
{
"epoch": 18.475452196382427,
"eval_loss": 0.38839325308799744,
"eval_runtime": 16.6874,
"eval_samples_per_second": 1138.825,
"eval_steps_per_second": 35.596,
"step": 21450
},
{
"epoch": 18.51851851851852,
"grad_norm": 0.47908687591552734,
"learning_rate": 0.00012592937123169682,
"loss": 0.31948158264160154,
"step": 21500
},
{
"epoch": 18.51851851851852,
"eval_loss": 0.38242292404174805,
"eval_runtime": 16.8044,
"eval_samples_per_second": 1130.893,
"eval_steps_per_second": 35.348,
"step": 21500
},
{
"epoch": 18.561584840654607,
"grad_norm": 0.43603160977363586,
"learning_rate": 0.00012575710594315247,
"loss": 0.31390554428100587,
"step": 21550
},
{
"epoch": 18.561584840654607,
"eval_loss": 0.37871894240379333,
"eval_runtime": 16.1361,
"eval_samples_per_second": 1177.73,
"eval_steps_per_second": 36.812,
"step": 21550
},
{
"epoch": 18.6046511627907,
"grad_norm": 0.5649058222770691,
"learning_rate": 0.00012558484065460808,
"loss": 0.3198824691772461,
"step": 21600
},
{
"epoch": 18.6046511627907,
"eval_loss": 0.38102349638938904,
"eval_runtime": 16.6623,
"eval_samples_per_second": 1140.537,
"eval_steps_per_second": 35.649,
"step": 21600
},
{
"epoch": 18.647717484926787,
"grad_norm": 0.4506802558898926,
"learning_rate": 0.00012541257536606375,
"loss": 0.31376161575317385,
"step": 21650
},
{
"epoch": 18.647717484926787,
"eval_loss": 0.38414791226387024,
"eval_runtime": 16.5267,
"eval_samples_per_second": 1149.899,
"eval_steps_per_second": 35.942,
"step": 21650
},
{
"epoch": 18.69078380706288,
"grad_norm": 0.4440782368183136,
"learning_rate": 0.0001252403100775194,
"loss": 0.3159283447265625,
"step": 21700
},
{
"epoch": 18.69078380706288,
"eval_loss": 0.37915998697280884,
"eval_runtime": 16.2176,
"eval_samples_per_second": 1171.81,
"eval_steps_per_second": 36.627,
"step": 21700
},
{
"epoch": 18.733850129198967,
"grad_norm": 0.6125892400741577,
"learning_rate": 0.00012506804478897504,
"loss": 0.3128934097290039,
"step": 21750
},
{
"epoch": 18.733850129198967,
"eval_loss": 0.38138166069984436,
"eval_runtime": 15.9943,
"eval_samples_per_second": 1188.175,
"eval_steps_per_second": 37.138,
"step": 21750
},
{
"epoch": 18.776916451335055,
"grad_norm": 0.5350700616836548,
"learning_rate": 0.00012489577950043068,
"loss": 0.31795137405395507,
"step": 21800
},
{
"epoch": 18.776916451335055,
"eval_loss": 0.38072770833969116,
"eval_runtime": 17.0595,
"eval_samples_per_second": 1113.985,
"eval_steps_per_second": 34.819,
"step": 21800
},
{
"epoch": 18.819982773471146,
"grad_norm": 0.6350408792495728,
"learning_rate": 0.0001247235142118863,
"loss": 0.3134295463562012,
"step": 21850
},
{
"epoch": 18.819982773471146,
"eval_loss": 0.3758614957332611,
"eval_runtime": 17.4302,
"eval_samples_per_second": 1090.29,
"eval_steps_per_second": 34.079,
"step": 21850
},
{
"epoch": 18.863049095607234,
"grad_norm": 0.5005162954330444,
"learning_rate": 0.00012455124892334196,
"loss": 0.32502140045166017,
"step": 21900
},
{
"epoch": 18.863049095607234,
"eval_loss": 0.38265460729599,
"eval_runtime": 16.6214,
"eval_samples_per_second": 1143.347,
"eval_steps_per_second": 35.737,
"step": 21900
},
{
"epoch": 18.906115417743326,
"grad_norm": 0.5520798563957214,
"learning_rate": 0.00012437898363479758,
"loss": 0.3212323760986328,
"step": 21950
},
{
"epoch": 18.906115417743326,
"eval_loss": 0.3830348253250122,
"eval_runtime": 17.3028,
"eval_samples_per_second": 1098.317,
"eval_steps_per_second": 34.33,
"step": 21950
},
{
"epoch": 18.949181739879414,
"grad_norm": 0.41653236746788025,
"learning_rate": 0.00012420671834625325,
"loss": 0.3083462142944336,
"step": 22000
},
{
"epoch": 18.949181739879414,
"eval_loss": 0.3885103762149811,
"eval_runtime": 17.2255,
"eval_samples_per_second": 1103.246,
"eval_steps_per_second": 34.484,
"step": 22000
},
{
"epoch": 18.992248062015506,
"grad_norm": 0.42078638076782227,
"learning_rate": 0.0001240344530577089,
"loss": 0.3157729721069336,
"step": 22050
},
{
"epoch": 18.992248062015506,
"eval_loss": 0.38450390100479126,
"eval_runtime": 17.1782,
"eval_samples_per_second": 1106.283,
"eval_steps_per_second": 34.579,
"step": 22050
},
{
"epoch": 19.035314384151594,
"grad_norm": 0.4940221607685089,
"learning_rate": 0.00012386218776916453,
"loss": 0.3200815963745117,
"step": 22100
},
{
"epoch": 19.035314384151594,
"eval_loss": 0.3834143579006195,
"eval_runtime": 17.3493,
"eval_samples_per_second": 1095.375,
"eval_steps_per_second": 34.238,
"step": 22100
},
{
"epoch": 19.078380706287682,
"grad_norm": 0.5358327031135559,
"learning_rate": 0.00012368992248062017,
"loss": 0.3109498596191406,
"step": 22150
},
{
"epoch": 19.078380706287682,
"eval_loss": 0.38372257351875305,
"eval_runtime": 17.6251,
"eval_samples_per_second": 1078.233,
"eval_steps_per_second": 33.702,
"step": 22150
},
{
"epoch": 19.121447028423773,
"grad_norm": 0.5010002255439758,
"learning_rate": 0.0001235176571920758,
"loss": 0.30669967651367186,
"step": 22200
},
{
"epoch": 19.121447028423773,
"eval_loss": 0.3840700387954712,
"eval_runtime": 16.9058,
"eval_samples_per_second": 1124.114,
"eval_steps_per_second": 35.136,
"step": 22200
},
{
"epoch": 19.16451335055986,
"grad_norm": 0.514018714427948,
"learning_rate": 0.00012334539190353146,
"loss": 0.3106396293640137,
"step": 22250
},
{
"epoch": 19.16451335055986,
"eval_loss": 0.3840588927268982,
"eval_runtime": 17.0702,
"eval_samples_per_second": 1113.284,
"eval_steps_per_second": 34.797,
"step": 22250
},
{
"epoch": 19.207579672695953,
"grad_norm": 0.4882144033908844,
"learning_rate": 0.00012317312661498707,
"loss": 0.31231227874755857,
"step": 22300
},
{
"epoch": 19.207579672695953,
"eval_loss": 0.3815734088420868,
"eval_runtime": 17.1064,
"eval_samples_per_second": 1110.929,
"eval_steps_per_second": 34.724,
"step": 22300
},
{
"epoch": 19.25064599483204,
"grad_norm": 0.4594323933124542,
"learning_rate": 0.00012300086132644274,
"loss": 0.3108772659301758,
"step": 22350
},
{
"epoch": 19.25064599483204,
"eval_loss": 0.3820939064025879,
"eval_runtime": 15.8721,
"eval_samples_per_second": 1197.32,
"eval_steps_per_second": 37.424,
"step": 22350
},
{
"epoch": 19.29371231696813,
"grad_norm": 0.740356981754303,
"learning_rate": 0.00012282859603789838,
"loss": 0.31078941345214844,
"step": 22400
},
{
"epoch": 19.29371231696813,
"eval_loss": 0.3824281394481659,
"eval_runtime": 17.3013,
"eval_samples_per_second": 1098.412,
"eval_steps_per_second": 34.333,
"step": 22400
},
{
"epoch": 19.33677863910422,
"grad_norm": 0.7296916842460632,
"learning_rate": 0.000122656330749354,
"loss": 0.31134443283081054,
"step": 22450
},
{
"epoch": 19.33677863910422,
"eval_loss": 0.3865135908126831,
"eval_runtime": 17.5063,
"eval_samples_per_second": 1085.554,
"eval_steps_per_second": 33.931,
"step": 22450
},
{
"epoch": 19.37984496124031,
"grad_norm": 0.6394023299217224,
"learning_rate": 0.00012248406546080967,
"loss": 0.32353458404541013,
"step": 22500
},
{
"epoch": 19.37984496124031,
"eval_loss": 0.38835322856903076,
"eval_runtime": 17.2454,
"eval_samples_per_second": 1101.977,
"eval_steps_per_second": 34.444,
"step": 22500
},
{
"epoch": 19.4229112833764,
"grad_norm": 0.5603283047676086,
"learning_rate": 0.00012231180017226528,
"loss": 0.31344139099121093,
"step": 22550
},
{
"epoch": 19.4229112833764,
"eval_loss": 0.3792956471443176,
"eval_runtime": 17.1044,
"eval_samples_per_second": 1111.061,
"eval_steps_per_second": 34.728,
"step": 22550
},
{
"epoch": 19.46597760551249,
"grad_norm": 0.5717983245849609,
"learning_rate": 0.00012213953488372095,
"loss": 0.3134092903137207,
"step": 22600
},
{
"epoch": 19.46597760551249,
"eval_loss": 0.3799424171447754,
"eval_runtime": 17.2965,
"eval_samples_per_second": 1098.722,
"eval_steps_per_second": 34.342,
"step": 22600
},
{
"epoch": 19.50904392764858,
"grad_norm": 0.5377107262611389,
"learning_rate": 0.00012196726959517657,
"loss": 0.3165386962890625,
"step": 22650
},
{
"epoch": 19.50904392764858,
"eval_loss": 0.3824997544288635,
"eval_runtime": 17.1515,
"eval_samples_per_second": 1108.009,
"eval_steps_per_second": 34.633,
"step": 22650
},
{
"epoch": 19.55211024978467,
"grad_norm": 0.45157045125961304,
"learning_rate": 0.00012179500430663222,
"loss": 0.315402946472168,
"step": 22700
},
{
"epoch": 19.55211024978467,
"eval_loss": 0.3792726695537567,
"eval_runtime": 15.7874,
"eval_samples_per_second": 1203.742,
"eval_steps_per_second": 37.625,
"step": 22700
},
{
"epoch": 19.595176571920756,
"grad_norm": 0.5813359618186951,
"learning_rate": 0.00012162273901808785,
"loss": 0.31789175033569333,
"step": 22750
},
{
"epoch": 19.595176571920756,
"eval_loss": 0.38310182094573975,
"eval_runtime": 17.083,
"eval_samples_per_second": 1112.45,
"eval_steps_per_second": 34.771,
"step": 22750
},
{
"epoch": 19.638242894056848,
"grad_norm": 0.4540177881717682,
"learning_rate": 0.0001214504737295435,
"loss": 0.31223211288452146,
"step": 22800
},
{
"epoch": 19.638242894056848,
"eval_loss": 0.3790459632873535,
"eval_runtime": 15.6609,
"eval_samples_per_second": 1213.466,
"eval_steps_per_second": 37.929,
"step": 22800
},
{
"epoch": 19.681309216192936,
"grad_norm": 0.4714398682117462,
"learning_rate": 0.00012127820844099915,
"loss": 0.3169035530090332,
"step": 22850
},
{
"epoch": 19.681309216192936,
"eval_loss": 0.38244912028312683,
"eval_runtime": 17.3654,
"eval_samples_per_second": 1094.362,
"eval_steps_per_second": 34.206,
"step": 22850
},
{
"epoch": 19.724375538329028,
"grad_norm": 0.4587773382663727,
"learning_rate": 0.00012110594315245478,
"loss": 0.317059211730957,
"step": 22900
},
{
"epoch": 19.724375538329028,
"eval_loss": 0.3824186623096466,
"eval_runtime": 17.9642,
"eval_samples_per_second": 1057.884,
"eval_steps_per_second": 33.066,
"step": 22900
},
{
"epoch": 19.767441860465116,
"grad_norm": 0.48488104343414307,
"learning_rate": 0.00012093367786391043,
"loss": 0.3131584358215332,
"step": 22950
},
{
"epoch": 19.767441860465116,
"eval_loss": 0.377473920583725,
"eval_runtime": 18.2414,
"eval_samples_per_second": 1041.805,
"eval_steps_per_second": 32.563,
"step": 22950
},
{
"epoch": 19.810508182601207,
"grad_norm": 0.45736151933670044,
"learning_rate": 0.00012076141257536606,
"loss": 0.3098099708557129,
"step": 23000
},
{
"epoch": 19.810508182601207,
"eval_loss": 0.3901560604572296,
"eval_runtime": 17.204,
"eval_samples_per_second": 1104.628,
"eval_steps_per_second": 34.527,
"step": 23000
},
{
"epoch": 19.853574504737296,
"grad_norm": 0.44181448221206665,
"learning_rate": 0.00012058914728682172,
"loss": 0.3161106872558594,
"step": 23050
},
{
"epoch": 19.853574504737296,
"eval_loss": 0.3850392699241638,
"eval_runtime": 17.794,
"eval_samples_per_second": 1067.998,
"eval_steps_per_second": 33.382,
"step": 23050
},
{
"epoch": 19.896640826873384,
"grad_norm": 0.48947784304618835,
"learning_rate": 0.00012041688199827734,
"loss": 0.30796041488647463,
"step": 23100
},
{
"epoch": 19.896640826873384,
"eval_loss": 0.3785504996776581,
"eval_runtime": 17.2727,
"eval_samples_per_second": 1100.234,
"eval_steps_per_second": 34.39,
"step": 23100
},
{
"epoch": 19.939707149009475,
"grad_norm": 0.47454968094825745,
"learning_rate": 0.00012024461670973299,
"loss": 0.32041351318359373,
"step": 23150
},
{
"epoch": 19.939707149009475,
"eval_loss": 0.38033241033554077,
"eval_runtime": 16.484,
"eval_samples_per_second": 1152.877,
"eval_steps_per_second": 36.035,
"step": 23150
},
{
"epoch": 19.982773471145563,
"grad_norm": 0.4740263819694519,
"learning_rate": 0.00012007235142118864,
"loss": 0.3148014068603516,
"step": 23200
},
{
"epoch": 19.982773471145563,
"eval_loss": 0.38085371255874634,
"eval_runtime": 17.3432,
"eval_samples_per_second": 1095.761,
"eval_steps_per_second": 34.25,
"step": 23200
},
{
"epoch": 20.025839793281655,
"grad_norm": 0.5026439428329468,
"learning_rate": 0.00011990008613264427,
"loss": 0.3176981353759766,
"step": 23250
},
{
"epoch": 20.025839793281655,
"eval_loss": 0.3808394968509674,
"eval_runtime": 16.7675,
"eval_samples_per_second": 1133.384,
"eval_steps_per_second": 35.426,
"step": 23250
},
{
"epoch": 20.068906115417743,
"grad_norm": 0.4572559595108032,
"learning_rate": 0.00011972782084409993,
"loss": 0.3045423126220703,
"step": 23300
},
{
"epoch": 20.068906115417743,
"eval_loss": 0.38897034525871277,
"eval_runtime": 17.4371,
"eval_samples_per_second": 1089.86,
"eval_steps_per_second": 34.065,
"step": 23300
},
{
"epoch": 20.11197243755383,
"grad_norm": 0.4959510266780853,
"learning_rate": 0.00011955555555555556,
"loss": 0.3092060089111328,
"step": 23350
},
{
"epoch": 20.11197243755383,
"eval_loss": 0.3781077265739441,
"eval_runtime": 17.1755,
"eval_samples_per_second": 1106.461,
"eval_steps_per_second": 34.584,
"step": 23350
},
{
"epoch": 20.155038759689923,
"grad_norm": 0.576425313949585,
"learning_rate": 0.00011938329026701121,
"loss": 0.3072701644897461,
"step": 23400
},
{
"epoch": 20.155038759689923,
"eval_loss": 0.3774796426296234,
"eval_runtime": 17.5053,
"eval_samples_per_second": 1085.616,
"eval_steps_per_second": 33.933,
"step": 23400
},
{
"epoch": 20.19810508182601,
"grad_norm": 0.5191887021064758,
"learning_rate": 0.00011921102497846684,
"loss": 0.3084998512268066,
"step": 23450
},
{
"epoch": 20.19810508182601,
"eval_loss": 0.3787190318107605,
"eval_runtime": 17.5021,
"eval_samples_per_second": 1085.811,
"eval_steps_per_second": 33.939,
"step": 23450
},
{
"epoch": 20.241171403962102,
"grad_norm": 0.4725388288497925,
"learning_rate": 0.00011903875968992248,
"loss": 0.30435737609863284,
"step": 23500
},
{
"epoch": 20.241171403962102,
"eval_loss": 0.3799598217010498,
"eval_runtime": 17.3816,
"eval_samples_per_second": 1093.34,
"eval_steps_per_second": 34.174,
"step": 23500
},
{
"epoch": 20.28423772609819,
"grad_norm": 0.460355281829834,
"learning_rate": 0.00011886649440137814,
"loss": 0.31231136322021485,
"step": 23550
},
{
"epoch": 20.28423772609819,
"eval_loss": 0.3775605857372284,
"eval_runtime": 17.0961,
"eval_samples_per_second": 1111.596,
"eval_steps_per_second": 34.745,
"step": 23550
},
{
"epoch": 20.327304048234282,
"grad_norm": 0.44472384452819824,
"learning_rate": 0.00011869422911283377,
"loss": 0.3119425964355469,
"step": 23600
},
{
"epoch": 20.327304048234282,
"eval_loss": 0.385606974363327,
"eval_runtime": 17.4347,
"eval_samples_per_second": 1090.01,
"eval_steps_per_second": 34.07,
"step": 23600
},
{
"epoch": 20.37037037037037,
"grad_norm": 0.4899062514305115,
"learning_rate": 0.00011852196382428942,
"loss": 0.3081985664367676,
"step": 23650
},
{
"epoch": 20.37037037037037,
"eval_loss": 0.3742313086986542,
"eval_runtime": 17.4633,
"eval_samples_per_second": 1088.223,
"eval_steps_per_second": 34.014,
"step": 23650
},
{
"epoch": 20.41343669250646,
"grad_norm": 0.39342230558395386,
"learning_rate": 0.00011834969853574505,
"loss": 0.3061702919006348,
"step": 23700
},
{
"epoch": 20.41343669250646,
"eval_loss": 0.3832343518733978,
"eval_runtime": 16.5699,
"eval_samples_per_second": 1146.896,
"eval_steps_per_second": 35.848,
"step": 23700
},
{
"epoch": 20.45650301464255,
"grad_norm": 0.5367943644523621,
"learning_rate": 0.00011817743324720069,
"loss": 0.3173460388183594,
"step": 23750
},
{
"epoch": 20.45650301464255,
"eval_loss": 0.3846309781074524,
"eval_runtime": 17.5177,
"eval_samples_per_second": 1084.846,
"eval_steps_per_second": 33.909,
"step": 23750
},
{
"epoch": 20.499569336778638,
"grad_norm": 0.468249648809433,
"learning_rate": 0.00011800516795865632,
"loss": 0.3046343994140625,
"step": 23800
},
{
"epoch": 20.499569336778638,
"eval_loss": 0.3752977252006531,
"eval_runtime": 17.4727,
"eval_samples_per_second": 1087.639,
"eval_steps_per_second": 33.996,
"step": 23800
},
{
"epoch": 20.54263565891473,
"grad_norm": 0.4803526997566223,
"learning_rate": 0.00011783290267011198,
"loss": 0.31560543060302737,
"step": 23850
},
{
"epoch": 20.54263565891473,
"eval_loss": 0.37215656042099,
"eval_runtime": 17.4673,
"eval_samples_per_second": 1087.973,
"eval_steps_per_second": 34.006,
"step": 23850
},
{
"epoch": 20.585701981050818,
"grad_norm": 0.4824640154838562,
"learning_rate": 0.00011766063738156763,
"loss": 0.3100012397766113,
"step": 23900
},
{
"epoch": 20.585701981050818,
"eval_loss": 0.3795510530471802,
"eval_runtime": 17.3565,
"eval_samples_per_second": 1094.921,
"eval_steps_per_second": 34.223,
"step": 23900
},
{
"epoch": 20.62876830318691,
"grad_norm": 0.5121549963951111,
"learning_rate": 0.00011748837209302326,
"loss": 0.3087359619140625,
"step": 23950
},
{
"epoch": 20.62876830318691,
"eval_loss": 0.37073156237602234,
"eval_runtime": 17.7974,
"eval_samples_per_second": 1067.794,
"eval_steps_per_second": 33.376,
"step": 23950
},
{
"epoch": 20.671834625322997,
"grad_norm": 0.5719079971313477,
"learning_rate": 0.0001173161068044789,
"loss": 0.3093043327331543,
"step": 24000
},
{
"epoch": 20.671834625322997,
"eval_loss": 0.3770056366920471,
"eval_runtime": 17.2425,
"eval_samples_per_second": 1102.16,
"eval_steps_per_second": 34.45,
"step": 24000
},
{
"epoch": 20.714900947459086,
"grad_norm": 0.46395695209503174,
"learning_rate": 0.00011714384151593454,
"loss": 0.3123012161254883,
"step": 24050
},
{
"epoch": 20.714900947459086,
"eval_loss": 0.371019572019577,
"eval_runtime": 16.6259,
"eval_samples_per_second": 1143.034,
"eval_steps_per_second": 35.727,
"step": 24050
},
{
"epoch": 20.757967269595177,
"grad_norm": 0.540714681148529,
"learning_rate": 0.00011697157622739019,
"loss": 0.31075138092041016,
"step": 24100
},
{
"epoch": 20.757967269595177,
"eval_loss": 0.37732967734336853,
"eval_runtime": 17.4927,
"eval_samples_per_second": 1086.397,
"eval_steps_per_second": 33.957,
"step": 24100
},
{
"epoch": 20.801033591731265,
"grad_norm": 0.4655165374279022,
"learning_rate": 0.00011679931093884582,
"loss": 0.30832584381103517,
"step": 24150
},
{
"epoch": 20.801033591731265,
"eval_loss": 0.37958091497421265,
"eval_runtime": 17.2454,
"eval_samples_per_second": 1101.973,
"eval_steps_per_second": 34.444,
"step": 24150
},
{
"epoch": 20.844099913867357,
"grad_norm": 0.4928206205368042,
"learning_rate": 0.00011662704565030147,
"loss": 0.31150699615478517,
"step": 24200
},
{
"epoch": 20.844099913867357,
"eval_loss": 0.37537887692451477,
"eval_runtime": 16.4495,
"eval_samples_per_second": 1155.295,
"eval_steps_per_second": 36.111,
"step": 24200
},
{
"epoch": 20.887166236003445,
"grad_norm": 0.5147706270217896,
"learning_rate": 0.00011645478036175713,
"loss": 0.3046610069274902,
"step": 24250
},
{
"epoch": 20.887166236003445,
"eval_loss": 0.3709302246570587,
"eval_runtime": 17.2509,
"eval_samples_per_second": 1101.626,
"eval_steps_per_second": 34.433,
"step": 24250
},
{
"epoch": 20.930232558139537,
"grad_norm": 0.4951704740524292,
"learning_rate": 0.00011628251507321276,
"loss": 0.3043970108032227,
"step": 24300
},
{
"epoch": 20.930232558139537,
"eval_loss": 0.38124656677246094,
"eval_runtime": 17.4681,
"eval_samples_per_second": 1087.923,
"eval_steps_per_second": 34.005,
"step": 24300
},
{
"epoch": 20.973298880275625,
"grad_norm": 0.41925662755966187,
"learning_rate": 0.0001161102497846684,
"loss": 0.3140296173095703,
"step": 24350
},
{
"epoch": 20.973298880275625,
"eval_loss": 0.3799387514591217,
"eval_runtime": 17.5417,
"eval_samples_per_second": 1083.363,
"eval_steps_per_second": 33.862,
"step": 24350
},
{
"epoch": 21.016365202411713,
"grad_norm": 0.5070242285728455,
"learning_rate": 0.00011593798449612403,
"loss": 0.3025740814208984,
"step": 24400
},
{
"epoch": 21.016365202411713,
"eval_loss": 0.3791089355945587,
"eval_runtime": 17.7997,
"eval_samples_per_second": 1067.66,
"eval_steps_per_second": 33.371,
"step": 24400
},
{
"epoch": 21.059431524547804,
"grad_norm": 0.42946505546569824,
"learning_rate": 0.00011576571920757968,
"loss": 0.30825782775878907,
"step": 24450
},
{
"epoch": 21.059431524547804,
"eval_loss": 0.37176281213760376,
"eval_runtime": 19.2954,
"eval_samples_per_second": 984.899,
"eval_steps_per_second": 30.785,
"step": 24450
},
{
"epoch": 21.102497846683892,
"grad_norm": 0.40253499150276184,
"learning_rate": 0.00011559345391903531,
"loss": 0.3015823745727539,
"step": 24500
},
{
"epoch": 21.102497846683892,
"eval_loss": 0.3752162754535675,
"eval_runtime": 18.1997,
"eval_samples_per_second": 1044.192,
"eval_steps_per_second": 32.638,
"step": 24500
},
{
"epoch": 21.145564168819984,
"grad_norm": 0.42977315187454224,
"learning_rate": 0.00011542118863049097,
"loss": 0.308193244934082,
"step": 24550
},
{
"epoch": 21.145564168819984,
"eval_loss": 0.3800414502620697,
"eval_runtime": 18.7255,
"eval_samples_per_second": 1014.872,
"eval_steps_per_second": 31.721,
"step": 24550
},
{
"epoch": 21.188630490956072,
"grad_norm": 0.39198407530784607,
"learning_rate": 0.00011524892334194661,
"loss": 0.3081152725219727,
"step": 24600
},
{
"epoch": 21.188630490956072,
"eval_loss": 0.3643619418144226,
"eval_runtime": 17.8789,
"eval_samples_per_second": 1062.931,
"eval_steps_per_second": 33.224,
"step": 24600
},
{
"epoch": 21.23169681309216,
"grad_norm": 0.5011320114135742,
"learning_rate": 0.00011507665805340224,
"loss": 0.30673561096191404,
"step": 24650
},
{
"epoch": 21.23169681309216,
"eval_loss": 0.37921005487442017,
"eval_runtime": 17.1467,
"eval_samples_per_second": 1108.32,
"eval_steps_per_second": 34.642,
"step": 24650
},
{
"epoch": 21.274763135228252,
"grad_norm": 0.5653364658355713,
"learning_rate": 0.00011490439276485789,
"loss": 0.309172248840332,
"step": 24700
},
{
"epoch": 21.274763135228252,
"eval_loss": 0.377411812543869,
"eval_runtime": 17.148,
"eval_samples_per_second": 1108.234,
"eval_steps_per_second": 34.64,
"step": 24700
},
{
"epoch": 21.31782945736434,
"grad_norm": 0.542508602142334,
"learning_rate": 0.00011473212747631352,
"loss": 0.3056539535522461,
"step": 24750
},
{
"epoch": 21.31782945736434,
"eval_loss": 0.3837369680404663,
"eval_runtime": 18.4413,
"eval_samples_per_second": 1030.514,
"eval_steps_per_second": 32.21,
"step": 24750
},
{
"epoch": 21.36089577950043,
"grad_norm": 0.478859007358551,
"learning_rate": 0.00011455986218776918,
"loss": 0.30355857849121093,
"step": 24800
},
{
"epoch": 21.36089577950043,
"eval_loss": 0.37471258640289307,
"eval_runtime": 17.4466,
"eval_samples_per_second": 1089.264,
"eval_steps_per_second": 34.047,
"step": 24800
},
{
"epoch": 21.40396210163652,
"grad_norm": 0.45857715606689453,
"learning_rate": 0.0001143875968992248,
"loss": 0.30617919921875,
"step": 24850
},
{
"epoch": 21.40396210163652,
"eval_loss": 0.3754553198814392,
"eval_runtime": 17.5133,
"eval_samples_per_second": 1085.116,
"eval_steps_per_second": 33.917,
"step": 24850
},
{
"epoch": 21.44702842377261,
"grad_norm": 0.4991483688354492,
"learning_rate": 0.00011421533161068046,
"loss": 0.30925914764404294,
"step": 24900
},
{
"epoch": 21.44702842377261,
"eval_loss": 0.37263229489326477,
"eval_runtime": 16.9128,
"eval_samples_per_second": 1123.643,
"eval_steps_per_second": 35.121,
"step": 24900
},
{
"epoch": 21.4900947459087,
"grad_norm": 0.4601542055606842,
"learning_rate": 0.0001140430663221361,
"loss": 0.3022798538208008,
"step": 24950
},
{
"epoch": 21.4900947459087,
"eval_loss": 0.3806911110877991,
"eval_runtime": 15.6758,
"eval_samples_per_second": 1212.312,
"eval_steps_per_second": 37.893,
"step": 24950
},
{
"epoch": 21.533161068044787,
"grad_norm": 0.4917987585067749,
"learning_rate": 0.00011387080103359173,
"loss": 0.30524206161499023,
"step": 25000
},
{
"epoch": 21.533161068044787,
"eval_loss": 0.3728342056274414,
"eval_runtime": 17.1627,
"eval_samples_per_second": 1107.283,
"eval_steps_per_second": 34.61,
"step": 25000
},
{
"epoch": 21.57622739018088,
"grad_norm": 0.5263481736183167,
"learning_rate": 0.00011369853574504739,
"loss": 0.30638530731201175,
"step": 25050
},
{
"epoch": 21.57622739018088,
"eval_loss": 0.3712511658668518,
"eval_runtime": 16.0233,
"eval_samples_per_second": 1186.019,
"eval_steps_per_second": 37.071,
"step": 25050
},
{
"epoch": 21.619293712316967,
"grad_norm": 0.48792555928230286,
"learning_rate": 0.00011352627045650302,
"loss": 0.3103178977966309,
"step": 25100
},
{
"epoch": 21.619293712316967,
"eval_loss": 0.370941698551178,
"eval_runtime": 17.3446,
"eval_samples_per_second": 1095.674,
"eval_steps_per_second": 34.247,
"step": 25100
},
{
"epoch": 21.66236003445306,
"grad_norm": 0.5177699327468872,
"learning_rate": 0.00011335400516795867,
"loss": 0.3069496154785156,
"step": 25150
},
{
"epoch": 21.66236003445306,
"eval_loss": 0.3749415874481201,
"eval_runtime": 17.5087,
"eval_samples_per_second": 1085.403,
"eval_steps_per_second": 33.926,
"step": 25150
},
{
"epoch": 21.705426356589147,
"grad_norm": 0.4783251881599426,
"learning_rate": 0.0001131817398794143,
"loss": 0.28859285354614256,
"step": 25200
},
{
"epoch": 21.705426356589147,
"eval_loss": 0.3722775876522064,
"eval_runtime": 17.3202,
"eval_samples_per_second": 1097.216,
"eval_steps_per_second": 34.295,
"step": 25200
},
{
"epoch": 21.74849267872524,
"grad_norm": 0.5633186101913452,
"learning_rate": 0.00011300947459086994,
"loss": 0.30084590911865233,
"step": 25250
},
{
"epoch": 21.74849267872524,
"eval_loss": 0.3847544491291046,
"eval_runtime": 17.2383,
"eval_samples_per_second": 1102.427,
"eval_steps_per_second": 34.458,
"step": 25250
},
{
"epoch": 21.791559000861326,
"grad_norm": 0.5172644853591919,
"learning_rate": 0.0001128372093023256,
"loss": 0.30555530548095705,
"step": 25300
},
{
"epoch": 21.791559000861326,
"eval_loss": 0.37794214487075806,
"eval_runtime": 18.029,
"eval_samples_per_second": 1054.078,
"eval_steps_per_second": 32.947,
"step": 25300
},
{
"epoch": 21.834625322997415,
"grad_norm": 0.48205018043518066,
"learning_rate": 0.00011266494401378123,
"loss": 0.3017234992980957,
"step": 25350
},
{
"epoch": 21.834625322997415,
"eval_loss": 0.3765769302845001,
"eval_runtime": 18.2823,
"eval_samples_per_second": 1039.476,
"eval_steps_per_second": 32.49,
"step": 25350
},
{
"epoch": 21.877691645133506,
"grad_norm": 0.4693802297115326,
"learning_rate": 0.00011249267872523688,
"loss": 0.3170528793334961,
"step": 25400
},
{
"epoch": 21.877691645133506,
"eval_loss": 0.3685433268547058,
"eval_runtime": 16.3179,
"eval_samples_per_second": 1164.61,
"eval_steps_per_second": 36.402,
"step": 25400
},
{
"epoch": 21.920757967269594,
"grad_norm": 0.5655513405799866,
"learning_rate": 0.00011232041343669251,
"loss": 0.30839736938476564,
"step": 25450
},
{
"epoch": 21.920757967269594,
"eval_loss": 0.3736245930194855,
"eval_runtime": 17.5958,
"eval_samples_per_second": 1080.032,
"eval_steps_per_second": 33.758,
"step": 25450
},
{
"epoch": 21.963824289405686,
"grad_norm": 0.49325621128082275,
"learning_rate": 0.00011214814814814815,
"loss": 0.29805870056152345,
"step": 25500
},
{
"epoch": 21.963824289405686,
"eval_loss": 0.37143099308013916,
"eval_runtime": 16.6653,
"eval_samples_per_second": 1140.336,
"eval_steps_per_second": 35.643,
"step": 25500
},
{
"epoch": 22.006890611541774,
"grad_norm": 0.49408647418022156,
"learning_rate": 0.00011197588285960378,
"loss": 0.30263139724731447,
"step": 25550
},
{
"epoch": 22.006890611541774,
"eval_loss": 0.3813234567642212,
"eval_runtime": 17.3618,
"eval_samples_per_second": 1094.584,
"eval_steps_per_second": 34.213,
"step": 25550
},
{
"epoch": 22.049956933677866,
"grad_norm": 0.40663397312164307,
"learning_rate": 0.00011180361757105944,
"loss": 0.2942478942871094,
"step": 25600
},
{
"epoch": 22.049956933677866,
"eval_loss": 0.37827885150909424,
"eval_runtime": 17.5023,
"eval_samples_per_second": 1085.798,
"eval_steps_per_second": 33.938,
"step": 25600
},
{
"epoch": 22.093023255813954,
"grad_norm": 0.554619550704956,
"learning_rate": 0.00011163135228251509,
"loss": 0.29897663116455075,
"step": 25650
},
{
"epoch": 22.093023255813954,
"eval_loss": 0.37729790806770325,
"eval_runtime": 17.5268,
"eval_samples_per_second": 1084.282,
"eval_steps_per_second": 33.891,
"step": 25650
},
{
"epoch": 22.13608957795004,
"grad_norm": 0.5263612866401672,
"learning_rate": 0.00011145908699397072,
"loss": 0.30237943649291993,
"step": 25700
},
{
"epoch": 22.13608957795004,
"eval_loss": 0.3682263195514679,
"eval_runtime": 17.3968,
"eval_samples_per_second": 1092.383,
"eval_steps_per_second": 34.144,
"step": 25700
},
{
"epoch": 22.179155900086133,
"grad_norm": 0.5653451681137085,
"learning_rate": 0.00011128682170542638,
"loss": 0.3112063217163086,
"step": 25750
},
{
"epoch": 22.179155900086133,
"eval_loss": 0.3737627863883972,
"eval_runtime": 17.199,
"eval_samples_per_second": 1104.946,
"eval_steps_per_second": 34.537,
"step": 25750
},
{
"epoch": 22.22222222222222,
"grad_norm": 0.45960068702697754,
"learning_rate": 0.000111114556416882,
"loss": 0.30598041534423825,
"step": 25800
},
{
"epoch": 22.22222222222222,
"eval_loss": 0.3709113895893097,
"eval_runtime": 17.2079,
"eval_samples_per_second": 1104.376,
"eval_steps_per_second": 34.519,
"step": 25800
},
{
"epoch": 22.265288544358313,
"grad_norm": 0.4353873133659363,
"learning_rate": 0.00011094229112833765,
"loss": 0.3021149444580078,
"step": 25850
},
{
"epoch": 22.265288544358313,
"eval_loss": 0.36920997500419617,
"eval_runtime": 17.5125,
"eval_samples_per_second": 1085.17,
"eval_steps_per_second": 33.919,
"step": 25850
},
{
"epoch": 22.3083548664944,
"grad_norm": 0.4611629247665405,
"learning_rate": 0.00011077002583979328,
"loss": 0.30381624221801756,
"step": 25900
},
{
"epoch": 22.3083548664944,
"eval_loss": 0.3706386387348175,
"eval_runtime": 17.3449,
"eval_samples_per_second": 1095.655,
"eval_steps_per_second": 34.246,
"step": 25900
},
{
"epoch": 22.35142118863049,
"grad_norm": 0.5272542834281921,
"learning_rate": 0.00011059776055124893,
"loss": 0.300566520690918,
"step": 25950
},
{
"epoch": 22.35142118863049,
"eval_loss": 0.36453819274902344,
"eval_runtime": 16.8474,
"eval_samples_per_second": 1128.011,
"eval_steps_per_second": 35.258,
"step": 25950
},
{
"epoch": 22.39448751076658,
"grad_norm": 0.4472649395465851,
"learning_rate": 0.00011042549526270459,
"loss": 0.2959398078918457,
"step": 26000
},
{
"epoch": 22.39448751076658,
"eval_loss": 0.37610924243927,
"eval_runtime": 17.4908,
"eval_samples_per_second": 1086.515,
"eval_steps_per_second": 33.961,
"step": 26000
},
{
"epoch": 22.43755383290267,
"grad_norm": 0.4704062342643738,
"learning_rate": 0.00011025322997416022,
"loss": 0.2961640548706055,
"step": 26050
},
{
"epoch": 22.43755383290267,
"eval_loss": 0.37994384765625,
"eval_runtime": 17.5059,
"eval_samples_per_second": 1085.579,
"eval_steps_per_second": 33.931,
"step": 26050
},
{
"epoch": 22.48062015503876,
"grad_norm": 0.7996960878372192,
"learning_rate": 0.00011008096468561586,
"loss": 0.3014845085144043,
"step": 26100
},
{
"epoch": 22.48062015503876,
"eval_loss": 0.3722558319568634,
"eval_runtime": 17.5232,
"eval_samples_per_second": 1084.505,
"eval_steps_per_second": 33.898,
"step": 26100
},
{
"epoch": 22.52368647717485,
"grad_norm": 0.6491348147392273,
"learning_rate": 0.00010990869939707149,
"loss": 0.301578369140625,
"step": 26150
},
{
"epoch": 22.52368647717485,
"eval_loss": 0.37108728289604187,
"eval_runtime": 17.7967,
"eval_samples_per_second": 1067.84,
"eval_steps_per_second": 33.377,
"step": 26150
},
{
"epoch": 22.56675279931094,
"grad_norm": 0.5225486159324646,
"learning_rate": 0.00010973643410852714,
"loss": 0.3060628318786621,
"step": 26200
},
{
"epoch": 22.56675279931094,
"eval_loss": 0.3759528696537018,
"eval_runtime": 17.1394,
"eval_samples_per_second": 1108.788,
"eval_steps_per_second": 34.657,
"step": 26200
},
{
"epoch": 22.60981912144703,
"grad_norm": 0.5029892325401306,
"learning_rate": 0.00010956416881998277,
"loss": 0.3059848785400391,
"step": 26250
},
{
"epoch": 22.60981912144703,
"eval_loss": 0.3835486173629761,
"eval_runtime": 16.7296,
"eval_samples_per_second": 1135.948,
"eval_steps_per_second": 35.506,
"step": 26250
},
{
"epoch": 22.652885443583116,
"grad_norm": 0.46938949823379517,
"learning_rate": 0.00010939190353143843,
"loss": 0.30714372634887693,
"step": 26300
},
{
"epoch": 22.652885443583116,
"eval_loss": 0.3625224232673645,
"eval_runtime": 17.5276,
"eval_samples_per_second": 1084.235,
"eval_steps_per_second": 33.889,
"step": 26300
},
{
"epoch": 22.695951765719208,
"grad_norm": 0.5187062621116638,
"learning_rate": 0.00010921963824289405,
"loss": 0.3000494003295898,
"step": 26350
},
{
"epoch": 22.695951765719208,
"eval_loss": 0.3673667013645172,
"eval_runtime": 16.9361,
"eval_samples_per_second": 1122.098,
"eval_steps_per_second": 35.073,
"step": 26350
},
{
"epoch": 22.739018087855296,
"grad_norm": 0.4383411109447479,
"learning_rate": 0.0001090473729543497,
"loss": 0.30050899505615236,
"step": 26400
},
{
"epoch": 22.739018087855296,
"eval_loss": 0.36597198247909546,
"eval_runtime": 16.702,
"eval_samples_per_second": 1137.826,
"eval_steps_per_second": 35.565,
"step": 26400
},
{
"epoch": 22.782084409991388,
"grad_norm": 0.5613611340522766,
"learning_rate": 0.00010887510766580535,
"loss": 0.29673213958740235,
"step": 26450
},
{
"epoch": 22.782084409991388,
"eval_loss": 0.37194007635116577,
"eval_runtime": 17.8701,
"eval_samples_per_second": 1063.454,
"eval_steps_per_second": 33.24,
"step": 26450
},
{
"epoch": 22.825150732127476,
"grad_norm": 0.47791770100593567,
"learning_rate": 0.00010870284237726098,
"loss": 0.30543540954589843,
"step": 26500
},
{
"epoch": 22.825150732127476,
"eval_loss": 0.3654497265815735,
"eval_runtime": 18.0311,
"eval_samples_per_second": 1053.959,
"eval_steps_per_second": 32.943,
"step": 26500
},
{
"epoch": 22.868217054263567,
"grad_norm": 0.42347297072410583,
"learning_rate": 0.00010853057708871664,
"loss": 0.299227294921875,
"step": 26550
},
{
"epoch": 22.868217054263567,
"eval_loss": 0.36894989013671875,
"eval_runtime": 17.2629,
"eval_samples_per_second": 1100.861,
"eval_steps_per_second": 34.409,
"step": 26550
},
{
"epoch": 22.911283376399656,
"grad_norm": 0.5425981283187866,
"learning_rate": 0.00010835831180017226,
"loss": 0.30300045013427734,
"step": 26600
},
{
"epoch": 22.911283376399656,
"eval_loss": 0.3654426336288452,
"eval_runtime": 17.5264,
"eval_samples_per_second": 1084.31,
"eval_steps_per_second": 33.892,
"step": 26600
},
{
"epoch": 22.954349698535744,
"grad_norm": 0.5579530596733093,
"learning_rate": 0.00010818604651162792,
"loss": 0.3015494918823242,
"step": 26650
},
{
"epoch": 22.954349698535744,
"eval_loss": 0.37281379103660583,
"eval_runtime": 17.2889,
"eval_samples_per_second": 1099.204,
"eval_steps_per_second": 34.357,
"step": 26650
},
{
"epoch": 22.997416020671835,
"grad_norm": 0.648759126663208,
"learning_rate": 0.00010801378122308355,
"loss": 0.30109729766845705,
"step": 26700
},
{
"epoch": 22.997416020671835,
"eval_loss": 0.370487779378891,
"eval_runtime": 16.1964,
"eval_samples_per_second": 1173.347,
"eval_steps_per_second": 36.675,
"step": 26700
},
{
"epoch": 23.040482342807923,
"grad_norm": 0.5155696272850037,
"learning_rate": 0.00010784151593453919,
"loss": 0.29900886535644533,
"step": 26750
},
{
"epoch": 23.040482342807923,
"eval_loss": 0.3741052448749542,
"eval_runtime": 17.5083,
"eval_samples_per_second": 1085.43,
"eval_steps_per_second": 33.927,
"step": 26750
},
{
"epoch": 23.083548664944015,
"grad_norm": 0.47485584020614624,
"learning_rate": 0.00010766925064599485,
"loss": 0.29572803497314454,
"step": 26800
},
{
"epoch": 23.083548664944015,
"eval_loss": 0.3721824288368225,
"eval_runtime": 17.0469,
"eval_samples_per_second": 1114.809,
"eval_steps_per_second": 34.845,
"step": 26800
},
{
"epoch": 23.126614987080103,
"grad_norm": 0.4486505091190338,
"learning_rate": 0.00010749698535745048,
"loss": 0.30237659454345706,
"step": 26850
},
{
"epoch": 23.126614987080103,
"eval_loss": 0.3684123754501343,
"eval_runtime": 17.3869,
"eval_samples_per_second": 1093.004,
"eval_steps_per_second": 34.164,
"step": 26850
},
{
"epoch": 23.169681309216195,
"grad_norm": 0.4725002348423004,
"learning_rate": 0.00010732472006890613,
"loss": 0.2979306221008301,
"step": 26900
},
{
"epoch": 23.169681309216195,
"eval_loss": 0.37552914023399353,
"eval_runtime": 17.4933,
"eval_samples_per_second": 1086.358,
"eval_steps_per_second": 33.956,
"step": 26900
},
{
"epoch": 23.212747631352283,
"grad_norm": 0.5699407458305359,
"learning_rate": 0.00010715245478036176,
"loss": 0.3049004364013672,
"step": 26950
},
{
"epoch": 23.212747631352283,
"eval_loss": 0.37610357999801636,
"eval_runtime": 17.3892,
"eval_samples_per_second": 1092.861,
"eval_steps_per_second": 34.159,
"step": 26950
},
{
"epoch": 23.25581395348837,
"grad_norm": 0.5288128852844238,
"learning_rate": 0.0001069801894918174,
"loss": 0.29878740310668944,
"step": 27000
},
{
"epoch": 23.25581395348837,
"eval_loss": 0.3685736060142517,
"eval_runtime": 17.5063,
"eval_samples_per_second": 1085.555,
"eval_steps_per_second": 33.931,
"step": 27000
},
{
"epoch": 23.298880275624462,
"grad_norm": 0.5954151749610901,
"learning_rate": 0.00010680792420327303,
"loss": 0.30775325775146484,
"step": 27050
},
{
"epoch": 23.298880275624462,
"eval_loss": 0.3689676523208618,
"eval_runtime": 17.4826,
"eval_samples_per_second": 1087.022,
"eval_steps_per_second": 33.977,
"step": 27050
},
{
"epoch": 23.34194659776055,
"grad_norm": 0.49391523003578186,
"learning_rate": 0.00010663565891472869,
"loss": 0.30306617736816405,
"step": 27100
},
{
"epoch": 23.34194659776055,
"eval_loss": 0.36533570289611816,
"eval_runtime": 17.0356,
"eval_samples_per_second": 1115.544,
"eval_steps_per_second": 34.868,
"step": 27100
},
{
"epoch": 23.385012919896642,
"grad_norm": 0.5512219071388245,
"learning_rate": 0.00010646339362618434,
"loss": 0.30098161697387693,
"step": 27150
},
{
"epoch": 23.385012919896642,
"eval_loss": 0.3735466003417969,
"eval_runtime": 16.0404,
"eval_samples_per_second": 1184.757,
"eval_steps_per_second": 37.031,
"step": 27150
},
{
"epoch": 23.42807924203273,
"grad_norm": 0.4772174656391144,
"learning_rate": 0.00010629112833763997,
"loss": 0.2883674812316894,
"step": 27200
},
{
"epoch": 23.42807924203273,
"eval_loss": 0.37100750207901,
"eval_runtime": 17.365,
"eval_samples_per_second": 1094.384,
"eval_steps_per_second": 34.207,
"step": 27200
},
{
"epoch": 23.47114556416882,
"grad_norm": 0.515988290309906,
"learning_rate": 0.00010611886304909561,
"loss": 0.2984621810913086,
"step": 27250
},
{
"epoch": 23.47114556416882,
"eval_loss": 0.3775508999824524,
"eval_runtime": 15.6519,
"eval_samples_per_second": 1214.163,
"eval_steps_per_second": 37.951,
"step": 27250
},
{
"epoch": 23.51421188630491,
"grad_norm": 0.5282620191574097,
"learning_rate": 0.00010594659776055124,
"loss": 0.29439170837402345,
"step": 27300
},
{
"epoch": 23.51421188630491,
"eval_loss": 0.3750689923763275,
"eval_runtime": 17.1233,
"eval_samples_per_second": 1109.83,
"eval_steps_per_second": 34.689,
"step": 27300
},
{
"epoch": 23.557278208440998,
"grad_norm": 0.4884462356567383,
"learning_rate": 0.0001057743324720069,
"loss": 0.29813528060913086,
"step": 27350
},
{
"epoch": 23.557278208440998,
"eval_loss": 0.36741402745246887,
"eval_runtime": 17.1586,
"eval_samples_per_second": 1107.548,
"eval_steps_per_second": 34.618,
"step": 27350
},
{
"epoch": 23.60034453057709,
"grad_norm": 0.6048156023025513,
"learning_rate": 0.00010560206718346252,
"loss": 0.30056821823120117,
"step": 27400
},
{
"epoch": 23.60034453057709,
"eval_loss": 0.3732120990753174,
"eval_runtime": 17.5348,
"eval_samples_per_second": 1083.787,
"eval_steps_per_second": 33.875,
"step": 27400
},
{
"epoch": 23.643410852713178,
"grad_norm": 0.41740378737449646,
"learning_rate": 0.00010542980189491818,
"loss": 0.29642223358154296,
"step": 27450
},
{
"epoch": 23.643410852713178,
"eval_loss": 0.37019461393356323,
"eval_runtime": 17.5282,
"eval_samples_per_second": 1084.198,
"eval_steps_per_second": 33.888,
"step": 27450
},
{
"epoch": 23.68647717484927,
"grad_norm": 0.4838183522224426,
"learning_rate": 0.00010525753660637384,
"loss": 0.3003374481201172,
"step": 27500
},
{
"epoch": 23.68647717484927,
"eval_loss": 0.3708420991897583,
"eval_runtime": 17.6626,
"eval_samples_per_second": 1075.946,
"eval_steps_per_second": 33.63,
"step": 27500
},
{
"epoch": 23.729543496985357,
"grad_norm": 0.6160274147987366,
"learning_rate": 0.00010508527131782946,
"loss": 0.29868263244628906,
"step": 27550
},
{
"epoch": 23.729543496985357,
"eval_loss": 0.3727380037307739,
"eval_runtime": 17.3239,
"eval_samples_per_second": 1096.981,
"eval_steps_per_second": 34.288,
"step": 27550
},
{
"epoch": 23.772609819121445,
"grad_norm": 0.5261010527610779,
"learning_rate": 0.0001049130060292851,
"loss": 0.29750579833984375,
"step": 27600
},
{
"epoch": 23.772609819121445,
"eval_loss": 0.36880743503570557,
"eval_runtime": 16.7074,
"eval_samples_per_second": 1137.462,
"eval_steps_per_second": 35.553,
"step": 27600
},
{
"epoch": 23.815676141257537,
"grad_norm": 0.4147060215473175,
"learning_rate": 0.00010474074074074074,
"loss": 0.2987259864807129,
"step": 27650
},
{
"epoch": 23.815676141257537,
"eval_loss": 0.37036553025245667,
"eval_runtime": 17.5461,
"eval_samples_per_second": 1083.092,
"eval_steps_per_second": 33.854,
"step": 27650
},
{
"epoch": 23.858742463393625,
"grad_norm": 0.6327623128890991,
"learning_rate": 0.00010456847545219639,
"loss": 0.2982285308837891,
"step": 27700
},
{
"epoch": 23.858742463393625,
"eval_loss": 0.3701915442943573,
"eval_runtime": 15.9311,
"eval_samples_per_second": 1192.885,
"eval_steps_per_second": 37.285,
"step": 27700
},
{
"epoch": 23.901808785529717,
"grad_norm": 0.48875871300697327,
"learning_rate": 0.00010439621016365202,
"loss": 0.30142845153808595,
"step": 27750
},
{
"epoch": 23.901808785529717,
"eval_loss": 0.3765123188495636,
"eval_runtime": 17.3892,
"eval_samples_per_second": 1092.862,
"eval_steps_per_second": 34.159,
"step": 27750
},
{
"epoch": 23.944875107665805,
"grad_norm": 0.4395269453525543,
"learning_rate": 0.00010422394487510768,
"loss": 0.30503551483154295,
"step": 27800
},
{
"epoch": 23.944875107665805,
"eval_loss": 0.3670797348022461,
"eval_runtime": 17.7719,
"eval_samples_per_second": 1069.328,
"eval_steps_per_second": 33.424,
"step": 27800
},
{
"epoch": 23.987941429801896,
"grad_norm": 0.561876118183136,
"learning_rate": 0.00010405167958656332,
"loss": 0.30540702819824217,
"step": 27850
},
{
"epoch": 23.987941429801896,
"eval_loss": 0.3722612261772156,
"eval_runtime": 17.3986,
"eval_samples_per_second": 1092.27,
"eval_steps_per_second": 34.141,
"step": 27850
},
{
"epoch": 24.031007751937985,
"grad_norm": 0.5003405213356018,
"learning_rate": 0.00010387941429801895,
"loss": 0.2985442352294922,
"step": 27900
},
{
"epoch": 24.031007751937985,
"eval_loss": 0.3710918426513672,
"eval_runtime": 17.3771,
"eval_samples_per_second": 1093.624,
"eval_steps_per_second": 34.183,
"step": 27900
},
{
"epoch": 24.074074074074073,
"grad_norm": 0.4739573299884796,
"learning_rate": 0.0001037071490094746,
"loss": 0.29766389846801755,
"step": 27950
},
{
"epoch": 24.074074074074073,
"eval_loss": 0.3728407025337219,
"eval_runtime": 17.5212,
"eval_samples_per_second": 1084.626,
"eval_steps_per_second": 33.902,
"step": 27950
},
{
"epoch": 24.117140396210164,
"grad_norm": 0.6355504989624023,
"learning_rate": 0.00010353488372093023,
"loss": 0.30111677169799805,
"step": 28000
},
{
"epoch": 24.117140396210164,
"eval_loss": 0.37044641375541687,
"eval_runtime": 17.4046,
"eval_samples_per_second": 1091.897,
"eval_steps_per_second": 34.129,
"step": 28000
},
{
"epoch": 24.160206718346252,
"grad_norm": 0.505262017250061,
"learning_rate": 0.00010336261843238589,
"loss": 0.29236396789550784,
"step": 28050
},
{
"epoch": 24.160206718346252,
"eval_loss": 0.36897462606430054,
"eval_runtime": 15.9831,
"eval_samples_per_second": 1189.005,
"eval_steps_per_second": 37.164,
"step": 28050
},
{
"epoch": 24.203273040482344,
"grad_norm": 0.5167173147201538,
"learning_rate": 0.00010319035314384151,
"loss": 0.3093917465209961,
"step": 28100
},
{
"epoch": 24.203273040482344,
"eval_loss": 0.36572420597076416,
"eval_runtime": 17.5107,
"eval_samples_per_second": 1085.276,
"eval_steps_per_second": 33.922,
"step": 28100
},
{
"epoch": 24.246339362618432,
"grad_norm": 0.5450465679168701,
"learning_rate": 0.00010301808785529716,
"loss": 0.29390304565429687,
"step": 28150
},
{
"epoch": 24.246339362618432,
"eval_loss": 0.36511915922164917,
"eval_runtime": 16.6281,
"eval_samples_per_second": 1142.885,
"eval_steps_per_second": 35.723,
"step": 28150
},
{
"epoch": 24.289405684754524,
"grad_norm": 0.5009111166000366,
"learning_rate": 0.00010284582256675281,
"loss": 0.29303544998168946,
"step": 28200
},
{
"epoch": 24.289405684754524,
"eval_loss": 0.3654605746269226,
"eval_runtime": 17.6784,
"eval_samples_per_second": 1074.984,
"eval_steps_per_second": 33.6,
"step": 28200
},
{
"epoch": 24.33247200689061,
"grad_norm": 0.41044268012046814,
"learning_rate": 0.00010267355727820844,
"loss": 0.30087459564208985,
"step": 28250
},
{
"epoch": 24.33247200689061,
"eval_loss": 0.3658417761325836,
"eval_runtime": 17.1645,
"eval_samples_per_second": 1107.169,
"eval_steps_per_second": 34.606,
"step": 28250
},
{
"epoch": 24.3755383290267,
"grad_norm": 0.4747028946876526,
"learning_rate": 0.0001025012919896641,
"loss": 0.28880956649780276,
"step": 28300
},
{
"epoch": 24.3755383290267,
"eval_loss": 0.37379440665245056,
"eval_runtime": 17.5312,
"eval_samples_per_second": 1084.01,
"eval_steps_per_second": 33.882,
"step": 28300
},
{
"epoch": 24.41860465116279,
"grad_norm": 0.5528591275215149,
"learning_rate": 0.00010232902670111972,
"loss": 0.30124286651611326,
"step": 28350
},
{
"epoch": 24.41860465116279,
"eval_loss": 0.3714071214199066,
"eval_runtime": 17.5251,
"eval_samples_per_second": 1084.386,
"eval_steps_per_second": 33.894,
"step": 28350
},
{
"epoch": 24.46167097329888,
"grad_norm": 0.4998020529747009,
"learning_rate": 0.00010215676141257538,
"loss": 0.29641380310058596,
"step": 28400
},
{
"epoch": 24.46167097329888,
"eval_loss": 0.3669067323207855,
"eval_runtime": 17.6192,
"eval_samples_per_second": 1078.599,
"eval_steps_per_second": 33.713,
"step": 28400
},
{
"epoch": 24.50473729543497,
"grad_norm": 0.4717235267162323,
"learning_rate": 0.00010198449612403101,
"loss": 0.29951982498168944,
"step": 28450
},
{
"epoch": 24.50473729543497,
"eval_loss": 0.3649687170982361,
"eval_runtime": 17.4214,
"eval_samples_per_second": 1090.842,
"eval_steps_per_second": 34.096,
"step": 28450
},
{
"epoch": 24.54780361757106,
"grad_norm": 0.5200428366661072,
"learning_rate": 0.00010181223083548665,
"loss": 0.30025199890136717,
"step": 28500
},
{
"epoch": 24.54780361757106,
"eval_loss": 0.37060484290122986,
"eval_runtime": 16.6984,
"eval_samples_per_second": 1138.072,
"eval_steps_per_second": 35.572,
"step": 28500
},
{
"epoch": 24.590869939707147,
"grad_norm": 0.5694150328636169,
"learning_rate": 0.0001016399655469423,
"loss": 0.28833511352539065,
"step": 28550
},
{
"epoch": 24.590869939707147,
"eval_loss": 0.36970415711402893,
"eval_runtime": 17.2079,
"eval_samples_per_second": 1104.374,
"eval_steps_per_second": 34.519,
"step": 28550
},
{
"epoch": 24.63393626184324,
"grad_norm": 0.4342597723007202,
"learning_rate": 0.00010146770025839794,
"loss": 0.29415096282958986,
"step": 28600
},
{
"epoch": 24.63393626184324,
"eval_loss": 0.3628757894039154,
"eval_runtime": 16.1679,
"eval_samples_per_second": 1175.418,
"eval_steps_per_second": 36.74,
"step": 28600
},
{
"epoch": 24.677002583979327,
"grad_norm": 0.44152048230171204,
"learning_rate": 0.00010129543496985359,
"loss": 0.28828786849975585,
"step": 28650
},
{
"epoch": 24.677002583979327,
"eval_loss": 0.36343470215797424,
"eval_runtime": 17.8291,
"eval_samples_per_second": 1065.898,
"eval_steps_per_second": 33.316,
"step": 28650
},
{
"epoch": 24.72006890611542,
"grad_norm": 0.4448101222515106,
"learning_rate": 0.00010112316968130922,
"loss": 0.2990582084655762,
"step": 28700
},
{
"epoch": 24.72006890611542,
"eval_loss": 0.3635788857936859,
"eval_runtime": 17.3123,
"eval_samples_per_second": 1097.716,
"eval_steps_per_second": 34.311,
"step": 28700
},
{
"epoch": 24.763135228251507,
"grad_norm": 0.4918091595172882,
"learning_rate": 0.00010095090439276486,
"loss": 0.3049300765991211,
"step": 28750
},
{
"epoch": 24.763135228251507,
"eval_loss": 0.3565711975097656,
"eval_runtime": 17.3735,
"eval_samples_per_second": 1093.85,
"eval_steps_per_second": 34.19,
"step": 28750
},
{
"epoch": 24.8062015503876,
"grad_norm": 0.4032810926437378,
"learning_rate": 0.00010077863910422049,
"loss": 0.2918486022949219,
"step": 28800
},
{
"epoch": 24.8062015503876,
"eval_loss": 0.3619081974029541,
"eval_runtime": 17.7009,
"eval_samples_per_second": 1073.617,
"eval_steps_per_second": 33.558,
"step": 28800
},
{
"epoch": 24.849267872523686,
"grad_norm": 0.4441034495830536,
"learning_rate": 0.00010060637381567615,
"loss": 0.2930795669555664,
"step": 28850
},
{
"epoch": 24.849267872523686,
"eval_loss": 0.3589094281196594,
"eval_runtime": 17.3399,
"eval_samples_per_second": 1095.967,
"eval_steps_per_second": 34.256,
"step": 28850
},
{
"epoch": 24.892334194659774,
"grad_norm": 0.47586753964424133,
"learning_rate": 0.0001004341085271318,
"loss": 0.2950699806213379,
"step": 28900
},
{
"epoch": 24.892334194659774,
"eval_loss": 0.37087494134902954,
"eval_runtime": 16.9329,
"eval_samples_per_second": 1122.313,
"eval_steps_per_second": 35.08,
"step": 28900
},
{
"epoch": 24.935400516795866,
"grad_norm": 0.44512951374053955,
"learning_rate": 0.00010026184323858743,
"loss": 0.2932775688171387,
"step": 28950
},
{
"epoch": 24.935400516795866,
"eval_loss": 0.3673975169658661,
"eval_runtime": 15.7482,
"eval_samples_per_second": 1206.738,
"eval_steps_per_second": 37.718,
"step": 28950
},
{
"epoch": 24.978466838931954,
"grad_norm": 0.46907806396484375,
"learning_rate": 0.00010008957795004307,
"loss": 0.29798852920532226,
"step": 29000
},
{
"epoch": 24.978466838931954,
"eval_loss": 0.3643653094768524,
"eval_runtime": 17.5833,
"eval_samples_per_second": 1080.8,
"eval_steps_per_second": 33.782,
"step": 29000
},
{
"epoch": 25.021533161068046,
"grad_norm": 0.47138622403144836,
"learning_rate": 9.991731266149871e-05,
"loss": 0.2976431083679199,
"step": 29050
},
{
"epoch": 25.021533161068046,
"eval_loss": 0.363147497177124,
"eval_runtime": 16.2005,
"eval_samples_per_second": 1173.052,
"eval_steps_per_second": 36.666,
"step": 29050
},
{
"epoch": 25.064599483204134,
"grad_norm": 0.4845311939716339,
"learning_rate": 9.974504737295436e-05,
"loss": 0.30047725677490233,
"step": 29100
},
{
"epoch": 25.064599483204134,
"eval_loss": 0.3621458411216736,
"eval_runtime": 17.7041,
"eval_samples_per_second": 1073.426,
"eval_steps_per_second": 33.552,
"step": 29100
},
{
"epoch": 25.107665805340226,
"grad_norm": 0.3969680368900299,
"learning_rate": 9.957278208441e-05,
"loss": 0.29270526885986325,
"step": 29150
},
{
"epoch": 25.107665805340226,
"eval_loss": 0.36505424976348877,
"eval_runtime": 17.6953,
"eval_samples_per_second": 1073.958,
"eval_steps_per_second": 33.568,
"step": 29150
},
{
"epoch": 25.150732127476314,
"grad_norm": 0.4725561738014221,
"learning_rate": 9.940051679586564e-05,
"loss": 0.2929987907409668,
"step": 29200
},
{
"epoch": 25.150732127476314,
"eval_loss": 0.35975778102874756,
"eval_runtime": 18.2441,
"eval_samples_per_second": 1041.653,
"eval_steps_per_second": 32.559,
"step": 29200
},
{
"epoch": 25.1937984496124,
"grad_norm": 0.44618576765060425,
"learning_rate": 9.922825150732128e-05,
"loss": 0.29194810867309573,
"step": 29250
},
{
"epoch": 25.1937984496124,
"eval_loss": 0.3662404417991638,
"eval_runtime": 19.5998,
"eval_samples_per_second": 969.604,
"eval_steps_per_second": 30.307,
"step": 29250
},
{
"epoch": 25.236864771748493,
"grad_norm": 0.5035511255264282,
"learning_rate": 9.905598621877692e-05,
"loss": 0.2904472351074219,
"step": 29300
},
{
"epoch": 25.236864771748493,
"eval_loss": 0.37068793177604675,
"eval_runtime": 19.1675,
"eval_samples_per_second": 991.468,
"eval_steps_per_second": 30.99,
"step": 29300
},
{
"epoch": 25.27993109388458,
"grad_norm": 0.49068546295166016,
"learning_rate": 9.888372093023255e-05,
"loss": 0.2910709762573242,
"step": 29350
},
{
"epoch": 25.27993109388458,
"eval_loss": 0.3615714907646179,
"eval_runtime": 17.9591,
"eval_samples_per_second": 1058.184,
"eval_steps_per_second": 33.075,
"step": 29350
},
{
"epoch": 25.322997416020673,
"grad_norm": 0.4490571916103363,
"learning_rate": 9.87114556416882e-05,
"loss": 0.2928461456298828,
"step": 29400
},
{
"epoch": 25.322997416020673,
"eval_loss": 0.366520494222641,
"eval_runtime": 16.733,
"eval_samples_per_second": 1135.722,
"eval_steps_per_second": 35.499,
"step": 29400
},
{
"epoch": 25.36606373815676,
"grad_norm": 0.45667514204978943,
"learning_rate": 9.853919035314385e-05,
"loss": 0.2925313568115234,
"step": 29450
},
{
"epoch": 25.36606373815676,
"eval_loss": 0.36547213792800903,
"eval_runtime": 17.4716,
"eval_samples_per_second": 1087.707,
"eval_steps_per_second": 33.998,
"step": 29450
},
{
"epoch": 25.409130060292853,
"grad_norm": 0.4668999910354614,
"learning_rate": 9.836692506459949e-05,
"loss": 0.2990184593200684,
"step": 29500
},
{
"epoch": 25.409130060292853,
"eval_loss": 0.3627609610557556,
"eval_runtime": 16.4897,
"eval_samples_per_second": 1152.474,
"eval_steps_per_second": 36.022,
"step": 29500
},
{
"epoch": 25.45219638242894,
"grad_norm": 0.5063576698303223,
"learning_rate": 9.819465977605514e-05,
"loss": 0.28932918548583986,
"step": 29550
},
{
"epoch": 25.45219638242894,
"eval_loss": 0.36449888348579407,
"eval_runtime": 17.2565,
"eval_samples_per_second": 1101.265,
"eval_steps_per_second": 34.422,
"step": 29550
},
{
"epoch": 25.49526270456503,
"grad_norm": 0.5766741037368774,
"learning_rate": 9.802239448751078e-05,
"loss": 0.2939249801635742,
"step": 29600
},
{
"epoch": 25.49526270456503,
"eval_loss": 0.3627641499042511,
"eval_runtime": 17.7647,
"eval_samples_per_second": 1069.762,
"eval_steps_per_second": 33.437,
"step": 29600
},
{
"epoch": 25.53832902670112,
"grad_norm": 0.46216633915901184,
"learning_rate": 9.78501291989664e-05,
"loss": 0.2854126739501953,
"step": 29650
},
{
"epoch": 25.53832902670112,
"eval_loss": 0.3621639013290405,
"eval_runtime": 18.3339,
"eval_samples_per_second": 1036.55,
"eval_steps_per_second": 32.399,
"step": 29650
},
{
"epoch": 25.58139534883721,
"grad_norm": 0.4835483729839325,
"learning_rate": 9.767786391042205e-05,
"loss": 0.2938497352600098,
"step": 29700
},
{
"epoch": 25.58139534883721,
"eval_loss": 0.3679318130016327,
"eval_runtime": 18.0955,
"eval_samples_per_second": 1050.208,
"eval_steps_per_second": 32.826,
"step": 29700
},
{
"epoch": 25.6244616709733,
"grad_norm": 0.45520979166030884,
"learning_rate": 9.750559862187769e-05,
"loss": 0.2909726333618164,
"step": 29750
},
{
"epoch": 25.6244616709733,
"eval_loss": 0.3689843416213989,
"eval_runtime": 17.8255,
"eval_samples_per_second": 1066.111,
"eval_steps_per_second": 33.323,
"step": 29750
},
{
"epoch": 25.66752799310939,
"grad_norm": 0.4701136648654938,
"learning_rate": 9.733333333333335e-05,
"loss": 0.2963640785217285,
"step": 29800
},
{
"epoch": 25.66752799310939,
"eval_loss": 0.37323909997940063,
"eval_runtime": 17.5489,
"eval_samples_per_second": 1082.914,
"eval_steps_per_second": 33.848,
"step": 29800
},
{
"epoch": 25.710594315245476,
"grad_norm": 0.5356958508491516,
"learning_rate": 9.716106804478899e-05,
"loss": 0.2972037124633789,
"step": 29850
},
{
"epoch": 25.710594315245476,
"eval_loss": 0.3621019124984741,
"eval_runtime": 16.7547,
"eval_samples_per_second": 1134.25,
"eval_steps_per_second": 35.453,
"step": 29850
},
{
"epoch": 25.753660637381568,
"grad_norm": 0.4646724760532379,
"learning_rate": 9.698880275624463e-05,
"loss": 0.29597414016723633,
"step": 29900
},
{
"epoch": 25.753660637381568,
"eval_loss": 0.37024474143981934,
"eval_runtime": 17.4607,
"eval_samples_per_second": 1088.386,
"eval_steps_per_second": 34.019,
"step": 29900
},
{
"epoch": 25.796726959517656,
"grad_norm": 0.5215739607810974,
"learning_rate": 9.681653746770026e-05,
"loss": 0.2891251754760742,
"step": 29950
},
{
"epoch": 25.796726959517656,
"eval_loss": 0.364225834608078,
"eval_runtime": 16.6391,
"eval_samples_per_second": 1142.126,
"eval_steps_per_second": 35.699,
"step": 29950
},
{
"epoch": 25.839793281653748,
"grad_norm": 0.5005343556404114,
"learning_rate": 9.66442721791559e-05,
"loss": 0.2926528167724609,
"step": 30000
},
{
"epoch": 25.839793281653748,
"eval_loss": 0.36475899815559387,
"eval_runtime": 17.5158,
"eval_samples_per_second": 1084.966,
"eval_steps_per_second": 33.912,
"step": 30000
},
{
"epoch": 25.882859603789836,
"grad_norm": 0.47080203890800476,
"learning_rate": 9.647200689061154e-05,
"loss": 0.2989816093444824,
"step": 30050
},
{
"epoch": 25.882859603789836,
"eval_loss": 0.360416978597641,
"eval_runtime": 17.3856,
"eval_samples_per_second": 1093.087,
"eval_steps_per_second": 34.166,
"step": 30050
},
{
"epoch": 25.925925925925927,
"grad_norm": 0.4474875330924988,
"learning_rate": 9.629974160206718e-05,
"loss": 0.292766227722168,
"step": 30100
},
{
"epoch": 25.925925925925927,
"eval_loss": 0.36163151264190674,
"eval_runtime": 17.7711,
"eval_samples_per_second": 1069.378,
"eval_steps_per_second": 33.425,
"step": 30100
},
{
"epoch": 25.968992248062015,
"grad_norm": 0.5251291990280151,
"learning_rate": 9.612747631352284e-05,
"loss": 0.2900256729125977,
"step": 30150
},
{
"epoch": 25.968992248062015,
"eval_loss": 0.3662976026535034,
"eval_runtime": 17.5637,
"eval_samples_per_second": 1082.002,
"eval_steps_per_second": 33.82,
"step": 30150
},
{
"epoch": 26.012058570198104,
"grad_norm": 0.47462013363838196,
"learning_rate": 9.595521102497847e-05,
"loss": 0.2990392303466797,
"step": 30200
},
{
"epoch": 26.012058570198104,
"eval_loss": 0.36906498670578003,
"eval_runtime": 17.2244,
"eval_samples_per_second": 1103.322,
"eval_steps_per_second": 34.486,
"step": 30200
},
{
"epoch": 26.055124892334195,
"grad_norm": 0.5367721319198608,
"learning_rate": 9.578294573643411e-05,
"loss": 0.2976276969909668,
"step": 30250
},
{
"epoch": 26.055124892334195,
"eval_loss": 0.3585287034511566,
"eval_runtime": 17.0711,
"eval_samples_per_second": 1113.229,
"eval_steps_per_second": 34.796,
"step": 30250
},
{
"epoch": 26.098191214470283,
"grad_norm": 0.42540860176086426,
"learning_rate": 9.561068044788975e-05,
"loss": 0.2866293716430664,
"step": 30300
},
{
"epoch": 26.098191214470283,
"eval_loss": 0.3634384870529175,
"eval_runtime": 17.4952,
"eval_samples_per_second": 1086.241,
"eval_steps_per_second": 33.952,
"step": 30300
},
{
"epoch": 26.141257536606375,
"grad_norm": 0.38212257623672485,
"learning_rate": 9.54384151593454e-05,
"loss": 0.29723587036132815,
"step": 30350
},
{
"epoch": 26.141257536606375,
"eval_loss": 0.3666623830795288,
"eval_runtime": 17.3805,
"eval_samples_per_second": 1093.407,
"eval_steps_per_second": 34.176,
"step": 30350
},
{
"epoch": 26.184323858742463,
"grad_norm": 0.4894464910030365,
"learning_rate": 9.526614987080104e-05,
"loss": 0.2930299377441406,
"step": 30400
},
{
"epoch": 26.184323858742463,
"eval_loss": 0.3624574840068817,
"eval_runtime": 16.5236,
"eval_samples_per_second": 1150.112,
"eval_steps_per_second": 35.949,
"step": 30400
},
{
"epoch": 26.227390180878555,
"grad_norm": 0.441383421421051,
"learning_rate": 9.509388458225668e-05,
"loss": 0.28941658020019534,
"step": 30450
},
{
"epoch": 26.227390180878555,
"eval_loss": 0.3617941439151764,
"eval_runtime": 17.8969,
"eval_samples_per_second": 1061.86,
"eval_steps_per_second": 33.19,
"step": 30450
},
{
"epoch": 26.270456503014643,
"grad_norm": 0.5019906163215637,
"learning_rate": 9.492161929371232e-05,
"loss": 0.29477603912353517,
"step": 30500
},
{
"epoch": 26.270456503014643,
"eval_loss": 0.3631625473499298,
"eval_runtime": 18.874,
"eval_samples_per_second": 1006.886,
"eval_steps_per_second": 31.472,
"step": 30500
},
{
"epoch": 26.31352282515073,
"grad_norm": 0.4819585084915161,
"learning_rate": 9.474935400516796e-05,
"loss": 0.29420578002929687,
"step": 30550
},
{
"epoch": 26.31352282515073,
"eval_loss": 0.37077534198760986,
"eval_runtime": 17.7539,
"eval_samples_per_second": 1070.413,
"eval_steps_per_second": 33.457,
"step": 30550
},
{
"epoch": 26.356589147286822,
"grad_norm": 0.5694970488548279,
"learning_rate": 9.45770887166236e-05,
"loss": 0.2949700355529785,
"step": 30600
},
{
"epoch": 26.356589147286822,
"eval_loss": 0.36437633633613586,
"eval_runtime": 17.703,
"eval_samples_per_second": 1073.488,
"eval_steps_per_second": 33.554,
"step": 30600
},
{
"epoch": 26.39965546942291,
"grad_norm": 0.5988081693649292,
"learning_rate": 9.440482342807925e-05,
"loss": 0.2924643135070801,
"step": 30650
},
{
"epoch": 26.39965546942291,
"eval_loss": 0.36314231157302856,
"eval_runtime": 17.3896,
"eval_samples_per_second": 1092.836,
"eval_steps_per_second": 34.158,
"step": 30650
},
{
"epoch": 26.442721791559002,
"grad_norm": 0.469237744808197,
"learning_rate": 9.423255813953489e-05,
"loss": 0.29159337997436524,
"step": 30700
},
{
"epoch": 26.442721791559002,
"eval_loss": 0.3586832284927368,
"eval_runtime": 17.0922,
"eval_samples_per_second": 1111.855,
"eval_steps_per_second": 34.753,
"step": 30700
},
{
"epoch": 26.48578811369509,
"grad_norm": 0.42142677307128906,
"learning_rate": 9.406029285099053e-05,
"loss": 0.2882324409484863,
"step": 30750
},
{
"epoch": 26.48578811369509,
"eval_loss": 0.36477532982826233,
"eval_runtime": 16.5523,
"eval_samples_per_second": 1148.122,
"eval_steps_per_second": 35.886,
"step": 30750
},
{
"epoch": 26.528854435831178,
"grad_norm": 0.5955842733383179,
"learning_rate": 9.388802756244617e-05,
"loss": 0.2991225433349609,
"step": 30800
},
{
"epoch": 26.528854435831178,
"eval_loss": 0.36673441529273987,
"eval_runtime": 17.4498,
"eval_samples_per_second": 1089.068,
"eval_steps_per_second": 34.041,
"step": 30800
},
{
"epoch": 26.57192075796727,
"grad_norm": 0.5307620763778687,
"learning_rate": 9.37157622739018e-05,
"loss": 0.2927092170715332,
"step": 30850
},
{
"epoch": 26.57192075796727,
"eval_loss": 0.3575948476791382,
"eval_runtime": 17.1996,
"eval_samples_per_second": 1104.91,
"eval_steps_per_second": 34.536,
"step": 30850
},
{
"epoch": 26.614987080103358,
"grad_norm": 0.4583006799221039,
"learning_rate": 9.354349698535746e-05,
"loss": 0.2902518844604492,
"step": 30900
},
{
"epoch": 26.614987080103358,
"eval_loss": 0.3646640181541443,
"eval_runtime": 17.8661,
"eval_samples_per_second": 1063.688,
"eval_steps_per_second": 33.247,
"step": 30900
},
{
"epoch": 26.65805340223945,
"grad_norm": 0.49315473437309265,
"learning_rate": 9.33712316968131e-05,
"loss": 0.2924757385253906,
"step": 30950
},
{
"epoch": 26.65805340223945,
"eval_loss": 0.3622484803199768,
"eval_runtime": 17.2611,
"eval_samples_per_second": 1100.975,
"eval_steps_per_second": 34.413,
"step": 30950
},
{
"epoch": 26.701119724375538,
"grad_norm": 0.4450501501560211,
"learning_rate": 9.319896640826874e-05,
"loss": 0.28469310760498046,
"step": 31000
},
{
"epoch": 26.701119724375538,
"eval_loss": 0.36087697744369507,
"eval_runtime": 17.6979,
"eval_samples_per_second": 1073.802,
"eval_steps_per_second": 33.563,
"step": 31000
},
{
"epoch": 26.74418604651163,
"grad_norm": 0.5609347224235535,
"learning_rate": 9.302670111972438e-05,
"loss": 0.29320995330810545,
"step": 31050
},
{
"epoch": 26.74418604651163,
"eval_loss": 0.3648212254047394,
"eval_runtime": 17.5686,
"eval_samples_per_second": 1081.7,
"eval_steps_per_second": 33.81,
"step": 31050
},
{
"epoch": 26.787252368647717,
"grad_norm": 0.4875686466693878,
"learning_rate": 9.285443583118003e-05,
"loss": 0.2869133186340332,
"step": 31100
},
{
"epoch": 26.787252368647717,
"eval_loss": 0.3665845990180969,
"eval_runtime": 17.5663,
"eval_samples_per_second": 1081.847,
"eval_steps_per_second": 33.815,
"step": 31100
},
{
"epoch": 26.830318690783805,
"grad_norm": 0.5157934427261353,
"learning_rate": 9.268217054263566e-05,
"loss": 0.290893497467041,
"step": 31150
},
{
"epoch": 26.830318690783805,
"eval_loss": 0.36622917652130127,
"eval_runtime": 17.5382,
"eval_samples_per_second": 1083.575,
"eval_steps_per_second": 33.869,
"step": 31150
},
{
"epoch": 26.873385012919897,
"grad_norm": 0.5066894292831421,
"learning_rate": 9.25099052540913e-05,
"loss": 0.29268325805664064,
"step": 31200
},
{
"epoch": 26.873385012919897,
"eval_loss": 0.36323854327201843,
"eval_runtime": 17.5281,
"eval_samples_per_second": 1084.203,
"eval_steps_per_second": 33.888,
"step": 31200
},
{
"epoch": 26.916451335055985,
"grad_norm": 0.40066656470298767,
"learning_rate": 9.233763996554695e-05,
"loss": 0.29453598022460936,
"step": 31250
},
{
"epoch": 26.916451335055985,
"eval_loss": 0.3665629029273987,
"eval_runtime": 17.8401,
"eval_samples_per_second": 1065.238,
"eval_steps_per_second": 33.296,
"step": 31250
},
{
"epoch": 26.959517657192077,
"grad_norm": 0.3987741470336914,
"learning_rate": 9.21653746770026e-05,
"loss": 0.2970641326904297,
"step": 31300
},
{
"epoch": 26.959517657192077,
"eval_loss": 0.3582976758480072,
"eval_runtime": 17.6993,
"eval_samples_per_second": 1073.712,
"eval_steps_per_second": 33.561,
"step": 31300
},
{
"epoch": 27.002583979328165,
"grad_norm": 0.4397093653678894,
"learning_rate": 9.199310938845824e-05,
"loss": 0.2915744400024414,
"step": 31350
},
{
"epoch": 27.002583979328165,
"eval_loss": 0.3566512167453766,
"eval_runtime": 17.5664,
"eval_samples_per_second": 1081.837,
"eval_steps_per_second": 33.815,
"step": 31350
},
{
"epoch": 27.045650301464256,
"grad_norm": 0.48084136843681335,
"learning_rate": 9.182084409991387e-05,
"loss": 0.2919019317626953,
"step": 31400
},
{
"epoch": 27.045650301464256,
"eval_loss": 0.3649647533893585,
"eval_runtime": 17.2612,
"eval_samples_per_second": 1100.965,
"eval_steps_per_second": 34.412,
"step": 31400
},
{
"epoch": 27.088716623600344,
"grad_norm": 0.4913296699523926,
"learning_rate": 9.164857881136951e-05,
"loss": 0.28895715713500975,
"step": 31450
},
{
"epoch": 27.088716623600344,
"eval_loss": 0.35735324025154114,
"eval_runtime": 17.3916,
"eval_samples_per_second": 1092.713,
"eval_steps_per_second": 34.154,
"step": 31450
},
{
"epoch": 27.131782945736433,
"grad_norm": 0.4659770429134369,
"learning_rate": 9.147631352282515e-05,
"loss": 0.2834562110900879,
"step": 31500
},
{
"epoch": 27.131782945736433,
"eval_loss": 0.356306254863739,
"eval_runtime": 17.5932,
"eval_samples_per_second": 1080.187,
"eval_steps_per_second": 33.763,
"step": 31500
},
{
"epoch": 27.174849267872524,
"grad_norm": 0.4521220326423645,
"learning_rate": 9.130404823428079e-05,
"loss": 0.28911502838134767,
"step": 31550
},
{
"epoch": 27.174849267872524,
"eval_loss": 0.3661551773548126,
"eval_runtime": 16.6181,
"eval_samples_per_second": 1143.575,
"eval_steps_per_second": 35.744,
"step": 31550
},
{
"epoch": 27.217915590008612,
"grad_norm": 0.45324376225471497,
"learning_rate": 9.113178294573645e-05,
"loss": 0.2861846733093262,
"step": 31600
},
{
"epoch": 27.217915590008612,
"eval_loss": 0.36247044801712036,
"eval_runtime": 17.7398,
"eval_samples_per_second": 1071.261,
"eval_steps_per_second": 33.484,
"step": 31600
},
{
"epoch": 27.260981912144704,
"grad_norm": 0.4110028147697449,
"learning_rate": 9.095951765719209e-05,
"loss": 0.29361557006835937,
"step": 31650
},
{
"epoch": 27.260981912144704,
"eval_loss": 0.3627540171146393,
"eval_runtime": 17.2415,
"eval_samples_per_second": 1102.226,
"eval_steps_per_second": 34.452,
"step": 31650
},
{
"epoch": 27.304048234280792,
"grad_norm": 0.5808663964271545,
"learning_rate": 9.078725236864772e-05,
"loss": 0.2974928092956543,
"step": 31700
},
{
"epoch": 27.304048234280792,
"eval_loss": 0.3498687148094177,
"eval_runtime": 17.589,
"eval_samples_per_second": 1080.45,
"eval_steps_per_second": 33.771,
"step": 31700
},
{
"epoch": 27.347114556416884,
"grad_norm": 0.4611378014087677,
"learning_rate": 9.061498708010336e-05,
"loss": 0.28919679641723633,
"step": 31750
},
{
"epoch": 27.347114556416884,
"eval_loss": 0.3646209239959717,
"eval_runtime": 17.4877,
"eval_samples_per_second": 1086.705,
"eval_steps_per_second": 33.967,
"step": 31750
},
{
"epoch": 27.39018087855297,
"grad_norm": 0.5321835875511169,
"learning_rate": 9.0442721791559e-05,
"loss": 0.2927637481689453,
"step": 31800
},
{
"epoch": 27.39018087855297,
"eval_loss": 0.36543115973472595,
"eval_runtime": 17.3861,
"eval_samples_per_second": 1093.058,
"eval_steps_per_second": 34.165,
"step": 31800
},
{
"epoch": 27.43324720068906,
"grad_norm": 0.410918653011322,
"learning_rate": 9.027045650301464e-05,
"loss": 0.28457128524780273,
"step": 31850
},
{
"epoch": 27.43324720068906,
"eval_loss": 0.35619232058525085,
"eval_runtime": 17.6293,
"eval_samples_per_second": 1077.976,
"eval_steps_per_second": 33.694,
"step": 31850
},
{
"epoch": 27.47631352282515,
"grad_norm": 0.45185136795043945,
"learning_rate": 9.009819121447029e-05,
"loss": 0.29344728469848635,
"step": 31900
},
{
"epoch": 27.47631352282515,
"eval_loss": 0.35773923993110657,
"eval_runtime": 16.5833,
"eval_samples_per_second": 1145.971,
"eval_steps_per_second": 35.819,
"step": 31900
},
{
"epoch": 27.51937984496124,
"grad_norm": 0.46231284737586975,
"learning_rate": 8.992592592592594e-05,
"loss": 0.28259624481201173,
"step": 31950
},
{
"epoch": 27.51937984496124,
"eval_loss": 0.36896592378616333,
"eval_runtime": 17.7625,
"eval_samples_per_second": 1069.895,
"eval_steps_per_second": 33.441,
"step": 31950
},
{
"epoch": 27.56244616709733,
"grad_norm": 0.6303794384002686,
"learning_rate": 8.975366063738157e-05,
"loss": 0.28135942459106444,
"step": 32000
},
{
"epoch": 27.56244616709733,
"eval_loss": 0.36224251985549927,
"eval_runtime": 17.1832,
"eval_samples_per_second": 1105.963,
"eval_steps_per_second": 34.569,
"step": 32000
},
{
"epoch": 27.60551248923342,
"grad_norm": 0.5213823318481445,
"learning_rate": 8.958139534883721e-05,
"loss": 0.2868742370605469,
"step": 32050
},
{
"epoch": 27.60551248923342,
"eval_loss": 0.35869160294532776,
"eval_runtime": 17.6586,
"eval_samples_per_second": 1076.189,
"eval_steps_per_second": 33.638,
"step": 32050
},
{
"epoch": 27.64857881136951,
"grad_norm": 0.4423479735851288,
"learning_rate": 8.940913006029286e-05,
"loss": 0.28639448165893555,
"step": 32100
},
{
"epoch": 27.64857881136951,
"eval_loss": 0.3628954291343689,
"eval_runtime": 17.719,
"eval_samples_per_second": 1072.523,
"eval_steps_per_second": 33.523,
"step": 32100
},
{
"epoch": 27.6916451335056,
"grad_norm": 0.46652039885520935,
"learning_rate": 8.92368647717485e-05,
"loss": 0.2934608459472656,
"step": 32150
},
{
"epoch": 27.6916451335056,
"eval_loss": 0.36239707469940186,
"eval_runtime": 17.3683,
"eval_samples_per_second": 1094.179,
"eval_steps_per_second": 34.2,
"step": 32150
},
{
"epoch": 27.734711455641687,
"grad_norm": 0.4929044246673584,
"learning_rate": 8.906459948320414e-05,
"loss": 0.28652910232543943,
"step": 32200
},
{
"epoch": 27.734711455641687,
"eval_loss": 0.36099740862846375,
"eval_runtime": 17.6482,
"eval_samples_per_second": 1076.823,
"eval_steps_per_second": 33.658,
"step": 32200
},
{
"epoch": 27.77777777777778,
"grad_norm": 0.49651429057121277,
"learning_rate": 8.889233419465978e-05,
"loss": 0.2868942070007324,
"step": 32250
},
{
"epoch": 27.77777777777778,
"eval_loss": 0.3619493544101715,
"eval_runtime": 16.6205,
"eval_samples_per_second": 1143.405,
"eval_steps_per_second": 35.739,
"step": 32250
},
{
"epoch": 27.820844099913867,
"grad_norm": 0.4310859739780426,
"learning_rate": 8.872006890611541e-05,
"loss": 0.2868019104003906,
"step": 32300
},
{
"epoch": 27.820844099913867,
"eval_loss": 0.35609355568885803,
"eval_runtime": 17.7785,
"eval_samples_per_second": 1068.934,
"eval_steps_per_second": 33.411,
"step": 32300
},
{
"epoch": 27.86391042204996,
"grad_norm": 1.0042189359664917,
"learning_rate": 8.854780361757107e-05,
"loss": 0.2852842903137207,
"step": 32350
},
{
"epoch": 27.86391042204996,
"eval_loss": 0.35966190695762634,
"eval_runtime": 17.3214,
"eval_samples_per_second": 1097.14,
"eval_steps_per_second": 34.293,
"step": 32350
},
{
"epoch": 27.906976744186046,
"grad_norm": 0.6012923121452332,
"learning_rate": 8.837553832902671e-05,
"loss": 0.28470855712890625,
"step": 32400
},
{
"epoch": 27.906976744186046,
"eval_loss": 0.3527715802192688,
"eval_runtime": 17.7082,
"eval_samples_per_second": 1073.176,
"eval_steps_per_second": 33.544,
"step": 32400
},
{
"epoch": 27.950043066322134,
"grad_norm": 0.4886482357978821,
"learning_rate": 8.820327304048235e-05,
"loss": 0.2916010093688965,
"step": 32450
},
{
"epoch": 27.950043066322134,
"eval_loss": 0.3638048470020294,
"eval_runtime": 17.5694,
"eval_samples_per_second": 1081.656,
"eval_steps_per_second": 33.809,
"step": 32450
},
{
"epoch": 27.993109388458226,
"grad_norm": 0.5361951589584351,
"learning_rate": 8.803100775193799e-05,
"loss": 0.2956967735290527,
"step": 32500
},
{
"epoch": 27.993109388458226,
"eval_loss": 0.35803547501564026,
"eval_runtime": 17.0215,
"eval_samples_per_second": 1116.469,
"eval_steps_per_second": 34.897,
"step": 32500
},
{
"epoch": 28.036175710594314,
"grad_norm": 0.4958977997303009,
"learning_rate": 8.785874246339363e-05,
"loss": 0.28964385986328123,
"step": 32550
},
{
"epoch": 28.036175710594314,
"eval_loss": 0.35593509674072266,
"eval_runtime": 17.5511,
"eval_samples_per_second": 1082.781,
"eval_steps_per_second": 33.844,
"step": 32550
},
{
"epoch": 28.079242032730406,
"grad_norm": 0.4543485939502716,
"learning_rate": 8.768647717484926e-05,
"loss": 0.28495445251464846,
"step": 32600
},
{
"epoch": 28.079242032730406,
"eval_loss": 0.3700936436653137,
"eval_runtime": 17.4743,
"eval_samples_per_second": 1087.537,
"eval_steps_per_second": 33.993,
"step": 32600
},
{
"epoch": 28.122308354866494,
"grad_norm": 0.5868750810623169,
"learning_rate": 8.75142118863049e-05,
"loss": 0.28287410736083984,
"step": 32650
},
{
"epoch": 28.122308354866494,
"eval_loss": 0.3608900010585785,
"eval_runtime": 17.6108,
"eval_samples_per_second": 1079.114,
"eval_steps_per_second": 33.729,
"step": 32650
},
{
"epoch": 28.165374677002585,
"grad_norm": 0.521497368812561,
"learning_rate": 8.734194659776056e-05,
"loss": 0.2834699058532715,
"step": 32700
},
{
"epoch": 28.165374677002585,
"eval_loss": 0.35730814933776855,
"eval_runtime": 17.7048,
"eval_samples_per_second": 1073.379,
"eval_steps_per_second": 33.55,
"step": 32700
},
{
"epoch": 28.208440999138674,
"grad_norm": 0.40085679292678833,
"learning_rate": 8.71696813092162e-05,
"loss": 0.2893166351318359,
"step": 32750
},
{
"epoch": 28.208440999138674,
"eval_loss": 0.3633263111114502,
"eval_runtime": 17.657,
"eval_samples_per_second": 1076.287,
"eval_steps_per_second": 33.641,
"step": 32750
},
{
"epoch": 28.25150732127476,
"grad_norm": 0.46187394857406616,
"learning_rate": 8.699741602067184e-05,
"loss": 0.2845230484008789,
"step": 32800
},
{
"epoch": 28.25150732127476,
"eval_loss": 0.3593103289604187,
"eval_runtime": 17.6326,
"eval_samples_per_second": 1077.777,
"eval_steps_per_second": 33.688,
"step": 32800
},
{
"epoch": 28.294573643410853,
"grad_norm": 0.5855560302734375,
"learning_rate": 8.682515073212749e-05,
"loss": 0.2909526824951172,
"step": 32850
},
{
"epoch": 28.294573643410853,
"eval_loss": 0.3548714220523834,
"eval_runtime": 17.1281,
"eval_samples_per_second": 1109.521,
"eval_steps_per_second": 34.68,
"step": 32850
},
{
"epoch": 28.33763996554694,
"grad_norm": 0.5393654704093933,
"learning_rate": 8.665288544358312e-05,
"loss": 0.28526065826416014,
"step": 32900
},
{
"epoch": 28.33763996554694,
"eval_loss": 0.35168689489364624,
"eval_runtime": 17.5555,
"eval_samples_per_second": 1082.512,
"eval_steps_per_second": 33.836,
"step": 32900
},
{
"epoch": 28.380706287683033,
"grad_norm": 0.5609577298164368,
"learning_rate": 8.648062015503876e-05,
"loss": 0.28955772399902346,
"step": 32950
},
{
"epoch": 28.380706287683033,
"eval_loss": 0.35503652691841125,
"eval_runtime": 17.7573,
"eval_samples_per_second": 1070.209,
"eval_steps_per_second": 33.451,
"step": 32950
},
{
"epoch": 28.42377260981912,
"grad_norm": 0.48286551237106323,
"learning_rate": 8.63083548664944e-05,
"loss": 0.2849011993408203,
"step": 33000
},
{
"epoch": 28.42377260981912,
"eval_loss": 0.3584132790565491,
"eval_runtime": 17.458,
"eval_samples_per_second": 1088.557,
"eval_steps_per_second": 34.025,
"step": 33000
},
{
"epoch": 28.466838931955213,
"grad_norm": 0.4265981614589691,
"learning_rate": 8.613608957795005e-05,
"loss": 0.2775753974914551,
"step": 33050
},
{
"epoch": 28.466838931955213,
"eval_loss": 0.3611355125904083,
"eval_runtime": 17.3854,
"eval_samples_per_second": 1093.1,
"eval_steps_per_second": 34.167,
"step": 33050
},
{
"epoch": 28.5099052540913,
"grad_norm": 0.5069410800933838,
"learning_rate": 8.59638242894057e-05,
"loss": 0.28409801483154296,
"step": 33100
},
{
"epoch": 28.5099052540913,
"eval_loss": 0.3576951026916504,
"eval_runtime": 17.6066,
"eval_samples_per_second": 1079.366,
"eval_steps_per_second": 33.737,
"step": 33100
},
{
"epoch": 28.55297157622739,
"grad_norm": 0.4650542438030243,
"learning_rate": 8.579155900086133e-05,
"loss": 0.2903776741027832,
"step": 33150
},
{
"epoch": 28.55297157622739,
"eval_loss": 0.3601308763027191,
"eval_runtime": 17.6624,
"eval_samples_per_second": 1075.961,
"eval_steps_per_second": 33.631,
"step": 33150
},
{
"epoch": 28.59603789836348,
"grad_norm": 0.44358712434768677,
"learning_rate": 8.561929371231697e-05,
"loss": 0.2828477096557617,
"step": 33200
},
{
"epoch": 28.59603789836348,
"eval_loss": 0.34798339009284973,
"eval_runtime": 16.7677,
"eval_samples_per_second": 1133.368,
"eval_steps_per_second": 35.425,
"step": 33200
},
{
"epoch": 28.63910422049957,
"grad_norm": 0.4460907280445099,
"learning_rate": 8.544702842377261e-05,
"loss": 0.2873170280456543,
"step": 33250
},
{
"epoch": 28.63910422049957,
"eval_loss": 0.3584803342819214,
"eval_runtime": 16.701,
"eval_samples_per_second": 1137.898,
"eval_steps_per_second": 35.567,
"step": 33250
},
{
"epoch": 28.68217054263566,
"grad_norm": 0.4862593412399292,
"learning_rate": 8.527476313522825e-05,
"loss": 0.28872970581054686,
"step": 33300
},
{
"epoch": 28.68217054263566,
"eval_loss": 0.35872894525527954,
"eval_runtime": 17.6402,
"eval_samples_per_second": 1077.313,
"eval_steps_per_second": 33.673,
"step": 33300
},
{
"epoch": 28.725236864771748,
"grad_norm": 0.4718325436115265,
"learning_rate": 8.51024978466839e-05,
"loss": 0.28618146896362306,
"step": 33350
},
{
"epoch": 28.725236864771748,
"eval_loss": 0.35736215114593506,
"eval_runtime": 17.2688,
"eval_samples_per_second": 1100.48,
"eval_steps_per_second": 34.397,
"step": 33350
},
{
"epoch": 28.768303186907836,
"grad_norm": 0.41244471073150635,
"learning_rate": 8.493023255813955e-05,
"loss": 0.29159786224365236,
"step": 33400
},
{
"epoch": 28.768303186907836,
"eval_loss": 0.36435920000076294,
"eval_runtime": 16.7461,
"eval_samples_per_second": 1134.833,
"eval_steps_per_second": 35.471,
"step": 33400
},
{
"epoch": 28.811369509043928,
"grad_norm": 0.43774139881134033,
"learning_rate": 8.475796726959518e-05,
"loss": 0.28725341796875,
"step": 33450
},
{
"epoch": 28.811369509043928,
"eval_loss": 0.3537900745868683,
"eval_runtime": 17.1804,
"eval_samples_per_second": 1106.147,
"eval_steps_per_second": 34.574,
"step": 33450
},
{
"epoch": 28.854435831180016,
"grad_norm": 0.46093523502349854,
"learning_rate": 8.458570198105082e-05,
"loss": 0.27915733337402343,
"step": 33500
},
{
"epoch": 28.854435831180016,
"eval_loss": 0.35684412717819214,
"eval_runtime": 17.1789,
"eval_samples_per_second": 1106.242,
"eval_steps_per_second": 34.577,
"step": 33500
},
{
"epoch": 28.897502153316108,
"grad_norm": 0.4768196642398834,
"learning_rate": 8.441343669250646e-05,
"loss": 0.29193500518798826,
"step": 33550
},
{
"epoch": 28.897502153316108,
"eval_loss": 0.3524988293647766,
"eval_runtime": 16.8537,
"eval_samples_per_second": 1127.586,
"eval_steps_per_second": 35.244,
"step": 33550
},
{
"epoch": 28.940568475452196,
"grad_norm": 0.4585319459438324,
"learning_rate": 8.42411714039621e-05,
"loss": 0.28741561889648437,
"step": 33600
},
{
"epoch": 28.940568475452196,
"eval_loss": 0.35232821106910706,
"eval_runtime": 16.5397,
"eval_samples_per_second": 1148.995,
"eval_steps_per_second": 35.914,
"step": 33600
},
{
"epoch": 28.983634797588287,
"grad_norm": 0.41558921337127686,
"learning_rate": 8.406890611541775e-05,
"loss": 0.2895075035095215,
"step": 33650
},
{
"epoch": 28.983634797588287,
"eval_loss": 0.3447812497615814,
"eval_runtime": 17.1759,
"eval_samples_per_second": 1106.432,
"eval_steps_per_second": 34.583,
"step": 33650
},
{
"epoch": 29.026701119724375,
"grad_norm": 0.4974426031112671,
"learning_rate": 8.389664082687339e-05,
"loss": 0.286500244140625,
"step": 33700
},
{
"epoch": 29.026701119724375,
"eval_loss": 0.35480424761772156,
"eval_runtime": 17.4921,
"eval_samples_per_second": 1086.433,
"eval_steps_per_second": 33.958,
"step": 33700
},
{
"epoch": 29.069767441860463,
"grad_norm": 0.4396195709705353,
"learning_rate": 8.372437553832903e-05,
"loss": 0.2853713607788086,
"step": 33750
},
{
"epoch": 29.069767441860463,
"eval_loss": 0.35844433307647705,
"eval_runtime": 17.6779,
"eval_samples_per_second": 1075.015,
"eval_steps_per_second": 33.601,
"step": 33750
},
{
"epoch": 29.112833763996555,
"grad_norm": 0.4725944995880127,
"learning_rate": 8.355211024978467e-05,
"loss": 0.29019893646240236,
"step": 33800
},
{
"epoch": 29.112833763996555,
"eval_loss": 0.3599490523338318,
"eval_runtime": 17.2702,
"eval_samples_per_second": 1100.393,
"eval_steps_per_second": 34.395,
"step": 33800
},
{
"epoch": 29.155900086132643,
"grad_norm": 0.5145936012268066,
"learning_rate": 8.337984496124032e-05,
"loss": 0.28777868270874024,
"step": 33850
},
{
"epoch": 29.155900086132643,
"eval_loss": 0.354200154542923,
"eval_runtime": 17.7173,
"eval_samples_per_second": 1072.626,
"eval_steps_per_second": 33.527,
"step": 33850
},
{
"epoch": 29.198966408268735,
"grad_norm": 0.471347838640213,
"learning_rate": 8.320757967269596e-05,
"loss": 0.28519989013671876,
"step": 33900
},
{
"epoch": 29.198966408268735,
"eval_loss": 0.3546842634677887,
"eval_runtime": 17.5708,
"eval_samples_per_second": 1081.565,
"eval_steps_per_second": 33.806,
"step": 33900
},
{
"epoch": 29.242032730404823,
"grad_norm": 0.5167243480682373,
"learning_rate": 8.30353143841516e-05,
"loss": 0.28689983367919925,
"step": 33950
},
{
"epoch": 29.242032730404823,
"eval_loss": 0.3506035804748535,
"eval_runtime": 17.6473,
"eval_samples_per_second": 1076.881,
"eval_steps_per_second": 33.66,
"step": 33950
},
{
"epoch": 29.285099052540914,
"grad_norm": 0.49219077825546265,
"learning_rate": 8.286304909560724e-05,
"loss": 0.28681949615478514,
"step": 34000
},
{
"epoch": 29.285099052540914,
"eval_loss": 0.35144245624542236,
"eval_runtime": 17.7227,
"eval_samples_per_second": 1072.296,
"eval_steps_per_second": 33.516,
"step": 34000
},
{
"epoch": 29.328165374677003,
"grad_norm": 0.5044933557510376,
"learning_rate": 8.269078380706288e-05,
"loss": 0.28524385452270506,
"step": 34050
},
{
"epoch": 29.328165374677003,
"eval_loss": 0.36039817333221436,
"eval_runtime": 18.0279,
"eval_samples_per_second": 1054.141,
"eval_steps_per_second": 32.949,
"step": 34050
},
{
"epoch": 29.37123169681309,
"grad_norm": 0.4944096803665161,
"learning_rate": 8.251851851851851e-05,
"loss": 0.2815964698791504,
"step": 34100
},
{
"epoch": 29.37123169681309,
"eval_loss": 0.35990285873413086,
"eval_runtime": 19.6306,
"eval_samples_per_second": 968.079,
"eval_steps_per_second": 30.259,
"step": 34100
},
{
"epoch": 29.414298018949182,
"grad_norm": 0.4206973612308502,
"learning_rate": 8.234625322997417e-05,
"loss": 0.2808472442626953,
"step": 34150
},
{
"epoch": 29.414298018949182,
"eval_loss": 0.3539174497127533,
"eval_runtime": 18.5638,
"eval_samples_per_second": 1023.71,
"eval_steps_per_second": 31.998,
"step": 34150
},
{
"epoch": 29.45736434108527,
"grad_norm": 0.46597999334335327,
"learning_rate": 8.217398794142981e-05,
"loss": 0.2880288696289062,
"step": 34200
},
{
"epoch": 29.45736434108527,
"eval_loss": 0.3545425236225128,
"eval_runtime": 19.4963,
"eval_samples_per_second": 974.748,
"eval_steps_per_second": 30.467,
"step": 34200
},
{
"epoch": 29.500430663221362,
"grad_norm": 0.40909266471862793,
"learning_rate": 8.200172265288545e-05,
"loss": 0.28936180114746096,
"step": 34250
},
{
"epoch": 29.500430663221362,
"eval_loss": 0.3585303723812103,
"eval_runtime": 16.8611,
"eval_samples_per_second": 1127.091,
"eval_steps_per_second": 35.229,
"step": 34250
},
{
"epoch": 29.54349698535745,
"grad_norm": 0.3839239478111267,
"learning_rate": 8.18294573643411e-05,
"loss": 0.2772235107421875,
"step": 34300
},
{
"epoch": 29.54349698535745,
"eval_loss": 0.3551238775253296,
"eval_runtime": 17.57,
"eval_samples_per_second": 1081.614,
"eval_steps_per_second": 33.808,
"step": 34300
},
{
"epoch": 29.58656330749354,
"grad_norm": 0.43001672625541687,
"learning_rate": 8.165719207579672e-05,
"loss": 0.2795041847229004,
"step": 34350
},
{
"epoch": 29.58656330749354,
"eval_loss": 0.3595533072948456,
"eval_runtime": 17.835,
"eval_samples_per_second": 1065.543,
"eval_steps_per_second": 33.305,
"step": 34350
},
{
"epoch": 29.62962962962963,
"grad_norm": 0.45755746960639954,
"learning_rate": 8.148492678725236e-05,
"loss": 0.2803531265258789,
"step": 34400
},
{
"epoch": 29.62962962962963,
"eval_loss": 0.3586844503879547,
"eval_runtime": 18.152,
"eval_samples_per_second": 1046.935,
"eval_steps_per_second": 32.724,
"step": 34400
},
{
"epoch": 29.672695951765718,
"grad_norm": 0.436646968126297,
"learning_rate": 8.1312661498708e-05,
"loss": 0.28399742126464844,
"step": 34450
},
{
"epoch": 29.672695951765718,
"eval_loss": 0.3531823754310608,
"eval_runtime": 17.9012,
"eval_samples_per_second": 1061.603,
"eval_steps_per_second": 33.182,
"step": 34450
},
{
"epoch": 29.71576227390181,
"grad_norm": 0.41019946336746216,
"learning_rate": 8.114039621016366e-05,
"loss": 0.2811345100402832,
"step": 34500
},
{
"epoch": 29.71576227390181,
"eval_loss": 0.36139482259750366,
"eval_runtime": 17.6563,
"eval_samples_per_second": 1076.332,
"eval_steps_per_second": 33.642,
"step": 34500
},
{
"epoch": 29.758828596037898,
"grad_norm": 0.38991779088974,
"learning_rate": 8.09681309216193e-05,
"loss": 0.2825624084472656,
"step": 34550
},
{
"epoch": 29.758828596037898,
"eval_loss": 0.3564010262489319,
"eval_runtime": 17.4361,
"eval_samples_per_second": 1089.92,
"eval_steps_per_second": 34.067,
"step": 34550
},
{
"epoch": 29.80189491817399,
"grad_norm": 0.4471035599708557,
"learning_rate": 8.079586563307495e-05,
"loss": 0.28816804885864256,
"step": 34600
},
{
"epoch": 29.80189491817399,
"eval_loss": 0.3620011806488037,
"eval_runtime": 16.5186,
"eval_samples_per_second": 1150.461,
"eval_steps_per_second": 35.959,
"step": 34600
},
{
"epoch": 29.844961240310077,
"grad_norm": 0.5122584700584412,
"learning_rate": 8.062360034453058e-05,
"loss": 0.2832551574707031,
"step": 34650
},
{
"epoch": 29.844961240310077,
"eval_loss": 0.35831230878829956,
"eval_runtime": 17.6589,
"eval_samples_per_second": 1076.174,
"eval_steps_per_second": 33.638,
"step": 34650
},
{
"epoch": 29.88802756244617,
"grad_norm": 0.4535158574581146,
"learning_rate": 8.045133505598622e-05,
"loss": 0.284300365447998,
"step": 34700
},
{
"epoch": 29.88802756244617,
"eval_loss": 0.35620927810668945,
"eval_runtime": 16.3303,
"eval_samples_per_second": 1163.724,
"eval_steps_per_second": 36.374,
"step": 34700
},
{
"epoch": 29.931093884582257,
"grad_norm": 0.42133525013923645,
"learning_rate": 8.027906976744186e-05,
"loss": 0.28102462768554687,
"step": 34750
},
{
"epoch": 29.931093884582257,
"eval_loss": 0.3616844713687897,
"eval_runtime": 18.5174,
"eval_samples_per_second": 1026.276,
"eval_steps_per_second": 32.078,
"step": 34750
},
{
"epoch": 29.974160206718345,
"grad_norm": 0.43094122409820557,
"learning_rate": 8.01068044788975e-05,
"loss": 0.27734567642211916,
"step": 34800
},
{
"epoch": 29.974160206718345,
"eval_loss": 0.3515077233314514,
"eval_runtime": 18.8179,
"eval_samples_per_second": 1009.892,
"eval_steps_per_second": 31.566,
"step": 34800
},
{
"epoch": 30.017226528854437,
"grad_norm": 0.49150723218917847,
"learning_rate": 7.993453919035316e-05,
"loss": 0.2808255386352539,
"step": 34850
},
{
"epoch": 30.017226528854437,
"eval_loss": 0.3547874987125397,
"eval_runtime": 17.868,
"eval_samples_per_second": 1063.577,
"eval_steps_per_second": 33.244,
"step": 34850
},
{
"epoch": 30.060292850990525,
"grad_norm": 0.40897902846336365,
"learning_rate": 7.97622739018088e-05,
"loss": 0.2808572387695312,
"step": 34900
},
{
"epoch": 30.060292850990525,
"eval_loss": 0.3521062731742859,
"eval_runtime": 17.821,
"eval_samples_per_second": 1066.38,
"eval_steps_per_second": 33.331,
"step": 34900
},
{
"epoch": 30.103359173126616,
"grad_norm": 0.47192278504371643,
"learning_rate": 7.959000861326443e-05,
"loss": 0.28496150970458983,
"step": 34950
},
{
"epoch": 30.103359173126616,
"eval_loss": 0.35089150071144104,
"eval_runtime": 17.8659,
"eval_samples_per_second": 1063.701,
"eval_steps_per_second": 33.248,
"step": 34950
},
{
"epoch": 30.146425495262704,
"grad_norm": 0.40365535020828247,
"learning_rate": 7.941774332472007e-05,
"loss": 0.27695383071899415,
"step": 35000
},
{
"epoch": 30.146425495262704,
"eval_loss": 0.3547033965587616,
"eval_runtime": 17.6758,
"eval_samples_per_second": 1075.145,
"eval_steps_per_second": 33.605,
"step": 35000
},
{
"epoch": 30.189491817398793,
"grad_norm": 0.5050297379493713,
"learning_rate": 7.924547803617571e-05,
"loss": 0.2835973358154297,
"step": 35050
},
{
"epoch": 30.189491817398793,
"eval_loss": 0.351387619972229,
"eval_runtime": 17.0333,
"eval_samples_per_second": 1115.696,
"eval_steps_per_second": 34.873,
"step": 35050
},
{
"epoch": 30.232558139534884,
"grad_norm": 0.5353424549102783,
"learning_rate": 7.907321274763135e-05,
"loss": 0.2843050003051758,
"step": 35100
},
{
"epoch": 30.232558139534884,
"eval_loss": 0.36003604531288147,
"eval_runtime": 18.0958,
"eval_samples_per_second": 1050.189,
"eval_steps_per_second": 32.825,
"step": 35100
},
{
"epoch": 30.275624461670972,
"grad_norm": 0.468283474445343,
"learning_rate": 7.8900947459087e-05,
"loss": 0.28055492401123044,
"step": 35150
},
{
"epoch": 30.275624461670972,
"eval_loss": 0.35894954204559326,
"eval_runtime": 17.7049,
"eval_samples_per_second": 1073.375,
"eval_steps_per_second": 33.55,
"step": 35150
},
{
"epoch": 30.318690783807064,
"grad_norm": 0.4638129770755768,
"learning_rate": 7.872868217054264e-05,
"loss": 0.2750652503967285,
"step": 35200
},
{
"epoch": 30.318690783807064,
"eval_loss": 0.35567528009414673,
"eval_runtime": 17.919,
"eval_samples_per_second": 1060.552,
"eval_steps_per_second": 33.149,
"step": 35200
},
{
"epoch": 30.361757105943152,
"grad_norm": 0.5625136494636536,
"learning_rate": 7.855641688199828e-05,
"loss": 0.2850845909118652,
"step": 35250
},
{
"epoch": 30.361757105943152,
"eval_loss": 0.36067649722099304,
"eval_runtime": 17.6804,
"eval_samples_per_second": 1074.863,
"eval_steps_per_second": 33.597,
"step": 35250
},
{
"epoch": 30.404823428079244,
"grad_norm": 0.43467268347740173,
"learning_rate": 7.838415159345392e-05,
"loss": 0.2873003578186035,
"step": 35300
},
{
"epoch": 30.404823428079244,
"eval_loss": 0.3528839349746704,
"eval_runtime": 17.756,
"eval_samples_per_second": 1070.287,
"eval_steps_per_second": 33.454,
"step": 35300
},
{
"epoch": 30.44788975021533,
"grad_norm": 0.4412539303302765,
"learning_rate": 7.821188630490956e-05,
"loss": 0.285908203125,
"step": 35350
},
{
"epoch": 30.44788975021533,
"eval_loss": 0.3487206697463989,
"eval_runtime": 17.9176,
"eval_samples_per_second": 1060.633,
"eval_steps_per_second": 33.152,
"step": 35350
},
{
"epoch": 30.49095607235142,
"grad_norm": 0.4791826903820038,
"learning_rate": 7.80396210163652e-05,
"loss": 0.2815831756591797,
"step": 35400
},
{
"epoch": 30.49095607235142,
"eval_loss": 0.35717254877090454,
"eval_runtime": 17.8176,
"eval_samples_per_second": 1066.587,
"eval_steps_per_second": 33.338,
"step": 35400
},
{
"epoch": 30.53402239448751,
"grad_norm": 0.4532201886177063,
"learning_rate": 7.786735572782085e-05,
"loss": 0.28284832000732424,
"step": 35450
},
{
"epoch": 30.53402239448751,
"eval_loss": 0.3567640781402588,
"eval_runtime": 17.5731,
"eval_samples_per_second": 1081.425,
"eval_steps_per_second": 33.802,
"step": 35450
},
{
"epoch": 30.5770887166236,
"grad_norm": 0.5390316247940063,
"learning_rate": 7.769509043927649e-05,
"loss": 0.2846924591064453,
"step": 35500
},
{
"epoch": 30.5770887166236,
"eval_loss": 0.3540796935558319,
"eval_runtime": 17.2901,
"eval_samples_per_second": 1099.126,
"eval_steps_per_second": 34.355,
"step": 35500
},
{
"epoch": 30.62015503875969,
"grad_norm": 0.4142419993877411,
"learning_rate": 7.752282515073212e-05,
"loss": 0.28491304397583006,
"step": 35550
},
{
"epoch": 30.62015503875969,
"eval_loss": 0.36062875390052795,
"eval_runtime": 17.9366,
"eval_samples_per_second": 1059.508,
"eval_steps_per_second": 33.117,
"step": 35550
},
{
"epoch": 30.66322136089578,
"grad_norm": 0.4472252130508423,
"learning_rate": 7.735055986218777e-05,
"loss": 0.282122917175293,
"step": 35600
},
{
"epoch": 30.66322136089578,
"eval_loss": 0.35857757925987244,
"eval_runtime": 16.936,
"eval_samples_per_second": 1122.107,
"eval_steps_per_second": 35.073,
"step": 35600
},
{
"epoch": 30.70628768303187,
"grad_norm": 0.5478323101997375,
"learning_rate": 7.717829457364342e-05,
"loss": 0.2744486999511719,
"step": 35650
},
{
"epoch": 30.70628768303187,
"eval_loss": 0.35813167691230774,
"eval_runtime": 17.5761,
"eval_samples_per_second": 1081.243,
"eval_steps_per_second": 33.796,
"step": 35650
},
{
"epoch": 30.74935400516796,
"grad_norm": 0.4716998338699341,
"learning_rate": 7.700602928509906e-05,
"loss": 0.28380435943603516,
"step": 35700
},
{
"epoch": 30.74935400516796,
"eval_loss": 0.34930458664894104,
"eval_runtime": 18.1096,
"eval_samples_per_second": 1049.39,
"eval_steps_per_second": 32.8,
"step": 35700
},
{
"epoch": 30.792420327304047,
"grad_norm": 0.4638073742389679,
"learning_rate": 7.68337639965547e-05,
"loss": 0.28032737731933594,
"step": 35750
},
{
"epoch": 30.792420327304047,
"eval_loss": 0.3495262861251831,
"eval_runtime": 17.8334,
"eval_samples_per_second": 1065.639,
"eval_steps_per_second": 33.308,
"step": 35750
},
{
"epoch": 30.83548664944014,
"grad_norm": 0.42722830176353455,
"learning_rate": 7.666149870801034e-05,
"loss": 0.2841494560241699,
"step": 35800
},
{
"epoch": 30.83548664944014,
"eval_loss": 0.35409942269325256,
"eval_runtime": 17.8052,
"eval_samples_per_second": 1067.329,
"eval_steps_per_second": 33.361,
"step": 35800
},
{
"epoch": 30.878552971576227,
"grad_norm": 0.6081225872039795,
"learning_rate": 7.648923341946597e-05,
"loss": 0.27756031036376955,
"step": 35850
},
{
"epoch": 30.878552971576227,
"eval_loss": 0.3533887565135956,
"eval_runtime": 18.4344,
"eval_samples_per_second": 1030.897,
"eval_steps_per_second": 32.222,
"step": 35850
},
{
"epoch": 30.921619293712318,
"grad_norm": 0.4515012502670288,
"learning_rate": 7.631696813092161e-05,
"loss": 0.2825018310546875,
"step": 35900
},
{
"epoch": 30.921619293712318,
"eval_loss": 0.35392051935195923,
"eval_runtime": 17.8146,
"eval_samples_per_second": 1066.767,
"eval_steps_per_second": 33.343,
"step": 35900
},
{
"epoch": 30.964685615848406,
"grad_norm": 0.5015137195587158,
"learning_rate": 7.614470284237727e-05,
"loss": 0.276204948425293,
"step": 35950
},
{
"epoch": 30.964685615848406,
"eval_loss": 0.35323023796081543,
"eval_runtime": 16.7751,
"eval_samples_per_second": 1132.871,
"eval_steps_per_second": 35.41,
"step": 35950
},
{
"epoch": 31.007751937984494,
"grad_norm": 0.49482443928718567,
"learning_rate": 7.597243755383291e-05,
"loss": 0.2751371574401855,
"step": 36000
},
{
"epoch": 31.007751937984494,
"eval_loss": 0.35262081027030945,
"eval_runtime": 17.7106,
"eval_samples_per_second": 1073.033,
"eval_steps_per_second": 33.539,
"step": 36000
},
{
"epoch": 31.050818260120586,
"grad_norm": 0.5488302707672119,
"learning_rate": 7.580017226528855e-05,
"loss": 0.2832693099975586,
"step": 36050
},
{
"epoch": 31.050818260120586,
"eval_loss": 0.3515873849391937,
"eval_runtime": 17.7268,
"eval_samples_per_second": 1072.047,
"eval_steps_per_second": 33.509,
"step": 36050
},
{
"epoch": 31.093884582256674,
"grad_norm": 0.47250041365623474,
"learning_rate": 7.56279069767442e-05,
"loss": 0.2822879409790039,
"step": 36100
},
{
"epoch": 31.093884582256674,
"eval_loss": 0.3520713746547699,
"eval_runtime": 17.2396,
"eval_samples_per_second": 1102.344,
"eval_steps_per_second": 34.456,
"step": 36100
},
{
"epoch": 31.136950904392766,
"grad_norm": 0.45473265647888184,
"learning_rate": 7.545564168819982e-05,
"loss": 0.28120903015136717,
"step": 36150
},
{
"epoch": 31.136950904392766,
"eval_loss": 0.35466763377189636,
"eval_runtime": 18.006,
"eval_samples_per_second": 1055.423,
"eval_steps_per_second": 32.989,
"step": 36150
},
{
"epoch": 31.180017226528854,
"grad_norm": 0.5772674679756165,
"learning_rate": 7.528337639965547e-05,
"loss": 0.28337194442749025,
"step": 36200
},
{
"epoch": 31.180017226528854,
"eval_loss": 0.34932905435562134,
"eval_runtime": 17.878,
"eval_samples_per_second": 1062.984,
"eval_steps_per_second": 33.225,
"step": 36200
},
{
"epoch": 31.223083548664945,
"grad_norm": 0.510277509689331,
"learning_rate": 7.511111111111111e-05,
"loss": 0.2886422348022461,
"step": 36250
},
{
"epoch": 31.223083548664945,
"eval_loss": 0.35099226236343384,
"eval_runtime": 17.8684,
"eval_samples_per_second": 1063.551,
"eval_steps_per_second": 33.243,
"step": 36250
},
{
"epoch": 31.266149870801033,
"grad_norm": 0.5357157588005066,
"learning_rate": 7.493884582256676e-05,
"loss": 0.27557926177978515,
"step": 36300
},
{
"epoch": 31.266149870801033,
"eval_loss": 0.3499546945095062,
"eval_runtime": 18.1845,
"eval_samples_per_second": 1045.067,
"eval_steps_per_second": 32.665,
"step": 36300
},
{
"epoch": 31.30921619293712,
"grad_norm": 0.5062825679779053,
"learning_rate": 7.47665805340224e-05,
"loss": 0.27824714660644534,
"step": 36350
},
{
"epoch": 31.30921619293712,
"eval_loss": 0.3564305901527405,
"eval_runtime": 17.7311,
"eval_samples_per_second": 1071.791,
"eval_steps_per_second": 33.501,
"step": 36350
},
{
"epoch": 31.352282515073213,
"grad_norm": 0.41484689712524414,
"learning_rate": 7.459431524547804e-05,
"loss": 0.28137615203857425,
"step": 36400
},
{
"epoch": 31.352282515073213,
"eval_loss": 0.3556700646877289,
"eval_runtime": 17.3043,
"eval_samples_per_second": 1098.227,
"eval_steps_per_second": 34.327,
"step": 36400
},
{
"epoch": 31.3953488372093,
"grad_norm": 0.43241560459136963,
"learning_rate": 7.442204995693368e-05,
"loss": 0.2793069839477539,
"step": 36450
},
{
"epoch": 31.3953488372093,
"eval_loss": 0.3520503044128418,
"eval_runtime": 16.5928,
"eval_samples_per_second": 1145.316,
"eval_steps_per_second": 35.799,
"step": 36450
},
{
"epoch": 31.438415159345393,
"grad_norm": 0.40315011143684387,
"learning_rate": 7.424978466838932e-05,
"loss": 0.27872093200683595,
"step": 36500
},
{
"epoch": 31.438415159345393,
"eval_loss": 0.3566995859146118,
"eval_runtime": 17.9359,
"eval_samples_per_second": 1059.55,
"eval_steps_per_second": 33.118,
"step": 36500
},
{
"epoch": 31.48148148148148,
"grad_norm": 0.3954852223396301,
"learning_rate": 7.407751937984496e-05,
"loss": 0.28445125579833985,
"step": 36550
},
{
"epoch": 31.48148148148148,
"eval_loss": 0.3513743281364441,
"eval_runtime": 16.2741,
"eval_samples_per_second": 1167.747,
"eval_steps_per_second": 36.5,
"step": 36550
},
{
"epoch": 31.524547803617573,
"grad_norm": 0.44541651010513306,
"learning_rate": 7.39052540913006e-05,
"loss": 0.27661422729492186,
"step": 36600
},
{
"epoch": 31.524547803617573,
"eval_loss": 0.34536993503570557,
"eval_runtime": 17.8737,
"eval_samples_per_second": 1063.237,
"eval_steps_per_second": 33.233,
"step": 36600
},
{
"epoch": 31.56761412575366,
"grad_norm": 0.5039131045341492,
"learning_rate": 7.373298880275626e-05,
"loss": 0.2753811264038086,
"step": 36650
},
{
"epoch": 31.56761412575366,
"eval_loss": 0.34828585386276245,
"eval_runtime": 17.826,
"eval_samples_per_second": 1066.083,
"eval_steps_per_second": 33.322,
"step": 36650
},
{
"epoch": 31.61068044788975,
"grad_norm": 0.455892950296402,
"learning_rate": 7.356072351421189e-05,
"loss": 0.28127681732177734,
"step": 36700
},
{
"epoch": 31.61068044788975,
"eval_loss": 0.34847480058670044,
"eval_runtime": 17.9038,
"eval_samples_per_second": 1061.449,
"eval_steps_per_second": 33.177,
"step": 36700
},
{
"epoch": 31.65374677002584,
"grad_norm": 0.481543630361557,
"learning_rate": 7.338845822566753e-05,
"loss": 0.2735530662536621,
"step": 36750
},
{
"epoch": 31.65374677002584,
"eval_loss": 0.34797731041908264,
"eval_runtime": 17.8173,
"eval_samples_per_second": 1066.603,
"eval_steps_per_second": 33.338,
"step": 36750
},
{
"epoch": 31.69681309216193,
"grad_norm": 0.4258839786052704,
"learning_rate": 7.321619293712317e-05,
"loss": 0.2800648307800293,
"step": 36800
},
{
"epoch": 31.69681309216193,
"eval_loss": 0.35386908054351807,
"eval_runtime": 17.8122,
"eval_samples_per_second": 1066.911,
"eval_steps_per_second": 33.348,
"step": 36800
},
{
"epoch": 31.73987941429802,
"grad_norm": 0.4986749291419983,
"learning_rate": 7.304392764857881e-05,
"loss": 0.2886836624145508,
"step": 36850
},
{
"epoch": 31.73987941429802,
"eval_loss": 0.35045093297958374,
"eval_runtime": 17.9467,
"eval_samples_per_second": 1058.912,
"eval_steps_per_second": 33.098,
"step": 36850
},
{
"epoch": 31.782945736434108,
"grad_norm": 0.428501695394516,
"learning_rate": 7.287166236003446e-05,
"loss": 0.27103757858276367,
"step": 36900
},
{
"epoch": 31.782945736434108,
"eval_loss": 0.349502295255661,
"eval_runtime": 17.3496,
"eval_samples_per_second": 1095.359,
"eval_steps_per_second": 34.237,
"step": 36900
},
{
"epoch": 31.8260120585702,
"grad_norm": 0.44585123658180237,
"learning_rate": 7.26993970714901e-05,
"loss": 0.282184944152832,
"step": 36950
},
{
"epoch": 31.8260120585702,
"eval_loss": 0.3529902994632721,
"eval_runtime": 18.1962,
"eval_samples_per_second": 1044.397,
"eval_steps_per_second": 32.644,
"step": 36950
},
{
"epoch": 31.869078380706288,
"grad_norm": 0.44630590081214905,
"learning_rate": 7.252713178294574e-05,
"loss": 0.2770352554321289,
"step": 37000
},
{
"epoch": 31.869078380706288,
"eval_loss": 0.3461504876613617,
"eval_runtime": 16.8741,
"eval_samples_per_second": 1126.222,
"eval_steps_per_second": 35.202,
"step": 37000
},
{
"epoch": 31.912144702842376,
"grad_norm": 0.41770753264427185,
"learning_rate": 7.235486649440138e-05,
"loss": 0.27702234268188475,
"step": 37050
},
{
"epoch": 31.912144702842376,
"eval_loss": 0.3510807752609253,
"eval_runtime": 17.9156,
"eval_samples_per_second": 1060.75,
"eval_steps_per_second": 33.155,
"step": 37050
},
{
"epoch": 31.955211024978468,
"grad_norm": 0.3740564286708832,
"learning_rate": 7.218260120585702e-05,
"loss": 0.27722122192382814,
"step": 37100
},
{
"epoch": 31.955211024978468,
"eval_loss": 0.3538920283317566,
"eval_runtime": 17.9454,
"eval_samples_per_second": 1058.989,
"eval_steps_per_second": 33.1,
"step": 37100
},
{
"epoch": 31.998277347114556,
"grad_norm": 0.4577646851539612,
"learning_rate": 7.201033591731267e-05,
"loss": 0.2829978561401367,
"step": 37150
},
{
"epoch": 31.998277347114556,
"eval_loss": 0.3476633131504059,
"eval_runtime": 17.8921,
"eval_samples_per_second": 1062.145,
"eval_steps_per_second": 33.199,
"step": 37150
},
{
"epoch": 32.041343669250644,
"grad_norm": 0.5443671941757202,
"learning_rate": 7.183807062876831e-05,
"loss": 0.28137630462646485,
"step": 37200
},
{
"epoch": 32.041343669250644,
"eval_loss": 0.35495200753211975,
"eval_runtime": 17.9027,
"eval_samples_per_second": 1061.513,
"eval_steps_per_second": 33.179,
"step": 37200
},
{
"epoch": 32.084409991386735,
"grad_norm": 0.4282469153404236,
"learning_rate": 7.166580534022395e-05,
"loss": 0.2752407646179199,
"step": 37250
},
{
"epoch": 32.084409991386735,
"eval_loss": 0.3526252508163452,
"eval_runtime": 18.1825,
"eval_samples_per_second": 1045.183,
"eval_steps_per_second": 32.669,
"step": 37250
},
{
"epoch": 32.12747631352283,
"grad_norm": 0.4998242259025574,
"learning_rate": 7.149354005167959e-05,
"loss": 0.2797782516479492,
"step": 37300
},
{
"epoch": 32.12747631352283,
"eval_loss": 0.35174521803855896,
"eval_runtime": 17.6086,
"eval_samples_per_second": 1079.243,
"eval_steps_per_second": 33.733,
"step": 37300
},
{
"epoch": 32.17054263565891,
"grad_norm": 0.49339139461517334,
"learning_rate": 7.132127476313522e-05,
"loss": 0.2803740692138672,
"step": 37350
},
{
"epoch": 32.17054263565891,
"eval_loss": 0.35175102949142456,
"eval_runtime": 16.6815,
"eval_samples_per_second": 1139.226,
"eval_steps_per_second": 35.608,
"step": 37350
},
{
"epoch": 32.213608957795,
"grad_norm": 0.46561405062675476,
"learning_rate": 7.114900947459088e-05,
"loss": 0.27958194732666014,
"step": 37400
},
{
"epoch": 32.213608957795,
"eval_loss": 0.3542179763317108,
"eval_runtime": 17.9346,
"eval_samples_per_second": 1059.627,
"eval_steps_per_second": 33.12,
"step": 37400
},
{
"epoch": 32.256675279931095,
"grad_norm": 0.5260679125785828,
"learning_rate": 7.097674418604652e-05,
"loss": 0.27948333740234377,
"step": 37450
},
{
"epoch": 32.256675279931095,
"eval_loss": 0.35432368516921997,
"eval_runtime": 18.3926,
"eval_samples_per_second": 1033.244,
"eval_steps_per_second": 32.296,
"step": 37450
},
{
"epoch": 32.299741602067186,
"grad_norm": 0.40552613139152527,
"learning_rate": 7.080447889750216e-05,
"loss": 0.2720658493041992,
"step": 37500
},
{
"epoch": 32.299741602067186,
"eval_loss": 0.3506203293800354,
"eval_runtime": 17.1944,
"eval_samples_per_second": 1105.246,
"eval_steps_per_second": 34.546,
"step": 37500
},
{
"epoch": 32.34280792420327,
"grad_norm": 0.574375569820404,
"learning_rate": 7.06322136089578e-05,
"loss": 0.2815470886230469,
"step": 37550
},
{
"epoch": 32.34280792420327,
"eval_loss": 0.34805968403816223,
"eval_runtime": 17.8766,
"eval_samples_per_second": 1063.066,
"eval_steps_per_second": 33.228,
"step": 37550
},
{
"epoch": 32.38587424633936,
"grad_norm": 0.5432583093643188,
"learning_rate": 7.045994832041343e-05,
"loss": 0.27453317642211916,
"step": 37600
},
{
"epoch": 32.38587424633936,
"eval_loss": 0.34897035360336304,
"eval_runtime": 18.4403,
"eval_samples_per_second": 1030.569,
"eval_steps_per_second": 32.212,
"step": 37600
},
{
"epoch": 32.428940568475454,
"grad_norm": 0.49719077348709106,
"learning_rate": 7.028768303186907e-05,
"loss": 0.27978298187255857,
"step": 37650
},
{
"epoch": 32.428940568475454,
"eval_loss": 0.3470863103866577,
"eval_runtime": 17.7674,
"eval_samples_per_second": 1069.602,
"eval_steps_per_second": 33.432,
"step": 37650
},
{
"epoch": 32.47200689061154,
"grad_norm": 0.5088914632797241,
"learning_rate": 7.011541774332472e-05,
"loss": 0.28130838394165036,
"step": 37700
},
{
"epoch": 32.47200689061154,
"eval_loss": 0.3575519919395447,
"eval_runtime": 17.8462,
"eval_samples_per_second": 1064.875,
"eval_steps_per_second": 33.284,
"step": 37700
},
{
"epoch": 32.51507321274763,
"grad_norm": 0.4396224319934845,
"learning_rate": 6.994315245478037e-05,
"loss": 0.27438232421875,
"step": 37750
},
{
"epoch": 32.51507321274763,
"eval_loss": 0.3423137664794922,
"eval_runtime": 17.8343,
"eval_samples_per_second": 1065.588,
"eval_steps_per_second": 33.307,
"step": 37750
},
{
"epoch": 32.55813953488372,
"grad_norm": 0.5027320981025696,
"learning_rate": 6.977088716623601e-05,
"loss": 0.275331974029541,
"step": 37800
},
{
"epoch": 32.55813953488372,
"eval_loss": 0.35206782817840576,
"eval_runtime": 17.7441,
"eval_samples_per_second": 1071.004,
"eval_steps_per_second": 33.476,
"step": 37800
},
{
"epoch": 32.60120585701981,
"grad_norm": 0.3858621418476105,
"learning_rate": 6.959862187769166e-05,
"loss": 0.27590456008911135,
"step": 37850
},
{
"epoch": 32.60120585701981,
"eval_loss": 0.3495849668979645,
"eval_runtime": 16.9167,
"eval_samples_per_second": 1123.384,
"eval_steps_per_second": 35.113,
"step": 37850
},
{
"epoch": 32.6442721791559,
"grad_norm": 0.37935516238212585,
"learning_rate": 6.942635658914728e-05,
"loss": 0.26858043670654297,
"step": 37900
},
{
"epoch": 32.6442721791559,
"eval_loss": 0.34352296590805054,
"eval_runtime": 17.9597,
"eval_samples_per_second": 1058.145,
"eval_steps_per_second": 33.074,
"step": 37900
},
{
"epoch": 32.68733850129199,
"grad_norm": 0.4469800889492035,
"learning_rate": 6.925409130060293e-05,
"loss": 0.28146677017211913,
"step": 37950
},
{
"epoch": 32.68733850129199,
"eval_loss": 0.3506070077419281,
"eval_runtime": 17.4011,
"eval_samples_per_second": 1092.114,
"eval_steps_per_second": 34.136,
"step": 37950
},
{
"epoch": 32.73040482342808,
"grad_norm": 0.42299091815948486,
"learning_rate": 6.908182601205857e-05,
"loss": 0.280551872253418,
"step": 38000
},
{
"epoch": 32.73040482342808,
"eval_loss": 0.3468368947505951,
"eval_runtime": 17.7061,
"eval_samples_per_second": 1073.302,
"eval_steps_per_second": 33.548,
"step": 38000
},
{
"epoch": 32.773471145564166,
"grad_norm": 0.47024187445640564,
"learning_rate": 6.890956072351421e-05,
"loss": 0.28213356018066404,
"step": 38050
},
{
"epoch": 32.773471145564166,
"eval_loss": 0.35156771540641785,
"eval_runtime": 17.9712,
"eval_samples_per_second": 1057.468,
"eval_steps_per_second": 33.053,
"step": 38050
},
{
"epoch": 32.81653746770026,
"grad_norm": 0.5036038160324097,
"learning_rate": 6.873729543496987e-05,
"loss": 0.2840275192260742,
"step": 38100
},
{
"epoch": 32.81653746770026,
"eval_loss": 0.3425002694129944,
"eval_runtime": 18.0866,
"eval_samples_per_second": 1050.724,
"eval_steps_per_second": 32.842,
"step": 38100
},
{
"epoch": 32.85960378983635,
"grad_norm": 0.49322518706321716,
"learning_rate": 6.856503014642551e-05,
"loss": 0.27961723327636717,
"step": 38150
},
{
"epoch": 32.85960378983635,
"eval_loss": 0.34951549768447876,
"eval_runtime": 18.0747,
"eval_samples_per_second": 1051.417,
"eval_steps_per_second": 32.864,
"step": 38150
},
{
"epoch": 32.90267011197244,
"grad_norm": 0.5208374857902527,
"learning_rate": 6.839276485788114e-05,
"loss": 0.2792753982543945,
"step": 38200
},
{
"epoch": 32.90267011197244,
"eval_loss": 0.35184648633003235,
"eval_runtime": 18.2143,
"eval_samples_per_second": 1043.354,
"eval_steps_per_second": 32.612,
"step": 38200
},
{
"epoch": 32.945736434108525,
"grad_norm": 0.5315906405448914,
"learning_rate": 6.822049956933678e-05,
"loss": 0.2755429649353027,
"step": 38250
},
{
"epoch": 32.945736434108525,
"eval_loss": 0.3492070734500885,
"eval_runtime": 17.6981,
"eval_samples_per_second": 1073.789,
"eval_steps_per_second": 33.563,
"step": 38250
},
{
"epoch": 32.98880275624462,
"grad_norm": 0.44382065534591675,
"learning_rate": 6.804823428079242e-05,
"loss": 0.27883129119873046,
"step": 38300
},
{
"epoch": 32.98880275624462,
"eval_loss": 0.3477207124233246,
"eval_runtime": 16.8834,
"eval_samples_per_second": 1125.6,
"eval_steps_per_second": 35.182,
"step": 38300
},
{
"epoch": 33.03186907838071,
"grad_norm": 0.4330918490886688,
"learning_rate": 6.787596899224806e-05,
"loss": 0.2802844429016113,
"step": 38350
},
{
"epoch": 33.03186907838071,
"eval_loss": 0.35456958413124084,
"eval_runtime": 17.6811,
"eval_samples_per_second": 1074.82,
"eval_steps_per_second": 33.595,
"step": 38350
},
{
"epoch": 33.07493540051679,
"grad_norm": 0.5079190135002136,
"learning_rate": 6.77037037037037e-05,
"loss": 0.2710557746887207,
"step": 38400
},
{
"epoch": 33.07493540051679,
"eval_loss": 0.3461274802684784,
"eval_runtime": 16.7603,
"eval_samples_per_second": 1133.867,
"eval_steps_per_second": 35.441,
"step": 38400
},
{
"epoch": 33.118001722652885,
"grad_norm": 0.39602577686309814,
"learning_rate": 6.753143841515935e-05,
"loss": 0.273748836517334,
"step": 38450
},
{
"epoch": 33.118001722652885,
"eval_loss": 0.35338112711906433,
"eval_runtime": 17.5627,
"eval_samples_per_second": 1082.063,
"eval_steps_per_second": 33.822,
"step": 38450
},
{
"epoch": 33.161068044788976,
"grad_norm": 0.5302131175994873,
"learning_rate": 6.735917312661499e-05,
"loss": 0.27604515075683594,
"step": 38500
},
{
"epoch": 33.161068044788976,
"eval_loss": 0.3451584577560425,
"eval_runtime": 17.8874,
"eval_samples_per_second": 1062.425,
"eval_steps_per_second": 33.208,
"step": 38500
},
{
"epoch": 33.20413436692507,
"grad_norm": 0.4645986258983612,
"learning_rate": 6.718690783807063e-05,
"loss": 0.27570083618164065,
"step": 38550
},
{
"epoch": 33.20413436692507,
"eval_loss": 0.3517482280731201,
"eval_runtime": 18.0111,
"eval_samples_per_second": 1055.125,
"eval_steps_per_second": 32.98,
"step": 38550
},
{
"epoch": 33.24720068906115,
"grad_norm": 0.5932883620262146,
"learning_rate": 6.701464254952627e-05,
"loss": 0.27867019653320313,
"step": 38600
},
{
"epoch": 33.24720068906115,
"eval_loss": 0.3499729037284851,
"eval_runtime": 17.9683,
"eval_samples_per_second": 1057.639,
"eval_steps_per_second": 33.058,
"step": 38600
},
{
"epoch": 33.290267011197244,
"grad_norm": 0.5064970254898071,
"learning_rate": 6.684237726098192e-05,
"loss": 0.2713764190673828,
"step": 38650
},
{
"epoch": 33.290267011197244,
"eval_loss": 0.3454400599002838,
"eval_runtime": 17.9809,
"eval_samples_per_second": 1056.896,
"eval_steps_per_second": 33.035,
"step": 38650
},
{
"epoch": 33.333333333333336,
"grad_norm": 0.4735126793384552,
"learning_rate": 6.667011197243756e-05,
"loss": 0.27463886260986325,
"step": 38700
},
{
"epoch": 33.333333333333336,
"eval_loss": 0.34867385029792786,
"eval_runtime": 18.1937,
"eval_samples_per_second": 1044.539,
"eval_steps_per_second": 32.649,
"step": 38700
},
{
"epoch": 33.37639965546942,
"grad_norm": 0.4404645264148712,
"learning_rate": 6.64978466838932e-05,
"loss": 0.27556772232055665,
"step": 38750
},
{
"epoch": 33.37639965546942,
"eval_loss": 0.345106303691864,
"eval_runtime": 17.331,
"eval_samples_per_second": 1096.532,
"eval_steps_per_second": 34.274,
"step": 38750
},
{
"epoch": 33.41946597760551,
"grad_norm": 0.5270001888275146,
"learning_rate": 6.632558139534884e-05,
"loss": 0.27397455215454103,
"step": 38800
},
{
"epoch": 33.41946597760551,
"eval_loss": 0.34988826513290405,
"eval_runtime": 17.8862,
"eval_samples_per_second": 1062.495,
"eval_steps_per_second": 33.21,
"step": 38800
},
{
"epoch": 33.4625322997416,
"grad_norm": 0.44960689544677734,
"learning_rate": 6.615331610680448e-05,
"loss": 0.27975887298583985,
"step": 38850
},
{
"epoch": 33.4625322997416,
"eval_loss": 0.342560738325119,
"eval_runtime": 17.4275,
"eval_samples_per_second": 1090.461,
"eval_steps_per_second": 34.084,
"step": 38850
},
{
"epoch": 33.505598621877695,
"grad_norm": 0.5308107733726501,
"learning_rate": 6.598105081826013e-05,
"loss": 0.2754442596435547,
"step": 38900
},
{
"epoch": 33.505598621877695,
"eval_loss": 0.3444646894931793,
"eval_runtime": 18.9001,
"eval_samples_per_second": 1005.496,
"eval_steps_per_second": 31.428,
"step": 38900
},
{
"epoch": 33.54866494401378,
"grad_norm": 0.4935412108898163,
"learning_rate": 6.580878552971577e-05,
"loss": 0.2755685806274414,
"step": 38950
},
{
"epoch": 33.54866494401378,
"eval_loss": 0.3471962809562683,
"eval_runtime": 19.5679,
"eval_samples_per_second": 971.182,
"eval_steps_per_second": 30.356,
"step": 38950
},
{
"epoch": 33.59173126614987,
"grad_norm": 0.44433435797691345,
"learning_rate": 6.563652024117141e-05,
"loss": 0.2712661170959473,
"step": 39000
},
{
"epoch": 33.59173126614987,
"eval_loss": 0.3420495092868805,
"eval_runtime": 19.4068,
"eval_samples_per_second": 979.247,
"eval_steps_per_second": 30.608,
"step": 39000
},
{
"epoch": 33.63479758828596,
"grad_norm": 0.5094283819198608,
"learning_rate": 6.546425495262705e-05,
"loss": 0.27865215301513674,
"step": 39050
},
{
"epoch": 33.63479758828596,
"eval_loss": 0.3459348678588867,
"eval_runtime": 17.9122,
"eval_samples_per_second": 1060.954,
"eval_steps_per_second": 33.162,
"step": 39050
},
{
"epoch": 33.67786391042205,
"grad_norm": 0.43909311294555664,
"learning_rate": 6.529198966408268e-05,
"loss": 0.2744731903076172,
"step": 39100
},
{
"epoch": 33.67786391042205,
"eval_loss": 0.3503783345222473,
"eval_runtime": 17.8643,
"eval_samples_per_second": 1063.798,
"eval_steps_per_second": 33.251,
"step": 39100
},
{
"epoch": 33.72093023255814,
"grad_norm": 0.44677734375,
"learning_rate": 6.511972437553832e-05,
"loss": 0.2760355567932129,
"step": 39150
},
{
"epoch": 33.72093023255814,
"eval_loss": 0.3437975347042084,
"eval_runtime": 17.6977,
"eval_samples_per_second": 1073.812,
"eval_steps_per_second": 33.564,
"step": 39150
},
{
"epoch": 33.76399655469423,
"grad_norm": 0.5084570050239563,
"learning_rate": 6.494745908699398e-05,
"loss": 0.2722341537475586,
"step": 39200
},
{
"epoch": 33.76399655469423,
"eval_loss": 0.35166263580322266,
"eval_runtime": 17.0033,
"eval_samples_per_second": 1117.668,
"eval_steps_per_second": 34.934,
"step": 39200
},
{
"epoch": 33.807062876830315,
"grad_norm": 0.38987380266189575,
"learning_rate": 6.477519379844962e-05,
"loss": 0.28159662246704104,
"step": 39250
},
{
"epoch": 33.807062876830315,
"eval_loss": 0.34989920258522034,
"eval_runtime": 17.6583,
"eval_samples_per_second": 1076.206,
"eval_steps_per_second": 33.639,
"step": 39250
},
{
"epoch": 33.85012919896641,
"grad_norm": 0.4187825918197632,
"learning_rate": 6.460292850990526e-05,
"loss": 0.27221235275268557,
"step": 39300
},
{
"epoch": 33.85012919896641,
"eval_loss": 0.3462679088115692,
"eval_runtime": 17.8256,
"eval_samples_per_second": 1066.105,
"eval_steps_per_second": 33.323,
"step": 39300
},
{
"epoch": 33.8931955211025,
"grad_norm": 0.4491558372974396,
"learning_rate": 6.443066322136089e-05,
"loss": 0.2739124870300293,
"step": 39350
},
{
"epoch": 33.8931955211025,
"eval_loss": 0.3501715362071991,
"eval_runtime": 17.0734,
"eval_samples_per_second": 1113.08,
"eval_steps_per_second": 34.791,
"step": 39350
},
{
"epoch": 33.93626184323859,
"grad_norm": 0.41994020342826843,
"learning_rate": 6.425839793281653e-05,
"loss": 0.27803466796875,
"step": 39400
},
{
"epoch": 33.93626184323859,
"eval_loss": 0.34925201535224915,
"eval_runtime": 17.6642,
"eval_samples_per_second": 1075.85,
"eval_steps_per_second": 33.627,
"step": 39400
},
{
"epoch": 33.979328165374675,
"grad_norm": 0.41478756070137024,
"learning_rate": 6.408613264427218e-05,
"loss": 0.2822599220275879,
"step": 39450
},
{
"epoch": 33.979328165374675,
"eval_loss": 0.3459310829639435,
"eval_runtime": 17.86,
"eval_samples_per_second": 1064.055,
"eval_steps_per_second": 33.259,
"step": 39450
},
{
"epoch": 34.022394487510766,
"grad_norm": 0.46041932702064514,
"learning_rate": 6.391386735572782e-05,
"loss": 0.2755163383483887,
"step": 39500
},
{
"epoch": 34.022394487510766,
"eval_loss": 0.3456686735153198,
"eval_runtime": 17.9365,
"eval_samples_per_second": 1059.515,
"eval_steps_per_second": 33.117,
"step": 39500
},
{
"epoch": 34.06546080964686,
"grad_norm": 0.4062725603580475,
"learning_rate": 6.374160206718347e-05,
"loss": 0.2705917167663574,
"step": 39550
},
{
"epoch": 34.06546080964686,
"eval_loss": 0.34850847721099854,
"eval_runtime": 17.8359,
"eval_samples_per_second": 1065.489,
"eval_steps_per_second": 33.304,
"step": 39550
},
{
"epoch": 34.10852713178294,
"grad_norm": 0.3924962878227234,
"learning_rate": 6.356933677863912e-05,
"loss": 0.2703757858276367,
"step": 39600
},
{
"epoch": 34.10852713178294,
"eval_loss": 0.344966858625412,
"eval_runtime": 17.7503,
"eval_samples_per_second": 1070.632,
"eval_steps_per_second": 33.464,
"step": 39600
},
{
"epoch": 34.151593453919034,
"grad_norm": 0.4565719962120056,
"learning_rate": 6.339707149009474e-05,
"loss": 0.2750784683227539,
"step": 39650
},
{
"epoch": 34.151593453919034,
"eval_loss": 0.3534243106842041,
"eval_runtime": 17.4653,
"eval_samples_per_second": 1088.103,
"eval_steps_per_second": 34.01,
"step": 39650
},
{
"epoch": 34.194659776055126,
"grad_norm": 0.4472062289714813,
"learning_rate": 6.322480620155039e-05,
"loss": 0.2693879890441895,
"step": 39700
},
{
"epoch": 34.194659776055126,
"eval_loss": 0.35429486632347107,
"eval_runtime": 17.4596,
"eval_samples_per_second": 1088.456,
"eval_steps_per_second": 34.021,
"step": 39700
},
{
"epoch": 34.23772609819122,
"grad_norm": 0.4285549223423004,
"learning_rate": 6.305254091300603e-05,
"loss": 0.26811901092529294,
"step": 39750
},
{
"epoch": 34.23772609819122,
"eval_loss": 0.3482956290245056,
"eval_runtime": 17.6328,
"eval_samples_per_second": 1077.762,
"eval_steps_per_second": 33.687,
"step": 39750
},
{
"epoch": 34.2807924203273,
"grad_norm": 0.4658809006214142,
"learning_rate": 6.288027562446167e-05,
"loss": 0.27812984466552737,
"step": 39800
},
{
"epoch": 34.2807924203273,
"eval_loss": 0.34879428148269653,
"eval_runtime": 17.7477,
"eval_samples_per_second": 1070.784,
"eval_steps_per_second": 33.469,
"step": 39800
},
{
"epoch": 34.32385874246339,
"grad_norm": 0.434271901845932,
"learning_rate": 6.270801033591731e-05,
"loss": 0.27308488845825196,
"step": 39850
},
{
"epoch": 34.32385874246339,
"eval_loss": 0.3474234938621521,
"eval_runtime": 18.2801,
"eval_samples_per_second": 1039.603,
"eval_steps_per_second": 32.494,
"step": 39850
},
{
"epoch": 34.366925064599485,
"grad_norm": 0.6403506994247437,
"learning_rate": 6.253574504737297e-05,
"loss": 0.2769889831542969,
"step": 39900
},
{
"epoch": 34.366925064599485,
"eval_loss": 0.3394336402416229,
"eval_runtime": 18.1496,
"eval_samples_per_second": 1047.077,
"eval_steps_per_second": 32.728,
"step": 39900
},
{
"epoch": 34.40999138673557,
"grad_norm": 0.4239259958267212,
"learning_rate": 6.23634797588286e-05,
"loss": 0.27345130920410154,
"step": 39950
},
{
"epoch": 34.40999138673557,
"eval_loss": 0.3495025038719177,
"eval_runtime": 18.9968,
"eval_samples_per_second": 1000.379,
"eval_steps_per_second": 31.268,
"step": 39950
},
{
"epoch": 34.45305770887166,
"grad_norm": 0.3801671862602234,
"learning_rate": 6.219121447028424e-05,
"loss": 0.27393379211425783,
"step": 40000
},
{
"epoch": 34.45305770887166,
"eval_loss": 0.34322065114974976,
"eval_runtime": 17.9236,
"eval_samples_per_second": 1060.28,
"eval_steps_per_second": 33.141,
"step": 40000
},
{
"epoch": 34.49612403100775,
"grad_norm": 0.4332405626773834,
"learning_rate": 6.201894918173988e-05,
"loss": 0.2788553237915039,
"step": 40050
},
{
"epoch": 34.49612403100775,
"eval_loss": 0.34779658913612366,
"eval_runtime": 17.437,
"eval_samples_per_second": 1089.869,
"eval_steps_per_second": 34.066,
"step": 40050
},
{
"epoch": 34.539190353143844,
"grad_norm": 0.3911673128604889,
"learning_rate": 6.184668389319552e-05,
"loss": 0.2744323348999023,
"step": 40100
},
{
"epoch": 34.539190353143844,
"eval_loss": 0.34955883026123047,
"eval_runtime": 17.213,
"eval_samples_per_second": 1104.047,
"eval_steps_per_second": 34.509,
"step": 40100
},
{
"epoch": 34.58225667527993,
"grad_norm": 0.5038763284683228,
"learning_rate": 6.167441860465117e-05,
"loss": 0.2725095558166504,
"step": 40150
},
{
"epoch": 34.58225667527993,
"eval_loss": 0.34505942463874817,
"eval_runtime": 17.948,
"eval_samples_per_second": 1058.839,
"eval_steps_per_second": 33.096,
"step": 40150
},
{
"epoch": 34.62532299741602,
"grad_norm": 0.4034155011177063,
"learning_rate": 6.150215331610681e-05,
"loss": 0.272111701965332,
"step": 40200
},
{
"epoch": 34.62532299741602,
"eval_loss": 0.3441019654273987,
"eval_runtime": 17.5975,
"eval_samples_per_second": 1079.924,
"eval_steps_per_second": 33.755,
"step": 40200
},
{
"epoch": 34.66838931955211,
"grad_norm": 0.45396238565444946,
"learning_rate": 6.132988802756245e-05,
"loss": 0.27917551040649413,
"step": 40250
},
{
"epoch": 34.66838931955211,
"eval_loss": 0.3423817455768585,
"eval_runtime": 17.1731,
"eval_samples_per_second": 1106.616,
"eval_steps_per_second": 34.589,
"step": 40250
},
{
"epoch": 34.7114556416882,
"grad_norm": 0.4661038815975189,
"learning_rate": 6.115762273901809e-05,
"loss": 0.280230770111084,
"step": 40300
},
{
"epoch": 34.7114556416882,
"eval_loss": 0.347525417804718,
"eval_runtime": 17.9207,
"eval_samples_per_second": 1060.449,
"eval_steps_per_second": 33.146,
"step": 40300
},
{
"epoch": 34.75452196382429,
"grad_norm": 0.519680917263031,
"learning_rate": 6.0985357450473734e-05,
"loss": 0.2720791244506836,
"step": 40350
},
{
"epoch": 34.75452196382429,
"eval_loss": 0.33955568075180054,
"eval_runtime": 17.8871,
"eval_samples_per_second": 1062.442,
"eval_steps_per_second": 33.208,
"step": 40350
},
{
"epoch": 34.79758828596038,
"grad_norm": 0.523288905620575,
"learning_rate": 6.0813092161929376e-05,
"loss": 0.27809135437011717,
"step": 40400
},
{
"epoch": 34.79758828596038,
"eval_loss": 0.3517861068248749,
"eval_runtime": 17.9105,
"eval_samples_per_second": 1061.055,
"eval_steps_per_second": 33.165,
"step": 40400
},
{
"epoch": 34.84065460809647,
"grad_norm": 0.4025672972202301,
"learning_rate": 6.064082687338502e-05,
"loss": 0.2726823043823242,
"step": 40450
},
{
"epoch": 34.84065460809647,
"eval_loss": 0.3407162129878998,
"eval_runtime": 17.7528,
"eval_samples_per_second": 1070.477,
"eval_steps_per_second": 33.459,
"step": 40450
},
{
"epoch": 34.883720930232556,
"grad_norm": 0.4430167078971863,
"learning_rate": 6.046856158484065e-05,
"loss": 0.2641313934326172,
"step": 40500
},
{
"epoch": 34.883720930232556,
"eval_loss": 0.3425806164741516,
"eval_runtime": 17.4356,
"eval_samples_per_second": 1089.951,
"eval_steps_per_second": 34.068,
"step": 40500
},
{
"epoch": 34.92678725236865,
"grad_norm": 0.45299965143203735,
"learning_rate": 6.0296296296296295e-05,
"loss": 0.2765083312988281,
"step": 40550
},
{
"epoch": 34.92678725236865,
"eval_loss": 0.34253862500190735,
"eval_runtime": 16.4253,
"eval_samples_per_second": 1156.993,
"eval_steps_per_second": 36.164,
"step": 40550
},
{
"epoch": 34.96985357450474,
"grad_norm": 0.5068448781967163,
"learning_rate": 6.0124031007751944e-05,
"loss": 0.27179483413696287,
"step": 40600
},
{
"epoch": 34.96985357450474,
"eval_loss": 0.33738985657691956,
"eval_runtime": 17.5247,
"eval_samples_per_second": 1084.411,
"eval_steps_per_second": 33.895,
"step": 40600
},
{
"epoch": 35.012919896640824,
"grad_norm": 0.4455919861793518,
"learning_rate": 5.9951765719207586e-05,
"loss": 0.27295986175537107,
"step": 40650
},
{
"epoch": 35.012919896640824,
"eval_loss": 0.3451545536518097,
"eval_runtime": 17.1512,
"eval_samples_per_second": 1108.03,
"eval_steps_per_second": 34.633,
"step": 40650
},
{
"epoch": 35.055986218776916,
"grad_norm": 0.4618251919746399,
"learning_rate": 5.977950043066323e-05,
"loss": 0.27406063079833987,
"step": 40700
},
{
"epoch": 35.055986218776916,
"eval_loss": 0.34172993898391724,
"eval_runtime": 17.2228,
"eval_samples_per_second": 1103.424,
"eval_steps_per_second": 34.489,
"step": 40700
},
{
"epoch": 35.09905254091301,
"grad_norm": 0.48039114475250244,
"learning_rate": 5.9607235142118864e-05,
"loss": 0.2712633514404297,
"step": 40750
},
{
"epoch": 35.09905254091301,
"eval_loss": 0.34705230593681335,
"eval_runtime": 17.8501,
"eval_samples_per_second": 1064.646,
"eval_steps_per_second": 33.277,
"step": 40750
},
{
"epoch": 35.1421188630491,
"grad_norm": 0.39087414741516113,
"learning_rate": 5.9434969853574506e-05,
"loss": 0.2726176452636719,
"step": 40800
},
{
"epoch": 35.1421188630491,
"eval_loss": 0.35245054960250854,
"eval_runtime": 17.9496,
"eval_samples_per_second": 1058.745,
"eval_steps_per_second": 33.093,
"step": 40800
},
{
"epoch": 35.18518518518518,
"grad_norm": 0.5085448622703552,
"learning_rate": 5.926270456503015e-05,
"loss": 0.27361579895019533,
"step": 40850
},
{
"epoch": 35.18518518518518,
"eval_loss": 0.34160110354423523,
"eval_runtime": 17.7554,
"eval_samples_per_second": 1070.322,
"eval_steps_per_second": 33.455,
"step": 40850
},
{
"epoch": 35.228251507321275,
"grad_norm": 0.3623582124710083,
"learning_rate": 5.909043927648579e-05,
"loss": 0.26523059844970703,
"step": 40900
},
{
"epoch": 35.228251507321275,
"eval_loss": 0.3383851945400238,
"eval_runtime": 18.3149,
"eval_samples_per_second": 1037.623,
"eval_steps_per_second": 32.433,
"step": 40900
},
{
"epoch": 35.27131782945737,
"grad_norm": 0.42701098322868347,
"learning_rate": 5.8918173987941426e-05,
"loss": 0.2780462265014648,
"step": 40950
},
{
"epoch": 35.27131782945737,
"eval_loss": 0.34134912490844727,
"eval_runtime": 17.9587,
"eval_samples_per_second": 1058.205,
"eval_steps_per_second": 33.076,
"step": 40950
},
{
"epoch": 35.31438415159345,
"grad_norm": 0.4851948916912079,
"learning_rate": 5.874590869939708e-05,
"loss": 0.2788583374023437,
"step": 41000
},
{
"epoch": 35.31438415159345,
"eval_loss": 0.33466637134552,
"eval_runtime": 16.9935,
"eval_samples_per_second": 1118.307,
"eval_steps_per_second": 34.954,
"step": 41000
},
{
"epoch": 35.35745047372954,
"grad_norm": 0.3875599205493927,
"learning_rate": 5.8573643410852716e-05,
"loss": 0.26727346420288084,
"step": 41050
},
{
"epoch": 35.35745047372954,
"eval_loss": 0.34064584970474243,
"eval_runtime": 17.7144,
"eval_samples_per_second": 1072.8,
"eval_steps_per_second": 33.532,
"step": 41050
},
{
"epoch": 35.400516795865634,
"grad_norm": 0.46586155891418457,
"learning_rate": 5.840137812230836e-05,
"loss": 0.27534004211425783,
"step": 41100
},
{
"epoch": 35.400516795865634,
"eval_loss": 0.33610883355140686,
"eval_runtime": 17.8537,
"eval_samples_per_second": 1064.432,
"eval_steps_per_second": 33.27,
"step": 41100
},
{
"epoch": 35.443583118001726,
"grad_norm": 0.5082590579986572,
"learning_rate": 5.8229112833764e-05,
"loss": 0.2773623275756836,
"step": 41150
},
{
"epoch": 35.443583118001726,
"eval_loss": 0.3373280465602875,
"eval_runtime": 17.3963,
"eval_samples_per_second": 1092.419,
"eval_steps_per_second": 34.145,
"step": 41150
},
{
"epoch": 35.48664944013781,
"grad_norm": 0.5587695837020874,
"learning_rate": 5.8056847545219636e-05,
"loss": 0.27423599243164065,
"step": 41200
},
{
"epoch": 35.48664944013781,
"eval_loss": 0.34136703610420227,
"eval_runtime": 17.8605,
"eval_samples_per_second": 1064.026,
"eval_steps_per_second": 33.258,
"step": 41200
},
{
"epoch": 35.5297157622739,
"grad_norm": 0.4970998466014862,
"learning_rate": 5.788458225667528e-05,
"loss": 0.27262737274169924,
"step": 41250
},
{
"epoch": 35.5297157622739,
"eval_loss": 0.3357755243778229,
"eval_runtime": 17.8891,
"eval_samples_per_second": 1062.322,
"eval_steps_per_second": 33.205,
"step": 41250
},
{
"epoch": 35.572782084409994,
"grad_norm": 0.4705260694026947,
"learning_rate": 5.771231696813092e-05,
"loss": 0.276383171081543,
"step": 41300
},
{
"epoch": 35.572782084409994,
"eval_loss": 0.34233272075653076,
"eval_runtime": 18.0064,
"eval_samples_per_second": 1055.401,
"eval_steps_per_second": 32.988,
"step": 41300
},
{
"epoch": 35.61584840654608,
"grad_norm": 0.4661819636821747,
"learning_rate": 5.754005167958657e-05,
"loss": 0.2638749313354492,
"step": 41350
},
{
"epoch": 35.61584840654608,
"eval_loss": 0.3379902243614197,
"eval_runtime": 17.6564,
"eval_samples_per_second": 1076.321,
"eval_steps_per_second": 33.642,
"step": 41350
},
{
"epoch": 35.65891472868217,
"grad_norm": 0.4111165702342987,
"learning_rate": 5.736778639104221e-05,
"loss": 0.26883033752441404,
"step": 41400
},
{
"epoch": 35.65891472868217,
"eval_loss": 0.3382817208766937,
"eval_runtime": 17.5428,
"eval_samples_per_second": 1083.296,
"eval_steps_per_second": 33.86,
"step": 41400
},
{
"epoch": 35.70198105081826,
"grad_norm": 0.41107845306396484,
"learning_rate": 5.719552110249785e-05,
"loss": 0.2747551727294922,
"step": 41450
},
{
"epoch": 35.70198105081826,
"eval_loss": 0.34029775857925415,
"eval_runtime": 17.4782,
"eval_samples_per_second": 1087.295,
"eval_steps_per_second": 33.985,
"step": 41450
},
{
"epoch": 35.74504737295435,
"grad_norm": 0.41739150881767273,
"learning_rate": 5.702325581395349e-05,
"loss": 0.2748952102661133,
"step": 41500
},
{
"epoch": 35.74504737295435,
"eval_loss": 0.34697577357292175,
"eval_runtime": 17.7255,
"eval_samples_per_second": 1072.128,
"eval_steps_per_second": 33.511,
"step": 41500
},
{
"epoch": 35.78811369509044,
"grad_norm": 0.41582760214805603,
"learning_rate": 5.685099052540913e-05,
"loss": 0.27996679306030275,
"step": 41550
},
{
"epoch": 35.78811369509044,
"eval_loss": 0.34205004572868347,
"eval_runtime": 17.8009,
"eval_samples_per_second": 1067.589,
"eval_steps_per_second": 33.369,
"step": 41550
},
{
"epoch": 35.83118001722653,
"grad_norm": 0.4034820795059204,
"learning_rate": 5.667872523686477e-05,
"loss": 0.2681726455688477,
"step": 41600
},
{
"epoch": 35.83118001722653,
"eval_loss": 0.3477557599544525,
"eval_runtime": 17.2857,
"eval_samples_per_second": 1099.407,
"eval_steps_per_second": 34.364,
"step": 41600
},
{
"epoch": 35.87424633936262,
"grad_norm": 0.5131692290306091,
"learning_rate": 5.6506459948320415e-05,
"loss": 0.2685848617553711,
"step": 41650
},
{
"epoch": 35.87424633936262,
"eval_loss": 0.34424716234207153,
"eval_runtime": 17.9277,
"eval_samples_per_second": 1060.036,
"eval_steps_per_second": 33.133,
"step": 41650
},
{
"epoch": 35.917312661498705,
"grad_norm": 0.38450688123703003,
"learning_rate": 5.6334194659776064e-05,
"loss": 0.2745730972290039,
"step": 41700
},
{
"epoch": 35.917312661498705,
"eval_loss": 0.3426118791103363,
"eval_runtime": 17.6578,
"eval_samples_per_second": 1076.236,
"eval_steps_per_second": 33.639,
"step": 41700
},
{
"epoch": 35.9603789836348,
"grad_norm": 0.5067197680473328,
"learning_rate": 5.6161929371231706e-05,
"loss": 0.27472471237182616,
"step": 41750
},
{
"epoch": 35.9603789836348,
"eval_loss": 0.34160706400871277,
"eval_runtime": 17.8861,
"eval_samples_per_second": 1062.498,
"eval_steps_per_second": 33.21,
"step": 41750
},
{
"epoch": 36.00344530577089,
"grad_norm": 0.4307672381401062,
"learning_rate": 5.598966408268734e-05,
"loss": 0.26797880172729494,
"step": 41800
},
{
"epoch": 36.00344530577089,
"eval_loss": 0.3427271842956543,
"eval_runtime": 17.9622,
"eval_samples_per_second": 1058.0,
"eval_steps_per_second": 33.069,
"step": 41800
},
{
"epoch": 36.04651162790697,
"grad_norm": 0.5117197632789612,
"learning_rate": 5.581739879414298e-05,
"loss": 0.27051544189453125,
"step": 41850
},
{
"epoch": 36.04651162790697,
"eval_loss": 0.34072092175483704,
"eval_runtime": 17.8202,
"eval_samples_per_second": 1066.432,
"eval_steps_per_second": 33.333,
"step": 41850
},
{
"epoch": 36.089577950043065,
"grad_norm": 0.4320580065250397,
"learning_rate": 5.5645133505598625e-05,
"loss": 0.2716988754272461,
"step": 41900
},
{
"epoch": 36.089577950043065,
"eval_loss": 0.3381943702697754,
"eval_runtime": 17.5501,
"eval_samples_per_second": 1082.844,
"eval_steps_per_second": 33.846,
"step": 41900
},
{
"epoch": 36.13264427217916,
"grad_norm": 0.4369736909866333,
"learning_rate": 5.547286821705426e-05,
"loss": 0.27702898025512696,
"step": 41950
},
{
"epoch": 36.13264427217916,
"eval_loss": 0.3371625542640686,
"eval_runtime": 17.3949,
"eval_samples_per_second": 1092.501,
"eval_steps_per_second": 34.148,
"step": 41950
},
{
"epoch": 36.17571059431525,
"grad_norm": 0.44421809911727905,
"learning_rate": 5.53006029285099e-05,
"loss": 0.2704465866088867,
"step": 42000
},
{
"epoch": 36.17571059431525,
"eval_loss": 0.3457271158695221,
"eval_runtime": 17.9037,
"eval_samples_per_second": 1061.455,
"eval_steps_per_second": 33.177,
"step": 42000
},
{
"epoch": 36.21877691645133,
"grad_norm": 0.3486070930957794,
"learning_rate": 5.512833763996555e-05,
"loss": 0.26162067413330076,
"step": 42050
},
{
"epoch": 36.21877691645133,
"eval_loss": 0.34183192253112793,
"eval_runtime": 16.7764,
"eval_samples_per_second": 1132.779,
"eval_steps_per_second": 35.407,
"step": 42050
},
{
"epoch": 36.261843238587424,
"grad_norm": 0.42011758685112,
"learning_rate": 5.4956072351421194e-05,
"loss": 0.26230512619018553,
"step": 42100
},
{
"epoch": 36.261843238587424,
"eval_loss": 0.3458743095397949,
"eval_runtime": 17.7406,
"eval_samples_per_second": 1071.213,
"eval_steps_per_second": 33.482,
"step": 42100
},
{
"epoch": 36.304909560723516,
"grad_norm": 0.5051075220108032,
"learning_rate": 5.4783807062876836e-05,
"loss": 0.26472633361816406,
"step": 42150
},
{
"epoch": 36.304909560723516,
"eval_loss": 0.3397945761680603,
"eval_runtime": 17.5211,
"eval_samples_per_second": 1084.636,
"eval_steps_per_second": 33.902,
"step": 42150
},
{
"epoch": 36.3479758828596,
"grad_norm": 0.4122444689273834,
"learning_rate": 5.461154177433248e-05,
"loss": 0.27144794464111327,
"step": 42200
},
{
"epoch": 36.3479758828596,
"eval_loss": 0.34024304151535034,
"eval_runtime": 17.8293,
"eval_samples_per_second": 1065.883,
"eval_steps_per_second": 33.316,
"step": 42200
},
{
"epoch": 36.39104220499569,
"grad_norm": 0.4055521786212921,
"learning_rate": 5.443927648578811e-05,
"loss": 0.26872739791870115,
"step": 42250
},
{
"epoch": 36.39104220499569,
"eval_loss": 0.3468708395957947,
"eval_runtime": 17.6978,
"eval_samples_per_second": 1073.804,
"eval_steps_per_second": 33.563,
"step": 42250
},
{
"epoch": 36.434108527131784,
"grad_norm": 0.4319891333580017,
"learning_rate": 5.4267011197243755e-05,
"loss": 0.26956621170043943,
"step": 42300
},
{
"epoch": 36.434108527131784,
"eval_loss": 0.34552404284477234,
"eval_runtime": 16.9548,
"eval_samples_per_second": 1120.861,
"eval_steps_per_second": 35.034,
"step": 42300
},
{
"epoch": 36.477174849267875,
"grad_norm": 0.5158259272575378,
"learning_rate": 5.40947459086994e-05,
"loss": 0.276561336517334,
"step": 42350
},
{
"epoch": 36.477174849267875,
"eval_loss": 0.34414294362068176,
"eval_runtime": 17.7146,
"eval_samples_per_second": 1072.787,
"eval_steps_per_second": 33.532,
"step": 42350
},
{
"epoch": 36.52024117140396,
"grad_norm": 0.4017482399940491,
"learning_rate": 5.392248062015503e-05,
"loss": 0.2677594757080078,
"step": 42400
},
{
"epoch": 36.52024117140396,
"eval_loss": 0.34336283802986145,
"eval_runtime": 16.3716,
"eval_samples_per_second": 1160.791,
"eval_steps_per_second": 36.282,
"step": 42400
},
{
"epoch": 36.56330749354005,
"grad_norm": 0.48331621289253235,
"learning_rate": 5.375021533161069e-05,
"loss": 0.27375417709350586,
"step": 42450
},
{
"epoch": 36.56330749354005,
"eval_loss": 0.3469257652759552,
"eval_runtime": 17.6382,
"eval_samples_per_second": 1077.433,
"eval_steps_per_second": 33.677,
"step": 42450
},
{
"epoch": 36.60637381567614,
"grad_norm": 0.47335347533226013,
"learning_rate": 5.357795004306633e-05,
"loss": 0.2711253356933594,
"step": 42500
},
{
"epoch": 36.60637381567614,
"eval_loss": 0.3430469334125519,
"eval_runtime": 17.6249,
"eval_samples_per_second": 1078.249,
"eval_steps_per_second": 33.702,
"step": 42500
},
{
"epoch": 36.64944013781223,
"grad_norm": 0.4714546203613281,
"learning_rate": 5.3405684754521966e-05,
"loss": 0.2674391555786133,
"step": 42550
},
{
"epoch": 36.64944013781223,
"eval_loss": 0.3497195839881897,
"eval_runtime": 17.853,
"eval_samples_per_second": 1064.473,
"eval_steps_per_second": 33.272,
"step": 42550
},
{
"epoch": 36.69250645994832,
"grad_norm": 0.41258716583251953,
"learning_rate": 5.323341946597761e-05,
"loss": 0.26646110534667966,
"step": 42600
},
{
"epoch": 36.69250645994832,
"eval_loss": 0.34118419885635376,
"eval_runtime": 17.8783,
"eval_samples_per_second": 1062.967,
"eval_steps_per_second": 33.225,
"step": 42600
},
{
"epoch": 36.73557278208441,
"grad_norm": 0.45262160897254944,
"learning_rate": 5.306115417743325e-05,
"loss": 0.2719459533691406,
"step": 42650
},
{
"epoch": 36.73557278208441,
"eval_loss": 0.33805808424949646,
"eval_runtime": 18.0153,
"eval_samples_per_second": 1054.881,
"eval_steps_per_second": 32.972,
"step": 42650
},
{
"epoch": 36.7786391042205,
"grad_norm": 0.36781540513038635,
"learning_rate": 5.2888888888888885e-05,
"loss": 0.2655976486206055,
"step": 42700
},
{
"epoch": 36.7786391042205,
"eval_loss": 0.34015172719955444,
"eval_runtime": 17.5197,
"eval_samples_per_second": 1084.721,
"eval_steps_per_second": 33.905,
"step": 42700
},
{
"epoch": 36.82170542635659,
"grad_norm": 0.47751036286354065,
"learning_rate": 5.271662360034453e-05,
"loss": 0.270654182434082,
"step": 42750
},
{
"epoch": 36.82170542635659,
"eval_loss": 0.3435145318508148,
"eval_runtime": 17.1934,
"eval_samples_per_second": 1105.309,
"eval_steps_per_second": 34.548,
"step": 42750
},
{
"epoch": 36.86477174849268,
"grad_norm": 0.5343551635742188,
"learning_rate": 5.2544358311800176e-05,
"loss": 0.27388362884521483,
"step": 42800
},
{
"epoch": 36.86477174849268,
"eval_loss": 0.33774811029434204,
"eval_runtime": 17.9087,
"eval_samples_per_second": 1061.158,
"eval_steps_per_second": 33.168,
"step": 42800
},
{
"epoch": 36.90783807062877,
"grad_norm": 0.4724558889865875,
"learning_rate": 5.237209302325582e-05,
"loss": 0.26862293243408203,
"step": 42850
},
{
"epoch": 36.90783807062877,
"eval_loss": 0.3435251712799072,
"eval_runtime": 17.0757,
"eval_samples_per_second": 1112.929,
"eval_steps_per_second": 34.786,
"step": 42850
},
{
"epoch": 36.950904392764855,
"grad_norm": 0.5167089104652405,
"learning_rate": 5.219982773471146e-05,
"loss": 0.27372997283935546,
"step": 42900
},
{
"epoch": 36.950904392764855,
"eval_loss": 0.34223997592926025,
"eval_runtime": 17.7156,
"eval_samples_per_second": 1072.728,
"eval_steps_per_second": 33.53,
"step": 42900
},
{
"epoch": 36.99397071490095,
"grad_norm": 0.5571605563163757,
"learning_rate": 5.20275624461671e-05,
"loss": 0.2704277038574219,
"step": 42950
},
{
"epoch": 36.99397071490095,
"eval_loss": 0.34202906489372253,
"eval_runtime": 17.7015,
"eval_samples_per_second": 1073.583,
"eval_steps_per_second": 33.557,
"step": 42950
},
{
"epoch": 37.03703703703704,
"grad_norm": 0.5319112539291382,
"learning_rate": 5.185529715762274e-05,
"loss": 0.26464887619018557,
"step": 43000
},
{
"epoch": 37.03703703703704,
"eval_loss": 0.33549952507019043,
"eval_runtime": 17.6629,
"eval_samples_per_second": 1075.925,
"eval_steps_per_second": 33.63,
"step": 43000
},
{
"epoch": 37.08010335917313,
"grad_norm": 0.4691065549850464,
"learning_rate": 5.168303186907838e-05,
"loss": 0.26262826919555665,
"step": 43050
},
{
"epoch": 37.08010335917313,
"eval_loss": 0.33849185705184937,
"eval_runtime": 17.7204,
"eval_samples_per_second": 1072.434,
"eval_steps_per_second": 33.521,
"step": 43050
},
{
"epoch": 37.123169681309214,
"grad_norm": 0.46527716517448425,
"learning_rate": 5.151076658053402e-05,
"loss": 0.2713047981262207,
"step": 43100
},
{
"epoch": 37.123169681309214,
"eval_loss": 0.3408891260623932,
"eval_runtime": 17.364,
"eval_samples_per_second": 1094.448,
"eval_steps_per_second": 34.209,
"step": 43100
},
{
"epoch": 37.166236003445306,
"grad_norm": 0.552895188331604,
"learning_rate": 5.133850129198967e-05,
"loss": 0.2643990707397461,
"step": 43150
},
{
"epoch": 37.166236003445306,
"eval_loss": 0.3359718918800354,
"eval_runtime": 17.9648,
"eval_samples_per_second": 1057.846,
"eval_steps_per_second": 33.065,
"step": 43150
},
{
"epoch": 37.2093023255814,
"grad_norm": 0.3904784917831421,
"learning_rate": 5.116623600344531e-05,
"loss": 0.2650018310546875,
"step": 43200
},
{
"epoch": 37.2093023255814,
"eval_loss": 0.34700727462768555,
"eval_runtime": 17.1055,
"eval_samples_per_second": 1110.985,
"eval_steps_per_second": 34.726,
"step": 43200
},
{
"epoch": 37.25236864771748,
"grad_norm": 0.6356217265129089,
"learning_rate": 5.099397071490095e-05,
"loss": 0.27305797576904295,
"step": 43250
},
{
"epoch": 37.25236864771748,
"eval_loss": 0.34335824847221375,
"eval_runtime": 17.7907,
"eval_samples_per_second": 1068.199,
"eval_steps_per_second": 33.388,
"step": 43250
},
{
"epoch": 37.295434969853574,
"grad_norm": 0.38687363266944885,
"learning_rate": 5.082170542635659e-05,
"loss": 0.2676750946044922,
"step": 43300
},
{
"epoch": 37.295434969853574,
"eval_loss": 0.34360429644584656,
"eval_runtime": 17.8278,
"eval_samples_per_second": 1065.978,
"eval_steps_per_second": 33.319,
"step": 43300
},
{
"epoch": 37.338501291989665,
"grad_norm": 0.5335272550582886,
"learning_rate": 5.064944013781223e-05,
"loss": 0.2677302360534668,
"step": 43350
},
{
"epoch": 37.338501291989665,
"eval_loss": 0.343604177236557,
"eval_runtime": 17.5758,
"eval_samples_per_second": 1081.257,
"eval_steps_per_second": 33.796,
"step": 43350
},
{
"epoch": 37.38156761412576,
"grad_norm": 0.46221479773521423,
"learning_rate": 5.0477174849267875e-05,
"loss": 0.2667576789855957,
"step": 43400
},
{
"epoch": 37.38156761412576,
"eval_loss": 0.34761863946914673,
"eval_runtime": 17.7665,
"eval_samples_per_second": 1069.655,
"eval_steps_per_second": 33.434,
"step": 43400
},
{
"epoch": 37.42463393626184,
"grad_norm": 0.48034900426864624,
"learning_rate": 5.030490956072351e-05,
"loss": 0.2662837219238281,
"step": 43450
},
{
"epoch": 37.42463393626184,
"eval_loss": 0.34295064210891724,
"eval_runtime": 17.5091,
"eval_samples_per_second": 1085.381,
"eval_steps_per_second": 33.925,
"step": 43450
},
{
"epoch": 37.46770025839793,
"grad_norm": 0.44220060110092163,
"learning_rate": 5.0132644272179166e-05,
"loss": 0.26716251373291017,
"step": 43500
},
{
"epoch": 37.46770025839793,
"eval_loss": 0.33440375328063965,
"eval_runtime": 18.4518,
"eval_samples_per_second": 1029.927,
"eval_steps_per_second": 32.192,
"step": 43500
},
{
"epoch": 37.510766580534025,
"grad_norm": 0.36467257142066956,
"learning_rate": 4.99603789836348e-05,
"loss": 0.2673937225341797,
"step": 43550
},
{
"epoch": 37.510766580534025,
"eval_loss": 0.3415800631046295,
"eval_runtime": 17.1191,
"eval_samples_per_second": 1110.104,
"eval_steps_per_second": 34.698,
"step": 43550
},
{
"epoch": 37.55383290267011,
"grad_norm": 0.5146782994270325,
"learning_rate": 4.978811369509044e-05,
"loss": 0.2724634552001953,
"step": 43600
},
{
"epoch": 37.55383290267011,
"eval_loss": 0.33940932154655457,
"eval_runtime": 17.7888,
"eval_samples_per_second": 1068.311,
"eval_steps_per_second": 33.392,
"step": 43600
},
{
"epoch": 37.5968992248062,
"grad_norm": 0.4268561005592346,
"learning_rate": 4.9615848406546085e-05,
"loss": 0.26595291137695315,
"step": 43650
},
{
"epoch": 37.5968992248062,
"eval_loss": 0.3469422459602356,
"eval_runtime": 20.077,
"eval_samples_per_second": 946.554,
"eval_steps_per_second": 29.586,
"step": 43650
},
{
"epoch": 37.63996554694229,
"grad_norm": 0.4228191673755646,
"learning_rate": 4.944358311800172e-05,
"loss": 0.27800693511962893,
"step": 43700
},
{
"epoch": 37.63996554694229,
"eval_loss": 0.34704458713531494,
"eval_runtime": 19.7857,
"eval_samples_per_second": 960.494,
"eval_steps_per_second": 30.022,
"step": 43700
},
{
"epoch": 37.683031869078384,
"grad_norm": 0.4571135342121124,
"learning_rate": 4.927131782945736e-05,
"loss": 0.26782615661621095,
"step": 43750
},
{
"epoch": 37.683031869078384,
"eval_loss": 0.3405662178993225,
"eval_runtime": 19.7688,
"eval_samples_per_second": 961.311,
"eval_steps_per_second": 30.047,
"step": 43750
},
{
"epoch": 37.72609819121447,
"grad_norm": 0.5446920394897461,
"learning_rate": 4.909905254091301e-05,
"loss": 0.2676548957824707,
"step": 43800
},
{
"epoch": 37.72609819121447,
"eval_loss": 0.3393403887748718,
"eval_runtime": 18.1421,
"eval_samples_per_second": 1047.508,
"eval_steps_per_second": 32.742,
"step": 43800
},
{
"epoch": 37.76916451335056,
"grad_norm": 0.39264625310897827,
"learning_rate": 4.892678725236865e-05,
"loss": 0.2752561569213867,
"step": 43850
},
{
"epoch": 37.76916451335056,
"eval_loss": 0.3424234390258789,
"eval_runtime": 17.7583,
"eval_samples_per_second": 1070.148,
"eval_steps_per_second": 33.449,
"step": 43850
},
{
"epoch": 37.81223083548665,
"grad_norm": 0.44023290276527405,
"learning_rate": 4.875452196382429e-05,
"loss": 0.27062139511108396,
"step": 43900
},
{
"epoch": 37.81223083548665,
"eval_loss": 0.3438037931919098,
"eval_runtime": 16.617,
"eval_samples_per_second": 1143.646,
"eval_steps_per_second": 35.746,
"step": 43900
},
{
"epoch": 37.855297157622736,
"grad_norm": 0.4684908092021942,
"learning_rate": 4.858225667527994e-05,
"loss": 0.27128431320190427,
"step": 43950
},
{
"epoch": 37.855297157622736,
"eval_loss": 0.33739370107650757,
"eval_runtime": 17.9564,
"eval_samples_per_second": 1058.34,
"eval_steps_per_second": 33.08,
"step": 43950
},
{
"epoch": 37.89836347975883,
"grad_norm": 0.468597412109375,
"learning_rate": 4.840999138673557e-05,
"loss": 0.26442058563232423,
"step": 44000
},
{
"epoch": 37.89836347975883,
"eval_loss": 0.34331783652305603,
"eval_runtime": 18.2636,
"eval_samples_per_second": 1040.541,
"eval_steps_per_second": 32.524,
"step": 44000
},
{
"epoch": 37.94142980189492,
"grad_norm": 0.5711420774459839,
"learning_rate": 4.8237726098191215e-05,
"loss": 0.2691427993774414,
"step": 44050
},
{
"epoch": 37.94142980189492,
"eval_loss": 0.34428438544273376,
"eval_runtime": 18.0829,
"eval_samples_per_second": 1050.936,
"eval_steps_per_second": 32.849,
"step": 44050
},
{
"epoch": 37.98449612403101,
"grad_norm": 0.3936568796634674,
"learning_rate": 4.806546080964686e-05,
"loss": 0.2765738868713379,
"step": 44100
},
{
"epoch": 37.98449612403101,
"eval_loss": 0.34059733152389526,
"eval_runtime": 17.6121,
"eval_samples_per_second": 1079.033,
"eval_steps_per_second": 33.727,
"step": 44100
},
{
"epoch": 38.027562446167096,
"grad_norm": 0.47591227293014526,
"learning_rate": 4.78931955211025e-05,
"loss": 0.2679741668701172,
"step": 44150
},
{
"epoch": 38.027562446167096,
"eval_loss": 0.3416222631931305,
"eval_runtime": 17.8412,
"eval_samples_per_second": 1065.172,
"eval_steps_per_second": 33.294,
"step": 44150
},
{
"epoch": 38.07062876830319,
"grad_norm": 0.3419834077358246,
"learning_rate": 4.772093023255814e-05,
"loss": 0.26936922073364256,
"step": 44200
},
{
"epoch": 38.07062876830319,
"eval_loss": 0.3447894752025604,
"eval_runtime": 17.8211,
"eval_samples_per_second": 1066.378,
"eval_steps_per_second": 33.331,
"step": 44200
},
{
"epoch": 38.11369509043928,
"grad_norm": 0.4044359028339386,
"learning_rate": 4.7548664944013784e-05,
"loss": 0.2689459800720215,
"step": 44250
},
{
"epoch": 38.11369509043928,
"eval_loss": 0.33676257729530334,
"eval_runtime": 16.7593,
"eval_samples_per_second": 1133.935,
"eval_steps_per_second": 35.443,
"step": 44250
},
{
"epoch": 38.156761412575364,
"grad_norm": 0.5036713480949402,
"learning_rate": 4.7376399655469426e-05,
"loss": 0.2690622329711914,
"step": 44300
},
{
"epoch": 38.156761412575364,
"eval_loss": 0.33689960837364197,
"eval_runtime": 17.4411,
"eval_samples_per_second": 1089.612,
"eval_steps_per_second": 34.058,
"step": 44300
},
{
"epoch": 38.199827734711455,
"grad_norm": 0.4989610016345978,
"learning_rate": 4.720413436692507e-05,
"loss": 0.26547248840332033,
"step": 44350
},
{
"epoch": 38.199827734711455,
"eval_loss": 0.3423798680305481,
"eval_runtime": 17.1909,
"eval_samples_per_second": 1105.469,
"eval_steps_per_second": 34.553,
"step": 44350
},
{
"epoch": 38.24289405684755,
"grad_norm": 0.5024054050445557,
"learning_rate": 4.703186907838071e-05,
"loss": 0.26653575897216797,
"step": 44400
},
{
"epoch": 38.24289405684755,
"eval_loss": 0.33530908823013306,
"eval_runtime": 16.9929,
"eval_samples_per_second": 1118.352,
"eval_steps_per_second": 34.956,
"step": 44400
},
{
"epoch": 38.28596037898363,
"grad_norm": 0.4604988098144531,
"learning_rate": 4.6859603789836345e-05,
"loss": 0.2670548439025879,
"step": 44450
},
{
"epoch": 38.28596037898363,
"eval_loss": 0.34150001406669617,
"eval_runtime": 16.4311,
"eval_samples_per_second": 1156.588,
"eval_steps_per_second": 36.151,
"step": 44450
},
{
"epoch": 38.32902670111972,
"grad_norm": 0.45105716586112976,
"learning_rate": 4.6687338501291994e-05,
"loss": 0.2611412048339844,
"step": 44500
},
{
"epoch": 38.32902670111972,
"eval_loss": 0.34156933426856995,
"eval_runtime": 16.6019,
"eval_samples_per_second": 1144.686,
"eval_steps_per_second": 35.779,
"step": 44500
},
{
"epoch": 38.372093023255815,
"grad_norm": 0.340572327375412,
"learning_rate": 4.6515073212747636e-05,
"loss": 0.26979669570922854,
"step": 44550
},
{
"epoch": 38.372093023255815,
"eval_loss": 0.33604246377944946,
"eval_runtime": 16.975,
"eval_samples_per_second": 1119.53,
"eval_steps_per_second": 34.993,
"step": 44550
},
{
"epoch": 38.415159345391906,
"grad_norm": 0.4126022458076477,
"learning_rate": 4.634280792420327e-05,
"loss": 0.26638069152832033,
"step": 44600
},
{
"epoch": 38.415159345391906,
"eval_loss": 0.34387487173080444,
"eval_runtime": 16.8252,
"eval_samples_per_second": 1129.499,
"eval_steps_per_second": 35.304,
"step": 44600
},
{
"epoch": 38.45822566752799,
"grad_norm": 0.437680184841156,
"learning_rate": 4.6170542635658914e-05,
"loss": 0.26841705322265624,
"step": 44650
},
{
"epoch": 38.45822566752799,
"eval_loss": 0.34003522992134094,
"eval_runtime": 16.169,
"eval_samples_per_second": 1175.336,
"eval_steps_per_second": 36.737,
"step": 44650
},
{
"epoch": 38.50129198966408,
"grad_norm": 0.4710044264793396,
"learning_rate": 4.599827734711456e-05,
"loss": 0.27303672790527345,
"step": 44700
},
{
"epoch": 38.50129198966408,
"eval_loss": 0.34455159306526184,
"eval_runtime": 16.3577,
"eval_samples_per_second": 1161.78,
"eval_steps_per_second": 36.313,
"step": 44700
},
{
"epoch": 38.544358311800174,
"grad_norm": 0.4871346950531006,
"learning_rate": 4.58260120585702e-05,
"loss": 0.26453098297119143,
"step": 44750
},
{
"epoch": 38.544358311800174,
"eval_loss": 0.3411996066570282,
"eval_runtime": 17.6662,
"eval_samples_per_second": 1075.728,
"eval_steps_per_second": 33.624,
"step": 44750
},
{
"epoch": 38.58742463393626,
"grad_norm": 0.4753357470035553,
"learning_rate": 4.565374677002584e-05,
"loss": 0.26396688461303713,
"step": 44800
},
{
"epoch": 38.58742463393626,
"eval_loss": 0.33602163195610046,
"eval_runtime": 17.3508,
"eval_samples_per_second": 1095.279,
"eval_steps_per_second": 34.235,
"step": 44800
},
{
"epoch": 38.63049095607235,
"grad_norm": 0.3800760805606842,
"learning_rate": 4.548148148148149e-05,
"loss": 0.27464492797851564,
"step": 44850
},
{
"epoch": 38.63049095607235,
"eval_loss": 0.34373241662979126,
"eval_runtime": 18.5757,
"eval_samples_per_second": 1023.059,
"eval_steps_per_second": 31.977,
"step": 44850
},
{
"epoch": 38.67355727820844,
"grad_norm": 0.43500515818595886,
"learning_rate": 4.5309216192937124e-05,
"loss": 0.26696685791015623,
"step": 44900
},
{
"epoch": 38.67355727820844,
"eval_loss": 0.3394354581832886,
"eval_runtime": 18.1744,
"eval_samples_per_second": 1045.648,
"eval_steps_per_second": 32.683,
"step": 44900
},
{
"epoch": 38.71662360034453,
"grad_norm": 0.5373595356941223,
"learning_rate": 4.5136950904392766e-05,
"loss": 0.2654395866394043,
"step": 44950
},
{
"epoch": 38.71662360034453,
"eval_loss": 0.34289655089378357,
"eval_runtime": 17.7815,
"eval_samples_per_second": 1068.754,
"eval_steps_per_second": 33.406,
"step": 44950
},
{
"epoch": 38.75968992248062,
"grad_norm": 0.48044517636299133,
"learning_rate": 4.496468561584841e-05,
"loss": 0.26706460952758787,
"step": 45000
},
{
"epoch": 38.75968992248062,
"eval_loss": 0.3330392837524414,
"eval_runtime": 17.8713,
"eval_samples_per_second": 1063.381,
"eval_steps_per_second": 33.238,
"step": 45000
},
{
"epoch": 38.80275624461671,
"grad_norm": 0.4838259220123291,
"learning_rate": 4.479242032730405e-05,
"loss": 0.2641607666015625,
"step": 45050
},
{
"epoch": 38.80275624461671,
"eval_loss": 0.33986079692840576,
"eval_runtime": 18.0427,
"eval_samples_per_second": 1053.278,
"eval_steps_per_second": 32.922,
"step": 45050
},
{
"epoch": 38.8458225667528,
"grad_norm": 0.4696730077266693,
"learning_rate": 4.462015503875969e-05,
"loss": 0.2635121154785156,
"step": 45100
},
{
"epoch": 38.8458225667528,
"eval_loss": 0.332796573638916,
"eval_runtime": 17.797,
"eval_samples_per_second": 1067.819,
"eval_steps_per_second": 33.376,
"step": 45100
},
{
"epoch": 38.888888888888886,
"grad_norm": 0.5020613670349121,
"learning_rate": 4.4447889750215335e-05,
"loss": 0.26674041748046873,
"step": 45150
},
{
"epoch": 38.888888888888886,
"eval_loss": 0.3357270359992981,
"eval_runtime": 17.2271,
"eval_samples_per_second": 1103.143,
"eval_steps_per_second": 34.48,
"step": 45150
},
{
"epoch": 38.93195521102498,
"grad_norm": 0.4783501625061035,
"learning_rate": 4.427562446167098e-05,
"loss": 0.26975366592407224,
"step": 45200
},
{
"epoch": 38.93195521102498,
"eval_loss": 0.33885446190834045,
"eval_runtime": 18.1031,
"eval_samples_per_second": 1049.766,
"eval_steps_per_second": 32.812,
"step": 45200
},
{
"epoch": 38.97502153316107,
"grad_norm": 0.45317623019218445,
"learning_rate": 4.410335917312662e-05,
"loss": 0.2624387741088867,
"step": 45250
},
{
"epoch": 38.97502153316107,
"eval_loss": 0.33671680092811584,
"eval_runtime": 17.49,
"eval_samples_per_second": 1086.562,
"eval_steps_per_second": 33.962,
"step": 45250
},
{
"epoch": 39.01808785529716,
"grad_norm": 0.3463917672634125,
"learning_rate": 4.393109388458226e-05,
"loss": 0.26262115478515624,
"step": 45300
},
{
"epoch": 39.01808785529716,
"eval_loss": 0.34109896421432495,
"eval_runtime": 17.6444,
"eval_samples_per_second": 1077.057,
"eval_steps_per_second": 33.665,
"step": 45300
},
{
"epoch": 39.061154177433245,
"grad_norm": 0.39748960733413696,
"learning_rate": 4.3758828596037896e-05,
"loss": 0.2630434989929199,
"step": 45350
},
{
"epoch": 39.061154177433245,
"eval_loss": 0.3328157663345337,
"eval_runtime": 18.0689,
"eval_samples_per_second": 1051.75,
"eval_steps_per_second": 32.874,
"step": 45350
},
{
"epoch": 39.10422049956934,
"grad_norm": 0.3978130519390106,
"learning_rate": 4.3586563307493545e-05,
"loss": 0.2649136734008789,
"step": 45400
},
{
"epoch": 39.10422049956934,
"eval_loss": 0.3429396152496338,
"eval_runtime": 18.9002,
"eval_samples_per_second": 1005.493,
"eval_steps_per_second": 31.428,
"step": 45400
},
{
"epoch": 39.14728682170543,
"grad_norm": 0.4477211833000183,
"learning_rate": 4.341429801894919e-05,
"loss": 0.27085290908813475,
"step": 45450
},
{
"epoch": 39.14728682170543,
"eval_loss": 0.3342150151729584,
"eval_runtime": 17.9198,
"eval_samples_per_second": 1060.505,
"eval_steps_per_second": 33.148,
"step": 45450
},
{
"epoch": 39.19035314384151,
"grad_norm": 0.38119587302207947,
"learning_rate": 4.324203273040482e-05,
"loss": 0.270539436340332,
"step": 45500
},
{
"epoch": 39.19035314384151,
"eval_loss": 0.3424154818058014,
"eval_runtime": 17.8606,
"eval_samples_per_second": 1064.017,
"eval_steps_per_second": 33.258,
"step": 45500
},
{
"epoch": 39.233419465977605,
"grad_norm": 0.3684655725955963,
"learning_rate": 4.3069767441860465e-05,
"loss": 0.26475446701049804,
"step": 45550
},
{
"epoch": 39.233419465977605,
"eval_loss": 0.34204351902008057,
"eval_runtime": 17.8644,
"eval_samples_per_second": 1063.789,
"eval_steps_per_second": 33.25,
"step": 45550
},
{
"epoch": 39.276485788113696,
"grad_norm": 0.42131876945495605,
"learning_rate": 4.2897502153316114e-05,
"loss": 0.2601029586791992,
"step": 45600
},
{
"epoch": 39.276485788113696,
"eval_loss": 0.33528417348861694,
"eval_runtime": 17.3248,
"eval_samples_per_second": 1096.927,
"eval_steps_per_second": 34.286,
"step": 45600
},
{
"epoch": 39.31955211024979,
"grad_norm": 0.41459059715270996,
"learning_rate": 4.272523686477175e-05,
"loss": 0.25860076904296875,
"step": 45650
},
{
"epoch": 39.31955211024979,
"eval_loss": 0.3424885869026184,
"eval_runtime": 17.8503,
"eval_samples_per_second": 1064.634,
"eval_steps_per_second": 33.277,
"step": 45650
},
{
"epoch": 39.36261843238587,
"grad_norm": 0.4517412483692169,
"learning_rate": 4.255297157622739e-05,
"loss": 0.2675811386108398,
"step": 45700
},
{
"epoch": 39.36261843238587,
"eval_loss": 0.3346919119358063,
"eval_runtime": 17.7955,
"eval_samples_per_second": 1067.913,
"eval_steps_per_second": 33.379,
"step": 45700
},
{
"epoch": 39.405684754521964,
"grad_norm": 0.4856393039226532,
"learning_rate": 4.238070628768303e-05,
"loss": 0.26250232696533204,
"step": 45750
},
{
"epoch": 39.405684754521964,
"eval_loss": 0.3385407328605652,
"eval_runtime": 17.5635,
"eval_samples_per_second": 1082.015,
"eval_steps_per_second": 33.82,
"step": 45750
},
{
"epoch": 39.448751076658056,
"grad_norm": 0.4645376205444336,
"learning_rate": 4.2208440999138675e-05,
"loss": 0.267506103515625,
"step": 45800
},
{
"epoch": 39.448751076658056,
"eval_loss": 0.34468886256217957,
"eval_runtime": 17.8254,
"eval_samples_per_second": 1066.121,
"eval_steps_per_second": 33.323,
"step": 45800
},
{
"epoch": 39.49181739879414,
"grad_norm": 0.4534069895744324,
"learning_rate": 4.203617571059432e-05,
"loss": 0.2699179267883301,
"step": 45850
},
{
"epoch": 39.49181739879414,
"eval_loss": 0.34161004424095154,
"eval_runtime": 17.9224,
"eval_samples_per_second": 1060.346,
"eval_steps_per_second": 33.143,
"step": 45850
},
{
"epoch": 39.53488372093023,
"grad_norm": 0.4326345920562744,
"learning_rate": 4.186391042204996e-05,
"loss": 0.2619389343261719,
"step": 45900
},
{
"epoch": 39.53488372093023,
"eval_loss": 0.33569958806037903,
"eval_runtime": 18.012,
"eval_samples_per_second": 1055.072,
"eval_steps_per_second": 32.978,
"step": 45900
},
{
"epoch": 39.57795004306632,
"grad_norm": 0.41711539030075073,
"learning_rate": 4.16916451335056e-05,
"loss": 0.2695113945007324,
"step": 45950
},
{
"epoch": 39.57795004306632,
"eval_loss": 0.3339827358722687,
"eval_runtime": 18.1577,
"eval_samples_per_second": 1046.609,
"eval_steps_per_second": 32.713,
"step": 45950
},
{
"epoch": 39.621016365202415,
"grad_norm": 0.4437963664531708,
"learning_rate": 4.1519379844961244e-05,
"loss": 0.26169944763183595,
"step": 46000
},
{
"epoch": 39.621016365202415,
"eval_loss": 0.34232571721076965,
"eval_runtime": 17.7283,
"eval_samples_per_second": 1071.959,
"eval_steps_per_second": 33.506,
"step": 46000
},
{
"epoch": 39.6640826873385,
"grad_norm": 0.4615746736526489,
"learning_rate": 4.1347114556416886e-05,
"loss": 0.2618155288696289,
"step": 46050
},
{
"epoch": 39.6640826873385,
"eval_loss": 0.3370404541492462,
"eval_runtime": 17.4379,
"eval_samples_per_second": 1089.812,
"eval_steps_per_second": 34.064,
"step": 46050
},
{
"epoch": 39.70714900947459,
"grad_norm": 0.4825727045536041,
"learning_rate": 4.117484926787253e-05,
"loss": 0.2661894226074219,
"step": 46100
},
{
"epoch": 39.70714900947459,
"eval_loss": 0.33807504177093506,
"eval_runtime": 17.8459,
"eval_samples_per_second": 1064.892,
"eval_steps_per_second": 33.285,
"step": 46100
},
{
"epoch": 39.75021533161068,
"grad_norm": 0.4135194420814514,
"learning_rate": 4.100258397932817e-05,
"loss": 0.26151920318603517,
"step": 46150
},
{
"epoch": 39.75021533161068,
"eval_loss": 0.3375168442726135,
"eval_runtime": 18.6642,
"eval_samples_per_second": 1018.203,
"eval_steps_per_second": 31.826,
"step": 46150
},
{
"epoch": 39.79328165374677,
"grad_norm": 0.42200180888175964,
"learning_rate": 4.0830318690783805e-05,
"loss": 0.26504571914672853,
"step": 46200
},
{
"epoch": 39.79328165374677,
"eval_loss": 0.3431920111179352,
"eval_runtime": 17.0825,
"eval_samples_per_second": 1112.483,
"eval_steps_per_second": 34.772,
"step": 46200
},
{
"epoch": 39.83634797588286,
"grad_norm": 0.45557790994644165,
"learning_rate": 4.065805340223945e-05,
"loss": 0.27078224182128907,
"step": 46250
},
{
"epoch": 39.83634797588286,
"eval_loss": 0.33381396532058716,
"eval_runtime": 17.8822,
"eval_samples_per_second": 1062.735,
"eval_steps_per_second": 33.217,
"step": 46250
},
{
"epoch": 39.87941429801895,
"grad_norm": 0.4105677902698517,
"learning_rate": 4.0485788113695096e-05,
"loss": 0.2642168426513672,
"step": 46300
},
{
"epoch": 39.87941429801895,
"eval_loss": 0.33261778950691223,
"eval_runtime": 18.0371,
"eval_samples_per_second": 1053.605,
"eval_steps_per_second": 32.932,
"step": 46300
},
{
"epoch": 39.92248062015504,
"grad_norm": 0.4780519902706146,
"learning_rate": 4.031352282515073e-05,
"loss": 0.2639030075073242,
"step": 46350
},
{
"epoch": 39.92248062015504,
"eval_loss": 0.33703893423080444,
"eval_runtime": 17.8657,
"eval_samples_per_second": 1063.713,
"eval_steps_per_second": 33.248,
"step": 46350
},
{
"epoch": 39.96554694229113,
"grad_norm": 0.4140496551990509,
"learning_rate": 4.0141257536606374e-05,
"loss": 0.2669095230102539,
"step": 46400
},
{
"epoch": 39.96554694229113,
"eval_loss": 0.3341706097126007,
"eval_runtime": 17.8602,
"eval_samples_per_second": 1064.04,
"eval_steps_per_second": 33.258,
"step": 46400
},
{
"epoch": 40.00861326442722,
"grad_norm": 0.4176825284957886,
"learning_rate": 3.9968992248062016e-05,
"loss": 0.2692737579345703,
"step": 46450
},
{
"epoch": 40.00861326442722,
"eval_loss": 0.3381114900112152,
"eval_runtime": 18.0122,
"eval_samples_per_second": 1055.065,
"eval_steps_per_second": 32.978,
"step": 46450
},
{
"epoch": 40.05167958656331,
"grad_norm": 0.4883769750595093,
"learning_rate": 3.979672695951766e-05,
"loss": 0.2627450942993164,
"step": 46500
},
{
"epoch": 40.05167958656331,
"eval_loss": 0.3378463685512543,
"eval_runtime": 17.9091,
"eval_samples_per_second": 1061.137,
"eval_steps_per_second": 33.167,
"step": 46500
},
{
"epoch": 40.094745908699394,
"grad_norm": 0.3604494035243988,
"learning_rate": 3.96244616709733e-05,
"loss": 0.2634261703491211,
"step": 46550
},
{
"epoch": 40.094745908699394,
"eval_loss": 0.3384319245815277,
"eval_runtime": 17.3453,
"eval_samples_per_second": 1095.63,
"eval_steps_per_second": 34.246,
"step": 46550
},
{
"epoch": 40.137812230835486,
"grad_norm": 0.5276474952697754,
"learning_rate": 3.945219638242894e-05,
"loss": 0.26404022216796874,
"step": 46600
},
{
"epoch": 40.137812230835486,
"eval_loss": 0.33722805976867676,
"eval_runtime": 17.9865,
"eval_samples_per_second": 1056.568,
"eval_steps_per_second": 33.025,
"step": 46600
},
{
"epoch": 40.18087855297158,
"grad_norm": 0.8581849336624146,
"learning_rate": 3.9279931093884584e-05,
"loss": 0.26176496505737307,
"step": 46650
},
{
"epoch": 40.18087855297158,
"eval_loss": 0.3430112898349762,
"eval_runtime": 17.3636,
"eval_samples_per_second": 1094.474,
"eval_steps_per_second": 34.209,
"step": 46650
},
{
"epoch": 40.22394487510766,
"grad_norm": 0.3712718188762665,
"learning_rate": 3.9107665805340226e-05,
"loss": 0.26714740753173827,
"step": 46700
},
{
"epoch": 40.22394487510766,
"eval_loss": 0.3383539915084839,
"eval_runtime": 18.0349,
"eval_samples_per_second": 1053.733,
"eval_steps_per_second": 32.936,
"step": 46700
},
{
"epoch": 40.267011197243754,
"grad_norm": 0.37334057688713074,
"learning_rate": 3.893540051679587e-05,
"loss": 0.26557273864746095,
"step": 46750
},
{
"epoch": 40.267011197243754,
"eval_loss": 0.33767005801200867,
"eval_runtime": 17.6561,
"eval_samples_per_second": 1076.344,
"eval_steps_per_second": 33.643,
"step": 46750
},
{
"epoch": 40.310077519379846,
"grad_norm": 0.45106396079063416,
"learning_rate": 3.8763135228251504e-05,
"loss": 0.26342580795288084,
"step": 46800
},
{
"epoch": 40.310077519379846,
"eval_loss": 0.33584079146385193,
"eval_runtime": 18.1547,
"eval_samples_per_second": 1046.781,
"eval_steps_per_second": 32.719,
"step": 46800
},
{
"epoch": 40.35314384151594,
"grad_norm": 0.4610295593738556,
"learning_rate": 3.859086993970715e-05,
"loss": 0.2536019325256348,
"step": 46850
},
{
"epoch": 40.35314384151594,
"eval_loss": 0.3431561589241028,
"eval_runtime": 18.4064,
"eval_samples_per_second": 1032.465,
"eval_steps_per_second": 32.271,
"step": 46850
},
{
"epoch": 40.39621016365202,
"grad_norm": 0.36498934030532837,
"learning_rate": 3.8418604651162795e-05,
"loss": 0.2718042373657227,
"step": 46900
},
{
"epoch": 40.39621016365202,
"eval_loss": 0.3425901234149933,
"eval_runtime": 18.0712,
"eval_samples_per_second": 1051.62,
"eval_steps_per_second": 32.87,
"step": 46900
},
{
"epoch": 40.43927648578811,
"grad_norm": 0.5364521741867065,
"learning_rate": 3.824633936261843e-05,
"loss": 0.2712949562072754,
"step": 46950
},
{
"epoch": 40.43927648578811,
"eval_loss": 0.3378094434738159,
"eval_runtime": 17.7612,
"eval_samples_per_second": 1069.972,
"eval_steps_per_second": 33.444,
"step": 46950
},
{
"epoch": 40.482342807924205,
"grad_norm": 0.40720999240875244,
"learning_rate": 3.807407407407408e-05,
"loss": 0.2558038330078125,
"step": 47000
},
{
"epoch": 40.482342807924205,
"eval_loss": 0.3408735394477844,
"eval_runtime": 17.3041,
"eval_samples_per_second": 1098.234,
"eval_steps_per_second": 34.327,
"step": 47000
},
{
"epoch": 40.52540913006029,
"grad_norm": 0.4001590311527252,
"learning_rate": 3.790180878552972e-05,
"loss": 0.26414262771606445,
"step": 47050
},
{
"epoch": 40.52540913006029,
"eval_loss": 0.3310953676700592,
"eval_runtime": 18.044,
"eval_samples_per_second": 1053.203,
"eval_steps_per_second": 32.92,
"step": 47050
},
{
"epoch": 40.56847545219638,
"grad_norm": 0.40518489480018616,
"learning_rate": 3.7729543496985356e-05,
"loss": 0.2695816230773926,
"step": 47100
},
{
"epoch": 40.56847545219638,
"eval_loss": 0.3371044397354126,
"eval_runtime": 17.6114,
"eval_samples_per_second": 1079.074,
"eval_steps_per_second": 33.728,
"step": 47100
},
{
"epoch": 40.61154177433247,
"grad_norm": 0.45011526346206665,
"learning_rate": 3.7557278208441e-05,
"loss": 0.25704208374023435,
"step": 47150
},
{
"epoch": 40.61154177433247,
"eval_loss": 0.3388862907886505,
"eval_runtime": 17.6856,
"eval_samples_per_second": 1074.545,
"eval_steps_per_second": 33.587,
"step": 47150
},
{
"epoch": 40.654608096468564,
"grad_norm": 0.4376562237739563,
"learning_rate": 3.738501291989665e-05,
"loss": 0.2663600158691406,
"step": 47200
},
{
"epoch": 40.654608096468564,
"eval_loss": 0.33221572637557983,
"eval_runtime": 18.0778,
"eval_samples_per_second": 1051.235,
"eval_steps_per_second": 32.858,
"step": 47200
},
{
"epoch": 40.69767441860465,
"grad_norm": 0.463076114654541,
"learning_rate": 3.721274763135228e-05,
"loss": 0.2625453567504883,
"step": 47250
},
{
"epoch": 40.69767441860465,
"eval_loss": 0.3378129303455353,
"eval_runtime": 18.3509,
"eval_samples_per_second": 1035.591,
"eval_steps_per_second": 32.369,
"step": 47250
},
{
"epoch": 40.74074074074074,
"grad_norm": 0.46275097131729126,
"learning_rate": 3.7040482342807925e-05,
"loss": 0.26067514419555665,
"step": 47300
},
{
"epoch": 40.74074074074074,
"eval_loss": 0.3361418843269348,
"eval_runtime": 17.7577,
"eval_samples_per_second": 1070.184,
"eval_steps_per_second": 33.45,
"step": 47300
},
{
"epoch": 40.78380706287683,
"grad_norm": 0.381462961435318,
"learning_rate": 3.686821705426357e-05,
"loss": 0.2565507125854492,
"step": 47350
},
{
"epoch": 40.78380706287683,
"eval_loss": 0.33574655652046204,
"eval_runtime": 17.9409,
"eval_samples_per_second": 1059.255,
"eval_steps_per_second": 33.109,
"step": 47350
},
{
"epoch": 40.82687338501292,
"grad_norm": 0.39149317145347595,
"learning_rate": 3.669595176571921e-05,
"loss": 0.26437042236328123,
"step": 47400
},
{
"epoch": 40.82687338501292,
"eval_loss": 0.33760392665863037,
"eval_runtime": 17.7755,
"eval_samples_per_second": 1069.112,
"eval_steps_per_second": 33.417,
"step": 47400
},
{
"epoch": 40.86993970714901,
"grad_norm": 0.46884697675704956,
"learning_rate": 3.652368647717485e-05,
"loss": 0.26930549621582034,
"step": 47450
},
{
"epoch": 40.86993970714901,
"eval_loss": 0.3397780656814575,
"eval_runtime": 17.4926,
"eval_samples_per_second": 1086.403,
"eval_steps_per_second": 33.957,
"step": 47450
},
{
"epoch": 40.9130060292851,
"grad_norm": 0.4447813332080841,
"learning_rate": 3.635142118863049e-05,
"loss": 0.2629986572265625,
"step": 47500
},
{
"epoch": 40.9130060292851,
"eval_loss": 0.33175262808799744,
"eval_runtime": 16.8599,
"eval_samples_per_second": 1127.169,
"eval_steps_per_second": 35.231,
"step": 47500
},
{
"epoch": 40.95607235142119,
"grad_norm": 0.4096522331237793,
"learning_rate": 3.6179155900086135e-05,
"loss": 0.257623462677002,
"step": 47550
},
{
"epoch": 40.95607235142119,
"eval_loss": 0.3397463262081146,
"eval_runtime": 17.5965,
"eval_samples_per_second": 1079.99,
"eval_steps_per_second": 33.757,
"step": 47550
},
{
"epoch": 40.999138673557276,
"grad_norm": 0.4105287492275238,
"learning_rate": 3.600689061154178e-05,
"loss": 0.26816184997558595,
"step": 47600
},
{
"epoch": 40.999138673557276,
"eval_loss": 0.33248135447502136,
"eval_runtime": 16.1381,
"eval_samples_per_second": 1177.587,
"eval_steps_per_second": 36.807,
"step": 47600
},
{
"epoch": 41.04220499569337,
"grad_norm": 0.4221794605255127,
"learning_rate": 3.583462532299742e-05,
"loss": 0.26319524765014646,
"step": 47650
},
{
"epoch": 41.04220499569337,
"eval_loss": 0.33123907446861267,
"eval_runtime": 18.0613,
"eval_samples_per_second": 1052.193,
"eval_steps_per_second": 32.888,
"step": 47650
},
{
"epoch": 41.08527131782946,
"grad_norm": 0.5156925916671753,
"learning_rate": 3.5662360034453055e-05,
"loss": 0.2654911994934082,
"step": 47700
},
{
"epoch": 41.08527131782946,
"eval_loss": 0.3341188430786133,
"eval_runtime": 18.2245,
"eval_samples_per_second": 1042.77,
"eval_steps_per_second": 32.593,
"step": 47700
},
{
"epoch": 41.128337639965544,
"grad_norm": 0.42098939418792725,
"learning_rate": 3.5490094745908704e-05,
"loss": 0.26546878814697267,
"step": 47750
},
{
"epoch": 41.128337639965544,
"eval_loss": 0.339838445186615,
"eval_runtime": 18.3008,
"eval_samples_per_second": 1038.426,
"eval_steps_per_second": 32.458,
"step": 47750
},
{
"epoch": 41.171403962101635,
"grad_norm": 0.3625338673591614,
"learning_rate": 3.5317829457364346e-05,
"loss": 0.25438928604125977,
"step": 47800
},
{
"epoch": 41.171403962101635,
"eval_loss": 0.33727747201919556,
"eval_runtime": 18.3263,
"eval_samples_per_second": 1036.98,
"eval_steps_per_second": 32.412,
"step": 47800
},
{
"epoch": 41.21447028423773,
"grad_norm": 0.4652734398841858,
"learning_rate": 3.514556416881998e-05,
"loss": 0.2680678939819336,
"step": 47850
},
{
"epoch": 41.21447028423773,
"eval_loss": 0.3393220603466034,
"eval_runtime": 17.9053,
"eval_samples_per_second": 1061.362,
"eval_steps_per_second": 33.175,
"step": 47850
},
{
"epoch": 41.25753660637382,
"grad_norm": 0.46969592571258545,
"learning_rate": 3.497329888027562e-05,
"loss": 0.26060947418212893,
"step": 47900
},
{
"epoch": 41.25753660637382,
"eval_loss": 0.338578999042511,
"eval_runtime": 18.9249,
"eval_samples_per_second": 1004.179,
"eval_steps_per_second": 31.387,
"step": 47900
},
{
"epoch": 41.3006029285099,
"grad_norm": 0.5016558170318604,
"learning_rate": 3.480103359173127e-05,
"loss": 0.26476665496826174,
"step": 47950
},
{
"epoch": 41.3006029285099,
"eval_loss": 0.34051215648651123,
"eval_runtime": 17.4257,
"eval_samples_per_second": 1090.571,
"eval_steps_per_second": 34.088,
"step": 47950
},
{
"epoch": 41.343669250645995,
"grad_norm": 0.42962339520454407,
"learning_rate": 3.462876830318691e-05,
"loss": 0.2569329071044922,
"step": 48000
},
{
"epoch": 41.343669250645995,
"eval_loss": 0.33297860622406006,
"eval_runtime": 17.9455,
"eval_samples_per_second": 1058.985,
"eval_steps_per_second": 33.1,
"step": 48000
},
{
"epoch": 41.38673557278209,
"grad_norm": 0.4176729917526245,
"learning_rate": 3.445650301464255e-05,
"loss": 0.26670978546142576,
"step": 48050
},
{
"epoch": 41.38673557278209,
"eval_loss": 0.3345198929309845,
"eval_runtime": 17.1145,
"eval_samples_per_second": 1110.405,
"eval_steps_per_second": 34.707,
"step": 48050
},
{
"epoch": 41.42980189491817,
"grad_norm": 0.4664563834667206,
"learning_rate": 3.42842377260982e-05,
"loss": 0.26591835021972654,
"step": 48100
},
{
"epoch": 41.42980189491817,
"eval_loss": 0.3355900049209595,
"eval_runtime": 18.6175,
"eval_samples_per_second": 1020.76,
"eval_steps_per_second": 31.905,
"step": 48100
},
{
"epoch": 41.47286821705426,
"grad_norm": 0.4347773492336273,
"learning_rate": 3.4111972437553834e-05,
"loss": 0.26680770874023435,
"step": 48150
},
{
"epoch": 41.47286821705426,
"eval_loss": 0.33552244305610657,
"eval_runtime": 17.8257,
"eval_samples_per_second": 1066.102,
"eval_steps_per_second": 33.323,
"step": 48150
},
{
"epoch": 41.515934539190354,
"grad_norm": 0.3817366361618042,
"learning_rate": 3.3939707149009476e-05,
"loss": 0.2699763870239258,
"step": 48200
},
{
"epoch": 41.515934539190354,
"eval_loss": 0.3416987359523773,
"eval_runtime": 17.8482,
"eval_samples_per_second": 1064.756,
"eval_steps_per_second": 33.281,
"step": 48200
},
{
"epoch": 41.559000861326446,
"grad_norm": 0.49607574939727783,
"learning_rate": 3.376744186046512e-05,
"loss": 0.2648237419128418,
"step": 48250
},
{
"epoch": 41.559000861326446,
"eval_loss": 0.33449888229370117,
"eval_runtime": 18.244,
"eval_samples_per_second": 1041.657,
"eval_steps_per_second": 32.559,
"step": 48250
},
{
"epoch": 41.60206718346253,
"grad_norm": 0.4564961791038513,
"learning_rate": 3.359517657192076e-05,
"loss": 0.26271303176879884,
"step": 48300
},
{
"epoch": 41.60206718346253,
"eval_loss": 0.3342786133289337,
"eval_runtime": 18.0953,
"eval_samples_per_second": 1050.219,
"eval_steps_per_second": 32.826,
"step": 48300
},
{
"epoch": 41.64513350559862,
"grad_norm": 0.41826575994491577,
"learning_rate": 3.34229112833764e-05,
"loss": 0.2643220329284668,
"step": 48350
},
{
"epoch": 41.64513350559862,
"eval_loss": 0.3378792107105255,
"eval_runtime": 18.0764,
"eval_samples_per_second": 1051.317,
"eval_steps_per_second": 32.861,
"step": 48350
},
{
"epoch": 41.688199827734714,
"grad_norm": 0.3788872957229614,
"learning_rate": 3.3250645994832044e-05,
"loss": 0.2604197311401367,
"step": 48400
},
{
"epoch": 41.688199827734714,
"eval_loss": 0.3380422592163086,
"eval_runtime": 17.7567,
"eval_samples_per_second": 1070.243,
"eval_steps_per_second": 33.452,
"step": 48400
},
{
"epoch": 41.7312661498708,
"grad_norm": 0.38110384345054626,
"learning_rate": 3.3078380706287686e-05,
"loss": 0.26157442092895505,
"step": 48450
},
{
"epoch": 41.7312661498708,
"eval_loss": 0.33079907298088074,
"eval_runtime": 20.2463,
"eval_samples_per_second": 938.642,
"eval_steps_per_second": 29.339,
"step": 48450
},
{
"epoch": 41.77433247200689,
"grad_norm": 0.44205212593078613,
"learning_rate": 3.290611541774333e-05,
"loss": 0.267241325378418,
"step": 48500
},
{
"epoch": 41.77433247200689,
"eval_loss": 0.33610066771507263,
"eval_runtime": 19.082,
"eval_samples_per_second": 995.915,
"eval_steps_per_second": 31.129,
"step": 48500
},
{
"epoch": 41.81739879414298,
"grad_norm": 0.41984260082244873,
"learning_rate": 3.273385012919897e-05,
"loss": 0.25937877655029296,
"step": 48550
},
{
"epoch": 41.81739879414298,
"eval_loss": 0.3353855609893799,
"eval_runtime": 19.6219,
"eval_samples_per_second": 968.511,
"eval_steps_per_second": 30.272,
"step": 48550
},
{
"epoch": 41.86046511627907,
"grad_norm": 0.3918072283267975,
"learning_rate": 3.2561584840654606e-05,
"loss": 0.26703338623046874,
"step": 48600
},
{
"epoch": 41.86046511627907,
"eval_loss": 0.3338076174259186,
"eval_runtime": 18.1515,
"eval_samples_per_second": 1046.967,
"eval_steps_per_second": 32.725,
"step": 48600
},
{
"epoch": 41.90353143841516,
"grad_norm": 0.43717628717422485,
"learning_rate": 3.2389319552110255e-05,
"loss": 0.267167911529541,
"step": 48650
},
{
"epoch": 41.90353143841516,
"eval_loss": 0.3376483619213104,
"eval_runtime": 18.5551,
"eval_samples_per_second": 1024.192,
"eval_steps_per_second": 32.013,
"step": 48650
},
{
"epoch": 41.94659776055125,
"grad_norm": 0.4901149272918701,
"learning_rate": 3.221705426356589e-05,
"loss": 0.2627080726623535,
"step": 48700
},
{
"epoch": 41.94659776055125,
"eval_loss": 0.33091285824775696,
"eval_runtime": 18.4233,
"eval_samples_per_second": 1031.522,
"eval_steps_per_second": 32.242,
"step": 48700
},
{
"epoch": 41.98966408268734,
"grad_norm": 0.4586520791053772,
"learning_rate": 3.204478897502153e-05,
"loss": 0.2619011306762695,
"step": 48750
},
{
"epoch": 41.98966408268734,
"eval_loss": 0.3310651183128357,
"eval_runtime": 18.994,
"eval_samples_per_second": 1000.526,
"eval_steps_per_second": 31.273,
"step": 48750
},
{
"epoch": 42.032730404823425,
"grad_norm": 0.5598412156105042,
"learning_rate": 3.1872523686477174e-05,
"loss": 0.2581977081298828,
"step": 48800
},
{
"epoch": 42.032730404823425,
"eval_loss": 0.333484411239624,
"eval_runtime": 17.963,
"eval_samples_per_second": 1057.95,
"eval_steps_per_second": 33.068,
"step": 48800
},
{
"epoch": 42.07579672695952,
"grad_norm": 0.4388487935066223,
"learning_rate": 3.1700258397932816e-05,
"loss": 0.2565720367431641,
"step": 48850
},
{
"epoch": 42.07579672695952,
"eval_loss": 0.33121058344841003,
"eval_runtime": 17.7479,
"eval_samples_per_second": 1070.778,
"eval_steps_per_second": 33.469,
"step": 48850
},
{
"epoch": 42.11886304909561,
"grad_norm": 0.4400928318500519,
"learning_rate": 3.152799310938846e-05,
"loss": 0.2657375717163086,
"step": 48900
},
{
"epoch": 42.11886304909561,
"eval_loss": 0.3297300934791565,
"eval_runtime": 17.5994,
"eval_samples_per_second": 1079.812,
"eval_steps_per_second": 33.751,
"step": 48900
},
{
"epoch": 42.1619293712317,
"grad_norm": 0.4509681761264801,
"learning_rate": 3.13557278208441e-05,
"loss": 0.26203857421875,
"step": 48950
},
{
"epoch": 42.1619293712317,
"eval_loss": 0.33452802896499634,
"eval_runtime": 18.2048,
"eval_samples_per_second": 1043.902,
"eval_steps_per_second": 32.629,
"step": 48950
},
{
"epoch": 42.204995693367785,
"grad_norm": 0.45013585686683655,
"learning_rate": 3.118346253229974e-05,
"loss": 0.2583207893371582,
"step": 49000
},
{
"epoch": 42.204995693367785,
"eval_loss": 0.3354299068450928,
"eval_runtime": 17.4619,
"eval_samples_per_second": 1088.31,
"eval_steps_per_second": 34.017,
"step": 49000
},
{
"epoch": 42.248062015503876,
"grad_norm": 0.3658822774887085,
"learning_rate": 3.1011197243755385e-05,
"loss": 0.269542350769043,
"step": 49050
},
{
"epoch": 42.248062015503876,
"eval_loss": 0.3368607461452484,
"eval_runtime": 17.9875,
"eval_samples_per_second": 1056.513,
"eval_steps_per_second": 33.023,
"step": 49050
},
{
"epoch": 42.29112833763997,
"grad_norm": 0.4629049003124237,
"learning_rate": 3.083893195521103e-05,
"loss": 0.25855308532714844,
"step": 49100
},
{
"epoch": 42.29112833763997,
"eval_loss": 0.3344835638999939,
"eval_runtime": 17.9456,
"eval_samples_per_second": 1058.98,
"eval_steps_per_second": 33.1,
"step": 49100
},
{
"epoch": 42.33419465977605,
"grad_norm": 0.4146256446838379,
"learning_rate": 3.066666666666667e-05,
"loss": 0.2610971069335937,
"step": 49150
},
{
"epoch": 42.33419465977605,
"eval_loss": 0.32769039273262024,
"eval_runtime": 18.1699,
"eval_samples_per_second": 1045.906,
"eval_steps_per_second": 32.691,
"step": 49150
},
{
"epoch": 42.377260981912144,
"grad_norm": 0.3447282612323761,
"learning_rate": 3.049440137812231e-05,
"loss": 0.26025299072265623,
"step": 49200
},
{
"epoch": 42.377260981912144,
"eval_loss": 0.3351108729839325,
"eval_runtime": 18.2652,
"eval_samples_per_second": 1040.448,
"eval_steps_per_second": 32.521,
"step": 49200
},
{
"epoch": 42.420327304048236,
"grad_norm": 0.463405579328537,
"learning_rate": 3.0322136089577953e-05,
"loss": 0.26323734283447264,
"step": 49250
},
{
"epoch": 42.420327304048236,
"eval_loss": 0.335365355014801,
"eval_runtime": 18.0634,
"eval_samples_per_second": 1052.071,
"eval_steps_per_second": 32.884,
"step": 49250
},
{
"epoch": 42.46339362618432,
"grad_norm": 0.3932636082172394,
"learning_rate": 3.0149870801033592e-05,
"loss": 0.2605165672302246,
"step": 49300
},
{
"epoch": 42.46339362618432,
"eval_loss": 0.33185839653015137,
"eval_runtime": 17.917,
"eval_samples_per_second": 1060.669,
"eval_steps_per_second": 33.153,
"step": 49300
},
{
"epoch": 42.50645994832041,
"grad_norm": 0.4551706910133362,
"learning_rate": 2.9977605512489237e-05,
"loss": 0.25998649597167967,
"step": 49350
},
{
"epoch": 42.50645994832041,
"eval_loss": 0.33300530910491943,
"eval_runtime": 17.1804,
"eval_samples_per_second": 1106.144,
"eval_steps_per_second": 34.574,
"step": 49350
},
{
"epoch": 42.549526270456504,
"grad_norm": 0.3885660171508789,
"learning_rate": 2.9805340223944876e-05,
"loss": 0.2636854553222656,
"step": 49400
},
{
"epoch": 42.549526270456504,
"eval_loss": 0.3332418203353882,
"eval_runtime": 18.0481,
"eval_samples_per_second": 1052.965,
"eval_steps_per_second": 32.912,
"step": 49400
},
{
"epoch": 42.592592592592595,
"grad_norm": 0.4408448338508606,
"learning_rate": 2.9633074935400518e-05,
"loss": 0.25896127700805666,
"step": 49450
},
{
"epoch": 42.592592592592595,
"eval_loss": 0.330980122089386,
"eval_runtime": 17.6177,
"eval_samples_per_second": 1078.685,
"eval_steps_per_second": 33.716,
"step": 49450
},
{
"epoch": 42.63565891472868,
"grad_norm": 0.43017804622650146,
"learning_rate": 2.9460809646856157e-05,
"loss": 0.2645112037658691,
"step": 49500
},
{
"epoch": 42.63565891472868,
"eval_loss": 0.32889503240585327,
"eval_runtime": 18.5528,
"eval_samples_per_second": 1024.322,
"eval_steps_per_second": 32.017,
"step": 49500
},
{
"epoch": 42.67872523686477,
"grad_norm": 0.5321599245071411,
"learning_rate": 2.9288544358311802e-05,
"loss": 0.2638208770751953,
"step": 49550
},
{
"epoch": 42.67872523686477,
"eval_loss": 0.33535560965538025,
"eval_runtime": 18.8643,
"eval_samples_per_second": 1007.405,
"eval_steps_per_second": 31.488,
"step": 49550
},
{
"epoch": 42.72179155900086,
"grad_norm": 0.4819716811180115,
"learning_rate": 2.9116279069767444e-05,
"loss": 0.2661098289489746,
"step": 49600
},
{
"epoch": 42.72179155900086,
"eval_loss": 0.3298910856246948,
"eval_runtime": 17.9128,
"eval_samples_per_second": 1060.915,
"eval_steps_per_second": 33.161,
"step": 49600
},
{
"epoch": 42.76485788113695,
"grad_norm": 0.3507688343524933,
"learning_rate": 2.8944013781223083e-05,
"loss": 0.2569438362121582,
"step": 49650
},
{
"epoch": 42.76485788113695,
"eval_loss": 0.32944178581237793,
"eval_runtime": 18.1784,
"eval_samples_per_second": 1045.419,
"eval_steps_per_second": 32.676,
"step": 49650
},
{
"epoch": 42.80792420327304,
"grad_norm": 0.6185017824172974,
"learning_rate": 2.8771748492678725e-05,
"loss": 0.2623113250732422,
"step": 49700
},
{
"epoch": 42.80792420327304,
"eval_loss": 0.32671746611595154,
"eval_runtime": 18.2043,
"eval_samples_per_second": 1043.931,
"eval_steps_per_second": 32.63,
"step": 49700
},
{
"epoch": 42.85099052540913,
"grad_norm": 0.5226140022277832,
"learning_rate": 2.859948320413437e-05,
"loss": 0.2594397735595703,
"step": 49750
},
{
"epoch": 42.85099052540913,
"eval_loss": 0.3336170017719269,
"eval_runtime": 17.6042,
"eval_samples_per_second": 1079.517,
"eval_steps_per_second": 33.742,
"step": 49750
},
{
"epoch": 42.89405684754522,
"grad_norm": 0.4084358513355255,
"learning_rate": 2.842721791559001e-05,
"loss": 0.26164541244506834,
"step": 49800
},
{
"epoch": 42.89405684754522,
"eval_loss": 0.33178937435150146,
"eval_runtime": 16.5647,
"eval_samples_per_second": 1147.257,
"eval_steps_per_second": 35.859,
"step": 49800
},
{
"epoch": 42.93712316968131,
"grad_norm": 0.4608656167984009,
"learning_rate": 2.825495262704565e-05,
"loss": 0.26180377960205076,
"step": 49850
},
{
"epoch": 42.93712316968131,
"eval_loss": 0.33565860986709595,
"eval_runtime": 17.6583,
"eval_samples_per_second": 1076.207,
"eval_steps_per_second": 33.639,
"step": 49850
},
{
"epoch": 42.9801894918174,
"grad_norm": 0.47300195693969727,
"learning_rate": 2.8082687338501297e-05,
"loss": 0.25752685546875,
"step": 49900
},
{
"epoch": 42.9801894918174,
"eval_loss": 0.33200475573539734,
"eval_runtime": 18.0159,
"eval_samples_per_second": 1054.843,
"eval_steps_per_second": 32.971,
"step": 49900
},
{
"epoch": 43.02325581395349,
"grad_norm": 0.47372981905937195,
"learning_rate": 2.7910422049956936e-05,
"loss": 0.25568557739257813,
"step": 49950
},
{
"epoch": 43.02325581395349,
"eval_loss": 0.33379772305488586,
"eval_runtime": 17.2294,
"eval_samples_per_second": 1103.001,
"eval_steps_per_second": 34.476,
"step": 49950
},
{
"epoch": 43.066322136089575,
"grad_norm": 0.4774799942970276,
"learning_rate": 2.7738156761412574e-05,
"loss": 0.2577634620666504,
"step": 50000
},
{
"epoch": 43.066322136089575,
"eval_loss": 0.33078330755233765,
"eval_runtime": 18.2773,
"eval_samples_per_second": 1039.762,
"eval_steps_per_second": 32.499,
"step": 50000
},
{
"epoch": 43.109388458225666,
"grad_norm": 0.45900219678878784,
"learning_rate": 2.7565891472868217e-05,
"loss": 0.25360595703125,
"step": 50050
},
{
"epoch": 43.109388458225666,
"eval_loss": 0.3274356424808502,
"eval_runtime": 18.2902,
"eval_samples_per_second": 1039.027,
"eval_steps_per_second": 32.476,
"step": 50050
},
{
"epoch": 43.15245478036176,
"grad_norm": 0.447608083486557,
"learning_rate": 2.7393626184323862e-05,
"loss": 0.25464582443237305,
"step": 50100
},
{
"epoch": 43.15245478036176,
"eval_loss": 0.33124667406082153,
"eval_runtime": 18.1672,
"eval_samples_per_second": 1046.06,
"eval_steps_per_second": 32.696,
"step": 50100
},
{
"epoch": 43.19552110249785,
"grad_norm": 0.40590664744377136,
"learning_rate": 2.72213608957795e-05,
"loss": 0.2649323844909668,
"step": 50150
},
{
"epoch": 43.19552110249785,
"eval_loss": 0.33773353695869446,
"eval_runtime": 18.3985,
"eval_samples_per_second": 1032.912,
"eval_steps_per_second": 32.285,
"step": 50150
},
{
"epoch": 43.238587424633934,
"grad_norm": 0.42521902918815613,
"learning_rate": 2.7049095607235143e-05,
"loss": 0.2657030487060547,
"step": 50200
},
{
"epoch": 43.238587424633934,
"eval_loss": 0.3342694342136383,
"eval_runtime": 18.241,
"eval_samples_per_second": 1041.829,
"eval_steps_per_second": 32.564,
"step": 50200
},
{
"epoch": 43.281653746770026,
"grad_norm": 0.4506593644618988,
"learning_rate": 2.687683031869079e-05,
"loss": 0.257874755859375,
"step": 50250
},
{
"epoch": 43.281653746770026,
"eval_loss": 0.33657172322273254,
"eval_runtime": 17.7859,
"eval_samples_per_second": 1068.486,
"eval_steps_per_second": 33.397,
"step": 50250
},
{
"epoch": 43.32472006890612,
"grad_norm": 0.4413757026195526,
"learning_rate": 2.6704565030146427e-05,
"loss": 0.26257347106933593,
"step": 50300
},
{
"epoch": 43.32472006890612,
"eval_loss": 0.33235645294189453,
"eval_runtime": 16.8435,
"eval_samples_per_second": 1128.27,
"eval_steps_per_second": 35.266,
"step": 50300
},
{
"epoch": 43.3677863910422,
"grad_norm": 0.46813246607780457,
"learning_rate": 2.653229974160207e-05,
"loss": 0.25420507431030276,
"step": 50350
},
{
"epoch": 43.3677863910422,
"eval_loss": 0.33105531334877014,
"eval_runtime": 18.0603,
"eval_samples_per_second": 1052.255,
"eval_steps_per_second": 32.89,
"step": 50350
},
{
"epoch": 43.41085271317829,
"grad_norm": 0.36964699625968933,
"learning_rate": 2.6360034453057708e-05,
"loss": 0.2610732841491699,
"step": 50400
},
{
"epoch": 43.41085271317829,
"eval_loss": 0.33543485403060913,
"eval_runtime": 17.4611,
"eval_samples_per_second": 1088.362,
"eval_steps_per_second": 34.018,
"step": 50400
},
{
"epoch": 43.453919035314385,
"grad_norm": 0.3838389813899994,
"learning_rate": 2.6187769164513353e-05,
"loss": 0.2597215270996094,
"step": 50450
},
{
"epoch": 43.453919035314385,
"eval_loss": 0.33240464329719543,
"eval_runtime": 18.156,
"eval_samples_per_second": 1046.706,
"eval_steps_per_second": 32.716,
"step": 50450
},
{
"epoch": 43.49698535745048,
"grad_norm": 0.5293746590614319,
"learning_rate": 2.6015503875968995e-05,
"loss": 0.257132568359375,
"step": 50500
},
{
"epoch": 43.49698535745048,
"eval_loss": 0.3345556855201721,
"eval_runtime": 18.2182,
"eval_samples_per_second": 1043.133,
"eval_steps_per_second": 32.605,
"step": 50500
},
{
"epoch": 43.54005167958656,
"grad_norm": 0.47793522477149963,
"learning_rate": 2.5843238587424634e-05,
"loss": 0.25999725341796875,
"step": 50550
},
{
"epoch": 43.54005167958656,
"eval_loss": 0.3289826512336731,
"eval_runtime": 18.4921,
"eval_samples_per_second": 1027.683,
"eval_steps_per_second": 32.122,
"step": 50550
},
{
"epoch": 43.58311800172265,
"grad_norm": 0.38100966811180115,
"learning_rate": 2.5670973298880273e-05,
"loss": 0.2568831443786621,
"step": 50600
},
{
"epoch": 43.58311800172265,
"eval_loss": 0.33748096227645874,
"eval_runtime": 18.0335,
"eval_samples_per_second": 1053.816,
"eval_steps_per_second": 32.939,
"step": 50600
},
{
"epoch": 43.626184323858745,
"grad_norm": 0.4159799814224243,
"learning_rate": 2.549870801033592e-05,
"loss": 0.26660945892333987,
"step": 50650
},
{
"epoch": 43.626184323858745,
"eval_loss": 0.33200082182884216,
"eval_runtime": 18.123,
"eval_samples_per_second": 1048.611,
"eval_steps_per_second": 32.776,
"step": 50650
},
{
"epoch": 43.66925064599483,
"grad_norm": 0.3828164339065552,
"learning_rate": 2.532644272179156e-05,
"loss": 0.260919189453125,
"step": 50700
},
{
"epoch": 43.66925064599483,
"eval_loss": 0.32596173882484436,
"eval_runtime": 18.1437,
"eval_samples_per_second": 1047.418,
"eval_steps_per_second": 32.739,
"step": 50700
},
{
"epoch": 43.71231696813092,
"grad_norm": 0.4727261960506439,
"learning_rate": 2.51541774332472e-05,
"loss": 0.26335853576660156,
"step": 50750
},
{
"epoch": 43.71231696813092,
"eval_loss": 0.3335762321949005,
"eval_runtime": 17.467,
"eval_samples_per_second": 1087.997,
"eval_steps_per_second": 34.007,
"step": 50750
},
{
"epoch": 43.75538329026701,
"grad_norm": 0.46335121989250183,
"learning_rate": 2.4981912144702845e-05,
"loss": 0.2584223747253418,
"step": 50800
},
{
"epoch": 43.75538329026701,
"eval_loss": 0.33142974972724915,
"eval_runtime": 19.7474,
"eval_samples_per_second": 962.354,
"eval_steps_per_second": 30.08,
"step": 50800
},
{
"epoch": 43.798449612403104,
"grad_norm": 0.3923814594745636,
"learning_rate": 2.4809646856158487e-05,
"loss": 0.2604896926879883,
"step": 50850
},
{
"epoch": 43.798449612403104,
"eval_loss": 0.3408214747905731,
"eval_runtime": 17.6173,
"eval_samples_per_second": 1078.711,
"eval_steps_per_second": 33.717,
"step": 50850
},
{
"epoch": 43.84151593453919,
"grad_norm": 0.4811161160469055,
"learning_rate": 2.4637381567614126e-05,
"loss": 0.2605669593811035,
"step": 50900
},
{
"epoch": 43.84151593453919,
"eval_loss": 0.3232431709766388,
"eval_runtime": 18.0865,
"eval_samples_per_second": 1050.729,
"eval_steps_per_second": 32.842,
"step": 50900
},
{
"epoch": 43.88458225667528,
"grad_norm": 0.5609408617019653,
"learning_rate": 2.4465116279069768e-05,
"loss": 0.2697344398498535,
"step": 50950
},
{
"epoch": 43.88458225667528,
"eval_loss": 0.3291715979576111,
"eval_runtime": 18.1056,
"eval_samples_per_second": 1049.618,
"eval_steps_per_second": 32.807,
"step": 50950
},
{
"epoch": 43.92764857881137,
"grad_norm": 0.4350355565547943,
"learning_rate": 2.429285099052541e-05,
"loss": 0.25872198104858396,
"step": 51000
},
{
"epoch": 43.92764857881137,
"eval_loss": 0.33080291748046875,
"eval_runtime": 18.1436,
"eval_samples_per_second": 1047.421,
"eval_steps_per_second": 32.739,
"step": 51000
},
{
"epoch": 43.970714900947456,
"grad_norm": 0.39226019382476807,
"learning_rate": 2.4120585701981052e-05,
"loss": 0.2671630668640137,
"step": 51050
},
{
"epoch": 43.970714900947456,
"eval_loss": 0.32811781764030457,
"eval_runtime": 18.179,
"eval_samples_per_second": 1045.38,
"eval_steps_per_second": 32.675,
"step": 51050
},
{
"epoch": 44.01378122308355,
"grad_norm": 0.38461270928382874,
"learning_rate": 2.3948320413436694e-05,
"loss": 0.25831233978271484,
"step": 51100
},
{
"epoch": 44.01378122308355,
"eval_loss": 0.33200836181640625,
"eval_runtime": 18.2577,
"eval_samples_per_second": 1040.877,
"eval_steps_per_second": 32.534,
"step": 51100
},
{
"epoch": 44.05684754521964,
"grad_norm": 0.44967833161354065,
"learning_rate": 2.3776055124892336e-05,
"loss": 0.2555442428588867,
"step": 51150
},
{
"epoch": 44.05684754521964,
"eval_loss": 0.3291064500808716,
"eval_runtime": 17.8675,
"eval_samples_per_second": 1063.607,
"eval_steps_per_second": 33.245,
"step": 51150
},
{
"epoch": 44.09991386735573,
"grad_norm": 0.41032636165618896,
"learning_rate": 2.3603789836347978e-05,
"loss": 0.2579687690734863,
"step": 51200
},
{
"epoch": 44.09991386735573,
"eval_loss": 0.32970529794692993,
"eval_runtime": 17.543,
"eval_samples_per_second": 1083.283,
"eval_steps_per_second": 33.86,
"step": 51200
},
{
"epoch": 44.142980189491816,
"grad_norm": 0.3892042338848114,
"learning_rate": 2.3431524547803617e-05,
"loss": 0.263288688659668,
"step": 51250
},
{
"epoch": 44.142980189491816,
"eval_loss": 0.33633163571357727,
"eval_runtime": 17.6106,
"eval_samples_per_second": 1079.123,
"eval_steps_per_second": 33.73,
"step": 51250
},
{
"epoch": 44.18604651162791,
"grad_norm": 0.44527554512023926,
"learning_rate": 2.3259259259259262e-05,
"loss": 0.2592180252075195,
"step": 51300
},
{
"epoch": 44.18604651162791,
"eval_loss": 0.33094218373298645,
"eval_runtime": 18.2222,
"eval_samples_per_second": 1042.904,
"eval_steps_per_second": 32.598,
"step": 51300
},
{
"epoch": 44.229112833764,
"grad_norm": 0.3995368480682373,
"learning_rate": 2.30869939707149e-05,
"loss": 0.25935218811035154,
"step": 51350
},
{
"epoch": 44.229112833764,
"eval_loss": 0.3350081741809845,
"eval_runtime": 16.471,
"eval_samples_per_second": 1153.786,
"eval_steps_per_second": 36.063,
"step": 51350
},
{
"epoch": 44.27217915590008,
"grad_norm": 0.49399951100349426,
"learning_rate": 2.2914728682170543e-05,
"loss": 0.26251346588134766,
"step": 51400
},
{
"epoch": 44.27217915590008,
"eval_loss": 0.3334828317165375,
"eval_runtime": 18.0529,
"eval_samples_per_second": 1052.682,
"eval_steps_per_second": 32.903,
"step": 51400
},
{
"epoch": 44.315245478036175,
"grad_norm": 0.42008745670318604,
"learning_rate": 2.2742463393626185e-05,
"loss": 0.25890609741210935,
"step": 51450
},
{
"epoch": 44.315245478036175,
"eval_loss": 0.3312210440635681,
"eval_runtime": 18.1708,
"eval_samples_per_second": 1045.852,
"eval_steps_per_second": 32.69,
"step": 51450
},
{
"epoch": 44.35831180017227,
"grad_norm": 0.4206209182739258,
"learning_rate": 2.2570198105081827e-05,
"loss": 0.2611784744262695,
"step": 51500
},
{
"epoch": 44.35831180017227,
"eval_loss": 0.3322766125202179,
"eval_runtime": 18.1357,
"eval_samples_per_second": 1047.879,
"eval_steps_per_second": 32.753,
"step": 51500
},
{
"epoch": 44.40137812230836,
"grad_norm": 0.41025853157043457,
"learning_rate": 2.2397932816537466e-05,
"loss": 0.2615481185913086,
"step": 51550
},
{
"epoch": 44.40137812230836,
"eval_loss": 0.33196064829826355,
"eval_runtime": 18.1456,
"eval_samples_per_second": 1047.305,
"eval_steps_per_second": 32.735,
"step": 51550
},
{
"epoch": 44.44444444444444,
"grad_norm": 0.39682939648628235,
"learning_rate": 2.222566752799311e-05,
"loss": 0.259089298248291,
"step": 51600
},
{
"epoch": 44.44444444444444,
"eval_loss": 0.32832977175712585,
"eval_runtime": 17.8389,
"eval_samples_per_second": 1065.313,
"eval_steps_per_second": 33.298,
"step": 51600
},
{
"epoch": 44.487510766580534,
"grad_norm": 0.41068243980407715,
"learning_rate": 2.2053402239448754e-05,
"loss": 0.25956289291381834,
"step": 51650
},
{
"epoch": 44.487510766580534,
"eval_loss": 0.3320268988609314,
"eval_runtime": 18.1041,
"eval_samples_per_second": 1049.708,
"eval_steps_per_second": 32.81,
"step": 51650
},
{
"epoch": 44.530577088716626,
"grad_norm": 0.43129387497901917,
"learning_rate": 2.1881136950904392e-05,
"loss": 0.25620647430419924,
"step": 51700
},
{
"epoch": 44.530577088716626,
"eval_loss": 0.332096666097641,
"eval_runtime": 16.4704,
"eval_samples_per_second": 1153.828,
"eval_steps_per_second": 36.065,
"step": 51700
},
{
"epoch": 44.57364341085271,
"grad_norm": 0.478359580039978,
"learning_rate": 2.1708871662360038e-05,
"loss": 0.2545368766784668,
"step": 51750
},
{
"epoch": 44.57364341085271,
"eval_loss": 0.3265804946422577,
"eval_runtime": 17.8095,
"eval_samples_per_second": 1067.072,
"eval_steps_per_second": 33.353,
"step": 51750
},
{
"epoch": 44.6167097329888,
"grad_norm": 0.4747825860977173,
"learning_rate": 2.1536606373815677e-05,
"loss": 0.2612367820739746,
"step": 51800
},
{
"epoch": 44.6167097329888,
"eval_loss": 0.3282797634601593,
"eval_runtime": 16.8719,
"eval_samples_per_second": 1126.371,
"eval_steps_per_second": 35.207,
"step": 51800
},
{
"epoch": 44.659776055124894,
"grad_norm": 0.41625022888183594,
"learning_rate": 2.136434108527132e-05,
"loss": 0.2548894691467285,
"step": 51850
},
{
"epoch": 44.659776055124894,
"eval_loss": 0.33373549580574036,
"eval_runtime": 17.9156,
"eval_samples_per_second": 1060.752,
"eval_steps_per_second": 33.155,
"step": 51850
},
{
"epoch": 44.70284237726098,
"grad_norm": 0.40073272585868835,
"learning_rate": 2.119207579672696e-05,
"loss": 0.2608194923400879,
"step": 51900
},
{
"epoch": 44.70284237726098,
"eval_loss": 0.32843106985092163,
"eval_runtime": 18.2982,
"eval_samples_per_second": 1038.571,
"eval_steps_per_second": 32.462,
"step": 51900
},
{
"epoch": 44.74590869939707,
"grad_norm": 0.4103309214115143,
"learning_rate": 2.1019810508182603e-05,
"loss": 0.25819131851196286,
"step": 51950
},
{
"epoch": 44.74590869939707,
"eval_loss": 0.32719510793685913,
"eval_runtime": 18.27,
"eval_samples_per_second": 1040.176,
"eval_steps_per_second": 32.512,
"step": 51950
},
{
"epoch": 44.78897502153316,
"grad_norm": 0.4365871846675873,
"learning_rate": 2.084754521963824e-05,
"loss": 0.2647978591918945,
"step": 52000
},
{
"epoch": 44.78897502153316,
"eval_loss": 0.3374394476413727,
"eval_runtime": 17.8003,
"eval_samples_per_second": 1067.624,
"eval_steps_per_second": 33.37,
"step": 52000
},
{
"epoch": 44.83204134366925,
"grad_norm": 0.4309022128582001,
"learning_rate": 2.0675279931093887e-05,
"loss": 0.25435049057006837,
"step": 52050
},
{
"epoch": 44.83204134366925,
"eval_loss": 0.33556151390075684,
"eval_runtime": 18.5283,
"eval_samples_per_second": 1025.676,
"eval_steps_per_second": 32.059,
"step": 52050
},
{
"epoch": 44.87510766580534,
"grad_norm": 0.4415874183177948,
"learning_rate": 2.050301464254953e-05,
"loss": 0.2644994354248047,
"step": 52100
},
{
"epoch": 44.87510766580534,
"eval_loss": 0.3290407061576843,
"eval_runtime": 18.4239,
"eval_samples_per_second": 1031.488,
"eval_steps_per_second": 32.241,
"step": 52100
},
{
"epoch": 44.91817398794143,
"grad_norm": 0.4384152293205261,
"learning_rate": 2.0330749354005168e-05,
"loss": 0.2529031372070312,
"step": 52150
},
{
"epoch": 44.91817398794143,
"eval_loss": 0.3312751054763794,
"eval_runtime": 17.7215,
"eval_samples_per_second": 1072.371,
"eval_steps_per_second": 33.519,
"step": 52150
},
{
"epoch": 44.96124031007752,
"grad_norm": 0.40394484996795654,
"learning_rate": 2.015848406546081e-05,
"loss": 0.25720306396484377,
"step": 52200
},
{
"epoch": 44.96124031007752,
"eval_loss": 0.3306456208229065,
"eval_runtime": 17.7033,
"eval_samples_per_second": 1073.47,
"eval_steps_per_second": 33.553,
"step": 52200
},
{
"epoch": 45.004306632213606,
"grad_norm": 0.3723146319389343,
"learning_rate": 1.9986218776916452e-05,
"loss": 0.2507796859741211,
"step": 52250
},
{
"epoch": 45.004306632213606,
"eval_loss": 0.3309019207954407,
"eval_runtime": 17.9496,
"eval_samples_per_second": 1058.741,
"eval_steps_per_second": 33.093,
"step": 52250
},
{
"epoch": 45.0473729543497,
"grad_norm": 0.4547770619392395,
"learning_rate": 1.9813953488372094e-05,
"loss": 0.2613633346557617,
"step": 52300
},
{
"epoch": 45.0473729543497,
"eval_loss": 0.33252662420272827,
"eval_runtime": 17.1825,
"eval_samples_per_second": 1106.008,
"eval_steps_per_second": 34.57,
"step": 52300
},
{
"epoch": 45.09043927648579,
"grad_norm": 0.3941754698753357,
"learning_rate": 1.9641688199827736e-05,
"loss": 0.25863794326782225,
"step": 52350
},
{
"epoch": 45.09043927648579,
"eval_loss": 0.3304085433483124,
"eval_runtime": 18.1853,
"eval_samples_per_second": 1045.019,
"eval_steps_per_second": 32.664,
"step": 52350
},
{
"epoch": 45.13350559862188,
"grad_norm": 0.5553924441337585,
"learning_rate": 1.946942291128338e-05,
"loss": 0.25572742462158204,
"step": 52400
},
{
"epoch": 45.13350559862188,
"eval_loss": 0.3342275321483612,
"eval_runtime": 18.1163,
"eval_samples_per_second": 1048.997,
"eval_steps_per_second": 32.788,
"step": 52400
},
{
"epoch": 45.176571920757965,
"grad_norm": 0.4295162260532379,
"learning_rate": 1.9297157622739017e-05,
"loss": 0.25522724151611326,
"step": 52450
},
{
"epoch": 45.176571920757965,
"eval_loss": 0.33520230650901794,
"eval_runtime": 18.0909,
"eval_samples_per_second": 1050.475,
"eval_steps_per_second": 32.834,
"step": 52450
},
{
"epoch": 45.21963824289406,
"grad_norm": 0.4033823311328888,
"learning_rate": 1.912489233419466e-05,
"loss": 0.2575173568725586,
"step": 52500
},
{
"epoch": 45.21963824289406,
"eval_loss": 0.3277575671672821,
"eval_runtime": 18.049,
"eval_samples_per_second": 1052.913,
"eval_steps_per_second": 32.91,
"step": 52500
},
{
"epoch": 45.26270456503015,
"grad_norm": 0.40482550859451294,
"learning_rate": 1.8952627045650305e-05,
"loss": 0.25760974884033205,
"step": 52550
},
{
"epoch": 45.26270456503015,
"eval_loss": 0.32644209265708923,
"eval_runtime": 18.4841,
"eval_samples_per_second": 1028.129,
"eval_steps_per_second": 32.136,
"step": 52550
},
{
"epoch": 45.30577088716623,
"grad_norm": 0.4248245358467102,
"learning_rate": 1.8780361757105943e-05,
"loss": 0.2546268081665039,
"step": 52600
},
{
"epoch": 45.30577088716623,
"eval_loss": 0.329515278339386,
"eval_runtime": 18.1088,
"eval_samples_per_second": 1049.433,
"eval_steps_per_second": 32.802,
"step": 52600
},
{
"epoch": 45.348837209302324,
"grad_norm": 0.4354708790779114,
"learning_rate": 1.8608096468561585e-05,
"loss": 0.262480640411377,
"step": 52650
},
{
"epoch": 45.348837209302324,
"eval_loss": 0.3266183137893677,
"eval_runtime": 17.4725,
"eval_samples_per_second": 1087.655,
"eval_steps_per_second": 33.996,
"step": 52650
},
{
"epoch": 45.391903531438416,
"grad_norm": 0.4125041663646698,
"learning_rate": 1.8435831180017228e-05,
"loss": 0.24627891540527344,
"step": 52700
},
{
"epoch": 45.391903531438416,
"eval_loss": 0.33478984236717224,
"eval_runtime": 18.0794,
"eval_samples_per_second": 1051.138,
"eval_steps_per_second": 32.855,
"step": 52700
},
{
"epoch": 45.43496985357451,
"grad_norm": 0.40405088663101196,
"learning_rate": 1.826356589147287e-05,
"loss": 0.2658543014526367,
"step": 52750
},
{
"epoch": 45.43496985357451,
"eval_loss": 0.32871609926223755,
"eval_runtime": 17.3887,
"eval_samples_per_second": 1092.896,
"eval_steps_per_second": 34.16,
"step": 52750
},
{
"epoch": 45.47803617571059,
"grad_norm": 0.5048729181289673,
"learning_rate": 1.809130060292851e-05,
"loss": 0.2580097198486328,
"step": 52800
},
{
"epoch": 45.47803617571059,
"eval_loss": 0.3327273428440094,
"eval_runtime": 18.192,
"eval_samples_per_second": 1044.637,
"eval_steps_per_second": 32.652,
"step": 52800
},
{
"epoch": 45.521102497846684,
"grad_norm": 0.44812899827957153,
"learning_rate": 1.7919035314384154e-05,
"loss": 0.26092041015625,
"step": 52850
},
{
"epoch": 45.521102497846684,
"eval_loss": 0.3265608847141266,
"eval_runtime": 18.1555,
"eval_samples_per_second": 1046.735,
"eval_steps_per_second": 32.717,
"step": 52850
},
{
"epoch": 45.564168819982775,
"grad_norm": 0.4317931532859802,
"learning_rate": 1.7746770025839793e-05,
"loss": 0.25728765487670896,
"step": 52900
},
{
"epoch": 45.564168819982775,
"eval_loss": 0.32284799218177795,
"eval_runtime": 17.9284,
"eval_samples_per_second": 1059.994,
"eval_steps_per_second": 33.132,
"step": 52900
},
{
"epoch": 45.60723514211886,
"grad_norm": 0.4485609233379364,
"learning_rate": 1.7574504737295435e-05,
"loss": 0.2618018341064453,
"step": 52950
},
{
"epoch": 45.60723514211886,
"eval_loss": 0.3257175385951996,
"eval_runtime": 18.0633,
"eval_samples_per_second": 1052.076,
"eval_steps_per_second": 32.884,
"step": 52950
},
{
"epoch": 45.65030146425495,
"grad_norm": 0.45407286286354065,
"learning_rate": 1.740223944875108e-05,
"loss": 0.2605839157104492,
"step": 53000
},
{
"epoch": 45.65030146425495,
"eval_loss": 0.32873979210853577,
"eval_runtime": 18.0677,
"eval_samples_per_second": 1051.825,
"eval_steps_per_second": 32.876,
"step": 53000
},
{
"epoch": 45.69336778639104,
"grad_norm": 0.40683576464653015,
"learning_rate": 1.722997416020672e-05,
"loss": 0.25455360412597655,
"step": 53050
},
{
"epoch": 45.69336778639104,
"eval_loss": 0.33273980021476746,
"eval_runtime": 17.9958,
"eval_samples_per_second": 1056.026,
"eval_steps_per_second": 33.008,
"step": 53050
},
{
"epoch": 45.736434108527135,
"grad_norm": 0.48655572533607483,
"learning_rate": 1.705770887166236e-05,
"loss": 0.260802116394043,
"step": 53100
},
{
"epoch": 45.736434108527135,
"eval_loss": 0.33060774207115173,
"eval_runtime": 17.1445,
"eval_samples_per_second": 1108.46,
"eval_steps_per_second": 34.647,
"step": 53100
},
{
"epoch": 45.77950043066322,
"grad_norm": 0.4370211660861969,
"learning_rate": 1.6885443583118003e-05,
"loss": 0.25209793090820315,
"step": 53150
},
{
"epoch": 45.77950043066322,
"eval_loss": 0.3310869038105011,
"eval_runtime": 18.1304,
"eval_samples_per_second": 1048.182,
"eval_steps_per_second": 32.763,
"step": 53150
},
{
"epoch": 45.82256675279931,
"grad_norm": 0.41281944513320923,
"learning_rate": 1.6713178294573645e-05,
"loss": 0.2589684295654297,
"step": 53200
},
{
"epoch": 45.82256675279931,
"eval_loss": 0.3299771547317505,
"eval_runtime": 18.0123,
"eval_samples_per_second": 1055.055,
"eval_steps_per_second": 32.977,
"step": 53200
},
{
"epoch": 45.8656330749354,
"grad_norm": 0.4713948965072632,
"learning_rate": 1.6540913006029284e-05,
"loss": 0.2571216011047363,
"step": 53250
},
{
"epoch": 45.8656330749354,
"eval_loss": 0.3286995589733124,
"eval_runtime": 19.7704,
"eval_samples_per_second": 961.233,
"eval_steps_per_second": 30.045,
"step": 53250
},
{
"epoch": 45.90869939707149,
"grad_norm": 0.35535186529159546,
"learning_rate": 1.636864771748493e-05,
"loss": 0.2573351287841797,
"step": 53300
},
{
"epoch": 45.90869939707149,
"eval_loss": 0.3261243402957916,
"eval_runtime": 17.9332,
"eval_samples_per_second": 1059.709,
"eval_steps_per_second": 33.123,
"step": 53300
},
{
"epoch": 45.95176571920758,
"grad_norm": 0.40924257040023804,
"learning_rate": 1.6196382428940568e-05,
"loss": 0.2555196189880371,
"step": 53350
},
{
"epoch": 45.95176571920758,
"eval_loss": 0.3238484859466553,
"eval_runtime": 18.0699,
"eval_samples_per_second": 1051.695,
"eval_steps_per_second": 32.872,
"step": 53350
},
{
"epoch": 45.99483204134367,
"grad_norm": 0.49290111660957336,
"learning_rate": 1.602411714039621e-05,
"loss": 0.2545882034301758,
"step": 53400
},
{
"epoch": 45.99483204134367,
"eval_loss": 0.33049580454826355,
"eval_runtime": 19.3384,
"eval_samples_per_second": 982.707,
"eval_steps_per_second": 30.716,
"step": 53400
},
{
"epoch": 46.03789836347976,
"grad_norm": 0.37689730525016785,
"learning_rate": 1.5851851851851852e-05,
"loss": 0.2548778533935547,
"step": 53450
},
{
"epoch": 46.03789836347976,
"eval_loss": 0.33154818415641785,
"eval_runtime": 19.374,
"eval_samples_per_second": 980.902,
"eval_steps_per_second": 30.66,
"step": 53450
},
{
"epoch": 46.08096468561585,
"grad_norm": 0.4257497489452362,
"learning_rate": 1.5679586563307494e-05,
"loss": 0.26114877700805667,
"step": 53500
},
{
"epoch": 46.08096468561585,
"eval_loss": 0.33142903447151184,
"eval_runtime": 18.3151,
"eval_samples_per_second": 1037.612,
"eval_steps_per_second": 32.432,
"step": 53500
},
{
"epoch": 46.12403100775194,
"grad_norm": 0.45820152759552,
"learning_rate": 1.5507321274763136e-05,
"loss": 0.25467180252075194,
"step": 53550
},
{
"epoch": 46.12403100775194,
"eval_loss": 0.33298689126968384,
"eval_runtime": 17.1221,
"eval_samples_per_second": 1109.909,
"eval_steps_per_second": 34.692,
"step": 53550
},
{
"epoch": 46.16709732988803,
"grad_norm": 0.3875587284564972,
"learning_rate": 1.533505598621878e-05,
"loss": 0.2580219268798828,
"step": 53600
},
{
"epoch": 46.16709732988803,
"eval_loss": 0.3247772455215454,
"eval_runtime": 17.6361,
"eval_samples_per_second": 1077.56,
"eval_steps_per_second": 33.681,
"step": 53600
},
{
"epoch": 46.210163652024114,
"grad_norm": 0.401325523853302,
"learning_rate": 1.516279069767442e-05,
"loss": 0.2550651168823242,
"step": 53650
},
{
"epoch": 46.210163652024114,
"eval_loss": 0.3269961476325989,
"eval_runtime": 17.6496,
"eval_samples_per_second": 1076.735,
"eval_steps_per_second": 33.655,
"step": 53650
},
{
"epoch": 46.253229974160206,
"grad_norm": 0.3880271315574646,
"learning_rate": 1.499052540913006e-05,
"loss": 0.26004016876220704,
"step": 53700
},
{
"epoch": 46.253229974160206,
"eval_loss": 0.329743891954422,
"eval_runtime": 17.3231,
"eval_samples_per_second": 1097.03,
"eval_steps_per_second": 34.289,
"step": 53700
},
{
"epoch": 46.2962962962963,
"grad_norm": 0.43584710359573364,
"learning_rate": 1.4818260120585703e-05,
"loss": 0.26099538803100586,
"step": 53750
},
{
"epoch": 46.2962962962963,
"eval_loss": 0.32898804545402527,
"eval_runtime": 18.4366,
"eval_samples_per_second": 1030.774,
"eval_steps_per_second": 32.218,
"step": 53750
},
{
"epoch": 46.33936261843239,
"grad_norm": 0.42027243971824646,
"learning_rate": 1.4645994832041344e-05,
"loss": 0.2571408462524414,
"step": 53800
},
{
"epoch": 46.33936261843239,
"eval_loss": 0.3268490135669708,
"eval_runtime": 17.572,
"eval_samples_per_second": 1081.493,
"eval_steps_per_second": 33.804,
"step": 53800
},
{
"epoch": 46.382428940568474,
"grad_norm": 0.45979616045951843,
"learning_rate": 1.4473729543496986e-05,
"loss": 0.26052181243896483,
"step": 53850
},
{
"epoch": 46.382428940568474,
"eval_loss": 0.32973024249076843,
"eval_runtime": 17.8772,
"eval_samples_per_second": 1063.033,
"eval_steps_per_second": 33.227,
"step": 53850
},
{
"epoch": 46.425495262704565,
"grad_norm": 0.42952507734298706,
"learning_rate": 1.430146425495263e-05,
"loss": 0.26365802764892576,
"step": 53900
},
{
"epoch": 46.425495262704565,
"eval_loss": 0.3266441226005554,
"eval_runtime": 17.8894,
"eval_samples_per_second": 1062.304,
"eval_steps_per_second": 33.204,
"step": 53900
},
{
"epoch": 46.46856158484066,
"grad_norm": 0.42997005581855774,
"learning_rate": 1.412919896640827e-05,
"loss": 0.25490142822265627,
"step": 53950
},
{
"epoch": 46.46856158484066,
"eval_loss": 0.3287542164325714,
"eval_runtime": 18.2943,
"eval_samples_per_second": 1038.796,
"eval_steps_per_second": 32.469,
"step": 53950
},
{
"epoch": 46.51162790697674,
"grad_norm": 0.5448318719863892,
"learning_rate": 1.3956933677863912e-05,
"loss": 0.2567383575439453,
"step": 54000
},
{
"epoch": 46.51162790697674,
"eval_loss": 0.3273468613624573,
"eval_runtime": 17.8252,
"eval_samples_per_second": 1066.132,
"eval_steps_per_second": 33.324,
"step": 54000
},
{
"epoch": 46.55469422911283,
"grad_norm": 0.4054015874862671,
"learning_rate": 1.3784668389319552e-05,
"loss": 0.26060123443603517,
"step": 54050
},
{
"epoch": 46.55469422911283,
"eval_loss": 0.3238947093486786,
"eval_runtime": 18.5551,
"eval_samples_per_second": 1024.192,
"eval_steps_per_second": 32.013,
"step": 54050
},
{
"epoch": 46.597760551248925,
"grad_norm": 0.4301529824733734,
"learning_rate": 1.3612403100775195e-05,
"loss": 0.2529969787597656,
"step": 54100
},
{
"epoch": 46.597760551248925,
"eval_loss": 0.32576170563697815,
"eval_runtime": 19.3566,
"eval_samples_per_second": 981.785,
"eval_steps_per_second": 30.687,
"step": 54100
},
{
"epoch": 46.64082687338501,
"grad_norm": 0.4531412422657013,
"learning_rate": 1.3440137812230835e-05,
"loss": 0.25750720977783204,
"step": 54150
},
{
"epoch": 46.64082687338501,
"eval_loss": 0.3297915458679199,
"eval_runtime": 18.2668,
"eval_samples_per_second": 1040.358,
"eval_steps_per_second": 32.518,
"step": 54150
},
{
"epoch": 46.6838931955211,
"grad_norm": 0.4417518377304077,
"learning_rate": 1.3267872523686479e-05,
"loss": 0.25784589767456056,
"step": 54200
},
{
"epoch": 46.6838931955211,
"eval_loss": 0.325896292924881,
"eval_runtime": 18.1482,
"eval_samples_per_second": 1047.157,
"eval_steps_per_second": 32.731,
"step": 54200
},
{
"epoch": 46.72695951765719,
"grad_norm": 0.49990609288215637,
"learning_rate": 1.3095607235142119e-05,
"loss": 0.25872652053833006,
"step": 54250
},
{
"epoch": 46.72695951765719,
"eval_loss": 0.3264675438404083,
"eval_runtime": 17.9997,
"eval_samples_per_second": 1055.796,
"eval_steps_per_second": 33.001,
"step": 54250
},
{
"epoch": 46.770025839793284,
"grad_norm": 0.38507169485092163,
"learning_rate": 1.2923341946597761e-05,
"loss": 0.25351362228393554,
"step": 54300
},
{
"epoch": 46.770025839793284,
"eval_loss": 0.3271394371986389,
"eval_runtime": 17.7696,
"eval_samples_per_second": 1069.467,
"eval_steps_per_second": 33.428,
"step": 54300
},
{
"epoch": 46.81309216192937,
"grad_norm": 0.38704913854599,
"learning_rate": 1.2751076658053403e-05,
"loss": 0.2537462615966797,
"step": 54350
},
{
"epoch": 46.81309216192937,
"eval_loss": 0.328862726688385,
"eval_runtime": 18.6196,
"eval_samples_per_second": 1020.647,
"eval_steps_per_second": 31.902,
"step": 54350
},
{
"epoch": 46.85615848406546,
"grad_norm": 0.41383081674575806,
"learning_rate": 1.2578811369509044e-05,
"loss": 0.2505047607421875,
"step": 54400
},
{
"epoch": 46.85615848406546,
"eval_loss": 0.32361650466918945,
"eval_runtime": 18.0463,
"eval_samples_per_second": 1053.071,
"eval_steps_per_second": 32.915,
"step": 54400
},
{
"epoch": 46.89922480620155,
"grad_norm": 0.45698070526123047,
"learning_rate": 1.2406546080964686e-05,
"loss": 0.2587569618225098,
"step": 54450
},
{
"epoch": 46.89922480620155,
"eval_loss": 0.3258303701877594,
"eval_runtime": 17.9164,
"eval_samples_per_second": 1060.705,
"eval_steps_per_second": 33.154,
"step": 54450
},
{
"epoch": 46.94229112833764,
"grad_norm": 0.47596439719200134,
"learning_rate": 1.2234280792420328e-05,
"loss": 0.25485145568847656,
"step": 54500
},
{
"epoch": 46.94229112833764,
"eval_loss": 0.32724428176879883,
"eval_runtime": 18.2902,
"eval_samples_per_second": 1039.028,
"eval_steps_per_second": 32.476,
"step": 54500
},
{
"epoch": 46.98535745047373,
"grad_norm": 0.5511539578437805,
"learning_rate": 1.206201550387597e-05,
"loss": 0.2561703872680664,
"step": 54550
},
{
"epoch": 46.98535745047373,
"eval_loss": 0.3254358172416687,
"eval_runtime": 18.259,
"eval_samples_per_second": 1040.8,
"eval_steps_per_second": 32.532,
"step": 54550
},
{
"epoch": 47.02842377260982,
"grad_norm": 0.5038396120071411,
"learning_rate": 1.1889750215331612e-05,
"loss": 0.25494846343994143,
"step": 54600
},
{
"epoch": 47.02842377260982,
"eval_loss": 0.3333961069583893,
"eval_runtime": 18.4274,
"eval_samples_per_second": 1031.293,
"eval_steps_per_second": 32.235,
"step": 54600
},
{
"epoch": 47.07149009474591,
"grad_norm": 0.45146816968917847,
"learning_rate": 1.1717484926787253e-05,
"loss": 0.2557457733154297,
"step": 54650
},
{
"epoch": 47.07149009474591,
"eval_loss": 0.3291355073451996,
"eval_runtime": 18.674,
"eval_samples_per_second": 1017.673,
"eval_steps_per_second": 31.809,
"step": 54650
},
{
"epoch": 47.114556416881996,
"grad_norm": 0.4485445022583008,
"learning_rate": 1.1545219638242895e-05,
"loss": 0.2512784957885742,
"step": 54700
},
{
"epoch": 47.114556416881996,
"eval_loss": 0.32779595255851746,
"eval_runtime": 17.9957,
"eval_samples_per_second": 1056.03,
"eval_steps_per_second": 33.008,
"step": 54700
},
{
"epoch": 47.15762273901809,
"grad_norm": 0.3778316080570221,
"learning_rate": 1.1372954349698537e-05,
"loss": 0.25392818450927734,
"step": 54750
},
{
"epoch": 47.15762273901809,
"eval_loss": 0.32727065682411194,
"eval_runtime": 17.4622,
"eval_samples_per_second": 1088.295,
"eval_steps_per_second": 34.016,
"step": 54750
},
{
"epoch": 47.20068906115418,
"grad_norm": 0.3127480447292328,
"learning_rate": 1.1200689061154177e-05,
"loss": 0.2568006324768066,
"step": 54800
},
{
"epoch": 47.20068906115418,
"eval_loss": 0.3252197802066803,
"eval_runtime": 18.0907,
"eval_samples_per_second": 1050.487,
"eval_steps_per_second": 32.835,
"step": 54800
},
{
"epoch": 47.243755383290264,
"grad_norm": 0.37857651710510254,
"learning_rate": 1.102842377260982e-05,
"loss": 0.2547434234619141,
"step": 54850
},
{
"epoch": 47.243755383290264,
"eval_loss": 0.3276120126247406,
"eval_runtime": 17.9336,
"eval_samples_per_second": 1059.684,
"eval_steps_per_second": 33.122,
"step": 54850
},
{
"epoch": 47.286821705426355,
"grad_norm": 0.411051869392395,
"learning_rate": 1.0856158484065461e-05,
"loss": 0.2587508010864258,
"step": 54900
},
{
"epoch": 47.286821705426355,
"eval_loss": 0.3328407108783722,
"eval_runtime": 18.5652,
"eval_samples_per_second": 1023.636,
"eval_steps_per_second": 31.995,
"step": 54900
},
{
"epoch": 47.32988802756245,
"grad_norm": 0.4203979969024658,
"learning_rate": 1.0683893195521102e-05,
"loss": 0.2552628517150879,
"step": 54950
},
{
"epoch": 47.32988802756245,
"eval_loss": 0.3313995897769928,
"eval_runtime": 18.193,
"eval_samples_per_second": 1044.576,
"eval_steps_per_second": 32.65,
"step": 54950
},
{
"epoch": 47.37295434969854,
"grad_norm": 0.39956605434417725,
"learning_rate": 1.0511627906976746e-05,
"loss": 0.2482987403869629,
"step": 55000
},
{
"epoch": 47.37295434969854,
"eval_loss": 0.3247491121292114,
"eval_runtime": 18.1318,
"eval_samples_per_second": 1048.104,
"eval_steps_per_second": 32.76,
"step": 55000
},
{
"epoch": 47.41602067183462,
"grad_norm": 0.4590926170349121,
"learning_rate": 1.0339362618432388e-05,
"loss": 0.25707162857055665,
"step": 55050
},
{
"epoch": 47.41602067183462,
"eval_loss": 0.323581337928772,
"eval_runtime": 18.2993,
"eval_samples_per_second": 1038.512,
"eval_steps_per_second": 32.46,
"step": 55050
},
{
"epoch": 47.459086993970715,
"grad_norm": 0.3866632282733917,
"learning_rate": 1.0167097329888028e-05,
"loss": 0.25558311462402344,
"step": 55100
},
{
"epoch": 47.459086993970715,
"eval_loss": 0.3233027160167694,
"eval_runtime": 18.1385,
"eval_samples_per_second": 1047.718,
"eval_steps_per_second": 32.748,
"step": 55100
},
{
"epoch": 47.502153316106806,
"grad_norm": 0.5196178555488586,
"learning_rate": 9.99483204134367e-06,
"loss": 0.2581501007080078,
"step": 55150
},
{
"epoch": 47.502153316106806,
"eval_loss": 0.3227282166481018,
"eval_runtime": 17.7718,
"eval_samples_per_second": 1069.334,
"eval_steps_per_second": 33.424,
"step": 55150
},
{
"epoch": 47.54521963824289,
"grad_norm": 0.9048319458961487,
"learning_rate": 9.822566752799312e-06,
"loss": 0.2537050628662109,
"step": 55200
},
{
"epoch": 47.54521963824289,
"eval_loss": 0.3279529809951782,
"eval_runtime": 17.8061,
"eval_samples_per_second": 1067.276,
"eval_steps_per_second": 33.359,
"step": 55200
},
{
"epoch": 47.58828596037898,
"grad_norm": 0.4730582535266876,
"learning_rate": 9.650301464254953e-06,
"loss": 0.2550501251220703,
"step": 55250
},
{
"epoch": 47.58828596037898,
"eval_loss": 0.3323853313922882,
"eval_runtime": 18.4314,
"eval_samples_per_second": 1031.065,
"eval_steps_per_second": 32.228,
"step": 55250
},
{
"epoch": 47.631352282515074,
"grad_norm": 0.4794359803199768,
"learning_rate": 9.478036175710595e-06,
"loss": 0.25382413864135744,
"step": 55300
},
{
"epoch": 47.631352282515074,
"eval_loss": 0.3281784951686859,
"eval_runtime": 18.7928,
"eval_samples_per_second": 1011.237,
"eval_steps_per_second": 31.608,
"step": 55300
},
{
"epoch": 47.674418604651166,
"grad_norm": 0.45010513067245483,
"learning_rate": 9.305770887166237e-06,
"loss": 0.256310920715332,
"step": 55350
},
{
"epoch": 47.674418604651166,
"eval_loss": 0.32857298851013184,
"eval_runtime": 18.0712,
"eval_samples_per_second": 1051.618,
"eval_steps_per_second": 32.87,
"step": 55350
},
{
"epoch": 47.71748492678725,
"grad_norm": 0.37499430775642395,
"learning_rate": 9.133505598621877e-06,
"loss": 0.2561494827270508,
"step": 55400
},
{
"epoch": 47.71748492678725,
"eval_loss": 0.3282778263092041,
"eval_runtime": 16.3235,
"eval_samples_per_second": 1164.211,
"eval_steps_per_second": 36.389,
"step": 55400
},
{
"epoch": 47.76055124892334,
"grad_norm": 0.4058770537376404,
"learning_rate": 8.96124031007752e-06,
"loss": 0.251326904296875,
"step": 55450
},
{
"epoch": 47.76055124892334,
"eval_loss": 0.3290371298789978,
"eval_runtime": 17.7668,
"eval_samples_per_second": 1069.636,
"eval_steps_per_second": 33.433,
"step": 55450
},
{
"epoch": 47.80361757105943,
"grad_norm": 0.36981111764907837,
"learning_rate": 8.788975021533161e-06,
"loss": 0.2633692359924316,
"step": 55500
},
{
"epoch": 47.80361757105943,
"eval_loss": 0.323087602853775,
"eval_runtime": 18.1501,
"eval_samples_per_second": 1047.047,
"eval_steps_per_second": 32.727,
"step": 55500
},
{
"epoch": 47.84668389319552,
"grad_norm": 0.4622083306312561,
"learning_rate": 8.616709732988804e-06,
"loss": 0.2480868911743164,
"step": 55550
},
{
"epoch": 47.84668389319552,
"eval_loss": 0.3338957726955414,
"eval_runtime": 19.5237,
"eval_samples_per_second": 973.38,
"eval_steps_per_second": 30.425,
"step": 55550
},
{
"epoch": 47.88975021533161,
"grad_norm": 0.5124086141586304,
"learning_rate": 8.444444444444446e-06,
"loss": 0.2595103645324707,
"step": 55600
},
{
"epoch": 47.88975021533161,
"eval_loss": 0.3314855098724365,
"eval_runtime": 16.1957,
"eval_samples_per_second": 1173.398,
"eval_steps_per_second": 36.676,
"step": 55600
},
{
"epoch": 47.9328165374677,
"grad_norm": 0.4180132746696472,
"learning_rate": 8.272179155900086e-06,
"loss": 0.25893829345703123,
"step": 55650
},
{
"epoch": 47.9328165374677,
"eval_loss": 0.3274875581264496,
"eval_runtime": 16.6508,
"eval_samples_per_second": 1141.326,
"eval_steps_per_second": 35.674,
"step": 55650
},
{
"epoch": 47.97588285960379,
"grad_norm": 0.39777863025665283,
"learning_rate": 8.099913867355728e-06,
"loss": 0.2580837059020996,
"step": 55700
},
{
"epoch": 47.97588285960379,
"eval_loss": 0.32824796438217163,
"eval_runtime": 17.4958,
"eval_samples_per_second": 1086.202,
"eval_steps_per_second": 33.951,
"step": 55700
},
{
"epoch": 48.01894918173988,
"grad_norm": 0.3993493914604187,
"learning_rate": 7.92764857881137e-06,
"loss": 0.26053876876831056,
"step": 55750
},
{
"epoch": 48.01894918173988,
"eval_loss": 0.32810813188552856,
"eval_runtime": 17.3915,
"eval_samples_per_second": 1092.721,
"eval_steps_per_second": 34.155,
"step": 55750
},
{
"epoch": 48.06201550387597,
"grad_norm": 0.4241078495979309,
"learning_rate": 7.75538329026701e-06,
"loss": 0.25140480041503904,
"step": 55800
},
{
"epoch": 48.06201550387597,
"eval_loss": 0.32518553733825684,
"eval_runtime": 17.5472,
"eval_samples_per_second": 1083.024,
"eval_steps_per_second": 33.852,
"step": 55800
},
{
"epoch": 48.10508182601206,
"grad_norm": 0.4179937243461609,
"learning_rate": 7.583118001722653e-06,
"loss": 0.2537392044067383,
"step": 55850
},
{
"epoch": 48.10508182601206,
"eval_loss": 0.3269120454788208,
"eval_runtime": 16.5361,
"eval_samples_per_second": 1149.244,
"eval_steps_per_second": 35.921,
"step": 55850
},
{
"epoch": 48.148148148148145,
"grad_norm": 0.4175696074962616,
"learning_rate": 7.410852713178294e-06,
"loss": 0.2580715560913086,
"step": 55900
},
{
"epoch": 48.148148148148145,
"eval_loss": 0.333906352519989,
"eval_runtime": 17.1017,
"eval_samples_per_second": 1111.232,
"eval_steps_per_second": 34.733,
"step": 55900
},
{
"epoch": 48.19121447028424,
"grad_norm": 0.4846994876861572,
"learning_rate": 7.238587424633937e-06,
"loss": 0.26174249649047854,
"step": 55950
},
{
"epoch": 48.19121447028424,
"eval_loss": 0.3237777352333069,
"eval_runtime": 17.4587,
"eval_samples_per_second": 1088.511,
"eval_steps_per_second": 34.023,
"step": 55950
},
{
"epoch": 48.23428079242033,
"grad_norm": 0.4444175660610199,
"learning_rate": 7.066322136089579e-06,
"loss": 0.25842620849609377,
"step": 56000
},
{
"epoch": 48.23428079242033,
"eval_loss": 0.334178626537323,
"eval_runtime": 17.5253,
"eval_samples_per_second": 1084.378,
"eval_steps_per_second": 33.894,
"step": 56000
},
{
"epoch": 48.27734711455642,
"grad_norm": 0.3756895065307617,
"learning_rate": 6.89405684754522e-06,
"loss": 0.25008062362670896,
"step": 56050
},
{
"epoch": 48.27734711455642,
"eval_loss": 0.3291133940219879,
"eval_runtime": 18.0697,
"eval_samples_per_second": 1051.705,
"eval_steps_per_second": 32.873,
"step": 56050
},
{
"epoch": 48.320413436692505,
"grad_norm": 0.43787986040115356,
"learning_rate": 6.721791559000862e-06,
"loss": 0.2632747268676758,
"step": 56100
},
{
"epoch": 48.320413436692505,
"eval_loss": 0.32688236236572266,
"eval_runtime": 17.8228,
"eval_samples_per_second": 1066.273,
"eval_steps_per_second": 33.328,
"step": 56100
},
{
"epoch": 48.363479758828596,
"grad_norm": 0.37965553998947144,
"learning_rate": 6.549526270456504e-06,
"loss": 0.2511505699157715,
"step": 56150
},
{
"epoch": 48.363479758828596,
"eval_loss": 0.33095940947532654,
"eval_runtime": 17.6923,
"eval_samples_per_second": 1074.138,
"eval_steps_per_second": 33.574,
"step": 56150
},
{
"epoch": 48.40654608096469,
"grad_norm": 0.44011443853378296,
"learning_rate": 6.377260981912145e-06,
"loss": 0.2500026893615723,
"step": 56200
},
{
"epoch": 48.40654608096469,
"eval_loss": 0.3213208317756653,
"eval_runtime": 18.091,
"eval_samples_per_second": 1050.465,
"eval_steps_per_second": 32.834,
"step": 56200
},
{
"epoch": 48.44961240310077,
"grad_norm": 0.41959789395332336,
"learning_rate": 6.204995693367786e-06,
"loss": 0.25453189849853514,
"step": 56250
},
{
"epoch": 48.44961240310077,
"eval_loss": 0.32835957407951355,
"eval_runtime": 18.0779,
"eval_samples_per_second": 1051.228,
"eval_steps_per_second": 32.858,
"step": 56250
},
{
"epoch": 48.492678725236864,
"grad_norm": 0.4679143726825714,
"learning_rate": 6.032730404823428e-06,
"loss": 0.2501845359802246,
"step": 56300
},
{
"epoch": 48.492678725236864,
"eval_loss": 0.3308468163013458,
"eval_runtime": 18.1336,
"eval_samples_per_second": 1048.0,
"eval_steps_per_second": 32.757,
"step": 56300
},
{
"epoch": 48.535745047372956,
"grad_norm": 0.4774787724018097,
"learning_rate": 5.86046511627907e-06,
"loss": 0.26023746490478517,
"step": 56350
},
{
"epoch": 48.535745047372956,
"eval_loss": 0.33171480894088745,
"eval_runtime": 18.1585,
"eval_samples_per_second": 1046.562,
"eval_steps_per_second": 32.712,
"step": 56350
},
{
"epoch": 48.57881136950905,
"grad_norm": 0.4261847138404846,
"learning_rate": 5.688199827734712e-06,
"loss": 0.25504995346069337,
"step": 56400
},
{
"epoch": 48.57881136950905,
"eval_loss": 0.330726683139801,
"eval_runtime": 17.8733,
"eval_samples_per_second": 1063.26,
"eval_steps_per_second": 33.234,
"step": 56400
},
{
"epoch": 48.62187769164513,
"grad_norm": 0.45377856492996216,
"learning_rate": 5.515934539190353e-06,
"loss": 0.25870571136474607,
"step": 56450
},
{
"epoch": 48.62187769164513,
"eval_loss": 0.32752057909965515,
"eval_runtime": 16.8981,
"eval_samples_per_second": 1124.625,
"eval_steps_per_second": 35.152,
"step": 56450
},
{
"epoch": 48.66494401378122,
"grad_norm": 0.4299466609954834,
"learning_rate": 5.343669250645995e-06,
"loss": 0.24694084167480468,
"step": 56500
},
{
"epoch": 48.66494401378122,
"eval_loss": 0.3338819146156311,
"eval_runtime": 18.0458,
"eval_samples_per_second": 1053.096,
"eval_steps_per_second": 32.916,
"step": 56500
},
{
"epoch": 48.708010335917315,
"grad_norm": 0.3479226529598236,
"learning_rate": 5.171403962101637e-06,
"loss": 0.26142990112304687,
"step": 56550
},
{
"epoch": 48.708010335917315,
"eval_loss": 0.32899126410484314,
"eval_runtime": 18.0383,
"eval_samples_per_second": 1053.535,
"eval_steps_per_second": 32.93,
"step": 56550
},
{
"epoch": 48.7510766580534,
"grad_norm": 0.47992098331451416,
"learning_rate": 4.999138673557278e-06,
"loss": 0.25217716217041014,
"step": 56600
},
{
"epoch": 48.7510766580534,
"eval_loss": 0.33228740096092224,
"eval_runtime": 17.5929,
"eval_samples_per_second": 1080.206,
"eval_steps_per_second": 33.764,
"step": 56600
},
{
"epoch": 48.79414298018949,
"grad_norm": 0.3716151714324951,
"learning_rate": 4.82687338501292e-06,
"loss": 0.2575519943237305,
"step": 56650
},
{
"epoch": 48.79414298018949,
"eval_loss": 0.32792726159095764,
"eval_runtime": 17.7377,
"eval_samples_per_second": 1071.389,
"eval_steps_per_second": 33.488,
"step": 56650
},
{
"epoch": 48.83720930232558,
"grad_norm": 0.5173632502555847,
"learning_rate": 4.654608096468562e-06,
"loss": 0.2547392272949219,
"step": 56700
},
{
"epoch": 48.83720930232558,
"eval_loss": 0.3242984712123871,
"eval_runtime": 17.8788,
"eval_samples_per_second": 1062.933,
"eval_steps_per_second": 33.224,
"step": 56700
},
{
"epoch": 48.880275624461675,
"grad_norm": 0.4812866747379303,
"learning_rate": 4.482342807924204e-06,
"loss": 0.25394062042236326,
"step": 56750
},
{
"epoch": 48.880275624461675,
"eval_loss": 0.3332490622997284,
"eval_runtime": 18.2249,
"eval_samples_per_second": 1042.75,
"eval_steps_per_second": 32.593,
"step": 56750
},
{
"epoch": 48.92334194659776,
"grad_norm": 0.3856547772884369,
"learning_rate": 4.310077519379845e-06,
"loss": 0.25180145263671877,
"step": 56800
},
{
"epoch": 48.92334194659776,
"eval_loss": 0.3311121463775635,
"eval_runtime": 18.1641,
"eval_samples_per_second": 1046.238,
"eval_steps_per_second": 32.702,
"step": 56800
},
{
"epoch": 48.96640826873385,
"grad_norm": 0.39660078287124634,
"learning_rate": 4.137812230835487e-06,
"loss": 0.253763427734375,
"step": 56850
},
{
"epoch": 48.96640826873385,
"eval_loss": 0.32043248414993286,
"eval_runtime": 18.0899,
"eval_samples_per_second": 1050.53,
"eval_steps_per_second": 32.836,
"step": 56850
},
{
"epoch": 49.00947459086994,
"grad_norm": 0.49255695939064026,
"learning_rate": 3.965546942291128e-06,
"loss": 0.25162689208984373,
"step": 56900
},
{
"epoch": 49.00947459086994,
"eval_loss": 0.3301583528518677,
"eval_runtime": 18.2011,
"eval_samples_per_second": 1044.111,
"eval_steps_per_second": 32.635,
"step": 56900
},
{
"epoch": 49.05254091300603,
"grad_norm": 0.46763548254966736,
"learning_rate": 3.7932816537467705e-06,
"loss": 0.25125923156738283,
"step": 56950
},
{
"epoch": 49.05254091300603,
"eval_loss": 0.3262256383895874,
"eval_runtime": 17.8076,
"eval_samples_per_second": 1067.186,
"eval_steps_per_second": 33.357,
"step": 56950
},
{
"epoch": 49.09560723514212,
"grad_norm": 0.4103085994720459,
"learning_rate": 3.621016365202412e-06,
"loss": 0.2495307731628418,
"step": 57000
},
{
"epoch": 49.09560723514212,
"eval_loss": 0.3282013237476349,
"eval_runtime": 17.9406,
"eval_samples_per_second": 1059.273,
"eval_steps_per_second": 33.109,
"step": 57000
},
{
"epoch": 49.13867355727821,
"grad_norm": 0.34167978167533875,
"learning_rate": 3.4487510766580534e-06,
"loss": 0.2514477729797363,
"step": 57050
},
{
"epoch": 49.13867355727821,
"eval_loss": 0.3242286145687103,
"eval_runtime": 17.5082,
"eval_samples_per_second": 1085.432,
"eval_steps_per_second": 33.927,
"step": 57050
},
{
"epoch": 49.181739879414295,
"grad_norm": 0.4692245125770569,
"learning_rate": 3.276485788113695e-06,
"loss": 0.25927200317382815,
"step": 57100
},
{
"epoch": 49.181739879414295,
"eval_loss": 0.3265242278575897,
"eval_runtime": 18.1814,
"eval_samples_per_second": 1045.241,
"eval_steps_per_second": 32.671,
"step": 57100
},
{
"epoch": 49.224806201550386,
"grad_norm": 0.48123684525489807,
"learning_rate": 3.104220499569337e-06,
"loss": 0.2532792663574219,
"step": 57150
},
{
"epoch": 49.224806201550386,
"eval_loss": 0.324819952249527,
"eval_runtime": 17.9832,
"eval_samples_per_second": 1056.762,
"eval_steps_per_second": 33.031,
"step": 57150
},
{
"epoch": 49.26787252368648,
"grad_norm": 0.3839048147201538,
"learning_rate": 2.9319552110249785e-06,
"loss": 0.2522504234313965,
"step": 57200
},
{
"epoch": 49.26787252368648,
"eval_loss": 0.3209121823310852,
"eval_runtime": 17.9498,
"eval_samples_per_second": 1058.728,
"eval_steps_per_second": 33.092,
"step": 57200
},
{
"epoch": 49.31093884582257,
"grad_norm": 0.38812488317489624,
"learning_rate": 2.7596899224806206e-06,
"loss": 0.2546123504638672,
"step": 57250
},
{
"epoch": 49.31093884582257,
"eval_loss": 0.3188753128051758,
"eval_runtime": 17.8365,
"eval_samples_per_second": 1065.453,
"eval_steps_per_second": 33.302,
"step": 57250
},
{
"epoch": 49.354005167958654,
"grad_norm": 0.42530906200408936,
"learning_rate": 2.587424633936262e-06,
"loss": 0.2523210525512695,
"step": 57300
},
{
"epoch": 49.354005167958654,
"eval_loss": 0.322894424200058,
"eval_runtime": 18.1021,
"eval_samples_per_second": 1049.822,
"eval_steps_per_second": 32.814,
"step": 57300
},
{
"epoch": 49.397071490094746,
"grad_norm": 0.46608835458755493,
"learning_rate": 2.415159345391904e-06,
"loss": 0.2563011169433594,
"step": 57350
},
{
"epoch": 49.397071490094746,
"eval_loss": 0.3301422595977783,
"eval_runtime": 18.03,
"eval_samples_per_second": 1054.021,
"eval_steps_per_second": 32.945,
"step": 57350
},
{
"epoch": 49.44013781223084,
"grad_norm": 0.3985242247581482,
"learning_rate": 2.242894056847545e-06,
"loss": 0.2541914939880371,
"step": 57400
},
{
"epoch": 49.44013781223084,
"eval_loss": 0.3331603705883026,
"eval_runtime": 17.8584,
"eval_samples_per_second": 1064.148,
"eval_steps_per_second": 33.262,
"step": 57400
},
{
"epoch": 49.48320413436692,
"grad_norm": 0.4562121033668518,
"learning_rate": 2.070628768303187e-06,
"loss": 0.2531538391113281,
"step": 57450
},
{
"epoch": 49.48320413436692,
"eval_loss": 0.3276031017303467,
"eval_runtime": 17.8464,
"eval_samples_per_second": 1064.864,
"eval_steps_per_second": 33.284,
"step": 57450
},
{
"epoch": 49.52627045650301,
"grad_norm": 0.39313840866088867,
"learning_rate": 1.8983634797588287e-06,
"loss": 0.26208810806274413,
"step": 57500
},
{
"epoch": 49.52627045650301,
"eval_loss": 0.33253973722457886,
"eval_runtime": 17.1618,
"eval_samples_per_second": 1107.343,
"eval_steps_per_second": 34.612,
"step": 57500
},
{
"epoch": 49.569336778639105,
"grad_norm": 0.4557619094848633,
"learning_rate": 1.7260981912144702e-06,
"loss": 0.2460823440551758,
"step": 57550
},
{
"epoch": 49.569336778639105,
"eval_loss": 0.327955424785614,
"eval_runtime": 18.1273,
"eval_samples_per_second": 1048.364,
"eval_steps_per_second": 32.768,
"step": 57550
},
{
"epoch": 49.6124031007752,
"grad_norm": 0.5313220620155334,
"learning_rate": 1.553832902670112e-06,
"loss": 0.2594928359985352,
"step": 57600
},
{
"epoch": 49.6124031007752,
"eval_loss": 0.32700130343437195,
"eval_runtime": 18.1934,
"eval_samples_per_second": 1044.556,
"eval_steps_per_second": 32.649,
"step": 57600
},
{
"epoch": 49.65546942291128,
"grad_norm": 0.42723724246025085,
"learning_rate": 1.3815676141257536e-06,
"loss": 0.25847515106201174,
"step": 57650
},
{
"epoch": 49.65546942291128,
"eval_loss": 0.3303566575050354,
"eval_runtime": 18.0919,
"eval_samples_per_second": 1050.416,
"eval_steps_per_second": 32.832,
"step": 57650
},
{
"epoch": 49.69853574504737,
"grad_norm": 0.4465530216693878,
"learning_rate": 1.2093023255813954e-06,
"loss": 0.2579035186767578,
"step": 57700
},
{
"epoch": 49.69853574504737,
"eval_loss": 0.32566210627555847,
"eval_runtime": 18.1173,
"eval_samples_per_second": 1048.941,
"eval_steps_per_second": 32.786,
"step": 57700
},
{
"epoch": 49.741602067183464,
"grad_norm": 0.46042123436927795,
"learning_rate": 1.0370370370370371e-06,
"loss": 0.25134170532226563,
"step": 57750
},
{
"epoch": 49.741602067183464,
"eval_loss": 0.33262744545936584,
"eval_runtime": 18.0792,
"eval_samples_per_second": 1051.155,
"eval_steps_per_second": 32.856,
"step": 57750
},
{
"epoch": 49.78466838931955,
"grad_norm": 0.3972814977169037,
"learning_rate": 8.647717484926788e-07,
"loss": 0.2506429290771484,
"step": 57800
},
{
"epoch": 49.78466838931955,
"eval_loss": 0.32905587553977966,
"eval_runtime": 17.5711,
"eval_samples_per_second": 1081.55,
"eval_steps_per_second": 33.806,
"step": 57800
},
{
"epoch": 49.82773471145564,
"grad_norm": 0.40822234749794006,
"learning_rate": 6.925064599483205e-07,
"loss": 0.2546335220336914,
"step": 57850
},
{
"epoch": 49.82773471145564,
"eval_loss": 0.3248952031135559,
"eval_runtime": 17.4019,
"eval_samples_per_second": 1092.064,
"eval_steps_per_second": 34.134,
"step": 57850
},
{
"epoch": 49.87080103359173,
"grad_norm": 0.3996763825416565,
"learning_rate": 5.202411714039622e-07,
"loss": 0.25602977752685546,
"step": 57900
},
{
"epoch": 49.87080103359173,
"eval_loss": 0.32748210430145264,
"eval_runtime": 18.3587,
"eval_samples_per_second": 1035.148,
"eval_steps_per_second": 32.355,
"step": 57900
},
{
"epoch": 49.913867355727824,
"grad_norm": 0.5123440027236938,
"learning_rate": 3.4797588285960383e-07,
"loss": 0.25229768753051757,
"step": 57950
},
{
"epoch": 49.913867355727824,
"eval_loss": 0.32939910888671875,
"eval_runtime": 18.5038,
"eval_samples_per_second": 1027.032,
"eval_steps_per_second": 32.101,
"step": 57950
},
{
"epoch": 49.95693367786391,
"grad_norm": 0.4721235930919647,
"learning_rate": 1.7571059431524548e-07,
"loss": 0.25507354736328125,
"step": 58000
},
{
"epoch": 49.95693367786391,
"eval_loss": 0.3246816396713257,
"eval_runtime": 18.9481,
"eval_samples_per_second": 1002.948,
"eval_steps_per_second": 31.349,
"step": 58000
},
{
"epoch": 50.0,
"grad_norm": 0.35689032077789307,
"learning_rate": 3.4453057708871665e-09,
"loss": 0.25033538818359374,
"step": 58050
},
{
"epoch": 50.0,
"eval_loss": 0.3281475007534027,
"eval_runtime": 17.9258,
"eval_samples_per_second": 1060.145,
"eval_steps_per_second": 33.137,
"step": 58050
}
],
"logging_steps": 50,
"max_steps": 58050,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.561600021990736e+18,
"train_batch_size": 64,
"trial_name": null,
"trial_params": null
}