SpireLab's picture
Upload folder using huggingface_hub
137c748 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.999894414528561,
"eval_steps": 500,
"global_step": 42618,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.007039031429275332,
"grad_norm": 36.44467544555664,
"learning_rate": 1.1731581417175035e-06,
"loss": 6.2071,
"step": 100
},
{
"epoch": 0.014078062858550663,
"grad_norm": 43.126609802246094,
"learning_rate": 2.346316283435007e-06,
"loss": 5.9231,
"step": 200
},
{
"epoch": 0.021117094287825995,
"grad_norm": 29.171634674072266,
"learning_rate": 3.5194744251525106e-06,
"loss": 5.4145,
"step": 300
},
{
"epoch": 0.028156125717101327,
"grad_norm": 38.217105865478516,
"learning_rate": 4.692632566870014e-06,
"loss": 4.9149,
"step": 400
},
{
"epoch": 0.03519515714637666,
"grad_norm": 35.40254211425781,
"learning_rate": 5.865790708587518e-06,
"loss": 4.5052,
"step": 500
},
{
"epoch": 0.03519515714637666,
"eval_runtime": 191.3754,
"eval_samples_per_second": 148.467,
"eval_steps_per_second": 18.56,
"step": 500
},
{
"epoch": 0.04223418857565199,
"grad_norm": 36.732643127441406,
"learning_rate": 7.038948850305021e-06,
"loss": 4.3715,
"step": 600
},
{
"epoch": 0.04927322000492732,
"grad_norm": 36.27021408081055,
"learning_rate": 8.212106992022525e-06,
"loss": 4.3269,
"step": 700
},
{
"epoch": 0.056312251434202654,
"grad_norm": 42.45858383178711,
"learning_rate": 9.385265133740028e-06,
"loss": 4.0589,
"step": 800
},
{
"epoch": 0.06335128286347799,
"grad_norm": 42.08483123779297,
"learning_rate": 1.0558423275457532e-05,
"loss": 4.1336,
"step": 900
},
{
"epoch": 0.07039031429275332,
"grad_norm": 42.23253631591797,
"learning_rate": 1.1731581417175035e-05,
"loss": 4.0719,
"step": 1000
},
{
"epoch": 0.07039031429275332,
"eval_runtime": 193.8033,
"eval_samples_per_second": 146.607,
"eval_steps_per_second": 18.328,
"step": 1000
},
{
"epoch": 0.07742934572202866,
"grad_norm": 48.905662536621094,
"learning_rate": 1.2904739558892539e-05,
"loss": 3.8613,
"step": 1100
},
{
"epoch": 0.08446837715130398,
"grad_norm": 37.9277458190918,
"learning_rate": 1.4077897700610042e-05,
"loss": 3.8424,
"step": 1200
},
{
"epoch": 0.09150740858057932,
"grad_norm": 48.82701110839844,
"learning_rate": 1.5251055842327546e-05,
"loss": 3.7771,
"step": 1300
},
{
"epoch": 0.09854644000985464,
"grad_norm": 33.38028335571289,
"learning_rate": 1.642421398404505e-05,
"loss": 3.8094,
"step": 1400
},
{
"epoch": 0.10558547143912998,
"grad_norm": 61.35352325439453,
"learning_rate": 1.7597372125762555e-05,
"loss": 3.8331,
"step": 1500
},
{
"epoch": 0.10558547143912998,
"eval_runtime": 191.8384,
"eval_samples_per_second": 148.109,
"eval_steps_per_second": 18.516,
"step": 1500
},
{
"epoch": 0.11262450286840531,
"grad_norm": 46.74394226074219,
"learning_rate": 1.8770530267480057e-05,
"loss": 3.6822,
"step": 1600
},
{
"epoch": 0.11966353429768065,
"grad_norm": 35.53325271606445,
"learning_rate": 1.9943688409197562e-05,
"loss": 3.6282,
"step": 1700
},
{
"epoch": 0.12670256572695598,
"grad_norm": 37.73524856567383,
"learning_rate": 2.1116846550915064e-05,
"loss": 3.5722,
"step": 1800
},
{
"epoch": 0.1337415971562313,
"grad_norm": 33.76814651489258,
"learning_rate": 2.229000469263257e-05,
"loss": 3.6086,
"step": 1900
},
{
"epoch": 0.14078062858550663,
"grad_norm": 41.888282775878906,
"learning_rate": 2.346316283435007e-05,
"loss": 3.6142,
"step": 2000
},
{
"epoch": 0.14078062858550663,
"eval_runtime": 191.5815,
"eval_samples_per_second": 148.308,
"eval_steps_per_second": 18.54,
"step": 2000
},
{
"epoch": 0.14781966001478197,
"grad_norm": 39.62664031982422,
"learning_rate": 2.4636320976067576e-05,
"loss": 3.6029,
"step": 2100
},
{
"epoch": 0.1548586914440573,
"grad_norm": 38.377532958984375,
"learning_rate": 2.5809479117785078e-05,
"loss": 3.4959,
"step": 2200
},
{
"epoch": 0.16189772287333262,
"grad_norm": 32.66987991333008,
"learning_rate": 2.698263725950258e-05,
"loss": 3.5252,
"step": 2300
},
{
"epoch": 0.16893675430260796,
"grad_norm": 39.213592529296875,
"learning_rate": 2.8155795401220085e-05,
"loss": 3.5859,
"step": 2400
},
{
"epoch": 0.1759757857318833,
"grad_norm": 31.646276473999023,
"learning_rate": 2.9328953542937587e-05,
"loss": 3.4995,
"step": 2500
},
{
"epoch": 0.1759757857318833,
"eval_runtime": 194.3308,
"eval_samples_per_second": 146.209,
"eval_steps_per_second": 18.278,
"step": 2500
},
{
"epoch": 0.18301481716115864,
"grad_norm": 32.30677032470703,
"learning_rate": 3.0502111684655092e-05,
"loss": 3.5853,
"step": 2600
},
{
"epoch": 0.19005384859043395,
"grad_norm": 31.175769805908203,
"learning_rate": 3.1675269826372594e-05,
"loss": 3.5134,
"step": 2700
},
{
"epoch": 0.1970928800197093,
"grad_norm": 31.389162063598633,
"learning_rate": 3.28484279680901e-05,
"loss": 3.4909,
"step": 2800
},
{
"epoch": 0.20413191144898463,
"grad_norm": 33.105369567871094,
"learning_rate": 3.4021586109807604e-05,
"loss": 3.4099,
"step": 2900
},
{
"epoch": 0.21117094287825997,
"grad_norm": 25.477977752685547,
"learning_rate": 3.519474425152511e-05,
"loss": 3.3823,
"step": 3000
},
{
"epoch": 0.21117094287825997,
"eval_runtime": 194.3983,
"eval_samples_per_second": 146.159,
"eval_steps_per_second": 18.272,
"step": 3000
},
{
"epoch": 0.21820997430753528,
"grad_norm": 29.61454200744629,
"learning_rate": 3.636790239324261e-05,
"loss": 3.3476,
"step": 3100
},
{
"epoch": 0.22524900573681061,
"grad_norm": 26.82366180419922,
"learning_rate": 3.754106053496011e-05,
"loss": 3.389,
"step": 3200
},
{
"epoch": 0.23228803716608595,
"grad_norm": 26.6168155670166,
"learning_rate": 3.871421867667762e-05,
"loss": 3.3712,
"step": 3300
},
{
"epoch": 0.2393270685953613,
"grad_norm": 24.504793167114258,
"learning_rate": 3.9887376818395124e-05,
"loss": 3.2693,
"step": 3400
},
{
"epoch": 0.2463661000246366,
"grad_norm": 22.34451675415039,
"learning_rate": 4.106053496011262e-05,
"loss": 3.3719,
"step": 3500
},
{
"epoch": 0.2463661000246366,
"eval_runtime": 192.2522,
"eval_samples_per_second": 147.79,
"eval_steps_per_second": 18.476,
"step": 3500
},
{
"epoch": 0.25340513145391197,
"grad_norm": 30.370140075683594,
"learning_rate": 4.223369310183013e-05,
"loss": 3.3216,
"step": 3600
},
{
"epoch": 0.2604441628831873,
"grad_norm": 29.111398696899414,
"learning_rate": 4.340685124354763e-05,
"loss": 3.3085,
"step": 3700
},
{
"epoch": 0.2674831943124626,
"grad_norm": 29.50999641418457,
"learning_rate": 4.458000938526514e-05,
"loss": 3.2907,
"step": 3800
},
{
"epoch": 0.27452222574173796,
"grad_norm": 21.999244689941406,
"learning_rate": 4.5753167526982636e-05,
"loss": 3.2173,
"step": 3900
},
{
"epoch": 0.28156125717101327,
"grad_norm": 28.0905818939209,
"learning_rate": 4.692632566870014e-05,
"loss": 3.3431,
"step": 4000
},
{
"epoch": 0.28156125717101327,
"eval_runtime": 192.489,
"eval_samples_per_second": 147.608,
"eval_steps_per_second": 18.453,
"step": 4000
},
{
"epoch": 0.2886002886002886,
"grad_norm": 27.252222061157227,
"learning_rate": 4.809948381041765e-05,
"loss": 3.4265,
"step": 4100
},
{
"epoch": 0.29563932002956395,
"grad_norm": 20.001508712768555,
"learning_rate": 4.927264195213515e-05,
"loss": 3.2489,
"step": 4200
},
{
"epoch": 0.30267835145883926,
"grad_norm": 24.947546005249023,
"learning_rate": 4.995046407341746e-05,
"loss": 3.2957,
"step": 4300
},
{
"epoch": 0.3097173828881146,
"grad_norm": 18.58955192565918,
"learning_rate": 4.982010637188445e-05,
"loss": 3.2328,
"step": 4400
},
{
"epoch": 0.31675641431738993,
"grad_norm": 22.946285247802734,
"learning_rate": 4.968974867035145e-05,
"loss": 3.177,
"step": 4500
},
{
"epoch": 0.31675641431738993,
"eval_runtime": 192.63,
"eval_samples_per_second": 147.5,
"eval_steps_per_second": 18.439,
"step": 4500
},
{
"epoch": 0.32379544574666524,
"grad_norm": 20.17714500427246,
"learning_rate": 4.955939096881844e-05,
"loss": 3.3241,
"step": 4600
},
{
"epoch": 0.3308344771759406,
"grad_norm": 18.580751419067383,
"learning_rate": 4.9429033267285435e-05,
"loss": 3.3084,
"step": 4700
},
{
"epoch": 0.3378735086052159,
"grad_norm": 16.068750381469727,
"learning_rate": 4.929867556575243e-05,
"loss": 3.1674,
"step": 4800
},
{
"epoch": 0.34491254003449123,
"grad_norm": 23.636520385742188,
"learning_rate": 4.916831786421942e-05,
"loss": 3.2698,
"step": 4900
},
{
"epoch": 0.3519515714637666,
"grad_norm": 19.445907592773438,
"learning_rate": 4.903796016268641e-05,
"loss": 3.2824,
"step": 5000
},
{
"epoch": 0.3519515714637666,
"eval_runtime": 192.779,
"eval_samples_per_second": 147.386,
"eval_steps_per_second": 18.425,
"step": 5000
},
{
"epoch": 0.3589906028930419,
"grad_norm": 20.217737197875977,
"learning_rate": 4.8907602461153405e-05,
"loss": 3.3673,
"step": 5100
},
{
"epoch": 0.3660296343223173,
"grad_norm": 16.03109359741211,
"learning_rate": 4.87772447596204e-05,
"loss": 3.3105,
"step": 5200
},
{
"epoch": 0.3730686657515926,
"grad_norm": 21.388507843017578,
"learning_rate": 4.864688705808739e-05,
"loss": 3.1998,
"step": 5300
},
{
"epoch": 0.3801076971808679,
"grad_norm": 16.931922912597656,
"learning_rate": 4.851652935655439e-05,
"loss": 3.2054,
"step": 5400
},
{
"epoch": 0.38714672861014326,
"grad_norm": 24.145727157592773,
"learning_rate": 4.838617165502138e-05,
"loss": 3.0802,
"step": 5500
},
{
"epoch": 0.38714672861014326,
"eval_runtime": 192.7089,
"eval_samples_per_second": 147.44,
"eval_steps_per_second": 18.432,
"step": 5500
},
{
"epoch": 0.3941857600394186,
"grad_norm": 24.199636459350586,
"learning_rate": 4.8255813953488375e-05,
"loss": 3.0688,
"step": 5600
},
{
"epoch": 0.4012247914686939,
"grad_norm": 18.878751754760742,
"learning_rate": 4.812545625195537e-05,
"loss": 3.1667,
"step": 5700
},
{
"epoch": 0.40826382289796925,
"grad_norm": 17.39369010925293,
"learning_rate": 4.7995098550422363e-05,
"loss": 3.1905,
"step": 5800
},
{
"epoch": 0.41530285432724456,
"grad_norm": 19.778793334960938,
"learning_rate": 4.786474084888936e-05,
"loss": 3.0883,
"step": 5900
},
{
"epoch": 0.42234188575651993,
"grad_norm": 14.80075454711914,
"learning_rate": 4.773438314735635e-05,
"loss": 3.0839,
"step": 6000
},
{
"epoch": 0.42234188575651993,
"eval_runtime": 192.7353,
"eval_samples_per_second": 147.42,
"eval_steps_per_second": 18.429,
"step": 6000
},
{
"epoch": 0.42938091718579524,
"grad_norm": 27.43608856201172,
"learning_rate": 4.760402544582334e-05,
"loss": 3.0621,
"step": 6100
},
{
"epoch": 0.43641994861507055,
"grad_norm": 18.219221115112305,
"learning_rate": 4.747366774429033e-05,
"loss": 3.1461,
"step": 6200
},
{
"epoch": 0.4434589800443459,
"grad_norm": 17.97977638244629,
"learning_rate": 4.734331004275733e-05,
"loss": 3.0795,
"step": 6300
},
{
"epoch": 0.45049801147362123,
"grad_norm": 21.358592987060547,
"learning_rate": 4.721295234122432e-05,
"loss": 3.1361,
"step": 6400
},
{
"epoch": 0.45753704290289654,
"grad_norm": 15.679008483886719,
"learning_rate": 4.7082594639691315e-05,
"loss": 3.0751,
"step": 6500
},
{
"epoch": 0.45753704290289654,
"eval_runtime": 193.5839,
"eval_samples_per_second": 146.774,
"eval_steps_per_second": 18.349,
"step": 6500
},
{
"epoch": 0.4645760743321719,
"grad_norm": 14.288241386413574,
"learning_rate": 4.69522369381583e-05,
"loss": 3.0886,
"step": 6600
},
{
"epoch": 0.4716151057614472,
"grad_norm": 15.016201972961426,
"learning_rate": 4.6821879236625304e-05,
"loss": 2.973,
"step": 6700
},
{
"epoch": 0.4786541371907226,
"grad_norm": 20.513479232788086,
"learning_rate": 4.66915215350923e-05,
"loss": 3.1001,
"step": 6800
},
{
"epoch": 0.4856931686199979,
"grad_norm": 15.093891143798828,
"learning_rate": 4.656116383355929e-05,
"loss": 3.1073,
"step": 6900
},
{
"epoch": 0.4927322000492732,
"grad_norm": 15.074331283569336,
"learning_rate": 4.6430806132026286e-05,
"loss": 3.1247,
"step": 7000
},
{
"epoch": 0.4927322000492732,
"eval_runtime": 193.2211,
"eval_samples_per_second": 147.049,
"eval_steps_per_second": 18.383,
"step": 7000
},
{
"epoch": 0.49977123147854857,
"grad_norm": 19.936674118041992,
"learning_rate": 4.630044843049327e-05,
"loss": 3.0331,
"step": 7100
},
{
"epoch": 0.5068102629078239,
"grad_norm": 22.168909072875977,
"learning_rate": 4.617009072896027e-05,
"loss": 3.0611,
"step": 7200
},
{
"epoch": 0.5138492943370992,
"grad_norm": 19.541671752929688,
"learning_rate": 4.603973302742726e-05,
"loss": 2.9378,
"step": 7300
},
{
"epoch": 0.5208883257663746,
"grad_norm": 16.111820220947266,
"learning_rate": 4.5909375325894256e-05,
"loss": 3.0461,
"step": 7400
},
{
"epoch": 0.5279273571956499,
"grad_norm": 18.88793182373047,
"learning_rate": 4.577901762436125e-05,
"loss": 3.0183,
"step": 7500
},
{
"epoch": 0.5279273571956499,
"eval_runtime": 191.8152,
"eval_samples_per_second": 148.127,
"eval_steps_per_second": 18.518,
"step": 7500
},
{
"epoch": 0.5349663886249252,
"grad_norm": 15.234626770019531,
"learning_rate": 4.5648659922828244e-05,
"loss": 3.0287,
"step": 7600
},
{
"epoch": 0.5420054200542005,
"grad_norm": 16.836956024169922,
"learning_rate": 4.551830222129523e-05,
"loss": 2.9991,
"step": 7700
},
{
"epoch": 0.5490444514834759,
"grad_norm": 20.984453201293945,
"learning_rate": 4.538794451976223e-05,
"loss": 3.0765,
"step": 7800
},
{
"epoch": 0.5560834829127512,
"grad_norm": 18.113004684448242,
"learning_rate": 4.5257586818229226e-05,
"loss": 2.9704,
"step": 7900
},
{
"epoch": 0.5631225143420265,
"grad_norm": 20.654579162597656,
"learning_rate": 4.512722911669622e-05,
"loss": 3.0604,
"step": 8000
},
{
"epoch": 0.5631225143420265,
"eval_runtime": 193.8004,
"eval_samples_per_second": 146.61,
"eval_steps_per_second": 18.328,
"step": 8000
},
{
"epoch": 0.5701615457713018,
"grad_norm": 18.815162658691406,
"learning_rate": 4.499687141516321e-05,
"loss": 2.8726,
"step": 8100
},
{
"epoch": 0.5772005772005772,
"grad_norm": 15.366965293884277,
"learning_rate": 4.48665137136302e-05,
"loss": 3.0509,
"step": 8200
},
{
"epoch": 0.5842396086298526,
"grad_norm": 17.203136444091797,
"learning_rate": 4.4736156012097196e-05,
"loss": 2.9718,
"step": 8300
},
{
"epoch": 0.5912786400591279,
"grad_norm": 15.31092643737793,
"learning_rate": 4.460579831056419e-05,
"loss": 3.0382,
"step": 8400
},
{
"epoch": 0.5983176714884032,
"grad_norm": 12.381194114685059,
"learning_rate": 4.4475440609031184e-05,
"loss": 2.9945,
"step": 8500
},
{
"epoch": 0.5983176714884032,
"eval_runtime": 192.0001,
"eval_samples_per_second": 147.984,
"eval_steps_per_second": 18.5,
"step": 8500
},
{
"epoch": 0.6053567029176785,
"grad_norm": 13.362293243408203,
"learning_rate": 4.434508290749818e-05,
"loss": 3.0427,
"step": 8600
},
{
"epoch": 0.6123957343469538,
"grad_norm": 18.495370864868164,
"learning_rate": 4.4214725205965165e-05,
"loss": 3.0363,
"step": 8700
},
{
"epoch": 0.6194347657762292,
"grad_norm": 16.98731803894043,
"learning_rate": 4.408436750443216e-05,
"loss": 2.9257,
"step": 8800
},
{
"epoch": 0.6264737972055046,
"grad_norm": 18.510801315307617,
"learning_rate": 4.395400980289916e-05,
"loss": 3.0014,
"step": 8900
},
{
"epoch": 0.6335128286347799,
"grad_norm": 15.024514198303223,
"learning_rate": 4.3823652101366155e-05,
"loss": 3.0435,
"step": 9000
},
{
"epoch": 0.6335128286347799,
"eval_runtime": 192.5991,
"eval_samples_per_second": 147.524,
"eval_steps_per_second": 18.442,
"step": 9000
},
{
"epoch": 0.6405518600640552,
"grad_norm": 19.763547897338867,
"learning_rate": 4.369329439983315e-05,
"loss": 2.9219,
"step": 9100
},
{
"epoch": 0.6475908914933305,
"grad_norm": 19.536087036132812,
"learning_rate": 4.3562936698300136e-05,
"loss": 2.9749,
"step": 9200
},
{
"epoch": 0.6546299229226058,
"grad_norm": 15.909920692443848,
"learning_rate": 4.343257899676713e-05,
"loss": 2.8793,
"step": 9300
},
{
"epoch": 0.6616689543518812,
"grad_norm": 17.25068473815918,
"learning_rate": 4.3302221295234124e-05,
"loss": 2.9669,
"step": 9400
},
{
"epoch": 0.6687079857811565,
"grad_norm": 14.32239055633545,
"learning_rate": 4.317186359370112e-05,
"loss": 2.9066,
"step": 9500
},
{
"epoch": 0.6687079857811565,
"eval_runtime": 191.7836,
"eval_samples_per_second": 148.151,
"eval_steps_per_second": 18.521,
"step": 9500
},
{
"epoch": 0.6757470172104318,
"grad_norm": 22.611879348754883,
"learning_rate": 4.304150589216811e-05,
"loss": 2.8333,
"step": 9600
},
{
"epoch": 0.6827860486397072,
"grad_norm": 15.549399375915527,
"learning_rate": 4.2911148190635107e-05,
"loss": 2.9913,
"step": 9700
},
{
"epoch": 0.6898250800689825,
"grad_norm": 13.915739059448242,
"learning_rate": 4.2780790489102094e-05,
"loss": 2.9524,
"step": 9800
},
{
"epoch": 0.6968641114982579,
"grad_norm": 17.621822357177734,
"learning_rate": 4.265043278756909e-05,
"loss": 2.9844,
"step": 9900
},
{
"epoch": 0.7039031429275332,
"grad_norm": 22.748342514038086,
"learning_rate": 4.252007508603609e-05,
"loss": 2.9083,
"step": 10000
},
{
"epoch": 0.7039031429275332,
"eval_runtime": 193.6184,
"eval_samples_per_second": 146.747,
"eval_steps_per_second": 18.345,
"step": 10000
},
{
"epoch": 0.7109421743568085,
"grad_norm": 18.526578903198242,
"learning_rate": 4.238971738450308e-05,
"loss": 2.9706,
"step": 10100
},
{
"epoch": 0.7179812057860838,
"grad_norm": 15.676709175109863,
"learning_rate": 4.225935968297007e-05,
"loss": 2.8265,
"step": 10200
},
{
"epoch": 0.7250202372153591,
"grad_norm": 18.21067237854004,
"learning_rate": 4.2129001981437064e-05,
"loss": 2.8592,
"step": 10300
},
{
"epoch": 0.7320592686446346,
"grad_norm": 22.188024520874023,
"learning_rate": 4.199864427990406e-05,
"loss": 2.8158,
"step": 10400
},
{
"epoch": 0.7390983000739099,
"grad_norm": 18.406801223754883,
"learning_rate": 4.186828657837105e-05,
"loss": 2.9264,
"step": 10500
},
{
"epoch": 0.7390983000739099,
"eval_runtime": 193.9371,
"eval_samples_per_second": 146.506,
"eval_steps_per_second": 18.315,
"step": 10500
},
{
"epoch": 0.7461373315031852,
"grad_norm": 20.65268325805664,
"learning_rate": 4.173792887683805e-05,
"loss": 2.9882,
"step": 10600
},
{
"epoch": 0.7531763629324605,
"grad_norm": 12.223052024841309,
"learning_rate": 4.160757117530504e-05,
"loss": 2.8969,
"step": 10700
},
{
"epoch": 0.7602153943617358,
"grad_norm": 18.218887329101562,
"learning_rate": 4.147721347377203e-05,
"loss": 2.8254,
"step": 10800
},
{
"epoch": 0.7672544257910111,
"grad_norm": 19.009950637817383,
"learning_rate": 4.134685577223902e-05,
"loss": 2.9361,
"step": 10900
},
{
"epoch": 0.7742934572202865,
"grad_norm": 33.881927490234375,
"learning_rate": 4.1216498070706016e-05,
"loss": 2.8528,
"step": 11000
},
{
"epoch": 0.7742934572202865,
"eval_runtime": 194.2555,
"eval_samples_per_second": 146.266,
"eval_steps_per_second": 18.285,
"step": 11000
},
{
"epoch": 0.7813324886495618,
"grad_norm": 19.02928924560547,
"learning_rate": 4.108614036917302e-05,
"loss": 2.9383,
"step": 11100
},
{
"epoch": 0.7883715200788372,
"grad_norm": 18.154483795166016,
"learning_rate": 4.095578266764001e-05,
"loss": 2.7691,
"step": 11200
},
{
"epoch": 0.7954105515081125,
"grad_norm": 13.669476509094238,
"learning_rate": 4.0825424966107e-05,
"loss": 2.8306,
"step": 11300
},
{
"epoch": 0.8024495829373878,
"grad_norm": 16.23528289794922,
"learning_rate": 4.069506726457399e-05,
"loss": 2.8588,
"step": 11400
},
{
"epoch": 0.8094886143666632,
"grad_norm": 16.63111686706543,
"learning_rate": 4.056470956304099e-05,
"loss": 2.91,
"step": 11500
},
{
"epoch": 0.8094886143666632,
"eval_runtime": 193.9814,
"eval_samples_per_second": 146.473,
"eval_steps_per_second": 18.311,
"step": 11500
},
{
"epoch": 0.8165276457959385,
"grad_norm": 19.989736557006836,
"learning_rate": 4.043435186150798e-05,
"loss": 2.8754,
"step": 11600
},
{
"epoch": 0.8235666772252138,
"grad_norm": 15.608553886413574,
"learning_rate": 4.0303994159974975e-05,
"loss": 2.8896,
"step": 11700
},
{
"epoch": 0.8306057086544891,
"grad_norm": 17.236600875854492,
"learning_rate": 4.017363645844196e-05,
"loss": 2.8897,
"step": 11800
},
{
"epoch": 0.8376447400837644,
"grad_norm": 16.03377342224121,
"learning_rate": 4.0043278756908957e-05,
"loss": 2.8936,
"step": 11900
},
{
"epoch": 0.8446837715130399,
"grad_norm": 25.3082332611084,
"learning_rate": 3.991292105537595e-05,
"loss": 2.8939,
"step": 12000
},
{
"epoch": 0.8446837715130399,
"eval_runtime": 192.6986,
"eval_samples_per_second": 147.448,
"eval_steps_per_second": 18.433,
"step": 12000
},
{
"epoch": 0.8517228029423152,
"grad_norm": 18.766387939453125,
"learning_rate": 3.9782563353842945e-05,
"loss": 2.9448,
"step": 12100
},
{
"epoch": 0.8587618343715905,
"grad_norm": 17.019485473632812,
"learning_rate": 3.9652205652309946e-05,
"loss": 2.7899,
"step": 12200
},
{
"epoch": 0.8658008658008658,
"grad_norm": 15.821990966796875,
"learning_rate": 3.952184795077693e-05,
"loss": 2.8069,
"step": 12300
},
{
"epoch": 0.8728398972301411,
"grad_norm": 19.532939910888672,
"learning_rate": 3.939149024924393e-05,
"loss": 2.8797,
"step": 12400
},
{
"epoch": 0.8798789286594164,
"grad_norm": 15.134819030761719,
"learning_rate": 3.926113254771092e-05,
"loss": 2.89,
"step": 12500
},
{
"epoch": 0.8798789286594164,
"eval_runtime": 191.6112,
"eval_samples_per_second": 148.285,
"eval_steps_per_second": 18.538,
"step": 12500
},
{
"epoch": 0.8869179600886918,
"grad_norm": 15.088654518127441,
"learning_rate": 3.9130774846177915e-05,
"loss": 2.8519,
"step": 12600
},
{
"epoch": 0.8939569915179671,
"grad_norm": 16.8511962890625,
"learning_rate": 3.900041714464491e-05,
"loss": 2.7864,
"step": 12700
},
{
"epoch": 0.9009960229472425,
"grad_norm": 19.323467254638672,
"learning_rate": 3.8870059443111903e-05,
"loss": 2.9093,
"step": 12800
},
{
"epoch": 0.9080350543765178,
"grad_norm": 16.42205810546875,
"learning_rate": 3.873970174157889e-05,
"loss": 2.8426,
"step": 12900
},
{
"epoch": 0.9150740858057931,
"grad_norm": 16.470041275024414,
"learning_rate": 3.8609344040045885e-05,
"loss": 2.9097,
"step": 13000
},
{
"epoch": 0.9150740858057931,
"eval_runtime": 192.3033,
"eval_samples_per_second": 147.751,
"eval_steps_per_second": 18.471,
"step": 13000
},
{
"epoch": 0.9221131172350685,
"grad_norm": 15.83154296875,
"learning_rate": 3.847898633851288e-05,
"loss": 2.8286,
"step": 13100
},
{
"epoch": 0.9291521486643438,
"grad_norm": 14.70117473602295,
"learning_rate": 3.834862863697987e-05,
"loss": 2.7575,
"step": 13200
},
{
"epoch": 0.9361911800936191,
"grad_norm": 12.124509811401367,
"learning_rate": 3.821827093544687e-05,
"loss": 2.8621,
"step": 13300
},
{
"epoch": 0.9432302115228944,
"grad_norm": 16.292022705078125,
"learning_rate": 3.808791323391386e-05,
"loss": 2.8619,
"step": 13400
},
{
"epoch": 0.9502692429521697,
"grad_norm": 18.999359130859375,
"learning_rate": 3.7957555532380855e-05,
"loss": 2.8396,
"step": 13500
},
{
"epoch": 0.9502692429521697,
"eval_runtime": 192.4881,
"eval_samples_per_second": 147.609,
"eval_steps_per_second": 18.453,
"step": 13500
},
{
"epoch": 0.9573082743814452,
"grad_norm": 17.78417205810547,
"learning_rate": 3.782719783084785e-05,
"loss": 2.842,
"step": 13600
},
{
"epoch": 0.9643473058107205,
"grad_norm": 16.267335891723633,
"learning_rate": 3.7696840129314844e-05,
"loss": 2.8373,
"step": 13700
},
{
"epoch": 0.9713863372399958,
"grad_norm": 15.711287498474121,
"learning_rate": 3.756648242778184e-05,
"loss": 2.9,
"step": 13800
},
{
"epoch": 0.9784253686692711,
"grad_norm": 26.957563400268555,
"learning_rate": 3.7436124726248825e-05,
"loss": 2.9499,
"step": 13900
},
{
"epoch": 0.9854644000985464,
"grad_norm": 11.875740051269531,
"learning_rate": 3.730576702471582e-05,
"loss": 2.7061,
"step": 14000
},
{
"epoch": 0.9854644000985464,
"eval_runtime": 192.2723,
"eval_samples_per_second": 147.775,
"eval_steps_per_second": 18.474,
"step": 14000
},
{
"epoch": 0.9925034315278217,
"grad_norm": 21.315786361694336,
"learning_rate": 3.717540932318281e-05,
"loss": 2.8303,
"step": 14100
},
{
"epoch": 0.9995424629570971,
"grad_norm": 13.713945388793945,
"learning_rate": 3.704505162164981e-05,
"loss": 2.7378,
"step": 14200
},
{
"epoch": 1.0065814943863725,
"grad_norm": 16.48957633972168,
"learning_rate": 3.69146939201168e-05,
"loss": 2.7313,
"step": 14300
},
{
"epoch": 1.0136205258156479,
"grad_norm": 19.689464569091797,
"learning_rate": 3.6784336218583796e-05,
"loss": 2.7612,
"step": 14400
},
{
"epoch": 1.020659557244923,
"grad_norm": 19.848342895507812,
"learning_rate": 3.665397851705079e-05,
"loss": 2.8205,
"step": 14500
},
{
"epoch": 1.020659557244923,
"eval_runtime": 193.2356,
"eval_samples_per_second": 147.038,
"eval_steps_per_second": 18.382,
"step": 14500
},
{
"epoch": 1.0276985886741985,
"grad_norm": 23.994319915771484,
"learning_rate": 3.6523620815517784e-05,
"loss": 2.7362,
"step": 14600
},
{
"epoch": 1.0347376201034737,
"grad_norm": 16.060455322265625,
"learning_rate": 3.639326311398478e-05,
"loss": 2.8076,
"step": 14700
},
{
"epoch": 1.0417766515327491,
"grad_norm": 17.874704360961914,
"learning_rate": 3.626290541245177e-05,
"loss": 2.6903,
"step": 14800
},
{
"epoch": 1.0488156829620243,
"grad_norm": 15.949551582336426,
"learning_rate": 3.613254771091876e-05,
"loss": 2.7765,
"step": 14900
},
{
"epoch": 1.0558547143912997,
"grad_norm": 17.344772338867188,
"learning_rate": 3.6002190009385754e-05,
"loss": 2.9004,
"step": 15000
},
{
"epoch": 1.0558547143912997,
"eval_runtime": 194.9228,
"eval_samples_per_second": 145.765,
"eval_steps_per_second": 18.223,
"step": 15000
},
{
"epoch": 1.0628937458205752,
"grad_norm": 13.621015548706055,
"learning_rate": 3.587183230785275e-05,
"loss": 2.7808,
"step": 15100
},
{
"epoch": 1.0699327772498504,
"grad_norm": 21.069551467895508,
"learning_rate": 3.574147460631974e-05,
"loss": 2.7421,
"step": 15200
},
{
"epoch": 1.0769718086791258,
"grad_norm": 15.854650497436523,
"learning_rate": 3.5611116904786736e-05,
"loss": 2.7086,
"step": 15300
},
{
"epoch": 1.084010840108401,
"grad_norm": 19.217153549194336,
"learning_rate": 3.548075920325373e-05,
"loss": 2.7957,
"step": 15400
},
{
"epoch": 1.0910498715376764,
"grad_norm": 20.781291961669922,
"learning_rate": 3.5350401501720724e-05,
"loss": 2.8039,
"step": 15500
},
{
"epoch": 1.0910498715376764,
"eval_runtime": 192.3565,
"eval_samples_per_second": 147.71,
"eval_steps_per_second": 18.466,
"step": 15500
},
{
"epoch": 1.0980889029669518,
"grad_norm": 15.170364379882812,
"learning_rate": 3.522004380018772e-05,
"loss": 2.7992,
"step": 15600
},
{
"epoch": 1.105127934396227,
"grad_norm": 18.8775634765625,
"learning_rate": 3.508968609865471e-05,
"loss": 2.7919,
"step": 15700
},
{
"epoch": 1.1121669658255025,
"grad_norm": 19.009754180908203,
"learning_rate": 3.4959328397121706e-05,
"loss": 2.7361,
"step": 15800
},
{
"epoch": 1.1192059972547777,
"grad_norm": 14.632086753845215,
"learning_rate": 3.48289706955887e-05,
"loss": 2.8233,
"step": 15900
},
{
"epoch": 1.126245028684053,
"grad_norm": 20.006601333618164,
"learning_rate": 3.469861299405569e-05,
"loss": 2.738,
"step": 16000
},
{
"epoch": 1.126245028684053,
"eval_runtime": 194.9892,
"eval_samples_per_second": 145.716,
"eval_steps_per_second": 18.216,
"step": 16000
},
{
"epoch": 1.1332840601133285,
"grad_norm": 14.096820831298828,
"learning_rate": 3.456825529252268e-05,
"loss": 2.8094,
"step": 16100
},
{
"epoch": 1.1403230915426037,
"grad_norm": 17.261428833007812,
"learning_rate": 3.4437897590989676e-05,
"loss": 2.8108,
"step": 16200
},
{
"epoch": 1.1473621229718791,
"grad_norm": 14.972962379455566,
"learning_rate": 3.430753988945667e-05,
"loss": 2.8055,
"step": 16300
},
{
"epoch": 1.1544011544011543,
"grad_norm": 14.345026969909668,
"learning_rate": 3.4177182187923664e-05,
"loss": 2.7363,
"step": 16400
},
{
"epoch": 1.1614401858304297,
"grad_norm": 15.4429292678833,
"learning_rate": 3.404682448639065e-05,
"loss": 2.8642,
"step": 16500
},
{
"epoch": 1.1614401858304297,
"eval_runtime": 194.218,
"eval_samples_per_second": 146.294,
"eval_steps_per_second": 18.289,
"step": 16500
},
{
"epoch": 1.1684792172597052,
"grad_norm": 13.637730598449707,
"learning_rate": 3.391646678485765e-05,
"loss": 2.6802,
"step": 16600
},
{
"epoch": 1.1755182486889804,
"grad_norm": 16.29159927368164,
"learning_rate": 3.3786109083324647e-05,
"loss": 2.7796,
"step": 16700
},
{
"epoch": 1.1825572801182558,
"grad_norm": 16.733455657958984,
"learning_rate": 3.365575138179164e-05,
"loss": 2.8216,
"step": 16800
},
{
"epoch": 1.189596311547531,
"grad_norm": 20.382347106933594,
"learning_rate": 3.3525393680258635e-05,
"loss": 2.6737,
"step": 16900
},
{
"epoch": 1.1966353429768064,
"grad_norm": 18.506607055664062,
"learning_rate": 3.339503597872562e-05,
"loss": 2.7144,
"step": 17000
},
{
"epoch": 1.1966353429768064,
"eval_runtime": 192.527,
"eval_samples_per_second": 147.579,
"eval_steps_per_second": 18.449,
"step": 17000
},
{
"epoch": 1.2036743744060816,
"grad_norm": 16.080759048461914,
"learning_rate": 3.3264678277192616e-05,
"loss": 2.8013,
"step": 17100
},
{
"epoch": 1.210713405835357,
"grad_norm": 16.123552322387695,
"learning_rate": 3.313432057565961e-05,
"loss": 2.7444,
"step": 17200
},
{
"epoch": 1.2177524372646324,
"grad_norm": 16.878711700439453,
"learning_rate": 3.3003962874126604e-05,
"loss": 2.8241,
"step": 17300
},
{
"epoch": 1.2247914686939076,
"grad_norm": 18.64569854736328,
"learning_rate": 3.28736051725936e-05,
"loss": 2.684,
"step": 17400
},
{
"epoch": 1.231830500123183,
"grad_norm": 16.125022888183594,
"learning_rate": 3.274324747106059e-05,
"loss": 2.8633,
"step": 17500
},
{
"epoch": 1.231830500123183,
"eval_runtime": 193.4861,
"eval_samples_per_second": 146.848,
"eval_steps_per_second": 18.358,
"step": 17500
},
{
"epoch": 1.2388695315524583,
"grad_norm": 13.886027336120605,
"learning_rate": 3.261288976952758e-05,
"loss": 2.7536,
"step": 17600
},
{
"epoch": 1.2459085629817337,
"grad_norm": 15.769869804382324,
"learning_rate": 3.248253206799458e-05,
"loss": 2.7889,
"step": 17700
},
{
"epoch": 1.2529475944110091,
"grad_norm": 19.419034957885742,
"learning_rate": 3.2352174366461575e-05,
"loss": 2.7672,
"step": 17800
},
{
"epoch": 1.2599866258402843,
"grad_norm": 18.742015838623047,
"learning_rate": 3.222181666492857e-05,
"loss": 2.7427,
"step": 17900
},
{
"epoch": 1.2670256572695597,
"grad_norm": 18.40927505493164,
"learning_rate": 3.209145896339556e-05,
"loss": 2.7596,
"step": 18000
},
{
"epoch": 1.2670256572695597,
"eval_runtime": 194.1066,
"eval_samples_per_second": 146.378,
"eval_steps_per_second": 18.299,
"step": 18000
},
{
"epoch": 1.274064688698835,
"grad_norm": 21.27202033996582,
"learning_rate": 3.196110126186255e-05,
"loss": 2.7888,
"step": 18100
},
{
"epoch": 1.2811037201281104,
"grad_norm": 13.953824043273926,
"learning_rate": 3.1830743560329545e-05,
"loss": 2.7347,
"step": 18200
},
{
"epoch": 1.2881427515573858,
"grad_norm": 16.453821182250977,
"learning_rate": 3.170038585879654e-05,
"loss": 2.8821,
"step": 18300
},
{
"epoch": 1.295181782986661,
"grad_norm": 16.67236328125,
"learning_rate": 3.157002815726353e-05,
"loss": 2.8431,
"step": 18400
},
{
"epoch": 1.3022208144159364,
"grad_norm": 13.558029174804688,
"learning_rate": 3.143967045573053e-05,
"loss": 2.7499,
"step": 18500
},
{
"epoch": 1.3022208144159364,
"eval_runtime": 192.4964,
"eval_samples_per_second": 147.603,
"eval_steps_per_second": 18.452,
"step": 18500
},
{
"epoch": 1.3092598458452116,
"grad_norm": 15.34234619140625,
"learning_rate": 3.1309312754197514e-05,
"loss": 2.8225,
"step": 18600
},
{
"epoch": 1.316298877274487,
"grad_norm": 14.304731369018555,
"learning_rate": 3.117895505266451e-05,
"loss": 2.8369,
"step": 18700
},
{
"epoch": 1.3233379087037624,
"grad_norm": 17.345626831054688,
"learning_rate": 3.104859735113151e-05,
"loss": 2.6865,
"step": 18800
},
{
"epoch": 1.3303769401330376,
"grad_norm": 16.954349517822266,
"learning_rate": 3.09182396495985e-05,
"loss": 2.741,
"step": 18900
},
{
"epoch": 1.337415971562313,
"grad_norm": 17.250642776489258,
"learning_rate": 3.07878819480655e-05,
"loss": 2.8111,
"step": 19000
},
{
"epoch": 1.337415971562313,
"eval_runtime": 192.7825,
"eval_samples_per_second": 147.384,
"eval_steps_per_second": 18.425,
"step": 19000
},
{
"epoch": 1.3444550029915883,
"grad_norm": 17.004776000976562,
"learning_rate": 3.0657524246532485e-05,
"loss": 2.7594,
"step": 19100
},
{
"epoch": 1.3514940344208637,
"grad_norm": 15.450813293457031,
"learning_rate": 3.052716654499948e-05,
"loss": 2.6522,
"step": 19200
},
{
"epoch": 1.358533065850139,
"grad_norm": 15.58588981628418,
"learning_rate": 3.0396808843466473e-05,
"loss": 2.8346,
"step": 19300
},
{
"epoch": 1.3655720972794143,
"grad_norm": 23.079944610595703,
"learning_rate": 3.0266451141933467e-05,
"loss": 2.829,
"step": 19400
},
{
"epoch": 1.3726111287086897,
"grad_norm": 23.278108596801758,
"learning_rate": 3.0136093440400458e-05,
"loss": 2.809,
"step": 19500
},
{
"epoch": 1.3726111287086897,
"eval_runtime": 191.7654,
"eval_samples_per_second": 148.165,
"eval_steps_per_second": 18.523,
"step": 19500
},
{
"epoch": 1.379650160137965,
"grad_norm": 12.144103050231934,
"learning_rate": 3.0005735738867452e-05,
"loss": 2.5999,
"step": 19600
},
{
"epoch": 1.3866891915672404,
"grad_norm": 18.378664016723633,
"learning_rate": 2.9875378037334446e-05,
"loss": 2.8226,
"step": 19700
},
{
"epoch": 1.3937282229965158,
"grad_norm": 15.180033683776855,
"learning_rate": 2.9745020335801437e-05,
"loss": 2.7773,
"step": 19800
},
{
"epoch": 1.400767254425791,
"grad_norm": 16.611019134521484,
"learning_rate": 2.9614662634268438e-05,
"loss": 2.7171,
"step": 19900
},
{
"epoch": 1.4078062858550664,
"grad_norm": 14.491551399230957,
"learning_rate": 2.948430493273543e-05,
"loss": 2.7234,
"step": 20000
},
{
"epoch": 1.4078062858550664,
"eval_runtime": 192.2568,
"eval_samples_per_second": 147.787,
"eval_steps_per_second": 18.475,
"step": 20000
},
{
"epoch": 1.4148453172843416,
"grad_norm": 15.652689933776855,
"learning_rate": 2.9353947231202422e-05,
"loss": 2.6237,
"step": 20100
},
{
"epoch": 1.421884348713617,
"grad_norm": 16.404693603515625,
"learning_rate": 2.9223589529669417e-05,
"loss": 2.7363,
"step": 20200
},
{
"epoch": 1.4289233801428924,
"grad_norm": 13.620403289794922,
"learning_rate": 2.9093231828136407e-05,
"loss": 2.7651,
"step": 20300
},
{
"epoch": 1.4359624115721676,
"grad_norm": 16.975452423095703,
"learning_rate": 2.89628741266034e-05,
"loss": 2.8431,
"step": 20400
},
{
"epoch": 1.443001443001443,
"grad_norm": 16.957857131958008,
"learning_rate": 2.8832516425070395e-05,
"loss": 2.7442,
"step": 20500
},
{
"epoch": 1.443001443001443,
"eval_runtime": 192.9203,
"eval_samples_per_second": 147.278,
"eval_steps_per_second": 18.412,
"step": 20500
},
{
"epoch": 1.4500404744307183,
"grad_norm": 15.085665702819824,
"learning_rate": 2.8702158723537386e-05,
"loss": 2.6764,
"step": 20600
},
{
"epoch": 1.4570795058599937,
"grad_norm": 15.870454788208008,
"learning_rate": 2.857180102200438e-05,
"loss": 2.6491,
"step": 20700
},
{
"epoch": 1.464118537289269,
"grad_norm": 15.54505729675293,
"learning_rate": 2.844144332047137e-05,
"loss": 2.785,
"step": 20800
},
{
"epoch": 1.4711575687185443,
"grad_norm": 16.786861419677734,
"learning_rate": 2.8311085618938365e-05,
"loss": 2.656,
"step": 20900
},
{
"epoch": 1.4781966001478197,
"grad_norm": 17.03700828552246,
"learning_rate": 2.8180727917405363e-05,
"loss": 2.7337,
"step": 21000
},
{
"epoch": 1.4781966001478197,
"eval_runtime": 194.0613,
"eval_samples_per_second": 146.413,
"eval_steps_per_second": 18.303,
"step": 21000
},
{
"epoch": 1.485235631577095,
"grad_norm": 15.768280982971191,
"learning_rate": 2.8050370215872357e-05,
"loss": 2.7419,
"step": 21100
},
{
"epoch": 1.4922746630063703,
"grad_norm": 16.70868682861328,
"learning_rate": 2.792001251433935e-05,
"loss": 2.7001,
"step": 21200
},
{
"epoch": 1.4993136944356458,
"grad_norm": 15.003129005432129,
"learning_rate": 2.778965481280634e-05,
"loss": 2.6372,
"step": 21300
},
{
"epoch": 1.506352725864921,
"grad_norm": 15.321432113647461,
"learning_rate": 2.7659297111273336e-05,
"loss": 2.734,
"step": 21400
},
{
"epoch": 1.5133917572941962,
"grad_norm": 12.912035942077637,
"learning_rate": 2.752893940974033e-05,
"loss": 2.6854,
"step": 21500
},
{
"epoch": 1.5133917572941962,
"eval_runtime": 192.892,
"eval_samples_per_second": 147.3,
"eval_steps_per_second": 18.414,
"step": 21500
},
{
"epoch": 1.5204307887234716,
"grad_norm": 17.954883575439453,
"learning_rate": 2.739858170820732e-05,
"loss": 2.6843,
"step": 21600
},
{
"epoch": 1.527469820152747,
"grad_norm": 20.32744026184082,
"learning_rate": 2.7268224006674315e-05,
"loss": 2.6404,
"step": 21700
},
{
"epoch": 1.5345088515820224,
"grad_norm": 14.839242935180664,
"learning_rate": 2.713786630514131e-05,
"loss": 2.7235,
"step": 21800
},
{
"epoch": 1.5415478830112976,
"grad_norm": 15.594539642333984,
"learning_rate": 2.70075086036083e-05,
"loss": 2.6163,
"step": 21900
},
{
"epoch": 1.5485869144405728,
"grad_norm": 14.877588272094727,
"learning_rate": 2.6877150902075294e-05,
"loss": 2.6331,
"step": 22000
},
{
"epoch": 1.5485869144405728,
"eval_runtime": 191.7338,
"eval_samples_per_second": 148.19,
"eval_steps_per_second": 18.526,
"step": 22000
},
{
"epoch": 1.5556259458698483,
"grad_norm": 14.593866348266602,
"learning_rate": 2.674679320054229e-05,
"loss": 2.7969,
"step": 22100
},
{
"epoch": 1.5626649772991237,
"grad_norm": 22.533540725708008,
"learning_rate": 2.6616435499009285e-05,
"loss": 2.7841,
"step": 22200
},
{
"epoch": 1.569704008728399,
"grad_norm": 16.089982986450195,
"learning_rate": 2.6486077797476276e-05,
"loss": 2.667,
"step": 22300
},
{
"epoch": 1.5767430401576743,
"grad_norm": 17.49601936340332,
"learning_rate": 2.635572009594327e-05,
"loss": 2.7963,
"step": 22400
},
{
"epoch": 1.5837820715869495,
"grad_norm": 17.72164535522461,
"learning_rate": 2.6225362394410264e-05,
"loss": 2.7669,
"step": 22500
},
{
"epoch": 1.5837820715869495,
"eval_runtime": 191.4019,
"eval_samples_per_second": 148.447,
"eval_steps_per_second": 18.558,
"step": 22500
},
{
"epoch": 1.590821103016225,
"grad_norm": 14.238466262817383,
"learning_rate": 2.6095004692877255e-05,
"loss": 2.732,
"step": 22600
},
{
"epoch": 1.5978601344455003,
"grad_norm": 19.9779052734375,
"learning_rate": 2.596464699134425e-05,
"loss": 2.7353,
"step": 22700
},
{
"epoch": 1.6048991658747758,
"grad_norm": 16.89205551147461,
"learning_rate": 2.5834289289811243e-05,
"loss": 2.7311,
"step": 22800
},
{
"epoch": 1.611938197304051,
"grad_norm": 13.072985649108887,
"learning_rate": 2.5703931588278234e-05,
"loss": 2.6313,
"step": 22900
},
{
"epoch": 1.6189772287333262,
"grad_norm": 22.408113479614258,
"learning_rate": 2.5573573886745228e-05,
"loss": 2.605,
"step": 23000
},
{
"epoch": 1.6189772287333262,
"eval_runtime": 192.574,
"eval_samples_per_second": 147.543,
"eval_steps_per_second": 18.445,
"step": 23000
},
{
"epoch": 1.6260162601626016,
"grad_norm": 21.51888084411621,
"learning_rate": 2.5443216185212222e-05,
"loss": 2.5964,
"step": 23100
},
{
"epoch": 1.633055291591877,
"grad_norm": 20.486024856567383,
"learning_rate": 2.531285848367922e-05,
"loss": 2.6883,
"step": 23200
},
{
"epoch": 1.6400943230211524,
"grad_norm": 17.860441207885742,
"learning_rate": 2.5182500782146214e-05,
"loss": 2.7572,
"step": 23300
},
{
"epoch": 1.6471333544504276,
"grad_norm": 19.4054012298584,
"learning_rate": 2.5052143080613204e-05,
"loss": 2.7643,
"step": 23400
},
{
"epoch": 1.6541723858797028,
"grad_norm": 15.56551742553711,
"learning_rate": 2.49217853790802e-05,
"loss": 2.6638,
"step": 23500
},
{
"epoch": 1.6541723858797028,
"eval_runtime": 192.3359,
"eval_samples_per_second": 147.726,
"eval_steps_per_second": 18.468,
"step": 23500
},
{
"epoch": 1.6612114173089783,
"grad_norm": 22.051755905151367,
"learning_rate": 2.4791427677547192e-05,
"loss": 2.6905,
"step": 23600
},
{
"epoch": 1.6682504487382537,
"grad_norm": 19.55982208251953,
"learning_rate": 2.4661069976014183e-05,
"loss": 2.7178,
"step": 23700
},
{
"epoch": 1.675289480167529,
"grad_norm": 14.777819633483887,
"learning_rate": 2.4530712274481177e-05,
"loss": 2.6219,
"step": 23800
},
{
"epoch": 1.6823285115968043,
"grad_norm": 15.4576997756958,
"learning_rate": 2.440035457294817e-05,
"loss": 2.6425,
"step": 23900
},
{
"epoch": 1.6893675430260795,
"grad_norm": 18.520376205444336,
"learning_rate": 2.4269996871415165e-05,
"loss": 2.6541,
"step": 24000
},
{
"epoch": 1.6893675430260795,
"eval_runtime": 191.8782,
"eval_samples_per_second": 148.078,
"eval_steps_per_second": 18.512,
"step": 24000
},
{
"epoch": 1.696406574455355,
"grad_norm": 18.677989959716797,
"learning_rate": 2.413963916988216e-05,
"loss": 2.7502,
"step": 24100
},
{
"epoch": 1.7034456058846303,
"grad_norm": 19.01474380493164,
"learning_rate": 2.400928146834915e-05,
"loss": 2.5849,
"step": 24200
},
{
"epoch": 1.7104846373139058,
"grad_norm": 14.854390144348145,
"learning_rate": 2.3878923766816144e-05,
"loss": 2.6224,
"step": 24300
},
{
"epoch": 1.717523668743181,
"grad_norm": 16.40928077697754,
"learning_rate": 2.374856606528314e-05,
"loss": 2.6996,
"step": 24400
},
{
"epoch": 1.7245627001724562,
"grad_norm": 14.962175369262695,
"learning_rate": 2.3618208363750133e-05,
"loss": 2.6928,
"step": 24500
},
{
"epoch": 1.7245627001724562,
"eval_runtime": 194.5262,
"eval_samples_per_second": 146.063,
"eval_steps_per_second": 18.26,
"step": 24500
},
{
"epoch": 1.7316017316017316,
"grad_norm": 19.39845085144043,
"learning_rate": 2.3487850662217127e-05,
"loss": 2.7458,
"step": 24600
},
{
"epoch": 1.738640763031007,
"grad_norm": 16.46622085571289,
"learning_rate": 2.3357492960684117e-05,
"loss": 2.7465,
"step": 24700
},
{
"epoch": 1.7456797944602824,
"grad_norm": 17.756010055541992,
"learning_rate": 2.322713525915111e-05,
"loss": 2.7617,
"step": 24800
},
{
"epoch": 1.7527188258895576,
"grad_norm": 17.55894660949707,
"learning_rate": 2.3096777557618106e-05,
"loss": 2.6085,
"step": 24900
},
{
"epoch": 1.7597578573188328,
"grad_norm": 16.707901000976562,
"learning_rate": 2.2966419856085096e-05,
"loss": 2.6305,
"step": 25000
},
{
"epoch": 1.7597578573188328,
"eval_runtime": 191.9564,
"eval_samples_per_second": 148.018,
"eval_steps_per_second": 18.504,
"step": 25000
},
{
"epoch": 1.7667968887481083,
"grad_norm": 21.4102840423584,
"learning_rate": 2.2836062154552094e-05,
"loss": 2.7509,
"step": 25100
},
{
"epoch": 1.7738359201773837,
"grad_norm": 21.17198944091797,
"learning_rate": 2.2705704453019085e-05,
"loss": 2.6602,
"step": 25200
},
{
"epoch": 1.7808749516066589,
"grad_norm": 14.665617942810059,
"learning_rate": 2.257534675148608e-05,
"loss": 2.6576,
"step": 25300
},
{
"epoch": 1.7879139830359343,
"grad_norm": 18.63422393798828,
"learning_rate": 2.2444989049953073e-05,
"loss": 2.6682,
"step": 25400
},
{
"epoch": 1.7949530144652095,
"grad_norm": 20.478769302368164,
"learning_rate": 2.2314631348420064e-05,
"loss": 2.5733,
"step": 25500
},
{
"epoch": 1.7949530144652095,
"eval_runtime": 194.5888,
"eval_samples_per_second": 146.016,
"eval_steps_per_second": 18.254,
"step": 25500
},
{
"epoch": 1.801992045894485,
"grad_norm": 13.144091606140137,
"learning_rate": 2.218427364688706e-05,
"loss": 2.5946,
"step": 25600
},
{
"epoch": 1.8090310773237603,
"grad_norm": 22.20168685913086,
"learning_rate": 2.2053915945354052e-05,
"loss": 2.7029,
"step": 25700
},
{
"epoch": 1.8160701087530355,
"grad_norm": 17.39105796813965,
"learning_rate": 2.1923558243821046e-05,
"loss": 2.7007,
"step": 25800
},
{
"epoch": 1.823109140182311,
"grad_norm": 16.70639419555664,
"learning_rate": 2.179320054228804e-05,
"loss": 2.6011,
"step": 25900
},
{
"epoch": 1.8301481716115862,
"grad_norm": 23.447750091552734,
"learning_rate": 2.166284284075503e-05,
"loss": 2.682,
"step": 26000
},
{
"epoch": 1.8301481716115862,
"eval_runtime": 192.912,
"eval_samples_per_second": 147.285,
"eval_steps_per_second": 18.413,
"step": 26000
},
{
"epoch": 1.8371872030408616,
"grad_norm": 20.410226821899414,
"learning_rate": 2.1532485139222025e-05,
"loss": 2.7565,
"step": 26100
},
{
"epoch": 1.844226234470137,
"grad_norm": 20.64243507385254,
"learning_rate": 2.1402127437689022e-05,
"loss": 2.566,
"step": 26200
},
{
"epoch": 1.8512652658994122,
"grad_norm": 15.64989185333252,
"learning_rate": 2.1271769736156013e-05,
"loss": 2.7237,
"step": 26300
},
{
"epoch": 1.8583042973286876,
"grad_norm": 13.451628684997559,
"learning_rate": 2.1141412034623007e-05,
"loss": 2.6218,
"step": 26400
},
{
"epoch": 1.8653433287579628,
"grad_norm": 16.763063430786133,
"learning_rate": 2.1011054333089998e-05,
"loss": 2.5891,
"step": 26500
},
{
"epoch": 1.8653433287579628,
"eval_runtime": 191.7231,
"eval_samples_per_second": 148.198,
"eval_steps_per_second": 18.527,
"step": 26500
},
{
"epoch": 1.8723823601872382,
"grad_norm": 16.719079971313477,
"learning_rate": 2.0880696631556992e-05,
"loss": 2.6568,
"step": 26600
},
{
"epoch": 1.8794213916165137,
"grad_norm": 20.345216751098633,
"learning_rate": 2.075033893002399e-05,
"loss": 2.5889,
"step": 26700
},
{
"epoch": 1.8864604230457889,
"grad_norm": 13.290498733520508,
"learning_rate": 2.061998122849098e-05,
"loss": 2.6915,
"step": 26800
},
{
"epoch": 1.893499454475064,
"grad_norm": 26.90572738647461,
"learning_rate": 2.0489623526957974e-05,
"loss": 2.6635,
"step": 26900
},
{
"epoch": 1.9005384859043395,
"grad_norm": 12.706587791442871,
"learning_rate": 2.035926582542497e-05,
"loss": 2.6886,
"step": 27000
},
{
"epoch": 1.9005384859043395,
"eval_runtime": 193.671,
"eval_samples_per_second": 146.708,
"eval_steps_per_second": 18.34,
"step": 27000
},
{
"epoch": 1.907577517333615,
"grad_norm": 19.337390899658203,
"learning_rate": 2.022890812389196e-05,
"loss": 2.5446,
"step": 27100
},
{
"epoch": 1.9146165487628903,
"grad_norm": 16.442127227783203,
"learning_rate": 2.0098550422358953e-05,
"loss": 2.6562,
"step": 27200
},
{
"epoch": 1.9216555801921655,
"grad_norm": 17.196496963500977,
"learning_rate": 1.9968192720825947e-05,
"loss": 2.5869,
"step": 27300
},
{
"epoch": 1.9286946116214407,
"grad_norm": 15.884928703308105,
"learning_rate": 1.983783501929294e-05,
"loss": 2.6127,
"step": 27400
},
{
"epoch": 1.9357336430507162,
"grad_norm": 15.426615715026855,
"learning_rate": 1.9707477317759935e-05,
"loss": 2.6043,
"step": 27500
},
{
"epoch": 1.9357336430507162,
"eval_runtime": 193.4431,
"eval_samples_per_second": 146.88,
"eval_steps_per_second": 18.362,
"step": 27500
},
{
"epoch": 1.9427726744799916,
"grad_norm": 20.6138858795166,
"learning_rate": 1.9577119616226926e-05,
"loss": 2.6387,
"step": 27600
},
{
"epoch": 1.949811705909267,
"grad_norm": 14.545782089233398,
"learning_rate": 1.944676191469392e-05,
"loss": 2.7687,
"step": 27700
},
{
"epoch": 1.9568507373385422,
"grad_norm": 15.325973510742188,
"learning_rate": 1.9316404213160914e-05,
"loss": 2.6876,
"step": 27800
},
{
"epoch": 1.9638897687678174,
"grad_norm": 16.72733497619629,
"learning_rate": 1.918604651162791e-05,
"loss": 2.6131,
"step": 27900
},
{
"epoch": 1.9709288001970928,
"grad_norm": 22.076963424682617,
"learning_rate": 1.9055688810094903e-05,
"loss": 2.7044,
"step": 28000
},
{
"epoch": 1.9709288001970928,
"eval_runtime": 192.6043,
"eval_samples_per_second": 147.52,
"eval_steps_per_second": 18.442,
"step": 28000
},
{
"epoch": 1.9779678316263682,
"grad_norm": 17.05091094970703,
"learning_rate": 1.8925331108561893e-05,
"loss": 2.6023,
"step": 28100
},
{
"epoch": 1.9850068630556437,
"grad_norm": 17.847782135009766,
"learning_rate": 1.8794973407028887e-05,
"loss": 2.5752,
"step": 28200
},
{
"epoch": 1.9920458944849189,
"grad_norm": 18.966585159301758,
"learning_rate": 1.866461570549588e-05,
"loss": 2.6339,
"step": 28300
},
{
"epoch": 1.999084925914194,
"grad_norm": 18.27726173400879,
"learning_rate": 1.8534258003962876e-05,
"loss": 2.6527,
"step": 28400
},
{
"epoch": 2.0061239573434695,
"grad_norm": 16.40408706665039,
"learning_rate": 1.840390030242987e-05,
"loss": 2.6285,
"step": 28500
},
{
"epoch": 2.0061239573434695,
"eval_runtime": 192.5321,
"eval_samples_per_second": 147.575,
"eval_steps_per_second": 18.449,
"step": 28500
},
{
"epoch": 2.013162988772745,
"grad_norm": 13.299867630004883,
"learning_rate": 1.827354260089686e-05,
"loss": 2.6401,
"step": 28600
},
{
"epoch": 2.0202020202020203,
"grad_norm": 16.995622634887695,
"learning_rate": 1.8143184899363855e-05,
"loss": 2.5986,
"step": 28700
},
{
"epoch": 2.0272410516312958,
"grad_norm": 18.69041633605957,
"learning_rate": 1.801282719783085e-05,
"loss": 2.644,
"step": 28800
},
{
"epoch": 2.0342800830605707,
"grad_norm": 20.12238883972168,
"learning_rate": 1.7882469496297843e-05,
"loss": 2.6802,
"step": 28900
},
{
"epoch": 2.041319114489846,
"grad_norm": 14.631281852722168,
"learning_rate": 1.7752111794764837e-05,
"loss": 2.6362,
"step": 29000
},
{
"epoch": 2.041319114489846,
"eval_runtime": 193.4189,
"eval_samples_per_second": 146.899,
"eval_steps_per_second": 18.364,
"step": 29000
},
{
"epoch": 2.0483581459191216,
"grad_norm": 16.323118209838867,
"learning_rate": 1.7621754093231828e-05,
"loss": 2.6249,
"step": 29100
},
{
"epoch": 2.055397177348397,
"grad_norm": 14.623433113098145,
"learning_rate": 1.7491396391698822e-05,
"loss": 2.6324,
"step": 29200
},
{
"epoch": 2.062436208777672,
"grad_norm": 19.917098999023438,
"learning_rate": 1.7361038690165816e-05,
"loss": 2.6893,
"step": 29300
},
{
"epoch": 2.0694752402069474,
"grad_norm": 14.357760429382324,
"learning_rate": 1.7230680988632807e-05,
"loss": 2.5841,
"step": 29400
},
{
"epoch": 2.076514271636223,
"grad_norm": 15.798065185546875,
"learning_rate": 1.7100323287099804e-05,
"loss": 2.6374,
"step": 29500
},
{
"epoch": 2.076514271636223,
"eval_runtime": 191.4525,
"eval_samples_per_second": 148.408,
"eval_steps_per_second": 18.553,
"step": 29500
},
{
"epoch": 2.0835533030654982,
"grad_norm": 19.128459930419922,
"learning_rate": 1.6969965585566798e-05,
"loss": 2.6451,
"step": 29600
},
{
"epoch": 2.0905923344947737,
"grad_norm": 22.39739990234375,
"learning_rate": 1.683960788403379e-05,
"loss": 2.6732,
"step": 29700
},
{
"epoch": 2.0976313659240486,
"grad_norm": 21.8306827545166,
"learning_rate": 1.6709250182500783e-05,
"loss": 2.6381,
"step": 29800
},
{
"epoch": 2.104670397353324,
"grad_norm": 16.79404640197754,
"learning_rate": 1.6578892480967774e-05,
"loss": 2.6643,
"step": 29900
},
{
"epoch": 2.1117094287825995,
"grad_norm": 20.273427963256836,
"learning_rate": 1.644853477943477e-05,
"loss": 2.6409,
"step": 30000
},
{
"epoch": 2.1117094287825995,
"eval_runtime": 192.3103,
"eval_samples_per_second": 147.746,
"eval_steps_per_second": 18.47,
"step": 30000
},
{
"epoch": 2.118748460211875,
"grad_norm": 16.260501861572266,
"learning_rate": 1.6318177077901765e-05,
"loss": 2.6085,
"step": 30100
},
{
"epoch": 2.1257874916411503,
"grad_norm": 17.500699996948242,
"learning_rate": 1.6187819376368756e-05,
"loss": 2.5923,
"step": 30200
},
{
"epoch": 2.1328265230704253,
"grad_norm": 19.523569107055664,
"learning_rate": 1.605746167483575e-05,
"loss": 2.562,
"step": 30300
},
{
"epoch": 2.1398655544997007,
"grad_norm": 16.805545806884766,
"learning_rate": 1.5927103973302744e-05,
"loss": 2.632,
"step": 30400
},
{
"epoch": 2.146904585928976,
"grad_norm": 14.419663429260254,
"learning_rate": 1.5796746271769735e-05,
"loss": 2.6956,
"step": 30500
},
{
"epoch": 2.146904585928976,
"eval_runtime": 192.4179,
"eval_samples_per_second": 147.663,
"eval_steps_per_second": 18.46,
"step": 30500
},
{
"epoch": 2.1539436173582516,
"grad_norm": 14.469121932983398,
"learning_rate": 1.5666388570236732e-05,
"loss": 2.6734,
"step": 30600
},
{
"epoch": 2.160982648787527,
"grad_norm": 14.521267890930176,
"learning_rate": 1.5536030868703723e-05,
"loss": 2.6272,
"step": 30700
},
{
"epoch": 2.168021680216802,
"grad_norm": 15.565622329711914,
"learning_rate": 1.5405673167170717e-05,
"loss": 2.5995,
"step": 30800
},
{
"epoch": 2.1750607116460774,
"grad_norm": 18.500350952148438,
"learning_rate": 1.527531546563771e-05,
"loss": 2.7019,
"step": 30900
},
{
"epoch": 2.182099743075353,
"grad_norm": 18.180660247802734,
"learning_rate": 1.5144957764104704e-05,
"loss": 2.6347,
"step": 31000
},
{
"epoch": 2.182099743075353,
"eval_runtime": 194.3931,
"eval_samples_per_second": 146.163,
"eval_steps_per_second": 18.272,
"step": 31000
},
{
"epoch": 2.1891387745046282,
"grad_norm": 15.68535041809082,
"learning_rate": 1.5014600062571698e-05,
"loss": 2.6679,
"step": 31100
},
{
"epoch": 2.1961778059339037,
"grad_norm": 18.195068359375,
"learning_rate": 1.4884242361038692e-05,
"loss": 2.6152,
"step": 31200
},
{
"epoch": 2.2032168373631786,
"grad_norm": 19.41796875,
"learning_rate": 1.4753884659505684e-05,
"loss": 2.6453,
"step": 31300
},
{
"epoch": 2.210255868792454,
"grad_norm": 16.178791046142578,
"learning_rate": 1.4623526957972677e-05,
"loss": 2.6175,
"step": 31400
},
{
"epoch": 2.2172949002217295,
"grad_norm": 17.970273971557617,
"learning_rate": 1.4493169256439671e-05,
"loss": 2.5633,
"step": 31500
},
{
"epoch": 2.2172949002217295,
"eval_runtime": 194.9638,
"eval_samples_per_second": 145.735,
"eval_steps_per_second": 18.219,
"step": 31500
},
{
"epoch": 2.224333931651005,
"grad_norm": 13.679678916931152,
"learning_rate": 1.4362811554906663e-05,
"loss": 2.5433,
"step": 31600
},
{
"epoch": 2.2313729630802803,
"grad_norm": 18.061559677124023,
"learning_rate": 1.423245385337366e-05,
"loss": 2.6087,
"step": 31700
},
{
"epoch": 2.2384119945095553,
"grad_norm": 20.32142448425293,
"learning_rate": 1.4102096151840652e-05,
"loss": 2.6079,
"step": 31800
},
{
"epoch": 2.2454510259388307,
"grad_norm": 16.483491897583008,
"learning_rate": 1.3971738450307646e-05,
"loss": 2.6693,
"step": 31900
},
{
"epoch": 2.252490057368106,
"grad_norm": 16.30838394165039,
"learning_rate": 1.3841380748774638e-05,
"loss": 2.5292,
"step": 32000
},
{
"epoch": 2.252490057368106,
"eval_runtime": 192.332,
"eval_samples_per_second": 147.729,
"eval_steps_per_second": 18.468,
"step": 32000
},
{
"epoch": 2.2595290887973816,
"grad_norm": 18.353946685791016,
"learning_rate": 1.371102304724163e-05,
"loss": 2.7162,
"step": 32100
},
{
"epoch": 2.266568120226657,
"grad_norm": 14.376470565795898,
"learning_rate": 1.3580665345708626e-05,
"loss": 2.6581,
"step": 32200
},
{
"epoch": 2.273607151655932,
"grad_norm": 16.625110626220703,
"learning_rate": 1.3450307644175619e-05,
"loss": 2.5675,
"step": 32300
},
{
"epoch": 2.2806461830852074,
"grad_norm": 17.9268798828125,
"learning_rate": 1.3319949942642613e-05,
"loss": 2.6001,
"step": 32400
},
{
"epoch": 2.287685214514483,
"grad_norm": 23.196901321411133,
"learning_rate": 1.3189592241109605e-05,
"loss": 2.6545,
"step": 32500
},
{
"epoch": 2.287685214514483,
"eval_runtime": 192.8693,
"eval_samples_per_second": 147.317,
"eval_steps_per_second": 18.417,
"step": 32500
},
{
"epoch": 2.2947242459437582,
"grad_norm": 18.328662872314453,
"learning_rate": 1.3059234539576598e-05,
"loss": 2.5476,
"step": 32600
},
{
"epoch": 2.3017632773730337,
"grad_norm": 16.62209129333496,
"learning_rate": 1.2928876838043592e-05,
"loss": 2.6778,
"step": 32700
},
{
"epoch": 2.3088023088023086,
"grad_norm": 15.676456451416016,
"learning_rate": 1.2798519136510586e-05,
"loss": 2.6188,
"step": 32800
},
{
"epoch": 2.315841340231584,
"grad_norm": 21.3188533782959,
"learning_rate": 1.266816143497758e-05,
"loss": 2.5383,
"step": 32900
},
{
"epoch": 2.3228803716608595,
"grad_norm": 15.253218650817871,
"learning_rate": 1.2537803733444572e-05,
"loss": 2.6703,
"step": 33000
},
{
"epoch": 2.3228803716608595,
"eval_runtime": 192.6821,
"eval_samples_per_second": 147.46,
"eval_steps_per_second": 18.435,
"step": 33000
},
{
"epoch": 2.329919403090135,
"grad_norm": 17.341787338256836,
"learning_rate": 1.2407446031911565e-05,
"loss": 2.6903,
"step": 33100
},
{
"epoch": 2.3369584345194103,
"grad_norm": 14.856354713439941,
"learning_rate": 1.2277088330378559e-05,
"loss": 2.5655,
"step": 33200
},
{
"epoch": 2.3439974659486853,
"grad_norm": 17.669092178344727,
"learning_rate": 1.2146730628845553e-05,
"loss": 2.6723,
"step": 33300
},
{
"epoch": 2.3510364973779607,
"grad_norm": 18.183189392089844,
"learning_rate": 1.2016372927312545e-05,
"loss": 2.6732,
"step": 33400
},
{
"epoch": 2.358075528807236,
"grad_norm": 20.30499267578125,
"learning_rate": 1.188601522577954e-05,
"loss": 2.6527,
"step": 33500
},
{
"epoch": 2.358075528807236,
"eval_runtime": 192.939,
"eval_samples_per_second": 147.264,
"eval_steps_per_second": 18.41,
"step": 33500
},
{
"epoch": 2.3651145602365116,
"grad_norm": 22.915029525756836,
"learning_rate": 1.1755657524246532e-05,
"loss": 2.4797,
"step": 33600
},
{
"epoch": 2.3721535916657865,
"grad_norm": 16.179378509521484,
"learning_rate": 1.1625299822713528e-05,
"loss": 2.5854,
"step": 33700
},
{
"epoch": 2.379192623095062,
"grad_norm": 14.764082908630371,
"learning_rate": 1.149494212118052e-05,
"loss": 2.4972,
"step": 33800
},
{
"epoch": 2.3862316545243374,
"grad_norm": 21.402334213256836,
"learning_rate": 1.1364584419647513e-05,
"loss": 2.5752,
"step": 33900
},
{
"epoch": 2.393270685953613,
"grad_norm": 19.00446891784668,
"learning_rate": 1.1234226718114507e-05,
"loss": 2.4806,
"step": 34000
},
{
"epoch": 2.393270685953613,
"eval_runtime": 192.7281,
"eval_samples_per_second": 147.425,
"eval_steps_per_second": 18.43,
"step": 34000
},
{
"epoch": 2.4003097173828882,
"grad_norm": 21.23725700378418,
"learning_rate": 1.11038690165815e-05,
"loss": 2.5424,
"step": 34100
},
{
"epoch": 2.407348748812163,
"grad_norm": 14.942157745361328,
"learning_rate": 1.0973511315048493e-05,
"loss": 2.5926,
"step": 34200
},
{
"epoch": 2.4143877802414386,
"grad_norm": 17.429502487182617,
"learning_rate": 1.0843153613515487e-05,
"loss": 2.5892,
"step": 34300
},
{
"epoch": 2.421426811670714,
"grad_norm": 15.42565631866455,
"learning_rate": 1.071279591198248e-05,
"loss": 2.5758,
"step": 34400
},
{
"epoch": 2.4284658430999895,
"grad_norm": 20.9206600189209,
"learning_rate": 1.0582438210449474e-05,
"loss": 2.6666,
"step": 34500
},
{
"epoch": 2.4284658430999895,
"eval_runtime": 192.2868,
"eval_samples_per_second": 147.764,
"eval_steps_per_second": 18.472,
"step": 34500
},
{
"epoch": 2.435504874529265,
"grad_norm": 16.189416885375977,
"learning_rate": 1.0452080508916468e-05,
"loss": 2.5727,
"step": 34600
},
{
"epoch": 2.44254390595854,
"grad_norm": 17.95191192626953,
"learning_rate": 1.032172280738346e-05,
"loss": 2.6171,
"step": 34700
},
{
"epoch": 2.4495829373878153,
"grad_norm": 15.953314781188965,
"learning_rate": 1.0191365105850454e-05,
"loss": 2.5181,
"step": 34800
},
{
"epoch": 2.4566219688170907,
"grad_norm": 20.293758392333984,
"learning_rate": 1.0061007404317447e-05,
"loss": 2.6109,
"step": 34900
},
{
"epoch": 2.463661000246366,
"grad_norm": 13.837769508361816,
"learning_rate": 9.930649702784441e-06,
"loss": 2.6833,
"step": 35000
},
{
"epoch": 2.463661000246366,
"eval_runtime": 192.3429,
"eval_samples_per_second": 147.721,
"eval_steps_per_second": 18.467,
"step": 35000
},
{
"epoch": 2.4707000316756416,
"grad_norm": 15.594371795654297,
"learning_rate": 9.800292001251435e-06,
"loss": 2.6111,
"step": 35100
},
{
"epoch": 2.4777390631049165,
"grad_norm": 18.549043655395508,
"learning_rate": 9.669934299718427e-06,
"loss": 2.5622,
"step": 35200
},
{
"epoch": 2.484778094534192,
"grad_norm": 15.56165599822998,
"learning_rate": 9.53957659818542e-06,
"loss": 2.5254,
"step": 35300
},
{
"epoch": 2.4918171259634674,
"grad_norm": 14.361612319946289,
"learning_rate": 9.409218896652416e-06,
"loss": 2.5388,
"step": 35400
},
{
"epoch": 2.498856157392743,
"grad_norm": 17.944364547729492,
"learning_rate": 9.278861195119408e-06,
"loss": 2.5671,
"step": 35500
},
{
"epoch": 2.498856157392743,
"eval_runtime": 193.2612,
"eval_samples_per_second": 147.019,
"eval_steps_per_second": 18.379,
"step": 35500
},
{
"epoch": 2.5058951888220182,
"grad_norm": 15.994379043579102,
"learning_rate": 9.1485034935864e-06,
"loss": 2.4926,
"step": 35600
},
{
"epoch": 2.512934220251293,
"grad_norm": 15.721161842346191,
"learning_rate": 9.018145792053395e-06,
"loss": 2.5956,
"step": 35700
},
{
"epoch": 2.5199732516805686,
"grad_norm": 21.510955810546875,
"learning_rate": 8.887788090520389e-06,
"loss": 2.6592,
"step": 35800
},
{
"epoch": 2.527012283109844,
"grad_norm": 16.77272605895996,
"learning_rate": 8.757430388987383e-06,
"loss": 2.655,
"step": 35900
},
{
"epoch": 2.5340513145391195,
"grad_norm": 18.944421768188477,
"learning_rate": 8.627072687454375e-06,
"loss": 2.5201,
"step": 36000
},
{
"epoch": 2.5340513145391195,
"eval_runtime": 192.2731,
"eval_samples_per_second": 147.774,
"eval_steps_per_second": 18.474,
"step": 36000
},
{
"epoch": 2.541090345968395,
"grad_norm": 19.00555992126465,
"learning_rate": 8.496714985921368e-06,
"loss": 2.5858,
"step": 36100
},
{
"epoch": 2.54812937739767,
"grad_norm": 16.338956832885742,
"learning_rate": 8.366357284388362e-06,
"loss": 2.5963,
"step": 36200
},
{
"epoch": 2.5551684088269453,
"grad_norm": 15.704483032226562,
"learning_rate": 8.235999582855356e-06,
"loss": 2.5504,
"step": 36300
},
{
"epoch": 2.5622074402562207,
"grad_norm": 17.013628005981445,
"learning_rate": 8.105641881322348e-06,
"loss": 2.6663,
"step": 36400
},
{
"epoch": 2.569246471685496,
"grad_norm": 16.901050567626953,
"learning_rate": 7.975284179789342e-06,
"loss": 2.5827,
"step": 36500
},
{
"epoch": 2.569246471685496,
"eval_runtime": 192.5506,
"eval_samples_per_second": 147.561,
"eval_steps_per_second": 18.447,
"step": 36500
},
{
"epoch": 2.5762855031147716,
"grad_norm": 16.243534088134766,
"learning_rate": 7.844926478256335e-06,
"loss": 2.6065,
"step": 36600
},
{
"epoch": 2.5833245345440465,
"grad_norm": 17.0561580657959,
"learning_rate": 7.714568776723329e-06,
"loss": 2.5166,
"step": 36700
},
{
"epoch": 2.590363565973322,
"grad_norm": 14.800107955932617,
"learning_rate": 7.584211075190323e-06,
"loss": 2.6966,
"step": 36800
},
{
"epoch": 2.5974025974025974,
"grad_norm": 17.22756576538086,
"learning_rate": 7.453853373657315e-06,
"loss": 2.5921,
"step": 36900
},
{
"epoch": 2.604441628831873,
"grad_norm": 16.94314956665039,
"learning_rate": 7.32349567212431e-06,
"loss": 2.7039,
"step": 37000
},
{
"epoch": 2.604441628831873,
"eval_runtime": 192.7372,
"eval_samples_per_second": 147.418,
"eval_steps_per_second": 18.429,
"step": 37000
},
{
"epoch": 2.6114806602611482,
"grad_norm": 15.262337684631348,
"learning_rate": 7.193137970591303e-06,
"loss": 2.5053,
"step": 37100
},
{
"epoch": 2.618519691690423,
"grad_norm": 16.485326766967773,
"learning_rate": 7.062780269058296e-06,
"loss": 2.6282,
"step": 37200
},
{
"epoch": 2.6255587231196986,
"grad_norm": 23.574670791625977,
"learning_rate": 6.93242256752529e-06,
"loss": 2.5995,
"step": 37300
},
{
"epoch": 2.632597754548974,
"grad_norm": 16.39130973815918,
"learning_rate": 6.802064865992283e-06,
"loss": 2.5522,
"step": 37400
},
{
"epoch": 2.6396367859782495,
"grad_norm": 20.67544174194336,
"learning_rate": 6.671707164459276e-06,
"loss": 2.6411,
"step": 37500
},
{
"epoch": 2.6396367859782495,
"eval_runtime": 192.0322,
"eval_samples_per_second": 147.96,
"eval_steps_per_second": 18.497,
"step": 37500
},
{
"epoch": 2.646675817407525,
"grad_norm": 17.333271026611328,
"learning_rate": 6.54134946292627e-06,
"loss": 2.6002,
"step": 37600
},
{
"epoch": 2.6537148488368,
"grad_norm": 17.444929122924805,
"learning_rate": 6.410991761393263e-06,
"loss": 2.4802,
"step": 37700
},
{
"epoch": 2.6607538802660753,
"grad_norm": 17.62455940246582,
"learning_rate": 6.2806340598602564e-06,
"loss": 2.5169,
"step": 37800
},
{
"epoch": 2.6677929116953507,
"grad_norm": 23.869504928588867,
"learning_rate": 6.1502763583272506e-06,
"loss": 2.6119,
"step": 37900
},
{
"epoch": 2.674831943124626,
"grad_norm": 14.378959655761719,
"learning_rate": 6.019918656794243e-06,
"loss": 2.6484,
"step": 38000
},
{
"epoch": 2.674831943124626,
"eval_runtime": 191.7147,
"eval_samples_per_second": 148.205,
"eval_steps_per_second": 18.528,
"step": 38000
},
{
"epoch": 2.6818709745539016,
"grad_norm": 13.199753761291504,
"learning_rate": 5.889560955261237e-06,
"loss": 2.5929,
"step": 38100
},
{
"epoch": 2.6889100059831765,
"grad_norm": 19.931673049926758,
"learning_rate": 5.75920325372823e-06,
"loss": 2.5691,
"step": 38200
},
{
"epoch": 2.695949037412452,
"grad_norm": 16.0571346282959,
"learning_rate": 5.6288455521952244e-06,
"loss": 2.5593,
"step": 38300
},
{
"epoch": 2.7029880688417274,
"grad_norm": 14.09821605682373,
"learning_rate": 5.498487850662217e-06,
"loss": 2.5663,
"step": 38400
},
{
"epoch": 2.710027100271003,
"grad_norm": 16.2088680267334,
"learning_rate": 5.368130149129211e-06,
"loss": 2.5763,
"step": 38500
},
{
"epoch": 2.710027100271003,
"eval_runtime": 192.1652,
"eval_samples_per_second": 147.857,
"eval_steps_per_second": 18.484,
"step": 38500
},
{
"epoch": 2.717066131700278,
"grad_norm": 26.869508743286133,
"learning_rate": 5.237772447596204e-06,
"loss": 2.7026,
"step": 38600
},
{
"epoch": 2.724105163129553,
"grad_norm": 17.842239379882812,
"learning_rate": 5.107414746063198e-06,
"loss": 2.69,
"step": 38700
},
{
"epoch": 2.7311441945588286,
"grad_norm": 17.31543731689453,
"learning_rate": 4.977057044530191e-06,
"loss": 2.6621,
"step": 38800
},
{
"epoch": 2.738183225988104,
"grad_norm": 15.826437950134277,
"learning_rate": 4.846699342997185e-06,
"loss": 2.7036,
"step": 38900
},
{
"epoch": 2.7452222574173795,
"grad_norm": 16.656599044799805,
"learning_rate": 4.716341641464178e-06,
"loss": 2.5078,
"step": 39000
},
{
"epoch": 2.7452222574173795,
"eval_runtime": 191.4626,
"eval_samples_per_second": 148.4,
"eval_steps_per_second": 18.552,
"step": 39000
},
{
"epoch": 2.752261288846655,
"grad_norm": 16.501192092895508,
"learning_rate": 4.585983939931171e-06,
"loss": 2.5312,
"step": 39100
},
{
"epoch": 2.75930032027593,
"grad_norm": 17.555389404296875,
"learning_rate": 4.455626238398165e-06,
"loss": 2.5059,
"step": 39200
},
{
"epoch": 2.7663393517052053,
"grad_norm": 18.289548873901367,
"learning_rate": 4.325268536865158e-06,
"loss": 2.6702,
"step": 39300
},
{
"epoch": 2.7733783831344807,
"grad_norm": 15.688879013061523,
"learning_rate": 4.194910835332152e-06,
"loss": 2.5357,
"step": 39400
},
{
"epoch": 2.780417414563756,
"grad_norm": 14.281635284423828,
"learning_rate": 4.064553133799144e-06,
"loss": 2.6129,
"step": 39500
},
{
"epoch": 2.780417414563756,
"eval_runtime": 193.5326,
"eval_samples_per_second": 146.812,
"eval_steps_per_second": 18.353,
"step": 39500
},
{
"epoch": 2.7874564459930316,
"grad_norm": 22.23700523376465,
"learning_rate": 3.9341954322661385e-06,
"loss": 2.5335,
"step": 39600
},
{
"epoch": 2.7944954774223065,
"grad_norm": 17.91628074645996,
"learning_rate": 3.803837730733132e-06,
"loss": 2.5757,
"step": 39700
},
{
"epoch": 2.801534508851582,
"grad_norm": 16.670568466186523,
"learning_rate": 3.6734800292001254e-06,
"loss": 2.5679,
"step": 39800
},
{
"epoch": 2.8085735402808574,
"grad_norm": 17.128202438354492,
"learning_rate": 3.5431223276671187e-06,
"loss": 2.5285,
"step": 39900
},
{
"epoch": 2.815612571710133,
"grad_norm": 14.024889945983887,
"learning_rate": 3.412764626134112e-06,
"loss": 2.515,
"step": 40000
},
{
"epoch": 2.815612571710133,
"eval_runtime": 192.6175,
"eval_samples_per_second": 147.51,
"eval_steps_per_second": 18.441,
"step": 40000
},
{
"epoch": 2.822651603139408,
"grad_norm": 16.694087982177734,
"learning_rate": 3.2824069246011056e-06,
"loss": 2.625,
"step": 40100
},
{
"epoch": 2.829690634568683,
"grad_norm": 22.633140563964844,
"learning_rate": 3.1520492230680985e-06,
"loss": 2.5637,
"step": 40200
},
{
"epoch": 2.8367296659979586,
"grad_norm": 18.231454849243164,
"learning_rate": 3.021691521535092e-06,
"loss": 2.5672,
"step": 40300
},
{
"epoch": 2.843768697427234,
"grad_norm": 15.228378295898438,
"learning_rate": 2.891333820002086e-06,
"loss": 2.6203,
"step": 40400
},
{
"epoch": 2.8508077288565095,
"grad_norm": 19.437833786010742,
"learning_rate": 2.760976118469079e-06,
"loss": 2.626,
"step": 40500
},
{
"epoch": 2.8508077288565095,
"eval_runtime": 192.5589,
"eval_samples_per_second": 147.555,
"eval_steps_per_second": 18.446,
"step": 40500
},
{
"epoch": 2.857846760285785,
"grad_norm": 16.506317138671875,
"learning_rate": 2.630618416936073e-06,
"loss": 2.5625,
"step": 40600
},
{
"epoch": 2.86488579171506,
"grad_norm": 25.00144386291504,
"learning_rate": 2.500260715403066e-06,
"loss": 2.616,
"step": 40700
},
{
"epoch": 2.8719248231443353,
"grad_norm": 16.7978572845459,
"learning_rate": 2.3699030138700597e-06,
"loss": 2.5959,
"step": 40800
},
{
"epoch": 2.8789638545736107,
"grad_norm": 15.795037269592285,
"learning_rate": 2.239545312337053e-06,
"loss": 2.6245,
"step": 40900
},
{
"epoch": 2.886002886002886,
"grad_norm": 16.411415100097656,
"learning_rate": 2.1091876108040467e-06,
"loss": 2.6368,
"step": 41000
},
{
"epoch": 2.886002886002886,
"eval_runtime": 192.2421,
"eval_samples_per_second": 147.798,
"eval_steps_per_second": 18.477,
"step": 41000
},
{
"epoch": 2.8930419174321615,
"grad_norm": 16.8485050201416,
"learning_rate": 1.97882990927104e-06,
"loss": 2.5946,
"step": 41100
},
{
"epoch": 2.9000809488614365,
"grad_norm": 15.294781684875488,
"learning_rate": 1.8484722077380334e-06,
"loss": 2.6035,
"step": 41200
},
{
"epoch": 2.907119980290712,
"grad_norm": 26.89401626586914,
"learning_rate": 1.7181145062050267e-06,
"loss": 2.6353,
"step": 41300
},
{
"epoch": 2.9141590117199874,
"grad_norm": 13.004213333129883,
"learning_rate": 1.58775680467202e-06,
"loss": 2.6299,
"step": 41400
},
{
"epoch": 2.921198043149263,
"grad_norm": 17.197162628173828,
"learning_rate": 1.4573991031390136e-06,
"loss": 2.6031,
"step": 41500
},
{
"epoch": 2.921198043149263,
"eval_runtime": 192.0882,
"eval_samples_per_second": 147.916,
"eval_steps_per_second": 18.492,
"step": 41500
},
{
"epoch": 2.928237074578538,
"grad_norm": 16.04857063293457,
"learning_rate": 1.327041401606007e-06,
"loss": 2.5869,
"step": 41600
},
{
"epoch": 2.935276106007813,
"grad_norm": 14.147359848022461,
"learning_rate": 1.1966837000730005e-06,
"loss": 2.5851,
"step": 41700
},
{
"epoch": 2.9423151374370886,
"grad_norm": 17.802715301513672,
"learning_rate": 1.0663259985399938e-06,
"loss": 2.4637,
"step": 41800
},
{
"epoch": 2.949354168866364,
"grad_norm": 20.130615234375,
"learning_rate": 9.359682970069872e-07,
"loss": 2.5418,
"step": 41900
},
{
"epoch": 2.9563932002956395,
"grad_norm": 18.158117294311523,
"learning_rate": 8.056105954739805e-07,
"loss": 2.5456,
"step": 42000
},
{
"epoch": 2.9563932002956395,
"eval_runtime": 192.2432,
"eval_samples_per_second": 147.797,
"eval_steps_per_second": 18.477,
"step": 42000
},
{
"epoch": 2.963432231724915,
"grad_norm": 15.710502624511719,
"learning_rate": 6.75252893940974e-07,
"loss": 2.6082,
"step": 42100
},
{
"epoch": 2.97047126315419,
"grad_norm": 15.316740989685059,
"learning_rate": 5.448951924079675e-07,
"loss": 2.5569,
"step": 42200
},
{
"epoch": 2.9775102945834653,
"grad_norm": 17.120691299438477,
"learning_rate": 4.145374908749609e-07,
"loss": 2.6017,
"step": 42300
},
{
"epoch": 2.9845493260127407,
"grad_norm": 14.475923538208008,
"learning_rate": 2.841797893419543e-07,
"loss": 2.5627,
"step": 42400
},
{
"epoch": 2.991588357442016,
"grad_norm": 16.937416076660156,
"learning_rate": 1.5382208780894776e-07,
"loss": 2.557,
"step": 42500
},
{
"epoch": 2.991588357442016,
"eval_runtime": 192.9822,
"eval_samples_per_second": 147.231,
"eval_steps_per_second": 18.406,
"step": 42500
},
{
"epoch": 2.9986273888712915,
"grad_norm": 25.87067413330078,
"learning_rate": 2.3464386275941184e-08,
"loss": 2.5282,
"step": 42600
}
],
"logging_steps": 100,
"max_steps": 42618,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1152556854519332e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}