{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.991588357442016, "eval_steps": 500, "global_step": 42500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.007039031429275332, "grad_norm": 36.44467544555664, "learning_rate": 1.1731581417175035e-06, "loss": 6.2071, "step": 100 }, { "epoch": 0.014078062858550663, "grad_norm": 43.126609802246094, "learning_rate": 2.346316283435007e-06, "loss": 5.9231, "step": 200 }, { "epoch": 0.021117094287825995, "grad_norm": 29.171634674072266, "learning_rate": 3.5194744251525106e-06, "loss": 5.4145, "step": 300 }, { "epoch": 0.028156125717101327, "grad_norm": 38.217105865478516, "learning_rate": 4.692632566870014e-06, "loss": 4.9149, "step": 400 }, { "epoch": 0.03519515714637666, "grad_norm": 35.40254211425781, "learning_rate": 5.865790708587518e-06, "loss": 4.5052, "step": 500 }, { "epoch": 0.03519515714637666, "eval_runtime": 191.3754, "eval_samples_per_second": 148.467, "eval_steps_per_second": 18.56, "step": 500 }, { "epoch": 0.04223418857565199, "grad_norm": 36.732643127441406, "learning_rate": 7.038948850305021e-06, "loss": 4.3715, "step": 600 }, { "epoch": 0.04927322000492732, "grad_norm": 36.27021408081055, "learning_rate": 8.212106992022525e-06, "loss": 4.3269, "step": 700 }, { "epoch": 0.056312251434202654, "grad_norm": 42.45858383178711, "learning_rate": 9.385265133740028e-06, "loss": 4.0589, "step": 800 }, { "epoch": 0.06335128286347799, "grad_norm": 42.08483123779297, "learning_rate": 1.0558423275457532e-05, "loss": 4.1336, "step": 900 }, { "epoch": 0.07039031429275332, "grad_norm": 42.23253631591797, "learning_rate": 1.1731581417175035e-05, "loss": 4.0719, "step": 1000 }, { "epoch": 0.07039031429275332, "eval_runtime": 193.8033, "eval_samples_per_second": 146.607, "eval_steps_per_second": 18.328, "step": 1000 }, { "epoch": 0.07742934572202866, "grad_norm": 48.905662536621094, "learning_rate": 1.2904739558892539e-05, "loss": 3.8613, "step": 1100 }, { "epoch": 0.08446837715130398, "grad_norm": 37.9277458190918, "learning_rate": 1.4077897700610042e-05, "loss": 3.8424, "step": 1200 }, { "epoch": 0.09150740858057932, "grad_norm": 48.82701110839844, "learning_rate": 1.5251055842327546e-05, "loss": 3.7771, "step": 1300 }, { "epoch": 0.09854644000985464, "grad_norm": 33.38028335571289, "learning_rate": 1.642421398404505e-05, "loss": 3.8094, "step": 1400 }, { "epoch": 0.10558547143912998, "grad_norm": 61.35352325439453, "learning_rate": 1.7597372125762555e-05, "loss": 3.8331, "step": 1500 }, { "epoch": 0.10558547143912998, "eval_runtime": 191.8384, "eval_samples_per_second": 148.109, "eval_steps_per_second": 18.516, "step": 1500 }, { "epoch": 0.11262450286840531, "grad_norm": 46.74394226074219, "learning_rate": 1.8770530267480057e-05, "loss": 3.6822, "step": 1600 }, { "epoch": 0.11966353429768065, "grad_norm": 35.53325271606445, "learning_rate": 1.9943688409197562e-05, "loss": 3.6282, "step": 1700 }, { "epoch": 0.12670256572695598, "grad_norm": 37.73524856567383, "learning_rate": 2.1116846550915064e-05, "loss": 3.5722, "step": 1800 }, { "epoch": 0.1337415971562313, "grad_norm": 33.76814651489258, "learning_rate": 2.229000469263257e-05, "loss": 3.6086, "step": 1900 }, { "epoch": 0.14078062858550663, "grad_norm": 41.888282775878906, "learning_rate": 2.346316283435007e-05, "loss": 3.6142, "step": 2000 }, { "epoch": 0.14078062858550663, "eval_runtime": 191.5815, "eval_samples_per_second": 148.308, "eval_steps_per_second": 18.54, "step": 2000 }, { "epoch": 0.14781966001478197, "grad_norm": 39.62664031982422, "learning_rate": 2.4636320976067576e-05, "loss": 3.6029, "step": 2100 }, { "epoch": 0.1548586914440573, "grad_norm": 38.377532958984375, "learning_rate": 2.5809479117785078e-05, "loss": 3.4959, "step": 2200 }, { "epoch": 0.16189772287333262, "grad_norm": 32.66987991333008, "learning_rate": 2.698263725950258e-05, "loss": 3.5252, "step": 2300 }, { "epoch": 0.16893675430260796, "grad_norm": 39.213592529296875, "learning_rate": 2.8155795401220085e-05, "loss": 3.5859, "step": 2400 }, { "epoch": 0.1759757857318833, "grad_norm": 31.646276473999023, "learning_rate": 2.9328953542937587e-05, "loss": 3.4995, "step": 2500 }, { "epoch": 0.1759757857318833, "eval_runtime": 194.3308, "eval_samples_per_second": 146.209, "eval_steps_per_second": 18.278, "step": 2500 }, { "epoch": 0.18301481716115864, "grad_norm": 32.30677032470703, "learning_rate": 3.0502111684655092e-05, "loss": 3.5853, "step": 2600 }, { "epoch": 0.19005384859043395, "grad_norm": 31.175769805908203, "learning_rate": 3.1675269826372594e-05, "loss": 3.5134, "step": 2700 }, { "epoch": 0.1970928800197093, "grad_norm": 31.389162063598633, "learning_rate": 3.28484279680901e-05, "loss": 3.4909, "step": 2800 }, { "epoch": 0.20413191144898463, "grad_norm": 33.105369567871094, "learning_rate": 3.4021586109807604e-05, "loss": 3.4099, "step": 2900 }, { "epoch": 0.21117094287825997, "grad_norm": 25.477977752685547, "learning_rate": 3.519474425152511e-05, "loss": 3.3823, "step": 3000 }, { "epoch": 0.21117094287825997, "eval_runtime": 194.3983, "eval_samples_per_second": 146.159, "eval_steps_per_second": 18.272, "step": 3000 }, { "epoch": 0.21820997430753528, "grad_norm": 29.61454200744629, "learning_rate": 3.636790239324261e-05, "loss": 3.3476, "step": 3100 }, { "epoch": 0.22524900573681061, "grad_norm": 26.82366180419922, "learning_rate": 3.754106053496011e-05, "loss": 3.389, "step": 3200 }, { "epoch": 0.23228803716608595, "grad_norm": 26.6168155670166, "learning_rate": 3.871421867667762e-05, "loss": 3.3712, "step": 3300 }, { "epoch": 0.2393270685953613, "grad_norm": 24.504793167114258, "learning_rate": 3.9887376818395124e-05, "loss": 3.2693, "step": 3400 }, { "epoch": 0.2463661000246366, "grad_norm": 22.34451675415039, "learning_rate": 4.106053496011262e-05, "loss": 3.3719, "step": 3500 }, { "epoch": 0.2463661000246366, "eval_runtime": 192.2522, "eval_samples_per_second": 147.79, "eval_steps_per_second": 18.476, "step": 3500 }, { "epoch": 0.25340513145391197, "grad_norm": 30.370140075683594, "learning_rate": 4.223369310183013e-05, "loss": 3.3216, "step": 3600 }, { "epoch": 0.2604441628831873, "grad_norm": 29.111398696899414, "learning_rate": 4.340685124354763e-05, "loss": 3.3085, "step": 3700 }, { "epoch": 0.2674831943124626, "grad_norm": 29.50999641418457, "learning_rate": 4.458000938526514e-05, "loss": 3.2907, "step": 3800 }, { "epoch": 0.27452222574173796, "grad_norm": 21.999244689941406, "learning_rate": 4.5753167526982636e-05, "loss": 3.2173, "step": 3900 }, { "epoch": 0.28156125717101327, "grad_norm": 28.0905818939209, "learning_rate": 4.692632566870014e-05, "loss": 3.3431, "step": 4000 }, { "epoch": 0.28156125717101327, "eval_runtime": 192.489, "eval_samples_per_second": 147.608, "eval_steps_per_second": 18.453, "step": 4000 }, { "epoch": 0.2886002886002886, "grad_norm": 27.252222061157227, "learning_rate": 4.809948381041765e-05, "loss": 3.4265, "step": 4100 }, { "epoch": 0.29563932002956395, "grad_norm": 20.001508712768555, "learning_rate": 4.927264195213515e-05, "loss": 3.2489, "step": 4200 }, { "epoch": 0.30267835145883926, "grad_norm": 24.947546005249023, "learning_rate": 4.995046407341746e-05, "loss": 3.2957, "step": 4300 }, { "epoch": 0.3097173828881146, "grad_norm": 18.58955192565918, "learning_rate": 4.982010637188445e-05, "loss": 3.2328, "step": 4400 }, { "epoch": 0.31675641431738993, "grad_norm": 22.946285247802734, "learning_rate": 4.968974867035145e-05, "loss": 3.177, "step": 4500 }, { "epoch": 0.31675641431738993, "eval_runtime": 192.63, "eval_samples_per_second": 147.5, "eval_steps_per_second": 18.439, "step": 4500 }, { "epoch": 0.32379544574666524, "grad_norm": 20.17714500427246, "learning_rate": 4.955939096881844e-05, "loss": 3.3241, "step": 4600 }, { "epoch": 0.3308344771759406, "grad_norm": 18.580751419067383, "learning_rate": 4.9429033267285435e-05, "loss": 3.3084, "step": 4700 }, { "epoch": 0.3378735086052159, "grad_norm": 16.068750381469727, "learning_rate": 4.929867556575243e-05, "loss": 3.1674, "step": 4800 }, { "epoch": 0.34491254003449123, "grad_norm": 23.636520385742188, "learning_rate": 4.916831786421942e-05, "loss": 3.2698, "step": 4900 }, { "epoch": 0.3519515714637666, "grad_norm": 19.445907592773438, "learning_rate": 4.903796016268641e-05, "loss": 3.2824, "step": 5000 }, { "epoch": 0.3519515714637666, "eval_runtime": 192.779, "eval_samples_per_second": 147.386, "eval_steps_per_second": 18.425, "step": 5000 }, { "epoch": 0.3589906028930419, "grad_norm": 20.217737197875977, "learning_rate": 4.8907602461153405e-05, "loss": 3.3673, "step": 5100 }, { "epoch": 0.3660296343223173, "grad_norm": 16.03109359741211, "learning_rate": 4.87772447596204e-05, "loss": 3.3105, "step": 5200 }, { "epoch": 0.3730686657515926, "grad_norm": 21.388507843017578, "learning_rate": 4.864688705808739e-05, "loss": 3.1998, "step": 5300 }, { "epoch": 0.3801076971808679, "grad_norm": 16.931922912597656, "learning_rate": 4.851652935655439e-05, "loss": 3.2054, "step": 5400 }, { "epoch": 0.38714672861014326, "grad_norm": 24.145727157592773, "learning_rate": 4.838617165502138e-05, "loss": 3.0802, "step": 5500 }, { "epoch": 0.38714672861014326, "eval_runtime": 192.7089, "eval_samples_per_second": 147.44, "eval_steps_per_second": 18.432, "step": 5500 }, { "epoch": 0.3941857600394186, "grad_norm": 24.199636459350586, "learning_rate": 4.8255813953488375e-05, "loss": 3.0688, "step": 5600 }, { "epoch": 0.4012247914686939, "grad_norm": 18.878751754760742, "learning_rate": 4.812545625195537e-05, "loss": 3.1667, "step": 5700 }, { "epoch": 0.40826382289796925, "grad_norm": 17.39369010925293, "learning_rate": 4.7995098550422363e-05, "loss": 3.1905, "step": 5800 }, { "epoch": 0.41530285432724456, "grad_norm": 19.778793334960938, "learning_rate": 4.786474084888936e-05, "loss": 3.0883, "step": 5900 }, { "epoch": 0.42234188575651993, "grad_norm": 14.80075454711914, "learning_rate": 4.773438314735635e-05, "loss": 3.0839, "step": 6000 }, { "epoch": 0.42234188575651993, "eval_runtime": 192.7353, "eval_samples_per_second": 147.42, "eval_steps_per_second": 18.429, "step": 6000 }, { "epoch": 0.42938091718579524, "grad_norm": 27.43608856201172, "learning_rate": 4.760402544582334e-05, "loss": 3.0621, "step": 6100 }, { "epoch": 0.43641994861507055, "grad_norm": 18.219221115112305, "learning_rate": 4.747366774429033e-05, "loss": 3.1461, "step": 6200 }, { "epoch": 0.4434589800443459, "grad_norm": 17.97977638244629, "learning_rate": 4.734331004275733e-05, "loss": 3.0795, "step": 6300 }, { "epoch": 0.45049801147362123, "grad_norm": 21.358592987060547, "learning_rate": 4.721295234122432e-05, "loss": 3.1361, "step": 6400 }, { "epoch": 0.45753704290289654, "grad_norm": 15.679008483886719, "learning_rate": 4.7082594639691315e-05, "loss": 3.0751, "step": 6500 }, { "epoch": 0.45753704290289654, "eval_runtime": 193.5839, "eval_samples_per_second": 146.774, "eval_steps_per_second": 18.349, "step": 6500 }, { "epoch": 0.4645760743321719, "grad_norm": 14.288241386413574, "learning_rate": 4.69522369381583e-05, "loss": 3.0886, "step": 6600 }, { "epoch": 0.4716151057614472, "grad_norm": 15.016201972961426, "learning_rate": 4.6821879236625304e-05, "loss": 2.973, "step": 6700 }, { "epoch": 0.4786541371907226, "grad_norm": 20.513479232788086, "learning_rate": 4.66915215350923e-05, "loss": 3.1001, "step": 6800 }, { "epoch": 0.4856931686199979, "grad_norm": 15.093891143798828, "learning_rate": 4.656116383355929e-05, "loss": 3.1073, "step": 6900 }, { "epoch": 0.4927322000492732, "grad_norm": 15.074331283569336, "learning_rate": 4.6430806132026286e-05, "loss": 3.1247, "step": 7000 }, { "epoch": 0.4927322000492732, "eval_runtime": 193.2211, "eval_samples_per_second": 147.049, "eval_steps_per_second": 18.383, "step": 7000 }, { "epoch": 0.49977123147854857, "grad_norm": 19.936674118041992, "learning_rate": 4.630044843049327e-05, "loss": 3.0331, "step": 7100 }, { "epoch": 0.5068102629078239, "grad_norm": 22.168909072875977, "learning_rate": 4.617009072896027e-05, "loss": 3.0611, "step": 7200 }, { "epoch": 0.5138492943370992, "grad_norm": 19.541671752929688, "learning_rate": 4.603973302742726e-05, "loss": 2.9378, "step": 7300 }, { "epoch": 0.5208883257663746, "grad_norm": 16.111820220947266, "learning_rate": 4.5909375325894256e-05, "loss": 3.0461, "step": 7400 }, { "epoch": 0.5279273571956499, "grad_norm": 18.88793182373047, "learning_rate": 4.577901762436125e-05, "loss": 3.0183, "step": 7500 }, { "epoch": 0.5279273571956499, "eval_runtime": 191.8152, "eval_samples_per_second": 148.127, "eval_steps_per_second": 18.518, "step": 7500 }, { "epoch": 0.5349663886249252, "grad_norm": 15.234626770019531, "learning_rate": 4.5648659922828244e-05, "loss": 3.0287, "step": 7600 }, { "epoch": 0.5420054200542005, "grad_norm": 16.836956024169922, "learning_rate": 4.551830222129523e-05, "loss": 2.9991, "step": 7700 }, { "epoch": 0.5490444514834759, "grad_norm": 20.984453201293945, "learning_rate": 4.538794451976223e-05, "loss": 3.0765, "step": 7800 }, { "epoch": 0.5560834829127512, "grad_norm": 18.113004684448242, "learning_rate": 4.5257586818229226e-05, "loss": 2.9704, "step": 7900 }, { "epoch": 0.5631225143420265, "grad_norm": 20.654579162597656, "learning_rate": 4.512722911669622e-05, "loss": 3.0604, "step": 8000 }, { "epoch": 0.5631225143420265, "eval_runtime": 193.8004, "eval_samples_per_second": 146.61, "eval_steps_per_second": 18.328, "step": 8000 }, { "epoch": 0.5701615457713018, "grad_norm": 18.815162658691406, "learning_rate": 4.499687141516321e-05, "loss": 2.8726, "step": 8100 }, { "epoch": 0.5772005772005772, "grad_norm": 15.366965293884277, "learning_rate": 4.48665137136302e-05, "loss": 3.0509, "step": 8200 }, { "epoch": 0.5842396086298526, "grad_norm": 17.203136444091797, "learning_rate": 4.4736156012097196e-05, "loss": 2.9718, "step": 8300 }, { "epoch": 0.5912786400591279, "grad_norm": 15.31092643737793, "learning_rate": 4.460579831056419e-05, "loss": 3.0382, "step": 8400 }, { "epoch": 0.5983176714884032, "grad_norm": 12.381194114685059, "learning_rate": 4.4475440609031184e-05, "loss": 2.9945, "step": 8500 }, { "epoch": 0.5983176714884032, "eval_runtime": 192.0001, "eval_samples_per_second": 147.984, "eval_steps_per_second": 18.5, "step": 8500 }, { "epoch": 0.6053567029176785, "grad_norm": 13.362293243408203, "learning_rate": 4.434508290749818e-05, "loss": 3.0427, "step": 8600 }, { "epoch": 0.6123957343469538, "grad_norm": 18.495370864868164, "learning_rate": 4.4214725205965165e-05, "loss": 3.0363, "step": 8700 }, { "epoch": 0.6194347657762292, "grad_norm": 16.98731803894043, "learning_rate": 4.408436750443216e-05, "loss": 2.9257, "step": 8800 }, { "epoch": 0.6264737972055046, "grad_norm": 18.510801315307617, "learning_rate": 4.395400980289916e-05, "loss": 3.0014, "step": 8900 }, { "epoch": 0.6335128286347799, "grad_norm": 15.024514198303223, "learning_rate": 4.3823652101366155e-05, "loss": 3.0435, "step": 9000 }, { "epoch": 0.6335128286347799, "eval_runtime": 192.5991, "eval_samples_per_second": 147.524, "eval_steps_per_second": 18.442, "step": 9000 }, { "epoch": 0.6405518600640552, "grad_norm": 19.763547897338867, "learning_rate": 4.369329439983315e-05, "loss": 2.9219, "step": 9100 }, { "epoch": 0.6475908914933305, "grad_norm": 19.536087036132812, "learning_rate": 4.3562936698300136e-05, "loss": 2.9749, "step": 9200 }, { "epoch": 0.6546299229226058, "grad_norm": 15.909920692443848, "learning_rate": 4.343257899676713e-05, "loss": 2.8793, "step": 9300 }, { "epoch": 0.6616689543518812, "grad_norm": 17.25068473815918, "learning_rate": 4.3302221295234124e-05, "loss": 2.9669, "step": 9400 }, { "epoch": 0.6687079857811565, "grad_norm": 14.32239055633545, "learning_rate": 4.317186359370112e-05, "loss": 2.9066, "step": 9500 }, { "epoch": 0.6687079857811565, "eval_runtime": 191.7836, "eval_samples_per_second": 148.151, "eval_steps_per_second": 18.521, "step": 9500 }, { "epoch": 0.6757470172104318, "grad_norm": 22.611879348754883, "learning_rate": 4.304150589216811e-05, "loss": 2.8333, "step": 9600 }, { "epoch": 0.6827860486397072, "grad_norm": 15.549399375915527, "learning_rate": 4.2911148190635107e-05, "loss": 2.9913, "step": 9700 }, { "epoch": 0.6898250800689825, "grad_norm": 13.915739059448242, "learning_rate": 4.2780790489102094e-05, "loss": 2.9524, "step": 9800 }, { "epoch": 0.6968641114982579, "grad_norm": 17.621822357177734, "learning_rate": 4.265043278756909e-05, "loss": 2.9844, "step": 9900 }, { "epoch": 0.7039031429275332, "grad_norm": 22.748342514038086, "learning_rate": 4.252007508603609e-05, "loss": 2.9083, "step": 10000 }, { "epoch": 0.7039031429275332, "eval_runtime": 193.6184, "eval_samples_per_second": 146.747, "eval_steps_per_second": 18.345, "step": 10000 }, { "epoch": 0.7109421743568085, "grad_norm": 18.526578903198242, "learning_rate": 4.238971738450308e-05, "loss": 2.9706, "step": 10100 }, { "epoch": 0.7179812057860838, "grad_norm": 15.676709175109863, "learning_rate": 4.225935968297007e-05, "loss": 2.8265, "step": 10200 }, { "epoch": 0.7250202372153591, "grad_norm": 18.21067237854004, "learning_rate": 4.2129001981437064e-05, "loss": 2.8592, "step": 10300 }, { "epoch": 0.7320592686446346, "grad_norm": 22.188024520874023, "learning_rate": 4.199864427990406e-05, "loss": 2.8158, "step": 10400 }, { "epoch": 0.7390983000739099, "grad_norm": 18.406801223754883, "learning_rate": 4.186828657837105e-05, "loss": 2.9264, "step": 10500 }, { "epoch": 0.7390983000739099, "eval_runtime": 193.9371, "eval_samples_per_second": 146.506, "eval_steps_per_second": 18.315, "step": 10500 }, { "epoch": 0.7461373315031852, "grad_norm": 20.65268325805664, "learning_rate": 4.173792887683805e-05, "loss": 2.9882, "step": 10600 }, { "epoch": 0.7531763629324605, "grad_norm": 12.223052024841309, "learning_rate": 4.160757117530504e-05, "loss": 2.8969, "step": 10700 }, { "epoch": 0.7602153943617358, "grad_norm": 18.218887329101562, "learning_rate": 4.147721347377203e-05, "loss": 2.8254, "step": 10800 }, { "epoch": 0.7672544257910111, "grad_norm": 19.009950637817383, "learning_rate": 4.134685577223902e-05, "loss": 2.9361, "step": 10900 }, { "epoch": 0.7742934572202865, "grad_norm": 33.881927490234375, "learning_rate": 4.1216498070706016e-05, "loss": 2.8528, "step": 11000 }, { "epoch": 0.7742934572202865, "eval_runtime": 194.2555, "eval_samples_per_second": 146.266, "eval_steps_per_second": 18.285, "step": 11000 }, { "epoch": 0.7813324886495618, "grad_norm": 19.02928924560547, "learning_rate": 4.108614036917302e-05, "loss": 2.9383, "step": 11100 }, { "epoch": 0.7883715200788372, "grad_norm": 18.154483795166016, "learning_rate": 4.095578266764001e-05, "loss": 2.7691, "step": 11200 }, { "epoch": 0.7954105515081125, "grad_norm": 13.669476509094238, "learning_rate": 4.0825424966107e-05, "loss": 2.8306, "step": 11300 }, { "epoch": 0.8024495829373878, "grad_norm": 16.23528289794922, "learning_rate": 4.069506726457399e-05, "loss": 2.8588, "step": 11400 }, { "epoch": 0.8094886143666632, "grad_norm": 16.63111686706543, "learning_rate": 4.056470956304099e-05, "loss": 2.91, "step": 11500 }, { "epoch": 0.8094886143666632, "eval_runtime": 193.9814, "eval_samples_per_second": 146.473, "eval_steps_per_second": 18.311, "step": 11500 }, { "epoch": 0.8165276457959385, "grad_norm": 19.989736557006836, "learning_rate": 4.043435186150798e-05, "loss": 2.8754, "step": 11600 }, { "epoch": 0.8235666772252138, "grad_norm": 15.608553886413574, "learning_rate": 4.0303994159974975e-05, "loss": 2.8896, "step": 11700 }, { "epoch": 0.8306057086544891, "grad_norm": 17.236600875854492, "learning_rate": 4.017363645844196e-05, "loss": 2.8897, "step": 11800 }, { "epoch": 0.8376447400837644, "grad_norm": 16.03377342224121, "learning_rate": 4.0043278756908957e-05, "loss": 2.8936, "step": 11900 }, { "epoch": 0.8446837715130399, "grad_norm": 25.3082332611084, "learning_rate": 3.991292105537595e-05, "loss": 2.8939, "step": 12000 }, { "epoch": 0.8446837715130399, "eval_runtime": 192.6986, "eval_samples_per_second": 147.448, "eval_steps_per_second": 18.433, "step": 12000 }, { "epoch": 0.8517228029423152, "grad_norm": 18.766387939453125, "learning_rate": 3.9782563353842945e-05, "loss": 2.9448, "step": 12100 }, { "epoch": 0.8587618343715905, "grad_norm": 17.019485473632812, "learning_rate": 3.9652205652309946e-05, "loss": 2.7899, "step": 12200 }, { "epoch": 0.8658008658008658, "grad_norm": 15.821990966796875, "learning_rate": 3.952184795077693e-05, "loss": 2.8069, "step": 12300 }, { "epoch": 0.8728398972301411, "grad_norm": 19.532939910888672, "learning_rate": 3.939149024924393e-05, "loss": 2.8797, "step": 12400 }, { "epoch": 0.8798789286594164, "grad_norm": 15.134819030761719, "learning_rate": 3.926113254771092e-05, "loss": 2.89, "step": 12500 }, { "epoch": 0.8798789286594164, "eval_runtime": 191.6112, "eval_samples_per_second": 148.285, "eval_steps_per_second": 18.538, "step": 12500 }, { "epoch": 0.8869179600886918, "grad_norm": 15.088654518127441, "learning_rate": 3.9130774846177915e-05, "loss": 2.8519, "step": 12600 }, { "epoch": 0.8939569915179671, "grad_norm": 16.8511962890625, "learning_rate": 3.900041714464491e-05, "loss": 2.7864, "step": 12700 }, { "epoch": 0.9009960229472425, "grad_norm": 19.323467254638672, "learning_rate": 3.8870059443111903e-05, "loss": 2.9093, "step": 12800 }, { "epoch": 0.9080350543765178, "grad_norm": 16.42205810546875, "learning_rate": 3.873970174157889e-05, "loss": 2.8426, "step": 12900 }, { "epoch": 0.9150740858057931, "grad_norm": 16.470041275024414, "learning_rate": 3.8609344040045885e-05, "loss": 2.9097, "step": 13000 }, { "epoch": 0.9150740858057931, "eval_runtime": 192.3033, "eval_samples_per_second": 147.751, "eval_steps_per_second": 18.471, "step": 13000 }, { "epoch": 0.9221131172350685, "grad_norm": 15.83154296875, "learning_rate": 3.847898633851288e-05, "loss": 2.8286, "step": 13100 }, { "epoch": 0.9291521486643438, "grad_norm": 14.70117473602295, "learning_rate": 3.834862863697987e-05, "loss": 2.7575, "step": 13200 }, { "epoch": 0.9361911800936191, "grad_norm": 12.124509811401367, "learning_rate": 3.821827093544687e-05, "loss": 2.8621, "step": 13300 }, { "epoch": 0.9432302115228944, "grad_norm": 16.292022705078125, "learning_rate": 3.808791323391386e-05, "loss": 2.8619, "step": 13400 }, { "epoch": 0.9502692429521697, "grad_norm": 18.999359130859375, "learning_rate": 3.7957555532380855e-05, "loss": 2.8396, "step": 13500 }, { "epoch": 0.9502692429521697, "eval_runtime": 192.4881, "eval_samples_per_second": 147.609, "eval_steps_per_second": 18.453, "step": 13500 }, { "epoch": 0.9573082743814452, "grad_norm": 17.78417205810547, "learning_rate": 3.782719783084785e-05, "loss": 2.842, "step": 13600 }, { "epoch": 0.9643473058107205, "grad_norm": 16.267335891723633, "learning_rate": 3.7696840129314844e-05, "loss": 2.8373, "step": 13700 }, { "epoch": 0.9713863372399958, "grad_norm": 15.711287498474121, "learning_rate": 3.756648242778184e-05, "loss": 2.9, "step": 13800 }, { "epoch": 0.9784253686692711, "grad_norm": 26.957563400268555, "learning_rate": 3.7436124726248825e-05, "loss": 2.9499, "step": 13900 }, { "epoch": 0.9854644000985464, "grad_norm": 11.875740051269531, "learning_rate": 3.730576702471582e-05, "loss": 2.7061, "step": 14000 }, { "epoch": 0.9854644000985464, "eval_runtime": 192.2723, "eval_samples_per_second": 147.775, "eval_steps_per_second": 18.474, "step": 14000 }, { "epoch": 0.9925034315278217, "grad_norm": 21.315786361694336, "learning_rate": 3.717540932318281e-05, "loss": 2.8303, "step": 14100 }, { "epoch": 0.9995424629570971, "grad_norm": 13.713945388793945, "learning_rate": 3.704505162164981e-05, "loss": 2.7378, "step": 14200 }, { "epoch": 1.0065814943863725, "grad_norm": 16.48957633972168, "learning_rate": 3.69146939201168e-05, "loss": 2.7313, "step": 14300 }, { "epoch": 1.0136205258156479, "grad_norm": 19.689464569091797, "learning_rate": 3.6784336218583796e-05, "loss": 2.7612, "step": 14400 }, { "epoch": 1.020659557244923, "grad_norm": 19.848342895507812, "learning_rate": 3.665397851705079e-05, "loss": 2.8205, "step": 14500 }, { "epoch": 1.020659557244923, "eval_runtime": 193.2356, "eval_samples_per_second": 147.038, "eval_steps_per_second": 18.382, "step": 14500 }, { "epoch": 1.0276985886741985, "grad_norm": 23.994319915771484, "learning_rate": 3.6523620815517784e-05, "loss": 2.7362, "step": 14600 }, { "epoch": 1.0347376201034737, "grad_norm": 16.060455322265625, "learning_rate": 3.639326311398478e-05, "loss": 2.8076, "step": 14700 }, { "epoch": 1.0417766515327491, "grad_norm": 17.874704360961914, "learning_rate": 3.626290541245177e-05, "loss": 2.6903, "step": 14800 }, { "epoch": 1.0488156829620243, "grad_norm": 15.949551582336426, "learning_rate": 3.613254771091876e-05, "loss": 2.7765, "step": 14900 }, { "epoch": 1.0558547143912997, "grad_norm": 17.344772338867188, "learning_rate": 3.6002190009385754e-05, "loss": 2.9004, "step": 15000 }, { "epoch": 1.0558547143912997, "eval_runtime": 194.9228, "eval_samples_per_second": 145.765, "eval_steps_per_second": 18.223, "step": 15000 }, { "epoch": 1.0628937458205752, "grad_norm": 13.621015548706055, "learning_rate": 3.587183230785275e-05, "loss": 2.7808, "step": 15100 }, { "epoch": 1.0699327772498504, "grad_norm": 21.069551467895508, "learning_rate": 3.574147460631974e-05, "loss": 2.7421, "step": 15200 }, { "epoch": 1.0769718086791258, "grad_norm": 15.854650497436523, "learning_rate": 3.5611116904786736e-05, "loss": 2.7086, "step": 15300 }, { "epoch": 1.084010840108401, "grad_norm": 19.217153549194336, "learning_rate": 3.548075920325373e-05, "loss": 2.7957, "step": 15400 }, { "epoch": 1.0910498715376764, "grad_norm": 20.781291961669922, "learning_rate": 3.5350401501720724e-05, "loss": 2.8039, "step": 15500 }, { "epoch": 1.0910498715376764, "eval_runtime": 192.3565, "eval_samples_per_second": 147.71, "eval_steps_per_second": 18.466, "step": 15500 }, { "epoch": 1.0980889029669518, "grad_norm": 15.170364379882812, "learning_rate": 3.522004380018772e-05, "loss": 2.7992, "step": 15600 }, { "epoch": 1.105127934396227, "grad_norm": 18.8775634765625, "learning_rate": 3.508968609865471e-05, "loss": 2.7919, "step": 15700 }, { "epoch": 1.1121669658255025, "grad_norm": 19.009754180908203, "learning_rate": 3.4959328397121706e-05, "loss": 2.7361, "step": 15800 }, { "epoch": 1.1192059972547777, "grad_norm": 14.632086753845215, "learning_rate": 3.48289706955887e-05, "loss": 2.8233, "step": 15900 }, { "epoch": 1.126245028684053, "grad_norm": 20.006601333618164, "learning_rate": 3.469861299405569e-05, "loss": 2.738, "step": 16000 }, { "epoch": 1.126245028684053, "eval_runtime": 194.9892, "eval_samples_per_second": 145.716, "eval_steps_per_second": 18.216, "step": 16000 }, { "epoch": 1.1332840601133285, "grad_norm": 14.096820831298828, "learning_rate": 3.456825529252268e-05, "loss": 2.8094, "step": 16100 }, { "epoch": 1.1403230915426037, "grad_norm": 17.261428833007812, "learning_rate": 3.4437897590989676e-05, "loss": 2.8108, "step": 16200 }, { "epoch": 1.1473621229718791, "grad_norm": 14.972962379455566, "learning_rate": 3.430753988945667e-05, "loss": 2.8055, "step": 16300 }, { "epoch": 1.1544011544011543, "grad_norm": 14.345026969909668, "learning_rate": 3.4177182187923664e-05, "loss": 2.7363, "step": 16400 }, { "epoch": 1.1614401858304297, "grad_norm": 15.4429292678833, "learning_rate": 3.404682448639065e-05, "loss": 2.8642, "step": 16500 }, { "epoch": 1.1614401858304297, "eval_runtime": 194.218, "eval_samples_per_second": 146.294, "eval_steps_per_second": 18.289, "step": 16500 }, { "epoch": 1.1684792172597052, "grad_norm": 13.637730598449707, "learning_rate": 3.391646678485765e-05, "loss": 2.6802, "step": 16600 }, { "epoch": 1.1755182486889804, "grad_norm": 16.29159927368164, "learning_rate": 3.3786109083324647e-05, "loss": 2.7796, "step": 16700 }, { "epoch": 1.1825572801182558, "grad_norm": 16.733455657958984, "learning_rate": 3.365575138179164e-05, "loss": 2.8216, "step": 16800 }, { "epoch": 1.189596311547531, "grad_norm": 20.382347106933594, "learning_rate": 3.3525393680258635e-05, "loss": 2.6737, "step": 16900 }, { "epoch": 1.1966353429768064, "grad_norm": 18.506607055664062, "learning_rate": 3.339503597872562e-05, "loss": 2.7144, "step": 17000 }, { "epoch": 1.1966353429768064, "eval_runtime": 192.527, "eval_samples_per_second": 147.579, "eval_steps_per_second": 18.449, "step": 17000 }, { "epoch": 1.2036743744060816, "grad_norm": 16.080759048461914, "learning_rate": 3.3264678277192616e-05, "loss": 2.8013, "step": 17100 }, { "epoch": 1.210713405835357, "grad_norm": 16.123552322387695, "learning_rate": 3.313432057565961e-05, "loss": 2.7444, "step": 17200 }, { "epoch": 1.2177524372646324, "grad_norm": 16.878711700439453, "learning_rate": 3.3003962874126604e-05, "loss": 2.8241, "step": 17300 }, { "epoch": 1.2247914686939076, "grad_norm": 18.64569854736328, "learning_rate": 3.28736051725936e-05, "loss": 2.684, "step": 17400 }, { "epoch": 1.231830500123183, "grad_norm": 16.125022888183594, "learning_rate": 3.274324747106059e-05, "loss": 2.8633, "step": 17500 }, { "epoch": 1.231830500123183, "eval_runtime": 193.4861, "eval_samples_per_second": 146.848, "eval_steps_per_second": 18.358, "step": 17500 }, { "epoch": 1.2388695315524583, "grad_norm": 13.886027336120605, "learning_rate": 3.261288976952758e-05, "loss": 2.7536, "step": 17600 }, { "epoch": 1.2459085629817337, "grad_norm": 15.769869804382324, "learning_rate": 3.248253206799458e-05, "loss": 2.7889, "step": 17700 }, { "epoch": 1.2529475944110091, "grad_norm": 19.419034957885742, "learning_rate": 3.2352174366461575e-05, "loss": 2.7672, "step": 17800 }, { "epoch": 1.2599866258402843, "grad_norm": 18.742015838623047, "learning_rate": 3.222181666492857e-05, "loss": 2.7427, "step": 17900 }, { "epoch": 1.2670256572695597, "grad_norm": 18.40927505493164, "learning_rate": 3.209145896339556e-05, "loss": 2.7596, "step": 18000 }, { "epoch": 1.2670256572695597, "eval_runtime": 194.1066, "eval_samples_per_second": 146.378, "eval_steps_per_second": 18.299, "step": 18000 }, { "epoch": 1.274064688698835, "grad_norm": 21.27202033996582, "learning_rate": 3.196110126186255e-05, "loss": 2.7888, "step": 18100 }, { "epoch": 1.2811037201281104, "grad_norm": 13.953824043273926, "learning_rate": 3.1830743560329545e-05, "loss": 2.7347, "step": 18200 }, { "epoch": 1.2881427515573858, "grad_norm": 16.453821182250977, "learning_rate": 3.170038585879654e-05, "loss": 2.8821, "step": 18300 }, { "epoch": 1.295181782986661, "grad_norm": 16.67236328125, "learning_rate": 3.157002815726353e-05, "loss": 2.8431, "step": 18400 }, { "epoch": 1.3022208144159364, "grad_norm": 13.558029174804688, "learning_rate": 3.143967045573053e-05, "loss": 2.7499, "step": 18500 }, { "epoch": 1.3022208144159364, "eval_runtime": 192.4964, "eval_samples_per_second": 147.603, "eval_steps_per_second": 18.452, "step": 18500 }, { "epoch": 1.3092598458452116, "grad_norm": 15.34234619140625, "learning_rate": 3.1309312754197514e-05, "loss": 2.8225, "step": 18600 }, { "epoch": 1.316298877274487, "grad_norm": 14.304731369018555, "learning_rate": 3.117895505266451e-05, "loss": 2.8369, "step": 18700 }, { "epoch": 1.3233379087037624, "grad_norm": 17.345626831054688, "learning_rate": 3.104859735113151e-05, "loss": 2.6865, "step": 18800 }, { "epoch": 1.3303769401330376, "grad_norm": 16.954349517822266, "learning_rate": 3.09182396495985e-05, "loss": 2.741, "step": 18900 }, { "epoch": 1.337415971562313, "grad_norm": 17.250642776489258, "learning_rate": 3.07878819480655e-05, "loss": 2.8111, "step": 19000 }, { "epoch": 1.337415971562313, "eval_runtime": 192.7825, "eval_samples_per_second": 147.384, "eval_steps_per_second": 18.425, "step": 19000 }, { "epoch": 1.3444550029915883, "grad_norm": 17.004776000976562, "learning_rate": 3.0657524246532485e-05, "loss": 2.7594, "step": 19100 }, { "epoch": 1.3514940344208637, "grad_norm": 15.450813293457031, "learning_rate": 3.052716654499948e-05, "loss": 2.6522, "step": 19200 }, { "epoch": 1.358533065850139, "grad_norm": 15.58588981628418, "learning_rate": 3.0396808843466473e-05, "loss": 2.8346, "step": 19300 }, { "epoch": 1.3655720972794143, "grad_norm": 23.079944610595703, "learning_rate": 3.0266451141933467e-05, "loss": 2.829, "step": 19400 }, { "epoch": 1.3726111287086897, "grad_norm": 23.278108596801758, "learning_rate": 3.0136093440400458e-05, "loss": 2.809, "step": 19500 }, { "epoch": 1.3726111287086897, "eval_runtime": 191.7654, "eval_samples_per_second": 148.165, "eval_steps_per_second": 18.523, "step": 19500 }, { "epoch": 1.379650160137965, "grad_norm": 12.144103050231934, "learning_rate": 3.0005735738867452e-05, "loss": 2.5999, "step": 19600 }, { "epoch": 1.3866891915672404, "grad_norm": 18.378664016723633, "learning_rate": 2.9875378037334446e-05, "loss": 2.8226, "step": 19700 }, { "epoch": 1.3937282229965158, "grad_norm": 15.180033683776855, "learning_rate": 2.9745020335801437e-05, "loss": 2.7773, "step": 19800 }, { "epoch": 1.400767254425791, "grad_norm": 16.611019134521484, "learning_rate": 2.9614662634268438e-05, "loss": 2.7171, "step": 19900 }, { "epoch": 1.4078062858550664, "grad_norm": 14.491551399230957, "learning_rate": 2.948430493273543e-05, "loss": 2.7234, "step": 20000 }, { "epoch": 1.4078062858550664, "eval_runtime": 192.2568, "eval_samples_per_second": 147.787, "eval_steps_per_second": 18.475, "step": 20000 }, { "epoch": 1.4148453172843416, "grad_norm": 15.652689933776855, "learning_rate": 2.9353947231202422e-05, "loss": 2.6237, "step": 20100 }, { "epoch": 1.421884348713617, "grad_norm": 16.404693603515625, "learning_rate": 2.9223589529669417e-05, "loss": 2.7363, "step": 20200 }, { "epoch": 1.4289233801428924, "grad_norm": 13.620403289794922, "learning_rate": 2.9093231828136407e-05, "loss": 2.7651, "step": 20300 }, { "epoch": 1.4359624115721676, "grad_norm": 16.975452423095703, "learning_rate": 2.89628741266034e-05, "loss": 2.8431, "step": 20400 }, { "epoch": 1.443001443001443, "grad_norm": 16.957857131958008, "learning_rate": 2.8832516425070395e-05, "loss": 2.7442, "step": 20500 }, { "epoch": 1.443001443001443, "eval_runtime": 192.9203, "eval_samples_per_second": 147.278, "eval_steps_per_second": 18.412, "step": 20500 }, { "epoch": 1.4500404744307183, "grad_norm": 15.085665702819824, "learning_rate": 2.8702158723537386e-05, "loss": 2.6764, "step": 20600 }, { "epoch": 1.4570795058599937, "grad_norm": 15.870454788208008, "learning_rate": 2.857180102200438e-05, "loss": 2.6491, "step": 20700 }, { "epoch": 1.464118537289269, "grad_norm": 15.54505729675293, "learning_rate": 2.844144332047137e-05, "loss": 2.785, "step": 20800 }, { "epoch": 1.4711575687185443, "grad_norm": 16.786861419677734, "learning_rate": 2.8311085618938365e-05, "loss": 2.656, "step": 20900 }, { "epoch": 1.4781966001478197, "grad_norm": 17.03700828552246, "learning_rate": 2.8180727917405363e-05, "loss": 2.7337, "step": 21000 }, { "epoch": 1.4781966001478197, "eval_runtime": 194.0613, "eval_samples_per_second": 146.413, "eval_steps_per_second": 18.303, "step": 21000 }, { "epoch": 1.485235631577095, "grad_norm": 15.768280982971191, "learning_rate": 2.8050370215872357e-05, "loss": 2.7419, "step": 21100 }, { "epoch": 1.4922746630063703, "grad_norm": 16.70868682861328, "learning_rate": 2.792001251433935e-05, "loss": 2.7001, "step": 21200 }, { "epoch": 1.4993136944356458, "grad_norm": 15.003129005432129, "learning_rate": 2.778965481280634e-05, "loss": 2.6372, "step": 21300 }, { "epoch": 1.506352725864921, "grad_norm": 15.321432113647461, "learning_rate": 2.7659297111273336e-05, "loss": 2.734, "step": 21400 }, { "epoch": 1.5133917572941962, "grad_norm": 12.912035942077637, "learning_rate": 2.752893940974033e-05, "loss": 2.6854, "step": 21500 }, { "epoch": 1.5133917572941962, "eval_runtime": 192.892, "eval_samples_per_second": 147.3, "eval_steps_per_second": 18.414, "step": 21500 }, { "epoch": 1.5204307887234716, "grad_norm": 17.954883575439453, "learning_rate": 2.739858170820732e-05, "loss": 2.6843, "step": 21600 }, { "epoch": 1.527469820152747, "grad_norm": 20.32744026184082, "learning_rate": 2.7268224006674315e-05, "loss": 2.6404, "step": 21700 }, { "epoch": 1.5345088515820224, "grad_norm": 14.839242935180664, "learning_rate": 2.713786630514131e-05, "loss": 2.7235, "step": 21800 }, { "epoch": 1.5415478830112976, "grad_norm": 15.594539642333984, "learning_rate": 2.70075086036083e-05, "loss": 2.6163, "step": 21900 }, { "epoch": 1.5485869144405728, "grad_norm": 14.877588272094727, "learning_rate": 2.6877150902075294e-05, "loss": 2.6331, "step": 22000 }, { "epoch": 1.5485869144405728, "eval_runtime": 191.7338, "eval_samples_per_second": 148.19, "eval_steps_per_second": 18.526, "step": 22000 }, { "epoch": 1.5556259458698483, "grad_norm": 14.593866348266602, "learning_rate": 2.674679320054229e-05, "loss": 2.7969, "step": 22100 }, { "epoch": 1.5626649772991237, "grad_norm": 22.533540725708008, "learning_rate": 2.6616435499009285e-05, "loss": 2.7841, "step": 22200 }, { "epoch": 1.569704008728399, "grad_norm": 16.089982986450195, "learning_rate": 2.6486077797476276e-05, "loss": 2.667, "step": 22300 }, { "epoch": 1.5767430401576743, "grad_norm": 17.49601936340332, "learning_rate": 2.635572009594327e-05, "loss": 2.7963, "step": 22400 }, { "epoch": 1.5837820715869495, "grad_norm": 17.72164535522461, "learning_rate": 2.6225362394410264e-05, "loss": 2.7669, "step": 22500 }, { "epoch": 1.5837820715869495, "eval_runtime": 191.4019, "eval_samples_per_second": 148.447, "eval_steps_per_second": 18.558, "step": 22500 }, { "epoch": 1.590821103016225, "grad_norm": 14.238466262817383, "learning_rate": 2.6095004692877255e-05, "loss": 2.732, "step": 22600 }, { "epoch": 1.5978601344455003, "grad_norm": 19.9779052734375, "learning_rate": 2.596464699134425e-05, "loss": 2.7353, "step": 22700 }, { "epoch": 1.6048991658747758, "grad_norm": 16.89205551147461, "learning_rate": 2.5834289289811243e-05, "loss": 2.7311, "step": 22800 }, { "epoch": 1.611938197304051, "grad_norm": 13.072985649108887, "learning_rate": 2.5703931588278234e-05, "loss": 2.6313, "step": 22900 }, { "epoch": 1.6189772287333262, "grad_norm": 22.408113479614258, "learning_rate": 2.5573573886745228e-05, "loss": 2.605, "step": 23000 }, { "epoch": 1.6189772287333262, "eval_runtime": 192.574, "eval_samples_per_second": 147.543, "eval_steps_per_second": 18.445, "step": 23000 }, { "epoch": 1.6260162601626016, "grad_norm": 21.51888084411621, "learning_rate": 2.5443216185212222e-05, "loss": 2.5964, "step": 23100 }, { "epoch": 1.633055291591877, "grad_norm": 20.486024856567383, "learning_rate": 2.531285848367922e-05, "loss": 2.6883, "step": 23200 }, { "epoch": 1.6400943230211524, "grad_norm": 17.860441207885742, "learning_rate": 2.5182500782146214e-05, "loss": 2.7572, "step": 23300 }, { "epoch": 1.6471333544504276, "grad_norm": 19.4054012298584, "learning_rate": 2.5052143080613204e-05, "loss": 2.7643, "step": 23400 }, { "epoch": 1.6541723858797028, "grad_norm": 15.56551742553711, "learning_rate": 2.49217853790802e-05, "loss": 2.6638, "step": 23500 }, { "epoch": 1.6541723858797028, "eval_runtime": 192.3359, "eval_samples_per_second": 147.726, "eval_steps_per_second": 18.468, "step": 23500 }, { "epoch": 1.6612114173089783, "grad_norm": 22.051755905151367, "learning_rate": 2.4791427677547192e-05, "loss": 2.6905, "step": 23600 }, { "epoch": 1.6682504487382537, "grad_norm": 19.55982208251953, "learning_rate": 2.4661069976014183e-05, "loss": 2.7178, "step": 23700 }, { "epoch": 1.675289480167529, "grad_norm": 14.777819633483887, "learning_rate": 2.4530712274481177e-05, "loss": 2.6219, "step": 23800 }, { "epoch": 1.6823285115968043, "grad_norm": 15.4576997756958, "learning_rate": 2.440035457294817e-05, "loss": 2.6425, "step": 23900 }, { "epoch": 1.6893675430260795, "grad_norm": 18.520376205444336, "learning_rate": 2.4269996871415165e-05, "loss": 2.6541, "step": 24000 }, { "epoch": 1.6893675430260795, "eval_runtime": 191.8782, "eval_samples_per_second": 148.078, "eval_steps_per_second": 18.512, "step": 24000 }, { "epoch": 1.696406574455355, "grad_norm": 18.677989959716797, "learning_rate": 2.413963916988216e-05, "loss": 2.7502, "step": 24100 }, { "epoch": 1.7034456058846303, "grad_norm": 19.01474380493164, "learning_rate": 2.400928146834915e-05, "loss": 2.5849, "step": 24200 }, { "epoch": 1.7104846373139058, "grad_norm": 14.854390144348145, "learning_rate": 2.3878923766816144e-05, "loss": 2.6224, "step": 24300 }, { "epoch": 1.717523668743181, "grad_norm": 16.40928077697754, "learning_rate": 2.374856606528314e-05, "loss": 2.6996, "step": 24400 }, { "epoch": 1.7245627001724562, "grad_norm": 14.962175369262695, "learning_rate": 2.3618208363750133e-05, "loss": 2.6928, "step": 24500 }, { "epoch": 1.7245627001724562, "eval_runtime": 194.5262, "eval_samples_per_second": 146.063, "eval_steps_per_second": 18.26, "step": 24500 }, { "epoch": 1.7316017316017316, "grad_norm": 19.39845085144043, "learning_rate": 2.3487850662217127e-05, "loss": 2.7458, "step": 24600 }, { "epoch": 1.738640763031007, "grad_norm": 16.46622085571289, "learning_rate": 2.3357492960684117e-05, "loss": 2.7465, "step": 24700 }, { "epoch": 1.7456797944602824, "grad_norm": 17.756010055541992, "learning_rate": 2.322713525915111e-05, "loss": 2.7617, "step": 24800 }, { "epoch": 1.7527188258895576, "grad_norm": 17.55894660949707, "learning_rate": 2.3096777557618106e-05, "loss": 2.6085, "step": 24900 }, { "epoch": 1.7597578573188328, "grad_norm": 16.707901000976562, "learning_rate": 2.2966419856085096e-05, "loss": 2.6305, "step": 25000 }, { "epoch": 1.7597578573188328, "eval_runtime": 191.9564, "eval_samples_per_second": 148.018, "eval_steps_per_second": 18.504, "step": 25000 }, { "epoch": 1.7667968887481083, "grad_norm": 21.4102840423584, "learning_rate": 2.2836062154552094e-05, "loss": 2.7509, "step": 25100 }, { "epoch": 1.7738359201773837, "grad_norm": 21.17198944091797, "learning_rate": 2.2705704453019085e-05, "loss": 2.6602, "step": 25200 }, { "epoch": 1.7808749516066589, "grad_norm": 14.665617942810059, "learning_rate": 2.257534675148608e-05, "loss": 2.6576, "step": 25300 }, { "epoch": 1.7879139830359343, "grad_norm": 18.63422393798828, "learning_rate": 2.2444989049953073e-05, "loss": 2.6682, "step": 25400 }, { "epoch": 1.7949530144652095, "grad_norm": 20.478769302368164, "learning_rate": 2.2314631348420064e-05, "loss": 2.5733, "step": 25500 }, { "epoch": 1.7949530144652095, "eval_runtime": 194.5888, "eval_samples_per_second": 146.016, "eval_steps_per_second": 18.254, "step": 25500 }, { "epoch": 1.801992045894485, "grad_norm": 13.144091606140137, "learning_rate": 2.218427364688706e-05, "loss": 2.5946, "step": 25600 }, { "epoch": 1.8090310773237603, "grad_norm": 22.20168685913086, "learning_rate": 2.2053915945354052e-05, "loss": 2.7029, "step": 25700 }, { "epoch": 1.8160701087530355, "grad_norm": 17.39105796813965, "learning_rate": 2.1923558243821046e-05, "loss": 2.7007, "step": 25800 }, { "epoch": 1.823109140182311, "grad_norm": 16.70639419555664, "learning_rate": 2.179320054228804e-05, "loss": 2.6011, "step": 25900 }, { "epoch": 1.8301481716115862, "grad_norm": 23.447750091552734, "learning_rate": 2.166284284075503e-05, "loss": 2.682, "step": 26000 }, { "epoch": 1.8301481716115862, "eval_runtime": 192.912, "eval_samples_per_second": 147.285, "eval_steps_per_second": 18.413, "step": 26000 }, { "epoch": 1.8371872030408616, "grad_norm": 20.410226821899414, "learning_rate": 2.1532485139222025e-05, "loss": 2.7565, "step": 26100 }, { "epoch": 1.844226234470137, "grad_norm": 20.64243507385254, "learning_rate": 2.1402127437689022e-05, "loss": 2.566, "step": 26200 }, { "epoch": 1.8512652658994122, "grad_norm": 15.64989185333252, "learning_rate": 2.1271769736156013e-05, "loss": 2.7237, "step": 26300 }, { "epoch": 1.8583042973286876, "grad_norm": 13.451628684997559, "learning_rate": 2.1141412034623007e-05, "loss": 2.6218, "step": 26400 }, { "epoch": 1.8653433287579628, "grad_norm": 16.763063430786133, "learning_rate": 2.1011054333089998e-05, "loss": 2.5891, "step": 26500 }, { "epoch": 1.8653433287579628, "eval_runtime": 191.7231, "eval_samples_per_second": 148.198, "eval_steps_per_second": 18.527, "step": 26500 }, { "epoch": 1.8723823601872382, "grad_norm": 16.719079971313477, "learning_rate": 2.0880696631556992e-05, "loss": 2.6568, "step": 26600 }, { "epoch": 1.8794213916165137, "grad_norm": 20.345216751098633, "learning_rate": 2.075033893002399e-05, "loss": 2.5889, "step": 26700 }, { "epoch": 1.8864604230457889, "grad_norm": 13.290498733520508, "learning_rate": 2.061998122849098e-05, "loss": 2.6915, "step": 26800 }, { "epoch": 1.893499454475064, "grad_norm": 26.90572738647461, "learning_rate": 2.0489623526957974e-05, "loss": 2.6635, "step": 26900 }, { "epoch": 1.9005384859043395, "grad_norm": 12.706587791442871, "learning_rate": 2.035926582542497e-05, "loss": 2.6886, "step": 27000 }, { "epoch": 1.9005384859043395, "eval_runtime": 193.671, "eval_samples_per_second": 146.708, "eval_steps_per_second": 18.34, "step": 27000 }, { "epoch": 1.907577517333615, "grad_norm": 19.337390899658203, "learning_rate": 2.022890812389196e-05, "loss": 2.5446, "step": 27100 }, { "epoch": 1.9146165487628903, "grad_norm": 16.442127227783203, "learning_rate": 2.0098550422358953e-05, "loss": 2.6562, "step": 27200 }, { "epoch": 1.9216555801921655, "grad_norm": 17.196496963500977, "learning_rate": 1.9968192720825947e-05, "loss": 2.5869, "step": 27300 }, { "epoch": 1.9286946116214407, "grad_norm": 15.884928703308105, "learning_rate": 1.983783501929294e-05, "loss": 2.6127, "step": 27400 }, { "epoch": 1.9357336430507162, "grad_norm": 15.426615715026855, "learning_rate": 1.9707477317759935e-05, "loss": 2.6043, "step": 27500 }, { "epoch": 1.9357336430507162, "eval_runtime": 193.4431, "eval_samples_per_second": 146.88, "eval_steps_per_second": 18.362, "step": 27500 }, { "epoch": 1.9427726744799916, "grad_norm": 20.6138858795166, "learning_rate": 1.9577119616226926e-05, "loss": 2.6387, "step": 27600 }, { "epoch": 1.949811705909267, "grad_norm": 14.545782089233398, "learning_rate": 1.944676191469392e-05, "loss": 2.7687, "step": 27700 }, { "epoch": 1.9568507373385422, "grad_norm": 15.325973510742188, "learning_rate": 1.9316404213160914e-05, "loss": 2.6876, "step": 27800 }, { "epoch": 1.9638897687678174, "grad_norm": 16.72733497619629, "learning_rate": 1.918604651162791e-05, "loss": 2.6131, "step": 27900 }, { "epoch": 1.9709288001970928, "grad_norm": 22.076963424682617, "learning_rate": 1.9055688810094903e-05, "loss": 2.7044, "step": 28000 }, { "epoch": 1.9709288001970928, "eval_runtime": 192.6043, "eval_samples_per_second": 147.52, "eval_steps_per_second": 18.442, "step": 28000 }, { "epoch": 1.9779678316263682, "grad_norm": 17.05091094970703, "learning_rate": 1.8925331108561893e-05, "loss": 2.6023, "step": 28100 }, { "epoch": 1.9850068630556437, "grad_norm": 17.847782135009766, "learning_rate": 1.8794973407028887e-05, "loss": 2.5752, "step": 28200 }, { "epoch": 1.9920458944849189, "grad_norm": 18.966585159301758, "learning_rate": 1.866461570549588e-05, "loss": 2.6339, "step": 28300 }, { "epoch": 1.999084925914194, "grad_norm": 18.27726173400879, "learning_rate": 1.8534258003962876e-05, "loss": 2.6527, "step": 28400 }, { "epoch": 2.0061239573434695, "grad_norm": 16.40408706665039, "learning_rate": 1.840390030242987e-05, "loss": 2.6285, "step": 28500 }, { "epoch": 2.0061239573434695, "eval_runtime": 192.5321, "eval_samples_per_second": 147.575, "eval_steps_per_second": 18.449, "step": 28500 }, { "epoch": 2.013162988772745, "grad_norm": 13.299867630004883, "learning_rate": 1.827354260089686e-05, "loss": 2.6401, "step": 28600 }, { "epoch": 2.0202020202020203, "grad_norm": 16.995622634887695, "learning_rate": 1.8143184899363855e-05, "loss": 2.5986, "step": 28700 }, { "epoch": 2.0272410516312958, "grad_norm": 18.69041633605957, "learning_rate": 1.801282719783085e-05, "loss": 2.644, "step": 28800 }, { "epoch": 2.0342800830605707, "grad_norm": 20.12238883972168, "learning_rate": 1.7882469496297843e-05, "loss": 2.6802, "step": 28900 }, { "epoch": 2.041319114489846, "grad_norm": 14.631281852722168, "learning_rate": 1.7752111794764837e-05, "loss": 2.6362, "step": 29000 }, { "epoch": 2.041319114489846, "eval_runtime": 193.4189, "eval_samples_per_second": 146.899, "eval_steps_per_second": 18.364, "step": 29000 }, { "epoch": 2.0483581459191216, "grad_norm": 16.323118209838867, "learning_rate": 1.7621754093231828e-05, "loss": 2.6249, "step": 29100 }, { "epoch": 2.055397177348397, "grad_norm": 14.623433113098145, "learning_rate": 1.7491396391698822e-05, "loss": 2.6324, "step": 29200 }, { "epoch": 2.062436208777672, "grad_norm": 19.917098999023438, "learning_rate": 1.7361038690165816e-05, "loss": 2.6893, "step": 29300 }, { "epoch": 2.0694752402069474, "grad_norm": 14.357760429382324, "learning_rate": 1.7230680988632807e-05, "loss": 2.5841, "step": 29400 }, { "epoch": 2.076514271636223, "grad_norm": 15.798065185546875, "learning_rate": 1.7100323287099804e-05, "loss": 2.6374, "step": 29500 }, { "epoch": 2.076514271636223, "eval_runtime": 191.4525, "eval_samples_per_second": 148.408, "eval_steps_per_second": 18.553, "step": 29500 }, { "epoch": 2.0835533030654982, "grad_norm": 19.128459930419922, "learning_rate": 1.6969965585566798e-05, "loss": 2.6451, "step": 29600 }, { "epoch": 2.0905923344947737, "grad_norm": 22.39739990234375, "learning_rate": 1.683960788403379e-05, "loss": 2.6732, "step": 29700 }, { "epoch": 2.0976313659240486, "grad_norm": 21.8306827545166, "learning_rate": 1.6709250182500783e-05, "loss": 2.6381, "step": 29800 }, { "epoch": 2.104670397353324, "grad_norm": 16.79404640197754, "learning_rate": 1.6578892480967774e-05, "loss": 2.6643, "step": 29900 }, { "epoch": 2.1117094287825995, "grad_norm": 20.273427963256836, "learning_rate": 1.644853477943477e-05, "loss": 2.6409, "step": 30000 }, { "epoch": 2.1117094287825995, "eval_runtime": 192.3103, "eval_samples_per_second": 147.746, "eval_steps_per_second": 18.47, "step": 30000 }, { "epoch": 2.118748460211875, "grad_norm": 16.260501861572266, "learning_rate": 1.6318177077901765e-05, "loss": 2.6085, "step": 30100 }, { "epoch": 2.1257874916411503, "grad_norm": 17.500699996948242, "learning_rate": 1.6187819376368756e-05, "loss": 2.5923, "step": 30200 }, { "epoch": 2.1328265230704253, "grad_norm": 19.523569107055664, "learning_rate": 1.605746167483575e-05, "loss": 2.562, "step": 30300 }, { "epoch": 2.1398655544997007, "grad_norm": 16.805545806884766, "learning_rate": 1.5927103973302744e-05, "loss": 2.632, "step": 30400 }, { "epoch": 2.146904585928976, "grad_norm": 14.419663429260254, "learning_rate": 1.5796746271769735e-05, "loss": 2.6956, "step": 30500 }, { "epoch": 2.146904585928976, "eval_runtime": 192.4179, "eval_samples_per_second": 147.663, "eval_steps_per_second": 18.46, "step": 30500 }, { "epoch": 2.1539436173582516, "grad_norm": 14.469121932983398, "learning_rate": 1.5666388570236732e-05, "loss": 2.6734, "step": 30600 }, { "epoch": 2.160982648787527, "grad_norm": 14.521267890930176, "learning_rate": 1.5536030868703723e-05, "loss": 2.6272, "step": 30700 }, { "epoch": 2.168021680216802, "grad_norm": 15.565622329711914, "learning_rate": 1.5405673167170717e-05, "loss": 2.5995, "step": 30800 }, { "epoch": 2.1750607116460774, "grad_norm": 18.500350952148438, "learning_rate": 1.527531546563771e-05, "loss": 2.7019, "step": 30900 }, { "epoch": 2.182099743075353, "grad_norm": 18.180660247802734, "learning_rate": 1.5144957764104704e-05, "loss": 2.6347, "step": 31000 }, { "epoch": 2.182099743075353, "eval_runtime": 194.3931, "eval_samples_per_second": 146.163, "eval_steps_per_second": 18.272, "step": 31000 }, { "epoch": 2.1891387745046282, "grad_norm": 15.68535041809082, "learning_rate": 1.5014600062571698e-05, "loss": 2.6679, "step": 31100 }, { "epoch": 2.1961778059339037, "grad_norm": 18.195068359375, "learning_rate": 1.4884242361038692e-05, "loss": 2.6152, "step": 31200 }, { "epoch": 2.2032168373631786, "grad_norm": 19.41796875, "learning_rate": 1.4753884659505684e-05, "loss": 2.6453, "step": 31300 }, { "epoch": 2.210255868792454, "grad_norm": 16.178791046142578, "learning_rate": 1.4623526957972677e-05, "loss": 2.6175, "step": 31400 }, { "epoch": 2.2172949002217295, "grad_norm": 17.970273971557617, "learning_rate": 1.4493169256439671e-05, "loss": 2.5633, "step": 31500 }, { "epoch": 2.2172949002217295, "eval_runtime": 194.9638, "eval_samples_per_second": 145.735, "eval_steps_per_second": 18.219, "step": 31500 }, { "epoch": 2.224333931651005, "grad_norm": 13.679678916931152, "learning_rate": 1.4362811554906663e-05, "loss": 2.5433, "step": 31600 }, { "epoch": 2.2313729630802803, "grad_norm": 18.061559677124023, "learning_rate": 1.423245385337366e-05, "loss": 2.6087, "step": 31700 }, { "epoch": 2.2384119945095553, "grad_norm": 20.32142448425293, "learning_rate": 1.4102096151840652e-05, "loss": 2.6079, "step": 31800 }, { "epoch": 2.2454510259388307, "grad_norm": 16.483491897583008, "learning_rate": 1.3971738450307646e-05, "loss": 2.6693, "step": 31900 }, { "epoch": 2.252490057368106, "grad_norm": 16.30838394165039, "learning_rate": 1.3841380748774638e-05, "loss": 2.5292, "step": 32000 }, { "epoch": 2.252490057368106, "eval_runtime": 192.332, "eval_samples_per_second": 147.729, "eval_steps_per_second": 18.468, "step": 32000 }, { "epoch": 2.2595290887973816, "grad_norm": 18.353946685791016, "learning_rate": 1.371102304724163e-05, "loss": 2.7162, "step": 32100 }, { "epoch": 2.266568120226657, "grad_norm": 14.376470565795898, "learning_rate": 1.3580665345708626e-05, "loss": 2.6581, "step": 32200 }, { "epoch": 2.273607151655932, "grad_norm": 16.625110626220703, "learning_rate": 1.3450307644175619e-05, "loss": 2.5675, "step": 32300 }, { "epoch": 2.2806461830852074, "grad_norm": 17.9268798828125, "learning_rate": 1.3319949942642613e-05, "loss": 2.6001, "step": 32400 }, { "epoch": 2.287685214514483, "grad_norm": 23.196901321411133, "learning_rate": 1.3189592241109605e-05, "loss": 2.6545, "step": 32500 }, { "epoch": 2.287685214514483, "eval_runtime": 192.8693, "eval_samples_per_second": 147.317, "eval_steps_per_second": 18.417, "step": 32500 }, { "epoch": 2.2947242459437582, "grad_norm": 18.328662872314453, "learning_rate": 1.3059234539576598e-05, "loss": 2.5476, "step": 32600 }, { "epoch": 2.3017632773730337, "grad_norm": 16.62209129333496, "learning_rate": 1.2928876838043592e-05, "loss": 2.6778, "step": 32700 }, { "epoch": 2.3088023088023086, "grad_norm": 15.676456451416016, "learning_rate": 1.2798519136510586e-05, "loss": 2.6188, "step": 32800 }, { "epoch": 2.315841340231584, "grad_norm": 21.3188533782959, "learning_rate": 1.266816143497758e-05, "loss": 2.5383, "step": 32900 }, { "epoch": 2.3228803716608595, "grad_norm": 15.253218650817871, "learning_rate": 1.2537803733444572e-05, "loss": 2.6703, "step": 33000 }, { "epoch": 2.3228803716608595, "eval_runtime": 192.6821, "eval_samples_per_second": 147.46, "eval_steps_per_second": 18.435, "step": 33000 }, { "epoch": 2.329919403090135, "grad_norm": 17.341787338256836, "learning_rate": 1.2407446031911565e-05, "loss": 2.6903, "step": 33100 }, { "epoch": 2.3369584345194103, "grad_norm": 14.856354713439941, "learning_rate": 1.2277088330378559e-05, "loss": 2.5655, "step": 33200 }, { "epoch": 2.3439974659486853, "grad_norm": 17.669092178344727, "learning_rate": 1.2146730628845553e-05, "loss": 2.6723, "step": 33300 }, { "epoch": 2.3510364973779607, "grad_norm": 18.183189392089844, "learning_rate": 1.2016372927312545e-05, "loss": 2.6732, "step": 33400 }, { "epoch": 2.358075528807236, "grad_norm": 20.30499267578125, "learning_rate": 1.188601522577954e-05, "loss": 2.6527, "step": 33500 }, { "epoch": 2.358075528807236, "eval_runtime": 192.939, "eval_samples_per_second": 147.264, "eval_steps_per_second": 18.41, "step": 33500 }, { "epoch": 2.3651145602365116, "grad_norm": 22.915029525756836, "learning_rate": 1.1755657524246532e-05, "loss": 2.4797, "step": 33600 }, { "epoch": 2.3721535916657865, "grad_norm": 16.179378509521484, "learning_rate": 1.1625299822713528e-05, "loss": 2.5854, "step": 33700 }, { "epoch": 2.379192623095062, "grad_norm": 14.764082908630371, "learning_rate": 1.149494212118052e-05, "loss": 2.4972, "step": 33800 }, { "epoch": 2.3862316545243374, "grad_norm": 21.402334213256836, "learning_rate": 1.1364584419647513e-05, "loss": 2.5752, "step": 33900 }, { "epoch": 2.393270685953613, "grad_norm": 19.00446891784668, "learning_rate": 1.1234226718114507e-05, "loss": 2.4806, "step": 34000 }, { "epoch": 2.393270685953613, "eval_runtime": 192.7281, "eval_samples_per_second": 147.425, "eval_steps_per_second": 18.43, "step": 34000 }, { "epoch": 2.4003097173828882, "grad_norm": 21.23725700378418, "learning_rate": 1.11038690165815e-05, "loss": 2.5424, "step": 34100 }, { "epoch": 2.407348748812163, "grad_norm": 14.942157745361328, "learning_rate": 1.0973511315048493e-05, "loss": 2.5926, "step": 34200 }, { "epoch": 2.4143877802414386, "grad_norm": 17.429502487182617, "learning_rate": 1.0843153613515487e-05, "loss": 2.5892, "step": 34300 }, { "epoch": 2.421426811670714, "grad_norm": 15.42565631866455, "learning_rate": 1.071279591198248e-05, "loss": 2.5758, "step": 34400 }, { "epoch": 2.4284658430999895, "grad_norm": 20.9206600189209, "learning_rate": 1.0582438210449474e-05, "loss": 2.6666, "step": 34500 }, { "epoch": 2.4284658430999895, "eval_runtime": 192.2868, "eval_samples_per_second": 147.764, "eval_steps_per_second": 18.472, "step": 34500 }, { "epoch": 2.435504874529265, "grad_norm": 16.189416885375977, "learning_rate": 1.0452080508916468e-05, "loss": 2.5727, "step": 34600 }, { "epoch": 2.44254390595854, "grad_norm": 17.95191192626953, "learning_rate": 1.032172280738346e-05, "loss": 2.6171, "step": 34700 }, { "epoch": 2.4495829373878153, "grad_norm": 15.953314781188965, "learning_rate": 1.0191365105850454e-05, "loss": 2.5181, "step": 34800 }, { "epoch": 2.4566219688170907, "grad_norm": 20.293758392333984, "learning_rate": 1.0061007404317447e-05, "loss": 2.6109, "step": 34900 }, { "epoch": 2.463661000246366, "grad_norm": 13.837769508361816, "learning_rate": 9.930649702784441e-06, "loss": 2.6833, "step": 35000 }, { "epoch": 2.463661000246366, "eval_runtime": 192.3429, "eval_samples_per_second": 147.721, "eval_steps_per_second": 18.467, "step": 35000 }, { "epoch": 2.4707000316756416, "grad_norm": 15.594371795654297, "learning_rate": 9.800292001251435e-06, "loss": 2.6111, "step": 35100 }, { "epoch": 2.4777390631049165, "grad_norm": 18.549043655395508, "learning_rate": 9.669934299718427e-06, "loss": 2.5622, "step": 35200 }, { "epoch": 2.484778094534192, "grad_norm": 15.56165599822998, "learning_rate": 9.53957659818542e-06, "loss": 2.5254, "step": 35300 }, { "epoch": 2.4918171259634674, "grad_norm": 14.361612319946289, "learning_rate": 9.409218896652416e-06, "loss": 2.5388, "step": 35400 }, { "epoch": 2.498856157392743, "grad_norm": 17.944364547729492, "learning_rate": 9.278861195119408e-06, "loss": 2.5671, "step": 35500 }, { "epoch": 2.498856157392743, "eval_runtime": 193.2612, "eval_samples_per_second": 147.019, "eval_steps_per_second": 18.379, "step": 35500 }, { "epoch": 2.5058951888220182, "grad_norm": 15.994379043579102, "learning_rate": 9.1485034935864e-06, "loss": 2.4926, "step": 35600 }, { "epoch": 2.512934220251293, "grad_norm": 15.721161842346191, "learning_rate": 9.018145792053395e-06, "loss": 2.5956, "step": 35700 }, { "epoch": 2.5199732516805686, "grad_norm": 21.510955810546875, "learning_rate": 8.887788090520389e-06, "loss": 2.6592, "step": 35800 }, { "epoch": 2.527012283109844, "grad_norm": 16.77272605895996, "learning_rate": 8.757430388987383e-06, "loss": 2.655, "step": 35900 }, { "epoch": 2.5340513145391195, "grad_norm": 18.944421768188477, "learning_rate": 8.627072687454375e-06, "loss": 2.5201, "step": 36000 }, { "epoch": 2.5340513145391195, "eval_runtime": 192.2731, "eval_samples_per_second": 147.774, "eval_steps_per_second": 18.474, "step": 36000 }, { "epoch": 2.541090345968395, "grad_norm": 19.00555992126465, "learning_rate": 8.496714985921368e-06, "loss": 2.5858, "step": 36100 }, { "epoch": 2.54812937739767, "grad_norm": 16.338956832885742, "learning_rate": 8.366357284388362e-06, "loss": 2.5963, "step": 36200 }, { "epoch": 2.5551684088269453, "grad_norm": 15.704483032226562, "learning_rate": 8.235999582855356e-06, "loss": 2.5504, "step": 36300 }, { "epoch": 2.5622074402562207, "grad_norm": 17.013628005981445, "learning_rate": 8.105641881322348e-06, "loss": 2.6663, "step": 36400 }, { "epoch": 2.569246471685496, "grad_norm": 16.901050567626953, "learning_rate": 7.975284179789342e-06, "loss": 2.5827, "step": 36500 }, { "epoch": 2.569246471685496, "eval_runtime": 192.5506, "eval_samples_per_second": 147.561, "eval_steps_per_second": 18.447, "step": 36500 }, { "epoch": 2.5762855031147716, "grad_norm": 16.243534088134766, "learning_rate": 7.844926478256335e-06, "loss": 2.6065, "step": 36600 }, { "epoch": 2.5833245345440465, "grad_norm": 17.0561580657959, "learning_rate": 7.714568776723329e-06, "loss": 2.5166, "step": 36700 }, { "epoch": 2.590363565973322, "grad_norm": 14.800107955932617, "learning_rate": 7.584211075190323e-06, "loss": 2.6966, "step": 36800 }, { "epoch": 2.5974025974025974, "grad_norm": 17.22756576538086, "learning_rate": 7.453853373657315e-06, "loss": 2.5921, "step": 36900 }, { "epoch": 2.604441628831873, "grad_norm": 16.94314956665039, "learning_rate": 7.32349567212431e-06, "loss": 2.7039, "step": 37000 }, { "epoch": 2.604441628831873, "eval_runtime": 192.7372, "eval_samples_per_second": 147.418, "eval_steps_per_second": 18.429, "step": 37000 }, { "epoch": 2.6114806602611482, "grad_norm": 15.262337684631348, "learning_rate": 7.193137970591303e-06, "loss": 2.5053, "step": 37100 }, { "epoch": 2.618519691690423, "grad_norm": 16.485326766967773, "learning_rate": 7.062780269058296e-06, "loss": 2.6282, "step": 37200 }, { "epoch": 2.6255587231196986, "grad_norm": 23.574670791625977, "learning_rate": 6.93242256752529e-06, "loss": 2.5995, "step": 37300 }, { "epoch": 2.632597754548974, "grad_norm": 16.39130973815918, "learning_rate": 6.802064865992283e-06, "loss": 2.5522, "step": 37400 }, { "epoch": 2.6396367859782495, "grad_norm": 20.67544174194336, "learning_rate": 6.671707164459276e-06, "loss": 2.6411, "step": 37500 }, { "epoch": 2.6396367859782495, "eval_runtime": 192.0322, "eval_samples_per_second": 147.96, "eval_steps_per_second": 18.497, "step": 37500 }, { "epoch": 2.646675817407525, "grad_norm": 17.333271026611328, "learning_rate": 6.54134946292627e-06, "loss": 2.6002, "step": 37600 }, { "epoch": 2.6537148488368, "grad_norm": 17.444929122924805, "learning_rate": 6.410991761393263e-06, "loss": 2.4802, "step": 37700 }, { "epoch": 2.6607538802660753, "grad_norm": 17.62455940246582, "learning_rate": 6.2806340598602564e-06, "loss": 2.5169, "step": 37800 }, { "epoch": 2.6677929116953507, "grad_norm": 23.869504928588867, "learning_rate": 6.1502763583272506e-06, "loss": 2.6119, "step": 37900 }, { "epoch": 2.674831943124626, "grad_norm": 14.378959655761719, "learning_rate": 6.019918656794243e-06, "loss": 2.6484, "step": 38000 }, { "epoch": 2.674831943124626, "eval_runtime": 191.7147, "eval_samples_per_second": 148.205, "eval_steps_per_second": 18.528, "step": 38000 }, { "epoch": 2.6818709745539016, "grad_norm": 13.199753761291504, "learning_rate": 5.889560955261237e-06, "loss": 2.5929, "step": 38100 }, { "epoch": 2.6889100059831765, "grad_norm": 19.931673049926758, "learning_rate": 5.75920325372823e-06, "loss": 2.5691, "step": 38200 }, { "epoch": 2.695949037412452, "grad_norm": 16.0571346282959, "learning_rate": 5.6288455521952244e-06, "loss": 2.5593, "step": 38300 }, { "epoch": 2.7029880688417274, "grad_norm": 14.09821605682373, "learning_rate": 5.498487850662217e-06, "loss": 2.5663, "step": 38400 }, { "epoch": 2.710027100271003, "grad_norm": 16.2088680267334, "learning_rate": 5.368130149129211e-06, "loss": 2.5763, "step": 38500 }, { "epoch": 2.710027100271003, "eval_runtime": 192.1652, "eval_samples_per_second": 147.857, "eval_steps_per_second": 18.484, "step": 38500 }, { "epoch": 2.717066131700278, "grad_norm": 26.869508743286133, "learning_rate": 5.237772447596204e-06, "loss": 2.7026, "step": 38600 }, { "epoch": 2.724105163129553, "grad_norm": 17.842239379882812, "learning_rate": 5.107414746063198e-06, "loss": 2.69, "step": 38700 }, { "epoch": 2.7311441945588286, "grad_norm": 17.31543731689453, "learning_rate": 4.977057044530191e-06, "loss": 2.6621, "step": 38800 }, { "epoch": 2.738183225988104, "grad_norm": 15.826437950134277, "learning_rate": 4.846699342997185e-06, "loss": 2.7036, "step": 38900 }, { "epoch": 2.7452222574173795, "grad_norm": 16.656599044799805, "learning_rate": 4.716341641464178e-06, "loss": 2.5078, "step": 39000 }, { "epoch": 2.7452222574173795, "eval_runtime": 191.4626, "eval_samples_per_second": 148.4, "eval_steps_per_second": 18.552, "step": 39000 }, { "epoch": 2.752261288846655, "grad_norm": 16.501192092895508, "learning_rate": 4.585983939931171e-06, "loss": 2.5312, "step": 39100 }, { "epoch": 2.75930032027593, "grad_norm": 17.555389404296875, "learning_rate": 4.455626238398165e-06, "loss": 2.5059, "step": 39200 }, { "epoch": 2.7663393517052053, "grad_norm": 18.289548873901367, "learning_rate": 4.325268536865158e-06, "loss": 2.6702, "step": 39300 }, { "epoch": 2.7733783831344807, "grad_norm": 15.688879013061523, "learning_rate": 4.194910835332152e-06, "loss": 2.5357, "step": 39400 }, { "epoch": 2.780417414563756, "grad_norm": 14.281635284423828, "learning_rate": 4.064553133799144e-06, "loss": 2.6129, "step": 39500 }, { "epoch": 2.780417414563756, "eval_runtime": 193.5326, "eval_samples_per_second": 146.812, "eval_steps_per_second": 18.353, "step": 39500 }, { "epoch": 2.7874564459930316, "grad_norm": 22.23700523376465, "learning_rate": 3.9341954322661385e-06, "loss": 2.5335, "step": 39600 }, { "epoch": 2.7944954774223065, "grad_norm": 17.91628074645996, "learning_rate": 3.803837730733132e-06, "loss": 2.5757, "step": 39700 }, { "epoch": 2.801534508851582, "grad_norm": 16.670568466186523, "learning_rate": 3.6734800292001254e-06, "loss": 2.5679, "step": 39800 }, { "epoch": 2.8085735402808574, "grad_norm": 17.128202438354492, "learning_rate": 3.5431223276671187e-06, "loss": 2.5285, "step": 39900 }, { "epoch": 2.815612571710133, "grad_norm": 14.024889945983887, "learning_rate": 3.412764626134112e-06, "loss": 2.515, "step": 40000 }, { "epoch": 2.815612571710133, "eval_runtime": 192.6175, "eval_samples_per_second": 147.51, "eval_steps_per_second": 18.441, "step": 40000 }, { "epoch": 2.822651603139408, "grad_norm": 16.694087982177734, "learning_rate": 3.2824069246011056e-06, "loss": 2.625, "step": 40100 }, { "epoch": 2.829690634568683, "grad_norm": 22.633140563964844, "learning_rate": 3.1520492230680985e-06, "loss": 2.5637, "step": 40200 }, { "epoch": 2.8367296659979586, "grad_norm": 18.231454849243164, "learning_rate": 3.021691521535092e-06, "loss": 2.5672, "step": 40300 }, { "epoch": 2.843768697427234, "grad_norm": 15.228378295898438, "learning_rate": 2.891333820002086e-06, "loss": 2.6203, "step": 40400 }, { "epoch": 2.8508077288565095, "grad_norm": 19.437833786010742, "learning_rate": 2.760976118469079e-06, "loss": 2.626, "step": 40500 }, { "epoch": 2.8508077288565095, "eval_runtime": 192.5589, "eval_samples_per_second": 147.555, "eval_steps_per_second": 18.446, "step": 40500 }, { "epoch": 2.857846760285785, "grad_norm": 16.506317138671875, "learning_rate": 2.630618416936073e-06, "loss": 2.5625, "step": 40600 }, { "epoch": 2.86488579171506, "grad_norm": 25.00144386291504, "learning_rate": 2.500260715403066e-06, "loss": 2.616, "step": 40700 }, { "epoch": 2.8719248231443353, "grad_norm": 16.7978572845459, "learning_rate": 2.3699030138700597e-06, "loss": 2.5959, "step": 40800 }, { "epoch": 2.8789638545736107, "grad_norm": 15.795037269592285, "learning_rate": 2.239545312337053e-06, "loss": 2.6245, "step": 40900 }, { "epoch": 2.886002886002886, "grad_norm": 16.411415100097656, "learning_rate": 2.1091876108040467e-06, "loss": 2.6368, "step": 41000 }, { "epoch": 2.886002886002886, "eval_runtime": 192.2421, "eval_samples_per_second": 147.798, "eval_steps_per_second": 18.477, "step": 41000 }, { "epoch": 2.8930419174321615, "grad_norm": 16.8485050201416, "learning_rate": 1.97882990927104e-06, "loss": 2.5946, "step": 41100 }, { "epoch": 2.9000809488614365, "grad_norm": 15.294781684875488, "learning_rate": 1.8484722077380334e-06, "loss": 2.6035, "step": 41200 }, { "epoch": 2.907119980290712, "grad_norm": 26.89401626586914, "learning_rate": 1.7181145062050267e-06, "loss": 2.6353, "step": 41300 }, { "epoch": 2.9141590117199874, "grad_norm": 13.004213333129883, "learning_rate": 1.58775680467202e-06, "loss": 2.6299, "step": 41400 }, { "epoch": 2.921198043149263, "grad_norm": 17.197162628173828, "learning_rate": 1.4573991031390136e-06, "loss": 2.6031, "step": 41500 }, { "epoch": 2.921198043149263, "eval_runtime": 192.0882, "eval_samples_per_second": 147.916, "eval_steps_per_second": 18.492, "step": 41500 }, { "epoch": 2.928237074578538, "grad_norm": 16.04857063293457, "learning_rate": 1.327041401606007e-06, "loss": 2.5869, "step": 41600 }, { "epoch": 2.935276106007813, "grad_norm": 14.147359848022461, "learning_rate": 1.1966837000730005e-06, "loss": 2.5851, "step": 41700 }, { "epoch": 2.9423151374370886, "grad_norm": 17.802715301513672, "learning_rate": 1.0663259985399938e-06, "loss": 2.4637, "step": 41800 }, { "epoch": 2.949354168866364, "grad_norm": 20.130615234375, "learning_rate": 9.359682970069872e-07, "loss": 2.5418, "step": 41900 }, { "epoch": 2.9563932002956395, "grad_norm": 18.158117294311523, "learning_rate": 8.056105954739805e-07, "loss": 2.5456, "step": 42000 }, { "epoch": 2.9563932002956395, "eval_runtime": 192.2432, "eval_samples_per_second": 147.797, "eval_steps_per_second": 18.477, "step": 42000 }, { "epoch": 2.963432231724915, "grad_norm": 15.710502624511719, "learning_rate": 6.75252893940974e-07, "loss": 2.6082, "step": 42100 }, { "epoch": 2.97047126315419, "grad_norm": 15.316740989685059, "learning_rate": 5.448951924079675e-07, "loss": 2.5569, "step": 42200 }, { "epoch": 2.9775102945834653, "grad_norm": 17.120691299438477, "learning_rate": 4.145374908749609e-07, "loss": 2.6017, "step": 42300 }, { "epoch": 2.9845493260127407, "grad_norm": 14.475923538208008, "learning_rate": 2.841797893419543e-07, "loss": 2.5627, "step": 42400 }, { "epoch": 2.991588357442016, "grad_norm": 16.937416076660156, "learning_rate": 1.5382208780894776e-07, "loss": 2.557, "step": 42500 }, { "epoch": 2.991588357442016, "eval_runtime": 192.9822, "eval_samples_per_second": 147.231, "eval_steps_per_second": 18.406, "step": 42500 } ], "logging_steps": 100, "max_steps": 42618, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.112230945566618e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }