| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.999894414528561, | |
| "eval_steps": 500, | |
| "global_step": 42618, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.007039031429275332, | |
| "grad_norm": 36.44467544555664, | |
| "learning_rate": 1.1731581417175035e-06, | |
| "loss": 6.2071, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.014078062858550663, | |
| "grad_norm": 43.126609802246094, | |
| "learning_rate": 2.346316283435007e-06, | |
| "loss": 5.9231, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.021117094287825995, | |
| "grad_norm": 29.171634674072266, | |
| "learning_rate": 3.5194744251525106e-06, | |
| "loss": 5.4145, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.028156125717101327, | |
| "grad_norm": 38.217105865478516, | |
| "learning_rate": 4.692632566870014e-06, | |
| "loss": 4.9149, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.03519515714637666, | |
| "grad_norm": 35.40254211425781, | |
| "learning_rate": 5.865790708587518e-06, | |
| "loss": 4.5052, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.03519515714637666, | |
| "eval_runtime": 191.3754, | |
| "eval_samples_per_second": 148.467, | |
| "eval_steps_per_second": 18.56, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.04223418857565199, | |
| "grad_norm": 36.732643127441406, | |
| "learning_rate": 7.038948850305021e-06, | |
| "loss": 4.3715, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 0.04927322000492732, | |
| "grad_norm": 36.27021408081055, | |
| "learning_rate": 8.212106992022525e-06, | |
| "loss": 4.3269, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 0.056312251434202654, | |
| "grad_norm": 42.45858383178711, | |
| "learning_rate": 9.385265133740028e-06, | |
| "loss": 4.0589, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 0.06335128286347799, | |
| "grad_norm": 42.08483123779297, | |
| "learning_rate": 1.0558423275457532e-05, | |
| "loss": 4.1336, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 0.07039031429275332, | |
| "grad_norm": 42.23253631591797, | |
| "learning_rate": 1.1731581417175035e-05, | |
| "loss": 4.0719, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.07039031429275332, | |
| "eval_runtime": 193.8033, | |
| "eval_samples_per_second": 146.607, | |
| "eval_steps_per_second": 18.328, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.07742934572202866, | |
| "grad_norm": 48.905662536621094, | |
| "learning_rate": 1.2904739558892539e-05, | |
| "loss": 3.8613, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 0.08446837715130398, | |
| "grad_norm": 37.9277458190918, | |
| "learning_rate": 1.4077897700610042e-05, | |
| "loss": 3.8424, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 0.09150740858057932, | |
| "grad_norm": 48.82701110839844, | |
| "learning_rate": 1.5251055842327546e-05, | |
| "loss": 3.7771, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 0.09854644000985464, | |
| "grad_norm": 33.38028335571289, | |
| "learning_rate": 1.642421398404505e-05, | |
| "loss": 3.8094, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 0.10558547143912998, | |
| "grad_norm": 61.35352325439453, | |
| "learning_rate": 1.7597372125762555e-05, | |
| "loss": 3.8331, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.10558547143912998, | |
| "eval_runtime": 191.8384, | |
| "eval_samples_per_second": 148.109, | |
| "eval_steps_per_second": 18.516, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 0.11262450286840531, | |
| "grad_norm": 46.74394226074219, | |
| "learning_rate": 1.8770530267480057e-05, | |
| "loss": 3.6822, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 0.11966353429768065, | |
| "grad_norm": 35.53325271606445, | |
| "learning_rate": 1.9943688409197562e-05, | |
| "loss": 3.6282, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 0.12670256572695598, | |
| "grad_norm": 37.73524856567383, | |
| "learning_rate": 2.1116846550915064e-05, | |
| "loss": 3.5722, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 0.1337415971562313, | |
| "grad_norm": 33.76814651489258, | |
| "learning_rate": 2.229000469263257e-05, | |
| "loss": 3.6086, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 0.14078062858550663, | |
| "grad_norm": 41.888282775878906, | |
| "learning_rate": 2.346316283435007e-05, | |
| "loss": 3.6142, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.14078062858550663, | |
| "eval_runtime": 191.5815, | |
| "eval_samples_per_second": 148.308, | |
| "eval_steps_per_second": 18.54, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.14781966001478197, | |
| "grad_norm": 39.62664031982422, | |
| "learning_rate": 2.4636320976067576e-05, | |
| "loss": 3.6029, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 0.1548586914440573, | |
| "grad_norm": 38.377532958984375, | |
| "learning_rate": 2.5809479117785078e-05, | |
| "loss": 3.4959, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 0.16189772287333262, | |
| "grad_norm": 32.66987991333008, | |
| "learning_rate": 2.698263725950258e-05, | |
| "loss": 3.5252, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 0.16893675430260796, | |
| "grad_norm": 39.213592529296875, | |
| "learning_rate": 2.8155795401220085e-05, | |
| "loss": 3.5859, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 0.1759757857318833, | |
| "grad_norm": 31.646276473999023, | |
| "learning_rate": 2.9328953542937587e-05, | |
| "loss": 3.4995, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.1759757857318833, | |
| "eval_runtime": 194.3308, | |
| "eval_samples_per_second": 146.209, | |
| "eval_steps_per_second": 18.278, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 0.18301481716115864, | |
| "grad_norm": 32.30677032470703, | |
| "learning_rate": 3.0502111684655092e-05, | |
| "loss": 3.5853, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 0.19005384859043395, | |
| "grad_norm": 31.175769805908203, | |
| "learning_rate": 3.1675269826372594e-05, | |
| "loss": 3.5134, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 0.1970928800197093, | |
| "grad_norm": 31.389162063598633, | |
| "learning_rate": 3.28484279680901e-05, | |
| "loss": 3.4909, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 0.20413191144898463, | |
| "grad_norm": 33.105369567871094, | |
| "learning_rate": 3.4021586109807604e-05, | |
| "loss": 3.4099, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 0.21117094287825997, | |
| "grad_norm": 25.477977752685547, | |
| "learning_rate": 3.519474425152511e-05, | |
| "loss": 3.3823, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.21117094287825997, | |
| "eval_runtime": 194.3983, | |
| "eval_samples_per_second": 146.159, | |
| "eval_steps_per_second": 18.272, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.21820997430753528, | |
| "grad_norm": 29.61454200744629, | |
| "learning_rate": 3.636790239324261e-05, | |
| "loss": 3.3476, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 0.22524900573681061, | |
| "grad_norm": 26.82366180419922, | |
| "learning_rate": 3.754106053496011e-05, | |
| "loss": 3.389, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 0.23228803716608595, | |
| "grad_norm": 26.6168155670166, | |
| "learning_rate": 3.871421867667762e-05, | |
| "loss": 3.3712, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 0.2393270685953613, | |
| "grad_norm": 24.504793167114258, | |
| "learning_rate": 3.9887376818395124e-05, | |
| "loss": 3.2693, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 0.2463661000246366, | |
| "grad_norm": 22.34451675415039, | |
| "learning_rate": 4.106053496011262e-05, | |
| "loss": 3.3719, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.2463661000246366, | |
| "eval_runtime": 192.2522, | |
| "eval_samples_per_second": 147.79, | |
| "eval_steps_per_second": 18.476, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 0.25340513145391197, | |
| "grad_norm": 30.370140075683594, | |
| "learning_rate": 4.223369310183013e-05, | |
| "loss": 3.3216, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 0.2604441628831873, | |
| "grad_norm": 29.111398696899414, | |
| "learning_rate": 4.340685124354763e-05, | |
| "loss": 3.3085, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 0.2674831943124626, | |
| "grad_norm": 29.50999641418457, | |
| "learning_rate": 4.458000938526514e-05, | |
| "loss": 3.2907, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 0.27452222574173796, | |
| "grad_norm": 21.999244689941406, | |
| "learning_rate": 4.5753167526982636e-05, | |
| "loss": 3.2173, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 0.28156125717101327, | |
| "grad_norm": 28.0905818939209, | |
| "learning_rate": 4.692632566870014e-05, | |
| "loss": 3.3431, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.28156125717101327, | |
| "eval_runtime": 192.489, | |
| "eval_samples_per_second": 147.608, | |
| "eval_steps_per_second": 18.453, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.2886002886002886, | |
| "grad_norm": 27.252222061157227, | |
| "learning_rate": 4.809948381041765e-05, | |
| "loss": 3.4265, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 0.29563932002956395, | |
| "grad_norm": 20.001508712768555, | |
| "learning_rate": 4.927264195213515e-05, | |
| "loss": 3.2489, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 0.30267835145883926, | |
| "grad_norm": 24.947546005249023, | |
| "learning_rate": 4.995046407341746e-05, | |
| "loss": 3.2957, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 0.3097173828881146, | |
| "grad_norm": 18.58955192565918, | |
| "learning_rate": 4.982010637188445e-05, | |
| "loss": 3.2328, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 0.31675641431738993, | |
| "grad_norm": 22.946285247802734, | |
| "learning_rate": 4.968974867035145e-05, | |
| "loss": 3.177, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.31675641431738993, | |
| "eval_runtime": 192.63, | |
| "eval_samples_per_second": 147.5, | |
| "eval_steps_per_second": 18.439, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 0.32379544574666524, | |
| "grad_norm": 20.17714500427246, | |
| "learning_rate": 4.955939096881844e-05, | |
| "loss": 3.3241, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 0.3308344771759406, | |
| "grad_norm": 18.580751419067383, | |
| "learning_rate": 4.9429033267285435e-05, | |
| "loss": 3.3084, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 0.3378735086052159, | |
| "grad_norm": 16.068750381469727, | |
| "learning_rate": 4.929867556575243e-05, | |
| "loss": 3.1674, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 0.34491254003449123, | |
| "grad_norm": 23.636520385742188, | |
| "learning_rate": 4.916831786421942e-05, | |
| "loss": 3.2698, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 0.3519515714637666, | |
| "grad_norm": 19.445907592773438, | |
| "learning_rate": 4.903796016268641e-05, | |
| "loss": 3.2824, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.3519515714637666, | |
| "eval_runtime": 192.779, | |
| "eval_samples_per_second": 147.386, | |
| "eval_steps_per_second": 18.425, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.3589906028930419, | |
| "grad_norm": 20.217737197875977, | |
| "learning_rate": 4.8907602461153405e-05, | |
| "loss": 3.3673, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 0.3660296343223173, | |
| "grad_norm": 16.03109359741211, | |
| "learning_rate": 4.87772447596204e-05, | |
| "loss": 3.3105, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 0.3730686657515926, | |
| "grad_norm": 21.388507843017578, | |
| "learning_rate": 4.864688705808739e-05, | |
| "loss": 3.1998, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 0.3801076971808679, | |
| "grad_norm": 16.931922912597656, | |
| "learning_rate": 4.851652935655439e-05, | |
| "loss": 3.2054, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 0.38714672861014326, | |
| "grad_norm": 24.145727157592773, | |
| "learning_rate": 4.838617165502138e-05, | |
| "loss": 3.0802, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.38714672861014326, | |
| "eval_runtime": 192.7089, | |
| "eval_samples_per_second": 147.44, | |
| "eval_steps_per_second": 18.432, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 0.3941857600394186, | |
| "grad_norm": 24.199636459350586, | |
| "learning_rate": 4.8255813953488375e-05, | |
| "loss": 3.0688, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 0.4012247914686939, | |
| "grad_norm": 18.878751754760742, | |
| "learning_rate": 4.812545625195537e-05, | |
| "loss": 3.1667, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 0.40826382289796925, | |
| "grad_norm": 17.39369010925293, | |
| "learning_rate": 4.7995098550422363e-05, | |
| "loss": 3.1905, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 0.41530285432724456, | |
| "grad_norm": 19.778793334960938, | |
| "learning_rate": 4.786474084888936e-05, | |
| "loss": 3.0883, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 0.42234188575651993, | |
| "grad_norm": 14.80075454711914, | |
| "learning_rate": 4.773438314735635e-05, | |
| "loss": 3.0839, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.42234188575651993, | |
| "eval_runtime": 192.7353, | |
| "eval_samples_per_second": 147.42, | |
| "eval_steps_per_second": 18.429, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.42938091718579524, | |
| "grad_norm": 27.43608856201172, | |
| "learning_rate": 4.760402544582334e-05, | |
| "loss": 3.0621, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 0.43641994861507055, | |
| "grad_norm": 18.219221115112305, | |
| "learning_rate": 4.747366774429033e-05, | |
| "loss": 3.1461, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 0.4434589800443459, | |
| "grad_norm": 17.97977638244629, | |
| "learning_rate": 4.734331004275733e-05, | |
| "loss": 3.0795, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 0.45049801147362123, | |
| "grad_norm": 21.358592987060547, | |
| "learning_rate": 4.721295234122432e-05, | |
| "loss": 3.1361, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 0.45753704290289654, | |
| "grad_norm": 15.679008483886719, | |
| "learning_rate": 4.7082594639691315e-05, | |
| "loss": 3.0751, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.45753704290289654, | |
| "eval_runtime": 193.5839, | |
| "eval_samples_per_second": 146.774, | |
| "eval_steps_per_second": 18.349, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 0.4645760743321719, | |
| "grad_norm": 14.288241386413574, | |
| "learning_rate": 4.69522369381583e-05, | |
| "loss": 3.0886, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 0.4716151057614472, | |
| "grad_norm": 15.016201972961426, | |
| "learning_rate": 4.6821879236625304e-05, | |
| "loss": 2.973, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 0.4786541371907226, | |
| "grad_norm": 20.513479232788086, | |
| "learning_rate": 4.66915215350923e-05, | |
| "loss": 3.1001, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 0.4856931686199979, | |
| "grad_norm": 15.093891143798828, | |
| "learning_rate": 4.656116383355929e-05, | |
| "loss": 3.1073, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 0.4927322000492732, | |
| "grad_norm": 15.074331283569336, | |
| "learning_rate": 4.6430806132026286e-05, | |
| "loss": 3.1247, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.4927322000492732, | |
| "eval_runtime": 193.2211, | |
| "eval_samples_per_second": 147.049, | |
| "eval_steps_per_second": 18.383, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.49977123147854857, | |
| "grad_norm": 19.936674118041992, | |
| "learning_rate": 4.630044843049327e-05, | |
| "loss": 3.0331, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 0.5068102629078239, | |
| "grad_norm": 22.168909072875977, | |
| "learning_rate": 4.617009072896027e-05, | |
| "loss": 3.0611, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 0.5138492943370992, | |
| "grad_norm": 19.541671752929688, | |
| "learning_rate": 4.603973302742726e-05, | |
| "loss": 2.9378, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 0.5208883257663746, | |
| "grad_norm": 16.111820220947266, | |
| "learning_rate": 4.5909375325894256e-05, | |
| "loss": 3.0461, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 0.5279273571956499, | |
| "grad_norm": 18.88793182373047, | |
| "learning_rate": 4.577901762436125e-05, | |
| "loss": 3.0183, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.5279273571956499, | |
| "eval_runtime": 191.8152, | |
| "eval_samples_per_second": 148.127, | |
| "eval_steps_per_second": 18.518, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 0.5349663886249252, | |
| "grad_norm": 15.234626770019531, | |
| "learning_rate": 4.5648659922828244e-05, | |
| "loss": 3.0287, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 0.5420054200542005, | |
| "grad_norm": 16.836956024169922, | |
| "learning_rate": 4.551830222129523e-05, | |
| "loss": 2.9991, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 0.5490444514834759, | |
| "grad_norm": 20.984453201293945, | |
| "learning_rate": 4.538794451976223e-05, | |
| "loss": 3.0765, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 0.5560834829127512, | |
| "grad_norm": 18.113004684448242, | |
| "learning_rate": 4.5257586818229226e-05, | |
| "loss": 2.9704, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 0.5631225143420265, | |
| "grad_norm": 20.654579162597656, | |
| "learning_rate": 4.512722911669622e-05, | |
| "loss": 3.0604, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.5631225143420265, | |
| "eval_runtime": 193.8004, | |
| "eval_samples_per_second": 146.61, | |
| "eval_steps_per_second": 18.328, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.5701615457713018, | |
| "grad_norm": 18.815162658691406, | |
| "learning_rate": 4.499687141516321e-05, | |
| "loss": 2.8726, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 0.5772005772005772, | |
| "grad_norm": 15.366965293884277, | |
| "learning_rate": 4.48665137136302e-05, | |
| "loss": 3.0509, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 0.5842396086298526, | |
| "grad_norm": 17.203136444091797, | |
| "learning_rate": 4.4736156012097196e-05, | |
| "loss": 2.9718, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 0.5912786400591279, | |
| "grad_norm": 15.31092643737793, | |
| "learning_rate": 4.460579831056419e-05, | |
| "loss": 3.0382, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 0.5983176714884032, | |
| "grad_norm": 12.381194114685059, | |
| "learning_rate": 4.4475440609031184e-05, | |
| "loss": 2.9945, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.5983176714884032, | |
| "eval_runtime": 192.0001, | |
| "eval_samples_per_second": 147.984, | |
| "eval_steps_per_second": 18.5, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 0.6053567029176785, | |
| "grad_norm": 13.362293243408203, | |
| "learning_rate": 4.434508290749818e-05, | |
| "loss": 3.0427, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 0.6123957343469538, | |
| "grad_norm": 18.495370864868164, | |
| "learning_rate": 4.4214725205965165e-05, | |
| "loss": 3.0363, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 0.6194347657762292, | |
| "grad_norm": 16.98731803894043, | |
| "learning_rate": 4.408436750443216e-05, | |
| "loss": 2.9257, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 0.6264737972055046, | |
| "grad_norm": 18.510801315307617, | |
| "learning_rate": 4.395400980289916e-05, | |
| "loss": 3.0014, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 0.6335128286347799, | |
| "grad_norm": 15.024514198303223, | |
| "learning_rate": 4.3823652101366155e-05, | |
| "loss": 3.0435, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.6335128286347799, | |
| "eval_runtime": 192.5991, | |
| "eval_samples_per_second": 147.524, | |
| "eval_steps_per_second": 18.442, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.6405518600640552, | |
| "grad_norm": 19.763547897338867, | |
| "learning_rate": 4.369329439983315e-05, | |
| "loss": 2.9219, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 0.6475908914933305, | |
| "grad_norm": 19.536087036132812, | |
| "learning_rate": 4.3562936698300136e-05, | |
| "loss": 2.9749, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 0.6546299229226058, | |
| "grad_norm": 15.909920692443848, | |
| "learning_rate": 4.343257899676713e-05, | |
| "loss": 2.8793, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 0.6616689543518812, | |
| "grad_norm": 17.25068473815918, | |
| "learning_rate": 4.3302221295234124e-05, | |
| "loss": 2.9669, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 0.6687079857811565, | |
| "grad_norm": 14.32239055633545, | |
| "learning_rate": 4.317186359370112e-05, | |
| "loss": 2.9066, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.6687079857811565, | |
| "eval_runtime": 191.7836, | |
| "eval_samples_per_second": 148.151, | |
| "eval_steps_per_second": 18.521, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 0.6757470172104318, | |
| "grad_norm": 22.611879348754883, | |
| "learning_rate": 4.304150589216811e-05, | |
| "loss": 2.8333, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 0.6827860486397072, | |
| "grad_norm": 15.549399375915527, | |
| "learning_rate": 4.2911148190635107e-05, | |
| "loss": 2.9913, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 0.6898250800689825, | |
| "grad_norm": 13.915739059448242, | |
| "learning_rate": 4.2780790489102094e-05, | |
| "loss": 2.9524, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 0.6968641114982579, | |
| "grad_norm": 17.621822357177734, | |
| "learning_rate": 4.265043278756909e-05, | |
| "loss": 2.9844, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.7039031429275332, | |
| "grad_norm": 22.748342514038086, | |
| "learning_rate": 4.252007508603609e-05, | |
| "loss": 2.9083, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.7039031429275332, | |
| "eval_runtime": 193.6184, | |
| "eval_samples_per_second": 146.747, | |
| "eval_steps_per_second": 18.345, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.7109421743568085, | |
| "grad_norm": 18.526578903198242, | |
| "learning_rate": 4.238971738450308e-05, | |
| "loss": 2.9706, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.7179812057860838, | |
| "grad_norm": 15.676709175109863, | |
| "learning_rate": 4.225935968297007e-05, | |
| "loss": 2.8265, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.7250202372153591, | |
| "grad_norm": 18.21067237854004, | |
| "learning_rate": 4.2129001981437064e-05, | |
| "loss": 2.8592, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.7320592686446346, | |
| "grad_norm": 22.188024520874023, | |
| "learning_rate": 4.199864427990406e-05, | |
| "loss": 2.8158, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.7390983000739099, | |
| "grad_norm": 18.406801223754883, | |
| "learning_rate": 4.186828657837105e-05, | |
| "loss": 2.9264, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.7390983000739099, | |
| "eval_runtime": 193.9371, | |
| "eval_samples_per_second": 146.506, | |
| "eval_steps_per_second": 18.315, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.7461373315031852, | |
| "grad_norm": 20.65268325805664, | |
| "learning_rate": 4.173792887683805e-05, | |
| "loss": 2.9882, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.7531763629324605, | |
| "grad_norm": 12.223052024841309, | |
| "learning_rate": 4.160757117530504e-05, | |
| "loss": 2.8969, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.7602153943617358, | |
| "grad_norm": 18.218887329101562, | |
| "learning_rate": 4.147721347377203e-05, | |
| "loss": 2.8254, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.7672544257910111, | |
| "grad_norm": 19.009950637817383, | |
| "learning_rate": 4.134685577223902e-05, | |
| "loss": 2.9361, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.7742934572202865, | |
| "grad_norm": 33.881927490234375, | |
| "learning_rate": 4.1216498070706016e-05, | |
| "loss": 2.8528, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.7742934572202865, | |
| "eval_runtime": 194.2555, | |
| "eval_samples_per_second": 146.266, | |
| "eval_steps_per_second": 18.285, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.7813324886495618, | |
| "grad_norm": 19.02928924560547, | |
| "learning_rate": 4.108614036917302e-05, | |
| "loss": 2.9383, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.7883715200788372, | |
| "grad_norm": 18.154483795166016, | |
| "learning_rate": 4.095578266764001e-05, | |
| "loss": 2.7691, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.7954105515081125, | |
| "grad_norm": 13.669476509094238, | |
| "learning_rate": 4.0825424966107e-05, | |
| "loss": 2.8306, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.8024495829373878, | |
| "grad_norm": 16.23528289794922, | |
| "learning_rate": 4.069506726457399e-05, | |
| "loss": 2.8588, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.8094886143666632, | |
| "grad_norm": 16.63111686706543, | |
| "learning_rate": 4.056470956304099e-05, | |
| "loss": 2.91, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.8094886143666632, | |
| "eval_runtime": 193.9814, | |
| "eval_samples_per_second": 146.473, | |
| "eval_steps_per_second": 18.311, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.8165276457959385, | |
| "grad_norm": 19.989736557006836, | |
| "learning_rate": 4.043435186150798e-05, | |
| "loss": 2.8754, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.8235666772252138, | |
| "grad_norm": 15.608553886413574, | |
| "learning_rate": 4.0303994159974975e-05, | |
| "loss": 2.8896, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.8306057086544891, | |
| "grad_norm": 17.236600875854492, | |
| "learning_rate": 4.017363645844196e-05, | |
| "loss": 2.8897, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.8376447400837644, | |
| "grad_norm": 16.03377342224121, | |
| "learning_rate": 4.0043278756908957e-05, | |
| "loss": 2.8936, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.8446837715130399, | |
| "grad_norm": 25.3082332611084, | |
| "learning_rate": 3.991292105537595e-05, | |
| "loss": 2.8939, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.8446837715130399, | |
| "eval_runtime": 192.6986, | |
| "eval_samples_per_second": 147.448, | |
| "eval_steps_per_second": 18.433, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.8517228029423152, | |
| "grad_norm": 18.766387939453125, | |
| "learning_rate": 3.9782563353842945e-05, | |
| "loss": 2.9448, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.8587618343715905, | |
| "grad_norm": 17.019485473632812, | |
| "learning_rate": 3.9652205652309946e-05, | |
| "loss": 2.7899, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.8658008658008658, | |
| "grad_norm": 15.821990966796875, | |
| "learning_rate": 3.952184795077693e-05, | |
| "loss": 2.8069, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.8728398972301411, | |
| "grad_norm": 19.532939910888672, | |
| "learning_rate": 3.939149024924393e-05, | |
| "loss": 2.8797, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.8798789286594164, | |
| "grad_norm": 15.134819030761719, | |
| "learning_rate": 3.926113254771092e-05, | |
| "loss": 2.89, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.8798789286594164, | |
| "eval_runtime": 191.6112, | |
| "eval_samples_per_second": 148.285, | |
| "eval_steps_per_second": 18.538, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.8869179600886918, | |
| "grad_norm": 15.088654518127441, | |
| "learning_rate": 3.9130774846177915e-05, | |
| "loss": 2.8519, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.8939569915179671, | |
| "grad_norm": 16.8511962890625, | |
| "learning_rate": 3.900041714464491e-05, | |
| "loss": 2.7864, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.9009960229472425, | |
| "grad_norm": 19.323467254638672, | |
| "learning_rate": 3.8870059443111903e-05, | |
| "loss": 2.9093, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.9080350543765178, | |
| "grad_norm": 16.42205810546875, | |
| "learning_rate": 3.873970174157889e-05, | |
| "loss": 2.8426, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.9150740858057931, | |
| "grad_norm": 16.470041275024414, | |
| "learning_rate": 3.8609344040045885e-05, | |
| "loss": 2.9097, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.9150740858057931, | |
| "eval_runtime": 192.3033, | |
| "eval_samples_per_second": 147.751, | |
| "eval_steps_per_second": 18.471, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.9221131172350685, | |
| "grad_norm": 15.83154296875, | |
| "learning_rate": 3.847898633851288e-05, | |
| "loss": 2.8286, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.9291521486643438, | |
| "grad_norm": 14.70117473602295, | |
| "learning_rate": 3.834862863697987e-05, | |
| "loss": 2.7575, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.9361911800936191, | |
| "grad_norm": 12.124509811401367, | |
| "learning_rate": 3.821827093544687e-05, | |
| "loss": 2.8621, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.9432302115228944, | |
| "grad_norm": 16.292022705078125, | |
| "learning_rate": 3.808791323391386e-05, | |
| "loss": 2.8619, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.9502692429521697, | |
| "grad_norm": 18.999359130859375, | |
| "learning_rate": 3.7957555532380855e-05, | |
| "loss": 2.8396, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.9502692429521697, | |
| "eval_runtime": 192.4881, | |
| "eval_samples_per_second": 147.609, | |
| "eval_steps_per_second": 18.453, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.9573082743814452, | |
| "grad_norm": 17.78417205810547, | |
| "learning_rate": 3.782719783084785e-05, | |
| "loss": 2.842, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.9643473058107205, | |
| "grad_norm": 16.267335891723633, | |
| "learning_rate": 3.7696840129314844e-05, | |
| "loss": 2.8373, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.9713863372399958, | |
| "grad_norm": 15.711287498474121, | |
| "learning_rate": 3.756648242778184e-05, | |
| "loss": 2.9, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.9784253686692711, | |
| "grad_norm": 26.957563400268555, | |
| "learning_rate": 3.7436124726248825e-05, | |
| "loss": 2.9499, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.9854644000985464, | |
| "grad_norm": 11.875740051269531, | |
| "learning_rate": 3.730576702471582e-05, | |
| "loss": 2.7061, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.9854644000985464, | |
| "eval_runtime": 192.2723, | |
| "eval_samples_per_second": 147.775, | |
| "eval_steps_per_second": 18.474, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.9925034315278217, | |
| "grad_norm": 21.315786361694336, | |
| "learning_rate": 3.717540932318281e-05, | |
| "loss": 2.8303, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.9995424629570971, | |
| "grad_norm": 13.713945388793945, | |
| "learning_rate": 3.704505162164981e-05, | |
| "loss": 2.7378, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 1.0065814943863725, | |
| "grad_norm": 16.48957633972168, | |
| "learning_rate": 3.69146939201168e-05, | |
| "loss": 2.7313, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 1.0136205258156479, | |
| "grad_norm": 19.689464569091797, | |
| "learning_rate": 3.6784336218583796e-05, | |
| "loss": 2.7612, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 1.020659557244923, | |
| "grad_norm": 19.848342895507812, | |
| "learning_rate": 3.665397851705079e-05, | |
| "loss": 2.8205, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.020659557244923, | |
| "eval_runtime": 193.2356, | |
| "eval_samples_per_second": 147.038, | |
| "eval_steps_per_second": 18.382, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 1.0276985886741985, | |
| "grad_norm": 23.994319915771484, | |
| "learning_rate": 3.6523620815517784e-05, | |
| "loss": 2.7362, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 1.0347376201034737, | |
| "grad_norm": 16.060455322265625, | |
| "learning_rate": 3.639326311398478e-05, | |
| "loss": 2.8076, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 1.0417766515327491, | |
| "grad_norm": 17.874704360961914, | |
| "learning_rate": 3.626290541245177e-05, | |
| "loss": 2.6903, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 1.0488156829620243, | |
| "grad_norm": 15.949551582336426, | |
| "learning_rate": 3.613254771091876e-05, | |
| "loss": 2.7765, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 1.0558547143912997, | |
| "grad_norm": 17.344772338867188, | |
| "learning_rate": 3.6002190009385754e-05, | |
| "loss": 2.9004, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.0558547143912997, | |
| "eval_runtime": 194.9228, | |
| "eval_samples_per_second": 145.765, | |
| "eval_steps_per_second": 18.223, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 1.0628937458205752, | |
| "grad_norm": 13.621015548706055, | |
| "learning_rate": 3.587183230785275e-05, | |
| "loss": 2.7808, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 1.0699327772498504, | |
| "grad_norm": 21.069551467895508, | |
| "learning_rate": 3.574147460631974e-05, | |
| "loss": 2.7421, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 1.0769718086791258, | |
| "grad_norm": 15.854650497436523, | |
| "learning_rate": 3.5611116904786736e-05, | |
| "loss": 2.7086, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 1.084010840108401, | |
| "grad_norm": 19.217153549194336, | |
| "learning_rate": 3.548075920325373e-05, | |
| "loss": 2.7957, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 1.0910498715376764, | |
| "grad_norm": 20.781291961669922, | |
| "learning_rate": 3.5350401501720724e-05, | |
| "loss": 2.8039, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.0910498715376764, | |
| "eval_runtime": 192.3565, | |
| "eval_samples_per_second": 147.71, | |
| "eval_steps_per_second": 18.466, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 1.0980889029669518, | |
| "grad_norm": 15.170364379882812, | |
| "learning_rate": 3.522004380018772e-05, | |
| "loss": 2.7992, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 1.105127934396227, | |
| "grad_norm": 18.8775634765625, | |
| "learning_rate": 3.508968609865471e-05, | |
| "loss": 2.7919, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 1.1121669658255025, | |
| "grad_norm": 19.009754180908203, | |
| "learning_rate": 3.4959328397121706e-05, | |
| "loss": 2.7361, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 1.1192059972547777, | |
| "grad_norm": 14.632086753845215, | |
| "learning_rate": 3.48289706955887e-05, | |
| "loss": 2.8233, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 1.126245028684053, | |
| "grad_norm": 20.006601333618164, | |
| "learning_rate": 3.469861299405569e-05, | |
| "loss": 2.738, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.126245028684053, | |
| "eval_runtime": 194.9892, | |
| "eval_samples_per_second": 145.716, | |
| "eval_steps_per_second": 18.216, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 1.1332840601133285, | |
| "grad_norm": 14.096820831298828, | |
| "learning_rate": 3.456825529252268e-05, | |
| "loss": 2.8094, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 1.1403230915426037, | |
| "grad_norm": 17.261428833007812, | |
| "learning_rate": 3.4437897590989676e-05, | |
| "loss": 2.8108, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 1.1473621229718791, | |
| "grad_norm": 14.972962379455566, | |
| "learning_rate": 3.430753988945667e-05, | |
| "loss": 2.8055, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 1.1544011544011543, | |
| "grad_norm": 14.345026969909668, | |
| "learning_rate": 3.4177182187923664e-05, | |
| "loss": 2.7363, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 1.1614401858304297, | |
| "grad_norm": 15.4429292678833, | |
| "learning_rate": 3.404682448639065e-05, | |
| "loss": 2.8642, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.1614401858304297, | |
| "eval_runtime": 194.218, | |
| "eval_samples_per_second": 146.294, | |
| "eval_steps_per_second": 18.289, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 1.1684792172597052, | |
| "grad_norm": 13.637730598449707, | |
| "learning_rate": 3.391646678485765e-05, | |
| "loss": 2.6802, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 1.1755182486889804, | |
| "grad_norm": 16.29159927368164, | |
| "learning_rate": 3.3786109083324647e-05, | |
| "loss": 2.7796, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 1.1825572801182558, | |
| "grad_norm": 16.733455657958984, | |
| "learning_rate": 3.365575138179164e-05, | |
| "loss": 2.8216, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 1.189596311547531, | |
| "grad_norm": 20.382347106933594, | |
| "learning_rate": 3.3525393680258635e-05, | |
| "loss": 2.6737, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 1.1966353429768064, | |
| "grad_norm": 18.506607055664062, | |
| "learning_rate": 3.339503597872562e-05, | |
| "loss": 2.7144, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.1966353429768064, | |
| "eval_runtime": 192.527, | |
| "eval_samples_per_second": 147.579, | |
| "eval_steps_per_second": 18.449, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 1.2036743744060816, | |
| "grad_norm": 16.080759048461914, | |
| "learning_rate": 3.3264678277192616e-05, | |
| "loss": 2.8013, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 1.210713405835357, | |
| "grad_norm": 16.123552322387695, | |
| "learning_rate": 3.313432057565961e-05, | |
| "loss": 2.7444, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 1.2177524372646324, | |
| "grad_norm": 16.878711700439453, | |
| "learning_rate": 3.3003962874126604e-05, | |
| "loss": 2.8241, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 1.2247914686939076, | |
| "grad_norm": 18.64569854736328, | |
| "learning_rate": 3.28736051725936e-05, | |
| "loss": 2.684, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 1.231830500123183, | |
| "grad_norm": 16.125022888183594, | |
| "learning_rate": 3.274324747106059e-05, | |
| "loss": 2.8633, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.231830500123183, | |
| "eval_runtime": 193.4861, | |
| "eval_samples_per_second": 146.848, | |
| "eval_steps_per_second": 18.358, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 1.2388695315524583, | |
| "grad_norm": 13.886027336120605, | |
| "learning_rate": 3.261288976952758e-05, | |
| "loss": 2.7536, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 1.2459085629817337, | |
| "grad_norm": 15.769869804382324, | |
| "learning_rate": 3.248253206799458e-05, | |
| "loss": 2.7889, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 1.2529475944110091, | |
| "grad_norm": 19.419034957885742, | |
| "learning_rate": 3.2352174366461575e-05, | |
| "loss": 2.7672, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 1.2599866258402843, | |
| "grad_norm": 18.742015838623047, | |
| "learning_rate": 3.222181666492857e-05, | |
| "loss": 2.7427, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 1.2670256572695597, | |
| "grad_norm": 18.40927505493164, | |
| "learning_rate": 3.209145896339556e-05, | |
| "loss": 2.7596, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.2670256572695597, | |
| "eval_runtime": 194.1066, | |
| "eval_samples_per_second": 146.378, | |
| "eval_steps_per_second": 18.299, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 1.274064688698835, | |
| "grad_norm": 21.27202033996582, | |
| "learning_rate": 3.196110126186255e-05, | |
| "loss": 2.7888, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 1.2811037201281104, | |
| "grad_norm": 13.953824043273926, | |
| "learning_rate": 3.1830743560329545e-05, | |
| "loss": 2.7347, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 1.2881427515573858, | |
| "grad_norm": 16.453821182250977, | |
| "learning_rate": 3.170038585879654e-05, | |
| "loss": 2.8821, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 1.295181782986661, | |
| "grad_norm": 16.67236328125, | |
| "learning_rate": 3.157002815726353e-05, | |
| "loss": 2.8431, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 1.3022208144159364, | |
| "grad_norm": 13.558029174804688, | |
| "learning_rate": 3.143967045573053e-05, | |
| "loss": 2.7499, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.3022208144159364, | |
| "eval_runtime": 192.4964, | |
| "eval_samples_per_second": 147.603, | |
| "eval_steps_per_second": 18.452, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 1.3092598458452116, | |
| "grad_norm": 15.34234619140625, | |
| "learning_rate": 3.1309312754197514e-05, | |
| "loss": 2.8225, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 1.316298877274487, | |
| "grad_norm": 14.304731369018555, | |
| "learning_rate": 3.117895505266451e-05, | |
| "loss": 2.8369, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 1.3233379087037624, | |
| "grad_norm": 17.345626831054688, | |
| "learning_rate": 3.104859735113151e-05, | |
| "loss": 2.6865, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 1.3303769401330376, | |
| "grad_norm": 16.954349517822266, | |
| "learning_rate": 3.09182396495985e-05, | |
| "loss": 2.741, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 1.337415971562313, | |
| "grad_norm": 17.250642776489258, | |
| "learning_rate": 3.07878819480655e-05, | |
| "loss": 2.8111, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.337415971562313, | |
| "eval_runtime": 192.7825, | |
| "eval_samples_per_second": 147.384, | |
| "eval_steps_per_second": 18.425, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 1.3444550029915883, | |
| "grad_norm": 17.004776000976562, | |
| "learning_rate": 3.0657524246532485e-05, | |
| "loss": 2.7594, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 1.3514940344208637, | |
| "grad_norm": 15.450813293457031, | |
| "learning_rate": 3.052716654499948e-05, | |
| "loss": 2.6522, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 1.358533065850139, | |
| "grad_norm": 15.58588981628418, | |
| "learning_rate": 3.0396808843466473e-05, | |
| "loss": 2.8346, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 1.3655720972794143, | |
| "grad_norm": 23.079944610595703, | |
| "learning_rate": 3.0266451141933467e-05, | |
| "loss": 2.829, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 1.3726111287086897, | |
| "grad_norm": 23.278108596801758, | |
| "learning_rate": 3.0136093440400458e-05, | |
| "loss": 2.809, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.3726111287086897, | |
| "eval_runtime": 191.7654, | |
| "eval_samples_per_second": 148.165, | |
| "eval_steps_per_second": 18.523, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 1.379650160137965, | |
| "grad_norm": 12.144103050231934, | |
| "learning_rate": 3.0005735738867452e-05, | |
| "loss": 2.5999, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 1.3866891915672404, | |
| "grad_norm": 18.378664016723633, | |
| "learning_rate": 2.9875378037334446e-05, | |
| "loss": 2.8226, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 1.3937282229965158, | |
| "grad_norm": 15.180033683776855, | |
| "learning_rate": 2.9745020335801437e-05, | |
| "loss": 2.7773, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 1.400767254425791, | |
| "grad_norm": 16.611019134521484, | |
| "learning_rate": 2.9614662634268438e-05, | |
| "loss": 2.7171, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 1.4078062858550664, | |
| "grad_norm": 14.491551399230957, | |
| "learning_rate": 2.948430493273543e-05, | |
| "loss": 2.7234, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.4078062858550664, | |
| "eval_runtime": 192.2568, | |
| "eval_samples_per_second": 147.787, | |
| "eval_steps_per_second": 18.475, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 1.4148453172843416, | |
| "grad_norm": 15.652689933776855, | |
| "learning_rate": 2.9353947231202422e-05, | |
| "loss": 2.6237, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 1.421884348713617, | |
| "grad_norm": 16.404693603515625, | |
| "learning_rate": 2.9223589529669417e-05, | |
| "loss": 2.7363, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 1.4289233801428924, | |
| "grad_norm": 13.620403289794922, | |
| "learning_rate": 2.9093231828136407e-05, | |
| "loss": 2.7651, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 1.4359624115721676, | |
| "grad_norm": 16.975452423095703, | |
| "learning_rate": 2.89628741266034e-05, | |
| "loss": 2.8431, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 1.443001443001443, | |
| "grad_norm": 16.957857131958008, | |
| "learning_rate": 2.8832516425070395e-05, | |
| "loss": 2.7442, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.443001443001443, | |
| "eval_runtime": 192.9203, | |
| "eval_samples_per_second": 147.278, | |
| "eval_steps_per_second": 18.412, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 1.4500404744307183, | |
| "grad_norm": 15.085665702819824, | |
| "learning_rate": 2.8702158723537386e-05, | |
| "loss": 2.6764, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 1.4570795058599937, | |
| "grad_norm": 15.870454788208008, | |
| "learning_rate": 2.857180102200438e-05, | |
| "loss": 2.6491, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 1.464118537289269, | |
| "grad_norm": 15.54505729675293, | |
| "learning_rate": 2.844144332047137e-05, | |
| "loss": 2.785, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 1.4711575687185443, | |
| "grad_norm": 16.786861419677734, | |
| "learning_rate": 2.8311085618938365e-05, | |
| "loss": 2.656, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 1.4781966001478197, | |
| "grad_norm": 17.03700828552246, | |
| "learning_rate": 2.8180727917405363e-05, | |
| "loss": 2.7337, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.4781966001478197, | |
| "eval_runtime": 194.0613, | |
| "eval_samples_per_second": 146.413, | |
| "eval_steps_per_second": 18.303, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 1.485235631577095, | |
| "grad_norm": 15.768280982971191, | |
| "learning_rate": 2.8050370215872357e-05, | |
| "loss": 2.7419, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 1.4922746630063703, | |
| "grad_norm": 16.70868682861328, | |
| "learning_rate": 2.792001251433935e-05, | |
| "loss": 2.7001, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 1.4993136944356458, | |
| "grad_norm": 15.003129005432129, | |
| "learning_rate": 2.778965481280634e-05, | |
| "loss": 2.6372, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 1.506352725864921, | |
| "grad_norm": 15.321432113647461, | |
| "learning_rate": 2.7659297111273336e-05, | |
| "loss": 2.734, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 1.5133917572941962, | |
| "grad_norm": 12.912035942077637, | |
| "learning_rate": 2.752893940974033e-05, | |
| "loss": 2.6854, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.5133917572941962, | |
| "eval_runtime": 192.892, | |
| "eval_samples_per_second": 147.3, | |
| "eval_steps_per_second": 18.414, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 1.5204307887234716, | |
| "grad_norm": 17.954883575439453, | |
| "learning_rate": 2.739858170820732e-05, | |
| "loss": 2.6843, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 1.527469820152747, | |
| "grad_norm": 20.32744026184082, | |
| "learning_rate": 2.7268224006674315e-05, | |
| "loss": 2.6404, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 1.5345088515820224, | |
| "grad_norm": 14.839242935180664, | |
| "learning_rate": 2.713786630514131e-05, | |
| "loss": 2.7235, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 1.5415478830112976, | |
| "grad_norm": 15.594539642333984, | |
| "learning_rate": 2.70075086036083e-05, | |
| "loss": 2.6163, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 1.5485869144405728, | |
| "grad_norm": 14.877588272094727, | |
| "learning_rate": 2.6877150902075294e-05, | |
| "loss": 2.6331, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.5485869144405728, | |
| "eval_runtime": 191.7338, | |
| "eval_samples_per_second": 148.19, | |
| "eval_steps_per_second": 18.526, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 1.5556259458698483, | |
| "grad_norm": 14.593866348266602, | |
| "learning_rate": 2.674679320054229e-05, | |
| "loss": 2.7969, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 1.5626649772991237, | |
| "grad_norm": 22.533540725708008, | |
| "learning_rate": 2.6616435499009285e-05, | |
| "loss": 2.7841, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 1.569704008728399, | |
| "grad_norm": 16.089982986450195, | |
| "learning_rate": 2.6486077797476276e-05, | |
| "loss": 2.667, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 1.5767430401576743, | |
| "grad_norm": 17.49601936340332, | |
| "learning_rate": 2.635572009594327e-05, | |
| "loss": 2.7963, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 1.5837820715869495, | |
| "grad_norm": 17.72164535522461, | |
| "learning_rate": 2.6225362394410264e-05, | |
| "loss": 2.7669, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.5837820715869495, | |
| "eval_runtime": 191.4019, | |
| "eval_samples_per_second": 148.447, | |
| "eval_steps_per_second": 18.558, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 1.590821103016225, | |
| "grad_norm": 14.238466262817383, | |
| "learning_rate": 2.6095004692877255e-05, | |
| "loss": 2.732, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 1.5978601344455003, | |
| "grad_norm": 19.9779052734375, | |
| "learning_rate": 2.596464699134425e-05, | |
| "loss": 2.7353, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 1.6048991658747758, | |
| "grad_norm": 16.89205551147461, | |
| "learning_rate": 2.5834289289811243e-05, | |
| "loss": 2.7311, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 1.611938197304051, | |
| "grad_norm": 13.072985649108887, | |
| "learning_rate": 2.5703931588278234e-05, | |
| "loss": 2.6313, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 1.6189772287333262, | |
| "grad_norm": 22.408113479614258, | |
| "learning_rate": 2.5573573886745228e-05, | |
| "loss": 2.605, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.6189772287333262, | |
| "eval_runtime": 192.574, | |
| "eval_samples_per_second": 147.543, | |
| "eval_steps_per_second": 18.445, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 1.6260162601626016, | |
| "grad_norm": 21.51888084411621, | |
| "learning_rate": 2.5443216185212222e-05, | |
| "loss": 2.5964, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 1.633055291591877, | |
| "grad_norm": 20.486024856567383, | |
| "learning_rate": 2.531285848367922e-05, | |
| "loss": 2.6883, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 1.6400943230211524, | |
| "grad_norm": 17.860441207885742, | |
| "learning_rate": 2.5182500782146214e-05, | |
| "loss": 2.7572, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 1.6471333544504276, | |
| "grad_norm": 19.4054012298584, | |
| "learning_rate": 2.5052143080613204e-05, | |
| "loss": 2.7643, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 1.6541723858797028, | |
| "grad_norm": 15.56551742553711, | |
| "learning_rate": 2.49217853790802e-05, | |
| "loss": 2.6638, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.6541723858797028, | |
| "eval_runtime": 192.3359, | |
| "eval_samples_per_second": 147.726, | |
| "eval_steps_per_second": 18.468, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 1.6612114173089783, | |
| "grad_norm": 22.051755905151367, | |
| "learning_rate": 2.4791427677547192e-05, | |
| "loss": 2.6905, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 1.6682504487382537, | |
| "grad_norm": 19.55982208251953, | |
| "learning_rate": 2.4661069976014183e-05, | |
| "loss": 2.7178, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 1.675289480167529, | |
| "grad_norm": 14.777819633483887, | |
| "learning_rate": 2.4530712274481177e-05, | |
| "loss": 2.6219, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 1.6823285115968043, | |
| "grad_norm": 15.4576997756958, | |
| "learning_rate": 2.440035457294817e-05, | |
| "loss": 2.6425, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 1.6893675430260795, | |
| "grad_norm": 18.520376205444336, | |
| "learning_rate": 2.4269996871415165e-05, | |
| "loss": 2.6541, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.6893675430260795, | |
| "eval_runtime": 191.8782, | |
| "eval_samples_per_second": 148.078, | |
| "eval_steps_per_second": 18.512, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 1.696406574455355, | |
| "grad_norm": 18.677989959716797, | |
| "learning_rate": 2.413963916988216e-05, | |
| "loss": 2.7502, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 1.7034456058846303, | |
| "grad_norm": 19.01474380493164, | |
| "learning_rate": 2.400928146834915e-05, | |
| "loss": 2.5849, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 1.7104846373139058, | |
| "grad_norm": 14.854390144348145, | |
| "learning_rate": 2.3878923766816144e-05, | |
| "loss": 2.6224, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 1.717523668743181, | |
| "grad_norm": 16.40928077697754, | |
| "learning_rate": 2.374856606528314e-05, | |
| "loss": 2.6996, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 1.7245627001724562, | |
| "grad_norm": 14.962175369262695, | |
| "learning_rate": 2.3618208363750133e-05, | |
| "loss": 2.6928, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.7245627001724562, | |
| "eval_runtime": 194.5262, | |
| "eval_samples_per_second": 146.063, | |
| "eval_steps_per_second": 18.26, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 1.7316017316017316, | |
| "grad_norm": 19.39845085144043, | |
| "learning_rate": 2.3487850662217127e-05, | |
| "loss": 2.7458, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 1.738640763031007, | |
| "grad_norm": 16.46622085571289, | |
| "learning_rate": 2.3357492960684117e-05, | |
| "loss": 2.7465, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 1.7456797944602824, | |
| "grad_norm": 17.756010055541992, | |
| "learning_rate": 2.322713525915111e-05, | |
| "loss": 2.7617, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 1.7527188258895576, | |
| "grad_norm": 17.55894660949707, | |
| "learning_rate": 2.3096777557618106e-05, | |
| "loss": 2.6085, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 1.7597578573188328, | |
| "grad_norm": 16.707901000976562, | |
| "learning_rate": 2.2966419856085096e-05, | |
| "loss": 2.6305, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.7597578573188328, | |
| "eval_runtime": 191.9564, | |
| "eval_samples_per_second": 148.018, | |
| "eval_steps_per_second": 18.504, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 1.7667968887481083, | |
| "grad_norm": 21.4102840423584, | |
| "learning_rate": 2.2836062154552094e-05, | |
| "loss": 2.7509, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 1.7738359201773837, | |
| "grad_norm": 21.17198944091797, | |
| "learning_rate": 2.2705704453019085e-05, | |
| "loss": 2.6602, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 1.7808749516066589, | |
| "grad_norm": 14.665617942810059, | |
| "learning_rate": 2.257534675148608e-05, | |
| "loss": 2.6576, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 1.7879139830359343, | |
| "grad_norm": 18.63422393798828, | |
| "learning_rate": 2.2444989049953073e-05, | |
| "loss": 2.6682, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 1.7949530144652095, | |
| "grad_norm": 20.478769302368164, | |
| "learning_rate": 2.2314631348420064e-05, | |
| "loss": 2.5733, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.7949530144652095, | |
| "eval_runtime": 194.5888, | |
| "eval_samples_per_second": 146.016, | |
| "eval_steps_per_second": 18.254, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 1.801992045894485, | |
| "grad_norm": 13.144091606140137, | |
| "learning_rate": 2.218427364688706e-05, | |
| "loss": 2.5946, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 1.8090310773237603, | |
| "grad_norm": 22.20168685913086, | |
| "learning_rate": 2.2053915945354052e-05, | |
| "loss": 2.7029, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 1.8160701087530355, | |
| "grad_norm": 17.39105796813965, | |
| "learning_rate": 2.1923558243821046e-05, | |
| "loss": 2.7007, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 1.823109140182311, | |
| "grad_norm": 16.70639419555664, | |
| "learning_rate": 2.179320054228804e-05, | |
| "loss": 2.6011, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 1.8301481716115862, | |
| "grad_norm": 23.447750091552734, | |
| "learning_rate": 2.166284284075503e-05, | |
| "loss": 2.682, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.8301481716115862, | |
| "eval_runtime": 192.912, | |
| "eval_samples_per_second": 147.285, | |
| "eval_steps_per_second": 18.413, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 1.8371872030408616, | |
| "grad_norm": 20.410226821899414, | |
| "learning_rate": 2.1532485139222025e-05, | |
| "loss": 2.7565, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 1.844226234470137, | |
| "grad_norm": 20.64243507385254, | |
| "learning_rate": 2.1402127437689022e-05, | |
| "loss": 2.566, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 1.8512652658994122, | |
| "grad_norm": 15.64989185333252, | |
| "learning_rate": 2.1271769736156013e-05, | |
| "loss": 2.7237, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 1.8583042973286876, | |
| "grad_norm": 13.451628684997559, | |
| "learning_rate": 2.1141412034623007e-05, | |
| "loss": 2.6218, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 1.8653433287579628, | |
| "grad_norm": 16.763063430786133, | |
| "learning_rate": 2.1011054333089998e-05, | |
| "loss": 2.5891, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.8653433287579628, | |
| "eval_runtime": 191.7231, | |
| "eval_samples_per_second": 148.198, | |
| "eval_steps_per_second": 18.527, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 1.8723823601872382, | |
| "grad_norm": 16.719079971313477, | |
| "learning_rate": 2.0880696631556992e-05, | |
| "loss": 2.6568, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 1.8794213916165137, | |
| "grad_norm": 20.345216751098633, | |
| "learning_rate": 2.075033893002399e-05, | |
| "loss": 2.5889, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 1.8864604230457889, | |
| "grad_norm": 13.290498733520508, | |
| "learning_rate": 2.061998122849098e-05, | |
| "loss": 2.6915, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 1.893499454475064, | |
| "grad_norm": 26.90572738647461, | |
| "learning_rate": 2.0489623526957974e-05, | |
| "loss": 2.6635, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 1.9005384859043395, | |
| "grad_norm": 12.706587791442871, | |
| "learning_rate": 2.035926582542497e-05, | |
| "loss": 2.6886, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.9005384859043395, | |
| "eval_runtime": 193.671, | |
| "eval_samples_per_second": 146.708, | |
| "eval_steps_per_second": 18.34, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 1.907577517333615, | |
| "grad_norm": 19.337390899658203, | |
| "learning_rate": 2.022890812389196e-05, | |
| "loss": 2.5446, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 1.9146165487628903, | |
| "grad_norm": 16.442127227783203, | |
| "learning_rate": 2.0098550422358953e-05, | |
| "loss": 2.6562, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 1.9216555801921655, | |
| "grad_norm": 17.196496963500977, | |
| "learning_rate": 1.9968192720825947e-05, | |
| "loss": 2.5869, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 1.9286946116214407, | |
| "grad_norm": 15.884928703308105, | |
| "learning_rate": 1.983783501929294e-05, | |
| "loss": 2.6127, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 1.9357336430507162, | |
| "grad_norm": 15.426615715026855, | |
| "learning_rate": 1.9707477317759935e-05, | |
| "loss": 2.6043, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.9357336430507162, | |
| "eval_runtime": 193.4431, | |
| "eval_samples_per_second": 146.88, | |
| "eval_steps_per_second": 18.362, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 1.9427726744799916, | |
| "grad_norm": 20.6138858795166, | |
| "learning_rate": 1.9577119616226926e-05, | |
| "loss": 2.6387, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 1.949811705909267, | |
| "grad_norm": 14.545782089233398, | |
| "learning_rate": 1.944676191469392e-05, | |
| "loss": 2.7687, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 1.9568507373385422, | |
| "grad_norm": 15.325973510742188, | |
| "learning_rate": 1.9316404213160914e-05, | |
| "loss": 2.6876, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 1.9638897687678174, | |
| "grad_norm": 16.72733497619629, | |
| "learning_rate": 1.918604651162791e-05, | |
| "loss": 2.6131, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 1.9709288001970928, | |
| "grad_norm": 22.076963424682617, | |
| "learning_rate": 1.9055688810094903e-05, | |
| "loss": 2.7044, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.9709288001970928, | |
| "eval_runtime": 192.6043, | |
| "eval_samples_per_second": 147.52, | |
| "eval_steps_per_second": 18.442, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 1.9779678316263682, | |
| "grad_norm": 17.05091094970703, | |
| "learning_rate": 1.8925331108561893e-05, | |
| "loss": 2.6023, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 1.9850068630556437, | |
| "grad_norm": 17.847782135009766, | |
| "learning_rate": 1.8794973407028887e-05, | |
| "loss": 2.5752, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 1.9920458944849189, | |
| "grad_norm": 18.966585159301758, | |
| "learning_rate": 1.866461570549588e-05, | |
| "loss": 2.6339, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 1.999084925914194, | |
| "grad_norm": 18.27726173400879, | |
| "learning_rate": 1.8534258003962876e-05, | |
| "loss": 2.6527, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 2.0061239573434695, | |
| "grad_norm": 16.40408706665039, | |
| "learning_rate": 1.840390030242987e-05, | |
| "loss": 2.6285, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 2.0061239573434695, | |
| "eval_runtime": 192.5321, | |
| "eval_samples_per_second": 147.575, | |
| "eval_steps_per_second": 18.449, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 2.013162988772745, | |
| "grad_norm": 13.299867630004883, | |
| "learning_rate": 1.827354260089686e-05, | |
| "loss": 2.6401, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 2.0202020202020203, | |
| "grad_norm": 16.995622634887695, | |
| "learning_rate": 1.8143184899363855e-05, | |
| "loss": 2.5986, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 2.0272410516312958, | |
| "grad_norm": 18.69041633605957, | |
| "learning_rate": 1.801282719783085e-05, | |
| "loss": 2.644, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 2.0342800830605707, | |
| "grad_norm": 20.12238883972168, | |
| "learning_rate": 1.7882469496297843e-05, | |
| "loss": 2.6802, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 2.041319114489846, | |
| "grad_norm": 14.631281852722168, | |
| "learning_rate": 1.7752111794764837e-05, | |
| "loss": 2.6362, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 2.041319114489846, | |
| "eval_runtime": 193.4189, | |
| "eval_samples_per_second": 146.899, | |
| "eval_steps_per_second": 18.364, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 2.0483581459191216, | |
| "grad_norm": 16.323118209838867, | |
| "learning_rate": 1.7621754093231828e-05, | |
| "loss": 2.6249, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 2.055397177348397, | |
| "grad_norm": 14.623433113098145, | |
| "learning_rate": 1.7491396391698822e-05, | |
| "loss": 2.6324, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 2.062436208777672, | |
| "grad_norm": 19.917098999023438, | |
| "learning_rate": 1.7361038690165816e-05, | |
| "loss": 2.6893, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 2.0694752402069474, | |
| "grad_norm": 14.357760429382324, | |
| "learning_rate": 1.7230680988632807e-05, | |
| "loss": 2.5841, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 2.076514271636223, | |
| "grad_norm": 15.798065185546875, | |
| "learning_rate": 1.7100323287099804e-05, | |
| "loss": 2.6374, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.076514271636223, | |
| "eval_runtime": 191.4525, | |
| "eval_samples_per_second": 148.408, | |
| "eval_steps_per_second": 18.553, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 2.0835533030654982, | |
| "grad_norm": 19.128459930419922, | |
| "learning_rate": 1.6969965585566798e-05, | |
| "loss": 2.6451, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 2.0905923344947737, | |
| "grad_norm": 22.39739990234375, | |
| "learning_rate": 1.683960788403379e-05, | |
| "loss": 2.6732, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 2.0976313659240486, | |
| "grad_norm": 21.8306827545166, | |
| "learning_rate": 1.6709250182500783e-05, | |
| "loss": 2.6381, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 2.104670397353324, | |
| "grad_norm": 16.79404640197754, | |
| "learning_rate": 1.6578892480967774e-05, | |
| "loss": 2.6643, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 2.1117094287825995, | |
| "grad_norm": 20.273427963256836, | |
| "learning_rate": 1.644853477943477e-05, | |
| "loss": 2.6409, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.1117094287825995, | |
| "eval_runtime": 192.3103, | |
| "eval_samples_per_second": 147.746, | |
| "eval_steps_per_second": 18.47, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 2.118748460211875, | |
| "grad_norm": 16.260501861572266, | |
| "learning_rate": 1.6318177077901765e-05, | |
| "loss": 2.6085, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 2.1257874916411503, | |
| "grad_norm": 17.500699996948242, | |
| "learning_rate": 1.6187819376368756e-05, | |
| "loss": 2.5923, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 2.1328265230704253, | |
| "grad_norm": 19.523569107055664, | |
| "learning_rate": 1.605746167483575e-05, | |
| "loss": 2.562, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 2.1398655544997007, | |
| "grad_norm": 16.805545806884766, | |
| "learning_rate": 1.5927103973302744e-05, | |
| "loss": 2.632, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 2.146904585928976, | |
| "grad_norm": 14.419663429260254, | |
| "learning_rate": 1.5796746271769735e-05, | |
| "loss": 2.6956, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 2.146904585928976, | |
| "eval_runtime": 192.4179, | |
| "eval_samples_per_second": 147.663, | |
| "eval_steps_per_second": 18.46, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 2.1539436173582516, | |
| "grad_norm": 14.469121932983398, | |
| "learning_rate": 1.5666388570236732e-05, | |
| "loss": 2.6734, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 2.160982648787527, | |
| "grad_norm": 14.521267890930176, | |
| "learning_rate": 1.5536030868703723e-05, | |
| "loss": 2.6272, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 2.168021680216802, | |
| "grad_norm": 15.565622329711914, | |
| "learning_rate": 1.5405673167170717e-05, | |
| "loss": 2.5995, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 2.1750607116460774, | |
| "grad_norm": 18.500350952148438, | |
| "learning_rate": 1.527531546563771e-05, | |
| "loss": 2.7019, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 2.182099743075353, | |
| "grad_norm": 18.180660247802734, | |
| "learning_rate": 1.5144957764104704e-05, | |
| "loss": 2.6347, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.182099743075353, | |
| "eval_runtime": 194.3931, | |
| "eval_samples_per_second": 146.163, | |
| "eval_steps_per_second": 18.272, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 2.1891387745046282, | |
| "grad_norm": 15.68535041809082, | |
| "learning_rate": 1.5014600062571698e-05, | |
| "loss": 2.6679, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 2.1961778059339037, | |
| "grad_norm": 18.195068359375, | |
| "learning_rate": 1.4884242361038692e-05, | |
| "loss": 2.6152, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 2.2032168373631786, | |
| "grad_norm": 19.41796875, | |
| "learning_rate": 1.4753884659505684e-05, | |
| "loss": 2.6453, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 2.210255868792454, | |
| "grad_norm": 16.178791046142578, | |
| "learning_rate": 1.4623526957972677e-05, | |
| "loss": 2.6175, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 2.2172949002217295, | |
| "grad_norm": 17.970273971557617, | |
| "learning_rate": 1.4493169256439671e-05, | |
| "loss": 2.5633, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.2172949002217295, | |
| "eval_runtime": 194.9638, | |
| "eval_samples_per_second": 145.735, | |
| "eval_steps_per_second": 18.219, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 2.224333931651005, | |
| "grad_norm": 13.679678916931152, | |
| "learning_rate": 1.4362811554906663e-05, | |
| "loss": 2.5433, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 2.2313729630802803, | |
| "grad_norm": 18.061559677124023, | |
| "learning_rate": 1.423245385337366e-05, | |
| "loss": 2.6087, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 2.2384119945095553, | |
| "grad_norm": 20.32142448425293, | |
| "learning_rate": 1.4102096151840652e-05, | |
| "loss": 2.6079, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 2.2454510259388307, | |
| "grad_norm": 16.483491897583008, | |
| "learning_rate": 1.3971738450307646e-05, | |
| "loss": 2.6693, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 2.252490057368106, | |
| "grad_norm": 16.30838394165039, | |
| "learning_rate": 1.3841380748774638e-05, | |
| "loss": 2.5292, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.252490057368106, | |
| "eval_runtime": 192.332, | |
| "eval_samples_per_second": 147.729, | |
| "eval_steps_per_second": 18.468, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 2.2595290887973816, | |
| "grad_norm": 18.353946685791016, | |
| "learning_rate": 1.371102304724163e-05, | |
| "loss": 2.7162, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 2.266568120226657, | |
| "grad_norm": 14.376470565795898, | |
| "learning_rate": 1.3580665345708626e-05, | |
| "loss": 2.6581, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 2.273607151655932, | |
| "grad_norm": 16.625110626220703, | |
| "learning_rate": 1.3450307644175619e-05, | |
| "loss": 2.5675, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 2.2806461830852074, | |
| "grad_norm": 17.9268798828125, | |
| "learning_rate": 1.3319949942642613e-05, | |
| "loss": 2.6001, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 2.287685214514483, | |
| "grad_norm": 23.196901321411133, | |
| "learning_rate": 1.3189592241109605e-05, | |
| "loss": 2.6545, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.287685214514483, | |
| "eval_runtime": 192.8693, | |
| "eval_samples_per_second": 147.317, | |
| "eval_steps_per_second": 18.417, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 2.2947242459437582, | |
| "grad_norm": 18.328662872314453, | |
| "learning_rate": 1.3059234539576598e-05, | |
| "loss": 2.5476, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 2.3017632773730337, | |
| "grad_norm": 16.62209129333496, | |
| "learning_rate": 1.2928876838043592e-05, | |
| "loss": 2.6778, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 2.3088023088023086, | |
| "grad_norm": 15.676456451416016, | |
| "learning_rate": 1.2798519136510586e-05, | |
| "loss": 2.6188, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 2.315841340231584, | |
| "grad_norm": 21.3188533782959, | |
| "learning_rate": 1.266816143497758e-05, | |
| "loss": 2.5383, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 2.3228803716608595, | |
| "grad_norm": 15.253218650817871, | |
| "learning_rate": 1.2537803733444572e-05, | |
| "loss": 2.6703, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 2.3228803716608595, | |
| "eval_runtime": 192.6821, | |
| "eval_samples_per_second": 147.46, | |
| "eval_steps_per_second": 18.435, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 2.329919403090135, | |
| "grad_norm": 17.341787338256836, | |
| "learning_rate": 1.2407446031911565e-05, | |
| "loss": 2.6903, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 2.3369584345194103, | |
| "grad_norm": 14.856354713439941, | |
| "learning_rate": 1.2277088330378559e-05, | |
| "loss": 2.5655, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 2.3439974659486853, | |
| "grad_norm": 17.669092178344727, | |
| "learning_rate": 1.2146730628845553e-05, | |
| "loss": 2.6723, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 2.3510364973779607, | |
| "grad_norm": 18.183189392089844, | |
| "learning_rate": 1.2016372927312545e-05, | |
| "loss": 2.6732, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 2.358075528807236, | |
| "grad_norm": 20.30499267578125, | |
| "learning_rate": 1.188601522577954e-05, | |
| "loss": 2.6527, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 2.358075528807236, | |
| "eval_runtime": 192.939, | |
| "eval_samples_per_second": 147.264, | |
| "eval_steps_per_second": 18.41, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 2.3651145602365116, | |
| "grad_norm": 22.915029525756836, | |
| "learning_rate": 1.1755657524246532e-05, | |
| "loss": 2.4797, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 2.3721535916657865, | |
| "grad_norm": 16.179378509521484, | |
| "learning_rate": 1.1625299822713528e-05, | |
| "loss": 2.5854, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 2.379192623095062, | |
| "grad_norm": 14.764082908630371, | |
| "learning_rate": 1.149494212118052e-05, | |
| "loss": 2.4972, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 2.3862316545243374, | |
| "grad_norm": 21.402334213256836, | |
| "learning_rate": 1.1364584419647513e-05, | |
| "loss": 2.5752, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 2.393270685953613, | |
| "grad_norm": 19.00446891784668, | |
| "learning_rate": 1.1234226718114507e-05, | |
| "loss": 2.4806, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.393270685953613, | |
| "eval_runtime": 192.7281, | |
| "eval_samples_per_second": 147.425, | |
| "eval_steps_per_second": 18.43, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 2.4003097173828882, | |
| "grad_norm": 21.23725700378418, | |
| "learning_rate": 1.11038690165815e-05, | |
| "loss": 2.5424, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 2.407348748812163, | |
| "grad_norm": 14.942157745361328, | |
| "learning_rate": 1.0973511315048493e-05, | |
| "loss": 2.5926, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 2.4143877802414386, | |
| "grad_norm": 17.429502487182617, | |
| "learning_rate": 1.0843153613515487e-05, | |
| "loss": 2.5892, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 2.421426811670714, | |
| "grad_norm": 15.42565631866455, | |
| "learning_rate": 1.071279591198248e-05, | |
| "loss": 2.5758, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 2.4284658430999895, | |
| "grad_norm": 20.9206600189209, | |
| "learning_rate": 1.0582438210449474e-05, | |
| "loss": 2.6666, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.4284658430999895, | |
| "eval_runtime": 192.2868, | |
| "eval_samples_per_second": 147.764, | |
| "eval_steps_per_second": 18.472, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 2.435504874529265, | |
| "grad_norm": 16.189416885375977, | |
| "learning_rate": 1.0452080508916468e-05, | |
| "loss": 2.5727, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 2.44254390595854, | |
| "grad_norm": 17.95191192626953, | |
| "learning_rate": 1.032172280738346e-05, | |
| "loss": 2.6171, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 2.4495829373878153, | |
| "grad_norm": 15.953314781188965, | |
| "learning_rate": 1.0191365105850454e-05, | |
| "loss": 2.5181, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 2.4566219688170907, | |
| "grad_norm": 20.293758392333984, | |
| "learning_rate": 1.0061007404317447e-05, | |
| "loss": 2.6109, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 2.463661000246366, | |
| "grad_norm": 13.837769508361816, | |
| "learning_rate": 9.930649702784441e-06, | |
| "loss": 2.6833, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.463661000246366, | |
| "eval_runtime": 192.3429, | |
| "eval_samples_per_second": 147.721, | |
| "eval_steps_per_second": 18.467, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 2.4707000316756416, | |
| "grad_norm": 15.594371795654297, | |
| "learning_rate": 9.800292001251435e-06, | |
| "loss": 2.6111, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 2.4777390631049165, | |
| "grad_norm": 18.549043655395508, | |
| "learning_rate": 9.669934299718427e-06, | |
| "loss": 2.5622, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 2.484778094534192, | |
| "grad_norm": 15.56165599822998, | |
| "learning_rate": 9.53957659818542e-06, | |
| "loss": 2.5254, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 2.4918171259634674, | |
| "grad_norm": 14.361612319946289, | |
| "learning_rate": 9.409218896652416e-06, | |
| "loss": 2.5388, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 2.498856157392743, | |
| "grad_norm": 17.944364547729492, | |
| "learning_rate": 9.278861195119408e-06, | |
| "loss": 2.5671, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.498856157392743, | |
| "eval_runtime": 193.2612, | |
| "eval_samples_per_second": 147.019, | |
| "eval_steps_per_second": 18.379, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 2.5058951888220182, | |
| "grad_norm": 15.994379043579102, | |
| "learning_rate": 9.1485034935864e-06, | |
| "loss": 2.4926, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 2.512934220251293, | |
| "grad_norm": 15.721161842346191, | |
| "learning_rate": 9.018145792053395e-06, | |
| "loss": 2.5956, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 2.5199732516805686, | |
| "grad_norm": 21.510955810546875, | |
| "learning_rate": 8.887788090520389e-06, | |
| "loss": 2.6592, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 2.527012283109844, | |
| "grad_norm": 16.77272605895996, | |
| "learning_rate": 8.757430388987383e-06, | |
| "loss": 2.655, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 2.5340513145391195, | |
| "grad_norm": 18.944421768188477, | |
| "learning_rate": 8.627072687454375e-06, | |
| "loss": 2.5201, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.5340513145391195, | |
| "eval_runtime": 192.2731, | |
| "eval_samples_per_second": 147.774, | |
| "eval_steps_per_second": 18.474, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 2.541090345968395, | |
| "grad_norm": 19.00555992126465, | |
| "learning_rate": 8.496714985921368e-06, | |
| "loss": 2.5858, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 2.54812937739767, | |
| "grad_norm": 16.338956832885742, | |
| "learning_rate": 8.366357284388362e-06, | |
| "loss": 2.5963, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 2.5551684088269453, | |
| "grad_norm": 15.704483032226562, | |
| "learning_rate": 8.235999582855356e-06, | |
| "loss": 2.5504, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 2.5622074402562207, | |
| "grad_norm": 17.013628005981445, | |
| "learning_rate": 8.105641881322348e-06, | |
| "loss": 2.6663, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 2.569246471685496, | |
| "grad_norm": 16.901050567626953, | |
| "learning_rate": 7.975284179789342e-06, | |
| "loss": 2.5827, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.569246471685496, | |
| "eval_runtime": 192.5506, | |
| "eval_samples_per_second": 147.561, | |
| "eval_steps_per_second": 18.447, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 2.5762855031147716, | |
| "grad_norm": 16.243534088134766, | |
| "learning_rate": 7.844926478256335e-06, | |
| "loss": 2.6065, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 2.5833245345440465, | |
| "grad_norm": 17.0561580657959, | |
| "learning_rate": 7.714568776723329e-06, | |
| "loss": 2.5166, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 2.590363565973322, | |
| "grad_norm": 14.800107955932617, | |
| "learning_rate": 7.584211075190323e-06, | |
| "loss": 2.6966, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 2.5974025974025974, | |
| "grad_norm": 17.22756576538086, | |
| "learning_rate": 7.453853373657315e-06, | |
| "loss": 2.5921, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 2.604441628831873, | |
| "grad_norm": 16.94314956665039, | |
| "learning_rate": 7.32349567212431e-06, | |
| "loss": 2.7039, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.604441628831873, | |
| "eval_runtime": 192.7372, | |
| "eval_samples_per_second": 147.418, | |
| "eval_steps_per_second": 18.429, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 2.6114806602611482, | |
| "grad_norm": 15.262337684631348, | |
| "learning_rate": 7.193137970591303e-06, | |
| "loss": 2.5053, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 2.618519691690423, | |
| "grad_norm": 16.485326766967773, | |
| "learning_rate": 7.062780269058296e-06, | |
| "loss": 2.6282, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 2.6255587231196986, | |
| "grad_norm": 23.574670791625977, | |
| "learning_rate": 6.93242256752529e-06, | |
| "loss": 2.5995, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 2.632597754548974, | |
| "grad_norm": 16.39130973815918, | |
| "learning_rate": 6.802064865992283e-06, | |
| "loss": 2.5522, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 2.6396367859782495, | |
| "grad_norm": 20.67544174194336, | |
| "learning_rate": 6.671707164459276e-06, | |
| "loss": 2.6411, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.6396367859782495, | |
| "eval_runtime": 192.0322, | |
| "eval_samples_per_second": 147.96, | |
| "eval_steps_per_second": 18.497, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 2.646675817407525, | |
| "grad_norm": 17.333271026611328, | |
| "learning_rate": 6.54134946292627e-06, | |
| "loss": 2.6002, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 2.6537148488368, | |
| "grad_norm": 17.444929122924805, | |
| "learning_rate": 6.410991761393263e-06, | |
| "loss": 2.4802, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 2.6607538802660753, | |
| "grad_norm": 17.62455940246582, | |
| "learning_rate": 6.2806340598602564e-06, | |
| "loss": 2.5169, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 2.6677929116953507, | |
| "grad_norm": 23.869504928588867, | |
| "learning_rate": 6.1502763583272506e-06, | |
| "loss": 2.6119, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 2.674831943124626, | |
| "grad_norm": 14.378959655761719, | |
| "learning_rate": 6.019918656794243e-06, | |
| "loss": 2.6484, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.674831943124626, | |
| "eval_runtime": 191.7147, | |
| "eval_samples_per_second": 148.205, | |
| "eval_steps_per_second": 18.528, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 2.6818709745539016, | |
| "grad_norm": 13.199753761291504, | |
| "learning_rate": 5.889560955261237e-06, | |
| "loss": 2.5929, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 2.6889100059831765, | |
| "grad_norm": 19.931673049926758, | |
| "learning_rate": 5.75920325372823e-06, | |
| "loss": 2.5691, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 2.695949037412452, | |
| "grad_norm": 16.0571346282959, | |
| "learning_rate": 5.6288455521952244e-06, | |
| "loss": 2.5593, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 2.7029880688417274, | |
| "grad_norm": 14.09821605682373, | |
| "learning_rate": 5.498487850662217e-06, | |
| "loss": 2.5663, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 2.710027100271003, | |
| "grad_norm": 16.2088680267334, | |
| "learning_rate": 5.368130149129211e-06, | |
| "loss": 2.5763, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 2.710027100271003, | |
| "eval_runtime": 192.1652, | |
| "eval_samples_per_second": 147.857, | |
| "eval_steps_per_second": 18.484, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 2.717066131700278, | |
| "grad_norm": 26.869508743286133, | |
| "learning_rate": 5.237772447596204e-06, | |
| "loss": 2.7026, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 2.724105163129553, | |
| "grad_norm": 17.842239379882812, | |
| "learning_rate": 5.107414746063198e-06, | |
| "loss": 2.69, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 2.7311441945588286, | |
| "grad_norm": 17.31543731689453, | |
| "learning_rate": 4.977057044530191e-06, | |
| "loss": 2.6621, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 2.738183225988104, | |
| "grad_norm": 15.826437950134277, | |
| "learning_rate": 4.846699342997185e-06, | |
| "loss": 2.7036, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 2.7452222574173795, | |
| "grad_norm": 16.656599044799805, | |
| "learning_rate": 4.716341641464178e-06, | |
| "loss": 2.5078, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 2.7452222574173795, | |
| "eval_runtime": 191.4626, | |
| "eval_samples_per_second": 148.4, | |
| "eval_steps_per_second": 18.552, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 2.752261288846655, | |
| "grad_norm": 16.501192092895508, | |
| "learning_rate": 4.585983939931171e-06, | |
| "loss": 2.5312, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 2.75930032027593, | |
| "grad_norm": 17.555389404296875, | |
| "learning_rate": 4.455626238398165e-06, | |
| "loss": 2.5059, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 2.7663393517052053, | |
| "grad_norm": 18.289548873901367, | |
| "learning_rate": 4.325268536865158e-06, | |
| "loss": 2.6702, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 2.7733783831344807, | |
| "grad_norm": 15.688879013061523, | |
| "learning_rate": 4.194910835332152e-06, | |
| "loss": 2.5357, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 2.780417414563756, | |
| "grad_norm": 14.281635284423828, | |
| "learning_rate": 4.064553133799144e-06, | |
| "loss": 2.6129, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 2.780417414563756, | |
| "eval_runtime": 193.5326, | |
| "eval_samples_per_second": 146.812, | |
| "eval_steps_per_second": 18.353, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 2.7874564459930316, | |
| "grad_norm": 22.23700523376465, | |
| "learning_rate": 3.9341954322661385e-06, | |
| "loss": 2.5335, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 2.7944954774223065, | |
| "grad_norm": 17.91628074645996, | |
| "learning_rate": 3.803837730733132e-06, | |
| "loss": 2.5757, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 2.801534508851582, | |
| "grad_norm": 16.670568466186523, | |
| "learning_rate": 3.6734800292001254e-06, | |
| "loss": 2.5679, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 2.8085735402808574, | |
| "grad_norm": 17.128202438354492, | |
| "learning_rate": 3.5431223276671187e-06, | |
| "loss": 2.5285, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 2.815612571710133, | |
| "grad_norm": 14.024889945983887, | |
| "learning_rate": 3.412764626134112e-06, | |
| "loss": 2.515, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.815612571710133, | |
| "eval_runtime": 192.6175, | |
| "eval_samples_per_second": 147.51, | |
| "eval_steps_per_second": 18.441, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 2.822651603139408, | |
| "grad_norm": 16.694087982177734, | |
| "learning_rate": 3.2824069246011056e-06, | |
| "loss": 2.625, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 2.829690634568683, | |
| "grad_norm": 22.633140563964844, | |
| "learning_rate": 3.1520492230680985e-06, | |
| "loss": 2.5637, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 2.8367296659979586, | |
| "grad_norm": 18.231454849243164, | |
| "learning_rate": 3.021691521535092e-06, | |
| "loss": 2.5672, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 2.843768697427234, | |
| "grad_norm": 15.228378295898438, | |
| "learning_rate": 2.891333820002086e-06, | |
| "loss": 2.6203, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 2.8508077288565095, | |
| "grad_norm": 19.437833786010742, | |
| "learning_rate": 2.760976118469079e-06, | |
| "loss": 2.626, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 2.8508077288565095, | |
| "eval_runtime": 192.5589, | |
| "eval_samples_per_second": 147.555, | |
| "eval_steps_per_second": 18.446, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 2.857846760285785, | |
| "grad_norm": 16.506317138671875, | |
| "learning_rate": 2.630618416936073e-06, | |
| "loss": 2.5625, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 2.86488579171506, | |
| "grad_norm": 25.00144386291504, | |
| "learning_rate": 2.500260715403066e-06, | |
| "loss": 2.616, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 2.8719248231443353, | |
| "grad_norm": 16.7978572845459, | |
| "learning_rate": 2.3699030138700597e-06, | |
| "loss": 2.5959, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 2.8789638545736107, | |
| "grad_norm": 15.795037269592285, | |
| "learning_rate": 2.239545312337053e-06, | |
| "loss": 2.6245, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 2.886002886002886, | |
| "grad_norm": 16.411415100097656, | |
| "learning_rate": 2.1091876108040467e-06, | |
| "loss": 2.6368, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.886002886002886, | |
| "eval_runtime": 192.2421, | |
| "eval_samples_per_second": 147.798, | |
| "eval_steps_per_second": 18.477, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 2.8930419174321615, | |
| "grad_norm": 16.8485050201416, | |
| "learning_rate": 1.97882990927104e-06, | |
| "loss": 2.5946, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 2.9000809488614365, | |
| "grad_norm": 15.294781684875488, | |
| "learning_rate": 1.8484722077380334e-06, | |
| "loss": 2.6035, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 2.907119980290712, | |
| "grad_norm": 26.89401626586914, | |
| "learning_rate": 1.7181145062050267e-06, | |
| "loss": 2.6353, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 2.9141590117199874, | |
| "grad_norm": 13.004213333129883, | |
| "learning_rate": 1.58775680467202e-06, | |
| "loss": 2.6299, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 2.921198043149263, | |
| "grad_norm": 17.197162628173828, | |
| "learning_rate": 1.4573991031390136e-06, | |
| "loss": 2.6031, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 2.921198043149263, | |
| "eval_runtime": 192.0882, | |
| "eval_samples_per_second": 147.916, | |
| "eval_steps_per_second": 18.492, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 2.928237074578538, | |
| "grad_norm": 16.04857063293457, | |
| "learning_rate": 1.327041401606007e-06, | |
| "loss": 2.5869, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 2.935276106007813, | |
| "grad_norm": 14.147359848022461, | |
| "learning_rate": 1.1966837000730005e-06, | |
| "loss": 2.5851, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 2.9423151374370886, | |
| "grad_norm": 17.802715301513672, | |
| "learning_rate": 1.0663259985399938e-06, | |
| "loss": 2.4637, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 2.949354168866364, | |
| "grad_norm": 20.130615234375, | |
| "learning_rate": 9.359682970069872e-07, | |
| "loss": 2.5418, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 2.9563932002956395, | |
| "grad_norm": 18.158117294311523, | |
| "learning_rate": 8.056105954739805e-07, | |
| "loss": 2.5456, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.9563932002956395, | |
| "eval_runtime": 192.2432, | |
| "eval_samples_per_second": 147.797, | |
| "eval_steps_per_second": 18.477, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 2.963432231724915, | |
| "grad_norm": 15.710502624511719, | |
| "learning_rate": 6.75252893940974e-07, | |
| "loss": 2.6082, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 2.97047126315419, | |
| "grad_norm": 15.316740989685059, | |
| "learning_rate": 5.448951924079675e-07, | |
| "loss": 2.5569, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 2.9775102945834653, | |
| "grad_norm": 17.120691299438477, | |
| "learning_rate": 4.145374908749609e-07, | |
| "loss": 2.6017, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 2.9845493260127407, | |
| "grad_norm": 14.475923538208008, | |
| "learning_rate": 2.841797893419543e-07, | |
| "loss": 2.5627, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 2.991588357442016, | |
| "grad_norm": 16.937416076660156, | |
| "learning_rate": 1.5382208780894776e-07, | |
| "loss": 2.557, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 2.991588357442016, | |
| "eval_runtime": 192.9822, | |
| "eval_samples_per_second": 147.231, | |
| "eval_steps_per_second": 18.406, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 2.9986273888712915, | |
| "grad_norm": 25.87067413330078, | |
| "learning_rate": 2.3464386275941184e-08, | |
| "loss": 2.5282, | |
| "step": 42600 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 42618, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.1152556854519332e+16, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |