{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6290, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0001589825119236884, "grad_norm": 1.8488894701004028, "learning_rate": 0.0, "loss": 0.7591, "num_input_tokens_seen": 4190107, "step": 1, "train_runtime": 70.4589, "train_tokens_per_second": 59468.814 }, { "epoch": 0.0003179650238473768, "grad_norm": 2.213782548904419, "learning_rate": 3.846153846153847e-06, "loss": 0.8022, "num_input_tokens_seen": 8010023, "step": 2, "train_runtime": 91.5492, "train_tokens_per_second": 87494.213 }, { "epoch": 0.0004769475357710652, "grad_norm": 2.11086106300354, "learning_rate": 7.692307692307694e-06, "loss": 0.7579, "num_input_tokens_seen": 11540712, "step": 3, "train_runtime": 131.0227, "train_tokens_per_second": 88081.778 }, { "epoch": 0.0006359300476947536, "grad_norm": 1.4698841571807861, "learning_rate": 1.153846153846154e-05, "loss": 0.7571, "num_input_tokens_seen": 15731199, "step": 4, "train_runtime": 172.6064, "train_tokens_per_second": 91139.134 }, { "epoch": 0.000794912559618442, "grad_norm": 0.8004894852638245, "learning_rate": 1.5384615384615387e-05, "loss": 0.7387, "num_input_tokens_seen": 19875689, "step": 5, "train_runtime": 202.7337, "train_tokens_per_second": 98038.413 }, { "epoch": 0.0009538950715421304, "grad_norm": 1.9209990501403809, "learning_rate": 1.923076923076923e-05, "loss": 0.7211, "num_input_tokens_seen": 23392486, "step": 6, "train_runtime": 239.4474, "train_tokens_per_second": 97693.646 }, { "epoch": 0.0011128775834658188, "grad_norm": 1.8014707565307617, "learning_rate": 2.307692307692308e-05, "loss": 0.7171, "num_input_tokens_seen": 27346709, "step": 7, "train_runtime": 279.7582, "train_tokens_per_second": 97751.225 }, { "epoch": 0.0012718600953895071, "grad_norm": 1.2108733654022217, "learning_rate": 2.6923076923076923e-05, "loss": 0.6969, "num_input_tokens_seen": 31524098, "step": 8, "train_runtime": 320.0895, "train_tokens_per_second": 98485.248 }, { "epoch": 0.0014308426073131955, "grad_norm": 0.8073210716247559, "learning_rate": 3.0769230769230774e-05, "loss": 0.7049, "num_input_tokens_seen": 35218974, "step": 9, "train_runtime": 356.3907, "train_tokens_per_second": 98821.236 }, { "epoch": 0.001589825119236884, "grad_norm": 0.6310760378837585, "learning_rate": 3.461538461538462e-05, "loss": 0.6499, "num_input_tokens_seen": 38814741, "step": 10, "train_runtime": 396.9595, "train_tokens_per_second": 97780.095 }, { "epoch": 0.0017488076311605724, "grad_norm": 0.8055503368377686, "learning_rate": 3.846153846153846e-05, "loss": 0.6695, "num_input_tokens_seen": 42883079, "step": 11, "train_runtime": 436.5029, "train_tokens_per_second": 98242.374 }, { "epoch": 0.0019077901430842607, "grad_norm": 0.5952548384666443, "learning_rate": 4.230769230769231e-05, "loss": 0.6545, "num_input_tokens_seen": 47009047, "step": 12, "train_runtime": 473.2177, "train_tokens_per_second": 99339.148 }, { "epoch": 0.002066772655007949, "grad_norm": 0.47586584091186523, "learning_rate": 4.615384615384616e-05, "loss": 0.6184, "num_input_tokens_seen": 50741141, "step": 13, "train_runtime": 513.4906, "train_tokens_per_second": 98816.106 }, { "epoch": 0.0022257551669316376, "grad_norm": 0.4969903528690338, "learning_rate": 5e-05, "loss": 0.6488, "num_input_tokens_seen": 54493686, "step": 14, "train_runtime": 553.9139, "train_tokens_per_second": 98379.35 }, { "epoch": 0.0023847376788553257, "grad_norm": 0.4702528417110443, "learning_rate": 4.999999686883833e-05, "loss": 0.6219, "num_input_tokens_seen": 58539792, "step": 15, "train_runtime": 594.6219, "train_tokens_per_second": 98448.769 }, { "epoch": 0.0025437201907790143, "grad_norm": 0.42143383622169495, "learning_rate": 4.9999987475354104e-05, "loss": 0.6324, "num_input_tokens_seen": 62459322, "step": 16, "train_runtime": 630.9073, "train_tokens_per_second": 98999.209 }, { "epoch": 0.002702702702702703, "grad_norm": 0.39312151074409485, "learning_rate": 4.999997181954967e-05, "loss": 0.6001, "num_input_tokens_seen": 66266386, "step": 17, "train_runtime": 669.9385, "train_tokens_per_second": 98914.139 }, { "epoch": 0.002861685214626391, "grad_norm": 0.36456507444381714, "learning_rate": 4.999994990142896e-05, "loss": 0.6021, "num_input_tokens_seen": 70254622, "step": 18, "train_runtime": 708.5725, "train_tokens_per_second": 99149.518 }, { "epoch": 0.0030206677265500795, "grad_norm": 0.33665215969085693, "learning_rate": 4.999992172099745e-05, "loss": 0.5976, "num_input_tokens_seen": 74259151, "step": 19, "train_runtime": 748.1783, "train_tokens_per_second": 99253.286 }, { "epoch": 0.003179650238473768, "grad_norm": 0.38171350955963135, "learning_rate": 4.9999887278262215e-05, "loss": 0.5962, "num_input_tokens_seen": 78087116, "step": 20, "train_runtime": 786.1418, "train_tokens_per_second": 99329.555 }, { "epoch": 0.003338632750397456, "grad_norm": 0.2831626236438751, "learning_rate": 4.999984657323187e-05, "loss": 0.5856, "num_input_tokens_seen": 81804163, "step": 21, "train_runtime": 826.3703, "train_tokens_per_second": 98992.137 }, { "epoch": 0.0034976152623211448, "grad_norm": 0.3148765563964844, "learning_rate": 4.999979960591662e-05, "loss": 0.5823, "num_input_tokens_seen": 85902084, "step": 22, "train_runtime": 867.4254, "train_tokens_per_second": 99031.087 }, { "epoch": 0.003656597774244833, "grad_norm": 0.28487080335617065, "learning_rate": 4.999974637632823e-05, "loss": 0.5933, "num_input_tokens_seen": 89860090, "step": 23, "train_runtime": 907.1517, "train_tokens_per_second": 99057.403 }, { "epoch": 0.0038155802861685214, "grad_norm": 0.2693890333175659, "learning_rate": 4.999968688448002e-05, "loss": 0.5738, "num_input_tokens_seen": 93645415, "step": 24, "train_runtime": 947.4809, "train_tokens_per_second": 98836.206 }, { "epoch": 0.00397456279809221, "grad_norm": 0.27097493410110474, "learning_rate": 4.9999621130386904e-05, "loss": 0.5778, "num_input_tokens_seen": 97500577, "step": 25, "train_runtime": 987.1128, "train_tokens_per_second": 98773.489 }, { "epoch": 0.004133545310015898, "grad_norm": 0.26419684290885925, "learning_rate": 4.9999549114065355e-05, "loss": 0.5905, "num_input_tokens_seen": 101471220, "step": 26, "train_runtime": 1026.7661, "train_tokens_per_second": 98826.036 }, { "epoch": 0.004292527821939587, "grad_norm": 0.8844764232635498, "learning_rate": 4.9999470835533404e-05, "loss": 0.5887, "num_input_tokens_seen": 105358977, "step": 27, "train_runtime": 1056.7197, "train_tokens_per_second": 99703.809 }, { "epoch": 0.004451510333863275, "grad_norm": 0.26903778314590454, "learning_rate": 4.999938629481067e-05, "loss": 0.5704, "num_input_tokens_seen": 109208436, "step": 28, "train_runtime": 1095.4251, "train_tokens_per_second": 99695.031 }, { "epoch": 0.004610492845786964, "grad_norm": 0.2739573121070862, "learning_rate": 4.9999295491918315e-05, "loss": 0.5582, "num_input_tokens_seen": 113189297, "step": 29, "train_runtime": 1135.3561, "train_tokens_per_second": 99694.974 }, { "epoch": 0.0047694753577106515, "grad_norm": 0.27148371934890747, "learning_rate": 4.9999198426879095e-05, "loss": 0.583, "num_input_tokens_seen": 117234409, "step": 30, "train_runtime": 1173.5321, "train_tokens_per_second": 99898.766 }, { "epoch": 0.00492845786963434, "grad_norm": 0.264448881149292, "learning_rate": 4.999909509971733e-05, "loss": 0.5673, "num_input_tokens_seen": 121045042, "step": 31, "train_runtime": 1210.565, "train_tokens_per_second": 99990.533 }, { "epoch": 0.005087440381558029, "grad_norm": 0.26043567061424255, "learning_rate": 4.999898551045888e-05, "loss": 0.5764, "num_input_tokens_seen": 124934166, "step": 32, "train_runtime": 1249.2444, "train_tokens_per_second": 100007.788 }, { "epoch": 0.005246422893481717, "grad_norm": 0.2543739378452301, "learning_rate": 4.999886965913123e-05, "loss": 0.5581, "num_input_tokens_seen": 128870400, "step": 33, "train_runtime": 1290.2772, "train_tokens_per_second": 99878.076 }, { "epoch": 0.005405405405405406, "grad_norm": 0.2515336573123932, "learning_rate": 4.999874754576337e-05, "loss": 0.5551, "num_input_tokens_seen": 132824868, "step": 34, "train_runtime": 1328.1413, "train_tokens_per_second": 100008.081 }, { "epoch": 0.005564387917329093, "grad_norm": 0.30589884519577026, "learning_rate": 4.99986191703859e-05, "loss": 0.5732, "num_input_tokens_seen": 136623140, "step": 35, "train_runtime": 1367.6023, "train_tokens_per_second": 99899.762 }, { "epoch": 0.005723370429252782, "grad_norm": 0.22217601537704468, "learning_rate": 4.999848453303098e-05, "loss": 0.5472, "num_input_tokens_seen": 140563385, "step": 36, "train_runtime": 1405.9497, "train_tokens_per_second": 99977.536 }, { "epoch": 0.0058823529411764705, "grad_norm": 0.24296316504478455, "learning_rate": 4.9998343633732334e-05, "loss": 0.5627, "num_input_tokens_seen": 144567946, "step": 37, "train_runtime": 1446.7175, "train_tokens_per_second": 99928.25 }, { "epoch": 0.006041335453100159, "grad_norm": 0.21859848499298096, "learning_rate": 4.999819647252525e-05, "loss": 0.5447, "num_input_tokens_seen": 148449337, "step": 38, "train_runtime": 1483.6407, "train_tokens_per_second": 100057.472 }, { "epoch": 0.006200317965023848, "grad_norm": 0.24427489936351776, "learning_rate": 4.9998043049446594e-05, "loss": 0.553, "num_input_tokens_seen": 152293660, "step": 39, "train_runtime": 1521.2528, "train_tokens_per_second": 100110.683 }, { "epoch": 0.006359300476947536, "grad_norm": 0.4473072290420532, "learning_rate": 4.999788336453481e-05, "loss": 0.5539, "num_input_tokens_seen": 156124179, "step": 40, "train_runtime": 1559.1295, "train_tokens_per_second": 100135.479 }, { "epoch": 0.006518282988871224, "grad_norm": 0.25702622532844543, "learning_rate": 4.9997717417829885e-05, "loss": 0.5498, "num_input_tokens_seen": 160101141, "step": 41, "train_runtime": 1600.5161, "train_tokens_per_second": 100030.944 }, { "epoch": 0.006677265500794912, "grad_norm": 0.22569040954113007, "learning_rate": 4.999754520937338e-05, "loss": 0.5277, "num_input_tokens_seen": 163996159, "step": 42, "train_runtime": 1637.1717, "train_tokens_per_second": 100170.41 }, { "epoch": 0.006836248012718601, "grad_norm": 0.24926422536373138, "learning_rate": 4.999736673920844e-05, "loss": 0.5536, "num_input_tokens_seen": 167852789, "step": 43, "train_runtime": 1674.3678, "train_tokens_per_second": 100248.457 }, { "epoch": 0.0069952305246422895, "grad_norm": 0.2586618959903717, "learning_rate": 4.9997182007379785e-05, "loss": 0.5479, "num_input_tokens_seen": 171776315, "step": 44, "train_runtime": 1715.7019, "train_tokens_per_second": 100120.14 }, { "epoch": 0.007154213036565978, "grad_norm": 0.23577344417572021, "learning_rate": 4.999699101393367e-05, "loss": 0.5191, "num_input_tokens_seen": 175690080, "step": 45, "train_runtime": 1757.5894, "train_tokens_per_second": 99960.819 }, { "epoch": 0.007313195548489666, "grad_norm": 0.2508201003074646, "learning_rate": 4.9996793758917936e-05, "loss": 0.5381, "num_input_tokens_seen": 179549644, "step": 46, "train_runtime": 1796.6195, "train_tokens_per_second": 99937.493 }, { "epoch": 0.007472178060413354, "grad_norm": 0.2429482340812683, "learning_rate": 4.9996590242382016e-05, "loss": 0.5431, "num_input_tokens_seen": 183433617, "step": 47, "train_runtime": 1836.3279, "train_tokens_per_second": 99891.538 }, { "epoch": 0.007631160572337043, "grad_norm": 0.23664875328540802, "learning_rate": 4.999638046437686e-05, "loss": 0.5275, "num_input_tokens_seen": 187256892, "step": 48, "train_runtime": 1874.4059, "train_tokens_per_second": 99901.995 }, { "epoch": 0.0077901430842607314, "grad_norm": 0.23554544150829315, "learning_rate": 4.9996164424955036e-05, "loss": 0.5259, "num_input_tokens_seen": 191256153, "step": 49, "train_runtime": 1914.6287, "train_tokens_per_second": 99892.034 }, { "epoch": 0.00794912559618442, "grad_norm": 0.2919403910636902, "learning_rate": 4.999594212417066e-05, "loss": 0.5348, "num_input_tokens_seen": 195107436, "step": 50, "train_runtime": 1950.2841, "train_tokens_per_second": 100040.521 }, { "epoch": 0.008108108108108109, "grad_norm": 0.25511494278907776, "learning_rate": 4.999571356207941e-05, "loss": 0.5331, "num_input_tokens_seen": 198895100, "step": 51, "train_runtime": 1989.7101, "train_tokens_per_second": 99961.85 }, { "epoch": 0.008267090620031796, "grad_norm": 0.23525705933570862, "learning_rate": 4.9995478738738544e-05, "loss": 0.5469, "num_input_tokens_seen": 202832289, "step": 52, "train_runtime": 2029.6408, "train_tokens_per_second": 99935.067 }, { "epoch": 0.008426073131955486, "grad_norm": 0.24957098066806793, "learning_rate": 4.999523765420687e-05, "loss": 0.538, "num_input_tokens_seen": 206822782, "step": 53, "train_runtime": 2070.0504, "train_tokens_per_second": 99911.952 }, { "epoch": 0.008585055643879173, "grad_norm": 0.31494370102882385, "learning_rate": 4.99949903085448e-05, "loss": 0.5259, "num_input_tokens_seen": 210682138, "step": 54, "train_runtime": 2107.8714, "train_tokens_per_second": 99950.187 }, { "epoch": 0.008744038155802861, "grad_norm": 0.6013438105583191, "learning_rate": 4.999473670181428e-05, "loss": 0.543, "num_input_tokens_seen": 214464457, "step": 55, "train_runtime": 2144.5, "train_tokens_per_second": 100006.742 }, { "epoch": 0.00890302066772655, "grad_norm": 0.3003457188606262, "learning_rate": 4.999447683407884e-05, "loss": 0.5289, "num_input_tokens_seen": 218363217, "step": 56, "train_runtime": 2210.657, "train_tokens_per_second": 98777.519 }, { "epoch": 0.009062003179650238, "grad_norm": 0.27745750546455383, "learning_rate": 4.999421070540357e-05, "loss": 0.5318, "num_input_tokens_seen": 222276029, "step": 57, "train_runtime": 2251.2688, "train_tokens_per_second": 98733.67 }, { "epoch": 0.009220985691573928, "grad_norm": 0.3279472589492798, "learning_rate": 4.999393831585513e-05, "loss": 0.5271, "num_input_tokens_seen": 226130802, "step": 58, "train_runtime": 2290.3729, "train_tokens_per_second": 98730.997 }, { "epoch": 0.009379968203497615, "grad_norm": 0.26945388317108154, "learning_rate": 4.9993659665501755e-05, "loss": 0.5339, "num_input_tokens_seen": 230101176, "step": 59, "train_runtime": 2330.016, "train_tokens_per_second": 98755.19 }, { "epoch": 0.009538950715421303, "grad_norm": 0.24990911781787872, "learning_rate": 4.999337475441326e-05, "loss": 0.5338, "num_input_tokens_seen": 233992454, "step": 60, "train_runtime": 2367.5476, "train_tokens_per_second": 98833.264 }, { "epoch": 0.009697933227344992, "grad_norm": 0.30303722620010376, "learning_rate": 4.9993083582660996e-05, "loss": 0.5336, "num_input_tokens_seen": 237966595, "step": 61, "train_runtime": 2407.0733, "train_tokens_per_second": 98861.382 }, { "epoch": 0.00985691573926868, "grad_norm": 0.2763914167881012, "learning_rate": 4.9992786150317904e-05, "loss": 0.5365, "num_input_tokens_seen": 241895878, "step": 62, "train_runtime": 2444.3011, "train_tokens_per_second": 98963.206 }, { "epoch": 0.01001589825119237, "grad_norm": 0.3031845688819885, "learning_rate": 4.999248245745849e-05, "loss": 0.534, "num_input_tokens_seen": 245846778, "step": 63, "train_runtime": 2485.2744, "train_tokens_per_second": 98921.382 }, { "epoch": 0.010174880763116057, "grad_norm": 0.29879263043403625, "learning_rate": 4.9992172504158824e-05, "loss": 0.5304, "num_input_tokens_seen": 249754207, "step": 64, "train_runtime": 2522.1962, "train_tokens_per_second": 99022.513 }, { "epoch": 0.010333863275039745, "grad_norm": 0.3269798755645752, "learning_rate": 4.9991856290496554e-05, "loss": 0.5231, "num_input_tokens_seen": 253680741, "step": 65, "train_runtime": 2560.6824, "train_tokens_per_second": 99067.633 }, { "epoch": 0.010492845786963434, "grad_norm": 0.30369651317596436, "learning_rate": 4.9991533816550875e-05, "loss": 0.5155, "num_input_tokens_seen": 257568392, "step": 66, "train_runtime": 2599.3285, "train_tokens_per_second": 99090.359 }, { "epoch": 0.010651828298887122, "grad_norm": 0.2837687134742737, "learning_rate": 4.999120508240258e-05, "loss": 0.5368, "num_input_tokens_seen": 261527085, "step": 67, "train_runtime": 2638.6381, "train_tokens_per_second": 99114.422 }, { "epoch": 0.010810810810810811, "grad_norm": 0.25034546852111816, "learning_rate": 4.999087008813401e-05, "loss": 0.5284, "num_input_tokens_seen": 265428698, "step": 68, "train_runtime": 2678.5467, "train_tokens_per_second": 99094.296 }, { "epoch": 0.010969793322734499, "grad_norm": 0.27507853507995605, "learning_rate": 4.999052883382908e-05, "loss": 0.5206, "num_input_tokens_seen": 269381509, "step": 69, "train_runtime": 2717.7299, "train_tokens_per_second": 99120.045 }, { "epoch": 0.011128775834658187, "grad_norm": 0.2671317756175995, "learning_rate": 4.999018131957326e-05, "loss": 0.5107, "num_input_tokens_seen": 273324435, "step": 70, "train_runtime": 2755.5156, "train_tokens_per_second": 99191.758 }, { "epoch": 0.011287758346581876, "grad_norm": 0.29595574736595154, "learning_rate": 4.9989827545453616e-05, "loss": 0.5175, "num_input_tokens_seen": 277192979, "step": 71, "train_runtime": 2796.2156, "train_tokens_per_second": 99131.475 }, { "epoch": 0.011446740858505564, "grad_norm": 0.2771182060241699, "learning_rate": 4.9989467511558755e-05, "loss": 0.5101, "num_input_tokens_seen": 281033746, "step": 72, "train_runtime": 2835.6577, "train_tokens_per_second": 99107.076 }, { "epoch": 0.011605723370429253, "grad_norm": 0.2681826055049896, "learning_rate": 4.9989101217978875e-05, "loss": 0.5141, "num_input_tokens_seen": 284963597, "step": 73, "train_runtime": 2875.3874, "train_tokens_per_second": 99104.42 }, { "epoch": 0.011764705882352941, "grad_norm": 0.24770720303058624, "learning_rate": 4.998872866480571e-05, "loss": 0.5166, "num_input_tokens_seen": 288970166, "step": 74, "train_runtime": 2914.568, "train_tokens_per_second": 99146.827 }, { "epoch": 0.01192368839427663, "grad_norm": 0.29981809854507446, "learning_rate": 4.99883498521326e-05, "loss": 0.5217, "num_input_tokens_seen": 292833855, "step": 75, "train_runtime": 2952.1905, "train_tokens_per_second": 99192.06 }, { "epoch": 0.012082670906200318, "grad_norm": 0.25812339782714844, "learning_rate": 4.998796478005443e-05, "loss": 0.5243, "num_input_tokens_seen": 296493022, "step": 76, "train_runtime": 2991.7193, "train_tokens_per_second": 99104.558 }, { "epoch": 0.012241653418124006, "grad_norm": 0.24962005019187927, "learning_rate": 4.998757344866765e-05, "loss": 0.5223, "num_input_tokens_seen": 300449977, "step": 77, "train_runtime": 3032.3958, "train_tokens_per_second": 99080.066 }, { "epoch": 0.012400635930047695, "grad_norm": 0.21624183654785156, "learning_rate": 4.99871758580703e-05, "loss": 0.5032, "num_input_tokens_seen": 304453613, "step": 78, "train_runtime": 3067.6964, "train_tokens_per_second": 99245.028 }, { "epoch": 0.012559618441971383, "grad_norm": 0.2529910206794739, "learning_rate": 4.998677200836196e-05, "loss": 0.5158, "num_input_tokens_seen": 308276804, "step": 79, "train_runtime": 3107.1396, "train_tokens_per_second": 99215.628 }, { "epoch": 0.012718600953895072, "grad_norm": 0.23895801603794098, "learning_rate": 4.9986361899643796e-05, "loss": 0.5143, "num_input_tokens_seen": 312189318, "step": 80, "train_runtime": 3146.5642, "train_tokens_per_second": 99215.938 }, { "epoch": 0.01287758346581876, "grad_norm": 0.2577573359012604, "learning_rate": 4.998594553201854e-05, "loss": 0.5165, "num_input_tokens_seen": 316158270, "step": 81, "train_runtime": 3186.4287, "train_tokens_per_second": 99220.254 }, { "epoch": 0.013036565977742448, "grad_norm": 0.34028664231300354, "learning_rate": 4.9985522905590494e-05, "loss": 0.5127, "num_input_tokens_seen": 320142099, "step": 82, "train_runtime": 3227.8271, "train_tokens_per_second": 99181.922 }, { "epoch": 0.013195548489666137, "grad_norm": 0.3424776494503021, "learning_rate": 4.998509402046551e-05, "loss": 0.5158, "num_input_tokens_seen": 324052579, "step": 83, "train_runtime": 3266.5771, "train_tokens_per_second": 99202.49 }, { "epoch": 0.013354531001589825, "grad_norm": 0.31988725066185, "learning_rate": 4.998465887675103e-05, "loss": 0.524, "num_input_tokens_seen": 327953037, "step": 84, "train_runtime": 3304.4921, "train_tokens_per_second": 99244.612 }, { "epoch": 0.013513513513513514, "grad_norm": 0.2914837896823883, "learning_rate": 4.998421747455605e-05, "loss": 0.5206, "num_input_tokens_seen": 331808376, "step": 85, "train_runtime": 3344.734, "train_tokens_per_second": 99203.218 }, { "epoch": 0.013672496025437202, "grad_norm": 0.2703953683376312, "learning_rate": 4.998376981399114e-05, "loss": 0.5158, "num_input_tokens_seen": 335704878, "step": 86, "train_runtime": 3381.0433, "train_tokens_per_second": 99290.322 }, { "epoch": 0.01383147853736089, "grad_norm": 0.310100257396698, "learning_rate": 4.9983315895168434e-05, "loss": 0.5158, "num_input_tokens_seen": 339580485, "step": 87, "train_runtime": 3419.719, "train_tokens_per_second": 99300.699 }, { "epoch": 0.013990461049284579, "grad_norm": 0.287628710269928, "learning_rate": 4.9982855718201644e-05, "loss": 0.5169, "num_input_tokens_seen": 343402857, "step": 88, "train_runtime": 3456.7496, "train_tokens_per_second": 99342.704 }, { "epoch": 0.014149443561208267, "grad_norm": 0.33464086055755615, "learning_rate": 4.998238928320603e-05, "loss": 0.5253, "num_input_tokens_seen": 347237434, "step": 89, "train_runtime": 3497.0007, "train_tokens_per_second": 99295.786 }, { "epoch": 0.014308426073131956, "grad_norm": 0.4412058889865875, "learning_rate": 4.9981916590298434e-05, "loss": 0.5054, "num_input_tokens_seen": 351020020, "step": 90, "train_runtime": 3536.707, "train_tokens_per_second": 99250.522 }, { "epoch": 0.014467408585055644, "grad_norm": 0.32403963804244995, "learning_rate": 4.998143763959726e-05, "loss": 0.5103, "num_input_tokens_seen": 354960725, "step": 91, "train_runtime": 3577.0123, "train_tokens_per_second": 99233.856 }, { "epoch": 0.014626391096979332, "grad_norm": 0.305324912071228, "learning_rate": 4.998095243122249e-05, "loss": 0.5135, "num_input_tokens_seen": 358831070, "step": 92, "train_runtime": 3615.2736, "train_tokens_per_second": 99254.195 }, { "epoch": 0.014785373608903021, "grad_norm": 0.38979262113571167, "learning_rate": 4.9980460965295665e-05, "loss": 0.5266, "num_input_tokens_seen": 362728472, "step": 93, "train_runtime": 3653.2672, "train_tokens_per_second": 99288.788 }, { "epoch": 0.014944356120826709, "grad_norm": 0.28156930208206177, "learning_rate": 4.9979963241939875e-05, "loss": 0.5109, "num_input_tokens_seen": 366622575, "step": 94, "train_runtime": 3693.1333, "train_tokens_per_second": 99271.417 }, { "epoch": 0.015103338632750398, "grad_norm": 0.26124143600463867, "learning_rate": 4.997945926127983e-05, "loss": 0.5146, "num_input_tokens_seen": 370640417, "step": 95, "train_runtime": 3733.5029, "train_tokens_per_second": 99274.173 }, { "epoch": 0.015262321144674086, "grad_norm": 0.2706736624240875, "learning_rate": 4.9978949023441736e-05, "loss": 0.5158, "num_input_tokens_seen": 374454960, "step": 96, "train_runtime": 3773.6262, "train_tokens_per_second": 99229.48 }, { "epoch": 0.015421303656597773, "grad_norm": 0.2923503816127777, "learning_rate": 4.997843252855343e-05, "loss": 0.509, "num_input_tokens_seen": 378199625, "step": 97, "train_runtime": 3812.8957, "train_tokens_per_second": 99189.608 }, { "epoch": 0.015580286168521463, "grad_norm": 0.3229970335960388, "learning_rate": 4.9977909776744276e-05, "loss": 0.508, "num_input_tokens_seen": 382094147, "step": 98, "train_runtime": 3853.2498, "train_tokens_per_second": 99161.53 }, { "epoch": 0.015739268680445152, "grad_norm": 0.2826981842517853, "learning_rate": 4.997738076814523e-05, "loss": 0.502, "num_input_tokens_seen": 386108760, "step": 99, "train_runtime": 3891.3564, "train_tokens_per_second": 99222.152 }, { "epoch": 0.01589825119236884, "grad_norm": 0.32359620928764343, "learning_rate": 4.9976845502888796e-05, "loss": 0.5133, "num_input_tokens_seen": 390092423, "step": 100, "train_runtime": 3930.1984, "train_tokens_per_second": 99255.148 }, { "epoch": 0.016057233704292528, "grad_norm": 0.3871191143989563, "learning_rate": 4.997630398110906e-05, "loss": 0.5015, "num_input_tokens_seen": 393758069, "step": 101, "train_runtime": 3969.9735, "train_tokens_per_second": 99184.054 }, { "epoch": 0.016216216216216217, "grad_norm": 0.2820076048374176, "learning_rate": 4.997575620294168e-05, "loss": 0.5018, "num_input_tokens_seen": 397730882, "step": 102, "train_runtime": 4009.8885, "train_tokens_per_second": 99187.516 }, { "epoch": 0.016375198728139903, "grad_norm": 0.32991456985473633, "learning_rate": 4.9975202168523847e-05, "loss": 0.494, "num_input_tokens_seen": 401746438, "step": 103, "train_runtime": 4047.8971, "train_tokens_per_second": 99248.185 }, { "epoch": 0.016534181240063592, "grad_norm": 0.44391581416130066, "learning_rate": 4.997464187799435e-05, "loss": 0.5157, "num_input_tokens_seen": 405588668, "step": 104, "train_runtime": 4089.3282, "train_tokens_per_second": 99182.224 }, { "epoch": 0.016693163751987282, "grad_norm": 0.46282175183296204, "learning_rate": 4.997407533149355e-05, "loss": 0.4998, "num_input_tokens_seen": 409444489, "step": 105, "train_runtime": 4127.3869, "train_tokens_per_second": 99201.868 }, { "epoch": 0.01685214626391097, "grad_norm": 0.42842569947242737, "learning_rate": 4.997350252916335e-05, "loss": 0.5019, "num_input_tokens_seen": 413522657, "step": 106, "train_runtime": 4166.0115, "train_tokens_per_second": 99261.047 }, { "epoch": 0.017011128775834657, "grad_norm": 0.3515016734600067, "learning_rate": 4.997292347114725e-05, "loss": 0.509, "num_input_tokens_seen": 417389012, "step": 107, "train_runtime": 4204.0528, "train_tokens_per_second": 99282.534 }, { "epoch": 0.017170111287758347, "grad_norm": 0.31876495480537415, "learning_rate": 4.9972338157590276e-05, "loss": 0.5155, "num_input_tokens_seen": 421111057, "step": 108, "train_runtime": 4242.9562, "train_tokens_per_second": 99249.446 }, { "epoch": 0.017329093799682036, "grad_norm": 0.3272643983364105, "learning_rate": 4.997174658863906e-05, "loss": 0.5013, "num_input_tokens_seen": 425117334, "step": 109, "train_runtime": 4284.2796, "train_tokens_per_second": 99227.261 }, { "epoch": 0.017488076311605722, "grad_norm": 0.3364606201648712, "learning_rate": 4.997114876444179e-05, "loss": 0.5005, "num_input_tokens_seen": 429063005, "step": 110, "train_runtime": 4321.9479, "train_tokens_per_second": 99275.377 }, { "epoch": 0.01764705882352941, "grad_norm": 0.36990299820899963, "learning_rate": 4.99705446851482e-05, "loss": 0.5, "num_input_tokens_seen": 432883039, "step": 111, "train_runtime": 4361.5373, "train_tokens_per_second": 99250.106 }, { "epoch": 0.0178060413354531, "grad_norm": 0.342981219291687, "learning_rate": 4.996993435090962e-05, "loss": 0.4946, "num_input_tokens_seen": 436690657, "step": 112, "train_runtime": 4399.3526, "train_tokens_per_second": 99262.483 }, { "epoch": 0.017965023847376787, "grad_norm": 0.3124958574771881, "learning_rate": 4.996931776187893e-05, "loss": 0.5133, "num_input_tokens_seen": 440754322, "step": 113, "train_runtime": 4436.6513, "train_tokens_per_second": 99343.917 }, { "epoch": 0.018124006359300476, "grad_norm": 0.27988553047180176, "learning_rate": 4.996869491821058e-05, "loss": 0.4987, "num_input_tokens_seen": 444495960, "step": 114, "train_runtime": 4475.6148, "train_tokens_per_second": 99315.061 }, { "epoch": 0.018282988871224166, "grad_norm": 0.2587326765060425, "learning_rate": 4.99680658200606e-05, "loss": 0.5201, "num_input_tokens_seen": 448402794, "step": 115, "train_runtime": 4514.6911, "train_tokens_per_second": 99320.814 }, { "epoch": 0.018441971383147855, "grad_norm": 0.28642454743385315, "learning_rate": 4.9967430467586555e-05, "loss": 0.5074, "num_input_tokens_seen": 452346296, "step": 116, "train_runtime": 4553.4886, "train_tokens_per_second": 99340.601 }, { "epoch": 0.01860095389507154, "grad_norm": 0.3567616045475006, "learning_rate": 4.996678886094761e-05, "loss": 0.5063, "num_input_tokens_seen": 456225019, "step": 117, "train_runtime": 4591.6744, "train_tokens_per_second": 99359.183 }, { "epoch": 0.01875993640699523, "grad_norm": 0.30416253209114075, "learning_rate": 4.9966141000304486e-05, "loss": 0.5082, "num_input_tokens_seen": 460148062, "step": 118, "train_runtime": 4632.059, "train_tokens_per_second": 99339.854 }, { "epoch": 0.01891891891891892, "grad_norm": 0.2728883624076843, "learning_rate": 4.9965486885819456e-05, "loss": 0.5027, "num_input_tokens_seen": 463935001, "step": 119, "train_runtime": 4672.8492, "train_tokens_per_second": 99283.109 }, { "epoch": 0.019077901430842606, "grad_norm": 0.2850625514984131, "learning_rate": 4.996482651765637e-05, "loss": 0.5127, "num_input_tokens_seen": 468025076, "step": 120, "train_runtime": 4711.8127, "train_tokens_per_second": 99330.153 }, { "epoch": 0.019236883942766295, "grad_norm": 0.29963284730911255, "learning_rate": 4.9964159895980663e-05, "loss": 0.4934, "num_input_tokens_seen": 471987161, "step": 121, "train_runtime": 4750.3609, "train_tokens_per_second": 99358.169 }, { "epoch": 0.019395866454689985, "grad_norm": 0.3030235469341278, "learning_rate": 4.9963487020959296e-05, "loss": 0.5032, "num_input_tokens_seen": 475694962, "step": 122, "train_runtime": 4788.5779, "train_tokens_per_second": 99339.505 }, { "epoch": 0.019554848966613674, "grad_norm": 0.34874218702316284, "learning_rate": 4.996280789276084e-05, "loss": 0.5095, "num_input_tokens_seen": 479454226, "step": 123, "train_runtime": 4827.7252, "train_tokens_per_second": 99312.659 }, { "epoch": 0.01971383147853736, "grad_norm": 0.3210648000240326, "learning_rate": 4.99621225115554e-05, "loss": 0.5162, "num_input_tokens_seen": 483546720, "step": 124, "train_runtime": 4867.2346, "train_tokens_per_second": 99347.321 }, { "epoch": 0.01987281399046105, "grad_norm": 0.28254151344299316, "learning_rate": 4.9961430877514656e-05, "loss": 0.4939, "num_input_tokens_seen": 487367924, "step": 125, "train_runtime": 4906.204, "train_tokens_per_second": 99337.068 }, { "epoch": 0.02003179650238474, "grad_norm": 0.6571413278579712, "learning_rate": 4.9960732990811876e-05, "loss": 0.504, "num_input_tokens_seen": 491165374, "step": 126, "train_runtime": 4945.6871, "train_tokens_per_second": 99311.857 }, { "epoch": 0.020190779014308425, "grad_norm": 0.2870993912220001, "learning_rate": 4.996002885162185e-05, "loss": 0.4942, "num_input_tokens_seen": 495147140, "step": 127, "train_runtime": 4986.7698, "train_tokens_per_second": 99292.16 }, { "epoch": 0.020349761526232114, "grad_norm": 0.3054683804512024, "learning_rate": 4.9959318460120986e-05, "loss": 0.4998, "num_input_tokens_seen": 499079535, "step": 128, "train_runtime": 5027.7628, "train_tokens_per_second": 99264.733 }, { "epoch": 0.020508744038155804, "grad_norm": 0.28957775235176086, "learning_rate": 4.9958601816487214e-05, "loss": 0.5053, "num_input_tokens_seen": 502830104, "step": 129, "train_runtime": 5069.0167, "train_tokens_per_second": 99196.773 }, { "epoch": 0.02066772655007949, "grad_norm": 0.3086756467819214, "learning_rate": 4.9957878920900046e-05, "loss": 0.5035, "num_input_tokens_seen": 506738728, "step": 130, "train_runtime": 5110.1259, "train_tokens_per_second": 99163.649 }, { "epoch": 0.02082670906200318, "grad_norm": 0.3282427191734314, "learning_rate": 4.995714977354058e-05, "loss": 0.4913, "num_input_tokens_seen": 510669090, "step": 131, "train_runtime": 5150.1039, "train_tokens_per_second": 99157.047 }, { "epoch": 0.02098569157392687, "grad_norm": 0.3335191607475281, "learning_rate": 4.9956414374591444e-05, "loss": 0.4996, "num_input_tokens_seen": 514504580, "step": 132, "train_runtime": 5190.4505, "train_tokens_per_second": 99125.227 }, { "epoch": 0.021144674085850558, "grad_norm": 0.2915280759334564, "learning_rate": 4.995567272423686e-05, "loss": 0.5157, "num_input_tokens_seen": 518310514, "step": 133, "train_runtime": 5231.9683, "train_tokens_per_second": 99066.065 }, { "epoch": 0.021303656597774244, "grad_norm": 0.320949912071228, "learning_rate": 4.995492482266261e-05, "loss": 0.5014, "num_input_tokens_seen": 522240938, "step": 134, "train_runtime": 5268.7893, "train_tokens_per_second": 99119.724 }, { "epoch": 0.021462639109697933, "grad_norm": 0.34813204407691956, "learning_rate": 4.995417067005602e-05, "loss": 0.5068, "num_input_tokens_seen": 526159652, "step": 135, "train_runtime": 5308.1182, "train_tokens_per_second": 99123.574 }, { "epoch": 0.021621621621621623, "grad_norm": 0.302681028842926, "learning_rate": 4.995341026660603e-05, "loss": 0.5005, "num_input_tokens_seen": 530074653, "step": 136, "train_runtime": 5346.5984, "train_tokens_per_second": 99142.41 }, { "epoch": 0.02178060413354531, "grad_norm": 0.31214621663093567, "learning_rate": 4.995264361250308e-05, "loss": 0.5189, "num_input_tokens_seen": 533986607, "step": 137, "train_runtime": 5385.9083, "train_tokens_per_second": 99145.135 }, { "epoch": 0.021939586645468998, "grad_norm": 0.29969730973243713, "learning_rate": 4.9951870707939244e-05, "loss": 0.4996, "num_input_tokens_seen": 537924224, "step": 138, "train_runtime": 5425.6834, "train_tokens_per_second": 99144.049 }, { "epoch": 0.022098569157392688, "grad_norm": 0.31619971990585327, "learning_rate": 4.9951091553108095e-05, "loss": 0.5066, "num_input_tokens_seen": 541727991, "step": 139, "train_runtime": 5465.1272, "train_tokens_per_second": 99124.498 }, { "epoch": 0.022257551669316374, "grad_norm": 0.30923986434936523, "learning_rate": 4.995030614820484e-05, "loss": 0.493, "num_input_tokens_seen": 545598715, "step": 140, "train_runtime": 5503.5904, "train_tokens_per_second": 99135.05 }, { "epoch": 0.022416534181240063, "grad_norm": 0.3114434778690338, "learning_rate": 4.99495144934262e-05, "loss": 0.4993, "num_input_tokens_seen": 549593528, "step": 141, "train_runtime": 5543.6381, "train_tokens_per_second": 99139.503 }, { "epoch": 0.022575516693163752, "grad_norm": 0.31035202741622925, "learning_rate": 4.994871658897048e-05, "loss": 0.4937, "num_input_tokens_seen": 553504972, "step": 142, "train_runtime": 5582.497, "train_tokens_per_second": 99150.071 }, { "epoch": 0.022734499205087442, "grad_norm": 0.33349621295928955, "learning_rate": 4.994791243503756e-05, "loss": 0.4957, "num_input_tokens_seen": 557313860, "step": 143, "train_runtime": 5622.6743, "train_tokens_per_second": 99119.002 }, { "epoch": 0.022893481717011128, "grad_norm": 0.3287511467933655, "learning_rate": 4.994710203182885e-05, "loss": 0.4919, "num_input_tokens_seen": 561263266, "step": 144, "train_runtime": 5662.4284, "train_tokens_per_second": 99120.594 }, { "epoch": 0.023052464228934817, "grad_norm": 0.2837793529033661, "learning_rate": 4.994628537954738e-05, "loss": 0.5073, "num_input_tokens_seen": 565176862, "step": 145, "train_runtime": 5699.0044, "train_tokens_per_second": 99171.158 }, { "epoch": 0.023211446740858507, "grad_norm": 0.2685563266277313, "learning_rate": 4.994546247839769e-05, "loss": 0.4859, "num_input_tokens_seen": 568995127, "step": 146, "train_runtime": 5735.9847, "train_tokens_per_second": 99197.463 }, { "epoch": 0.023370429252782193, "grad_norm": 0.2681172788143158, "learning_rate": 4.9944633328585933e-05, "loss": 0.5093, "num_input_tokens_seen": 572931784, "step": 147, "train_runtime": 5775.7137, "train_tokens_per_second": 99196.708 }, { "epoch": 0.023529411764705882, "grad_norm": 0.30355575680732727, "learning_rate": 4.9943797930319786e-05, "loss": 0.4947, "num_input_tokens_seen": 576900405, "step": 148, "train_runtime": 5812.5379, "train_tokens_per_second": 99251.035 }, { "epoch": 0.02368839427662957, "grad_norm": 0.3414088487625122, "learning_rate": 4.9942956283808525e-05, "loss": 0.5116, "num_input_tokens_seen": 580812168, "step": 149, "train_runtime": 5850.3786, "train_tokens_per_second": 99277.706 }, { "epoch": 0.02384737678855326, "grad_norm": 0.37295496463775635, "learning_rate": 4.994210838926296e-05, "loss": 0.5038, "num_input_tokens_seen": 584531240, "step": 150, "train_runtime": 5891.6497, "train_tokens_per_second": 99213.509 }, { "epoch": 0.024006359300476947, "grad_norm": 0.3053302764892578, "learning_rate": 4.994125424689551e-05, "loss": 0.5026, "num_input_tokens_seen": 588621517, "step": 151, "train_runtime": 5928.2097, "train_tokens_per_second": 99291.616 }, { "epoch": 0.024165341812400636, "grad_norm": 0.2779223918914795, "learning_rate": 4.994039385692011e-05, "loss": 0.4827, "num_input_tokens_seen": 592601406, "step": 152, "train_runtime": 5966.7287, "train_tokens_per_second": 99317.639 }, { "epoch": 0.024324324324324326, "grad_norm": 0.32714489102363586, "learning_rate": 4.9939527219552274e-05, "loss": 0.5184, "num_input_tokens_seen": 596445202, "step": 153, "train_runtime": 6006.8526, "train_tokens_per_second": 99294.131 }, { "epoch": 0.02448330683624801, "grad_norm": 0.3405693769454956, "learning_rate": 4.993865433500912e-05, "loss": 0.4891, "num_input_tokens_seen": 600374469, "step": 154, "train_runtime": 6044.4637, "train_tokens_per_second": 99326.343 }, { "epoch": 0.0246422893481717, "grad_norm": 0.35723572969436646, "learning_rate": 4.993777520350926e-05, "loss": 0.4931, "num_input_tokens_seen": 604380702, "step": 155, "train_runtime": 6082.7454, "train_tokens_per_second": 99359.855 }, { "epoch": 0.02480127186009539, "grad_norm": 0.3912000358104706, "learning_rate": 4.9936889825272945e-05, "loss": 0.4726, "num_input_tokens_seen": 608175913, "step": 156, "train_runtime": 6123.385, "train_tokens_per_second": 99320.215 }, { "epoch": 0.024960254372019076, "grad_norm": 0.31630992889404297, "learning_rate": 4.993599820052194e-05, "loss": 0.5047, "num_input_tokens_seen": 612074833, "step": 157, "train_runtime": 6163.7843, "train_tokens_per_second": 99301.793 }, { "epoch": 0.025119236883942766, "grad_norm": 0.4234021306037903, "learning_rate": 4.993510032947959e-05, "loss": 0.495, "num_input_tokens_seen": 616050694, "step": 158, "train_runtime": 6201.5503, "train_tokens_per_second": 99338.175 }, { "epoch": 0.025278219395866455, "grad_norm": 0.35685473680496216, "learning_rate": 4.993419621237081e-05, "loss": 0.4977, "num_input_tokens_seen": 619782300, "step": 159, "train_runtime": 6241.0318, "train_tokens_per_second": 99307.666 }, { "epoch": 0.025437201907790145, "grad_norm": 0.3082249164581299, "learning_rate": 4.9933285849422065e-05, "loss": 0.4956, "num_input_tokens_seen": 623696367, "step": 160, "train_runtime": 6279.7596, "train_tokens_per_second": 99318.51 }, { "epoch": 0.02559618441971383, "grad_norm": 0.3734365403652191, "learning_rate": 4.9932369240861406e-05, "loss": 0.4943, "num_input_tokens_seen": 627517276, "step": 161, "train_runtime": 6319.6378, "train_tokens_per_second": 99296.399 }, { "epoch": 0.02575516693163752, "grad_norm": 0.3075200617313385, "learning_rate": 4.993144638691844e-05, "loss": 0.5, "num_input_tokens_seen": 631502142, "step": 162, "train_runtime": 6357.4704, "train_tokens_per_second": 99332.297 }, { "epoch": 0.02591414944356121, "grad_norm": 0.5280249714851379, "learning_rate": 4.993051728782432e-05, "loss": 0.5032, "num_input_tokens_seen": 635357192, "step": 163, "train_runtime": 6400.4311, "train_tokens_per_second": 99267.875 }, { "epoch": 0.026073131955484895, "grad_norm": 0.35317057371139526, "learning_rate": 4.9929581943811786e-05, "loss": 0.5062, "num_input_tokens_seen": 639253604, "step": 164, "train_runtime": 6439.221, "train_tokens_per_second": 99274.99 }, { "epoch": 0.026232114467408585, "grad_norm": 0.30038896203041077, "learning_rate": 4.992864035511514e-05, "loss": 0.4884, "num_input_tokens_seen": 643229973, "step": 165, "train_runtime": 6479.6349, "train_tokens_per_second": 99269.478 }, { "epoch": 0.026391096979332274, "grad_norm": 0.27658703923225403, "learning_rate": 4.9927692521970235e-05, "loss": 0.4839, "num_input_tokens_seen": 647229717, "step": 166, "train_runtime": 6519.864, "train_tokens_per_second": 99270.432 }, { "epoch": 0.02655007949125596, "grad_norm": 0.2860468924045563, "learning_rate": 4.9926738444614514e-05, "loss": 0.5021, "num_input_tokens_seen": 651102085, "step": 167, "train_runtime": 6559.3239, "train_tokens_per_second": 99263.597 }, { "epoch": 0.02670906200317965, "grad_norm": 0.3634198009967804, "learning_rate": 4.992577812328694e-05, "loss": 0.4872, "num_input_tokens_seen": 654933344, "step": 168, "train_runtime": 6598.1745, "train_tokens_per_second": 99259.779 }, { "epoch": 0.02686804451510334, "grad_norm": 0.3112046718597412, "learning_rate": 4.992481155822809e-05, "loss": 0.4948, "num_input_tokens_seen": 658839882, "step": 169, "train_runtime": 6638.9943, "train_tokens_per_second": 99237.905 }, { "epoch": 0.02702702702702703, "grad_norm": 0.3964279294013977, "learning_rate": 4.992383874968007e-05, "loss": 0.5002, "num_input_tokens_seen": 662858415, "step": 170, "train_runtime": 6678.3794, "train_tokens_per_second": 99254.381 }, { "epoch": 0.027186009538950714, "grad_norm": 0.3215843737125397, "learning_rate": 4.9922859697886556e-05, "loss": 0.4661, "num_input_tokens_seen": 666645375, "step": 171, "train_runtime": 6719.2888, "train_tokens_per_second": 99213.681 }, { "epoch": 0.027344992050874404, "grad_norm": 0.2753041982650757, "learning_rate": 4.992187440309281e-05, "loss": 0.5135, "num_input_tokens_seen": 670590472, "step": 172, "train_runtime": 6760.312, "train_tokens_per_second": 99195.195 }, { "epoch": 0.027503974562798093, "grad_norm": 0.32394081354141235, "learning_rate": 4.9920882865545634e-05, "loss": 0.4958, "num_input_tokens_seen": 674568715, "step": 173, "train_runtime": 6800.8027, "train_tokens_per_second": 99189.572 }, { "epoch": 0.02766295707472178, "grad_norm": 0.34483852982521057, "learning_rate": 4.9919885085493395e-05, "loss": 0.4996, "num_input_tokens_seen": 678468491, "step": 174, "train_runtime": 6840.499, "train_tokens_per_second": 99184.063 }, { "epoch": 0.02782193958664547, "grad_norm": 0.3316004276275635, "learning_rate": 4.991888106318604e-05, "loss": 0.5065, "num_input_tokens_seen": 682353777, "step": 175, "train_runtime": 6879.5717, "train_tokens_per_second": 99185.502 }, { "epoch": 0.027980922098569158, "grad_norm": 0.23503349721431732, "learning_rate": 4.9917870798875055e-05, "loss": 0.4949, "num_input_tokens_seen": 686242899, "step": 176, "train_runtime": 6918.3229, "train_tokens_per_second": 99192.089 }, { "epoch": 0.028139904610492844, "grad_norm": 0.27585285902023315, "learning_rate": 4.991685429281352e-05, "loss": 0.494, "num_input_tokens_seen": 690122842, "step": 177, "train_runtime": 6955.5678, "train_tokens_per_second": 99218.765 }, { "epoch": 0.028298887122416534, "grad_norm": 0.3122688829898834, "learning_rate": 4.991583154525605e-05, "loss": 0.503, "num_input_tokens_seen": 694032870, "step": 178, "train_runtime": 6994.7087, "train_tokens_per_second": 99222.555 }, { "epoch": 0.028457869634340223, "grad_norm": 0.30542707443237305, "learning_rate": 4.991480255645885e-05, "loss": 0.493, "num_input_tokens_seen": 697861617, "step": 179, "train_runtime": 7034.9158, "train_tokens_per_second": 99199.711 }, { "epoch": 0.028616852146263912, "grad_norm": 0.32333680987358093, "learning_rate": 4.991376732667966e-05, "loss": 0.4954, "num_input_tokens_seen": 701832965, "step": 180, "train_runtime": 7072.562, "train_tokens_per_second": 99233.2 }, { "epoch": 0.0287758346581876, "grad_norm": 0.29857486486434937, "learning_rate": 4.991272585617781e-05, "loss": 0.5057, "num_input_tokens_seen": 705775104, "step": 181, "train_runtime": 7110.8164, "train_tokens_per_second": 99253.738 }, { "epoch": 0.028934817170111288, "grad_norm": 0.3777329623699188, "learning_rate": 4.991167814521417e-05, "loss": 0.5007, "num_input_tokens_seen": 709531513, "step": 182, "train_runtime": 7150.3287, "train_tokens_per_second": 99230.615 }, { "epoch": 0.029093799682034977, "grad_norm": 0.3991350829601288, "learning_rate": 4.991062419405119e-05, "loss": 0.4902, "num_input_tokens_seen": 713446761, "step": 183, "train_runtime": 7190.0112, "train_tokens_per_second": 99227.49 }, { "epoch": 0.029252782193958663, "grad_norm": 0.3278700113296509, "learning_rate": 4.990956400295288e-05, "loss": 0.4769, "num_input_tokens_seen": 717338298, "step": 184, "train_runtime": 7228.2389, "train_tokens_per_second": 99241.089 }, { "epoch": 0.029411764705882353, "grad_norm": 0.2884068787097931, "learning_rate": 4.9908497572184807e-05, "loss": 0.5028, "num_input_tokens_seen": 721244476, "step": 185, "train_runtime": 7268.8493, "train_tokens_per_second": 99224.024 }, { "epoch": 0.029570747217806042, "grad_norm": 0.2841215133666992, "learning_rate": 4.99074249020141e-05, "loss": 0.497, "num_input_tokens_seen": 725109678, "step": 186, "train_runtime": 7308.0539, "train_tokens_per_second": 99220.625 }, { "epoch": 0.02972972972972973, "grad_norm": 0.31714382767677307, "learning_rate": 4.9906345992709465e-05, "loss": 0.5037, "num_input_tokens_seen": 729162434, "step": 187, "train_runtime": 7348.7549, "train_tokens_per_second": 99222.581 }, { "epoch": 0.029888712241653417, "grad_norm": 0.331468790769577, "learning_rate": 4.9905260844541155e-05, "loss": 0.4841, "num_input_tokens_seen": 733007902, "step": 188, "train_runtime": 7387.9255, "train_tokens_per_second": 99217.013 }, { "epoch": 0.030047694753577107, "grad_norm": 0.26704302430152893, "learning_rate": 4.9904169457780994e-05, "loss": 0.4877, "num_input_tokens_seen": 736845716, "step": 189, "train_runtime": 7427.1871, "train_tokens_per_second": 99209.257 }, { "epoch": 0.030206677265500796, "grad_norm": 0.31829607486724854, "learning_rate": 4.990307183270235e-05, "loss": 0.4857, "num_input_tokens_seen": 740693977, "step": 190, "train_runtime": 7468.0821, "train_tokens_per_second": 99181.284 }, { "epoch": 0.030365659777424482, "grad_norm": 0.37651294469833374, "learning_rate": 4.9901967969580204e-05, "loss": 0.4886, "num_input_tokens_seen": 744794980, "step": 191, "train_runtime": 7509.1688, "train_tokens_per_second": 99184.743 }, { "epoch": 0.03052464228934817, "grad_norm": 0.269993394613266, "learning_rate": 4.990085786869104e-05, "loss": 0.4852, "num_input_tokens_seen": 748686690, "step": 192, "train_runtime": 7546.9075, "train_tokens_per_second": 99204.435 }, { "epoch": 0.03068362480127186, "grad_norm": 0.29989832639694214, "learning_rate": 4.989974153031294e-05, "loss": 0.4914, "num_input_tokens_seen": 752584497, "step": 193, "train_runtime": 7592.4716, "train_tokens_per_second": 99122.464 }, { "epoch": 0.030842607313195547, "grad_norm": 0.29281941056251526, "learning_rate": 4.989861895472554e-05, "loss": 0.4836, "num_input_tokens_seen": 756468467, "step": 194, "train_runtime": 7632.2089, "train_tokens_per_second": 99115.273 }, { "epoch": 0.031001589825119236, "grad_norm": 0.2637275755405426, "learning_rate": 4.989749014221003e-05, "loss": 0.4828, "num_input_tokens_seen": 760444249, "step": 195, "train_runtime": 7676.2979, "train_tokens_per_second": 99063.931 }, { "epoch": 0.031160572337042926, "grad_norm": 0.32027530670166016, "learning_rate": 4.9896355093049174e-05, "loss": 0.4784, "num_input_tokens_seen": 764233028, "step": 196, "train_runtime": 7715.5069, "train_tokens_per_second": 99051.564 }, { "epoch": 0.03131955484896661, "grad_norm": 0.2918871343135834, "learning_rate": 4.9895213807527295e-05, "loss": 0.4799, "num_input_tokens_seen": 768133831, "step": 197, "train_runtime": 7755.8307, "train_tokens_per_second": 99039.531 }, { "epoch": 0.031478537360890305, "grad_norm": 0.28709009289741516, "learning_rate": 4.989406628593028e-05, "loss": 0.4856, "num_input_tokens_seen": 772061210, "step": 198, "train_runtime": 7796.631, "train_tokens_per_second": 99024.978 }, { "epoch": 0.03163751987281399, "grad_norm": 0.28811225295066833, "learning_rate": 4.989291252854556e-05, "loss": 0.4832, "num_input_tokens_seen": 776068525, "step": 199, "train_runtime": 7835.2716, "train_tokens_per_second": 99048.069 }, { "epoch": 0.03179650238473768, "grad_norm": 0.2805559039115906, "learning_rate": 4.989175253566216e-05, "loss": 0.4861, "num_input_tokens_seen": 779943854, "step": 200, "train_runtime": 7875.0035, "train_tokens_per_second": 99040.446 }, { "epoch": 0.03195548489666137, "grad_norm": 0.34363746643066406, "learning_rate": 4.989058630757064e-05, "loss": 0.4896, "num_input_tokens_seen": 783767929, "step": 201, "train_runtime": 8018.2401, "train_tokens_per_second": 97748.125 }, { "epoch": 0.032114467408585055, "grad_norm": 0.2678070366382599, "learning_rate": 4.988941384456314e-05, "loss": 0.4799, "num_input_tokens_seen": 787666379, "step": 202, "train_runtime": 8059.4219, "train_tokens_per_second": 97732.367 }, { "epoch": 0.03227344992050874, "grad_norm": 0.28692278265953064, "learning_rate": 4.9888235146933345e-05, "loss": 0.5031, "num_input_tokens_seen": 791493428, "step": 203, "train_runtime": 8098.7757, "train_tokens_per_second": 97730.009 }, { "epoch": 0.032432432432432434, "grad_norm": 0.24904213845729828, "learning_rate": 4.9887050214976524e-05, "loss": 0.4934, "num_input_tokens_seen": 795448608, "step": 204, "train_runtime": 8139.1894, "train_tokens_per_second": 97730.692 }, { "epoch": 0.03259141494435612, "grad_norm": 0.2856512665748596, "learning_rate": 4.988585904898948e-05, "loss": 0.4826, "num_input_tokens_seen": 799384795, "step": 205, "train_runtime": 8176.8492, "train_tokens_per_second": 97761.959 }, { "epoch": 0.032750397456279806, "grad_norm": 0.29869338870048523, "learning_rate": 4.9884661649270595e-05, "loss": 0.494, "num_input_tokens_seen": 803160630, "step": 206, "train_runtime": 8217.9779, "train_tokens_per_second": 97732.148 }, { "epoch": 0.0329093799682035, "grad_norm": 0.31074997782707214, "learning_rate": 4.988345801611981e-05, "loss": 0.4853, "num_input_tokens_seen": 807011375, "step": 207, "train_runtime": 8255.6865, "train_tokens_per_second": 97752.183 }, { "epoch": 0.033068362480127185, "grad_norm": 0.3570340871810913, "learning_rate": 4.988224814983863e-05, "loss": 0.4713, "num_input_tokens_seen": 810901531, "step": 208, "train_runtime": 8295.2911, "train_tokens_per_second": 97754.439 }, { "epoch": 0.03322734499205087, "grad_norm": 0.22975996136665344, "learning_rate": 4.988103205073011e-05, "loss": 0.4985, "num_input_tokens_seen": 814775429, "step": 209, "train_runtime": 8335.7885, "train_tokens_per_second": 97744.254 }, { "epoch": 0.033386327503974564, "grad_norm": 0.24255706369876862, "learning_rate": 4.987980971909888e-05, "loss": 0.4825, "num_input_tokens_seen": 818679039, "step": 210, "train_runtime": 8375.9548, "train_tokens_per_second": 97741.578 }, { "epoch": 0.03354531001589825, "grad_norm": 0.23595936596393585, "learning_rate": 4.987858115525113e-05, "loss": 0.4842, "num_input_tokens_seen": 822592027, "step": 211, "train_runtime": 8415.8919, "train_tokens_per_second": 97742.703 }, { "epoch": 0.03370429252782194, "grad_norm": 0.24735520780086517, "learning_rate": 4.9877346359494596e-05, "loss": 0.4905, "num_input_tokens_seen": 826476483, "step": 212, "train_runtime": 8454.1614, "train_tokens_per_second": 97759.724 }, { "epoch": 0.03386327503974563, "grad_norm": 0.3602750599384308, "learning_rate": 4.9876105332138595e-05, "loss": 0.4993, "num_input_tokens_seen": 830447524, "step": 213, "train_runtime": 8490.2618, "train_tokens_per_second": 97811.769 }, { "epoch": 0.034022257551669315, "grad_norm": 0.23918628692626953, "learning_rate": 4.987485807349399e-05, "loss": 0.4976, "num_input_tokens_seen": 834243035, "step": 214, "train_runtime": 8527.7285, "train_tokens_per_second": 97827.11 }, { "epoch": 0.03418124006359301, "grad_norm": 0.23701049387454987, "learning_rate": 4.9873604583873215e-05, "loss": 0.4769, "num_input_tokens_seen": 838114283, "step": 215, "train_runtime": 8563.8787, "train_tokens_per_second": 97866.202 }, { "epoch": 0.03434022257551669, "grad_norm": 0.2561703026294708, "learning_rate": 4.987234486359025e-05, "loss": 0.4771, "num_input_tokens_seen": 842030484, "step": 216, "train_runtime": 8603.2061, "train_tokens_per_second": 97874.034 }, { "epoch": 0.03449920508744038, "grad_norm": 0.24137352406978607, "learning_rate": 4.987107891296066e-05, "loss": 0.4746, "num_input_tokens_seen": 845906938, "step": 217, "train_runtime": 8644.5112, "train_tokens_per_second": 97854.802 }, { "epoch": 0.03465818759936407, "grad_norm": 0.22611333429813385, "learning_rate": 4.9869806732301546e-05, "loss": 0.5007, "num_input_tokens_seen": 849866102, "step": 218, "train_runtime": 8682.6559, "train_tokens_per_second": 97880.892 }, { "epoch": 0.03481717011128776, "grad_norm": 0.36904969811439514, "learning_rate": 4.98685283219316e-05, "loss": 0.4942, "num_input_tokens_seen": 853860618, "step": 219, "train_runtime": 8722.5703, "train_tokens_per_second": 97890.941 }, { "epoch": 0.034976152623211444, "grad_norm": 0.22539187967777252, "learning_rate": 4.9867243682171015e-05, "loss": 0.4895, "num_input_tokens_seen": 857750221, "step": 220, "train_runtime": 8760.4162, "train_tokens_per_second": 97912.04 }, { "epoch": 0.03513513513513514, "grad_norm": 0.2303132712841034, "learning_rate": 4.9865952813341624e-05, "loss": 0.4746, "num_input_tokens_seen": 861619064, "step": 221, "train_runtime": 8796.5544, "train_tokens_per_second": 97949.609 }, { "epoch": 0.03529411764705882, "grad_norm": 0.2247791290283203, "learning_rate": 4.986465571576676e-05, "loss": 0.4769, "num_input_tokens_seen": 865394310, "step": 222, "train_runtime": 8835.8297, "train_tokens_per_second": 97941.488 }, { "epoch": 0.03545310015898251, "grad_norm": 0.24088098108768463, "learning_rate": 4.986335238977134e-05, "loss": 0.4849, "num_input_tokens_seen": 869405829, "step": 223, "train_runtime": 8875.1523, "train_tokens_per_second": 97959.539 }, { "epoch": 0.0356120826709062, "grad_norm": 0.2554442584514618, "learning_rate": 4.9862042835681845e-05, "loss": 0.4893, "num_input_tokens_seen": 873408944, "step": 224, "train_runtime": 8912.178, "train_tokens_per_second": 98001.739 }, { "epoch": 0.03577106518282989, "grad_norm": 0.2916496992111206, "learning_rate": 4.98607270538263e-05, "loss": 0.502, "num_input_tokens_seen": 877342667, "step": 225, "train_runtime": 8951.7377, "train_tokens_per_second": 98008.085 }, { "epoch": 0.035930047694753574, "grad_norm": 0.2450057715177536, "learning_rate": 4.98594050445343e-05, "loss": 0.4896, "num_input_tokens_seen": 881173780, "step": 226, "train_runtime": 8990.7905, "train_tokens_per_second": 98008.488 }, { "epoch": 0.03608903020667727, "grad_norm": 0.24006472527980804, "learning_rate": 4.985807680813701e-05, "loss": 0.4785, "num_input_tokens_seen": 885052839, "step": 227, "train_runtime": 9027.5754, "train_tokens_per_second": 98038.82 }, { "epoch": 0.03624801271860095, "grad_norm": 0.30806615948677063, "learning_rate": 4.9856742344967125e-05, "loss": 0.4843, "num_input_tokens_seen": 888973050, "step": 228, "train_runtime": 9066.7205, "train_tokens_per_second": 98047.916 }, { "epoch": 0.036406995230524646, "grad_norm": 0.2567102313041687, "learning_rate": 4.985540165535893e-05, "loss": 0.4964, "num_input_tokens_seen": 892953193, "step": 229, "train_runtime": 9107.1314, "train_tokens_per_second": 98049.885 }, { "epoch": 0.03656597774244833, "grad_norm": 0.26579269766807556, "learning_rate": 4.985405473964827e-05, "loss": 0.4825, "num_input_tokens_seen": 896829596, "step": 230, "train_runtime": 9145.4603, "train_tokens_per_second": 98062.816 }, { "epoch": 0.03672496025437202, "grad_norm": 0.2927641272544861, "learning_rate": 4.985270159817251e-05, "loss": 0.493, "num_input_tokens_seen": 900711437, "step": 231, "train_runtime": 9186.7789, "train_tokens_per_second": 98044.314 }, { "epoch": 0.03688394276629571, "grad_norm": 0.2323330193758011, "learning_rate": 4.9851342231270626e-05, "loss": 0.4908, "num_input_tokens_seen": 904557686, "step": 232, "train_runtime": 9228.1477, "train_tokens_per_second": 98021.587 }, { "epoch": 0.037042925278219396, "grad_norm": 0.22482217848300934, "learning_rate": 4.984997663928312e-05, "loss": 0.4829, "num_input_tokens_seen": 908524409, "step": 233, "train_runtime": 9266.296, "train_tokens_per_second": 98046.124 }, { "epoch": 0.03720190779014308, "grad_norm": 0.2949601411819458, "learning_rate": 4.9848604822552055e-05, "loss": 0.4756, "num_input_tokens_seen": 912372989, "step": 234, "train_runtime": 9304.8979, "train_tokens_per_second": 98052.982 }, { "epoch": 0.037360890302066775, "grad_norm": 0.2988738715648651, "learning_rate": 4.984722678142109e-05, "loss": 0.4887, "num_input_tokens_seen": 916265475, "step": 235, "train_runtime": 9344.1821, "train_tokens_per_second": 98057.322 }, { "epoch": 0.03751987281399046, "grad_norm": 0.2702568471431732, "learning_rate": 4.984584251623539e-05, "loss": 0.4926, "num_input_tokens_seen": 920213401, "step": 236, "train_runtime": 9383.7743, "train_tokens_per_second": 98064.315 }, { "epoch": 0.03767885532591415, "grad_norm": 0.29995906352996826, "learning_rate": 4.98444520273417e-05, "loss": 0.4837, "num_input_tokens_seen": 924169658, "step": 237, "train_runtime": 9423.9928, "train_tokens_per_second": 98065.616 }, { "epoch": 0.03783783783783784, "grad_norm": 0.3407081961631775, "learning_rate": 4.9843055315088336e-05, "loss": 0.4822, "num_input_tokens_seen": 927961827, "step": 238, "train_runtime": 9462.7853, "train_tokens_per_second": 98064.343 }, { "epoch": 0.037996820349761526, "grad_norm": 0.33327192068099976, "learning_rate": 4.9841652379825175e-05, "loss": 0.4746, "num_input_tokens_seen": 931967350, "step": 239, "train_runtime": 9501.6091, "train_tokens_per_second": 98085.213 }, { "epoch": 0.03815580286168521, "grad_norm": 0.3244379758834839, "learning_rate": 4.984024322190363e-05, "loss": 0.4842, "num_input_tokens_seen": 935873413, "step": 240, "train_runtime": 9541.1693, "train_tokens_per_second": 98087.916 }, { "epoch": 0.038314785373608905, "grad_norm": 0.2572750151157379, "learning_rate": 4.9838827841676685e-05, "loss": 0.4833, "num_input_tokens_seen": 939664161, "step": 241, "train_runtime": 9578.2734, "train_tokens_per_second": 98103.711 }, { "epoch": 0.03847376788553259, "grad_norm": 0.26535624265670776, "learning_rate": 4.9837406239498886e-05, "loss": 0.4679, "num_input_tokens_seen": 943610361, "step": 242, "train_runtime": 9618.956, "train_tokens_per_second": 98099.042 }, { "epoch": 0.03863275039745628, "grad_norm": 0.28297051787376404, "learning_rate": 4.9835978415726334e-05, "loss": 0.4844, "num_input_tokens_seen": 947598642, "step": 243, "train_runtime": 9657.7045, "train_tokens_per_second": 98118.413 }, { "epoch": 0.03879173290937997, "grad_norm": 0.2734469473361969, "learning_rate": 4.983454437071669e-05, "loss": 0.4933, "num_input_tokens_seen": 951498274, "step": 244, "train_runtime": 9698.6303, "train_tokens_per_second": 98106.459 }, { "epoch": 0.038950715421303655, "grad_norm": 0.27170243859291077, "learning_rate": 4.983310410482917e-05, "loss": 0.4783, "num_input_tokens_seen": 955408025, "step": 245, "train_runtime": 9737.1977, "train_tokens_per_second": 98119.403 }, { "epoch": 0.03910969793322735, "grad_norm": 0.27516821026802063, "learning_rate": 4.983165761842456e-05, "loss": 0.4843, "num_input_tokens_seen": 959278795, "step": 246, "train_runtime": 9777.2631, "train_tokens_per_second": 98113.223 }, { "epoch": 0.039268680445151034, "grad_norm": 0.28839951753616333, "learning_rate": 4.983020491186517e-05, "loss": 0.4871, "num_input_tokens_seen": 963170057, "step": 247, "train_runtime": 9816.6066, "train_tokens_per_second": 98116.395 }, { "epoch": 0.03942766295707472, "grad_norm": 0.2660924792289734, "learning_rate": 4.9828745985514915e-05, "loss": 0.4833, "num_input_tokens_seen": 966962429, "step": 248, "train_runtime": 9854.3664, "train_tokens_per_second": 98125.277 }, { "epoch": 0.03958664546899841, "grad_norm": 0.24946855008602142, "learning_rate": 4.982728083973924e-05, "loss": 0.4861, "num_input_tokens_seen": 970887149, "step": 249, "train_runtime": 9892.9658, "train_tokens_per_second": 98139.139 }, { "epoch": 0.0397456279809221, "grad_norm": 0.2982906103134155, "learning_rate": 4.982580947490515e-05, "loss": 0.4868, "num_input_tokens_seen": 974821178, "step": 250, "train_runtime": 9933.9163, "train_tokens_per_second": 98130.601 }, { "epoch": 0.039904610492845785, "grad_norm": 0.26529747247695923, "learning_rate": 4.982433189138122e-05, "loss": 0.478, "num_input_tokens_seen": 978749260, "step": 251, "train_runtime": 9973.3993, "train_tokens_per_second": 98135.975 }, { "epoch": 0.04006359300476948, "grad_norm": 0.24415552616119385, "learning_rate": 4.982284808953757e-05, "loss": 0.4866, "num_input_tokens_seen": 982698053, "step": 252, "train_runtime": 10013.6554, "train_tokens_per_second": 98135.797 }, { "epoch": 0.040222575516693164, "grad_norm": 0.25753042101860046, "learning_rate": 4.9821358069745874e-05, "loss": 0.4838, "num_input_tokens_seen": 986593266, "step": 253, "train_runtime": 10053.6944, "train_tokens_per_second": 98132.41 }, { "epoch": 0.04038155802861685, "grad_norm": 0.3058053255081177, "learning_rate": 4.981986183237938e-05, "loss": 0.487, "num_input_tokens_seen": 990484752, "step": 254, "train_runtime": 10094.0988, "train_tokens_per_second": 98125.129 }, { "epoch": 0.04054054054054054, "grad_norm": 0.2809613347053528, "learning_rate": 4.9818359377812885e-05, "loss": 0.4844, "num_input_tokens_seen": 994555074, "step": 255, "train_runtime": 10131.971, "train_tokens_per_second": 98160.079 }, { "epoch": 0.04069952305246423, "grad_norm": 0.25445500016212463, "learning_rate": 4.981685070642274e-05, "loss": 0.4839, "num_input_tokens_seen": 998506663, "step": 256, "train_runtime": 10169.167, "train_tokens_per_second": 98189.622 }, { "epoch": 0.040858505564387915, "grad_norm": 0.27375462651252747, "learning_rate": 4.9815335818586856e-05, "loss": 0.4693, "num_input_tokens_seen": 1002324267, "step": 257, "train_runtime": 10206.0434, "train_tokens_per_second": 98208.897 }, { "epoch": 0.04101748807631161, "grad_norm": 0.2768684923648834, "learning_rate": 4.9813814714684706e-05, "loss": 0.487, "num_input_tokens_seen": 1006223472, "step": 258, "train_runtime": 10247.7599, "train_tokens_per_second": 98189.603 }, { "epoch": 0.041176470588235294, "grad_norm": 0.3193178176879883, "learning_rate": 4.9812287395097314e-05, "loss": 0.4811, "num_input_tokens_seen": 1010172295, "step": 259, "train_runtime": 10286.1805, "train_tokens_per_second": 98206.744 }, { "epoch": 0.04133545310015898, "grad_norm": 0.2921902537345886, "learning_rate": 4.981075386020726e-05, "loss": 0.485, "num_input_tokens_seen": 1014080773, "step": 260, "train_runtime": 10324.156, "train_tokens_per_second": 98224.085 }, { "epoch": 0.04149443561208267, "grad_norm": 0.33112773299217224, "learning_rate": 4.9809214110398685e-05, "loss": 0.4715, "num_input_tokens_seen": 1017866272, "step": 261, "train_runtime": 10363.5843, "train_tokens_per_second": 98215.66 }, { "epoch": 0.04165341812400636, "grad_norm": 0.3145655691623688, "learning_rate": 4.9807668146057286e-05, "loss": 0.4764, "num_input_tokens_seen": 1021788033, "step": 262, "train_runtime": 10404.0891, "train_tokens_per_second": 98210.234 }, { "epoch": 0.041812400635930044, "grad_norm": 0.2909221053123474, "learning_rate": 4.980611596757031e-05, "loss": 0.4629, "num_input_tokens_seen": 1025730533, "step": 263, "train_runtime": 10442.5632, "train_tokens_per_second": 98225.935 }, { "epoch": 0.04197138314785374, "grad_norm": 0.3078227639198303, "learning_rate": 4.980455757532658e-05, "loss": 0.4922, "num_input_tokens_seen": 1029613551, "step": 264, "train_runtime": 10481.2757, "train_tokens_per_second": 98233.61 }, { "epoch": 0.04213036565977742, "grad_norm": 0.3250211477279663, "learning_rate": 4.9802992969716456e-05, "loss": 0.4799, "num_input_tokens_seen": 1033540522, "step": 265, "train_runtime": 10521.5187, "train_tokens_per_second": 98231.115 }, { "epoch": 0.042289348171701116, "grad_norm": 0.3167693018913269, "learning_rate": 4.980142215113186e-05, "loss": 0.4729, "num_input_tokens_seen": 1037456735, "step": 266, "train_runtime": 10560.6643, "train_tokens_per_second": 98237.829 }, { "epoch": 0.0424483306836248, "grad_norm": 0.31975534558296204, "learning_rate": 4.979984511996627e-05, "loss": 0.4816, "num_input_tokens_seen": 1041373781, "step": 267, "train_runtime": 10597.9844, "train_tokens_per_second": 98261.494 }, { "epoch": 0.04260731319554849, "grad_norm": 0.32497096061706543, "learning_rate": 4.979826187661472e-05, "loss": 0.4748, "num_input_tokens_seen": 1045211174, "step": 268, "train_runtime": 10638.5722, "train_tokens_per_second": 98247.317 }, { "epoch": 0.04276629570747218, "grad_norm": 0.23398014903068542, "learning_rate": 4.9796672421473803e-05, "loss": 0.4812, "num_input_tokens_seen": 1049015347, "step": 269, "train_runtime": 10679.6566, "train_tokens_per_second": 98225.569 }, { "epoch": 0.04292527821939587, "grad_norm": 0.2889334261417389, "learning_rate": 4.979507675494167e-05, "loss": 0.4856, "num_input_tokens_seen": 1053025882, "step": 270, "train_runtime": 10716.9355, "train_tokens_per_second": 98258.115 }, { "epoch": 0.04308426073131955, "grad_norm": 0.27041006088256836, "learning_rate": 4.979347487741802e-05, "loss": 0.4873, "num_input_tokens_seen": 1056957226, "step": 271, "train_runtime": 10756.7237, "train_tokens_per_second": 98260.144 }, { "epoch": 0.043243243243243246, "grad_norm": 0.28773611783981323, "learning_rate": 4.979186678930411e-05, "loss": 0.4732, "num_input_tokens_seen": 1060726800, "step": 272, "train_runtime": 10797.4086, "train_tokens_per_second": 98239.016 }, { "epoch": 0.04340222575516693, "grad_norm": 0.31978124380111694, "learning_rate": 4.979025249100275e-05, "loss": 0.477, "num_input_tokens_seen": 1064800931, "step": 273, "train_runtime": 10833.8793, "train_tokens_per_second": 98284.364 }, { "epoch": 0.04356120826709062, "grad_norm": 0.2608067989349365, "learning_rate": 4.9788631982918334e-05, "loss": 0.4796, "num_input_tokens_seen": 1068700249, "step": 274, "train_runtime": 10873.2846, "train_tokens_per_second": 98286.791 }, { "epoch": 0.04372019077901431, "grad_norm": 0.27554428577423096, "learning_rate": 4.978700526545676e-05, "loss": 0.4742, "num_input_tokens_seen": 1072577548, "step": 275, "train_runtime": 10911.2601, "train_tokens_per_second": 98300.063 }, { "epoch": 0.043879173290937996, "grad_norm": 0.2685924470424652, "learning_rate": 4.978537233902553e-05, "loss": 0.4854, "num_input_tokens_seen": 1076462790, "step": 276, "train_runtime": 10950.3306, "train_tokens_per_second": 98304.136 }, { "epoch": 0.04403815580286168, "grad_norm": 0.23871618509292603, "learning_rate": 4.978373320403366e-05, "loss": 0.4772, "num_input_tokens_seen": 1080423904, "step": 277, "train_runtime": 10990.2754, "train_tokens_per_second": 98307.264 }, { "epoch": 0.044197138314785375, "grad_norm": 0.28126874566078186, "learning_rate": 4.978208786089176e-05, "loss": 0.4823, "num_input_tokens_seen": 1084296966, "step": 278, "train_runtime": 11029.3041, "train_tokens_per_second": 98310.551 }, { "epoch": 0.04435612082670906, "grad_norm": 0.29214438796043396, "learning_rate": 4.9780436310011965e-05, "loss": 0.48, "num_input_tokens_seen": 1088131422, "step": 279, "train_runtime": 11067.0857, "train_tokens_per_second": 98321.406 }, { "epoch": 0.04451510333863275, "grad_norm": 0.28401604294776917, "learning_rate": 4.977877855180799e-05, "loss": 0.4929, "num_input_tokens_seen": 1092100933, "step": 280, "train_runtime": 11108.2427, "train_tokens_per_second": 98314.465 }, { "epoch": 0.04467408585055644, "grad_norm": 0.2470414787530899, "learning_rate": 4.9777114586695074e-05, "loss": 0.4791, "num_input_tokens_seen": 1096086577, "step": 281, "train_runtime": 11148.161, "train_tokens_per_second": 98319.945 }, { "epoch": 0.044833068362480126, "grad_norm": 0.2816607654094696, "learning_rate": 4.9775444415090046e-05, "loss": 0.4785, "num_input_tokens_seen": 1099993149, "step": 282, "train_runtime": 11187.4917, "train_tokens_per_second": 98323.483 }, { "epoch": 0.04499205087440382, "grad_norm": 0.3317037522792816, "learning_rate": 4.977376803741126e-05, "loss": 0.4821, "num_input_tokens_seen": 1103931011, "step": 283, "train_runtime": 11227.7502, "train_tokens_per_second": 98321.658 }, { "epoch": 0.045151033386327505, "grad_norm": 0.3118388056755066, "learning_rate": 4.977208545407864e-05, "loss": 0.4916, "num_input_tokens_seen": 1107751938, "step": 284, "train_runtime": 11267.3088, "train_tokens_per_second": 98315.575 }, { "epoch": 0.04531001589825119, "grad_norm": 0.2963658273220062, "learning_rate": 4.977039666551366e-05, "loss": 0.484, "num_input_tokens_seen": 1111723939, "step": 285, "train_runtime": 11305.7683, "train_tokens_per_second": 98332.454 }, { "epoch": 0.045468998410174884, "grad_norm": 0.3057146370410919, "learning_rate": 4.976870167213935e-05, "loss": 0.4858, "num_input_tokens_seen": 1115556636, "step": 286, "train_runtime": 11344.2789, "train_tokens_per_second": 98336.496 }, { "epoch": 0.04562798092209857, "grad_norm": 0.2918989360332489, "learning_rate": 4.976700047438031e-05, "loss": 0.4774, "num_input_tokens_seen": 1119368684, "step": 287, "train_runtime": 11381.4991, "train_tokens_per_second": 98349.846 }, { "epoch": 0.045786963434022256, "grad_norm": 0.31151363253593445, "learning_rate": 4.9765293072662644e-05, "loss": 0.4869, "num_input_tokens_seen": 1123148911, "step": 288, "train_runtime": 11420.8867, "train_tokens_per_second": 98341.656 }, { "epoch": 0.04594594594594595, "grad_norm": 0.2720086872577667, "learning_rate": 4.976357946741408e-05, "loss": 0.4621, "num_input_tokens_seen": 1127104429, "step": 289, "train_runtime": 11461.3568, "train_tokens_per_second": 98339.529 }, { "epoch": 0.046104928457869634, "grad_norm": 0.2551058232784271, "learning_rate": 4.976185965906384e-05, "loss": 0.498, "num_input_tokens_seen": 1130990889, "step": 290, "train_runtime": 11502.938, "train_tokens_per_second": 98321.914 }, { "epoch": 0.04626391096979332, "grad_norm": 0.29063236713409424, "learning_rate": 4.976013364804273e-05, "loss": 0.4743, "num_input_tokens_seen": 1134866823, "step": 291, "train_runtime": 11541.8093, "train_tokens_per_second": 98326.596 }, { "epoch": 0.04642289348171701, "grad_norm": 0.27901771664619446, "learning_rate": 4.975840143478311e-05, "loss": 0.4871, "num_input_tokens_seen": 1138629780, "step": 292, "train_runtime": 11578.7498, "train_tokens_per_second": 98337.886 }, { "epoch": 0.0465818759936407, "grad_norm": 0.29267004132270813, "learning_rate": 4.9756663019718876e-05, "loss": 0.4883, "num_input_tokens_seen": 1142550551, "step": 293, "train_runtime": 11616.5222, "train_tokens_per_second": 98355.647 }, { "epoch": 0.046740858505564385, "grad_norm": 0.2823094129562378, "learning_rate": 4.97549184032855e-05, "loss": 0.4733, "num_input_tokens_seen": 1146496094, "step": 294, "train_runtime": 11654.8418, "train_tokens_per_second": 98370.799 }, { "epoch": 0.04689984101748808, "grad_norm": 0.3522087037563324, "learning_rate": 4.9753167585919984e-05, "loss": 0.4842, "num_input_tokens_seen": 1150358424, "step": 295, "train_runtime": 11694.7815, "train_tokens_per_second": 98365.106 }, { "epoch": 0.047058823529411764, "grad_norm": 0.3631140887737274, "learning_rate": 4.9751410568060905e-05, "loss": 0.4812, "num_input_tokens_seen": 1154221116, "step": 296, "train_runtime": 11735.4378, "train_tokens_per_second": 98353.477 }, { "epoch": 0.04721780604133545, "grad_norm": 0.3298969864845276, "learning_rate": 4.974964735014839e-05, "loss": 0.505, "num_input_tokens_seen": 1158168361, "step": 297, "train_runtime": 11776.4402, "train_tokens_per_second": 98346.219 }, { "epoch": 0.04737678855325914, "grad_norm": 0.3445175588130951, "learning_rate": 4.974787793262409e-05, "loss": 0.4761, "num_input_tokens_seen": 1162117207, "step": 298, "train_runtime": 11814.8549, "train_tokens_per_second": 98360.684 }, { "epoch": 0.04753577106518283, "grad_norm": 0.39686474204063416, "learning_rate": 4.9746102315931255e-05, "loss": 0.4694, "num_input_tokens_seen": 1166005720, "step": 299, "train_runtime": 11853.521, "train_tokens_per_second": 98367.879 }, { "epoch": 0.04769475357710652, "grad_norm": 0.3296308219432831, "learning_rate": 4.9744320500514654e-05, "loss": 0.468, "num_input_tokens_seen": 1169699243, "step": 300, "train_runtime": 11891.7224, "train_tokens_per_second": 98362.475 }, { "epoch": 0.04785373608903021, "grad_norm": 0.26578694581985474, "learning_rate": 4.9742532486820614e-05, "loss": 0.4907, "num_input_tokens_seen": 1173657332, "step": 301, "train_runtime": 11930.2622, "train_tokens_per_second": 98376.491 }, { "epoch": 0.048012718600953894, "grad_norm": 0.3826107084751129, "learning_rate": 4.9740738275297024e-05, "loss": 0.4689, "num_input_tokens_seen": 1177487182, "step": 302, "train_runtime": 11969.7074, "train_tokens_per_second": 98372.261 }, { "epoch": 0.04817170111287759, "grad_norm": 0.3866586983203888, "learning_rate": 4.973893786639333e-05, "loss": 0.472, "num_input_tokens_seen": 1181340575, "step": 303, "train_runtime": 12009.4961, "train_tokens_per_second": 98367.206 }, { "epoch": 0.04833068362480127, "grad_norm": 0.3060658276081085, "learning_rate": 4.9737131260560514e-05, "loss": 0.4746, "num_input_tokens_seen": 1185289618, "step": 304, "train_runtime": 12046.7455, "train_tokens_per_second": 98390.858 }, { "epoch": 0.04848966613672496, "grad_norm": 0.33585086464881897, "learning_rate": 4.973531845825112e-05, "loss": 0.4667, "num_input_tokens_seen": 1189071272, "step": 305, "train_runtime": 12086.9159, "train_tokens_per_second": 98376.73 }, { "epoch": 0.04864864864864865, "grad_norm": 0.3423454761505127, "learning_rate": 4.973349945991923e-05, "loss": 0.4678, "num_input_tokens_seen": 1193072598, "step": 306, "train_runtime": 12126.5366, "train_tokens_per_second": 98385.271 }, { "epoch": 0.04880763116057234, "grad_norm": 0.2438216209411621, "learning_rate": 4.973167426602051e-05, "loss": 0.4868, "num_input_tokens_seen": 1197028491, "step": 307, "train_runtime": 12166.3317, "train_tokens_per_second": 98388.612 }, { "epoch": 0.04896661367249602, "grad_norm": 0.3392586410045624, "learning_rate": 4.972984287701215e-05, "loss": 0.485, "num_input_tokens_seen": 1200882498, "step": 308, "train_runtime": 12201.2092, "train_tokens_per_second": 98423.236 }, { "epoch": 0.049125596184419716, "grad_norm": 0.4062661826610565, "learning_rate": 4.9728005293352896e-05, "loss": 0.4832, "num_input_tokens_seen": 1204705557, "step": 309, "train_runtime": 12242.3116, "train_tokens_per_second": 98405.072 }, { "epoch": 0.0492845786963434, "grad_norm": 0.3168296217918396, "learning_rate": 4.972616151550306e-05, "loss": 0.46, "num_input_tokens_seen": 1208621808, "step": 310, "train_runtime": 12280.3248, "train_tokens_per_second": 98419.368 }, { "epoch": 0.04944356120826709, "grad_norm": 0.2784799337387085, "learning_rate": 4.9724311543924475e-05, "loss": 0.4797, "num_input_tokens_seen": 1212492949, "step": 311, "train_runtime": 12319.2125, "train_tokens_per_second": 98422.927 }, { "epoch": 0.04960254372019078, "grad_norm": 0.3350262939929962, "learning_rate": 4.9722455379080563e-05, "loss": 0.4805, "num_input_tokens_seen": 1216274418, "step": 312, "train_runtime": 12359.4349, "train_tokens_per_second": 98408.579 }, { "epoch": 0.04976152623211447, "grad_norm": 0.3262067437171936, "learning_rate": 4.972059302143628e-05, "loss": 0.4739, "num_input_tokens_seen": 1220186382, "step": 313, "train_runtime": 12398.0746, "train_tokens_per_second": 98417.409 }, { "epoch": 0.04992050874403815, "grad_norm": 0.25039640069007874, "learning_rate": 4.971872447145812e-05, "loss": 0.4801, "num_input_tokens_seen": 1224081731, "step": 314, "train_runtime": 12436.0034, "train_tokens_per_second": 98430.476 }, { "epoch": 0.050079491255961846, "grad_norm": 0.3142513930797577, "learning_rate": 4.971684972961417e-05, "loss": 0.469, "num_input_tokens_seen": 1228010926, "step": 315, "train_runtime": 12475.7609, "train_tokens_per_second": 98431.746 }, { "epoch": 0.05023847376788553, "grad_norm": 0.2975063621997833, "learning_rate": 4.971496879637401e-05, "loss": 0.4954, "num_input_tokens_seen": 1231862821, "step": 316, "train_runtime": 12516.565, "train_tokens_per_second": 98418.601 }, { "epoch": 0.05039745627980922, "grad_norm": 0.31254005432128906, "learning_rate": 4.971308167220881e-05, "loss": 0.4829, "num_input_tokens_seen": 1235850777, "step": 317, "train_runtime": 12555.1644, "train_tokens_per_second": 98433.66 }, { "epoch": 0.05055643879173291, "grad_norm": 0.33144238591194153, "learning_rate": 4.9711188357591276e-05, "loss": 0.4877, "num_input_tokens_seen": 1239742491, "step": 318, "train_runtime": 12592.9136, "train_tokens_per_second": 98447.63 }, { "epoch": 0.050715421303656597, "grad_norm": 0.26514554023742676, "learning_rate": 4.970928885299568e-05, "loss": 0.4756, "num_input_tokens_seen": 1243451458, "step": 319, "train_runtime": 12629.0136, "train_tokens_per_second": 98459.904 }, { "epoch": 0.05087440381558029, "grad_norm": 0.28339165449142456, "learning_rate": 4.970738315889784e-05, "loss": 0.4607, "num_input_tokens_seen": 1247360963, "step": 320, "train_runtime": 12667.9511, "train_tokens_per_second": 98465.881 }, { "epoch": 0.051033386327503975, "grad_norm": 0.3124246895313263, "learning_rate": 4.9705471275775104e-05, "loss": 0.4775, "num_input_tokens_seen": 1251385254, "step": 321, "train_runtime": 12705.3654, "train_tokens_per_second": 98492.662 }, { "epoch": 0.05119236883942766, "grad_norm": 0.328726589679718, "learning_rate": 4.970355320410639e-05, "loss": 0.48, "num_input_tokens_seen": 1255242379, "step": 322, "train_runtime": 12744.4445, "train_tokens_per_second": 98493.299 }, { "epoch": 0.051351351351351354, "grad_norm": 0.30683770775794983, "learning_rate": 4.9701628944372166e-05, "loss": 0.4806, "num_input_tokens_seen": 1259093723, "step": 323, "train_runtime": 12781.5136, "train_tokens_per_second": 98508.968 }, { "epoch": 0.05151033386327504, "grad_norm": 0.3248925805091858, "learning_rate": 4.969969849705444e-05, "loss": 0.4832, "num_input_tokens_seen": 1263093737, "step": 324, "train_runtime": 12820.8661, "train_tokens_per_second": 98518.597 }, { "epoch": 0.051669316375198726, "grad_norm": 0.33073946833610535, "learning_rate": 4.969776186263677e-05, "loss": 0.4617, "num_input_tokens_seen": 1267000413, "step": 325, "train_runtime": 12859.3427, "train_tokens_per_second": 98527.619 }, { "epoch": 0.05182829888712242, "grad_norm": 0.3366115093231201, "learning_rate": 4.9695819041604285e-05, "loss": 0.4695, "num_input_tokens_seen": 1270805192, "step": 326, "train_runtime": 12898.0522, "train_tokens_per_second": 98526.907 }, { "epoch": 0.051987281399046105, "grad_norm": 0.3071666359901428, "learning_rate": 4.969387003444364e-05, "loss": 0.4655, "num_input_tokens_seen": 1274754323, "step": 327, "train_runtime": 12937.4757, "train_tokens_per_second": 98531.92 }, { "epoch": 0.05214626391096979, "grad_norm": 0.25955313444137573, "learning_rate": 4.969191484164304e-05, "loss": 0.4581, "num_input_tokens_seen": 1278773382, "step": 328, "train_runtime": 12978.5405, "train_tokens_per_second": 98529.83 }, { "epoch": 0.052305246422893484, "grad_norm": 0.2675377130508423, "learning_rate": 4.968995346369226e-05, "loss": 0.4734, "num_input_tokens_seen": 1282596803, "step": 329, "train_runtime": 13017.6245, "train_tokens_per_second": 98527.715 }, { "epoch": 0.05246422893481717, "grad_norm": 0.2627231478691101, "learning_rate": 4.968798590108261e-05, "loss": 0.4644, "num_input_tokens_seen": 1286386088, "step": 330, "train_runtime": 13056.9957, "train_tokens_per_second": 98520.833 }, { "epoch": 0.052623211446740856, "grad_norm": 0.28518280386924744, "learning_rate": 4.968601215430694e-05, "loss": 0.4775, "num_input_tokens_seen": 1290278191, "step": 331, "train_runtime": 13097.7849, "train_tokens_per_second": 98511.176 }, { "epoch": 0.05278219395866455, "grad_norm": 0.3081546425819397, "learning_rate": 4.968403222385966e-05, "loss": 0.4701, "num_input_tokens_seen": 1294225746, "step": 332, "train_runtime": 13137.5705, "train_tokens_per_second": 98513.324 }, { "epoch": 0.052941176470588235, "grad_norm": 0.268475741147995, "learning_rate": 4.968204611023674e-05, "loss": 0.4715, "num_input_tokens_seen": 1298214414, "step": 333, "train_runtime": 13177.0899, "train_tokens_per_second": 98520.57 }, { "epoch": 0.05310015898251192, "grad_norm": 0.23578566312789917, "learning_rate": 4.968005381393568e-05, "loss": 0.4745, "num_input_tokens_seen": 1302027515, "step": 334, "train_runtime": 13217.699, "train_tokens_per_second": 98506.368 }, { "epoch": 0.05325914149443561, "grad_norm": 0.3741975724697113, "learning_rate": 4.9678055335455545e-05, "loss": 0.476, "num_input_tokens_seen": 1306086997, "step": 335, "train_runtime": 13253.8763, "train_tokens_per_second": 98543.775 }, { "epoch": 0.0534181240063593, "grad_norm": 0.2903909981250763, "learning_rate": 4.967605067529692e-05, "loss": 0.4786, "num_input_tokens_seen": 1310025449, "step": 336, "train_runtime": 13293.6843, "train_tokens_per_second": 98544.949 }, { "epoch": 0.05357710651828299, "grad_norm": 0.27349749207496643, "learning_rate": 4.9674039833961974e-05, "loss": 0.4835, "num_input_tokens_seen": 1313746580, "step": 337, "train_runtime": 13329.7254, "train_tokens_per_second": 98557.662 }, { "epoch": 0.05373608903020668, "grad_norm": 0.2687027156352997, "learning_rate": 4.967202281195441e-05, "loss": 0.4824, "num_input_tokens_seen": 1317607786, "step": 338, "train_runtime": 13370.6331, "train_tokens_per_second": 98544.906 }, { "epoch": 0.053895071542130364, "grad_norm": 0.27316993474960327, "learning_rate": 4.9669999609779474e-05, "loss": 0.4737, "num_input_tokens_seen": 1321578665, "step": 339, "train_runtime": 13409.6642, "train_tokens_per_second": 98554.195 }, { "epoch": 0.05405405405405406, "grad_norm": 0.24683356285095215, "learning_rate": 4.966797022794395e-05, "loss": 0.4786, "num_input_tokens_seen": 1325510795, "step": 340, "train_runtime": 13448.4825, "train_tokens_per_second": 98562.109 }, { "epoch": 0.05421303656597774, "grad_norm": 0.3147439956665039, "learning_rate": 4.9665934666956204e-05, "loss": 0.4713, "num_input_tokens_seen": 1329373979, "step": 341, "train_runtime": 13489.7397, "train_tokens_per_second": 98547.044 }, { "epoch": 0.05437201907790143, "grad_norm": 0.25766390562057495, "learning_rate": 4.966389292732613e-05, "loss": 0.4842, "num_input_tokens_seen": 1333363739, "step": 342, "train_runtime": 13533.4826, "train_tokens_per_second": 98523.328 }, { "epoch": 0.05453100158982512, "grad_norm": 0.24986252188682556, "learning_rate": 4.966184500956515e-05, "loss": 0.4786, "num_input_tokens_seen": 1337270135, "step": 343, "train_runtime": 13573.6994, "train_tokens_per_second": 98519.21 }, { "epoch": 0.05468998410174881, "grad_norm": 0.27271270751953125, "learning_rate": 4.965979091418627e-05, "loss": 0.4668, "num_input_tokens_seen": 1341184144, "step": 344, "train_runtime": 13610.2079, "train_tokens_per_second": 98542.517 }, { "epoch": 0.054848966613672494, "grad_norm": 0.27825671434402466, "learning_rate": 4.9657730641704015e-05, "loss": 0.4583, "num_input_tokens_seen": 1345011778, "step": 345, "train_runtime": 13646.0834, "train_tokens_per_second": 98563.943 }, { "epoch": 0.05500794912559619, "grad_norm": 0.25620362162590027, "learning_rate": 4.9655664192634484e-05, "loss": 0.4794, "num_input_tokens_seen": 1348855036, "step": 346, "train_runtime": 13680.4887, "train_tokens_per_second": 98596.992 }, { "epoch": 0.05516693163751987, "grad_norm": 0.3995670676231384, "learning_rate": 4.9653591567495294e-05, "loss": 0.4718, "num_input_tokens_seen": 1352702948, "step": 347, "train_runtime": 13721.3401, "train_tokens_per_second": 98583.88 }, { "epoch": 0.05532591414944356, "grad_norm": 0.26186737418174744, "learning_rate": 4.965151276680562e-05, "loss": 0.4725, "num_input_tokens_seen": 1356609824, "step": 348, "train_runtime": 13760.8245, "train_tokens_per_second": 98584.923 }, { "epoch": 0.05548489666136725, "grad_norm": 0.26096153259277344, "learning_rate": 4.9649427791086204e-05, "loss": 0.4606, "num_input_tokens_seen": 1360516045, "step": 349, "train_runtime": 13796.9849, "train_tokens_per_second": 98609.664 }, { "epoch": 0.05564387917329094, "grad_norm": 0.304740309715271, "learning_rate": 4.964733664085931e-05, "loss": 0.458, "num_input_tokens_seen": 1364505045, "step": 350, "train_runtime": 13838.5784, "train_tokens_per_second": 98601.533 }, { "epoch": 0.05580286168521462, "grad_norm": 0.23072221875190735, "learning_rate": 4.964523931664874e-05, "loss": 0.4671, "num_input_tokens_seen": 1368457447, "step": 351, "train_runtime": 13876.1219, "train_tokens_per_second": 98619.59 }, { "epoch": 0.055961844197138316, "grad_norm": 0.24458065629005432, "learning_rate": 4.964313581897989e-05, "loss": 0.4746, "num_input_tokens_seen": 1372453520, "step": 352, "train_runtime": 13915.2887, "train_tokens_per_second": 98629.181 }, { "epoch": 0.056120826709062, "grad_norm": 0.2960222661495209, "learning_rate": 4.9641026148379646e-05, "loss": 0.4806, "num_input_tokens_seen": 1376141184, "step": 353, "train_runtime": 13955.2066, "train_tokens_per_second": 98611.309 }, { "epoch": 0.05627980922098569, "grad_norm": 0.24089020490646362, "learning_rate": 4.9638910305376476e-05, "loss": 0.4745, "num_input_tokens_seen": 1380196986, "step": 354, "train_runtime": 13995.026, "train_tokens_per_second": 98620.538 }, { "epoch": 0.05643879173290938, "grad_norm": 0.22664158046245575, "learning_rate": 4.963678829050038e-05, "loss": 0.4697, "num_input_tokens_seen": 1384242460, "step": 355, "train_runtime": 14034.18, "train_tokens_per_second": 98633.654 }, { "epoch": 0.05659777424483307, "grad_norm": 0.2657104432582855, "learning_rate": 4.963466010428291e-05, "loss": 0.4816, "num_input_tokens_seen": 1388021118, "step": 356, "train_runtime": 14071.4365, "train_tokens_per_second": 98641.039 }, { "epoch": 0.05675675675675676, "grad_norm": 0.2390977442264557, "learning_rate": 4.963252574725715e-05, "loss": 0.4538, "num_input_tokens_seen": 1391844525, "step": 357, "train_runtime": 14110.6875, "train_tokens_per_second": 98637.612 }, { "epoch": 0.056915739268680446, "grad_norm": 0.2730464041233063, "learning_rate": 4.963038521995777e-05, "loss": 0.4695, "num_input_tokens_seen": 1395863064, "step": 358, "train_runtime": 14149.5863, "train_tokens_per_second": 98650.45 }, { "epoch": 0.05707472178060413, "grad_norm": 0.426840603351593, "learning_rate": 4.962823852292093e-05, "loss": 0.4785, "num_input_tokens_seen": 1399703172, "step": 359, "train_runtime": 14187.9459, "train_tokens_per_second": 98654.391 }, { "epoch": 0.057233704292527825, "grad_norm": 0.21953760087490082, "learning_rate": 4.962608565668437e-05, "loss": 0.471, "num_input_tokens_seen": 1403534488, "step": 360, "train_runtime": 14226.9898, "train_tokens_per_second": 98652.948 }, { "epoch": 0.05739268680445151, "grad_norm": 0.25863561034202576, "learning_rate": 4.962392662178737e-05, "loss": 0.4862, "num_input_tokens_seen": 1407509060, "step": 361, "train_runtime": 14267.7233, "train_tokens_per_second": 98649.871 }, { "epoch": 0.0575516693163752, "grad_norm": 0.257956862449646, "learning_rate": 4.9621761418770765e-05, "loss": 0.4577, "num_input_tokens_seen": 1411600907, "step": 362, "train_runtime": 14307.2619, "train_tokens_per_second": 98663.247 }, { "epoch": 0.05771065182829889, "grad_norm": 0.25972697138786316, "learning_rate": 4.96195900481769e-05, "loss": 0.4625, "num_input_tokens_seen": 1415448701, "step": 363, "train_runtime": 14350.5967, "train_tokens_per_second": 98633.439 }, { "epoch": 0.057869634340222575, "grad_norm": 0.26950785517692566, "learning_rate": 4.96174125105497e-05, "loss": 0.4795, "num_input_tokens_seen": 1419333788, "step": 364, "train_runtime": 14390.4047, "train_tokens_per_second": 98630.568 }, { "epoch": 0.05802861685214626, "grad_norm": 0.2585904002189636, "learning_rate": 4.9615228806434626e-05, "loss": 0.4903, "num_input_tokens_seen": 1423293246, "step": 365, "train_runtime": 14431.8105, "train_tokens_per_second": 98621.947 }, { "epoch": 0.058187599364069954, "grad_norm": 0.26650798320770264, "learning_rate": 4.961303893637867e-05, "loss": 0.4704, "num_input_tokens_seen": 1427270612, "step": 366, "train_runtime": 14470.069, "train_tokens_per_second": 98636.061 }, { "epoch": 0.05834658187599364, "grad_norm": 0.2629716396331787, "learning_rate": 4.961084290093039e-05, "loss": 0.463, "num_input_tokens_seen": 1431092009, "step": 367, "train_runtime": 14511.3276, "train_tokens_per_second": 98618.958 }, { "epoch": 0.058505564387917326, "grad_norm": 0.29863211512565613, "learning_rate": 4.960864070063987e-05, "loss": 0.4806, "num_input_tokens_seen": 1435031831, "step": 368, "train_runtime": 14551.4601, "train_tokens_per_second": 98617.721 }, { "epoch": 0.05866454689984102, "grad_norm": 0.30594149231910706, "learning_rate": 4.9606432336058744e-05, "loss": 0.4697, "num_input_tokens_seen": 1439009997, "step": 369, "train_runtime": 14590.5529, "train_tokens_per_second": 98626.146 }, { "epoch": 0.058823529411764705, "grad_norm": 0.28173479437828064, "learning_rate": 4.96042178077402e-05, "loss": 0.4697, "num_input_tokens_seen": 1442972509, "step": 370, "train_runtime": 14629.2671, "train_tokens_per_second": 98636.008 }, { "epoch": 0.05898251192368839, "grad_norm": 0.2965833246707916, "learning_rate": 4.960199711623895e-05, "loss": 0.4626, "num_input_tokens_seen": 1446754060, "step": 371, "train_runtime": 14669.0219, "train_tokens_per_second": 98626.485 }, { "epoch": 0.059141494435612084, "grad_norm": 0.2966563403606415, "learning_rate": 4.959977026211128e-05, "loss": 0.4611, "num_input_tokens_seen": 1450702152, "step": 372, "train_runtime": 14708.641, "train_tokens_per_second": 98629.244 }, { "epoch": 0.05930047694753577, "grad_norm": 0.33344924449920654, "learning_rate": 4.9597537245914985e-05, "loss": 0.4729, "num_input_tokens_seen": 1454679021, "step": 373, "train_runtime": 14748.2647, "train_tokens_per_second": 98633.911 }, { "epoch": 0.05945945945945946, "grad_norm": 0.2915802597999573, "learning_rate": 4.9595298068209426e-05, "loss": 0.479, "num_input_tokens_seen": 1458482626, "step": 374, "train_runtime": 14786.5859, "train_tokens_per_second": 98635.522 }, { "epoch": 0.05961844197138315, "grad_norm": 0.2525638937950134, "learning_rate": 4.9593052729555495e-05, "loss": 0.4656, "num_input_tokens_seen": 1462269798, "step": 375, "train_runtime": 14824.233, "train_tokens_per_second": 98640.503 }, { "epoch": 0.059777424483306835, "grad_norm": 0.2561534643173218, "learning_rate": 4.9590801230515635e-05, "loss": 0.4797, "num_input_tokens_seen": 1466229423, "step": 376, "train_runtime": 14865.0515, "train_tokens_per_second": 98636.014 }, { "epoch": 0.05993640699523053, "grad_norm": 0.2675146460533142, "learning_rate": 4.958854357165385e-05, "loss": 0.4538, "num_input_tokens_seen": 1470138572, "step": 377, "train_runtime": 14906.0408, "train_tokens_per_second": 98627.033 }, { "epoch": 0.060095389507154214, "grad_norm": 0.28387773036956787, "learning_rate": 4.9586279753535634e-05, "loss": 0.4642, "num_input_tokens_seen": 1473990585, "step": 378, "train_runtime": 14946.2804, "train_tokens_per_second": 98619.225 }, { "epoch": 0.0602543720190779, "grad_norm": 0.26889657974243164, "learning_rate": 4.958400977672809e-05, "loss": 0.4693, "num_input_tokens_seen": 1477900497, "step": 379, "train_runtime": 14989.2468, "train_tokens_per_second": 98597.382 }, { "epoch": 0.06041335453100159, "grad_norm": 0.4886305034160614, "learning_rate": 4.958173364179981e-05, "loss": 0.4583, "num_input_tokens_seen": 1481827319, "step": 380, "train_runtime": 15028.5903, "train_tokens_per_second": 98600.554 }, { "epoch": 0.06057233704292528, "grad_norm": 0.2903163731098175, "learning_rate": 4.9579451349320956e-05, "loss": 0.4828, "num_input_tokens_seen": 1485752883, "step": 381, "train_runtime": 15068.2834, "train_tokens_per_second": 98601.337 }, { "epoch": 0.060731319554848964, "grad_norm": 0.286643385887146, "learning_rate": 4.9577162899863225e-05, "loss": 0.4844, "num_input_tokens_seen": 1489681485, "step": 382, "train_runtime": 15108.6194, "train_tokens_per_second": 98598.121 }, { "epoch": 0.06089030206677266, "grad_norm": 0.2475207895040512, "learning_rate": 4.9574868293999855e-05, "loss": 0.4743, "num_input_tokens_seen": 1493519533, "step": 383, "train_runtime": 15146.4725, "train_tokens_per_second": 98605.107 }, { "epoch": 0.06104928457869634, "grad_norm": 0.27702972292900085, "learning_rate": 4.9572567532305635e-05, "loss": 0.4793, "num_input_tokens_seen": 1497469803, "step": 384, "train_runtime": 15188.8562, "train_tokens_per_second": 98590.031 }, { "epoch": 0.06120826709062003, "grad_norm": 0.3154366612434387, "learning_rate": 4.957026061535689e-05, "loss": 0.4743, "num_input_tokens_seen": 1501334826, "step": 385, "train_runtime": 15228.7724, "train_tokens_per_second": 98585.414 }, { "epoch": 0.06136724960254372, "grad_norm": 0.37012264132499695, "learning_rate": 4.956794754373148e-05, "loss": 0.4642, "num_input_tokens_seen": 1505165870, "step": 386, "train_runtime": 15269.4347, "train_tokens_per_second": 98573.778 }, { "epoch": 0.06152623211446741, "grad_norm": 0.26973292231559753, "learning_rate": 4.956562831800882e-05, "loss": 0.4806, "num_input_tokens_seen": 1509175388, "step": 387, "train_runtime": 15309.0747, "train_tokens_per_second": 98580.444 }, { "epoch": 0.061685214626391094, "grad_norm": 0.24035964906215668, "learning_rate": 4.9563302938769854e-05, "loss": 0.4791, "num_input_tokens_seen": 1512912567, "step": 388, "train_runtime": 15347.6576, "train_tokens_per_second": 98576.122 }, { "epoch": 0.06184419713831479, "grad_norm": 0.2926710546016693, "learning_rate": 4.956097140659707e-05, "loss": 0.4844, "num_input_tokens_seen": 1516844824, "step": 389, "train_runtime": 15386.0536, "train_tokens_per_second": 98585.697 }, { "epoch": 0.06200317965023847, "grad_norm": 0.27969610691070557, "learning_rate": 4.955863372207451e-05, "loss": 0.4624, "num_input_tokens_seen": 1520703139, "step": 390, "train_runtime": 15425.7651, "train_tokens_per_second": 98582.024 }, { "epoch": 0.062162162162162166, "grad_norm": 0.2540493905544281, "learning_rate": 4.9556289885787735e-05, "loss": 0.478, "num_input_tokens_seen": 1524620500, "step": 391, "train_runtime": 15464.4455, "train_tokens_per_second": 98588.76 }, { "epoch": 0.06232114467408585, "grad_norm": 0.26312950253486633, "learning_rate": 4.9553939898323875e-05, "loss": 0.4591, "num_input_tokens_seen": 1528573158, "step": 392, "train_runtime": 15504.4258, "train_tokens_per_second": 98589.472 }, { "epoch": 0.06248012718600954, "grad_norm": 0.25086140632629395, "learning_rate": 4.9551583760271574e-05, "loss": 0.4709, "num_input_tokens_seen": 1532493755, "step": 393, "train_runtime": 15543.5548, "train_tokens_per_second": 98593.518 }, { "epoch": 0.06263910969793322, "grad_norm": 0.32307207584381104, "learning_rate": 4.954922147222103e-05, "loss": 0.4827, "num_input_tokens_seen": 1536418069, "step": 394, "train_runtime": 15580.7026, "train_tokens_per_second": 98610.32 }, { "epoch": 0.06279809220985691, "grad_norm": 0.29146578907966614, "learning_rate": 4.9546853034763983e-05, "loss": 0.4685, "num_input_tokens_seen": 1540225449, "step": 395, "train_runtime": 15622.1134, "train_tokens_per_second": 98592.643 }, { "epoch": 0.06295707472178061, "grad_norm": 0.2688943147659302, "learning_rate": 4.954447844849371e-05, "loss": 0.4606, "num_input_tokens_seen": 1544169451, "step": 396, "train_runtime": 15662.4817, "train_tokens_per_second": 98590.344 }, { "epoch": 0.0631160572337043, "grad_norm": 0.2563803791999817, "learning_rate": 4.954209771400502e-05, "loss": 0.4681, "num_input_tokens_seen": 1548077494, "step": 397, "train_runtime": 15700.8809, "train_tokens_per_second": 98598.13 }, { "epoch": 0.06327503974562798, "grad_norm": 0.2395860254764557, "learning_rate": 4.953971083189428e-05, "loss": 0.4636, "num_input_tokens_seen": 1551987949, "step": 398, "train_runtime": 15739.6303, "train_tokens_per_second": 98603.838 }, { "epoch": 0.06343402225755167, "grad_norm": 0.22306907176971436, "learning_rate": 4.9537317802759386e-05, "loss": 0.4685, "num_input_tokens_seen": 1555809294, "step": 399, "train_runtime": 15779.3775, "train_tokens_per_second": 98597.635 }, { "epoch": 0.06359300476947535, "grad_norm": 0.2292768806219101, "learning_rate": 4.9534918627199764e-05, "loss": 0.4646, "num_input_tokens_seen": 1559691270, "step": 400, "train_runtime": 15819.7761, "train_tokens_per_second": 98591.235 }, { "epoch": 0.06375198728139905, "grad_norm": 0.24294345080852509, "learning_rate": 4.95325133058164e-05, "loss": 0.471, "num_input_tokens_seen": 1563479626, "step": 401, "train_runtime": 15975.8957, "train_tokens_per_second": 97864.912 }, { "epoch": 0.06391096979332274, "grad_norm": 0.2336423248052597, "learning_rate": 4.953010183921181e-05, "loss": 0.4635, "num_input_tokens_seen": 1567379100, "step": 402, "train_runtime": 16013.5384, "train_tokens_per_second": 97878.374 }, { "epoch": 0.06406995230524642, "grad_norm": 0.2675757110118866, "learning_rate": 4.952768422799005e-05, "loss": 0.4599, "num_input_tokens_seen": 1571366159, "step": 403, "train_runtime": 16052.6674, "train_tokens_per_second": 97888.165 }, { "epoch": 0.06422893481717011, "grad_norm": 0.24159730970859528, "learning_rate": 4.952526047275671e-05, "loss": 0.4672, "num_input_tokens_seen": 1575255084, "step": 404, "train_runtime": 16092.8809, "train_tokens_per_second": 97885.213 }, { "epoch": 0.0643879173290938, "grad_norm": 0.2532043159008026, "learning_rate": 4.9522830574118927e-05, "loss": 0.4768, "num_input_tokens_seen": 1579086568, "step": 405, "train_runtime": 16133.8146, "train_tokens_per_second": 97874.347 }, { "epoch": 0.06454689984101748, "grad_norm": 0.27218785881996155, "learning_rate": 4.9520394532685364e-05, "loss": 0.4631, "num_input_tokens_seen": 1583067572, "step": 406, "train_runtime": 16172.5837, "train_tokens_per_second": 97885.879 }, { "epoch": 0.06470588235294118, "grad_norm": 0.22813965380191803, "learning_rate": 4.951795234906625e-05, "loss": 0.4707, "num_input_tokens_seen": 1586976534, "step": 407, "train_runtime": 16211.2427, "train_tokens_per_second": 97893.577 }, { "epoch": 0.06486486486486487, "grad_norm": 0.246758833527565, "learning_rate": 4.951550402387332e-05, "loss": 0.4738, "num_input_tokens_seen": 1590943347, "step": 408, "train_runtime": 16249.4236, "train_tokens_per_second": 97907.679 }, { "epoch": 0.06502384737678855, "grad_norm": 0.23769773542881012, "learning_rate": 4.951304955771987e-05, "loss": 0.464, "num_input_tokens_seen": 1594900137, "step": 409, "train_runtime": 16287.8582, "train_tokens_per_second": 97919.574 }, { "epoch": 0.06518282988871224, "grad_norm": 0.6699399352073669, "learning_rate": 4.9510588951220724e-05, "loss": 0.4756, "num_input_tokens_seen": 1598629102, "step": 410, "train_runtime": 16322.7106, "train_tokens_per_second": 97938.948 }, { "epoch": 0.06534181240063593, "grad_norm": 0.24540403485298157, "learning_rate": 4.950812220499225e-05, "loss": 0.4641, "num_input_tokens_seen": 1602552873, "step": 411, "train_runtime": 16359.8865, "train_tokens_per_second": 97956.234 }, { "epoch": 0.06550079491255961, "grad_norm": 0.24290817975997925, "learning_rate": 4.9505649319652335e-05, "loss": 0.4669, "num_input_tokens_seen": 1606592113, "step": 412, "train_runtime": 16397.4517, "train_tokens_per_second": 97978.158 }, { "epoch": 0.06565977742448331, "grad_norm": 0.2540212571620941, "learning_rate": 4.950317029582044e-05, "loss": 0.4721, "num_input_tokens_seen": 1610500140, "step": 413, "train_runtime": 16433.9627, "train_tokens_per_second": 97998.284 }, { "epoch": 0.065818759936407, "grad_norm": 0.2385658621788025, "learning_rate": 4.950068513411753e-05, "loss": 0.4591, "num_input_tokens_seen": 1614406796, "step": 414, "train_runtime": 16473.2019, "train_tokens_per_second": 98002.004 }, { "epoch": 0.06597774244833068, "grad_norm": 0.225785031914711, "learning_rate": 4.949819383516613e-05, "loss": 0.4821, "num_input_tokens_seen": 1618324244, "step": 415, "train_runtime": 16512.3883, "train_tokens_per_second": 98006.673 }, { "epoch": 0.06613672496025437, "grad_norm": 0.2511119544506073, "learning_rate": 4.949569639959028e-05, "loss": 0.4589, "num_input_tokens_seen": 1622224826, "step": 416, "train_runtime": 16553.1738, "train_tokens_per_second": 98000.834 }, { "epoch": 0.06629570747217806, "grad_norm": 0.30048495531082153, "learning_rate": 4.949319282801558e-05, "loss": 0.4756, "num_input_tokens_seen": 1626091867, "step": 417, "train_runtime": 16593.1911, "train_tokens_per_second": 97997.538 }, { "epoch": 0.06645468998410174, "grad_norm": 0.2430267632007599, "learning_rate": 4.9490683121069154e-05, "loss": 0.4692, "num_input_tokens_seen": 1629988310, "step": 418, "train_runtime": 16632.4493, "train_tokens_per_second": 98000.498 }, { "epoch": 0.06661367249602544, "grad_norm": 0.2608335018157959, "learning_rate": 4.9488167279379663e-05, "loss": 0.4795, "num_input_tokens_seen": 1633931562, "step": 419, "train_runtime": 16671.8597, "train_tokens_per_second": 98005.357 }, { "epoch": 0.06677265500794913, "grad_norm": 0.28462934494018555, "learning_rate": 4.9485645303577316e-05, "loss": 0.4682, "num_input_tokens_seen": 1637852806, "step": 420, "train_runtime": 16712.9433, "train_tokens_per_second": 97999.064 }, { "epoch": 0.06693163751987281, "grad_norm": 0.2792109251022339, "learning_rate": 4.948311719429384e-05, "loss": 0.4711, "num_input_tokens_seen": 1641708506, "step": 421, "train_runtime": 16751.8971, "train_tokens_per_second": 98001.348 }, { "epoch": 0.0670906200317965, "grad_norm": 0.2384335994720459, "learning_rate": 4.9480582952162515e-05, "loss": 0.4718, "num_input_tokens_seen": 1645631678, "step": 422, "train_runtime": 16789.2772, "train_tokens_per_second": 98016.827 }, { "epoch": 0.06724960254372019, "grad_norm": 0.23471811413764954, "learning_rate": 4.947804257781815e-05, "loss": 0.4568, "num_input_tokens_seen": 1649466137, "step": 423, "train_runtime": 16826.3167, "train_tokens_per_second": 98028.949 }, { "epoch": 0.06740858505564389, "grad_norm": 0.2409612536430359, "learning_rate": 4.947549607189709e-05, "loss": 0.4837, "num_input_tokens_seen": 1653437735, "step": 424, "train_runtime": 16867.8401, "train_tokens_per_second": 98023.086 }, { "epoch": 0.06756756756756757, "grad_norm": 0.24475806951522827, "learning_rate": 4.9472943435037216e-05, "loss": 0.4561, "num_input_tokens_seen": 1657207179, "step": 425, "train_runtime": 16906.8986, "train_tokens_per_second": 98019.585 }, { "epoch": 0.06772655007949126, "grad_norm": 0.2807217836380005, "learning_rate": 4.9470384667877947e-05, "loss": 0.4809, "num_input_tokens_seen": 1661111732, "step": 426, "train_runtime": 16946.7474, "train_tokens_per_second": 98019.501 }, { "epoch": 0.06788553259141494, "grad_norm": 0.24556498229503632, "learning_rate": 4.946781977106023e-05, "loss": 0.4744, "num_input_tokens_seen": 1664934008, "step": 427, "train_runtime": 16984.617, "train_tokens_per_second": 98025.997 }, { "epoch": 0.06804451510333863, "grad_norm": 0.2609526216983795, "learning_rate": 4.9465248745226567e-05, "loss": 0.4523, "num_input_tokens_seen": 1668805535, "step": 428, "train_runtime": 17022.7592, "train_tokens_per_second": 98033.786 }, { "epoch": 0.06820349761526232, "grad_norm": 0.23556029796600342, "learning_rate": 4.946267159102097e-05, "loss": 0.4777, "num_input_tokens_seen": 1672825998, "step": 429, "train_runtime": 17063.0215, "train_tokens_per_second": 98038.087 }, { "epoch": 0.06836248012718601, "grad_norm": 0.24120007455348969, "learning_rate": 4.9460088309088995e-05, "loss": 0.4726, "num_input_tokens_seen": 1676735882, "step": 430, "train_runtime": 17102.4782, "train_tokens_per_second": 98040.522 }, { "epoch": 0.0685214626391097, "grad_norm": 0.2640591561794281, "learning_rate": 4.945749890007775e-05, "loss": 0.4667, "num_input_tokens_seen": 1680703420, "step": 431, "train_runtime": 17142.2196, "train_tokens_per_second": 98044.679 }, { "epoch": 0.06868044515103339, "grad_norm": 0.3052465319633484, "learning_rate": 4.9454903364635854e-05, "loss": 0.4752, "num_input_tokens_seen": 1684658212, "step": 432, "train_runtime": 17180.9706, "train_tokens_per_second": 98053.728 }, { "epoch": 0.06883942766295707, "grad_norm": 0.23351424932479858, "learning_rate": 4.945230170341347e-05, "loss": 0.4563, "num_input_tokens_seen": 1688577813, "step": 433, "train_runtime": 17220.617, "train_tokens_per_second": 98055.593 }, { "epoch": 0.06899841017488076, "grad_norm": 0.2455449253320694, "learning_rate": 4.94496939170623e-05, "loss": 0.4678, "num_input_tokens_seen": 1692415194, "step": 434, "train_runtime": 17258.6207, "train_tokens_per_second": 98062.019 }, { "epoch": 0.06915739268680444, "grad_norm": 0.21342793107032776, "learning_rate": 4.944708000623557e-05, "loss": 0.466, "num_input_tokens_seen": 1696446633, "step": 435, "train_runtime": 17297.8476, "train_tokens_per_second": 98072.701 }, { "epoch": 0.06931637519872814, "grad_norm": 0.23978428542613983, "learning_rate": 4.944445997158805e-05, "loss": 0.4765, "num_input_tokens_seen": 1700247936, "step": 436, "train_runtime": 17335.6522, "train_tokens_per_second": 98078.106 }, { "epoch": 0.06947535771065183, "grad_norm": 0.2174246609210968, "learning_rate": 4.944183381377605e-05, "loss": 0.4611, "num_input_tokens_seen": 1704129013, "step": 437, "train_runtime": 17375.3289, "train_tokens_per_second": 98077.511 }, { "epoch": 0.06963434022257552, "grad_norm": 0.23581944406032562, "learning_rate": 4.943920153345739e-05, "loss": 0.4698, "num_input_tokens_seen": 1708090308, "step": 438, "train_runtime": 17415.3785, "train_tokens_per_second": 98079.425 }, { "epoch": 0.0697933227344992, "grad_norm": 0.2672484815120697, "learning_rate": 4.9436563131291434e-05, "loss": 0.4754, "num_input_tokens_seen": 1711980974, "step": 439, "train_runtime": 17452.7583, "train_tokens_per_second": 98092.287 }, { "epoch": 0.06995230524642289, "grad_norm": 0.2463432103395462, "learning_rate": 4.94339186079391e-05, "loss": 0.4763, "num_input_tokens_seen": 1715824523, "step": 440, "train_runtime": 17492.2221, "train_tokens_per_second": 98090.712 }, { "epoch": 0.07011128775834659, "grad_norm": 0.2652168571949005, "learning_rate": 4.943126796406281e-05, "loss": 0.4718, "num_input_tokens_seen": 1719687859, "step": 441, "train_runtime": 17532.6457, "train_tokens_per_second": 98084.904 }, { "epoch": 0.07027027027027027, "grad_norm": 0.24186493456363678, "learning_rate": 4.942861120032653e-05, "loss": 0.4674, "num_input_tokens_seen": 1723624870, "step": 442, "train_runtime": 17572.0486, "train_tokens_per_second": 98089.011 }, { "epoch": 0.07042925278219396, "grad_norm": 0.3172110617160797, "learning_rate": 4.942594831739577e-05, "loss": 0.4748, "num_input_tokens_seen": 1727470745, "step": 443, "train_runtime": 17611.0283, "train_tokens_per_second": 98090.283 }, { "epoch": 0.07058823529411765, "grad_norm": 0.4796925485134125, "learning_rate": 4.942327931593756e-05, "loss": 0.4602, "num_input_tokens_seen": 1731316142, "step": 444, "train_runtime": 17650.0796, "train_tokens_per_second": 98091.124 }, { "epoch": 0.07074721780604133, "grad_norm": 0.2324962615966797, "learning_rate": 4.942060419662047e-05, "loss": 0.4659, "num_input_tokens_seen": 1735212093, "step": 445, "train_runtime": 17691.7604, "train_tokens_per_second": 98080.239 }, { "epoch": 0.07090620031796502, "grad_norm": 0.2277347892522812, "learning_rate": 4.9417922960114583e-05, "loss": 0.4735, "num_input_tokens_seen": 1739210894, "step": 446, "train_runtime": 17732.53, "train_tokens_per_second": 98080.245 }, { "epoch": 0.07106518282988872, "grad_norm": 0.24966947734355927, "learning_rate": 4.941523560709154e-05, "loss": 0.4749, "num_input_tokens_seen": 1743099669, "step": 447, "train_runtime": 17770.3326, "train_tokens_per_second": 98090.436 }, { "epoch": 0.0712241653418124, "grad_norm": 0.23888139426708221, "learning_rate": 4.941254213822451e-05, "loss": 0.4666, "num_input_tokens_seen": 1746987020, "step": 448, "train_runtime": 17807.2196, "train_tokens_per_second": 98105.547 }, { "epoch": 0.07138314785373609, "grad_norm": 0.261039137840271, "learning_rate": 4.9409842554188176e-05, "loss": 0.4615, "num_input_tokens_seen": 1750983437, "step": 449, "train_runtime": 17846.8925, "train_tokens_per_second": 98111.391 }, { "epoch": 0.07154213036565978, "grad_norm": 0.26349347829818726, "learning_rate": 4.940713685565878e-05, "loss": 0.465, "num_input_tokens_seen": 1754912608, "step": 450, "train_runtime": 17885.6374, "train_tokens_per_second": 98118.539 }, { "epoch": 0.07170111287758346, "grad_norm": 0.31955549120903015, "learning_rate": 4.9404425043314066e-05, "loss": 0.464, "num_input_tokens_seen": 1758825462, "step": 451, "train_runtime": 17919.3075, "train_tokens_per_second": 98152.535 }, { "epoch": 0.07186009538950715, "grad_norm": 0.24990732967853546, "learning_rate": 4.9401707117833326e-05, "loss": 0.4636, "num_input_tokens_seen": 1762650444, "step": 452, "train_runtime": 17956.6567, "train_tokens_per_second": 98161.393 }, { "epoch": 0.07201907790143085, "grad_norm": 0.25630441308021545, "learning_rate": 4.939898307989739e-05, "loss": 0.4654, "num_input_tokens_seen": 1766505154, "step": 453, "train_runtime": 17996.2304, "train_tokens_per_second": 98159.732 }, { "epoch": 0.07217806041335453, "grad_norm": 0.26568806171417236, "learning_rate": 4.93962529301886e-05, "loss": 0.4671, "num_input_tokens_seen": 1770488438, "step": 454, "train_runtime": 18035.7593, "train_tokens_per_second": 98165.45 }, { "epoch": 0.07233704292527822, "grad_norm": 0.288248747587204, "learning_rate": 4.939351666939085e-05, "loss": 0.4547, "num_input_tokens_seen": 1774397349, "step": 455, "train_runtime": 18075.1966, "train_tokens_per_second": 98167.527 }, { "epoch": 0.0724960254372019, "grad_norm": 0.2599959969520569, "learning_rate": 4.9390774298189544e-05, "loss": 0.4586, "num_input_tokens_seen": 1778095125, "step": 456, "train_runtime": 18113.6733, "train_tokens_per_second": 98163.144 }, { "epoch": 0.07265500794912559, "grad_norm": 0.24806828796863556, "learning_rate": 4.938802581727162e-05, "loss": 0.4659, "num_input_tokens_seen": 1782039034, "step": 457, "train_runtime": 18155.2329, "train_tokens_per_second": 98155.669 }, { "epoch": 0.07281399046104929, "grad_norm": 0.2854563593864441, "learning_rate": 4.938527122732558e-05, "loss": 0.4703, "num_input_tokens_seen": 1785991482, "step": 458, "train_runtime": 18196.6446, "train_tokens_per_second": 98149.495 }, { "epoch": 0.07297297297297298, "grad_norm": 0.24779969453811646, "learning_rate": 4.93825105290414e-05, "loss": 0.459, "num_input_tokens_seen": 1789936451, "step": 459, "train_runtime": 18234.315, "train_tokens_per_second": 98163.076 }, { "epoch": 0.07313195548489666, "grad_norm": 0.24846263229846954, "learning_rate": 4.937974372311063e-05, "loss": 0.4543, "num_input_tokens_seen": 1793826807, "step": 460, "train_runtime": 18274.456, "train_tokens_per_second": 98160.34 }, { "epoch": 0.07329093799682035, "grad_norm": 0.2626080811023712, "learning_rate": 4.937697081022634e-05, "loss": 0.4603, "num_input_tokens_seen": 1797670862, "step": 461, "train_runtime": 18311.6241, "train_tokens_per_second": 98171.023 }, { "epoch": 0.07344992050874403, "grad_norm": 0.3671586513519287, "learning_rate": 4.9374191791083115e-05, "loss": 0.4701, "num_input_tokens_seen": 1801687684, "step": 462, "train_runtime": 18353.0335, "train_tokens_per_second": 98168.387 }, { "epoch": 0.07360890302066772, "grad_norm": 0.24258141219615936, "learning_rate": 4.937140666637708e-05, "loss": 0.4562, "num_input_tokens_seen": 1805497006, "step": 463, "train_runtime": 18394.4204, "train_tokens_per_second": 98154.602 }, { "epoch": 0.07376788553259142, "grad_norm": 0.22638241946697235, "learning_rate": 4.936861543680589e-05, "loss": 0.4604, "num_input_tokens_seen": 1809502849, "step": 464, "train_runtime": 18435.0251, "train_tokens_per_second": 98155.703 }, { "epoch": 0.0739268680445151, "grad_norm": 0.25361907482147217, "learning_rate": 4.936581810306874e-05, "loss": 0.4627, "num_input_tokens_seen": 1813354480, "step": 465, "train_runtime": 18471.4371, "train_tokens_per_second": 98170.731 }, { "epoch": 0.07408585055643879, "grad_norm": 0.3112848103046417, "learning_rate": 4.936301466586633e-05, "loss": 0.4651, "num_input_tokens_seen": 1817192660, "step": 466, "train_runtime": 18509.0049, "train_tokens_per_second": 98178.842 }, { "epoch": 0.07424483306836248, "grad_norm": 0.26537469029426575, "learning_rate": 4.936020512590089e-05, "loss": 0.4644, "num_input_tokens_seen": 1821151460, "step": 467, "train_runtime": 18548.5319, "train_tokens_per_second": 98183.052 }, { "epoch": 0.07440381558028616, "grad_norm": 0.29768338799476624, "learning_rate": 4.935738948387622e-05, "loss": 0.4527, "num_input_tokens_seen": 1825060400, "step": 468, "train_runtime": 18586.663, "train_tokens_per_second": 98191.935 }, { "epoch": 0.07456279809220985, "grad_norm": 0.2843025028705597, "learning_rate": 4.9354567740497595e-05, "loss": 0.4614, "num_input_tokens_seen": 1828913188, "step": 469, "train_runtime": 18624.5995, "train_tokens_per_second": 98198.793 }, { "epoch": 0.07472178060413355, "grad_norm": 0.3995248079299927, "learning_rate": 4.935173989647185e-05, "loss": 0.4728, "num_input_tokens_seen": 1832819653, "step": 470, "train_runtime": 18666.9055, "train_tokens_per_second": 98185.511 }, { "epoch": 0.07488076311605724, "grad_norm": 0.46624234318733215, "learning_rate": 4.934890595250734e-05, "loss": 0.4601, "num_input_tokens_seen": 1836744482, "step": 471, "train_runtime": 18707.3833, "train_tokens_per_second": 98182.865 }, { "epoch": 0.07503974562798092, "grad_norm": 0.283268541097641, "learning_rate": 4.934606590931394e-05, "loss": 0.4742, "num_input_tokens_seen": 1840676244, "step": 472, "train_runtime": 18748.6725, "train_tokens_per_second": 98176.351 }, { "epoch": 0.07519872813990461, "grad_norm": 0.31404298543930054, "learning_rate": 4.934321976760308e-05, "loss": 0.4795, "num_input_tokens_seen": 1844553880, "step": 473, "train_runtime": 18787.1701, "train_tokens_per_second": 98181.571 }, { "epoch": 0.0753577106518283, "grad_norm": 0.304064005613327, "learning_rate": 4.934036752808768e-05, "loss": 0.4678, "num_input_tokens_seen": 1848376626, "step": 474, "train_runtime": 18827.4309, "train_tokens_per_second": 98174.66 }, { "epoch": 0.075516693163752, "grad_norm": 0.2983545660972595, "learning_rate": 4.933750919148221e-05, "loss": 0.479, "num_input_tokens_seen": 1852410293, "step": 475, "train_runtime": 18865.2287, "train_tokens_per_second": 98191.775 }, { "epoch": 0.07567567567567568, "grad_norm": 0.3092314898967743, "learning_rate": 4.933464475850267e-05, "loss": 0.4655, "num_input_tokens_seen": 1856270410, "step": 476, "train_runtime": 18904.9702, "train_tokens_per_second": 98189.544 }, { "epoch": 0.07583465818759937, "grad_norm": 0.2745969593524933, "learning_rate": 4.9331774229866564e-05, "loss": 0.4728, "num_input_tokens_seen": 1860195324, "step": 477, "train_runtime": 18941.7363, "train_tokens_per_second": 98206.167 }, { "epoch": 0.07599364069952305, "grad_norm": 0.2910909652709961, "learning_rate": 4.932889760629296e-05, "loss": 0.471, "num_input_tokens_seen": 1864115506, "step": 478, "train_runtime": 18982.027, "train_tokens_per_second": 98204.239 }, { "epoch": 0.07615262321144674, "grad_norm": 0.33323776721954346, "learning_rate": 4.932601488850243e-05, "loss": 0.4701, "num_input_tokens_seen": 1868069096, "step": 479, "train_runtime": 19022.6791, "train_tokens_per_second": 98202.208 }, { "epoch": 0.07631160572337042, "grad_norm": 0.31960204243659973, "learning_rate": 4.9323126077217055e-05, "loss": 0.4585, "num_input_tokens_seen": 1872041915, "step": 480, "train_runtime": 19062.8978, "train_tokens_per_second": 98203.428 }, { "epoch": 0.07647058823529412, "grad_norm": 0.3514964282512665, "learning_rate": 4.932023117316047e-05, "loss": 0.4684, "num_input_tokens_seen": 1875899660, "step": 481, "train_runtime": 19103.0912, "train_tokens_per_second": 98198.749 }, { "epoch": 0.07662957074721781, "grad_norm": 0.27595630288124084, "learning_rate": 4.9317330177057836e-05, "loss": 0.447, "num_input_tokens_seen": 1879741283, "step": 482, "train_runtime": 19142.7624, "train_tokens_per_second": 98195.926 }, { "epoch": 0.0767885532591415, "grad_norm": 0.248579740524292, "learning_rate": 4.931442308963583e-05, "loss": 0.4778, "num_input_tokens_seen": 1883701458, "step": 483, "train_runtime": 19179.2772, "train_tokens_per_second": 98215.456 }, { "epoch": 0.07694753577106518, "grad_norm": 0.271593302488327, "learning_rate": 4.931150991162265e-05, "loss": 0.4613, "num_input_tokens_seen": 1887556310, "step": 484, "train_runtime": 19215.477, "train_tokens_per_second": 98231.041 }, { "epoch": 0.07710651828298887, "grad_norm": 0.3065283000469208, "learning_rate": 4.930859064374803e-05, "loss": 0.4609, "num_input_tokens_seen": 1891441714, "step": 485, "train_runtime": 19256.7307, "train_tokens_per_second": 98222.369 }, { "epoch": 0.07726550079491255, "grad_norm": 0.30274543166160583, "learning_rate": 4.930566528674323e-05, "loss": 0.466, "num_input_tokens_seen": 1895416383, "step": 486, "train_runtime": 19297.1225, "train_tokens_per_second": 98222.747 }, { "epoch": 0.07742448330683625, "grad_norm": 0.4265804588794708, "learning_rate": 4.930273384134103e-05, "loss": 0.47, "num_input_tokens_seen": 1899371756, "step": 487, "train_runtime": 19336.967, "train_tokens_per_second": 98224.906 }, { "epoch": 0.07758346581875994, "grad_norm": 0.2826431393623352, "learning_rate": 4.929979630827574e-05, "loss": 0.4707, "num_input_tokens_seen": 1903230069, "step": 488, "train_runtime": 19376.4605, "train_tokens_per_second": 98223.825 }, { "epoch": 0.07774244833068363, "grad_norm": 0.47140464186668396, "learning_rate": 4.9296852688283165e-05, "loss": 0.468, "num_input_tokens_seen": 1907137238, "step": 489, "train_runtime": 19416.6253, "train_tokens_per_second": 98221.87 }, { "epoch": 0.07790143084260731, "grad_norm": 0.26217642426490784, "learning_rate": 4.9293902982100704e-05, "loss": 0.4531, "num_input_tokens_seen": 1911084678, "step": 490, "train_runtime": 19457.7573, "train_tokens_per_second": 98217.11 }, { "epoch": 0.078060413354531, "grad_norm": 0.495524138212204, "learning_rate": 4.9290947190467204e-05, "loss": 0.4605, "num_input_tokens_seen": 1915062141, "step": 491, "train_runtime": 19494.9728, "train_tokens_per_second": 98233.64 }, { "epoch": 0.0782193958664547, "grad_norm": 0.299270898103714, "learning_rate": 4.928798531412309e-05, "loss": 0.4583, "num_input_tokens_seen": 1919033088, "step": 492, "train_runtime": 19533.4203, "train_tokens_per_second": 98243.577 }, { "epoch": 0.07837837837837838, "grad_norm": 0.24828854203224182, "learning_rate": 4.928501735381027e-05, "loss": 0.4645, "num_input_tokens_seen": 1922891959, "step": 493, "train_runtime": 19572.6967, "train_tokens_per_second": 98243.589 }, { "epoch": 0.07853736089030207, "grad_norm": 0.2648652493953705, "learning_rate": 4.928204331027221e-05, "loss": 0.4598, "num_input_tokens_seen": 1926776364, "step": 494, "train_runtime": 19611.2754, "train_tokens_per_second": 98248.397 }, { "epoch": 0.07869634340222575, "grad_norm": 0.6644258499145508, "learning_rate": 4.927906318425389e-05, "loss": 0.4605, "num_input_tokens_seen": 1930675365, "step": 495, "train_runtime": 19650.2465, "train_tokens_per_second": 98251.967 }, { "epoch": 0.07885532591414944, "grad_norm": 0.30585965514183044, "learning_rate": 4.927607697650182e-05, "loss": 0.4719, "num_input_tokens_seen": 1934528999, "step": 496, "train_runtime": 19690.66, "train_tokens_per_second": 98246.021 }, { "epoch": 0.07901430842607313, "grad_norm": 0.29429540038108826, "learning_rate": 4.927308468776399e-05, "loss": 0.4639, "num_input_tokens_seen": 1938466231, "step": 497, "train_runtime": 19730.5328, "train_tokens_per_second": 98247.029 }, { "epoch": 0.07917329093799683, "grad_norm": 0.2839626371860504, "learning_rate": 4.9270086318789973e-05, "loss": 0.4672, "num_input_tokens_seen": 1942231146, "step": 498, "train_runtime": 19770.4382, "train_tokens_per_second": 98239.155 }, { "epoch": 0.07933227344992051, "grad_norm": 0.24987854063510895, "learning_rate": 4.926708187033084e-05, "loss": 0.4613, "num_input_tokens_seen": 1946127757, "step": 499, "train_runtime": 19808.1719, "train_tokens_per_second": 98248.731 }, { "epoch": 0.0794912559618442, "grad_norm": 0.3509156405925751, "learning_rate": 4.926407134313918e-05, "loss": 0.4547, "num_input_tokens_seen": 1950167343, "step": 500, "train_runtime": 19851.5545, "train_tokens_per_second": 98237.513 }, { "epoch": 0.07965023847376788, "grad_norm": 0.26572415232658386, "learning_rate": 4.92610547379691e-05, "loss": 0.4564, "num_input_tokens_seen": 1953875155, "step": 501, "train_runtime": 19892.2192, "train_tokens_per_second": 98223.086 }, { "epoch": 0.07980922098569157, "grad_norm": 0.27927690744400024, "learning_rate": 4.925803205557625e-05, "loss": 0.4666, "num_input_tokens_seen": 1957780092, "step": 502, "train_runtime": 19927.6596, "train_tokens_per_second": 98244.357 }, { "epoch": 0.07996820349761526, "grad_norm": 0.2449171096086502, "learning_rate": 4.9255003296717786e-05, "loss": 0.4588, "num_input_tokens_seen": 1961629182, "step": 503, "train_runtime": 19967.4938, "train_tokens_per_second": 98241.131 }, { "epoch": 0.08012718600953896, "grad_norm": 0.28855857253074646, "learning_rate": 4.9251968462152385e-05, "loss": 0.4568, "num_input_tokens_seen": 1965609434, "step": 504, "train_runtime": 20006.9863, "train_tokens_per_second": 98246.153 }, { "epoch": 0.08028616852146264, "grad_norm": 0.2276218682527542, "learning_rate": 4.9248927552640266e-05, "loss": 0.4693, "num_input_tokens_seen": 1969646301, "step": 505, "train_runtime": 20048.4049, "train_tokens_per_second": 98244.539 }, { "epoch": 0.08044515103338633, "grad_norm": 0.2208588868379593, "learning_rate": 4.9245880568943134e-05, "loss": 0.4669, "num_input_tokens_seen": 1973539826, "step": 506, "train_runtime": 20088.7968, "train_tokens_per_second": 98240.818 }, { "epoch": 0.08060413354531001, "grad_norm": 0.2535344958305359, "learning_rate": 4.924282751182426e-05, "loss": 0.4626, "num_input_tokens_seen": 1977403811, "step": 507, "train_runtime": 20127.1429, "train_tokens_per_second": 98245.629 }, { "epoch": 0.0807631160572337, "grad_norm": 0.2433733195066452, "learning_rate": 4.92397683820484e-05, "loss": 0.4789, "num_input_tokens_seen": 1981375579, "step": 508, "train_runtime": 20164.9405, "train_tokens_per_second": 98258.439 }, { "epoch": 0.08092209856915739, "grad_norm": 0.35170459747314453, "learning_rate": 4.923670318038185e-05, "loss": 0.459, "num_input_tokens_seen": 1985238720, "step": 509, "train_runtime": 20204.7601, "train_tokens_per_second": 98255.991 }, { "epoch": 0.08108108108108109, "grad_norm": 0.23266899585723877, "learning_rate": 4.9233631907592415e-05, "loss": 0.456, "num_input_tokens_seen": 1989163442, "step": 510, "train_runtime": 20243.0302, "train_tokens_per_second": 98264.115 }, { "epoch": 0.08124006359300477, "grad_norm": 0.2754119336605072, "learning_rate": 4.9230554564449424e-05, "loss": 0.47, "num_input_tokens_seen": 1993032390, "step": 511, "train_runtime": 20281.149, "train_tokens_per_second": 98270.191 }, { "epoch": 0.08139904610492846, "grad_norm": 0.254635751247406, "learning_rate": 4.922747115172375e-05, "loss": 0.4734, "num_input_tokens_seen": 1996913624, "step": 512, "train_runtime": 20320.9802, "train_tokens_per_second": 98268.568 }, { "epoch": 0.08155802861685214, "grad_norm": 0.2926578223705292, "learning_rate": 4.922438167018774e-05, "loss": 0.4636, "num_input_tokens_seen": 2000809264, "step": 513, "train_runtime": 20360.481, "train_tokens_per_second": 98269.253 }, { "epoch": 0.08171701112877583, "grad_norm": 0.5175108909606934, "learning_rate": 4.922128612061531e-05, "loss": 0.4672, "num_input_tokens_seen": 2004698970, "step": 514, "train_runtime": 20399.7496, "train_tokens_per_second": 98270.764 }, { "epoch": 0.08187599364069953, "grad_norm": 0.30539995431900024, "learning_rate": 4.921818450378185e-05, "loss": 0.4769, "num_input_tokens_seen": 2008584932, "step": 515, "train_runtime": 20438.9943, "train_tokens_per_second": 98272.2 }, { "epoch": 0.08203497615262322, "grad_norm": 0.41307640075683594, "learning_rate": 4.921507682046432e-05, "loss": 0.4694, "num_input_tokens_seen": 2012399008, "step": 516, "train_runtime": 20479.2052, "train_tokens_per_second": 98265.484 }, { "epoch": 0.0821939586645469, "grad_norm": 0.2867785692214966, "learning_rate": 4.9211963071441145e-05, "loss": 0.4507, "num_input_tokens_seen": 2016309279, "step": 517, "train_runtime": 20517.3176, "train_tokens_per_second": 98273.533 }, { "epoch": 0.08235294117647059, "grad_norm": 0.28834977746009827, "learning_rate": 4.9208843257492324e-05, "loss": 0.4601, "num_input_tokens_seen": 2020179881, "step": 518, "train_runtime": 20557.9845, "train_tokens_per_second": 98267.409 }, { "epoch": 0.08251192368839427, "grad_norm": 0.31479257345199585, "learning_rate": 4.920571737939933e-05, "loss": 0.4725, "num_input_tokens_seen": 2024043418, "step": 519, "train_runtime": 20596.2472, "train_tokens_per_second": 98272.437 }, { "epoch": 0.08267090620031796, "grad_norm": 0.2780756652355194, "learning_rate": 4.920258543794517e-05, "loss": 0.4626, "num_input_tokens_seen": 2027905007, "step": 520, "train_runtime": 20635.4752, "train_tokens_per_second": 98272.755 }, { "epoch": 0.08282988871224166, "grad_norm": 0.27241209149360657, "learning_rate": 4.9199447433914396e-05, "loss": 0.4749, "num_input_tokens_seen": 2031754831, "step": 521, "train_runtime": 20673.7171, "train_tokens_per_second": 98277.19 }, { "epoch": 0.08298887122416534, "grad_norm": 0.32051825523376465, "learning_rate": 4.919630336809303e-05, "loss": 0.4674, "num_input_tokens_seen": 2035642570, "step": 522, "train_runtime": 20715.6551, "train_tokens_per_second": 98265.904 }, { "epoch": 0.08314785373608903, "grad_norm": 0.28197014331817627, "learning_rate": 4.919315324126866e-05, "loss": 0.4632, "num_input_tokens_seen": 2039689675, "step": 523, "train_runtime": 20754.9259, "train_tokens_per_second": 98274.968 }, { "epoch": 0.08330683624801272, "grad_norm": 0.24139465391635895, "learning_rate": 4.9189997054230356e-05, "loss": 0.4802, "num_input_tokens_seen": 2043677755, "step": 524, "train_runtime": 20795.4196, "train_tokens_per_second": 98275.38 }, { "epoch": 0.0834658187599364, "grad_norm": 0.287516713142395, "learning_rate": 4.918683480776872e-05, "loss": 0.4689, "num_input_tokens_seen": 2047549189, "step": 525, "train_runtime": 20833.6335, "train_tokens_per_second": 98280.945 }, { "epoch": 0.08362480127186009, "grad_norm": 0.3671610951423645, "learning_rate": 4.9183666502675885e-05, "loss": 0.4685, "num_input_tokens_seen": 2051390252, "step": 526, "train_runtime": 20870.4031, "train_tokens_per_second": 98291.836 }, { "epoch": 0.08378378378378379, "grad_norm": 0.44317370653152466, "learning_rate": 4.918049213974548e-05, "loss": 0.4571, "num_input_tokens_seen": 2055349624, "step": 527, "train_runtime": 20911.3573, "train_tokens_per_second": 98288.676 }, { "epoch": 0.08394276629570747, "grad_norm": 0.29575783014297485, "learning_rate": 4.917731171977266e-05, "loss": 0.4607, "num_input_tokens_seen": 2059202144, "step": 528, "train_runtime": 20948.6562, "train_tokens_per_second": 98297.577 }, { "epoch": 0.08410174880763116, "grad_norm": 0.29779598116874695, "learning_rate": 4.91741252435541e-05, "loss": 0.472, "num_input_tokens_seen": 2063077173, "step": 529, "train_runtime": 20987.3638, "train_tokens_per_second": 98300.92 }, { "epoch": 0.08426073131955485, "grad_norm": 0.3875817060470581, "learning_rate": 4.917093271188799e-05, "loss": 0.4691, "num_input_tokens_seen": 2066937460, "step": 530, "train_runtime": 21025.0542, "train_tokens_per_second": 98308.306 }, { "epoch": 0.08441971383147853, "grad_norm": 0.3888520896434784, "learning_rate": 4.916773412557404e-05, "loss": 0.4693, "num_input_tokens_seen": 2070879400, "step": 531, "train_runtime": 21063.3073, "train_tokens_per_second": 98316.915 }, { "epoch": 0.08457869634340223, "grad_norm": 0.36368706822395325, "learning_rate": 4.916452948541346e-05, "loss": 0.4466, "num_input_tokens_seen": 2074928607, "step": 532, "train_runtime": 21103.1463, "train_tokens_per_second": 98323.187 }, { "epoch": 0.08473767885532592, "grad_norm": 0.29154959321022034, "learning_rate": 4.9161318792209006e-05, "loss": 0.4753, "num_input_tokens_seen": 2078812813, "step": 533, "train_runtime": 21141.4551, "train_tokens_per_second": 98328.748 }, { "epoch": 0.0848966613672496, "grad_norm": 0.2597176134586334, "learning_rate": 4.915810204676492e-05, "loss": 0.4689, "num_input_tokens_seen": 2082653670, "step": 534, "train_runtime": 21179.5818, "train_tokens_per_second": 98333.088 }, { "epoch": 0.08505564387917329, "grad_norm": 0.2767421007156372, "learning_rate": 4.9154879249886986e-05, "loss": 0.4612, "num_input_tokens_seen": 2086539507, "step": 535, "train_runtime": 21220.3196, "train_tokens_per_second": 98327.431 }, { "epoch": 0.08521462639109698, "grad_norm": 0.29445967078208923, "learning_rate": 4.915165040238249e-05, "loss": 0.4736, "num_input_tokens_seen": 2090449052, "step": 536, "train_runtime": 21260.3168, "train_tokens_per_second": 98326.336 }, { "epoch": 0.08537360890302066, "grad_norm": 0.2720232903957367, "learning_rate": 4.9148415505060233e-05, "loss": 0.461, "num_input_tokens_seen": 2094356363, "step": 537, "train_runtime": 21298.914, "train_tokens_per_second": 98331.603 }, { "epoch": 0.08553259141494436, "grad_norm": 0.3060682415962219, "learning_rate": 4.9145174558730526e-05, "loss": 0.4718, "num_input_tokens_seen": 2098166744, "step": 538, "train_runtime": 21337.9814, "train_tokens_per_second": 98330.142 }, { "epoch": 0.08569157392686805, "grad_norm": 0.25226572155952454, "learning_rate": 4.914192756420521e-05, "loss": 0.4575, "num_input_tokens_seen": 2102161385, "step": 539, "train_runtime": 21376.4597, "train_tokens_per_second": 98340.016 }, { "epoch": 0.08585055643879173, "grad_norm": 0.3425740599632263, "learning_rate": 4.913867452229763e-05, "loss": 0.4595, "num_input_tokens_seen": 2106121433, "step": 540, "train_runtime": 21414.2547, "train_tokens_per_second": 98351.377 }, { "epoch": 0.08600953895071542, "grad_norm": 0.26750582456588745, "learning_rate": 4.913541543382267e-05, "loss": 0.4656, "num_input_tokens_seen": 2109962263, "step": 541, "train_runtime": 21456.9079, "train_tokens_per_second": 98334.871 }, { "epoch": 0.0861685214626391, "grad_norm": 0.2922716736793518, "learning_rate": 4.9132150299596685e-05, "loss": 0.4597, "num_input_tokens_seen": 2113847219, "step": 542, "train_runtime": 21494.8351, "train_tokens_per_second": 98342.1 }, { "epoch": 0.08632750397456279, "grad_norm": 0.27564114332199097, "learning_rate": 4.9128879120437574e-05, "loss": 0.4589, "num_input_tokens_seen": 2117873600, "step": 543, "train_runtime": 21531.3341, "train_tokens_per_second": 98362.395 }, { "epoch": 0.08648648648648649, "grad_norm": 0.24546188116073608, "learning_rate": 4.9125601897164754e-05, "loss": 0.4685, "num_input_tokens_seen": 2121668779, "step": 544, "train_runtime": 21571.2536, "train_tokens_per_second": 98356.304 }, { "epoch": 0.08664546899841018, "grad_norm": 0.30155256390571594, "learning_rate": 4.912231863059913e-05, "loss": 0.4618, "num_input_tokens_seen": 2125541019, "step": 545, "train_runtime": 21612.5821, "train_tokens_per_second": 98347.389 }, { "epoch": 0.08680445151033386, "grad_norm": 0.27160969376564026, "learning_rate": 4.911902932156315e-05, "loss": 0.4562, "num_input_tokens_seen": 2129510420, "step": 546, "train_runtime": 21651.4973, "train_tokens_per_second": 98353.956 }, { "epoch": 0.08696343402225755, "grad_norm": 0.2743230164051056, "learning_rate": 4.911573397088076e-05, "loss": 0.456, "num_input_tokens_seen": 2133274146, "step": 547, "train_runtime": 21689.573, "train_tokens_per_second": 98354.825 }, { "epoch": 0.08712241653418124, "grad_norm": 0.2695024609565735, "learning_rate": 4.911243257937742e-05, "loss": 0.4542, "num_input_tokens_seen": 2137162419, "step": 548, "train_runtime": 21729.5581, "train_tokens_per_second": 98352.779 }, { "epoch": 0.08728139904610493, "grad_norm": 0.28093844652175903, "learning_rate": 4.91091251478801e-05, "loss": 0.4533, "num_input_tokens_seen": 2141020970, "step": 549, "train_runtime": 21770.4792, "train_tokens_per_second": 98345.146 }, { "epoch": 0.08744038155802862, "grad_norm": 0.2407512068748474, "learning_rate": 4.91058116772173e-05, "loss": 0.4743, "num_input_tokens_seen": 2144942259, "step": 550, "train_runtime": 21807.6106, "train_tokens_per_second": 98357.509 }, { "epoch": 0.0875993640699523, "grad_norm": 0.5757196545600891, "learning_rate": 4.910249216821902e-05, "loss": 0.4673, "num_input_tokens_seen": 2148892786, "step": 551, "train_runtime": 21848.4006, "train_tokens_per_second": 98354.695 }, { "epoch": 0.08775834658187599, "grad_norm": 0.2829740047454834, "learning_rate": 4.9099166621716755e-05, "loss": 0.4592, "num_input_tokens_seen": 2152742812, "step": 552, "train_runtime": 21888.1092, "train_tokens_per_second": 98352.16 }, { "epoch": 0.08791732909379968, "grad_norm": 0.28897619247436523, "learning_rate": 4.909583503854355e-05, "loss": 0.4579, "num_input_tokens_seen": 2156643633, "step": 553, "train_runtime": 21927.8354, "train_tokens_per_second": 98351.871 }, { "epoch": 0.08807631160572336, "grad_norm": 0.28615257143974304, "learning_rate": 4.9092497419533945e-05, "loss": 0.4588, "num_input_tokens_seen": 2160572630, "step": 554, "train_runtime": 21970.9382, "train_tokens_per_second": 98337.75 }, { "epoch": 0.08823529411764706, "grad_norm": 0.2926349341869354, "learning_rate": 4.908915376552398e-05, "loss": 0.4591, "num_input_tokens_seen": 2164491939, "step": 555, "train_runtime": 22010.2816, "train_tokens_per_second": 98340.039 }, { "epoch": 0.08839427662957075, "grad_norm": 0.26073262095451355, "learning_rate": 4.9085804077351206e-05, "loss": 0.4513, "num_input_tokens_seen": 2168459985, "step": 556, "train_runtime": 22047.6764, "train_tokens_per_second": 98353.221 }, { "epoch": 0.08855325914149444, "grad_norm": 0.2930820882320404, "learning_rate": 4.9082448355854724e-05, "loss": 0.459, "num_input_tokens_seen": 2172384558, "step": 557, "train_runtime": 22084.0901, "train_tokens_per_second": 98368.76 }, { "epoch": 0.08871224165341812, "grad_norm": 0.26319336891174316, "learning_rate": 4.9079086601875094e-05, "loss": 0.463, "num_input_tokens_seen": 2176188224, "step": 558, "train_runtime": 22128.8958, "train_tokens_per_second": 98341.474 }, { "epoch": 0.08887122416534181, "grad_norm": 0.24411408603191376, "learning_rate": 4.907571881625443e-05, "loss": 0.4625, "num_input_tokens_seen": 2180059603, "step": 559, "train_runtime": 22167.5133, "train_tokens_per_second": 98344.798 }, { "epoch": 0.0890302066772655, "grad_norm": 0.24116185307502747, "learning_rate": 4.907234499983633e-05, "loss": 0.4671, "num_input_tokens_seen": 2184095983, "step": 560, "train_runtime": 22207.9546, "train_tokens_per_second": 98347.462 }, { "epoch": 0.0891891891891892, "grad_norm": 0.2854967713356018, "learning_rate": 4.906896515346591e-05, "loss": 0.4672, "num_input_tokens_seen": 2187961454, "step": 561, "train_runtime": 22249.3081, "train_tokens_per_second": 98338.404 }, { "epoch": 0.08934817170111288, "grad_norm": 0.25999367237091064, "learning_rate": 4.906557927798979e-05, "loss": 0.4594, "num_input_tokens_seen": 2191888623, "step": 562, "train_runtime": 22288.9325, "train_tokens_per_second": 98339.776 }, { "epoch": 0.08950715421303657, "grad_norm": 0.23788982629776, "learning_rate": 4.9062187374256124e-05, "loss": 0.4644, "num_input_tokens_seen": 2195714403, "step": 563, "train_runtime": 22330.7303, "train_tokens_per_second": 98327.031 }, { "epoch": 0.08966613672496025, "grad_norm": 0.23838937282562256, "learning_rate": 4.905878944311455e-05, "loss": 0.4642, "num_input_tokens_seen": 2199595324, "step": 564, "train_runtime": 22370.0569, "train_tokens_per_second": 98327.659 }, { "epoch": 0.08982511923688394, "grad_norm": 0.2604010999202728, "learning_rate": 4.9055385485416236e-05, "loss": 0.4731, "num_input_tokens_seen": 2203608578, "step": 565, "train_runtime": 22407.8395, "train_tokens_per_second": 98340.966 }, { "epoch": 0.08998410174880764, "grad_norm": 0.25083693861961365, "learning_rate": 4.905197550201384e-05, "loss": 0.4668, "num_input_tokens_seen": 2207559141, "step": 566, "train_runtime": 22448.4092, "train_tokens_per_second": 98339.224 }, { "epoch": 0.09014308426073132, "grad_norm": 0.24138064682483673, "learning_rate": 4.9048559493761524e-05, "loss": 0.4477, "num_input_tokens_seen": 2211539678, "step": 567, "train_runtime": 22485.9242, "train_tokens_per_second": 98352.18 }, { "epoch": 0.09030206677265501, "grad_norm": 0.25659236311912537, "learning_rate": 4.904513746151501e-05, "loss": 0.4662, "num_input_tokens_seen": 2215364022, "step": 568, "train_runtime": 22527.5057, "train_tokens_per_second": 98340.404 }, { "epoch": 0.0904610492845787, "grad_norm": 0.24049802124500275, "learning_rate": 4.904170940613146e-05, "loss": 0.4643, "num_input_tokens_seen": 2219248121, "step": 569, "train_runtime": 22567.4641, "train_tokens_per_second": 98338.392 }, { "epoch": 0.09062003179650238, "grad_norm": 0.23749041557312012, "learning_rate": 4.9038275328469605e-05, "loss": 0.4592, "num_input_tokens_seen": 2223259946, "step": 570, "train_runtime": 22607.8043, "train_tokens_per_second": 98340.375 }, { "epoch": 0.09077901430842607, "grad_norm": 0.23910818994045258, "learning_rate": 4.903483522938963e-05, "loss": 0.4693, "num_input_tokens_seen": 2227144849, "step": 571, "train_runtime": 22648.261, "train_tokens_per_second": 98336.241 }, { "epoch": 0.09093799682034977, "grad_norm": 0.2578393518924713, "learning_rate": 4.903138910975328e-05, "loss": 0.4486, "num_input_tokens_seen": 2231064995, "step": 572, "train_runtime": 22688.3837, "train_tokens_per_second": 98335.123 }, { "epoch": 0.09109697933227345, "grad_norm": 0.2371053248643875, "learning_rate": 4.902793697042376e-05, "loss": 0.4631, "num_input_tokens_seen": 2234916004, "step": 573, "train_runtime": 22727.2805, "train_tokens_per_second": 98336.27 }, { "epoch": 0.09125596184419714, "grad_norm": 0.21108576655387878, "learning_rate": 4.902447881226583e-05, "loss": 0.4604, "num_input_tokens_seen": 2238917764, "step": 574, "train_runtime": 22765.7281, "train_tokens_per_second": 98345.977 }, { "epoch": 0.09141494435612083, "grad_norm": 0.23281455039978027, "learning_rate": 4.902101463614571e-05, "loss": 0.4614, "num_input_tokens_seen": 2242746231, "step": 575, "train_runtime": 22803.2059, "train_tokens_per_second": 98352.233 }, { "epoch": 0.09157392686804451, "grad_norm": 0.24358613789081573, "learning_rate": 4.901754444293118e-05, "loss": 0.4578, "num_input_tokens_seen": 2246647995, "step": 576, "train_runtime": 22842.8075, "train_tokens_per_second": 98352.534 }, { "epoch": 0.0917329093799682, "grad_norm": 0.21117724478244781, "learning_rate": 4.901406823349147e-05, "loss": 0.4637, "num_input_tokens_seen": 2250637127, "step": 577, "train_runtime": 22882.2189, "train_tokens_per_second": 98357.469 }, { "epoch": 0.0918918918918919, "grad_norm": 0.2792225778102875, "learning_rate": 4.9010586008697364e-05, "loss": 0.4487, "num_input_tokens_seen": 2254586319, "step": 578, "train_runtime": 22922.1269, "train_tokens_per_second": 98358.513 }, { "epoch": 0.09205087440381558, "grad_norm": 0.2160796821117401, "learning_rate": 4.900709776942114e-05, "loss": 0.4525, "num_input_tokens_seen": 2258506857, "step": 579, "train_runtime": 22961.0239, "train_tokens_per_second": 98362.637 }, { "epoch": 0.09220985691573927, "grad_norm": 0.34760555624961853, "learning_rate": 4.9003603516536556e-05, "loss": 0.4539, "num_input_tokens_seen": 2262312760, "step": 580, "train_runtime": 22999.3376, "train_tokens_per_second": 98364.257 }, { "epoch": 0.09236883942766295, "grad_norm": 0.26926299929618835, "learning_rate": 4.9000103250918915e-05, "loss": 0.4539, "num_input_tokens_seen": 2266271121, "step": 581, "train_runtime": 23036.8338, "train_tokens_per_second": 98375.981 }, { "epoch": 0.09252782193958664, "grad_norm": 0.21216091513633728, "learning_rate": 4.8996596973445e-05, "loss": 0.458, "num_input_tokens_seen": 2270143604, "step": 582, "train_runtime": 23076.2401, "train_tokens_per_second": 98375.801 }, { "epoch": 0.09268680445151034, "grad_norm": 0.2667961120605469, "learning_rate": 4.8993084684993105e-05, "loss": 0.4769, "num_input_tokens_seen": 2273927788, "step": 583, "train_runtime": 23113.0515, "train_tokens_per_second": 98382.846 }, { "epoch": 0.09284578696343403, "grad_norm": 0.24110384285449982, "learning_rate": 4.898956638644305e-05, "loss": 0.4585, "num_input_tokens_seen": 2277915602, "step": 584, "train_runtime": 23151.6969, "train_tokens_per_second": 98390.87 }, { "epoch": 0.09300476947535771, "grad_norm": 0.24326500296592712, "learning_rate": 4.898604207867613e-05, "loss": 0.4629, "num_input_tokens_seen": 2281841796, "step": 585, "train_runtime": 23192.3601, "train_tokens_per_second": 98387.649 }, { "epoch": 0.0931637519872814, "grad_norm": 0.2646254003047943, "learning_rate": 4.898251176257517e-05, "loss": 0.4567, "num_input_tokens_seen": 2285751664, "step": 586, "train_runtime": 23232.7348, "train_tokens_per_second": 98384.959 }, { "epoch": 0.09332273449920508, "grad_norm": 0.23540903627872467, "learning_rate": 4.897897543902447e-05, "loss": 0.4623, "num_input_tokens_seen": 2289617325, "step": 587, "train_runtime": 23272.3957, "train_tokens_per_second": 98383.396 }, { "epoch": 0.09348171701112877, "grad_norm": 0.2427922487258911, "learning_rate": 4.897543310890987e-05, "loss": 0.4477, "num_input_tokens_seen": 2293551138, "step": 588, "train_runtime": 23311.0542, "train_tokens_per_second": 98388.993 }, { "epoch": 0.09364069952305247, "grad_norm": 0.24431480467319489, "learning_rate": 4.8971884773118705e-05, "loss": 0.464, "num_input_tokens_seen": 2297396887, "step": 589, "train_runtime": 23350.2286, "train_tokens_per_second": 98388.625 }, { "epoch": 0.09379968203497616, "grad_norm": 0.24957431852817535, "learning_rate": 4.8968330432539786e-05, "loss": 0.4643, "num_input_tokens_seen": 2301198583, "step": 590, "train_runtime": 23390.2607, "train_tokens_per_second": 98382.767 }, { "epoch": 0.09395866454689984, "grad_norm": 0.2870663106441498, "learning_rate": 4.896477008806347e-05, "loss": 0.459, "num_input_tokens_seen": 2305150832, "step": 591, "train_runtime": 23430.3586, "train_tokens_per_second": 98383.08 }, { "epoch": 0.09411764705882353, "grad_norm": 0.2737032175064087, "learning_rate": 4.896120374058158e-05, "loss": 0.4731, "num_input_tokens_seen": 2309020782, "step": 592, "train_runtime": 23469.1229, "train_tokens_per_second": 98385.474 }, { "epoch": 0.09427662957074721, "grad_norm": 0.2386150062084198, "learning_rate": 4.895763139098748e-05, "loss": 0.4712, "num_input_tokens_seen": 2313006853, "step": 593, "train_runtime": 23508.9495, "train_tokens_per_second": 98388.354 }, { "epoch": 0.0944356120826709, "grad_norm": 0.2816508710384369, "learning_rate": 4.8954053040176003e-05, "loss": 0.4489, "num_input_tokens_seen": 2316846032, "step": 594, "train_runtime": 23548.4351, "train_tokens_per_second": 98386.412 }, { "epoch": 0.0945945945945946, "grad_norm": 0.25214263796806335, "learning_rate": 4.895046868904352e-05, "loss": 0.464, "num_input_tokens_seen": 2320844922, "step": 595, "train_runtime": 23587.8361, "train_tokens_per_second": 98391.599 }, { "epoch": 0.09475357710651829, "grad_norm": 0.2589946985244751, "learning_rate": 4.894687833848787e-05, "loss": 0.4541, "num_input_tokens_seen": 2324735386, "step": 596, "train_runtime": 23625.7616, "train_tokens_per_second": 98398.326 }, { "epoch": 0.09491255961844197, "grad_norm": 0.24161137640476227, "learning_rate": 4.894328198940841e-05, "loss": 0.4588, "num_input_tokens_seen": 2328548085, "step": 597, "train_runtime": 23664.4217, "train_tokens_per_second": 98398.69 }, { "epoch": 0.09507154213036566, "grad_norm": 0.23802675306797028, "learning_rate": 4.893967964270599e-05, "loss": 0.4491, "num_input_tokens_seen": 2332373646, "step": 598, "train_runtime": 23704.9025, "train_tokens_per_second": 98392.037 }, { "epoch": 0.09523052464228934, "grad_norm": 0.24710239470005035, "learning_rate": 4.8936071299283006e-05, "loss": 0.4538, "num_input_tokens_seen": 2336353878, "step": 599, "train_runtime": 23742.614, "train_tokens_per_second": 98403.397 }, { "epoch": 0.09538950715421304, "grad_norm": 0.23252640664577484, "learning_rate": 4.8932456960043296e-05, "loss": 0.4562, "num_input_tokens_seen": 2340343030, "step": 600, "train_runtime": 23780.7863, "train_tokens_per_second": 98413.19 }, { "epoch": 0.09554848966613673, "grad_norm": 0.23973892629146576, "learning_rate": 4.8928836625892225e-05, "loss": 0.448, "num_input_tokens_seen": 2344048780, "step": 601, "train_runtime": 23935.2485, "train_tokens_per_second": 97932.92 }, { "epoch": 0.09570747217806042, "grad_norm": 0.25784027576446533, "learning_rate": 4.892521029773667e-05, "loss": 0.4765, "num_input_tokens_seen": 2348066551, "step": 602, "train_runtime": 23977.0905, "train_tokens_per_second": 97929.586 }, { "epoch": 0.0958664546899841, "grad_norm": 0.23957376182079315, "learning_rate": 4.892157797648501e-05, "loss": 0.4647, "num_input_tokens_seen": 2352009270, "step": 603, "train_runtime": 24017.1147, "train_tokens_per_second": 97930.551 }, { "epoch": 0.09602543720190779, "grad_norm": 0.23159149289131165, "learning_rate": 4.89179396630471e-05, "loss": 0.4508, "num_input_tokens_seen": 2355886078, "step": 604, "train_runtime": 24055.5342, "train_tokens_per_second": 97935.305 }, { "epoch": 0.09618441971383147, "grad_norm": 0.2413492351770401, "learning_rate": 4.8914295358334313e-05, "loss": 0.4742, "num_input_tokens_seen": 2359692064, "step": 605, "train_runtime": 24099.2212, "train_tokens_per_second": 97915.698 }, { "epoch": 0.09634340222575517, "grad_norm": 0.2598983943462372, "learning_rate": 4.891064506325953e-05, "loss": 0.4548, "num_input_tokens_seen": 2363668390, "step": 606, "train_runtime": 24141.1728, "train_tokens_per_second": 97910.255 }, { "epoch": 0.09650238473767886, "grad_norm": 0.23252859711647034, "learning_rate": 4.890698877873712e-05, "loss": 0.4684, "num_input_tokens_seen": 2367635841, "step": 607, "train_runtime": 24182.6543, "train_tokens_per_second": 97906.368 }, { "epoch": 0.09666136724960255, "grad_norm": 0.34616774320602417, "learning_rate": 4.890332650568295e-05, "loss": 0.4462, "num_input_tokens_seen": 2371553636, "step": 608, "train_runtime": 24220.5368, "train_tokens_per_second": 97914.991 }, { "epoch": 0.09682034976152623, "grad_norm": 0.24847561120986938, "learning_rate": 4.8899658245014404e-05, "loss": 0.4569, "num_input_tokens_seen": 2375505955, "step": 609, "train_runtime": 24258.9159, "train_tokens_per_second": 97923.006 }, { "epoch": 0.09697933227344992, "grad_norm": 0.23334695398807526, "learning_rate": 4.8895983997650355e-05, "loss": 0.4529, "num_input_tokens_seen": 2379387401, "step": 610, "train_runtime": 24298.7272, "train_tokens_per_second": 97922.306 }, { "epoch": 0.0971383147853736, "grad_norm": 0.2256428301334381, "learning_rate": 4.889230376451116e-05, "loss": 0.4502, "num_input_tokens_seen": 2383252981, "step": 611, "train_runtime": 24338.8514, "train_tokens_per_second": 97919.698 }, { "epoch": 0.0972972972972973, "grad_norm": 0.2138104885816574, "learning_rate": 4.888861754651871e-05, "loss": 0.4592, "num_input_tokens_seen": 2387141341, "step": 612, "train_runtime": 24379.1601, "train_tokens_per_second": 97917.292 }, { "epoch": 0.09745627980922099, "grad_norm": 0.2812614440917969, "learning_rate": 4.888492534459638e-05, "loss": 0.481, "num_input_tokens_seen": 2390993673, "step": 613, "train_runtime": 24420.1676, "train_tokens_per_second": 97910.617 }, { "epoch": 0.09761526232114467, "grad_norm": 0.2332046627998352, "learning_rate": 4.888122715966902e-05, "loss": 0.467, "num_input_tokens_seen": 2394796481, "step": 614, "train_runtime": 24460.7895, "train_tokens_per_second": 97903.483 }, { "epoch": 0.09777424483306836, "grad_norm": 0.22644688189029694, "learning_rate": 4.887752299266301e-05, "loss": 0.4605, "num_input_tokens_seen": 2398839407, "step": 615, "train_runtime": 24500.9959, "train_tokens_per_second": 97907.833 }, { "epoch": 0.09793322734499205, "grad_norm": 0.22258853912353516, "learning_rate": 4.887381284450622e-05, "loss": 0.4631, "num_input_tokens_seen": 2402601844, "step": 616, "train_runtime": 24540.8697, "train_tokens_per_second": 97902.066 }, { "epoch": 0.09809220985691573, "grad_norm": 0.24997523427009583, "learning_rate": 4.887009671612801e-05, "loss": 0.4425, "num_input_tokens_seen": 2406473528, "step": 617, "train_runtime": 24580.7063, "train_tokens_per_second": 97900.911 }, { "epoch": 0.09825119236883943, "grad_norm": 0.2422657608985901, "learning_rate": 4.886637460845925e-05, "loss": 0.454, "num_input_tokens_seen": 2410498206, "step": 618, "train_runtime": 24617.1519, "train_tokens_per_second": 97919.459 }, { "epoch": 0.09841017488076312, "grad_norm": 0.2699436843395233, "learning_rate": 4.88626465224323e-05, "loss": 0.4613, "num_input_tokens_seen": 2414452491, "step": 619, "train_runtime": 24654.3386, "train_tokens_per_second": 97932.154 }, { "epoch": 0.0985691573926868, "grad_norm": 0.3022935092449188, "learning_rate": 4.885891245898101e-05, "loss": 0.4659, "num_input_tokens_seen": 2418354217, "step": 620, "train_runtime": 24695.2849, "train_tokens_per_second": 97927.772 }, { "epoch": 0.09872813990461049, "grad_norm": 0.28351929783821106, "learning_rate": 4.8855172419040754e-05, "loss": 0.4598, "num_input_tokens_seen": 2422144470, "step": 621, "train_runtime": 24735.9602, "train_tokens_per_second": 97919.97 }, { "epoch": 0.09888712241653418, "grad_norm": 0.2573317587375641, "learning_rate": 4.885142640354837e-05, "loss": 0.4538, "num_input_tokens_seen": 2426183883, "step": 622, "train_runtime": 24774.6856, "train_tokens_per_second": 97929.957 }, { "epoch": 0.09904610492845788, "grad_norm": 0.30018237233161926, "learning_rate": 4.8847674413442215e-05, "loss": 0.4578, "num_input_tokens_seen": 2429965998, "step": 623, "train_runtime": 24812.2758, "train_tokens_per_second": 97934.023 }, { "epoch": 0.09920508744038156, "grad_norm": 0.3238093852996826, "learning_rate": 4.884391644966214e-05, "loss": 0.4509, "num_input_tokens_seen": 2433789916, "step": 624, "train_runtime": 24851.4111, "train_tokens_per_second": 97933.671 }, { "epoch": 0.09936406995230525, "grad_norm": 0.31419360637664795, "learning_rate": 4.884015251314948e-05, "loss": 0.4507, "num_input_tokens_seen": 2437776160, "step": 625, "train_runtime": 24892.3202, "train_tokens_per_second": 97932.862 }, { "epoch": 0.09952305246422893, "grad_norm": 0.3027193546295166, "learning_rate": 4.883638260484709e-05, "loss": 0.4519, "num_input_tokens_seen": 2441784039, "step": 626, "train_runtime": 24931.5558, "train_tokens_per_second": 97939.497 }, { "epoch": 0.09968203497615262, "grad_norm": 0.31277143955230713, "learning_rate": 4.883260672569927e-05, "loss": 0.47, "num_input_tokens_seen": 2445672352, "step": 627, "train_runtime": 24970.8978, "train_tokens_per_second": 97940.906 }, { "epoch": 0.0998410174880763, "grad_norm": 0.2429690808057785, "learning_rate": 4.8828824876651895e-05, "loss": 0.4549, "num_input_tokens_seen": 2449361804, "step": 628, "train_runtime": 25012.5518, "train_tokens_per_second": 97925.307 }, { "epoch": 0.1, "grad_norm": 0.32546886801719666, "learning_rate": 4.882503705865227e-05, "loss": 0.4643, "num_input_tokens_seen": 2453359031, "step": 629, "train_runtime": 25051.2941, "train_tokens_per_second": 97933.425 }, { "epoch": 0.10015898251192369, "grad_norm": 0.2826651930809021, "learning_rate": 4.882124327264921e-05, "loss": 0.4494, "num_input_tokens_seen": 2457249323, "step": 630, "train_runtime": 25087.6548, "train_tokens_per_second": 97946.553 }, { "epoch": 0.10031796502384738, "grad_norm": 0.24052001535892487, "learning_rate": 4.8817443519593045e-05, "loss": 0.4546, "num_input_tokens_seen": 2461142140, "step": 631, "train_runtime": 25128.0364, "train_tokens_per_second": 97944.07 }, { "epoch": 0.10047694753577106, "grad_norm": 0.2565423548221588, "learning_rate": 4.8813637800435575e-05, "loss": 0.4639, "num_input_tokens_seen": 2464977434, "step": 632, "train_runtime": 25168.6161, "train_tokens_per_second": 97938.537 }, { "epoch": 0.10063593004769475, "grad_norm": 0.2805245816707611, "learning_rate": 4.880982611613011e-05, "loss": 0.4728, "num_input_tokens_seen": 2468933523, "step": 633, "train_runtime": 25208.8055, "train_tokens_per_second": 97939.33 }, { "epoch": 0.10079491255961844, "grad_norm": 0.35968536138534546, "learning_rate": 4.8806008467631456e-05, "loss": 0.4522, "num_input_tokens_seen": 2472651903, "step": 634, "train_runtime": 25247.8692, "train_tokens_per_second": 97935.073 }, { "epoch": 0.10095389507154214, "grad_norm": 0.2516714036464691, "learning_rate": 4.880218485589591e-05, "loss": 0.4497, "num_input_tokens_seen": 2476647145, "step": 635, "train_runtime": 25285.7061, "train_tokens_per_second": 97946.529 }, { "epoch": 0.10111287758346582, "grad_norm": 0.28106191754341125, "learning_rate": 4.8798355281881235e-05, "loss": 0.4463, "num_input_tokens_seen": 2480478110, "step": 636, "train_runtime": 25320.1877, "train_tokens_per_second": 97964.444 }, { "epoch": 0.10127186009538951, "grad_norm": 0.2863471210002899, "learning_rate": 4.879451974654674e-05, "loss": 0.4471, "num_input_tokens_seen": 2484158657, "step": 637, "train_runtime": 25360.3504, "train_tokens_per_second": 97954.43 }, { "epoch": 0.10143084260731319, "grad_norm": 0.30343613028526306, "learning_rate": 4.879067825085319e-05, "loss": 0.4614, "num_input_tokens_seen": 2488159970, "step": 638, "train_runtime": 25400.164, "train_tokens_per_second": 97958.422 }, { "epoch": 0.10158982511923688, "grad_norm": 0.26381149888038635, "learning_rate": 4.878683079576285e-05, "loss": 0.4566, "num_input_tokens_seen": 2492084185, "step": 639, "train_runtime": 25442.1909, "train_tokens_per_second": 97950.849 }, { "epoch": 0.10174880763116058, "grad_norm": 0.2593386471271515, "learning_rate": 4.878297738223948e-05, "loss": 0.463, "num_input_tokens_seen": 2495986266, "step": 640, "train_runtime": 25482.3185, "train_tokens_per_second": 97949.732 }, { "epoch": 0.10190779014308426, "grad_norm": 0.231521874666214, "learning_rate": 4.877911801124834e-05, "loss": 0.4307, "num_input_tokens_seen": 2499894707, "step": 641, "train_runtime": 25519.8586, "train_tokens_per_second": 97958.799 }, { "epoch": 0.10206677265500795, "grad_norm": 0.26792097091674805, "learning_rate": 4.877525268375616e-05, "loss": 0.4519, "num_input_tokens_seen": 2503732565, "step": 642, "train_runtime": 25563.399, "train_tokens_per_second": 97942.084 }, { "epoch": 0.10222575516693164, "grad_norm": 0.28449809551239014, "learning_rate": 4.87713814007312e-05, "loss": 0.4585, "num_input_tokens_seen": 2507543764, "step": 643, "train_runtime": 25601.6407, "train_tokens_per_second": 97944.651 }, { "epoch": 0.10238473767885532, "grad_norm": 0.23187312483787537, "learning_rate": 4.876750416314316e-05, "loss": 0.4544, "num_input_tokens_seen": 2511499182, "step": 644, "train_runtime": 25640.8019, "train_tokens_per_second": 97949.323 }, { "epoch": 0.10254372019077901, "grad_norm": 0.2438826858997345, "learning_rate": 4.876362097196328e-05, "loss": 0.4584, "num_input_tokens_seen": 2515320737, "step": 645, "train_runtime": 25681.129, "train_tokens_per_second": 97944.321 }, { "epoch": 0.10270270270270271, "grad_norm": 0.23290888965129852, "learning_rate": 4.8759731828164284e-05, "loss": 0.4536, "num_input_tokens_seen": 2519274615, "step": 646, "train_runtime": 25719.2065, "train_tokens_per_second": 97953.046 }, { "epoch": 0.1028616852146264, "grad_norm": 0.22094477713108063, "learning_rate": 4.875583673272035e-05, "loss": 0.4481, "num_input_tokens_seen": 2523215763, "step": 647, "train_runtime": 25759.4148, "train_tokens_per_second": 97953.148 }, { "epoch": 0.10302066772655008, "grad_norm": 0.25553128123283386, "learning_rate": 4.875193568660718e-05, "loss": 0.46, "num_input_tokens_seen": 2527000918, "step": 648, "train_runtime": 25798.9988, "train_tokens_per_second": 97949.573 }, { "epoch": 0.10317965023847377, "grad_norm": 0.24242661893367767, "learning_rate": 4.874802869080196e-05, "loss": 0.446, "num_input_tokens_seen": 2530983665, "step": 649, "train_runtime": 25839.5268, "train_tokens_per_second": 97950.078 }, { "epoch": 0.10333863275039745, "grad_norm": 0.2274998128414154, "learning_rate": 4.874411574628337e-05, "loss": 0.4741, "num_input_tokens_seen": 2534846544, "step": 650, "train_runtime": 25881.9374, "train_tokens_per_second": 97938.825 }, { "epoch": 0.10349761526232114, "grad_norm": 0.22399498522281647, "learning_rate": 4.874019685403156e-05, "loss": 0.4623, "num_input_tokens_seen": 2538731339, "step": 651, "train_runtime": 25922.0621, "train_tokens_per_second": 97937.09 }, { "epoch": 0.10365659777424484, "grad_norm": 0.22542297840118408, "learning_rate": 4.873627201502821e-05, "loss": 0.459, "num_input_tokens_seen": 2542690449, "step": 652, "train_runtime": 25961.3471, "train_tokens_per_second": 97941.391 }, { "epoch": 0.10381558028616852, "grad_norm": 0.27674636244773865, "learning_rate": 4.873234123025644e-05, "loss": 0.4508, "num_input_tokens_seen": 2546528588, "step": 653, "train_runtime": 25998.9617, "train_tokens_per_second": 97947.319 }, { "epoch": 0.10397456279809221, "grad_norm": 0.2234754115343094, "learning_rate": 4.8728404500700884e-05, "loss": 0.4578, "num_input_tokens_seen": 2550367662, "step": 654, "train_runtime": 26039.0365, "train_tokens_per_second": 97944.011 }, { "epoch": 0.1041335453100159, "grad_norm": 0.24134457111358643, "learning_rate": 4.872446182734769e-05, "loss": 0.4542, "num_input_tokens_seen": 2554300733, "step": 655, "train_runtime": 26078.6782, "train_tokens_per_second": 97945.943 }, { "epoch": 0.10429252782193958, "grad_norm": 0.2160264551639557, "learning_rate": 4.872051321118444e-05, "loss": 0.4628, "num_input_tokens_seen": 2558176232, "step": 656, "train_runtime": 26119.9024, "train_tokens_per_second": 97939.732 }, { "epoch": 0.10445151033386328, "grad_norm": 0.2860299348831177, "learning_rate": 4.8716558653200264e-05, "loss": 0.4562, "num_input_tokens_seen": 2562160003, "step": 657, "train_runtime": 26157.8457, "train_tokens_per_second": 97949.962 }, { "epoch": 0.10461049284578697, "grad_norm": 0.24577681720256805, "learning_rate": 4.871259815438572e-05, "loss": 0.4543, "num_input_tokens_seen": 2565988279, "step": 658, "train_runtime": 26195.0672, "train_tokens_per_second": 97956.927 }, { "epoch": 0.10476947535771065, "grad_norm": 0.24205291271209717, "learning_rate": 4.870863171573291e-05, "loss": 0.445, "num_input_tokens_seen": 2569930632, "step": 659, "train_runtime": 26233.6736, "train_tokens_per_second": 97963.048 }, { "epoch": 0.10492845786963434, "grad_norm": 0.2386258840560913, "learning_rate": 4.870465933823538e-05, "loss": 0.4582, "num_input_tokens_seen": 2573791402, "step": 660, "train_runtime": 26274.1648, "train_tokens_per_second": 97959.019 }, { "epoch": 0.10508744038155803, "grad_norm": 0.2675420641899109, "learning_rate": 4.870068102288819e-05, "loss": 0.4559, "num_input_tokens_seen": 2577767555, "step": 661, "train_runtime": 26313.3989, "train_tokens_per_second": 97964.066 }, { "epoch": 0.10524642289348171, "grad_norm": 0.2321314513683319, "learning_rate": 4.869669677068789e-05, "loss": 0.4608, "num_input_tokens_seen": 2581604634, "step": 662, "train_runtime": 26351.7785, "train_tokens_per_second": 97966.998 }, { "epoch": 0.10540540540540541, "grad_norm": 0.2241680771112442, "learning_rate": 4.86927065826325e-05, "loss": 0.4477, "num_input_tokens_seen": 2585392490, "step": 663, "train_runtime": 26390.641, "train_tokens_per_second": 97966.263 }, { "epoch": 0.1055643879173291, "grad_norm": 0.24567796289920807, "learning_rate": 4.868871045972152e-05, "loss": 0.4617, "num_input_tokens_seen": 2589389809, "step": 664, "train_runtime": 26428.9541, "train_tokens_per_second": 97975.493 }, { "epoch": 0.10572337042925278, "grad_norm": 0.2708342671394348, "learning_rate": 4.868470840295597e-05, "loss": 0.4491, "num_input_tokens_seen": 2593196499, "step": 665, "train_runtime": 26469.5477, "train_tokens_per_second": 97969.052 }, { "epoch": 0.10588235294117647, "grad_norm": 1.3213632106781006, "learning_rate": 4.868070041333833e-05, "loss": 0.4558, "num_input_tokens_seen": 2597159622, "step": 666, "train_runtime": 26509.9514, "train_tokens_per_second": 97969.233 }, { "epoch": 0.10604133545310016, "grad_norm": 0.2725995182991028, "learning_rate": 4.867668649187257e-05, "loss": 0.4523, "num_input_tokens_seen": 2600999745, "step": 667, "train_runtime": 26549.0993, "train_tokens_per_second": 97969.416 }, { "epoch": 0.10620031796502384, "grad_norm": 0.2576506435871124, "learning_rate": 4.867266663956416e-05, "loss": 0.4586, "num_input_tokens_seen": 2604866690, "step": 668, "train_runtime": 26588.1817, "train_tokens_per_second": 97970.847 }, { "epoch": 0.10635930047694754, "grad_norm": 0.2804892361164093, "learning_rate": 4.8668640857420024e-05, "loss": 0.4515, "num_input_tokens_seen": 2608689521, "step": 669, "train_runtime": 26628.6564, "train_tokens_per_second": 97965.496 }, { "epoch": 0.10651828298887123, "grad_norm": 0.29799625277519226, "learning_rate": 4.866460914644861e-05, "loss": 0.4526, "num_input_tokens_seen": 2612664947, "step": 670, "train_runtime": 26667.3088, "train_tokens_per_second": 97972.576 }, { "epoch": 0.10667726550079491, "grad_norm": 0.2639753222465515, "learning_rate": 4.8660571507659826e-05, "loss": 0.4645, "num_input_tokens_seen": 2616492375, "step": 671, "train_runtime": 26706.8125, "train_tokens_per_second": 97970.972 }, { "epoch": 0.1068362480127186, "grad_norm": 0.2366926670074463, "learning_rate": 4.8656527942065075e-05, "loss": 0.4693, "num_input_tokens_seen": 2620451430, "step": 672, "train_runtime": 26748.9722, "train_tokens_per_second": 97964.565 }, { "epoch": 0.10699523052464228, "grad_norm": 0.2584143877029419, "learning_rate": 4.8652478450677244e-05, "loss": 0.4633, "num_input_tokens_seen": 2624394645, "step": 673, "train_runtime": 26788.6628, "train_tokens_per_second": 97966.616 }, { "epoch": 0.10715421303656598, "grad_norm": 0.21548986434936523, "learning_rate": 4.864842303451069e-05, "loss": 0.4503, "num_input_tokens_seen": 2628279235, "step": 674, "train_runtime": 26824.4115, "train_tokens_per_second": 97980.872 }, { "epoch": 0.10731319554848967, "grad_norm": 0.3171943426132202, "learning_rate": 4.864436169458127e-05, "loss": 0.4419, "num_input_tokens_seen": 2632212008, "step": 675, "train_runtime": 26867.1615, "train_tokens_per_second": 97971.347 }, { "epoch": 0.10747217806041336, "grad_norm": 0.23043425381183624, "learning_rate": 4.864029443190633e-05, "loss": 0.4541, "num_input_tokens_seen": 2636076206, "step": 676, "train_runtime": 26905.0893, "train_tokens_per_second": 97976.861 }, { "epoch": 0.10763116057233704, "grad_norm": 0.2366182506084442, "learning_rate": 4.8636221247504685e-05, "loss": 0.448, "num_input_tokens_seen": 2640000958, "step": 677, "train_runtime": 26941.4593, "train_tokens_per_second": 97990.273 }, { "epoch": 0.10779014308426073, "grad_norm": 0.2669360339641571, "learning_rate": 4.8632142142396646e-05, "loss": 0.464, "num_input_tokens_seen": 2643800862, "step": 678, "train_runtime": 26980.4799, "train_tokens_per_second": 97989.393 }, { "epoch": 0.10794912559618441, "grad_norm": 0.252747118473053, "learning_rate": 4.8628057117603984e-05, "loss": 0.4643, "num_input_tokens_seen": 2647719753, "step": 679, "train_runtime": 27019.6336, "train_tokens_per_second": 97992.437 }, { "epoch": 0.10810810810810811, "grad_norm": 0.3159548044204712, "learning_rate": 4.862396617414999e-05, "loss": 0.4666, "num_input_tokens_seen": 2651784755, "step": 680, "train_runtime": 27059.0179, "train_tokens_per_second": 98000.037 }, { "epoch": 0.1082670906200318, "grad_norm": 0.25422725081443787, "learning_rate": 4.861986931305939e-05, "loss": 0.4695, "num_input_tokens_seen": 2655599825, "step": 681, "train_runtime": 27097.2908, "train_tokens_per_second": 98002.411 }, { "epoch": 0.10842607313195549, "grad_norm": 0.27013081312179565, "learning_rate": 4.861576653535845e-05, "loss": 0.4661, "num_input_tokens_seen": 2659450803, "step": 682, "train_runtime": 27132.5702, "train_tokens_per_second": 98016.914 }, { "epoch": 0.10858505564387917, "grad_norm": 0.22191056609153748, "learning_rate": 4.861165784207486e-05, "loss": 0.4547, "num_input_tokens_seen": 2663423922, "step": 683, "train_runtime": 27172.5112, "train_tokens_per_second": 98019.057 }, { "epoch": 0.10874403815580286, "grad_norm": 0.24029147624969482, "learning_rate": 4.860754323423783e-05, "loss": 0.4554, "num_input_tokens_seen": 2667346924, "step": 684, "train_runtime": 27213.8461, "train_tokens_per_second": 98014.331 }, { "epoch": 0.10890302066772654, "grad_norm": 0.256112664937973, "learning_rate": 4.8603422712878036e-05, "loss": 0.4521, "num_input_tokens_seen": 2671288703, "step": 685, "train_runtime": 27251.0166, "train_tokens_per_second": 98025.286 }, { "epoch": 0.10906200317965024, "grad_norm": 0.2581539750099182, "learning_rate": 4.859929627902765e-05, "loss": 0.4676, "num_input_tokens_seen": 2675175780, "step": 686, "train_runtime": 27290.1008, "train_tokens_per_second": 98027.332 }, { "epoch": 0.10922098569157393, "grad_norm": 0.25573399662971497, "learning_rate": 4.85951639337203e-05, "loss": 0.4493, "num_input_tokens_seen": 2679140119, "step": 687, "train_runtime": 27329.9166, "train_tokens_per_second": 98029.575 }, { "epoch": 0.10937996820349762, "grad_norm": 0.23165397346019745, "learning_rate": 4.859102567799112e-05, "loss": 0.4443, "num_input_tokens_seen": 2683027931, "step": 688, "train_runtime": 27369.9153, "train_tokens_per_second": 98028.361 }, { "epoch": 0.1095389507154213, "grad_norm": 0.283563494682312, "learning_rate": 4.858688151287671e-05, "loss": 0.4633, "num_input_tokens_seen": 2686909578, "step": 689, "train_runtime": 27409.7379, "train_tokens_per_second": 98027.555 }, { "epoch": 0.10969793322734499, "grad_norm": 0.3083319067955017, "learning_rate": 4.858273143941515e-05, "loss": 0.4519, "num_input_tokens_seen": 2690758511, "step": 690, "train_runtime": 27448.6469, "train_tokens_per_second": 98028.821 }, { "epoch": 0.10985691573926869, "grad_norm": 0.31054067611694336, "learning_rate": 4.8578575458646014e-05, "loss": 0.465, "num_input_tokens_seen": 2694699390, "step": 691, "train_runtime": 27487.097, "train_tokens_per_second": 98035.067 }, { "epoch": 0.11001589825119237, "grad_norm": 0.249383345246315, "learning_rate": 4.8574413571610334e-05, "loss": 0.4706, "num_input_tokens_seen": 2698589295, "step": 692, "train_runtime": 27528.1568, "train_tokens_per_second": 98030.148 }, { "epoch": 0.11017488076311606, "grad_norm": 0.3116813600063324, "learning_rate": 4.8570245779350645e-05, "loss": 0.4527, "num_input_tokens_seen": 2702526489, "step": 693, "train_runtime": 27568.5397, "train_tokens_per_second": 98029.367 }, { "epoch": 0.11033386327503975, "grad_norm": 0.2845156490802765, "learning_rate": 4.856607208291094e-05, "loss": 0.4451, "num_input_tokens_seen": 2706367262, "step": 694, "train_runtime": 27607.6129, "train_tokens_per_second": 98029.745 }, { "epoch": 0.11049284578696343, "grad_norm": 0.2559192180633545, "learning_rate": 4.856189248333671e-05, "loss": 0.4581, "num_input_tokens_seen": 2710182667, "step": 695, "train_runtime": 27644.5355, "train_tokens_per_second": 98036.831 }, { "epoch": 0.11065182829888712, "grad_norm": 0.2947029173374176, "learning_rate": 4.8557706981674906e-05, "loss": 0.4644, "num_input_tokens_seen": 2713987799, "step": 696, "train_runtime": 27680.5135, "train_tokens_per_second": 98046.873 }, { "epoch": 0.11081081081081082, "grad_norm": 0.33887436985969543, "learning_rate": 4.855351557897397e-05, "loss": 0.4537, "num_input_tokens_seen": 2717879541, "step": 697, "train_runtime": 27721.633, "train_tokens_per_second": 98041.827 }, { "epoch": 0.1109697933227345, "grad_norm": 0.2525343894958496, "learning_rate": 4.854931827628382e-05, "loss": 0.4496, "num_input_tokens_seen": 2721875959, "step": 698, "train_runtime": 27760.3903, "train_tokens_per_second": 98048.908 }, { "epoch": 0.11112877583465819, "grad_norm": 0.2602211833000183, "learning_rate": 4.854511507465584e-05, "loss": 0.4539, "num_input_tokens_seen": 2725765685, "step": 699, "train_runtime": 27801.7063, "train_tokens_per_second": 98043.108 }, { "epoch": 0.11128775834658187, "grad_norm": 0.27884435653686523, "learning_rate": 4.854090597514293e-05, "loss": 0.4403, "num_input_tokens_seen": 2729529912, "step": 700, "train_runtime": 27839.5402, "train_tokens_per_second": 98045.079 }, { "epoch": 0.11144674085850556, "grad_norm": 0.27333658933639526, "learning_rate": 4.853669097879942e-05, "loss": 0.4465, "num_input_tokens_seen": 2733331505, "step": 701, "train_runtime": 27880.205, "train_tokens_per_second": 98038.429 }, { "epoch": 0.11160572337042925, "grad_norm": 0.2490442842245102, "learning_rate": 4.8532470086681125e-05, "loss": 0.4549, "num_input_tokens_seen": 2737347665, "step": 702, "train_runtime": 27919.2834, "train_tokens_per_second": 98045.055 }, { "epoch": 0.11176470588235295, "grad_norm": 0.28183913230895996, "learning_rate": 4.8528243299845365e-05, "loss": 0.4591, "num_input_tokens_seen": 2741283706, "step": 703, "train_runtime": 27957.0329, "train_tokens_per_second": 98053.456 }, { "epoch": 0.11192368839427663, "grad_norm": 0.27110227942466736, "learning_rate": 4.852401061935092e-05, "loss": 0.4457, "num_input_tokens_seen": 2745079020, "step": 704, "train_runtime": 27997.9108, "train_tokens_per_second": 98045.852 }, { "epoch": 0.11208267090620032, "grad_norm": 0.3556916415691376, "learning_rate": 4.851977204625805e-05, "loss": 0.445, "num_input_tokens_seen": 2748995782, "step": 705, "train_runtime": 28039.6133, "train_tokens_per_second": 98039.718 }, { "epoch": 0.112241653418124, "grad_norm": 0.24734966456890106, "learning_rate": 4.851552758162847e-05, "loss": 0.475, "num_input_tokens_seen": 2752983618, "step": 706, "train_runtime": 28078.688, "train_tokens_per_second": 98045.308 }, { "epoch": 0.11240063593004769, "grad_norm": 0.24769307672977448, "learning_rate": 4.8511277226525404e-05, "loss": 0.454, "num_input_tokens_seen": 2756794377, "step": 707, "train_runtime": 28117.8826, "train_tokens_per_second": 98044.167 }, { "epoch": 0.11255961844197138, "grad_norm": 0.2718536853790283, "learning_rate": 4.850702098201353e-05, "loss": 0.4588, "num_input_tokens_seen": 2760607085, "step": 708, "train_runtime": 28159.3598, "train_tokens_per_second": 98035.151 }, { "epoch": 0.11271860095389508, "grad_norm": 0.2764139175415039, "learning_rate": 4.850275884915901e-05, "loss": 0.4338, "num_input_tokens_seen": 2764547413, "step": 709, "train_runtime": 28200.4725, "train_tokens_per_second": 98031.953 }, { "epoch": 0.11287758346581876, "grad_norm": 0.28609341382980347, "learning_rate": 4.849849082902948e-05, "loss": 0.4543, "num_input_tokens_seen": 2768550215, "step": 710, "train_runtime": 28237.8448, "train_tokens_per_second": 98043.963 }, { "epoch": 0.11303656597774245, "grad_norm": 0.3177265524864197, "learning_rate": 4.849421692269405e-05, "loss": 0.4568, "num_input_tokens_seen": 2772336003, "step": 711, "train_runtime": 28277.3558, "train_tokens_per_second": 98040.85 }, { "epoch": 0.11319554848966613, "grad_norm": 0.2933841049671173, "learning_rate": 4.848993713122329e-05, "loss": 0.4585, "num_input_tokens_seen": 2776163834, "step": 712, "train_runtime": 28317.2201, "train_tokens_per_second": 98038.007 }, { "epoch": 0.11335453100158982, "grad_norm": 0.2526417076587677, "learning_rate": 4.8485651455689273e-05, "loss": 0.4513, "num_input_tokens_seen": 2780183024, "step": 713, "train_runtime": 28356.7304, "train_tokens_per_second": 98043.145 }, { "epoch": 0.11351351351351352, "grad_norm": 0.24077332019805908, "learning_rate": 4.8481359897165515e-05, "loss": 0.4571, "num_input_tokens_seen": 2784021948, "step": 714, "train_runtime": 28393.8468, "train_tokens_per_second": 98050.186 }, { "epoch": 0.1136724960254372, "grad_norm": 0.26863643527030945, "learning_rate": 4.847706245672704e-05, "loss": 0.4536, "num_input_tokens_seen": 2787801675, "step": 715, "train_runtime": 28433.2164, "train_tokens_per_second": 98047.356 }, { "epoch": 0.11383147853736089, "grad_norm": 0.28191155195236206, "learning_rate": 4.847275913545032e-05, "loss": 0.4518, "num_input_tokens_seen": 2791706554, "step": 716, "train_runtime": 28473.7063, "train_tokens_per_second": 98045.071 }, { "epoch": 0.11399046104928458, "grad_norm": 0.23307165503501892, "learning_rate": 4.846844993441329e-05, "loss": 0.4414, "num_input_tokens_seen": 2795731780, "step": 717, "train_runtime": 28513.4928, "train_tokens_per_second": 98049.432 }, { "epoch": 0.11414944356120826, "grad_norm": 0.23115001618862152, "learning_rate": 4.84641348546954e-05, "loss": 0.4651, "num_input_tokens_seen": 2799648084, "step": 718, "train_runtime": 28552.2313, "train_tokens_per_second": 98053.565 }, { "epoch": 0.11430842607313195, "grad_norm": 0.2742050588130951, "learning_rate": 4.8459813897377525e-05, "loss": 0.45, "num_input_tokens_seen": 2803415530, "step": 719, "train_runtime": 28592.1269, "train_tokens_per_second": 98048.513 }, { "epoch": 0.11446740858505565, "grad_norm": 0.3097846210002899, "learning_rate": 4.845548706354205e-05, "loss": 0.4583, "num_input_tokens_seen": 2807367688, "step": 720, "train_runtime": 28631.7273, "train_tokens_per_second": 98050.937 }, { "epoch": 0.11462639109697934, "grad_norm": 0.30291056632995605, "learning_rate": 4.845115435427281e-05, "loss": 0.4644, "num_input_tokens_seen": 2811268262, "step": 721, "train_runtime": 28668.8171, "train_tokens_per_second": 98060.142 }, { "epoch": 0.11478537360890302, "grad_norm": 0.29075464606285095, "learning_rate": 4.844681577065512e-05, "loss": 0.4517, "num_input_tokens_seen": 2815187302, "step": 722, "train_runtime": 28707.993, "train_tokens_per_second": 98062.839 }, { "epoch": 0.11494435612082671, "grad_norm": 0.29658573865890503, "learning_rate": 4.844247131377576e-05, "loss": 0.4344, "num_input_tokens_seen": 2819085888, "step": 723, "train_runtime": 28746.4741, "train_tokens_per_second": 98067.188 }, { "epoch": 0.1151033386327504, "grad_norm": 0.3650790750980377, "learning_rate": 4.843812098472299e-05, "loss": 0.4492, "num_input_tokens_seen": 2823085653, "step": 724, "train_runtime": 28784.4773, "train_tokens_per_second": 98076.669 }, { "epoch": 0.11526232114467408, "grad_norm": 0.3230043649673462, "learning_rate": 4.843376478458653e-05, "loss": 0.4413, "num_input_tokens_seen": 2826957746, "step": 725, "train_runtime": 28823.7363, "train_tokens_per_second": 98077.422 }, { "epoch": 0.11542130365659778, "grad_norm": 0.6352295279502869, "learning_rate": 4.8429402714457586e-05, "loss": 0.4466, "num_input_tokens_seen": 2830889022, "step": 726, "train_runtime": 28862.0475, "train_tokens_per_second": 98083.444 }, { "epoch": 0.11558028616852146, "grad_norm": 0.23399995267391205, "learning_rate": 4.8425034775428825e-05, "loss": 0.4652, "num_input_tokens_seen": 2834869507, "step": 727, "train_runtime": 28901.623, "train_tokens_per_second": 98086.862 }, { "epoch": 0.11573926868044515, "grad_norm": 0.24995940923690796, "learning_rate": 4.842066096859438e-05, "loss": 0.4651, "num_input_tokens_seen": 2838786456, "step": 728, "train_runtime": 28942.4687, "train_tokens_per_second": 98083.771 }, { "epoch": 0.11589825119236884, "grad_norm": 0.2609986662864685, "learning_rate": 4.841628129504986e-05, "loss": 0.4556, "num_input_tokens_seen": 2842687499, "step": 729, "train_runtime": 28981.665, "train_tokens_per_second": 98085.721 }, { "epoch": 0.11605723370429252, "grad_norm": 0.29036983847618103, "learning_rate": 4.841189575589235e-05, "loss": 0.4687, "num_input_tokens_seen": 2846482969, "step": 730, "train_runtime": 29021.3705, "train_tokens_per_second": 98082.307 }, { "epoch": 0.11621621621621622, "grad_norm": 0.3130400478839874, "learning_rate": 4.840750435222039e-05, "loss": 0.4691, "num_input_tokens_seen": 2850463940, "step": 731, "train_runtime": 29060.2073, "train_tokens_per_second": 98088.218 }, { "epoch": 0.11637519872813991, "grad_norm": 0.27626821398735046, "learning_rate": 4.8403107085133984e-05, "loss": 0.4538, "num_input_tokens_seen": 2854353290, "step": 732, "train_runtime": 29100.8575, "train_tokens_per_second": 98084.852 }, { "epoch": 0.1165341812400636, "grad_norm": 0.24716950953006744, "learning_rate": 4.839870395573464e-05, "loss": 0.4545, "num_input_tokens_seen": 2858315113, "step": 733, "train_runtime": 29139.2872, "train_tokens_per_second": 98091.456 }, { "epoch": 0.11669316375198728, "grad_norm": 0.272442489862442, "learning_rate": 4.839429496512529e-05, "loss": 0.4721, "num_input_tokens_seen": 2862188959, "step": 734, "train_runtime": 29178.5905, "train_tokens_per_second": 98092.091 }, { "epoch": 0.11685214626391097, "grad_norm": 0.4372948408126831, "learning_rate": 4.838988011441036e-05, "loss": 0.4549, "num_input_tokens_seen": 2866074240, "step": 735, "train_runtime": 29218.6542, "train_tokens_per_second": 98090.563 }, { "epoch": 0.11701112877583465, "grad_norm": 0.2675490081310272, "learning_rate": 4.8385459404695755e-05, "loss": 0.45, "num_input_tokens_seen": 2869967162, "step": 736, "train_runtime": 29256.7371, "train_tokens_per_second": 98095.941 }, { "epoch": 0.11717011128775835, "grad_norm": 0.41833072900772095, "learning_rate": 4.838103283708881e-05, "loss": 0.4566, "num_input_tokens_seen": 2873869306, "step": 737, "train_runtime": 29294.5037, "train_tokens_per_second": 98102.68 }, { "epoch": 0.11732909379968204, "grad_norm": 0.3128334879875183, "learning_rate": 4.837660041269836e-05, "loss": 0.4542, "num_input_tokens_seen": 2877762562, "step": 738, "train_runtime": 29333.352, "train_tokens_per_second": 98105.479 }, { "epoch": 0.11748807631160572, "grad_norm": 0.31424376368522644, "learning_rate": 4.8372162132634694e-05, "loss": 0.4608, "num_input_tokens_seen": 2881710321, "step": 739, "train_runtime": 29374.9138, "train_tokens_per_second": 98101.065 }, { "epoch": 0.11764705882352941, "grad_norm": 0.29221782088279724, "learning_rate": 4.836771799800957e-05, "loss": 0.4736, "num_input_tokens_seen": 2885801717, "step": 740, "train_runtime": 29414.3045, "train_tokens_per_second": 98108.786 }, { "epoch": 0.1178060413354531, "grad_norm": 0.2836483418941498, "learning_rate": 4.8363268009936216e-05, "loss": 0.4574, "num_input_tokens_seen": 2889705977, "step": 741, "train_runtime": 29453.2839, "train_tokens_per_second": 98111.504 }, { "epoch": 0.11796502384737678, "grad_norm": 0.3098735213279724, "learning_rate": 4.835881216952931e-05, "loss": 0.4483, "num_input_tokens_seen": 2893542921, "step": 742, "train_runtime": 29491.5101, "train_tokens_per_second": 98114.437 }, { "epoch": 0.11812400635930048, "grad_norm": 0.295393705368042, "learning_rate": 4.8354350477905024e-05, "loss": 0.4561, "num_input_tokens_seen": 2897399226, "step": 743, "train_runtime": 29532.6988, "train_tokens_per_second": 98108.176 }, { "epoch": 0.11828298887122417, "grad_norm": 0.2885628044605255, "learning_rate": 4.834988293618097e-05, "loss": 0.456, "num_input_tokens_seen": 2901373859, "step": 744, "train_runtime": 29573.2127, "train_tokens_per_second": 98108.173 }, { "epoch": 0.11844197138314785, "grad_norm": 0.3431042432785034, "learning_rate": 4.834540954547624e-05, "loss": 0.4619, "num_input_tokens_seen": 2905173450, "step": 745, "train_runtime": 29610.9626, "train_tokens_per_second": 98111.415 }, { "epoch": 0.11860095389507154, "grad_norm": 0.26634281873703003, "learning_rate": 4.834093030691139e-05, "loss": 0.4451, "num_input_tokens_seen": 2909054305, "step": 746, "train_runtime": 29647.0843, "train_tokens_per_second": 98122.779 }, { "epoch": 0.11875993640699523, "grad_norm": 0.3283827304840088, "learning_rate": 4.833644522160843e-05, "loss": 0.4624, "num_input_tokens_seen": 2912955633, "step": 747, "train_runtime": 29687.9992, "train_tokens_per_second": 98118.961 }, { "epoch": 0.11891891891891893, "grad_norm": 0.22662454843521118, "learning_rate": 4.8331954290690837e-05, "loss": 0.4529, "num_input_tokens_seen": 2916979818, "step": 748, "train_runtime": 29727.7604, "train_tokens_per_second": 98123.094 }, { "epoch": 0.11907790143084261, "grad_norm": 0.28336402773857117, "learning_rate": 4.832745751528358e-05, "loss": 0.4445, "num_input_tokens_seen": 2920830340, "step": 749, "train_runtime": 29765.17, "train_tokens_per_second": 98129.134 }, { "epoch": 0.1192368839427663, "grad_norm": 0.29853612184524536, "learning_rate": 4.832295489651305e-05, "loss": 0.446, "num_input_tokens_seen": 2924567523, "step": 750, "train_runtime": 29803.5318, "train_tokens_per_second": 98128.22 }, { "epoch": 0.11939586645468998, "grad_norm": 0.29533177614212036, "learning_rate": 4.831844643550713e-05, "loss": 0.4526, "num_input_tokens_seen": 2928460425, "step": 751, "train_runtime": 29842.2076, "train_tokens_per_second": 98131.494 }, { "epoch": 0.11955484896661367, "grad_norm": 0.2699669897556305, "learning_rate": 4.8313932133395146e-05, "loss": 0.4466, "num_input_tokens_seen": 2932463003, "step": 752, "train_runtime": 29882.3272, "train_tokens_per_second": 98133.689 }, { "epoch": 0.11971383147853736, "grad_norm": 0.28118887543678284, "learning_rate": 4.830941199130791e-05, "loss": 0.4548, "num_input_tokens_seen": 2936388811, "step": 753, "train_runtime": 29921.2557, "train_tokens_per_second": 98137.219 }, { "epoch": 0.11987281399046106, "grad_norm": 0.29744014143943787, "learning_rate": 4.8304886010377686e-05, "loss": 0.4488, "num_input_tokens_seen": 2940273717, "step": 754, "train_runtime": 29960.8293, "train_tokens_per_second": 98137.261 }, { "epoch": 0.12003179650238474, "grad_norm": 0.32473328709602356, "learning_rate": 4.8300354191738196e-05, "loss": 0.4485, "num_input_tokens_seen": 2944178298, "step": 755, "train_runtime": 29998.7713, "train_tokens_per_second": 98143.296 }, { "epoch": 0.12019077901430843, "grad_norm": 0.24070283770561218, "learning_rate": 4.829581653652463e-05, "loss": 0.459, "num_input_tokens_seen": 2948162568, "step": 756, "train_runtime": 30037.1111, "train_tokens_per_second": 98150.67 }, { "epoch": 0.12034976152623211, "grad_norm": 0.2587828040122986, "learning_rate": 4.829127304587363e-05, "loss": 0.4468, "num_input_tokens_seen": 2952040990, "step": 757, "train_runtime": 30076.2284, "train_tokens_per_second": 98151.967 }, { "epoch": 0.1205087440381558, "grad_norm": 0.2630459666252136, "learning_rate": 4.8286723720923324e-05, "loss": 0.4554, "num_input_tokens_seen": 2955954846, "step": 758, "train_runtime": 30116.4078, "train_tokens_per_second": 98150.977 }, { "epoch": 0.12066772655007948, "grad_norm": 0.246249720454216, "learning_rate": 4.828216856281328e-05, "loss": 0.459, "num_input_tokens_seen": 2959844894, "step": 759, "train_runtime": 30154.5631, "train_tokens_per_second": 98155.788 }, { "epoch": 0.12082670906200318, "grad_norm": 0.2595362663269043, "learning_rate": 4.8277607572684525e-05, "loss": 0.455, "num_input_tokens_seen": 2963783355, "step": 760, "train_runtime": 30190.7053, "train_tokens_per_second": 98168.735 }, { "epoch": 0.12098569157392687, "grad_norm": 0.320855975151062, "learning_rate": 4.827304075167957e-05, "loss": 0.4353, "num_input_tokens_seen": 2967675923, "step": 761, "train_runtime": 30231.1419, "train_tokens_per_second": 98166.187 }, { "epoch": 0.12114467408585056, "grad_norm": 0.2812439799308777, "learning_rate": 4.826846810094235e-05, "loss": 0.4594, "num_input_tokens_seen": 2971530221, "step": 762, "train_runtime": 30271.8326, "train_tokens_per_second": 98161.557 }, { "epoch": 0.12130365659777424, "grad_norm": 0.23208270967006683, "learning_rate": 4.8263889621618304e-05, "loss": 0.4647, "num_input_tokens_seen": 2975532536, "step": 763, "train_runtime": 30310.2508, "train_tokens_per_second": 98169.182 }, { "epoch": 0.12146263910969793, "grad_norm": 0.47011247277259827, "learning_rate": 4.82593053148543e-05, "loss": 0.4466, "num_input_tokens_seen": 2979410245, "step": 764, "train_runtime": 30347.5449, "train_tokens_per_second": 98176.319 }, { "epoch": 0.12162162162162163, "grad_norm": 0.2672637403011322, "learning_rate": 4.825471518179866e-05, "loss": 0.4603, "num_input_tokens_seen": 2983262529, "step": 765, "train_runtime": 30388.2541, "train_tokens_per_second": 98171.567 }, { "epoch": 0.12178060413354531, "grad_norm": 0.23612642288208008, "learning_rate": 4.82501192236012e-05, "loss": 0.4484, "num_input_tokens_seen": 2987314953, "step": 766, "train_runtime": 30426.3106, "train_tokens_per_second": 98181.965 }, { "epoch": 0.121939586645469, "grad_norm": 0.22974880039691925, "learning_rate": 4.8245517441413166e-05, "loss": 0.4517, "num_input_tokens_seen": 2991289393, "step": 767, "train_runtime": 30465.7573, "train_tokens_per_second": 98185.296 }, { "epoch": 0.12209856915739269, "grad_norm": 0.3322402834892273, "learning_rate": 4.824090983638728e-05, "loss": 0.4727, "num_input_tokens_seen": 2995025092, "step": 768, "train_runtime": 30507.483, "train_tokens_per_second": 98173.458 }, { "epoch": 0.12225755166931637, "grad_norm": 0.24932608008384705, "learning_rate": 4.8236296409677704e-05, "loss": 0.4524, "num_input_tokens_seen": 2998911725, "step": 769, "train_runtime": 30547.4537, "train_tokens_per_second": 98172.232 }, { "epoch": 0.12241653418124006, "grad_norm": 0.2745214104652405, "learning_rate": 4.823167716244008e-05, "loss": 0.4446, "num_input_tokens_seen": 3002893431, "step": 770, "train_runtime": 30585.9854, "train_tokens_per_second": 98178.737 }, { "epoch": 0.12257551669316376, "grad_norm": 0.20324404537677765, "learning_rate": 4.8227052095831485e-05, "loss": 0.4518, "num_input_tokens_seen": 3006876728, "step": 771, "train_runtime": 30626.5477, "train_tokens_per_second": 98178.768 }, { "epoch": 0.12273449920508744, "grad_norm": 0.2397637665271759, "learning_rate": 4.8222421211010475e-05, "loss": 0.4548, "num_input_tokens_seen": 3010781567, "step": 772, "train_runtime": 30665.8496, "train_tokens_per_second": 98180.276 }, { "epoch": 0.12289348171701113, "grad_norm": 0.22462737560272217, "learning_rate": 4.821778450913704e-05, "loss": 0.4491, "num_input_tokens_seen": 3014621826, "step": 773, "train_runtime": 30707.1928, "train_tokens_per_second": 98173.149 }, { "epoch": 0.12305246422893482, "grad_norm": 0.2289862036705017, "learning_rate": 4.8213141991372653e-05, "loss": 0.4648, "num_input_tokens_seen": 3018511208, "step": 774, "train_runtime": 30745.5563, "train_tokens_per_second": 98177.154 }, { "epoch": 0.1232114467408585, "grad_norm": 0.2543877065181732, "learning_rate": 4.820849365888023e-05, "loss": 0.4675, "num_input_tokens_seen": 3022339723, "step": 775, "train_runtime": 30785.2947, "train_tokens_per_second": 98174.786 }, { "epoch": 0.12337042925278219, "grad_norm": 0.2462921142578125, "learning_rate": 4.8203839512824145e-05, "loss": 0.455, "num_input_tokens_seen": 3026243546, "step": 776, "train_runtime": 30823.07, "train_tokens_per_second": 98181.12 }, { "epoch": 0.12352941176470589, "grad_norm": 0.2460344284772873, "learning_rate": 4.819917955437023e-05, "loss": 0.4602, "num_input_tokens_seen": 3030180152, "step": 777, "train_runtime": 30863.3382, "train_tokens_per_second": 98180.57 }, { "epoch": 0.12368839427662957, "grad_norm": 0.2157267928123474, "learning_rate": 4.819451378468577e-05, "loss": 0.4553, "num_input_tokens_seen": 3033975423, "step": 778, "train_runtime": 30902.0434, "train_tokens_per_second": 98180.414 }, { "epoch": 0.12384737678855326, "grad_norm": 0.23030078411102295, "learning_rate": 4.8189842204939505e-05, "loss": 0.4454, "num_input_tokens_seen": 3037902962, "step": 779, "train_runtime": 30940.6339, "train_tokens_per_second": 98184.897 }, { "epoch": 0.12400635930047695, "grad_norm": 0.24717777967453003, "learning_rate": 4.8185164816301646e-05, "loss": 0.4639, "num_input_tokens_seen": 3041893597, "step": 780, "train_runtime": 30979.6298, "train_tokens_per_second": 98190.121 }, { "epoch": 0.12416534181240063, "grad_norm": 0.2272942215204239, "learning_rate": 4.818048161994382e-05, "loss": 0.4719, "num_input_tokens_seen": 3045698098, "step": 781, "train_runtime": 31017.4522, "train_tokens_per_second": 98193.046 }, { "epoch": 0.12432432432432433, "grad_norm": 0.273865282535553, "learning_rate": 4.817579261703916e-05, "loss": 0.4549, "num_input_tokens_seen": 3049700956, "step": 782, "train_runtime": 31060.6433, "train_tokens_per_second": 98185.376 }, { "epoch": 0.12448330683624802, "grad_norm": 0.4532919228076935, "learning_rate": 4.817109780876221e-05, "loss": 0.461, "num_input_tokens_seen": 3053522001, "step": 783, "train_runtime": 31099.2107, "train_tokens_per_second": 98186.479 }, { "epoch": 0.1246422893481717, "grad_norm": 0.21630461513996124, "learning_rate": 4.8166397196289e-05, "loss": 0.4459, "num_input_tokens_seen": 3057446274, "step": 784, "train_runtime": 31138.5476, "train_tokens_per_second": 98188.468 }, { "epoch": 0.12480127186009539, "grad_norm": 0.24952532351016998, "learning_rate": 4.816169078079699e-05, "loss": 0.4498, "num_input_tokens_seen": 3061216280, "step": 785, "train_runtime": 31177.4781, "train_tokens_per_second": 98186.783 }, { "epoch": 0.12496025437201908, "grad_norm": 0.23642824590206146, "learning_rate": 4.8156978563465114e-05, "loss": 0.4506, "num_input_tokens_seen": 3065145366, "step": 786, "train_runtime": 31215.6645, "train_tokens_per_second": 98192.539 }, { "epoch": 0.12511923688394277, "grad_norm": 0.24351757764816284, "learning_rate": 4.8152260545473735e-05, "loss": 0.4513, "num_input_tokens_seen": 3069015349, "step": 787, "train_runtime": 31255.3051, "train_tokens_per_second": 98191.822 }, { "epoch": 0.12527821939586645, "grad_norm": 0.2351301610469818, "learning_rate": 4.81475367280047e-05, "loss": 0.4513, "num_input_tokens_seen": 3072883280, "step": 788, "train_runtime": 31295.6681, "train_tokens_per_second": 98188.774 }, { "epoch": 0.12543720190779015, "grad_norm": 0.2073129415512085, "learning_rate": 4.814280711224128e-05, "loss": 0.4502, "num_input_tokens_seen": 3076849524, "step": 789, "train_runtime": 31334.2986, "train_tokens_per_second": 98194.3 }, { "epoch": 0.12559618441971382, "grad_norm": 0.212493434548378, "learning_rate": 4.8138071699368216e-05, "loss": 0.4674, "num_input_tokens_seen": 3080782054, "step": 790, "train_runtime": 31371.1881, "train_tokens_per_second": 98204.188 }, { "epoch": 0.12575516693163752, "grad_norm": 0.25112253427505493, "learning_rate": 4.8133330490571685e-05, "loss": 0.4467, "num_input_tokens_seen": 3084682812, "step": 791, "train_runtime": 31407.1259, "train_tokens_per_second": 98216.017 }, { "epoch": 0.12591414944356122, "grad_norm": 0.21558775007724762, "learning_rate": 4.812858348703934e-05, "loss": 0.4592, "num_input_tokens_seen": 3088386176, "step": 792, "train_runtime": 31446.0962, "train_tokens_per_second": 98212.069 }, { "epoch": 0.1260731319554849, "grad_norm": 0.23533464968204498, "learning_rate": 4.812383068996027e-05, "loss": 0.4557, "num_input_tokens_seen": 3092313027, "step": 793, "train_runtime": 31488.3817, "train_tokens_per_second": 98204.889 }, { "epoch": 0.1262321144674086, "grad_norm": 0.19977477192878723, "learning_rate": 4.811907210052501e-05, "loss": 0.4455, "num_input_tokens_seen": 3096095566, "step": 794, "train_runtime": 31526.705, "train_tokens_per_second": 98205.492 }, { "epoch": 0.12639109697933226, "grad_norm": 0.23298704624176025, "learning_rate": 4.811430771992555e-05, "loss": 0.4584, "num_input_tokens_seen": 3100030961, "step": 795, "train_runtime": 31565.1861, "train_tokens_per_second": 98210.445 }, { "epoch": 0.12655007949125596, "grad_norm": 0.21810631453990936, "learning_rate": 4.8109537549355346e-05, "loss": 0.4537, "num_input_tokens_seen": 3103836126, "step": 796, "train_runtime": 31604.6528, "train_tokens_per_second": 98208.202 }, { "epoch": 0.12670906200317966, "grad_norm": 0.2218260020017624, "learning_rate": 4.8104761590009284e-05, "loss": 0.4624, "num_input_tokens_seen": 3107823836, "step": 797, "train_runtime": 31644.6875, "train_tokens_per_second": 98209.971 }, { "epoch": 0.12686804451510333, "grad_norm": 0.24671585857868195, "learning_rate": 4.8099979843083706e-05, "loss": 0.4606, "num_input_tokens_seen": 3111744122, "step": 798, "train_runtime": 31684.6615, "train_tokens_per_second": 98209.796 }, { "epoch": 0.12702702702702703, "grad_norm": 0.2379617542028427, "learning_rate": 4.809519230977642e-05, "loss": 0.4382, "num_input_tokens_seen": 3115513679, "step": 799, "train_runtime": 31724.0947, "train_tokens_per_second": 98206.543 }, { "epoch": 0.1271860095389507, "grad_norm": 0.1957179456949234, "learning_rate": 4.8090398991286654e-05, "loss": 0.4579, "num_input_tokens_seen": 3119419046, "step": 800, "train_runtime": 31761.7878, "train_tokens_per_second": 98212.955 }, { "epoch": 0.1273449920508744, "grad_norm": 0.2207964062690735, "learning_rate": 4.808559988881511e-05, "loss": 0.4584, "num_input_tokens_seen": 3123363576, "step": 801, "train_runtime": 31900.8627, "train_tokens_per_second": 97908.436 }, { "epoch": 0.1275039745627981, "grad_norm": 0.3198299705982208, "learning_rate": 4.808079500356392e-05, "loss": 0.4389, "num_input_tokens_seen": 3127204881, "step": 802, "train_runtime": 31941.388, "train_tokens_per_second": 97904.477 }, { "epoch": 0.12766295707472178, "grad_norm": 0.25184008479118347, "learning_rate": 4.807598433673668e-05, "loss": 0.4575, "num_input_tokens_seen": 3131214861, "step": 803, "train_runtime": 31979.6763, "train_tokens_per_second": 97912.65 }, { "epoch": 0.12782193958664548, "grad_norm": 0.2524524927139282, "learning_rate": 4.807116788953843e-05, "loss": 0.4554, "num_input_tokens_seen": 3135107409, "step": 804, "train_runtime": 32018.4857, "train_tokens_per_second": 97915.543 }, { "epoch": 0.12798092209856915, "grad_norm": 0.22992782294750214, "learning_rate": 4.8066345663175644e-05, "loss": 0.45, "num_input_tokens_seen": 3138989830, "step": 805, "train_runtime": 32057.813, "train_tokens_per_second": 97916.531 }, { "epoch": 0.12813990461049285, "grad_norm": 0.21909524500370026, "learning_rate": 4.806151765885627e-05, "loss": 0.4547, "num_input_tokens_seen": 3142857077, "step": 806, "train_runtime": 32096.3973, "train_tokens_per_second": 97919.31 }, { "epoch": 0.12829888712241652, "grad_norm": 0.24215072393417358, "learning_rate": 4.805668387778968e-05, "loss": 0.4582, "num_input_tokens_seen": 3146847322, "step": 807, "train_runtime": 32135.1062, "train_tokens_per_second": 97925.53 }, { "epoch": 0.12845786963434022, "grad_norm": 0.2415633499622345, "learning_rate": 4.8051844321186715e-05, "loss": 0.4501, "num_input_tokens_seen": 3150713690, "step": 808, "train_runtime": 32176.5267, "train_tokens_per_second": 97919.633 }, { "epoch": 0.12861685214626392, "grad_norm": 0.2299763411283493, "learning_rate": 4.804699899025963e-05, "loss": 0.4436, "num_input_tokens_seen": 3154564467, "step": 809, "train_runtime": 32216.0313, "train_tokens_per_second": 97919.09 }, { "epoch": 0.1287758346581876, "grad_norm": 0.2605205774307251, "learning_rate": 4.8042147886222154e-05, "loss": 0.4577, "num_input_tokens_seen": 3158444780, "step": 810, "train_runtime": 32256.3664, "train_tokens_per_second": 97916.943 }, { "epoch": 0.1289348171701113, "grad_norm": 0.2213960886001587, "learning_rate": 4.803729101028946e-05, "loss": 0.4463, "num_input_tokens_seen": 3162396362, "step": 811, "train_runtime": 32294.8989, "train_tokens_per_second": 97922.473 }, { "epoch": 0.12909379968203497, "grad_norm": 0.2628748118877411, "learning_rate": 4.8032428363678164e-05, "loss": 0.4439, "num_input_tokens_seen": 3166188387, "step": 812, "train_runtime": 32335.7363, "train_tokens_per_second": 97916.075 }, { "epoch": 0.12925278219395867, "grad_norm": 0.2551816701889038, "learning_rate": 4.8027559947606314e-05, "loss": 0.4454, "num_input_tokens_seen": 3170235675, "step": 813, "train_runtime": 32374.8462, "train_tokens_per_second": 97922.803 }, { "epoch": 0.12941176470588237, "grad_norm": 0.31526097655296326, "learning_rate": 4.8022685763293416e-05, "loss": 0.4504, "num_input_tokens_seen": 3174136931, "step": 814, "train_runtime": 32414.9799, "train_tokens_per_second": 97921.916 }, { "epoch": 0.12957074721780604, "grad_norm": 0.2260509729385376, "learning_rate": 4.801780581196042e-05, "loss": 0.4524, "num_input_tokens_seen": 3178007922, "step": 815, "train_runtime": 32454.6851, "train_tokens_per_second": 97921.392 }, { "epoch": 0.12972972972972974, "grad_norm": 0.21692979335784912, "learning_rate": 4.801292009482972e-05, "loss": 0.4541, "num_input_tokens_seen": 3181886263, "step": 816, "train_runtime": 32492.5302, "train_tokens_per_second": 97926.7 }, { "epoch": 0.1298887122416534, "grad_norm": 0.2861810326576233, "learning_rate": 4.800802861312515e-05, "loss": 0.4473, "num_input_tokens_seen": 3185722300, "step": 817, "train_runtime": 32530.4934, "train_tokens_per_second": 97930.341 }, { "epoch": 0.1300476947535771, "grad_norm": 0.24579153954982758, "learning_rate": 4.800313136807201e-05, "loss": 0.4613, "num_input_tokens_seen": 3189641349, "step": 818, "train_runtime": 32568.9005, "train_tokens_per_second": 97935.187 }, { "epoch": 0.1302066772655008, "grad_norm": 0.269347220659256, "learning_rate": 4.7998228360897e-05, "loss": 0.4438, "num_input_tokens_seen": 3193487401, "step": 819, "train_runtime": 32608.5369, "train_tokens_per_second": 97934.09 }, { "epoch": 0.13036565977742448, "grad_norm": 0.2649823725223541, "learning_rate": 4.79933195928283e-05, "loss": 0.4545, "num_input_tokens_seen": 3197394234, "step": 820, "train_runtime": 32648.2797, "train_tokens_per_second": 97934.539 }, { "epoch": 0.13052464228934818, "grad_norm": 0.2577219307422638, "learning_rate": 4.798840506509552e-05, "loss": 0.4444, "num_input_tokens_seen": 3201486874, "step": 821, "train_runtime": 32691.2298, "train_tokens_per_second": 97931.063 }, { "epoch": 0.13068362480127185, "grad_norm": 0.24424487352371216, "learning_rate": 4.798348477892972e-05, "loss": 0.4508, "num_input_tokens_seen": 3205372012, "step": 822, "train_runtime": 32731.6424, "train_tokens_per_second": 97928.847 }, { "epoch": 0.13084260731319555, "grad_norm": 0.24412934482097626, "learning_rate": 4.7978558735563384e-05, "loss": 0.4495, "num_input_tokens_seen": 3209285235, "step": 823, "train_runtime": 32769.9554, "train_tokens_per_second": 97933.769 }, { "epoch": 0.13100158982511922, "grad_norm": 0.24648435413837433, "learning_rate": 4.7973626936230465e-05, "loss": 0.4512, "num_input_tokens_seen": 3213263898, "step": 824, "train_runtime": 32807.2482, "train_tokens_per_second": 97943.719 }, { "epoch": 0.13116057233704292, "grad_norm": 0.23703816533088684, "learning_rate": 4.7968689382166335e-05, "loss": 0.4404, "num_input_tokens_seen": 3217122740, "step": 825, "train_runtime": 32845.2557, "train_tokens_per_second": 97947.867 }, { "epoch": 0.13131955484896662, "grad_norm": 0.2516811192035675, "learning_rate": 4.796374607460782e-05, "loss": 0.4487, "num_input_tokens_seen": 3221053393, "step": 826, "train_runtime": 32887.8367, "train_tokens_per_second": 97940.568 }, { "epoch": 0.1314785373608903, "grad_norm": 0.24634519219398499, "learning_rate": 4.795879701479319e-05, "loss": 0.4516, "num_input_tokens_seen": 3224985195, "step": 827, "train_runtime": 32927.2229, "train_tokens_per_second": 97942.824 }, { "epoch": 0.131637519872814, "grad_norm": 0.2363511174917221, "learning_rate": 4.795384220396214e-05, "loss": 0.4653, "num_input_tokens_seen": 3228932824, "step": 828, "train_runtime": 32965.9812, "train_tokens_per_second": 97947.421 }, { "epoch": 0.13179650238473767, "grad_norm": 0.24973104894161224, "learning_rate": 4.794888164335582e-05, "loss": 0.4556, "num_input_tokens_seen": 3232615944, "step": 829, "train_runtime": 33006.4181, "train_tokens_per_second": 97939.011 }, { "epoch": 0.13195548489666137, "grad_norm": 0.2464277148246765, "learning_rate": 4.794391533421681e-05, "loss": 0.437, "num_input_tokens_seen": 3236614329, "step": 830, "train_runtime": 33046.1432, "train_tokens_per_second": 97942.271 }, { "epoch": 0.13211446740858507, "grad_norm": 0.2405272275209427, "learning_rate": 4.793894327778913e-05, "loss": 0.4471, "num_input_tokens_seen": 3240555225, "step": 831, "train_runtime": 33087.7047, "train_tokens_per_second": 97938.351 }, { "epoch": 0.13227344992050874, "grad_norm": 0.2536347806453705, "learning_rate": 4.793396547531827e-05, "loss": 0.4401, "num_input_tokens_seen": 3244443672, "step": 832, "train_runtime": 33127.4827, "train_tokens_per_second": 97938.129 }, { "epoch": 0.13243243243243244, "grad_norm": 0.23925043642520905, "learning_rate": 4.7928981928051096e-05, "loss": 0.4559, "num_input_tokens_seen": 3248337658, "step": 833, "train_runtime": 33167.6442, "train_tokens_per_second": 97936.942 }, { "epoch": 0.1325914149443561, "grad_norm": 0.25752684473991394, "learning_rate": 4.7923992637235994e-05, "loss": 0.4525, "num_input_tokens_seen": 3252139381, "step": 834, "train_runtime": 33207.6939, "train_tokens_per_second": 97933.31 }, { "epoch": 0.1327503974562798, "grad_norm": 0.20816366374492645, "learning_rate": 4.791899760412272e-05, "loss": 0.4573, "num_input_tokens_seen": 3256055902, "step": 835, "train_runtime": 33247.8571, "train_tokens_per_second": 97932.805 }, { "epoch": 0.13290937996820348, "grad_norm": 0.22570523619651794, "learning_rate": 4.7913996829962494e-05, "loss": 0.4625, "num_input_tokens_seen": 3259906553, "step": 836, "train_runtime": 33289.1255, "train_tokens_per_second": 97927.071 }, { "epoch": 0.13306836248012718, "grad_norm": 0.2356865555047989, "learning_rate": 4.7908990316007987e-05, "loss": 0.4542, "num_input_tokens_seen": 3263853898, "step": 837, "train_runtime": 33329.4645, "train_tokens_per_second": 97926.983 }, { "epoch": 0.13322734499205088, "grad_norm": 0.2255311906337738, "learning_rate": 4.790397806351328e-05, "loss": 0.4575, "num_input_tokens_seen": 3267829452, "step": 838, "train_runtime": 33367.9683, "train_tokens_per_second": 97933.126 }, { "epoch": 0.13338632750397456, "grad_norm": 0.21762102842330933, "learning_rate": 4.789896007373392e-05, "loss": 0.4492, "num_input_tokens_seen": 3271586265, "step": 839, "train_runtime": 33407.3851, "train_tokens_per_second": 97930.031 }, { "epoch": 0.13354531001589826, "grad_norm": 0.22119437158107758, "learning_rate": 4.7893936347926885e-05, "loss": 0.4654, "num_input_tokens_seen": 3275538428, "step": 840, "train_runtime": 33445.6626, "train_tokens_per_second": 97936.12 }, { "epoch": 0.13370429252782193, "grad_norm": 0.24375805258750916, "learning_rate": 4.788890688735056e-05, "loss": 0.4579, "num_input_tokens_seen": 3279440899, "step": 841, "train_runtime": 33483.0453, "train_tokens_per_second": 97943.328 }, { "epoch": 0.13386327503974563, "grad_norm": 0.21506814658641815, "learning_rate": 4.78838716932648e-05, "loss": 0.4548, "num_input_tokens_seen": 3283366092, "step": 842, "train_runtime": 33519.1264, "train_tokens_per_second": 97955.002 }, { "epoch": 0.13402225755166933, "grad_norm": 0.2376105934381485, "learning_rate": 4.7878830766930886e-05, "loss": 0.4636, "num_input_tokens_seen": 3287210181, "step": 843, "train_runtime": 33557.245, "train_tokens_per_second": 97958.285 }, { "epoch": 0.134181240063593, "grad_norm": 0.20666548609733582, "learning_rate": 4.7873784109611544e-05, "loss": 0.457, "num_input_tokens_seen": 3291198185, "step": 844, "train_runtime": 33596.8402, "train_tokens_per_second": 97961.539 }, { "epoch": 0.1343402225755167, "grad_norm": 0.21363934874534607, "learning_rate": 4.7868731722570905e-05, "loss": 0.4487, "num_input_tokens_seen": 3295207993, "step": 845, "train_runtime": 33637.9099, "train_tokens_per_second": 97961.14 }, { "epoch": 0.13449920508744037, "grad_norm": 0.20201966166496277, "learning_rate": 4.786367360707458e-05, "loss": 0.4496, "num_input_tokens_seen": 3299119149, "step": 846, "train_runtime": 33675.6835, "train_tokens_per_second": 97967.4 }, { "epoch": 0.13465818759936407, "grad_norm": 0.21201083064079285, "learning_rate": 4.785860976438957e-05, "loss": 0.4426, "num_input_tokens_seen": 3303014016, "step": 847, "train_runtime": 33715.3375, "train_tokens_per_second": 97967.698 }, { "epoch": 0.13481717011128777, "grad_norm": 0.23176220059394836, "learning_rate": 4.785354019578434e-05, "loss": 0.4514, "num_input_tokens_seen": 3306786211, "step": 848, "train_runtime": 33755.1973, "train_tokens_per_second": 97963.765 }, { "epoch": 0.13497615262321144, "grad_norm": 0.27123263478279114, "learning_rate": 4.784846490252879e-05, "loss": 0.4511, "num_input_tokens_seen": 3310707207, "step": 849, "train_runtime": 33794.3338, "train_tokens_per_second": 97966.34 }, { "epoch": 0.13513513513513514, "grad_norm": 0.22756065428256989, "learning_rate": 4.7843383885894226e-05, "loss": 0.4509, "num_input_tokens_seen": 3314618277, "step": 850, "train_runtime": 33833.9463, "train_tokens_per_second": 97967.238 }, { "epoch": 0.13529411764705881, "grad_norm": 0.3847958743572235, "learning_rate": 4.7838297147153425e-05, "loss": 0.4435, "num_input_tokens_seen": 3318557156, "step": 851, "train_runtime": 33874.3657, "train_tokens_per_second": 97966.621 }, { "epoch": 0.13545310015898251, "grad_norm": 0.23466230928897858, "learning_rate": 4.783320468758057e-05, "loss": 0.4493, "num_input_tokens_seen": 3322409365, "step": 852, "train_runtime": 33912.2156, "train_tokens_per_second": 97970.873 }, { "epoch": 0.1356120826709062, "grad_norm": 0.2673245966434479, "learning_rate": 4.78281065084513e-05, "loss": 0.4605, "num_input_tokens_seen": 3326394046, "step": 853, "train_runtime": 33951.7825, "train_tokens_per_second": 97974.062 }, { "epoch": 0.1357710651828299, "grad_norm": 0.26088693737983704, "learning_rate": 4.782300261104265e-05, "loss": 0.4455, "num_input_tokens_seen": 3330154271, "step": 854, "train_runtime": 33991.0606, "train_tokens_per_second": 97971.473 }, { "epoch": 0.1359300476947536, "grad_norm": 0.23145413398742676, "learning_rate": 4.781789299663312e-05, "loss": 0.45, "num_input_tokens_seen": 3334148862, "step": 855, "train_runtime": 34028.0256, "train_tokens_per_second": 97982.437 }, { "epoch": 0.13608903020667726, "grad_norm": 0.22744889557361603, "learning_rate": 4.7812777666502634e-05, "loss": 0.4525, "num_input_tokens_seen": 3337991413, "step": 856, "train_runtime": 34067.5179, "train_tokens_per_second": 97981.644 }, { "epoch": 0.13624801271860096, "grad_norm": 0.20557665824890137, "learning_rate": 4.780765662193255e-05, "loss": 0.4469, "num_input_tokens_seen": 3341919615, "step": 857, "train_runtime": 34106.2727, "train_tokens_per_second": 97985.483 }, { "epoch": 0.13640699523052463, "grad_norm": 0.22426509857177734, "learning_rate": 4.780252986420565e-05, "loss": 0.4431, "num_input_tokens_seen": 3345810635, "step": 858, "train_runtime": 34144.926, "train_tokens_per_second": 97988.516 }, { "epoch": 0.13656597774244833, "grad_norm": 0.22242018580436707, "learning_rate": 4.7797397394606144e-05, "loss": 0.4374, "num_input_tokens_seen": 3349688550, "step": 859, "train_runtime": 34181.9828, "train_tokens_per_second": 97995.736 }, { "epoch": 0.13672496025437203, "grad_norm": 0.2275928407907486, "learning_rate": 4.779225921441969e-05, "loss": 0.4326, "num_input_tokens_seen": 3353577868, "step": 860, "train_runtime": 34224.5272, "train_tokens_per_second": 97987.559 }, { "epoch": 0.1368839427662957, "grad_norm": 0.24131892621517181, "learning_rate": 4.7787115324933354e-05, "loss": 0.4488, "num_input_tokens_seen": 3357463900, "step": 861, "train_runtime": 34264.0547, "train_tokens_per_second": 97987.933 }, { "epoch": 0.1370429252782194, "grad_norm": 0.26474910974502563, "learning_rate": 4.778196572743566e-05, "loss": 0.4479, "num_input_tokens_seen": 3361389774, "step": 862, "train_runtime": 34301.7764, "train_tokens_per_second": 97994.627 }, { "epoch": 0.13720190779014307, "grad_norm": 0.24508069455623627, "learning_rate": 4.7776810423216536e-05, "loss": 0.4394, "num_input_tokens_seen": 3365252354, "step": 863, "train_runtime": 34342.3584, "train_tokens_per_second": 97991.3 }, { "epoch": 0.13736089030206677, "grad_norm": 0.19742880761623383, "learning_rate": 4.777164941356734e-05, "loss": 0.4537, "num_input_tokens_seen": 3369022212, "step": 864, "train_runtime": 34381.6323, "train_tokens_per_second": 97989.013 }, { "epoch": 0.13751987281399047, "grad_norm": 0.23898883163928986, "learning_rate": 4.7766482699780886e-05, "loss": 0.4467, "num_input_tokens_seen": 3373088668, "step": 865, "train_runtime": 34419.4132, "train_tokens_per_second": 97999.598 }, { "epoch": 0.13767885532591415, "grad_norm": 0.2601195275783539, "learning_rate": 4.776131028315139e-05, "loss": 0.4448, "num_input_tokens_seen": 3377056815, "step": 866, "train_runtime": 34457.8122, "train_tokens_per_second": 98005.549 }, { "epoch": 0.13783783783783785, "grad_norm": 0.2362729161977768, "learning_rate": 4.775613216497451e-05, "loss": 0.4373, "num_input_tokens_seen": 3380835820, "step": 867, "train_runtime": 34496.5006, "train_tokens_per_second": 98005.182 }, { "epoch": 0.13799682034976152, "grad_norm": 0.2486230731010437, "learning_rate": 4.7750948346547317e-05, "loss": 0.4472, "num_input_tokens_seen": 3384843766, "step": 868, "train_runtime": 34534.6392, "train_tokens_per_second": 98013.005 }, { "epoch": 0.13815580286168522, "grad_norm": 0.2793114483356476, "learning_rate": 4.774575882916834e-05, "loss": 0.4552, "num_input_tokens_seen": 3388619030, "step": 869, "train_runtime": 34575.0283, "train_tokens_per_second": 98007.701 }, { "epoch": 0.1383147853736089, "grad_norm": 0.24226117134094238, "learning_rate": 4.77405636141375e-05, "loss": 0.462, "num_input_tokens_seen": 3392450615, "step": 870, "train_runtime": 34613.572, "train_tokens_per_second": 98009.261 }, { "epoch": 0.1384737678855326, "grad_norm": 1.9604781866073608, "learning_rate": 4.773536270275617e-05, "loss": 0.4322, "num_input_tokens_seen": 3396380345, "step": 871, "train_runtime": 34652.9159, "train_tokens_per_second": 98011.387 }, { "epoch": 0.1386327503974563, "grad_norm": 0.3208828568458557, "learning_rate": 4.773015609632714e-05, "loss": 0.4548, "num_input_tokens_seen": 3400253057, "step": 872, "train_runtime": 34693.6196, "train_tokens_per_second": 98008.023 }, { "epoch": 0.13879173290937996, "grad_norm": 0.32237452268600464, "learning_rate": 4.772494379615462e-05, "loss": 0.4373, "num_input_tokens_seen": 3404165938, "step": 873, "train_runtime": 34730.2863, "train_tokens_per_second": 98017.215 }, { "epoch": 0.13895071542130366, "grad_norm": 0.3868151605129242, "learning_rate": 4.771972580354427e-05, "loss": 0.4505, "num_input_tokens_seen": 3408168234, "step": 874, "train_runtime": 34771.1301, "train_tokens_per_second": 98017.183 }, { "epoch": 0.13910969793322733, "grad_norm": 0.28758057951927185, "learning_rate": 4.771450211980315e-05, "loss": 0.439, "num_input_tokens_seen": 3412157604, "step": 875, "train_runtime": 34809.6706, "train_tokens_per_second": 98023.266 }, { "epoch": 0.13926868044515103, "grad_norm": 0.21110928058624268, "learning_rate": 4.770927274623975e-05, "loss": 0.4442, "num_input_tokens_seen": 3416169047, "step": 876, "train_runtime": 34846.2217, "train_tokens_per_second": 98035.566 }, { "epoch": 0.13942766295707473, "grad_norm": 0.2717263102531433, "learning_rate": 4.770403768416401e-05, "loss": 0.4557, "num_input_tokens_seen": 3419947303, "step": 877, "train_runtime": 34884.6379, "train_tokens_per_second": 98035.912 }, { "epoch": 0.1395866454689984, "grad_norm": 0.28478193283081055, "learning_rate": 4.769879693488726e-05, "loss": 0.4507, "num_input_tokens_seen": 3423744404, "step": 878, "train_runtime": 34922.0705, "train_tokens_per_second": 98039.559 }, { "epoch": 0.1397456279809221, "grad_norm": 0.24720077216625214, "learning_rate": 4.7693550499722276e-05, "loss": 0.4468, "num_input_tokens_seen": 3427740549, "step": 879, "train_runtime": 34962.0055, "train_tokens_per_second": 98041.874 }, { "epoch": 0.13990461049284578, "grad_norm": 0.24823696911334991, "learning_rate": 4.768829837998325e-05, "loss": 0.4332, "num_input_tokens_seen": 3431716083, "step": 880, "train_runtime": 35001.223, "train_tokens_per_second": 98045.605 }, { "epoch": 0.14006359300476948, "grad_norm": 0.24375024437904358, "learning_rate": 4.7683040576985806e-05, "loss": 0.4493, "num_input_tokens_seen": 3435663027, "step": 881, "train_runtime": 35040.4668, "train_tokens_per_second": 98048.438 }, { "epoch": 0.14022257551669318, "grad_norm": 0.23259805142879486, "learning_rate": 4.7677777092046986e-05, "loss": 0.4378, "num_input_tokens_seen": 3439435525, "step": 882, "train_runtime": 35080.8943, "train_tokens_per_second": 98042.983 }, { "epoch": 0.14038155802861685, "grad_norm": 0.24587976932525635, "learning_rate": 4.767250792648525e-05, "loss": 0.4671, "num_input_tokens_seen": 3443517042, "step": 883, "train_runtime": 35120.2151, "train_tokens_per_second": 98049.429 }, { "epoch": 0.14054054054054055, "grad_norm": 0.21248772740364075, "learning_rate": 4.766723308162049e-05, "loss": 0.4423, "num_input_tokens_seen": 3447335556, "step": 884, "train_runtime": 35159.0605, "train_tokens_per_second": 98049.706 }, { "epoch": 0.14069952305246422, "grad_norm": 0.22836928069591522, "learning_rate": 4.766195255877402e-05, "loss": 0.4398, "num_input_tokens_seen": 3451198194, "step": 885, "train_runtime": 35199.8525, "train_tokens_per_second": 98045.814 }, { "epoch": 0.14085850556438792, "grad_norm": 0.24216468632221222, "learning_rate": 4.765666635926857e-05, "loss": 0.4457, "num_input_tokens_seen": 3455070737, "step": 886, "train_runtime": 35241.2627, "train_tokens_per_second": 98040.492 }, { "epoch": 0.1410174880763116, "grad_norm": 0.2252722680568695, "learning_rate": 4.76513744844283e-05, "loss": 0.4585, "num_input_tokens_seen": 3459088210, "step": 887, "train_runtime": 35280.0462, "train_tokens_per_second": 98046.59 }, { "epoch": 0.1411764705882353, "grad_norm": 0.23440077900886536, "learning_rate": 4.7646076935578775e-05, "loss": 0.4571, "num_input_tokens_seen": 3462950492, "step": 888, "train_runtime": 35319.2701, "train_tokens_per_second": 98047.057 }, { "epoch": 0.141335453100159, "grad_norm": 0.25588634610176086, "learning_rate": 4.764077371404702e-05, "loss": 0.447, "num_input_tokens_seen": 3466932552, "step": 889, "train_runtime": 35358.9029, "train_tokens_per_second": 98049.777 }, { "epoch": 0.14149443561208266, "grad_norm": 0.2212911695241928, "learning_rate": 4.763546482116142e-05, "loss": 0.4547, "num_input_tokens_seen": 3470858601, "step": 890, "train_runtime": 35400.2796, "train_tokens_per_second": 98046.079 }, { "epoch": 0.14165341812400636, "grad_norm": 0.219281405210495, "learning_rate": 4.7630150258251835e-05, "loss": 0.4347, "num_input_tokens_seen": 3474796020, "step": 891, "train_runtime": 35438.1123, "train_tokens_per_second": 98052.514 }, { "epoch": 0.14181240063593004, "grad_norm": 0.23433740437030792, "learning_rate": 4.762483002664953e-05, "loss": 0.4366, "num_input_tokens_seen": 3478674009, "step": 892, "train_runtime": 35476.6757, "train_tokens_per_second": 98055.242 }, { "epoch": 0.14197138314785374, "grad_norm": 0.20943057537078857, "learning_rate": 4.761950412768718e-05, "loss": 0.4453, "num_input_tokens_seen": 3482614567, "step": 893, "train_runtime": 35516.022, "train_tokens_per_second": 98057.563 }, { "epoch": 0.14213036565977744, "grad_norm": 0.25481078028678894, "learning_rate": 4.761417256269887e-05, "loss": 0.4492, "num_input_tokens_seen": 3486604240, "step": 894, "train_runtime": 35555.6903, "train_tokens_per_second": 98060.373 }, { "epoch": 0.1422893481717011, "grad_norm": 0.2616185247898102, "learning_rate": 4.7608835333020144e-05, "loss": 0.447, "num_input_tokens_seen": 3490461769, "step": 895, "train_runtime": 35594.2087, "train_tokens_per_second": 98062.631 }, { "epoch": 0.1424483306836248, "grad_norm": 0.23441636562347412, "learning_rate": 4.760349243998793e-05, "loss": 0.4401, "num_input_tokens_seen": 3494287137, "step": 896, "train_runtime": 35634.4233, "train_tokens_per_second": 98059.315 }, { "epoch": 0.14260731319554848, "grad_norm": 0.3889283239841461, "learning_rate": 4.759814388494058e-05, "loss": 0.4512, "num_input_tokens_seen": 3498250158, "step": 897, "train_runtime": 35674.7588, "train_tokens_per_second": 98059.532 }, { "epoch": 0.14276629570747218, "grad_norm": 0.2663809061050415, "learning_rate": 4.7592789669217875e-05, "loss": 0.439, "num_input_tokens_seen": 3502089659, "step": 898, "train_runtime": 35713.9206, "train_tokens_per_second": 98059.513 }, { "epoch": 0.14292527821939588, "grad_norm": 0.2600483298301697, "learning_rate": 4.7587429794161e-05, "loss": 0.4488, "num_input_tokens_seen": 3506074735, "step": 899, "train_runtime": 35753.4675, "train_tokens_per_second": 98062.509 }, { "epoch": 0.14308426073131955, "grad_norm": 0.24330073595046997, "learning_rate": 4.758206426111258e-05, "loss": 0.4431, "num_input_tokens_seen": 3509882241, "step": 900, "train_runtime": 35792.2467, "train_tokens_per_second": 98062.641 }, { "epoch": 0.14324324324324325, "grad_norm": 0.24136102199554443, "learning_rate": 4.7576693071416637e-05, "loss": 0.4451, "num_input_tokens_seen": 3513882989, "step": 901, "train_runtime": 35831.0117, "train_tokens_per_second": 98068.205 }, { "epoch": 0.14340222575516692, "grad_norm": 0.23407000303268433, "learning_rate": 4.75713162264186e-05, "loss": 0.4487, "num_input_tokens_seen": 3517700960, "step": 902, "train_runtime": 35870.7895, "train_tokens_per_second": 98065.892 }, { "epoch": 0.14356120826709062, "grad_norm": 0.24189350008964539, "learning_rate": 4.7565933727465365e-05, "loss": 0.4368, "num_input_tokens_seen": 3521564682, "step": 903, "train_runtime": 35908.9076, "train_tokens_per_second": 98069.39 }, { "epoch": 0.1437201907790143, "grad_norm": 0.2652661204338074, "learning_rate": 4.756054557590518e-05, "loss": 0.449, "num_input_tokens_seen": 3525423219, "step": 904, "train_runtime": 35948.1951, "train_tokens_per_second": 98069.547 }, { "epoch": 0.143879173290938, "grad_norm": 0.21748337149620056, "learning_rate": 4.7555151773087744e-05, "loss": 0.4555, "num_input_tokens_seen": 3529356767, "step": 905, "train_runtime": 35989.0786, "train_tokens_per_second": 98067.439 }, { "epoch": 0.1440381558028617, "grad_norm": 0.258842408657074, "learning_rate": 4.754975232036418e-05, "loss": 0.4631, "num_input_tokens_seen": 3533299066, "step": 906, "train_runtime": 36030.1295, "train_tokens_per_second": 98065.123 }, { "epoch": 0.14419713831478537, "grad_norm": 0.2607496976852417, "learning_rate": 4.7544347219087e-05, "loss": 0.4395, "num_input_tokens_seen": 3537091419, "step": 907, "train_runtime": 36068.5377, "train_tokens_per_second": 98065.839 }, { "epoch": 0.14435612082670907, "grad_norm": 0.25779303908348083, "learning_rate": 4.753893647061015e-05, "loss": 0.4547, "num_input_tokens_seen": 3540981686, "step": 908, "train_runtime": 36107.3376, "train_tokens_per_second": 98068.202 }, { "epoch": 0.14451510333863274, "grad_norm": 0.2650364637374878, "learning_rate": 4.753352007628899e-05, "loss": 0.4435, "num_input_tokens_seen": 3544911809, "step": 909, "train_runtime": 36146.7801, "train_tokens_per_second": 98069.919 }, { "epoch": 0.14467408585055644, "grad_norm": 0.21114115417003632, "learning_rate": 4.7528098037480264e-05, "loss": 0.4328, "num_input_tokens_seen": 3548823980, "step": 910, "train_runtime": 36185.1341, "train_tokens_per_second": 98074.087 }, { "epoch": 0.14483306836248014, "grad_norm": 0.27069032192230225, "learning_rate": 4.752267035554218e-05, "loss": 0.4445, "num_input_tokens_seen": 3552712083, "step": 911, "train_runtime": 36224.4258, "train_tokens_per_second": 98075.042 }, { "epoch": 0.1449920508744038, "grad_norm": 0.25628721714019775, "learning_rate": 4.7517237031834325e-05, "loss": 0.4591, "num_input_tokens_seen": 3556601110, "step": 912, "train_runtime": 36264.3646, "train_tokens_per_second": 98074.271 }, { "epoch": 0.1451510333863275, "grad_norm": 0.2535039484500885, "learning_rate": 4.7511798067717705e-05, "loss": 0.4377, "num_input_tokens_seen": 3560476780, "step": 913, "train_runtime": 36303.5931, "train_tokens_per_second": 98075.052 }, { "epoch": 0.14531001589825118, "grad_norm": 0.23674342036247253, "learning_rate": 4.750635346455475e-05, "loss": 0.4557, "num_input_tokens_seen": 3564435495, "step": 914, "train_runtime": 36343.172, "train_tokens_per_second": 98077.171 }, { "epoch": 0.14546899841017488, "grad_norm": 0.28290459513664246, "learning_rate": 4.750090322370929e-05, "loss": 0.4597, "num_input_tokens_seen": 3568307240, "step": 915, "train_runtime": 36384.0651, "train_tokens_per_second": 98073.353 }, { "epoch": 0.14562798092209858, "grad_norm": 0.29822489619255066, "learning_rate": 4.749544734654657e-05, "loss": 0.4428, "num_input_tokens_seen": 3572081280, "step": 916, "train_runtime": 36425.0245, "train_tokens_per_second": 98066.682 }, { "epoch": 0.14578696343402225, "grad_norm": 0.2294008582830429, "learning_rate": 4.748998583443325e-05, "loss": 0.4486, "num_input_tokens_seen": 3576023679, "step": 917, "train_runtime": 36462.9789, "train_tokens_per_second": 98072.724 }, { "epoch": 0.14594594594594595, "grad_norm": 0.25027140974998474, "learning_rate": 4.74845186887374e-05, "loss": 0.4467, "num_input_tokens_seen": 3579951178, "step": 918, "train_runtime": 36502.7231, "train_tokens_per_second": 98073.537 }, { "epoch": 0.14610492845786963, "grad_norm": 0.24628248810768127, "learning_rate": 4.74790459108285e-05, "loss": 0.4484, "num_input_tokens_seen": 3583862812, "step": 919, "train_runtime": 36541.8189, "train_tokens_per_second": 98075.655 }, { "epoch": 0.14626391096979333, "grad_norm": 0.22266611456871033, "learning_rate": 4.7473567502077447e-05, "loss": 0.4499, "num_input_tokens_seen": 3587714716, "step": 920, "train_runtime": 36580.4787, "train_tokens_per_second": 98077.304 }, { "epoch": 0.146422893481717, "grad_norm": 0.22164760529994965, "learning_rate": 4.746808346385654e-05, "loss": 0.4572, "num_input_tokens_seen": 3591726969, "step": 921, "train_runtime": 36620.7757, "train_tokens_per_second": 98078.943 }, { "epoch": 0.1465818759936407, "grad_norm": 0.23550286889076233, "learning_rate": 4.7462593797539494e-05, "loss": 0.4582, "num_input_tokens_seen": 3595578564, "step": 922, "train_runtime": 36660.3736, "train_tokens_per_second": 98078.066 }, { "epoch": 0.1467408585055644, "grad_norm": 0.2243964672088623, "learning_rate": 4.745709850450143e-05, "loss": 0.4528, "num_input_tokens_seen": 3599487499, "step": 923, "train_runtime": 36700.1155, "train_tokens_per_second": 98078.37 }, { "epoch": 0.14689984101748807, "grad_norm": 0.30130085349082947, "learning_rate": 4.745159758611888e-05, "loss": 0.4606, "num_input_tokens_seen": 3603363714, "step": 924, "train_runtime": 36740.2705, "train_tokens_per_second": 98076.679 }, { "epoch": 0.14705882352941177, "grad_norm": 0.22378116846084595, "learning_rate": 4.7446091043769786e-05, "loss": 0.4465, "num_input_tokens_seen": 3607331293, "step": 925, "train_runtime": 36780.9789, "train_tokens_per_second": 98076.0 }, { "epoch": 0.14721780604133544, "grad_norm": 0.3531389832496643, "learning_rate": 4.74405788788335e-05, "loss": 0.4463, "num_input_tokens_seen": 3611182729, "step": 926, "train_runtime": 36818.2185, "train_tokens_per_second": 98081.408 }, { "epoch": 0.14737678855325914, "grad_norm": 0.2760814428329468, "learning_rate": 4.743506109269077e-05, "loss": 0.4504, "num_input_tokens_seen": 3615015437, "step": 927, "train_runtime": 36856.9235, "train_tokens_per_second": 98082.398 }, { "epoch": 0.14753577106518284, "grad_norm": 0.2705558240413666, "learning_rate": 4.742953768672378e-05, "loss": 0.4441, "num_input_tokens_seen": 3618968728, "step": 928, "train_runtime": 36897.1578, "train_tokens_per_second": 98082.588 }, { "epoch": 0.1476947535771065, "grad_norm": 0.2400878667831421, "learning_rate": 4.742400866231609e-05, "loss": 0.4568, "num_input_tokens_seen": 3622875414, "step": 929, "train_runtime": 36935.8844, "train_tokens_per_second": 98085.52 }, { "epoch": 0.1478537360890302, "grad_norm": 0.24198494851589203, "learning_rate": 4.7418474020852686e-05, "loss": 0.4522, "num_input_tokens_seen": 3626844738, "step": 930, "train_runtime": 36975.8923, "train_tokens_per_second": 98086.74 }, { "epoch": 0.14801271860095389, "grad_norm": 0.23727086186408997, "learning_rate": 4.741293376371996e-05, "loss": 0.4549, "num_input_tokens_seen": 3630738352, "step": 931, "train_runtime": 37011.7924, "train_tokens_per_second": 98096.799 }, { "epoch": 0.14817170111287759, "grad_norm": 0.28648829460144043, "learning_rate": 4.74073878923057e-05, "loss": 0.4431, "num_input_tokens_seen": 3634552755, "step": 932, "train_runtime": 37052.788, "train_tokens_per_second": 98091.209 }, { "epoch": 0.14833068362480128, "grad_norm": 0.23236073553562164, "learning_rate": 4.740183640799911e-05, "loss": 0.4371, "num_input_tokens_seen": 3638482746, "step": 933, "train_runtime": 37091.8658, "train_tokens_per_second": 98093.818 }, { "epoch": 0.14848966613672496, "grad_norm": 0.22270479798316956, "learning_rate": 4.73962793121908e-05, "loss": 0.4518, "num_input_tokens_seen": 3642366419, "step": 934, "train_runtime": 37131.2713, "train_tokens_per_second": 98094.309 }, { "epoch": 0.14864864864864866, "grad_norm": 0.2918831706047058, "learning_rate": 4.739071660627278e-05, "loss": 0.4357, "num_input_tokens_seen": 3646236428, "step": 935, "train_runtime": 37171.8276, "train_tokens_per_second": 98091.395 }, { "epoch": 0.14880763116057233, "grad_norm": 0.27525970339775085, "learning_rate": 4.738514829163848e-05, "loss": 0.4526, "num_input_tokens_seen": 3650182229, "step": 936, "train_runtime": 37211.0267, "train_tokens_per_second": 98094.102 }, { "epoch": 0.14896661367249603, "grad_norm": 0.2132565826177597, "learning_rate": 4.7379574369682703e-05, "loss": 0.4489, "num_input_tokens_seen": 3654134341, "step": 937, "train_runtime": 37249.942, "train_tokens_per_second": 98097.719 }, { "epoch": 0.1491255961844197, "grad_norm": 0.24803698062896729, "learning_rate": 4.737399484180169e-05, "loss": 0.4396, "num_input_tokens_seen": 3657979177, "step": 938, "train_runtime": 37289.762, "train_tokens_per_second": 98096.072 }, { "epoch": 0.1492845786963434, "grad_norm": 0.22960440814495087, "learning_rate": 4.736840970939308e-05, "loss": 0.458, "num_input_tokens_seen": 3661986146, "step": 939, "train_runtime": 37330.1927, "train_tokens_per_second": 98097.167 }, { "epoch": 0.1494435612082671, "grad_norm": 0.2602882981300354, "learning_rate": 4.73628189738559e-05, "loss": 0.4465, "num_input_tokens_seen": 3665901334, "step": 940, "train_runtime": 37368.2161, "train_tokens_per_second": 98102.123 }, { "epoch": 0.14960254372019077, "grad_norm": 0.22877530753612518, "learning_rate": 4.735722263659058e-05, "loss": 0.4476, "num_input_tokens_seen": 3669798027, "step": 941, "train_runtime": 37407.9688, "train_tokens_per_second": 98102.039 }, { "epoch": 0.14976152623211447, "grad_norm": 0.30758774280548096, "learning_rate": 4.735162069899899e-05, "loss": 0.4443, "num_input_tokens_seen": 3673764738, "step": 942, "train_runtime": 37448.0989, "train_tokens_per_second": 98102.837 }, { "epoch": 0.14992050874403814, "grad_norm": 0.2126237452030182, "learning_rate": 4.734601316248435e-05, "loss": 0.4455, "num_input_tokens_seen": 3677632072, "step": 943, "train_runtime": 37486.2669, "train_tokens_per_second": 98106.117 }, { "epoch": 0.15007949125596184, "grad_norm": 0.23739449679851532, "learning_rate": 4.734040002845131e-05, "loss": 0.4475, "num_input_tokens_seen": 3681618840, "step": 944, "train_runtime": 37523.6696, "train_tokens_per_second": 98114.573 }, { "epoch": 0.15023847376788554, "grad_norm": 0.22242209315299988, "learning_rate": 4.733478129830594e-05, "loss": 0.4331, "num_input_tokens_seen": 3685520453, "step": 945, "train_runtime": 37560.8926, "train_tokens_per_second": 98121.216 }, { "epoch": 0.15039745627980922, "grad_norm": 0.22601573169231415, "learning_rate": 4.732915697345567e-05, "loss": 0.4606, "num_input_tokens_seen": 3689388165, "step": 946, "train_runtime": 37599.2408, "train_tokens_per_second": 98124.007 }, { "epoch": 0.15055643879173292, "grad_norm": 0.25094738602638245, "learning_rate": 4.732352705530938e-05, "loss": 0.4517, "num_input_tokens_seen": 3693234880, "step": 947, "train_runtime": 37636.7332, "train_tokens_per_second": 98128.466 }, { "epoch": 0.1507154213036566, "grad_norm": 0.21488815546035767, "learning_rate": 4.7317891545277296e-05, "loss": 0.4365, "num_input_tokens_seen": 3697157223, "step": 948, "train_runtime": 37676.8979, "train_tokens_per_second": 98127.962 }, { "epoch": 0.1508744038155803, "grad_norm": 0.24501115083694458, "learning_rate": 4.7312250444771086e-05, "loss": 0.4416, "num_input_tokens_seen": 3701033906, "step": 949, "train_runtime": 37716.7173, "train_tokens_per_second": 98127.148 }, { "epoch": 0.151033386327504, "grad_norm": 0.2557896077632904, "learning_rate": 4.73066037552038e-05, "loss": 0.4463, "num_input_tokens_seen": 3704822763, "step": 950, "train_runtime": 37756.9105, "train_tokens_per_second": 98123.038 }, { "epoch": 0.15119236883942766, "grad_norm": 0.24009136855602264, "learning_rate": 4.7300951477989914e-05, "loss": 0.441, "num_input_tokens_seen": 3708733162, "step": 951, "train_runtime": 37795.2292, "train_tokens_per_second": 98127.019 }, { "epoch": 0.15135135135135136, "grad_norm": 0.23898006975650787, "learning_rate": 4.729529361454526e-05, "loss": 0.4364, "num_input_tokens_seen": 3712680663, "step": 952, "train_runtime": 37835.8255, "train_tokens_per_second": 98126.065 }, { "epoch": 0.15151033386327503, "grad_norm": 0.7035461664199829, "learning_rate": 4.72896301662871e-05, "loss": 0.4463, "num_input_tokens_seen": 3716585762, "step": 953, "train_runtime": 37873.4459, "train_tokens_per_second": 98131.704 }, { "epoch": 0.15166931637519873, "grad_norm": 0.2586140036582947, "learning_rate": 4.72839611346341e-05, "loss": 0.452, "num_input_tokens_seen": 3720562562, "step": 954, "train_runtime": 37910.0228, "train_tokens_per_second": 98141.924 }, { "epoch": 0.1518282988871224, "grad_norm": 0.28519853949546814, "learning_rate": 4.72782865210063e-05, "loss": 0.4381, "num_input_tokens_seen": 3724297537, "step": 955, "train_runtime": 37949.5006, "train_tokens_per_second": 98138.249 }, { "epoch": 0.1519872813990461, "grad_norm": 0.30704328417778015, "learning_rate": 4.7272606326825144e-05, "loss": 0.4377, "num_input_tokens_seen": 3728180111, "step": 956, "train_runtime": 37990.7735, "train_tokens_per_second": 98133.83 }, { "epoch": 0.1521462639109698, "grad_norm": 0.39622998237609863, "learning_rate": 4.7266920553513494e-05, "loss": 0.4466, "num_input_tokens_seen": 3732052350, "step": 957, "train_runtime": 38030.4413, "train_tokens_per_second": 98133.291 }, { "epoch": 0.15230524642289348, "grad_norm": 0.289985328912735, "learning_rate": 4.726122920249559e-05, "loss": 0.4331, "num_input_tokens_seen": 3736062234, "step": 958, "train_runtime": 38071.33, "train_tokens_per_second": 98133.221 }, { "epoch": 0.15246422893481718, "grad_norm": 0.24980942904949188, "learning_rate": 4.725553227519708e-05, "loss": 0.4593, "num_input_tokens_seen": 3739929278, "step": 959, "train_runtime": 38109.9722, "train_tokens_per_second": 98135.188 }, { "epoch": 0.15262321144674085, "grad_norm": 0.3143407702445984, "learning_rate": 4.7249829773044994e-05, "loss": 0.4323, "num_input_tokens_seen": 3743746184, "step": 960, "train_runtime": 38149.1105, "train_tokens_per_second": 98134.56 }, { "epoch": 0.15278219395866455, "grad_norm": 0.259549081325531, "learning_rate": 4.724412169746779e-05, "loss": 0.4557, "num_input_tokens_seen": 3747756720, "step": 961, "train_runtime": 38188.5961, "train_tokens_per_second": 98138.112 }, { "epoch": 0.15294117647058825, "grad_norm": 0.2380835860967636, "learning_rate": 4.723840804989527e-05, "loss": 0.4569, "num_input_tokens_seen": 3751652069, "step": 962, "train_runtime": 38227.3675, "train_tokens_per_second": 98140.477 }, { "epoch": 0.15310015898251192, "grad_norm": 0.2942010760307312, "learning_rate": 4.7232688831758686e-05, "loss": 0.4471, "num_input_tokens_seen": 3755523120, "step": 963, "train_runtime": 38264.1064, "train_tokens_per_second": 98147.415 }, { "epoch": 0.15325914149443562, "grad_norm": 0.2118634283542633, "learning_rate": 4.722696404449065e-05, "loss": 0.439, "num_input_tokens_seen": 3759291849, "step": 964, "train_runtime": 38303.9958, "train_tokens_per_second": 98143.595 }, { "epoch": 0.1534181240063593, "grad_norm": 0.22341294586658478, "learning_rate": 4.722123368952518e-05, "loss": 0.4481, "num_input_tokens_seen": 3763183262, "step": 965, "train_runtime": 38342.7194, "train_tokens_per_second": 98145.967 }, { "epoch": 0.153577106518283, "grad_norm": 0.243682861328125, "learning_rate": 4.72154977682977e-05, "loss": 0.4384, "num_input_tokens_seen": 3767247742, "step": 966, "train_runtime": 38380.4175, "train_tokens_per_second": 98155.465 }, { "epoch": 0.1537360890302067, "grad_norm": 0.2068396657705307, "learning_rate": 4.720975628224501e-05, "loss": 0.4409, "num_input_tokens_seen": 3770992606, "step": 967, "train_runtime": 38421.5676, "train_tokens_per_second": 98147.807 }, { "epoch": 0.15389507154213036, "grad_norm": 0.2510193884372711, "learning_rate": 4.7204009232805313e-05, "loss": 0.4375, "num_input_tokens_seen": 3774948712, "step": 968, "train_runtime": 38461.376, "train_tokens_per_second": 98149.081 }, { "epoch": 0.15405405405405406, "grad_norm": 0.26022711396217346, "learning_rate": 4.7198256621418205e-05, "loss": 0.4466, "num_input_tokens_seen": 3778866271, "step": 969, "train_runtime": 38502.2059, "train_tokens_per_second": 98146.747 }, { "epoch": 0.15421303656597773, "grad_norm": 0.2284679114818573, "learning_rate": 4.719249844952468e-05, "loss": 0.4466, "num_input_tokens_seen": 3782848974, "step": 970, "train_runtime": 38541.8442, "train_tokens_per_second": 98149.143 }, { "epoch": 0.15437201907790143, "grad_norm": 0.22722874581813812, "learning_rate": 4.7186734718567105e-05, "loss": 0.4485, "num_input_tokens_seen": 3786662705, "step": 971, "train_runtime": 38579.2401, "train_tokens_per_second": 98152.859 }, { "epoch": 0.1545310015898251, "grad_norm": 0.2625904977321625, "learning_rate": 4.718096542998927e-05, "loss": 0.4391, "num_input_tokens_seen": 3790547279, "step": 972, "train_runtime": 38620.0287, "train_tokens_per_second": 98149.779 }, { "epoch": 0.1546899841017488, "grad_norm": 0.22871196269989014, "learning_rate": 4.717519058523633e-05, "loss": 0.4489, "num_input_tokens_seen": 3794498874, "step": 973, "train_runtime": 38659.0755, "train_tokens_per_second": 98152.861 }, { "epoch": 0.1548489666136725, "grad_norm": 0.27903804183006287, "learning_rate": 4.716941018575484e-05, "loss": 0.4495, "num_input_tokens_seen": 3798419805, "step": 974, "train_runtime": 38697.8385, "train_tokens_per_second": 98155.865 }, { "epoch": 0.15500794912559618, "grad_norm": 0.3763938844203949, "learning_rate": 4.716362423299276e-05, "loss": 0.4296, "num_input_tokens_seen": 3802283165, "step": 975, "train_runtime": 38737.3364, "train_tokens_per_second": 98155.514 }, { "epoch": 0.15516693163751988, "grad_norm": 0.24875226616859436, "learning_rate": 4.715783272839943e-05, "loss": 0.4524, "num_input_tokens_seen": 3806184045, "step": 976, "train_runtime": 38777.4255, "train_tokens_per_second": 98154.635 }, { "epoch": 0.15532591414944355, "grad_norm": 0.22675712406635284, "learning_rate": 4.715203567342556e-05, "loss": 0.4444, "num_input_tokens_seen": 3810115180, "step": 977, "train_runtime": 38817.6267, "train_tokens_per_second": 98154.254 }, { "epoch": 0.15548489666136725, "grad_norm": 0.23575951159000397, "learning_rate": 4.71462330695233e-05, "loss": 0.4518, "num_input_tokens_seen": 3813988990, "step": 978, "train_runtime": 38858.8711, "train_tokens_per_second": 98149.763 }, { "epoch": 0.15564387917329095, "grad_norm": 0.2654387056827545, "learning_rate": 4.714042491814615e-05, "loss": 0.4435, "num_input_tokens_seen": 3817909633, "step": 979, "train_runtime": 38898.0977, "train_tokens_per_second": 98151.577 }, { "epoch": 0.15580286168521462, "grad_norm": 0.5041520595550537, "learning_rate": 4.7134611220749015e-05, "loss": 0.4478, "num_input_tokens_seen": 3821864320, "step": 980, "train_runtime": 38935.4031, "train_tokens_per_second": 98159.105 }, { "epoch": 0.15596184419713832, "grad_norm": 0.27274560928344727, "learning_rate": 4.7128791978788174e-05, "loss": 0.4464, "num_input_tokens_seen": 3825679247, "step": 981, "train_runtime": 38974.8072, "train_tokens_per_second": 98157.747 }, { "epoch": 0.156120826709062, "grad_norm": 0.2487170398235321, "learning_rate": 4.712296719372131e-05, "loss": 0.4426, "num_input_tokens_seen": 3829491627, "step": 982, "train_runtime": 39014.8177, "train_tokens_per_second": 98154.8 }, { "epoch": 0.1562798092209857, "grad_norm": 0.2326260656118393, "learning_rate": 4.71171368670075e-05, "loss": 0.4568, "num_input_tokens_seen": 3833497515, "step": 983, "train_runtime": 39054.0785, "train_tokens_per_second": 98158.698 }, { "epoch": 0.1564387917329094, "grad_norm": 0.23387646675109863, "learning_rate": 4.711130100010718e-05, "loss": 0.4545, "num_input_tokens_seen": 3837484382, "step": 984, "train_runtime": 39094.3897, "train_tokens_per_second": 98159.465 }, { "epoch": 0.15659777424483307, "grad_norm": 0.2565469741821289, "learning_rate": 4.710545959448221e-05, "loss": 0.4612, "num_input_tokens_seen": 3841268602, "step": 985, "train_runtime": 39132.8676, "train_tokens_per_second": 98159.65 }, { "epoch": 0.15675675675675677, "grad_norm": 0.2557179629802704, "learning_rate": 4.709961265159583e-05, "loss": 0.445, "num_input_tokens_seen": 3845213835, "step": 986, "train_runtime": 39170.7149, "train_tokens_per_second": 98165.526 }, { "epoch": 0.15691573926868044, "grad_norm": 0.2586371898651123, "learning_rate": 4.709376017291263e-05, "loss": 0.4442, "num_input_tokens_seen": 3849215725, "step": 987, "train_runtime": 39209.8545, "train_tokens_per_second": 98169.6 }, { "epoch": 0.15707472178060414, "grad_norm": 0.21991930902004242, "learning_rate": 4.708790215989864e-05, "loss": 0.4604, "num_input_tokens_seen": 3853086647, "step": 988, "train_runtime": 39248.396, "train_tokens_per_second": 98171.825 }, { "epoch": 0.1572337042925278, "grad_norm": 0.22650639712810516, "learning_rate": 4.708203861402123e-05, "loss": 0.4451, "num_input_tokens_seen": 3856937386, "step": 989, "train_runtime": 39287.8309, "train_tokens_per_second": 98171.299 }, { "epoch": 0.1573926868044515, "grad_norm": 0.24326185882091522, "learning_rate": 4.707616953674919e-05, "loss": 0.4418, "num_input_tokens_seen": 3860908513, "step": 990, "train_runtime": 39328.2142, "train_tokens_per_second": 98171.468 }, { "epoch": 0.1575516693163752, "grad_norm": 0.251630961894989, "learning_rate": 4.707029492955267e-05, "loss": 0.4515, "num_input_tokens_seen": 3864826006, "step": 991, "train_runtime": 39367.9011, "train_tokens_per_second": 98172.011 }, { "epoch": 0.15771065182829888, "grad_norm": 0.228254035115242, "learning_rate": 4.706441479390325e-05, "loss": 0.4433, "num_input_tokens_seen": 3868764146, "step": 992, "train_runtime": 39405.8383, "train_tokens_per_second": 98177.435 }, { "epoch": 0.15786963434022258, "grad_norm": 0.27986645698547363, "learning_rate": 4.705852913127381e-05, "loss": 0.4368, "num_input_tokens_seen": 3872579169, "step": 993, "train_runtime": 39446.397, "train_tokens_per_second": 98173.204 }, { "epoch": 0.15802861685214625, "grad_norm": 0.430959016084671, "learning_rate": 4.7052637943138714e-05, "loss": 0.446, "num_input_tokens_seen": 3876581158, "step": 994, "train_runtime": 39484.7349, "train_tokens_per_second": 98179.237 }, { "epoch": 0.15818759936406995, "grad_norm": 0.2570563554763794, "learning_rate": 4.7046741230973644e-05, "loss": 0.4424, "num_input_tokens_seen": 3880508092, "step": 995, "train_runtime": 39526.1503, "train_tokens_per_second": 98175.716 }, { "epoch": 0.15834658187599365, "grad_norm": 0.21915751695632935, "learning_rate": 4.704083899625568e-05, "loss": 0.4496, "num_input_tokens_seen": 3884379900, "step": 996, "train_runtime": 39565.0329, "train_tokens_per_second": 98177.093 }, { "epoch": 0.15850556438791732, "grad_norm": 0.240756094455719, "learning_rate": 4.70349312404633e-05, "loss": 0.45, "num_input_tokens_seen": 3888184940, "step": 997, "train_runtime": 39605.6001, "train_tokens_per_second": 98172.605 }, { "epoch": 0.15866454689984102, "grad_norm": 0.29040858149528503, "learning_rate": 4.702901796507635e-05, "loss": 0.4497, "num_input_tokens_seen": 3892067495, "step": 998, "train_runtime": 39641.3755, "train_tokens_per_second": 98181.949 }, { "epoch": 0.1588235294117647, "grad_norm": 0.23741573095321655, "learning_rate": 4.702309917157606e-05, "loss": 0.4493, "num_input_tokens_seen": 3896010958, "step": 999, "train_runtime": 39680.3252, "train_tokens_per_second": 98184.955 }, { "epoch": 0.1589825119236884, "grad_norm": 0.264017254114151, "learning_rate": 4.7017174861445054e-05, "loss": 0.4381, "num_input_tokens_seen": 3899851539, "step": 1000, "train_runtime": 39721.9002, "train_tokens_per_second": 98178.877 }, { "epoch": 0.1591414944356121, "grad_norm": 0.24536189436912537, "learning_rate": 4.701124503616733e-05, "loss": 0.4339, "num_input_tokens_seen": 3903764992, "step": 1001, "train_runtime": 39876.0213, "train_tokens_per_second": 97897.555 }, { "epoch": 0.15930047694753577, "grad_norm": 0.2388545274734497, "learning_rate": 4.700530969722826e-05, "loss": 0.4387, "num_input_tokens_seen": 3907606665, "step": 1002, "train_runtime": 39915.6576, "train_tokens_per_second": 97896.587 }, { "epoch": 0.15945945945945947, "grad_norm": 0.23479872941970825, "learning_rate": 4.699936884611461e-05, "loss": 0.4457, "num_input_tokens_seen": 3911579899, "step": 1003, "train_runtime": 39954.7447, "train_tokens_per_second": 97900.26 }, { "epoch": 0.15961844197138314, "grad_norm": 0.2659243047237396, "learning_rate": 4.699342248431452e-05, "loss": 0.4424, "num_input_tokens_seen": 3915461157, "step": 1004, "train_runtime": 39994.5699, "train_tokens_per_second": 97899.819 }, { "epoch": 0.15977742448330684, "grad_norm": 0.29806360602378845, "learning_rate": 4.698747061331751e-05, "loss": 0.4498, "num_input_tokens_seen": 3919259950, "step": 1005, "train_runtime": 40032.6115, "train_tokens_per_second": 97901.681 }, { "epoch": 0.1599364069952305, "grad_norm": 0.2564929723739624, "learning_rate": 4.698151323461448e-05, "loss": 0.4629, "num_input_tokens_seen": 3923184141, "step": 1006, "train_runtime": 40071.3, "train_tokens_per_second": 97905.088 }, { "epoch": 0.1600953895071542, "grad_norm": 0.23583383858203888, "learning_rate": 4.6975550349697716e-05, "loss": 0.4486, "num_input_tokens_seen": 3927000963, "step": 1007, "train_runtime": 40110.9285, "train_tokens_per_second": 97903.517 }, { "epoch": 0.1602543720190779, "grad_norm": 0.2745251953601837, "learning_rate": 4.6969581960060874e-05, "loss": 0.4512, "num_input_tokens_seen": 3931023086, "step": 1008, "train_runtime": 40150.6011, "train_tokens_per_second": 97906.955 }, { "epoch": 0.16041335453100158, "grad_norm": 0.22992803156375885, "learning_rate": 4.6963608067198994e-05, "loss": 0.4395, "num_input_tokens_seen": 3934976210, "step": 1009, "train_runtime": 40190.9406, "train_tokens_per_second": 97907.044 }, { "epoch": 0.16057233704292528, "grad_norm": 0.263315886259079, "learning_rate": 4.69576286726085e-05, "loss": 0.4628, "num_input_tokens_seen": 3938835555, "step": 1010, "train_runtime": 40229.1056, "train_tokens_per_second": 97910.095 }, { "epoch": 0.16073131955484896, "grad_norm": 0.42701345682144165, "learning_rate": 4.695164377778719e-05, "loss": 0.4509, "num_input_tokens_seen": 3942692599, "step": 1011, "train_runtime": 40267.794, "train_tokens_per_second": 97911.81 }, { "epoch": 0.16089030206677266, "grad_norm": 0.2704410254955292, "learning_rate": 4.694565338423422e-05, "loss": 0.4482, "num_input_tokens_seen": 3946510719, "step": 1012, "train_runtime": 40307.7634, "train_tokens_per_second": 97909.444 }, { "epoch": 0.16104928457869636, "grad_norm": 0.2675473392009735, "learning_rate": 4.693965749345016e-05, "loss": 0.4455, "num_input_tokens_seen": 3950463814, "step": 1013, "train_runtime": 40348.664, "train_tokens_per_second": 97908.169 }, { "epoch": 0.16120826709062003, "grad_norm": 0.24799129366874695, "learning_rate": 4.693365610693693e-05, "loss": 0.4395, "num_input_tokens_seen": 3954336174, "step": 1014, "train_runtime": 40386.6092, "train_tokens_per_second": 97912.062 }, { "epoch": 0.16136724960254373, "grad_norm": 0.2439851015806198, "learning_rate": 4.692764922619783e-05, "loss": 0.4644, "num_input_tokens_seen": 3958238872, "step": 1015, "train_runtime": 40427.6501, "train_tokens_per_second": 97909.2 }, { "epoch": 0.1615262321144674, "grad_norm": 0.2975598871707916, "learning_rate": 4.692163685273756e-05, "loss": 0.4457, "num_input_tokens_seen": 3962158686, "step": 1016, "train_runtime": 40465.1594, "train_tokens_per_second": 97915.311 }, { "epoch": 0.1616852146263911, "grad_norm": 0.3816700279712677, "learning_rate": 4.691561898806215e-05, "loss": 0.456, "num_input_tokens_seen": 3966023392, "step": 1017, "train_runtime": 40504.5798, "train_tokens_per_second": 97915.431 }, { "epoch": 0.16184419713831477, "grad_norm": 0.34248241782188416, "learning_rate": 4.690959563367905e-05, "loss": 0.4462, "num_input_tokens_seen": 3969899851, "step": 1018, "train_runtime": 40544.4822, "train_tokens_per_second": 97914.676 }, { "epoch": 0.16200317965023847, "grad_norm": 0.42096635699272156, "learning_rate": 4.6903566791097076e-05, "loss": 0.4378, "num_input_tokens_seen": 3973797190, "step": 1019, "train_runtime": 40583.5956, "train_tokens_per_second": 97916.341 }, { "epoch": 0.16216216216216217, "grad_norm": 0.25404343008995056, "learning_rate": 4.689753246182639e-05, "loss": 0.4369, "num_input_tokens_seen": 3977750288, "step": 1020, "train_runtime": 40621.7675, "train_tokens_per_second": 97921.645 }, { "epoch": 0.16232114467408584, "grad_norm": 0.45918112993240356, "learning_rate": 4.689149264737856e-05, "loss": 0.4578, "num_input_tokens_seen": 3981713828, "step": 1021, "train_runtime": 40662.228, "train_tokens_per_second": 97921.684 }, { "epoch": 0.16248012718600954, "grad_norm": 0.48844513297080994, "learning_rate": 4.688544734926651e-05, "loss": 0.4507, "num_input_tokens_seen": 3985605045, "step": 1022, "train_runtime": 40701.8188, "train_tokens_per_second": 97922.038 }, { "epoch": 0.16263910969793322, "grad_norm": 0.646515429019928, "learning_rate": 4.687939656900456e-05, "loss": 0.4589, "num_input_tokens_seen": 3989441833, "step": 1023, "train_runtime": 40740.8491, "train_tokens_per_second": 97922.403 }, { "epoch": 0.16279809220985691, "grad_norm": 0.260409951210022, "learning_rate": 4.687334030810837e-05, "loss": 0.4424, "num_input_tokens_seen": 3993415574, "step": 1024, "train_runtime": 40780.6762, "train_tokens_per_second": 97924.212 }, { "epoch": 0.16295707472178061, "grad_norm": 0.3053061068058014, "learning_rate": 4.6867278568094993e-05, "loss": 0.4623, "num_input_tokens_seen": 3997297617, "step": 1025, "train_runtime": 40820.5148, "train_tokens_per_second": 97923.743 }, { "epoch": 0.1631160572337043, "grad_norm": 0.25088945031166077, "learning_rate": 4.686121135048287e-05, "loss": 0.4479, "num_input_tokens_seen": 4001141028, "step": 1026, "train_runtime": 40857.3126, "train_tokens_per_second": 97929.618 }, { "epoch": 0.163275039745628, "grad_norm": 0.2668812572956085, "learning_rate": 4.6855138656791775e-05, "loss": 0.4401, "num_input_tokens_seen": 4005034834, "step": 1027, "train_runtime": 40896.2548, "train_tokens_per_second": 97931.58 }, { "epoch": 0.16343402225755166, "grad_norm": 0.2690502405166626, "learning_rate": 4.684906048854288e-05, "loss": 0.436, "num_input_tokens_seen": 4008949084, "step": 1028, "train_runtime": 40935.8272, "train_tokens_per_second": 97932.529 }, { "epoch": 0.16359300476947536, "grad_norm": 0.28026431798934937, "learning_rate": 4.6842976847258725e-05, "loss": 0.4487, "num_input_tokens_seen": 4012897275, "step": 1029, "train_runtime": 40974.2996, "train_tokens_per_second": 97936.934 }, { "epoch": 0.16375198728139906, "grad_norm": 0.2696095407009125, "learning_rate": 4.683688773446322e-05, "loss": 0.4448, "num_input_tokens_seen": 4016700257, "step": 1030, "train_runtime": 41013.9575, "train_tokens_per_second": 97934.959 }, { "epoch": 0.16391096979332273, "grad_norm": 0.24247823655605316, "learning_rate": 4.683079315168165e-05, "loss": 0.4484, "num_input_tokens_seen": 4020687097, "step": 1031, "train_runtime": 41055.0183, "train_tokens_per_second": 97934.12 }, { "epoch": 0.16406995230524643, "grad_norm": 0.3598299026489258, "learning_rate": 4.682469310044066e-05, "loss": 0.4447, "num_input_tokens_seen": 4024739899, "step": 1032, "train_runtime": 41095.5178, "train_tokens_per_second": 97936.226 }, { "epoch": 0.1642289348171701, "grad_norm": 0.20653483271598816, "learning_rate": 4.6818587582268267e-05, "loss": 0.4476, "num_input_tokens_seen": 4028525094, "step": 1033, "train_runtime": 41131.6161, "train_tokens_per_second": 97942.3 }, { "epoch": 0.1643879173290938, "grad_norm": 0.24013543128967285, "learning_rate": 4.681247659869386e-05, "loss": 0.4358, "num_input_tokens_seen": 4032352030, "step": 1034, "train_runtime": 41169.412, "train_tokens_per_second": 97945.339 }, { "epoch": 0.16454689984101747, "grad_norm": 0.23724012076854706, "learning_rate": 4.6806360151248206e-05, "loss": 0.4395, "num_input_tokens_seen": 4036319827, "step": 1035, "train_runtime": 41210.1085, "train_tokens_per_second": 97944.897 }, { "epoch": 0.16470588235294117, "grad_norm": 0.2730478048324585, "learning_rate": 4.6800238241463415e-05, "loss": 0.4324, "num_input_tokens_seen": 4040262963, "step": 1036, "train_runtime": 41248.3904, "train_tokens_per_second": 97949.591 }, { "epoch": 0.16486486486486487, "grad_norm": 0.2230813205242157, "learning_rate": 4.6794110870872995e-05, "loss": 0.4386, "num_input_tokens_seen": 4044163797, "step": 1037, "train_runtime": 41285.7642, "train_tokens_per_second": 97955.406 }, { "epoch": 0.16502384737678855, "grad_norm": 0.25216394662857056, "learning_rate": 4.678797804101182e-05, "loss": 0.4348, "num_input_tokens_seen": 4048081522, "step": 1038, "train_runtime": 41322.7043, "train_tokens_per_second": 97962.648 }, { "epoch": 0.16518282988871225, "grad_norm": 0.2476217895746231, "learning_rate": 4.6781839753416094e-05, "loss": 0.4518, "num_input_tokens_seen": 4051947334, "step": 1039, "train_runtime": 41361.5367, "train_tokens_per_second": 97964.139 }, { "epoch": 0.16534181240063592, "grad_norm": 0.22869278490543365, "learning_rate": 4.6775696009623435e-05, "loss": 0.457, "num_input_tokens_seen": 4055886100, "step": 1040, "train_runtime": 41401.8749, "train_tokens_per_second": 97963.827 }, { "epoch": 0.16550079491255962, "grad_norm": 0.248054638504982, "learning_rate": 4.676954681117279e-05, "loss": 0.4315, "num_input_tokens_seen": 4059723409, "step": 1041, "train_runtime": 41442.241, "train_tokens_per_second": 97961.001 }, { "epoch": 0.16565977742448332, "grad_norm": 0.26420795917510986, "learning_rate": 4.676339215960451e-05, "loss": 0.4474, "num_input_tokens_seen": 4063619663, "step": 1042, "train_runtime": 41482.4446, "train_tokens_per_second": 97959.985 }, { "epoch": 0.165818759936407, "grad_norm": 0.5480397343635559, "learning_rate": 4.675723205646028e-05, "loss": 0.4452, "num_input_tokens_seen": 4067498991, "step": 1043, "train_runtime": 41522.0913, "train_tokens_per_second": 97959.878 }, { "epoch": 0.1659777424483307, "grad_norm": 0.21387892961502075, "learning_rate": 4.6751066503283166e-05, "loss": 0.4436, "num_input_tokens_seen": 4071450033, "step": 1044, "train_runtime": 41559.7016, "train_tokens_per_second": 97966.296 }, { "epoch": 0.16613672496025436, "grad_norm": 0.22639648616313934, "learning_rate": 4.6744895501617594e-05, "loss": 0.4256, "num_input_tokens_seen": 4075330787, "step": 1045, "train_runtime": 41599.9515, "train_tokens_per_second": 97964.797 }, { "epoch": 0.16629570747217806, "grad_norm": 0.2173670381307602, "learning_rate": 4.673871905300935e-05, "loss": 0.4383, "num_input_tokens_seen": 4079188831, "step": 1046, "train_runtime": 41639.8183, "train_tokens_per_second": 97963.656 }, { "epoch": 0.16645468998410176, "grad_norm": 0.21140463650226593, "learning_rate": 4.6732537159005596e-05, "loss": 0.4484, "num_input_tokens_seen": 4083125558, "step": 1047, "train_runtime": 41678.7932, "train_tokens_per_second": 97966.502 }, { "epoch": 0.16661367249602543, "grad_norm": 0.2066238522529602, "learning_rate": 4.672634982115486e-05, "loss": 0.4591, "num_input_tokens_seen": 4087028650, "step": 1048, "train_runtime": 41717.1369, "train_tokens_per_second": 97970.018 }, { "epoch": 0.16677265500794913, "grad_norm": 0.21659080684185028, "learning_rate": 4.6720157041007016e-05, "loss": 0.4402, "num_input_tokens_seen": 4090850606, "step": 1049, "train_runtime": 41758.0396, "train_tokens_per_second": 97965.581 }, { "epoch": 0.1669316375198728, "grad_norm": 0.23748546838760376, "learning_rate": 4.671395882011331e-05, "loss": 0.4427, "num_input_tokens_seen": 4094841853, "step": 1050, "train_runtime": 41795.8266, "train_tokens_per_second": 97972.505 }, { "epoch": 0.1670906200317965, "grad_norm": 0.2352863848209381, "learning_rate": 4.670775516002637e-05, "loss": 0.43, "num_input_tokens_seen": 4098765112, "step": 1051, "train_runtime": 41835.0834, "train_tokens_per_second": 97974.35 }, { "epoch": 0.16724960254372018, "grad_norm": 0.2685403525829315, "learning_rate": 4.6701546062300147e-05, "loss": 0.4358, "num_input_tokens_seen": 4102693728, "step": 1052, "train_runtime": 41874.5548, "train_tokens_per_second": 97975.817 }, { "epoch": 0.16740858505564388, "grad_norm": 0.2515857517719269, "learning_rate": 4.669533152849e-05, "loss": 0.4525, "num_input_tokens_seen": 4106674779, "step": 1053, "train_runtime": 41916.6337, "train_tokens_per_second": 97972.438 }, { "epoch": 0.16756756756756758, "grad_norm": 0.24298113584518433, "learning_rate": 4.66891115601526e-05, "loss": 0.4519, "num_input_tokens_seen": 4110509700, "step": 1054, "train_runtime": 41954.3105, "train_tokens_per_second": 97975.861 }, { "epoch": 0.16772655007949125, "grad_norm": 0.26514244079589844, "learning_rate": 4.6682886158846015e-05, "loss": 0.4459, "num_input_tokens_seen": 4114339321, "step": 1055, "train_runtime": 41992.4742, "train_tokens_per_second": 97978.016 }, { "epoch": 0.16788553259141495, "grad_norm": 0.23022650182247162, "learning_rate": 4.667665532612967e-05, "loss": 0.4549, "num_input_tokens_seen": 4118218034, "step": 1056, "train_runtime": 42032.3044, "train_tokens_per_second": 97977.451 }, { "epoch": 0.16804451510333862, "grad_norm": 0.2137541025876999, "learning_rate": 4.667041906356434e-05, "loss": 0.4392, "num_input_tokens_seen": 4122065955, "step": 1057, "train_runtime": 42071.834, "train_tokens_per_second": 97976.854 }, { "epoch": 0.16820349761526232, "grad_norm": 0.237344428896904, "learning_rate": 4.666417737271217e-05, "loss": 0.4367, "num_input_tokens_seen": 4126050226, "step": 1058, "train_runtime": 42109.828, "train_tokens_per_second": 97983.07 }, { "epoch": 0.16836248012718602, "grad_norm": 0.21168604493141174, "learning_rate": 4.665793025513665e-05, "loss": 0.4401, "num_input_tokens_seen": 4129931196, "step": 1059, "train_runtime": 42150.8086, "train_tokens_per_second": 97979.881 }, { "epoch": 0.1685214626391097, "grad_norm": 0.31338420510292053, "learning_rate": 4.665167771240265e-05, "loss": 0.4519, "num_input_tokens_seen": 4133785774, "step": 1060, "train_runtime": 42187.8509, "train_tokens_per_second": 97985.218 }, { "epoch": 0.1686804451510334, "grad_norm": 0.23478567600250244, "learning_rate": 4.664541974607637e-05, "loss": 0.4485, "num_input_tokens_seen": 4137657911, "step": 1061, "train_runtime": 42227.782, "train_tokens_per_second": 97984.259 }, { "epoch": 0.16883942766295706, "grad_norm": 0.24041880667209625, "learning_rate": 4.663915635772541e-05, "loss": 0.4529, "num_input_tokens_seen": 4141536552, "step": 1062, "train_runtime": 42265.6486, "train_tokens_per_second": 97988.241 }, { "epoch": 0.16899841017488076, "grad_norm": 0.2231205850839615, "learning_rate": 4.663288754891869e-05, "loss": 0.4483, "num_input_tokens_seen": 4145481089, "step": 1063, "train_runtime": 42305.2749, "train_tokens_per_second": 97989.697 }, { "epoch": 0.16915739268680446, "grad_norm": 0.22366516292095184, "learning_rate": 4.66266133212265e-05, "loss": 0.4383, "num_input_tokens_seen": 4149353962, "step": 1064, "train_runtime": 42345.0429, "train_tokens_per_second": 97989.131 }, { "epoch": 0.16931637519872814, "grad_norm": 0.2537432909011841, "learning_rate": 4.662033367622049e-05, "loss": 0.445, "num_input_tokens_seen": 4153274970, "step": 1065, "train_runtime": 42386.094, "train_tokens_per_second": 97986.735 }, { "epoch": 0.16947535771065184, "grad_norm": 0.25627756118774414, "learning_rate": 4.661404861547368e-05, "loss": 0.4438, "num_input_tokens_seen": 4157222566, "step": 1066, "train_runtime": 42422.3395, "train_tokens_per_second": 97996.07 }, { "epoch": 0.1696343402225755, "grad_norm": 0.20725376904010773, "learning_rate": 4.660775814056042e-05, "loss": 0.4358, "num_input_tokens_seen": 4161072751, "step": 1067, "train_runtime": 42461.5238, "train_tokens_per_second": 97996.312 }, { "epoch": 0.1697933227344992, "grad_norm": 0.20511473715305328, "learning_rate": 4.660146225305643e-05, "loss": 0.4397, "num_input_tokens_seen": 4165006068, "step": 1068, "train_runtime": 42500.477, "train_tokens_per_second": 97999.043 }, { "epoch": 0.16995230524642288, "grad_norm": 0.2563074827194214, "learning_rate": 4.65951609545388e-05, "loss": 0.4535, "num_input_tokens_seen": 4168985355, "step": 1069, "train_runtime": 42541.0086, "train_tokens_per_second": 97999.213 }, { "epoch": 0.17011128775834658, "grad_norm": 0.25147032737731934, "learning_rate": 4.658885424658594e-05, "loss": 0.4482, "num_input_tokens_seen": 4172850007, "step": 1070, "train_runtime": 42579.4352, "train_tokens_per_second": 98001.535 }, { "epoch": 0.17027027027027028, "grad_norm": 0.257821649312973, "learning_rate": 4.658254213077765e-05, "loss": 0.4454, "num_input_tokens_seen": 4176803713, "step": 1071, "train_runtime": 42618.7756, "train_tokens_per_second": 98003.841 }, { "epoch": 0.17042925278219395, "grad_norm": 0.9455848932266235, "learning_rate": 4.657622460869506e-05, "loss": 0.4398, "num_input_tokens_seen": 4180652638, "step": 1072, "train_runtime": 42657.0291, "train_tokens_per_second": 98006.184 }, { "epoch": 0.17058823529411765, "grad_norm": 0.23442378640174866, "learning_rate": 4.656990168192068e-05, "loss": 0.4415, "num_input_tokens_seen": 4184491060, "step": 1073, "train_runtime": 42694.6472, "train_tokens_per_second": 98009.735 }, { "epoch": 0.17074721780604132, "grad_norm": 0.21244901418685913, "learning_rate": 4.656357335203835e-05, "loss": 0.4519, "num_input_tokens_seen": 4188439658, "step": 1074, "train_runtime": 42733.9498, "train_tokens_per_second": 98011.995 }, { "epoch": 0.17090620031796502, "grad_norm": 0.25581949949264526, "learning_rate": 4.655723962063327e-05, "loss": 0.4452, "num_input_tokens_seen": 4192331982, "step": 1075, "train_runtime": 42773.3008, "train_tokens_per_second": 98012.823 }, { "epoch": 0.17106518282988872, "grad_norm": 0.2282644659280777, "learning_rate": 4.6550900489291985e-05, "loss": 0.4585, "num_input_tokens_seen": 4196259924, "step": 1076, "train_runtime": 42813.8263, "train_tokens_per_second": 98011.794 }, { "epoch": 0.1712241653418124, "grad_norm": 0.24932020902633667, "learning_rate": 4.654455595960242e-05, "loss": 0.4387, "num_input_tokens_seen": 4200150358, "step": 1077, "train_runtime": 42853.5818, "train_tokens_per_second": 98011.652 }, { "epoch": 0.1713831478537361, "grad_norm": 0.2449694722890854, "learning_rate": 4.653820603315383e-05, "loss": 0.4551, "num_input_tokens_seen": 4204128313, "step": 1078, "train_runtime": 42891.6527, "train_tokens_per_second": 98017.401 }, { "epoch": 0.17154213036565977, "grad_norm": 0.5291522145271301, "learning_rate": 4.653185071153683e-05, "loss": 0.4383, "num_input_tokens_seen": 4207924852, "step": 1079, "train_runtime": 42930.6706, "train_tokens_per_second": 98016.751 }, { "epoch": 0.17170111287758347, "grad_norm": 0.3157670199871063, "learning_rate": 4.652548999634337e-05, "loss": 0.4478, "num_input_tokens_seen": 4211691166, "step": 1080, "train_runtime": 42967.3789, "train_tokens_per_second": 98020.668 }, { "epoch": 0.17186009538950717, "grad_norm": 0.23426765203475952, "learning_rate": 4.6519123889166774e-05, "loss": 0.4496, "num_input_tokens_seen": 4215690701, "step": 1081, "train_runtime": 43004.1636, "train_tokens_per_second": 98029.827 }, { "epoch": 0.17201907790143084, "grad_norm": 0.6055298447608948, "learning_rate": 4.651275239160171e-05, "loss": 0.4442, "num_input_tokens_seen": 4219593536, "step": 1082, "train_runtime": 43044.1368, "train_tokens_per_second": 98029.461 }, { "epoch": 0.17217806041335454, "grad_norm": 0.2692350745201111, "learning_rate": 4.6506375505244186e-05, "loss": 0.439, "num_input_tokens_seen": 4223397480, "step": 1083, "train_runtime": 43083.9091, "train_tokens_per_second": 98027.258 }, { "epoch": 0.1723370429252782, "grad_norm": 0.2787763774394989, "learning_rate": 4.649999323169157e-05, "loss": 0.4484, "num_input_tokens_seen": 4227359860, "step": 1084, "train_runtime": 43123.5467, "train_tokens_per_second": 98029.039 }, { "epoch": 0.1724960254372019, "grad_norm": 0.23367077112197876, "learning_rate": 4.6493605572542575e-05, "loss": 0.4383, "num_input_tokens_seen": 4231300580, "step": 1085, "train_runtime": 43162.636, "train_tokens_per_second": 98031.561 }, { "epoch": 0.17265500794912558, "grad_norm": 0.2416381686925888, "learning_rate": 4.648721252939727e-05, "loss": 0.4495, "num_input_tokens_seen": 4235181322, "step": 1086, "train_runtime": 43203.0636, "train_tokens_per_second": 98029.653 }, { "epoch": 0.17281399046104928, "grad_norm": 0.25862544775009155, "learning_rate": 4.648081410385706e-05, "loss": 0.4438, "num_input_tokens_seen": 4238946264, "step": 1087, "train_runtime": 43242.2805, "train_tokens_per_second": 98027.815 }, { "epoch": 0.17297297297297298, "grad_norm": 0.24534447491168976, "learning_rate": 4.6474410297524704e-05, "loss": 0.4504, "num_input_tokens_seen": 4242862673, "step": 1088, "train_runtime": 43281.2425, "train_tokens_per_second": 98030.057 }, { "epoch": 0.17313195548489665, "grad_norm": 0.2632269263267517, "learning_rate": 4.646800111200432e-05, "loss": 0.4453, "num_input_tokens_seen": 4246678430, "step": 1089, "train_runtime": 43322.3211, "train_tokens_per_second": 98025.182 }, { "epoch": 0.17329093799682035, "grad_norm": 0.2513768672943115, "learning_rate": 4.6461586548901356e-05, "loss": 0.4456, "num_input_tokens_seen": 4250667023, "step": 1090, "train_runtime": 43362.5107, "train_tokens_per_second": 98026.312 }, { "epoch": 0.17344992050874403, "grad_norm": 0.22666335105895996, "learning_rate": 4.6455166609822606e-05, "loss": 0.4417, "num_input_tokens_seen": 4254674921, "step": 1091, "train_runtime": 43399.2068, "train_tokens_per_second": 98035.776 }, { "epoch": 0.17360890302066773, "grad_norm": 0.22151009738445282, "learning_rate": 4.6448741296376246e-05, "loss": 0.4431, "num_input_tokens_seen": 4258459485, "step": 1092, "train_runtime": 43437.1904, "train_tokens_per_second": 98037.176 }, { "epoch": 0.17376788553259143, "grad_norm": 0.2918737530708313, "learning_rate": 4.6442310610171745e-05, "loss": 0.4531, "num_input_tokens_seen": 4262359234, "step": 1093, "train_runtime": 43478.4293, "train_tokens_per_second": 98033.883 }, { "epoch": 0.1739268680445151, "grad_norm": 0.2718450427055359, "learning_rate": 4.6435874552819966e-05, "loss": 0.466, "num_input_tokens_seen": 4266321913, "step": 1094, "train_runtime": 43516.8895, "train_tokens_per_second": 98038.301 }, { "epoch": 0.1740858505564388, "grad_norm": 0.21868044137954712, "learning_rate": 4.6429433125933074e-05, "loss": 0.4484, "num_input_tokens_seen": 4270220521, "step": 1095, "train_runtime": 43554.9923, "train_tokens_per_second": 98042.045 }, { "epoch": 0.17424483306836247, "grad_norm": 0.23970100283622742, "learning_rate": 4.642298633112462e-05, "loss": 0.4649, "num_input_tokens_seen": 4274024065, "step": 1096, "train_runtime": 43596.3148, "train_tokens_per_second": 98036.361 }, { "epoch": 0.17440381558028617, "grad_norm": 0.3191007077693939, "learning_rate": 4.6416534170009465e-05, "loss": 0.4341, "num_input_tokens_seen": 4278082977, "step": 1097, "train_runtime": 43634.8558, "train_tokens_per_second": 98042.789 }, { "epoch": 0.17456279809220987, "grad_norm": 0.30402228236198425, "learning_rate": 4.641007664420384e-05, "loss": 0.4468, "num_input_tokens_seen": 4281916733, "step": 1098, "train_runtime": 43674.8763, "train_tokens_per_second": 98040.729 }, { "epoch": 0.17472178060413354, "grad_norm": 0.24169208109378815, "learning_rate": 4.6403613755325306e-05, "loss": 0.4473, "num_input_tokens_seen": 4285699489, "step": 1099, "train_runtime": 43714.3335, "train_tokens_per_second": 98038.77 }, { "epoch": 0.17488076311605724, "grad_norm": 0.24503318965435028, "learning_rate": 4.6397145504992776e-05, "loss": 0.4313, "num_input_tokens_seen": 4289548023, "step": 1100, "train_runtime": 43751.4263, "train_tokens_per_second": 98043.616 }, { "epoch": 0.1750397456279809, "grad_norm": 0.3379063308238983, "learning_rate": 4.63906718948265e-05, "loss": 0.4377, "num_input_tokens_seen": 4293500954, "step": 1101, "train_runtime": 43790.6823, "train_tokens_per_second": 98045.994 }, { "epoch": 0.1751987281399046, "grad_norm": 0.24760903418064117, "learning_rate": 4.638419292644805e-05, "loss": 0.4364, "num_input_tokens_seen": 4297423848, "step": 1102, "train_runtime": 43828.5046, "train_tokens_per_second": 98050.889 }, { "epoch": 0.17535771065182829, "grad_norm": 0.24653176963329315, "learning_rate": 4.63777086014804e-05, "loss": 0.4391, "num_input_tokens_seen": 4301350823, "step": 1103, "train_runtime": 43867.8357, "train_tokens_per_second": 98052.497 }, { "epoch": 0.17551669316375199, "grad_norm": 0.227976456284523, "learning_rate": 4.63712189215478e-05, "loss": 0.4484, "num_input_tokens_seen": 4305214521, "step": 1104, "train_runtime": 43906.8878, "train_tokens_per_second": 98053.284 }, { "epoch": 0.17567567567567569, "grad_norm": 0.2504689395427704, "learning_rate": 4.636472388827587e-05, "loss": 0.4406, "num_input_tokens_seen": 4309237612, "step": 1105, "train_runtime": 43948.4157, "train_tokens_per_second": 98052.172 }, { "epoch": 0.17583465818759936, "grad_norm": 0.3526633679866791, "learning_rate": 4.635822350329159e-05, "loss": 0.4486, "num_input_tokens_seen": 4312970601, "step": 1106, "train_runtime": 43987.514, "train_tokens_per_second": 98049.883 }, { "epoch": 0.17599364069952306, "grad_norm": 0.567436695098877, "learning_rate": 4.635171776822324e-05, "loss": 0.4472, "num_input_tokens_seen": 4316993193, "step": 1107, "train_runtime": 44027.3917, "train_tokens_per_second": 98052.44 }, { "epoch": 0.17615262321144673, "grad_norm": 0.44585973024368286, "learning_rate": 4.634520668470047e-05, "loss": 0.4408, "num_input_tokens_seen": 4320836636, "step": 1108, "train_runtime": 44066.0651, "train_tokens_per_second": 98053.607 }, { "epoch": 0.17631160572337043, "grad_norm": 0.5612204074859619, "learning_rate": 4.633869025435425e-05, "loss": 0.4523, "num_input_tokens_seen": 4324746535, "step": 1109, "train_runtime": 44105.1665, "train_tokens_per_second": 98055.327 }, { "epoch": 0.17647058823529413, "grad_norm": 0.32000768184661865, "learning_rate": 4.6332168478816926e-05, "loss": 0.4354, "num_input_tokens_seen": 4328705440, "step": 1110, "train_runtime": 44144.8768, "train_tokens_per_second": 98056.802 }, { "epoch": 0.1766295707472178, "grad_norm": 0.40478286147117615, "learning_rate": 4.632564135972213e-05, "loss": 0.4459, "num_input_tokens_seen": 4332596520, "step": 1111, "train_runtime": 44183.8354, "train_tokens_per_second": 98058.407 }, { "epoch": 0.1767885532591415, "grad_norm": 0.2797462046146393, "learning_rate": 4.631910889870486e-05, "loss": 0.4446, "num_input_tokens_seen": 4336524356, "step": 1112, "train_runtime": 44222.3547, "train_tokens_per_second": 98061.815 }, { "epoch": 0.17694753577106517, "grad_norm": 0.3080621063709259, "learning_rate": 4.6312571097401466e-05, "loss": 0.4487, "num_input_tokens_seen": 4340412261, "step": 1113, "train_runtime": 44259.4879, "train_tokens_per_second": 98067.385 }, { "epoch": 0.17710651828298887, "grad_norm": 0.33752313256263733, "learning_rate": 4.630602795744961e-05, "loss": 0.446, "num_input_tokens_seen": 4344315626, "step": 1114, "train_runtime": 44296.0218, "train_tokens_per_second": 98074.623 }, { "epoch": 0.17726550079491257, "grad_norm": 0.29621267318725586, "learning_rate": 4.629947948048831e-05, "loss": 0.4663, "num_input_tokens_seen": 4348229772, "step": 1115, "train_runtime": 44335.1105, "train_tokens_per_second": 98076.439 }, { "epoch": 0.17742448330683624, "grad_norm": 0.220293328166008, "learning_rate": 4.629292566815791e-05, "loss": 0.445, "num_input_tokens_seen": 4352152028, "step": 1116, "train_runtime": 44375.2021, "train_tokens_per_second": 98076.219 }, { "epoch": 0.17758346581875994, "grad_norm": 0.24466419219970703, "learning_rate": 4.628636652210009e-05, "loss": 0.4404, "num_input_tokens_seen": 4356118482, "step": 1117, "train_runtime": 44413.1548, "train_tokens_per_second": 98081.717 }, { "epoch": 0.17774244833068362, "grad_norm": 0.2433171421289444, "learning_rate": 4.6279802043957875e-05, "loss": 0.4553, "num_input_tokens_seen": 4360051381, "step": 1118, "train_runtime": 44454.6196, "train_tokens_per_second": 98078.702 }, { "epoch": 0.17790143084260732, "grad_norm": 0.22027252614498138, "learning_rate": 4.6273232235375626e-05, "loss": 0.4426, "num_input_tokens_seen": 4363720036, "step": 1119, "train_runtime": 44492.9631, "train_tokens_per_second": 98076.634 }, { "epoch": 0.178060413354531, "grad_norm": 0.21107937395572662, "learning_rate": 4.626665709799902e-05, "loss": 0.4486, "num_input_tokens_seen": 4367661934, "step": 1120, "train_runtime": 44532.3113, "train_tokens_per_second": 98078.492 }, { "epoch": 0.1782193958664547, "grad_norm": 0.2088969349861145, "learning_rate": 4.6260076633475086e-05, "loss": 0.4348, "num_input_tokens_seen": 4371568196, "step": 1121, "train_runtime": 44571.8502, "train_tokens_per_second": 98079.128 }, { "epoch": 0.1783783783783784, "grad_norm": 0.22047539055347443, "learning_rate": 4.6253490843452186e-05, "loss": 0.4348, "num_input_tokens_seen": 4375415118, "step": 1122, "train_runtime": 44609.7176, "train_tokens_per_second": 98082.107 }, { "epoch": 0.17853736089030206, "grad_norm": 0.2188820093870163, "learning_rate": 4.6246899729580015e-05, "loss": 0.4431, "num_input_tokens_seen": 4379463259, "step": 1123, "train_runtime": 44649.2265, "train_tokens_per_second": 98085.983 }, { "epoch": 0.17869634340222576, "grad_norm": 0.22942940890789032, "learning_rate": 4.624030329350961e-05, "loss": 0.4389, "num_input_tokens_seen": 4383277262, "step": 1124, "train_runtime": 44689.3479, "train_tokens_per_second": 98083.267 }, { "epoch": 0.17885532591414943, "grad_norm": 0.24680939316749573, "learning_rate": 4.623370153689331e-05, "loss": 0.46, "num_input_tokens_seen": 4387108837, "step": 1125, "train_runtime": 44728.878, "train_tokens_per_second": 98082.246 }, { "epoch": 0.17901430842607313, "grad_norm": 0.3410927951335907, "learning_rate": 4.622709446138482e-05, "loss": 0.4508, "num_input_tokens_seen": 4391112161, "step": 1126, "train_runtime": 44765.8946, "train_tokens_per_second": 98090.571 }, { "epoch": 0.17917329093799683, "grad_norm": 0.2584172189235687, "learning_rate": 4.622048206863917e-05, "loss": 0.4273, "num_input_tokens_seen": 4394913276, "step": 1127, "train_runtime": 44804.536, "train_tokens_per_second": 98090.811 }, { "epoch": 0.1793322734499205, "grad_norm": 0.263923317193985, "learning_rate": 4.6213864360312715e-05, "loss": 0.4443, "num_input_tokens_seen": 4398925352, "step": 1128, "train_runtime": 44844.0318, "train_tokens_per_second": 98093.886 }, { "epoch": 0.1794912559618442, "grad_norm": 0.23756113648414612, "learning_rate": 4.6207241338063134e-05, "loss": 0.4387, "num_input_tokens_seen": 4402760812, "step": 1129, "train_runtime": 44885.901, "train_tokens_per_second": 98087.834 }, { "epoch": 0.17965023847376788, "grad_norm": 0.23263530433177948, "learning_rate": 4.620061300354946e-05, "loss": 0.4591, "num_input_tokens_seen": 4406625271, "step": 1130, "train_runtime": 44927.6835, "train_tokens_per_second": 98082.628 }, { "epoch": 0.17980922098569158, "grad_norm": 0.2106216698884964, "learning_rate": 4.619397935843205e-05, "loss": 0.4327, "num_input_tokens_seen": 4410471034, "step": 1131, "train_runtime": 44963.293, "train_tokens_per_second": 98090.481 }, { "epoch": 0.17996820349761528, "grad_norm": 0.2356870174407959, "learning_rate": 4.618734040437256e-05, "loss": 0.4435, "num_input_tokens_seen": 4414268585, "step": 1132, "train_runtime": 45000.4069, "train_tokens_per_second": 98093.97 }, { "epoch": 0.18012718600953895, "grad_norm": 0.24012348055839539, "learning_rate": 4.618069614303402e-05, "loss": 0.4579, "num_input_tokens_seen": 4418206916, "step": 1133, "train_runtime": 45039.3864, "train_tokens_per_second": 98096.517 }, { "epoch": 0.18028616852146265, "grad_norm": 0.21601566672325134, "learning_rate": 4.617404657608077e-05, "loss": 0.4451, "num_input_tokens_seen": 4422202188, "step": 1134, "train_runtime": 45079.8975, "train_tokens_per_second": 98096.988 }, { "epoch": 0.18044515103338632, "grad_norm": 0.2761297821998596, "learning_rate": 4.616739170517847e-05, "loss": 0.4561, "num_input_tokens_seen": 4426134138, "step": 1135, "train_runtime": 45117.7535, "train_tokens_per_second": 98101.829 }, { "epoch": 0.18060413354531002, "grad_norm": 0.288546621799469, "learning_rate": 4.616073153199413e-05, "loss": 0.4439, "num_input_tokens_seen": 4429979993, "step": 1136, "train_runtime": 45157.4486, "train_tokens_per_second": 98100.759 }, { "epoch": 0.1807631160572337, "grad_norm": 0.24862606823444366, "learning_rate": 4.615406605819607e-05, "loss": 0.4345, "num_input_tokens_seen": 4433948431, "step": 1137, "train_runtime": 45196.0952, "train_tokens_per_second": 98104.68 }, { "epoch": 0.1809220985691574, "grad_norm": 0.2700712978839874, "learning_rate": 4.614739528545394e-05, "loss": 0.4406, "num_input_tokens_seen": 4437738082, "step": 1138, "train_runtime": 45235.9503, "train_tokens_per_second": 98102.02 }, { "epoch": 0.1810810810810811, "grad_norm": 0.2662597894668579, "learning_rate": 4.614071921543873e-05, "loss": 0.4412, "num_input_tokens_seen": 4441667405, "step": 1139, "train_runtime": 45275.032, "train_tokens_per_second": 98104.125 }, { "epoch": 0.18124006359300476, "grad_norm": 0.2861645221710205, "learning_rate": 4.6134037849822745e-05, "loss": 0.438, "num_input_tokens_seen": 4445429034, "step": 1140, "train_runtime": 45314.0412, "train_tokens_per_second": 98102.683 }, { "epoch": 0.18139904610492846, "grad_norm": 0.22939537465572357, "learning_rate": 4.612735119027962e-05, "loss": 0.4473, "num_input_tokens_seen": 4449389463, "step": 1141, "train_runtime": 45352.7241, "train_tokens_per_second": 98106.333 }, { "epoch": 0.18155802861685214, "grad_norm": 0.23543797433376312, "learning_rate": 4.612065923848431e-05, "loss": 0.4448, "num_input_tokens_seen": 4453372057, "step": 1142, "train_runtime": 45391.5888, "train_tokens_per_second": 98110.072 }, { "epoch": 0.18171701112877583, "grad_norm": 0.4915226101875305, "learning_rate": 4.611396199611311e-05, "loss": 0.4481, "num_input_tokens_seen": 4457284369, "step": 1143, "train_runtime": 45432.0733, "train_tokens_per_second": 98108.76 }, { "epoch": 0.18187599364069953, "grad_norm": 0.28348949551582336, "learning_rate": 4.610725946484363e-05, "loss": 0.4561, "num_input_tokens_seen": 4461127307, "step": 1144, "train_runtime": 45472.7136, "train_tokens_per_second": 98105.588 }, { "epoch": 0.1820349761526232, "grad_norm": 0.2402498573064804, "learning_rate": 4.61005516463548e-05, "loss": 0.4457, "num_input_tokens_seen": 4465024762, "step": 1145, "train_runtime": 45510.9085, "train_tokens_per_second": 98108.891 }, { "epoch": 0.1821939586645469, "grad_norm": 0.2359353005886078, "learning_rate": 4.609383854232689e-05, "loss": 0.4429, "num_input_tokens_seen": 4468909415, "step": 1146, "train_runtime": 45549.765, "train_tokens_per_second": 98110.482 }, { "epoch": 0.18235294117647058, "grad_norm": 0.2999405264854431, "learning_rate": 4.608712015444148e-05, "loss": 0.45, "num_input_tokens_seen": 4472756817, "step": 1147, "train_runtime": 45588.7629, "train_tokens_per_second": 98110.95 }, { "epoch": 0.18251192368839428, "grad_norm": 0.27648821473121643, "learning_rate": 4.6080396484381486e-05, "loss": 0.4477, "num_input_tokens_seen": 4476633041, "step": 1148, "train_runtime": 45627.4212, "train_tokens_per_second": 98112.778 }, { "epoch": 0.18267090620031798, "grad_norm": 0.2434069812297821, "learning_rate": 4.607366753383112e-05, "loss": 0.4341, "num_input_tokens_seen": 4480559339, "step": 1149, "train_runtime": 45664.19, "train_tokens_per_second": 98119.759 }, { "epoch": 0.18282988871224165, "grad_norm": 0.22906813025474548, "learning_rate": 4.606693330447597e-05, "loss": 0.4609, "num_input_tokens_seen": 4484418457, "step": 1150, "train_runtime": 45704.1151, "train_tokens_per_second": 98118.483 }, { "epoch": 0.18298887122416535, "grad_norm": 0.2882539927959442, "learning_rate": 4.606019379800288e-05, "loss": 0.4317, "num_input_tokens_seen": 4488368461, "step": 1151, "train_runtime": 45744.9076, "train_tokens_per_second": 98117.336 }, { "epoch": 0.18314785373608902, "grad_norm": 0.25017085671424866, "learning_rate": 4.605344901610007e-05, "loss": 0.439, "num_input_tokens_seen": 4492305647, "step": 1152, "train_runtime": 45781.724, "train_tokens_per_second": 98124.432 }, { "epoch": 0.18330683624801272, "grad_norm": 0.2817278206348419, "learning_rate": 4.6046698960457044e-05, "loss": 0.4535, "num_input_tokens_seen": 4496095431, "step": 1153, "train_runtime": 45820.4077, "train_tokens_per_second": 98124.3 }, { "epoch": 0.1834658187599364, "grad_norm": 0.22082051634788513, "learning_rate": 4.603994363276466e-05, "loss": 0.4456, "num_input_tokens_seen": 4500071090, "step": 1154, "train_runtime": 45859.3173, "train_tokens_per_second": 98127.738 }, { "epoch": 0.1836248012718601, "grad_norm": 0.24453996121883392, "learning_rate": 4.603318303471507e-05, "loss": 0.4507, "num_input_tokens_seen": 4504009160, "step": 1155, "train_runtime": 45896.6402, "train_tokens_per_second": 98133.744 }, { "epoch": 0.1837837837837838, "grad_norm": 0.2508196234703064, "learning_rate": 4.602641716800176e-05, "loss": 0.4462, "num_input_tokens_seen": 4507922828, "step": 1156, "train_runtime": 45938.2462, "train_tokens_per_second": 98130.059 }, { "epoch": 0.18394276629570747, "grad_norm": 0.27411797642707825, "learning_rate": 4.601964603431952e-05, "loss": 0.4403, "num_input_tokens_seen": 4511764763, "step": 1157, "train_runtime": 45979.8368, "train_tokens_per_second": 98124.854 }, { "epoch": 0.18410174880763117, "grad_norm": 0.2264517992734909, "learning_rate": 4.6012869635364485e-05, "loss": 0.4595, "num_input_tokens_seen": 4515770979, "step": 1158, "train_runtime": 46021.1107, "train_tokens_per_second": 98123.903 }, { "epoch": 0.18426073131955484, "grad_norm": 0.27602261304855347, "learning_rate": 4.600608797283409e-05, "loss": 0.4413, "num_input_tokens_seen": 4519705073, "step": 1159, "train_runtime": 46058.8963, "train_tokens_per_second": 98128.818 }, { "epoch": 0.18441971383147854, "grad_norm": 0.2587061822414398, "learning_rate": 4.5999301048427104e-05, "loss": 0.4345, "num_input_tokens_seen": 4523499291, "step": 1160, "train_runtime": 46095.8306, "train_tokens_per_second": 98132.504 }, { "epoch": 0.18457869634340224, "grad_norm": 0.24217544496059418, "learning_rate": 4.599250886384358e-05, "loss": 0.4419, "num_input_tokens_seen": 4527335537, "step": 1161, "train_runtime": 46136.2434, "train_tokens_per_second": 98129.696 }, { "epoch": 0.1847376788553259, "grad_norm": 0.21650706231594086, "learning_rate": 4.598571142078494e-05, "loss": 0.4436, "num_input_tokens_seen": 4531266565, "step": 1162, "train_runtime": 46173.4411, "train_tokens_per_second": 98135.778 }, { "epoch": 0.1848966613672496, "grad_norm": 0.23993101716041565, "learning_rate": 4.5978908720953865e-05, "loss": 0.4549, "num_input_tokens_seen": 4535146101, "step": 1163, "train_runtime": 46213.7702, "train_tokens_per_second": 98134.086 }, { "epoch": 0.18505564387917328, "grad_norm": 0.2679230868816376, "learning_rate": 4.597210076605441e-05, "loss": 0.4285, "num_input_tokens_seen": 4538869241, "step": 1164, "train_runtime": 46254.0783, "train_tokens_per_second": 98129.06 }, { "epoch": 0.18521462639109698, "grad_norm": 0.27569445967674255, "learning_rate": 4.596528755779191e-05, "loss": 0.4461, "num_input_tokens_seen": 4542914244, "step": 1165, "train_runtime": 46293.5782, "train_tokens_per_second": 98132.709 }, { "epoch": 0.18537360890302068, "grad_norm": 1.1628018617630005, "learning_rate": 4.595846909787302e-05, "loss": 0.4506, "num_input_tokens_seen": 4546806531, "step": 1166, "train_runtime": 46331.9445, "train_tokens_per_second": 98135.457 }, { "epoch": 0.18553259141494435, "grad_norm": 0.22570878267288208, "learning_rate": 4.595164538800573e-05, "loss": 0.4229, "num_input_tokens_seen": 4550711423, "step": 1167, "train_runtime": 46370.1547, "train_tokens_per_second": 98138.802 }, { "epoch": 0.18569157392686805, "grad_norm": 0.3791438341140747, "learning_rate": 4.5944816429899305e-05, "loss": 0.4382, "num_input_tokens_seen": 4554527589, "step": 1168, "train_runtime": 46409.0233, "train_tokens_per_second": 98138.837 }, { "epoch": 0.18585055643879173, "grad_norm": 0.3477688431739807, "learning_rate": 4.5937982225264384e-05, "loss": 0.4461, "num_input_tokens_seen": 4558489317, "step": 1169, "train_runtime": 46448.6802, "train_tokens_per_second": 98140.341 }, { "epoch": 0.18600953895071543, "grad_norm": 0.2149677574634552, "learning_rate": 4.5931142775812865e-05, "loss": 0.4474, "num_input_tokens_seen": 4562362341, "step": 1170, "train_runtime": 46487.8444, "train_tokens_per_second": 98140.974 }, { "epoch": 0.1861685214626391, "grad_norm": 0.22462138533592224, "learning_rate": 4.592429808325799e-05, "loss": 0.4518, "num_input_tokens_seen": 4566247587, "step": 1171, "train_runtime": 46528.3024, "train_tokens_per_second": 98139.14 }, { "epoch": 0.1863275039745628, "grad_norm": 0.21554265916347504, "learning_rate": 4.5917448149314304e-05, "loss": 0.4371, "num_input_tokens_seen": 4570106379, "step": 1172, "train_runtime": 46568.2538, "train_tokens_per_second": 98137.809 }, { "epoch": 0.1864864864864865, "grad_norm": 0.22903235256671906, "learning_rate": 4.591059297569766e-05, "loss": 0.4514, "num_input_tokens_seen": 4574064703, "step": 1173, "train_runtime": 46608.4536, "train_tokens_per_second": 98138.092 }, { "epoch": 0.18664546899841017, "grad_norm": 0.2168252021074295, "learning_rate": 4.590373256412524e-05, "loss": 0.4425, "num_input_tokens_seen": 4578008059, "step": 1174, "train_runtime": 46647.2224, "train_tokens_per_second": 98141.065 }, { "epoch": 0.18680445151033387, "grad_norm": 0.2067456841468811, "learning_rate": 4.589686691631553e-05, "loss": 0.445, "num_input_tokens_seen": 4581901819, "step": 1175, "train_runtime": 46686.148, "train_tokens_per_second": 98142.64 }, { "epoch": 0.18696343402225754, "grad_norm": 0.2205818146467209, "learning_rate": 4.5889996033988314e-05, "loss": 0.4503, "num_input_tokens_seen": 4585767215, "step": 1176, "train_runtime": 46727.23, "train_tokens_per_second": 98139.077 }, { "epoch": 0.18712241653418124, "grad_norm": 0.22557607293128967, "learning_rate": 4.5883119918864716e-05, "loss": 0.4427, "num_input_tokens_seen": 4589670518, "step": 1177, "train_runtime": 46765.9828, "train_tokens_per_second": 98141.218 }, { "epoch": 0.18728139904610494, "grad_norm": 0.22054553031921387, "learning_rate": 4.587623857266714e-05, "loss": 0.4397, "num_input_tokens_seen": 4593621183, "step": 1178, "train_runtime": 46807.4243, "train_tokens_per_second": 98138.73 }, { "epoch": 0.1874403815580286, "grad_norm": 0.24419547617435455, "learning_rate": 4.586935199711932e-05, "loss": 0.4381, "num_input_tokens_seen": 4597462630, "step": 1179, "train_runtime": 46847.5781, "train_tokens_per_second": 98136.613 }, { "epoch": 0.1875993640699523, "grad_norm": 0.23686762154102325, "learning_rate": 4.58624601939463e-05, "loss": 0.4413, "num_input_tokens_seen": 4601398234, "step": 1180, "train_runtime": 46887.0534, "train_tokens_per_second": 98137.927 }, { "epoch": 0.18775834658187598, "grad_norm": 0.21315959095954895, "learning_rate": 4.585556316487442e-05, "loss": 0.4362, "num_input_tokens_seen": 4605325111, "step": 1181, "train_runtime": 46926.8059, "train_tokens_per_second": 98138.474 }, { "epoch": 0.18791732909379968, "grad_norm": 0.31611648201942444, "learning_rate": 4.584866091163134e-05, "loss": 0.4483, "num_input_tokens_seen": 4609232540, "step": 1182, "train_runtime": 46966.1394, "train_tokens_per_second": 98139.481 }, { "epoch": 0.18807631160572338, "grad_norm": 0.2680906355381012, "learning_rate": 4.584175343594603e-05, "loss": 0.4464, "num_input_tokens_seen": 4613152256, "step": 1183, "train_runtime": 47005.4506, "train_tokens_per_second": 98140.794 }, { "epoch": 0.18823529411764706, "grad_norm": 0.21482470631599426, "learning_rate": 4.5834840739548755e-05, "loss": 0.4388, "num_input_tokens_seen": 4617146293, "step": 1184, "train_runtime": 47043.5133, "train_tokens_per_second": 98146.29 }, { "epoch": 0.18839427662957076, "grad_norm": 0.24640077352523804, "learning_rate": 4.58279228241711e-05, "loss": 0.4318, "num_input_tokens_seen": 4621015715, "step": 1185, "train_runtime": 47083.361, "train_tokens_per_second": 98145.409 }, { "epoch": 0.18855325914149443, "grad_norm": 0.3173198401927948, "learning_rate": 4.582099969154596e-05, "loss": 0.4418, "num_input_tokens_seen": 4624968370, "step": 1186, "train_runtime": 47123.3563, "train_tokens_per_second": 98145.988 }, { "epoch": 0.18871224165341813, "grad_norm": 0.2121065855026245, "learning_rate": 4.581407134340752e-05, "loss": 0.4565, "num_input_tokens_seen": 4628945924, "step": 1187, "train_runtime": 47163.8587, "train_tokens_per_second": 98146.039 }, { "epoch": 0.1888712241653418, "grad_norm": 0.23557373881340027, "learning_rate": 4.580713778149129e-05, "loss": 0.4387, "num_input_tokens_seen": 4632783579, "step": 1188, "train_runtime": 47202.3737, "train_tokens_per_second": 98147.259 }, { "epoch": 0.1890302066772655, "grad_norm": 0.22485262155532837, "learning_rate": 4.580019900753408e-05, "loss": 0.4341, "num_input_tokens_seen": 4636703980, "step": 1189, "train_runtime": 47242.4499, "train_tokens_per_second": 98146.984 }, { "epoch": 0.1891891891891892, "grad_norm": 0.21469677984714508, "learning_rate": 4.5793255023273996e-05, "loss": 0.4505, "num_input_tokens_seen": 4640701523, "step": 1190, "train_runtime": 47282.1599, "train_tokens_per_second": 98149.102 }, { "epoch": 0.18934817170111287, "grad_norm": 0.255754679441452, "learning_rate": 4.578630583045046e-05, "loss": 0.4509, "num_input_tokens_seen": 4644484664, "step": 1191, "train_runtime": 47321.1195, "train_tokens_per_second": 98148.241 }, { "epoch": 0.18950715421303657, "grad_norm": 0.22938761115074158, "learning_rate": 4.577935143080419e-05, "loss": 0.4361, "num_input_tokens_seen": 4648294661, "step": 1192, "train_runtime": 47362.1132, "train_tokens_per_second": 98143.734 }, { "epoch": 0.18966613672496024, "grad_norm": 0.20817574858665466, "learning_rate": 4.5772391826077235e-05, "loss": 0.4352, "num_input_tokens_seen": 4652270899, "step": 1193, "train_runtime": 47402.1523, "train_tokens_per_second": 98144.719 }, { "epoch": 0.18982511923688394, "grad_norm": 0.20112088322639465, "learning_rate": 4.576542701801291e-05, "loss": 0.4481, "num_input_tokens_seen": 4656270376, "step": 1194, "train_runtime": 47442.6292, "train_tokens_per_second": 98145.285 }, { "epoch": 0.18998410174880764, "grad_norm": 0.24571825563907623, "learning_rate": 4.5758457008355846e-05, "loss": 0.4441, "num_input_tokens_seen": 4660040542, "step": 1195, "train_runtime": 47479.9588, "train_tokens_per_second": 98147.527 }, { "epoch": 0.19014308426073132, "grad_norm": 0.2193192094564438, "learning_rate": 4.5751481798852e-05, "loss": 0.4351, "num_input_tokens_seen": 4663903647, "step": 1196, "train_runtime": 47522.1942, "train_tokens_per_second": 98141.589 }, { "epoch": 0.19030206677265502, "grad_norm": 0.19966693222522736, "learning_rate": 4.574450139124859e-05, "loss": 0.442, "num_input_tokens_seen": 4667781175, "step": 1197, "train_runtime": 47559.4417, "train_tokens_per_second": 98146.257 }, { "epoch": 0.1904610492845787, "grad_norm": 0.2185032218694687, "learning_rate": 4.5737515787294175e-05, "loss": 0.4516, "num_input_tokens_seen": 4671711148, "step": 1198, "train_runtime": 47599.8795, "train_tokens_per_second": 98145.441 }, { "epoch": 0.1906200317965024, "grad_norm": 0.20912985503673553, "learning_rate": 4.573052498873859e-05, "loss": 0.4444, "num_input_tokens_seen": 4675467445, "step": 1199, "train_runtime": 47639.1707, "train_tokens_per_second": 98143.343 }, { "epoch": 0.1907790143084261, "grad_norm": 0.2080649733543396, "learning_rate": 4.572352899733299e-05, "loss": 0.4513, "num_input_tokens_seen": 4679388189, "step": 1200, "train_runtime": 47679.8096, "train_tokens_per_second": 98141.923 }, { "epoch": 0.19093799682034976, "grad_norm": 0.2284799963235855, "learning_rate": 4.571652781482981e-05, "loss": 0.4374, "num_input_tokens_seen": 4683340942, "step": 1201, "train_runtime": 47834.7831, "train_tokens_per_second": 97906.599 }, { "epoch": 0.19109697933227346, "grad_norm": 0.20886380970478058, "learning_rate": 4.57095214429828e-05, "loss": 0.431, "num_input_tokens_seen": 4687264445, "step": 1202, "train_runtime": 47873.6511, "train_tokens_per_second": 97909.066 }, { "epoch": 0.19125596184419713, "grad_norm": 0.5377106070518494, "learning_rate": 4.570250988354701e-05, "loss": 0.4375, "num_input_tokens_seen": 4691148259, "step": 1203, "train_runtime": 47912.6594, "train_tokens_per_second": 97910.413 }, { "epoch": 0.19141494435612083, "grad_norm": 0.24360662698745728, "learning_rate": 4.569549313827879e-05, "loss": 0.4517, "num_input_tokens_seen": 4694972469, "step": 1204, "train_runtime": 47952.3683, "train_tokens_per_second": 97909.084 }, { "epoch": 0.1915739268680445, "grad_norm": 0.28749310970306396, "learning_rate": 4.5688471208935776e-05, "loss": 0.4442, "num_input_tokens_seen": 4698999262, "step": 1205, "train_runtime": 47991.9889, "train_tokens_per_second": 97912.159 }, { "epoch": 0.1917329093799682, "grad_norm": 0.22291229665279388, "learning_rate": 4.568144409727693e-05, "loss": 0.4397, "num_input_tokens_seen": 4702822138, "step": 1206, "train_runtime": 48032.8122, "train_tokens_per_second": 97908.532 }, { "epoch": 0.1918918918918919, "grad_norm": 0.2205171138048172, "learning_rate": 4.5674411805062466e-05, "loss": 0.4401, "num_input_tokens_seen": 4706691614, "step": 1207, "train_runtime": 48070.704, "train_tokens_per_second": 97911.851 }, { "epoch": 0.19205087440381557, "grad_norm": 0.22991468012332916, "learning_rate": 4.5667374334053936e-05, "loss": 0.4462, "num_input_tokens_seen": 4710680906, "step": 1208, "train_runtime": 48110.2395, "train_tokens_per_second": 97914.31 }, { "epoch": 0.19220985691573927, "grad_norm": 0.22080880403518677, "learning_rate": 4.566033168601419e-05, "loss": 0.4336, "num_input_tokens_seen": 4714518481, "step": 1209, "train_runtime": 48149.291, "train_tokens_per_second": 97914.598 }, { "epoch": 0.19236883942766295, "grad_norm": 0.2705855965614319, "learning_rate": 4.565328386270734e-05, "loss": 0.4495, "num_input_tokens_seen": 4718316593, "step": 1210, "train_runtime": 48190.4279, "train_tokens_per_second": 97909.83 }, { "epoch": 0.19252782193958665, "grad_norm": 0.20775888860225677, "learning_rate": 4.564623086589883e-05, "loss": 0.437, "num_input_tokens_seen": 4722312637, "step": 1211, "train_runtime": 48227.2132, "train_tokens_per_second": 97918.008 }, { "epoch": 0.19268680445151035, "grad_norm": 0.2101748287677765, "learning_rate": 4.563917269735538e-05, "loss": 0.4396, "num_input_tokens_seen": 4726305292, "step": 1212, "train_runtime": 48264.8775, "train_tokens_per_second": 97924.32 }, { "epoch": 0.19284578696343402, "grad_norm": 0.2320111244916916, "learning_rate": 4.563210935884501e-05, "loss": 0.4512, "num_input_tokens_seen": 4730181746, "step": 1213, "train_runtime": 48305.5952, "train_tokens_per_second": 97922.026 }, { "epoch": 0.19300476947535772, "grad_norm": 0.21445997059345245, "learning_rate": 4.5625040852137046e-05, "loss": 0.441, "num_input_tokens_seen": 4734098822, "step": 1214, "train_runtime": 48345.3208, "train_tokens_per_second": 97922.586 }, { "epoch": 0.1931637519872814, "grad_norm": 0.21495836973190308, "learning_rate": 4.561796717900208e-05, "loss": 0.4315, "num_input_tokens_seen": 4737992777, "step": 1215, "train_runtime": 48385.7266, "train_tokens_per_second": 97921.29 }, { "epoch": 0.1933227344992051, "grad_norm": 0.20522014796733856, "learning_rate": 4.561088834121205e-05, "loss": 0.4483, "num_input_tokens_seen": 4741858599, "step": 1216, "train_runtime": 48427.2812, "train_tokens_per_second": 97917.093 }, { "epoch": 0.19348171701112876, "grad_norm": 0.22441644966602325, "learning_rate": 4.5603804340540126e-05, "loss": 0.4362, "num_input_tokens_seen": 4745749314, "step": 1217, "train_runtime": 48466.4957, "train_tokens_per_second": 97918.144 }, { "epoch": 0.19364069952305246, "grad_norm": 0.2498585283756256, "learning_rate": 4.55967151787608e-05, "loss": 0.446, "num_input_tokens_seen": 4749652551, "step": 1218, "train_runtime": 48506.6113, "train_tokens_per_second": 97917.633 }, { "epoch": 0.19379968203497616, "grad_norm": 0.24820148944854736, "learning_rate": 4.558962085764987e-05, "loss": 0.4372, "num_input_tokens_seen": 4753548324, "step": 1219, "train_runtime": 48545.9108, "train_tokens_per_second": 97918.614 }, { "epoch": 0.19395866454689983, "grad_norm": 0.2652827203273773, "learning_rate": 4.558252137898441e-05, "loss": 0.4333, "num_input_tokens_seen": 4757511527, "step": 1220, "train_runtime": 48584.9915, "train_tokens_per_second": 97921.424 }, { "epoch": 0.19411764705882353, "grad_norm": 0.28497883677482605, "learning_rate": 4.557541674454279e-05, "loss": 0.4293, "num_input_tokens_seen": 4761368520, "step": 1221, "train_runtime": 48621.0631, "train_tokens_per_second": 97928.104 }, { "epoch": 0.1942766295707472, "grad_norm": 0.2293694168329239, "learning_rate": 4.556830695610467e-05, "loss": 0.442, "num_input_tokens_seen": 4765370239, "step": 1222, "train_runtime": 48660.9318, "train_tokens_per_second": 97930.107 }, { "epoch": 0.1944356120826709, "grad_norm": 0.21263554692268372, "learning_rate": 4.556119201545099e-05, "loss": 0.4304, "num_input_tokens_seen": 4769218998, "step": 1223, "train_runtime": 48702.5712, "train_tokens_per_second": 97925.405 }, { "epoch": 0.1945945945945946, "grad_norm": 0.2224036455154419, "learning_rate": 4.555407192436401e-05, "loss": 0.4304, "num_input_tokens_seen": 4773022567, "step": 1224, "train_runtime": 48739.9484, "train_tokens_per_second": 97928.347 }, { "epoch": 0.19475357710651828, "grad_norm": 0.25400781631469727, "learning_rate": 4.5546946684627256e-05, "loss": 0.4355, "num_input_tokens_seen": 4776834975, "step": 1225, "train_runtime": 48780.9448, "train_tokens_per_second": 97924.2 }, { "epoch": 0.19491255961844198, "grad_norm": 0.21377822756767273, "learning_rate": 4.553981629802555e-05, "loss": 0.4321, "num_input_tokens_seen": 4780773837, "step": 1226, "train_runtime": 48818.4136, "train_tokens_per_second": 97929.725 }, { "epoch": 0.19507154213036565, "grad_norm": 0.20339322090148926, "learning_rate": 4.5532680766345e-05, "loss": 0.444, "num_input_tokens_seen": 4784705227, "step": 1227, "train_runtime": 48858.4387, "train_tokens_per_second": 97929.966 }, { "epoch": 0.19523052464228935, "grad_norm": 0.23412267863750458, "learning_rate": 4.552554009137301e-05, "loss": 0.447, "num_input_tokens_seen": 4788487165, "step": 1228, "train_runtime": 48899.3678, "train_tokens_per_second": 97925.339 }, { "epoch": 0.19538950715421305, "grad_norm": 0.24259287118911743, "learning_rate": 4.5518394274898266e-05, "loss": 0.4341, "num_input_tokens_seen": 4792238644, "step": 1229, "train_runtime": 48940.1414, "train_tokens_per_second": 97920.409 }, { "epoch": 0.19554848966613672, "grad_norm": 0.21633875370025635, "learning_rate": 4.551124331871075e-05, "loss": 0.4317, "num_input_tokens_seen": 4796222318, "step": 1230, "train_runtime": 48977.5131, "train_tokens_per_second": 97927.028 }, { "epoch": 0.19570747217806042, "grad_norm": 0.2260461151599884, "learning_rate": 4.5504087224601724e-05, "loss": 0.4343, "num_input_tokens_seen": 4800075996, "step": 1231, "train_runtime": 49017.7642, "train_tokens_per_second": 97925.233 }, { "epoch": 0.1958664546899841, "grad_norm": 0.20667219161987305, "learning_rate": 4.549692599436373e-05, "loss": 0.435, "num_input_tokens_seen": 4804007057, "step": 1232, "train_runtime": 49057.8039, "train_tokens_per_second": 97925.441 }, { "epoch": 0.1960254372019078, "grad_norm": 0.2508518099784851, "learning_rate": 4.548975962979062e-05, "loss": 0.4389, "num_input_tokens_seen": 4807933231, "step": 1233, "train_runtime": 49097.4012, "train_tokens_per_second": 97926.43 }, { "epoch": 0.19618441971383146, "grad_norm": 0.20699414610862732, "learning_rate": 4.5482588132677504e-05, "loss": 0.4533, "num_input_tokens_seen": 4811835078, "step": 1234, "train_runtime": 49135.0696, "train_tokens_per_second": 97930.767 }, { "epoch": 0.19634340222575516, "grad_norm": 0.2846757173538208, "learning_rate": 4.54754115048208e-05, "loss": 0.4425, "num_input_tokens_seen": 4815574079, "step": 1235, "train_runtime": 49173.9481, "train_tokens_per_second": 97929.377 }, { "epoch": 0.19650238473767886, "grad_norm": 0.23109981417655945, "learning_rate": 4.54682297480182e-05, "loss": 0.4314, "num_input_tokens_seen": 4819477202, "step": 1236, "train_runtime": 49212.5321, "train_tokens_per_second": 97931.909 }, { "epoch": 0.19666136724960254, "grad_norm": 0.2112904191017151, "learning_rate": 4.546104286406869e-05, "loss": 0.435, "num_input_tokens_seen": 4823401641, "step": 1237, "train_runtime": 49254.0085, "train_tokens_per_second": 97929.119 }, { "epoch": 0.19682034976152624, "grad_norm": 0.24054104089736938, "learning_rate": 4.545385085477252e-05, "loss": 0.4383, "num_input_tokens_seen": 4827198177, "step": 1238, "train_runtime": 49293.0605, "train_tokens_per_second": 97928.555 }, { "epoch": 0.1969793322734499, "grad_norm": 0.22128552198410034, "learning_rate": 4.5446653721931254e-05, "loss": 0.4272, "num_input_tokens_seen": 4831142246, "step": 1239, "train_runtime": 49331.5249, "train_tokens_per_second": 97932.149 }, { "epoch": 0.1971383147853736, "grad_norm": 0.27720877528190613, "learning_rate": 4.54394514673477e-05, "loss": 0.4291, "num_input_tokens_seen": 4835066103, "step": 1240, "train_runtime": 49371.3846, "train_tokens_per_second": 97932.56 }, { "epoch": 0.1972972972972973, "grad_norm": 0.22640660405158997, "learning_rate": 4.5432244092826e-05, "loss": 0.4356, "num_input_tokens_seen": 4839028142, "step": 1241, "train_runtime": 49405.1929, "train_tokens_per_second": 97945.739 }, { "epoch": 0.19745627980922098, "grad_norm": 0.2236480563879013, "learning_rate": 4.542503160017153e-05, "loss": 0.4282, "num_input_tokens_seen": 4842779385, "step": 1242, "train_runtime": 49443.325, "train_tokens_per_second": 97946.07 }, { "epoch": 0.19761526232114468, "grad_norm": 0.3733328580856323, "learning_rate": 4.5417813991190975e-05, "loss": 0.4415, "num_input_tokens_seen": 4846582863, "step": 1243, "train_runtime": 49481.4949, "train_tokens_per_second": 97947.382 }, { "epoch": 0.19777424483306835, "grad_norm": 0.20531563460826874, "learning_rate": 4.541059126769229e-05, "loss": 0.4551, "num_input_tokens_seen": 4850587436, "step": 1244, "train_runtime": 49522.6744, "train_tokens_per_second": 97946.799 }, { "epoch": 0.19793322734499205, "grad_norm": 0.24354514479637146, "learning_rate": 4.540336343148473e-05, "loss": 0.4433, "num_input_tokens_seen": 4854474401, "step": 1245, "train_runtime": 49563.5023, "train_tokens_per_second": 97944.539 }, { "epoch": 0.19809220985691575, "grad_norm": 0.28049710392951965, "learning_rate": 4.53961304843788e-05, "loss": 0.4352, "num_input_tokens_seen": 4858499553, "step": 1246, "train_runtime": 49600.3817, "train_tokens_per_second": 97952.866 }, { "epoch": 0.19825119236883942, "grad_norm": 0.21272853016853333, "learning_rate": 4.5388892428186315e-05, "loss": 0.4329, "num_input_tokens_seen": 4862289233, "step": 1247, "train_runtime": 49640.2124, "train_tokens_per_second": 97950.613 }, { "epoch": 0.19841017488076312, "grad_norm": 0.21705549955368042, "learning_rate": 4.538164926472035e-05, "loss": 0.4504, "num_input_tokens_seen": 4866155246, "step": 1248, "train_runtime": 49679.9964, "train_tokens_per_second": 97949.992 }, { "epoch": 0.1985691573926868, "grad_norm": 0.22802619636058807, "learning_rate": 4.537440099579527e-05, "loss": 0.4482, "num_input_tokens_seen": 4870062862, "step": 1249, "train_runtime": 49720.273, "train_tokens_per_second": 97949.238 }, { "epoch": 0.1987281399046105, "grad_norm": 0.27343907952308655, "learning_rate": 4.536714762322671e-05, "loss": 0.4441, "num_input_tokens_seen": 4873948875, "step": 1250, "train_runtime": 49758.8036, "train_tokens_per_second": 97951.488 }, { "epoch": 0.19888712241653417, "grad_norm": 0.20559482276439667, "learning_rate": 4.535988914883159e-05, "loss": 0.448, "num_input_tokens_seen": 4877846080, "step": 1251, "train_runtime": 49799.8448, "train_tokens_per_second": 97949.022 }, { "epoch": 0.19904610492845787, "grad_norm": 0.262753963470459, "learning_rate": 4.535262557442811e-05, "loss": 0.4283, "num_input_tokens_seen": 4881689714, "step": 1252, "train_runtime": 49838.4023, "train_tokens_per_second": 97950.365 }, { "epoch": 0.19920508744038157, "grad_norm": 0.2079746574163437, "learning_rate": 4.534535690183574e-05, "loss": 0.4458, "num_input_tokens_seen": 4885699969, "step": 1253, "train_runtime": 49879.0804, "train_tokens_per_second": 97950.883 }, { "epoch": 0.19936406995230524, "grad_norm": 0.2261313796043396, "learning_rate": 4.533808313287524e-05, "loss": 0.4345, "num_input_tokens_seen": 4889537277, "step": 1254, "train_runtime": 49917.4894, "train_tokens_per_second": 97952.388 }, { "epoch": 0.19952305246422894, "grad_norm": 0.23371100425720215, "learning_rate": 4.5330804269368635e-05, "loss": 0.4314, "num_input_tokens_seen": 4893452628, "step": 1255, "train_runtime": 49958.4835, "train_tokens_per_second": 97950.384 }, { "epoch": 0.1996820349761526, "grad_norm": 0.22543492913246155, "learning_rate": 4.532352031313922e-05, "loss": 0.4441, "num_input_tokens_seen": 4897369488, "step": 1256, "train_runtime": 49997.9853, "train_tokens_per_second": 97951.337 }, { "epoch": 0.1998410174880763, "grad_norm": 0.23490777611732483, "learning_rate": 4.5316231266011574e-05, "loss": 0.4477, "num_input_tokens_seen": 4901124378, "step": 1257, "train_runtime": 50036.9479, "train_tokens_per_second": 97950.107 }, { "epoch": 0.2, "grad_norm": 0.2957420349121094, "learning_rate": 4.5308937129811564e-05, "loss": 0.4488, "num_input_tokens_seen": 4905071841, "step": 1258, "train_runtime": 50076.8085, "train_tokens_per_second": 97950.967 }, { "epoch": 0.20015898251192368, "grad_norm": 0.23316903412342072, "learning_rate": 4.530163790636631e-05, "loss": 0.4296, "num_input_tokens_seen": 4908891998, "step": 1259, "train_runtime": 50116.111, "train_tokens_per_second": 97950.378 }, { "epoch": 0.20031796502384738, "grad_norm": 0.2617826759815216, "learning_rate": 4.5294333597504225e-05, "loss": 0.4351, "num_input_tokens_seen": 4912820959, "step": 1260, "train_runtime": 50155.1622, "train_tokens_per_second": 97952.449 }, { "epoch": 0.20047694753577106, "grad_norm": 0.3878009617328644, "learning_rate": 4.5287024205054975e-05, "loss": 0.4563, "num_input_tokens_seen": 4916691503, "step": 1261, "train_runtime": 50198.6501, "train_tokens_per_second": 97944.696 }, { "epoch": 0.20063593004769475, "grad_norm": 0.2627198100090027, "learning_rate": 4.5279709730849515e-05, "loss": 0.4294, "num_input_tokens_seen": 4920595316, "step": 1262, "train_runtime": 50237.6498, "train_tokens_per_second": 97946.368 }, { "epoch": 0.20079491255961845, "grad_norm": 0.22979162633419037, "learning_rate": 4.527239017672007e-05, "loss": 0.4492, "num_input_tokens_seen": 4924394810, "step": 1263, "train_runtime": 50277.5793, "train_tokens_per_second": 97944.151 }, { "epoch": 0.20095389507154213, "grad_norm": 0.253202885389328, "learning_rate": 4.526506554450015e-05, "loss": 0.4416, "num_input_tokens_seen": 4928329489, "step": 1264, "train_runtime": 50315.5748, "train_tokens_per_second": 97948.389 }, { "epoch": 0.20111287758346583, "grad_norm": 0.2652120292186737, "learning_rate": 4.5257735836024495e-05, "loss": 0.4411, "num_input_tokens_seen": 4932174850, "step": 1265, "train_runtime": 50354.4279, "train_tokens_per_second": 97949.179 }, { "epoch": 0.2012718600953895, "grad_norm": 0.21420302987098694, "learning_rate": 4.5250401053129165e-05, "loss": 0.4313, "num_input_tokens_seen": 4936129088, "step": 1266, "train_runtime": 50390.7146, "train_tokens_per_second": 97957.116 }, { "epoch": 0.2014308426073132, "grad_norm": 0.28678667545318604, "learning_rate": 4.524306119765148e-05, "loss": 0.4435, "num_input_tokens_seen": 4939942411, "step": 1267, "train_runtime": 50428.6134, "train_tokens_per_second": 97959.116 }, { "epoch": 0.20158982511923687, "grad_norm": 0.22219449281692505, "learning_rate": 4.5235716271429995e-05, "loss": 0.4334, "num_input_tokens_seen": 4943790647, "step": 1268, "train_runtime": 50467.611, "train_tokens_per_second": 97959.673 }, { "epoch": 0.20174880763116057, "grad_norm": 0.22166500985622406, "learning_rate": 4.522836627630459e-05, "loss": 0.4346, "num_input_tokens_seen": 4947666943, "step": 1269, "train_runtime": 50507.4226, "train_tokens_per_second": 97959.205 }, { "epoch": 0.20190779014308427, "grad_norm": 0.2465212643146515, "learning_rate": 4.522101121411636e-05, "loss": 0.4327, "num_input_tokens_seen": 4951464580, "step": 1270, "train_runtime": 50547.0707, "train_tokens_per_second": 97957.498 }, { "epoch": 0.20206677265500794, "grad_norm": 0.21529541909694672, "learning_rate": 4.5213651086707723e-05, "loss": 0.4251, "num_input_tokens_seen": 4955372802, "step": 1271, "train_runtime": 50585.4355, "train_tokens_per_second": 97960.465 }, { "epoch": 0.20222575516693164, "grad_norm": 0.21478493511676788, "learning_rate": 4.520628589592232e-05, "loss": 0.4337, "num_input_tokens_seen": 4959337446, "step": 1272, "train_runtime": 50626.9322, "train_tokens_per_second": 97958.482 }, { "epoch": 0.20238473767885531, "grad_norm": 0.24646180868148804, "learning_rate": 4.5198915643605076e-05, "loss": 0.4413, "num_input_tokens_seen": 4963311166, "step": 1273, "train_runtime": 50666.2475, "train_tokens_per_second": 97960.899 }, { "epoch": 0.20254372019077901, "grad_norm": 0.2188180387020111, "learning_rate": 4.51915403316022e-05, "loss": 0.4324, "num_input_tokens_seen": 4967111204, "step": 1274, "train_runtime": 50704.797, "train_tokens_per_second": 97961.367 }, { "epoch": 0.20270270270270271, "grad_norm": 0.46812060475349426, "learning_rate": 4.518415996176116e-05, "loss": 0.4347, "num_input_tokens_seen": 4970992873, "step": 1275, "train_runtime": 50745.371, "train_tokens_per_second": 97959.534 }, { "epoch": 0.20286168521462639, "grad_norm": 0.2612902820110321, "learning_rate": 4.5176774535930675e-05, "loss": 0.4332, "num_input_tokens_seen": 4974811586, "step": 1276, "train_runtime": 50786.5065, "train_tokens_per_second": 97955.381 }, { "epoch": 0.20302066772655009, "grad_norm": 0.21799756586551666, "learning_rate": 4.516938405596074e-05, "loss": 0.4418, "num_input_tokens_seen": 4978825417, "step": 1277, "train_runtime": 50827.3413, "train_tokens_per_second": 97955.653 }, { "epoch": 0.20317965023847376, "grad_norm": 0.23966792225837708, "learning_rate": 4.516198852370262e-05, "loss": 0.4353, "num_input_tokens_seen": 4982821061, "step": 1278, "train_runtime": 50866.1968, "train_tokens_per_second": 97959.379 }, { "epoch": 0.20333863275039746, "grad_norm": 0.2804495692253113, "learning_rate": 4.5154587941008855e-05, "loss": 0.4401, "num_input_tokens_seen": 4986597547, "step": 1279, "train_runtime": 50905.2885, "train_tokens_per_second": 97958.34 }, { "epoch": 0.20349761526232116, "grad_norm": 0.25040408968925476, "learning_rate": 4.514718230973323e-05, "loss": 0.4401, "num_input_tokens_seen": 4990619269, "step": 1280, "train_runtime": 50942.8669, "train_tokens_per_second": 97965.026 }, { "epoch": 0.20365659777424483, "grad_norm": 0.24109846353530884, "learning_rate": 4.51397716317308e-05, "loss": 0.4346, "num_input_tokens_seen": 4994515323, "step": 1281, "train_runtime": 50983.1764, "train_tokens_per_second": 97963.989 }, { "epoch": 0.20381558028616853, "grad_norm": 0.22659628093242645, "learning_rate": 4.5132355908857894e-05, "loss": 0.4397, "num_input_tokens_seen": 4998366900, "step": 1282, "train_runtime": 51021.1533, "train_tokens_per_second": 97966.56 }, { "epoch": 0.2039745627980922, "grad_norm": 0.2444046586751938, "learning_rate": 4.512493514297209e-05, "loss": 0.4458, "num_input_tokens_seen": 5002247244, "step": 1283, "train_runtime": 51060.7379, "train_tokens_per_second": 97966.607 }, { "epoch": 0.2041335453100159, "grad_norm": 0.27419713139533997, "learning_rate": 4.511750933593225e-05, "loss": 0.4556, "num_input_tokens_seen": 5006047341, "step": 1284, "train_runtime": 51098.6691, "train_tokens_per_second": 97968.253 }, { "epoch": 0.20429252782193957, "grad_norm": 0.24338899552822113, "learning_rate": 4.5110078489598474e-05, "loss": 0.4433, "num_input_tokens_seen": 5009950684, "step": 1285, "train_runtime": 51137.4695, "train_tokens_per_second": 97970.25 }, { "epoch": 0.20445151033386327, "grad_norm": 0.2507351338863373, "learning_rate": 4.5102642605832136e-05, "loss": 0.4401, "num_input_tokens_seen": 5013892957, "step": 1286, "train_runtime": 51176.7792, "train_tokens_per_second": 97972.03 }, { "epoch": 0.20461049284578697, "grad_norm": 0.27186018228530884, "learning_rate": 4.509520168649589e-05, "loss": 0.4362, "num_input_tokens_seen": 5017800006, "step": 1287, "train_runtime": 51219.0742, "train_tokens_per_second": 97967.409 }, { "epoch": 0.20476947535771065, "grad_norm": 0.29152989387512207, "learning_rate": 4.5087755733453604e-05, "loss": 0.4441, "num_input_tokens_seen": 5021736400, "step": 1288, "train_runtime": 51259.4513, "train_tokens_per_second": 97967.034 }, { "epoch": 0.20492845786963435, "grad_norm": 0.21358223259449005, "learning_rate": 4.508030474857047e-05, "loss": 0.4431, "num_input_tokens_seen": 5025594983, "step": 1289, "train_runtime": 51299.4822, "train_tokens_per_second": 97965.803 }, { "epoch": 0.20508744038155802, "grad_norm": 0.24911212921142578, "learning_rate": 4.5072848733712886e-05, "loss": 0.428, "num_input_tokens_seen": 5029409410, "step": 1290, "train_runtime": 51336.2572, "train_tokens_per_second": 97969.928 }, { "epoch": 0.20524642289348172, "grad_norm": 0.24587523937225342, "learning_rate": 4.506538769074854e-05, "loss": 0.4469, "num_input_tokens_seen": 5033417654, "step": 1291, "train_runtime": 51376.7708, "train_tokens_per_second": 97970.689 }, { "epoch": 0.20540540540540542, "grad_norm": 0.23284272849559784, "learning_rate": 4.5057921621546365e-05, "loss": 0.4339, "num_input_tokens_seen": 5037263813, "step": 1292, "train_runtime": 51415.9057, "train_tokens_per_second": 97970.924 }, { "epoch": 0.2055643879173291, "grad_norm": 0.21258696913719177, "learning_rate": 4.505045052797655e-05, "loss": 0.4314, "num_input_tokens_seen": 5041147650, "step": 1293, "train_runtime": 51453.976, "train_tokens_per_second": 97973.919 }, { "epoch": 0.2057233704292528, "grad_norm": 0.22698721289634705, "learning_rate": 4.504297441191057e-05, "loss": 0.4417, "num_input_tokens_seen": 5044999581, "step": 1294, "train_runtime": 51494.3356, "train_tokens_per_second": 97971.933 }, { "epoch": 0.20588235294117646, "grad_norm": 0.20540882647037506, "learning_rate": 4.503549327522113e-05, "loss": 0.4269, "num_input_tokens_seen": 5048912588, "step": 1295, "train_runtime": 51532.3555, "train_tokens_per_second": 97975.583 }, { "epoch": 0.20604133545310016, "grad_norm": 0.4733109772205353, "learning_rate": 4.50280071197822e-05, "loss": 0.4374, "num_input_tokens_seen": 5052794360, "step": 1296, "train_runtime": 51571.872, "train_tokens_per_second": 97975.78 }, { "epoch": 0.20620031796502386, "grad_norm": 0.24462610483169556, "learning_rate": 4.502051594746901e-05, "loss": 0.4433, "num_input_tokens_seen": 5056625073, "step": 1297, "train_runtime": 51609.9312, "train_tokens_per_second": 97977.753 }, { "epoch": 0.20635930047694753, "grad_norm": 0.20403490960597992, "learning_rate": 4.5013019760158046e-05, "loss": 0.4368, "num_input_tokens_seen": 5060530386, "step": 1298, "train_runtime": 51648.3367, "train_tokens_per_second": 97980.51 }, { "epoch": 0.20651828298887123, "grad_norm": 0.24571123719215393, "learning_rate": 4.500551855972706e-05, "loss": 0.4447, "num_input_tokens_seen": 5064492275, "step": 1299, "train_runtime": 51687.3173, "train_tokens_per_second": 97983.268 }, { "epoch": 0.2066772655007949, "grad_norm": 0.2207627147436142, "learning_rate": 4.499801234805504e-05, "loss": 0.4383, "num_input_tokens_seen": 5068352144, "step": 1300, "train_runtime": 51726.9264, "train_tokens_per_second": 97982.859 }, { "epoch": 0.2068362480127186, "grad_norm": 0.19690318405628204, "learning_rate": 4.499050112702223e-05, "loss": 0.4311, "num_input_tokens_seen": 5072312221, "step": 1301, "train_runtime": 51768.0383, "train_tokens_per_second": 97981.542 }, { "epoch": 0.20699523052464228, "grad_norm": 0.2374134212732315, "learning_rate": 4.498298489851016e-05, "loss": 0.4528, "num_input_tokens_seen": 5076163055, "step": 1302, "train_runtime": 51805.5385, "train_tokens_per_second": 97984.949 }, { "epoch": 0.20715421303656598, "grad_norm": 0.3518825173377991, "learning_rate": 4.497546366440157e-05, "loss": 0.4341, "num_input_tokens_seen": 5080019361, "step": 1303, "train_runtime": 51844.2391, "train_tokens_per_second": 97986.188 }, { "epoch": 0.20731319554848968, "grad_norm": 0.2915911078453064, "learning_rate": 4.4967937426580485e-05, "loss": 0.435, "num_input_tokens_seen": 5083971261, "step": 1304, "train_runtime": 51885.0879, "train_tokens_per_second": 97985.21 }, { "epoch": 0.20747217806041335, "grad_norm": 0.21880929172039032, "learning_rate": 4.496040618693218e-05, "loss": 0.4345, "num_input_tokens_seen": 5087788611, "step": 1305, "train_runtime": 51922.0631, "train_tokens_per_second": 97988.953 }, { "epoch": 0.20763116057233705, "grad_norm": 0.21505936980247498, "learning_rate": 4.495286994734317e-05, "loss": 0.4261, "num_input_tokens_seen": 5091789374, "step": 1306, "train_runtime": 51963.9026, "train_tokens_per_second": 97987.047 }, { "epoch": 0.20779014308426072, "grad_norm": 0.28391334414482117, "learning_rate": 4.494532870970124e-05, "loss": 0.4382, "num_input_tokens_seen": 5095645064, "step": 1307, "train_runtime": 52005.3531, "train_tokens_per_second": 97983.088 }, { "epoch": 0.20794912559618442, "grad_norm": 0.21746480464935303, "learning_rate": 4.49377824758954e-05, "loss": 0.4384, "num_input_tokens_seen": 5099596534, "step": 1308, "train_runtime": 52044.429, "train_tokens_per_second": 97985.445 }, { "epoch": 0.20810810810810812, "grad_norm": 0.43052542209625244, "learning_rate": 4.493023124781593e-05, "loss": 0.4275, "num_input_tokens_seen": 5103415672, "step": 1309, "train_runtime": 52084.5325, "train_tokens_per_second": 97983.325 }, { "epoch": 0.2082670906200318, "grad_norm": 0.2533348500728607, "learning_rate": 4.492267502735439e-05, "loss": 0.4477, "num_input_tokens_seen": 5107267259, "step": 1310, "train_runtime": 52122.1325, "train_tokens_per_second": 97986.537 }, { "epoch": 0.2084260731319555, "grad_norm": 0.24309642612934113, "learning_rate": 4.491511381640352e-05, "loss": 0.4337, "num_input_tokens_seen": 5111149607, "step": 1311, "train_runtime": 52163.5964, "train_tokens_per_second": 97983.075 }, { "epoch": 0.20858505564387916, "grad_norm": 0.1885644942522049, "learning_rate": 4.490754761685737e-05, "loss": 0.426, "num_input_tokens_seen": 5115103820, "step": 1312, "train_runtime": 52203.0001, "train_tokens_per_second": 97984.863 }, { "epoch": 0.20874403815580286, "grad_norm": 0.20127694308757782, "learning_rate": 4.489997643061121e-05, "loss": 0.4425, "num_input_tokens_seen": 5119025170, "step": 1313, "train_runtime": 52238.6578, "train_tokens_per_second": 97993.046 }, { "epoch": 0.20890302066772656, "grad_norm": 0.21465352177619934, "learning_rate": 4.489240025956158e-05, "loss": 0.433, "num_input_tokens_seen": 5122912895, "step": 1314, "train_runtime": 52277.9401, "train_tokens_per_second": 97993.779 }, { "epoch": 0.20906200317965024, "grad_norm": 0.24661307036876678, "learning_rate": 4.488481910560625e-05, "loss": 0.4371, "num_input_tokens_seen": 5126852086, "step": 1315, "train_runtime": 52317.5833, "train_tokens_per_second": 97994.819 }, { "epoch": 0.20922098569157394, "grad_norm": 0.2194853127002716, "learning_rate": 4.4877232970644245e-05, "loss": 0.4295, "num_input_tokens_seen": 5130761696, "step": 1316, "train_runtime": 52356.867, "train_tokens_per_second": 97995.965 }, { "epoch": 0.2093799682034976, "grad_norm": 0.28011223673820496, "learning_rate": 4.4869641856575844e-05, "loss": 0.4416, "num_input_tokens_seen": 5134622419, "step": 1317, "train_runtime": 52395.8637, "train_tokens_per_second": 97996.713 }, { "epoch": 0.2095389507154213, "grad_norm": 0.23234261572360992, "learning_rate": 4.4862045765302564e-05, "loss": 0.4314, "num_input_tokens_seen": 5138564713, "step": 1318, "train_runtime": 52436.9616, "train_tokens_per_second": 97995.089 }, { "epoch": 0.20969793322734498, "grad_norm": 0.21279466152191162, "learning_rate": 4.485444469872717e-05, "loss": 0.4258, "num_input_tokens_seen": 5142580662, "step": 1319, "train_runtime": 52475.7933, "train_tokens_per_second": 97999.103 }, { "epoch": 0.20985691573926868, "grad_norm": 0.266198068857193, "learning_rate": 4.4846838658753674e-05, "loss": 0.4473, "num_input_tokens_seen": 5146453166, "step": 1320, "train_runtime": 52515.3914, "train_tokens_per_second": 97998.949 }, { "epoch": 0.21001589825119238, "grad_norm": 0.2065744400024414, "learning_rate": 4.4839227647287355e-05, "loss": 0.4273, "num_input_tokens_seen": 5150239549, "step": 1321, "train_runtime": 52554.2561, "train_tokens_per_second": 97998.524 }, { "epoch": 0.21017488076311605, "grad_norm": 0.21758493781089783, "learning_rate": 4.483161166623468e-05, "loss": 0.4269, "num_input_tokens_seen": 5154153081, "step": 1322, "train_runtime": 52594.6405, "train_tokens_per_second": 97997.686 }, { "epoch": 0.21033386327503975, "grad_norm": 0.21088843047618866, "learning_rate": 4.482399071750343e-05, "loss": 0.4256, "num_input_tokens_seen": 5158150895, "step": 1323, "train_runtime": 52631.6068, "train_tokens_per_second": 98004.815 }, { "epoch": 0.21049284578696342, "grad_norm": 0.2581126391887665, "learning_rate": 4.481636480300259e-05, "loss": 0.4374, "num_input_tokens_seen": 5161957817, "step": 1324, "train_runtime": 52670.1426, "train_tokens_per_second": 98005.389 }, { "epoch": 0.21065182829888712, "grad_norm": 0.20890505611896515, "learning_rate": 4.4808733924642396e-05, "loss": 0.432, "num_input_tokens_seen": 5165721301, "step": 1325, "train_runtime": 52708.7483, "train_tokens_per_second": 98005.008 }, { "epoch": 0.21081081081081082, "grad_norm": 0.20371420681476593, "learning_rate": 4.480109808433432e-05, "loss": 0.4322, "num_input_tokens_seen": 5169743137, "step": 1326, "train_runtime": 52746.8258, "train_tokens_per_second": 98010.507 }, { "epoch": 0.2109697933227345, "grad_norm": 0.22952142357826233, "learning_rate": 4.4793457283991106e-05, "loss": 0.4362, "num_input_tokens_seen": 5173703015, "step": 1327, "train_runtime": 52785.242, "train_tokens_per_second": 98014.195 }, { "epoch": 0.2111287758346582, "grad_norm": 0.3362862467765808, "learning_rate": 4.478581152552671e-05, "loss": 0.4365, "num_input_tokens_seen": 5177527289, "step": 1328, "train_runtime": 52823.4906, "train_tokens_per_second": 98015.622 }, { "epoch": 0.21128775834658187, "grad_norm": 0.2148774117231369, "learning_rate": 4.4778160810856334e-05, "loss": 0.4355, "num_input_tokens_seen": 5181537407, "step": 1329, "train_runtime": 52862.608, "train_tokens_per_second": 98018.951 }, { "epoch": 0.21144674085850557, "grad_norm": 0.21950958669185638, "learning_rate": 4.477050514189644e-05, "loss": 0.4646, "num_input_tokens_seen": 5185542246, "step": 1330, "train_runtime": 52903.1192, "train_tokens_per_second": 98019.594 }, { "epoch": 0.21160572337042927, "grad_norm": 0.2122083306312561, "learning_rate": 4.476284452056471e-05, "loss": 0.4339, "num_input_tokens_seen": 5189311878, "step": 1331, "train_runtime": 52944.0569, "train_tokens_per_second": 98015.003 }, { "epoch": 0.21176470588235294, "grad_norm": 0.2546745240688324, "learning_rate": 4.475517894878008e-05, "loss": 0.4384, "num_input_tokens_seen": 5193132427, "step": 1332, "train_runtime": 52983.6788, "train_tokens_per_second": 98013.814 }, { "epoch": 0.21192368839427664, "grad_norm": 0.2280600368976593, "learning_rate": 4.474750842846272e-05, "loss": 0.4386, "num_input_tokens_seen": 5197053730, "step": 1333, "train_runtime": 53024.6278, "train_tokens_per_second": 98012.074 }, { "epoch": 0.2120826709062003, "grad_norm": 0.20859214663505554, "learning_rate": 4.4739832961534044e-05, "loss": 0.4206, "num_input_tokens_seen": 5200994522, "step": 1334, "train_runtime": 53064.6454, "train_tokens_per_second": 98012.424 }, { "epoch": 0.212241653418124, "grad_norm": 0.21615146100521088, "learning_rate": 4.4732152549916694e-05, "loss": 0.4301, "num_input_tokens_seen": 5204835773, "step": 1335, "train_runtime": 53101.7963, "train_tokens_per_second": 98016.19 }, { "epoch": 0.21240063593004768, "grad_norm": 0.21007853746414185, "learning_rate": 4.472446719553457e-05, "loss": 0.4429, "num_input_tokens_seen": 5208763240, "step": 1336, "train_runtime": 53141.3224, "train_tokens_per_second": 98017.193 }, { "epoch": 0.21255961844197138, "grad_norm": 0.20738866925239563, "learning_rate": 4.471677690031279e-05, "loss": 0.433, "num_input_tokens_seen": 5212730320, "step": 1337, "train_runtime": 53179.3955, "train_tokens_per_second": 98021.617 }, { "epoch": 0.21271860095389508, "grad_norm": 0.3192014992237091, "learning_rate": 4.470908166617772e-05, "loss": 0.4386, "num_input_tokens_seen": 5216677918, "step": 1338, "train_runtime": 53214.1715, "train_tokens_per_second": 98031.742 }, { "epoch": 0.21287758346581875, "grad_norm": 0.2224998027086258, "learning_rate": 4.470138149505697e-05, "loss": 0.4436, "num_input_tokens_seen": 5220560319, "step": 1339, "train_runtime": 53253.9498, "train_tokens_per_second": 98031.42 }, { "epoch": 0.21303656597774245, "grad_norm": 0.22956904768943787, "learning_rate": 4.469367638887937e-05, "loss": 0.4379, "num_input_tokens_seen": 5224323406, "step": 1340, "train_runtime": 53294.715, "train_tokens_per_second": 98027.045 }, { "epoch": 0.21319554848966613, "grad_norm": 0.21216115355491638, "learning_rate": 4.4685966349574996e-05, "loss": 0.4376, "num_input_tokens_seen": 5228311592, "step": 1341, "train_runtime": 53332.8248, "train_tokens_per_second": 98031.777 }, { "epoch": 0.21335453100158983, "grad_norm": 0.2447480708360672, "learning_rate": 4.4678251379075164e-05, "loss": 0.4318, "num_input_tokens_seen": 5232296940, "step": 1342, "train_runtime": 53368.965, "train_tokens_per_second": 98040.068 }, { "epoch": 0.21351351351351353, "grad_norm": 0.2255619317293167, "learning_rate": 4.467053147931241e-05, "loss": 0.4215, "num_input_tokens_seen": 5236108850, "step": 1343, "train_runtime": 53408.3974, "train_tokens_per_second": 98039.056 }, { "epoch": 0.2136724960254372, "grad_norm": 0.19545742869377136, "learning_rate": 4.466280665222052e-05, "loss": 0.438, "num_input_tokens_seen": 5240161971, "step": 1344, "train_runtime": 53446.7871, "train_tokens_per_second": 98044.471 }, { "epoch": 0.2138314785373609, "grad_norm": 0.21622243523597717, "learning_rate": 4.4655076899734504e-05, "loss": 0.44, "num_input_tokens_seen": 5244145822, "step": 1345, "train_runtime": 53483.757, "train_tokens_per_second": 98051.186 }, { "epoch": 0.21399046104928457, "grad_norm": 0.21636411547660828, "learning_rate": 4.464734222379062e-05, "loss": 0.4369, "num_input_tokens_seen": 5248025654, "step": 1346, "train_runtime": 53523.6729, "train_tokens_per_second": 98050.552 }, { "epoch": 0.21414944356120827, "grad_norm": 0.2512688636779785, "learning_rate": 4.463960262632635e-05, "loss": 0.4374, "num_input_tokens_seen": 5251991625, "step": 1347, "train_runtime": 53559.4504, "train_tokens_per_second": 98059.102 }, { "epoch": 0.21430842607313197, "grad_norm": 0.25174015760421753, "learning_rate": 4.463185810928039e-05, "loss": 0.4416, "num_input_tokens_seen": 5255931005, "step": 1348, "train_runtime": 53598.4832, "train_tokens_per_second": 98061.189 }, { "epoch": 0.21446740858505564, "grad_norm": 0.22635775804519653, "learning_rate": 4.46241086745927e-05, "loss": 0.4317, "num_input_tokens_seen": 5259859178, "step": 1349, "train_runtime": 53636.6225, "train_tokens_per_second": 98064.698 }, { "epoch": 0.21462639109697934, "grad_norm": 0.24070630967617035, "learning_rate": 4.4616354324204465e-05, "loss": 0.4276, "num_input_tokens_seen": 5263690919, "step": 1350, "train_runtime": 53674.1558, "train_tokens_per_second": 98067.512 }, { "epoch": 0.214785373608903, "grad_norm": 0.24461233615875244, "learning_rate": 4.460859506005809e-05, "loss": 0.4399, "num_input_tokens_seen": 5267645457, "step": 1351, "train_runtime": 53714.5268, "train_tokens_per_second": 98067.427 }, { "epoch": 0.2149443561208267, "grad_norm": 0.22153033316135406, "learning_rate": 4.460083088409721e-05, "loss": 0.4387, "num_input_tokens_seen": 5271622916, "step": 1352, "train_runtime": 53752.2371, "train_tokens_per_second": 98072.624 }, { "epoch": 0.21510333863275038, "grad_norm": 0.22261537611484528, "learning_rate": 4.45930617982667e-05, "loss": 0.438, "num_input_tokens_seen": 5275432391, "step": 1353, "train_runtime": 53789.4107, "train_tokens_per_second": 98075.668 }, { "epoch": 0.21526232114467408, "grad_norm": 0.2862461507320404, "learning_rate": 4.458528780451265e-05, "loss": 0.4413, "num_input_tokens_seen": 5279358774, "step": 1354, "train_runtime": 53825.6471, "train_tokens_per_second": 98082.588 }, { "epoch": 0.21542130365659778, "grad_norm": 0.2980280816555023, "learning_rate": 4.4577508904782413e-05, "loss": 0.4295, "num_input_tokens_seen": 5283280636, "step": 1355, "train_runtime": 53863.8356, "train_tokens_per_second": 98085.86 }, { "epoch": 0.21558028616852146, "grad_norm": 0.23114652931690216, "learning_rate": 4.456972510102454e-05, "loss": 0.4289, "num_input_tokens_seen": 5287128594, "step": 1356, "train_runtime": 53902.423, "train_tokens_per_second": 98087.03 }, { "epoch": 0.21573926868044516, "grad_norm": 0.21272578835487366, "learning_rate": 4.456193639518881e-05, "loss": 0.4308, "num_input_tokens_seen": 5291036000, "step": 1357, "train_runtime": 53941.7032, "train_tokens_per_second": 98088.041 }, { "epoch": 0.21589825119236883, "grad_norm": 0.22199049592018127, "learning_rate": 4.4554142789226244e-05, "loss": 0.4332, "num_input_tokens_seen": 5294917378, "step": 1358, "train_runtime": 53979.0413, "train_tokens_per_second": 98092.097 }, { "epoch": 0.21605723370429253, "grad_norm": 0.22954526543617249, "learning_rate": 4.4546344285089095e-05, "loss": 0.442, "num_input_tokens_seen": 5298706462, "step": 1359, "train_runtime": 54016.8943, "train_tokens_per_second": 98093.504 }, { "epoch": 0.21621621621621623, "grad_norm": 0.2556326687335968, "learning_rate": 4.453854088473081e-05, "loss": 0.4361, "num_input_tokens_seen": 5302699806, "step": 1360, "train_runtime": 54055.9628, "train_tokens_per_second": 98096.483 }, { "epoch": 0.2163751987281399, "grad_norm": 0.20351718366146088, "learning_rate": 4.453073259010611e-05, "loss": 0.4283, "num_input_tokens_seen": 5306454085, "step": 1361, "train_runtime": 54092.2335, "train_tokens_per_second": 98100.111 }, { "epoch": 0.2165341812400636, "grad_norm": 0.22954382002353668, "learning_rate": 4.45229194031709e-05, "loss": 0.4303, "num_input_tokens_seen": 5310407145, "step": 1362, "train_runtime": 54131.2167, "train_tokens_per_second": 98102.49 }, { "epoch": 0.21669316375198727, "grad_norm": 0.24967646598815918, "learning_rate": 4.451510132588233e-05, "loss": 0.4348, "num_input_tokens_seen": 5314342907, "step": 1363, "train_runtime": 54170.172, "train_tokens_per_second": 98104.597 }, { "epoch": 0.21685214626391097, "grad_norm": 0.256633460521698, "learning_rate": 4.4507278360198795e-05, "loss": 0.4429, "num_input_tokens_seen": 5318248938, "step": 1364, "train_runtime": 54207.5597, "train_tokens_per_second": 98108.99 }, { "epoch": 0.21701112877583467, "grad_norm": 0.22023847699165344, "learning_rate": 4.4499450508079865e-05, "loss": 0.4374, "num_input_tokens_seen": 5322296050, "step": 1365, "train_runtime": 54249.4691, "train_tokens_per_second": 98107.8 }, { "epoch": 0.21717011128775834, "grad_norm": 0.23269496858119965, "learning_rate": 4.449161777148638e-05, "loss": 0.4363, "num_input_tokens_seen": 5326251424, "step": 1366, "train_runtime": 54290.7734, "train_tokens_per_second": 98106.015 }, { "epoch": 0.21732909379968204, "grad_norm": 0.245035320520401, "learning_rate": 4.4483780152380375e-05, "loss": 0.4297, "num_input_tokens_seen": 5330161848, "step": 1367, "train_runtime": 54330.5123, "train_tokens_per_second": 98106.232 }, { "epoch": 0.21748807631160572, "grad_norm": 0.2600143551826477, "learning_rate": 4.447593765272512e-05, "loss": 0.4428, "num_input_tokens_seen": 5334026858, "step": 1368, "train_runtime": 54369.966, "train_tokens_per_second": 98106.128 }, { "epoch": 0.21764705882352942, "grad_norm": 0.2660319209098816, "learning_rate": 4.44680902744851e-05, "loss": 0.4323, "num_input_tokens_seen": 5337999305, "step": 1369, "train_runtime": 54409.033, "train_tokens_per_second": 98108.697 }, { "epoch": 0.2178060413354531, "grad_norm": 0.2222912609577179, "learning_rate": 4.446023801962605e-05, "loss": 0.438, "num_input_tokens_seen": 5341837159, "step": 1370, "train_runtime": 54445.0193, "train_tokens_per_second": 98114.34 }, { "epoch": 0.2179650238473768, "grad_norm": 0.22348962724208832, "learning_rate": 4.445238089011487e-05, "loss": 0.4283, "num_input_tokens_seen": 5345714928, "step": 1371, "train_runtime": 54483.8835, "train_tokens_per_second": 98115.527 }, { "epoch": 0.2181240063593005, "grad_norm": 0.20934675633907318, "learning_rate": 4.444451888791975e-05, "loss": 0.4284, "num_input_tokens_seen": 5349503012, "step": 1372, "train_runtime": 54521.913, "train_tokens_per_second": 98116.569 }, { "epoch": 0.21828298887122416, "grad_norm": 0.21143370866775513, "learning_rate": 4.4436652015010043e-05, "loss": 0.4482, "num_input_tokens_seen": 5353412279, "step": 1373, "train_runtime": 54561.9113, "train_tokens_per_second": 98116.289 }, { "epoch": 0.21844197138314786, "grad_norm": 0.23742540180683136, "learning_rate": 4.442878027335636e-05, "loss": 0.4379, "num_input_tokens_seen": 5357343114, "step": 1374, "train_runtime": 54601.767, "train_tokens_per_second": 98116.662 }, { "epoch": 0.21860095389507153, "grad_norm": 0.23539206385612488, "learning_rate": 4.44209036649305e-05, "loss": 0.4384, "num_input_tokens_seen": 5361194778, "step": 1375, "train_runtime": 54640.0164, "train_tokens_per_second": 98118.47 }, { "epoch": 0.21875993640699523, "grad_norm": 0.20842325687408447, "learning_rate": 4.4413022191705514e-05, "loss": 0.4421, "num_input_tokens_seen": 5365057484, "step": 1376, "train_runtime": 54680.6383, "train_tokens_per_second": 98116.219 }, { "epoch": 0.21891891891891893, "grad_norm": 0.26590126752853394, "learning_rate": 4.4405135855655636e-05, "loss": 0.4339, "num_input_tokens_seen": 5368964324, "step": 1377, "train_runtime": 54721.9836, "train_tokens_per_second": 98113.482 }, { "epoch": 0.2190779014308426, "grad_norm": 0.2044706642627716, "learning_rate": 4.4397244658756364e-05, "loss": 0.442, "num_input_tokens_seen": 5372900431, "step": 1378, "train_runtime": 54760.0635, "train_tokens_per_second": 98117.133 }, { "epoch": 0.2192368839427663, "grad_norm": 0.23625314235687256, "learning_rate": 4.438934860298437e-05, "loss": 0.4339, "num_input_tokens_seen": 5376900331, "step": 1379, "train_runtime": 54798.2284, "train_tokens_per_second": 98121.791 }, { "epoch": 0.21939586645468998, "grad_norm": 0.26785412430763245, "learning_rate": 4.438144769031756e-05, "loss": 0.4337, "num_input_tokens_seen": 5380792020, "step": 1380, "train_runtime": 54838.6655, "train_tokens_per_second": 98120.404 }, { "epoch": 0.21955484896661367, "grad_norm": 0.2308882623910904, "learning_rate": 4.437354192273505e-05, "loss": 0.4429, "num_input_tokens_seen": 5384772393, "step": 1381, "train_runtime": 54877.4986, "train_tokens_per_second": 98123.503 }, { "epoch": 0.21971383147853737, "grad_norm": 0.2027357965707779, "learning_rate": 4.43656313022172e-05, "loss": 0.4418, "num_input_tokens_seen": 5388564133, "step": 1382, "train_runtime": 54916.1936, "train_tokens_per_second": 98123.409 }, { "epoch": 0.21987281399046105, "grad_norm": 0.21956674754619598, "learning_rate": 4.435771583074555e-05, "loss": 0.442, "num_input_tokens_seen": 5392397917, "step": 1383, "train_runtime": 54955.4804, "train_tokens_per_second": 98123.024 }, { "epoch": 0.22003179650238475, "grad_norm": 0.24377842247486115, "learning_rate": 4.434979551030288e-05, "loss": 0.4318, "num_input_tokens_seen": 5396293629, "step": 1384, "train_runtime": 54993.9864, "train_tokens_per_second": 98125.158 }, { "epoch": 0.22019077901430842, "grad_norm": 0.2228633165359497, "learning_rate": 4.434187034287316e-05, "loss": 0.4432, "num_input_tokens_seen": 5400254187, "step": 1385, "train_runtime": 55032.5638, "train_tokens_per_second": 98128.341 }, { "epoch": 0.22034976152623212, "grad_norm": 0.2638510763645172, "learning_rate": 4.4333940330441595e-05, "loss": 0.4268, "num_input_tokens_seen": 5404233054, "step": 1386, "train_runtime": 55072.6413, "train_tokens_per_second": 98129.179 }, { "epoch": 0.2205087440381558, "grad_norm": 0.3003721237182617, "learning_rate": 4.4326005474994605e-05, "loss": 0.4304, "num_input_tokens_seen": 5408055005, "step": 1387, "train_runtime": 55113.8321, "train_tokens_per_second": 98125.186 }, { "epoch": 0.2206677265500795, "grad_norm": 0.212246835231781, "learning_rate": 4.43180657785198e-05, "loss": 0.423, "num_input_tokens_seen": 5412001517, "step": 1388, "train_runtime": 55151.1715, "train_tokens_per_second": 98130.309 }, { "epoch": 0.2208267090620032, "grad_norm": 0.254666805267334, "learning_rate": 4.431012124300603e-05, "loss": 0.4379, "num_input_tokens_seen": 5415903738, "step": 1389, "train_runtime": 55189.8226, "train_tokens_per_second": 98132.291 }, { "epoch": 0.22098569157392686, "grad_norm": 0.2327001392841339, "learning_rate": 4.430217187044334e-05, "loss": 0.4468, "num_input_tokens_seen": 5419820011, "step": 1390, "train_runtime": 55228.5976, "train_tokens_per_second": 98134.305 }, { "epoch": 0.22114467408585056, "grad_norm": 0.24340911209583282, "learning_rate": 4.429421766282299e-05, "loss": 0.4327, "num_input_tokens_seen": 5423700786, "step": 1391, "train_runtime": 55269.5178, "train_tokens_per_second": 98131.864 }, { "epoch": 0.22130365659777423, "grad_norm": 0.22883903980255127, "learning_rate": 4.4286258622137455e-05, "loss": 0.4282, "num_input_tokens_seen": 5427685120, "step": 1392, "train_runtime": 55309.3231, "train_tokens_per_second": 98133.277 }, { "epoch": 0.22146263910969793, "grad_norm": 0.2820753753185272, "learning_rate": 4.427829475038042e-05, "loss": 0.4402, "num_input_tokens_seen": 5431562237, "step": 1393, "train_runtime": 55350.4229, "train_tokens_per_second": 98130.456 }, { "epoch": 0.22162162162162163, "grad_norm": 0.20158877968788147, "learning_rate": 4.427032604954678e-05, "loss": 0.4233, "num_input_tokens_seen": 5435554140, "step": 1394, "train_runtime": 55390.1366, "train_tokens_per_second": 98132.167 }, { "epoch": 0.2217806041335453, "grad_norm": 0.2398872673511505, "learning_rate": 4.426235252163263e-05, "loss": 0.4305, "num_input_tokens_seen": 5439369543, "step": 1395, "train_runtime": 55429.6492, "train_tokens_per_second": 98131.048 }, { "epoch": 0.221939586645469, "grad_norm": 0.320297509431839, "learning_rate": 4.425437416863529e-05, "loss": 0.4453, "num_input_tokens_seen": 5443334452, "step": 1396, "train_runtime": 55470.0083, "train_tokens_per_second": 98131.127 }, { "epoch": 0.22209856915739268, "grad_norm": 0.24272720515727997, "learning_rate": 4.424639099255328e-05, "loss": 0.4581, "num_input_tokens_seen": 5447217847, "step": 1397, "train_runtime": 55509.8663, "train_tokens_per_second": 98130.625 }, { "epoch": 0.22225755166931638, "grad_norm": 0.2603910267353058, "learning_rate": 4.423840299538633e-05, "loss": 0.4352, "num_input_tokens_seen": 5451163667, "step": 1398, "train_runtime": 55546.8849, "train_tokens_per_second": 98136.262 }, { "epoch": 0.22241653418124005, "grad_norm": 0.25523796677589417, "learning_rate": 4.4230410179135376e-05, "loss": 0.4284, "num_input_tokens_seen": 5455083462, "step": 1399, "train_runtime": 55587.569, "train_tokens_per_second": 98134.953 }, { "epoch": 0.22257551669316375, "grad_norm": 0.3601329028606415, "learning_rate": 4.422241254580256e-05, "loss": 0.4272, "num_input_tokens_seen": 5458935125, "step": 1400, "train_runtime": 55625.7144, "train_tokens_per_second": 98136.899 }, { "epoch": 0.22273449920508745, "grad_norm": 0.2539448142051697, "learning_rate": 4.421441009739122e-05, "loss": 0.4464, "num_input_tokens_seen": 5462848672, "step": 1401, "train_runtime": 55786.7353, "train_tokens_per_second": 97923.792 }, { "epoch": 0.22289348171701112, "grad_norm": 0.27288633584976196, "learning_rate": 4.4206402835905946e-05, "loss": 0.4344, "num_input_tokens_seen": 5466717301, "step": 1402, "train_runtime": 55827.8317, "train_tokens_per_second": 97921.003 }, { "epoch": 0.22305246422893482, "grad_norm": 0.21475060284137726, "learning_rate": 4.419839076335247e-05, "loss": 0.4476, "num_input_tokens_seen": 5470720730, "step": 1403, "train_runtime": 55867.303, "train_tokens_per_second": 97923.48 }, { "epoch": 0.2232114467408585, "grad_norm": 0.24908456206321716, "learning_rate": 4.419037388173777e-05, "loss": 0.4338, "num_input_tokens_seen": 5474436251, "step": 1404, "train_runtime": 55905.9577, "train_tokens_per_second": 97922.234 }, { "epoch": 0.2233704292527822, "grad_norm": 0.21683476865291595, "learning_rate": 4.418235219307003e-05, "loss": 0.4493, "num_input_tokens_seen": 5478395813, "step": 1405, "train_runtime": 55945.0664, "train_tokens_per_second": 97924.557 }, { "epoch": 0.2235294117647059, "grad_norm": 0.22326722741127014, "learning_rate": 4.4174325699358604e-05, "loss": 0.4214, "num_input_tokens_seen": 5482222822, "step": 1406, "train_runtime": 55983.9659, "train_tokens_per_second": 97924.874 }, { "epoch": 0.22368839427662957, "grad_norm": 0.2489425390958786, "learning_rate": 4.416629440261408e-05, "loss": 0.4214, "num_input_tokens_seen": 5486224480, "step": 1407, "train_runtime": 56022.3704, "train_tokens_per_second": 97929.174 }, { "epoch": 0.22384737678855327, "grad_norm": 0.27902600169181824, "learning_rate": 4.4158258304848254e-05, "loss": 0.4442, "num_input_tokens_seen": 5490220475, "step": 1408, "train_runtime": 56061.3017, "train_tokens_per_second": 97932.447 }, { "epoch": 0.22400635930047694, "grad_norm": 0.2871807813644409, "learning_rate": 4.4150217408074104e-05, "loss": 0.4271, "num_input_tokens_seen": 5494070384, "step": 1409, "train_runtime": 56101.8375, "train_tokens_per_second": 97930.311 }, { "epoch": 0.22416534181240064, "grad_norm": 0.20716838538646698, "learning_rate": 4.4142171714305805e-05, "loss": 0.4385, "num_input_tokens_seen": 5498008741, "step": 1410, "train_runtime": 56141.1158, "train_tokens_per_second": 97931.946 }, { "epoch": 0.22432432432432434, "grad_norm": 0.21017414331436157, "learning_rate": 4.4134121225558754e-05, "loss": 0.4188, "num_input_tokens_seen": 5501771774, "step": 1411, "train_runtime": 56180.7808, "train_tokens_per_second": 97929.785 }, { "epoch": 0.224483306836248, "grad_norm": 0.2368234395980835, "learning_rate": 4.4126065943849554e-05, "loss": 0.4305, "num_input_tokens_seen": 5505663120, "step": 1412, "train_runtime": 56218.1549, "train_tokens_per_second": 97933.899 }, { "epoch": 0.2246422893481717, "grad_norm": 0.2206428498029709, "learning_rate": 4.4118005871195986e-05, "loss": 0.427, "num_input_tokens_seen": 5509512259, "step": 1413, "train_runtime": 56258.5395, "train_tokens_per_second": 97932.017 }, { "epoch": 0.22480127186009538, "grad_norm": 0.21439185738563538, "learning_rate": 4.410994100961704e-05, "loss": 0.4338, "num_input_tokens_seen": 5513458239, "step": 1414, "train_runtime": 56299.6066, "train_tokens_per_second": 97930.671 }, { "epoch": 0.22496025437201908, "grad_norm": 0.4388218820095062, "learning_rate": 4.410187136113291e-05, "loss": 0.4424, "num_input_tokens_seen": 5517350233, "step": 1415, "train_runtime": 56338.2585, "train_tokens_per_second": 97932.566 }, { "epoch": 0.22511923688394275, "grad_norm": 0.20139579474925995, "learning_rate": 4.409379692776498e-05, "loss": 0.4379, "num_input_tokens_seen": 5521220969, "step": 1416, "train_runtime": 56377.2054, "train_tokens_per_second": 97933.57 }, { "epoch": 0.22527821939586645, "grad_norm": 0.20977622270584106, "learning_rate": 4.408571771153585e-05, "loss": 0.4424, "num_input_tokens_seen": 5525166940, "step": 1417, "train_runtime": 56415.8089, "train_tokens_per_second": 97936.501 }, { "epoch": 0.22543720190779015, "grad_norm": 0.18713879585266113, "learning_rate": 4.407763371446929e-05, "loss": 0.4329, "num_input_tokens_seen": 5528933203, "step": 1418, "train_runtime": 56455.4658, "train_tokens_per_second": 97934.418 }, { "epoch": 0.22559618441971382, "grad_norm": 0.22005097568035126, "learning_rate": 4.40695449385903e-05, "loss": 0.4395, "num_input_tokens_seen": 5532826292, "step": 1419, "train_runtime": 56496.1959, "train_tokens_per_second": 97932.723 }, { "epoch": 0.22575516693163752, "grad_norm": 0.20371119678020477, "learning_rate": 4.4061451385925054e-05, "loss": 0.438, "num_input_tokens_seen": 5536712523, "step": 1420, "train_runtime": 56532.1456, "train_tokens_per_second": 97939.19 }, { "epoch": 0.2259141494435612, "grad_norm": 0.27110257744789124, "learning_rate": 4.405335305850093e-05, "loss": 0.4341, "num_input_tokens_seen": 5540541029, "step": 1421, "train_runtime": 56573.0746, "train_tokens_per_second": 97936.007 }, { "epoch": 0.2260731319554849, "grad_norm": 0.20608310401439667, "learning_rate": 4.4045249958346504e-05, "loss": 0.4235, "num_input_tokens_seen": 5544547803, "step": 1422, "train_runtime": 56611.5925, "train_tokens_per_second": 97940.149 }, { "epoch": 0.2262321144674086, "grad_norm": 0.19951102137565613, "learning_rate": 4.4037142087491545e-05, "loss": 0.4406, "num_input_tokens_seen": 5548534756, "step": 1423, "train_runtime": 56651.83, "train_tokens_per_second": 97940.962 }, { "epoch": 0.22639109697933227, "grad_norm": 0.2168378084897995, "learning_rate": 4.402902944796701e-05, "loss": 0.4389, "num_input_tokens_seen": 5552310934, "step": 1424, "train_runtime": 56691.4975, "train_tokens_per_second": 97939.042 }, { "epoch": 0.22655007949125597, "grad_norm": 0.20407959818840027, "learning_rate": 4.402091204180507e-05, "loss": 0.431, "num_input_tokens_seen": 5556216849, "step": 1425, "train_runtime": 56732.8394, "train_tokens_per_second": 97936.52 }, { "epoch": 0.22670906200317964, "grad_norm": 0.21090511977672577, "learning_rate": 4.401278987103907e-05, "loss": 0.4355, "num_input_tokens_seen": 5560179262, "step": 1426, "train_runtime": 56771.8107, "train_tokens_per_second": 97939.086 }, { "epoch": 0.22686804451510334, "grad_norm": 0.2018243819475174, "learning_rate": 4.400466293770356e-05, "loss": 0.4396, "num_input_tokens_seen": 5564013854, "step": 1427, "train_runtime": 56812.2672, "train_tokens_per_second": 97936.839 }, { "epoch": 0.22702702702702704, "grad_norm": 0.3148197531700134, "learning_rate": 4.399653124383428e-05, "loss": 0.4333, "num_input_tokens_seen": 5567878579, "step": 1428, "train_runtime": 56851.4342, "train_tokens_per_second": 97937.346 }, { "epoch": 0.2271860095389507, "grad_norm": 0.2527037560939789, "learning_rate": 4.398839479146816e-05, "loss": 0.4411, "num_input_tokens_seen": 5571774192, "step": 1429, "train_runtime": 56892.3342, "train_tokens_per_second": 97935.412 }, { "epoch": 0.2273449920508744, "grad_norm": 0.20751067996025085, "learning_rate": 4.3980253582643317e-05, "loss": 0.4357, "num_input_tokens_seen": 5575790688, "step": 1430, "train_runtime": 56932.4322, "train_tokens_per_second": 97936.984 }, { "epoch": 0.22750397456279808, "grad_norm": 0.21635253727436066, "learning_rate": 4.3972107619399075e-05, "loss": 0.4344, "num_input_tokens_seen": 5579617670, "step": 1431, "train_runtime": 56971.1492, "train_tokens_per_second": 97937.601 }, { "epoch": 0.22766295707472178, "grad_norm": 0.27064380049705505, "learning_rate": 4.3963956903775936e-05, "loss": 0.4356, "num_input_tokens_seen": 5583530609, "step": 1432, "train_runtime": 57012.5587, "train_tokens_per_second": 97935.1 }, { "epoch": 0.22782193958664546, "grad_norm": 0.2885216474533081, "learning_rate": 4.39558014378156e-05, "loss": 0.4383, "num_input_tokens_seen": 5587365892, "step": 1433, "train_runtime": 57052.7963, "train_tokens_per_second": 97933.252 }, { "epoch": 0.22798092209856916, "grad_norm": 0.21089527010917664, "learning_rate": 4.3947641223560954e-05, "loss": 0.4464, "num_input_tokens_seen": 5591359194, "step": 1434, "train_runtime": 57091.6825, "train_tokens_per_second": 97936.494 }, { "epoch": 0.22813990461049286, "grad_norm": 0.23631319403648376, "learning_rate": 4.393947626305607e-05, "loss": 0.4241, "num_input_tokens_seen": 5595130144, "step": 1435, "train_runtime": 57133.6405, "train_tokens_per_second": 97930.573 }, { "epoch": 0.22829888712241653, "grad_norm": 0.295696496963501, "learning_rate": 4.3931306558346216e-05, "loss": 0.4215, "num_input_tokens_seen": 5599120102, "step": 1436, "train_runtime": 57173.9421, "train_tokens_per_second": 97931.328 }, { "epoch": 0.22845786963434023, "grad_norm": 0.20737646520137787, "learning_rate": 4.392313211147785e-05, "loss": 0.433, "num_input_tokens_seen": 5603012614, "step": 1437, "train_runtime": 57212.7314, "train_tokens_per_second": 97932.968 }, { "epoch": 0.2286168521462639, "grad_norm": 0.19555318355560303, "learning_rate": 4.39149529244986e-05, "loss": 0.442, "num_input_tokens_seen": 5606899537, "step": 1438, "train_runtime": 57249.3588, "train_tokens_per_second": 97938.207 }, { "epoch": 0.2287758346581876, "grad_norm": 0.2257331758737564, "learning_rate": 4.39067689994573e-05, "loss": 0.4411, "num_input_tokens_seen": 5610727827, "step": 1439, "train_runtime": 57290.5463, "train_tokens_per_second": 97934.619 }, { "epoch": 0.2289348171701113, "grad_norm": 0.20606057345867157, "learning_rate": 4.389858033840398e-05, "loss": 0.4422, "num_input_tokens_seen": 5614739849, "step": 1440, "train_runtime": 57330.5364, "train_tokens_per_second": 97936.287 }, { "epoch": 0.22909379968203497, "grad_norm": 0.20741480588912964, "learning_rate": 4.3890386943389814e-05, "loss": 0.4332, "num_input_tokens_seen": 5618614710, "step": 1441, "train_runtime": 57368.669, "train_tokens_per_second": 97938.732 }, { "epoch": 0.22925278219395867, "grad_norm": 0.2150677740573883, "learning_rate": 4.388218881646722e-05, "loss": 0.4252, "num_input_tokens_seen": 5622588701, "step": 1442, "train_runtime": 57406.7004, "train_tokens_per_second": 97943.074 }, { "epoch": 0.22941176470588234, "grad_norm": 0.21114249527454376, "learning_rate": 4.3873985959689745e-05, "loss": 0.4128, "num_input_tokens_seen": 5626392477, "step": 1443, "train_runtime": 57442.9614, "train_tokens_per_second": 97947.465 }, { "epoch": 0.22957074721780604, "grad_norm": 0.3227333724498749, "learning_rate": 4.386577837511216e-05, "loss": 0.4437, "num_input_tokens_seen": 5630416549, "step": 1444, "train_runtime": 57481.5578, "train_tokens_per_second": 97951.704 }, { "epoch": 0.22972972972972974, "grad_norm": 0.2299245297908783, "learning_rate": 4.385756606479041e-05, "loss": 0.4316, "num_input_tokens_seen": 5634215559, "step": 1445, "train_runtime": 57521.436, "train_tokens_per_second": 97949.842 }, { "epoch": 0.22988871224165341, "grad_norm": 0.21580955386161804, "learning_rate": 4.3849349030781605e-05, "loss": 0.4306, "num_input_tokens_seen": 5638106881, "step": 1446, "train_runtime": 57561.8707, "train_tokens_per_second": 97948.639 }, { "epoch": 0.23004769475357711, "grad_norm": 0.267124205827713, "learning_rate": 4.384112727514407e-05, "loss": 0.4433, "num_input_tokens_seen": 5642059133, "step": 1447, "train_runtime": 57602.2053, "train_tokens_per_second": 97948.665 }, { "epoch": 0.2302066772655008, "grad_norm": 0.222331240773201, "learning_rate": 4.383290079993729e-05, "loss": 0.4309, "num_input_tokens_seen": 5646046070, "step": 1448, "train_runtime": 57642.3781, "train_tokens_per_second": 97949.569 }, { "epoch": 0.2303656597774245, "grad_norm": 0.29427891969680786, "learning_rate": 4.382466960722194e-05, "loss": 0.4285, "num_input_tokens_seen": 5649874524, "step": 1449, "train_runtime": 57680.8982, "train_tokens_per_second": 97950.53 }, { "epoch": 0.23052464228934816, "grad_norm": 0.24235042929649353, "learning_rate": 4.381643369905987e-05, "loss": 0.4419, "num_input_tokens_seen": 5653768807, "step": 1450, "train_runtime": 57719.3947, "train_tokens_per_second": 97952.67 }, { "epoch": 0.23068362480127186, "grad_norm": 0.2330087572336197, "learning_rate": 4.3808193077514126e-05, "loss": 0.4472, "num_input_tokens_seen": 5657742031, "step": 1451, "train_runtime": 57757.787, "train_tokens_per_second": 97956.351 }, { "epoch": 0.23084260731319556, "grad_norm": 0.24252568185329437, "learning_rate": 4.379994774464892e-05, "loss": 0.4323, "num_input_tokens_seen": 5661615757, "step": 1452, "train_runtime": 57796.7876, "train_tokens_per_second": 97957.274 }, { "epoch": 0.23100158982511923, "grad_norm": 0.2509019374847412, "learning_rate": 4.3791697702529656e-05, "loss": 0.4297, "num_input_tokens_seen": 5665418297, "step": 1453, "train_runtime": 57836.5228, "train_tokens_per_second": 97955.721 }, { "epoch": 0.23116057233704293, "grad_norm": 0.2345794141292572, "learning_rate": 4.37834429532229e-05, "loss": 0.4336, "num_input_tokens_seen": 5669367071, "step": 1454, "train_runtime": 57877.1702, "train_tokens_per_second": 97955.153 }, { "epoch": 0.2313195548489666, "grad_norm": 0.2608726918697357, "learning_rate": 4.377518349879641e-05, "loss": 0.4395, "num_input_tokens_seen": 5673289125, "step": 1455, "train_runtime": 57916.8775, "train_tokens_per_second": 97955.715 }, { "epoch": 0.2314785373608903, "grad_norm": 0.3181867301464081, "learning_rate": 4.376691934131914e-05, "loss": 0.4252, "num_input_tokens_seen": 5677204278, "step": 1456, "train_runtime": 57955.957, "train_tokens_per_second": 97957.217 }, { "epoch": 0.231637519872814, "grad_norm": 0.2682175040245056, "learning_rate": 4.375865048286118e-05, "loss": 0.4256, "num_input_tokens_seen": 5681010512, "step": 1457, "train_runtime": 57995.4793, "train_tokens_per_second": 97956.092 }, { "epoch": 0.23179650238473767, "grad_norm": 0.2540958821773529, "learning_rate": 4.375037692549383e-05, "loss": 0.4355, "num_input_tokens_seen": 5684915622, "step": 1458, "train_runtime": 58032.7373, "train_tokens_per_second": 97960.494 }, { "epoch": 0.23195548489666137, "grad_norm": 0.21882687509059906, "learning_rate": 4.374209867128955e-05, "loss": 0.4409, "num_input_tokens_seen": 5688740629, "step": 1459, "train_runtime": 58072.0655, "train_tokens_per_second": 97960.019 }, { "epoch": 0.23211446740858505, "grad_norm": 0.871353030204773, "learning_rate": 4.373381572232201e-05, "loss": 0.4391, "num_input_tokens_seen": 5692597337, "step": 1460, "train_runtime": 58109.9979, "train_tokens_per_second": 97962.443 }, { "epoch": 0.23227344992050875, "grad_norm": 0.2270166277885437, "learning_rate": 4.3725528080666e-05, "loss": 0.4315, "num_input_tokens_seen": 5696560953, "step": 1461, "train_runtime": 58149.4301, "train_tokens_per_second": 97964.175 }, { "epoch": 0.23243243243243245, "grad_norm": 0.223251610994339, "learning_rate": 4.3717235748397524e-05, "loss": 0.4176, "num_input_tokens_seen": 5700415985, "step": 1462, "train_runtime": 58190.2665, "train_tokens_per_second": 97961.675 }, { "epoch": 0.23259141494435612, "grad_norm": 0.24011638760566711, "learning_rate": 4.370893872759376e-05, "loss": 0.4336, "num_input_tokens_seen": 5704270760, "step": 1463, "train_runtime": 58228.1004, "train_tokens_per_second": 97964.226 }, { "epoch": 0.23275039745627982, "grad_norm": 0.2421402484178543, "learning_rate": 4.370063702033305e-05, "loss": 0.4367, "num_input_tokens_seen": 5708159884, "step": 1464, "train_runtime": 58267.3025, "train_tokens_per_second": 97965.062 }, { "epoch": 0.2329093799682035, "grad_norm": 0.29120415449142456, "learning_rate": 4.369233062869491e-05, "loss": 0.4459, "num_input_tokens_seen": 5712018662, "step": 1465, "train_runtime": 58305.7262, "train_tokens_per_second": 97966.684 }, { "epoch": 0.2330683624801272, "grad_norm": 0.2008248120546341, "learning_rate": 4.368401955476003e-05, "loss": 0.4281, "num_input_tokens_seen": 5715901913, "step": 1466, "train_runtime": 58345.4328, "train_tokens_per_second": 97966.57 }, { "epoch": 0.23322734499205086, "grad_norm": 0.21973054111003876, "learning_rate": 4.367570380061028e-05, "loss": 0.4423, "num_input_tokens_seen": 5719945435, "step": 1467, "train_runtime": 58385.2837, "train_tokens_per_second": 97968.958 }, { "epoch": 0.23338632750397456, "grad_norm": 0.2057942897081375, "learning_rate": 4.36673833683287e-05, "loss": 0.4476, "num_input_tokens_seen": 5723677106, "step": 1468, "train_runtime": 58420.3097, "train_tokens_per_second": 97974.097 }, { "epoch": 0.23354531001589826, "grad_norm": 0.20895709097385406, "learning_rate": 4.3659058259999496e-05, "loss": 0.4373, "num_input_tokens_seen": 5727587793, "step": 1469, "train_runtime": 58460.528, "train_tokens_per_second": 97973.59 }, { "epoch": 0.23370429252782193, "grad_norm": 0.2982800006866455, "learning_rate": 4.3650728477708044e-05, "loss": 0.4299, "num_input_tokens_seen": 5731545154, "step": 1470, "train_runtime": 58502.1133, "train_tokens_per_second": 97971.592 }, { "epoch": 0.23386327503974563, "grad_norm": 0.25220584869384766, "learning_rate": 4.3642394023540914e-05, "loss": 0.4333, "num_input_tokens_seen": 5735525502, "step": 1471, "train_runtime": 58539.2631, "train_tokens_per_second": 97977.412 }, { "epoch": 0.2340222575516693, "grad_norm": 0.2437572032213211, "learning_rate": 4.363405489958581e-05, "loss": 0.4369, "num_input_tokens_seen": 5739444732, "step": 1472, "train_runtime": 58577.3308, "train_tokens_per_second": 97980.646 }, { "epoch": 0.234181240063593, "grad_norm": 0.22559265792369843, "learning_rate": 4.362571110793162e-05, "loss": 0.438, "num_input_tokens_seen": 5743294551, "step": 1473, "train_runtime": 58616.6424, "train_tokens_per_second": 97980.613 }, { "epoch": 0.2343402225755167, "grad_norm": 0.22995461523532867, "learning_rate": 4.361736265066842e-05, "loss": 0.4347, "num_input_tokens_seen": 5747134042, "step": 1474, "train_runtime": 58658.2822, "train_tokens_per_second": 97976.515 }, { "epoch": 0.23449920508744038, "grad_norm": 0.3218045234680176, "learning_rate": 4.3609009529887434e-05, "loss": 0.4373, "num_input_tokens_seen": 5751011635, "step": 1475, "train_runtime": 58697.9175, "train_tokens_per_second": 97976.417 }, { "epoch": 0.23465818759936408, "grad_norm": 0.20723643898963928, "learning_rate": 4.3600651747681055e-05, "loss": 0.4233, "num_input_tokens_seen": 5754908889, "step": 1476, "train_runtime": 58736.4857, "train_tokens_per_second": 97978.434 }, { "epoch": 0.23481717011128775, "grad_norm": 0.3050440549850464, "learning_rate": 4.359228930614285e-05, "loss": 0.4317, "num_input_tokens_seen": 5758833393, "step": 1477, "train_runtime": 58776.5954, "train_tokens_per_second": 97978.343 }, { "epoch": 0.23497615262321145, "grad_norm": 0.1990855485200882, "learning_rate": 4.358392220736756e-05, "loss": 0.4186, "num_input_tokens_seen": 5762806639, "step": 1478, "train_runtime": 58815.5904, "train_tokens_per_second": 97980.937 }, { "epoch": 0.23513513513513515, "grad_norm": 0.3290938436985016, "learning_rate": 4.357555045345107e-05, "loss": 0.4426, "num_input_tokens_seen": 5766701964, "step": 1479, "train_runtime": 58855.7354, "train_tokens_per_second": 97980.289 }, { "epoch": 0.23529411764705882, "grad_norm": 0.21854941546916962, "learning_rate": 4.356717404649046e-05, "loss": 0.4408, "num_input_tokens_seen": 5770460398, "step": 1480, "train_runtime": 58891.398, "train_tokens_per_second": 97984.775 }, { "epoch": 0.23545310015898252, "grad_norm": 0.21456648409366608, "learning_rate": 4.355879298858395e-05, "loss": 0.4216, "num_input_tokens_seen": 5774292209, "step": 1481, "train_runtime": 58929.4118, "train_tokens_per_second": 97986.592 }, { "epoch": 0.2356120826709062, "grad_norm": 0.24472856521606445, "learning_rate": 4.355040728183094e-05, "loss": 0.4369, "num_input_tokens_seen": 5778266679, "step": 1482, "train_runtime": 58968.3174, "train_tokens_per_second": 97989.343 }, { "epoch": 0.2357710651828299, "grad_norm": 0.2710803747177124, "learning_rate": 4.354201692833199e-05, "loss": 0.4411, "num_input_tokens_seen": 5782083314, "step": 1483, "train_runtime": 59008.8473, "train_tokens_per_second": 97986.719 }, { "epoch": 0.23593004769475356, "grad_norm": 0.22634877264499664, "learning_rate": 4.353362193018883e-05, "loss": 0.452, "num_input_tokens_seen": 5785852824, "step": 1484, "train_runtime": 59048.5556, "train_tokens_per_second": 97984.663 }, { "epoch": 0.23608903020667726, "grad_norm": 0.5368086099624634, "learning_rate": 4.352522228950433e-05, "loss": 0.4385, "num_input_tokens_seen": 5789883390, "step": 1485, "train_runtime": 59088.0226, "train_tokens_per_second": 97987.429 }, { "epoch": 0.23624801271860096, "grad_norm": 0.3677392899990082, "learning_rate": 4.351681800838256e-05, "loss": 0.4305, "num_input_tokens_seen": 5793809038, "step": 1486, "train_runtime": 59127.2694, "train_tokens_per_second": 97988.781 }, { "epoch": 0.23640699523052464, "grad_norm": 0.22445812821388245, "learning_rate": 4.350840908892872e-05, "loss": 0.4403, "num_input_tokens_seen": 5797610521, "step": 1487, "train_runtime": 59166.3622, "train_tokens_per_second": 97988.288 }, { "epoch": 0.23656597774244834, "grad_norm": 0.19077786803245544, "learning_rate": 4.349999553324919e-05, "loss": 0.4336, "num_input_tokens_seen": 5801495210, "step": 1488, "train_runtime": 59205.8758, "train_tokens_per_second": 97988.504 }, { "epoch": 0.236724960254372, "grad_norm": 0.21201962232589722, "learning_rate": 4.349157734345152e-05, "loss": 0.4495, "num_input_tokens_seen": 5805550828, "step": 1489, "train_runtime": 59244.2703, "train_tokens_per_second": 97993.457 }, { "epoch": 0.2368839427662957, "grad_norm": 0.22602158784866333, "learning_rate": 4.348315452164438e-05, "loss": 0.4025, "num_input_tokens_seen": 5809394407, "step": 1490, "train_runtime": 59280.0034, "train_tokens_per_second": 97999.225 }, { "epoch": 0.2370429252782194, "grad_norm": 0.20989353954792023, "learning_rate": 4.347472706993764e-05, "loss": 0.4311, "num_input_tokens_seen": 5813339261, "step": 1491, "train_runtime": 59319.6439, "train_tokens_per_second": 98000.239 }, { "epoch": 0.23720190779014308, "grad_norm": 0.24838688969612122, "learning_rate": 4.346629499044232e-05, "loss": 0.4347, "num_input_tokens_seen": 5817194506, "step": 1492, "train_runtime": 59360.0149, "train_tokens_per_second": 97998.535 }, { "epoch": 0.23736089030206678, "grad_norm": 0.21085943281650543, "learning_rate": 4.3457858285270594e-05, "loss": 0.4341, "num_input_tokens_seen": 5821117109, "step": 1493, "train_runtime": 59400.7868, "train_tokens_per_second": 97997.307 }, { "epoch": 0.23751987281399045, "grad_norm": 0.24037790298461914, "learning_rate": 4.34494169565358e-05, "loss": 0.4321, "num_input_tokens_seen": 5824976210, "step": 1494, "train_runtime": 59441.8589, "train_tokens_per_second": 97994.516 }, { "epoch": 0.23767885532591415, "grad_norm": 0.23441250622272491, "learning_rate": 4.344097100635243e-05, "loss": 0.4366, "num_input_tokens_seen": 5828835291, "step": 1495, "train_runtime": 59481.4311, "train_tokens_per_second": 97994.201 }, { "epoch": 0.23783783783783785, "grad_norm": 0.19386768341064453, "learning_rate": 4.343252043683613e-05, "loss": 0.4222, "num_input_tokens_seen": 5832753285, "step": 1496, "train_runtime": 59520.8367, "train_tokens_per_second": 97995.149 }, { "epoch": 0.23799682034976152, "grad_norm": 0.2183782309293747, "learning_rate": 4.3424065250103704e-05, "loss": 0.4389, "num_input_tokens_seen": 5836681953, "step": 1497, "train_runtime": 59563.3882, "train_tokens_per_second": 97991.1 }, { "epoch": 0.23815580286168522, "grad_norm": 0.2024141252040863, "learning_rate": 4.341560544827314e-05, "loss": 0.4286, "num_input_tokens_seen": 5840539440, "step": 1498, "train_runtime": 59603.1355, "train_tokens_per_second": 97990.473 }, { "epoch": 0.2383147853736089, "grad_norm": 0.19260354340076447, "learning_rate": 4.3407141033463526e-05, "loss": 0.4264, "num_input_tokens_seen": 5844491504, "step": 1499, "train_runtime": 59642.7605, "train_tokens_per_second": 97991.633 }, { "epoch": 0.2384737678855326, "grad_norm": 0.23149636387825012, "learning_rate": 4.339867200779517e-05, "loss": 0.4364, "num_input_tokens_seen": 5848351681, "step": 1500, "train_runtime": 59682.1191, "train_tokens_per_second": 97991.689 }, { "epoch": 0.23863275039745627, "grad_norm": 0.2041919231414795, "learning_rate": 4.339019837338948e-05, "loss": 0.4365, "num_input_tokens_seen": 5852169488, "step": 1501, "train_runtime": 59722.3931, "train_tokens_per_second": 97989.534 }, { "epoch": 0.23879173290937997, "grad_norm": 0.23667268455028534, "learning_rate": 4.338172013236905e-05, "loss": 0.4293, "num_input_tokens_seen": 5856050363, "step": 1502, "train_runtime": 59763.1632, "train_tokens_per_second": 97987.624 }, { "epoch": 0.23895071542130367, "grad_norm": 0.2490101307630539, "learning_rate": 4.3373237286857616e-05, "loss": 0.4215, "num_input_tokens_seen": 5860041168, "step": 1503, "train_runtime": 59801.72, "train_tokens_per_second": 97991.181 }, { "epoch": 0.23910969793322734, "grad_norm": 0.22236807644367218, "learning_rate": 4.336474983898008e-05, "loss": 0.4213, "num_input_tokens_seen": 5863898619, "step": 1504, "train_runtime": 59838.4761, "train_tokens_per_second": 97995.454 }, { "epoch": 0.23926868044515104, "grad_norm": 0.2090919464826584, "learning_rate": 4.335625779086248e-05, "loss": 0.4336, "num_input_tokens_seen": 5867725903, "step": 1505, "train_runtime": 59875.8117, "train_tokens_per_second": 97998.269 }, { "epoch": 0.2394276629570747, "grad_norm": 0.2425350397825241, "learning_rate": 4.3347761144632006e-05, "loss": 0.4369, "num_input_tokens_seen": 5871512539, "step": 1506, "train_runtime": 59916.2543, "train_tokens_per_second": 97995.32 }, { "epoch": 0.2395866454689984, "grad_norm": 0.2109111100435257, "learning_rate": 4.333925990241702e-05, "loss": 0.4248, "num_input_tokens_seen": 5875410922, "step": 1507, "train_runtime": 59956.3038, "train_tokens_per_second": 97994.882 }, { "epoch": 0.2397456279809221, "grad_norm": 0.2278325855731964, "learning_rate": 4.333075406634702e-05, "loss": 0.4317, "num_input_tokens_seen": 5879400593, "step": 1508, "train_runtime": 59996.7307, "train_tokens_per_second": 97995.349 }, { "epoch": 0.23990461049284578, "grad_norm": 0.22607669234275818, "learning_rate": 4.332224363855266e-05, "loss": 0.427, "num_input_tokens_seen": 5883323754, "step": 1509, "train_runtime": 60037.5329, "train_tokens_per_second": 97994.096 }, { "epoch": 0.24006359300476948, "grad_norm": 0.20940402150154114, "learning_rate": 4.331372862116574e-05, "loss": 0.4275, "num_input_tokens_seen": 5887240185, "step": 1510, "train_runtime": 60075.0992, "train_tokens_per_second": 97998.01 }, { "epoch": 0.24022257551669315, "grad_norm": 0.851281464099884, "learning_rate": 4.330520901631921e-05, "loss": 0.4372, "num_input_tokens_seen": 5891169331, "step": 1511, "train_runtime": 60116.1257, "train_tokens_per_second": 97996.49 }, { "epoch": 0.24038155802861685, "grad_norm": 0.23917336761951447, "learning_rate": 4.329668482614718e-05, "loss": 0.4361, "num_input_tokens_seen": 5895173296, "step": 1512, "train_runtime": 60152.7107, "train_tokens_per_second": 98003.452 }, { "epoch": 0.24054054054054055, "grad_norm": 0.24004055559635162, "learning_rate": 4.328815605278489e-05, "loss": 0.4273, "num_input_tokens_seen": 5899029762, "step": 1513, "train_runtime": 60189.9284, "train_tokens_per_second": 98006.924 }, { "epoch": 0.24069952305246423, "grad_norm": 0.22903624176979065, "learning_rate": 4.327962269836873e-05, "loss": 0.4414, "num_input_tokens_seen": 5902957540, "step": 1514, "train_runtime": 60230.7262, "train_tokens_per_second": 98005.751 }, { "epoch": 0.24085850556438793, "grad_norm": 0.20841555297374725, "learning_rate": 4.3271084765036264e-05, "loss": 0.4497, "num_input_tokens_seen": 5906843965, "step": 1515, "train_runtime": 60271.0294, "train_tokens_per_second": 98004.697 }, { "epoch": 0.2410174880763116, "grad_norm": 0.23796334862709045, "learning_rate": 4.326254225492617e-05, "loss": 0.4343, "num_input_tokens_seen": 5910744488, "step": 1516, "train_runtime": 60311.3647, "train_tokens_per_second": 98003.826 }, { "epoch": 0.2411764705882353, "grad_norm": 0.22953730821609497, "learning_rate": 4.325399517017829e-05, "loss": 0.4294, "num_input_tokens_seen": 5914577593, "step": 1517, "train_runtime": 60350.9194, "train_tokens_per_second": 98003.107 }, { "epoch": 0.24133545310015897, "grad_norm": 0.22906526923179626, "learning_rate": 4.324544351293362e-05, "loss": 0.4305, "num_input_tokens_seen": 5918586750, "step": 1518, "train_runtime": 60391.2492, "train_tokens_per_second": 98004.046 }, { "epoch": 0.24149443561208267, "grad_norm": 0.2618595063686371, "learning_rate": 4.323688728533426e-05, "loss": 0.435, "num_input_tokens_seen": 5922627233, "step": 1519, "train_runtime": 60433.16, "train_tokens_per_second": 98002.938 }, { "epoch": 0.24165341812400637, "grad_norm": 0.30928078293800354, "learning_rate": 4.322832648952351e-05, "loss": 0.4306, "num_input_tokens_seen": 5926477249, "step": 1520, "train_runtime": 60473.6544, "train_tokens_per_second": 98000.978 }, { "epoch": 0.24181240063593004, "grad_norm": 0.24355584383010864, "learning_rate": 4.321976112764579e-05, "loss": 0.4289, "num_input_tokens_seen": 5930318975, "step": 1521, "train_runtime": 60513.374, "train_tokens_per_second": 98000.138 }, { "epoch": 0.24197138314785374, "grad_norm": 0.23490595817565918, "learning_rate": 4.321119120184664e-05, "loss": 0.4291, "num_input_tokens_seen": 5934203276, "step": 1522, "train_runtime": 60551.5822, "train_tokens_per_second": 98002.448 }, { "epoch": 0.2421303656597774, "grad_norm": 0.5313863158226013, "learning_rate": 4.3202616714272794e-05, "loss": 0.4403, "num_input_tokens_seen": 5938039105, "step": 1523, "train_runtime": 60588.5284, "train_tokens_per_second": 98005.997 }, { "epoch": 0.2422893481717011, "grad_norm": 0.26967760920524597, "learning_rate": 4.3194037667072085e-05, "loss": 0.419, "num_input_tokens_seen": 5941951534, "step": 1524, "train_runtime": 60623.9897, "train_tokens_per_second": 98013.205 }, { "epoch": 0.2424483306836248, "grad_norm": 0.22981415688991547, "learning_rate": 4.318545406239351e-05, "loss": 0.4355, "num_input_tokens_seen": 5945830828, "step": 1525, "train_runtime": 60665.0944, "train_tokens_per_second": 98010.741 }, { "epoch": 0.24260731319554849, "grad_norm": 0.2114245593547821, "learning_rate": 4.317686590238719e-05, "loss": 0.4419, "num_input_tokens_seen": 5949819671, "step": 1526, "train_runtime": 60704.7146, "train_tokens_per_second": 98012.481 }, { "epoch": 0.24276629570747219, "grad_norm": 0.23870466649532318, "learning_rate": 4.31682731892044e-05, "loss": 0.4389, "num_input_tokens_seen": 5953712888, "step": 1527, "train_runtime": 60746.028, "train_tokens_per_second": 98009.912 }, { "epoch": 0.24292527821939586, "grad_norm": 0.24634194374084473, "learning_rate": 4.3159675924997564e-05, "loss": 0.4283, "num_input_tokens_seen": 5957689728, "step": 1528, "train_runtime": 60783.8024, "train_tokens_per_second": 98014.43 }, { "epoch": 0.24308426073131956, "grad_norm": 0.25272586941719055, "learning_rate": 4.3151074111920235e-05, "loss": 0.4472, "num_input_tokens_seen": 5961642777, "step": 1529, "train_runtime": 60824.2689, "train_tokens_per_second": 98014.212 }, { "epoch": 0.24324324324324326, "grad_norm": 0.20867103338241577, "learning_rate": 4.3142467752127106e-05, "loss": 0.4294, "num_input_tokens_seen": 5965434393, "step": 1530, "train_runtime": 60859.9187, "train_tokens_per_second": 98019.099 }, { "epoch": 0.24340222575516693, "grad_norm": 0.21479414403438568, "learning_rate": 4.3133856847773996e-05, "loss": 0.4337, "num_input_tokens_seen": 5969350750, "step": 1531, "train_runtime": 60898.4004, "train_tokens_per_second": 98021.47 }, { "epoch": 0.24356120826709063, "grad_norm": 0.2521754503250122, "learning_rate": 4.312524140101789e-05, "loss": 0.4403, "num_input_tokens_seen": 5973187407, "step": 1532, "train_runtime": 60938.756, "train_tokens_per_second": 98019.517 }, { "epoch": 0.2437201907790143, "grad_norm": 0.24756179749965668, "learning_rate": 4.31166214140169e-05, "loss": 0.4397, "num_input_tokens_seen": 5977111941, "step": 1533, "train_runtime": 60978.4959, "train_tokens_per_second": 98019.996 }, { "epoch": 0.243879173290938, "grad_norm": 0.24439631402492523, "learning_rate": 4.310799688893026e-05, "loss": 0.433, "num_input_tokens_seen": 5981065648, "step": 1534, "train_runtime": 61016.7187, "train_tokens_per_second": 98023.391 }, { "epoch": 0.24403815580286167, "grad_norm": 0.2134857028722763, "learning_rate": 4.309936782791835e-05, "loss": 0.436, "num_input_tokens_seen": 5984864449, "step": 1535, "train_runtime": 61053.6819, "train_tokens_per_second": 98026.266 }, { "epoch": 0.24419713831478537, "grad_norm": 0.2384064793586731, "learning_rate": 4.30907342331427e-05, "loss": 0.4272, "num_input_tokens_seen": 5988773732, "step": 1536, "train_runtime": 61092.4021, "train_tokens_per_second": 98028.127 }, { "epoch": 0.24435612082670907, "grad_norm": 0.21520395576953888, "learning_rate": 4.308209610676596e-05, "loss": 0.4349, "num_input_tokens_seen": 5992706094, "step": 1537, "train_runtime": 61132.0381, "train_tokens_per_second": 98028.894 }, { "epoch": 0.24451510333863274, "grad_norm": 0.21378779411315918, "learning_rate": 4.307345345095192e-05, "loss": 0.4307, "num_input_tokens_seen": 5996529308, "step": 1538, "train_runtime": 61169.529, "train_tokens_per_second": 98031.314 }, { "epoch": 0.24467408585055644, "grad_norm": 0.23515239357948303, "learning_rate": 4.306480626786551e-05, "loss": 0.4276, "num_input_tokens_seen": 6000514549, "step": 1539, "train_runtime": 61209.3347, "train_tokens_per_second": 98032.671 }, { "epoch": 0.24483306836248012, "grad_norm": 0.22462953627109528, "learning_rate": 4.3056154559672775e-05, "loss": 0.4392, "num_input_tokens_seen": 6004480450, "step": 1540, "train_runtime": 61250.635, "train_tokens_per_second": 98031.318 }, { "epoch": 0.24499205087440382, "grad_norm": 0.23022544384002686, "learning_rate": 4.304749832854091e-05, "loss": 0.4387, "num_input_tokens_seen": 6008328220, "step": 1541, "train_runtime": 61289.1505, "train_tokens_per_second": 98032.493 }, { "epoch": 0.24515103338632752, "grad_norm": 0.228342667222023, "learning_rate": 4.3038837576638245e-05, "loss": 0.4308, "num_input_tokens_seen": 6012207886, "step": 1542, "train_runtime": 61329.0535, "train_tokens_per_second": 98031.969 }, { "epoch": 0.2453100158982512, "grad_norm": 0.2504430413246155, "learning_rate": 4.3030172306134245e-05, "loss": 0.4352, "num_input_tokens_seen": 6016161369, "step": 1543, "train_runtime": 61370.287, "train_tokens_per_second": 98030.524 }, { "epoch": 0.2454689984101749, "grad_norm": 0.2432154417037964, "learning_rate": 4.3021502519199484e-05, "loss": 0.4399, "num_input_tokens_seen": 6020162970, "step": 1544, "train_runtime": 61410.1203, "train_tokens_per_second": 98032.099 }, { "epoch": 0.24562798092209856, "grad_norm": 0.21151737868785858, "learning_rate": 4.3012828218005693e-05, "loss": 0.4367, "num_input_tokens_seen": 6023991507, "step": 1545, "train_runtime": 61449.3758, "train_tokens_per_second": 98031.777 }, { "epoch": 0.24578696343402226, "grad_norm": 0.23231181502342224, "learning_rate": 4.3004149404725716e-05, "loss": 0.4461, "num_input_tokens_seen": 6027909378, "step": 1546, "train_runtime": 61489.042, "train_tokens_per_second": 98032.254 }, { "epoch": 0.24594594594594596, "grad_norm": 0.2244872897863388, "learning_rate": 4.2995466081533534e-05, "loss": 0.4321, "num_input_tokens_seen": 6031774312, "step": 1547, "train_runtime": 61527.9826, "train_tokens_per_second": 98033.026 }, { "epoch": 0.24610492845786963, "grad_norm": 0.2247372269630432, "learning_rate": 4.298677825060426e-05, "loss": 0.4388, "num_input_tokens_seen": 6035684464, "step": 1548, "train_runtime": 61568.9025, "train_tokens_per_second": 98031.38 }, { "epoch": 0.24626391096979333, "grad_norm": 0.2137204110622406, "learning_rate": 4.2978085914114135e-05, "loss": 0.4272, "num_input_tokens_seen": 6039528770, "step": 1549, "train_runtime": 61608.3423, "train_tokens_per_second": 98031.022 }, { "epoch": 0.246422893481717, "grad_norm": 0.23939552903175354, "learning_rate": 4.296938907424053e-05, "loss": 0.4365, "num_input_tokens_seen": 6043354490, "step": 1550, "train_runtime": 61646.2064, "train_tokens_per_second": 98032.869 }, { "epoch": 0.2465818759936407, "grad_norm": 0.21767568588256836, "learning_rate": 4.296068773316194e-05, "loss": 0.4325, "num_input_tokens_seen": 6047263553, "step": 1551, "train_runtime": 61687.0482, "train_tokens_per_second": 98031.333 }, { "epoch": 0.24674085850556438, "grad_norm": 0.19545122981071472, "learning_rate": 4.295198189305799e-05, "loss": 0.4396, "num_input_tokens_seen": 6051270845, "step": 1552, "train_runtime": 61726.0525, "train_tokens_per_second": 98034.308 }, { "epoch": 0.24689984101748808, "grad_norm": 0.21470296382904053, "learning_rate": 4.294327155610943e-05, "loss": 0.4341, "num_input_tokens_seen": 6055149815, "step": 1553, "train_runtime": 61765.6984, "train_tokens_per_second": 98034.184 }, { "epoch": 0.24705882352941178, "grad_norm": 0.23326998949050903, "learning_rate": 4.2934556724498146e-05, "loss": 0.4267, "num_input_tokens_seen": 6058968580, "step": 1554, "train_runtime": 61804.5176, "train_tokens_per_second": 98034.396 }, { "epoch": 0.24721780604133545, "grad_norm": 0.21090739965438843, "learning_rate": 4.292583740040712e-05, "loss": 0.4329, "num_input_tokens_seen": 6062943567, "step": 1555, "train_runtime": 61843.3569, "train_tokens_per_second": 98037.103 }, { "epoch": 0.24737678855325915, "grad_norm": 0.3329719603061676, "learning_rate": 4.291711358602051e-05, "loss": 0.4163, "num_input_tokens_seen": 6066783437, "step": 1556, "train_runtime": 61882.6467, "train_tokens_per_second": 98036.91 }, { "epoch": 0.24753577106518282, "grad_norm": 0.21344414353370667, "learning_rate": 4.290838528352355e-05, "loss": 0.44, "num_input_tokens_seen": 6070608538, "step": 1557, "train_runtime": 61921.4205, "train_tokens_per_second": 98037.294 }, { "epoch": 0.24769475357710652, "grad_norm": 0.21880747377872467, "learning_rate": 4.289965249510262e-05, "loss": 0.4191, "num_input_tokens_seen": 6074561268, "step": 1558, "train_runtime": 61959.6972, "train_tokens_per_second": 98040.525 }, { "epoch": 0.24785373608903022, "grad_norm": 0.23871243000030518, "learning_rate": 4.289091522294522e-05, "loss": 0.4353, "num_input_tokens_seen": 6078469869, "step": 1559, "train_runtime": 62001.85, "train_tokens_per_second": 98036.911 }, { "epoch": 0.2480127186009539, "grad_norm": 0.20714884996414185, "learning_rate": 4.288217346923999e-05, "loss": 0.4192, "num_input_tokens_seen": 6082457848, "step": 1560, "train_runtime": 62040.685, "train_tokens_per_second": 98039.824 }, { "epoch": 0.2481717011128776, "grad_norm": 0.2342289537191391, "learning_rate": 4.287342723617667e-05, "loss": 0.4307, "num_input_tokens_seen": 6086297175, "step": 1561, "train_runtime": 62078.1806, "train_tokens_per_second": 98042.454 }, { "epoch": 0.24833068362480126, "grad_norm": 0.2063332051038742, "learning_rate": 4.286467652594611e-05, "loss": 0.4374, "num_input_tokens_seen": 6090145195, "step": 1562, "train_runtime": 62117.4083, "train_tokens_per_second": 98042.487 }, { "epoch": 0.24848966613672496, "grad_norm": 0.21585418283939362, "learning_rate": 4.2855921340740336e-05, "loss": 0.4365, "num_input_tokens_seen": 6094104348, "step": 1563, "train_runtime": 62158.1574, "train_tokens_per_second": 98041.908 }, { "epoch": 0.24864864864864866, "grad_norm": 0.1945313662290573, "learning_rate": 4.284716168275244e-05, "loss": 0.4379, "num_input_tokens_seen": 6098041765, "step": 1564, "train_runtime": 62198.3594, "train_tokens_per_second": 98041.843 }, { "epoch": 0.24880763116057233, "grad_norm": 0.2654784619808197, "learning_rate": 4.2838397554176644e-05, "loss": 0.4353, "num_input_tokens_seen": 6101823167, "step": 1565, "train_runtime": 62238.6146, "train_tokens_per_second": 98039.187 }, { "epoch": 0.24896661367249603, "grad_norm": 0.202678844332695, "learning_rate": 4.282962895720833e-05, "loss": 0.4276, "num_input_tokens_seen": 6105699150, "step": 1566, "train_runtime": 62275.9734, "train_tokens_per_second": 98042.613 }, { "epoch": 0.2491255961844197, "grad_norm": 0.28408071398735046, "learning_rate": 4.2820855894043945e-05, "loss": 0.4322, "num_input_tokens_seen": 6109559454, "step": 1567, "train_runtime": 62314.6767, "train_tokens_per_second": 98043.668 }, { "epoch": 0.2492845786963434, "grad_norm": 0.20327793061733246, "learning_rate": 4.281207836688109e-05, "loss": 0.4328, "num_input_tokens_seen": 6113380598, "step": 1568, "train_runtime": 62354.2301, "train_tokens_per_second": 98042.757 }, { "epoch": 0.24944356120826708, "grad_norm": 0.20347952842712402, "learning_rate": 4.280329637791847e-05, "loss": 0.4263, "num_input_tokens_seen": 6117320339, "step": 1569, "train_runtime": 62391.7986, "train_tokens_per_second": 98046.866 }, { "epoch": 0.24960254372019078, "grad_norm": 0.256175696849823, "learning_rate": 4.27945099293559e-05, "loss": 0.427, "num_input_tokens_seen": 6121216135, "step": 1570, "train_runtime": 62430.1528, "train_tokens_per_second": 98049.033 }, { "epoch": 0.24976152623211448, "grad_norm": 0.24132072925567627, "learning_rate": 4.278571902339434e-05, "loss": 0.4268, "num_input_tokens_seen": 6125109626, "step": 1571, "train_runtime": 62470.1224, "train_tokens_per_second": 98048.625 }, { "epoch": 0.24992050874403815, "grad_norm": 0.2169778048992157, "learning_rate": 4.2776923662235854e-05, "loss": 0.4171, "num_input_tokens_seen": 6129025145, "step": 1572, "train_runtime": 62507.5561, "train_tokens_per_second": 98052.548 }, { "epoch": 0.25007949125596185, "grad_norm": 0.21492019295692444, "learning_rate": 4.2768123848083604e-05, "loss": 0.4318, "num_input_tokens_seen": 6132907986, "step": 1573, "train_runtime": 62547.5989, "train_tokens_per_second": 98051.853 }, { "epoch": 0.25023847376788555, "grad_norm": 0.3405470550060272, "learning_rate": 4.275931958314189e-05, "loss": 0.408, "num_input_tokens_seen": 6136813798, "step": 1574, "train_runtime": 62587.4414, "train_tokens_per_second": 98051.84 }, { "epoch": 0.2503974562798092, "grad_norm": 0.34632471203804016, "learning_rate": 4.27505108696161e-05, "loss": 0.4378, "num_input_tokens_seen": 6140651256, "step": 1575, "train_runtime": 62627.1007, "train_tokens_per_second": 98051.022 }, { "epoch": 0.2505564387917329, "grad_norm": 0.2145463079214096, "learning_rate": 4.274169770971279e-05, "loss": 0.4393, "num_input_tokens_seen": 6144463713, "step": 1576, "train_runtime": 62665.3863, "train_tokens_per_second": 98051.956 }, { "epoch": 0.2507154213036566, "grad_norm": 0.2550230920314789, "learning_rate": 4.273288010563956e-05, "loss": 0.4332, "num_input_tokens_seen": 6148503223, "step": 1577, "train_runtime": 62703.3663, "train_tokens_per_second": 98056.988 }, { "epoch": 0.2508744038155803, "grad_norm": 0.3064785897731781, "learning_rate": 4.2724058059605165e-05, "loss": 0.4395, "num_input_tokens_seen": 6152303486, "step": 1578, "train_runtime": 62744.4248, "train_tokens_per_second": 98053.389 }, { "epoch": 0.251033386327504, "grad_norm": 0.2682541608810425, "learning_rate": 4.271523157381948e-05, "loss": 0.4367, "num_input_tokens_seen": 6156221013, "step": 1579, "train_runtime": 62783.4001, "train_tokens_per_second": 98054.916 }, { "epoch": 0.25119236883942764, "grad_norm": 0.2599823772907257, "learning_rate": 4.270640065049347e-05, "loss": 0.4368, "num_input_tokens_seen": 6160253292, "step": 1580, "train_runtime": 62824.1285, "train_tokens_per_second": 98055.531 }, { "epoch": 0.25135135135135134, "grad_norm": 0.2416861653327942, "learning_rate": 4.269756529183921e-05, "loss": 0.4357, "num_input_tokens_seen": 6164109243, "step": 1581, "train_runtime": 62862.3375, "train_tokens_per_second": 98057.271 }, { "epoch": 0.25151033386327504, "grad_norm": 0.2735997140407562, "learning_rate": 4.26887255000699e-05, "loss": 0.4185, "num_input_tokens_seen": 6167966688, "step": 1582, "train_runtime": 62900.9421, "train_tokens_per_second": 98058.415 }, { "epoch": 0.25166931637519874, "grad_norm": 0.2615826427936554, "learning_rate": 4.267988127739985e-05, "loss": 0.4244, "num_input_tokens_seen": 6171796925, "step": 1583, "train_runtime": 62942.0371, "train_tokens_per_second": 98055.246 }, { "epoch": 0.25182829888712244, "grad_norm": 0.2511170208454132, "learning_rate": 4.267103262604447e-05, "loss": 0.4231, "num_input_tokens_seen": 6175830938, "step": 1584, "train_runtime": 62981.4174, "train_tokens_per_second": 98057.986 }, { "epoch": 0.2519872813990461, "grad_norm": 0.19918778538703918, "learning_rate": 4.266217954822029e-05, "loss": 0.431, "num_input_tokens_seen": 6179758220, "step": 1585, "train_runtime": 63021.2324, "train_tokens_per_second": 98058.352 }, { "epoch": 0.2521462639109698, "grad_norm": 0.22982393205165863, "learning_rate": 4.265332204614494e-05, "loss": 0.4458, "num_input_tokens_seen": 6183527434, "step": 1586, "train_runtime": 63062.2863, "train_tokens_per_second": 98054.286 }, { "epoch": 0.2523052464228935, "grad_norm": 0.26494327187538147, "learning_rate": 4.264446012203716e-05, "loss": 0.4318, "num_input_tokens_seen": 6187407345, "step": 1587, "train_runtime": 63100.8663, "train_tokens_per_second": 98055.822 }, { "epoch": 0.2524642289348172, "grad_norm": 0.24296315014362335, "learning_rate": 4.2635593778116805e-05, "loss": 0.427, "num_input_tokens_seen": 6191399509, "step": 1588, "train_runtime": 63140.0097, "train_tokens_per_second": 98058.26 }, { "epoch": 0.2526232114467409, "grad_norm": 0.23247142136096954, "learning_rate": 4.262672301660483e-05, "loss": 0.4392, "num_input_tokens_seen": 6195369656, "step": 1589, "train_runtime": 63181.1678, "train_tokens_per_second": 98057.22 }, { "epoch": 0.2527821939586645, "grad_norm": 0.2253086417913437, "learning_rate": 4.261784783972329e-05, "loss": 0.4255, "num_input_tokens_seen": 6199139484, "step": 1590, "train_runtime": 63219.6613, "train_tokens_per_second": 98057.145 }, { "epoch": 0.2529411764705882, "grad_norm": 0.3113428056240082, "learning_rate": 4.2608968249695356e-05, "loss": 0.4436, "num_input_tokens_seen": 6203080026, "step": 1591, "train_runtime": 63259.1315, "train_tokens_per_second": 98058.255 }, { "epoch": 0.2531001589825119, "grad_norm": 0.22992070019245148, "learning_rate": 4.260008424874532e-05, "loss": 0.4253, "num_input_tokens_seen": 6206881961, "step": 1592, "train_runtime": 63299.0115, "train_tokens_per_second": 98056.539 }, { "epoch": 0.2532591414944356, "grad_norm": 0.21571990847587585, "learning_rate": 4.259119583909854e-05, "loss": 0.4353, "num_input_tokens_seen": 6210756016, "step": 1593, "train_runtime": 63337.2649, "train_tokens_per_second": 98058.482 }, { "epoch": 0.2534181240063593, "grad_norm": 0.20619620382785797, "learning_rate": 4.2582303022981504e-05, "loss": 0.4254, "num_input_tokens_seen": 6214656158, "step": 1594, "train_runtime": 63377.0388, "train_tokens_per_second": 98058.481 }, { "epoch": 0.25357710651828297, "grad_norm": 0.2303582727909088, "learning_rate": 4.257340580262181e-05, "loss": 0.4417, "num_input_tokens_seen": 6218733808, "step": 1595, "train_runtime": 63418.0272, "train_tokens_per_second": 98059.402 }, { "epoch": 0.25373608903020667, "grad_norm": 0.2110375463962555, "learning_rate": 4.256450418024814e-05, "loss": 0.4226, "num_input_tokens_seen": 6222662168, "step": 1596, "train_runtime": 63454.2807, "train_tokens_per_second": 98065.286 }, { "epoch": 0.25389507154213037, "grad_norm": 0.23107531666755676, "learning_rate": 4.2555598158090294e-05, "loss": 0.4376, "num_input_tokens_seen": 6226553358, "step": 1597, "train_runtime": 63493.8162, "train_tokens_per_second": 98065.508 }, { "epoch": 0.25405405405405407, "grad_norm": 0.24325521290302277, "learning_rate": 4.254668773837916e-05, "loss": 0.4474, "num_input_tokens_seen": 6230361436, "step": 1598, "train_runtime": 63534.4583, "train_tokens_per_second": 98062.714 }, { "epoch": 0.25421303656597777, "grad_norm": 0.27918311953544617, "learning_rate": 4.2537772923346734e-05, "loss": 0.4282, "num_input_tokens_seen": 6234281516, "step": 1599, "train_runtime": 63571.3653, "train_tokens_per_second": 98067.447 }, { "epoch": 0.2543720190779014, "grad_norm": 0.24554525315761566, "learning_rate": 4.252885371522613e-05, "loss": 0.4238, "num_input_tokens_seen": 6238155971, "step": 1600, "train_runtime": 63610.0556, "train_tokens_per_second": 98068.708 }, { "epoch": 0.2545310015898251, "grad_norm": 0.20545272529125214, "learning_rate": 4.251993011625152e-05, "loss": 0.4271, "num_input_tokens_seen": 6242061535, "step": 1601, "train_runtime": 63740.5985, "train_tokens_per_second": 97929.133 }, { "epoch": 0.2546899841017488, "grad_norm": 0.23342569172382355, "learning_rate": 4.2511002128658225e-05, "loss": 0.4197, "num_input_tokens_seen": 6245884328, "step": 1602, "train_runtime": 63780.2168, "train_tokens_per_second": 97928.239 }, { "epoch": 0.2548489666136725, "grad_norm": 0.23636117577552795, "learning_rate": 4.250206975468264e-05, "loss": 0.4382, "num_input_tokens_seen": 6249845715, "step": 1603, "train_runtime": 63817.1965, "train_tokens_per_second": 97933.567 }, { "epoch": 0.2550079491255962, "grad_norm": 0.20980630815029144, "learning_rate": 4.249313299656226e-05, "loss": 0.4273, "num_input_tokens_seen": 6253662256, "step": 1604, "train_runtime": 63856.3196, "train_tokens_per_second": 97933.334 }, { "epoch": 0.25516693163751986, "grad_norm": 0.2219458520412445, "learning_rate": 4.2484191856535665e-05, "loss": 0.4267, "num_input_tokens_seen": 6257524431, "step": 1605, "train_runtime": 63896.3569, "train_tokens_per_second": 97932.413 }, { "epoch": 0.25532591414944356, "grad_norm": 0.2379862517118454, "learning_rate": 4.247524633684257e-05, "loss": 0.4258, "num_input_tokens_seen": 6261341718, "step": 1606, "train_runtime": 63934.3843, "train_tokens_per_second": 97933.871 }, { "epoch": 0.25548489666136726, "grad_norm": 0.2199501246213913, "learning_rate": 4.2466296439723744e-05, "loss": 0.4247, "num_input_tokens_seen": 6265298209, "step": 1607, "train_runtime": 63971.3628, "train_tokens_per_second": 97939.108 }, { "epoch": 0.25564387917329096, "grad_norm": 0.2713136374950409, "learning_rate": 4.2457342167421096e-05, "loss": 0.4256, "num_input_tokens_seen": 6269254624, "step": 1608, "train_runtime": 64013.8571, "train_tokens_per_second": 97935.899 }, { "epoch": 0.2558028616852146, "grad_norm": 0.22286270558834076, "learning_rate": 4.244838352217758e-05, "loss": 0.4269, "num_input_tokens_seen": 6273213717, "step": 1609, "train_runtime": 64054.7929, "train_tokens_per_second": 97935.118 }, { "epoch": 0.2559618441971383, "grad_norm": 0.2521105706691742, "learning_rate": 4.243942050623729e-05, "loss": 0.428, "num_input_tokens_seen": 6277251497, "step": 1610, "train_runtime": 64092.3708, "train_tokens_per_second": 97940.697 }, { "epoch": 0.256120826709062, "grad_norm": 0.2709832191467285, "learning_rate": 4.243045312184541e-05, "loss": 0.4471, "num_input_tokens_seen": 6281092376, "step": 1611, "train_runtime": 64132.9261, "train_tokens_per_second": 97938.653 }, { "epoch": 0.2562798092209857, "grad_norm": 0.2630532681941986, "learning_rate": 4.2421481371248186e-05, "loss": 0.4317, "num_input_tokens_seen": 6284871665, "step": 1612, "train_runtime": 64171.6743, "train_tokens_per_second": 97938.409 }, { "epoch": 0.2564387917329094, "grad_norm": 0.2199823409318924, "learning_rate": 4.2412505256692984e-05, "loss": 0.4314, "num_input_tokens_seen": 6288780931, "step": 1613, "train_runtime": 64211.9997, "train_tokens_per_second": 97937.784 }, { "epoch": 0.25659777424483304, "grad_norm": 0.25690269470214844, "learning_rate": 4.240352478042825e-05, "loss": 0.4268, "num_input_tokens_seen": 6292785141, "step": 1614, "train_runtime": 64248.3826, "train_tokens_per_second": 97944.647 }, { "epoch": 0.25675675675675674, "grad_norm": 0.25900596380233765, "learning_rate": 4.2394539944703546e-05, "loss": 0.4288, "num_input_tokens_seen": 6296793883, "step": 1615, "train_runtime": 64290.5013, "train_tokens_per_second": 97942.834 }, { "epoch": 0.25691573926868044, "grad_norm": 0.24139389395713806, "learning_rate": 4.23855507517695e-05, "loss": 0.4229, "num_input_tokens_seen": 6300654758, "step": 1616, "train_runtime": 64329.7988, "train_tokens_per_second": 97943.02 }, { "epoch": 0.25707472178060414, "grad_norm": 0.35140374302864075, "learning_rate": 4.237655720387785e-05, "loss": 0.4314, "num_input_tokens_seen": 6304553757, "step": 1617, "train_runtime": 64368.3256, "train_tokens_per_second": 97944.971 }, { "epoch": 0.25723370429252784, "grad_norm": 0.22778178751468658, "learning_rate": 4.23675593032814e-05, "loss": 0.4316, "num_input_tokens_seen": 6308394172, "step": 1618, "train_runtime": 64399.4267, "train_tokens_per_second": 97957.303 }, { "epoch": 0.2573926868044515, "grad_norm": 0.29728156328201294, "learning_rate": 4.235855705223407e-05, "loss": 0.4288, "num_input_tokens_seen": 6312189368, "step": 1619, "train_runtime": 64440.2508, "train_tokens_per_second": 97954.14 }, { "epoch": 0.2575516693163752, "grad_norm": 0.20435436069965363, "learning_rate": 4.2349550452990864e-05, "loss": 0.4227, "num_input_tokens_seen": 6316199045, "step": 1620, "train_runtime": 64478.8333, "train_tokens_per_second": 97957.713 }, { "epoch": 0.2577106518282989, "grad_norm": 0.23631946742534637, "learning_rate": 4.234053950780787e-05, "loss": 0.4275, "num_input_tokens_seen": 6320195916, "step": 1621, "train_runtime": 64518.4305, "train_tokens_per_second": 97959.542 }, { "epoch": 0.2578696343402226, "grad_norm": 0.2505689561367035, "learning_rate": 4.233152421894227e-05, "loss": 0.4226, "num_input_tokens_seen": 6324090693, "step": 1622, "train_runtime": 64559.7067, "train_tokens_per_second": 97957.24 }, { "epoch": 0.2580286168521463, "grad_norm": 0.2501639425754547, "learning_rate": 4.232250458865231e-05, "loss": 0.436, "num_input_tokens_seen": 6327929977, "step": 1623, "train_runtime": 64599.5202, "train_tokens_per_second": 97956.3 }, { "epoch": 0.25818759936406993, "grad_norm": 0.23173026740550995, "learning_rate": 4.2313480619197354e-05, "loss": 0.4317, "num_input_tokens_seen": 6331709363, "step": 1624, "train_runtime": 64638.6074, "train_tokens_per_second": 97955.535 }, { "epoch": 0.25834658187599363, "grad_norm": 0.22349387407302856, "learning_rate": 4.230445231283785e-05, "loss": 0.4332, "num_input_tokens_seen": 6335548522, "step": 1625, "train_runtime": 64677.2639, "train_tokens_per_second": 97956.347 }, { "epoch": 0.25850556438791733, "grad_norm": 0.2548055648803711, "learning_rate": 4.2295419671835336e-05, "loss": 0.4297, "num_input_tokens_seen": 6339529730, "step": 1626, "train_runtime": 64715.504, "train_tokens_per_second": 97959.984 }, { "epoch": 0.25866454689984103, "grad_norm": 0.21260100603103638, "learning_rate": 4.22863826984524e-05, "loss": 0.4375, "num_input_tokens_seen": 6343374485, "step": 1627, "train_runtime": 64756.2651, "train_tokens_per_second": 97957.695 }, { "epoch": 0.25882352941176473, "grad_norm": 1.2109630107879639, "learning_rate": 4.227734139495274e-05, "loss": 0.427, "num_input_tokens_seen": 6347233740, "step": 1628, "train_runtime": 64796.2302, "train_tokens_per_second": 97956.837 }, { "epoch": 0.2589825119236884, "grad_norm": 0.2255323976278305, "learning_rate": 4.2268295763601153e-05, "loss": 0.4386, "num_input_tokens_seen": 6351096181, "step": 1629, "train_runtime": 64833.1778, "train_tokens_per_second": 97960.587 }, { "epoch": 0.2591414944356121, "grad_norm": 0.24387797713279724, "learning_rate": 4.225924580666351e-05, "loss": 0.4289, "num_input_tokens_seen": 6355029842, "step": 1630, "train_runtime": 64874.1986, "train_tokens_per_second": 97959.281 }, { "epoch": 0.2593004769475358, "grad_norm": 0.3200264573097229, "learning_rate": 4.2250191526406744e-05, "loss": 0.4354, "num_input_tokens_seen": 6359040714, "step": 1631, "train_runtime": 64912.5877, "train_tokens_per_second": 97963.137 }, { "epoch": 0.2594594594594595, "grad_norm": 0.22528725862503052, "learning_rate": 4.2241132925098904e-05, "loss": 0.4293, "num_input_tokens_seen": 6362906324, "step": 1632, "train_runtime": 64952.5078, "train_tokens_per_second": 97962.443 }, { "epoch": 0.2596184419713832, "grad_norm": 0.19278523325920105, "learning_rate": 4.223207000500909e-05, "loss": 0.4327, "num_input_tokens_seen": 6366710752, "step": 1633, "train_runtime": 64991.1574, "train_tokens_per_second": 97962.723 }, { "epoch": 0.2597774244833068, "grad_norm": 0.22521066665649414, "learning_rate": 4.222300276840752e-05, "loss": 0.4261, "num_input_tokens_seen": 6370713837, "step": 1634, "train_runtime": 65031.0183, "train_tokens_per_second": 97964.233 }, { "epoch": 0.2599364069952305, "grad_norm": 0.2315695881843567, "learning_rate": 4.221393121756545e-05, "loss": 0.4446, "num_input_tokens_seen": 6374613381, "step": 1635, "train_runtime": 65066.0046, "train_tokens_per_second": 97971.489 }, { "epoch": 0.2600953895071542, "grad_norm": 0.2213297039270401, "learning_rate": 4.2204855354755244e-05, "loss": 0.446, "num_input_tokens_seen": 6378355149, "step": 1636, "train_runtime": 65107.8924, "train_tokens_per_second": 97965.929 }, { "epoch": 0.2602543720190779, "grad_norm": 0.22744973003864288, "learning_rate": 4.219577518225036e-05, "loss": 0.4388, "num_input_tokens_seen": 6382200632, "step": 1637, "train_runtime": 65150.0831, "train_tokens_per_second": 97961.512 }, { "epoch": 0.2604133545310016, "grad_norm": 0.23452012240886688, "learning_rate": 4.2186690702325296e-05, "loss": 0.438, "num_input_tokens_seen": 6386185940, "step": 1638, "train_runtime": 65191.7408, "train_tokens_per_second": 97960.046 }, { "epoch": 0.26057233704292526, "grad_norm": 0.2273285686969757, "learning_rate": 4.217760191725566e-05, "loss": 0.4325, "num_input_tokens_seen": 6390161225, "step": 1639, "train_runtime": 65230.2825, "train_tokens_per_second": 97963.108 }, { "epoch": 0.26073131955484896, "grad_norm": 0.2709862291812897, "learning_rate": 4.216850882931813e-05, "loss": 0.4217, "num_input_tokens_seen": 6393937389, "step": 1640, "train_runtime": 65267.155, "train_tokens_per_second": 97965.621 }, { "epoch": 0.26089030206677266, "grad_norm": 0.2297467142343521, "learning_rate": 4.2159411440790455e-05, "loss": 0.435, "num_input_tokens_seen": 6397843941, "step": 1641, "train_runtime": 65305.0257, "train_tokens_per_second": 97968.631 }, { "epoch": 0.26104928457869636, "grad_norm": 0.19650858640670776, "learning_rate": 4.2150309753951467e-05, "loss": 0.4361, "num_input_tokens_seen": 6401714703, "step": 1642, "train_runtime": 65342.3356, "train_tokens_per_second": 97971.93 }, { "epoch": 0.26120826709062, "grad_norm": 0.2071325033903122, "learning_rate": 4.214120377108108e-05, "loss": 0.4262, "num_input_tokens_seen": 6405611332, "step": 1643, "train_runtime": 65381.2673, "train_tokens_per_second": 97973.19 }, { "epoch": 0.2613672496025437, "grad_norm": 0.215549498796463, "learning_rate": 4.213209349446027e-05, "loss": 0.4115, "num_input_tokens_seen": 6409506450, "step": 1644, "train_runtime": 65419.8303, "train_tokens_per_second": 97974.978 }, { "epoch": 0.2615262321144674, "grad_norm": 0.23530738055706024, "learning_rate": 4.2122978926371104e-05, "loss": 0.4234, "num_input_tokens_seen": 6413314401, "step": 1645, "train_runtime": 65461.221, "train_tokens_per_second": 97971.2 }, { "epoch": 0.2616852146263911, "grad_norm": 0.20158500969409943, "learning_rate": 4.211386006909671e-05, "loss": 0.4326, "num_input_tokens_seen": 6417277096, "step": 1646, "train_runtime": 65500.0631, "train_tokens_per_second": 97973.602 }, { "epoch": 0.2618441971383148, "grad_norm": 0.22851209342479706, "learning_rate": 4.210473692492131e-05, "loss": 0.4387, "num_input_tokens_seen": 6421318840, "step": 1647, "train_runtime": 65538.5052, "train_tokens_per_second": 97977.804 }, { "epoch": 0.26200317965023845, "grad_norm": 0.20763634145259857, "learning_rate": 4.2095609496130164e-05, "loss": 0.4367, "num_input_tokens_seen": 6425193042, "step": 1648, "train_runtime": 65576.8501, "train_tokens_per_second": 97979.592 }, { "epoch": 0.26216216216216215, "grad_norm": 0.20597338676452637, "learning_rate": 4.208647778500965e-05, "loss": 0.421, "num_input_tokens_seen": 6429067948, "step": 1649, "train_runtime": 65615.9782, "train_tokens_per_second": 97980.22 }, { "epoch": 0.26232114467408585, "grad_norm": 0.21646764874458313, "learning_rate": 4.207734179384719e-05, "loss": 0.4365, "num_input_tokens_seen": 6433078561, "step": 1650, "train_runtime": 65657.8226, "train_tokens_per_second": 97978.859 }, { "epoch": 0.26248012718600955, "grad_norm": 0.23344844579696655, "learning_rate": 4.206820152493129e-05, "loss": 0.4335, "num_input_tokens_seen": 6436928129, "step": 1651, "train_runtime": 65698.7351, "train_tokens_per_second": 97976.439 }, { "epoch": 0.26263910969793325, "grad_norm": 0.34901395440101624, "learning_rate": 4.2059056980551515e-05, "loss": 0.4276, "num_input_tokens_seen": 6440719998, "step": 1652, "train_runtime": 65737.0025, "train_tokens_per_second": 97977.087 }, { "epoch": 0.2627980922098569, "grad_norm": 0.26016518473625183, "learning_rate": 4.204990816299851e-05, "loss": 0.4315, "num_input_tokens_seen": 6444661149, "step": 1653, "train_runtime": 65778.1995, "train_tokens_per_second": 97975.639 }, { "epoch": 0.2629570747217806, "grad_norm": 0.22337821125984192, "learning_rate": 4.204075507456399e-05, "loss": 0.4236, "num_input_tokens_seen": 6448590418, "step": 1654, "train_runtime": 65817.6802, "train_tokens_per_second": 97976.568 }, { "epoch": 0.2631160572337043, "grad_norm": 0.20994625985622406, "learning_rate": 4.203159771754074e-05, "loss": 0.4264, "num_input_tokens_seen": 6452421671, "step": 1655, "train_runtime": 65854.194, "train_tokens_per_second": 97980.421 }, { "epoch": 0.263275039745628, "grad_norm": 0.2150062620639801, "learning_rate": 4.2022436094222615e-05, "loss": 0.4426, "num_input_tokens_seen": 6456198220, "step": 1656, "train_runtime": 65890.6037, "train_tokens_per_second": 97983.595 }, { "epoch": 0.2634340222575517, "grad_norm": 0.2485087364912033, "learning_rate": 4.201327020690453e-05, "loss": 0.422, "num_input_tokens_seen": 6460098214, "step": 1657, "train_runtime": 65929.1312, "train_tokens_per_second": 97985.49 }, { "epoch": 0.26359300476947534, "grad_norm": 0.36049970984458923, "learning_rate": 4.200410005788249e-05, "loss": 0.4437, "num_input_tokens_seen": 6464097902, "step": 1658, "train_runtime": 65967.9923, "train_tokens_per_second": 97988.398 }, { "epoch": 0.26375198728139904, "grad_norm": 0.22090047597885132, "learning_rate": 4.199492564945353e-05, "loss": 0.427, "num_input_tokens_seen": 6467929911, "step": 1659, "train_runtime": 66007.4706, "train_tokens_per_second": 97987.847 }, { "epoch": 0.26391096979332274, "grad_norm": 0.21924667060375214, "learning_rate": 4.1985746983915795e-05, "loss": 0.4334, "num_input_tokens_seen": 6471906410, "step": 1660, "train_runtime": 66043.92, "train_tokens_per_second": 97993.978 }, { "epoch": 0.26406995230524644, "grad_norm": 0.21682091057300568, "learning_rate": 4.197656406356847e-05, "loss": 0.4282, "num_input_tokens_seen": 6475814427, "step": 1661, "train_runtime": 66081.2555, "train_tokens_per_second": 97997.751 }, { "epoch": 0.26422893481717014, "grad_norm": 0.22741378843784332, "learning_rate": 4.196737689071181e-05, "loss": 0.4367, "num_input_tokens_seen": 6479722794, "step": 1662, "train_runtime": 66118.7388, "train_tokens_per_second": 98001.307 }, { "epoch": 0.2643879173290938, "grad_norm": 0.23739810287952423, "learning_rate": 4.195818546764713e-05, "loss": 0.4414, "num_input_tokens_seen": 6483485406, "step": 1663, "train_runtime": 66158.6671, "train_tokens_per_second": 97999.033 }, { "epoch": 0.2645468998410175, "grad_norm": 0.21869824826717377, "learning_rate": 4.1948989796676816e-05, "loss": 0.4241, "num_input_tokens_seen": 6487451511, "step": 1664, "train_runtime": 66198.7635, "train_tokens_per_second": 97999.587 }, { "epoch": 0.2647058823529412, "grad_norm": 0.24855270981788635, "learning_rate": 4.1939789880104344e-05, "loss": 0.4397, "num_input_tokens_seen": 6491396524, "step": 1665, "train_runtime": 66238.0731, "train_tokens_per_second": 98000.987 }, { "epoch": 0.2648648648648649, "grad_norm": 0.22374753654003143, "learning_rate": 4.1930585720234196e-05, "loss": 0.4351, "num_input_tokens_seen": 6495309851, "step": 1666, "train_runtime": 66276.1308, "train_tokens_per_second": 98003.758 }, { "epoch": 0.2650238473767886, "grad_norm": 0.2149195671081543, "learning_rate": 4.192137731937197e-05, "loss": 0.4242, "num_input_tokens_seen": 6499100914, "step": 1667, "train_runtime": 66314.9495, "train_tokens_per_second": 98003.557 }, { "epoch": 0.2651828298887122, "grad_norm": 0.20820581912994385, "learning_rate": 4.191216467982429e-05, "loss": 0.4405, "num_input_tokens_seen": 6503142542, "step": 1668, "train_runtime": 66353.4748, "train_tokens_per_second": 98007.566 }, { "epoch": 0.2653418124006359, "grad_norm": 0.24792206287384033, "learning_rate": 4.190294780389887e-05, "loss": 0.4294, "num_input_tokens_seen": 6506970727, "step": 1669, "train_runtime": 66393.665, "train_tokens_per_second": 98005.898 }, { "epoch": 0.2655007949125596, "grad_norm": 0.26141929626464844, "learning_rate": 4.189372669390447e-05, "loss": 0.4236, "num_input_tokens_seen": 6510898381, "step": 1670, "train_runtime": 66432.3883, "train_tokens_per_second": 98007.893 }, { "epoch": 0.2656597774244833, "grad_norm": 0.2215668261051178, "learning_rate": 4.188450135215091e-05, "loss": 0.4326, "num_input_tokens_seen": 6514775964, "step": 1671, "train_runtime": 66472.2729, "train_tokens_per_second": 98007.42 }, { "epoch": 0.26581875993640697, "grad_norm": 0.353622168302536, "learning_rate": 4.1875271780949066e-05, "loss": 0.4409, "num_input_tokens_seen": 6518680959, "step": 1672, "train_runtime": 66512.5934, "train_tokens_per_second": 98006.718 }, { "epoch": 0.26597774244833067, "grad_norm": 0.23127798736095428, "learning_rate": 4.186603798261089e-05, "loss": 0.4321, "num_input_tokens_seen": 6522541706, "step": 1673, "train_runtime": 66552.3217, "train_tokens_per_second": 98006.223 }, { "epoch": 0.26613672496025437, "grad_norm": 0.20166607201099396, "learning_rate": 4.185679995944938e-05, "loss": 0.4231, "num_input_tokens_seen": 6526427480, "step": 1674, "train_runtime": 66591.9681, "train_tokens_per_second": 98006.226 }, { "epoch": 0.26629570747217807, "grad_norm": 0.29787102341651917, "learning_rate": 4.1847557713778596e-05, "loss": 0.4231, "num_input_tokens_seen": 6530177060, "step": 1675, "train_runtime": 66631.9957, "train_tokens_per_second": 98003.624 }, { "epoch": 0.26645468998410177, "grad_norm": 0.2502807676792145, "learning_rate": 4.183831124791366e-05, "loss": 0.419, "num_input_tokens_seen": 6534179759, "step": 1676, "train_runtime": 66673.0656, "train_tokens_per_second": 98003.29 }, { "epoch": 0.2666136724960254, "grad_norm": 0.1920136958360672, "learning_rate": 4.182906056417074e-05, "loss": 0.4264, "num_input_tokens_seen": 6538040219, "step": 1677, "train_runtime": 66711.6649, "train_tokens_per_second": 98004.453 }, { "epoch": 0.2667726550079491, "grad_norm": 0.23709408938884735, "learning_rate": 4.1819805664867076e-05, "loss": 0.4156, "num_input_tokens_seen": 6541847729, "step": 1678, "train_runtime": 66746.9075, "train_tokens_per_second": 98009.75 }, { "epoch": 0.2669316375198728, "grad_norm": 0.21735505759716034, "learning_rate": 4.1810546552320926e-05, "loss": 0.4292, "num_input_tokens_seen": 6545810494, "step": 1679, "train_runtime": 66788.7073, "train_tokens_per_second": 98007.743 }, { "epoch": 0.2670906200317965, "grad_norm": 0.20333266258239746, "learning_rate": 4.1801283228851676e-05, "loss": 0.4248, "num_input_tokens_seen": 6549753174, "step": 1680, "train_runtime": 66831.2835, "train_tokens_per_second": 98004.3 }, { "epoch": 0.2672496025437202, "grad_norm": 0.2310853898525238, "learning_rate": 4.1792015696779694e-05, "loss": 0.4374, "num_input_tokens_seen": 6553631532, "step": 1681, "train_runtime": 66869.9086, "train_tokens_per_second": 98005.69 }, { "epoch": 0.26740858505564385, "grad_norm": 0.22020487487316132, "learning_rate": 4.178274395842643e-05, "loss": 0.4263, "num_input_tokens_seen": 6557497942, "step": 1682, "train_runtime": 66908.6825, "train_tokens_per_second": 98006.682 }, { "epoch": 0.26756756756756755, "grad_norm": 0.24456191062927246, "learning_rate": 4.177346801611441e-05, "loss": 0.446, "num_input_tokens_seen": 6561391685, "step": 1683, "train_runtime": 66948.9538, "train_tokens_per_second": 98005.888 }, { "epoch": 0.26772655007949125, "grad_norm": 0.31931057572364807, "learning_rate": 4.176418787216717e-05, "loss": 0.43, "num_input_tokens_seen": 6565298667, "step": 1684, "train_runtime": 66988.2989, "train_tokens_per_second": 98006.649 }, { "epoch": 0.26788553259141495, "grad_norm": 0.22040483355522156, "learning_rate": 4.175490352890934e-05, "loss": 0.4324, "num_input_tokens_seen": 6569108318, "step": 1685, "train_runtime": 67026.4804, "train_tokens_per_second": 98007.657 }, { "epoch": 0.26804451510333865, "grad_norm": 0.23977459967136383, "learning_rate": 4.174561498866656e-05, "loss": 0.4298, "num_input_tokens_seen": 6572965073, "step": 1686, "train_runtime": 67066.482, "train_tokens_per_second": 98006.707 }, { "epoch": 0.2682034976152623, "grad_norm": 0.21981874108314514, "learning_rate": 4.1736322253765564e-05, "loss": 0.4236, "num_input_tokens_seen": 6576812112, "step": 1687, "train_runtime": 67105.7595, "train_tokens_per_second": 98006.671 }, { "epoch": 0.268362480127186, "grad_norm": 0.19721566140651703, "learning_rate": 4.1727025326534105e-05, "loss": 0.4231, "num_input_tokens_seen": 6580793071, "step": 1688, "train_runtime": 67144.2602, "train_tokens_per_second": 98009.764 }, { "epoch": 0.2685214626391097, "grad_norm": 0.21298231184482574, "learning_rate": 4.1717724209301e-05, "loss": 0.425, "num_input_tokens_seen": 6584719378, "step": 1689, "train_runtime": 67179.9361, "train_tokens_per_second": 98016.16 }, { "epoch": 0.2686804451510334, "grad_norm": 0.26154831051826477, "learning_rate": 4.170841890439612e-05, "loss": 0.4312, "num_input_tokens_seen": 6588557530, "step": 1690, "train_runtime": 67220.668, "train_tokens_per_second": 98013.866 }, { "epoch": 0.2688394276629571, "grad_norm": 0.24506910145282745, "learning_rate": 4.169910941415036e-05, "loss": 0.4138, "num_input_tokens_seen": 6592441729, "step": 1691, "train_runtime": 67260.752, "train_tokens_per_second": 98013.203 }, { "epoch": 0.26899841017488074, "grad_norm": 0.20526020228862762, "learning_rate": 4.1689795740895695e-05, "loss": 0.4174, "num_input_tokens_seen": 6596415430, "step": 1692, "train_runtime": 67299.9493, "train_tokens_per_second": 98015.162 }, { "epoch": 0.26915739268680444, "grad_norm": 0.2279944270849228, "learning_rate": 4.168047788696514e-05, "loss": 0.4404, "num_input_tokens_seen": 6600143140, "step": 1693, "train_runtime": 67339.0669, "train_tokens_per_second": 98013.582 }, { "epoch": 0.26931637519872814, "grad_norm": 0.21813543140888214, "learning_rate": 4.1671155854692746e-05, "loss": 0.4202, "num_input_tokens_seen": 6604111324, "step": 1694, "train_runtime": 67378.0998, "train_tokens_per_second": 98015.696 }, { "epoch": 0.26947535771065184, "grad_norm": 0.29280349612236023, "learning_rate": 4.1661829646413615e-05, "loss": 0.4427, "num_input_tokens_seen": 6607986706, "step": 1695, "train_runtime": 67414.2189, "train_tokens_per_second": 98020.667 }, { "epoch": 0.26963434022257554, "grad_norm": 0.2129502147436142, "learning_rate": 4.165249926446389e-05, "loss": 0.4359, "num_input_tokens_seen": 6611956581, "step": 1696, "train_runtime": 67453.8233, "train_tokens_per_second": 98021.969 }, { "epoch": 0.2697933227344992, "grad_norm": 0.2114795446395874, "learning_rate": 4.164316471118077e-05, "loss": 0.4326, "num_input_tokens_seen": 6615713661, "step": 1697, "train_runtime": 67493.4437, "train_tokens_per_second": 98020.093 }, { "epoch": 0.2699523052464229, "grad_norm": 0.23678241670131683, "learning_rate": 4.1633825988902505e-05, "loss": 0.4262, "num_input_tokens_seen": 6619750163, "step": 1698, "train_runtime": 67529.4981, "train_tokens_per_second": 98027.534 }, { "epoch": 0.2701112877583466, "grad_norm": 0.23402152955532074, "learning_rate": 4.162448309996837e-05, "loss": 0.4282, "num_input_tokens_seen": 6623594959, "step": 1699, "train_runtime": 67569.1938, "train_tokens_per_second": 98026.846 }, { "epoch": 0.2702702702702703, "grad_norm": 0.1935535967350006, "learning_rate": 4.1615136046718684e-05, "loss": 0.4278, "num_input_tokens_seen": 6627335639, "step": 1700, "train_runtime": 67607.4889, "train_tokens_per_second": 98026.65 }, { "epoch": 0.270429252782194, "grad_norm": 0.23931072652339935, "learning_rate": 4.160578483149483e-05, "loss": 0.4366, "num_input_tokens_seen": 6631380253, "step": 1701, "train_runtime": 67648.104, "train_tokens_per_second": 98027.585 }, { "epoch": 0.27058823529411763, "grad_norm": 0.214886873960495, "learning_rate": 4.1596429456639216e-05, "loss": 0.4298, "num_input_tokens_seen": 6635226953, "step": 1702, "train_runtime": 67684.4033, "train_tokens_per_second": 98031.845 }, { "epoch": 0.27074721780604133, "grad_norm": 0.21270975470542908, "learning_rate": 4.158706992449531e-05, "loss": 0.4315, "num_input_tokens_seen": 6639212462, "step": 1703, "train_runtime": 67722.4048, "train_tokens_per_second": 98035.687 }, { "epoch": 0.27090620031796503, "grad_norm": 0.32732686400413513, "learning_rate": 4.157770623740759e-05, "loss": 0.42, "num_input_tokens_seen": 6643109355, "step": 1704, "train_runtime": 67763.4005, "train_tokens_per_second": 98033.884 }, { "epoch": 0.27106518282988873, "grad_norm": 0.214927539229393, "learning_rate": 4.15683383977216e-05, "loss": 0.4225, "num_input_tokens_seen": 6646960145, "step": 1705, "train_runtime": 67800.9555, "train_tokens_per_second": 98036.379 }, { "epoch": 0.2712241653418124, "grad_norm": 0.2936553359031677, "learning_rate": 4.155896640778392e-05, "loss": 0.4391, "num_input_tokens_seen": 6650885608, "step": 1706, "train_runtime": 67838.1613, "train_tokens_per_second": 98040.476 }, { "epoch": 0.2713831478537361, "grad_norm": 0.2720873951911926, "learning_rate": 4.1549590269942176e-05, "loss": 0.4263, "num_input_tokens_seen": 6654787958, "step": 1707, "train_runtime": 67873.8854, "train_tokens_per_second": 98046.368 }, { "epoch": 0.2715421303656598, "grad_norm": 0.21888332068920135, "learning_rate": 4.1540209986545e-05, "loss": 0.415, "num_input_tokens_seen": 6658622469, "step": 1708, "train_runtime": 67914.5498, "train_tokens_per_second": 98044.123 }, { "epoch": 0.2717011128775835, "grad_norm": 0.21105322241783142, "learning_rate": 4.153082555994211e-05, "loss": 0.4235, "num_input_tokens_seen": 6662598077, "step": 1709, "train_runtime": 67952.7595, "train_tokens_per_second": 98047.498 }, { "epoch": 0.2718600953895072, "grad_norm": 0.2291017770767212, "learning_rate": 4.152143699248422e-05, "loss": 0.422, "num_input_tokens_seen": 6666469019, "step": 1710, "train_runtime": 67991.7668, "train_tokens_per_second": 98048.18 }, { "epoch": 0.2720190779014308, "grad_norm": 0.3440234065055847, "learning_rate": 4.151204428652312e-05, "loss": 0.4226, "num_input_tokens_seen": 6670353844, "step": 1711, "train_runtime": 68032.0683, "train_tokens_per_second": 98047.2 }, { "epoch": 0.2721780604133545, "grad_norm": 0.233799010515213, "learning_rate": 4.150264744441161e-05, "loss": 0.4188, "num_input_tokens_seen": 6674284922, "step": 1712, "train_runtime": 68071.4836, "train_tokens_per_second": 98048.178 }, { "epoch": 0.2723370429252782, "grad_norm": 0.2229311615228653, "learning_rate": 4.1493246468503515e-05, "loss": 0.4339, "num_input_tokens_seen": 6678189641, "step": 1713, "train_runtime": 68110.992, "train_tokens_per_second": 98048.633 }, { "epoch": 0.2724960254372019, "grad_norm": 0.20918907225131989, "learning_rate": 4.148384136115374e-05, "loss": 0.4249, "num_input_tokens_seen": 6682110930, "step": 1714, "train_runtime": 68150.7087, "train_tokens_per_second": 98049.031 }, { "epoch": 0.2726550079491256, "grad_norm": 0.22014866769313812, "learning_rate": 4.147443212471818e-05, "loss": 0.4366, "num_input_tokens_seen": 6686019635, "step": 1715, "train_runtime": 68189.8795, "train_tokens_per_second": 98050.029 }, { "epoch": 0.27281399046104926, "grad_norm": 0.23047280311584473, "learning_rate": 4.146501876155378e-05, "loss": 0.4302, "num_input_tokens_seen": 6689875610, "step": 1716, "train_runtime": 68226.7916, "train_tokens_per_second": 98053.499 }, { "epoch": 0.27297297297297296, "grad_norm": 0.23246312141418457, "learning_rate": 4.145560127401853e-05, "loss": 0.4329, "num_input_tokens_seen": 6693830375, "step": 1717, "train_runtime": 68263.1449, "train_tokens_per_second": 98059.215 }, { "epoch": 0.27313195548489666, "grad_norm": 0.2204269915819168, "learning_rate": 4.1446179664471454e-05, "loss": 0.4194, "num_input_tokens_seen": 6697714301, "step": 1718, "train_runtime": 68303.3787, "train_tokens_per_second": 98058.316 }, { "epoch": 0.27329093799682036, "grad_norm": 0.2167212814092636, "learning_rate": 4.143675393527257e-05, "loss": 0.4248, "num_input_tokens_seen": 6701634402, "step": 1719, "train_runtime": 68342.5611, "train_tokens_per_second": 98059.457 }, { "epoch": 0.27344992050874406, "grad_norm": 0.22450551390647888, "learning_rate": 4.142732408878298e-05, "loss": 0.4249, "num_input_tokens_seen": 6705457855, "step": 1720, "train_runtime": 68382.3609, "train_tokens_per_second": 98058.297 }, { "epoch": 0.2736089030206677, "grad_norm": 0.20208485424518585, "learning_rate": 4.141789012736479e-05, "loss": 0.43, "num_input_tokens_seen": 6709402962, "step": 1721, "train_runtime": 68420.1179, "train_tokens_per_second": 98061.844 }, { "epoch": 0.2737678855325914, "grad_norm": 0.22661365568637848, "learning_rate": 4.140845205338114e-05, "loss": 0.4263, "num_input_tokens_seen": 6713405056, "step": 1722, "train_runtime": 68461.1659, "train_tokens_per_second": 98061.506 }, { "epoch": 0.2739268680445151, "grad_norm": 0.23645596206188202, "learning_rate": 4.139900986919619e-05, "loss": 0.424, "num_input_tokens_seen": 6717277276, "step": 1723, "train_runtime": 68499.9059, "train_tokens_per_second": 98062.577 }, { "epoch": 0.2740858505564388, "grad_norm": 0.21798105537891388, "learning_rate": 4.138956357717515e-05, "loss": 0.4429, "num_input_tokens_seen": 6721217401, "step": 1724, "train_runtime": 68539.414, "train_tokens_per_second": 98063.538 }, { "epoch": 0.2742448330683625, "grad_norm": 0.24901635944843292, "learning_rate": 4.138011317968425e-05, "loss": 0.4155, "num_input_tokens_seen": 6724984088, "step": 1725, "train_runtime": 68576.45, "train_tokens_per_second": 98065.503 }, { "epoch": 0.27440381558028615, "grad_norm": 0.22650250792503357, "learning_rate": 4.1370658679090754e-05, "loss": 0.4209, "num_input_tokens_seen": 6728836422, "step": 1726, "train_runtime": 68616.0497, "train_tokens_per_second": 98065.051 }, { "epoch": 0.27456279809220985, "grad_norm": 0.23121586441993713, "learning_rate": 4.136120007776293e-05, "loss": 0.4253, "num_input_tokens_seen": 6732730056, "step": 1727, "train_runtime": 68653.9726, "train_tokens_per_second": 98067.596 }, { "epoch": 0.27472178060413355, "grad_norm": 0.2314501851797104, "learning_rate": 4.13517373780701e-05, "loss": 0.4323, "num_input_tokens_seen": 6736633558, "step": 1728, "train_runtime": 68691.7824, "train_tokens_per_second": 98070.443 }, { "epoch": 0.27488076311605725, "grad_norm": 0.20568262040615082, "learning_rate": 4.134227058238261e-05, "loss": 0.4386, "num_input_tokens_seen": 6740485790, "step": 1729, "train_runtime": 68730.4844, "train_tokens_per_second": 98071.269 }, { "epoch": 0.27503974562798095, "grad_norm": 0.24130980670452118, "learning_rate": 4.133279969307182e-05, "loss": 0.4319, "num_input_tokens_seen": 6744376542, "step": 1730, "train_runtime": 68767.6479, "train_tokens_per_second": 98074.847 }, { "epoch": 0.2751987281399046, "grad_norm": 0.2220749408006668, "learning_rate": 4.132332471251011e-05, "loss": 0.4294, "num_input_tokens_seen": 6748231624, "step": 1731, "train_runtime": 68804.175, "train_tokens_per_second": 98078.81 }, { "epoch": 0.2753577106518283, "grad_norm": 1.784227967262268, "learning_rate": 4.131384564307092e-05, "loss": 0.4212, "num_input_tokens_seen": 6752067905, "step": 1732, "train_runtime": 68841.946, "train_tokens_per_second": 98080.724 }, { "epoch": 0.275516693163752, "grad_norm": 0.24676541984081268, "learning_rate": 4.130436248712867e-05, "loss": 0.4275, "num_input_tokens_seen": 6756036346, "step": 1733, "train_runtime": 68879.9732, "train_tokens_per_second": 98084.19 }, { "epoch": 0.2756756756756757, "grad_norm": 0.2267119586467743, "learning_rate": 4.1294875247058824e-05, "loss": 0.4428, "num_input_tokens_seen": 6759918186, "step": 1734, "train_runtime": 68921.1011, "train_tokens_per_second": 98081.982 }, { "epoch": 0.2758346581875994, "grad_norm": 0.20784114301204681, "learning_rate": 4.1285383925237876e-05, "loss": 0.4289, "num_input_tokens_seen": 6763892612, "step": 1735, "train_runtime": 68957.8744, "train_tokens_per_second": 98087.313 }, { "epoch": 0.27599364069952304, "grad_norm": 0.21001118421554565, "learning_rate": 4.1275888524043336e-05, "loss": 0.4443, "num_input_tokens_seen": 6767741985, "step": 1736, "train_runtime": 68995.9381, "train_tokens_per_second": 98088.991 }, { "epoch": 0.27615262321144673, "grad_norm": 0.22866754233837128, "learning_rate": 4.1266389045853734e-05, "loss": 0.4232, "num_input_tokens_seen": 6771503576, "step": 1737, "train_runtime": 69034.0845, "train_tokens_per_second": 98089.279 }, { "epoch": 0.27631160572337043, "grad_norm": 0.1993308663368225, "learning_rate": 4.1256885493048616e-05, "loss": 0.4235, "num_input_tokens_seen": 6775525757, "step": 1738, "train_runtime": 69072.0326, "train_tokens_per_second": 98093.621 }, { "epoch": 0.27647058823529413, "grad_norm": 0.2227138876914978, "learning_rate": 4.124737786800856e-05, "loss": 0.4324, "num_input_tokens_seen": 6779482438, "step": 1739, "train_runtime": 69112.9364, "train_tokens_per_second": 98092.814 }, { "epoch": 0.2766295707472178, "grad_norm": 0.2037983387708664, "learning_rate": 4.123786617311516e-05, "loss": 0.4314, "num_input_tokens_seen": 6783298302, "step": 1740, "train_runtime": 69150.4349, "train_tokens_per_second": 98094.803 }, { "epoch": 0.2767885532591415, "grad_norm": 0.21995307505130768, "learning_rate": 4.122835041075101e-05, "loss": 0.4285, "num_input_tokens_seen": 6787083764, "step": 1741, "train_runtime": 69189.6563, "train_tokens_per_second": 98093.908 }, { "epoch": 0.2769475357710652, "grad_norm": 0.25683119893074036, "learning_rate": 4.121883058329978e-05, "loss": 0.4304, "num_input_tokens_seen": 6790996605, "step": 1742, "train_runtime": 69230.3072, "train_tokens_per_second": 98092.828 }, { "epoch": 0.2771065182829889, "grad_norm": 0.2180788815021515, "learning_rate": 4.120930669314609e-05, "loss": 0.4362, "num_input_tokens_seen": 6795041517, "step": 1743, "train_runtime": 69270.9932, "train_tokens_per_second": 98093.606 }, { "epoch": 0.2772655007949126, "grad_norm": 0.526923418045044, "learning_rate": 4.119977874267561e-05, "loss": 0.4286, "num_input_tokens_seen": 6798722960, "step": 1744, "train_runtime": 69308.8071, "train_tokens_per_second": 98093.204 }, { "epoch": 0.2774244833068362, "grad_norm": 0.2146320641040802, "learning_rate": 4.119024673427503e-05, "loss": 0.4235, "num_input_tokens_seen": 6802743960, "step": 1745, "train_runtime": 69347.9935, "train_tokens_per_second": 98095.758 }, { "epoch": 0.2775834658187599, "grad_norm": 0.25091105699539185, "learning_rate": 4.1180710670332046e-05, "loss": 0.4167, "num_input_tokens_seen": 6806734365, "step": 1746, "train_runtime": 69387.0414, "train_tokens_per_second": 98098.063 }, { "epoch": 0.2777424483306836, "grad_norm": 0.21599316596984863, "learning_rate": 4.117117055323538e-05, "loss": 0.4266, "num_input_tokens_seen": 6810641483, "step": 1747, "train_runtime": 69422.85, "train_tokens_per_second": 98103.744 }, { "epoch": 0.2779014308426073, "grad_norm": 0.2511465549468994, "learning_rate": 4.116162638537476e-05, "loss": 0.4397, "num_input_tokens_seen": 6814441484, "step": 1748, "train_runtime": 69459.1159, "train_tokens_per_second": 98107.23 }, { "epoch": 0.278060413354531, "grad_norm": 0.2718942165374756, "learning_rate": 4.115207816914094e-05, "loss": 0.4325, "num_input_tokens_seen": 6818336234, "step": 1749, "train_runtime": 69498.6483, "train_tokens_per_second": 98107.465 }, { "epoch": 0.27821939586645467, "grad_norm": 0.2296641767024994, "learning_rate": 4.1142525906925674e-05, "loss": 0.4331, "num_input_tokens_seen": 6822280814, "step": 1750, "train_runtime": 69539.4738, "train_tokens_per_second": 98106.592 }, { "epoch": 0.27837837837837837, "grad_norm": 0.23681560158729553, "learning_rate": 4.1132969601121735e-05, "loss": 0.4232, "num_input_tokens_seen": 6826195374, "step": 1751, "train_runtime": 69577.3225, "train_tokens_per_second": 98109.486 }, { "epoch": 0.27853736089030207, "grad_norm": 0.22063465416431427, "learning_rate": 4.112340925412291e-05, "loss": 0.4352, "num_input_tokens_seen": 6830171028, "step": 1752, "train_runtime": 69615.4612, "train_tokens_per_second": 98112.846 }, { "epoch": 0.27869634340222577, "grad_norm": 0.3368675410747528, "learning_rate": 4.1113844868324e-05, "loss": 0.4202, "num_input_tokens_seen": 6834018371, "step": 1753, "train_runtime": 69656.2895, "train_tokens_per_second": 98110.571 }, { "epoch": 0.27885532591414947, "grad_norm": 0.2388824075460434, "learning_rate": 4.110427644612083e-05, "loss": 0.4323, "num_input_tokens_seen": 6838036937, "step": 1754, "train_runtime": 69695.2101, "train_tokens_per_second": 98113.442 }, { "epoch": 0.2790143084260731, "grad_norm": 0.23530228435993195, "learning_rate": 4.1094703989910187e-05, "loss": 0.4435, "num_input_tokens_seen": 6841834458, "step": 1755, "train_runtime": 69733.3924, "train_tokens_per_second": 98114.178 }, { "epoch": 0.2791732909379968, "grad_norm": 0.21554866433143616, "learning_rate": 4.108512750208994e-05, "loss": 0.4239, "num_input_tokens_seen": 6845689745, "step": 1756, "train_runtime": 69773.2725, "train_tokens_per_second": 98113.353 }, { "epoch": 0.2793322734499205, "grad_norm": 0.21869724988937378, "learning_rate": 4.107554698505891e-05, "loss": 0.4225, "num_input_tokens_seen": 6849693768, "step": 1757, "train_runtime": 69812.836, "train_tokens_per_second": 98115.105 }, { "epoch": 0.2794912559618442, "grad_norm": 0.20006251335144043, "learning_rate": 4.106596244121697e-05, "loss": 0.4167, "num_input_tokens_seen": 6853514588, "step": 1758, "train_runtime": 69851.4743, "train_tokens_per_second": 98115.532 }, { "epoch": 0.2796502384737679, "grad_norm": 0.22707436978816986, "learning_rate": 4.105637387296495e-05, "loss": 0.4325, "num_input_tokens_seen": 6857397191, "step": 1759, "train_runtime": 69889.9539, "train_tokens_per_second": 98117.066 }, { "epoch": 0.27980922098569155, "grad_norm": 0.2383897602558136, "learning_rate": 4.104678128270474e-05, "loss": 0.4372, "num_input_tokens_seen": 6861381367, "step": 1760, "train_runtime": 69930.0278, "train_tokens_per_second": 98117.813 }, { "epoch": 0.27996820349761525, "grad_norm": 0.27352166175842285, "learning_rate": 4.103718467283921e-05, "loss": 0.4395, "num_input_tokens_seen": 6865250383, "step": 1761, "train_runtime": 69965.2979, "train_tokens_per_second": 98123.65 }, { "epoch": 0.28012718600953895, "grad_norm": 0.2149377316236496, "learning_rate": 4.102758404577224e-05, "loss": 0.4244, "num_input_tokens_seen": 6868949175, "step": 1762, "train_runtime": 70002.0205, "train_tokens_per_second": 98125.013 }, { "epoch": 0.28028616852146265, "grad_norm": 0.2165600061416626, "learning_rate": 4.101797940390873e-05, "loss": 0.4379, "num_input_tokens_seen": 6872989721, "step": 1763, "train_runtime": 70040.639, "train_tokens_per_second": 98128.598 }, { "epoch": 0.28044515103338635, "grad_norm": 0.21467019617557526, "learning_rate": 4.100837074965457e-05, "loss": 0.4385, "num_input_tokens_seen": 6876921648, "step": 1764, "train_runtime": 70080.9069, "train_tokens_per_second": 98128.32 }, { "epoch": 0.28060413354531, "grad_norm": 0.21903984248638153, "learning_rate": 4.099875808541665e-05, "loss": 0.4335, "num_input_tokens_seen": 6880692224, "step": 1765, "train_runtime": 70120.5067, "train_tokens_per_second": 98126.676 }, { "epoch": 0.2807631160572337, "grad_norm": 0.23103639483451843, "learning_rate": 4.098914141360289e-05, "loss": 0.4414, "num_input_tokens_seen": 6884583862, "step": 1766, "train_runtime": 70162.7757, "train_tokens_per_second": 98123.026 }, { "epoch": 0.2809220985691574, "grad_norm": 0.23718608915805817, "learning_rate": 4.097952073662218e-05, "loss": 0.4157, "num_input_tokens_seen": 6888480212, "step": 1767, "train_runtime": 70203.021, "train_tokens_per_second": 98122.276 }, { "epoch": 0.2810810810810811, "grad_norm": 0.21133562922477722, "learning_rate": 4.096989605688445e-05, "loss": 0.4271, "num_input_tokens_seen": 6892488954, "step": 1768, "train_runtime": 70240.9462, "train_tokens_per_second": 98126.368 }, { "epoch": 0.2812400635930048, "grad_norm": 0.34382325410842896, "learning_rate": 4.096026737680061e-05, "loss": 0.4305, "num_input_tokens_seen": 6896419101, "step": 1769, "train_runtime": 70282.1804, "train_tokens_per_second": 98124.718 }, { "epoch": 0.28139904610492844, "grad_norm": 0.23606127500534058, "learning_rate": 4.0950634698782564e-05, "loss": 0.4112, "num_input_tokens_seen": 6900290166, "step": 1770, "train_runtime": 70322.1515, "train_tokens_per_second": 98123.991 }, { "epoch": 0.28155802861685214, "grad_norm": 0.283515065908432, "learning_rate": 4.094099802524325e-05, "loss": 0.4265, "num_input_tokens_seen": 6904286841, "step": 1771, "train_runtime": 70360.4834, "train_tokens_per_second": 98127.337 }, { "epoch": 0.28171701112877584, "grad_norm": 0.2244548797607422, "learning_rate": 4.093135735859657e-05, "loss": 0.4304, "num_input_tokens_seen": 6908145354, "step": 1772, "train_runtime": 70397.6301, "train_tokens_per_second": 98130.368 }, { "epoch": 0.28187599364069954, "grad_norm": 0.2585252523422241, "learning_rate": 4.092171270125744e-05, "loss": 0.4213, "num_input_tokens_seen": 6911960160, "step": 1773, "train_runtime": 70435.4261, "train_tokens_per_second": 98131.871 }, { "epoch": 0.2820349761526232, "grad_norm": 0.23800334334373474, "learning_rate": 4.0912064055641797e-05, "loss": 0.4386, "num_input_tokens_seen": 6915837457, "step": 1774, "train_runtime": 70475.2934, "train_tokens_per_second": 98131.375 }, { "epoch": 0.2821939586645469, "grad_norm": 0.23060303926467896, "learning_rate": 4.090241142416655e-05, "loss": 0.4388, "num_input_tokens_seen": 6919675592, "step": 1775, "train_runtime": 70517.1363, "train_tokens_per_second": 98127.575 }, { "epoch": 0.2823529411764706, "grad_norm": 0.22966407239437103, "learning_rate": 4.08927548092496e-05, "loss": 0.414, "num_input_tokens_seen": 6923698327, "step": 1776, "train_runtime": 70556.0697, "train_tokens_per_second": 98130.442 }, { "epoch": 0.2825119236883943, "grad_norm": 0.21224075555801392, "learning_rate": 4.088309421330989e-05, "loss": 0.4156, "num_input_tokens_seen": 6927507580, "step": 1777, "train_runtime": 70595.8612, "train_tokens_per_second": 98129.09 }, { "epoch": 0.282670906200318, "grad_norm": 0.20861485600471497, "learning_rate": 4.0873429638767305e-05, "loss": 0.4236, "num_input_tokens_seen": 6931409477, "step": 1778, "train_runtime": 70637.1523, "train_tokens_per_second": 98126.966 }, { "epoch": 0.28282988871224163, "grad_norm": 0.21924300491809845, "learning_rate": 4.086376108804276e-05, "loss": 0.4408, "num_input_tokens_seen": 6935199314, "step": 1779, "train_runtime": 70675.1124, "train_tokens_per_second": 98127.885 }, { "epoch": 0.28298887122416533, "grad_norm": 0.1973802000284195, "learning_rate": 4.085408856355817e-05, "loss": 0.4349, "num_input_tokens_seen": 6939219599, "step": 1780, "train_runtime": 70716.6191, "train_tokens_per_second": 98127.14 }, { "epoch": 0.28314785373608903, "grad_norm": 0.22719988226890564, "learning_rate": 4.084441206773642e-05, "loss": 0.437, "num_input_tokens_seen": 6943099138, "step": 1781, "train_runtime": 70756.1299, "train_tokens_per_second": 98127.175 }, { "epoch": 0.28330683624801273, "grad_norm": 0.22836968302726746, "learning_rate": 4.083473160300142e-05, "loss": 0.4381, "num_input_tokens_seen": 6946990416, "step": 1782, "train_runtime": 70792.6938, "train_tokens_per_second": 98131.46 }, { "epoch": 0.2834658187599364, "grad_norm": 0.240984246134758, "learning_rate": 4.082504717177803e-05, "loss": 0.4324, "num_input_tokens_seen": 6950873580, "step": 1783, "train_runtime": 70832.7839, "train_tokens_per_second": 98130.741 }, { "epoch": 0.28362480127186007, "grad_norm": 0.9778336882591248, "learning_rate": 4.0815358776492166e-05, "loss": 0.4278, "num_input_tokens_seen": 6954762505, "step": 1784, "train_runtime": 70872.1835, "train_tokens_per_second": 98131.06 }, { "epoch": 0.28378378378378377, "grad_norm": 0.2191561460494995, "learning_rate": 4.080566641957068e-05, "loss": 0.4197, "num_input_tokens_seen": 6958671284, "step": 1785, "train_runtime": 70910.5743, "train_tokens_per_second": 98133.055 }, { "epoch": 0.28394276629570747, "grad_norm": 0.2847149968147278, "learning_rate": 4.079597010344144e-05, "loss": 0.412, "num_input_tokens_seen": 6962639354, "step": 1786, "train_runtime": 70949.6555, "train_tokens_per_second": 98134.928 }, { "epoch": 0.28410174880763117, "grad_norm": 0.21608777344226837, "learning_rate": 4.078626983053331e-05, "loss": 0.4305, "num_input_tokens_seen": 6966489094, "step": 1787, "train_runtime": 70989.3364, "train_tokens_per_second": 98134.304 }, { "epoch": 0.28426073131955487, "grad_norm": 0.20452803373336792, "learning_rate": 4.077656560327613e-05, "loss": 0.4181, "num_input_tokens_seen": 6970434808, "step": 1788, "train_runtime": 71028.9144, "train_tokens_per_second": 98135.173 }, { "epoch": 0.2844197138314785, "grad_norm": 0.20268388092517853, "learning_rate": 4.0766857424100764e-05, "loss": 0.4211, "num_input_tokens_seen": 6974301650, "step": 1789, "train_runtime": 71068.2902, "train_tokens_per_second": 98135.211 }, { "epoch": 0.2845786963434022, "grad_norm": 0.20975863933563232, "learning_rate": 4.075714529543903e-05, "loss": 0.4322, "num_input_tokens_seen": 6978286335, "step": 1790, "train_runtime": 71109.8364, "train_tokens_per_second": 98133.911 }, { "epoch": 0.2847376788553259, "grad_norm": 0.19900348782539368, "learning_rate": 4.074742921972373e-05, "loss": 0.427, "num_input_tokens_seen": 6982132270, "step": 1791, "train_runtime": 71147.4928, "train_tokens_per_second": 98136.027 }, { "epoch": 0.2848966613672496, "grad_norm": 0.2103060930967331, "learning_rate": 4.0737709199388695e-05, "loss": 0.4313, "num_input_tokens_seen": 6985961840, "step": 1792, "train_runtime": 71188.7545, "train_tokens_per_second": 98132.941 }, { "epoch": 0.2850556438791733, "grad_norm": 0.19617953896522522, "learning_rate": 4.072798523686871e-05, "loss": 0.4284, "num_input_tokens_seen": 6989880596, "step": 1793, "train_runtime": 71227.9096, "train_tokens_per_second": 98134.013 }, { "epoch": 0.28521462639109696, "grad_norm": 0.21326005458831787, "learning_rate": 4.0718257334599565e-05, "loss": 0.4149, "num_input_tokens_seen": 6993831486, "step": 1794, "train_runtime": 71266.587, "train_tokens_per_second": 98136.192 }, { "epoch": 0.28537360890302066, "grad_norm": 0.19191035628318787, "learning_rate": 4.070852549501803e-05, "loss": 0.4279, "num_input_tokens_seen": 6997691034, "step": 1795, "train_runtime": 71306.2871, "train_tokens_per_second": 98135.681 }, { "epoch": 0.28553259141494436, "grad_norm": 0.2628558576107025, "learning_rate": 4.0698789720561855e-05, "loss": 0.4339, "num_input_tokens_seen": 7001560554, "step": 1796, "train_runtime": 71347.0175, "train_tokens_per_second": 98133.893 }, { "epoch": 0.28569157392686806, "grad_norm": 0.19816412031650543, "learning_rate": 4.06890500136698e-05, "loss": 0.4302, "num_input_tokens_seen": 7005564647, "step": 1797, "train_runtime": 71387.8745, "train_tokens_per_second": 98133.817 }, { "epoch": 0.28585055643879176, "grad_norm": 0.21245411038398743, "learning_rate": 4.067930637678157e-05, "loss": 0.4381, "num_input_tokens_seen": 7009526691, "step": 1798, "train_runtime": 71429.0705, "train_tokens_per_second": 98132.688 }, { "epoch": 0.2860095389507154, "grad_norm": 0.24509099125862122, "learning_rate": 4.0669558812337894e-05, "loss": 0.4313, "num_input_tokens_seen": 7013343521, "step": 1799, "train_runtime": 71469.002, "train_tokens_per_second": 98131.264 }, { "epoch": 0.2861685214626391, "grad_norm": 0.2094162404537201, "learning_rate": 4.065980732278046e-05, "loss": 0.4274, "num_input_tokens_seen": 7017286274, "step": 1800, "train_runtime": 71509.3728, "train_tokens_per_second": 98131.0 }, { "epoch": 0.2863275039745628, "grad_norm": 0.24043171107769012, "learning_rate": 4.0650051910551946e-05, "loss": 0.421, "num_input_tokens_seen": 7021258438, "step": 1801, "train_runtime": 71662.2381, "train_tokens_per_second": 97977.102 }, { "epoch": 0.2864864864864865, "grad_norm": 0.22184871137142181, "learning_rate": 4.0640292578096025e-05, "loss": 0.4381, "num_input_tokens_seen": 7025179302, "step": 1802, "train_runtime": 71698.0109, "train_tokens_per_second": 97982.904 }, { "epoch": 0.2866454689984102, "grad_norm": 0.21326173841953278, "learning_rate": 4.0630529327857325e-05, "loss": 0.428, "num_input_tokens_seen": 7028876520, "step": 1803, "train_runtime": 71738.9812, "train_tokens_per_second": 97978.483 }, { "epoch": 0.28680445151033385, "grad_norm": 0.24336667358875275, "learning_rate": 4.0620762162281476e-05, "loss": 0.4333, "num_input_tokens_seen": 7032907127, "step": 1804, "train_runtime": 71777.6673, "train_tokens_per_second": 97981.829 }, { "epoch": 0.28696343402225755, "grad_norm": 0.24030067026615143, "learning_rate": 4.06109910838151e-05, "loss": 0.4334, "num_input_tokens_seen": 7036848697, "step": 1805, "train_runtime": 71815.9357, "train_tokens_per_second": 97984.502 }, { "epoch": 0.28712241653418125, "grad_norm": 0.25199827551841736, "learning_rate": 4.0601216094905756e-05, "loss": 0.4285, "num_input_tokens_seen": 7040601109, "step": 1806, "train_runtime": 71855.9846, "train_tokens_per_second": 97982.112 }, { "epoch": 0.28728139904610495, "grad_norm": 0.20893852412700653, "learning_rate": 4.059143719800204e-05, "loss": 0.4255, "num_input_tokens_seen": 7044495401, "step": 1807, "train_runtime": 71896.8573, "train_tokens_per_second": 97980.575 }, { "epoch": 0.2874403815580286, "grad_norm": 0.19858406484127045, "learning_rate": 4.058165439555347e-05, "loss": 0.4332, "num_input_tokens_seen": 7048425605, "step": 1808, "train_runtime": 71933.7113, "train_tokens_per_second": 97985.013 }, { "epoch": 0.2875993640699523, "grad_norm": 0.19611963629722595, "learning_rate": 4.057186769001058e-05, "loss": 0.4373, "num_input_tokens_seen": 7052350848, "step": 1809, "train_runtime": 71972.3882, "train_tokens_per_second": 97986.895 }, { "epoch": 0.287758346581876, "grad_norm": 0.21720804274082184, "learning_rate": 4.056207708382488e-05, "loss": 0.4422, "num_input_tokens_seen": 7056238410, "step": 1810, "train_runtime": 72011.7329, "train_tokens_per_second": 97987.344 }, { "epoch": 0.2879173290937997, "grad_norm": 0.1994384080171585, "learning_rate": 4.055228257944883e-05, "loss": 0.4118, "num_input_tokens_seen": 7060073889, "step": 1811, "train_runtime": 72052.2633, "train_tokens_per_second": 97985.456 }, { "epoch": 0.2880763116057234, "grad_norm": 0.2044236809015274, "learning_rate": 4.0542484179335885e-05, "loss": 0.4194, "num_input_tokens_seen": 7064122326, "step": 1812, "train_runtime": 72092.0812, "train_tokens_per_second": 97987.493 }, { "epoch": 0.28823529411764703, "grad_norm": 0.2246216982603073, "learning_rate": 4.053268188594049e-05, "loss": 0.4276, "num_input_tokens_seen": 7067988848, "step": 1813, "train_runtime": 72131.1287, "train_tokens_per_second": 97988.053 }, { "epoch": 0.28839427662957073, "grad_norm": 0.19948308169841766, "learning_rate": 4.052287570171804e-05, "loss": 0.4465, "num_input_tokens_seen": 7071807426, "step": 1814, "train_runtime": 72171.7664, "train_tokens_per_second": 97985.788 }, { "epoch": 0.28855325914149443, "grad_norm": 0.21916987001895905, "learning_rate": 4.0513065629124913e-05, "loss": 0.4253, "num_input_tokens_seen": 7075791544, "step": 1815, "train_runtime": 72211.1238, "train_tokens_per_second": 97987.556 }, { "epoch": 0.28871224165341813, "grad_norm": 0.23066455125808716, "learning_rate": 4.0503251670618476e-05, "loss": 0.4226, "num_input_tokens_seen": 7079654187, "step": 1816, "train_runtime": 72248.9494, "train_tokens_per_second": 97989.718 }, { "epoch": 0.28887122416534183, "grad_norm": 0.2068965584039688, "learning_rate": 4.049343382865704e-05, "loss": 0.4148, "num_input_tokens_seen": 7083547844, "step": 1817, "train_runtime": 72289.7783, "train_tokens_per_second": 97988.236 }, { "epoch": 0.2890302066772655, "grad_norm": 0.2049674242734909, "learning_rate": 4.0483612105699906e-05, "loss": 0.4104, "num_input_tokens_seen": 7087397189, "step": 1818, "train_runtime": 72327.5126, "train_tokens_per_second": 97990.335 }, { "epoch": 0.2891891891891892, "grad_norm": 0.1943521797657013, "learning_rate": 4.047378650420737e-05, "loss": 0.4291, "num_input_tokens_seen": 7091376263, "step": 1819, "train_runtime": 72364.7107, "train_tokens_per_second": 97994.951 }, { "epoch": 0.2893481717011129, "grad_norm": 0.20161184668540955, "learning_rate": 4.046395702664064e-05, "loss": 0.4233, "num_input_tokens_seen": 7095107977, "step": 1820, "train_runtime": 72404.7516, "train_tokens_per_second": 97992.298 }, { "epoch": 0.2895071542130366, "grad_norm": 0.20286822319030762, "learning_rate": 4.0454123675461964e-05, "loss": 0.4239, "num_input_tokens_seen": 7099007326, "step": 1821, "train_runtime": 72445.4849, "train_tokens_per_second": 97991.025 }, { "epoch": 0.2896661367249603, "grad_norm": 0.19412976503372192, "learning_rate": 4.0444286453134495e-05, "loss": 0.4221, "num_input_tokens_seen": 7102979106, "step": 1822, "train_runtime": 72484.7172, "train_tokens_per_second": 97992.782 }, { "epoch": 0.2898251192368839, "grad_norm": 0.29347506165504456, "learning_rate": 4.043444536212241e-05, "loss": 0.4351, "num_input_tokens_seen": 7106822056, "step": 1823, "train_runtime": 72525.7582, "train_tokens_per_second": 97990.317 }, { "epoch": 0.2899841017488076, "grad_norm": 0.25045979022979736, "learning_rate": 4.0424600404890835e-05, "loss": 0.4237, "num_input_tokens_seen": 7110792765, "step": 1824, "train_runtime": 72563.9867, "train_tokens_per_second": 97993.414 }, { "epoch": 0.2901430842607313, "grad_norm": 0.19787415862083435, "learning_rate": 4.041475158390585e-05, "loss": 0.4234, "num_input_tokens_seen": 7114682529, "step": 1825, "train_runtime": 72602.231, "train_tokens_per_second": 97995.37 }, { "epoch": 0.290302066772655, "grad_norm": 0.2107653021812439, "learning_rate": 4.040489890163452e-05, "loss": 0.4185, "num_input_tokens_seen": 7118604476, "step": 1826, "train_runtime": 72642.8782, "train_tokens_per_second": 97994.527 }, { "epoch": 0.2904610492845787, "grad_norm": 0.19760678708553314, "learning_rate": 4.039504236054486e-05, "loss": 0.4353, "num_input_tokens_seen": 7122513769, "step": 1827, "train_runtime": 72682.8405, "train_tokens_per_second": 97994.433 }, { "epoch": 0.29062003179650236, "grad_norm": 0.2947481870651245, "learning_rate": 4.0385181963105886e-05, "loss": 0.4152, "num_input_tokens_seen": 7126252470, "step": 1828, "train_runtime": 72723.4024, "train_tokens_per_second": 97991.186 }, { "epoch": 0.29077901430842606, "grad_norm": 0.18880921602249146, "learning_rate": 4.0375317711787555e-05, "loss": 0.4322, "num_input_tokens_seen": 7130241869, "step": 1829, "train_runtime": 72763.092, "train_tokens_per_second": 97992.563 }, { "epoch": 0.29093799682034976, "grad_norm": 0.3180794417858124, "learning_rate": 4.036544960906078e-05, "loss": 0.4321, "num_input_tokens_seen": 7134189598, "step": 1830, "train_runtime": 72800.9086, "train_tokens_per_second": 97995.887 }, { "epoch": 0.29109697933227346, "grad_norm": 0.22326499223709106, "learning_rate": 4.035557765739745e-05, "loss": 0.4266, "num_input_tokens_seen": 7137983711, "step": 1831, "train_runtime": 72841.0027, "train_tokens_per_second": 97994.034 }, { "epoch": 0.29125596184419716, "grad_norm": 0.3135766386985779, "learning_rate": 4.0345701859270424e-05, "loss": 0.4336, "num_input_tokens_seen": 7141836384, "step": 1832, "train_runtime": 72875.6384, "train_tokens_per_second": 98000.327 }, { "epoch": 0.2914149443561208, "grad_norm": 0.2739872634410858, "learning_rate": 4.033582221715352e-05, "loss": 0.4394, "num_input_tokens_seen": 7145764348, "step": 1833, "train_runtime": 72914.6745, "train_tokens_per_second": 98001.731 }, { "epoch": 0.2915739268680445, "grad_norm": 0.2175765335559845, "learning_rate": 4.032593873352153e-05, "loss": 0.4302, "num_input_tokens_seen": 7149604171, "step": 1834, "train_runtime": 72952.3006, "train_tokens_per_second": 98003.82 }, { "epoch": 0.2917329093799682, "grad_norm": 0.21371902525424957, "learning_rate": 4.031605141085018e-05, "loss": 0.4345, "num_input_tokens_seen": 7153472963, "step": 1835, "train_runtime": 72992.5414, "train_tokens_per_second": 98002.793 }, { "epoch": 0.2918918918918919, "grad_norm": 0.25129806995391846, "learning_rate": 4.030616025161618e-05, "loss": 0.423, "num_input_tokens_seen": 7157313372, "step": 1836, "train_runtime": 73031.8595, "train_tokens_per_second": 98002.617 }, { "epoch": 0.2920508744038156, "grad_norm": 0.2695156931877136, "learning_rate": 4.029626525829718e-05, "loss": 0.4041, "num_input_tokens_seen": 7161167864, "step": 1837, "train_runtime": 73068.0192, "train_tokens_per_second": 98006.87 }, { "epoch": 0.29220985691573925, "grad_norm": 0.24058352410793304, "learning_rate": 4.028636643337184e-05, "loss": 0.4247, "num_input_tokens_seen": 7165007195, "step": 1838, "train_runtime": 73109.1554, "train_tokens_per_second": 98004.24 }, { "epoch": 0.29236883942766295, "grad_norm": 0.28639793395996094, "learning_rate": 4.027646377931972e-05, "loss": 0.4216, "num_input_tokens_seen": 7168872635, "step": 1839, "train_runtime": 73149.1732, "train_tokens_per_second": 98003.468 }, { "epoch": 0.29252782193958665, "grad_norm": 0.2196362018585205, "learning_rate": 4.026655729862138e-05, "loss": 0.4167, "num_input_tokens_seen": 7172800422, "step": 1840, "train_runtime": 73188.944, "train_tokens_per_second": 98003.879 }, { "epoch": 0.29268680445151035, "grad_norm": 0.257875919342041, "learning_rate": 4.025664699375831e-05, "loss": 0.4297, "num_input_tokens_seen": 7176711024, "step": 1841, "train_runtime": 73228.7153, "train_tokens_per_second": 98004.055 }, { "epoch": 0.292845786963434, "grad_norm": 0.20397596061229706, "learning_rate": 4.0246732867212975e-05, "loss": 0.4267, "num_input_tokens_seen": 7180573535, "step": 1842, "train_runtime": 73266.9563, "train_tokens_per_second": 98005.621 }, { "epoch": 0.2930047694753577, "grad_norm": 0.23711442947387695, "learning_rate": 4.02368149214688e-05, "loss": 0.4275, "num_input_tokens_seen": 7184363603, "step": 1843, "train_runtime": 73308.1592, "train_tokens_per_second": 98002.237 }, { "epoch": 0.2931637519872814, "grad_norm": 0.21387030184268951, "learning_rate": 4.022689315901015e-05, "loss": 0.4212, "num_input_tokens_seen": 7188311704, "step": 1844, "train_runtime": 73348.4531, "train_tokens_per_second": 98002.226 }, { "epoch": 0.2933227344992051, "grad_norm": 0.19712291657924652, "learning_rate": 4.0216967582322366e-05, "loss": 0.423, "num_input_tokens_seen": 7192152660, "step": 1845, "train_runtime": 73385.5534, "train_tokens_per_second": 98005.02 }, { "epoch": 0.2934817170111288, "grad_norm": 0.22744055092334747, "learning_rate": 4.020703819389173e-05, "loss": 0.4199, "num_input_tokens_seen": 7196049832, "step": 1846, "train_runtime": 73425.0943, "train_tokens_per_second": 98005.32 }, { "epoch": 0.29364069952305244, "grad_norm": 0.20811733603477478, "learning_rate": 4.0197104996205484e-05, "loss": 0.4282, "num_input_tokens_seen": 7199917021, "step": 1847, "train_runtime": 73466.3515, "train_tokens_per_second": 98002.921 }, { "epoch": 0.29379968203497614, "grad_norm": 0.1935250163078308, "learning_rate": 4.018716799175183e-05, "loss": 0.4426, "num_input_tokens_seen": 7203919674, "step": 1848, "train_runtime": 73505.5194, "train_tokens_per_second": 98005.153 }, { "epoch": 0.29395866454689984, "grad_norm": 0.21007925271987915, "learning_rate": 4.0177227183019906e-05, "loss": 0.4373, "num_input_tokens_seen": 7207844358, "step": 1849, "train_runtime": 73547.5851, "train_tokens_per_second": 98002.461 }, { "epoch": 0.29411764705882354, "grad_norm": 0.7918097972869873, "learning_rate": 4.016728257249982e-05, "loss": 0.4262, "num_input_tokens_seen": 7211750518, "step": 1850, "train_runtime": 73586.4177, "train_tokens_per_second": 98003.827 }, { "epoch": 0.29427662957074724, "grad_norm": 0.21072620153427124, "learning_rate": 4.0157334162682634e-05, "loss": 0.4245, "num_input_tokens_seen": 7215792160, "step": 1851, "train_runtime": 73627.5372, "train_tokens_per_second": 98003.986 }, { "epoch": 0.2944356120826709, "grad_norm": 0.21084411442279816, "learning_rate": 4.014738195606034e-05, "loss": 0.4195, "num_input_tokens_seen": 7219524784, "step": 1852, "train_runtime": 73665.5219, "train_tokens_per_second": 98004.122 }, { "epoch": 0.2945945945945946, "grad_norm": 0.2271938920021057, "learning_rate": 4.0137425955125906e-05, "loss": 0.4228, "num_input_tokens_seen": 7223425094, "step": 1853, "train_runtime": 73702.7752, "train_tokens_per_second": 98007.505 }, { "epoch": 0.2947535771065183, "grad_norm": 0.2451433390378952, "learning_rate": 4.012746616237324e-05, "loss": 0.4257, "num_input_tokens_seen": 7227283479, "step": 1854, "train_runtime": 73741.1993, "train_tokens_per_second": 98008.76 }, { "epoch": 0.294912559618442, "grad_norm": 0.21490125358104706, "learning_rate": 4.011750258029719e-05, "loss": 0.4248, "num_input_tokens_seen": 7231058249, "step": 1855, "train_runtime": 73780.2324, "train_tokens_per_second": 98008.071 }, { "epoch": 0.2950715421303657, "grad_norm": 0.219997838139534, "learning_rate": 4.010753521139357e-05, "loss": 0.4155, "num_input_tokens_seen": 7235082568, "step": 1856, "train_runtime": 73818.6107, "train_tokens_per_second": 98011.633 }, { "epoch": 0.2952305246422893, "grad_norm": 0.2332872748374939, "learning_rate": 4.009756405815914e-05, "loss": 0.4193, "num_input_tokens_seen": 7238953953, "step": 1857, "train_runtime": 73859.4843, "train_tokens_per_second": 98009.809 }, { "epoch": 0.295389507154213, "grad_norm": 0.4927186071872711, "learning_rate": 4.0087589123091596e-05, "loss": 0.4143, "num_input_tokens_seen": 7242847765, "step": 1858, "train_runtime": 73897.7313, "train_tokens_per_second": 98011.774 }, { "epoch": 0.2955484896661367, "grad_norm": 0.20403434336185455, "learning_rate": 4.007761040868959e-05, "loss": 0.4335, "num_input_tokens_seen": 7246739647, "step": 1859, "train_runtime": 73935.4075, "train_tokens_per_second": 98014.468 }, { "epoch": 0.2957074721780604, "grad_norm": 0.25765007734298706, "learning_rate": 4.0067627917452715e-05, "loss": 0.4167, "num_input_tokens_seen": 7250615040, "step": 1860, "train_runtime": 73975.9662, "train_tokens_per_second": 98013.117 }, { "epoch": 0.2958664546899841, "grad_norm": 0.18752247095108032, "learning_rate": 4.005764165188153e-05, "loss": 0.43, "num_input_tokens_seen": 7254636200, "step": 1861, "train_runtime": 74014.7106, "train_tokens_per_second": 98016.14 }, { "epoch": 0.29602543720190777, "grad_norm": 0.20065419375896454, "learning_rate": 4.00476516144775e-05, "loss": 0.4376, "num_input_tokens_seen": 7258446384, "step": 1862, "train_runtime": 74052.3783, "train_tokens_per_second": 98017.735 }, { "epoch": 0.29618441971383147, "grad_norm": 0.19328702986240387, "learning_rate": 4.003765780774309e-05, "loss": 0.4158, "num_input_tokens_seen": 7262267485, "step": 1863, "train_runtime": 74090.3269, "train_tokens_per_second": 98019.104 }, { "epoch": 0.29634340222575517, "grad_norm": 0.2204860895872116, "learning_rate": 4.0027660234181645e-05, "loss": 0.4257, "num_input_tokens_seen": 7266261027, "step": 1864, "train_runtime": 74128.6953, "train_tokens_per_second": 98022.244 }, { "epoch": 0.29650238473767887, "grad_norm": 0.19560837745666504, "learning_rate": 4.001765889629751e-05, "loss": 0.4241, "num_input_tokens_seen": 7270291713, "step": 1865, "train_runtime": 74165.8359, "train_tokens_per_second": 98027.503 }, { "epoch": 0.29666136724960257, "grad_norm": 0.21178022027015686, "learning_rate": 4.000765379659595e-05, "loss": 0.4142, "num_input_tokens_seen": 7274259439, "step": 1866, "train_runtime": 74207.1602, "train_tokens_per_second": 98026.382 }, { "epoch": 0.2968203497615262, "grad_norm": 0.2098436802625656, "learning_rate": 3.9997644937583146e-05, "loss": 0.4149, "num_input_tokens_seen": 7278107675, "step": 1867, "train_runtime": 74247.1604, "train_tokens_per_second": 98025.401 }, { "epoch": 0.2969793322734499, "grad_norm": 0.17918382585048676, "learning_rate": 3.9987632321766265e-05, "loss": 0.4185, "num_input_tokens_seen": 7282133382, "step": 1868, "train_runtime": 74288.0835, "train_tokens_per_second": 98025.592 }, { "epoch": 0.2971383147853736, "grad_norm": 0.20646251738071442, "learning_rate": 3.9977615951653404e-05, "loss": 0.4325, "num_input_tokens_seen": 7286072162, "step": 1869, "train_runtime": 74326.8528, "train_tokens_per_second": 98027.454 }, { "epoch": 0.2972972972972973, "grad_norm": 0.2370682805776596, "learning_rate": 3.996759582975358e-05, "loss": 0.4307, "num_input_tokens_seen": 7289843660, "step": 1870, "train_runtime": 74367.0464, "train_tokens_per_second": 98025.187 }, { "epoch": 0.29745627980922096, "grad_norm": 0.24535652995109558, "learning_rate": 3.995757195857677e-05, "loss": 0.4228, "num_input_tokens_seen": 7293842569, "step": 1871, "train_runtime": 74406.0757, "train_tokens_per_second": 98027.513 }, { "epoch": 0.29761526232114466, "grad_norm": 0.21836122870445251, "learning_rate": 3.994754434063388e-05, "loss": 0.4333, "num_input_tokens_seen": 7297804754, "step": 1872, "train_runtime": 74444.3791, "train_tokens_per_second": 98030.299 }, { "epoch": 0.29777424483306836, "grad_norm": 0.22169800102710724, "learning_rate": 3.9937512978436754e-05, "loss": 0.4282, "num_input_tokens_seen": 7301660792, "step": 1873, "train_runtime": 74483.3289, "train_tokens_per_second": 98030.806 }, { "epoch": 0.29793322734499206, "grad_norm": 0.2449295073747635, "learning_rate": 3.9927477874498184e-05, "loss": 0.4356, "num_input_tokens_seen": 7305499819, "step": 1874, "train_runtime": 74522.4352, "train_tokens_per_second": 98030.879 }, { "epoch": 0.29809220985691576, "grad_norm": 0.23144377768039703, "learning_rate": 3.9917439031331896e-05, "loss": 0.4277, "num_input_tokens_seen": 7309498303, "step": 1875, "train_runtime": 74562.724, "train_tokens_per_second": 98031.535 }, { "epoch": 0.2982511923688394, "grad_norm": 0.19442693889141083, "learning_rate": 3.990739645145253e-05, "loss": 0.4334, "num_input_tokens_seen": 7313466176, "step": 1876, "train_runtime": 74601.7874, "train_tokens_per_second": 98033.391 }, { "epoch": 0.2984101748807631, "grad_norm": 0.23866993188858032, "learning_rate": 3.989735013737569e-05, "loss": 0.4192, "num_input_tokens_seen": 7317234932, "step": 1877, "train_runtime": 74639.1738, "train_tokens_per_second": 98034.779 }, { "epoch": 0.2985691573926868, "grad_norm": 0.22809983789920807, "learning_rate": 3.9887300091617916e-05, "loss": 0.4402, "num_input_tokens_seen": 7321132575, "step": 1878, "train_runtime": 74678.7534, "train_tokens_per_second": 98035.013 }, { "epoch": 0.2987281399046105, "grad_norm": 0.22006380558013916, "learning_rate": 3.987724631669668e-05, "loss": 0.4271, "num_input_tokens_seen": 7325050633, "step": 1879, "train_runtime": 74717.6681, "train_tokens_per_second": 98036.392 }, { "epoch": 0.2988871224165342, "grad_norm": 0.22515559196472168, "learning_rate": 3.986718881513036e-05, "loss": 0.4293, "num_input_tokens_seen": 7328978966, "step": 1880, "train_runtime": 74757.5507, "train_tokens_per_second": 98036.638 }, { "epoch": 0.29904610492845785, "grad_norm": 0.21600341796875, "learning_rate": 3.98571275894383e-05, "loss": 0.4167, "num_input_tokens_seen": 7332658114, "step": 1881, "train_runtime": 74798.1887, "train_tokens_per_second": 98032.563 }, { "epoch": 0.29920508744038155, "grad_norm": 0.22748829424381256, "learning_rate": 3.9847062642140756e-05, "loss": 0.4164, "num_input_tokens_seen": 7336531950, "step": 1882, "train_runtime": 74839.1382, "train_tokens_per_second": 98030.685 }, { "epoch": 0.29936406995230525, "grad_norm": 0.22764872014522552, "learning_rate": 3.983699397575895e-05, "loss": 0.4302, "num_input_tokens_seen": 7340498344, "step": 1883, "train_runtime": 74878.581, "train_tokens_per_second": 98032.017 }, { "epoch": 0.29952305246422894, "grad_norm": 0.2211335152387619, "learning_rate": 3.982692159281499e-05, "loss": 0.4227, "num_input_tokens_seen": 7344361957, "step": 1884, "train_runtime": 74916.3294, "train_tokens_per_second": 98034.194 }, { "epoch": 0.29968203497615264, "grad_norm": 0.2644980549812317, "learning_rate": 3.9816845495831936e-05, "loss": 0.4156, "num_input_tokens_seen": 7348176248, "step": 1885, "train_runtime": 74955.1107, "train_tokens_per_second": 98034.359 }, { "epoch": 0.2998410174880763, "grad_norm": 0.20234456658363342, "learning_rate": 3.980676568733379e-05, "loss": 0.4319, "num_input_tokens_seen": 7352099417, "step": 1886, "train_runtime": 74996.0321, "train_tokens_per_second": 98033.179 }, { "epoch": 0.3, "grad_norm": 0.21102002263069153, "learning_rate": 3.979668216984547e-05, "loss": 0.4211, "num_input_tokens_seen": 7356060554, "step": 1887, "train_runtime": 75035.483, "train_tokens_per_second": 98034.427 }, { "epoch": 0.3001589825119237, "grad_norm": 0.22520595788955688, "learning_rate": 3.978659494589283e-05, "loss": 0.4217, "num_input_tokens_seen": 7359948777, "step": 1888, "train_runtime": 75074.988, "train_tokens_per_second": 98034.631 }, { "epoch": 0.3003179650238474, "grad_norm": 0.22199492156505585, "learning_rate": 3.977650401800264e-05, "loss": 0.4092, "num_input_tokens_seen": 7363817322, "step": 1889, "train_runtime": 75113.1337, "train_tokens_per_second": 98036.348 }, { "epoch": 0.3004769475357711, "grad_norm": 0.23357315361499786, "learning_rate": 3.976640938870261e-05, "loss": 0.4127, "num_input_tokens_seen": 7367799121, "step": 1890, "train_runtime": 75152.047, "train_tokens_per_second": 98038.569 }, { "epoch": 0.30063593004769473, "grad_norm": 0.30038195848464966, "learning_rate": 3.9756311060521366e-05, "loss": 0.427, "num_input_tokens_seen": 7371547882, "step": 1891, "train_runtime": 75189.966, "train_tokens_per_second": 98038.984 }, { "epoch": 0.30079491255961843, "grad_norm": 0.2400987148284912, "learning_rate": 3.9746209035988485e-05, "loss": 0.4349, "num_input_tokens_seen": 7375351846, "step": 1892, "train_runtime": 75229.8691, "train_tokens_per_second": 98037.547 }, { "epoch": 0.30095389507154213, "grad_norm": 0.230215921998024, "learning_rate": 3.9736103317634444e-05, "loss": 0.4193, "num_input_tokens_seen": 7379273681, "step": 1893, "train_runtime": 75269.326, "train_tokens_per_second": 98038.259 }, { "epoch": 0.30111287758346583, "grad_norm": 0.22489947080612183, "learning_rate": 3.972599390799064e-05, "loss": 0.4196, "num_input_tokens_seen": 7383120371, "step": 1894, "train_runtime": 75306.611, "train_tokens_per_second": 98040.8 }, { "epoch": 0.30127186009538953, "grad_norm": 0.19619660079479218, "learning_rate": 3.9715880809589424e-05, "loss": 0.4235, "num_input_tokens_seen": 7386959765, "step": 1895, "train_runtime": 75346.1335, "train_tokens_per_second": 98040.33 }, { "epoch": 0.3014308426073132, "grad_norm": 0.22016045451164246, "learning_rate": 3.970576402496406e-05, "loss": 0.4267, "num_input_tokens_seen": 7390901039, "step": 1896, "train_runtime": 75386.9983, "train_tokens_per_second": 98039.466 }, { "epoch": 0.3015898251192369, "grad_norm": 0.22973306477069855, "learning_rate": 3.969564355664872e-05, "loss": 0.4202, "num_input_tokens_seen": 7394843836, "step": 1897, "train_runtime": 75427.7579, "train_tokens_per_second": 98038.76 }, { "epoch": 0.3017488076311606, "grad_norm": 0.2065707892179489, "learning_rate": 3.968551940717852e-05, "loss": 0.4296, "num_input_tokens_seen": 7398729268, "step": 1898, "train_runtime": 75467.814, "train_tokens_per_second": 98038.208 }, { "epoch": 0.3019077901430843, "grad_norm": 0.20386314392089844, "learning_rate": 3.967539157908947e-05, "loss": 0.4289, "num_input_tokens_seen": 7402365848, "step": 1899, "train_runtime": 75508.0945, "train_tokens_per_second": 98034.07 }, { "epoch": 0.302066772655008, "grad_norm": 0.19292502105236053, "learning_rate": 3.966526007491853e-05, "loss": 0.4391, "num_input_tokens_seen": 7406361795, "step": 1900, "train_runtime": 75548.7271, "train_tokens_per_second": 98034.237 }, { "epoch": 0.3022257551669316, "grad_norm": 0.19699935615062714, "learning_rate": 3.9655124897203574e-05, "loss": 0.4255, "num_input_tokens_seen": 7410397379, "step": 1901, "train_runtime": 75588.8526, "train_tokens_per_second": 98035.585 }, { "epoch": 0.3023847376788553, "grad_norm": 0.24589388072490692, "learning_rate": 3.96449860484834e-05, "loss": 0.4259, "num_input_tokens_seen": 7414224775, "step": 1902, "train_runtime": 75626.3682, "train_tokens_per_second": 98037.562 }, { "epoch": 0.302543720190779, "grad_norm": 0.2027699202299118, "learning_rate": 3.96348435312977e-05, "loss": 0.4122, "num_input_tokens_seen": 7418103885, "step": 1903, "train_runtime": 75664.601, "train_tokens_per_second": 98039.292 }, { "epoch": 0.3027027027027027, "grad_norm": 0.18640561401844025, "learning_rate": 3.9624697348187115e-05, "loss": 0.4305, "num_input_tokens_seen": 7422179937, "step": 1904, "train_runtime": 75707.3373, "train_tokens_per_second": 98037.789 }, { "epoch": 0.30286168521462636, "grad_norm": 0.20229879021644592, "learning_rate": 3.9614547501693176e-05, "loss": 0.4276, "num_input_tokens_seen": 7426108835, "step": 1905, "train_runtime": 75747.6892, "train_tokens_per_second": 98037.431 }, { "epoch": 0.30302066772655006, "grad_norm": 0.19429026544094086, "learning_rate": 3.960439399435837e-05, "loss": 0.4307, "num_input_tokens_seen": 7429864339, "step": 1906, "train_runtime": 75786.7592, "train_tokens_per_second": 98036.444 }, { "epoch": 0.30317965023847376, "grad_norm": 0.2844739258289337, "learning_rate": 3.9594236828726064e-05, "loss": 0.4282, "num_input_tokens_seen": 7433701197, "step": 1907, "train_runtime": 75826.3777, "train_tokens_per_second": 98035.821 }, { "epoch": 0.30333863275039746, "grad_norm": 0.21722261607646942, "learning_rate": 3.9584076007340564e-05, "loss": 0.4207, "num_input_tokens_seen": 7437678782, "step": 1908, "train_runtime": 75865.0187, "train_tokens_per_second": 98038.317 }, { "epoch": 0.30349761526232116, "grad_norm": 0.17952793836593628, "learning_rate": 3.957391153274708e-05, "loss": 0.4265, "num_input_tokens_seen": 7441508043, "step": 1909, "train_runtime": 75905.0039, "train_tokens_per_second": 98037.121 }, { "epoch": 0.3036565977742448, "grad_norm": 0.21899235248565674, "learning_rate": 3.9563743407491735e-05, "loss": 0.4114, "num_input_tokens_seen": 7445292547, "step": 1910, "train_runtime": 75944.6796, "train_tokens_per_second": 98035.736 }, { "epoch": 0.3038155802861685, "grad_norm": 0.21593087911605835, "learning_rate": 3.955357163412158e-05, "loss": 0.4153, "num_input_tokens_seen": 7449199503, "step": 1911, "train_runtime": 75981.2476, "train_tokens_per_second": 98039.974 }, { "epoch": 0.3039745627980922, "grad_norm": 0.20028145611286163, "learning_rate": 3.954339621518458e-05, "loss": 0.4271, "num_input_tokens_seen": 7453186668, "step": 1912, "train_runtime": 76018.1564, "train_tokens_per_second": 98044.823 }, { "epoch": 0.3041335453100159, "grad_norm": 0.28513890504837036, "learning_rate": 3.9533217153229584e-05, "loss": 0.4229, "num_input_tokens_seen": 7456975990, "step": 1913, "train_runtime": 76056.8354, "train_tokens_per_second": 98044.784 }, { "epoch": 0.3042925278219396, "grad_norm": 0.19507363438606262, "learning_rate": 3.95230344508064e-05, "loss": 0.4143, "num_input_tokens_seen": 7460869582, "step": 1914, "train_runtime": 76097.6625, "train_tokens_per_second": 98043.348 }, { "epoch": 0.30445151033386325, "grad_norm": 0.21235500276088715, "learning_rate": 3.9512848110465704e-05, "loss": 0.4138, "num_input_tokens_seen": 7464856927, "step": 1915, "train_runtime": 76137.4188, "train_tokens_per_second": 98044.523 }, { "epoch": 0.30461049284578695, "grad_norm": 0.22900579869747162, "learning_rate": 3.9502658134759116e-05, "loss": 0.4248, "num_input_tokens_seen": 7468740925, "step": 1916, "train_runtime": 76174.6237, "train_tokens_per_second": 98047.625 }, { "epoch": 0.30476947535771065, "grad_norm": 0.19984957575798035, "learning_rate": 3.949246452623914e-05, "loss": 0.4367, "num_input_tokens_seen": 7472565392, "step": 1917, "train_runtime": 76215.0295, "train_tokens_per_second": 98045.824 }, { "epoch": 0.30492845786963435, "grad_norm": 0.22194331884384155, "learning_rate": 3.948226728745921e-05, "loss": 0.4305, "num_input_tokens_seen": 7476417152, "step": 1918, "train_runtime": 76253.4709, "train_tokens_per_second": 98046.909 }, { "epoch": 0.30508744038155805, "grad_norm": 0.22384975850582123, "learning_rate": 3.9472066420973665e-05, "loss": 0.4149, "num_input_tokens_seen": 7480352226, "step": 1919, "train_runtime": 76291.6991, "train_tokens_per_second": 98049.359 }, { "epoch": 0.3052464228934817, "grad_norm": 0.18781927227973938, "learning_rate": 3.946186192933774e-05, "loss": 0.4315, "num_input_tokens_seen": 7484322157, "step": 1920, "train_runtime": 76332.9955, "train_tokens_per_second": 98048.323 }, { "epoch": 0.3054054054054054, "grad_norm": 0.2234094738960266, "learning_rate": 3.9451653815107596e-05, "loss": 0.4361, "num_input_tokens_seen": 7488177588, "step": 1921, "train_runtime": 76374.5795, "train_tokens_per_second": 98045.418 }, { "epoch": 0.3055643879173291, "grad_norm": 0.22332875430583954, "learning_rate": 3.9441442080840296e-05, "loss": 0.4133, "num_input_tokens_seen": 7492187887, "step": 1922, "train_runtime": 76417.7992, "train_tokens_per_second": 98042.445 }, { "epoch": 0.3057233704292528, "grad_norm": 0.21794559061527252, "learning_rate": 3.9431226729093795e-05, "loss": 0.4333, "num_input_tokens_seen": 7496020475, "step": 1923, "train_runtime": 76457.4806, "train_tokens_per_second": 98041.688 }, { "epoch": 0.3058823529411765, "grad_norm": 0.2249642163515091, "learning_rate": 3.942100776242698e-05, "loss": 0.4384, "num_input_tokens_seen": 7500050394, "step": 1924, "train_runtime": 76498.5586, "train_tokens_per_second": 98041.722 }, { "epoch": 0.30604133545310014, "grad_norm": 0.32659024000167847, "learning_rate": 3.9410785183399626e-05, "loss": 0.4186, "num_input_tokens_seen": 7503874236, "step": 1925, "train_runtime": 76539.3499, "train_tokens_per_second": 98039.43 }, { "epoch": 0.30620031796502384, "grad_norm": 0.21184681355953217, "learning_rate": 3.9400558994572414e-05, "loss": 0.4246, "num_input_tokens_seen": 7507757064, "step": 1926, "train_runtime": 76576.9158, "train_tokens_per_second": 98042.04 }, { "epoch": 0.30635930047694754, "grad_norm": 0.29043757915496826, "learning_rate": 3.9390329198506936e-05, "loss": 0.4235, "num_input_tokens_seen": 7511619519, "step": 1927, "train_runtime": 76616.4369, "train_tokens_per_second": 98041.88 }, { "epoch": 0.30651828298887124, "grad_norm": 0.3598482608795166, "learning_rate": 3.938009579776567e-05, "loss": 0.4349, "num_input_tokens_seen": 7515503050, "step": 1928, "train_runtime": 76656.3641, "train_tokens_per_second": 98041.476 }, { "epoch": 0.30667726550079494, "grad_norm": 0.2385646551847458, "learning_rate": 3.936985879491204e-05, "loss": 0.4253, "num_input_tokens_seen": 7519394639, "step": 1929, "train_runtime": 76695.2936, "train_tokens_per_second": 98042.452 }, { "epoch": 0.3068362480127186, "grad_norm": 0.23795855045318604, "learning_rate": 3.9359618192510305e-05, "loss": 0.418, "num_input_tokens_seen": 7523253957, "step": 1930, "train_runtime": 76735.4775, "train_tokens_per_second": 98041.404 }, { "epoch": 0.3069952305246423, "grad_norm": 0.25948622822761536, "learning_rate": 3.934937399312569e-05, "loss": 0.4131, "num_input_tokens_seen": 7527196041, "step": 1931, "train_runtime": 76775.5343, "train_tokens_per_second": 98041.598 }, { "epoch": 0.307154213036566, "grad_norm": 0.3196270763874054, "learning_rate": 3.9339126199324297e-05, "loss": 0.4196, "num_input_tokens_seen": 7531080739, "step": 1932, "train_runtime": 76812.7574, "train_tokens_per_second": 98044.661 }, { "epoch": 0.3073131955484897, "grad_norm": 0.2441847175359726, "learning_rate": 3.93288748136731e-05, "loss": 0.4102, "num_input_tokens_seen": 7535055599, "step": 1933, "train_runtime": 76850.2528, "train_tokens_per_second": 98048.547 }, { "epoch": 0.3074721780604134, "grad_norm": 0.2863779664039612, "learning_rate": 3.9318619838740026e-05, "loss": 0.4347, "num_input_tokens_seen": 7539011733, "step": 1934, "train_runtime": 76888.5355, "train_tokens_per_second": 98051.181 }, { "epoch": 0.307631160572337, "grad_norm": 0.2480381578207016, "learning_rate": 3.9308361277093864e-05, "loss": 0.4057, "num_input_tokens_seen": 7542946446, "step": 1935, "train_runtime": 76928.6774, "train_tokens_per_second": 98051.165 }, { "epoch": 0.3077901430842607, "grad_norm": 0.22296550869941711, "learning_rate": 3.92980991313043e-05, "loss": 0.4299, "num_input_tokens_seen": 7546775514, "step": 1936, "train_runtime": 76967.658, "train_tokens_per_second": 98051.256 }, { "epoch": 0.3079491255961844, "grad_norm": 0.32408663630485535, "learning_rate": 3.9287833403941946e-05, "loss": 0.4121, "num_input_tokens_seen": 7550652902, "step": 1937, "train_runtime": 77005.3708, "train_tokens_per_second": 98053.588 }, { "epoch": 0.3081081081081081, "grad_norm": 1.541614055633545, "learning_rate": 3.927756409757829e-05, "loss": 0.4191, "num_input_tokens_seen": 7554599627, "step": 1938, "train_runtime": 77044.8211, "train_tokens_per_second": 98054.607 }, { "epoch": 0.30826709062003177, "grad_norm": 0.2670196294784546, "learning_rate": 3.926729121478572e-05, "loss": 0.4257, "num_input_tokens_seen": 7558574205, "step": 1939, "train_runtime": 77084.0503, "train_tokens_per_second": 98056.267 }, { "epoch": 0.30842607313195547, "grad_norm": 0.26050347089767456, "learning_rate": 3.925701475813751e-05, "loss": 0.4254, "num_input_tokens_seen": 7562444588, "step": 1940, "train_runtime": 77121.6237, "train_tokens_per_second": 98058.68 }, { "epoch": 0.30858505564387917, "grad_norm": 0.28452378511428833, "learning_rate": 3.9246734730207843e-05, "loss": 0.4267, "num_input_tokens_seen": 7566323352, "step": 1941, "train_runtime": 77160.522, "train_tokens_per_second": 98059.515 }, { "epoch": 0.30874403815580287, "grad_norm": 0.2404371052980423, "learning_rate": 3.9236451133571806e-05, "loss": 0.4282, "num_input_tokens_seen": 7570229648, "step": 1942, "train_runtime": 77201.0303, "train_tokens_per_second": 98058.661 }, { "epoch": 0.30890302066772657, "grad_norm": 0.2552313208580017, "learning_rate": 3.9226163970805365e-05, "loss": 0.4323, "num_input_tokens_seen": 7574160571, "step": 1943, "train_runtime": 77237.4154, "train_tokens_per_second": 98063.361 }, { "epoch": 0.3090620031796502, "grad_norm": 0.22666820883750916, "learning_rate": 3.9215873244485365e-05, "loss": 0.4341, "num_input_tokens_seen": 7578025382, "step": 1944, "train_runtime": 77275.6706, "train_tokens_per_second": 98064.828 }, { "epoch": 0.3092209856915739, "grad_norm": 0.22119323909282684, "learning_rate": 3.9205578957189585e-05, "loss": 0.423, "num_input_tokens_seen": 7581896917, "step": 1945, "train_runtime": 77314.9709, "train_tokens_per_second": 98065.056 }, { "epoch": 0.3093799682034976, "grad_norm": 0.21964482963085175, "learning_rate": 3.919528111149665e-05, "loss": 0.4161, "num_input_tokens_seen": 7585797622, "step": 1946, "train_runtime": 77353.0992, "train_tokens_per_second": 98067.145 }, { "epoch": 0.3095389507154213, "grad_norm": 0.35834696888923645, "learning_rate": 3.91849797099861e-05, "loss": 0.4148, "num_input_tokens_seen": 7589648774, "step": 1947, "train_runtime": 77391.9886, "train_tokens_per_second": 98067.628 }, { "epoch": 0.309697933227345, "grad_norm": 0.21941113471984863, "learning_rate": 3.9174674755238386e-05, "loss": 0.425, "num_input_tokens_seen": 7593594038, "step": 1948, "train_runtime": 77431.6776, "train_tokens_per_second": 98068.314 }, { "epoch": 0.30985691573926866, "grad_norm": 0.25496917963027954, "learning_rate": 3.9164366249834795e-05, "loss": 0.427, "num_input_tokens_seen": 7597408151, "step": 1949, "train_runtime": 77471.773, "train_tokens_per_second": 98066.791 }, { "epoch": 0.31001589825119236, "grad_norm": 0.2523728311061859, "learning_rate": 3.915405419635755e-05, "loss": 0.4197, "num_input_tokens_seen": 7601308634, "step": 1950, "train_runtime": 77510.9916, "train_tokens_per_second": 98067.493 }, { "epoch": 0.31017488076311606, "grad_norm": 0.20732295513153076, "learning_rate": 3.9143738597389746e-05, "loss": 0.4304, "num_input_tokens_seen": 7605219371, "step": 1951, "train_runtime": 77552.0608, "train_tokens_per_second": 98065.987 }, { "epoch": 0.31033386327503976, "grad_norm": 0.2750503122806549, "learning_rate": 3.913341945551537e-05, "loss": 0.4187, "num_input_tokens_seen": 7609060211, "step": 1952, "train_runtime": 77591.4221, "train_tokens_per_second": 98065.74 }, { "epoch": 0.31049284578696346, "grad_norm": 0.23219110071659088, "learning_rate": 3.912309677331929e-05, "loss": 0.419, "num_input_tokens_seen": 7612994438, "step": 1953, "train_runtime": 77629.8231, "train_tokens_per_second": 98067.909 }, { "epoch": 0.3106518282988871, "grad_norm": 0.2217720001935959, "learning_rate": 3.9112770553387266e-05, "loss": 0.4278, "num_input_tokens_seen": 7616886332, "step": 1954, "train_runtime": 77667.8384, "train_tokens_per_second": 98070.018 }, { "epoch": 0.3108108108108108, "grad_norm": 0.24782714247703552, "learning_rate": 3.910244079830595e-05, "loss": 0.4309, "num_input_tokens_seen": 7620819230, "step": 1955, "train_runtime": 77709.015, "train_tokens_per_second": 98068.663 }, { "epoch": 0.3109697933227345, "grad_norm": 0.24782422184944153, "learning_rate": 3.9092107510662866e-05, "loss": 0.4434, "num_input_tokens_seen": 7624717316, "step": 1956, "train_runtime": 77750.6837, "train_tokens_per_second": 98066.241 }, { "epoch": 0.3111287758346582, "grad_norm": 0.2042006105184555, "learning_rate": 3.908177069304643e-05, "loss": 0.4197, "num_input_tokens_seen": 7628772132, "step": 1957, "train_runtime": 77787.8599, "train_tokens_per_second": 98071.5 }, { "epoch": 0.3112877583465819, "grad_norm": 0.31256186962127686, "learning_rate": 3.907143034804594e-05, "loss": 0.4382, "num_input_tokens_seen": 7632686813, "step": 1958, "train_runtime": 77828.5986, "train_tokens_per_second": 98070.464 }, { "epoch": 0.31144674085850554, "grad_norm": 0.251903235912323, "learning_rate": 3.9061086478251586e-05, "loss": 0.4271, "num_input_tokens_seen": 7636510093, "step": 1959, "train_runtime": 77866.8365, "train_tokens_per_second": 98071.405 }, { "epoch": 0.31160572337042924, "grad_norm": 0.2100095897912979, "learning_rate": 3.9050739086254424e-05, "loss": 0.4426, "num_input_tokens_seen": 7640490341, "step": 1960, "train_runtime": 77906.205, "train_tokens_per_second": 98072.937 }, { "epoch": 0.31176470588235294, "grad_norm": 0.2533082664012909, "learning_rate": 3.904038817464642e-05, "loss": 0.4255, "num_input_tokens_seen": 7644295316, "step": 1961, "train_runtime": 77942.1301, "train_tokens_per_second": 98076.551 }, { "epoch": 0.31192368839427664, "grad_norm": 0.5235679149627686, "learning_rate": 3.903003374602038e-05, "loss": 0.4324, "num_input_tokens_seen": 7648173956, "step": 1962, "train_runtime": 77981.7363, "train_tokens_per_second": 98076.477 }, { "epoch": 0.31208267090620034, "grad_norm": 0.41080787777900696, "learning_rate": 3.901967580297002e-05, "loss": 0.4354, "num_input_tokens_seen": 7652181042, "step": 1963, "train_runtime": 78021.8514, "train_tokens_per_second": 98077.409 }, { "epoch": 0.312241653418124, "grad_norm": 0.2619337737560272, "learning_rate": 3.9009314348089956e-05, "loss": 0.4342, "num_input_tokens_seen": 7656077406, "step": 1964, "train_runtime": 78060.3634, "train_tokens_per_second": 98078.936 }, { "epoch": 0.3124006359300477, "grad_norm": 0.22683551907539368, "learning_rate": 3.899894938397564e-05, "loss": 0.4152, "num_input_tokens_seen": 7659984674, "step": 1965, "train_runtime": 78098.8345, "train_tokens_per_second": 98080.653 }, { "epoch": 0.3125596184419714, "grad_norm": 0.24519573152065277, "learning_rate": 3.8988580913223415e-05, "loss": 0.4283, "num_input_tokens_seen": 7663841637, "step": 1966, "train_runtime": 78138.1919, "train_tokens_per_second": 98080.611 }, { "epoch": 0.3127186009538951, "grad_norm": 0.22972549498081207, "learning_rate": 3.897820893843053e-05, "loss": 0.4303, "num_input_tokens_seen": 7667751563, "step": 1967, "train_runtime": 78179.4373, "train_tokens_per_second": 98078.879 }, { "epoch": 0.3128775834658188, "grad_norm": 0.21960213780403137, "learning_rate": 3.8967833462195075e-05, "loss": 0.4211, "num_input_tokens_seen": 7671726086, "step": 1968, "train_runtime": 78220.2659, "train_tokens_per_second": 98078.497 }, { "epoch": 0.31303656597774243, "grad_norm": 0.2926222085952759, "learning_rate": 3.895745448711604e-05, "loss": 0.4133, "num_input_tokens_seen": 7675637932, "step": 1969, "train_runtime": 78258.8185, "train_tokens_per_second": 98080.166 }, { "epoch": 0.31319554848966613, "grad_norm": 0.22424058616161346, "learning_rate": 3.894707201579329e-05, "loss": 0.4531, "num_input_tokens_seen": 7679594316, "step": 1970, "train_runtime": 78299.0222, "train_tokens_per_second": 98080.335 }, { "epoch": 0.31335453100158983, "grad_norm": 0.24197502434253693, "learning_rate": 3.893668605082755e-05, "loss": 0.4272, "num_input_tokens_seen": 7683536076, "step": 1971, "train_runtime": 78339.646, "train_tokens_per_second": 98079.791 }, { "epoch": 0.31351351351351353, "grad_norm": 0.23258836567401886, "learning_rate": 3.892629659482043e-05, "loss": 0.4277, "num_input_tokens_seen": 7687369558, "step": 1972, "train_runtime": 78377.2761, "train_tokens_per_second": 98081.612 }, { "epoch": 0.3136724960254372, "grad_norm": 1.4887019395828247, "learning_rate": 3.891590365037443e-05, "loss": 0.4319, "num_input_tokens_seen": 7691191869, "step": 1973, "train_runtime": 78414.3838, "train_tokens_per_second": 98083.942 }, { "epoch": 0.3138314785373609, "grad_norm": 0.23021863400936127, "learning_rate": 3.890550722009289e-05, "loss": 0.4186, "num_input_tokens_seen": 7695130094, "step": 1974, "train_runtime": 78452.5889, "train_tokens_per_second": 98086.375 }, { "epoch": 0.3139904610492846, "grad_norm": 0.5008619427680969, "learning_rate": 3.889510730658007e-05, "loss": 0.4339, "num_input_tokens_seen": 7699119162, "step": 1975, "train_runtime": 78492.9806, "train_tokens_per_second": 98086.722 }, { "epoch": 0.3141494435612083, "grad_norm": 0.23655900359153748, "learning_rate": 3.8884703912441054e-05, "loss": 0.4256, "num_input_tokens_seen": 7702951526, "step": 1976, "train_runtime": 78533.8677, "train_tokens_per_second": 98084.454 }, { "epoch": 0.314308426073132, "grad_norm": 0.21593895554542542, "learning_rate": 3.887429704028181e-05, "loss": 0.4285, "num_input_tokens_seen": 7706926798, "step": 1977, "train_runtime": 78574.4327, "train_tokens_per_second": 98084.409 }, { "epoch": 0.3144674085850556, "grad_norm": 0.20711180567741394, "learning_rate": 3.88638866927092e-05, "loss": 0.4327, "num_input_tokens_seen": 7710784659, "step": 1978, "train_runtime": 78612.8206, "train_tokens_per_second": 98085.587 }, { "epoch": 0.3146263910969793, "grad_norm": 0.23464609682559967, "learning_rate": 3.8853472872330955e-05, "loss": 0.431, "num_input_tokens_seen": 7714739392, "step": 1979, "train_runtime": 78652.0827, "train_tokens_per_second": 98086.905 }, { "epoch": 0.314785373608903, "grad_norm": 0.24959678947925568, "learning_rate": 3.8843055581755636e-05, "loss": 0.4318, "num_input_tokens_seen": 7718547961, "step": 1980, "train_runtime": 78691.5843, "train_tokens_per_second": 98086.066 }, { "epoch": 0.3149443561208267, "grad_norm": 0.21711018681526184, "learning_rate": 3.883263482359271e-05, "loss": 0.4161, "num_input_tokens_seen": 7722527131, "step": 1981, "train_runtime": 78728.8948, "train_tokens_per_second": 98090.125 }, { "epoch": 0.3151033386327504, "grad_norm": 0.29304537177085876, "learning_rate": 3.882221060045251e-05, "loss": 0.429, "num_input_tokens_seen": 7726481250, "step": 1982, "train_runtime": 78768.2337, "train_tokens_per_second": 98091.336 }, { "epoch": 0.31526232114467406, "grad_norm": 0.229558065533638, "learning_rate": 3.881178291494624e-05, "loss": 0.4294, "num_input_tokens_seen": 7730304979, "step": 1983, "train_runtime": 78806.3904, "train_tokens_per_second": 98092.362 }, { "epoch": 0.31542130365659776, "grad_norm": 0.2215113341808319, "learning_rate": 3.8801351769685934e-05, "loss": 0.4365, "num_input_tokens_seen": 7734237004, "step": 1984, "train_runtime": 78847.7098, "train_tokens_per_second": 98090.826 }, { "epoch": 0.31558028616852146, "grad_norm": 0.2731112241744995, "learning_rate": 3.879091716728454e-05, "loss": 0.4218, "num_input_tokens_seen": 7738165455, "step": 1985, "train_runtime": 78886.8717, "train_tokens_per_second": 98091.929 }, { "epoch": 0.31573926868044516, "grad_norm": 0.25943905115127563, "learning_rate": 3.878047911035585e-05, "loss": 0.4233, "num_input_tokens_seen": 7742019995, "step": 1986, "train_runtime": 78924.1908, "train_tokens_per_second": 98094.385 }, { "epoch": 0.31589825119236886, "grad_norm": 0.22373254597187042, "learning_rate": 3.877003760151452e-05, "loss": 0.4267, "num_input_tokens_seen": 7745916041, "step": 1987, "train_runtime": 78963.7897, "train_tokens_per_second": 98094.533 }, { "epoch": 0.3160572337042925, "grad_norm": 0.2120436280965805, "learning_rate": 3.875959264337608e-05, "loss": 0.4152, "num_input_tokens_seen": 7749880213, "step": 1988, "train_runtime": 79004.3983, "train_tokens_per_second": 98094.288 }, { "epoch": 0.3162162162162162, "grad_norm": 0.6284686923027039, "learning_rate": 3.8749144238556894e-05, "loss": 0.4314, "num_input_tokens_seen": 7753799616, "step": 1989, "train_runtime": 79043.7265, "train_tokens_per_second": 98095.067 }, { "epoch": 0.3163751987281399, "grad_norm": 0.217734694480896, "learning_rate": 3.873869238967425e-05, "loss": 0.4233, "num_input_tokens_seen": 7757723623, "step": 1990, "train_runtime": 79081.5501, "train_tokens_per_second": 98097.769 }, { "epoch": 0.3165341812400636, "grad_norm": 0.23641352355480194, "learning_rate": 3.872823709934623e-05, "loss": 0.4169, "num_input_tokens_seen": 7761558203, "step": 1991, "train_runtime": 79120.0496, "train_tokens_per_second": 98098.5 }, { "epoch": 0.3166931637519873, "grad_norm": 0.20676712691783905, "learning_rate": 3.871777837019184e-05, "loss": 0.4415, "num_input_tokens_seen": 7765445885, "step": 1992, "train_runtime": 79160.5334, "train_tokens_per_second": 98097.443 }, { "epoch": 0.31685214626391095, "grad_norm": 0.20265746116638184, "learning_rate": 3.8707316204830895e-05, "loss": 0.4378, "num_input_tokens_seen": 7769504632, "step": 1993, "train_runtime": 79200.6915, "train_tokens_per_second": 98098.949 }, { "epoch": 0.31701112877583465, "grad_norm": 0.19788628816604614, "learning_rate": 3.8696850605884106e-05, "loss": 0.4311, "num_input_tokens_seen": 7773374190, "step": 1994, "train_runtime": 79239.2308, "train_tokens_per_second": 98100.071 }, { "epoch": 0.31717011128775835, "grad_norm": 0.2605203092098236, "learning_rate": 3.8686381575973016e-05, "loss": 0.4361, "num_input_tokens_seen": 7777231977, "step": 1995, "train_runtime": 79278.1485, "train_tokens_per_second": 98100.575 }, { "epoch": 0.31732909379968205, "grad_norm": 0.18937698006629944, "learning_rate": 3.867590911772007e-05, "loss": 0.4355, "num_input_tokens_seen": 7781006466, "step": 1996, "train_runtime": 79316.5837, "train_tokens_per_second": 98100.625 }, { "epoch": 0.31748807631160575, "grad_norm": 0.21038265526294708, "learning_rate": 3.866543323374852e-05, "loss": 0.4216, "num_input_tokens_seen": 7785041819, "step": 1997, "train_runtime": 79357.4503, "train_tokens_per_second": 98100.957 }, { "epoch": 0.3176470588235294, "grad_norm": 0.21988970041275024, "learning_rate": 3.865495392668251e-05, "loss": 0.4026, "num_input_tokens_seen": 7788984756, "step": 1998, "train_runtime": 79397.6833, "train_tokens_per_second": 98100.907 }, { "epoch": 0.3178060413354531, "grad_norm": 0.19234178960323334, "learning_rate": 3.8644471199147034e-05, "loss": 0.4247, "num_input_tokens_seen": 7792898533, "step": 1999, "train_runtime": 79435.1023, "train_tokens_per_second": 98103.965 }, { "epoch": 0.3179650238473768, "grad_norm": 0.2016603648662567, "learning_rate": 3.863398505376793e-05, "loss": 0.4215, "num_input_tokens_seen": 7796814086, "step": 2000, "train_runtime": 79475.4081, "train_tokens_per_second": 98103.48 }, { "epoch": 0.3181240063593005, "grad_norm": 0.20685017108917236, "learning_rate": 3.862349549317192e-05, "loss": 0.4085, "num_input_tokens_seen": 7800708228, "step": 2001, "train_runtime": 79603.7505, "train_tokens_per_second": 97994.23 }, { "epoch": 0.3182829888712242, "grad_norm": 0.503902018070221, "learning_rate": 3.8613002519986564e-05, "loss": 0.4294, "num_input_tokens_seen": 7804634097, "step": 2002, "train_runtime": 79642.9893, "train_tokens_per_second": 97995.243 }, { "epoch": 0.31844197138314784, "grad_norm": 0.2177058309316635, "learning_rate": 3.8602506136840265e-05, "loss": 0.4145, "num_input_tokens_seen": 7808492543, "step": 2003, "train_runtime": 79681.8997, "train_tokens_per_second": 97995.813 }, { "epoch": 0.31860095389507154, "grad_norm": 0.21900947391986847, "learning_rate": 3.85920063463623e-05, "loss": 0.4276, "num_input_tokens_seen": 7812469437, "step": 2004, "train_runtime": 79723.1543, "train_tokens_per_second": 97994.987 }, { "epoch": 0.31875993640699524, "grad_norm": 0.21516774594783783, "learning_rate": 3.858150315118278e-05, "loss": 0.4199, "num_input_tokens_seen": 7816343393, "step": 2005, "train_runtime": 79762.5676, "train_tokens_per_second": 97995.133 }, { "epoch": 0.31891891891891894, "grad_norm": 0.2171662598848343, "learning_rate": 3.8570996553932716e-05, "loss": 0.4105, "num_input_tokens_seen": 7820212212, "step": 2006, "train_runtime": 79799.8117, "train_tokens_per_second": 97997.878 }, { "epoch": 0.3190779014308426, "grad_norm": 0.22834812104701996, "learning_rate": 3.85604865572439e-05, "loss": 0.4279, "num_input_tokens_seen": 7824147105, "step": 2007, "train_runtime": 79839.0628, "train_tokens_per_second": 97998.985 }, { "epoch": 0.3192368839427663, "grad_norm": 0.23301757872104645, "learning_rate": 3.854997316374902e-05, "loss": 0.4254, "num_input_tokens_seen": 7828065239, "step": 2008, "train_runtime": 79877.9524, "train_tokens_per_second": 98000.324 }, { "epoch": 0.31939586645469, "grad_norm": 0.20828741788864136, "learning_rate": 3.853945637608162e-05, "loss": 0.4337, "num_input_tokens_seen": 7831931656, "step": 2009, "train_runtime": 79917.9525, "train_tokens_per_second": 97999.654 }, { "epoch": 0.3195548489666137, "grad_norm": 0.22751206159591675, "learning_rate": 3.8528936196876066e-05, "loss": 0.4167, "num_input_tokens_seen": 7835833500, "step": 2010, "train_runtime": 79957.1156, "train_tokens_per_second": 98000.452 }, { "epoch": 0.3197138314785374, "grad_norm": 0.20567359030246735, "learning_rate": 3.85184126287676e-05, "loss": 0.4378, "num_input_tokens_seen": 7839787458, "step": 2011, "train_runtime": 79996.9475, "train_tokens_per_second": 98001.083 }, { "epoch": 0.319872813990461, "grad_norm": 0.21362616121768951, "learning_rate": 3.850788567439229e-05, "loss": 0.4161, "num_input_tokens_seen": 7843764891, "step": 2012, "train_runtime": 80035.1665, "train_tokens_per_second": 98003.98 }, { "epoch": 0.3200317965023847, "grad_norm": 0.2106139212846756, "learning_rate": 3.8497355336387065e-05, "loss": 0.433, "num_input_tokens_seen": 7847657783, "step": 2013, "train_runtime": 80073.3236, "train_tokens_per_second": 98005.895 }, { "epoch": 0.3201907790143084, "grad_norm": 0.21029730141162872, "learning_rate": 3.8486821617389714e-05, "loss": 0.4292, "num_input_tokens_seen": 7851443262, "step": 2014, "train_runtime": 80112.3335, "train_tokens_per_second": 98005.425 }, { "epoch": 0.3203497615262321, "grad_norm": 0.22029541432857513, "learning_rate": 3.847628452003885e-05, "loss": 0.435, "num_input_tokens_seen": 7855333735, "step": 2015, "train_runtime": 80151.974, "train_tokens_per_second": 98005.493 }, { "epoch": 0.3205087440381558, "grad_norm": 0.21193881332874298, "learning_rate": 3.846574404697395e-05, "loss": 0.4285, "num_input_tokens_seen": 7859086148, "step": 2016, "train_runtime": 80191.9283, "train_tokens_per_second": 98003.456 }, { "epoch": 0.32066772655007947, "grad_norm": 0.3724924623966217, "learning_rate": 3.8455200200835304e-05, "loss": 0.4279, "num_input_tokens_seen": 7863135219, "step": 2017, "train_runtime": 80230.453, "train_tokens_per_second": 98006.866 }, { "epoch": 0.32082670906200317, "grad_norm": 0.2153388410806656, "learning_rate": 3.844465298426409e-05, "loss": 0.4153, "num_input_tokens_seen": 7867031707, "step": 2018, "train_runtime": 80269.8028, "train_tokens_per_second": 98007.363 }, { "epoch": 0.32098569157392687, "grad_norm": 0.18447630107402802, "learning_rate": 3.843410239990232e-05, "loss": 0.4218, "num_input_tokens_seen": 7870936214, "step": 2019, "train_runtime": 80308.7248, "train_tokens_per_second": 98008.482 }, { "epoch": 0.32114467408585057, "grad_norm": 0.2065601795911789, "learning_rate": 3.842354845039282e-05, "loss": 0.4224, "num_input_tokens_seen": 7874881925, "step": 2020, "train_runtime": 80346.0741, "train_tokens_per_second": 98012.031 }, { "epoch": 0.32130365659777427, "grad_norm": 0.40149742364883423, "learning_rate": 3.8412991138379286e-05, "loss": 0.432, "num_input_tokens_seen": 7878756497, "step": 2021, "train_runtime": 80385.47, "train_tokens_per_second": 98012.197 }, { "epoch": 0.3214626391096979, "grad_norm": 0.20556344091892242, "learning_rate": 3.8402430466506254e-05, "loss": 0.4003, "num_input_tokens_seen": 7882481669, "step": 2022, "train_runtime": 80425.0229, "train_tokens_per_second": 98010.313 }, { "epoch": 0.3216216216216216, "grad_norm": 0.20730754733085632, "learning_rate": 3.8391866437419104e-05, "loss": 0.4135, "num_input_tokens_seen": 7886364119, "step": 2023, "train_runtime": 80464.3452, "train_tokens_per_second": 98010.667 }, { "epoch": 0.3217806041335453, "grad_norm": 0.21584349870681763, "learning_rate": 3.838129905376404e-05, "loss": 0.4204, "num_input_tokens_seen": 7890286651, "step": 2024, "train_runtime": 80502.0436, "train_tokens_per_second": 98013.495 }, { "epoch": 0.321939586645469, "grad_norm": 0.2435566782951355, "learning_rate": 3.837072831818812e-05, "loss": 0.4028, "num_input_tokens_seen": 7894127698, "step": 2025, "train_runtime": 80542.741, "train_tokens_per_second": 98011.659 }, { "epoch": 0.3220985691573927, "grad_norm": 0.23728135228157043, "learning_rate": 3.8360154233339234e-05, "loss": 0.4183, "num_input_tokens_seen": 7897982455, "step": 2026, "train_runtime": 80581.3529, "train_tokens_per_second": 98012.532 }, { "epoch": 0.32225755166931636, "grad_norm": 0.2112887054681778, "learning_rate": 3.834957680186612e-05, "loss": 0.4191, "num_input_tokens_seen": 7901963897, "step": 2027, "train_runtime": 80619.1266, "train_tokens_per_second": 98015.995 }, { "epoch": 0.32241653418124006, "grad_norm": 0.19269375503063202, "learning_rate": 3.833899602641835e-05, "loss": 0.4279, "num_input_tokens_seen": 7905902388, "step": 2028, "train_runtime": 80659.7687, "train_tokens_per_second": 98015.436 }, { "epoch": 0.32257551669316376, "grad_norm": 0.2670805752277374, "learning_rate": 3.832841190964634e-05, "loss": 0.4267, "num_input_tokens_seen": 7909673345, "step": 2029, "train_runtime": 80700.6583, "train_tokens_per_second": 98012.501 }, { "epoch": 0.32273449920508746, "grad_norm": 0.21511463820934296, "learning_rate": 3.8317824454201326e-05, "loss": 0.4153, "num_input_tokens_seen": 7913614692, "step": 2030, "train_runtime": 80739.6423, "train_tokens_per_second": 98013.993 }, { "epoch": 0.32289348171701115, "grad_norm": 0.27721351385116577, "learning_rate": 3.83072336627354e-05, "loss": 0.4271, "num_input_tokens_seen": 7917647458, "step": 2031, "train_runtime": 80779.4156, "train_tokens_per_second": 98015.657 }, { "epoch": 0.3230524642289348, "grad_norm": 0.25383591651916504, "learning_rate": 3.829663953790147e-05, "loss": 0.4216, "num_input_tokens_seen": 7921346735, "step": 2032, "train_runtime": 80815.6448, "train_tokens_per_second": 98017.491 }, { "epoch": 0.3232114467408585, "grad_norm": 0.23309345543384552, "learning_rate": 3.8286042082353294e-05, "loss": 0.4079, "num_input_tokens_seen": 7925266242, "step": 2033, "train_runtime": 80855.6375, "train_tokens_per_second": 98017.485 }, { "epoch": 0.3233704292527822, "grad_norm": 0.19243542850017548, "learning_rate": 3.827544129874546e-05, "loss": 0.4272, "num_input_tokens_seen": 7929183026, "step": 2034, "train_runtime": 80894.8723, "train_tokens_per_second": 98018.364 }, { "epoch": 0.3235294117647059, "grad_norm": 0.22482424974441528, "learning_rate": 3.826483718973339e-05, "loss": 0.4307, "num_input_tokens_seen": 7933048004, "step": 2035, "train_runtime": 80936.3716, "train_tokens_per_second": 98015.859 }, { "epoch": 0.32368839427662954, "grad_norm": 0.264775812625885, "learning_rate": 3.825422975797334e-05, "loss": 0.4211, "num_input_tokens_seen": 7936884382, "step": 2036, "train_runtime": 80977.6283, "train_tokens_per_second": 98013.298 }, { "epoch": 0.32384737678855324, "grad_norm": 0.21170099079608917, "learning_rate": 3.824361900612239e-05, "loss": 0.4314, "num_input_tokens_seen": 7940762927, "step": 2037, "train_runtime": 81017.0899, "train_tokens_per_second": 98013.431 }, { "epoch": 0.32400635930047694, "grad_norm": 0.5605229735374451, "learning_rate": 3.8233004936838466e-05, "loss": 0.4022, "num_input_tokens_seen": 7944728243, "step": 2038, "train_runtime": 81053.8904, "train_tokens_per_second": 98017.852 }, { "epoch": 0.32416534181240064, "grad_norm": 0.23507092893123627, "learning_rate": 3.8222387552780315e-05, "loss": 0.4234, "num_input_tokens_seen": 7948648625, "step": 2039, "train_runtime": 81090.8377, "train_tokens_per_second": 98021.538 }, { "epoch": 0.32432432432432434, "grad_norm": 0.23199531435966492, "learning_rate": 3.8211766856607516e-05, "loss": 0.4241, "num_input_tokens_seen": 7952538020, "step": 2040, "train_runtime": 81130.2495, "train_tokens_per_second": 98021.861 }, { "epoch": 0.324483306836248, "grad_norm": 0.23267614841461182, "learning_rate": 3.820114285098048e-05, "loss": 0.4392, "num_input_tokens_seen": 7956472374, "step": 2041, "train_runtime": 81170.4475, "train_tokens_per_second": 98021.788 }, { "epoch": 0.3246422893481717, "grad_norm": 0.943717896938324, "learning_rate": 3.8190515538560435e-05, "loss": 0.4266, "num_input_tokens_seen": 7960454803, "step": 2042, "train_runtime": 81212.8064, "train_tokens_per_second": 98019.699 }, { "epoch": 0.3248012718600954, "grad_norm": 0.25283968448638916, "learning_rate": 3.817988492200947e-05, "loss": 0.4182, "num_input_tokens_seen": 7964180223, "step": 2043, "train_runtime": 81252.491, "train_tokens_per_second": 98017.675 }, { "epoch": 0.3249602543720191, "grad_norm": 0.2763700485229492, "learning_rate": 3.816925100399046e-05, "loss": 0.4272, "num_input_tokens_seen": 7968108900, "step": 2044, "train_runtime": 81291.6651, "train_tokens_per_second": 98018.768 }, { "epoch": 0.3251192368839428, "grad_norm": 0.24956199526786804, "learning_rate": 3.8158613787167125e-05, "loss": 0.4296, "num_input_tokens_seen": 7972118131, "step": 2045, "train_runtime": 81332.5732, "train_tokens_per_second": 98018.762 }, { "epoch": 0.32527821939586643, "grad_norm": 0.2781706750392914, "learning_rate": 3.814797327420403e-05, "loss": 0.4233, "num_input_tokens_seen": 7976003888, "step": 2046, "train_runtime": 81371.088, "train_tokens_per_second": 98020.121 }, { "epoch": 0.32543720190779013, "grad_norm": 0.23433542251586914, "learning_rate": 3.813732946776653e-05, "loss": 0.4219, "num_input_tokens_seen": 7979820029, "step": 2047, "train_runtime": 81409.165, "train_tokens_per_second": 98021.151 }, { "epoch": 0.32559618441971383, "grad_norm": 0.3505499064922333, "learning_rate": 3.812668237052084e-05, "loss": 0.4236, "num_input_tokens_seen": 7983649249, "step": 2048, "train_runtime": 81450.0366, "train_tokens_per_second": 98018.977 }, { "epoch": 0.32575516693163753, "grad_norm": 0.504500687122345, "learning_rate": 3.811603198513397e-05, "loss": 0.423, "num_input_tokens_seen": 7987625631, "step": 2049, "train_runtime": 81490.7843, "train_tokens_per_second": 98018.76 }, { "epoch": 0.32591414944356123, "grad_norm": 0.21526803076267242, "learning_rate": 3.810537831427376e-05, "loss": 0.427, "num_input_tokens_seen": 7991482072, "step": 2050, "train_runtime": 81529.7323, "train_tokens_per_second": 98019.236 }, { "epoch": 0.3260731319554849, "grad_norm": 0.44719862937927246, "learning_rate": 3.80947213606089e-05, "loss": 0.4223, "num_input_tokens_seen": 7995255461, "step": 2051, "train_runtime": 81570.2546, "train_tokens_per_second": 98016.802 }, { "epoch": 0.3262321144674086, "grad_norm": 0.24323666095733643, "learning_rate": 3.808406112680887e-05, "loss": 0.4288, "num_input_tokens_seen": 7999187540, "step": 2052, "train_runtime": 81610.2721, "train_tokens_per_second": 98016.92 }, { "epoch": 0.3263910969793323, "grad_norm": 0.252453088760376, "learning_rate": 3.8073397615543985e-05, "loss": 0.426, "num_input_tokens_seen": 8003152363, "step": 2053, "train_runtime": 81650.9543, "train_tokens_per_second": 98016.642 }, { "epoch": 0.326550079491256, "grad_norm": 0.29302048683166504, "learning_rate": 3.806273082948537e-05, "loss": 0.4215, "num_input_tokens_seen": 8006968122, "step": 2054, "train_runtime": 81691.5952, "train_tokens_per_second": 98014.589 }, { "epoch": 0.3267090620031797, "grad_norm": 0.25154101848602295, "learning_rate": 3.8052060771305e-05, "loss": 0.4246, "num_input_tokens_seen": 8010858059, "step": 2055, "train_runtime": 81730.9193, "train_tokens_per_second": 98015.024 }, { "epoch": 0.3268680445151033, "grad_norm": 0.22657924890518188, "learning_rate": 3.804138744367564e-05, "loss": 0.4209, "num_input_tokens_seen": 8014810777, "step": 2056, "train_runtime": 81769.402, "train_tokens_per_second": 98017.236 }, { "epoch": 0.327027027027027, "grad_norm": 0.2091270089149475, "learning_rate": 3.8030710849270865e-05, "loss": 0.4167, "num_input_tokens_seen": 8018796422, "step": 2057, "train_runtime": 81809.0452, "train_tokens_per_second": 98018.458 }, { "epoch": 0.3271860095389507, "grad_norm": 0.21440847218036652, "learning_rate": 3.8020030990765115e-05, "loss": 0.4295, "num_input_tokens_seen": 8022699573, "step": 2058, "train_runtime": 81846.5982, "train_tokens_per_second": 98021.173 }, { "epoch": 0.3273449920508744, "grad_norm": 0.228862464427948, "learning_rate": 3.80093478708336e-05, "loss": 0.4245, "num_input_tokens_seen": 8026598294, "step": 2059, "train_runtime": 81884.9956, "train_tokens_per_second": 98022.821 }, { "epoch": 0.3275039745627981, "grad_norm": 0.20939886569976807, "learning_rate": 3.799866149215237e-05, "loss": 0.4376, "num_input_tokens_seen": 8030405880, "step": 2060, "train_runtime": 81924.5066, "train_tokens_per_second": 98022.023 }, { "epoch": 0.32766295707472176, "grad_norm": 0.21374064683914185, "learning_rate": 3.7987971857398296e-05, "loss": 0.4294, "num_input_tokens_seen": 8034278016, "step": 2061, "train_runtime": 81963.9051, "train_tokens_per_second": 98022.148 }, { "epoch": 0.32782193958664546, "grad_norm": 0.2371569722890854, "learning_rate": 3.797727896924904e-05, "loss": 0.4148, "num_input_tokens_seen": 8038139988, "step": 2062, "train_runtime": 82002.0195, "train_tokens_per_second": 98023.683 }, { "epoch": 0.32798092209856916, "grad_norm": 0.21364818513393402, "learning_rate": 3.796658283038312e-05, "loss": 0.4435, "num_input_tokens_seen": 8042051325, "step": 2063, "train_runtime": 82040.1748, "train_tokens_per_second": 98025.77 }, { "epoch": 0.32813990461049286, "grad_norm": 0.2814807593822479, "learning_rate": 3.7955883443479814e-05, "loss": 0.4025, "num_input_tokens_seen": 8045901871, "step": 2064, "train_runtime": 82076.4306, "train_tokens_per_second": 98029.383 }, { "epoch": 0.32829888712241656, "grad_norm": 0.2312985509634018, "learning_rate": 3.794518081121926e-05, "loss": 0.4111, "num_input_tokens_seen": 8049850958, "step": 2065, "train_runtime": 82117.1551, "train_tokens_per_second": 98028.858 }, { "epoch": 0.3284578696343402, "grad_norm": 0.22856071591377258, "learning_rate": 3.7934474936282386e-05, "loss": 0.423, "num_input_tokens_seen": 8053658256, "step": 2066, "train_runtime": 82156.6741, "train_tokens_per_second": 98028.046 }, { "epoch": 0.3286168521462639, "grad_norm": 0.3421512842178345, "learning_rate": 3.7923765821350944e-05, "loss": 0.4264, "num_input_tokens_seen": 8057617572, "step": 2067, "train_runtime": 82194.5217, "train_tokens_per_second": 98031.078 }, { "epoch": 0.3287758346581876, "grad_norm": 0.20233269035816193, "learning_rate": 3.7913053469107486e-05, "loss": 0.4263, "num_input_tokens_seen": 8061591173, "step": 2068, "train_runtime": 82233.263, "train_tokens_per_second": 98033.215 }, { "epoch": 0.3289348171701113, "grad_norm": 0.22415941953659058, "learning_rate": 3.790233788223537e-05, "loss": 0.419, "num_input_tokens_seen": 8065521108, "step": 2069, "train_runtime": 82272.7028, "train_tokens_per_second": 98033.987 }, { "epoch": 0.32909379968203495, "grad_norm": 0.21659761667251587, "learning_rate": 3.789161906341881e-05, "loss": 0.4072, "num_input_tokens_seen": 8069356955, "step": 2070, "train_runtime": 82310.8399, "train_tokens_per_second": 98035.167 }, { "epoch": 0.32925278219395865, "grad_norm": 0.22135718166828156, "learning_rate": 3.788089701534275e-05, "loss": 0.4202, "num_input_tokens_seen": 8073241260, "step": 2071, "train_runtime": 82349.0071, "train_tokens_per_second": 98036.899 }, { "epoch": 0.32941176470588235, "grad_norm": 0.19485628604888916, "learning_rate": 3.7870171740693025e-05, "loss": 0.4169, "num_input_tokens_seen": 8077043078, "step": 2072, "train_runtime": 82387.0121, "train_tokens_per_second": 98037.82 }, { "epoch": 0.32957074721780605, "grad_norm": 0.27872559428215027, "learning_rate": 3.785944324215622e-05, "loss": 0.4419, "num_input_tokens_seen": 8080965790, "step": 2073, "train_runtime": 82427.7644, "train_tokens_per_second": 98036.94 }, { "epoch": 0.32972972972972975, "grad_norm": 0.196533665060997, "learning_rate": 3.784871152241975e-05, "loss": 0.42, "num_input_tokens_seen": 8084848611, "step": 2074, "train_runtime": 82467.4544, "train_tokens_per_second": 98036.84 }, { "epoch": 0.3298887122416534, "grad_norm": 0.20754094421863556, "learning_rate": 3.783797658417183e-05, "loss": 0.4175, "num_input_tokens_seen": 8088731639, "step": 2075, "train_runtime": 82507.5419, "train_tokens_per_second": 98036.27 }, { "epoch": 0.3300476947535771, "grad_norm": 0.22255921363830566, "learning_rate": 3.7827238430101504e-05, "loss": 0.419, "num_input_tokens_seen": 8092622290, "step": 2076, "train_runtime": 82548.3202, "train_tokens_per_second": 98034.972 }, { "epoch": 0.3302066772655008, "grad_norm": 0.22392044961452484, "learning_rate": 3.7816497062898584e-05, "loss": 0.423, "num_input_tokens_seen": 8096691235, "step": 2077, "train_runtime": 82586.928, "train_tokens_per_second": 98038.412 }, { "epoch": 0.3303656597774245, "grad_norm": 0.25310856103897095, "learning_rate": 3.780575248525373e-05, "loss": 0.4159, "num_input_tokens_seen": 8100587076, "step": 2078, "train_runtime": 82626.188, "train_tokens_per_second": 98038.979 }, { "epoch": 0.3305246422893482, "grad_norm": 0.22674022614955902, "learning_rate": 3.779500469985835e-05, "loss": 0.417, "num_input_tokens_seen": 8104317801, "step": 2079, "train_runtime": 82666.0961, "train_tokens_per_second": 98036.779 }, { "epoch": 0.33068362480127184, "grad_norm": 0.25820887088775635, "learning_rate": 3.7784253709404725e-05, "loss": 0.4082, "num_input_tokens_seen": 8108364522, "step": 2080, "train_runtime": 82706.5214, "train_tokens_per_second": 98037.789 }, { "epoch": 0.33084260731319554, "grad_norm": 0.2240394800901413, "learning_rate": 3.777349951658587e-05, "loss": 0.4155, "num_input_tokens_seen": 8112288552, "step": 2081, "train_runtime": 82743.5881, "train_tokens_per_second": 98041.295 }, { "epoch": 0.33100158982511924, "grad_norm": 0.2691735625267029, "learning_rate": 3.776274212409565e-05, "loss": 0.4392, "num_input_tokens_seen": 8116020468, "step": 2082, "train_runtime": 82784.5644, "train_tokens_per_second": 98037.847 }, { "epoch": 0.33116057233704294, "grad_norm": 0.3102949261665344, "learning_rate": 3.775198153462872e-05, "loss": 0.4196, "num_input_tokens_seen": 8119902859, "step": 2083, "train_runtime": 82822.0655, "train_tokens_per_second": 98040.333 }, { "epoch": 0.33131955484896664, "grad_norm": 0.21284720301628113, "learning_rate": 3.774121775088052e-05, "loss": 0.4309, "num_input_tokens_seen": 8123827104, "step": 2084, "train_runtime": 82862.2723, "train_tokens_per_second": 98040.12 }, { "epoch": 0.3314785373608903, "grad_norm": 0.2437898963689804, "learning_rate": 3.773045077554731e-05, "loss": 0.4269, "num_input_tokens_seen": 8127662713, "step": 2085, "train_runtime": 82901.029, "train_tokens_per_second": 98040.553 }, { "epoch": 0.331637519872814, "grad_norm": 0.22986385226249695, "learning_rate": 3.771968061132614e-05, "loss": 0.4222, "num_input_tokens_seen": 8131616681, "step": 2086, "train_runtime": 82939.1104, "train_tokens_per_second": 98043.211 }, { "epoch": 0.3317965023847377, "grad_norm": 0.21086733043193817, "learning_rate": 3.770890726091486e-05, "loss": 0.4204, "num_input_tokens_seen": 8135494987, "step": 2087, "train_runtime": 82979.4394, "train_tokens_per_second": 98042.299 }, { "epoch": 0.3319554848966614, "grad_norm": 0.21672719717025757, "learning_rate": 3.769813072701212e-05, "loss": 0.4207, "num_input_tokens_seen": 8139397910, "step": 2088, "train_runtime": 83016.1743, "train_tokens_per_second": 98045.929 }, { "epoch": 0.3321144674085851, "grad_norm": 0.23540833592414856, "learning_rate": 3.768735101231735e-05, "loss": 0.4279, "num_input_tokens_seen": 8143366203, "step": 2089, "train_runtime": 83065.1724, "train_tokens_per_second": 98035.867 }, { "epoch": 0.3322734499205087, "grad_norm": 0.2062399983406067, "learning_rate": 3.7676568119530806e-05, "loss": 0.424, "num_input_tokens_seen": 8147174211, "step": 2090, "train_runtime": 83105.4767, "train_tokens_per_second": 98034.143 }, { "epoch": 0.3324324324324324, "grad_norm": 0.20418287813663483, "learning_rate": 3.7665782051353526e-05, "loss": 0.4152, "num_input_tokens_seen": 8151203484, "step": 2091, "train_runtime": 83145.6103, "train_tokens_per_second": 98035.284 }, { "epoch": 0.3325914149443561, "grad_norm": 0.2065865844488144, "learning_rate": 3.7654992810487354e-05, "loss": 0.4258, "num_input_tokens_seen": 8155057029, "step": 2092, "train_runtime": 83183.8817, "train_tokens_per_second": 98036.505 }, { "epoch": 0.3327503974562798, "grad_norm": 0.19555211067199707, "learning_rate": 3.7644200399634896e-05, "loss": 0.4275, "num_input_tokens_seen": 8158939515, "step": 2093, "train_runtime": 83220.9011, "train_tokens_per_second": 98039.548 }, { "epoch": 0.3329093799682035, "grad_norm": 0.220467209815979, "learning_rate": 3.763340482149959e-05, "loss": 0.4283, "num_input_tokens_seen": 8162878669, "step": 2094, "train_runtime": 83258.8176, "train_tokens_per_second": 98042.212 }, { "epoch": 0.33306836248012717, "grad_norm": 0.21442045271396637, "learning_rate": 3.7622606078785656e-05, "loss": 0.4367, "num_input_tokens_seen": 8166733250, "step": 2095, "train_runtime": 83298.5383, "train_tokens_per_second": 98041.735 }, { "epoch": 0.33322734499205087, "grad_norm": 0.2503238618373871, "learning_rate": 3.761180417419808e-05, "loss": 0.4328, "num_input_tokens_seen": 8170684910, "step": 2096, "train_runtime": 83337.1681, "train_tokens_per_second": 98043.707 }, { "epoch": 0.33338632750397457, "grad_norm": 0.19964618980884552, "learning_rate": 3.760099911044269e-05, "loss": 0.428, "num_input_tokens_seen": 8174540358, "step": 2097, "train_runtime": 83376.9845, "train_tokens_per_second": 98043.128 }, { "epoch": 0.33354531001589827, "grad_norm": 0.19423890113830566, "learning_rate": 3.759019089022606e-05, "loss": 0.4201, "num_input_tokens_seen": 8178440635, "step": 2098, "train_runtime": 83415.2148, "train_tokens_per_second": 98044.951 }, { "epoch": 0.33370429252782197, "grad_norm": 0.23001711070537567, "learning_rate": 3.757937951625559e-05, "loss": 0.4261, "num_input_tokens_seen": 8182390262, "step": 2099, "train_runtime": 83454.0597, "train_tokens_per_second": 98046.641 }, { "epoch": 0.3338632750397456, "grad_norm": 0.2412482500076294, "learning_rate": 3.756856499123942e-05, "loss": 0.4294, "num_input_tokens_seen": 8186293627, "step": 2100, "train_runtime": 83491.3868, "train_tokens_per_second": 98049.559 }, { "epoch": 0.3340222575516693, "grad_norm": 0.20794104039669037, "learning_rate": 3.755774731788655e-05, "loss": 0.4277, "num_input_tokens_seen": 8190141790, "step": 2101, "train_runtime": 83530.4363, "train_tokens_per_second": 98049.791 }, { "epoch": 0.334181240063593, "grad_norm": 0.23204892873764038, "learning_rate": 3.754692649890671e-05, "loss": 0.409, "num_input_tokens_seen": 8193880504, "step": 2102, "train_runtime": 83568.7121, "train_tokens_per_second": 98049.62 }, { "epoch": 0.3343402225755167, "grad_norm": 0.21536144614219666, "learning_rate": 3.7536102537010445e-05, "loss": 0.4034, "num_input_tokens_seen": 8197838530, "step": 2103, "train_runtime": 83607.355, "train_tokens_per_second": 98051.643 }, { "epoch": 0.33449920508744035, "grad_norm": 0.2230854630470276, "learning_rate": 3.7525275434909077e-05, "loss": 0.4133, "num_input_tokens_seen": 8201843466, "step": 2104, "train_runtime": 83646.7558, "train_tokens_per_second": 98053.336 }, { "epoch": 0.33465818759936405, "grad_norm": 0.20585177838802338, "learning_rate": 3.7514445195314716e-05, "loss": 0.4143, "num_input_tokens_seen": 8205723129, "step": 2105, "train_runtime": 83684.6764, "train_tokens_per_second": 98055.265 }, { "epoch": 0.33481717011128775, "grad_norm": 0.204660102725029, "learning_rate": 3.7503611820940275e-05, "loss": 0.406, "num_input_tokens_seen": 8209636080, "step": 2106, "train_runtime": 83724.935, "train_tokens_per_second": 98054.852 }, { "epoch": 0.33497615262321145, "grad_norm": 0.20080524682998657, "learning_rate": 3.749277531449942e-05, "loss": 0.4168, "num_input_tokens_seen": 8213454587, "step": 2107, "train_runtime": 83764.1092, "train_tokens_per_second": 98054.58 }, { "epoch": 0.33513513513513515, "grad_norm": 0.2296069860458374, "learning_rate": 3.748193567870663e-05, "loss": 0.4293, "num_input_tokens_seen": 8217361110, "step": 2108, "train_runtime": 83802.9157, "train_tokens_per_second": 98055.79 }, { "epoch": 0.3352941176470588, "grad_norm": 0.2638278603553772, "learning_rate": 3.747109291627715e-05, "loss": 0.4184, "num_input_tokens_seen": 8221225539, "step": 2109, "train_runtime": 83841.0173, "train_tokens_per_second": 98057.321 }, { "epoch": 0.3354531001589825, "grad_norm": 0.23320841789245605, "learning_rate": 3.746024702992703e-05, "loss": 0.4135, "num_input_tokens_seen": 8225241162, "step": 2110, "train_runtime": 83879.8957, "train_tokens_per_second": 98059.745 }, { "epoch": 0.3356120826709062, "grad_norm": 0.2524004578590393, "learning_rate": 3.744939802237307e-05, "loss": 0.4185, "num_input_tokens_seen": 8229092049, "step": 2111, "train_runtime": 83919.6834, "train_tokens_per_second": 98059.141 }, { "epoch": 0.3357710651828299, "grad_norm": 0.22757844626903534, "learning_rate": 3.7438545896332874e-05, "loss": 0.4173, "num_input_tokens_seen": 8233115271, "step": 2112, "train_runtime": 83957.8918, "train_tokens_per_second": 98062.435 }, { "epoch": 0.3359300476947536, "grad_norm": 0.2084929198026657, "learning_rate": 3.7427690654524835e-05, "loss": 0.4231, "num_input_tokens_seen": 8236930417, "step": 2113, "train_runtime": 83998.1859, "train_tokens_per_second": 98060.813 }, { "epoch": 0.33608903020667724, "grad_norm": 0.2182374894618988, "learning_rate": 3.74168322996681e-05, "loss": 0.4193, "num_input_tokens_seen": 8240733029, "step": 2114, "train_runtime": 84038.4812, "train_tokens_per_second": 98059.043 }, { "epoch": 0.33624801271860094, "grad_norm": 0.2735609710216522, "learning_rate": 3.7405970834482614e-05, "loss": 0.4221, "num_input_tokens_seen": 8244627180, "step": 2115, "train_runtime": 84078.8288, "train_tokens_per_second": 98058.302 }, { "epoch": 0.33640699523052464, "grad_norm": 0.197895810008049, "learning_rate": 3.73951062616891e-05, "loss": 0.4098, "num_input_tokens_seen": 8248540104, "step": 2116, "train_runtime": 84118.0885, "train_tokens_per_second": 98059.053 }, { "epoch": 0.33656597774244834, "grad_norm": 0.23127254843711853, "learning_rate": 3.7384238584009055e-05, "loss": 0.4196, "num_input_tokens_seen": 8252388811, "step": 2117, "train_runtime": 84157.7756, "train_tokens_per_second": 98058.542 }, { "epoch": 0.33672496025437204, "grad_norm": 0.20575131475925446, "learning_rate": 3.737336780416476e-05, "loss": 0.4102, "num_input_tokens_seen": 8256285518, "step": 2118, "train_runtime": 84198.1465, "train_tokens_per_second": 98057.806 }, { "epoch": 0.3368839427662957, "grad_norm": 0.4545816481113434, "learning_rate": 3.736249392487927e-05, "loss": 0.4165, "num_input_tokens_seen": 8260275630, "step": 2119, "train_runtime": 84236.8995, "train_tokens_per_second": 98060.062 }, { "epoch": 0.3370429252782194, "grad_norm": 0.20277941226959229, "learning_rate": 3.73516169488764e-05, "loss": 0.4144, "num_input_tokens_seen": 8264071897, "step": 2120, "train_runtime": 84275.1439, "train_tokens_per_second": 98060.609 }, { "epoch": 0.3372019077901431, "grad_norm": 0.1972692906856537, "learning_rate": 3.734073687888077e-05, "loss": 0.4259, "num_input_tokens_seen": 8267965399, "step": 2121, "train_runtime": 84312.3084, "train_tokens_per_second": 98063.563 }, { "epoch": 0.3373608903020668, "grad_norm": 0.22348947823047638, "learning_rate": 3.7329853717617746e-05, "loss": 0.4196, "num_input_tokens_seen": 8271882271, "step": 2122, "train_runtime": 84352.0865, "train_tokens_per_second": 98063.754 }, { "epoch": 0.3375198728139905, "grad_norm": 0.2940234839916229, "learning_rate": 3.73189674678135e-05, "loss": 0.4182, "num_input_tokens_seen": 8275849078, "step": 2123, "train_runtime": 84393.3204, "train_tokens_per_second": 98062.845 }, { "epoch": 0.33767885532591413, "grad_norm": 0.21536816656589508, "learning_rate": 3.730807813219496e-05, "loss": 0.4127, "num_input_tokens_seen": 8279717543, "step": 2124, "train_runtime": 84432.4687, "train_tokens_per_second": 98063.194 }, { "epoch": 0.33783783783783783, "grad_norm": 0.19435051083564758, "learning_rate": 3.7297185713489806e-05, "loss": 0.4222, "num_input_tokens_seen": 8283556697, "step": 2125, "train_runtime": 84470.4457, "train_tokens_per_second": 98064.555 }, { "epoch": 0.33799682034976153, "grad_norm": 0.21105018258094788, "learning_rate": 3.7286290214426535e-05, "loss": 0.4143, "num_input_tokens_seen": 8287488373, "step": 2126, "train_runtime": 84509.7441, "train_tokens_per_second": 98065.477 }, { "epoch": 0.33815580286168523, "grad_norm": 0.2869243919849396, "learning_rate": 3.7275391637734393e-05, "loss": 0.4236, "num_input_tokens_seen": 8291357586, "step": 2127, "train_runtime": 84549.4202, "train_tokens_per_second": 98065.221 }, { "epoch": 0.33831478537360893, "grad_norm": 0.5832518935203552, "learning_rate": 3.7264489986143376e-05, "loss": 0.4238, "num_input_tokens_seen": 8295262388, "step": 2128, "train_runtime": 84588.1773, "train_tokens_per_second": 98066.452 }, { "epoch": 0.3384737678855326, "grad_norm": 0.24142354726791382, "learning_rate": 3.725358526238429e-05, "loss": 0.4397, "num_input_tokens_seen": 8299236466, "step": 2129, "train_runtime": 84629.7005, "train_tokens_per_second": 98065.294 }, { "epoch": 0.3386327503974563, "grad_norm": 0.20144881308078766, "learning_rate": 3.724267746918868e-05, "loss": 0.4335, "num_input_tokens_seen": 8303149149, "step": 2130, "train_runtime": 84665.6586, "train_tokens_per_second": 98069.858 }, { "epoch": 0.33879173290938, "grad_norm": 0.20653893053531647, "learning_rate": 3.7231766609288877e-05, "loss": 0.4273, "num_input_tokens_seen": 8307008274, "step": 2131, "train_runtime": 84700.2245, "train_tokens_per_second": 98075.399 }, { "epoch": 0.33895071542130367, "grad_norm": 0.20349819958209991, "learning_rate": 3.722085268541796e-05, "loss": 0.4174, "num_input_tokens_seen": 8310810712, "step": 2132, "train_runtime": 84741.1354, "train_tokens_per_second": 98072.921 }, { "epoch": 0.33910969793322737, "grad_norm": 0.214603990316391, "learning_rate": 3.7209935700309826e-05, "loss": 0.418, "num_input_tokens_seen": 8314777572, "step": 2133, "train_runtime": 84781.4602, "train_tokens_per_second": 98073.064 }, { "epoch": 0.339268680445151, "grad_norm": 0.23189927637577057, "learning_rate": 3.719901565669906e-05, "loss": 0.4193, "num_input_tokens_seen": 8318781892, "step": 2134, "train_runtime": 84817.5322, "train_tokens_per_second": 98078.566 }, { "epoch": 0.3394276629570747, "grad_norm": 0.20894862711429596, "learning_rate": 3.7188092557321085e-05, "loss": 0.4308, "num_input_tokens_seen": 8322667187, "step": 2135, "train_runtime": 84857.6312, "train_tokens_per_second": 98078.005 }, { "epoch": 0.3395866454689984, "grad_norm": 0.2212691605091095, "learning_rate": 3.7177166404912046e-05, "loss": 0.4287, "num_input_tokens_seen": 8326560898, "step": 2136, "train_runtime": 84898.1415, "train_tokens_per_second": 98077.069 }, { "epoch": 0.3397456279809221, "grad_norm": 0.2923341393470764, "learning_rate": 3.7166237202208867e-05, "loss": 0.4119, "num_input_tokens_seen": 8330498639, "step": 2137, "train_runtime": 84938.0172, "train_tokens_per_second": 98077.385 }, { "epoch": 0.33990461049284576, "grad_norm": 0.2307652086019516, "learning_rate": 3.7155304951949244e-05, "loss": 0.4214, "num_input_tokens_seen": 8334403507, "step": 2138, "train_runtime": 84977.1817, "train_tokens_per_second": 98078.135 }, { "epoch": 0.34006359300476946, "grad_norm": 0.23622404038906097, "learning_rate": 3.7144369656871616e-05, "loss": 0.4166, "num_input_tokens_seen": 8338205675, "step": 2139, "train_runtime": 85017.6483, "train_tokens_per_second": 98076.174 }, { "epoch": 0.34022257551669316, "grad_norm": 0.20502057671546936, "learning_rate": 3.713343131971522e-05, "loss": 0.4246, "num_input_tokens_seen": 8342170128, "step": 2140, "train_runtime": 85056.6147, "train_tokens_per_second": 98077.853 }, { "epoch": 0.34038155802861686, "grad_norm": 0.21136395633220673, "learning_rate": 3.712248994322001e-05, "loss": 0.4369, "num_input_tokens_seen": 8346051980, "step": 2141, "train_runtime": 85096.0646, "train_tokens_per_second": 98078.002 }, { "epoch": 0.34054054054054056, "grad_norm": 0.24352313578128815, "learning_rate": 3.711154553012673e-05, "loss": 0.4299, "num_input_tokens_seen": 8350047108, "step": 2142, "train_runtime": 85135.2934, "train_tokens_per_second": 98079.736 }, { "epoch": 0.3406995230524642, "grad_norm": 0.2345777302980423, "learning_rate": 3.710059808317688e-05, "loss": 0.4186, "num_input_tokens_seen": 8353833412, "step": 2143, "train_runtime": 85175.081, "train_tokens_per_second": 98078.374 }, { "epoch": 0.3408585055643879, "grad_norm": 0.2098367065191269, "learning_rate": 3.708964760511272e-05, "loss": 0.4214, "num_input_tokens_seen": 8357738726, "step": 2144, "train_runtime": 85213.0623, "train_tokens_per_second": 98080.488 }, { "epoch": 0.3410174880763116, "grad_norm": 0.21032096445560455, "learning_rate": 3.7078694098677256e-05, "loss": 0.4217, "num_input_tokens_seen": 8361581621, "step": 2145, "train_runtime": 85253.3431, "train_tokens_per_second": 98079.223 }, { "epoch": 0.3411764705882353, "grad_norm": 0.20865985751152039, "learning_rate": 3.706773756661428e-05, "loss": 0.4249, "num_input_tokens_seen": 8365450552, "step": 2146, "train_runtime": 85293.1079, "train_tokens_per_second": 98078.857 }, { "epoch": 0.341335453100159, "grad_norm": 0.19822515547275543, "learning_rate": 3.705677801166832e-05, "loss": 0.4094, "num_input_tokens_seen": 8369352203, "step": 2147, "train_runtime": 85331.1447, "train_tokens_per_second": 98080.862 }, { "epoch": 0.34149443561208265, "grad_norm": 0.1981876790523529, "learning_rate": 3.704581543658466e-05, "loss": 0.4245, "num_input_tokens_seen": 8373267404, "step": 2148, "train_runtime": 85369.1996, "train_tokens_per_second": 98083.002 }, { "epoch": 0.34165341812400635, "grad_norm": 0.18685097992420197, "learning_rate": 3.7034849844109356e-05, "loss": 0.4069, "num_input_tokens_seen": 8377117788, "step": 2149, "train_runtime": 85407.3121, "train_tokens_per_second": 98084.316 }, { "epoch": 0.34181240063593005, "grad_norm": 0.2125839740037918, "learning_rate": 3.702388123698921e-05, "loss": 0.4221, "num_input_tokens_seen": 8380975497, "step": 2150, "train_runtime": 85448.2568, "train_tokens_per_second": 98082.463 }, { "epoch": 0.34197138314785375, "grad_norm": 0.20140017569065094, "learning_rate": 3.701290961797178e-05, "loss": 0.4218, "num_input_tokens_seen": 8384931864, "step": 2151, "train_runtime": 85487.0806, "train_tokens_per_second": 98084.2 }, { "epoch": 0.34213036565977745, "grad_norm": 0.2806876599788666, "learning_rate": 3.7001934989805374e-05, "loss": 0.4121, "num_input_tokens_seen": 8388847463, "step": 2152, "train_runtime": 85526.193, "train_tokens_per_second": 98085.127 }, { "epoch": 0.3422893481717011, "grad_norm": 0.22166520357131958, "learning_rate": 3.6990957355239067e-05, "loss": 0.4224, "num_input_tokens_seen": 8392698322, "step": 2153, "train_runtime": 85563.8947, "train_tokens_per_second": 98086.913 }, { "epoch": 0.3424483306836248, "grad_norm": 0.19463621079921722, "learning_rate": 3.697997671702267e-05, "loss": 0.4118, "num_input_tokens_seen": 8396550974, "step": 2154, "train_runtime": 85602.8676, "train_tokens_per_second": 98087.263 }, { "epoch": 0.3426073131955485, "grad_norm": 0.26541706919670105, "learning_rate": 3.6968993077906755e-05, "loss": 0.4141, "num_input_tokens_seen": 8400499057, "step": 2155, "train_runtime": 85641.1183, "train_tokens_per_second": 98089.554 }, { "epoch": 0.3427662957074722, "grad_norm": 0.18838246166706085, "learning_rate": 3.6958006440642664e-05, "loss": 0.4159, "num_input_tokens_seen": 8404454216, "step": 2156, "train_runtime": 85678.7834, "train_tokens_per_second": 98092.595 }, { "epoch": 0.3429252782193959, "grad_norm": 0.1900673657655716, "learning_rate": 3.6947016807982446e-05, "loss": 0.428, "num_input_tokens_seen": 8408359244, "step": 2157, "train_runtime": 85719.545, "train_tokens_per_second": 98091.506 }, { "epoch": 0.34308426073131953, "grad_norm": 0.21061281859874725, "learning_rate": 3.693602418267895e-05, "loss": 0.4241, "num_input_tokens_seen": 8412347902, "step": 2158, "train_runtime": 85757.6006, "train_tokens_per_second": 98094.488 }, { "epoch": 0.34324324324324323, "grad_norm": 0.21568240225315094, "learning_rate": 3.692502856748574e-05, "loss": 0.4184, "num_input_tokens_seen": 8416187848, "step": 2159, "train_runtime": 85799.3991, "train_tokens_per_second": 98091.454 }, { "epoch": 0.34340222575516693, "grad_norm": 0.20563632249832153, "learning_rate": 3.691402996515714e-05, "loss": 0.4221, "num_input_tokens_seen": 8420078517, "step": 2160, "train_runtime": 85836.6482, "train_tokens_per_second": 98094.214 }, { "epoch": 0.34356120826709063, "grad_norm": 0.2120743989944458, "learning_rate": 3.690302837844822e-05, "loss": 0.4198, "num_input_tokens_seen": 8424044703, "step": 2161, "train_runtime": 85877.6361, "train_tokens_per_second": 98093.579 }, { "epoch": 0.34372019077901433, "grad_norm": 0.19571392238140106, "learning_rate": 3.68920238101148e-05, "loss": 0.4074, "num_input_tokens_seen": 8427916685, "step": 2162, "train_runtime": 85917.8998, "train_tokens_per_second": 98092.676 }, { "epoch": 0.343879173290938, "grad_norm": 0.24182316660881042, "learning_rate": 3.688101626291345e-05, "loss": 0.4222, "num_input_tokens_seen": 8431799586, "step": 2163, "train_runtime": 85957.603, "train_tokens_per_second": 98092.54 }, { "epoch": 0.3440381558028617, "grad_norm": 0.2526041865348816, "learning_rate": 3.687000573960148e-05, "loss": 0.405, "num_input_tokens_seen": 8435572116, "step": 2164, "train_runtime": 85998.1095, "train_tokens_per_second": 98090.204 }, { "epoch": 0.3441971383147854, "grad_norm": 0.18017518520355225, "learning_rate": 3.6858992242936956e-05, "loss": 0.4094, "num_input_tokens_seen": 8439566184, "step": 2165, "train_runtime": 86034.8439, "train_tokens_per_second": 98094.746 }, { "epoch": 0.3443561208267091, "grad_norm": 0.21761389076709747, "learning_rate": 3.684797577567868e-05, "loss": 0.4055, "num_input_tokens_seen": 8443384572, "step": 2166, "train_runtime": 86072.06, "train_tokens_per_second": 98096.694 }, { "epoch": 0.3445151033386328, "grad_norm": 0.22449488937854767, "learning_rate": 3.683695634058618e-05, "loss": 0.4385, "num_input_tokens_seen": 8447228652, "step": 2167, "train_runtime": 86111.8829, "train_tokens_per_second": 98095.97 }, { "epoch": 0.3446740858505564, "grad_norm": 0.20303553342819214, "learning_rate": 3.6825933940419766e-05, "loss": 0.4057, "num_input_tokens_seen": 8451192401, "step": 2168, "train_runtime": 86149.0241, "train_tokens_per_second": 98099.688 }, { "epoch": 0.3448330683624801, "grad_norm": 0.20413701236248016, "learning_rate": 3.681490857794047e-05, "loss": 0.4242, "num_input_tokens_seen": 8455159299, "step": 2169, "train_runtime": 86188.1035, "train_tokens_per_second": 98101.234 }, { "epoch": 0.3449920508744038, "grad_norm": 0.19846883416175842, "learning_rate": 3.680388025591005e-05, "loss": 0.4242, "num_input_tokens_seen": 8459007536, "step": 2170, "train_runtime": 86227.0704, "train_tokens_per_second": 98101.53 }, { "epoch": 0.3451510333863275, "grad_norm": 0.23449678719043732, "learning_rate": 3.679284897709105e-05, "loss": 0.4309, "num_input_tokens_seen": 8462832755, "step": 2171, "train_runtime": 86266.9687, "train_tokens_per_second": 98100.5 }, { "epoch": 0.34531001589825117, "grad_norm": 0.18201643228530884, "learning_rate": 3.678181474424671e-05, "loss": 0.4194, "num_input_tokens_seen": 8466821158, "step": 2172, "train_runtime": 86303.906, "train_tokens_per_second": 98104.727 }, { "epoch": 0.34546899841017487, "grad_norm": 0.291618287563324, "learning_rate": 3.677077756014103e-05, "loss": 0.4271, "num_input_tokens_seen": 8470759951, "step": 2173, "train_runtime": 86343.1692, "train_tokens_per_second": 98105.734 }, { "epoch": 0.34562798092209857, "grad_norm": 0.20857486128807068, "learning_rate": 3.6759737427538756e-05, "loss": 0.4108, "num_input_tokens_seen": 8474645994, "step": 2174, "train_runtime": 86384.8556, "train_tokens_per_second": 98103.376 }, { "epoch": 0.34578696343402227, "grad_norm": 0.20688241720199585, "learning_rate": 3.674869434920535e-05, "loss": 0.4254, "num_input_tokens_seen": 8478590311, "step": 2175, "train_runtime": 86427.1794, "train_tokens_per_second": 98100.972 }, { "epoch": 0.34594594594594597, "grad_norm": 0.25244683027267456, "learning_rate": 3.673764832790702e-05, "loss": 0.4259, "num_input_tokens_seen": 8482405428, "step": 2176, "train_runtime": 86466.2486, "train_tokens_per_second": 98100.768 }, { "epoch": 0.3461049284578696, "grad_norm": 0.1987432837486267, "learning_rate": 3.672659936641074e-05, "loss": 0.4197, "num_input_tokens_seen": 8486358564, "step": 2177, "train_runtime": 86503.9835, "train_tokens_per_second": 98103.674 }, { "epoch": 0.3462639109697933, "grad_norm": 0.25636228919029236, "learning_rate": 3.671554746748417e-05, "loss": 0.43, "num_input_tokens_seen": 8490148807, "step": 2178, "train_runtime": 86545.4875, "train_tokens_per_second": 98100.422 }, { "epoch": 0.346422893481717, "grad_norm": 0.243147611618042, "learning_rate": 3.670449263389575e-05, "loss": 0.4159, "num_input_tokens_seen": 8494161664, "step": 2179, "train_runtime": 86583.8557, "train_tokens_per_second": 98103.296 }, { "epoch": 0.3465818759936407, "grad_norm": 0.23647916316986084, "learning_rate": 3.669343486841464e-05, "loss": 0.426, "num_input_tokens_seen": 8498033539, "step": 2180, "train_runtime": 86623.9989, "train_tokens_per_second": 98102.531 }, { "epoch": 0.3467408585055644, "grad_norm": 0.1864645630121231, "learning_rate": 3.6682374173810715e-05, "loss": 0.4146, "num_input_tokens_seen": 8501884194, "step": 2181, "train_runtime": 86661.874, "train_tokens_per_second": 98104.089 }, { "epoch": 0.34689984101748805, "grad_norm": 0.21749144792556763, "learning_rate": 3.667131055285462e-05, "loss": 0.4219, "num_input_tokens_seen": 8505894802, "step": 2182, "train_runtime": 86699.8116, "train_tokens_per_second": 98107.42 }, { "epoch": 0.34705882352941175, "grad_norm": 0.2237401157617569, "learning_rate": 3.6660244008317706e-05, "loss": 0.4196, "num_input_tokens_seen": 8509729362, "step": 2183, "train_runtime": 86737.9385, "train_tokens_per_second": 98108.504 }, { "epoch": 0.34721780604133545, "grad_norm": 0.20461779832839966, "learning_rate": 3.6649174542972057e-05, "loss": 0.4152, "num_input_tokens_seen": 8513641861, "step": 2184, "train_runtime": 86776.5651, "train_tokens_per_second": 98109.92 }, { "epoch": 0.34737678855325915, "grad_norm": 0.19153429567813873, "learning_rate": 3.6638102159590504e-05, "loss": 0.4111, "num_input_tokens_seen": 8517452855, "step": 2185, "train_runtime": 86815.5467, "train_tokens_per_second": 98109.765 }, { "epoch": 0.34753577106518285, "grad_norm": 0.1962917000055313, "learning_rate": 3.66270268609466e-05, "loss": 0.4301, "num_input_tokens_seen": 8521502304, "step": 2186, "train_runtime": 86856.6044, "train_tokens_per_second": 98110.01 }, { "epoch": 0.3476947535771065, "grad_norm": 0.1777418553829193, "learning_rate": 3.661594864981462e-05, "loss": 0.4025, "num_input_tokens_seen": 8525262017, "step": 2187, "train_runtime": 86894.8332, "train_tokens_per_second": 98110.114 }, { "epoch": 0.3478537360890302, "grad_norm": 0.19611065089702606, "learning_rate": 3.6604867528969604e-05, "loss": 0.4169, "num_input_tokens_seen": 8529159975, "step": 2188, "train_runtime": 86934.9688, "train_tokens_per_second": 98109.657 }, { "epoch": 0.3480127186009539, "grad_norm": 0.20014703273773193, "learning_rate": 3.659378350118726e-05, "loss": 0.4348, "num_input_tokens_seen": 8533008192, "step": 2189, "train_runtime": 86972.8962, "train_tokens_per_second": 98111.119 }, { "epoch": 0.3481717011128776, "grad_norm": 0.18723738193511963, "learning_rate": 3.658269656924409e-05, "loss": 0.4308, "num_input_tokens_seen": 8536914635, "step": 2190, "train_runtime": 87013.271, "train_tokens_per_second": 98110.49 }, { "epoch": 0.3483306836248013, "grad_norm": 0.2170405387878418, "learning_rate": 3.6571606735917265e-05, "loss": 0.4097, "num_input_tokens_seen": 8540849887, "step": 2191, "train_runtime": 87052.7795, "train_tokens_per_second": 98111.168 }, { "epoch": 0.34848966613672494, "grad_norm": 0.20167823135852814, "learning_rate": 3.656051400398472e-05, "loss": 0.4132, "num_input_tokens_seen": 8544917873, "step": 2192, "train_runtime": 87089.7354, "train_tokens_per_second": 98116.246 }, { "epoch": 0.34864864864864864, "grad_norm": 0.28767964243888855, "learning_rate": 3.6549418376225115e-05, "loss": 0.4148, "num_input_tokens_seen": 8548799900, "step": 2193, "train_runtime": 87126.5197, "train_tokens_per_second": 98119.378 }, { "epoch": 0.34880763116057234, "grad_norm": 0.19660019874572754, "learning_rate": 3.6538319855417815e-05, "loss": 0.422, "num_input_tokens_seen": 8552553981, "step": 2194, "train_runtime": 87162.7948, "train_tokens_per_second": 98121.613 }, { "epoch": 0.34896661367249604, "grad_norm": 0.1871909648180008, "learning_rate": 3.652721844434292e-05, "loss": 0.416, "num_input_tokens_seen": 8556516746, "step": 2195, "train_runtime": 87200.683, "train_tokens_per_second": 98124.424 }, { "epoch": 0.34912559618441974, "grad_norm": 0.18520191311836243, "learning_rate": 3.651611414578127e-05, "loss": 0.405, "num_input_tokens_seen": 8560343291, "step": 2196, "train_runtime": 87239.6895, "train_tokens_per_second": 98124.413 }, { "epoch": 0.3492845786963434, "grad_norm": 0.26003608107566833, "learning_rate": 3.65050069625144e-05, "loss": 0.4148, "num_input_tokens_seen": 8564296151, "step": 2197, "train_runtime": 87278.2159, "train_tokens_per_second": 98126.389 }, { "epoch": 0.3494435612082671, "grad_norm": 0.5228798985481262, "learning_rate": 3.649389689732459e-05, "loss": 0.414, "num_input_tokens_seen": 8568088001, "step": 2198, "train_runtime": 87316.3816, "train_tokens_per_second": 98126.925 }, { "epoch": 0.3496025437201908, "grad_norm": 0.21438108384609222, "learning_rate": 3.648278395299481e-05, "loss": 0.4194, "num_input_tokens_seen": 8572090213, "step": 2199, "train_runtime": 87354.2124, "train_tokens_per_second": 98130.244 }, { "epoch": 0.3497615262321145, "grad_norm": 0.249662846326828, "learning_rate": 3.64716681323088e-05, "loss": 0.4252, "num_input_tokens_seen": 8575991847, "step": 2200, "train_runtime": 87393.4785, "train_tokens_per_second": 98130.799 }, { "epoch": 0.3499205087440382, "grad_norm": 0.3697627782821655, "learning_rate": 3.646054943805099e-05, "loss": 0.419, "num_input_tokens_seen": 8579834466, "step": 2201, "train_runtime": 87544.7103, "train_tokens_per_second": 98005.173 }, { "epoch": 0.3500794912559618, "grad_norm": 0.21169401705265045, "learning_rate": 3.6449427873006526e-05, "loss": 0.4278, "num_input_tokens_seen": 8583665565, "step": 2202, "train_runtime": 87585.3177, "train_tokens_per_second": 98003.476 }, { "epoch": 0.3502384737678855, "grad_norm": 0.19224633276462555, "learning_rate": 3.643830343996129e-05, "loss": 0.4208, "num_input_tokens_seen": 8587654475, "step": 2203, "train_runtime": 87624.6736, "train_tokens_per_second": 98004.981 }, { "epoch": 0.3503974562798092, "grad_norm": 0.24567025899887085, "learning_rate": 3.6427176141701866e-05, "loss": 0.422, "num_input_tokens_seen": 8591605133, "step": 2204, "train_runtime": 87662.4789, "train_tokens_per_second": 98007.782 }, { "epoch": 0.3505564387917329, "grad_norm": 0.19822411239147186, "learning_rate": 3.641604598101557e-05, "loss": 0.4206, "num_input_tokens_seen": 8595462494, "step": 2205, "train_runtime": 87701.1959, "train_tokens_per_second": 98008.498 }, { "epoch": 0.35071542130365657, "grad_norm": 0.22284597158432007, "learning_rate": 3.640491296069042e-05, "loss": 0.4254, "num_input_tokens_seen": 8599338793, "step": 2206, "train_runtime": 87741.1949, "train_tokens_per_second": 98007.997 }, { "epoch": 0.35087440381558027, "grad_norm": 0.17517805099487305, "learning_rate": 3.6393777083515166e-05, "loss": 0.4052, "num_input_tokens_seen": 8603248311, "step": 2207, "train_runtime": 87780.8799, "train_tokens_per_second": 98008.226 }, { "epoch": 0.35103338632750397, "grad_norm": 0.21436180174350739, "learning_rate": 3.6382638352279265e-05, "loss": 0.4246, "num_input_tokens_seen": 8607005259, "step": 2208, "train_runtime": 87819.4684, "train_tokens_per_second": 98007.941 }, { "epoch": 0.35119236883942767, "grad_norm": 0.20511946082115173, "learning_rate": 3.637149676977289e-05, "loss": 0.4136, "num_input_tokens_seen": 8610905978, "step": 2209, "train_runtime": 87860.3043, "train_tokens_per_second": 98006.785 }, { "epoch": 0.35135135135135137, "grad_norm": 0.3059311509132385, "learning_rate": 3.6360352338786935e-05, "loss": 0.4163, "num_input_tokens_seen": 8614949853, "step": 2210, "train_runtime": 87898.7156, "train_tokens_per_second": 98009.963 }, { "epoch": 0.351510333863275, "grad_norm": 0.2619416415691376, "learning_rate": 3.634920506211299e-05, "loss": 0.4271, "num_input_tokens_seen": 8618773647, "step": 2211, "train_runtime": 87939.4358, "train_tokens_per_second": 98008.062 }, { "epoch": 0.3516693163751987, "grad_norm": 0.18895433843135834, "learning_rate": 3.633805494254338e-05, "loss": 0.4124, "num_input_tokens_seen": 8622670609, "step": 2212, "train_runtime": 87976.8935, "train_tokens_per_second": 98010.628 }, { "epoch": 0.3518282988871224, "grad_norm": 0.2001063972711563, "learning_rate": 3.632690198287112e-05, "loss": 0.4146, "num_input_tokens_seen": 8626638767, "step": 2213, "train_runtime": 88016.4793, "train_tokens_per_second": 98011.632 }, { "epoch": 0.3519872813990461, "grad_norm": 0.17826394736766815, "learning_rate": 3.6315746185889947e-05, "loss": 0.4232, "num_input_tokens_seen": 8630540349, "step": 2214, "train_runtime": 88053.8969, "train_tokens_per_second": 98014.292 }, { "epoch": 0.3521462639109698, "grad_norm": 0.2024923712015152, "learning_rate": 3.630458755439433e-05, "loss": 0.4128, "num_input_tokens_seen": 8634429754, "step": 2215, "train_runtime": 88094.7159, "train_tokens_per_second": 98013.027 }, { "epoch": 0.35230524642289346, "grad_norm": 0.20331555604934692, "learning_rate": 3.629342609117941e-05, "loss": 0.4223, "num_input_tokens_seen": 8638336922, "step": 2216, "train_runtime": 88134.6813, "train_tokens_per_second": 98012.914 }, { "epoch": 0.35246422893481716, "grad_norm": 0.1920572966337204, "learning_rate": 3.6282261799041046e-05, "loss": 0.407, "num_input_tokens_seen": 8642162748, "step": 2217, "train_runtime": 88174.2289, "train_tokens_per_second": 98012.343 }, { "epoch": 0.35262321144674086, "grad_norm": 0.18469998240470886, "learning_rate": 3.627109468077583e-05, "loss": 0.4077, "num_input_tokens_seen": 8646165491, "step": 2218, "train_runtime": 88211.3942, "train_tokens_per_second": 98016.425 }, { "epoch": 0.35278219395866456, "grad_norm": 0.2216024100780487, "learning_rate": 3.625992473918105e-05, "loss": 0.4051, "num_input_tokens_seen": 8649962876, "step": 2219, "train_runtime": 88251.5304, "train_tokens_per_second": 98014.877 }, { "epoch": 0.35294117647058826, "grad_norm": 0.20205730199813843, "learning_rate": 3.624875197705468e-05, "loss": 0.4215, "num_input_tokens_seen": 8653874782, "step": 2220, "train_runtime": 88290.8968, "train_tokens_per_second": 98015.482 }, { "epoch": 0.3531001589825119, "grad_norm": 0.272381067276001, "learning_rate": 3.623757639719544e-05, "loss": 0.4185, "num_input_tokens_seen": 8657707341, "step": 2221, "train_runtime": 88332.3568, "train_tokens_per_second": 98012.865 }, { "epoch": 0.3532591414944356, "grad_norm": 0.19417136907577515, "learning_rate": 3.6226398002402704e-05, "loss": 0.4208, "num_input_tokens_seen": 8661679660, "step": 2222, "train_runtime": 88371.1269, "train_tokens_per_second": 98014.815 }, { "epoch": 0.3534181240063593, "grad_norm": 0.20258735120296478, "learning_rate": 3.621521679547661e-05, "loss": 0.4361, "num_input_tokens_seen": 8665601315, "step": 2223, "train_runtime": 88407.9761, "train_tokens_per_second": 98018.32 }, { "epoch": 0.353577106518283, "grad_norm": 0.5386667847633362, "learning_rate": 3.620403277921795e-05, "loss": 0.4161, "num_input_tokens_seen": 8669498903, "step": 2224, "train_runtime": 88446.7168, "train_tokens_per_second": 98019.454 }, { "epoch": 0.3537360890302067, "grad_norm": 0.2121400684118271, "learning_rate": 3.619284595642826e-05, "loss": 0.4178, "num_input_tokens_seen": 8673456151, "step": 2225, "train_runtime": 88487.0515, "train_tokens_per_second": 98019.496 }, { "epoch": 0.35389507154213035, "grad_norm": 0.2234317809343338, "learning_rate": 3.618165632990975e-05, "loss": 0.4133, "num_input_tokens_seen": 8677468045, "step": 2226, "train_runtime": 88527.6223, "train_tokens_per_second": 98019.893 }, { "epoch": 0.35405405405405405, "grad_norm": 0.18835967779159546, "learning_rate": 3.617046390246533e-05, "loss": 0.4095, "num_input_tokens_seen": 8681314371, "step": 2227, "train_runtime": 88567.4348, "train_tokens_per_second": 98019.259 }, { "epoch": 0.35421303656597775, "grad_norm": 0.21760737895965576, "learning_rate": 3.615926867689866e-05, "loss": 0.419, "num_input_tokens_seen": 8685145780, "step": 2228, "train_runtime": 88600.6611, "train_tokens_per_second": 98025.745 }, { "epoch": 0.35437201907790145, "grad_norm": 0.21880611777305603, "learning_rate": 3.6148070656014026e-05, "loss": 0.4149, "num_input_tokens_seen": 8688974749, "step": 2229, "train_runtime": 88642.1147, "train_tokens_per_second": 98023.099 }, { "epoch": 0.35453100158982515, "grad_norm": 0.19961631298065186, "learning_rate": 3.6136869842616475e-05, "loss": 0.4106, "num_input_tokens_seen": 8692886586, "step": 2230, "train_runtime": 88681.224, "train_tokens_per_second": 98023.981 }, { "epoch": 0.3546899841017488, "grad_norm": 0.31862348318099976, "learning_rate": 3.612566623951172e-05, "loss": 0.4146, "num_input_tokens_seen": 8696872869, "step": 2231, "train_runtime": 88721.8906, "train_tokens_per_second": 98023.98 }, { "epoch": 0.3548489666136725, "grad_norm": 0.216831773519516, "learning_rate": 3.611445984950619e-05, "loss": 0.4189, "num_input_tokens_seen": 8700829644, "step": 2232, "train_runtime": 88759.712, "train_tokens_per_second": 98026.79 }, { "epoch": 0.3550079491255962, "grad_norm": 0.22053539752960205, "learning_rate": 3.6103250675407017e-05, "loss": 0.4153, "num_input_tokens_seen": 8704701165, "step": 2233, "train_runtime": 88797.914, "train_tokens_per_second": 98028.217 }, { "epoch": 0.3551669316375199, "grad_norm": 0.21602526307106018, "learning_rate": 3.609203872002199e-05, "loss": 0.4191, "num_input_tokens_seen": 8708628521, "step": 2234, "train_runtime": 88838.3043, "train_tokens_per_second": 98027.856 }, { "epoch": 0.35532591414944353, "grad_norm": 0.21289044618606567, "learning_rate": 3.608082398615966e-05, "loss": 0.4224, "num_input_tokens_seen": 8712566701, "step": 2235, "train_runtime": 88879.3143, "train_tokens_per_second": 98026.934 }, { "epoch": 0.35548489666136723, "grad_norm": 0.1968725323677063, "learning_rate": 3.6069606476629215e-05, "loss": 0.4283, "num_input_tokens_seen": 8716421816, "step": 2236, "train_runtime": 88920.3071, "train_tokens_per_second": 98025.098 }, { "epoch": 0.35564387917329093, "grad_norm": 0.22951878607273102, "learning_rate": 3.6058386194240574e-05, "loss": 0.4099, "num_input_tokens_seen": 8720430796, "step": 2237, "train_runtime": 88960.8669, "train_tokens_per_second": 98025.47 }, { "epoch": 0.35580286168521463, "grad_norm": 0.21369114518165588, "learning_rate": 3.604716314180433e-05, "loss": 0.4237, "num_input_tokens_seen": 8724351002, "step": 2238, "train_runtime": 88999.5519, "train_tokens_per_second": 98026.909 }, { "epoch": 0.35596184419713833, "grad_norm": 0.2117416262626648, "learning_rate": 3.6035937322131786e-05, "loss": 0.4358, "num_input_tokens_seen": 8728241696, "step": 2239, "train_runtime": 89039.4355, "train_tokens_per_second": 98026.696 }, { "epoch": 0.356120826709062, "grad_norm": 0.2602427005767822, "learning_rate": 3.602470873803492e-05, "loss": 0.4244, "num_input_tokens_seen": 8732165672, "step": 2240, "train_runtime": 89077.8105, "train_tokens_per_second": 98028.517 }, { "epoch": 0.3562798092209857, "grad_norm": 0.49406397342681885, "learning_rate": 3.6013477392326424e-05, "loss": 0.4324, "num_input_tokens_seen": 8736073473, "step": 2241, "train_runtime": 89116.6579, "train_tokens_per_second": 98029.635 }, { "epoch": 0.3564387917329094, "grad_norm": 0.22843441367149353, "learning_rate": 3.600224328781967e-05, "loss": 0.4149, "num_input_tokens_seen": 8739949754, "step": 2242, "train_runtime": 89155.1326, "train_tokens_per_second": 98030.809 }, { "epoch": 0.3565977742448331, "grad_norm": 0.20876099169254303, "learning_rate": 3.599100642732871e-05, "loss": 0.4326, "num_input_tokens_seen": 8743727977, "step": 2243, "train_runtime": 89192.566, "train_tokens_per_second": 98032.026 }, { "epoch": 0.3567567567567568, "grad_norm": 0.31532567739486694, "learning_rate": 3.597976681366831e-05, "loss": 0.4274, "num_input_tokens_seen": 8747598034, "step": 2244, "train_runtime": 89232.6226, "train_tokens_per_second": 98031.39 }, { "epoch": 0.3569157392686804, "grad_norm": 0.2460031807422638, "learning_rate": 3.596852444965391e-05, "loss": 0.4209, "num_input_tokens_seen": 8751530359, "step": 2245, "train_runtime": 89272.8186, "train_tokens_per_second": 98031.299 }, { "epoch": 0.3570747217806041, "grad_norm": 0.20854294300079346, "learning_rate": 3.595727933810165e-05, "loss": 0.4231, "num_input_tokens_seen": 8755453545, "step": 2246, "train_runtime": 89311.9206, "train_tokens_per_second": 98032.306 }, { "epoch": 0.3572337042925278, "grad_norm": 0.18999536335468292, "learning_rate": 3.594603148182833e-05, "loss": 0.418, "num_input_tokens_seen": 8759369681, "step": 2247, "train_runtime": 89351.1279, "train_tokens_per_second": 98033.118 }, { "epoch": 0.3573926868044515, "grad_norm": 0.20085807144641876, "learning_rate": 3.593478088365149e-05, "loss": 0.4177, "num_input_tokens_seen": 8763317917, "step": 2248, "train_runtime": 89390.0265, "train_tokens_per_second": 98034.627 }, { "epoch": 0.3575516693163752, "grad_norm": 0.1857375055551529, "learning_rate": 3.592352754638929e-05, "loss": 0.4075, "num_input_tokens_seen": 8767296929, "step": 2249, "train_runtime": 89430.0578, "train_tokens_per_second": 98035.237 }, { "epoch": 0.35771065182829886, "grad_norm": 0.19875934720039368, "learning_rate": 3.5912271472860634e-05, "loss": 0.4026, "num_input_tokens_seen": 8771233381, "step": 2250, "train_runtime": 89468.7762, "train_tokens_per_second": 98036.81 }, { "epoch": 0.35786963434022256, "grad_norm": 0.20252804458141327, "learning_rate": 3.5901012665885085e-05, "loss": 0.4135, "num_input_tokens_seen": 8775108381, "step": 2251, "train_runtime": 89509.2416, "train_tokens_per_second": 98035.781 }, { "epoch": 0.35802861685214626, "grad_norm": 0.22241924703121185, "learning_rate": 3.588975112828289e-05, "loss": 0.4214, "num_input_tokens_seen": 8779023107, "step": 2252, "train_runtime": 89547.5508, "train_tokens_per_second": 98037.557 }, { "epoch": 0.35818759936406996, "grad_norm": 0.19790662825107574, "learning_rate": 3.5878486862875005e-05, "loss": 0.4246, "num_input_tokens_seen": 8782864029, "step": 2253, "train_runtime": 89589.1208, "train_tokens_per_second": 98034.939 }, { "epoch": 0.35834658187599366, "grad_norm": 0.21596288681030273, "learning_rate": 3.5867219872483014e-05, "loss": 0.4158, "num_input_tokens_seen": 8786630353, "step": 2254, "train_runtime": 89628.4239, "train_tokens_per_second": 98033.971 }, { "epoch": 0.3585055643879173, "grad_norm": 0.19014102220535278, "learning_rate": 3.585595015992924e-05, "loss": 0.4234, "num_input_tokens_seen": 8790563634, "step": 2255, "train_runtime": 89667.2508, "train_tokens_per_second": 98035.387 }, { "epoch": 0.358664546899841, "grad_norm": 0.2120181918144226, "learning_rate": 3.5844677728036666e-05, "loss": 0.4216, "num_input_tokens_seen": 8794477695, "step": 2256, "train_runtime": 89709.0856, "train_tokens_per_second": 98033.3 }, { "epoch": 0.3588235294117647, "grad_norm": 0.18824636936187744, "learning_rate": 3.5833402579628956e-05, "loss": 0.418, "num_input_tokens_seen": 8798400568, "step": 2257, "train_runtime": 89747.8544, "train_tokens_per_second": 98034.662 }, { "epoch": 0.3589825119236884, "grad_norm": 0.1864030808210373, "learning_rate": 3.582212471753045e-05, "loss": 0.4097, "num_input_tokens_seen": 8802416209, "step": 2258, "train_runtime": 89786.7555, "train_tokens_per_second": 98036.912 }, { "epoch": 0.3591414944356121, "grad_norm": 0.20879854261875153, "learning_rate": 3.581084414456617e-05, "loss": 0.4236, "num_input_tokens_seen": 8806240025, "step": 2259, "train_runtime": 89823.4352, "train_tokens_per_second": 98039.448 }, { "epoch": 0.35930047694753575, "grad_norm": 0.174599289894104, "learning_rate": 3.579956086356183e-05, "loss": 0.408, "num_input_tokens_seen": 8810139490, "step": 2260, "train_runtime": 89862.8737, "train_tokens_per_second": 98039.815 }, { "epoch": 0.35945945945945945, "grad_norm": 0.24269026517868042, "learning_rate": 3.578827487734381e-05, "loss": 0.4298, "num_input_tokens_seen": 8813966624, "step": 2261, "train_runtime": 89903.5988, "train_tokens_per_second": 98037.973 }, { "epoch": 0.35961844197138315, "grad_norm": 0.23831585049629211, "learning_rate": 3.577698618873917e-05, "loss": 0.4066, "num_input_tokens_seen": 8817931243, "step": 2262, "train_runtime": 89942.8377, "train_tokens_per_second": 98039.282 }, { "epoch": 0.35977742448330685, "grad_norm": 0.2020299881696701, "learning_rate": 3.5765694800575635e-05, "loss": 0.4327, "num_input_tokens_seen": 8821773741, "step": 2263, "train_runtime": 89982.0226, "train_tokens_per_second": 98039.291 }, { "epoch": 0.35993640699523055, "grad_norm": 0.22080986201763153, "learning_rate": 3.575440071568162e-05, "loss": 0.4105, "num_input_tokens_seen": 8825580056, "step": 2264, "train_runtime": 90020.1852, "train_tokens_per_second": 98040.012 }, { "epoch": 0.3600953895071542, "grad_norm": 0.1885266751050949, "learning_rate": 3.574310393688624e-05, "loss": 0.4214, "num_input_tokens_seen": 8829534438, "step": 2265, "train_runtime": 90058.9916, "train_tokens_per_second": 98041.676 }, { "epoch": 0.3602543720190779, "grad_norm": 0.24250611662864685, "learning_rate": 3.5731804467019225e-05, "loss": 0.4101, "num_input_tokens_seen": 8833460587, "step": 2266, "train_runtime": 90098.4835, "train_tokens_per_second": 98042.278 }, { "epoch": 0.3604133545310016, "grad_norm": 0.21196864545345306, "learning_rate": 3.572050230891102e-05, "loss": 0.4155, "num_input_tokens_seen": 8837283273, "step": 2267, "train_runtime": 90139.4931, "train_tokens_per_second": 98040.082 }, { "epoch": 0.3605723370429253, "grad_norm": 0.223637655377388, "learning_rate": 3.570919746539276e-05, "loss": 0.4276, "num_input_tokens_seen": 8841157261, "step": 2268, "train_runtime": 90181.1077, "train_tokens_per_second": 98037.798 }, { "epoch": 0.36073131955484894, "grad_norm": 0.19881832599639893, "learning_rate": 3.56978899392962e-05, "loss": 0.4135, "num_input_tokens_seen": 8845064616, "step": 2269, "train_runtime": 90220.5811, "train_tokens_per_second": 98038.214 }, { "epoch": 0.36089030206677264, "grad_norm": 0.21354500949382782, "learning_rate": 3.5686579733453805e-05, "loss": 0.4235, "num_input_tokens_seen": 8849066947, "step": 2270, "train_runtime": 90260.7626, "train_tokens_per_second": 98038.912 }, { "epoch": 0.36104928457869634, "grad_norm": 0.19675682485103607, "learning_rate": 3.5675266850698697e-05, "loss": 0.4189, "num_input_tokens_seen": 8852775799, "step": 2271, "train_runtime": 90300.0148, "train_tokens_per_second": 98037.368 }, { "epoch": 0.36120826709062004, "grad_norm": 0.18826986849308014, "learning_rate": 3.5663951293864686e-05, "loss": 0.4185, "num_input_tokens_seen": 8856778360, "step": 2272, "train_runtime": 90342.4082, "train_tokens_per_second": 98035.668 }, { "epoch": 0.36136724960254374, "grad_norm": 0.24416103959083557, "learning_rate": 3.5652633065786225e-05, "loss": 0.4199, "num_input_tokens_seen": 8860710922, "step": 2273, "train_runtime": 90381.4032, "train_tokens_per_second": 98036.882 }, { "epoch": 0.3615262321144674, "grad_norm": 0.20799775421619415, "learning_rate": 3.564131216929846e-05, "loss": 0.4217, "num_input_tokens_seen": 8864587235, "step": 2274, "train_runtime": 90421.7902, "train_tokens_per_second": 98035.962 }, { "epoch": 0.3616852146263911, "grad_norm": 0.1938069462776184, "learning_rate": 3.562998860723719e-05, "loss": 0.407, "num_input_tokens_seen": 8868455469, "step": 2275, "train_runtime": 90461.5149, "train_tokens_per_second": 98035.673 }, { "epoch": 0.3618441971383148, "grad_norm": 0.2589607238769531, "learning_rate": 3.5618662382438894e-05, "loss": 0.3929, "num_input_tokens_seen": 8872354782, "step": 2276, "train_runtime": 90502.2935, "train_tokens_per_second": 98034.585 }, { "epoch": 0.3620031796502385, "grad_norm": 0.3639679551124573, "learning_rate": 3.56073334977407e-05, "loss": 0.4014, "num_input_tokens_seen": 8876269939, "step": 2277, "train_runtime": 90542.8834, "train_tokens_per_second": 98033.877 }, { "epoch": 0.3621621621621622, "grad_norm": 0.897819995880127, "learning_rate": 3.559600195598043e-05, "loss": 0.4058, "num_input_tokens_seen": 8880093729, "step": 2278, "train_runtime": 90581.313, "train_tokens_per_second": 98034.5 }, { "epoch": 0.3623211446740858, "grad_norm": 0.20045943558216095, "learning_rate": 3.558466775999653e-05, "loss": 0.4243, "num_input_tokens_seen": 8884003037, "step": 2279, "train_runtime": 90622.084, "train_tokens_per_second": 98033.533 }, { "epoch": 0.3624801271860095, "grad_norm": 0.31716665625572205, "learning_rate": 3.5573330912628164e-05, "loss": 0.4254, "num_input_tokens_seen": 8887889128, "step": 2280, "train_runtime": 90663.192, "train_tokens_per_second": 98031.946 }, { "epoch": 0.3626391096979332, "grad_norm": 0.20092645287513733, "learning_rate": 3.5561991416715116e-05, "loss": 0.4138, "num_input_tokens_seen": 8891859737, "step": 2281, "train_runtime": 90702.2142, "train_tokens_per_second": 98033.547 }, { "epoch": 0.3627980922098569, "grad_norm": 0.21113839745521545, "learning_rate": 3.555064927509784e-05, "loss": 0.4284, "num_input_tokens_seen": 8895703598, "step": 2282, "train_runtime": 90739.9379, "train_tokens_per_second": 98035.152 }, { "epoch": 0.3629570747217806, "grad_norm": 0.2013465315103531, "learning_rate": 3.553930449061749e-05, "loss": 0.4157, "num_input_tokens_seen": 8899609208, "step": 2283, "train_runtime": 90778.2773, "train_tokens_per_second": 98036.771 }, { "epoch": 0.36311605723370427, "grad_norm": 0.1976185441017151, "learning_rate": 3.552795706611583e-05, "loss": 0.4302, "num_input_tokens_seen": 8903542933, "step": 2284, "train_runtime": 90817.059, "train_tokens_per_second": 98038.221 }, { "epoch": 0.36327503974562797, "grad_norm": 0.21040306985378265, "learning_rate": 3.551660700443533e-05, "loss": 0.4177, "num_input_tokens_seen": 8907515032, "step": 2285, "train_runtime": 90853.738, "train_tokens_per_second": 98042.362 }, { "epoch": 0.36343402225755167, "grad_norm": 0.219656303524971, "learning_rate": 3.550525430841909e-05, "loss": 0.4209, "num_input_tokens_seen": 8911344971, "step": 2286, "train_runtime": 90893.9384, "train_tokens_per_second": 98041.136 }, { "epoch": 0.36359300476947537, "grad_norm": 0.21938931941986084, "learning_rate": 3.549389898091087e-05, "loss": 0.4272, "num_input_tokens_seen": 8915236035, "step": 2287, "train_runtime": 90937.3725, "train_tokens_per_second": 98037.097 }, { "epoch": 0.36375198728139907, "grad_norm": 0.2580340504646301, "learning_rate": 3.548254102475512e-05, "loss": 0.4179, "num_input_tokens_seen": 8919213425, "step": 2288, "train_runtime": 90977.4304, "train_tokens_per_second": 98037.649 }, { "epoch": 0.3639109697933227, "grad_norm": 0.2057373970746994, "learning_rate": 3.54711804427969e-05, "loss": 0.423, "num_input_tokens_seen": 8923252401, "step": 2289, "train_runtime": 91015.5478, "train_tokens_per_second": 98040.968 }, { "epoch": 0.3640699523052464, "grad_norm": 0.24123267829418182, "learning_rate": 3.545981723788198e-05, "loss": 0.4016, "num_input_tokens_seen": 8927047871, "step": 2290, "train_runtime": 91055.5494, "train_tokens_per_second": 98039.581 }, { "epoch": 0.3642289348171701, "grad_norm": 0.18789461255073547, "learning_rate": 3.544845141285676e-05, "loss": 0.4175, "num_input_tokens_seen": 8930935432, "step": 2291, "train_runtime": 91095.5438, "train_tokens_per_second": 98039.213 }, { "epoch": 0.3643879173290938, "grad_norm": 0.6280192136764526, "learning_rate": 3.543708297056829e-05, "loss": 0.4204, "num_input_tokens_seen": 8934859379, "step": 2292, "train_runtime": 91135.068, "train_tokens_per_second": 98039.751 }, { "epoch": 0.3645468998410175, "grad_norm": 0.2047199159860611, "learning_rate": 3.542571191386429e-05, "loss": 0.4181, "num_input_tokens_seen": 8938832645, "step": 2293, "train_runtime": 91172.6989, "train_tokens_per_second": 98042.865 }, { "epoch": 0.36470588235294116, "grad_norm": 0.226146399974823, "learning_rate": 3.5414338245593125e-05, "loss": 0.4159, "num_input_tokens_seen": 8942665935, "step": 2294, "train_runtime": 91215.2616, "train_tokens_per_second": 98039.141 }, { "epoch": 0.36486486486486486, "grad_norm": 0.2035144567489624, "learning_rate": 3.540296196860382e-05, "loss": 0.4142, "num_input_tokens_seen": 8946576979, "step": 2295, "train_runtime": 91254.0397, "train_tokens_per_second": 98040.339 }, { "epoch": 0.36502384737678856, "grad_norm": 0.26705846190452576, "learning_rate": 3.5391583085746055e-05, "loss": 0.4227, "num_input_tokens_seen": 8950559252, "step": 2296, "train_runtime": 91293.2065, "train_tokens_per_second": 98041.898 }, { "epoch": 0.36518282988871226, "grad_norm": 0.198929101228714, "learning_rate": 3.5380201599870156e-05, "loss": 0.4217, "num_input_tokens_seen": 8954523175, "step": 2297, "train_runtime": 91333.1364, "train_tokens_per_second": 98042.436 }, { "epoch": 0.36534181240063596, "grad_norm": 0.19316352903842926, "learning_rate": 3.5368817513827105e-05, "loss": 0.431, "num_input_tokens_seen": 8958363663, "step": 2298, "train_runtime": 91373.6524, "train_tokens_per_second": 98040.993 }, { "epoch": 0.3655007949125596, "grad_norm": 0.18847547471523285, "learning_rate": 3.535743083046854e-05, "loss": 0.4156, "num_input_tokens_seen": 8962301695, "step": 2299, "train_runtime": 91411.1797, "train_tokens_per_second": 98043.825 }, { "epoch": 0.3656597774244833, "grad_norm": 0.2139873504638672, "learning_rate": 3.5346041552646744e-05, "loss": 0.4249, "num_input_tokens_seen": 8966204835, "step": 2300, "train_runtime": 91450.0389, "train_tokens_per_second": 98044.844 }, { "epoch": 0.365818759936407, "grad_norm": 0.20839032530784607, "learning_rate": 3.533464968321464e-05, "loss": 0.4351, "num_input_tokens_seen": 8970132042, "step": 2301, "train_runtime": 91484.4916, "train_tokens_per_second": 98050.849 }, { "epoch": 0.3659777424483307, "grad_norm": 0.255948930978775, "learning_rate": 3.532325522502582e-05, "loss": 0.4138, "num_input_tokens_seen": 8973906495, "step": 2302, "train_runtime": 91524.74, "train_tokens_per_second": 98048.97 }, { "epoch": 0.36613672496025434, "grad_norm": 0.18780605494976044, "learning_rate": 3.531185818093451e-05, "loss": 0.4196, "num_input_tokens_seen": 8977791813, "step": 2303, "train_runtime": 91564.8821, "train_tokens_per_second": 98048.418 }, { "epoch": 0.36629570747217804, "grad_norm": 0.1958339363336563, "learning_rate": 3.53004585537956e-05, "loss": 0.4095, "num_input_tokens_seen": 8981767959, "step": 2304, "train_runtime": 91603.6467, "train_tokens_per_second": 98050.332 }, { "epoch": 0.36645468998410174, "grad_norm": 0.2079755961894989, "learning_rate": 3.528905634646462e-05, "loss": 0.4254, "num_input_tokens_seen": 8985644625, "step": 2305, "train_runtime": 91642.6449, "train_tokens_per_second": 98050.909 }, { "epoch": 0.36661367249602544, "grad_norm": 0.20298966765403748, "learning_rate": 3.5277651561797715e-05, "loss": 0.4202, "num_input_tokens_seen": 8989537395, "step": 2306, "train_runtime": 91684.2551, "train_tokens_per_second": 98048.868 }, { "epoch": 0.36677265500794914, "grad_norm": 0.22723180055618286, "learning_rate": 3.526624420265172e-05, "loss": 0.419, "num_input_tokens_seen": 8993552223, "step": 2307, "train_runtime": 91724.748, "train_tokens_per_second": 98049.353 }, { "epoch": 0.3669316375198728, "grad_norm": 0.19832998514175415, "learning_rate": 3.5254834271884105e-05, "loss": 0.4295, "num_input_tokens_seen": 8997425633, "step": 2308, "train_runtime": 91765.5437, "train_tokens_per_second": 98047.974 }, { "epoch": 0.3670906200317965, "grad_norm": 0.18668001890182495, "learning_rate": 3.524342177235296e-05, "loss": 0.4121, "num_input_tokens_seen": 9001422150, "step": 2309, "train_runtime": 91804.9706, "train_tokens_per_second": 98049.399 }, { "epoch": 0.3672496025437202, "grad_norm": 0.21440277993679047, "learning_rate": 3.5232006706917056e-05, "loss": 0.4064, "num_input_tokens_seen": 9005283969, "step": 2310, "train_runtime": 91845.0531, "train_tokens_per_second": 98048.655 }, { "epoch": 0.3674085850556439, "grad_norm": 0.20152895152568817, "learning_rate": 3.522058907843576e-05, "loss": 0.4104, "num_input_tokens_seen": 9009173428, "step": 2311, "train_runtime": 91885.4236, "train_tokens_per_second": 98047.907 }, { "epoch": 0.3675675675675676, "grad_norm": 0.206503763794899, "learning_rate": 3.5209168889769135e-05, "loss": 0.4401, "num_input_tokens_seen": 9013059180, "step": 2312, "train_runtime": 91926.6752, "train_tokens_per_second": 98046.178 }, { "epoch": 0.36772655007949123, "grad_norm": 0.6660175323486328, "learning_rate": 3.5197746143777835e-05, "loss": 0.4268, "num_input_tokens_seen": 9016880740, "step": 2313, "train_runtime": 91964.5526, "train_tokens_per_second": 98047.351 }, { "epoch": 0.36788553259141493, "grad_norm": 0.22966626286506653, "learning_rate": 3.518632084332319e-05, "loss": 0.4239, "num_input_tokens_seen": 9020848408, "step": 2314, "train_runtime": 92003.6677, "train_tokens_per_second": 98048.791 }, { "epoch": 0.36804451510333863, "grad_norm": 0.20411090552806854, "learning_rate": 3.517489299126716e-05, "loss": 0.4143, "num_input_tokens_seen": 9024799764, "step": 2315, "train_runtime": 92040.796, "train_tokens_per_second": 98052.17 }, { "epoch": 0.36820349761526233, "grad_norm": 0.19728972017765045, "learning_rate": 3.516346259047233e-05, "loss": 0.3979, "num_input_tokens_seen": 9028703574, "step": 2316, "train_runtime": 92081.4807, "train_tokens_per_second": 98051.242 }, { "epoch": 0.36836248012718603, "grad_norm": 0.2515794634819031, "learning_rate": 3.5152029643801944e-05, "loss": 0.4315, "num_input_tokens_seen": 9032659174, "step": 2317, "train_runtime": 92118.3654, "train_tokens_per_second": 98054.922 }, { "epoch": 0.3685214626391097, "grad_norm": 0.2016158252954483, "learning_rate": 3.5140594154119865e-05, "loss": 0.4142, "num_input_tokens_seen": 9036416738, "step": 2318, "train_runtime": 92158.5901, "train_tokens_per_second": 98052.897 }, { "epoch": 0.3686804451510334, "grad_norm": 0.19566522538661957, "learning_rate": 3.512915612429061e-05, "loss": 0.4123, "num_input_tokens_seen": 9040355761, "step": 2319, "train_runtime": 92196.0445, "train_tokens_per_second": 98055.788 }, { "epoch": 0.3688394276629571, "grad_norm": 0.19933059811592102, "learning_rate": 3.511771555717932e-05, "loss": 0.4232, "num_input_tokens_seen": 9044238632, "step": 2320, "train_runtime": 92235.1221, "train_tokens_per_second": 98056.341 }, { "epoch": 0.3689984101748808, "grad_norm": 0.2134518027305603, "learning_rate": 3.510627245565178e-05, "loss": 0.4191, "num_input_tokens_seen": 9048069059, "step": 2321, "train_runtime": 92273.7118, "train_tokens_per_second": 98056.845 }, { "epoch": 0.3691573926868045, "grad_norm": 0.20096780359745026, "learning_rate": 3.50948268225744e-05, "loss": 0.4253, "num_input_tokens_seen": 9051998005, "step": 2322, "train_runtime": 92311.8705, "train_tokens_per_second": 98058.873 }, { "epoch": 0.3693163751987281, "grad_norm": 0.21166181564331055, "learning_rate": 3.508337866081424e-05, "loss": 0.4249, "num_input_tokens_seen": 9056015460, "step": 2323, "train_runtime": 92351.7955, "train_tokens_per_second": 98059.983 }, { "epoch": 0.3694753577106518, "grad_norm": 0.2081201672554016, "learning_rate": 3.5071927973238985e-05, "loss": 0.4202, "num_input_tokens_seen": 9059966701, "step": 2324, "train_runtime": 92390.2586, "train_tokens_per_second": 98061.926 }, { "epoch": 0.3696343402225755, "grad_norm": 0.1910460889339447, "learning_rate": 3.506047476271693e-05, "loss": 0.4152, "num_input_tokens_seen": 9063831246, "step": 2325, "train_runtime": 92430.5662, "train_tokens_per_second": 98060.973 }, { "epoch": 0.3697933227344992, "grad_norm": 0.1916612982749939, "learning_rate": 3.5049019032117044e-05, "loss": 0.4105, "num_input_tokens_seen": 9067700516, "step": 2326, "train_runtime": 92469.8043, "train_tokens_per_second": 98061.206 }, { "epoch": 0.3699523052464229, "grad_norm": 0.20835267007350922, "learning_rate": 3.50375607843089e-05, "loss": 0.4092, "num_input_tokens_seen": 9071649085, "step": 2327, "train_runtime": 92510.0339, "train_tokens_per_second": 98061.245 }, { "epoch": 0.37011128775834656, "grad_norm": 0.1851896196603775, "learning_rate": 3.5026100022162705e-05, "loss": 0.4249, "num_input_tokens_seen": 9075486904, "step": 2328, "train_runtime": 92551.2463, "train_tokens_per_second": 98059.046 }, { "epoch": 0.37027027027027026, "grad_norm": 0.3939000070095062, "learning_rate": 3.501463674854931e-05, "loss": 0.4166, "num_input_tokens_seen": 9079354034, "step": 2329, "train_runtime": 92592.8838, "train_tokens_per_second": 98056.715 }, { "epoch": 0.37042925278219396, "grad_norm": 0.19862820208072662, "learning_rate": 3.500317096634016e-05, "loss": 0.4213, "num_input_tokens_seen": 9083200294, "step": 2330, "train_runtime": 92631.4386, "train_tokens_per_second": 98057.424 }, { "epoch": 0.37058823529411766, "grad_norm": 0.3240334093570709, "learning_rate": 3.499170267840739e-05, "loss": 0.4086, "num_input_tokens_seen": 9087098389, "step": 2331, "train_runtime": 92668.6207, "train_tokens_per_second": 98060.145 }, { "epoch": 0.37074721780604136, "grad_norm": 0.22361107170581818, "learning_rate": 3.4980231887623693e-05, "loss": 0.4257, "num_input_tokens_seen": 9091005891, "step": 2332, "train_runtime": 92709.2708, "train_tokens_per_second": 98059.297 }, { "epoch": 0.370906200317965, "grad_norm": 0.21915870904922485, "learning_rate": 3.496875859686244e-05, "loss": 0.4304, "num_input_tokens_seen": 9094896416, "step": 2333, "train_runtime": 92749.9066, "train_tokens_per_second": 98058.281 }, { "epoch": 0.3710651828298887, "grad_norm": 0.19192726910114288, "learning_rate": 3.495728280899759e-05, "loss": 0.4092, "num_input_tokens_seen": 9098925658, "step": 2334, "train_runtime": 92791.2857, "train_tokens_per_second": 98057.976 }, { "epoch": 0.3712241653418124, "grad_norm": 0.20224061608314514, "learning_rate": 3.494580452690376e-05, "loss": 0.4092, "num_input_tokens_seen": 9102870127, "step": 2335, "train_runtime": 92830.8924, "train_tokens_per_second": 98058.63 }, { "epoch": 0.3713831478537361, "grad_norm": 0.20867827534675598, "learning_rate": 3.493432375345618e-05, "loss": 0.4367, "num_input_tokens_seen": 9106730010, "step": 2336, "train_runtime": 92869.4893, "train_tokens_per_second": 98059.439 }, { "epoch": 0.37154213036565975, "grad_norm": 0.2012656331062317, "learning_rate": 3.4922840491530705e-05, "loss": 0.4237, "num_input_tokens_seen": 9110538475, "step": 2337, "train_runtime": 92907.0106, "train_tokens_per_second": 98060.829 }, { "epoch": 0.37170111287758345, "grad_norm": 0.1944226324558258, "learning_rate": 3.4911354744003797e-05, "loss": 0.4214, "num_input_tokens_seen": 9114519599, "step": 2338, "train_runtime": 92945.4697, "train_tokens_per_second": 98063.086 }, { "epoch": 0.37186009538950715, "grad_norm": 0.18537628650665283, "learning_rate": 3.489986651375257e-05, "loss": 0.4181, "num_input_tokens_seen": 9118360380, "step": 2339, "train_runtime": 92984.4854, "train_tokens_per_second": 98063.245 }, { "epoch": 0.37201907790143085, "grad_norm": 0.24002103507518768, "learning_rate": 3.4888375803654724e-05, "loss": 0.4151, "num_input_tokens_seen": 9122187217, "step": 2340, "train_runtime": 93023.0018, "train_tokens_per_second": 98063.78 }, { "epoch": 0.37217806041335455, "grad_norm": 0.18638306856155396, "learning_rate": 3.4876882616588624e-05, "loss": 0.4299, "num_input_tokens_seen": 9126146663, "step": 2341, "train_runtime": 93062.2464, "train_tokens_per_second": 98064.973 }, { "epoch": 0.3723370429252782, "grad_norm": 0.2239873856306076, "learning_rate": 3.4865386955433224e-05, "loss": 0.4165, "num_input_tokens_seen": 9130096235, "step": 2342, "train_runtime": 93100.882, "train_tokens_per_second": 98066.7 }, { "epoch": 0.3724960254372019, "grad_norm": 0.20430058240890503, "learning_rate": 3.4853888823068104e-05, "loss": 0.4133, "num_input_tokens_seen": 9133971233, "step": 2343, "train_runtime": 93138.8058, "train_tokens_per_second": 98068.374 }, { "epoch": 0.3726550079491256, "grad_norm": 0.20864737033843994, "learning_rate": 3.4842388222373454e-05, "loss": 0.425, "num_input_tokens_seen": 9137920385, "step": 2344, "train_runtime": 93178.8867, "train_tokens_per_second": 98068.572 }, { "epoch": 0.3728139904610493, "grad_norm": 0.20450252294540405, "learning_rate": 3.483088515623011e-05, "loss": 0.4091, "num_input_tokens_seen": 9141848603, "step": 2345, "train_runtime": 93218.0858, "train_tokens_per_second": 98069.474 }, { "epoch": 0.372972972972973, "grad_norm": 0.18111641705036163, "learning_rate": 3.481937962751951e-05, "loss": 0.415, "num_input_tokens_seen": 9145819073, "step": 2346, "train_runtime": 93257.0278, "train_tokens_per_second": 98071.098 }, { "epoch": 0.37313195548489664, "grad_norm": 0.1938970685005188, "learning_rate": 3.480787163912369e-05, "loss": 0.4116, "num_input_tokens_seen": 9149696270, "step": 2347, "train_runtime": 93297.4464, "train_tokens_per_second": 98070.168 }, { "epoch": 0.37329093799682034, "grad_norm": 0.6034178137779236, "learning_rate": 3.4796361193925345e-05, "loss": 0.4119, "num_input_tokens_seen": 9153643988, "step": 2348, "train_runtime": 93338.3015, "train_tokens_per_second": 98069.537 }, { "epoch": 0.37344992050874404, "grad_norm": 0.1970760077238083, "learning_rate": 3.478484829480773e-05, "loss": 0.4135, "num_input_tokens_seen": 9157565670, "step": 2349, "train_runtime": 93376.9525, "train_tokens_per_second": 98070.942 }, { "epoch": 0.37360890302066774, "grad_norm": 0.1910424828529358, "learning_rate": 3.4773332944654764e-05, "loss": 0.4148, "num_input_tokens_seen": 9161431872, "step": 2350, "train_runtime": 93416.0178, "train_tokens_per_second": 98071.317 }, { "epoch": 0.37376788553259144, "grad_norm": 0.20540539920330048, "learning_rate": 3.476181514635096e-05, "loss": 0.4129, "num_input_tokens_seen": 9165343252, "step": 2351, "train_runtime": 93454.7661, "train_tokens_per_second": 98072.507 }, { "epoch": 0.3739268680445151, "grad_norm": 0.18764878809452057, "learning_rate": 3.475029490278144e-05, "loss": 0.4117, "num_input_tokens_seen": 9169206340, "step": 2352, "train_runtime": 93493.8226, "train_tokens_per_second": 98072.857 }, { "epoch": 0.3740858505564388, "grad_norm": 0.2023249864578247, "learning_rate": 3.473877221683194e-05, "loss": 0.4227, "num_input_tokens_seen": 9173138618, "step": 2353, "train_runtime": 93534.4521, "train_tokens_per_second": 98072.298 }, { "epoch": 0.3742448330683625, "grad_norm": 0.20522239804267883, "learning_rate": 3.472724709138883e-05, "loss": 0.4084, "num_input_tokens_seen": 9176997069, "step": 2354, "train_runtime": 93574.0664, "train_tokens_per_second": 98072.013 }, { "epoch": 0.3744038155802862, "grad_norm": 0.2097887098789215, "learning_rate": 3.471571952933905e-05, "loss": 0.4255, "num_input_tokens_seen": 9180942620, "step": 2355, "train_runtime": 93613.2921, "train_tokens_per_second": 98073.067 }, { "epoch": 0.3745627980922099, "grad_norm": 0.19052718579769135, "learning_rate": 3.47041895335702e-05, "loss": 0.4144, "num_input_tokens_seen": 9184815703, "step": 2356, "train_runtime": 93654.0615, "train_tokens_per_second": 98071.729 }, { "epoch": 0.3747217806041335, "grad_norm": 0.2218666821718216, "learning_rate": 3.4692657106970425e-05, "loss": 0.4082, "num_input_tokens_seen": 9188692386, "step": 2357, "train_runtime": 93691.8911, "train_tokens_per_second": 98073.508 }, { "epoch": 0.3748807631160572, "grad_norm": 0.1840825229883194, "learning_rate": 3.468112225242855e-05, "loss": 0.4191, "num_input_tokens_seen": 9192605496, "step": 2358, "train_runtime": 93730.2856, "train_tokens_per_second": 98075.083 }, { "epoch": 0.3750397456279809, "grad_norm": 0.21409741044044495, "learning_rate": 3.466958497283395e-05, "loss": 0.4415, "num_input_tokens_seen": 9196469288, "step": 2359, "train_runtime": 93768.5446, "train_tokens_per_second": 98076.272 }, { "epoch": 0.3751987281399046, "grad_norm": 0.2358168661594391, "learning_rate": 3.465804527107665e-05, "loss": 0.4211, "num_input_tokens_seen": 9200481747, "step": 2360, "train_runtime": 93807.3709, "train_tokens_per_second": 98078.452 }, { "epoch": 0.3753577106518283, "grad_norm": 0.19366654753684998, "learning_rate": 3.464650315004724e-05, "loss": 0.4303, "num_input_tokens_seen": 9204394109, "step": 2361, "train_runtime": 93845.4218, "train_tokens_per_second": 98080.374 }, { "epoch": 0.37551669316375197, "grad_norm": 0.23432610929012299, "learning_rate": 3.463495861263698e-05, "loss": 0.4249, "num_input_tokens_seen": 9208265673, "step": 2362, "train_runtime": 93884.6687, "train_tokens_per_second": 98080.611 }, { "epoch": 0.37567567567567567, "grad_norm": 0.19024606049060822, "learning_rate": 3.462341166173766e-05, "loss": 0.423, "num_input_tokens_seen": 9212185279, "step": 2363, "train_runtime": 93923.2961, "train_tokens_per_second": 98082.006 }, { "epoch": 0.37583465818759937, "grad_norm": 0.19984984397888184, "learning_rate": 3.461186230024173e-05, "loss": 0.4086, "num_input_tokens_seen": 9216031338, "step": 2364, "train_runtime": 93962.372, "train_tokens_per_second": 98082.149 }, { "epoch": 0.37599364069952307, "grad_norm": 0.30582794547080994, "learning_rate": 3.46003105310422e-05, "loss": 0.4261, "num_input_tokens_seen": 9219991186, "step": 2365, "train_runtime": 94002.9774, "train_tokens_per_second": 98081.906 }, { "epoch": 0.37615262321144677, "grad_norm": 0.21084217727184296, "learning_rate": 3.4588756357032725e-05, "loss": 0.4146, "num_input_tokens_seen": 9223783110, "step": 2366, "train_runtime": 94043.5133, "train_tokens_per_second": 98079.95 }, { "epoch": 0.3763116057233704, "grad_norm": 0.19181102514266968, "learning_rate": 3.457719978110754e-05, "loss": 0.4127, "num_input_tokens_seen": 9227746149, "step": 2367, "train_runtime": 94085.1494, "train_tokens_per_second": 98078.668 }, { "epoch": 0.3764705882352941, "grad_norm": 0.1937091201543808, "learning_rate": 3.456564080616149e-05, "loss": 0.423, "num_input_tokens_seen": 9231577878, "step": 2368, "train_runtime": 94125.1478, "train_tokens_per_second": 98077.699 }, { "epoch": 0.3766295707472178, "grad_norm": 0.20114636421203613, "learning_rate": 3.455407943509002e-05, "loss": 0.414, "num_input_tokens_seen": 9235559082, "step": 2369, "train_runtime": 94162.7845, "train_tokens_per_second": 98080.777 }, { "epoch": 0.3767885532591415, "grad_norm": 0.1939169466495514, "learning_rate": 3.454251567078915e-05, "loss": 0.4089, "num_input_tokens_seen": 9239436620, "step": 2370, "train_runtime": 94202.0592, "train_tokens_per_second": 98081.047 }, { "epoch": 0.37694753577106516, "grad_norm": 0.2720220983028412, "learning_rate": 3.4530949516155545e-05, "loss": 0.4228, "num_input_tokens_seen": 9243421939, "step": 2371, "train_runtime": 94241.0419, "train_tokens_per_second": 98082.765 }, { "epoch": 0.37710651828298886, "grad_norm": 0.22661137580871582, "learning_rate": 3.451938097408644e-05, "loss": 0.4258, "num_input_tokens_seen": 9247329521, "step": 2372, "train_runtime": 94281.4752, "train_tokens_per_second": 98082.147 }, { "epoch": 0.37726550079491256, "grad_norm": 0.20529764890670776, "learning_rate": 3.4507810047479655e-05, "loss": 0.4126, "num_input_tokens_seen": 9251118814, "step": 2373, "train_runtime": 94320.3866, "train_tokens_per_second": 98081.859 }, { "epoch": 0.37742448330683626, "grad_norm": 0.20460395514965057, "learning_rate": 3.449623673923364e-05, "loss": 0.4244, "num_input_tokens_seen": 9254933367, "step": 2374, "train_runtime": 94360.3121, "train_tokens_per_second": 98080.784 }, { "epoch": 0.37758346581875996, "grad_norm": 0.21955162286758423, "learning_rate": 3.448466105224743e-05, "loss": 0.424, "num_input_tokens_seen": 9258916486, "step": 2375, "train_runtime": 94399.4128, "train_tokens_per_second": 98082.353 }, { "epoch": 0.3777424483306836, "grad_norm": 0.21360142529010773, "learning_rate": 3.447308298942065e-05, "loss": 0.4156, "num_input_tokens_seen": 9262933049, "step": 2376, "train_runtime": 94437.82, "train_tokens_per_second": 98084.994 }, { "epoch": 0.3779014308426073, "grad_norm": 0.2255627065896988, "learning_rate": 3.446150255365351e-05, "loss": 0.4216, "num_input_tokens_seen": 9266753908, "step": 2377, "train_runtime": 94473.9395, "train_tokens_per_second": 98087.938 }, { "epoch": 0.378060413354531, "grad_norm": 0.21283890306949615, "learning_rate": 3.444991974784685e-05, "loss": 0.4223, "num_input_tokens_seen": 9270567136, "step": 2378, "train_runtime": 94513.6461, "train_tokens_per_second": 98087.075 }, { "epoch": 0.3782193958664547, "grad_norm": 0.43871235847473145, "learning_rate": 3.443833457490206e-05, "loss": 0.4163, "num_input_tokens_seen": 9274532725, "step": 2379, "train_runtime": 94551.9249, "train_tokens_per_second": 98089.306 }, { "epoch": 0.3783783783783784, "grad_norm": 0.23010653257369995, "learning_rate": 3.442674703772117e-05, "loss": 0.4168, "num_input_tokens_seen": 9278456532, "step": 2380, "train_runtime": 94591.5518, "train_tokens_per_second": 98089.696 }, { "epoch": 0.37853736089030204, "grad_norm": 0.2030169814825058, "learning_rate": 3.441515713920675e-05, "loss": 0.4184, "num_input_tokens_seen": 9282249200, "step": 2381, "train_runtime": 94630.7562, "train_tokens_per_second": 98089.137 }, { "epoch": 0.37869634340222574, "grad_norm": 0.22167839109897614, "learning_rate": 3.4403564882261995e-05, "loss": 0.4074, "num_input_tokens_seen": 9286185866, "step": 2382, "train_runtime": 94668.4397, "train_tokens_per_second": 98091.675 }, { "epoch": 0.37885532591414944, "grad_norm": 0.21005363762378693, "learning_rate": 3.439197026979069e-05, "loss": 0.403, "num_input_tokens_seen": 9290064825, "step": 2383, "train_runtime": 94707.1546, "train_tokens_per_second": 98092.534 }, { "epoch": 0.37901430842607314, "grad_norm": 0.18600724637508392, "learning_rate": 3.43803733046972e-05, "loss": 0.4187, "num_input_tokens_seen": 9294005258, "step": 2384, "train_runtime": 94745.0676, "train_tokens_per_second": 98094.872 }, { "epoch": 0.37917329093799684, "grad_norm": 0.2101978212594986, "learning_rate": 3.436877398988647e-05, "loss": 0.4139, "num_input_tokens_seen": 9297891881, "step": 2385, "train_runtime": 94783.8825, "train_tokens_per_second": 98095.706 }, { "epoch": 0.3793322734499205, "grad_norm": 0.2117864489555359, "learning_rate": 3.4357172328264075e-05, "loss": 0.4154, "num_input_tokens_seen": 9301874310, "step": 2386, "train_runtime": 94820.9641, "train_tokens_per_second": 98099.343 }, { "epoch": 0.3794912559618442, "grad_norm": 0.2170220911502838, "learning_rate": 3.4345568322736125e-05, "loss": 0.4215, "num_input_tokens_seen": 9305828457, "step": 2387, "train_runtime": 94859.577, "train_tokens_per_second": 98101.096 }, { "epoch": 0.3796502384737679, "grad_norm": 0.45221906900405884, "learning_rate": 3.4333961976209355e-05, "loss": 0.4193, "num_input_tokens_seen": 9309592589, "step": 2388, "train_runtime": 94897.5834, "train_tokens_per_second": 98101.472 }, { "epoch": 0.3798092209856916, "grad_norm": 0.21019534766674042, "learning_rate": 3.432235329159106e-05, "loss": 0.4163, "num_input_tokens_seen": 9313500597, "step": 2389, "train_runtime": 94935.1248, "train_tokens_per_second": 98103.843 }, { "epoch": 0.3799682034976153, "grad_norm": 0.2704361379146576, "learning_rate": 3.431074227178914e-05, "loss": 0.4161, "num_input_tokens_seen": 9317415685, "step": 2390, "train_runtime": 94975.141, "train_tokens_per_second": 98103.731 }, { "epoch": 0.38012718600953893, "grad_norm": 0.21137379109859467, "learning_rate": 3.4299128919712076e-05, "loss": 0.4091, "num_input_tokens_seen": 9321209371, "step": 2391, "train_runtime": 95013.3032, "train_tokens_per_second": 98104.256 }, { "epoch": 0.38028616852146263, "grad_norm": 0.2730497717857361, "learning_rate": 3.4287513238268934e-05, "loss": 0.4363, "num_input_tokens_seen": 9325259365, "step": 2392, "train_runtime": 95054.1505, "train_tokens_per_second": 98104.705 }, { "epoch": 0.38044515103338633, "grad_norm": 0.20867857336997986, "learning_rate": 3.427589523036935e-05, "loss": 0.4153, "num_input_tokens_seen": 9329198524, "step": 2393, "train_runtime": 95092.5759, "train_tokens_per_second": 98106.487 }, { "epoch": 0.38060413354531003, "grad_norm": 0.20952975749969482, "learning_rate": 3.426427489892355e-05, "loss": 0.4254, "num_input_tokens_seen": 9333016328, "step": 2394, "train_runtime": 95131.2323, "train_tokens_per_second": 98106.753 }, { "epoch": 0.38076311605723373, "grad_norm": 0.25599852204322815, "learning_rate": 3.425265224684237e-05, "loss": 0.421, "num_input_tokens_seen": 9336980448, "step": 2395, "train_runtime": 95171.5809, "train_tokens_per_second": 98106.813 }, { "epoch": 0.3809220985691574, "grad_norm": 0.22162871062755585, "learning_rate": 3.4241027277037176e-05, "loss": 0.4266, "num_input_tokens_seen": 9340866200, "step": 2396, "train_runtime": 95211.1568, "train_tokens_per_second": 98106.845 }, { "epoch": 0.3810810810810811, "grad_norm": 0.22636117041110992, "learning_rate": 3.4229399992419944e-05, "loss": 0.415, "num_input_tokens_seen": 9344849594, "step": 2397, "train_runtime": 95249.265, "train_tokens_per_second": 98109.414 }, { "epoch": 0.3812400635930048, "grad_norm": 0.20536692440509796, "learning_rate": 3.4217770395903234e-05, "loss": 0.4181, "num_input_tokens_seen": 9348678726, "step": 2398, "train_runtime": 95288.4276, "train_tokens_per_second": 98109.277 }, { "epoch": 0.3813990461049285, "grad_norm": 0.23475590348243713, "learning_rate": 3.420613849040018e-05, "loss": 0.4249, "num_input_tokens_seen": 9352482181, "step": 2399, "train_runtime": 95324.4474, "train_tokens_per_second": 98112.105 }, { "epoch": 0.3815580286168522, "grad_norm": 0.23067468404769897, "learning_rate": 3.4194504278824486e-05, "loss": 0.4281, "num_input_tokens_seen": 9356387740, "step": 2400, "train_runtime": 95364.4543, "train_tokens_per_second": 98111.899 }, { "epoch": 0.3817170111287758, "grad_norm": 0.19066627323627472, "learning_rate": 3.418286776409044e-05, "loss": 0.4142, "num_input_tokens_seen": 9360444461, "step": 2401, "train_runtime": 95513.4033, "train_tokens_per_second": 98001.371 }, { "epoch": 0.3818759936406995, "grad_norm": 0.2271936535835266, "learning_rate": 3.417122894911292e-05, "loss": 0.4161, "num_input_tokens_seen": 9364371396, "step": 2402, "train_runtime": 95552.0171, "train_tokens_per_second": 98002.865 }, { "epoch": 0.3820349761526232, "grad_norm": 0.24159234762191772, "learning_rate": 3.4159587836807344e-05, "loss": 0.417, "num_input_tokens_seen": 9368195039, "step": 2403, "train_runtime": 95590.3378, "train_tokens_per_second": 98003.577 }, { "epoch": 0.3821939586645469, "grad_norm": 0.2651727795600891, "learning_rate": 3.414794443008974e-05, "loss": 0.4326, "num_input_tokens_seen": 9372275372, "step": 2404, "train_runtime": 95630.582, "train_tokens_per_second": 98005.002 }, { "epoch": 0.38235294117647056, "grad_norm": 0.24467667937278748, "learning_rate": 3.4136298731876706e-05, "loss": 0.4094, "num_input_tokens_seen": 9376115555, "step": 2405, "train_runtime": 95670.1583, "train_tokens_per_second": 98004.6 }, { "epoch": 0.38251192368839426, "grad_norm": 0.26397228240966797, "learning_rate": 3.412465074508539e-05, "loss": 0.4168, "num_input_tokens_seen": 9379984664, "step": 2406, "train_runtime": 95709.1756, "train_tokens_per_second": 98005.072 }, { "epoch": 0.38267090620031796, "grad_norm": 0.19321809709072113, "learning_rate": 3.4113000472633546e-05, "loss": 0.4224, "num_input_tokens_seen": 9383964335, "step": 2407, "train_runtime": 95749.4367, "train_tokens_per_second": 98005.426 }, { "epoch": 0.38282988871224166, "grad_norm": 0.22021102905273438, "learning_rate": 3.410134791743947e-05, "loss": 0.4088, "num_input_tokens_seen": 9387977414, "step": 2408, "train_runtime": 95789.0739, "train_tokens_per_second": 98006.767 }, { "epoch": 0.38298887122416536, "grad_norm": 0.2170533686876297, "learning_rate": 3.4089693082422066e-05, "loss": 0.414, "num_input_tokens_seen": 9391820373, "step": 2409, "train_runtime": 95828.8134, "train_tokens_per_second": 98006.226 }, { "epoch": 0.383147853736089, "grad_norm": 0.24467416107654572, "learning_rate": 3.407803597050077e-05, "loss": 0.4231, "num_input_tokens_seen": 9395679423, "step": 2410, "train_runtime": 95867.9229, "train_tokens_per_second": 98006.498 }, { "epoch": 0.3833068362480127, "grad_norm": 0.22061140835285187, "learning_rate": 3.406637658459562e-05, "loss": 0.4228, "num_input_tokens_seen": 9399615527, "step": 2411, "train_runtime": 95907.2207, "train_tokens_per_second": 98007.381 }, { "epoch": 0.3834658187599364, "grad_norm": 0.2216600477695465, "learning_rate": 3.405471492762719e-05, "loss": 0.409, "num_input_tokens_seen": 9403420760, "step": 2412, "train_runtime": 95946.3674, "train_tokens_per_second": 98007.053 }, { "epoch": 0.3836248012718601, "grad_norm": 0.36510536074638367, "learning_rate": 3.404305100251666e-05, "loss": 0.4079, "num_input_tokens_seen": 9407366816, "step": 2413, "train_runtime": 95987.0379, "train_tokens_per_second": 98006.637 }, { "epoch": 0.3837837837837838, "grad_norm": 0.25049883127212524, "learning_rate": 3.4031384812185765e-05, "loss": 0.4088, "num_input_tokens_seen": 9411355217, "step": 2414, "train_runtime": 96025.8447, "train_tokens_per_second": 98008.565 }, { "epoch": 0.38394276629570745, "grad_norm": 0.27959299087524414, "learning_rate": 3.401971635955678e-05, "loss": 0.4192, "num_input_tokens_seen": 9415244842, "step": 2415, "train_runtime": 96062.3984, "train_tokens_per_second": 98011.761 }, { "epoch": 0.38410174880763115, "grad_norm": 0.22031547129154205, "learning_rate": 3.400804564755258e-05, "loss": 0.4203, "num_input_tokens_seen": 9419194559, "step": 2416, "train_runtime": 96102.2815, "train_tokens_per_second": 98012.185 }, { "epoch": 0.38426073131955485, "grad_norm": 0.22851675748825073, "learning_rate": 3.399637267909661e-05, "loss": 0.4082, "num_input_tokens_seen": 9422996884, "step": 2417, "train_runtime": 96142.9376, "train_tokens_per_second": 98010.287 }, { "epoch": 0.38441971383147855, "grad_norm": 0.2015339583158493, "learning_rate": 3.3984697457112855e-05, "loss": 0.4138, "num_input_tokens_seen": 9426931823, "step": 2418, "train_runtime": 96180.5559, "train_tokens_per_second": 98012.865 }, { "epoch": 0.38457869634340225, "grad_norm": 0.18524308502674103, "learning_rate": 3.3973019984525876e-05, "loss": 0.4281, "num_input_tokens_seen": 9430734548, "step": 2419, "train_runtime": 96218.2225, "train_tokens_per_second": 98014.018 }, { "epoch": 0.3847376788553259, "grad_norm": 0.21295174956321716, "learning_rate": 3.3961340264260794e-05, "loss": 0.4139, "num_input_tokens_seen": 9434696265, "step": 2420, "train_runtime": 96258.6371, "train_tokens_per_second": 98014.023 }, { "epoch": 0.3848966613672496, "grad_norm": 0.18269208073616028, "learning_rate": 3.39496582992433e-05, "loss": 0.4054, "num_input_tokens_seen": 9438667281, "step": 2421, "train_runtime": 96298.3783, "train_tokens_per_second": 98014.81 }, { "epoch": 0.3850556438791733, "grad_norm": 0.4279605746269226, "learning_rate": 3.393797409239964e-05, "loss": 0.4132, "num_input_tokens_seen": 9442632877, "step": 2422, "train_runtime": 96337.9344, "train_tokens_per_second": 98015.729 }, { "epoch": 0.385214626391097, "grad_norm": 0.2296474277973175, "learning_rate": 3.3926287646656636e-05, "loss": 0.4013, "num_input_tokens_seen": 9446299438, "step": 2423, "train_runtime": 96376.5418, "train_tokens_per_second": 98014.509 }, { "epoch": 0.3853736089030207, "grad_norm": 0.2408822625875473, "learning_rate": 3.3914598964941645e-05, "loss": 0.4114, "num_input_tokens_seen": 9450304833, "step": 2424, "train_runtime": 96417.1197, "train_tokens_per_second": 98014.801 }, { "epoch": 0.38553259141494434, "grad_norm": 0.22428745031356812, "learning_rate": 3.3902908050182615e-05, "loss": 0.4146, "num_input_tokens_seen": 9454328431, "step": 2425, "train_runtime": 96455.1018, "train_tokens_per_second": 98017.92 }, { "epoch": 0.38569157392686804, "grad_norm": 0.23445408046245575, "learning_rate": 3.3891214905308025e-05, "loss": 0.4219, "num_input_tokens_seen": 9458091234, "step": 2426, "train_runtime": 96491.6148, "train_tokens_per_second": 98019.825 }, { "epoch": 0.38585055643879174, "grad_norm": 0.27343782782554626, "learning_rate": 3.387951953324693e-05, "loss": 0.4157, "num_input_tokens_seen": 9461940964, "step": 2427, "train_runtime": 96532.5454, "train_tokens_per_second": 98018.144 }, { "epoch": 0.38600953895071544, "grad_norm": 0.20960941910743713, "learning_rate": 3.386782193692894e-05, "loss": 0.4161, "num_input_tokens_seen": 9465951440, "step": 2428, "train_runtime": 96569.9676, "train_tokens_per_second": 98021.69 }, { "epoch": 0.38616852146263914, "grad_norm": 0.19838714599609375, "learning_rate": 3.385612211928422e-05, "loss": 0.4163, "num_input_tokens_seen": 9469895543, "step": 2429, "train_runtime": 96610.9855, "train_tokens_per_second": 98020.898 }, { "epoch": 0.3863275039745628, "grad_norm": 0.2329113632440567, "learning_rate": 3.3844420083243476e-05, "loss": 0.4124, "num_input_tokens_seen": 9473781833, "step": 2430, "train_runtime": 96646.975, "train_tokens_per_second": 98024.608 }, { "epoch": 0.3864864864864865, "grad_norm": 0.24209871888160706, "learning_rate": 3.383271583173801e-05, "loss": 0.4259, "num_input_tokens_seen": 9477686050, "step": 2431, "train_runtime": 96685.8638, "train_tokens_per_second": 98025.561 }, { "epoch": 0.3866454689984102, "grad_norm": 0.21588723361492157, "learning_rate": 3.382100936769964e-05, "loss": 0.404, "num_input_tokens_seen": 9481624682, "step": 2432, "train_runtime": 96726.1228, "train_tokens_per_second": 98025.481 }, { "epoch": 0.3868044515103339, "grad_norm": 0.18961462378501892, "learning_rate": 3.3809300694060764e-05, "loss": 0.4121, "num_input_tokens_seen": 9485547527, "step": 2433, "train_runtime": 96763.8502, "train_tokens_per_second": 98027.802 }, { "epoch": 0.3869634340222575, "grad_norm": 0.2084359973669052, "learning_rate": 3.37975898137543e-05, "loss": 0.4193, "num_input_tokens_seen": 9489459976, "step": 2434, "train_runtime": 96803.1106, "train_tokens_per_second": 98028.461 }, { "epoch": 0.3871224165341812, "grad_norm": 0.3834991753101349, "learning_rate": 3.378587672971377e-05, "loss": 0.4069, "num_input_tokens_seen": 9493300838, "step": 2435, "train_runtime": 96842.727, "train_tokens_per_second": 98028.021 }, { "epoch": 0.3872813990461049, "grad_norm": 0.2788194417953491, "learning_rate": 3.3774161444873196e-05, "loss": 0.4042, "num_input_tokens_seen": 9497335532, "step": 2436, "train_runtime": 96881.5407, "train_tokens_per_second": 98030.393 }, { "epoch": 0.3874403815580286, "grad_norm": 0.20342543721199036, "learning_rate": 3.376244396216718e-05, "loss": 0.418, "num_input_tokens_seen": 9501128117, "step": 2437, "train_runtime": 96922.1931, "train_tokens_per_second": 98028.406 }, { "epoch": 0.3875993640699523, "grad_norm": 0.2013140171766281, "learning_rate": 3.375072428453086e-05, "loss": 0.4086, "num_input_tokens_seen": 9505016303, "step": 2438, "train_runtime": 96961.4431, "train_tokens_per_second": 98028.825 }, { "epoch": 0.38775834658187597, "grad_norm": 0.1811753362417221, "learning_rate": 3.373900241489996e-05, "loss": 0.4141, "num_input_tokens_seen": 9508813295, "step": 2439, "train_runtime": 96999.8327, "train_tokens_per_second": 98029.172 }, { "epoch": 0.38791732909379967, "grad_norm": 0.20547403395175934, "learning_rate": 3.37272783562107e-05, "loss": 0.4144, "num_input_tokens_seen": 9512819084, "step": 2440, "train_runtime": 97038.4229, "train_tokens_per_second": 98031.468 }, { "epoch": 0.38807631160572337, "grad_norm": 0.18559904396533966, "learning_rate": 3.3715552111399884e-05, "loss": 0.4127, "num_input_tokens_seen": 9516697718, "step": 2441, "train_runtime": 97076.2259, "train_tokens_per_second": 98033.248 }, { "epoch": 0.38823529411764707, "grad_norm": 0.21208585798740387, "learning_rate": 3.370382368340485e-05, "loss": 0.4333, "num_input_tokens_seen": 9520657797, "step": 2442, "train_runtime": 97115.1838, "train_tokens_per_second": 98034.699 }, { "epoch": 0.38839427662957077, "grad_norm": 0.20697815716266632, "learning_rate": 3.369209307516349e-05, "loss": 0.4165, "num_input_tokens_seen": 9524510399, "step": 2443, "train_runtime": 97156.9611, "train_tokens_per_second": 98032.198 }, { "epoch": 0.3885532591414944, "grad_norm": 0.20004060864448547, "learning_rate": 3.368036028961423e-05, "loss": 0.4188, "num_input_tokens_seen": 9528520058, "step": 2444, "train_runtime": 97195.5695, "train_tokens_per_second": 98034.51 }, { "epoch": 0.3887122416534181, "grad_norm": 0.220608189702034, "learning_rate": 3.3668625329696055e-05, "loss": 0.4144, "num_input_tokens_seen": 9532345893, "step": 2445, "train_runtime": 97233.829, "train_tokens_per_second": 98035.283 }, { "epoch": 0.3888712241653418, "grad_norm": 0.2147708684206009, "learning_rate": 3.36568881983485e-05, "loss": 0.4198, "num_input_tokens_seen": 9536251636, "step": 2446, "train_runtime": 97273.2134, "train_tokens_per_second": 98035.742 }, { "epoch": 0.3890302066772655, "grad_norm": 0.18805374205112457, "learning_rate": 3.364514889851162e-05, "loss": 0.4129, "num_input_tokens_seen": 9540251477, "step": 2447, "train_runtime": 97311.5276, "train_tokens_per_second": 98038.246 }, { "epoch": 0.3891891891891892, "grad_norm": 0.22754883766174316, "learning_rate": 3.363340743312603e-05, "loss": 0.4058, "num_input_tokens_seen": 9544230396, "step": 2448, "train_runtime": 97351.8039, "train_tokens_per_second": 98038.557 }, { "epoch": 0.38934817170111286, "grad_norm": 0.23336420953273773, "learning_rate": 3.362166380513288e-05, "loss": 0.4082, "num_input_tokens_seen": 9548234965, "step": 2449, "train_runtime": 97390.1793, "train_tokens_per_second": 98041.045 }, { "epoch": 0.38950715421303655, "grad_norm": 0.22216713428497314, "learning_rate": 3.360991801747388e-05, "loss": 0.4222, "num_input_tokens_seen": 9552108437, "step": 2450, "train_runtime": 97430.4825, "train_tokens_per_second": 98040.246 }, { "epoch": 0.38966613672496025, "grad_norm": 0.2384008914232254, "learning_rate": 3.3598170073091257e-05, "loss": 0.4213, "num_input_tokens_seen": 9556049533, "step": 2451, "train_runtime": 97468.3939, "train_tokens_per_second": 98042.546 }, { "epoch": 0.38982511923688395, "grad_norm": 0.2136860340833664, "learning_rate": 3.358641997492778e-05, "loss": 0.396, "num_input_tokens_seen": 9559843612, "step": 2452, "train_runtime": 97506.8712, "train_tokens_per_second": 98042.769 }, { "epoch": 0.38998410174880765, "grad_norm": 0.28213024139404297, "learning_rate": 3.357466772592678e-05, "loss": 0.4249, "num_input_tokens_seen": 9563878339, "step": 2453, "train_runtime": 97545.6627, "train_tokens_per_second": 98045.142 }, { "epoch": 0.3901430842607313, "grad_norm": 0.2011684626340866, "learning_rate": 3.3562913329032095e-05, "loss": 0.4239, "num_input_tokens_seen": 9567780265, "step": 2454, "train_runtime": 97583.0732, "train_tokens_per_second": 98047.54 }, { "epoch": 0.390302066772655, "grad_norm": 0.26077762246131897, "learning_rate": 3.355115678718814e-05, "loss": 0.417, "num_input_tokens_seen": 9571765841, "step": 2455, "train_runtime": 97620.5106, "train_tokens_per_second": 98050.766 }, { "epoch": 0.3904610492845787, "grad_norm": 0.22002142667770386, "learning_rate": 3.3539398103339836e-05, "loss": 0.4268, "num_input_tokens_seen": 9575590625, "step": 2456, "train_runtime": 97661.9261, "train_tokens_per_second": 98048.349 }, { "epoch": 0.3906200317965024, "grad_norm": 0.21255165338516235, "learning_rate": 3.352763728043264e-05, "loss": 0.4139, "num_input_tokens_seen": 9579591847, "step": 2457, "train_runtime": 97702.1801, "train_tokens_per_second": 98048.906 }, { "epoch": 0.3907790143084261, "grad_norm": 0.22154691815376282, "learning_rate": 3.3515874321412585e-05, "loss": 0.4176, "num_input_tokens_seen": 9583533101, "step": 2458, "train_runtime": 97741.9389, "train_tokens_per_second": 98049.345 }, { "epoch": 0.39093799682034974, "grad_norm": 0.35474616289138794, "learning_rate": 3.350410922922617e-05, "loss": 0.4079, "num_input_tokens_seen": 9587391761, "step": 2459, "train_runtime": 97783.2065, "train_tokens_per_second": 98047.427 }, { "epoch": 0.39109697933227344, "grad_norm": 0.23797301948070526, "learning_rate": 3.34923420068205e-05, "loss": 0.4133, "num_input_tokens_seen": 9591380869, "step": 2460, "train_runtime": 97823.727, "train_tokens_per_second": 98047.592 }, { "epoch": 0.39125596184419714, "grad_norm": 0.2936416268348694, "learning_rate": 3.348057265714315e-05, "loss": 0.4086, "num_input_tokens_seen": 9595261681, "step": 2461, "train_runtime": 97862.4639, "train_tokens_per_second": 98048.438 }, { "epoch": 0.39141494435612084, "grad_norm": 0.21893690526485443, "learning_rate": 3.346880118314228e-05, "loss": 0.413, "num_input_tokens_seen": 9599084263, "step": 2462, "train_runtime": 97901.259, "train_tokens_per_second": 98048.629 }, { "epoch": 0.39157392686804454, "grad_norm": 0.22085770964622498, "learning_rate": 3.3457027587766566e-05, "loss": 0.4045, "num_input_tokens_seen": 9602962952, "step": 2463, "train_runtime": 97940.9301, "train_tokens_per_second": 98048.517 }, { "epoch": 0.3917329093799682, "grad_norm": 0.22728390991687775, "learning_rate": 3.34452518739652e-05, "loss": 0.4168, "num_input_tokens_seen": 9606924272, "step": 2464, "train_runtime": 97979.9735, "train_tokens_per_second": 98049.876 }, { "epoch": 0.3918918918918919, "grad_norm": 0.2114351987838745, "learning_rate": 3.343347404468791e-05, "loss": 0.4168, "num_input_tokens_seen": 9610871339, "step": 2465, "train_runtime": 98020.0321, "train_tokens_per_second": 98050.073 }, { "epoch": 0.3920508744038156, "grad_norm": 0.20605553686618805, "learning_rate": 3.3421694102884975e-05, "loss": 0.4192, "num_input_tokens_seen": 9614700075, "step": 2466, "train_runtime": 98060.4034, "train_tokens_per_second": 98048.751 }, { "epoch": 0.3922098569157393, "grad_norm": 0.23384161293506622, "learning_rate": 3.340991205150717e-05, "loss": 0.4205, "num_input_tokens_seen": 9618633475, "step": 2467, "train_runtime": 98099.0159, "train_tokens_per_second": 98050.254 }, { "epoch": 0.39236883942766293, "grad_norm": 0.21987251937389374, "learning_rate": 3.3398127893505825e-05, "loss": 0.4233, "num_input_tokens_seen": 9622505876, "step": 2468, "train_runtime": 98138.885, "train_tokens_per_second": 98049.88 }, { "epoch": 0.39252782193958663, "grad_norm": 0.20289938151836395, "learning_rate": 3.33863416318328e-05, "loss": 0.404, "num_input_tokens_seen": 9626499608, "step": 2469, "train_runtime": 98178.6003, "train_tokens_per_second": 98050.895 }, { "epoch": 0.39268680445151033, "grad_norm": 0.18575216829776764, "learning_rate": 3.337455326944044e-05, "loss": 0.412, "num_input_tokens_seen": 9630329437, "step": 2470, "train_runtime": 98219.5875, "train_tokens_per_second": 98048.97 }, { "epoch": 0.39284578696343403, "grad_norm": 0.24991363286972046, "learning_rate": 3.3362762809281666e-05, "loss": 0.4095, "num_input_tokens_seen": 9634197650, "step": 2471, "train_runtime": 98259.9744, "train_tokens_per_second": 98048.037 }, { "epoch": 0.39300476947535773, "grad_norm": 0.2249368280172348, "learning_rate": 3.33509702543099e-05, "loss": 0.4161, "num_input_tokens_seen": 9638182883, "step": 2472, "train_runtime": 98299.5908, "train_tokens_per_second": 98049.064 }, { "epoch": 0.3931637519872814, "grad_norm": 0.1997724175453186, "learning_rate": 3.3339175607479107e-05, "loss": 0.4212, "num_input_tokens_seen": 9642141760, "step": 2473, "train_runtime": 98340.2717, "train_tokens_per_second": 98048.761 }, { "epoch": 0.3933227344992051, "grad_norm": 0.20453692972660065, "learning_rate": 3.3327378871743746e-05, "loss": 0.4145, "num_input_tokens_seen": 9646071158, "step": 2474, "train_runtime": 98380.4476, "train_tokens_per_second": 98048.661 }, { "epoch": 0.3934817170111288, "grad_norm": 0.19370411336421967, "learning_rate": 3.331558005005882e-05, "loss": 0.4219, "num_input_tokens_seen": 9649987303, "step": 2475, "train_runtime": 98418.9681, "train_tokens_per_second": 98050.076 }, { "epoch": 0.3936406995230525, "grad_norm": 0.25569191575050354, "learning_rate": 3.3303779145379857e-05, "loss": 0.4086, "num_input_tokens_seen": 9653819215, "step": 2476, "train_runtime": 98454.6824, "train_tokens_per_second": 98053.429 }, { "epoch": 0.3937996820349762, "grad_norm": 0.1726914644241333, "learning_rate": 3.329197616066289e-05, "loss": 0.417, "num_input_tokens_seen": 9657738658, "step": 2477, "train_runtime": 98493.5433, "train_tokens_per_second": 98054.536 }, { "epoch": 0.3939586645468998, "grad_norm": 0.21518222987651825, "learning_rate": 3.32801710988645e-05, "loss": 0.4209, "num_input_tokens_seen": 9661631761, "step": 2478, "train_runtime": 98533.7846, "train_tokens_per_second": 98054.0 }, { "epoch": 0.3941176470588235, "grad_norm": 0.1810808628797531, "learning_rate": 3.326836396294174e-05, "loss": 0.4115, "num_input_tokens_seen": 9665496841, "step": 2479, "train_runtime": 98575.3321, "train_tokens_per_second": 98051.882 }, { "epoch": 0.3942766295707472, "grad_norm": 0.1918884813785553, "learning_rate": 3.325655475585225e-05, "loss": 0.4185, "num_input_tokens_seen": 9669539171, "step": 2480, "train_runtime": 98616.4739, "train_tokens_per_second": 98051.966 }, { "epoch": 0.3944356120826709, "grad_norm": 0.21375977993011475, "learning_rate": 3.324474348055414e-05, "loss": 0.4165, "num_input_tokens_seen": 9673432776, "step": 2481, "train_runtime": 98655.569, "train_tokens_per_second": 98052.577 }, { "epoch": 0.3945945945945946, "grad_norm": 0.21846555173397064, "learning_rate": 3.3232930140006036e-05, "loss": 0.4233, "num_input_tokens_seen": 9677227568, "step": 2482, "train_runtime": 98693.1988, "train_tokens_per_second": 98053.642 }, { "epoch": 0.39475357710651826, "grad_norm": 0.198723703622818, "learning_rate": 3.322111473716712e-05, "loss": 0.4031, "num_input_tokens_seen": 9681096359, "step": 2483, "train_runtime": 98733.4178, "train_tokens_per_second": 98052.884 }, { "epoch": 0.39491255961844196, "grad_norm": 0.19233229756355286, "learning_rate": 3.320929727499705e-05, "loss": 0.4075, "num_input_tokens_seen": 9685127058, "step": 2484, "train_runtime": 98774.8087, "train_tokens_per_second": 98052.603 }, { "epoch": 0.39507154213036566, "grad_norm": 0.19628074765205383, "learning_rate": 3.319747775645602e-05, "loss": 0.4122, "num_input_tokens_seen": 9688938562, "step": 2485, "train_runtime": 98813.4264, "train_tokens_per_second": 98052.855 }, { "epoch": 0.39523052464228936, "grad_norm": 0.20511579513549805, "learning_rate": 3.3185656184504744e-05, "loss": 0.4191, "num_input_tokens_seen": 9692837136, "step": 2486, "train_runtime": 98851.3398, "train_tokens_per_second": 98054.686 }, { "epoch": 0.39538950715421306, "grad_norm": 0.18535888195037842, "learning_rate": 3.317383256210443e-05, "loss": 0.4157, "num_input_tokens_seen": 9696766837, "step": 2487, "train_runtime": 98890.2215, "train_tokens_per_second": 98055.871 }, { "epoch": 0.3955484896661367, "grad_norm": 0.18053635954856873, "learning_rate": 3.3162006892216824e-05, "loss": 0.4139, "num_input_tokens_seen": 9700734914, "step": 2488, "train_runtime": 98929.2749, "train_tokens_per_second": 98057.273 }, { "epoch": 0.3957074721780604, "grad_norm": 0.22473011910915375, "learning_rate": 3.3150179177804165e-05, "loss": 0.4066, "num_input_tokens_seen": 9704616024, "step": 2489, "train_runtime": 98970.1202, "train_tokens_per_second": 98056.019 }, { "epoch": 0.3958664546899841, "grad_norm": 0.2081012725830078, "learning_rate": 3.313834942182922e-05, "loss": 0.4155, "num_input_tokens_seen": 9708557812, "step": 2490, "train_runtime": 99007.6544, "train_tokens_per_second": 98058.659 }, { "epoch": 0.3960254372019078, "grad_norm": 0.18013536930084229, "learning_rate": 3.312651762725525e-05, "loss": 0.3994, "num_input_tokens_seen": 9712501598, "step": 2491, "train_runtime": 99046.2481, "train_tokens_per_second": 98060.268 }, { "epoch": 0.3961844197138315, "grad_norm": 0.20346325635910034, "learning_rate": 3.311468379704603e-05, "loss": 0.4137, "num_input_tokens_seen": 9716381878, "step": 2492, "train_runtime": 99085.057, "train_tokens_per_second": 98061.021 }, { "epoch": 0.39634340222575515, "grad_norm": 0.20939134061336517, "learning_rate": 3.310284793416587e-05, "loss": 0.4293, "num_input_tokens_seen": 9720261341, "step": 2493, "train_runtime": 99123.9061, "train_tokens_per_second": 98061.726 }, { "epoch": 0.39650238473767885, "grad_norm": 0.23643667995929718, "learning_rate": 3.309101004157955e-05, "loss": 0.4124, "num_input_tokens_seen": 9724112051, "step": 2494, "train_runtime": 99163.7143, "train_tokens_per_second": 98061.192 }, { "epoch": 0.39666136724960255, "grad_norm": 0.21311822533607483, "learning_rate": 3.307917012225239e-05, "loss": 0.4177, "num_input_tokens_seen": 9728097575, "step": 2495, "train_runtime": 99203.6705, "train_tokens_per_second": 98061.871 }, { "epoch": 0.39682034976152625, "grad_norm": 0.20097362995147705, "learning_rate": 3.306732817915022e-05, "loss": 0.427, "num_input_tokens_seen": 9732024073, "step": 2496, "train_runtime": 99241.4045, "train_tokens_per_second": 98064.151 }, { "epoch": 0.39697933227344995, "grad_norm": 0.18445488810539246, "learning_rate": 3.305548421523934e-05, "loss": 0.4165, "num_input_tokens_seen": 9735892008, "step": 2497, "train_runtime": 99280.0567, "train_tokens_per_second": 98064.932 }, { "epoch": 0.3971383147853736, "grad_norm": 0.19305595755577087, "learning_rate": 3.3043638233486584e-05, "loss": 0.4052, "num_input_tokens_seen": 9739837652, "step": 2498, "train_runtime": 99318.2508, "train_tokens_per_second": 98066.947 }, { "epoch": 0.3972972972972973, "grad_norm": 0.1844949871301651, "learning_rate": 3.303179023685929e-05, "loss": 0.4179, "num_input_tokens_seen": 9743746086, "step": 2499, "train_runtime": 99357.1442, "train_tokens_per_second": 98067.896 }, { "epoch": 0.397456279809221, "grad_norm": 0.19939997792243958, "learning_rate": 3.30199402283253e-05, "loss": 0.419, "num_input_tokens_seen": 9747632626, "step": 2500, "train_runtime": 99395.3179, "train_tokens_per_second": 98069.334 }, { "epoch": 0.3976152623211447, "grad_norm": 0.20075254142284393, "learning_rate": 3.300808821085295e-05, "loss": 0.4219, "num_input_tokens_seen": 9751597545, "step": 2501, "train_runtime": 99432.7688, "train_tokens_per_second": 98072.272 }, { "epoch": 0.39777424483306834, "grad_norm": 0.18356235325336456, "learning_rate": 3.299623418741109e-05, "loss": 0.4163, "num_input_tokens_seen": 9755549859, "step": 2502, "train_runtime": 99472.6255, "train_tokens_per_second": 98072.709 }, { "epoch": 0.39793322734499204, "grad_norm": 0.18484768271446228, "learning_rate": 3.298437816096908e-05, "loss": 0.4158, "num_input_tokens_seen": 9759317758, "step": 2503, "train_runtime": 99510.9884, "train_tokens_per_second": 98072.765 }, { "epoch": 0.39809220985691574, "grad_norm": 0.18552739918231964, "learning_rate": 3.297252013449675e-05, "loss": 0.4131, "num_input_tokens_seen": 9763280554, "step": 2504, "train_runtime": 99550.1174, "train_tokens_per_second": 98074.024 }, { "epoch": 0.39825119236883944, "grad_norm": 0.18848690390586853, "learning_rate": 3.296066011096447e-05, "loss": 0.4138, "num_input_tokens_seen": 9767236953, "step": 2505, "train_runtime": 99589.4761, "train_tokens_per_second": 98074.991 }, { "epoch": 0.39841017488076313, "grad_norm": 0.24184857308864594, "learning_rate": 3.2948798093343085e-05, "loss": 0.4168, "num_input_tokens_seen": 9771039714, "step": 2506, "train_runtime": 99630.5427, "train_tokens_per_second": 98072.734 }, { "epoch": 0.3985691573926868, "grad_norm": 0.22846415638923645, "learning_rate": 3.293693408460394e-05, "loss": 0.4291, "num_input_tokens_seen": 9774952834, "step": 2507, "train_runtime": 99671.4501, "train_tokens_per_second": 98071.743 }, { "epoch": 0.3987281399046105, "grad_norm": 0.2291615903377533, "learning_rate": 3.2925068087718895e-05, "loss": 0.4009, "num_input_tokens_seen": 9778908311, "step": 2508, "train_runtime": 99709.2566, "train_tokens_per_second": 98074.227 }, { "epoch": 0.3988871224165342, "grad_norm": 0.2021409571170807, "learning_rate": 3.29132001056603e-05, "loss": 0.4091, "num_input_tokens_seen": 9782706206, "step": 2509, "train_runtime": 99748.4103, "train_tokens_per_second": 98073.806 }, { "epoch": 0.3990461049284579, "grad_norm": 0.46242445707321167, "learning_rate": 3.2901330141400995e-05, "loss": 0.4116, "num_input_tokens_seen": 9786628373, "step": 2510, "train_runtime": 99790.2054, "train_tokens_per_second": 98072.034 }, { "epoch": 0.3992050874403816, "grad_norm": 0.19772323966026306, "learning_rate": 3.288945819791431e-05, "loss": 0.4097, "num_input_tokens_seen": 9790538596, "step": 2511, "train_runtime": 99829.5665, "train_tokens_per_second": 98072.534 }, { "epoch": 0.3993640699523052, "grad_norm": 0.2369210124015808, "learning_rate": 3.287758427817411e-05, "loss": 0.4221, "num_input_tokens_seen": 9794426299, "step": 2512, "train_runtime": 99867.1099, "train_tokens_per_second": 98074.594 }, { "epoch": 0.3995230524642289, "grad_norm": 0.21113549172878265, "learning_rate": 3.286570838515471e-05, "loss": 0.4016, "num_input_tokens_seen": 9798263381, "step": 2513, "train_runtime": 99905.6073, "train_tokens_per_second": 98075.21 }, { "epoch": 0.3996820349761526, "grad_norm": 0.2060297280550003, "learning_rate": 3.285383052183095e-05, "loss": 0.4099, "num_input_tokens_seen": 9802213998, "step": 2514, "train_runtime": 99945.1663, "train_tokens_per_second": 98075.919 }, { "epoch": 0.3998410174880763, "grad_norm": 0.2541334331035614, "learning_rate": 3.284195069117814e-05, "loss": 0.4139, "num_input_tokens_seen": 9806128212, "step": 2515, "train_runtime": 99983.3029, "train_tokens_per_second": 98077.658 }, { "epoch": 0.4, "grad_norm": 0.21800272166728973, "learning_rate": 3.283006889617208e-05, "loss": 0.4151, "num_input_tokens_seen": 9810089727, "step": 2516, "train_runtime": 100022.6645, "train_tokens_per_second": 98078.668 }, { "epoch": 0.40015898251192367, "grad_norm": 0.20376746356487274, "learning_rate": 3.28181851397891e-05, "loss": 0.427, "num_input_tokens_seen": 9813942598, "step": 2517, "train_runtime": 100063.2485, "train_tokens_per_second": 98077.393 }, { "epoch": 0.40031796502384737, "grad_norm": 0.22644329071044922, "learning_rate": 3.280629942500599e-05, "loss": 0.4011, "num_input_tokens_seen": 9817936221, "step": 2518, "train_runtime": 100104.1035, "train_tokens_per_second": 98077.26 }, { "epoch": 0.40047694753577107, "grad_norm": 0.20673085749149323, "learning_rate": 3.279441175480004e-05, "loss": 0.4127, "num_input_tokens_seen": 9821819454, "step": 2519, "train_runtime": 100141.0004, "train_tokens_per_second": 98079.901 }, { "epoch": 0.40063593004769477, "grad_norm": 0.18143802881240845, "learning_rate": 3.278252213214902e-05, "loss": 0.4153, "num_input_tokens_seen": 9825812342, "step": 2520, "train_runtime": 100179.4743, "train_tokens_per_second": 98082.091 }, { "epoch": 0.40079491255961847, "grad_norm": 0.265608012676239, "learning_rate": 3.2770630560031205e-05, "loss": 0.4096, "num_input_tokens_seen": 9829566792, "step": 2521, "train_runtime": 100218.5593, "train_tokens_per_second": 98081.302 }, { "epoch": 0.4009538950715421, "grad_norm": 0.2045113891363144, "learning_rate": 3.2758737041425335e-05, "loss": 0.4235, "num_input_tokens_seen": 9833449092, "step": 2522, "train_runtime": 100258.4006, "train_tokens_per_second": 98081.049 }, { "epoch": 0.4011128775834658, "grad_norm": 0.19428016245365143, "learning_rate": 3.274684157931067e-05, "loss": 0.4058, "num_input_tokens_seen": 9837393225, "step": 2523, "train_runtime": 100299.1541, "train_tokens_per_second": 98080.52 }, { "epoch": 0.4012718600953895, "grad_norm": 0.21199703216552734, "learning_rate": 3.2734944176666927e-05, "loss": 0.4151, "num_input_tokens_seen": 9841203199, "step": 2524, "train_runtime": 100339.5839, "train_tokens_per_second": 98078.972 }, { "epoch": 0.4014308426073132, "grad_norm": 0.1985011100769043, "learning_rate": 3.272304483647433e-05, "loss": 0.4127, "num_input_tokens_seen": 9845104564, "step": 2525, "train_runtime": 100378.0101, "train_tokens_per_second": 98080.292 }, { "epoch": 0.4015898251192369, "grad_norm": 0.237498939037323, "learning_rate": 3.271114356171356e-05, "loss": 0.4253, "num_input_tokens_seen": 9849049446, "step": 2526, "train_runtime": 100417.0252, "train_tokens_per_second": 98081.47 }, { "epoch": 0.40174880763116055, "grad_norm": 0.1921333521604538, "learning_rate": 3.269924035536583e-05, "loss": 0.4027, "num_input_tokens_seen": 9853044977, "step": 2527, "train_runtime": 100456.1562, "train_tokens_per_second": 98083.038 }, { "epoch": 0.40190779014308425, "grad_norm": 0.2077057957649231, "learning_rate": 3.26873352204128e-05, "loss": 0.3987, "num_input_tokens_seen": 9856855598, "step": 2528, "train_runtime": 100494.6633, "train_tokens_per_second": 98083.374 }, { "epoch": 0.40206677265500795, "grad_norm": 0.2005910724401474, "learning_rate": 3.267542815983662e-05, "loss": 0.4097, "num_input_tokens_seen": 9860791488, "step": 2529, "train_runtime": 100534.2188, "train_tokens_per_second": 98083.932 }, { "epoch": 0.40222575516693165, "grad_norm": 0.2630491554737091, "learning_rate": 3.266351917661992e-05, "loss": 0.4114, "num_input_tokens_seen": 9864633524, "step": 2530, "train_runtime": 100573.1408, "train_tokens_per_second": 98084.175 }, { "epoch": 0.40238473767885535, "grad_norm": 0.2019592970609665, "learning_rate": 3.2651608273745815e-05, "loss": 0.4079, "num_input_tokens_seen": 9868442060, "step": 2531, "train_runtime": 100613.0288, "train_tokens_per_second": 98083.143 }, { "epoch": 0.402543720190779, "grad_norm": 0.21124860644340515, "learning_rate": 3.263969545419791e-05, "loss": 0.4113, "num_input_tokens_seen": 9872331916, "step": 2532, "train_runtime": 100652.6214, "train_tokens_per_second": 98083.207 }, { "epoch": 0.4027027027027027, "grad_norm": 0.19786228239536285, "learning_rate": 3.262778072096028e-05, "loss": 0.4168, "num_input_tokens_seen": 9876289607, "step": 2533, "train_runtime": 100693.393, "train_tokens_per_second": 98082.797 }, { "epoch": 0.4028616852146264, "grad_norm": 0.2246251404285431, "learning_rate": 3.261586407701747e-05, "loss": 0.4101, "num_input_tokens_seen": 9880082383, "step": 2534, "train_runtime": 100731.2691, "train_tokens_per_second": 98083.569 }, { "epoch": 0.4030206677265501, "grad_norm": 0.18747708201408386, "learning_rate": 3.260394552535453e-05, "loss": 0.4069, "num_input_tokens_seen": 9883946674, "step": 2535, "train_runtime": 100770.1725, "train_tokens_per_second": 98084.05 }, { "epoch": 0.40317965023847374, "grad_norm": 0.21812006831169128, "learning_rate": 3.259202506895698e-05, "loss": 0.415, "num_input_tokens_seen": 9887816308, "step": 2536, "train_runtime": 100810.3662, "train_tokens_per_second": 98083.329 }, { "epoch": 0.40333863275039744, "grad_norm": 0.19534800946712494, "learning_rate": 3.258010271081078e-05, "loss": 0.4142, "num_input_tokens_seen": 9891787888, "step": 2537, "train_runtime": 100851.4199, "train_tokens_per_second": 98082.783 }, { "epoch": 0.40349761526232114, "grad_norm": 0.21172362565994263, "learning_rate": 3.256817845390242e-05, "loss": 0.4262, "num_input_tokens_seen": 9895701084, "step": 2538, "train_runtime": 100888.414, "train_tokens_per_second": 98085.605 }, { "epoch": 0.40365659777424484, "grad_norm": 0.1920296996831894, "learning_rate": 3.255625230121885e-05, "loss": 0.4162, "num_input_tokens_seen": 9899450569, "step": 2539, "train_runtime": 100928.0355, "train_tokens_per_second": 98084.249 }, { "epoch": 0.40381558028616854, "grad_norm": 0.24537551403045654, "learning_rate": 3.2544324255747455e-05, "loss": 0.4055, "num_input_tokens_seen": 9903410136, "step": 2540, "train_runtime": 100969.2517, "train_tokens_per_second": 98083.426 }, { "epoch": 0.4039745627980922, "grad_norm": 0.20593376457691193, "learning_rate": 3.253239432047615e-05, "loss": 0.4171, "num_input_tokens_seen": 9907329062, "step": 2541, "train_runtime": 101007.9606, "train_tokens_per_second": 98084.636 }, { "epoch": 0.4041335453100159, "grad_norm": 0.25416186451911926, "learning_rate": 3.25204624983933e-05, "loss": 0.4056, "num_input_tokens_seen": 9911259180, "step": 2542, "train_runtime": 101044.7191, "train_tokens_per_second": 98087.849 }, { "epoch": 0.4042925278219396, "grad_norm": 0.2262747436761856, "learning_rate": 3.250852879248774e-05, "loss": 0.4091, "num_input_tokens_seen": 9915101458, "step": 2543, "train_runtime": 101084.3119, "train_tokens_per_second": 98087.441 }, { "epoch": 0.4044515103338633, "grad_norm": 0.28063708543777466, "learning_rate": 3.2496593205748766e-05, "loss": 0.4321, "num_input_tokens_seen": 9918900295, "step": 2544, "train_runtime": 101125.7602, "train_tokens_per_second": 98084.803 }, { "epoch": 0.404610492845787, "grad_norm": 0.21582502126693726, "learning_rate": 3.248465574116617e-05, "loss": 0.4184, "num_input_tokens_seen": 9922928484, "step": 2545, "train_runtime": 101167.0569, "train_tokens_per_second": 98084.582 }, { "epoch": 0.40476947535771063, "grad_norm": 0.2528725266456604, "learning_rate": 3.2472716401730206e-05, "loss": 0.4265, "num_input_tokens_seen": 9926763234, "step": 2546, "train_runtime": 101206.1188, "train_tokens_per_second": 98084.615 }, { "epoch": 0.40492845786963433, "grad_norm": 0.26350435614585876, "learning_rate": 3.246077519043158e-05, "loss": 0.4086, "num_input_tokens_seen": 9930700624, "step": 2547, "train_runtime": 101243.0345, "train_tokens_per_second": 98087.742 }, { "epoch": 0.40508744038155803, "grad_norm": 0.20917484164237976, "learning_rate": 3.244883211026149e-05, "loss": 0.4161, "num_input_tokens_seen": 9934571606, "step": 2548, "train_runtime": 101282.5869, "train_tokens_per_second": 98087.657 }, { "epoch": 0.40524642289348173, "grad_norm": 0.22891555726528168, "learning_rate": 3.243688716421159e-05, "loss": 0.4176, "num_input_tokens_seen": 9938479652, "step": 2549, "train_runtime": 101318.503, "train_tokens_per_second": 98091.458 }, { "epoch": 0.40540540540540543, "grad_norm": 0.2239324301481247, "learning_rate": 3.242494035527402e-05, "loss": 0.4122, "num_input_tokens_seen": 9942265624, "step": 2550, "train_runtime": 101356.0346, "train_tokens_per_second": 98092.488 }, { "epoch": 0.40556438791732907, "grad_norm": 0.2677963376045227, "learning_rate": 3.2412991686441347e-05, "loss": 0.4186, "num_input_tokens_seen": 9946153438, "step": 2551, "train_runtime": 101395.5256, "train_tokens_per_second": 98092.627 }, { "epoch": 0.40572337042925277, "grad_norm": 0.24893149733543396, "learning_rate": 3.240104116070664e-05, "loss": 0.4245, "num_input_tokens_seen": 9950186315, "step": 2552, "train_runtime": 101433.8406, "train_tokens_per_second": 98095.332 }, { "epoch": 0.40588235294117647, "grad_norm": 0.20946013927459717, "learning_rate": 3.2389088781063424e-05, "loss": 0.4293, "num_input_tokens_seen": 9954041291, "step": 2553, "train_runtime": 101469.5587, "train_tokens_per_second": 98098.794 }, { "epoch": 0.40604133545310017, "grad_norm": 0.2183649092912674, "learning_rate": 3.2377134550505684e-05, "loss": 0.4138, "num_input_tokens_seen": 9957784473, "step": 2554, "train_runtime": 101508.8582, "train_tokens_per_second": 98097.69 }, { "epoch": 0.40620031796502387, "grad_norm": 0.20844466984272003, "learning_rate": 3.2365178472027866e-05, "loss": 0.4273, "num_input_tokens_seen": 9961730708, "step": 2555, "train_runtime": 101548.3336, "train_tokens_per_second": 98098.416 }, { "epoch": 0.4063593004769475, "grad_norm": 0.21835799515247345, "learning_rate": 3.235322054862489e-05, "loss": 0.403, "num_input_tokens_seen": 9965700548, "step": 2556, "train_runtime": 101586.8047, "train_tokens_per_second": 98100.345 }, { "epoch": 0.4065182829888712, "grad_norm": 0.22216445207595825, "learning_rate": 3.234126078329212e-05, "loss": 0.4187, "num_input_tokens_seen": 9969625397, "step": 2557, "train_runtime": 101621.7975, "train_tokens_per_second": 98105.186 }, { "epoch": 0.4066772655007949, "grad_norm": 0.21071681380271912, "learning_rate": 3.2329299179025394e-05, "loss": 0.4025, "num_input_tokens_seen": 9973356014, "step": 2558, "train_runtime": 101660.5644, "train_tokens_per_second": 98104.472 }, { "epoch": 0.4068362480127186, "grad_norm": 0.22689858078956604, "learning_rate": 3.231733573882103e-05, "loss": 0.4157, "num_input_tokens_seen": 9977342427, "step": 2559, "train_runtime": 101700.3677, "train_tokens_per_second": 98105.274 }, { "epoch": 0.4069952305246423, "grad_norm": 0.31470826268196106, "learning_rate": 3.230537046567576e-05, "loss": 0.4218, "num_input_tokens_seen": 9981283698, "step": 2560, "train_runtime": 101740.7124, "train_tokens_per_second": 98105.109 }, { "epoch": 0.40715421303656596, "grad_norm": 0.2099905014038086, "learning_rate": 3.229340336258682e-05, "loss": 0.4079, "num_input_tokens_seen": 9985039163, "step": 2561, "train_runtime": 101777.5552, "train_tokens_per_second": 98106.495 }, { "epoch": 0.40731319554848966, "grad_norm": 0.23922966420650482, "learning_rate": 3.2281434432551874e-05, "loss": 0.4086, "num_input_tokens_seen": 9988854892, "step": 2562, "train_runtime": 101816.9705, "train_tokens_per_second": 98105.992 }, { "epoch": 0.40747217806041336, "grad_norm": 0.20860028266906738, "learning_rate": 3.226946367856904e-05, "loss": 0.4174, "num_input_tokens_seen": 9992887991, "step": 2563, "train_runtime": 101857.5507, "train_tokens_per_second": 98106.502 }, { "epoch": 0.40763116057233706, "grad_norm": 0.2258378565311432, "learning_rate": 3.225749110363694e-05, "loss": 0.4123, "num_input_tokens_seen": 9996804377, "step": 2564, "train_runtime": 101897.6451, "train_tokens_per_second": 98106.334 }, { "epoch": 0.40779014308426076, "grad_norm": 0.21992740035057068, "learning_rate": 3.2245516710754594e-05, "loss": 0.4169, "num_input_tokens_seen": 10000643280, "step": 2565, "train_runtime": 101934.5395, "train_tokens_per_second": 98108.485 }, { "epoch": 0.4079491255961844, "grad_norm": 0.2270665168762207, "learning_rate": 3.2233540502921515e-05, "loss": 0.3921, "num_input_tokens_seen": 10004566227, "step": 2566, "train_runtime": 101975.2002, "train_tokens_per_second": 98107.836 }, { "epoch": 0.4081081081081081, "grad_norm": 0.20872490108013153, "learning_rate": 3.222156248313765e-05, "loss": 0.4159, "num_input_tokens_seen": 10008563420, "step": 2567, "train_runtime": 102012.2333, "train_tokens_per_second": 98111.404 }, { "epoch": 0.4082670906200318, "grad_norm": 0.24426229298114777, "learning_rate": 3.2209582654403426e-05, "loss": 0.422, "num_input_tokens_seen": 10012558622, "step": 2568, "train_runtime": 102052.7575, "train_tokens_per_second": 98111.593 }, { "epoch": 0.4084260731319555, "grad_norm": 0.22350312769412994, "learning_rate": 3.219760101971969e-05, "loss": 0.4251, "num_input_tokens_seen": 10016441064, "step": 2569, "train_runtime": 102093.6056, "train_tokens_per_second": 98110.367 }, { "epoch": 0.40858505564387915, "grad_norm": 0.21303074061870575, "learning_rate": 3.218561758208775e-05, "loss": 0.4144, "num_input_tokens_seen": 10020413666, "step": 2570, "train_runtime": 102134.9579, "train_tokens_per_second": 98109.539 }, { "epoch": 0.40874403815580285, "grad_norm": 0.3773512840270996, "learning_rate": 3.217363234450938e-05, "loss": 0.4026, "num_input_tokens_seen": 10024291042, "step": 2571, "train_runtime": 102173.8399, "train_tokens_per_second": 98110.153 }, { "epoch": 0.40890302066772655, "grad_norm": 0.19844451546669006, "learning_rate": 3.21616453099868e-05, "loss": 0.4203, "num_input_tokens_seen": 10028164208, "step": 2572, "train_runtime": 102212.4558, "train_tokens_per_second": 98110.98 }, { "epoch": 0.40906200317965025, "grad_norm": 0.23194368183612823, "learning_rate": 3.214965648152268e-05, "loss": 0.4199, "num_input_tokens_seen": 10031962748, "step": 2573, "train_runtime": 102253.6704, "train_tokens_per_second": 98108.583 }, { "epoch": 0.40922098569157395, "grad_norm": 0.21285267174243927, "learning_rate": 3.2137665862120135e-05, "loss": 0.4093, "num_input_tokens_seen": 10035979776, "step": 2574, "train_runtime": 102293.4121, "train_tokens_per_second": 98109.737 }, { "epoch": 0.4093799682034976, "grad_norm": 0.2603198289871216, "learning_rate": 3.2125673454782726e-05, "loss": 0.4111, "num_input_tokens_seen": 10039799934, "step": 2575, "train_runtime": 102329.6912, "train_tokens_per_second": 98112.286 }, { "epoch": 0.4095389507154213, "grad_norm": 0.24593433737754822, "learning_rate": 3.211367926251448e-05, "loss": 0.4084, "num_input_tokens_seen": 10043659040, "step": 2576, "train_runtime": 102369.9091, "train_tokens_per_second": 98111.439 }, { "epoch": 0.409697933227345, "grad_norm": 0.22102349996566772, "learning_rate": 3.210168328831984e-05, "loss": 0.4101, "num_input_tokens_seen": 10047506796, "step": 2577, "train_runtime": 102409.8091, "train_tokens_per_second": 98110.785 }, { "epoch": 0.4098569157392687, "grad_norm": 0.21941952407360077, "learning_rate": 3.2089685535203715e-05, "loss": 0.4128, "num_input_tokens_seen": 10051457424, "step": 2578, "train_runtime": 102450.4031, "train_tokens_per_second": 98110.472 }, { "epoch": 0.4100158982511924, "grad_norm": 0.225287064909935, "learning_rate": 3.2077686006171456e-05, "loss": 0.4055, "num_input_tokens_seen": 10055366014, "step": 2579, "train_runtime": 102489.1988, "train_tokens_per_second": 98111.471 }, { "epoch": 0.41017488076311603, "grad_norm": 0.23494850099086761, "learning_rate": 3.2065684704228874e-05, "loss": 0.4135, "num_input_tokens_seen": 10059187984, "step": 2580, "train_runtime": 102529.1883, "train_tokens_per_second": 98110.481 }, { "epoch": 0.41033386327503973, "grad_norm": 0.2050384283065796, "learning_rate": 3.2053681632382204e-05, "loss": 0.4209, "num_input_tokens_seen": 10063190909, "step": 2581, "train_runtime": 102567.8845, "train_tokens_per_second": 98112.494 }, { "epoch": 0.41049284578696343, "grad_norm": 0.29640457034111023, "learning_rate": 3.204167679363813e-05, "loss": 0.4206, "num_input_tokens_seen": 10067068584, "step": 2582, "train_runtime": 102607.9327, "train_tokens_per_second": 98111.991 }, { "epoch": 0.41065182829888713, "grad_norm": 0.2211441993713379, "learning_rate": 3.202967019100377e-05, "loss": 0.4028, "num_input_tokens_seen": 10070964813, "step": 2583, "train_runtime": 102647.2555, "train_tokens_per_second": 98112.363 }, { "epoch": 0.41081081081081083, "grad_norm": 0.22371289134025574, "learning_rate": 3.201766182748671e-05, "loss": 0.4043, "num_input_tokens_seen": 10074951761, "step": 2584, "train_runtime": 102689.5726, "train_tokens_per_second": 98110.758 }, { "epoch": 0.4109697933227345, "grad_norm": 0.2102792114019394, "learning_rate": 3.200565170609494e-05, "loss": 0.4141, "num_input_tokens_seen": 10078829124, "step": 2585, "train_runtime": 102729.5657, "train_tokens_per_second": 98110.306 }, { "epoch": 0.4111287758346582, "grad_norm": 0.21441678702831268, "learning_rate": 3.1993639829836936e-05, "loss": 0.4191, "num_input_tokens_seen": 10082687289, "step": 2586, "train_runtime": 102769.4867, "train_tokens_per_second": 98109.737 }, { "epoch": 0.4112877583465819, "grad_norm": 0.2154729962348938, "learning_rate": 3.1981626201721564e-05, "loss": 0.3973, "num_input_tokens_seen": 10086473639, "step": 2587, "train_runtime": 102808.5138, "train_tokens_per_second": 98109.322 }, { "epoch": 0.4114467408585056, "grad_norm": 0.2668992280960083, "learning_rate": 3.196961082475817e-05, "loss": 0.4182, "num_input_tokens_seen": 10090546393, "step": 2588, "train_runtime": 102850.2561, "train_tokens_per_second": 98109.103 }, { "epoch": 0.4116057233704293, "grad_norm": 0.1955244392156601, "learning_rate": 3.1957593701956506e-05, "loss": 0.4166, "num_input_tokens_seen": 10094491512, "step": 2589, "train_runtime": 102889.4527, "train_tokens_per_second": 98110.071 }, { "epoch": 0.4117647058823529, "grad_norm": 0.2342722862958908, "learning_rate": 3.1945574836326786e-05, "loss": 0.404, "num_input_tokens_seen": 10098352133, "step": 2590, "train_runtime": 102929.7909, "train_tokens_per_second": 98109.129 }, { "epoch": 0.4119236883942766, "grad_norm": 0.19292420148849487, "learning_rate": 3.1933554230879656e-05, "loss": 0.4327, "num_input_tokens_seen": 10102222235, "step": 2591, "train_runtime": 102970.3008, "train_tokens_per_second": 98108.116 }, { "epoch": 0.4120826709062003, "grad_norm": 0.2665086090564728, "learning_rate": 3.192153188862618e-05, "loss": 0.4075, "num_input_tokens_seen": 10106211451, "step": 2592, "train_runtime": 103009.5395, "train_tokens_per_second": 98109.471 }, { "epoch": 0.412241653418124, "grad_norm": 0.19911549985408783, "learning_rate": 3.1909507812577886e-05, "loss": 0.4078, "num_input_tokens_seen": 10110012138, "step": 2593, "train_runtime": 103047.2283, "train_tokens_per_second": 98110.471 }, { "epoch": 0.4124006359300477, "grad_norm": 0.21875956654548645, "learning_rate": 3.18974820057467e-05, "loss": 0.413, "num_input_tokens_seen": 10113913501, "step": 2594, "train_runtime": 103085.1881, "train_tokens_per_second": 98112.189 }, { "epoch": 0.41255961844197137, "grad_norm": 0.1967795193195343, "learning_rate": 3.1885454471145007e-05, "loss": 0.429, "num_input_tokens_seen": 10117778647, "step": 2595, "train_runtime": 103125.6064, "train_tokens_per_second": 98111.216 }, { "epoch": 0.41271860095389507, "grad_norm": 0.21594688296318054, "learning_rate": 3.187342521178564e-05, "loss": 0.4172, "num_input_tokens_seen": 10121778889, "step": 2596, "train_runtime": 103165.6606, "train_tokens_per_second": 98111.899 }, { "epoch": 0.41287758346581876, "grad_norm": 0.36125555634498596, "learning_rate": 3.186139423068182e-05, "loss": 0.4237, "num_input_tokens_seen": 10125646289, "step": 2597, "train_runtime": 103203.967, "train_tokens_per_second": 98112.956 }, { "epoch": 0.41303656597774246, "grad_norm": 0.5334799289703369, "learning_rate": 3.1849361530847224e-05, "loss": 0.3996, "num_input_tokens_seen": 10129498544, "step": 2598, "train_runtime": 103245.2143, "train_tokens_per_second": 98111.071 }, { "epoch": 0.4131955484896661, "grad_norm": 0.19434568285942078, "learning_rate": 3.1837327115295965e-05, "loss": 0.4259, "num_input_tokens_seen": 10133397142, "step": 2599, "train_runtime": 103287.1058, "train_tokens_per_second": 98109.024 }, { "epoch": 0.4133545310015898, "grad_norm": 0.18590235710144043, "learning_rate": 3.182529098704259e-05, "loss": 0.4297, "num_input_tokens_seen": 10137359746, "step": 2600, "train_runtime": 103327.4722, "train_tokens_per_second": 98109.046 }, { "epoch": 0.4135135135135135, "grad_norm": 0.20365042984485626, "learning_rate": 3.1813253149102045e-05, "loss": 0.4185, "num_input_tokens_seen": 10141199005, "step": 2601, "train_runtime": 103475.3082, "train_tokens_per_second": 98005.98 }, { "epoch": 0.4136724960254372, "grad_norm": 0.1895475834608078, "learning_rate": 3.180121360448973e-05, "loss": 0.4142, "num_input_tokens_seen": 10145159733, "step": 2602, "train_runtime": 103513.3377, "train_tokens_per_second": 98008.237 }, { "epoch": 0.4138314785373609, "grad_norm": 0.21197935938835144, "learning_rate": 3.178917235622147e-05, "loss": 0.4204, "num_input_tokens_seen": 10149144211, "step": 2603, "train_runtime": 103553.2342, "train_tokens_per_second": 98008.954 }, { "epoch": 0.41399046104928455, "grad_norm": 0.17818698287010193, "learning_rate": 3.17771294073135e-05, "loss": 0.4346, "num_input_tokens_seen": 10152981524, "step": 2604, "train_runtime": 103590.0884, "train_tokens_per_second": 98011.129 }, { "epoch": 0.41414944356120825, "grad_norm": 0.3227137327194214, "learning_rate": 3.176508476078252e-05, "loss": 0.4205, "num_input_tokens_seen": 10156873361, "step": 2605, "train_runtime": 103630.2921, "train_tokens_per_second": 98010.66 }, { "epoch": 0.41430842607313195, "grad_norm": 0.19584763050079346, "learning_rate": 3.17530384196456e-05, "loss": 0.4174, "num_input_tokens_seen": 10160782415, "step": 2606, "train_runtime": 103670.3343, "train_tokens_per_second": 98010.511 }, { "epoch": 0.41446740858505565, "grad_norm": 0.21813926100730896, "learning_rate": 3.174099038692028e-05, "loss": 0.4179, "num_input_tokens_seen": 10164707631, "step": 2607, "train_runtime": 103711.5398, "train_tokens_per_second": 98009.418 }, { "epoch": 0.41462639109697935, "grad_norm": 0.2345413714647293, "learning_rate": 3.1728940665624494e-05, "loss": 0.4116, "num_input_tokens_seen": 10168518046, "step": 2608, "train_runtime": 103749.9739, "train_tokens_per_second": 98009.837 }, { "epoch": 0.414785373608903, "grad_norm": 0.1976460963487625, "learning_rate": 3.1716889258776634e-05, "loss": 0.4077, "num_input_tokens_seen": 10172404954, "step": 2609, "train_runtime": 103790.7877, "train_tokens_per_second": 98008.746 }, { "epoch": 0.4149443561208267, "grad_norm": 0.21303412318229675, "learning_rate": 3.170483616939547e-05, "loss": 0.4206, "num_input_tokens_seen": 10176359118, "step": 2610, "train_runtime": 103830.8307, "train_tokens_per_second": 98009.031 }, { "epoch": 0.4151033386327504, "grad_norm": 0.20344999432563782, "learning_rate": 3.169278140050023e-05, "loss": 0.4103, "num_input_tokens_seen": 10180238788, "step": 2611, "train_runtime": 103868.6981, "train_tokens_per_second": 98010.652 }, { "epoch": 0.4152623211446741, "grad_norm": 0.33819785714149475, "learning_rate": 3.168072495511053e-05, "loss": 0.4081, "num_input_tokens_seen": 10184075467, "step": 2612, "train_runtime": 103907.3093, "train_tokens_per_second": 98011.156 }, { "epoch": 0.4154213036565978, "grad_norm": 0.2197241336107254, "learning_rate": 3.166866683624644e-05, "loss": 0.4206, "num_input_tokens_seen": 10187857069, "step": 2613, "train_runtime": 103949.0071, "train_tokens_per_second": 98008.219 }, { "epoch": 0.41558028616852144, "grad_norm": 0.21819047629833221, "learning_rate": 3.165660704692844e-05, "loss": 0.4074, "num_input_tokens_seen": 10191838554, "step": 2614, "train_runtime": 103988.621, "train_tokens_per_second": 98009.171 }, { "epoch": 0.41573926868044514, "grad_norm": 0.2544807195663452, "learning_rate": 3.164454559017741e-05, "loss": 0.4054, "num_input_tokens_seen": 10195764135, "step": 2615, "train_runtime": 104023.9606, "train_tokens_per_second": 98013.612 }, { "epoch": 0.41589825119236884, "grad_norm": 0.19820518791675568, "learning_rate": 3.163248246901467e-05, "loss": 0.4285, "num_input_tokens_seen": 10199437075, "step": 2616, "train_runtime": 104066.4357, "train_tokens_per_second": 98008.902 }, { "epoch": 0.41605723370429254, "grad_norm": 0.21985305845737457, "learning_rate": 3.162041768646192e-05, "loss": 0.4253, "num_input_tokens_seen": 10203462670, "step": 2617, "train_runtime": 104106.2908, "train_tokens_per_second": 98010.049 }, { "epoch": 0.41621621621621624, "grad_norm": 0.2199181616306305, "learning_rate": 3.160835124554134e-05, "loss": 0.4056, "num_input_tokens_seen": 10207504805, "step": 2618, "train_runtime": 104148.0253, "train_tokens_per_second": 98009.586 }, { "epoch": 0.4163751987281399, "grad_norm": 0.5045865178108215, "learning_rate": 3.159628314927546e-05, "loss": 0.4154, "num_input_tokens_seen": 10211324897, "step": 2619, "train_runtime": 104186.9473, "train_tokens_per_second": 98009.637 }, { "epoch": 0.4165341812400636, "grad_norm": 0.20984528958797455, "learning_rate": 3.1584213400687266e-05, "loss": 0.4096, "num_input_tokens_seen": 10215262369, "step": 2620, "train_runtime": 104227.1833, "train_tokens_per_second": 98009.579 }, { "epoch": 0.4166931637519873, "grad_norm": 0.19564928114414215, "learning_rate": 3.157214200280013e-05, "loss": 0.4148, "num_input_tokens_seen": 10219133415, "step": 2621, "train_runtime": 104266.8846, "train_tokens_per_second": 98009.387 }, { "epoch": 0.416852146263911, "grad_norm": 0.2197643220424652, "learning_rate": 3.1560068958637875e-05, "loss": 0.4282, "num_input_tokens_seen": 10223030899, "step": 2622, "train_runtime": 104305.7737, "train_tokens_per_second": 98010.211 }, { "epoch": 0.4170111287758347, "grad_norm": 0.21473516523838043, "learning_rate": 3.15479942712247e-05, "loss": 0.4013, "num_input_tokens_seen": 10226830410, "step": 2623, "train_runtime": 104344.6687, "train_tokens_per_second": 98010.09 }, { "epoch": 0.4171701112877583, "grad_norm": 0.29432496428489685, "learning_rate": 3.1535917943585234e-05, "loss": 0.4194, "num_input_tokens_seen": 10230740374, "step": 2624, "train_runtime": 104385.1759, "train_tokens_per_second": 98009.514 }, { "epoch": 0.417329093799682, "grad_norm": 0.18814095854759216, "learning_rate": 3.152383997874451e-05, "loss": 0.415, "num_input_tokens_seen": 10234763531, "step": 2625, "train_runtime": 104426.9042, "train_tokens_per_second": 98008.876 }, { "epoch": 0.4174880763116057, "grad_norm": 0.19242803752422333, "learning_rate": 3.151176037972796e-05, "loss": 0.4234, "num_input_tokens_seen": 10238595443, "step": 2626, "train_runtime": 104466.6425, "train_tokens_per_second": 98008.275 }, { "epoch": 0.4176470588235294, "grad_norm": 0.20788364112377167, "learning_rate": 3.1499679149561456e-05, "loss": 0.415, "num_input_tokens_seen": 10242452080, "step": 2627, "train_runtime": 104504.3085, "train_tokens_per_second": 98009.855 }, { "epoch": 0.4178060413354531, "grad_norm": 0.2296641319990158, "learning_rate": 3.1487596291271255e-05, "loss": 0.4051, "num_input_tokens_seen": 10246410546, "step": 2628, "train_runtime": 104543.3099, "train_tokens_per_second": 98011.155 }, { "epoch": 0.41796502384737677, "grad_norm": 0.1960410326719284, "learning_rate": 3.1475511807884023e-05, "loss": 0.4059, "num_input_tokens_seen": 10250355681, "step": 2629, "train_runtime": 104580.1893, "train_tokens_per_second": 98014.316 }, { "epoch": 0.41812400635930047, "grad_norm": 0.21857525408267975, "learning_rate": 3.146342570242684e-05, "loss": 0.4123, "num_input_tokens_seen": 10254157813, "step": 2630, "train_runtime": 104619.6169, "train_tokens_per_second": 98013.72 }, { "epoch": 0.41828298887122417, "grad_norm": 0.23775210976600647, "learning_rate": 3.1451337977927194e-05, "loss": 0.4248, "num_input_tokens_seen": 10257918502, "step": 2631, "train_runtime": 104657.7402, "train_tokens_per_second": 98013.95 }, { "epoch": 0.41844197138314787, "grad_norm": 0.20454244315624237, "learning_rate": 3.1439248637412974e-05, "loss": 0.4062, "num_input_tokens_seen": 10261878941, "step": 2632, "train_runtime": 104696.012, "train_tokens_per_second": 98015.949 }, { "epoch": 0.4186009538950715, "grad_norm": 0.22122742235660553, "learning_rate": 3.142715768391248e-05, "loss": 0.4166, "num_input_tokens_seen": 10265736995, "step": 2633, "train_runtime": 104734.414, "train_tokens_per_second": 98016.847 }, { "epoch": 0.4187599364069952, "grad_norm": 0.3691316843032837, "learning_rate": 3.141506512045439e-05, "loss": 0.4091, "num_input_tokens_seen": 10269633129, "step": 2634, "train_runtime": 104773.3412, "train_tokens_per_second": 98017.616 }, { "epoch": 0.4189189189189189, "grad_norm": 0.29994329810142517, "learning_rate": 3.140297095006782e-05, "loss": 0.4148, "num_input_tokens_seen": 10273639496, "step": 2635, "train_runtime": 104813.977, "train_tokens_per_second": 98017.839 }, { "epoch": 0.4190779014308426, "grad_norm": 0.20819388329982758, "learning_rate": 3.139087517578228e-05, "loss": 0.4085, "num_input_tokens_seen": 10277569168, "step": 2636, "train_runtime": 104853.2227, "train_tokens_per_second": 98018.629 }, { "epoch": 0.4192368839427663, "grad_norm": 0.3849034607410431, "learning_rate": 3.137877780062766e-05, "loss": 0.4087, "num_input_tokens_seen": 10281426312, "step": 2637, "train_runtime": 104891.6406, "train_tokens_per_second": 98019.501 }, { "epoch": 0.41939586645468996, "grad_norm": 0.23016589879989624, "learning_rate": 3.136667882763427e-05, "loss": 0.4172, "num_input_tokens_seen": 10285327879, "step": 2638, "train_runtime": 104932.2253, "train_tokens_per_second": 98018.772 }, { "epoch": 0.41955484896661366, "grad_norm": 0.2011646032333374, "learning_rate": 3.135457825983284e-05, "loss": 0.4165, "num_input_tokens_seen": 10289291046, "step": 2639, "train_runtime": 104972.2659, "train_tokens_per_second": 98019.138 }, { "epoch": 0.41971383147853736, "grad_norm": 0.19268740713596344, "learning_rate": 3.134247610025445e-05, "loss": 0.4165, "num_input_tokens_seen": 10293247267, "step": 2640, "train_runtime": 105007.2932, "train_tokens_per_second": 98024.118 }, { "epoch": 0.41987281399046106, "grad_norm": 0.234602153301239, "learning_rate": 3.133037235193062e-05, "loss": 0.4065, "num_input_tokens_seen": 10296939943, "step": 2641, "train_runtime": 105045.4616, "train_tokens_per_second": 98023.654 }, { "epoch": 0.42003179650238476, "grad_norm": 0.30854856967926025, "learning_rate": 3.1318267017893243e-05, "loss": 0.4232, "num_input_tokens_seen": 10300961351, "step": 2642, "train_runtime": 105085.5694, "train_tokens_per_second": 98024.509 }, { "epoch": 0.4201907790143084, "grad_norm": 0.1893431544303894, "learning_rate": 3.130616010117462e-05, "loss": 0.4209, "num_input_tokens_seen": 10304866345, "step": 2643, "train_runtime": 105126.035, "train_tokens_per_second": 98023.923 }, { "epoch": 0.4203497615262321, "grad_norm": 0.1996915489435196, "learning_rate": 3.129405160480746e-05, "loss": 0.4235, "num_input_tokens_seen": 10308742857, "step": 2644, "train_runtime": 105165.8865, "train_tokens_per_second": 98023.639 }, { "epoch": 0.4205087440381558, "grad_norm": 0.23431091010570526, "learning_rate": 3.128194153182486e-05, "loss": 0.4136, "num_input_tokens_seen": 10312446030, "step": 2645, "train_runtime": 105206.1833, "train_tokens_per_second": 98021.292 }, { "epoch": 0.4206677265500795, "grad_norm": 0.1819189339876175, "learning_rate": 3.1269829885260286e-05, "loss": 0.4131, "num_input_tokens_seen": 10316495075, "step": 2646, "train_runtime": 105246.998, "train_tokens_per_second": 98021.751 }, { "epoch": 0.4208267090620032, "grad_norm": 0.20372982323169708, "learning_rate": 3.125771666814762e-05, "loss": 0.4188, "num_input_tokens_seen": 10320503493, "step": 2647, "train_runtime": 105284.0034, "train_tokens_per_second": 98025.371 }, { "epoch": 0.42098569157392685, "grad_norm": 0.308686226606369, "learning_rate": 3.124560188352116e-05, "loss": 0.416, "num_input_tokens_seen": 10324291118, "step": 2648, "train_runtime": 105322.9356, "train_tokens_per_second": 98025.098 }, { "epoch": 0.42114467408585055, "grad_norm": 0.2067563533782959, "learning_rate": 3.123348553441556e-05, "loss": 0.4073, "num_input_tokens_seen": 10328153311, "step": 2649, "train_runtime": 105362.6308, "train_tokens_per_second": 98024.824 }, { "epoch": 0.42130365659777425, "grad_norm": 0.2051180899143219, "learning_rate": 3.122136762386589e-05, "loss": 0.4171, "num_input_tokens_seen": 10332151817, "step": 2650, "train_runtime": 105403.3505, "train_tokens_per_second": 98024.89 }, { "epoch": 0.42146263910969795, "grad_norm": 0.1941251903772354, "learning_rate": 3.120924815490758e-05, "loss": 0.4072, "num_input_tokens_seen": 10336027185, "step": 2651, "train_runtime": 105441.4428, "train_tokens_per_second": 98026.231 }, { "epoch": 0.42162162162162165, "grad_norm": 0.20549187064170837, "learning_rate": 3.1197127130576485e-05, "loss": 0.4124, "num_input_tokens_seen": 10339879661, "step": 2652, "train_runtime": 105480.6006, "train_tokens_per_second": 98026.363 }, { "epoch": 0.4217806041335453, "grad_norm": 0.19412215054035187, "learning_rate": 3.118500455390884e-05, "loss": 0.3987, "num_input_tokens_seen": 10343708528, "step": 2653, "train_runtime": 105520.6472, "train_tokens_per_second": 98025.446 }, { "epoch": 0.421939586645469, "grad_norm": 0.3903769552707672, "learning_rate": 3.117288042794126e-05, "loss": 0.4116, "num_input_tokens_seen": 10347630781, "step": 2654, "train_runtime": 105558.3463, "train_tokens_per_second": 98027.595 }, { "epoch": 0.4220985691573927, "grad_norm": 0.2548346519470215, "learning_rate": 3.116075475571075e-05, "loss": 0.4203, "num_input_tokens_seen": 10351590593, "step": 2655, "train_runtime": 105597.3333, "train_tokens_per_second": 98028.902 }, { "epoch": 0.4222575516693164, "grad_norm": 0.20534542202949524, "learning_rate": 3.114862754025471e-05, "loss": 0.4162, "num_input_tokens_seen": 10355415569, "step": 2656, "train_runtime": 105636.8775, "train_tokens_per_second": 98028.414 }, { "epoch": 0.4224165341812401, "grad_norm": 0.2216128557920456, "learning_rate": 3.113649878461091e-05, "loss": 0.4163, "num_input_tokens_seen": 10359431754, "step": 2657, "train_runtime": 105676.1288, "train_tokens_per_second": 98030.008 }, { "epoch": 0.42257551669316373, "grad_norm": 0.2854541540145874, "learning_rate": 3.112436849181754e-05, "loss": 0.418, "num_input_tokens_seen": 10363221628, "step": 2658, "train_runtime": 105716.0973, "train_tokens_per_second": 98028.795 }, { "epoch": 0.42273449920508743, "grad_norm": 0.19456402957439423, "learning_rate": 3.1112236664913134e-05, "loss": 0.4155, "num_input_tokens_seen": 10367132126, "step": 2659, "train_runtime": 105755.1427, "train_tokens_per_second": 98029.579 }, { "epoch": 0.42289348171701113, "grad_norm": 0.19547072052955627, "learning_rate": 3.110010330693663e-05, "loss": 0.4156, "num_input_tokens_seen": 10371084819, "step": 2660, "train_runtime": 105794.4523, "train_tokens_per_second": 98030.517 }, { "epoch": 0.42305246422893483, "grad_norm": 0.36728957295417786, "learning_rate": 3.108796842092736e-05, "loss": 0.4041, "num_input_tokens_seen": 10374988052, "step": 2661, "train_runtime": 105833.2611, "train_tokens_per_second": 98031.45 }, { "epoch": 0.42321144674085853, "grad_norm": 0.21422649919986725, "learning_rate": 3.107583200992502e-05, "loss": 0.416, "num_input_tokens_seen": 10378948908, "step": 2662, "train_runtime": 105873.9123, "train_tokens_per_second": 98031.221 }, { "epoch": 0.4233704292527822, "grad_norm": 0.25488677620887756, "learning_rate": 3.1063694076969697e-05, "loss": 0.4039, "num_input_tokens_seen": 10382831650, "step": 2663, "train_runtime": 105912.3221, "train_tokens_per_second": 98032.329 }, { "epoch": 0.4235294117647059, "grad_norm": 0.18803656101226807, "learning_rate": 3.105155462510185e-05, "loss": 0.4031, "num_input_tokens_seen": 10386739515, "step": 2664, "train_runtime": 105954.0059, "train_tokens_per_second": 98030.645 }, { "epoch": 0.4236883942766296, "grad_norm": 0.19020003080368042, "learning_rate": 3.1039413657362337e-05, "loss": 0.4154, "num_input_tokens_seen": 10390538086, "step": 2665, "train_runtime": 105995.0593, "train_tokens_per_second": 98028.513 }, { "epoch": 0.4238473767885533, "grad_norm": 0.23260822892189026, "learning_rate": 3.102727117679238e-05, "loss": 0.4211, "num_input_tokens_seen": 10394335797, "step": 2666, "train_runtime": 106036.1418, "train_tokens_per_second": 98026.349 }, { "epoch": 0.4240063593004769, "grad_norm": 0.3581683337688446, "learning_rate": 3.101512718643358e-05, "loss": 0.4112, "num_input_tokens_seen": 10398330522, "step": 2667, "train_runtime": 106076.9426, "train_tokens_per_second": 98026.303 }, { "epoch": 0.4241653418124006, "grad_norm": 0.2770054042339325, "learning_rate": 3.1002981689327924e-05, "loss": 0.4111, "num_input_tokens_seen": 10402210559, "step": 2668, "train_runtime": 106115.6221, "train_tokens_per_second": 98027.136 }, { "epoch": 0.4243243243243243, "grad_norm": 0.22366100549697876, "learning_rate": 3.099083468851778e-05, "loss": 0.4107, "num_input_tokens_seen": 10406233500, "step": 2669, "train_runtime": 106155.7627, "train_tokens_per_second": 98027.966 }, { "epoch": 0.424483306836248, "grad_norm": 0.2069595754146576, "learning_rate": 3.097868618704587e-05, "loss": 0.4081, "num_input_tokens_seen": 10410045233, "step": 2670, "train_runtime": 106196.4628, "train_tokens_per_second": 98026.29 }, { "epoch": 0.4246422893481717, "grad_norm": 0.20328590273857117, "learning_rate": 3.096653618795533e-05, "loss": 0.4222, "num_input_tokens_seen": 10414031737, "step": 2671, "train_runtime": 106235.8549, "train_tokens_per_second": 98027.467 }, { "epoch": 0.42480127186009536, "grad_norm": 0.2400970309972763, "learning_rate": 3.095438469428963e-05, "loss": 0.4184, "num_input_tokens_seen": 10418001658, "step": 2672, "train_runtime": 106276.1035, "train_tokens_per_second": 98027.697 }, { "epoch": 0.42496025437201906, "grad_norm": 0.2024695873260498, "learning_rate": 3.094223170909264e-05, "loss": 0.406, "num_input_tokens_seen": 10421890626, "step": 2673, "train_runtime": 106314.8928, "train_tokens_per_second": 98028.511 }, { "epoch": 0.42511923688394276, "grad_norm": 0.1920812577009201, "learning_rate": 3.09300772354086e-05, "loss": 0.406, "num_input_tokens_seen": 10425862560, "step": 2674, "train_runtime": 106355.4061, "train_tokens_per_second": 98028.515 }, { "epoch": 0.42527821939586646, "grad_norm": 0.31481996178627014, "learning_rate": 3.0917921276282105e-05, "loss": 0.4173, "num_input_tokens_seen": 10429743973, "step": 2675, "train_runtime": 106394.0403, "train_tokens_per_second": 98029.4 }, { "epoch": 0.42543720190779016, "grad_norm": 0.24375855922698975, "learning_rate": 3.0905763834758156e-05, "loss": 0.4191, "num_input_tokens_seen": 10433535882, "step": 2676, "train_runtime": 106434.4868, "train_tokens_per_second": 98027.775 }, { "epoch": 0.4255961844197138, "grad_norm": 0.21495197713375092, "learning_rate": 3.0893604913882105e-05, "loss": 0.4059, "num_input_tokens_seen": 10437495916, "step": 2677, "train_runtime": 106471.8858, "train_tokens_per_second": 98030.535 }, { "epoch": 0.4257551669316375, "grad_norm": 0.2833768129348755, "learning_rate": 3.088144451669966e-05, "loss": 0.4109, "num_input_tokens_seen": 10441342134, "step": 2678, "train_runtime": 106511.9615, "train_tokens_per_second": 98029.761 }, { "epoch": 0.4259141494435612, "grad_norm": 0.2811621129512787, "learning_rate": 3.086928264625693e-05, "loss": 0.4043, "num_input_tokens_seen": 10445183417, "step": 2679, "train_runtime": 106551.8672, "train_tokens_per_second": 98029.098 }, { "epoch": 0.4260731319554849, "grad_norm": 0.22323891520500183, "learning_rate": 3.0857119305600365e-05, "loss": 0.3984, "num_input_tokens_seen": 10449116836, "step": 2680, "train_runtime": 106588.6205, "train_tokens_per_second": 98032.199 }, { "epoch": 0.4262321144674086, "grad_norm": 0.20909743010997772, "learning_rate": 3.084495449777681e-05, "loss": 0.4125, "num_input_tokens_seen": 10452897040, "step": 2681, "train_runtime": 106625.4063, "train_tokens_per_second": 98033.831 }, { "epoch": 0.42639109697933225, "grad_norm": 0.24017328023910522, "learning_rate": 3.083278822583345e-05, "loss": 0.4171, "num_input_tokens_seen": 10456804209, "step": 2682, "train_runtime": 106664.9125, "train_tokens_per_second": 98034.152 }, { "epoch": 0.42655007949125595, "grad_norm": 0.218078151345253, "learning_rate": 3.0820620492817866e-05, "loss": 0.4117, "num_input_tokens_seen": 10460647719, "step": 2683, "train_runtime": 106703.9393, "train_tokens_per_second": 98034.316 }, { "epoch": 0.42670906200317965, "grad_norm": 0.19928161799907684, "learning_rate": 3.0808451301777965e-05, "loss": 0.3996, "num_input_tokens_seen": 10464659230, "step": 2684, "train_runtime": 106740.4929, "train_tokens_per_second": 98038.326 }, { "epoch": 0.42686804451510335, "grad_norm": 0.212192103266716, "learning_rate": 3.079628065576206e-05, "loss": 0.4141, "num_input_tokens_seen": 10468490310, "step": 2685, "train_runtime": 106779.7052, "train_tokens_per_second": 98038.202 }, { "epoch": 0.42702702702702705, "grad_norm": 0.21909640729427338, "learning_rate": 3.078410855781882e-05, "loss": 0.4149, "num_input_tokens_seen": 10472332830, "step": 2686, "train_runtime": 106818.9244, "train_tokens_per_second": 98038.179 }, { "epoch": 0.4271860095389507, "grad_norm": 0.22476042807102203, "learning_rate": 3.077193501099725e-05, "loss": 0.4089, "num_input_tokens_seen": 10476271211, "step": 2687, "train_runtime": 106857.0933, "train_tokens_per_second": 98040.017 }, { "epoch": 0.4273449920508744, "grad_norm": 0.21248304843902588, "learning_rate": 3.075976001834675e-05, "loss": 0.4143, "num_input_tokens_seen": 10480196282, "step": 2688, "train_runtime": 106894.1569, "train_tokens_per_second": 98042.742 }, { "epoch": 0.4275039745627981, "grad_norm": 0.20350432395935059, "learning_rate": 3.074758358291706e-05, "loss": 0.4044, "num_input_tokens_seen": 10484094557, "step": 2689, "train_runtime": 106930.5488, "train_tokens_per_second": 98045.831 }, { "epoch": 0.4276629570747218, "grad_norm": 0.21795795857906342, "learning_rate": 3.0735405707758306e-05, "loss": 0.4057, "num_input_tokens_seen": 10487976894, "step": 2690, "train_runtime": 106971.2191, "train_tokens_per_second": 98044.848 }, { "epoch": 0.4278219395866455, "grad_norm": 0.22690905630588531, "learning_rate": 3.072322639592095e-05, "loss": 0.4052, "num_input_tokens_seen": 10491806175, "step": 2691, "train_runtime": 107011.2433, "train_tokens_per_second": 98043.961 }, { "epoch": 0.42798092209856914, "grad_norm": 0.20168909430503845, "learning_rate": 3.071104565045582e-05, "loss": 0.4271, "num_input_tokens_seen": 10495782575, "step": 2692, "train_runtime": 107050.735, "train_tokens_per_second": 98044.937 }, { "epoch": 0.42813990461049284, "grad_norm": 0.22626009583473206, "learning_rate": 3.0698863474414105e-05, "loss": 0.4129, "num_input_tokens_seen": 10499618483, "step": 2693, "train_runtime": 107091.1774, "train_tokens_per_second": 98043.73 }, { "epoch": 0.42829888712241654, "grad_norm": 0.22104954719543457, "learning_rate": 3.068667987084737e-05, "loss": 0.405, "num_input_tokens_seen": 10503527538, "step": 2694, "train_runtime": 107131.2448, "train_tokens_per_second": 98043.55 }, { "epoch": 0.42845786963434024, "grad_norm": 0.20378299057483673, "learning_rate": 3.06744948428075e-05, "loss": 0.4239, "num_input_tokens_seen": 10507459003, "step": 2695, "train_runtime": 107172.0659, "train_tokens_per_second": 98042.889 }, { "epoch": 0.42861685214626394, "grad_norm": 0.31989583373069763, "learning_rate": 3.066230839334678e-05, "loss": 0.4019, "num_input_tokens_seen": 10511303837, "step": 2696, "train_runtime": 107211.5027, "train_tokens_per_second": 98042.687 }, { "epoch": 0.4287758346581876, "grad_norm": 0.18905670940876007, "learning_rate": 3.0650120525517815e-05, "loss": 0.4085, "num_input_tokens_seen": 10515252722, "step": 2697, "train_runtime": 107250.5048, "train_tokens_per_second": 98043.853 }, { "epoch": 0.4289348171701113, "grad_norm": 0.22418752312660217, "learning_rate": 3.0637931242373584e-05, "loss": 0.3907, "num_input_tokens_seen": 10519098335, "step": 2698, "train_runtime": 107289.5967, "train_tokens_per_second": 98043.973 }, { "epoch": 0.429093799682035, "grad_norm": 0.20128582417964935, "learning_rate": 3.0625740546967416e-05, "loss": 0.4054, "num_input_tokens_seen": 10523112548, "step": 2699, "train_runtime": 107325.5496, "train_tokens_per_second": 98048.532 }, { "epoch": 0.4292527821939587, "grad_norm": 0.21467110514640808, "learning_rate": 3.0613548442353e-05, "loss": 0.4078, "num_input_tokens_seen": 10526972470, "step": 2700, "train_runtime": 107365.6017, "train_tokens_per_second": 98047.906 }, { "epoch": 0.4294117647058823, "grad_norm": 0.18680451810359955, "learning_rate": 3.060135493158436e-05, "loss": 0.4063, "num_input_tokens_seen": 10530767761, "step": 2701, "train_runtime": 107403.8248, "train_tokens_per_second": 98048.35 }, { "epoch": 0.429570747217806, "grad_norm": 0.24584414064884186, "learning_rate": 3.0589160017715896e-05, "loss": 0.4012, "num_input_tokens_seen": 10534809872, "step": 2702, "train_runtime": 107443.6841, "train_tokens_per_second": 98049.597 }, { "epoch": 0.4297297297297297, "grad_norm": 0.2042957842350006, "learning_rate": 3.057696370380234e-05, "loss": 0.4158, "num_input_tokens_seen": 10538731307, "step": 2703, "train_runtime": 107478.5396, "train_tokens_per_second": 98054.285 }, { "epoch": 0.4298887122416534, "grad_norm": 0.259320467710495, "learning_rate": 3.056476599289879e-05, "loss": 0.4196, "num_input_tokens_seen": 10542537722, "step": 2704, "train_runtime": 107517.5127, "train_tokens_per_second": 98054.144 }, { "epoch": 0.4300476947535771, "grad_norm": 0.33365991711616516, "learning_rate": 3.055256688806067e-05, "loss": 0.4151, "num_input_tokens_seen": 10546404174, "step": 2705, "train_runtime": 107557.0533, "train_tokens_per_second": 98054.045 }, { "epoch": 0.43020667726550077, "grad_norm": 0.22097383439540863, "learning_rate": 3.054036639234379e-05, "loss": 0.4175, "num_input_tokens_seen": 10550333676, "step": 2706, "train_runtime": 107597.9067, "train_tokens_per_second": 98053.336 }, { "epoch": 0.43036565977742447, "grad_norm": 0.19881005585193634, "learning_rate": 3.0528164508804274e-05, "loss": 0.4122, "num_input_tokens_seen": 10554246775, "step": 2707, "train_runtime": 107631.1035, "train_tokens_per_second": 98059.45 }, { "epoch": 0.43052464228934817, "grad_norm": 0.18693089485168457, "learning_rate": 3.0515961240498614e-05, "loss": 0.4183, "num_input_tokens_seen": 10558072064, "step": 2708, "train_runtime": 107671.7615, "train_tokens_per_second": 98057.949 }, { "epoch": 0.43068362480127187, "grad_norm": 0.26763495802879333, "learning_rate": 3.0503756590483646e-05, "loss": 0.3993, "num_input_tokens_seen": 10562051734, "step": 2709, "train_runtime": 107712.3124, "train_tokens_per_second": 98057.98 }, { "epoch": 0.43084260731319557, "grad_norm": 0.22871807217597961, "learning_rate": 3.0491550561816535e-05, "loss": 0.4162, "num_input_tokens_seen": 10566098264, "step": 2710, "train_runtime": 107752.3363, "train_tokens_per_second": 98059.111 }, { "epoch": 0.4310015898251192, "grad_norm": 0.21296916902065277, "learning_rate": 3.0479343157554826e-05, "loss": 0.4139, "num_input_tokens_seen": 10569769131, "step": 2711, "train_runtime": 107792.443, "train_tokens_per_second": 98056.68 }, { "epoch": 0.4311605723370429, "grad_norm": 0.2603754997253418, "learning_rate": 3.046713438075636e-05, "loss": 0.3998, "num_input_tokens_seen": 10573565159, "step": 2712, "train_runtime": 107832.9903, "train_tokens_per_second": 98055.012 }, { "epoch": 0.4313195548489666, "grad_norm": 0.19628842175006866, "learning_rate": 3.0454924234479375e-05, "loss": 0.3968, "num_input_tokens_seen": 10577622127, "step": 2713, "train_runtime": 107874.1438, "train_tokens_per_second": 98055.213 }, { "epoch": 0.4314785373608903, "grad_norm": 0.2781887948513031, "learning_rate": 3.0442712721782414e-05, "loss": 0.4322, "num_input_tokens_seen": 10581486893, "step": 2714, "train_runtime": 107913.3561, "train_tokens_per_second": 98055.396 }, { "epoch": 0.431637519872814, "grad_norm": 0.22416174411773682, "learning_rate": 3.043049984572437e-05, "loss": 0.4131, "num_input_tokens_seen": 10585329172, "step": 2715, "train_runtime": 107951.3929, "train_tokens_per_second": 98056.439 }, { "epoch": 0.43179650238473766, "grad_norm": 0.2428717166185379, "learning_rate": 3.041828560936449e-05, "loss": 0.4066, "num_input_tokens_seen": 10589201093, "step": 2716, "train_runtime": 107989.5403, "train_tokens_per_second": 98057.655 }, { "epoch": 0.43195548489666136, "grad_norm": 0.20800870656967163, "learning_rate": 3.0406070015762356e-05, "loss": 0.4137, "num_input_tokens_seen": 10593175288, "step": 2717, "train_runtime": 108025.7033, "train_tokens_per_second": 98061.618 }, { "epoch": 0.43211446740858506, "grad_norm": 0.20651569962501526, "learning_rate": 3.0393853067977886e-05, "loss": 0.4222, "num_input_tokens_seen": 10597039865, "step": 2718, "train_runtime": 108062.8338, "train_tokens_per_second": 98063.687 }, { "epoch": 0.43227344992050876, "grad_norm": 0.2283860594034195, "learning_rate": 3.0381634769071337e-05, "loss": 0.4161, "num_input_tokens_seen": 10600879508, "step": 2719, "train_runtime": 108101.9122, "train_tokens_per_second": 98063.756 }, { "epoch": 0.43243243243243246, "grad_norm": 0.23900260031223297, "learning_rate": 3.03694151221033e-05, "loss": 0.4213, "num_input_tokens_seen": 10604845382, "step": 2720, "train_runtime": 108140.7139, "train_tokens_per_second": 98065.243 }, { "epoch": 0.4325914149443561, "grad_norm": 0.19760242104530334, "learning_rate": 3.0357194130134718e-05, "loss": 0.4146, "num_input_tokens_seen": 10608817283, "step": 2721, "train_runtime": 108179.5891, "train_tokens_per_second": 98066.718 }, { "epoch": 0.4327503974562798, "grad_norm": 0.21935856342315674, "learning_rate": 3.0344971796226873e-05, "loss": 0.4122, "num_input_tokens_seen": 10612576985, "step": 2722, "train_runtime": 108218.2305, "train_tokens_per_second": 98066.443 }, { "epoch": 0.4329093799682035, "grad_norm": 0.2847015857696533, "learning_rate": 3.033274812344135e-05, "loss": 0.4019, "num_input_tokens_seen": 10616550190, "step": 2723, "train_runtime": 108254.5406, "train_tokens_per_second": 98070.253 }, { "epoch": 0.4330683624801272, "grad_norm": 0.22260303795337677, "learning_rate": 3.032052311484011e-05, "loss": 0.4129, "num_input_tokens_seen": 10620373766, "step": 2724, "train_runtime": 108295.2669, "train_tokens_per_second": 98068.679 }, { "epoch": 0.4332273449920509, "grad_norm": 0.2658635973930359, "learning_rate": 3.0308296773485427e-05, "loss": 0.4044, "num_input_tokens_seen": 10624350939, "step": 2725, "train_runtime": 108334.2345, "train_tokens_per_second": 98070.116 }, { "epoch": 0.43338632750397454, "grad_norm": 0.20924633741378784, "learning_rate": 3.0296069102439915e-05, "loss": 0.4141, "num_input_tokens_seen": 10628202511, "step": 2726, "train_runtime": 108372.1696, "train_tokens_per_second": 98071.327 }, { "epoch": 0.43354531001589824, "grad_norm": 0.2164803147315979, "learning_rate": 3.028384010476652e-05, "loss": 0.4162, "num_input_tokens_seen": 10632185038, "step": 2727, "train_runtime": 108416.2345, "train_tokens_per_second": 98068.201 }, { "epoch": 0.43370429252782194, "grad_norm": 0.22702957689762115, "learning_rate": 3.0271609783528503e-05, "loss": 0.4041, "num_input_tokens_seen": 10636109867, "step": 2728, "train_runtime": 108456.7394, "train_tokens_per_second": 98067.764 }, { "epoch": 0.43386327503974564, "grad_norm": 0.22657622396945953, "learning_rate": 3.0259378141789497e-05, "loss": 0.4163, "num_input_tokens_seen": 10640009857, "step": 2729, "train_runtime": 108495.9968, "train_tokens_per_second": 98068.225 }, { "epoch": 0.43402225755166934, "grad_norm": 0.2193780541419983, "learning_rate": 3.0247145182613422e-05, "loss": 0.4218, "num_input_tokens_seen": 10643717121, "step": 2730, "train_runtime": 108536.5857, "train_tokens_per_second": 98065.708 }, { "epoch": 0.434181240063593, "grad_norm": 0.20244717597961426, "learning_rate": 3.0234910909064563e-05, "loss": 0.399, "num_input_tokens_seen": 10647671761, "step": 2731, "train_runtime": 108577.3453, "train_tokens_per_second": 98065.317 }, { "epoch": 0.4343402225755167, "grad_norm": 0.22834739089012146, "learning_rate": 3.0222675324207518e-05, "loss": 0.4188, "num_input_tokens_seen": 10651608138, "step": 2732, "train_runtime": 108616.5321, "train_tokens_per_second": 98066.178 }, { "epoch": 0.4344992050874404, "grad_norm": 0.19641803205013275, "learning_rate": 3.0210438431107202e-05, "loss": 0.4076, "num_input_tokens_seen": 10655478118, "step": 2733, "train_runtime": 108650.6084, "train_tokens_per_second": 98071.04 }, { "epoch": 0.4346581875993641, "grad_norm": 0.19092626869678497, "learning_rate": 3.019820023282888e-05, "loss": 0.4099, "num_input_tokens_seen": 10659332127, "step": 2734, "train_runtime": 108689.0961, "train_tokens_per_second": 98071.771 }, { "epoch": 0.43481717011128773, "grad_norm": 0.27215343713760376, "learning_rate": 3.0185960732438133e-05, "loss": 0.401, "num_input_tokens_seen": 10663156482, "step": 2735, "train_runtime": 108729.7345, "train_tokens_per_second": 98070.289 }, { "epoch": 0.43497615262321143, "grad_norm": 0.24805289506912231, "learning_rate": 3.017371993300087e-05, "loss": 0.4128, "num_input_tokens_seen": 10667141468, "step": 2736, "train_runtime": 108768.0392, "train_tokens_per_second": 98072.389 }, { "epoch": 0.43513513513513513, "grad_norm": 0.26943084597587585, "learning_rate": 3.016147783758332e-05, "loss": 0.415, "num_input_tokens_seen": 10671057388, "step": 2737, "train_runtime": 108809.9644, "train_tokens_per_second": 98070.59 }, { "epoch": 0.43529411764705883, "grad_norm": 0.20970726013183594, "learning_rate": 3.014923444925204e-05, "loss": 0.4166, "num_input_tokens_seen": 10674855719, "step": 2738, "train_runtime": 108849.9657, "train_tokens_per_second": 98069.445 }, { "epoch": 0.43545310015898253, "grad_norm": 0.24577221274375916, "learning_rate": 3.013698977107392e-05, "loss": 0.4106, "num_input_tokens_seen": 10678822005, "step": 2739, "train_runtime": 108889.2988, "train_tokens_per_second": 98070.445 }, { "epoch": 0.4356120826709062, "grad_norm": 0.1966293603181839, "learning_rate": 3.012474380611616e-05, "loss": 0.4206, "num_input_tokens_seen": 10682809122, "step": 2740, "train_runtime": 108929.0474, "train_tokens_per_second": 98071.262 }, { "epoch": 0.4357710651828299, "grad_norm": 0.20024918019771576, "learning_rate": 3.01124965574463e-05, "loss": 0.4164, "num_input_tokens_seen": 10686581108, "step": 2741, "train_runtime": 108967.8309, "train_tokens_per_second": 98070.972 }, { "epoch": 0.4359300476947536, "grad_norm": 0.19683581590652466, "learning_rate": 3.010024802813217e-05, "loss": 0.4024, "num_input_tokens_seen": 10690501796, "step": 2742, "train_runtime": 109007.8956, "train_tokens_per_second": 98070.894 }, { "epoch": 0.4360890302066773, "grad_norm": 0.3136347830295563, "learning_rate": 3.008799822124195e-05, "loss": 0.4017, "num_input_tokens_seen": 10694514486, "step": 2743, "train_runtime": 109048.1849, "train_tokens_per_second": 98071.458 }, { "epoch": 0.436248012718601, "grad_norm": 0.2045731544494629, "learning_rate": 3.0075747139844136e-05, "loss": 0.4092, "num_input_tokens_seen": 10698381215, "step": 2744, "train_runtime": 109085.6501, "train_tokens_per_second": 98073.222 }, { "epoch": 0.4364069952305246, "grad_norm": 0.2123294621706009, "learning_rate": 3.006349478700753e-05, "loss": 0.4147, "num_input_tokens_seen": 10702336550, "step": 2745, "train_runtime": 109123.7248, "train_tokens_per_second": 98075.25 }, { "epoch": 0.4365659777424483, "grad_norm": 0.31150907278060913, "learning_rate": 3.005124116580125e-05, "loss": 0.4084, "num_input_tokens_seen": 10706266173, "step": 2746, "train_runtime": 109163.5929, "train_tokens_per_second": 98075.429 }, { "epoch": 0.436724960254372, "grad_norm": 0.20445282757282257, "learning_rate": 3.0038986279294755e-05, "loss": 0.4201, "num_input_tokens_seen": 10710231235, "step": 2747, "train_runtime": 109202.8023, "train_tokens_per_second": 98076.524 }, { "epoch": 0.4368839427662957, "grad_norm": 0.1990327388048172, "learning_rate": 3.0026730130557807e-05, "loss": 0.4107, "num_input_tokens_seen": 10714111469, "step": 2748, "train_runtime": 109241.5558, "train_tokens_per_second": 98077.251 }, { "epoch": 0.4370429252782194, "grad_norm": 0.48007136583328247, "learning_rate": 3.001447272266048e-05, "loss": 0.4213, "num_input_tokens_seen": 10718037542, "step": 2749, "train_runtime": 109280.9928, "train_tokens_per_second": 98077.783 }, { "epoch": 0.43720190779014306, "grad_norm": 0.2235858142375946, "learning_rate": 3.0002214058673173e-05, "loss": 0.4121, "num_input_tokens_seen": 10721901178, "step": 2750, "train_runtime": 109322.39, "train_tokens_per_second": 98075.986 }, { "epoch": 0.43736089030206676, "grad_norm": 0.2286907434463501, "learning_rate": 2.99899541416666e-05, "loss": 0.4116, "num_input_tokens_seen": 10725819132, "step": 2751, "train_runtime": 109360.8363, "train_tokens_per_second": 98077.333 }, { "epoch": 0.43751987281399046, "grad_norm": 0.20011430978775024, "learning_rate": 2.9977692974711764e-05, "loss": 0.4046, "num_input_tokens_seen": 10729592758, "step": 2752, "train_runtime": 109398.6291, "train_tokens_per_second": 98077.945 }, { "epoch": 0.43767885532591416, "grad_norm": 0.19483447074890137, "learning_rate": 2.9965430560880008e-05, "loss": 0.4176, "num_input_tokens_seen": 10733463527, "step": 2753, "train_runtime": 109434.736, "train_tokens_per_second": 98080.956 }, { "epoch": 0.43783783783783786, "grad_norm": 0.20482830703258514, "learning_rate": 2.9953166903242992e-05, "loss": 0.4172, "num_input_tokens_seen": 10737431027, "step": 2754, "train_runtime": 109474.0904, "train_tokens_per_second": 98081.939 }, { "epoch": 0.4379968203497615, "grad_norm": 0.21949587762355804, "learning_rate": 2.9940902004872668e-05, "loss": 0.4066, "num_input_tokens_seen": 10741346910, "step": 2755, "train_runtime": 109513.7455, "train_tokens_per_second": 98082.18 }, { "epoch": 0.4381558028616852, "grad_norm": 0.21611915528774261, "learning_rate": 2.9928635868841297e-05, "loss": 0.4225, "num_input_tokens_seen": 10745240547, "step": 2756, "train_runtime": 109552.2163, "train_tokens_per_second": 98083.279 }, { "epoch": 0.4383147853736089, "grad_norm": 0.21235260367393494, "learning_rate": 2.9916368498221476e-05, "loss": 0.4006, "num_input_tokens_seen": 10749126893, "step": 2757, "train_runtime": 109591.9252, "train_tokens_per_second": 98083.202 }, { "epoch": 0.4384737678855326, "grad_norm": 0.2102307230234146, "learning_rate": 2.9904099896086087e-05, "loss": 0.4213, "num_input_tokens_seen": 10752969011, "step": 2758, "train_runtime": 109631.2298, "train_tokens_per_second": 98083.083 }, { "epoch": 0.4386327503974563, "grad_norm": 0.2189190685749054, "learning_rate": 2.989183006550833e-05, "loss": 0.4098, "num_input_tokens_seen": 10756848562, "step": 2759, "train_runtime": 109671.4631, "train_tokens_per_second": 98082.475 }, { "epoch": 0.43879173290937995, "grad_norm": 0.1915406882762909, "learning_rate": 2.9879559009561703e-05, "loss": 0.3918, "num_input_tokens_seen": 10760777250, "step": 2760, "train_runtime": 109711.3433, "train_tokens_per_second": 98082.631 }, { "epoch": 0.43895071542130365, "grad_norm": 1.3834508657455444, "learning_rate": 2.9867286731320027e-05, "loss": 0.4026, "num_input_tokens_seen": 10764787393, "step": 2761, "train_runtime": 109750.2269, "train_tokens_per_second": 98084.42 }, { "epoch": 0.43910969793322735, "grad_norm": 0.24999625980854034, "learning_rate": 2.9855013233857415e-05, "loss": 0.4114, "num_input_tokens_seen": 10768753386, "step": 2762, "train_runtime": 109790.2736, "train_tokens_per_second": 98084.767 }, { "epoch": 0.43926868044515105, "grad_norm": 0.2742011845111847, "learning_rate": 2.9842738520248302e-05, "loss": 0.4179, "num_input_tokens_seen": 10772676706, "step": 2763, "train_runtime": 109830.4764, "train_tokens_per_second": 98084.585 }, { "epoch": 0.43942766295707475, "grad_norm": 0.37133121490478516, "learning_rate": 2.983046259356741e-05, "loss": 0.4093, "num_input_tokens_seen": 10776478328, "step": 2764, "train_runtime": 109870.2144, "train_tokens_per_second": 98083.711 }, { "epoch": 0.4395866454689984, "grad_norm": 0.27469033002853394, "learning_rate": 2.981818545688977e-05, "loss": 0.4167, "num_input_tokens_seen": 10780413954, "step": 2765, "train_runtime": 109911.8305, "train_tokens_per_second": 98082.38 }, { "epoch": 0.4397456279809221, "grad_norm": 0.23427878320217133, "learning_rate": 2.9805907113290722e-05, "loss": 0.4117, "num_input_tokens_seen": 10784353396, "step": 2766, "train_runtime": 109950.2298, "train_tokens_per_second": 98083.955 }, { "epoch": 0.4399046104928458, "grad_norm": 0.8412659168243408, "learning_rate": 2.9793627565845892e-05, "loss": 0.4105, "num_input_tokens_seen": 10788258796, "step": 2767, "train_runtime": 109990.286, "train_tokens_per_second": 98083.742 }, { "epoch": 0.4400635930047695, "grad_norm": 0.25372347235679626, "learning_rate": 2.9781346817631246e-05, "loss": 0.4146, "num_input_tokens_seen": 10792059692, "step": 2768, "train_runtime": 110027.1104, "train_tokens_per_second": 98085.46 }, { "epoch": 0.44022257551669314, "grad_norm": 0.23759204149246216, "learning_rate": 2.9769064871722996e-05, "loss": 0.4103, "num_input_tokens_seen": 10796102399, "step": 2769, "train_runtime": 110066.7068, "train_tokens_per_second": 98086.903 }, { "epoch": 0.44038155802861684, "grad_norm": 0.21031047403812408, "learning_rate": 2.9756781731197703e-05, "loss": 0.4085, "num_input_tokens_seen": 10800031430, "step": 2770, "train_runtime": 110105.777, "train_tokens_per_second": 98087.782 }, { "epoch": 0.44054054054054054, "grad_norm": 0.21229395270347595, "learning_rate": 2.9744497399132192e-05, "loss": 0.4251, "num_input_tokens_seen": 10804060723, "step": 2771, "train_runtime": 110141.4918, "train_tokens_per_second": 98092.558 }, { "epoch": 0.44069952305246424, "grad_norm": 0.38728946447372437, "learning_rate": 2.9732211878603623e-05, "loss": 0.4122, "num_input_tokens_seen": 10807831945, "step": 2772, "train_runtime": 110180.6075, "train_tokens_per_second": 98091.962 }, { "epoch": 0.44085850556438794, "grad_norm": 0.32769542932510376, "learning_rate": 2.97199251726894e-05, "loss": 0.4029, "num_input_tokens_seen": 10811856901, "step": 2773, "train_runtime": 110220.0428, "train_tokens_per_second": 98093.383 }, { "epoch": 0.4410174880763116, "grad_norm": 0.19290339946746826, "learning_rate": 2.970763728446729e-05, "loss": 0.416, "num_input_tokens_seen": 10815839745, "step": 2774, "train_runtime": 110255.9478, "train_tokens_per_second": 98097.563 }, { "epoch": 0.4411764705882353, "grad_norm": 0.20026156306266785, "learning_rate": 2.9695348217015294e-05, "loss": 0.4275, "num_input_tokens_seen": 10819692302, "step": 2775, "train_runtime": 110295.545, "train_tokens_per_second": 98097.274 }, { "epoch": 0.441335453100159, "grad_norm": 0.2153264582157135, "learning_rate": 2.9683057973411753e-05, "loss": 0.4075, "num_input_tokens_seen": 10823557414, "step": 2776, "train_runtime": 110333.636, "train_tokens_per_second": 98098.439 }, { "epoch": 0.4414944356120827, "grad_norm": 0.22539378702640533, "learning_rate": 2.9670766556735286e-05, "loss": 0.4053, "num_input_tokens_seen": 10827597636, "step": 2777, "train_runtime": 110374.1346, "train_tokens_per_second": 98099.049 }, { "epoch": 0.4416534181240064, "grad_norm": 0.219930961728096, "learning_rate": 2.965847397006479e-05, "loss": 0.4068, "num_input_tokens_seen": 10831520697, "step": 2778, "train_runtime": 110415.3808, "train_tokens_per_second": 98097.934 }, { "epoch": 0.44181240063593, "grad_norm": 0.18951956927776337, "learning_rate": 2.9646180216479496e-05, "loss": 0.4025, "num_input_tokens_seen": 10835414860, "step": 2779, "train_runtime": 110454.7767, "train_tokens_per_second": 98098.201 }, { "epoch": 0.4419713831478537, "grad_norm": 0.2109401822090149, "learning_rate": 2.9633885299058883e-05, "loss": 0.4078, "num_input_tokens_seen": 10839304458, "step": 2780, "train_runtime": 110492.5555, "train_tokens_per_second": 98099.862 }, { "epoch": 0.4421303656597774, "grad_norm": 0.19782626628875732, "learning_rate": 2.962158922088275e-05, "loss": 0.4096, "num_input_tokens_seen": 10843235219, "step": 2781, "train_runtime": 110532.6255, "train_tokens_per_second": 98099.861 }, { "epoch": 0.4422893481717011, "grad_norm": 0.2240896075963974, "learning_rate": 2.960929198503118e-05, "loss": 0.4196, "num_input_tokens_seen": 10847122355, "step": 2782, "train_runtime": 110572.9051, "train_tokens_per_second": 98099.28 }, { "epoch": 0.4424483306836248, "grad_norm": 1.1863601207733154, "learning_rate": 2.9596993594584527e-05, "loss": 0.4084, "num_input_tokens_seen": 10851076609, "step": 2783, "train_runtime": 110612.3827, "train_tokens_per_second": 98100.017 }, { "epoch": 0.44260731319554847, "grad_norm": 0.19720686972141266, "learning_rate": 2.9584694052623462e-05, "loss": 0.4108, "num_input_tokens_seen": 10854952183, "step": 2784, "train_runtime": 110651.5777, "train_tokens_per_second": 98100.293 }, { "epoch": 0.44276629570747217, "grad_norm": 0.19230790436267853, "learning_rate": 2.957239336222894e-05, "loss": 0.414, "num_input_tokens_seen": 10858923790, "step": 2785, "train_runtime": 110691.3507, "train_tokens_per_second": 98100.924 }, { "epoch": 0.44292527821939587, "grad_norm": 0.21252380311489105, "learning_rate": 2.9560091526482192e-05, "loss": 0.4157, "num_input_tokens_seen": 10862819140, "step": 2786, "train_runtime": 110730.0386, "train_tokens_per_second": 98101.827 }, { "epoch": 0.44308426073131957, "grad_norm": 0.21994291245937347, "learning_rate": 2.954778854846474e-05, "loss": 0.4216, "num_input_tokens_seen": 10866716088, "step": 2787, "train_runtime": 110768.6035, "train_tokens_per_second": 98102.854 }, { "epoch": 0.44324324324324327, "grad_norm": 0.2648133337497711, "learning_rate": 2.9535484431258387e-05, "loss": 0.4194, "num_input_tokens_seen": 10870465631, "step": 2788, "train_runtime": 110808.2056, "train_tokens_per_second": 98101.63 }, { "epoch": 0.4434022257551669, "grad_norm": 0.38647645711898804, "learning_rate": 2.9523179177945242e-05, "loss": 0.4129, "num_input_tokens_seen": 10874377281, "step": 2789, "train_runtime": 110846.3685, "train_tokens_per_second": 98103.144 }, { "epoch": 0.4435612082670906, "grad_norm": 0.23771226406097412, "learning_rate": 2.9510872791607667e-05, "loss": 0.4117, "num_input_tokens_seen": 10878234702, "step": 2790, "train_runtime": 110884.0213, "train_tokens_per_second": 98104.619 }, { "epoch": 0.4437201907790143, "grad_norm": 0.17840225994586945, "learning_rate": 2.9498565275328344e-05, "loss": 0.4047, "num_input_tokens_seen": 10882201538, "step": 2791, "train_runtime": 110922.7797, "train_tokens_per_second": 98106.102 }, { "epoch": 0.443879173290938, "grad_norm": 0.19857893884181976, "learning_rate": 2.9486256632190195e-05, "loss": 0.4006, "num_input_tokens_seen": 10886076501, "step": 2792, "train_runtime": 110961.1817, "train_tokens_per_second": 98107.071 }, { "epoch": 0.4440381558028617, "grad_norm": 0.18822474777698517, "learning_rate": 2.947394686527647e-05, "loss": 0.4021, "num_input_tokens_seen": 10889988849, "step": 2793, "train_runtime": 110999.329, "train_tokens_per_second": 98108.601 }, { "epoch": 0.44419713831478536, "grad_norm": 0.20953288674354553, "learning_rate": 2.946163597767066e-05, "loss": 0.4195, "num_input_tokens_seen": 10893902096, "step": 2794, "train_runtime": 111037.6069, "train_tokens_per_second": 98110.022 }, { "epoch": 0.44435612082670906, "grad_norm": 0.19084608554840088, "learning_rate": 2.9449323972456577e-05, "loss": 0.4062, "num_input_tokens_seen": 10897867774, "step": 2795, "train_runtime": 111076.4646, "train_tokens_per_second": 98111.403 }, { "epoch": 0.44451510333863276, "grad_norm": 0.18603944778442383, "learning_rate": 2.9437010852718273e-05, "loss": 0.4231, "num_input_tokens_seen": 10901762856, "step": 2796, "train_runtime": 111116.4773, "train_tokens_per_second": 98111.127 }, { "epoch": 0.44467408585055646, "grad_norm": 0.21649736166000366, "learning_rate": 2.9424696621540104e-05, "loss": 0.4066, "num_input_tokens_seen": 10905581938, "step": 2797, "train_runtime": 111156.086, "train_tokens_per_second": 98110.525 }, { "epoch": 0.4448330683624801, "grad_norm": 0.1969357430934906, "learning_rate": 2.9412381282006702e-05, "loss": 0.4041, "num_input_tokens_seen": 10909597707, "step": 2798, "train_runtime": 111193.1666, "train_tokens_per_second": 98113.922 }, { "epoch": 0.4449920508744038, "grad_norm": 0.21710024774074554, "learning_rate": 2.940006483720297e-05, "loss": 0.4188, "num_input_tokens_seen": 10913454317, "step": 2799, "train_runtime": 111232.1529, "train_tokens_per_second": 98114.206 }, { "epoch": 0.4451510333863275, "grad_norm": 0.18744684755802155, "learning_rate": 2.938774729021408e-05, "loss": 0.4089, "num_input_tokens_seen": 10917335310, "step": 2800, "train_runtime": 111271.8352, "train_tokens_per_second": 98114.094 }, { "epoch": 0.4453100158982512, "grad_norm": 0.18486294150352478, "learning_rate": 2.9375428644125503e-05, "loss": 0.4123, "num_input_tokens_seen": 10921279113, "step": 2801, "train_runtime": 111430.0276, "train_tokens_per_second": 98010.198 }, { "epoch": 0.4454689984101749, "grad_norm": 0.22746725380420685, "learning_rate": 2.9363108902022962e-05, "loss": 0.4178, "num_input_tokens_seen": 10925116928, "step": 2802, "train_runtime": 111468.2367, "train_tokens_per_second": 98011.032 }, { "epoch": 0.44562798092209854, "grad_norm": 0.19165284931659698, "learning_rate": 2.935078806699248e-05, "loss": 0.4122, "num_input_tokens_seen": 10929054391, "step": 2803, "train_runtime": 111507.9834, "train_tokens_per_second": 98011.407 }, { "epoch": 0.44578696343402224, "grad_norm": 0.19116871058940887, "learning_rate": 2.9338466142120335e-05, "loss": 0.4084, "num_input_tokens_seen": 10932882953, "step": 2804, "train_runtime": 111546.3286, "train_tokens_per_second": 98012.038 }, { "epoch": 0.44594594594594594, "grad_norm": 0.1892593652009964, "learning_rate": 2.9326143130493074e-05, "loss": 0.4141, "num_input_tokens_seen": 10936755124, "step": 2805, "train_runtime": 111579.5179, "train_tokens_per_second": 98017.587 }, { "epoch": 0.44610492845786964, "grad_norm": 0.19574721157550812, "learning_rate": 2.931381903519753e-05, "loss": 0.4171, "num_input_tokens_seen": 10940631635, "step": 2806, "train_runtime": 111621.5294, "train_tokens_per_second": 98015.425 }, { "epoch": 0.44626391096979334, "grad_norm": 0.19220945239067078, "learning_rate": 2.9301493859320795e-05, "loss": 0.4152, "num_input_tokens_seen": 10944505317, "step": 2807, "train_runtime": 111659.7758, "train_tokens_per_second": 98016.544 }, { "epoch": 0.446422893481717, "grad_norm": 0.18841363489627838, "learning_rate": 2.9289167605950247e-05, "loss": 0.4042, "num_input_tokens_seen": 10948508391, "step": 2808, "train_runtime": 111700.8129, "train_tokens_per_second": 98016.372 }, { "epoch": 0.4465818759936407, "grad_norm": 0.21625393629074097, "learning_rate": 2.9276840278173528e-05, "loss": 0.4174, "num_input_tokens_seen": 10952268392, "step": 2809, "train_runtime": 111737.9939, "train_tokens_per_second": 98017.407 }, { "epoch": 0.4467408585055644, "grad_norm": 0.1776374876499176, "learning_rate": 2.926451187907854e-05, "loss": 0.4105, "num_input_tokens_seen": 10956144530, "step": 2810, "train_runtime": 111776.7564, "train_tokens_per_second": 98018.093 }, { "epoch": 0.4468998410174881, "grad_norm": 0.18756799399852753, "learning_rate": 2.9252182411753454e-05, "loss": 0.4076, "num_input_tokens_seen": 10960042406, "step": 2811, "train_runtime": 111816.5076, "train_tokens_per_second": 98018.107 }, { "epoch": 0.4470588235294118, "grad_norm": 0.22698934376239777, "learning_rate": 2.9239851879286722e-05, "loss": 0.4188, "num_input_tokens_seen": 10963908577, "step": 2812, "train_runtime": 111855.9981, "train_tokens_per_second": 98018.066 }, { "epoch": 0.44721780604133543, "grad_norm": 0.17664341628551483, "learning_rate": 2.922752028476706e-05, "loss": 0.41, "num_input_tokens_seen": 10967713192, "step": 2813, "train_runtime": 111893.0303, "train_tokens_per_second": 98019.628 }, { "epoch": 0.44737678855325913, "grad_norm": 0.267187237739563, "learning_rate": 2.9215187631283443e-05, "loss": 0.4065, "num_input_tokens_seen": 10971685020, "step": 2814, "train_runtime": 111933.8016, "train_tokens_per_second": 98019.408 }, { "epoch": 0.44753577106518283, "grad_norm": 0.18188133835792542, "learning_rate": 2.92028539219251e-05, "loss": 0.4067, "num_input_tokens_seen": 10975564365, "step": 2815, "train_runtime": 111970.9168, "train_tokens_per_second": 98021.564 }, { "epoch": 0.44769475357710653, "grad_norm": 0.2002672702074051, "learning_rate": 2.9190519159781554e-05, "loss": 0.423, "num_input_tokens_seen": 10979470718, "step": 2816, "train_runtime": 112010.3944, "train_tokens_per_second": 98021.891 }, { "epoch": 0.44785373608903023, "grad_norm": 0.21553225815296173, "learning_rate": 2.9178183347942566e-05, "loss": 0.4063, "num_input_tokens_seen": 10983363683, "step": 2817, "train_runtime": 112047.4497, "train_tokens_per_second": 98024.218 }, { "epoch": 0.4480127186009539, "grad_norm": 0.20163023471832275, "learning_rate": 2.9165846489498182e-05, "loss": 0.4122, "num_input_tokens_seen": 10987379879, "step": 2818, "train_runtime": 112086.3273, "train_tokens_per_second": 98026.05 }, { "epoch": 0.4481717011128776, "grad_norm": 0.19877059757709503, "learning_rate": 2.915350858753868e-05, "loss": 0.4025, "num_input_tokens_seen": 10991208307, "step": 2819, "train_runtime": 112126.104, "train_tokens_per_second": 98025.419 }, { "epoch": 0.4483306836248013, "grad_norm": 0.216533362865448, "learning_rate": 2.914116964515463e-05, "loss": 0.4106, "num_input_tokens_seen": 10995088803, "step": 2820, "train_runtime": 112165.044, "train_tokens_per_second": 98025.984 }, { "epoch": 0.448489666136725, "grad_norm": 0.2124008983373642, "learning_rate": 2.9128829665436856e-05, "loss": 0.4102, "num_input_tokens_seen": 10998959259, "step": 2821, "train_runtime": 112205.5989, "train_tokens_per_second": 98025.048 }, { "epoch": 0.4486486486486487, "grad_norm": 0.21155647933483124, "learning_rate": 2.9116488651476427e-05, "loss": 0.4017, "num_input_tokens_seen": 11002837399, "step": 2822, "train_runtime": 112246.0751, "train_tokens_per_second": 98024.251 }, { "epoch": 0.4488076311605723, "grad_norm": 0.21630246937274933, "learning_rate": 2.9104146606364667e-05, "loss": 0.4102, "num_input_tokens_seen": 11006852890, "step": 2823, "train_runtime": 112286.1324, "train_tokens_per_second": 98025.042 }, { "epoch": 0.448966613672496, "grad_norm": 0.18859675526618958, "learning_rate": 2.9091803533193186e-05, "loss": 0.4041, "num_input_tokens_seen": 11010716945, "step": 2824, "train_runtime": 112324.9268, "train_tokens_per_second": 98025.588 }, { "epoch": 0.4491255961844197, "grad_norm": 0.25425052642822266, "learning_rate": 2.9079459435053834e-05, "loss": 0.4097, "num_input_tokens_seen": 11014575696, "step": 2825, "train_runtime": 112365.3865, "train_tokens_per_second": 98024.632 }, { "epoch": 0.4492845786963434, "grad_norm": 0.25355663895606995, "learning_rate": 2.9067114315038722e-05, "loss": 0.4082, "num_input_tokens_seen": 11018482803, "step": 2826, "train_runtime": 112403.6679, "train_tokens_per_second": 98026.008 }, { "epoch": 0.4494435612082671, "grad_norm": 0.6806780695915222, "learning_rate": 2.9054768176240215e-05, "loss": 0.4028, "num_input_tokens_seen": 11022272232, "step": 2827, "train_runtime": 112443.8242, "train_tokens_per_second": 98024.701 }, { "epoch": 0.44960254372019076, "grad_norm": 0.3007127046585083, "learning_rate": 2.904242102175093e-05, "loss": 0.3984, "num_input_tokens_seen": 11026230022, "step": 2828, "train_runtime": 112484.3077, "train_tokens_per_second": 98024.607 }, { "epoch": 0.44976152623211446, "grad_norm": 0.2848386764526367, "learning_rate": 2.9030072854663754e-05, "loss": 0.4041, "num_input_tokens_seen": 11029957640, "step": 2829, "train_runtime": 112521.5228, "train_tokens_per_second": 98025.314 }, { "epoch": 0.44992050874403816, "grad_norm": 0.19540542364120483, "learning_rate": 2.9017723678071797e-05, "loss": 0.4112, "num_input_tokens_seen": 11033864753, "step": 2830, "train_runtime": 112560.1892, "train_tokens_per_second": 98026.352 }, { "epoch": 0.45007949125596186, "grad_norm": 0.18549028038978577, "learning_rate": 2.9005373495068448e-05, "loss": 0.3953, "num_input_tokens_seen": 11037826748, "step": 2831, "train_runtime": 112599.9292, "train_tokens_per_second": 98026.942 }, { "epoch": 0.4502384737678855, "grad_norm": 0.25854071974754333, "learning_rate": 2.8993022308747343e-05, "loss": 0.416, "num_input_tokens_seen": 11041753507, "step": 2832, "train_runtime": 112640.7727, "train_tokens_per_second": 98026.259 }, { "epoch": 0.4503974562798092, "grad_norm": 0.19367051124572754, "learning_rate": 2.8980670122202368e-05, "loss": 0.3984, "num_input_tokens_seen": 11045702869, "step": 2833, "train_runtime": 112680.4297, "train_tokens_per_second": 98026.808 }, { "epoch": 0.4505564387917329, "grad_norm": 0.20044304430484772, "learning_rate": 2.896831693852765e-05, "loss": 0.412, "num_input_tokens_seen": 11049673757, "step": 2834, "train_runtime": 112723.2029, "train_tokens_per_second": 98024.838 }, { "epoch": 0.4507154213036566, "grad_norm": 0.2449989914894104, "learning_rate": 2.8955962760817585e-05, "loss": 0.4139, "num_input_tokens_seen": 11053510121, "step": 2835, "train_runtime": 112762.5733, "train_tokens_per_second": 98024.635 }, { "epoch": 0.4508744038155803, "grad_norm": 0.18588295578956604, "learning_rate": 2.8943607592166806e-05, "loss": 0.4233, "num_input_tokens_seen": 11057493114, "step": 2836, "train_runtime": 112801.6054, "train_tokens_per_second": 98026.026 }, { "epoch": 0.45103338632750395, "grad_norm": 0.21000592410564423, "learning_rate": 2.893125143567019e-05, "loss": 0.4007, "num_input_tokens_seen": 11061280582, "step": 2837, "train_runtime": 112842.2618, "train_tokens_per_second": 98024.272 }, { "epoch": 0.45119236883942765, "grad_norm": 0.22830627858638763, "learning_rate": 2.891889429442287e-05, "loss": 0.4058, "num_input_tokens_seen": 11065265743, "step": 2838, "train_runtime": 112880.9704, "train_tokens_per_second": 98025.962 }, { "epoch": 0.45135135135135135, "grad_norm": 0.2021588534116745, "learning_rate": 2.8906536171520215e-05, "loss": 0.4115, "num_input_tokens_seen": 11069238574, "step": 2839, "train_runtime": 112920.2265, "train_tokens_per_second": 98027.067 }, { "epoch": 0.45151033386327505, "grad_norm": 0.21003319323062897, "learning_rate": 2.8894177070057864e-05, "loss": 0.403, "num_input_tokens_seen": 11073207060, "step": 2840, "train_runtime": 112957.4494, "train_tokens_per_second": 98029.896 }, { "epoch": 0.45166931637519875, "grad_norm": 0.21879306435585022, "learning_rate": 2.8881816993131666e-05, "loss": 0.4173, "num_input_tokens_seen": 11076935783, "step": 2841, "train_runtime": 112996.9121, "train_tokens_per_second": 98028.659 }, { "epoch": 0.4518282988871224, "grad_norm": 0.19187374413013458, "learning_rate": 2.8869455943837743e-05, "loss": 0.4162, "num_input_tokens_seen": 11080864310, "step": 2842, "train_runtime": 113036.1193, "train_tokens_per_second": 98029.412 }, { "epoch": 0.4519872813990461, "grad_norm": 0.17717137932777405, "learning_rate": 2.8857093925272448e-05, "loss": 0.413, "num_input_tokens_seen": 11084918340, "step": 2843, "train_runtime": 113076.1382, "train_tokens_per_second": 98030.571 }, { "epoch": 0.4521462639109698, "grad_norm": 0.20552963018417358, "learning_rate": 2.884473094053239e-05, "loss": 0.4153, "num_input_tokens_seen": 11088782368, "step": 2844, "train_runtime": 113114.5058, "train_tokens_per_second": 98031.48 }, { "epoch": 0.4523052464228935, "grad_norm": 0.2770993113517761, "learning_rate": 2.8832366992714387e-05, "loss": 0.3971, "num_input_tokens_seen": 11092480520, "step": 2845, "train_runtime": 113151.667, "train_tokens_per_second": 98031.967 }, { "epoch": 0.4524642289348172, "grad_norm": 0.1780921071767807, "learning_rate": 2.882000208491554e-05, "loss": 0.4197, "num_input_tokens_seen": 11096550472, "step": 2846, "train_runtime": 113191.1288, "train_tokens_per_second": 98033.747 }, { "epoch": 0.45262321144674084, "grad_norm": 0.1994808316230774, "learning_rate": 2.8807636220233164e-05, "loss": 0.4067, "num_input_tokens_seen": 11100453561, "step": 2847, "train_runtime": 113229.8731, "train_tokens_per_second": 98034.673 }, { "epoch": 0.45278219395866454, "grad_norm": 0.22376516461372375, "learning_rate": 2.8795269401764817e-05, "loss": 0.414, "num_input_tokens_seen": 11104327773, "step": 2848, "train_runtime": 113269.0129, "train_tokens_per_second": 98035.001 }, { "epoch": 0.45294117647058824, "grad_norm": 0.19921261072158813, "learning_rate": 2.878290163260831e-05, "loss": 0.4118, "num_input_tokens_seen": 11108186310, "step": 2849, "train_runtime": 113310.6909, "train_tokens_per_second": 98032.994 }, { "epoch": 0.45310015898251194, "grad_norm": 0.18887299299240112, "learning_rate": 2.8770532915861675e-05, "loss": 0.4051, "num_input_tokens_seen": 11112160914, "step": 2850, "train_runtime": 113348.8472, "train_tokens_per_second": 98035.059 }, { "epoch": 0.45325914149443564, "grad_norm": 0.20943684875965118, "learning_rate": 2.8758163254623183e-05, "loss": 0.4006, "num_input_tokens_seen": 11115993602, "step": 2851, "train_runtime": 113387.5166, "train_tokens_per_second": 98035.427 }, { "epoch": 0.4534181240063593, "grad_norm": 0.2657046616077423, "learning_rate": 2.8745792651991356e-05, "loss": 0.4058, "num_input_tokens_seen": 11119821493, "step": 2852, "train_runtime": 113426.5221, "train_tokens_per_second": 98035.462 }, { "epoch": 0.453577106518283, "grad_norm": 0.2106996327638626, "learning_rate": 2.8733421111064934e-05, "loss": 0.4045, "num_input_tokens_seen": 11123759662, "step": 2853, "train_runtime": 113467.3072, "train_tokens_per_second": 98034.931 }, { "epoch": 0.4537360890302067, "grad_norm": 0.19877833127975464, "learning_rate": 2.8721048634942914e-05, "loss": 0.4138, "num_input_tokens_seen": 11127722764, "step": 2854, "train_runtime": 113507.6227, "train_tokens_per_second": 98035.026 }, { "epoch": 0.4538950715421304, "grad_norm": 0.2225445806980133, "learning_rate": 2.8708675226724503e-05, "loss": 0.4009, "num_input_tokens_seen": 11131548321, "step": 2855, "train_runtime": 113545.8015, "train_tokens_per_second": 98035.755 }, { "epoch": 0.4540540540540541, "grad_norm": 0.23420220613479614, "learning_rate": 2.8696300889509147e-05, "loss": 0.4261, "num_input_tokens_seen": 11135395712, "step": 2856, "train_runtime": 113584.1379, "train_tokens_per_second": 98036.539 }, { "epoch": 0.4542130365659777, "grad_norm": 0.2128322720527649, "learning_rate": 2.868392562639654e-05, "loss": 0.4071, "num_input_tokens_seen": 11139307994, "step": 2857, "train_runtime": 113625.3845, "train_tokens_per_second": 98035.382 }, { "epoch": 0.4543720190779014, "grad_norm": 0.20154428482055664, "learning_rate": 2.86715494404866e-05, "loss": 0.4166, "num_input_tokens_seen": 11143208437, "step": 2858, "train_runtime": 113662.9104, "train_tokens_per_second": 98037.332 }, { "epoch": 0.4545310015898251, "grad_norm": 0.19309590756893158, "learning_rate": 2.8659172334879465e-05, "loss": 0.4077, "num_input_tokens_seen": 11147145072, "step": 2859, "train_runtime": 113702.6998, "train_tokens_per_second": 98037.646 }, { "epoch": 0.4546899841017488, "grad_norm": 0.20634110271930695, "learning_rate": 2.864679431267552e-05, "loss": 0.4134, "num_input_tokens_seen": 11150996223, "step": 2860, "train_runtime": 113742.8309, "train_tokens_per_second": 98036.915 }, { "epoch": 0.4548489666136725, "grad_norm": 0.18953856825828552, "learning_rate": 2.863441537697537e-05, "loss": 0.4029, "num_input_tokens_seen": 11154937265, "step": 2861, "train_runtime": 113783.1489, "train_tokens_per_second": 98036.813 }, { "epoch": 0.45500794912559617, "grad_norm": 0.19820734858512878, "learning_rate": 2.862203553087984e-05, "loss": 0.4061, "num_input_tokens_seen": 11158806403, "step": 2862, "train_runtime": 113822.2199, "train_tokens_per_second": 98037.153 }, { "epoch": 0.45516693163751987, "grad_norm": 0.21176362037658691, "learning_rate": 2.8609654777490023e-05, "loss": 0.4056, "num_input_tokens_seen": 11162732653, "step": 2863, "train_runtime": 113862.2442, "train_tokens_per_second": 98037.174 }, { "epoch": 0.45532591414944357, "grad_norm": 0.19684256613254547, "learning_rate": 2.8597273119907174e-05, "loss": 0.4272, "num_input_tokens_seen": 11166624132, "step": 2864, "train_runtime": 113900.7227, "train_tokens_per_second": 98038.22 }, { "epoch": 0.45548489666136727, "grad_norm": 0.21438373625278473, "learning_rate": 2.8584890561232837e-05, "loss": 0.4039, "num_input_tokens_seen": 11170560117, "step": 2865, "train_runtime": 113937.5551, "train_tokens_per_second": 98041.073 }, { "epoch": 0.4556438791732909, "grad_norm": 0.21065902709960938, "learning_rate": 2.8572507104568742e-05, "loss": 0.4094, "num_input_tokens_seen": 11174387562, "step": 2866, "train_runtime": 113977.3288, "train_tokens_per_second": 98040.441 }, { "epoch": 0.4558028616852146, "grad_norm": 0.20554891228675842, "learning_rate": 2.8560122753016872e-05, "loss": 0.4027, "num_input_tokens_seen": 11178263374, "step": 2867, "train_runtime": 114016.4243, "train_tokens_per_second": 98040.817 }, { "epoch": 0.4559618441971383, "grad_norm": 0.20186755061149597, "learning_rate": 2.8547737509679395e-05, "loss": 0.4093, "num_input_tokens_seen": 11182209169, "step": 2868, "train_runtime": 114056.2313, "train_tokens_per_second": 98041.195 }, { "epoch": 0.456120826709062, "grad_norm": 0.23968663811683655, "learning_rate": 2.8535351377658754e-05, "loss": 0.3935, "num_input_tokens_seen": 11186018921, "step": 2869, "train_runtime": 114096.0489, "train_tokens_per_second": 98040.371 }, { "epoch": 0.4562798092209857, "grad_norm": 0.2036881297826767, "learning_rate": 2.8522964360057568e-05, "loss": 0.4102, "num_input_tokens_seen": 11189809385, "step": 2870, "train_runtime": 114135.0022, "train_tokens_per_second": 98040.121 }, { "epoch": 0.45643879173290935, "grad_norm": 0.2282949537038803, "learning_rate": 2.85105764599787e-05, "loss": 0.4082, "num_input_tokens_seen": 11193837707, "step": 2871, "train_runtime": 114175.572, "train_tokens_per_second": 98040.566 }, { "epoch": 0.45659777424483305, "grad_norm": 0.22969453036785126, "learning_rate": 2.8498187680525245e-05, "loss": 0.414, "num_input_tokens_seen": 11197709313, "step": 2872, "train_runtime": 114215.7025, "train_tokens_per_second": 98040.016 }, { "epoch": 0.45675675675675675, "grad_norm": 0.1914365440607071, "learning_rate": 2.8485798024800493e-05, "loss": 0.4213, "num_input_tokens_seen": 11201566054, "step": 2873, "train_runtime": 114256.8037, "train_tokens_per_second": 98038.504 }, { "epoch": 0.45691573926868045, "grad_norm": 0.17891737818717957, "learning_rate": 2.8473407495907968e-05, "loss": 0.4194, "num_input_tokens_seen": 11205505184, "step": 2874, "train_runtime": 114297.8899, "train_tokens_per_second": 98037.726 }, { "epoch": 0.45707472178060415, "grad_norm": 0.19439548254013062, "learning_rate": 2.8461016096951403e-05, "loss": 0.411, "num_input_tokens_seen": 11209329135, "step": 2875, "train_runtime": 114336.9215, "train_tokens_per_second": 98037.703 }, { "epoch": 0.4572337042925278, "grad_norm": 0.22464697062969208, "learning_rate": 2.8448623831034772e-05, "loss": 0.4089, "num_input_tokens_seen": 11213197049, "step": 2876, "train_runtime": 114376.7433, "train_tokens_per_second": 98037.387 }, { "epoch": 0.4573926868044515, "grad_norm": 0.18427787721157074, "learning_rate": 2.8436230701262236e-05, "loss": 0.4155, "num_input_tokens_seen": 11217179372, "step": 2877, "train_runtime": 114417.4241, "train_tokens_per_second": 98037.335 }, { "epoch": 0.4575516693163752, "grad_norm": 0.2548995912075043, "learning_rate": 2.842383671073819e-05, "loss": 0.4126, "num_input_tokens_seen": 11221082226, "step": 2878, "train_runtime": 114457.0438, "train_tokens_per_second": 98037.498 }, { "epoch": 0.4577106518282989, "grad_norm": 0.2428940236568451, "learning_rate": 2.841144186256724e-05, "loss": 0.3957, "num_input_tokens_seen": 11225073701, "step": 2879, "train_runtime": 114498.3327, "train_tokens_per_second": 98037.006 }, { "epoch": 0.4578696343402226, "grad_norm": 0.32926610112190247, "learning_rate": 2.8399046159854216e-05, "loss": 0.4006, "num_input_tokens_seen": 11228934804, "step": 2880, "train_runtime": 114537.7533, "train_tokens_per_second": 98036.975 }, { "epoch": 0.45802861685214624, "grad_norm": 0.21469393372535706, "learning_rate": 2.8386649605704147e-05, "loss": 0.4064, "num_input_tokens_seen": 11232796741, "step": 2881, "train_runtime": 114576.5968, "train_tokens_per_second": 98037.444 }, { "epoch": 0.45818759936406994, "grad_norm": 0.24564526975154877, "learning_rate": 2.8374252203222275e-05, "loss": 0.4024, "num_input_tokens_seen": 11236730925, "step": 2882, "train_runtime": 114617.4356, "train_tokens_per_second": 98036.838 }, { "epoch": 0.45834658187599364, "grad_norm": 0.23538897931575775, "learning_rate": 2.836185395551407e-05, "loss": 0.413, "num_input_tokens_seen": 11240686053, "step": 2883, "train_runtime": 114657.3607, "train_tokens_per_second": 98037.195 }, { "epoch": 0.45850556438791734, "grad_norm": 0.2041841745376587, "learning_rate": 2.834945486568522e-05, "loss": 0.41, "num_input_tokens_seen": 11244529135, "step": 2884, "train_runtime": 114697.871, "train_tokens_per_second": 98036.075 }, { "epoch": 0.45866454689984104, "grad_norm": 0.19690126180648804, "learning_rate": 2.8337054936841585e-05, "loss": 0.4097, "num_input_tokens_seen": 11248443383, "step": 2885, "train_runtime": 114734.3385, "train_tokens_per_second": 98039.031 }, { "epoch": 0.4588235294117647, "grad_norm": 0.20958982408046722, "learning_rate": 2.8324654172089272e-05, "loss": 0.4045, "num_input_tokens_seen": 11252404807, "step": 2886, "train_runtime": 114775.2384, "train_tokens_per_second": 98038.61 }, { "epoch": 0.4589825119236884, "grad_norm": 0.2501447796821594, "learning_rate": 2.8312252574534576e-05, "loss": 0.4071, "num_input_tokens_seen": 11256150980, "step": 2887, "train_runtime": 114813.7282, "train_tokens_per_second": 98038.372 }, { "epoch": 0.4591414944356121, "grad_norm": 0.19233086705207825, "learning_rate": 2.829985014728402e-05, "loss": 0.4017, "num_input_tokens_seen": 11259962483, "step": 2888, "train_runtime": 114852.2855, "train_tokens_per_second": 98038.645 }, { "epoch": 0.4593004769475358, "grad_norm": 0.19996239244937897, "learning_rate": 2.8287446893444314e-05, "loss": 0.4049, "num_input_tokens_seen": 11263940680, "step": 2889, "train_runtime": 114893.309, "train_tokens_per_second": 98038.265 }, { "epoch": 0.4594594594594595, "grad_norm": 0.2386453002691269, "learning_rate": 2.82750428161224e-05, "loss": 0.4045, "num_input_tokens_seen": 11267822776, "step": 2890, "train_runtime": 114933.2293, "train_tokens_per_second": 98037.99 }, { "epoch": 0.45961844197138313, "grad_norm": 0.18404613435268402, "learning_rate": 2.8262637918425394e-05, "loss": 0.4097, "num_input_tokens_seen": 11271801789, "step": 2891, "train_runtime": 114973.5001, "train_tokens_per_second": 98038.259 }, { "epoch": 0.45977742448330683, "grad_norm": 0.1956634819507599, "learning_rate": 2.8250232203460658e-05, "loss": 0.3995, "num_input_tokens_seen": 11275613034, "step": 2892, "train_runtime": 115012.302, "train_tokens_per_second": 98038.321 }, { "epoch": 0.45993640699523053, "grad_norm": 0.24067695438861847, "learning_rate": 2.823782567433571e-05, "loss": 0.4103, "num_input_tokens_seen": 11279439614, "step": 2893, "train_runtime": 115052.2968, "train_tokens_per_second": 98037.501 }, { "epoch": 0.46009538950715423, "grad_norm": 0.2239156812429428, "learning_rate": 2.8225418334158314e-05, "loss": 0.4181, "num_input_tokens_seen": 11283277970, "step": 2894, "train_runtime": 115091.7168, "train_tokens_per_second": 98037.272 }, { "epoch": 0.46025437201907793, "grad_norm": 0.1969916969537735, "learning_rate": 2.821301018603642e-05, "loss": 0.4208, "num_input_tokens_seen": 11287040321, "step": 2895, "train_runtime": 115133.6689, "train_tokens_per_second": 98034.228 }, { "epoch": 0.4604133545310016, "grad_norm": 0.17970001697540283, "learning_rate": 2.8200601233078176e-05, "loss": 0.399, "num_input_tokens_seen": 11290950138, "step": 2896, "train_runtime": 115172.2109, "train_tokens_per_second": 98035.369 }, { "epoch": 0.4605723370429253, "grad_norm": 0.1990012228488922, "learning_rate": 2.818819147839194e-05, "loss": 0.4065, "num_input_tokens_seen": 11295016334, "step": 2897, "train_runtime": 115212.2905, "train_tokens_per_second": 98036.557 }, { "epoch": 0.460731319554849, "grad_norm": 0.1917579621076584, "learning_rate": 2.8175780925086265e-05, "loss": 0.4015, "num_input_tokens_seen": 11298921577, "step": 2898, "train_runtime": 115252.4601, "train_tokens_per_second": 98036.272 }, { "epoch": 0.4608903020667727, "grad_norm": 0.31478092074394226, "learning_rate": 2.8163369576269915e-05, "loss": 0.4051, "num_input_tokens_seen": 11302894684, "step": 2899, "train_runtime": 115292.1029, "train_tokens_per_second": 98037.024 }, { "epoch": 0.4610492845786963, "grad_norm": 0.18425999581813812, "learning_rate": 2.8150957435051834e-05, "loss": 0.4097, "num_input_tokens_seen": 11306771204, "step": 2900, "train_runtime": 115331.386, "train_tokens_per_second": 98037.244 }, { "epoch": 0.46120826709062, "grad_norm": 0.22088530659675598, "learning_rate": 2.8138544504541188e-05, "loss": 0.4112, "num_input_tokens_seen": 11310681048, "step": 2901, "train_runtime": 115373.4347, "train_tokens_per_second": 98035.402 }, { "epoch": 0.4613672496025437, "grad_norm": 0.1982358992099762, "learning_rate": 2.8126130787847317e-05, "loss": 0.4102, "num_input_tokens_seen": 11314697067, "step": 2902, "train_runtime": 115411.9992, "train_tokens_per_second": 98037.441 }, { "epoch": 0.4615262321144674, "grad_norm": 0.20796597003936768, "learning_rate": 2.8113716288079773e-05, "loss": 0.4052, "num_input_tokens_seen": 11318492676, "step": 2903, "train_runtime": 115448.8714, "train_tokens_per_second": 98039.007 }, { "epoch": 0.4616852146263911, "grad_norm": 0.2910042405128479, "learning_rate": 2.8101301008348303e-05, "loss": 0.3988, "num_input_tokens_seen": 11322431626, "step": 2904, "train_runtime": 115489.6455, "train_tokens_per_second": 98038.5 }, { "epoch": 0.46184419713831476, "grad_norm": 0.18300853669643402, "learning_rate": 2.8088884951762846e-05, "loss": 0.4088, "num_input_tokens_seen": 11326287812, "step": 2905, "train_runtime": 115529.7254, "train_tokens_per_second": 98037.867 }, { "epoch": 0.46200317965023846, "grad_norm": 0.2108365297317505, "learning_rate": 2.807646812143353e-05, "loss": 0.3949, "num_input_tokens_seen": 11330208142, "step": 2906, "train_runtime": 115568.1929, "train_tokens_per_second": 98039.156 }, { "epoch": 0.46216216216216216, "grad_norm": 0.20093412697315216, "learning_rate": 2.80640505204707e-05, "loss": 0.4168, "num_input_tokens_seen": 11334147233, "step": 2907, "train_runtime": 115609.8231, "train_tokens_per_second": 98037.926 }, { "epoch": 0.46232114467408586, "grad_norm": 0.2531784474849701, "learning_rate": 2.8051632151984858e-05, "loss": 0.3957, "num_input_tokens_seen": 11338056148, "step": 2908, "train_runtime": 115648.929, "train_tokens_per_second": 98038.575 }, { "epoch": 0.46248012718600956, "grad_norm": 0.21415624022483826, "learning_rate": 2.803921301908673e-05, "loss": 0.4139, "num_input_tokens_seen": 11341975087, "step": 2909, "train_runtime": 115690.3933, "train_tokens_per_second": 98037.311 }, { "epoch": 0.4626391096979332, "grad_norm": 0.21074122190475464, "learning_rate": 2.8026793124887217e-05, "loss": 0.422, "num_input_tokens_seen": 11345909051, "step": 2910, "train_runtime": 115730.8762, "train_tokens_per_second": 98037.01 }, { "epoch": 0.4627980922098569, "grad_norm": 0.23087948560714722, "learning_rate": 2.8014372472497408e-05, "loss": 0.4092, "num_input_tokens_seen": 11349771445, "step": 2911, "train_runtime": 115771.3049, "train_tokens_per_second": 98036.136 }, { "epoch": 0.4629570747217806, "grad_norm": 0.20259612798690796, "learning_rate": 2.8001951065028596e-05, "loss": 0.4055, "num_input_tokens_seen": 11353740722, "step": 2912, "train_runtime": 115809.0439, "train_tokens_per_second": 98038.463 }, { "epoch": 0.4631160572337043, "grad_norm": 0.20838524401187897, "learning_rate": 2.7989528905592265e-05, "loss": 0.4133, "num_input_tokens_seen": 11357579764, "step": 2913, "train_runtime": 115846.511, "train_tokens_per_second": 98039.895 }, { "epoch": 0.463275039745628, "grad_norm": 0.19196178019046783, "learning_rate": 2.7977105997300062e-05, "loss": 0.4095, "num_input_tokens_seen": 11361437616, "step": 2914, "train_runtime": 115886.3723, "train_tokens_per_second": 98039.462 }, { "epoch": 0.46343402225755165, "grad_norm": 0.24844548106193542, "learning_rate": 2.7964682343263843e-05, "loss": 0.4201, "num_input_tokens_seen": 11365291425, "step": 2915, "train_runtime": 115925.3673, "train_tokens_per_second": 98039.728 }, { "epoch": 0.46359300476947535, "grad_norm": 0.19864293932914734, "learning_rate": 2.795225794659565e-05, "loss": 0.4006, "num_input_tokens_seen": 11369189133, "step": 2916, "train_runtime": 115964.3398, "train_tokens_per_second": 98040.39 }, { "epoch": 0.46375198728139905, "grad_norm": 0.19592100381851196, "learning_rate": 2.7939832810407707e-05, "loss": 0.4031, "num_input_tokens_seen": 11373101747, "step": 2917, "train_runtime": 116003.2219, "train_tokens_per_second": 98041.258 }, { "epoch": 0.46391096979332275, "grad_norm": 0.19899703562259674, "learning_rate": 2.7927406937812405e-05, "loss": 0.4084, "num_input_tokens_seen": 11376933946, "step": 2918, "train_runtime": 116042.4228, "train_tokens_per_second": 98041.162 }, { "epoch": 0.46406995230524645, "grad_norm": 0.1991507112979889, "learning_rate": 2.7914980331922357e-05, "loss": 0.412, "num_input_tokens_seen": 11380881691, "step": 2919, "train_runtime": 116080.426, "train_tokens_per_second": 98043.073 }, { "epoch": 0.4642289348171701, "grad_norm": 0.24976786971092224, "learning_rate": 2.790255299585034e-05, "loss": 0.4042, "num_input_tokens_seen": 11384785732, "step": 2920, "train_runtime": 116119.8455, "train_tokens_per_second": 98043.411 }, { "epoch": 0.4643879173290938, "grad_norm": 0.19607120752334595, "learning_rate": 2.7890124932709298e-05, "loss": 0.4056, "num_input_tokens_seen": 11388638989, "step": 2921, "train_runtime": 116159.4568, "train_tokens_per_second": 98043.149 }, { "epoch": 0.4645468998410175, "grad_norm": 0.20407165586948395, "learning_rate": 2.7877696145612392e-05, "loss": 0.4037, "num_input_tokens_seen": 11392605459, "step": 2922, "train_runtime": 116200.9176, "train_tokens_per_second": 98042.302 }, { "epoch": 0.4647058823529412, "grad_norm": 0.18896439671516418, "learning_rate": 2.786526663767293e-05, "loss": 0.4078, "num_input_tokens_seen": 11396506114, "step": 2923, "train_runtime": 116238.6611, "train_tokens_per_second": 98044.024 }, { "epoch": 0.4648648648648649, "grad_norm": 0.18532612919807434, "learning_rate": 2.7852836412004436e-05, "loss": 0.396, "num_input_tokens_seen": 11400280312, "step": 2924, "train_runtime": 116278.1453, "train_tokens_per_second": 98043.19 }, { "epoch": 0.46502384737678853, "grad_norm": 0.19220592081546783, "learning_rate": 2.7840405471720565e-05, "loss": 0.3982, "num_input_tokens_seen": 11404121376, "step": 2925, "train_runtime": 116317.0867, "train_tokens_per_second": 98043.389 }, { "epoch": 0.46518282988871223, "grad_norm": 0.3144510090351105, "learning_rate": 2.78279738199352e-05, "loss": 0.4115, "num_input_tokens_seen": 11408017536, "step": 2926, "train_runtime": 116356.1881, "train_tokens_per_second": 98043.926 }, { "epoch": 0.46534181240063593, "grad_norm": 0.20006822049617767, "learning_rate": 2.7815541459762372e-05, "loss": 0.4057, "num_input_tokens_seen": 11411948844, "step": 2927, "train_runtime": 116395.556, "train_tokens_per_second": 98044.541 }, { "epoch": 0.46550079491255963, "grad_norm": 0.23792266845703125, "learning_rate": 2.7803108394316307e-05, "loss": 0.4039, "num_input_tokens_seen": 11415799035, "step": 2928, "train_runtime": 116435.8962, "train_tokens_per_second": 98043.64 }, { "epoch": 0.46565977742448333, "grad_norm": 0.1991821825504303, "learning_rate": 2.77906746267114e-05, "loss": 0.4069, "num_input_tokens_seen": 11419783458, "step": 2929, "train_runtime": 116472.4557, "train_tokens_per_second": 98047.074 }, { "epoch": 0.465818759936407, "grad_norm": 0.2274869829416275, "learning_rate": 2.7778240160062214e-05, "loss": 0.4163, "num_input_tokens_seen": 11423840927, "step": 2930, "train_runtime": 116513.4665, "train_tokens_per_second": 98047.387 }, { "epoch": 0.4659777424483307, "grad_norm": 0.21721431612968445, "learning_rate": 2.7765804997483504e-05, "loss": 0.4086, "num_input_tokens_seen": 11427542106, "step": 2931, "train_runtime": 116553.528, "train_tokens_per_second": 98045.441 }, { "epoch": 0.4661367249602544, "grad_norm": 0.23070767521858215, "learning_rate": 2.775336914209019e-05, "loss": 0.4002, "num_input_tokens_seen": 11431465168, "step": 2932, "train_runtime": 116596.4484, "train_tokens_per_second": 98042.996 }, { "epoch": 0.4662957074721781, "grad_norm": 0.2074798196554184, "learning_rate": 2.7740932596997354e-05, "loss": 0.4142, "num_input_tokens_seen": 11435423962, "step": 2933, "train_runtime": 116636.4842, "train_tokens_per_second": 98043.284 }, { "epoch": 0.4664546899841017, "grad_norm": 0.530981719493866, "learning_rate": 2.7728495365320272e-05, "loss": 0.4009, "num_input_tokens_seen": 11439306216, "step": 2934, "train_runtime": 116672.8947, "train_tokens_per_second": 98045.962 }, { "epoch": 0.4666136724960254, "grad_norm": 0.24213308095932007, "learning_rate": 2.771605745017439e-05, "loss": 0.4077, "num_input_tokens_seen": 11443083762, "step": 2935, "train_runtime": 116711.8538, "train_tokens_per_second": 98045.6 }, { "epoch": 0.4667726550079491, "grad_norm": 0.19970108568668365, "learning_rate": 2.7703618854675295e-05, "loss": 0.4107, "num_input_tokens_seen": 11446945232, "step": 2936, "train_runtime": 116751.0554, "train_tokens_per_second": 98045.754 }, { "epoch": 0.4669316375198728, "grad_norm": 0.200904980301857, "learning_rate": 2.7691179581938787e-05, "loss": 0.3943, "num_input_tokens_seen": 11450893999, "step": 2937, "train_runtime": 116788.9118, "train_tokens_per_second": 98047.784 }, { "epoch": 0.4670906200317965, "grad_norm": 0.2295939028263092, "learning_rate": 2.7678739635080814e-05, "loss": 0.4067, "num_input_tokens_seen": 11454788956, "step": 2938, "train_runtime": 116829.1165, "train_tokens_per_second": 98047.381 }, { "epoch": 0.46724960254372017, "grad_norm": 0.23148250579833984, "learning_rate": 2.7666299017217497e-05, "loss": 0.4065, "num_input_tokens_seen": 11458632943, "step": 2939, "train_runtime": 116869.7473, "train_tokens_per_second": 98046.186 }, { "epoch": 0.46740858505564387, "grad_norm": 0.22622229158878326, "learning_rate": 2.765385773146511e-05, "loss": 0.3998, "num_input_tokens_seen": 11462568871, "step": 2940, "train_runtime": 116904.6, "train_tokens_per_second": 98050.623 }, { "epoch": 0.46756756756756757, "grad_norm": 0.19366410374641418, "learning_rate": 2.7641415780940112e-05, "loss": 0.3922, "num_input_tokens_seen": 11466519992, "step": 2941, "train_runtime": 116943.1144, "train_tokens_per_second": 98052.117 }, { "epoch": 0.46772655007949127, "grad_norm": 0.23745481669902802, "learning_rate": 2.7628973168759126e-05, "loss": 0.422, "num_input_tokens_seen": 11470344204, "step": 2942, "train_runtime": 116983.9327, "train_tokens_per_second": 98050.595 }, { "epoch": 0.46788553259141497, "grad_norm": 0.2537466883659363, "learning_rate": 2.761652989803894e-05, "loss": 0.4205, "num_input_tokens_seen": 11474344069, "step": 2943, "train_runtime": 117024.0486, "train_tokens_per_second": 98051.163 }, { "epoch": 0.4680445151033386, "grad_norm": 0.2390303909778595, "learning_rate": 2.7604085971896504e-05, "loss": 0.3863, "num_input_tokens_seen": 11478185002, "step": 2944, "train_runtime": 117063.2672, "train_tokens_per_second": 98051.125 }, { "epoch": 0.4682034976152623, "grad_norm": 0.20933295786380768, "learning_rate": 2.7591641393448936e-05, "loss": 0.4049, "num_input_tokens_seen": 11482017120, "step": 2945, "train_runtime": 117101.0306, "train_tokens_per_second": 98052.229 }, { "epoch": 0.468362480127186, "grad_norm": 0.19108162820339203, "learning_rate": 2.757919616581351e-05, "loss": 0.4128, "num_input_tokens_seen": 11485938430, "step": 2946, "train_runtime": 117139.391, "train_tokens_per_second": 98053.595 }, { "epoch": 0.4685214626391097, "grad_norm": 0.23321451246738434, "learning_rate": 2.7566750292107672e-05, "loss": 0.4168, "num_input_tokens_seen": 11489820587, "step": 2947, "train_runtime": 117180.3537, "train_tokens_per_second": 98052.448 }, { "epoch": 0.4686804451510334, "grad_norm": 1.3234502077102661, "learning_rate": 2.7554303775449024e-05, "loss": 0.415, "num_input_tokens_seen": 11493753202, "step": 2948, "train_runtime": 117220.5895, "train_tokens_per_second": 98052.341 }, { "epoch": 0.46883942766295705, "grad_norm": 0.28431785106658936, "learning_rate": 2.754185661895533e-05, "loss": 0.4177, "num_input_tokens_seen": 11497672899, "step": 2949, "train_runtime": 117260.9938, "train_tokens_per_second": 98051.982 }, { "epoch": 0.46899841017488075, "grad_norm": 0.2952378988265991, "learning_rate": 2.7529408825744512e-05, "loss": 0.4321, "num_input_tokens_seen": 11501548810, "step": 2950, "train_runtime": 117302.376, "train_tokens_per_second": 98050.433 }, { "epoch": 0.46915739268680445, "grad_norm": 0.21933667361736298, "learning_rate": 2.7516960398934654e-05, "loss": 0.4255, "num_input_tokens_seen": 11505511051, "step": 2951, "train_runtime": 117341.4638, "train_tokens_per_second": 98051.538 }, { "epoch": 0.46931637519872815, "grad_norm": 0.19670219719409943, "learning_rate": 2.7504511341644008e-05, "loss": 0.4128, "num_input_tokens_seen": 11509389230, "step": 2952, "train_runtime": 117376.8008, "train_tokens_per_second": 98055.06 }, { "epoch": 0.46947535771065185, "grad_norm": 0.2188776582479477, "learning_rate": 2.749206165699096e-05, "loss": 0.4037, "num_input_tokens_seen": 11513323685, "step": 2953, "train_runtime": 117414.7071, "train_tokens_per_second": 98056.913 }, { "epoch": 0.4696343402225755, "grad_norm": 0.205160453915596, "learning_rate": 2.7479611348094085e-05, "loss": 0.4101, "num_input_tokens_seen": 11517315006, "step": 2954, "train_runtime": 117454.5665, "train_tokens_per_second": 98057.618 }, { "epoch": 0.4697933227344992, "grad_norm": 0.20746436715126038, "learning_rate": 2.7467160418072085e-05, "loss": 0.41, "num_input_tokens_seen": 11521218979, "step": 2955, "train_runtime": 117493.3261, "train_tokens_per_second": 98058.497 }, { "epoch": 0.4699523052464229, "grad_norm": 0.20467738807201385, "learning_rate": 2.7454708870043843e-05, "loss": 0.3997, "num_input_tokens_seen": 11525134470, "step": 2956, "train_runtime": 117531.5156, "train_tokens_per_second": 98059.949 }, { "epoch": 0.4701112877583466, "grad_norm": 0.32587510347366333, "learning_rate": 2.7442256707128366e-05, "loss": 0.3983, "num_input_tokens_seen": 11528946340, "step": 2957, "train_runtime": 117570.6176, "train_tokens_per_second": 98059.758 }, { "epoch": 0.4702702702702703, "grad_norm": 0.3112965524196625, "learning_rate": 2.7429803932444847e-05, "loss": 0.4079, "num_input_tokens_seen": 11533017738, "step": 2958, "train_runtime": 117612.2852, "train_tokens_per_second": 98059.635 }, { "epoch": 0.47042925278219394, "grad_norm": 0.19694533944129944, "learning_rate": 2.7417350549112607e-05, "loss": 0.4086, "num_input_tokens_seen": 11536873542, "step": 2959, "train_runtime": 117649.9187, "train_tokens_per_second": 98061.041 }, { "epoch": 0.47058823529411764, "grad_norm": 0.24935489892959595, "learning_rate": 2.7404896560251137e-05, "loss": 0.4137, "num_input_tokens_seen": 11540577802, "step": 2960, "train_runtime": 117688.1294, "train_tokens_per_second": 98060.678 }, { "epoch": 0.47074721780604134, "grad_norm": 0.19994114339351654, "learning_rate": 2.739244196898007e-05, "loss": 0.4009, "num_input_tokens_seen": 11544517686, "step": 2961, "train_runtime": 117726.2137, "train_tokens_per_second": 98062.422 }, { "epoch": 0.47090620031796504, "grad_norm": 0.22491303086280823, "learning_rate": 2.7379986778419196e-05, "loss": 0.3962, "num_input_tokens_seen": 11548371057, "step": 2962, "train_runtime": 117766.0387, "train_tokens_per_second": 98061.981 }, { "epoch": 0.4710651828298887, "grad_norm": 0.23503616452217102, "learning_rate": 2.7367530991688462e-05, "loss": 0.4126, "num_input_tokens_seen": 11552342259, "step": 2963, "train_runtime": 117800.8965, "train_tokens_per_second": 98066.675 }, { "epoch": 0.4712241653418124, "grad_norm": 0.1924910843372345, "learning_rate": 2.7355074611907945e-05, "loss": 0.4178, "num_input_tokens_seen": 11556194057, "step": 2964, "train_runtime": 117843.7541, "train_tokens_per_second": 98063.696 }, { "epoch": 0.4713831478537361, "grad_norm": 0.24293097853660583, "learning_rate": 2.7342617642197864e-05, "loss": 0.412, "num_input_tokens_seen": 11560109841, "step": 2965, "train_runtime": 117882.3464, "train_tokens_per_second": 98064.809 }, { "epoch": 0.4715421303656598, "grad_norm": 0.18801705539226532, "learning_rate": 2.7330160085678625e-05, "loss": 0.4062, "num_input_tokens_seen": 11564085962, "step": 2966, "train_runtime": 117920.2935, "train_tokens_per_second": 98066.971 }, { "epoch": 0.4717011128775835, "grad_norm": 0.22788521647453308, "learning_rate": 2.731770194547076e-05, "loss": 0.4075, "num_input_tokens_seen": 11568013559, "step": 2967, "train_runtime": 117958.6464, "train_tokens_per_second": 98068.382 }, { "epoch": 0.47186009538950713, "grad_norm": 0.19543813169002533, "learning_rate": 2.7305243224694922e-05, "loss": 0.4059, "num_input_tokens_seen": 11571840280, "step": 2968, "train_runtime": 117998.2207, "train_tokens_per_second": 98067.922 }, { "epoch": 0.47201907790143083, "grad_norm": 0.2883329391479492, "learning_rate": 2.729278392647195e-05, "loss": 0.4216, "num_input_tokens_seen": 11575777683, "step": 2969, "train_runtime": 118038.7662, "train_tokens_per_second": 98067.593 }, { "epoch": 0.47217806041335453, "grad_norm": 0.2505757808685303, "learning_rate": 2.72803240539228e-05, "loss": 0.4041, "num_input_tokens_seen": 11579668179, "step": 2970, "train_runtime": 118077.5356, "train_tokens_per_second": 98068.342 }, { "epoch": 0.4723370429252782, "grad_norm": 0.22880296409130096, "learning_rate": 2.72678636101686e-05, "loss": 0.407, "num_input_tokens_seen": 11583528681, "step": 2971, "train_runtime": 118117.6125, "train_tokens_per_second": 98067.752 }, { "epoch": 0.4724960254372019, "grad_norm": 0.2001541256904602, "learning_rate": 2.7255402598330588e-05, "loss": 0.4041, "num_input_tokens_seen": 11587376179, "step": 2972, "train_runtime": 118159.1225, "train_tokens_per_second": 98065.862 }, { "epoch": 0.47265500794912557, "grad_norm": 0.3815215826034546, "learning_rate": 2.724294102153016e-05, "loss": 0.4038, "num_input_tokens_seen": 11591388086, "step": 2973, "train_runtime": 118198.8065, "train_tokens_per_second": 98066.879 }, { "epoch": 0.47281399046104927, "grad_norm": 0.18002015352249146, "learning_rate": 2.7230478882888854e-05, "loss": 0.4078, "num_input_tokens_seen": 11595362202, "step": 2974, "train_runtime": 118236.2856, "train_tokens_per_second": 98069.405 }, { "epoch": 0.47297297297297297, "grad_norm": 0.21028396487236023, "learning_rate": 2.721801618552835e-05, "loss": 0.4081, "num_input_tokens_seen": 11599148526, "step": 2975, "train_runtime": 118276.7562, "train_tokens_per_second": 98067.861 }, { "epoch": 0.47313195548489667, "grad_norm": 0.20652151107788086, "learning_rate": 2.7205552932570466e-05, "loss": 0.4155, "num_input_tokens_seen": 11603074418, "step": 2976, "train_runtime": 118316.7641, "train_tokens_per_second": 98067.882 }, { "epoch": 0.47329093799682037, "grad_norm": 0.21648496389389038, "learning_rate": 2.7193089127137157e-05, "loss": 0.408, "num_input_tokens_seen": 11607032337, "step": 2977, "train_runtime": 118356.2155, "train_tokens_per_second": 98068.634 }, { "epoch": 0.473449920508744, "grad_norm": 0.26213666796684265, "learning_rate": 2.7180624772350517e-05, "loss": 0.4007, "num_input_tokens_seen": 11610833946, "step": 2978, "train_runtime": 118395.2021, "train_tokens_per_second": 98068.45 }, { "epoch": 0.4736089030206677, "grad_norm": 0.20778538286685944, "learning_rate": 2.7168159871332787e-05, "loss": 0.4285, "num_input_tokens_seen": 11614737723, "step": 2979, "train_runtime": 118437.7538, "train_tokens_per_second": 98066.177 }, { "epoch": 0.4737678855325914, "grad_norm": 0.19831253588199615, "learning_rate": 2.7155694427206324e-05, "loss": 0.414, "num_input_tokens_seen": 11618669156, "step": 2980, "train_runtime": 118477.7883, "train_tokens_per_second": 98066.223 }, { "epoch": 0.4739268680445151, "grad_norm": 0.24522484838962555, "learning_rate": 2.7143228443093642e-05, "loss": 0.4087, "num_input_tokens_seen": 11622599412, "step": 2981, "train_runtime": 118518.2926, "train_tokens_per_second": 98065.869 }, { "epoch": 0.4740858505564388, "grad_norm": 0.6245201230049133, "learning_rate": 2.713076192211738e-05, "loss": 0.4091, "num_input_tokens_seen": 11626505762, "step": 2982, "train_runtime": 118557.0172, "train_tokens_per_second": 98066.787 }, { "epoch": 0.47424483306836246, "grad_norm": 0.2156086564064026, "learning_rate": 2.7118294867400313e-05, "loss": 0.4094, "num_input_tokens_seen": 11630364579, "step": 2983, "train_runtime": 118597.2406, "train_tokens_per_second": 98066.064 }, { "epoch": 0.47440381558028616, "grad_norm": 0.18700045347213745, "learning_rate": 2.7105827282065344e-05, "loss": 0.4109, "num_input_tokens_seen": 11634213527, "step": 2984, "train_runtime": 118635.8061, "train_tokens_per_second": 98066.629 }, { "epoch": 0.47456279809220986, "grad_norm": 0.21241801977157593, "learning_rate": 2.709335916923553e-05, "loss": 0.4073, "num_input_tokens_seen": 11638160500, "step": 2985, "train_runtime": 118676.0331, "train_tokens_per_second": 98066.646 }, { "epoch": 0.47472178060413356, "grad_norm": 0.1820506900548935, "learning_rate": 2.7080890532034038e-05, "loss": 0.4132, "num_input_tokens_seen": 11641958264, "step": 2986, "train_runtime": 118715.0475, "train_tokens_per_second": 98066.408 }, { "epoch": 0.47488076311605726, "grad_norm": 0.2158978134393692, "learning_rate": 2.706842137358417e-05, "loss": 0.4122, "num_input_tokens_seen": 11645820111, "step": 2987, "train_runtime": 118752.2589, "train_tokens_per_second": 98068.199 }, { "epoch": 0.4750397456279809, "grad_norm": 0.30425676703453064, "learning_rate": 2.7055951697009357e-05, "loss": 0.4075, "num_input_tokens_seen": 11649727335, "step": 2988, "train_runtime": 118791.0352, "train_tokens_per_second": 98069.078 }, { "epoch": 0.4751987281399046, "grad_norm": 0.20952799916267395, "learning_rate": 2.7043481505433177e-05, "loss": 0.4146, "num_input_tokens_seen": 11653680336, "step": 2989, "train_runtime": 118832.7923, "train_tokens_per_second": 98067.883 }, { "epoch": 0.4753577106518283, "grad_norm": 0.19931437075138092, "learning_rate": 2.7031010801979322e-05, "loss": 0.4027, "num_input_tokens_seen": 11657592299, "step": 2990, "train_runtime": 118872.3347, "train_tokens_per_second": 98068.17 }, { "epoch": 0.475516693163752, "grad_norm": 0.20732875168323517, "learning_rate": 2.701853958977161e-05, "loss": 0.4159, "num_input_tokens_seen": 11661489575, "step": 2991, "train_runtime": 118909.7459, "train_tokens_per_second": 98070.091 }, { "epoch": 0.4756756756756757, "grad_norm": 0.2033282071352005, "learning_rate": 2.7006067871933993e-05, "loss": 0.4154, "num_input_tokens_seen": 11665389710, "step": 2992, "train_runtime": 118949.5365, "train_tokens_per_second": 98070.073 }, { "epoch": 0.47583465818759935, "grad_norm": 0.19739560782909393, "learning_rate": 2.6993595651590552e-05, "loss": 0.4117, "num_input_tokens_seen": 11669264081, "step": 2993, "train_runtime": 118989.1471, "train_tokens_per_second": 98069.987 }, { "epoch": 0.47599364069952305, "grad_norm": 0.3441230058670044, "learning_rate": 2.6981122931865494e-05, "loss": 0.3973, "num_input_tokens_seen": 11673217936, "step": 2994, "train_runtime": 119025.7196, "train_tokens_per_second": 98073.072 }, { "epoch": 0.47615262321144675, "grad_norm": 0.19049927592277527, "learning_rate": 2.696864971588313e-05, "loss": 0.3988, "num_input_tokens_seen": 11676978223, "step": 2995, "train_runtime": 119062.9309, "train_tokens_per_second": 98074.003 }, { "epoch": 0.47631160572337045, "grad_norm": 0.4573463201522827, "learning_rate": 2.695617600676793e-05, "loss": 0.4034, "num_input_tokens_seen": 11680941785, "step": 2996, "train_runtime": 119102.3304, "train_tokens_per_second": 98074.838 }, { "epoch": 0.4764705882352941, "grad_norm": 0.1973290592432022, "learning_rate": 2.6943701807644462e-05, "loss": 0.4046, "num_input_tokens_seen": 11684841546, "step": 2997, "train_runtime": 119142.9698, "train_tokens_per_second": 98074.117 }, { "epoch": 0.4766295707472178, "grad_norm": 0.23059126734733582, "learning_rate": 2.6931227121637426e-05, "loss": 0.4095, "num_input_tokens_seen": 11688755079, "step": 2998, "train_runtime": 119178.6981, "train_tokens_per_second": 98077.553 }, { "epoch": 0.4767885532591415, "grad_norm": 0.2272626757621765, "learning_rate": 2.6918751951871645e-05, "loss": 0.4131, "num_input_tokens_seen": 11692681733, "step": 2999, "train_runtime": 119215.8952, "train_tokens_per_second": 98079.889 }, { "epoch": 0.4769475357710652, "grad_norm": 0.18680359423160553, "learning_rate": 2.690627630147205e-05, "loss": 0.4148, "num_input_tokens_seen": 11696396130, "step": 3000, "train_runtime": 119254.1091, "train_tokens_per_second": 98079.607 }, { "epoch": 0.4771065182829889, "grad_norm": 0.20392856001853943, "learning_rate": 2.6893800173563715e-05, "loss": 0.4061, "num_input_tokens_seen": 11700396272, "step": 3001, "train_runtime": 119405.9694, "train_tokens_per_second": 97988.37 }, { "epoch": 0.47726550079491253, "grad_norm": 0.1943286657333374, "learning_rate": 2.6881323571271826e-05, "loss": 0.4146, "num_input_tokens_seen": 11704299504, "step": 3002, "train_runtime": 119444.5943, "train_tokens_per_second": 97989.361 }, { "epoch": 0.47742448330683623, "grad_norm": 0.2532272934913635, "learning_rate": 2.686884649772167e-05, "loss": 0.4174, "num_input_tokens_seen": 11708260496, "step": 3003, "train_runtime": 119485.8071, "train_tokens_per_second": 97988.713 }, { "epoch": 0.47758346581875993, "grad_norm": 0.2628113925457001, "learning_rate": 2.685636895603868e-05, "loss": 0.405, "num_input_tokens_seen": 11712102765, "step": 3004, "train_runtime": 119525.0653, "train_tokens_per_second": 97988.675 }, { "epoch": 0.47774244833068363, "grad_norm": 0.21081280708312988, "learning_rate": 2.6843890949348376e-05, "loss": 0.3947, "num_input_tokens_seen": 11715982043, "step": 3005, "train_runtime": 119563.7614, "train_tokens_per_second": 97989.407 }, { "epoch": 0.47790143084260733, "grad_norm": 0.20557564496994019, "learning_rate": 2.6831412480776417e-05, "loss": 0.4088, "num_input_tokens_seen": 11719678642, "step": 3006, "train_runtime": 119603.9955, "train_tokens_per_second": 97987.351 }, { "epoch": 0.478060413354531, "grad_norm": 0.2172253578901291, "learning_rate": 2.681893355344858e-05, "loss": 0.4102, "num_input_tokens_seen": 11723535335, "step": 3007, "train_runtime": 119645.9933, "train_tokens_per_second": 97985.19 }, { "epoch": 0.4782193958664547, "grad_norm": 0.21227803826332092, "learning_rate": 2.680645417049074e-05, "loss": 0.4039, "num_input_tokens_seen": 11727556164, "step": 3008, "train_runtime": 119684.0257, "train_tokens_per_second": 97987.648 }, { "epoch": 0.4783783783783784, "grad_norm": 0.19871476292610168, "learning_rate": 2.6793974335028893e-05, "loss": 0.4084, "num_input_tokens_seen": 11731521621, "step": 3009, "train_runtime": 119723.8283, "train_tokens_per_second": 97988.193 }, { "epoch": 0.4785373608903021, "grad_norm": 0.2007964551448822, "learning_rate": 2.678149405018915e-05, "loss": 0.4048, "num_input_tokens_seen": 11735329560, "step": 3010, "train_runtime": 119764.3327, "train_tokens_per_second": 97986.849 }, { "epoch": 0.4786963434022258, "grad_norm": 0.20912882685661316, "learning_rate": 2.676901331909774e-05, "loss": 0.4034, "num_input_tokens_seen": 11739284673, "step": 3011, "train_runtime": 119803.635, "train_tokens_per_second": 97987.717 }, { "epoch": 0.4788553259141494, "grad_norm": 0.22579655051231384, "learning_rate": 2.6756532144880992e-05, "loss": 0.4103, "num_input_tokens_seen": 11743178623, "step": 3012, "train_runtime": 119844.1262, "train_tokens_per_second": 97987.102 }, { "epoch": 0.4790143084260731, "grad_norm": 0.25330889225006104, "learning_rate": 2.674405053066536e-05, "loss": 0.4056, "num_input_tokens_seen": 11747050363, "step": 3013, "train_runtime": 119882.5338, "train_tokens_per_second": 97988.005 }, { "epoch": 0.4791732909379968, "grad_norm": 0.19571855664253235, "learning_rate": 2.6731568479577386e-05, "loss": 0.418, "num_input_tokens_seen": 11750922351, "step": 3014, "train_runtime": 119921.775, "train_tokens_per_second": 97988.229 }, { "epoch": 0.4793322734499205, "grad_norm": 0.21976977586746216, "learning_rate": 2.6719085994743742e-05, "loss": 0.4104, "num_input_tokens_seen": 11754888586, "step": 3015, "train_runtime": 119961.0295, "train_tokens_per_second": 97989.227 }, { "epoch": 0.4794912559618442, "grad_norm": 0.2235933095216751, "learning_rate": 2.670660307929121e-05, "loss": 0.4141, "num_input_tokens_seen": 11758839217, "step": 3016, "train_runtime": 119999.3855, "train_tokens_per_second": 97990.829 }, { "epoch": 0.47965023847376786, "grad_norm": 0.19161903858184814, "learning_rate": 2.6694119736346667e-05, "loss": 0.4173, "num_input_tokens_seen": 11762773501, "step": 3017, "train_runtime": 120037.8353, "train_tokens_per_second": 97992.216 }, { "epoch": 0.47980922098569156, "grad_norm": 0.18409991264343262, "learning_rate": 2.66816359690371e-05, "loss": 0.4096, "num_input_tokens_seen": 11766688866, "step": 3018, "train_runtime": 120077.4077, "train_tokens_per_second": 97992.529 }, { "epoch": 0.47996820349761526, "grad_norm": 0.22812986373901367, "learning_rate": 2.6669151780489603e-05, "loss": 0.3969, "num_input_tokens_seen": 11770629896, "step": 3019, "train_runtime": 120117.8412, "train_tokens_per_second": 97992.353 }, { "epoch": 0.48012718600953896, "grad_norm": 0.19361041486263275, "learning_rate": 2.6656667173831386e-05, "loss": 0.4108, "num_input_tokens_seen": 11774503885, "step": 3020, "train_runtime": 120158.0291, "train_tokens_per_second": 97991.819 }, { "epoch": 0.48028616852146266, "grad_norm": 0.2227124273777008, "learning_rate": 2.664418215218974e-05, "loss": 0.4148, "num_input_tokens_seen": 11778256101, "step": 3021, "train_runtime": 120196.8585, "train_tokens_per_second": 97991.381 }, { "epoch": 0.4804451510333863, "grad_norm": 0.2341272532939911, "learning_rate": 2.663169671869209e-05, "loss": 0.3996, "num_input_tokens_seen": 11782189117, "step": 3022, "train_runtime": 120235.3274, "train_tokens_per_second": 97992.739 }, { "epoch": 0.48060413354531, "grad_norm": 0.18530669808387756, "learning_rate": 2.6619210876465938e-05, "loss": 0.4144, "num_input_tokens_seen": 11786187697, "step": 3023, "train_runtime": 120276.501, "train_tokens_per_second": 97992.439 }, { "epoch": 0.4807631160572337, "grad_norm": 0.19439533352851868, "learning_rate": 2.66067246286389e-05, "loss": 0.4007, "num_input_tokens_seen": 11789962986, "step": 3024, "train_runtime": 120313.1681, "train_tokens_per_second": 97993.953 }, { "epoch": 0.4809220985691574, "grad_norm": 0.1986853927373886, "learning_rate": 2.65942379783387e-05, "loss": 0.409, "num_input_tokens_seen": 11793864963, "step": 3025, "train_runtime": 120352.7763, "train_tokens_per_second": 97994.125 }, { "epoch": 0.4810810810810811, "grad_norm": 0.1907223016023636, "learning_rate": 2.658175092869316e-05, "loss": 0.4149, "num_input_tokens_seen": 11797855096, "step": 3026, "train_runtime": 120391.4807, "train_tokens_per_second": 97995.764 }, { "epoch": 0.48124006359300475, "grad_norm": 0.20357081294059753, "learning_rate": 2.6569263482830188e-05, "loss": 0.4078, "num_input_tokens_seen": 11801712342, "step": 3027, "train_runtime": 120429.0642, "train_tokens_per_second": 97997.21 }, { "epoch": 0.48139904610492845, "grad_norm": 0.19953344762325287, "learning_rate": 2.655677564387779e-05, "loss": 0.4065, "num_input_tokens_seen": 11805460523, "step": 3028, "train_runtime": 120467.445, "train_tokens_per_second": 97997.102 }, { "epoch": 0.48155802861685215, "grad_norm": 0.31040677428245544, "learning_rate": 2.6544287414964098e-05, "loss": 0.4175, "num_input_tokens_seen": 11809282981, "step": 3029, "train_runtime": 120507.3911, "train_tokens_per_second": 97996.338 }, { "epoch": 0.48171701112877585, "grad_norm": 0.19418710470199585, "learning_rate": 2.653179879921732e-05, "loss": 0.414, "num_input_tokens_seen": 11813285091, "step": 3030, "train_runtime": 120546.0993, "train_tokens_per_second": 97998.07 }, { "epoch": 0.4818759936406995, "grad_norm": 0.23569025099277496, "learning_rate": 2.6519309799765778e-05, "loss": 0.4118, "num_input_tokens_seen": 11817129874, "step": 3031, "train_runtime": 120587.5887, "train_tokens_per_second": 97996.237 }, { "epoch": 0.4820349761526232, "grad_norm": 0.3017585575580597, "learning_rate": 2.650682041973786e-05, "loss": 0.4168, "num_input_tokens_seen": 11821042927, "step": 3032, "train_runtime": 120629.3806, "train_tokens_per_second": 97994.725 }, { "epoch": 0.4821939586645469, "grad_norm": 0.19488239288330078, "learning_rate": 2.6494330662262075e-05, "loss": 0.4129, "num_input_tokens_seen": 11825033557, "step": 3033, "train_runtime": 120668.1722, "train_tokens_per_second": 97996.293 }, { "epoch": 0.4823529411764706, "grad_norm": 0.20514678955078125, "learning_rate": 2.6481840530467018e-05, "loss": 0.4037, "num_input_tokens_seen": 11829100334, "step": 3034, "train_runtime": 120703.4801, "train_tokens_per_second": 98001.32 }, { "epoch": 0.4825119236883943, "grad_norm": 0.20093181729316711, "learning_rate": 2.6469350027481382e-05, "loss": 0.4251, "num_input_tokens_seen": 11832877899, "step": 3035, "train_runtime": 120746.8824, "train_tokens_per_second": 97997.378 }, { "epoch": 0.48267090620031794, "grad_norm": 0.21056446433067322, "learning_rate": 2.6456859156433943e-05, "loss": 0.3947, "num_input_tokens_seen": 11836713656, "step": 3036, "train_runtime": 120786.2914, "train_tokens_per_second": 97997.161 }, { "epoch": 0.48282988871224164, "grad_norm": 0.2283869832754135, "learning_rate": 2.6444367920453584e-05, "loss": 0.416, "num_input_tokens_seen": 11840712376, "step": 3037, "train_runtime": 120826.7739, "train_tokens_per_second": 97997.422 }, { "epoch": 0.48298887122416534, "grad_norm": 0.18533053994178772, "learning_rate": 2.6431876322669264e-05, "loss": 0.4094, "num_input_tokens_seen": 11844462340, "step": 3038, "train_runtime": 120864.7969, "train_tokens_per_second": 97997.619 }, { "epoch": 0.48314785373608904, "grad_norm": 0.19107887148857117, "learning_rate": 2.6419384366210035e-05, "loss": 0.4119, "num_input_tokens_seen": 11848434609, "step": 3039, "train_runtime": 120903.4935, "train_tokens_per_second": 97999.109 }, { "epoch": 0.48330683624801274, "grad_norm": 0.1984696090221405, "learning_rate": 2.6406892054205068e-05, "loss": 0.4048, "num_input_tokens_seen": 11852361592, "step": 3040, "train_runtime": 120942.0836, "train_tokens_per_second": 98000.309 }, { "epoch": 0.4834658187599364, "grad_norm": 0.22571460902690887, "learning_rate": 2.6394399389783576e-05, "loss": 0.4078, "num_input_tokens_seen": 11856290017, "step": 3041, "train_runtime": 120979.6749, "train_tokens_per_second": 98002.33 }, { "epoch": 0.4836248012718601, "grad_norm": 0.19797095656394958, "learning_rate": 2.6381906376074896e-05, "loss": 0.3954, "num_input_tokens_seen": 11860169086, "step": 3042, "train_runtime": 121016.5295, "train_tokens_per_second": 98004.538 }, { "epoch": 0.4837837837837838, "grad_norm": 0.20691275596618652, "learning_rate": 2.636941301620842e-05, "loss": 0.4112, "num_input_tokens_seen": 11864032563, "step": 3043, "train_runtime": 121056.4532, "train_tokens_per_second": 98004.132 }, { "epoch": 0.4839427662957075, "grad_norm": 0.48961812257766724, "learning_rate": 2.6356919313313676e-05, "loss": 0.4075, "num_input_tokens_seen": 11867936275, "step": 3044, "train_runtime": 121094.9729, "train_tokens_per_second": 98005.194 }, { "epoch": 0.4841017488076312, "grad_norm": 1.437163233757019, "learning_rate": 2.6344425270520222e-05, "loss": 0.4126, "num_input_tokens_seen": 11871834914, "step": 3045, "train_runtime": 121134.3715, "train_tokens_per_second": 98005.502 }, { "epoch": 0.4842607313195548, "grad_norm": 0.2176293134689331, "learning_rate": 2.633193089095774e-05, "loss": 0.4144, "num_input_tokens_seen": 11875783702, "step": 3046, "train_runtime": 121173.6549, "train_tokens_per_second": 98006.318 }, { "epoch": 0.4844197138314785, "grad_norm": 0.19939447939395905, "learning_rate": 2.631943617775598e-05, "loss": 0.4161, "num_input_tokens_seen": 11879591954, "step": 3047, "train_runtime": 121209.8682, "train_tokens_per_second": 98008.455 }, { "epoch": 0.4845786963434022, "grad_norm": 0.21213990449905396, "learning_rate": 2.6306941134044783e-05, "loss": 0.4077, "num_input_tokens_seen": 11883514757, "step": 3048, "train_runtime": 121248.0656, "train_tokens_per_second": 98009.933 }, { "epoch": 0.4847376788553259, "grad_norm": 0.2525310218334198, "learning_rate": 2.629444576295407e-05, "loss": 0.3962, "num_input_tokens_seen": 11887376062, "step": 3049, "train_runtime": 121284.5963, "train_tokens_per_second": 98012.249 }, { "epoch": 0.4848966613672496, "grad_norm": 0.20378237962722778, "learning_rate": 2.6281950067613843e-05, "loss": 0.4101, "num_input_tokens_seen": 11891188362, "step": 3050, "train_runtime": 121323.1569, "train_tokens_per_second": 98012.52 }, { "epoch": 0.48505564387917327, "grad_norm": 0.19392208755016327, "learning_rate": 2.626945405115418e-05, "loss": 0.4022, "num_input_tokens_seen": 11895179858, "step": 3051, "train_runtime": 121363.7216, "train_tokens_per_second": 98012.649 }, { "epoch": 0.48521462639109697, "grad_norm": 0.21020235121250153, "learning_rate": 2.6256957716705245e-05, "loss": 0.4004, "num_input_tokens_seen": 11899114982, "step": 3052, "train_runtime": 121402.2001, "train_tokens_per_second": 98013.998 }, { "epoch": 0.48537360890302067, "grad_norm": 0.2428434193134308, "learning_rate": 2.6244461067397286e-05, "loss": 0.4078, "num_input_tokens_seen": 11903145233, "step": 3053, "train_runtime": 121442.6856, "train_tokens_per_second": 98014.509 }, { "epoch": 0.48553259141494437, "grad_norm": 0.28219887614250183, "learning_rate": 2.6231964106360616e-05, "loss": 0.41, "num_input_tokens_seen": 11906935264, "step": 3054, "train_runtime": 121482.9787, "train_tokens_per_second": 98013.198 }, { "epoch": 0.48569157392686807, "grad_norm": 0.2072434425354004, "learning_rate": 2.6219466836725653e-05, "loss": 0.4151, "num_input_tokens_seen": 11910837093, "step": 3055, "train_runtime": 121520.7733, "train_tokens_per_second": 98014.823 }, { "epoch": 0.4858505564387917, "grad_norm": 0.23835711181163788, "learning_rate": 2.6206969261622853e-05, "loss": 0.4211, "num_input_tokens_seen": 11914661315, "step": 3056, "train_runtime": 121559.5744, "train_tokens_per_second": 98014.997 }, { "epoch": 0.4860095389507154, "grad_norm": 0.8120005130767822, "learning_rate": 2.619447138418279e-05, "loss": 0.4223, "num_input_tokens_seen": 11918624824, "step": 3057, "train_runtime": 121599.1069, "train_tokens_per_second": 98015.727 }, { "epoch": 0.4861685214626391, "grad_norm": 0.18963444232940674, "learning_rate": 2.6181973207536093e-05, "loss": 0.4185, "num_input_tokens_seen": 11922507593, "step": 3058, "train_runtime": 121639.2853, "train_tokens_per_second": 98015.272 }, { "epoch": 0.4863275039745628, "grad_norm": 0.19297674298286438, "learning_rate": 2.6169474734813455e-05, "loss": 0.4006, "num_input_tokens_seen": 11926472564, "step": 3059, "train_runtime": 121677.5245, "train_tokens_per_second": 98017.055 }, { "epoch": 0.4864864864864865, "grad_norm": 0.1912756711244583, "learning_rate": 2.6156975969145654e-05, "loss": 0.395, "num_input_tokens_seen": 11930349251, "step": 3060, "train_runtime": 121718.8117, "train_tokens_per_second": 98015.657 }, { "epoch": 0.48664546899841016, "grad_norm": 0.19869881868362427, "learning_rate": 2.6144476913663552e-05, "loss": 0.3925, "num_input_tokens_seen": 11934216811, "step": 3061, "train_runtime": 121760.2141, "train_tokens_per_second": 98014.092 }, { "epoch": 0.48680445151033386, "grad_norm": 0.195643812417984, "learning_rate": 2.6131977571498073e-05, "loss": 0.4002, "num_input_tokens_seen": 11938129421, "step": 3062, "train_runtime": 121802.1401, "train_tokens_per_second": 98012.477 }, { "epoch": 0.48696343402225756, "grad_norm": 0.199337899684906, "learning_rate": 2.61194779457802e-05, "loss": 0.4102, "num_input_tokens_seen": 11941971129, "step": 3063, "train_runtime": 121841.7164, "train_tokens_per_second": 98012.171 }, { "epoch": 0.48712241653418126, "grad_norm": 0.2216518670320511, "learning_rate": 2.6106978039641022e-05, "loss": 0.4089, "num_input_tokens_seen": 11945793004, "step": 3064, "train_runtime": 121882.5164, "train_tokens_per_second": 98010.718 }, { "epoch": 0.4872813990461049, "grad_norm": 0.18643707036972046, "learning_rate": 2.6094477856211663e-05, "loss": 0.403, "num_input_tokens_seen": 11949739146, "step": 3065, "train_runtime": 121921.7747, "train_tokens_per_second": 98011.526 }, { "epoch": 0.4874403815580286, "grad_norm": 0.20641690492630005, "learning_rate": 2.6081977398623343e-05, "loss": 0.4002, "num_input_tokens_seen": 11953578689, "step": 3066, "train_runtime": 121957.6072, "train_tokens_per_second": 98014.211 }, { "epoch": 0.4875993640699523, "grad_norm": 0.1890469342470169, "learning_rate": 2.6069476670007326e-05, "loss": 0.4138, "num_input_tokens_seen": 11957510359, "step": 3067, "train_runtime": 121993.9089, "train_tokens_per_second": 98017.274 }, { "epoch": 0.487758346581876, "grad_norm": 0.24024127423763275, "learning_rate": 2.605697567349496e-05, "loss": 0.4213, "num_input_tokens_seen": 11961449211, "step": 3068, "train_runtime": 122033.4357, "train_tokens_per_second": 98017.803 }, { "epoch": 0.4879173290937997, "grad_norm": 0.18229185044765472, "learning_rate": 2.604447441221765e-05, "loss": 0.4149, "num_input_tokens_seen": 11965315291, "step": 3069, "train_runtime": 122073.4848, "train_tokens_per_second": 98017.316 }, { "epoch": 0.48807631160572335, "grad_norm": 0.19280776381492615, "learning_rate": 2.603197288930689e-05, "loss": 0.3969, "num_input_tokens_seen": 11969218227, "step": 3070, "train_runtime": 122114.1938, "train_tokens_per_second": 98016.601 }, { "epoch": 0.48823529411764705, "grad_norm": 0.21208401024341583, "learning_rate": 2.601947110789421e-05, "loss": 0.4006, "num_input_tokens_seen": 11973036129, "step": 3071, "train_runtime": 122152.1328, "train_tokens_per_second": 98017.414 }, { "epoch": 0.48839427662957074, "grad_norm": 0.2573983073234558, "learning_rate": 2.600696907111122e-05, "loss": 0.4006, "num_input_tokens_seen": 11976932233, "step": 3072, "train_runtime": 122196.1168, "train_tokens_per_second": 98014.017 }, { "epoch": 0.48855325914149444, "grad_norm": 0.2515972852706909, "learning_rate": 2.5994466782089593e-05, "loss": 0.4263, "num_input_tokens_seen": 11980937491, "step": 3073, "train_runtime": 122236.179, "train_tokens_per_second": 98014.66 }, { "epoch": 0.48871224165341814, "grad_norm": 0.34863269329071045, "learning_rate": 2.5981964243961072e-05, "loss": 0.4102, "num_input_tokens_seen": 11984826254, "step": 3074, "train_runtime": 122276.3861, "train_tokens_per_second": 98014.233 }, { "epoch": 0.4888712241653418, "grad_norm": 0.18777869641780853, "learning_rate": 2.5969461459857436e-05, "loss": 0.3954, "num_input_tokens_seen": 11988596488, "step": 3075, "train_runtime": 122315.9408, "train_tokens_per_second": 98013.361 }, { "epoch": 0.4890302066772655, "grad_norm": 0.201775923371315, "learning_rate": 2.5956958432910568e-05, "loss": 0.4065, "num_input_tokens_seen": 11992435376, "step": 3076, "train_runtime": 122355.5299, "train_tokens_per_second": 98013.023 }, { "epoch": 0.4891891891891892, "grad_norm": 0.22392235696315765, "learning_rate": 2.5944455166252363e-05, "loss": 0.4109, "num_input_tokens_seen": 11996428361, "step": 3077, "train_runtime": 122393.4436, "train_tokens_per_second": 98015.286 }, { "epoch": 0.4893481717011129, "grad_norm": 0.1983828991651535, "learning_rate": 2.5931951663014814e-05, "loss": 0.4102, "num_input_tokens_seen": 12000292596, "step": 3078, "train_runtime": 122433.695, "train_tokens_per_second": 98014.624 }, { "epoch": 0.4895071542130366, "grad_norm": 5.612316131591797, "learning_rate": 2.591944792632996e-05, "loss": 0.4222, "num_input_tokens_seen": 12004217368, "step": 3079, "train_runtime": 122473.9761, "train_tokens_per_second": 98014.433 }, { "epoch": 0.48966613672496023, "grad_norm": 0.27741682529449463, "learning_rate": 2.59069439593299e-05, "loss": 0.4087, "num_input_tokens_seen": 12008125939, "step": 3080, "train_runtime": 122512.7516, "train_tokens_per_second": 98015.315 }, { "epoch": 0.48982511923688393, "grad_norm": 0.1976679414510727, "learning_rate": 2.5894439765146783e-05, "loss": 0.392, "num_input_tokens_seen": 12012090280, "step": 3081, "train_runtime": 122550.7315, "train_tokens_per_second": 98017.287 }, { "epoch": 0.48998410174880763, "grad_norm": 0.2192000448703766, "learning_rate": 2.5881935346912833e-05, "loss": 0.3977, "num_input_tokens_seen": 12015864324, "step": 3082, "train_runtime": 122587.9973, "train_tokens_per_second": 98018.277 }, { "epoch": 0.49014308426073133, "grad_norm": 0.4729969799518585, "learning_rate": 2.58694307077603e-05, "loss": 0.4048, "num_input_tokens_seen": 12019668442, "step": 3083, "train_runtime": 122630.9324, "train_tokens_per_second": 98014.98 }, { "epoch": 0.49030206677265503, "grad_norm": 0.20083928108215332, "learning_rate": 2.5856925850821524e-05, "loss": 0.4178, "num_input_tokens_seen": 12023637250, "step": 3084, "train_runtime": 122670.7589, "train_tokens_per_second": 98015.512 }, { "epoch": 0.4904610492845787, "grad_norm": 0.19067174196243286, "learning_rate": 2.5844420779228886e-05, "loss": 0.4053, "num_input_tokens_seen": 12027613780, "step": 3085, "train_runtime": 122708.9466, "train_tokens_per_second": 98017.415 }, { "epoch": 0.4906200317965024, "grad_norm": 0.291839599609375, "learning_rate": 2.5831915496114794e-05, "loss": 0.4008, "num_input_tokens_seen": 12031399503, "step": 3086, "train_runtime": 122749.2191, "train_tokens_per_second": 98016.098 }, { "epoch": 0.4907790143084261, "grad_norm": 0.1987738013267517, "learning_rate": 2.5819410004611755e-05, "loss": 0.4134, "num_input_tokens_seen": 12035251067, "step": 3087, "train_runtime": 122789.5179, "train_tokens_per_second": 98015.297 }, { "epoch": 0.4909379968203498, "grad_norm": 0.22286689281463623, "learning_rate": 2.5806904307852298e-05, "loss": 0.408, "num_input_tokens_seen": 12039313866, "step": 3088, "train_runtime": 122829.0818, "train_tokens_per_second": 98016.803 }, { "epoch": 0.4910969793322735, "grad_norm": 0.28015080094337463, "learning_rate": 2.5794398408969024e-05, "loss": 0.4138, "num_input_tokens_seen": 12043125652, "step": 3089, "train_runtime": 122867.6807, "train_tokens_per_second": 98017.034 }, { "epoch": 0.4912559618441971, "grad_norm": 0.19456036388874054, "learning_rate": 2.5781892311094557e-05, "loss": 0.4069, "num_input_tokens_seen": 12046972855, "step": 3090, "train_runtime": 122906.4271, "train_tokens_per_second": 98017.436 }, { "epoch": 0.4914149443561208, "grad_norm": 0.19854968786239624, "learning_rate": 2.5769386017361586e-05, "loss": 0.4103, "num_input_tokens_seen": 12050960974, "step": 3091, "train_runtime": 122947.6864, "train_tokens_per_second": 98016.98 }, { "epoch": 0.4915739268680445, "grad_norm": 0.22549153864383698, "learning_rate": 2.5756879530902857e-05, "loss": 0.4011, "num_input_tokens_seen": 12055022092, "step": 3092, "train_runtime": 122987.1311, "train_tokens_per_second": 98018.565 }, { "epoch": 0.4917329093799682, "grad_norm": 0.19571498036384583, "learning_rate": 2.574437285485115e-05, "loss": 0.4182, "num_input_tokens_seen": 12058797707, "step": 3093, "train_runtime": 123024.7769, "train_tokens_per_second": 98019.261 }, { "epoch": 0.4918918918918919, "grad_norm": 0.22197410464286804, "learning_rate": 2.5731865992339316e-05, "loss": 0.4091, "num_input_tokens_seen": 12062584685, "step": 3094, "train_runtime": 123063.7229, "train_tokens_per_second": 98019.013 }, { "epoch": 0.49205087440381556, "grad_norm": 0.19557887315750122, "learning_rate": 2.571935894650021e-05, "loss": 0.4187, "num_input_tokens_seen": 12066509357, "step": 3095, "train_runtime": 123102.0578, "train_tokens_per_second": 98020.371 }, { "epoch": 0.49220985691573926, "grad_norm": 0.1811646819114685, "learning_rate": 2.5706851720466772e-05, "loss": 0.4011, "num_input_tokens_seen": 12070426644, "step": 3096, "train_runtime": 123144.8026, "train_tokens_per_second": 98018.157 }, { "epoch": 0.49236883942766296, "grad_norm": 0.19583538174629211, "learning_rate": 2.5694344317371982e-05, "loss": 0.4201, "num_input_tokens_seen": 12074247689, "step": 3097, "train_runtime": 123184.2233, "train_tokens_per_second": 98017.809 }, { "epoch": 0.49252782193958666, "grad_norm": 0.18547338247299194, "learning_rate": 2.568183674034884e-05, "loss": 0.4088, "num_input_tokens_seen": 12078102031, "step": 3098, "train_runtime": 123224.3291, "train_tokens_per_second": 98017.186 }, { "epoch": 0.4926868044515103, "grad_norm": 0.2030184417963028, "learning_rate": 2.5669328992530417e-05, "loss": 0.4062, "num_input_tokens_seen": 12082034464, "step": 3099, "train_runtime": 123262.8417, "train_tokens_per_second": 98018.464 }, { "epoch": 0.492845786963434, "grad_norm": 0.18240858614444733, "learning_rate": 2.5656821077049802e-05, "loss": 0.4124, "num_input_tokens_seen": 12085788547, "step": 3100, "train_runtime": 123301.057, "train_tokens_per_second": 98018.532 }, { "epoch": 0.4930047694753577, "grad_norm": 0.1787053495645523, "learning_rate": 2.5644312997040155e-05, "loss": 0.4082, "num_input_tokens_seen": 12089748190, "step": 3101, "train_runtime": 123340.5888, "train_tokens_per_second": 98019.219 }, { "epoch": 0.4931637519872814, "grad_norm": 0.1694810539484024, "learning_rate": 2.563180475563465e-05, "loss": 0.4089, "num_input_tokens_seen": 12093730726, "step": 3102, "train_runtime": 123377.9191, "train_tokens_per_second": 98021.841 }, { "epoch": 0.4933227344992051, "grad_norm": 0.21290864050388336, "learning_rate": 2.5619296355966527e-05, "loss": 0.412, "num_input_tokens_seen": 12097566642, "step": 3103, "train_runtime": 123416.4377, "train_tokens_per_second": 98022.329 }, { "epoch": 0.49348171701112875, "grad_norm": 0.20210151374340057, "learning_rate": 2.5606787801169042e-05, "loss": 0.4039, "num_input_tokens_seen": 12101455622, "step": 3104, "train_runtime": 123454.1074, "train_tokens_per_second": 98023.921 }, { "epoch": 0.49364069952305245, "grad_norm": 0.19915147125720978, "learning_rate": 2.5594279094375495e-05, "loss": 0.4107, "num_input_tokens_seen": 12105441711, "step": 3105, "train_runtime": 123495.3154, "train_tokens_per_second": 98023.489 }, { "epoch": 0.49379968203497615, "grad_norm": 0.1972646862268448, "learning_rate": 2.558177023871924e-05, "loss": 0.4115, "num_input_tokens_seen": 12109338584, "step": 3106, "train_runtime": 123535.3427, "train_tokens_per_second": 98023.273 }, { "epoch": 0.49395866454689985, "grad_norm": 0.18544042110443115, "learning_rate": 2.556926123733364e-05, "loss": 0.4104, "num_input_tokens_seen": 12113236063, "step": 3107, "train_runtime": 123574.484, "train_tokens_per_second": 98023.764 }, { "epoch": 0.49411764705882355, "grad_norm": 0.21476538479328156, "learning_rate": 2.555675209335214e-05, "loss": 0.4077, "num_input_tokens_seen": 12117176986, "step": 3108, "train_runtime": 123613.6881, "train_tokens_per_second": 98024.557 }, { "epoch": 0.4942766295707472, "grad_norm": 0.5681371092796326, "learning_rate": 2.5544242809908157e-05, "loss": 0.4259, "num_input_tokens_seen": 12121209917, "step": 3109, "train_runtime": 123654.8551, "train_tokens_per_second": 98024.537 }, { "epoch": 0.4944356120826709, "grad_norm": 0.19738398492336273, "learning_rate": 2.5531733390135205e-05, "loss": 0.4019, "num_input_tokens_seen": 12125048091, "step": 3110, "train_runtime": 123693.9737, "train_tokens_per_second": 98024.566 }, { "epoch": 0.4945945945945946, "grad_norm": 0.24770726263523102, "learning_rate": 2.5519223837166793e-05, "loss": 0.4047, "num_input_tokens_seen": 12128844863, "step": 3111, "train_runtime": 123733.1096, "train_tokens_per_second": 98024.247 }, { "epoch": 0.4947535771065183, "grad_norm": 0.19625656306743622, "learning_rate": 2.5506714154136485e-05, "loss": 0.4048, "num_input_tokens_seen": 12132739391, "step": 3112, "train_runtime": 123771.1734, "train_tokens_per_second": 98025.567 }, { "epoch": 0.494912559618442, "grad_norm": 0.1933165341615677, "learning_rate": 2.5494204344177857e-05, "loss": 0.3997, "num_input_tokens_seen": 12136754120, "step": 3113, "train_runtime": 123810.8168, "train_tokens_per_second": 98026.606 }, { "epoch": 0.49507154213036564, "grad_norm": 0.3170150816440582, "learning_rate": 2.548169441042454e-05, "loss": 0.3999, "num_input_tokens_seen": 12140548980, "step": 3114, "train_runtime": 123848.2719, "train_tokens_per_second": 98027.601 }, { "epoch": 0.49523052464228934, "grad_norm": 0.18499748408794403, "learning_rate": 2.5469184356010167e-05, "loss": 0.4086, "num_input_tokens_seen": 12144445179, "step": 3115, "train_runtime": 123889.5971, "train_tokens_per_second": 98026.351 }, { "epoch": 0.49538950715421304, "grad_norm": 0.2191588580608368, "learning_rate": 2.545667418406843e-05, "loss": 0.3958, "num_input_tokens_seen": 12148472442, "step": 3116, "train_runtime": 123927.444, "train_tokens_per_second": 98028.912 }, { "epoch": 0.49554848966613674, "grad_norm": 0.1794114112854004, "learning_rate": 2.544416389773304e-05, "loss": 0.411, "num_input_tokens_seen": 12152364268, "step": 3117, "train_runtime": 123962.8004, "train_tokens_per_second": 98032.347 }, { "epoch": 0.49570747217806044, "grad_norm": 0.19246649742126465, "learning_rate": 2.5431653500137736e-05, "loss": 0.4066, "num_input_tokens_seen": 12156157201, "step": 3118, "train_runtime": 123999.3422, "train_tokens_per_second": 98034.046 }, { "epoch": 0.4958664546899841, "grad_norm": 0.19956187903881073, "learning_rate": 2.5419142994416274e-05, "loss": 0.4145, "num_input_tokens_seen": 12159986736, "step": 3119, "train_runtime": 124038.9026, "train_tokens_per_second": 98033.653 }, { "epoch": 0.4960254372019078, "grad_norm": 0.24010400474071503, "learning_rate": 2.540663238370245e-05, "loss": 0.4024, "num_input_tokens_seen": 12164075229, "step": 3120, "train_runtime": 124078.1164, "train_tokens_per_second": 98035.621 }, { "epoch": 0.4961844197138315, "grad_norm": 0.2113117128610611, "learning_rate": 2.53941216711301e-05, "loss": 0.3942, "num_input_tokens_seen": 12167954183, "step": 3121, "train_runtime": 124115.4626, "train_tokens_per_second": 98037.375 }, { "epoch": 0.4963434022257552, "grad_norm": 0.24878861010074615, "learning_rate": 2.538161085983305e-05, "loss": 0.4126, "num_input_tokens_seen": 12171801132, "step": 3122, "train_runtime": 124154.5066, "train_tokens_per_second": 98037.53 }, { "epoch": 0.4965023847376789, "grad_norm": 0.29130277037620544, "learning_rate": 2.5369099952945176e-05, "loss": 0.4075, "num_input_tokens_seen": 12175691881, "step": 3123, "train_runtime": 124194.7027, "train_tokens_per_second": 98037.127 }, { "epoch": 0.4966613672496025, "grad_norm": 0.3446498513221741, "learning_rate": 2.535658895360037e-05, "loss": 0.4124, "num_input_tokens_seen": 12179640861, "step": 3124, "train_runtime": 124231.3227, "train_tokens_per_second": 98040.016 }, { "epoch": 0.4968203497615262, "grad_norm": 0.2911662757396698, "learning_rate": 2.5344077864932546e-05, "loss": 0.411, "num_input_tokens_seen": 12183645672, "step": 3125, "train_runtime": 124269.8524, "train_tokens_per_second": 98041.845 }, { "epoch": 0.4969793322734499, "grad_norm": 0.21235157549381256, "learning_rate": 2.5331566690075665e-05, "loss": 0.4228, "num_input_tokens_seen": 12187395244, "step": 3126, "train_runtime": 124310.0609, "train_tokens_per_second": 98040.297 }, { "epoch": 0.4971383147853736, "grad_norm": 0.2820274829864502, "learning_rate": 2.5319055432163657e-05, "loss": 0.4215, "num_input_tokens_seen": 12191317579, "step": 3127, "train_runtime": 124351.1811, "train_tokens_per_second": 98039.419 }, { "epoch": 0.4972972972972973, "grad_norm": 0.1984335035085678, "learning_rate": 2.530654409433052e-05, "loss": 0.3943, "num_input_tokens_seen": 12195330801, "step": 3128, "train_runtime": 124391.2326, "train_tokens_per_second": 98040.115 }, { "epoch": 0.49745627980922097, "grad_norm": 0.2050645351409912, "learning_rate": 2.5294032679710255e-05, "loss": 0.4054, "num_input_tokens_seen": 12199252117, "step": 3129, "train_runtime": 124428.5168, "train_tokens_per_second": 98042.253 }, { "epoch": 0.49761526232114467, "grad_norm": 0.20060007274150848, "learning_rate": 2.5281521191436886e-05, "loss": 0.4109, "num_input_tokens_seen": 12203023645, "step": 3130, "train_runtime": 124469.2026, "train_tokens_per_second": 98040.506 }, { "epoch": 0.49777424483306837, "grad_norm": 0.2058439999818802, "learning_rate": 2.5269009632644442e-05, "loss": 0.3972, "num_input_tokens_seen": 12206947186, "step": 3131, "train_runtime": 124506.8608, "train_tokens_per_second": 98042.366 }, { "epoch": 0.49793322734499207, "grad_norm": 0.22423215210437775, "learning_rate": 2.5256498006466987e-05, "loss": 0.4019, "num_input_tokens_seen": 12210990358, "step": 3132, "train_runtime": 124546.2684, "train_tokens_per_second": 98043.807 }, { "epoch": 0.4980922098569157, "grad_norm": 0.21123088896274567, "learning_rate": 2.5243986316038593e-05, "loss": 0.4231, "num_input_tokens_seen": 12214735759, "step": 3133, "train_runtime": 124590.0993, "train_tokens_per_second": 98039.377 }, { "epoch": 0.4982511923688394, "grad_norm": 1.6875075101852417, "learning_rate": 2.523147456449335e-05, "loss": 0.4139, "num_input_tokens_seen": 12218686531, "step": 3134, "train_runtime": 124626.8924, "train_tokens_per_second": 98042.134 }, { "epoch": 0.4984101748807631, "grad_norm": 0.2258339524269104, "learning_rate": 2.5218962754965368e-05, "loss": 0.4172, "num_input_tokens_seen": 12222582512, "step": 3135, "train_runtime": 124666.2222, "train_tokens_per_second": 98042.455 }, { "epoch": 0.4985691573926868, "grad_norm": 0.22337469458580017, "learning_rate": 2.5206450890588764e-05, "loss": 0.41, "num_input_tokens_seen": 12226494594, "step": 3136, "train_runtime": 124705.8319, "train_tokens_per_second": 98042.685 }, { "epoch": 0.4987281399046105, "grad_norm": 0.2229273021221161, "learning_rate": 2.519393897449767e-05, "loss": 0.4055, "num_input_tokens_seen": 12230481474, "step": 3137, "train_runtime": 124745.3124, "train_tokens_per_second": 98043.616 }, { "epoch": 0.49888712241653416, "grad_norm": 0.19777674973011017, "learning_rate": 2.518142700982623e-05, "loss": 0.3998, "num_input_tokens_seen": 12234412488, "step": 3138, "train_runtime": 124783.8973, "train_tokens_per_second": 98044.802 }, { "epoch": 0.49904610492845786, "grad_norm": 0.22191138565540314, "learning_rate": 2.516891499970861e-05, "loss": 0.4053, "num_input_tokens_seen": 12238286854, "step": 3139, "train_runtime": 124822.3823, "train_tokens_per_second": 98045.612 }, { "epoch": 0.49920508744038156, "grad_norm": 0.25495263934135437, "learning_rate": 2.5156402947278972e-05, "loss": 0.4179, "num_input_tokens_seen": 12242076935, "step": 3140, "train_runtime": 124861.5173, "train_tokens_per_second": 98045.236 }, { "epoch": 0.49936406995230526, "grad_norm": 0.2347296178340912, "learning_rate": 2.5143890855671505e-05, "loss": 0.409, "num_input_tokens_seen": 12246120314, "step": 3141, "train_runtime": 124901.4379, "train_tokens_per_second": 98046.272 }, { "epoch": 0.49952305246422896, "grad_norm": 0.2159462720155716, "learning_rate": 2.5131378728020388e-05, "loss": 0.4271, "num_input_tokens_seen": 12249994005, "step": 3142, "train_runtime": 124939.7783, "train_tokens_per_second": 98047.189 }, { "epoch": 0.4996820349761526, "grad_norm": 0.20740234851837158, "learning_rate": 2.511886656745983e-05, "loss": 0.4189, "num_input_tokens_seen": 12253792198, "step": 3143, "train_runtime": 124979.1317, "train_tokens_per_second": 98046.706 }, { "epoch": 0.4998410174880763, "grad_norm": 0.21278509497642517, "learning_rate": 2.5106354377124047e-05, "loss": 0.4122, "num_input_tokens_seen": 12257760273, "step": 3144, "train_runtime": 125018.8954, "train_tokens_per_second": 98047.261 }, { "epoch": 0.5, "grad_norm": 0.22498415410518646, "learning_rate": 2.509384216014724e-05, "loss": 0.4094, "num_input_tokens_seen": 12261545675, "step": 3145, "train_runtime": 125057.9228, "train_tokens_per_second": 98046.932 }, { "epoch": 0.5001589825119237, "grad_norm": 0.2090299129486084, "learning_rate": 2.5081329919663626e-05, "loss": 0.4057, "num_input_tokens_seen": 12265427159, "step": 3146, "train_runtime": 125097.2553, "train_tokens_per_second": 98047.132 }, { "epoch": 0.5003179650238474, "grad_norm": 0.21355731785297394, "learning_rate": 2.5068817658807447e-05, "loss": 0.4087, "num_input_tokens_seen": 12269435512, "step": 3147, "train_runtime": 125136.8489, "train_tokens_per_second": 98048.142 }, { "epoch": 0.5004769475357711, "grad_norm": 0.21199245750904083, "learning_rate": 2.5056305380712937e-05, "loss": 0.3963, "num_input_tokens_seen": 12273421148, "step": 3148, "train_runtime": 125175.411, "train_tokens_per_second": 98049.777 }, { "epoch": 0.5006359300476948, "grad_norm": 0.41091957688331604, "learning_rate": 2.5043793088514323e-05, "loss": 0.4228, "num_input_tokens_seen": 12277233110, "step": 3149, "train_runtime": 125213.9062, "train_tokens_per_second": 98050.077 }, { "epoch": 0.5007949125596184, "grad_norm": 0.22239059209823608, "learning_rate": 2.5031280785345846e-05, "loss": 0.398, "num_input_tokens_seen": 12281073736, "step": 3150, "train_runtime": 125254.2083, "train_tokens_per_second": 98049.191 }, { "epoch": 0.5009538950715421, "grad_norm": 0.2055506408214569, "learning_rate": 2.501876847434175e-05, "loss": 0.4042, "num_input_tokens_seen": 12285039686, "step": 3151, "train_runtime": 125294.3143, "train_tokens_per_second": 98049.459 }, { "epoch": 0.5011128775834658, "grad_norm": 0.21829043328762054, "learning_rate": 2.5006256158636292e-05, "loss": 0.4096, "num_input_tokens_seen": 12288986009, "step": 3152, "train_runtime": 125331.2935, "train_tokens_per_second": 98052.016 }, { "epoch": 0.5012718600953895, "grad_norm": 0.21728438138961792, "learning_rate": 2.499374384136371e-05, "loss": 0.4109, "num_input_tokens_seen": 12292724633, "step": 3153, "train_runtime": 125373.0565, "train_tokens_per_second": 98049.174 }, { "epoch": 0.5014308426073132, "grad_norm": 0.20999477803707123, "learning_rate": 2.4981231525658248e-05, "loss": 0.3986, "num_input_tokens_seen": 12296603540, "step": 3154, "train_runtime": 125411.2393, "train_tokens_per_second": 98050.251 }, { "epoch": 0.5015898251192369, "grad_norm": 0.2699524462223053, "learning_rate": 2.496871921465416e-05, "loss": 0.4068, "num_input_tokens_seen": 12300559946, "step": 3155, "train_runtime": 125450.6681, "train_tokens_per_second": 98050.972 }, { "epoch": 0.5017488076311606, "grad_norm": 0.2191333770751953, "learning_rate": 2.495620691148569e-05, "loss": 0.4075, "num_input_tokens_seen": 12304426406, "step": 3156, "train_runtime": 125490.4477, "train_tokens_per_second": 98050.701 }, { "epoch": 0.5019077901430843, "grad_norm": 0.20212116837501526, "learning_rate": 2.4943694619287065e-05, "loss": 0.4042, "num_input_tokens_seen": 12308283174, "step": 3157, "train_runtime": 125530.8297, "train_tokens_per_second": 98049.883 }, { "epoch": 0.502066772655008, "grad_norm": 0.2225043773651123, "learning_rate": 2.4931182341192556e-05, "loss": 0.4106, "num_input_tokens_seen": 12312192422, "step": 3158, "train_runtime": 125570.0391, "train_tokens_per_second": 98050.399 }, { "epoch": 0.5022257551669317, "grad_norm": 0.2670263648033142, "learning_rate": 2.4918670080336376e-05, "loss": 0.41, "num_input_tokens_seen": 12316020150, "step": 3159, "train_runtime": 125609.9032, "train_tokens_per_second": 98049.754 }, { "epoch": 0.5023847376788553, "grad_norm": 0.2096012830734253, "learning_rate": 2.490615783985277e-05, "loss": 0.4017, "num_input_tokens_seen": 12319936730, "step": 3160, "train_runtime": 125647.0784, "train_tokens_per_second": 98051.916 }, { "epoch": 0.502543720190779, "grad_norm": 0.24148698151111603, "learning_rate": 2.4893645622875962e-05, "loss": 0.4095, "num_input_tokens_seen": 12323877890, "step": 3161, "train_runtime": 125689.0042, "train_tokens_per_second": 98050.565 }, { "epoch": 0.5027027027027027, "grad_norm": 0.2591533064842224, "learning_rate": 2.4881133432540168e-05, "loss": 0.4013, "num_input_tokens_seen": 12327809656, "step": 3162, "train_runtime": 125728.7831, "train_tokens_per_second": 98050.815 }, { "epoch": 0.5028616852146264, "grad_norm": 0.2193414717912674, "learning_rate": 2.4868621271979618e-05, "loss": 0.4086, "num_input_tokens_seen": 12331729408, "step": 3163, "train_runtime": 125768.6762, "train_tokens_per_second": 98050.88 }, { "epoch": 0.5030206677265501, "grad_norm": 0.2307102531194687, "learning_rate": 2.485610914432851e-05, "loss": 0.425, "num_input_tokens_seen": 12335583744, "step": 3164, "train_runtime": 125807.7732, "train_tokens_per_second": 98051.046 }, { "epoch": 0.5031796502384738, "grad_norm": 0.21447615325450897, "learning_rate": 2.4843597052721034e-05, "loss": 0.41, "num_input_tokens_seen": 12339453696, "step": 3165, "train_runtime": 125846.9515, "train_tokens_per_second": 98051.272 }, { "epoch": 0.5033386327503975, "grad_norm": 0.209181010723114, "learning_rate": 2.4831085000291402e-05, "loss": 0.4137, "num_input_tokens_seen": 12343456133, "step": 3166, "train_runtime": 125887.5072, "train_tokens_per_second": 98051.478 }, { "epoch": 0.5034976152623212, "grad_norm": 0.23216688632965088, "learning_rate": 2.4818572990173773e-05, "loss": 0.4062, "num_input_tokens_seen": 12347349389, "step": 3167, "train_runtime": 125924.6931, "train_tokens_per_second": 98053.44 }, { "epoch": 0.5036565977742449, "grad_norm": 0.6188234686851501, "learning_rate": 2.4806061025502335e-05, "loss": 0.4128, "num_input_tokens_seen": 12351287392, "step": 3168, "train_runtime": 125963.3852, "train_tokens_per_second": 98054.584 }, { "epoch": 0.5038155802861686, "grad_norm": 0.2404913753271103, "learning_rate": 2.4793549109411242e-05, "loss": 0.3988, "num_input_tokens_seen": 12355240482, "step": 3169, "train_runtime": 126003.1435, "train_tokens_per_second": 98055.018 }, { "epoch": 0.5039745627980922, "grad_norm": 0.2786354720592499, "learning_rate": 2.4781037245034634e-05, "loss": 0.4029, "num_input_tokens_seen": 12359101013, "step": 3170, "train_runtime": 126043.2156, "train_tokens_per_second": 98054.472 }, { "epoch": 0.5041335453100159, "grad_norm": 0.2608959972858429, "learning_rate": 2.4768525435506655e-05, "loss": 0.405, "num_input_tokens_seen": 12362924916, "step": 3171, "train_runtime": 126080.3518, "train_tokens_per_second": 98055.92 }, { "epoch": 0.5042925278219396, "grad_norm": 0.23588523268699646, "learning_rate": 2.4756013683961406e-05, "loss": 0.412, "num_input_tokens_seen": 12366797431, "step": 3172, "train_runtime": 126121.4058, "train_tokens_per_second": 98054.706 }, { "epoch": 0.5044515103338633, "grad_norm": 0.3382149636745453, "learning_rate": 2.474350199353302e-05, "loss": 0.4083, "num_input_tokens_seen": 12370698363, "step": 3173, "train_runtime": 126161.2119, "train_tokens_per_second": 98054.689 }, { "epoch": 0.504610492845787, "grad_norm": 0.21176780760288239, "learning_rate": 2.473099036735557e-05, "loss": 0.4109, "num_input_tokens_seen": 12374639820, "step": 3174, "train_runtime": 126199.8934, "train_tokens_per_second": 98055.866 }, { "epoch": 0.5047694753577107, "grad_norm": 0.22385142743587494, "learning_rate": 2.4718478808563117e-05, "loss": 0.4159, "num_input_tokens_seen": 12378540322, "step": 3175, "train_runtime": 126238.7861, "train_tokens_per_second": 98056.554 }, { "epoch": 0.5049284578696344, "grad_norm": 0.22239470481872559, "learning_rate": 2.470596732028975e-05, "loss": 0.4066, "num_input_tokens_seen": 12382371379, "step": 3176, "train_runtime": 126277.6372, "train_tokens_per_second": 98056.724 }, { "epoch": 0.5050874403815581, "grad_norm": 0.25232166051864624, "learning_rate": 2.469345590566948e-05, "loss": 0.4155, "num_input_tokens_seen": 12386256146, "step": 3177, "train_runtime": 126315.9997, "train_tokens_per_second": 98057.698 }, { "epoch": 0.5052464228934818, "grad_norm": 0.3023361265659332, "learning_rate": 2.468094456783635e-05, "loss": 0.4014, "num_input_tokens_seen": 12390117041, "step": 3178, "train_runtime": 126354.1352, "train_tokens_per_second": 98058.659 }, { "epoch": 0.5054054054054054, "grad_norm": 0.23251745104789734, "learning_rate": 2.466843330992435e-05, "loss": 0.3946, "num_input_tokens_seen": 12394064200, "step": 3179, "train_runtime": 126391.2826, "train_tokens_per_second": 98061.068 }, { "epoch": 0.505564387917329, "grad_norm": 0.21794191002845764, "learning_rate": 2.465592213506745e-05, "loss": 0.412, "num_input_tokens_seen": 12397898344, "step": 3180, "train_runtime": 126432.3841, "train_tokens_per_second": 98059.516 }, { "epoch": 0.5057233704292527, "grad_norm": 0.2639566659927368, "learning_rate": 2.4643411046399637e-05, "loss": 0.4073, "num_input_tokens_seen": 12401832676, "step": 3181, "train_runtime": 126472.1363, "train_tokens_per_second": 98059.802 }, { "epoch": 0.5058823529411764, "grad_norm": 0.22901086509227753, "learning_rate": 2.4630900047054837e-05, "loss": 0.4166, "num_input_tokens_seen": 12405681648, "step": 3182, "train_runtime": 126510.5785, "train_tokens_per_second": 98060.429 }, { "epoch": 0.5060413354531001, "grad_norm": 0.2502426207065582, "learning_rate": 2.4618389140166952e-05, "loss": 0.4125, "num_input_tokens_seen": 12409596407, "step": 3183, "train_runtime": 126549.7484, "train_tokens_per_second": 98061.012 }, { "epoch": 0.5062003179650238, "grad_norm": 0.28796660900115967, "learning_rate": 2.4605878328869907e-05, "loss": 0.4031, "num_input_tokens_seen": 12413587422, "step": 3184, "train_runtime": 126594.2443, "train_tokens_per_second": 98058.071 }, { "epoch": 0.5063593004769475, "grad_norm": 0.23762358725070953, "learning_rate": 2.4593367616297548e-05, "loss": 0.4116, "num_input_tokens_seen": 12417444715, "step": 3185, "train_runtime": 126634.7638, "train_tokens_per_second": 98057.155 }, { "epoch": 0.5065182829888712, "grad_norm": 0.2002575397491455, "learning_rate": 2.4580857005583732e-05, "loss": 0.4052, "num_input_tokens_seen": 12421369131, "step": 3186, "train_runtime": 126673.8814, "train_tokens_per_second": 98057.855 }, { "epoch": 0.506677265500795, "grad_norm": 0.19582389295101166, "learning_rate": 2.4568346499862276e-05, "loss": 0.4031, "num_input_tokens_seen": 12425281591, "step": 3187, "train_runtime": 126710.5878, "train_tokens_per_second": 98060.326 }, { "epoch": 0.5068362480127186, "grad_norm": 0.3410726487636566, "learning_rate": 2.455583610226696e-05, "loss": 0.4041, "num_input_tokens_seen": 12429253137, "step": 3188, "train_runtime": 126747.6788, "train_tokens_per_second": 98062.965 }, { "epoch": 0.5069952305246422, "grad_norm": 0.19349855184555054, "learning_rate": 2.4543325815931576e-05, "loss": 0.4181, "num_input_tokens_seen": 12433007094, "step": 3189, "train_runtime": 126787.7738, "train_tokens_per_second": 98061.562 }, { "epoch": 0.5071542130365659, "grad_norm": 0.2157949060201645, "learning_rate": 2.4530815643989832e-05, "loss": 0.4068, "num_input_tokens_seen": 12436933132, "step": 3190, "train_runtime": 126827.5012, "train_tokens_per_second": 98061.801 }, { "epoch": 0.5073131955484896, "grad_norm": 0.19165316224098206, "learning_rate": 2.4518305589575466e-05, "loss": 0.3964, "num_input_tokens_seen": 12440925981, "step": 3191, "train_runtime": 126868.158, "train_tokens_per_second": 98061.848 }, { "epoch": 0.5074721780604133, "grad_norm": 0.25737112760543823, "learning_rate": 2.450579565582215e-05, "loss": 0.3888, "num_input_tokens_seen": 12444855448, "step": 3192, "train_runtime": 126908.0916, "train_tokens_per_second": 98061.954 }, { "epoch": 0.507631160572337, "grad_norm": 0.19320182502269745, "learning_rate": 2.449328584586352e-05, "loss": 0.3996, "num_input_tokens_seen": 12448746664, "step": 3193, "train_runtime": 126945.9519, "train_tokens_per_second": 98063.361 }, { "epoch": 0.5077901430842607, "grad_norm": 0.21793532371520996, "learning_rate": 2.448077616283321e-05, "loss": 0.3997, "num_input_tokens_seen": 12452587367, "step": 3194, "train_runtime": 126986.608, "train_tokens_per_second": 98062.21 }, { "epoch": 0.5079491255961844, "grad_norm": 0.2256234735250473, "learning_rate": 2.4468266609864794e-05, "loss": 0.4022, "num_input_tokens_seen": 12456449391, "step": 3195, "train_runtime": 127025.1651, "train_tokens_per_second": 98062.847 }, { "epoch": 0.5081081081081081, "grad_norm": 0.1916537582874298, "learning_rate": 2.4455757190091845e-05, "loss": 0.4051, "num_input_tokens_seen": 12460378121, "step": 3196, "train_runtime": 127065.8882, "train_tokens_per_second": 98062.338 }, { "epoch": 0.5082670906200318, "grad_norm": 0.18460704386234283, "learning_rate": 2.4443247906647873e-05, "loss": 0.4062, "num_input_tokens_seen": 12464273242, "step": 3197, "train_runtime": 127106.6175, "train_tokens_per_second": 98061.56 }, { "epoch": 0.5084260731319555, "grad_norm": 0.20686902105808258, "learning_rate": 2.4430738762666355e-05, "loss": 0.4041, "num_input_tokens_seen": 12468038765, "step": 3198, "train_runtime": 127146.6073, "train_tokens_per_second": 98060.334 }, { "epoch": 0.5085850556438791, "grad_norm": 0.43329668045043945, "learning_rate": 2.4418229761280767e-05, "loss": 0.4084, "num_input_tokens_seen": 12471988131, "step": 3199, "train_runtime": 127185.889, "train_tokens_per_second": 98061.1 }, { "epoch": 0.5087440381558028, "grad_norm": 0.1757301241159439, "learning_rate": 2.4405720905624507e-05, "loss": 0.4072, "num_input_tokens_seen": 12475913694, "step": 3200, "train_runtime": 127227.5498, "train_tokens_per_second": 98059.844 }, { "epoch": 0.5089030206677265, "grad_norm": 0.1973833292722702, "learning_rate": 2.4393212198830967e-05, "loss": 0.4216, "num_input_tokens_seen": 12479835350, "step": 3201, "train_runtime": 127359.1648, "train_tokens_per_second": 97989.3 }, { "epoch": 0.5090620031796502, "grad_norm": 0.3338612914085388, "learning_rate": 2.4380703644033482e-05, "loss": 0.4069, "num_input_tokens_seen": 12483703635, "step": 3202, "train_runtime": 127396.6213, "train_tokens_per_second": 97990.853 }, { "epoch": 0.5092209856915739, "grad_norm": 0.24029086530208588, "learning_rate": 2.4368195244365347e-05, "loss": 0.4206, "num_input_tokens_seen": 12487584876, "step": 3203, "train_runtime": 127437.6871, "train_tokens_per_second": 97989.733 }, { "epoch": 0.5093799682034976, "grad_norm": 0.19151915609836578, "learning_rate": 2.435568700295985e-05, "loss": 0.4142, "num_input_tokens_seen": 12491445746, "step": 3204, "train_runtime": 127475.4599, "train_tokens_per_second": 97990.984 }, { "epoch": 0.5095389507154213, "grad_norm": 0.18095871806144714, "learning_rate": 2.4343178922950204e-05, "loss": 0.4095, "num_input_tokens_seen": 12495315223, "step": 3205, "train_runtime": 127514.272, "train_tokens_per_second": 97991.503 }, { "epoch": 0.509697933227345, "grad_norm": 0.2059636414051056, "learning_rate": 2.4330671007469592e-05, "loss": 0.4081, "num_input_tokens_seen": 12499164085, "step": 3206, "train_runtime": 127553.0557, "train_tokens_per_second": 97991.883 }, { "epoch": 0.5098569157392687, "grad_norm": 0.2198101431131363, "learning_rate": 2.4318163259651166e-05, "loss": 0.401, "num_input_tokens_seen": 12503163503, "step": 3207, "train_runtime": 127590.2358, "train_tokens_per_second": 97994.674 }, { "epoch": 0.5100158982511924, "grad_norm": 0.1693221926689148, "learning_rate": 2.4305655682628023e-05, "loss": 0.3994, "num_input_tokens_seen": 12507097012, "step": 3208, "train_runtime": 127628.659, "train_tokens_per_second": 97995.992 }, { "epoch": 0.510174880763116, "grad_norm": 0.23362556099891663, "learning_rate": 2.429314827953323e-05, "loss": 0.405, "num_input_tokens_seen": 12511059999, "step": 3209, "train_runtime": 127667.6234, "train_tokens_per_second": 97997.125 }, { "epoch": 0.5103338632750397, "grad_norm": 0.1960223913192749, "learning_rate": 2.4280641053499802e-05, "loss": 0.4081, "num_input_tokens_seen": 12514953960, "step": 3210, "train_runtime": 127709.7346, "train_tokens_per_second": 97995.302 }, { "epoch": 0.5104928457869634, "grad_norm": 0.2459215521812439, "learning_rate": 2.4268134007660693e-05, "loss": 0.4035, "num_input_tokens_seen": 12518772979, "step": 3211, "train_runtime": 127749.2708, "train_tokens_per_second": 97994.868 }, { "epoch": 0.5106518282988871, "grad_norm": 0.17974086105823517, "learning_rate": 2.4255627145148853e-05, "loss": 0.4113, "num_input_tokens_seen": 12522735730, "step": 3212, "train_runtime": 127789.0447, "train_tokens_per_second": 97995.378 }, { "epoch": 0.5108108108108108, "grad_norm": 0.20517811179161072, "learning_rate": 2.4243120469097142e-05, "loss": 0.4001, "num_input_tokens_seen": 12526668760, "step": 3213, "train_runtime": 127830.675, "train_tokens_per_second": 97994.232 }, { "epoch": 0.5109697933227345, "grad_norm": 0.1704142540693283, "learning_rate": 2.4230613982638416e-05, "loss": 0.3971, "num_input_tokens_seen": 12530558873, "step": 3214, "train_runtime": 127870.0074, "train_tokens_per_second": 97994.511 }, { "epoch": 0.5111287758346582, "grad_norm": 0.2154741734266281, "learning_rate": 2.4218107688905452e-05, "loss": 0.4025, "num_input_tokens_seen": 12534560503, "step": 3215, "train_runtime": 127910.0896, "train_tokens_per_second": 97995.088 }, { "epoch": 0.5112877583465819, "grad_norm": 0.20580634474754333, "learning_rate": 2.420560159103098e-05, "loss": 0.4063, "num_input_tokens_seen": 12538405284, "step": 3216, "train_runtime": 127948.8442, "train_tokens_per_second": 97995.456 }, { "epoch": 0.5114467408585056, "grad_norm": 0.19578227400779724, "learning_rate": 2.4193095692147704e-05, "loss": 0.4044, "num_input_tokens_seen": 12542374271, "step": 3217, "train_runtime": 127989.9048, "train_tokens_per_second": 97995.028 }, { "epoch": 0.5116057233704292, "grad_norm": 0.19319979846477509, "learning_rate": 2.4180589995388244e-05, "loss": 0.4039, "num_input_tokens_seen": 12546212761, "step": 3218, "train_runtime": 128031.0191, "train_tokens_per_second": 97993.54 }, { "epoch": 0.5117647058823529, "grad_norm": 0.22291605174541473, "learning_rate": 2.416808450388521e-05, "loss": 0.4168, "num_input_tokens_seen": 12550067487, "step": 3219, "train_runtime": 128070.8181, "train_tokens_per_second": 97993.186 }, { "epoch": 0.5119236883942766, "grad_norm": 0.23983918130397797, "learning_rate": 2.415557922077113e-05, "loss": 0.4104, "num_input_tokens_seen": 12554015241, "step": 3220, "train_runtime": 128110.2823, "train_tokens_per_second": 97993.815 }, { "epoch": 0.5120826709062003, "grad_norm": 0.2056696116924286, "learning_rate": 2.4143074149178475e-05, "loss": 0.3954, "num_input_tokens_seen": 12557965946, "step": 3221, "train_runtime": 128149.9165, "train_tokens_per_second": 97994.336 }, { "epoch": 0.512241653418124, "grad_norm": 0.17673514783382416, "learning_rate": 2.4130569292239703e-05, "loss": 0.3942, "num_input_tokens_seen": 12561833270, "step": 3222, "train_runtime": 128186.5573, "train_tokens_per_second": 97996.495 }, { "epoch": 0.5124006359300477, "grad_norm": 0.20831623673439026, "learning_rate": 2.411806465308718e-05, "loss": 0.4124, "num_input_tokens_seen": 12565816325, "step": 3223, "train_runtime": 128225.3904, "train_tokens_per_second": 97997.879 }, { "epoch": 0.5125596184419714, "grad_norm": 0.1903272569179535, "learning_rate": 2.4105560234853223e-05, "loss": 0.3992, "num_input_tokens_seen": 12569622694, "step": 3224, "train_runtime": 128264.1715, "train_tokens_per_second": 97997.925 }, { "epoch": 0.5127186009538951, "grad_norm": 0.21893978118896484, "learning_rate": 2.4093056040670112e-05, "loss": 0.4049, "num_input_tokens_seen": 12573566910, "step": 3225, "train_runtime": 128300.2633, "train_tokens_per_second": 98001.1 }, { "epoch": 0.5128775834658188, "grad_norm": 0.21494051814079285, "learning_rate": 2.4080552073670043e-05, "loss": 0.4039, "num_input_tokens_seen": 12577497604, "step": 3226, "train_runtime": 128340.27, "train_tokens_per_second": 98001.178 }, { "epoch": 0.5130365659777425, "grad_norm": 0.19402359426021576, "learning_rate": 2.4068048336985192e-05, "loss": 0.409, "num_input_tokens_seen": 12581293660, "step": 3227, "train_runtime": 128379.526, "train_tokens_per_second": 98000.78 }, { "epoch": 0.5131955484896661, "grad_norm": 0.18531128764152527, "learning_rate": 2.405554483374765e-05, "loss": 0.3976, "num_input_tokens_seen": 12585287565, "step": 3228, "train_runtime": 128419.3272, "train_tokens_per_second": 98001.507 }, { "epoch": 0.5133545310015898, "grad_norm": 0.1984296441078186, "learning_rate": 2.404304156708944e-05, "loss": 0.4026, "num_input_tokens_seen": 12589218584, "step": 3229, "train_runtime": 128457.9637, "train_tokens_per_second": 98002.632 }, { "epoch": 0.5135135135135135, "grad_norm": 0.20471306145191193, "learning_rate": 2.403053854014257e-05, "loss": 0.4031, "num_input_tokens_seen": 12593118095, "step": 3230, "train_runtime": 128498.9126, "train_tokens_per_second": 98001.748 }, { "epoch": 0.5136724960254372, "grad_norm": 0.17953169345855713, "learning_rate": 2.4018035756038937e-05, "loss": 0.4036, "num_input_tokens_seen": 12596960255, "step": 3231, "train_runtime": 128538.4073, "train_tokens_per_second": 98001.527 }, { "epoch": 0.5138314785373609, "grad_norm": 0.22131486237049103, "learning_rate": 2.4005533217910413e-05, "loss": 0.4034, "num_input_tokens_seen": 12600912029, "step": 3232, "train_runtime": 128580.132, "train_tokens_per_second": 98000.46 }, { "epoch": 0.5139904610492846, "grad_norm": 0.2352144569158554, "learning_rate": 2.399303092888879e-05, "loss": 0.4056, "num_input_tokens_seen": 12604785344, "step": 3233, "train_runtime": 128618.0377, "train_tokens_per_second": 98001.692 }, { "epoch": 0.5141494435612083, "grad_norm": 0.1963152438402176, "learning_rate": 2.3980528892105795e-05, "loss": 0.4084, "num_input_tokens_seen": 12608689972, "step": 3234, "train_runtime": 128659.5523, "train_tokens_per_second": 98000.418 }, { "epoch": 0.514308426073132, "grad_norm": 0.19389142096042633, "learning_rate": 2.3968027110693117e-05, "loss": 0.4065, "num_input_tokens_seen": 12612595679, "step": 3235, "train_runtime": 128698.6514, "train_tokens_per_second": 98000.993 }, { "epoch": 0.5144674085850557, "grad_norm": 0.1768852174282074, "learning_rate": 2.3955525587782345e-05, "loss": 0.3993, "num_input_tokens_seen": 12616511488, "step": 3236, "train_runtime": 128739.5814, "train_tokens_per_second": 98000.253 }, { "epoch": 0.5146263910969794, "grad_norm": 0.17820261418819427, "learning_rate": 2.3943024326505048e-05, "loss": 0.3926, "num_input_tokens_seen": 12620451394, "step": 3237, "train_runtime": 128779.9328, "train_tokens_per_second": 98000.14 }, { "epoch": 0.514785373608903, "grad_norm": 0.34554943442344666, "learning_rate": 2.3930523329992683e-05, "loss": 0.4033, "num_input_tokens_seen": 12624340459, "step": 3238, "train_runtime": 128819.6213, "train_tokens_per_second": 98000.136 }, { "epoch": 0.5149443561208267, "grad_norm": 0.20688004791736603, "learning_rate": 2.3918022601376663e-05, "loss": 0.3945, "num_input_tokens_seen": 12628178413, "step": 3239, "train_runtime": 128859.0105, "train_tokens_per_second": 97999.964 }, { "epoch": 0.5151033386327504, "grad_norm": 0.16792157292366028, "learning_rate": 2.3905522143788342e-05, "loss": 0.3965, "num_input_tokens_seen": 12632036616, "step": 3240, "train_runtime": 128899.474, "train_tokens_per_second": 97999.132 }, { "epoch": 0.5152623211446741, "grad_norm": 0.2094193696975708, "learning_rate": 2.3893021960358987e-05, "loss": 0.3966, "num_input_tokens_seen": 12635980980, "step": 3241, "train_runtime": 128939.3302, "train_tokens_per_second": 97999.431 }, { "epoch": 0.5154213036565978, "grad_norm": 0.17544814944267273, "learning_rate": 2.3880522054219802e-05, "loss": 0.4057, "num_input_tokens_seen": 12639950905, "step": 3242, "train_runtime": 128980.6386, "train_tokens_per_second": 97998.824 }, { "epoch": 0.5155802861685215, "grad_norm": 0.17806801199913025, "learning_rate": 2.386802242850194e-05, "loss": 0.4121, "num_input_tokens_seen": 12643840885, "step": 3243, "train_runtime": 129021.7415, "train_tokens_per_second": 97997.754 }, { "epoch": 0.5157392686804452, "grad_norm": 0.18363600969314575, "learning_rate": 2.385552308633645e-05, "loss": 0.4151, "num_input_tokens_seen": 12647677566, "step": 3244, "train_runtime": 129059.5926, "train_tokens_per_second": 97998.741 }, { "epoch": 0.5158982511923689, "grad_norm": 0.22146061062812805, "learning_rate": 2.3843024030854352e-05, "loss": 0.4009, "num_input_tokens_seen": 12651555098, "step": 3245, "train_runtime": 129101.2917, "train_tokens_per_second": 97997.122 }, { "epoch": 0.5160572337042926, "grad_norm": 0.19182361662387848, "learning_rate": 2.3830525265186558e-05, "loss": 0.3942, "num_input_tokens_seen": 12655473933, "step": 3246, "train_runtime": 129137.5464, "train_tokens_per_second": 97999.957 }, { "epoch": 0.5162162162162162, "grad_norm": 0.18799442052841187, "learning_rate": 2.3818026792463916e-05, "loss": 0.4151, "num_input_tokens_seen": 12659391415, "step": 3247, "train_runtime": 129176.3115, "train_tokens_per_second": 98000.874 }, { "epoch": 0.5163751987281399, "grad_norm": 0.1995520442724228, "learning_rate": 2.3805528615817213e-05, "loss": 0.4064, "num_input_tokens_seen": 12663222557, "step": 3248, "train_runtime": 129216.8578, "train_tokens_per_second": 97999.772 }, { "epoch": 0.5165341812400636, "grad_norm": 0.18686461448669434, "learning_rate": 2.3793030738377142e-05, "loss": 0.3975, "num_input_tokens_seen": 12667199719, "step": 3249, "train_runtime": 129253.8298, "train_tokens_per_second": 98002.51 }, { "epoch": 0.5166931637519873, "grad_norm": 0.2378966212272644, "learning_rate": 2.3780533163274353e-05, "loss": 0.402, "num_input_tokens_seen": 12671131483, "step": 3250, "train_runtime": 129290.7463, "train_tokens_per_second": 98004.937 }, { "epoch": 0.516852146263911, "grad_norm": 0.20566965639591217, "learning_rate": 2.376803589363939e-05, "loss": 0.4, "num_input_tokens_seen": 12674887900, "step": 3251, "train_runtime": 129329.5244, "train_tokens_per_second": 98004.597 }, { "epoch": 0.5170111287758347, "grad_norm": 0.2054501175880432, "learning_rate": 2.375553893260272e-05, "loss": 0.4141, "num_input_tokens_seen": 12678863481, "step": 3252, "train_runtime": 129369.1232, "train_tokens_per_second": 98005.329 }, { "epoch": 0.5171701112877584, "grad_norm": 0.19124607741832733, "learning_rate": 2.3743042283294765e-05, "loss": 0.4053, "num_input_tokens_seen": 12682746764, "step": 3253, "train_runtime": 129408.6013, "train_tokens_per_second": 98005.439 }, { "epoch": 0.5173290937996821, "grad_norm": 0.20054782927036285, "learning_rate": 2.373054594884583e-05, "loss": 0.406, "num_input_tokens_seen": 12686747661, "step": 3254, "train_runtime": 129444.4529, "train_tokens_per_second": 98009.203 }, { "epoch": 0.5174880763116058, "grad_norm": 0.2036776840686798, "learning_rate": 2.3718049932386167e-05, "loss": 0.3925, "num_input_tokens_seen": 12690557246, "step": 3255, "train_runtime": 129484.695, "train_tokens_per_second": 98008.164 }, { "epoch": 0.5176470588235295, "grad_norm": 0.19324886798858643, "learning_rate": 2.370555423704593e-05, "loss": 0.3988, "num_input_tokens_seen": 12694508853, "step": 3256, "train_runtime": 129522.4157, "train_tokens_per_second": 98010.13 }, { "epoch": 0.517806041335453, "grad_norm": 0.19831129908561707, "learning_rate": 2.3693058865955213e-05, "loss": 0.3988, "num_input_tokens_seen": 12698402325, "step": 3257, "train_runtime": 129562.1955, "train_tokens_per_second": 98010.089 }, { "epoch": 0.5179650238473767, "grad_norm": 0.2014850527048111, "learning_rate": 2.3680563822244024e-05, "loss": 0.4088, "num_input_tokens_seen": 12702179070, "step": 3258, "train_runtime": 129602.9873, "train_tokens_per_second": 98008.382 }, { "epoch": 0.5181240063593004, "grad_norm": 0.20607133209705353, "learning_rate": 2.366806910904226e-05, "loss": 0.3927, "num_input_tokens_seen": 12706045800, "step": 3259, "train_runtime": 129642.1977, "train_tokens_per_second": 98008.565 }, { "epoch": 0.5182829888712241, "grad_norm": 0.27061864733695984, "learning_rate": 2.3655574729479783e-05, "loss": 0.4007, "num_input_tokens_seen": 12709963645, "step": 3260, "train_runtime": 129681.3238, "train_tokens_per_second": 98009.207 }, { "epoch": 0.5184419713831478, "grad_norm": 0.190857395529747, "learning_rate": 2.3643080686686333e-05, "loss": 0.4043, "num_input_tokens_seen": 12713848495, "step": 3261, "train_runtime": 129718.9003, "train_tokens_per_second": 98010.764 }, { "epoch": 0.5186009538950715, "grad_norm": 0.1844693124294281, "learning_rate": 2.3630586983791582e-05, "loss": 0.3932, "num_input_tokens_seen": 12717676089, "step": 3262, "train_runtime": 129758.5849, "train_tokens_per_second": 98010.287 }, { "epoch": 0.5187599364069952, "grad_norm": 0.1945173144340515, "learning_rate": 2.361809362392511e-05, "loss": 0.403, "num_input_tokens_seen": 12721583183, "step": 3263, "train_runtime": 129798.8341, "train_tokens_per_second": 98009.996 }, { "epoch": 0.518918918918919, "grad_norm": 0.22064772248268127, "learning_rate": 2.360560061021643e-05, "loss": 0.4131, "num_input_tokens_seen": 12725607448, "step": 3264, "train_runtime": 129838.8497, "train_tokens_per_second": 98010.784 }, { "epoch": 0.5190779014308426, "grad_norm": 0.21147669851779938, "learning_rate": 2.3593107945794938e-05, "loss": 0.4087, "num_input_tokens_seen": 12729348811, "step": 3265, "train_runtime": 129877.101, "train_tokens_per_second": 98010.725 }, { "epoch": 0.5192368839427663, "grad_norm": 0.19325652718544006, "learning_rate": 2.3580615633789967e-05, "loss": 0.4021, "num_input_tokens_seen": 12733207290, "step": 3266, "train_runtime": 129918.2584, "train_tokens_per_second": 98009.375 }, { "epoch": 0.5193958664546899, "grad_norm": 0.18318161368370056, "learning_rate": 2.356812367733074e-05, "loss": 0.3782, "num_input_tokens_seen": 12737186322, "step": 3267, "train_runtime": 129955.8807, "train_tokens_per_second": 98011.619 }, { "epoch": 0.5195548489666136, "grad_norm": 0.2730632424354553, "learning_rate": 2.3555632079546425e-05, "loss": 0.3956, "num_input_tokens_seen": 12741021541, "step": 3268, "train_runtime": 129995.0738, "train_tokens_per_second": 98011.572 }, { "epoch": 0.5197138314785373, "grad_norm": 0.22061514854431152, "learning_rate": 2.3543140843566066e-05, "loss": 0.3987, "num_input_tokens_seen": 12744879541, "step": 3269, "train_runtime": 130034.4091, "train_tokens_per_second": 98011.593 }, { "epoch": 0.519872813990461, "grad_norm": 0.2204616367816925, "learning_rate": 2.353064997251862e-05, "loss": 0.4108, "num_input_tokens_seen": 12748862428, "step": 3270, "train_runtime": 130074.9928, "train_tokens_per_second": 98011.633 }, { "epoch": 0.5200317965023847, "grad_norm": 0.217095285654068, "learning_rate": 2.3518159469532984e-05, "loss": 0.3959, "num_input_tokens_seen": 12752636990, "step": 3271, "train_runtime": 130113.5175, "train_tokens_per_second": 98011.623 }, { "epoch": 0.5201907790143084, "grad_norm": 0.2022862583398819, "learning_rate": 2.3505669337737928e-05, "loss": 0.3984, "num_input_tokens_seen": 12756521757, "step": 3272, "train_runtime": 130153.5242, "train_tokens_per_second": 98011.343 }, { "epoch": 0.5203497615262321, "grad_norm": 0.22586843371391296, "learning_rate": 2.3493179580262146e-05, "loss": 0.4062, "num_input_tokens_seen": 12760359033, "step": 3273, "train_runtime": 130191.805, "train_tokens_per_second": 98011.999 }, { "epoch": 0.5205087440381558, "grad_norm": 0.22069062292575836, "learning_rate": 2.348069020023423e-05, "loss": 0.4114, "num_input_tokens_seen": 12764203101, "step": 3274, "train_runtime": 130233.2534, "train_tokens_per_second": 98010.322 }, { "epoch": 0.5206677265500795, "grad_norm": 0.2011738419532776, "learning_rate": 2.3468201200782675e-05, "loss": 0.3938, "num_input_tokens_seen": 12768104575, "step": 3275, "train_runtime": 130271.4425, "train_tokens_per_second": 98011.539 }, { "epoch": 0.5208267090620032, "grad_norm": 0.21107907593250275, "learning_rate": 2.3455712585035905e-05, "loss": 0.4147, "num_input_tokens_seen": 12772152776, "step": 3276, "train_runtime": 130313.0665, "train_tokens_per_second": 98011.298 }, { "epoch": 0.5209856915739268, "grad_norm": 0.19571556150913239, "learning_rate": 2.344322435612221e-05, "loss": 0.4045, "num_input_tokens_seen": 12776003876, "step": 3277, "train_runtime": 130352.2395, "train_tokens_per_second": 98011.388 }, { "epoch": 0.5211446740858505, "grad_norm": 0.268625408411026, "learning_rate": 2.343073651716982e-05, "loss": 0.3939, "num_input_tokens_seen": 12779900552, "step": 3278, "train_runtime": 130392.0298, "train_tokens_per_second": 98011.363 }, { "epoch": 0.5213036565977742, "grad_norm": 0.19526073336601257, "learning_rate": 2.3418249071306848e-05, "loss": 0.401, "num_input_tokens_seen": 12783834266, "step": 3279, "train_runtime": 130430.4288, "train_tokens_per_second": 98012.668 }, { "epoch": 0.5214626391096979, "grad_norm": 0.25501549243927, "learning_rate": 2.3405762021661297e-05, "loss": 0.3956, "num_input_tokens_seen": 12787778660, "step": 3280, "train_runtime": 130470.4415, "train_tokens_per_second": 98012.841 }, { "epoch": 0.5216216216216216, "grad_norm": 0.2052144855260849, "learning_rate": 2.3393275371361104e-05, "loss": 0.4151, "num_input_tokens_seen": 12791638181, "step": 3281, "train_runtime": 130510.1101, "train_tokens_per_second": 98012.623 }, { "epoch": 0.5217806041335453, "grad_norm": 0.20332208275794983, "learning_rate": 2.3380789123534075e-05, "loss": 0.4011, "num_input_tokens_seen": 12795470620, "step": 3282, "train_runtime": 130546.9409, "train_tokens_per_second": 98014.328 }, { "epoch": 0.521939586645469, "grad_norm": 0.19737839698791504, "learning_rate": 2.3368303281307915e-05, "loss": 0.4037, "num_input_tokens_seen": 12799405801, "step": 3283, "train_runtime": 130586.8917, "train_tokens_per_second": 98014.476 }, { "epoch": 0.5220985691573927, "grad_norm": 0.19269587099552155, "learning_rate": 2.3355817847810267e-05, "loss": 0.4014, "num_input_tokens_seen": 12803212865, "step": 3284, "train_runtime": 130626.1419, "train_tokens_per_second": 98014.17 }, { "epoch": 0.5222575516693164, "grad_norm": 0.2716123163700104, "learning_rate": 2.3343332826168616e-05, "loss": 0.4036, "num_input_tokens_seen": 12807143074, "step": 3285, "train_runtime": 130663.2465, "train_tokens_per_second": 98016.416 }, { "epoch": 0.52241653418124, "grad_norm": 0.35660165548324585, "learning_rate": 2.33308482195104e-05, "loss": 0.4121, "num_input_tokens_seen": 12811031439, "step": 3286, "train_runtime": 130703.0204, "train_tokens_per_second": 98016.338 }, { "epoch": 0.5225755166931637, "grad_norm": 0.23808161914348602, "learning_rate": 2.3318364030962908e-05, "loss": 0.3988, "num_input_tokens_seen": 12814987791, "step": 3287, "train_runtime": 130740.2638, "train_tokens_per_second": 98018.678 }, { "epoch": 0.5227344992050874, "grad_norm": 0.22418919205665588, "learning_rate": 2.330588026365334e-05, "loss": 0.4148, "num_input_tokens_seen": 12818886973, "step": 3288, "train_runtime": 130779.2626, "train_tokens_per_second": 98019.263 }, { "epoch": 0.5228934817170111, "grad_norm": 0.18021072447299957, "learning_rate": 2.3293396920708797e-05, "loss": 0.4022, "num_input_tokens_seen": 12822747770, "step": 3289, "train_runtime": 130818.7787, "train_tokens_per_second": 98019.167 }, { "epoch": 0.5230524642289348, "grad_norm": 0.2453639805316925, "learning_rate": 2.3280914005256254e-05, "loss": 0.3988, "num_input_tokens_seen": 12826728212, "step": 3290, "train_runtime": 130857.811, "train_tokens_per_second": 98020.348 }, { "epoch": 0.5232114467408585, "grad_norm": 0.17717809975147247, "learning_rate": 2.326843152042262e-05, "loss": 0.4027, "num_input_tokens_seen": 12830660022, "step": 3291, "train_runtime": 130898.5052, "train_tokens_per_second": 98019.912 }, { "epoch": 0.5233704292527822, "grad_norm": 0.20502837002277374, "learning_rate": 2.3255949469334654e-05, "loss": 0.404, "num_input_tokens_seen": 12834506857, "step": 3292, "train_runtime": 130936.7435, "train_tokens_per_second": 98020.666 }, { "epoch": 0.5235294117647059, "grad_norm": 0.18323810398578644, "learning_rate": 2.3243467855119007e-05, "loss": 0.4041, "num_input_tokens_seen": 12838401279, "step": 3293, "train_runtime": 130975.2845, "train_tokens_per_second": 98021.557 }, { "epoch": 0.5236883942766296, "grad_norm": 0.2002183496952057, "learning_rate": 2.3230986680902265e-05, "loss": 0.406, "num_input_tokens_seen": 12842218285, "step": 3294, "train_runtime": 131014.5932, "train_tokens_per_second": 98021.281 }, { "epoch": 0.5238473767885533, "grad_norm": 0.47994816303253174, "learning_rate": 2.321850594981085e-05, "loss": 0.3897, "num_input_tokens_seen": 12846076154, "step": 3295, "train_runtime": 131054.1081, "train_tokens_per_second": 98021.163 }, { "epoch": 0.5240063593004769, "grad_norm": 0.26594793796539307, "learning_rate": 2.3206025664971116e-05, "loss": 0.4022, "num_input_tokens_seen": 12850029080, "step": 3296, "train_runtime": 131093.6394, "train_tokens_per_second": 98021.759 }, { "epoch": 0.5241653418124006, "grad_norm": 0.21908152103424072, "learning_rate": 2.3193545829509273e-05, "loss": 0.3969, "num_input_tokens_seen": 12854068290, "step": 3297, "train_runtime": 131131.2097, "train_tokens_per_second": 98024.477 }, { "epoch": 0.5243243243243243, "grad_norm": 0.19725766777992249, "learning_rate": 2.318106644655142e-05, "loss": 0.4067, "num_input_tokens_seen": 12857934901, "step": 3298, "train_runtime": 131171.1583, "train_tokens_per_second": 98024.101 }, { "epoch": 0.524483306836248, "grad_norm": 0.20719268918037415, "learning_rate": 2.3168587519223585e-05, "loss": 0.4027, "num_input_tokens_seen": 12861860695, "step": 3299, "train_runtime": 131211.9301, "train_tokens_per_second": 98023.561 }, { "epoch": 0.5246422893481717, "grad_norm": 0.24225223064422607, "learning_rate": 2.3156109050651633e-05, "loss": 0.4002, "num_input_tokens_seen": 12865811987, "step": 3300, "train_runtime": 131251.9701, "train_tokens_per_second": 98023.763 }, { "epoch": 0.5248012718600954, "grad_norm": 0.41595184803009033, "learning_rate": 2.3143631043961326e-05, "loss": 0.4101, "num_input_tokens_seen": 12869745438, "step": 3301, "train_runtime": 131289.6315, "train_tokens_per_second": 98025.604 }, { "epoch": 0.5249602543720191, "grad_norm": 0.2345207780599594, "learning_rate": 2.313115350227833e-05, "loss": 0.4032, "num_input_tokens_seen": 12873588067, "step": 3302, "train_runtime": 131329.5153, "train_tokens_per_second": 98025.094 }, { "epoch": 0.5251192368839428, "grad_norm": 0.25103649497032166, "learning_rate": 2.311867642872818e-05, "loss": 0.4113, "num_input_tokens_seen": 12877476828, "step": 3303, "train_runtime": 131368.6999, "train_tokens_per_second": 98025.457 }, { "epoch": 0.5252782193958665, "grad_norm": 0.2639332413673401, "learning_rate": 2.3106199826436287e-05, "loss": 0.4147, "num_input_tokens_seen": 12881488530, "step": 3304, "train_runtime": 131408.1443, "train_tokens_per_second": 98026.561 }, { "epoch": 0.5254372019077902, "grad_norm": 0.25922003388404846, "learning_rate": 2.3093723698527958e-05, "loss": 0.417, "num_input_tokens_seen": 12885295685, "step": 3305, "train_runtime": 131447.4332, "train_tokens_per_second": 98026.225 }, { "epoch": 0.5255961844197138, "grad_norm": 0.20321738719940186, "learning_rate": 2.308124804812836e-05, "loss": 0.3944, "num_input_tokens_seen": 12889206548, "step": 3306, "train_runtime": 131485.4227, "train_tokens_per_second": 98027.647 }, { "epoch": 0.5257551669316375, "grad_norm": 0.21530777215957642, "learning_rate": 2.3068772878362584e-05, "loss": 0.4029, "num_input_tokens_seen": 12893071213, "step": 3307, "train_runtime": 131526.3534, "train_tokens_per_second": 98026.524 }, { "epoch": 0.5259141494435612, "grad_norm": 0.9719943404197693, "learning_rate": 2.3056298192355537e-05, "loss": 0.4049, "num_input_tokens_seen": 12896855918, "step": 3308, "train_runtime": 131564.0027, "train_tokens_per_second": 98027.239 }, { "epoch": 0.5260731319554849, "grad_norm": 0.25026899576187134, "learning_rate": 2.304382399323207e-05, "loss": 0.405, "num_input_tokens_seen": 12900765386, "step": 3309, "train_runtime": 131602.5443, "train_tokens_per_second": 98028.237 }, { "epoch": 0.5262321144674086, "grad_norm": 0.22069044411182404, "learning_rate": 2.3031350284116874e-05, "loss": 0.3968, "num_input_tokens_seen": 12904816104, "step": 3310, "train_runtime": 131643.2447, "train_tokens_per_second": 98028.7 }, { "epoch": 0.5263910969793323, "grad_norm": 0.6059657335281372, "learning_rate": 2.3018877068134515e-05, "loss": 0.3898, "num_input_tokens_seen": 12908779435, "step": 3311, "train_runtime": 131684.2736, "train_tokens_per_second": 98028.254 }, { "epoch": 0.526550079491256, "grad_norm": 0.36850032210350037, "learning_rate": 2.3006404348409454e-05, "loss": 0.4037, "num_input_tokens_seen": 12912675382, "step": 3312, "train_runtime": 131724.0341, "train_tokens_per_second": 98028.241 }, { "epoch": 0.5267090620031797, "grad_norm": 0.2038550078868866, "learning_rate": 2.2993932128066002e-05, "loss": 0.3864, "num_input_tokens_seen": 12916477398, "step": 3313, "train_runtime": 131762.5358, "train_tokens_per_second": 98028.452 }, { "epoch": 0.5268680445151034, "grad_norm": 0.2228744477033615, "learning_rate": 2.2981460410228394e-05, "loss": 0.4094, "num_input_tokens_seen": 12920504288, "step": 3314, "train_runtime": 131801.7572, "train_tokens_per_second": 98029.833 }, { "epoch": 0.527027027027027, "grad_norm": 0.19878430664539337, "learning_rate": 2.2968989198020683e-05, "loss": 0.4027, "num_input_tokens_seen": 12924494608, "step": 3315, "train_runtime": 131839.6513, "train_tokens_per_second": 98031.923 }, { "epoch": 0.5271860095389507, "grad_norm": 0.2701463997364044, "learning_rate": 2.2956518494566822e-05, "loss": 0.3928, "num_input_tokens_seen": 12928409387, "step": 3316, "train_runtime": 131877.6023, "train_tokens_per_second": 98033.397 }, { "epoch": 0.5273449920508744, "grad_norm": 0.2558564841747284, "learning_rate": 2.2944048302990645e-05, "loss": 0.4008, "num_input_tokens_seen": 12932300621, "step": 3317, "train_runtime": 131919.2645, "train_tokens_per_second": 98031.934 }, { "epoch": 0.5275039745627981, "grad_norm": 0.25296565890312195, "learning_rate": 2.2931578626415842e-05, "loss": 0.39, "num_input_tokens_seen": 12936245431, "step": 3318, "train_runtime": 131960.6417, "train_tokens_per_second": 98031.089 }, { "epoch": 0.5276629570747218, "grad_norm": 0.22437071800231934, "learning_rate": 2.2919109467965968e-05, "loss": 0.4053, "num_input_tokens_seen": 12940250531, "step": 3319, "train_runtime": 132000.6664, "train_tokens_per_second": 98031.706 }, { "epoch": 0.5278219395866455, "grad_norm": 0.29083922505378723, "learning_rate": 2.2906640830764478e-05, "loss": 0.4016, "num_input_tokens_seen": 12944038013, "step": 3320, "train_runtime": 132034.4653, "train_tokens_per_second": 98035.297 }, { "epoch": 0.5279809220985692, "grad_norm": 0.25021564960479736, "learning_rate": 2.2894172717934652e-05, "loss": 0.4059, "num_input_tokens_seen": 12947968141, "step": 3321, "train_runtime": 132077.6646, "train_tokens_per_second": 98032.988 }, { "epoch": 0.5281399046104929, "grad_norm": 0.2897081673145294, "learning_rate": 2.2881705132599696e-05, "loss": 0.4154, "num_input_tokens_seen": 12951847533, "step": 3322, "train_runtime": 132119.0507, "train_tokens_per_second": 98031.642 }, { "epoch": 0.5282988871224166, "grad_norm": 0.2286565750837326, "learning_rate": 2.286923807788263e-05, "loss": 0.4101, "num_input_tokens_seen": 12955913989, "step": 3323, "train_runtime": 132156.7279, "train_tokens_per_second": 98034.464 }, { "epoch": 0.5284578696343403, "grad_norm": 0.2546605169773102, "learning_rate": 2.2856771556906363e-05, "loss": 0.3873, "num_input_tokens_seen": 12959666278, "step": 3324, "train_runtime": 132195.9163, "train_tokens_per_second": 98033.787 }, { "epoch": 0.5286168521462639, "grad_norm": 0.24556419253349304, "learning_rate": 2.2844305572793682e-05, "loss": 0.394, "num_input_tokens_seen": 12963432557, "step": 3325, "train_runtime": 132235.6079, "train_tokens_per_second": 98032.843 }, { "epoch": 0.5287758346581876, "grad_norm": 0.26329305768013, "learning_rate": 2.2831840128667222e-05, "loss": 0.4025, "num_input_tokens_seen": 12967434180, "step": 3326, "train_runtime": 132273.8772, "train_tokens_per_second": 98034.733 }, { "epoch": 0.5289348171701113, "grad_norm": 0.22578570246696472, "learning_rate": 2.281937522764949e-05, "loss": 0.4082, "num_input_tokens_seen": 12971393662, "step": 3327, "train_runtime": 132312.9211, "train_tokens_per_second": 98035.729 }, { "epoch": 0.529093799682035, "grad_norm": 0.23988120257854462, "learning_rate": 2.2806910872862855e-05, "loss": 0.4114, "num_input_tokens_seen": 12975222040, "step": 3328, "train_runtime": 132353.6789, "train_tokens_per_second": 98034.465 }, { "epoch": 0.5292527821939587, "grad_norm": 0.19734397530555725, "learning_rate": 2.279444706742954e-05, "loss": 0.3892, "num_input_tokens_seen": 12979089757, "step": 3329, "train_runtime": 132393.8869, "train_tokens_per_second": 98033.905 }, { "epoch": 0.5294117647058824, "grad_norm": 0.19681723415851593, "learning_rate": 2.278198381447166e-05, "loss": 0.408, "num_input_tokens_seen": 12983019085, "step": 3330, "train_runtime": 132434.1878, "train_tokens_per_second": 98033.743 }, { "epoch": 0.5295707472178061, "grad_norm": 0.25497880578041077, "learning_rate": 2.2769521117111148e-05, "loss": 0.3949, "num_input_tokens_seen": 12986834149, "step": 3331, "train_runtime": 132474.3604, "train_tokens_per_second": 98032.813 }, { "epoch": 0.5297297297297298, "grad_norm": 0.33471208810806274, "learning_rate": 2.2757058978469846e-05, "loss": 0.4159, "num_input_tokens_seen": 12990727394, "step": 3332, "train_runtime": 132514.4363, "train_tokens_per_second": 98032.545 }, { "epoch": 0.5298887122416535, "grad_norm": 0.19064024090766907, "learning_rate": 2.274459740166942e-05, "loss": 0.4124, "num_input_tokens_seen": 12994665447, "step": 3333, "train_runtime": 132555.1013, "train_tokens_per_second": 98032.179 }, { "epoch": 0.5300476947535772, "grad_norm": 0.18951596319675446, "learning_rate": 2.2732136389831404e-05, "loss": 0.4072, "num_input_tokens_seen": 12998545133, "step": 3334, "train_runtime": 132592.5345, "train_tokens_per_second": 98033.763 }, { "epoch": 0.5302066772655007, "grad_norm": 0.22724571824073792, "learning_rate": 2.2719675946077205e-05, "loss": 0.3994, "num_input_tokens_seen": 13002457807, "step": 3335, "train_runtime": 132630.1084, "train_tokens_per_second": 98035.491 }, { "epoch": 0.5303656597774244, "grad_norm": 0.18274056911468506, "learning_rate": 2.2707216073528052e-05, "loss": 0.3992, "num_input_tokens_seen": 13006388514, "step": 3336, "train_runtime": 132667.4683, "train_tokens_per_second": 98037.512 }, { "epoch": 0.5305246422893481, "grad_norm": 0.20802146196365356, "learning_rate": 2.2694756775305084e-05, "loss": 0.4096, "num_input_tokens_seen": 13010337056, "step": 3337, "train_runtime": 132704.5743, "train_tokens_per_second": 98039.854 }, { "epoch": 0.5306836248012718, "grad_norm": 0.20611968636512756, "learning_rate": 2.2682298054529256e-05, "loss": 0.3967, "num_input_tokens_seen": 13014311365, "step": 3338, "train_runtime": 132744.6933, "train_tokens_per_second": 98040.163 }, { "epoch": 0.5308426073131955, "grad_norm": 0.39300253987312317, "learning_rate": 2.266983991432137e-05, "loss": 0.4156, "num_input_tokens_seen": 13018085424, "step": 3339, "train_runtime": 132785.5112, "train_tokens_per_second": 98038.448 }, { "epoch": 0.5310015898251192, "grad_norm": 0.2164389044046402, "learning_rate": 2.2657382357802138e-05, "loss": 0.4074, "num_input_tokens_seen": 13022077039, "step": 3340, "train_runtime": 132823.8795, "train_tokens_per_second": 98040.18 }, { "epoch": 0.531160572337043, "grad_norm": 0.21066348254680634, "learning_rate": 2.264492538809207e-05, "loss": 0.4011, "num_input_tokens_seen": 13025957614, "step": 3341, "train_runtime": 132863.3584, "train_tokens_per_second": 98040.256 }, { "epoch": 0.5313195548489666, "grad_norm": 0.2179938405752182, "learning_rate": 2.2632469008311544e-05, "loss": 0.3985, "num_input_tokens_seen": 13029763528, "step": 3342, "train_runtime": 132900.9749, "train_tokens_per_second": 98041.143 }, { "epoch": 0.5314785373608903, "grad_norm": 0.17853496968746185, "learning_rate": 2.2620013221580806e-05, "loss": 0.4021, "num_input_tokens_seen": 13033678256, "step": 3343, "train_runtime": 132941.99, "train_tokens_per_second": 98040.343 }, { "epoch": 0.5316375198728139, "grad_norm": 0.18741260468959808, "learning_rate": 2.260755803101993e-05, "loss": 0.4007, "num_input_tokens_seen": 13037646169, "step": 3344, "train_runtime": 132981.5798, "train_tokens_per_second": 98040.993 }, { "epoch": 0.5317965023847376, "grad_norm": 0.22050514817237854, "learning_rate": 2.259510343974887e-05, "loss": 0.4093, "num_input_tokens_seen": 13041587686, "step": 3345, "train_runtime": 133020.0968, "train_tokens_per_second": 98042.236 }, { "epoch": 0.5319554848966613, "grad_norm": 0.19388838112354279, "learning_rate": 2.2582649450887405e-05, "loss": 0.4006, "num_input_tokens_seen": 13045485164, "step": 3346, "train_runtime": 133059.9994, "train_tokens_per_second": 98042.126 }, { "epoch": 0.532114467408585, "grad_norm": 0.3031415045261383, "learning_rate": 2.2570196067555162e-05, "loss": 0.4059, "num_input_tokens_seen": 13049426812, "step": 3347, "train_runtime": 133101.6548, "train_tokens_per_second": 98041.056 }, { "epoch": 0.5322734499205087, "grad_norm": 0.18687422573566437, "learning_rate": 2.255774329287164e-05, "loss": 0.3972, "num_input_tokens_seen": 13053307071, "step": 3348, "train_runtime": 133140.0606, "train_tokens_per_second": 98041.919 }, { "epoch": 0.5324324324324324, "grad_norm": 0.21748045086860657, "learning_rate": 2.2545291129956163e-05, "loss": 0.4043, "num_input_tokens_seen": 13057221986, "step": 3349, "train_runtime": 133175.1104, "train_tokens_per_second": 98045.513 }, { "epoch": 0.5325914149443561, "grad_norm": 0.212589293718338, "learning_rate": 2.2532839581927917e-05, "loss": 0.4095, "num_input_tokens_seen": 13061120924, "step": 3350, "train_runtime": 133214.5606, "train_tokens_per_second": 98045.746 }, { "epoch": 0.5327503974562798, "grad_norm": 0.24419666826725006, "learning_rate": 2.252038865190592e-05, "loss": 0.4003, "num_input_tokens_seen": 13065053910, "step": 3351, "train_runtime": 133253.3716, "train_tokens_per_second": 98046.704 }, { "epoch": 0.5329093799682035, "grad_norm": 0.23812370002269745, "learning_rate": 2.2507938343009037e-05, "loss": 0.4035, "num_input_tokens_seen": 13069032206, "step": 3352, "train_runtime": 133293.0753, "train_tokens_per_second": 98047.345 }, { "epoch": 0.5330683624801272, "grad_norm": 0.2571009695529938, "learning_rate": 2.2495488658356e-05, "loss": 0.4059, "num_input_tokens_seen": 13072904818, "step": 3353, "train_runtime": 133332.0425, "train_tokens_per_second": 98047.735 }, { "epoch": 0.5332273449920508, "grad_norm": 0.22630199790000916, "learning_rate": 2.2483039601065345e-05, "loss": 0.4045, "num_input_tokens_seen": 13076886119, "step": 3354, "train_runtime": 133372.044, "train_tokens_per_second": 98048.18 }, { "epoch": 0.5333863275039745, "grad_norm": 0.19983814656734467, "learning_rate": 2.2470591174255494e-05, "loss": 0.3929, "num_input_tokens_seen": 13080787008, "step": 3355, "train_runtime": 133410.0339, "train_tokens_per_second": 98049.499 }, { "epoch": 0.5335453100158982, "grad_norm": 0.18558263778686523, "learning_rate": 2.2458143381044682e-05, "loss": 0.3982, "num_input_tokens_seen": 13084653542, "step": 3356, "train_runtime": 133448.9067, "train_tokens_per_second": 98049.912 }, { "epoch": 0.5337042925278219, "grad_norm": 0.2034788578748703, "learning_rate": 2.2445696224550982e-05, "loss": 0.3906, "num_input_tokens_seen": 13088531000, "step": 3357, "train_runtime": 133484.5762, "train_tokens_per_second": 98052.759 }, { "epoch": 0.5338632750397456, "grad_norm": 0.2229115217924118, "learning_rate": 2.243324970789233e-05, "loss": 0.4243, "num_input_tokens_seen": 13092414770, "step": 3358, "train_runtime": 133523.3646, "train_tokens_per_second": 98053.362 }, { "epoch": 0.5340222575516693, "grad_norm": 0.20524924993515015, "learning_rate": 2.2420803834186492e-05, "loss": 0.404, "num_input_tokens_seen": 13096310803, "step": 3359, "train_runtime": 133565.0666, "train_tokens_per_second": 98051.917 }, { "epoch": 0.534181240063593, "grad_norm": 0.23537211120128632, "learning_rate": 2.2408358606551067e-05, "loss": 0.4058, "num_input_tokens_seen": 13100257948, "step": 3360, "train_runtime": 133605.31, "train_tokens_per_second": 98051.926 }, { "epoch": 0.5343402225755167, "grad_norm": 0.22279010713100433, "learning_rate": 2.23959140281035e-05, "loss": 0.3971, "num_input_tokens_seen": 13104105915, "step": 3361, "train_runtime": 133644.6539, "train_tokens_per_second": 98051.853 }, { "epoch": 0.5344992050874404, "grad_norm": 0.20057298243045807, "learning_rate": 2.238347010196106e-05, "loss": 0.3924, "num_input_tokens_seen": 13107918675, "step": 3362, "train_runtime": 133684.0622, "train_tokens_per_second": 98051.469 }, { "epoch": 0.5346581875993641, "grad_norm": 0.20226143300533295, "learning_rate": 2.237102683124088e-05, "loss": 0.395, "num_input_tokens_seen": 13111854402, "step": 3363, "train_runtime": 133721.7595, "train_tokens_per_second": 98053.26 }, { "epoch": 0.5348171701112877, "grad_norm": 0.22325703501701355, "learning_rate": 2.23585842190599e-05, "loss": 0.3967, "num_input_tokens_seen": 13115625047, "step": 3364, "train_runtime": 133757.9611, "train_tokens_per_second": 98054.912 }, { "epoch": 0.5349761526232114, "grad_norm": 0.2451910525560379, "learning_rate": 2.2346142268534894e-05, "loss": 0.4048, "num_input_tokens_seen": 13119545259, "step": 3365, "train_runtime": 133798.6864, "train_tokens_per_second": 98054.365 }, { "epoch": 0.5351351351351351, "grad_norm": 0.20545360445976257, "learning_rate": 2.2333700982782512e-05, "loss": 0.4118, "num_input_tokens_seen": 13123531725, "step": 3366, "train_runtime": 133838.6133, "train_tokens_per_second": 98054.899 }, { "epoch": 0.5352941176470588, "grad_norm": 0.2150743007659912, "learning_rate": 2.2321260364919185e-05, "loss": 0.403, "num_input_tokens_seen": 13127492707, "step": 3367, "train_runtime": 133877.6302, "train_tokens_per_second": 98055.909 }, { "epoch": 0.5354531001589825, "grad_norm": 0.23611436784267426, "learning_rate": 2.2308820418061216e-05, "loss": 0.3961, "num_input_tokens_seen": 13131353414, "step": 3368, "train_runtime": 133916.2157, "train_tokens_per_second": 98056.485 }, { "epoch": 0.5356120826709062, "grad_norm": 0.22394661605358124, "learning_rate": 2.2296381145324715e-05, "loss": 0.4226, "num_input_tokens_seen": 13135236786, "step": 3369, "train_runtime": 133956.3634, "train_tokens_per_second": 98056.087 }, { "epoch": 0.5357710651828299, "grad_norm": 0.2340734899044037, "learning_rate": 2.228394254982562e-05, "loss": 0.4143, "num_input_tokens_seen": 13139045537, "step": 3370, "train_runtime": 133996.9267, "train_tokens_per_second": 98054.828 }, { "epoch": 0.5359300476947536, "grad_norm": 0.253500759601593, "learning_rate": 2.2271504634679734e-05, "loss": 0.4159, "num_input_tokens_seen": 13143006810, "step": 3371, "train_runtime": 134037.1958, "train_tokens_per_second": 98054.922 }, { "epoch": 0.5360890302066773, "grad_norm": 0.2179294228553772, "learning_rate": 2.2259067403002652e-05, "loss": 0.4077, "num_input_tokens_seen": 13146887704, "step": 3372, "train_runtime": 134076.6188, "train_tokens_per_second": 98055.036 }, { "epoch": 0.536248012718601, "grad_norm": 0.213954895734787, "learning_rate": 2.2246630857909816e-05, "loss": 0.4171, "num_input_tokens_seen": 13150798991, "step": 3373, "train_runtime": 134115.207, "train_tokens_per_second": 98055.987 }, { "epoch": 0.5364069952305246, "grad_norm": 0.20077736675739288, "learning_rate": 2.22341950025165e-05, "loss": 0.3935, "num_input_tokens_seen": 13154621972, "step": 3374, "train_runtime": 134152.0656, "train_tokens_per_second": 98057.543 }, { "epoch": 0.5365659777424483, "grad_norm": 0.23512503504753113, "learning_rate": 2.222175983993779e-05, "loss": 0.3997, "num_input_tokens_seen": 13158407944, "step": 3375, "train_runtime": 134190.2956, "train_tokens_per_second": 98057.821 }, { "epoch": 0.536724960254372, "grad_norm": 0.21077166497707367, "learning_rate": 2.220932537328861e-05, "loss": 0.3908, "num_input_tokens_seen": 13162342935, "step": 3376, "train_runtime": 134230.8158, "train_tokens_per_second": 98057.535 }, { "epoch": 0.5368839427662957, "grad_norm": 0.21834534406661987, "learning_rate": 2.2196891605683702e-05, "loss": 0.4164, "num_input_tokens_seen": 13166209975, "step": 3377, "train_runtime": 134270.2215, "train_tokens_per_second": 98057.558 }, { "epoch": 0.5370429252782194, "grad_norm": 0.46909299492836, "learning_rate": 2.218445854023763e-05, "loss": 0.4023, "num_input_tokens_seen": 13170195945, "step": 3378, "train_runtime": 134310.5199, "train_tokens_per_second": 98057.814 }, { "epoch": 0.5372019077901431, "grad_norm": 0.25093069672584534, "learning_rate": 2.217202618006481e-05, "loss": 0.4065, "num_input_tokens_seen": 13174086355, "step": 3379, "train_runtime": 134350.3833, "train_tokens_per_second": 98057.676 }, { "epoch": 0.5373608903020668, "grad_norm": 0.29027441143989563, "learning_rate": 2.2159594528279437e-05, "loss": 0.3955, "num_input_tokens_seen": 13177955249, "step": 3380, "train_runtime": 134390.7372, "train_tokens_per_second": 98057.02 }, { "epoch": 0.5375198728139905, "grad_norm": 0.20560115575790405, "learning_rate": 2.2147163587995573e-05, "loss": 0.4109, "num_input_tokens_seen": 13181807440, "step": 3381, "train_runtime": 134431.3188, "train_tokens_per_second": 98056.075 }, { "epoch": 0.5376788553259142, "grad_norm": 0.19528664648532867, "learning_rate": 2.2134733362327075e-05, "loss": 0.3994, "num_input_tokens_seen": 13185718068, "step": 3382, "train_runtime": 134470.7702, "train_tokens_per_second": 98056.388 }, { "epoch": 0.5378378378378378, "grad_norm": 0.20948919653892517, "learning_rate": 2.212230385438761e-05, "loss": 0.3957, "num_input_tokens_seen": 13189448474, "step": 3383, "train_runtime": 134510.6996, "train_tokens_per_second": 98055.014 }, { "epoch": 0.5379968203497615, "grad_norm": 0.2499476820230484, "learning_rate": 2.2109875067290704e-05, "loss": 0.3955, "num_input_tokens_seen": 13193416694, "step": 3384, "train_runtime": 134547.7956, "train_tokens_per_second": 98057.472 }, { "epoch": 0.5381558028616852, "grad_norm": 0.20323120057582855, "learning_rate": 2.2097447004149663e-05, "loss": 0.405, "num_input_tokens_seen": 13197406466, "step": 3385, "train_runtime": 134585.3976, "train_tokens_per_second": 98059.72 }, { "epoch": 0.5383147853736089, "grad_norm": 0.20431363582611084, "learning_rate": 2.2085019668077645e-05, "loss": 0.4103, "num_input_tokens_seen": 13201345870, "step": 3386, "train_runtime": 134626.5124, "train_tokens_per_second": 98059.035 }, { "epoch": 0.5384737678855326, "grad_norm": 0.23512707650661469, "learning_rate": 2.20725930621876e-05, "loss": 0.4033, "num_input_tokens_seen": 13205263549, "step": 3387, "train_runtime": 134661.8775, "train_tokens_per_second": 98062.375 }, { "epoch": 0.5386327503974563, "grad_norm": 0.7362500429153442, "learning_rate": 2.2060167189592302e-05, "loss": 0.4012, "num_input_tokens_seen": 13209157527, "step": 3388, "train_runtime": 134700.5698, "train_tokens_per_second": 98063.115 }, { "epoch": 0.53879173290938, "grad_norm": 0.18949715793132782, "learning_rate": 2.2047742053404354e-05, "loss": 0.4027, "num_input_tokens_seen": 13213086129, "step": 3389, "train_runtime": 134738.4609, "train_tokens_per_second": 98064.695 }, { "epoch": 0.5389507154213037, "grad_norm": 0.2726874053478241, "learning_rate": 2.203531765673616e-05, "loss": 0.394, "num_input_tokens_seen": 13216956466, "step": 3390, "train_runtime": 134778.0023, "train_tokens_per_second": 98064.641 }, { "epoch": 0.5391096979332274, "grad_norm": 0.22440150380134583, "learning_rate": 2.2022894002699944e-05, "loss": 0.4151, "num_input_tokens_seen": 13220909297, "step": 3391, "train_runtime": 134813.381, "train_tokens_per_second": 98068.227 }, { "epoch": 0.5392686804451511, "grad_norm": 0.21817730367183685, "learning_rate": 2.2010471094407744e-05, "loss": 0.4081, "num_input_tokens_seen": 13224886450, "step": 3392, "train_runtime": 134855.4717, "train_tokens_per_second": 98067.111 }, { "epoch": 0.5394276629570747, "grad_norm": 0.1917688250541687, "learning_rate": 2.19980489349714e-05, "loss": 0.391, "num_input_tokens_seen": 13228749221, "step": 3393, "train_runtime": 134895.4035, "train_tokens_per_second": 98066.716 }, { "epoch": 0.5395866454689984, "grad_norm": 0.21996156871318817, "learning_rate": 2.1985627527502594e-05, "loss": 0.3921, "num_input_tokens_seen": 13232582324, "step": 3394, "train_runtime": 134933.3738, "train_tokens_per_second": 98067.527 }, { "epoch": 0.5397456279809221, "grad_norm": 0.22434280812740326, "learning_rate": 2.1973206875112796e-05, "loss": 0.3924, "num_input_tokens_seen": 13236517342, "step": 3395, "train_runtime": 134973.3135, "train_tokens_per_second": 98067.662 }, { "epoch": 0.5399046104928458, "grad_norm": 0.20235921442508698, "learning_rate": 2.1960786980913274e-05, "loss": 0.4018, "num_input_tokens_seen": 13240415678, "step": 3396, "train_runtime": 135012.4234, "train_tokens_per_second": 98068.128 }, { "epoch": 0.5400635930047695, "grad_norm": 0.22903354465961456, "learning_rate": 2.1948367848015148e-05, "loss": 0.3949, "num_input_tokens_seen": 13244409304, "step": 3397, "train_runtime": 135050.5044, "train_tokens_per_second": 98070.047 }, { "epoch": 0.5402225755166932, "grad_norm": 0.26536232233047485, "learning_rate": 2.1935949479529304e-05, "loss": 0.4146, "num_input_tokens_seen": 13248221024, "step": 3398, "train_runtime": 135086.2237, "train_tokens_per_second": 98072.332 }, { "epoch": 0.5403815580286169, "grad_norm": 0.20241539180278778, "learning_rate": 2.1923531878566472e-05, "loss": 0.3976, "num_input_tokens_seen": 13252223360, "step": 3399, "train_runtime": 135125.8826, "train_tokens_per_second": 98073.168 }, { "epoch": 0.5405405405405406, "grad_norm": 0.19454897940158844, "learning_rate": 2.1911115048237166e-05, "loss": 0.4029, "num_input_tokens_seen": 13256229506, "step": 3400, "train_runtime": 135165.6554, "train_tokens_per_second": 98073.948 }, { "epoch": 0.5406995230524643, "grad_norm": 0.21583667397499084, "learning_rate": 2.18986989916517e-05, "loss": 0.4142, "num_input_tokens_seen": 13260122631, "step": 3401, "train_runtime": 135303.5052, "train_tokens_per_second": 98002.802 }, { "epoch": 0.540858505564388, "grad_norm": 0.21042102575302124, "learning_rate": 2.1886283711920236e-05, "loss": 0.4025, "num_input_tokens_seen": 13263975002, "step": 3402, "train_runtime": 135342.3915, "train_tokens_per_second": 98003.108 }, { "epoch": 0.5410174880763116, "grad_norm": 0.21280722320079803, "learning_rate": 2.1873869212152682e-05, "loss": 0.3945, "num_input_tokens_seen": 13267880979, "step": 3403, "train_runtime": 135382.4022, "train_tokens_per_second": 98002.996 }, { "epoch": 0.5411764705882353, "grad_norm": 0.21724478900432587, "learning_rate": 2.1861455495458818e-05, "loss": 0.4098, "num_input_tokens_seen": 13271833699, "step": 3404, "train_runtime": 135421.7075, "train_tokens_per_second": 98003.739 }, { "epoch": 0.541335453100159, "grad_norm": 0.23675791919231415, "learning_rate": 2.1849042564948172e-05, "loss": 0.3982, "num_input_tokens_seen": 13275795482, "step": 3405, "train_runtime": 135460.4328, "train_tokens_per_second": 98004.969 }, { "epoch": 0.5414944356120827, "grad_norm": 0.2211027443408966, "learning_rate": 2.183663042373009e-05, "loss": 0.4066, "num_input_tokens_seen": 13279693177, "step": 3406, "train_runtime": 135500.4511, "train_tokens_per_second": 98004.789 }, { "epoch": 0.5416534181240064, "grad_norm": 0.2322012186050415, "learning_rate": 2.1824219074913744e-05, "loss": 0.3992, "num_input_tokens_seen": 13283714855, "step": 3407, "train_runtime": 135536.4961, "train_tokens_per_second": 98008.398 }, { "epoch": 0.5418124006359301, "grad_norm": 0.2392561137676239, "learning_rate": 2.181180852160806e-05, "loss": 0.4109, "num_input_tokens_seen": 13287539960, "step": 3408, "train_runtime": 135575.6834, "train_tokens_per_second": 98008.283 }, { "epoch": 0.5419713831478538, "grad_norm": 0.20368218421936035, "learning_rate": 2.179939876692183e-05, "loss": 0.3906, "num_input_tokens_seen": 13291417484, "step": 3409, "train_runtime": 135614.3092, "train_tokens_per_second": 98008.961 }, { "epoch": 0.5421303656597775, "grad_norm": 0.27041682600975037, "learning_rate": 2.178698981396359e-05, "loss": 0.4047, "num_input_tokens_seen": 13295420442, "step": 3410, "train_runtime": 135654.0676, "train_tokens_per_second": 98009.744 }, { "epoch": 0.5422893481717012, "grad_norm": 0.2355557233095169, "learning_rate": 2.1774581665841688e-05, "loss": 0.406, "num_input_tokens_seen": 13299231763, "step": 3411, "train_runtime": 135691.8853, "train_tokens_per_second": 98010.517 }, { "epoch": 0.5424483306836247, "grad_norm": 0.3128635585308075, "learning_rate": 2.1762174325664295e-05, "loss": 0.4076, "num_input_tokens_seen": 13303246599, "step": 3412, "train_runtime": 135732.4224, "train_tokens_per_second": 98010.824 }, { "epoch": 0.5426073131955484, "grad_norm": 0.28069430589675903, "learning_rate": 2.174976779653935e-05, "loss": 0.4182, "num_input_tokens_seen": 13307184654, "step": 3413, "train_runtime": 135771.2256, "train_tokens_per_second": 98011.818 }, { "epoch": 0.5427662957074721, "grad_norm": 0.22207914292812347, "learning_rate": 2.1737362081574608e-05, "loss": 0.4162, "num_input_tokens_seen": 13311108208, "step": 3414, "train_runtime": 135810.0924, "train_tokens_per_second": 98012.659 }, { "epoch": 0.5429252782193958, "grad_norm": 0.277316153049469, "learning_rate": 2.172495718387761e-05, "loss": 0.3961, "num_input_tokens_seen": 13314876128, "step": 3415, "train_runtime": 135848.8634, "train_tokens_per_second": 98012.422 }, { "epoch": 0.5430842607313195, "grad_norm": 0.24062955379486084, "learning_rate": 2.1712553106555685e-05, "loss": 0.4049, "num_input_tokens_seen": 13318733439, "step": 3416, "train_runtime": 135888.3754, "train_tokens_per_second": 98012.309 }, { "epoch": 0.5432432432432432, "grad_norm": 0.25712355971336365, "learning_rate": 2.170014985271599e-05, "loss": 0.4085, "num_input_tokens_seen": 13322697888, "step": 3417, "train_runtime": 135929.7388, "train_tokens_per_second": 98011.649 }, { "epoch": 0.543402225755167, "grad_norm": 0.24977152049541473, "learning_rate": 2.1687747425465436e-05, "loss": 0.4111, "num_input_tokens_seen": 13326542911, "step": 3418, "train_runtime": 135968.5249, "train_tokens_per_second": 98011.969 }, { "epoch": 0.5435612082670906, "grad_norm": 0.22579409182071686, "learning_rate": 2.1675345827910737e-05, "loss": 0.4007, "num_input_tokens_seen": 13330365014, "step": 3419, "train_runtime": 136008.342, "train_tokens_per_second": 98011.378 }, { "epoch": 0.5437201907790143, "grad_norm": 0.19266845285892487, "learning_rate": 2.1662945063158424e-05, "loss": 0.3993, "num_input_tokens_seen": 13334358651, "step": 3420, "train_runtime": 136049.4789, "train_tokens_per_second": 98011.097 }, { "epoch": 0.543879173290938, "grad_norm": 0.21898724138736725, "learning_rate": 2.1650545134314785e-05, "loss": 0.3942, "num_input_tokens_seen": 13338231644, "step": 3421, "train_runtime": 136089.5668, "train_tokens_per_second": 98010.685 }, { "epoch": 0.5440381558028616, "grad_norm": 0.1968674510717392, "learning_rate": 2.163814604448593e-05, "loss": 0.394, "num_input_tokens_seen": 13342282681, "step": 3422, "train_runtime": 136129.0416, "train_tokens_per_second": 98012.022 }, { "epoch": 0.5441971383147853, "grad_norm": 0.2007906287908554, "learning_rate": 2.1625747796777734e-05, "loss": 0.4079, "num_input_tokens_seen": 13345921353, "step": 3423, "train_runtime": 136170.269, "train_tokens_per_second": 98009.069 }, { "epoch": 0.544356120826709, "grad_norm": 0.2315877377986908, "learning_rate": 2.1613350394295862e-05, "loss": 0.404, "num_input_tokens_seen": 13349901923, "step": 3424, "train_runtime": 136209.6401, "train_tokens_per_second": 98009.964 }, { "epoch": 0.5445151033386327, "grad_norm": 0.2023772895336151, "learning_rate": 2.1600953840145794e-05, "loss": 0.3946, "num_input_tokens_seen": 13353869082, "step": 3425, "train_runtime": 136249.5276, "train_tokens_per_second": 98010.388 }, { "epoch": 0.5446740858505564, "grad_norm": 0.4942050278186798, "learning_rate": 2.1588558137432756e-05, "loss": 0.4029, "num_input_tokens_seen": 13357638566, "step": 3426, "train_runtime": 136288.0532, "train_tokens_per_second": 98010.341 }, { "epoch": 0.5448330683624801, "grad_norm": 0.21987579762935638, "learning_rate": 2.1576163289261813e-05, "loss": 0.3931, "num_input_tokens_seen": 13361523845, "step": 3427, "train_runtime": 136327.5721, "train_tokens_per_second": 98010.429 }, { "epoch": 0.5449920508744038, "grad_norm": 0.2088281661272049, "learning_rate": 2.1563769298737774e-05, "loss": 0.3959, "num_input_tokens_seen": 13365516214, "step": 3428, "train_runtime": 136365.9009, "train_tokens_per_second": 98012.158 }, { "epoch": 0.5451510333863275, "grad_norm": 0.20909197628498077, "learning_rate": 2.1551376168965237e-05, "loss": 0.4006, "num_input_tokens_seen": 13369363509, "step": 3429, "train_runtime": 136405.9728, "train_tokens_per_second": 98011.57 }, { "epoch": 0.5453100158982512, "grad_norm": 0.28134551644325256, "learning_rate": 2.1538983903048603e-05, "loss": 0.3951, "num_input_tokens_seen": 13373195472, "step": 3430, "train_runtime": 136445.6207, "train_tokens_per_second": 98011.174 }, { "epoch": 0.5454689984101749, "grad_norm": 0.21668389439582825, "learning_rate": 2.1526592504092034e-05, "loss": 0.4034, "num_input_tokens_seen": 13377237291, "step": 3431, "train_runtime": 136483.9112, "train_tokens_per_second": 98013.291 }, { "epoch": 0.5456279809220985, "grad_norm": 0.24618665874004364, "learning_rate": 2.1514201975199513e-05, "loss": 0.409, "num_input_tokens_seen": 13381091631, "step": 3432, "train_runtime": 136524.0025, "train_tokens_per_second": 98012.741 }, { "epoch": 0.5457869634340222, "grad_norm": 0.1978386789560318, "learning_rate": 2.1501812319474764e-05, "loss": 0.399, "num_input_tokens_seen": 13384968731, "step": 3433, "train_runtime": 136560.7652, "train_tokens_per_second": 98014.746 }, { "epoch": 0.5459459459459459, "grad_norm": 0.27357417345046997, "learning_rate": 2.1489423540021296e-05, "loss": 0.4049, "num_input_tokens_seen": 13388919071, "step": 3434, "train_runtime": 136597.7358, "train_tokens_per_second": 98017.138 }, { "epoch": 0.5461049284578696, "grad_norm": 0.23097184300422668, "learning_rate": 2.1477035639942438e-05, "loss": 0.4048, "num_input_tokens_seen": 13392737299, "step": 3435, "train_runtime": 136637.4415, "train_tokens_per_second": 98016.599 }, { "epoch": 0.5462639109697933, "grad_norm": 0.22087149322032928, "learning_rate": 2.146464862234126e-05, "loss": 0.3987, "num_input_tokens_seen": 13396648191, "step": 3436, "train_runtime": 136676.7994, "train_tokens_per_second": 98016.988 }, { "epoch": 0.546422893481717, "grad_norm": 0.26019561290740967, "learning_rate": 2.1452262490320608e-05, "loss": 0.4043, "num_input_tokens_seen": 13400519981, "step": 3437, "train_runtime": 136715.0332, "train_tokens_per_second": 98017.897 }, { "epoch": 0.5465818759936407, "grad_norm": 0.22505217790603638, "learning_rate": 2.1439877246983144e-05, "loss": 0.4067, "num_input_tokens_seen": 13404414206, "step": 3438, "train_runtime": 136753.4141, "train_tokens_per_second": 98018.863 }, { "epoch": 0.5467408585055644, "grad_norm": 0.22107873857021332, "learning_rate": 2.1427492895431257e-05, "loss": 0.4014, "num_input_tokens_seen": 13408265550, "step": 3439, "train_runtime": 136792.7699, "train_tokens_per_second": 98018.818 }, { "epoch": 0.5468998410174881, "grad_norm": 0.25295525789260864, "learning_rate": 2.1415109438767165e-05, "loss": 0.4043, "num_input_tokens_seen": 13412235380, "step": 3440, "train_runtime": 136831.3178, "train_tokens_per_second": 98020.216 }, { "epoch": 0.5470588235294118, "grad_norm": 0.22287020087242126, "learning_rate": 2.140272688009283e-05, "loss": 0.4141, "num_input_tokens_seen": 13416206755, "step": 3441, "train_runtime": 136871.2036, "train_tokens_per_second": 98020.668 }, { "epoch": 0.5472178060413354, "grad_norm": 0.19422495365142822, "learning_rate": 2.1390345222509982e-05, "loss": 0.403, "num_input_tokens_seen": 13420129464, "step": 3442, "train_runtime": 136909.6093, "train_tokens_per_second": 98021.823 }, { "epoch": 0.5473767885532591, "grad_norm": 0.23083071410655975, "learning_rate": 2.137796446912016e-05, "loss": 0.4095, "num_input_tokens_seen": 13424048482, "step": 3443, "train_runtime": 136950.0559, "train_tokens_per_second": 98021.49 }, { "epoch": 0.5475357710651828, "grad_norm": 0.21840031445026398, "learning_rate": 2.1365584623024636e-05, "loss": 0.3925, "num_input_tokens_seen": 13427903542, "step": 3444, "train_runtime": 136988.6307, "train_tokens_per_second": 98022.029 }, { "epoch": 0.5476947535771065, "grad_norm": 0.24544070661067963, "learning_rate": 2.135320568732449e-05, "loss": 0.3976, "num_input_tokens_seen": 13431840894, "step": 3445, "train_runtime": 137029.8107, "train_tokens_per_second": 98021.305 }, { "epoch": 0.5478537360890302, "grad_norm": 0.19392403960227966, "learning_rate": 2.1340827665120548e-05, "loss": 0.411, "num_input_tokens_seen": 13435750185, "step": 3446, "train_runtime": 137071.3084, "train_tokens_per_second": 98020.15 }, { "epoch": 0.5480127186009539, "grad_norm": 0.2465548813343048, "learning_rate": 2.1328450559513404e-05, "loss": 0.3993, "num_input_tokens_seen": 13439627981, "step": 3447, "train_runtime": 137110.15, "train_tokens_per_second": 98020.664 }, { "epoch": 0.5481717011128776, "grad_norm": 0.22578254342079163, "learning_rate": 2.1316074373603468e-05, "loss": 0.4033, "num_input_tokens_seen": 13443660417, "step": 3448, "train_runtime": 137151.4437, "train_tokens_per_second": 98020.553 }, { "epoch": 0.5483306836248013, "grad_norm": 0.21745038032531738, "learning_rate": 2.1303699110490855e-05, "loss": 0.4072, "num_input_tokens_seen": 13447612554, "step": 3449, "train_runtime": 137191.4353, "train_tokens_per_second": 98020.788 }, { "epoch": 0.548489666136725, "grad_norm": 0.19269979000091553, "learning_rate": 2.1291324773275507e-05, "loss": 0.4031, "num_input_tokens_seen": 13451550056, "step": 3450, "train_runtime": 137229.9848, "train_tokens_per_second": 98021.945 }, { "epoch": 0.5486486486486486, "grad_norm": 0.6542009711265564, "learning_rate": 2.1278951365057095e-05, "loss": 0.3957, "num_input_tokens_seen": 13455473662, "step": 3451, "train_runtime": 137269.9012, "train_tokens_per_second": 98022.025 }, { "epoch": 0.5488076311605723, "grad_norm": 0.32572171092033386, "learning_rate": 2.1266578888935068e-05, "loss": 0.3936, "num_input_tokens_seen": 13459373374, "step": 3452, "train_runtime": 137306.2846, "train_tokens_per_second": 98024.452 }, { "epoch": 0.548966613672496, "grad_norm": 0.20895661413669586, "learning_rate": 2.1254207348008653e-05, "loss": 0.4196, "num_input_tokens_seen": 13463261667, "step": 3453, "train_runtime": 137346.9579, "train_tokens_per_second": 98023.734 }, { "epoch": 0.5491255961844197, "grad_norm": 0.2243141084909439, "learning_rate": 2.1241836745376822e-05, "loss": 0.4035, "num_input_tokens_seen": 13467185001, "step": 3454, "train_runtime": 137387.0578, "train_tokens_per_second": 98023.68 }, { "epoch": 0.5492845786963434, "grad_norm": 0.1735706627368927, "learning_rate": 2.122946708413833e-05, "loss": 0.3902, "num_input_tokens_seen": 13471109910, "step": 3455, "train_runtime": 137424.5461, "train_tokens_per_second": 98025.5 }, { "epoch": 0.5494435612082671, "grad_norm": 0.17933635413646698, "learning_rate": 2.1217098367391696e-05, "loss": 0.4067, "num_input_tokens_seen": 13475035434, "step": 3456, "train_runtime": 137463.8112, "train_tokens_per_second": 98026.057 }, { "epoch": 0.5496025437201908, "grad_norm": 0.23135264217853546, "learning_rate": 2.120473059823518e-05, "loss": 0.4001, "num_input_tokens_seen": 13478723834, "step": 3457, "train_runtime": 137503.3878, "train_tokens_per_second": 98024.667 }, { "epoch": 0.5497615262321145, "grad_norm": 0.22402901947498322, "learning_rate": 2.1192363779766838e-05, "loss": 0.3987, "num_input_tokens_seen": 13482726965, "step": 3458, "train_runtime": 137543.0784, "train_tokens_per_second": 98025.485 }, { "epoch": 0.5499205087440382, "grad_norm": 0.2598245143890381, "learning_rate": 2.1179997915084464e-05, "loss": 0.406, "num_input_tokens_seen": 13486577437, "step": 3459, "train_runtime": 137583.1347, "train_tokens_per_second": 98024.932 }, { "epoch": 0.5500794912559619, "grad_norm": 0.23965094983577728, "learning_rate": 2.1167633007285616e-05, "loss": 0.4136, "num_input_tokens_seen": 13490487585, "step": 3460, "train_runtime": 137622.5134, "train_tokens_per_second": 98025.296 }, { "epoch": 0.5502384737678855, "grad_norm": 0.19945663213729858, "learning_rate": 2.1155269059467617e-05, "loss": 0.3956, "num_input_tokens_seen": 13494304165, "step": 3461, "train_runtime": 137662.3826, "train_tokens_per_second": 98024.63 }, { "epoch": 0.5503974562798092, "grad_norm": 0.22445623576641083, "learning_rate": 2.114290607472755e-05, "loss": 0.4072, "num_input_tokens_seen": 13498241396, "step": 3462, "train_runtime": 137700.1679, "train_tokens_per_second": 98026.325 }, { "epoch": 0.5505564387917329, "grad_norm": 0.21840377151966095, "learning_rate": 2.113054405616226e-05, "loss": 0.4014, "num_input_tokens_seen": 13502262425, "step": 3463, "train_runtime": 137741.6299, "train_tokens_per_second": 98026.01 }, { "epoch": 0.5507154213036566, "grad_norm": 0.20779721438884735, "learning_rate": 2.111818300686834e-05, "loss": 0.4152, "num_input_tokens_seen": 13506062605, "step": 3464, "train_runtime": 137781.6623, "train_tokens_per_second": 98025.11 }, { "epoch": 0.5508744038155803, "grad_norm": 0.26809895038604736, "learning_rate": 2.110582292994214e-05, "loss": 0.4184, "num_input_tokens_seen": 13510016946, "step": 3465, "train_runtime": 137819.0036, "train_tokens_per_second": 98027.243 }, { "epoch": 0.551033386327504, "grad_norm": 0.24291874468326569, "learning_rate": 2.1093463828479788e-05, "loss": 0.4125, "num_input_tokens_seen": 13513823103, "step": 3466, "train_runtime": 137858.229, "train_tokens_per_second": 98026.96 }, { "epoch": 0.5511923688394277, "grad_norm": 0.2106127142906189, "learning_rate": 2.1081105705577138e-05, "loss": 0.3914, "num_input_tokens_seen": 13517688785, "step": 3467, "train_runtime": 137896.8679, "train_tokens_per_second": 98027.526 }, { "epoch": 0.5513513513513514, "grad_norm": 0.21786610782146454, "learning_rate": 2.1068748564329814e-05, "loss": 0.4003, "num_input_tokens_seen": 13521584979, "step": 3468, "train_runtime": 137935.6219, "train_tokens_per_second": 98028.231 }, { "epoch": 0.5515103338632751, "grad_norm": 0.18490560352802277, "learning_rate": 2.1056392407833196e-05, "loss": 0.3938, "num_input_tokens_seen": 13525581881, "step": 3469, "train_runtime": 137970.0644, "train_tokens_per_second": 98032.729 }, { "epoch": 0.5516693163751988, "grad_norm": 0.28954875469207764, "learning_rate": 2.1044037239182414e-05, "loss": 0.3885, "num_input_tokens_seen": 13529521127, "step": 3470, "train_runtime": 138008.0963, "train_tokens_per_second": 98034.257 }, { "epoch": 0.5518282988871224, "grad_norm": 0.3367234766483307, "learning_rate": 2.1031683061472353e-05, "loss": 0.4125, "num_input_tokens_seen": 13533352058, "step": 3471, "train_runtime": 138048.0219, "train_tokens_per_second": 98033.654 }, { "epoch": 0.5519872813990461, "grad_norm": 0.2095719426870346, "learning_rate": 2.1019329877797634e-05, "loss": 0.4121, "num_input_tokens_seen": 13537426202, "step": 3472, "train_runtime": 138086.177, "train_tokens_per_second": 98036.071 }, { "epoch": 0.5521462639109698, "grad_norm": 0.1944386512041092, "learning_rate": 2.100697769125266e-05, "loss": 0.4021, "num_input_tokens_seen": 13541356275, "step": 3473, "train_runtime": 138125.6038, "train_tokens_per_second": 98036.54 }, { "epoch": 0.5523052464228935, "grad_norm": 0.20543964207172394, "learning_rate": 2.099462650493156e-05, "loss": 0.4021, "num_input_tokens_seen": 13545123445, "step": 3474, "train_runtime": 138163.4654, "train_tokens_per_second": 98036.941 }, { "epoch": 0.5524642289348172, "grad_norm": 0.1848720759153366, "learning_rate": 2.0982276321928206e-05, "loss": 0.4035, "num_input_tokens_seen": 13549103112, "step": 3475, "train_runtime": 138203.0197, "train_tokens_per_second": 98037.678 }, { "epoch": 0.5526232114467409, "grad_norm": 0.20872896909713745, "learning_rate": 2.0969927145336255e-05, "loss": 0.4035, "num_input_tokens_seen": 13553034090, "step": 3476, "train_runtime": 138243.4338, "train_tokens_per_second": 98037.453 }, { "epoch": 0.5527821939586646, "grad_norm": 3.2128193378448486, "learning_rate": 2.0957578978249075e-05, "loss": 0.4007, "num_input_tokens_seen": 13556906807, "step": 3477, "train_runtime": 138280.3906, "train_tokens_per_second": 98039.257 }, { "epoch": 0.5529411764705883, "grad_norm": 0.23447036743164062, "learning_rate": 2.0945231823759787e-05, "loss": 0.4101, "num_input_tokens_seen": 13560730130, "step": 3478, "train_runtime": 138319.7049, "train_tokens_per_second": 98039.033 }, { "epoch": 0.553100158982512, "grad_norm": 0.25335609912872314, "learning_rate": 2.0932885684961283e-05, "loss": 0.3929, "num_input_tokens_seen": 13564649119, "step": 3479, "train_runtime": 138359.1357, "train_tokens_per_second": 98039.418 }, { "epoch": 0.5532591414944356, "grad_norm": 0.19337372481822968, "learning_rate": 2.0920540564946168e-05, "loss": 0.3907, "num_input_tokens_seen": 13568586341, "step": 3480, "train_runtime": 138395.7906, "train_tokens_per_second": 98041.901 }, { "epoch": 0.5534181240063593, "grad_norm": 0.2325575351715088, "learning_rate": 2.090819646680682e-05, "loss": 0.4027, "num_input_tokens_seen": 13572445618, "step": 3481, "train_runtime": 138434.7371, "train_tokens_per_second": 98042.196 }, { "epoch": 0.553577106518283, "grad_norm": 0.21718865633010864, "learning_rate": 2.0895853393635346e-05, "loss": 0.3997, "num_input_tokens_seen": 13576465545, "step": 3482, "train_runtime": 138475.3785, "train_tokens_per_second": 98042.451 }, { "epoch": 0.5537360890302067, "grad_norm": 0.22416500747203827, "learning_rate": 2.0883511348523583e-05, "loss": 0.4102, "num_input_tokens_seen": 13580334756, "step": 3483, "train_runtime": 138514.4252, "train_tokens_per_second": 98042.747 }, { "epoch": 0.5538950715421304, "grad_norm": 0.19791074097156525, "learning_rate": 2.087117033456315e-05, "loss": 0.4109, "num_input_tokens_seen": 13584268737, "step": 3484, "train_runtime": 138553.264, "train_tokens_per_second": 98043.657 }, { "epoch": 0.5540540540540541, "grad_norm": 0.25903379917144775, "learning_rate": 2.0858830354845364e-05, "loss": 0.4079, "num_input_tokens_seen": 13588047435, "step": 3485, "train_runtime": 138590.6214, "train_tokens_per_second": 98044.495 }, { "epoch": 0.5542130365659778, "grad_norm": 1.1133675575256348, "learning_rate": 2.084649141246132e-05, "loss": 0.4017, "num_input_tokens_seen": 13592039840, "step": 3486, "train_runtime": 138630.3923, "train_tokens_per_second": 98045.166 }, { "epoch": 0.5543720190779015, "grad_norm": 0.31974124908447266, "learning_rate": 2.0834153510501827e-05, "loss": 0.4128, "num_input_tokens_seen": 13596021246, "step": 3487, "train_runtime": 138668.5103, "train_tokens_per_second": 98046.927 }, { "epoch": 0.5545310015898252, "grad_norm": 0.18369200825691223, "learning_rate": 2.0821816652057433e-05, "loss": 0.3982, "num_input_tokens_seen": 13599873115, "step": 3488, "train_runtime": 138705.0467, "train_tokens_per_second": 98048.87 }, { "epoch": 0.5546899841017489, "grad_norm": 0.20444221794605255, "learning_rate": 2.0809480840218452e-05, "loss": 0.4091, "num_input_tokens_seen": 13603747207, "step": 3489, "train_runtime": 138744.6161, "train_tokens_per_second": 98048.829 }, { "epoch": 0.5548489666136724, "grad_norm": 0.22254210710525513, "learning_rate": 2.0797146078074896e-05, "loss": 0.4003, "num_input_tokens_seen": 13607658647, "step": 3490, "train_runtime": 138784.998, "train_tokens_per_second": 98048.484 }, { "epoch": 0.5550079491255961, "grad_norm": 0.2202116847038269, "learning_rate": 2.0784812368716563e-05, "loss": 0.407, "num_input_tokens_seen": 13611514657, "step": 3491, "train_runtime": 138825.6834, "train_tokens_per_second": 98047.525 }, { "epoch": 0.5551669316375198, "grad_norm": 0.22750553488731384, "learning_rate": 2.0772479715232944e-05, "loss": 0.4016, "num_input_tokens_seen": 13615403744, "step": 3492, "train_runtime": 138863.542, "train_tokens_per_second": 98048.801 }, { "epoch": 0.5553259141494435, "grad_norm": 0.19837471842765808, "learning_rate": 2.076014812071328e-05, "loss": 0.4059, "num_input_tokens_seen": 13619260186, "step": 3493, "train_runtime": 138902.9912, "train_tokens_per_second": 98048.718 }, { "epoch": 0.5554848966613672, "grad_norm": 0.2146899700164795, "learning_rate": 2.0747817588246555e-05, "loss": 0.4055, "num_input_tokens_seen": 13623151130, "step": 3494, "train_runtime": 138943.9734, "train_tokens_per_second": 98047.802 }, { "epoch": 0.5556438791732909, "grad_norm": 0.20452530682086945, "learning_rate": 2.0735488120921477e-05, "loss": 0.4124, "num_input_tokens_seen": 13627027102, "step": 3495, "train_runtime": 138982.7132, "train_tokens_per_second": 98048.36 }, { "epoch": 0.5558028616852146, "grad_norm": 0.23385390639305115, "learning_rate": 2.0723159721826478e-05, "loss": 0.4084, "num_input_tokens_seen": 13630898317, "step": 3496, "train_runtime": 139020.3116, "train_tokens_per_second": 98049.689 }, { "epoch": 0.5559618441971383, "grad_norm": 0.2064310908317566, "learning_rate": 2.0710832394049755e-05, "loss": 0.3958, "num_input_tokens_seen": 13634797471, "step": 3497, "train_runtime": 139060.9158, "train_tokens_per_second": 98049.099 }, { "epoch": 0.556120826709062, "grad_norm": 0.21286939084529877, "learning_rate": 2.06985061406792e-05, "loss": 0.4097, "num_input_tokens_seen": 13638751445, "step": 3498, "train_runtime": 139099.215, "train_tokens_per_second": 98050.528 }, { "epoch": 0.5562798092209857, "grad_norm": 0.1913093775510788, "learning_rate": 2.0686180964802478e-05, "loss": 0.3957, "num_input_tokens_seen": 13642496488, "step": 3499, "train_runtime": 139139.8925, "train_tokens_per_second": 98048.778 }, { "epoch": 0.5564387917329093, "grad_norm": 0.21631716191768646, "learning_rate": 2.0673856869506935e-05, "loss": 0.4011, "num_input_tokens_seen": 13646369316, "step": 3500, "train_runtime": 139178.8545, "train_tokens_per_second": 98049.157 }, { "epoch": 0.556597774244833, "grad_norm": 0.26353567838668823, "learning_rate": 2.0661533857879674e-05, "loss": 0.4009, "num_input_tokens_seen": 13650240826, "step": 3501, "train_runtime": 139217.6331, "train_tokens_per_second": 98049.655 }, { "epoch": 0.5567567567567567, "grad_norm": 0.19780157506465912, "learning_rate": 2.0649211933007525e-05, "loss": 0.387, "num_input_tokens_seen": 13654105698, "step": 3502, "train_runtime": 139255.9547, "train_tokens_per_second": 98050.426 }, { "epoch": 0.5569157392686804, "grad_norm": 0.21761099994182587, "learning_rate": 2.0636891097977034e-05, "loss": 0.4079, "num_input_tokens_seen": 13657975224, "step": 3503, "train_runtime": 139292.6419, "train_tokens_per_second": 98052.381 }, { "epoch": 0.5570747217806041, "grad_norm": 0.1823747456073761, "learning_rate": 2.0624571355874506e-05, "loss": 0.4069, "num_input_tokens_seen": 13661910618, "step": 3504, "train_runtime": 139331.4309, "train_tokens_per_second": 98053.329 }, { "epoch": 0.5572337042925278, "grad_norm": 0.19295740127563477, "learning_rate": 2.061225270978593e-05, "loss": 0.4022, "num_input_tokens_seen": 13665810294, "step": 3505, "train_runtime": 139369.991, "train_tokens_per_second": 98054.181 }, { "epoch": 0.5573926868044515, "grad_norm": 0.19253510236740112, "learning_rate": 2.0599935162797037e-05, "loss": 0.3957, "num_input_tokens_seen": 13669613232, "step": 3506, "train_runtime": 139409.1332, "train_tokens_per_second": 98053.929 }, { "epoch": 0.5575516693163752, "grad_norm": 0.20797425508499146, "learning_rate": 2.0587618717993304e-05, "loss": 0.4071, "num_input_tokens_seen": 13673666632, "step": 3507, "train_runtime": 139448.283, "train_tokens_per_second": 98055.468 }, { "epoch": 0.5577106518282989, "grad_norm": 0.18883377313613892, "learning_rate": 2.05753033784599e-05, "loss": 0.4064, "num_input_tokens_seen": 13677617254, "step": 3508, "train_runtime": 139489.0985, "train_tokens_per_second": 98055.098 }, { "epoch": 0.5578696343402226, "grad_norm": 0.19489426910877228, "learning_rate": 2.0562989147281732e-05, "loss": 0.3928, "num_input_tokens_seen": 13681408545, "step": 3509, "train_runtime": 139526.7543, "train_tokens_per_second": 98055.807 }, { "epoch": 0.5580286168521462, "grad_norm": 0.19290150701999664, "learning_rate": 2.0550676027543435e-05, "loss": 0.3854, "num_input_tokens_seen": 13685323972, "step": 3510, "train_runtime": 139565.2135, "train_tokens_per_second": 98056.841 }, { "epoch": 0.5581875993640699, "grad_norm": 0.2571811079978943, "learning_rate": 2.0538364022329338e-05, "loss": 0.3971, "num_input_tokens_seen": 13689253081, "step": 3511, "train_runtime": 139605.2686, "train_tokens_per_second": 98056.851 }, { "epoch": 0.5583465818759936, "grad_norm": 0.20036165416240692, "learning_rate": 2.052605313472354e-05, "loss": 0.4069, "num_input_tokens_seen": 13693127701, "step": 3512, "train_runtime": 139645.3063, "train_tokens_per_second": 98056.484 }, { "epoch": 0.5585055643879173, "grad_norm": 0.17971254885196686, "learning_rate": 2.051374336780981e-05, "loss": 0.3973, "num_input_tokens_seen": 13697023104, "step": 3513, "train_runtime": 139681.5666, "train_tokens_per_second": 98058.917 }, { "epoch": 0.558664546899841, "grad_norm": 0.2112085372209549, "learning_rate": 2.0501434724671665e-05, "loss": 0.4057, "num_input_tokens_seen": 13700794742, "step": 3514, "train_runtime": 139722.3665, "train_tokens_per_second": 98057.277 }, { "epoch": 0.5588235294117647, "grad_norm": 0.2104506492614746, "learning_rate": 2.0489127208392335e-05, "loss": 0.4042, "num_input_tokens_seen": 13704677736, "step": 3515, "train_runtime": 139761.0589, "train_tokens_per_second": 98057.913 }, { "epoch": 0.5589825119236884, "grad_norm": 0.17120705544948578, "learning_rate": 2.0476820822054764e-05, "loss": 0.3939, "num_input_tokens_seen": 13708629916, "step": 3516, "train_runtime": 139797.7495, "train_tokens_per_second": 98060.448 }, { "epoch": 0.5591414944356121, "grad_norm": 0.2298281043767929, "learning_rate": 2.0464515568741616e-05, "loss": 0.4097, "num_input_tokens_seen": 13712504384, "step": 3517, "train_runtime": 139838.751, "train_tokens_per_second": 98059.403 }, { "epoch": 0.5593004769475358, "grad_norm": 0.19530528783798218, "learning_rate": 2.045221145153527e-05, "loss": 0.4029, "num_input_tokens_seen": 13716462573, "step": 3518, "train_runtime": 139877.8197, "train_tokens_per_second": 98060.312 }, { "epoch": 0.5594594594594594, "grad_norm": 0.19268560409545898, "learning_rate": 2.043990847351781e-05, "loss": 0.4091, "num_input_tokens_seen": 13720324930, "step": 3519, "train_runtime": 139918.4914, "train_tokens_per_second": 98059.412 }, { "epoch": 0.5596184419713831, "grad_norm": 0.21793320775032043, "learning_rate": 2.0427606637771063e-05, "loss": 0.3964, "num_input_tokens_seen": 13724216875, "step": 3520, "train_runtime": 139958.8192, "train_tokens_per_second": 98058.964 }, { "epoch": 0.5597774244833068, "grad_norm": 0.22408504784107208, "learning_rate": 2.0415305947376534e-05, "loss": 0.4, "num_input_tokens_seen": 13728118016, "step": 3521, "train_runtime": 139999.7046, "train_tokens_per_second": 98058.193 }, { "epoch": 0.5599364069952305, "grad_norm": 0.21667902171611786, "learning_rate": 2.040300640541548e-05, "loss": 0.4074, "num_input_tokens_seen": 13731959187, "step": 3522, "train_runtime": 140037.4222, "train_tokens_per_second": 98059.211 }, { "epoch": 0.5600953895071542, "grad_norm": 0.19566543400287628, "learning_rate": 2.0390708014968834e-05, "loss": 0.4152, "num_input_tokens_seen": 13735901615, "step": 3523, "train_runtime": 140075.2796, "train_tokens_per_second": 98060.854 }, { "epoch": 0.5602543720190779, "grad_norm": 0.18856804072856903, "learning_rate": 2.0378410779117255e-05, "loss": 0.393, "num_input_tokens_seen": 13739751811, "step": 3524, "train_runtime": 140115.393, "train_tokens_per_second": 98060.26 }, { "epoch": 0.5604133545310016, "grad_norm": 0.19509346783161163, "learning_rate": 2.0366114700941126e-05, "loss": 0.3941, "num_input_tokens_seen": 13743640659, "step": 3525, "train_runtime": 140152.6106, "train_tokens_per_second": 98061.967 }, { "epoch": 0.5605723370429253, "grad_norm": 0.20664682984352112, "learning_rate": 2.0353819783520506e-05, "loss": 0.4146, "num_input_tokens_seen": 13747496421, "step": 3526, "train_runtime": 140193.8947, "train_tokens_per_second": 98060.593 }, { "epoch": 0.560731319554849, "grad_norm": 0.18656757473945618, "learning_rate": 2.034152602993521e-05, "loss": 0.4035, "num_input_tokens_seen": 13751401290, "step": 3527, "train_runtime": 140233.8211, "train_tokens_per_second": 98060.519 }, { "epoch": 0.5608903020667727, "grad_norm": 0.18271920084953308, "learning_rate": 2.0329233443264724e-05, "loss": 0.4043, "num_input_tokens_seen": 13755196894, "step": 3528, "train_runtime": 140275.3014, "train_tokens_per_second": 98058.58 }, { "epoch": 0.5610492845786963, "grad_norm": 0.2813625633716583, "learning_rate": 2.031694202658825e-05, "loss": 0.3909, "num_input_tokens_seen": 13759169888, "step": 3529, "train_runtime": 140314.7486, "train_tokens_per_second": 98059.328 }, { "epoch": 0.56120826709062, "grad_norm": 0.23062749207019806, "learning_rate": 2.030465178298471e-05, "loss": 0.3972, "num_input_tokens_seen": 13762958506, "step": 3530, "train_runtime": 140352.9585, "train_tokens_per_second": 98059.625 }, { "epoch": 0.5613672496025437, "grad_norm": 0.2328909933567047, "learning_rate": 2.0292362715532724e-05, "loss": 0.4125, "num_input_tokens_seen": 13766811118, "step": 3531, "train_runtime": 140392.9924, "train_tokens_per_second": 98059.105 }, { "epoch": 0.5615262321144674, "grad_norm": 0.2540287971496582, "learning_rate": 2.0280074827310603e-05, "loss": 0.4041, "num_input_tokens_seen": 13770840127, "step": 3532, "train_runtime": 140433.6514, "train_tokens_per_second": 98059.404 }, { "epoch": 0.5616852146263911, "grad_norm": 0.18686184287071228, "learning_rate": 2.0267788121396393e-05, "loss": 0.4098, "num_input_tokens_seen": 13774753408, "step": 3533, "train_runtime": 140474.8595, "train_tokens_per_second": 98058.496 }, { "epoch": 0.5618441971383148, "grad_norm": 0.32257843017578125, "learning_rate": 2.0255502600867807e-05, "loss": 0.4045, "num_input_tokens_seen": 13778568804, "step": 3534, "train_runtime": 140512.9365, "train_tokens_per_second": 98059.077 }, { "epoch": 0.5620031796502385, "grad_norm": 0.2087022215127945, "learning_rate": 2.0243218268802306e-05, "loss": 0.3972, "num_input_tokens_seen": 13782510685, "step": 3535, "train_runtime": 140549.1244, "train_tokens_per_second": 98061.875 }, { "epoch": 0.5621621621621622, "grad_norm": 0.22166043519973755, "learning_rate": 2.0230935128277013e-05, "loss": 0.3961, "num_input_tokens_seen": 13786387045, "step": 3536, "train_runtime": 140588.7077, "train_tokens_per_second": 98061.838 }, { "epoch": 0.5623211446740859, "grad_norm": 0.49833595752716064, "learning_rate": 2.021865318236876e-05, "loss": 0.4166, "num_input_tokens_seen": 13790175088, "step": 3537, "train_runtime": 140628.7115, "train_tokens_per_second": 98060.879 }, { "epoch": 0.5624801271860096, "grad_norm": 0.19257371127605438, "learning_rate": 2.020637243415411e-05, "loss": 0.3898, "num_input_tokens_seen": 13794134916, "step": 3538, "train_runtime": 140665.7533, "train_tokens_per_second": 98063.207 }, { "epoch": 0.5626391096979332, "grad_norm": 0.19765512645244598, "learning_rate": 2.0194092886709287e-05, "loss": 0.3963, "num_input_tokens_seen": 13797993369, "step": 3539, "train_runtime": 140706.2595, "train_tokens_per_second": 98062.399 }, { "epoch": 0.5627980922098569, "grad_norm": 0.2444930225610733, "learning_rate": 2.018181454311024e-05, "loss": 0.3889, "num_input_tokens_seen": 13801918005, "step": 3540, "train_runtime": 140744.7887, "train_tokens_per_second": 98063.439 }, { "epoch": 0.5629570747217806, "grad_norm": 0.19423988461494446, "learning_rate": 2.0169537406432604e-05, "loss": 0.3898, "num_input_tokens_seen": 13805753029, "step": 3541, "train_runtime": 140783.3002, "train_tokens_per_second": 98063.854 }, { "epoch": 0.5631160572337043, "grad_norm": 0.2602628767490387, "learning_rate": 2.01572614797517e-05, "loss": 0.4099, "num_input_tokens_seen": 13809596133, "step": 3542, "train_runtime": 140822.5836, "train_tokens_per_second": 98063.789 }, { "epoch": 0.563275039745628, "grad_norm": 0.1954098641872406, "learning_rate": 2.0144986766142588e-05, "loss": 0.395, "num_input_tokens_seen": 13813496071, "step": 3543, "train_runtime": 140861.3417, "train_tokens_per_second": 98064.493 }, { "epoch": 0.5634340222575517, "grad_norm": 0.3569042980670929, "learning_rate": 2.0132713268679972e-05, "loss": 0.4091, "num_input_tokens_seen": 13817339431, "step": 3544, "train_runtime": 140900.2924, "train_tokens_per_second": 98064.661 }, { "epoch": 0.5635930047694754, "grad_norm": 0.21961402893066406, "learning_rate": 2.01204409904383e-05, "loss": 0.3956, "num_input_tokens_seen": 13821280354, "step": 3545, "train_runtime": 140940.5296, "train_tokens_per_second": 98064.626 }, { "epoch": 0.5637519872813991, "grad_norm": 0.19632627069950104, "learning_rate": 2.010816993449168e-05, "loss": 0.4045, "num_input_tokens_seen": 13825323948, "step": 3546, "train_runtime": 140978.5867, "train_tokens_per_second": 98066.836 }, { "epoch": 0.5639109697933228, "grad_norm": 0.18802347779273987, "learning_rate": 2.0095900103913916e-05, "loss": 0.4018, "num_input_tokens_seen": 13829242563, "step": 3547, "train_runtime": 141017.7025, "train_tokens_per_second": 98067.422 }, { "epoch": 0.5640699523052464, "grad_norm": 0.17445088922977448, "learning_rate": 2.0083631501778534e-05, "loss": 0.375, "num_input_tokens_seen": 13833180044, "step": 3548, "train_runtime": 141055.107, "train_tokens_per_second": 98069.331 }, { "epoch": 0.5642289348171701, "grad_norm": 0.17960971593856812, "learning_rate": 2.0071364131158702e-05, "loss": 0.3998, "num_input_tokens_seen": 13837110455, "step": 3549, "train_runtime": 141095.357, "train_tokens_per_second": 98069.212 }, { "epoch": 0.5643879173290938, "grad_norm": 0.191161647439003, "learning_rate": 2.005909799512734e-05, "loss": 0.4077, "num_input_tokens_seen": 13841058769, "step": 3550, "train_runtime": 141132.6125, "train_tokens_per_second": 98071.3 }, { "epoch": 0.5645468998410175, "grad_norm": 0.2021871656179428, "learning_rate": 2.0046833096757017e-05, "loss": 0.4029, "num_input_tokens_seen": 13845043791, "step": 3551, "train_runtime": 141172.0594, "train_tokens_per_second": 98072.125 }, { "epoch": 0.5647058823529412, "grad_norm": 0.19624565541744232, "learning_rate": 2.003456943911999e-05, "loss": 0.3954, "num_input_tokens_seen": 13848820754, "step": 3552, "train_runtime": 141211.8807, "train_tokens_per_second": 98071.215 }, { "epoch": 0.5648648648648649, "grad_norm": 0.2072814702987671, "learning_rate": 2.0022307025288245e-05, "loss": 0.4153, "num_input_tokens_seen": 13852782342, "step": 3553, "train_runtime": 141250.2657, "train_tokens_per_second": 98072.611 }, { "epoch": 0.5650238473767886, "grad_norm": 0.2726183533668518, "learning_rate": 2.0010045858333413e-05, "loss": 0.4224, "num_input_tokens_seen": 13856749572, "step": 3554, "train_runtime": 141290.025, "train_tokens_per_second": 98073.092 }, { "epoch": 0.5651828298887123, "grad_norm": 0.20462757349014282, "learning_rate": 1.999778594132683e-05, "loss": 0.408, "num_input_tokens_seen": 13860577835, "step": 3555, "train_runtime": 141329.0838, "train_tokens_per_second": 98073.075 }, { "epoch": 0.565341812400636, "grad_norm": 0.23301002383232117, "learning_rate": 1.9985527277339523e-05, "loss": 0.4039, "num_input_tokens_seen": 13864510702, "step": 3556, "train_runtime": 141366.2996, "train_tokens_per_second": 98075.077 }, { "epoch": 0.5655007949125597, "grad_norm": 0.18194572627544403, "learning_rate": 1.9973269869442192e-05, "loss": 0.4095, "num_input_tokens_seen": 13868372541, "step": 3557, "train_runtime": 141406.4246, "train_tokens_per_second": 98074.558 }, { "epoch": 0.5656597774244833, "grad_norm": 0.19775132834911346, "learning_rate": 1.996101372070525e-05, "loss": 0.3946, "num_input_tokens_seen": 13872205695, "step": 3558, "train_runtime": 141445.8922, "train_tokens_per_second": 98074.292 }, { "epoch": 0.565818759936407, "grad_norm": 0.1776951402425766, "learning_rate": 1.994875883419876e-05, "loss": 0.4023, "num_input_tokens_seen": 13876118161, "step": 3559, "train_runtime": 141484.1767, "train_tokens_per_second": 98075.407 }, { "epoch": 0.5659777424483307, "grad_norm": 0.1811799705028534, "learning_rate": 1.9936505212992477e-05, "loss": 0.3991, "num_input_tokens_seen": 13880095119, "step": 3560, "train_runtime": 141523.9576, "train_tokens_per_second": 98075.94 }, { "epoch": 0.5661367249602544, "grad_norm": 0.27806422114372253, "learning_rate": 1.992425286015587e-05, "loss": 0.4019, "num_input_tokens_seen": 13884029551, "step": 3561, "train_runtime": 141565.7857, "train_tokens_per_second": 98074.754 }, { "epoch": 0.5662957074721781, "grad_norm": 0.20901796221733093, "learning_rate": 1.991200177875805e-05, "loss": 0.4185, "num_input_tokens_seen": 13887961292, "step": 3562, "train_runtime": 141604.1873, "train_tokens_per_second": 98075.922 }, { "epoch": 0.5664546899841018, "grad_norm": 0.18688328564167023, "learning_rate": 1.9899751971867834e-05, "loss": 0.4042, "num_input_tokens_seen": 13891879436, "step": 3563, "train_runtime": 141643.9539, "train_tokens_per_second": 98076.049 }, { "epoch": 0.5666136724960255, "grad_norm": 0.25920963287353516, "learning_rate": 1.9887503442553706e-05, "loss": 0.4037, "num_input_tokens_seen": 13895770169, "step": 3564, "train_runtime": 141681.4711, "train_tokens_per_second": 98077.54 }, { "epoch": 0.5667726550079492, "grad_norm": 0.19492468237876892, "learning_rate": 1.9875256193883838e-05, "loss": 0.3922, "num_input_tokens_seen": 13899683643, "step": 3565, "train_runtime": 141722.4783, "train_tokens_per_second": 98076.775 }, { "epoch": 0.5669316375198729, "grad_norm": 0.21878768503665924, "learning_rate": 1.9863010228926084e-05, "loss": 0.3936, "num_input_tokens_seen": 13903589570, "step": 3566, "train_runtime": 141759.8187, "train_tokens_per_second": 98078.494 }, { "epoch": 0.5670906200317966, "grad_norm": 0.19418427348136902, "learning_rate": 1.985076555074796e-05, "loss": 0.4, "num_input_tokens_seen": 13907425581, "step": 3567, "train_runtime": 141800.9235, "train_tokens_per_second": 98077.116 }, { "epoch": 0.5672496025437201, "grad_norm": 0.2344890683889389, "learning_rate": 1.9838522162416687e-05, "loss": 0.4075, "num_input_tokens_seen": 13911412262, "step": 3568, "train_runtime": 141841.4166, "train_tokens_per_second": 98077.223 }, { "epoch": 0.5674085850556438, "grad_norm": 0.22367064654827118, "learning_rate": 1.9826280066999143e-05, "loss": 0.4069, "num_input_tokens_seen": 13915278853, "step": 3569, "train_runtime": 141881.7967, "train_tokens_per_second": 98076.562 }, { "epoch": 0.5675675675675675, "grad_norm": 0.18284094333648682, "learning_rate": 1.9814039267561872e-05, "loss": 0.3957, "num_input_tokens_seen": 13919153557, "step": 3570, "train_runtime": 141920.4927, "train_tokens_per_second": 98077.123 }, { "epoch": 0.5677265500794912, "grad_norm": 0.2524445652961731, "learning_rate": 1.9801799767171125e-05, "loss": 0.3988, "num_input_tokens_seen": 13922995965, "step": 3571, "train_runtime": 141958.6518, "train_tokens_per_second": 98077.826 }, { "epoch": 0.5678855325914149, "grad_norm": 0.19749946892261505, "learning_rate": 1.9789561568892807e-05, "loss": 0.3811, "num_input_tokens_seen": 13926829165, "step": 3572, "train_runtime": 141996.9442, "train_tokens_per_second": 98078.372 }, { "epoch": 0.5680445151033386, "grad_norm": 0.24868056178092957, "learning_rate": 1.977732467579249e-05, "loss": 0.4094, "num_input_tokens_seen": 13930847243, "step": 3573, "train_runtime": 142037.63, "train_tokens_per_second": 98078.567 }, { "epoch": 0.5682034976152623, "grad_norm": 0.23857107758522034, "learning_rate": 1.9765089090935443e-05, "loss": 0.4036, "num_input_tokens_seen": 13934742965, "step": 3574, "train_runtime": 142076.4821, "train_tokens_per_second": 98079.167 }, { "epoch": 0.568362480127186, "grad_norm": 0.17540177702903748, "learning_rate": 1.9752854817386577e-05, "loss": 0.3994, "num_input_tokens_seen": 13938612087, "step": 3575, "train_runtime": 142116.1494, "train_tokens_per_second": 98079.016 }, { "epoch": 0.5685214626391097, "grad_norm": 0.2193773090839386, "learning_rate": 1.9740621858210512e-05, "loss": 0.3973, "num_input_tokens_seen": 13942555080, "step": 3576, "train_runtime": 142154.5072, "train_tokens_per_second": 98080.289 }, { "epoch": 0.5686804451510333, "grad_norm": 0.1752547323703766, "learning_rate": 1.9728390216471503e-05, "loss": 0.4118, "num_input_tokens_seen": 13946418564, "step": 3577, "train_runtime": 142193.7394, "train_tokens_per_second": 98080.398 }, { "epoch": 0.568839427662957, "grad_norm": 0.19581246376037598, "learning_rate": 1.9716159895233486e-05, "loss": 0.3907, "num_input_tokens_seen": 13950390921, "step": 3578, "train_runtime": 142232.0962, "train_tokens_per_second": 98081.877 }, { "epoch": 0.5689984101748807, "grad_norm": 0.21454553306102753, "learning_rate": 1.970393089756009e-05, "loss": 0.408, "num_input_tokens_seen": 13954171091, "step": 3579, "train_runtime": 142270.4661, "train_tokens_per_second": 98081.995 }, { "epoch": 0.5691573926868044, "grad_norm": 0.209858700633049, "learning_rate": 1.9691703226514575e-05, "loss": 0.3916, "num_input_tokens_seen": 13958184588, "step": 3580, "train_runtime": 142309.5998, "train_tokens_per_second": 98083.226 }, { "epoch": 0.5693163751987281, "grad_norm": 0.19506390392780304, "learning_rate": 1.9679476885159894e-05, "loss": 0.3957, "num_input_tokens_seen": 13962134109, "step": 3581, "train_runtime": 142349.6556, "train_tokens_per_second": 98083.371 }, { "epoch": 0.5694753577106518, "grad_norm": 0.20946431159973145, "learning_rate": 1.9667251876558656e-05, "loss": 0.3898, "num_input_tokens_seen": 13966033832, "step": 3582, "train_runtime": 142388.2099, "train_tokens_per_second": 98084.201 }, { "epoch": 0.5696343402225755, "grad_norm": 0.2949838638305664, "learning_rate": 1.9655028203773132e-05, "loss": 0.3961, "num_input_tokens_seen": 13969911666, "step": 3583, "train_runtime": 142428.9949, "train_tokens_per_second": 98083.341 }, { "epoch": 0.5697933227344992, "grad_norm": 0.2071828842163086, "learning_rate": 1.9642805869865284e-05, "loss": 0.3955, "num_input_tokens_seen": 13973810301, "step": 3584, "train_runtime": 142470.8928, "train_tokens_per_second": 98081.861 }, { "epoch": 0.5699523052464229, "grad_norm": 0.22093906998634338, "learning_rate": 1.9630584877896698e-05, "loss": 0.3942, "num_input_tokens_seen": 13977711757, "step": 3585, "train_runtime": 142512.8365, "train_tokens_per_second": 98080.37 }, { "epoch": 0.5701112877583466, "grad_norm": 0.25591033697128296, "learning_rate": 1.961836523092867e-05, "loss": 0.4106, "num_input_tokens_seen": 13981651613, "step": 3586, "train_runtime": 142549.4052, "train_tokens_per_second": 98082.848 }, { "epoch": 0.5702702702702702, "grad_norm": 0.19359415769577026, "learning_rate": 1.960614693202212e-05, "loss": 0.3961, "num_input_tokens_seen": 13985535733, "step": 3587, "train_runtime": 142589.3295, "train_tokens_per_second": 98082.625 }, { "epoch": 0.5704292527821939, "grad_norm": 0.19278307259082794, "learning_rate": 1.9593929984237646e-05, "loss": 0.4015, "num_input_tokens_seen": 13989455896, "step": 3588, "train_runtime": 142626.4643, "train_tokens_per_second": 98084.573 }, { "epoch": 0.5705882352941176, "grad_norm": 0.28242695331573486, "learning_rate": 1.9581714390635513e-05, "loss": 0.3904, "num_input_tokens_seen": 13993371931, "step": 3589, "train_runtime": 142666.1, "train_tokens_per_second": 98084.772 }, { "epoch": 0.5707472178060413, "grad_norm": 0.22092591226100922, "learning_rate": 1.956950015427564e-05, "loss": 0.4052, "num_input_tokens_seen": 13997265956, "step": 3590, "train_runtime": 142703.1282, "train_tokens_per_second": 98086.609 }, { "epoch": 0.570906200317965, "grad_norm": 0.22474543750286102, "learning_rate": 1.9557287278217595e-05, "loss": 0.3909, "num_input_tokens_seen": 14001112212, "step": 3591, "train_runtime": 142742.8922, "train_tokens_per_second": 98086.23 }, { "epoch": 0.5710651828298887, "grad_norm": 0.2132899910211563, "learning_rate": 1.9545075765520634e-05, "loss": 0.3978, "num_input_tokens_seen": 14005123318, "step": 3592, "train_runtime": 142780.4382, "train_tokens_per_second": 98088.53 }, { "epoch": 0.5712241653418124, "grad_norm": 0.2364567220211029, "learning_rate": 1.9532865619243638e-05, "loss": 0.3928, "num_input_tokens_seen": 14008831632, "step": 3593, "train_runtime": 142819.7997, "train_tokens_per_second": 98087.462 }, { "epoch": 0.5713831478537361, "grad_norm": 0.20068909227848053, "learning_rate": 1.9520656842445184e-05, "loss": 0.4083, "num_input_tokens_seen": 14012753930, "step": 3594, "train_runtime": 142860.6782, "train_tokens_per_second": 98086.85 }, { "epoch": 0.5715421303656598, "grad_norm": 0.18715780973434448, "learning_rate": 1.9508449438183468e-05, "loss": 0.3954, "num_input_tokens_seen": 14016673374, "step": 3595, "train_runtime": 142901.5759, "train_tokens_per_second": 98086.206 }, { "epoch": 0.5717011128775835, "grad_norm": 0.1977829933166504, "learning_rate": 1.949624340951636e-05, "loss": 0.4037, "num_input_tokens_seen": 14020596272, "step": 3596, "train_runtime": 142938.0313, "train_tokens_per_second": 98088.634 }, { "epoch": 0.5718600953895071, "grad_norm": 0.46877560019493103, "learning_rate": 1.948403875950139e-05, "loss": 0.4061, "num_input_tokens_seen": 14024438681, "step": 3597, "train_runtime": 142975.8725, "train_tokens_per_second": 98089.548 }, { "epoch": 0.5720190779014308, "grad_norm": 0.2522440552711487, "learning_rate": 1.9471835491195725e-05, "loss": 0.3862, "num_input_tokens_seen": 14028374336, "step": 3598, "train_runtime": 143015.4952, "train_tokens_per_second": 98089.891 }, { "epoch": 0.5721780604133545, "grad_norm": 0.20596006512641907, "learning_rate": 1.9459633607656215e-05, "loss": 0.3923, "num_input_tokens_seen": 14032258302, "step": 3599, "train_runtime": 143055.4279, "train_tokens_per_second": 98089.66 }, { "epoch": 0.5723370429252782, "grad_norm": 0.3317966163158417, "learning_rate": 1.9447433111939334e-05, "loss": 0.3983, "num_input_tokens_seen": 14036100603, "step": 3600, "train_runtime": 143091.7562, "train_tokens_per_second": 98091.609 }, { "epoch": 0.5724960254372019, "grad_norm": 0.2028813511133194, "learning_rate": 1.9435234007101215e-05, "loss": 0.3996, "num_input_tokens_seen": 14039977568, "step": 3601, "train_runtime": 143245.2226, "train_tokens_per_second": 98013.583 }, { "epoch": 0.5726550079491256, "grad_norm": 0.4160497486591339, "learning_rate": 1.9423036296197663e-05, "loss": 0.3905, "num_input_tokens_seen": 14043876624, "step": 3602, "train_runtime": 143285.6385, "train_tokens_per_second": 98013.149 }, { "epoch": 0.5728139904610493, "grad_norm": 0.20099486410617828, "learning_rate": 1.9410839982284106e-05, "loss": 0.3946, "num_input_tokens_seen": 14047762891, "step": 3603, "train_runtime": 143324.1141, "train_tokens_per_second": 98013.952 }, { "epoch": 0.572972972972973, "grad_norm": 0.21529966592788696, "learning_rate": 1.9398645068415645e-05, "loss": 0.4103, "num_input_tokens_seen": 14051706792, "step": 3604, "train_runtime": 143365.129, "train_tokens_per_second": 98013.421 }, { "epoch": 0.5731319554848967, "grad_norm": 0.20678012073040009, "learning_rate": 1.9386451557647008e-05, "loss": 0.4057, "num_input_tokens_seen": 14055697303, "step": 3605, "train_runtime": 143406.1261, "train_tokens_per_second": 98013.228 }, { "epoch": 0.5732909379968204, "grad_norm": 0.2086184024810791, "learning_rate": 1.937425945303258e-05, "loss": 0.3958, "num_input_tokens_seen": 14059579934, "step": 3606, "train_runtime": 143445.2361, "train_tokens_per_second": 98013.572 }, { "epoch": 0.573449920508744, "grad_norm": 0.24856382608413696, "learning_rate": 1.936206875762642e-05, "loss": 0.3989, "num_input_tokens_seen": 14063550373, "step": 3607, "train_runtime": 143485.217, "train_tokens_per_second": 98013.932 }, { "epoch": 0.5736089030206677, "grad_norm": 0.20505064725875854, "learning_rate": 1.9349879474482194e-05, "loss": 0.3977, "num_input_tokens_seen": 14067389015, "step": 3608, "train_runtime": 143524.7426, "train_tokens_per_second": 98013.686 }, { "epoch": 0.5737678855325914, "grad_norm": 0.23717810213565826, "learning_rate": 1.933769160665322e-05, "loss": 0.3813, "num_input_tokens_seen": 14071267678, "step": 3609, "train_runtime": 143564.4227, "train_tokens_per_second": 98013.612 }, { "epoch": 0.5739268680445151, "grad_norm": 0.20781934261322021, "learning_rate": 1.9325505157192503e-05, "loss": 0.3982, "num_input_tokens_seen": 14075196126, "step": 3610, "train_runtime": 143606.1036, "train_tokens_per_second": 98012.52 }, { "epoch": 0.5740858505564388, "grad_norm": 0.17776434123516083, "learning_rate": 1.9313320129152638e-05, "loss": 0.408, "num_input_tokens_seen": 14079124293, "step": 3611, "train_runtime": 143643.6958, "train_tokens_per_second": 98014.216 }, { "epoch": 0.5742448330683625, "grad_norm": 0.20207659900188446, "learning_rate": 1.93011365255859e-05, "loss": 0.4054, "num_input_tokens_seen": 14083065421, "step": 3612, "train_runtime": 143684.2148, "train_tokens_per_second": 98014.006 }, { "epoch": 0.5744038155802862, "grad_norm": 0.2445525974035263, "learning_rate": 1.9288954349544197e-05, "loss": 0.3904, "num_input_tokens_seen": 14086909925, "step": 3613, "train_runtime": 143722.8203, "train_tokens_per_second": 98014.427 }, { "epoch": 0.5745627980922099, "grad_norm": 0.1985427439212799, "learning_rate": 1.9276773604079057e-05, "loss": 0.4101, "num_input_tokens_seen": 14090872032, "step": 3614, "train_runtime": 143764.1752, "train_tokens_per_second": 98013.792 }, { "epoch": 0.5747217806041336, "grad_norm": 0.19335204362869263, "learning_rate": 1.9264594292241704e-05, "loss": 0.3965, "num_input_tokens_seen": 14094545783, "step": 3615, "train_runtime": 143802.4614, "train_tokens_per_second": 98013.244 }, { "epoch": 0.5748807631160572, "grad_norm": 0.2033414989709854, "learning_rate": 1.9252416417082936e-05, "loss": 0.39, "num_input_tokens_seen": 14098599549, "step": 3616, "train_runtime": 143841.6147, "train_tokens_per_second": 98014.748 }, { "epoch": 0.5750397456279809, "grad_norm": 0.1908659040927887, "learning_rate": 1.9240239981653255e-05, "loss": 0.4021, "num_input_tokens_seen": 14102391912, "step": 3617, "train_runtime": 143880.1896, "train_tokens_per_second": 98014.827 }, { "epoch": 0.5751987281399046, "grad_norm": 0.18590766191482544, "learning_rate": 1.9228064989002755e-05, "loss": 0.3925, "num_input_tokens_seen": 14106329632, "step": 3618, "train_runtime": 143920.0087, "train_tokens_per_second": 98015.069 }, { "epoch": 0.5753577106518283, "grad_norm": 0.24493460357189178, "learning_rate": 1.9215891442181186e-05, "loss": 0.3898, "num_input_tokens_seen": 14110312975, "step": 3619, "train_runtime": 143958.5532, "train_tokens_per_second": 98016.496 }, { "epoch": 0.575516693163752, "grad_norm": 0.1761881560087204, "learning_rate": 1.9203719344237943e-05, "loss": 0.3969, "num_input_tokens_seen": 14114307936, "step": 3620, "train_runtime": 143998.8104, "train_tokens_per_second": 98016.837 }, { "epoch": 0.5756756756756757, "grad_norm": 0.19253608584403992, "learning_rate": 1.9191548698222034e-05, "loss": 0.3935, "num_input_tokens_seen": 14118314202, "step": 3621, "train_runtime": 144037.8662, "train_tokens_per_second": 98018.074 }, { "epoch": 0.5758346581875994, "grad_norm": 0.18722786009311676, "learning_rate": 1.9179379507182143e-05, "loss": 0.3873, "num_input_tokens_seen": 14122167336, "step": 3622, "train_runtime": 144076.4971, "train_tokens_per_second": 98018.536 }, { "epoch": 0.5759936406995231, "grad_norm": 0.18739379942417145, "learning_rate": 1.9167211774166554e-05, "loss": 0.3909, "num_input_tokens_seen": 14126043646, "step": 3623, "train_runtime": 144116.1873, "train_tokens_per_second": 98018.439 }, { "epoch": 0.5761526232114468, "grad_norm": 0.47869184613227844, "learning_rate": 1.915504550222319e-05, "loss": 0.396, "num_input_tokens_seen": 14130003961, "step": 3624, "train_runtime": 144156.7923, "train_tokens_per_second": 98018.302 }, { "epoch": 0.5763116057233705, "grad_norm": 0.18569208681583405, "learning_rate": 1.9142880694399638e-05, "loss": 0.4083, "num_input_tokens_seen": 14133861318, "step": 3625, "train_runtime": 144192.4287, "train_tokens_per_second": 98020.828 }, { "epoch": 0.5764705882352941, "grad_norm": 0.1853358894586563, "learning_rate": 1.9130717353743073e-05, "loss": 0.3877, "num_input_tokens_seen": 14137734788, "step": 3626, "train_runtime": 144230.1841, "train_tokens_per_second": 98022.026 }, { "epoch": 0.5766295707472178, "grad_norm": 0.20390702784061432, "learning_rate": 1.9118555483300344e-05, "loss": 0.3992, "num_input_tokens_seen": 14141675355, "step": 3627, "train_runtime": 144270.7009, "train_tokens_per_second": 98021.811 }, { "epoch": 0.5767885532591415, "grad_norm": 0.2039474993944168, "learning_rate": 1.9106395086117908e-05, "loss": 0.4119, "num_input_tokens_seen": 14145643253, "step": 3628, "train_runtime": 144308.838, "train_tokens_per_second": 98023.402 }, { "epoch": 0.5769475357710652, "grad_norm": 0.229741632938385, "learning_rate": 1.9094236165241843e-05, "loss": 0.3997, "num_input_tokens_seen": 14149487358, "step": 3629, "train_runtime": 144349.6892, "train_tokens_per_second": 98022.292 }, { "epoch": 0.5771065182829889, "grad_norm": 0.1983143389225006, "learning_rate": 1.90820787237179e-05, "loss": 0.3899, "num_input_tokens_seen": 14153321333, "step": 3630, "train_runtime": 144390.269, "train_tokens_per_second": 98021.296 }, { "epoch": 0.5772655007949126, "grad_norm": 0.23513591289520264, "learning_rate": 1.9069922764591415e-05, "loss": 0.3963, "num_input_tokens_seen": 14157254578, "step": 3631, "train_runtime": 144430.4336, "train_tokens_per_second": 98021.27 }, { "epoch": 0.5774244833068363, "grad_norm": 0.2006419152021408, "learning_rate": 1.9057768290907365e-05, "loss": 0.4051, "num_input_tokens_seen": 14161173087, "step": 3632, "train_runtime": 144472.6902, "train_tokens_per_second": 98019.723 }, { "epoch": 0.57758346581876, "grad_norm": 0.2597225308418274, "learning_rate": 1.904561530571038e-05, "loss": 0.4044, "num_input_tokens_seen": 14164930882, "step": 3633, "train_runtime": 144511.0792, "train_tokens_per_second": 98019.688 }, { "epoch": 0.5777424483306837, "grad_norm": 0.22147318720817566, "learning_rate": 1.9033463812044678e-05, "loss": 0.367, "num_input_tokens_seen": 14168773788, "step": 3634, "train_runtime": 144552.3188, "train_tokens_per_second": 98018.309 }, { "epoch": 0.5779014308426074, "grad_norm": 0.2798529267311096, "learning_rate": 1.9021313812954134e-05, "loss": 0.3935, "num_input_tokens_seen": 14172812457, "step": 3635, "train_runtime": 144590.7246, "train_tokens_per_second": 98020.205 }, { "epoch": 0.578060413354531, "grad_norm": 0.173292875289917, "learning_rate": 1.9009165311482236e-05, "loss": 0.3993, "num_input_tokens_seen": 14176796751, "step": 3636, "train_runtime": 144627.9509, "train_tokens_per_second": 98022.524 }, { "epoch": 0.5782193958664547, "grad_norm": 0.22121454775333405, "learning_rate": 1.899701831067208e-05, "loss": 0.3907, "num_input_tokens_seen": 14180540123, "step": 3637, "train_runtime": 144667.3741, "train_tokens_per_second": 98021.687 }, { "epoch": 0.5783783783783784, "grad_norm": 0.2644701302051544, "learning_rate": 1.898487281356643e-05, "loss": 0.4038, "num_input_tokens_seen": 14184463173, "step": 3638, "train_runtime": 144707.0549, "train_tokens_per_second": 98021.919 }, { "epoch": 0.578537360890302, "grad_norm": 0.18366611003875732, "learning_rate": 1.8972728823207624e-05, "loss": 0.4095, "num_input_tokens_seen": 14188438610, "step": 3639, "train_runtime": 144746.748, "train_tokens_per_second": 98022.503 }, { "epoch": 0.5786963434022258, "grad_norm": 0.20781970024108887, "learning_rate": 1.8960586342637666e-05, "loss": 0.4043, "num_input_tokens_seen": 14192247280, "step": 3640, "train_runtime": 144783.7281, "train_tokens_per_second": 98023.773 }, { "epoch": 0.5788553259141495, "grad_norm": 0.21691226959228516, "learning_rate": 1.8948445374898154e-05, "loss": 0.3843, "num_input_tokens_seen": 14196137816, "step": 3641, "train_runtime": 144825.6419, "train_tokens_per_second": 98022.268 }, { "epoch": 0.5790143084260732, "grad_norm": 0.26191237568855286, "learning_rate": 1.893630592303031e-05, "loss": 0.408, "num_input_tokens_seen": 14200152410, "step": 3642, "train_runtime": 144866.3298, "train_tokens_per_second": 98022.449 }, { "epoch": 0.5791732909379969, "grad_norm": 0.3233679533004761, "learning_rate": 1.8924167990074986e-05, "loss": 0.4103, "num_input_tokens_seen": 14203980678, "step": 3643, "train_runtime": 144905.2104, "train_tokens_per_second": 98022.567 }, { "epoch": 0.5793322734499206, "grad_norm": 0.16874949634075165, "learning_rate": 1.8912031579072637e-05, "loss": 0.4157, "num_input_tokens_seen": 14207844206, "step": 3644, "train_runtime": 144945.119, "train_tokens_per_second": 98022.233 }, { "epoch": 0.5794912559618441, "grad_norm": 0.19439896941184998, "learning_rate": 1.8899896693063368e-05, "loss": 0.3923, "num_input_tokens_seen": 14211595673, "step": 3645, "train_runtime": 144985.0446, "train_tokens_per_second": 98021.115 }, { "epoch": 0.5796502384737678, "grad_norm": 0.17977666854858398, "learning_rate": 1.8887763335086875e-05, "loss": 0.409, "num_input_tokens_seen": 14215615602, "step": 3646, "train_runtime": 145021.4174, "train_tokens_per_second": 98024.25 }, { "epoch": 0.5798092209856915, "grad_norm": 0.19354955852031708, "learning_rate": 1.8875631508182462e-05, "loss": 0.3902, "num_input_tokens_seen": 14219418290, "step": 3647, "train_runtime": 145060.6558, "train_tokens_per_second": 98023.949 }, { "epoch": 0.5799682034976152, "grad_norm": 0.18821673095226288, "learning_rate": 1.886350121538909e-05, "loss": 0.3899, "num_input_tokens_seen": 14223294887, "step": 3648, "train_runtime": 145099.333, "train_tokens_per_second": 98024.537 }, { "epoch": 0.5801271860095389, "grad_norm": 0.18551091849803925, "learning_rate": 1.88513724597453e-05, "loss": 0.4027, "num_input_tokens_seen": 14227342912, "step": 3649, "train_runtime": 145140.6629, "train_tokens_per_second": 98024.514 }, { "epoch": 0.5802861685214626, "grad_norm": 0.30869272351264954, "learning_rate": 1.883924524428926e-05, "loss": 0.396, "num_input_tokens_seen": 14231203330, "step": 3650, "train_runtime": 145178.6977, "train_tokens_per_second": 98025.424 }, { "epoch": 0.5804451510333863, "grad_norm": 0.20114080607891083, "learning_rate": 1.882711957205875e-05, "loss": 0.4059, "num_input_tokens_seen": 14235058236, "step": 3651, "train_runtime": 145217.5466, "train_tokens_per_second": 98025.745 }, { "epoch": 0.58060413354531, "grad_norm": 0.20879991352558136, "learning_rate": 1.8814995446091164e-05, "loss": 0.3883, "num_input_tokens_seen": 14238978826, "step": 3652, "train_runtime": 145256.1128, "train_tokens_per_second": 98026.71 }, { "epoch": 0.5807631160572337, "grad_norm": 0.19814100861549377, "learning_rate": 1.8802872869423517e-05, "loss": 0.4015, "num_input_tokens_seen": 14242887393, "step": 3653, "train_runtime": 145292.8322, "train_tokens_per_second": 98028.837 }, { "epoch": 0.5809220985691574, "grad_norm": 0.19908250868320465, "learning_rate": 1.879075184509243e-05, "loss": 0.4004, "num_input_tokens_seen": 14246822617, "step": 3654, "train_runtime": 145331.3092, "train_tokens_per_second": 98029.961 }, { "epoch": 0.581081081081081, "grad_norm": 0.2861635982990265, "learning_rate": 1.8778632376134116e-05, "loss": 0.406, "num_input_tokens_seen": 14250728343, "step": 3655, "train_runtime": 145372.9258, "train_tokens_per_second": 98028.765 }, { "epoch": 0.5812400635930047, "grad_norm": 0.21592754125595093, "learning_rate": 1.8766514465584445e-05, "loss": 0.4066, "num_input_tokens_seen": 14254785453, "step": 3656, "train_runtime": 145411.1191, "train_tokens_per_second": 98030.918 }, { "epoch": 0.5813990461049284, "grad_norm": 0.2079858034849167, "learning_rate": 1.875439811647884e-05, "loss": 0.401, "num_input_tokens_seen": 14258673877, "step": 3657, "train_runtime": 145451.8025, "train_tokens_per_second": 98030.232 }, { "epoch": 0.5815580286168521, "grad_norm": 0.3082723617553711, "learning_rate": 1.8742283331852383e-05, "loss": 0.4022, "num_input_tokens_seen": 14262423752, "step": 3658, "train_runtime": 145490.2773, "train_tokens_per_second": 98030.082 }, { "epoch": 0.5817170111287758, "grad_norm": 0.18972301483154297, "learning_rate": 1.8730170114739727e-05, "loss": 0.3984, "num_input_tokens_seen": 14266403169, "step": 3659, "train_runtime": 145531.0267, "train_tokens_per_second": 98029.977 }, { "epoch": 0.5818759936406995, "grad_norm": 0.19449254870414734, "learning_rate": 1.8718058468175152e-05, "loss": 0.406, "num_input_tokens_seen": 14270377930, "step": 3660, "train_runtime": 145568.5279, "train_tokens_per_second": 98032.027 }, { "epoch": 0.5820349761526232, "grad_norm": 0.20136980712413788, "learning_rate": 1.8705948395192544e-05, "loss": 0.3951, "num_input_tokens_seen": 14274200705, "step": 3661, "train_runtime": 145606.6228, "train_tokens_per_second": 98032.634 }, { "epoch": 0.5821939586645469, "grad_norm": 0.25336599349975586, "learning_rate": 1.8693839898825377e-05, "loss": 0.4055, "num_input_tokens_seen": 14278012864, "step": 3662, "train_runtime": 145644.4992, "train_tokens_per_second": 98033.314 }, { "epoch": 0.5823529411764706, "grad_norm": 0.23189809918403625, "learning_rate": 1.8681732982106766e-05, "loss": 0.3854, "num_input_tokens_seen": 14281920709, "step": 3663, "train_runtime": 145684.0904, "train_tokens_per_second": 98033.496 }, { "epoch": 0.5825119236883943, "grad_norm": 0.1938524842262268, "learning_rate": 1.8669627648069393e-05, "loss": 0.4042, "num_input_tokens_seen": 14285819059, "step": 3664, "train_runtime": 145723.5907, "train_tokens_per_second": 98033.675 }, { "epoch": 0.5826709062003179, "grad_norm": 1.0109388828277588, "learning_rate": 1.8657523899745554e-05, "loss": 0.3989, "num_input_tokens_seen": 14289650340, "step": 3665, "train_runtime": 145762.8024, "train_tokens_per_second": 98033.587 }, { "epoch": 0.5828298887122416, "grad_norm": 0.22207413613796234, "learning_rate": 1.8645421740167168e-05, "loss": 0.4009, "num_input_tokens_seen": 14293505377, "step": 3666, "train_runtime": 145803.3171, "train_tokens_per_second": 98032.786 }, { "epoch": 0.5829888712241653, "grad_norm": 0.20478087663650513, "learning_rate": 1.863332117236573e-05, "loss": 0.4124, "num_input_tokens_seen": 14297568793, "step": 3667, "train_runtime": 145843.2034, "train_tokens_per_second": 98033.837 }, { "epoch": 0.583147853736089, "grad_norm": 0.21368397772312164, "learning_rate": 1.862122219937235e-05, "loss": 0.3933, "num_input_tokens_seen": 14301576217, "step": 3668, "train_runtime": 145883.2909, "train_tokens_per_second": 98034.368 }, { "epoch": 0.5833068362480127, "grad_norm": 0.22133928537368774, "learning_rate": 1.8609124824217735e-05, "loss": 0.3902, "num_input_tokens_seen": 14305211758, "step": 3669, "train_runtime": 145921.8345, "train_tokens_per_second": 98033.388 }, { "epoch": 0.5834658187599364, "grad_norm": 0.19304756820201874, "learning_rate": 1.8597029049932183e-05, "loss": 0.4017, "num_input_tokens_seen": 14309181974, "step": 3670, "train_runtime": 145963.744, "train_tokens_per_second": 98032.44 }, { "epoch": 0.5836248012718601, "grad_norm": 0.23853005468845367, "learning_rate": 1.8584934879545618e-05, "loss": 0.4031, "num_input_tokens_seen": 14313190136, "step": 3671, "train_runtime": 146003.7612, "train_tokens_per_second": 98033.023 }, { "epoch": 0.5837837837837838, "grad_norm": 0.2165200412273407, "learning_rate": 1.8572842316087537e-05, "loss": 0.4005, "num_input_tokens_seen": 14317121105, "step": 3672, "train_runtime": 146040.9481, "train_tokens_per_second": 98034.978 }, { "epoch": 0.5839427662957075, "grad_norm": 0.21814759075641632, "learning_rate": 1.8560751362587025e-05, "loss": 0.4022, "num_input_tokens_seen": 14320948942, "step": 3673, "train_runtime": 146079.7121, "train_tokens_per_second": 98035.167 }, { "epoch": 0.5841017488076312, "grad_norm": 0.38196346163749695, "learning_rate": 1.8548662022072808e-05, "loss": 0.3962, "num_input_tokens_seen": 14324890925, "step": 3674, "train_runtime": 146120.9894, "train_tokens_per_second": 98034.451 }, { "epoch": 0.5842607313195548, "grad_norm": 0.20518085360527039, "learning_rate": 1.853657429757316e-05, "loss": 0.393, "num_input_tokens_seen": 14328753106, "step": 3675, "train_runtime": 146159.7591, "train_tokens_per_second": 98034.871 }, { "epoch": 0.5844197138314785, "grad_norm": 0.20625050365924835, "learning_rate": 1.8524488192115986e-05, "loss": 0.3893, "num_input_tokens_seen": 14332609632, "step": 3676, "train_runtime": 146198.1672, "train_tokens_per_second": 98035.495 }, { "epoch": 0.5845786963434022, "grad_norm": 0.33655592799186707, "learning_rate": 1.8512403708728758e-05, "loss": 0.4, "num_input_tokens_seen": 14336548395, "step": 3677, "train_runtime": 146237.6717, "train_tokens_per_second": 98035.945 }, { "epoch": 0.5847376788553259, "grad_norm": 0.1863851547241211, "learning_rate": 1.8500320850438546e-05, "loss": 0.3994, "num_input_tokens_seen": 14340547934, "step": 3678, "train_runtime": 146274.7526, "train_tokens_per_second": 98038.436 }, { "epoch": 0.5848966613672496, "grad_norm": 0.2341460883617401, "learning_rate": 1.8488239620272046e-05, "loss": 0.3957, "num_input_tokens_seen": 14344356645, "step": 3679, "train_runtime": 146315.4569, "train_tokens_per_second": 98037.193 }, { "epoch": 0.5850556438791733, "grad_norm": 0.2402547299861908, "learning_rate": 1.84761600212555e-05, "loss": 0.3938, "num_input_tokens_seen": 14348154053, "step": 3680, "train_runtime": 146356.3431, "train_tokens_per_second": 98035.751 }, { "epoch": 0.585214626391097, "grad_norm": 0.2221783548593521, "learning_rate": 1.8464082056414765e-05, "loss": 0.3924, "num_input_tokens_seen": 14352093643, "step": 3681, "train_runtime": 146394.0116, "train_tokens_per_second": 98037.437 }, { "epoch": 0.5853736089030207, "grad_norm": 0.2013949602842331, "learning_rate": 1.8452005728775302e-05, "loss": 0.3977, "num_input_tokens_seen": 14356077311, "step": 3682, "train_runtime": 146432.4185, "train_tokens_per_second": 98038.928 }, { "epoch": 0.5855325914149444, "grad_norm": 0.22794672846794128, "learning_rate": 1.8439931041362124e-05, "loss": 0.4098, "num_input_tokens_seen": 14359833700, "step": 3683, "train_runtime": 146471.9273, "train_tokens_per_second": 98038.129 }, { "epoch": 0.585691573926868, "grad_norm": 0.23272131383419037, "learning_rate": 1.842785799719987e-05, "loss": 0.3865, "num_input_tokens_seen": 14363773320, "step": 3684, "train_runtime": 146510.7991, "train_tokens_per_second": 98039.007 }, { "epoch": 0.5858505564387917, "grad_norm": 0.23825442790985107, "learning_rate": 1.841578659931274e-05, "loss": 0.3936, "num_input_tokens_seen": 14367760193, "step": 3685, "train_runtime": 146547.2801, "train_tokens_per_second": 98041.807 }, { "epoch": 0.5860095389507154, "grad_norm": 0.22930821776390076, "learning_rate": 1.8403716850724546e-05, "loss": 0.4083, "num_input_tokens_seen": 14371629223, "step": 3686, "train_runtime": 146587.4868, "train_tokens_per_second": 98041.31 }, { "epoch": 0.5861685214626391, "grad_norm": 0.22279110550880432, "learning_rate": 1.8391648754458673e-05, "loss": 0.3865, "num_input_tokens_seen": 14375492965, "step": 3687, "train_runtime": 146628.2914, "train_tokens_per_second": 98040.377 }, { "epoch": 0.5863275039745628, "grad_norm": 0.23370212316513062, "learning_rate": 1.837958231353808e-05, "loss": 0.3945, "num_input_tokens_seen": 14379541776, "step": 3688, "train_runtime": 146669.6581, "train_tokens_per_second": 98040.331 }, { "epoch": 0.5864864864864865, "grad_norm": 0.3744857907295227, "learning_rate": 1.836751753098534e-05, "loss": 0.3957, "num_input_tokens_seen": 14383476488, "step": 3689, "train_runtime": 146708.6391, "train_tokens_per_second": 98041.101 }, { "epoch": 0.5866454689984102, "grad_norm": 0.26209530234336853, "learning_rate": 1.8355454409822595e-05, "loss": 0.3885, "num_input_tokens_seen": 14387296286, "step": 3690, "train_runtime": 146747.4698, "train_tokens_per_second": 98041.188 }, { "epoch": 0.5868044515103339, "grad_norm": 0.2229240983724594, "learning_rate": 1.834339295307156e-05, "loss": 0.3979, "num_input_tokens_seen": 14391203473, "step": 3691, "train_runtime": 146787.3474, "train_tokens_per_second": 98041.171 }, { "epoch": 0.5869634340222576, "grad_norm": 0.23360586166381836, "learning_rate": 1.833133316375356e-05, "loss": 0.3992, "num_input_tokens_seen": 14395161010, "step": 3692, "train_runtime": 146826.6023, "train_tokens_per_second": 98041.913 }, { "epoch": 0.5871224165341813, "grad_norm": 0.23213572800159454, "learning_rate": 1.831927504488947e-05, "loss": 0.4095, "num_input_tokens_seen": 14399073131, "step": 3693, "train_runtime": 146866.2737, "train_tokens_per_second": 98042.068 }, { "epoch": 0.5872813990461049, "grad_norm": 0.22686907649040222, "learning_rate": 1.830721859949978e-05, "loss": 0.3991, "num_input_tokens_seen": 14403006168, "step": 3694, "train_runtime": 146905.7492, "train_tokens_per_second": 98042.495 }, { "epoch": 0.5874403815580286, "grad_norm": 0.21470803022384644, "learning_rate": 1.829516383060454e-05, "loss": 0.3932, "num_input_tokens_seen": 14406981052, "step": 3695, "train_runtime": 146942.1272, "train_tokens_per_second": 98045.274 }, { "epoch": 0.5875993640699523, "grad_norm": 0.20050670206546783, "learning_rate": 1.828311074122337e-05, "loss": 0.4063, "num_input_tokens_seen": 14410922187, "step": 3696, "train_runtime": 146980.1159, "train_tokens_per_second": 98046.747 }, { "epoch": 0.587758346581876, "grad_norm": 0.2324628084897995, "learning_rate": 1.8271059334375505e-05, "loss": 0.3882, "num_input_tokens_seen": 14414854544, "step": 3697, "train_runtime": 147019.4834, "train_tokens_per_second": 98047.24 }, { "epoch": 0.5879173290937997, "grad_norm": 0.29657819867134094, "learning_rate": 1.8259009613079724e-05, "loss": 0.3957, "num_input_tokens_seen": 14418770998, "step": 3698, "train_runtime": 147059.6579, "train_tokens_per_second": 98047.087 }, { "epoch": 0.5880763116057234, "grad_norm": 0.3513432741165161, "learning_rate": 1.8246961580354405e-05, "loss": 0.4006, "num_input_tokens_seen": 14422729412, "step": 3699, "train_runtime": 147100.4326, "train_tokens_per_second": 98046.818 }, { "epoch": 0.5882352941176471, "grad_norm": 0.23488152027130127, "learning_rate": 1.823491523921749e-05, "loss": 0.3994, "num_input_tokens_seen": 14426486143, "step": 3700, "train_runtime": 147142.1503, "train_tokens_per_second": 98044.552 }, { "epoch": 0.5883942766295708, "grad_norm": 0.2811725437641144, "learning_rate": 1.8222870592686496e-05, "loss": 0.4036, "num_input_tokens_seen": 14430485979, "step": 3701, "train_runtime": 147181.9057, "train_tokens_per_second": 98045.245 }, { "epoch": 0.5885532591414945, "grad_norm": 0.20542363822460175, "learning_rate": 1.8210827643778537e-05, "loss": 0.3909, "num_input_tokens_seen": 14434386106, "step": 3702, "train_runtime": 147221.9628, "train_tokens_per_second": 98045.06 }, { "epoch": 0.5887122416534182, "grad_norm": 0.21606585383415222, "learning_rate": 1.819878639551027e-05, "loss": 0.4176, "num_input_tokens_seen": 14438300709, "step": 3703, "train_runtime": 147264.1129, "train_tokens_per_second": 98043.579 }, { "epoch": 0.5888712241653418, "grad_norm": 0.22000883519649506, "learning_rate": 1.818674685089796e-05, "loss": 0.3991, "num_input_tokens_seen": 14442242901, "step": 3704, "train_runtime": 147304.7507, "train_tokens_per_second": 98043.293 }, { "epoch": 0.5890302066772655, "grad_norm": 0.2332734614610672, "learning_rate": 1.817470901295742e-05, "loss": 0.3973, "num_input_tokens_seen": 14446105683, "step": 3705, "train_runtime": 147342.8683, "train_tokens_per_second": 98044.146 }, { "epoch": 0.5891891891891892, "grad_norm": 0.22657565772533417, "learning_rate": 1.8162672884704034e-05, "loss": 0.3967, "num_input_tokens_seen": 14450064458, "step": 3706, "train_runtime": 147381.5557, "train_tokens_per_second": 98045.27 }, { "epoch": 0.5893481717011129, "grad_norm": 0.21536226570606232, "learning_rate": 1.8150638469152785e-05, "loss": 0.3948, "num_input_tokens_seen": 14453843765, "step": 3707, "train_runtime": 147421.7449, "train_tokens_per_second": 98044.178 }, { "epoch": 0.5895071542130366, "grad_norm": 0.2456032782793045, "learning_rate": 1.8138605769318196e-05, "loss": 0.3949, "num_input_tokens_seen": 14457745125, "step": 3708, "train_runtime": 147460.6661, "train_tokens_per_second": 98044.757 }, { "epoch": 0.5896661367249603, "grad_norm": 0.27353498339653015, "learning_rate": 1.812657478821437e-05, "loss": 0.3945, "num_input_tokens_seen": 14461654776, "step": 3709, "train_runtime": 147500.0232, "train_tokens_per_second": 98045.102 }, { "epoch": 0.589825119236884, "grad_norm": 0.34045079350471497, "learning_rate": 1.8114545528854992e-05, "loss": 0.3942, "num_input_tokens_seen": 14465558885, "step": 3710, "train_runtime": 147535.2139, "train_tokens_per_second": 98048.178 }, { "epoch": 0.5899841017488077, "grad_norm": 0.2548065781593323, "learning_rate": 1.8102517994253302e-05, "loss": 0.3861, "num_input_tokens_seen": 14469352643, "step": 3711, "train_runtime": 147575.3394, "train_tokens_per_second": 98047.226 }, { "epoch": 0.5901430842607314, "grad_norm": 0.2121097445487976, "learning_rate": 1.809049218742212e-05, "loss": 0.4014, "num_input_tokens_seen": 14473319051, "step": 3712, "train_runtime": 147614.1683, "train_tokens_per_second": 98048.305 }, { "epoch": 0.590302066772655, "grad_norm": 0.2106509506702423, "learning_rate": 1.8078468111373827e-05, "loss": 0.4021, "num_input_tokens_seen": 14477267226, "step": 3713, "train_runtime": 147654.018, "train_tokens_per_second": 98048.583 }, { "epoch": 0.5904610492845787, "grad_norm": 0.2049339860677719, "learning_rate": 1.806644576912035e-05, "loss": 0.3907, "num_input_tokens_seen": 14481116730, "step": 3714, "train_runtime": 147692.2395, "train_tokens_per_second": 98049.273 }, { "epoch": 0.5906200317965024, "grad_norm": 0.23604939877986908, "learning_rate": 1.8054425163673216e-05, "loss": 0.4131, "num_input_tokens_seen": 14484870595, "step": 3715, "train_runtime": 147733.2249, "train_tokens_per_second": 98047.481 }, { "epoch": 0.590779014308426, "grad_norm": 0.20963184535503387, "learning_rate": 1.8042406298043493e-05, "loss": 0.4044, "num_input_tokens_seen": 14488895766, "step": 3716, "train_runtime": 147771.1942, "train_tokens_per_second": 98049.527 }, { "epoch": 0.5909379968203498, "grad_norm": 0.3660030663013458, "learning_rate": 1.803038917524184e-05, "loss": 0.3948, "num_input_tokens_seen": 14492880485, "step": 3717, "train_runtime": 147808.8339, "train_tokens_per_second": 98051.518 }, { "epoch": 0.5910969793322735, "grad_norm": 0.1928553432226181, "learning_rate": 1.8018373798278442e-05, "loss": 0.39, "num_input_tokens_seen": 14496735335, "step": 3718, "train_runtime": 147845.4026, "train_tokens_per_second": 98053.339 }, { "epoch": 0.5912559618441972, "grad_norm": 0.2058311104774475, "learning_rate": 1.8006360170163066e-05, "loss": 0.4092, "num_input_tokens_seen": 14500612197, "step": 3719, "train_runtime": 147885.889, "train_tokens_per_second": 98052.71 }, { "epoch": 0.5914149443561209, "grad_norm": 0.1904243528842926, "learning_rate": 1.799434829390506e-05, "loss": 0.3881, "num_input_tokens_seen": 14504499768, "step": 3720, "train_runtime": 147923.7599, "train_tokens_per_second": 98053.888 }, { "epoch": 0.5915739268680446, "grad_norm": 0.20838679373264313, "learning_rate": 1.7982338172513296e-05, "loss": 0.3805, "num_input_tokens_seen": 14508360442, "step": 3721, "train_runtime": 147960.1399, "train_tokens_per_second": 98055.871 }, { "epoch": 0.5917329093799683, "grad_norm": 0.22572320699691772, "learning_rate": 1.7970329808996237e-05, "loss": 0.4095, "num_input_tokens_seen": 14512197159, "step": 3722, "train_runtime": 148002.9603, "train_tokens_per_second": 98053.425 }, { "epoch": 0.5918918918918918, "grad_norm": 0.23556865751743317, "learning_rate": 1.795832320636188e-05, "loss": 0.392, "num_input_tokens_seen": 14516149336, "step": 3723, "train_runtime": 148042.9025, "train_tokens_per_second": 98053.666 }, { "epoch": 0.5920508744038155, "grad_norm": 0.3208346962928772, "learning_rate": 1.79463183676178e-05, "loss": 0.406, "num_input_tokens_seen": 14520163583, "step": 3724, "train_runtime": 148081.5776, "train_tokens_per_second": 98055.165 }, { "epoch": 0.5922098569157392, "grad_norm": 0.1822548508644104, "learning_rate": 1.793431529577113e-05, "loss": 0.3928, "num_input_tokens_seen": 14524027625, "step": 3725, "train_runtime": 148118.5584, "train_tokens_per_second": 98056.771 }, { "epoch": 0.5923688394276629, "grad_norm": 0.2018888294696808, "learning_rate": 1.792231399382855e-05, "loss": 0.3918, "num_input_tokens_seen": 14527846182, "step": 3726, "train_runtime": 148155.3099, "train_tokens_per_second": 98058.221 }, { "epoch": 0.5925278219395866, "grad_norm": 0.3158271610736847, "learning_rate": 1.791031446479629e-05, "loss": 0.3904, "num_input_tokens_seen": 14531869001, "step": 3727, "train_runtime": 148194.7942, "train_tokens_per_second": 98059.241 }, { "epoch": 0.5926868044515103, "grad_norm": 0.20924337208271027, "learning_rate": 1.789831671168017e-05, "loss": 0.41, "num_input_tokens_seen": 14535838292, "step": 3728, "train_runtime": 148235.0052, "train_tokens_per_second": 98059.418 }, { "epoch": 0.592845786963434, "grad_norm": 0.2036954015493393, "learning_rate": 1.788632073748553e-05, "loss": 0.3901, "num_input_tokens_seen": 14539523983, "step": 3729, "train_runtime": 148274.9189, "train_tokens_per_second": 98057.879 }, { "epoch": 0.5930047694753577, "grad_norm": 0.33933186531066895, "learning_rate": 1.7874326545217277e-05, "loss": 0.4095, "num_input_tokens_seen": 14543499445, "step": 3730, "train_runtime": 148315.1236, "train_tokens_per_second": 98058.102 }, { "epoch": 0.5931637519872814, "grad_norm": 0.20857274532318115, "learning_rate": 1.786233413787987e-05, "loss": 0.4078, "num_input_tokens_seen": 14547379440, "step": 3731, "train_runtime": 148354.9903, "train_tokens_per_second": 98057.904 }, { "epoch": 0.5933227344992051, "grad_norm": 0.18520714342594147, "learning_rate": 1.785034351847732e-05, "loss": 0.3931, "num_input_tokens_seen": 14551429676, "step": 3732, "train_runtime": 148393.0022, "train_tokens_per_second": 98060.08 }, { "epoch": 0.5934817170111287, "grad_norm": 0.23579531908035278, "learning_rate": 1.7838354690013204e-05, "loss": 0.3945, "num_input_tokens_seen": 14555349015, "step": 3733, "train_runtime": 148432.2449, "train_tokens_per_second": 98060.56 }, { "epoch": 0.5936406995230524, "grad_norm": 0.24862997233867645, "learning_rate": 1.782636765549062e-05, "loss": 0.3995, "num_input_tokens_seen": 14559197696, "step": 3734, "train_runtime": 148470.9444, "train_tokens_per_second": 98060.922 }, { "epoch": 0.5937996820349761, "grad_norm": 0.204408660531044, "learning_rate": 1.781438241791226e-05, "loss": 0.4015, "num_input_tokens_seen": 14563115710, "step": 3735, "train_runtime": 148511.6266, "train_tokens_per_second": 98060.442 }, { "epoch": 0.5939586645468998, "grad_norm": 0.2220432013273239, "learning_rate": 1.7802398980280326e-05, "loss": 0.3925, "num_input_tokens_seen": 14566786060, "step": 3736, "train_runtime": 148550.2239, "train_tokens_per_second": 98059.671 }, { "epoch": 0.5941176470588235, "grad_norm": 0.18452073633670807, "learning_rate": 1.7790417345596583e-05, "loss": 0.4098, "num_input_tokens_seen": 14570778528, "step": 3737, "train_runtime": 148590.5295, "train_tokens_per_second": 98059.941 }, { "epoch": 0.5942766295707472, "grad_norm": 0.22494646906852722, "learning_rate": 1.777843751686235e-05, "loss": 0.4153, "num_input_tokens_seen": 14574713501, "step": 3738, "train_runtime": 148630.6653, "train_tokens_per_second": 98059.936 }, { "epoch": 0.5944356120826709, "grad_norm": 0.1806906908750534, "learning_rate": 1.7766459497078484e-05, "loss": 0.4005, "num_input_tokens_seen": 14578593221, "step": 3739, "train_runtime": 148670.9248, "train_tokens_per_second": 98059.478 }, { "epoch": 0.5945945945945946, "grad_norm": 0.21299205720424652, "learning_rate": 1.7754483289245408e-05, "loss": 0.3941, "num_input_tokens_seen": 14582530533, "step": 3740, "train_runtime": 148711.5884, "train_tokens_per_second": 98059.14 }, { "epoch": 0.5947535771065183, "grad_norm": 0.21104741096496582, "learning_rate": 1.774250889636307e-05, "loss": 0.3949, "num_input_tokens_seen": 14586408703, "step": 3741, "train_runtime": 148752.9309, "train_tokens_per_second": 98057.958 }, { "epoch": 0.5949125596184419, "grad_norm": 0.27800917625427246, "learning_rate": 1.7730536321430956e-05, "loss": 0.399, "num_input_tokens_seen": 14590217608, "step": 3742, "train_runtime": 148789.8728, "train_tokens_per_second": 98059.212 }, { "epoch": 0.5950715421303656, "grad_norm": 0.1968245506286621, "learning_rate": 1.7718565567448138e-05, "loss": 0.4057, "num_input_tokens_seen": 14594143655, "step": 3743, "train_runtime": 148830.9905, "train_tokens_per_second": 98058.5 }, { "epoch": 0.5952305246422893, "grad_norm": 0.18228784203529358, "learning_rate": 1.770659663741319e-05, "loss": 0.3862, "num_input_tokens_seen": 14597931252, "step": 3744, "train_runtime": 148869.7796, "train_tokens_per_second": 98058.392 }, { "epoch": 0.595389507154213, "grad_norm": 0.26885122060775757, "learning_rate": 1.769462953432424e-05, "loss": 0.3959, "num_input_tokens_seen": 14601822496, "step": 3745, "train_runtime": 148909.1518, "train_tokens_per_second": 98058.597 }, { "epoch": 0.5955484896661367, "grad_norm": 0.19611351191997528, "learning_rate": 1.768266426117898e-05, "loss": 0.3978, "num_input_tokens_seen": 14605729182, "step": 3746, "train_runtime": 148946.7477, "train_tokens_per_second": 98060.075 }, { "epoch": 0.5957074721780604, "grad_norm": 0.18471762537956238, "learning_rate": 1.7670700820974602e-05, "loss": 0.4004, "num_input_tokens_seen": 14609587024, "step": 3747, "train_runtime": 148987.2262, "train_tokens_per_second": 98059.326 }, { "epoch": 0.5958664546899841, "grad_norm": 0.22738175094127655, "learning_rate": 1.7658739216707888e-05, "loss": 0.4077, "num_input_tokens_seen": 14613436595, "step": 3748, "train_runtime": 149028.0511, "train_tokens_per_second": 98058.295 }, { "epoch": 0.5960254372019078, "grad_norm": 0.21660561859607697, "learning_rate": 1.7646779451375124e-05, "loss": 0.395, "num_input_tokens_seen": 14617397046, "step": 3749, "train_runtime": 149064.5971, "train_tokens_per_second": 98060.823 }, { "epoch": 0.5961844197138315, "grad_norm": 0.2812272310256958, "learning_rate": 1.7634821527972136e-05, "loss": 0.3971, "num_input_tokens_seen": 14621372726, "step": 3750, "train_runtime": 149103.7974, "train_tokens_per_second": 98061.706 }, { "epoch": 0.5963434022257552, "grad_norm": 0.2744421660900116, "learning_rate": 1.762286544949432e-05, "loss": 0.3917, "num_input_tokens_seen": 14625160474, "step": 3751, "train_runtime": 149143.8744, "train_tokens_per_second": 98060.752 }, { "epoch": 0.5965023847376788, "grad_norm": 0.20313715934753418, "learning_rate": 1.7610911218936578e-05, "loss": 0.4021, "num_input_tokens_seen": 14629059821, "step": 3752, "train_runtime": 149182.6559, "train_tokens_per_second": 98061.398 }, { "epoch": 0.5966613672496025, "grad_norm": 0.2886793315410614, "learning_rate": 1.7598958839293364e-05, "loss": 0.4061, "num_input_tokens_seen": 14632949577, "step": 3753, "train_runtime": 149223.6643, "train_tokens_per_second": 98060.516 }, { "epoch": 0.5968203497615262, "grad_norm": 0.2720091640949249, "learning_rate": 1.7587008313558663e-05, "loss": 0.4087, "num_input_tokens_seen": 14636919358, "step": 3754, "train_runtime": 149263.3403, "train_tokens_per_second": 98061.047 }, { "epoch": 0.5969793322734499, "grad_norm": 0.19609615206718445, "learning_rate": 1.7575059644725987e-05, "loss": 0.3996, "num_input_tokens_seen": 14640862362, "step": 3755, "train_runtime": 149302.9229, "train_tokens_per_second": 98061.458 }, { "epoch": 0.5971383147853736, "grad_norm": 0.1957818567752838, "learning_rate": 1.756311283578841e-05, "loss": 0.4006, "num_input_tokens_seen": 14644728065, "step": 3756, "train_runtime": 149339.9818, "train_tokens_per_second": 98063.01 }, { "epoch": 0.5972972972972973, "grad_norm": 0.20989398658275604, "learning_rate": 1.7551167889738507e-05, "loss": 0.4073, "num_input_tokens_seen": 14648664803, "step": 3757, "train_runtime": 149379.02, "train_tokens_per_second": 98063.736 }, { "epoch": 0.597456279809221, "grad_norm": 0.18912522494792938, "learning_rate": 1.753922480956842e-05, "loss": 0.4096, "num_input_tokens_seen": 14652597109, "step": 3758, "train_runtime": 149418.5863, "train_tokens_per_second": 98064.086 }, { "epoch": 0.5976152623211447, "grad_norm": 0.24235908687114716, "learning_rate": 1.7527283598269807e-05, "loss": 0.3951, "num_input_tokens_seen": 14656395599, "step": 3759, "train_runtime": 149457.281, "train_tokens_per_second": 98064.112 }, { "epoch": 0.5977742448330684, "grad_norm": 0.21293912827968597, "learning_rate": 1.7515344258833833e-05, "loss": 0.4068, "num_input_tokens_seen": 14660393647, "step": 3760, "train_runtime": 149496.7951, "train_tokens_per_second": 98064.936 }, { "epoch": 0.5979332273449921, "grad_norm": 0.21164646744728088, "learning_rate": 1.7503406794251243e-05, "loss": 0.4055, "num_input_tokens_seen": 14664154319, "step": 3761, "train_runtime": 149535.0609, "train_tokens_per_second": 98064.99 }, { "epoch": 0.5980922098569157, "grad_norm": 0.2027624100446701, "learning_rate": 1.7491471207512265e-05, "loss": 0.3999, "num_input_tokens_seen": 14667964707, "step": 3762, "train_runtime": 149572.6708, "train_tokens_per_second": 98065.807 }, { "epoch": 0.5982511923688394, "grad_norm": 0.20150785148143768, "learning_rate": 1.7479537501606703e-05, "loss": 0.3998, "num_input_tokens_seen": 14671948945, "step": 3763, "train_runtime": 149613.0021, "train_tokens_per_second": 98066.002 }, { "epoch": 0.5984101748807631, "grad_norm": 0.20443345606327057, "learning_rate": 1.7467605679523853e-05, "loss": 0.4095, "num_input_tokens_seen": 14675929856, "step": 3764, "train_runtime": 149652.724, "train_tokens_per_second": 98066.573 }, { "epoch": 0.5985691573926868, "grad_norm": 0.20994308590888977, "learning_rate": 1.745567574425254e-05, "loss": 0.3984, "num_input_tokens_seen": 14679805623, "step": 3765, "train_runtime": 149686.6586, "train_tokens_per_second": 98070.234 }, { "epoch": 0.5987281399046105, "grad_norm": 0.2453664243221283, "learning_rate": 1.744374769878116e-05, "loss": 0.4107, "num_input_tokens_seen": 14683767076, "step": 3766, "train_runtime": 149726.666, "train_tokens_per_second": 98070.487 }, { "epoch": 0.5988871224165342, "grad_norm": 0.22476911544799805, "learning_rate": 1.7431821546097583e-05, "loss": 0.3863, "num_input_tokens_seen": 14687630419, "step": 3767, "train_runtime": 149766.2249, "train_tokens_per_second": 98070.379 }, { "epoch": 0.5990461049284579, "grad_norm": 0.2654315233230591, "learning_rate": 1.7419897289189223e-05, "loss": 0.3832, "num_input_tokens_seen": 14691546931, "step": 3768, "train_runtime": 149804.5912, "train_tokens_per_second": 98071.406 }, { "epoch": 0.5992050874403816, "grad_norm": 0.22387093305587769, "learning_rate": 1.740797493104303e-05, "loss": 0.3899, "num_input_tokens_seen": 14695446857, "step": 3769, "train_runtime": 149841.8616, "train_tokens_per_second": 98073.04 }, { "epoch": 0.5993640699523053, "grad_norm": 0.20239011943340302, "learning_rate": 1.7396054474645468e-05, "loss": 0.4014, "num_input_tokens_seen": 14699383687, "step": 3770, "train_runtime": 149882.6934, "train_tokens_per_second": 98072.588 }, { "epoch": 0.599523052464229, "grad_norm": 0.20182649791240692, "learning_rate": 1.7384135922982537e-05, "loss": 0.393, "num_input_tokens_seen": 14703340941, "step": 3771, "train_runtime": 149921.3187, "train_tokens_per_second": 98073.717 }, { "epoch": 0.5996820349761526, "grad_norm": 0.18790492415428162, "learning_rate": 1.7372219279039736e-05, "loss": 0.3945, "num_input_tokens_seen": 14707265639, "step": 3772, "train_runtime": 149959.5965, "train_tokens_per_second": 98074.855 }, { "epoch": 0.5998410174880763, "grad_norm": 0.22546379268169403, "learning_rate": 1.7360304545802096e-05, "loss": 0.3952, "num_input_tokens_seen": 14711129219, "step": 3773, "train_runtime": 149998.6553, "train_tokens_per_second": 98075.074 }, { "epoch": 0.6, "grad_norm": 0.2236759215593338, "learning_rate": 1.7348391726254194e-05, "loss": 0.4008, "num_input_tokens_seen": 14715023948, "step": 3774, "train_runtime": 150035.5625, "train_tokens_per_second": 98076.907 }, { "epoch": 0.6001589825119237, "grad_norm": 0.21798960864543915, "learning_rate": 1.733648082338009e-05, "loss": 0.4049, "num_input_tokens_seen": 14718980957, "step": 3775, "train_runtime": 150076.4538, "train_tokens_per_second": 98076.551 }, { "epoch": 0.6003179650238474, "grad_norm": 0.6354433298110962, "learning_rate": 1.7324571840163385e-05, "loss": 0.4132, "num_input_tokens_seen": 14722879344, "step": 3776, "train_runtime": 150114.2998, "train_tokens_per_second": 98077.794 }, { "epoch": 0.6004769475357711, "grad_norm": 0.24246706068515778, "learning_rate": 1.73126647795872e-05, "loss": 0.3924, "num_input_tokens_seen": 14726634689, "step": 3777, "train_runtime": 150153.7987, "train_tokens_per_second": 98077.004 }, { "epoch": 0.6006359300476948, "grad_norm": 0.18485186994075775, "learning_rate": 1.7300759644634167e-05, "loss": 0.3913, "num_input_tokens_seen": 14730579188, "step": 3778, "train_runtime": 150194.8252, "train_tokens_per_second": 98076.476 }, { "epoch": 0.6007949125596185, "grad_norm": 0.23415011167526245, "learning_rate": 1.7288856438286437e-05, "loss": 0.4043, "num_input_tokens_seen": 14734643782, "step": 3779, "train_runtime": 150234.5701, "train_tokens_per_second": 98077.585 }, { "epoch": 0.6009538950715422, "grad_norm": 0.2210589349269867, "learning_rate": 1.7276955163525675e-05, "loss": 0.3977, "num_input_tokens_seen": 14738508775, "step": 3780, "train_runtime": 150271.9678, "train_tokens_per_second": 98078.896 }, { "epoch": 0.6011128775834658, "grad_norm": 0.20731517672538757, "learning_rate": 1.7265055823333076e-05, "loss": 0.4035, "num_input_tokens_seen": 14742372782, "step": 3781, "train_runtime": 150312.5665, "train_tokens_per_second": 98078.112 }, { "epoch": 0.6012718600953895, "grad_norm": 0.19493629038333893, "learning_rate": 1.725315842068934e-05, "loss": 0.3954, "num_input_tokens_seen": 14746305321, "step": 3782, "train_runtime": 150351.3842, "train_tokens_per_second": 98078.946 }, { "epoch": 0.6014308426073132, "grad_norm": 0.2867315411567688, "learning_rate": 1.7241262958574664e-05, "loss": 0.3976, "num_input_tokens_seen": 14750217857, "step": 3783, "train_runtime": 150390.551, "train_tokens_per_second": 98079.419 }, { "epoch": 0.6015898251192369, "grad_norm": 0.2346656769514084, "learning_rate": 1.72293694399688e-05, "loss": 0.3939, "num_input_tokens_seen": 14754140299, "step": 3784, "train_runtime": 150430.7114, "train_tokens_per_second": 98079.309 }, { "epoch": 0.6017488076311606, "grad_norm": 0.23097963631153107, "learning_rate": 1.7217477867850984e-05, "loss": 0.3947, "num_input_tokens_seen": 14758085764, "step": 3785, "train_runtime": 150469.0931, "train_tokens_per_second": 98080.513 }, { "epoch": 0.6019077901430843, "grad_norm": 0.2494661808013916, "learning_rate": 1.7205588245199962e-05, "loss": 0.4129, "num_input_tokens_seen": 14761996070, "step": 3786, "train_runtime": 150508.637, "train_tokens_per_second": 98080.724 }, { "epoch": 0.602066772655008, "grad_norm": 0.23099543154239655, "learning_rate": 1.7193700574994016e-05, "loss": 0.3913, "num_input_tokens_seen": 14765936286, "step": 3787, "train_runtime": 150548.3477, "train_tokens_per_second": 98081.025 }, { "epoch": 0.6022257551669317, "grad_norm": 0.17944443225860596, "learning_rate": 1.71818148602109e-05, "loss": 0.3918, "num_input_tokens_seen": 14769867738, "step": 3788, "train_runtime": 150587.2566, "train_tokens_per_second": 98081.79 }, { "epoch": 0.6023847376788554, "grad_norm": 0.19156195223331451, "learning_rate": 1.7169931103827924e-05, "loss": 0.3961, "num_input_tokens_seen": 14773647494, "step": 3789, "train_runtime": 150628.39, "train_tokens_per_second": 98080.1 }, { "epoch": 0.6025437201907791, "grad_norm": 0.31946951150894165, "learning_rate": 1.7158049308821878e-05, "loss": 0.401, "num_input_tokens_seen": 14777531700, "step": 3790, "train_runtime": 150670.0658, "train_tokens_per_second": 98078.75 }, { "epoch": 0.6027027027027027, "grad_norm": 0.21950975060462952, "learning_rate": 1.7146169478169056e-05, "loss": 0.3902, "num_input_tokens_seen": 14781411154, "step": 3791, "train_runtime": 150710.2002, "train_tokens_per_second": 98078.372 }, { "epoch": 0.6028616852146264, "grad_norm": 0.19485628604888916, "learning_rate": 1.7134291614845287e-05, "loss": 0.3975, "num_input_tokens_seen": 14785325341, "step": 3792, "train_runtime": 150744.9058, "train_tokens_per_second": 98081.758 }, { "epoch": 0.60302066772655, "grad_norm": 0.2666427493095398, "learning_rate": 1.712241572182589e-05, "loss": 0.4119, "num_input_tokens_seen": 14789256166, "step": 3793, "train_runtime": 150788.2036, "train_tokens_per_second": 98079.663 }, { "epoch": 0.6031796502384738, "grad_norm": 0.20371848344802856, "learning_rate": 1.711054180208569e-05, "loss": 0.4028, "num_input_tokens_seen": 14793159577, "step": 3794, "train_runtime": 150828.2207, "train_tokens_per_second": 98079.521 }, { "epoch": 0.6033386327503975, "grad_norm": 0.24483048915863037, "learning_rate": 1.709866985859902e-05, "loss": 0.3948, "num_input_tokens_seen": 14797138426, "step": 3795, "train_runtime": 150867.0365, "train_tokens_per_second": 98080.659 }, { "epoch": 0.6034976152623212, "grad_norm": 0.41314512491226196, "learning_rate": 1.7086799894339705e-05, "loss": 0.3864, "num_input_tokens_seen": 14800932547, "step": 3796, "train_runtime": 150906.9701, "train_tokens_per_second": 98079.847 }, { "epoch": 0.6036565977742449, "grad_norm": 0.2062775045633316, "learning_rate": 1.7074931912281108e-05, "loss": 0.3919, "num_input_tokens_seen": 14804880869, "step": 3797, "train_runtime": 150947.6992, "train_tokens_per_second": 98079.54 }, { "epoch": 0.6038155802861686, "grad_norm": 0.18037672340869904, "learning_rate": 1.706306591539606e-05, "loss": 0.3985, "num_input_tokens_seen": 14808875581, "step": 3798, "train_runtime": 150985.6623, "train_tokens_per_second": 98081.337 }, { "epoch": 0.6039745627980923, "grad_norm": 0.20598728954792023, "learning_rate": 1.705120190665692e-05, "loss": 0.4024, "num_input_tokens_seen": 14812701688, "step": 3799, "train_runtime": 151024.8222, "train_tokens_per_second": 98081.239 }, { "epoch": 0.604133545310016, "grad_norm": 0.23291614651679993, "learning_rate": 1.7039339889035537e-05, "loss": 0.3969, "num_input_tokens_seen": 14816497516, "step": 3800, "train_runtime": 151064.7595, "train_tokens_per_second": 98080.436 }, { "epoch": 0.6042925278219395, "grad_norm": 0.21288762986660004, "learning_rate": 1.7027479865503255e-05, "loss": 0.3954, "num_input_tokens_seen": 14820411415, "step": 3801, "train_runtime": 151175.5304, "train_tokens_per_second": 98034.46 }, { "epoch": 0.6044515103338632, "grad_norm": 0.22023069858551025, "learning_rate": 1.7015621839030933e-05, "loss": 0.4087, "num_input_tokens_seen": 14824404017, "step": 3802, "train_runtime": 151213.1396, "train_tokens_per_second": 98036.481 }, { "epoch": 0.6046104928457869, "grad_norm": 0.20669178664684296, "learning_rate": 1.7003765812588917e-05, "loss": 0.3942, "num_input_tokens_seen": 14828167656, "step": 3803, "train_runtime": 151250.0708, "train_tokens_per_second": 98037.426 }, { "epoch": 0.6047694753577106, "grad_norm": 0.20099496841430664, "learning_rate": 1.6991911789147054e-05, "loss": 0.3988, "num_input_tokens_seen": 14832149897, "step": 3804, "train_runtime": 151290.6228, "train_tokens_per_second": 98037.47 }, { "epoch": 0.6049284578696343, "grad_norm": 0.22552438080310822, "learning_rate": 1.698005977167471e-05, "loss": 0.3988, "num_input_tokens_seen": 14836088598, "step": 3805, "train_runtime": 151329.3217, "train_tokens_per_second": 98038.427 }, { "epoch": 0.605087440381558, "grad_norm": 0.2547420263290405, "learning_rate": 1.696820976314071e-05, "loss": 0.3976, "num_input_tokens_seen": 14839992635, "step": 3806, "train_runtime": 151365.7782, "train_tokens_per_second": 98040.606 }, { "epoch": 0.6052464228934817, "grad_norm": 0.19415970146656036, "learning_rate": 1.6956361766513422e-05, "loss": 0.3924, "num_input_tokens_seen": 14843630734, "step": 3807, "train_runtime": 151402.9507, "train_tokens_per_second": 98040.564 }, { "epoch": 0.6054054054054054, "grad_norm": 0.19844746589660645, "learning_rate": 1.694451578476067e-05, "loss": 0.3916, "num_input_tokens_seen": 14847570403, "step": 3808, "train_runtime": 151443.6086, "train_tokens_per_second": 98040.258 }, { "epoch": 0.6055643879173291, "grad_norm": 0.1873123049736023, "learning_rate": 1.6932671820849782e-05, "loss": 0.4008, "num_input_tokens_seen": 14851571304, "step": 3809, "train_runtime": 151484.3166, "train_tokens_per_second": 98040.323 }, { "epoch": 0.6057233704292527, "grad_norm": 0.21227876842021942, "learning_rate": 1.6920829877747607e-05, "loss": 0.4004, "num_input_tokens_seen": 14855259459, "step": 3810, "train_runtime": 151523.4458, "train_tokens_per_second": 98039.346 }, { "epoch": 0.6058823529411764, "grad_norm": 0.1915523260831833, "learning_rate": 1.6908989958420446e-05, "loss": 0.3924, "num_input_tokens_seen": 14859218701, "step": 3811, "train_runtime": 151564.4317, "train_tokens_per_second": 98038.956 }, { "epoch": 0.6060413354531001, "grad_norm": 0.19574128091335297, "learning_rate": 1.689715206583414e-05, "loss": 0.3974, "num_input_tokens_seen": 14863239100, "step": 3812, "train_runtime": 151603.0053, "train_tokens_per_second": 98040.531 }, { "epoch": 0.6062003179650238, "grad_norm": 0.2545127272605896, "learning_rate": 1.6885316202953976e-05, "loss": 0.3867, "num_input_tokens_seen": 14867112793, "step": 3813, "train_runtime": 151641.951, "train_tokens_per_second": 98040.896 }, { "epoch": 0.6063593004769475, "grad_norm": 0.18872706592082977, "learning_rate": 1.6873482372744757e-05, "loss": 0.3902, "num_input_tokens_seen": 14871065054, "step": 3814, "train_runtime": 151679.5997, "train_tokens_per_second": 98042.618 }, { "epoch": 0.6065182829888712, "grad_norm": 0.26730191707611084, "learning_rate": 1.686165057817079e-05, "loss": 0.3966, "num_input_tokens_seen": 14874953361, "step": 3815, "train_runtime": 151719.1738, "train_tokens_per_second": 98042.673 }, { "epoch": 0.6066772655007949, "grad_norm": 0.2405884861946106, "learning_rate": 1.6849820822195834e-05, "loss": 0.4128, "num_input_tokens_seen": 14878933628, "step": 3816, "train_runtime": 151756.8912, "train_tokens_per_second": 98044.534 }, { "epoch": 0.6068362480127186, "grad_norm": 0.20314966142177582, "learning_rate": 1.683799310778318e-05, "loss": 0.3967, "num_input_tokens_seen": 14882856632, "step": 3817, "train_runtime": 151793.4603, "train_tokens_per_second": 98046.758 }, { "epoch": 0.6069952305246423, "grad_norm": 0.25677841901779175, "learning_rate": 1.6826167437895578e-05, "loss": 0.4119, "num_input_tokens_seen": 14886554002, "step": 3818, "train_runtime": 151833.7625, "train_tokens_per_second": 98045.084 }, { "epoch": 0.607154213036566, "grad_norm": 0.22060422599315643, "learning_rate": 1.6814343815495258e-05, "loss": 0.3905, "num_input_tokens_seen": 14890565855, "step": 3819, "train_runtime": 151875.7697, "train_tokens_per_second": 98044.381 }, { "epoch": 0.6073131955484896, "grad_norm": 0.20751924812793732, "learning_rate": 1.6802522243543982e-05, "loss": 0.3973, "num_input_tokens_seen": 14894486854, "step": 3820, "train_runtime": 151913.9567, "train_tokens_per_second": 98045.546 }, { "epoch": 0.6074721780604133, "grad_norm": 0.21072515845298767, "learning_rate": 1.679070272500296e-05, "loss": 0.3824, "num_input_tokens_seen": 14898338002, "step": 3821, "train_runtime": 151953.3402, "train_tokens_per_second": 98045.479 }, { "epoch": 0.607631160572337, "grad_norm": 0.38863906264305115, "learning_rate": 1.6778885262832882e-05, "loss": 0.4029, "num_input_tokens_seen": 14902270911, "step": 3822, "train_runtime": 151992.482, "train_tokens_per_second": 98046.105 }, { "epoch": 0.6077901430842607, "grad_norm": 0.2052750289440155, "learning_rate": 1.6767069859993967e-05, "loss": 0.4133, "num_input_tokens_seen": 14906196204, "step": 3823, "train_runtime": 152032.3103, "train_tokens_per_second": 98046.239 }, { "epoch": 0.6079491255961844, "grad_norm": 0.20655874907970428, "learning_rate": 1.6755256519445867e-05, "loss": 0.4025, "num_input_tokens_seen": 14910072437, "step": 3824, "train_runtime": 152071.9241, "train_tokens_per_second": 98046.188 }, { "epoch": 0.6081081081081081, "grad_norm": 0.18922674655914307, "learning_rate": 1.6743445244147753e-05, "loss": 0.393, "num_input_tokens_seen": 14914022707, "step": 3825, "train_runtime": 152111.4402, "train_tokens_per_second": 98046.687 }, { "epoch": 0.6082670906200318, "grad_norm": 0.2248895764350891, "learning_rate": 1.6731636037058263e-05, "loss": 0.3996, "num_input_tokens_seen": 14917863910, "step": 3826, "train_runtime": 152150.4271, "train_tokens_per_second": 98046.809 }, { "epoch": 0.6084260731319555, "grad_norm": 0.1959095150232315, "learning_rate": 1.671982890113551e-05, "loss": 0.4115, "num_input_tokens_seen": 14921897884, "step": 3827, "train_runtime": 152189.3281, "train_tokens_per_second": 98048.254 }, { "epoch": 0.6085850556438792, "grad_norm": 0.6562185287475586, "learning_rate": 1.6708023839337114e-05, "loss": 0.3991, "num_input_tokens_seen": 14925732624, "step": 3828, "train_runtime": 152230.6013, "train_tokens_per_second": 98046.861 }, { "epoch": 0.6087440381558029, "grad_norm": 0.3869742453098297, "learning_rate": 1.6696220854620142e-05, "loss": 0.3898, "num_input_tokens_seen": 14929528368, "step": 3829, "train_runtime": 152269.5742, "train_tokens_per_second": 98046.694 }, { "epoch": 0.6089030206677265, "grad_norm": 0.1990610808134079, "learning_rate": 1.6684419949941183e-05, "loss": 0.4025, "num_input_tokens_seen": 14933428307, "step": 3830, "train_runtime": 152309.2296, "train_tokens_per_second": 98046.772 }, { "epoch": 0.6090620031796502, "grad_norm": 0.22764170169830322, "learning_rate": 1.667262112825626e-05, "loss": 0.3887, "num_input_tokens_seen": 14937306847, "step": 3831, "train_runtime": 152349.3558, "train_tokens_per_second": 98046.406 }, { "epoch": 0.6092209856915739, "grad_norm": 0.5102499127388, "learning_rate": 1.66608243925209e-05, "loss": 0.4145, "num_input_tokens_seen": 14941246151, "step": 3832, "train_runtime": 152388.502, "train_tokens_per_second": 98047.07 }, { "epoch": 0.6093799682034976, "grad_norm": 0.1971644014120102, "learning_rate": 1.6649029745690102e-05, "loss": 0.3936, "num_input_tokens_seen": 14945200044, "step": 3833, "train_runtime": 152428.959, "train_tokens_per_second": 98046.986 }, { "epoch": 0.6095389507154213, "grad_norm": 0.23378854990005493, "learning_rate": 1.6637237190718337e-05, "loss": 0.4122, "num_input_tokens_seen": 14949173684, "step": 3834, "train_runtime": 152467.6081, "train_tokens_per_second": 98048.194 }, { "epoch": 0.609697933227345, "grad_norm": 0.18799953162670135, "learning_rate": 1.662544673055957e-05, "loss": 0.3869, "num_input_tokens_seen": 14953162447, "step": 3835, "train_runtime": 152506.9934, "train_tokens_per_second": 98049.028 }, { "epoch": 0.6098569157392687, "grad_norm": 0.30870404839515686, "learning_rate": 1.661365836816722e-05, "loss": 0.3959, "num_input_tokens_seen": 14957057869, "step": 3836, "train_runtime": 152543.6417, "train_tokens_per_second": 98051.008 }, { "epoch": 0.6100158982511924, "grad_norm": 0.2099357694387436, "learning_rate": 1.6601872106494174e-05, "loss": 0.3943, "num_input_tokens_seen": 14960863113, "step": 3837, "train_runtime": 152582.9406, "train_tokens_per_second": 98050.693 }, { "epoch": 0.6101748807631161, "grad_norm": 0.189215749502182, "learning_rate": 1.6590087948492834e-05, "loss": 0.3876, "num_input_tokens_seen": 14964791971, "step": 3838, "train_runtime": 152621.9133, "train_tokens_per_second": 98051.398 }, { "epoch": 0.6103338632750398, "grad_norm": 0.430012583732605, "learning_rate": 1.6578305897115028e-05, "loss": 0.3987, "num_input_tokens_seen": 14968718891, "step": 3839, "train_runtime": 152661.7999, "train_tokens_per_second": 98051.503 }, { "epoch": 0.6104928457869634, "grad_norm": 0.19582483172416687, "learning_rate": 1.6566525955312093e-05, "loss": 0.395, "num_input_tokens_seen": 14972612435, "step": 3840, "train_runtime": 152701.8259, "train_tokens_per_second": 98051.299 }, { "epoch": 0.6106518282988871, "grad_norm": 0.21080340445041656, "learning_rate": 1.6554748126034807e-05, "loss": 0.3972, "num_input_tokens_seen": 14976594387, "step": 3841, "train_runtime": 152742.4119, "train_tokens_per_second": 98051.315 }, { "epoch": 0.6108108108108108, "grad_norm": 0.3895959258079529, "learning_rate": 1.6542972412233433e-05, "loss": 0.3994, "num_input_tokens_seen": 14980614612, "step": 3842, "train_runtime": 152783.1623, "train_tokens_per_second": 98051.476 }, { "epoch": 0.6109697933227345, "grad_norm": 0.21447838842868805, "learning_rate": 1.6531198816857717e-05, "loss": 0.4091, "num_input_tokens_seen": 14984422456, "step": 3843, "train_runtime": 152822.2558, "train_tokens_per_second": 98051.311 }, { "epoch": 0.6111287758346582, "grad_norm": 0.16960108280181885, "learning_rate": 1.6519427342856857e-05, "loss": 0.393, "num_input_tokens_seen": 14988358863, "step": 3844, "train_runtime": 152860.7584, "train_tokens_per_second": 98052.365 }, { "epoch": 0.6112877583465819, "grad_norm": 0.211873397231102, "learning_rate": 1.650765799317951e-05, "loss": 0.4056, "num_input_tokens_seen": 14992318630, "step": 3845, "train_runtime": 152898.4396, "train_tokens_per_second": 98054.098 }, { "epoch": 0.6114467408585056, "grad_norm": 0.19568492472171783, "learning_rate": 1.6495890770773838e-05, "loss": 0.3905, "num_input_tokens_seen": 14996156812, "step": 3846, "train_runtime": 152938.9965, "train_tokens_per_second": 98053.192 }, { "epoch": 0.6116057233704293, "grad_norm": 0.17835845053195953, "learning_rate": 1.6484125678587424e-05, "loss": 0.3928, "num_input_tokens_seen": 15000124944, "step": 3847, "train_runtime": 152977.1019, "train_tokens_per_second": 98054.707 }, { "epoch": 0.611764705882353, "grad_norm": 0.1827370822429657, "learning_rate": 1.6472362719567358e-05, "loss": 0.397, "num_input_tokens_seen": 15004037386, "step": 3848, "train_runtime": 153015.6453, "train_tokens_per_second": 98055.577 }, { "epoch": 0.6119236883942766, "grad_norm": 0.30986225605010986, "learning_rate": 1.6460601896660173e-05, "loss": 0.3994, "num_input_tokens_seen": 15007969332, "step": 3849, "train_runtime": 153055.7861, "train_tokens_per_second": 98055.55 }, { "epoch": 0.6120826709062003, "grad_norm": 0.17377012968063354, "learning_rate": 1.644884321281186e-05, "loss": 0.3967, "num_input_tokens_seen": 15011772623, "step": 3850, "train_runtime": 153094.1844, "train_tokens_per_second": 98055.799 }, { "epoch": 0.612241653418124, "grad_norm": 0.22653493285179138, "learning_rate": 1.6437086670967907e-05, "loss": 0.4201, "num_input_tokens_seen": 15015630312, "step": 3851, "train_runtime": 153132.7523, "train_tokens_per_second": 98056.295 }, { "epoch": 0.6124006359300477, "grad_norm": 0.19483521580696106, "learning_rate": 1.6425332274073225e-05, "loss": 0.3856, "num_input_tokens_seen": 15019657334, "step": 3852, "train_runtime": 153173.6626, "train_tokens_per_second": 98056.396 }, { "epoch": 0.6125596184419714, "grad_norm": 0.1997503936290741, "learning_rate": 1.6413580025072226e-05, "loss": 0.408, "num_input_tokens_seen": 15023628878, "step": 3853, "train_runtime": 153212.6924, "train_tokens_per_second": 98057.339 }, { "epoch": 0.6127186009538951, "grad_norm": 0.18751239776611328, "learning_rate": 1.6401829926908756e-05, "loss": 0.4063, "num_input_tokens_seen": 15027392927, "step": 3854, "train_runtime": 153253.5677, "train_tokens_per_second": 98055.746 }, { "epoch": 0.6128775834658188, "grad_norm": 0.22627036273479462, "learning_rate": 1.6390081982526125e-05, "loss": 0.3953, "num_input_tokens_seen": 15031245722, "step": 3855, "train_runtime": 153294.2866, "train_tokens_per_second": 98054.833 }, { "epoch": 0.6130365659777425, "grad_norm": 0.18068280816078186, "learning_rate": 1.6378336194867123e-05, "loss": 0.391, "num_input_tokens_seen": 15035196096, "step": 3856, "train_runtime": 153331.9429, "train_tokens_per_second": 98056.516 }, { "epoch": 0.6131955484896662, "grad_norm": 0.18323928117752075, "learning_rate": 1.6366592566873972e-05, "loss": 0.3952, "num_input_tokens_seen": 15039201585, "step": 3857, "train_runtime": 153368.8522, "train_tokens_per_second": 98059.035 }, { "epoch": 0.6133545310015899, "grad_norm": 0.21008355915546417, "learning_rate": 1.635485110148838e-05, "loss": 0.3988, "num_input_tokens_seen": 15043053668, "step": 3858, "train_runtime": 153409.6965, "train_tokens_per_second": 98058.037 }, { "epoch": 0.6135135135135135, "grad_norm": 0.2176087647676468, "learning_rate": 1.6343111801651507e-05, "loss": 0.3998, "num_input_tokens_seen": 15046964249, "step": 3859, "train_runtime": 153450.4376, "train_tokens_per_second": 98057.487 }, { "epoch": 0.6136724960254372, "grad_norm": 0.20149725675582886, "learning_rate": 1.633137467030394e-05, "loss": 0.3968, "num_input_tokens_seen": 15050876075, "step": 3860, "train_runtime": 153490.4116, "train_tokens_per_second": 98057.435 }, { "epoch": 0.6138314785373609, "grad_norm": 0.17312578856945038, "learning_rate": 1.6319639710385773e-05, "loss": 0.4056, "num_input_tokens_seen": 15054809872, "step": 3861, "train_runtime": 153527.934, "train_tokens_per_second": 98059.092 }, { "epoch": 0.6139904610492846, "grad_norm": 0.196290522813797, "learning_rate": 1.6307906924836518e-05, "loss": 0.413, "num_input_tokens_seen": 15058687198, "step": 3862, "train_runtime": 153565.1104, "train_tokens_per_second": 98060.602 }, { "epoch": 0.6141494435612083, "grad_norm": 0.2118608057498932, "learning_rate": 1.629617631659516e-05, "loss": 0.405, "num_input_tokens_seen": 15062589395, "step": 3863, "train_runtime": 153603.9981, "train_tokens_per_second": 98061.181 }, { "epoch": 0.614308426073132, "grad_norm": 0.16962169110774994, "learning_rate": 1.6284447888600125e-05, "loss": 0.3964, "num_input_tokens_seen": 15066558423, "step": 3864, "train_runtime": 153642.5255, "train_tokens_per_second": 98062.424 }, { "epoch": 0.6144674085850557, "grad_norm": 0.18455770611763, "learning_rate": 1.62727216437893e-05, "loss": 0.3947, "num_input_tokens_seen": 15070459650, "step": 3865, "train_runtime": 153681.0593, "train_tokens_per_second": 98063.221 }, { "epoch": 0.6146263910969794, "grad_norm": 0.22044023871421814, "learning_rate": 1.6260997585100047e-05, "loss": 0.3988, "num_input_tokens_seen": 15074280361, "step": 3866, "train_runtime": 153721.7445, "train_tokens_per_second": 98062.121 }, { "epoch": 0.6147853736089031, "grad_norm": 0.20031246542930603, "learning_rate": 1.6249275715469142e-05, "loss": 0.3939, "num_input_tokens_seen": 15078152814, "step": 3867, "train_runtime": 153759.4053, "train_tokens_per_second": 98063.288 }, { "epoch": 0.6149443561208268, "grad_norm": 0.2168876677751541, "learning_rate": 1.6237556037832823e-05, "loss": 0.4007, "num_input_tokens_seen": 15082105390, "step": 3868, "train_runtime": 153797.7979, "train_tokens_per_second": 98064.508 }, { "epoch": 0.6151033386327504, "grad_norm": 0.19126677513122559, "learning_rate": 1.6225838555126817e-05, "loss": 0.3875, "num_input_tokens_seen": 15085974197, "step": 3869, "train_runtime": 153837.4181, "train_tokens_per_second": 98064.401 }, { "epoch": 0.615262321144674, "grad_norm": 0.1858721375465393, "learning_rate": 1.6214123270286236e-05, "loss": 0.3915, "num_input_tokens_seen": 15089851900, "step": 3870, "train_runtime": 153876.5478, "train_tokens_per_second": 98064.664 }, { "epoch": 0.6154213036565978, "grad_norm": 0.19742488861083984, "learning_rate": 1.6202410186245703e-05, "loss": 0.3994, "num_input_tokens_seen": 15093694327, "step": 3871, "train_runtime": 153915.9682, "train_tokens_per_second": 98064.512 }, { "epoch": 0.6155802861685215, "grad_norm": 0.20828165113925934, "learning_rate": 1.6190699305939245e-05, "loss": 0.4093, "num_input_tokens_seen": 15097611074, "step": 3872, "train_runtime": 153954.6307, "train_tokens_per_second": 98065.326 }, { "epoch": 0.6157392686804452, "grad_norm": 0.18036940693855286, "learning_rate": 1.6178990632300362e-05, "loss": 0.3901, "num_input_tokens_seen": 15101523072, "step": 3873, "train_runtime": 153996.2678, "train_tokens_per_second": 98064.215 }, { "epoch": 0.6158982511923689, "grad_norm": 0.2084684520959854, "learning_rate": 1.6167284168261994e-05, "loss": 0.4014, "num_input_tokens_seen": 15105363656, "step": 3874, "train_runtime": 154036.0341, "train_tokens_per_second": 98063.831 }, { "epoch": 0.6160572337042926, "grad_norm": 0.18015462160110474, "learning_rate": 1.615557991675652e-05, "loss": 0.3899, "num_input_tokens_seen": 15109289031, "step": 3875, "train_runtime": 154076.9264, "train_tokens_per_second": 98063.282 }, { "epoch": 0.6162162162162163, "grad_norm": 0.18532344698905945, "learning_rate": 1.6143877880715792e-05, "loss": 0.3907, "num_input_tokens_seen": 15113241296, "step": 3876, "train_runtime": 154116.555, "train_tokens_per_second": 98063.711 }, { "epoch": 0.61637519872814, "grad_norm": 0.2094343602657318, "learning_rate": 1.6132178063071067e-05, "loss": 0.4061, "num_input_tokens_seen": 15117171119, "step": 3877, "train_runtime": 154155.233, "train_tokens_per_second": 98064.599 }, { "epoch": 0.6165341812400635, "grad_norm": 0.18533927202224731, "learning_rate": 1.6120480466753075e-05, "loss": 0.3982, "num_input_tokens_seen": 15121064145, "step": 3878, "train_runtime": 154195.9327, "train_tokens_per_second": 98063.962 }, { "epoch": 0.6166931637519872, "grad_norm": 0.20275036990642548, "learning_rate": 1.610878509469198e-05, "loss": 0.394, "num_input_tokens_seen": 15124986547, "step": 3879, "train_runtime": 154233.4623, "train_tokens_per_second": 98065.532 }, { "epoch": 0.6168521462639109, "grad_norm": 0.21322888135910034, "learning_rate": 1.6097091949817394e-05, "loss": 0.3977, "num_input_tokens_seen": 15128950208, "step": 3880, "train_runtime": 154274.13, "train_tokens_per_second": 98065.374 }, { "epoch": 0.6170111287758346, "grad_norm": 0.19307224452495575, "learning_rate": 1.6085401035058358e-05, "loss": 0.4023, "num_input_tokens_seen": 15132711201, "step": 3881, "train_runtime": 154314.3676, "train_tokens_per_second": 98064.175 }, { "epoch": 0.6171701112877583, "grad_norm": 0.20186369121074677, "learning_rate": 1.6073712353343376e-05, "loss": 0.3992, "num_input_tokens_seen": 15136695462, "step": 3882, "train_runtime": 154351.2807, "train_tokens_per_second": 98066.536 }, { "epoch": 0.617329093799682, "grad_norm": 0.2559959888458252, "learning_rate": 1.606202590760036e-05, "loss": 0.408, "num_input_tokens_seen": 15140640475, "step": 3883, "train_runtime": 154390.1317, "train_tokens_per_second": 98067.411 }, { "epoch": 0.6174880763116057, "grad_norm": 0.19664303958415985, "learning_rate": 1.6050341700756707e-05, "loss": 0.3975, "num_input_tokens_seen": 15144580763, "step": 3884, "train_runtime": 154427.8205, "train_tokens_per_second": 98068.992 }, { "epoch": 0.6176470588235294, "grad_norm": 0.19955512881278992, "learning_rate": 1.6038659735739215e-05, "loss": 0.3998, "num_input_tokens_seen": 15148481915, "step": 3885, "train_runtime": 154468.6874, "train_tokens_per_second": 98068.302 }, { "epoch": 0.6178060413354531, "grad_norm": 0.2648668587207794, "learning_rate": 1.602698001547413e-05, "loss": 0.397, "num_input_tokens_seen": 15152380119, "step": 3886, "train_runtime": 154507.7654, "train_tokens_per_second": 98068.729 }, { "epoch": 0.6179650238473768, "grad_norm": 0.2102145254611969, "learning_rate": 1.601530254288715e-05, "loss": 0.3927, "num_input_tokens_seen": 15156263763, "step": 3887, "train_runtime": 154545.9869, "train_tokens_per_second": 98069.604 }, { "epoch": 0.6181240063593004, "grad_norm": 0.20883798599243164, "learning_rate": 1.6003627320903393e-05, "loss": 0.4076, "num_input_tokens_seen": 15160168351, "step": 3888, "train_runtime": 154584.5137, "train_tokens_per_second": 98070.421 }, { "epoch": 0.6182829888712241, "grad_norm": 0.21532700955867767, "learning_rate": 1.599195435244742e-05, "loss": 0.3998, "num_input_tokens_seen": 15164033995, "step": 3889, "train_runtime": 154623.4916, "train_tokens_per_second": 98070.7 }, { "epoch": 0.6184419713831478, "grad_norm": 0.21385127305984497, "learning_rate": 1.5980283640443235e-05, "loss": 0.3966, "num_input_tokens_seen": 15167991758, "step": 3890, "train_runtime": 154664.1715, "train_tokens_per_second": 98070.494 }, { "epoch": 0.6186009538950715, "grad_norm": 0.23459979891777039, "learning_rate": 1.5968615187814244e-05, "loss": 0.4024, "num_input_tokens_seen": 15171909603, "step": 3891, "train_runtime": 154704.0866, "train_tokens_per_second": 98070.516 }, { "epoch": 0.6187599364069952, "grad_norm": 0.18560431897640228, "learning_rate": 1.5956948997483343e-05, "loss": 0.3837, "num_input_tokens_seen": 15175824149, "step": 3892, "train_runtime": 154742.7285, "train_tokens_per_second": 98071.323 }, { "epoch": 0.6189189189189189, "grad_norm": 0.22730088233947754, "learning_rate": 1.5945285072372806e-05, "loss": 0.3989, "num_input_tokens_seen": 15179667547, "step": 3893, "train_runtime": 154782.9221, "train_tokens_per_second": 98070.687 }, { "epoch": 0.6190779014308426, "grad_norm": 0.2019042670726776, "learning_rate": 1.5933623415404387e-05, "loss": 0.4113, "num_input_tokens_seen": 15183510678, "step": 3894, "train_runtime": 154822.4559, "train_tokens_per_second": 98070.468 }, { "epoch": 0.6192368839427663, "grad_norm": 0.1772555559873581, "learning_rate": 1.5921964029499233e-05, "loss": 0.3913, "num_input_tokens_seen": 15187499900, "step": 3895, "train_runtime": 154863.8822, "train_tokens_per_second": 98069.993 }, { "epoch": 0.61939586645469, "grad_norm": 0.20312315225601196, "learning_rate": 1.591030691757794e-05, "loss": 0.381, "num_input_tokens_seen": 15191371295, "step": 3896, "train_runtime": 154903.0091, "train_tokens_per_second": 98070.214 }, { "epoch": 0.6195548489666137, "grad_norm": 0.20401306450366974, "learning_rate": 1.5898652082560535e-05, "loss": 0.3982, "num_input_tokens_seen": 15195278599, "step": 3897, "train_runtime": 154940.7862, "train_tokens_per_second": 98071.521 }, { "epoch": 0.6197138314785373, "grad_norm": 0.19971512258052826, "learning_rate": 1.588699952736646e-05, "loss": 0.3958, "num_input_tokens_seen": 15199232871, "step": 3898, "train_runtime": 154980.7539, "train_tokens_per_second": 98071.744 }, { "epoch": 0.619872813990461, "grad_norm": 0.22004736959934235, "learning_rate": 1.5875349254914614e-05, "loss": 0.3993, "num_input_tokens_seen": 15203060584, "step": 3899, "train_runtime": 155019.8425, "train_tokens_per_second": 98071.707 }, { "epoch": 0.6200317965023847, "grad_norm": 0.1971684992313385, "learning_rate": 1.5863701268123307e-05, "loss": 0.4009, "num_input_tokens_seen": 15207052382, "step": 3900, "train_runtime": 155060.336, "train_tokens_per_second": 98071.839 }, { "epoch": 0.6201907790143084, "grad_norm": 0.18761751055717468, "learning_rate": 1.5852055569910263e-05, "loss": 0.3959, "num_input_tokens_seen": 15211000512, "step": 3901, "train_runtime": 155100.5814, "train_tokens_per_second": 98071.847 }, { "epoch": 0.6203497615262321, "grad_norm": 0.2598487436771393, "learning_rate": 1.5840412163192665e-05, "loss": 0.3914, "num_input_tokens_seen": 15214931559, "step": 3902, "train_runtime": 155142.2143, "train_tokens_per_second": 98070.868 }, { "epoch": 0.6205087440381558, "grad_norm": 0.23832572996616364, "learning_rate": 1.5828771050887094e-05, "loss": 0.3868, "num_input_tokens_seen": 15218869556, "step": 3903, "train_runtime": 155181.985, "train_tokens_per_second": 98071.11 }, { "epoch": 0.6206677265500795, "grad_norm": 0.1917109340429306, "learning_rate": 1.581713223590956e-05, "loss": 0.4051, "num_input_tokens_seen": 15222798457, "step": 3904, "train_runtime": 155221.4405, "train_tokens_per_second": 98071.493 }, { "epoch": 0.6208267090620032, "grad_norm": 0.18533290922641754, "learning_rate": 1.5805495721175516e-05, "loss": 0.3924, "num_input_tokens_seen": 15226770623, "step": 3905, "train_runtime": 155260.2424, "train_tokens_per_second": 98072.568 }, { "epoch": 0.6209856915739269, "grad_norm": 0.1960870921611786, "learning_rate": 1.5793861509599817e-05, "loss": 0.393, "num_input_tokens_seen": 15230595457, "step": 3906, "train_runtime": 155299.5482, "train_tokens_per_second": 98072.375 }, { "epoch": 0.6211446740858505, "grad_norm": 0.31261327862739563, "learning_rate": 1.5782229604096765e-05, "loss": 0.3912, "num_input_tokens_seen": 15234514484, "step": 3907, "train_runtime": 155337.1964, "train_tokens_per_second": 98073.834 }, { "epoch": 0.6213036565977742, "grad_norm": 0.24711140990257263, "learning_rate": 1.577060000758006e-05, "loss": 0.3842, "num_input_tokens_seen": 15238435139, "step": 3908, "train_runtime": 155376.6725, "train_tokens_per_second": 98074.15 }, { "epoch": 0.6214626391096979, "grad_norm": 0.19208940863609314, "learning_rate": 1.5758972722962827e-05, "loss": 0.3886, "num_input_tokens_seen": 15242402319, "step": 3909, "train_runtime": 155410.4437, "train_tokens_per_second": 98078.366 }, { "epoch": 0.6216216216216216, "grad_norm": 0.18698811531066895, "learning_rate": 1.5747347753157637e-05, "loss": 0.3876, "num_input_tokens_seen": 15246325239, "step": 3910, "train_runtime": 155452.1251, "train_tokens_per_second": 98077.303 }, { "epoch": 0.6217806041335453, "grad_norm": 0.18218210339546204, "learning_rate": 1.5735725101076442e-05, "loss": 0.3851, "num_input_tokens_seen": 15250193894, "step": 3911, "train_runtime": 155492.5176, "train_tokens_per_second": 98076.706 }, { "epoch": 0.621939586645469, "grad_norm": 0.19935661554336548, "learning_rate": 1.5724104769630655e-05, "loss": 0.3892, "num_input_tokens_seen": 15254179362, "step": 3912, "train_runtime": 155532.6789, "train_tokens_per_second": 98077.005 }, { "epoch": 0.6220985691573927, "grad_norm": 0.255723774433136, "learning_rate": 1.5712486761731075e-05, "loss": 0.3958, "num_input_tokens_seen": 15258023479, "step": 3913, "train_runtime": 155572.9032, "train_tokens_per_second": 98076.356 }, { "epoch": 0.6222575516693164, "grad_norm": 0.27516528964042664, "learning_rate": 1.570087108028792e-05, "loss": 0.3935, "num_input_tokens_seen": 15261928722, "step": 3914, "train_runtime": 155611.7027, "train_tokens_per_second": 98076.998 }, { "epoch": 0.6224165341812401, "grad_norm": 0.25057435035705566, "learning_rate": 1.5689257728210862e-05, "loss": 0.3981, "num_input_tokens_seen": 15265789746, "step": 3915, "train_runtime": 155650.9311, "train_tokens_per_second": 98077.086 }, { "epoch": 0.6225755166931638, "grad_norm": 0.21253164112567902, "learning_rate": 1.5677646708408937e-05, "loss": 0.3915, "num_input_tokens_seen": 15269708520, "step": 3916, "train_runtime": 155690.7061, "train_tokens_per_second": 98077.2 }, { "epoch": 0.6227344992050874, "grad_norm": 0.1888233721256256, "learning_rate": 1.566603802379065e-05, "loss": 0.3886, "num_input_tokens_seen": 15273649007, "step": 3917, "train_runtime": 155729.6622, "train_tokens_per_second": 98077.969 }, { "epoch": 0.6228934817170111, "grad_norm": 0.2256990522146225, "learning_rate": 1.5654431677263877e-05, "loss": 0.4085, "num_input_tokens_seen": 15277519236, "step": 3918, "train_runtime": 155770.4529, "train_tokens_per_second": 98077.132 }, { "epoch": 0.6230524642289348, "grad_norm": 0.1851760298013687, "learning_rate": 1.5642827671735927e-05, "loss": 0.397, "num_input_tokens_seen": 15281506836, "step": 3919, "train_runtime": 155810.9078, "train_tokens_per_second": 98077.26 }, { "epoch": 0.6232114467408585, "grad_norm": 0.2887892723083496, "learning_rate": 1.563122601011353e-05, "loss": 0.399, "num_input_tokens_seen": 15285398022, "step": 3920, "train_runtime": 155851.7994, "train_tokens_per_second": 98076.494 }, { "epoch": 0.6233704292527822, "grad_norm": 0.22799627482891083, "learning_rate": 1.5619626695302814e-05, "loss": 0.3997, "num_input_tokens_seen": 15289207277, "step": 3921, "train_runtime": 155892.66, "train_tokens_per_second": 98075.222 }, { "epoch": 0.6235294117647059, "grad_norm": 0.18374770879745483, "learning_rate": 1.5608029730209316e-05, "loss": 0.4097, "num_input_tokens_seen": 15293147075, "step": 3922, "train_runtime": 155932.9103, "train_tokens_per_second": 98075.173 }, { "epoch": 0.6236883942766296, "grad_norm": 0.2685049772262573, "learning_rate": 1.559643511773801e-05, "loss": 0.4035, "num_input_tokens_seen": 15297179105, "step": 3923, "train_runtime": 155970.161, "train_tokens_per_second": 98077.6 }, { "epoch": 0.6238473767885533, "grad_norm": 0.20870845019817352, "learning_rate": 1.5584842860793253e-05, "loss": 0.3897, "num_input_tokens_seen": 15300947392, "step": 3924, "train_runtime": 156009.4484, "train_tokens_per_second": 98077.056 }, { "epoch": 0.624006359300477, "grad_norm": 0.20196950435638428, "learning_rate": 1.5573252962278835e-05, "loss": 0.3897, "num_input_tokens_seen": 15304921779, "step": 3925, "train_runtime": 156045.8696, "train_tokens_per_second": 98079.634 }, { "epoch": 0.6241653418124007, "grad_norm": 0.1738186478614807, "learning_rate": 1.5561665425097936e-05, "loss": 0.394, "num_input_tokens_seen": 15308797229, "step": 3926, "train_runtime": 156086.1211, "train_tokens_per_second": 98079.17 }, { "epoch": 0.6243243243243243, "grad_norm": 0.1920064389705658, "learning_rate": 1.5550080252153154e-05, "loss": 0.3963, "num_input_tokens_seen": 15312776868, "step": 3927, "train_runtime": 156123.7627, "train_tokens_per_second": 98081.013 }, { "epoch": 0.624483306836248, "grad_norm": 0.18076784908771515, "learning_rate": 1.5538497446346494e-05, "loss": 0.3971, "num_input_tokens_seen": 15316535691, "step": 3928, "train_runtime": 156160.3812, "train_tokens_per_second": 98082.084 }, { "epoch": 0.6246422893481717, "grad_norm": 0.17522801458835602, "learning_rate": 1.5526917010579355e-05, "loss": 0.3921, "num_input_tokens_seen": 15320420628, "step": 3929, "train_runtime": 156199.52, "train_tokens_per_second": 98082.38 }, { "epoch": 0.6248012718600954, "grad_norm": 0.18965159356594086, "learning_rate": 1.551533894775258e-05, "loss": 0.3798, "num_input_tokens_seen": 15324357219, "step": 3930, "train_runtime": 156237.5009, "train_tokens_per_second": 98083.732 }, { "epoch": 0.6249602543720191, "grad_norm": 0.24202565848827362, "learning_rate": 1.5503763260766368e-05, "loss": 0.4016, "num_input_tokens_seen": 15328207263, "step": 3931, "train_runtime": 156276.9324, "train_tokens_per_second": 98083.62 }, { "epoch": 0.6251192368839428, "grad_norm": 0.1931619644165039, "learning_rate": 1.549218995252035e-05, "loss": 0.4062, "num_input_tokens_seen": 15332153650, "step": 3932, "train_runtime": 156317.8276, "train_tokens_per_second": 98083.206 }, { "epoch": 0.6252782193958665, "grad_norm": 0.2335135042667389, "learning_rate": 1.5480619025913575e-05, "loss": 0.3921, "num_input_tokens_seen": 15336164932, "step": 3933, "train_runtime": 156359.0577, "train_tokens_per_second": 98082.997 }, { "epoch": 0.6254372019077902, "grad_norm": 0.18512512743473053, "learning_rate": 1.5469050483844458e-05, "loss": 0.409, "num_input_tokens_seen": 15340129605, "step": 3934, "train_runtime": 156394.2615, "train_tokens_per_second": 98086.269 }, { "epoch": 0.6255961844197139, "grad_norm": 0.19937610626220703, "learning_rate": 1.5457484329210853e-05, "loss": 0.4055, "num_input_tokens_seen": 15343958102, "step": 3935, "train_runtime": 156433.8541, "train_tokens_per_second": 98085.917 }, { "epoch": 0.6257551669316376, "grad_norm": 0.22926576435565948, "learning_rate": 1.544592056490999e-05, "loss": 0.383, "num_input_tokens_seen": 15347914809, "step": 3936, "train_runtime": 156473.5662, "train_tokens_per_second": 98086.311 }, { "epoch": 0.6259141494435612, "grad_norm": 0.2530640661716461, "learning_rate": 1.5434359193838503e-05, "loss": 0.3998, "num_input_tokens_seen": 15351837775, "step": 3937, "train_runtime": 156508.9569, "train_tokens_per_second": 98089.196 }, { "epoch": 0.6260731319554849, "grad_norm": 0.24445225298404694, "learning_rate": 1.5422800218892457e-05, "loss": 0.3802, "num_input_tokens_seen": 15355696019, "step": 3938, "train_runtime": 156546.791, "train_tokens_per_second": 98090.136 }, { "epoch": 0.6262321144674086, "grad_norm": 0.2345081865787506, "learning_rate": 1.541124364296728e-05, "loss": 0.3863, "num_input_tokens_seen": 15359567692, "step": 3939, "train_runtime": 156586.6079, "train_tokens_per_second": 98089.919 }, { "epoch": 0.6263910969793323, "grad_norm": 0.24154867231845856, "learning_rate": 1.5399689468957803e-05, "loss": 0.4042, "num_input_tokens_seen": 15363539612, "step": 3940, "train_runtime": 156625.8763, "train_tokens_per_second": 98090.686 }, { "epoch": 0.626550079491256, "grad_norm": 0.2226591408252716, "learning_rate": 1.5388137699758286e-05, "loss": 0.4082, "num_input_tokens_seen": 15367479022, "step": 3941, "train_runtime": 156662.7956, "train_tokens_per_second": 98092.715 }, { "epoch": 0.6267090620031797, "grad_norm": 0.2060922384262085, "learning_rate": 1.5376588338262345e-05, "loss": 0.3991, "num_input_tokens_seen": 15371287498, "step": 3942, "train_runtime": 156701.3828, "train_tokens_per_second": 98092.864 }, { "epoch": 0.6268680445151034, "grad_norm": 0.22468768060207367, "learning_rate": 1.5365041387363032e-05, "loss": 0.3881, "num_input_tokens_seen": 15375183598, "step": 3943, "train_runtime": 156739.5927, "train_tokens_per_second": 98093.809 }, { "epoch": 0.6270270270270271, "grad_norm": 0.20012272894382477, "learning_rate": 1.5353496849952765e-05, "loss": 0.4018, "num_input_tokens_seen": 15379125578, "step": 3944, "train_runtime": 156779.6738, "train_tokens_per_second": 98093.874 }, { "epoch": 0.6271860095389508, "grad_norm": 0.19587473571300507, "learning_rate": 1.5341954728923365e-05, "loss": 0.3951, "num_input_tokens_seen": 15382976414, "step": 3945, "train_runtime": 156816.7165, "train_tokens_per_second": 98095.259 }, { "epoch": 0.6273449920508744, "grad_norm": 0.2201821506023407, "learning_rate": 1.533041502716606e-05, "loss": 0.3981, "num_input_tokens_seen": 15386962512, "step": 3946, "train_runtime": 156857.6278, "train_tokens_per_second": 98095.086 }, { "epoch": 0.627503974562798, "grad_norm": 0.2060127556324005, "learning_rate": 1.531887774757146e-05, "loss": 0.3869, "num_input_tokens_seen": 15390767366, "step": 3947, "train_runtime": 156898.0253, "train_tokens_per_second": 98094.08 }, { "epoch": 0.6276629570747218, "grad_norm": 0.1746152937412262, "learning_rate": 1.530734289302958e-05, "loss": 0.3893, "num_input_tokens_seen": 15394670637, "step": 3948, "train_runtime": 156935.4968, "train_tokens_per_second": 98095.529 }, { "epoch": 0.6278219395866454, "grad_norm": 0.22702662646770477, "learning_rate": 1.5295810466429816e-05, "loss": 0.393, "num_input_tokens_seen": 15398621104, "step": 3949, "train_runtime": 156974.2758, "train_tokens_per_second": 98096.462 }, { "epoch": 0.6279809220985691, "grad_norm": 0.21704714000225067, "learning_rate": 1.528428047066095e-05, "loss": 0.3977, "num_input_tokens_seen": 15402402388, "step": 3950, "train_runtime": 157015.6988, "train_tokens_per_second": 98094.665 }, { "epoch": 0.6281399046104928, "grad_norm": 0.2718389332294464, "learning_rate": 1.5272752908611178e-05, "loss": 0.4063, "num_input_tokens_seen": 15406390178, "step": 3951, "train_runtime": 157055.4066, "train_tokens_per_second": 98095.255 }, { "epoch": 0.6282988871224165, "grad_norm": 0.1978420466184616, "learning_rate": 1.5261227783168055e-05, "loss": 0.4019, "num_input_tokens_seen": 15410281921, "step": 3952, "train_runtime": 157093.2959, "train_tokens_per_second": 98096.369 }, { "epoch": 0.6284578696343402, "grad_norm": 0.20500431954860687, "learning_rate": 1.5249705097218563e-05, "loss": 0.3849, "num_input_tokens_seen": 15414156247, "step": 3953, "train_runtime": 157129.3576, "train_tokens_per_second": 98098.512 }, { "epoch": 0.628616852146264, "grad_norm": 0.3004668653011322, "learning_rate": 1.5238184853649051e-05, "loss": 0.3918, "num_input_tokens_seen": 15418003089, "step": 3954, "train_runtime": 157170.9384, "train_tokens_per_second": 98097.035 }, { "epoch": 0.6287758346581876, "grad_norm": 0.22337549924850464, "learning_rate": 1.522666705534524e-05, "loss": 0.3973, "num_input_tokens_seen": 15421937609, "step": 3955, "train_runtime": 157209.5237, "train_tokens_per_second": 98097.986 }, { "epoch": 0.6289348171701112, "grad_norm": 0.1906699538230896, "learning_rate": 1.5215151705192277e-05, "loss": 0.3996, "num_input_tokens_seen": 15425947633, "step": 3956, "train_runtime": 157250.1125, "train_tokens_per_second": 98098.166 }, { "epoch": 0.6290937996820349, "grad_norm": 0.1973932981491089, "learning_rate": 1.5203638806074671e-05, "loss": 0.3932, "num_input_tokens_seen": 15429909343, "step": 3957, "train_runtime": 157289.328, "train_tokens_per_second": 98098.895 }, { "epoch": 0.6292527821939586, "grad_norm": 0.23921304941177368, "learning_rate": 1.5192128360876312e-05, "loss": 0.4076, "num_input_tokens_seen": 15433872143, "step": 3958, "train_runtime": 157325.9963, "train_tokens_per_second": 98101.22 }, { "epoch": 0.6294117647058823, "grad_norm": 0.20165948569774628, "learning_rate": 1.5180620372480504e-05, "loss": 0.4076, "num_input_tokens_seen": 15437806270, "step": 3959, "train_runtime": 157366.4868, "train_tokens_per_second": 98100.978 }, { "epoch": 0.629570747217806, "grad_norm": 0.22237026691436768, "learning_rate": 1.5169114843769888e-05, "loss": 0.3934, "num_input_tokens_seen": 15441576974, "step": 3960, "train_runtime": 157404.9554, "train_tokens_per_second": 98100.958 }, { "epoch": 0.6297297297297297, "grad_norm": 0.19950126111507416, "learning_rate": 1.5157611777626551e-05, "loss": 0.4064, "num_input_tokens_seen": 15445623539, "step": 3961, "train_runtime": 157443.3587, "train_tokens_per_second": 98102.731 }, { "epoch": 0.6298887122416534, "grad_norm": 0.20239059627056122, "learning_rate": 1.514611117693191e-05, "loss": 0.3958, "num_input_tokens_seen": 15449500085, "step": 3962, "train_runtime": 157480.8377, "train_tokens_per_second": 98104.0 }, { "epoch": 0.6300476947535771, "grad_norm": 0.2300134301185608, "learning_rate": 1.513461304456678e-05, "loss": 0.3956, "num_input_tokens_seen": 15453466591, "step": 3963, "train_runtime": 157519.9135, "train_tokens_per_second": 98104.844 }, { "epoch": 0.6302066772655008, "grad_norm": 0.19668830931186676, "learning_rate": 1.5123117383411379e-05, "loss": 0.4076, "num_input_tokens_seen": 15457373103, "step": 3964, "train_runtime": 157559.1375, "train_tokens_per_second": 98105.215 }, { "epoch": 0.6303656597774245, "grad_norm": 0.2192411720752716, "learning_rate": 1.5111624196345275e-05, "loss": 0.4027, "num_input_tokens_seen": 15461241393, "step": 3965, "train_runtime": 157599.233, "train_tokens_per_second": 98104.801 }, { "epoch": 0.6305246422893481, "grad_norm": 0.19535961747169495, "learning_rate": 1.5100133486247442e-05, "loss": 0.4042, "num_input_tokens_seen": 15465180865, "step": 3966, "train_runtime": 157639.0553, "train_tokens_per_second": 98105.009 }, { "epoch": 0.6306836248012718, "grad_norm": 0.20870228111743927, "learning_rate": 1.508864525599621e-05, "loss": 0.3923, "num_input_tokens_seen": 15468969712, "step": 3967, "train_runtime": 157678.4193, "train_tokens_per_second": 98104.546 }, { "epoch": 0.6308426073131955, "grad_norm": 0.1856527179479599, "learning_rate": 1.5077159508469302e-05, "loss": 0.3898, "num_input_tokens_seen": 15472859629, "step": 3968, "train_runtime": 157720.0071, "train_tokens_per_second": 98103.341 }, { "epoch": 0.6310015898251192, "grad_norm": 0.1873500645160675, "learning_rate": 1.5065676246543821e-05, "loss": 0.4026, "num_input_tokens_seen": 15476799035, "step": 3969, "train_runtime": 157761.1677, "train_tokens_per_second": 98102.716 }, { "epoch": 0.6311605723370429, "grad_norm": 0.19838036596775055, "learning_rate": 1.5054195473096236e-05, "loss": 0.3979, "num_input_tokens_seen": 15480733339, "step": 3970, "train_runtime": 157799.8977, "train_tokens_per_second": 98103.57 }, { "epoch": 0.6313195548489666, "grad_norm": 0.19847792387008667, "learning_rate": 1.5042717191002412e-05, "loss": 0.3869, "num_input_tokens_seen": 15484506667, "step": 3971, "train_runtime": 157839.6557, "train_tokens_per_second": 98102.765 }, { "epoch": 0.6314785373608903, "grad_norm": 0.510717511177063, "learning_rate": 1.5031241403137569e-05, "loss": 0.4056, "num_input_tokens_seen": 15488337649, "step": 3972, "train_runtime": 157879.1724, "train_tokens_per_second": 98102.476 }, { "epoch": 0.631637519872814, "grad_norm": 0.19041617214679718, "learning_rate": 1.5019768112376309e-05, "loss": 0.3857, "num_input_tokens_seen": 15492453810, "step": 3973, "train_runtime": 157917.4582, "train_tokens_per_second": 98104.757 }, { "epoch": 0.6317965023847377, "grad_norm": 0.20094604790210724, "learning_rate": 1.5008297321592613e-05, "loss": 0.393, "num_input_tokens_seen": 15496274354, "step": 3974, "train_runtime": 157957.6287, "train_tokens_per_second": 98103.995 }, { "epoch": 0.6319554848966613, "grad_norm": 0.21766892075538635, "learning_rate": 1.4996829033659832e-05, "loss": 0.4028, "num_input_tokens_seen": 15500140955, "step": 3975, "train_runtime": 157996.7967, "train_tokens_per_second": 98104.147 }, { "epoch": 0.632114467408585, "grad_norm": 0.2553071975708008, "learning_rate": 1.4985363251450698e-05, "loss": 0.39, "num_input_tokens_seen": 15504170689, "step": 3976, "train_runtime": 158034.8445, "train_tokens_per_second": 98106.027 }, { "epoch": 0.6322734499205087, "grad_norm": 0.690142810344696, "learning_rate": 1.4973899977837302e-05, "loss": 0.4098, "num_input_tokens_seen": 15508170357, "step": 3977, "train_runtime": 158074.7274, "train_tokens_per_second": 98106.577 }, { "epoch": 0.6324324324324324, "grad_norm": 0.2599807381629944, "learning_rate": 1.4962439215691105e-05, "loss": 0.3906, "num_input_tokens_seen": 15511927361, "step": 3978, "train_runtime": 158117.2531, "train_tokens_per_second": 98103.952 }, { "epoch": 0.6325914149443561, "grad_norm": 0.19136883318424225, "learning_rate": 1.4950980967882963e-05, "loss": 0.3913, "num_input_tokens_seen": 15515815564, "step": 3979, "train_runtime": 158155.4164, "train_tokens_per_second": 98104.864 }, { "epoch": 0.6327503974562798, "grad_norm": 0.1980409175157547, "learning_rate": 1.493952523728308e-05, "loss": 0.3986, "num_input_tokens_seen": 15519746717, "step": 3980, "train_runtime": 158192.7762, "train_tokens_per_second": 98106.545 }, { "epoch": 0.6329093799682035, "grad_norm": 0.2348732352256775, "learning_rate": 1.4928072026761025e-05, "loss": 0.3781, "num_input_tokens_seen": 15523556804, "step": 3981, "train_runtime": 158232.9429, "train_tokens_per_second": 98105.72 }, { "epoch": 0.6330683624801272, "grad_norm": 0.21007879078388214, "learning_rate": 1.4916621339185766e-05, "loss": 0.4057, "num_input_tokens_seen": 15527443651, "step": 3982, "train_runtime": 158271.9646, "train_tokens_per_second": 98106.09 }, { "epoch": 0.6332273449920509, "grad_norm": 0.20857135951519012, "learning_rate": 1.4905173177425601e-05, "loss": 0.398, "num_input_tokens_seen": 15531450875, "step": 3983, "train_runtime": 158313.0358, "train_tokens_per_second": 98105.951 }, { "epoch": 0.6333863275039746, "grad_norm": 0.22514723241329193, "learning_rate": 1.4893727544348229e-05, "loss": 0.4034, "num_input_tokens_seen": 15535227172, "step": 3984, "train_runtime": 158348.3146, "train_tokens_per_second": 98107.941 }, { "epoch": 0.6335453100158982, "grad_norm": 0.19645638763904572, "learning_rate": 1.4882284442820691e-05, "loss": 0.3998, "num_input_tokens_seen": 15539015422, "step": 3985, "train_runtime": 158388.0232, "train_tokens_per_second": 98107.263 }, { "epoch": 0.6337042925278219, "grad_norm": 0.21864396333694458, "learning_rate": 1.4870843875709394e-05, "loss": 0.3843, "num_input_tokens_seen": 15542930505, "step": 3986, "train_runtime": 158425.0158, "train_tokens_per_second": 98109.067 }, { "epoch": 0.6338632750397456, "grad_norm": 0.19638925790786743, "learning_rate": 1.4859405845880143e-05, "loss": 0.3913, "num_input_tokens_seen": 15546963544, "step": 3987, "train_runtime": 158464.214, "train_tokens_per_second": 98110.249 }, { "epoch": 0.6340222575516693, "grad_norm": 0.19048772752285004, "learning_rate": 1.4847970356198063e-05, "loss": 0.3956, "num_input_tokens_seen": 15550773137, "step": 3988, "train_runtime": 158501.5177, "train_tokens_per_second": 98111.194 }, { "epoch": 0.634181240063593, "grad_norm": 0.2541404068470001, "learning_rate": 1.4836537409527673e-05, "loss": 0.3898, "num_input_tokens_seen": 15554566994, "step": 3989, "train_runtime": 158541.3235, "train_tokens_per_second": 98110.49 }, { "epoch": 0.6343402225755167, "grad_norm": 0.2256358563899994, "learning_rate": 1.4825107008732848e-05, "loss": 0.3955, "num_input_tokens_seen": 15558506751, "step": 3990, "train_runtime": 158580.1866, "train_tokens_per_second": 98111.29 }, { "epoch": 0.6344992050874404, "grad_norm": 0.2367280125617981, "learning_rate": 1.4813679156676807e-05, "loss": 0.3919, "num_input_tokens_seen": 15562457705, "step": 3991, "train_runtime": 158621.1203, "train_tokens_per_second": 98110.88 }, { "epoch": 0.6346581875993641, "grad_norm": 0.22665411233901978, "learning_rate": 1.4802253856222171e-05, "loss": 0.3941, "num_input_tokens_seen": 15566210103, "step": 3992, "train_runtime": 158658.1562, "train_tokens_per_second": 98111.629 }, { "epoch": 0.6348171701112878, "grad_norm": 0.21314369142055511, "learning_rate": 1.4790831110230868e-05, "loss": 0.3889, "num_input_tokens_seen": 15570125170, "step": 3993, "train_runtime": 158700.1678, "train_tokens_per_second": 98110.326 }, { "epoch": 0.6349761526232115, "grad_norm": 0.3279072344303131, "learning_rate": 1.477941092156424e-05, "loss": 0.3934, "num_input_tokens_seen": 15573931737, "step": 3994, "train_runtime": 158738.6414, "train_tokens_per_second": 98110.527 }, { "epoch": 0.6351351351351351, "grad_norm": 0.19198055565357208, "learning_rate": 1.4767993293082957e-05, "loss": 0.3953, "num_input_tokens_seen": 15577900584, "step": 3995, "train_runtime": 158778.2304, "train_tokens_per_second": 98111.061 }, { "epoch": 0.6352941176470588, "grad_norm": 0.22423163056373596, "learning_rate": 1.4756578227647039e-05, "loss": 0.3823, "num_input_tokens_seen": 15581897945, "step": 3996, "train_runtime": 158817.5146, "train_tokens_per_second": 98111.962 }, { "epoch": 0.6354531001589825, "grad_norm": 0.21570441126823425, "learning_rate": 1.4745165728115901e-05, "loss": 0.4016, "num_input_tokens_seen": 15585846330, "step": 3997, "train_runtime": 158857.9596, "train_tokens_per_second": 98111.838 }, { "epoch": 0.6356120826709062, "grad_norm": 1.2218581438064575, "learning_rate": 1.4733755797348286e-05, "loss": 0.3943, "num_input_tokens_seen": 15589747528, "step": 3998, "train_runtime": 158898.8855, "train_tokens_per_second": 98111.119 }, { "epoch": 0.6357710651828299, "grad_norm": 0.18584083020687103, "learning_rate": 1.4722348438202293e-05, "loss": 0.4016, "num_input_tokens_seen": 15593541815, "step": 3999, "train_runtime": 158938.7381, "train_tokens_per_second": 98110.391 }, { "epoch": 0.6359300476947536, "grad_norm": 0.21049943566322327, "learning_rate": 1.4710943653535395e-05, "loss": 0.3842, "num_input_tokens_seen": 15597450681, "step": 4000, "train_runtime": 158975.288, "train_tokens_per_second": 98112.423 }, { "epoch": 0.6360890302066773, "grad_norm": 0.2026798129081726, "learning_rate": 1.4699541446204396e-05, "loss": 0.3887, "num_input_tokens_seen": 15601426085, "step": 4001, "train_runtime": 159114.5632, "train_tokens_per_second": 98051.528 }, { "epoch": 0.636248012718601, "grad_norm": 0.1878293752670288, "learning_rate": 1.4688141819065488e-05, "loss": 0.3797, "num_input_tokens_seen": 15605345583, "step": 4002, "train_runtime": 159149.7338, "train_tokens_per_second": 98054.487 }, { "epoch": 0.6364069952305247, "grad_norm": 0.1969972550868988, "learning_rate": 1.4676744774974187e-05, "loss": 0.395, "num_input_tokens_seen": 15609233538, "step": 4003, "train_runtime": 159190.1616, "train_tokens_per_second": 98054.009 }, { "epoch": 0.6365659777424484, "grad_norm": 0.2587447166442871, "learning_rate": 1.4665350316785364e-05, "loss": 0.3869, "num_input_tokens_seen": 15613174236, "step": 4004, "train_runtime": 159230.5092, "train_tokens_per_second": 98053.911 }, { "epoch": 0.636724960254372, "grad_norm": 0.1720552295446396, "learning_rate": 1.4653958447353267e-05, "loss": 0.3978, "num_input_tokens_seen": 15617079766, "step": 4005, "train_runtime": 159270.8725, "train_tokens_per_second": 98053.583 }, { "epoch": 0.6368839427662957, "grad_norm": 0.21679359674453735, "learning_rate": 1.4642569169531462e-05, "loss": 0.398, "num_input_tokens_seen": 15620944735, "step": 4006, "train_runtime": 159310.0715, "train_tokens_per_second": 98053.717 }, { "epoch": 0.6370429252782194, "grad_norm": 0.18850083649158478, "learning_rate": 1.46311824861729e-05, "loss": 0.3913, "num_input_tokens_seen": 15624758762, "step": 4007, "train_runtime": 159348.2466, "train_tokens_per_second": 98054.162 }, { "epoch": 0.6372019077901431, "grad_norm": 0.24506297707557678, "learning_rate": 1.4619798400129853e-05, "loss": 0.3848, "num_input_tokens_seen": 15628666234, "step": 4008, "train_runtime": 159388.4077, "train_tokens_per_second": 98053.971 }, { "epoch": 0.6373608903020668, "grad_norm": 0.1709403395652771, "learning_rate": 1.4608416914253947e-05, "loss": 0.3819, "num_input_tokens_seen": 15632616332, "step": 4009, "train_runtime": 159428.7049, "train_tokens_per_second": 98053.963 }, { "epoch": 0.6375198728139905, "grad_norm": 0.219572052359581, "learning_rate": 1.4597038031396188e-05, "loss": 0.4026, "num_input_tokens_seen": 15636521931, "step": 4010, "train_runtime": 159466.6778, "train_tokens_per_second": 98055.106 }, { "epoch": 0.6376788553259142, "grad_norm": 0.20875266194343567, "learning_rate": 1.4585661754406882e-05, "loss": 0.397, "num_input_tokens_seen": 15640338492, "step": 4011, "train_runtime": 159507.2767, "train_tokens_per_second": 98054.075 }, { "epoch": 0.6378378378378379, "grad_norm": 0.1833823025226593, "learning_rate": 1.4574288086135713e-05, "loss": 0.414, "num_input_tokens_seen": 15644273222, "step": 4012, "train_runtime": 159547.9122, "train_tokens_per_second": 98053.763 }, { "epoch": 0.6379968203497616, "grad_norm": 0.1927548497915268, "learning_rate": 1.4562917029431719e-05, "loss": 0.3999, "num_input_tokens_seen": 15648219015, "step": 4013, "train_runtime": 159585.9942, "train_tokens_per_second": 98055.09 }, { "epoch": 0.6381558028616852, "grad_norm": 0.40194109082221985, "learning_rate": 1.4551548587143238e-05, "loss": 0.3919, "num_input_tokens_seen": 15652026722, "step": 4014, "train_runtime": 159623.3523, "train_tokens_per_second": 98055.996 }, { "epoch": 0.6383147853736089, "grad_norm": 0.19323964416980743, "learning_rate": 1.454018276211802e-05, "loss": 0.3867, "num_input_tokens_seen": 15655919046, "step": 4015, "train_runtime": 159660.2413, "train_tokens_per_second": 98057.719 }, { "epoch": 0.6384737678855326, "grad_norm": 0.1906774789094925, "learning_rate": 1.4528819557203102e-05, "loss": 0.3847, "num_input_tokens_seen": 15659786838, "step": 4016, "train_runtime": 159699.4671, "train_tokens_per_second": 98057.853 }, { "epoch": 0.6386327503974563, "grad_norm": 0.21699142456054688, "learning_rate": 1.451745897524489e-05, "loss": 0.3844, "num_input_tokens_seen": 15663720084, "step": 4017, "train_runtime": 159738.4776, "train_tokens_per_second": 98058.529 }, { "epoch": 0.63879173290938, "grad_norm": 0.44169169664382935, "learning_rate": 1.450610101908914e-05, "loss": 0.4002, "num_input_tokens_seen": 15667604810, "step": 4018, "train_runtime": 159777.9857, "train_tokens_per_second": 98058.595 }, { "epoch": 0.6389507154213037, "grad_norm": 0.1988692283630371, "learning_rate": 1.4494745691580914e-05, "loss": 0.3906, "num_input_tokens_seen": 15671468858, "step": 4019, "train_runtime": 159818.2283, "train_tokens_per_second": 98058.081 }, { "epoch": 0.6391096979332274, "grad_norm": 0.2414090931415558, "learning_rate": 1.4483392995564676e-05, "loss": 0.4011, "num_input_tokens_seen": 15675445267, "step": 4020, "train_runtime": 159856.5678, "train_tokens_per_second": 98059.438 }, { "epoch": 0.6392686804451511, "grad_norm": 0.20907649397850037, "learning_rate": 1.447204293388417e-05, "loss": 0.4006, "num_input_tokens_seen": 15679384308, "step": 4021, "train_runtime": 159897.0254, "train_tokens_per_second": 98059.262 }, { "epoch": 0.6394276629570748, "grad_norm": 0.2069476842880249, "learning_rate": 1.4460695509382512e-05, "loss": 0.3962, "num_input_tokens_seen": 15683245489, "step": 4022, "train_runtime": 159935.4799, "train_tokens_per_second": 98059.827 }, { "epoch": 0.6395866454689985, "grad_norm": 0.215724378824234, "learning_rate": 1.4449350724902157e-05, "loss": 0.3823, "num_input_tokens_seen": 15687079420, "step": 4023, "train_runtime": 159973.4981, "train_tokens_per_second": 98060.489 }, { "epoch": 0.639745627980922, "grad_norm": 0.18130838871002197, "learning_rate": 1.443800858328489e-05, "loss": 0.397, "num_input_tokens_seen": 15691009955, "step": 4024, "train_runtime": 160010.6597, "train_tokens_per_second": 98062.279 }, { "epoch": 0.6399046104928457, "grad_norm": 0.21193118393421173, "learning_rate": 1.4426669087371849e-05, "loss": 0.3932, "num_input_tokens_seen": 15694892961, "step": 4025, "train_runtime": 160048.4091, "train_tokens_per_second": 98063.411 }, { "epoch": 0.6400635930047694, "grad_norm": 0.2093924731016159, "learning_rate": 1.4415332240003477e-05, "loss": 0.3879, "num_input_tokens_seen": 15698737129, "step": 4026, "train_runtime": 160088.3773, "train_tokens_per_second": 98062.941 }, { "epoch": 0.6402225755166931, "grad_norm": 0.1937035322189331, "learning_rate": 1.4403998044019585e-05, "loss": 0.4088, "num_input_tokens_seen": 15702633165, "step": 4027, "train_runtime": 160125.7426, "train_tokens_per_second": 98064.389 }, { "epoch": 0.6403815580286168, "grad_norm": 0.214511439204216, "learning_rate": 1.4392666502259305e-05, "loss": 0.3901, "num_input_tokens_seen": 15706576892, "step": 4028, "train_runtime": 160165.9138, "train_tokens_per_second": 98064.417 }, { "epoch": 0.6405405405405405, "grad_norm": 0.21431271731853485, "learning_rate": 1.438133761756111e-05, "loss": 0.3864, "num_input_tokens_seen": 15710407599, "step": 4029, "train_runtime": 160205.3687, "train_tokens_per_second": 98064.177 }, { "epoch": 0.6406995230524642, "grad_norm": 0.26273784041404724, "learning_rate": 1.4370011392762811e-05, "loss": 0.3881, "num_input_tokens_seen": 15714311896, "step": 4030, "train_runtime": 160243.3624, "train_tokens_per_second": 98065.291 }, { "epoch": 0.640858505564388, "grad_norm": 0.1877318173646927, "learning_rate": 1.435868783070155e-05, "loss": 0.3917, "num_input_tokens_seen": 15718317511, "step": 4031, "train_runtime": 160281.5514, "train_tokens_per_second": 98066.916 }, { "epoch": 0.6410174880763116, "grad_norm": 0.19873756170272827, "learning_rate": 1.4347366934213779e-05, "loss": 0.4021, "num_input_tokens_seen": 15722206532, "step": 4032, "train_runtime": 160321.8793, "train_tokens_per_second": 98066.506 }, { "epoch": 0.6411764705882353, "grad_norm": 0.19370724260807037, "learning_rate": 1.4336048706135321e-05, "loss": 0.4, "num_input_tokens_seen": 15726042102, "step": 4033, "train_runtime": 160360.2151, "train_tokens_per_second": 98066.981 }, { "epoch": 0.6413354531001589, "grad_norm": 0.28410232067108154, "learning_rate": 1.4324733149301306e-05, "loss": 0.4004, "num_input_tokens_seen": 15729949313, "step": 4034, "train_runtime": 160401.366, "train_tokens_per_second": 98066.181 }, { "epoch": 0.6414944356120826, "grad_norm": 0.1981056034564972, "learning_rate": 1.4313420266546202e-05, "loss": 0.3846, "num_input_tokens_seen": 15733930172, "step": 4035, "train_runtime": 160438.0493, "train_tokens_per_second": 98068.571 }, { "epoch": 0.6416534181240063, "grad_norm": 0.20234094560146332, "learning_rate": 1.4302110060703816e-05, "loss": 0.4061, "num_input_tokens_seen": 15737765556, "step": 4036, "train_runtime": 160477.4847, "train_tokens_per_second": 98068.371 }, { "epoch": 0.64181240063593, "grad_norm": 0.28418979048728943, "learning_rate": 1.4290802534607243e-05, "loss": 0.3985, "num_input_tokens_seen": 15741636090, "step": 4037, "train_runtime": 160517.7882, "train_tokens_per_second": 98067.861 }, { "epoch": 0.6419713831478537, "grad_norm": 0.18765628337860107, "learning_rate": 1.427949769108898e-05, "loss": 0.398, "num_input_tokens_seen": 15745576042, "step": 4038, "train_runtime": 160558.9256, "train_tokens_per_second": 98067.273 }, { "epoch": 0.6421303656597774, "grad_norm": 0.22013136744499207, "learning_rate": 1.4268195532980782e-05, "loss": 0.3816, "num_input_tokens_seen": 15749559342, "step": 4039, "train_runtime": 160599.2661, "train_tokens_per_second": 98067.443 }, { "epoch": 0.6422893481717011, "grad_norm": 0.21800386905670166, "learning_rate": 1.4256896063113766e-05, "loss": 0.393, "num_input_tokens_seen": 15753392445, "step": 4040, "train_runtime": 160637.125, "train_tokens_per_second": 98068.192 }, { "epoch": 0.6424483306836248, "grad_norm": 0.22559477388858795, "learning_rate": 1.4245599284318376e-05, "loss": 0.3901, "num_input_tokens_seen": 15757154998, "step": 4041, "train_runtime": 160675.4993, "train_tokens_per_second": 98068.188 }, { "epoch": 0.6426073131955485, "grad_norm": 0.23205633461475372, "learning_rate": 1.4234305199424369e-05, "loss": 0.3939, "num_input_tokens_seen": 15761157570, "step": 4042, "train_runtime": 160713.8795, "train_tokens_per_second": 98069.673 }, { "epoch": 0.6427662957074721, "grad_norm": 0.20031052827835083, "learning_rate": 1.4223013811260843e-05, "loss": 0.3875, "num_input_tokens_seen": 15765038137, "step": 4043, "train_runtime": 160752.9808, "train_tokens_per_second": 98069.958 }, { "epoch": 0.6429252782193958, "grad_norm": 0.2580338716506958, "learning_rate": 1.4211725122656195e-05, "loss": 0.3892, "num_input_tokens_seen": 15768965969, "step": 4044, "train_runtime": 160792.4026, "train_tokens_per_second": 98070.342 }, { "epoch": 0.6430842607313195, "grad_norm": 0.3176249563694, "learning_rate": 1.420043913643817e-05, "loss": 0.3877, "num_input_tokens_seen": 15772845946, "step": 4045, "train_runtime": 160830.0363, "train_tokens_per_second": 98071.519 }, { "epoch": 0.6432432432432432, "grad_norm": 0.22276657819747925, "learning_rate": 1.4189155855433828e-05, "loss": 0.3969, "num_input_tokens_seen": 15776770788, "step": 4046, "train_runtime": 160871.1471, "train_tokens_per_second": 98070.854 }, { "epoch": 0.6434022257551669, "grad_norm": 0.2226789891719818, "learning_rate": 1.4177875282469552e-05, "loss": 0.3899, "num_input_tokens_seen": 15780827147, "step": 4047, "train_runtime": 160913.7584, "train_tokens_per_second": 98070.092 }, { "epoch": 0.6435612082670906, "grad_norm": 0.19689416885375977, "learning_rate": 1.4166597420371041e-05, "loss": 0.4045, "num_input_tokens_seen": 15784656365, "step": 4048, "train_runtime": 160953.7171, "train_tokens_per_second": 98069.536 }, { "epoch": 0.6437201907790143, "grad_norm": 0.20280855894088745, "learning_rate": 1.415532227196334e-05, "loss": 0.404, "num_input_tokens_seen": 15788580424, "step": 4049, "train_runtime": 160993.1301, "train_tokens_per_second": 98069.902 }, { "epoch": 0.643879173290938, "grad_norm": 0.1996580809354782, "learning_rate": 1.4144049840070761e-05, "loss": 0.3992, "num_input_tokens_seen": 15792481881, "step": 4050, "train_runtime": 161033.5637, "train_tokens_per_second": 98069.505 }, { "epoch": 0.6440381558028617, "grad_norm": 0.2698691487312317, "learning_rate": 1.413278012751699e-05, "loss": 0.3951, "num_input_tokens_seen": 15796305055, "step": 4051, "train_runtime": 161074.8506, "train_tokens_per_second": 98068.103 }, { "epoch": 0.6441971383147854, "grad_norm": 0.21491704881191254, "learning_rate": 1.4121513137125003e-05, "loss": 0.3844, "num_input_tokens_seen": 15800130886, "step": 4052, "train_runtime": 161114.0981, "train_tokens_per_second": 98067.96 }, { "epoch": 0.644356120826709, "grad_norm": 0.1839483678340912, "learning_rate": 1.4110248871717102e-05, "loss": 0.3969, "num_input_tokens_seen": 15804097025, "step": 4053, "train_runtime": 161153.7131, "train_tokens_per_second": 98068.463 }, { "epoch": 0.6445151033386327, "grad_norm": 0.17881639301776886, "learning_rate": 1.4098987334114922e-05, "loss": 0.3908, "num_input_tokens_seen": 15808019299, "step": 4054, "train_runtime": 161190.3643, "train_tokens_per_second": 98070.498 }, { "epoch": 0.6446740858505564, "grad_norm": 0.30669066309928894, "learning_rate": 1.4087728527139363e-05, "loss": 0.3866, "num_input_tokens_seen": 15811893021, "step": 4055, "train_runtime": 161231.3399, "train_tokens_per_second": 98069.6 }, { "epoch": 0.6448330683624801, "grad_norm": 0.2874656319618225, "learning_rate": 1.4076472453610717e-05, "loss": 0.3871, "num_input_tokens_seen": 15815759059, "step": 4056, "train_runtime": 161269.64, "train_tokens_per_second": 98070.282 }, { "epoch": 0.6449920508744038, "grad_norm": 0.19400858879089355, "learning_rate": 1.4065219116348521e-05, "loss": 0.3875, "num_input_tokens_seen": 15819572953, "step": 4057, "train_runtime": 161309.2878, "train_tokens_per_second": 98069.821 }, { "epoch": 0.6451510333863275, "grad_norm": 0.20436489582061768, "learning_rate": 1.405396851817167e-05, "loss": 0.3961, "num_input_tokens_seen": 15823524985, "step": 4058, "train_runtime": 161346.8822, "train_tokens_per_second": 98071.464 }, { "epoch": 0.6453100158982512, "grad_norm": 0.2158602476119995, "learning_rate": 1.4042720661898365e-05, "loss": 0.4025, "num_input_tokens_seen": 15827453516, "step": 4059, "train_runtime": 161389.849, "train_tokens_per_second": 98069.697 }, { "epoch": 0.6454689984101749, "grad_norm": 0.1929311603307724, "learning_rate": 1.4031475550346086e-05, "loss": 0.3882, "num_input_tokens_seen": 15831369160, "step": 4060, "train_runtime": 161428.2545, "train_tokens_per_second": 98070.621 }, { "epoch": 0.6456279809220986, "grad_norm": 0.21990720927715302, "learning_rate": 1.4020233186331694e-05, "loss": 0.4045, "num_input_tokens_seen": 15835226229, "step": 4061, "train_runtime": 161468.7706, "train_tokens_per_second": 98069.9 }, { "epoch": 0.6457869634340223, "grad_norm": 0.22704319655895233, "learning_rate": 1.4008993572671295e-05, "loss": 0.3916, "num_input_tokens_seen": 15839091801, "step": 4062, "train_runtime": 161509.7322, "train_tokens_per_second": 98068.962 }, { "epoch": 0.6459459459459459, "grad_norm": 0.31183484196662903, "learning_rate": 1.3997756712180338e-05, "loss": 0.3853, "num_input_tokens_seen": 15843103031, "step": 4063, "train_runtime": 161549.0197, "train_tokens_per_second": 98069.942 }, { "epoch": 0.6461049284578696, "grad_norm": 0.29378369450569153, "learning_rate": 1.3986522607673577e-05, "loss": 0.3929, "num_input_tokens_seen": 15846911851, "step": 4064, "train_runtime": 161587.5041, "train_tokens_per_second": 98070.157 }, { "epoch": 0.6462639109697933, "grad_norm": 0.4278663396835327, "learning_rate": 1.3975291261965078e-05, "loss": 0.4086, "num_input_tokens_seen": 15850698065, "step": 4065, "train_runtime": 161625.9476, "train_tokens_per_second": 98070.256 }, { "epoch": 0.646422893481717, "grad_norm": 1.1840436458587646, "learning_rate": 1.3964062677868225e-05, "loss": 0.398, "num_input_tokens_seen": 15854595528, "step": 4066, "train_runtime": 161662.7719, "train_tokens_per_second": 98072.026 }, { "epoch": 0.6465818759936407, "grad_norm": 0.4803662896156311, "learning_rate": 1.3952836858195677e-05, "loss": 0.4009, "num_input_tokens_seen": 15858583421, "step": 4067, "train_runtime": 161702.3165, "train_tokens_per_second": 98072.704 }, { "epoch": 0.6467408585055644, "grad_norm": 0.4876065254211426, "learning_rate": 1.3941613805759432e-05, "loss": 0.3844, "num_input_tokens_seen": 15862393898, "step": 4068, "train_runtime": 161742.3585, "train_tokens_per_second": 98071.983 }, { "epoch": 0.6468998410174881, "grad_norm": 0.3830960988998413, "learning_rate": 1.3930393523370788e-05, "loss": 0.3928, "num_input_tokens_seen": 15866366313, "step": 4069, "train_runtime": 161782.5205, "train_tokens_per_second": 98072.191 }, { "epoch": 0.6470588235294118, "grad_norm": 0.4116317629814148, "learning_rate": 1.3919176013840344e-05, "loss": 0.3855, "num_input_tokens_seen": 15870268772, "step": 4070, "train_runtime": 161820.2365, "train_tokens_per_second": 98073.449 }, { "epoch": 0.6472178060413355, "grad_norm": 0.37996506690979004, "learning_rate": 1.3907961279978004e-05, "loss": 0.4054, "num_input_tokens_seen": 15874267867, "step": 4071, "train_runtime": 161860.0062, "train_tokens_per_second": 98074.059 }, { "epoch": 0.6473767885532591, "grad_norm": 0.33325737714767456, "learning_rate": 1.3896749324592998e-05, "loss": 0.3937, "num_input_tokens_seen": 15878155805, "step": 4072, "train_runtime": 161897.957, "train_tokens_per_second": 98075.084 }, { "epoch": 0.6475357710651828, "grad_norm": 0.24728797376155853, "learning_rate": 1.3885540150493815e-05, "loss": 0.3843, "num_input_tokens_seen": 15882106206, "step": 4073, "train_runtime": 161939.644, "train_tokens_per_second": 98074.232 }, { "epoch": 0.6476947535771065, "grad_norm": 0.3593236804008484, "learning_rate": 1.3874333760488283e-05, "loss": 0.4074, "num_input_tokens_seen": 15885944198, "step": 4074, "train_runtime": 161980.2975, "train_tokens_per_second": 98073.312 }, { "epoch": 0.6478537360890302, "grad_norm": 0.20762266218662262, "learning_rate": 1.3863130157383541e-05, "loss": 0.4108, "num_input_tokens_seen": 15889826984, "step": 4075, "train_runtime": 162017.3909, "train_tokens_per_second": 98074.823 }, { "epoch": 0.6480127186009539, "grad_norm": 0.18703976273536682, "learning_rate": 1.3851929343985975e-05, "loss": 0.4041, "num_input_tokens_seen": 15893751361, "step": 4076, "train_runtime": 162056.2349, "train_tokens_per_second": 98075.531 }, { "epoch": 0.6481717011128776, "grad_norm": 0.21897411346435547, "learning_rate": 1.3840731323101352e-05, "loss": 0.401, "num_input_tokens_seen": 15897704070, "step": 4077, "train_runtime": 162095.5216, "train_tokens_per_second": 98076.146 }, { "epoch": 0.6483306836248013, "grad_norm": 0.2284163236618042, "learning_rate": 1.3829536097534657e-05, "loss": 0.3936, "num_input_tokens_seen": 15901653686, "step": 4078, "train_runtime": 162136.3973, "train_tokens_per_second": 98075.78 }, { "epoch": 0.648489666136725, "grad_norm": 0.2248498946428299, "learning_rate": 1.3818343670090255e-05, "loss": 0.3899, "num_input_tokens_seen": 15905489296, "step": 4079, "train_runtime": 162174.1426, "train_tokens_per_second": 98076.605 }, { "epoch": 0.6486486486486487, "grad_norm": 0.33597373962402344, "learning_rate": 1.380715404357174e-05, "loss": 0.3833, "num_input_tokens_seen": 15909375334, "step": 4080, "train_runtime": 162213.5289, "train_tokens_per_second": 98076.748 }, { "epoch": 0.6488076311605724, "grad_norm": 0.22015178203582764, "learning_rate": 1.3795967220782048e-05, "loss": 0.3864, "num_input_tokens_seen": 15913279189, "step": 4081, "train_runtime": 162253.2248, "train_tokens_per_second": 98076.813 }, { "epoch": 0.648966613672496, "grad_norm": 0.21649126708507538, "learning_rate": 1.3784783204523402e-05, "loss": 0.3927, "num_input_tokens_seen": 15917206160, "step": 4082, "train_runtime": 162292.9195, "train_tokens_per_second": 98077.022 }, { "epoch": 0.6491255961844197, "grad_norm": 0.26386192440986633, "learning_rate": 1.3773601997597291e-05, "loss": 0.3974, "num_input_tokens_seen": 15921172931, "step": 4083, "train_runtime": 162332.6441, "train_tokens_per_second": 98077.457 }, { "epoch": 0.6492845786963434, "grad_norm": 0.19994057714939117, "learning_rate": 1.3762423602804575e-05, "loss": 0.3928, "num_input_tokens_seen": 15924984868, "step": 4084, "train_runtime": 162369.3947, "train_tokens_per_second": 98078.735 }, { "epoch": 0.6494435612082671, "grad_norm": 0.24243099987506866, "learning_rate": 1.3751248022945324e-05, "loss": 0.3941, "num_input_tokens_seen": 15928934122, "step": 4085, "train_runtime": 162410.0344, "train_tokens_per_second": 98078.51 }, { "epoch": 0.6496025437201908, "grad_norm": 0.20243768393993378, "learning_rate": 1.3740075260818957e-05, "loss": 0.4056, "num_input_tokens_seen": 15932883881, "step": 4086, "train_runtime": 162449.5414, "train_tokens_per_second": 98078.971 }, { "epoch": 0.6497615262321145, "grad_norm": 0.17681783437728882, "learning_rate": 1.372890531922417e-05, "loss": 0.3882, "num_input_tokens_seen": 15936917422, "step": 4087, "train_runtime": 162486.3605, "train_tokens_per_second": 98081.57 }, { "epoch": 0.6499205087440382, "grad_norm": 0.3172273635864258, "learning_rate": 1.3717738200958958e-05, "loss": 0.3991, "num_input_tokens_seen": 15940738997, "step": 4088, "train_runtime": 162527.5144, "train_tokens_per_second": 98080.248 }, { "epoch": 0.6500794912559619, "grad_norm": 0.31245914101600647, "learning_rate": 1.3706573908820607e-05, "loss": 0.4019, "num_input_tokens_seen": 15944658888, "step": 4089, "train_runtime": 162568.1639, "train_tokens_per_second": 98079.836 }, { "epoch": 0.6502384737678856, "grad_norm": 0.19782754778862, "learning_rate": 1.3695412445605682e-05, "loss": 0.4004, "num_input_tokens_seen": 15948629452, "step": 4090, "train_runtime": 162607.598, "train_tokens_per_second": 98080.469 }, { "epoch": 0.6503974562798093, "grad_norm": 0.3388977348804474, "learning_rate": 1.3684253814110054e-05, "loss": 0.4003, "num_input_tokens_seen": 15952432975, "step": 4091, "train_runtime": 162645.2255, "train_tokens_per_second": 98081.164 }, { "epoch": 0.6505564387917329, "grad_norm": 0.20278753340244293, "learning_rate": 1.3673098017128887e-05, "loss": 0.396, "num_input_tokens_seen": 15956313367, "step": 4092, "train_runtime": 162687.0172, "train_tokens_per_second": 98079.82 }, { "epoch": 0.6507154213036566, "grad_norm": 0.2223024070262909, "learning_rate": 1.3661945057456637e-05, "loss": 0.4069, "num_input_tokens_seen": 15960211888, "step": 4093, "train_runtime": 162726.7703, "train_tokens_per_second": 98079.817 }, { "epoch": 0.6508744038155803, "grad_norm": 0.20504450798034668, "learning_rate": 1.3650794937887012e-05, "loss": 0.3958, "num_input_tokens_seen": 15964107911, "step": 4094, "train_runtime": 162765.1683, "train_tokens_per_second": 98080.616 }, { "epoch": 0.651033386327504, "grad_norm": 0.2004583328962326, "learning_rate": 1.3639647661213076e-05, "loss": 0.3941, "num_input_tokens_seen": 15968065556, "step": 4095, "train_runtime": 162804.0022, "train_tokens_per_second": 98081.53 }, { "epoch": 0.6511923688394277, "grad_norm": 0.2508423924446106, "learning_rate": 1.3628503230227113e-05, "loss": 0.3855, "num_input_tokens_seen": 15971890391, "step": 4096, "train_runtime": 162844.1027, "train_tokens_per_second": 98080.865 }, { "epoch": 0.6513513513513514, "grad_norm": 0.2126467227935791, "learning_rate": 1.361736164772074e-05, "loss": 0.3892, "num_input_tokens_seen": 15975873405, "step": 4097, "train_runtime": 162881.367, "train_tokens_per_second": 98082.879 }, { "epoch": 0.6515103338632751, "grad_norm": 0.19197046756744385, "learning_rate": 1.3606222916484848e-05, "loss": 0.3972, "num_input_tokens_seen": 15979692631, "step": 4098, "train_runtime": 162921.4742, "train_tokens_per_second": 98082.176 }, { "epoch": 0.6516693163751988, "grad_norm": 0.22258228063583374, "learning_rate": 1.3595087039309584e-05, "loss": 0.3871, "num_input_tokens_seen": 15983617025, "step": 4099, "train_runtime": 162959.5031, "train_tokens_per_second": 98083.369 }, { "epoch": 0.6518282988871225, "grad_norm": 0.19064944982528687, "learning_rate": 1.3583954018984446e-05, "loss": 0.3959, "num_input_tokens_seen": 15987468519, "step": 4100, "train_runtime": 162999.6499, "train_tokens_per_second": 98082.84 }, { "epoch": 0.6519872813990462, "grad_norm": 0.21739114820957184, "learning_rate": 1.3572823858298134e-05, "loss": 0.3965, "num_input_tokens_seen": 15991399410, "step": 4101, "train_runtime": 163036.3531, "train_tokens_per_second": 98084.869 }, { "epoch": 0.6521462639109697, "grad_norm": 0.19030708074569702, "learning_rate": 1.3561696560038717e-05, "loss": 0.399, "num_input_tokens_seen": 15995277301, "step": 4102, "train_runtime": 163076.5954, "train_tokens_per_second": 98084.445 }, { "epoch": 0.6523052464228934, "grad_norm": 0.1936064213514328, "learning_rate": 1.3550572126993477e-05, "loss": 0.379, "num_input_tokens_seen": 15999204545, "step": 4103, "train_runtime": 163116.2949, "train_tokens_per_second": 98084.649 }, { "epoch": 0.6524642289348171, "grad_norm": 0.20742031931877136, "learning_rate": 1.3539450561949013e-05, "loss": 0.3919, "num_input_tokens_seen": 16003043277, "step": 4104, "train_runtime": 163157.5987, "train_tokens_per_second": 98083.346 }, { "epoch": 0.6526232114467408, "grad_norm": 0.28440672159194946, "learning_rate": 1.3528331867691207e-05, "loss": 0.3935, "num_input_tokens_seen": 16006960814, "step": 4105, "train_runtime": 163196.1326, "train_tokens_per_second": 98084.192 }, { "epoch": 0.6527821939586645, "grad_norm": 0.22535090148448944, "learning_rate": 1.3517216047005187e-05, "loss": 0.4164, "num_input_tokens_seen": 16010826665, "step": 4106, "train_runtime": 163237.3085, "train_tokens_per_second": 98083.133 }, { "epoch": 0.6529411764705882, "grad_norm": 0.19258762896060944, "learning_rate": 1.3506103102675427e-05, "loss": 0.3982, "num_input_tokens_seen": 16014719461, "step": 4107, "train_runtime": 163276.2563, "train_tokens_per_second": 98083.578 }, { "epoch": 0.653100158982512, "grad_norm": 0.3038681149482727, "learning_rate": 1.3494993037485609e-05, "loss": 0.4058, "num_input_tokens_seen": 16018639364, "step": 4108, "train_runtime": 163317.2028, "train_tokens_per_second": 98082.989 }, { "epoch": 0.6532591414944356, "grad_norm": 0.1852940022945404, "learning_rate": 1.3483885854218736e-05, "loss": 0.3761, "num_input_tokens_seen": 16022401204, "step": 4109, "train_runtime": 163357.9416, "train_tokens_per_second": 98081.557 }, { "epoch": 0.6534181240063593, "grad_norm": 0.30418094992637634, "learning_rate": 1.347278155565708e-05, "loss": 0.4018, "num_input_tokens_seen": 16026410663, "step": 4110, "train_runtime": 163396.8909, "train_tokens_per_second": 98082.715 }, { "epoch": 0.6535771065182829, "grad_norm": 0.21603111922740936, "learning_rate": 1.3461680144582189e-05, "loss": 0.4024, "num_input_tokens_seen": 16030269537, "step": 4111, "train_runtime": 163436.15, "train_tokens_per_second": 98082.765 }, { "epoch": 0.6537360890302066, "grad_norm": 0.20497742295265198, "learning_rate": 1.3450581623774897e-05, "loss": 0.382, "num_input_tokens_seen": 16034136054, "step": 4112, "train_runtime": 163476.8214, "train_tokens_per_second": 98082.015 }, { "epoch": 0.6538950715421303, "grad_norm": 0.19928771257400513, "learning_rate": 1.3439485996015285e-05, "loss": 0.415, "num_input_tokens_seen": 16037986536, "step": 4113, "train_runtime": 163513.8198, "train_tokens_per_second": 98083.37 }, { "epoch": 0.654054054054054, "grad_norm": 0.28441962599754333, "learning_rate": 1.3428393264082744e-05, "loss": 0.4039, "num_input_tokens_seen": 16041855866, "step": 4114, "train_runtime": 163553.7801, "train_tokens_per_second": 98083.064 }, { "epoch": 0.6542130365659777, "grad_norm": 0.20491886138916016, "learning_rate": 1.3417303430755918e-05, "loss": 0.3975, "num_input_tokens_seen": 16045695177, "step": 4115, "train_runtime": 163592.6056, "train_tokens_per_second": 98083.255 }, { "epoch": 0.6543720190779014, "grad_norm": 0.2697145342826843, "learning_rate": 1.3406216498812745e-05, "loss": 0.3939, "num_input_tokens_seen": 16049674261, "step": 4116, "train_runtime": 163632.2315, "train_tokens_per_second": 98083.82 }, { "epoch": 0.6545310015898251, "grad_norm": 0.4433947503566742, "learning_rate": 1.3395132471030398e-05, "loss": 0.3852, "num_input_tokens_seen": 16053584905, "step": 4117, "train_runtime": 163672.2748, "train_tokens_per_second": 98083.716 }, { "epoch": 0.6546899841017488, "grad_norm": 0.3787783980369568, "learning_rate": 1.3384051350185378e-05, "loss": 0.3968, "num_input_tokens_seen": 16057404324, "step": 4118, "train_runtime": 163712.7926, "train_tokens_per_second": 98082.771 }, { "epoch": 0.6548489666136725, "grad_norm": 0.20652854442596436, "learning_rate": 1.3372973139053404e-05, "loss": 0.393, "num_input_tokens_seen": 16061401817, "step": 4119, "train_runtime": 163751.8478, "train_tokens_per_second": 98083.79 }, { "epoch": 0.6550079491255962, "grad_norm": 0.2146434485912323, "learning_rate": 1.3361897840409499e-05, "loss": 0.3904, "num_input_tokens_seen": 16065333977, "step": 4120, "train_runtime": 163790.7338, "train_tokens_per_second": 98084.511 }, { "epoch": 0.6551669316375198, "grad_norm": 0.48499539494514465, "learning_rate": 1.3350825457027944e-05, "loss": 0.3932, "num_input_tokens_seen": 16069188053, "step": 4121, "train_runtime": 163831.1521, "train_tokens_per_second": 98083.837 }, { "epoch": 0.6553259141494435, "grad_norm": 0.48627033829689026, "learning_rate": 1.3339755991682296e-05, "loss": 0.3989, "num_input_tokens_seen": 16073061026, "step": 4122, "train_runtime": 163870.3629, "train_tokens_per_second": 98084.002 }, { "epoch": 0.6554848966613672, "grad_norm": 0.4298131465911865, "learning_rate": 1.3328689447145388e-05, "loss": 0.3923, "num_input_tokens_seen": 16077090075, "step": 4123, "train_runtime": 163910.7408, "train_tokens_per_second": 98084.421 }, { "epoch": 0.6556438791732909, "grad_norm": 0.3394937515258789, "learning_rate": 1.3317625826189278e-05, "loss": 0.3911, "num_input_tokens_seen": 16081009669, "step": 4124, "train_runtime": 163949.1954, "train_tokens_per_second": 98085.322 }, { "epoch": 0.6558028616852146, "grad_norm": 0.218486949801445, "learning_rate": 1.3306565131585369e-05, "loss": 0.405, "num_input_tokens_seen": 16084942563, "step": 4125, "train_runtime": 163988.9636, "train_tokens_per_second": 98085.519 }, { "epoch": 0.6559618441971383, "grad_norm": 0.36314305663108826, "learning_rate": 1.3295507366104251e-05, "loss": 0.3901, "num_input_tokens_seen": 16088705209, "step": 4126, "train_runtime": 164028.8848, "train_tokens_per_second": 98084.586 }, { "epoch": 0.656120826709062, "grad_norm": 0.18200957775115967, "learning_rate": 1.3284452532515831e-05, "loss": 0.4062, "num_input_tokens_seen": 16092681268, "step": 4127, "train_runtime": 164066.9931, "train_tokens_per_second": 98086.038 }, { "epoch": 0.6562798092209857, "grad_norm": 0.2374577522277832, "learning_rate": 1.3273400633589266e-05, "loss": 0.3775, "num_input_tokens_seen": 16096540234, "step": 4128, "train_runtime": 164104.9776, "train_tokens_per_second": 98086.849 }, { "epoch": 0.6564387917329094, "grad_norm": 0.2045517861843109, "learning_rate": 1.3262351672092979e-05, "loss": 0.3949, "num_input_tokens_seen": 16100425826, "step": 4129, "train_runtime": 164144.2165, "train_tokens_per_second": 98087.073 }, { "epoch": 0.6565977742448331, "grad_norm": 0.20493993163108826, "learning_rate": 1.3251305650794665e-05, "loss": 0.4031, "num_input_tokens_seen": 16104454175, "step": 4130, "train_runtime": 164182.8064, "train_tokens_per_second": 98088.555 }, { "epoch": 0.6567567567567567, "grad_norm": 0.21511298418045044, "learning_rate": 1.3240262572461255e-05, "loss": 0.4014, "num_input_tokens_seen": 16108396602, "step": 4131, "train_runtime": 164222.5717, "train_tokens_per_second": 98088.81 }, { "epoch": 0.6569157392686804, "grad_norm": 0.6957588791847229, "learning_rate": 1.322922243985897e-05, "loss": 0.3939, "num_input_tokens_seen": 16112116261, "step": 4132, "train_runtime": 164261.3155, "train_tokens_per_second": 98088.319 }, { "epoch": 0.6570747217806041, "grad_norm": 0.24456006288528442, "learning_rate": 1.321818525575329e-05, "loss": 0.3969, "num_input_tokens_seen": 16116009301, "step": 4133, "train_runtime": 164300.2494, "train_tokens_per_second": 98088.77 }, { "epoch": 0.6572337042925278, "grad_norm": 0.3715684711933136, "learning_rate": 1.3207151022908959e-05, "loss": 0.383, "num_input_tokens_seen": 16119839977, "step": 4134, "train_runtime": 164338.4973, "train_tokens_per_second": 98089.25 }, { "epoch": 0.6573926868044515, "grad_norm": 0.24508780241012573, "learning_rate": 1.3196119744089944e-05, "loss": 0.3944, "num_input_tokens_seen": 16123833100, "step": 4135, "train_runtime": 164378.6565, "train_tokens_per_second": 98089.578 }, { "epoch": 0.6575516693163752, "grad_norm": 0.24223756790161133, "learning_rate": 1.3185091422059542e-05, "loss": 0.3952, "num_input_tokens_seen": 16127748394, "step": 4136, "train_runtime": 164417.5584, "train_tokens_per_second": 98090.183 }, { "epoch": 0.6577106518282989, "grad_norm": 0.23412708938121796, "learning_rate": 1.3174066059580242e-05, "loss": 0.4042, "num_input_tokens_seen": 16131620091, "step": 4137, "train_runtime": 164455.6107, "train_tokens_per_second": 98091.029 }, { "epoch": 0.6578696343402226, "grad_norm": 0.2964262366294861, "learning_rate": 1.3163043659413827e-05, "loss": 0.3891, "num_input_tokens_seen": 16135480884, "step": 4138, "train_runtime": 164494.2269, "train_tokens_per_second": 98091.472 }, { "epoch": 0.6580286168521463, "grad_norm": 0.36485055088996887, "learning_rate": 1.315202422432134e-05, "loss": 0.3876, "num_input_tokens_seen": 16139409238, "step": 4139, "train_runtime": 164534.4621, "train_tokens_per_second": 98091.36 }, { "epoch": 0.6581875993640699, "grad_norm": 0.25643932819366455, "learning_rate": 1.3141007757063045e-05, "loss": 0.3863, "num_input_tokens_seen": 16143233285, "step": 4140, "train_runtime": 164574.6425, "train_tokens_per_second": 98090.648 }, { "epoch": 0.6583465818759936, "grad_norm": 0.2062484622001648, "learning_rate": 1.3129994260398526e-05, "loss": 0.3907, "num_input_tokens_seen": 16147132681, "step": 4141, "train_runtime": 164613.5718, "train_tokens_per_second": 98091.139 }, { "epoch": 0.6585055643879173, "grad_norm": 0.4177265167236328, "learning_rate": 1.3118983737086554e-05, "loss": 0.41, "num_input_tokens_seen": 16151036142, "step": 4142, "train_runtime": 164653.9364, "train_tokens_per_second": 98090.799 }, { "epoch": 0.658664546899841, "grad_norm": 0.22067371010780334, "learning_rate": 1.3107976189885205e-05, "loss": 0.399, "num_input_tokens_seen": 16154895440, "step": 4143, "train_runtime": 164692.7324, "train_tokens_per_second": 98091.125 }, { "epoch": 0.6588235294117647, "grad_norm": 0.2972305417060852, "learning_rate": 1.3096971621551785e-05, "loss": 0.3954, "num_input_tokens_seen": 16158865192, "step": 4144, "train_runtime": 164731.174, "train_tokens_per_second": 98092.333 }, { "epoch": 0.6589825119236884, "grad_norm": 0.24240908026695251, "learning_rate": 1.3085970034842866e-05, "loss": 0.3893, "num_input_tokens_seen": 16162684842, "step": 4145, "train_runtime": 164770.5849, "train_tokens_per_second": 98092.052 }, { "epoch": 0.6591414944356121, "grad_norm": 0.23460733890533447, "learning_rate": 1.3074971432514271e-05, "loss": 0.388, "num_input_tokens_seen": 16166735320, "step": 4146, "train_runtime": 164809.453, "train_tokens_per_second": 98093.495 }, { "epoch": 0.6593004769475358, "grad_norm": 0.23867124319076538, "learning_rate": 1.3063975817321045e-05, "loss": 0.3977, "num_input_tokens_seen": 16170538704, "step": 4147, "train_runtime": 164850.0072, "train_tokens_per_second": 98092.436 }, { "epoch": 0.6594594594594595, "grad_norm": 0.23848675191402435, "learning_rate": 1.3052983192017553e-05, "loss": 0.3975, "num_input_tokens_seen": 16174497811, "step": 4148, "train_runtime": 164886.7759, "train_tokens_per_second": 98094.573 }, { "epoch": 0.6596184419713832, "grad_norm": 0.2468993365764618, "learning_rate": 1.3041993559357344e-05, "loss": 0.4046, "num_input_tokens_seen": 16178387961, "step": 4149, "train_runtime": 164926.2493, "train_tokens_per_second": 98094.682 }, { "epoch": 0.6597774244833068, "grad_norm": 0.6622341871261597, "learning_rate": 1.3031006922093245e-05, "loss": 0.3945, "num_input_tokens_seen": 16182275534, "step": 4150, "train_runtime": 164966.1094, "train_tokens_per_second": 98094.546 }, { "epoch": 0.6599364069952305, "grad_norm": 0.2743118703365326, "learning_rate": 1.3020023282977333e-05, "loss": 0.3851, "num_input_tokens_seen": 16186069999, "step": 4151, "train_runtime": 165004.7053, "train_tokens_per_second": 98094.597 }, { "epoch": 0.6600953895071542, "grad_norm": 0.23688910901546478, "learning_rate": 1.300904264476095e-05, "loss": 0.3912, "num_input_tokens_seen": 16189993591, "step": 4152, "train_runtime": 165049.6282, "train_tokens_per_second": 98091.67 }, { "epoch": 0.6602543720190779, "grad_norm": 0.7732822895050049, "learning_rate": 1.2998065010194637e-05, "loss": 0.4055, "num_input_tokens_seen": 16193969627, "step": 4153, "train_runtime": 165088.2791, "train_tokens_per_second": 98092.788 }, { "epoch": 0.6604133545310016, "grad_norm": 0.1930806189775467, "learning_rate": 1.298709038202823e-05, "loss": 0.3972, "num_input_tokens_seen": 16197793933, "step": 4154, "train_runtime": 165130.0502, "train_tokens_per_second": 98091.134 }, { "epoch": 0.6605723370429253, "grad_norm": 0.21587519347667694, "learning_rate": 1.2976118763010798e-05, "loss": 0.3967, "num_input_tokens_seen": 16201801527, "step": 4155, "train_runtime": 165170.9596, "train_tokens_per_second": 98091.102 }, { "epoch": 0.660731319554849, "grad_norm": 0.18302667140960693, "learning_rate": 1.2965150155890648e-05, "loss": 0.3947, "num_input_tokens_seen": 16205739112, "step": 4156, "train_runtime": 165214.3017, "train_tokens_per_second": 98089.203 }, { "epoch": 0.6608903020667727, "grad_norm": 0.19895438849925995, "learning_rate": 1.295418456341535e-05, "loss": 0.3948, "num_input_tokens_seen": 16209514461, "step": 4157, "train_runtime": 165252.8213, "train_tokens_per_second": 98089.184 }, { "epoch": 0.6610492845786964, "grad_norm": 1.1652461290359497, "learning_rate": 1.2943221988331683e-05, "loss": 0.395, "num_input_tokens_seen": 16213428666, "step": 4158, "train_runtime": 165292.776, "train_tokens_per_second": 98089.155 }, { "epoch": 0.6612082670906201, "grad_norm": 0.26521652936935425, "learning_rate": 1.2932262433385728e-05, "loss": 0.3918, "num_input_tokens_seen": 16217351683, "step": 4159, "train_runtime": 165331.0067, "train_tokens_per_second": 98090.201 }, { "epoch": 0.6613672496025437, "grad_norm": 0.21533019840717316, "learning_rate": 1.2921305901322748e-05, "loss": 0.3949, "num_input_tokens_seen": 16221145420, "step": 4160, "train_runtime": 165373.3038, "train_tokens_per_second": 98088.053 }, { "epoch": 0.6615262321144674, "grad_norm": 0.3012082278728485, "learning_rate": 1.2910352394887286e-05, "loss": 0.4056, "num_input_tokens_seen": 16225009957, "step": 4161, "train_runtime": 165412.8148, "train_tokens_per_second": 98087.987 }, { "epoch": 0.6616852146263911, "grad_norm": 0.24391485750675201, "learning_rate": 1.2899401916823134e-05, "loss": 0.3871, "num_input_tokens_seen": 16228936559, "step": 4162, "train_runtime": 165452.5937, "train_tokens_per_second": 98088.136 }, { "epoch": 0.6618441971383148, "grad_norm": 0.19794698059558868, "learning_rate": 1.2888454469873273e-05, "loss": 0.4111, "num_input_tokens_seen": 16232965007, "step": 4163, "train_runtime": 165493.3888, "train_tokens_per_second": 98088.299 }, { "epoch": 0.6620031796502385, "grad_norm": 0.19611220061779022, "learning_rate": 1.287751005678e-05, "loss": 0.3875, "num_input_tokens_seen": 16236897286, "step": 4164, "train_runtime": 165530.0211, "train_tokens_per_second": 98090.347 }, { "epoch": 0.6621621621621622, "grad_norm": 0.21097725629806519, "learning_rate": 1.286656868028478e-05, "loss": 0.4071, "num_input_tokens_seen": 16240787324, "step": 4165, "train_runtime": 165572.9127, "train_tokens_per_second": 98088.432 }, { "epoch": 0.6623211446740859, "grad_norm": 0.23640774190425873, "learning_rate": 1.2855630343128383e-05, "loss": 0.4044, "num_input_tokens_seen": 16244660106, "step": 4166, "train_runtime": 165612.8318, "train_tokens_per_second": 98088.173 }, { "epoch": 0.6624801271860096, "grad_norm": 0.2139759063720703, "learning_rate": 1.2844695048050762e-05, "loss": 0.3929, "num_input_tokens_seen": 16248584744, "step": 4167, "train_runtime": 165651.34, "train_tokens_per_second": 98089.063 }, { "epoch": 0.6626391096979333, "grad_norm": 0.23869280517101288, "learning_rate": 1.2833762797791136e-05, "loss": 0.395, "num_input_tokens_seen": 16252454441, "step": 4168, "train_runtime": 165690.1346, "train_tokens_per_second": 98089.452 }, { "epoch": 0.662798092209857, "grad_norm": 0.22546271979808807, "learning_rate": 1.2822833595087968e-05, "loss": 0.3809, "num_input_tokens_seen": 16256296257, "step": 4169, "train_runtime": 165728.4173, "train_tokens_per_second": 98089.975 }, { "epoch": 0.6629570747217806, "grad_norm": 0.1966240406036377, "learning_rate": 1.2811907442678928e-05, "loss": 0.3912, "num_input_tokens_seen": 16260230688, "step": 4170, "train_runtime": 165767.2625, "train_tokens_per_second": 98090.723 }, { "epoch": 0.6631160572337043, "grad_norm": 0.37463608384132385, "learning_rate": 1.2800984343300945e-05, "loss": 0.3916, "num_input_tokens_seen": 16264106560, "step": 4171, "train_runtime": 165804.2623, "train_tokens_per_second": 98092.21 }, { "epoch": 0.663275039745628, "grad_norm": 0.35125061869621277, "learning_rate": 1.2790064299690185e-05, "loss": 0.3923, "num_input_tokens_seen": 16267938010, "step": 4172, "train_runtime": 165846.0632, "train_tokens_per_second": 98090.589 }, { "epoch": 0.6634340222575517, "grad_norm": 0.29575422406196594, "learning_rate": 1.2779147314582036e-05, "loss": 0.391, "num_input_tokens_seen": 16271804543, "step": 4173, "train_runtime": 165886.8394, "train_tokens_per_second": 98089.786 }, { "epoch": 0.6635930047694754, "grad_norm": 0.22950834035873413, "learning_rate": 1.2768233390711126e-05, "loss": 0.3856, "num_input_tokens_seen": 16275814454, "step": 4174, "train_runtime": 165926.8871, "train_tokens_per_second": 98090.278 }, { "epoch": 0.6637519872813991, "grad_norm": 0.29764261841773987, "learning_rate": 1.2757322530811334e-05, "loss": 0.393, "num_input_tokens_seen": 16279716861, "step": 4175, "train_runtime": 165970.5661, "train_tokens_per_second": 98087.976 }, { "epoch": 0.6639109697933228, "grad_norm": 0.21899698674678802, "learning_rate": 1.274641473761572e-05, "loss": 0.4045, "num_input_tokens_seen": 16283712011, "step": 4176, "train_runtime": 166008.6048, "train_tokens_per_second": 98089.566 }, { "epoch": 0.6640699523052465, "grad_norm": 0.19685707986354828, "learning_rate": 1.273551001385663e-05, "loss": 0.3909, "num_input_tokens_seen": 16287622770, "step": 4177, "train_runtime": 166048.3269, "train_tokens_per_second": 98089.653 }, { "epoch": 0.6642289348171702, "grad_norm": 0.20313797891139984, "learning_rate": 1.2724608362265616e-05, "loss": 0.4004, "num_input_tokens_seen": 16291517254, "step": 4178, "train_runtime": 166085.3034, "train_tokens_per_second": 98091.263 }, { "epoch": 0.6643879173290937, "grad_norm": 0.2047979086637497, "learning_rate": 1.2713709785573463e-05, "loss": 0.3999, "num_input_tokens_seen": 16295295102, "step": 4179, "train_runtime": 166125.0551, "train_tokens_per_second": 98090.532 }, { "epoch": 0.6645468998410174, "grad_norm": 0.2046276181936264, "learning_rate": 1.27028142865102e-05, "loss": 0.3819, "num_input_tokens_seen": 16299171114, "step": 4180, "train_runtime": 166165.2107, "train_tokens_per_second": 98090.154 }, { "epoch": 0.6647058823529411, "grad_norm": 0.19095970690250397, "learning_rate": 1.2691921867805045e-05, "loss": 0.3934, "num_input_tokens_seen": 16303143809, "step": 4181, "train_runtime": 166205.4463, "train_tokens_per_second": 98090.31 }, { "epoch": 0.6648648648648648, "grad_norm": 0.20106874406337738, "learning_rate": 1.2681032532186505e-05, "loss": 0.387, "num_input_tokens_seen": 16307045739, "step": 4182, "train_runtime": 166242.635, "train_tokens_per_second": 98091.839 }, { "epoch": 0.6650238473767885, "grad_norm": 0.2251329869031906, "learning_rate": 1.2670146282382258e-05, "loss": 0.394, "num_input_tokens_seen": 16310939703, "step": 4183, "train_runtime": 166282.9476, "train_tokens_per_second": 98091.476 }, { "epoch": 0.6651828298887122, "grad_norm": 0.18606187403202057, "learning_rate": 1.2659263121119242e-05, "loss": 0.4009, "num_input_tokens_seen": 16314802101, "step": 4184, "train_runtime": 166322.9088, "train_tokens_per_second": 98091.13 }, { "epoch": 0.665341812400636, "grad_norm": 0.18486174941062927, "learning_rate": 1.2648383051123613e-05, "loss": 0.3867, "num_input_tokens_seen": 16318724041, "step": 4185, "train_runtime": 166361.6283, "train_tokens_per_second": 98091.875 }, { "epoch": 0.6655007949125596, "grad_norm": 0.2733969986438751, "learning_rate": 1.2637506075120737e-05, "loss": 0.3929, "num_input_tokens_seen": 16322570709, "step": 4186, "train_runtime": 166401.848, "train_tokens_per_second": 98091.283 }, { "epoch": 0.6656597774244833, "grad_norm": 0.24393270909786224, "learning_rate": 1.2626632195835247e-05, "loss": 0.3905, "num_input_tokens_seen": 16326467618, "step": 4187, "train_runtime": 166439.8149, "train_tokens_per_second": 98092.32 }, { "epoch": 0.665818759936407, "grad_norm": 0.19098113477230072, "learning_rate": 1.261576141599094e-05, "loss": 0.4005, "num_input_tokens_seen": 16330405903, "step": 4188, "train_runtime": 166477.5645, "train_tokens_per_second": 98093.734 }, { "epoch": 0.6659777424483306, "grad_norm": 0.205941841006279, "learning_rate": 1.2604893738310903e-05, "loss": 0.4034, "num_input_tokens_seen": 16334351901, "step": 4189, "train_runtime": 166518.3738, "train_tokens_per_second": 98093.391 }, { "epoch": 0.6661367249602543, "grad_norm": 0.22961106896400452, "learning_rate": 1.259402916551739e-05, "loss": 0.4028, "num_input_tokens_seen": 16338233360, "step": 4190, "train_runtime": 166557.45, "train_tokens_per_second": 98093.681 }, { "epoch": 0.666295707472178, "grad_norm": 0.19693394005298615, "learning_rate": 1.2583167700331904e-05, "loss": 0.3841, "num_input_tokens_seen": 16342236900, "step": 4191, "train_runtime": 166594.2694, "train_tokens_per_second": 98096.033 }, { "epoch": 0.6664546899841017, "grad_norm": 0.23289547860622406, "learning_rate": 1.2572309345475176e-05, "loss": 0.3876, "num_input_tokens_seen": 16346031429, "step": 4192, "train_runtime": 166632.5374, "train_tokens_per_second": 98096.276 }, { "epoch": 0.6666136724960254, "grad_norm": 0.18681208789348602, "learning_rate": 1.256145410366713e-05, "loss": 0.4014, "num_input_tokens_seen": 16349950548, "step": 4193, "train_runtime": 166672.2731, "train_tokens_per_second": 98096.403 }, { "epoch": 0.6667726550079491, "grad_norm": 0.22513127326965332, "learning_rate": 1.2550601977626938e-05, "loss": 0.4011, "num_input_tokens_seen": 16353814883, "step": 4194, "train_runtime": 166710.9218, "train_tokens_per_second": 98096.842 }, { "epoch": 0.6669316375198728, "grad_norm": 0.19047383964061737, "learning_rate": 1.2539752970072977e-05, "loss": 0.3787, "num_input_tokens_seen": 16357721833, "step": 4195, "train_runtime": 166750.5826, "train_tokens_per_second": 98096.94 }, { "epoch": 0.6670906200317965, "grad_norm": 0.191853329539299, "learning_rate": 1.252890708372285e-05, "loss": 0.378, "num_input_tokens_seen": 16361571876, "step": 4196, "train_runtime": 166789.7179, "train_tokens_per_second": 98097.006 }, { "epoch": 0.6672496025437202, "grad_norm": 0.211904838681221, "learning_rate": 1.251806432129337e-05, "loss": 0.3939, "num_input_tokens_seen": 16365486950, "step": 4197, "train_runtime": 166829.5738, "train_tokens_per_second": 98097.037 }, { "epoch": 0.6674085850556439, "grad_norm": 0.1913607120513916, "learning_rate": 1.250722468550059e-05, "loss": 0.3887, "num_input_tokens_seen": 16369438335, "step": 4198, "train_runtime": 166868.7685, "train_tokens_per_second": 98097.676 }, { "epoch": 0.6675675675675675, "grad_norm": 0.18403662741184235, "learning_rate": 1.2496388179059734e-05, "loss": 0.3853, "num_input_tokens_seen": 16373286143, "step": 4199, "train_runtime": 166906.3794, "train_tokens_per_second": 98098.624 }, { "epoch": 0.6677265500794912, "grad_norm": 0.18742558360099792, "learning_rate": 1.2485554804685286e-05, "loss": 0.3896, "num_input_tokens_seen": 16377035813, "step": 4200, "train_runtime": 166944.6652, "train_tokens_per_second": 98098.587 }, { "epoch": 0.6678855325914149, "grad_norm": 0.2029261291027069, "learning_rate": 1.2474724565090928e-05, "loss": 0.3912, "num_input_tokens_seen": 16380992806, "step": 4201, "train_runtime": 167096.145, "train_tokens_per_second": 98033.338 }, { "epoch": 0.6680445151033386, "grad_norm": 0.1976757049560547, "learning_rate": 1.246389746298956e-05, "loss": 0.386, "num_input_tokens_seen": 16384703868, "step": 4202, "train_runtime": 167134.5346, "train_tokens_per_second": 98033.024 }, { "epoch": 0.6682034976152623, "grad_norm": 0.19029462337493896, "learning_rate": 1.2453073501093298e-05, "loss": 0.4007, "num_input_tokens_seen": 16388548174, "step": 4203, "train_runtime": 167174.1093, "train_tokens_per_second": 98032.813 }, { "epoch": 0.668362480127186, "grad_norm": 0.22275082767009735, "learning_rate": 1.2442252682113449e-05, "loss": 0.3908, "num_input_tokens_seen": 16392494047, "step": 4204, "train_runtime": 167214.2961, "train_tokens_per_second": 98032.85 }, { "epoch": 0.6685214626391097, "grad_norm": 0.3749208450317383, "learning_rate": 1.243143500876058e-05, "loss": 0.3906, "num_input_tokens_seen": 16396470482, "step": 4205, "train_runtime": 167254.8823, "train_tokens_per_second": 98032.836 }, { "epoch": 0.6686804451510334, "grad_norm": 0.1822504699230194, "learning_rate": 1.2420620483744422e-05, "loss": 0.3926, "num_input_tokens_seen": 16400357392, "step": 4206, "train_runtime": 167295.594, "train_tokens_per_second": 98032.214 }, { "epoch": 0.6688394276629571, "grad_norm": 0.22038596868515015, "learning_rate": 1.2409809109773943e-05, "loss": 0.3906, "num_input_tokens_seen": 16404175604, "step": 4207, "train_runtime": 167335.2174, "train_tokens_per_second": 98031.818 }, { "epoch": 0.6689984101748807, "grad_norm": 0.2295084297657013, "learning_rate": 1.2399000889557321e-05, "loss": 0.4013, "num_input_tokens_seen": 16408098015, "step": 4208, "train_runtime": 167374.5694, "train_tokens_per_second": 98032.205 }, { "epoch": 0.6691573926868044, "grad_norm": 0.39337074756622314, "learning_rate": 1.2388195825801915e-05, "loss": 0.3722, "num_input_tokens_seen": 16412004876, "step": 4209, "train_runtime": 167413.0077, "train_tokens_per_second": 98033.033 }, { "epoch": 0.6693163751987281, "grad_norm": 0.21371181309223175, "learning_rate": 1.2377393921214357e-05, "loss": 0.3798, "num_input_tokens_seen": 16415866851, "step": 4210, "train_runtime": 167451.6484, "train_tokens_per_second": 98033.474 }, { "epoch": 0.6694753577106518, "grad_norm": 0.19693392515182495, "learning_rate": 1.2366595178500412e-05, "loss": 0.3918, "num_input_tokens_seen": 16419656037, "step": 4211, "train_runtime": 167491.04, "train_tokens_per_second": 98033.041 }, { "epoch": 0.6696343402225755, "grad_norm": 0.24125784635543823, "learning_rate": 1.2355799600365107e-05, "loss": 0.4054, "num_input_tokens_seen": 16423667447, "step": 4212, "train_runtime": 167531.4108, "train_tokens_per_second": 98033.362 }, { "epoch": 0.6697933227344992, "grad_norm": 0.21274901926517487, "learning_rate": 1.234500718951265e-05, "loss": 0.4076, "num_input_tokens_seen": 16427536867, "step": 4213, "train_runtime": 167570.958, "train_tokens_per_second": 98033.317 }, { "epoch": 0.6699523052464229, "grad_norm": 0.18930160999298096, "learning_rate": 1.233421794864647e-05, "loss": 0.3844, "num_input_tokens_seen": 16431432417, "step": 4214, "train_runtime": 167610.8753, "train_tokens_per_second": 98033.212 }, { "epoch": 0.6701112877583466, "grad_norm": 0.4076865315437317, "learning_rate": 1.2323431880469203e-05, "loss": 0.3879, "num_input_tokens_seen": 16435346772, "step": 4215, "train_runtime": 167651.1608, "train_tokens_per_second": 98033.003 }, { "epoch": 0.6702702702702703, "grad_norm": 0.28071603178977966, "learning_rate": 1.2312648987682662e-05, "loss": 0.3878, "num_input_tokens_seen": 16439291493, "step": 4216, "train_runtime": 167688.9399, "train_tokens_per_second": 98034.441 }, { "epoch": 0.670429252782194, "grad_norm": 0.23731601238250732, "learning_rate": 1.2301869272987896e-05, "loss": 0.3873, "num_input_tokens_seen": 16443143765, "step": 4217, "train_runtime": 167727.7794, "train_tokens_per_second": 98034.707 }, { "epoch": 0.6705882352941176, "grad_norm": 0.49880319833755493, "learning_rate": 1.2291092739085147e-05, "loss": 0.3896, "num_input_tokens_seen": 16447130003, "step": 4218, "train_runtime": 167768.9102, "train_tokens_per_second": 98034.433 }, { "epoch": 0.6707472178060413, "grad_norm": 0.22007162868976593, "learning_rate": 1.2280319388673863e-05, "loss": 0.3974, "num_input_tokens_seen": 16450989242, "step": 4219, "train_runtime": 167804.5613, "train_tokens_per_second": 98036.604 }, { "epoch": 0.670906200317965, "grad_norm": 0.2369753122329712, "learning_rate": 1.226954922445269e-05, "loss": 0.3933, "num_input_tokens_seen": 16454890576, "step": 4220, "train_runtime": 167845.6634, "train_tokens_per_second": 98035.84 }, { "epoch": 0.6710651828298887, "grad_norm": 0.1852794736623764, "learning_rate": 1.2258782249119489e-05, "loss": 0.3867, "num_input_tokens_seen": 16458624074, "step": 4221, "train_runtime": 167884.4731, "train_tokens_per_second": 98035.415 }, { "epoch": 0.6712241653418124, "grad_norm": 0.21329547464847565, "learning_rate": 1.2248018465371288e-05, "loss": 0.3987, "num_input_tokens_seen": 16462604319, "step": 4222, "train_runtime": 167924.774, "train_tokens_per_second": 98035.59 }, { "epoch": 0.6713831478537361, "grad_norm": 0.25179100036621094, "learning_rate": 1.2237257875904355e-05, "loss": 0.3837, "num_input_tokens_seen": 16466537011, "step": 4223, "train_runtime": 167964.7112, "train_tokens_per_second": 98035.694 }, { "epoch": 0.6715421303656598, "grad_norm": 0.3332647383213043, "learning_rate": 1.2226500483414136e-05, "loss": 0.3944, "num_input_tokens_seen": 16470328586, "step": 4224, "train_runtime": 168003.6696, "train_tokens_per_second": 98035.529 }, { "epoch": 0.6717011128775835, "grad_norm": 0.2662065625190735, "learning_rate": 1.2215746290595281e-05, "loss": 0.3944, "num_input_tokens_seen": 16474282744, "step": 4225, "train_runtime": 168042.0765, "train_tokens_per_second": 98036.653 }, { "epoch": 0.6718600953895072, "grad_norm": 0.21281953155994415, "learning_rate": 1.2204995300141655e-05, "loss": 0.3904, "num_input_tokens_seen": 16478156597, "step": 4226, "train_runtime": 168078.7863, "train_tokens_per_second": 98038.289 }, { "epoch": 0.6720190779014309, "grad_norm": 0.33450645208358765, "learning_rate": 1.2194247514746274e-05, "loss": 0.3796, "num_input_tokens_seen": 16482009801, "step": 4227, "train_runtime": 168119.4275, "train_tokens_per_second": 98037.508 }, { "epoch": 0.6721780604133545, "grad_norm": 0.2652530074119568, "learning_rate": 1.2183502937101418e-05, "loss": 0.4003, "num_input_tokens_seen": 16486083968, "step": 4228, "train_runtime": 168159.031, "train_tokens_per_second": 98038.648 }, { "epoch": 0.6723370429252782, "grad_norm": 0.1869659274816513, "learning_rate": 1.2172761569898503e-05, "loss": 0.3911, "num_input_tokens_seen": 16489963088, "step": 4229, "train_runtime": 168198.7596, "train_tokens_per_second": 98038.553 }, { "epoch": 0.6724960254372019, "grad_norm": 0.19014252722263336, "learning_rate": 1.2162023415828172e-05, "loss": 0.398, "num_input_tokens_seen": 16493868746, "step": 4230, "train_runtime": 168238.2897, "train_tokens_per_second": 98038.733 }, { "epoch": 0.6726550079491256, "grad_norm": 0.2209918349981308, "learning_rate": 1.2151288477580256e-05, "loss": 0.3991, "num_input_tokens_seen": 16497759980, "step": 4231, "train_runtime": 168277.6427, "train_tokens_per_second": 98038.93 }, { "epoch": 0.6728139904610493, "grad_norm": 0.1834999918937683, "learning_rate": 1.2140556757843785e-05, "loss": 0.3866, "num_input_tokens_seen": 16501713375, "step": 4232, "train_runtime": 168315.4778, "train_tokens_per_second": 98040.38 }, { "epoch": 0.672972972972973, "grad_norm": 0.18534745275974274, "learning_rate": 1.2129828259306982e-05, "loss": 0.3951, "num_input_tokens_seen": 16505711207, "step": 4233, "train_runtime": 168353.2958, "train_tokens_per_second": 98042.103 }, { "epoch": 0.6731319554848967, "grad_norm": 0.19683553278446198, "learning_rate": 1.2119102984657252e-05, "loss": 0.3947, "num_input_tokens_seen": 16509530391, "step": 4234, "train_runtime": 168392.9243, "train_tokens_per_second": 98041.711 }, { "epoch": 0.6732909379968204, "grad_norm": 0.21351094543933868, "learning_rate": 1.2108380936581199e-05, "loss": 0.3967, "num_input_tokens_seen": 16513456839, "step": 4235, "train_runtime": 168434.1292, "train_tokens_per_second": 98041.038 }, { "epoch": 0.6734499205087441, "grad_norm": 0.20238374173641205, "learning_rate": 1.2097662117764624e-05, "loss": 0.3954, "num_input_tokens_seen": 16517300891, "step": 4236, "train_runtime": 168473.1446, "train_tokens_per_second": 98041.15 }, { "epoch": 0.6736089030206678, "grad_norm": 0.22464729845523834, "learning_rate": 1.2086946530892521e-05, "loss": 0.3784, "num_input_tokens_seen": 16521187448, "step": 4237, "train_runtime": 168512.5377, "train_tokens_per_second": 98041.295 }, { "epoch": 0.6737678855325914, "grad_norm": 0.195974662899971, "learning_rate": 1.207623417864906e-05, "loss": 0.4024, "num_input_tokens_seen": 16525118718, "step": 4238, "train_runtime": 168549.7012, "train_tokens_per_second": 98043.002 }, { "epoch": 0.6739268680445151, "grad_norm": 0.2988460958003998, "learning_rate": 1.2065525063717623e-05, "loss": 0.3962, "num_input_tokens_seen": 16529109149, "step": 4239, "train_runtime": 168584.7161, "train_tokens_per_second": 98046.309 }, { "epoch": 0.6740858505564388, "grad_norm": 0.992950439453125, "learning_rate": 1.2054819188780747e-05, "loss": 0.3918, "num_input_tokens_seen": 16532894683, "step": 4240, "train_runtime": 168622.4823, "train_tokens_per_second": 98046.799 }, { "epoch": 0.6742448330683625, "grad_norm": 0.22350937128067017, "learning_rate": 1.2044116556520193e-05, "loss": 0.3966, "num_input_tokens_seen": 16536692302, "step": 4241, "train_runtime": 168660.9941, "train_tokens_per_second": 98046.928 }, { "epoch": 0.6744038155802862, "grad_norm": 0.19533349573612213, "learning_rate": 1.2033417169616886e-05, "loss": 0.3952, "num_input_tokens_seen": 16540620404, "step": 4242, "train_runtime": 168698.1735, "train_tokens_per_second": 98048.604 }, { "epoch": 0.6745627980922099, "grad_norm": 0.2033209502696991, "learning_rate": 1.2022721030750955e-05, "loss": 0.3985, "num_input_tokens_seen": 16544626783, "step": 4243, "train_runtime": 168735.6034, "train_tokens_per_second": 98050.598 }, { "epoch": 0.6747217806041336, "grad_norm": 0.3139253258705139, "learning_rate": 1.2012028142601715e-05, "loss": 0.3891, "num_input_tokens_seen": 16548474599, "step": 4244, "train_runtime": 168776.7793, "train_tokens_per_second": 98049.475 }, { "epoch": 0.6748807631160573, "grad_norm": 0.18804149329662323, "learning_rate": 1.2001338507847627e-05, "loss": 0.3921, "num_input_tokens_seen": 16552434407, "step": 4245, "train_runtime": 168816.5901, "train_tokens_per_second": 98049.809 }, { "epoch": 0.675039745627981, "grad_norm": 0.35363340377807617, "learning_rate": 1.1990652129166404e-05, "loss": 0.3911, "num_input_tokens_seen": 16556409843, "step": 4246, "train_runtime": 168855.2997, "train_tokens_per_second": 98050.875 }, { "epoch": 0.6751987281399046, "grad_norm": 0.2938863933086395, "learning_rate": 1.197996900923489e-05, "loss": 0.3897, "num_input_tokens_seen": 16560306182, "step": 4247, "train_runtime": 168896.3343, "train_tokens_per_second": 98050.122 }, { "epoch": 0.6753577106518283, "grad_norm": 0.259010374546051, "learning_rate": 1.1969289150729134e-05, "loss": 0.3961, "num_input_tokens_seen": 16564213806, "step": 4248, "train_runtime": 168937.3253, "train_tokens_per_second": 98049.462 }, { "epoch": 0.675516693163752, "grad_norm": 0.21639004349708557, "learning_rate": 1.1958612556324376e-05, "loss": 0.395, "num_input_tokens_seen": 16568100778, "step": 4249, "train_runtime": 168977.998, "train_tokens_per_second": 98048.864 }, { "epoch": 0.6756756756756757, "grad_norm": 0.21509726345539093, "learning_rate": 1.1947939228694996e-05, "loss": 0.3841, "num_input_tokens_seen": 16572006504, "step": 4250, "train_runtime": 169014.7616, "train_tokens_per_second": 98050.646 }, { "epoch": 0.6758346581875994, "grad_norm": 0.7210568189620972, "learning_rate": 1.193726917051463e-05, "loss": 0.3839, "num_input_tokens_seen": 16575819679, "step": 4251, "train_runtime": 169054.2528, "train_tokens_per_second": 98050.297 }, { "epoch": 0.6759936406995231, "grad_norm": 0.18623512983322144, "learning_rate": 1.1926602384456021e-05, "loss": 0.3948, "num_input_tokens_seen": 16579820187, "step": 4252, "train_runtime": 169091.8793, "train_tokens_per_second": 98052.137 }, { "epoch": 0.6761526232114468, "grad_norm": 0.21016573905944824, "learning_rate": 1.1915938873191132e-05, "loss": 0.3904, "num_input_tokens_seen": 16583737485, "step": 4253, "train_runtime": 169131.1944, "train_tokens_per_second": 98052.506 }, { "epoch": 0.6763116057233705, "grad_norm": 0.2113744020462036, "learning_rate": 1.1905278639391102e-05, "loss": 0.3873, "num_input_tokens_seen": 16587584839, "step": 4254, "train_runtime": 169169.34, "train_tokens_per_second": 98053.139 }, { "epoch": 0.6764705882352942, "grad_norm": 0.21284642815589905, "learning_rate": 1.1894621685726238e-05, "loss": 0.3982, "num_input_tokens_seen": 16591439216, "step": 4255, "train_runtime": 169208.8304, "train_tokens_per_second": 98053.034 }, { "epoch": 0.6766295707472179, "grad_norm": 0.2015148401260376, "learning_rate": 1.1883968014866046e-05, "loss": 0.3948, "num_input_tokens_seen": 16595260849, "step": 4256, "train_runtime": 169247.6985, "train_tokens_per_second": 98053.096 }, { "epoch": 0.6767885532591414, "grad_norm": 0.39132770895957947, "learning_rate": 1.1873317629479172e-05, "loss": 0.399, "num_input_tokens_seen": 16599073675, "step": 4257, "train_runtime": 169286.2142, "train_tokens_per_second": 98053.31 }, { "epoch": 0.6769475357710651, "grad_norm": 0.18371319770812988, "learning_rate": 1.1862670532233475e-05, "loss": 0.3829, "num_input_tokens_seen": 16603041229, "step": 4258, "train_runtime": 169325.8758, "train_tokens_per_second": 98053.774 }, { "epoch": 0.6771065182829888, "grad_norm": 0.18783460557460785, "learning_rate": 1.1852026725795979e-05, "loss": 0.3853, "num_input_tokens_seen": 16606946075, "step": 4259, "train_runtime": 169362.3125, "train_tokens_per_second": 98055.735 }, { "epoch": 0.6772655007949125, "grad_norm": 0.18109923601150513, "learning_rate": 1.1841386212832876e-05, "loss": 0.3895, "num_input_tokens_seen": 16610738769, "step": 4260, "train_runtime": 169401.3683, "train_tokens_per_second": 98055.517 }, { "epoch": 0.6774244833068362, "grad_norm": 0.1837083101272583, "learning_rate": 1.1830748996009544e-05, "loss": 0.3936, "num_input_tokens_seen": 16614790799, "step": 4261, "train_runtime": 169439.4978, "train_tokens_per_second": 98057.366 }, { "epoch": 0.67758346581876, "grad_norm": 0.22175323963165283, "learning_rate": 1.1820115077990543e-05, "loss": 0.3957, "num_input_tokens_seen": 16618601205, "step": 4262, "train_runtime": 169479.2876, "train_tokens_per_second": 98056.827 }, { "epoch": 0.6777424483306836, "grad_norm": 0.2156973034143448, "learning_rate": 1.1809484461439569e-05, "loss": 0.4016, "num_input_tokens_seen": 16622334277, "step": 4263, "train_runtime": 169517.0702, "train_tokens_per_second": 98056.994 }, { "epoch": 0.6779014308426073, "grad_norm": 0.1884552240371704, "learning_rate": 1.179885714901953e-05, "loss": 0.3816, "num_input_tokens_seen": 16626247877, "step": 4264, "train_runtime": 169557.1632, "train_tokens_per_second": 98056.889 }, { "epoch": 0.678060413354531, "grad_norm": 0.1911202073097229, "learning_rate": 1.1788233143392489e-05, "loss": 0.3952, "num_input_tokens_seen": 16630171396, "step": 4265, "train_runtime": 169596.6504, "train_tokens_per_second": 98057.192 }, { "epoch": 0.6782193958664547, "grad_norm": 0.1963866800069809, "learning_rate": 1.1777612447219688e-05, "loss": 0.3901, "num_input_tokens_seen": 16634010040, "step": 4266, "train_runtime": 169636.0531, "train_tokens_per_second": 98057.045 }, { "epoch": 0.6783783783783783, "grad_norm": 0.19502903521060944, "learning_rate": 1.1766995063161543e-05, "loss": 0.3985, "num_input_tokens_seen": 16637970107, "step": 4267, "train_runtime": 169675.6355, "train_tokens_per_second": 98057.509 }, { "epoch": 0.678537360890302, "grad_norm": 0.23551656305789948, "learning_rate": 1.1756380993877608e-05, "loss": 0.3869, "num_input_tokens_seen": 16641892350, "step": 4268, "train_runtime": 169713.6841, "train_tokens_per_second": 98058.636 }, { "epoch": 0.6786963434022257, "grad_norm": 0.2058309018611908, "learning_rate": 1.174577024202667e-05, "loss": 0.3943, "num_input_tokens_seen": 16645739929, "step": 4269, "train_runtime": 169752.4771, "train_tokens_per_second": 98058.893 }, { "epoch": 0.6788553259141494, "grad_norm": 0.20728661119937897, "learning_rate": 1.1735162810266615e-05, "loss": 0.3949, "num_input_tokens_seen": 16649587904, "step": 4270, "train_runtime": 169790.3529, "train_tokens_per_second": 98059.681 }, { "epoch": 0.6790143084260731, "grad_norm": 0.4255560636520386, "learning_rate": 1.1724558701254545e-05, "loss": 0.3845, "num_input_tokens_seen": 16653534402, "step": 4271, "train_runtime": 169829.3254, "train_tokens_per_second": 98060.417 }, { "epoch": 0.6791732909379968, "grad_norm": 0.2104579210281372, "learning_rate": 1.171395791764672e-05, "loss": 0.3963, "num_input_tokens_seen": 16657422530, "step": 4272, "train_runtime": 169864.6759, "train_tokens_per_second": 98062.899 }, { "epoch": 0.6793322734499205, "grad_norm": 0.2577704191207886, "learning_rate": 1.170336046209853e-05, "loss": 0.3965, "num_input_tokens_seen": 16661297958, "step": 4273, "train_runtime": 169904.0529, "train_tokens_per_second": 98062.981 }, { "epoch": 0.6794912559618442, "grad_norm": 0.25746893882751465, "learning_rate": 1.169276633726461e-05, "loss": 0.389, "num_input_tokens_seen": 16665122053, "step": 4274, "train_runtime": 169942.7169, "train_tokens_per_second": 98063.173 }, { "epoch": 0.6796502384737679, "grad_norm": 0.2383444607257843, "learning_rate": 1.1682175545798676e-05, "loss": 0.394, "num_input_tokens_seen": 16669090498, "step": 4275, "train_runtime": 169980.9788, "train_tokens_per_second": 98064.446 }, { "epoch": 0.6798092209856915, "grad_norm": 0.19120194017887115, "learning_rate": 1.167158809035366e-05, "loss": 0.395, "num_input_tokens_seen": 16672972199, "step": 4276, "train_runtime": 170019.7989, "train_tokens_per_second": 98064.886 }, { "epoch": 0.6799682034976152, "grad_norm": 0.2780977189540863, "learning_rate": 1.1661003973581647e-05, "loss": 0.3954, "num_input_tokens_seen": 16676766478, "step": 4277, "train_runtime": 170059.0288, "train_tokens_per_second": 98064.576 }, { "epoch": 0.6801271860095389, "grad_norm": 0.20727843046188354, "learning_rate": 1.1650423198133879e-05, "loss": 0.389, "num_input_tokens_seen": 16680676939, "step": 4278, "train_runtime": 170099.7181, "train_tokens_per_second": 98064.107 }, { "epoch": 0.6802861685214626, "grad_norm": 0.22737976908683777, "learning_rate": 1.1639845766660772e-05, "loss": 0.3948, "num_input_tokens_seen": 16684594373, "step": 4279, "train_runtime": 170136.8615, "train_tokens_per_second": 98065.723 }, { "epoch": 0.6804451510333863, "grad_norm": 0.22012914717197418, "learning_rate": 1.162927168181189e-05, "loss": 0.3898, "num_input_tokens_seen": 16688388528, "step": 4280, "train_runtime": 170177.2872, "train_tokens_per_second": 98064.723 }, { "epoch": 0.68060413354531, "grad_norm": 0.23036736249923706, "learning_rate": 1.1618700946235967e-05, "loss": 0.381, "num_input_tokens_seen": 16692240497, "step": 4281, "train_runtime": 170218.175, "train_tokens_per_second": 98063.797 }, { "epoch": 0.6807631160572337, "grad_norm": 0.202787846326828, "learning_rate": 1.1608133562580898e-05, "loss": 0.4022, "num_input_tokens_seen": 16696255407, "step": 4282, "train_runtime": 170257.0906, "train_tokens_per_second": 98064.964 }, { "epoch": 0.6809220985691574, "grad_norm": 0.20187528431415558, "learning_rate": 1.1597569533493743e-05, "loss": 0.3932, "num_input_tokens_seen": 16700115159, "step": 4283, "train_runtime": 170294.9848, "train_tokens_per_second": 98065.807 }, { "epoch": 0.6810810810810811, "grad_norm": 0.19897423684597015, "learning_rate": 1.1587008861620711e-05, "loss": 0.3843, "num_input_tokens_seen": 16703968289, "step": 4284, "train_runtime": 170333.743, "train_tokens_per_second": 98066.114 }, { "epoch": 0.6812400635930048, "grad_norm": 0.22383305430412292, "learning_rate": 1.157645154960719e-05, "loss": 0.3905, "num_input_tokens_seen": 16707745298, "step": 4285, "train_runtime": 170372.7208, "train_tokens_per_second": 98065.848 }, { "epoch": 0.6813990461049284, "grad_norm": 0.17838920652866364, "learning_rate": 1.1565897600097689e-05, "loss": 0.383, "num_input_tokens_seen": 16711769488, "step": 4286, "train_runtime": 170413.7191, "train_tokens_per_second": 98065.869 }, { "epoch": 0.6815580286168521, "grad_norm": 0.19598181545734406, "learning_rate": 1.155534701573591e-05, "loss": 0.4003, "num_input_tokens_seen": 16715732983, "step": 4287, "train_runtime": 170452.5767, "train_tokens_per_second": 98066.766 }, { "epoch": 0.6817170111287758, "grad_norm": 0.22892434895038605, "learning_rate": 1.1544799799164707e-05, "loss": 0.3946, "num_input_tokens_seen": 16719474206, "step": 4288, "train_runtime": 170491.8825, "train_tokens_per_second": 98066.101 }, { "epoch": 0.6818759936406995, "grad_norm": 0.20877425372600555, "learning_rate": 1.1534255953026057e-05, "loss": 0.383, "num_input_tokens_seen": 16723410811, "step": 4289, "train_runtime": 170529.0129, "train_tokens_per_second": 98067.833 }, { "epoch": 0.6820349761526232, "grad_norm": 0.2064221203327179, "learning_rate": 1.1523715479961156e-05, "loss": 0.4038, "num_input_tokens_seen": 16727443263, "step": 4290, "train_runtime": 170569.814, "train_tokens_per_second": 98068.016 }, { "epoch": 0.6821939586645469, "grad_norm": 0.21600653231143951, "learning_rate": 1.1513178382610281e-05, "loss": 0.3959, "num_input_tokens_seen": 16731363238, "step": 4291, "train_runtime": 170607.2417, "train_tokens_per_second": 98069.479 }, { "epoch": 0.6823529411764706, "grad_norm": 0.17055420577526093, "learning_rate": 1.1502644663612937e-05, "loss": 0.3853, "num_input_tokens_seen": 16735316573, "step": 4292, "train_runtime": 170647.3605, "train_tokens_per_second": 98069.589 }, { "epoch": 0.6825119236883943, "grad_norm": 0.18557791411876678, "learning_rate": 1.149211432560772e-05, "loss": 0.3988, "num_input_tokens_seen": 16739198119, "step": 4293, "train_runtime": 170686.1562, "train_tokens_per_second": 98070.04 }, { "epoch": 0.682670906200318, "grad_norm": 0.17581437528133392, "learning_rate": 1.1481587371232411e-05, "loss": 0.3957, "num_input_tokens_seen": 16743243656, "step": 4294, "train_runtime": 170724.9533, "train_tokens_per_second": 98071.45 }, { "epoch": 0.6828298887122417, "grad_norm": 0.1989501714706421, "learning_rate": 1.1471063803123947e-05, "loss": 0.3858, "num_input_tokens_seen": 16747088127, "step": 4295, "train_runtime": 170764.1389, "train_tokens_per_second": 98071.458 }, { "epoch": 0.6829888712241653, "grad_norm": 0.29525288939476013, "learning_rate": 1.1460543623918385e-05, "loss": 0.3825, "num_input_tokens_seen": 16751000935, "step": 4296, "train_runtime": 170803.729, "train_tokens_per_second": 98071.635 }, { "epoch": 0.683147853736089, "grad_norm": 0.17965148389339447, "learning_rate": 1.1450026836250987e-05, "loss": 0.3914, "num_input_tokens_seen": 16754967686, "step": 4297, "train_runtime": 170844.3708, "train_tokens_per_second": 98071.523 }, { "epoch": 0.6833068362480127, "grad_norm": 0.20394036173820496, "learning_rate": 1.1439513442756109e-05, "loss": 0.4017, "num_input_tokens_seen": 16758868683, "step": 4298, "train_runtime": 170884.6138, "train_tokens_per_second": 98071.256 }, { "epoch": 0.6834658187599364, "grad_norm": 0.21751312911510468, "learning_rate": 1.1429003446067291e-05, "loss": 0.3938, "num_input_tokens_seen": 16762749916, "step": 4299, "train_runtime": 170922.9974, "train_tokens_per_second": 98071.94 }, { "epoch": 0.6836248012718601, "grad_norm": 0.20131736993789673, "learning_rate": 1.1418496848817212e-05, "loss": 0.3969, "num_input_tokens_seen": 16766620062, "step": 4300, "train_runtime": 170961.1462, "train_tokens_per_second": 98072.693 }, { "epoch": 0.6837837837837838, "grad_norm": 0.20342065393924713, "learning_rate": 1.14079936536377e-05, "loss": 0.4117, "num_input_tokens_seen": 16770471334, "step": 4301, "train_runtime": 171000.486, "train_tokens_per_second": 98072.653 }, { "epoch": 0.6839427662957075, "grad_norm": 0.1821083128452301, "learning_rate": 1.1397493863159741e-05, "loss": 0.3785, "num_input_tokens_seen": 16774368649, "step": 4302, "train_runtime": 171039.8452, "train_tokens_per_second": 98072.871 }, { "epoch": 0.6841017488076312, "grad_norm": 0.18204925954341888, "learning_rate": 1.1386997480013442e-05, "loss": 0.3967, "num_input_tokens_seen": 16778234042, "step": 4303, "train_runtime": 171080.5535, "train_tokens_per_second": 98072.129 }, { "epoch": 0.6842607313195549, "grad_norm": 0.24795405566692352, "learning_rate": 1.137650450682808e-05, "loss": 0.4002, "num_input_tokens_seen": 16782188524, "step": 4304, "train_runtime": 171121.3423, "train_tokens_per_second": 98071.861 }, { "epoch": 0.6844197138314785, "grad_norm": 0.22396257519721985, "learning_rate": 1.1366014946232068e-05, "loss": 0.394, "num_input_tokens_seen": 16786091277, "step": 4305, "train_runtime": 171161.3756, "train_tokens_per_second": 98071.725 }, { "epoch": 0.6845786963434022, "grad_norm": 0.18320141732692719, "learning_rate": 1.135552880085298e-05, "loss": 0.4017, "num_input_tokens_seen": 16790077451, "step": 4306, "train_runtime": 171200.8611, "train_tokens_per_second": 98072.389 }, { "epoch": 0.6847376788553259, "grad_norm": 0.18725088238716125, "learning_rate": 1.1345046073317491e-05, "loss": 0.3827, "num_input_tokens_seen": 16793918735, "step": 4307, "train_runtime": 171239.9846, "train_tokens_per_second": 98072.414 }, { "epoch": 0.6848966613672496, "grad_norm": 0.1900402009487152, "learning_rate": 1.133456676625149e-05, "loss": 0.4004, "num_input_tokens_seen": 16797926565, "step": 4308, "train_runtime": 171278.376, "train_tokens_per_second": 98073.831 }, { "epoch": 0.6850556438791733, "grad_norm": 0.17872007191181183, "learning_rate": 1.132409088227994e-05, "loss": 0.3995, "num_input_tokens_seen": 16801712810, "step": 4309, "train_runtime": 171315.5331, "train_tokens_per_second": 98074.661 }, { "epoch": 0.685214626391097, "grad_norm": 0.20186692476272583, "learning_rate": 1.1313618424026984e-05, "loss": 0.3939, "num_input_tokens_seen": 16805719578, "step": 4310, "train_runtime": 171354.9311, "train_tokens_per_second": 98075.494 }, { "epoch": 0.6853736089030207, "grad_norm": 0.21471956372261047, "learning_rate": 1.1303149394115908e-05, "loss": 0.3915, "num_input_tokens_seen": 16809482472, "step": 4311, "train_runtime": 171394.7835, "train_tokens_per_second": 98074.645 }, { "epoch": 0.6855325914149444, "grad_norm": 0.18758705258369446, "learning_rate": 1.1292683795169106e-05, "loss": 0.3851, "num_input_tokens_seen": 16813410423, "step": 4312, "train_runtime": 171436.3382, "train_tokens_per_second": 98073.784 }, { "epoch": 0.6856915739268681, "grad_norm": 0.18581534922122955, "learning_rate": 1.128222162980817e-05, "loss": 0.3959, "num_input_tokens_seen": 16817280940, "step": 4313, "train_runtime": 171475.1744, "train_tokens_per_second": 98074.144 }, { "epoch": 0.6858505564387918, "grad_norm": 0.21584056317806244, "learning_rate": 1.1271762900653765e-05, "loss": 0.3982, "num_input_tokens_seen": 16821117078, "step": 4314, "train_runtime": 171514.2516, "train_tokens_per_second": 98074.165 }, { "epoch": 0.6860095389507154, "grad_norm": 0.32488560676574707, "learning_rate": 1.126130761032576e-05, "loss": 0.3931, "num_input_tokens_seen": 16825094156, "step": 4315, "train_runtime": 171554.418, "train_tokens_per_second": 98074.386 }, { "epoch": 0.6861685214626391, "grad_norm": 0.22482888400554657, "learning_rate": 1.125085576144311e-05, "loss": 0.378, "num_input_tokens_seen": 16828967520, "step": 4316, "train_runtime": 171594.1798, "train_tokens_per_second": 98074.233 }, { "epoch": 0.6863275039745628, "grad_norm": 0.1951904147863388, "learning_rate": 1.1240407356623933e-05, "loss": 0.3993, "num_input_tokens_seen": 16832831344, "step": 4317, "train_runtime": 171632.7531, "train_tokens_per_second": 98074.703 }, { "epoch": 0.6864864864864865, "grad_norm": 0.21237227320671082, "learning_rate": 1.1229962398485493e-05, "loss": 0.394, "num_input_tokens_seen": 16836773645, "step": 4318, "train_runtime": 171674.1365, "train_tokens_per_second": 98074.026 }, { "epoch": 0.6866454689984102, "grad_norm": 0.19980421662330627, "learning_rate": 1.121952088964415e-05, "loss": 0.3934, "num_input_tokens_seen": 16840702060, "step": 4319, "train_runtime": 171711.4068, "train_tokens_per_second": 98075.616 }, { "epoch": 0.6868044515103339, "grad_norm": 0.19566161930561066, "learning_rate": 1.1209082832715465e-05, "loss": 0.3935, "num_input_tokens_seen": 16844610861, "step": 4320, "train_runtime": 171751.6124, "train_tokens_per_second": 98075.416 }, { "epoch": 0.6869634340222576, "grad_norm": 0.1839914619922638, "learning_rate": 1.1198648230314068e-05, "loss": 0.3975, "num_input_tokens_seen": 16848507092, "step": 4321, "train_runtime": 171792.1893, "train_tokens_per_second": 98074.931 }, { "epoch": 0.6871224165341813, "grad_norm": 0.2468312531709671, "learning_rate": 1.1188217085053767e-05, "loss": 0.3942, "num_input_tokens_seen": 16852451374, "step": 4322, "train_runtime": 171829.8851, "train_tokens_per_second": 98076.37 }, { "epoch": 0.687281399046105, "grad_norm": 0.31376394629478455, "learning_rate": 1.1177789399547486e-05, "loss": 0.3976, "num_input_tokens_seen": 16856237141, "step": 4323, "train_runtime": 171870.5773, "train_tokens_per_second": 98075.176 }, { "epoch": 0.6874403815580287, "grad_norm": 0.2269231528043747, "learning_rate": 1.1167365176407288e-05, "loss": 0.381, "num_input_tokens_seen": 16860182110, "step": 4324, "train_runtime": 171907.2598, "train_tokens_per_second": 98077.197 }, { "epoch": 0.6875993640699523, "grad_norm": 0.35949110984802246, "learning_rate": 1.1156944418244375e-05, "loss": 0.397, "num_input_tokens_seen": 16863932716, "step": 4325, "train_runtime": 171947.1461, "train_tokens_per_second": 98076.258 }, { "epoch": 0.687758346581876, "grad_norm": 0.32931211590766907, "learning_rate": 1.1146527127669058e-05, "loss": 0.3844, "num_input_tokens_seen": 16867864604, "step": 4326, "train_runtime": 171986.325, "train_tokens_per_second": 98076.778 }, { "epoch": 0.6879173290937997, "grad_norm": 0.19644293189048767, "learning_rate": 1.11361133072908e-05, "loss": 0.3904, "num_input_tokens_seen": 16871808135, "step": 4327, "train_runtime": 172024.5316, "train_tokens_per_second": 98077.919 }, { "epoch": 0.6880763116057234, "grad_norm": 0.3451267182826996, "learning_rate": 1.1125702959718194e-05, "loss": 0.4064, "num_input_tokens_seen": 16875743591, "step": 4328, "train_runtime": 172063.0943, "train_tokens_per_second": 98078.81 }, { "epoch": 0.6882352941176471, "grad_norm": 0.1832500696182251, "learning_rate": 1.1115296087558962e-05, "loss": 0.3862, "num_input_tokens_seen": 16879455908, "step": 4329, "train_runtime": 172104.0007, "train_tokens_per_second": 98077.069 }, { "epoch": 0.6883942766295708, "grad_norm": 0.24364827573299408, "learning_rate": 1.110489269341993e-05, "loss": 0.3903, "num_input_tokens_seen": 16883333014, "step": 4330, "train_runtime": 172145.5363, "train_tokens_per_second": 98075.927 }, { "epoch": 0.6885532591414945, "grad_norm": 0.2764301300048828, "learning_rate": 1.1094492779907106e-05, "loss": 0.401, "num_input_tokens_seen": 16887355817, "step": 4331, "train_runtime": 172186.4056, "train_tokens_per_second": 98076.011 }, { "epoch": 0.6887122416534182, "grad_norm": 0.20842495560646057, "learning_rate": 1.1084096349625572e-05, "loss": 0.3858, "num_input_tokens_seen": 16891255239, "step": 4332, "train_runtime": 172225.6696, "train_tokens_per_second": 98076.293 }, { "epoch": 0.6888712241653419, "grad_norm": 0.22739845514297485, "learning_rate": 1.107370340517957e-05, "loss": 0.4081, "num_input_tokens_seen": 16895158246, "step": 4333, "train_runtime": 172264.1364, "train_tokens_per_second": 98077.05 }, { "epoch": 0.6890302066772656, "grad_norm": 0.21221473813056946, "learning_rate": 1.1063313949172454e-05, "loss": 0.4016, "num_input_tokens_seen": 16899032630, "step": 4334, "train_runtime": 172303.0995, "train_tokens_per_second": 98077.357 }, { "epoch": 0.6891891891891891, "grad_norm": 0.36471158266067505, "learning_rate": 1.1052927984206712e-05, "loss": 0.3981, "num_input_tokens_seen": 16903053585, "step": 4335, "train_runtime": 172340.7959, "train_tokens_per_second": 98079.236 }, { "epoch": 0.6893481717011128, "grad_norm": 0.2114102989435196, "learning_rate": 1.1042545512883967e-05, "loss": 0.3983, "num_input_tokens_seen": 16906922837, "step": 4336, "train_runtime": 172381.3471, "train_tokens_per_second": 98078.61 }, { "epoch": 0.6895071542130365, "grad_norm": 0.1856624037027359, "learning_rate": 1.1032166537804923e-05, "loss": 0.3941, "num_input_tokens_seen": 16910838643, "step": 4337, "train_runtime": 172420.4234, "train_tokens_per_second": 98079.092 }, { "epoch": 0.6896661367249602, "grad_norm": 0.2241416573524475, "learning_rate": 1.1021791061569479e-05, "loss": 0.393, "num_input_tokens_seen": 16914766878, "step": 4338, "train_runtime": 172460.3345, "train_tokens_per_second": 98079.172 }, { "epoch": 0.6898251192368839, "grad_norm": 0.19112801551818848, "learning_rate": 1.1011419086776587e-05, "loss": 0.3883, "num_input_tokens_seen": 16918662046, "step": 4339, "train_runtime": 172499.9332, "train_tokens_per_second": 98079.238 }, { "epoch": 0.6899841017488076, "grad_norm": 0.18325276672840118, "learning_rate": 1.1001050616024366e-05, "loss": 0.392, "num_input_tokens_seen": 16922576220, "step": 4340, "train_runtime": 172540.3981, "train_tokens_per_second": 98078.922 }, { "epoch": 0.6901430842607313, "grad_norm": 0.2407819628715515, "learning_rate": 1.0990685651910046e-05, "loss": 0.3943, "num_input_tokens_seen": 16926307758, "step": 4341, "train_runtime": 172577.9701, "train_tokens_per_second": 98079.191 }, { "epoch": 0.690302066772655, "grad_norm": 0.204826220870018, "learning_rate": 1.0980324197029977e-05, "loss": 0.3886, "num_input_tokens_seen": 16930290223, "step": 4342, "train_runtime": 172617.9753, "train_tokens_per_second": 98079.532 }, { "epoch": 0.6904610492845787, "grad_norm": 0.19917245209217072, "learning_rate": 1.0969966253979633e-05, "loss": 0.3953, "num_input_tokens_seen": 16934101451, "step": 4343, "train_runtime": 172653.1964, "train_tokens_per_second": 98081.598 }, { "epoch": 0.6906200317965023, "grad_norm": 0.4359554052352905, "learning_rate": 1.0959611825353594e-05, "loss": 0.403, "num_input_tokens_seen": 16938097065, "step": 4344, "train_runtime": 172690.8663, "train_tokens_per_second": 98083.341 }, { "epoch": 0.690779014308426, "grad_norm": 0.19047749042510986, "learning_rate": 1.094926091374558e-05, "loss": 0.3782, "num_input_tokens_seen": 16941906419, "step": 4345, "train_runtime": 172729.0207, "train_tokens_per_second": 98083.729 }, { "epoch": 0.6909379968203497, "grad_norm": 0.6883844137191772, "learning_rate": 1.0938913521748418e-05, "loss": 0.392, "num_input_tokens_seen": 16945842375, "step": 4346, "train_runtime": 172767.8709, "train_tokens_per_second": 98084.455 }, { "epoch": 0.6910969793322734, "grad_norm": 0.1830396205186844, "learning_rate": 1.0928569651954067e-05, "loss": 0.3836, "num_input_tokens_seen": 16949793397, "step": 4347, "train_runtime": 172804.6912, "train_tokens_per_second": 98086.419 }, { "epoch": 0.6912559618441971, "grad_norm": 0.2267557978630066, "learning_rate": 1.0918229306953567e-05, "loss": 0.4049, "num_input_tokens_seen": 16953503612, "step": 4348, "train_runtime": 172842.3677, "train_tokens_per_second": 98086.504 }, { "epoch": 0.6914149443561208, "grad_norm": 0.3443117141723633, "learning_rate": 1.0907892489337138e-05, "loss": 0.3943, "num_input_tokens_seen": 16957477583, "step": 4349, "train_runtime": 172883.2811, "train_tokens_per_second": 98086.278 }, { "epoch": 0.6915739268680445, "grad_norm": 0.4443567097187042, "learning_rate": 1.0897559201694052e-05, "loss": 0.4043, "num_input_tokens_seen": 16961427525, "step": 4350, "train_runtime": 172922.9115, "train_tokens_per_second": 98086.641 }, { "epoch": 0.6917329093799682, "grad_norm": 0.3395429253578186, "learning_rate": 1.0887229446612732e-05, "loss": 0.4001, "num_input_tokens_seen": 16965303138, "step": 4351, "train_runtime": 172961.762, "train_tokens_per_second": 98087.016 }, { "epoch": 0.6918918918918919, "grad_norm": 0.20166045427322388, "learning_rate": 1.0876903226680719e-05, "loss": 0.3972, "num_input_tokens_seen": 16969262424, "step": 4352, "train_runtime": 173000.1054, "train_tokens_per_second": 98088.162 }, { "epoch": 0.6920508744038156, "grad_norm": 0.18272385001182556, "learning_rate": 1.086658054448463e-05, "loss": 0.3845, "num_input_tokens_seen": 16973187480, "step": 4353, "train_runtime": 173041.1278, "train_tokens_per_second": 98087.592 }, { "epoch": 0.6922098569157392, "grad_norm": 0.2850431799888611, "learning_rate": 1.0856261402610262e-05, "loss": 0.3911, "num_input_tokens_seen": 16977227202, "step": 4354, "train_runtime": 173081.5881, "train_tokens_per_second": 98088.002 }, { "epoch": 0.6923688394276629, "grad_norm": 0.20879359543323517, "learning_rate": 1.0845945803642448e-05, "loss": 0.3895, "num_input_tokens_seen": 16980962683, "step": 4355, "train_runtime": 173118.7167, "train_tokens_per_second": 98088.543 }, { "epoch": 0.6925278219395866, "grad_norm": 0.2012658268213272, "learning_rate": 1.0835633750165215e-05, "loss": 0.3889, "num_input_tokens_seen": 16985000748, "step": 4356, "train_runtime": 173157.2703, "train_tokens_per_second": 98090.024 }, { "epoch": 0.6926868044515103, "grad_norm": 0.21762971580028534, "learning_rate": 1.0825325244761625e-05, "loss": 0.3989, "num_input_tokens_seen": 16988895306, "step": 4357, "train_runtime": 173197.109, "train_tokens_per_second": 98089.947 }, { "epoch": 0.692845786963434, "grad_norm": 0.1929832547903061, "learning_rate": 1.0815020290013897e-05, "loss": 0.3887, "num_input_tokens_seen": 16992811250, "step": 4358, "train_runtime": 173235.9948, "train_tokens_per_second": 98090.534 }, { "epoch": 0.6930047694753577, "grad_norm": 0.18785730004310608, "learning_rate": 1.0804718888503363e-05, "loss": 0.3874, "num_input_tokens_seen": 16996694937, "step": 4359, "train_runtime": 173276.3816, "train_tokens_per_second": 98090.085 }, { "epoch": 0.6931637519872814, "grad_norm": 0.17996105551719666, "learning_rate": 1.0794421042810418e-05, "loss": 0.3848, "num_input_tokens_seen": 17000635387, "step": 4360, "train_runtime": 173316.1445, "train_tokens_per_second": 98090.316 }, { "epoch": 0.6933227344992051, "grad_norm": 0.21196646988391876, "learning_rate": 1.0784126755514637e-05, "loss": 0.3871, "num_input_tokens_seen": 17004568478, "step": 4361, "train_runtime": 173354.8063, "train_tokens_per_second": 98091.128 }, { "epoch": 0.6934817170111288, "grad_norm": 0.1769607663154602, "learning_rate": 1.0773836029194642e-05, "loss": 0.3873, "num_input_tokens_seen": 17008459328, "step": 4362, "train_runtime": 173391.3067, "train_tokens_per_second": 98092.919 }, { "epoch": 0.6936406995230525, "grad_norm": 0.20351432263851166, "learning_rate": 1.0763548866428193e-05, "loss": 0.3981, "num_input_tokens_seen": 17012396375, "step": 4363, "train_runtime": 173431.8112, "train_tokens_per_second": 98092.71 }, { "epoch": 0.6937996820349761, "grad_norm": 0.44614771008491516, "learning_rate": 1.0753265269792157e-05, "loss": 0.3954, "num_input_tokens_seen": 17016326936, "step": 4364, "train_runtime": 173472.2525, "train_tokens_per_second": 98092.5 }, { "epoch": 0.6939586645468998, "grad_norm": 0.20743970572948456, "learning_rate": 1.0742985241862505e-05, "loss": 0.3892, "num_input_tokens_seen": 17020175173, "step": 4365, "train_runtime": 173512.2297, "train_tokens_per_second": 98092.078 }, { "epoch": 0.6941176470588235, "grad_norm": 0.20465785264968872, "learning_rate": 1.0732708785214296e-05, "loss": 0.3844, "num_input_tokens_seen": 17024189318, "step": 4366, "train_runtime": 173550.1684, "train_tokens_per_second": 98093.764 }, { "epoch": 0.6942766295707472, "grad_norm": 0.2121220827102661, "learning_rate": 1.0722435902421717e-05, "loss": 0.3918, "num_input_tokens_seen": 17028016822, "step": 4367, "train_runtime": 173589.2418, "train_tokens_per_second": 98093.733 }, { "epoch": 0.6944356120826709, "grad_norm": 0.2043353170156479, "learning_rate": 1.0712166596058057e-05, "loss": 0.3865, "num_input_tokens_seen": 17031894771, "step": 4368, "train_runtime": 173627.706, "train_tokens_per_second": 98094.337 }, { "epoch": 0.6945945945945946, "grad_norm": 0.2528136372566223, "learning_rate": 1.07019008686957e-05, "loss": 0.3956, "num_input_tokens_seen": 17035829794, "step": 4369, "train_runtime": 173668.5529, "train_tokens_per_second": 98093.924 }, { "epoch": 0.6947535771065183, "grad_norm": 0.34979715943336487, "learning_rate": 1.069163872290615e-05, "loss": 0.3884, "num_input_tokens_seen": 17039783950, "step": 4370, "train_runtime": 173708.5547, "train_tokens_per_second": 98094.098 }, { "epoch": 0.694912559618442, "grad_norm": 0.19636960327625275, "learning_rate": 1.0681380161259973e-05, "loss": 0.395, "num_input_tokens_seen": 17043796077, "step": 4371, "train_runtime": 173749.6716, "train_tokens_per_second": 98093.976 }, { "epoch": 0.6950715421303657, "grad_norm": 0.20707738399505615, "learning_rate": 1.0671125186326905e-05, "loss": 0.3927, "num_input_tokens_seen": 17047548127, "step": 4372, "train_runtime": 173790.9797, "train_tokens_per_second": 98092.249 }, { "epoch": 0.6952305246422893, "grad_norm": 0.19839011132717133, "learning_rate": 1.0660873800675714e-05, "loss": 0.3915, "num_input_tokens_seen": 17051496729, "step": 4373, "train_runtime": 173828.7537, "train_tokens_per_second": 98093.649 }, { "epoch": 0.695389507154213, "grad_norm": 0.18672291934490204, "learning_rate": 1.0650626006874311e-05, "loss": 0.4039, "num_input_tokens_seen": 17055428753, "step": 4374, "train_runtime": 173870.2482, "train_tokens_per_second": 98092.853 }, { "epoch": 0.6955484896661367, "grad_norm": 0.1881093829870224, "learning_rate": 1.0640381807489706e-05, "loss": 0.3897, "num_input_tokens_seen": 17059203875, "step": 4375, "train_runtime": 173908.1605, "train_tokens_per_second": 98093.176 }, { "epoch": 0.6957074721780604, "grad_norm": 0.21075773239135742, "learning_rate": 1.0630141205087968e-05, "loss": 0.3798, "num_input_tokens_seen": 17063151544, "step": 4376, "train_runtime": 173947.5525, "train_tokens_per_second": 98093.657 }, { "epoch": 0.6958664546899841, "grad_norm": 0.24545340240001678, "learning_rate": 1.0619904202234335e-05, "loss": 0.3795, "num_input_tokens_seen": 17067004405, "step": 4377, "train_runtime": 173987.0726, "train_tokens_per_second": 98093.52 }, { "epoch": 0.6960254372019078, "grad_norm": 0.22504115104675293, "learning_rate": 1.0609670801493065e-05, "loss": 0.3949, "num_input_tokens_seen": 17070997869, "step": 4378, "train_runtime": 174027.0377, "train_tokens_per_second": 98093.94 }, { "epoch": 0.6961844197138315, "grad_norm": 0.302407830953598, "learning_rate": 1.059944100542759e-05, "loss": 0.3902, "num_input_tokens_seen": 17074856485, "step": 4379, "train_runtime": 174067.058, "train_tokens_per_second": 98093.555 }, { "epoch": 0.6963434022257552, "grad_norm": 0.2380703091621399, "learning_rate": 1.0589214816600377e-05, "loss": 0.3857, "num_input_tokens_seen": 17078747782, "step": 4380, "train_runtime": 174104.7316, "train_tokens_per_second": 98094.679 }, { "epoch": 0.6965023847376789, "grad_norm": 0.2119692862033844, "learning_rate": 1.0578992237573019e-05, "loss": 0.4046, "num_input_tokens_seen": 17082693189, "step": 4381, "train_runtime": 174144.0693, "train_tokens_per_second": 98095.176 }, { "epoch": 0.6966613672496026, "grad_norm": 0.23816098272800446, "learning_rate": 1.0568773270906213e-05, "loss": 0.3924, "num_input_tokens_seen": 17086459989, "step": 4382, "train_runtime": 174184.7733, "train_tokens_per_second": 98093.879 }, { "epoch": 0.6968203497615262, "grad_norm": 0.20432619750499725, "learning_rate": 1.0558557919159715e-05, "loss": 0.3932, "num_input_tokens_seen": 17090423563, "step": 4383, "train_runtime": 174223.8068, "train_tokens_per_second": 98094.651 }, { "epoch": 0.6969793322734499, "grad_norm": 0.24169839918613434, "learning_rate": 1.0548346184892411e-05, "loss": 0.3802, "num_input_tokens_seen": 17094344183, "step": 4384, "train_runtime": 174262.5026, "train_tokens_per_second": 98095.367 }, { "epoch": 0.6971383147853736, "grad_norm": 0.19335758686065674, "learning_rate": 1.0538138070662266e-05, "loss": 0.3983, "num_input_tokens_seen": 17098263907, "step": 4385, "train_runtime": 174302.4761, "train_tokens_per_second": 98095.359 }, { "epoch": 0.6972972972972973, "grad_norm": 0.23592513799667358, "learning_rate": 1.0527933579026342e-05, "loss": 0.3976, "num_input_tokens_seen": 17102227738, "step": 4386, "train_runtime": 174341.4166, "train_tokens_per_second": 98096.184 }, { "epoch": 0.697456279809221, "grad_norm": 0.19739319384098053, "learning_rate": 1.0517732712540793e-05, "loss": 0.3948, "num_input_tokens_seen": 17106200164, "step": 4387, "train_runtime": 174383.1335, "train_tokens_per_second": 98095.497 }, { "epoch": 0.6976152623211447, "grad_norm": 0.19868765771389008, "learning_rate": 1.0507535473760872e-05, "loss": 0.3924, "num_input_tokens_seen": 17110004598, "step": 4388, "train_runtime": 174420.9129, "train_tokens_per_second": 98096.062 }, { "epoch": 0.6977742448330684, "grad_norm": 0.23721055686473846, "learning_rate": 1.0497341865240895e-05, "loss": 0.3896, "num_input_tokens_seen": 17113928446, "step": 4389, "train_runtime": 174461.6717, "train_tokens_per_second": 98095.635 }, { "epoch": 0.6979332273449921, "grad_norm": 0.2416653037071228, "learning_rate": 1.04871518895343e-05, "loss": 0.3924, "num_input_tokens_seen": 17117833776, "step": 4390, "train_runtime": 174502.5895, "train_tokens_per_second": 98095.013 }, { "epoch": 0.6980922098569158, "grad_norm": 0.22845523059368134, "learning_rate": 1.0476965549193604e-05, "loss": 0.3928, "num_input_tokens_seen": 17121725752, "step": 4391, "train_runtime": 174540.9965, "train_tokens_per_second": 98095.726 }, { "epoch": 0.6982511923688395, "grad_norm": 0.21993058919906616, "learning_rate": 1.0466782846770415e-05, "loss": 0.4073, "num_input_tokens_seen": 17125584150, "step": 4392, "train_runtime": 174579.7174, "train_tokens_per_second": 98096.07 }, { "epoch": 0.6984101748807631, "grad_norm": 0.17991337180137634, "learning_rate": 1.0456603784815434e-05, "loss": 0.3956, "num_input_tokens_seen": 17129522239, "step": 4393, "train_runtime": 174619.2932, "train_tokens_per_second": 98096.39 }, { "epoch": 0.6985691573926868, "grad_norm": 0.35646507143974304, "learning_rate": 1.0446428365878416e-05, "loss": 0.3912, "num_input_tokens_seen": 17133373424, "step": 4394, "train_runtime": 174657.1493, "train_tokens_per_second": 98097.178 }, { "epoch": 0.6987281399046105, "grad_norm": 0.2048468440771103, "learning_rate": 1.0436256592508273e-05, "loss": 0.4027, "num_input_tokens_seen": 17137338849, "step": 4395, "train_runtime": 174696.9335, "train_tokens_per_second": 98097.537 }, { "epoch": 0.6988871224165342, "grad_norm": 0.2612191140651703, "learning_rate": 1.0426088467252929e-05, "loss": 0.3838, "num_input_tokens_seen": 17141255614, "step": 4396, "train_runtime": 174735.318, "train_tokens_per_second": 98098.403 }, { "epoch": 0.6990461049284579, "grad_norm": 0.21663150191307068, "learning_rate": 1.041592399265944e-05, "loss": 0.3923, "num_input_tokens_seen": 17145164222, "step": 4397, "train_runtime": 174773.7258, "train_tokens_per_second": 98099.209 }, { "epoch": 0.6992050874403816, "grad_norm": 0.32937827706336975, "learning_rate": 1.0405763171273946e-05, "loss": 0.3958, "num_input_tokens_seen": 17149061566, "step": 4398, "train_runtime": 174812.7509, "train_tokens_per_second": 98099.604 }, { "epoch": 0.6993640699523053, "grad_norm": 0.26009485125541687, "learning_rate": 1.0395606005641631e-05, "loss": 0.3972, "num_input_tokens_seen": 17152985155, "step": 4399, "train_runtime": 174852.5538, "train_tokens_per_second": 98099.712 }, { "epoch": 0.699523052464229, "grad_norm": 0.22022756934165955, "learning_rate": 1.0385452498306828e-05, "loss": 0.3996, "num_input_tokens_seen": 17156941174, "step": 4400, "train_runtime": 174891.3798, "train_tokens_per_second": 98100.554 }, { "epoch": 0.6996820349761527, "grad_norm": 0.20336581766605377, "learning_rate": 1.0375302651812886e-05, "loss": 0.3856, "num_input_tokens_seen": 17160722453, "step": 4401, "train_runtime": 175049.7594, "train_tokens_per_second": 98033.396 }, { "epoch": 0.6998410174880764, "grad_norm": 0.18530726432800293, "learning_rate": 1.0365156468702303e-05, "loss": 0.3852, "num_input_tokens_seen": 17164804941, "step": 4402, "train_runtime": 175090.0647, "train_tokens_per_second": 98034.146 }, { "epoch": 0.7, "grad_norm": 0.20300135016441345, "learning_rate": 1.0355013951516603e-05, "loss": 0.3932, "num_input_tokens_seen": 17168714991, "step": 4403, "train_runtime": 175127.8301, "train_tokens_per_second": 98035.332 }, { "epoch": 0.7001589825119237, "grad_norm": 0.2225635051727295, "learning_rate": 1.034487510279642e-05, "loss": 0.3993, "num_input_tokens_seen": 17172500552, "step": 4404, "train_runtime": 175166.3497, "train_tokens_per_second": 98035.385 }, { "epoch": 0.7003179650238474, "grad_norm": 0.19181255996227264, "learning_rate": 1.0334739925081475e-05, "loss": 0.3902, "num_input_tokens_seen": 17176422238, "step": 4405, "train_runtime": 175205.9228, "train_tokens_per_second": 98035.626 }, { "epoch": 0.700476947535771, "grad_norm": 0.1966923177242279, "learning_rate": 1.0324608420910539e-05, "loss": 0.3837, "num_input_tokens_seen": 17180308223, "step": 4406, "train_runtime": 175242.7413, "train_tokens_per_second": 98037.203 }, { "epoch": 0.7006359300476948, "grad_norm": 0.2012721300125122, "learning_rate": 1.0314480592821493e-05, "loss": 0.3858, "num_input_tokens_seen": 17184209031, "step": 4407, "train_runtime": 175280.3142, "train_tokens_per_second": 98038.443 }, { "epoch": 0.7007949125596185, "grad_norm": 0.2036832571029663, "learning_rate": 1.0304356443351285e-05, "loss": 0.3781, "num_input_tokens_seen": 17188151559, "step": 4408, "train_runtime": 175320.099, "train_tokens_per_second": 98038.683 }, { "epoch": 0.7009538950715422, "grad_norm": 0.5577464699745178, "learning_rate": 1.0294235975035942e-05, "loss": 0.3786, "num_input_tokens_seen": 17192128603, "step": 4409, "train_runtime": 175360.4096, "train_tokens_per_second": 98038.826 }, { "epoch": 0.7011128775834659, "grad_norm": 0.22934524714946747, "learning_rate": 1.0284119190410575e-05, "loss": 0.3979, "num_input_tokens_seen": 17196039592, "step": 4410, "train_runtime": 175397.4653, "train_tokens_per_second": 98040.411 }, { "epoch": 0.7012718600953896, "grad_norm": 0.2464170902967453, "learning_rate": 1.0274006092009369e-05, "loss": 0.3762, "num_input_tokens_seen": 17199859893, "step": 4411, "train_runtime": 175436.257, "train_tokens_per_second": 98040.509 }, { "epoch": 0.7014308426073131, "grad_norm": 0.20059816539287567, "learning_rate": 1.026389668236557e-05, "loss": 0.387, "num_input_tokens_seen": 17203854959, "step": 4412, "train_runtime": 175476.0724, "train_tokens_per_second": 98041.03 }, { "epoch": 0.7015898251192368, "grad_norm": 0.26457536220550537, "learning_rate": 1.025379096401152e-05, "loss": 0.3913, "num_input_tokens_seen": 17207731301, "step": 4413, "train_runtime": 175516.7563, "train_tokens_per_second": 98040.39 }, { "epoch": 0.7017488076311605, "grad_norm": 0.1964796632528305, "learning_rate": 1.0243688939478635e-05, "loss": 0.3821, "num_input_tokens_seen": 17211583110, "step": 4414, "train_runtime": 175556.3382, "train_tokens_per_second": 98040.226 }, { "epoch": 0.7019077901430842, "grad_norm": 0.18956845998764038, "learning_rate": 1.0233590611297394e-05, "loss": 0.3871, "num_input_tokens_seen": 17215398885, "step": 4415, "train_runtime": 175594.3848, "train_tokens_per_second": 98040.714 }, { "epoch": 0.7020667726550079, "grad_norm": 0.22076836228370667, "learning_rate": 1.0223495981997369e-05, "loss": 0.3934, "num_input_tokens_seen": 17219361067, "step": 4416, "train_runtime": 175634.1367, "train_tokens_per_second": 98041.084 }, { "epoch": 0.7022257551669316, "grad_norm": 0.25130295753479004, "learning_rate": 1.021340505410717e-05, "loss": 0.3919, "num_input_tokens_seen": 17223274351, "step": 4417, "train_runtime": 175675.2516, "train_tokens_per_second": 98040.414 }, { "epoch": 0.7023847376788553, "grad_norm": 0.19925861060619354, "learning_rate": 1.0203317830154533e-05, "loss": 0.4007, "num_input_tokens_seen": 17227107745, "step": 4418, "train_runtime": 175715.1286, "train_tokens_per_second": 98039.98 }, { "epoch": 0.702543720190779, "grad_norm": 0.19874754548072815, "learning_rate": 1.0193234312666214e-05, "loss": 0.3787, "num_input_tokens_seen": 17230894067, "step": 4419, "train_runtime": 175749.4736, "train_tokens_per_second": 98042.365 }, { "epoch": 0.7027027027027027, "grad_norm": 0.1990734040737152, "learning_rate": 1.0183154504168066e-05, "loss": 0.386, "num_input_tokens_seen": 17234822070, "step": 4420, "train_runtime": 175788.0434, "train_tokens_per_second": 98043.199 }, { "epoch": 0.7028616852146264, "grad_norm": 0.6984015107154846, "learning_rate": 1.0173078407185027e-05, "loss": 0.3898, "num_input_tokens_seen": 17238722875, "step": 4421, "train_runtime": 175827.6975, "train_tokens_per_second": 98043.273 }, { "epoch": 0.70302066772655, "grad_norm": 0.24176639318466187, "learning_rate": 1.0163006024241053e-05, "loss": 0.3849, "num_input_tokens_seen": 17242594110, "step": 4422, "train_runtime": 175865.8684, "train_tokens_per_second": 98044.005 }, { "epoch": 0.7031796502384737, "grad_norm": 0.21227535605430603, "learning_rate": 1.0152937357859247e-05, "loss": 0.3808, "num_input_tokens_seen": 17246578137, "step": 4423, "train_runtime": 175904.8055, "train_tokens_per_second": 98044.952 }, { "epoch": 0.7033386327503974, "grad_norm": 0.21785223484039307, "learning_rate": 1.014287241056171e-05, "loss": 0.3948, "num_input_tokens_seen": 17250369665, "step": 4424, "train_runtime": 175942.7718, "train_tokens_per_second": 98045.344 }, { "epoch": 0.7034976152623211, "grad_norm": 0.23310551047325134, "learning_rate": 1.0132811184869648e-05, "loss": 0.3996, "num_input_tokens_seen": 17254336051, "step": 4425, "train_runtime": 175980.7896, "train_tokens_per_second": 98046.702 }, { "epoch": 0.7036565977742448, "grad_norm": 0.21097879111766815, "learning_rate": 1.0122753683303324e-05, "loss": 0.3879, "num_input_tokens_seen": 17258249500, "step": 4426, "train_runtime": 176021.0887, "train_tokens_per_second": 98046.488 }, { "epoch": 0.7038155802861685, "grad_norm": 0.4831251800060272, "learning_rate": 1.0112699908382078e-05, "loss": 0.3926, "num_input_tokens_seen": 17262247262, "step": 4427, "train_runtime": 176061.6397, "train_tokens_per_second": 98046.612 }, { "epoch": 0.7039745627980922, "grad_norm": 0.607609212398529, "learning_rate": 1.0102649862624314e-05, "loss": 0.3893, "num_input_tokens_seen": 17266135694, "step": 4428, "train_runtime": 176101.1686, "train_tokens_per_second": 98046.684 }, { "epoch": 0.7041335453100159, "grad_norm": 0.22362732887268066, "learning_rate": 1.009260354854748e-05, "loss": 0.378, "num_input_tokens_seen": 17270091009, "step": 4429, "train_runtime": 176140.7161, "train_tokens_per_second": 98047.126 }, { "epoch": 0.7042925278219396, "grad_norm": 0.24346521496772766, "learning_rate": 1.0082560968668116e-05, "loss": 0.3937, "num_input_tokens_seen": 17273955513, "step": 4430, "train_runtime": 176180.3719, "train_tokens_per_second": 98046.992 }, { "epoch": 0.7044515103338633, "grad_norm": 0.22323720157146454, "learning_rate": 1.0072522125501818e-05, "loss": 0.3837, "num_input_tokens_seen": 17277915821, "step": 4431, "train_runtime": 176218.8312, "train_tokens_per_second": 98048.067 }, { "epoch": 0.7046104928457869, "grad_norm": 0.2935403883457184, "learning_rate": 1.0062487021563246e-05, "loss": 0.3948, "num_input_tokens_seen": 17281828762, "step": 4432, "train_runtime": 176260.5001, "train_tokens_per_second": 98047.088 }, { "epoch": 0.7047694753577106, "grad_norm": 0.27925020456314087, "learning_rate": 1.0052455659366119e-05, "loss": 0.3885, "num_input_tokens_seen": 17285651390, "step": 4433, "train_runtime": 176299.3681, "train_tokens_per_second": 98047.155 }, { "epoch": 0.7049284578696343, "grad_norm": 0.24260784685611725, "learning_rate": 1.0042428041423235e-05, "loss": 0.3859, "num_input_tokens_seen": 17289611854, "step": 4434, "train_runtime": 176338.9892, "train_tokens_per_second": 98047.584 }, { "epoch": 0.705087440381558, "grad_norm": 0.21866731345653534, "learning_rate": 1.0032404170246423e-05, "loss": 0.3901, "num_input_tokens_seen": 17293450718, "step": 4435, "train_runtime": 176377.0252, "train_tokens_per_second": 98048.205 }, { "epoch": 0.7052464228934817, "grad_norm": 0.22865527868270874, "learning_rate": 1.00223840483466e-05, "loss": 0.3806, "num_input_tokens_seen": 17297427792, "step": 4436, "train_runtime": 176418.2319, "train_tokens_per_second": 98047.847 }, { "epoch": 0.7054054054054054, "grad_norm": 0.1911623179912567, "learning_rate": 1.0012367678233737e-05, "loss": 0.3858, "num_input_tokens_seen": 17301402449, "step": 4437, "train_runtime": 176458.5016, "train_tokens_per_second": 98047.996 }, { "epoch": 0.7055643879173291, "grad_norm": 0.21534956991672516, "learning_rate": 1.0002355062416858e-05, "loss": 0.3916, "num_input_tokens_seen": 17305314884, "step": 4438, "train_runtime": 176498.6803, "train_tokens_per_second": 98047.843 }, { "epoch": 0.7057233704292528, "grad_norm": 0.19244031608104706, "learning_rate": 9.992346203404068e-06, "loss": 0.3953, "num_input_tokens_seen": 17309076282, "step": 4439, "train_runtime": 176538.2238, "train_tokens_per_second": 98047.187 }, { "epoch": 0.7058823529411765, "grad_norm": 0.20115409791469574, "learning_rate": 9.982341103702489e-06, "loss": 0.3834, "num_input_tokens_seen": 17312994357, "step": 4440, "train_runtime": 176574.8995, "train_tokens_per_second": 98049.012 }, { "epoch": 0.7060413354531001, "grad_norm": 0.2336462289094925, "learning_rate": 9.972339765818359e-06, "loss": 0.3961, "num_input_tokens_seen": 17316915393, "step": 4441, "train_runtime": 176614.5282, "train_tokens_per_second": 98049.212 }, { "epoch": 0.7062003179650238, "grad_norm": 0.2163151353597641, "learning_rate": 9.96234219225692e-06, "loss": 0.3915, "num_input_tokens_seen": 17320851671, "step": 4442, "train_runtime": 176653.1003, "train_tokens_per_second": 98050.086 }, { "epoch": 0.7063593004769475, "grad_norm": 0.20574280619621277, "learning_rate": 9.9523483855225e-06, "loss": 0.3848, "num_input_tokens_seen": 17324724999, "step": 4443, "train_runtime": 176691.7667, "train_tokens_per_second": 98050.551 }, { "epoch": 0.7065182829888712, "grad_norm": 0.25079137086868286, "learning_rate": 9.942358348118478e-06, "loss": 0.3982, "num_input_tokens_seen": 17328540813, "step": 4444, "train_runtime": 176732.5913, "train_tokens_per_second": 98049.492 }, { "epoch": 0.7066772655007949, "grad_norm": 0.17062275111675262, "learning_rate": 9.932372082547284e-06, "loss": 0.379, "num_input_tokens_seen": 17332566198, "step": 4445, "train_runtime": 176769.0872, "train_tokens_per_second": 98052.021 }, { "epoch": 0.7068362480127186, "grad_norm": 0.24060894548892975, "learning_rate": 9.92238959131042e-06, "loss": 0.3829, "num_input_tokens_seen": 17336423430, "step": 4446, "train_runtime": 176807.9566, "train_tokens_per_second": 98052.281 }, { "epoch": 0.7069952305246423, "grad_norm": 0.2057768851518631, "learning_rate": 9.912410876908412e-06, "loss": 0.3919, "num_input_tokens_seen": 17340292572, "step": 4447, "train_runtime": 176847.5951, "train_tokens_per_second": 98052.182 }, { "epoch": 0.707154213036566, "grad_norm": 0.1743878424167633, "learning_rate": 9.902435941840865e-06, "loss": 0.3986, "num_input_tokens_seen": 17344302402, "step": 4448, "train_runtime": 176886.6375, "train_tokens_per_second": 98053.209 }, { "epoch": 0.7073131955484897, "grad_norm": 0.17277701199054718, "learning_rate": 9.892464788606431e-06, "loss": 0.3883, "num_input_tokens_seen": 17348199605, "step": 4449, "train_runtime": 176927.0529, "train_tokens_per_second": 98052.838 }, { "epoch": 0.7074721780604134, "grad_norm": 0.20359300076961517, "learning_rate": 9.88249741970281e-06, "loss": 0.399, "num_input_tokens_seen": 17352176343, "step": 4450, "train_runtime": 176967.6818, "train_tokens_per_second": 98052.798 }, { "epoch": 0.707631160572337, "grad_norm": 0.20332498848438263, "learning_rate": 9.872533837626763e-06, "loss": 0.3923, "num_input_tokens_seen": 17356047254, "step": 4451, "train_runtime": 177009.0611, "train_tokens_per_second": 98051.745 }, { "epoch": 0.7077901430842607, "grad_norm": 0.2103494554758072, "learning_rate": 9.862574044874098e-06, "loss": 0.3904, "num_input_tokens_seen": 17359994087, "step": 4452, "train_runtime": 177049.877, "train_tokens_per_second": 98051.433 }, { "epoch": 0.7079491255961844, "grad_norm": 0.2098589539527893, "learning_rate": 9.852618043939662e-06, "loss": 0.3966, "num_input_tokens_seen": 17363903716, "step": 4453, "train_runtime": 177088.4095, "train_tokens_per_second": 98052.175 }, { "epoch": 0.7081081081081081, "grad_norm": 0.18534784018993378, "learning_rate": 9.842665837317372e-06, "loss": 0.3958, "num_input_tokens_seen": 17367738576, "step": 4454, "train_runtime": 177126.6515, "train_tokens_per_second": 98052.656 }, { "epoch": 0.7082670906200318, "grad_norm": 0.22118344902992249, "learning_rate": 9.83271742750018e-06, "loss": 0.3955, "num_input_tokens_seen": 17371640828, "step": 4455, "train_runtime": 177165.5342, "train_tokens_per_second": 98053.162 }, { "epoch": 0.7084260731319555, "grad_norm": 0.22706469893455505, "learning_rate": 9.822772816980095e-06, "loss": 0.4144, "num_input_tokens_seen": 17375494346, "step": 4456, "train_runtime": 177206.3953, "train_tokens_per_second": 98052.298 }, { "epoch": 0.7085850556438792, "grad_norm": 0.19918674230575562, "learning_rate": 9.81283200824818e-06, "loss": 0.3847, "num_input_tokens_seen": 17379455032, "step": 4457, "train_runtime": 177244.5813, "train_tokens_per_second": 98053.52 }, { "epoch": 0.7087440381558029, "grad_norm": 0.22983285784721375, "learning_rate": 9.80289500379451e-06, "loss": 0.3941, "num_input_tokens_seen": 17383408283, "step": 4458, "train_runtime": 177285.0689, "train_tokens_per_second": 98053.425 }, { "epoch": 0.7089030206677266, "grad_norm": 0.17460541427135468, "learning_rate": 9.792961806108275e-06, "loss": 0.3962, "num_input_tokens_seen": 17387359335, "step": 4459, "train_runtime": 177322.6074, "train_tokens_per_second": 98054.95 }, { "epoch": 0.7090620031796503, "grad_norm": 0.2117106318473816, "learning_rate": 9.783032417677637e-06, "loss": 0.395, "num_input_tokens_seen": 17391238397, "step": 4460, "train_runtime": 177364.5665, "train_tokens_per_second": 98053.623 }, { "epoch": 0.7092209856915739, "grad_norm": 0.19565102458000183, "learning_rate": 9.773106840989851e-06, "loss": 0.3977, "num_input_tokens_seen": 17395005519, "step": 4461, "train_runtime": 177404.6858, "train_tokens_per_second": 98052.684 }, { "epoch": 0.7093799682034976, "grad_norm": 0.1763371378183365, "learning_rate": 9.763185078531212e-06, "loss": 0.3945, "num_input_tokens_seen": 17399012733, "step": 4462, "train_runtime": 177445.4454, "train_tokens_per_second": 98052.743 }, { "epoch": 0.7095389507154213, "grad_norm": 0.1902129054069519, "learning_rate": 9.753267132787026e-06, "loss": 0.3965, "num_input_tokens_seen": 17402931181, "step": 4463, "train_runtime": 177482.8583, "train_tokens_per_second": 98054.152 }, { "epoch": 0.709697933227345, "grad_norm": 0.20010395348072052, "learning_rate": 9.743353006241698e-06, "loss": 0.376, "num_input_tokens_seen": 17406777491, "step": 4464, "train_runtime": 177522.9884, "train_tokens_per_second": 98053.653 }, { "epoch": 0.7098569157392687, "grad_norm": 0.2280399203300476, "learning_rate": 9.733442701378628e-06, "loss": 0.3942, "num_input_tokens_seen": 17410714387, "step": 4465, "train_runtime": 177562.3451, "train_tokens_per_second": 98054.091 }, { "epoch": 0.7100158982511924, "grad_norm": 0.21476148068904877, "learning_rate": 9.723536220680282e-06, "loss": 0.3867, "num_input_tokens_seen": 17414543514, "step": 4466, "train_runtime": 177600.1743, "train_tokens_per_second": 98054.766 }, { "epoch": 0.7101748807631161, "grad_norm": 0.4515811800956726, "learning_rate": 9.713633566628163e-06, "loss": 0.4049, "num_input_tokens_seen": 17418403714, "step": 4467, "train_runtime": 177638.6273, "train_tokens_per_second": 98055.271 }, { "epoch": 0.7103338632750398, "grad_norm": 0.23364806175231934, "learning_rate": 9.703734741702816e-06, "loss": 0.3914, "num_input_tokens_seen": 17422438613, "step": 4468, "train_runtime": 177674.9245, "train_tokens_per_second": 98057.949 }, { "epoch": 0.7104928457869635, "grad_norm": 0.23990191519260406, "learning_rate": 9.693839748383836e-06, "loss": 0.3883, "num_input_tokens_seen": 17426230817, "step": 4469, "train_runtime": 177713.8615, "train_tokens_per_second": 98057.803 }, { "epoch": 0.7106518282988871, "grad_norm": 0.2696344256401062, "learning_rate": 9.683948589149833e-06, "loss": 0.3872, "num_input_tokens_seen": 17430114081, "step": 4470, "train_runtime": 177753.0227, "train_tokens_per_second": 98058.046 }, { "epoch": 0.7108108108108108, "grad_norm": 0.2810855507850647, "learning_rate": 9.674061266478479e-06, "loss": 0.4027, "num_input_tokens_seen": 17434015642, "step": 4471, "train_runtime": 177790.3459, "train_tokens_per_second": 98059.406 }, { "epoch": 0.7109697933227345, "grad_norm": 0.20352062582969666, "learning_rate": 9.664177782846481e-06, "loss": 0.3967, "num_input_tokens_seen": 17438005159, "step": 4472, "train_runtime": 177836.0573, "train_tokens_per_second": 98056.634 }, { "epoch": 0.7111287758346582, "grad_norm": 0.22654704749584198, "learning_rate": 9.654298140729579e-06, "loss": 0.3988, "num_input_tokens_seen": 17441768959, "step": 4473, "train_runtime": 177875.5946, "train_tokens_per_second": 98055.998 }, { "epoch": 0.7112877583465819, "grad_norm": 0.21721386909484863, "learning_rate": 9.644422342602552e-06, "loss": 0.3837, "num_input_tokens_seen": 17445784758, "step": 4474, "train_runtime": 177913.4957, "train_tokens_per_second": 98057.681 }, { "epoch": 0.7114467408585056, "grad_norm": 0.180290088057518, "learning_rate": 9.634550390939232e-06, "loss": 0.3903, "num_input_tokens_seen": 17449644845, "step": 4475, "train_runtime": 177949.5566, "train_tokens_per_second": 98059.502 }, { "epoch": 0.7116057233704293, "grad_norm": 0.22203829884529114, "learning_rate": 9.624682288212452e-06, "loss": 0.3849, "num_input_tokens_seen": 17453503840, "step": 4476, "train_runtime": 177990.2972, "train_tokens_per_second": 98058.738 }, { "epoch": 0.711764705882353, "grad_norm": 0.29162079095840454, "learning_rate": 9.61481803689411e-06, "loss": 0.3872, "num_input_tokens_seen": 17457470953, "step": 4477, "train_runtime": 178031.2089, "train_tokens_per_second": 98058.487 }, { "epoch": 0.7119236883942767, "grad_norm": 0.18850092589855194, "learning_rate": 9.604957639455136e-06, "loss": 0.3974, "num_input_tokens_seen": 17461500284, "step": 4478, "train_runtime": 178068.6326, "train_tokens_per_second": 98060.506 }, { "epoch": 0.7120826709062004, "grad_norm": 0.2415783554315567, "learning_rate": 9.595101098365484e-06, "loss": 0.3967, "num_input_tokens_seen": 17465268086, "step": 4479, "train_runtime": 178108.8112, "train_tokens_per_second": 98059.54 }, { "epoch": 0.712241653418124, "grad_norm": 0.21350817382335663, "learning_rate": 9.585248416094158e-06, "loss": 0.396, "num_input_tokens_seen": 17469087023, "step": 4480, "train_runtime": 178148.8009, "train_tokens_per_second": 98058.965 }, { "epoch": 0.7124006359300477, "grad_norm": 0.19351699948310852, "learning_rate": 9.575399595109164e-06, "loss": 0.391, "num_input_tokens_seen": 17473106546, "step": 4481, "train_runtime": 178189.9346, "train_tokens_per_second": 98058.886 }, { "epoch": 0.7125596184419714, "grad_norm": 0.2501576244831085, "learning_rate": 9.565554637877591e-06, "loss": 0.3967, "num_input_tokens_seen": 17477001263, "step": 4482, "train_runtime": 178228.8435, "train_tokens_per_second": 98059.332 }, { "epoch": 0.712718600953895, "grad_norm": 0.21564140915870667, "learning_rate": 9.555713546865507e-06, "loss": 0.3866, "num_input_tokens_seen": 17480876536, "step": 4483, "train_runtime": 178268.2545, "train_tokens_per_second": 98059.391 }, { "epoch": 0.7128775834658188, "grad_norm": 0.20838384330272675, "learning_rate": 9.545876324538045e-06, "loss": 0.3932, "num_input_tokens_seen": 17484783479, "step": 4484, "train_runtime": 178306.3382, "train_tokens_per_second": 98060.359 }, { "epoch": 0.7130365659777425, "grad_norm": 0.2799793779850006, "learning_rate": 9.536042973359368e-06, "loss": 0.3852, "num_input_tokens_seen": 17488671336, "step": 4485, "train_runtime": 178346.8983, "train_tokens_per_second": 98059.857 }, { "epoch": 0.7131955484896662, "grad_norm": 0.20167799293994904, "learning_rate": 9.526213495792636e-06, "loss": 0.3911, "num_input_tokens_seen": 17492572789, "step": 4486, "train_runtime": 178386.6181, "train_tokens_per_second": 98059.894 }, { "epoch": 0.7133545310015899, "grad_norm": 0.2100210040807724, "learning_rate": 9.516387894300097e-06, "loss": 0.3923, "num_input_tokens_seen": 17496455923, "step": 4487, "train_runtime": 178426.6635, "train_tokens_per_second": 98059.649 }, { "epoch": 0.7135135135135136, "grad_norm": 0.3088065981864929, "learning_rate": 9.506566171342968e-06, "loss": 0.3911, "num_input_tokens_seen": 17500295431, "step": 4488, "train_runtime": 178465.0689, "train_tokens_per_second": 98060.06 }, { "epoch": 0.7136724960254373, "grad_norm": 0.2088637351989746, "learning_rate": 9.49674832938153e-06, "loss": 0.3857, "num_input_tokens_seen": 17504273736, "step": 4489, "train_runtime": 178505.943, "train_tokens_per_second": 98059.893 }, { "epoch": 0.7138314785373608, "grad_norm": 0.21262459456920624, "learning_rate": 9.486934370875086e-06, "loss": 0.3805, "num_input_tokens_seen": 17508090284, "step": 4490, "train_runtime": 178546.6116, "train_tokens_per_second": 98058.933 }, { "epoch": 0.7139904610492845, "grad_norm": 0.18406806886196136, "learning_rate": 9.477124298281961e-06, "loss": 0.3699, "num_input_tokens_seen": 17511971240, "step": 4491, "train_runtime": 178587.9833, "train_tokens_per_second": 98057.948 }, { "epoch": 0.7141494435612082, "grad_norm": 0.2075268030166626, "learning_rate": 9.46731811405952e-06, "loss": 0.3956, "num_input_tokens_seen": 17515930647, "step": 4492, "train_runtime": 178626.5722, "train_tokens_per_second": 98058.931 }, { "epoch": 0.7143084260731319, "grad_norm": 0.20472396910190582, "learning_rate": 9.45751582066412e-06, "loss": 0.385, "num_input_tokens_seen": 17519900285, "step": 4493, "train_runtime": 178668.2192, "train_tokens_per_second": 98058.291 }, { "epoch": 0.7144674085850556, "grad_norm": 0.1945725977420807, "learning_rate": 9.44771742055118e-06, "loss": 0.3922, "num_input_tokens_seen": 17523699045, "step": 4494, "train_runtime": 178706.6616, "train_tokens_per_second": 98058.454 }, { "epoch": 0.7146263910969793, "grad_norm": 0.21554218232631683, "learning_rate": 9.437922916175127e-06, "loss": 0.4024, "num_input_tokens_seen": 17527686049, "step": 4495, "train_runtime": 178747.5762, "train_tokens_per_second": 98058.315 }, { "epoch": 0.714785373608903, "grad_norm": 0.25765466690063477, "learning_rate": 9.428132309989418e-06, "loss": 0.3997, "num_input_tokens_seen": 17531499268, "step": 4496, "train_runtime": 178788.8335, "train_tokens_per_second": 98057.015 }, { "epoch": 0.7149443561208267, "grad_norm": 0.19670283794403076, "learning_rate": 9.41834560444653e-06, "loss": 0.3835, "num_input_tokens_seen": 17535423446, "step": 4497, "train_runtime": 178825.7994, "train_tokens_per_second": 98058.689 }, { "epoch": 0.7151033386327504, "grad_norm": 0.21669389307498932, "learning_rate": 9.408562801997967e-06, "loss": 0.3952, "num_input_tokens_seen": 17539427529, "step": 4498, "train_runtime": 178867.3206, "train_tokens_per_second": 98058.312 }, { "epoch": 0.7152623211446741, "grad_norm": 0.23555174469947815, "learning_rate": 9.398783905094241e-06, "loss": 0.3949, "num_input_tokens_seen": 17543181374, "step": 4499, "train_runtime": 178908.2424, "train_tokens_per_second": 98056.865 }, { "epoch": 0.7154213036565977, "grad_norm": 0.20993371307849884, "learning_rate": 9.389008916184905e-06, "loss": 0.3771, "num_input_tokens_seen": 17547241559, "step": 4500, "train_runtime": 178947.7122, "train_tokens_per_second": 98057.926 }, { "epoch": 0.7155802861685214, "grad_norm": 0.2266187220811844, "learning_rate": 9.37923783771853e-06, "loss": 0.3869, "num_input_tokens_seen": 17551188745, "step": 4501, "train_runtime": 178988.2731, "train_tokens_per_second": 98057.758 }, { "epoch": 0.7157392686804451, "grad_norm": 0.36071741580963135, "learning_rate": 9.369470672142677e-06, "loss": 0.3905, "num_input_tokens_seen": 17554937104, "step": 4502, "train_runtime": 179025.3148, "train_tokens_per_second": 98058.407 }, { "epoch": 0.7158982511923688, "grad_norm": 0.17457468807697296, "learning_rate": 9.359707421903988e-06, "loss": 0.3991, "num_input_tokens_seen": 17558993541, "step": 4503, "train_runtime": 179065.8117, "train_tokens_per_second": 98058.883 }, { "epoch": 0.7160572337042925, "grad_norm": 0.22993694245815277, "learning_rate": 9.349948089448051e-06, "loss": 0.378, "num_input_tokens_seen": 17562827457, "step": 4504, "train_runtime": 179106.408, "train_tokens_per_second": 98058.063 }, { "epoch": 0.7162162162162162, "grad_norm": 0.2305442839860916, "learning_rate": 9.340192677219547e-06, "loss": 0.389, "num_input_tokens_seen": 17566803104, "step": 4505, "train_runtime": 179138.2402, "train_tokens_per_second": 98062.832 }, { "epoch": 0.7163751987281399, "grad_norm": 0.28670695424079895, "learning_rate": 9.330441187662111e-06, "loss": 0.3971, "num_input_tokens_seen": 17570690201, "step": 4506, "train_runtime": 179183.829, "train_tokens_per_second": 98059.575 }, { "epoch": 0.7165341812400636, "grad_norm": 0.18111489713191986, "learning_rate": 9.320693623218433e-06, "loss": 0.3898, "num_input_tokens_seen": 17574454448, "step": 4507, "train_runtime": 179224.0992, "train_tokens_per_second": 98058.545 }, { "epoch": 0.7166931637519873, "grad_norm": 0.196627676486969, "learning_rate": 9.310949986330214e-06, "loss": 0.3985, "num_input_tokens_seen": 17578417394, "step": 4508, "train_runtime": 179267.0556, "train_tokens_per_second": 98057.155 }, { "epoch": 0.7168521462639109, "grad_norm": 0.2144642472267151, "learning_rate": 9.301210279438142e-06, "loss": 0.3931, "num_input_tokens_seen": 17582276378, "step": 4509, "train_runtime": 179304.3434, "train_tokens_per_second": 98058.285 }, { "epoch": 0.7170111287758346, "grad_norm": 0.18678681552410126, "learning_rate": 9.291474504981978e-06, "loss": 0.3895, "num_input_tokens_seen": 17586192623, "step": 4510, "train_runtime": 179344.0285, "train_tokens_per_second": 98058.423 }, { "epoch": 0.7171701112877583, "grad_norm": 0.20374789834022522, "learning_rate": 9.28174266540044e-06, "loss": 0.4048, "num_input_tokens_seen": 17590093161, "step": 4511, "train_runtime": 179384.4484, "train_tokens_per_second": 98058.072 }, { "epoch": 0.717329093799682, "grad_norm": 0.20769768953323364, "learning_rate": 9.272014763131293e-06, "loss": 0.4022, "num_input_tokens_seen": 17594043640, "step": 4512, "train_runtime": 179421.9471, "train_tokens_per_second": 98059.596 }, { "epoch": 0.7174880763116057, "grad_norm": 0.19882066547870636, "learning_rate": 9.262290800611309e-06, "loss": 0.3915, "num_input_tokens_seen": 17597853215, "step": 4513, "train_runtime": 179458.9095, "train_tokens_per_second": 98060.627 }, { "epoch": 0.7176470588235294, "grad_norm": 0.19629648327827454, "learning_rate": 9.252570780276273e-06, "loss": 0.3842, "num_input_tokens_seen": 17601691876, "step": 4514, "train_runtime": 179497.6923, "train_tokens_per_second": 98060.825 }, { "epoch": 0.7178060413354531, "grad_norm": 0.19748951494693756, "learning_rate": 9.242854704560987e-06, "loss": 0.3874, "num_input_tokens_seen": 17605622534, "step": 4515, "train_runtime": 179539.0033, "train_tokens_per_second": 98060.155 }, { "epoch": 0.7179650238473768, "grad_norm": 0.303989052772522, "learning_rate": 9.233142575899242e-06, "loss": 0.4171, "num_input_tokens_seen": 17609550825, "step": 4516, "train_runtime": 179579.271, "train_tokens_per_second": 98060.042 }, { "epoch": 0.7181240063593005, "grad_norm": 0.301893949508667, "learning_rate": 9.223434396723871e-06, "loss": 0.3943, "num_input_tokens_seen": 17613506480, "step": 4517, "train_runtime": 179617.5836, "train_tokens_per_second": 98061.148 }, { "epoch": 0.7182829888712242, "grad_norm": 0.24610945582389832, "learning_rate": 9.213730169466697e-06, "loss": 0.3934, "num_input_tokens_seen": 17617358655, "step": 4518, "train_runtime": 179658.0106, "train_tokens_per_second": 98060.524 }, { "epoch": 0.7184419713831478, "grad_norm": 0.20238395035266876, "learning_rate": 9.204029896558575e-06, "loss": 0.3945, "num_input_tokens_seen": 17621367208, "step": 4519, "train_runtime": 179699.6656, "train_tokens_per_second": 98060.1 }, { "epoch": 0.7186009538950715, "grad_norm": 0.21960344910621643, "learning_rate": 9.194333580429327e-06, "loss": 0.3811, "num_input_tokens_seen": 17625261596, "step": 4520, "train_runtime": 179739.1453, "train_tokens_per_second": 98060.228 }, { "epoch": 0.7187599364069952, "grad_norm": 0.17681357264518738, "learning_rate": 9.184641223507845e-06, "loss": 0.383, "num_input_tokens_seen": 17629085798, "step": 4521, "train_runtime": 179777.5944, "train_tokens_per_second": 98060.528 }, { "epoch": 0.7189189189189189, "grad_norm": 0.3254520297050476, "learning_rate": 9.174952828221972e-06, "loss": 0.3881, "num_input_tokens_seen": 17632962326, "step": 4522, "train_runtime": 179817.7363, "train_tokens_per_second": 98060.195 }, { "epoch": 0.7190779014308426, "grad_norm": 0.18922390043735504, "learning_rate": 9.165268396998589e-06, "loss": 0.4057, "num_input_tokens_seen": 17636944158, "step": 4523, "train_runtime": 179859.2291, "train_tokens_per_second": 98059.712 }, { "epoch": 0.7192368839427663, "grad_norm": 0.2395799458026886, "learning_rate": 9.155587932263588e-06, "loss": 0.3837, "num_input_tokens_seen": 17640931446, "step": 4524, "train_runtime": 179896.2293, "train_tokens_per_second": 98061.708 }, { "epoch": 0.71939586645469, "grad_norm": 0.35159412026405334, "learning_rate": 9.145911436441829e-06, "loss": 0.3847, "num_input_tokens_seen": 17644810116, "step": 4525, "train_runtime": 179935.6017, "train_tokens_per_second": 98061.806 }, { "epoch": 0.7195548489666137, "grad_norm": 0.2388232797384262, "learning_rate": 9.136238911957243e-06, "loss": 0.3764, "num_input_tokens_seen": 17648704205, "step": 4526, "train_runtime": 179973.66, "train_tokens_per_second": 98062.707 }, { "epoch": 0.7197138314785374, "grad_norm": 0.1803322583436966, "learning_rate": 9.126570361232694e-06, "loss": 0.3677, "num_input_tokens_seen": 17652644107, "step": 4527, "train_runtime": 180014.1597, "train_tokens_per_second": 98062.531 }, { "epoch": 0.7198728139904611, "grad_norm": 0.31589311361312866, "learning_rate": 9.11690578669012e-06, "loss": 0.3905, "num_input_tokens_seen": 17656605992, "step": 4528, "train_runtime": 180052.8635, "train_tokens_per_second": 98063.456 }, { "epoch": 0.7200317965023847, "grad_norm": 0.20754536986351013, "learning_rate": 9.107245190750401e-06, "loss": 0.3989, "num_input_tokens_seen": 17660505529, "step": 4529, "train_runtime": 180092.243, "train_tokens_per_second": 98063.666 }, { "epoch": 0.7201907790143084, "grad_norm": 0.20012280344963074, "learning_rate": 9.097588575833458e-06, "loss": 0.3902, "num_input_tokens_seen": 17664398977, "step": 4530, "train_runtime": 180131.9924, "train_tokens_per_second": 98063.641 }, { "epoch": 0.7203497615262321, "grad_norm": 0.26055479049682617, "learning_rate": 9.087935944358214e-06, "loss": 0.3962, "num_input_tokens_seen": 17668276461, "step": 4531, "train_runtime": 180174.0955, "train_tokens_per_second": 98062.246 }, { "epoch": 0.7205087440381558, "grad_norm": 0.24759571254253387, "learning_rate": 9.078287298742558e-06, "loss": 0.3996, "num_input_tokens_seen": 17672254309, "step": 4532, "train_runtime": 180214.9618, "train_tokens_per_second": 98062.082 }, { "epoch": 0.7206677265500795, "grad_norm": 0.19599886238574982, "learning_rate": 9.06864264140344e-06, "loss": 0.3978, "num_input_tokens_seen": 17676096278, "step": 4533, "train_runtime": 180255.4819, "train_tokens_per_second": 98061.352 }, { "epoch": 0.7208267090620032, "grad_norm": 0.2109650820493698, "learning_rate": 9.05900197475676e-06, "loss": 0.3886, "num_input_tokens_seen": 17680062204, "step": 4534, "train_runtime": 180295.6309, "train_tokens_per_second": 98061.512 }, { "epoch": 0.7209856915739269, "grad_norm": 0.22224850952625275, "learning_rate": 9.049365301217438e-06, "loss": 0.3981, "num_input_tokens_seen": 17683940202, "step": 4535, "train_runtime": 180333.5958, "train_tokens_per_second": 98062.372 }, { "epoch": 0.7211446740858506, "grad_norm": 0.21439939737319946, "learning_rate": 9.039732623199395e-06, "loss": 0.4, "num_input_tokens_seen": 17687799627, "step": 4536, "train_runtime": 180373.8113, "train_tokens_per_second": 98061.905 }, { "epoch": 0.7213036565977743, "grad_norm": 0.20596785843372345, "learning_rate": 9.03010394311555e-06, "loss": 0.3935, "num_input_tokens_seen": 17691760032, "step": 4537, "train_runtime": 180411.0551, "train_tokens_per_second": 98063.614 }, { "epoch": 0.7214626391096979, "grad_norm": 0.21518968045711517, "learning_rate": 9.020479263377826e-06, "loss": 0.3831, "num_input_tokens_seen": 17695563572, "step": 4538, "train_runtime": 180450.2602, "train_tokens_per_second": 98063.386 }, { "epoch": 0.7216216216216216, "grad_norm": 0.21040865778923035, "learning_rate": 9.010858586397122e-06, "loss": 0.3876, "num_input_tokens_seen": 17699425359, "step": 4539, "train_runtime": 180490.6296, "train_tokens_per_second": 98062.849 }, { "epoch": 0.7217806041335453, "grad_norm": 0.18024951219558716, "learning_rate": 9.001241914583356e-06, "loss": 0.3947, "num_input_tokens_seen": 17703412540, "step": 4540, "train_runtime": 180529.5139, "train_tokens_per_second": 98063.813 }, { "epoch": 0.721939586645469, "grad_norm": 0.38254377245903015, "learning_rate": 8.991629250345437e-06, "loss": 0.3837, "num_input_tokens_seen": 17707329551, "step": 4541, "train_runtime": 180567.4211, "train_tokens_per_second": 98064.919 }, { "epoch": 0.7220985691573927, "grad_norm": 0.20364715158939362, "learning_rate": 8.98202059609128e-06, "loss": 0.3794, "num_input_tokens_seen": 17711206476, "step": 4542, "train_runtime": 180605.3897, "train_tokens_per_second": 98065.769 }, { "epoch": 0.7222575516693164, "grad_norm": 0.2166081964969635, "learning_rate": 8.972415954227759e-06, "loss": 0.3977, "num_input_tokens_seen": 17715070342, "step": 4543, "train_runtime": 180643.926, "train_tokens_per_second": 98066.239 }, { "epoch": 0.7224165341812401, "grad_norm": 0.18433228135108948, "learning_rate": 8.9628153271608e-06, "loss": 0.3943, "num_input_tokens_seen": 17718887531, "step": 4544, "train_runtime": 180683.6245, "train_tokens_per_second": 98065.819 }, { "epoch": 0.7225755166931638, "grad_norm": 0.27130982279777527, "learning_rate": 8.953218717295269e-06, "loss": 0.3911, "num_input_tokens_seen": 17722822807, "step": 4545, "train_runtime": 180722.5942, "train_tokens_per_second": 98066.448 }, { "epoch": 0.7227344992050875, "grad_norm": 0.26816293597221375, "learning_rate": 8.943626127035057e-06, "loss": 0.3975, "num_input_tokens_seen": 17726685102, "step": 4546, "train_runtime": 180762.4096, "train_tokens_per_second": 98066.214 }, { "epoch": 0.7228934817170112, "grad_norm": 0.22941556572914124, "learning_rate": 8.93403755878304e-06, "loss": 0.3852, "num_input_tokens_seen": 17730674108, "step": 4547, "train_runtime": 180801.8022, "train_tokens_per_second": 98066.91 }, { "epoch": 0.7230524642289348, "grad_norm": 0.19509534537792206, "learning_rate": 8.924453014941089e-06, "loss": 0.3939, "num_input_tokens_seen": 17734557997, "step": 4548, "train_runtime": 180842.3631, "train_tokens_per_second": 98066.392 }, { "epoch": 0.7232114467408585, "grad_norm": 0.1860765814781189, "learning_rate": 8.914872497910068e-06, "loss": 0.3908, "num_input_tokens_seen": 17738451152, "step": 4549, "train_runtime": 180882.0378, "train_tokens_per_second": 98066.405 }, { "epoch": 0.7233704292527822, "grad_norm": 0.17985782027244568, "learning_rate": 8.905296010089809e-06, "loss": 0.3914, "num_input_tokens_seen": 17742458295, "step": 4550, "train_runtime": 180921.6658, "train_tokens_per_second": 98067.073 }, { "epoch": 0.7235294117647059, "grad_norm": 0.21792785823345184, "learning_rate": 8.89572355387918e-06, "loss": 0.4027, "num_input_tokens_seen": 17746317844, "step": 4551, "train_runtime": 180961.2481, "train_tokens_per_second": 98066.951 }, { "epoch": 0.7236883942766296, "grad_norm": 0.18727044761180878, "learning_rate": 8.886155131676e-06, "loss": 0.3903, "num_input_tokens_seen": 17750198397, "step": 4552, "train_runtime": 181001.1595, "train_tokens_per_second": 98066.766 }, { "epoch": 0.7238473767885533, "grad_norm": 0.1949104219675064, "learning_rate": 8.876590745877091e-06, "loss": 0.3816, "num_input_tokens_seen": 17754061075, "step": 4553, "train_runtime": 181038.6748, "train_tokens_per_second": 98067.781 }, { "epoch": 0.724006359300477, "grad_norm": 0.25568315386772156, "learning_rate": 8.867030398878268e-06, "loss": 0.3918, "num_input_tokens_seen": 17757946728, "step": 4554, "train_runtime": 181080.0416, "train_tokens_per_second": 98066.836 }, { "epoch": 0.7241653418124007, "grad_norm": 0.21879033744335175, "learning_rate": 8.857474093074325e-06, "loss": 0.4021, "num_input_tokens_seen": 17761911771, "step": 4555, "train_runtime": 181118.3619, "train_tokens_per_second": 98067.979 }, { "epoch": 0.7243243243243244, "grad_norm": 0.21180492639541626, "learning_rate": 8.847921830859066e-06, "loss": 0.3898, "num_input_tokens_seen": 17765827051, "step": 4556, "train_runtime": 181159.4286, "train_tokens_per_second": 98067.361 }, { "epoch": 0.7244833068362481, "grad_norm": 0.33556920289993286, "learning_rate": 8.83837361462524e-06, "loss": 0.3811, "num_input_tokens_seen": 17769691563, "step": 4557, "train_runtime": 181199.5186, "train_tokens_per_second": 98066.991 }, { "epoch": 0.7246422893481717, "grad_norm": 0.21007153391838074, "learning_rate": 8.828829446764621e-06, "loss": 0.3895, "num_input_tokens_seen": 17773558694, "step": 4558, "train_runtime": 181239.9108, "train_tokens_per_second": 98066.472 }, { "epoch": 0.7248012718600954, "grad_norm": 0.18884722888469696, "learning_rate": 8.819289329667954e-06, "loss": 0.3982, "num_input_tokens_seen": 17777487078, "step": 4559, "train_runtime": 181279.4987, "train_tokens_per_second": 98066.727 }, { "epoch": 0.724960254372019, "grad_norm": 0.19556017220020294, "learning_rate": 8.809753265724979e-06, "loss": 0.383, "num_input_tokens_seen": 17781468372, "step": 4560, "train_runtime": 181318.6251, "train_tokens_per_second": 98067.523 }, { "epoch": 0.7251192368839428, "grad_norm": 0.2714522182941437, "learning_rate": 8.80022125732439e-06, "loss": 0.3982, "num_input_tokens_seen": 17785411047, "step": 4561, "train_runtime": 181358.6715, "train_tokens_per_second": 98067.608 }, { "epoch": 0.7252782193958665, "grad_norm": 0.20204591751098633, "learning_rate": 8.79069330685392e-06, "loss": 0.3978, "num_input_tokens_seen": 17789296043, "step": 4562, "train_runtime": 181400.0458, "train_tokens_per_second": 98066.657 }, { "epoch": 0.7254372019077902, "grad_norm": 1.3829108476638794, "learning_rate": 8.781169416700227e-06, "loss": 0.3793, "num_input_tokens_seen": 17793224046, "step": 4563, "train_runtime": 181440.2276, "train_tokens_per_second": 98066.588 }, { "epoch": 0.7255961844197139, "grad_norm": 0.7116141319274902, "learning_rate": 8.771649589248986e-06, "loss": 0.3897, "num_input_tokens_seen": 17797134271, "step": 4564, "train_runtime": 181479.9779, "train_tokens_per_second": 98066.654 }, { "epoch": 0.7257551669316376, "grad_norm": 0.3123117983341217, "learning_rate": 8.762133826884855e-06, "loss": 0.3934, "num_input_tokens_seen": 17801123858, "step": 4565, "train_runtime": 181520.9346, "train_tokens_per_second": 98066.506 }, { "epoch": 0.7259141494435613, "grad_norm": 0.20293986797332764, "learning_rate": 8.752622131991445e-06, "loss": 0.4014, "num_input_tokens_seen": 17805090256, "step": 4566, "train_runtime": 181560.3926, "train_tokens_per_second": 98067.04 }, { "epoch": 0.726073131955485, "grad_norm": 0.28625792264938354, "learning_rate": 8.743114506951393e-06, "loss": 0.3796, "num_input_tokens_seen": 17808934408, "step": 4567, "train_runtime": 181598.222, "train_tokens_per_second": 98067.78 }, { "epoch": 0.7262321144674085, "grad_norm": 0.2058645635843277, "learning_rate": 8.733610954146268e-06, "loss": 0.3872, "num_input_tokens_seen": 17812790325, "step": 4568, "train_runtime": 181637.5017, "train_tokens_per_second": 98067.801 }, { "epoch": 0.7263910969793322, "grad_norm": 0.1900797188282013, "learning_rate": 8.72411147595667e-06, "loss": 0.3878, "num_input_tokens_seen": 17816724861, "step": 4569, "train_runtime": 181677.4601, "train_tokens_per_second": 98067.888 }, { "epoch": 0.7265500794912559, "grad_norm": 0.25085166096687317, "learning_rate": 8.714616074762128e-06, "loss": 0.3755, "num_input_tokens_seen": 17820618584, "step": 4570, "train_runtime": 181716.5171, "train_tokens_per_second": 98068.238 }, { "epoch": 0.7267090620031796, "grad_norm": 0.21827970445156097, "learning_rate": 8.70512475294118e-06, "loss": 0.3847, "num_input_tokens_seen": 17824311771, "step": 4571, "train_runtime": 181757.1184, "train_tokens_per_second": 98066.65 }, { "epoch": 0.7268680445151033, "grad_norm": 0.20230001211166382, "learning_rate": 8.695637512871343e-06, "loss": 0.3866, "num_input_tokens_seen": 17828288067, "step": 4572, "train_runtime": 181796.0028, "train_tokens_per_second": 98067.547 }, { "epoch": 0.727027027027027, "grad_norm": 0.25859707593917847, "learning_rate": 8.686154356929083e-06, "loss": 0.3923, "num_input_tokens_seen": 17832290380, "step": 4573, "train_runtime": 181834.6948, "train_tokens_per_second": 98068.69 }, { "epoch": 0.7271860095389507, "grad_norm": 0.17628563940525055, "learning_rate": 8.676675287489893e-06, "loss": 0.3721, "num_input_tokens_seen": 17836159818, "step": 4574, "train_runtime": 181872.0747, "train_tokens_per_second": 98069.81 }, { "epoch": 0.7273449920508744, "grad_norm": 0.22708851099014282, "learning_rate": 8.66720030692819e-06, "loss": 0.4038, "num_input_tokens_seen": 17839972096, "step": 4575, "train_runtime": 181911.1301, "train_tokens_per_second": 98069.712 }, { "epoch": 0.7275039745627981, "grad_norm": 0.22688286006450653, "learning_rate": 8.657729417617393e-06, "loss": 0.3914, "num_input_tokens_seen": 17843747180, "step": 4576, "train_runtime": 181950.0938, "train_tokens_per_second": 98069.459 }, { "epoch": 0.7276629570747217, "grad_norm": 0.24266742169857025, "learning_rate": 8.6482626219299e-06, "loss": 0.3928, "num_input_tokens_seen": 17847771412, "step": 4577, "train_runtime": 181988.0628, "train_tokens_per_second": 98071.11 }, { "epoch": 0.7278219395866454, "grad_norm": 0.2290036678314209, "learning_rate": 8.63879992223708e-06, "loss": 0.3855, "num_input_tokens_seen": 17851664671, "step": 4578, "train_runtime": 182028.8981, "train_tokens_per_second": 98070.498 }, { "epoch": 0.7279809220985691, "grad_norm": 0.21295587718486786, "learning_rate": 8.629341320909256e-06, "loss": 0.3851, "num_input_tokens_seen": 17855485714, "step": 4579, "train_runtime": 182067.4133, "train_tokens_per_second": 98070.739 }, { "epoch": 0.7281399046104928, "grad_norm": 0.5116291046142578, "learning_rate": 8.619886820315753e-06, "loss": 0.387, "num_input_tokens_seen": 17859468666, "step": 4580, "train_runtime": 182106.2415, "train_tokens_per_second": 98071.7 }, { "epoch": 0.7282988871224165, "grad_norm": 0.24840852618217468, "learning_rate": 8.610436422824852e-06, "loss": 0.3794, "num_input_tokens_seen": 17863224007, "step": 4581, "train_runtime": 182144.0838, "train_tokens_per_second": 98071.942 }, { "epoch": 0.7284578696343402, "grad_norm": 0.23297818005084991, "learning_rate": 8.600990130803816e-06, "loss": 0.3877, "num_input_tokens_seen": 17867176912, "step": 4582, "train_runtime": 182184.7159, "train_tokens_per_second": 98071.766 }, { "epoch": 0.7286168521462639, "grad_norm": 0.23726145923137665, "learning_rate": 8.591547946618874e-06, "loss": 0.3965, "num_input_tokens_seen": 17871141714, "step": 4583, "train_runtime": 182225.0025, "train_tokens_per_second": 98071.842 }, { "epoch": 0.7287758346581876, "grad_norm": 0.19810518622398376, "learning_rate": 8.582109872635211e-06, "loss": 0.4048, "num_input_tokens_seen": 17875060990, "step": 4584, "train_runtime": 182265.1858, "train_tokens_per_second": 98071.724 }, { "epoch": 0.7289348171701113, "grad_norm": 0.19997374713420868, "learning_rate": 8.572675911217026e-06, "loss": 0.393, "num_input_tokens_seen": 17878944505, "step": 4585, "train_runtime": 182301.9406, "train_tokens_per_second": 98073.254 }, { "epoch": 0.729093799682035, "grad_norm": 0.22901853919029236, "learning_rate": 8.563246064727434e-06, "loss": 0.3923, "num_input_tokens_seen": 17882758163, "step": 4586, "train_runtime": 182340.7148, "train_tokens_per_second": 98073.314 }, { "epoch": 0.7292527821939586, "grad_norm": 0.19994208216667175, "learning_rate": 8.553820335528557e-06, "loss": 0.3953, "num_input_tokens_seen": 17886744171, "step": 4587, "train_runtime": 182386.0366, "train_tokens_per_second": 98070.798 }, { "epoch": 0.7294117647058823, "grad_norm": 0.24306146800518036, "learning_rate": 8.544398725981478e-06, "loss": 0.3846, "num_input_tokens_seen": 17890706789, "step": 4588, "train_runtime": 182421.2511, "train_tokens_per_second": 98073.589 }, { "epoch": 0.729570747217806, "grad_norm": 0.18083226680755615, "learning_rate": 8.53498123844622e-06, "loss": 0.3857, "num_input_tokens_seen": 17894433409, "step": 4589, "train_runtime": 182458.9764, "train_tokens_per_second": 98073.736 }, { "epoch": 0.7297297297297297, "grad_norm": 0.26642999053001404, "learning_rate": 8.525567875281835e-06, "loss": 0.3941, "num_input_tokens_seen": 17898309623, "step": 4590, "train_runtime": 182499.2346, "train_tokens_per_second": 98073.341 }, { "epoch": 0.7298887122416534, "grad_norm": 0.22447296977043152, "learning_rate": 8.51615863884626e-06, "loss": 0.4044, "num_input_tokens_seen": 17902295437, "step": 4591, "train_runtime": 182538.2974, "train_tokens_per_second": 98074.189 }, { "epoch": 0.7300476947535771, "grad_norm": 0.18423038721084595, "learning_rate": 8.506753531496489e-06, "loss": 0.3925, "num_input_tokens_seen": 17906319732, "step": 4592, "train_runtime": 182579.0897, "train_tokens_per_second": 98074.318 }, { "epoch": 0.7302066772655008, "grad_norm": 0.19359397888183594, "learning_rate": 8.497352555588398e-06, "loss": 0.4, "num_input_tokens_seen": 17910031904, "step": 4593, "train_runtime": 182618.4977, "train_tokens_per_second": 98073.482 }, { "epoch": 0.7303656597774245, "grad_norm": 0.21822668612003326, "learning_rate": 8.487955713476881e-06, "loss": 0.4041, "num_input_tokens_seen": 17913921886, "step": 4594, "train_runtime": 182656.7484, "train_tokens_per_second": 98074.241 }, { "epoch": 0.7305246422893482, "grad_norm": 0.23316943645477295, "learning_rate": 8.478563007515784e-06, "loss": 0.3843, "num_input_tokens_seen": 17917914413, "step": 4595, "train_runtime": 182694.3852, "train_tokens_per_second": 98075.89 }, { "epoch": 0.7306836248012719, "grad_norm": 0.23464351892471313, "learning_rate": 8.4691744400579e-06, "loss": 0.3893, "num_input_tokens_seen": 17921758862, "step": 4596, "train_runtime": 182731.6898, "train_tokens_per_second": 98076.907 }, { "epoch": 0.7308426073131955, "grad_norm": 0.19316500425338745, "learning_rate": 8.459790013455004e-06, "loss": 0.398, "num_input_tokens_seen": 17925605610, "step": 4597, "train_runtime": 182771.3453, "train_tokens_per_second": 98076.674 }, { "epoch": 0.7310015898251192, "grad_norm": 0.22345556318759918, "learning_rate": 8.450409730057834e-06, "loss": 0.3847, "num_input_tokens_seen": 17929554402, "step": 4598, "train_runtime": 182812.3509, "train_tokens_per_second": 98076.275 }, { "epoch": 0.7311605723370429, "grad_norm": 0.2135506570339203, "learning_rate": 8.44103359221608e-06, "loss": 0.394, "num_input_tokens_seen": 17933398710, "step": 4599, "train_runtime": 182853.0481, "train_tokens_per_second": 98075.47 }, { "epoch": 0.7313195548489666, "grad_norm": 0.1879250705242157, "learning_rate": 8.4316616022784e-06, "loss": 0.3868, "num_input_tokens_seen": 17937289555, "step": 4600, "train_runtime": 182890.5408, "train_tokens_per_second": 98076.639 }, { "epoch": 0.7314785373608903, "grad_norm": 0.2506811022758484, "learning_rate": 8.42229376259242e-06, "loss": 0.3986, "num_input_tokens_seen": 17941258904, "step": 4601, "train_runtime": 183039.4297, "train_tokens_per_second": 98018.547 }, { "epoch": 0.731637519872814, "grad_norm": 0.2096242606639862, "learning_rate": 8.412930075504699e-06, "loss": 0.3916, "num_input_tokens_seen": 17945245614, "step": 4602, "train_runtime": 183076.8173, "train_tokens_per_second": 98020.306 }, { "epoch": 0.7317965023847377, "grad_norm": 0.26177719235420227, "learning_rate": 8.403570543360783e-06, "loss": 0.3938, "num_input_tokens_seen": 17949220454, "step": 4603, "train_runtime": 183114.0018, "train_tokens_per_second": 98022.108 }, { "epoch": 0.7319554848966614, "grad_norm": 0.25446364283561707, "learning_rate": 8.394215168505171e-06, "loss": 0.3896, "num_input_tokens_seen": 17953046897, "step": 4604, "train_runtime": 183154.2684, "train_tokens_per_second": 98021.45 }, { "epoch": 0.7321144674085851, "grad_norm": 0.24381136894226074, "learning_rate": 8.384863953281317e-06, "loss": 0.3826, "num_input_tokens_seen": 17956945577, "step": 4605, "train_runtime": 183194.2617, "train_tokens_per_second": 98021.332 }, { "epoch": 0.7322734499205087, "grad_norm": 0.3016887605190277, "learning_rate": 8.375516900031643e-06, "loss": 0.3761, "num_input_tokens_seen": 17960880095, "step": 4606, "train_runtime": 183234.6996, "train_tokens_per_second": 98021.172 }, { "epoch": 0.7324324324324324, "grad_norm": 0.277298241853714, "learning_rate": 8.366174011097494e-06, "loss": 0.379, "num_input_tokens_seen": 17964668867, "step": 4607, "train_runtime": 183275.0593, "train_tokens_per_second": 98020.259 }, { "epoch": 0.7325914149443561, "grad_norm": 0.22725079953670502, "learning_rate": 8.356835288819231e-06, "loss": 0.3868, "num_input_tokens_seen": 17968587795, "step": 4608, "train_runtime": 183315.7471, "train_tokens_per_second": 98019.881 }, { "epoch": 0.7327503974562798, "grad_norm": 0.18520067632198334, "learning_rate": 8.347500735536112e-06, "loss": 0.389, "num_input_tokens_seen": 17972468845, "step": 4609, "train_runtime": 183356.1454, "train_tokens_per_second": 98019.452 }, { "epoch": 0.7329093799682035, "grad_norm": 0.19221177697181702, "learning_rate": 8.33817035358639e-06, "loss": 0.382, "num_input_tokens_seen": 17976377051, "step": 4610, "train_runtime": 183395.6295, "train_tokens_per_second": 98019.659 }, { "epoch": 0.7330683624801272, "grad_norm": 0.19343985617160797, "learning_rate": 8.328844145307263e-06, "loss": 0.3814, "num_input_tokens_seen": 17980413826, "step": 4611, "train_runtime": 183434.8015, "train_tokens_per_second": 98020.734 }, { "epoch": 0.7332273449920509, "grad_norm": 0.18840065598487854, "learning_rate": 8.319522113034855e-06, "loss": 0.3803, "num_input_tokens_seen": 17984252443, "step": 4612, "train_runtime": 183474.67, "train_tokens_per_second": 98020.356 }, { "epoch": 0.7333863275039746, "grad_norm": 0.22267033159732819, "learning_rate": 8.310204259104307e-06, "loss": 0.3865, "num_input_tokens_seen": 17988136827, "step": 4613, "train_runtime": 183512.624, "train_tokens_per_second": 98021.25 }, { "epoch": 0.7335453100158983, "grad_norm": 0.20385582745075226, "learning_rate": 8.300890585849636e-06, "loss": 0.3975, "num_input_tokens_seen": 17992109421, "step": 4614, "train_runtime": 183549.9688, "train_tokens_per_second": 98022.95 }, { "epoch": 0.733704292527822, "grad_norm": 0.2185019701719284, "learning_rate": 8.29158109560389e-06, "loss": 0.3818, "num_input_tokens_seen": 17995820071, "step": 4615, "train_runtime": 183589.983, "train_tokens_per_second": 98021.797 }, { "epoch": 0.7338632750397456, "grad_norm": 0.18572306632995605, "learning_rate": 8.282275790699004e-06, "loss": 0.3817, "num_input_tokens_seen": 17999723885, "step": 4616, "train_runtime": 183630.6728, "train_tokens_per_second": 98021.336 }, { "epoch": 0.7340222575516693, "grad_norm": 0.24239160120487213, "learning_rate": 8.272974673465897e-06, "loss": 0.3961, "num_input_tokens_seen": 18003773628, "step": 4617, "train_runtime": 183670.5319, "train_tokens_per_second": 98022.113 }, { "epoch": 0.734181240063593, "grad_norm": 0.20640826225280762, "learning_rate": 8.263677746234447e-06, "loss": 0.3928, "num_input_tokens_seen": 18007620697, "step": 4618, "train_runtime": 183711.5885, "train_tokens_per_second": 98021.147 }, { "epoch": 0.7343402225755167, "grad_norm": 0.461261123418808, "learning_rate": 8.254385011333446e-06, "loss": 0.3898, "num_input_tokens_seen": 18011461931, "step": 4619, "train_runtime": 183749.677, "train_tokens_per_second": 98021.734 }, { "epoch": 0.7344992050874404, "grad_norm": 0.23270629346370697, "learning_rate": 8.24509647109067e-06, "loss": 0.3832, "num_input_tokens_seen": 18015211869, "step": 4620, "train_runtime": 183787.69, "train_tokens_per_second": 98021.864 }, { "epoch": 0.7346581875993641, "grad_norm": 2.9277238845825195, "learning_rate": 8.235812127832831e-06, "loss": 0.3835, "num_input_tokens_seen": 18019197226, "step": 4621, "train_runtime": 183830.3791, "train_tokens_per_second": 98020.78 }, { "epoch": 0.7348171701112878, "grad_norm": 0.2048233449459076, "learning_rate": 8.226531983885594e-06, "loss": 0.3913, "num_input_tokens_seen": 18023124481, "step": 4622, "train_runtime": 183868.4326, "train_tokens_per_second": 98021.853 }, { "epoch": 0.7349761526232115, "grad_norm": 0.18876133859157562, "learning_rate": 8.217256041573568e-06, "loss": 0.393, "num_input_tokens_seen": 18026893457, "step": 4623, "train_runtime": 183908.9039, "train_tokens_per_second": 98020.776 }, { "epoch": 0.7351351351351352, "grad_norm": 0.18176674842834473, "learning_rate": 8.207984303220317e-06, "loss": 0.3768, "num_input_tokens_seen": 18030727583, "step": 4624, "train_runtime": 183948.6502, "train_tokens_per_second": 98020.44 }, { "epoch": 0.7352941176470589, "grad_norm": 0.2094569057226181, "learning_rate": 8.198716771148331e-06, "loss": 0.3988, "num_input_tokens_seen": 18034707906, "step": 4625, "train_runtime": 183989.3127, "train_tokens_per_second": 98020.41 }, { "epoch": 0.7354531001589825, "grad_norm": 0.18048155307769775, "learning_rate": 8.189453447679074e-06, "loss": 0.3901, "num_input_tokens_seen": 18038655461, "step": 4626, "train_runtime": 184028.8838, "train_tokens_per_second": 98020.784 }, { "epoch": 0.7356120826709062, "grad_norm": 0.29761481285095215, "learning_rate": 8.180194335132935e-06, "loss": 0.3877, "num_input_tokens_seen": 18042471122, "step": 4627, "train_runtime": 184067.932, "train_tokens_per_second": 98020.719 }, { "epoch": 0.7357710651828299, "grad_norm": 0.20581617951393127, "learning_rate": 8.170939435829261e-06, "loss": 0.388, "num_input_tokens_seen": 18046408715, "step": 4628, "train_runtime": 184106.4062, "train_tokens_per_second": 98021.623 }, { "epoch": 0.7359300476947536, "grad_norm": 0.18525098264217377, "learning_rate": 8.16168875208635e-06, "loss": 0.3927, "num_input_tokens_seen": 18050316244, "step": 4629, "train_runtime": 184146.1758, "train_tokens_per_second": 98021.673 }, { "epoch": 0.7360890302066773, "grad_norm": 0.6369149684906006, "learning_rate": 8.152442286221401e-06, "loss": 0.3841, "num_input_tokens_seen": 18054150065, "step": 4630, "train_runtime": 184185.8889, "train_tokens_per_second": 98021.353 }, { "epoch": 0.736248012718601, "grad_norm": 0.24067674577236176, "learning_rate": 8.143200040550627e-06, "loss": 0.3812, "num_input_tokens_seen": 18058104425, "step": 4631, "train_runtime": 184225.8988, "train_tokens_per_second": 98021.53 }, { "epoch": 0.7364069952305247, "grad_norm": 0.21312133967876434, "learning_rate": 8.133962017389119e-06, "loss": 0.3914, "num_input_tokens_seen": 18061993326, "step": 4632, "train_runtime": 184265.0815, "train_tokens_per_second": 98021.791 }, { "epoch": 0.7365659777424484, "grad_norm": 0.190565288066864, "learning_rate": 8.124728219050942e-06, "loss": 0.3941, "num_input_tokens_seen": 18065878161, "step": 4633, "train_runtime": 184302.3728, "train_tokens_per_second": 98023.036 }, { "epoch": 0.7367249602543721, "grad_norm": 0.20699971914291382, "learning_rate": 8.115498647849108e-06, "loss": 0.3873, "num_input_tokens_seen": 18069844023, "step": 4634, "train_runtime": 184343.7119, "train_tokens_per_second": 98022.568 }, { "epoch": 0.7368839427662957, "grad_norm": 0.2258968949317932, "learning_rate": 8.106273306095535e-06, "loss": 0.3931, "num_input_tokens_seen": 18073627834, "step": 4635, "train_runtime": 184384.1637, "train_tokens_per_second": 98021.584 }, { "epoch": 0.7370429252782194, "grad_norm": 0.1915234625339508, "learning_rate": 8.097052196101137e-06, "loss": 0.3806, "num_input_tokens_seen": 18077575538, "step": 4636, "train_runtime": 184421.6825, "train_tokens_per_second": 98023.049 }, { "epoch": 0.737201907790143, "grad_norm": 0.19577060639858246, "learning_rate": 8.087835320175716e-06, "loss": 0.3778, "num_input_tokens_seen": 18081403417, "step": 4637, "train_runtime": 184461.55, "train_tokens_per_second": 98022.615 }, { "epoch": 0.7373608903020668, "grad_norm": 0.24890701472759247, "learning_rate": 8.07862268062804e-06, "loss": 0.3978, "num_input_tokens_seen": 18085415661, "step": 4638, "train_runtime": 184500.9203, "train_tokens_per_second": 98023.444 }, { "epoch": 0.7375198728139905, "grad_norm": 0.20853114128112793, "learning_rate": 8.069414279765808e-06, "loss": 0.4037, "num_input_tokens_seen": 18089324572, "step": 4639, "train_runtime": 184540.356, "train_tokens_per_second": 98023.679 }, { "epoch": 0.7376788553259142, "grad_norm": 0.22936928272247314, "learning_rate": 8.060210119895664e-06, "loss": 0.3871, "num_input_tokens_seen": 18093104414, "step": 4640, "train_runtime": 184578.9469, "train_tokens_per_second": 98023.663 }, { "epoch": 0.7378378378378379, "grad_norm": 0.25346189737319946, "learning_rate": 8.05101020332319e-06, "loss": 0.3847, "num_input_tokens_seen": 18097054964, "step": 4641, "train_runtime": 184620.0442, "train_tokens_per_second": 98023.24 }, { "epoch": 0.7379968203497616, "grad_norm": 0.22502000629901886, "learning_rate": 8.041814532352882e-06, "loss": 0.3858, "num_input_tokens_seen": 18100980325, "step": 4642, "train_runtime": 184657.5008, "train_tokens_per_second": 98024.614 }, { "epoch": 0.7381558028616853, "grad_norm": 0.20832212269306183, "learning_rate": 8.0326231092882e-06, "loss": 0.3871, "num_input_tokens_seen": 18104971978, "step": 4643, "train_runtime": 184694.1659, "train_tokens_per_second": 98026.767 }, { "epoch": 0.738314785373609, "grad_norm": 0.21578149497509003, "learning_rate": 8.023435936431536e-06, "loss": 0.3833, "num_input_tokens_seen": 18108913356, "step": 4644, "train_runtime": 184736.5335, "train_tokens_per_second": 98025.621 }, { "epoch": 0.7384737678855325, "grad_norm": 0.1943684071302414, "learning_rate": 8.014253016084206e-06, "loss": 0.4004, "num_input_tokens_seen": 18112822331, "step": 4645, "train_runtime": 184777.6218, "train_tokens_per_second": 98024.978 }, { "epoch": 0.7386327503974562, "grad_norm": 0.21453669667243958, "learning_rate": 8.005074350546469e-06, "loss": 0.4012, "num_input_tokens_seen": 18116746504, "step": 4646, "train_runtime": 184817.5499, "train_tokens_per_second": 98025.033 }, { "epoch": 0.7387917329093799, "grad_norm": 0.23799172043800354, "learning_rate": 7.995899942117522e-06, "loss": 0.3888, "num_input_tokens_seen": 18120508119, "step": 4647, "train_runtime": 184854.4964, "train_tokens_per_second": 98025.79 }, { "epoch": 0.7389507154213036, "grad_norm": 0.25721558928489685, "learning_rate": 7.986729793095473e-06, "loss": 0.3941, "num_input_tokens_seen": 18124410966, "step": 4648, "train_runtime": 184895.1307, "train_tokens_per_second": 98025.356 }, { "epoch": 0.7391096979332273, "grad_norm": 0.4833729565143585, "learning_rate": 7.977563905777391e-06, "loss": 0.3952, "num_input_tokens_seen": 18128339505, "step": 4649, "train_runtime": 184935.3346, "train_tokens_per_second": 98025.288 }, { "epoch": 0.739268680445151, "grad_norm": 0.2563703656196594, "learning_rate": 7.968402282459264e-06, "loss": 0.3941, "num_input_tokens_seen": 18132256940, "step": 4650, "train_runtime": 184975.5896, "train_tokens_per_second": 98025.134 }, { "epoch": 0.7394276629570747, "grad_norm": 0.24370576441287994, "learning_rate": 7.959244925436013e-06, "loss": 0.3913, "num_input_tokens_seen": 18136091014, "step": 4651, "train_runtime": 185015.1491, "train_tokens_per_second": 98024.897 }, { "epoch": 0.7395866454689984, "grad_norm": 0.24593627452850342, "learning_rate": 7.950091837001503e-06, "loss": 0.3892, "num_input_tokens_seen": 18139994197, "step": 4652, "train_runtime": 185056.6809, "train_tokens_per_second": 98023.99 }, { "epoch": 0.7397456279809221, "grad_norm": 0.21483926475048065, "learning_rate": 7.940943019448488e-06, "loss": 0.3798, "num_input_tokens_seen": 18143798729, "step": 4653, "train_runtime": 185095.5991, "train_tokens_per_second": 98023.934 }, { "epoch": 0.7399046104928458, "grad_norm": 0.17630304396152496, "learning_rate": 7.931798475068717e-06, "loss": 0.3814, "num_input_tokens_seen": 18147780256, "step": 4654, "train_runtime": 185135.6867, "train_tokens_per_second": 98024.214 }, { "epoch": 0.7400635930047694, "grad_norm": 0.18950051069259644, "learning_rate": 7.922658206152811e-06, "loss": 0.3838, "num_input_tokens_seen": 18151669535, "step": 4655, "train_runtime": 185175.8289, "train_tokens_per_second": 98023.968 }, { "epoch": 0.7402225755166931, "grad_norm": 0.2760191261768341, "learning_rate": 7.913522214990352e-06, "loss": 0.3898, "num_input_tokens_seen": 18155578017, "step": 4656, "train_runtime": 185215.1177, "train_tokens_per_second": 98024.277 }, { "epoch": 0.7403815580286168, "grad_norm": 0.6980077624320984, "learning_rate": 7.904390503869839e-06, "loss": 0.3904, "num_input_tokens_seen": 18159513212, "step": 4657, "train_runtime": 185253.3678, "train_tokens_per_second": 98025.28 }, { "epoch": 0.7405405405405405, "grad_norm": 0.21472756564617157, "learning_rate": 7.895263075078696e-06, "loss": 0.3934, "num_input_tokens_seen": 18163496095, "step": 4658, "train_runtime": 185293.0237, "train_tokens_per_second": 98025.796 }, { "epoch": 0.7406995230524642, "grad_norm": 0.2273457944393158, "learning_rate": 7.886139930903298e-06, "loss": 0.3807, "num_input_tokens_seen": 18167344039, "step": 4659, "train_runtime": 185332.8032, "train_tokens_per_second": 98025.518 }, { "epoch": 0.7408585055643879, "grad_norm": 0.21970504522323608, "learning_rate": 7.877021073628905e-06, "loss": 0.3877, "num_input_tokens_seen": 18171188063, "step": 4660, "train_runtime": 185370.7146, "train_tokens_per_second": 98026.207 }, { "epoch": 0.7410174880763116, "grad_norm": 0.22881916165351868, "learning_rate": 7.867906505539735e-06, "loss": 0.397, "num_input_tokens_seen": 18175158554, "step": 4661, "train_runtime": 185410.7183, "train_tokens_per_second": 98026.472 }, { "epoch": 0.7411764705882353, "grad_norm": 0.2874017357826233, "learning_rate": 7.858796228918924e-06, "loss": 0.3974, "num_input_tokens_seen": 18179009485, "step": 4662, "train_runtime": 185450.4163, "train_tokens_per_second": 98026.253 }, { "epoch": 0.741335453100159, "grad_norm": 0.2238074243068695, "learning_rate": 7.849690246048532e-06, "loss": 0.3889, "num_input_tokens_seen": 18182872352, "step": 4663, "train_runtime": 185490.2663, "train_tokens_per_second": 98026.019 }, { "epoch": 0.7414944356120827, "grad_norm": 0.1736130267381668, "learning_rate": 7.840588559209547e-06, "loss": 0.3905, "num_input_tokens_seen": 18186771779, "step": 4664, "train_runtime": 185528.7814, "train_tokens_per_second": 98026.687 }, { "epoch": 0.7416534181240063, "grad_norm": 0.223256915807724, "learning_rate": 7.831491170681875e-06, "loss": 0.3764, "num_input_tokens_seen": 18190604318, "step": 4665, "train_runtime": 185569.2568, "train_tokens_per_second": 98025.959 }, { "epoch": 0.74181240063593, "grad_norm": 0.21682463586330414, "learning_rate": 7.822398082744346e-06, "loss": 0.3901, "num_input_tokens_seen": 18194422225, "step": 4666, "train_runtime": 185609.8803, "train_tokens_per_second": 98025.074 }, { "epoch": 0.7419713831478537, "grad_norm": 0.19391337037086487, "learning_rate": 7.813309297674706e-06, "loss": 0.3875, "num_input_tokens_seen": 18198417972, "step": 4667, "train_runtime": 185650.1901, "train_tokens_per_second": 98025.313 }, { "epoch": 0.7421303656597774, "grad_norm": 0.202061265707016, "learning_rate": 7.804224817749644e-06, "loss": 0.401, "num_input_tokens_seen": 18202339052, "step": 4668, "train_runtime": 185690.2826, "train_tokens_per_second": 98025.264 }, { "epoch": 0.7422893481717011, "grad_norm": 0.20814178884029388, "learning_rate": 7.795144645244754e-06, "loss": 0.3945, "num_input_tokens_seen": 18206083029, "step": 4669, "train_runtime": 185729.2338, "train_tokens_per_second": 98024.865 }, { "epoch": 0.7424483306836248, "grad_norm": 0.2998485863208771, "learning_rate": 7.786068782434563e-06, "loss": 0.3905, "num_input_tokens_seen": 18209881755, "step": 4670, "train_runtime": 185769.6171, "train_tokens_per_second": 98024.004 }, { "epoch": 0.7426073131955485, "grad_norm": 0.20882508158683777, "learning_rate": 7.776997231592484e-06, "loss": 0.3772, "num_input_tokens_seen": 18213905699, "step": 4671, "train_runtime": 185806.2945, "train_tokens_per_second": 98026.311 }, { "epoch": 0.7427662957074722, "grad_norm": 0.3630424439907074, "learning_rate": 7.767929994990914e-06, "loss": 0.395, "num_input_tokens_seen": 18217840057, "step": 4672, "train_runtime": 185845.5164, "train_tokens_per_second": 98026.794 }, { "epoch": 0.7429252782193959, "grad_norm": 0.2189139425754547, "learning_rate": 7.758867074901102e-06, "loss": 0.3849, "num_input_tokens_seen": 18221681667, "step": 4673, "train_runtime": 185885.4758, "train_tokens_per_second": 98026.387 }, { "epoch": 0.7430842607313195, "grad_norm": 0.196206197142601, "learning_rate": 7.749808473593256e-06, "loss": 0.4, "num_input_tokens_seen": 18225580649, "step": 4674, "train_runtime": 185926.5354, "train_tokens_per_second": 98025.71 }, { "epoch": 0.7432432432432432, "grad_norm": 0.1838534027338028, "learning_rate": 7.7407541933365e-06, "loss": 0.3878, "num_input_tokens_seen": 18229396261, "step": 4675, "train_runtime": 185964.7385, "train_tokens_per_second": 98026.09 }, { "epoch": 0.7434022257551669, "grad_norm": 0.19348575174808502, "learning_rate": 7.73170423639884e-06, "loss": 0.3903, "num_input_tokens_seen": 18233340604, "step": 4676, "train_runtime": 186004.5387, "train_tokens_per_second": 98026.321 }, { "epoch": 0.7435612082670906, "grad_norm": 0.20150691270828247, "learning_rate": 7.722658605047264e-06, "loss": 0.3845, "num_input_tokens_seen": 18237165479, "step": 4677, "train_runtime": 186043.1835, "train_tokens_per_second": 98026.518 }, { "epoch": 0.7437201907790143, "grad_norm": 0.24021482467651367, "learning_rate": 7.71361730154761e-06, "loss": 0.3912, "num_input_tokens_seen": 18241120430, "step": 4678, "train_runtime": 186081.9976, "train_tokens_per_second": 98027.325 }, { "epoch": 0.743879173290938, "grad_norm": 0.26595476269721985, "learning_rate": 7.704580328164673e-06, "loss": 0.3858, "num_input_tokens_seen": 18245132481, "step": 4679, "train_runtime": 186122.839, "train_tokens_per_second": 98027.37 }, { "epoch": 0.7440381558028617, "grad_norm": 0.5659416317939758, "learning_rate": 7.695547687162146e-06, "loss": 0.3828, "num_input_tokens_seen": 18248965095, "step": 4680, "train_runtime": 186164.2512, "train_tokens_per_second": 98026.152 }, { "epoch": 0.7441971383147854, "grad_norm": 0.19576172530651093, "learning_rate": 7.686519380802645e-06, "loss": 0.3899, "num_input_tokens_seen": 18252928299, "step": 4681, "train_runtime": 186203.5594, "train_tokens_per_second": 98026.742 }, { "epoch": 0.7443561208267091, "grad_norm": 0.20716965198516846, "learning_rate": 7.677495411347702e-06, "loss": 0.3865, "num_input_tokens_seen": 18256824053, "step": 4682, "train_runtime": 186241.4181, "train_tokens_per_second": 98027.733 }, { "epoch": 0.7445151033386328, "grad_norm": 0.2341262549161911, "learning_rate": 7.668475781057747e-06, "loss": 0.3757, "num_input_tokens_seen": 18260789478, "step": 4683, "train_runtime": 186280.9502, "train_tokens_per_second": 98028.217 }, { "epoch": 0.7446740858505564, "grad_norm": 0.2158580869436264, "learning_rate": 7.659460492192134e-06, "loss": 0.3889, "num_input_tokens_seen": 18264565011, "step": 4684, "train_runtime": 186319.2053, "train_tokens_per_second": 98028.354 }, { "epoch": 0.7448330683624801, "grad_norm": 0.22308869659900665, "learning_rate": 7.650449547009137e-06, "loss": 0.385, "num_input_tokens_seen": 18268491670, "step": 4685, "train_runtime": 186357.9978, "train_tokens_per_second": 98029.019 }, { "epoch": 0.7449920508744038, "grad_norm": 0.18356603384017944, "learning_rate": 7.641442947765928e-06, "loss": 0.379, "num_input_tokens_seen": 18272390013, "step": 4686, "train_runtime": 186395.1108, "train_tokens_per_second": 98030.415 }, { "epoch": 0.7451510333863275, "grad_norm": 0.3458755314350128, "learning_rate": 7.632440696718598e-06, "loss": 0.3886, "num_input_tokens_seen": 18276203621, "step": 4687, "train_runtime": 186432.688, "train_tokens_per_second": 98031.111 }, { "epoch": 0.7453100158982512, "grad_norm": 0.2299545705318451, "learning_rate": 7.62344279612216e-06, "loss": 0.3858, "num_input_tokens_seen": 18280234207, "step": 4688, "train_runtime": 186471.6271, "train_tokens_per_second": 98032.256 }, { "epoch": 0.7454689984101749, "grad_norm": 0.3217381536960602, "learning_rate": 7.614449248230502e-06, "loss": 0.3768, "num_input_tokens_seen": 18284128159, "step": 4689, "train_runtime": 186512.6864, "train_tokens_per_second": 98031.552 }, { "epoch": 0.7456279809220986, "grad_norm": 0.21476617455482483, "learning_rate": 7.605460055296456e-06, "loss": 0.3924, "num_input_tokens_seen": 18287887745, "step": 4690, "train_runtime": 186551.7274, "train_tokens_per_second": 98031.19 }, { "epoch": 0.7457869634340223, "grad_norm": 0.20380350947380066, "learning_rate": 7.59647521957175e-06, "loss": 0.3888, "num_input_tokens_seen": 18291866718, "step": 4691, "train_runtime": 186592.1299, "train_tokens_per_second": 98031.287 }, { "epoch": 0.745945945945946, "grad_norm": 0.21736924350261688, "learning_rate": 7.587494743307022e-06, "loss": 0.3811, "num_input_tokens_seen": 18295748467, "step": 4692, "train_runtime": 186633.3098, "train_tokens_per_second": 98030.456 }, { "epoch": 0.7461049284578697, "grad_norm": 0.2184658944606781, "learning_rate": 7.578518628751824e-06, "loss": 0.3847, "num_input_tokens_seen": 18299611806, "step": 4693, "train_runtime": 186672.2928, "train_tokens_per_second": 98030.68 }, { "epoch": 0.7462639109697933, "grad_norm": 0.1859433799982071, "learning_rate": 7.56954687815459e-06, "loss": 0.3898, "num_input_tokens_seen": 18303468734, "step": 4694, "train_runtime": 186709.505, "train_tokens_per_second": 98031.799 }, { "epoch": 0.746422893481717, "grad_norm": 0.24021312594413757, "learning_rate": 7.560579493762707e-06, "loss": 0.3907, "num_input_tokens_seen": 18307473253, "step": 4695, "train_runtime": 186749.4738, "train_tokens_per_second": 98032.262 }, { "epoch": 0.7465818759936407, "grad_norm": 0.7493414878845215, "learning_rate": 7.551616477822421e-06, "loss": 0.39, "num_input_tokens_seen": 18311339408, "step": 4696, "train_runtime": 186789.4506, "train_tokens_per_second": 98031.979 }, { "epoch": 0.7467408585055644, "grad_norm": 0.2020186334848404, "learning_rate": 7.542657832578912e-06, "loss": 0.3897, "num_input_tokens_seen": 18315271119, "step": 4697, "train_runtime": 186828.2701, "train_tokens_per_second": 98032.654 }, { "epoch": 0.7468998410174881, "grad_norm": 0.20382624864578247, "learning_rate": 7.533703560276262e-06, "loss": 0.3738, "num_input_tokens_seen": 18319064342, "step": 4698, "train_runtime": 186867.7592, "train_tokens_per_second": 98032.236 }, { "epoch": 0.7470588235294118, "grad_norm": 0.1906735599040985, "learning_rate": 7.524753663157433e-06, "loss": 0.3842, "num_input_tokens_seen": 18323096192, "step": 4699, "train_runtime": 186906.1149, "train_tokens_per_second": 98033.69 }, { "epoch": 0.7472178060413355, "grad_norm": 0.19690461456775665, "learning_rate": 7.51580814346434e-06, "loss": 0.4037, "num_input_tokens_seen": 18326912099, "step": 4700, "train_runtime": 186944.1548, "train_tokens_per_second": 98034.154 }, { "epoch": 0.7473767885532592, "grad_norm": 0.2842102348804474, "learning_rate": 7.506867003437751e-06, "loss": 0.3883, "num_input_tokens_seen": 18330733998, "step": 4701, "train_runtime": 186983.6167, "train_tokens_per_second": 98033.904 }, { "epoch": 0.7475357710651829, "grad_norm": 0.23476505279541016, "learning_rate": 7.497930245317364e-06, "loss": 0.3902, "num_input_tokens_seen": 18334605907, "step": 4702, "train_runtime": 187020.1432, "train_tokens_per_second": 98035.461 }, { "epoch": 0.7476947535771065, "grad_norm": 0.22110943496227264, "learning_rate": 7.488997871341774e-06, "loss": 0.386, "num_input_tokens_seen": 18338578799, "step": 4703, "train_runtime": 187060.0461, "train_tokens_per_second": 98035.787 }, { "epoch": 0.7478537360890302, "grad_norm": 0.5689375400543213, "learning_rate": 7.480069883748478e-06, "loss": 0.3771, "num_input_tokens_seen": 18342472105, "step": 4704, "train_runtime": 187097.9616, "train_tokens_per_second": 98036.729 }, { "epoch": 0.7480127186009539, "grad_norm": 0.22705772519111633, "learning_rate": 7.471146284773883e-06, "loss": 0.3905, "num_input_tokens_seen": 18346379350, "step": 4705, "train_runtime": 187137.6266, "train_tokens_per_second": 98036.828 }, { "epoch": 0.7481717011128776, "grad_norm": 0.30266517400741577, "learning_rate": 7.462227076653272e-06, "loss": 0.386, "num_input_tokens_seen": 18350230116, "step": 4706, "train_runtime": 187176.4125, "train_tokens_per_second": 98037.086 }, { "epoch": 0.7483306836248013, "grad_norm": 0.23770925402641296, "learning_rate": 7.453312261620849e-06, "loss": 0.377, "num_input_tokens_seen": 18354231057, "step": 4707, "train_runtime": 187214.2219, "train_tokens_per_second": 98038.658 }, { "epoch": 0.748489666136725, "grad_norm": 0.2132243812084198, "learning_rate": 7.444401841909712e-06, "loss": 0.3924, "num_input_tokens_seen": 18358088589, "step": 4708, "train_runtime": 187254.7157, "train_tokens_per_second": 98038.058 }, { "epoch": 0.7486486486486487, "grad_norm": 0.1897212266921997, "learning_rate": 7.4354958197518616e-06, "loss": 0.3924, "num_input_tokens_seen": 18361964230, "step": 4709, "train_runtime": 187295.0495, "train_tokens_per_second": 98037.638 }, { "epoch": 0.7488076311605724, "grad_norm": 0.35940316319465637, "learning_rate": 7.426594197378189e-06, "loss": 0.3798, "num_input_tokens_seen": 18365826627, "step": 4710, "train_runtime": 187335.766, "train_tokens_per_second": 98036.947 }, { "epoch": 0.7489666136724961, "grad_norm": 0.20684599876403809, "learning_rate": 7.417696977018501e-06, "loss": 0.3828, "num_input_tokens_seen": 18369728165, "step": 4711, "train_runtime": 187371.9339, "train_tokens_per_second": 98038.846 }, { "epoch": 0.7491255961844198, "grad_norm": 0.19967465102672577, "learning_rate": 7.408804160901467e-06, "loss": 0.3883, "num_input_tokens_seen": 18373614674, "step": 4712, "train_runtime": 187412.1375, "train_tokens_per_second": 98038.552 }, { "epoch": 0.7492845786963434, "grad_norm": 0.2161114513874054, "learning_rate": 7.399915751254688e-06, "loss": 0.3892, "num_input_tokens_seen": 18377430669, "step": 4713, "train_runtime": 187452.6729, "train_tokens_per_second": 98037.709 }, { "epoch": 0.749443561208267, "grad_norm": 0.24770209193229675, "learning_rate": 7.39103175030465e-06, "loss": 0.3994, "num_input_tokens_seen": 18381315789, "step": 4714, "train_runtime": 187492.4236, "train_tokens_per_second": 98037.646 }, { "epoch": 0.7496025437201908, "grad_norm": 0.4328847825527191, "learning_rate": 7.382152160276715e-06, "loss": 0.3864, "num_input_tokens_seen": 18385234258, "step": 4715, "train_runtime": 187531.1043, "train_tokens_per_second": 98038.319 }, { "epoch": 0.7497615262321145, "grad_norm": 0.2535432279109955, "learning_rate": 7.3732769833951835e-06, "loss": 0.391, "num_input_tokens_seen": 18389199041, "step": 4716, "train_runtime": 187569.8682, "train_tokens_per_second": 98039.196 }, { "epoch": 0.7499205087440382, "grad_norm": 0.21809902787208557, "learning_rate": 7.364406221883194e-06, "loss": 0.384, "num_input_tokens_seen": 18393085162, "step": 4717, "train_runtime": 187606.8512, "train_tokens_per_second": 98040.584 }, { "epoch": 0.7500794912559618, "grad_norm": 0.22736085951328278, "learning_rate": 7.355539877962844e-06, "loss": 0.3785, "num_input_tokens_seen": 18397003321, "step": 4718, "train_runtime": 187645.7119, "train_tokens_per_second": 98041.16 }, { "epoch": 0.7502384737678855, "grad_norm": 0.2867449223995209, "learning_rate": 7.346677953855063e-06, "loss": 0.3915, "num_input_tokens_seen": 18400863861, "step": 4719, "train_runtime": 187685.8659, "train_tokens_per_second": 98040.754 }, { "epoch": 0.7503974562798092, "grad_norm": 0.20542754232883453, "learning_rate": 7.337820451779712e-06, "loss": 0.3808, "num_input_tokens_seen": 18404755165, "step": 4720, "train_runtime": 187723.399, "train_tokens_per_second": 98041.881 }, { "epoch": 0.750556438791733, "grad_norm": 0.1833629012107849, "learning_rate": 7.328967373955537e-06, "loss": 0.3944, "num_input_tokens_seen": 18408688466, "step": 4721, "train_runtime": 187762.8842, "train_tokens_per_second": 98042.212 }, { "epoch": 0.7507154213036566, "grad_norm": 0.19744527339935303, "learning_rate": 7.32011872260015e-06, "loss": 0.3889, "num_input_tokens_seen": 18412648465, "step": 4722, "train_runtime": 187801.0377, "train_tokens_per_second": 98043.38 }, { "epoch": 0.7508744038155802, "grad_norm": 0.21365691721439362, "learning_rate": 7.311274499930107e-06, "loss": 0.3966, "num_input_tokens_seen": 18416511935, "step": 4723, "train_runtime": 187840.3024, "train_tokens_per_second": 98043.453 }, { "epoch": 0.7510333863275039, "grad_norm": 0.20146727561950684, "learning_rate": 7.302434708160799e-06, "loss": 0.3866, "num_input_tokens_seen": 18420477761, "step": 4724, "train_runtime": 187876.9727, "train_tokens_per_second": 98045.426 }, { "epoch": 0.7511923688394276, "grad_norm": 0.22509858012199402, "learning_rate": 7.293599349506538e-06, "loss": 0.3811, "num_input_tokens_seen": 18424343591, "step": 4725, "train_runtime": 187914.9229, "train_tokens_per_second": 98046.197 }, { "epoch": 0.7513513513513513, "grad_norm": 0.17845746874809265, "learning_rate": 7.284768426180522e-06, "loss": 0.3969, "num_input_tokens_seen": 18428259546, "step": 4726, "train_runtime": 187953.1252, "train_tokens_per_second": 98047.104 }, { "epoch": 0.751510333863275, "grad_norm": 0.2080775946378708, "learning_rate": 7.275941940394834e-06, "loss": 0.3916, "num_input_tokens_seen": 18432167988, "step": 4727, "train_runtime": 187992.3224, "train_tokens_per_second": 98047.451 }, { "epoch": 0.7516693163751987, "grad_norm": 0.21003605425357819, "learning_rate": 7.26711989436045e-06, "loss": 0.3924, "num_input_tokens_seen": 18436098826, "step": 4728, "train_runtime": 188032.4339, "train_tokens_per_second": 98047.44 }, { "epoch": 0.7518282988871224, "grad_norm": 0.18891283869743347, "learning_rate": 7.25830229028722e-06, "loss": 0.3947, "num_input_tokens_seen": 18439954966, "step": 4729, "train_runtime": 188073.1048, "train_tokens_per_second": 98046.741 }, { "epoch": 0.7519872813990461, "grad_norm": 0.2440156191587448, "learning_rate": 7.2494891303838965e-06, "loss": 0.3909, "num_input_tokens_seen": 18443886929, "step": 4730, "train_runtime": 188113.3681, "train_tokens_per_second": 98046.657 }, { "epoch": 0.7521462639109698, "grad_norm": 0.17873679101467133, "learning_rate": 7.240680416858115e-06, "loss": 0.3869, "num_input_tokens_seen": 18447797944, "step": 4731, "train_runtime": 188152.0572, "train_tokens_per_second": 98047.283 }, { "epoch": 0.7523052464228935, "grad_norm": 0.20593711733818054, "learning_rate": 7.231876151916403e-06, "loss": 0.3874, "num_input_tokens_seen": 18451687191, "step": 4732, "train_runtime": 188191.7124, "train_tokens_per_second": 98047.289 }, { "epoch": 0.7524642289348171, "grad_norm": 0.20797091722488403, "learning_rate": 7.223076337764145e-06, "loss": 0.3949, "num_input_tokens_seen": 18455569941, "step": 4733, "train_runtime": 188231.3695, "train_tokens_per_second": 98047.26 }, { "epoch": 0.7526232114467408, "grad_norm": 0.18973524868488312, "learning_rate": 7.214280976605661e-06, "loss": 0.3872, "num_input_tokens_seen": 18459489925, "step": 4734, "train_runtime": 188268.7105, "train_tokens_per_second": 98048.634 }, { "epoch": 0.7527821939586645, "grad_norm": 0.19082927703857422, "learning_rate": 7.205490070644102e-06, "loss": 0.3969, "num_input_tokens_seen": 18463296478, "step": 4735, "train_runtime": 188307.0393, "train_tokens_per_second": 98048.892 }, { "epoch": 0.7529411764705882, "grad_norm": 0.25489699840545654, "learning_rate": 7.196703622081541e-06, "loss": 0.3851, "num_input_tokens_seen": 18467136905, "step": 4736, "train_runtime": 188346.5853, "train_tokens_per_second": 98048.695 }, { "epoch": 0.7531001589825119, "grad_norm": 1.0887885093688965, "learning_rate": 7.187921633118919e-06, "loss": 0.3785, "num_input_tokens_seen": 18471007891, "step": 4737, "train_runtime": 188386.163, "train_tokens_per_second": 98048.644 }, { "epoch": 0.7532591414944356, "grad_norm": 0.18834295868873596, "learning_rate": 7.1791441059560594e-06, "loss": 0.3809, "num_input_tokens_seen": 18475007575, "step": 4738, "train_runtime": 188425.3265, "train_tokens_per_second": 98049.492 }, { "epoch": 0.7534181240063593, "grad_norm": 0.20033779740333557, "learning_rate": 7.170371042791679e-06, "loss": 0.3903, "num_input_tokens_seen": 18478934979, "step": 4739, "train_runtime": 188464.6966, "train_tokens_per_second": 98049.849 }, { "epoch": 0.753577106518283, "grad_norm": 0.25561439990997314, "learning_rate": 7.161602445823351e-06, "loss": 0.3872, "num_input_tokens_seen": 18482781320, "step": 4740, "train_runtime": 188503.0712, "train_tokens_per_second": 98050.293 }, { "epoch": 0.7537360890302067, "grad_norm": 0.20836001634597778, "learning_rate": 7.15283831724757e-06, "loss": 0.3837, "num_input_tokens_seen": 18486786596, "step": 4741, "train_runtime": 188542.3538, "train_tokens_per_second": 98051.107 }, { "epoch": 0.7538950715421303, "grad_norm": 0.23098866641521454, "learning_rate": 7.14407865925967e-06, "loss": 0.3993, "num_input_tokens_seen": 18490572838, "step": 4742, "train_runtime": 188582.7323, "train_tokens_per_second": 98050.191 }, { "epoch": 0.754054054054054, "grad_norm": 0.29502516984939575, "learning_rate": 7.135323474053887e-06, "loss": 0.3788, "num_input_tokens_seen": 18494517491, "step": 4743, "train_runtime": 188622.0123, "train_tokens_per_second": 98050.685 }, { "epoch": 0.7542130365659777, "grad_norm": 0.21066023409366608, "learning_rate": 7.126572763823339e-06, "loss": 0.3788, "num_input_tokens_seen": 18498320431, "step": 4744, "train_runtime": 188662.1177, "train_tokens_per_second": 98049.999 }, { "epoch": 0.7543720190779014, "grad_norm": 0.20639099180698395, "learning_rate": 7.117826530760013e-06, "loss": 0.3947, "num_input_tokens_seen": 18502242446, "step": 4745, "train_runtime": 188701.2158, "train_tokens_per_second": 98050.468 }, { "epoch": 0.7545310015898251, "grad_norm": 0.21101605892181396, "learning_rate": 7.1090847770547865e-06, "loss": 0.3872, "num_input_tokens_seen": 18506198181, "step": 4746, "train_runtime": 188739.5131, "train_tokens_per_second": 98051.531 }, { "epoch": 0.7546899841017488, "grad_norm": 0.3553829491138458, "learning_rate": 7.100347504897392e-06, "loss": 0.3839, "num_input_tokens_seen": 18510017914, "step": 4747, "train_runtime": 188778.8962, "train_tokens_per_second": 98051.309 }, { "epoch": 0.7548489666136725, "grad_norm": 0.24209898710250854, "learning_rate": 7.091614716476461e-06, "loss": 0.3876, "num_input_tokens_seen": 18513831742, "step": 4748, "train_runtime": 188818.8837, "train_tokens_per_second": 98050.742 }, { "epoch": 0.7550079491255962, "grad_norm": 0.17910701036453247, "learning_rate": 7.082886413979497e-06, "loss": 0.3842, "num_input_tokens_seen": 18517841388, "step": 4749, "train_runtime": 188859.4302, "train_tokens_per_second": 98050.923 }, { "epoch": 0.7551669316375199, "grad_norm": 0.19342143833637238, "learning_rate": 7.074162599592879e-06, "loss": 0.3951, "num_input_tokens_seen": 18521703148, "step": 4750, "train_runtime": 188896.2473, "train_tokens_per_second": 98052.256 }, { "epoch": 0.7553259141494436, "grad_norm": 0.23852835595607758, "learning_rate": 7.06544327550186e-06, "loss": 0.3892, "num_input_tokens_seen": 18525558169, "step": 4751, "train_runtime": 188934.806, "train_tokens_per_second": 98052.649 }, { "epoch": 0.7554848966613672, "grad_norm": 0.1954200118780136, "learning_rate": 7.056728443890576e-06, "loss": 0.3842, "num_input_tokens_seen": 18529435216, "step": 4752, "train_runtime": 188972.4211, "train_tokens_per_second": 98053.648 }, { "epoch": 0.7556438791732909, "grad_norm": 0.20541895925998688, "learning_rate": 7.0480181069420145e-06, "loss": 0.3815, "num_input_tokens_seen": 18533240958, "step": 4753, "train_runtime": 189009.7382, "train_tokens_per_second": 98054.424 }, { "epoch": 0.7558028616852146, "grad_norm": 0.18729305267333984, "learning_rate": 7.039312266838064e-06, "loss": 0.3813, "num_input_tokens_seen": 18537204668, "step": 4754, "train_runtime": 189048.617, "train_tokens_per_second": 98055.225 }, { "epoch": 0.7559618441971383, "grad_norm": 0.1783076822757721, "learning_rate": 7.030610925759479e-06, "loss": 0.3944, "num_input_tokens_seen": 18541087737, "step": 4755, "train_runtime": 189085.2548, "train_tokens_per_second": 98056.762 }, { "epoch": 0.756120826709062, "grad_norm": 0.2306596338748932, "learning_rate": 7.021914085885864e-06, "loss": 0.3892, "num_input_tokens_seen": 18545051862, "step": 4756, "train_runtime": 189123.8168, "train_tokens_per_second": 98057.728 }, { "epoch": 0.7562798092209857, "grad_norm": 0.24342364072799683, "learning_rate": 7.013221749395748e-06, "loss": 0.3985, "num_input_tokens_seen": 18548910421, "step": 4757, "train_runtime": 189163.1725, "train_tokens_per_second": 98057.725 }, { "epoch": 0.7564387917329094, "grad_norm": 0.2517886459827423, "learning_rate": 7.004533918466475e-06, "loss": 0.3734, "num_input_tokens_seen": 18552752668, "step": 4758, "train_runtime": 189203.5201, "train_tokens_per_second": 98057.122 }, { "epoch": 0.7565977742448331, "grad_norm": 0.21029983460903168, "learning_rate": 6.995850595274292e-06, "loss": 0.3889, "num_input_tokens_seen": 18556642424, "step": 4759, "train_runtime": 189240.355, "train_tokens_per_second": 98058.59 }, { "epoch": 0.7567567567567568, "grad_norm": 0.21700917184352875, "learning_rate": 6.9871717819943114e-06, "loss": 0.3833, "num_input_tokens_seen": 18560524104, "step": 4760, "train_runtime": 189277.6548, "train_tokens_per_second": 98059.774 }, { "epoch": 0.7569157392686805, "grad_norm": 0.19639523327350616, "learning_rate": 6.978497480800517e-06, "loss": 0.3943, "num_input_tokens_seen": 18564531223, "step": 4761, "train_runtime": 189317.9498, "train_tokens_per_second": 98060.069 }, { "epoch": 0.7570747217806041, "grad_norm": 0.25213658809661865, "learning_rate": 6.969827693865763e-06, "loss": 0.3923, "num_input_tokens_seen": 18568485865, "step": 4762, "train_runtime": 189358.6199, "train_tokens_per_second": 98059.892 }, { "epoch": 0.7572337042925278, "grad_norm": 0.2170913815498352, "learning_rate": 6.9611624233617476e-06, "loss": 0.3962, "num_input_tokens_seen": 18572237131, "step": 4763, "train_runtime": 189397.1134, "train_tokens_per_second": 98059.769 }, { "epoch": 0.7573926868044515, "grad_norm": 0.20569469034671783, "learning_rate": 6.952501671459094e-06, "loss": 0.3944, "num_input_tokens_seen": 18576166443, "step": 4764, "train_runtime": 189436.5876, "train_tokens_per_second": 98060.077 }, { "epoch": 0.7575516693163752, "grad_norm": 0.23852698504924774, "learning_rate": 6.943845440327232e-06, "loss": 0.3988, "num_input_tokens_seen": 18580100913, "step": 4765, "train_runtime": 189477.0745, "train_tokens_per_second": 98059.889 }, { "epoch": 0.7577106518282989, "grad_norm": 0.29028838872909546, "learning_rate": 6.9351937321344985e-06, "loss": 0.3849, "num_input_tokens_seen": 18583928987, "step": 4766, "train_runtime": 189516.129, "train_tokens_per_second": 98059.881 }, { "epoch": 0.7578696343402226, "grad_norm": 0.24744564294815063, "learning_rate": 6.92654654904808e-06, "loss": 0.3882, "num_input_tokens_seen": 18587910903, "step": 4767, "train_runtime": 189555.6169, "train_tokens_per_second": 98060.46 }, { "epoch": 0.7580286168521463, "grad_norm": 0.6924234628677368, "learning_rate": 6.91790389323404e-06, "loss": 0.3979, "num_input_tokens_seen": 18591756285, "step": 4768, "train_runtime": 189597.4047, "train_tokens_per_second": 98059.129 }, { "epoch": 0.75818759936407, "grad_norm": 0.23201312124729156, "learning_rate": 6.909265766857306e-06, "loss": 0.39, "num_input_tokens_seen": 18595756957, "step": 4769, "train_runtime": 189636.4717, "train_tokens_per_second": 98060.024 }, { "epoch": 0.7583465818759937, "grad_norm": 0.2192658632993698, "learning_rate": 6.9006321720816555e-06, "loss": 0.3926, "num_input_tokens_seen": 18599587652, "step": 4770, "train_runtime": 189677.2712, "train_tokens_per_second": 98059.127 }, { "epoch": 0.7585055643879173, "grad_norm": 0.21551638841629028, "learning_rate": 6.89200311106975e-06, "loss": 0.3894, "num_input_tokens_seen": 18603435220, "step": 4771, "train_runtime": 189718.4589, "train_tokens_per_second": 98058.119 }, { "epoch": 0.758664546899841, "grad_norm": 0.5679590106010437, "learning_rate": 6.883378585983105e-06, "loss": 0.3819, "num_input_tokens_seen": 18607381276, "step": 4772, "train_runtime": 189756.8716, "train_tokens_per_second": 98059.064 }, { "epoch": 0.7588235294117647, "grad_norm": 0.22025643289089203, "learning_rate": 6.8747585989821175e-06, "loss": 0.3897, "num_input_tokens_seen": 18611223890, "step": 4773, "train_runtime": 189796.4058, "train_tokens_per_second": 98058.885 }, { "epoch": 0.7589825119236884, "grad_norm": 0.179075688123703, "learning_rate": 6.866143152226001e-06, "loss": 0.3929, "num_input_tokens_seen": 18615095153, "step": 4774, "train_runtime": 189836.9804, "train_tokens_per_second": 98058.319 }, { "epoch": 0.7591414944356121, "grad_norm": 0.20442749559879303, "learning_rate": 6.857532247872905e-06, "loss": 0.3833, "num_input_tokens_seen": 18618895103, "step": 4775, "train_runtime": 189876.5275, "train_tokens_per_second": 98057.908 }, { "epoch": 0.7593004769475358, "grad_norm": 0.27675652503967285, "learning_rate": 6.848925888079768e-06, "loss": 0.3862, "num_input_tokens_seen": 18622868708, "step": 4776, "train_runtime": 189915.1743, "train_tokens_per_second": 98058.877 }, { "epoch": 0.7594594594594595, "grad_norm": 0.17560826241970062, "learning_rate": 6.840324075002438e-06, "loss": 0.3965, "num_input_tokens_seen": 18626746611, "step": 4777, "train_runtime": 189954.8273, "train_tokens_per_second": 98058.822 }, { "epoch": 0.7596184419713832, "grad_norm": 0.18329539895057678, "learning_rate": 6.83172681079561e-06, "loss": 0.4054, "num_input_tokens_seen": 18630579958, "step": 4778, "train_runtime": 189993.3891, "train_tokens_per_second": 98059.096 }, { "epoch": 0.7597774244833069, "grad_norm": 0.3367099165916443, "learning_rate": 6.8231340976128185e-06, "loss": 0.3804, "num_input_tokens_seen": 18634519615, "step": 4779, "train_runtime": 190032.8845, "train_tokens_per_second": 98059.447 }, { "epoch": 0.7599364069952306, "grad_norm": 0.6008399724960327, "learning_rate": 6.814545937606506e-06, "loss": 0.3846, "num_input_tokens_seen": 18638378029, "step": 4780, "train_runtime": 190072.2983, "train_tokens_per_second": 98059.413 }, { "epoch": 0.7600953895071542, "grad_norm": 0.2430844008922577, "learning_rate": 6.805962332927915e-06, "loss": 0.392, "num_input_tokens_seen": 18642268932, "step": 4781, "train_runtime": 190112.485, "train_tokens_per_second": 98059.151 }, { "epoch": 0.7602543720190779, "grad_norm": 0.21900539100170135, "learning_rate": 6.7973832857272106e-06, "loss": 0.3867, "num_input_tokens_seen": 18646111869, "step": 4782, "train_runtime": 190151.9638, "train_tokens_per_second": 98059.002 }, { "epoch": 0.7604133545310016, "grad_norm": 0.2311830073595047, "learning_rate": 6.78880879815336e-06, "loss": 0.3994, "num_input_tokens_seen": 18650000339, "step": 4783, "train_runtime": 190192.8113, "train_tokens_per_second": 98058.387 }, { "epoch": 0.7605723370429253, "grad_norm": 0.21970075368881226, "learning_rate": 6.780238872354217e-06, "loss": 0.3909, "num_input_tokens_seen": 18653994142, "step": 4784, "train_runtime": 190231.5917, "train_tokens_per_second": 98059.392 }, { "epoch": 0.760731319554849, "grad_norm": 0.2346399575471878, "learning_rate": 6.771673510476498e-06, "loss": 0.3903, "num_input_tokens_seen": 18657963239, "step": 4785, "train_runtime": 190271.6109, "train_tokens_per_second": 98059.627 }, { "epoch": 0.7608903020667727, "grad_norm": 0.26536139845848083, "learning_rate": 6.76311271466574e-06, "loss": 0.3939, "num_input_tokens_seen": 18661712751, "step": 4786, "train_runtime": 190310.1336, "train_tokens_per_second": 98059.48 }, { "epoch": 0.7610492845786964, "grad_norm": 0.1730605959892273, "learning_rate": 6.754556487066396e-06, "loss": 0.3794, "num_input_tokens_seen": 18665608398, "step": 4787, "train_runtime": 190349.0144, "train_tokens_per_second": 98059.916 }, { "epoch": 0.7612082670906201, "grad_norm": 0.2129235863685608, "learning_rate": 6.746004829821714e-06, "loss": 0.3772, "num_input_tokens_seen": 18669496623, "step": 4788, "train_runtime": 190387.6818, "train_tokens_per_second": 98060.423 }, { "epoch": 0.7613672496025438, "grad_norm": 0.31288695335388184, "learning_rate": 6.737457745073833e-06, "loss": 0.3727, "num_input_tokens_seen": 18673401577, "step": 4789, "train_runtime": 190424.7404, "train_tokens_per_second": 98061.846 }, { "epoch": 0.7615262321144675, "grad_norm": 0.23077210783958435, "learning_rate": 6.72891523496374e-06, "loss": 0.3914, "num_input_tokens_seen": 18677333660, "step": 4790, "train_runtime": 190463.1424, "train_tokens_per_second": 98062.719 }, { "epoch": 0.761685214626391, "grad_norm": 0.27175238728523254, "learning_rate": 6.720377301631276e-06, "loss": 0.3859, "num_input_tokens_seen": 18681173478, "step": 4791, "train_runtime": 190500.6742, "train_tokens_per_second": 98063.556 }, { "epoch": 0.7618441971383147, "grad_norm": 0.26803430914878845, "learning_rate": 6.711843947215121e-06, "loss": 0.3843, "num_input_tokens_seen": 18685092052, "step": 4792, "train_runtime": 190538.4575, "train_tokens_per_second": 98064.676 }, { "epoch": 0.7620031796502384, "grad_norm": 0.21400097012519836, "learning_rate": 6.7033151738528255e-06, "loss": 0.3957, "num_input_tokens_seen": 18688993716, "step": 4793, "train_runtime": 190577.7095, "train_tokens_per_second": 98064.951 }, { "epoch": 0.7621621621621621, "grad_norm": 0.2988486588001251, "learning_rate": 6.694790983680787e-06, "loss": 0.4005, "num_input_tokens_seen": 18692870443, "step": 4794, "train_runtime": 190618.2672, "train_tokens_per_second": 98064.423 }, { "epoch": 0.7623211446740858, "grad_norm": 0.2791173756122589, "learning_rate": 6.6862713788342605e-06, "loss": 0.3808, "num_input_tokens_seen": 18696864686, "step": 4795, "train_runtime": 190659.6651, "train_tokens_per_second": 98064.08 }, { "epoch": 0.7624801271860095, "grad_norm": 0.2045617699623108, "learning_rate": 6.677756361447346e-06, "loss": 0.3933, "num_input_tokens_seen": 18700757261, "step": 4796, "train_runtime": 190699.0347, "train_tokens_per_second": 98064.247 }, { "epoch": 0.7626391096979332, "grad_norm": 0.20657004415988922, "learning_rate": 6.669245933652979e-06, "loss": 0.3778, "num_input_tokens_seen": 18704587770, "step": 4797, "train_runtime": 190737.6279, "train_tokens_per_second": 98064.488 }, { "epoch": 0.762798092209857, "grad_norm": 0.22595934569835663, "learning_rate": 6.660740097582988e-06, "loss": 0.39, "num_input_tokens_seen": 18708474093, "step": 4798, "train_runtime": 190777.133, "train_tokens_per_second": 98064.552 }, { "epoch": 0.7629570747217806, "grad_norm": 0.19962549209594727, "learning_rate": 6.652238855368001e-06, "loss": 0.3868, "num_input_tokens_seen": 18712404680, "step": 4799, "train_runtime": 190814.1143, "train_tokens_per_second": 98066.145 }, { "epoch": 0.7631160572337043, "grad_norm": 0.29282477498054504, "learning_rate": 6.6437422091375325e-06, "loss": 0.3986, "num_input_tokens_seen": 18716283967, "step": 4800, "train_runtime": 190852.3838, "train_tokens_per_second": 98066.807 }, { "epoch": 0.7632750397456279, "grad_norm": 0.20361864566802979, "learning_rate": 6.635250161019935e-06, "loss": 0.3814, "num_input_tokens_seen": 18720091431, "step": 4801, "train_runtime": 191000.6277, "train_tokens_per_second": 98010.628 }, { "epoch": 0.7634340222575516, "grad_norm": 0.5391572117805481, "learning_rate": 6.626762713142384e-06, "loss": 0.3865, "num_input_tokens_seen": 18724100009, "step": 4802, "train_runtime": 191039.224, "train_tokens_per_second": 98011.809 }, { "epoch": 0.7635930047694753, "grad_norm": 0.213047593832016, "learning_rate": 6.618279867630958e-06, "loss": 0.3827, "num_input_tokens_seen": 18727883241, "step": 4803, "train_runtime": 191078.1327, "train_tokens_per_second": 98011.651 }, { "epoch": 0.763751987281399, "grad_norm": 0.28260597586631775, "learning_rate": 6.609801626610521e-06, "loss": 0.3861, "num_input_tokens_seen": 18731761522, "step": 4804, "train_runtime": 191119.0477, "train_tokens_per_second": 98010.961 }, { "epoch": 0.7639109697933227, "grad_norm": 0.24482062458992004, "learning_rate": 6.601327992204837e-06, "loss": 0.406, "num_input_tokens_seen": 18735673912, "step": 4805, "train_runtime": 191159.493, "train_tokens_per_second": 98010.691 }, { "epoch": 0.7640699523052464, "grad_norm": 0.2706707715988159, "learning_rate": 6.592858966536472e-06, "loss": 0.3836, "num_input_tokens_seen": 18739498017, "step": 4806, "train_runtime": 191197.4187, "train_tokens_per_second": 98011.25 }, { "epoch": 0.7642289348171701, "grad_norm": 0.21699917316436768, "learning_rate": 6.584394551726864e-06, "loss": 0.3707, "num_input_tokens_seen": 18743366520, "step": 4807, "train_runtime": 191237.9276, "train_tokens_per_second": 98010.718 }, { "epoch": 0.7643879173290938, "grad_norm": 0.20947511494159698, "learning_rate": 6.575934749896298e-06, "loss": 0.3937, "num_input_tokens_seen": 18747151707, "step": 4808, "train_runtime": 191276.6788, "train_tokens_per_second": 98010.65 }, { "epoch": 0.7645468998410175, "grad_norm": 0.40213268995285034, "learning_rate": 6.567479563163878e-06, "loss": 0.3777, "num_input_tokens_seen": 18751204887, "step": 4809, "train_runtime": 191316.1307, "train_tokens_per_second": 98011.625 }, { "epoch": 0.7647058823529411, "grad_norm": 0.22118602693080902, "learning_rate": 6.5590289936475785e-06, "loss": 0.3884, "num_input_tokens_seen": 18754946600, "step": 4810, "train_runtime": 191356.2839, "train_tokens_per_second": 98010.613 }, { "epoch": 0.7648648648648648, "grad_norm": 0.20416712760925293, "learning_rate": 6.550583043464204e-06, "loss": 0.3932, "num_input_tokens_seen": 18758892908, "step": 4811, "train_runtime": 191396.1223, "train_tokens_per_second": 98010.831 }, { "epoch": 0.7650238473767885, "grad_norm": 0.2893202304840088, "learning_rate": 6.542141714729408e-06, "loss": 0.3765, "num_input_tokens_seen": 18762898776, "step": 4812, "train_runtime": 191433.9929, "train_tokens_per_second": 98012.367 }, { "epoch": 0.7651828298887122, "grad_norm": 0.23310202360153198, "learning_rate": 6.533705009557681e-06, "loss": 0.3751, "num_input_tokens_seen": 18766769780, "step": 4813, "train_runtime": 191468.3177, "train_tokens_per_second": 98015.014 }, { "epoch": 0.7653418124006359, "grad_norm": 0.26134273409843445, "learning_rate": 6.52527293006237e-06, "loss": 0.3942, "num_input_tokens_seen": 18770522853, "step": 4814, "train_runtime": 191506.1055, "train_tokens_per_second": 98015.271 }, { "epoch": 0.7655007949125596, "grad_norm": 0.5015230774879456, "learning_rate": 6.516845478355632e-06, "loss": 0.3963, "num_input_tokens_seen": 18774506852, "step": 4815, "train_runtime": 191545.9967, "train_tokens_per_second": 98015.658 }, { "epoch": 0.7656597774244833, "grad_norm": 0.4133063554763794, "learning_rate": 6.508422656548494e-06, "loss": 0.3841, "num_input_tokens_seen": 18778417496, "step": 4816, "train_runtime": 191584.33, "train_tokens_per_second": 98016.458 }, { "epoch": 0.765818759936407, "grad_norm": 0.19338923692703247, "learning_rate": 6.500004466750811e-06, "loss": 0.3845, "num_input_tokens_seen": 18782388965, "step": 4817, "train_runtime": 191624.5325, "train_tokens_per_second": 98016.62 }, { "epoch": 0.7659777424483307, "grad_norm": 0.2632770538330078, "learning_rate": 6.4915909110712825e-06, "loss": 0.397, "num_input_tokens_seen": 18786060037, "step": 4818, "train_runtime": 191665.8784, "train_tokens_per_second": 98014.629 }, { "epoch": 0.7661367249602544, "grad_norm": 0.2577974200248718, "learning_rate": 6.483181991617451e-06, "loss": 0.3892, "num_input_tokens_seen": 18790125401, "step": 4819, "train_runtime": 191704.2968, "train_tokens_per_second": 98016.193 }, { "epoch": 0.766295707472178, "grad_norm": 0.2364821434020996, "learning_rate": 6.474777710495669e-06, "loss": 0.4015, "num_input_tokens_seen": 18794098949, "step": 4820, "train_runtime": 191744.5653, "train_tokens_per_second": 98016.332 }, { "epoch": 0.7664546899841017, "grad_norm": 0.21284067630767822, "learning_rate": 6.466378069811183e-06, "loss": 0.396, "num_input_tokens_seen": 18797935981, "step": 4821, "train_runtime": 191783.1387, "train_tokens_per_second": 98016.625 }, { "epoch": 0.7666136724960254, "grad_norm": 0.30811479687690735, "learning_rate": 6.457983071668014e-06, "loss": 0.4003, "num_input_tokens_seen": 18801812743, "step": 4822, "train_runtime": 191821.9598, "train_tokens_per_second": 98016.999 }, { "epoch": 0.7667726550079491, "grad_norm": 0.26290181279182434, "learning_rate": 6.449592718169062e-06, "loss": 0.3892, "num_input_tokens_seen": 18805737226, "step": 4823, "train_runtime": 191861.7955, "train_tokens_per_second": 98017.102 }, { "epoch": 0.7669316375198728, "grad_norm": 0.2693381905555725, "learning_rate": 6.441207011416059e-06, "loss": 0.3941, "num_input_tokens_seen": 18809630609, "step": 4824, "train_runtime": 191902.6836, "train_tokens_per_second": 98016.506 }, { "epoch": 0.7670906200317965, "grad_norm": 0.23330366611480713, "learning_rate": 6.432825953509541e-06, "loss": 0.3894, "num_input_tokens_seen": 18813501163, "step": 4825, "train_runtime": 191941.816, "train_tokens_per_second": 98016.688 }, { "epoch": 0.7672496025437202, "grad_norm": 0.21778248250484467, "learning_rate": 6.424449546548933e-06, "loss": 0.3978, "num_input_tokens_seen": 18817412369, "step": 4826, "train_runtime": 191981.3988, "train_tokens_per_second": 98016.852 }, { "epoch": 0.7674085850556439, "grad_norm": 0.19182974100112915, "learning_rate": 6.416077792632438e-06, "loss": 0.3853, "num_input_tokens_seen": 18821404656, "step": 4827, "train_runtime": 192020.3351, "train_tokens_per_second": 98017.768 }, { "epoch": 0.7675675675675676, "grad_norm": 0.2942534387111664, "learning_rate": 6.407710693857153e-06, "loss": 0.3815, "num_input_tokens_seen": 18825221481, "step": 4828, "train_runtime": 192060.7697, "train_tokens_per_second": 98017.005 }, { "epoch": 0.7677265500794913, "grad_norm": 0.44388407468795776, "learning_rate": 6.3993482523189486e-06, "loss": 0.3792, "num_input_tokens_seen": 18829127851, "step": 4829, "train_runtime": 192101.1796, "train_tokens_per_second": 98016.722 }, { "epoch": 0.7678855325914149, "grad_norm": 0.24035146832466125, "learning_rate": 6.3909904701125705e-06, "loss": 0.3808, "num_input_tokens_seen": 18833089965, "step": 4830, "train_runtime": 192138.4794, "train_tokens_per_second": 98018.315 }, { "epoch": 0.7680445151033386, "grad_norm": 0.24125435948371887, "learning_rate": 6.382637349331588e-06, "loss": 0.3982, "num_input_tokens_seen": 18836861573, "step": 4831, "train_runtime": 192176.8569, "train_tokens_per_second": 98018.366 }, { "epoch": 0.7682034976152623, "grad_norm": 0.4352966547012329, "learning_rate": 6.374288892068386e-06, "loss": 0.3918, "num_input_tokens_seen": 18840678088, "step": 4832, "train_runtime": 192215.4049, "train_tokens_per_second": 98018.565 }, { "epoch": 0.768362480127186, "grad_norm": 0.21060413122177124, "learning_rate": 6.3659451004142e-06, "loss": 0.3843, "num_input_tokens_seen": 18844620231, "step": 4833, "train_runtime": 192254.8539, "train_tokens_per_second": 98018.957 }, { "epoch": 0.7685214626391097, "grad_norm": 0.2899544835090637, "learning_rate": 6.3576059764590914e-06, "loss": 0.3981, "num_input_tokens_seen": 18848512705, "step": 4834, "train_runtime": 192294.114, "train_tokens_per_second": 98019.187 }, { "epoch": 0.7686804451510334, "grad_norm": 0.22728252410888672, "learning_rate": 6.349271522291952e-06, "loss": 0.3828, "num_input_tokens_seen": 18852313879, "step": 4835, "train_runtime": 192334.355, "train_tokens_per_second": 98018.442 }, { "epoch": 0.7688394276629571, "grad_norm": 0.22590339183807373, "learning_rate": 6.340941740000506e-06, "loss": 0.3879, "num_input_tokens_seen": 18856200214, "step": 4836, "train_runtime": 192373.7764, "train_tokens_per_second": 98018.558 }, { "epoch": 0.7689984101748808, "grad_norm": 0.24796916544437408, "learning_rate": 6.332616631671307e-06, "loss": 0.392, "num_input_tokens_seen": 18860202430, "step": 4837, "train_runtime": 192413.0518, "train_tokens_per_second": 98019.351 }, { "epoch": 0.7691573926868045, "grad_norm": 0.2106155902147293, "learning_rate": 6.324296199389726e-06, "loss": 0.3854, "num_input_tokens_seen": 18864111222, "step": 4838, "train_runtime": 192450.9991, "train_tokens_per_second": 98020.334 }, { "epoch": 0.7693163751987281, "grad_norm": 0.22502975165843964, "learning_rate": 6.3159804452399774e-06, "loss": 0.3833, "num_input_tokens_seen": 18867949264, "step": 4839, "train_runtime": 192488.2822, "train_tokens_per_second": 98021.288 }, { "epoch": 0.7694753577106518, "grad_norm": 0.1893066018819809, "learning_rate": 6.3076693713051e-06, "loss": 0.3988, "num_input_tokens_seen": 18871756828, "step": 4840, "train_runtime": 192527.3072, "train_tokens_per_second": 98021.196 }, { "epoch": 0.7696343402225755, "grad_norm": 0.2536267638206482, "learning_rate": 6.299362979666956e-06, "loss": 0.3821, "num_input_tokens_seen": 18875662122, "step": 4841, "train_runtime": 192566.9298, "train_tokens_per_second": 98021.307 }, { "epoch": 0.7697933227344992, "grad_norm": 0.248777374625206, "learning_rate": 6.291061272406251e-06, "loss": 0.3891, "num_input_tokens_seen": 18879628102, "step": 4842, "train_runtime": 192607.5882, "train_tokens_per_second": 98021.206 }, { "epoch": 0.7699523052464229, "grad_norm": 0.1998460590839386, "learning_rate": 6.282764251602477e-06, "loss": 0.3839, "num_input_tokens_seen": 18883379184, "step": 4843, "train_runtime": 192647.9395, "train_tokens_per_second": 98020.146 }, { "epoch": 0.7701112877583466, "grad_norm": 0.18878602981567383, "learning_rate": 6.27447191933401e-06, "loss": 0.385, "num_input_tokens_seen": 18887365826, "step": 4844, "train_runtime": 192687.2978, "train_tokens_per_second": 98020.814 }, { "epoch": 0.7702702702702703, "grad_norm": 0.23625430464744568, "learning_rate": 6.266184277677997e-06, "loss": 0.3964, "num_input_tokens_seen": 18891230438, "step": 4845, "train_runtime": 192726.456, "train_tokens_per_second": 98020.951 }, { "epoch": 0.770429252782194, "grad_norm": 0.20541517436504364, "learning_rate": 6.257901328710444e-06, "loss": 0.3977, "num_input_tokens_seen": 18895127627, "step": 4846, "train_runtime": 192766.4937, "train_tokens_per_second": 98020.809 }, { "epoch": 0.7705882352941177, "grad_norm": 0.31197425723075867, "learning_rate": 6.249623074506172e-06, "loss": 0.4043, "num_input_tokens_seen": 18899095541, "step": 4847, "train_runtime": 192807.5239, "train_tokens_per_second": 98020.529 }, { "epoch": 0.7707472178060414, "grad_norm": 0.2219797670841217, "learning_rate": 6.24134951713882e-06, "loss": 0.3927, "num_input_tokens_seen": 18902961305, "step": 4848, "train_runtime": 192849.2647, "train_tokens_per_second": 98019.359 }, { "epoch": 0.770906200317965, "grad_norm": 0.23175452649593353, "learning_rate": 6.2330806586808675e-06, "loss": 0.3803, "num_input_tokens_seen": 18906918336, "step": 4849, "train_runtime": 192890.1894, "train_tokens_per_second": 98019.077 }, { "epoch": 0.7710651828298887, "grad_norm": 0.18048647046089172, "learning_rate": 6.22481650120359e-06, "loss": 0.3973, "num_input_tokens_seen": 18910859571, "step": 4850, "train_runtime": 192929.5371, "train_tokens_per_second": 98019.515 }, { "epoch": 0.7712241653418124, "grad_norm": 0.21819967031478882, "learning_rate": 6.2165570467771055e-06, "loss": 0.3841, "num_input_tokens_seen": 18914724082, "step": 4851, "train_runtime": 192968.0277, "train_tokens_per_second": 98019.99 }, { "epoch": 0.7713831478537361, "grad_norm": 0.3081495463848114, "learning_rate": 6.20830229747035e-06, "loss": 0.386, "num_input_tokens_seen": 18918640984, "step": 4852, "train_runtime": 193007.9157, "train_tokens_per_second": 98020.026 }, { "epoch": 0.7715421303656598, "grad_norm": 0.18147492408752441, "learning_rate": 6.200052255351083e-06, "loss": 0.3872, "num_input_tokens_seen": 18922519517, "step": 4853, "train_runtime": 193049.1524, "train_tokens_per_second": 98019.179 }, { "epoch": 0.7717011128775835, "grad_norm": 0.18122777342796326, "learning_rate": 6.191806922485877e-06, "loss": 0.385, "num_input_tokens_seen": 18926445769, "step": 4854, "train_runtime": 193088.9347, "train_tokens_per_second": 98019.318 }, { "epoch": 0.7718600953895072, "grad_norm": 0.18720504641532898, "learning_rate": 6.1835663009401385e-06, "loss": 0.3861, "num_input_tokens_seen": 18930318044, "step": 4855, "train_runtime": 193126.9878, "train_tokens_per_second": 98020.055 }, { "epoch": 0.7720190779014309, "grad_norm": 0.1945580691099167, "learning_rate": 6.17533039277807e-06, "loss": 0.3883, "num_input_tokens_seen": 18934145457, "step": 4856, "train_runtime": 193165.4384, "train_tokens_per_second": 98020.358 }, { "epoch": 0.7721780604133546, "grad_norm": 0.2027820497751236, "learning_rate": 6.167099200062718e-06, "loss": 0.3908, "num_input_tokens_seen": 18938027004, "step": 4857, "train_runtime": 193204.4818, "train_tokens_per_second": 98020.64 }, { "epoch": 0.7723370429252783, "grad_norm": 0.21272043883800507, "learning_rate": 6.1588727248559354e-06, "loss": 0.3879, "num_input_tokens_seen": 18941974755, "step": 4858, "train_runtime": 193244.9137, "train_tokens_per_second": 98020.561 }, { "epoch": 0.7724960254372019, "grad_norm": 0.2559719681739807, "learning_rate": 6.150650969218397e-06, "loss": 0.379, "num_input_tokens_seen": 18945900932, "step": 4859, "train_runtime": 193284.2317, "train_tokens_per_second": 98020.934 }, { "epoch": 0.7726550079491256, "grad_norm": 0.9994222521781921, "learning_rate": 6.1424339352096015e-06, "loss": 0.3777, "num_input_tokens_seen": 18949745788, "step": 4860, "train_runtime": 193321.9807, "train_tokens_per_second": 98021.682 }, { "epoch": 0.7728139904610493, "grad_norm": 0.32633623480796814, "learning_rate": 6.13422162488784e-06, "loss": 0.3846, "num_input_tokens_seen": 18953714638, "step": 4861, "train_runtime": 193360.7343, "train_tokens_per_second": 98022.562 }, { "epoch": 0.772972972972973, "grad_norm": 0.18493163585662842, "learning_rate": 6.12601404031026e-06, "loss": 0.4005, "num_input_tokens_seen": 18957694324, "step": 4862, "train_runtime": 193400.9234, "train_tokens_per_second": 98022.77 }, { "epoch": 0.7731319554848967, "grad_norm": 0.2022465467453003, "learning_rate": 6.117811183532788e-06, "loss": 0.386, "num_input_tokens_seen": 18961482025, "step": 4863, "train_runtime": 193441.229, "train_tokens_per_second": 98021.927 }, { "epoch": 0.7732909379968204, "grad_norm": 0.1843346357345581, "learning_rate": 6.109613056610184e-06, "loss": 0.3911, "num_input_tokens_seen": 18965392938, "step": 4864, "train_runtime": 193479.026, "train_tokens_per_second": 98022.992 }, { "epoch": 0.7734499205087441, "grad_norm": 0.1957731544971466, "learning_rate": 6.101419661596031e-06, "loss": 0.37, "num_input_tokens_seen": 18969210666, "step": 4865, "train_runtime": 193519.2565, "train_tokens_per_second": 98022.342 }, { "epoch": 0.7736089030206678, "grad_norm": 0.18270821869373322, "learning_rate": 6.093231000542696e-06, "loss": 0.3905, "num_input_tokens_seen": 18973123776, "step": 4866, "train_runtime": 193557.9058, "train_tokens_per_second": 98022.985 }, { "epoch": 0.7737678855325915, "grad_norm": 0.2095329314470291, "learning_rate": 6.085047075501407e-06, "loss": 0.3875, "num_input_tokens_seen": 18976970492, "step": 4867, "train_runtime": 193594.6566, "train_tokens_per_second": 98024.247 }, { "epoch": 0.773926868044515, "grad_norm": 0.18796423077583313, "learning_rate": 6.076867888522158e-06, "loss": 0.388, "num_input_tokens_seen": 18980798297, "step": 4868, "train_runtime": 193642.5555, "train_tokens_per_second": 98019.768 }, { "epoch": 0.7740858505564387, "grad_norm": 0.22240115702152252, "learning_rate": 6.068693441653786e-06, "loss": 0.396, "num_input_tokens_seen": 18984737504, "step": 4869, "train_runtime": 193683.6898, "train_tokens_per_second": 98019.289 }, { "epoch": 0.7742448330683624, "grad_norm": 0.17886684834957123, "learning_rate": 6.060523736943932e-06, "loss": 0.3995, "num_input_tokens_seen": 18988662198, "step": 4870, "train_runtime": 193724.304, "train_tokens_per_second": 98018.998 }, { "epoch": 0.7744038155802861, "grad_norm": 0.19673258066177368, "learning_rate": 6.052358776439046e-06, "loss": 0.3822, "num_input_tokens_seen": 18992369820, "step": 4871, "train_runtime": 193765.6287, "train_tokens_per_second": 98017.228 }, { "epoch": 0.7745627980922098, "grad_norm": 0.17244704067707062, "learning_rate": 6.044198562184405e-06, "loss": 0.3811, "num_input_tokens_seen": 18996306656, "step": 4872, "train_runtime": 193804.6352, "train_tokens_per_second": 98017.814 }, { "epoch": 0.7747217806041335, "grad_norm": 0.1754322350025177, "learning_rate": 6.03604309622407e-06, "loss": 0.382, "num_input_tokens_seen": 19000217751, "step": 4873, "train_runtime": 193845.951, "train_tokens_per_second": 98017.099 }, { "epoch": 0.7748807631160572, "grad_norm": 2.0236167907714844, "learning_rate": 6.027892380600933e-06, "loss": 0.3855, "num_input_tokens_seen": 19004157434, "step": 4874, "train_runtime": 193884.3389, "train_tokens_per_second": 98018.012 }, { "epoch": 0.775039745627981, "grad_norm": 0.18845847249031067, "learning_rate": 6.019746417356689e-06, "loss": 0.3898, "num_input_tokens_seen": 19008027340, "step": 4875, "train_runtime": 193923.8812, "train_tokens_per_second": 98017.981 }, { "epoch": 0.7751987281399046, "grad_norm": 0.5242911577224731, "learning_rate": 6.011605208531848e-06, "loss": 0.377, "num_input_tokens_seen": 19011937443, "step": 4876, "train_runtime": 193963.5017, "train_tokens_per_second": 98018.118 }, { "epoch": 0.7753577106518283, "grad_norm": 0.1985706239938736, "learning_rate": 6.003468756165723e-06, "loss": 0.3893, "num_input_tokens_seen": 19015729785, "step": 4877, "train_runtime": 194004.2831, "train_tokens_per_second": 98017.062 }, { "epoch": 0.7755166931637519, "grad_norm": 0.20650146901607513, "learning_rate": 5.995337062296446e-06, "loss": 0.3999, "num_input_tokens_seen": 19019633479, "step": 4878, "train_runtime": 194044.0894, "train_tokens_per_second": 98017.072 }, { "epoch": 0.7756756756756756, "grad_norm": 0.1758013814687729, "learning_rate": 5.987210128960935e-06, "loss": 0.3801, "num_input_tokens_seen": 19023567303, "step": 4879, "train_runtime": 194081.7041, "train_tokens_per_second": 98018.344 }, { "epoch": 0.7758346581875993, "grad_norm": 0.20552752912044525, "learning_rate": 5.979087958194934e-06, "loss": 0.3904, "num_input_tokens_seen": 19027464495, "step": 4880, "train_runtime": 194119.7998, "train_tokens_per_second": 98019.185 }, { "epoch": 0.775993640699523, "grad_norm": 0.632387638092041, "learning_rate": 5.970970552032992e-06, "loss": 0.3885, "num_input_tokens_seen": 19031317228, "step": 4881, "train_runtime": 194161.0572, "train_tokens_per_second": 98018.199 }, { "epoch": 0.7761526232114467, "grad_norm": 0.23766936361789703, "learning_rate": 5.96285791250846e-06, "loss": 0.3817, "num_input_tokens_seen": 19035117286, "step": 4882, "train_runtime": 194201.2421, "train_tokens_per_second": 98017.485 }, { "epoch": 0.7763116057233704, "grad_norm": 0.1832556426525116, "learning_rate": 5.954750041653503e-06, "loss": 0.4036, "num_input_tokens_seen": 19039003447, "step": 4883, "train_runtime": 194240.7754, "train_tokens_per_second": 98017.542 }, { "epoch": 0.7764705882352941, "grad_norm": 0.1819184571504593, "learning_rate": 5.946646941499071e-06, "loss": 0.3862, "num_input_tokens_seen": 19042911059, "step": 4884, "train_runtime": 194280.6427, "train_tokens_per_second": 98017.542 }, { "epoch": 0.7766295707472178, "grad_norm": 0.20937658846378326, "learning_rate": 5.938548614074951e-06, "loss": 0.3949, "num_input_tokens_seen": 19046804672, "step": 4885, "train_runtime": 194318.614, "train_tokens_per_second": 98018.426 }, { "epoch": 0.7767885532591415, "grad_norm": 0.3587273955345154, "learning_rate": 5.930455061409706e-06, "loss": 0.3828, "num_input_tokens_seen": 19050642752, "step": 4886, "train_runtime": 194361.3252, "train_tokens_per_second": 98016.633 }, { "epoch": 0.7769475357710652, "grad_norm": 0.19264239072799683, "learning_rate": 5.922366285530714e-06, "loss": 0.3869, "num_input_tokens_seen": 19054607529, "step": 4887, "train_runtime": 194397.2913, "train_tokens_per_second": 98018.894 }, { "epoch": 0.7771065182829888, "grad_norm": 0.19769063591957092, "learning_rate": 5.914282288464165e-06, "loss": 0.3893, "num_input_tokens_seen": 19058472204, "step": 4888, "train_runtime": 194439.6513, "train_tokens_per_second": 98017.416 }, { "epoch": 0.7772655007949125, "grad_norm": 0.25637388229370117, "learning_rate": 5.9062030722350224e-06, "loss": 0.3951, "num_input_tokens_seen": 19062485746, "step": 4889, "train_runtime": 194477.9869, "train_tokens_per_second": 98018.732 }, { "epoch": 0.7774244833068362, "grad_norm": 0.23004388809204102, "learning_rate": 5.898128638867101e-06, "loss": 0.3746, "num_input_tokens_seen": 19066347509, "step": 4890, "train_runtime": 194515.7584, "train_tokens_per_second": 98019.552 }, { "epoch": 0.7775834658187599, "grad_norm": 0.2376985400915146, "learning_rate": 5.890058990382968e-06, "loss": 0.379, "num_input_tokens_seen": 19070173047, "step": 4891, "train_runtime": 194558.5869, "train_tokens_per_second": 98017.638 }, { "epoch": 0.7777424483306836, "grad_norm": 0.22257143259048462, "learning_rate": 5.881994128804019e-06, "loss": 0.3764, "num_input_tokens_seen": 19073931463, "step": 4892, "train_runtime": 194598.7168, "train_tokens_per_second": 98016.738 }, { "epoch": 0.7779014308426073, "grad_norm": 0.1977459192276001, "learning_rate": 5.873934056150449e-06, "loss": 0.3873, "num_input_tokens_seen": 19077781306, "step": 4893, "train_runtime": 194637.964, "train_tokens_per_second": 98016.753 }, { "epoch": 0.778060413354531, "grad_norm": 0.21268585324287415, "learning_rate": 5.865878774441244e-06, "loss": 0.3852, "num_input_tokens_seen": 19081681181, "step": 4894, "train_runtime": 194678.0374, "train_tokens_per_second": 98016.61 }, { "epoch": 0.7782193958664547, "grad_norm": 0.22708113491535187, "learning_rate": 5.857828285694205e-06, "loss": 0.3869, "num_input_tokens_seen": 19085551030, "step": 4895, "train_runtime": 194717.9757, "train_tokens_per_second": 98016.38 }, { "epoch": 0.7783783783783784, "grad_norm": 0.1847326010465622, "learning_rate": 5.849782591925909e-06, "loss": 0.3856, "num_input_tokens_seen": 19089423203, "step": 4896, "train_runtime": 194760.9943, "train_tokens_per_second": 98014.612 }, { "epoch": 0.7785373608903021, "grad_norm": 0.1925256848335266, "learning_rate": 5.841741695151751e-06, "loss": 0.3781, "num_input_tokens_seen": 19093461510, "step": 4897, "train_runtime": 194799.505, "train_tokens_per_second": 98015.965 }, { "epoch": 0.7786963434022257, "grad_norm": 0.18232525885105133, "learning_rate": 5.833705597385919e-06, "loss": 0.3833, "num_input_tokens_seen": 19097315062, "step": 4898, "train_runtime": 194839.767, "train_tokens_per_second": 98015.489 }, { "epoch": 0.7788553259141494, "grad_norm": 0.2690924406051636, "learning_rate": 5.825674300641398e-06, "loss": 0.3838, "num_input_tokens_seen": 19101060451, "step": 4899, "train_runtime": 194879.9159, "train_tokens_per_second": 98014.515 }, { "epoch": 0.7790143084260731, "grad_norm": 0.19748187065124512, "learning_rate": 5.8176478069299736e-06, "loss": 0.3801, "num_input_tokens_seen": 19105028496, "step": 4900, "train_runtime": 194917.6747, "train_tokens_per_second": 98015.886 }, { "epoch": 0.7791732909379968, "grad_norm": 0.1756758838891983, "learning_rate": 5.8096261182622316e-06, "loss": 0.3848, "num_input_tokens_seen": 19108940716, "step": 4901, "train_runtime": 194955.8968, "train_tokens_per_second": 98016.736 }, { "epoch": 0.7793322734499205, "grad_norm": 0.17788814008235931, "learning_rate": 5.801609236647534e-06, "loss": 0.3771, "num_input_tokens_seen": 19112827196, "step": 4902, "train_runtime": 194994.9465, "train_tokens_per_second": 98017.039 }, { "epoch": 0.7794912559618442, "grad_norm": 0.25077852606773376, "learning_rate": 5.793597164094058e-06, "loss": 0.3875, "num_input_tokens_seen": 19116621707, "step": 4903, "train_runtime": 195035.3277, "train_tokens_per_second": 98016.2 }, { "epoch": 0.7796502384737679, "grad_norm": 0.16979864239692688, "learning_rate": 5.785589902608776e-06, "loss": 0.3999, "num_input_tokens_seen": 19120627068, "step": 4904, "train_runtime": 195074.9102, "train_tokens_per_second": 98016.844 }, { "epoch": 0.7798092209856916, "grad_norm": 0.1898317188024521, "learning_rate": 5.777587454197448e-06, "loss": 0.3978, "num_input_tokens_seen": 19124544205, "step": 4905, "train_runtime": 195113.5499, "train_tokens_per_second": 98017.509 }, { "epoch": 0.7799682034976153, "grad_norm": 0.23626850545406342, "learning_rate": 5.769589820864635e-06, "loss": 0.4004, "num_input_tokens_seen": 19128332624, "step": 4906, "train_runtime": 195153.7731, "train_tokens_per_second": 98016.719 }, { "epoch": 0.7801271860095389, "grad_norm": 0.18050053715705872, "learning_rate": 5.76159700461367e-06, "loss": 0.3816, "num_input_tokens_seen": 19132261692, "step": 4907, "train_runtime": 195193.2549, "train_tokens_per_second": 98017.023 }, { "epoch": 0.7802861685214626, "grad_norm": 0.17409059405326843, "learning_rate": 5.753609007446722e-06, "loss": 0.3891, "num_input_tokens_seen": 19136312593, "step": 4908, "train_runtime": 195233.7471, "train_tokens_per_second": 98017.443 }, { "epoch": 0.7804451510333863, "grad_norm": 0.18070806562900543, "learning_rate": 5.745625831364712e-06, "loss": 0.3866, "num_input_tokens_seen": 19140066386, "step": 4909, "train_runtime": 195274.8622, "train_tokens_per_second": 98016.028 }, { "epoch": 0.78060413354531, "grad_norm": 0.3126530945301056, "learning_rate": 5.737647478367372e-06, "loss": 0.3833, "num_input_tokens_seen": 19143980924, "step": 4910, "train_runtime": 195313.7865, "train_tokens_per_second": 98016.537 }, { "epoch": 0.7807631160572337, "grad_norm": 0.18733642995357513, "learning_rate": 5.72967395045323e-06, "loss": 0.3915, "num_input_tokens_seen": 19147988069, "step": 4911, "train_runtime": 195354.9445, "train_tokens_per_second": 98016.398 }, { "epoch": 0.7809220985691574, "grad_norm": 0.19527986645698547, "learning_rate": 5.72170524961958e-06, "loss": 0.385, "num_input_tokens_seen": 19151985859, "step": 4912, "train_runtime": 195394.9857, "train_tokens_per_second": 98016.772 }, { "epoch": 0.7810810810810811, "grad_norm": 0.17225171625614166, "learning_rate": 5.713741377862549e-06, "loss": 0.3798, "num_input_tokens_seen": 19155845610, "step": 4913, "train_runtime": 195433.856, "train_tokens_per_second": 98017.027 }, { "epoch": 0.7812400635930048, "grad_norm": 0.22519545257091522, "learning_rate": 5.705782337177015e-06, "loss": 0.391, "num_input_tokens_seen": 19159721570, "step": 4914, "train_runtime": 195473.6503, "train_tokens_per_second": 98016.902 }, { "epoch": 0.7813990461049285, "grad_norm": 0.1842638999223709, "learning_rate": 5.697828129556667e-06, "loss": 0.3874, "num_input_tokens_seen": 19163631504, "step": 4915, "train_runtime": 195511.5073, "train_tokens_per_second": 98017.921 }, { "epoch": 0.7815580286168522, "grad_norm": 0.20778627693653107, "learning_rate": 5.6898787569939745e-06, "loss": 0.3955, "num_input_tokens_seen": 19167422722, "step": 4916, "train_runtime": 195551.431, "train_tokens_per_second": 98017.297 }, { "epoch": 0.7817170111287758, "grad_norm": 0.3670446574687958, "learning_rate": 5.681934221480201e-06, "loss": 0.4097, "num_input_tokens_seen": 19171268233, "step": 4917, "train_runtime": 195591.676, "train_tokens_per_second": 98016.79 }, { "epoch": 0.7818759936406995, "grad_norm": 0.218939870595932, "learning_rate": 5.673994525005405e-06, "loss": 0.3903, "num_input_tokens_seen": 19175223051, "step": 4918, "train_runtime": 195633.1415, "train_tokens_per_second": 98016.23 }, { "epoch": 0.7820349761526232, "grad_norm": 0.3472652733325958, "learning_rate": 5.666059669558407e-06, "loss": 0.3907, "num_input_tokens_seen": 19179152168, "step": 4919, "train_runtime": 195671.5661, "train_tokens_per_second": 98017.063 }, { "epoch": 0.7821939586645469, "grad_norm": 0.18960341811180115, "learning_rate": 5.658129657126845e-06, "loss": 0.3811, "num_input_tokens_seen": 19183044313, "step": 4920, "train_runtime": 195709.684, "train_tokens_per_second": 98017.86 }, { "epoch": 0.7823529411764706, "grad_norm": 0.16905075311660767, "learning_rate": 5.650204489697125e-06, "loss": 0.4006, "num_input_tokens_seen": 19187000485, "step": 4921, "train_runtime": 195750.3331, "train_tokens_per_second": 98017.716 }, { "epoch": 0.7825119236883943, "grad_norm": 0.19088181853294373, "learning_rate": 5.642284169254447e-06, "loss": 0.3891, "num_input_tokens_seen": 19190959691, "step": 4922, "train_runtime": 195790.478, "train_tokens_per_second": 98017.84 }, { "epoch": 0.782670906200318, "grad_norm": 0.2824302017688751, "learning_rate": 5.634368697782799e-06, "loss": 0.3895, "num_input_tokens_seen": 19194892216, "step": 4923, "train_runtime": 195827.8969, "train_tokens_per_second": 98019.192 }, { "epoch": 0.7828298887122417, "grad_norm": 0.2114768922328949, "learning_rate": 5.626458077264951e-06, "loss": 0.3952, "num_input_tokens_seen": 19198828599, "step": 4924, "train_runtime": 195865.4884, "train_tokens_per_second": 98020.477 }, { "epoch": 0.7829888712241654, "grad_norm": 0.19061990082263947, "learning_rate": 5.6185523096824515e-06, "loss": 0.3925, "num_input_tokens_seen": 19202704649, "step": 4925, "train_runtime": 195905.4342, "train_tokens_per_second": 98020.276 }, { "epoch": 0.7831478537360891, "grad_norm": 0.3339797556400299, "learning_rate": 5.61065139701564e-06, "loss": 0.3872, "num_input_tokens_seen": 19206674170, "step": 4926, "train_runtime": 195943.5717, "train_tokens_per_second": 98021.456 }, { "epoch": 0.7833068362480127, "grad_norm": 0.19052337110042572, "learning_rate": 5.602755341243649e-06, "loss": 0.3864, "num_input_tokens_seen": 19210534796, "step": 4927, "train_runtime": 195981.4924, "train_tokens_per_second": 98022.189 }, { "epoch": 0.7834658187599364, "grad_norm": 0.20179483294487, "learning_rate": 5.594864144344361e-06, "loss": 0.3775, "num_input_tokens_seen": 19214452606, "step": 4928, "train_runtime": 196020.1797, "train_tokens_per_second": 98022.829 }, { "epoch": 0.7836248012718601, "grad_norm": 0.2760106921195984, "learning_rate": 5.5869778082945e-06, "loss": 0.3853, "num_input_tokens_seen": 19218393131, "step": 4929, "train_runtime": 196058.455, "train_tokens_per_second": 98023.792 }, { "epoch": 0.7837837837837838, "grad_norm": 0.18102994561195374, "learning_rate": 5.5790963350695e-06, "loss": 0.3768, "num_input_tokens_seen": 19222237815, "step": 4930, "train_runtime": 196098.0414, "train_tokens_per_second": 98023.609 }, { "epoch": 0.7839427662957075, "grad_norm": 0.20757459104061127, "learning_rate": 5.571219726643651e-06, "loss": 0.3979, "num_input_tokens_seen": 19226004005, "step": 4931, "train_runtime": 196137.8512, "train_tokens_per_second": 98022.915 }, { "epoch": 0.7841017488076312, "grad_norm": 0.2034120112657547, "learning_rate": 5.5633479849899614e-06, "loss": 0.3829, "num_input_tokens_seen": 19229876352, "step": 4932, "train_runtime": 196178.376, "train_tokens_per_second": 98022.406 }, { "epoch": 0.7842607313195549, "grad_norm": 0.36773917078971863, "learning_rate": 5.555481112080254e-06, "loss": 0.3925, "num_input_tokens_seen": 19233835982, "step": 4933, "train_runtime": 196219.0494, "train_tokens_per_second": 98022.267 }, { "epoch": 0.7844197138314786, "grad_norm": 0.21035972237586975, "learning_rate": 5.547619109885136e-06, "loss": 0.3843, "num_input_tokens_seen": 19237747354, "step": 4934, "train_runtime": 196257.2953, "train_tokens_per_second": 98023.094 }, { "epoch": 0.7845786963434023, "grad_norm": 0.2084801346063614, "learning_rate": 5.539761980373956e-06, "loss": 0.3828, "num_input_tokens_seen": 19241557924, "step": 4935, "train_runtime": 196295.1067, "train_tokens_per_second": 98023.625 }, { "epoch": 0.7847376788553259, "grad_norm": 0.19364029169082642, "learning_rate": 5.531909725514903e-06, "loss": 0.3888, "num_input_tokens_seen": 19245522457, "step": 4936, "train_runtime": 196334.8187, "train_tokens_per_second": 98023.991 }, { "epoch": 0.7848966613672496, "grad_norm": 0.21523268520832062, "learning_rate": 5.5240623472748885e-06, "loss": 0.4023, "num_input_tokens_seen": 19249529643, "step": 4937, "train_runtime": 196374.7766, "train_tokens_per_second": 98024.451 }, { "epoch": 0.7850556438791733, "grad_norm": 0.19674688577651978, "learning_rate": 5.516219847619633e-06, "loss": 0.3801, "num_input_tokens_seen": 19253450720, "step": 4938, "train_runtime": 196412.8634, "train_tokens_per_second": 98025.406 }, { "epoch": 0.785214626391097, "grad_norm": 0.1987476497888565, "learning_rate": 5.508382228513623e-06, "loss": 0.3874, "num_input_tokens_seen": 19257216186, "step": 4939, "train_runtime": 196452.9833, "train_tokens_per_second": 98024.555 }, { "epoch": 0.7853736089030207, "grad_norm": 0.1990709751844406, "learning_rate": 5.500549491920135e-06, "loss": 0.3988, "num_input_tokens_seen": 19261201719, "step": 4940, "train_runtime": 196491.5566, "train_tokens_per_second": 98025.595 }, { "epoch": 0.7855325914149444, "grad_norm": 0.19614805281162262, "learning_rate": 5.492721639801213e-06, "loss": 0.3845, "num_input_tokens_seen": 19265081661, "step": 4941, "train_runtime": 196528.7564, "train_tokens_per_second": 98026.783 }, { "epoch": 0.7856915739268681, "grad_norm": 0.3163403272628784, "learning_rate": 5.484898674117666e-06, "loss": 0.3999, "num_input_tokens_seen": 19268954122, "step": 4942, "train_runtime": 196568.1934, "train_tokens_per_second": 98026.816 }, { "epoch": 0.7858505564387918, "grad_norm": 0.2105873078107834, "learning_rate": 5.477080596829104e-06, "loss": 0.3961, "num_input_tokens_seen": 19272871065, "step": 4943, "train_runtime": 196605.7599, "train_tokens_per_second": 98028.008 }, { "epoch": 0.7860095389507155, "grad_norm": 0.28537076711654663, "learning_rate": 5.469267409893897e-06, "loss": 0.3834, "num_input_tokens_seen": 19276875666, "step": 4944, "train_runtime": 196643.0068, "train_tokens_per_second": 98029.805 }, { "epoch": 0.7861685214626392, "grad_norm": 0.19744811952114105, "learning_rate": 5.461459115269199e-06, "loss": 0.4005, "num_input_tokens_seen": 19280707303, "step": 4945, "train_runtime": 196681.952, "train_tokens_per_second": 98029.876 }, { "epoch": 0.7863275039745627, "grad_norm": 0.24745669960975647, "learning_rate": 5.453655714910913e-06, "loss": 0.3842, "num_input_tokens_seen": 19284499977, "step": 4946, "train_runtime": 196721.4307, "train_tokens_per_second": 98029.482 }, { "epoch": 0.7864864864864864, "grad_norm": 0.18508069217205048, "learning_rate": 5.445857210773761e-06, "loss": 0.3933, "num_input_tokens_seen": 19288418725, "step": 4947, "train_runtime": 196761.5916, "train_tokens_per_second": 98029.39 }, { "epoch": 0.7866454689984101, "grad_norm": 0.17885412275791168, "learning_rate": 5.438063604811194e-06, "loss": 0.3865, "num_input_tokens_seen": 19292411479, "step": 4948, "train_runtime": 196799.761, "train_tokens_per_second": 98030.665 }, { "epoch": 0.7868044515103338, "grad_norm": 0.20275606215000153, "learning_rate": 5.430274898975466e-06, "loss": 0.3739, "num_input_tokens_seen": 19296197156, "step": 4949, "train_runtime": 196840.0896, "train_tokens_per_second": 98029.813 }, { "epoch": 0.7869634340222575, "grad_norm": 0.2068333476781845, "learning_rate": 5.4224910952175856e-06, "loss": 0.4015, "num_input_tokens_seen": 19300059313, "step": 4950, "train_runtime": 196878.6712, "train_tokens_per_second": 98030.219 }, { "epoch": 0.7871224165341812, "grad_norm": 0.271677702665329, "learning_rate": 5.414712195487348e-06, "loss": 0.3938, "num_input_tokens_seen": 19303880877, "step": 4951, "train_runtime": 196919.9732, "train_tokens_per_second": 98029.065 }, { "epoch": 0.787281399046105, "grad_norm": 0.1736995130777359, "learning_rate": 5.4069382017333135e-06, "loss": 0.3812, "num_input_tokens_seen": 19307837250, "step": 4952, "train_runtime": 196958.2201, "train_tokens_per_second": 98030.116 }, { "epoch": 0.7874403815580286, "grad_norm": 0.200663223862648, "learning_rate": 5.399169115902794e-06, "loss": 0.3888, "num_input_tokens_seen": 19311689579, "step": 4953, "train_runtime": 196999.3165, "train_tokens_per_second": 98029.221 }, { "epoch": 0.7875993640699523, "grad_norm": 0.16734489798545837, "learning_rate": 5.391404939941918e-06, "loss": 0.3705, "num_input_tokens_seen": 19315669611, "step": 4954, "train_runtime": 197035.5835, "train_tokens_per_second": 98031.377 }, { "epoch": 0.787758346581876, "grad_norm": 0.191436767578125, "learning_rate": 5.383645675795537e-06, "loss": 0.385, "num_input_tokens_seen": 19319628588, "step": 4955, "train_runtime": 197075.0936, "train_tokens_per_second": 98031.812 }, { "epoch": 0.7879173290937996, "grad_norm": 0.18301743268966675, "learning_rate": 5.375891325407298e-06, "loss": 0.3767, "num_input_tokens_seen": 19323416944, "step": 4956, "train_runtime": 197112.665, "train_tokens_per_second": 98032.346 }, { "epoch": 0.7880763116057233, "grad_norm": 0.4693538248538971, "learning_rate": 5.368141890719611e-06, "loss": 0.3981, "num_input_tokens_seen": 19327401608, "step": 4957, "train_runtime": 197153.504, "train_tokens_per_second": 98032.25 }, { "epoch": 0.788235294117647, "grad_norm": 0.24485330283641815, "learning_rate": 5.360397373673656e-06, "loss": 0.3849, "num_input_tokens_seen": 19331331182, "step": 4958, "train_runtime": 197193.3952, "train_tokens_per_second": 98032.346 }, { "epoch": 0.7883942766295707, "grad_norm": 0.3628443777561188, "learning_rate": 5.352657776209383e-06, "loss": 0.3952, "num_input_tokens_seen": 19335266960, "step": 4959, "train_runtime": 197231.3671, "train_tokens_per_second": 98033.428 }, { "epoch": 0.7885532591414944, "grad_norm": 0.22114907205104828, "learning_rate": 5.344923100265497e-06, "loss": 0.3819, "num_input_tokens_seen": 19339054793, "step": 4960, "train_runtime": 197271.7786, "train_tokens_per_second": 98032.546 }, { "epoch": 0.7887122416534181, "grad_norm": 0.18630458414554596, "learning_rate": 5.337193347779485e-06, "loss": 0.3838, "num_input_tokens_seen": 19342878617, "step": 4961, "train_runtime": 197311.2016, "train_tokens_per_second": 98032.339 }, { "epoch": 0.7888712241653418, "grad_norm": 0.2038518190383911, "learning_rate": 5.329468520687595e-06, "loss": 0.3888, "num_input_tokens_seen": 19346893415, "step": 4962, "train_runtime": 197351.207, "train_tokens_per_second": 98032.81 }, { "epoch": 0.7890302066772655, "grad_norm": 0.200789675116539, "learning_rate": 5.3217486209248405e-06, "loss": 0.384, "num_input_tokens_seen": 19350734825, "step": 4963, "train_runtime": 197388.377, "train_tokens_per_second": 98033.811 }, { "epoch": 0.7891891891891892, "grad_norm": 0.19823837280273438, "learning_rate": 5.314033650425004e-06, "loss": 0.3897, "num_input_tokens_seen": 19354691194, "step": 4964, "train_runtime": 197429.2026, "train_tokens_per_second": 98033.578 }, { "epoch": 0.7893481717011129, "grad_norm": 0.19234274327754974, "learning_rate": 5.306323611120637e-06, "loss": 0.3903, "num_input_tokens_seen": 19358567023, "step": 4965, "train_runtime": 197468.5938, "train_tokens_per_second": 98033.65 }, { "epoch": 0.7895071542130365, "grad_norm": 0.20844995975494385, "learning_rate": 5.2986185049430346e-06, "loss": 0.38, "num_input_tokens_seen": 19362452814, "step": 4966, "train_runtime": 197508.2615, "train_tokens_per_second": 98033.635 }, { "epoch": 0.7896661367249602, "grad_norm": 0.1740596741437912, "learning_rate": 5.290918333822281e-06, "loss": 0.3887, "num_input_tokens_seen": 19366278776, "step": 4967, "train_runtime": 197547.0058, "train_tokens_per_second": 98033.775 }, { "epoch": 0.7898251192368839, "grad_norm": 0.2926936745643616, "learning_rate": 5.28322309968722e-06, "loss": 0.3827, "num_input_tokens_seen": 19370262046, "step": 4968, "train_runtime": 197586.1701, "train_tokens_per_second": 98034.503 }, { "epoch": 0.7899841017488076, "grad_norm": 0.2030116766691208, "learning_rate": 5.2755328044654325e-06, "loss": 0.3863, "num_input_tokens_seen": 19374220684, "step": 4969, "train_runtime": 197623.9979, "train_tokens_per_second": 98035.769 }, { "epoch": 0.7901430842607313, "grad_norm": 0.18998132646083832, "learning_rate": 5.267847450083313e-06, "loss": 0.3978, "num_input_tokens_seen": 19378111980, "step": 4970, "train_runtime": 197664.9209, "train_tokens_per_second": 98035.159 }, { "epoch": 0.790302066772655, "grad_norm": 0.1986863613128662, "learning_rate": 5.260167038465957e-06, "loss": 0.3743, "num_input_tokens_seen": 19381980801, "step": 4971, "train_runtime": 197705.1654, "train_tokens_per_second": 98034.772 }, { "epoch": 0.7904610492845787, "grad_norm": 0.1888909637928009, "learning_rate": 5.252491571537285e-06, "loss": 0.3773, "num_input_tokens_seen": 19385879216, "step": 4972, "train_runtime": 197745.2075, "train_tokens_per_second": 98034.635 }, { "epoch": 0.7906200317965024, "grad_norm": 0.25010132789611816, "learning_rate": 5.244821051219922e-06, "loss": 0.3834, "num_input_tokens_seen": 19389824011, "step": 4973, "train_runtime": 197784.6892, "train_tokens_per_second": 98035.01 }, { "epoch": 0.7907790143084261, "grad_norm": 0.28886184096336365, "learning_rate": 5.237155479435291e-06, "loss": 0.3898, "num_input_tokens_seen": 19393730736, "step": 4974, "train_runtime": 197826.5517, "train_tokens_per_second": 98034.013 }, { "epoch": 0.7909379968203497, "grad_norm": 0.2715921103954315, "learning_rate": 5.229494858103568e-06, "loss": 0.3924, "num_input_tokens_seen": 19397645581, "step": 4975, "train_runtime": 197864.6185, "train_tokens_per_second": 98034.938 }, { "epoch": 0.7910969793322734, "grad_norm": 0.23989471793174744, "learning_rate": 5.221839189143662e-06, "loss": 0.3816, "num_input_tokens_seen": 19401506903, "step": 4976, "train_runtime": 197904.3275, "train_tokens_per_second": 98034.778 }, { "epoch": 0.7912559618441971, "grad_norm": 0.2250499278306961, "learning_rate": 5.214188474473297e-06, "loss": 0.3842, "num_input_tokens_seen": 19405421526, "step": 4977, "train_runtime": 197943.832, "train_tokens_per_second": 98034.99 }, { "epoch": 0.7914149443561208, "grad_norm": 0.2607291042804718, "learning_rate": 5.206542716008897e-06, "loss": 0.3801, "num_input_tokens_seen": 19409236787, "step": 4978, "train_runtime": 197984.5588, "train_tokens_per_second": 98034.094 }, { "epoch": 0.7915739268680445, "grad_norm": 0.19505611062049866, "learning_rate": 5.198901915665677e-06, "loss": 0.3794, "num_input_tokens_seen": 19413087131, "step": 4979, "train_runtime": 198022.0317, "train_tokens_per_second": 98034.986 }, { "epoch": 0.7917329093799682, "grad_norm": 0.2208031564950943, "learning_rate": 5.191266075357607e-06, "loss": 0.3926, "num_input_tokens_seen": 19416990413, "step": 4980, "train_runtime": 198061.3592, "train_tokens_per_second": 98035.228 }, { "epoch": 0.7918918918918919, "grad_norm": 0.16680864989757538, "learning_rate": 5.183635196997411e-06, "loss": 0.3824, "num_input_tokens_seen": 19420925188, "step": 4981, "train_runtime": 198101.6181, "train_tokens_per_second": 98035.167 }, { "epoch": 0.7920508744038156, "grad_norm": 0.20616215467453003, "learning_rate": 5.176009282496574e-06, "loss": 0.3882, "num_input_tokens_seen": 19424776238, "step": 4982, "train_runtime": 198142.2888, "train_tokens_per_second": 98034.48 }, { "epoch": 0.7922098569157393, "grad_norm": 0.20645073056221008, "learning_rate": 5.1683883337653225e-06, "loss": 0.3781, "num_input_tokens_seen": 19428748395, "step": 4983, "train_runtime": 198178.6785, "train_tokens_per_second": 98036.522 }, { "epoch": 0.792368839427663, "grad_norm": 0.1744464635848999, "learning_rate": 5.1607723527126565e-06, "loss": 0.3825, "num_input_tokens_seen": 19432614602, "step": 4984, "train_runtime": 198218.6767, "train_tokens_per_second": 98036.244 }, { "epoch": 0.7925278219395866, "grad_norm": 0.2203979641199112, "learning_rate": 5.1531613412463234e-06, "loss": 0.3925, "num_input_tokens_seen": 19436434837, "step": 4985, "train_runtime": 198258.4618, "train_tokens_per_second": 98035.84 }, { "epoch": 0.7926868044515103, "grad_norm": 0.47351962327957153, "learning_rate": 5.145555301272836e-06, "loss": 0.3903, "num_input_tokens_seen": 19440395131, "step": 4986, "train_runtime": 198297.9109, "train_tokens_per_second": 98036.308 }, { "epoch": 0.792845786963434, "grad_norm": 0.305038720369339, "learning_rate": 5.137954234697437e-06, "loss": 0.3789, "num_input_tokens_seen": 19444292562, "step": 4987, "train_runtime": 198336.2024, "train_tokens_per_second": 98037.032 }, { "epoch": 0.7930047694753577, "grad_norm": 0.19142349064350128, "learning_rate": 5.1303581434241606e-06, "loss": 0.3718, "num_input_tokens_seen": 19448021530, "step": 4988, "train_runtime": 198375.9953, "train_tokens_per_second": 98036.164 }, { "epoch": 0.7931637519872814, "grad_norm": 0.2579117715358734, "learning_rate": 5.122767029355757e-06, "loss": 0.397, "num_input_tokens_seen": 19452031336, "step": 4989, "train_runtime": 198415.2403, "train_tokens_per_second": 98036.982 }, { "epoch": 0.7933227344992051, "grad_norm": 0.4894926846027374, "learning_rate": 5.1151808943937535e-06, "loss": 0.382, "num_input_tokens_seen": 19455895963, "step": 4990, "train_runtime": 198455.6857, "train_tokens_per_second": 98036.475 }, { "epoch": 0.7934817170111288, "grad_norm": 0.2512432932853699, "learning_rate": 5.107599740438429e-06, "loss": 0.3878, "num_input_tokens_seen": 19459827382, "step": 4991, "train_runtime": 198494.896, "train_tokens_per_second": 98036.916 }, { "epoch": 0.7936406995230525, "grad_norm": 0.4772554934024811, "learning_rate": 5.1000235693887895e-06, "loss": 0.3986, "num_input_tokens_seen": 19463541992, "step": 4992, "train_runtime": 198532.8388, "train_tokens_per_second": 98036.89 }, { "epoch": 0.7937996820349762, "grad_norm": 0.21806283295154572, "learning_rate": 5.09245238314264e-06, "loss": 0.3976, "num_input_tokens_seen": 19467526651, "step": 4993, "train_runtime": 198571.3463, "train_tokens_per_second": 98037.945 }, { "epoch": 0.7939586645468999, "grad_norm": 0.17684078216552734, "learning_rate": 5.084886183596482e-06, "loss": 0.381, "num_input_tokens_seen": 19471475879, "step": 4994, "train_runtime": 198611.3147, "train_tokens_per_second": 98038.1 }, { "epoch": 0.7941176470588235, "grad_norm": 0.2261912077665329, "learning_rate": 5.0773249726456196e-06, "loss": 0.3767, "num_input_tokens_seen": 19475262474, "step": 4995, "train_runtime": 198648.6587, "train_tokens_per_second": 98038.731 }, { "epoch": 0.7942766295707472, "grad_norm": 0.19757895171642303, "learning_rate": 5.069768752184065e-06, "loss": 0.3902, "num_input_tokens_seen": 19479171971, "step": 4996, "train_runtime": 198688.6839, "train_tokens_per_second": 98038.658 }, { "epoch": 0.7944356120826709, "grad_norm": 0.26077958941459656, "learning_rate": 5.0622175241046035e-06, "loss": 0.3925, "num_input_tokens_seen": 19483223853, "step": 4997, "train_runtime": 198729.8296, "train_tokens_per_second": 98038.749 }, { "epoch": 0.7945945945945946, "grad_norm": 0.30369776487350464, "learning_rate": 5.0546712902987704e-06, "loss": 0.3835, "num_input_tokens_seen": 19487055213, "step": 4998, "train_runtime": 198769.8983, "train_tokens_per_second": 98038.261 }, { "epoch": 0.7947535771065183, "grad_norm": 0.18882927298545837, "learning_rate": 5.047130052656826e-06, "loss": 0.3796, "num_input_tokens_seen": 19490852519, "step": 4999, "train_runtime": 198809.6793, "train_tokens_per_second": 98037.744 }, { "epoch": 0.794912559618442, "grad_norm": 0.17864124476909637, "learning_rate": 5.039593813067822e-06, "loss": 0.3793, "num_input_tokens_seen": 19494806519, "step": 5000, "train_runtime": 198848.8476, "train_tokens_per_second": 98038.318 }, { "epoch": 0.7950715421303657, "grad_norm": 0.31913191080093384, "learning_rate": 5.032062573419516e-06, "loss": 0.3833, "num_input_tokens_seen": 19498694577, "step": 5001, "train_runtime": 198998.922, "train_tokens_per_second": 97983.921 }, { "epoch": 0.7952305246422894, "grad_norm": 0.27446848154067993, "learning_rate": 5.024536335598434e-06, "loss": 0.3816, "num_input_tokens_seen": 19502629580, "step": 5002, "train_runtime": 199039.7503, "train_tokens_per_second": 97983.591 }, { "epoch": 0.7953895071542131, "grad_norm": 0.18621821701526642, "learning_rate": 5.017015101489847e-06, "loss": 0.3826, "num_input_tokens_seen": 19506519414, "step": 5003, "train_runtime": 199080.4456, "train_tokens_per_second": 97983.101 }, { "epoch": 0.7955484896661367, "grad_norm": 0.4485498368740082, "learning_rate": 5.009498872977774e-06, "loss": 0.3735, "num_input_tokens_seen": 19510435400, "step": 5004, "train_runtime": 199120.0947, "train_tokens_per_second": 97983.257 }, { "epoch": 0.7957074721780604, "grad_norm": 0.22672931849956512, "learning_rate": 5.00198765194497e-06, "loss": 0.3787, "num_input_tokens_seen": 19514373833, "step": 5005, "train_runtime": 199160.2125, "train_tokens_per_second": 97983.295 }, { "epoch": 0.7958664546899841, "grad_norm": 0.2287336140871048, "learning_rate": 4.994481440272944e-06, "loss": 0.3836, "num_input_tokens_seen": 19518250409, "step": 5006, "train_runtime": 199197.855, "train_tokens_per_second": 97984.24 }, { "epoch": 0.7960254372019078, "grad_norm": 0.20337460935115814, "learning_rate": 4.9869802398419514e-06, "loss": 0.3831, "num_input_tokens_seen": 19522076565, "step": 5007, "train_runtime": 199236.7209, "train_tokens_per_second": 97984.33 }, { "epoch": 0.7961844197138315, "grad_norm": 0.4025125503540039, "learning_rate": 4.979484052530992e-06, "loss": 0.3862, "num_input_tokens_seen": 19526126019, "step": 5008, "train_runtime": 199276.2589, "train_tokens_per_second": 97985.21 }, { "epoch": 0.7963434022257552, "grad_norm": 0.2942853569984436, "learning_rate": 4.97199288021781e-06, "loss": 0.3836, "num_input_tokens_seen": 19530057408, "step": 5009, "train_runtime": 199316.346, "train_tokens_per_second": 97985.227 }, { "epoch": 0.7965023847376789, "grad_norm": 0.1894349902868271, "learning_rate": 4.964506724778872e-06, "loss": 0.3847, "num_input_tokens_seen": 19533894421, "step": 5010, "train_runtime": 199449.584, "train_tokens_per_second": 97939.008 }, { "epoch": 0.7966613672496026, "grad_norm": 0.8349338173866272, "learning_rate": 4.957025588089437e-06, "loss": 0.3809, "num_input_tokens_seen": 19537784077, "step": 5011, "train_runtime": 199488.1462, "train_tokens_per_second": 97939.574 }, { "epoch": 0.7968203497615263, "grad_norm": 0.2420867532491684, "learning_rate": 4.949549472023454e-06, "loss": 0.3873, "num_input_tokens_seen": 19541570284, "step": 5012, "train_runtime": 199527.0882, "train_tokens_per_second": 97939.435 }, { "epoch": 0.79697933227345, "grad_norm": 0.21119602024555206, "learning_rate": 4.942078378453646e-06, "loss": 0.3925, "num_input_tokens_seen": 19545501571, "step": 5013, "train_runtime": 199566.0811, "train_tokens_per_second": 97939.998 }, { "epoch": 0.7971383147853736, "grad_norm": 0.32256680727005005, "learning_rate": 4.934612309251474e-06, "loss": 0.3911, "num_input_tokens_seen": 19549373121, "step": 5014, "train_runtime": 199605.5339, "train_tokens_per_second": 97940.036 }, { "epoch": 0.7972972972972973, "grad_norm": 0.22761297225952148, "learning_rate": 4.927151266287114e-06, "loss": 0.3819, "num_input_tokens_seen": 19553122596, "step": 5015, "train_runtime": 199643.6888, "train_tokens_per_second": 97940.099 }, { "epoch": 0.797456279809221, "grad_norm": 0.33638879656791687, "learning_rate": 4.919695251429535e-06, "loss": 0.3882, "num_input_tokens_seen": 19557062900, "step": 5016, "train_runtime": 199683.7169, "train_tokens_per_second": 97940.199 }, { "epoch": 0.7976152623211447, "grad_norm": 0.2112555205821991, "learning_rate": 4.91224426654639e-06, "loss": 0.3851, "num_input_tokens_seen": 19561008288, "step": 5017, "train_runtime": 199723.7265, "train_tokens_per_second": 97940.333 }, { "epoch": 0.7977742448330684, "grad_norm": 0.2406270056962967, "learning_rate": 4.9047983135041194e-06, "loss": 0.3876, "num_input_tokens_seen": 19564961377, "step": 5018, "train_runtime": 199759.9308, "train_tokens_per_second": 97942.372 }, { "epoch": 0.7979332273449921, "grad_norm": 0.20894506573677063, "learning_rate": 4.897357394167864e-06, "loss": 0.37, "num_input_tokens_seen": 19568712060, "step": 5019, "train_runtime": 199797.3119, "train_tokens_per_second": 97942.82 }, { "epoch": 0.7980922098569158, "grad_norm": 0.28135916590690613, "learning_rate": 4.889921510401529e-06, "loss": 0.3906, "num_input_tokens_seen": 19572648489, "step": 5020, "train_runtime": 199837.7454, "train_tokens_per_second": 97942.701 }, { "epoch": 0.7982511923688395, "grad_norm": 0.2962430715560913, "learning_rate": 4.8824906640677595e-06, "loss": 0.3988, "num_input_tokens_seen": 19576531467, "step": 5021, "train_runtime": 199876.2645, "train_tokens_per_second": 97943.253 }, { "epoch": 0.7984101748807632, "grad_norm": 0.20532473921775818, "learning_rate": 4.875064857027908e-06, "loss": 0.3915, "num_input_tokens_seen": 19580432934, "step": 5022, "train_runtime": 199911.3386, "train_tokens_per_second": 97945.585 }, { "epoch": 0.7985691573926869, "grad_norm": 0.21883466839790344, "learning_rate": 4.867644091142112e-06, "loss": 0.3692, "num_input_tokens_seen": 19584209930, "step": 5023, "train_runtime": 199951.2615, "train_tokens_per_second": 97944.918 }, { "epoch": 0.7987281399046104, "grad_norm": 0.18390820920467377, "learning_rate": 4.860228368269204e-06, "loss": 0.3675, "num_input_tokens_seen": 19588211589, "step": 5024, "train_runtime": 199991.802, "train_tokens_per_second": 97945.073 }, { "epoch": 0.7988871224165341, "grad_norm": 0.18874286115169525, "learning_rate": 4.852817690266773e-06, "loss": 0.3894, "num_input_tokens_seen": 19592179899, "step": 5025, "train_runtime": 200034.3659, "train_tokens_per_second": 97944.07 }, { "epoch": 0.7990461049284578, "grad_norm": 0.19486838579177856, "learning_rate": 4.845412058991144e-06, "loss": 0.3885, "num_input_tokens_seen": 19595957731, "step": 5026, "train_runtime": 200070.0049, "train_tokens_per_second": 97945.505 }, { "epoch": 0.7992050874403815, "grad_norm": 0.3621329069137573, "learning_rate": 4.838011476297383e-06, "loss": 0.3773, "num_input_tokens_seen": 19599809772, "step": 5027, "train_runtime": 200108.6121, "train_tokens_per_second": 97945.858 }, { "epoch": 0.7993640699523052, "grad_norm": 0.2660357654094696, "learning_rate": 4.830615944039265e-06, "loss": 0.3789, "num_input_tokens_seen": 19603805929, "step": 5028, "train_runtime": 200149.6494, "train_tokens_per_second": 97945.742 }, { "epoch": 0.799523052464229, "grad_norm": 0.21351760625839233, "learning_rate": 4.823225464069334e-06, "loss": 0.3748, "num_input_tokens_seen": 19607699431, "step": 5029, "train_runtime": 200187.4374, "train_tokens_per_second": 97946.703 }, { "epoch": 0.7996820349761526, "grad_norm": 0.22682718932628632, "learning_rate": 4.815840038238845e-06, "loss": 0.3907, "num_input_tokens_seen": 19611634835, "step": 5030, "train_runtime": 200225.9237, "train_tokens_per_second": 97947.531 }, { "epoch": 0.7998410174880763, "grad_norm": 0.24496468901634216, "learning_rate": 4.808459668397799e-06, "loss": 0.3975, "num_input_tokens_seen": 19615503266, "step": 5031, "train_runtime": 200264.5501, "train_tokens_per_second": 97947.956 }, { "epoch": 0.8, "grad_norm": 0.21907134354114532, "learning_rate": 4.80108435639493e-06, "loss": 0.3811, "num_input_tokens_seen": 19619450830, "step": 5032, "train_runtime": 200301.2169, "train_tokens_per_second": 97949.734 }, { "epoch": 0.8001589825119236, "grad_norm": 0.25797808170318604, "learning_rate": 4.793714104077687e-06, "loss": 0.3884, "num_input_tokens_seen": 19623245025, "step": 5033, "train_runtime": 200340.4372, "train_tokens_per_second": 97949.497 }, { "epoch": 0.8003179650238473, "grad_norm": 0.26581189036369324, "learning_rate": 4.7863489132922875e-06, "loss": 0.3853, "num_input_tokens_seen": 19627197903, "step": 5034, "train_runtime": 200380.628, "train_tokens_per_second": 97949.578 }, { "epoch": 0.800476947535771, "grad_norm": 0.22068314254283905, "learning_rate": 4.778988785883642e-06, "loss": 0.3709, "num_input_tokens_seen": 19631171862, "step": 5035, "train_runtime": 200420.0353, "train_tokens_per_second": 97950.147 }, { "epoch": 0.8006359300476947, "grad_norm": 0.18390288949012756, "learning_rate": 4.771633723695418e-06, "loss": 0.3758, "num_input_tokens_seen": 19635013575, "step": 5036, "train_runtime": 200457.7319, "train_tokens_per_second": 97950.892 }, { "epoch": 0.8007949125596184, "grad_norm": 0.19014646112918854, "learning_rate": 4.764283728570012e-06, "loss": 0.3788, "num_input_tokens_seen": 19638784525, "step": 5037, "train_runtime": 200497.1948, "train_tokens_per_second": 97950.42 }, { "epoch": 0.8009538950715421, "grad_norm": 0.2502298653125763, "learning_rate": 4.756938802348524e-06, "loss": 0.3899, "num_input_tokens_seen": 19642714425, "step": 5038, "train_runtime": 200533.9586, "train_tokens_per_second": 97952.06 }, { "epoch": 0.8011128775834658, "grad_norm": 0.21905481815338135, "learning_rate": 4.749598946870837e-06, "loss": 0.3781, "num_input_tokens_seen": 19646705683, "step": 5039, "train_runtime": 200574.3812, "train_tokens_per_second": 97952.219 }, { "epoch": 0.8012718600953895, "grad_norm": 0.32977229356765747, "learning_rate": 4.742264163975502e-06, "loss": 0.3848, "num_input_tokens_seen": 19650514204, "step": 5040, "train_runtime": 200614.9772, "train_tokens_per_second": 97951.382 }, { "epoch": 0.8014308426073132, "grad_norm": 0.3031315803527832, "learning_rate": 4.7349344554998624e-06, "loss": 0.385, "num_input_tokens_seen": 19654321732, "step": 5041, "train_runtime": 200653.949, "train_tokens_per_second": 97951.333 }, { "epoch": 0.8015898251192369, "grad_norm": 0.20422761142253876, "learning_rate": 4.72760982327993e-06, "loss": 0.3843, "num_input_tokens_seen": 19658320338, "step": 5042, "train_runtime": 200691.6841, "train_tokens_per_second": 97952.84 }, { "epoch": 0.8017488076311605, "grad_norm": 0.19720478355884552, "learning_rate": 4.720290269150488e-06, "loss": 0.3828, "num_input_tokens_seen": 19662291661, "step": 5043, "train_runtime": 200732.8267, "train_tokens_per_second": 97952.547 }, { "epoch": 0.8019077901430842, "grad_norm": 0.21077553927898407, "learning_rate": 4.712975794945038e-06, "loss": 0.3882, "num_input_tokens_seen": 19666076260, "step": 5044, "train_runtime": 200771.0286, "train_tokens_per_second": 97952.759 }, { "epoch": 0.8020667726550079, "grad_norm": 0.2097778022289276, "learning_rate": 4.705666402495787e-06, "loss": 0.3786, "num_input_tokens_seen": 19669960305, "step": 5045, "train_runtime": 200811.2765, "train_tokens_per_second": 97952.469 }, { "epoch": 0.8022257551669316, "grad_norm": 0.27965065836906433, "learning_rate": 4.698362093633696e-06, "loss": 0.3775, "num_input_tokens_seen": 19673883540, "step": 5046, "train_runtime": 200850.742, "train_tokens_per_second": 97952.755 }, { "epoch": 0.8023847376788553, "grad_norm": 0.45636335015296936, "learning_rate": 4.691062870188442e-06, "loss": 0.389, "num_input_tokens_seen": 19677800741, "step": 5047, "train_runtime": 200890.1829, "train_tokens_per_second": 97953.023 }, { "epoch": 0.802543720190779, "grad_norm": 0.24246561527252197, "learning_rate": 4.683768733988428e-06, "loss": 0.3801, "num_input_tokens_seen": 19681656991, "step": 5048, "train_runtime": 200930.1714, "train_tokens_per_second": 97952.721 }, { "epoch": 0.8027027027027027, "grad_norm": 0.2189873903989792, "learning_rate": 4.676479686860785e-06, "loss": 0.3794, "num_input_tokens_seen": 19685629665, "step": 5049, "train_runtime": 200970.5417, "train_tokens_per_second": 97952.812 }, { "epoch": 0.8028616852146264, "grad_norm": 0.19425837695598602, "learning_rate": 4.669195730631373e-06, "loss": 0.3843, "num_input_tokens_seen": 19689586627, "step": 5050, "train_runtime": 201009.744, "train_tokens_per_second": 97953.394 }, { "epoch": 0.8030206677265501, "grad_norm": 0.205730140209198, "learning_rate": 4.661916867124763e-06, "loss": 0.3863, "num_input_tokens_seen": 19693445454, "step": 5051, "train_runtime": 201049.2617, "train_tokens_per_second": 97953.334 }, { "epoch": 0.8031796502384738, "grad_norm": 0.21166077256202698, "learning_rate": 4.654643098164257e-06, "loss": 0.3702, "num_input_tokens_seen": 19697242845, "step": 5052, "train_runtime": 201089.1648, "train_tokens_per_second": 97952.781 }, { "epoch": 0.8033386327503974, "grad_norm": 0.20681563019752502, "learning_rate": 4.64737442557189e-06, "loss": 0.3925, "num_input_tokens_seen": 19701200046, "step": 5053, "train_runtime": 201129.2004, "train_tokens_per_second": 97952.958 }, { "epoch": 0.8034976152623211, "grad_norm": 0.20507831871509552, "learning_rate": 4.640110851168411e-06, "loss": 0.3837, "num_input_tokens_seen": 19705145465, "step": 5054, "train_runtime": 201167.6234, "train_tokens_per_second": 97953.861 }, { "epoch": 0.8036565977742448, "grad_norm": 0.19933369755744934, "learning_rate": 4.6328523767733e-06, "loss": 0.3766, "num_input_tokens_seen": 19708854757, "step": 5055, "train_runtime": 201208.2615, "train_tokens_per_second": 97952.513 }, { "epoch": 0.8038155802861685, "grad_norm": 0.22978265583515167, "learning_rate": 4.625599004204733e-06, "loss": 0.3966, "num_input_tokens_seen": 19712863745, "step": 5056, "train_runtime": 201248.021, "train_tokens_per_second": 97953.081 }, { "epoch": 0.8039745627980922, "grad_norm": 0.22307808697223663, "learning_rate": 4.618350735279656e-06, "loss": 0.3835, "num_input_tokens_seen": 19716821109, "step": 5057, "train_runtime": 201288.4068, "train_tokens_per_second": 97953.088 }, { "epoch": 0.8041335453100159, "grad_norm": 0.49300482869148254, "learning_rate": 4.6111075718136914e-06, "loss": 0.3911, "num_input_tokens_seen": 19720649422, "step": 5058, "train_runtime": 201326.8748, "train_tokens_per_second": 97953.388 }, { "epoch": 0.8042925278219396, "grad_norm": 0.18473853170871735, "learning_rate": 4.603869515621201e-06, "loss": 0.3789, "num_input_tokens_seen": 19724651047, "step": 5059, "train_runtime": 201365.6632, "train_tokens_per_second": 97954.392 }, { "epoch": 0.8044515103338633, "grad_norm": 0.21847182512283325, "learning_rate": 4.596636568515275e-06, "loss": 0.3743, "num_input_tokens_seen": 19728585142, "step": 5060, "train_runtime": 201406.4523, "train_tokens_per_second": 97954.087 }, { "epoch": 0.804610492845787, "grad_norm": 0.20357415080070496, "learning_rate": 4.58940873230771e-06, "loss": 0.3973, "num_input_tokens_seen": 19732514633, "step": 5061, "train_runtime": 201443.5587, "train_tokens_per_second": 97955.55 }, { "epoch": 0.8047694753577107, "grad_norm": 0.2005995213985443, "learning_rate": 4.582186008809033e-06, "loss": 0.3787, "num_input_tokens_seen": 19736350646, "step": 5062, "train_runtime": 201483.3196, "train_tokens_per_second": 97955.258 }, { "epoch": 0.8049284578696343, "grad_norm": 0.22022834420204163, "learning_rate": 4.5749683998284775e-06, "loss": 0.384, "num_input_tokens_seen": 19740167894, "step": 5063, "train_runtime": 201522.9523, "train_tokens_per_second": 97954.936 }, { "epoch": 0.805087440381558, "grad_norm": 0.1840551495552063, "learning_rate": 4.567755907174007e-06, "loss": 0.3776, "num_input_tokens_seen": 19744120771, "step": 5064, "train_runtime": 201560.2438, "train_tokens_per_second": 97956.424 }, { "epoch": 0.8052464228934817, "grad_norm": 0.23123164474964142, "learning_rate": 4.560548532652298e-06, "loss": 0.3857, "num_input_tokens_seen": 19748007525, "step": 5065, "train_runtime": 201601.1242, "train_tokens_per_second": 97955.84 }, { "epoch": 0.8054054054054054, "grad_norm": 0.25533539056777954, "learning_rate": 4.553346278068752e-06, "loss": 0.3847, "num_input_tokens_seen": 19751750423, "step": 5066, "train_runtime": 201639.7191, "train_tokens_per_second": 97955.653 }, { "epoch": 0.8055643879173291, "grad_norm": 0.22711129486560822, "learning_rate": 4.5461491452274794e-06, "loss": 0.3876, "num_input_tokens_seen": 19755683876, "step": 5067, "train_runtime": 201677.9595, "train_tokens_per_second": 97956.583 }, { "epoch": 0.8057233704292528, "grad_norm": 0.31807631254196167, "learning_rate": 4.538957135931318e-06, "loss": 0.3926, "num_input_tokens_seen": 19759489297, "step": 5068, "train_runtime": 201715.7191, "train_tokens_per_second": 97957.112 }, { "epoch": 0.8058823529411765, "grad_norm": 0.21465209126472473, "learning_rate": 4.531770251981801e-06, "loss": 0.382, "num_input_tokens_seen": 19763340768, "step": 5069, "train_runtime": 201755.3675, "train_tokens_per_second": 97956.952 }, { "epoch": 0.8060413354531002, "grad_norm": 0.2061471939086914, "learning_rate": 4.524588495179203e-06, "loss": 0.3841, "num_input_tokens_seen": 19767258296, "step": 5070, "train_runtime": 201795.5819, "train_tokens_per_second": 97956.844 }, { "epoch": 0.8062003179650239, "grad_norm": 0.23975202441215515, "learning_rate": 4.5174118673224994e-06, "loss": 0.3839, "num_input_tokens_seen": 19771201198, "step": 5071, "train_runtime": 201834.7839, "train_tokens_per_second": 97957.353 }, { "epoch": 0.8063593004769475, "grad_norm": 0.2210058569908142, "learning_rate": 4.510240370209384e-06, "loss": 0.3821, "num_input_tokens_seen": 19775111780, "step": 5072, "train_runtime": 201875.0469, "train_tokens_per_second": 97957.187 }, { "epoch": 0.8065182829888712, "grad_norm": 0.4460011124610901, "learning_rate": 4.5030740056362765e-06, "loss": 0.3729, "num_input_tokens_seen": 19778933103, "step": 5073, "train_runtime": 201915.0141, "train_tokens_per_second": 97956.723 }, { "epoch": 0.8066772655007949, "grad_norm": 0.19866488873958588, "learning_rate": 4.495912775398281e-06, "loss": 0.3854, "num_input_tokens_seen": 19782932081, "step": 5074, "train_runtime": 201952.9383, "train_tokens_per_second": 97958.13 }, { "epoch": 0.8068362480127186, "grad_norm": 0.1817469596862793, "learning_rate": 4.488756681289255e-06, "loss": 0.3888, "num_input_tokens_seen": 19786812537, "step": 5075, "train_runtime": 201992.2642, "train_tokens_per_second": 97958.269 }, { "epoch": 0.8069952305246423, "grad_norm": 0.2320428341627121, "learning_rate": 4.481605725101737e-06, "loss": 0.3837, "num_input_tokens_seen": 19790766179, "step": 5076, "train_runtime": 202031.2889, "train_tokens_per_second": 97958.917 }, { "epoch": 0.807154213036566, "grad_norm": 0.16969437897205353, "learning_rate": 4.474459908626993e-06, "loss": 0.3767, "num_input_tokens_seen": 19794513417, "step": 5077, "train_runtime": 202068.3239, "train_tokens_per_second": 97959.507 }, { "epoch": 0.8073131955484897, "grad_norm": 0.3587660491466522, "learning_rate": 4.46731923365501e-06, "loss": 0.3897, "num_input_tokens_seen": 19798433055, "step": 5078, "train_runtime": 202110.0044, "train_tokens_per_second": 97958.699 }, { "epoch": 0.8074721780604134, "grad_norm": 0.16750337183475494, "learning_rate": 4.460183701974452e-06, "loss": 0.3825, "num_input_tokens_seen": 19802373497, "step": 5079, "train_runtime": 202150.6641, "train_tokens_per_second": 97958.488 }, { "epoch": 0.8076311605723371, "grad_norm": 0.19428279995918274, "learning_rate": 4.453053315372752e-06, "loss": 0.3872, "num_input_tokens_seen": 19806255076, "step": 5080, "train_runtime": 202191.3304, "train_tokens_per_second": 97957.984 }, { "epoch": 0.8077901430842608, "grad_norm": 0.22598230838775635, "learning_rate": 4.445928075635994e-06, "loss": 0.3854, "num_input_tokens_seen": 19810175677, "step": 5081, "train_runtime": 202230.6578, "train_tokens_per_second": 97958.321 }, { "epoch": 0.8079491255961844, "grad_norm": 0.19440819323062897, "learning_rate": 4.438807984549012e-06, "loss": 0.3918, "num_input_tokens_seen": 19814129202, "step": 5082, "train_runtime": 202269.7486, "train_tokens_per_second": 97958.935 }, { "epoch": 0.8081081081081081, "grad_norm": 0.20516522228717804, "learning_rate": 4.431693043895338e-06, "loss": 0.3886, "num_input_tokens_seen": 19817811735, "step": 5083, "train_runtime": 202309.9425, "train_tokens_per_second": 97957.676 }, { "epoch": 0.8082670906200318, "grad_norm": 0.20209309458732605, "learning_rate": 4.424583255457212e-06, "loss": 0.3793, "num_input_tokens_seen": 19821789147, "step": 5084, "train_runtime": 202347.861, "train_tokens_per_second": 97958.975 }, { "epoch": 0.8084260731319555, "grad_norm": 0.36250734329223633, "learning_rate": 4.417478621015595e-06, "loss": 0.4062, "num_input_tokens_seen": 19825632999, "step": 5085, "train_runtime": 202388.0864, "train_tokens_per_second": 97958.498 }, { "epoch": 0.8085850556438792, "grad_norm": 0.9354975819587708, "learning_rate": 4.410379142350132e-06, "loss": 0.3835, "num_input_tokens_seen": 19829477450, "step": 5086, "train_runtime": 202424.9447, "train_tokens_per_second": 97959.654 }, { "epoch": 0.8087440381558029, "grad_norm": 0.193095862865448, "learning_rate": 4.4032848212392035e-06, "loss": 0.399, "num_input_tokens_seen": 19833311941, "step": 5087, "train_runtime": 202466.142, "train_tokens_per_second": 97958.66 }, { "epoch": 0.8089030206677266, "grad_norm": 0.19742019474506378, "learning_rate": 4.396195659459881e-06, "loss": 0.3839, "num_input_tokens_seen": 19837295106, "step": 5088, "train_runtime": 202505.4043, "train_tokens_per_second": 97959.337 }, { "epoch": 0.8090620031796503, "grad_norm": 0.1682729572057724, "learning_rate": 4.389111658787953e-06, "loss": 0.385, "num_input_tokens_seen": 19841229194, "step": 5089, "train_runtime": 202544.5673, "train_tokens_per_second": 97959.819 }, { "epoch": 0.809220985691574, "grad_norm": 0.4365544319152832, "learning_rate": 4.382032820997911e-06, "loss": 0.3803, "num_input_tokens_seen": 19845075230, "step": 5090, "train_runtime": 202583.9992, "train_tokens_per_second": 97959.737 }, { "epoch": 0.8093799682034977, "grad_norm": 0.1979852020740509, "learning_rate": 4.374959147862961e-06, "loss": 0.3849, "num_input_tokens_seen": 19848921987, "step": 5091, "train_runtime": 202621.1076, "train_tokens_per_second": 97960.781 }, { "epoch": 0.8095389507154213, "grad_norm": 0.2145877182483673, "learning_rate": 4.367890641154992e-06, "loss": 0.3847, "num_input_tokens_seen": 19852862925, "step": 5092, "train_runtime": 202662.8263, "train_tokens_per_second": 97960.061 }, { "epoch": 0.809697933227345, "grad_norm": 0.22071918845176697, "learning_rate": 4.360827302644624e-06, "loss": 0.4009, "num_input_tokens_seen": 19856885091, "step": 5093, "train_runtime": 202704.4979, "train_tokens_per_second": 97959.766 }, { "epoch": 0.8098569157392687, "grad_norm": 0.20365966856479645, "learning_rate": 4.353769134101174e-06, "loss": 0.3833, "num_input_tokens_seen": 19860681803, "step": 5094, "train_runtime": 202743.882, "train_tokens_per_second": 97959.463 }, { "epoch": 0.8100158982511924, "grad_norm": 0.25193357467651367, "learning_rate": 4.3467161372926615e-06, "loss": 0.3816, "num_input_tokens_seen": 19864513131, "step": 5095, "train_runtime": 202783.5141, "train_tokens_per_second": 97959.211 }, { "epoch": 0.8101748807631161, "grad_norm": 0.24926020205020905, "learning_rate": 4.33966831398582e-06, "loss": 0.3892, "num_input_tokens_seen": 19868412583, "step": 5096, "train_runtime": 202825.4404, "train_tokens_per_second": 97958.188 }, { "epoch": 0.8103338632750398, "grad_norm": 0.20936177670955658, "learning_rate": 4.3326256659460605e-06, "loss": 0.3874, "num_input_tokens_seen": 19872270870, "step": 5097, "train_runtime": 202863.9678, "train_tokens_per_second": 97958.603 }, { "epoch": 0.8104928457869635, "grad_norm": 0.17987217009067535, "learning_rate": 4.325588194937541e-06, "loss": 0.3766, "num_input_tokens_seen": 19876095245, "step": 5098, "train_runtime": 202902.8243, "train_tokens_per_second": 97958.692 }, { "epoch": 0.8106518282988872, "grad_norm": 0.22749310731887817, "learning_rate": 4.318555902723079e-06, "loss": 0.3861, "num_input_tokens_seen": 19880095788, "step": 5099, "train_runtime": 202943.5909, "train_tokens_per_second": 97958.727 }, { "epoch": 0.8108108108108109, "grad_norm": 0.2174622267484665, "learning_rate": 4.31152879106422e-06, "loss": 0.3996, "num_input_tokens_seen": 19883956773, "step": 5100, "train_runtime": 202987.1227, "train_tokens_per_second": 97956.74 }, { "epoch": 0.8109697933227344, "grad_norm": 0.27263352274894714, "learning_rate": 4.304506861721217e-06, "loss": 0.3856, "num_input_tokens_seen": 19887784003, "step": 5101, "train_runtime": 203027.1947, "train_tokens_per_second": 97956.257 }, { "epoch": 0.8111287758346581, "grad_norm": 0.2029689997434616, "learning_rate": 4.297490116452987e-06, "loss": 0.3824, "num_input_tokens_seen": 19891620673, "step": 5102, "train_runtime": 203067.7637, "train_tokens_per_second": 97955.58 }, { "epoch": 0.8112877583465818, "grad_norm": 0.2093271166086197, "learning_rate": 4.290478557017208e-06, "loss": 0.3862, "num_input_tokens_seen": 19895465871, "step": 5103, "train_runtime": 203107.6288, "train_tokens_per_second": 97955.286 }, { "epoch": 0.8114467408585055, "grad_norm": 0.18961186707019806, "learning_rate": 4.283472185170201e-06, "loss": 0.4026, "num_input_tokens_seen": 19899375518, "step": 5104, "train_runtime": 203147.8131, "train_tokens_per_second": 97955.155 }, { "epoch": 0.8116057233704292, "grad_norm": 0.20545653998851776, "learning_rate": 4.276471002667021e-06, "loss": 0.3809, "num_input_tokens_seen": 19903280319, "step": 5105, "train_runtime": 203183.9198, "train_tokens_per_second": 97956.966 }, { "epoch": 0.8117647058823529, "grad_norm": 0.22009441256523132, "learning_rate": 4.269475011261414e-06, "loss": 0.3805, "num_input_tokens_seen": 19907208968, "step": 5106, "train_runtime": 203224.5477, "train_tokens_per_second": 97956.714 }, { "epoch": 0.8119236883942766, "grad_norm": 0.23308172821998596, "learning_rate": 4.262484212705831e-06, "loss": 0.3876, "num_input_tokens_seen": 19911098223, "step": 5107, "train_runtime": 203265.6407, "train_tokens_per_second": 97956.045 }, { "epoch": 0.8120826709062003, "grad_norm": 0.23637746274471283, "learning_rate": 4.255498608751418e-06, "loss": 0.3796, "num_input_tokens_seen": 19915024345, "step": 5108, "train_runtime": 203306.7356, "train_tokens_per_second": 97955.556 }, { "epoch": 0.812241653418124, "grad_norm": 0.2754271626472473, "learning_rate": 4.248518201148008e-06, "loss": 0.3809, "num_input_tokens_seen": 19918908015, "step": 5109, "train_runtime": 203347.3117, "train_tokens_per_second": 97955.109 }, { "epoch": 0.8124006359300477, "grad_norm": 0.20942449569702148, "learning_rate": 4.241542991644154e-06, "loss": 0.3815, "num_input_tokens_seen": 19922800745, "step": 5110, "train_runtime": 203388.1096, "train_tokens_per_second": 97954.599 }, { "epoch": 0.8125596184419713, "grad_norm": 0.18438924849033356, "learning_rate": 4.234572981987092e-06, "loss": 0.3902, "num_input_tokens_seen": 19926746727, "step": 5111, "train_runtime": 203427.2703, "train_tokens_per_second": 97955.14 }, { "epoch": 0.812718600953895, "grad_norm": 0.2424730509519577, "learning_rate": 4.227608173922764e-06, "loss": 0.3824, "num_input_tokens_seen": 19930622273, "step": 5112, "train_runtime": 203466.0682, "train_tokens_per_second": 97955.509 }, { "epoch": 0.8128775834658187, "grad_norm": 0.24452704191207886, "learning_rate": 4.220648569195804e-06, "loss": 0.3851, "num_input_tokens_seen": 19934428860, "step": 5113, "train_runtime": 203505.9246, "train_tokens_per_second": 97955.03 }, { "epoch": 0.8130365659777424, "grad_norm": 0.20345164835453033, "learning_rate": 4.213694169549548e-06, "loss": 0.3856, "num_input_tokens_seen": 19938271060, "step": 5114, "train_runtime": 203546.6969, "train_tokens_per_second": 97954.284 }, { "epoch": 0.8131955484896661, "grad_norm": 0.258695125579834, "learning_rate": 4.206744976726013e-06, "loss": 0.3852, "num_input_tokens_seen": 19942313037, "step": 5115, "train_runtime": 203585.8747, "train_tokens_per_second": 97955.288 }, { "epoch": 0.8133545310015898, "grad_norm": 0.1986963450908661, "learning_rate": 4.199800992465927e-06, "loss": 0.392, "num_input_tokens_seen": 19946147358, "step": 5116, "train_runtime": 203624.7074, "train_tokens_per_second": 97955.438 }, { "epoch": 0.8135135135135135, "grad_norm": 0.2308768779039383, "learning_rate": 4.192862218508714e-06, "loss": 0.3884, "num_input_tokens_seen": 19950043114, "step": 5117, "train_runtime": 203662.3192, "train_tokens_per_second": 97956.476 }, { "epoch": 0.8136724960254372, "grad_norm": 0.2009858340024948, "learning_rate": 4.185928656592483e-06, "loss": 0.3892, "num_input_tokens_seen": 19953963170, "step": 5118, "train_runtime": 203701.7311, "train_tokens_per_second": 97956.768 }, { "epoch": 0.8138314785373609, "grad_norm": 0.18177831172943115, "learning_rate": 4.179000308454051e-06, "loss": 0.3949, "num_input_tokens_seen": 19957813149, "step": 5119, "train_runtime": 203742.4876, "train_tokens_per_second": 97956.069 }, { "epoch": 0.8139904610492846, "grad_norm": 0.18942558765411377, "learning_rate": 4.172077175828901e-06, "loss": 0.3794, "num_input_tokens_seen": 19961748732, "step": 5120, "train_runtime": 203779.3636, "train_tokens_per_second": 97957.656 }, { "epoch": 0.8141494435612082, "grad_norm": 0.17850621044635773, "learning_rate": 4.165159260451251e-06, "loss": 0.3666, "num_input_tokens_seen": 19965647064, "step": 5121, "train_runtime": 203819.3053, "train_tokens_per_second": 97957.586 }, { "epoch": 0.8143084260731319, "grad_norm": 0.19106411933898926, "learning_rate": 4.158246564053975e-06, "loss": 0.3949, "num_input_tokens_seen": 19969606332, "step": 5122, "train_runtime": 203857.9084, "train_tokens_per_second": 97958.458 }, { "epoch": 0.8144674085850556, "grad_norm": 0.25455206632614136, "learning_rate": 4.151339088368661e-06, "loss": 0.3829, "num_input_tokens_seen": 19973562550, "step": 5123, "train_runtime": 203896.2841, "train_tokens_per_second": 97959.424 }, { "epoch": 0.8146263910969793, "grad_norm": 0.18427656590938568, "learning_rate": 4.144436835125587e-06, "loss": 0.3714, "num_input_tokens_seen": 19977484052, "step": 5124, "train_runtime": 203936.3637, "train_tokens_per_second": 97959.401 }, { "epoch": 0.814785373608903, "grad_norm": 0.20084232091903687, "learning_rate": 4.137539806053701e-06, "loss": 0.3875, "num_input_tokens_seen": 19981323947, "step": 5125, "train_runtime": 203973.9541, "train_tokens_per_second": 97960.174 }, { "epoch": 0.8149443561208267, "grad_norm": 0.23345063626766205, "learning_rate": 4.130648002880682e-06, "loss": 0.3862, "num_input_tokens_seen": 19985217416, "step": 5126, "train_runtime": 204014.32, "train_tokens_per_second": 97959.876 }, { "epoch": 0.8151033386327504, "grad_norm": 0.21636715531349182, "learning_rate": 4.123761427332865e-06, "loss": 0.3971, "num_input_tokens_seen": 19989091877, "step": 5127, "train_runtime": 204048.9857, "train_tokens_per_second": 97962.221 }, { "epoch": 0.8152623211446741, "grad_norm": 0.2621639370918274, "learning_rate": 4.116880081135291e-06, "loss": 0.3835, "num_input_tokens_seen": 19993118084, "step": 5128, "train_runtime": 204088.244, "train_tokens_per_second": 97963.105 }, { "epoch": 0.8154213036565978, "grad_norm": 0.20278704166412354, "learning_rate": 4.110003966011689e-06, "loss": 0.3935, "num_input_tokens_seen": 19997024503, "step": 5129, "train_runtime": 204128.2192, "train_tokens_per_second": 97963.058 }, { "epoch": 0.8155802861685215, "grad_norm": 0.21137981116771698, "learning_rate": 4.103133083684474e-06, "loss": 0.3874, "num_input_tokens_seen": 20000921894, "step": 5130, "train_runtime": 204167.6338, "train_tokens_per_second": 97963.235 }, { "epoch": 0.8157392686804451, "grad_norm": 0.2091004103422165, "learning_rate": 4.096267435874768e-06, "loss": 0.3923, "num_input_tokens_seen": 20004810244, "step": 5131, "train_runtime": 204204.8385, "train_tokens_per_second": 97964.428 }, { "epoch": 0.8158982511923688, "grad_norm": 0.23757024109363556, "learning_rate": 4.089407024302347e-06, "loss": 0.3947, "num_input_tokens_seen": 20008747198, "step": 5132, "train_runtime": 204245.6135, "train_tokens_per_second": 97964.146 }, { "epoch": 0.8160572337042925, "grad_norm": 0.1876242607831955, "learning_rate": 4.082551850685706e-06, "loss": 0.3834, "num_input_tokens_seen": 20012623613, "step": 5133, "train_runtime": 204286.5104, "train_tokens_per_second": 97963.51 }, { "epoch": 0.8162162162162162, "grad_norm": 0.18781712651252747, "learning_rate": 4.0757019167420165e-06, "loss": 0.394, "num_input_tokens_seen": 20016581811, "step": 5134, "train_runtime": 204324.0198, "train_tokens_per_second": 97964.898 }, { "epoch": 0.8163751987281399, "grad_norm": 0.19037775695323944, "learning_rate": 4.068857224187137e-06, "loss": 0.3748, "num_input_tokens_seen": 20020430571, "step": 5135, "train_runtime": 204362.8179, "train_tokens_per_second": 97965.133 }, { "epoch": 0.8165341812400636, "grad_norm": 0.18737657368183136, "learning_rate": 4.062017774735619e-06, "loss": 0.3814, "num_input_tokens_seen": 20024307403, "step": 5136, "train_runtime": 204402.8309, "train_tokens_per_second": 97964.922 }, { "epoch": 0.8166931637519873, "grad_norm": 0.21338389813899994, "learning_rate": 4.055183570100695e-06, "loss": 0.3926, "num_input_tokens_seen": 20028163755, "step": 5137, "train_runtime": 204441.6156, "train_tokens_per_second": 97965.2 }, { "epoch": 0.816852146263911, "grad_norm": 0.21912576258182526, "learning_rate": 4.048354611994282e-06, "loss": 0.3772, "num_input_tokens_seen": 20032165176, "step": 5138, "train_runtime": 204482.1712, "train_tokens_per_second": 97965.339 }, { "epoch": 0.8170111287758347, "grad_norm": 0.27248886227607727, "learning_rate": 4.041530902126983e-06, "loss": 0.3768, "num_input_tokens_seen": 20035966468, "step": 5139, "train_runtime": 204523.4544, "train_tokens_per_second": 97964.15 }, { "epoch": 0.8171701112877583, "grad_norm": 0.2353684902191162, "learning_rate": 4.0347124422081e-06, "loss": 0.3744, "num_input_tokens_seen": 20039865671, "step": 5140, "train_runtime": 204561.6169, "train_tokens_per_second": 97964.936 }, { "epoch": 0.817329093799682, "grad_norm": 0.17313334345817566, "learning_rate": 4.027899233945592e-06, "loss": 0.3994, "num_input_tokens_seen": 20043848527, "step": 5141, "train_runtime": 204601.9977, "train_tokens_per_second": 97965.068 }, { "epoch": 0.8174880763116057, "grad_norm": 0.2107042819261551, "learning_rate": 4.021091279046141e-06, "loss": 0.3821, "num_input_tokens_seen": 20047716295, "step": 5142, "train_runtime": 204642.0536, "train_tokens_per_second": 97964.792 }, { "epoch": 0.8176470588235294, "grad_norm": 0.1964285522699356, "learning_rate": 4.014288579215067e-06, "loss": 0.3822, "num_input_tokens_seen": 20051597442, "step": 5143, "train_runtime": 204678.3192, "train_tokens_per_second": 97966.397 }, { "epoch": 0.8178060413354531, "grad_norm": 0.26574933528900146, "learning_rate": 4.007491136156424e-06, "loss": 0.3882, "num_input_tokens_seen": 20055550087, "step": 5144, "train_runtime": 204718.5389, "train_tokens_per_second": 97966.458 }, { "epoch": 0.8179650238473768, "grad_norm": 0.20527108013629913, "learning_rate": 4.000698951572904e-06, "loss": 0.3821, "num_input_tokens_seen": 20059472918, "step": 5145, "train_runtime": 204758.0288, "train_tokens_per_second": 97966.722 }, { "epoch": 0.8181240063593005, "grad_norm": 0.3624740540981293, "learning_rate": 3.9939120271659106e-06, "loss": 0.3877, "num_input_tokens_seen": 20063442711, "step": 5146, "train_runtime": 204798.0273, "train_tokens_per_second": 97966.972 }, { "epoch": 0.8182829888712242, "grad_norm": 0.17019549012184143, "learning_rate": 3.987130364635522e-06, "loss": 0.3578, "num_input_tokens_seen": 20067316090, "step": 5147, "train_runtime": 204836.2605, "train_tokens_per_second": 97967.596 }, { "epoch": 0.8184419713831479, "grad_norm": 0.2086661159992218, "learning_rate": 3.980353965680481e-06, "loss": 0.387, "num_input_tokens_seen": 20071073939, "step": 5148, "train_runtime": 204874.6935, "train_tokens_per_second": 97967.561 }, { "epoch": 0.8186009538950716, "grad_norm": 0.1997610479593277, "learning_rate": 3.973582831998252e-06, "loss": 0.3799, "num_input_tokens_seen": 20075002808, "step": 5149, "train_runtime": 204913.325, "train_tokens_per_second": 97968.264 }, { "epoch": 0.8187599364069952, "grad_norm": 0.20018130540847778, "learning_rate": 3.966816965284936e-06, "loss": 0.398, "num_input_tokens_seen": 20078873538, "step": 5150, "train_runtime": 204951.1222, "train_tokens_per_second": 97969.083 }, { "epoch": 0.8189189189189189, "grad_norm": 0.2051224559545517, "learning_rate": 3.960056367235346e-06, "loss": 0.4065, "num_input_tokens_seen": 20082839518, "step": 5151, "train_runtime": 204991.6831, "train_tokens_per_second": 97969.045 }, { "epoch": 0.8190779014308426, "grad_norm": 0.18736985325813293, "learning_rate": 3.953301039542956e-06, "loss": 0.3864, "num_input_tokens_seen": 20086664132, "step": 5152, "train_runtime": 205031.1008, "train_tokens_per_second": 97968.865 }, { "epoch": 0.8192368839427663, "grad_norm": 0.5188668966293335, "learning_rate": 3.9465509838999355e-06, "loss": 0.3841, "num_input_tokens_seen": 20090572771, "step": 5153, "train_runtime": 205072.6398, "train_tokens_per_second": 97968.08 }, { "epoch": 0.81939586645469, "grad_norm": 0.28230637311935425, "learning_rate": 3.939806201997126e-06, "loss": 0.3936, "num_input_tokens_seen": 20094439915, "step": 5154, "train_runtime": 205114.3865, "train_tokens_per_second": 97966.994 }, { "epoch": 0.8195548489666137, "grad_norm": 0.20775620639324188, "learning_rate": 3.93306669552404e-06, "loss": 0.3894, "num_input_tokens_seen": 20098367995, "step": 5155, "train_runtime": 205150.8363, "train_tokens_per_second": 97968.735 }, { "epoch": 0.8197138314785374, "grad_norm": 1.299083948135376, "learning_rate": 3.926332466168878e-06, "loss": 0.3844, "num_input_tokens_seen": 20102193136, "step": 5156, "train_runtime": 205191.7128, "train_tokens_per_second": 97967.861 }, { "epoch": 0.8198728139904611, "grad_norm": 0.21592552959918976, "learning_rate": 3.919603515618522e-06, "loss": 0.3744, "num_input_tokens_seen": 20106082353, "step": 5157, "train_runtime": 205232.112, "train_tokens_per_second": 97967.526 }, { "epoch": 0.8200317965023848, "grad_norm": 0.2094532996416092, "learning_rate": 3.912879845558528e-06, "loss": 0.3831, "num_input_tokens_seen": 20110067016, "step": 5158, "train_runtime": 205270.2636, "train_tokens_per_second": 97968.73 }, { "epoch": 0.8201907790143085, "grad_norm": 0.2620615065097809, "learning_rate": 3.906161457673113e-06, "loss": 0.3897, "num_input_tokens_seen": 20113960737, "step": 5159, "train_runtime": 205307.9235, "train_tokens_per_second": 97969.725 }, { "epoch": 0.8203497615262321, "grad_norm": 0.20158647000789642, "learning_rate": 3.899448353645208e-06, "loss": 0.3734, "num_input_tokens_seen": 20117903395, "step": 5160, "train_runtime": 205346.8624, "train_tokens_per_second": 97970.347 }, { "epoch": 0.8205087440381558, "grad_norm": 0.21604181826114655, "learning_rate": 3.89274053515638e-06, "loss": 0.3893, "num_input_tokens_seen": 20121731712, "step": 5161, "train_runtime": 205387.5063, "train_tokens_per_second": 97969.599 }, { "epoch": 0.8206677265500795, "grad_norm": 0.17779377102851868, "learning_rate": 3.8860380038868965e-06, "loss": 0.3934, "num_input_tokens_seen": 20125672389, "step": 5162, "train_runtime": 205427.5994, "train_tokens_per_second": 97969.662 }, { "epoch": 0.8208267090620032, "grad_norm": 0.23032133281230927, "learning_rate": 3.8793407615156925e-06, "loss": 0.3785, "num_input_tokens_seen": 20129519523, "step": 5163, "train_runtime": 205467.068, "train_tokens_per_second": 97969.566 }, { "epoch": 0.8209856915739269, "grad_norm": 0.23353293538093567, "learning_rate": 3.872648809720386e-06, "loss": 0.3867, "num_input_tokens_seen": 20133351982, "step": 5164, "train_runtime": 205507.1953, "train_tokens_per_second": 97969.085 }, { "epoch": 0.8211446740858506, "grad_norm": 0.2041199803352356, "learning_rate": 3.865962150177263e-06, "loss": 0.3718, "num_input_tokens_seen": 20137274046, "step": 5165, "train_runtime": 205546.6164, "train_tokens_per_second": 97969.377 }, { "epoch": 0.8213036565977743, "grad_norm": 0.233102485537529, "learning_rate": 3.85928078456127e-06, "loss": 0.3847, "num_input_tokens_seen": 20141266316, "step": 5166, "train_runtime": 205586.2385, "train_tokens_per_second": 97969.915 }, { "epoch": 0.821462639109698, "grad_norm": 0.2412567138671875, "learning_rate": 3.852604714546063e-06, "loss": 0.3857, "num_input_tokens_seen": 20145238609, "step": 5167, "train_runtime": 205626.232, "train_tokens_per_second": 97970.178 }, { "epoch": 0.8216216216216217, "grad_norm": 0.2493503987789154, "learning_rate": 3.8459339418039335e-06, "loss": 0.3929, "num_input_tokens_seen": 20149212676, "step": 5168, "train_runtime": 205665.4902, "train_tokens_per_second": 97970.8 }, { "epoch": 0.8217806041335453, "grad_norm": 0.19728899002075195, "learning_rate": 3.839268468005872e-06, "loss": 0.374, "num_input_tokens_seen": 20153204473, "step": 5169, "train_runtime": 205706.2735, "train_tokens_per_second": 97970.782 }, { "epoch": 0.821939586645469, "grad_norm": 0.17840397357940674, "learning_rate": 3.83260829482153e-06, "loss": 0.3889, "num_input_tokens_seen": 20157118720, "step": 5170, "train_runtime": 205743.3133, "train_tokens_per_second": 97972.169 }, { "epoch": 0.8220985691573927, "grad_norm": 0.17910632491111755, "learning_rate": 3.825953423919232e-06, "loss": 0.3969, "num_input_tokens_seen": 20160928124, "step": 5171, "train_runtime": 205783.0225, "train_tokens_per_second": 97971.776 }, { "epoch": 0.8222575516693164, "grad_norm": 0.2263697385787964, "learning_rate": 3.819303856965983e-06, "loss": 0.3939, "num_input_tokens_seen": 20164734882, "step": 5172, "train_runtime": 205823.2047, "train_tokens_per_second": 97971.144 }, { "epoch": 0.82241653418124, "grad_norm": 0.3274329602718353, "learning_rate": 3.8126595956274435e-06, "loss": 0.3792, "num_input_tokens_seen": 20168735117, "step": 5173, "train_runtime": 205859.1599, "train_tokens_per_second": 97973.465 }, { "epoch": 0.8225755166931638, "grad_norm": 0.2021740823984146, "learning_rate": 3.8060206415679588e-06, "loss": 0.3813, "num_input_tokens_seen": 20172632267, "step": 5174, "train_runtime": 205897.5241, "train_tokens_per_second": 97974.137 }, { "epoch": 0.8227344992050875, "grad_norm": 0.22369155287742615, "learning_rate": 3.799386996450538e-06, "loss": 0.3827, "num_input_tokens_seen": 20176445577, "step": 5175, "train_runtime": 205937.0288, "train_tokens_per_second": 97973.86 }, { "epoch": 0.8228934817170112, "grad_norm": 0.20311027765274048, "learning_rate": 3.7927586619368657e-06, "loss": 0.3907, "num_input_tokens_seen": 20180474471, "step": 5176, "train_runtime": 205975.8506, "train_tokens_per_second": 97974.954 }, { "epoch": 0.8230524642289349, "grad_norm": 0.21803410351276398, "learning_rate": 3.7861356396872883e-06, "loss": 0.4017, "num_input_tokens_seen": 20184371761, "step": 5177, "train_runtime": 206015.8295, "train_tokens_per_second": 97974.859 }, { "epoch": 0.8232114467408586, "grad_norm": 0.21865856647491455, "learning_rate": 3.7795179313608347e-06, "loss": 0.3969, "num_input_tokens_seen": 20188162497, "step": 5178, "train_runtime": 206053.357, "train_tokens_per_second": 97975.412 }, { "epoch": 0.8233704292527821, "grad_norm": 0.2872954308986664, "learning_rate": 3.7729055386151817e-06, "loss": 0.3959, "num_input_tokens_seen": 20192136409, "step": 5179, "train_runtime": 206092.1192, "train_tokens_per_second": 97976.267 }, { "epoch": 0.8235294117647058, "grad_norm": 0.2641088366508484, "learning_rate": 3.7662984631066934e-06, "loss": 0.375, "num_input_tokens_seen": 20196062576, "step": 5180, "train_runtime": 206132.6322, "train_tokens_per_second": 97976.057 }, { "epoch": 0.8236883942766295, "grad_norm": 0.24677428603172302, "learning_rate": 3.7596967064904036e-06, "loss": 0.3799, "num_input_tokens_seen": 20200009562, "step": 5181, "train_runtime": 206173.1537, "train_tokens_per_second": 97975.945 }, { "epoch": 0.8238473767885532, "grad_norm": 0.18823225796222687, "learning_rate": 3.753100270419982e-06, "loss": 0.3901, "num_input_tokens_seen": 20203843604, "step": 5182, "train_runtime": 206210.8129, "train_tokens_per_second": 97976.645 }, { "epoch": 0.8240063593004769, "grad_norm": 0.18954317271709442, "learning_rate": 3.74650915654782e-06, "loss": 0.3788, "num_input_tokens_seen": 20207747177, "step": 5183, "train_runtime": 206246.7294, "train_tokens_per_second": 97978.51 }, { "epoch": 0.8241653418124006, "grad_norm": 0.23277990520000458, "learning_rate": 3.7399233665249138e-06, "loss": 0.3838, "num_input_tokens_seen": 20211668985, "step": 5184, "train_runtime": 206285.8178, "train_tokens_per_second": 97978.956 }, { "epoch": 0.8243243243243243, "grad_norm": 0.1934533566236496, "learning_rate": 3.7333429020009884e-06, "loss": 0.4059, "num_input_tokens_seen": 20215556666, "step": 5185, "train_runtime": 206327.4112, "train_tokens_per_second": 97978.046 }, { "epoch": 0.824483306836248, "grad_norm": 0.27134591341018677, "learning_rate": 3.726767764624381e-06, "loss": 0.3862, "num_input_tokens_seen": 20219421121, "step": 5186, "train_runtime": 206366.8796, "train_tokens_per_second": 97978.034 }, { "epoch": 0.8246422893481717, "grad_norm": 0.17281249165534973, "learning_rate": 3.7201979560421236e-06, "loss": 0.3753, "num_input_tokens_seen": 20223300949, "step": 5187, "train_runtime": 206403.1834, "train_tokens_per_second": 97979.598 }, { "epoch": 0.8248012718600954, "grad_norm": 0.1977144479751587, "learning_rate": 3.7136334778999137e-06, "loss": 0.3866, "num_input_tokens_seen": 20227146384, "step": 5188, "train_runtime": 206441.8904, "train_tokens_per_second": 97979.855 }, { "epoch": 0.824960254372019, "grad_norm": 0.19377538561820984, "learning_rate": 3.707074331842089e-06, "loss": 0.3785, "num_input_tokens_seen": 20231062992, "step": 5189, "train_runtime": 206481.9393, "train_tokens_per_second": 97979.819 }, { "epoch": 0.8251192368839427, "grad_norm": 0.20010051131248474, "learning_rate": 3.700520519511694e-06, "loss": 0.392, "num_input_tokens_seen": 20234913607, "step": 5190, "train_runtime": 206520.5904, "train_tokens_per_second": 97980.127 }, { "epoch": 0.8252782193958664, "grad_norm": 0.4207363426685333, "learning_rate": 3.693972042550392e-06, "loss": 0.3882, "num_input_tokens_seen": 20238777468, "step": 5191, "train_runtime": 206558.3145, "train_tokens_per_second": 97980.938 }, { "epoch": 0.8254372019077901, "grad_norm": 0.18985822796821594, "learning_rate": 3.6874289025985385e-06, "loss": 0.3819, "num_input_tokens_seen": 20242542730, "step": 5192, "train_runtime": 206597.271, "train_tokens_per_second": 97980.688 }, { "epoch": 0.8255961844197138, "grad_norm": 0.18206429481506348, "learning_rate": 3.680891101295142e-06, "loss": 0.377, "num_input_tokens_seen": 20246584402, "step": 5193, "train_runtime": 206636.4658, "train_tokens_per_second": 97981.662 }, { "epoch": 0.8257551669316375, "grad_norm": 0.2522641718387604, "learning_rate": 3.674358640277878e-06, "loss": 0.3925, "num_input_tokens_seen": 20250361567, "step": 5194, "train_runtime": 206672.2771, "train_tokens_per_second": 97982.96 }, { "epoch": 0.8259141494435612, "grad_norm": 0.19350731372833252, "learning_rate": 3.667831521183082e-06, "loss": 0.3809, "num_input_tokens_seen": 20254212841, "step": 5195, "train_runtime": 206712.0671, "train_tokens_per_second": 97982.731 }, { "epoch": 0.8260731319554849, "grad_norm": 0.36270391941070557, "learning_rate": 3.6613097456457467e-06, "loss": 0.3932, "num_input_tokens_seen": 20258159673, "step": 5196, "train_runtime": 206752.4406, "train_tokens_per_second": 97982.687 }, { "epoch": 0.8262321144674086, "grad_norm": 0.17472580075263977, "learning_rate": 3.6547933152995317e-06, "loss": 0.3953, "num_input_tokens_seen": 20262146540, "step": 5197, "train_runtime": 206792.4823, "train_tokens_per_second": 97982.994 }, { "epoch": 0.8263910969793322, "grad_norm": 0.2523878514766693, "learning_rate": 3.6482822317767617e-06, "loss": 0.3934, "num_input_tokens_seen": 20265944178, "step": 5198, "train_runtime": 206830.6834, "train_tokens_per_second": 97983.258 }, { "epoch": 0.8265500794912559, "grad_norm": 0.19867458939552307, "learning_rate": 3.6417764967084155e-06, "loss": 0.3841, "num_input_tokens_seen": 20269832462, "step": 5199, "train_runtime": 206868.2858, "train_tokens_per_second": 97984.243 }, { "epoch": 0.8267090620031796, "grad_norm": 0.26160579919815063, "learning_rate": 3.635276111724123e-06, "loss": 0.3776, "num_input_tokens_seen": 20273763579, "step": 5200, "train_runtime": 206909.1774, "train_tokens_per_second": 97983.878 }, { "epoch": 0.8268680445151033, "grad_norm": 0.18590901792049408, "learning_rate": 3.6287810784522064e-06, "loss": 0.3844, "num_input_tokens_seen": 20277663668, "step": 5201, "train_runtime": 207043.3625, "train_tokens_per_second": 97939.212 }, { "epoch": 0.827027027027027, "grad_norm": 0.2601257562637329, "learning_rate": 3.6222913985196034e-06, "loss": 0.3848, "num_input_tokens_seen": 20281450757, "step": 5202, "train_runtime": 207081.4772, "train_tokens_per_second": 97939.473 }, { "epoch": 0.8271860095389507, "grad_norm": 0.19125595688819885, "learning_rate": 3.615807073551947e-06, "loss": 0.3819, "num_input_tokens_seen": 20285392623, "step": 5203, "train_runtime": 207121.6658, "train_tokens_per_second": 97939.501 }, { "epoch": 0.8273449920508744, "grad_norm": 0.2044553905725479, "learning_rate": 3.6093281051735123e-06, "loss": 0.3865, "num_input_tokens_seen": 20289351701, "step": 5204, "train_runtime": 207158.32, "train_tokens_per_second": 97941.283 }, { "epoch": 0.8275039745627981, "grad_norm": 0.20917882025241852, "learning_rate": 3.602854495007224e-06, "loss": 0.3763, "num_input_tokens_seen": 20293276926, "step": 5205, "train_runtime": 207196.1643, "train_tokens_per_second": 97942.339 }, { "epoch": 0.8276629570747218, "grad_norm": 0.3297644853591919, "learning_rate": 3.596386244674696e-06, "loss": 0.3969, "num_input_tokens_seen": 20297028927, "step": 5206, "train_runtime": 207239.5332, "train_tokens_per_second": 97939.947 }, { "epoch": 0.8278219395866455, "grad_norm": 0.28105878829956055, "learning_rate": 3.589923355796157e-06, "loss": 0.3817, "num_input_tokens_seen": 20300848532, "step": 5207, "train_runtime": 207279.2208, "train_tokens_per_second": 97939.622 }, { "epoch": 0.8279809220985691, "grad_norm": 0.26154738664627075, "learning_rate": 3.583465829990537e-06, "loss": 0.3855, "num_input_tokens_seen": 20304957991, "step": 5208, "train_runtime": 207319.9727, "train_tokens_per_second": 97940.192 }, { "epoch": 0.8281399046104928, "grad_norm": 0.2094544768333435, "learning_rate": 3.5770136688753842e-06, "loss": 0.3932, "num_input_tokens_seen": 20308785330, "step": 5209, "train_runtime": 207360.0764, "train_tokens_per_second": 97939.708 }, { "epoch": 0.8282988871224165, "grad_norm": 0.19028520584106445, "learning_rate": 3.5705668740669275e-06, "loss": 0.3807, "num_input_tokens_seen": 20312785921, "step": 5210, "train_runtime": 207399.9079, "train_tokens_per_second": 97940.188 }, { "epoch": 0.8284578696343402, "grad_norm": 0.2844293713569641, "learning_rate": 3.5641254471800445e-06, "loss": 0.3874, "num_input_tokens_seen": 20316777935, "step": 5211, "train_runtime": 207439.5799, "train_tokens_per_second": 97940.701 }, { "epoch": 0.8286168521462639, "grad_norm": 0.21255207061767578, "learning_rate": 3.55768938982825e-06, "loss": 0.3774, "num_input_tokens_seen": 20320646188, "step": 5212, "train_runtime": 207476.4563, "train_tokens_per_second": 97941.938 }, { "epoch": 0.8287758346581876, "grad_norm": 0.21432389318943024, "learning_rate": 3.551258703623758e-06, "loss": 0.3884, "num_input_tokens_seen": 20324570166, "step": 5213, "train_runtime": 207516.9242, "train_tokens_per_second": 97941.747 }, { "epoch": 0.8289348171701113, "grad_norm": 0.181437149643898, "learning_rate": 3.5448333901773904e-06, "loss": 0.3933, "num_input_tokens_seen": 20328347199, "step": 5214, "train_runtime": 207557.8287, "train_tokens_per_second": 97940.643 }, { "epoch": 0.829093799682035, "grad_norm": 0.21516376733779907, "learning_rate": 3.5384134510986514e-06, "loss": 0.3867, "num_input_tokens_seen": 20332230196, "step": 5215, "train_runtime": 207599.4574, "train_tokens_per_second": 97939.708 }, { "epoch": 0.8292527821939587, "grad_norm": 0.271716833114624, "learning_rate": 3.5319988879956856e-06, "loss": 0.3715, "num_input_tokens_seen": 20336058591, "step": 5216, "train_runtime": 207639.3885, "train_tokens_per_second": 97939.311 }, { "epoch": 0.8294117647058824, "grad_norm": 0.19628432393074036, "learning_rate": 3.5255897024753014e-06, "loss": 0.3944, "num_input_tokens_seen": 20339985613, "step": 5217, "train_runtime": 207680.1694, "train_tokens_per_second": 97938.988 }, { "epoch": 0.829570747217806, "grad_norm": 0.24106846749782562, "learning_rate": 3.5191858961429484e-06, "loss": 0.3808, "num_input_tokens_seen": 20343867933, "step": 5218, "train_runtime": 207719.1822, "train_tokens_per_second": 97939.284 }, { "epoch": 0.8297297297297297, "grad_norm": 0.21421277523040771, "learning_rate": 3.5127874706027352e-06, "loss": 0.3881, "num_input_tokens_seen": 20347686136, "step": 5219, "train_runtime": 207756.1023, "train_tokens_per_second": 97940.257 }, { "epoch": 0.8298887122416534, "grad_norm": 0.2224472314119339, "learning_rate": 3.506394427457427e-06, "loss": 0.3877, "num_input_tokens_seen": 20351544837, "step": 5220, "train_runtime": 207793.5901, "train_tokens_per_second": 97941.158 }, { "epoch": 0.8300476947535771, "grad_norm": 0.25978943705558777, "learning_rate": 3.500006768308431e-06, "loss": 0.3881, "num_input_tokens_seen": 20355513095, "step": 5221, "train_runtime": 207832.8039, "train_tokens_per_second": 97941.772 }, { "epoch": 0.8302066772655008, "grad_norm": 0.2036219835281372, "learning_rate": 3.4936244947558206e-06, "loss": 0.3674, "num_input_tokens_seen": 20359469082, "step": 5222, "train_runtime": 207872.5556, "train_tokens_per_second": 97942.073 }, { "epoch": 0.8303656597774245, "grad_norm": 0.19491928815841675, "learning_rate": 3.487247608398289e-06, "loss": 0.3878, "num_input_tokens_seen": 20363339157, "step": 5223, "train_runtime": 207911.5639, "train_tokens_per_second": 97942.312 }, { "epoch": 0.8305246422893482, "grad_norm": 0.21787935495376587, "learning_rate": 3.4808761108332284e-06, "loss": 0.389, "num_input_tokens_seen": 20367215562, "step": 5224, "train_runtime": 207948.9713, "train_tokens_per_second": 97943.334 }, { "epoch": 0.8306836248012719, "grad_norm": 0.20948022603988647, "learning_rate": 3.4745100036566334e-06, "loss": 0.3818, "num_input_tokens_seen": 20371041302, "step": 5225, "train_runtime": 207988.4969, "train_tokens_per_second": 97943.115 }, { "epoch": 0.8308426073131956, "grad_norm": 0.20994833111763, "learning_rate": 3.468149288463174e-06, "loss": 0.3922, "num_input_tokens_seen": 20374973076, "step": 5226, "train_runtime": 208028.9984, "train_tokens_per_second": 97942.947 }, { "epoch": 0.8310015898251193, "grad_norm": 0.20434023439884186, "learning_rate": 3.4617939668461744e-06, "loss": 0.3887, "num_input_tokens_seen": 20378837205, "step": 5227, "train_runtime": 208067.2533, "train_tokens_per_second": 97943.511 }, { "epoch": 0.8311605723370429, "grad_norm": 0.24684570729732513, "learning_rate": 3.455444040397579e-06, "loss": 0.378, "num_input_tokens_seen": 20382799785, "step": 5228, "train_runtime": 208106.0118, "train_tokens_per_second": 97944.31 }, { "epoch": 0.8313195548489666, "grad_norm": 0.24330642819404602, "learning_rate": 3.4490995107080203e-06, "loss": 0.3761, "num_input_tokens_seen": 20386729668, "step": 5229, "train_runtime": 208144.3688, "train_tokens_per_second": 97945.142 }, { "epoch": 0.8314785373608903, "grad_norm": 0.21169713139533997, "learning_rate": 3.442760379366736e-06, "loss": 0.3794, "num_input_tokens_seen": 20390707507, "step": 5230, "train_runtime": 208181.9079, "train_tokens_per_second": 97946.588 }, { "epoch": 0.831637519872814, "grad_norm": 0.2718225121498108, "learning_rate": 3.4364266479616573e-06, "loss": 0.3733, "num_input_tokens_seen": 20394424421, "step": 5231, "train_runtime": 208219.943, "train_tokens_per_second": 97946.547 }, { "epoch": 0.8317965023847377, "grad_norm": 0.21406398713588715, "learning_rate": 3.430098318079322e-06, "loss": 0.3892, "num_input_tokens_seen": 20398449863, "step": 5232, "train_runtime": 208260.9251, "train_tokens_per_second": 97946.602 }, { "epoch": 0.8319554848966614, "grad_norm": 0.21654601395130157, "learning_rate": 3.423775391304937e-06, "loss": 0.3768, "num_input_tokens_seen": 20402247111, "step": 5233, "train_runtime": 208300.0466, "train_tokens_per_second": 97946.436 }, { "epoch": 0.8321144674085851, "grad_norm": 0.25365257263183594, "learning_rate": 3.417457869222357e-06, "loss": 0.3823, "num_input_tokens_seen": 20406184444, "step": 5234, "train_runtime": 208341.5692, "train_tokens_per_second": 97945.813 }, { "epoch": 0.8322734499205088, "grad_norm": 0.17739221453666687, "learning_rate": 3.4111457534140607e-06, "loss": 0.3779, "num_input_tokens_seen": 20409971676, "step": 5235, "train_runtime": 208378.9256, "train_tokens_per_second": 97946.429 }, { "epoch": 0.8324324324324325, "grad_norm": 0.25517943501472473, "learning_rate": 3.4048390454612082e-06, "loss": 0.3792, "num_input_tokens_seen": 20413922443, "step": 5236, "train_runtime": 208418.8438, "train_tokens_per_second": 97946.625 }, { "epoch": 0.8325914149443561, "grad_norm": 0.2184903919696808, "learning_rate": 3.39853774694357e-06, "loss": 0.3826, "num_input_tokens_seen": 20417689157, "step": 5237, "train_runtime": 208457.4525, "train_tokens_per_second": 97946.554 }, { "epoch": 0.8327503974562798, "grad_norm": 0.18411318957805634, "learning_rate": 3.392241859439585e-06, "loss": 0.3899, "num_input_tokens_seen": 20421592027, "step": 5238, "train_runtime": 208499.1498, "train_tokens_per_second": 97945.685 }, { "epoch": 0.8329093799682035, "grad_norm": 0.2504538297653198, "learning_rate": 3.3859513845263256e-06, "loss": 0.3831, "num_input_tokens_seen": 20425530311, "step": 5239, "train_runtime": 208538.6993, "train_tokens_per_second": 97945.995 }, { "epoch": 0.8330683624801272, "grad_norm": 0.2083476185798645, "learning_rate": 3.379666323779515e-06, "loss": 0.3865, "num_input_tokens_seen": 20429452444, "step": 5240, "train_runtime": 208577.1899, "train_tokens_per_second": 97946.724 }, { "epoch": 0.8332273449920509, "grad_norm": 0.18176311254501343, "learning_rate": 3.3733866787735104e-06, "loss": 0.3905, "num_input_tokens_seen": 20433243892, "step": 5241, "train_runtime": 208619.3852, "train_tokens_per_second": 97945.087 }, { "epoch": 0.8333863275039746, "grad_norm": 0.2278287410736084, "learning_rate": 3.3671124510813197e-06, "loss": 0.391, "num_input_tokens_seen": 20437132888, "step": 5242, "train_runtime": 208658.9133, "train_tokens_per_second": 97945.171 }, { "epoch": 0.8335453100158983, "grad_norm": 0.22766365110874176, "learning_rate": 3.360843642274597e-06, "loss": 0.3864, "num_input_tokens_seen": 20441057959, "step": 5243, "train_runtime": 208698.833, "train_tokens_per_second": 97945.243 }, { "epoch": 0.833704292527822, "grad_norm": 0.19623462855815887, "learning_rate": 3.354580253923631e-06, "loss": 0.3888, "num_input_tokens_seen": 20444878253, "step": 5244, "train_runtime": 208738.0571, "train_tokens_per_second": 97945.14 }, { "epoch": 0.8338632750397457, "grad_norm": 0.23647336661815643, "learning_rate": 3.348322287597361e-06, "loss": 0.3769, "num_input_tokens_seen": 20448797697, "step": 5245, "train_runtime": 208779.6479, "train_tokens_per_second": 97944.402 }, { "epoch": 0.8340222575516694, "grad_norm": 0.29725927114486694, "learning_rate": 3.3420697448633494e-06, "loss": 0.3805, "num_input_tokens_seen": 20452729341, "step": 5246, "train_runtime": 208817.7582, "train_tokens_per_second": 97945.354 }, { "epoch": 0.834181240063593, "grad_norm": 0.1991494596004486, "learning_rate": 3.3358226272878353e-06, "loss": 0.3921, "num_input_tokens_seen": 20456599250, "step": 5247, "train_runtime": 208856.2494, "train_tokens_per_second": 97945.833 }, { "epoch": 0.8343402225755167, "grad_norm": 0.20347891747951508, "learning_rate": 3.329580936435661e-06, "loss": 0.3801, "num_input_tokens_seen": 20460513340, "step": 5248, "train_runtime": 208895.7566, "train_tokens_per_second": 97946.046 }, { "epoch": 0.8344992050874404, "grad_norm": 0.21067239344120026, "learning_rate": 3.323344673870332e-06, "loss": 0.3888, "num_input_tokens_seen": 20464387420, "step": 5249, "train_runtime": 208935.1993, "train_tokens_per_second": 97946.098 }, { "epoch": 0.834658187599364, "grad_norm": 0.2039824277162552, "learning_rate": 3.3171138411539932e-06, "loss": 0.3836, "num_input_tokens_seen": 20468315803, "step": 5250, "train_runtime": 208974.1654, "train_tokens_per_second": 97946.633 }, { "epoch": 0.8348171701112878, "grad_norm": 0.3055916130542755, "learning_rate": 3.3108884398474065e-06, "loss": 0.389, "num_input_tokens_seen": 20472165146, "step": 5251, "train_runtime": 209015.1343, "train_tokens_per_second": 97945.851 }, { "epoch": 0.8349761526232115, "grad_norm": 0.3490515351295471, "learning_rate": 3.3046684715100112e-06, "loss": 0.395, "num_input_tokens_seen": 20475933616, "step": 5252, "train_runtime": 209055.9142, "train_tokens_per_second": 97944.771 }, { "epoch": 0.8351351351351352, "grad_norm": 0.1896062195301056, "learning_rate": 3.2984539376998495e-06, "loss": 0.3774, "num_input_tokens_seen": 20479977754, "step": 5253, "train_runtime": 209094.7992, "train_tokens_per_second": 97945.897 }, { "epoch": 0.8352941176470589, "grad_norm": 0.2246100753545761, "learning_rate": 3.292244839973635e-06, "loss": 0.3802, "num_input_tokens_seen": 20483983457, "step": 5254, "train_runtime": 209135.1214, "train_tokens_per_second": 97946.167 }, { "epoch": 0.8354531001589826, "grad_norm": 0.29664576053619385, "learning_rate": 3.2860411798866897e-06, "loss": 0.3848, "num_input_tokens_seen": 20487775773, "step": 5255, "train_runtime": 209174.0752, "train_tokens_per_second": 97946.056 }, { "epoch": 0.8356120826709063, "grad_norm": 0.2504778802394867, "learning_rate": 3.2798429589929874e-06, "loss": 0.3884, "num_input_tokens_seen": 20491701867, "step": 5256, "train_runtime": 209212.123, "train_tokens_per_second": 97947.01 }, { "epoch": 0.8357710651828298, "grad_norm": 0.2658427357673645, "learning_rate": 3.27365017884515e-06, "loss": 0.3881, "num_input_tokens_seen": 20495643386, "step": 5257, "train_runtime": 209250.4685, "train_tokens_per_second": 97947.897 }, { "epoch": 0.8359300476947535, "grad_norm": 0.21281060576438904, "learning_rate": 3.267462840994409e-06, "loss": 0.3973, "num_input_tokens_seen": 20499615783, "step": 5258, "train_runtime": 209290.7398, "train_tokens_per_second": 97948.031 }, { "epoch": 0.8360890302066772, "grad_norm": 0.3742836117744446, "learning_rate": 3.261280946990658e-06, "loss": 0.3911, "num_input_tokens_seen": 20503440601, "step": 5259, "train_runtime": 209331.3404, "train_tokens_per_second": 97947.305 }, { "epoch": 0.8362480127186009, "grad_norm": 0.18731461465358734, "learning_rate": 3.2551044983824138e-06, "loss": 0.3875, "num_input_tokens_seen": 20507339034, "step": 5260, "train_runtime": 209370.0768, "train_tokens_per_second": 97947.803 }, { "epoch": 0.8364069952305246, "grad_norm": 0.22431977093219757, "learning_rate": 3.2489334967168384e-06, "loss": 0.3794, "num_input_tokens_seen": 20511266861, "step": 5261, "train_runtime": 209409.2755, "train_tokens_per_second": 97948.225 }, { "epoch": 0.8365659777424483, "grad_norm": 0.2650987207889557, "learning_rate": 3.242767943539721e-06, "loss": 0.3962, "num_input_tokens_seen": 20515281885, "step": 5262, "train_runtime": 209449.9334, "train_tokens_per_second": 97948.381 }, { "epoch": 0.836724960254372, "grad_norm": 0.3122248351573944, "learning_rate": 3.2366078403954946e-06, "loss": 0.3847, "num_input_tokens_seen": 20519086724, "step": 5263, "train_runtime": 209487.4458, "train_tokens_per_second": 97949.004 }, { "epoch": 0.8368839427662957, "grad_norm": 0.2457035481929779, "learning_rate": 3.2304531888272092e-06, "loss": 0.39, "num_input_tokens_seen": 20522931684, "step": 5264, "train_runtime": 209525.7025, "train_tokens_per_second": 97949.471 }, { "epoch": 0.8370429252782194, "grad_norm": 0.20259487628936768, "learning_rate": 3.224303990376573e-06, "loss": 0.3847, "num_input_tokens_seen": 20526707191, "step": 5265, "train_runtime": 209566.1793, "train_tokens_per_second": 97948.568 }, { "epoch": 0.837201907790143, "grad_norm": 0.19129987061023712, "learning_rate": 3.2181602465839093e-06, "loss": 0.3918, "num_input_tokens_seen": 20530578479, "step": 5266, "train_runtime": 209605.9222, "train_tokens_per_second": 97948.466 }, { "epoch": 0.8373608903020667, "grad_norm": 0.19307132065296173, "learning_rate": 3.2120219589881844e-06, "loss": 0.3884, "num_input_tokens_seen": 20534517721, "step": 5267, "train_runtime": 209645.1819, "train_tokens_per_second": 97948.913 }, { "epoch": 0.8375198728139904, "grad_norm": 0.21192501485347748, "learning_rate": 3.205889129127007e-06, "loss": 0.3892, "num_input_tokens_seen": 20538392301, "step": 5268, "train_runtime": 209685.5393, "train_tokens_per_second": 97948.539 }, { "epoch": 0.8376788553259141, "grad_norm": 0.30566564202308655, "learning_rate": 3.1997617585365846e-06, "loss": 0.3931, "num_input_tokens_seen": 20542336188, "step": 5269, "train_runtime": 209725.2284, "train_tokens_per_second": 97948.808 }, { "epoch": 0.8378378378378378, "grad_norm": 0.21494057774543762, "learning_rate": 3.193639848751803e-06, "loss": 0.3831, "num_input_tokens_seen": 20546280952, "step": 5270, "train_runtime": 209766.3774, "train_tokens_per_second": 97948.4 }, { "epoch": 0.8379968203497615, "grad_norm": 0.1904999017715454, "learning_rate": 3.1875234013061444e-06, "loss": 0.3756, "num_input_tokens_seen": 20550284107, "step": 5271, "train_runtime": 209807.2469, "train_tokens_per_second": 97948.4 }, { "epoch": 0.8381558028616852, "grad_norm": 0.2568375766277313, "learning_rate": 3.1814124177317384e-06, "loss": 0.3862, "num_input_tokens_seen": 20554053451, "step": 5272, "train_runtime": 209848.1289, "train_tokens_per_second": 97947.28 }, { "epoch": 0.8383147853736089, "grad_norm": 0.24917174875736237, "learning_rate": 3.1753068995593438e-06, "loss": 0.3856, "num_input_tokens_seen": 20557925214, "step": 5273, "train_runtime": 209888.3427, "train_tokens_per_second": 97946.961 }, { "epoch": 0.8384737678855326, "grad_norm": 0.31876900792121887, "learning_rate": 3.1692068483183513e-06, "loss": 0.3806, "num_input_tokens_seen": 20561962219, "step": 5274, "train_runtime": 209928.1145, "train_tokens_per_second": 97947.634 }, { "epoch": 0.8386327503974563, "grad_norm": 0.38744446635246277, "learning_rate": 3.1631122655367827e-06, "loss": 0.3915, "num_input_tokens_seen": 20565953975, "step": 5275, "train_runtime": 209967.6307, "train_tokens_per_second": 97948.212 }, { "epoch": 0.8387917329093799, "grad_norm": 0.5413846373558044, "learning_rate": 3.1570231527412785e-06, "loss": 0.3763, "num_input_tokens_seen": 20569706506, "step": 5276, "train_runtime": 210008.1258, "train_tokens_per_second": 97947.193 }, { "epoch": 0.8389507154213036, "grad_norm": 0.21135908365249634, "learning_rate": 3.150939511457127e-06, "loss": 0.3886, "num_input_tokens_seen": 20573673047, "step": 5277, "train_runtime": 210046.6053, "train_tokens_per_second": 97948.134 }, { "epoch": 0.8391096979332273, "grad_norm": 0.4884941577911377, "learning_rate": 3.144861343208233e-06, "loss": 0.3816, "num_input_tokens_seen": 20577737827, "step": 5278, "train_runtime": 210083.2708, "train_tokens_per_second": 97950.388 }, { "epoch": 0.839268680445151, "grad_norm": 0.19790226221084595, "learning_rate": 3.1387886495171354e-06, "loss": 0.3819, "num_input_tokens_seen": 20581505395, "step": 5279, "train_runtime": 210121.2741, "train_tokens_per_second": 97950.602 }, { "epoch": 0.8394276629570747, "grad_norm": 0.23809555172920227, "learning_rate": 3.1327214319050056e-06, "loss": 0.3766, "num_input_tokens_seen": 20585325862, "step": 5280, "train_runtime": 210160.6013, "train_tokens_per_second": 97950.452 }, { "epoch": 0.8395866454689984, "grad_norm": 0.18970046937465668, "learning_rate": 3.126659691891637e-06, "loss": 0.3874, "num_input_tokens_seen": 20589335615, "step": 5281, "train_runtime": 210201.4345, "train_tokens_per_second": 97950.5 }, { "epoch": 0.8397456279809221, "grad_norm": 0.20674487948417664, "learning_rate": 3.1206034309954473e-06, "loss": 0.3855, "num_input_tokens_seen": 20593219434, "step": 5282, "train_runtime": 210241.6589, "train_tokens_per_second": 97950.233 }, { "epoch": 0.8399046104928458, "grad_norm": 0.21229279041290283, "learning_rate": 3.1145526507334905e-06, "loss": 0.3815, "num_input_tokens_seen": 20597139891, "step": 5283, "train_runtime": 210282.3643, "train_tokens_per_second": 97949.916 }, { "epoch": 0.8400635930047695, "grad_norm": 0.1869606226682663, "learning_rate": 3.1085073526214426e-06, "loss": 0.3839, "num_input_tokens_seen": 20600943475, "step": 5284, "train_runtime": 210323.1908, "train_tokens_per_second": 97948.987 }, { "epoch": 0.8402225755166932, "grad_norm": 0.1769375503063202, "learning_rate": 3.102467538173612e-06, "loss": 0.3782, "num_input_tokens_seen": 20604907670, "step": 5285, "train_runtime": 210361.3188, "train_tokens_per_second": 97950.078 }, { "epoch": 0.8403815580286168, "grad_norm": 0.2249092310667038, "learning_rate": 3.0964332089029323e-06, "loss": 0.393, "num_input_tokens_seen": 20608859503, "step": 5286, "train_runtime": 210402.5621, "train_tokens_per_second": 97949.66 }, { "epoch": 0.8405405405405405, "grad_norm": 0.7364928126335144, "learning_rate": 3.0904043663209455e-06, "loss": 0.3846, "num_input_tokens_seen": 20612770587, "step": 5287, "train_runtime": 210438.7205, "train_tokens_per_second": 97951.416 }, { "epoch": 0.8406995230524642, "grad_norm": 0.2728542983531952, "learning_rate": 3.0843810119378534e-06, "loss": 0.3829, "num_input_tokens_seen": 20616683251, "step": 5288, "train_runtime": 210475.6226, "train_tokens_per_second": 97952.832 }, { "epoch": 0.8408585055643879, "grad_norm": 0.24157287180423737, "learning_rate": 3.07836314726245e-06, "loss": 0.3856, "num_input_tokens_seen": 20620532276, "step": 5289, "train_runtime": 210516.4732, "train_tokens_per_second": 97952.108 }, { "epoch": 0.8410174880763116, "grad_norm": 0.22646917402744293, "learning_rate": 3.072350773802171e-06, "loss": 0.3862, "num_input_tokens_seen": 20624534581, "step": 5290, "train_runtime": 210558.9487, "train_tokens_per_second": 97951.356 }, { "epoch": 0.8411764705882353, "grad_norm": 0.1949353814125061, "learning_rate": 3.066343893063081e-06, "loss": 0.3817, "num_input_tokens_seen": 20628337480, "step": 5291, "train_runtime": 210596.3516, "train_tokens_per_second": 97952.017 }, { "epoch": 0.841335453100159, "grad_norm": 0.17698481678962708, "learning_rate": 3.060342506549843e-06, "loss": 0.3699, "num_input_tokens_seen": 20632247032, "step": 5292, "train_runtime": 210636.7069, "train_tokens_per_second": 97951.812 }, { "epoch": 0.8414944356120827, "grad_norm": 0.2253551185131073, "learning_rate": 3.054346615765785e-06, "loss": 0.3783, "num_input_tokens_seen": 20636109746, "step": 5293, "train_runtime": 210676.8744, "train_tokens_per_second": 97951.471 }, { "epoch": 0.8416534181240064, "grad_norm": 0.23077109456062317, "learning_rate": 3.0483562222128197e-06, "loss": 0.3782, "num_input_tokens_seen": 20639919176, "step": 5294, "train_runtime": 210710.4274, "train_tokens_per_second": 97953.952 }, { "epoch": 0.8418124006359301, "grad_norm": 0.2525691092014313, "learning_rate": 3.042371327391502e-06, "loss": 0.3846, "num_input_tokens_seen": 20643859491, "step": 5295, "train_runtime": 210752.2453, "train_tokens_per_second": 97953.213 }, { "epoch": 0.8419713831478537, "grad_norm": 0.18395347893238068, "learning_rate": 3.0363919328010054e-06, "loss": 0.3906, "num_input_tokens_seen": 20647779959, "step": 5296, "train_runtime": 210793.3056, "train_tokens_per_second": 97952.731 }, { "epoch": 0.8421303656597774, "grad_norm": 0.2712642550468445, "learning_rate": 3.0304180399391284e-06, "loss": 0.3783, "num_input_tokens_seen": 20651662072, "step": 5297, "train_runtime": 210834.1345, "train_tokens_per_second": 97952.175 }, { "epoch": 0.8422893481717011, "grad_norm": 0.24976414442062378, "learning_rate": 3.024449650302294e-06, "loss": 0.3903, "num_input_tokens_seen": 20655550320, "step": 5298, "train_runtime": 210874.0804, "train_tokens_per_second": 97952.059 }, { "epoch": 0.8424483306836248, "grad_norm": 0.19823680818080902, "learning_rate": 3.018486765385528e-06, "loss": 0.3892, "num_input_tokens_seen": 20659452101, "step": 5299, "train_runtime": 210913.5155, "train_tokens_per_second": 97952.244 }, { "epoch": 0.8426073131955485, "grad_norm": 0.2226104736328125, "learning_rate": 3.0125293866824973e-06, "loss": 0.3666, "num_input_tokens_seen": 20663327273, "step": 5300, "train_runtime": 210952.0287, "train_tokens_per_second": 97952.731 }, { "epoch": 0.8427662957074722, "grad_norm": 0.20664285123348236, "learning_rate": 3.006577515685485e-06, "loss": 0.385, "num_input_tokens_seen": 20667320407, "step": 5301, "train_runtime": 210989.8923, "train_tokens_per_second": 97954.078 }, { "epoch": 0.8429252782193959, "grad_norm": 0.18855248391628265, "learning_rate": 3.0006311538853916e-06, "loss": 0.3768, "num_input_tokens_seen": 20671196687, "step": 5302, "train_runtime": 211028.9122, "train_tokens_per_second": 97954.335 }, { "epoch": 0.8430842607313196, "grad_norm": 0.2508607506752014, "learning_rate": 2.994690302771741e-06, "loss": 0.3767, "num_input_tokens_seen": 20674994569, "step": 5303, "train_runtime": 211070.941, "train_tokens_per_second": 97952.823 }, { "epoch": 0.8432432432432433, "grad_norm": 0.18623454868793488, "learning_rate": 2.988754963832677e-06, "loss": 0.3764, "num_input_tokens_seen": 20679013954, "step": 5304, "train_runtime": 211109.9428, "train_tokens_per_second": 97953.766 }, { "epoch": 0.8434022257551669, "grad_norm": 0.2232726663351059, "learning_rate": 2.9828251385549484e-06, "loss": 0.3878, "num_input_tokens_seen": 20682887783, "step": 5305, "train_runtime": 211149.3171, "train_tokens_per_second": 97953.846 }, { "epoch": 0.8435612082670906, "grad_norm": 0.2518492639064789, "learning_rate": 2.97690082842394e-06, "loss": 0.3631, "num_input_tokens_seen": 20686720562, "step": 5306, "train_runtime": 211185.0236, "train_tokens_per_second": 97955.434 }, { "epoch": 0.8437201907790143, "grad_norm": 0.22250951826572418, "learning_rate": 2.970982034923653e-06, "loss": 0.3872, "num_input_tokens_seen": 20690702403, "step": 5307, "train_runtime": 211226.0028, "train_tokens_per_second": 97955.281 }, { "epoch": 0.843879173290938, "grad_norm": 1.5273343324661255, "learning_rate": 2.9650687595367017e-06, "loss": 0.3837, "num_input_tokens_seen": 20694659128, "step": 5308, "train_runtime": 211264.7028, "train_tokens_per_second": 97956.066 }, { "epoch": 0.8440381558028617, "grad_norm": 0.27237918972969055, "learning_rate": 2.9591610037443265e-06, "loss": 0.3843, "num_input_tokens_seen": 20698502483, "step": 5309, "train_runtime": 211306.168, "train_tokens_per_second": 97955.032 }, { "epoch": 0.8441971383147854, "grad_norm": 0.5626935362815857, "learning_rate": 2.9532587690263567e-06, "loss": 0.3893, "num_input_tokens_seen": 20702334652, "step": 5310, "train_runtime": 211343.4623, "train_tokens_per_second": 97955.879 }, { "epoch": 0.8443561208267091, "grad_norm": 0.35518574714660645, "learning_rate": 2.9473620568612884e-06, "loss": 0.3837, "num_input_tokens_seen": 20706299517, "step": 5311, "train_runtime": 211379.5166, "train_tokens_per_second": 97957.928 }, { "epoch": 0.8445151033386328, "grad_norm": 0.216736301779747, "learning_rate": 2.94147086872619e-06, "loss": 0.3894, "num_input_tokens_seen": 20710222459, "step": 5312, "train_runtime": 211415.9902, "train_tokens_per_second": 97959.584 }, { "epoch": 0.8446740858505565, "grad_norm": 0.19239796698093414, "learning_rate": 2.9355852060967636e-06, "loss": 0.3713, "num_input_tokens_seen": 20714078834, "step": 5313, "train_runtime": 211454.6675, "train_tokens_per_second": 97959.904 }, { "epoch": 0.8448330683624802, "grad_norm": 0.19951583445072174, "learning_rate": 2.9297050704473304e-06, "loss": 0.3837, "num_input_tokens_seen": 20718049541, "step": 5314, "train_runtime": 211494.462, "train_tokens_per_second": 97960.246 }, { "epoch": 0.8449920508744038, "grad_norm": 0.23691263794898987, "learning_rate": 2.9238304632508153e-06, "loss": 0.3837, "num_input_tokens_seen": 20722005367, "step": 5315, "train_runtime": 211534.8927, "train_tokens_per_second": 97960.223 }, { "epoch": 0.8451510333863275, "grad_norm": 0.3650696575641632, "learning_rate": 2.917961385978779e-06, "loss": 0.379, "num_input_tokens_seen": 20725905477, "step": 5316, "train_runtime": 211574.9534, "train_tokens_per_second": 97960.109 }, { "epoch": 0.8453100158982512, "grad_norm": 0.20759758353233337, "learning_rate": 2.9120978401013715e-06, "loss": 0.3728, "num_input_tokens_seen": 20729776683, "step": 5317, "train_runtime": 211615.683, "train_tokens_per_second": 97959.548 }, { "epoch": 0.8454689984101749, "grad_norm": 0.263753741979599, "learning_rate": 2.906239827087373e-06, "loss": 0.3851, "num_input_tokens_seen": 20733805173, "step": 5318, "train_runtime": 211656.0833, "train_tokens_per_second": 97959.883 }, { "epoch": 0.8456279809220986, "grad_norm": 0.61348557472229, "learning_rate": 2.9003873484041767e-06, "loss": 0.3823, "num_input_tokens_seen": 20737714157, "step": 5319, "train_runtime": 211696.6573, "train_tokens_per_second": 97959.573 }, { "epoch": 0.8457869634340223, "grad_norm": 0.22470775246620178, "learning_rate": 2.8945404055177844e-06, "loss": 0.3717, "num_input_tokens_seen": 20741644125, "step": 5320, "train_runtime": 211738.3503, "train_tokens_per_second": 97958.844 }, { "epoch": 0.845945945945946, "grad_norm": 0.22024652361869812, "learning_rate": 2.888698999892822e-06, "loss": 0.3821, "num_input_tokens_seen": 20745541558, "step": 5321, "train_runtime": 211778.2483, "train_tokens_per_second": 97958.793 }, { "epoch": 0.8461049284578697, "grad_norm": 0.17628848552703857, "learning_rate": 2.8828631329925084e-06, "loss": 0.3845, "num_input_tokens_seen": 20749432964, "step": 5322, "train_runtime": 211815.9936, "train_tokens_per_second": 97959.708 }, { "epoch": 0.8462639109697934, "grad_norm": 0.1980583816766739, "learning_rate": 2.877032806278693e-06, "loss": 0.3913, "num_input_tokens_seen": 20753267923, "step": 5323, "train_runtime": 211853.8152, "train_tokens_per_second": 97960.322 }, { "epoch": 0.8464228934817171, "grad_norm": 0.27596622705459595, "learning_rate": 2.8712080212118297e-06, "loss": 0.3678, "num_input_tokens_seen": 20757043061, "step": 5324, "train_runtime": 211893.8504, "train_tokens_per_second": 97959.629 }, { "epoch": 0.8465818759936407, "grad_norm": 0.32846763730049133, "learning_rate": 2.8653887792509883e-06, "loss": 0.3901, "num_input_tokens_seen": 20761089206, "step": 5325, "train_runtime": 211933.956, "train_tokens_per_second": 97960.183 }, { "epoch": 0.8467408585055644, "grad_norm": 0.19709976017475128, "learning_rate": 2.8595750818538463e-06, "loss": 0.3845, "num_input_tokens_seen": 20765014009, "step": 5326, "train_runtime": 211974.7162, "train_tokens_per_second": 97959.862 }, { "epoch": 0.846899841017488, "grad_norm": 0.27665045857429504, "learning_rate": 2.8537669304767006e-06, "loss": 0.3768, "num_input_tokens_seen": 20768830887, "step": 5327, "train_runtime": 212013.2668, "train_tokens_per_second": 97960.053 }, { "epoch": 0.8470588235294118, "grad_norm": 0.25247955322265625, "learning_rate": 2.84796432657444e-06, "loss": 0.3823, "num_input_tokens_seen": 20772758295, "step": 5328, "train_runtime": 212054.2505, "train_tokens_per_second": 97959.641 }, { "epoch": 0.8472178060413355, "grad_norm": 0.19234730303287506, "learning_rate": 2.8421672716005794e-06, "loss": 0.3776, "num_input_tokens_seen": 20776684332, "step": 5329, "train_runtime": 212093.8333, "train_tokens_per_second": 97959.87 }, { "epoch": 0.8473767885532592, "grad_norm": 0.19460493326187134, "learning_rate": 2.8363757670072433e-06, "loss": 0.3955, "num_input_tokens_seen": 20780457432, "step": 5330, "train_runtime": 212131.8183, "train_tokens_per_second": 97960.116 }, { "epoch": 0.8475357710651829, "grad_norm": 0.7494669556617737, "learning_rate": 2.8305898142451633e-06, "loss": 0.3889, "num_input_tokens_seen": 20784373513, "step": 5331, "train_runtime": 212172.1857, "train_tokens_per_second": 97959.935 }, { "epoch": 0.8476947535771066, "grad_norm": 0.29111412167549133, "learning_rate": 2.824809414763682e-06, "loss": 0.3797, "num_input_tokens_seen": 20788280427, "step": 5332, "train_runtime": 212213.2598, "train_tokens_per_second": 97959.385 }, { "epoch": 0.8478537360890303, "grad_norm": 0.17126193642616272, "learning_rate": 2.819034570010734e-06, "loss": 0.3869, "num_input_tokens_seen": 20792222379, "step": 5333, "train_runtime": 212253.8641, "train_tokens_per_second": 97959.217 }, { "epoch": 0.8480127186009538, "grad_norm": 0.23878073692321777, "learning_rate": 2.8132652814328973e-06, "loss": 0.3951, "num_input_tokens_seen": 20796121780, "step": 5334, "train_runtime": 212292.9484, "train_tokens_per_second": 97959.55 }, { "epoch": 0.8481717011128775, "grad_norm": 0.2072075456380844, "learning_rate": 2.807501550475325e-06, "loss": 0.3827, "num_input_tokens_seen": 20800044507, "step": 5335, "train_runtime": 212334.4424, "train_tokens_per_second": 97958.882 }, { "epoch": 0.8483306836248012, "grad_norm": 0.22586718201637268, "learning_rate": 2.801743378581795e-06, "loss": 0.3707, "num_input_tokens_seen": 20803866986, "step": 5336, "train_runtime": 212373.9694, "train_tokens_per_second": 97958.648 }, { "epoch": 0.8484896661367249, "grad_norm": 0.19755814969539642, "learning_rate": 2.7959907671946893e-06, "loss": 0.3925, "num_input_tokens_seen": 20807814429, "step": 5337, "train_runtime": 212407.6824, "train_tokens_per_second": 97961.685 }, { "epoch": 0.8486486486486486, "grad_norm": 0.44355252385139465, "learning_rate": 2.7902437177549893e-06, "loss": 0.3841, "num_input_tokens_seen": 20811645028, "step": 5338, "train_runtime": 212446.3065, "train_tokens_per_second": 97961.906 }, { "epoch": 0.8488076311605723, "grad_norm": 0.23561957478523254, "learning_rate": 2.7845022317023016e-06, "loss": 0.3935, "num_input_tokens_seen": 20815572450, "step": 5339, "train_runtime": 212487.8363, "train_tokens_per_second": 97961.242 }, { "epoch": 0.848966613672496, "grad_norm": 0.20656229555606842, "learning_rate": 2.7787663104748204e-06, "loss": 0.4026, "num_input_tokens_seen": 20819549561, "step": 5340, "train_runtime": 212525.487, "train_tokens_per_second": 97962.601 }, { "epoch": 0.8491255961844197, "grad_norm": 0.2739180028438568, "learning_rate": 2.7730359555093565e-06, "loss": 0.3912, "num_input_tokens_seen": 20823486003, "step": 5341, "train_runtime": 212565.458, "train_tokens_per_second": 97962.699 }, { "epoch": 0.8492845786963434, "grad_norm": 0.2666744291782379, "learning_rate": 2.7673111682413184e-06, "loss": 0.3949, "num_input_tokens_seen": 20827389862, "step": 5342, "train_runtime": 212607.4702, "train_tokens_per_second": 97961.703 }, { "epoch": 0.8494435612082671, "grad_norm": 0.290984183549881, "learning_rate": 2.7615919501047303e-06, "loss": 0.3751, "num_input_tokens_seen": 20831310110, "step": 5343, "train_runtime": 212645.9407, "train_tokens_per_second": 97962.416 }, { "epoch": 0.8496025437201907, "grad_norm": 0.23804205656051636, "learning_rate": 2.755878302532219e-06, "loss": 0.3852, "num_input_tokens_seen": 20835248386, "step": 5344, "train_runtime": 212686.2811, "train_tokens_per_second": 97962.352 }, { "epoch": 0.8497615262321144, "grad_norm": 0.27718862891197205, "learning_rate": 2.750170226955004e-06, "loss": 0.3697, "num_input_tokens_seen": 20839010814, "step": 5345, "train_runtime": 212724.9571, "train_tokens_per_second": 97962.228 }, { "epoch": 0.8499205087440381, "grad_norm": 0.20806364715099335, "learning_rate": 2.7444677248029237e-06, "loss": 0.37, "num_input_tokens_seen": 20843017368, "step": 5346, "train_runtime": 212764.2376, "train_tokens_per_second": 97962.974 }, { "epoch": 0.8500794912559618, "grad_norm": 0.23480871319770813, "learning_rate": 2.7387707975044113e-06, "loss": 0.3768, "num_input_tokens_seen": 20846816515, "step": 5347, "train_runtime": 212802.1972, "train_tokens_per_second": 97963.352 }, { "epoch": 0.8502384737678855, "grad_norm": 0.236563578248024, "learning_rate": 2.7330794464865056e-06, "loss": 0.3955, "num_input_tokens_seen": 20850718100, "step": 5348, "train_runtime": 212840.8118, "train_tokens_per_second": 97963.91 }, { "epoch": 0.8503974562798092, "grad_norm": 0.21873490512371063, "learning_rate": 2.727393673174855e-06, "loss": 0.3913, "num_input_tokens_seen": 20854709010, "step": 5349, "train_runtime": 212881.2507, "train_tokens_per_second": 97964.048 }, { "epoch": 0.8505564387917329, "grad_norm": 0.21353091299533844, "learning_rate": 2.7217134789937095e-06, "loss": 0.3758, "num_input_tokens_seen": 20858675124, "step": 5350, "train_runtime": 212923.0822, "train_tokens_per_second": 97963.428 }, { "epoch": 0.8507154213036566, "grad_norm": 0.2893266975879669, "learning_rate": 2.7160388653659053e-06, "loss": 0.3731, "num_input_tokens_seen": 20862564451, "step": 5351, "train_runtime": 212961.7363, "train_tokens_per_second": 97963.91 }, { "epoch": 0.8508744038155803, "grad_norm": 0.30915552377700806, "learning_rate": 2.7103698337128973e-06, "loss": 0.3885, "num_input_tokens_seen": 20866367278, "step": 5352, "train_runtime": 213000.6009, "train_tokens_per_second": 97963.889 }, { "epoch": 0.851033386327504, "grad_norm": 0.21142958104610443, "learning_rate": 2.704706385454742e-06, "loss": 0.3846, "num_input_tokens_seen": 20870249161, "step": 5353, "train_runtime": 213041.0416, "train_tokens_per_second": 97963.514 }, { "epoch": 0.8511923688394276, "grad_norm": 0.1894015669822693, "learning_rate": 2.6990485220100894e-06, "loss": 0.3834, "num_input_tokens_seen": 20874212699, "step": 5354, "train_runtime": 213080.8942, "train_tokens_per_second": 97963.793 }, { "epoch": 0.8513513513513513, "grad_norm": 0.16939257085323334, "learning_rate": 2.6933962447962006e-06, "loss": 0.3767, "num_input_tokens_seen": 20878212159, "step": 5355, "train_runtime": 213119.3482, "train_tokens_per_second": 97964.884 }, { "epoch": 0.851510333863275, "grad_norm": 0.3211209177970886, "learning_rate": 2.687749555228919e-06, "loss": 0.3798, "num_input_tokens_seen": 20881996760, "step": 5356, "train_runtime": 213158.9965, "train_tokens_per_second": 97964.417 }, { "epoch": 0.8516693163751987, "grad_norm": 0.30017921328544617, "learning_rate": 2.682108454722715e-06, "loss": 0.3847, "num_input_tokens_seen": 20885967767, "step": 5357, "train_runtime": 213199.2087, "train_tokens_per_second": 97964.565 }, { "epoch": 0.8518282988871224, "grad_norm": 0.776901364326477, "learning_rate": 2.6764729446906307e-06, "loss": 0.3748, "num_input_tokens_seen": 20889986570, "step": 5358, "train_runtime": 213237.1322, "train_tokens_per_second": 97965.989 }, { "epoch": 0.8519872813990461, "grad_norm": 0.20125310122966766, "learning_rate": 2.6708430265443287e-06, "loss": 0.3847, "num_input_tokens_seen": 20893789177, "step": 5359, "train_runtime": 213277.6233, "train_tokens_per_second": 97965.219 }, { "epoch": 0.8521462639109698, "grad_norm": 0.2218085378408432, "learning_rate": 2.6652187016940638e-06, "loss": 0.3831, "num_input_tokens_seen": 20897619186, "step": 5360, "train_runtime": 213319.3997, "train_tokens_per_second": 97963.988 }, { "epoch": 0.8523052464228935, "grad_norm": 0.22910158336162567, "learning_rate": 2.65959997154869e-06, "loss": 0.3876, "num_input_tokens_seen": 20901685412, "step": 5361, "train_runtime": 213359.3502, "train_tokens_per_second": 97964.703 }, { "epoch": 0.8524642289348172, "grad_norm": 0.1784042865037918, "learning_rate": 2.6539868375156596e-06, "loss": 0.3801, "num_input_tokens_seen": 20905651750, "step": 5362, "train_runtime": 213399.1288, "train_tokens_per_second": 97965.029 }, { "epoch": 0.8526232114467409, "grad_norm": 0.19914738833904266, "learning_rate": 2.6483793010010195e-06, "loss": 0.3887, "num_input_tokens_seen": 20909488331, "step": 5363, "train_runtime": 213438.8034, "train_tokens_per_second": 97964.794 }, { "epoch": 0.8527821939586645, "grad_norm": 0.20174120366573334, "learning_rate": 2.642777363409418e-06, "loss": 0.388, "num_input_tokens_seen": 20913350097, "step": 5364, "train_runtime": 213478.2087, "train_tokens_per_second": 97964.8 }, { "epoch": 0.8529411764705882, "grad_norm": 0.18230365216732025, "learning_rate": 2.637181026144106e-06, "loss": 0.3944, "num_input_tokens_seen": 20917295455, "step": 5365, "train_runtime": 213516.8464, "train_tokens_per_second": 97965.551 }, { "epoch": 0.8531001589825119, "grad_norm": 0.1953483521938324, "learning_rate": 2.6315902906069195e-06, "loss": 0.3899, "num_input_tokens_seen": 20921205144, "step": 5366, "train_runtime": 213553.5346, "train_tokens_per_second": 97967.028 }, { "epoch": 0.8532591414944356, "grad_norm": 0.1995704174041748, "learning_rate": 2.6260051581983055e-06, "loss": 0.3852, "num_input_tokens_seen": 20925017981, "step": 5367, "train_runtime": 213593.5707, "train_tokens_per_second": 97966.516 }, { "epoch": 0.8534181240063593, "grad_norm": 0.21114088594913483, "learning_rate": 2.6204256303172998e-06, "loss": 0.3784, "num_input_tokens_seen": 20928975243, "step": 5368, "train_runtime": 213632.305, "train_tokens_per_second": 97967.277 }, { "epoch": 0.853577106518283, "grad_norm": 0.24960783123970032, "learning_rate": 2.6148517083615292e-06, "loss": 0.3913, "num_input_tokens_seen": 20932762644, "step": 5369, "train_runtime": 213675.818, "train_tokens_per_second": 97965.052 }, { "epoch": 0.8537360890302067, "grad_norm": 0.1928403079509735, "learning_rate": 2.6092833937272214e-06, "loss": 0.3949, "num_input_tokens_seen": 20936559113, "step": 5370, "train_runtime": 213713.43, "train_tokens_per_second": 97965.575 }, { "epoch": 0.8538950715421304, "grad_norm": 0.2214960753917694, "learning_rate": 2.6037206878092078e-06, "loss": 0.3867, "num_input_tokens_seen": 20940552256, "step": 5371, "train_runtime": 213750.8001, "train_tokens_per_second": 97967.129 }, { "epoch": 0.8540540540540541, "grad_norm": 0.38310766220092773, "learning_rate": 2.5981635920008932e-06, "loss": 0.3896, "num_input_tokens_seen": 20944452127, "step": 5372, "train_runtime": 213787.3384, "train_tokens_per_second": 97968.628 }, { "epoch": 0.8542130365659777, "grad_norm": 0.22112956643104553, "learning_rate": 2.592612107694309e-06, "loss": 0.3945, "num_input_tokens_seen": 20948319405, "step": 5373, "train_runtime": 213826.6208, "train_tokens_per_second": 97968.716 }, { "epoch": 0.8543720190779014, "grad_norm": 0.18957659602165222, "learning_rate": 2.5870662362800504e-06, "loss": 0.3782, "num_input_tokens_seen": 20952100598, "step": 5374, "train_runtime": 213865.8656, "train_tokens_per_second": 97968.418 }, { "epoch": 0.8545310015898251, "grad_norm": 0.2370932549238205, "learning_rate": 2.5815259791473203e-06, "loss": 0.3889, "num_input_tokens_seen": 20956056702, "step": 5375, "train_runtime": 213905.3402, "train_tokens_per_second": 97968.834 }, { "epoch": 0.8546899841017488, "grad_norm": 0.19986455142498016, "learning_rate": 2.575991337683914e-06, "loss": 0.3776, "num_input_tokens_seen": 20959956797, "step": 5376, "train_runtime": 213944.8312, "train_tokens_per_second": 97968.98 }, { "epoch": 0.8548489666136725, "grad_norm": 0.19143414497375488, "learning_rate": 2.5704623132762227e-06, "loss": 0.4034, "num_input_tokens_seen": 20963873863, "step": 5377, "train_runtime": 213985.9634, "train_tokens_per_second": 97968.453 }, { "epoch": 0.8550079491255962, "grad_norm": 0.2132435441017151, "learning_rate": 2.5649389073092335e-06, "loss": 0.3861, "num_input_tokens_seen": 20967763427, "step": 5378, "train_runtime": 214024.2403, "train_tokens_per_second": 97969.106 }, { "epoch": 0.8551669316375199, "grad_norm": 0.21714802086353302, "learning_rate": 2.5594211211665024e-06, "loss": 0.3917, "num_input_tokens_seen": 20971760714, "step": 5379, "train_runtime": 214065.1979, "train_tokens_per_second": 97969.034 }, { "epoch": 0.8553259141494436, "grad_norm": 0.20063892006874084, "learning_rate": 2.5539089562302194e-06, "loss": 0.3811, "num_input_tokens_seen": 20975473729, "step": 5380, "train_runtime": 214103.3097, "train_tokens_per_second": 97968.937 }, { "epoch": 0.8554848966613673, "grad_norm": 0.3333151042461395, "learning_rate": 2.5484024138811237e-06, "loss": 0.3821, "num_input_tokens_seen": 20979417739, "step": 5381, "train_runtime": 214140.4626, "train_tokens_per_second": 97970.358 }, { "epoch": 0.855643879173291, "grad_norm": 0.22744181752204895, "learning_rate": 2.542901495498573e-06, "loss": 0.3773, "num_input_tokens_seen": 20983342140, "step": 5382, "train_runtime": 214181.3712, "train_tokens_per_second": 97969.968 }, { "epoch": 0.8558028616852146, "grad_norm": 0.22283945977687836, "learning_rate": 2.537406202460507e-06, "loss": 0.3876, "num_input_tokens_seen": 20987279454, "step": 5383, "train_runtime": 214219.8057, "train_tokens_per_second": 97970.771 }, { "epoch": 0.8559618441971383, "grad_norm": 0.1777832955121994, "learning_rate": 2.531916536143461e-06, "loss": 0.3852, "num_input_tokens_seen": 20991203171, "step": 5384, "train_runtime": 214258.0249, "train_tokens_per_second": 97971.608 }, { "epoch": 0.856120826709062, "grad_norm": 0.21431481838226318, "learning_rate": 2.5264324979225566e-06, "loss": 0.3559, "num_input_tokens_seen": 20994983434, "step": 5385, "train_runtime": 214295.5264, "train_tokens_per_second": 97972.103 }, { "epoch": 0.8562798092209857, "grad_norm": 1.3516467809677124, "learning_rate": 2.5209540891715037e-06, "loss": 0.382, "num_input_tokens_seen": 20998956547, "step": 5386, "train_runtime": 214335.8825, "train_tokens_per_second": 97972.193 }, { "epoch": 0.8564387917329094, "grad_norm": 0.19122956693172455, "learning_rate": 2.5154813112626047e-06, "loss": 0.3939, "num_input_tokens_seen": 21002904985, "step": 5387, "train_runtime": 214373.9596, "train_tokens_per_second": 97973.21 }, { "epoch": 0.8565977742448331, "grad_norm": 0.19648730754852295, "learning_rate": 2.510014165566757e-06, "loss": 0.382, "num_input_tokens_seen": 21006724903, "step": 5388, "train_runtime": 214413.5759, "train_tokens_per_second": 97972.924 }, { "epoch": 0.8567567567567568, "grad_norm": 0.1929999142885208, "learning_rate": 2.5045526534534347e-06, "loss": 0.3817, "num_input_tokens_seen": 21010656275, "step": 5389, "train_runtime": 214451.113, "train_tokens_per_second": 97974.107 }, { "epoch": 0.8569157392686805, "grad_norm": 0.21833954751491547, "learning_rate": 2.4990967762907132e-06, "loss": 0.3862, "num_input_tokens_seen": 21014579776, "step": 5390, "train_runtime": 214489.3182, "train_tokens_per_second": 97974.948 }, { "epoch": 0.8570747217806042, "grad_norm": 0.19668175280094147, "learning_rate": 2.493646535445257e-06, "loss": 0.3894, "num_input_tokens_seen": 21018470551, "step": 5391, "train_runtime": 214529.6406, "train_tokens_per_second": 97974.669 }, { "epoch": 0.8572337042925279, "grad_norm": 0.24054944515228271, "learning_rate": 2.488201932282297e-06, "loss": 0.3846, "num_input_tokens_seen": 21022370766, "step": 5392, "train_runtime": 214569.8018, "train_tokens_per_second": 97974.508 }, { "epoch": 0.8573926868044515, "grad_norm": 0.2885299324989319, "learning_rate": 2.4827629681656806e-06, "loss": 0.3825, "num_input_tokens_seen": 21026353437, "step": 5393, "train_runtime": 214608.3602, "train_tokens_per_second": 97975.463 }, { "epoch": 0.8575516693163752, "grad_norm": 0.2068479210138321, "learning_rate": 2.4773296444578293e-06, "loss": 0.3811, "num_input_tokens_seen": 21030142930, "step": 5394, "train_runtime": 214647.6418, "train_tokens_per_second": 97975.187 }, { "epoch": 0.8577106518282989, "grad_norm": 0.20480617880821228, "learning_rate": 2.4719019625197374e-06, "loss": 0.39, "num_input_tokens_seen": 21033992513, "step": 5395, "train_runtime": 214687.1493, "train_tokens_per_second": 97975.089 }, { "epoch": 0.8578696343402226, "grad_norm": 0.22402870655059814, "learning_rate": 2.4664799237110237e-06, "loss": 0.3901, "num_input_tokens_seen": 21037961841, "step": 5396, "train_runtime": 214726.6318, "train_tokens_per_second": 97975.559 }, { "epoch": 0.8580286168521463, "grad_norm": 0.41398483514785767, "learning_rate": 2.461063529389851e-06, "loss": 0.3769, "num_input_tokens_seen": 21041871575, "step": 5397, "train_runtime": 214765.6827, "train_tokens_per_second": 97975.949 }, { "epoch": 0.85818759936407, "grad_norm": 0.2813805937767029, "learning_rate": 2.4556527809130035e-06, "loss": 0.3913, "num_input_tokens_seen": 21045721453, "step": 5398, "train_runtime": 214805.321, "train_tokens_per_second": 97975.792 }, { "epoch": 0.8583465818759937, "grad_norm": 0.226951465010643, "learning_rate": 2.4502476796358224e-06, "loss": 0.3798, "num_input_tokens_seen": 21049700825, "step": 5399, "train_runtime": 214845.8705, "train_tokens_per_second": 97975.822 }, { "epoch": 0.8585055643879174, "grad_norm": 0.1820862740278244, "learning_rate": 2.4448482269122554e-06, "loss": 0.383, "num_input_tokens_seen": 21053559065, "step": 5400, "train_runtime": 214885.3676, "train_tokens_per_second": 97975.769 }, { "epoch": 0.8586645468998411, "grad_norm": 0.2702847123146057, "learning_rate": 2.4394544240948283e-06, "loss": 0.3898, "num_input_tokens_seen": 21057366894, "step": 5401, "train_runtime": 215032.9611, "train_tokens_per_second": 97926.229 }, { "epoch": 0.8588235294117647, "grad_norm": 0.23656409978866577, "learning_rate": 2.434066272534641e-06, "loss": 0.3801, "num_input_tokens_seen": 21061271816, "step": 5402, "train_runtime": 215070.6321, "train_tokens_per_second": 97927.233 }, { "epoch": 0.8589825119236884, "grad_norm": 0.18276070058345795, "learning_rate": 2.4286837735814e-06, "loss": 0.381, "num_input_tokens_seen": 21065206843, "step": 5403, "train_runtime": 215110.2229, "train_tokens_per_second": 97927.502 }, { "epoch": 0.859141494435612, "grad_norm": 0.21898336708545685, "learning_rate": 2.4233069285833714e-06, "loss": 0.3799, "num_input_tokens_seen": 21069117673, "step": 5404, "train_runtime": 215148.5821, "train_tokens_per_second": 97928.22 }, { "epoch": 0.8593004769475358, "grad_norm": 0.34318020939826965, "learning_rate": 2.4179357388874225e-06, "loss": 0.383, "num_input_tokens_seen": 21072888775, "step": 5405, "train_runtime": 215186.9189, "train_tokens_per_second": 97928.298 }, { "epoch": 0.8594594594594595, "grad_norm": 0.2827482521533966, "learning_rate": 2.4125702058390016e-06, "loss": 0.3781, "num_input_tokens_seen": 21076833732, "step": 5406, "train_runtime": 215224.806, "train_tokens_per_second": 97929.389 }, { "epoch": 0.8596184419713832, "grad_norm": 0.2076873928308487, "learning_rate": 2.4072103307821307e-06, "loss": 0.3774, "num_input_tokens_seen": 21080780710, "step": 5407, "train_runtime": 215264.2863, "train_tokens_per_second": 97929.764 }, { "epoch": 0.8597774244833069, "grad_norm": 0.17167864739894867, "learning_rate": 2.4018561150594282e-06, "loss": 0.3785, "num_input_tokens_seen": 21084634293, "step": 5408, "train_runtime": 215302.417, "train_tokens_per_second": 97930.319 }, { "epoch": 0.8599364069952306, "grad_norm": 0.1978985220193863, "learning_rate": 2.3965075600120763e-06, "loss": 0.3792, "num_input_tokens_seen": 21088409349, "step": 5409, "train_runtime": 215343.5546, "train_tokens_per_second": 97929.141 }, { "epoch": 0.8600953895071543, "grad_norm": 0.7768799066543579, "learning_rate": 2.3911646669798587e-06, "loss": 0.3875, "num_input_tokens_seen": 21092339769, "step": 5410, "train_runtime": 215383.3667, "train_tokens_per_second": 97929.288 }, { "epoch": 0.860254372019078, "grad_norm": 0.20146304368972778, "learning_rate": 2.3858274373011304e-06, "loss": 0.3823, "num_input_tokens_seen": 21096307685, "step": 5411, "train_runtime": 215421.2377, "train_tokens_per_second": 97930.491 }, { "epoch": 0.8604133545310015, "grad_norm": 0.4776744544506073, "learning_rate": 2.3804958723128323e-06, "loss": 0.3923, "num_input_tokens_seen": 21100224776, "step": 5412, "train_runtime": 215461.9727, "train_tokens_per_second": 97930.157 }, { "epoch": 0.8605723370429252, "grad_norm": 0.1860707551240921, "learning_rate": 2.3751699733504717e-06, "loss": 0.3939, "num_input_tokens_seen": 21104112641, "step": 5413, "train_runtime": 215502.0256, "train_tokens_per_second": 97929.997 }, { "epoch": 0.8607313195548489, "grad_norm": 0.20529977977275848, "learning_rate": 2.3698497417481673e-06, "loss": 0.3914, "num_input_tokens_seen": 21108096518, "step": 5414, "train_runtime": 215541.905, "train_tokens_per_second": 97930.361 }, { "epoch": 0.8608903020667726, "grad_norm": 0.4437759518623352, "learning_rate": 2.3645351788385867e-06, "loss": 0.3791, "num_input_tokens_seen": 21112055994, "step": 5415, "train_runtime": 215579.9407, "train_tokens_per_second": 97931.449 }, { "epoch": 0.8610492845786963, "grad_norm": 0.3122945725917816, "learning_rate": 2.3592262859529923e-06, "loss": 0.3761, "num_input_tokens_seen": 21115968587, "step": 5416, "train_runtime": 215620.9373, "train_tokens_per_second": 97930.975 }, { "epoch": 0.86120826709062, "grad_norm": 0.25868645310401917, "learning_rate": 2.353923064421229e-06, "loss": 0.3692, "num_input_tokens_seen": 21119918968, "step": 5417, "train_runtime": 215660.0699, "train_tokens_per_second": 97931.522 }, { "epoch": 0.8613672496025437, "grad_norm": 0.17989003658294678, "learning_rate": 2.3486255155717063e-06, "loss": 0.389, "num_input_tokens_seen": 21123807804, "step": 5418, "train_runtime": 215699.8795, "train_tokens_per_second": 97931.477 }, { "epoch": 0.8615262321144674, "grad_norm": 0.28123071789741516, "learning_rate": 2.343333640731435e-06, "loss": 0.3696, "num_input_tokens_seen": 21127675388, "step": 5419, "train_runtime": 215737.8869, "train_tokens_per_second": 97932.151 }, { "epoch": 0.8616852146263911, "grad_norm": 0.35691937804222107, "learning_rate": 2.3380474412259794e-06, "loss": 0.3821, "num_input_tokens_seen": 21131482439, "step": 5420, "train_runtime": 215776.7572, "train_tokens_per_second": 97932.153 }, { "epoch": 0.8618441971383148, "grad_norm": 0.17211580276489258, "learning_rate": 2.3327669183795144e-06, "loss": 0.3908, "num_input_tokens_seen": 21135503851, "step": 5421, "train_runtime": 215815.853, "train_tokens_per_second": 97933.046 }, { "epoch": 0.8620031796502384, "grad_norm": 0.27990585565567017, "learning_rate": 2.327492073514753e-06, "loss": 0.39, "num_input_tokens_seen": 21139355399, "step": 5422, "train_runtime": 215857.1628, "train_tokens_per_second": 97932.147 }, { "epoch": 0.8621621621621621, "grad_norm": 0.19911298155784607, "learning_rate": 2.3222229079530173e-06, "loss": 0.3815, "num_input_tokens_seen": 21143281047, "step": 5423, "train_runtime": 215895.236, "train_tokens_per_second": 97933.06 }, { "epoch": 0.8623211446740858, "grad_norm": 0.22581584751605988, "learning_rate": 2.316959423014198e-06, "loss": 0.382, "num_input_tokens_seen": 21147109574, "step": 5424, "train_runtime": 215935.9284, "train_tokens_per_second": 97932.335 }, { "epoch": 0.8624801271860095, "grad_norm": 0.24414943158626556, "learning_rate": 2.3117016200167486e-06, "loss": 0.389, "num_input_tokens_seen": 21151046220, "step": 5425, "train_runtime": 215976.5947, "train_tokens_per_second": 97932.122 }, { "epoch": 0.8626391096979332, "grad_norm": 0.24818052351474762, "learning_rate": 2.306449500277727e-06, "loss": 0.3778, "num_input_tokens_seen": 21155010062, "step": 5426, "train_runtime": 216017.1151, "train_tokens_per_second": 97932.102 }, { "epoch": 0.8627980922098569, "grad_norm": 0.19930778443813324, "learning_rate": 2.301203065112742e-06, "loss": 0.3825, "num_input_tokens_seen": 21158895792, "step": 5427, "train_runtime": 216059.0728, "train_tokens_per_second": 97931.068 }, { "epoch": 0.8629570747217806, "grad_norm": 0.2045861780643463, "learning_rate": 2.295962315835992e-06, "loss": 0.3846, "num_input_tokens_seen": 21162843478, "step": 5428, "train_runtime": 216099.9442, "train_tokens_per_second": 97930.814 }, { "epoch": 0.8631160572337043, "grad_norm": 0.1866108924150467, "learning_rate": 2.290727253760247e-06, "loss": 0.3734, "num_input_tokens_seen": 21166763189, "step": 5429, "train_runtime": 216135.8672, "train_tokens_per_second": 97932.673 }, { "epoch": 0.863275039745628, "grad_norm": 0.21525540947914124, "learning_rate": 2.2854978801968576e-06, "loss": 0.3923, "num_input_tokens_seen": 21170580515, "step": 5430, "train_runtime": 216177.5726, "train_tokens_per_second": 97931.438 }, { "epoch": 0.8634340222575516, "grad_norm": 0.21800599992275238, "learning_rate": 2.280274196455734e-06, "loss": 0.3894, "num_input_tokens_seen": 21174283376, "step": 5431, "train_runtime": 216218.3471, "train_tokens_per_second": 97930.095 }, { "epoch": 0.8635930047694753, "grad_norm": 0.26047971844673157, "learning_rate": 2.27505620384538e-06, "loss": 0.3701, "num_input_tokens_seen": 21178193920, "step": 5432, "train_runtime": 216257.7316, "train_tokens_per_second": 97930.343 }, { "epoch": 0.863751987281399, "grad_norm": 0.21115411818027496, "learning_rate": 2.2698439036728664e-06, "loss": 0.3932, "num_input_tokens_seen": 21182197773, "step": 5433, "train_runtime": 216297.5415, "train_tokens_per_second": 97930.83 }, { "epoch": 0.8639109697933227, "grad_norm": 0.19897110760211945, "learning_rate": 2.2646372972438324e-06, "loss": 0.3853, "num_input_tokens_seen": 21186080930, "step": 5434, "train_runtime": 216338.9296, "train_tokens_per_second": 97930.044 }, { "epoch": 0.8640699523052464, "grad_norm": 0.19544197618961334, "learning_rate": 2.2594363858625066e-06, "loss": 0.3712, "num_input_tokens_seen": 21189957454, "step": 5435, "train_runtime": 216378.5907, "train_tokens_per_second": 97930.01 }, { "epoch": 0.8642289348171701, "grad_norm": 0.21869981288909912, "learning_rate": 2.2542411708316614e-06, "loss": 0.3795, "num_input_tokens_seen": 21193907412, "step": 5436, "train_runtime": 216417.3828, "train_tokens_per_second": 97930.707 }, { "epoch": 0.8643879173290938, "grad_norm": 0.27167272567749023, "learning_rate": 2.249051653452686e-06, "loss": 0.3875, "num_input_tokens_seen": 21197916971, "step": 5437, "train_runtime": 216455.5003, "train_tokens_per_second": 97931.986 }, { "epoch": 0.8645468998410175, "grad_norm": 0.21046936511993408, "learning_rate": 2.2438678350254984e-06, "loss": 0.3952, "num_input_tokens_seen": 21201689997, "step": 5438, "train_runtime": 216496.4994, "train_tokens_per_second": 97930.867 }, { "epoch": 0.8647058823529412, "grad_norm": 0.19085454940795898, "learning_rate": 2.2386897168486133e-06, "loss": 0.3846, "num_input_tokens_seen": 21205711939, "step": 5439, "train_runtime": 216537.3706, "train_tokens_per_second": 97930.957 }, { "epoch": 0.8648648648648649, "grad_norm": 0.18617628514766693, "learning_rate": 2.2335173002191236e-06, "loss": 0.3793, "num_input_tokens_seen": 21209663902, "step": 5440, "train_runtime": 216576.2595, "train_tokens_per_second": 97931.62 }, { "epoch": 0.8650238473767885, "grad_norm": 0.22020259499549866, "learning_rate": 2.228350586432662e-06, "loss": 0.3933, "num_input_tokens_seen": 21213632445, "step": 5441, "train_runtime": 216615.0501, "train_tokens_per_second": 97932.403 }, { "epoch": 0.8651828298887122, "grad_norm": 0.2844475507736206, "learning_rate": 2.223189576783477e-06, "loss": 0.388, "num_input_tokens_seen": 21217390637, "step": 5442, "train_runtime": 216654.5082, "train_tokens_per_second": 97931.914 }, { "epoch": 0.8653418124006359, "grad_norm": 0.1874580979347229, "learning_rate": 2.218034272564343e-06, "loss": 0.3839, "num_input_tokens_seen": 21221419373, "step": 5443, "train_runtime": 216694.5574, "train_tokens_per_second": 97932.406 }, { "epoch": 0.8655007949125596, "grad_norm": 0.23004589974880219, "learning_rate": 2.2128846750666484e-06, "loss": 0.3856, "num_input_tokens_seen": 21225368944, "step": 5444, "train_runtime": 216734.9838, "train_tokens_per_second": 97932.362 }, { "epoch": 0.8656597774244833, "grad_norm": 0.22454437613487244, "learning_rate": 2.207740785580317e-06, "loss": 0.3892, "num_input_tokens_seen": 21229224694, "step": 5445, "train_runtime": 216773.1787, "train_tokens_per_second": 97932.894 }, { "epoch": 0.865818759936407, "grad_norm": 0.19598659873008728, "learning_rate": 2.202602605393861e-06, "loss": 0.3768, "num_input_tokens_seen": 21233130825, "step": 5446, "train_runtime": 216812.8979, "train_tokens_per_second": 97932.969 }, { "epoch": 0.8659777424483307, "grad_norm": 0.1839369237422943, "learning_rate": 2.1974701357943605e-06, "loss": 0.3914, "num_input_tokens_seen": 21237158835, "step": 5447, "train_runtime": 216851.602, "train_tokens_per_second": 97934.065 }, { "epoch": 0.8661367249602544, "grad_norm": 0.2090959995985031, "learning_rate": 2.192343378067452e-06, "loss": 0.3804, "num_input_tokens_seen": 21241072505, "step": 5448, "train_runtime": 216888.904, "train_tokens_per_second": 97935.266 }, { "epoch": 0.8662957074721781, "grad_norm": 0.23673658072948456, "learning_rate": 2.1872223334973684e-06, "loss": 0.3991, "num_input_tokens_seen": 21244949625, "step": 5449, "train_runtime": 216928.2977, "train_tokens_per_second": 97935.354 }, { "epoch": 0.8664546899841018, "grad_norm": 0.1685527265071869, "learning_rate": 2.1821070033668846e-06, "loss": 0.3875, "num_input_tokens_seen": 21248814146, "step": 5450, "train_runtime": 216969.4802, "train_tokens_per_second": 97934.576 }, { "epoch": 0.8666136724960254, "grad_norm": 0.20988741517066956, "learning_rate": 2.176997388957358e-06, "loss": 0.3899, "num_input_tokens_seen": 21252698884, "step": 5451, "train_runtime": 217006.9797, "train_tokens_per_second": 97935.554 }, { "epoch": 0.8667726550079491, "grad_norm": 0.20127882063388824, "learning_rate": 2.1718934915487072e-06, "loss": 0.3888, "num_input_tokens_seen": 21256479634, "step": 5452, "train_runtime": 217046.8204, "train_tokens_per_second": 97934.997 }, { "epoch": 0.8669316375198728, "grad_norm": 0.20172543823719025, "learning_rate": 2.1667953124194313e-06, "loss": 0.3833, "num_input_tokens_seen": 21260425721, "step": 5453, "train_runtime": 217086.2229, "train_tokens_per_second": 97935.398 }, { "epoch": 0.8670906200317965, "grad_norm": 0.2081916630268097, "learning_rate": 2.1617028528465783e-06, "loss": 0.3939, "num_input_tokens_seen": 21264377146, "step": 5454, "train_runtime": 217124.0132, "train_tokens_per_second": 97936.552 }, { "epoch": 0.8672496025437202, "grad_norm": 0.18275555968284607, "learning_rate": 2.156616114105775e-06, "loss": 0.3897, "num_input_tokens_seen": 21268190319, "step": 5455, "train_runtime": 217163.4605, "train_tokens_per_second": 97936.321 }, { "epoch": 0.8674085850556439, "grad_norm": 0.3497071862220764, "learning_rate": 2.151535097471219e-06, "loss": 0.3845, "num_input_tokens_seen": 21272021970, "step": 5456, "train_runtime": 217204.6207, "train_tokens_per_second": 97935.403 }, { "epoch": 0.8675675675675676, "grad_norm": 0.18559390306472778, "learning_rate": 2.1464598042156643e-06, "loss": 0.3848, "num_input_tokens_seen": 21275987113, "step": 5457, "train_runtime": 217240.8258, "train_tokens_per_second": 97937.333 }, { "epoch": 0.8677265500794913, "grad_norm": 0.19469183683395386, "learning_rate": 2.1413902356104408e-06, "loss": 0.3779, "num_input_tokens_seen": 21279807755, "step": 5458, "train_runtime": 217280.8735, "train_tokens_per_second": 97936.866 }, { "epoch": 0.867885532591415, "grad_norm": 0.25948184728622437, "learning_rate": 2.1363263929254278e-06, "loss": 0.389, "num_input_tokens_seen": 21283700150, "step": 5459, "train_runtime": 217320.5333, "train_tokens_per_second": 97936.904 }, { "epoch": 0.8680445151033387, "grad_norm": 0.22401899099349976, "learning_rate": 2.131268277429099e-06, "loss": 0.3865, "num_input_tokens_seen": 21287549931, "step": 5460, "train_runtime": 217359.1221, "train_tokens_per_second": 97937.228 }, { "epoch": 0.8682034976152623, "grad_norm": 0.20398743450641632, "learning_rate": 2.1262158903884666e-06, "loss": 0.4037, "num_input_tokens_seen": 21291520275, "step": 5461, "train_runtime": 217396.7488, "train_tokens_per_second": 97938.54 }, { "epoch": 0.868362480127186, "grad_norm": 0.20533449947834015, "learning_rate": 2.1211692330691173e-06, "loss": 0.3755, "num_input_tokens_seen": 21295420767, "step": 5462, "train_runtime": 217437.3896, "train_tokens_per_second": 97938.173 }, { "epoch": 0.8685214626391097, "grad_norm": 0.2951985001564026, "learning_rate": 2.1161283067352046e-06, "loss": 0.3798, "num_input_tokens_seen": 21299237253, "step": 5463, "train_runtime": 217479.604, "train_tokens_per_second": 97936.712 }, { "epoch": 0.8686804451510334, "grad_norm": 0.43929365277290344, "learning_rate": 2.1110931126494456e-06, "loss": 0.3892, "num_input_tokens_seen": 21303191171, "step": 5464, "train_runtime": 217520.1647, "train_tokens_per_second": 97936.627 }, { "epoch": 0.8688394276629571, "grad_norm": 0.21653862297534943, "learning_rate": 2.106063652073126e-06, "loss": 0.389, "num_input_tokens_seen": 21307161668, "step": 5465, "train_runtime": 217558.0444, "train_tokens_per_second": 97937.825 }, { "epoch": 0.8689984101748808, "grad_norm": 0.20720627903938293, "learning_rate": 2.101039926266077e-06, "loss": 0.3838, "num_input_tokens_seen": 21311112584, "step": 5466, "train_runtime": 217597.5909, "train_tokens_per_second": 97938.183 }, { "epoch": 0.8691573926868045, "grad_norm": 0.2163832187652588, "learning_rate": 2.0960219364867224e-06, "loss": 0.3694, "num_input_tokens_seen": 21314872770, "step": 5467, "train_runtime": 217636.6933, "train_tokens_per_second": 97937.864 }, { "epoch": 0.8693163751987282, "grad_norm": 0.21629633009433746, "learning_rate": 2.091009683992021e-06, "loss": 0.3815, "num_input_tokens_seen": 21318835833, "step": 5468, "train_runtime": 217677.7182, "train_tokens_per_second": 97937.612 }, { "epoch": 0.8694753577106519, "grad_norm": 0.25833770632743835, "learning_rate": 2.0860031700375095e-06, "loss": 0.3898, "num_input_tokens_seen": 21322719581, "step": 5469, "train_runtime": 217715.0324, "train_tokens_per_second": 97938.665 }, { "epoch": 0.8696343402225755, "grad_norm": 0.3152388036251068, "learning_rate": 2.081002395877285e-06, "loss": 0.3864, "num_input_tokens_seen": 21326521044, "step": 5470, "train_runtime": 217754.7724, "train_tokens_per_second": 97938.249 }, { "epoch": 0.8697933227344992, "grad_norm": 0.25766026973724365, "learning_rate": 2.0760073627640126e-06, "loss": 0.3868, "num_input_tokens_seen": 21330454471, "step": 5471, "train_runtime": 217794.9831, "train_tokens_per_second": 97938.227 }, { "epoch": 0.8699523052464229, "grad_norm": 0.2786926329135895, "learning_rate": 2.071018071948905e-06, "loss": 0.3798, "num_input_tokens_seen": 21334395770, "step": 5472, "train_runtime": 217833.6706, "train_tokens_per_second": 97938.926 }, { "epoch": 0.8701112877583466, "grad_norm": 0.21433477103710175, "learning_rate": 2.0660345246817426e-06, "loss": 0.3936, "num_input_tokens_seen": 21338268194, "step": 5473, "train_runtime": 217872.0352, "train_tokens_per_second": 97939.454 }, { "epoch": 0.8702702702702703, "grad_norm": 0.17787228524684906, "learning_rate": 2.061056722210872e-06, "loss": 0.3898, "num_input_tokens_seen": 21342231187, "step": 5474, "train_runtime": 217911.6868, "train_tokens_per_second": 97939.819 }, { "epoch": 0.870429252782194, "grad_norm": 0.22651204466819763, "learning_rate": 2.056084665783198e-06, "loss": 0.4016, "num_input_tokens_seen": 21346096328, "step": 5475, "train_runtime": 217948.4821, "train_tokens_per_second": 97941.019 }, { "epoch": 0.8705882352941177, "grad_norm": 0.20887084305286407, "learning_rate": 2.0511183566441912e-06, "loss": 0.401, "num_input_tokens_seen": 21349946362, "step": 5476, "train_runtime": 217987.8098, "train_tokens_per_second": 97941.01 }, { "epoch": 0.8707472178060414, "grad_norm": 0.22454001009464264, "learning_rate": 2.046157796037862e-06, "loss": 0.382, "num_input_tokens_seen": 21353851852, "step": 5477, "train_runtime": 218024.5238, "train_tokens_per_second": 97942.431 }, { "epoch": 0.8709062003179651, "grad_norm": 0.22567325830459595, "learning_rate": 2.041202985206814e-06, "loss": 0.3926, "num_input_tokens_seen": 21357801378, "step": 5478, "train_runtime": 218065.0923, "train_tokens_per_second": 97942.322 }, { "epoch": 0.8710651828298888, "grad_norm": 0.1892589032649994, "learning_rate": 2.0362539253921787e-06, "loss": 0.3869, "num_input_tokens_seen": 21361623534, "step": 5479, "train_runtime": 218106.4356, "train_tokens_per_second": 97941.28 }, { "epoch": 0.8712241653418124, "grad_norm": 0.22445464134216309, "learning_rate": 2.031310617833665e-06, "loss": 0.372, "num_input_tokens_seen": 21365574310, "step": 5480, "train_runtime": 218143.1342, "train_tokens_per_second": 97942.914 }, { "epoch": 0.871383147853736, "grad_norm": 0.2299535870552063, "learning_rate": 2.0263730637695404e-06, "loss": 0.3935, "num_input_tokens_seen": 21369394024, "step": 5481, "train_runtime": 218183.9738, "train_tokens_per_second": 97942.088 }, { "epoch": 0.8715421303656598, "grad_norm": 0.1998152732849121, "learning_rate": 2.021441264436616e-06, "loss": 0.3983, "num_input_tokens_seen": 21373426085, "step": 5482, "train_runtime": 218224.2713, "train_tokens_per_second": 97942.479 }, { "epoch": 0.8717011128775835, "grad_norm": 0.21151939034461975, "learning_rate": 2.016515221070289e-06, "loss": 0.3809, "num_input_tokens_seen": 21377364355, "step": 5483, "train_runtime": 218260.404, "train_tokens_per_second": 97944.309 }, { "epoch": 0.8718600953895072, "grad_norm": 0.1875341236591339, "learning_rate": 2.0115949349044787e-06, "loss": 0.3733, "num_input_tokens_seen": 21381247207, "step": 5484, "train_runtime": 218298.7493, "train_tokens_per_second": 97944.891 }, { "epoch": 0.8720190779014309, "grad_norm": 0.19138182699680328, "learning_rate": 2.0066804071717046e-06, "loss": 0.3803, "num_input_tokens_seen": 21385143707, "step": 5485, "train_runtime": 218337.1815, "train_tokens_per_second": 97945.497 }, { "epoch": 0.8721780604133546, "grad_norm": 0.27864572405815125, "learning_rate": 2.001771639103003e-06, "loss": 0.3873, "num_input_tokens_seen": 21389047566, "step": 5486, "train_runtime": 218376.8327, "train_tokens_per_second": 97945.589 }, { "epoch": 0.8723370429252782, "grad_norm": 0.23365211486816406, "learning_rate": 1.996868631927995e-06, "loss": 0.3761, "num_input_tokens_seen": 21392890439, "step": 5487, "train_runtime": 218415.6565, "train_tokens_per_second": 97945.774 }, { "epoch": 0.872496025437202, "grad_norm": 0.19418710470199585, "learning_rate": 1.99197138687485e-06, "loss": 0.38, "num_input_tokens_seen": 21396853445, "step": 5488, "train_runtime": 218454.8692, "train_tokens_per_second": 97946.333 }, { "epoch": 0.8726550079491256, "grad_norm": 0.25947943329811096, "learning_rate": 1.987079905170283e-06, "loss": 0.3704, "num_input_tokens_seen": 21400717812, "step": 5489, "train_runtime": 218496.124, "train_tokens_per_second": 97945.526 }, { "epoch": 0.8728139904610492, "grad_norm": 0.4147205650806427, "learning_rate": 1.982194188039585e-06, "loss": 0.3845, "num_input_tokens_seen": 21404676152, "step": 5490, "train_runtime": 218536.9509, "train_tokens_per_second": 97945.341 }, { "epoch": 0.8729729729729729, "grad_norm": 0.2540743052959442, "learning_rate": 1.97731423670659e-06, "loss": 0.3778, "num_input_tokens_seen": 21408662207, "step": 5491, "train_runtime": 218576.8991, "train_tokens_per_second": 97945.676 }, { "epoch": 0.8731319554848966, "grad_norm": 0.2091066539287567, "learning_rate": 1.9724400523936924e-06, "loss": 0.3894, "num_input_tokens_seen": 21412375097, "step": 5492, "train_runtime": 218615.087, "train_tokens_per_second": 97945.551 }, { "epoch": 0.8732909379968203, "grad_norm": 0.17987214028835297, "learning_rate": 1.9675716363218383e-06, "loss": 0.3773, "num_input_tokens_seen": 21416348199, "step": 5493, "train_runtime": 218653.9787, "train_tokens_per_second": 97946.3 }, { "epoch": 0.873449920508744, "grad_norm": 0.3429924249649048, "learning_rate": 1.962708989710543e-06, "loss": 0.3922, "num_input_tokens_seen": 21420237967, "step": 5494, "train_runtime": 218695.2024, "train_tokens_per_second": 97945.624 }, { "epoch": 0.8736089030206677, "grad_norm": 0.22186923027038574, "learning_rate": 1.957852113777847e-06, "loss": 0.3882, "num_input_tokens_seen": 21424186346, "step": 5495, "train_runtime": 218731.6885, "train_tokens_per_second": 97947.337 }, { "epoch": 0.8737678855325914, "grad_norm": 0.24488861858844757, "learning_rate": 1.9530010097403766e-06, "loss": 0.3848, "num_input_tokens_seen": 21428084467, "step": 5496, "train_runtime": 218770.3068, "train_tokens_per_second": 97947.865 }, { "epoch": 0.8739268680445151, "grad_norm": 0.2610284090042114, "learning_rate": 1.9481556788132933e-06, "loss": 0.3626, "num_input_tokens_seen": 21431971925, "step": 5497, "train_runtime": 218810.3597, "train_tokens_per_second": 97947.702 }, { "epoch": 0.8740858505564388, "grad_norm": 0.37733304500579834, "learning_rate": 1.9433161222103203e-06, "loss": 0.3802, "num_input_tokens_seen": 21435848795, "step": 5498, "train_runtime": 218850.2857, "train_tokens_per_second": 97947.548 }, { "epoch": 0.8742448330683624, "grad_norm": 0.24388083815574646, "learning_rate": 1.938482341143735e-06, "loss": 0.3736, "num_input_tokens_seen": 21439723908, "step": 5499, "train_runtime": 218888.7551, "train_tokens_per_second": 97948.037 }, { "epoch": 0.8744038155802861, "grad_norm": 0.2533378005027771, "learning_rate": 1.933654336824356e-06, "loss": 0.3841, "num_input_tokens_seen": 21443660133, "step": 5500, "train_runtime": 218930.2415, "train_tokens_per_second": 97947.456 }, { "epoch": 0.8745627980922098, "grad_norm": 0.21076276898384094, "learning_rate": 1.9288321104615786e-06, "loss": 0.3839, "num_input_tokens_seen": 21447595239, "step": 5501, "train_runtime": 218969.2103, "train_tokens_per_second": 97947.996 }, { "epoch": 0.8747217806041335, "grad_norm": 0.28193777799606323, "learning_rate": 1.9240156632633234e-06, "loss": 0.3856, "num_input_tokens_seen": 21451505667, "step": 5502, "train_runtime": 219006.8639, "train_tokens_per_second": 97949.011 }, { "epoch": 0.8748807631160572, "grad_norm": 0.30089476704597473, "learning_rate": 1.919204996436083e-06, "loss": 0.374, "num_input_tokens_seen": 21455382329, "step": 5503, "train_runtime": 219047.1135, "train_tokens_per_second": 97948.711 }, { "epoch": 0.8750397456279809, "grad_norm": 0.19731047749519348, "learning_rate": 1.914400111184897e-06, "loss": 0.3916, "num_input_tokens_seen": 21459369228, "step": 5504, "train_runtime": 219087.005, "train_tokens_per_second": 97949.074 }, { "epoch": 0.8751987281399046, "grad_norm": 0.17616118490695953, "learning_rate": 1.9096010087133438e-06, "loss": 0.3748, "num_input_tokens_seen": 21463293342, "step": 5505, "train_runtime": 219125.8909, "train_tokens_per_second": 97949.6 }, { "epoch": 0.8753577106518283, "grad_norm": 0.19979409873485565, "learning_rate": 1.904807690223584e-06, "loss": 0.3952, "num_input_tokens_seen": 21467149829, "step": 5506, "train_runtime": 219165.0485, "train_tokens_per_second": 97949.696 }, { "epoch": 0.875516693163752, "grad_norm": 0.2812233865261078, "learning_rate": 1.9000201569162907e-06, "loss": 0.3931, "num_input_tokens_seen": 21470983324, "step": 5507, "train_runtime": 219204.3866, "train_tokens_per_second": 97949.606 }, { "epoch": 0.8756756756756757, "grad_norm": 0.20881882309913635, "learning_rate": 1.8952384099907183e-06, "loss": 0.3945, "num_input_tokens_seen": 21474883728, "step": 5508, "train_runtime": 219243.4116, "train_tokens_per_second": 97949.962 }, { "epoch": 0.8758346581875993, "grad_norm": 0.20597830414772034, "learning_rate": 1.8904624506446561e-06, "loss": 0.3874, "num_input_tokens_seen": 21478798172, "step": 5509, "train_runtime": 219281.5554, "train_tokens_per_second": 97950.774 }, { "epoch": 0.875993640699523, "grad_norm": 0.21100851893424988, "learning_rate": 1.8856922800744525e-06, "loss": 0.395, "num_input_tokens_seen": 21482769019, "step": 5510, "train_runtime": 219319.6507, "train_tokens_per_second": 97951.866 }, { "epoch": 0.8761526232114467, "grad_norm": 0.20806585252285004, "learning_rate": 1.880927899474999e-06, "loss": 0.3902, "num_input_tokens_seen": 21486650873, "step": 5511, "train_runtime": 219360.643, "train_tokens_per_second": 97951.258 }, { "epoch": 0.8763116057233704, "grad_norm": 0.22064834833145142, "learning_rate": 1.8761693100397381e-06, "loss": 0.372, "num_input_tokens_seen": 21490583463, "step": 5512, "train_runtime": 219399.1849, "train_tokens_per_second": 97951.975 }, { "epoch": 0.8764705882352941, "grad_norm": 0.2185972034931183, "learning_rate": 1.8714165129606636e-06, "loss": 0.3863, "num_input_tokens_seen": 21494451124, "step": 5513, "train_runtime": 219440.3203, "train_tokens_per_second": 97951.238 }, { "epoch": 0.8766295707472178, "grad_norm": 0.20130029320716858, "learning_rate": 1.8666695094283176e-06, "loss": 0.3954, "num_input_tokens_seen": 21498353737, "step": 5514, "train_runtime": 219479.971, "train_tokens_per_second": 97951.324 }, { "epoch": 0.8767885532591415, "grad_norm": 0.20545199513435364, "learning_rate": 1.8619283006317905e-06, "loss": 0.383, "num_input_tokens_seen": 21502205177, "step": 5515, "train_runtime": 219518.7017, "train_tokens_per_second": 97951.587 }, { "epoch": 0.8769475357710652, "grad_norm": 0.21264685690402985, "learning_rate": 1.8571928877587214e-06, "loss": 0.3828, "num_input_tokens_seen": 21506192181, "step": 5516, "train_runtime": 219558.8128, "train_tokens_per_second": 97951.851 }, { "epoch": 0.8771065182829889, "grad_norm": 0.3001772463321686, "learning_rate": 1.8524632719953055e-06, "loss": 0.3833, "num_input_tokens_seen": 21510116586, "step": 5517, "train_runtime": 219599.7779, "train_tokens_per_second": 97951.45 }, { "epoch": 0.8772655007949126, "grad_norm": 0.2323330193758011, "learning_rate": 1.847739454526265e-06, "loss": 0.3835, "num_input_tokens_seen": 21514020396, "step": 5518, "train_runtime": 219639.5042, "train_tokens_per_second": 97951.507 }, { "epoch": 0.8774244833068362, "grad_norm": 0.2083505243062973, "learning_rate": 1.8430214365348918e-06, "loss": 0.3762, "num_input_tokens_seen": 21517935019, "step": 5519, "train_runtime": 219680.2735, "train_tokens_per_second": 97951.148 }, { "epoch": 0.8775834658187599, "grad_norm": 0.2618905305862427, "learning_rate": 1.8383092192030104e-06, "loss": 0.3922, "num_input_tokens_seen": 21521768757, "step": 5520, "train_runtime": 219716.8222, "train_tokens_per_second": 97952.303 }, { "epoch": 0.8777424483306836, "grad_norm": 0.21826618909835815, "learning_rate": 1.8336028037110014e-06, "loss": 0.3848, "num_input_tokens_seen": 21525657346, "step": 5521, "train_runtime": 219755.4256, "train_tokens_per_second": 97952.791 }, { "epoch": 0.8779014308426073, "grad_norm": 0.2591378390789032, "learning_rate": 1.828902191237794e-06, "loss": 0.3888, "num_input_tokens_seen": 21529555408, "step": 5522, "train_runtime": 219792.6266, "train_tokens_per_second": 97953.948 }, { "epoch": 0.878060413354531, "grad_norm": 0.21731270849704742, "learning_rate": 1.8242073829608436e-06, "loss": 0.3919, "num_input_tokens_seen": 21533634073, "step": 5523, "train_runtime": 219832.4873, "train_tokens_per_second": 97954.74 }, { "epoch": 0.8782193958664547, "grad_norm": 0.2557952105998993, "learning_rate": 1.8195183800561844e-06, "loss": 0.3857, "num_input_tokens_seen": 21537512637, "step": 5524, "train_runtime": 219871.1691, "train_tokens_per_second": 97955.147 }, { "epoch": 0.8783783783783784, "grad_norm": 0.2875441014766693, "learning_rate": 1.814835183698363e-06, "loss": 0.4051, "num_input_tokens_seen": 21541433532, "step": 5525, "train_runtime": 219912.0971, "train_tokens_per_second": 97954.746 }, { "epoch": 0.8785373608903021, "grad_norm": 0.29289957880973816, "learning_rate": 1.8101577950604936e-06, "loss": 0.3859, "num_input_tokens_seen": 21545415634, "step": 5526, "train_runtime": 219952.422, "train_tokens_per_second": 97954.892 }, { "epoch": 0.8786963434022258, "grad_norm": 0.2701798677444458, "learning_rate": 1.8054862153142365e-06, "loss": 0.3742, "num_input_tokens_seen": 21549254148, "step": 5527, "train_runtime": 219989.7019, "train_tokens_per_second": 97955.74 }, { "epoch": 0.8788553259141495, "grad_norm": 0.21089953184127808, "learning_rate": 1.8008204456297723e-06, "loss": 0.3938, "num_input_tokens_seen": 21553214867, "step": 5528, "train_runtime": 220029.5299, "train_tokens_per_second": 97956.01 }, { "epoch": 0.8790143084260731, "grad_norm": 0.2271532416343689, "learning_rate": 1.7961604871758603e-06, "loss": 0.3938, "num_input_tokens_seen": 21557115941, "step": 5529, "train_runtime": 220068.2436, "train_tokens_per_second": 97956.505 }, { "epoch": 0.8791732909379968, "grad_norm": 0.18575593829154968, "learning_rate": 1.7915063411197753e-06, "loss": 0.38, "num_input_tokens_seen": 21561020418, "step": 5530, "train_runtime": 220110.2188, "train_tokens_per_second": 97955.563 }, { "epoch": 0.8793322734499205, "grad_norm": 0.18323855102062225, "learning_rate": 1.7868580086273513e-06, "loss": 0.3849, "num_input_tokens_seen": 21564917687, "step": 5531, "train_runtime": 220150.9989, "train_tokens_per_second": 97955.121 }, { "epoch": 0.8794912559618442, "grad_norm": 0.286222368478775, "learning_rate": 1.7822154908629651e-06, "loss": 0.3722, "num_input_tokens_seen": 21568755738, "step": 5532, "train_runtime": 220188.8137, "train_tokens_per_second": 97955.729 }, { "epoch": 0.8796502384737679, "grad_norm": 0.30888631939888, "learning_rate": 1.777578788989534e-06, "loss": 0.3865, "num_input_tokens_seen": 21572692207, "step": 5533, "train_runtime": 220226.606, "train_tokens_per_second": 97956.794 }, { "epoch": 0.8798092209856916, "grad_norm": 0.37907639145851135, "learning_rate": 1.7729479041685233e-06, "loss": 0.3795, "num_input_tokens_seen": 21576559396, "step": 5534, "train_runtime": 220266.2354, "train_tokens_per_second": 97956.727 }, { "epoch": 0.8799682034976153, "grad_norm": 0.22294475138187408, "learning_rate": 1.768322837559927e-06, "loss": 0.3804, "num_input_tokens_seen": 21580498919, "step": 5535, "train_runtime": 220306.5904, "train_tokens_per_second": 97956.665 }, { "epoch": 0.880127186009539, "grad_norm": 0.24014078080654144, "learning_rate": 1.7637035903222965e-06, "loss": 0.3777, "num_input_tokens_seen": 21584476991, "step": 5536, "train_runtime": 220346.139, "train_tokens_per_second": 97957.137 }, { "epoch": 0.8802861685214627, "grad_norm": 0.1850401908159256, "learning_rate": 1.75909016361272e-06, "loss": 0.3793, "num_input_tokens_seen": 21588377705, "step": 5537, "train_runtime": 220387.1078, "train_tokens_per_second": 97956.627 }, { "epoch": 0.8804451510333863, "grad_norm": 0.3880978226661682, "learning_rate": 1.7544825585868313e-06, "loss": 0.3766, "num_input_tokens_seen": 21592219245, "step": 5538, "train_runtime": 220428.2931, "train_tokens_per_second": 97955.752 }, { "epoch": 0.88060413354531, "grad_norm": 0.22166012227535248, "learning_rate": 1.7498807763987989e-06, "loss": 0.3789, "num_input_tokens_seen": 21596214621, "step": 5539, "train_runtime": 220468.4186, "train_tokens_per_second": 97956.046 }, { "epoch": 0.8807631160572337, "grad_norm": 0.25856223702430725, "learning_rate": 1.7452848182013425e-06, "loss": 0.392, "num_input_tokens_seen": 21600152026, "step": 5540, "train_runtime": 220506.9601, "train_tokens_per_second": 97956.781 }, { "epoch": 0.8809220985691574, "grad_norm": 0.46367642283439636, "learning_rate": 1.7406946851457106e-06, "loss": 0.3925, "num_input_tokens_seen": 21603934025, "step": 5541, "train_runtime": 220545.0891, "train_tokens_per_second": 97956.994 }, { "epoch": 0.8810810810810811, "grad_norm": 0.23319227993488312, "learning_rate": 1.7361103783817e-06, "loss": 0.3825, "num_input_tokens_seen": 21607958850, "step": 5542, "train_runtime": 220585.1669, "train_tokens_per_second": 97957.443 }, { "epoch": 0.8812400635930048, "grad_norm": 0.3534557521343231, "learning_rate": 1.7315318990576478e-06, "loss": 0.3721, "num_input_tokens_seen": 21611830975, "step": 5543, "train_runtime": 220626.8494, "train_tokens_per_second": 97956.486 }, { "epoch": 0.8813990461049285, "grad_norm": 0.4338257908821106, "learning_rate": 1.726959248320434e-06, "loss": 0.383, "num_input_tokens_seen": 21615711294, "step": 5544, "train_runtime": 220667.7355, "train_tokens_per_second": 97955.921 }, { "epoch": 0.8815580286168522, "grad_norm": 0.1919676810503006, "learning_rate": 1.7223924273154778e-06, "loss": 0.3912, "num_input_tokens_seen": 21619657372, "step": 5545, "train_runtime": 220706.3887, "train_tokens_per_second": 97956.645 }, { "epoch": 0.8817170111287759, "grad_norm": 0.3236781060695648, "learning_rate": 1.7178314371867205e-06, "loss": 0.3775, "num_input_tokens_seen": 21623535580, "step": 5546, "train_runtime": 220746.8634, "train_tokens_per_second": 97956.253 }, { "epoch": 0.8818759936406996, "grad_norm": 0.3142209053039551, "learning_rate": 1.7132762790766753e-06, "loss": 0.3979, "num_input_tokens_seen": 21627372852, "step": 5547, "train_runtime": 220785.4928, "train_tokens_per_second": 97956.494 }, { "epoch": 0.8820349761526232, "grad_norm": 0.21305759251117706, "learning_rate": 1.708726954126369e-06, "loss": 0.3757, "num_input_tokens_seen": 21631337159, "step": 5548, "train_runtime": 220825.1282, "train_tokens_per_second": 97956.864 }, { "epoch": 0.8821939586645469, "grad_norm": 0.3090517818927765, "learning_rate": 1.7041834634753756e-06, "loss": 0.3906, "num_input_tokens_seen": 21635245339, "step": 5549, "train_runtime": 220861.4693, "train_tokens_per_second": 97958.442 }, { "epoch": 0.8823529411764706, "grad_norm": 0.2084796130657196, "learning_rate": 1.69964580826181e-06, "loss": 0.3828, "num_input_tokens_seen": 21639075576, "step": 5550, "train_runtime": 220906.9597, "train_tokens_per_second": 97955.608 }, { "epoch": 0.8825119236883943, "grad_norm": 0.2758156955242157, "learning_rate": 1.6951139896223156e-06, "loss": 0.3844, "num_input_tokens_seen": 21643034006, "step": 5551, "train_runtime": 220946.9985, "train_tokens_per_second": 97955.773 }, { "epoch": 0.882670906200318, "grad_norm": 0.19859054684638977, "learning_rate": 1.6905880086920927e-06, "loss": 0.3944, "num_input_tokens_seen": 21646887466, "step": 5552, "train_runtime": 220989.0373, "train_tokens_per_second": 97954.576 }, { "epoch": 0.8828298887122417, "grad_norm": 0.21921657025814056, "learning_rate": 1.6860678666048569e-06, "loss": 0.3896, "num_input_tokens_seen": 21650840630, "step": 5553, "train_runtime": 221030.0977, "train_tokens_per_second": 97954.264 }, { "epoch": 0.8829888712241654, "grad_norm": 0.21429285407066345, "learning_rate": 1.681553564492877e-06, "loss": 0.3918, "num_input_tokens_seen": 21654666305, "step": 5554, "train_runtime": 221068.6735, "train_tokens_per_second": 97954.477 }, { "epoch": 0.8831478537360891, "grad_norm": 0.1896909773349762, "learning_rate": 1.6770451034869516e-06, "loss": 0.3819, "num_input_tokens_seen": 21658691054, "step": 5555, "train_runtime": 221108.8116, "train_tokens_per_second": 97954.898 }, { "epoch": 0.8833068362480128, "grad_norm": 0.259115993976593, "learning_rate": 1.6725424847164218e-06, "loss": 0.3754, "num_input_tokens_seen": 21662458916, "step": 5556, "train_runtime": 221147.0741, "train_tokens_per_second": 97954.988 }, { "epoch": 0.8834658187599365, "grad_norm": 0.8276145458221436, "learning_rate": 1.6680457093091629e-06, "loss": 0.3863, "num_input_tokens_seen": 21666369646, "step": 5557, "train_runtime": 221189.6554, "train_tokens_per_second": 97953.811 }, { "epoch": 0.88362480127186, "grad_norm": 0.3460961878299713, "learning_rate": 1.6635547783915767e-06, "loss": 0.3775, "num_input_tokens_seen": 21670327002, "step": 5558, "train_runtime": 221227.2472, "train_tokens_per_second": 97955.054 }, { "epoch": 0.8837837837837837, "grad_norm": 0.2330908179283142, "learning_rate": 1.6590696930886164e-06, "loss": 0.3997, "num_input_tokens_seen": 21674194443, "step": 5559, "train_runtime": 221266.5286, "train_tokens_per_second": 97955.143 }, { "epoch": 0.8839427662957074, "grad_norm": 0.23984089493751526, "learning_rate": 1.6545904545237606e-06, "loss": 0.3663, "num_input_tokens_seen": 21677949622, "step": 5560, "train_runtime": 221304.6692, "train_tokens_per_second": 97955.229 }, { "epoch": 0.8841017488076311, "grad_norm": 0.21118417382240295, "learning_rate": 1.6501170638190316e-06, "loss": 0.3837, "num_input_tokens_seen": 21681939467, "step": 5561, "train_runtime": 221344.8907, "train_tokens_per_second": 97955.455 }, { "epoch": 0.8842607313195548, "grad_norm": 0.2071118801832199, "learning_rate": 1.6456495220949774e-06, "loss": 0.3881, "num_input_tokens_seen": 21685857803, "step": 5562, "train_runtime": 221382.4167, "train_tokens_per_second": 97956.55 }, { "epoch": 0.8844197138314785, "grad_norm": 0.1876755952835083, "learning_rate": 1.6411878304706917e-06, "loss": 0.3817, "num_input_tokens_seen": 21689708413, "step": 5563, "train_runtime": 221422.0447, "train_tokens_per_second": 97956.409 }, { "epoch": 0.8845786963434022, "grad_norm": 0.1923924684524536, "learning_rate": 1.6367319900637917e-06, "loss": 0.3916, "num_input_tokens_seen": 21693663969, "step": 5564, "train_runtime": 221461.2489, "train_tokens_per_second": 97956.93 }, { "epoch": 0.884737678855326, "grad_norm": 0.5500655174255371, "learning_rate": 1.6322820019904317e-06, "loss": 0.3938, "num_input_tokens_seen": 21697545564, "step": 5565, "train_runtime": 221502.6495, "train_tokens_per_second": 97956.145 }, { "epoch": 0.8848966613672496, "grad_norm": 0.25451913475990295, "learning_rate": 1.627837867365309e-06, "loss": 0.3815, "num_input_tokens_seen": 21701505280, "step": 5566, "train_runtime": 221540.5337, "train_tokens_per_second": 97957.267 }, { "epoch": 0.8850556438791732, "grad_norm": 0.21017970144748688, "learning_rate": 1.623399587301641e-06, "loss": 0.3892, "num_input_tokens_seen": 21705358967, "step": 5567, "train_runtime": 221580.9221, "train_tokens_per_second": 97956.804 }, { "epoch": 0.8852146263910969, "grad_norm": 0.24028341472148895, "learning_rate": 1.6189671629111974e-06, "loss": 0.3713, "num_input_tokens_seen": 21709313547, "step": 5568, "train_runtime": 221619.92, "train_tokens_per_second": 97957.411 }, { "epoch": 0.8853736089030206, "grad_norm": 0.18702545762062073, "learning_rate": 1.6145405953042508e-06, "loss": 0.3764, "num_input_tokens_seen": 21713262514, "step": 5569, "train_runtime": 221659.9818, "train_tokens_per_second": 97957.522 }, { "epoch": 0.8855325914149443, "grad_norm": 0.21748009324073792, "learning_rate": 1.6101198855896422e-06, "loss": 0.383, "num_input_tokens_seen": 21717112852, "step": 5570, "train_runtime": 221699.2968, "train_tokens_per_second": 97957.518 }, { "epoch": 0.885691573926868, "grad_norm": 0.31244394183158875, "learning_rate": 1.605705034874716e-06, "loss": 0.3806, "num_input_tokens_seen": 21720896074, "step": 5571, "train_runtime": 221736.6411, "train_tokens_per_second": 97958.082 }, { "epoch": 0.8858505564387917, "grad_norm": 0.2890303432941437, "learning_rate": 1.6012960442653685e-06, "loss": 0.3863, "num_input_tokens_seen": 21724888710, "step": 5572, "train_runtime": 221774.6651, "train_tokens_per_second": 97959.29 }, { "epoch": 0.8860095389507154, "grad_norm": 0.323542982339859, "learning_rate": 1.5968929148660188e-06, "loss": 0.3912, "num_input_tokens_seen": 21728808924, "step": 5573, "train_runtime": 221813.9808, "train_tokens_per_second": 97959.6 }, { "epoch": 0.8861685214626391, "grad_norm": 0.2490537315607071, "learning_rate": 1.5924956477796182e-06, "loss": 0.3773, "num_input_tokens_seen": 21732734968, "step": 5574, "train_runtime": 221853.5495, "train_tokens_per_second": 97959.825 }, { "epoch": 0.8863275039745628, "grad_norm": 0.23963753879070282, "learning_rate": 1.5881042441076577e-06, "loss": 0.3724, "num_input_tokens_seen": 21736606964, "step": 5575, "train_runtime": 221894.0363, "train_tokens_per_second": 97959.401 }, { "epoch": 0.8864864864864865, "grad_norm": 0.4059653580188751, "learning_rate": 1.5837187049501406e-06, "loss": 0.3802, "num_input_tokens_seen": 21740433145, "step": 5576, "train_runtime": 221932.8372, "train_tokens_per_second": 97959.515 }, { "epoch": 0.8866454689984101, "grad_norm": 0.20744384825229645, "learning_rate": 1.5793390314056217e-06, "loss": 0.3884, "num_input_tokens_seen": 21744392939, "step": 5577, "train_runtime": 221970.8741, "train_tokens_per_second": 97960.568 }, { "epoch": 0.8868044515103338, "grad_norm": 0.21126021444797516, "learning_rate": 1.5749652245711789e-06, "loss": 0.3918, "num_input_tokens_seen": 21748236826, "step": 5578, "train_runtime": 222009.2973, "train_tokens_per_second": 97960.928 }, { "epoch": 0.8869634340222575, "grad_norm": 0.18692047894001007, "learning_rate": 1.5705972855424133e-06, "loss": 0.3999, "num_input_tokens_seen": 21752129396, "step": 5579, "train_runtime": 222047.865, "train_tokens_per_second": 97961.444 }, { "epoch": 0.8871224165341812, "grad_norm": 0.19794796407222748, "learning_rate": 1.5662352154134697e-06, "loss": 0.3861, "num_input_tokens_seen": 21756038735, "step": 5580, "train_runtime": 222085.4598, "train_tokens_per_second": 97962.463 }, { "epoch": 0.8872813990461049, "grad_norm": 0.18911300599575043, "learning_rate": 1.561879015277018e-06, "loss": 0.3809, "num_input_tokens_seen": 21759908073, "step": 5581, "train_runtime": 222124.0683, "train_tokens_per_second": 97962.856 }, { "epoch": 0.8874403815580286, "grad_norm": 0.2455318123102188, "learning_rate": 1.5575286862242467e-06, "loss": 0.3914, "num_input_tokens_seen": 21763786118, "step": 5582, "train_runtime": 222161.858, "train_tokens_per_second": 97963.648 }, { "epoch": 0.8875993640699523, "grad_norm": 0.19070160388946533, "learning_rate": 1.5531842293448867e-06, "loss": 0.3794, "num_input_tokens_seen": 21767683749, "step": 5583, "train_runtime": 222201.6149, "train_tokens_per_second": 97963.661 }, { "epoch": 0.887758346581876, "grad_norm": 0.18872925639152527, "learning_rate": 1.5488456457271983e-06, "loss": 0.383, "num_input_tokens_seen": 21771718928, "step": 5584, "train_runtime": 222240.5556, "train_tokens_per_second": 97964.653 }, { "epoch": 0.8879173290937997, "grad_norm": 0.19048307836055756, "learning_rate": 1.5445129364579508e-06, "loss": 0.383, "num_input_tokens_seen": 21775524750, "step": 5585, "train_runtime": 222278.4511, "train_tokens_per_second": 97965.073 }, { "epoch": 0.8880763116057234, "grad_norm": 0.18222779035568237, "learning_rate": 1.5401861026224789e-06, "loss": 0.3938, "num_input_tokens_seen": 21779431590, "step": 5586, "train_runtime": 222317.2578, "train_tokens_per_second": 97965.546 }, { "epoch": 0.888235294117647, "grad_norm": 0.25493526458740234, "learning_rate": 1.5358651453046048e-06, "loss": 0.3891, "num_input_tokens_seen": 21783342974, "step": 5587, "train_runtime": 222357.3347, "train_tokens_per_second": 97965.48 }, { "epoch": 0.8883942766295707, "grad_norm": 0.1965859979391098, "learning_rate": 1.531550065586712e-06, "loss": 0.397, "num_input_tokens_seen": 21787226734, "step": 5588, "train_runtime": 222394.8164, "train_tokens_per_second": 97966.432 }, { "epoch": 0.8885532591414944, "grad_norm": 0.19026575982570648, "learning_rate": 1.5272408645496894e-06, "loss": 0.3912, "num_input_tokens_seen": 21791038980, "step": 5589, "train_runtime": 222432.4372, "train_tokens_per_second": 97967.002 }, { "epoch": 0.8887122416534181, "grad_norm": 0.19197368621826172, "learning_rate": 1.522937543272962e-06, "loss": 0.39, "num_input_tokens_seen": 21794865868, "step": 5590, "train_runtime": 222470.0556, "train_tokens_per_second": 97967.638 }, { "epoch": 0.8888712241653418, "grad_norm": 0.19901059567928314, "learning_rate": 1.518640102834487e-06, "loss": 0.3807, "num_input_tokens_seen": 21798798397, "step": 5591, "train_runtime": 222510.6254, "train_tokens_per_second": 97967.449 }, { "epoch": 0.8890302066772655, "grad_norm": 0.20080621540546417, "learning_rate": 1.5143485443107315e-06, "loss": 0.3889, "num_input_tokens_seen": 21802755284, "step": 5592, "train_runtime": 222550.6559, "train_tokens_per_second": 97967.607 }, { "epoch": 0.8891891891891892, "grad_norm": 0.5745012760162354, "learning_rate": 1.5100628687767154e-06, "loss": 0.4028, "num_input_tokens_seen": 21806589960, "step": 5593, "train_runtime": 222589.2416, "train_tokens_per_second": 97967.852 }, { "epoch": 0.8893481717011129, "grad_norm": 0.1873610019683838, "learning_rate": 1.5057830773059584e-06, "loss": 0.3749, "num_input_tokens_seen": 21810520945, "step": 5594, "train_runtime": 222629.1426, "train_tokens_per_second": 97967.951 }, { "epoch": 0.8895071542130366, "grad_norm": 0.20564107596874237, "learning_rate": 1.501509170970522e-06, "loss": 0.3923, "num_input_tokens_seen": 21814409543, "step": 5595, "train_runtime": 222668.2738, "train_tokens_per_second": 97968.198 }, { "epoch": 0.8896661367249602, "grad_norm": 0.21553584933280945, "learning_rate": 1.497241150840989e-06, "loss": 0.3874, "num_input_tokens_seen": 21818355595, "step": 5596, "train_runtime": 222703.6078, "train_tokens_per_second": 97970.373 }, { "epoch": 0.8898251192368839, "grad_norm": 0.20868639647960663, "learning_rate": 1.4929790179864706e-06, "loss": 0.3895, "num_input_tokens_seen": 21822178173, "step": 5597, "train_runtime": 222744.7575, "train_tokens_per_second": 97969.436 }, { "epoch": 0.8899841017488076, "grad_norm": 0.18468432128429413, "learning_rate": 1.4887227734746018e-06, "loss": 0.3711, "num_input_tokens_seen": 21826110872, "step": 5598, "train_runtime": 222783.8055, "train_tokens_per_second": 97969.917 }, { "epoch": 0.8901430842607313, "grad_norm": 0.23175543546676636, "learning_rate": 1.4844724183715354e-06, "loss": 0.3902, "num_input_tokens_seen": 21830017731, "step": 5599, "train_runtime": 222824.3659, "train_tokens_per_second": 97969.617 }, { "epoch": 0.890302066772655, "grad_norm": 0.1900079846382141, "learning_rate": 1.4802279537419612e-06, "loss": 0.3909, "num_input_tokens_seen": 21833822317, "step": 5600, "train_runtime": 222861.4639, "train_tokens_per_second": 97970.38 }, { "epoch": 0.8904610492845787, "grad_norm": 0.1802975833415985, "learning_rate": 1.4759893806490843e-06, "loss": 0.3802, "num_input_tokens_seen": 21837682434, "step": 5601, "train_runtime": 223020.1535, "train_tokens_per_second": 97917.978 }, { "epoch": 0.8906200317965024, "grad_norm": 0.46171751618385315, "learning_rate": 1.4717567001546389e-06, "loss": 0.3897, "num_input_tokens_seen": 21841640400, "step": 5602, "train_runtime": 223059.0876, "train_tokens_per_second": 97918.631 }, { "epoch": 0.8907790143084261, "grad_norm": 0.1924252212047577, "learning_rate": 1.4675299133188796e-06, "loss": 0.3966, "num_input_tokens_seen": 21845499348, "step": 5603, "train_runtime": 223099.4884, "train_tokens_per_second": 97918.196 }, { "epoch": 0.8909379968203498, "grad_norm": 0.627646803855896, "learning_rate": 1.463309021200593e-06, "loss": 0.3862, "num_input_tokens_seen": 21849330437, "step": 5604, "train_runtime": 223140.9186, "train_tokens_per_second": 97917.184 }, { "epoch": 0.8910969793322735, "grad_norm": 0.16764585673809052, "learning_rate": 1.4590940248570744e-06, "loss": 0.3803, "num_input_tokens_seen": 21853307624, "step": 5605, "train_runtime": 223179.345, "train_tokens_per_second": 97918.146 }, { "epoch": 0.8912559618441971, "grad_norm": 0.20416581630706787, "learning_rate": 1.4548849253441549e-06, "loss": 0.3823, "num_input_tokens_seen": 21857287070, "step": 5606, "train_runtime": 223212.8052, "train_tokens_per_second": 97921.296 }, { "epoch": 0.8914149443561208, "grad_norm": 0.25637340545654297, "learning_rate": 1.450681723716188e-06, "loss": 0.389, "num_input_tokens_seen": 21861027272, "step": 5607, "train_runtime": 223250.3331, "train_tokens_per_second": 97921.589 }, { "epoch": 0.8915739268680445, "grad_norm": 0.17146794497966766, "learning_rate": 1.4464844210260314e-06, "loss": 0.3803, "num_input_tokens_seen": 21864903697, "step": 5608, "train_runtime": 223288.1508, "train_tokens_per_second": 97922.365 }, { "epoch": 0.8917329093799682, "grad_norm": 0.29445919394493103, "learning_rate": 1.4422930183251e-06, "loss": 0.3821, "num_input_tokens_seen": 21868826821, "step": 5609, "train_runtime": 223327.6207, "train_tokens_per_second": 97922.625 }, { "epoch": 0.8918918918918919, "grad_norm": 0.4621414244174957, "learning_rate": 1.4381075166632897e-06, "loss": 0.3784, "num_input_tokens_seen": 21872699560, "step": 5610, "train_runtime": 223362.903, "train_tokens_per_second": 97924.495 }, { "epoch": 0.8920508744038156, "grad_norm": 0.22564563155174255, "learning_rate": 1.433927917089059e-06, "loss": 0.3802, "num_input_tokens_seen": 21876542570, "step": 5611, "train_runtime": 223403.6214, "train_tokens_per_second": 97923.849 }, { "epoch": 0.8922098569157393, "grad_norm": 0.2506016194820404, "learning_rate": 1.4297542206493569e-06, "loss": 0.3856, "num_input_tokens_seen": 21880506310, "step": 5612, "train_runtime": 223440.6618, "train_tokens_per_second": 97925.356 }, { "epoch": 0.892368839427663, "grad_norm": 0.18465441465377808, "learning_rate": 1.4255864283896663e-06, "loss": 0.3757, "num_input_tokens_seen": 21884284196, "step": 5613, "train_runtime": 223479.4761, "train_tokens_per_second": 97925.253 }, { "epoch": 0.8925278219395867, "grad_norm": 0.17549210786819458, "learning_rate": 1.4214245413539912e-06, "loss": 0.3822, "num_input_tokens_seen": 21888227203, "step": 5614, "train_runtime": 223517.4535, "train_tokens_per_second": 97926.255 }, { "epoch": 0.8926868044515104, "grad_norm": 0.23921872675418854, "learning_rate": 1.41726856058485e-06, "loss": 0.3919, "num_input_tokens_seen": 21892035054, "step": 5615, "train_runtime": 223557.915, "train_tokens_per_second": 97925.565 }, { "epoch": 0.892845786963434, "grad_norm": 0.235887348651886, "learning_rate": 1.4131184871232966e-06, "loss": 0.3636, "num_input_tokens_seen": 21896077645, "step": 5616, "train_runtime": 223598.5324, "train_tokens_per_second": 97925.856 }, { "epoch": 0.8930047694753577, "grad_norm": 0.2093767523765564, "learning_rate": 1.4089743220088852e-06, "loss": 0.3878, "num_input_tokens_seen": 21900029259, "step": 5617, "train_runtime": 223637.4809, "train_tokens_per_second": 97926.471 }, { "epoch": 0.8931637519872814, "grad_norm": 0.2773835361003876, "learning_rate": 1.4048360662797022e-06, "loss": 0.391, "num_input_tokens_seen": 21903841561, "step": 5618, "train_runtime": 223678.1108, "train_tokens_per_second": 97925.727 }, { "epoch": 0.8933227344992051, "grad_norm": 0.320902943611145, "learning_rate": 1.4007037209723544e-06, "loss": 0.3759, "num_input_tokens_seen": 21907770652, "step": 5619, "train_runtime": 223716.7349, "train_tokens_per_second": 97926.383 }, { "epoch": 0.8934817170111288, "grad_norm": 0.180466428399086, "learning_rate": 1.396577287121964e-06, "loss": 0.3738, "num_input_tokens_seen": 21911686482, "step": 5620, "train_runtime": 223757.6888, "train_tokens_per_second": 97925.96 }, { "epoch": 0.8936406995230525, "grad_norm": 0.25088027119636536, "learning_rate": 1.3924567657621761e-06, "loss": 0.3866, "num_input_tokens_seen": 21915577788, "step": 5621, "train_runtime": 223797.3297, "train_tokens_per_second": 97926.002 }, { "epoch": 0.8937996820349762, "grad_norm": 0.22670577466487885, "learning_rate": 1.388342157925146e-06, "loss": 0.387, "num_input_tokens_seen": 21919370275, "step": 5622, "train_runtime": 223837.7223, "train_tokens_per_second": 97925.274 }, { "epoch": 0.8939586645468999, "grad_norm": 0.18824873864650726, "learning_rate": 1.3842334646415572e-06, "loss": 0.3818, "num_input_tokens_seen": 21923408479, "step": 5623, "train_runtime": 223876.2214, "train_tokens_per_second": 97926.472 }, { "epoch": 0.8941176470588236, "grad_norm": 0.20684979856014252, "learning_rate": 1.380130686940606e-06, "loss": 0.3703, "num_input_tokens_seen": 21927344004, "step": 5624, "train_runtime": 223914.6505, "train_tokens_per_second": 97927.241 }, { "epoch": 0.8942766295707473, "grad_norm": 0.22053536772727966, "learning_rate": 1.3760338258500177e-06, "loss": 0.3782, "num_input_tokens_seen": 21931249953, "step": 5625, "train_runtime": 223952.9014, "train_tokens_per_second": 97927.956 }, { "epoch": 0.8944356120826709, "grad_norm": 0.20966960489749908, "learning_rate": 1.371942882396013e-06, "loss": 0.3714, "num_input_tokens_seen": 21935150985, "step": 5626, "train_runtime": 223992.8755, "train_tokens_per_second": 97927.896 }, { "epoch": 0.8945945945945946, "grad_norm": 0.23821978271007538, "learning_rate": 1.3678578576033578e-06, "loss": 0.3828, "num_input_tokens_seen": 21939042991, "step": 5627, "train_runtime": 224033.2106, "train_tokens_per_second": 97927.637 }, { "epoch": 0.8947535771065183, "grad_norm": 0.34680598974227905, "learning_rate": 1.3637787524953122e-06, "loss": 0.38, "num_input_tokens_seen": 21942936465, "step": 5628, "train_runtime": 224073.348, "train_tokens_per_second": 97927.472 }, { "epoch": 0.894912559618442, "grad_norm": 0.18280963599681854, "learning_rate": 1.3597055680936693e-06, "loss": 0.3861, "num_input_tokens_seen": 21946789185, "step": 5629, "train_runtime": 224112.7777, "train_tokens_per_second": 97927.434 }, { "epoch": 0.8950715421303657, "grad_norm": 0.20794370770454407, "learning_rate": 1.3556383054187328e-06, "loss": 0.3908, "num_input_tokens_seen": 21950586673, "step": 5630, "train_runtime": 224150.5027, "train_tokens_per_second": 97927.894 }, { "epoch": 0.8952305246422894, "grad_norm": 0.19493845105171204, "learning_rate": 1.3515769654893151e-06, "loss": 0.3847, "num_input_tokens_seen": 21954507654, "step": 5631, "train_runtime": 224190.4864, "train_tokens_per_second": 97927.918 }, { "epoch": 0.8953895071542131, "grad_norm": 0.9820147752761841, "learning_rate": 1.3475215493227667e-06, "loss": 0.3886, "num_input_tokens_seen": 21958319813, "step": 5632, "train_runtime": 224230.9097, "train_tokens_per_second": 97927.265 }, { "epoch": 0.8955484896661368, "grad_norm": 0.1947670429944992, "learning_rate": 1.3434720579349275e-06, "loss": 0.3856, "num_input_tokens_seen": 21962380413, "step": 5633, "train_runtime": 224271.4166, "train_tokens_per_second": 97927.684 }, { "epoch": 0.8957074721780605, "grad_norm": 0.2615770697593689, "learning_rate": 1.3394284923401778e-06, "loss": 0.3923, "num_input_tokens_seen": 21966324897, "step": 5634, "train_runtime": 224309.8936, "train_tokens_per_second": 97928.471 }, { "epoch": 0.895866454689984, "grad_norm": 0.20084349811077118, "learning_rate": 1.3353908535513938e-06, "loss": 0.3735, "num_input_tokens_seen": 21970181527, "step": 5635, "train_runtime": 224348.5815, "train_tokens_per_second": 97928.774 }, { "epoch": 0.8960254372019077, "grad_norm": 0.20692411065101624, "learning_rate": 1.3313591425799799e-06, "loss": 0.3766, "num_input_tokens_seen": 21974195257, "step": 5636, "train_runtime": 224386.7715, "train_tokens_per_second": 97929.994 }, { "epoch": 0.8961844197138314, "grad_norm": 0.1838957816362381, "learning_rate": 1.3273333604358506e-06, "loss": 0.3812, "num_input_tokens_seen": 21977932818, "step": 5637, "train_runtime": 224426.6145, "train_tokens_per_second": 97929.262 }, { "epoch": 0.8963434022257551, "grad_norm": 0.250150203704834, "learning_rate": 1.32331350812743e-06, "loss": 0.3868, "num_input_tokens_seen": 21981875799, "step": 5638, "train_runtime": 224460.7019, "train_tokens_per_second": 97931.957 }, { "epoch": 0.8965023847376788, "grad_norm": 0.19539016485214233, "learning_rate": 1.3192995866616735e-06, "loss": 0.4065, "num_input_tokens_seen": 21985736429, "step": 5639, "train_runtime": 224501.5866, "train_tokens_per_second": 97931.319 }, { "epoch": 0.8966613672496025, "grad_norm": 0.19413241744041443, "learning_rate": 1.3152915970440326e-06, "loss": 0.3943, "num_input_tokens_seen": 21989585061, "step": 5640, "train_runtime": 224538.5867, "train_tokens_per_second": 97932.322 }, { "epoch": 0.8968203497615262, "grad_norm": 0.22936929762363434, "learning_rate": 1.311289540278482e-06, "loss": 0.3805, "num_input_tokens_seen": 21993393476, "step": 5641, "train_runtime": 224577.772, "train_tokens_per_second": 97932.192 }, { "epoch": 0.89697933227345, "grad_norm": 0.1881009042263031, "learning_rate": 1.3072934173675082e-06, "loss": 0.3931, "num_input_tokens_seen": 21997432638, "step": 5642, "train_runtime": 224618.0083, "train_tokens_per_second": 97932.632 }, { "epoch": 0.8971383147853736, "grad_norm": 0.22639326751232147, "learning_rate": 1.303303229312114e-06, "loss": 0.3786, "num_input_tokens_seen": 22001386100, "step": 5643, "train_runtime": 224655.5365, "train_tokens_per_second": 97933.87 }, { "epoch": 0.8972972972972973, "grad_norm": 0.20365554094314575, "learning_rate": 1.2993189771118104e-06, "loss": 0.3827, "num_input_tokens_seen": 22005233068, "step": 5644, "train_runtime": 224693.9342, "train_tokens_per_second": 97934.255 }, { "epoch": 0.8974562798092209, "grad_norm": 0.19201348721981049, "learning_rate": 1.2953406617646242e-06, "loss": 0.3864, "num_input_tokens_seen": 22009004309, "step": 5645, "train_runtime": 224732.1818, "train_tokens_per_second": 97934.369 }, { "epoch": 0.8976152623211446, "grad_norm": 0.24519479274749756, "learning_rate": 1.2913682842671e-06, "loss": 0.3881, "num_input_tokens_seen": 22012951906, "step": 5646, "train_runtime": 224773.0304, "train_tokens_per_second": 97934.133 }, { "epoch": 0.8977742448330683, "grad_norm": 0.2940067946910858, "learning_rate": 1.2874018456142833e-06, "loss": 0.3849, "num_input_tokens_seen": 22016851587, "step": 5647, "train_runtime": 224811.9634, "train_tokens_per_second": 97934.519 }, { "epoch": 0.897933227344992, "grad_norm": 0.21997158229351044, "learning_rate": 1.2834413467997464e-06, "loss": 0.3842, "num_input_tokens_seen": 22020753312, "step": 5648, "train_runtime": 224849.9827, "train_tokens_per_second": 97935.312 }, { "epoch": 0.8980922098569157, "grad_norm": 0.20034117996692657, "learning_rate": 1.2794867888155565e-06, "loss": 0.3816, "num_input_tokens_seen": 22024638257, "step": 5649, "train_runtime": 224888.4226, "train_tokens_per_second": 97935.847 }, { "epoch": 0.8982511923688394, "grad_norm": 0.24187533557415009, "learning_rate": 1.2755381726523158e-06, "loss": 0.3774, "num_input_tokens_seen": 22028502948, "step": 5650, "train_runtime": 224926.2916, "train_tokens_per_second": 97936.541 }, { "epoch": 0.8984101748807631, "grad_norm": 0.2667488157749176, "learning_rate": 1.2715954992991165e-06, "loss": 0.3846, "num_input_tokens_seen": 22032510619, "step": 5651, "train_runtime": 224962.209, "train_tokens_per_second": 97938.719 }, { "epoch": 0.8985691573926868, "grad_norm": 0.17346811294555664, "learning_rate": 1.2676587697435683e-06, "loss": 0.3765, "num_input_tokens_seen": 22036379958, "step": 5652, "train_runtime": 225001.5378, "train_tokens_per_second": 97938.797 }, { "epoch": 0.8987281399046105, "grad_norm": 0.18337281048297882, "learning_rate": 1.2637279849717991e-06, "loss": 0.3924, "num_input_tokens_seen": 22040330433, "step": 5653, "train_runtime": 225041.1813, "train_tokens_per_second": 97939.099 }, { "epoch": 0.8988871224165342, "grad_norm": 0.32861894369125366, "learning_rate": 1.2598031459684357e-06, "loss": 0.3822, "num_input_tokens_seen": 22044183897, "step": 5654, "train_runtime": 225079.8166, "train_tokens_per_second": 97939.408 }, { "epoch": 0.8990461049284578, "grad_norm": 0.25468912720680237, "learning_rate": 1.2558842537166355e-06, "loss": 0.365, "num_input_tokens_seen": 22048040840, "step": 5655, "train_runtime": 225120.0329, "train_tokens_per_second": 97939.044 }, { "epoch": 0.8992050874403815, "grad_norm": 0.21239739656448364, "learning_rate": 1.2519713091980385e-06, "loss": 0.3803, "num_input_tokens_seen": 22052052824, "step": 5656, "train_runtime": 225159.321, "train_tokens_per_second": 97939.773 }, { "epoch": 0.8993640699523052, "grad_norm": 0.2574118375778198, "learning_rate": 1.2480643133928217e-06, "loss": 0.3834, "num_input_tokens_seen": 22055934647, "step": 5657, "train_runtime": 225198.2574, "train_tokens_per_second": 97940.077 }, { "epoch": 0.8995230524642289, "grad_norm": 0.1818733513355255, "learning_rate": 1.244163267279652e-06, "loss": 0.3868, "num_input_tokens_seen": 22059869119, "step": 5658, "train_runtime": 225238.8057, "train_tokens_per_second": 97939.913 }, { "epoch": 0.8996820349761526, "grad_norm": 0.18632066249847412, "learning_rate": 1.240268171835718e-06, "loss": 0.3833, "num_input_tokens_seen": 22063660994, "step": 5659, "train_runtime": 225278.8539, "train_tokens_per_second": 97939.334 }, { "epoch": 0.8998410174880763, "grad_norm": 0.24765993654727936, "learning_rate": 1.236379028036716e-06, "loss": 0.3733, "num_input_tokens_seen": 22067672626, "step": 5660, "train_runtime": 225319.2608, "train_tokens_per_second": 97939.575 }, { "epoch": 0.9, "grad_norm": 0.7669104337692261, "learning_rate": 1.2324958368568367e-06, "loss": 0.3776, "num_input_tokens_seen": 22071497330, "step": 5661, "train_runtime": 225357.4672, "train_tokens_per_second": 97939.942 }, { "epoch": 0.9001589825119237, "grad_norm": 0.1813311129808426, "learning_rate": 1.2286185992688076e-06, "loss": 0.3666, "num_input_tokens_seen": 22075444721, "step": 5662, "train_runtime": 225395.8985, "train_tokens_per_second": 97940.756 }, { "epoch": 0.9003179650238474, "grad_norm": 0.2374476194381714, "learning_rate": 1.2247473162438405e-06, "loss": 0.379, "num_input_tokens_seen": 22079155531, "step": 5663, "train_runtime": 225436.4008, "train_tokens_per_second": 97939.62 }, { "epoch": 0.900476947535771, "grad_norm": 0.2545534074306488, "learning_rate": 1.2208819887516654e-06, "loss": 0.377, "num_input_tokens_seen": 22083171885, "step": 5664, "train_runtime": 225475.4532, "train_tokens_per_second": 97940.47 }, { "epoch": 0.9006359300476947, "grad_norm": 0.20117279887199402, "learning_rate": 1.2170226177605216e-06, "loss": 0.3832, "num_input_tokens_seen": 22087118725, "step": 5665, "train_runtime": 225513.1568, "train_tokens_per_second": 97941.597 }, { "epoch": 0.9007949125596184, "grad_norm": 0.2100125402212143, "learning_rate": 1.213169204237155e-06, "loss": 0.3719, "num_input_tokens_seen": 22091000891, "step": 5666, "train_runtime": 225552.049, "train_tokens_per_second": 97941.921 }, { "epoch": 0.9009538950715421, "grad_norm": 0.19982856512069702, "learning_rate": 1.2093217491468135e-06, "loss": 0.3777, "num_input_tokens_seen": 22094885763, "step": 5667, "train_runtime": 225590.2331, "train_tokens_per_second": 97942.564 }, { "epoch": 0.9011128775834658, "grad_norm": 0.35123950242996216, "learning_rate": 1.2054802534532617e-06, "loss": 0.3903, "num_input_tokens_seen": 22098834266, "step": 5668, "train_runtime": 225629.2019, "train_tokens_per_second": 97943.148 }, { "epoch": 0.9012718600953895, "grad_norm": 0.24423280358314514, "learning_rate": 1.2016447181187662e-06, "loss": 0.3848, "num_input_tokens_seen": 22102678750, "step": 5669, "train_runtime": 225666.6797, "train_tokens_per_second": 97943.918 }, { "epoch": 0.9014308426073132, "grad_norm": 0.20224016904830933, "learning_rate": 1.1978151441041002e-06, "loss": 0.383, "num_input_tokens_seen": 22106598787, "step": 5670, "train_runtime": 225707.1641, "train_tokens_per_second": 97943.718 }, { "epoch": 0.9015898251192369, "grad_norm": 0.1885005086660385, "learning_rate": 1.193991532368549e-06, "loss": 0.3835, "num_input_tokens_seen": 22110450005, "step": 5671, "train_runtime": 225747.277, "train_tokens_per_second": 97943.374 }, { "epoch": 0.9017488076311606, "grad_norm": 0.17918261885643005, "learning_rate": 1.190173883869891e-06, "loss": 0.3856, "num_input_tokens_seen": 22114424421, "step": 5672, "train_runtime": 225787.4617, "train_tokens_per_second": 97943.545 }, { "epoch": 0.9019077901430843, "grad_norm": 0.1900479644536972, "learning_rate": 1.1863621995644337e-06, "loss": 0.378, "num_input_tokens_seen": 22118370353, "step": 5673, "train_runtime": 225827.3082, "train_tokens_per_second": 97943.736 }, { "epoch": 0.9020667726550079, "grad_norm": 0.19363345205783844, "learning_rate": 1.1825564804069655e-06, "loss": 0.3861, "num_input_tokens_seen": 22122276143, "step": 5674, "train_runtime": 225866.44, "train_tokens_per_second": 97944.06 }, { "epoch": 0.9022257551669316, "grad_norm": 0.2371155470609665, "learning_rate": 1.1787567273507965e-06, "loss": 0.3853, "num_input_tokens_seen": 22126075683, "step": 5675, "train_runtime": 225904.5573, "train_tokens_per_second": 97944.353 }, { "epoch": 0.9023847376788553, "grad_norm": 0.2007426917552948, "learning_rate": 1.1749629413477376e-06, "loss": 0.3892, "num_input_tokens_seen": 22129938575, "step": 5676, "train_runtime": 225943.8632, "train_tokens_per_second": 97944.411 }, { "epoch": 0.902543720190779, "grad_norm": 0.21322059631347656, "learning_rate": 1.1711751233481062e-06, "loss": 0.3853, "num_input_tokens_seen": 22133750836, "step": 5677, "train_runtime": 225983.6366, "train_tokens_per_second": 97944.042 }, { "epoch": 0.9027027027027027, "grad_norm": 0.24145649373531342, "learning_rate": 1.1673932743007293e-06, "loss": 0.3822, "num_input_tokens_seen": 22137708595, "step": 5678, "train_runtime": 226021.979, "train_tokens_per_second": 97944.937 }, { "epoch": 0.9028616852146264, "grad_norm": 0.20007646083831787, "learning_rate": 1.163617395152919e-06, "loss": 0.3756, "num_input_tokens_seen": 22141574871, "step": 5679, "train_runtime": 226058.8032, "train_tokens_per_second": 97946.086 }, { "epoch": 0.9030206677265501, "grad_norm": 0.25362837314605713, "learning_rate": 1.159847486850521e-06, "loss": 0.3783, "num_input_tokens_seen": 22145367814, "step": 5680, "train_runtime": 226098.8866, "train_tokens_per_second": 97945.497 }, { "epoch": 0.9031796502384738, "grad_norm": 0.26657113432884216, "learning_rate": 1.1560835503378608e-06, "loss": 0.3766, "num_input_tokens_seen": 22149248963, "step": 5681, "train_runtime": 226137.7513, "train_tokens_per_second": 97945.827 }, { "epoch": 0.9033386327503975, "grad_norm": 0.1856769174337387, "learning_rate": 1.1523255865577841e-06, "loss": 0.3948, "num_input_tokens_seen": 22153220492, "step": 5682, "train_runtime": 226177.5603, "train_tokens_per_second": 97946.147 }, { "epoch": 0.9034976152623212, "grad_norm": 0.2355460822582245, "learning_rate": 1.1485735964516298e-06, "loss": 0.3798, "num_input_tokens_seen": 22157099665, "step": 5683, "train_runtime": 226218.0686, "train_tokens_per_second": 97945.756 }, { "epoch": 0.9036565977742448, "grad_norm": 0.4671041667461395, "learning_rate": 1.1448275809592512e-06, "loss": 0.3841, "num_input_tokens_seen": 22160991743, "step": 5684, "train_runtime": 226254.6445, "train_tokens_per_second": 97947.124 }, { "epoch": 0.9038155802861685, "grad_norm": 0.2791192829608917, "learning_rate": 1.1410875410189898e-06, "loss": 0.3784, "num_input_tokens_seen": 22164880426, "step": 5685, "train_runtime": 226294.073, "train_tokens_per_second": 97947.242 }, { "epoch": 0.9039745627980922, "grad_norm": 0.21367330849170685, "learning_rate": 1.137353477567707e-06, "loss": 0.3807, "num_input_tokens_seen": 22168672872, "step": 5686, "train_runtime": 226332.7454, "train_tokens_per_second": 97947.263 }, { "epoch": 0.9041335453100159, "grad_norm": 0.3941410779953003, "learning_rate": 1.1336253915407547e-06, "loss": 0.3718, "num_input_tokens_seen": 22172548630, "step": 5687, "train_runtime": 226370.9622, "train_tokens_per_second": 97947.848 }, { "epoch": 0.9042925278219396, "grad_norm": 0.3335579037666321, "learning_rate": 1.1299032838719913e-06, "loss": 0.3829, "num_input_tokens_seen": 22176416170, "step": 5688, "train_runtime": 226411.5679, "train_tokens_per_second": 97947.364 }, { "epoch": 0.9044515103338633, "grad_norm": 0.1951245218515396, "learning_rate": 1.1261871554937853e-06, "loss": 0.3741, "num_input_tokens_seen": 22180174374, "step": 5689, "train_runtime": 226454.2444, "train_tokens_per_second": 97945.501 }, { "epoch": 0.904610492845787, "grad_norm": 0.24584956467151642, "learning_rate": 1.1224770073369916e-06, "loss": 0.3874, "num_input_tokens_seen": 22184143903, "step": 5690, "train_runtime": 226493.6031, "train_tokens_per_second": 97946.006 }, { "epoch": 0.9047694753577107, "grad_norm": 0.20361454784870148, "learning_rate": 1.1187728403309865e-06, "loss": 0.3787, "num_input_tokens_seen": 22188041383, "step": 5691, "train_runtime": 226532.6036, "train_tokens_per_second": 97946.349 }, { "epoch": 0.9049284578696344, "grad_norm": 0.24729499220848083, "learning_rate": 1.115074655403628e-06, "loss": 0.3762, "num_input_tokens_seen": 22191886343, "step": 5692, "train_runtime": 226570.2857, "train_tokens_per_second": 97947.029 }, { "epoch": 0.9050874403815581, "grad_norm": 0.21342279016971588, "learning_rate": 1.1113824534812884e-06, "loss": 0.3788, "num_input_tokens_seen": 22195847081, "step": 5693, "train_runtime": 226610.6616, "train_tokens_per_second": 97947.056 }, { "epoch": 0.9052464228934817, "grad_norm": 0.21988330781459808, "learning_rate": 1.107696235488842e-06, "loss": 0.3808, "num_input_tokens_seen": 22199761064, "step": 5694, "train_runtime": 226651.8029, "train_tokens_per_second": 97946.545 }, { "epoch": 0.9054054054054054, "grad_norm": 0.2628428041934967, "learning_rate": 1.1040160023496505e-06, "loss": 0.3702, "num_input_tokens_seen": 22203679204, "step": 5695, "train_runtime": 226693.7277, "train_tokens_per_second": 97945.715 }, { "epoch": 0.9055643879173291, "grad_norm": 0.22399315237998962, "learning_rate": 1.100341754985601e-06, "loss": 0.3781, "num_input_tokens_seen": 22207628181, "step": 5696, "train_runtime": 226734.0113, "train_tokens_per_second": 97945.73 }, { "epoch": 0.9057233704292528, "grad_norm": 0.21365278959274292, "learning_rate": 1.0966734943170493e-06, "loss": 0.3921, "num_input_tokens_seen": 22211564902, "step": 5697, "train_runtime": 226773.6401, "train_tokens_per_second": 97945.973 }, { "epoch": 0.9058823529411765, "grad_norm": 0.213460311293602, "learning_rate": 1.093011221262888e-06, "loss": 0.3888, "num_input_tokens_seen": 22215456876, "step": 5698, "train_runtime": 226813.374, "train_tokens_per_second": 97945.974 }, { "epoch": 0.9060413354531002, "grad_norm": 0.1823984682559967, "learning_rate": 1.0893549367404748e-06, "loss": 0.3858, "num_input_tokens_seen": 22219305279, "step": 5699, "train_runtime": 226854.3119, "train_tokens_per_second": 97945.263 }, { "epoch": 0.9062003179650239, "grad_norm": 0.30619966983795166, "learning_rate": 1.0857046416656885e-06, "loss": 0.3869, "num_input_tokens_seen": 22223203983, "step": 5700, "train_runtime": 226892.5731, "train_tokens_per_second": 97945.93 }, { "epoch": 0.9063593004769476, "grad_norm": 0.20158164203166962, "learning_rate": 1.0820603369529086e-06, "loss": 0.3866, "num_input_tokens_seen": 22227016987, "step": 5701, "train_runtime": 226932.5312, "train_tokens_per_second": 97945.486 }, { "epoch": 0.9065182829888713, "grad_norm": 0.5342326760292053, "learning_rate": 1.0784220235149967e-06, "loss": 0.3787, "num_input_tokens_seen": 22230981400, "step": 5702, "train_runtime": 226970.3959, "train_tokens_per_second": 97946.612 }, { "epoch": 0.9066772655007949, "grad_norm": 0.21514447033405304, "learning_rate": 1.0747897022633292e-06, "loss": 0.3737, "num_input_tokens_seen": 22234893803, "step": 5703, "train_runtime": 227009.8711, "train_tokens_per_second": 97946.815 }, { "epoch": 0.9068362480127186, "grad_norm": 0.22990484535694122, "learning_rate": 1.071163374107778e-06, "loss": 0.385, "num_input_tokens_seen": 22238777579, "step": 5704, "train_runtime": 227051.092, "train_tokens_per_second": 97946.138 }, { "epoch": 0.9069952305246423, "grad_norm": 0.2483682632446289, "learning_rate": 1.0675430399567115e-06, "loss": 0.3969, "num_input_tokens_seen": 22242733502, "step": 5705, "train_runtime": 227091.8213, "train_tokens_per_second": 97945.991 }, { "epoch": 0.907154213036566, "grad_norm": 0.21491976082324982, "learning_rate": 1.0639287007169984e-06, "loss": 0.3801, "num_input_tokens_seen": 22246599260, "step": 5706, "train_runtime": 227130.7855, "train_tokens_per_second": 97946.208 }, { "epoch": 0.9073131955484897, "grad_norm": 0.18237879872322083, "learning_rate": 1.060320357294009e-06, "loss": 0.3912, "num_input_tokens_seen": 22250415305, "step": 5707, "train_runtime": 227168.6295, "train_tokens_per_second": 97946.69 }, { "epoch": 0.9074721780604134, "grad_norm": 0.2449824959039688, "learning_rate": 1.0567180105916007e-06, "loss": 0.3889, "num_input_tokens_seen": 22254242871, "step": 5708, "train_runtime": 227206.7057, "train_tokens_per_second": 97947.122 }, { "epoch": 0.9076311605723371, "grad_norm": 0.19917179644107819, "learning_rate": 1.0531216615121382e-06, "loss": 0.3816, "num_input_tokens_seen": 22258170779, "step": 5709, "train_runtime": 227247.0932, "train_tokens_per_second": 97946.999 }, { "epoch": 0.9077901430842608, "grad_norm": 0.19682201743125916, "learning_rate": 1.0495313109564835e-06, "loss": 0.3727, "num_input_tokens_seen": 22262167978, "step": 5710, "train_runtime": 227287.2985, "train_tokens_per_second": 97947.259 }, { "epoch": 0.9079491255961845, "grad_norm": 0.1762292981147766, "learning_rate": 1.045946959823993e-06, "loss": 0.3877, "num_input_tokens_seen": 22266088003, "step": 5711, "train_runtime": 227325.6797, "train_tokens_per_second": 97947.966 }, { "epoch": 0.9081081081081082, "grad_norm": 0.21393129229545593, "learning_rate": 1.0423686090125229e-06, "loss": 0.3888, "num_input_tokens_seen": 22269962449, "step": 5712, "train_runtime": 227366.147, "train_tokens_per_second": 97947.574 }, { "epoch": 0.9082670906200317, "grad_norm": 0.19869299232959747, "learning_rate": 1.0387962594184175e-06, "loss": 0.3918, "num_input_tokens_seen": 22273858705, "step": 5713, "train_runtime": 227408.0775, "train_tokens_per_second": 97946.647 }, { "epoch": 0.9084260731319554, "grad_norm": 0.1986849308013916, "learning_rate": 1.035229911936536e-06, "loss": 0.3921, "num_input_tokens_seen": 22277744102, "step": 5714, "train_runtime": 227444.8326, "train_tokens_per_second": 97947.902 }, { "epoch": 0.9085850556438791, "grad_norm": 0.2713124454021454, "learning_rate": 1.0316695674602168e-06, "loss": 0.3761, "num_input_tokens_seen": 22281629194, "step": 5715, "train_runtime": 227486.5095, "train_tokens_per_second": 97947.035 }, { "epoch": 0.9087440381558028, "grad_norm": 0.18962809443473816, "learning_rate": 1.0281152268813015e-06, "loss": 0.3918, "num_input_tokens_seen": 22285546432, "step": 5716, "train_runtime": 227524.7299, "train_tokens_per_second": 97947.799 }, { "epoch": 0.9089030206677265, "grad_norm": 0.19756293296813965, "learning_rate": 1.0245668910901313e-06, "loss": 0.3879, "num_input_tokens_seen": 22289464913, "step": 5717, "train_runtime": 227561.6497, "train_tokens_per_second": 97949.127 }, { "epoch": 0.9090620031796502, "grad_norm": 0.21082091331481934, "learning_rate": 1.0210245609755308e-06, "loss": 0.3759, "num_input_tokens_seen": 22293201934, "step": 5718, "train_runtime": 227600.0158, "train_tokens_per_second": 97949.035 }, { "epoch": 0.909220985691574, "grad_norm": 0.20629628002643585, "learning_rate": 1.0174882374248378e-06, "loss": 0.3872, "num_input_tokens_seen": 22297225423, "step": 5719, "train_runtime": 227638.5264, "train_tokens_per_second": 97950.14 }, { "epoch": 0.9093799682034976, "grad_norm": 0.20714934170246124, "learning_rate": 1.0139579213238714e-06, "loss": 0.3864, "num_input_tokens_seen": 22301015407, "step": 5720, "train_runtime": 227676.9961, "train_tokens_per_second": 97950.236 }, { "epoch": 0.9095389507154213, "grad_norm": 0.2829619348049164, "learning_rate": 1.010433613556952e-06, "loss": 0.384, "num_input_tokens_seen": 22305074985, "step": 5721, "train_runtime": 227716.9312, "train_tokens_per_second": 97950.885 }, { "epoch": 0.909697933227345, "grad_norm": 0.25336453318595886, "learning_rate": 1.0069153150068927e-06, "loss": 0.3719, "num_input_tokens_seen": 22308927537, "step": 5722, "train_runtime": 227757.5932, "train_tokens_per_second": 97950.313 }, { "epoch": 0.9098569157392686, "grad_norm": 0.34159547090530396, "learning_rate": 1.0034030265550026e-06, "loss": 0.3862, "num_input_tokens_seen": 22312796091, "step": 5723, "train_runtime": 227797.6762, "train_tokens_per_second": 97950.06 }, { "epoch": 0.9100158982511923, "grad_norm": 0.21919405460357666, "learning_rate": 9.998967490810917e-07, "loss": 0.3867, "num_input_tokens_seen": 22316742939, "step": 5724, "train_runtime": 227835.4273, "train_tokens_per_second": 97951.154 }, { "epoch": 0.910174880763116, "grad_norm": 0.32206204533576965, "learning_rate": 9.963964834634464e-07, "loss": 0.3732, "num_input_tokens_seen": 22320636199, "step": 5725, "train_runtime": 227874.5305, "train_tokens_per_second": 97951.43 }, { "epoch": 0.9103338632750397, "grad_norm": 0.22037620842456818, "learning_rate": 9.929022305788654e-07, "loss": 0.3864, "num_input_tokens_seen": 22324426139, "step": 5726, "train_runtime": 227914.9956, "train_tokens_per_second": 97950.668 }, { "epoch": 0.9104928457869634, "grad_norm": 0.18354260921478271, "learning_rate": 9.894139913026318e-07, "loss": 0.3874, "num_input_tokens_seen": 22328279041, "step": 5727, "train_runtime": 227954.123, "train_tokens_per_second": 97950.758 }, { "epoch": 0.9106518282988871, "grad_norm": 0.23605135083198547, "learning_rate": 9.859317665085272e-07, "loss": 0.3871, "num_input_tokens_seen": 22332197906, "step": 5728, "train_runtime": 227993.7213, "train_tokens_per_second": 97950.934 }, { "epoch": 0.9108108108108108, "grad_norm": 0.23474381864070892, "learning_rate": 9.824555570688233e-07, "loss": 0.3756, "num_input_tokens_seen": 22336263828, "step": 5729, "train_runtime": 228031.2521, "train_tokens_per_second": 97952.643 }, { "epoch": 0.9109697933227345, "grad_norm": 0.2456366866827011, "learning_rate": 9.789853638542874e-07, "loss": 0.3862, "num_input_tokens_seen": 22339990059, "step": 5730, "train_runtime": 228071.3055, "train_tokens_per_second": 97951.779 }, { "epoch": 0.9111287758346582, "grad_norm": 0.25656965374946594, "learning_rate": 9.75521187734177e-07, "loss": 0.3862, "num_input_tokens_seen": 22343918462, "step": 5731, "train_runtime": 228109.3041, "train_tokens_per_second": 97952.683 }, { "epoch": 0.9112877583465818, "grad_norm": 0.2057362198829651, "learning_rate": 9.720630295762395e-07, "loss": 0.3742, "num_input_tokens_seen": 22347811188, "step": 5732, "train_runtime": 228148.9952, "train_tokens_per_second": 97952.705 }, { "epoch": 0.9114467408585055, "grad_norm": 0.19881051778793335, "learning_rate": 9.686108902467268e-07, "loss": 0.3954, "num_input_tokens_seen": 22351702681, "step": 5733, "train_runtime": 228187.8039, "train_tokens_per_second": 97953.1 }, { "epoch": 0.9116057233704292, "grad_norm": 0.21100440621376038, "learning_rate": 9.65164770610369e-07, "loss": 0.3902, "num_input_tokens_seen": 22355527807, "step": 5734, "train_runtime": 228228.574, "train_tokens_per_second": 97952.362 }, { "epoch": 0.9117647058823529, "grad_norm": 0.20791450142860413, "learning_rate": 9.617246715304034e-07, "loss": 0.3748, "num_input_tokens_seen": 22359403259, "step": 5735, "train_runtime": 228264.6435, "train_tokens_per_second": 97953.861 }, { "epoch": 0.9119236883942766, "grad_norm": 0.23574574291706085, "learning_rate": 9.582905938685354e-07, "loss": 0.392, "num_input_tokens_seen": 22363321521, "step": 5736, "train_runtime": 228303.1927, "train_tokens_per_second": 97954.484 }, { "epoch": 0.9120826709062003, "grad_norm": 0.19150038063526154, "learning_rate": 9.548625384849963e-07, "loss": 0.3855, "num_input_tokens_seen": 22367105964, "step": 5737, "train_runtime": 228339.5383, "train_tokens_per_second": 97955.466 }, { "epoch": 0.912241653418124, "grad_norm": 0.24728934466838837, "learning_rate": 9.514405062384741e-07, "loss": 0.3778, "num_input_tokens_seen": 22371094903, "step": 5738, "train_runtime": 228379.6728, "train_tokens_per_second": 97955.718 }, { "epoch": 0.9124006359300477, "grad_norm": 0.21607154607772827, "learning_rate": 9.480244979861691e-07, "loss": 0.3944, "num_input_tokens_seen": 22374985122, "step": 5739, "train_runtime": 228420.3437, "train_tokens_per_second": 97955.308 }, { "epoch": 0.9125596184419714, "grad_norm": 0.1878720074892044, "learning_rate": 9.44614514583772e-07, "loss": 0.3831, "num_input_tokens_seen": 22378990001, "step": 5740, "train_runtime": 228456.0438, "train_tokens_per_second": 97957.531 }, { "epoch": 0.9127186009538951, "grad_norm": 0.20843146741390228, "learning_rate": 9.412105568854495e-07, "loss": 0.3834, "num_input_tokens_seen": 22382902735, "step": 5741, "train_runtime": 228496.4279, "train_tokens_per_second": 97957.342 }, { "epoch": 0.9128775834658187, "grad_norm": 0.17693305015563965, "learning_rate": 9.378126257438807e-07, "loss": 0.3732, "num_input_tokens_seen": 22386750560, "step": 5742, "train_runtime": 228534.9769, "train_tokens_per_second": 97957.656 }, { "epoch": 0.9130365659777424, "grad_norm": 0.16722866892814636, "learning_rate": 9.344207220102125e-07, "loss": 0.3806, "num_input_tokens_seen": 22390744222, "step": 5743, "train_runtime": 228573.3786, "train_tokens_per_second": 97958.67 }, { "epoch": 0.9131955484896661, "grad_norm": 0.17116045951843262, "learning_rate": 9.310348465340985e-07, "loss": 0.3873, "num_input_tokens_seen": 22394677096, "step": 5744, "train_runtime": 228609.9008, "train_tokens_per_second": 97960.224 }, { "epoch": 0.9133545310015898, "grad_norm": 0.21993450820446014, "learning_rate": 9.276550001636769e-07, "loss": 0.3845, "num_input_tokens_seen": 22398483147, "step": 5745, "train_runtime": 228648.2399, "train_tokens_per_second": 97960.444 }, { "epoch": 0.9135135135135135, "grad_norm": 0.22932274639606476, "learning_rate": 9.242811837455733e-07, "loss": 0.3772, "num_input_tokens_seen": 22402557507, "step": 5746, "train_runtime": 228687.5492, "train_tokens_per_second": 97961.422 }, { "epoch": 0.9136724960254372, "grad_norm": 0.19526797533035278, "learning_rate": 9.209133981249063e-07, "loss": 0.3829, "num_input_tokens_seen": 22406494897, "step": 5747, "train_runtime": 228728.3596, "train_tokens_per_second": 97961.158 }, { "epoch": 0.9138314785373609, "grad_norm": 0.19178329408168793, "learning_rate": 9.175516441452814e-07, "loss": 0.3897, "num_input_tokens_seen": 22410219956, "step": 5748, "train_runtime": 228765.3805, "train_tokens_per_second": 97961.588 }, { "epoch": 0.9139904610492846, "grad_norm": 0.29889556765556335, "learning_rate": 9.141959226487945e-07, "loss": 0.3903, "num_input_tokens_seen": 22414148130, "step": 5749, "train_runtime": 228806.6934, "train_tokens_per_second": 97961.068 }, { "epoch": 0.9141494435612083, "grad_norm": 0.2849881649017334, "learning_rate": 9.108462344760288e-07, "loss": 0.386, "num_input_tokens_seen": 22418152641, "step": 5750, "train_runtime": 228846.0332, "train_tokens_per_second": 97961.727 }, { "epoch": 0.914308426073132, "grad_norm": 0.20355913043022156, "learning_rate": 9.075025804660603e-07, "loss": 0.3725, "num_input_tokens_seen": 22421982374, "step": 5751, "train_runtime": 228885.2977, "train_tokens_per_second": 97961.654 }, { "epoch": 0.9144674085850556, "grad_norm": 0.22057627141475677, "learning_rate": 9.041649614564496e-07, "loss": 0.3773, "num_input_tokens_seen": 22425763256, "step": 5752, "train_runtime": 228922.0111, "train_tokens_per_second": 97962.46 }, { "epoch": 0.9146263910969793, "grad_norm": 0.21726064383983612, "learning_rate": 9.008333782832474e-07, "loss": 0.3669, "num_input_tokens_seen": 22429743835, "step": 5753, "train_runtime": 228962.2748, "train_tokens_per_second": 97962.618 }, { "epoch": 0.914785373608903, "grad_norm": 0.25353512167930603, "learning_rate": 8.975078317809915e-07, "loss": 0.3844, "num_input_tokens_seen": 22433636550, "step": 5754, "train_runtime": 229001.149, "train_tokens_per_second": 97962.987 }, { "epoch": 0.9149443561208267, "grad_norm": 0.2059965282678604, "learning_rate": 8.941883227827019e-07, "loss": 0.3764, "num_input_tokens_seen": 22437417736, "step": 5755, "train_runtime": 229041.6, "train_tokens_per_second": 97962.194 }, { "epoch": 0.9151033386327504, "grad_norm": 0.2407243847846985, "learning_rate": 8.908748521199023e-07, "loss": 0.3817, "num_input_tokens_seen": 22441363431, "step": 5756, "train_runtime": 229081.4666, "train_tokens_per_second": 97962.37 }, { "epoch": 0.9152623211446741, "grad_norm": 0.19492053985595703, "learning_rate": 8.875674206225842e-07, "loss": 0.3707, "num_input_tokens_seen": 22445329886, "step": 5757, "train_runtime": 229120.1929, "train_tokens_per_second": 97963.124 }, { "epoch": 0.9154213036565978, "grad_norm": 0.19590255618095398, "learning_rate": 8.842660291192462e-07, "loss": 0.396, "num_input_tokens_seen": 22449270746, "step": 5758, "train_runtime": 229158.2861, "train_tokens_per_second": 97964.037 }, { "epoch": 0.9155802861685215, "grad_norm": 0.26324477791786194, "learning_rate": 8.809706784368516e-07, "loss": 0.3793, "num_input_tokens_seen": 22453145431, "step": 5759, "train_runtime": 229197.4969, "train_tokens_per_second": 97964.183 }, { "epoch": 0.9157392686804452, "grad_norm": 0.21400484442710876, "learning_rate": 8.776813694008734e-07, "loss": 0.3962, "num_input_tokens_seen": 22457102990, "step": 5760, "train_runtime": 229236.2342, "train_tokens_per_second": 97964.892 }, { "epoch": 0.9158982511923688, "grad_norm": 0.19954803586006165, "learning_rate": 8.743981028352527e-07, "loss": 0.3858, "num_input_tokens_seen": 22461042696, "step": 5761, "train_runtime": 229274.646, "train_tokens_per_second": 97965.663 }, { "epoch": 0.9160572337042925, "grad_norm": 0.207880899310112, "learning_rate": 8.711208795624259e-07, "loss": 0.3918, "num_input_tokens_seen": 22464952843, "step": 5762, "train_runtime": 229313.213, "train_tokens_per_second": 97966.238 }, { "epoch": 0.9162162162162162, "grad_norm": 0.20539188385009766, "learning_rate": 8.678497004033198e-07, "loss": 0.3949, "num_input_tokens_seen": 22468836800, "step": 5763, "train_runtime": 229351.6948, "train_tokens_per_second": 97966.735 }, { "epoch": 0.9163751987281399, "grad_norm": 0.18688473105430603, "learning_rate": 8.645845661773316e-07, "loss": 0.3777, "num_input_tokens_seen": 22472734318, "step": 5764, "train_runtime": 229392.6045, "train_tokens_per_second": 97966.255 }, { "epoch": 0.9165341812400636, "grad_norm": 0.2533746659755707, "learning_rate": 8.613254777023683e-07, "loss": 0.3783, "num_input_tokens_seen": 22476616148, "step": 5765, "train_runtime": 229432.9262, "train_tokens_per_second": 97965.957 }, { "epoch": 0.9166931637519873, "grad_norm": 0.214362233877182, "learning_rate": 8.580724357947933e-07, "loss": 0.3865, "num_input_tokens_seen": 22480456650, "step": 5766, "train_runtime": 229473.3187, "train_tokens_per_second": 97965.449 }, { "epoch": 0.916852146263911, "grad_norm": 0.20247691869735718, "learning_rate": 8.548254412694801e-07, "loss": 0.3696, "num_input_tokens_seen": 22484436817, "step": 5767, "train_runtime": 229512.115, "train_tokens_per_second": 97966.231 }, { "epoch": 0.9170111287758347, "grad_norm": 0.24661846458911896, "learning_rate": 8.51584494939775e-07, "loss": 0.3854, "num_input_tokens_seen": 22488255231, "step": 5768, "train_runtime": 229551.846, "train_tokens_per_second": 97965.909 }, { "epoch": 0.9171701112877584, "grad_norm": 0.18737800419330597, "learning_rate": 8.483495976175149e-07, "loss": 0.3822, "num_input_tokens_seen": 22492136285, "step": 5769, "train_runtime": 229589.9951, "train_tokens_per_second": 97966.535 }, { "epoch": 0.9173290937996821, "grad_norm": 0.2986370325088501, "learning_rate": 8.451207501130154e-07, "loss": 0.3818, "num_input_tokens_seen": 22495974631, "step": 5770, "train_runtime": 229630.054, "train_tokens_per_second": 97966.16 }, { "epoch": 0.9174880763116057, "grad_norm": 0.1800830215215683, "learning_rate": 8.418979532350824e-07, "loss": 0.3967, "num_input_tokens_seen": 22499977770, "step": 5771, "train_runtime": 229665.6138, "train_tokens_per_second": 97968.422 }, { "epoch": 0.9176470588235294, "grad_norm": 0.5101851224899292, "learning_rate": 8.386812077910006e-07, "loss": 0.3846, "num_input_tokens_seen": 22503883753, "step": 5772, "train_runtime": 229703.6358, "train_tokens_per_second": 97969.21 }, { "epoch": 0.9178060413354531, "grad_norm": 0.33631956577301025, "learning_rate": 8.354705145865421e-07, "loss": 0.3792, "num_input_tokens_seen": 22507809870, "step": 5773, "train_runtime": 229743.4276, "train_tokens_per_second": 97969.331 }, { "epoch": 0.9179650238473768, "grad_norm": 0.29113587737083435, "learning_rate": 8.322658744259665e-07, "loss": 0.3821, "num_input_tokens_seen": 22511608200, "step": 5774, "train_runtime": 229783.6338, "train_tokens_per_second": 97968.719 }, { "epoch": 0.9181240063593005, "grad_norm": 0.22678475081920624, "learning_rate": 8.29067288112012e-07, "loss": 0.3826, "num_input_tokens_seen": 22515511602, "step": 5775, "train_runtime": 229820.9344, "train_tokens_per_second": 97969.803 }, { "epoch": 0.9182829888712242, "grad_norm": 0.2386743277311325, "learning_rate": 8.258747564459019e-07, "loss": 0.3801, "num_input_tokens_seen": 22519405265, "step": 5776, "train_runtime": 229859.8678, "train_tokens_per_second": 97970.148 }, { "epoch": 0.9184419713831479, "grad_norm": 0.37688586115837097, "learning_rate": 8.226882802273433e-07, "loss": 0.3797, "num_input_tokens_seen": 22523308323, "step": 5777, "train_runtime": 229896.5078, "train_tokens_per_second": 97971.511 }, { "epoch": 0.9186009538950716, "grad_norm": 0.2823828458786011, "learning_rate": 8.195078602545231e-07, "loss": 0.3907, "num_input_tokens_seen": 22527313035, "step": 5778, "train_runtime": 229934.7653, "train_tokens_per_second": 97972.627 }, { "epoch": 0.9187599364069953, "grad_norm": 0.2637365460395813, "learning_rate": 8.163334973241149e-07, "loss": 0.3859, "num_input_tokens_seen": 22531198287, "step": 5779, "train_runtime": 229973.2331, "train_tokens_per_second": 97973.134 }, { "epoch": 0.918918918918919, "grad_norm": 0.3758320212364197, "learning_rate": 8.131651922312772e-07, "loss": 0.381, "num_input_tokens_seen": 22535004746, "step": 5780, "train_runtime": 230010.5811, "train_tokens_per_second": 97973.774 }, { "epoch": 0.9190779014308426, "grad_norm": 0.20498326420783997, "learning_rate": 8.100029457696473e-07, "loss": 0.3768, "num_input_tokens_seen": 22538988246, "step": 5781, "train_runtime": 230048.214, "train_tokens_per_second": 97975.063 }, { "epoch": 0.9192368839427663, "grad_norm": 0.19895461201667786, "learning_rate": 8.068467587313416e-07, "loss": 0.4049, "num_input_tokens_seen": 22542739726, "step": 5782, "train_runtime": 230087.7726, "train_tokens_per_second": 97974.523 }, { "epoch": 0.91939586645469, "grad_norm": 0.25217270851135254, "learning_rate": 8.036966319069667e-07, "loss": 0.3913, "num_input_tokens_seen": 22546708218, "step": 5783, "train_runtime": 230124.291, "train_tokens_per_second": 97976.22 }, { "epoch": 0.9195548489666137, "grad_norm": 0.33157697319984436, "learning_rate": 8.005525660856078e-07, "loss": 0.3762, "num_input_tokens_seen": 22550459455, "step": 5784, "train_runtime": 230162.8274, "train_tokens_per_second": 97976.114 }, { "epoch": 0.9197138314785374, "grad_norm": 0.1834709644317627, "learning_rate": 7.974145620548268e-07, "loss": 0.3831, "num_input_tokens_seen": 22554478770, "step": 5785, "train_runtime": 230202.9824, "train_tokens_per_second": 97976.484 }, { "epoch": 0.9198728139904611, "grad_norm": 0.20247270166873932, "learning_rate": 7.942826206006726e-07, "loss": 0.3805, "num_input_tokens_seen": 22558390058, "step": 5786, "train_runtime": 230244.183, "train_tokens_per_second": 97975.939 }, { "epoch": 0.9200317965023848, "grad_norm": 0.24891164898872375, "learning_rate": 7.911567425076788e-07, "loss": 0.3817, "num_input_tokens_seen": 22562269902, "step": 5787, "train_runtime": 230280.7987, "train_tokens_per_second": 97977.209 }, { "epoch": 0.9201907790143085, "grad_norm": 0.23143264651298523, "learning_rate": 7.880369285588551e-07, "loss": 0.389, "num_input_tokens_seen": 22566171002, "step": 5788, "train_runtime": 230319.4526, "train_tokens_per_second": 97977.703 }, { "epoch": 0.9203497615262322, "grad_norm": 0.5428652763366699, "learning_rate": 7.849231795356848e-07, "loss": 0.3813, "num_input_tokens_seen": 22570107420, "step": 5789, "train_runtime": 230358.8334, "train_tokens_per_second": 97978.042 }, { "epoch": 0.9205087440381559, "grad_norm": 0.4345470666885376, "learning_rate": 7.818154962181495e-07, "loss": 0.3825, "num_input_tokens_seen": 22574095281, "step": 5790, "train_runtime": 230398.3918, "train_tokens_per_second": 97978.528 }, { "epoch": 0.9206677265500794, "grad_norm": 0.2172270566225052, "learning_rate": 7.78713879384696e-07, "loss": 0.3807, "num_input_tokens_seen": 22577904873, "step": 5791, "train_runtime": 230438.7525, "train_tokens_per_second": 97977.899 }, { "epoch": 0.9208267090620031, "grad_norm": 0.26024842262268066, "learning_rate": 7.75618329812261e-07, "loss": 0.3809, "num_input_tokens_seen": 22581844909, "step": 5792, "train_runtime": 230478.5699, "train_tokens_per_second": 97978.068 }, { "epoch": 0.9209856915739268, "grad_norm": 0.2505006194114685, "learning_rate": 7.72528848276255e-07, "loss": 0.3631, "num_input_tokens_seen": 22585760810, "step": 5793, "train_runtime": 230518.7488, "train_tokens_per_second": 97977.978 }, { "epoch": 0.9211446740858505, "grad_norm": 0.18663878738880157, "learning_rate": 7.694454355505754e-07, "loss": 0.3763, "num_input_tokens_seen": 22589637522, "step": 5794, "train_runtime": 230555.5902, "train_tokens_per_second": 97979.136 }, { "epoch": 0.9213036565977742, "grad_norm": 0.23527956008911133, "learning_rate": 7.663680924075905e-07, "loss": 0.392, "num_input_tokens_seen": 22593441060, "step": 5795, "train_runtime": 230595.2467, "train_tokens_per_second": 97978.78 }, { "epoch": 0.921462639109698, "grad_norm": 0.19732342660427094, "learning_rate": 7.632968196181561e-07, "loss": 0.3763, "num_input_tokens_seen": 22597331990, "step": 5796, "train_runtime": 230633.6932, "train_tokens_per_second": 97979.318 }, { "epoch": 0.9216216216216216, "grad_norm": 0.1784907728433609, "learning_rate": 7.602316179516039e-07, "loss": 0.3842, "num_input_tokens_seen": 22601268664, "step": 5797, "train_runtime": 230674.1475, "train_tokens_per_second": 97979.201 }, { "epoch": 0.9217806041335453, "grad_norm": 0.1746741384267807, "learning_rate": 7.571724881757419e-07, "loss": 0.3684, "num_input_tokens_seen": 22605170909, "step": 5798, "train_runtime": 230712.0887, "train_tokens_per_second": 97980.002 }, { "epoch": 0.921939586645469, "grad_norm": 0.21999306976795197, "learning_rate": 7.541194310568655e-07, "loss": 0.3785, "num_input_tokens_seen": 22609142915, "step": 5799, "train_runtime": 230753.7075, "train_tokens_per_second": 97979.543 }, { "epoch": 0.9220985691573926, "grad_norm": 0.24634943902492523, "learning_rate": 7.510724473597381e-07, "loss": 0.3836, "num_input_tokens_seen": 22613116175, "step": 5800, "train_runtime": 230793.8301, "train_tokens_per_second": 97979.726 }, { "epoch": 0.9222575516693163, "grad_norm": 0.18749724328517914, "learning_rate": 7.480315378476155e-07, "loss": 0.3929, "num_input_tokens_seen": 22617062548, "step": 5801, "train_runtime": 230949.5512, "train_tokens_per_second": 97930.749 }, { "epoch": 0.92241653418124, "grad_norm": 0.1945691853761673, "learning_rate": 7.449967032822192e-07, "loss": 0.3892, "num_input_tokens_seen": 22620767567, "step": 5802, "train_runtime": 230984.9564, "train_tokens_per_second": 97931.778 }, { "epoch": 0.9225755166931637, "grad_norm": 0.28852078318595886, "learning_rate": 7.419679444237521e-07, "loss": 0.3945, "num_input_tokens_seen": 22624592119, "step": 5803, "train_runtime": 231026.4972, "train_tokens_per_second": 97930.724 }, { "epoch": 0.9227344992050874, "grad_norm": 0.16882435977458954, "learning_rate": 7.389452620309018e-07, "loss": 0.367, "num_input_tokens_seen": 22628649917, "step": 5804, "train_runtime": 231067.2731, "train_tokens_per_second": 97931.003 }, { "epoch": 0.9228934817170111, "grad_norm": 0.2826617658138275, "learning_rate": 7.359286568608237e-07, "loss": 0.3868, "num_input_tokens_seen": 22632449026, "step": 5805, "train_runtime": 231103.5947, "train_tokens_per_second": 97932.051 }, { "epoch": 0.9230524642289348, "grad_norm": 0.2187211662530899, "learning_rate": 7.329181296691606e-07, "loss": 0.3865, "num_input_tokens_seen": 22636235915, "step": 5806, "train_runtime": 231143.5151, "train_tokens_per_second": 97931.521 }, { "epoch": 0.9232114467408585, "grad_norm": 0.2110346257686615, "learning_rate": 7.29913681210026e-07, "loss": 0.3762, "num_input_tokens_seen": 22640247522, "step": 5807, "train_runtime": 231183.4574, "train_tokens_per_second": 97931.953 }, { "epoch": 0.9233704292527822, "grad_norm": 0.18728990852832794, "learning_rate": 7.269153122360123e-07, "loss": 0.3982, "num_input_tokens_seen": 22644251125, "step": 5808, "train_runtime": 231224.3913, "train_tokens_per_second": 97931.931 }, { "epoch": 0.9235294117647059, "grad_norm": 0.18787743151187897, "learning_rate": 7.239230234981881e-07, "loss": 0.3754, "num_input_tokens_seen": 22648041105, "step": 5809, "train_runtime": 231263.6406, "train_tokens_per_second": 97931.698 }, { "epoch": 0.9236883942766295, "grad_norm": 0.2529025077819824, "learning_rate": 7.209368157461066e-07, "loss": 0.3866, "num_input_tokens_seen": 22651944585, "step": 5810, "train_runtime": 231304.6307, "train_tokens_per_second": 97931.22 }, { "epoch": 0.9238473767885532, "grad_norm": 0.2814071476459503, "learning_rate": 7.17956689727789e-07, "loss": 0.3864, "num_input_tokens_seen": 22655970755, "step": 5811, "train_runtime": 231344.1264, "train_tokens_per_second": 97931.904 }, { "epoch": 0.9240063593004769, "grad_norm": 0.18537098169326782, "learning_rate": 7.149826461897353e-07, "loss": 0.3882, "num_input_tokens_seen": 22659851929, "step": 5812, "train_runtime": 231383.3133, "train_tokens_per_second": 97932.092 }, { "epoch": 0.9241653418124006, "grad_norm": 0.28134700655937195, "learning_rate": 7.120146858769188e-07, "loss": 0.3824, "num_input_tokens_seen": 22663778155, "step": 5813, "train_runtime": 231422.7303, "train_tokens_per_second": 97932.377 }, { "epoch": 0.9243243243243243, "grad_norm": 0.20453190803527832, "learning_rate": 7.090528095327981e-07, "loss": 0.3913, "num_input_tokens_seen": 22667615632, "step": 5814, "train_runtime": 231462.8082, "train_tokens_per_second": 97932.0 }, { "epoch": 0.924483306836248, "grad_norm": 0.24118000268936157, "learning_rate": 7.060970178993015e-07, "loss": 0.395, "num_input_tokens_seen": 22671635936, "step": 5815, "train_runtime": 231502.0501, "train_tokens_per_second": 97932.765 }, { "epoch": 0.9246422893481717, "grad_norm": 0.24574242532253265, "learning_rate": 7.031473117168314e-07, "loss": 0.3786, "num_input_tokens_seen": 22675454843, "step": 5816, "train_runtime": 231540.1786, "train_tokens_per_second": 97933.132 }, { "epoch": 0.9248012718600954, "grad_norm": 0.40692415833473206, "learning_rate": 7.002036917242715e-07, "loss": 0.3902, "num_input_tokens_seen": 22679374510, "step": 5817, "train_runtime": 231577.8644, "train_tokens_per_second": 97934.121 }, { "epoch": 0.9249602543720191, "grad_norm": 0.18160732090473175, "learning_rate": 6.972661586589712e-07, "loss": 0.3725, "num_input_tokens_seen": 22683310281, "step": 5818, "train_runtime": 231615.4301, "train_tokens_per_second": 97935.229 }, { "epoch": 0.9251192368839428, "grad_norm": 0.26704898476600647, "learning_rate": 6.943347132567696e-07, "loss": 0.3839, "num_input_tokens_seen": 22687296847, "step": 5819, "train_runtime": 231656.3287, "train_tokens_per_second": 97935.148 }, { "epoch": 0.9252782193958664, "grad_norm": 0.19171269237995148, "learning_rate": 6.914093562519736e-07, "loss": 0.379, "num_input_tokens_seen": 22691135715, "step": 5820, "train_runtime": 231695.1229, "train_tokens_per_second": 97935.319 }, { "epoch": 0.9254372019077901, "grad_norm": 0.1991463154554367, "learning_rate": 6.884900883773526e-07, "loss": 0.3854, "num_input_tokens_seen": 22695074176, "step": 5821, "train_runtime": 231734.8791, "train_tokens_per_second": 97935.513 }, { "epoch": 0.9255961844197138, "grad_norm": 0.26138195395469666, "learning_rate": 6.855769103641774e-07, "loss": 0.3851, "num_input_tokens_seen": 22698831323, "step": 5822, "train_runtime": 231772.8376, "train_tokens_per_second": 97935.684 }, { "epoch": 0.9257551669316375, "grad_norm": 0.22673414647579193, "learning_rate": 6.826698229421669e-07, "loss": 0.3859, "num_input_tokens_seen": 22702803430, "step": 5823, "train_runtime": 231809.7149, "train_tokens_per_second": 97937.239 }, { "epoch": 0.9259141494435612, "grad_norm": 0.17680953443050385, "learning_rate": 6.797688268395358e-07, "loss": 0.3879, "num_input_tokens_seen": 22706662284, "step": 5824, "train_runtime": 231848.8279, "train_tokens_per_second": 97937.361 }, { "epoch": 0.9260731319554849, "grad_norm": 0.22920389473438263, "learning_rate": 6.768739227829557e-07, "loss": 0.3836, "num_input_tokens_seen": 22710605624, "step": 5825, "train_runtime": 231888.9591, "train_tokens_per_second": 97937.417 }, { "epoch": 0.9262321144674086, "grad_norm": 0.3593702018260956, "learning_rate": 6.739851114975798e-07, "loss": 0.3906, "num_input_tokens_seen": 22714424211, "step": 5826, "train_runtime": 231927.6255, "train_tokens_per_second": 97937.553 }, { "epoch": 0.9263910969793323, "grad_norm": 0.3449673354625702, "learning_rate": 6.711023937070432e-07, "loss": 0.3937, "num_input_tokens_seen": 22718404792, "step": 5827, "train_runtime": 231968.0609, "train_tokens_per_second": 97937.642 }, { "epoch": 0.926550079491256, "grad_norm": 0.18914447724819183, "learning_rate": 6.682257701334349e-07, "loss": 0.3701, "num_input_tokens_seen": 22722321917, "step": 5828, "train_runtime": 232008.7302, "train_tokens_per_second": 97937.357 }, { "epoch": 0.9267090620031796, "grad_norm": 0.27809715270996094, "learning_rate": 6.653552414973368e-07, "loss": 0.3912, "num_input_tokens_seen": 22726112745, "step": 5829, "train_runtime": 232044.7769, "train_tokens_per_second": 97938.48 }, { "epoch": 0.9268680445151033, "grad_norm": 0.21432583034038544, "learning_rate": 6.624908085177934e-07, "loss": 0.3851, "num_input_tokens_seen": 22730061264, "step": 5830, "train_runtime": 232083.8511, "train_tokens_per_second": 97939.004 }, { "epoch": 0.927027027027027, "grad_norm": 0.23975178599357605, "learning_rate": 6.596324719123248e-07, "loss": 0.3933, "num_input_tokens_seen": 22733888075, "step": 5831, "train_runtime": 232123.4243, "train_tokens_per_second": 97938.793 }, { "epoch": 0.9271860095389507, "grad_norm": 0.2693716585636139, "learning_rate": 6.567802323969225e-07, "loss": 0.3839, "num_input_tokens_seen": 22737964684, "step": 5832, "train_runtime": 232160.6899, "train_tokens_per_second": 97940.632 }, { "epoch": 0.9273449920508744, "grad_norm": 0.19645971059799194, "learning_rate": 6.539340906860563e-07, "loss": 0.3735, "num_input_tokens_seen": 22741795870, "step": 5833, "train_runtime": 232199.7146, "train_tokens_per_second": 97940.671 }, { "epoch": 0.9275039745627981, "grad_norm": 0.19989290833473206, "learning_rate": 6.510940474926646e-07, "loss": 0.3844, "num_input_tokens_seen": 22745706945, "step": 5834, "train_runtime": 232239.1128, "train_tokens_per_second": 97940.897 }, { "epoch": 0.9276629570747218, "grad_norm": 0.2400759607553482, "learning_rate": 6.482601035281532e-07, "loss": 0.382, "num_input_tokens_seen": 22749613790, "step": 5835, "train_runtime": 232276.7289, "train_tokens_per_second": 97941.855 }, { "epoch": 0.9278219395866455, "grad_norm": 0.2700004279613495, "learning_rate": 6.454322595024098e-07, "loss": 0.3855, "num_input_tokens_seen": 22753487983, "step": 5836, "train_runtime": 232316.2679, "train_tokens_per_second": 97941.863 }, { "epoch": 0.9279809220985692, "grad_norm": 0.22147196531295776, "learning_rate": 6.426105161237845e-07, "loss": 0.3884, "num_input_tokens_seen": 22757408387, "step": 5837, "train_runtime": 232355.6353, "train_tokens_per_second": 97942.141 }, { "epoch": 0.9281399046104929, "grad_norm": 0.20687377452850342, "learning_rate": 6.397948740991117e-07, "loss": 0.3853, "num_input_tokens_seen": 22761285618, "step": 5838, "train_runtime": 232396.0193, "train_tokens_per_second": 97941.805 }, { "epoch": 0.9282988871224165, "grad_norm": 0.17716388404369354, "learning_rate": 6.369853341336773e-07, "loss": 0.3884, "num_input_tokens_seen": 22765251768, "step": 5839, "train_runtime": 232432.3098, "train_tokens_per_second": 97943.577 }, { "epoch": 0.9284578696343402, "grad_norm": 0.1997322142124176, "learning_rate": 6.341818969312685e-07, "loss": 0.3744, "num_input_tokens_seen": 22769214581, "step": 5840, "train_runtime": 232470.637, "train_tokens_per_second": 97944.475 }, { "epoch": 0.9286168521462639, "grad_norm": 0.2308640331029892, "learning_rate": 6.313845631941096e-07, "loss": 0.3758, "num_input_tokens_seen": 22773189299, "step": 5841, "train_runtime": 232510.5104, "train_tokens_per_second": 97944.774 }, { "epoch": 0.9287758346581876, "grad_norm": 0.18482674658298492, "learning_rate": 6.285933336229232e-07, "loss": 0.382, "num_input_tokens_seen": 22777055548, "step": 5842, "train_runtime": 232550.7635, "train_tokens_per_second": 97944.445 }, { "epoch": 0.9289348171701113, "grad_norm": 0.20188094675540924, "learning_rate": 6.25808208916892e-07, "loss": 0.3773, "num_input_tokens_seen": 22780859062, "step": 5843, "train_runtime": 232588.8215, "train_tokens_per_second": 97944.772 }, { "epoch": 0.929093799682035, "grad_norm": 0.19934429228305817, "learning_rate": 6.230291897736634e-07, "loss": 0.3787, "num_input_tokens_seen": 22784655884, "step": 5844, "train_runtime": 232629.4016, "train_tokens_per_second": 97944.008 }, { "epoch": 0.9292527821939587, "grad_norm": 0.17929889261722565, "learning_rate": 6.202562768893694e-07, "loss": 0.3923, "num_input_tokens_seen": 22788581124, "step": 5845, "train_runtime": 232669.8445, "train_tokens_per_second": 97943.853 }, { "epoch": 0.9294117647058824, "grad_norm": 0.19277213513851166, "learning_rate": 6.174894709585988e-07, "loss": 0.3775, "num_input_tokens_seen": 22792491252, "step": 5846, "train_runtime": 232713.1073, "train_tokens_per_second": 97942.447 }, { "epoch": 0.9295707472178061, "grad_norm": 0.17568786442279816, "learning_rate": 6.14728772674425e-07, "loss": 0.3882, "num_input_tokens_seen": 22796437660, "step": 5847, "train_runtime": 232753.3286, "train_tokens_per_second": 97942.478 }, { "epoch": 0.9297297297297298, "grad_norm": 0.24284715950489044, "learning_rate": 6.119741827283781e-07, "loss": 0.3735, "num_input_tokens_seen": 22800338297, "step": 5848, "train_runtime": 232792.9743, "train_tokens_per_second": 97942.553 }, { "epoch": 0.9298887122416534, "grad_norm": 0.34112057089805603, "learning_rate": 6.092257018104619e-07, "loss": 0.3816, "num_input_tokens_seen": 22804255367, "step": 5849, "train_runtime": 232833.396, "train_tokens_per_second": 97942.373 }, { "epoch": 0.9300476947535771, "grad_norm": 0.20992180705070496, "learning_rate": 6.06483330609156e-07, "loss": 0.3847, "num_input_tokens_seen": 22808060795, "step": 5850, "train_runtime": 232872.6319, "train_tokens_per_second": 97942.212 }, { "epoch": 0.9302066772655008, "grad_norm": 0.23155926167964935, "learning_rate": 6.037470698113973e-07, "loss": 0.4031, "num_input_tokens_seen": 22811974058, "step": 5851, "train_runtime": 232909.65, "train_tokens_per_second": 97943.447 }, { "epoch": 0.9303656597774245, "grad_norm": 0.9050425291061401, "learning_rate": 6.010169201026122e-07, "loss": 0.393, "num_input_tokens_seen": 22815717943, "step": 5852, "train_runtime": 232949.4638, "train_tokens_per_second": 97942.779 }, { "epoch": 0.9305246422893482, "grad_norm": 0.2086598426103592, "learning_rate": 5.982928821666734e-07, "loss": 0.386, "num_input_tokens_seen": 22819675408, "step": 5853, "train_runtime": 232989.9656, "train_tokens_per_second": 97942.739 }, { "epoch": 0.9306836248012719, "grad_norm": 0.25555387139320374, "learning_rate": 5.955749566859348e-07, "loss": 0.3714, "num_input_tokens_seen": 22823541280, "step": 5854, "train_runtime": 233030.5542, "train_tokens_per_second": 97942.269 }, { "epoch": 0.9308426073131956, "grad_norm": 0.2316432148218155, "learning_rate": 5.928631443412214e-07, "loss": 0.3877, "num_input_tokens_seen": 22827438497, "step": 5855, "train_runtime": 233069.3296, "train_tokens_per_second": 97942.696 }, { "epoch": 0.9310015898251193, "grad_norm": 0.18592052161693573, "learning_rate": 5.90157445811823e-07, "loss": 0.3815, "num_input_tokens_seen": 22831279565, "step": 5856, "train_runtime": 233107.4501, "train_tokens_per_second": 97943.157 }, { "epoch": 0.931160572337043, "grad_norm": 1.5240899324417114, "learning_rate": 5.874578617754922e-07, "loss": 0.3757, "num_input_tokens_seen": 22835102344, "step": 5857, "train_runtime": 233147.3511, "train_tokens_per_second": 97942.791 }, { "epoch": 0.9313195548489667, "grad_norm": 0.258466511964798, "learning_rate": 5.847643929084601e-07, "loss": 0.3816, "num_input_tokens_seen": 22839059312, "step": 5858, "train_runtime": 233186.4706, "train_tokens_per_second": 97943.329 }, { "epoch": 0.9314785373608903, "grad_norm": 0.41132739186286926, "learning_rate": 5.820770398854203e-07, "loss": 0.3869, "num_input_tokens_seen": 22843053923, "step": 5859, "train_runtime": 233224.0837, "train_tokens_per_second": 97944.661 }, { "epoch": 0.931637519872814, "grad_norm": 0.4062902629375458, "learning_rate": 5.793958033795372e-07, "loss": 0.3899, "num_input_tokens_seen": 22846872115, "step": 5860, "train_runtime": 233263.8462, "train_tokens_per_second": 97944.334 }, { "epoch": 0.9317965023847377, "grad_norm": 0.2007797807455063, "learning_rate": 5.767206840624428e-07, "loss": 0.3971, "num_input_tokens_seen": 22850843646, "step": 5861, "train_runtime": 233304.7962, "train_tokens_per_second": 97944.166 }, { "epoch": 0.9319554848966614, "grad_norm": 0.2838321924209595, "learning_rate": 5.740516826042291e-07, "loss": 0.3942, "num_input_tokens_seen": 22854822739, "step": 5862, "train_runtime": 233345.1783, "train_tokens_per_second": 97944.268 }, { "epoch": 0.9321144674085851, "grad_norm": 0.21765534579753876, "learning_rate": 5.713887996734724e-07, "loss": 0.3829, "num_input_tokens_seen": 22858546604, "step": 5863, "train_runtime": 233383.8632, "train_tokens_per_second": 97943.989 }, { "epoch": 0.9322734499205088, "grad_norm": 0.21149888634681702, "learning_rate": 5.687320359371973e-07, "loss": 0.3879, "num_input_tokens_seen": 22862478389, "step": 5864, "train_runtime": 233421.6669, "train_tokens_per_second": 97944.971 }, { "epoch": 0.9324324324324325, "grad_norm": 0.30251356959342957, "learning_rate": 5.660813920609049e-07, "loss": 0.3794, "num_input_tokens_seen": 22866440965, "step": 5865, "train_runtime": 233459.8395, "train_tokens_per_second": 97945.929 }, { "epoch": 0.9325914149443562, "grad_norm": 0.20343083143234253, "learning_rate": 5.634368687085695e-07, "loss": 0.3853, "num_input_tokens_seen": 22870413914, "step": 5866, "train_runtime": 233497.5095, "train_tokens_per_second": 97947.143 }, { "epoch": 0.9327503974562799, "grad_norm": 0.21858324110507965, "learning_rate": 5.60798466542617e-07, "loss": 0.3854, "num_input_tokens_seen": 22874278320, "step": 5867, "train_runtime": 233535.5519, "train_tokens_per_second": 97947.735 }, { "epoch": 0.9329093799682034, "grad_norm": 0.20228195190429688, "learning_rate": 5.581661862239574e-07, "loss": 0.4, "num_input_tokens_seen": 22878081401, "step": 5868, "train_runtime": 233575.0531, "train_tokens_per_second": 97947.452 }, { "epoch": 0.9330683624801271, "grad_norm": 0.29218220710754395, "learning_rate": 5.555400284119466e-07, "loss": 0.3912, "num_input_tokens_seen": 22881937996, "step": 5869, "train_runtime": 233612.9701, "train_tokens_per_second": 97948.063 }, { "epoch": 0.9332273449920508, "grad_norm": 0.188187375664711, "learning_rate": 5.529199937644309e-07, "loss": 0.3799, "num_input_tokens_seen": 22885918102, "step": 5870, "train_runtime": 233652.4104, "train_tokens_per_second": 97948.564 }, { "epoch": 0.9333863275039745, "grad_norm": 0.1890752613544464, "learning_rate": 5.503060829377043e-07, "loss": 0.3943, "num_input_tokens_seen": 22889714005, "step": 5871, "train_runtime": 233690.0185, "train_tokens_per_second": 97949.044 }, { "epoch": 0.9335453100158982, "grad_norm": 0.79957515001297, "learning_rate": 5.476982965865323e-07, "loss": 0.3765, "num_input_tokens_seen": 22893691147, "step": 5872, "train_runtime": 233728.6256, "train_tokens_per_second": 97949.881 }, { "epoch": 0.9337042925278219, "grad_norm": 0.24359896779060364, "learning_rate": 5.450966353641506e-07, "loss": 0.3993, "num_input_tokens_seen": 22897585022, "step": 5873, "train_runtime": 233767.0959, "train_tokens_per_second": 97950.419 }, { "epoch": 0.9338632750397456, "grad_norm": 0.2764322757720947, "learning_rate": 5.425010999222519e-07, "loss": 0.3853, "num_input_tokens_seen": 22901448209, "step": 5874, "train_runtime": 233805.0408, "train_tokens_per_second": 97951.046 }, { "epoch": 0.9340222575516693, "grad_norm": 0.23552735149860382, "learning_rate": 5.399116909110046e-07, "loss": 0.3775, "num_input_tokens_seen": 22905415618, "step": 5875, "train_runtime": 233844.8625, "train_tokens_per_second": 97951.331 }, { "epoch": 0.934181240063593, "grad_norm": 0.20546749234199524, "learning_rate": 5.373284089790348e-07, "loss": 0.3723, "num_input_tokens_seen": 22909213242, "step": 5876, "train_runtime": 233886.4261, "train_tokens_per_second": 97950.162 }, { "epoch": 0.9343402225755167, "grad_norm": 0.18588073551654816, "learning_rate": 5.347512547734357e-07, "loss": 0.3809, "num_input_tokens_seen": 22913197057, "step": 5877, "train_runtime": 233926.1722, "train_tokens_per_second": 97950.549 }, { "epoch": 0.9344992050874403, "grad_norm": 0.7910546064376831, "learning_rate": 5.321802289397687e-07, "loss": 0.3868, "num_input_tokens_seen": 22917057358, "step": 5878, "train_runtime": 233966.1129, "train_tokens_per_second": 97950.327 }, { "epoch": 0.934658187599364, "grad_norm": 0.2341330349445343, "learning_rate": 5.296153321220576e-07, "loss": 0.3841, "num_input_tokens_seen": 22920979533, "step": 5879, "train_runtime": 234003.4017, "train_tokens_per_second": 97951.48 }, { "epoch": 0.9348171701112877, "grad_norm": 0.24912536144256592, "learning_rate": 5.270565649627857e-07, "loss": 0.3812, "num_input_tokens_seen": 22924887888, "step": 5880, "train_runtime": 234041.8874, "train_tokens_per_second": 97952.072 }, { "epoch": 0.9349761526232114, "grad_norm": 0.21017049252986908, "learning_rate": 5.245039281029124e-07, "loss": 0.3813, "num_input_tokens_seen": 22928866632, "step": 5881, "train_runtime": 234081.4318, "train_tokens_per_second": 97952.522 }, { "epoch": 0.9351351351351351, "grad_norm": 0.20376838743686676, "learning_rate": 5.219574221818513e-07, "loss": 0.3707, "num_input_tokens_seen": 22932814137, "step": 5882, "train_runtime": 234120.7544, "train_tokens_per_second": 97952.931 }, { "epoch": 0.9352941176470588, "grad_norm": 0.2798883616924286, "learning_rate": 5.194170478374838e-07, "loss": 0.382, "num_input_tokens_seen": 22936695690, "step": 5883, "train_runtime": 234162.3896, "train_tokens_per_second": 97952.091 }, { "epoch": 0.9354531001589825, "grad_norm": 0.17554359138011932, "learning_rate": 5.16882805706162e-07, "loss": 0.4013, "num_input_tokens_seen": 22940612040, "step": 5884, "train_runtime": 234203.2339, "train_tokens_per_second": 97951.73 }, { "epoch": 0.9356120826709062, "grad_norm": 0.3110840916633606, "learning_rate": 5.143546964226864e-07, "loss": 0.3892, "num_input_tokens_seen": 22944508650, "step": 5885, "train_runtime": 234242.4382, "train_tokens_per_second": 97951.972 }, { "epoch": 0.9357710651828299, "grad_norm": 0.2257680743932724, "learning_rate": 5.118327206203366e-07, "loss": 0.3744, "num_input_tokens_seen": 22948340310, "step": 5886, "train_runtime": 234282.8845, "train_tokens_per_second": 97951.416 }, { "epoch": 0.9359300476947536, "grad_norm": 0.20157590508460999, "learning_rate": 5.093168789308489e-07, "loss": 0.3891, "num_input_tokens_seen": 22952302214, "step": 5887, "train_runtime": 234322.3781, "train_tokens_per_second": 97951.815 }, { "epoch": 0.9360890302066772, "grad_norm": 0.18672014772891998, "learning_rate": 5.068071719844247e-07, "loss": 0.3693, "num_input_tokens_seen": 22956251564, "step": 5888, "train_runtime": 234360.4091, "train_tokens_per_second": 97952.771 }, { "epoch": 0.9362480127186009, "grad_norm": 0.2571352422237396, "learning_rate": 5.043036004097223e-07, "loss": 0.3858, "num_input_tokens_seen": 22960088494, "step": 5889, "train_runtime": 234400.7792, "train_tokens_per_second": 97952.27 }, { "epoch": 0.9364069952305246, "grad_norm": 0.38802313804626465, "learning_rate": 5.018061648338762e-07, "loss": 0.3806, "num_input_tokens_seen": 22964025015, "step": 5890, "train_runtime": 234441.4246, "train_tokens_per_second": 97952.079 }, { "epoch": 0.9365659777424483, "grad_norm": 0.18729707598686218, "learning_rate": 4.99314865882472e-07, "loss": 0.3828, "num_input_tokens_seen": 22967904983, "step": 5891, "train_runtime": 234481.0244, "train_tokens_per_second": 97952.084 }, { "epoch": 0.936724960254372, "grad_norm": 0.21634463965892792, "learning_rate": 4.968297041795633e-07, "loss": 0.3962, "num_input_tokens_seen": 22971831627, "step": 5892, "train_runtime": 234519.0654, "train_tokens_per_second": 97952.939 }, { "epoch": 0.9368839427662957, "grad_norm": 0.3017696440219879, "learning_rate": 4.943506803476661e-07, "loss": 0.3865, "num_input_tokens_seen": 22975693786, "step": 5893, "train_runtime": 234558.1751, "train_tokens_per_second": 97953.072 }, { "epoch": 0.9370429252782194, "grad_norm": 0.20164582133293152, "learning_rate": 4.918777950077585e-07, "loss": 0.3804, "num_input_tokens_seen": 22979532336, "step": 5894, "train_runtime": 234598.717, "train_tokens_per_second": 97952.506 }, { "epoch": 0.9372019077901431, "grad_norm": 0.23359256982803345, "learning_rate": 4.894110487792785e-07, "loss": 0.375, "num_input_tokens_seen": 22983431578, "step": 5895, "train_runtime": 234637.0491, "train_tokens_per_second": 97953.122 }, { "epoch": 0.9373608903020668, "grad_norm": 0.1818910539150238, "learning_rate": 4.869504422801291e-07, "loss": 0.3805, "num_input_tokens_seen": 22987344118, "step": 5896, "train_runtime": 234675.3243, "train_tokens_per_second": 97953.818 }, { "epoch": 0.9375198728139904, "grad_norm": 0.23836910724639893, "learning_rate": 4.84495976126681e-07, "loss": 0.386, "num_input_tokens_seen": 22991193022, "step": 5897, "train_runtime": 234715.1628, "train_tokens_per_second": 97953.591 }, { "epoch": 0.9376788553259141, "grad_norm": 0.29535484313964844, "learning_rate": 4.820476509337535e-07, "loss": 0.3833, "num_input_tokens_seen": 22995170637, "step": 5898, "train_runtime": 234756.6175, "train_tokens_per_second": 97953.237 }, { "epoch": 0.9378378378378378, "grad_norm": 0.28583279252052307, "learning_rate": 4.796054673146338e-07, "loss": 0.3839, "num_input_tokens_seen": 22999184107, "step": 5899, "train_runtime": 234796.7353, "train_tokens_per_second": 97953.594 }, { "epoch": 0.9379968203497615, "grad_norm": 0.21634335815906525, "learning_rate": 4.771694258810771e-07, "loss": 0.3905, "num_input_tokens_seen": 23003101044, "step": 5900, "train_runtime": 234835.3988, "train_tokens_per_second": 97954.146 }, { "epoch": 0.9381558028616852, "grad_norm": 0.2776516079902649, "learning_rate": 4.7473952724329254e-07, "loss": 0.3894, "num_input_tokens_seen": 23006926339, "step": 5901, "train_runtime": 234875.3869, "train_tokens_per_second": 97953.756 }, { "epoch": 0.9383147853736089, "grad_norm": 0.2085501104593277, "learning_rate": 4.723157720099541e-07, "loss": 0.3858, "num_input_tokens_seen": 23010760267, "step": 5902, "train_runtime": 234914.265, "train_tokens_per_second": 97953.865 }, { "epoch": 0.9384737678855326, "grad_norm": 0.212423637509346, "learning_rate": 4.6989816078819024e-07, "loss": 0.3867, "num_input_tokens_seen": 23014594804, "step": 5903, "train_runtime": 234950.7689, "train_tokens_per_second": 97954.967 }, { "epoch": 0.9386327503974563, "grad_norm": 0.17751093208789825, "learning_rate": 4.6748669418359983e-07, "loss": 0.3758, "num_input_tokens_seen": 23018428939, "step": 5904, "train_runtime": 234990.4975, "train_tokens_per_second": 97954.722 }, { "epoch": 0.93879173290938, "grad_norm": 0.2621767818927765, "learning_rate": 4.6508137280023843e-07, "loss": 0.3899, "num_input_tokens_seen": 23022363340, "step": 5905, "train_runtime": 235030.5083, "train_tokens_per_second": 97954.787 }, { "epoch": 0.9389507154213037, "grad_norm": 0.22670724987983704, "learning_rate": 4.626821972406187e-07, "loss": 0.3751, "num_input_tokens_seen": 23026391371, "step": 5906, "train_runtime": 235070.9684, "train_tokens_per_second": 97955.062 }, { "epoch": 0.9391096979332273, "grad_norm": 0.20418795943260193, "learning_rate": 4.6028916810572364e-07, "loss": 0.3817, "num_input_tokens_seen": 23030371932, "step": 5907, "train_runtime": 235110.2796, "train_tokens_per_second": 97955.615 }, { "epoch": 0.939268680445151, "grad_norm": 0.22934968769550323, "learning_rate": 4.579022859949794e-07, "loss": 0.3821, "num_input_tokens_seen": 23034281571, "step": 5908, "train_runtime": 235145.2831, "train_tokens_per_second": 97957.659 }, { "epoch": 0.9394276629570747, "grad_norm": 0.21998904645442963, "learning_rate": 4.5552155150629383e-07, "loss": 0.3947, "num_input_tokens_seen": 23038109267, "step": 5909, "train_runtime": 235185.9798, "train_tokens_per_second": 97956.984 }, { "epoch": 0.9395866454689984, "grad_norm": 0.5551968812942505, "learning_rate": 4.5314696523601483e-07, "loss": 0.371, "num_input_tokens_seen": 23042038153, "step": 5910, "train_runtime": 235223.7515, "train_tokens_per_second": 97957.957 }, { "epoch": 0.9397456279809221, "grad_norm": 0.2155977040529251, "learning_rate": 4.507785277789695e-07, "loss": 0.3933, "num_input_tokens_seen": 23045967304, "step": 5911, "train_runtime": 235263.682, "train_tokens_per_second": 97958.032 }, { "epoch": 0.9399046104928458, "grad_norm": 0.1993749439716339, "learning_rate": 4.484162397284275e-07, "loss": 0.379, "num_input_tokens_seen": 23049903333, "step": 5912, "train_runtime": 235304.0423, "train_tokens_per_second": 97957.957 }, { "epoch": 0.9400635930047695, "grad_norm": 0.33656904101371765, "learning_rate": 4.460601016761268e-07, "loss": 0.3925, "num_input_tokens_seen": 23053758203, "step": 5913, "train_runtime": 235341.001, "train_tokens_per_second": 97958.954 }, { "epoch": 0.9402225755166932, "grad_norm": 0.327195942401886, "learning_rate": 4.437101142122674e-07, "loss": 0.3735, "num_input_tokens_seen": 23057597968, "step": 5914, "train_runtime": 235378.7087, "train_tokens_per_second": 97959.574 }, { "epoch": 0.9403815580286169, "grad_norm": 0.4911068379878998, "learning_rate": 4.413662779254979e-07, "loss": 0.3842, "num_input_tokens_seen": 23061464222, "step": 5915, "train_runtime": 235417.2098, "train_tokens_per_second": 97959.976 }, { "epoch": 0.9405405405405406, "grad_norm": 0.19368235766887665, "learning_rate": 4.390285934029348e-07, "loss": 0.3936, "num_input_tokens_seen": 23065338348, "step": 5916, "train_runtime": 235455.7662, "train_tokens_per_second": 97960.389 }, { "epoch": 0.9406995230524642, "grad_norm": 0.18339748680591583, "learning_rate": 4.3669706123015406e-07, "loss": 0.389, "num_input_tokens_seen": 23069344588, "step": 5917, "train_runtime": 235495.6972, "train_tokens_per_second": 97960.79 }, { "epoch": 0.9408585055643879, "grad_norm": 0.21100157499313354, "learning_rate": 4.3437168199118583e-07, "loss": 0.385, "num_input_tokens_seen": 23073200812, "step": 5918, "train_runtime": 235534.9352, "train_tokens_per_second": 97960.843 }, { "epoch": 0.9410174880763116, "grad_norm": 0.17882241308689117, "learning_rate": 4.3205245626852244e-07, "loss": 0.3819, "num_input_tokens_seen": 23077095309, "step": 5919, "train_runtime": 235574.2563, "train_tokens_per_second": 97961.024 }, { "epoch": 0.9411764705882353, "grad_norm": 0.18612918257713318, "learning_rate": 4.29739384643113e-07, "loss": 0.3725, "num_input_tokens_seen": 23080912911, "step": 5920, "train_runtime": 235612.9189, "train_tokens_per_second": 97961.152 }, { "epoch": 0.941335453100159, "grad_norm": 0.1991008073091507, "learning_rate": 4.274324676943664e-07, "loss": 0.3897, "num_input_tokens_seen": 23084820250, "step": 5921, "train_runtime": 235651.1392, "train_tokens_per_second": 97961.844 }, { "epoch": 0.9414944356120827, "grad_norm": 0.20691779255867004, "learning_rate": 4.251317060001453e-07, "loss": 0.3918, "num_input_tokens_seen": 23088728802, "step": 5922, "train_runtime": 235690.0037, "train_tokens_per_second": 97962.274 }, { "epoch": 0.9416534181240064, "grad_norm": 0.20775899291038513, "learning_rate": 4.2283710013678037e-07, "loss": 0.3837, "num_input_tokens_seen": 23092558161, "step": 5923, "train_runtime": 235730.1358, "train_tokens_per_second": 97961.841 }, { "epoch": 0.9418124006359301, "grad_norm": 0.25309038162231445, "learning_rate": 4.205486506790479e-07, "loss": 0.3907, "num_input_tokens_seen": 23096531908, "step": 5924, "train_runtime": 235769.3156, "train_tokens_per_second": 97962.417 }, { "epoch": 0.9419713831478538, "grad_norm": 0.23064011335372925, "learning_rate": 4.18266358200195e-07, "loss": 0.3952, "num_input_tokens_seen": 23100389243, "step": 5925, "train_runtime": 235808.7956, "train_tokens_per_second": 97962.373 }, { "epoch": 0.9421303656597774, "grad_norm": 0.18579047918319702, "learning_rate": 4.159902232719143e-07, "loss": 0.3806, "num_input_tokens_seen": 23104253552, "step": 5926, "train_runtime": 235847.5492, "train_tokens_per_second": 97962.661 }, { "epoch": 0.9422893481717011, "grad_norm": 1.0307611227035522, "learning_rate": 4.137202464643636e-07, "loss": 0.3824, "num_input_tokens_seen": 23108169213, "step": 5927, "train_runtime": 235886.4378, "train_tokens_per_second": 97963.111 }, { "epoch": 0.9424483306836248, "grad_norm": 0.25507065653800964, "learning_rate": 4.114564283461547e-07, "loss": 0.3723, "num_input_tokens_seen": 23111970616, "step": 5928, "train_runtime": 235924.8048, "train_tokens_per_second": 97963.292 }, { "epoch": 0.9426073131955485, "grad_norm": 0.21974726021289825, "learning_rate": 4.091987694843619e-07, "loss": 0.3927, "num_input_tokens_seen": 23115943085, "step": 5929, "train_runtime": 235965.2198, "train_tokens_per_second": 97963.349 }, { "epoch": 0.9427662957074722, "grad_norm": 0.29441767930984497, "learning_rate": 4.069472704445104e-07, "loss": 0.3765, "num_input_tokens_seen": 23119874551, "step": 5930, "train_runtime": 236004.1394, "train_tokens_per_second": 97963.852 }, { "epoch": 0.9429252782193959, "grad_norm": 0.43311062455177307, "learning_rate": 4.047019317905798e-07, "loss": 0.391, "num_input_tokens_seen": 23123785986, "step": 5931, "train_runtime": 236045.4616, "train_tokens_per_second": 97963.273 }, { "epoch": 0.9430842607313196, "grad_norm": 0.2139342874288559, "learning_rate": 4.024627540850201e-07, "loss": 0.368, "num_input_tokens_seen": 23127654267, "step": 5932, "train_runtime": 236085.1581, "train_tokens_per_second": 97963.186 }, { "epoch": 0.9432432432432433, "grad_norm": 0.19176727533340454, "learning_rate": 4.002297378887243e-07, "loss": 0.3752, "num_input_tokens_seen": 23131618335, "step": 5933, "train_runtime": 236124.6641, "train_tokens_per_second": 97963.584 }, { "epoch": 0.943402225755167, "grad_norm": 0.24103012681007385, "learning_rate": 3.980028837610478e-07, "loss": 0.3912, "num_input_tokens_seen": 23135499507, "step": 5934, "train_runtime": 236163.5152, "train_tokens_per_second": 97963.902 }, { "epoch": 0.9435612082670907, "grad_norm": 0.22918643057346344, "learning_rate": 3.957821922598026e-07, "loss": 0.3785, "num_input_tokens_seen": 23139462867, "step": 5935, "train_runtime": 236203.1691, "train_tokens_per_second": 97964.235 }, { "epoch": 0.9437201907790143, "grad_norm": 0.19857391715049744, "learning_rate": 3.935676639412578e-07, "loss": 0.3959, "num_input_tokens_seen": 23143325818, "step": 5936, "train_runtime": 236243.0095, "train_tokens_per_second": 97964.066 }, { "epoch": 0.943879173290938, "grad_norm": 0.3638882637023926, "learning_rate": 3.9135929936013615e-07, "loss": 0.3825, "num_input_tokens_seen": 23147193518, "step": 5937, "train_runtime": 236281.5376, "train_tokens_per_second": 97964.461 }, { "epoch": 0.9440381558028617, "grad_norm": 0.20763978362083435, "learning_rate": 3.891570990696147e-07, "loss": 0.3752, "num_input_tokens_seen": 23151096249, "step": 5938, "train_runtime": 236320.683, "train_tokens_per_second": 97964.748 }, { "epoch": 0.9441971383147854, "grad_norm": 0.18503360450267792, "learning_rate": 3.8696106362133e-07, "loss": 0.3882, "num_input_tokens_seen": 23154986993, "step": 5939, "train_runtime": 236362.1972, "train_tokens_per_second": 97964.003 }, { "epoch": 0.9443561208267091, "grad_norm": 0.2965971529483795, "learning_rate": 3.8477119356537804e-07, "loss": 0.3806, "num_input_tokens_seen": 23158896603, "step": 5940, "train_runtime": 236401.524, "train_tokens_per_second": 97964.244 }, { "epoch": 0.9445151033386328, "grad_norm": 0.1878087818622589, "learning_rate": 3.825874894502979e-07, "loss": 0.3806, "num_input_tokens_seen": 23162836698, "step": 5941, "train_runtime": 236440.8016, "train_tokens_per_second": 97964.634 }, { "epoch": 0.9446740858505565, "grad_norm": 0.18321667611598969, "learning_rate": 3.804099518230991e-07, "loss": 0.3831, "num_input_tokens_seen": 23166717600, "step": 5942, "train_runtime": 236479.3866, "train_tokens_per_second": 97965.061 }, { "epoch": 0.9448330683624802, "grad_norm": 0.17883719503879547, "learning_rate": 3.7823858122923715e-07, "loss": 0.3892, "num_input_tokens_seen": 23170680578, "step": 5943, "train_runtime": 236518.0108, "train_tokens_per_second": 97965.819 }, { "epoch": 0.9449920508744039, "grad_norm": 0.21724115312099457, "learning_rate": 3.760733782126241e-07, "loss": 0.3959, "num_input_tokens_seen": 23174584445, "step": 5944, "train_runtime": 236557.8555, "train_tokens_per_second": 97965.821 }, { "epoch": 0.9451510333863276, "grad_norm": 0.21680662035942078, "learning_rate": 3.7391434331562615e-07, "loss": 0.3931, "num_input_tokens_seen": 23178391771, "step": 5945, "train_runtime": 236596.9014, "train_tokens_per_second": 97965.745 }, { "epoch": 0.9453100158982511, "grad_norm": 0.2168215662240982, "learning_rate": 3.7176147707907174e-07, "loss": 0.3856, "num_input_tokens_seen": 23182472210, "step": 5946, "train_runtime": 236637.9225, "train_tokens_per_second": 97966.006 }, { "epoch": 0.9454689984101748, "grad_norm": 0.18310965597629547, "learning_rate": 3.696147800422323e-07, "loss": 0.3773, "num_input_tokens_seen": 23186327625, "step": 5947, "train_runtime": 236675.706, "train_tokens_per_second": 97966.657 }, { "epoch": 0.9456279809220985, "grad_norm": 0.2564848065376282, "learning_rate": 3.6747425274284706e-07, "loss": 0.3876, "num_input_tokens_seen": 23190182905, "step": 5948, "train_runtime": 236716.1326, "train_tokens_per_second": 97966.212 }, { "epoch": 0.9457869634340222, "grad_norm": 0.20665377378463745, "learning_rate": 3.653398957170956e-07, "loss": 0.3831, "num_input_tokens_seen": 23194145559, "step": 5949, "train_runtime": 236756.3321, "train_tokens_per_second": 97966.316 }, { "epoch": 0.9459459459459459, "grad_norm": 0.2083345502614975, "learning_rate": 3.6321170949962793e-07, "loss": 0.3854, "num_input_tokens_seen": 23198069064, "step": 5950, "train_runtime": 236795.5795, "train_tokens_per_second": 97966.648 }, { "epoch": 0.9461049284578696, "grad_norm": 0.22130504250526428, "learning_rate": 3.6108969462353146e-07, "loss": 0.378, "num_input_tokens_seen": 23202052603, "step": 5951, "train_runtime": 236834.0209, "train_tokens_per_second": 97967.566 }, { "epoch": 0.9462639109697933, "grad_norm": 0.2247617244720459, "learning_rate": 3.589738516203589e-07, "loss": 0.3994, "num_input_tokens_seen": 23205827188, "step": 5952, "train_runtime": 236873.4119, "train_tokens_per_second": 97967.21 }, { "epoch": 0.946422893481717, "grad_norm": 0.2836274206638336, "learning_rate": 3.568641810201168e-07, "loss": 0.3963, "num_input_tokens_seen": 23209811728, "step": 5953, "train_runtime": 236910.4677, "train_tokens_per_second": 97968.705 }, { "epoch": 0.9465818759936407, "grad_norm": 0.31771114468574524, "learning_rate": 3.5476068335125733e-07, "loss": 0.3751, "num_input_tokens_seen": 23213712121, "step": 5954, "train_runtime": 236951.5426, "train_tokens_per_second": 97968.183 }, { "epoch": 0.9467408585055644, "grad_norm": 0.25490427017211914, "learning_rate": 3.5266335914069804e-07, "loss": 0.3907, "num_input_tokens_seen": 23217469404, "step": 5955, "train_runtime": 236989.0897, "train_tokens_per_second": 97968.516 }, { "epoch": 0.946899841017488, "grad_norm": 0.1974523961544037, "learning_rate": 3.5057220891379917e-07, "loss": 0.3765, "num_input_tokens_seen": 23221454179, "step": 5956, "train_runtime": 237028.2782, "train_tokens_per_second": 97969.13 }, { "epoch": 0.9470588235294117, "grad_norm": 0.2521542012691498, "learning_rate": 3.484872331943778e-07, "loss": 0.3901, "num_input_tokens_seen": 23225226980, "step": 5957, "train_runtime": 237069.4063, "train_tokens_per_second": 97968.048 }, { "epoch": 0.9472178060413354, "grad_norm": 0.29118987917900085, "learning_rate": 3.464084325047107e-07, "loss": 0.3825, "num_input_tokens_seen": 23229077060, "step": 5958, "train_runtime": 237108.8435, "train_tokens_per_second": 97967.991 }, { "epoch": 0.9473767885532591, "grad_norm": 0.23942536115646362, "learning_rate": 3.443358073655201e-07, "loss": 0.3838, "num_input_tokens_seen": 23233061118, "step": 5959, "train_runtime": 237147.9917, "train_tokens_per_second": 97968.618 }, { "epoch": 0.9475357710651828, "grad_norm": 0.23348508775234222, "learning_rate": 3.422693582959852e-07, "loss": 0.384, "num_input_tokens_seen": 23236924643, "step": 5960, "train_runtime": 237188.5007, "train_tokens_per_second": 97968.175 }, { "epoch": 0.9476947535771065, "grad_norm": 0.1993388533592224, "learning_rate": 3.4020908581373356e-07, "loss": 0.3998, "num_input_tokens_seen": 23240834291, "step": 5961, "train_runtime": 237229.1069, "train_tokens_per_second": 97967.887 }, { "epoch": 0.9478537360890302, "grad_norm": 0.3682944178581238, "learning_rate": 3.381549904348524e-07, "loss": 0.3827, "num_input_tokens_seen": 23244723653, "step": 5962, "train_runtime": 237269.198, "train_tokens_per_second": 97967.725 }, { "epoch": 0.9480127186009539, "grad_norm": 0.21274219453334808, "learning_rate": 3.361070726738774e-07, "loss": 0.3773, "num_input_tokens_seen": 23248660579, "step": 5963, "train_runtime": 237306.7643, "train_tokens_per_second": 97968.807 }, { "epoch": 0.9481717011128776, "grad_norm": 0.21264255046844482, "learning_rate": 3.340653330437954e-07, "loss": 0.374, "num_input_tokens_seen": 23252510770, "step": 5964, "train_runtime": 237346.1344, "train_tokens_per_second": 97968.778 }, { "epoch": 0.9483306836248012, "grad_norm": 0.23908498883247375, "learning_rate": 3.3202977205605003e-07, "loss": 0.3957, "num_input_tokens_seen": 23256425032, "step": 5965, "train_runtime": 237385.825, "train_tokens_per_second": 97968.887 }, { "epoch": 0.9484896661367249, "grad_norm": 0.18642403185367584, "learning_rate": 3.300003902205362e-07, "loss": 0.386, "num_input_tokens_seen": 23260209239, "step": 5966, "train_runtime": 237423.0705, "train_tokens_per_second": 97969.457 }, { "epoch": 0.9486486486486486, "grad_norm": 0.2107210010290146, "learning_rate": 3.2797718804559453e-07, "loss": 0.3939, "num_input_tokens_seen": 23264215750, "step": 5967, "train_runtime": 237462.3456, "train_tokens_per_second": 97970.125 }, { "epoch": 0.9488076311605723, "grad_norm": 0.1790679395198822, "learning_rate": 3.259601660380279e-07, "loss": 0.3831, "num_input_tokens_seen": 23268131174, "step": 5968, "train_runtime": 237500.6959, "train_tokens_per_second": 97970.792 }, { "epoch": 0.948966613672496, "grad_norm": 0.1815321445465088, "learning_rate": 3.2394932470308226e-07, "loss": 0.3897, "num_input_tokens_seen": 23272082263, "step": 5969, "train_runtime": 237538.9952, "train_tokens_per_second": 97971.629 }, { "epoch": 0.9491255961844197, "grad_norm": 0.19014057517051697, "learning_rate": 3.2194466454446295e-07, "loss": 0.3873, "num_input_tokens_seen": 23275956145, "step": 5970, "train_runtime": 237578.1267, "train_tokens_per_second": 97971.798 }, { "epoch": 0.9492845786963434, "grad_norm": 0.20269803702831268, "learning_rate": 3.19946186064321e-07, "loss": 0.3908, "num_input_tokens_seen": 23279817131, "step": 5971, "train_runtime": 237615.4783, "train_tokens_per_second": 97972.646 }, { "epoch": 0.9494435612082671, "grad_norm": 0.26103004813194275, "learning_rate": 3.179538897632617e-07, "loss": 0.3781, "num_input_tokens_seen": 23283706944, "step": 5972, "train_runtime": 237653.5712, "train_tokens_per_second": 97973.31 }, { "epoch": 0.9496025437201908, "grad_norm": 0.36263933777809143, "learning_rate": 3.1596777614034126e-07, "loss": 0.3831, "num_input_tokens_seen": 23287627949, "step": 5973, "train_runtime": 237692.1639, "train_tokens_per_second": 97973.899 }, { "epoch": 0.9497615262321145, "grad_norm": 0.3767465054988861, "learning_rate": 3.1398784569306447e-07, "loss": 0.3818, "num_input_tokens_seen": 23291556940, "step": 5974, "train_runtime": 237732.7177, "train_tokens_per_second": 97973.713 }, { "epoch": 0.9499205087440381, "grad_norm": 0.22603833675384521, "learning_rate": 3.1201409891739586e-07, "loss": 0.3832, "num_input_tokens_seen": 23295449687, "step": 5975, "train_runtime": 237774.5816, "train_tokens_per_second": 97972.834 }, { "epoch": 0.9500794912559618, "grad_norm": 0.24236465990543365, "learning_rate": 3.1004653630773706e-07, "loss": 0.384, "num_input_tokens_seen": 23299351211, "step": 5976, "train_runtime": 237814.3355, "train_tokens_per_second": 97972.863 }, { "epoch": 0.9502384737678855, "grad_norm": 0.18018357455730438, "learning_rate": 3.0808515835695494e-07, "loss": 0.401, "num_input_tokens_seen": 23303361636, "step": 5977, "train_runtime": 237854.9073, "train_tokens_per_second": 97973.012 }, { "epoch": 0.9503974562798092, "grad_norm": 0.21157030761241913, "learning_rate": 3.061299655563621e-07, "loss": 0.3875, "num_input_tokens_seen": 23307178204, "step": 5978, "train_runtime": 237892.2588, "train_tokens_per_second": 97973.672 }, { "epoch": 0.9505564387917329, "grad_norm": 0.20476074516773224, "learning_rate": 3.0418095839571394e-07, "loss": 0.3838, "num_input_tokens_seen": 23311021072, "step": 5979, "train_runtime": 237930.265, "train_tokens_per_second": 97974.174 }, { "epoch": 0.9507154213036566, "grad_norm": 0.45903632044792175, "learning_rate": 3.0223813736322827e-07, "loss": 0.3829, "num_input_tokens_seen": 23314960107, "step": 5980, "train_runtime": 237968.7836, "train_tokens_per_second": 97974.868 }, { "epoch": 0.9508744038155803, "grad_norm": 0.2108822613954544, "learning_rate": 3.0030150294556305e-07, "loss": 0.3806, "num_input_tokens_seen": 23318821871, "step": 5981, "train_runtime": 238011.1171, "train_tokens_per_second": 97973.667 }, { "epoch": 0.951033386327504, "grad_norm": 0.23603743314743042, "learning_rate": 2.9837105562783576e-07, "loss": 0.3774, "num_input_tokens_seen": 23322687245, "step": 5982, "train_runtime": 238049.736, "train_tokens_per_second": 97974.01 }, { "epoch": 0.9511923688394277, "grad_norm": 0.21762511134147644, "learning_rate": 2.964467958936096e-07, "loss": 0.3835, "num_input_tokens_seen": 23326570471, "step": 5983, "train_runtime": 238090.7571, "train_tokens_per_second": 97973.44 }, { "epoch": 0.9513513513513514, "grad_norm": 0.21913635730743408, "learning_rate": 2.9452872422489895e-07, "loss": 0.3836, "num_input_tokens_seen": 23330565421, "step": 5984, "train_runtime": 238126.5562, "train_tokens_per_second": 97975.487 }, { "epoch": 0.951510333863275, "grad_norm": 0.2761322557926178, "learning_rate": 2.9261684110216404e-07, "loss": 0.3878, "num_input_tokens_seen": 23334402732, "step": 5985, "train_runtime": 238169.0953, "train_tokens_per_second": 97974.1 }, { "epoch": 0.9516693163751987, "grad_norm": 0.23713448643684387, "learning_rate": 2.9071114700431613e-07, "loss": 0.387, "num_input_tokens_seen": 23338243956, "step": 5986, "train_runtime": 238207.0196, "train_tokens_per_second": 97974.627 }, { "epoch": 0.9518282988871224, "grad_norm": 0.20551447570323944, "learning_rate": 2.888116424087234e-07, "loss": 0.3842, "num_input_tokens_seen": 23342177742, "step": 5987, "train_runtime": 238247.326, "train_tokens_per_second": 97974.563 }, { "epoch": 0.9519872813990461, "grad_norm": 0.2099614292383194, "learning_rate": 2.8691832779119685e-07, "loss": 0.3865, "num_input_tokens_seen": 23346035432, "step": 5988, "train_runtime": 238287.7263, "train_tokens_per_second": 97974.142 }, { "epoch": 0.9521462639109698, "grad_norm": 0.18600840866565704, "learning_rate": 2.850312036259989e-07, "loss": 0.3808, "num_input_tokens_seen": 23349751627, "step": 5989, "train_runtime": 238328.5116, "train_tokens_per_second": 97972.968 }, { "epoch": 0.9523052464228935, "grad_norm": 0.1953485757112503, "learning_rate": 2.831502703858374e-07, "loss": 0.3816, "num_input_tokens_seen": 23353732195, "step": 5990, "train_runtime": 238367.6177, "train_tokens_per_second": 97973.594 }, { "epoch": 0.9524642289348172, "grad_norm": 0.20362049341201782, "learning_rate": 2.812755285418772e-07, "loss": 0.3741, "num_input_tokens_seen": 23357730104, "step": 5991, "train_runtime": 238406.6477, "train_tokens_per_second": 97974.324 }, { "epoch": 0.9526232114467409, "grad_norm": 0.20334461331367493, "learning_rate": 2.794069785637232e-07, "loss": 0.3821, "num_input_tokens_seen": 23361661256, "step": 5992, "train_runtime": 238446.7409, "train_tokens_per_second": 97974.337 }, { "epoch": 0.9527821939586646, "grad_norm": 0.3316170871257782, "learning_rate": 2.775446209194399e-07, "loss": 0.3882, "num_input_tokens_seen": 23365508872, "step": 5993, "train_runtime": 238487.0309, "train_tokens_per_second": 97973.918 }, { "epoch": 0.9529411764705882, "grad_norm": 0.22242610156536102, "learning_rate": 2.7568845607553185e-07, "loss": 0.3792, "num_input_tokens_seen": 23369392593, "step": 5994, "train_runtime": 238524.9701, "train_tokens_per_second": 97974.617 }, { "epoch": 0.9531001589825119, "grad_norm": 0.22870098054409027, "learning_rate": 2.738384844969494e-07, "loss": 0.3784, "num_input_tokens_seen": 23373272247, "step": 5995, "train_runtime": 238563.1095, "train_tokens_per_second": 97975.216 }, { "epoch": 0.9532591414944356, "grad_norm": 0.23324662446975708, "learning_rate": 2.7199470664710793e-07, "loss": 0.3798, "num_input_tokens_seen": 23377132636, "step": 5996, "train_runtime": 238602.69, "train_tokens_per_second": 97975.143 }, { "epoch": 0.9534181240063593, "grad_norm": 0.21170362830162048, "learning_rate": 2.701571229878547e-07, "loss": 0.3771, "num_input_tokens_seen": 23380980670, "step": 5997, "train_runtime": 238642.0875, "train_tokens_per_second": 97975.093 }, { "epoch": 0.953577106518283, "grad_norm": 0.19617430865764618, "learning_rate": 2.683257339794909e-07, "loss": 0.383, "num_input_tokens_seen": 23384961200, "step": 5998, "train_runtime": 238683.0301, "train_tokens_per_second": 97974.964 }, { "epoch": 0.9537360890302067, "grad_norm": 0.2198040783405304, "learning_rate": 2.665005400807691e-07, "loss": 0.3837, "num_input_tokens_seen": 23388771385, "step": 5999, "train_runtime": 238722.6016, "train_tokens_per_second": 97974.684 }, { "epoch": 0.9538950715421304, "grad_norm": 0.22270941734313965, "learning_rate": 2.646815417488846e-07, "loss": 0.4066, "num_input_tokens_seen": 23392626842, "step": 6000, "train_runtime": 238762.3618, "train_tokens_per_second": 97974.516 }, { "epoch": 0.9540540540540541, "grad_norm": 0.2790178656578064, "learning_rate": 2.6286873943948966e-07, "loss": 0.3886, "num_input_tokens_seen": 23396619674, "step": 6001, "train_runtime": 238904.4651, "train_tokens_per_second": 97932.953 }, { "epoch": 0.9542130365659778, "grad_norm": 0.22289469838142395, "learning_rate": 2.61062133606671e-07, "loss": 0.3904, "num_input_tokens_seen": 23400506330, "step": 6002, "train_runtime": 238943.8597, "train_tokens_per_second": 97933.072 }, { "epoch": 0.9543720190779015, "grad_norm": 0.3427695333957672, "learning_rate": 2.5926172470297494e-07, "loss": 0.3765, "num_input_tokens_seen": 23404412925, "step": 6003, "train_runtime": 238982.3689, "train_tokens_per_second": 97933.639 }, { "epoch": 0.9545310015898251, "grad_norm": 0.24685019254684448, "learning_rate": 2.57467513179388e-07, "loss": 0.3913, "num_input_tokens_seen": 23408289119, "step": 6004, "train_runtime": 239022.6926, "train_tokens_per_second": 97933.334 }, { "epoch": 0.9546899841017488, "grad_norm": 0.18154969811439514, "learning_rate": 2.556794994853506e-07, "loss": 0.3697, "num_input_tokens_seen": 23412311030, "step": 6005, "train_runtime": 239062.2126, "train_tokens_per_second": 97933.968 }, { "epoch": 0.9548489666136725, "grad_norm": 0.268075555562973, "learning_rate": 2.5389768406874623e-07, "loss": 0.3778, "num_input_tokens_seen": 23416207976, "step": 6006, "train_runtime": 239100.6987, "train_tokens_per_second": 97934.503 }, { "epoch": 0.9550079491255962, "grad_norm": 0.24600186944007874, "learning_rate": 2.521220673759095e-07, "loss": 0.3892, "num_input_tokens_seen": 23419991908, "step": 6007, "train_runtime": 239140.4058, "train_tokens_per_second": 97934.064 }, { "epoch": 0.9551669316375199, "grad_norm": 0.3606269359588623, "learning_rate": 2.503526498516151e-07, "loss": 0.378, "num_input_tokens_seen": 23423889624, "step": 6008, "train_runtime": 239179.7941, "train_tokens_per_second": 97934.233 }, { "epoch": 0.9553259141494436, "grad_norm": 0.19572804868221283, "learning_rate": 2.4858943193909466e-07, "loss": 0.3794, "num_input_tokens_seen": 23427833884, "step": 6009, "train_runtime": 239220.9191, "train_tokens_per_second": 97933.885 }, { "epoch": 0.9554848966613673, "grad_norm": 0.22635363042354584, "learning_rate": 2.468324140800171e-07, "loss": 0.3849, "num_input_tokens_seen": 23431834185, "step": 6010, "train_runtime": 239257.2655, "train_tokens_per_second": 97935.727 }, { "epoch": 0.955643879173291, "grad_norm": 0.21070504188537598, "learning_rate": 2.4508159671450537e-07, "loss": 0.3929, "num_input_tokens_seen": 23435812549, "step": 6011, "train_runtime": 239298.3253, "train_tokens_per_second": 97935.548 }, { "epoch": 0.9558028616852147, "grad_norm": 0.18153859674930573, "learning_rate": 2.433369802811281e-07, "loss": 0.378, "num_input_tokens_seen": 23439688182, "step": 6012, "train_runtime": 239339.7696, "train_tokens_per_second": 97934.782 }, { "epoch": 0.9559618441971384, "grad_norm": 0.18521428108215332, "learning_rate": 2.4159856521689674e-07, "loss": 0.3917, "num_input_tokens_seen": 23443665143, "step": 6013, "train_runtime": 239379.5407, "train_tokens_per_second": 97935.125 }, { "epoch": 0.956120826709062, "grad_norm": 0.20369237661361694, "learning_rate": 2.398663519572741e-07, "loss": 0.3833, "num_input_tokens_seen": 23447535340, "step": 6014, "train_runtime": 239418.7891, "train_tokens_per_second": 97935.235 }, { "epoch": 0.9562798092209857, "grad_norm": 0.19187608361244202, "learning_rate": 2.38140340936166e-07, "loss": 0.3788, "num_input_tokens_seen": 23451383620, "step": 6015, "train_runtime": 239460.1001, "train_tokens_per_second": 97934.41 }, { "epoch": 0.9564387917329094, "grad_norm": 0.24660325050354004, "learning_rate": 2.3642053258592646e-07, "loss": 0.3805, "num_input_tokens_seen": 23455235920, "step": 6016, "train_runtime": 239499.844, "train_tokens_per_second": 97934.243 }, { "epoch": 0.956597774244833, "grad_norm": 0.22911417484283447, "learning_rate": 2.347069273373581e-07, "loss": 0.3825, "num_input_tokens_seen": 23459232816, "step": 6017, "train_runtime": 239537.8376, "train_tokens_per_second": 97935.395 }, { "epoch": 0.9567567567567568, "grad_norm": 0.2950246036052704, "learning_rate": 2.3299952561969818e-07, "loss": 0.3861, "num_input_tokens_seen": 23463041589, "step": 6018, "train_runtime": 239579.0576, "train_tokens_per_second": 97934.443 }, { "epoch": 0.9569157392686805, "grad_norm": 0.2463361769914627, "learning_rate": 2.3129832786064898e-07, "loss": 0.3881, "num_input_tokens_seen": 23467043444, "step": 6019, "train_runtime": 239619.1098, "train_tokens_per_second": 97934.774 }, { "epoch": 0.9570747217806042, "grad_norm": 0.2775970995426178, "learning_rate": 2.2960333448634463e-07, "loss": 0.3969, "num_input_tokens_seen": 23470935247, "step": 6020, "train_runtime": 239658.8415, "train_tokens_per_second": 97934.777 }, { "epoch": 0.9572337042925279, "grad_norm": 0.4060233533382416, "learning_rate": 2.2791454592136496e-07, "loss": 0.3779, "num_input_tokens_seen": 23474786184, "step": 6021, "train_runtime": 239698.8683, "train_tokens_per_second": 97934.489 }, { "epoch": 0.9573926868044516, "grad_norm": 0.20945894718170166, "learning_rate": 2.2623196258874656e-07, "loss": 0.3824, "num_input_tokens_seen": 23478548325, "step": 6022, "train_runtime": 239738.0627, "train_tokens_per_second": 97934.171 }, { "epoch": 0.9575516693163753, "grad_norm": 0.2141111046075821, "learning_rate": 2.2455558490996066e-07, "loss": 0.3715, "num_input_tokens_seen": 23482497456, "step": 6023, "train_runtime": 239779.0333, "train_tokens_per_second": 97933.907 }, { "epoch": 0.9577106518282988, "grad_norm": 0.19051922857761383, "learning_rate": 2.2288541330492963e-07, "loss": 0.3793, "num_input_tokens_seen": 23486550637, "step": 6024, "train_runtime": 239819.3961, "train_tokens_per_second": 97934.325 }, { "epoch": 0.9578696343402225, "grad_norm": 0.20267529785633087, "learning_rate": 2.2122144819201607e-07, "loss": 0.383, "num_input_tokens_seen": 23490318830, "step": 6025, "train_runtime": 239859.4851, "train_tokens_per_second": 97933.667 }, { "epoch": 0.9580286168521462, "grad_norm": 0.2043759971857071, "learning_rate": 2.1956368998803657e-07, "loss": 0.3822, "num_input_tokens_seen": 23494209092, "step": 6026, "train_runtime": 239900.4029, "train_tokens_per_second": 97933.179 }, { "epoch": 0.9581875993640699, "grad_norm": 0.2386837899684906, "learning_rate": 2.1791213910824504e-07, "loss": 0.3847, "num_input_tokens_seen": 23498125041, "step": 6027, "train_runtime": 239941.5949, "train_tokens_per_second": 97932.687 }, { "epoch": 0.9583465818759936, "grad_norm": 1.0220410823822021, "learning_rate": 2.1626679596634115e-07, "loss": 0.3831, "num_input_tokens_seen": 23502061280, "step": 6028, "train_runtime": 239979.2982, "train_tokens_per_second": 97933.703 }, { "epoch": 0.9585055643879173, "grad_norm": 0.20641176402568817, "learning_rate": 2.146276609744785e-07, "loss": 0.3838, "num_input_tokens_seen": 23505911607, "step": 6029, "train_runtime": 240017.6189, "train_tokens_per_second": 97934.109 }, { "epoch": 0.958664546899841, "grad_norm": 0.23062054812908173, "learning_rate": 2.129947345432426e-07, "loss": 0.384, "num_input_tokens_seen": 23509771474, "step": 6030, "train_runtime": 240052.9611, "train_tokens_per_second": 97935.77 }, { "epoch": 0.9588235294117647, "grad_norm": 0.19323068857192993, "learning_rate": 2.113680170816701e-07, "loss": 0.3694, "num_input_tokens_seen": 23513778732, "step": 6031, "train_runtime": 240089.2771, "train_tokens_per_second": 97937.646 }, { "epoch": 0.9589825119236884, "grad_norm": 0.20776937901973724, "learning_rate": 2.0974750899724616e-07, "loss": 0.3804, "num_input_tokens_seen": 23517694099, "step": 6032, "train_runtime": 240128.888, "train_tokens_per_second": 97937.796 }, { "epoch": 0.959141494435612, "grad_norm": 0.26743239164352417, "learning_rate": 2.0813321069589609e-07, "loss": 0.3981, "num_input_tokens_seen": 23521553817, "step": 6033, "train_runtime": 240167.0533, "train_tokens_per_second": 97938.304 }, { "epoch": 0.9593004769475357, "grad_norm": 0.29557541012763977, "learning_rate": 2.065251225819853e-07, "loss": 0.3753, "num_input_tokens_seen": 23525504240, "step": 6034, "train_runtime": 240205.188, "train_tokens_per_second": 97939.201 }, { "epoch": 0.9594594594594594, "grad_norm": 0.22612962126731873, "learning_rate": 2.0492324505833326e-07, "loss": 0.3892, "num_input_tokens_seen": 23529401047, "step": 6035, "train_runtime": 240247.0521, "train_tokens_per_second": 97938.355 }, { "epoch": 0.9596184419713831, "grad_norm": 0.1986289769411087, "learning_rate": 2.0332757852619678e-07, "loss": 0.3759, "num_input_tokens_seen": 23533231005, "step": 6036, "train_runtime": 240286.3956, "train_tokens_per_second": 97938.258 }, { "epoch": 0.9597774244833068, "grad_norm": 0.21138735115528107, "learning_rate": 2.0173812338528387e-07, "loss": 0.3878, "num_input_tokens_seen": 23537216415, "step": 6037, "train_runtime": 240327.353, "train_tokens_per_second": 97938.15 }, { "epoch": 0.9599364069952305, "grad_norm": 0.4027063846588135, "learning_rate": 2.0015488003373438e-07, "loss": 0.3931, "num_input_tokens_seen": 23541000037, "step": 6038, "train_runtime": 240367.1567, "train_tokens_per_second": 97937.673 }, { "epoch": 0.9600953895071542, "grad_norm": 0.24510201811790466, "learning_rate": 1.9857784886814222e-07, "loss": 0.3904, "num_input_tokens_seen": 23544907249, "step": 6039, "train_runtime": 240410.2658, "train_tokens_per_second": 97936.364 }, { "epoch": 0.9602543720190779, "grad_norm": 0.32666903734207153, "learning_rate": 1.9700703028354416e-07, "loss": 0.3698, "num_input_tokens_seen": 23548931212, "step": 6040, "train_runtime": 240449.2938, "train_tokens_per_second": 97937.203 }, { "epoch": 0.9604133545310016, "grad_norm": 0.17410053312778473, "learning_rate": 1.9544242467341712e-07, "loss": 0.3753, "num_input_tokens_seen": 23552917484, "step": 6041, "train_runtime": 240487.7028, "train_tokens_per_second": 97938.137 }, { "epoch": 0.9605723370429253, "grad_norm": 0.24517174065113068, "learning_rate": 1.9388403242968923e-07, "loss": 0.3711, "num_input_tokens_seen": 23556714167, "step": 6042, "train_runtime": 240527.7614, "train_tokens_per_second": 97937.61 }, { "epoch": 0.9607313195548489, "grad_norm": 0.1816767454147339, "learning_rate": 1.9233185394271768e-07, "loss": 0.3776, "num_input_tokens_seen": 23560596056, "step": 6043, "train_runtime": 240569.4807, "train_tokens_per_second": 97936.762 }, { "epoch": 0.9608903020667726, "grad_norm": 0.2680511474609375, "learning_rate": 1.9078588960131927e-07, "loss": 0.3826, "num_input_tokens_seen": 23564591155, "step": 6044, "train_runtime": 240609.6529, "train_tokens_per_second": 97937.015 }, { "epoch": 0.9610492845786963, "grad_norm": 0.2325913906097412, "learning_rate": 1.892461397927453e-07, "loss": 0.3944, "num_input_tokens_seen": 23568377801, "step": 6045, "train_runtime": 240650.8048, "train_tokens_per_second": 97936.002 }, { "epoch": 0.96120826709062, "grad_norm": 0.18633586168289185, "learning_rate": 1.8771260490269005e-07, "loss": 0.3841, "num_input_tokens_seen": 23572375400, "step": 6046, "train_runtime": 240690.6485, "train_tokens_per_second": 97936.399 }, { "epoch": 0.9613672496025437, "grad_norm": 0.1863066405057907, "learning_rate": 1.8618528531529899e-07, "loss": 0.3756, "num_input_tokens_seen": 23576311633, "step": 6047, "train_runtime": 240731.7576, "train_tokens_per_second": 97936.026 }, { "epoch": 0.9615262321144674, "grad_norm": 0.4334338903427124, "learning_rate": 1.8466418141314669e-07, "loss": 0.3942, "num_input_tokens_seen": 23580214117, "step": 6048, "train_runtime": 240768.3908, "train_tokens_per_second": 97937.333 }, { "epoch": 0.9616852146263911, "grad_norm": 0.19937001168727875, "learning_rate": 1.8314929357726441e-07, "loss": 0.3807, "num_input_tokens_seen": 23584133918, "step": 6049, "train_runtime": 240809.2009, "train_tokens_per_second": 97937.013 }, { "epoch": 0.9618441971383148, "grad_norm": 0.2244577556848526, "learning_rate": 1.8164062218711809e-07, "loss": 0.3854, "num_input_tokens_seen": 23588002490, "step": 6050, "train_runtime": 240848.751, "train_tokens_per_second": 97936.993 }, { "epoch": 0.9620031796502385, "grad_norm": 0.20456427335739136, "learning_rate": 1.801381676206221e-07, "loss": 0.3833, "num_input_tokens_seen": 23591998798, "step": 6051, "train_runtime": 240888.6297, "train_tokens_per_second": 97937.37 }, { "epoch": 0.9621621621621622, "grad_norm": 0.19863708317279816, "learning_rate": 1.7864193025412813e-07, "loss": 0.3872, "num_input_tokens_seen": 23595894093, "step": 6052, "train_runtime": 240928.3346, "train_tokens_per_second": 97937.398 }, { "epoch": 0.9623211446740858, "grad_norm": 0.203058660030365, "learning_rate": 1.771519104624364e-07, "loss": 0.371, "num_input_tokens_seen": 23599671880, "step": 6053, "train_runtime": 240968.8443, "train_tokens_per_second": 97936.611 }, { "epoch": 0.9624801271860095, "grad_norm": 0.30775076150894165, "learning_rate": 1.7566810861878169e-07, "loss": 0.3862, "num_input_tokens_seen": 23603615783, "step": 6054, "train_runtime": 241007.3533, "train_tokens_per_second": 97937.326 }, { "epoch": 0.9626391096979332, "grad_norm": 0.18573437631130219, "learning_rate": 1.7419052509485e-07, "loss": 0.3814, "num_input_tokens_seen": 23607586227, "step": 6055, "train_runtime": 241045.8603, "train_tokens_per_second": 97938.153 }, { "epoch": 0.9627980922098569, "grad_norm": 0.193894162774086, "learning_rate": 1.7271916026076195e-07, "loss": 0.3825, "num_input_tokens_seen": 23611515463, "step": 6056, "train_runtime": 241084.6667, "train_tokens_per_second": 97938.686 }, { "epoch": 0.9629570747217806, "grad_norm": 0.3084978461265564, "learning_rate": 1.712540144850866e-07, "loss": 0.3728, "num_input_tokens_seen": 23615421861, "step": 6057, "train_runtime": 241120.3363, "train_tokens_per_second": 97940.399 }, { "epoch": 0.9631160572337043, "grad_norm": 0.1921175867319107, "learning_rate": 1.6979508813483313e-07, "loss": 0.3915, "num_input_tokens_seen": 23619390944, "step": 6058, "train_runtime": 241160.621, "train_tokens_per_second": 97940.496 }, { "epoch": 0.963275039745628, "grad_norm": 0.2677019238471985, "learning_rate": 1.6834238157544813e-07, "loss": 0.3728, "num_input_tokens_seen": 23623270000, "step": 6059, "train_runtime": 241196.724, "train_tokens_per_second": 97941.919 }, { "epoch": 0.9634340222575517, "grad_norm": 0.37156975269317627, "learning_rate": 1.6689589517083215e-07, "loss": 0.3835, "num_input_tokens_seen": 23627099248, "step": 6060, "train_runtime": 241238.4548, "train_tokens_per_second": 97940.85 }, { "epoch": 0.9635930047694754, "grad_norm": 0.18488946557044983, "learning_rate": 1.6545562928331203e-07, "loss": 0.3813, "num_input_tokens_seen": 23630945538, "step": 6061, "train_runtime": 241278.9049, "train_tokens_per_second": 97940.371 }, { "epoch": 0.963751987281399, "grad_norm": 0.18322378396987915, "learning_rate": 1.6402158427366865e-07, "loss": 0.3766, "num_input_tokens_seen": 23634863006, "step": 6062, "train_runtime": 241320.3479, "train_tokens_per_second": 97939.785 }, { "epoch": 0.9639109697933227, "grad_norm": 0.1930866688489914, "learning_rate": 1.625937605011174e-07, "loss": 0.3759, "num_input_tokens_seen": 23638780899, "step": 6063, "train_runtime": 241359.3998, "train_tokens_per_second": 97940.171 }, { "epoch": 0.9640699523052464, "grad_norm": 0.39928025007247925, "learning_rate": 1.6117215832331666e-07, "loss": 0.3832, "num_input_tokens_seen": 23642576117, "step": 6064, "train_runtime": 241398.2688, "train_tokens_per_second": 97940.123 }, { "epoch": 0.9642289348171701, "grad_norm": 0.40840214490890503, "learning_rate": 1.5975677809637603e-07, "loss": 0.3759, "num_input_tokens_seen": 23646515338, "step": 6065, "train_runtime": 241438.3018, "train_tokens_per_second": 97940.199 }, { "epoch": 0.9643879173290938, "grad_norm": 0.2270745038986206, "learning_rate": 1.583476201748285e-07, "loss": 0.3794, "num_input_tokens_seen": 23650437460, "step": 6066, "train_runtime": 241476.9009, "train_tokens_per_second": 97940.786 }, { "epoch": 0.9645468998410175, "grad_norm": 0.19290590286254883, "learning_rate": 1.5694468491166402e-07, "loss": 0.376, "num_input_tokens_seen": 23654253618, "step": 6067, "train_runtime": 241515.5416, "train_tokens_per_second": 97940.917 }, { "epoch": 0.9647058823529412, "grad_norm": 0.1972798854112625, "learning_rate": 1.5554797265830423e-07, "loss": 0.3599, "num_input_tokens_seen": 23658203854, "step": 6068, "train_runtime": 241555.1177, "train_tokens_per_second": 97941.224 }, { "epoch": 0.9648648648648649, "grad_norm": 0.2555009722709656, "learning_rate": 1.5415748376461924e-07, "loss": 0.3838, "num_input_tokens_seen": 23662010968, "step": 6069, "train_runtime": 241596.0747, "train_tokens_per_second": 97940.378 }, { "epoch": 0.9650238473767886, "grad_norm": 0.20112918317317963, "learning_rate": 1.527732185789166e-07, "loss": 0.3894, "num_input_tokens_seen": 23665969044, "step": 6070, "train_runtime": 241636.3857, "train_tokens_per_second": 97940.42 }, { "epoch": 0.9651828298887123, "grad_norm": 0.25457432866096497, "learning_rate": 1.5139517744794106e-07, "loss": 0.378, "num_input_tokens_seen": 23669790522, "step": 6071, "train_runtime": 241674.1006, "train_tokens_per_second": 97940.948 }, { "epoch": 0.9653418124006359, "grad_norm": 0.1914699673652649, "learning_rate": 1.5002336071688327e-07, "loss": 0.3873, "num_input_tokens_seen": 23673579263, "step": 6072, "train_runtime": 241712.934, "train_tokens_per_second": 97940.887 }, { "epoch": 0.9655007949125596, "grad_norm": 0.19652236998081207, "learning_rate": 1.4865776872937664e-07, "loss": 0.3879, "num_input_tokens_seen": 23677582000, "step": 6073, "train_runtime": 241752.6371, "train_tokens_per_second": 97941.36 }, { "epoch": 0.9656597774244833, "grad_norm": 0.1751047819852829, "learning_rate": 1.47298401827492e-07, "loss": 0.3786, "num_input_tokens_seen": 23681520300, "step": 6074, "train_runtime": 241791.7846, "train_tokens_per_second": 97941.79 }, { "epoch": 0.965818759936407, "grad_norm": 0.24736842513084412, "learning_rate": 1.459452603517375e-07, "loss": 0.3858, "num_input_tokens_seen": 23685498367, "step": 6075, "train_runtime": 241830.5214, "train_tokens_per_second": 97942.552 }, { "epoch": 0.9659777424483307, "grad_norm": 0.3844843804836273, "learning_rate": 1.445983446410698e-07, "loss": 0.3912, "num_input_tokens_seen": 23689279342, "step": 6076, "train_runtime": 241871.2202, "train_tokens_per_second": 97941.704 }, { "epoch": 0.9661367249602544, "grad_norm": 0.21079646050930023, "learning_rate": 1.4325765503287736e-07, "loss": 0.3946, "num_input_tokens_seen": 23693114980, "step": 6077, "train_runtime": 241913.4127, "train_tokens_per_second": 97940.477 }, { "epoch": 0.9662957074721781, "grad_norm": 0.22141417860984802, "learning_rate": 1.4192319186299708e-07, "loss": 0.3854, "num_input_tokens_seen": 23697120238, "step": 6078, "train_runtime": 241952.4134, "train_tokens_per_second": 97941.244 }, { "epoch": 0.9664546899841018, "grad_norm": 0.19459325075149536, "learning_rate": 1.4059495546570323e-07, "loss": 0.3713, "num_input_tokens_seen": 23700907183, "step": 6079, "train_runtime": 241991.0969, "train_tokens_per_second": 97941.236 }, { "epoch": 0.9666136724960255, "grad_norm": 0.3046247959136963, "learning_rate": 1.3927294617370467e-07, "loss": 0.3904, "num_input_tokens_seen": 23704834840, "step": 6080, "train_runtime": 242029.2576, "train_tokens_per_second": 97942.022 }, { "epoch": 0.9667726550079492, "grad_norm": 0.2106245458126068, "learning_rate": 1.3795716431816152e-07, "loss": 0.3755, "num_input_tokens_seen": 23708785507, "step": 6081, "train_runtime": 242069.3855, "train_tokens_per_second": 97942.106 }, { "epoch": 0.9669316375198728, "grad_norm": 0.1910688877105713, "learning_rate": 1.3664761022866013e-07, "loss": 0.3733, "num_input_tokens_seen": 23712705794, "step": 6082, "train_runtime": 242109.7847, "train_tokens_per_second": 97941.956 }, { "epoch": 0.9670906200317965, "grad_norm": 0.2053217738866806, "learning_rate": 1.3534428423324365e-07, "loss": 0.365, "num_input_tokens_seen": 23716486273, "step": 6083, "train_runtime": 242148.5495, "train_tokens_per_second": 97941.889 }, { "epoch": 0.9672496025437202, "grad_norm": 0.19885484874248505, "learning_rate": 1.3404718665837867e-07, "loss": 0.3837, "num_input_tokens_seen": 23720422675, "step": 6084, "train_runtime": 242188.1797, "train_tokens_per_second": 97942.116 }, { "epoch": 0.9674085850556439, "grad_norm": 0.18465472757816315, "learning_rate": 1.327563178289831e-07, "loss": 0.3744, "num_input_tokens_seen": 23724351032, "step": 6085, "train_runtime": 242230.7086, "train_tokens_per_second": 97941.137 }, { "epoch": 0.9675675675675676, "grad_norm": 0.23814460635185242, "learning_rate": 1.3147167806840942e-07, "loss": 0.3905, "num_input_tokens_seen": 23728238991, "step": 6086, "train_runtime": 242269.5762, "train_tokens_per_second": 97941.472 }, { "epoch": 0.9677265500794913, "grad_norm": 0.21520158648490906, "learning_rate": 1.3019326769845018e-07, "loss": 0.3818, "num_input_tokens_seen": 23732203557, "step": 6087, "train_runtime": 242310.4254, "train_tokens_per_second": 97941.323 }, { "epoch": 0.967885532591415, "grad_norm": 0.19958841800689697, "learning_rate": 1.289210870393409e-07, "loss": 0.3886, "num_input_tokens_seen": 23736103316, "step": 6088, "train_runtime": 242350.3758, "train_tokens_per_second": 97941.269 }, { "epoch": 0.9680445151033387, "grad_norm": 0.2148357331752777, "learning_rate": 1.276551364097489e-07, "loss": 0.3813, "num_input_tokens_seen": 23739864829, "step": 6089, "train_runtime": 242387.5172, "train_tokens_per_second": 97941.78 }, { "epoch": 0.9682034976152624, "grad_norm": 0.3261571526527405, "learning_rate": 1.263954161267872e-07, "loss": 0.3795, "num_input_tokens_seen": 23743788501, "step": 6090, "train_runtime": 242425.7461, "train_tokens_per_second": 97942.52 }, { "epoch": 0.9683624801271861, "grad_norm": 0.2917168438434601, "learning_rate": 1.2514192650601165e-07, "loss": 0.3815, "num_input_tokens_seen": 23747825070, "step": 6091, "train_runtime": 242464.6754, "train_tokens_per_second": 97943.443 }, { "epoch": 0.9685214626391097, "grad_norm": 0.1830935925245285, "learning_rate": 1.2389466786140725e-07, "loss": 0.3728, "num_input_tokens_seen": 23751739156, "step": 6092, "train_runtime": 242502.1917, "train_tokens_per_second": 97944.431 }, { "epoch": 0.9686804451510334, "grad_norm": 0.21869230270385742, "learning_rate": 1.2265364050540463e-07, "loss": 0.3925, "num_input_tokens_seen": 23755643823, "step": 6093, "train_runtime": 242542.3507, "train_tokens_per_second": 97944.313 }, { "epoch": 0.968839427662957, "grad_norm": 0.23154041171073914, "learning_rate": 1.2141884474887456e-07, "loss": 0.3894, "num_input_tokens_seen": 23759490969, "step": 6094, "train_runtime": 242582.1274, "train_tokens_per_second": 97944.112 }, { "epoch": 0.9689984101748808, "grad_norm": 0.22408749163150787, "learning_rate": 1.201902809011196e-07, "loss": 0.3645, "num_input_tokens_seen": 23763461150, "step": 6095, "train_runtime": 242620.2547, "train_tokens_per_second": 97945.084 }, { "epoch": 0.9691573926868045, "grad_norm": 0.22596682608127594, "learning_rate": 1.1896794926989363e-07, "loss": 0.3889, "num_input_tokens_seen": 23767359955, "step": 6096, "train_runtime": 242658.5216, "train_tokens_per_second": 97945.705 }, { "epoch": 0.9693163751987282, "grad_norm": 0.2541663646697998, "learning_rate": 1.1775185016137668e-07, "loss": 0.3756, "num_input_tokens_seen": 23771263932, "step": 6097, "train_runtime": 242699.5144, "train_tokens_per_second": 97945.247 }, { "epoch": 0.9694753577106519, "grad_norm": 0.3718150556087494, "learning_rate": 1.1654198388019456e-07, "loss": 0.3914, "num_input_tokens_seen": 23775264754, "step": 6098, "train_runtime": 242738.509, "train_tokens_per_second": 97945.995 }, { "epoch": 0.9696343402225756, "grad_norm": 0.21750454604625702, "learning_rate": 1.1533835072941035e-07, "loss": 0.3831, "num_input_tokens_seen": 23779117046, "step": 6099, "train_runtime": 242779.423, "train_tokens_per_second": 97945.356 }, { "epoch": 0.9697933227344993, "grad_norm": 0.24621668457984924, "learning_rate": 1.1414095101052458e-07, "loss": 0.3945, "num_input_tokens_seen": 23783009348, "step": 6100, "train_runtime": 242820.0277, "train_tokens_per_second": 97945.007 }, { "epoch": 0.9699523052464228, "grad_norm": 0.2788945734500885, "learning_rate": 1.1294978502348063e-07, "loss": 0.3951, "num_input_tokens_seen": 23786831563, "step": 6101, "train_runtime": 242856.9893, "train_tokens_per_second": 97945.839 }, { "epoch": 0.9701112877583465, "grad_norm": 0.22372344136238098, "learning_rate": 1.1176485306665096e-07, "loss": 0.4012, "num_input_tokens_seen": 23790806020, "step": 6102, "train_runtime": 242895.9568, "train_tokens_per_second": 97946.488 }, { "epoch": 0.9702702702702702, "grad_norm": 0.2591397166252136, "learning_rate": 1.1058615543685924e-07, "loss": 0.369, "num_input_tokens_seen": 23794613175, "step": 6103, "train_runtime": 242935.7705, "train_tokens_per_second": 97946.108 }, { "epoch": 0.9704292527821939, "grad_norm": 0.2223871946334839, "learning_rate": 1.0941369242936095e-07, "loss": 0.3739, "num_input_tokens_seen": 23798501519, "step": 6104, "train_runtime": 242973.074, "train_tokens_per_second": 97947.073 }, { "epoch": 0.9705882352941176, "grad_norm": 0.22614766657352448, "learning_rate": 1.082474643378406e-07, "loss": 0.3912, "num_input_tokens_seen": 23802514409, "step": 6105, "train_runtime": 243013.3397, "train_tokens_per_second": 97947.357 }, { "epoch": 0.9707472178060413, "grad_norm": 0.2121562510728836, "learning_rate": 1.0708747145444231e-07, "loss": 0.383, "num_input_tokens_seen": 23806351090, "step": 6106, "train_runtime": 243052.9823, "train_tokens_per_second": 97947.167 }, { "epoch": 0.970906200317965, "grad_norm": 0.2825269401073456, "learning_rate": 1.0593371406972808e-07, "loss": 0.3954, "num_input_tokens_seen": 23810334122, "step": 6107, "train_runtime": 243090.1549, "train_tokens_per_second": 97948.574 }, { "epoch": 0.9710651828298887, "grad_norm": 0.18951472640037537, "learning_rate": 1.0478619247270838e-07, "loss": 0.371, "num_input_tokens_seen": 23814275924, "step": 6108, "train_runtime": 243127.2038, "train_tokens_per_second": 97949.861 }, { "epoch": 0.9712241653418124, "grad_norm": 0.19849959015846252, "learning_rate": 1.0364490695082829e-07, "loss": 0.3939, "num_input_tokens_seen": 23818237920, "step": 6109, "train_runtime": 243166.4966, "train_tokens_per_second": 97950.327 }, { "epoch": 0.9713831478537361, "grad_norm": 0.20505832135677338, "learning_rate": 1.0250985778997302e-07, "loss": 0.3863, "num_input_tokens_seen": 23822069355, "step": 6110, "train_runtime": 243205.97, "train_tokens_per_second": 97950.183 }, { "epoch": 0.9715421303656597, "grad_norm": 0.2257377803325653, "learning_rate": 1.0138104527446512e-07, "loss": 0.3736, "num_input_tokens_seen": 23825981481, "step": 6111, "train_runtime": 243246.0197, "train_tokens_per_second": 97950.139 }, { "epoch": 0.9717011128775834, "grad_norm": 0.18993784487247467, "learning_rate": 1.0025846968706177e-07, "loss": 0.3772, "num_input_tokens_seen": 23829935230, "step": 6112, "train_runtime": 243283.6352, "train_tokens_per_second": 97951.246 }, { "epoch": 0.9718600953895071, "grad_norm": 0.19775186479091644, "learning_rate": 9.914213130896022e-08, "loss": 0.3852, "num_input_tokens_seen": 23833864030, "step": 6113, "train_runtime": 243322.9773, "train_tokens_per_second": 97951.555 }, { "epoch": 0.9720190779014308, "grad_norm": 0.3683343529701233, "learning_rate": 9.803203041979791e-08, "loss": 0.3872, "num_input_tokens_seen": 23837698186, "step": 6114, "train_runtime": 243362.1966, "train_tokens_per_second": 97951.525 }, { "epoch": 0.9721780604133545, "grad_norm": 0.23500517010688782, "learning_rate": 9.692816729764686e-08, "loss": 0.3777, "num_input_tokens_seen": 23841559854, "step": 6115, "train_runtime": 243401.2221, "train_tokens_per_second": 97951.685 }, { "epoch": 0.9723370429252782, "grad_norm": 0.2052549421787262, "learning_rate": 9.583054221901367e-08, "loss": 0.389, "num_input_tokens_seen": 23845388248, "step": 6116, "train_runtime": 243441.917, "train_tokens_per_second": 97951.037 }, { "epoch": 0.9724960254372019, "grad_norm": 0.21602536737918854, "learning_rate": 9.473915545885059e-08, "loss": 0.3826, "num_input_tokens_seen": 23849324644, "step": 6117, "train_runtime": 243481.8248, "train_tokens_per_second": 97951.15 }, { "epoch": 0.9726550079491256, "grad_norm": 0.20838752388954163, "learning_rate": 9.365400729053897e-08, "loss": 0.3833, "num_input_tokens_seen": 23853316093, "step": 6118, "train_runtime": 243521.507, "train_tokens_per_second": 97951.579 }, { "epoch": 0.9728139904610493, "grad_norm": 0.23401406407356262, "learning_rate": 9.257509798590025e-08, "loss": 0.3871, "num_input_tokens_seen": 23857073217, "step": 6119, "train_runtime": 243561.1273, "train_tokens_per_second": 97951.071 }, { "epoch": 0.972972972972973, "grad_norm": 0.22709649801254272, "learning_rate": 9.150242781519602e-08, "loss": 0.3895, "num_input_tokens_seen": 23860946654, "step": 6120, "train_runtime": 243599.4889, "train_tokens_per_second": 97951.546 }, { "epoch": 0.9731319554848966, "grad_norm": 0.22058796882629395, "learning_rate": 9.04359970471197e-08, "loss": 0.3782, "num_input_tokens_seen": 23864779136, "step": 6121, "train_runtime": 243639.5954, "train_tokens_per_second": 97951.152 }, { "epoch": 0.9732909379968203, "grad_norm": 0.45063358545303345, "learning_rate": 8.937580594881035e-08, "loss": 0.3756, "num_input_tokens_seen": 23868698777, "step": 6122, "train_runtime": 243678.9613, "train_tokens_per_second": 97951.414 }, { "epoch": 0.973449920508744, "grad_norm": 0.18713290989398956, "learning_rate": 8.832185478583054e-08, "loss": 0.3774, "num_input_tokens_seen": 23872665069, "step": 6123, "train_runtime": 243719.3986, "train_tokens_per_second": 97951.436 }, { "epoch": 0.9736089030206677, "grad_norm": 0.21846511960029602, "learning_rate": 8.727414382219412e-08, "loss": 0.3766, "num_input_tokens_seen": 23876417864, "step": 6124, "train_runtime": 243759.5724, "train_tokens_per_second": 97950.688 }, { "epoch": 0.9737678855325914, "grad_norm": 0.1898488700389862, "learning_rate": 8.623267332033835e-08, "loss": 0.3652, "num_input_tokens_seen": 23880413312, "step": 6125, "train_runtime": 243800.1426, "train_tokens_per_second": 97950.777 }, { "epoch": 0.9739268680445151, "grad_norm": 0.23272453248500824, "learning_rate": 8.519744354115177e-08, "loss": 0.3885, "num_input_tokens_seen": 23884293190, "step": 6126, "train_runtime": 243839.0143, "train_tokens_per_second": 97951.073 }, { "epoch": 0.9740858505564388, "grad_norm": 0.2506776750087738, "learning_rate": 8.416845474394919e-08, "loss": 0.3836, "num_input_tokens_seen": 23888195996, "step": 6127, "train_runtime": 243878.242, "train_tokens_per_second": 97951.321 }, { "epoch": 0.9742448330683625, "grad_norm": 0.2489164173603058, "learning_rate": 8.314570718648274e-08, "loss": 0.3941, "num_input_tokens_seen": 23892050897, "step": 6128, "train_runtime": 243918.4204, "train_tokens_per_second": 97950.991 }, { "epoch": 0.9744038155802862, "grad_norm": 0.3603883385658264, "learning_rate": 8.212920112494748e-08, "loss": 0.3833, "num_input_tokens_seen": 23896131632, "step": 6129, "train_runtime": 243956.1072, "train_tokens_per_second": 97952.586 }, { "epoch": 0.9745627980922098, "grad_norm": 0.19573618471622467, "learning_rate": 8.111893681396754e-08, "loss": 0.381, "num_input_tokens_seen": 23900000692, "step": 6130, "train_runtime": 243996.6974, "train_tokens_per_second": 97952.148 }, { "epoch": 0.9747217806041335, "grad_norm": 0.22842814028263092, "learning_rate": 8.011491450660713e-08, "loss": 0.3873, "num_input_tokens_seen": 23903815038, "step": 6131, "train_runtime": 244035.5617, "train_tokens_per_second": 97952.179 }, { "epoch": 0.9748807631160572, "grad_norm": 0.211652472615242, "learning_rate": 7.911713445437063e-08, "loss": 0.3859, "num_input_tokens_seen": 23907775531, "step": 6132, "train_runtime": 244075.4425, "train_tokens_per_second": 97952.401 }, { "epoch": 0.9750397456279809, "grad_norm": 0.20774006843566895, "learning_rate": 7.812559690719146e-08, "loss": 0.3855, "num_input_tokens_seen": 23911684567, "step": 6133, "train_runtime": 244110.9335, "train_tokens_per_second": 97954.173 }, { "epoch": 0.9751987281399046, "grad_norm": 0.2242809683084488, "learning_rate": 7.714030211344315e-08, "loss": 0.3804, "num_input_tokens_seen": 23915529514, "step": 6134, "train_runtime": 244149.7909, "train_tokens_per_second": 97954.331 }, { "epoch": 0.9753577106518283, "grad_norm": 0.3905705511569977, "learning_rate": 7.616125031993382e-08, "loss": 0.3774, "num_input_tokens_seen": 23919352083, "step": 6135, "train_runtime": 244188.5412, "train_tokens_per_second": 97954.441 }, { "epoch": 0.975516693163752, "grad_norm": 0.23632197082042694, "learning_rate": 7.518844177191175e-08, "loss": 0.3873, "num_input_tokens_seen": 23923319647, "step": 6136, "train_runtime": 244224.943, "train_tokens_per_second": 97956.087 }, { "epoch": 0.9756756756756757, "grad_norm": 0.26269131898880005, "learning_rate": 7.42218767130598e-08, "loss": 0.3815, "num_input_tokens_seen": 23927130955, "step": 6137, "train_runtime": 244264.1193, "train_tokens_per_second": 97955.979 }, { "epoch": 0.9758346581875994, "grad_norm": 0.20341238379478455, "learning_rate": 7.326155538549262e-08, "loss": 0.3847, "num_input_tokens_seen": 23931029422, "step": 6138, "train_runtime": 244303.8496, "train_tokens_per_second": 97956.006 }, { "epoch": 0.9759936406995231, "grad_norm": 0.23034784197807312, "learning_rate": 7.230747802976224e-08, "loss": 0.3974, "num_input_tokens_seen": 23934961144, "step": 6139, "train_runtime": 244344.0742, "train_tokens_per_second": 97955.971 }, { "epoch": 0.9761526232114467, "grad_norm": 0.48238319158554077, "learning_rate": 7.13596448848608e-08, "loss": 0.379, "num_input_tokens_seen": 23938854676, "step": 6140, "train_runtime": 244382.1249, "train_tokens_per_second": 97956.652 }, { "epoch": 0.9763116057233704, "grad_norm": 0.23525449633598328, "learning_rate": 7.041805618821507e-08, "loss": 0.381, "num_input_tokens_seen": 23942646097, "step": 6141, "train_runtime": 244420.9416, "train_tokens_per_second": 97956.607 }, { "epoch": 0.9764705882352941, "grad_norm": 1.0048996210098267, "learning_rate": 6.948271217568636e-08, "loss": 0.3789, "num_input_tokens_seen": 23946532359, "step": 6142, "train_runtime": 244457.6834, "train_tokens_per_second": 97957.782 }, { "epoch": 0.9766295707472178, "grad_norm": 0.18648363649845123, "learning_rate": 6.855361308156782e-08, "loss": 0.3949, "num_input_tokens_seen": 23950535277, "step": 6143, "train_runtime": 244497.0041, "train_tokens_per_second": 97958.4 }, { "epoch": 0.9767885532591415, "grad_norm": 0.20483365654945374, "learning_rate": 6.76307591385955e-08, "loss": 0.3781, "num_input_tokens_seen": 23954291561, "step": 6144, "train_runtime": 244536.5413, "train_tokens_per_second": 97957.922 }, { "epoch": 0.9769475357710652, "grad_norm": 0.2169157713651657, "learning_rate": 6.671415057794006e-08, "loss": 0.391, "num_input_tokens_seen": 23958087334, "step": 6145, "train_runtime": 244574.6284, "train_tokens_per_second": 97958.188 }, { "epoch": 0.9771065182829889, "grad_norm": 0.19539660215377808, "learning_rate": 6.580378762919837e-08, "loss": 0.3774, "num_input_tokens_seen": 23962040328, "step": 6146, "train_runtime": 244614.9595, "train_tokens_per_second": 97958.197 }, { "epoch": 0.9772655007949126, "grad_norm": 0.21531794965267181, "learning_rate": 6.489967052041578e-08, "loss": 0.3919, "num_input_tokens_seen": 23965921178, "step": 6147, "train_runtime": 244655.5533, "train_tokens_per_second": 97957.806 }, { "epoch": 0.9774244833068363, "grad_norm": 0.2176254391670227, "learning_rate": 6.400179947806672e-08, "loss": 0.4025, "num_input_tokens_seen": 23969860870, "step": 6148, "train_runtime": 244695.0874, "train_tokens_per_second": 97958.08 }, { "epoch": 0.97758346581876, "grad_norm": 0.2016599327325821, "learning_rate": 6.311017472706015e-08, "loss": 0.3776, "num_input_tokens_seen": 23973741716, "step": 6149, "train_runtime": 244736.4575, "train_tokens_per_second": 97957.378 }, { "epoch": 0.9777424483306836, "grad_norm": 0.3814932107925415, "learning_rate": 6.222479649073965e-08, "loss": 0.3877, "num_input_tokens_seen": 23977610938, "step": 6150, "train_runtime": 244775.1423, "train_tokens_per_second": 97957.704 }, { "epoch": 0.9779014308426073, "grad_norm": 0.24856583774089813, "learning_rate": 6.134566499089167e-08, "loss": 0.3763, "num_input_tokens_seen": 23981454034, "step": 6151, "train_runtime": 244813.952, "train_tokens_per_second": 97957.873 }, { "epoch": 0.978060413354531, "grad_norm": 0.2853667140007019, "learning_rate": 6.04727804477262e-08, "loss": 0.3987, "num_input_tokens_seen": 23985343269, "step": 6152, "train_runtime": 244855.2719, "train_tokens_per_second": 97957.226 }, { "epoch": 0.9782193958664547, "grad_norm": 0.21826091408729553, "learning_rate": 5.960614307989887e-08, "loss": 0.3806, "num_input_tokens_seen": 23989289976, "step": 6153, "train_runtime": 244895.6864, "train_tokens_per_second": 97957.177 }, { "epoch": 0.9783783783783784, "grad_norm": 0.19573970139026642, "learning_rate": 5.874575310449437e-08, "loss": 0.387, "num_input_tokens_seen": 23993159583, "step": 6154, "train_runtime": 244932.8869, "train_tokens_per_second": 97958.097 }, { "epoch": 0.9785373608903021, "grad_norm": 0.20919784903526306, "learning_rate": 5.789161073703753e-08, "loss": 0.3721, "num_input_tokens_seen": 23997078027, "step": 6155, "train_runtime": 244973.0152, "train_tokens_per_second": 97958.047 }, { "epoch": 0.9786963434022258, "grad_norm": 0.2068309336900711, "learning_rate": 5.7043716191479414e-08, "loss": 0.3885, "num_input_tokens_seen": 24000903348, "step": 6156, "train_runtime": 245010.6843, "train_tokens_per_second": 97958.599 }, { "epoch": 0.9788553259141495, "grad_norm": 0.2103499323129654, "learning_rate": 5.62020696802168e-08, "loss": 0.3816, "num_input_tokens_seen": 24004761129, "step": 6157, "train_runtime": 245047.9103, "train_tokens_per_second": 97959.461 }, { "epoch": 0.9790143084260732, "grad_norm": 0.1738952249288559, "learning_rate": 5.5366671414072705e-08, "loss": 0.3903, "num_input_tokens_seen": 24008699258, "step": 6158, "train_runtime": 245088.2022, "train_tokens_per_second": 97959.425 }, { "epoch": 0.9791732909379968, "grad_norm": 0.21705259382724762, "learning_rate": 5.453752160231029e-08, "loss": 0.3699, "num_input_tokens_seen": 24012560819, "step": 6159, "train_runtime": 245124.9189, "train_tokens_per_second": 97960.505 }, { "epoch": 0.9793322734499205, "grad_norm": 0.18054834008216858, "learning_rate": 5.3714620452627315e-08, "loss": 0.3884, "num_input_tokens_seen": 24016550898, "step": 6160, "train_runtime": 245164.5449, "train_tokens_per_second": 97960.947 }, { "epoch": 0.9794912559618442, "grad_norm": 0.2592661678791046, "learning_rate": 5.2897968171150555e-08, "loss": 0.3804, "num_input_tokens_seen": 24020274922, "step": 6161, "train_runtime": 245204.6506, "train_tokens_per_second": 97960.112 }, { "epoch": 0.9796502384737679, "grad_norm": 0.20614135265350342, "learning_rate": 5.208756496244693e-08, "loss": 0.3811, "num_input_tokens_seen": 24024247949, "step": 6162, "train_runtime": 245244.5371, "train_tokens_per_second": 97960.38 }, { "epoch": 0.9798092209856916, "grad_norm": 0.22015826404094696, "learning_rate": 5.1283411029520725e-08, "loss": 0.3746, "num_input_tokens_seen": 24028263081, "step": 6163, "train_runtime": 245284.8466, "train_tokens_per_second": 97960.65 }, { "epoch": 0.9799682034976153, "grad_norm": 0.239760622382164, "learning_rate": 5.048550657379969e-08, "loss": 0.3947, "num_input_tokens_seen": 24032169684, "step": 6164, "train_runtime": 245321.0947, "train_tokens_per_second": 97962.1 }, { "epoch": 0.980127186009539, "grad_norm": 0.22347351908683777, "learning_rate": 4.9693851795160064e-08, "loss": 0.3837, "num_input_tokens_seen": 24036067673, "step": 6165, "train_runtime": 245359.583, "train_tokens_per_second": 97962.62 }, { "epoch": 0.9802861685214627, "grad_norm": 0.2234141230583191, "learning_rate": 4.8908446891901547e-08, "loss": 0.3961, "num_input_tokens_seen": 24039922890, "step": 6166, "train_runtime": 245399.2953, "train_tokens_per_second": 97962.477 }, { "epoch": 0.9804451510333864, "grad_norm": 0.2408965677022934, "learning_rate": 4.8129292060764e-08, "loss": 0.3854, "num_input_tokens_seen": 24043974150, "step": 6167, "train_runtime": 245436.4847, "train_tokens_per_second": 97964.14 }, { "epoch": 0.9806041335453101, "grad_norm": 0.5280996561050415, "learning_rate": 4.7356387496921836e-08, "loss": 0.3665, "num_input_tokens_seen": 24047818839, "step": 6168, "train_runtime": 245475.6223, "train_tokens_per_second": 97964.183 }, { "epoch": 0.9807631160572337, "grad_norm": 0.33474624156951904, "learning_rate": 4.658973339397854e-08, "loss": 0.3757, "num_input_tokens_seen": 24051607730, "step": 6169, "train_runtime": 245515.5268, "train_tokens_per_second": 97963.693 }, { "epoch": 0.9809220985691574, "grad_norm": 0.19198483228683472, "learning_rate": 4.58293299439777e-08, "loss": 0.3748, "num_input_tokens_seen": 24055563250, "step": 6170, "train_runtime": 245552.8036, "train_tokens_per_second": 97964.93 }, { "epoch": 0.981081081081081, "grad_norm": 0.3716559112071991, "learning_rate": 4.507517733739475e-08, "loss": 0.3856, "num_input_tokens_seen": 24059543111, "step": 6171, "train_runtime": 245592.3365, "train_tokens_per_second": 97965.366 }, { "epoch": 0.9812400635930048, "grad_norm": 0.2024337500333786, "learning_rate": 4.432727576313966e-08, "loss": 0.3686, "num_input_tokens_seen": 24063410342, "step": 6172, "train_runtime": 245632.0927, "train_tokens_per_second": 97965.254 }, { "epoch": 0.9813990461049285, "grad_norm": 0.19263705611228943, "learning_rate": 4.358562540855704e-08, "loss": 0.3754, "num_input_tokens_seen": 24067253204, "step": 6173, "train_runtime": 245672.2193, "train_tokens_per_second": 97964.895 }, { "epoch": 0.9815580286168522, "grad_norm": 0.19007302820682526, "learning_rate": 4.285022645942327e-08, "loss": 0.3849, "num_input_tokens_seen": 24071157683, "step": 6174, "train_runtime": 245712.1163, "train_tokens_per_second": 97964.879 }, { "epoch": 0.9817170111287759, "grad_norm": 0.18726031482219696, "learning_rate": 4.212107909995488e-08, "loss": 0.3806, "num_input_tokens_seen": 24075055353, "step": 6175, "train_runtime": 245752.0753, "train_tokens_per_second": 97964.81 }, { "epoch": 0.9818759936406996, "grad_norm": 0.19011351466178894, "learning_rate": 4.13981835127919e-08, "loss": 0.3885, "num_input_tokens_seen": 24078847565, "step": 6176, "train_runtime": 245792.7221, "train_tokens_per_second": 97964.038 }, { "epoch": 0.9820349761526233, "grad_norm": 0.22993822395801544, "learning_rate": 4.068153987901724e-08, "loss": 0.3853, "num_input_tokens_seen": 24082769448, "step": 6177, "train_runtime": 245835.2311, "train_tokens_per_second": 97963.052 }, { "epoch": 0.982193958664547, "grad_norm": 0.29595625400543213, "learning_rate": 3.997114837814564e-08, "loss": 0.3683, "num_input_tokens_seen": 24086799304, "step": 6178, "train_runtime": 245874.3292, "train_tokens_per_second": 97963.864 }, { "epoch": 0.9823529411764705, "grad_norm": 0.17928387224674225, "learning_rate": 3.9267009188126424e-08, "loss": 0.3812, "num_input_tokens_seen": 24090661627, "step": 6179, "train_runtime": 245912.0426, "train_tokens_per_second": 97964.546 }, { "epoch": 0.9825119236883942, "grad_norm": 0.21797172725200653, "learning_rate": 3.8569122485340726e-08, "loss": 0.3859, "num_input_tokens_seen": 24094459035, "step": 6180, "train_runtime": 245950.4756, "train_tokens_per_second": 97964.678 }, { "epoch": 0.9826709062003179, "grad_norm": 0.3019524812698364, "learning_rate": 3.7877488444601485e-08, "loss": 0.3778, "num_input_tokens_seen": 24098382772, "step": 6181, "train_runtime": 245991.8944, "train_tokens_per_second": 97964.133 }, { "epoch": 0.9828298887122416, "grad_norm": 0.2584483325481415, "learning_rate": 3.7192107239161775e-08, "loss": 0.3939, "num_input_tokens_seen": 24102268509, "step": 6182, "train_runtime": 246031.7377, "train_tokens_per_second": 97964.062 }, { "epoch": 0.9829888712241653, "grad_norm": 0.20857416093349457, "learning_rate": 3.651297904070372e-08, "loss": 0.3898, "num_input_tokens_seen": 24106026726, "step": 6183, "train_runtime": 246069.6673, "train_tokens_per_second": 97964.235 }, { "epoch": 0.983147853736089, "grad_norm": 0.25451144576072693, "learning_rate": 3.584010401934124e-08, "loss": 0.3904, "num_input_tokens_seen": 24109968170, "step": 6184, "train_runtime": 246110.7141, "train_tokens_per_second": 97963.911 }, { "epoch": 0.9833068362480127, "grad_norm": 0.26269158720970154, "learning_rate": 3.5173482343628384e-08, "loss": 0.3934, "num_input_tokens_seen": 24113872812, "step": 6185, "train_runtime": 246150.8298, "train_tokens_per_second": 97963.809 }, { "epoch": 0.9834658187599364, "grad_norm": 0.2312016785144806, "learning_rate": 3.451311418054826e-08, "loss": 0.3763, "num_input_tokens_seen": 24117807646, "step": 6186, "train_runtime": 246190.7353, "train_tokens_per_second": 97963.912 }, { "epoch": 0.9836248012718601, "grad_norm": 0.18337762355804443, "learning_rate": 3.3858999695518535e-08, "loss": 0.3799, "num_input_tokens_seen": 24121549957, "step": 6187, "train_runtime": 246230.92, "train_tokens_per_second": 97963.123 }, { "epoch": 0.9837837837837838, "grad_norm": 0.27696120738983154, "learning_rate": 3.321113905238871e-08, "loss": 0.391, "num_input_tokens_seen": 24125387928, "step": 6188, "train_runtime": 246269.8778, "train_tokens_per_second": 97963.211 }, { "epoch": 0.9839427662957074, "grad_norm": 0.54659503698349, "learning_rate": 3.2569532413445625e-08, "loss": 0.3765, "num_input_tokens_seen": 24129358932, "step": 6189, "train_runtime": 246310.0878, "train_tokens_per_second": 97963.34 }, { "epoch": 0.9841017488076311, "grad_norm": 0.23855271935462952, "learning_rate": 3.193417993940517e-08, "loss": 0.3884, "num_input_tokens_seen": 24133293183, "step": 6190, "train_runtime": 246349.7644, "train_tokens_per_second": 97963.533 }, { "epoch": 0.9842607313195548, "grad_norm": 0.21046985685825348, "learning_rate": 3.1305081789420596e-08, "loss": 0.3967, "num_input_tokens_seen": 24137140608, "step": 6191, "train_runtime": 246386.8354, "train_tokens_per_second": 97964.409 }, { "epoch": 0.9844197138314785, "grad_norm": 0.4902938902378082, "learning_rate": 3.068223812107418e-08, "loss": 0.3764, "num_input_tokens_seen": 24141029808, "step": 6192, "train_runtime": 246424.428, "train_tokens_per_second": 97965.246 }, { "epoch": 0.9845786963434022, "grad_norm": 0.2195441871881485, "learning_rate": 3.006564909038556e-08, "loss": 0.3945, "num_input_tokens_seen": 24145072739, "step": 6193, "train_runtime": 246464.2519, "train_tokens_per_second": 97965.821 }, { "epoch": 0.9847376788553259, "grad_norm": 0.20016531646251678, "learning_rate": 2.9455314851803416e-08, "loss": 0.3902, "num_input_tokens_seen": 24148932205, "step": 6194, "train_runtime": 246502.1291, "train_tokens_per_second": 97966.424 }, { "epoch": 0.9848966613672496, "grad_norm": 0.21477645635604858, "learning_rate": 2.885123555821656e-08, "loss": 0.3791, "num_input_tokens_seen": 24152799444, "step": 6195, "train_runtime": 246542.0732, "train_tokens_per_second": 97966.238 }, { "epoch": 0.9850556438791733, "grad_norm": 0.2076631486415863, "learning_rate": 2.8253411360940062e-08, "loss": 0.3915, "num_input_tokens_seen": 24156731928, "step": 6196, "train_runtime": 246581.9394, "train_tokens_per_second": 97966.347 }, { "epoch": 0.985214626391097, "grad_norm": 0.1811819076538086, "learning_rate": 2.7661842409726356e-08, "loss": 0.3776, "num_input_tokens_seen": 24160482895, "step": 6197, "train_runtime": 246619.7183, "train_tokens_per_second": 97966.55 }, { "epoch": 0.9853736089030206, "grad_norm": 0.2004258632659912, "learning_rate": 2.7076528852754135e-08, "loss": 0.3803, "num_input_tokens_seen": 24164475453, "step": 6198, "train_runtime": 246658.1845, "train_tokens_per_second": 97967.458 }, { "epoch": 0.9855325914149443, "grad_norm": 0.27797016501426697, "learning_rate": 2.6497470836647774e-08, "loss": 0.3893, "num_input_tokens_seen": 24168386820, "step": 6199, "train_runtime": 246696.5838, "train_tokens_per_second": 97968.064 }, { "epoch": 0.985691573926868, "grad_norm": 0.16982996463775635, "learning_rate": 2.592466850644959e-08, "loss": 0.3755, "num_input_tokens_seen": 24172286905, "step": 6200, "train_runtime": 246734.5561, "train_tokens_per_second": 97968.794 }, { "epoch": 0.9858505564387917, "grad_norm": 0.18435022234916687, "learning_rate": 2.535812200564758e-08, "loss": 0.3801, "num_input_tokens_seen": 24176129021, "step": 6201, "train_runtime": 246881.3063, "train_tokens_per_second": 97926.122 }, { "epoch": 0.9860095389507154, "grad_norm": 0.179336279630661, "learning_rate": 2.479783147615877e-08, "loss": 0.3785, "num_input_tokens_seen": 24180103749, "step": 6202, "train_runtime": 246919.9429, "train_tokens_per_second": 97926.897 }, { "epoch": 0.9861685214626391, "grad_norm": 0.19478121399879456, "learning_rate": 2.4243797058326467e-08, "loss": 0.386, "num_input_tokens_seen": 24184123384, "step": 6203, "train_runtime": 246959.457, "train_tokens_per_second": 97927.505 }, { "epoch": 0.9863275039745628, "grad_norm": 0.1713828593492508, "learning_rate": 2.3696018890939643e-08, "loss": 0.3805, "num_input_tokens_seen": 24188013342, "step": 6204, "train_runtime": 246999.7581, "train_tokens_per_second": 97927.275 }, { "epoch": 0.9864864864864865, "grad_norm": 0.22095607221126556, "learning_rate": 2.3154497111205208e-08, "loss": 0.3821, "num_input_tokens_seen": 24191857561, "step": 6205, "train_runtime": 247039.9396, "train_tokens_per_second": 97926.908 }, { "epoch": 0.9866454689984102, "grad_norm": 0.22102592885494232, "learning_rate": 2.2619231854772992e-08, "loss": 0.3816, "num_input_tokens_seen": 24195826587, "step": 6206, "train_runtime": 247079.6803, "train_tokens_per_second": 97927.222 }, { "epoch": 0.9868044515103339, "grad_norm": 0.34744203090667725, "learning_rate": 2.2090223255727406e-08, "loss": 0.3858, "num_input_tokens_seen": 24199663350, "step": 6207, "train_runtime": 247118.5133, "train_tokens_per_second": 97927.359 }, { "epoch": 0.9869634340222575, "grad_norm": 0.22744831442832947, "learning_rate": 2.156747144657634e-08, "loss": 0.386, "num_input_tokens_seen": 24203465112, "step": 6208, "train_runtime": 247156.7327, "train_tokens_per_second": 97927.598 }, { "epoch": 0.9871224165341812, "grad_norm": 0.2011406570672989, "learning_rate": 2.1050976558267822e-08, "loss": 0.379, "num_input_tokens_seen": 24207368959, "step": 6209, "train_runtime": 247196.7636, "train_tokens_per_second": 97927.532 }, { "epoch": 0.9872813990461049, "grad_norm": 0.2094634473323822, "learning_rate": 2.0540738720178922e-08, "loss": 0.3887, "num_input_tokens_seen": 24211240926, "step": 6210, "train_runtime": 247234.8746, "train_tokens_per_second": 97928.098 }, { "epoch": 0.9874403815580286, "grad_norm": 0.22857406735420227, "learning_rate": 2.003675806012406e-08, "loss": 0.3793, "num_input_tokens_seen": 24215131595, "step": 6211, "train_runtime": 247274.782, "train_tokens_per_second": 97928.027 }, { "epoch": 0.9875993640699523, "grad_norm": 0.18668535351753235, "learning_rate": 1.9539034704341153e-08, "loss": 0.3797, "num_input_tokens_seen": 24219037316, "step": 6212, "train_runtime": 247314.0737, "train_tokens_per_second": 97928.262 }, { "epoch": 0.987758346581876, "grad_norm": 0.2687065303325653, "learning_rate": 1.9047568777511015e-08, "loss": 0.3817, "num_input_tokens_seen": 24222980777, "step": 6213, "train_runtime": 247355.2389, "train_tokens_per_second": 97927.907 }, { "epoch": 0.9879173290937997, "grad_norm": 0.5206528902053833, "learning_rate": 1.856236040274073e-08, "loss": 0.4034, "num_input_tokens_seen": 24226947503, "step": 6214, "train_runtime": 247396.3976, "train_tokens_per_second": 97927.649 }, { "epoch": 0.9880763116057234, "grad_norm": 0.16583110392093658, "learning_rate": 1.808340970156919e-08, "loss": 0.3675, "num_input_tokens_seen": 24230836039, "step": 6215, "train_runtime": 247435.8423, "train_tokens_per_second": 97927.753 }, { "epoch": 0.9882352941176471, "grad_norm": 0.19407103955745697, "learning_rate": 1.7610716793972636e-08, "loss": 0.383, "num_input_tokens_seen": 24234688205, "step": 6216, "train_runtime": 247474.4213, "train_tokens_per_second": 97928.053 }, { "epoch": 0.9883942766295708, "grad_norm": 0.2549782991409302, "learning_rate": 1.714428179835914e-08, "loss": 0.3992, "num_input_tokens_seen": 24238628542, "step": 6217, "train_runtime": 247515.127, "train_tokens_per_second": 97927.867 }, { "epoch": 0.9885532591414944, "grad_norm": 0.22089730203151703, "learning_rate": 1.6684104831565794e-08, "loss": 0.3713, "num_input_tokens_seen": 24242521438, "step": 6218, "train_runtime": 247556.0401, "train_tokens_per_second": 97927.408 }, { "epoch": 0.9887122416534181, "grad_norm": 0.18603840470314026, "learning_rate": 1.6230186008861503e-08, "loss": 0.3691, "num_input_tokens_seen": 24246432806, "step": 6219, "train_runtime": 247596.0977, "train_tokens_per_second": 97927.362 }, { "epoch": 0.9888712241653418, "grad_norm": 0.31463873386383057, "learning_rate": 1.5782525443952534e-08, "loss": 0.375, "num_input_tokens_seen": 24250245297, "step": 6220, "train_runtime": 247635.0678, "train_tokens_per_second": 97927.347 }, { "epoch": 0.9890302066772655, "grad_norm": 0.19062592089176178, "learning_rate": 1.534112324897419e-08, "loss": 0.395, "num_input_tokens_seen": 24254257576, "step": 6221, "train_runtime": 247674.0476, "train_tokens_per_second": 97928.135 }, { "epoch": 0.9891891891891892, "grad_norm": 0.24138425290584564, "learning_rate": 1.4905979534493574e-08, "loss": 0.3849, "num_input_tokens_seen": 24258096209, "step": 6222, "train_runtime": 247714.4176, "train_tokens_per_second": 97927.672 }, { "epoch": 0.9893481717011129, "grad_norm": 0.22222650051116943, "learning_rate": 1.4477094409509617e-08, "loss": 0.3904, "num_input_tokens_seen": 24262102275, "step": 6223, "train_runtime": 247752.4635, "train_tokens_per_second": 97928.803 }, { "epoch": 0.9895071542130366, "grad_norm": 0.2885309159755707, "learning_rate": 1.4054467981458596e-08, "loss": 0.3753, "num_input_tokens_seen": 24265880518, "step": 6224, "train_runtime": 247789.2046, "train_tokens_per_second": 97929.531 }, { "epoch": 0.9896661367249603, "grad_norm": 0.3805859088897705, "learning_rate": 1.3638100356205829e-08, "loss": 0.3942, "num_input_tokens_seen": 24269810197, "step": 6225, "train_runtime": 247828.0828, "train_tokens_per_second": 97930.024 }, { "epoch": 0.989825119236884, "grad_norm": 0.22457313537597656, "learning_rate": 1.3227991638042892e-08, "loss": 0.3847, "num_input_tokens_seen": 24273606194, "step": 6226, "train_runtime": 247869.9755, "train_tokens_per_second": 97928.788 }, { "epoch": 0.9899841017488076, "grad_norm": 0.21878103911876678, "learning_rate": 1.282414192970427e-08, "loss": 0.3854, "num_input_tokens_seen": 24277634996, "step": 6227, "train_runtime": 247909.3459, "train_tokens_per_second": 97929.487 }, { "epoch": 0.9901430842607313, "grad_norm": 0.5074643492698669, "learning_rate": 1.2426551332350711e-08, "loss": 0.3755, "num_input_tokens_seen": 24281498115, "step": 6228, "train_runtime": 247943.768, "train_tokens_per_second": 97931.472 }, { "epoch": 0.990302066772655, "grad_norm": 0.2811833322048187, "learning_rate": 1.2035219945574772e-08, "loss": 0.387, "num_input_tokens_seen": 24285327616, "step": 6229, "train_runtime": 247982.5868, "train_tokens_per_second": 97931.584 }, { "epoch": 0.9904610492845787, "grad_norm": 0.31379517912864685, "learning_rate": 1.1650147867400818e-08, "loss": 0.3921, "num_input_tokens_seen": 24289202547, "step": 6230, "train_runtime": 248022.6522, "train_tokens_per_second": 97931.388 }, { "epoch": 0.9906200317965024, "grad_norm": 0.19244703650474548, "learning_rate": 1.1271335194290578e-08, "loss": 0.3794, "num_input_tokens_seen": 24293205367, "step": 6231, "train_runtime": 248062.0863, "train_tokens_per_second": 97931.956 }, { "epoch": 0.9907790143084261, "grad_norm": 1.2804715633392334, "learning_rate": 1.089878202112926e-08, "loss": 0.3882, "num_input_tokens_seen": 24297161206, "step": 6232, "train_runtime": 248101.2112, "train_tokens_per_second": 97932.457 }, { "epoch": 0.9909379968203498, "grad_norm": 0.3087700307369232, "learning_rate": 1.0532488441244992e-08, "loss": 0.3887, "num_input_tokens_seen": 24300912143, "step": 6233, "train_runtime": 248142.7326, "train_tokens_per_second": 97931.186 }, { "epoch": 0.9910969793322735, "grad_norm": 0.21758994460105896, "learning_rate": 1.0172454546383825e-08, "loss": 0.3785, "num_input_tokens_seen": 24304891870, "step": 6234, "train_runtime": 248182.9615, "train_tokens_per_second": 97931.348 }, { "epoch": 0.9912559618441972, "grad_norm": 0.5101909041404724, "learning_rate": 9.818680426737504e-09, "loss": 0.3934, "num_input_tokens_seen": 24308775223, "step": 6235, "train_runtime": 248221.1787, "train_tokens_per_second": 97931.914 }, { "epoch": 0.9914149443561209, "grad_norm": 0.19613809883594513, "learning_rate": 9.471166170924028e-09, "loss": 0.3866, "num_input_tokens_seen": 24312598761, "step": 6236, "train_runtime": 248258.3481, "train_tokens_per_second": 97932.653 }, { "epoch": 0.9915739268680445, "grad_norm": 0.2851869463920593, "learning_rate": 9.129911865990437e-09, "loss": 0.3861, "num_input_tokens_seen": 24316461877, "step": 6237, "train_runtime": 248296.2727, "train_tokens_per_second": 97933.254 }, { "epoch": 0.9917329093799682, "grad_norm": 0.2015022188425064, "learning_rate": 8.79491759741835e-09, "loss": 0.3756, "num_input_tokens_seen": 24320376134, "step": 6238, "train_runtime": 248331.7452, "train_tokens_per_second": 97935.027 }, { "epoch": 0.9918918918918919, "grad_norm": 0.20013673603534698, "learning_rate": 8.466183449123977e-09, "loss": 0.3756, "num_input_tokens_seen": 24324315137, "step": 6239, "train_runtime": 248369.3292, "train_tokens_per_second": 97936.067 }, { "epoch": 0.9920508744038156, "grad_norm": 0.218586727976799, "learning_rate": 8.143709503449782e-09, "loss": 0.3711, "num_input_tokens_seen": 24328111623, "step": 6240, "train_runtime": 248409.9164, "train_tokens_per_second": 97935.348 }, { "epoch": 0.9922098569157393, "grad_norm": 0.29405850172042847, "learning_rate": 7.827495841178367e-09, "loss": 0.3881, "num_input_tokens_seen": 24332080697, "step": 6241, "train_runtime": 248450.4939, "train_tokens_per_second": 97935.328 }, { "epoch": 0.992368839427663, "grad_norm": 0.28684836626052856, "learning_rate": 7.517542541513045e-09, "loss": 0.3808, "num_input_tokens_seen": 24336106930, "step": 6242, "train_runtime": 248488.2579, "train_tokens_per_second": 97936.648 }, { "epoch": 0.9925278219395867, "grad_norm": 0.17715343832969666, "learning_rate": 7.2138496821000375e-09, "loss": 0.3781, "num_input_tokens_seen": 24340005277, "step": 6243, "train_runtime": 248528.8102, "train_tokens_per_second": 97936.353 }, { "epoch": 0.9926868044515104, "grad_norm": 3.232085704803467, "learning_rate": 6.916417339006276e-09, "loss": 0.3696, "num_input_tokens_seen": 24343727741, "step": 6244, "train_runtime": 248567.4498, "train_tokens_per_second": 97936.105 }, { "epoch": 0.9928457869634341, "grad_norm": 0.2170003205537796, "learning_rate": 6.625245586744377e-09, "loss": 0.3793, "num_input_tokens_seen": 24347732451, "step": 6245, "train_runtime": 248604.0881, "train_tokens_per_second": 97937.78 }, { "epoch": 0.9930047694753578, "grad_norm": 0.19323742389678955, "learning_rate": 6.340334498244893e-09, "loss": 0.3752, "num_input_tokens_seen": 24351528429, "step": 6246, "train_runtime": 248642.836, "train_tokens_per_second": 97937.784 }, { "epoch": 0.9931637519872814, "grad_norm": 0.20239102840423584, "learning_rate": 6.0616841448757346e-09, "loss": 0.3811, "num_input_tokens_seen": 24355328126, "step": 6247, "train_runtime": 248681.8403, "train_tokens_per_second": 97937.703 }, { "epoch": 0.993322734499205, "grad_norm": 0.22387078404426575, "learning_rate": 5.789294596439399e-09, "loss": 0.3911, "num_input_tokens_seen": 24359315830, "step": 6248, "train_runtime": 248721.8883, "train_tokens_per_second": 97937.966 }, { "epoch": 0.9934817170111288, "grad_norm": 0.2916348874568939, "learning_rate": 5.523165921167417e-09, "loss": 0.381, "num_input_tokens_seen": 24363321568, "step": 6249, "train_runtime": 248760.7377, "train_tokens_per_second": 97938.774 }, { "epoch": 0.9936406995230525, "grad_norm": 0.2499283105134964, "learning_rate": 5.263298185723131e-09, "loss": 0.3798, "num_input_tokens_seen": 24367142397, "step": 6250, "train_runtime": 248800.8139, "train_tokens_per_second": 97938.355 }, { "epoch": 0.9937996820349762, "grad_norm": 0.18332378566265106, "learning_rate": 5.009691455201693e-09, "loss": 0.3765, "num_input_tokens_seen": 24371057395, "step": 6251, "train_runtime": 248840.3631, "train_tokens_per_second": 97938.522 }, { "epoch": 0.9939586645468999, "grad_norm": 0.15803474187850952, "learning_rate": 4.762345793127287e-09, "loss": 0.3821, "num_input_tokens_seen": 24375068076, "step": 6252, "train_runtime": 248880.5688, "train_tokens_per_second": 97938.815 }, { "epoch": 0.9941176470588236, "grad_norm": 0.34389394521713257, "learning_rate": 4.521261261461463e-09, "loss": 0.3967, "num_input_tokens_seen": 24378945589, "step": 6253, "train_runtime": 248919.9496, "train_tokens_per_second": 97938.898 }, { "epoch": 0.9942766295707473, "grad_norm": 0.26347845792770386, "learning_rate": 4.286437920592024e-09, "loss": 0.3861, "num_input_tokens_seen": 24382738265, "step": 6254, "train_runtime": 248957.2967, "train_tokens_per_second": 97939.44 }, { "epoch": 0.994435612082671, "grad_norm": 0.19951771199703217, "learning_rate": 4.057875829341362e-09, "loss": 0.3793, "num_input_tokens_seen": 24386589005, "step": 6255, "train_runtime": 248997.7943, "train_tokens_per_second": 97938.976 }, { "epoch": 0.9945945945945946, "grad_norm": 0.20390240848064423, "learning_rate": 3.835575044960904e-09, "loss": 0.3898, "num_input_tokens_seen": 24390561396, "step": 6256, "train_runtime": 249036.9231, "train_tokens_per_second": 97939.539 }, { "epoch": 0.9947535771065182, "grad_norm": 0.20865748822689056, "learning_rate": 3.6195356231394362e-09, "loss": 0.3953, "num_input_tokens_seen": 24394344818, "step": 6257, "train_runtime": 249077.3406, "train_tokens_per_second": 97938.836 }, { "epoch": 0.9949125596184419, "grad_norm": 0.1792127937078476, "learning_rate": 3.4097576179920043e-09, "loss": 0.3823, "num_input_tokens_seen": 24398344164, "step": 6258, "train_runtime": 249117.6651, "train_tokens_per_second": 97939.037 }, { "epoch": 0.9950715421303656, "grad_norm": 0.4266204833984375, "learning_rate": 3.2062410820626887e-09, "loss": 0.3946, "num_input_tokens_seen": 24402356858, "step": 6259, "train_runtime": 249157.0171, "train_tokens_per_second": 97939.673 }, { "epoch": 0.9952305246422893, "grad_norm": 0.2618193030357361, "learning_rate": 3.008986066335706e-09, "loss": 0.3891, "num_input_tokens_seen": 24406087977, "step": 6260, "train_runtime": 249197.5008, "train_tokens_per_second": 97938.735 }, { "epoch": 0.995389507154213, "grad_norm": 0.2532740533351898, "learning_rate": 2.8179926202215324e-09, "loss": 0.3824, "num_input_tokens_seen": 24410021116, "step": 6261, "train_runtime": 249235.4237, "train_tokens_per_second": 97939.614 }, { "epoch": 0.9955484896661367, "grad_norm": 0.20523175597190857, "learning_rate": 2.6332607915596772e-09, "loss": 0.3906, "num_input_tokens_seen": 24413827364, "step": 6262, "train_runtime": 249273.4435, "train_tokens_per_second": 97939.945 }, { "epoch": 0.9957074721780604, "grad_norm": 0.17844067513942719, "learning_rate": 2.454790626624237e-09, "loss": 0.3682, "num_input_tokens_seen": 24417781025, "step": 6263, "train_runtime": 249309.6354, "train_tokens_per_second": 97941.586 }, { "epoch": 0.9958664546899841, "grad_norm": 0.2383543699979782, "learning_rate": 2.2825821701238924e-09, "loss": 0.375, "num_input_tokens_seen": 24421757370, "step": 6264, "train_runtime": 249348.8038, "train_tokens_per_second": 97942.148 }, { "epoch": 0.9960254372019078, "grad_norm": 0.2588902413845062, "learning_rate": 2.1166354651935836e-09, "loss": 0.378, "num_input_tokens_seen": 24425582025, "step": 6265, "train_runtime": 249387.4151, "train_tokens_per_second": 97942.32 }, { "epoch": 0.9961844197138314, "grad_norm": 0.21475540101528168, "learning_rate": 1.9569505534028367e-09, "loss": 0.3966, "num_input_tokens_seen": 24429513541, "step": 6266, "train_runtime": 249427.3395, "train_tokens_per_second": 97942.405 }, { "epoch": 0.9963434022257551, "grad_norm": 0.21638649702072144, "learning_rate": 1.8035274747529862e-09, "loss": 0.3757, "num_input_tokens_seen": 24433367993, "step": 6267, "train_runtime": 249468.0356, "train_tokens_per_second": 97941.878 }, { "epoch": 0.9965023847376788, "grad_norm": 0.29444608092308044, "learning_rate": 1.6563662676688519e-09, "loss": 0.3857, "num_input_tokens_seen": 24437228133, "step": 6268, "train_runtime": 249507.0289, "train_tokens_per_second": 97942.043 }, { "epoch": 0.9966613672496025, "grad_norm": 0.257074236869812, "learning_rate": 1.51546696902094e-09, "loss": 0.3874, "num_input_tokens_seen": 24441128841, "step": 6269, "train_runtime": 249547.8554, "train_tokens_per_second": 97941.651 }, { "epoch": 0.9968203497615262, "grad_norm": 0.18560564517974854, "learning_rate": 1.380829614100465e-09, "loss": 0.3898, "num_input_tokens_seen": 24445091870, "step": 6270, "train_runtime": 249586.9293, "train_tokens_per_second": 97942.196 }, { "epoch": 0.9969793322734499, "grad_norm": 0.20829670131206512, "learning_rate": 1.252454236633227e-09, "loss": 0.3941, "num_input_tokens_seen": 24449004401, "step": 6271, "train_runtime": 249625.1771, "train_tokens_per_second": 97942.862 }, { "epoch": 0.9971383147853736, "grad_norm": 0.2101942002773285, "learning_rate": 1.1303408687740602e-09, "loss": 0.3781, "num_input_tokens_seen": 24452885201, "step": 6272, "train_runtime": 249663.7916, "train_tokens_per_second": 97943.258 }, { "epoch": 0.9972972972972973, "grad_norm": 0.4488917589187622, "learning_rate": 1.0144895411151601e-09, "loss": 0.3737, "num_input_tokens_seen": 24456779916, "step": 6273, "train_runtime": 249701.8674, "train_tokens_per_second": 97943.921 }, { "epoch": 0.997456279809221, "grad_norm": 0.355453222990036, "learning_rate": 9.049002826749809e-10, "loss": 0.3813, "num_input_tokens_seen": 24460654728, "step": 6274, "train_runtime": 249740.402, "train_tokens_per_second": 97944.324 }, { "epoch": 0.9976152623211447, "grad_norm": 0.337401807308197, "learning_rate": 8.015731209065624e-10, "loss": 0.3923, "num_input_tokens_seen": 24464491186, "step": 6275, "train_runtime": 249781.3109, "train_tokens_per_second": 97943.642 }, { "epoch": 0.9977742448330683, "grad_norm": 0.20893822610378265, "learning_rate": 7.045080816892035e-10, "loss": 0.366, "num_input_tokens_seen": 24468319572, "step": 6276, "train_runtime": 249820.8419, "train_tokens_per_second": 97943.468 }, { "epoch": 0.997933227344992, "grad_norm": 0.23226089775562286, "learning_rate": 6.137051893367884e-10, "loss": 0.3846, "num_input_tokens_seen": 24472215316, "step": 6277, "train_runtime": 249860.5057, "train_tokens_per_second": 97943.511 }, { "epoch": 0.9980922098569157, "grad_norm": 0.25326988101005554, "learning_rate": 5.291644666005624e-10, "loss": 0.369, "num_input_tokens_seen": 24476133597, "step": 6278, "train_runtime": 249899.2066, "train_tokens_per_second": 97944.023 }, { "epoch": 0.9982511923688394, "grad_norm": 0.18674485385417938, "learning_rate": 4.50885934649703e-10, "loss": 0.3864, "num_input_tokens_seen": 24480018372, "step": 6279, "train_runtime": 249940.0851, "train_tokens_per_second": 97943.547 }, { "epoch": 0.9984101748807631, "grad_norm": 0.34159931540489197, "learning_rate": 3.788696130990754e-10, "loss": 0.3873, "num_input_tokens_seen": 24483870970, "step": 6280, "train_runtime": 249978.7084, "train_tokens_per_second": 97943.825 }, { "epoch": 0.9985691573926868, "grad_norm": 0.2692713141441345, "learning_rate": 3.131155199842528e-10, "loss": 0.3936, "num_input_tokens_seen": 24487954627, "step": 6281, "train_runtime": 250017.3858, "train_tokens_per_second": 97945.007 }, { "epoch": 0.9987281399046105, "grad_norm": 0.23143000900745392, "learning_rate": 2.5362367177816924e-10, "loss": 0.3707, "num_input_tokens_seen": 24491610334, "step": 6282, "train_runtime": 250057.3496, "train_tokens_per_second": 97943.973 }, { "epoch": 0.9988871224165342, "grad_norm": 0.20123863220214844, "learning_rate": 2.0039408338279332e-10, "loss": 0.3843, "num_input_tokens_seen": 24495551049, "step": 6283, "train_runtime": 250098.3538, "train_tokens_per_second": 97943.672 }, { "epoch": 0.9990461049284579, "grad_norm": 0.3167588412761688, "learning_rate": 1.5342676812912794e-10, "loss": 0.383, "num_input_tokens_seen": 24499564250, "step": 6284, "train_runtime": 250137.5721, "train_tokens_per_second": 97944.359 }, { "epoch": 0.9992050874403816, "grad_norm": 0.25161588191986084, "learning_rate": 1.1272173778553719e-10, "loss": 0.3697, "num_input_tokens_seen": 24503357866, "step": 6285, "train_runtime": 250174.1199, "train_tokens_per_second": 97945.215 }, { "epoch": 0.9993640699523052, "grad_norm": 0.23720522224903107, "learning_rate": 7.827900254941956e-11, "loss": 0.3856, "num_input_tokens_seen": 24507226544, "step": 6286, "train_runtime": 250213.4157, "train_tokens_per_second": 97945.294 }, { "epoch": 0.9995230524642289, "grad_norm": 0.21383430063724518, "learning_rate": 5.0098571041656825e-11, "loss": 0.3932, "num_input_tokens_seen": 24511235843, "step": 6287, "train_runtime": 250254.2345, "train_tokens_per_second": 97945.339 }, { "epoch": 0.9996820349761526, "grad_norm": 0.1982111632823944, "learning_rate": 2.8180450328818554e-11, "loss": 0.3938, "num_input_tokens_seen": 24515167899, "step": 6288, "train_runtime": 250293.2127, "train_tokens_per_second": 97945.796 }, { "epoch": 0.9998410174880763, "grad_norm": 0.1930631697177887, "learning_rate": 1.252464589818203e-11, "loss": 0.373, "num_input_tokens_seen": 24519085677, "step": 6289, "train_runtime": 250328.8465, "train_tokens_per_second": 97947.504 }, { "epoch": 1.0, "grad_norm": 0.2245548814535141, "learning_rate": 3.1311616716100946e-12, "loss": 0.3837, "num_input_tokens_seen": 24523036733, "step": 6290, "train_runtime": 250368.7695, "train_tokens_per_second": 97947.666 } ], "logging_steps": 1, "max_steps": 6290, "num_input_samples_seen": 14359951, "num_input_tokens_seen": 24523036733, "num_train_epochs": 9223372036854775807, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2888439743615022e+21, "train_batch_size": 1, "trial_name": null, "trial_params": null }