diff --git "a/v127rc_exp2/B_mup/checkpoint-4200/trainer_state.json" "b/v127rc_exp2/B_mup/checkpoint-4200/trainer_state.json" new file mode 100644--- /dev/null +++ "b/v127rc_exp2/B_mup/checkpoint-4200/trainer_state.json" @@ -0,0 +1,42034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.1634349030470914, + "eval_steps": 500, + "global_step": 4200, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0002770083102493075, + "grad_norm": 0.19974803924560547, + "learning_rate": 0.0, + "loss": 1.3008754253387451, + "num_input_tokens_seen": 16376, + "step": 1, + "train_runtime": 10.5713, + "train_tokens_per_second": 1549.107 + }, + { + "epoch": 0.000554016620498615, + "grad_norm": 0.1806424856185913, + "learning_rate": 2.770083102493075e-07, + "loss": 1.177140474319458, + "num_input_tokens_seen": 32752, + "step": 2, + "train_runtime": 18.7409, + "train_tokens_per_second": 1747.622 + }, + { + "epoch": 0.0008310249307479224, + "grad_norm": 0.22019389271736145, + "learning_rate": 5.54016620498615e-07, + "loss": 1.5572855472564697, + "num_input_tokens_seen": 49128, + "step": 3, + "train_runtime": 26.9202, + "train_tokens_per_second": 1824.952 + }, + { + "epoch": 0.00110803324099723, + "grad_norm": 0.21318066120147705, + "learning_rate": 8.310249307479226e-07, + "loss": 1.133988618850708, + "num_input_tokens_seen": 65504, + "step": 4, + "train_runtime": 35.1085, + "train_tokens_per_second": 1865.756 + }, + { + "epoch": 0.0013850415512465374, + "grad_norm": 0.19083918631076813, + "learning_rate": 1.10803324099723e-06, + "loss": 1.3664568662643433, + "num_input_tokens_seen": 81880, + "step": 5, + "train_runtime": 43.3016, + "train_tokens_per_second": 1890.923 + }, + { + "epoch": 0.0016620498614958448, + "grad_norm": 0.2037476748228073, + "learning_rate": 1.3850415512465373e-06, + "loss": 1.3227899074554443, + "num_input_tokens_seen": 98256, + "step": 6, + "train_runtime": 51.507, + "train_tokens_per_second": 1907.625 + }, + { + "epoch": 0.0019390581717451524, + "grad_norm": 0.20721429586410522, + "learning_rate": 1.662049861495845e-06, + "loss": 1.4504868984222412, + "num_input_tokens_seen": 114632, + "step": 7, + "train_runtime": 59.7167, + "train_tokens_per_second": 1919.596 + }, + { + "epoch": 0.00221606648199446, + "grad_norm": 0.19762864708900452, + "learning_rate": 1.9390581717451524e-06, + "loss": 1.3840701580047607, + "num_input_tokens_seen": 131008, + "step": 8, + "train_runtime": 67.9367, + "train_tokens_per_second": 1928.383 + }, + { + "epoch": 0.002493074792243767, + "grad_norm": 0.2058102935552597, + "learning_rate": 2.21606648199446e-06, + "loss": 1.2206066846847534, + "num_input_tokens_seen": 147384, + "step": 9, + "train_runtime": 76.159, + "train_tokens_per_second": 1935.214 + }, + { + "epoch": 0.002770083102493075, + "grad_norm": 0.23172931373119354, + "learning_rate": 2.4930747922437675e-06, + "loss": 1.4343682527542114, + "num_input_tokens_seen": 163760, + "step": 10, + "train_runtime": 84.3974, + "train_tokens_per_second": 1940.345 + }, + { + "epoch": 0.0030470914127423824, + "grad_norm": 0.2082594484090805, + "learning_rate": 2.7700831024930747e-06, + "loss": 1.3065305948257446, + "num_input_tokens_seen": 180136, + "step": 11, + "train_runtime": 92.6346, + "train_tokens_per_second": 1944.587 + }, + { + "epoch": 0.0033240997229916896, + "grad_norm": 0.1818283051252365, + "learning_rate": 3.0470914127423827e-06, + "loss": 1.196625828742981, + "num_input_tokens_seen": 196512, + "step": 12, + "train_runtime": 100.8595, + "train_tokens_per_second": 1948.374 + }, + { + "epoch": 0.003601108033240997, + "grad_norm": 0.21664302051067352, + "learning_rate": 3.32409972299169e-06, + "loss": 1.3087671995162964, + "num_input_tokens_seen": 212888, + "step": 13, + "train_runtime": 109.0866, + "train_tokens_per_second": 1951.55 + }, + { + "epoch": 0.003878116343490305, + "grad_norm": 0.20675958693027496, + "learning_rate": 3.6011080332409978e-06, + "loss": 1.387635350227356, + "num_input_tokens_seen": 229264, + "step": 14, + "train_runtime": 117.3052, + "train_tokens_per_second": 1954.424 + }, + { + "epoch": 0.004155124653739612, + "grad_norm": 0.18978440761566162, + "learning_rate": 3.878116343490305e-06, + "loss": 1.432822585105896, + "num_input_tokens_seen": 245640, + "step": 15, + "train_runtime": 125.5188, + "train_tokens_per_second": 1956.998 + }, + { + "epoch": 0.00443213296398892, + "grad_norm": 0.2052181214094162, + "learning_rate": 4.155124653739612e-06, + "loss": 1.4600969552993774, + "num_input_tokens_seen": 262016, + "step": 16, + "train_runtime": 133.7283, + "train_tokens_per_second": 1959.316 + }, + { + "epoch": 0.004709141274238227, + "grad_norm": 0.19366899132728577, + "learning_rate": 4.43213296398892e-06, + "loss": 1.2621376514434814, + "num_input_tokens_seen": 278392, + "step": 17, + "train_runtime": 141.9452, + "train_tokens_per_second": 1961.264 + }, + { + "epoch": 0.004986149584487534, + "grad_norm": 0.20840443670749664, + "learning_rate": 4.709141274238227e-06, + "loss": 1.290602684020996, + "num_input_tokens_seen": 294768, + "step": 18, + "train_runtime": 150.1635, + "train_tokens_per_second": 1962.981 + }, + { + "epoch": 0.005263157894736842, + "grad_norm": 0.26558950543403625, + "learning_rate": 4.986149584487535e-06, + "loss": 1.4251869916915894, + "num_input_tokens_seen": 311144, + "step": 19, + "train_runtime": 158.3932, + "train_tokens_per_second": 1964.377 + }, + { + "epoch": 0.00554016620498615, + "grad_norm": 0.1892579197883606, + "learning_rate": 5.263157894736842e-06, + "loss": 1.1918714046478271, + "num_input_tokens_seen": 327520, + "step": 20, + "train_runtime": 166.6212, + "train_tokens_per_second": 1965.656 + }, + { + "epoch": 0.005817174515235457, + "grad_norm": 0.23158018290996552, + "learning_rate": 5.540166204986149e-06, + "loss": 1.401190161705017, + "num_input_tokens_seen": 343896, + "step": 21, + "train_runtime": 174.8613, + "train_tokens_per_second": 1966.678 + }, + { + "epoch": 0.006094182825484765, + "grad_norm": 0.24416176974773407, + "learning_rate": 5.817174515235457e-06, + "loss": 1.4381251335144043, + "num_input_tokens_seen": 360272, + "step": 22, + "train_runtime": 183.0891, + "train_tokens_per_second": 1967.741 + }, + { + "epoch": 0.006371191135734072, + "grad_norm": 0.20257079601287842, + "learning_rate": 6.094182825484765e-06, + "loss": 1.12482488155365, + "num_input_tokens_seen": 376648, + "step": 23, + "train_runtime": 191.3147, + "train_tokens_per_second": 1968.735 + }, + { + "epoch": 0.006648199445983379, + "grad_norm": 0.1978844702243805, + "learning_rate": 6.3711911357340724e-06, + "loss": 1.228573203086853, + "num_input_tokens_seen": 393024, + "step": 24, + "train_runtime": 199.5438, + "train_tokens_per_second": 1969.613 + }, + { + "epoch": 0.006925207756232687, + "grad_norm": 0.23471006751060486, + "learning_rate": 6.64819944598338e-06, + "loss": 1.400299310684204, + "num_input_tokens_seen": 409400, + "step": 25, + "train_runtime": 207.7749, + "train_tokens_per_second": 1970.401 + }, + { + "epoch": 0.007202216066481994, + "grad_norm": 0.20937736332416534, + "learning_rate": 6.9252077562326875e-06, + "loss": 1.4444844722747803, + "num_input_tokens_seen": 425776, + "step": 26, + "train_runtime": 216.0038, + "train_tokens_per_second": 1971.151 + }, + { + "epoch": 0.007479224376731302, + "grad_norm": 0.26524004340171814, + "learning_rate": 7.2022160664819955e-06, + "loss": 1.3200058937072754, + "num_input_tokens_seen": 442152, + "step": 27, + "train_runtime": 224.234, + "train_tokens_per_second": 1971.833 + }, + { + "epoch": 0.00775623268698061, + "grad_norm": 0.20903293788433075, + "learning_rate": 7.479224376731302e-06, + "loss": 1.409676194190979, + "num_input_tokens_seen": 458528, + "step": 28, + "train_runtime": 232.4645, + "train_tokens_per_second": 1972.465 + }, + { + "epoch": 0.008033240997229917, + "grad_norm": 0.20190700888633728, + "learning_rate": 7.75623268698061e-06, + "loss": 1.3784735202789307, + "num_input_tokens_seen": 474904, + "step": 29, + "train_runtime": 240.6929, + "train_tokens_per_second": 1973.07 + }, + { + "epoch": 0.008310249307479225, + "grad_norm": 0.2016134113073349, + "learning_rate": 8.033240997229918e-06, + "loss": 1.1249107122421265, + "num_input_tokens_seen": 491280, + "step": 30, + "train_runtime": 248.9364, + "train_tokens_per_second": 1973.516 + }, + { + "epoch": 0.008587257617728532, + "grad_norm": 0.2025245577096939, + "learning_rate": 8.310249307479224e-06, + "loss": 1.3168435096740723, + "num_input_tokens_seen": 507656, + "step": 31, + "train_runtime": 257.1663, + "train_tokens_per_second": 1974.038 + }, + { + "epoch": 0.00886426592797784, + "grad_norm": 0.2089613378047943, + "learning_rate": 8.587257617728532e-06, + "loss": 1.3293673992156982, + "num_input_tokens_seen": 524032, + "step": 32, + "train_runtime": 265.38, + "train_tokens_per_second": 1974.648 + }, + { + "epoch": 0.009141274238227148, + "grad_norm": 0.18817956745624542, + "learning_rate": 8.86426592797784e-06, + "loss": 1.1466889381408691, + "num_input_tokens_seen": 540408, + "step": 33, + "train_runtime": 273.5932, + "train_tokens_per_second": 1975.224 + }, + { + "epoch": 0.009418282548476454, + "grad_norm": 0.19996502995491028, + "learning_rate": 9.141274238227148e-06, + "loss": 1.32096529006958, + "num_input_tokens_seen": 556784, + "step": 34, + "train_runtime": 281.8233, + "train_tokens_per_second": 1975.649 + }, + { + "epoch": 0.009695290858725761, + "grad_norm": 0.20531931519508362, + "learning_rate": 9.418282548476454e-06, + "loss": 1.3096777200698853, + "num_input_tokens_seen": 573160, + "step": 35, + "train_runtime": 290.0585, + "train_tokens_per_second": 1976.015 + }, + { + "epoch": 0.009972299168975069, + "grad_norm": 0.22196975350379944, + "learning_rate": 9.695290858725762e-06, + "loss": 1.4356218576431274, + "num_input_tokens_seen": 589536, + "step": 36, + "train_runtime": 298.3257, + "train_tokens_per_second": 1976.149 + }, + { + "epoch": 0.010249307479224376, + "grad_norm": 0.19673702120780945, + "learning_rate": 9.97229916897507e-06, + "loss": 1.3413242101669312, + "num_input_tokens_seen": 605912, + "step": 37, + "train_runtime": 306.563, + "train_tokens_per_second": 1976.468 + }, + { + "epoch": 0.010526315789473684, + "grad_norm": 0.19606688618659973, + "learning_rate": 1.0249307479224378e-05, + "loss": 1.14158296585083, + "num_input_tokens_seen": 622288, + "step": 38, + "train_runtime": 314.7773, + "train_tokens_per_second": 1976.915 + }, + { + "epoch": 0.010803324099722992, + "grad_norm": 0.1987162083387375, + "learning_rate": 1.0526315789473684e-05, + "loss": 1.4169780015945435, + "num_input_tokens_seen": 638664, + "step": 39, + "train_runtime": 322.9913, + "train_tokens_per_second": 1977.341 + }, + { + "epoch": 0.0110803324099723, + "grad_norm": 0.18349622189998627, + "learning_rate": 1.0803324099722992e-05, + "loss": 1.227226972579956, + "num_input_tokens_seen": 655040, + "step": 40, + "train_runtime": 331.2144, + "train_tokens_per_second": 1977.692 + }, + { + "epoch": 0.011357340720221607, + "grad_norm": 0.19013476371765137, + "learning_rate": 1.1080332409972299e-05, + "loss": 1.3007169961929321, + "num_input_tokens_seen": 671416, + "step": 41, + "train_runtime": 339.4429, + "train_tokens_per_second": 1977.994 + }, + { + "epoch": 0.011634349030470914, + "grad_norm": 0.22841231524944305, + "learning_rate": 1.1357340720221607e-05, + "loss": 1.661263108253479, + "num_input_tokens_seen": 687792, + "step": 42, + "train_runtime": 347.6728, + "train_tokens_per_second": 1978.274 + }, + { + "epoch": 0.011911357340720222, + "grad_norm": 0.18880786001682281, + "learning_rate": 1.1634349030470915e-05, + "loss": 1.2077555656433105, + "num_input_tokens_seen": 704168, + "step": 43, + "train_runtime": 355.8991, + "train_tokens_per_second": 1978.561 + }, + { + "epoch": 0.01218836565096953, + "grad_norm": 0.19857576489448547, + "learning_rate": 1.1911357340720223e-05, + "loss": 1.3297723531723022, + "num_input_tokens_seen": 720544, + "step": 44, + "train_runtime": 364.1291, + "train_tokens_per_second": 1978.815 + }, + { + "epoch": 0.012465373961218837, + "grad_norm": 0.20376235246658325, + "learning_rate": 1.218836565096953e-05, + "loss": 1.3305563926696777, + "num_input_tokens_seen": 736920, + "step": 45, + "train_runtime": 372.358, + "train_tokens_per_second": 1979.063 + }, + { + "epoch": 0.012742382271468145, + "grad_norm": 0.20336413383483887, + "learning_rate": 1.2465373961218837e-05, + "loss": 1.3658658266067505, + "num_input_tokens_seen": 753296, + "step": 46, + "train_runtime": 380.5965, + "train_tokens_per_second": 1979.251 + }, + { + "epoch": 0.01301939058171745, + "grad_norm": 0.2024649977684021, + "learning_rate": 1.2742382271468145e-05, + "loss": 1.2490519285202026, + "num_input_tokens_seen": 769672, + "step": 47, + "train_runtime": 388.8189, + "train_tokens_per_second": 1979.513 + }, + { + "epoch": 0.013296398891966758, + "grad_norm": 0.23284997045993805, + "learning_rate": 1.3019390581717453e-05, + "loss": 1.50986647605896, + "num_input_tokens_seen": 786048, + "step": 48, + "train_runtime": 397.0586, + "train_tokens_per_second": 1979.678 + }, + { + "epoch": 0.013573407202216066, + "grad_norm": 0.19301240146160126, + "learning_rate": 1.329639889196676e-05, + "loss": 1.2635934352874756, + "num_input_tokens_seen": 802424, + "step": 49, + "train_runtime": 405.2913, + "train_tokens_per_second": 1979.87 + }, + { + "epoch": 0.013850415512465374, + "grad_norm": 0.21042048931121826, + "learning_rate": 1.3573407202216069e-05, + "loss": 1.3483132123947144, + "num_input_tokens_seen": 818800, + "step": 50, + "train_runtime": 413.5235, + "train_tokens_per_second": 1980.057 + }, + { + "epoch": 0.014127423822714681, + "grad_norm": 0.20898906886577606, + "learning_rate": 1.3850415512465375e-05, + "loss": 1.343579649925232, + "num_input_tokens_seen": 835176, + "step": 51, + "train_runtime": 421.7633, + "train_tokens_per_second": 1980.201 + }, + { + "epoch": 0.014404432132963989, + "grad_norm": 0.2052023708820343, + "learning_rate": 1.4127423822714683e-05, + "loss": 1.2853317260742188, + "num_input_tokens_seen": 851552, + "step": 52, + "train_runtime": 429.99, + "train_tokens_per_second": 1980.4 + }, + { + "epoch": 0.014681440443213296, + "grad_norm": 0.18434950709342957, + "learning_rate": 1.4404432132963991e-05, + "loss": 1.1593763828277588, + "num_input_tokens_seen": 867928, + "step": 53, + "train_runtime": 438.2224, + "train_tokens_per_second": 1980.565 + }, + { + "epoch": 0.014958448753462604, + "grad_norm": 0.22465690970420837, + "learning_rate": 1.4681440443213299e-05, + "loss": 1.471274971961975, + "num_input_tokens_seen": 884304, + "step": 54, + "train_runtime": 446.4408, + "train_tokens_per_second": 1980.787 + }, + { + "epoch": 0.015235457063711912, + "grad_norm": 0.2104768306016922, + "learning_rate": 1.4958448753462604e-05, + "loss": 1.2986819744110107, + "num_input_tokens_seen": 900680, + "step": 55, + "train_runtime": 454.6597, + "train_tokens_per_second": 1980.998 + }, + { + "epoch": 0.01551246537396122, + "grad_norm": 0.2263578176498413, + "learning_rate": 1.5235457063711912e-05, + "loss": 1.3823051452636719, + "num_input_tokens_seen": 917056, + "step": 56, + "train_runtime": 462.871, + "train_tokens_per_second": 1981.234 + }, + { + "epoch": 0.015789473684210527, + "grad_norm": 0.2213507443666458, + "learning_rate": 1.551246537396122e-05, + "loss": 1.3738175630569458, + "num_input_tokens_seen": 933432, + "step": 57, + "train_runtime": 471.0856, + "train_tokens_per_second": 1981.449 + }, + { + "epoch": 0.016066481994459834, + "grad_norm": 0.25201913714408875, + "learning_rate": 1.5789473684210526e-05, + "loss": 1.443530797958374, + "num_input_tokens_seen": 949808, + "step": 58, + "train_runtime": 479.3091, + "train_tokens_per_second": 1981.619 + }, + { + "epoch": 0.016343490304709142, + "grad_norm": 0.208817258477211, + "learning_rate": 1.6066481994459835e-05, + "loss": 1.2694791555404663, + "num_input_tokens_seen": 966184, + "step": 59, + "train_runtime": 487.5388, + "train_tokens_per_second": 1981.758 + }, + { + "epoch": 0.01662049861495845, + "grad_norm": 0.2082168608903885, + "learning_rate": 1.6343490304709142e-05, + "loss": 1.198103427886963, + "num_input_tokens_seen": 982560, + "step": 60, + "train_runtime": 495.7593, + "train_tokens_per_second": 1981.93 + }, + { + "epoch": 0.016897506925207757, + "grad_norm": 0.22764891386032104, + "learning_rate": 1.6620498614958448e-05, + "loss": 1.401847243309021, + "num_input_tokens_seen": 998936, + "step": 61, + "train_runtime": 503.9832, + "train_tokens_per_second": 1982.082 + }, + { + "epoch": 0.017174515235457065, + "grad_norm": 0.22006699442863464, + "learning_rate": 1.6897506925207758e-05, + "loss": 1.1394731998443604, + "num_input_tokens_seen": 1015312, + "step": 62, + "train_runtime": 512.199, + "train_tokens_per_second": 1982.261 + }, + { + "epoch": 0.017451523545706372, + "grad_norm": 0.23141850531101227, + "learning_rate": 1.7174515235457064e-05, + "loss": 1.394463062286377, + "num_input_tokens_seen": 1031688, + "step": 63, + "train_runtime": 520.4056, + "train_tokens_per_second": 1982.469 + }, + { + "epoch": 0.01772853185595568, + "grad_norm": 0.1988488882780075, + "learning_rate": 1.745152354570637e-05, + "loss": 1.1406528949737549, + "num_input_tokens_seen": 1048064, + "step": 64, + "train_runtime": 528.6217, + "train_tokens_per_second": 1982.635 + }, + { + "epoch": 0.018005540166204988, + "grad_norm": 0.22045518457889557, + "learning_rate": 1.772853185595568e-05, + "loss": 1.2544196844100952, + "num_input_tokens_seen": 1064440, + "step": 65, + "train_runtime": 536.8583, + "train_tokens_per_second": 1982.721 + }, + { + "epoch": 0.018282548476454295, + "grad_norm": 0.2336575835943222, + "learning_rate": 1.8005540166204986e-05, + "loss": 1.3780077695846558, + "num_input_tokens_seen": 1080816, + "step": 66, + "train_runtime": 545.0956, + "train_tokens_per_second": 1982.801 + }, + { + "epoch": 0.0185595567867036, + "grad_norm": 0.2238014191389084, + "learning_rate": 1.8282548476454296e-05, + "loss": 1.1195652484893799, + "num_input_tokens_seen": 1097192, + "step": 67, + "train_runtime": 553.3236, + "train_tokens_per_second": 1982.912 + }, + { + "epoch": 0.018836565096952907, + "grad_norm": 0.22228729724884033, + "learning_rate": 1.8559556786703602e-05, + "loss": 1.1844733953475952, + "num_input_tokens_seen": 1113568, + "step": 68, + "train_runtime": 561.5491, + "train_tokens_per_second": 1983.029 + }, + { + "epoch": 0.019113573407202215, + "grad_norm": 0.2227269560098648, + "learning_rate": 1.883656509695291e-05, + "loss": 1.1785049438476562, + "num_input_tokens_seen": 1129944, + "step": 69, + "train_runtime": 569.7684, + "train_tokens_per_second": 1983.164 + }, + { + "epoch": 0.019390581717451522, + "grad_norm": 0.24630776047706604, + "learning_rate": 1.9113573407202218e-05, + "loss": 1.0950614213943481, + "num_input_tokens_seen": 1146320, + "step": 70, + "train_runtime": 577.9758, + "train_tokens_per_second": 1983.336 + }, + { + "epoch": 0.01966759002770083, + "grad_norm": 0.24070221185684204, + "learning_rate": 1.9390581717451524e-05, + "loss": 1.1477469205856323, + "num_input_tokens_seen": 1162696, + "step": 71, + "train_runtime": 586.1868, + "train_tokens_per_second": 1983.491 + }, + { + "epoch": 0.019944598337950138, + "grad_norm": 0.22338292002677917, + "learning_rate": 1.9667590027700834e-05, + "loss": 1.179660439491272, + "num_input_tokens_seen": 1179072, + "step": 72, + "train_runtime": 594.4142, + "train_tokens_per_second": 1983.587 + }, + { + "epoch": 0.020221606648199445, + "grad_norm": 0.22183576226234436, + "learning_rate": 1.994459833795014e-05, + "loss": 1.154234528541565, + "num_input_tokens_seen": 1195448, + "step": 73, + "train_runtime": 602.6339, + "train_tokens_per_second": 1983.705 + }, + { + "epoch": 0.020498614958448753, + "grad_norm": 0.2353476583957672, + "learning_rate": 2.0221606648199447e-05, + "loss": 1.1822134256362915, + "num_input_tokens_seen": 1211824, + "step": 74, + "train_runtime": 610.8397, + "train_tokens_per_second": 1983.866 + }, + { + "epoch": 0.02077562326869806, + "grad_norm": 0.23614031076431274, + "learning_rate": 2.0498614958448756e-05, + "loss": 1.1752800941467285, + "num_input_tokens_seen": 1228200, + "step": 75, + "train_runtime": 619.0569, + "train_tokens_per_second": 1983.986 + }, + { + "epoch": 0.021052631578947368, + "grad_norm": 0.24093110859394073, + "learning_rate": 2.077562326869806e-05, + "loss": 1.1092638969421387, + "num_input_tokens_seen": 1244576, + "step": 76, + "train_runtime": 627.2721, + "train_tokens_per_second": 1984.109 + }, + { + "epoch": 0.021329639889196676, + "grad_norm": 0.247078999876976, + "learning_rate": 2.105263157894737e-05, + "loss": 1.192847490310669, + "num_input_tokens_seen": 1260952, + "step": 77, + "train_runtime": 635.4912, + "train_tokens_per_second": 1984.216 + }, + { + "epoch": 0.021606648199445983, + "grad_norm": 0.2571609318256378, + "learning_rate": 2.1329639889196675e-05, + "loss": 1.0357508659362793, + "num_input_tokens_seen": 1277328, + "step": 78, + "train_runtime": 643.7085, + "train_tokens_per_second": 1984.327 + }, + { + "epoch": 0.02188365650969529, + "grad_norm": 0.26755985617637634, + "learning_rate": 2.1606648199445985e-05, + "loss": 1.279419183731079, + "num_input_tokens_seen": 1293704, + "step": 79, + "train_runtime": 651.9378, + "train_tokens_per_second": 1984.398 + }, + { + "epoch": 0.0221606648199446, + "grad_norm": 0.26527658104896545, + "learning_rate": 2.188365650969529e-05, + "loss": 1.033733606338501, + "num_input_tokens_seen": 1310080, + "step": 80, + "train_runtime": 660.1651, + "train_tokens_per_second": 1984.473 + }, + { + "epoch": 0.022437673130193906, + "grad_norm": 0.2815229296684265, + "learning_rate": 2.2160664819944597e-05, + "loss": 1.1851569414138794, + "num_input_tokens_seen": 1326456, + "step": 81, + "train_runtime": 668.3887, + "train_tokens_per_second": 1984.558 + }, + { + "epoch": 0.022714681440443214, + "grad_norm": 0.2681605815887451, + "learning_rate": 2.2437673130193907e-05, + "loss": 1.11195969581604, + "num_input_tokens_seen": 1342832, + "step": 82, + "train_runtime": 676.6172, + "train_tokens_per_second": 1984.626 + }, + { + "epoch": 0.02299168975069252, + "grad_norm": 0.2715722322463989, + "learning_rate": 2.2714681440443213e-05, + "loss": 1.2121024131774902, + "num_input_tokens_seen": 1359208, + "step": 83, + "train_runtime": 684.8455, + "train_tokens_per_second": 1984.693 + }, + { + "epoch": 0.02326869806094183, + "grad_norm": 0.26133817434310913, + "learning_rate": 2.2991689750692523e-05, + "loss": 1.1711103916168213, + "num_input_tokens_seen": 1375584, + "step": 84, + "train_runtime": 693.0639, + "train_tokens_per_second": 1984.787 + }, + { + "epoch": 0.023545706371191136, + "grad_norm": 0.27183660864830017, + "learning_rate": 2.326869806094183e-05, + "loss": 0.9399521350860596, + "num_input_tokens_seen": 1391960, + "step": 85, + "train_runtime": 701.2865, + "train_tokens_per_second": 1984.866 + }, + { + "epoch": 0.023822714681440444, + "grad_norm": 0.35709071159362793, + "learning_rate": 2.3545706371191136e-05, + "loss": 1.2213917970657349, + "num_input_tokens_seen": 1408336, + "step": 86, + "train_runtime": 709.5188, + "train_tokens_per_second": 1984.917 + }, + { + "epoch": 0.02409972299168975, + "grad_norm": 0.2848862409591675, + "learning_rate": 2.3822714681440445e-05, + "loss": 1.132256031036377, + "num_input_tokens_seen": 1424712, + "step": 87, + "train_runtime": 717.7591, + "train_tokens_per_second": 1984.944 + }, + { + "epoch": 0.02437673130193906, + "grad_norm": 0.27834179997444153, + "learning_rate": 2.409972299168975e-05, + "loss": 1.0612107515335083, + "num_input_tokens_seen": 1441088, + "step": 88, + "train_runtime": 725.9929, + "train_tokens_per_second": 1984.989 + }, + { + "epoch": 0.024653739612188367, + "grad_norm": 0.3374454379081726, + "learning_rate": 2.437673130193906e-05, + "loss": 1.2137848138809204, + "num_input_tokens_seen": 1457464, + "step": 89, + "train_runtime": 734.2234, + "train_tokens_per_second": 1985.041 + }, + { + "epoch": 0.024930747922437674, + "grad_norm": 0.3023698329925537, + "learning_rate": 2.4653739612188367e-05, + "loss": 0.910720705986023, + "num_input_tokens_seen": 1473840, + "step": 90, + "train_runtime": 742.4648, + "train_tokens_per_second": 1985.064 + }, + { + "epoch": 0.025207756232686982, + "grad_norm": 0.28809505701065063, + "learning_rate": 2.4930747922437674e-05, + "loss": 1.058011531829834, + "num_input_tokens_seen": 1490216, + "step": 91, + "train_runtime": 750.6959, + "train_tokens_per_second": 1985.113 + }, + { + "epoch": 0.02548476454293629, + "grad_norm": 0.2890704274177551, + "learning_rate": 2.520775623268698e-05, + "loss": 1.0406876802444458, + "num_input_tokens_seen": 1506592, + "step": 92, + "train_runtime": 758.927, + "train_tokens_per_second": 1985.161 + }, + { + "epoch": 0.025761772853185594, + "grad_norm": 0.32054319977760315, + "learning_rate": 2.548476454293629e-05, + "loss": 0.9748150706291199, + "num_input_tokens_seen": 1522968, + "step": 93, + "train_runtime": 767.2045, + "train_tokens_per_second": 1985.087 + }, + { + "epoch": 0.0260387811634349, + "grad_norm": 0.31482261419296265, + "learning_rate": 2.5761772853185596e-05, + "loss": 1.0520613193511963, + "num_input_tokens_seen": 1539344, + "step": 94, + "train_runtime": 775.4485, + "train_tokens_per_second": 1985.102 + }, + { + "epoch": 0.02631578947368421, + "grad_norm": 0.328294575214386, + "learning_rate": 2.6038781163434906e-05, + "loss": 1.049708366394043, + "num_input_tokens_seen": 1555720, + "step": 95, + "train_runtime": 783.6815, + "train_tokens_per_second": 1985.143 + }, + { + "epoch": 0.026592797783933517, + "grad_norm": 0.3329083323478699, + "learning_rate": 2.6315789473684212e-05, + "loss": 1.0165297985076904, + "num_input_tokens_seen": 1572096, + "step": 96, + "train_runtime": 791.9168, + "train_tokens_per_second": 1985.178 + }, + { + "epoch": 0.026869806094182824, + "grad_norm": 0.40896469354629517, + "learning_rate": 2.659279778393352e-05, + "loss": 0.9879429936408997, + "num_input_tokens_seen": 1588472, + "step": 97, + "train_runtime": 800.1595, + "train_tokens_per_second": 1985.194 + }, + { + "epoch": 0.027146814404432132, + "grad_norm": 0.35054847598075867, + "learning_rate": 2.6869806094182825e-05, + "loss": 1.0290201902389526, + "num_input_tokens_seen": 1604848, + "step": 98, + "train_runtime": 809.1871, + "train_tokens_per_second": 1983.284 + }, + { + "epoch": 0.02742382271468144, + "grad_norm": 0.40867283940315247, + "learning_rate": 2.7146814404432138e-05, + "loss": 1.0820136070251465, + "num_input_tokens_seen": 1621224, + "step": 99, + "train_runtime": 817.4343, + "train_tokens_per_second": 1983.308 + }, + { + "epoch": 0.027700831024930747, + "grad_norm": 0.32213497161865234, + "learning_rate": 2.742382271468144e-05, + "loss": 0.9336585402488708, + "num_input_tokens_seen": 1637600, + "step": 100, + "train_runtime": 825.6612, + "train_tokens_per_second": 1983.38 + }, + { + "epoch": 0.027977839335180055, + "grad_norm": 0.419733464717865, + "learning_rate": 2.770083102493075e-05, + "loss": 1.0998961925506592, + "num_input_tokens_seen": 1653976, + "step": 101, + "train_runtime": 835.3345, + "train_tokens_per_second": 1980.016 + }, + { + "epoch": 0.028254847645429362, + "grad_norm": 0.39754173159599304, + "learning_rate": 2.7977839335180056e-05, + "loss": 1.0648705959320068, + "num_input_tokens_seen": 1670352, + "step": 102, + "train_runtime": 843.5596, + "train_tokens_per_second": 1980.123 + }, + { + "epoch": 0.02853185595567867, + "grad_norm": 0.36923086643218994, + "learning_rate": 2.8254847645429366e-05, + "loss": 0.867785632610321, + "num_input_tokens_seen": 1686728, + "step": 103, + "train_runtime": 851.7923, + "train_tokens_per_second": 1980.211 + }, + { + "epoch": 0.028808864265927978, + "grad_norm": 0.37676048278808594, + "learning_rate": 2.8531855955678672e-05, + "loss": 0.9541873335838318, + "num_input_tokens_seen": 1703104, + "step": 104, + "train_runtime": 860.0127, + "train_tokens_per_second": 1980.324 + }, + { + "epoch": 0.029085872576177285, + "grad_norm": 0.40412741899490356, + "learning_rate": 2.8808864265927982e-05, + "loss": 0.9540935754776001, + "num_input_tokens_seen": 1719480, + "step": 105, + "train_runtime": 868.2256, + "train_tokens_per_second": 1980.453 + }, + { + "epoch": 0.029362880886426593, + "grad_norm": 0.43296173214912415, + "learning_rate": 2.9085872576177285e-05, + "loss": 0.9661292433738708, + "num_input_tokens_seen": 1735856, + "step": 106, + "train_runtime": 876.468, + "train_tokens_per_second": 1980.513 + }, + { + "epoch": 0.0296398891966759, + "grad_norm": 0.3736564815044403, + "learning_rate": 2.9362880886426598e-05, + "loss": 0.8791632056236267, + "num_input_tokens_seen": 1752232, + "step": 107, + "train_runtime": 884.7099, + "train_tokens_per_second": 1980.572 + }, + { + "epoch": 0.029916897506925208, + "grad_norm": 0.4281712472438812, + "learning_rate": 2.96398891966759e-05, + "loss": 0.8967458009719849, + "num_input_tokens_seen": 1768608, + "step": 108, + "train_runtime": 892.9475, + "train_tokens_per_second": 1980.641 + }, + { + "epoch": 0.030193905817174516, + "grad_norm": 0.4422169029712677, + "learning_rate": 2.9916897506925207e-05, + "loss": 0.9783757328987122, + "num_input_tokens_seen": 1784984, + "step": 109, + "train_runtime": 901.1704, + "train_tokens_per_second": 1980.74 + }, + { + "epoch": 0.030470914127423823, + "grad_norm": 0.45073533058166504, + "learning_rate": 3.0193905817174517e-05, + "loss": 0.9160676002502441, + "num_input_tokens_seen": 1801360, + "step": 110, + "train_runtime": 909.3958, + "train_tokens_per_second": 1980.832 + }, + { + "epoch": 0.03074792243767313, + "grad_norm": 0.4822501242160797, + "learning_rate": 3.0470914127423823e-05, + "loss": 0.8231464624404907, + "num_input_tokens_seen": 1817736, + "step": 111, + "train_runtime": 917.6221, + "train_tokens_per_second": 1980.92 + }, + { + "epoch": 0.03102493074792244, + "grad_norm": 0.5717563629150391, + "learning_rate": 3.074792243767313e-05, + "loss": 1.0020201206207275, + "num_input_tokens_seen": 1834112, + "step": 112, + "train_runtime": 925.8372, + "train_tokens_per_second": 1981.031 + }, + { + "epoch": 0.031301939058171746, + "grad_norm": 0.5511406064033508, + "learning_rate": 3.102493074792244e-05, + "loss": 0.9801983833312988, + "num_input_tokens_seen": 1850488, + "step": 113, + "train_runtime": 934.0486, + "train_tokens_per_second": 1981.147 + }, + { + "epoch": 0.031578947368421054, + "grad_norm": 0.5460030436515808, + "learning_rate": 3.130193905817175e-05, + "loss": 0.9769917130470276, + "num_input_tokens_seen": 1866864, + "step": 114, + "train_runtime": 942.2578, + "train_tokens_per_second": 1981.267 + }, + { + "epoch": 0.03185595567867036, + "grad_norm": 0.47561749815940857, + "learning_rate": 3.157894736842105e-05, + "loss": 0.8316506147384644, + "num_input_tokens_seen": 1883240, + "step": 115, + "train_runtime": 950.4805, + "train_tokens_per_second": 1981.356 + }, + { + "epoch": 0.03213296398891967, + "grad_norm": 0.5160908699035645, + "learning_rate": 3.185595567867036e-05, + "loss": 0.9141228199005127, + "num_input_tokens_seen": 1899616, + "step": 116, + "train_runtime": 958.7128, + "train_tokens_per_second": 1981.423 + }, + { + "epoch": 0.032409972299168976, + "grad_norm": 0.49668774008750916, + "learning_rate": 3.213296398891967e-05, + "loss": 0.8340078592300415, + "num_input_tokens_seen": 1915992, + "step": 117, + "train_runtime": 966.9455, + "train_tokens_per_second": 1981.489 + }, + { + "epoch": 0.032686980609418284, + "grad_norm": 0.46899357438087463, + "learning_rate": 3.240997229916898e-05, + "loss": 0.8158856630325317, + "num_input_tokens_seen": 1932368, + "step": 118, + "train_runtime": 975.1814, + "train_tokens_per_second": 1981.547 + }, + { + "epoch": 0.03296398891966759, + "grad_norm": 0.5576912760734558, + "learning_rate": 3.2686980609418284e-05, + "loss": 0.8255398869514465, + "num_input_tokens_seen": 1948744, + "step": 119, + "train_runtime": 983.4155, + "train_tokens_per_second": 1981.608 + }, + { + "epoch": 0.0332409972299169, + "grad_norm": 0.5019871592521667, + "learning_rate": 3.296398891966759e-05, + "loss": 0.8549615740776062, + "num_input_tokens_seen": 1965120, + "step": 120, + "train_runtime": 991.6398, + "train_tokens_per_second": 1981.687 + }, + { + "epoch": 0.03351800554016621, + "grad_norm": 0.5380167365074158, + "learning_rate": 3.3240997229916896e-05, + "loss": 0.8504559397697449, + "num_input_tokens_seen": 1981496, + "step": 121, + "train_runtime": 999.873, + "train_tokens_per_second": 1981.748 + }, + { + "epoch": 0.033795013850415515, + "grad_norm": 0.5316630601882935, + "learning_rate": 3.3518005540166206e-05, + "loss": 0.7893936634063721, + "num_input_tokens_seen": 1997872, + "step": 122, + "train_runtime": 1008.1015, + "train_tokens_per_second": 1981.816 + }, + { + "epoch": 0.03407202216066482, + "grad_norm": 0.5375428199768066, + "learning_rate": 3.3795013850415515e-05, + "loss": 0.8549631834030151, + "num_input_tokens_seen": 2014248, + "step": 123, + "train_runtime": 1016.3332, + "train_tokens_per_second": 1981.878 + }, + { + "epoch": 0.03434903047091413, + "grad_norm": 0.5288248658180237, + "learning_rate": 3.4072022160664825e-05, + "loss": 0.8434067964553833, + "num_input_tokens_seen": 2030624, + "step": 124, + "train_runtime": 1024.5577, + "train_tokens_per_second": 1981.952 + }, + { + "epoch": 0.03462603878116344, + "grad_norm": 0.5220904350280762, + "learning_rate": 3.434903047091413e-05, + "loss": 0.7650355696678162, + "num_input_tokens_seen": 2047000, + "step": 125, + "train_runtime": 1032.7804, + "train_tokens_per_second": 1982.028 + }, + { + "epoch": 0.034903047091412745, + "grad_norm": 0.6318022608757019, + "learning_rate": 3.462603878116344e-05, + "loss": 0.870831310749054, + "num_input_tokens_seen": 2063376, + "step": 126, + "train_runtime": 1041.0014, + "train_tokens_per_second": 1982.107 + }, + { + "epoch": 0.03518005540166205, + "grad_norm": 0.5783331394195557, + "learning_rate": 3.490304709141274e-05, + "loss": 0.8253586888313293, + "num_input_tokens_seen": 2079752, + "step": 127, + "train_runtime": 1049.2316, + "train_tokens_per_second": 1982.167 + }, + { + "epoch": 0.03545706371191136, + "grad_norm": 0.5494397282600403, + "learning_rate": 3.518005540166206e-05, + "loss": 0.6846734881401062, + "num_input_tokens_seen": 2096128, + "step": 128, + "train_runtime": 1057.4655, + "train_tokens_per_second": 1982.219 + }, + { + "epoch": 0.03573407202216067, + "grad_norm": 0.5411443710327148, + "learning_rate": 3.545706371191136e-05, + "loss": 0.7486408948898315, + "num_input_tokens_seen": 2112504, + "step": 129, + "train_runtime": 1065.6887, + "train_tokens_per_second": 1982.29 + }, + { + "epoch": 0.036011080332409975, + "grad_norm": 0.6407768130302429, + "learning_rate": 3.573407202216066e-05, + "loss": 0.7942906618118286, + "num_input_tokens_seen": 2128880, + "step": 130, + "train_runtime": 1073.9064, + "train_tokens_per_second": 1982.37 + }, + { + "epoch": 0.03628808864265928, + "grad_norm": 0.6041961312294006, + "learning_rate": 3.601108033240997e-05, + "loss": 0.7769217491149902, + "num_input_tokens_seen": 2145256, + "step": 131, + "train_runtime": 1082.1302, + "train_tokens_per_second": 1982.438 + }, + { + "epoch": 0.03656509695290859, + "grad_norm": 0.5925173759460449, + "learning_rate": 3.628808864265928e-05, + "loss": 0.8205530643463135, + "num_input_tokens_seen": 2161632, + "step": 132, + "train_runtime": 1090.3648, + "train_tokens_per_second": 1982.485 + }, + { + "epoch": 0.03684210526315789, + "grad_norm": 0.5825573801994324, + "learning_rate": 3.656509695290859e-05, + "loss": 0.6715656518936157, + "num_input_tokens_seen": 2178008, + "step": 133, + "train_runtime": 1098.5926, + "train_tokens_per_second": 1982.544 + }, + { + "epoch": 0.0371191135734072, + "grad_norm": 0.622559130191803, + "learning_rate": 3.6842105263157895e-05, + "loss": 0.8514711856842041, + "num_input_tokens_seen": 2194384, + "step": 134, + "train_runtime": 1106.8246, + "train_tokens_per_second": 1982.594 + }, + { + "epoch": 0.037396121883656507, + "grad_norm": 0.5422042608261108, + "learning_rate": 3.7119113573407204e-05, + "loss": 0.667987585067749, + "num_input_tokens_seen": 2210760, + "step": 135, + "train_runtime": 1115.046, + "train_tokens_per_second": 1982.663 + }, + { + "epoch": 0.037673130193905814, + "grad_norm": 0.6610152125358582, + "learning_rate": 3.739612188365651e-05, + "loss": 0.7157915830612183, + "num_input_tokens_seen": 2227136, + "step": 136, + "train_runtime": 1123.2683, + "train_tokens_per_second": 1982.728 + }, + { + "epoch": 0.03795013850415512, + "grad_norm": 0.6288188695907593, + "learning_rate": 3.767313019390582e-05, + "loss": 0.7624672651290894, + "num_input_tokens_seen": 2243512, + "step": 137, + "train_runtime": 1131.4938, + "train_tokens_per_second": 1982.788 + }, + { + "epoch": 0.03822714681440443, + "grad_norm": 0.5814260840415955, + "learning_rate": 3.795013850415513e-05, + "loss": 0.664568305015564, + "num_input_tokens_seen": 2259888, + "step": 138, + "train_runtime": 1139.7137, + "train_tokens_per_second": 1982.856 + }, + { + "epoch": 0.03850415512465374, + "grad_norm": 0.6125353574752808, + "learning_rate": 3.8227146814404436e-05, + "loss": 0.6742233037948608, + "num_input_tokens_seen": 2276264, + "step": 139, + "train_runtime": 1147.9453, + "train_tokens_per_second": 1982.903 + }, + { + "epoch": 0.038781163434903045, + "grad_norm": 0.6241163611412048, + "learning_rate": 3.850415512465374e-05, + "loss": 0.6337404251098633, + "num_input_tokens_seen": 2292640, + "step": 140, + "train_runtime": 1156.1815, + "train_tokens_per_second": 1982.941 + }, + { + "epoch": 0.03905817174515235, + "grad_norm": 0.6121036410331726, + "learning_rate": 3.878116343490305e-05, + "loss": 0.6775527000427246, + "num_input_tokens_seen": 2309016, + "step": 141, + "train_runtime": 1164.4103, + "train_tokens_per_second": 1982.992 + }, + { + "epoch": 0.03933518005540166, + "grad_norm": 0.6845881342887878, + "learning_rate": 3.905817174515236e-05, + "loss": 0.6503578424453735, + "num_input_tokens_seen": 2325392, + "step": 142, + "train_runtime": 1172.6399, + "train_tokens_per_second": 1983.04 + }, + { + "epoch": 0.03961218836565097, + "grad_norm": 0.8014799356460571, + "learning_rate": 3.933518005540167e-05, + "loss": 0.7017160654067993, + "num_input_tokens_seen": 2341768, + "step": 143, + "train_runtime": 1180.8604, + "train_tokens_per_second": 1983.103 + }, + { + "epoch": 0.039889196675900275, + "grad_norm": 0.6656845211982727, + "learning_rate": 3.961218836565097e-05, + "loss": 0.6555854082107544, + "num_input_tokens_seen": 2358144, + "step": 144, + "train_runtime": 1189.0904, + "train_tokens_per_second": 1983.149 + }, + { + "epoch": 0.04016620498614958, + "grad_norm": 0.6964138746261597, + "learning_rate": 3.988919667590028e-05, + "loss": 0.7460485696792603, + "num_input_tokens_seen": 2374520, + "step": 145, + "train_runtime": 1197.3197, + "train_tokens_per_second": 1983.196 + }, + { + "epoch": 0.04044321329639889, + "grad_norm": 0.6211307644844055, + "learning_rate": 4.0166204986149584e-05, + "loss": 0.6867573261260986, + "num_input_tokens_seen": 2390896, + "step": 146, + "train_runtime": 1205.5464, + "train_tokens_per_second": 1983.247 + }, + { + "epoch": 0.0407202216066482, + "grad_norm": 0.603557825088501, + "learning_rate": 4.044321329639889e-05, + "loss": 0.643031120300293, + "num_input_tokens_seen": 2407272, + "step": 147, + "train_runtime": 1213.7697, + "train_tokens_per_second": 1983.302 + }, + { + "epoch": 0.040997229916897505, + "grad_norm": 0.5553196668624878, + "learning_rate": 4.07202216066482e-05, + "loss": 0.6040917634963989, + "num_input_tokens_seen": 2423648, + "step": 148, + "train_runtime": 1221.9986, + "train_tokens_per_second": 1983.348 + }, + { + "epoch": 0.04127423822714681, + "grad_norm": 0.6502822637557983, + "learning_rate": 4.099722991689751e-05, + "loss": 0.5840516686439514, + "num_input_tokens_seen": 2440024, + "step": 149, + "train_runtime": 1230.2252, + "train_tokens_per_second": 1983.396 + }, + { + "epoch": 0.04155124653739612, + "grad_norm": 0.5683599710464478, + "learning_rate": 4.1274238227146816e-05, + "loss": 0.652279257774353, + "num_input_tokens_seen": 2456400, + "step": 150, + "train_runtime": 1238.4568, + "train_tokens_per_second": 1983.436 + }, + { + "epoch": 0.04182825484764543, + "grad_norm": 0.6586402654647827, + "learning_rate": 4.155124653739612e-05, + "loss": 0.6302953958511353, + "num_input_tokens_seen": 2472776, + "step": 151, + "train_runtime": 1246.6806, + "train_tokens_per_second": 1983.488 + }, + { + "epoch": 0.042105263157894736, + "grad_norm": 0.6843027472496033, + "learning_rate": 4.1828254847645435e-05, + "loss": 0.5746461153030396, + "num_input_tokens_seen": 2489152, + "step": 152, + "train_runtime": 1254.9157, + "train_tokens_per_second": 1983.521 + }, + { + "epoch": 0.042382271468144044, + "grad_norm": 0.6432904005050659, + "learning_rate": 4.210526315789474e-05, + "loss": 0.5630192160606384, + "num_input_tokens_seen": 2505528, + "step": 153, + "train_runtime": 1263.1418, + "train_tokens_per_second": 1983.568 + }, + { + "epoch": 0.04265927977839335, + "grad_norm": 0.6768400073051453, + "learning_rate": 4.238227146814405e-05, + "loss": 0.6090478301048279, + "num_input_tokens_seen": 2521904, + "step": 154, + "train_runtime": 1271.3642, + "train_tokens_per_second": 1983.62 + }, + { + "epoch": 0.04293628808864266, + "grad_norm": 0.835658848285675, + "learning_rate": 4.265927977839335e-05, + "loss": 0.5923334360122681, + "num_input_tokens_seen": 2538280, + "step": 155, + "train_runtime": 1279.5908, + "train_tokens_per_second": 1983.665 + }, + { + "epoch": 0.043213296398891966, + "grad_norm": 0.8492304682731628, + "learning_rate": 4.293628808864266e-05, + "loss": 0.6741846799850464, + "num_input_tokens_seen": 2554656, + "step": 156, + "train_runtime": 1287.8169, + "train_tokens_per_second": 1983.711 + }, + { + "epoch": 0.043490304709141274, + "grad_norm": 0.7162511944770813, + "learning_rate": 4.321329639889197e-05, + "loss": 0.5312638282775879, + "num_input_tokens_seen": 2571032, + "step": 157, + "train_runtime": 1296.0326, + "train_tokens_per_second": 1983.771 + }, + { + "epoch": 0.04376731301939058, + "grad_norm": 0.6748389601707458, + "learning_rate": 4.349030470914128e-05, + "loss": 0.5988590717315674, + "num_input_tokens_seen": 2587408, + "step": 158, + "train_runtime": 1304.2425, + "train_tokens_per_second": 1983.84 + }, + { + "epoch": 0.04404432132963989, + "grad_norm": 0.8594232797622681, + "learning_rate": 4.376731301939058e-05, + "loss": 0.44427329301834106, + "num_input_tokens_seen": 2603784, + "step": 159, + "train_runtime": 1312.4561, + "train_tokens_per_second": 1983.902 + }, + { + "epoch": 0.0443213296398892, + "grad_norm": 0.7662527561187744, + "learning_rate": 4.404432132963989e-05, + "loss": 0.6289910078048706, + "num_input_tokens_seen": 2620160, + "step": 160, + "train_runtime": 1320.6736, + "train_tokens_per_second": 1983.957 + }, + { + "epoch": 0.044598337950138504, + "grad_norm": 0.7507277727127075, + "learning_rate": 4.4321329639889195e-05, + "loss": 0.5615860819816589, + "num_input_tokens_seen": 2636536, + "step": 161, + "train_runtime": 1328.9034, + "train_tokens_per_second": 1983.994 + }, + { + "epoch": 0.04487534626038781, + "grad_norm": 0.721984326839447, + "learning_rate": 4.459833795013851e-05, + "loss": 0.53988116979599, + "num_input_tokens_seen": 2652912, + "step": 162, + "train_runtime": 1337.133, + "train_tokens_per_second": 1984.03 + }, + { + "epoch": 0.04515235457063712, + "grad_norm": 0.8825350403785706, + "learning_rate": 4.4875346260387814e-05, + "loss": 0.6807432174682617, + "num_input_tokens_seen": 2669288, + "step": 163, + "train_runtime": 1345.3573, + "train_tokens_per_second": 1984.074 + }, + { + "epoch": 0.04542936288088643, + "grad_norm": 0.6342299580574036, + "learning_rate": 4.5152354570637124e-05, + "loss": 0.4914466142654419, + "num_input_tokens_seen": 2685664, + "step": 164, + "train_runtime": 1353.5758, + "train_tokens_per_second": 1984.125 + }, + { + "epoch": 0.045706371191135735, + "grad_norm": 0.7735069990158081, + "learning_rate": 4.542936288088643e-05, + "loss": 0.5724454522132874, + "num_input_tokens_seen": 2702040, + "step": 165, + "train_runtime": 1361.7832, + "train_tokens_per_second": 1984.193 + }, + { + "epoch": 0.04598337950138504, + "grad_norm": 0.6329130530357361, + "learning_rate": 4.5706371191135736e-05, + "loss": 0.5151958465576172, + "num_input_tokens_seen": 2718416, + "step": 166, + "train_runtime": 1369.992, + "train_tokens_per_second": 1984.257 + }, + { + "epoch": 0.04626038781163435, + "grad_norm": 0.8033583760261536, + "learning_rate": 4.5983379501385046e-05, + "loss": 0.5595269203186035, + "num_input_tokens_seen": 2734792, + "step": 167, + "train_runtime": 1378.1996, + "train_tokens_per_second": 1984.322 + }, + { + "epoch": 0.04653739612188366, + "grad_norm": 0.7229690551757812, + "learning_rate": 4.6260387811634356e-05, + "loss": 0.5408182144165039, + "num_input_tokens_seen": 2751168, + "step": 168, + "train_runtime": 1386.4026, + "train_tokens_per_second": 1984.393 + }, + { + "epoch": 0.046814404432132965, + "grad_norm": 0.6378253102302551, + "learning_rate": 4.653739612188366e-05, + "loss": 0.41595572233200073, + "num_input_tokens_seen": 2767544, + "step": 169, + "train_runtime": 1394.6111, + "train_tokens_per_second": 1984.456 + }, + { + "epoch": 0.04709141274238227, + "grad_norm": 0.7234790921211243, + "learning_rate": 4.681440443213297e-05, + "loss": 0.460693359375, + "num_input_tokens_seen": 2783920, + "step": 170, + "train_runtime": 1402.8155, + "train_tokens_per_second": 1984.523 + }, + { + "epoch": 0.04736842105263158, + "grad_norm": 0.7319244146347046, + "learning_rate": 4.709141274238227e-05, + "loss": 0.48342108726501465, + "num_input_tokens_seen": 2800296, + "step": 171, + "train_runtime": 1411.0239, + "train_tokens_per_second": 1984.584 + }, + { + "epoch": 0.04764542936288089, + "grad_norm": 1.007758378982544, + "learning_rate": 4.736842105263158e-05, + "loss": 0.5681762099266052, + "num_input_tokens_seen": 2816672, + "step": 172, + "train_runtime": 1419.2416, + "train_tokens_per_second": 1984.632 + }, + { + "epoch": 0.047922437673130196, + "grad_norm": 0.6794661283493042, + "learning_rate": 4.764542936288089e-05, + "loss": 0.46674343943595886, + "num_input_tokens_seen": 2833048, + "step": 173, + "train_runtime": 1427.4626, + "train_tokens_per_second": 1984.674 + }, + { + "epoch": 0.0481994459833795, + "grad_norm": 0.7622226476669312, + "learning_rate": 4.7922437673130193e-05, + "loss": 0.410085529088974, + "num_input_tokens_seen": 2849424, + "step": 174, + "train_runtime": 1435.6872, + "train_tokens_per_second": 1984.711 + }, + { + "epoch": 0.04847645429362881, + "grad_norm": 0.8027065396308899, + "learning_rate": 4.81994459833795e-05, + "loss": 0.5102354884147644, + "num_input_tokens_seen": 2865800, + "step": 175, + "train_runtime": 1443.9131, + "train_tokens_per_second": 1984.746 + }, + { + "epoch": 0.04875346260387812, + "grad_norm": 0.8335910439491272, + "learning_rate": 4.8476454293628806e-05, + "loss": 0.48492008447647095, + "num_input_tokens_seen": 2882176, + "step": 176, + "train_runtime": 1452.1288, + "train_tokens_per_second": 1984.794 + }, + { + "epoch": 0.049030470914127426, + "grad_norm": 0.7958452105522156, + "learning_rate": 4.875346260387812e-05, + "loss": 0.4718579947948456, + "num_input_tokens_seen": 2898552, + "step": 177, + "train_runtime": 1460.3445, + "train_tokens_per_second": 1984.841 + }, + { + "epoch": 0.049307479224376734, + "grad_norm": 0.7302278876304626, + "learning_rate": 4.9030470914127425e-05, + "loss": 0.4404051601886749, + "num_input_tokens_seen": 2914928, + "step": 178, + "train_runtime": 1468.57, + "train_tokens_per_second": 1984.875 + }, + { + "epoch": 0.04958448753462604, + "grad_norm": 0.7446356415748596, + "learning_rate": 4.9307479224376735e-05, + "loss": 0.3907979130744934, + "num_input_tokens_seen": 2931304, + "step": 179, + "train_runtime": 1476.7943, + "train_tokens_per_second": 1984.91 + }, + { + "epoch": 0.04986149584487535, + "grad_norm": 0.7327796220779419, + "learning_rate": 4.958448753462604e-05, + "loss": 0.4673866331577301, + "num_input_tokens_seen": 2947680, + "step": 180, + "train_runtime": 1485.0068, + "train_tokens_per_second": 1984.961 + }, + { + "epoch": 0.05013850415512466, + "grad_norm": 0.8006020784378052, + "learning_rate": 4.986149584487535e-05, + "loss": 0.4191367030143738, + "num_input_tokens_seen": 2964056, + "step": 181, + "train_runtime": 1493.2113, + "train_tokens_per_second": 1985.021 + }, + { + "epoch": 0.050415512465373964, + "grad_norm": 0.7394649386405945, + "learning_rate": 5.013850415512466e-05, + "loss": 0.4343605637550354, + "num_input_tokens_seen": 2980432, + "step": 182, + "train_runtime": 1501.4208, + "train_tokens_per_second": 1985.074 + }, + { + "epoch": 0.05069252077562327, + "grad_norm": 0.770579993724823, + "learning_rate": 5.041551246537396e-05, + "loss": 0.39876240491867065, + "num_input_tokens_seen": 2996808, + "step": 183, + "train_runtime": 1509.6458, + "train_tokens_per_second": 1985.107 + }, + { + "epoch": 0.05096952908587258, + "grad_norm": 0.8138972520828247, + "learning_rate": 5.0692520775623277e-05, + "loss": 0.4310626983642578, + "num_input_tokens_seen": 3013184, + "step": 184, + "train_runtime": 1517.8623, + "train_tokens_per_second": 1985.15 + }, + { + "epoch": 0.05124653739612189, + "grad_norm": 0.7137347459793091, + "learning_rate": 5.096952908587258e-05, + "loss": 0.37273189425468445, + "num_input_tokens_seen": 3029560, + "step": 185, + "train_runtime": 1526.0692, + "train_tokens_per_second": 1985.205 + }, + { + "epoch": 0.05152354570637119, + "grad_norm": 0.8495726585388184, + "learning_rate": 5.124653739612188e-05, + "loss": 0.3868299722671509, + "num_input_tokens_seen": 3045936, + "step": 186, + "train_runtime": 1534.2772, + "train_tokens_per_second": 1985.258 + }, + { + "epoch": 0.051800554016620495, + "grad_norm": 0.8363239765167236, + "learning_rate": 5.152354570637119e-05, + "loss": 0.34481367468833923, + "num_input_tokens_seen": 3062312, + "step": 187, + "train_runtime": 1542.4786, + "train_tokens_per_second": 1985.319 + }, + { + "epoch": 0.0520775623268698, + "grad_norm": 0.8199629187583923, + "learning_rate": 5.180055401662051e-05, + "loss": 0.47626566886901855, + "num_input_tokens_seen": 3078688, + "step": 188, + "train_runtime": 1550.6926, + "train_tokens_per_second": 1985.363 + }, + { + "epoch": 0.05235457063711911, + "grad_norm": 0.8818345665931702, + "learning_rate": 5.207756232686981e-05, + "loss": 0.42213356494903564, + "num_input_tokens_seen": 3095064, + "step": 189, + "train_runtime": 1558.9147, + "train_tokens_per_second": 1985.397 + }, + { + "epoch": 0.05263157894736842, + "grad_norm": 0.7775227427482605, + "learning_rate": 5.2354570637119114e-05, + "loss": 0.40203213691711426, + "num_input_tokens_seen": 3111440, + "step": 190, + "train_runtime": 1567.1381, + "train_tokens_per_second": 1985.428 + }, + { + "epoch": 0.052908587257617726, + "grad_norm": 0.8110833764076233, + "learning_rate": 5.2631578947368424e-05, + "loss": 0.3831946551799774, + "num_input_tokens_seen": 3127816, + "step": 191, + "train_runtime": 1575.3565, + "train_tokens_per_second": 1985.465 + }, + { + "epoch": 0.05318559556786703, + "grad_norm": 0.7839464545249939, + "learning_rate": 5.2908587257617734e-05, + "loss": 0.38302338123321533, + "num_input_tokens_seen": 3144192, + "step": 192, + "train_runtime": 1583.5677, + "train_tokens_per_second": 1985.512 + }, + { + "epoch": 0.05346260387811634, + "grad_norm": 0.8701349496841431, + "learning_rate": 5.318559556786704e-05, + "loss": 0.3388274312019348, + "num_input_tokens_seen": 3160568, + "step": 193, + "train_runtime": 1591.7913, + "train_tokens_per_second": 1985.542 + }, + { + "epoch": 0.05373961218836565, + "grad_norm": 0.7987439036369324, + "learning_rate": 5.3462603878116346e-05, + "loss": 0.371830016374588, + "num_input_tokens_seen": 3176944, + "step": 194, + "train_runtime": 1600.0249, + "train_tokens_per_second": 1985.559 + }, + { + "epoch": 0.054016620498614956, + "grad_norm": 1.0139018297195435, + "learning_rate": 5.373961218836565e-05, + "loss": 0.3299805521965027, + "num_input_tokens_seen": 3193320, + "step": 195, + "train_runtime": 1608.2568, + "train_tokens_per_second": 1985.578 + }, + { + "epoch": 0.054293628808864264, + "grad_norm": 0.9826958775520325, + "learning_rate": 5.401662049861496e-05, + "loss": 0.37591779232025146, + "num_input_tokens_seen": 3209696, + "step": 196, + "train_runtime": 1616.4865, + "train_tokens_per_second": 1985.6 + }, + { + "epoch": 0.05457063711911357, + "grad_norm": 0.8998488187789917, + "learning_rate": 5.4293628808864275e-05, + "loss": 0.3858615756034851, + "num_input_tokens_seen": 3226072, + "step": 197, + "train_runtime": 1624.7084, + "train_tokens_per_second": 1985.631 + }, + { + "epoch": 0.05484764542936288, + "grad_norm": 0.8115880489349365, + "learning_rate": 5.457063711911358e-05, + "loss": 0.3694916367530823, + "num_input_tokens_seen": 3242448, + "step": 198, + "train_runtime": 1632.9347, + "train_tokens_per_second": 1985.657 + }, + { + "epoch": 0.05512465373961219, + "grad_norm": 0.8328195214271545, + "learning_rate": 5.484764542936288e-05, + "loss": 0.35051947832107544, + "num_input_tokens_seen": 3258824, + "step": 199, + "train_runtime": 1641.1626, + "train_tokens_per_second": 1985.68 + }, + { + "epoch": 0.055401662049861494, + "grad_norm": 0.9102795124053955, + "learning_rate": 5.5124653739612184e-05, + "loss": 0.35309845209121704, + "num_input_tokens_seen": 3275200, + "step": 200, + "train_runtime": 1649.3798, + "train_tokens_per_second": 1985.716 + }, + { + "epoch": 0.0556786703601108, + "grad_norm": 0.6769919395446777, + "learning_rate": 5.54016620498615e-05, + "loss": 0.28506797552108765, + "num_input_tokens_seen": 3291576, + "step": 201, + "train_runtime": 1659.0979, + "train_tokens_per_second": 1983.955 + }, + { + "epoch": 0.05595567867036011, + "grad_norm": 0.8542789816856384, + "learning_rate": 5.567867036011081e-05, + "loss": 0.32119032740592957, + "num_input_tokens_seen": 3307952, + "step": 202, + "train_runtime": 1667.3189, + "train_tokens_per_second": 1983.995 + }, + { + "epoch": 0.05623268698060942, + "grad_norm": 1.0241471529006958, + "learning_rate": 5.595567867036011e-05, + "loss": 0.35830721259117126, + "num_input_tokens_seen": 3324328, + "step": 203, + "train_runtime": 1675.5466, + "train_tokens_per_second": 1984.026 + }, + { + "epoch": 0.056509695290858725, + "grad_norm": 0.9385203719139099, + "learning_rate": 5.6232686980609416e-05, + "loss": 0.33349379897117615, + "num_input_tokens_seen": 3340704, + "step": 204, + "train_runtime": 1683.7696, + "train_tokens_per_second": 1984.062 + }, + { + "epoch": 0.05678670360110803, + "grad_norm": 0.9214120507240295, + "learning_rate": 5.650969529085873e-05, + "loss": 0.32935604453086853, + "num_input_tokens_seen": 3357080, + "step": 205, + "train_runtime": 1691.9897, + "train_tokens_per_second": 1984.102 + }, + { + "epoch": 0.05706371191135734, + "grad_norm": 0.9299399256706238, + "learning_rate": 5.6786703601108035e-05, + "loss": 0.34500712156295776, + "num_input_tokens_seen": 3373456, + "step": 206, + "train_runtime": 1700.2144, + "train_tokens_per_second": 1984.136 + }, + { + "epoch": 0.05734072022160665, + "grad_norm": 0.8989976644515991, + "learning_rate": 5.7063711911357345e-05, + "loss": 0.29620489478111267, + "num_input_tokens_seen": 3389832, + "step": 207, + "train_runtime": 1708.4409, + "train_tokens_per_second": 1984.167 + }, + { + "epoch": 0.057617728531855955, + "grad_norm": 0.9352446794509888, + "learning_rate": 5.734072022160665e-05, + "loss": 0.32714247703552246, + "num_input_tokens_seen": 3406208, + "step": 208, + "train_runtime": 1716.6667, + "train_tokens_per_second": 1984.199 + }, + { + "epoch": 0.05789473684210526, + "grad_norm": 0.866721510887146, + "learning_rate": 5.7617728531855964e-05, + "loss": 0.29009270668029785, + "num_input_tokens_seen": 3422584, + "step": 209, + "train_runtime": 1724.8947, + "train_tokens_per_second": 1984.228 + }, + { + "epoch": 0.05817174515235457, + "grad_norm": 0.8501016497612, + "learning_rate": 5.789473684210527e-05, + "loss": 0.28530213236808777, + "num_input_tokens_seen": 3438960, + "step": 210, + "train_runtime": 1733.1195, + "train_tokens_per_second": 1984.26 + }, + { + "epoch": 0.05844875346260388, + "grad_norm": 0.923227846622467, + "learning_rate": 5.817174515235457e-05, + "loss": 0.30728232860565186, + "num_input_tokens_seen": 3455336, + "step": 211, + "train_runtime": 1741.3578, + "train_tokens_per_second": 1984.277 + }, + { + "epoch": 0.058725761772853186, + "grad_norm": 0.8790254592895508, + "learning_rate": 5.844875346260388e-05, + "loss": 0.2652836740016937, + "num_input_tokens_seen": 3471712, + "step": 212, + "train_runtime": 1749.5878, + "train_tokens_per_second": 1984.303 + }, + { + "epoch": 0.05900277008310249, + "grad_norm": 0.9197275638580322, + "learning_rate": 5.8725761772853196e-05, + "loss": 0.28842446208000183, + "num_input_tokens_seen": 3488088, + "step": 213, + "train_runtime": 1757.8239, + "train_tokens_per_second": 1984.322 + }, + { + "epoch": 0.0592797783933518, + "grad_norm": 1.0009058713912964, + "learning_rate": 5.90027700831025e-05, + "loss": 0.2721683979034424, + "num_input_tokens_seen": 3504464, + "step": 214, + "train_runtime": 1766.0557, + "train_tokens_per_second": 1984.345 + }, + { + "epoch": 0.05955678670360111, + "grad_norm": 1.1590752601623535, + "learning_rate": 5.92797783933518e-05, + "loss": 0.3538288474082947, + "num_input_tokens_seen": 3520840, + "step": 215, + "train_runtime": 1774.2865, + "train_tokens_per_second": 1984.37 + }, + { + "epoch": 0.059833795013850416, + "grad_norm": 1.3458961248397827, + "learning_rate": 5.955678670360111e-05, + "loss": 0.3757060468196869, + "num_input_tokens_seen": 3537216, + "step": 216, + "train_runtime": 1782.5102, + "train_tokens_per_second": 1984.402 + }, + { + "epoch": 0.060110803324099724, + "grad_norm": 0.8640219569206238, + "learning_rate": 5.9833795013850414e-05, + "loss": 0.21398551762104034, + "num_input_tokens_seen": 3553592, + "step": 217, + "train_runtime": 1790.7362, + "train_tokens_per_second": 1984.431 + }, + { + "epoch": 0.06038781163434903, + "grad_norm": 1.1077229976654053, + "learning_rate": 6.011080332409973e-05, + "loss": 0.34661075472831726, + "num_input_tokens_seen": 3569968, + "step": 218, + "train_runtime": 1798.9609, + "train_tokens_per_second": 1984.461 + }, + { + "epoch": 0.06066481994459834, + "grad_norm": 0.9336462020874023, + "learning_rate": 6.0387811634349034e-05, + "loss": 0.24630483984947205, + "num_input_tokens_seen": 3586344, + "step": 219, + "train_runtime": 1807.1925, + "train_tokens_per_second": 1984.484 + }, + { + "epoch": 0.060941828254847646, + "grad_norm": 0.8975627422332764, + "learning_rate": 6.0664819944598337e-05, + "loss": 0.33064141869544983, + "num_input_tokens_seen": 3602720, + "step": 220, + "train_runtime": 1815.4241, + "train_tokens_per_second": 1984.506 + }, + { + "epoch": 0.061218836565096954, + "grad_norm": 0.8667930960655212, + "learning_rate": 6.0941828254847646e-05, + "loss": 0.2380654215812683, + "num_input_tokens_seen": 3619096, + "step": 221, + "train_runtime": 1823.6467, + "train_tokens_per_second": 1984.538 + }, + { + "epoch": 0.06149584487534626, + "grad_norm": 0.9911587834358215, + "learning_rate": 6.121883656509696e-05, + "loss": 0.2708589434623718, + "num_input_tokens_seen": 3635472, + "step": 222, + "train_runtime": 1831.871, + "train_tokens_per_second": 1984.568 + }, + { + "epoch": 0.06177285318559557, + "grad_norm": 0.9351817965507507, + "learning_rate": 6.149584487534626e-05, + "loss": 0.32700833678245544, + "num_input_tokens_seen": 3651848, + "step": 223, + "train_runtime": 1840.0941, + "train_tokens_per_second": 1984.598 + }, + { + "epoch": 0.06204986149584488, + "grad_norm": 0.9969749450683594, + "learning_rate": 6.177285318559557e-05, + "loss": 0.2831845283508301, + "num_input_tokens_seen": 3668224, + "step": 224, + "train_runtime": 1848.3251, + "train_tokens_per_second": 1984.621 + }, + { + "epoch": 0.062326869806094184, + "grad_norm": 0.9704735279083252, + "learning_rate": 6.204986149584488e-05, + "loss": 0.268660306930542, + "num_input_tokens_seen": 3684600, + "step": 225, + "train_runtime": 1856.5563, + "train_tokens_per_second": 1984.642 + }, + { + "epoch": 0.06260387811634349, + "grad_norm": 1.0723077058792114, + "learning_rate": 6.232686980609419e-05, + "loss": 0.3679491877555847, + "num_input_tokens_seen": 3700976, + "step": 226, + "train_runtime": 1864.7804, + "train_tokens_per_second": 1984.671 + }, + { + "epoch": 0.0628808864265928, + "grad_norm": 0.9896561503410339, + "learning_rate": 6.26038781163435e-05, + "loss": 0.24680808186531067, + "num_input_tokens_seen": 3717352, + "step": 227, + "train_runtime": 1873.0032, + "train_tokens_per_second": 1984.701 + }, + { + "epoch": 0.06315789473684211, + "grad_norm": 0.9291359186172485, + "learning_rate": 6.28808864265928e-05, + "loss": 0.233191579580307, + "num_input_tokens_seen": 3733728, + "step": 228, + "train_runtime": 1881.2332, + "train_tokens_per_second": 1984.724 + }, + { + "epoch": 0.06343490304709141, + "grad_norm": 0.7933772206306458, + "learning_rate": 6.31578947368421e-05, + "loss": 0.17709730565547943, + "num_input_tokens_seen": 3750104, + "step": 229, + "train_runtime": 1889.4549, + "train_tokens_per_second": 1984.754 + }, + { + "epoch": 0.06371191135734072, + "grad_norm": 0.9626915454864502, + "learning_rate": 6.343490304709143e-05, + "loss": 0.26448825001716614, + "num_input_tokens_seen": 3766480, + "step": 230, + "train_runtime": 1897.665, + "train_tokens_per_second": 1984.797 + }, + { + "epoch": 0.06398891966759003, + "grad_norm": 0.902040421962738, + "learning_rate": 6.371191135734072e-05, + "loss": 0.2375335991382599, + "num_input_tokens_seen": 3782856, + "step": 231, + "train_runtime": 1905.8743, + "train_tokens_per_second": 1984.84 + }, + { + "epoch": 0.06426592797783934, + "grad_norm": 0.8851885199546814, + "learning_rate": 6.398891966759003e-05, + "loss": 0.23321346938610077, + "num_input_tokens_seen": 3799232, + "step": 232, + "train_runtime": 1914.0831, + "train_tokens_per_second": 1984.883 + }, + { + "epoch": 0.06454293628808865, + "grad_norm": 0.8996667265892029, + "learning_rate": 6.426592797783934e-05, + "loss": 0.2802973985671997, + "num_input_tokens_seen": 3815608, + "step": 233, + "train_runtime": 1922.3039, + "train_tokens_per_second": 1984.914 + }, + { + "epoch": 0.06481994459833795, + "grad_norm": 1.091469168663025, + "learning_rate": 6.454293628808865e-05, + "loss": 0.266166627407074, + "num_input_tokens_seen": 3831984, + "step": 234, + "train_runtime": 1930.5369, + "train_tokens_per_second": 1984.932 + }, + { + "epoch": 0.06509695290858726, + "grad_norm": 0.9449445009231567, + "learning_rate": 6.481994459833796e-05, + "loss": 0.23764996230602264, + "num_input_tokens_seen": 3848360, + "step": 235, + "train_runtime": 1938.764, + "train_tokens_per_second": 1984.955 + }, + { + "epoch": 0.06537396121883657, + "grad_norm": 1.1320970058441162, + "learning_rate": 6.509695290858726e-05, + "loss": 0.23096317052841187, + "num_input_tokens_seen": 3864736, + "step": 236, + "train_runtime": 1946.9904, + "train_tokens_per_second": 1984.98 + }, + { + "epoch": 0.06565096952908588, + "grad_norm": 0.8096081018447876, + "learning_rate": 6.537396121883657e-05, + "loss": 0.15398479998111725, + "num_input_tokens_seen": 3881112, + "step": 237, + "train_runtime": 1955.2176, + "train_tokens_per_second": 1985.003 + }, + { + "epoch": 0.06592797783933518, + "grad_norm": 0.942337691783905, + "learning_rate": 6.565096952908588e-05, + "loss": 0.24263320863246918, + "num_input_tokens_seen": 3897488, + "step": 238, + "train_runtime": 1963.4567, + "train_tokens_per_second": 1985.014 + }, + { + "epoch": 0.06620498614958449, + "grad_norm": 1.0381815433502197, + "learning_rate": 6.592797783933519e-05, + "loss": 0.2181507647037506, + "num_input_tokens_seen": 3913864, + "step": 239, + "train_runtime": 1971.6824, + "train_tokens_per_second": 1985.038 + }, + { + "epoch": 0.0664819944598338, + "grad_norm": 0.9946973323822021, + "learning_rate": 6.62049861495845e-05, + "loss": 0.20501570403575897, + "num_input_tokens_seen": 3930240, + "step": 240, + "train_runtime": 1979.8992, + "train_tokens_per_second": 1985.071 + }, + { + "epoch": 0.0667590027700831, + "grad_norm": 1.2149981260299683, + "learning_rate": 6.648199445983379e-05, + "loss": 0.20167860388755798, + "num_input_tokens_seen": 3946616, + "step": 241, + "train_runtime": 1988.1145, + "train_tokens_per_second": 1985.105 + }, + { + "epoch": 0.06703601108033241, + "grad_norm": 0.7778007388114929, + "learning_rate": 6.67590027700831e-05, + "loss": 0.15157456696033478, + "num_input_tokens_seen": 3962992, + "step": 242, + "train_runtime": 1996.3416, + "train_tokens_per_second": 1985.127 + }, + { + "epoch": 0.06731301939058172, + "grad_norm": 0.9597538709640503, + "learning_rate": 6.703601108033241e-05, + "loss": 0.1881304383277893, + "num_input_tokens_seen": 3979368, + "step": 243, + "train_runtime": 2004.5645, + "train_tokens_per_second": 1985.153 + }, + { + "epoch": 0.06759002770083103, + "grad_norm": 0.9158643484115601, + "learning_rate": 6.731301939058172e-05, + "loss": 0.22169294953346252, + "num_input_tokens_seen": 3995744, + "step": 244, + "train_runtime": 2012.7934, + "train_tokens_per_second": 1985.173 + }, + { + "epoch": 0.06786703601108034, + "grad_norm": 0.8323157429695129, + "learning_rate": 6.759002770083103e-05, + "loss": 0.1474589854478836, + "num_input_tokens_seen": 4012120, + "step": 245, + "train_runtime": 2021.0151, + "train_tokens_per_second": 1985.2 + }, + { + "epoch": 0.06814404432132964, + "grad_norm": 0.860209584236145, + "learning_rate": 6.786703601108033e-05, + "loss": 0.17631705105304718, + "num_input_tokens_seen": 4028496, + "step": 246, + "train_runtime": 2029.2318, + "train_tokens_per_second": 1985.232 + }, + { + "epoch": 0.06842105263157895, + "grad_norm": 1.2871291637420654, + "learning_rate": 6.814404432132965e-05, + "loss": 0.20548926293849945, + "num_input_tokens_seen": 4044872, + "step": 247, + "train_runtime": 2037.4578, + "train_tokens_per_second": 1985.254 + }, + { + "epoch": 0.06869806094182826, + "grad_norm": 0.8846837282180786, + "learning_rate": 6.842105263157895e-05, + "loss": 0.16354207694530487, + "num_input_tokens_seen": 4061248, + "step": 248, + "train_runtime": 2045.6941, + "train_tokens_per_second": 1985.266 + }, + { + "epoch": 0.06897506925207757, + "grad_norm": 0.8529284596443176, + "learning_rate": 6.869806094182826e-05, + "loss": 0.17933666706085205, + "num_input_tokens_seen": 4077624, + "step": 249, + "train_runtime": 2053.9301, + "train_tokens_per_second": 1985.279 + }, + { + "epoch": 0.06925207756232687, + "grad_norm": 0.844264805316925, + "learning_rate": 6.897506925207757e-05, + "loss": 0.14344659447669983, + "num_input_tokens_seen": 4094000, + "step": 250, + "train_runtime": 2062.167, + "train_tokens_per_second": 1985.29 + }, + { + "epoch": 0.06952908587257618, + "grad_norm": 1.0764896869659424, + "learning_rate": 6.925207756232688e-05, + "loss": 0.1679542362689972, + "num_input_tokens_seen": 4110376, + "step": 251, + "train_runtime": 2070.3833, + "train_tokens_per_second": 1985.321 + }, + { + "epoch": 0.06980609418282549, + "grad_norm": 1.452364206314087, + "learning_rate": 6.952908587257619e-05, + "loss": 0.2432863563299179, + "num_input_tokens_seen": 4126752, + "step": 252, + "train_runtime": 2078.7622, + "train_tokens_per_second": 1985.197 + }, + { + "epoch": 0.0700831024930748, + "grad_norm": 0.8690814971923828, + "learning_rate": 6.980609418282548e-05, + "loss": 0.1839189976453781, + "num_input_tokens_seen": 4143128, + "step": 253, + "train_runtime": 2087.0152, + "train_tokens_per_second": 1985.193 + }, + { + "epoch": 0.0703601108033241, + "grad_norm": 0.7490035891532898, + "learning_rate": 7.008310249307479e-05, + "loss": 0.12937016785144806, + "num_input_tokens_seen": 4159504, + "step": 254, + "train_runtime": 2095.2376, + "train_tokens_per_second": 1985.218 + }, + { + "epoch": 0.07063711911357341, + "grad_norm": 0.9276263117790222, + "learning_rate": 7.036011080332411e-05, + "loss": 0.2217385172843933, + "num_input_tokens_seen": 4175880, + "step": 255, + "train_runtime": 2103.4643, + "train_tokens_per_second": 1985.239 + }, + { + "epoch": 0.07091412742382272, + "grad_norm": 0.8929899334907532, + "learning_rate": 7.063711911357341e-05, + "loss": 0.2036968171596527, + "num_input_tokens_seen": 4192256, + "step": 256, + "train_runtime": 2111.6862, + "train_tokens_per_second": 1985.265 + }, + { + "epoch": 0.07119113573407203, + "grad_norm": 0.7868210077285767, + "learning_rate": 7.091412742382272e-05, + "loss": 0.1167093813419342, + "num_input_tokens_seen": 4208632, + "step": 257, + "train_runtime": 2119.9151, + "train_tokens_per_second": 1985.283 + }, + { + "epoch": 0.07146814404432134, + "grad_norm": 1.1176955699920654, + "learning_rate": 7.119113573407203e-05, + "loss": 0.14472581446170807, + "num_input_tokens_seen": 4225008, + "step": 258, + "train_runtime": 2128.14, + "train_tokens_per_second": 1985.305 + }, + { + "epoch": 0.07174515235457064, + "grad_norm": 0.7410415410995483, + "learning_rate": 7.146814404432133e-05, + "loss": 0.1166708767414093, + "num_input_tokens_seen": 4241384, + "step": 259, + "train_runtime": 2136.3685, + "train_tokens_per_second": 1985.324 + }, + { + "epoch": 0.07202216066481995, + "grad_norm": 0.9488644599914551, + "learning_rate": 7.174515235457065e-05, + "loss": 0.16376934945583344, + "num_input_tokens_seen": 4257760, + "step": 260, + "train_runtime": 2144.5902, + "train_tokens_per_second": 1985.349 + }, + { + "epoch": 0.07229916897506926, + "grad_norm": 1.026352047920227, + "learning_rate": 7.202216066481994e-05, + "loss": 0.1832423061132431, + "num_input_tokens_seen": 4274136, + "step": 261, + "train_runtime": 2152.8168, + "train_tokens_per_second": 1985.369 + }, + { + "epoch": 0.07257617728531857, + "grad_norm": 1.0023043155670166, + "learning_rate": 7.229916897506925e-05, + "loss": 0.21170593798160553, + "num_input_tokens_seen": 4290512, + "step": 262, + "train_runtime": 2161.0419, + "train_tokens_per_second": 1985.39 + }, + { + "epoch": 0.07285318559556787, + "grad_norm": 0.8630689382553101, + "learning_rate": 7.257617728531856e-05, + "loss": 0.11727003008127213, + "num_input_tokens_seen": 4306888, + "step": 263, + "train_runtime": 2169.2776, + "train_tokens_per_second": 1985.402 + }, + { + "epoch": 0.07313019390581718, + "grad_norm": 0.8117961287498474, + "learning_rate": 7.285318559556787e-05, + "loss": 0.12165223062038422, + "num_input_tokens_seen": 4323264, + "step": 264, + "train_runtime": 2177.5122, + "train_tokens_per_second": 1985.414 + }, + { + "epoch": 0.07340720221606649, + "grad_norm": 0.8872103095054626, + "learning_rate": 7.313019390581718e-05, + "loss": 0.13954338431358337, + "num_input_tokens_seen": 4339640, + "step": 265, + "train_runtime": 2185.7391, + "train_tokens_per_second": 1985.434 + }, + { + "epoch": 0.07368421052631578, + "grad_norm": 0.9249241948127747, + "learning_rate": 7.340720221606648e-05, + "loss": 0.1678493767976761, + "num_input_tokens_seen": 4356016, + "step": 266, + "train_runtime": 2193.9873, + "train_tokens_per_second": 1985.434 + }, + { + "epoch": 0.07396121883656509, + "grad_norm": 0.8554092049598694, + "learning_rate": 7.368421052631579e-05, + "loss": 0.09879390895366669, + "num_input_tokens_seen": 4372392, + "step": 267, + "train_runtime": 2202.2314, + "train_tokens_per_second": 1985.437 + }, + { + "epoch": 0.0742382271468144, + "grad_norm": 1.1033114194869995, + "learning_rate": 7.39612188365651e-05, + "loss": 0.12795543670654297, + "num_input_tokens_seen": 4388768, + "step": 268, + "train_runtime": 2210.4587, + "train_tokens_per_second": 1985.456 + }, + { + "epoch": 0.0745152354570637, + "grad_norm": 0.8365858197212219, + "learning_rate": 7.423822714681441e-05, + "loss": 0.11136002093553543, + "num_input_tokens_seen": 4405144, + "step": 269, + "train_runtime": 2218.686, + "train_tokens_per_second": 1985.474 + }, + { + "epoch": 0.07479224376731301, + "grad_norm": 1.1226816177368164, + "learning_rate": 7.451523545706372e-05, + "loss": 0.13193386793136597, + "num_input_tokens_seen": 4421520, + "step": 270, + "train_runtime": 2226.9216, + "train_tokens_per_second": 1985.485 + }, + { + "epoch": 0.07506925207756232, + "grad_norm": 0.9901308417320251, + "learning_rate": 7.479224376731301e-05, + "loss": 0.14404615759849548, + "num_input_tokens_seen": 4437896, + "step": 271, + "train_runtime": 2235.1413, + "train_tokens_per_second": 1985.51 + }, + { + "epoch": 0.07534626038781163, + "grad_norm": 0.8998082876205444, + "learning_rate": 7.506925207756234e-05, + "loss": 0.13489121198654175, + "num_input_tokens_seen": 4454272, + "step": 272, + "train_runtime": 2243.3699, + "train_tokens_per_second": 1985.527 + }, + { + "epoch": 0.07562326869806094, + "grad_norm": 0.9557387828826904, + "learning_rate": 7.534626038781163e-05, + "loss": 0.14852826297283173, + "num_input_tokens_seen": 4470648, + "step": 273, + "train_runtime": 2251.6045, + "train_tokens_per_second": 1985.539 + }, + { + "epoch": 0.07590027700831024, + "grad_norm": 0.8634775280952454, + "learning_rate": 7.562326869806094e-05, + "loss": 0.13415108621120453, + "num_input_tokens_seen": 4487024, + "step": 274, + "train_runtime": 2259.8303, + "train_tokens_per_second": 1985.558 + }, + { + "epoch": 0.07617728531855955, + "grad_norm": 0.9254340529441833, + "learning_rate": 7.590027700831025e-05, + "loss": 0.1311567723751068, + "num_input_tokens_seen": 4503400, + "step": 275, + "train_runtime": 2268.0589, + "train_tokens_per_second": 1985.575 + }, + { + "epoch": 0.07645429362880886, + "grad_norm": 0.7731362581253052, + "learning_rate": 7.617728531855956e-05, + "loss": 0.1262234002351761, + "num_input_tokens_seen": 4519776, + "step": 276, + "train_runtime": 2276.2885, + "train_tokens_per_second": 1985.59 + }, + { + "epoch": 0.07673130193905817, + "grad_norm": 0.8197715282440186, + "learning_rate": 7.645429362880887e-05, + "loss": 0.11852206289768219, + "num_input_tokens_seen": 4536152, + "step": 277, + "train_runtime": 2284.5148, + "train_tokens_per_second": 1985.609 + }, + { + "epoch": 0.07700831024930747, + "grad_norm": 1.1195882558822632, + "learning_rate": 7.673130193905818e-05, + "loss": 0.15502828359603882, + "num_input_tokens_seen": 4552528, + "step": 278, + "train_runtime": 2292.7474, + "train_tokens_per_second": 1985.621 + }, + { + "epoch": 0.07728531855955678, + "grad_norm": 0.9957787990570068, + "learning_rate": 7.700831024930748e-05, + "loss": 0.16502322256565094, + "num_input_tokens_seen": 4568904, + "step": 279, + "train_runtime": 2300.9816, + "train_tokens_per_second": 1985.633 + }, + { + "epoch": 0.07756232686980609, + "grad_norm": 0.8210909962654114, + "learning_rate": 7.728531855955679e-05, + "loss": 0.12436240911483765, + "num_input_tokens_seen": 4585280, + "step": 280, + "train_runtime": 2309.2162, + "train_tokens_per_second": 1985.643 + }, + { + "epoch": 0.0778393351800554, + "grad_norm": 0.8228148221969604, + "learning_rate": 7.75623268698061e-05, + "loss": 0.14675581455230713, + "num_input_tokens_seen": 4601656, + "step": 281, + "train_runtime": 2317.4438, + "train_tokens_per_second": 1985.66 + }, + { + "epoch": 0.0781163434903047, + "grad_norm": 0.8989976048469543, + "learning_rate": 7.783933518005541e-05, + "loss": 0.13241221010684967, + "num_input_tokens_seen": 4618032, + "step": 282, + "train_runtime": 2325.6673, + "train_tokens_per_second": 1985.68 + }, + { + "epoch": 0.07839335180055401, + "grad_norm": 0.7516988515853882, + "learning_rate": 7.811634349030472e-05, + "loss": 0.11735007911920547, + "num_input_tokens_seen": 4634408, + "step": 283, + "train_runtime": 2333.8971, + "train_tokens_per_second": 1985.695 + }, + { + "epoch": 0.07867036011080332, + "grad_norm": 1.07669198513031, + "learning_rate": 7.839335180055401e-05, + "loss": 0.1703200489282608, + "num_input_tokens_seen": 4650784, + "step": 284, + "train_runtime": 2342.1281, + "train_tokens_per_second": 1985.709 + }, + { + "epoch": 0.07894736842105263, + "grad_norm": 0.7619025111198425, + "learning_rate": 7.867036011080334e-05, + "loss": 0.11342278122901917, + "num_input_tokens_seen": 4667160, + "step": 285, + "train_runtime": 2350.356, + "train_tokens_per_second": 1985.725 + }, + { + "epoch": 0.07922437673130193, + "grad_norm": 0.918539822101593, + "learning_rate": 7.894736842105263e-05, + "loss": 0.15221461653709412, + "num_input_tokens_seen": 4683536, + "step": 286, + "train_runtime": 2358.583, + "train_tokens_per_second": 1985.741 + }, + { + "epoch": 0.07950138504155124, + "grad_norm": 0.7847246527671814, + "learning_rate": 7.922437673130194e-05, + "loss": 0.12442533671855927, + "num_input_tokens_seen": 4699912, + "step": 287, + "train_runtime": 2366.8143, + "train_tokens_per_second": 1985.754 + }, + { + "epoch": 0.07977839335180055, + "grad_norm": 0.754916787147522, + "learning_rate": 7.950138504155125e-05, + "loss": 0.10869459062814713, + "num_input_tokens_seen": 4716288, + "step": 288, + "train_runtime": 2375.0443, + "train_tokens_per_second": 1985.768 + }, + { + "epoch": 0.08005540166204986, + "grad_norm": 0.7556501626968384, + "learning_rate": 7.977839335180056e-05, + "loss": 0.10071774572134018, + "num_input_tokens_seen": 4732664, + "step": 289, + "train_runtime": 2383.2782, + "train_tokens_per_second": 1985.779 + }, + { + "epoch": 0.08033240997229917, + "grad_norm": 0.8473782539367676, + "learning_rate": 8.005540166204987e-05, + "loss": 0.13374993205070496, + "num_input_tokens_seen": 4749040, + "step": 290, + "train_runtime": 2391.5112, + "train_tokens_per_second": 1985.79 + }, + { + "epoch": 0.08060941828254847, + "grad_norm": 0.838242769241333, + "learning_rate": 8.033240997229917e-05, + "loss": 0.1270388960838318, + "num_input_tokens_seen": 4765416, + "step": 291, + "train_runtime": 2399.7402, + "train_tokens_per_second": 1985.805 + }, + { + "epoch": 0.08088642659279778, + "grad_norm": 0.8086673021316528, + "learning_rate": 8.060941828254848e-05, + "loss": 0.14766152203083038, + "num_input_tokens_seen": 4781792, + "step": 292, + "train_runtime": 2407.9714, + "train_tokens_per_second": 1985.818 + }, + { + "epoch": 0.08116343490304709, + "grad_norm": 0.7079471349716187, + "learning_rate": 8.088642659279779e-05, + "loss": 0.1011151522397995, + "num_input_tokens_seen": 4798168, + "step": 293, + "train_runtime": 2416.1985, + "train_tokens_per_second": 1985.834 + }, + { + "epoch": 0.0814404432132964, + "grad_norm": 0.8532183170318604, + "learning_rate": 8.11634349030471e-05, + "loss": 0.09587599337100983, + "num_input_tokens_seen": 4814544, + "step": 294, + "train_runtime": 2424.4203, + "train_tokens_per_second": 1985.854 + }, + { + "epoch": 0.0817174515235457, + "grad_norm": 0.6936272382736206, + "learning_rate": 8.14404432132964e-05, + "loss": 0.0921529158949852, + "num_input_tokens_seen": 4830920, + "step": 295, + "train_runtime": 2432.6464, + "train_tokens_per_second": 1985.87 + }, + { + "epoch": 0.08199445983379501, + "grad_norm": 0.7730256915092468, + "learning_rate": 8.17174515235457e-05, + "loss": 0.09187868237495422, + "num_input_tokens_seen": 4847296, + "step": 296, + "train_runtime": 2440.8717, + "train_tokens_per_second": 1985.887 + }, + { + "epoch": 0.08227146814404432, + "grad_norm": 0.8883402347564697, + "learning_rate": 8.199445983379503e-05, + "loss": 0.1197829619050026, + "num_input_tokens_seen": 4863672, + "step": 297, + "train_runtime": 2449.1038, + "train_tokens_per_second": 1985.899 + }, + { + "epoch": 0.08254847645429363, + "grad_norm": 0.8324491381645203, + "learning_rate": 8.227146814404432e-05, + "loss": 0.09860750287771225, + "num_input_tokens_seen": 4880048, + "step": 298, + "train_runtime": 2457.3232, + "train_tokens_per_second": 1985.92 + }, + { + "epoch": 0.08282548476454293, + "grad_norm": 0.7802547812461853, + "learning_rate": 8.254847645429363e-05, + "loss": 0.09796471148729324, + "num_input_tokens_seen": 4896424, + "step": 299, + "train_runtime": 2465.556, + "train_tokens_per_second": 1985.931 + }, + { + "epoch": 0.08310249307479224, + "grad_norm": 0.7481993436813354, + "learning_rate": 8.282548476454294e-05, + "loss": 0.08816869556903839, + "num_input_tokens_seen": 4912800, + "step": 300, + "train_runtime": 2473.7813, + "train_tokens_per_second": 1985.948 + }, + { + "epoch": 0.08337950138504155, + "grad_norm": 0.8775613903999329, + "learning_rate": 8.310249307479224e-05, + "loss": 0.10939601808786392, + "num_input_tokens_seen": 4929176, + "step": 301, + "train_runtime": 2483.4956, + "train_tokens_per_second": 1984.773 + }, + { + "epoch": 0.08365650969529086, + "grad_norm": 0.8593982458114624, + "learning_rate": 8.337950138504156e-05, + "loss": 0.10480751097202301, + "num_input_tokens_seen": 4945552, + "step": 302, + "train_runtime": 2491.7196, + "train_tokens_per_second": 1984.795 + }, + { + "epoch": 0.08393351800554016, + "grad_norm": 0.8840969204902649, + "learning_rate": 8.365650969529087e-05, + "loss": 0.11241204291582108, + "num_input_tokens_seen": 4961928, + "step": 303, + "train_runtime": 2499.9558, + "train_tokens_per_second": 1984.806 + }, + { + "epoch": 0.08421052631578947, + "grad_norm": 0.8014760613441467, + "learning_rate": 8.393351800554017e-05, + "loss": 0.1084151491522789, + "num_input_tokens_seen": 4978304, + "step": 304, + "train_runtime": 2508.174, + "train_tokens_per_second": 1984.832 + }, + { + "epoch": 0.08448753462603878, + "grad_norm": 0.8197492361068726, + "learning_rate": 8.421052631578948e-05, + "loss": 0.11503700911998749, + "num_input_tokens_seen": 4994680, + "step": 305, + "train_runtime": 2516.3932, + "train_tokens_per_second": 1984.857 + }, + { + "epoch": 0.08476454293628809, + "grad_norm": 0.7649335265159607, + "learning_rate": 8.448753462603879e-05, + "loss": 0.10183963179588318, + "num_input_tokens_seen": 5011056, + "step": 306, + "train_runtime": 2524.6251, + "train_tokens_per_second": 1984.871 + }, + { + "epoch": 0.0850415512465374, + "grad_norm": 0.6844969391822815, + "learning_rate": 8.47645429362881e-05, + "loss": 0.0791616216301918, + "num_input_tokens_seen": 5027432, + "step": 307, + "train_runtime": 2532.8553, + "train_tokens_per_second": 1984.887 + }, + { + "epoch": 0.0853185595567867, + "grad_norm": 0.7877629399299622, + "learning_rate": 8.50415512465374e-05, + "loss": 0.09427321702241898, + "num_input_tokens_seen": 5043808, + "step": 308, + "train_runtime": 2541.0831, + "train_tokens_per_second": 1984.905 + }, + { + "epoch": 0.08559556786703601, + "grad_norm": 1.0957889556884766, + "learning_rate": 8.53185595567867e-05, + "loss": 0.12922906875610352, + "num_input_tokens_seen": 5060184, + "step": 309, + "train_runtime": 2549.3101, + "train_tokens_per_second": 1984.923 + }, + { + "epoch": 0.08587257617728532, + "grad_norm": 0.9932659864425659, + "learning_rate": 8.559556786703602e-05, + "loss": 0.11494754999876022, + "num_input_tokens_seen": 5076560, + "step": 310, + "train_runtime": 2557.5277, + "train_tokens_per_second": 1984.948 + }, + { + "epoch": 0.08614958448753463, + "grad_norm": 0.73907071352005, + "learning_rate": 8.587257617728532e-05, + "loss": 0.08333182334899902, + "num_input_tokens_seen": 5092936, + "step": 311, + "train_runtime": 2565.7449, + "train_tokens_per_second": 1984.974 + }, + { + "epoch": 0.08642659279778393, + "grad_norm": 0.6974031925201416, + "learning_rate": 8.614958448753463e-05, + "loss": 0.08768966048955917, + "num_input_tokens_seen": 5109312, + "step": 312, + "train_runtime": 2573.9735, + "train_tokens_per_second": 1984.99 + }, + { + "epoch": 0.08670360110803324, + "grad_norm": 0.7075063586235046, + "learning_rate": 8.642659279778394e-05, + "loss": 0.08319346606731415, + "num_input_tokens_seen": 5125688, + "step": 313, + "train_runtime": 2582.2082, + "train_tokens_per_second": 1985.002 + }, + { + "epoch": 0.08698060941828255, + "grad_norm": 0.7835124135017395, + "learning_rate": 8.670360110803325e-05, + "loss": 0.08880278468132019, + "num_input_tokens_seen": 5142064, + "step": 314, + "train_runtime": 2590.431, + "train_tokens_per_second": 1985.023 + }, + { + "epoch": 0.08725761772853186, + "grad_norm": 1.1086758375167847, + "learning_rate": 8.698060941828256e-05, + "loss": 0.11389125883579254, + "num_input_tokens_seen": 5158440, + "step": 315, + "train_runtime": 2598.6595, + "train_tokens_per_second": 1985.039 + }, + { + "epoch": 0.08753462603878116, + "grad_norm": 0.919829785823822, + "learning_rate": 8.725761772853185e-05, + "loss": 0.09386332333087921, + "num_input_tokens_seen": 5174816, + "step": 316, + "train_runtime": 2606.8898, + "train_tokens_per_second": 1985.054 + }, + { + "epoch": 0.08781163434903047, + "grad_norm": 0.7066811323165894, + "learning_rate": 8.753462603878116e-05, + "loss": 0.08214502036571503, + "num_input_tokens_seen": 5191192, + "step": 317, + "train_runtime": 2615.1185, + "train_tokens_per_second": 1985.07 + }, + { + "epoch": 0.08808864265927978, + "grad_norm": 0.8140756487846375, + "learning_rate": 8.781163434903047e-05, + "loss": 0.09574220329523087, + "num_input_tokens_seen": 5207568, + "step": 318, + "train_runtime": 2623.3401, + "train_tokens_per_second": 1985.091 + }, + { + "epoch": 0.08836565096952909, + "grad_norm": 0.8448074460029602, + "learning_rate": 8.808864265927978e-05, + "loss": 0.08687430620193481, + "num_input_tokens_seen": 5223944, + "step": 319, + "train_runtime": 2631.5554, + "train_tokens_per_second": 1985.117 + }, + { + "epoch": 0.0886426592797784, + "grad_norm": 0.7577835321426392, + "learning_rate": 8.83656509695291e-05, + "loss": 0.0885535404086113, + "num_input_tokens_seen": 5240320, + "step": 320, + "train_runtime": 2639.7723, + "train_tokens_per_second": 1985.141 + }, + { + "epoch": 0.0889196675900277, + "grad_norm": 0.7267913222312927, + "learning_rate": 8.864265927977839e-05, + "loss": 0.07692497223615646, + "num_input_tokens_seen": 5256696, + "step": 321, + "train_runtime": 2647.9987, + "train_tokens_per_second": 1985.158 + }, + { + "epoch": 0.08919667590027701, + "grad_norm": 0.8731513023376465, + "learning_rate": 8.89196675900277e-05, + "loss": 0.08742271363735199, + "num_input_tokens_seen": 5273072, + "step": 322, + "train_runtime": 2656.2254, + "train_tokens_per_second": 1985.175 + }, + { + "epoch": 0.08947368421052632, + "grad_norm": 0.7708449363708496, + "learning_rate": 8.919667590027702e-05, + "loss": 0.07926444709300995, + "num_input_tokens_seen": 5289448, + "step": 323, + "train_runtime": 2664.4694, + "train_tokens_per_second": 1985.179 + }, + { + "epoch": 0.08975069252077562, + "grad_norm": 0.712697446346283, + "learning_rate": 8.947368421052632e-05, + "loss": 0.07375358045101166, + "num_input_tokens_seen": 5305824, + "step": 324, + "train_runtime": 2672.6859, + "train_tokens_per_second": 1985.203 + }, + { + "epoch": 0.09002770083102493, + "grad_norm": 0.6057803630828857, + "learning_rate": 8.975069252077563e-05, + "loss": 0.06511040031909943, + "num_input_tokens_seen": 5322200, + "step": 325, + "train_runtime": 2680.8979, + "train_tokens_per_second": 1985.23 + }, + { + "epoch": 0.09030470914127424, + "grad_norm": 0.8071460723876953, + "learning_rate": 9.002770083102492e-05, + "loss": 0.10748353600502014, + "num_input_tokens_seen": 5338576, + "step": 326, + "train_runtime": 2689.134, + "train_tokens_per_second": 1985.24 + }, + { + "epoch": 0.09058171745152355, + "grad_norm": 0.6300349235534668, + "learning_rate": 9.030470914127425e-05, + "loss": 0.07798698544502258, + "num_input_tokens_seen": 5354952, + "step": 327, + "train_runtime": 2697.361, + "train_tokens_per_second": 1985.256 + }, + { + "epoch": 0.09085872576177285, + "grad_norm": 0.8071503639221191, + "learning_rate": 9.058171745152356e-05, + "loss": 0.07426194101572037, + "num_input_tokens_seen": 5371328, + "step": 328, + "train_runtime": 2705.5959, + "train_tokens_per_second": 1985.266 + }, + { + "epoch": 0.09113573407202216, + "grad_norm": 0.7282470464706421, + "learning_rate": 9.085872576177285e-05, + "loss": 0.07015716284513474, + "num_input_tokens_seen": 5387704, + "step": 329, + "train_runtime": 2713.8183, + "train_tokens_per_second": 1985.285 + }, + { + "epoch": 0.09141274238227147, + "grad_norm": 0.6332882046699524, + "learning_rate": 9.113573407202216e-05, + "loss": 0.07258857786655426, + "num_input_tokens_seen": 5404080, + "step": 330, + "train_runtime": 2722.0321, + "train_tokens_per_second": 1985.311 + }, + { + "epoch": 0.09168975069252078, + "grad_norm": 0.6719120144844055, + "learning_rate": 9.141274238227147e-05, + "loss": 0.07262572646141052, + "num_input_tokens_seen": 5420456, + "step": 331, + "train_runtime": 2730.2373, + "train_tokens_per_second": 1985.342 + }, + { + "epoch": 0.09196675900277008, + "grad_norm": 0.8084395527839661, + "learning_rate": 9.168975069252078e-05, + "loss": 0.07191002368927002, + "num_input_tokens_seen": 5436832, + "step": 332, + "train_runtime": 2738.4548, + "train_tokens_per_second": 1985.365 + }, + { + "epoch": 0.09224376731301939, + "grad_norm": 0.8508367538452148, + "learning_rate": 9.196675900277009e-05, + "loss": 0.08415211737155914, + "num_input_tokens_seen": 5453208, + "step": 333, + "train_runtime": 2746.6842, + "train_tokens_per_second": 1985.379 + }, + { + "epoch": 0.0925207756232687, + "grad_norm": 0.9882177114486694, + "learning_rate": 9.224376731301939e-05, + "loss": 0.12353769689798355, + "num_input_tokens_seen": 5469584, + "step": 334, + "train_runtime": 2754.9026, + "train_tokens_per_second": 1985.4 + }, + { + "epoch": 0.09279778393351801, + "grad_norm": 0.5798330307006836, + "learning_rate": 9.252077562326871e-05, + "loss": 0.06807387620210648, + "num_input_tokens_seen": 5485960, + "step": 335, + "train_runtime": 2763.1087, + "train_tokens_per_second": 1985.43 + }, + { + "epoch": 0.09307479224376732, + "grad_norm": 0.8260757923126221, + "learning_rate": 9.279778393351801e-05, + "loss": 0.09232799708843231, + "num_input_tokens_seen": 5502336, + "step": 336, + "train_runtime": 2771.3201, + "train_tokens_per_second": 1985.457 + }, + { + "epoch": 0.09335180055401662, + "grad_norm": 0.7299746870994568, + "learning_rate": 9.307479224376732e-05, + "loss": 0.08205694705247879, + "num_input_tokens_seen": 5518712, + "step": 337, + "train_runtime": 2779.5304, + "train_tokens_per_second": 1985.484 + }, + { + "epoch": 0.09362880886426593, + "grad_norm": 0.6919077634811401, + "learning_rate": 9.335180055401663e-05, + "loss": 0.06999707221984863, + "num_input_tokens_seen": 5535088, + "step": 338, + "train_runtime": 2787.7392, + "train_tokens_per_second": 1985.511 + }, + { + "epoch": 0.09390581717451524, + "grad_norm": 0.7547946572303772, + "learning_rate": 9.362880886426594e-05, + "loss": 0.09170259535312653, + "num_input_tokens_seen": 5551464, + "step": 339, + "train_runtime": 2795.9562, + "train_tokens_per_second": 1985.533 + }, + { + "epoch": 0.09418282548476455, + "grad_norm": 0.8605533242225647, + "learning_rate": 9.390581717451525e-05, + "loss": 0.08793085813522339, + "num_input_tokens_seen": 5567840, + "step": 340, + "train_runtime": 2804.1651, + "train_tokens_per_second": 1985.561 + }, + { + "epoch": 0.09445983379501385, + "grad_norm": 0.7158651947975159, + "learning_rate": 9.418282548476454e-05, + "loss": 0.08066239953041077, + "num_input_tokens_seen": 5584216, + "step": 341, + "train_runtime": 2812.3732, + "train_tokens_per_second": 1985.589 + }, + { + "epoch": 0.09473684210526316, + "grad_norm": 0.6144101619720459, + "learning_rate": 9.445983379501385e-05, + "loss": 0.06989593058824539, + "num_input_tokens_seen": 5600592, + "step": 342, + "train_runtime": 2820.5891, + "train_tokens_per_second": 1985.611 + }, + { + "epoch": 0.09501385041551247, + "grad_norm": 0.8014315962791443, + "learning_rate": 9.473684210526316e-05, + "loss": 0.09485288709402084, + "num_input_tokens_seen": 5616968, + "step": 343, + "train_runtime": 2828.8183, + "train_tokens_per_second": 1985.623 + }, + { + "epoch": 0.09529085872576178, + "grad_norm": 0.8197758793830872, + "learning_rate": 9.501385041551247e-05, + "loss": 0.08519220352172852, + "num_input_tokens_seen": 5633344, + "step": 344, + "train_runtime": 2837.0311, + "train_tokens_per_second": 1985.648 + }, + { + "epoch": 0.09556786703601108, + "grad_norm": 0.7892965078353882, + "learning_rate": 9.529085872576178e-05, + "loss": 0.07971470803022385, + "num_input_tokens_seen": 5649720, + "step": 345, + "train_runtime": 2845.2452, + "train_tokens_per_second": 1985.671 + }, + { + "epoch": 0.09584487534626039, + "grad_norm": 0.5773614048957825, + "learning_rate": 9.556786703601108e-05, + "loss": 0.0723600685596466, + "num_input_tokens_seen": 5666096, + "step": 346, + "train_runtime": 2853.4627, + "train_tokens_per_second": 1985.691 + }, + { + "epoch": 0.0961218836565097, + "grad_norm": 0.6952322721481323, + "learning_rate": 9.584487534626039e-05, + "loss": 0.06886263191699982, + "num_input_tokens_seen": 5682472, + "step": 347, + "train_runtime": 2861.6715, + "train_tokens_per_second": 1985.718 + }, + { + "epoch": 0.096398891966759, + "grad_norm": 0.687017023563385, + "learning_rate": 9.612188365650971e-05, + "loss": 0.06820032000541687, + "num_input_tokens_seen": 5698848, + "step": 348, + "train_runtime": 2869.8776, + "train_tokens_per_second": 1985.746 + }, + { + "epoch": 0.09667590027700831, + "grad_norm": 0.7202360033988953, + "learning_rate": 9.6398891966759e-05, + "loss": 0.07866030186414719, + "num_input_tokens_seen": 5715224, + "step": 349, + "train_runtime": 2878.0901, + "train_tokens_per_second": 1985.77 + }, + { + "epoch": 0.09695290858725762, + "grad_norm": 0.8464450240135193, + "learning_rate": 9.667590027700832e-05, + "loss": 0.07819720357656479, + "num_input_tokens_seen": 5731600, + "step": 350, + "train_runtime": 2886.2983, + "train_tokens_per_second": 1985.796 + }, + { + "epoch": 0.09722991689750693, + "grad_norm": 0.7476164102554321, + "learning_rate": 9.695290858725761e-05, + "loss": 0.06847623735666275, + "num_input_tokens_seen": 5747976, + "step": 351, + "train_runtime": 2894.5188, + "train_tokens_per_second": 1985.814 + }, + { + "epoch": 0.09750692520775624, + "grad_norm": 0.7774441838264465, + "learning_rate": 9.722991689750694e-05, + "loss": 0.09128645062446594, + "num_input_tokens_seen": 5764352, + "step": 352, + "train_runtime": 2902.7443, + "train_tokens_per_second": 1985.828 + }, + { + "epoch": 0.09778393351800554, + "grad_norm": 0.672705352306366, + "learning_rate": 9.750692520775624e-05, + "loss": 0.060948796570301056, + "num_input_tokens_seen": 5780728, + "step": 353, + "train_runtime": 2910.9677, + "train_tokens_per_second": 1985.844 + }, + { + "epoch": 0.09806094182825485, + "grad_norm": 0.6531115174293518, + "learning_rate": 9.778393351800554e-05, + "loss": 0.07760587334632874, + "num_input_tokens_seen": 5797104, + "step": 354, + "train_runtime": 2919.1852, + "train_tokens_per_second": 1985.864 + }, + { + "epoch": 0.09833795013850416, + "grad_norm": 0.6454180479049683, + "learning_rate": 9.806094182825485e-05, + "loss": 0.0750250369310379, + "num_input_tokens_seen": 5813480, + "step": 355, + "train_runtime": 2927.4183, + "train_tokens_per_second": 1985.873 + }, + { + "epoch": 0.09861495844875347, + "grad_norm": 0.6681795120239258, + "learning_rate": 9.833795013850416e-05, + "loss": 0.08155518770217896, + "num_input_tokens_seen": 5829856, + "step": 356, + "train_runtime": 2935.6432, + "train_tokens_per_second": 1985.887 + }, + { + "epoch": 0.09889196675900278, + "grad_norm": 0.7228246927261353, + "learning_rate": 9.861495844875347e-05, + "loss": 0.06801558285951614, + "num_input_tokens_seen": 5846232, + "step": 357, + "train_runtime": 2943.8706, + "train_tokens_per_second": 1985.9 + }, + { + "epoch": 0.09916897506925208, + "grad_norm": 0.7212700247764587, + "learning_rate": 9.889196675900278e-05, + "loss": 0.06556021422147751, + "num_input_tokens_seen": 5862608, + "step": 358, + "train_runtime": 2952.1019, + "train_tokens_per_second": 1985.91 + }, + { + "epoch": 0.09944598337950139, + "grad_norm": 0.6292449235916138, + "learning_rate": 9.916897506925208e-05, + "loss": 0.05852118507027626, + "num_input_tokens_seen": 5878984, + "step": 359, + "train_runtime": 2960.3378, + "train_tokens_per_second": 1985.917 + }, + { + "epoch": 0.0997229916897507, + "grad_norm": 0.6141517162322998, + "learning_rate": 9.94459833795014e-05, + "loss": 0.07888314127922058, + "num_input_tokens_seen": 5895360, + "step": 360, + "train_runtime": 2968.555, + "train_tokens_per_second": 1985.936 + }, + { + "epoch": 0.1, + "grad_norm": 0.692671000957489, + "learning_rate": 9.97229916897507e-05, + "loss": 0.07206640392541885, + "num_input_tokens_seen": 5911736, + "step": 361, + "train_runtime": 2976.7797, + "train_tokens_per_second": 1985.95 + }, + { + "epoch": 0.10027700831024931, + "grad_norm": 0.8042985796928406, + "learning_rate": 0.0001, + "loss": 0.07918693125247955, + "num_input_tokens_seen": 5928112, + "step": 362, + "train_runtime": 2984.9939, + "train_tokens_per_second": 1985.971 + }, + { + "epoch": 0.10055401662049862, + "grad_norm": 0.6544278860092163, + "learning_rate": 9.99999998068232e-05, + "loss": 0.07934118062257767, + "num_input_tokens_seen": 5944488, + "step": 363, + "train_runtime": 2993.2057, + "train_tokens_per_second": 1985.994 + }, + { + "epoch": 0.10083102493074793, + "grad_norm": 0.7040328979492188, + "learning_rate": 9.999999922729282e-05, + "loss": 0.07194405049085617, + "num_input_tokens_seen": 5960864, + "step": 364, + "train_runtime": 3001.418, + "train_tokens_per_second": 1986.016 + }, + { + "epoch": 0.10110803324099724, + "grad_norm": 0.5847904682159424, + "learning_rate": 9.999999826140884e-05, + "loss": 0.06274966895580292, + "num_input_tokens_seen": 5977240, + "step": 365, + "train_runtime": 3009.6233, + "train_tokens_per_second": 1986.043 + }, + { + "epoch": 0.10138504155124654, + "grad_norm": 0.5973916053771973, + "learning_rate": 9.999999690917128e-05, + "loss": 0.06092659756541252, + "num_input_tokens_seen": 5993616, + "step": 366, + "train_runtime": 3017.8352, + "train_tokens_per_second": 1986.065 + }, + { + "epoch": 0.10166204986149585, + "grad_norm": 0.7511621713638306, + "learning_rate": 9.999999517058017e-05, + "loss": 0.06388077884912491, + "num_input_tokens_seen": 6009992, + "step": 367, + "train_runtime": 3026.0406, + "train_tokens_per_second": 1986.091 + }, + { + "epoch": 0.10193905817174516, + "grad_norm": 0.6676619052886963, + "learning_rate": 9.999999304563547e-05, + "loss": 0.06179898977279663, + "num_input_tokens_seen": 6026368, + "step": 368, + "train_runtime": 3034.2666, + "train_tokens_per_second": 1986.104 + }, + { + "epoch": 0.10221606648199447, + "grad_norm": 0.5403165221214294, + "learning_rate": 9.999999053433724e-05, + "loss": 0.062318723648786545, + "num_input_tokens_seen": 6042744, + "step": 369, + "train_runtime": 3042.5005, + "train_tokens_per_second": 1986.111 + }, + { + "epoch": 0.10249307479224377, + "grad_norm": 0.7993464469909668, + "learning_rate": 9.99999876366855e-05, + "loss": 0.07549713551998138, + "num_input_tokens_seen": 6059120, + "step": 370, + "train_runtime": 3050.7387, + "train_tokens_per_second": 1986.116 + }, + { + "epoch": 0.10277008310249308, + "grad_norm": 0.5890397429466248, + "learning_rate": 9.999998435268025e-05, + "loss": 0.05559977889060974, + "num_input_tokens_seen": 6075496, + "step": 371, + "train_runtime": 3058.9691, + "train_tokens_per_second": 1986.125 + }, + { + "epoch": 0.10304709141274238, + "grad_norm": 0.6075968742370605, + "learning_rate": 9.999998068232155e-05, + "loss": 0.06747864931821823, + "num_input_tokens_seen": 6091872, + "step": 372, + "train_runtime": 3067.1877, + "train_tokens_per_second": 1986.143 + }, + { + "epoch": 0.10332409972299168, + "grad_norm": 0.6431262493133545, + "learning_rate": 9.999997662560938e-05, + "loss": 0.06680714339017868, + "num_input_tokens_seen": 6108248, + "step": 373, + "train_runtime": 3075.4025, + "train_tokens_per_second": 1986.162 + }, + { + "epoch": 0.10360110803324099, + "grad_norm": 0.7075386643409729, + "learning_rate": 9.99999721825438e-05, + "loss": 0.06949344277381897, + "num_input_tokens_seen": 6124624, + "step": 374, + "train_runtime": 3083.6142, + "train_tokens_per_second": 1986.184 + }, + { + "epoch": 0.1038781163434903, + "grad_norm": 0.6201297044754028, + "learning_rate": 9.999996735312485e-05, + "loss": 0.05782192572951317, + "num_input_tokens_seen": 6141000, + "step": 375, + "train_runtime": 3091.8384, + "train_tokens_per_second": 1986.197 + }, + { + "epoch": 0.1041551246537396, + "grad_norm": 0.5871919989585876, + "learning_rate": 9.999996213735257e-05, + "loss": 0.06231506168842316, + "num_input_tokens_seen": 6157376, + "step": 376, + "train_runtime": 3100.0708, + "train_tokens_per_second": 1986.205 + }, + { + "epoch": 0.10443213296398891, + "grad_norm": 0.7552358508110046, + "learning_rate": 9.999995653522695e-05, + "loss": 0.07650149613618851, + "num_input_tokens_seen": 6173752, + "step": 377, + "train_runtime": 3108.2947, + "train_tokens_per_second": 1986.218 + }, + { + "epoch": 0.10470914127423822, + "grad_norm": 0.5804473161697388, + "learning_rate": 9.999995054674811e-05, + "loss": 0.060341425240039825, + "num_input_tokens_seen": 6190128, + "step": 378, + "train_runtime": 3116.519, + "train_tokens_per_second": 1986.231 + }, + { + "epoch": 0.10498614958448753, + "grad_norm": 0.6267532706260681, + "learning_rate": 9.999994417191605e-05, + "loss": 0.06378314644098282, + "num_input_tokens_seen": 6206504, + "step": 379, + "train_runtime": 3124.7545, + "train_tokens_per_second": 1986.237 + }, + { + "epoch": 0.10526315789473684, + "grad_norm": 0.6603133082389832, + "learning_rate": 9.999993741073082e-05, + "loss": 0.06212187930941582, + "num_input_tokens_seen": 6222880, + "step": 380, + "train_runtime": 3132.9689, + "train_tokens_per_second": 1986.257 + }, + { + "epoch": 0.10554016620498614, + "grad_norm": 0.5890621542930603, + "learning_rate": 9.999993026319248e-05, + "loss": 0.06561557203531265, + "num_input_tokens_seen": 6239256, + "step": 381, + "train_runtime": 3141.2011, + "train_tokens_per_second": 1986.264 + }, + { + "epoch": 0.10581717451523545, + "grad_norm": 0.80515456199646, + "learning_rate": 9.999992272930108e-05, + "loss": 0.08803797513246536, + "num_input_tokens_seen": 6255632, + "step": 382, + "train_runtime": 3149.4222, + "train_tokens_per_second": 1986.279 + }, + { + "epoch": 0.10609418282548476, + "grad_norm": 0.5653188824653625, + "learning_rate": 9.999991480905669e-05, + "loss": 0.06382325291633606, + "num_input_tokens_seen": 6272008, + "step": 383, + "train_runtime": 3157.6373, + "train_tokens_per_second": 1986.298 + }, + { + "epoch": 0.10637119113573407, + "grad_norm": 0.5328158140182495, + "learning_rate": 9.999990650245936e-05, + "loss": 0.05109722539782524, + "num_input_tokens_seen": 6288384, + "step": 384, + "train_runtime": 3165.8435, + "train_tokens_per_second": 1986.322 + }, + { + "epoch": 0.10664819944598337, + "grad_norm": 0.6360264420509338, + "learning_rate": 9.999989780950915e-05, + "loss": 0.06897810101509094, + "num_input_tokens_seen": 6304760, + "step": 385, + "train_runtime": 3174.0545, + "train_tokens_per_second": 1986.343 + }, + { + "epoch": 0.10692520775623268, + "grad_norm": 0.6032077670097351, + "learning_rate": 9.999988873020616e-05, + "loss": 0.06534336507320404, + "num_input_tokens_seen": 6321136, + "step": 386, + "train_runtime": 3182.2872, + "train_tokens_per_second": 1986.35 + }, + { + "epoch": 0.10720221606648199, + "grad_norm": 0.7075746059417725, + "learning_rate": 9.999987926455044e-05, + "loss": 0.06934862583875656, + "num_input_tokens_seen": 6337512, + "step": 387, + "train_runtime": 3190.503, + "train_tokens_per_second": 1986.368 + }, + { + "epoch": 0.1074792243767313, + "grad_norm": 0.4961213171482086, + "learning_rate": 9.999986941254203e-05, + "loss": 0.05104951560497284, + "num_input_tokens_seen": 6353888, + "step": 388, + "train_runtime": 3198.733, + "train_tokens_per_second": 1986.376 + }, + { + "epoch": 0.1077562326869806, + "grad_norm": 0.6685221791267395, + "learning_rate": 9.999985917418105e-05, + "loss": 0.0656336322426796, + "num_input_tokens_seen": 6370264, + "step": 389, + "train_runtime": 3206.9559, + "train_tokens_per_second": 1986.39 + }, + { + "epoch": 0.10803324099722991, + "grad_norm": 0.5982024073600769, + "learning_rate": 9.999984854946755e-05, + "loss": 0.0649900734424591, + "num_input_tokens_seen": 6386640, + "step": 390, + "train_runtime": 3215.1776, + "train_tokens_per_second": 1986.403 + }, + { + "epoch": 0.10831024930747922, + "grad_norm": 0.5617763996124268, + "learning_rate": 9.999983753840166e-05, + "loss": 0.06581994146108627, + "num_input_tokens_seen": 6403016, + "step": 391, + "train_runtime": 3223.3912, + "train_tokens_per_second": 1986.422 + }, + { + "epoch": 0.10858725761772853, + "grad_norm": 0.5684589147567749, + "learning_rate": 9.99998261409834e-05, + "loss": 0.05878477543592453, + "num_input_tokens_seen": 6419392, + "step": 392, + "train_runtime": 3231.6193, + "train_tokens_per_second": 1986.432 + }, + { + "epoch": 0.10886426592797784, + "grad_norm": 0.6201686263084412, + "learning_rate": 9.99998143572129e-05, + "loss": 0.060286592692136765, + "num_input_tokens_seen": 6435768, + "step": 393, + "train_runtime": 3239.8464, + "train_tokens_per_second": 1986.442 + }, + { + "epoch": 0.10914127423822714, + "grad_norm": 0.8365944623947144, + "learning_rate": 9.999980218709023e-05, + "loss": 0.06307109445333481, + "num_input_tokens_seen": 6452144, + "step": 394, + "train_runtime": 3248.0658, + "train_tokens_per_second": 1986.457 + }, + { + "epoch": 0.10941828254847645, + "grad_norm": 0.7132045030593872, + "learning_rate": 9.999978963061551e-05, + "loss": 0.06888192892074585, + "num_input_tokens_seen": 6468520, + "step": 395, + "train_runtime": 3256.2833, + "train_tokens_per_second": 1986.473 + }, + { + "epoch": 0.10969529085872576, + "grad_norm": 0.6766054034233093, + "learning_rate": 9.999977668778882e-05, + "loss": 0.06043960899114609, + "num_input_tokens_seen": 6484896, + "step": 396, + "train_runtime": 3264.507, + "train_tokens_per_second": 1986.486 + }, + { + "epoch": 0.10997229916897507, + "grad_norm": 0.6155004501342773, + "learning_rate": 9.999976335861025e-05, + "loss": 0.06996041536331177, + "num_input_tokens_seen": 6501272, + "step": 397, + "train_runtime": 3272.7373, + "train_tokens_per_second": 1986.494 + }, + { + "epoch": 0.11024930747922437, + "grad_norm": 0.5266620516777039, + "learning_rate": 9.999974964307993e-05, + "loss": 0.05424066632986069, + "num_input_tokens_seen": 6517648, + "step": 398, + "train_runtime": 3280.9673, + "train_tokens_per_second": 1986.502 + }, + { + "epoch": 0.11052631578947368, + "grad_norm": 0.5705076456069946, + "learning_rate": 9.999973554119795e-05, + "loss": 0.06080394610762596, + "num_input_tokens_seen": 6534024, + "step": 399, + "train_runtime": 3289.1887, + "train_tokens_per_second": 1986.515 + }, + { + "epoch": 0.11080332409972299, + "grad_norm": 0.6415627598762512, + "learning_rate": 9.99997210529644e-05, + "loss": 0.06841950118541718, + "num_input_tokens_seen": 6550400, + "step": 400, + "train_runtime": 3297.4228, + "train_tokens_per_second": 1986.521 + }, + { + "epoch": 0.1110803324099723, + "grad_norm": 0.5395914912223816, + "learning_rate": 9.999970617837943e-05, + "loss": 0.05855146050453186, + "num_input_tokens_seen": 6566776, + "step": 401, + "train_runtime": 3307.1796, + "train_tokens_per_second": 1985.612 + }, + { + "epoch": 0.1113573407202216, + "grad_norm": 0.48179343342781067, + "learning_rate": 9.999969091744313e-05, + "loss": 0.05181819200515747, + "num_input_tokens_seen": 6583152, + "step": 402, + "train_runtime": 3315.4054, + "train_tokens_per_second": 1985.625 + }, + { + "epoch": 0.11163434903047091, + "grad_norm": 0.6734169721603394, + "learning_rate": 9.999967527015563e-05, + "loss": 0.07323645055294037, + "num_input_tokens_seen": 6599528, + "step": 403, + "train_runtime": 3323.6222, + "train_tokens_per_second": 1985.643 + }, + { + "epoch": 0.11191135734072022, + "grad_norm": 0.5799394249916077, + "learning_rate": 9.999965923651704e-05, + "loss": 0.05234455317258835, + "num_input_tokens_seen": 6615904, + "step": 404, + "train_runtime": 3331.8336, + "train_tokens_per_second": 1985.665 + }, + { + "epoch": 0.11218836565096953, + "grad_norm": 0.5104697346687317, + "learning_rate": 9.99996428165275e-05, + "loss": 0.0584915392100811, + "num_input_tokens_seen": 6632280, + "step": 405, + "train_runtime": 3340.0338, + "train_tokens_per_second": 1985.692 + }, + { + "epoch": 0.11246537396121883, + "grad_norm": 0.5726069211959839, + "learning_rate": 9.99996260101871e-05, + "loss": 0.05712741240859032, + "num_input_tokens_seen": 6648656, + "step": 406, + "train_runtime": 3348.236, + "train_tokens_per_second": 1985.719 + }, + { + "epoch": 0.11274238227146814, + "grad_norm": 0.529773473739624, + "learning_rate": 9.999960881749601e-05, + "loss": 0.04860013350844383, + "num_input_tokens_seen": 6665032, + "step": 407, + "train_runtime": 3356.4455, + "train_tokens_per_second": 1985.741 + }, + { + "epoch": 0.11301939058171745, + "grad_norm": 0.47984030842781067, + "learning_rate": 9.999959123845436e-05, + "loss": 0.0444425530731678, + "num_input_tokens_seen": 6681408, + "step": 408, + "train_runtime": 3364.6579, + "train_tokens_per_second": 1985.761 + }, + { + "epoch": 0.11329639889196676, + "grad_norm": 0.7062572836875916, + "learning_rate": 9.999957327306227e-05, + "loss": 0.06411318480968475, + "num_input_tokens_seen": 6697784, + "step": 409, + "train_runtime": 3372.8697, + "train_tokens_per_second": 1985.782 + }, + { + "epoch": 0.11357340720221606, + "grad_norm": 0.604327380657196, + "learning_rate": 9.999955492131988e-05, + "loss": 0.057638928294181824, + "num_input_tokens_seen": 6714160, + "step": 410, + "train_runtime": 3381.1004, + "train_tokens_per_second": 1985.791 + }, + { + "epoch": 0.11385041551246537, + "grad_norm": 0.46603474020957947, + "learning_rate": 9.999953618322732e-05, + "loss": 0.05891625955700874, + "num_input_tokens_seen": 6730536, + "step": 411, + "train_runtime": 3389.3117, + "train_tokens_per_second": 1985.812 + }, + { + "epoch": 0.11412742382271468, + "grad_norm": 0.5344932675361633, + "learning_rate": 9.999951705878477e-05, + "loss": 0.056414686143398285, + "num_input_tokens_seen": 6746912, + "step": 412, + "train_runtime": 3397.519, + "train_tokens_per_second": 1985.835 + }, + { + "epoch": 0.11440443213296399, + "grad_norm": 0.7101984024047852, + "learning_rate": 9.999949754799237e-05, + "loss": 0.07599898427724838, + "num_input_tokens_seen": 6763288, + "step": 413, + "train_runtime": 3405.7201, + "train_tokens_per_second": 1985.861 + }, + { + "epoch": 0.1146814404432133, + "grad_norm": 0.5285444259643555, + "learning_rate": 9.999947765085023e-05, + "loss": 0.052575837820768356, + "num_input_tokens_seen": 6779664, + "step": 414, + "train_runtime": 3413.9367, + "train_tokens_per_second": 1985.879 + }, + { + "epoch": 0.1149584487534626, + "grad_norm": 0.5981202721595764, + "learning_rate": 9.999945736735853e-05, + "loss": 0.060912687331438065, + "num_input_tokens_seen": 6796040, + "step": 415, + "train_runtime": 3422.1544, + "train_tokens_per_second": 1985.895 + }, + { + "epoch": 0.11523545706371191, + "grad_norm": 0.7082583904266357, + "learning_rate": 9.999943669751745e-05, + "loss": 0.06304559856653214, + "num_input_tokens_seen": 6812416, + "step": 416, + "train_runtime": 3430.3606, + "train_tokens_per_second": 1985.918 + }, + { + "epoch": 0.11551246537396122, + "grad_norm": 0.5518990755081177, + "learning_rate": 9.999941564132713e-05, + "loss": 0.04889700934290886, + "num_input_tokens_seen": 6828792, + "step": 417, + "train_runtime": 3438.5691, + "train_tokens_per_second": 1985.94 + }, + { + "epoch": 0.11578947368421053, + "grad_norm": 0.6155416965484619, + "learning_rate": 9.999939419878772e-05, + "loss": 0.05551523342728615, + "num_input_tokens_seen": 6845168, + "step": 418, + "train_runtime": 3446.7726, + "train_tokens_per_second": 1985.964 + }, + { + "epoch": 0.11606648199445983, + "grad_norm": 0.6257419586181641, + "learning_rate": 9.99993723698994e-05, + "loss": 0.06397726386785507, + "num_input_tokens_seen": 6861544, + "step": 419, + "train_runtime": 3454.9855, + "train_tokens_per_second": 1985.983 + }, + { + "epoch": 0.11634349030470914, + "grad_norm": 0.5064537525177002, + "learning_rate": 9.999935015466233e-05, + "loss": 0.05214916914701462, + "num_input_tokens_seen": 6877920, + "step": 420, + "train_runtime": 3463.1914, + "train_tokens_per_second": 1986.006 + }, + { + "epoch": 0.11662049861495845, + "grad_norm": 0.40731510519981384, + "learning_rate": 9.99993275530767e-05, + "loss": 0.041942231357097626, + "num_input_tokens_seen": 6894296, + "step": 421, + "train_runtime": 3471.4006, + "train_tokens_per_second": 1986.027 + }, + { + "epoch": 0.11689750692520776, + "grad_norm": 0.5984621644020081, + "learning_rate": 9.999930456514265e-05, + "loss": 0.05911904200911522, + "num_input_tokens_seen": 6910672, + "step": 422, + "train_runtime": 3479.6038, + "train_tokens_per_second": 1986.051 + }, + { + "epoch": 0.11717451523545706, + "grad_norm": 0.46702486276626587, + "learning_rate": 9.999928119086041e-05, + "loss": 0.049427445977926254, + "num_input_tokens_seen": 6927048, + "step": 423, + "train_runtime": 3487.823, + "train_tokens_per_second": 1986.066 + }, + { + "epoch": 0.11745152354570637, + "grad_norm": 0.7037188410758972, + "learning_rate": 9.99992574302301e-05, + "loss": 0.05886170268058777, + "num_input_tokens_seen": 6943424, + "step": 424, + "train_runtime": 3496.0534, + "train_tokens_per_second": 1986.075 + }, + { + "epoch": 0.11772853185595568, + "grad_norm": 0.48699572682380676, + "learning_rate": 9.999923328325196e-05, + "loss": 0.050936147570610046, + "num_input_tokens_seen": 6959800, + "step": 425, + "train_runtime": 3504.2785, + "train_tokens_per_second": 1986.086 + }, + { + "epoch": 0.11800554016620499, + "grad_norm": 0.550636887550354, + "learning_rate": 9.999920874992615e-05, + "loss": 0.053120002150535583, + "num_input_tokens_seen": 6976176, + "step": 426, + "train_runtime": 3512.5097, + "train_tokens_per_second": 1986.094 + }, + { + "epoch": 0.1182825484764543, + "grad_norm": 0.49667876958847046, + "learning_rate": 9.999918383025283e-05, + "loss": 0.04591799154877663, + "num_input_tokens_seen": 6992552, + "step": 427, + "train_runtime": 3520.7405, + "train_tokens_per_second": 1986.103 + }, + { + "epoch": 0.1185595567867036, + "grad_norm": 0.5859022736549377, + "learning_rate": 9.999915852423225e-05, + "loss": 0.0530957467854023, + "num_input_tokens_seen": 7008928, + "step": 428, + "train_runtime": 3528.9699, + "train_tokens_per_second": 1986.112 + }, + { + "epoch": 0.11883656509695291, + "grad_norm": 0.49076730012893677, + "learning_rate": 9.999913283186457e-05, + "loss": 0.045807551592588425, + "num_input_tokens_seen": 7025304, + "step": 429, + "train_runtime": 3537.2037, + "train_tokens_per_second": 1986.118 + }, + { + "epoch": 0.11911357340720222, + "grad_norm": 0.6536352634429932, + "learning_rate": 9.999910675315001e-05, + "loss": 0.05204036831855774, + "num_input_tokens_seen": 7041680, + "step": 430, + "train_runtime": 3545.4318, + "train_tokens_per_second": 1986.128 + }, + { + "epoch": 0.11939058171745152, + "grad_norm": 0.5101304054260254, + "learning_rate": 9.999908028808875e-05, + "loss": 0.04969941824674606, + "num_input_tokens_seen": 7058056, + "step": 431, + "train_runtime": 3553.6627, + "train_tokens_per_second": 1986.136 + }, + { + "epoch": 0.11966759002770083, + "grad_norm": 0.4203695058822632, + "learning_rate": 9.999905343668099e-05, + "loss": 0.045464955270290375, + "num_input_tokens_seen": 7074432, + "step": 432, + "train_runtime": 3561.8995, + "train_tokens_per_second": 1986.14 + }, + { + "epoch": 0.11994459833795014, + "grad_norm": 0.438571959733963, + "learning_rate": 9.999902619892696e-05, + "loss": 0.047611068934202194, + "num_input_tokens_seen": 7090808, + "step": 433, + "train_runtime": 3570.1388, + "train_tokens_per_second": 1986.143 + }, + { + "epoch": 0.12022160664819945, + "grad_norm": 0.5673303008079529, + "learning_rate": 9.999899857482686e-05, + "loss": 0.048890531063079834, + "num_input_tokens_seen": 7107184, + "step": 434, + "train_runtime": 3578.3703, + "train_tokens_per_second": 1986.151 + }, + { + "epoch": 0.12049861495844875, + "grad_norm": 0.6330506801605225, + "learning_rate": 9.99989705643809e-05, + "loss": 0.0596369169652462, + "num_input_tokens_seen": 7123560, + "step": 435, + "train_runtime": 3586.5993, + "train_tokens_per_second": 1986.16 + }, + { + "epoch": 0.12077562326869806, + "grad_norm": 0.5195157527923584, + "learning_rate": 9.999894216758932e-05, + "loss": 0.05859924107789993, + "num_input_tokens_seen": 7139936, + "step": 436, + "train_runtime": 3594.8232, + "train_tokens_per_second": 1986.172 + }, + { + "epoch": 0.12105263157894737, + "grad_norm": 0.4180755913257599, + "learning_rate": 9.999891338445229e-05, + "loss": 0.045620981603860855, + "num_input_tokens_seen": 7156312, + "step": 437, + "train_runtime": 3603.0545, + "train_tokens_per_second": 1986.179 + }, + { + "epoch": 0.12132963988919668, + "grad_norm": 0.4925903081893921, + "learning_rate": 9.999888421497008e-05, + "loss": 0.04952852800488472, + "num_input_tokens_seen": 7172688, + "step": 438, + "train_runtime": 3611.2673, + "train_tokens_per_second": 1986.197 + }, + { + "epoch": 0.12160664819944599, + "grad_norm": 0.5068244338035583, + "learning_rate": 9.99988546591429e-05, + "loss": 0.053967658430337906, + "num_input_tokens_seen": 7189064, + "step": 439, + "train_runtime": 3619.4817, + "train_tokens_per_second": 1986.214 + }, + { + "epoch": 0.12188365650969529, + "grad_norm": 0.49308156967163086, + "learning_rate": 9.999882471697097e-05, + "loss": 0.04591640457510948, + "num_input_tokens_seen": 7205440, + "step": 440, + "train_runtime": 3627.6938, + "train_tokens_per_second": 1986.232 + }, + { + "epoch": 0.1221606648199446, + "grad_norm": 0.5047193169593811, + "learning_rate": 9.999879438845453e-05, + "loss": 0.05388277769088745, + "num_input_tokens_seen": 7221816, + "step": 441, + "train_runtime": 3635.9235, + "train_tokens_per_second": 1986.24 + }, + { + "epoch": 0.12243767313019391, + "grad_norm": 0.7333080172538757, + "learning_rate": 9.99987636735938e-05, + "loss": 0.05960899963974953, + "num_input_tokens_seen": 7238192, + "step": 442, + "train_runtime": 3644.1373, + "train_tokens_per_second": 1986.257 + }, + { + "epoch": 0.12271468144044322, + "grad_norm": 0.4565502107143402, + "learning_rate": 9.999873257238905e-05, + "loss": 0.04918278008699417, + "num_input_tokens_seen": 7254568, + "step": 443, + "train_runtime": 3652.3422, + "train_tokens_per_second": 1986.278 + }, + { + "epoch": 0.12299168975069252, + "grad_norm": 0.5146927833557129, + "learning_rate": 9.999870108484048e-05, + "loss": 0.051080405712127686, + "num_input_tokens_seen": 7270944, + "step": 444, + "train_runtime": 3660.5538, + "train_tokens_per_second": 1986.296 + }, + { + "epoch": 0.12326869806094183, + "grad_norm": 0.5074971914291382, + "learning_rate": 9.999866921094838e-05, + "loss": 0.054684512317180634, + "num_input_tokens_seen": 7287320, + "step": 445, + "train_runtime": 3668.7586, + "train_tokens_per_second": 1986.318 + }, + { + "epoch": 0.12354570637119114, + "grad_norm": 0.5596457719802856, + "learning_rate": 9.999863695071294e-05, + "loss": 0.05356535315513611, + "num_input_tokens_seen": 7303696, + "step": 446, + "train_runtime": 3676.9688, + "train_tokens_per_second": 1986.336 + }, + { + "epoch": 0.12382271468144045, + "grad_norm": 0.6376465559005737, + "learning_rate": 9.999860430413446e-05, + "loss": 0.05092785879969597, + "num_input_tokens_seen": 7320072, + "step": 447, + "train_runtime": 3685.1746, + "train_tokens_per_second": 1986.357 + }, + { + "epoch": 0.12409972299168975, + "grad_norm": 0.52330082654953, + "learning_rate": 9.999857127121314e-05, + "loss": 0.04536956921219826, + "num_input_tokens_seen": 7336448, + "step": 448, + "train_runtime": 3693.3802, + "train_tokens_per_second": 1986.378 + }, + { + "epoch": 0.12437673130193906, + "grad_norm": 0.4583214521408081, + "learning_rate": 9.99985378519493e-05, + "loss": 0.0498373918235302, + "num_input_tokens_seen": 7352824, + "step": 449, + "train_runtime": 3701.6051, + "train_tokens_per_second": 1986.388 + }, + { + "epoch": 0.12465373961218837, + "grad_norm": 0.5449318885803223, + "learning_rate": 9.999850404634316e-05, + "loss": 0.04828133061528206, + "num_input_tokens_seen": 7369200, + "step": 450, + "train_runtime": 3709.8195, + "train_tokens_per_second": 1986.404 + }, + { + "epoch": 0.12493074792243768, + "grad_norm": 0.667927086353302, + "learning_rate": 9.999846985439497e-05, + "loss": 0.061976149678230286, + "num_input_tokens_seen": 7385576, + "step": 451, + "train_runtime": 3718.0541, + "train_tokens_per_second": 1986.409 + }, + { + "epoch": 0.12520775623268698, + "grad_norm": 0.5284788608551025, + "learning_rate": 9.999843527610502e-05, + "loss": 0.05678265541791916, + "num_input_tokens_seen": 7401952, + "step": 452, + "train_runtime": 3726.2858, + "train_tokens_per_second": 1986.416 + }, + { + "epoch": 0.12548476454293628, + "grad_norm": 0.4165763258934021, + "learning_rate": 9.999840031147356e-05, + "loss": 0.04454851523041725, + "num_input_tokens_seen": 7418328, + "step": 453, + "train_runtime": 3734.5018, + "train_tokens_per_second": 1986.43 + }, + { + "epoch": 0.1257617728531856, + "grad_norm": 0.4624375104904175, + "learning_rate": 9.999836496050088e-05, + "loss": 0.04547818750143051, + "num_input_tokens_seen": 7434704, + "step": 454, + "train_runtime": 3742.7202, + "train_tokens_per_second": 1986.444 + }, + { + "epoch": 0.1260387811634349, + "grad_norm": 0.4344637989997864, + "learning_rate": 9.999832922318723e-05, + "loss": 0.05047646537423134, + "num_input_tokens_seen": 7451080, + "step": 455, + "train_runtime": 3750.9324, + "train_tokens_per_second": 1986.461 + }, + { + "epoch": 0.12631578947368421, + "grad_norm": 0.5510473847389221, + "learning_rate": 9.999829309953289e-05, + "loss": 0.049236852675676346, + "num_input_tokens_seen": 7467456, + "step": 456, + "train_runtime": 3759.1392, + "train_tokens_per_second": 1986.48 + }, + { + "epoch": 0.1265927977839335, + "grad_norm": 0.4263777732849121, + "learning_rate": 9.999825658953815e-05, + "loss": 0.04300235956907272, + "num_input_tokens_seen": 7483832, + "step": 457, + "train_runtime": 3767.3575, + "train_tokens_per_second": 1986.494 + }, + { + "epoch": 0.12686980609418283, + "grad_norm": 0.41193729639053345, + "learning_rate": 9.999821969320329e-05, + "loss": 0.05139067396521568, + "num_input_tokens_seen": 7500208, + "step": 458, + "train_runtime": 3775.5822, + "train_tokens_per_second": 1986.504 + }, + { + "epoch": 0.12714681440443212, + "grad_norm": 0.6757765412330627, + "learning_rate": 9.999818241052859e-05, + "loss": 0.05112706869840622, + "num_input_tokens_seen": 7516584, + "step": 459, + "train_runtime": 3783.8084, + "train_tokens_per_second": 1986.513 + }, + { + "epoch": 0.12742382271468145, + "grad_norm": 0.6113940477371216, + "learning_rate": 9.999814474151435e-05, + "loss": 0.06495877355337143, + "num_input_tokens_seen": 7532960, + "step": 460, + "train_runtime": 3792.0337, + "train_tokens_per_second": 1986.522 + }, + { + "epoch": 0.12770083102493074, + "grad_norm": 0.3952314257621765, + "learning_rate": 9.999810668616086e-05, + "loss": 0.046168066561222076, + "num_input_tokens_seen": 7549336, + "step": 461, + "train_runtime": 3800.2646, + "train_tokens_per_second": 1986.529 + }, + { + "epoch": 0.12797783933518006, + "grad_norm": 0.533384382724762, + "learning_rate": 9.999806824446839e-05, + "loss": 0.05279422178864479, + "num_input_tokens_seen": 7565712, + "step": 462, + "train_runtime": 3808.4892, + "train_tokens_per_second": 1986.539 + }, + { + "epoch": 0.12825484764542935, + "grad_norm": 0.5506687164306641, + "learning_rate": 9.999802941643724e-05, + "loss": 0.05312853306531906, + "num_input_tokens_seen": 7582088, + "step": 463, + "train_runtime": 3816.7113, + "train_tokens_per_second": 1986.55 + }, + { + "epoch": 0.12853185595567868, + "grad_norm": 0.39918792247772217, + "learning_rate": 9.999799020206773e-05, + "loss": 0.04289465397596359, + "num_input_tokens_seen": 7598464, + "step": 464, + "train_runtime": 3824.9318, + "train_tokens_per_second": 1986.562 + }, + { + "epoch": 0.12880886426592797, + "grad_norm": 0.455290824174881, + "learning_rate": 9.999795060136017e-05, + "loss": 0.04974079132080078, + "num_input_tokens_seen": 7614840, + "step": 465, + "train_runtime": 3833.1539, + "train_tokens_per_second": 1986.573 + }, + { + "epoch": 0.1290858725761773, + "grad_norm": 0.4738784432411194, + "learning_rate": 9.999791061431485e-05, + "loss": 0.04224593937397003, + "num_input_tokens_seen": 7631216, + "step": 466, + "train_runtime": 3841.3652, + "train_tokens_per_second": 1986.59 + }, + { + "epoch": 0.12936288088642658, + "grad_norm": 0.6277029514312744, + "learning_rate": 9.999787024093208e-05, + "loss": 0.05924304947257042, + "num_input_tokens_seen": 7647592, + "step": 467, + "train_runtime": 3849.5912, + "train_tokens_per_second": 1986.598 + }, + { + "epoch": 0.1296398891966759, + "grad_norm": 0.4311698079109192, + "learning_rate": 9.999782948121216e-05, + "loss": 0.0510065034031868, + "num_input_tokens_seen": 7663968, + "step": 468, + "train_runtime": 3857.8201, + "train_tokens_per_second": 1986.606 + }, + { + "epoch": 0.1299168975069252, + "grad_norm": 0.45002681016921997, + "learning_rate": 9.999778833515543e-05, + "loss": 0.05554133653640747, + "num_input_tokens_seen": 7680344, + "step": 469, + "train_runtime": 3866.0402, + "train_tokens_per_second": 1986.618 + }, + { + "epoch": 0.13019390581717452, + "grad_norm": 0.4105798900127411, + "learning_rate": 9.999774680276219e-05, + "loss": 0.04458684101700783, + "num_input_tokens_seen": 7696720, + "step": 470, + "train_runtime": 3874.2534, + "train_tokens_per_second": 1986.633 + }, + { + "epoch": 0.13047091412742381, + "grad_norm": 1.0174962282180786, + "learning_rate": 9.999770488403277e-05, + "loss": 0.06455190479755402, + "num_input_tokens_seen": 7713096, + "step": 471, + "train_runtime": 3882.4612, + "train_tokens_per_second": 1986.651 + }, + { + "epoch": 0.13074792243767314, + "grad_norm": 0.5130550861358643, + "learning_rate": 9.999766257896749e-05, + "loss": 0.051370784640312195, + "num_input_tokens_seen": 7729472, + "step": 472, + "train_runtime": 3890.6733, + "train_tokens_per_second": 1986.667 + }, + { + "epoch": 0.13102493074792243, + "grad_norm": 0.42469584941864014, + "learning_rate": 9.999761988756669e-05, + "loss": 0.04212632775306702, + "num_input_tokens_seen": 7745848, + "step": 473, + "train_runtime": 3898.8887, + "train_tokens_per_second": 1986.681 + }, + { + "epoch": 0.13130193905817175, + "grad_norm": 0.4950283169746399, + "learning_rate": 9.999757680983066e-05, + "loss": 0.0502178892493248, + "num_input_tokens_seen": 7762224, + "step": 474, + "train_runtime": 3907.1003, + "train_tokens_per_second": 1986.697 + }, + { + "epoch": 0.13157894736842105, + "grad_norm": 0.4445037245750427, + "learning_rate": 9.999753334575978e-05, + "loss": 0.05439046025276184, + "num_input_tokens_seen": 7778600, + "step": 475, + "train_runtime": 3915.3201, + "train_tokens_per_second": 1986.709 + }, + { + "epoch": 0.13185595567867037, + "grad_norm": 0.6277253031730652, + "learning_rate": 9.999748949535436e-05, + "loss": 0.05337975546717644, + "num_input_tokens_seen": 7794976, + "step": 476, + "train_runtime": 3923.5542, + "train_tokens_per_second": 1986.713 + }, + { + "epoch": 0.13213296398891966, + "grad_norm": 0.48948296904563904, + "learning_rate": 9.999744525861474e-05, + "loss": 0.0505623035132885, + "num_input_tokens_seen": 7811352, + "step": 477, + "train_runtime": 3931.7798, + "train_tokens_per_second": 1986.722 + }, + { + "epoch": 0.13240997229916898, + "grad_norm": 0.5303763747215271, + "learning_rate": 9.999740063554129e-05, + "loss": 0.057518914341926575, + "num_input_tokens_seen": 7827728, + "step": 478, + "train_runtime": 3940.0019, + "train_tokens_per_second": 1986.732 + }, + { + "epoch": 0.13268698060941828, + "grad_norm": 0.4210052788257599, + "learning_rate": 9.99973556261343e-05, + "loss": 0.0439632311463356, + "num_input_tokens_seen": 7844104, + "step": 479, + "train_runtime": 3948.2265, + "train_tokens_per_second": 1986.741 + }, + { + "epoch": 0.1329639889196676, + "grad_norm": 0.4993128180503845, + "learning_rate": 9.999731023039416e-05, + "loss": 0.05183611437678337, + "num_input_tokens_seen": 7860480, + "step": 480, + "train_runtime": 3956.4535, + "train_tokens_per_second": 1986.749 + }, + { + "epoch": 0.1332409972299169, + "grad_norm": 0.47915056347846985, + "learning_rate": 9.999726444832123e-05, + "loss": 0.04788004979491234, + "num_input_tokens_seen": 7876856, + "step": 481, + "train_runtime": 3964.6743, + "train_tokens_per_second": 1986.76 + }, + { + "epoch": 0.1335180055401662, + "grad_norm": 0.4597660303115845, + "learning_rate": 9.999721827991581e-05, + "loss": 0.04422389343380928, + "num_input_tokens_seen": 7893232, + "step": 482, + "train_runtime": 3972.8939, + "train_tokens_per_second": 1986.771 + }, + { + "epoch": 0.1337950138504155, + "grad_norm": 0.46941009163856506, + "learning_rate": 9.999717172517832e-05, + "loss": 0.047058966010808945, + "num_input_tokens_seen": 7909608, + "step": 483, + "train_runtime": 3981.113, + "train_tokens_per_second": 1986.783 + }, + { + "epoch": 0.13407202216066483, + "grad_norm": 0.3689025044441223, + "learning_rate": 9.999712478410907e-05, + "loss": 0.040124449878931046, + "num_input_tokens_seen": 7925984, + "step": 484, + "train_runtime": 3989.3383, + "train_tokens_per_second": 1986.792 + }, + { + "epoch": 0.13434903047091412, + "grad_norm": 0.39920639991760254, + "learning_rate": 9.999707745670846e-05, + "loss": 0.0400286465883255, + "num_input_tokens_seen": 7942360, + "step": 485, + "train_runtime": 3997.5682, + "train_tokens_per_second": 1986.798 + }, + { + "epoch": 0.13462603878116344, + "grad_norm": 0.4753592312335968, + "learning_rate": 9.999702974297682e-05, + "loss": 0.04847956448793411, + "num_input_tokens_seen": 7958736, + "step": 486, + "train_runtime": 4005.7964, + "train_tokens_per_second": 1986.805 + }, + { + "epoch": 0.13490304709141274, + "grad_norm": 0.47061145305633545, + "learning_rate": 9.999698164291456e-05, + "loss": 0.047937266528606415, + "num_input_tokens_seen": 7975112, + "step": 487, + "train_runtime": 4014.0262, + "train_tokens_per_second": 1986.811 + }, + { + "epoch": 0.13518005540166206, + "grad_norm": 0.5106936693191528, + "learning_rate": 9.999693315652201e-05, + "loss": 0.045017607510089874, + "num_input_tokens_seen": 7991488, + "step": 488, + "train_runtime": 4022.2577, + "train_tokens_per_second": 1986.816 + }, + { + "epoch": 0.13545706371191135, + "grad_norm": 0.4730679392814636, + "learning_rate": 9.999688428379958e-05, + "loss": 0.0434904620051384, + "num_input_tokens_seen": 8007864, + "step": 489, + "train_runtime": 4030.4914, + "train_tokens_per_second": 1986.821 + }, + { + "epoch": 0.13573407202216067, + "grad_norm": 0.4832095503807068, + "learning_rate": 9.999683502474761e-05, + "loss": 0.05289595574140549, + "num_input_tokens_seen": 8024240, + "step": 490, + "train_runtime": 4038.7239, + "train_tokens_per_second": 1986.826 + }, + { + "epoch": 0.13601108033240997, + "grad_norm": 0.3527802526950836, + "learning_rate": 9.999678537936652e-05, + "loss": 0.03897542506456375, + "num_input_tokens_seen": 8040616, + "step": 491, + "train_runtime": 4046.9548, + "train_tokens_per_second": 1986.831 + }, + { + "epoch": 0.1362880886426593, + "grad_norm": 0.41247686743736267, + "learning_rate": 9.999673534765668e-05, + "loss": 0.04329197108745575, + "num_input_tokens_seen": 8056992, + "step": 492, + "train_runtime": 4055.1612, + "train_tokens_per_second": 1986.849 + }, + { + "epoch": 0.13656509695290858, + "grad_norm": 0.4355025589466095, + "learning_rate": 9.999668492961847e-05, + "loss": 0.04352717101573944, + "num_input_tokens_seen": 8073368, + "step": 493, + "train_runtime": 4063.3677, + "train_tokens_per_second": 1986.866 + }, + { + "epoch": 0.1368421052631579, + "grad_norm": 0.3918461501598358, + "learning_rate": 9.999663412525226e-05, + "loss": 0.043284978717565536, + "num_input_tokens_seen": 8089744, + "step": 494, + "train_runtime": 4071.5683, + "train_tokens_per_second": 1986.886 + }, + { + "epoch": 0.1371191135734072, + "grad_norm": 0.4751986563205719, + "learning_rate": 9.999658293455849e-05, + "loss": 0.04048747941851616, + "num_input_tokens_seen": 8106120, + "step": 495, + "train_runtime": 4079.7718, + "train_tokens_per_second": 1986.905 + }, + { + "epoch": 0.13739612188365652, + "grad_norm": 0.402007132768631, + "learning_rate": 9.999653135753752e-05, + "loss": 0.045284852385520935, + "num_input_tokens_seen": 8122496, + "step": 496, + "train_runtime": 4087.9808, + "train_tokens_per_second": 1986.921 + }, + { + "epoch": 0.1376731301939058, + "grad_norm": 0.409411758184433, + "learning_rate": 9.999647939418975e-05, + "loss": 0.04308444634079933, + "num_input_tokens_seen": 8138872, + "step": 497, + "train_runtime": 4096.1952, + "train_tokens_per_second": 1986.935 + }, + { + "epoch": 0.13795013850415513, + "grad_norm": 0.3808010220527649, + "learning_rate": 9.999642704451559e-05, + "loss": 0.040507540106773376, + "num_input_tokens_seen": 8155248, + "step": 498, + "train_runtime": 4104.3971, + "train_tokens_per_second": 1986.954 + }, + { + "epoch": 0.13822714681440443, + "grad_norm": 0.34606099128723145, + "learning_rate": 9.999637430851547e-05, + "loss": 0.033152684569358826, + "num_input_tokens_seen": 8171624, + "step": 499, + "train_runtime": 4112.6092, + "train_tokens_per_second": 1986.968 + }, + { + "epoch": 0.13850415512465375, + "grad_norm": 0.38120102882385254, + "learning_rate": 9.999632118618976e-05, + "loss": 0.04589967429637909, + "num_input_tokens_seen": 8188000, + "step": 500, + "train_runtime": 4120.8411, + "train_tokens_per_second": 1986.973 + }, + { + "epoch": 0.13878116343490304, + "grad_norm": 0.5153139233589172, + "learning_rate": 9.999626767753886e-05, + "loss": 0.05209559574723244, + "num_input_tokens_seen": 8204376, + "step": 501, + "train_runtime": 4130.4964, + "train_tokens_per_second": 1986.293 + }, + { + "epoch": 0.13905817174515236, + "grad_norm": 0.3777496814727783, + "learning_rate": 9.999621378256324e-05, + "loss": 0.03766633942723274, + "num_input_tokens_seen": 8220752, + "step": 502, + "train_runtime": 4138.7097, + "train_tokens_per_second": 1986.308 + }, + { + "epoch": 0.13933518005540166, + "grad_norm": 0.3569754362106323, + "learning_rate": 9.999615950126325e-05, + "loss": 0.04503545165061951, + "num_input_tokens_seen": 8237128, + "step": 503, + "train_runtime": 4146.9236, + "train_tokens_per_second": 1986.323 + }, + { + "epoch": 0.13961218836565098, + "grad_norm": 0.4038015305995941, + "learning_rate": 9.999610483363936e-05, + "loss": 0.046262361109256744, + "num_input_tokens_seen": 8253504, + "step": 504, + "train_runtime": 4155.1274, + "train_tokens_per_second": 1986.342 + }, + { + "epoch": 0.13988919667590027, + "grad_norm": 0.37634509801864624, + "learning_rate": 9.999604977969197e-05, + "loss": 0.044874854385852814, + "num_input_tokens_seen": 8269880, + "step": 505, + "train_runtime": 4163.3364, + "train_tokens_per_second": 1986.359 + }, + { + "epoch": 0.1401662049861496, + "grad_norm": 0.42247524857521057, + "learning_rate": 9.999599433942152e-05, + "loss": 0.03890887275338173, + "num_input_tokens_seen": 8286256, + "step": 506, + "train_runtime": 4171.5637, + "train_tokens_per_second": 1986.367 + }, + { + "epoch": 0.1404432132963989, + "grad_norm": 0.45276960730552673, + "learning_rate": 9.999593851282843e-05, + "loss": 0.04506635665893555, + "num_input_tokens_seen": 8302632, + "step": 507, + "train_runtime": 4179.7766, + "train_tokens_per_second": 1986.382 + }, + { + "epoch": 0.1407202216066482, + "grad_norm": 0.39941176772117615, + "learning_rate": 9.99958822999131e-05, + "loss": 0.0389941930770874, + "num_input_tokens_seen": 8319008, + "step": 508, + "train_runtime": 4187.9916, + "train_tokens_per_second": 1986.396 + }, + { + "epoch": 0.1409972299168975, + "grad_norm": 0.3668064475059509, + "learning_rate": 9.999582570067603e-05, + "loss": 0.040938831865787506, + "num_input_tokens_seen": 8335384, + "step": 509, + "train_runtime": 4196.2178, + "train_tokens_per_second": 1986.404 + }, + { + "epoch": 0.14127423822714683, + "grad_norm": 0.3679547607898712, + "learning_rate": 9.99957687151176e-05, + "loss": 0.04048880934715271, + "num_input_tokens_seen": 8351760, + "step": 510, + "train_runtime": 4204.4401, + "train_tokens_per_second": 1986.414 + }, + { + "epoch": 0.14155124653739612, + "grad_norm": 0.4502631723880768, + "learning_rate": 9.999571134323827e-05, + "loss": 0.04733272269368172, + "num_input_tokens_seen": 8368136, + "step": 511, + "train_runtime": 4212.661, + "train_tokens_per_second": 1986.425 + }, + { + "epoch": 0.14182825484764544, + "grad_norm": 0.38846081495285034, + "learning_rate": 9.99956535850385e-05, + "loss": 0.03811134397983551, + "num_input_tokens_seen": 8384512, + "step": 512, + "train_runtime": 4220.8708, + "train_tokens_per_second": 1986.441 + }, + { + "epoch": 0.14210526315789473, + "grad_norm": 0.42856723070144653, + "learning_rate": 9.99955954405187e-05, + "loss": 0.046691231429576874, + "num_input_tokens_seen": 8400888, + "step": 513, + "train_runtime": 4229.0923, + "train_tokens_per_second": 1986.452 + }, + { + "epoch": 0.14238227146814406, + "grad_norm": 0.3391413390636444, + "learning_rate": 9.999553690967935e-05, + "loss": 0.03822413831949234, + "num_input_tokens_seen": 8417264, + "step": 514, + "train_runtime": 4237.317, + "train_tokens_per_second": 1986.461 + }, + { + "epoch": 0.14265927977839335, + "grad_norm": 0.47407713532447815, + "learning_rate": 9.99954779925209e-05, + "loss": 0.04879220202565193, + "num_input_tokens_seen": 8433640, + "step": 515, + "train_runtime": 4245.5376, + "train_tokens_per_second": 1986.472 + }, + { + "epoch": 0.14293628808864267, + "grad_norm": 0.29918956756591797, + "learning_rate": 9.99954186890438e-05, + "loss": 0.04035411775112152, + "num_input_tokens_seen": 8450016, + "step": 516, + "train_runtime": 4253.7635, + "train_tokens_per_second": 1986.48 + }, + { + "epoch": 0.14321329639889196, + "grad_norm": 0.2538292407989502, + "learning_rate": 9.99953589992485e-05, + "loss": 0.03452184796333313, + "num_input_tokens_seen": 8466392, + "step": 517, + "train_runtime": 4261.9799, + "train_tokens_per_second": 1986.493 + }, + { + "epoch": 0.1434903047091413, + "grad_norm": 0.3810885548591614, + "learning_rate": 9.999529892313547e-05, + "loss": 0.04446618631482124, + "num_input_tokens_seen": 8482768, + "step": 518, + "train_runtime": 4270.1881, + "train_tokens_per_second": 1986.509 + }, + { + "epoch": 0.14376731301939058, + "grad_norm": 0.5138489603996277, + "learning_rate": 9.999523846070517e-05, + "loss": 0.04710788279771805, + "num_input_tokens_seen": 8499144, + "step": 519, + "train_runtime": 4278.3951, + "train_tokens_per_second": 1986.526 + }, + { + "epoch": 0.1440443213296399, + "grad_norm": 0.3393917381763458, + "learning_rate": 9.999517761195807e-05, + "loss": 0.04294002056121826, + "num_input_tokens_seen": 8515520, + "step": 520, + "train_runtime": 4286.6182, + "train_tokens_per_second": 1986.536 + }, + { + "epoch": 0.1443213296398892, + "grad_norm": 0.3406863212585449, + "learning_rate": 9.999511637689463e-05, + "loss": 0.045909564942121506, + "num_input_tokens_seen": 8531896, + "step": 521, + "train_runtime": 4294.853, + "train_tokens_per_second": 1986.54 + }, + { + "epoch": 0.14459833795013852, + "grad_norm": 0.36361920833587646, + "learning_rate": 9.999505475551534e-05, + "loss": 0.039627574384212494, + "num_input_tokens_seen": 8548272, + "step": 522, + "train_runtime": 4303.0711, + "train_tokens_per_second": 1986.551 + }, + { + "epoch": 0.1448753462603878, + "grad_norm": 0.37241896986961365, + "learning_rate": 9.999499274782067e-05, + "loss": 0.038851652294397354, + "num_input_tokens_seen": 8564648, + "step": 523, + "train_runtime": 4311.2852, + "train_tokens_per_second": 1986.565 + }, + { + "epoch": 0.14515235457063713, + "grad_norm": 0.3908030688762665, + "learning_rate": 9.999493035381109e-05, + "loss": 0.0399450957775116, + "num_input_tokens_seen": 8581024, + "step": 524, + "train_runtime": 4319.4997, + "train_tokens_per_second": 1986.578 + }, + { + "epoch": 0.14542936288088643, + "grad_norm": 0.2981976568698883, + "learning_rate": 9.99948675734871e-05, + "loss": 0.034289129078388214, + "num_input_tokens_seen": 8597400, + "step": 525, + "train_runtime": 4327.7123, + "train_tokens_per_second": 1986.592 + }, + { + "epoch": 0.14570637119113575, + "grad_norm": 0.35891062021255493, + "learning_rate": 9.999480440684916e-05, + "loss": 0.03985242545604706, + "num_input_tokens_seen": 8613776, + "step": 526, + "train_runtime": 4335.9183, + "train_tokens_per_second": 1986.609 + }, + { + "epoch": 0.14598337950138504, + "grad_norm": 0.3489942252635956, + "learning_rate": 9.999474085389778e-05, + "loss": 0.0367513932287693, + "num_input_tokens_seen": 8630152, + "step": 527, + "train_runtime": 4344.1417, + "train_tokens_per_second": 1986.618 + }, + { + "epoch": 0.14626038781163436, + "grad_norm": 0.4657258689403534, + "learning_rate": 9.999467691463344e-05, + "loss": 0.04023357853293419, + "num_input_tokens_seen": 8646528, + "step": 528, + "train_runtime": 4352.3675, + "train_tokens_per_second": 1986.626 + }, + { + "epoch": 0.14653739612188366, + "grad_norm": 0.3269367516040802, + "learning_rate": 9.999461258905666e-05, + "loss": 0.0326668955385685, + "num_input_tokens_seen": 8662904, + "step": 529, + "train_runtime": 4360.5884, + "train_tokens_per_second": 1986.637 + }, + { + "epoch": 0.14681440443213298, + "grad_norm": 0.40379950404167175, + "learning_rate": 9.999454787716789e-05, + "loss": 0.035820771008729935, + "num_input_tokens_seen": 8679280, + "step": 530, + "train_runtime": 4368.8192, + "train_tokens_per_second": 1986.642 + }, + { + "epoch": 0.14709141274238227, + "grad_norm": 0.4247462749481201, + "learning_rate": 9.999448277896767e-05, + "loss": 0.04659070819616318, + "num_input_tokens_seen": 8695656, + "step": 531, + "train_runtime": 4377.0303, + "train_tokens_per_second": 1986.657 + }, + { + "epoch": 0.14736842105263157, + "grad_norm": 0.3125925660133362, + "learning_rate": 9.999441729445648e-05, + "loss": 0.04027542099356651, + "num_input_tokens_seen": 8712032, + "step": 532, + "train_runtime": 4385.2581, + "train_tokens_per_second": 1986.663 + }, + { + "epoch": 0.1476454293628809, + "grad_norm": 0.41836294531822205, + "learning_rate": 9.999435142363484e-05, + "loss": 0.05102977901697159, + "num_input_tokens_seen": 8728408, + "step": 533, + "train_runtime": 4393.4827, + "train_tokens_per_second": 1986.672 + }, + { + "epoch": 0.14792243767313018, + "grad_norm": 0.4390362799167633, + "learning_rate": 9.999428516650325e-05, + "loss": 0.04257376864552498, + "num_input_tokens_seen": 8744784, + "step": 534, + "train_runtime": 4401.7035, + "train_tokens_per_second": 1986.682 + }, + { + "epoch": 0.1481994459833795, + "grad_norm": 0.42463231086730957, + "learning_rate": 9.999421852306223e-05, + "loss": 0.03733404353260994, + "num_input_tokens_seen": 8761160, + "step": 535, + "train_runtime": 4409.9198, + "train_tokens_per_second": 1986.694 + }, + { + "epoch": 0.1484764542936288, + "grad_norm": 0.41322875022888184, + "learning_rate": 9.999415149331228e-05, + "loss": 0.03755655884742737, + "num_input_tokens_seen": 8777536, + "step": 536, + "train_runtime": 4418.1267, + "train_tokens_per_second": 1986.71 + }, + { + "epoch": 0.14875346260387812, + "grad_norm": 0.3537989854812622, + "learning_rate": 9.999408407725393e-05, + "loss": 0.03892496973276138, + "num_input_tokens_seen": 8793912, + "step": 537, + "train_runtime": 4426.3317, + "train_tokens_per_second": 1986.727 + }, + { + "epoch": 0.1490304709141274, + "grad_norm": 0.4134999215602875, + "learning_rate": 9.999401627488769e-05, + "loss": 0.04585300385951996, + "num_input_tokens_seen": 8810288, + "step": 538, + "train_runtime": 4434.5347, + "train_tokens_per_second": 1986.745 + }, + { + "epoch": 0.14930747922437673, + "grad_norm": 0.3812928795814514, + "learning_rate": 9.999394808621411e-05, + "loss": 0.045488812029361725, + "num_input_tokens_seen": 8826664, + "step": 539, + "train_runtime": 4442.7731, + "train_tokens_per_second": 1986.747 + }, + { + "epoch": 0.14958448753462603, + "grad_norm": 0.33026015758514404, + "learning_rate": 9.999387951123369e-05, + "loss": 0.03697238862514496, + "num_input_tokens_seen": 8843040, + "step": 540, + "train_runtime": 4451.007, + "train_tokens_per_second": 1986.75 + }, + { + "epoch": 0.14986149584487535, + "grad_norm": 0.3420630693435669, + "learning_rate": 9.999381054994699e-05, + "loss": 0.0387381911277771, + "num_input_tokens_seen": 8859416, + "step": 541, + "train_runtime": 4459.2367, + "train_tokens_per_second": 1986.756 + }, + { + "epoch": 0.15013850415512464, + "grad_norm": 0.4229990839958191, + "learning_rate": 9.99937412023545e-05, + "loss": 0.0461319200694561, + "num_input_tokens_seen": 8875792, + "step": 542, + "train_runtime": 4467.4625, + "train_tokens_per_second": 1986.764 + }, + { + "epoch": 0.15041551246537396, + "grad_norm": 0.46801984310150146, + "learning_rate": 9.999367146845677e-05, + "loss": 0.04402647167444229, + "num_input_tokens_seen": 8892168, + "step": 543, + "train_runtime": 4475.6897, + "train_tokens_per_second": 1986.77 + }, + { + "epoch": 0.15069252077562326, + "grad_norm": 0.34582310914993286, + "learning_rate": 9.999360134825437e-05, + "loss": 0.04476650431752205, + "num_input_tokens_seen": 8908544, + "step": 544, + "train_runtime": 4483.9183, + "train_tokens_per_second": 1986.777 + }, + { + "epoch": 0.15096952908587258, + "grad_norm": 0.29230648279190063, + "learning_rate": 9.999353084174781e-05, + "loss": 0.03330588340759277, + "num_input_tokens_seen": 8924920, + "step": 545, + "train_runtime": 4492.1529, + "train_tokens_per_second": 1986.78 + }, + { + "epoch": 0.15124653739612187, + "grad_norm": 0.36828482151031494, + "learning_rate": 9.999345994893765e-05, + "loss": 0.04320496320724487, + "num_input_tokens_seen": 8941296, + "step": 546, + "train_runtime": 4500.3824, + "train_tokens_per_second": 1986.786 + }, + { + "epoch": 0.1515235457063712, + "grad_norm": 0.33817681670188904, + "learning_rate": 9.999338866982442e-05, + "loss": 0.036875322461128235, + "num_input_tokens_seen": 8957672, + "step": 547, + "train_runtime": 4508.6093, + "train_tokens_per_second": 1986.793 + }, + { + "epoch": 0.1518005540166205, + "grad_norm": 0.3512486517429352, + "learning_rate": 9.999331700440869e-05, + "loss": 0.03795760124921799, + "num_input_tokens_seen": 8974048, + "step": 548, + "train_runtime": 4516.8406, + "train_tokens_per_second": 1986.798 + }, + { + "epoch": 0.1520775623268698, + "grad_norm": 0.37600719928741455, + "learning_rate": 9.9993244952691e-05, + "loss": 0.03680210933089256, + "num_input_tokens_seen": 8990424, + "step": 549, + "train_runtime": 4525.0688, + "train_tokens_per_second": 1986.804 + }, + { + "epoch": 0.1523545706371191, + "grad_norm": 0.30995917320251465, + "learning_rate": 9.999317251467192e-05, + "loss": 0.034930333495140076, + "num_input_tokens_seen": 9006800, + "step": 550, + "train_runtime": 4533.2971, + "train_tokens_per_second": 1986.81 + }, + { + "epoch": 0.15263157894736842, + "grad_norm": 0.34278103709220886, + "learning_rate": 9.9993099690352e-05, + "loss": 0.04095737263560295, + "num_input_tokens_seen": 9023176, + "step": 551, + "train_runtime": 4541.528, + "train_tokens_per_second": 1986.815 + }, + { + "epoch": 0.15290858725761772, + "grad_norm": 0.45947110652923584, + "learning_rate": 9.99930264797318e-05, + "loss": 0.046734850853681564, + "num_input_tokens_seen": 9039552, + "step": 552, + "train_runtime": 4549.7614, + "train_tokens_per_second": 1986.819 + }, + { + "epoch": 0.15318559556786704, + "grad_norm": 0.32216787338256836, + "learning_rate": 9.99929528828119e-05, + "loss": 0.03838067129254341, + "num_input_tokens_seen": 9055928, + "step": 553, + "train_runtime": 4557.9867, + "train_tokens_per_second": 1986.826 + }, + { + "epoch": 0.15346260387811633, + "grad_norm": 0.32768329977989197, + "learning_rate": 9.999287889959286e-05, + "loss": 0.0413237139582634, + "num_input_tokens_seen": 9072304, + "step": 554, + "train_runtime": 4566.2109, + "train_tokens_per_second": 1986.834 + }, + { + "epoch": 0.15373961218836565, + "grad_norm": 0.3278532922267914, + "learning_rate": 9.999280453007524e-05, + "loss": 0.03934377431869507, + "num_input_tokens_seen": 9088680, + "step": 555, + "train_runtime": 4574.434, + "train_tokens_per_second": 1986.843 + }, + { + "epoch": 0.15401662049861495, + "grad_norm": 0.47450152039527893, + "learning_rate": 9.999272977425963e-05, + "loss": 0.04360464587807655, + "num_input_tokens_seen": 9105056, + "step": 556, + "train_runtime": 4582.6575, + "train_tokens_per_second": 1986.851 + }, + { + "epoch": 0.15429362880886427, + "grad_norm": 0.417397677898407, + "learning_rate": 9.99926546321466e-05, + "loss": 0.03623701632022858, + "num_input_tokens_seen": 9121432, + "step": 557, + "train_runtime": 4590.8853, + "train_tokens_per_second": 1986.857 + }, + { + "epoch": 0.15457063711911356, + "grad_norm": 0.33006197214126587, + "learning_rate": 9.999257910373674e-05, + "loss": 0.037303853780031204, + "num_input_tokens_seen": 9137808, + "step": 558, + "train_runtime": 4599.1091, + "train_tokens_per_second": 1986.865 + }, + { + "epoch": 0.15484764542936288, + "grad_norm": 0.2920289635658264, + "learning_rate": 9.999250318903065e-05, + "loss": 0.04049134626984596, + "num_input_tokens_seen": 9154184, + "step": 559, + "train_runtime": 4607.3414, + "train_tokens_per_second": 1986.869 + }, + { + "epoch": 0.15512465373961218, + "grad_norm": 0.3606850206851959, + "learning_rate": 9.999242688802886e-05, + "loss": 0.03664075583219528, + "num_input_tokens_seen": 9170560, + "step": 560, + "train_runtime": 4615.5635, + "train_tokens_per_second": 1986.878 + }, + { + "epoch": 0.1554016620498615, + "grad_norm": 0.4824540317058563, + "learning_rate": 9.999235020073201e-05, + "loss": 0.053596965968608856, + "num_input_tokens_seen": 9186936, + "step": 561, + "train_runtime": 4623.7911, + "train_tokens_per_second": 1986.884 + }, + { + "epoch": 0.1556786703601108, + "grad_norm": 0.5891644954681396, + "learning_rate": 9.999227312714067e-05, + "loss": 0.053619276732206345, + "num_input_tokens_seen": 9203312, + "step": 562, + "train_runtime": 4632.0225, + "train_tokens_per_second": 1986.888 + }, + { + "epoch": 0.15595567867036012, + "grad_norm": 0.2978259325027466, + "learning_rate": 9.999219566725544e-05, + "loss": 0.0355391725897789, + "num_input_tokens_seen": 9219688, + "step": 563, + "train_runtime": 4640.2434, + "train_tokens_per_second": 1986.898 + }, + { + "epoch": 0.1562326869806094, + "grad_norm": 0.30103883147239685, + "learning_rate": 9.999211782107694e-05, + "loss": 0.03305068612098694, + "num_input_tokens_seen": 9236064, + "step": 564, + "train_runtime": 4648.4724, + "train_tokens_per_second": 1986.903 + }, + { + "epoch": 0.15650969529085873, + "grad_norm": 0.3283030390739441, + "learning_rate": 9.999203958860572e-05, + "loss": 0.03938453644514084, + "num_input_tokens_seen": 9252440, + "step": 565, + "train_runtime": 4656.7034, + "train_tokens_per_second": 1986.908 + }, + { + "epoch": 0.15678670360110802, + "grad_norm": 0.31263551115989685, + "learning_rate": 9.999196096984245e-05, + "loss": 0.03489915281534195, + "num_input_tokens_seen": 9268816, + "step": 566, + "train_runtime": 4664.9205, + "train_tokens_per_second": 1986.918 + }, + { + "epoch": 0.15706371191135735, + "grad_norm": 0.36258062720298767, + "learning_rate": 9.999188196478769e-05, + "loss": 0.04209110140800476, + "num_input_tokens_seen": 9285192, + "step": 567, + "train_runtime": 4673.1357, + "train_tokens_per_second": 1986.93 + }, + { + "epoch": 0.15734072022160664, + "grad_norm": 0.41754046082496643, + "learning_rate": 9.999180257344206e-05, + "loss": 0.03586412966251373, + "num_input_tokens_seen": 9301568, + "step": 568, + "train_runtime": 4681.3528, + "train_tokens_per_second": 1986.94 + }, + { + "epoch": 0.15761772853185596, + "grad_norm": 0.3914431631565094, + "learning_rate": 9.999172279580618e-05, + "loss": 0.03371211886405945, + "num_input_tokens_seen": 9317944, + "step": 569, + "train_runtime": 4689.566, + "train_tokens_per_second": 1986.952 + }, + { + "epoch": 0.15789473684210525, + "grad_norm": 0.31103184819221497, + "learning_rate": 9.999164263188068e-05, + "loss": 0.03652796521782875, + "num_input_tokens_seen": 9334320, + "step": 570, + "train_runtime": 4697.7739, + "train_tokens_per_second": 1986.967 + }, + { + "epoch": 0.15817174515235458, + "grad_norm": 0.38733023405075073, + "learning_rate": 9.999156208166614e-05, + "loss": 0.039936840534210205, + "num_input_tokens_seen": 9350696, + "step": 571, + "train_runtime": 4705.9829, + "train_tokens_per_second": 1986.98 + }, + { + "epoch": 0.15844875346260387, + "grad_norm": 0.3553179204463959, + "learning_rate": 9.999148114516322e-05, + "loss": 0.039945147931575775, + "num_input_tokens_seen": 9367072, + "step": 572, + "train_runtime": 4714.1979, + "train_tokens_per_second": 1986.992 + }, + { + "epoch": 0.1587257617728532, + "grad_norm": 0.42589282989501953, + "learning_rate": 9.999139982237253e-05, + "loss": 0.04679698497056961, + "num_input_tokens_seen": 9383448, + "step": 573, + "train_runtime": 4722.4173, + "train_tokens_per_second": 1987.001 + }, + { + "epoch": 0.15900277008310248, + "grad_norm": 0.3138888478279114, + "learning_rate": 9.99913181132947e-05, + "loss": 0.03734732046723366, + "num_input_tokens_seen": 9399824, + "step": 574, + "train_runtime": 4730.6309, + "train_tokens_per_second": 1987.013 + }, + { + "epoch": 0.1592797783933518, + "grad_norm": 0.2935049533843994, + "learning_rate": 9.999123601793036e-05, + "loss": 0.033087536692619324, + "num_input_tokens_seen": 9416200, + "step": 575, + "train_runtime": 4738.8368, + "train_tokens_per_second": 1987.028 + }, + { + "epoch": 0.1595567867036011, + "grad_norm": 0.34437668323516846, + "learning_rate": 9.999115353628015e-05, + "loss": 0.04069557785987854, + "num_input_tokens_seen": 9432576, + "step": 576, + "train_runtime": 4747.052, + "train_tokens_per_second": 1987.039 + }, + { + "epoch": 0.15983379501385042, + "grad_norm": 0.35058748722076416, + "learning_rate": 9.99910706683447e-05, + "loss": 0.04071161150932312, + "num_input_tokens_seen": 9448952, + "step": 577, + "train_runtime": 4755.2572, + "train_tokens_per_second": 1987.054 + }, + { + "epoch": 0.16011080332409972, + "grad_norm": 0.314387708902359, + "learning_rate": 9.999098741412466e-05, + "loss": 0.03225620090961456, + "num_input_tokens_seen": 9465328, + "step": 578, + "train_runtime": 4763.4715, + "train_tokens_per_second": 1987.065 + }, + { + "epoch": 0.16038781163434904, + "grad_norm": 0.30339139699935913, + "learning_rate": 9.999090377362066e-05, + "loss": 0.0377013199031353, + "num_input_tokens_seen": 9481704, + "step": 579, + "train_runtime": 4771.6826, + "train_tokens_per_second": 1987.078 + }, + { + "epoch": 0.16066481994459833, + "grad_norm": 0.2814827263355255, + "learning_rate": 9.999081974683336e-05, + "loss": 0.032003968954086304, + "num_input_tokens_seen": 9498080, + "step": 580, + "train_runtime": 4779.8926, + "train_tokens_per_second": 1987.091 + }, + { + "epoch": 0.16094182825484765, + "grad_norm": 0.3734593093395233, + "learning_rate": 9.99907353337634e-05, + "loss": 0.039743270725011826, + "num_input_tokens_seen": 9514456, + "step": 581, + "train_runtime": 4788.1058, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.16121883656509695, + "grad_norm": 0.36695948243141174, + "learning_rate": 9.999065053441144e-05, + "loss": 0.041670504957437515, + "num_input_tokens_seen": 9530832, + "step": 582, + "train_runtime": 4796.3163, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.16149584487534627, + "grad_norm": 0.2783602476119995, + "learning_rate": 9.999056534877811e-05, + "loss": 0.03543706238269806, + "num_input_tokens_seen": 9547208, + "step": 583, + "train_runtime": 4804.5219, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.16177285318559556, + "grad_norm": 0.2693309485912323, + "learning_rate": 9.999047977686411e-05, + "loss": 0.031914591789245605, + "num_input_tokens_seen": 9563584, + "step": 584, + "train_runtime": 4812.7273, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.16204986149584488, + "grad_norm": 0.39359816908836365, + "learning_rate": 9.999039381867005e-05, + "loss": 0.03920818120241165, + "num_input_tokens_seen": 9579960, + "step": 585, + "train_runtime": 4820.934, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 0.16232686980609418, + "grad_norm": 0.2769273519515991, + "learning_rate": 9.999030747419667e-05, + "loss": 0.03343990072607994, + "num_input_tokens_seen": 9596336, + "step": 586, + "train_runtime": 4829.1376, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.1626038781163435, + "grad_norm": 0.3857783377170563, + "learning_rate": 9.999022074344456e-05, + "loss": 0.042146917432546616, + "num_input_tokens_seen": 9612712, + "step": 587, + "train_runtime": 4837.3539, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 0.1628808864265928, + "grad_norm": 0.31259584426879883, + "learning_rate": 9.999013362641443e-05, + "loss": 0.04155489057302475, + "num_input_tokens_seen": 9629088, + "step": 588, + "train_runtime": 4845.5734, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.1631578947368421, + "grad_norm": 0.3039202392101288, + "learning_rate": 9.999004612310694e-05, + "loss": 0.04005073010921478, + "num_input_tokens_seen": 9645464, + "step": 589, + "train_runtime": 4853.8003, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.1634349030470914, + "grad_norm": 0.28345537185668945, + "learning_rate": 9.998995823352276e-05, + "loss": 0.029278580099344254, + "num_input_tokens_seen": 9661840, + "step": 590, + "train_runtime": 4862.0274, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 0.16371191135734073, + "grad_norm": 0.28483647108078003, + "learning_rate": 9.99898699576626e-05, + "loss": 0.03587590157985687, + "num_input_tokens_seen": 9678216, + "step": 591, + "train_runtime": 4870.2523, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.16398891966759002, + "grad_norm": 0.40859806537628174, + "learning_rate": 9.99897812955271e-05, + "loss": 0.038734424859285355, + "num_input_tokens_seen": 9694592, + "step": 592, + "train_runtime": 4878.4819, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 0.16426592797783934, + "grad_norm": 0.3220232427120209, + "learning_rate": 9.998969224711698e-05, + "loss": 0.038320742547512054, + "num_input_tokens_seen": 9710968, + "step": 593, + "train_runtime": 4886.7102, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 0.16454293628808864, + "grad_norm": 0.26126691699028015, + "learning_rate": 9.998960281243293e-05, + "loss": 0.03164512291550636, + "num_input_tokens_seen": 9727344, + "step": 594, + "train_runtime": 4894.9377, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 0.16481994459833796, + "grad_norm": 0.33305591344833374, + "learning_rate": 9.998951299147561e-05, + "loss": 0.036703091114759445, + "num_input_tokens_seen": 9743720, + "step": 595, + "train_runtime": 4903.1578, + "train_tokens_per_second": 1987.234 + }, + { + "epoch": 0.16509695290858725, + "grad_norm": 0.3185664117336273, + "learning_rate": 9.998942278424572e-05, + "loss": 0.032960161566734314, + "num_input_tokens_seen": 9760096, + "step": 596, + "train_runtime": 4911.3822, + "train_tokens_per_second": 1987.24 + }, + { + "epoch": 0.16537396121883657, + "grad_norm": 0.3144180178642273, + "learning_rate": 9.998933219074398e-05, + "loss": 0.038415372371673584, + "num_input_tokens_seen": 9776472, + "step": 597, + "train_runtime": 4919.6109, + "train_tokens_per_second": 1987.245 + }, + { + "epoch": 0.16565096952908587, + "grad_norm": 0.3369980454444885, + "learning_rate": 9.998924121097107e-05, + "loss": 0.03827934339642525, + "num_input_tokens_seen": 9792848, + "step": 598, + "train_runtime": 4927.8389, + "train_tokens_per_second": 1987.25 + }, + { + "epoch": 0.1659279778393352, + "grad_norm": 0.38625088334083557, + "learning_rate": 9.99891498449277e-05, + "loss": 0.035148054361343384, + "num_input_tokens_seen": 9809224, + "step": 599, + "train_runtime": 4936.061, + "train_tokens_per_second": 1987.257 + }, + { + "epoch": 0.16620498614958448, + "grad_norm": 0.28768640756607056, + "learning_rate": 9.998905809261459e-05, + "loss": 0.033397797495126724, + "num_input_tokens_seen": 9825600, + "step": 600, + "train_runtime": 4944.2815, + "train_tokens_per_second": 1987.265 + }, + { + "epoch": 0.1664819944598338, + "grad_norm": 0.34920749068260193, + "learning_rate": 9.998896595403242e-05, + "loss": 0.03295561298727989, + "num_input_tokens_seen": 9841976, + "step": 601, + "train_runtime": 4954.2705, + "train_tokens_per_second": 1986.564 + }, + { + "epoch": 0.1667590027700831, + "grad_norm": 0.32349029183387756, + "learning_rate": 9.998887342918193e-05, + "loss": 0.03570696339011192, + "num_input_tokens_seen": 9858352, + "step": 602, + "train_runtime": 4962.4872, + "train_tokens_per_second": 1986.575 + }, + { + "epoch": 0.16703601108033242, + "grad_norm": 0.2988620698451996, + "learning_rate": 9.998878051806382e-05, + "loss": 0.03337020054459572, + "num_input_tokens_seen": 9874728, + "step": 603, + "train_runtime": 4970.711, + "train_tokens_per_second": 1986.583 + }, + { + "epoch": 0.1673130193905817, + "grad_norm": 0.2667366564273834, + "learning_rate": 9.99886872206788e-05, + "loss": 0.03370322659611702, + "num_input_tokens_seen": 9891104, + "step": 604, + "train_runtime": 4978.9314, + "train_tokens_per_second": 1986.592 + }, + { + "epoch": 0.16759002770083103, + "grad_norm": 0.3126153349876404, + "learning_rate": 9.99885935370276e-05, + "loss": 0.03170401602983475, + "num_input_tokens_seen": 9907480, + "step": 605, + "train_runtime": 4987.1557, + "train_tokens_per_second": 1986.599 + }, + { + "epoch": 0.16786703601108033, + "grad_norm": 0.4531151354312897, + "learning_rate": 9.998849946711095e-05, + "loss": 0.04312300682067871, + "num_input_tokens_seen": 9923856, + "step": 606, + "train_runtime": 4995.3802, + "train_tokens_per_second": 1986.607 + }, + { + "epoch": 0.16814404432132965, + "grad_norm": 0.2969793677330017, + "learning_rate": 9.998840501092957e-05, + "loss": 0.03367358818650246, + "num_input_tokens_seen": 9940232, + "step": 607, + "train_runtime": 5003.6093, + "train_tokens_per_second": 1986.612 + }, + { + "epoch": 0.16842105263157894, + "grad_norm": 0.37572452425956726, + "learning_rate": 9.99883101684842e-05, + "loss": 0.03826024755835533, + "num_input_tokens_seen": 9956608, + "step": 608, + "train_runtime": 5011.8358, + "train_tokens_per_second": 1986.619 + }, + { + "epoch": 0.16869806094182827, + "grad_norm": 0.3168768286705017, + "learning_rate": 9.998821493977555e-05, + "loss": 0.0354745015501976, + "num_input_tokens_seen": 9972984, + "step": 609, + "train_runtime": 5020.0662, + "train_tokens_per_second": 1986.624 + }, + { + "epoch": 0.16897506925207756, + "grad_norm": 0.31048035621643066, + "learning_rate": 9.998811932480438e-05, + "loss": 0.03580518066883087, + "num_input_tokens_seen": 9989360, + "step": 610, + "train_runtime": 5028.2961, + "train_tokens_per_second": 1986.629 + }, + { + "epoch": 0.16925207756232688, + "grad_norm": 0.2719159722328186, + "learning_rate": 9.99880233235714e-05, + "loss": 0.03102930448949337, + "num_input_tokens_seen": 10005736, + "step": 611, + "train_runtime": 5036.5196, + "train_tokens_per_second": 1986.637 + }, + { + "epoch": 0.16952908587257617, + "grad_norm": 0.2631823718547821, + "learning_rate": 9.99879269360774e-05, + "loss": 0.036860737949609756, + "num_input_tokens_seen": 10022112, + "step": 612, + "train_runtime": 5044.762, + "train_tokens_per_second": 1986.637 + }, + { + "epoch": 0.1698060941828255, + "grad_norm": 0.2952190935611725, + "learning_rate": 9.998783016232308e-05, + "loss": 0.0304066464304924, + "num_input_tokens_seen": 10038488, + "step": 613, + "train_runtime": 5052.9953, + "train_tokens_per_second": 1986.641 + }, + { + "epoch": 0.1700831024930748, + "grad_norm": 0.28800198435783386, + "learning_rate": 9.998773300230922e-05, + "loss": 0.03669572249054909, + "num_input_tokens_seen": 10054864, + "step": 614, + "train_runtime": 5061.2259, + "train_tokens_per_second": 1986.646 + }, + { + "epoch": 0.1703601108033241, + "grad_norm": 0.2721065580844879, + "learning_rate": 9.998763545603654e-05, + "loss": 0.02993099018931389, + "num_input_tokens_seen": 10071240, + "step": 615, + "train_runtime": 5069.4534, + "train_tokens_per_second": 1986.652 + }, + { + "epoch": 0.1706371191135734, + "grad_norm": 0.29938215017318726, + "learning_rate": 9.99875375235058e-05, + "loss": 0.03363535553216934, + "num_input_tokens_seen": 10087616, + "step": 616, + "train_runtime": 5077.6857, + "train_tokens_per_second": 1986.656 + }, + { + "epoch": 0.17091412742382273, + "grad_norm": 0.3272131085395813, + "learning_rate": 9.998743920471776e-05, + "loss": 0.03660627081990242, + "num_input_tokens_seen": 10103992, + "step": 617, + "train_runtime": 5085.911, + "train_tokens_per_second": 1986.663 + }, + { + "epoch": 0.17119113573407202, + "grad_norm": 0.32105258107185364, + "learning_rate": 9.998734049967319e-05, + "loss": 0.03347909078001976, + "num_input_tokens_seen": 10120368, + "step": 618, + "train_runtime": 5094.1381, + "train_tokens_per_second": 1986.669 + }, + { + "epoch": 0.17146814404432134, + "grad_norm": 0.31419143080711365, + "learning_rate": 9.998724140837287e-05, + "loss": 0.03587684780359268, + "num_input_tokens_seen": 10136744, + "step": 619, + "train_runtime": 5102.3631, + "train_tokens_per_second": 1986.676 + }, + { + "epoch": 0.17174515235457063, + "grad_norm": 0.2848030924797058, + "learning_rate": 9.998714193081753e-05, + "loss": 0.03325265645980835, + "num_input_tokens_seen": 10153120, + "step": 620, + "train_runtime": 5110.5868, + "train_tokens_per_second": 1986.684 + }, + { + "epoch": 0.17202216066481996, + "grad_norm": 0.37927255034446716, + "learning_rate": 9.998704206700793e-05, + "loss": 0.03650137409567833, + "num_input_tokens_seen": 10169496, + "step": 621, + "train_runtime": 5118.8136, + "train_tokens_per_second": 1986.69 + }, + { + "epoch": 0.17229916897506925, + "grad_norm": 0.2450612634420395, + "learning_rate": 9.99869418169449e-05, + "loss": 0.032409217208623886, + "num_input_tokens_seen": 10185872, + "step": 622, + "train_runtime": 5127.0371, + "train_tokens_per_second": 1986.698 + }, + { + "epoch": 0.17257617728531857, + "grad_norm": 0.3291293978691101, + "learning_rate": 9.998684118062915e-05, + "loss": 0.03728678449988365, + "num_input_tokens_seen": 10202248, + "step": 623, + "train_runtime": 5135.2612, + "train_tokens_per_second": 1986.705 + }, + { + "epoch": 0.17285318559556787, + "grad_norm": 0.21044185757637024, + "learning_rate": 9.998674015806148e-05, + "loss": 0.02694820985198021, + "num_input_tokens_seen": 10218624, + "step": 624, + "train_runtime": 5143.4852, + "train_tokens_per_second": 1986.712 + }, + { + "epoch": 0.1731301939058172, + "grad_norm": 0.30427950620651245, + "learning_rate": 9.99866387492427e-05, + "loss": 0.03706635534763336, + "num_input_tokens_seen": 10235000, + "step": 625, + "train_runtime": 5151.7191, + "train_tokens_per_second": 1986.715 + }, + { + "epoch": 0.17340720221606648, + "grad_norm": 0.25269028544425964, + "learning_rate": 9.998653695417356e-05, + "loss": 0.031120534986257553, + "num_input_tokens_seen": 10251376, + "step": 626, + "train_runtime": 5159.9525, + "train_tokens_per_second": 1986.719 + }, + { + "epoch": 0.1736842105263158, + "grad_norm": 0.35486090183258057, + "learning_rate": 9.998643477285486e-05, + "loss": 0.03715316951274872, + "num_input_tokens_seen": 10267752, + "step": 627, + "train_runtime": 5168.1842, + "train_tokens_per_second": 1986.723 + }, + { + "epoch": 0.1739612188365651, + "grad_norm": 0.279275506734848, + "learning_rate": 9.998633220528737e-05, + "loss": 0.027981547638773918, + "num_input_tokens_seen": 10284128, + "step": 628, + "train_runtime": 5176.4091, + "train_tokens_per_second": 1986.73 + }, + { + "epoch": 0.17423822714681442, + "grad_norm": 0.34654173254966736, + "learning_rate": 9.998622925147192e-05, + "loss": 0.041155777871608734, + "num_input_tokens_seen": 10300504, + "step": 629, + "train_runtime": 5184.6227, + "train_tokens_per_second": 1986.741 + }, + { + "epoch": 0.1745152354570637, + "grad_norm": 0.3701731562614441, + "learning_rate": 9.998612591140927e-05, + "loss": 0.03714402765035629, + "num_input_tokens_seen": 10316880, + "step": 630, + "train_runtime": 5192.8416, + "train_tokens_per_second": 1986.75 + }, + { + "epoch": 0.17479224376731303, + "grad_norm": 0.2814449667930603, + "learning_rate": 9.998602218510022e-05, + "loss": 0.03499933332204819, + "num_input_tokens_seen": 10333256, + "step": 631, + "train_runtime": 5201.0661, + "train_tokens_per_second": 1986.757 + }, + { + "epoch": 0.17506925207756233, + "grad_norm": 0.3294924199581146, + "learning_rate": 9.99859180725456e-05, + "loss": 0.037025850266218185, + "num_input_tokens_seen": 10349632, + "step": 632, + "train_runtime": 5209.3015, + "train_tokens_per_second": 1986.76 + }, + { + "epoch": 0.17534626038781165, + "grad_norm": 0.2684018313884735, + "learning_rate": 9.99858135737462e-05, + "loss": 0.03199697658419609, + "num_input_tokens_seen": 10366008, + "step": 633, + "train_runtime": 5217.5286, + "train_tokens_per_second": 1986.766 + }, + { + "epoch": 0.17562326869806094, + "grad_norm": 0.2421170473098755, + "learning_rate": 9.998570868870283e-05, + "loss": 0.028730664402246475, + "num_input_tokens_seen": 10382384, + "step": 634, + "train_runtime": 5225.7611, + "train_tokens_per_second": 1986.77 + }, + { + "epoch": 0.17590027700831026, + "grad_norm": 0.3147653043270111, + "learning_rate": 9.998560341741628e-05, + "loss": 0.03115054965019226, + "num_input_tokens_seen": 10398760, + "step": 635, + "train_runtime": 5233.9987, + "train_tokens_per_second": 1986.772 + }, + { + "epoch": 0.17617728531855956, + "grad_norm": 0.26865777373313904, + "learning_rate": 9.99854977598874e-05, + "loss": 0.03404965624213219, + "num_input_tokens_seen": 10415136, + "step": 636, + "train_runtime": 5242.2334, + "train_tokens_per_second": 1986.775 + }, + { + "epoch": 0.17645429362880888, + "grad_norm": 0.2520793080329895, + "learning_rate": 9.998539171611697e-05, + "loss": 0.03407640382647514, + "num_input_tokens_seen": 10431512, + "step": 637, + "train_runtime": 5250.4644, + "train_tokens_per_second": 1986.779 + }, + { + "epoch": 0.17673130193905817, + "grad_norm": 0.4040892422199249, + "learning_rate": 9.998528528610583e-05, + "loss": 0.03998992592096329, + "num_input_tokens_seen": 10447888, + "step": 638, + "train_runtime": 5258.6833, + "train_tokens_per_second": 1986.788 + }, + { + "epoch": 0.17700831024930747, + "grad_norm": 0.4579291343688965, + "learning_rate": 9.998517846985479e-05, + "loss": 0.03773218393325806, + "num_input_tokens_seen": 10464264, + "step": 639, + "train_runtime": 5266.903, + "train_tokens_per_second": 1986.796 + }, + { + "epoch": 0.1772853185595568, + "grad_norm": 0.3428322672843933, + "learning_rate": 9.998507126736469e-05, + "loss": 0.03609687462449074, + "num_input_tokens_seen": 10480640, + "step": 640, + "train_runtime": 5275.1204, + "train_tokens_per_second": 1986.806 + }, + { + "epoch": 0.17756232686980608, + "grad_norm": 0.24259524047374725, + "learning_rate": 9.998496367863634e-05, + "loss": 0.029704401269555092, + "num_input_tokens_seen": 10497016, + "step": 641, + "train_runtime": 5283.3312, + "train_tokens_per_second": 1986.818 + }, + { + "epoch": 0.1778393351800554, + "grad_norm": 0.39456692337989807, + "learning_rate": 9.99848557036706e-05, + "loss": 0.033120643347501755, + "num_input_tokens_seen": 10513392, + "step": 642, + "train_runtime": 5291.5356, + "train_tokens_per_second": 1986.832 + }, + { + "epoch": 0.1781163434903047, + "grad_norm": 0.3259308338165283, + "learning_rate": 9.998474734246828e-05, + "loss": 0.030435286462306976, + "num_input_tokens_seen": 10529768, + "step": 643, + "train_runtime": 5299.754, + "train_tokens_per_second": 1986.841 + }, + { + "epoch": 0.17839335180055402, + "grad_norm": 0.22573968768119812, + "learning_rate": 9.998463859503022e-05, + "loss": 0.030549336224794388, + "num_input_tokens_seen": 10546144, + "step": 644, + "train_runtime": 5307.9667, + "train_tokens_per_second": 1986.852 + }, + { + "epoch": 0.1786703601108033, + "grad_norm": 0.29036852717399597, + "learning_rate": 9.998452946135728e-05, + "loss": 0.03865551948547363, + "num_input_tokens_seen": 10562520, + "step": 645, + "train_runtime": 5316.1903, + "train_tokens_per_second": 1986.859 + }, + { + "epoch": 0.17894736842105263, + "grad_norm": 0.435884565114975, + "learning_rate": 9.998441994145028e-05, + "loss": 0.041301727294921875, + "num_input_tokens_seen": 10578896, + "step": 646, + "train_runtime": 5324.4018, + "train_tokens_per_second": 1986.87 + }, + { + "epoch": 0.17922437673130193, + "grad_norm": 0.26364666223526, + "learning_rate": 9.998431003531007e-05, + "loss": 0.03310798108577728, + "num_input_tokens_seen": 10595272, + "step": 647, + "train_runtime": 5332.6098, + "train_tokens_per_second": 1986.883 + }, + { + "epoch": 0.17950138504155125, + "grad_norm": 0.29987338185310364, + "learning_rate": 9.998419974293752e-05, + "loss": 0.028734199702739716, + "num_input_tokens_seen": 10611648, + "step": 648, + "train_runtime": 5340.8219, + "train_tokens_per_second": 1986.894 + }, + { + "epoch": 0.17977839335180054, + "grad_norm": 0.2795774042606354, + "learning_rate": 9.998408906433345e-05, + "loss": 0.03503423556685448, + "num_input_tokens_seen": 10628024, + "step": 649, + "train_runtime": 5349.0306, + "train_tokens_per_second": 1986.907 + }, + { + "epoch": 0.18005540166204986, + "grad_norm": 0.333110511302948, + "learning_rate": 9.998397799949872e-05, + "loss": 0.03395010158419609, + "num_input_tokens_seen": 10644400, + "step": 650, + "train_runtime": 5357.2383, + "train_tokens_per_second": 1986.919 + }, + { + "epoch": 0.18033240997229916, + "grad_norm": 0.31093117594718933, + "learning_rate": 9.998386654843421e-05, + "loss": 0.0393977165222168, + "num_input_tokens_seen": 10660776, + "step": 651, + "train_runtime": 5365.4519, + "train_tokens_per_second": 1986.93 + }, + { + "epoch": 0.18060941828254848, + "grad_norm": 0.4421995282173157, + "learning_rate": 9.998375471114079e-05, + "loss": 0.04055335000157356, + "num_input_tokens_seen": 10677152, + "step": 652, + "train_runtime": 5373.6574, + "train_tokens_per_second": 1986.943 + }, + { + "epoch": 0.18088642659279777, + "grad_norm": 0.2746591567993164, + "learning_rate": 9.998364248761928e-05, + "loss": 0.038429223001003265, + "num_input_tokens_seen": 10693528, + "step": 653, + "train_runtime": 5381.8777, + "train_tokens_per_second": 1986.951 + }, + { + "epoch": 0.1811634349030471, + "grad_norm": 0.2598212957382202, + "learning_rate": 9.998352987787058e-05, + "loss": 0.025797748938202858, + "num_input_tokens_seen": 10709904, + "step": 654, + "train_runtime": 5390.1033, + "train_tokens_per_second": 1986.957 + }, + { + "epoch": 0.1814404432132964, + "grad_norm": 0.2685640752315521, + "learning_rate": 9.998341688189554e-05, + "loss": 0.03067754954099655, + "num_input_tokens_seen": 10726280, + "step": 655, + "train_runtime": 5398.3313, + "train_tokens_per_second": 1986.962 + }, + { + "epoch": 0.1817174515235457, + "grad_norm": 0.2970483899116516, + "learning_rate": 9.998330349969507e-05, + "loss": 0.033578645437955856, + "num_input_tokens_seen": 10742656, + "step": 656, + "train_runtime": 5406.5523, + "train_tokens_per_second": 1986.97 + }, + { + "epoch": 0.181994459833795, + "grad_norm": 0.3988739550113678, + "learning_rate": 9.998318973127e-05, + "loss": 0.04003111273050308, + "num_input_tokens_seen": 10759032, + "step": 657, + "train_runtime": 5414.7781, + "train_tokens_per_second": 1986.976 + }, + { + "epoch": 0.18227146814404432, + "grad_norm": 0.25360459089279175, + "learning_rate": 9.998307557662124e-05, + "loss": 0.031260181218385696, + "num_input_tokens_seen": 10775408, + "step": 658, + "train_runtime": 5423.0019, + "train_tokens_per_second": 1986.982 + }, + { + "epoch": 0.18254847645429362, + "grad_norm": 0.2559495270252228, + "learning_rate": 9.998296103574967e-05, + "loss": 0.03114575520157814, + "num_input_tokens_seen": 10791784, + "step": 659, + "train_runtime": 5431.2237, + "train_tokens_per_second": 1986.989 + }, + { + "epoch": 0.18282548476454294, + "grad_norm": 0.33316299319267273, + "learning_rate": 9.998284610865615e-05, + "loss": 0.029527418315410614, + "num_input_tokens_seen": 10808160, + "step": 660, + "train_runtime": 5439.4542, + "train_tokens_per_second": 1986.993 + }, + { + "epoch": 0.18310249307479223, + "grad_norm": 0.3753569424152374, + "learning_rate": 9.998273079534159e-05, + "loss": 0.03177731856703758, + "num_input_tokens_seen": 10824536, + "step": 661, + "train_runtime": 5447.6751, + "train_tokens_per_second": 1987.001 + }, + { + "epoch": 0.18337950138504155, + "grad_norm": 0.32232266664505005, + "learning_rate": 9.998261509580688e-05, + "loss": 0.034058116376399994, + "num_input_tokens_seen": 10840912, + "step": 662, + "train_runtime": 5455.8986, + "train_tokens_per_second": 1987.008 + }, + { + "epoch": 0.18365650969529085, + "grad_norm": 0.33636170625686646, + "learning_rate": 9.998249901005292e-05, + "loss": 0.03962479904294014, + "num_input_tokens_seen": 10857288, + "step": 663, + "train_runtime": 5464.1143, + "train_tokens_per_second": 1987.017 + }, + { + "epoch": 0.18393351800554017, + "grad_norm": 0.23145028948783875, + "learning_rate": 9.998238253808058e-05, + "loss": 0.03339424729347229, + "num_input_tokens_seen": 10873664, + "step": 664, + "train_runtime": 5472.3264, + "train_tokens_per_second": 1987.028 + }, + { + "epoch": 0.18421052631578946, + "grad_norm": 0.2759709060192108, + "learning_rate": 9.998226567989079e-05, + "loss": 0.029782375320792198, + "num_input_tokens_seen": 10890040, + "step": 665, + "train_runtime": 5480.5313, + "train_tokens_per_second": 1987.041 + }, + { + "epoch": 0.18448753462603878, + "grad_norm": 0.3164655268192291, + "learning_rate": 9.998214843548444e-05, + "loss": 0.03250996023416519, + "num_input_tokens_seen": 10906416, + "step": 666, + "train_runtime": 5488.7396, + "train_tokens_per_second": 1987.053 + }, + { + "epoch": 0.18476454293628808, + "grad_norm": 0.3506127595901489, + "learning_rate": 9.998203080486241e-05, + "loss": 0.031668469309806824, + "num_input_tokens_seen": 10922792, + "step": 667, + "train_runtime": 5496.9526, + "train_tokens_per_second": 1987.063 + }, + { + "epoch": 0.1850415512465374, + "grad_norm": 0.2425169050693512, + "learning_rate": 9.998191278802567e-05, + "loss": 0.028270741924643517, + "num_input_tokens_seen": 10939168, + "step": 668, + "train_runtime": 5505.1851, + "train_tokens_per_second": 1987.066 + }, + { + "epoch": 0.1853185595567867, + "grad_norm": 0.2778569161891937, + "learning_rate": 9.998179438497508e-05, + "loss": 0.03254803642630577, + "num_input_tokens_seen": 10955544, + "step": 669, + "train_runtime": 5513.4078, + "train_tokens_per_second": 1987.073 + }, + { + "epoch": 0.18559556786703602, + "grad_norm": 0.24324151873588562, + "learning_rate": 9.998167559571158e-05, + "loss": 0.02657662332057953, + "num_input_tokens_seen": 10971920, + "step": 670, + "train_runtime": 5521.6236, + "train_tokens_per_second": 1987.082 + }, + { + "epoch": 0.1858725761772853, + "grad_norm": 0.31123441457748413, + "learning_rate": 9.998155642023608e-05, + "loss": 0.03268449008464813, + "num_input_tokens_seen": 10988296, + "step": 671, + "train_runtime": 5529.8602, + "train_tokens_per_second": 1987.084 + }, + { + "epoch": 0.18614958448753463, + "grad_norm": 0.31652769446372986, + "learning_rate": 9.998143685854949e-05, + "loss": 0.03767869621515274, + "num_input_tokens_seen": 11004672, + "step": 672, + "train_runtime": 5538.0932, + "train_tokens_per_second": 1987.087 + }, + { + "epoch": 0.18642659279778392, + "grad_norm": 0.2801250219345093, + "learning_rate": 9.998131691065275e-05, + "loss": 0.027929004281759262, + "num_input_tokens_seen": 11021048, + "step": 673, + "train_runtime": 5546.3255, + "train_tokens_per_second": 1987.09 + }, + { + "epoch": 0.18670360110803325, + "grad_norm": 0.3171105682849884, + "learning_rate": 9.998119657654679e-05, + "loss": 0.03187079727649689, + "num_input_tokens_seen": 11037424, + "step": 674, + "train_runtime": 5554.5581, + "train_tokens_per_second": 1987.093 + }, + { + "epoch": 0.18698060941828254, + "grad_norm": 0.36600664258003235, + "learning_rate": 9.998107585623253e-05, + "loss": 0.03876674175262451, + "num_input_tokens_seen": 11053800, + "step": 675, + "train_runtime": 5562.7932, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.18725761772853186, + "grad_norm": 0.24421526491641998, + "learning_rate": 9.998095474971089e-05, + "loss": 0.03434855118393898, + "num_input_tokens_seen": 11070176, + "step": 676, + "train_runtime": 5571.0254, + "train_tokens_per_second": 1987.098 + }, + { + "epoch": 0.18753462603878115, + "grad_norm": 0.2391757220029831, + "learning_rate": 9.998083325698284e-05, + "loss": 0.03253127261996269, + "num_input_tokens_seen": 11086552, + "step": 677, + "train_runtime": 5579.2525, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.18781163434903048, + "grad_norm": 0.3194746673107147, + "learning_rate": 9.998071137804928e-05, + "loss": 0.0393541157245636, + "num_input_tokens_seen": 11102928, + "step": 678, + "train_runtime": 5587.4709, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.18808864265927977, + "grad_norm": 0.23425212502479553, + "learning_rate": 9.998058911291119e-05, + "loss": 0.028910916298627853, + "num_input_tokens_seen": 11119304, + "step": 679, + "train_runtime": 5595.6796, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.1883656509695291, + "grad_norm": 0.27521607279777527, + "learning_rate": 9.998046646156949e-05, + "loss": 0.02987668290734291, + "num_input_tokens_seen": 11135680, + "step": 680, + "train_runtime": 5603.9052, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.18864265927977839, + "grad_norm": 0.2502542734146118, + "learning_rate": 9.998034342402513e-05, + "loss": 0.028432641178369522, + "num_input_tokens_seen": 11152056, + "step": 681, + "train_runtime": 5612.1342, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.1889196675900277, + "grad_norm": 0.35816285014152527, + "learning_rate": 9.998022000027906e-05, + "loss": 0.037086766213178635, + "num_input_tokens_seen": 11168432, + "step": 682, + "train_runtime": 5620.3686, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.189196675900277, + "grad_norm": 0.34554219245910645, + "learning_rate": 9.998009619033224e-05, + "loss": 0.036241017282009125, + "num_input_tokens_seen": 11184808, + "step": 683, + "train_runtime": 5628.6111, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.18947368421052632, + "grad_norm": 0.2652454078197479, + "learning_rate": 9.997997199418562e-05, + "loss": 0.0315791517496109, + "num_input_tokens_seen": 11201184, + "step": 684, + "train_runtime": 5636.8441, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.18975069252077562, + "grad_norm": 0.22164076566696167, + "learning_rate": 9.997984741184016e-05, + "loss": 0.02674875222146511, + "num_input_tokens_seen": 11217560, + "step": 685, + "train_runtime": 5645.0707, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.19002770083102494, + "grad_norm": 0.29365232586860657, + "learning_rate": 9.997972244329684e-05, + "loss": 0.03381628170609474, + "num_input_tokens_seen": 11233936, + "step": 686, + "train_runtime": 5653.297, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.19030470914127423, + "grad_norm": 0.3124910891056061, + "learning_rate": 9.99795970885566e-05, + "loss": 0.03178275749087334, + "num_input_tokens_seen": 11250312, + "step": 687, + "train_runtime": 5661.5244, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.19058171745152355, + "grad_norm": 0.19379925727844238, + "learning_rate": 9.997947134762045e-05, + "loss": 0.026300504803657532, + "num_input_tokens_seen": 11266688, + "step": 688, + "train_runtime": 5669.7563, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.19085872576177285, + "grad_norm": 0.5484428405761719, + "learning_rate": 9.99793452204893e-05, + "loss": 0.03020613081753254, + "num_input_tokens_seen": 11283064, + "step": 689, + "train_runtime": 5677.9711, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.19113573407202217, + "grad_norm": 0.26573702692985535, + "learning_rate": 9.997921870716417e-05, + "loss": 0.03175842761993408, + "num_input_tokens_seen": 11299440, + "step": 690, + "train_runtime": 5686.1964, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.19141274238227146, + "grad_norm": 0.23622217774391174, + "learning_rate": 9.997909180764603e-05, + "loss": 0.03141329437494278, + "num_input_tokens_seen": 11315816, + "step": 691, + "train_runtime": 5694.4285, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.19168975069252078, + "grad_norm": 0.27838587760925293, + "learning_rate": 9.997896452193584e-05, + "loss": 0.03721851110458374, + "num_input_tokens_seen": 11332192, + "step": 692, + "train_runtime": 5702.6557, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.19196675900277008, + "grad_norm": 0.26820242404937744, + "learning_rate": 9.997883685003459e-05, + "loss": 0.03310339152812958, + "num_input_tokens_seen": 11348568, + "step": 693, + "train_runtime": 5710.8857, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.1922437673130194, + "grad_norm": 0.2556637227535248, + "learning_rate": 9.997870879194331e-05, + "loss": 0.03127646446228027, + "num_input_tokens_seen": 11364944, + "step": 694, + "train_runtime": 5719.1158, + "train_tokens_per_second": 1987.185 + }, + { + "epoch": 0.1925207756232687, + "grad_norm": 0.2596582770347595, + "learning_rate": 9.997858034766294e-05, + "loss": 0.0277941282838583, + "num_input_tokens_seen": 11381320, + "step": 695, + "train_runtime": 5727.3515, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.192797783933518, + "grad_norm": 0.25657564401626587, + "learning_rate": 9.997845151719447e-05, + "loss": 0.026682991534471512, + "num_input_tokens_seen": 11397696, + "step": 696, + "train_runtime": 5735.5886, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.1930747922437673, + "grad_norm": 0.19960550963878632, + "learning_rate": 9.997832230053893e-05, + "loss": 0.030307264998555183, + "num_input_tokens_seen": 11414072, + "step": 697, + "train_runtime": 5743.8223, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.19335180055401663, + "grad_norm": 0.21937310695648193, + "learning_rate": 9.99781926976973e-05, + "loss": 0.02310645952820778, + "num_input_tokens_seen": 11430448, + "step": 698, + "train_runtime": 5752.0524, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.19362880886426592, + "grad_norm": 0.24430984258651733, + "learning_rate": 9.997806270867058e-05, + "loss": 0.034632157534360886, + "num_input_tokens_seen": 11446824, + "step": 699, + "train_runtime": 5760.26, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.19390581717451524, + "grad_norm": 0.3172627389431, + "learning_rate": 9.997793233345978e-05, + "loss": 0.03744668513536453, + "num_input_tokens_seen": 11463200, + "step": 700, + "train_runtime": 5768.4784, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 0.19418282548476454, + "grad_norm": 0.40630587935447693, + "learning_rate": 9.997780157206589e-05, + "loss": 0.03569541499018669, + "num_input_tokens_seen": 11479576, + "step": 701, + "train_runtime": 5778.2296, + "train_tokens_per_second": 1986.694 + }, + { + "epoch": 0.19445983379501386, + "grad_norm": 0.24434888362884521, + "learning_rate": 9.997767042448995e-05, + "loss": 0.029119256883859634, + "num_input_tokens_seen": 11495952, + "step": 702, + "train_runtime": 5786.4344, + "train_tokens_per_second": 1986.707 + }, + { + "epoch": 0.19473684210526315, + "grad_norm": 0.313765287399292, + "learning_rate": 9.997753889073296e-05, + "loss": 0.03499050438404083, + "num_input_tokens_seen": 11512328, + "step": 703, + "train_runtime": 5794.6534, + "train_tokens_per_second": 1986.716 + }, + { + "epoch": 0.19501385041551247, + "grad_norm": 0.29488661885261536, + "learning_rate": 9.997740697079594e-05, + "loss": 0.03490179404616356, + "num_input_tokens_seen": 11528704, + "step": 704, + "train_runtime": 5802.8644, + "train_tokens_per_second": 1986.726 + }, + { + "epoch": 0.19529085872576177, + "grad_norm": 0.26601532101631165, + "learning_rate": 9.997727466467988e-05, + "loss": 0.03288222476840019, + "num_input_tokens_seen": 11545080, + "step": 705, + "train_runtime": 5811.0961, + "train_tokens_per_second": 1986.73 + }, + { + "epoch": 0.1955678670360111, + "grad_norm": 0.25483691692352295, + "learning_rate": 9.997714197238584e-05, + "loss": 0.030960697680711746, + "num_input_tokens_seen": 11561456, + "step": 706, + "train_runtime": 5819.3252, + "train_tokens_per_second": 1986.735 + }, + { + "epoch": 0.19584487534626038, + "grad_norm": 0.3222620189189911, + "learning_rate": 9.997700889391484e-05, + "loss": 0.03523701801896095, + "num_input_tokens_seen": 11577832, + "step": 707, + "train_runtime": 5827.5364, + "train_tokens_per_second": 1986.746 + }, + { + "epoch": 0.1961218836565097, + "grad_norm": 0.2090315967798233, + "learning_rate": 9.997687542926789e-05, + "loss": 0.029931969940662384, + "num_input_tokens_seen": 11594208, + "step": 708, + "train_runtime": 5835.7522, + "train_tokens_per_second": 1986.755 + }, + { + "epoch": 0.196398891966759, + "grad_norm": 0.23724788427352905, + "learning_rate": 9.997674157844604e-05, + "loss": 0.028983822092413902, + "num_input_tokens_seen": 11610584, + "step": 709, + "train_runtime": 5843.962, + "train_tokens_per_second": 1986.766 + }, + { + "epoch": 0.19667590027700832, + "grad_norm": 0.2183166891336441, + "learning_rate": 9.99766073414503e-05, + "loss": 0.025172561407089233, + "num_input_tokens_seen": 11626960, + "step": 710, + "train_runtime": 5852.1841, + "train_tokens_per_second": 1986.773 + }, + { + "epoch": 0.1969529085872576, + "grad_norm": 0.2922898828983307, + "learning_rate": 9.997647271828175e-05, + "loss": 0.024902023375034332, + "num_input_tokens_seen": 11643336, + "step": 711, + "train_runtime": 5860.407, + "train_tokens_per_second": 1986.779 + }, + { + "epoch": 0.19722991689750694, + "grad_norm": 0.2683303952217102, + "learning_rate": 9.997633770894137e-05, + "loss": 0.028868740424513817, + "num_input_tokens_seen": 11659712, + "step": 712, + "train_runtime": 5868.6426, + "train_tokens_per_second": 1986.782 + }, + { + "epoch": 0.19750692520775623, + "grad_norm": 0.23288185894489288, + "learning_rate": 9.997620231343026e-05, + "loss": 0.030545219779014587, + "num_input_tokens_seen": 11676088, + "step": 713, + "train_runtime": 5876.8632, + "train_tokens_per_second": 1986.789 + }, + { + "epoch": 0.19778393351800555, + "grad_norm": 0.25725454092025757, + "learning_rate": 9.997606653174942e-05, + "loss": 0.03084186650812626, + "num_input_tokens_seen": 11692464, + "step": 714, + "train_runtime": 5885.0798, + "train_tokens_per_second": 1986.798 + }, + { + "epoch": 0.19806094182825484, + "grad_norm": 0.2789117395877838, + "learning_rate": 9.997593036389994e-05, + "loss": 0.03527204692363739, + "num_input_tokens_seen": 11708840, + "step": 715, + "train_runtime": 5893.2901, + "train_tokens_per_second": 1986.809 + }, + { + "epoch": 0.19833795013850417, + "grad_norm": 0.2713892459869385, + "learning_rate": 9.997579380988285e-05, + "loss": 0.03590555861592293, + "num_input_tokens_seen": 11725216, + "step": 716, + "train_runtime": 5901.5083, + "train_tokens_per_second": 1986.817 + }, + { + "epoch": 0.19861495844875346, + "grad_norm": 0.20148791372776031, + "learning_rate": 9.997565686969921e-05, + "loss": 0.026323577389121056, + "num_input_tokens_seen": 11741592, + "step": 717, + "train_runtime": 5909.7256, + "train_tokens_per_second": 1986.825 + }, + { + "epoch": 0.19889196675900278, + "grad_norm": 0.255799263715744, + "learning_rate": 9.997551954335008e-05, + "loss": 0.02971688099205494, + "num_input_tokens_seen": 11757968, + "step": 718, + "train_runtime": 5917.958, + "train_tokens_per_second": 1986.829 + }, + { + "epoch": 0.19916897506925207, + "grad_norm": 0.19565340876579285, + "learning_rate": 9.997538183083651e-05, + "loss": 0.028799882158637047, + "num_input_tokens_seen": 11774344, + "step": 719, + "train_runtime": 5926.1938, + "train_tokens_per_second": 1986.831 + }, + { + "epoch": 0.1994459833795014, + "grad_norm": 0.3178609013557434, + "learning_rate": 9.997524373215958e-05, + "loss": 0.032857149839401245, + "num_input_tokens_seen": 11790720, + "step": 720, + "train_runtime": 5934.411, + "train_tokens_per_second": 1986.839 + }, + { + "epoch": 0.1997229916897507, + "grad_norm": 0.2549521028995514, + "learning_rate": 9.997510524732034e-05, + "loss": 0.02550780028104782, + "num_input_tokens_seen": 11807096, + "step": 721, + "train_runtime": 5942.6534, + "train_tokens_per_second": 1986.839 + }, + { + "epoch": 0.2, + "grad_norm": 0.22218342125415802, + "learning_rate": 9.997496637631988e-05, + "loss": 0.028328899294137955, + "num_input_tokens_seen": 11823472, + "step": 722, + "train_runtime": 5950.8709, + "train_tokens_per_second": 1986.847 + }, + { + "epoch": 0.2002770083102493, + "grad_norm": 0.2572973668575287, + "learning_rate": 9.997482711915927e-05, + "loss": 0.030360383912920952, + "num_input_tokens_seen": 11839848, + "step": 723, + "train_runtime": 5959.0845, + "train_tokens_per_second": 1986.857 + }, + { + "epoch": 0.20055401662049863, + "grad_norm": 0.2690895199775696, + "learning_rate": 9.997468747583956e-05, + "loss": 0.03155839443206787, + "num_input_tokens_seen": 11856224, + "step": 724, + "train_runtime": 5967.3002, + "train_tokens_per_second": 1986.866 + }, + { + "epoch": 0.20083102493074792, + "grad_norm": 0.285602867603302, + "learning_rate": 9.997454744636186e-05, + "loss": 0.02989688515663147, + "num_input_tokens_seen": 11872600, + "step": 725, + "train_runtime": 5975.5137, + "train_tokens_per_second": 1986.875 + }, + { + "epoch": 0.20110803324099724, + "grad_norm": 0.24673093855381012, + "learning_rate": 9.997440703072723e-05, + "loss": 0.03036007098853588, + "num_input_tokens_seen": 11888976, + "step": 726, + "train_runtime": 5983.7325, + "train_tokens_per_second": 1986.883 + }, + { + "epoch": 0.20138504155124654, + "grad_norm": 0.24253325164318085, + "learning_rate": 9.997426622893678e-05, + "loss": 0.030693797394633293, + "num_input_tokens_seen": 11905352, + "step": 727, + "train_runtime": 5991.9655, + "train_tokens_per_second": 1986.886 + }, + { + "epoch": 0.20166204986149586, + "grad_norm": 0.22904986143112183, + "learning_rate": 9.997412504099156e-05, + "loss": 0.03274228423833847, + "num_input_tokens_seen": 11921728, + "step": 728, + "train_runtime": 6000.201, + "train_tokens_per_second": 1986.888 + }, + { + "epoch": 0.20193905817174515, + "grad_norm": 0.23527881503105164, + "learning_rate": 9.997398346689271e-05, + "loss": 0.02814854308962822, + "num_input_tokens_seen": 11938104, + "step": 729, + "train_runtime": 6008.4266, + "train_tokens_per_second": 1986.894 + }, + { + "epoch": 0.20221606648199447, + "grad_norm": 0.20000816881656647, + "learning_rate": 9.997384150664128e-05, + "loss": 0.028536774218082428, + "num_input_tokens_seen": 11954480, + "step": 730, + "train_runtime": 6016.6602, + "train_tokens_per_second": 1986.896 + }, + { + "epoch": 0.20249307479224377, + "grad_norm": 0.25875240564346313, + "learning_rate": 9.997369916023837e-05, + "loss": 0.02940649352967739, + "num_input_tokens_seen": 11970856, + "step": 731, + "train_runtime": 6024.8867, + "train_tokens_per_second": 1986.901 + }, + { + "epoch": 0.2027700831024931, + "grad_norm": 0.29325661063194275, + "learning_rate": 9.997355642768514e-05, + "loss": 0.03619043529033661, + "num_input_tokens_seen": 11987232, + "step": 732, + "train_runtime": 6033.1046, + "train_tokens_per_second": 1986.909 + }, + { + "epoch": 0.20304709141274238, + "grad_norm": 0.17969556152820587, + "learning_rate": 9.99734133089826e-05, + "loss": 0.024989282712340355, + "num_input_tokens_seen": 12003608, + "step": 733, + "train_runtime": 6041.3175, + "train_tokens_per_second": 1986.919 + }, + { + "epoch": 0.2033240997229917, + "grad_norm": 0.23492880165576935, + "learning_rate": 9.997326980413193e-05, + "loss": 0.03316040337085724, + "num_input_tokens_seen": 12019984, + "step": 734, + "train_runtime": 6049.5314, + "train_tokens_per_second": 1986.928 + }, + { + "epoch": 0.203601108033241, + "grad_norm": 0.25538796186447144, + "learning_rate": 9.997312591313421e-05, + "loss": 0.0279450211673975, + "num_input_tokens_seen": 12036360, + "step": 735, + "train_runtime": 6057.7438, + "train_tokens_per_second": 1986.938 + }, + { + "epoch": 0.20387811634349032, + "grad_norm": 0.2619663178920746, + "learning_rate": 9.997298163599056e-05, + "loss": 0.03129222244024277, + "num_input_tokens_seen": 12052736, + "step": 736, + "train_runtime": 6065.9755, + "train_tokens_per_second": 1986.941 + }, + { + "epoch": 0.2041551246537396, + "grad_norm": 0.2143256813287735, + "learning_rate": 9.997283697270208e-05, + "loss": 0.02575850673019886, + "num_input_tokens_seen": 12069112, + "step": 737, + "train_runtime": 6074.2119, + "train_tokens_per_second": 1986.943 + }, + { + "epoch": 0.20443213296398893, + "grad_norm": 0.28044095635414124, + "learning_rate": 9.997269192326988e-05, + "loss": 0.030958371236920357, + "num_input_tokens_seen": 12085488, + "step": 738, + "train_runtime": 6082.4381, + "train_tokens_per_second": 1986.948 + }, + { + "epoch": 0.20470914127423823, + "grad_norm": 0.23940759897232056, + "learning_rate": 9.997254648769511e-05, + "loss": 0.034114282578229904, + "num_input_tokens_seen": 12101864, + "step": 739, + "train_runtime": 6090.6703, + "train_tokens_per_second": 1986.951 + }, + { + "epoch": 0.20498614958448755, + "grad_norm": 0.2909270226955414, + "learning_rate": 9.997240066597891e-05, + "loss": 0.03543189540505409, + "num_input_tokens_seen": 12118240, + "step": 740, + "train_runtime": 6098.9053, + "train_tokens_per_second": 1986.953 + }, + { + "epoch": 0.20526315789473684, + "grad_norm": 0.255402147769928, + "learning_rate": 9.997225445812235e-05, + "loss": 0.025214064866304398, + "num_input_tokens_seen": 12134616, + "step": 741, + "train_runtime": 6107.1343, + "train_tokens_per_second": 1986.957 + }, + { + "epoch": 0.20554016620498616, + "grad_norm": 0.2714443802833557, + "learning_rate": 9.997210786412659e-05, + "loss": 0.029647108167409897, + "num_input_tokens_seen": 12150992, + "step": 742, + "train_runtime": 6115.3654, + "train_tokens_per_second": 1986.961 + }, + { + "epoch": 0.20581717451523546, + "grad_norm": 0.21110492944717407, + "learning_rate": 9.997196088399276e-05, + "loss": 0.027580711990594864, + "num_input_tokens_seen": 12167368, + "step": 743, + "train_runtime": 6123.5943, + "train_tokens_per_second": 1986.965 + }, + { + "epoch": 0.20609418282548475, + "grad_norm": 0.18819519877433777, + "learning_rate": 9.9971813517722e-05, + "loss": 0.028858501464128494, + "num_input_tokens_seen": 12183744, + "step": 744, + "train_runtime": 6131.8266, + "train_tokens_per_second": 1986.968 + }, + { + "epoch": 0.20637119113573407, + "grad_norm": 0.2007208615541458, + "learning_rate": 9.997166576531543e-05, + "loss": 0.030805619433522224, + "num_input_tokens_seen": 12200120, + "step": 745, + "train_runtime": 6140.0614, + "train_tokens_per_second": 1986.97 + }, + { + "epoch": 0.20664819944598337, + "grad_norm": 0.2038053274154663, + "learning_rate": 9.997151762677423e-05, + "loss": 0.03290221840143204, + "num_input_tokens_seen": 12216496, + "step": 746, + "train_runtime": 6148.2921, + "train_tokens_per_second": 1986.974 + }, + { + "epoch": 0.2069252077562327, + "grad_norm": 0.21743915975093842, + "learning_rate": 9.99713691020995e-05, + "loss": 0.03025367297232151, + "num_input_tokens_seen": 12232872, + "step": 747, + "train_runtime": 6156.5226, + "train_tokens_per_second": 1986.978 + }, + { + "epoch": 0.20720221606648198, + "grad_norm": 0.24300071597099304, + "learning_rate": 9.997122019129244e-05, + "loss": 0.02959544025361538, + "num_input_tokens_seen": 12249248, + "step": 748, + "train_runtime": 6164.7529, + "train_tokens_per_second": 1986.981 + }, + { + "epoch": 0.2074792243767313, + "grad_norm": 0.3064585030078888, + "learning_rate": 9.997107089435414e-05, + "loss": 0.03541306406259537, + "num_input_tokens_seen": 12265624, + "step": 749, + "train_runtime": 6172.9889, + "train_tokens_per_second": 1986.983 + }, + { + "epoch": 0.2077562326869806, + "grad_norm": 0.17091862857341766, + "learning_rate": 9.99709212112858e-05, + "loss": 0.023109396919608116, + "num_input_tokens_seen": 12282000, + "step": 750, + "train_runtime": 6181.2189, + "train_tokens_per_second": 1986.987 + }, + { + "epoch": 0.20803324099722992, + "grad_norm": 0.27897515892982483, + "learning_rate": 9.997077114208855e-05, + "loss": 0.037392400205135345, + "num_input_tokens_seen": 12298376, + "step": 751, + "train_runtime": 6189.4538, + "train_tokens_per_second": 1986.989 + }, + { + "epoch": 0.2083102493074792, + "grad_norm": 0.18711696565151215, + "learning_rate": 9.997062068676358e-05, + "loss": 0.027379997074604034, + "num_input_tokens_seen": 12314752, + "step": 752, + "train_runtime": 6197.6818, + "train_tokens_per_second": 1986.993 + }, + { + "epoch": 0.20858725761772853, + "grad_norm": 0.25203442573547363, + "learning_rate": 9.9970469845312e-05, + "loss": 0.027783628553152084, + "num_input_tokens_seen": 12331128, + "step": 753, + "train_runtime": 6205.916, + "train_tokens_per_second": 1986.996 + }, + { + "epoch": 0.20886426592797783, + "grad_norm": 0.18950623273849487, + "learning_rate": 9.997031861773503e-05, + "loss": 0.0270581915974617, + "num_input_tokens_seen": 12347504, + "step": 754, + "train_runtime": 6214.1432, + "train_tokens_per_second": 1987.0 + }, + { + "epoch": 0.20914127423822715, + "grad_norm": 0.169386088848114, + "learning_rate": 9.99701670040338e-05, + "loss": 0.024471139535307884, + "num_input_tokens_seen": 12363880, + "step": 755, + "train_runtime": 6222.3721, + "train_tokens_per_second": 1987.004 + }, + { + "epoch": 0.20941828254847644, + "grad_norm": 0.22165536880493164, + "learning_rate": 9.997001500420951e-05, + "loss": 0.02845328114926815, + "num_input_tokens_seen": 12380256, + "step": 756, + "train_runtime": 6230.5852, + "train_tokens_per_second": 1987.013 + }, + { + "epoch": 0.20969529085872576, + "grad_norm": 0.22869297862052917, + "learning_rate": 9.996986261826334e-05, + "loss": 0.02844597026705742, + "num_input_tokens_seen": 12396632, + "step": 757, + "train_runtime": 6238.7961, + "train_tokens_per_second": 1987.023 + }, + { + "epoch": 0.20997229916897506, + "grad_norm": 0.22801600396633148, + "learning_rate": 9.996970984619641e-05, + "loss": 0.025838425382971764, + "num_input_tokens_seen": 12413008, + "step": 758, + "train_runtime": 6247.0068, + "train_tokens_per_second": 1987.033 + }, + { + "epoch": 0.21024930747922438, + "grad_norm": 0.2561948299407959, + "learning_rate": 9.996955668800996e-05, + "loss": 0.0377567894756794, + "num_input_tokens_seen": 12429384, + "step": 759, + "train_runtime": 6255.2114, + "train_tokens_per_second": 1987.045 + }, + { + "epoch": 0.21052631578947367, + "grad_norm": 0.2396022379398346, + "learning_rate": 9.996940314370515e-05, + "loss": 0.028361912816762924, + "num_input_tokens_seen": 12445760, + "step": 760, + "train_runtime": 6263.4248, + "train_tokens_per_second": 1987.053 + }, + { + "epoch": 0.210803324099723, + "grad_norm": 0.20452703535556793, + "learning_rate": 9.99692492132832e-05, + "loss": 0.031941935420036316, + "num_input_tokens_seen": 12462136, + "step": 761, + "train_runtime": 6271.6354, + "train_tokens_per_second": 1987.063 + }, + { + "epoch": 0.2110803324099723, + "grad_norm": 0.22984939813613892, + "learning_rate": 9.996909489674522e-05, + "loss": 0.025677014142274857, + "num_input_tokens_seen": 12478512, + "step": 762, + "train_runtime": 6279.8609, + "train_tokens_per_second": 1987.068 + }, + { + "epoch": 0.2113573407202216, + "grad_norm": 0.2807806432247162, + "learning_rate": 9.996894019409249e-05, + "loss": 0.027781562879681587, + "num_input_tokens_seen": 12494888, + "step": 763, + "train_runtime": 6288.075, + "train_tokens_per_second": 1987.077 + }, + { + "epoch": 0.2116343490304709, + "grad_norm": 0.22406812012195587, + "learning_rate": 9.996878510532614e-05, + "loss": 0.031938180327415466, + "num_input_tokens_seen": 12511264, + "step": 764, + "train_runtime": 6296.2846, + "train_tokens_per_second": 1987.087 + }, + { + "epoch": 0.21191135734072022, + "grad_norm": 0.1664067804813385, + "learning_rate": 9.996862963044742e-05, + "loss": 0.02229980006814003, + "num_input_tokens_seen": 12527640, + "step": 765, + "train_runtime": 6304.5035, + "train_tokens_per_second": 1987.094 + }, + { + "epoch": 0.21218836565096952, + "grad_norm": 0.24078255891799927, + "learning_rate": 9.996847376945749e-05, + "loss": 0.03144388645887375, + "num_input_tokens_seen": 12544016, + "step": 766, + "train_runtime": 6312.7285, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.21246537396121884, + "grad_norm": 0.2012488692998886, + "learning_rate": 9.996831752235758e-05, + "loss": 0.028169400990009308, + "num_input_tokens_seen": 12560392, + "step": 767, + "train_runtime": 6320.9514, + "train_tokens_per_second": 1987.105 + }, + { + "epoch": 0.21274238227146813, + "grad_norm": 0.18423044681549072, + "learning_rate": 9.996816088914889e-05, + "loss": 0.028965400531888008, + "num_input_tokens_seen": 12576768, + "step": 768, + "train_runtime": 6329.1743, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.21301939058171745, + "grad_norm": 0.2604973018169403, + "learning_rate": 9.996800386983263e-05, + "loss": 0.024816906079649925, + "num_input_tokens_seen": 12593144, + "step": 769, + "train_runtime": 6337.4093, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.21329639889196675, + "grad_norm": 0.3144100308418274, + "learning_rate": 9.996784646441e-05, + "loss": 0.03149198368191719, + "num_input_tokens_seen": 12609520, + "step": 770, + "train_runtime": 6345.6352, + "train_tokens_per_second": 1987.117 + }, + { + "epoch": 0.21357340720221607, + "grad_norm": 0.25528618693351746, + "learning_rate": 9.996768867288224e-05, + "loss": 0.03190237283706665, + "num_input_tokens_seen": 12625896, + "step": 771, + "train_runtime": 6353.8714, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.21385041551246536, + "grad_norm": 0.21157263219356537, + "learning_rate": 9.996753049525057e-05, + "loss": 0.02711913175880909, + "num_input_tokens_seen": 12642272, + "step": 772, + "train_runtime": 6362.1111, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.21412742382271469, + "grad_norm": 0.2411404252052307, + "learning_rate": 9.996737193151617e-05, + "loss": 0.02895837090909481, + "num_input_tokens_seen": 12658648, + "step": 773, + "train_runtime": 6370.3517, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.21440443213296398, + "grad_norm": 0.22123481333255768, + "learning_rate": 9.996721298168032e-05, + "loss": 0.027512820437550545, + "num_input_tokens_seen": 12675024, + "step": 774, + "train_runtime": 6378.5783, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.2146814404432133, + "grad_norm": 0.20179180800914764, + "learning_rate": 9.996705364574422e-05, + "loss": 0.026477767154574394, + "num_input_tokens_seen": 12691400, + "step": 775, + "train_runtime": 6386.8111, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.2149584487534626, + "grad_norm": 0.2939153015613556, + "learning_rate": 9.996689392370909e-05, + "loss": 0.032572224736213684, + "num_input_tokens_seen": 12707776, + "step": 776, + "train_runtime": 6395.0323, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.21523545706371192, + "grad_norm": 0.18594050407409668, + "learning_rate": 9.99667338155762e-05, + "loss": 0.027180003002285957, + "num_input_tokens_seen": 12724152, + "step": 777, + "train_runtime": 6403.258, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.2155124653739612, + "grad_norm": 0.19618961215019226, + "learning_rate": 9.996657332134675e-05, + "loss": 0.02806224673986435, + "num_input_tokens_seen": 12740528, + "step": 778, + "train_runtime": 6411.4938, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.21578947368421053, + "grad_norm": 0.25002339482307434, + "learning_rate": 9.996641244102201e-05, + "loss": 0.03277413547039032, + "num_input_tokens_seen": 12756904, + "step": 779, + "train_runtime": 6419.7118, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.21606648199445982, + "grad_norm": 0.22928717732429504, + "learning_rate": 9.996625117460318e-05, + "loss": 0.03008495271205902, + "num_input_tokens_seen": 12773280, + "step": 780, + "train_runtime": 6427.9225, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.21634349030470915, + "grad_norm": 0.20460836589336395, + "learning_rate": 9.996608952209156e-05, + "loss": 0.027908194810152054, + "num_input_tokens_seen": 12789656, + "step": 781, + "train_runtime": 6436.1357, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.21662049861495844, + "grad_norm": 0.254922091960907, + "learning_rate": 9.996592748348839e-05, + "loss": 0.03161931037902832, + "num_input_tokens_seen": 12806032, + "step": 782, + "train_runtime": 6444.3414, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.21689750692520776, + "grad_norm": 0.22200974822044373, + "learning_rate": 9.996576505879488e-05, + "loss": 0.028200387954711914, + "num_input_tokens_seen": 12822408, + "step": 783, + "train_runtime": 6452.5608, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 0.21717451523545706, + "grad_norm": 0.2612984776496887, + "learning_rate": 9.996560224801232e-05, + "loss": 0.027662605047225952, + "num_input_tokens_seen": 12838784, + "step": 784, + "train_runtime": 6460.7821, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.21745152354570638, + "grad_norm": 0.20670372247695923, + "learning_rate": 9.996543905114197e-05, + "loss": 0.029083333909511566, + "num_input_tokens_seen": 12855160, + "step": 785, + "train_runtime": 6469.0171, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.21772853185595567, + "grad_norm": 0.24035605788230896, + "learning_rate": 9.996527546818506e-05, + "loss": 0.030892513692378998, + "num_input_tokens_seen": 12871536, + "step": 786, + "train_runtime": 6477.2541, + "train_tokens_per_second": 1987.19 + }, + { + "epoch": 0.218005540166205, + "grad_norm": 0.24883092939853668, + "learning_rate": 9.99651114991429e-05, + "loss": 0.02981548197567463, + "num_input_tokens_seen": 12887912, + "step": 787, + "train_runtime": 6485.475, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.21828254847645429, + "grad_norm": 0.22850462794303894, + "learning_rate": 9.996494714401672e-05, + "loss": 0.029015276581048965, + "num_input_tokens_seen": 12904288, + "step": 788, + "train_runtime": 6493.7028, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.2185595567867036, + "grad_norm": 0.21972614526748657, + "learning_rate": 9.99647824028078e-05, + "loss": 0.029748892411589622, + "num_input_tokens_seen": 12920664, + "step": 789, + "train_runtime": 6501.93, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 0.2188365650969529, + "grad_norm": 0.17959339916706085, + "learning_rate": 9.99646172755174e-05, + "loss": 0.027060288935899734, + "num_input_tokens_seen": 12937040, + "step": 790, + "train_runtime": 6510.1426, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.21911357340720222, + "grad_norm": 0.27114802598953247, + "learning_rate": 9.996445176214684e-05, + "loss": 0.027981938794255257, + "num_input_tokens_seen": 12953416, + "step": 791, + "train_runtime": 6518.3565, + "train_tokens_per_second": 1987.221 + }, + { + "epoch": 0.21939058171745152, + "grad_norm": 0.1846366971731186, + "learning_rate": 9.996428586269734e-05, + "loss": 0.027972402051091194, + "num_input_tokens_seen": 12969792, + "step": 792, + "train_runtime": 6526.582, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.21966759002770084, + "grad_norm": 0.24343253672122955, + "learning_rate": 9.996411957717025e-05, + "loss": 0.029957178980112076, + "num_input_tokens_seen": 12986168, + "step": 793, + "train_runtime": 6534.813, + "train_tokens_per_second": 1987.229 + }, + { + "epoch": 0.21994459833795013, + "grad_norm": 0.2199891060590744, + "learning_rate": 9.99639529055668e-05, + "loss": 0.030277596786618233, + "num_input_tokens_seen": 13002544, + "step": 794, + "train_runtime": 6543.0247, + "train_tokens_per_second": 1987.237 + }, + { + "epoch": 0.22022160664819945, + "grad_norm": 0.3695112466812134, + "learning_rate": 9.996378584788829e-05, + "loss": 0.0319889597594738, + "num_input_tokens_seen": 13018920, + "step": 795, + "train_runtime": 6551.2318, + "train_tokens_per_second": 1987.248 + }, + { + "epoch": 0.22049861495844875, + "grad_norm": 0.19220663607120514, + "learning_rate": 9.996361840413601e-05, + "loss": 0.02735767886042595, + "num_input_tokens_seen": 13035296, + "step": 796, + "train_runtime": 6559.4402, + "train_tokens_per_second": 1987.257 + }, + { + "epoch": 0.22077562326869807, + "grad_norm": 0.1756911277770996, + "learning_rate": 9.996345057431125e-05, + "loss": 0.0257730670273304, + "num_input_tokens_seen": 13051672, + "step": 797, + "train_runtime": 6567.6536, + "train_tokens_per_second": 1987.266 + }, + { + "epoch": 0.22105263157894736, + "grad_norm": 0.21748580038547516, + "learning_rate": 9.996328235841534e-05, + "loss": 0.02603931352496147, + "num_input_tokens_seen": 13068048, + "step": 798, + "train_runtime": 6575.8734, + "train_tokens_per_second": 1987.272 + }, + { + "epoch": 0.22132963988919668, + "grad_norm": 0.2367708832025528, + "learning_rate": 9.996311375644955e-05, + "loss": 0.02517983503639698, + "num_input_tokens_seen": 13084424, + "step": 799, + "train_runtime": 6584.0956, + "train_tokens_per_second": 1987.277 + }, + { + "epoch": 0.22160664819944598, + "grad_norm": 0.1683516800403595, + "learning_rate": 9.99629447684152e-05, + "loss": 0.02336364984512329, + "num_input_tokens_seen": 13100800, + "step": 800, + "train_runtime": 6592.3348, + "train_tokens_per_second": 1987.278 + }, + { + "epoch": 0.2218836565096953, + "grad_norm": 0.16921475529670715, + "learning_rate": 9.996277539431355e-05, + "loss": 0.02439906820654869, + "num_input_tokens_seen": 13117176, + "step": 801, + "train_runtime": 6602.0805, + "train_tokens_per_second": 1986.825 + }, + { + "epoch": 0.2221606648199446, + "grad_norm": 0.1525999754667282, + "learning_rate": 9.996260563414597e-05, + "loss": 0.02577461302280426, + "num_input_tokens_seen": 13133552, + "step": 802, + "train_runtime": 6610.2956, + "train_tokens_per_second": 1986.833 + }, + { + "epoch": 0.2224376731301939, + "grad_norm": 0.2389349490404129, + "learning_rate": 9.996243548791373e-05, + "loss": 0.02913159504532814, + "num_input_tokens_seen": 13149928, + "step": 803, + "train_runtime": 6618.536, + "train_tokens_per_second": 1986.833 + }, + { + "epoch": 0.2227146814404432, + "grad_norm": 0.24167987704277039, + "learning_rate": 9.996226495561819e-05, + "loss": 0.022744446992874146, + "num_input_tokens_seen": 13166304, + "step": 804, + "train_runtime": 6626.7544, + "train_tokens_per_second": 1986.84 + }, + { + "epoch": 0.22299168975069253, + "grad_norm": 0.2871224284172058, + "learning_rate": 9.99620940372606e-05, + "loss": 0.03239988163113594, + "num_input_tokens_seen": 13182680, + "step": 805, + "train_runtime": 6634.9662, + "train_tokens_per_second": 1986.85 + }, + { + "epoch": 0.22326869806094182, + "grad_norm": 0.1777031421661377, + "learning_rate": 9.996192273284234e-05, + "loss": 0.025610554963350296, + "num_input_tokens_seen": 13199056, + "step": 806, + "train_runtime": 6643.1831, + "train_tokens_per_second": 1986.857 + }, + { + "epoch": 0.22354570637119114, + "grad_norm": 0.28306353092193604, + "learning_rate": 9.996175104236471e-05, + "loss": 0.02937653288245201, + "num_input_tokens_seen": 13215432, + "step": 807, + "train_runtime": 6651.4114, + "train_tokens_per_second": 1986.861 + }, + { + "epoch": 0.22382271468144044, + "grad_norm": 0.17344705760478973, + "learning_rate": 9.996157896582903e-05, + "loss": 0.02646913006901741, + "num_input_tokens_seen": 13231808, + "step": 808, + "train_runtime": 6659.6344, + "train_tokens_per_second": 1986.867 + }, + { + "epoch": 0.22409972299168976, + "grad_norm": 0.22042421996593475, + "learning_rate": 9.996140650323664e-05, + "loss": 0.027148637920618057, + "num_input_tokens_seen": 13248184, + "step": 809, + "train_runtime": 6667.8614, + "train_tokens_per_second": 1986.872 + }, + { + "epoch": 0.22437673130193905, + "grad_norm": 0.2257331907749176, + "learning_rate": 9.996123365458887e-05, + "loss": 0.026675738394260406, + "num_input_tokens_seen": 13264560, + "step": 810, + "train_runtime": 6676.0817, + "train_tokens_per_second": 1986.878 + }, + { + "epoch": 0.22465373961218837, + "grad_norm": 0.17708048224449158, + "learning_rate": 9.996106041988707e-05, + "loss": 0.02557792328298092, + "num_input_tokens_seen": 13280936, + "step": 811, + "train_runtime": 6684.3023, + "train_tokens_per_second": 1986.884 + }, + { + "epoch": 0.22493074792243767, + "grad_norm": 0.18888285756111145, + "learning_rate": 9.996088679913255e-05, + "loss": 0.02969602309167385, + "num_input_tokens_seen": 13297312, + "step": 812, + "train_runtime": 6692.522, + "train_tokens_per_second": 1986.891 + }, + { + "epoch": 0.225207756232687, + "grad_norm": 0.19420932233333588, + "learning_rate": 9.996071279232667e-05, + "loss": 0.02901577018201351, + "num_input_tokens_seen": 13313688, + "step": 813, + "train_runtime": 6700.7439, + "train_tokens_per_second": 1986.897 + }, + { + "epoch": 0.22548476454293628, + "grad_norm": 0.20376375317573547, + "learning_rate": 9.996053839947077e-05, + "loss": 0.026378881186246872, + "num_input_tokens_seen": 13330064, + "step": 814, + "train_runtime": 6708.9667, + "train_tokens_per_second": 1986.903 + }, + { + "epoch": 0.2257617728531856, + "grad_norm": 0.17690525949001312, + "learning_rate": 9.99603636205662e-05, + "loss": 0.027372539043426514, + "num_input_tokens_seen": 13346440, + "step": 815, + "train_runtime": 6717.1959, + "train_tokens_per_second": 1986.906 + }, + { + "epoch": 0.2260387811634349, + "grad_norm": 0.27132898569107056, + "learning_rate": 9.996018845561433e-05, + "loss": 0.026088248938322067, + "num_input_tokens_seen": 13362816, + "step": 816, + "train_runtime": 6725.4112, + "train_tokens_per_second": 1986.914 + }, + { + "epoch": 0.22631578947368422, + "grad_norm": 0.20121406018733978, + "learning_rate": 9.996001290461646e-05, + "loss": 0.0273626446723938, + "num_input_tokens_seen": 13379192, + "step": 817, + "train_runtime": 6733.6267, + "train_tokens_per_second": 1986.922 + }, + { + "epoch": 0.22659279778393351, + "grad_norm": 0.2216673344373703, + "learning_rate": 9.9959836967574e-05, + "loss": 0.030007706955075264, + "num_input_tokens_seen": 13395568, + "step": 818, + "train_runtime": 6741.8528, + "train_tokens_per_second": 1986.927 + }, + { + "epoch": 0.22686980609418284, + "grad_norm": 0.2426028847694397, + "learning_rate": 9.995966064448828e-05, + "loss": 0.033333394676446915, + "num_input_tokens_seen": 13411944, + "step": 819, + "train_runtime": 6750.0667, + "train_tokens_per_second": 1986.935 + }, + { + "epoch": 0.22714681440443213, + "grad_norm": 0.18631033599376678, + "learning_rate": 9.995948393536068e-05, + "loss": 0.02241778001189232, + "num_input_tokens_seen": 13428320, + "step": 820, + "train_runtime": 6758.2819, + "train_tokens_per_second": 1986.943 + }, + { + "epoch": 0.22742382271468145, + "grad_norm": 0.20317555963993073, + "learning_rate": 9.995930684019255e-05, + "loss": 0.026953764259815216, + "num_input_tokens_seen": 13444696, + "step": 821, + "train_runtime": 6766.5108, + "train_tokens_per_second": 1986.947 + }, + { + "epoch": 0.22770083102493074, + "grad_norm": 0.2055114507675171, + "learning_rate": 9.995912935898526e-05, + "loss": 0.026826169341802597, + "num_input_tokens_seen": 13461072, + "step": 822, + "train_runtime": 6774.7418, + "train_tokens_per_second": 1986.95 + }, + { + "epoch": 0.22797783933518007, + "grad_norm": 0.29393577575683594, + "learning_rate": 9.99589514917402e-05, + "loss": 0.02553924359381199, + "num_input_tokens_seen": 13477448, + "step": 823, + "train_runtime": 6782.9567, + "train_tokens_per_second": 1986.958 + }, + { + "epoch": 0.22825484764542936, + "grad_norm": 0.21038317680358887, + "learning_rate": 9.995877323845872e-05, + "loss": 0.028386306017637253, + "num_input_tokens_seen": 13493824, + "step": 824, + "train_runtime": 6791.1751, + "train_tokens_per_second": 1986.965 + }, + { + "epoch": 0.22853185595567868, + "grad_norm": 0.24319569766521454, + "learning_rate": 9.99585945991422e-05, + "loss": 0.027634913101792336, + "num_input_tokens_seen": 13510200, + "step": 825, + "train_runtime": 6799.4062, + "train_tokens_per_second": 1986.968 + }, + { + "epoch": 0.22880886426592797, + "grad_norm": 0.8440421223640442, + "learning_rate": 9.995841557379205e-05, + "loss": 0.09466484934091568, + "num_input_tokens_seen": 13526576, + "step": 826, + "train_runtime": 6807.6287, + "train_tokens_per_second": 1986.973 + }, + { + "epoch": 0.2290858725761773, + "grad_norm": 0.24091510474681854, + "learning_rate": 9.995823616240961e-05, + "loss": 0.02746668830513954, + "num_input_tokens_seen": 13542952, + "step": 827, + "train_runtime": 6815.8522, + "train_tokens_per_second": 1986.979 + }, + { + "epoch": 0.2293628808864266, + "grad_norm": 0.3002420663833618, + "learning_rate": 9.995805636499631e-05, + "loss": 0.030927743762731552, + "num_input_tokens_seen": 13559328, + "step": 828, + "train_runtime": 6824.0704, + "train_tokens_per_second": 1986.985 + }, + { + "epoch": 0.2296398891966759, + "grad_norm": 0.182831808924675, + "learning_rate": 9.99578761815535e-05, + "loss": 0.024816937744617462, + "num_input_tokens_seen": 13575704, + "step": 829, + "train_runtime": 6832.281, + "train_tokens_per_second": 1986.994 + }, + { + "epoch": 0.2299168975069252, + "grad_norm": 0.18354561924934387, + "learning_rate": 9.995769561208259e-05, + "loss": 0.024105645716190338, + "num_input_tokens_seen": 13592080, + "step": 830, + "train_runtime": 6840.4866, + "train_tokens_per_second": 1987.005 + }, + { + "epoch": 0.23019390581717453, + "grad_norm": 0.22534404695034027, + "learning_rate": 9.995751465658499e-05, + "loss": 0.027859488502144814, + "num_input_tokens_seen": 13608456, + "step": 831, + "train_runtime": 6848.6952, + "train_tokens_per_second": 1987.014 + }, + { + "epoch": 0.23047091412742382, + "grad_norm": 0.2856912910938263, + "learning_rate": 9.995733331506208e-05, + "loss": 0.029877029359340668, + "num_input_tokens_seen": 13624832, + "step": 832, + "train_runtime": 6856.9151, + "train_tokens_per_second": 1987.021 + }, + { + "epoch": 0.23074792243767314, + "grad_norm": 0.23649325966835022, + "learning_rate": 9.995715158751526e-05, + "loss": 0.0318717285990715, + "num_input_tokens_seen": 13641208, + "step": 833, + "train_runtime": 6865.133, + "train_tokens_per_second": 1987.027 + }, + { + "epoch": 0.23102493074792244, + "grad_norm": 0.16492272913455963, + "learning_rate": 9.995696947394592e-05, + "loss": 0.0297623872756958, + "num_input_tokens_seen": 13657584, + "step": 834, + "train_runtime": 6873.3628, + "train_tokens_per_second": 1987.031 + }, + { + "epoch": 0.23130193905817176, + "grad_norm": 0.20011943578720093, + "learning_rate": 9.995678697435552e-05, + "loss": 0.0265134796500206, + "num_input_tokens_seen": 13673960, + "step": 835, + "train_runtime": 6881.5905, + "train_tokens_per_second": 1987.035 + }, + { + "epoch": 0.23157894736842105, + "grad_norm": 0.18021470308303833, + "learning_rate": 9.995660408874542e-05, + "loss": 0.023640451952815056, + "num_input_tokens_seen": 13690336, + "step": 836, + "train_runtime": 6889.8182, + "train_tokens_per_second": 1987.039 + }, + { + "epoch": 0.23185595567867037, + "grad_norm": 0.218649223446846, + "learning_rate": 9.995642081711705e-05, + "loss": 0.029294872656464577, + "num_input_tokens_seen": 13706712, + "step": 837, + "train_runtime": 6898.0531, + "train_tokens_per_second": 1987.041 + }, + { + "epoch": 0.23213296398891967, + "grad_norm": 0.2263524979352951, + "learning_rate": 9.995623715947182e-05, + "loss": 0.02638411521911621, + "num_input_tokens_seen": 13723088, + "step": 838, + "train_runtime": 6906.2826, + "train_tokens_per_second": 1987.044 + }, + { + "epoch": 0.232409972299169, + "grad_norm": 0.39041250944137573, + "learning_rate": 9.995605311581118e-05, + "loss": 0.026458632200956345, + "num_input_tokens_seen": 13739464, + "step": 839, + "train_runtime": 6914.5169, + "train_tokens_per_second": 1987.046 + }, + { + "epoch": 0.23268698060941828, + "grad_norm": 0.20770485699176788, + "learning_rate": 9.99558686861365e-05, + "loss": 0.028389226645231247, + "num_input_tokens_seen": 13755840, + "step": 840, + "train_runtime": 6922.7596, + "train_tokens_per_second": 1987.046 + }, + { + "epoch": 0.2329639889196676, + "grad_norm": 0.2106969654560089, + "learning_rate": 9.995568387044924e-05, + "loss": 0.02543022111058235, + "num_input_tokens_seen": 13772216, + "step": 841, + "train_runtime": 6930.9818, + "train_tokens_per_second": 1987.051 + }, + { + "epoch": 0.2332409972299169, + "grad_norm": 0.17870661616325378, + "learning_rate": 9.995549866875082e-05, + "loss": 0.02468886598944664, + "num_input_tokens_seen": 13788592, + "step": 842, + "train_runtime": 6939.2175, + "train_tokens_per_second": 1987.053 + }, + { + "epoch": 0.23351800554016622, + "grad_norm": 0.20297521352767944, + "learning_rate": 9.995531308104267e-05, + "loss": 0.028329813852906227, + "num_input_tokens_seen": 13804968, + "step": 843, + "train_runtime": 6947.4521, + "train_tokens_per_second": 1987.055 + }, + { + "epoch": 0.2337950138504155, + "grad_norm": 0.21181751787662506, + "learning_rate": 9.995512710732623e-05, + "loss": 0.02772308513522148, + "num_input_tokens_seen": 13821344, + "step": 844, + "train_runtime": 6955.6787, + "train_tokens_per_second": 1987.059 + }, + { + "epoch": 0.23407202216066483, + "grad_norm": 0.23665407299995422, + "learning_rate": 9.995494074760292e-05, + "loss": 0.02292132005095482, + "num_input_tokens_seen": 13837720, + "step": 845, + "train_runtime": 6963.9077, + "train_tokens_per_second": 1987.063 + }, + { + "epoch": 0.23434903047091413, + "grad_norm": 0.2060859054327011, + "learning_rate": 9.99547540018742e-05, + "loss": 0.026605626568198204, + "num_input_tokens_seen": 13854096, + "step": 846, + "train_runtime": 6972.1421, + "train_tokens_per_second": 1987.064 + }, + { + "epoch": 0.23462603878116345, + "grad_norm": 0.244363933801651, + "learning_rate": 9.995456687014151e-05, + "loss": 0.030689049512147903, + "num_input_tokens_seen": 13870472, + "step": 847, + "train_runtime": 6980.3784, + "train_tokens_per_second": 1987.066 + }, + { + "epoch": 0.23490304709141274, + "grad_norm": 0.205518439412117, + "learning_rate": 9.995437935240628e-05, + "loss": 0.0209107156842947, + "num_input_tokens_seen": 13886848, + "step": 848, + "train_runtime": 6988.6056, + "train_tokens_per_second": 1987.07 + }, + { + "epoch": 0.23518005540166206, + "grad_norm": 0.18793389201164246, + "learning_rate": 9.995419144866999e-05, + "loss": 0.025495797395706177, + "num_input_tokens_seen": 13903224, + "step": 849, + "train_runtime": 6996.8421, + "train_tokens_per_second": 1987.071 + }, + { + "epoch": 0.23545706371191136, + "grad_norm": 0.2450936734676361, + "learning_rate": 9.995400315893406e-05, + "loss": 0.02596084028482437, + "num_input_tokens_seen": 13919600, + "step": 850, + "train_runtime": 7005.0605, + "train_tokens_per_second": 1987.078 + }, + { + "epoch": 0.23573407202216065, + "grad_norm": 0.21317529678344727, + "learning_rate": 9.995381448319994e-05, + "loss": 0.02191855199635029, + "num_input_tokens_seen": 13935976, + "step": 851, + "train_runtime": 7013.2768, + "train_tokens_per_second": 1987.085 + }, + { + "epoch": 0.23601108033240997, + "grad_norm": 0.2814950942993164, + "learning_rate": 9.995362542146913e-05, + "loss": 0.027720946818590164, + "num_input_tokens_seen": 13952352, + "step": 852, + "train_runtime": 7021.5127, + "train_tokens_per_second": 1987.086 + }, + { + "epoch": 0.23628808864265927, + "grad_norm": 0.18426012992858887, + "learning_rate": 9.995343597374307e-05, + "loss": 0.023583995178341866, + "num_input_tokens_seen": 13968728, + "step": 853, + "train_runtime": 7029.7396, + "train_tokens_per_second": 1987.09 + }, + { + "epoch": 0.2365650969529086, + "grad_norm": 0.218709796667099, + "learning_rate": 9.99532461400232e-05, + "loss": 0.02555435709655285, + "num_input_tokens_seen": 13985104, + "step": 854, + "train_runtime": 7037.9535, + "train_tokens_per_second": 1987.098 + }, + { + "epoch": 0.23684210526315788, + "grad_norm": 0.3690670430660248, + "learning_rate": 9.995305592031101e-05, + "loss": 0.03403222933411598, + "num_input_tokens_seen": 14001480, + "step": 855, + "train_runtime": 7046.1852, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.2371191135734072, + "grad_norm": 0.20612868666648865, + "learning_rate": 9.995286531460797e-05, + "loss": 0.032851915806531906, + "num_input_tokens_seen": 14017856, + "step": 856, + "train_runtime": 7054.4314, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.2373961218836565, + "grad_norm": 0.14543859660625458, + "learning_rate": 9.995267432291555e-05, + "loss": 0.024009602144360542, + "num_input_tokens_seen": 14034232, + "step": 857, + "train_runtime": 7062.6706, + "train_tokens_per_second": 1987.1 + }, + { + "epoch": 0.23767313019390582, + "grad_norm": 0.2014191746711731, + "learning_rate": 9.995248294523523e-05, + "loss": 0.023327015340328217, + "num_input_tokens_seen": 14050608, + "step": 858, + "train_runtime": 7070.8871, + "train_tokens_per_second": 1987.107 + }, + { + "epoch": 0.2379501385041551, + "grad_norm": 0.19695675373077393, + "learning_rate": 9.995229118156848e-05, + "loss": 0.027657978236675262, + "num_input_tokens_seen": 14066984, + "step": 859, + "train_runtime": 7079.0965, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.23822714681440443, + "grad_norm": 0.2250225841999054, + "learning_rate": 9.995209903191678e-05, + "loss": 0.02374822460114956, + "num_input_tokens_seen": 14083360, + "step": 860, + "train_runtime": 7087.3058, + "train_tokens_per_second": 1987.125 + }, + { + "epoch": 0.23850415512465373, + "grad_norm": 0.1665189266204834, + "learning_rate": 9.995190649628161e-05, + "loss": 0.022288890555500984, + "num_input_tokens_seen": 14099736, + "step": 861, + "train_runtime": 7095.522, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.23878116343490305, + "grad_norm": 0.2421930581331253, + "learning_rate": 9.995171357466449e-05, + "loss": 0.03121122159063816, + "num_input_tokens_seen": 14116112, + "step": 862, + "train_runtime": 7103.753, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.23905817174515234, + "grad_norm": 0.2083137333393097, + "learning_rate": 9.995152026706687e-05, + "loss": 0.02788706310093403, + "num_input_tokens_seen": 14132488, + "step": 863, + "train_runtime": 7111.9869, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 0.23933518005540166, + "grad_norm": 0.20525947213172913, + "learning_rate": 9.995132657349027e-05, + "loss": 0.028339985758066177, + "num_input_tokens_seen": 14148864, + "step": 864, + "train_runtime": 7120.2089, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.23961218836565096, + "grad_norm": 0.2492087185382843, + "learning_rate": 9.99511324939362e-05, + "loss": 0.0293111652135849, + "num_input_tokens_seen": 14165240, + "step": 865, + "train_runtime": 7128.4317, + "train_tokens_per_second": 1987.147 + }, + { + "epoch": 0.23988919667590028, + "grad_norm": 0.19669079780578613, + "learning_rate": 9.995093802840612e-05, + "loss": 0.02996930666267872, + "num_input_tokens_seen": 14181616, + "step": 866, + "train_runtime": 7136.6607, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.24016620498614957, + "grad_norm": 0.15833628177642822, + "learning_rate": 9.995074317690155e-05, + "loss": 0.030423592776060104, + "num_input_tokens_seen": 14197992, + "step": 867, + "train_runtime": 7144.8805, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.2404432132963989, + "grad_norm": 0.20399010181427002, + "learning_rate": 9.9950547939424e-05, + "loss": 0.024728627875447273, + "num_input_tokens_seen": 14214368, + "step": 868, + "train_runtime": 7153.101, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 0.2407202216066482, + "grad_norm": 0.19778944551944733, + "learning_rate": 9.995035231597497e-05, + "loss": 0.02920094132423401, + "num_input_tokens_seen": 14230744, + "step": 869, + "train_runtime": 7161.3256, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 0.2409972299168975, + "grad_norm": 0.16848105192184448, + "learning_rate": 9.995015630655597e-05, + "loss": 0.022000553086400032, + "num_input_tokens_seen": 14247120, + "step": 870, + "train_runtime": 7169.5526, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.2412742382271468, + "grad_norm": 0.2532685399055481, + "learning_rate": 9.994995991116854e-05, + "loss": 0.029051542282104492, + "num_input_tokens_seen": 14263496, + "step": 871, + "train_runtime": 7177.7747, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.24155124653739612, + "grad_norm": 0.19534723460674286, + "learning_rate": 9.994976312981418e-05, + "loss": 0.023829789832234383, + "num_input_tokens_seen": 14279872, + "step": 872, + "train_runtime": 7185.9857, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 0.24182825484764542, + "grad_norm": 0.19579379260540009, + "learning_rate": 9.99495659624944e-05, + "loss": 0.02587406523525715, + "num_input_tokens_seen": 14296248, + "step": 873, + "train_runtime": 7194.1907, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.24210526315789474, + "grad_norm": 0.2269769161939621, + "learning_rate": 9.994936840921075e-05, + "loss": 0.02540653757750988, + "num_input_tokens_seen": 14312624, + "step": 874, + "train_runtime": 7202.4102, + "train_tokens_per_second": 1987.199 + }, + { + "epoch": 0.24238227146814403, + "grad_norm": 0.1842319518327713, + "learning_rate": 9.994917046996472e-05, + "loss": 0.02744685672223568, + "num_input_tokens_seen": 14329000, + "step": 875, + "train_runtime": 7210.6362, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.24265927977839336, + "grad_norm": 0.16651073098182678, + "learning_rate": 9.994897214475787e-05, + "loss": 0.025367263704538345, + "num_input_tokens_seen": 14345376, + "step": 876, + "train_runtime": 7218.8573, + "train_tokens_per_second": 1987.209 + }, + { + "epoch": 0.24293628808864265, + "grad_norm": 0.3282005786895752, + "learning_rate": 9.994877343359172e-05, + "loss": 0.026430316269397736, + "num_input_tokens_seen": 14361752, + "step": 877, + "train_runtime": 7227.0774, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 0.24321329639889197, + "grad_norm": 0.22071796655654907, + "learning_rate": 9.99485743364678e-05, + "loss": 0.02635180950164795, + "num_input_tokens_seen": 14378128, + "step": 878, + "train_runtime": 7235.2957, + "train_tokens_per_second": 1987.221 + }, + { + "epoch": 0.24349030470914126, + "grad_norm": 0.22844305634498596, + "learning_rate": 9.994837485338766e-05, + "loss": 0.030178619548678398, + "num_input_tokens_seen": 14394504, + "step": 879, + "train_runtime": 7243.5053, + "train_tokens_per_second": 1987.229 + }, + { + "epoch": 0.24376731301939059, + "grad_norm": 0.2783288359642029, + "learning_rate": 9.994817498435285e-05, + "loss": 0.028789127245545387, + "num_input_tokens_seen": 14410880, + "step": 880, + "train_runtime": 7251.7169, + "train_tokens_per_second": 1987.237 + }, + { + "epoch": 0.24404432132963988, + "grad_norm": 0.25737684965133667, + "learning_rate": 9.994797472936487e-05, + "loss": 0.02710326761007309, + "num_input_tokens_seen": 14427256, + "step": 881, + "train_runtime": 7259.9285, + "train_tokens_per_second": 1987.245 + }, + { + "epoch": 0.2443213296398892, + "grad_norm": 0.2152530550956726, + "learning_rate": 9.994777408842534e-05, + "loss": 0.01922561042010784, + "num_input_tokens_seen": 14443632, + "step": 882, + "train_runtime": 7268.1296, + "train_tokens_per_second": 1987.256 + }, + { + "epoch": 0.2445983379501385, + "grad_norm": 0.1845952570438385, + "learning_rate": 9.994757306153575e-05, + "loss": 0.02695777453482151, + "num_input_tokens_seen": 14460008, + "step": 883, + "train_runtime": 7276.3524, + "train_tokens_per_second": 1987.261 + }, + { + "epoch": 0.24487534626038782, + "grad_norm": 0.20799750089645386, + "learning_rate": 9.994737164869768e-05, + "loss": 0.025061041116714478, + "num_input_tokens_seen": 14476384, + "step": 884, + "train_runtime": 7284.5619, + "train_tokens_per_second": 1987.269 + }, + { + "epoch": 0.2451523545706371, + "grad_norm": 0.24867157638072968, + "learning_rate": 9.994716984991266e-05, + "loss": 0.02761947363615036, + "num_input_tokens_seen": 14492760, + "step": 885, + "train_runtime": 7292.7694, + "train_tokens_per_second": 1987.278 + }, + { + "epoch": 0.24542936288088643, + "grad_norm": 0.24874025583267212, + "learning_rate": 9.994696766518227e-05, + "loss": 0.03142504021525383, + "num_input_tokens_seen": 14509136, + "step": 886, + "train_runtime": 7300.9773, + "train_tokens_per_second": 1987.287 + }, + { + "epoch": 0.24570637119113573, + "grad_norm": 0.17893019318580627, + "learning_rate": 9.994676509450809e-05, + "loss": 0.026652038097381592, + "num_input_tokens_seen": 14525512, + "step": 887, + "train_runtime": 7309.1807, + "train_tokens_per_second": 1987.297 + }, + { + "epoch": 0.24598337950138505, + "grad_norm": 0.16510295867919922, + "learning_rate": 9.994656213789166e-05, + "loss": 0.02257191762328148, + "num_input_tokens_seen": 14541888, + "step": 888, + "train_runtime": 7317.4013, + "train_tokens_per_second": 1987.302 + }, + { + "epoch": 0.24626038781163434, + "grad_norm": 0.1864057332277298, + "learning_rate": 9.994635879533456e-05, + "loss": 0.024082744494080544, + "num_input_tokens_seen": 14558264, + "step": 889, + "train_runtime": 7325.6147, + "train_tokens_per_second": 1987.31 + }, + { + "epoch": 0.24653739612188366, + "grad_norm": 0.1978907734155655, + "learning_rate": 9.994615506683834e-05, + "loss": 0.024319438263773918, + "num_input_tokens_seen": 14574640, + "step": 890, + "train_runtime": 7333.8282, + "train_tokens_per_second": 1987.317 + }, + { + "epoch": 0.24681440443213296, + "grad_norm": 0.1795327365398407, + "learning_rate": 9.99459509524046e-05, + "loss": 0.025288773700594902, + "num_input_tokens_seen": 14591016, + "step": 891, + "train_runtime": 7342.0354, + "train_tokens_per_second": 1987.326 + }, + { + "epoch": 0.24709141274238228, + "grad_norm": 0.16868239641189575, + "learning_rate": 9.99457464520349e-05, + "loss": 0.02506047673523426, + "num_input_tokens_seen": 14607392, + "step": 892, + "train_runtime": 7350.2522, + "train_tokens_per_second": 1987.332 + }, + { + "epoch": 0.24736842105263157, + "grad_norm": 0.1986074000597, + "learning_rate": 9.994554156573084e-05, + "loss": 0.024395886808633804, + "num_input_tokens_seen": 14623768, + "step": 893, + "train_runtime": 7358.4879, + "train_tokens_per_second": 1987.333 + }, + { + "epoch": 0.2476454293628809, + "grad_norm": 0.20004841685295105, + "learning_rate": 9.994533629349398e-05, + "loss": 0.0279482863843441, + "num_input_tokens_seen": 14640144, + "step": 894, + "train_runtime": 7366.7169, + "train_tokens_per_second": 1987.336 + }, + { + "epoch": 0.24792243767313019, + "grad_norm": 0.1756807565689087, + "learning_rate": 9.994513063532591e-05, + "loss": 0.026669519022107124, + "num_input_tokens_seen": 14656520, + "step": 895, + "train_runtime": 7374.9528, + "train_tokens_per_second": 1987.337 + }, + { + "epoch": 0.2481994459833795, + "grad_norm": 0.16757360100746155, + "learning_rate": 9.994492459122824e-05, + "loss": 0.022783052176237106, + "num_input_tokens_seen": 14672896, + "step": 896, + "train_runtime": 7383.1816, + "train_tokens_per_second": 1987.341 + }, + { + "epoch": 0.2484764542936288, + "grad_norm": 0.26798295974731445, + "learning_rate": 9.994471816120255e-05, + "loss": 0.02952551282942295, + "num_input_tokens_seen": 14689272, + "step": 897, + "train_runtime": 7391.4207, + "train_tokens_per_second": 1987.341 + }, + { + "epoch": 0.24875346260387812, + "grad_norm": 0.1769370287656784, + "learning_rate": 9.994451134525041e-05, + "loss": 0.02406635507941246, + "num_input_tokens_seen": 14705648, + "step": 898, + "train_runtime": 7399.6613, + "train_tokens_per_second": 1987.341 + }, + { + "epoch": 0.24903047091412742, + "grad_norm": 0.19307555258274078, + "learning_rate": 9.994430414337345e-05, + "loss": 0.024278316646814346, + "num_input_tokens_seen": 14722024, + "step": 899, + "train_runtime": 7407.8874, + "train_tokens_per_second": 1987.344 + }, + { + "epoch": 0.24930747922437674, + "grad_norm": 0.18157581984996796, + "learning_rate": 9.994409655557329e-05, + "loss": 0.02645995281636715, + "num_input_tokens_seen": 14738400, + "step": 900, + "train_runtime": 7416.1281, + "train_tokens_per_second": 1987.344 + }, + { + "epoch": 0.24958448753462603, + "grad_norm": 0.2725801169872284, + "learning_rate": 9.994388858185147e-05, + "loss": 0.022249147295951843, + "num_input_tokens_seen": 14754776, + "step": 901, + "train_runtime": 7426.0756, + "train_tokens_per_second": 1986.887 + }, + { + "epoch": 0.24986149584487535, + "grad_norm": 0.17605125904083252, + "learning_rate": 9.994368022220964e-05, + "loss": 0.02105383202433586, + "num_input_tokens_seen": 14771152, + "step": 902, + "train_runtime": 7434.3001, + "train_tokens_per_second": 1986.892 + }, + { + "epoch": 0.2501385041551247, + "grad_norm": 0.16388697922229767, + "learning_rate": 9.994347147664942e-05, + "loss": 0.021140960976481438, + "num_input_tokens_seen": 14787528, + "step": 903, + "train_runtime": 7442.5316, + "train_tokens_per_second": 1986.895 + }, + { + "epoch": 0.25041551246537397, + "grad_norm": 0.16610711812973022, + "learning_rate": 9.994326234517239e-05, + "loss": 0.02497965842485428, + "num_input_tokens_seen": 14803904, + "step": 904, + "train_runtime": 7450.7534, + "train_tokens_per_second": 1986.9 + }, + { + "epoch": 0.25069252077562326, + "grad_norm": 0.17361487448215485, + "learning_rate": 9.99430528277802e-05, + "loss": 0.028389213606715202, + "num_input_tokens_seen": 14820280, + "step": 905, + "train_runtime": 7458.9756, + "train_tokens_per_second": 1986.906 + }, + { + "epoch": 0.25096952908587256, + "grad_norm": 0.15028329193592072, + "learning_rate": 9.994284292447443e-05, + "loss": 0.020646236836910248, + "num_input_tokens_seen": 14836656, + "step": 906, + "train_runtime": 7467.1866, + "train_tokens_per_second": 1986.914 + }, + { + "epoch": 0.2512465373961219, + "grad_norm": 0.24270954728126526, + "learning_rate": 9.994263263525673e-05, + "loss": 0.025069059804081917, + "num_input_tokens_seen": 14853032, + "step": 907, + "train_runtime": 7475.4027, + "train_tokens_per_second": 1986.921 + }, + { + "epoch": 0.2515235457063712, + "grad_norm": 0.658909261226654, + "learning_rate": 9.994242196012872e-05, + "loss": 0.02504490129649639, + "num_input_tokens_seen": 14869408, + "step": 908, + "train_runtime": 7483.6365, + "train_tokens_per_second": 1986.923 + }, + { + "epoch": 0.2518005540166205, + "grad_norm": 0.22687804698944092, + "learning_rate": 9.994221089909203e-05, + "loss": 0.024782732129096985, + "num_input_tokens_seen": 14885784, + "step": 909, + "train_runtime": 7491.8713, + "train_tokens_per_second": 1986.925 + }, + { + "epoch": 0.2520775623268698, + "grad_norm": 0.1706182062625885, + "learning_rate": 9.994199945214828e-05, + "loss": 0.02328740432858467, + "num_input_tokens_seen": 14902160, + "step": 910, + "train_runtime": 7500.1001, + "train_tokens_per_second": 1986.928 + }, + { + "epoch": 0.25235457063711914, + "grad_norm": 0.16871702671051025, + "learning_rate": 9.994178761929911e-05, + "loss": 0.028631363064050674, + "num_input_tokens_seen": 14918536, + "step": 911, + "train_runtime": 7508.3174, + "train_tokens_per_second": 1986.935 + }, + { + "epoch": 0.25263157894736843, + "grad_norm": 0.27350640296936035, + "learning_rate": 9.994157540054616e-05, + "loss": 0.024980468675494194, + "num_input_tokens_seen": 14934912, + "step": 912, + "train_runtime": 7516.5414, + "train_tokens_per_second": 1986.939 + }, + { + "epoch": 0.2529085872576177, + "grad_norm": 0.20289498567581177, + "learning_rate": 9.994136279589106e-05, + "loss": 0.024839770048856735, + "num_input_tokens_seen": 14951288, + "step": 913, + "train_runtime": 7524.7766, + "train_tokens_per_second": 1986.941 + }, + { + "epoch": 0.253185595567867, + "grad_norm": 0.14613203704357147, + "learning_rate": 9.994114980533548e-05, + "loss": 0.024799007922410965, + "num_input_tokens_seen": 14967664, + "step": 914, + "train_runtime": 7533.0088, + "train_tokens_per_second": 1986.944 + }, + { + "epoch": 0.25346260387811637, + "grad_norm": 0.16328032314777374, + "learning_rate": 9.994093642888102e-05, + "loss": 0.02247389778494835, + "num_input_tokens_seen": 14984040, + "step": 915, + "train_runtime": 7541.2382, + "train_tokens_per_second": 1986.947 + }, + { + "epoch": 0.25373961218836566, + "grad_norm": 0.15637992322444916, + "learning_rate": 9.994072266652937e-05, + "loss": 0.022322172299027443, + "num_input_tokens_seen": 15000416, + "step": 916, + "train_runtime": 7549.4557, + "train_tokens_per_second": 1986.953 + }, + { + "epoch": 0.25401662049861495, + "grad_norm": 0.15893803536891937, + "learning_rate": 9.994050851828217e-05, + "loss": 0.02384340576827526, + "num_input_tokens_seen": 15016792, + "step": 917, + "train_runtime": 7557.6835, + "train_tokens_per_second": 1986.957 + }, + { + "epoch": 0.25429362880886425, + "grad_norm": 0.14501631259918213, + "learning_rate": 9.994029398414106e-05, + "loss": 0.02108130231499672, + "num_input_tokens_seen": 15033168, + "step": 918, + "train_runtime": 7565.9178, + "train_tokens_per_second": 1986.959 + }, + { + "epoch": 0.2545706371191136, + "grad_norm": 0.19052138924598694, + "learning_rate": 9.994007906410771e-05, + "loss": 0.026035837829113007, + "num_input_tokens_seen": 15049544, + "step": 919, + "train_runtime": 7574.1372, + "train_tokens_per_second": 1986.965 + }, + { + "epoch": 0.2548476454293629, + "grad_norm": 0.17812925577163696, + "learning_rate": 9.993986375818379e-05, + "loss": 0.023693155497312546, + "num_input_tokens_seen": 15065920, + "step": 920, + "train_runtime": 7582.3697, + "train_tokens_per_second": 1986.967 + }, + { + "epoch": 0.2551246537396122, + "grad_norm": 0.18234047293663025, + "learning_rate": 9.993964806637095e-05, + "loss": 0.024774739518761635, + "num_input_tokens_seen": 15082296, + "step": 921, + "train_runtime": 7590.5894, + "train_tokens_per_second": 1986.973 + }, + { + "epoch": 0.2554016620498615, + "grad_norm": 0.2409335821866989, + "learning_rate": 9.993943198867085e-05, + "loss": 0.03333528712391853, + "num_input_tokens_seen": 15098672, + "step": 922, + "train_runtime": 7598.8259, + "train_tokens_per_second": 1986.974 + }, + { + "epoch": 0.2556786703601108, + "grad_norm": 0.2797365188598633, + "learning_rate": 9.993921552508518e-05, + "loss": 0.030820367857813835, + "num_input_tokens_seen": 15115048, + "step": 923, + "train_runtime": 7607.0567, + "train_tokens_per_second": 1986.977 + }, + { + "epoch": 0.2559556786703601, + "grad_norm": 0.17652840912342072, + "learning_rate": 9.99389986756156e-05, + "loss": 0.02345399558544159, + "num_input_tokens_seen": 15131424, + "step": 924, + "train_runtime": 7615.2651, + "train_tokens_per_second": 1986.986 + }, + { + "epoch": 0.2562326869806094, + "grad_norm": 0.17230656743049622, + "learning_rate": 9.993878144026378e-05, + "loss": 0.024941084906458855, + "num_input_tokens_seen": 15147800, + "step": 925, + "train_runtime": 7623.4781, + "train_tokens_per_second": 1986.993 + }, + { + "epoch": 0.2565096952908587, + "grad_norm": 0.21064700186252594, + "learning_rate": 9.993856381903141e-05, + "loss": 0.0285328458994627, + "num_input_tokens_seen": 15164176, + "step": 926, + "train_runtime": 7631.6954, + "train_tokens_per_second": 1987.0 + }, + { + "epoch": 0.25678670360110806, + "grad_norm": 0.1599854677915573, + "learning_rate": 9.993834581192016e-05, + "loss": 0.02487838640809059, + "num_input_tokens_seen": 15180552, + "step": 927, + "train_runtime": 7639.9169, + "train_tokens_per_second": 1987.005 + }, + { + "epoch": 0.25706371191135735, + "grad_norm": 0.1936967819929123, + "learning_rate": 9.993812741893173e-05, + "loss": 0.020068887621164322, + "num_input_tokens_seen": 15196928, + "step": 928, + "train_runtime": 7648.1546, + "train_tokens_per_second": 1987.006 + }, + { + "epoch": 0.25734072022160664, + "grad_norm": 0.150935560464859, + "learning_rate": 9.993790864006783e-05, + "loss": 0.024069251492619514, + "num_input_tokens_seen": 15213304, + "step": 929, + "train_runtime": 7656.3834, + "train_tokens_per_second": 1987.009 + }, + { + "epoch": 0.25761772853185594, + "grad_norm": 0.17071941494941711, + "learning_rate": 9.993768947533008e-05, + "loss": 0.029304299503564835, + "num_input_tokens_seen": 15229680, + "step": 930, + "train_runtime": 7664.6092, + "train_tokens_per_second": 1987.013 + }, + { + "epoch": 0.2578947368421053, + "grad_norm": 0.16253404319286346, + "learning_rate": 9.993746992472022e-05, + "loss": 0.027791811153292656, + "num_input_tokens_seen": 15246056, + "step": 931, + "train_runtime": 7672.8526, + "train_tokens_per_second": 1987.013 + }, + { + "epoch": 0.2581717451523546, + "grad_norm": 0.16741327941417694, + "learning_rate": 9.993724998823995e-05, + "loss": 0.020080944523215294, + "num_input_tokens_seen": 15262432, + "step": 932, + "train_runtime": 7681.0829, + "train_tokens_per_second": 1987.016 + }, + { + "epoch": 0.2584487534626039, + "grad_norm": 0.22808831930160522, + "learning_rate": 9.993702966589096e-05, + "loss": 0.03443126752972603, + "num_input_tokens_seen": 15278808, + "step": 933, + "train_runtime": 7689.3144, + "train_tokens_per_second": 1987.018 + }, + { + "epoch": 0.25872576177285317, + "grad_norm": 0.18006493151187897, + "learning_rate": 9.993680895767495e-05, + "loss": 0.022446442395448685, + "num_input_tokens_seen": 15295184, + "step": 934, + "train_runtime": 7697.5383, + "train_tokens_per_second": 1987.023 + }, + { + "epoch": 0.2590027700831025, + "grad_norm": 0.1672252118587494, + "learning_rate": 9.993658786359362e-05, + "loss": 0.02786257676780224, + "num_input_tokens_seen": 15311560, + "step": 935, + "train_runtime": 7705.7684, + "train_tokens_per_second": 1987.026 + }, + { + "epoch": 0.2592797783933518, + "grad_norm": 0.18242554366588593, + "learning_rate": 9.99363663836487e-05, + "loss": 0.024109065532684326, + "num_input_tokens_seen": 15327936, + "step": 936, + "train_runtime": 7713.9898, + "train_tokens_per_second": 1987.031 + }, + { + "epoch": 0.2595567867036011, + "grad_norm": 0.16943654417991638, + "learning_rate": 9.993614451784188e-05, + "loss": 0.02536948025226593, + "num_input_tokens_seen": 15344312, + "step": 937, + "train_runtime": 7722.2109, + "train_tokens_per_second": 1987.036 + }, + { + "epoch": 0.2598337950138504, + "grad_norm": 0.1778716892004013, + "learning_rate": 9.993592226617487e-05, + "loss": 0.027220875024795532, + "num_input_tokens_seen": 15360688, + "step": 938, + "train_runtime": 7730.4414, + "train_tokens_per_second": 1987.039 + }, + { + "epoch": 0.26011080332409975, + "grad_norm": 0.16620925068855286, + "learning_rate": 9.993569962864943e-05, + "loss": 0.025238288566470146, + "num_input_tokens_seen": 15377064, + "step": 939, + "train_runtime": 7738.6622, + "train_tokens_per_second": 1987.044 + }, + { + "epoch": 0.26038781163434904, + "grad_norm": 0.15234322845935822, + "learning_rate": 9.993547660526723e-05, + "loss": 0.02655555121600628, + "num_input_tokens_seen": 15393440, + "step": 940, + "train_runtime": 7746.8867, + "train_tokens_per_second": 1987.049 + }, + { + "epoch": 0.26066481994459834, + "grad_norm": 0.2456250935792923, + "learning_rate": 9.993525319603e-05, + "loss": 0.027966031804680824, + "num_input_tokens_seen": 15409816, + "step": 941, + "train_runtime": 7755.1117, + "train_tokens_per_second": 1987.053 + }, + { + "epoch": 0.26094182825484763, + "grad_norm": 0.24138350784778595, + "learning_rate": 9.993502940093951e-05, + "loss": 0.026897422969341278, + "num_input_tokens_seen": 15426192, + "step": 942, + "train_runtime": 7763.336, + "train_tokens_per_second": 1987.057 + }, + { + "epoch": 0.261218836565097, + "grad_norm": 0.23362664878368378, + "learning_rate": 9.993480521999745e-05, + "loss": 0.026110319420695305, + "num_input_tokens_seen": 15442568, + "step": 943, + "train_runtime": 7771.5554, + "train_tokens_per_second": 1987.063 + }, + { + "epoch": 0.2614958448753463, + "grad_norm": 0.1734882891178131, + "learning_rate": 9.993458065320554e-05, + "loss": 0.02787313424050808, + "num_input_tokens_seen": 15458944, + "step": 944, + "train_runtime": 7779.7816, + "train_tokens_per_second": 1987.067 + }, + { + "epoch": 0.26177285318559557, + "grad_norm": 0.21173237264156342, + "learning_rate": 9.993435570056556e-05, + "loss": 0.026480315253138542, + "num_input_tokens_seen": 15475320, + "step": 945, + "train_runtime": 7788.0163, + "train_tokens_per_second": 1987.068 + }, + { + "epoch": 0.26204986149584486, + "grad_norm": 0.18411728739738464, + "learning_rate": 9.993413036207921e-05, + "loss": 0.026142871007323265, + "num_input_tokens_seen": 15491696, + "step": 946, + "train_runtime": 7796.2522, + "train_tokens_per_second": 1987.07 + }, + { + "epoch": 0.2623268698060942, + "grad_norm": 0.1984865367412567, + "learning_rate": 9.993390463774825e-05, + "loss": 0.028312593698501587, + "num_input_tokens_seen": 15508072, + "step": 947, + "train_runtime": 7804.4742, + "train_tokens_per_second": 1987.075 + }, + { + "epoch": 0.2626038781163435, + "grad_norm": 0.13530446588993073, + "learning_rate": 9.993367852757443e-05, + "loss": 0.018686970695853233, + "num_input_tokens_seen": 15524448, + "step": 948, + "train_runtime": 7812.6922, + "train_tokens_per_second": 1987.08 + }, + { + "epoch": 0.2628808864265928, + "grad_norm": 0.15911328792572021, + "learning_rate": 9.993345203155946e-05, + "loss": 0.030134763568639755, + "num_input_tokens_seen": 15540824, + "step": 949, + "train_runtime": 7820.9227, + "train_tokens_per_second": 1987.083 + }, + { + "epoch": 0.2631578947368421, + "grad_norm": 0.1805344820022583, + "learning_rate": 9.993322514970513e-05, + "loss": 0.024302663281559944, + "num_input_tokens_seen": 15557200, + "step": 950, + "train_runtime": 7829.1522, + "train_tokens_per_second": 1987.086 + }, + { + "epoch": 0.26343490304709144, + "grad_norm": 0.17490877211093903, + "learning_rate": 9.993299788201319e-05, + "loss": 0.02213442698121071, + "num_input_tokens_seen": 15573576, + "step": 951, + "train_runtime": 7837.3853, + "train_tokens_per_second": 1987.088 + }, + { + "epoch": 0.26371191135734073, + "grad_norm": 0.10751171410083771, + "learning_rate": 9.993277022848538e-05, + "loss": 0.02041861042380333, + "num_input_tokens_seen": 15589952, + "step": 952, + "train_runtime": 7845.6015, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.26398891966759, + "grad_norm": 0.21628983318805695, + "learning_rate": 9.993254218912346e-05, + "loss": 0.030199764296412468, + "num_input_tokens_seen": 15606328, + "step": 953, + "train_runtime": 7853.8164, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.2642659279778393, + "grad_norm": 0.13229431211948395, + "learning_rate": 9.993231376392922e-05, + "loss": 0.02093045972287655, + "num_input_tokens_seen": 15622704, + "step": 954, + "train_runtime": 7862.0263, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.26454293628808867, + "grad_norm": 0.1589435338973999, + "learning_rate": 9.993208495290438e-05, + "loss": 0.02371392957866192, + "num_input_tokens_seen": 15639080, + "step": 955, + "train_runtime": 7870.2348, + "train_tokens_per_second": 1987.117 + }, + { + "epoch": 0.26481994459833796, + "grad_norm": 0.21026448905467987, + "learning_rate": 9.993185575605073e-05, + "loss": 0.027992434799671173, + "num_input_tokens_seen": 15655456, + "step": 956, + "train_runtime": 7878.4524, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.26509695290858726, + "grad_norm": 0.1953686624765396, + "learning_rate": 9.993162617337005e-05, + "loss": 0.02152480185031891, + "num_input_tokens_seen": 15671832, + "step": 957, + "train_runtime": 7886.6791, + "train_tokens_per_second": 1987.127 + }, + { + "epoch": 0.26537396121883655, + "grad_norm": 0.2339850217103958, + "learning_rate": 9.99313962048641e-05, + "loss": 0.025983309373259544, + "num_input_tokens_seen": 15688208, + "step": 958, + "train_runtime": 7894.9088, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.26565096952908585, + "grad_norm": 0.16738338768482208, + "learning_rate": 9.993116585053467e-05, + "loss": 0.023840434849262238, + "num_input_tokens_seen": 15704584, + "step": 959, + "train_runtime": 7903.1384, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.2659279778393352, + "grad_norm": 0.21752123534679413, + "learning_rate": 9.993093511038352e-05, + "loss": 0.023505819961428642, + "num_input_tokens_seen": 15720960, + "step": 960, + "train_runtime": 7911.3704, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.2662049861495845, + "grad_norm": 0.20224128663539886, + "learning_rate": 9.993070398441247e-05, + "loss": 0.023546744138002396, + "num_input_tokens_seen": 15737336, + "step": 961, + "train_runtime": 7919.5883, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.2664819944598338, + "grad_norm": 0.2017151266336441, + "learning_rate": 9.993047247262325e-05, + "loss": 0.02586751990020275, + "num_input_tokens_seen": 15753712, + "step": 962, + "train_runtime": 7927.8023, + "train_tokens_per_second": 1987.147 + }, + { + "epoch": 0.2667590027700831, + "grad_norm": 0.14566147327423096, + "learning_rate": 9.99302405750177e-05, + "loss": 0.025929613038897514, + "num_input_tokens_seen": 15770088, + "step": 963, + "train_runtime": 7936.0152, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.2670360110803324, + "grad_norm": 0.15230807662010193, + "learning_rate": 9.993000829159759e-05, + "loss": 0.023608200252056122, + "num_input_tokens_seen": 15786464, + "step": 964, + "train_runtime": 7944.226, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 0.2673130193905817, + "grad_norm": 0.17467093467712402, + "learning_rate": 9.992977562236471e-05, + "loss": 0.024607546627521515, + "num_input_tokens_seen": 15802840, + "step": 965, + "train_runtime": 7952.4329, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.267590027700831, + "grad_norm": 0.1499156355857849, + "learning_rate": 9.992954256732088e-05, + "loss": 0.02504926733672619, + "num_input_tokens_seen": 15819216, + "step": 966, + "train_runtime": 7960.6398, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.2678670360110803, + "grad_norm": 0.183879092335701, + "learning_rate": 9.992930912646787e-05, + "loss": 0.025343991816043854, + "num_input_tokens_seen": 15835592, + "step": 967, + "train_runtime": 7968.8516, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.26814404432132966, + "grad_norm": 0.20636272430419922, + "learning_rate": 9.992907529980751e-05, + "loss": 0.02588171325623989, + "num_input_tokens_seen": 15851968, + "step": 968, + "train_runtime": 7977.0625, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 0.26842105263157895, + "grad_norm": 0.17922182381153107, + "learning_rate": 9.992884108734159e-05, + "loss": 0.02256583608686924, + "num_input_tokens_seen": 15868344, + "step": 969, + "train_runtime": 7985.2696, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 0.26869806094182824, + "grad_norm": 0.17287328839302063, + "learning_rate": 9.992860648907192e-05, + "loss": 0.02311789244413376, + "num_input_tokens_seen": 15884720, + "step": 970, + "train_runtime": 7993.4802, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.26897506925207754, + "grad_norm": 0.19822144508361816, + "learning_rate": 9.992837150500033e-05, + "loss": 0.027842065319418907, + "num_input_tokens_seen": 15901096, + "step": 971, + "train_runtime": 8001.6867, + "train_tokens_per_second": 1987.218 + }, + { + "epoch": 0.2692520775623269, + "grad_norm": 0.16457408666610718, + "learning_rate": 9.992813613512862e-05, + "loss": 0.023625832051038742, + "num_input_tokens_seen": 15917472, + "step": 972, + "train_runtime": 8009.8981, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 0.2695290858725762, + "grad_norm": 0.21237362921237946, + "learning_rate": 9.992790037945862e-05, + "loss": 0.02665727399289608, + "num_input_tokens_seen": 15933848, + "step": 973, + "train_runtime": 8018.1096, + "train_tokens_per_second": 1987.233 + }, + { + "epoch": 0.2698060941828255, + "grad_norm": 0.1601550579071045, + "learning_rate": 9.992766423799213e-05, + "loss": 0.025894880294799805, + "num_input_tokens_seen": 15950224, + "step": 974, + "train_runtime": 8026.3155, + "train_tokens_per_second": 1987.241 + }, + { + "epoch": 0.27008310249307477, + "grad_norm": 0.1621096432209015, + "learning_rate": 9.9927427710731e-05, + "loss": 0.025526512414216995, + "num_input_tokens_seen": 15966600, + "step": 975, + "train_runtime": 8034.5438, + "train_tokens_per_second": 1987.244 + }, + { + "epoch": 0.2703601108033241, + "grad_norm": 0.2562674283981323, + "learning_rate": 9.992719079767705e-05, + "loss": 0.026342231780290604, + "num_input_tokens_seen": 15982976, + "step": 976, + "train_runtime": 8042.7638, + "train_tokens_per_second": 1987.249 + }, + { + "epoch": 0.2706371191135734, + "grad_norm": 0.1305997222661972, + "learning_rate": 9.992695349883211e-05, + "loss": 0.02625022642314434, + "num_input_tokens_seen": 15999352, + "step": 977, + "train_runtime": 8050.9803, + "train_tokens_per_second": 1987.255 + }, + { + "epoch": 0.2709141274238227, + "grad_norm": 0.13998204469680786, + "learning_rate": 9.9926715814198e-05, + "loss": 0.0224491935223341, + "num_input_tokens_seen": 16015728, + "step": 978, + "train_runtime": 8059.2182, + "train_tokens_per_second": 1987.256 + }, + { + "epoch": 0.271191135734072, + "grad_norm": 0.2220943719148636, + "learning_rate": 9.992647774377658e-05, + "loss": 0.027270633727312088, + "num_input_tokens_seen": 16032104, + "step": 979, + "train_runtime": 8067.4523, + "train_tokens_per_second": 1987.257 + }, + { + "epoch": 0.27146814404432135, + "grad_norm": 0.2239622324705124, + "learning_rate": 9.992623928756968e-05, + "loss": 0.02811247855424881, + "num_input_tokens_seen": 16048480, + "step": 980, + "train_runtime": 8075.6787, + "train_tokens_per_second": 1987.261 + }, + { + "epoch": 0.27174515235457064, + "grad_norm": 0.1466871201992035, + "learning_rate": 9.992600044557913e-05, + "loss": 0.02184474654495716, + "num_input_tokens_seen": 16064856, + "step": 981, + "train_runtime": 8083.9055, + "train_tokens_per_second": 1987.264 + }, + { + "epoch": 0.27202216066481993, + "grad_norm": 0.1601216346025467, + "learning_rate": 9.99257612178068e-05, + "loss": 0.023286717012524605, + "num_input_tokens_seen": 16081232, + "step": 982, + "train_runtime": 8092.1432, + "train_tokens_per_second": 1987.265 + }, + { + "epoch": 0.27229916897506923, + "grad_norm": 0.22372455894947052, + "learning_rate": 9.99255216042545e-05, + "loss": 0.02716692164540291, + "num_input_tokens_seen": 16097608, + "step": 983, + "train_runtime": 8100.3749, + "train_tokens_per_second": 1987.267 + }, + { + "epoch": 0.2725761772853186, + "grad_norm": 0.16405276954174042, + "learning_rate": 9.992528160492412e-05, + "loss": 0.0235601793974638, + "num_input_tokens_seen": 16113984, + "step": 984, + "train_runtime": 8108.5992, + "train_tokens_per_second": 1987.271 + }, + { + "epoch": 0.27285318559556787, + "grad_norm": 0.26221776008605957, + "learning_rate": 9.992504121981749e-05, + "loss": 0.026271022856235504, + "num_input_tokens_seen": 16130360, + "step": 985, + "train_runtime": 8116.8228, + "train_tokens_per_second": 1987.275 + }, + { + "epoch": 0.27313019390581716, + "grad_norm": 0.1610620617866516, + "learning_rate": 9.992480044893649e-05, + "loss": 0.02387000434100628, + "num_input_tokens_seen": 16146736, + "step": 986, + "train_runtime": 8125.0543, + "train_tokens_per_second": 1987.277 + }, + { + "epoch": 0.27340720221606646, + "grad_norm": 0.18962165713310242, + "learning_rate": 9.992455929228297e-05, + "loss": 0.022724168375134468, + "num_input_tokens_seen": 16163112, + "step": 987, + "train_runtime": 8133.2825, + "train_tokens_per_second": 1987.28 + }, + { + "epoch": 0.2736842105263158, + "grad_norm": 0.1853899508714676, + "learning_rate": 9.992431774985879e-05, + "loss": 0.02118835598230362, + "num_input_tokens_seen": 16179488, + "step": 988, + "train_runtime": 8141.4869, + "train_tokens_per_second": 1987.289 + }, + { + "epoch": 0.2739612188365651, + "grad_norm": 0.14042146503925323, + "learning_rate": 9.992407582166581e-05, + "loss": 0.02549121156334877, + "num_input_tokens_seen": 16195864, + "step": 989, + "train_runtime": 8149.7009, + "train_tokens_per_second": 1987.296 + }, + { + "epoch": 0.2742382271468144, + "grad_norm": 0.23451824486255646, + "learning_rate": 9.992383350770593e-05, + "loss": 0.030296072363853455, + "num_input_tokens_seen": 16212240, + "step": 990, + "train_runtime": 8157.9113, + "train_tokens_per_second": 1987.303 + }, + { + "epoch": 0.2745152354570637, + "grad_norm": 0.2036575973033905, + "learning_rate": 9.992359080798097e-05, + "loss": 0.02388814650475979, + "num_input_tokens_seen": 16228616, + "step": 991, + "train_runtime": 8166.163, + "train_tokens_per_second": 1987.3 + }, + { + "epoch": 0.27479224376731304, + "grad_norm": 0.20838774740695953, + "learning_rate": 9.992334772249286e-05, + "loss": 0.02611522004008293, + "num_input_tokens_seen": 16244992, + "step": 992, + "train_runtime": 8174.3818, + "train_tokens_per_second": 1987.305 + }, + { + "epoch": 0.27506925207756233, + "grad_norm": 0.1400173157453537, + "learning_rate": 9.992310425124343e-05, + "loss": 0.02394520863890648, + "num_input_tokens_seen": 16261368, + "step": 993, + "train_runtime": 8182.5972, + "train_tokens_per_second": 1987.311 + }, + { + "epoch": 0.2753462603878116, + "grad_norm": 0.18808767199516296, + "learning_rate": 9.99228603942346e-05, + "loss": 0.026477903127670288, + "num_input_tokens_seen": 16277744, + "step": 994, + "train_runtime": 8190.8205, + "train_tokens_per_second": 1987.315 + }, + { + "epoch": 0.2756232686980609, + "grad_norm": 0.1791106015443802, + "learning_rate": 9.992261615146825e-05, + "loss": 0.021666204556822777, + "num_input_tokens_seen": 16294120, + "step": 995, + "train_runtime": 8199.0404, + "train_tokens_per_second": 1987.32 + }, + { + "epoch": 0.27590027700831027, + "grad_norm": 0.19191110134124756, + "learning_rate": 9.992237152294623e-05, + "loss": 0.025428975000977516, + "num_input_tokens_seen": 16310496, + "step": 996, + "train_runtime": 8207.2704, + "train_tokens_per_second": 1987.323 + }, + { + "epoch": 0.27617728531855956, + "grad_norm": 0.21839921176433563, + "learning_rate": 9.992212650867048e-05, + "loss": 0.025501739233732224, + "num_input_tokens_seen": 16326872, + "step": 997, + "train_runtime": 8215.5052, + "train_tokens_per_second": 1987.324 + }, + { + "epoch": 0.27645429362880886, + "grad_norm": 0.16724136471748352, + "learning_rate": 9.992188110864287e-05, + "loss": 0.022279758006334305, + "num_input_tokens_seen": 16343248, + "step": 998, + "train_runtime": 8223.7289, + "train_tokens_per_second": 1987.328 + }, + { + "epoch": 0.27673130193905815, + "grad_norm": 0.18534642457962036, + "learning_rate": 9.992163532286529e-05, + "loss": 0.027268756181001663, + "num_input_tokens_seen": 16359624, + "step": 999, + "train_runtime": 8231.9564, + "train_tokens_per_second": 1987.331 + }, + { + "epoch": 0.2770083102493075, + "grad_norm": 0.15519466996192932, + "learning_rate": 9.992138915133965e-05, + "loss": 0.027878817170858383, + "num_input_tokens_seen": 16376000, + "step": 1000, + "train_runtime": 8240.1925, + "train_tokens_per_second": 1987.332 + }, + { + "epoch": 0.2772853185595568, + "grad_norm": 0.15402540564537048, + "learning_rate": 9.992114259406783e-05, + "loss": 0.024497197940945625, + "num_input_tokens_seen": 16392376, + "step": 1001, + "train_runtime": 8250.138, + "train_tokens_per_second": 1986.921 + }, + { + "epoch": 0.2775623268698061, + "grad_norm": 0.16120629012584686, + "learning_rate": 9.992089565105176e-05, + "loss": 0.027395427227020264, + "num_input_tokens_seen": 16408752, + "step": 1002, + "train_runtime": 8258.3602, + "train_tokens_per_second": 1986.926 + }, + { + "epoch": 0.2778393351800554, + "grad_norm": 0.1683523952960968, + "learning_rate": 9.992064832229336e-05, + "loss": 0.024159587919712067, + "num_input_tokens_seen": 16425128, + "step": 1003, + "train_runtime": 8266.5821, + "train_tokens_per_second": 1986.931 + }, + { + "epoch": 0.27811634349030473, + "grad_norm": 0.13804484903812408, + "learning_rate": 9.99204006077945e-05, + "loss": 0.023670583963394165, + "num_input_tokens_seen": 16441504, + "step": 1004, + "train_runtime": 8274.7943, + "train_tokens_per_second": 1986.938 + }, + { + "epoch": 0.278393351800554, + "grad_norm": 0.25383880734443665, + "learning_rate": 9.992015250755711e-05, + "loss": 0.026313988491892815, + "num_input_tokens_seen": 16457880, + "step": 1005, + "train_runtime": 8283.0103, + "train_tokens_per_second": 1986.944 + }, + { + "epoch": 0.2786703601108033, + "grad_norm": 0.17504367232322693, + "learning_rate": 9.991990402158313e-05, + "loss": 0.02368432842195034, + "num_input_tokens_seen": 16474256, + "step": 1006, + "train_runtime": 8291.236, + "train_tokens_per_second": 1986.948 + }, + { + "epoch": 0.2789473684210526, + "grad_norm": 0.2290000468492508, + "learning_rate": 9.991965514987447e-05, + "loss": 0.029036428779363632, + "num_input_tokens_seen": 16490632, + "step": 1007, + "train_runtime": 8299.4561, + "train_tokens_per_second": 1986.953 + }, + { + "epoch": 0.27922437673130196, + "grad_norm": 0.16755715012550354, + "learning_rate": 9.991940589243304e-05, + "loss": 0.020788095891475677, + "num_input_tokens_seen": 16507008, + "step": 1008, + "train_runtime": 8307.6624, + "train_tokens_per_second": 1986.962 + }, + { + "epoch": 0.27950138504155125, + "grad_norm": 0.1619686782360077, + "learning_rate": 9.991915624926077e-05, + "loss": 0.02274647355079651, + "num_input_tokens_seen": 16523384, + "step": 1009, + "train_runtime": 8315.8772, + "train_tokens_per_second": 1986.968 + }, + { + "epoch": 0.27977839335180055, + "grad_norm": 0.2177937775850296, + "learning_rate": 9.991890622035959e-05, + "loss": 0.024791203439235687, + "num_input_tokens_seen": 16539760, + "step": 1010, + "train_runtime": 8324.1053, + "train_tokens_per_second": 1986.972 + }, + { + "epoch": 0.28005540166204984, + "grad_norm": 0.13139982521533966, + "learning_rate": 9.991865580573143e-05, + "loss": 0.020151009783148766, + "num_input_tokens_seen": 16556136, + "step": 1011, + "train_runtime": 8332.3348, + "train_tokens_per_second": 1986.974 + }, + { + "epoch": 0.2803324099722992, + "grad_norm": 0.17272086441516876, + "learning_rate": 9.991840500537823e-05, + "loss": 0.02301293984055519, + "num_input_tokens_seen": 16572512, + "step": 1012, + "train_runtime": 8340.5864, + "train_tokens_per_second": 1986.972 + }, + { + "epoch": 0.2806094182825485, + "grad_norm": 0.18144965171813965, + "learning_rate": 9.991815381930194e-05, + "loss": 0.025012215599417686, + "num_input_tokens_seen": 16588888, + "step": 1013, + "train_runtime": 8348.829, + "train_tokens_per_second": 1986.972 + }, + { + "epoch": 0.2808864265927978, + "grad_norm": 0.13951127231121063, + "learning_rate": 9.991790224750448e-05, + "loss": 0.02438695915043354, + "num_input_tokens_seen": 16605264, + "step": 1014, + "train_runtime": 8357.0628, + "train_tokens_per_second": 1986.974 + }, + { + "epoch": 0.28116343490304707, + "grad_norm": 0.13020163774490356, + "learning_rate": 9.991765028998779e-05, + "loss": 0.02163827046751976, + "num_input_tokens_seen": 16621640, + "step": 1015, + "train_runtime": 8365.2915, + "train_tokens_per_second": 1986.977 + }, + { + "epoch": 0.2814404432132964, + "grad_norm": 0.23467856645584106, + "learning_rate": 9.991739794675385e-05, + "loss": 0.025148091837763786, + "num_input_tokens_seen": 16638016, + "step": 1016, + "train_runtime": 8373.5256, + "train_tokens_per_second": 1986.979 + }, + { + "epoch": 0.2817174515235457, + "grad_norm": 0.22505635023117065, + "learning_rate": 9.991714521780458e-05, + "loss": 0.024809693917632103, + "num_input_tokens_seen": 16654392, + "step": 1017, + "train_runtime": 8381.7551, + "train_tokens_per_second": 1986.981 + }, + { + "epoch": 0.281994459833795, + "grad_norm": 0.21691486239433289, + "learning_rate": 9.991689210314192e-05, + "loss": 0.037083178758621216, + "num_input_tokens_seen": 16670768, + "step": 1018, + "train_runtime": 8389.9908, + "train_tokens_per_second": 1986.983 + }, + { + "epoch": 0.2822714681440443, + "grad_norm": 0.11059269309043884, + "learning_rate": 9.991663860276787e-05, + "loss": 0.022361649200320244, + "num_input_tokens_seen": 16687144, + "step": 1019, + "train_runtime": 8398.2142, + "train_tokens_per_second": 1986.987 + }, + { + "epoch": 0.28254847645429365, + "grad_norm": 0.13045012950897217, + "learning_rate": 9.991638471668437e-05, + "loss": 0.023022903129458427, + "num_input_tokens_seen": 16703520, + "step": 1020, + "train_runtime": 8406.4317, + "train_tokens_per_second": 1986.993 + }, + { + "epoch": 0.28282548476454294, + "grad_norm": 0.13695159554481506, + "learning_rate": 9.991613044489337e-05, + "loss": 0.024795370176434517, + "num_input_tokens_seen": 16719896, + "step": 1021, + "train_runtime": 8414.6539, + "train_tokens_per_second": 1986.997 + }, + { + "epoch": 0.28310249307479224, + "grad_norm": 0.1516469269990921, + "learning_rate": 9.991587578739684e-05, + "loss": 0.02070513553917408, + "num_input_tokens_seen": 16736272, + "step": 1022, + "train_runtime": 8422.8766, + "train_tokens_per_second": 1987.002 + }, + { + "epoch": 0.28337950138504153, + "grad_norm": 0.20662519335746765, + "learning_rate": 9.991562074419673e-05, + "loss": 0.022602546960115433, + "num_input_tokens_seen": 16752648, + "step": 1023, + "train_runtime": 8431.1055, + "train_tokens_per_second": 1987.005 + }, + { + "epoch": 0.2836565096952909, + "grad_norm": 0.16547153890132904, + "learning_rate": 9.991536531529504e-05, + "loss": 0.021395128220319748, + "num_input_tokens_seen": 16769024, + "step": 1024, + "train_runtime": 8439.3328, + "train_tokens_per_second": 1987.008 + }, + { + "epoch": 0.2839335180055402, + "grad_norm": 0.2080991268157959, + "learning_rate": 9.991510950069376e-05, + "loss": 0.02706253156065941, + "num_input_tokens_seen": 16785400, + "step": 1025, + "train_runtime": 8447.5596, + "train_tokens_per_second": 1987.012 + }, + { + "epoch": 0.28421052631578947, + "grad_norm": 0.27152082324028015, + "learning_rate": 9.991485330039483e-05, + "loss": 0.02203500084578991, + "num_input_tokens_seen": 16801776, + "step": 1026, + "train_runtime": 8455.795, + "train_tokens_per_second": 1987.013 + }, + { + "epoch": 0.28448753462603876, + "grad_norm": 0.23036445677280426, + "learning_rate": 9.991459671440021e-05, + "loss": 0.027452126145362854, + "num_input_tokens_seen": 16818152, + "step": 1027, + "train_runtime": 8464.0304, + "train_tokens_per_second": 1987.015 + }, + { + "epoch": 0.2847645429362881, + "grad_norm": 0.35606440901756287, + "learning_rate": 9.991433974271196e-05, + "loss": 0.03178657591342926, + "num_input_tokens_seen": 16834528, + "step": 1028, + "train_runtime": 8472.2677, + "train_tokens_per_second": 1987.016 + }, + { + "epoch": 0.2850415512465374, + "grad_norm": 0.19688495993614197, + "learning_rate": 9.9914082385332e-05, + "loss": 0.01918906718492508, + "num_input_tokens_seen": 16850904, + "step": 1029, + "train_runtime": 8480.5063, + "train_tokens_per_second": 1987.016 + }, + { + "epoch": 0.2853185595567867, + "grad_norm": 0.16338172554969788, + "learning_rate": 9.991382464226233e-05, + "loss": 0.025707922875881195, + "num_input_tokens_seen": 16867280, + "step": 1030, + "train_runtime": 8488.7349, + "train_tokens_per_second": 1987.019 + }, + { + "epoch": 0.285595567867036, + "grad_norm": 0.18441416323184967, + "learning_rate": 9.991356651350495e-05, + "loss": 0.025881433859467506, + "num_input_tokens_seen": 16883656, + "step": 1031, + "train_runtime": 8496.9532, + "train_tokens_per_second": 1987.025 + }, + { + "epoch": 0.28587257617728534, + "grad_norm": 0.13578660786151886, + "learning_rate": 9.991330799906187e-05, + "loss": 0.022447697818279266, + "num_input_tokens_seen": 16900032, + "step": 1032, + "train_runtime": 8505.1656, + "train_tokens_per_second": 1987.032 + }, + { + "epoch": 0.28614958448753464, + "grad_norm": 0.16593244671821594, + "learning_rate": 9.991304909893506e-05, + "loss": 0.02158893086016178, + "num_input_tokens_seen": 16916408, + "step": 1033, + "train_runtime": 8513.3745, + "train_tokens_per_second": 1987.039 + }, + { + "epoch": 0.28642659279778393, + "grad_norm": 0.15423519909381866, + "learning_rate": 9.991278981312652e-05, + "loss": 0.026262495666742325, + "num_input_tokens_seen": 16932784, + "step": 1034, + "train_runtime": 8521.5884, + "train_tokens_per_second": 1987.046 + }, + { + "epoch": 0.2867036011080332, + "grad_norm": 0.17730818688869476, + "learning_rate": 9.991253014163829e-05, + "loss": 0.02413349039852619, + "num_input_tokens_seen": 16949160, + "step": 1035, + "train_runtime": 8529.8093, + "train_tokens_per_second": 1987.05 + }, + { + "epoch": 0.2869806094182826, + "grad_norm": 0.1399245262145996, + "learning_rate": 9.991227008447234e-05, + "loss": 0.021290352568030357, + "num_input_tokens_seen": 16965536, + "step": 1036, + "train_runtime": 8538.0424, + "train_tokens_per_second": 1987.052 + }, + { + "epoch": 0.28725761772853187, + "grad_norm": 0.17199480533599854, + "learning_rate": 9.991200964163069e-05, + "loss": 0.0200081504881382, + "num_input_tokens_seen": 16981912, + "step": 1037, + "train_runtime": 8546.2668, + "train_tokens_per_second": 1987.056 + }, + { + "epoch": 0.28753462603878116, + "grad_norm": 0.18977180123329163, + "learning_rate": 9.991174881311535e-05, + "loss": 0.026680277660489082, + "num_input_tokens_seen": 16998288, + "step": 1038, + "train_runtime": 8554.4952, + "train_tokens_per_second": 1987.059 + }, + { + "epoch": 0.28781163434903045, + "grad_norm": 0.20439089834690094, + "learning_rate": 9.991148759892834e-05, + "loss": 0.0261450856924057, + "num_input_tokens_seen": 17014664, + "step": 1039, + "train_runtime": 8562.721, + "train_tokens_per_second": 1987.063 + }, + { + "epoch": 0.2880886426592798, + "grad_norm": 0.16455794870853424, + "learning_rate": 9.991122599907168e-05, + "loss": 0.026370249688625336, + "num_input_tokens_seen": 17031040, + "step": 1040, + "train_runtime": 8570.9537, + "train_tokens_per_second": 1987.065 + }, + { + "epoch": 0.2883656509695291, + "grad_norm": 0.1424552947282791, + "learning_rate": 9.99109640135474e-05, + "loss": 0.019686158746480942, + "num_input_tokens_seen": 17047416, + "step": 1041, + "train_runtime": 8579.1838, + "train_tokens_per_second": 1987.067 + }, + { + "epoch": 0.2886426592797784, + "grad_norm": 0.15079525113105774, + "learning_rate": 9.99107016423575e-05, + "loss": 0.02186334691941738, + "num_input_tokens_seen": 17063792, + "step": 1042, + "train_runtime": 8587.4174, + "train_tokens_per_second": 1987.069 + }, + { + "epoch": 0.2889196675900277, + "grad_norm": 0.1705741435289383, + "learning_rate": 9.991043888550403e-05, + "loss": 0.02603154629468918, + "num_input_tokens_seen": 17080168, + "step": 1043, + "train_runtime": 8595.6525, + "train_tokens_per_second": 1987.071 + }, + { + "epoch": 0.28919667590027703, + "grad_norm": 0.11989989131689072, + "learning_rate": 9.9910175742989e-05, + "loss": 0.02335817739367485, + "num_input_tokens_seen": 17096544, + "step": 1044, + "train_runtime": 8603.8859, + "train_tokens_per_second": 1987.072 + }, + { + "epoch": 0.2894736842105263, + "grad_norm": 0.20050930976867676, + "learning_rate": 9.990991221481448e-05, + "loss": 0.02752123400568962, + "num_input_tokens_seen": 17112920, + "step": 1045, + "train_runtime": 8612.1206, + "train_tokens_per_second": 1987.074 + }, + { + "epoch": 0.2897506925207756, + "grad_norm": 0.18039308488368988, + "learning_rate": 9.990964830098246e-05, + "loss": 0.02601800486445427, + "num_input_tokens_seen": 17129296, + "step": 1046, + "train_runtime": 8620.3536, + "train_tokens_per_second": 1987.076 + }, + { + "epoch": 0.2900277008310249, + "grad_norm": 0.12312311679124832, + "learning_rate": 9.9909384001495e-05, + "loss": 0.01984407752752304, + "num_input_tokens_seen": 17145672, + "step": 1047, + "train_runtime": 8628.5812, + "train_tokens_per_second": 1987.079 + }, + { + "epoch": 0.29030470914127426, + "grad_norm": 0.17546668648719788, + "learning_rate": 9.990911931635414e-05, + "loss": 0.025205805897712708, + "num_input_tokens_seen": 17162048, + "step": 1048, + "train_runtime": 8636.8087, + "train_tokens_per_second": 1987.082 + }, + { + "epoch": 0.29058171745152356, + "grad_norm": 0.15046393871307373, + "learning_rate": 9.990885424556194e-05, + "loss": 0.021678566932678223, + "num_input_tokens_seen": 17178424, + "step": 1049, + "train_runtime": 8645.0375, + "train_tokens_per_second": 1987.085 + }, + { + "epoch": 0.29085872576177285, + "grad_norm": 0.1199592724442482, + "learning_rate": 9.990858878912043e-05, + "loss": 0.025461087003350258, + "num_input_tokens_seen": 17194800, + "step": 1050, + "train_runtime": 8653.2616, + "train_tokens_per_second": 1987.089 + }, + { + "epoch": 0.29113573407202215, + "grad_norm": 0.13194091618061066, + "learning_rate": 9.990832294703166e-05, + "loss": 0.022615080699324608, + "num_input_tokens_seen": 17211176, + "step": 1051, + "train_runtime": 8661.4874, + "train_tokens_per_second": 1987.092 + }, + { + "epoch": 0.2914127423822715, + "grad_norm": 0.147831529378891, + "learning_rate": 9.990805671929771e-05, + "loss": 0.02251332812011242, + "num_input_tokens_seen": 17227552, + "step": 1052, + "train_runtime": 8669.7208, + "train_tokens_per_second": 1987.094 + }, + { + "epoch": 0.2916897506925208, + "grad_norm": 0.21045193076133728, + "learning_rate": 9.99077901059206e-05, + "loss": 0.026532119140028954, + "num_input_tokens_seen": 17243928, + "step": 1053, + "train_runtime": 8677.9523, + "train_tokens_per_second": 1987.096 + }, + { + "epoch": 0.2919667590027701, + "grad_norm": 0.1333169788122177, + "learning_rate": 9.990752310690242e-05, + "loss": 0.020132949575781822, + "num_input_tokens_seen": 17260304, + "step": 1054, + "train_runtime": 8686.1857, + "train_tokens_per_second": 1987.098 + }, + { + "epoch": 0.2922437673130194, + "grad_norm": 0.19311366975307465, + "learning_rate": 9.990725572224521e-05, + "loss": 0.0245622918009758, + "num_input_tokens_seen": 17276680, + "step": 1055, + "train_runtime": 8694.4129, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.2925207756232687, + "grad_norm": 0.12396460771560669, + "learning_rate": 9.990698795195106e-05, + "loss": 0.020789269357919693, + "num_input_tokens_seen": 17293056, + "step": 1056, + "train_runtime": 8702.6356, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.292797783933518, + "grad_norm": 0.1717703640460968, + "learning_rate": 9.990671979602202e-05, + "loss": 0.02411501109600067, + "num_input_tokens_seen": 17309432, + "step": 1057, + "train_runtime": 8710.8533, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.2930747922437673, + "grad_norm": 0.13519111275672913, + "learning_rate": 9.990645125446017e-05, + "loss": 0.021178755909204483, + "num_input_tokens_seen": 17325808, + "step": 1058, + "train_runtime": 8719.065, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.2933518005540166, + "grad_norm": 0.1752476841211319, + "learning_rate": 9.990618232726758e-05, + "loss": 0.02032526023685932, + "num_input_tokens_seen": 17342184, + "step": 1059, + "train_runtime": 8727.2761, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.29362880886426596, + "grad_norm": 0.09116439521312714, + "learning_rate": 9.990591301444634e-05, + "loss": 0.01831035129725933, + "num_input_tokens_seen": 17358560, + "step": 1060, + "train_runtime": 8735.4911, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.29390581717451525, + "grad_norm": 0.19274769723415375, + "learning_rate": 9.990564331599851e-05, + "loss": 0.021072441712021828, + "num_input_tokens_seen": 17374936, + "step": 1061, + "train_runtime": 8743.7011, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.29418282548476454, + "grad_norm": 0.1635330766439438, + "learning_rate": 9.990537323192619e-05, + "loss": 0.023766228929162025, + "num_input_tokens_seen": 17391312, + "step": 1062, + "train_runtime": 8751.9101, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.29445983379501384, + "grad_norm": 0.22038903832435608, + "learning_rate": 9.990510276223149e-05, + "loss": 0.026553941890597343, + "num_input_tokens_seen": 17407688, + "step": 1063, + "train_runtime": 8760.1228, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.29473684210526313, + "grad_norm": 0.1412297487258911, + "learning_rate": 9.990483190691644e-05, + "loss": 0.02258201688528061, + "num_input_tokens_seen": 17424064, + "step": 1064, + "train_runtime": 8768.3522, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.2950138504155125, + "grad_norm": 0.1700057089328766, + "learning_rate": 9.990456066598319e-05, + "loss": 0.023691147565841675, + "num_input_tokens_seen": 17440440, + "step": 1065, + "train_runtime": 8776.5814, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.2952908587257618, + "grad_norm": 0.16454514861106873, + "learning_rate": 9.99042890394338e-05, + "loss": 0.021028762683272362, + "num_input_tokens_seen": 17456816, + "step": 1066, + "train_runtime": 8784.8086, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.29556786703601107, + "grad_norm": 0.142667755484581, + "learning_rate": 9.990401702727038e-05, + "loss": 0.02676055021584034, + "num_input_tokens_seen": 17473192, + "step": 1067, + "train_runtime": 8793.0316, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.29584487534626036, + "grad_norm": 0.16013546288013458, + "learning_rate": 9.990374462949504e-05, + "loss": 0.023207172751426697, + "num_input_tokens_seen": 17489568, + "step": 1068, + "train_runtime": 8801.2581, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.2961218836565097, + "grad_norm": 0.12019862979650497, + "learning_rate": 9.990347184610988e-05, + "loss": 0.020351571962237358, + "num_input_tokens_seen": 17505944, + "step": 1069, + "train_runtime": 8809.4956, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.296398891966759, + "grad_norm": 0.18166182935237885, + "learning_rate": 9.990319867711699e-05, + "loss": 0.022017601877450943, + "num_input_tokens_seen": 17522320, + "step": 1070, + "train_runtime": 8817.7333, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.2966759002770083, + "grad_norm": 0.12800493836402893, + "learning_rate": 9.990292512251852e-05, + "loss": 0.023987896740436554, + "num_input_tokens_seen": 17538696, + "step": 1071, + "train_runtime": 8825.9586, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 0.2969529085872576, + "grad_norm": 0.11118417233228683, + "learning_rate": 9.990265118231654e-05, + "loss": 0.02038075588643551, + "num_input_tokens_seen": 17555072, + "step": 1072, + "train_runtime": 8834.1887, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.29722991689750694, + "grad_norm": 0.19580429792404175, + "learning_rate": 9.99023768565132e-05, + "loss": 0.029241403564810753, + "num_input_tokens_seen": 17571448, + "step": 1073, + "train_runtime": 8842.4143, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.29750692520775623, + "grad_norm": 0.1608182191848755, + "learning_rate": 9.990210214511061e-05, + "loss": 0.022186020389199257, + "num_input_tokens_seen": 17587824, + "step": 1074, + "train_runtime": 8850.6391, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 0.29778393351800553, + "grad_norm": 0.1806584894657135, + "learning_rate": 9.990182704811088e-05, + "loss": 0.02500210329890251, + "num_input_tokens_seen": 17604200, + "step": 1075, + "train_runtime": 8858.8644, + "train_tokens_per_second": 1987.185 + }, + { + "epoch": 0.2980609418282548, + "grad_norm": 0.14770089089870453, + "learning_rate": 9.990155156551615e-05, + "loss": 0.021596239879727364, + "num_input_tokens_seen": 17620576, + "step": 1076, + "train_runtime": 8867.0905, + "train_tokens_per_second": 1987.188 + }, + { + "epoch": 0.29833795013850417, + "grad_norm": 0.11291127651929855, + "learning_rate": 9.990127569732855e-05, + "loss": 0.021486809477210045, + "num_input_tokens_seen": 17636952, + "step": 1077, + "train_runtime": 8875.314, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 0.29861495844875346, + "grad_norm": 0.12110594660043716, + "learning_rate": 9.990099944355021e-05, + "loss": 0.020651541650295258, + "num_input_tokens_seen": 17653328, + "step": 1078, + "train_runtime": 8883.5532, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 0.29889196675900276, + "grad_norm": 0.15734843909740448, + "learning_rate": 9.990072280418324e-05, + "loss": 0.02543764002621174, + "num_input_tokens_seen": 17669704, + "step": 1079, + "train_runtime": 8891.7754, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.29916897506925205, + "grad_norm": 0.16381651163101196, + "learning_rate": 9.990044577922982e-05, + "loss": 0.02051658183336258, + "num_input_tokens_seen": 17686080, + "step": 1080, + "train_runtime": 8900.0016, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.2994459833795014, + "grad_norm": 0.1730310469865799, + "learning_rate": 9.990016836869205e-05, + "loss": 0.022184252738952637, + "num_input_tokens_seen": 17702456, + "step": 1081, + "train_runtime": 8908.2257, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.2997229916897507, + "grad_norm": 0.13670647144317627, + "learning_rate": 9.98998905725721e-05, + "loss": 0.023742105811834335, + "num_input_tokens_seen": 17718832, + "step": 1082, + "train_runtime": 8916.4559, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.3, + "grad_norm": 0.18264605104923248, + "learning_rate": 9.989961239087212e-05, + "loss": 0.02430185116827488, + "num_input_tokens_seen": 17735208, + "step": 1083, + "train_runtime": 8924.672, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 0.3002770083102493, + "grad_norm": 0.11861112713813782, + "learning_rate": 9.989933382359422e-05, + "loss": 0.019650546833872795, + "num_input_tokens_seen": 17751584, + "step": 1084, + "train_runtime": 8932.8931, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 0.30055401662049863, + "grad_norm": 0.15652482211589813, + "learning_rate": 9.98990548707406e-05, + "loss": 0.02283664047718048, + "num_input_tokens_seen": 17767960, + "step": 1085, + "train_runtime": 8941.118, + "train_tokens_per_second": 1987.219 + }, + { + "epoch": 0.3008310249307479, + "grad_norm": 0.14718542993068695, + "learning_rate": 9.989877553231339e-05, + "loss": 0.023804739117622375, + "num_input_tokens_seen": 17784336, + "step": 1086, + "train_runtime": 8949.3534, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 0.3011080332409972, + "grad_norm": 0.1648627370595932, + "learning_rate": 9.989849580831476e-05, + "loss": 0.027192365378141403, + "num_input_tokens_seen": 17800712, + "step": 1087, + "train_runtime": 8957.5807, + "train_tokens_per_second": 1987.223 + }, + { + "epoch": 0.3013850415512465, + "grad_norm": 0.133889839053154, + "learning_rate": 9.989821569874687e-05, + "loss": 0.019359340891242027, + "num_input_tokens_seen": 17817088, + "step": 1088, + "train_runtime": 8965.8161, + "train_tokens_per_second": 1987.224 + }, + { + "epoch": 0.30166204986149586, + "grad_norm": 0.11287525296211243, + "learning_rate": 9.989793520361188e-05, + "loss": 0.021997574716806412, + "num_input_tokens_seen": 17833464, + "step": 1089, + "train_runtime": 8974.0521, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 0.30193905817174516, + "grad_norm": 0.13275574147701263, + "learning_rate": 9.989765432291196e-05, + "loss": 0.022373106330633163, + "num_input_tokens_seen": 17849840, + "step": 1090, + "train_runtime": 8982.2752, + "train_tokens_per_second": 1987.229 + }, + { + "epoch": 0.30221606648199445, + "grad_norm": 0.13583077490329742, + "learning_rate": 9.989737305664928e-05, + "loss": 0.019795387983322144, + "num_input_tokens_seen": 17866216, + "step": 1091, + "train_runtime": 8990.5042, + "train_tokens_per_second": 1987.232 + }, + { + "epoch": 0.30249307479224374, + "grad_norm": 0.23681005835533142, + "learning_rate": 9.989709140482599e-05, + "loss": 0.025877540931105614, + "num_input_tokens_seen": 17882592, + "step": 1092, + "train_runtime": 8998.7263, + "train_tokens_per_second": 1987.236 + }, + { + "epoch": 0.3027700831024931, + "grad_norm": 0.1724916398525238, + "learning_rate": 9.98968093674443e-05, + "loss": 0.02273283712565899, + "num_input_tokens_seen": 17898968, + "step": 1093, + "train_runtime": 9006.9556, + "train_tokens_per_second": 1987.238 + }, + { + "epoch": 0.3030470914127424, + "grad_norm": 0.10652446746826172, + "learning_rate": 9.989652694450639e-05, + "loss": 0.020892802625894547, + "num_input_tokens_seen": 17915344, + "step": 1094, + "train_runtime": 9015.1887, + "train_tokens_per_second": 1987.24 + }, + { + "epoch": 0.3033240997229917, + "grad_norm": 0.1195911094546318, + "learning_rate": 9.989624413601442e-05, + "loss": 0.02202163264155388, + "num_input_tokens_seen": 17931720, + "step": 1095, + "train_runtime": 9023.4166, + "train_tokens_per_second": 1987.243 + }, + { + "epoch": 0.303601108033241, + "grad_norm": 0.14279282093048096, + "learning_rate": 9.989596094197057e-05, + "loss": 0.018506715074181557, + "num_input_tokens_seen": 17948096, + "step": 1096, + "train_runtime": 9031.6305, + "train_tokens_per_second": 1987.249 + }, + { + "epoch": 0.3038781163434903, + "grad_norm": 0.1839582771062851, + "learning_rate": 9.989567736237708e-05, + "loss": 0.02364673838019371, + "num_input_tokens_seen": 17964472, + "step": 1097, + "train_runtime": 9039.8528, + "train_tokens_per_second": 1987.253 + }, + { + "epoch": 0.3041551246537396, + "grad_norm": 0.1697724610567093, + "learning_rate": 9.989539339723608e-05, + "loss": 0.023188604041934013, + "num_input_tokens_seen": 17980848, + "step": 1098, + "train_runtime": 9048.0835, + "train_tokens_per_second": 1987.255 + }, + { + "epoch": 0.3044321329639889, + "grad_norm": 0.11612016707658768, + "learning_rate": 9.989510904654979e-05, + "loss": 0.02147260308265686, + "num_input_tokens_seen": 17997224, + "step": 1099, + "train_runtime": 9056.3079, + "train_tokens_per_second": 1987.258 + }, + { + "epoch": 0.3047091412742382, + "grad_norm": 0.1703636348247528, + "learning_rate": 9.98948243103204e-05, + "loss": 0.021814562380313873, + "num_input_tokens_seen": 18013600, + "step": 1100, + "train_runtime": 9064.5304, + "train_tokens_per_second": 1987.262 + }, + { + "epoch": 0.30498614958448755, + "grad_norm": 0.11202675104141235, + "learning_rate": 9.989453918855013e-05, + "loss": 0.016182493418455124, + "num_input_tokens_seen": 18029976, + "step": 1101, + "train_runtime": 9074.3107, + "train_tokens_per_second": 1986.925 + }, + { + "epoch": 0.30526315789473685, + "grad_norm": 0.17380331456661224, + "learning_rate": 9.989425368124116e-05, + "loss": 0.022774621844291687, + "num_input_tokens_seen": 18046352, + "step": 1102, + "train_runtime": 9082.531, + "train_tokens_per_second": 1986.93 + }, + { + "epoch": 0.30554016620498614, + "grad_norm": 0.1770675629377365, + "learning_rate": 9.989396778839571e-05, + "loss": 0.024876698851585388, + "num_input_tokens_seen": 18062728, + "step": 1103, + "train_runtime": 9090.7525, + "train_tokens_per_second": 1986.934 + }, + { + "epoch": 0.30581717451523543, + "grad_norm": 0.19499050080776215, + "learning_rate": 9.9893681510016e-05, + "loss": 0.02913479134440422, + "num_input_tokens_seen": 18079104, + "step": 1104, + "train_runtime": 9098.9714, + "train_tokens_per_second": 1986.939 + }, + { + "epoch": 0.3060941828254848, + "grad_norm": 0.14641447365283966, + "learning_rate": 9.98933948461042e-05, + "loss": 0.020707324147224426, + "num_input_tokens_seen": 18095480, + "step": 1105, + "train_runtime": 9107.1909, + "train_tokens_per_second": 1986.944 + }, + { + "epoch": 0.3063711911357341, + "grad_norm": 0.1524886041879654, + "learning_rate": 9.989310779666256e-05, + "loss": 0.017926618456840515, + "num_input_tokens_seen": 18111856, + "step": 1106, + "train_runtime": 9115.4148, + "train_tokens_per_second": 1986.948 + }, + { + "epoch": 0.30664819944598337, + "grad_norm": 0.15060557425022125, + "learning_rate": 9.98928203616933e-05, + "loss": 0.02198066934943199, + "num_input_tokens_seen": 18128232, + "step": 1107, + "train_runtime": 9123.6543, + "train_tokens_per_second": 1986.949 + }, + { + "epoch": 0.30692520775623267, + "grad_norm": 0.14572109282016754, + "learning_rate": 9.989253254119861e-05, + "loss": 0.02062201127409935, + "num_input_tokens_seen": 18144608, + "step": 1108, + "train_runtime": 9131.8775, + "train_tokens_per_second": 1986.953 + }, + { + "epoch": 0.307202216066482, + "grad_norm": 0.18739007413387299, + "learning_rate": 9.989224433518076e-05, + "loss": 0.028971830382943153, + "num_input_tokens_seen": 18160984, + "step": 1109, + "train_runtime": 9140.0975, + "train_tokens_per_second": 1986.957 + }, + { + "epoch": 0.3074792243767313, + "grad_norm": 0.15140724182128906, + "learning_rate": 9.989195574364194e-05, + "loss": 0.02243928797543049, + "num_input_tokens_seen": 18177360, + "step": 1110, + "train_runtime": 9148.311, + "train_tokens_per_second": 1986.963 + }, + { + "epoch": 0.3077562326869806, + "grad_norm": 0.16076162457466125, + "learning_rate": 9.989166676658439e-05, + "loss": 0.02226898819208145, + "num_input_tokens_seen": 18193736, + "step": 1111, + "train_runtime": 9156.526, + "train_tokens_per_second": 1986.969 + }, + { + "epoch": 0.3080332409972299, + "grad_norm": 0.18055447936058044, + "learning_rate": 9.989137740401036e-05, + "loss": 0.029387736693024635, + "num_input_tokens_seen": 18210112, + "step": 1112, + "train_runtime": 9164.7394, + "train_tokens_per_second": 1986.975 + }, + { + "epoch": 0.30831024930747924, + "grad_norm": 0.18211011588573456, + "learning_rate": 9.989108765592205e-05, + "loss": 0.02147553116083145, + "num_input_tokens_seen": 18226488, + "step": 1113, + "train_runtime": 9172.9625, + "train_tokens_per_second": 1986.979 + }, + { + "epoch": 0.30858725761772854, + "grad_norm": 0.1642215996980667, + "learning_rate": 9.989079752232174e-05, + "loss": 0.024018818512558937, + "num_input_tokens_seen": 18242864, + "step": 1114, + "train_runtime": 9181.1795, + "train_tokens_per_second": 1986.985 + }, + { + "epoch": 0.30886426592797783, + "grad_norm": 0.1302528977394104, + "learning_rate": 9.989050700321162e-05, + "loss": 0.019765539094805717, + "num_input_tokens_seen": 18259240, + "step": 1115, + "train_runtime": 9189.4032, + "train_tokens_per_second": 1986.989 + }, + { + "epoch": 0.3091412742382271, + "grad_norm": 0.14968036115169525, + "learning_rate": 9.9890216098594e-05, + "loss": 0.019820597022771835, + "num_input_tokens_seen": 18275616, + "step": 1116, + "train_runtime": 9197.6534, + "train_tokens_per_second": 1986.987 + }, + { + "epoch": 0.3094182825484765, + "grad_norm": 0.2168160229921341, + "learning_rate": 9.988992480847107e-05, + "loss": 0.032724469900131226, + "num_input_tokens_seen": 18291992, + "step": 1117, + "train_runtime": 9205.8819, + "train_tokens_per_second": 1986.99 + }, + { + "epoch": 0.30969529085872577, + "grad_norm": 0.08268298953771591, + "learning_rate": 9.988963313284512e-05, + "loss": 0.019032061100006104, + "num_input_tokens_seen": 18308368, + "step": 1118, + "train_runtime": 9214.1028, + "train_tokens_per_second": 1986.994 + }, + { + "epoch": 0.30997229916897506, + "grad_norm": 0.17464640736579895, + "learning_rate": 9.988934107171837e-05, + "loss": 0.028270401060581207, + "num_input_tokens_seen": 18324744, + "step": 1119, + "train_runtime": 9222.3316, + "train_tokens_per_second": 1986.997 + }, + { + "epoch": 0.31024930747922436, + "grad_norm": 0.12970103323459625, + "learning_rate": 9.988904862509312e-05, + "loss": 0.019185449928045273, + "num_input_tokens_seen": 18341120, + "step": 1120, + "train_runtime": 9230.5693, + "train_tokens_per_second": 1986.998 + }, + { + "epoch": 0.3105263157894737, + "grad_norm": 0.22625160217285156, + "learning_rate": 9.988875579297159e-05, + "loss": 0.024360215291380882, + "num_input_tokens_seen": 18357496, + "step": 1121, + "train_runtime": 9238.8056, + "train_tokens_per_second": 1986.999 + }, + { + "epoch": 0.310803324099723, + "grad_norm": 0.21867075562477112, + "learning_rate": 9.988846257535607e-05, + "loss": 0.02719898335635662, + "num_input_tokens_seen": 18373872, + "step": 1122, + "train_runtime": 9247.0285, + "train_tokens_per_second": 1987.003 + }, + { + "epoch": 0.3110803324099723, + "grad_norm": 0.1253052055835724, + "learning_rate": 9.98881689722488e-05, + "loss": 0.021345512941479683, + "num_input_tokens_seen": 18390248, + "step": 1123, + "train_runtime": 9255.2556, + "train_tokens_per_second": 1987.006 + }, + { + "epoch": 0.3113573407202216, + "grad_norm": 0.15018494427204132, + "learning_rate": 9.988787498365206e-05, + "loss": 0.022243589162826538, + "num_input_tokens_seen": 18406624, + "step": 1124, + "train_runtime": 9263.4759, + "train_tokens_per_second": 1987.011 + }, + { + "epoch": 0.31163434903047094, + "grad_norm": 0.17563731968402863, + "learning_rate": 9.988758060956814e-05, + "loss": 0.025524191558361053, + "num_input_tokens_seen": 18423000, + "step": 1125, + "train_runtime": 9271.6883, + "train_tokens_per_second": 1987.017 + }, + { + "epoch": 0.31191135734072023, + "grad_norm": 0.1602250337600708, + "learning_rate": 9.988728584999928e-05, + "loss": 0.020811930298805237, + "num_input_tokens_seen": 18439376, + "step": 1126, + "train_runtime": 9279.9168, + "train_tokens_per_second": 1987.02 + }, + { + "epoch": 0.3121883656509695, + "grad_norm": 0.16488951444625854, + "learning_rate": 9.98869907049478e-05, + "loss": 0.02177748829126358, + "num_input_tokens_seen": 18455752, + "step": 1127, + "train_runtime": 9288.162, + "train_tokens_per_second": 1987.019 + }, + { + "epoch": 0.3124653739612188, + "grad_norm": 0.13535644114017487, + "learning_rate": 9.988669517441594e-05, + "loss": 0.022569485008716583, + "num_input_tokens_seen": 18472128, + "step": 1128, + "train_runtime": 9296.3978, + "train_tokens_per_second": 1987.02 + }, + { + "epoch": 0.31274238227146817, + "grad_norm": 0.1875724494457245, + "learning_rate": 9.988639925840601e-05, + "loss": 0.025437580421566963, + "num_input_tokens_seen": 18488504, + "step": 1129, + "train_runtime": 9304.6414, + "train_tokens_per_second": 1987.02 + }, + { + "epoch": 0.31301939058171746, + "grad_norm": 0.12042931467294693, + "learning_rate": 9.988610295692029e-05, + "loss": 0.020594051107764244, + "num_input_tokens_seen": 18504880, + "step": 1130, + "train_runtime": 9312.8691, + "train_tokens_per_second": 1987.022 + }, + { + "epoch": 0.31329639889196675, + "grad_norm": 0.15163937211036682, + "learning_rate": 9.988580626996106e-05, + "loss": 0.027722693979740143, + "num_input_tokens_seen": 18521256, + "step": 1131, + "train_runtime": 9321.084, + "train_tokens_per_second": 1987.028 + }, + { + "epoch": 0.31357340720221605, + "grad_norm": 0.11313503980636597, + "learning_rate": 9.988550919753061e-05, + "loss": 0.018188757821917534, + "num_input_tokens_seen": 18537632, + "step": 1132, + "train_runtime": 9329.305, + "train_tokens_per_second": 1987.032 + }, + { + "epoch": 0.3138504155124654, + "grad_norm": 0.13765449821949005, + "learning_rate": 9.988521173963125e-05, + "loss": 0.022762205451726913, + "num_input_tokens_seen": 18554008, + "step": 1133, + "train_runtime": 9337.5197, + "train_tokens_per_second": 1987.038 + }, + { + "epoch": 0.3141274238227147, + "grad_norm": 0.13420304656028748, + "learning_rate": 9.988491389626529e-05, + "loss": 0.023924212902784348, + "num_input_tokens_seen": 18570384, + "step": 1134, + "train_runtime": 9345.7329, + "train_tokens_per_second": 1987.044 + }, + { + "epoch": 0.314404432132964, + "grad_norm": 0.17004354298114777, + "learning_rate": 9.9884615667435e-05, + "loss": 0.02722472883760929, + "num_input_tokens_seen": 18586760, + "step": 1135, + "train_runtime": 9353.9441, + "train_tokens_per_second": 1987.051 + }, + { + "epoch": 0.3146814404432133, + "grad_norm": 0.1585075557231903, + "learning_rate": 9.98843170531427e-05, + "loss": 0.021152744069695473, + "num_input_tokens_seen": 18603136, + "step": 1136, + "train_runtime": 9362.1562, + "train_tokens_per_second": 1987.057 + }, + { + "epoch": 0.3149584487534626, + "grad_norm": 0.14242318272590637, + "learning_rate": 9.98840180533907e-05, + "loss": 0.02244136668741703, + "num_input_tokens_seen": 18619512, + "step": 1137, + "train_runtime": 9370.3766, + "train_tokens_per_second": 1987.061 + }, + { + "epoch": 0.3152354570637119, + "grad_norm": 0.13788728415966034, + "learning_rate": 9.98837186681813e-05, + "loss": 0.02245003916323185, + "num_input_tokens_seen": 18635888, + "step": 1138, + "train_runtime": 9378.6169, + "train_tokens_per_second": 1987.061 + }, + { + "epoch": 0.3155124653739612, + "grad_norm": 0.140054851770401, + "learning_rate": 9.988341889751682e-05, + "loss": 0.025028046220541, + "num_input_tokens_seen": 18652264, + "step": 1139, + "train_runtime": 9386.8391, + "train_tokens_per_second": 1987.065 + }, + { + "epoch": 0.3157894736842105, + "grad_norm": 0.12337026000022888, + "learning_rate": 9.98831187413996e-05, + "loss": 0.02155180275440216, + "num_input_tokens_seen": 18668640, + "step": 1140, + "train_runtime": 9395.0638, + "train_tokens_per_second": 1987.069 + }, + { + "epoch": 0.31606648199445986, + "grad_norm": 0.12982340157032013, + "learning_rate": 9.98828181998319e-05, + "loss": 0.024434169754385948, + "num_input_tokens_seen": 18685016, + "step": 1141, + "train_runtime": 9403.2939, + "train_tokens_per_second": 1987.071 + }, + { + "epoch": 0.31634349030470915, + "grad_norm": 0.1519041508436203, + "learning_rate": 9.98825172728161e-05, + "loss": 0.017361635342240334, + "num_input_tokens_seen": 18701392, + "step": 1142, + "train_runtime": 9411.5224, + "train_tokens_per_second": 1987.074 + }, + { + "epoch": 0.31662049861495845, + "grad_norm": 0.1533532440662384, + "learning_rate": 9.98822159603545e-05, + "loss": 0.02283002994954586, + "num_input_tokens_seen": 18717768, + "step": 1143, + "train_runtime": 9419.7535, + "train_tokens_per_second": 1987.076 + }, + { + "epoch": 0.31689750692520774, + "grad_norm": 0.15693175792694092, + "learning_rate": 9.988191426244944e-05, + "loss": 0.0231732577085495, + "num_input_tokens_seen": 18734144, + "step": 1144, + "train_runtime": 9427.9798, + "train_tokens_per_second": 1987.079 + }, + { + "epoch": 0.3171745152354571, + "grad_norm": 0.11297991126775742, + "learning_rate": 9.988161217910323e-05, + "loss": 0.021827900782227516, + "num_input_tokens_seen": 18750520, + "step": 1145, + "train_runtime": 9436.2193, + "train_tokens_per_second": 1987.08 + }, + { + "epoch": 0.3174515235457064, + "grad_norm": 0.08112559467554092, + "learning_rate": 9.988130971031821e-05, + "loss": 0.01616458036005497, + "num_input_tokens_seen": 18766896, + "step": 1146, + "train_runtime": 9444.466, + "train_tokens_per_second": 1987.079 + }, + { + "epoch": 0.3177285318559557, + "grad_norm": 0.12072674185037613, + "learning_rate": 9.988100685609673e-05, + "loss": 0.02180613949894905, + "num_input_tokens_seen": 18783272, + "step": 1147, + "train_runtime": 9452.6869, + "train_tokens_per_second": 1987.083 + }, + { + "epoch": 0.31800554016620497, + "grad_norm": 0.16715264320373535, + "learning_rate": 9.988070361644115e-05, + "loss": 0.018426120281219482, + "num_input_tokens_seen": 18799648, + "step": 1148, + "train_runtime": 9460.8969, + "train_tokens_per_second": 1987.089 + }, + { + "epoch": 0.3182825484764543, + "grad_norm": 0.10180127620697021, + "learning_rate": 9.988039999135377e-05, + "loss": 0.018123624846339226, + "num_input_tokens_seen": 18816024, + "step": 1149, + "train_runtime": 9469.1198, + "train_tokens_per_second": 1987.093 + }, + { + "epoch": 0.3185595567867036, + "grad_norm": 0.14395740628242493, + "learning_rate": 9.988009598083694e-05, + "loss": 0.02075469121336937, + "num_input_tokens_seen": 18832400, + "step": 1150, + "train_runtime": 9477.3324, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.3188365650969529, + "grad_norm": 0.18152306973934174, + "learning_rate": 9.987979158489305e-05, + "loss": 0.02634117752313614, + "num_input_tokens_seen": 18848776, + "step": 1151, + "train_runtime": 9485.5519, + "train_tokens_per_second": 1987.104 + }, + { + "epoch": 0.3191135734072022, + "grad_norm": 0.1716216802597046, + "learning_rate": 9.987948680352441e-05, + "loss": 0.018750257790088654, + "num_input_tokens_seen": 18865152, + "step": 1152, + "train_runtime": 9493.7706, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.31939058171745155, + "grad_norm": 0.13083088397979736, + "learning_rate": 9.987918163673337e-05, + "loss": 0.021891653537750244, + "num_input_tokens_seen": 18881528, + "step": 1153, + "train_runtime": 9502.0004, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.31966759002770084, + "grad_norm": 0.13981521129608154, + "learning_rate": 9.987887608452235e-05, + "loss": 0.02040412463247776, + "num_input_tokens_seen": 18897904, + "step": 1154, + "train_runtime": 9510.2258, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.31994459833795014, + "grad_norm": 0.1252577006816864, + "learning_rate": 9.987857014689364e-05, + "loss": 0.020782243460416794, + "num_input_tokens_seen": 18914280, + "step": 1155, + "train_runtime": 9518.4654, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.32022160664819943, + "grad_norm": 0.11425184458494186, + "learning_rate": 9.987826382384964e-05, + "loss": 0.021059628576040268, + "num_input_tokens_seen": 18930656, + "step": 1156, + "train_runtime": 9526.6996, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.3204986149584488, + "grad_norm": 0.17044851183891296, + "learning_rate": 9.987795711539271e-05, + "loss": 0.022839728742837906, + "num_input_tokens_seen": 18947032, + "step": 1157, + "train_runtime": 9534.9182, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.3207756232686981, + "grad_norm": 0.15386968851089478, + "learning_rate": 9.987765002152523e-05, + "loss": 0.018687285482883453, + "num_input_tokens_seen": 18963408, + "step": 1158, + "train_runtime": 9543.1374, + "train_tokens_per_second": 1987.125 + }, + { + "epoch": 0.32105263157894737, + "grad_norm": 0.15178245306015015, + "learning_rate": 9.987734254224955e-05, + "loss": 0.024312900379300117, + "num_input_tokens_seen": 18979784, + "step": 1159, + "train_runtime": 9551.3659, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.32132963988919666, + "grad_norm": 0.13665583729743958, + "learning_rate": 9.987703467756807e-05, + "loss": 0.02335061877965927, + "num_input_tokens_seen": 18996160, + "step": 1160, + "train_runtime": 9559.5972, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.321606648199446, + "grad_norm": 0.14738787710666656, + "learning_rate": 9.987672642748315e-05, + "loss": 0.022621501237154007, + "num_input_tokens_seen": 19012536, + "step": 1161, + "train_runtime": 9567.8282, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.3218836565096953, + "grad_norm": 0.15588700771331787, + "learning_rate": 9.98764177919972e-05, + "loss": 0.023037582635879517, + "num_input_tokens_seen": 19028912, + "step": 1162, + "train_runtime": 9576.0592, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.3221606648199446, + "grad_norm": 0.16259731352329254, + "learning_rate": 9.987610877111255e-05, + "loss": 0.02274232730269432, + "num_input_tokens_seen": 19045288, + "step": 1163, + "train_runtime": 9584.2861, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.3224376731301939, + "grad_norm": 0.17433299124240875, + "learning_rate": 9.987579936483164e-05, + "loss": 0.024680519476532936, + "num_input_tokens_seen": 19061664, + "step": 1164, + "train_runtime": 9592.5256, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.32271468144044324, + "grad_norm": 0.10634005069732666, + "learning_rate": 9.987548957315685e-05, + "loss": 0.020931104198098183, + "num_input_tokens_seen": 19078040, + "step": 1165, + "train_runtime": 9600.7523, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.32299168975069253, + "grad_norm": 0.14059647917747498, + "learning_rate": 9.987517939609055e-05, + "loss": 0.023468144237995148, + "num_input_tokens_seen": 19094416, + "step": 1166, + "train_runtime": 9608.9717, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.32326869806094183, + "grad_norm": 0.16447575390338898, + "learning_rate": 9.987486883363518e-05, + "loss": 0.022531362250447273, + "num_input_tokens_seen": 19110792, + "step": 1167, + "train_runtime": 9617.2055, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.3235457063711911, + "grad_norm": 0.17750468850135803, + "learning_rate": 9.987455788579309e-05, + "loss": 0.021129807457327843, + "num_input_tokens_seen": 19127168, + "step": 1168, + "train_runtime": 9625.4311, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 0.3238227146814404, + "grad_norm": 0.17793233692646027, + "learning_rate": 9.987424655256672e-05, + "loss": 0.0199295561760664, + "num_input_tokens_seen": 19143544, + "step": 1169, + "train_runtime": 9633.6692, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.32409972299168976, + "grad_norm": 0.1058010533452034, + "learning_rate": 9.987393483395845e-05, + "loss": 0.016823703423142433, + "num_input_tokens_seen": 19159920, + "step": 1170, + "train_runtime": 9641.8993, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.32437673130193906, + "grad_norm": 0.1683518886566162, + "learning_rate": 9.987362272997071e-05, + "loss": 0.01837576925754547, + "num_input_tokens_seen": 19176296, + "step": 1171, + "train_runtime": 9650.1378, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.32465373961218835, + "grad_norm": 0.13919749855995178, + "learning_rate": 9.98733102406059e-05, + "loss": 0.021472271531820297, + "num_input_tokens_seen": 19192672, + "step": 1172, + "train_runtime": 9658.3705, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.32493074792243765, + "grad_norm": 0.11987041682004929, + "learning_rate": 9.987299736586644e-05, + "loss": 0.02060738578438759, + "num_input_tokens_seen": 19209048, + "step": 1173, + "train_runtime": 9666.6035, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.325207756232687, + "grad_norm": 0.14700515568256378, + "learning_rate": 9.987268410575473e-05, + "loss": 0.023163173347711563, + "num_input_tokens_seen": 19225424, + "step": 1174, + "train_runtime": 9674.8353, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.3254847645429363, + "grad_norm": 0.18798844516277313, + "learning_rate": 9.987237046027323e-05, + "loss": 0.023282881826162338, + "num_input_tokens_seen": 19241800, + "step": 1175, + "train_runtime": 9683.0636, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.3257617728531856, + "grad_norm": 0.15000300109386444, + "learning_rate": 9.987205642942432e-05, + "loss": 0.022339006885886192, + "num_input_tokens_seen": 19258176, + "step": 1176, + "train_runtime": 9691.2918, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.3260387811634349, + "grad_norm": 0.21294619143009186, + "learning_rate": 9.987174201321044e-05, + "loss": 0.029464619234204292, + "num_input_tokens_seen": 19274552, + "step": 1177, + "train_runtime": 9699.5103, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.3263157894736842, + "grad_norm": 0.13552474975585938, + "learning_rate": 9.987142721163404e-05, + "loss": 0.01852753385901451, + "num_input_tokens_seen": 19290928, + "step": 1178, + "train_runtime": 9707.7253, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.3265927977839335, + "grad_norm": 0.15264149010181427, + "learning_rate": 9.987111202469754e-05, + "loss": 0.02290058508515358, + "num_input_tokens_seen": 19307304, + "step": 1179, + "train_runtime": 9715.9352, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.3268698060941828, + "grad_norm": 0.18758700788021088, + "learning_rate": 9.987079645240335e-05, + "loss": 0.023463992401957512, + "num_input_tokens_seen": 19323680, + "step": 1180, + "train_runtime": 9724.1417, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.3271468144044321, + "grad_norm": 0.10090292990207672, + "learning_rate": 9.987048049475396e-05, + "loss": 0.01837424747645855, + "num_input_tokens_seen": 19340056, + "step": 1181, + "train_runtime": 9732.362, + "train_tokens_per_second": 1987.19 + }, + { + "epoch": 0.32742382271468146, + "grad_norm": 0.12924882769584656, + "learning_rate": 9.987016415175177e-05, + "loss": 0.02118430659174919, + "num_input_tokens_seen": 19356432, + "step": 1182, + "train_runtime": 9740.6002, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.32770083102493075, + "grad_norm": 0.17089416086673737, + "learning_rate": 9.986984742339924e-05, + "loss": 0.024672189727425575, + "num_input_tokens_seen": 19372808, + "step": 1183, + "train_runtime": 9748.8313, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.32797783933518004, + "grad_norm": 0.12197092175483704, + "learning_rate": 9.98695303096988e-05, + "loss": 0.02132883481681347, + "num_input_tokens_seen": 19389184, + "step": 1184, + "train_runtime": 9757.062, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.32825484764542934, + "grad_norm": 0.14439216256141663, + "learning_rate": 9.986921281065292e-05, + "loss": 0.020178522914648056, + "num_input_tokens_seen": 19405560, + "step": 1185, + "train_runtime": 9765.2902, + "train_tokens_per_second": 1987.197 + }, + { + "epoch": 0.3285318559556787, + "grad_norm": 0.12373988330364227, + "learning_rate": 9.986889492626406e-05, + "loss": 0.02023923769593239, + "num_input_tokens_seen": 19421936, + "step": 1186, + "train_runtime": 9773.5131, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.328808864265928, + "grad_norm": 0.26629969477653503, + "learning_rate": 9.986857665653466e-05, + "loss": 0.027797961607575417, + "num_input_tokens_seen": 19438312, + "step": 1187, + "train_runtime": 9781.7549, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.3290858725761773, + "grad_norm": 0.17118528485298157, + "learning_rate": 9.986825800146717e-05, + "loss": 0.025594571605324745, + "num_input_tokens_seen": 19454688, + "step": 1188, + "train_runtime": 9789.9884, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 0.32936288088642657, + "grad_norm": 0.14207859337329865, + "learning_rate": 9.986793896106408e-05, + "loss": 0.02317594364285469, + "num_input_tokens_seen": 19471064, + "step": 1189, + "train_runtime": 9798.2259, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.3296398891966759, + "grad_norm": 0.12161527574062347, + "learning_rate": 9.986761953532784e-05, + "loss": 0.021016616374254227, + "num_input_tokens_seen": 19487440, + "step": 1190, + "train_runtime": 9806.4542, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.3299168975069252, + "grad_norm": 0.18563449382781982, + "learning_rate": 9.986729972426092e-05, + "loss": 0.027172934263944626, + "num_input_tokens_seen": 19503816, + "step": 1191, + "train_runtime": 9814.6856, + "train_tokens_per_second": 1987.207 + }, + { + "epoch": 0.3301939058171745, + "grad_norm": 0.13802824914455414, + "learning_rate": 9.986697952786578e-05, + "loss": 0.02222455106675625, + "num_input_tokens_seen": 19520192, + "step": 1192, + "train_runtime": 9822.9088, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 0.3304709141274238, + "grad_norm": 0.13692259788513184, + "learning_rate": 9.986665894614493e-05, + "loss": 0.020296797156333923, + "num_input_tokens_seen": 19536568, + "step": 1193, + "train_runtime": 9831.1546, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.33074792243767315, + "grad_norm": 0.1563963145017624, + "learning_rate": 9.98663379791008e-05, + "loss": 0.022578617557883263, + "num_input_tokens_seen": 19552944, + "step": 1194, + "train_runtime": 9839.3917, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 0.33102493074792244, + "grad_norm": 0.11187558621168137, + "learning_rate": 9.986601662673589e-05, + "loss": 0.01878518983721733, + "num_input_tokens_seen": 19569320, + "step": 1195, + "train_runtime": 9847.6271, + "train_tokens_per_second": 1987.212 + }, + { + "epoch": 0.33130193905817173, + "grad_norm": 0.13111549615859985, + "learning_rate": 9.98656948890527e-05, + "loss": 0.023805491626262665, + "num_input_tokens_seen": 19585696, + "step": 1196, + "train_runtime": 9855.8523, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 0.33157894736842103, + "grad_norm": 0.10731448978185654, + "learning_rate": 9.98653727660537e-05, + "loss": 0.022495094686746597, + "num_input_tokens_seen": 19602072, + "step": 1197, + "train_runtime": 9864.0652, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 0.3318559556786704, + "grad_norm": 0.14517398178577423, + "learning_rate": 9.986505025774138e-05, + "loss": 0.021561220288276672, + "num_input_tokens_seen": 19618448, + "step": 1198, + "train_runtime": 9872.2784, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.33213296398891967, + "grad_norm": 0.16206412017345428, + "learning_rate": 9.986472736411823e-05, + "loss": 0.02170235477387905, + "num_input_tokens_seen": 19634824, + "step": 1199, + "train_runtime": 9880.5046, + "train_tokens_per_second": 1987.229 + }, + { + "epoch": 0.33240997229916897, + "grad_norm": 0.22004692256450653, + "learning_rate": 9.986440408518673e-05, + "loss": 0.023810235783457756, + "num_input_tokens_seen": 19651200, + "step": 1200, + "train_runtime": 9888.7187, + "train_tokens_per_second": 1987.234 + }, + { + "epoch": 0.33268698060941826, + "grad_norm": 0.16228240728378296, + "learning_rate": 9.986408042094942e-05, + "loss": 0.02027033269405365, + "num_input_tokens_seen": 19667576, + "step": 1201, + "train_runtime": 9898.644, + "train_tokens_per_second": 1986.896 + }, + { + "epoch": 0.3329639889196676, + "grad_norm": 0.18161408603191376, + "learning_rate": 9.986375637140877e-05, + "loss": 0.020575260743498802, + "num_input_tokens_seen": 19683952, + "step": 1202, + "train_runtime": 9906.877, + "train_tokens_per_second": 1986.898 + }, + { + "epoch": 0.3332409972299169, + "grad_norm": 0.16092944145202637, + "learning_rate": 9.986343193656729e-05, + "loss": 0.028263669461011887, + "num_input_tokens_seen": 19700328, + "step": 1203, + "train_runtime": 9915.0935, + "train_tokens_per_second": 1986.903 + }, + { + "epoch": 0.3335180055401662, + "grad_norm": 0.11992378532886505, + "learning_rate": 9.986310711642748e-05, + "loss": 0.021813685074448586, + "num_input_tokens_seen": 19716704, + "step": 1204, + "train_runtime": 9923.307, + "train_tokens_per_second": 1986.909 + }, + { + "epoch": 0.3337950138504155, + "grad_norm": 0.14513202011585236, + "learning_rate": 9.986278191099186e-05, + "loss": 0.02406424656510353, + "num_input_tokens_seen": 19733080, + "step": 1205, + "train_runtime": 9931.518, + "train_tokens_per_second": 1986.915 + }, + { + "epoch": 0.33407202216066484, + "grad_norm": 0.16739970445632935, + "learning_rate": 9.986245632026295e-05, + "loss": 0.01849079504609108, + "num_input_tokens_seen": 19749456, + "step": 1206, + "train_runtime": 9939.7284, + "train_tokens_per_second": 1986.921 + }, + { + "epoch": 0.33434903047091413, + "grad_norm": 0.120297871530056, + "learning_rate": 9.986213034424324e-05, + "loss": 0.022588049992918968, + "num_input_tokens_seen": 19765832, + "step": 1207, + "train_runtime": 9947.9529, + "train_tokens_per_second": 1986.925 + }, + { + "epoch": 0.3346260387811634, + "grad_norm": 0.16877517104148865, + "learning_rate": 9.986180398293528e-05, + "loss": 0.02190742827951908, + "num_input_tokens_seen": 19782208, + "step": 1208, + "train_runtime": 9956.1647, + "train_tokens_per_second": 1986.931 + }, + { + "epoch": 0.3349030470914127, + "grad_norm": 0.1585240364074707, + "learning_rate": 9.986147723634156e-05, + "loss": 0.024480542168021202, + "num_input_tokens_seen": 19798584, + "step": 1209, + "train_runtime": 9964.3806, + "train_tokens_per_second": 1986.936 + }, + { + "epoch": 0.33518005540166207, + "grad_norm": 0.15012137591838837, + "learning_rate": 9.986115010446462e-05, + "loss": 0.02380707859992981, + "num_input_tokens_seen": 19814960, + "step": 1210, + "train_runtime": 9972.5964, + "train_tokens_per_second": 1986.941 + }, + { + "epoch": 0.33545706371191136, + "grad_norm": 0.18458175659179688, + "learning_rate": 9.986082258730701e-05, + "loss": 0.021413268521428108, + "num_input_tokens_seen": 19831336, + "step": 1211, + "train_runtime": 9980.8124, + "train_tokens_per_second": 1986.946 + }, + { + "epoch": 0.33573407202216066, + "grad_norm": 0.16228540241718292, + "learning_rate": 9.986049468487123e-05, + "loss": 0.022065419703722, + "num_input_tokens_seen": 19847712, + "step": 1212, + "train_runtime": 9989.0271, + "train_tokens_per_second": 1986.951 + }, + { + "epoch": 0.33601108033240995, + "grad_norm": 0.15686605870723724, + "learning_rate": 9.986016639715983e-05, + "loss": 0.017847271636128426, + "num_input_tokens_seen": 19864088, + "step": 1213, + "train_runtime": 9997.2425, + "train_tokens_per_second": 1986.957 + }, + { + "epoch": 0.3362880886426593, + "grad_norm": 0.10679984092712402, + "learning_rate": 9.985983772417533e-05, + "loss": 0.017473764717578888, + "num_input_tokens_seen": 19880464, + "step": 1214, + "train_runtime": 10005.468, + "train_tokens_per_second": 1986.96 + }, + { + "epoch": 0.3365650969529086, + "grad_norm": 0.21528710424900055, + "learning_rate": 9.985950866592029e-05, + "loss": 0.020741969347000122, + "num_input_tokens_seen": 19896840, + "step": 1215, + "train_runtime": 10013.6942, + "train_tokens_per_second": 1986.963 + }, + { + "epoch": 0.3368421052631579, + "grad_norm": 0.13800619542598724, + "learning_rate": 9.985917922239722e-05, + "loss": 0.01761069893836975, + "num_input_tokens_seen": 19913216, + "step": 1216, + "train_runtime": 10021.9212, + "train_tokens_per_second": 1986.966 + }, + { + "epoch": 0.3371191135734072, + "grad_norm": 0.1373826116323471, + "learning_rate": 9.985884939360872e-05, + "loss": 0.02199496328830719, + "num_input_tokens_seen": 19929592, + "step": 1217, + "train_runtime": 10030.1539, + "train_tokens_per_second": 1986.968 + }, + { + "epoch": 0.33739612188365653, + "grad_norm": 0.1687798947095871, + "learning_rate": 9.985851917955729e-05, + "loss": 0.01975940726697445, + "num_input_tokens_seen": 19945968, + "step": 1218, + "train_runtime": 10038.3847, + "train_tokens_per_second": 1986.97 + }, + { + "epoch": 0.3376731301939058, + "grad_norm": 0.19068947434425354, + "learning_rate": 9.985818858024549e-05, + "loss": 0.023084979504346848, + "num_input_tokens_seen": 19962344, + "step": 1219, + "train_runtime": 10046.6076, + "train_tokens_per_second": 1986.974 + }, + { + "epoch": 0.3379501385041551, + "grad_norm": 0.1269627958536148, + "learning_rate": 9.985785759567591e-05, + "loss": 0.02370471879839897, + "num_input_tokens_seen": 19978720, + "step": 1220, + "train_runtime": 10054.8357, + "train_tokens_per_second": 1986.976 + }, + { + "epoch": 0.3382271468144044, + "grad_norm": 0.12260947376489639, + "learning_rate": 9.985752622585106e-05, + "loss": 0.01822194643318653, + "num_input_tokens_seen": 19995096, + "step": 1221, + "train_runtime": 10063.0555, + "train_tokens_per_second": 1986.981 + }, + { + "epoch": 0.33850415512465376, + "grad_norm": 0.12316292524337769, + "learning_rate": 9.985719447077353e-05, + "loss": 0.020278137177228928, + "num_input_tokens_seen": 20011472, + "step": 1222, + "train_runtime": 10071.2854, + "train_tokens_per_second": 1986.983 + }, + { + "epoch": 0.33878116343490305, + "grad_norm": 0.1535922735929489, + "learning_rate": 9.985686233044586e-05, + "loss": 0.023012036457657814, + "num_input_tokens_seen": 20027848, + "step": 1223, + "train_runtime": 10079.5221, + "train_tokens_per_second": 1986.984 + }, + { + "epoch": 0.33905817174515235, + "grad_norm": 0.1887931525707245, + "learning_rate": 9.985652980487066e-05, + "loss": 0.022737586870789528, + "num_input_tokens_seen": 20044224, + "step": 1224, + "train_runtime": 10087.7622, + "train_tokens_per_second": 1986.984 + }, + { + "epoch": 0.33933518005540164, + "grad_norm": 0.17341522872447968, + "learning_rate": 9.985619689405044e-05, + "loss": 0.021598465740680695, + "num_input_tokens_seen": 20060600, + "step": 1225, + "train_runtime": 10096.0022, + "train_tokens_per_second": 1986.985 + }, + { + "epoch": 0.339612188365651, + "grad_norm": 0.11168327182531357, + "learning_rate": 9.985586359798782e-05, + "loss": 0.020427992567420006, + "num_input_tokens_seen": 20076976, + "step": 1226, + "train_runtime": 10104.2429, + "train_tokens_per_second": 1986.985 + }, + { + "epoch": 0.3398891966759003, + "grad_norm": 0.12747646868228912, + "learning_rate": 9.985552991668535e-05, + "loss": 0.02068936452269554, + "num_input_tokens_seen": 20093352, + "step": 1227, + "train_runtime": 10112.4799, + "train_tokens_per_second": 1986.986 + }, + { + "epoch": 0.3401662049861496, + "grad_norm": 0.16303662955760956, + "learning_rate": 9.985519585014562e-05, + "loss": 0.02176317386329174, + "num_input_tokens_seen": 20109728, + "step": 1228, + "train_runtime": 10120.6972, + "train_tokens_per_second": 1986.99 + }, + { + "epoch": 0.34044321329639887, + "grad_norm": 0.14605847001075745, + "learning_rate": 9.985486139837121e-05, + "loss": 0.02362580969929695, + "num_input_tokens_seen": 20126104, + "step": 1229, + "train_runtime": 10128.9081, + "train_tokens_per_second": 1986.996 + }, + { + "epoch": 0.3407202216066482, + "grad_norm": 0.13361065089702606, + "learning_rate": 9.98545265613647e-05, + "loss": 0.023407382890582085, + "num_input_tokens_seen": 20142480, + "step": 1230, + "train_runtime": 10137.1364, + "train_tokens_per_second": 1986.999 + }, + { + "epoch": 0.3409972299168975, + "grad_norm": 0.14839625358581543, + "learning_rate": 9.985419133912869e-05, + "loss": 0.02173086628317833, + "num_input_tokens_seen": 20158856, + "step": 1231, + "train_runtime": 10145.3615, + "train_tokens_per_second": 1987.002 + }, + { + "epoch": 0.3412742382271468, + "grad_norm": 0.14981132745742798, + "learning_rate": 9.985385573166575e-05, + "loss": 0.01887657679617405, + "num_input_tokens_seen": 20175232, + "step": 1232, + "train_runtime": 10153.5885, + "train_tokens_per_second": 1987.005 + }, + { + "epoch": 0.3415512465373961, + "grad_norm": 0.1214304268360138, + "learning_rate": 9.985351973897848e-05, + "loss": 0.019119950011372566, + "num_input_tokens_seen": 20191608, + "step": 1233, + "train_runtime": 10161.8105, + "train_tokens_per_second": 1987.009 + }, + { + "epoch": 0.34182825484764545, + "grad_norm": 0.18084047734737396, + "learning_rate": 9.985318336106949e-05, + "loss": 0.027109241113066673, + "num_input_tokens_seen": 20207984, + "step": 1234, + "train_runtime": 10170.0264, + "train_tokens_per_second": 1987.014 + }, + { + "epoch": 0.34210526315789475, + "grad_norm": 0.1076149195432663, + "learning_rate": 9.985284659794136e-05, + "loss": 0.02099642902612686, + "num_input_tokens_seen": 20224360, + "step": 1235, + "train_runtime": 10178.2663, + "train_tokens_per_second": 1987.014 + }, + { + "epoch": 0.34238227146814404, + "grad_norm": 0.14369523525238037, + "learning_rate": 9.98525094495967e-05, + "loss": 0.020847158506512642, + "num_input_tokens_seen": 20240736, + "step": 1236, + "train_runtime": 10186.4969, + "train_tokens_per_second": 1987.016 + }, + { + "epoch": 0.34265927977839333, + "grad_norm": 0.09691997617483139, + "learning_rate": 9.985217191603813e-05, + "loss": 0.01589980535209179, + "num_input_tokens_seen": 20257112, + "step": 1237, + "train_runtime": 10194.7304, + "train_tokens_per_second": 1987.018 + }, + { + "epoch": 0.3429362880886427, + "grad_norm": 0.1204170510172844, + "learning_rate": 9.985183399726824e-05, + "loss": 0.019113367423415184, + "num_input_tokens_seen": 20273488, + "step": 1238, + "train_runtime": 10202.9617, + "train_tokens_per_second": 1987.02 + }, + { + "epoch": 0.343213296398892, + "grad_norm": 0.16229385137557983, + "learning_rate": 9.985149569328963e-05, + "loss": 0.026642674580216408, + "num_input_tokens_seen": 20289864, + "step": 1239, + "train_runtime": 10211.1995, + "train_tokens_per_second": 1987.021 + }, + { + "epoch": 0.34349030470914127, + "grad_norm": 0.14046265184879303, + "learning_rate": 9.985115700410495e-05, + "loss": 0.0220619086176157, + "num_input_tokens_seen": 20306240, + "step": 1240, + "train_runtime": 10219.4364, + "train_tokens_per_second": 1987.022 + }, + { + "epoch": 0.34376731301939056, + "grad_norm": 0.13860993087291718, + "learning_rate": 9.985081792971678e-05, + "loss": 0.02395845204591751, + "num_input_tokens_seen": 20322616, + "step": 1241, + "train_runtime": 10227.6668, + "train_tokens_per_second": 1987.024 + }, + { + "epoch": 0.3440443213296399, + "grad_norm": 0.14432719349861145, + "learning_rate": 9.985047847012776e-05, + "loss": 0.02055436372756958, + "num_input_tokens_seen": 20338992, + "step": 1242, + "train_runtime": 10235.8923, + "train_tokens_per_second": 1987.027 + }, + { + "epoch": 0.3443213296398892, + "grad_norm": 0.10279717296361923, + "learning_rate": 9.985013862534051e-05, + "loss": 0.021534623578190804, + "num_input_tokens_seen": 20355368, + "step": 1243, + "train_runtime": 10244.1223, + "train_tokens_per_second": 1987.029 + }, + { + "epoch": 0.3445983379501385, + "grad_norm": 0.12713073194026947, + "learning_rate": 9.984979839535766e-05, + "loss": 0.017339980229735374, + "num_input_tokens_seen": 20371744, + "step": 1244, + "train_runtime": 10252.3624, + "train_tokens_per_second": 1987.029 + }, + { + "epoch": 0.3448753462603878, + "grad_norm": 0.12264974415302277, + "learning_rate": 9.984945778018184e-05, + "loss": 0.02138311043381691, + "num_input_tokens_seen": 20388120, + "step": 1245, + "train_runtime": 10260.592, + "train_tokens_per_second": 1987.032 + }, + { + "epoch": 0.34515235457063714, + "grad_norm": 0.1335284262895584, + "learning_rate": 9.984911677981567e-05, + "loss": 0.021219685673713684, + "num_input_tokens_seen": 20404496, + "step": 1246, + "train_runtime": 10268.8138, + "train_tokens_per_second": 1987.035 + }, + { + "epoch": 0.34542936288088644, + "grad_norm": 0.14338596165180206, + "learning_rate": 9.984877539426179e-05, + "loss": 0.01874539442360401, + "num_input_tokens_seen": 20420872, + "step": 1247, + "train_runtime": 10277.0445, + "train_tokens_per_second": 1987.037 + }, + { + "epoch": 0.34570637119113573, + "grad_norm": 0.14244528114795685, + "learning_rate": 9.984843362352284e-05, + "loss": 0.021066362038254738, + "num_input_tokens_seen": 20437248, + "step": 1248, + "train_runtime": 10285.2749, + "train_tokens_per_second": 1987.04 + }, + { + "epoch": 0.345983379501385, + "grad_norm": 0.10828595608472824, + "learning_rate": 9.984809146760146e-05, + "loss": 0.019954897463321686, + "num_input_tokens_seen": 20453624, + "step": 1249, + "train_runtime": 10293.517, + "train_tokens_per_second": 1987.039 + }, + { + "epoch": 0.3462603878116344, + "grad_norm": 0.10766288638114929, + "learning_rate": 9.98477489265003e-05, + "loss": 0.023509696125984192, + "num_input_tokens_seen": 20470000, + "step": 1250, + "train_runtime": 10301.7409, + "train_tokens_per_second": 1987.043 + }, + { + "epoch": 0.34653739612188367, + "grad_norm": 0.13136829435825348, + "learning_rate": 9.9847406000222e-05, + "loss": 0.018354974687099457, + "num_input_tokens_seen": 20486376, + "step": 1251, + "train_runtime": 10309.9707, + "train_tokens_per_second": 1987.045 + }, + { + "epoch": 0.34681440443213296, + "grad_norm": 0.12130106985569, + "learning_rate": 9.98470626887692e-05, + "loss": 0.020155921578407288, + "num_input_tokens_seen": 20502752, + "step": 1252, + "train_runtime": 10318.196, + "train_tokens_per_second": 1987.048 + }, + { + "epoch": 0.34709141274238225, + "grad_norm": 0.12538857758045197, + "learning_rate": 9.984671899214457e-05, + "loss": 0.021250663325190544, + "num_input_tokens_seen": 20519128, + "step": 1253, + "train_runtime": 10326.4148, + "train_tokens_per_second": 1987.052 + }, + { + "epoch": 0.3473684210526316, + "grad_norm": 0.15048104524612427, + "learning_rate": 9.984637491035078e-05, + "loss": 0.02590947411954403, + "num_input_tokens_seen": 20535504, + "step": 1254, + "train_runtime": 10334.6436, + "train_tokens_per_second": 1987.055 + }, + { + "epoch": 0.3476454293628809, + "grad_norm": 0.16410855948925018, + "learning_rate": 9.984603044339044e-05, + "loss": 0.026690050959587097, + "num_input_tokens_seen": 20551880, + "step": 1255, + "train_runtime": 10342.8757, + "train_tokens_per_second": 1987.057 + }, + { + "epoch": 0.3479224376731302, + "grad_norm": 0.13022373616695404, + "learning_rate": 9.984568559126624e-05, + "loss": 0.020010408014059067, + "num_input_tokens_seen": 20568256, + "step": 1256, + "train_runtime": 10351.0991, + "train_tokens_per_second": 1987.06 + }, + { + "epoch": 0.3481994459833795, + "grad_norm": 0.14752882719039917, + "learning_rate": 9.984534035398086e-05, + "loss": 0.024834642186760902, + "num_input_tokens_seen": 20584632, + "step": 1257, + "train_runtime": 10359.341, + "train_tokens_per_second": 1987.06 + }, + { + "epoch": 0.34847645429362883, + "grad_norm": 0.1275721788406372, + "learning_rate": 9.984499473153694e-05, + "loss": 0.020420655608177185, + "num_input_tokens_seen": 20601008, + "step": 1258, + "train_runtime": 10367.5754, + "train_tokens_per_second": 1987.061 + }, + { + "epoch": 0.34875346260387813, + "grad_norm": 0.09400663524866104, + "learning_rate": 9.984464872393717e-05, + "loss": 0.018887728452682495, + "num_input_tokens_seen": 20617384, + "step": 1259, + "train_runtime": 10375.8048, + "train_tokens_per_second": 1987.064 + }, + { + "epoch": 0.3490304709141274, + "grad_norm": 0.0887271836400032, + "learning_rate": 9.984430233118422e-05, + "loss": 0.01833583414554596, + "num_input_tokens_seen": 20633760, + "step": 1260, + "train_runtime": 10384.0334, + "train_tokens_per_second": 1987.066 + }, + { + "epoch": 0.3493074792243767, + "grad_norm": 0.1282779723405838, + "learning_rate": 9.984395555328076e-05, + "loss": 0.02102718874812126, + "num_input_tokens_seen": 20650136, + "step": 1261, + "train_runtime": 10392.2645, + "train_tokens_per_second": 1987.068 + }, + { + "epoch": 0.34958448753462606, + "grad_norm": 0.12714526057243347, + "learning_rate": 9.984360839022948e-05, + "loss": 0.01910846307873726, + "num_input_tokens_seen": 20666512, + "step": 1262, + "train_runtime": 10400.4976, + "train_tokens_per_second": 1987.07 + }, + { + "epoch": 0.34986149584487536, + "grad_norm": 0.1266443133354187, + "learning_rate": 9.984326084203303e-05, + "loss": 0.023405518382787704, + "num_input_tokens_seen": 20682888, + "step": 1263, + "train_runtime": 10408.7325, + "train_tokens_per_second": 1987.071 + }, + { + "epoch": 0.35013850415512465, + "grad_norm": 0.13454076647758484, + "learning_rate": 9.984291290869415e-05, + "loss": 0.020483968779444695, + "num_input_tokens_seen": 20699264, + "step": 1264, + "train_runtime": 10416.9638, + "train_tokens_per_second": 1987.073 + }, + { + "epoch": 0.35041551246537395, + "grad_norm": 0.15149809420108795, + "learning_rate": 9.984256459021548e-05, + "loss": 0.0217170100659132, + "num_input_tokens_seen": 20715640, + "step": 1265, + "train_runtime": 10425.1972, + "train_tokens_per_second": 1987.074 + }, + { + "epoch": 0.3506925207756233, + "grad_norm": 0.13447648286819458, + "learning_rate": 9.984221588659975e-05, + "loss": 0.020868049934506416, + "num_input_tokens_seen": 20732016, + "step": 1266, + "train_runtime": 10433.4385, + "train_tokens_per_second": 1987.074 + }, + { + "epoch": 0.3509695290858726, + "grad_norm": 0.16921180486679077, + "learning_rate": 9.984186679784961e-05, + "loss": 0.0179742518812418, + "num_input_tokens_seen": 20748392, + "step": 1267, + "train_runtime": 10441.6868, + "train_tokens_per_second": 1987.073 + }, + { + "epoch": 0.3512465373961219, + "grad_norm": 0.1324063390493393, + "learning_rate": 9.98415173239678e-05, + "loss": 0.02215653285384178, + "num_input_tokens_seen": 20764768, + "step": 1268, + "train_runtime": 10449.9179, + "train_tokens_per_second": 1987.075 + }, + { + "epoch": 0.3515235457063712, + "grad_norm": 0.12096592038869858, + "learning_rate": 9.9841167464957e-05, + "loss": 0.01819027028977871, + "num_input_tokens_seen": 20781144, + "step": 1269, + "train_runtime": 10458.1577, + "train_tokens_per_second": 1987.075 + }, + { + "epoch": 0.3518005540166205, + "grad_norm": 0.16397765278816223, + "learning_rate": 9.984081722081993e-05, + "loss": 0.02181631699204445, + "num_input_tokens_seen": 20797520, + "step": 1270, + "train_runtime": 10466.3923, + "train_tokens_per_second": 1987.076 + }, + { + "epoch": 0.3520775623268698, + "grad_norm": 0.13001234829425812, + "learning_rate": 9.984046659155926e-05, + "loss": 0.02024838700890541, + "num_input_tokens_seen": 20813896, + "step": 1271, + "train_runtime": 10474.6277, + "train_tokens_per_second": 1987.077 + }, + { + "epoch": 0.3523545706371191, + "grad_norm": 0.15021269023418427, + "learning_rate": 9.984011557717773e-05, + "loss": 0.02278660424053669, + "num_input_tokens_seen": 20830272, + "step": 1272, + "train_runtime": 10482.8601, + "train_tokens_per_second": 1987.079 + }, + { + "epoch": 0.3526315789473684, + "grad_norm": 0.09868161380290985, + "learning_rate": 9.983976417767804e-05, + "loss": 0.020174287259578705, + "num_input_tokens_seen": 20846648, + "step": 1273, + "train_runtime": 10491.0925, + "train_tokens_per_second": 1987.081 + }, + { + "epoch": 0.35290858725761776, + "grad_norm": 0.11405650526285172, + "learning_rate": 9.983941239306291e-05, + "loss": 0.01933852769434452, + "num_input_tokens_seen": 20863024, + "step": 1274, + "train_runtime": 10499.3045, + "train_tokens_per_second": 1987.086 + }, + { + "epoch": 0.35318559556786705, + "grad_norm": 0.1253572702407837, + "learning_rate": 9.983906022333507e-05, + "loss": 0.019643062725663185, + "num_input_tokens_seen": 20879400, + "step": 1275, + "train_runtime": 10507.5319, + "train_tokens_per_second": 1987.089 + }, + { + "epoch": 0.35346260387811634, + "grad_norm": 0.1276039034128189, + "learning_rate": 9.983870766849724e-05, + "loss": 0.018540406599640846, + "num_input_tokens_seen": 20895776, + "step": 1276, + "train_runtime": 10515.7411, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.35373961218836564, + "grad_norm": 0.15629108250141144, + "learning_rate": 9.98383547285521e-05, + "loss": 0.021658750250935555, + "num_input_tokens_seen": 20912152, + "step": 1277, + "train_runtime": 10523.9671, + "train_tokens_per_second": 1987.098 + }, + { + "epoch": 0.35401662049861493, + "grad_norm": 0.08223596960306168, + "learning_rate": 9.983800140350244e-05, + "loss": 0.016864866018295288, + "num_input_tokens_seen": 20928528, + "step": 1278, + "train_runtime": 10532.1924, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.3542936288088643, + "grad_norm": 0.14277347922325134, + "learning_rate": 9.983764769335094e-05, + "loss": 0.02157927118241787, + "num_input_tokens_seen": 20944904, + "step": 1279, + "train_runtime": 10540.4159, + "train_tokens_per_second": 1987.104 + }, + { + "epoch": 0.3545706371191136, + "grad_norm": 0.14220313727855682, + "learning_rate": 9.983729359810037e-05, + "loss": 0.020464876666665077, + "num_input_tokens_seen": 20961280, + "step": 1280, + "train_runtime": 10548.6534, + "train_tokens_per_second": 1987.105 + }, + { + "epoch": 0.35484764542936287, + "grad_norm": 0.13475298881530762, + "learning_rate": 9.983693911775344e-05, + "loss": 0.0190537478774786, + "num_input_tokens_seen": 20977656, + "step": 1281, + "train_runtime": 10556.8848, + "train_tokens_per_second": 1987.107 + }, + { + "epoch": 0.35512465373961216, + "grad_norm": 0.17818686366081238, + "learning_rate": 9.983658425231291e-05, + "loss": 0.02435901202261448, + "num_input_tokens_seen": 20994032, + "step": 1282, + "train_runtime": 10565.1161, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.3554016620498615, + "grad_norm": 0.10566544532775879, + "learning_rate": 9.983622900178148e-05, + "loss": 0.02320200763642788, + "num_input_tokens_seen": 21010408, + "step": 1283, + "train_runtime": 10573.341, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.3556786703601108, + "grad_norm": 0.13584153354167938, + "learning_rate": 9.983587336616195e-05, + "loss": 0.020030738785862923, + "num_input_tokens_seen": 21026784, + "step": 1284, + "train_runtime": 10581.5623, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.3559556786703601, + "grad_norm": 0.1154097318649292, + "learning_rate": 9.983551734545705e-05, + "loss": 0.01945488713681698, + "num_input_tokens_seen": 21043160, + "step": 1285, + "train_runtime": 10589.8009, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.3562326869806094, + "grad_norm": 0.1232052594423294, + "learning_rate": 9.983516093966952e-05, + "loss": 0.017615877091884613, + "num_input_tokens_seen": 21059536, + "step": 1286, + "train_runtime": 10598.031, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.35650969529085874, + "grad_norm": 0.10917158424854279, + "learning_rate": 9.983480414880212e-05, + "loss": 0.020353740081191063, + "num_input_tokens_seen": 21075912, + "step": 1287, + "train_runtime": 10606.2673, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.35678670360110804, + "grad_norm": 0.1362210065126419, + "learning_rate": 9.98344469728576e-05, + "loss": 0.022587498649954796, + "num_input_tokens_seen": 21092288, + "step": 1288, + "train_runtime": 10614.4988, + "train_tokens_per_second": 1987.12 + }, + { + "epoch": 0.35706371191135733, + "grad_norm": 0.14392496645450592, + "learning_rate": 9.983408941183874e-05, + "loss": 0.020195908844470978, + "num_input_tokens_seen": 21108664, + "step": 1289, + "train_runtime": 10622.7276, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.3573407202216066, + "grad_norm": 0.13293282687664032, + "learning_rate": 9.983373146574829e-05, + "loss": 0.018345830962061882, + "num_input_tokens_seen": 21125040, + "step": 1290, + "train_runtime": 10630.9606, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.35761772853185597, + "grad_norm": 0.11166903376579285, + "learning_rate": 9.983337313458899e-05, + "loss": 0.02296501398086548, + "num_input_tokens_seen": 21141416, + "step": 1291, + "train_runtime": 10639.6331, + "train_tokens_per_second": 1987.044 + }, + { + "epoch": 0.35789473684210527, + "grad_norm": 0.11346077919006348, + "learning_rate": 9.983301441836366e-05, + "loss": 0.017590561881661415, + "num_input_tokens_seen": 21157792, + "step": 1292, + "train_runtime": 10647.8956, + "train_tokens_per_second": 1987.04 + }, + { + "epoch": 0.35817174515235456, + "grad_norm": 0.1322748214006424, + "learning_rate": 9.983265531707502e-05, + "loss": 0.02309482917189598, + "num_input_tokens_seen": 21174168, + "step": 1293, + "train_runtime": 10656.1236, + "train_tokens_per_second": 1987.042 + }, + { + "epoch": 0.35844875346260385, + "grad_norm": 0.13798676431179047, + "learning_rate": 9.983229583072589e-05, + "loss": 0.024432722479104996, + "num_input_tokens_seen": 21190544, + "step": 1294, + "train_runtime": 10664.3517, + "train_tokens_per_second": 1987.045 + }, + { + "epoch": 0.3587257617728532, + "grad_norm": 0.12151209264993668, + "learning_rate": 9.983193595931903e-05, + "loss": 0.018217070028185844, + "num_input_tokens_seen": 21206920, + "step": 1295, + "train_runtime": 10672.5826, + "train_tokens_per_second": 1987.047 + }, + { + "epoch": 0.3590027700831025, + "grad_norm": 0.09562014043331146, + "learning_rate": 9.98315757028572e-05, + "loss": 0.022387275472283363, + "num_input_tokens_seen": 21223296, + "step": 1296, + "train_runtime": 10680.8081, + "train_tokens_per_second": 1987.05 + }, + { + "epoch": 0.3592797783933518, + "grad_norm": 0.14024193584918976, + "learning_rate": 9.983121506134322e-05, + "loss": 0.019231272861361504, + "num_input_tokens_seen": 21239672, + "step": 1297, + "train_runtime": 10689.0336, + "train_tokens_per_second": 1987.053 + }, + { + "epoch": 0.3595567867036011, + "grad_norm": 0.09824497997760773, + "learning_rate": 9.983085403477985e-05, + "loss": 0.01793626882135868, + "num_input_tokens_seen": 21256048, + "step": 1298, + "train_runtime": 10697.2606, + "train_tokens_per_second": 1987.055 + }, + { + "epoch": 0.35983379501385043, + "grad_norm": 0.11672700196504593, + "learning_rate": 9.98304926231699e-05, + "loss": 0.020093513652682304, + "num_input_tokens_seen": 21272424, + "step": 1299, + "train_runtime": 10705.4783, + "train_tokens_per_second": 1987.06 + }, + { + "epoch": 0.3601108033240997, + "grad_norm": 0.16163988411426544, + "learning_rate": 9.983013082651615e-05, + "loss": 0.021490536630153656, + "num_input_tokens_seen": 21288800, + "step": 1300, + "train_runtime": 10713.703, + "train_tokens_per_second": 1987.063 + }, + { + "epoch": 0.360387811634349, + "grad_norm": 0.10593137890100479, + "learning_rate": 9.982976864482139e-05, + "loss": 0.02191612683236599, + "num_input_tokens_seen": 21305176, + "step": 1301, + "train_runtime": 10723.5803, + "train_tokens_per_second": 1986.76 + }, + { + "epoch": 0.3606648199445983, + "grad_norm": 0.15320436656475067, + "learning_rate": 9.982940607808843e-05, + "loss": 0.02202986367046833, + "num_input_tokens_seen": 21321552, + "step": 1302, + "train_runtime": 10731.7811, + "train_tokens_per_second": 1986.767 + }, + { + "epoch": 0.36094182825484766, + "grad_norm": 0.15642398595809937, + "learning_rate": 9.982904312632007e-05, + "loss": 0.019327102228999138, + "num_input_tokens_seen": 21337928, + "step": 1303, + "train_runtime": 10739.9871, + "train_tokens_per_second": 1986.774 + }, + { + "epoch": 0.36121883656509696, + "grad_norm": 0.12377449870109558, + "learning_rate": 9.98286797895191e-05, + "loss": 0.022178618237376213, + "num_input_tokens_seen": 21354304, + "step": 1304, + "train_runtime": 10748.2108, + "train_tokens_per_second": 1986.778 + }, + { + "epoch": 0.36149584487534625, + "grad_norm": 0.11186200380325317, + "learning_rate": 9.982831606768836e-05, + "loss": 0.02076810970902443, + "num_input_tokens_seen": 21370680, + "step": 1305, + "train_runtime": 10756.4371, + "train_tokens_per_second": 1986.781 + }, + { + "epoch": 0.36177285318559554, + "grad_norm": 0.11493954807519913, + "learning_rate": 9.982795196083062e-05, + "loss": 0.017069458961486816, + "num_input_tokens_seen": 21387056, + "step": 1306, + "train_runtime": 10764.662, + "train_tokens_per_second": 1986.784 + }, + { + "epoch": 0.3620498614958449, + "grad_norm": 0.1616387963294983, + "learning_rate": 9.982758746894872e-05, + "loss": 0.025397228077054024, + "num_input_tokens_seen": 21403432, + "step": 1307, + "train_runtime": 10772.8892, + "train_tokens_per_second": 1986.787 + }, + { + "epoch": 0.3623268698060942, + "grad_norm": 0.14425058662891388, + "learning_rate": 9.982722259204548e-05, + "loss": 0.024584945291280746, + "num_input_tokens_seen": 21419808, + "step": 1308, + "train_runtime": 10781.1132, + "train_tokens_per_second": 1986.79 + }, + { + "epoch": 0.3626038781163435, + "grad_norm": 0.14066611230373383, + "learning_rate": 9.98268573301237e-05, + "loss": 0.018594810739159584, + "num_input_tokens_seen": 21436184, + "step": 1309, + "train_runtime": 10789.3539, + "train_tokens_per_second": 1986.79 + }, + { + "epoch": 0.3628808864265928, + "grad_norm": 0.136274054646492, + "learning_rate": 9.98264916831862e-05, + "loss": 0.02067681960761547, + "num_input_tokens_seen": 21452560, + "step": 1310, + "train_runtime": 10797.5815, + "train_tokens_per_second": 1986.793 + }, + { + "epoch": 0.3631578947368421, + "grad_norm": 0.11813928931951523, + "learning_rate": 9.982612565123585e-05, + "loss": 0.02260502427816391, + "num_input_tokens_seen": 21468936, + "step": 1311, + "train_runtime": 10805.7959, + "train_tokens_per_second": 1986.798 + }, + { + "epoch": 0.3634349030470914, + "grad_norm": 0.16723814606666565, + "learning_rate": 9.982575923427541e-05, + "loss": 0.025245796889066696, + "num_input_tokens_seen": 21485312, + "step": 1312, + "train_runtime": 10814.0055, + "train_tokens_per_second": 1986.804 + }, + { + "epoch": 0.3637119113573407, + "grad_norm": 0.09184238314628601, + "learning_rate": 9.982539243230777e-05, + "loss": 0.018615474924445152, + "num_input_tokens_seen": 21501688, + "step": 1313, + "train_runtime": 10822.2278, + "train_tokens_per_second": 1986.808 + }, + { + "epoch": 0.36398891966759, + "grad_norm": 0.10911108553409576, + "learning_rate": 9.982502524533574e-05, + "loss": 0.016845136880874634, + "num_input_tokens_seen": 21518064, + "step": 1314, + "train_runtime": 10830.4386, + "train_tokens_per_second": 1986.814 + }, + { + "epoch": 0.36426592797783935, + "grad_norm": 0.1694604903459549, + "learning_rate": 9.982465767336215e-05, + "loss": 0.02171187661588192, + "num_input_tokens_seen": 21534440, + "step": 1315, + "train_runtime": 10838.6545, + "train_tokens_per_second": 1986.819 + }, + { + "epoch": 0.36454293628808865, + "grad_norm": 0.11895275861024857, + "learning_rate": 9.982428971638986e-05, + "loss": 0.021031538024544716, + "num_input_tokens_seen": 21550816, + "step": 1316, + "train_runtime": 10846.8644, + "train_tokens_per_second": 1986.825 + }, + { + "epoch": 0.36481994459833794, + "grad_norm": 0.15391351282596588, + "learning_rate": 9.982392137442169e-05, + "loss": 0.022400643676519394, + "num_input_tokens_seen": 21567192, + "step": 1317, + "train_runtime": 10855.0712, + "train_tokens_per_second": 1986.831 + }, + { + "epoch": 0.36509695290858724, + "grad_norm": 0.13187435269355774, + "learning_rate": 9.98235526474605e-05, + "loss": 0.017693843692541122, + "num_input_tokens_seen": 21583568, + "step": 1318, + "train_runtime": 10863.2831, + "train_tokens_per_second": 1986.837 + }, + { + "epoch": 0.3653739612188366, + "grad_norm": 0.1491229087114334, + "learning_rate": 9.982318353550915e-05, + "loss": 0.01989898458123207, + "num_input_tokens_seen": 21599944, + "step": 1319, + "train_runtime": 10871.5134, + "train_tokens_per_second": 1986.839 + }, + { + "epoch": 0.3656509695290859, + "grad_norm": 0.1519072949886322, + "learning_rate": 9.982281403857047e-05, + "loss": 0.01854877546429634, + "num_input_tokens_seen": 21616320, + "step": 1320, + "train_runtime": 10879.7553, + "train_tokens_per_second": 1986.839 + }, + { + "epoch": 0.3659279778393352, + "grad_norm": 0.12690743803977966, + "learning_rate": 9.982244415664733e-05, + "loss": 0.019545119255781174, + "num_input_tokens_seen": 21632696, + "step": 1321, + "train_runtime": 10887.9828, + "train_tokens_per_second": 1986.841 + }, + { + "epoch": 0.36620498614958447, + "grad_norm": 0.13359619677066803, + "learning_rate": 9.982207388974257e-05, + "loss": 0.02492537908256054, + "num_input_tokens_seen": 21649072, + "step": 1322, + "train_runtime": 10896.2056, + "train_tokens_per_second": 1986.845 + }, + { + "epoch": 0.3664819944598338, + "grad_norm": 0.1420634537935257, + "learning_rate": 9.982170323785908e-05, + "loss": 0.02404472604393959, + "num_input_tokens_seen": 21665448, + "step": 1323, + "train_runtime": 10904.4234, + "train_tokens_per_second": 1986.849 + }, + { + "epoch": 0.3667590027700831, + "grad_norm": 0.1558813899755478, + "learning_rate": 9.982133220099969e-05, + "loss": 0.017676424235105515, + "num_input_tokens_seen": 21681824, + "step": 1324, + "train_runtime": 10912.6554, + "train_tokens_per_second": 1986.851 + }, + { + "epoch": 0.3670360110803324, + "grad_norm": 0.11931122839450836, + "learning_rate": 9.98209607791673e-05, + "loss": 0.021333124488592148, + "num_input_tokens_seen": 21698200, + "step": 1325, + "train_runtime": 10920.8903, + "train_tokens_per_second": 1986.853 + }, + { + "epoch": 0.3673130193905817, + "grad_norm": 0.1498405635356903, + "learning_rate": 9.982058897236477e-05, + "loss": 0.02655941992998123, + "num_input_tokens_seen": 21714576, + "step": 1326, + "train_runtime": 10929.1207, + "train_tokens_per_second": 1986.855 + }, + { + "epoch": 0.36759002770083105, + "grad_norm": 0.09428603947162628, + "learning_rate": 9.982021678059494e-05, + "loss": 0.017200030386447906, + "num_input_tokens_seen": 21730952, + "step": 1327, + "train_runtime": 10937.3541, + "train_tokens_per_second": 1986.856 + }, + { + "epoch": 0.36786703601108034, + "grad_norm": 0.15003693103790283, + "learning_rate": 9.981984420386075e-05, + "loss": 0.021416133269667625, + "num_input_tokens_seen": 21747328, + "step": 1328, + "train_runtime": 10945.5838, + "train_tokens_per_second": 1986.859 + }, + { + "epoch": 0.36814404432132963, + "grad_norm": 0.13402032852172852, + "learning_rate": 9.981947124216501e-05, + "loss": 0.020707666873931885, + "num_input_tokens_seen": 21763704, + "step": 1329, + "train_runtime": 10953.8101, + "train_tokens_per_second": 1986.862 + }, + { + "epoch": 0.3684210526315789, + "grad_norm": 0.14896458387374878, + "learning_rate": 9.981909789551065e-05, + "loss": 0.019204270094633102, + "num_input_tokens_seen": 21780080, + "step": 1330, + "train_runtime": 10962.029, + "train_tokens_per_second": 1986.866 + }, + { + "epoch": 0.3686980609418283, + "grad_norm": 0.12685047090053558, + "learning_rate": 9.981872416390054e-05, + "loss": 0.02498066984117031, + "num_input_tokens_seen": 21796456, + "step": 1331, + "train_runtime": 10970.2675, + "train_tokens_per_second": 1986.866 + }, + { + "epoch": 0.36897506925207757, + "grad_norm": 0.15606121718883514, + "learning_rate": 9.981835004733758e-05, + "loss": 0.020151464268565178, + "num_input_tokens_seen": 21812832, + "step": 1332, + "train_runtime": 10978.4944, + "train_tokens_per_second": 1986.869 + }, + { + "epoch": 0.36925207756232686, + "grad_norm": 0.11296838521957397, + "learning_rate": 9.981797554582463e-05, + "loss": 0.017156608402729034, + "num_input_tokens_seen": 21829208, + "step": 1333, + "train_runtime": 10986.7207, + "train_tokens_per_second": 1986.872 + }, + { + "epoch": 0.36952908587257616, + "grad_norm": 0.12115295976400375, + "learning_rate": 9.981760065936461e-05, + "loss": 0.021695904433727264, + "num_input_tokens_seen": 21845584, + "step": 1334, + "train_runtime": 10994.9428, + "train_tokens_per_second": 1986.876 + }, + { + "epoch": 0.3698060941828255, + "grad_norm": 0.1352895349264145, + "learning_rate": 9.981722538796041e-05, + "loss": 0.020244568586349487, + "num_input_tokens_seen": 21861960, + "step": 1335, + "train_runtime": 11003.1722, + "train_tokens_per_second": 1986.878 + }, + { + "epoch": 0.3700831024930748, + "grad_norm": 0.1325417160987854, + "learning_rate": 9.981684973161491e-05, + "loss": 0.023188797757029533, + "num_input_tokens_seen": 21878336, + "step": 1336, + "train_runtime": 11011.3957, + "train_tokens_per_second": 1986.881 + }, + { + "epoch": 0.3703601108033241, + "grad_norm": 0.13294437527656555, + "learning_rate": 9.981647369033105e-05, + "loss": 0.020602189004421234, + "num_input_tokens_seen": 21894712, + "step": 1337, + "train_runtime": 11019.6192, + "train_tokens_per_second": 1986.885 + }, + { + "epoch": 0.3706371191135734, + "grad_norm": 0.14113658666610718, + "learning_rate": 9.981609726411172e-05, + "loss": 0.022745406255126, + "num_input_tokens_seen": 21911088, + "step": 1338, + "train_runtime": 11027.8392, + "train_tokens_per_second": 1986.889 + }, + { + "epoch": 0.37091412742382274, + "grad_norm": 0.09787464141845703, + "learning_rate": 9.981572045295982e-05, + "loss": 0.022685673087835312, + "num_input_tokens_seen": 21927464, + "step": 1339, + "train_runtime": 11036.0687, + "train_tokens_per_second": 1986.891 + }, + { + "epoch": 0.37119113573407203, + "grad_norm": 0.11578145623207092, + "learning_rate": 9.981534325687826e-05, + "loss": 0.019161097705364227, + "num_input_tokens_seen": 21943840, + "step": 1340, + "train_runtime": 11044.2903, + "train_tokens_per_second": 1986.895 + }, + { + "epoch": 0.3714681440443213, + "grad_norm": 0.11361049860715866, + "learning_rate": 9.981496567586997e-05, + "loss": 0.02651159092783928, + "num_input_tokens_seen": 21960216, + "step": 1341, + "train_runtime": 11052.5145, + "train_tokens_per_second": 1986.898 + }, + { + "epoch": 0.3717451523545706, + "grad_norm": 0.10192397236824036, + "learning_rate": 9.981458770993786e-05, + "loss": 0.018890032544732094, + "num_input_tokens_seen": 21976592, + "step": 1342, + "train_runtime": 11060.7395, + "train_tokens_per_second": 1986.901 + }, + { + "epoch": 0.37202216066481997, + "grad_norm": 0.1272159218788147, + "learning_rate": 9.981420935908485e-05, + "loss": 0.019477693364024162, + "num_input_tokens_seen": 21992968, + "step": 1343, + "train_runtime": 11068.9712, + "train_tokens_per_second": 1986.903 + }, + { + "epoch": 0.37229916897506926, + "grad_norm": 0.07652931660413742, + "learning_rate": 9.981383062331387e-05, + "loss": 0.016822630539536476, + "num_input_tokens_seen": 22009344, + "step": 1344, + "train_runtime": 11077.2027, + "train_tokens_per_second": 1986.905 + }, + { + "epoch": 0.37257617728531855, + "grad_norm": 0.11765778064727783, + "learning_rate": 9.981345150262783e-05, + "loss": 0.01890856772661209, + "num_input_tokens_seen": 22025720, + "step": 1345, + "train_runtime": 11085.419, + "train_tokens_per_second": 1986.909 + }, + { + "epoch": 0.37285318559556785, + "grad_norm": 0.11891385167837143, + "learning_rate": 9.981307199702968e-05, + "loss": 0.020184127613902092, + "num_input_tokens_seen": 22042096, + "step": 1346, + "train_runtime": 11093.6392, + "train_tokens_per_second": 1986.913 + }, + { + "epoch": 0.3731301939058172, + "grad_norm": 0.09463480859994888, + "learning_rate": 9.981269210652233e-05, + "loss": 0.018467886373400688, + "num_input_tokens_seen": 22058472, + "step": 1347, + "train_runtime": 11101.8695, + "train_tokens_per_second": 1986.915 + }, + { + "epoch": 0.3734072022160665, + "grad_norm": 0.08899486809968948, + "learning_rate": 9.981231183110873e-05, + "loss": 0.018335850909352303, + "num_input_tokens_seen": 22074848, + "step": 1348, + "train_runtime": 11110.0912, + "train_tokens_per_second": 1986.919 + }, + { + "epoch": 0.3736842105263158, + "grad_norm": 0.13486383855342865, + "learning_rate": 9.981193117079181e-05, + "loss": 0.024089990183711052, + "num_input_tokens_seen": 22091224, + "step": 1349, + "train_runtime": 11118.3086, + "train_tokens_per_second": 1986.923 + }, + { + "epoch": 0.3739612188365651, + "grad_norm": 0.1498267650604248, + "learning_rate": 9.981155012557454e-05, + "loss": 0.02387627400457859, + "num_input_tokens_seen": 22107600, + "step": 1350, + "train_runtime": 11126.5242, + "train_tokens_per_second": 1986.928 + }, + { + "epoch": 0.37423822714681443, + "grad_norm": 0.13638979196548462, + "learning_rate": 9.981116869545982e-05, + "loss": 0.024324918165802956, + "num_input_tokens_seen": 22123976, + "step": 1351, + "train_runtime": 11134.7378, + "train_tokens_per_second": 1986.933 + }, + { + "epoch": 0.3745152354570637, + "grad_norm": 0.1565149426460266, + "learning_rate": 9.981078688045062e-05, + "loss": 0.022280458360910416, + "num_input_tokens_seen": 22140352, + "step": 1352, + "train_runtime": 11142.9542, + "train_tokens_per_second": 1986.937 + }, + { + "epoch": 0.374792243767313, + "grad_norm": 0.08077062666416168, + "learning_rate": 9.98104046805499e-05, + "loss": 0.022112878039479256, + "num_input_tokens_seen": 22156728, + "step": 1353, + "train_runtime": 11151.1634, + "train_tokens_per_second": 1986.943 + }, + { + "epoch": 0.3750692520775623, + "grad_norm": 0.3415915369987488, + "learning_rate": 9.98100220957606e-05, + "loss": 0.02181086130440235, + "num_input_tokens_seen": 22173104, + "step": 1354, + "train_runtime": 11159.3732, + "train_tokens_per_second": 1986.949 + }, + { + "epoch": 0.37534626038781166, + "grad_norm": 0.09059765934944153, + "learning_rate": 9.980963912608568e-05, + "loss": 0.019478725269436836, + "num_input_tokens_seen": 22189480, + "step": 1355, + "train_runtime": 11167.5952, + "train_tokens_per_second": 1986.952 + }, + { + "epoch": 0.37562326869806095, + "grad_norm": 0.11201357841491699, + "learning_rate": 9.980925577152811e-05, + "loss": 0.018537871539592743, + "num_input_tokens_seen": 22205856, + "step": 1356, + "train_runtime": 11175.8149, + "train_tokens_per_second": 1986.956 + }, + { + "epoch": 0.37590027700831025, + "grad_norm": 0.08679451048374176, + "learning_rate": 9.980887203209081e-05, + "loss": 0.01935119926929474, + "num_input_tokens_seen": 22222232, + "step": 1357, + "train_runtime": 11184.046, + "train_tokens_per_second": 1986.958 + }, + { + "epoch": 0.37617728531855954, + "grad_norm": 0.07966191321611404, + "learning_rate": 9.98084879077768e-05, + "loss": 0.014826874248683453, + "num_input_tokens_seen": 22238608, + "step": 1358, + "train_runtime": 11192.2803, + "train_tokens_per_second": 1986.96 + }, + { + "epoch": 0.3764542936288089, + "grad_norm": 0.11868930608034134, + "learning_rate": 9.980810339858901e-05, + "loss": 0.01933567225933075, + "num_input_tokens_seen": 22254984, + "step": 1359, + "train_runtime": 11200.5132, + "train_tokens_per_second": 1986.961 + }, + { + "epoch": 0.3767313019390582, + "grad_norm": 0.122674360871315, + "learning_rate": 9.980771850453044e-05, + "loss": 0.020328037440776825, + "num_input_tokens_seen": 22271360, + "step": 1360, + "train_runtime": 11208.7422, + "train_tokens_per_second": 1986.963 + }, + { + "epoch": 0.3770083102493075, + "grad_norm": 0.14918574690818787, + "learning_rate": 9.980733322560405e-05, + "loss": 0.0229499414563179, + "num_input_tokens_seen": 22287736, + "step": 1361, + "train_runtime": 11216.9872, + "train_tokens_per_second": 1986.963 + }, + { + "epoch": 0.37728531855955677, + "grad_norm": 0.15004956722259521, + "learning_rate": 9.980694756181279e-05, + "loss": 0.01989992894232273, + "num_input_tokens_seen": 22304112, + "step": 1362, + "train_runtime": 11225.2235, + "train_tokens_per_second": 1986.964 + }, + { + "epoch": 0.3775623268698061, + "grad_norm": 0.11533299833536148, + "learning_rate": 9.980656151315969e-05, + "loss": 0.02007365971803665, + "num_input_tokens_seen": 22320488, + "step": 1363, + "train_runtime": 11233.4558, + "train_tokens_per_second": 1986.965 + }, + { + "epoch": 0.3778393351800554, + "grad_norm": 0.11958016455173492, + "learning_rate": 9.98061750796477e-05, + "loss": 0.018390310928225517, + "num_input_tokens_seen": 22336864, + "step": 1364, + "train_runtime": 11241.6795, + "train_tokens_per_second": 1986.969 + }, + { + "epoch": 0.3781163434903047, + "grad_norm": 0.10220043361186981, + "learning_rate": 9.980578826127981e-05, + "loss": 0.02276994287967682, + "num_input_tokens_seen": 22353240, + "step": 1365, + "train_runtime": 11249.9021, + "train_tokens_per_second": 1986.972 + }, + { + "epoch": 0.378393351800554, + "grad_norm": 0.15894261002540588, + "learning_rate": 9.980540105805903e-05, + "loss": 0.021187057718634605, + "num_input_tokens_seen": 22369616, + "step": 1366, + "train_runtime": 11258.1259, + "train_tokens_per_second": 1986.975 + }, + { + "epoch": 0.37867036011080335, + "grad_norm": 0.10818879306316376, + "learning_rate": 9.980501346998833e-05, + "loss": 0.017390308901667595, + "num_input_tokens_seen": 22385992, + "step": 1367, + "train_runtime": 11266.3555, + "train_tokens_per_second": 1986.977 + }, + { + "epoch": 0.37894736842105264, + "grad_norm": 0.13749060034751892, + "learning_rate": 9.98046254970707e-05, + "loss": 0.02100571244955063, + "num_input_tokens_seen": 22402368, + "step": 1368, + "train_runtime": 11274.5788, + "train_tokens_per_second": 1986.98 + }, + { + "epoch": 0.37922437673130194, + "grad_norm": 0.12322071194648743, + "learning_rate": 9.980423713930916e-05, + "loss": 0.018876727670431137, + "num_input_tokens_seen": 22418744, + "step": 1369, + "train_runtime": 11282.7969, + "train_tokens_per_second": 1986.985 + }, + { + "epoch": 0.37950138504155123, + "grad_norm": 0.11222860217094421, + "learning_rate": 9.980384839670671e-05, + "loss": 0.018486885353922844, + "num_input_tokens_seen": 22435120, + "step": 1370, + "train_runtime": 11291.0039, + "train_tokens_per_second": 1986.991 + }, + { + "epoch": 0.3797783933518006, + "grad_norm": 0.10942346602678299, + "learning_rate": 9.980345926926633e-05, + "loss": 0.01951097510755062, + "num_input_tokens_seen": 22451496, + "step": 1371, + "train_runtime": 11299.2179, + "train_tokens_per_second": 1986.996 + }, + { + "epoch": 0.3800554016620499, + "grad_norm": 0.09594390541315079, + "learning_rate": 9.980306975699103e-05, + "loss": 0.017540931701660156, + "num_input_tokens_seen": 22467872, + "step": 1372, + "train_runtime": 11307.4256, + "train_tokens_per_second": 1987.002 + }, + { + "epoch": 0.38033240997229917, + "grad_norm": 0.14338555932044983, + "learning_rate": 9.980267985988385e-05, + "loss": 0.017609575763344765, + "num_input_tokens_seen": 22484248, + "step": 1373, + "train_runtime": 11315.6363, + "train_tokens_per_second": 1987.007 + }, + { + "epoch": 0.38060941828254846, + "grad_norm": 0.1728397160768509, + "learning_rate": 9.980228957794777e-05, + "loss": 0.027312347665429115, + "num_input_tokens_seen": 22500624, + "step": 1374, + "train_runtime": 11323.8431, + "train_tokens_per_second": 1987.013 + }, + { + "epoch": 0.3808864265927978, + "grad_norm": 0.11137452721595764, + "learning_rate": 9.980189891118583e-05, + "loss": 0.01847725920379162, + "num_input_tokens_seen": 22517000, + "step": 1375, + "train_runtime": 11332.0556, + "train_tokens_per_second": 1987.018 + }, + { + "epoch": 0.3811634349030471, + "grad_norm": 0.12137161940336227, + "learning_rate": 9.980150785960103e-05, + "loss": 0.02072913385927677, + "num_input_tokens_seen": 22533376, + "step": 1376, + "train_runtime": 11340.2659, + "train_tokens_per_second": 1987.024 + }, + { + "epoch": 0.3814404432132964, + "grad_norm": 0.12719029188156128, + "learning_rate": 9.98011164231964e-05, + "loss": 0.021218780428171158, + "num_input_tokens_seen": 22549752, + "step": 1377, + "train_runtime": 11348.4719, + "train_tokens_per_second": 1987.03 + }, + { + "epoch": 0.3817174515235457, + "grad_norm": 0.13128651678562164, + "learning_rate": 9.980072460197497e-05, + "loss": 0.022072983905673027, + "num_input_tokens_seen": 22566128, + "step": 1378, + "train_runtime": 11356.681, + "train_tokens_per_second": 1987.035 + }, + { + "epoch": 0.38199445983379504, + "grad_norm": 0.08374883234500885, + "learning_rate": 9.980033239593975e-05, + "loss": 0.017331020906567574, + "num_input_tokens_seen": 22582504, + "step": 1379, + "train_runtime": 11364.8843, + "train_tokens_per_second": 1987.042 + }, + { + "epoch": 0.38227146814404434, + "grad_norm": 0.14907202124595642, + "learning_rate": 9.979993980509378e-05, + "loss": 0.022423163056373596, + "num_input_tokens_seen": 22598880, + "step": 1380, + "train_runtime": 11373.101, + "train_tokens_per_second": 1987.046 + }, + { + "epoch": 0.38254847645429363, + "grad_norm": 0.08771399408578873, + "learning_rate": 9.979954682944011e-05, + "loss": 0.016922950744628906, + "num_input_tokens_seen": 22615256, + "step": 1381, + "train_runtime": 11381.5318, + "train_tokens_per_second": 1987.013 + }, + { + "epoch": 0.3828254847645429, + "grad_norm": 0.1032857671380043, + "learning_rate": 9.979915346898176e-05, + "loss": 0.021863378584384918, + "num_input_tokens_seen": 22631632, + "step": 1382, + "train_runtime": 11389.7731, + "train_tokens_per_second": 1987.013 + }, + { + "epoch": 0.3831024930747922, + "grad_norm": 0.15037326514720917, + "learning_rate": 9.979875972372175e-05, + "loss": 0.022166360169649124, + "num_input_tokens_seen": 22648008, + "step": 1383, + "train_runtime": 11397.9826, + "train_tokens_per_second": 1987.019 + }, + { + "epoch": 0.38337950138504157, + "grad_norm": 0.0930008664727211, + "learning_rate": 9.979836559366318e-05, + "loss": 0.016094369813799858, + "num_input_tokens_seen": 22664384, + "step": 1384, + "train_runtime": 11406.1924, + "train_tokens_per_second": 1987.025 + }, + { + "epoch": 0.38365650969529086, + "grad_norm": 0.11169441044330597, + "learning_rate": 9.979797107880903e-05, + "loss": 0.02111062966287136, + "num_input_tokens_seen": 22680760, + "step": 1385, + "train_runtime": 11414.4025, + "train_tokens_per_second": 1987.03 + }, + { + "epoch": 0.38393351800554015, + "grad_norm": 0.1369129866361618, + "learning_rate": 9.979757617916239e-05, + "loss": 0.02662837877869606, + "num_input_tokens_seen": 22697136, + "step": 1386, + "train_runtime": 11422.6205, + "train_tokens_per_second": 1987.034 + }, + { + "epoch": 0.38421052631578945, + "grad_norm": 0.11653035879135132, + "learning_rate": 9.979718089472629e-05, + "loss": 0.01987096481025219, + "num_input_tokens_seen": 22713512, + "step": 1387, + "train_runtime": 11430.846, + "train_tokens_per_second": 1987.037 + }, + { + "epoch": 0.3844875346260388, + "grad_norm": 0.09672946482896805, + "learning_rate": 9.979678522550382e-05, + "loss": 0.022180398926138878, + "num_input_tokens_seen": 22729888, + "step": 1388, + "train_runtime": 11439.0709, + "train_tokens_per_second": 1987.04 + }, + { + "epoch": 0.3847645429362881, + "grad_norm": 0.1379476636648178, + "learning_rate": 9.979638917149798e-05, + "loss": 0.02158326655626297, + "num_input_tokens_seen": 22746264, + "step": 1389, + "train_runtime": 11447.2976, + "train_tokens_per_second": 1987.042 + }, + { + "epoch": 0.3850415512465374, + "grad_norm": 0.10146293044090271, + "learning_rate": 9.979599273271187e-05, + "loss": 0.019252676516771317, + "num_input_tokens_seen": 22762640, + "step": 1390, + "train_runtime": 11455.5248, + "train_tokens_per_second": 1987.045 + }, + { + "epoch": 0.3853185595567867, + "grad_norm": 0.14928007125854492, + "learning_rate": 9.979559590914855e-05, + "loss": 0.021237527951598167, + "num_input_tokens_seen": 22779016, + "step": 1391, + "train_runtime": 11463.7564, + "train_tokens_per_second": 1987.046 + }, + { + "epoch": 0.385595567867036, + "grad_norm": 0.10893085598945618, + "learning_rate": 9.979519870081108e-05, + "loss": 0.018530234694480896, + "num_input_tokens_seen": 22795392, + "step": 1392, + "train_runtime": 11471.9786, + "train_tokens_per_second": 1987.05 + }, + { + "epoch": 0.3858725761772853, + "grad_norm": 0.15201523900032043, + "learning_rate": 9.979480110770254e-05, + "loss": 0.024779994040727615, + "num_input_tokens_seen": 22811768, + "step": 1393, + "train_runtime": 11480.2146, + "train_tokens_per_second": 1987.051 + }, + { + "epoch": 0.3861495844875346, + "grad_norm": 0.121940977871418, + "learning_rate": 9.979440312982599e-05, + "loss": 0.022237403318285942, + "num_input_tokens_seen": 22828144, + "step": 1394, + "train_runtime": 11488.4398, + "train_tokens_per_second": 1987.053 + }, + { + "epoch": 0.3864265927977839, + "grad_norm": 0.1425483673810959, + "learning_rate": 9.97940047671845e-05, + "loss": 0.019074540585279465, + "num_input_tokens_seen": 22844520, + "step": 1395, + "train_runtime": 11496.6686, + "train_tokens_per_second": 1987.056 + }, + { + "epoch": 0.38670360110803326, + "grad_norm": 0.10753645747900009, + "learning_rate": 9.979360601978116e-05, + "loss": 0.018136750906705856, + "num_input_tokens_seen": 22860896, + "step": 1396, + "train_runtime": 11504.8867, + "train_tokens_per_second": 1987.06 + }, + { + "epoch": 0.38698060941828255, + "grad_norm": 0.14524301886558533, + "learning_rate": 9.979320688761904e-05, + "loss": 0.021872155368328094, + "num_input_tokens_seen": 22877272, + "step": 1397, + "train_runtime": 11513.1111, + "train_tokens_per_second": 1987.063 + }, + { + "epoch": 0.38725761772853184, + "grad_norm": 0.09464802592992783, + "learning_rate": 9.979280737070124e-05, + "loss": 0.01961708813905716, + "num_input_tokens_seen": 22893648, + "step": 1398, + "train_runtime": 11521.3365, + "train_tokens_per_second": 1987.065 + }, + { + "epoch": 0.38753462603878114, + "grad_norm": 0.10570830851793289, + "learning_rate": 9.979240746903084e-05, + "loss": 0.018019825220108032, + "num_input_tokens_seen": 22910024, + "step": 1399, + "train_runtime": 11529.5657, + "train_tokens_per_second": 1987.067 + }, + { + "epoch": 0.3878116343490305, + "grad_norm": 0.11284946650266647, + "learning_rate": 9.979200718261092e-05, + "loss": 0.01737750694155693, + "num_input_tokens_seen": 22926400, + "step": 1400, + "train_runtime": 11537.7852, + "train_tokens_per_second": 1987.071 + }, + { + "epoch": 0.3880886426592798, + "grad_norm": 0.10750672221183777, + "learning_rate": 9.979160651144459e-05, + "loss": 0.018947161734104156, + "num_input_tokens_seen": 22942776, + "step": 1401, + "train_runtime": 11547.532, + "train_tokens_per_second": 1986.812 + }, + { + "epoch": 0.3883656509695291, + "grad_norm": 0.14042726159095764, + "learning_rate": 9.979120545553493e-05, + "loss": 0.01879994384944439, + "num_input_tokens_seen": 22959152, + "step": 1402, + "train_runtime": 11555.7326, + "train_tokens_per_second": 1986.819 + }, + { + "epoch": 0.38864265927977837, + "grad_norm": 0.10780936479568481, + "learning_rate": 9.979080401488506e-05, + "loss": 0.019660508260130882, + "num_input_tokens_seen": 22975528, + "step": 1403, + "train_runtime": 11563.9418, + "train_tokens_per_second": 1986.825 + }, + { + "epoch": 0.3889196675900277, + "grad_norm": 0.10281259566545486, + "learning_rate": 9.979040218949805e-05, + "loss": 0.02043335698544979, + "num_input_tokens_seen": 22991904, + "step": 1404, + "train_runtime": 11572.1657, + "train_tokens_per_second": 1986.828 + }, + { + "epoch": 0.389196675900277, + "grad_norm": 0.08577962964773178, + "learning_rate": 9.978999997937703e-05, + "loss": 0.017152296379208565, + "num_input_tokens_seen": 23008280, + "step": 1405, + "train_runtime": 11580.3885, + "train_tokens_per_second": 1986.831 + }, + { + "epoch": 0.3894736842105263, + "grad_norm": 0.09565763920545578, + "learning_rate": 9.97895973845251e-05, + "loss": 0.019972028210759163, + "num_input_tokens_seen": 23024656, + "step": 1406, + "train_runtime": 11588.6201, + "train_tokens_per_second": 1986.833 + }, + { + "epoch": 0.3897506925207756, + "grad_norm": 0.1691974550485611, + "learning_rate": 9.978919440494539e-05, + "loss": 0.019520839676260948, + "num_input_tokens_seen": 23041032, + "step": 1407, + "train_runtime": 11596.8577, + "train_tokens_per_second": 1986.834 + }, + { + "epoch": 0.39002770083102495, + "grad_norm": 0.12567900121212006, + "learning_rate": 9.978879104064098e-05, + "loss": 0.018341219052672386, + "num_input_tokens_seen": 23057408, + "step": 1408, + "train_runtime": 11605.0961, + "train_tokens_per_second": 1986.835 + }, + { + "epoch": 0.39030470914127424, + "grad_norm": 0.12106873095035553, + "learning_rate": 9.9788387291615e-05, + "loss": 0.02166348323225975, + "num_input_tokens_seen": 23073784, + "step": 1409, + "train_runtime": 11613.3267, + "train_tokens_per_second": 1986.837 + }, + { + "epoch": 0.39058171745152354, + "grad_norm": 0.10300263017416, + "learning_rate": 9.978798315787057e-05, + "loss": 0.015929456800222397, + "num_input_tokens_seen": 23090160, + "step": 1410, + "train_runtime": 11621.555, + "train_tokens_per_second": 1986.839 + }, + { + "epoch": 0.39085872576177283, + "grad_norm": 0.1328110545873642, + "learning_rate": 9.978757863941081e-05, + "loss": 0.019724003970623016, + "num_input_tokens_seen": 23106536, + "step": 1411, + "train_runtime": 11629.7903, + "train_tokens_per_second": 1986.84 + }, + { + "epoch": 0.3911357340720222, + "grad_norm": 0.12087315320968628, + "learning_rate": 9.978717373623888e-05, + "loss": 0.021558932960033417, + "num_input_tokens_seen": 23122912, + "step": 1412, + "train_runtime": 11638.023, + "train_tokens_per_second": 1986.842 + }, + { + "epoch": 0.3914127423822715, + "grad_norm": 0.10661081969738007, + "learning_rate": 9.978676844835785e-05, + "loss": 0.017588024958968163, + "num_input_tokens_seen": 23139288, + "step": 1413, + "train_runtime": 11646.2553, + "train_tokens_per_second": 1986.844 + }, + { + "epoch": 0.39168975069252077, + "grad_norm": 0.13145245611667633, + "learning_rate": 9.978636277577091e-05, + "loss": 0.01931574195623398, + "num_input_tokens_seen": 23155664, + "step": 1414, + "train_runtime": 11654.4876, + "train_tokens_per_second": 1986.845 + }, + { + "epoch": 0.39196675900277006, + "grad_norm": 0.13352163136005402, + "learning_rate": 9.978595671848114e-05, + "loss": 0.02091841772198677, + "num_input_tokens_seen": 23172040, + "step": 1415, + "train_runtime": 11662.7136, + "train_tokens_per_second": 1986.848 + }, + { + "epoch": 0.3922437673130194, + "grad_norm": 0.13328449428081512, + "learning_rate": 9.978555027649173e-05, + "loss": 0.023572934791445732, + "num_input_tokens_seen": 23188416, + "step": 1416, + "train_runtime": 11670.9547, + "train_tokens_per_second": 1986.848 + }, + { + "epoch": 0.3925207756232687, + "grad_norm": 0.11556278169155121, + "learning_rate": 9.978514344980578e-05, + "loss": 0.020245224237442017, + "num_input_tokens_seen": 23204792, + "step": 1417, + "train_runtime": 11679.1828, + "train_tokens_per_second": 1986.851 + }, + { + "epoch": 0.392797783933518, + "grad_norm": 0.12527261674404144, + "learning_rate": 9.978473623842644e-05, + "loss": 0.02090843766927719, + "num_input_tokens_seen": 23221168, + "step": 1418, + "train_runtime": 11687.4247, + "train_tokens_per_second": 1986.851 + }, + { + "epoch": 0.3930747922437673, + "grad_norm": 0.12378775328397751, + "learning_rate": 9.978432864235688e-05, + "loss": 0.019817322492599487, + "num_input_tokens_seen": 23237544, + "step": 1419, + "train_runtime": 11695.6575, + "train_tokens_per_second": 1986.852 + }, + { + "epoch": 0.39335180055401664, + "grad_norm": 0.07333667576313019, + "learning_rate": 9.978392066160022e-05, + "loss": 0.01585201919078827, + "num_input_tokens_seen": 23253920, + "step": 1420, + "train_runtime": 11703.8828, + "train_tokens_per_second": 1986.855 + }, + { + "epoch": 0.39362880886426593, + "grad_norm": 0.16389329731464386, + "learning_rate": 9.978351229615963e-05, + "loss": 0.02258925884962082, + "num_input_tokens_seen": 23270296, + "step": 1421, + "train_runtime": 11712.1031, + "train_tokens_per_second": 1986.859 + }, + { + "epoch": 0.3939058171745152, + "grad_norm": 0.11368757486343384, + "learning_rate": 9.978310354603828e-05, + "loss": 0.01875455677509308, + "num_input_tokens_seen": 23286672, + "step": 1422, + "train_runtime": 11720.334, + "train_tokens_per_second": 1986.861 + }, + { + "epoch": 0.3941828254847645, + "grad_norm": 0.10353541374206543, + "learning_rate": 9.978269441123929e-05, + "loss": 0.016394823789596558, + "num_input_tokens_seen": 23303048, + "step": 1423, + "train_runtime": 11728.5566, + "train_tokens_per_second": 1986.864 + }, + { + "epoch": 0.39445983379501387, + "grad_norm": 0.1020311787724495, + "learning_rate": 9.978228489176585e-05, + "loss": 0.02134857513010502, + "num_input_tokens_seen": 23319424, + "step": 1424, + "train_runtime": 11736.7888, + "train_tokens_per_second": 1986.866 + }, + { + "epoch": 0.39473684210526316, + "grad_norm": 0.1724388152360916, + "learning_rate": 9.978187498762112e-05, + "loss": 0.02224324643611908, + "num_input_tokens_seen": 23335800, + "step": 1425, + "train_runtime": 11745.0201, + "train_tokens_per_second": 1986.868 + }, + { + "epoch": 0.39501385041551246, + "grad_norm": 0.12591984868049622, + "learning_rate": 9.978146469880824e-05, + "loss": 0.020153608173131943, + "num_input_tokens_seen": 23352176, + "step": 1426, + "train_runtime": 11753.2444, + "train_tokens_per_second": 1986.871 + }, + { + "epoch": 0.39529085872576175, + "grad_norm": 0.08962909132242203, + "learning_rate": 9.978105402533045e-05, + "loss": 0.014975407160818577, + "num_input_tokens_seen": 23368552, + "step": 1427, + "train_runtime": 11761.4685, + "train_tokens_per_second": 1986.874 + }, + { + "epoch": 0.3955678670360111, + "grad_norm": 0.11404858529567719, + "learning_rate": 9.978064296719082e-05, + "loss": 0.020581895485520363, + "num_input_tokens_seen": 23384928, + "step": 1428, + "train_runtime": 11769.6985, + "train_tokens_per_second": 1986.876 + }, + { + "epoch": 0.3958448753462604, + "grad_norm": 0.10441016405820847, + "learning_rate": 9.978023152439263e-05, + "loss": 0.017258424311876297, + "num_input_tokens_seen": 23401304, + "step": 1429, + "train_runtime": 11777.9307, + "train_tokens_per_second": 1986.877 + }, + { + "epoch": 0.3961218836565097, + "grad_norm": 0.1659630984067917, + "learning_rate": 9.9779819696939e-05, + "loss": 0.02398928627371788, + "num_input_tokens_seen": 23417680, + "step": 1430, + "train_runtime": 11786.1579, + "train_tokens_per_second": 1986.88 + }, + { + "epoch": 0.396398891966759, + "grad_norm": 0.13021285831928253, + "learning_rate": 9.977940748483312e-05, + "loss": 0.020423542708158493, + "num_input_tokens_seen": 23434056, + "step": 1431, + "train_runtime": 11794.3888, + "train_tokens_per_second": 1986.882 + }, + { + "epoch": 0.39667590027700833, + "grad_norm": 0.1574818342924118, + "learning_rate": 9.977899488807818e-05, + "loss": 0.02716538868844509, + "num_input_tokens_seen": 23450432, + "step": 1432, + "train_runtime": 11802.6094, + "train_tokens_per_second": 1986.885 + }, + { + "epoch": 0.3969529085872576, + "grad_norm": 0.0889403373003006, + "learning_rate": 9.977858190667738e-05, + "loss": 0.019342580810189247, + "num_input_tokens_seen": 23466808, + "step": 1433, + "train_runtime": 11810.8277, + "train_tokens_per_second": 1986.889 + }, + { + "epoch": 0.3972299168975069, + "grad_norm": 0.1303287297487259, + "learning_rate": 9.977816854063389e-05, + "loss": 0.025119906291365623, + "num_input_tokens_seen": 23483184, + "step": 1434, + "train_runtime": 11819.0468, + "train_tokens_per_second": 1986.893 + }, + { + "epoch": 0.3975069252077562, + "grad_norm": 0.11135120689868927, + "learning_rate": 9.977775478995091e-05, + "loss": 0.01818278804421425, + "num_input_tokens_seen": 23499560, + "step": 1435, + "train_runtime": 11827.275, + "train_tokens_per_second": 1986.896 + }, + { + "epoch": 0.39778393351800556, + "grad_norm": 0.1384686827659607, + "learning_rate": 9.977734065463165e-05, + "loss": 0.024331215769052505, + "num_input_tokens_seen": 23515936, + "step": 1436, + "train_runtime": 11835.5054, + "train_tokens_per_second": 1986.897 + }, + { + "epoch": 0.39806094182825486, + "grad_norm": 0.1013227179646492, + "learning_rate": 9.977692613467928e-05, + "loss": 0.016748948022723198, + "num_input_tokens_seen": 23532312, + "step": 1437, + "train_runtime": 11843.7351, + "train_tokens_per_second": 1986.9 + }, + { + "epoch": 0.39833795013850415, + "grad_norm": 0.13221092522144318, + "learning_rate": 9.977651123009705e-05, + "loss": 0.021303439512848854, + "num_input_tokens_seen": 23548688, + "step": 1438, + "train_runtime": 11851.959, + "train_tokens_per_second": 1986.903 + }, + { + "epoch": 0.39861495844875344, + "grad_norm": 0.11111052334308624, + "learning_rate": 9.977609594088813e-05, + "loss": 0.01770632527768612, + "num_input_tokens_seen": 23565064, + "step": 1439, + "train_runtime": 11860.1821, + "train_tokens_per_second": 1986.906 + }, + { + "epoch": 0.3988919667590028, + "grad_norm": 0.10395681113004684, + "learning_rate": 9.977568026705574e-05, + "loss": 0.017903007566928864, + "num_input_tokens_seen": 23581440, + "step": 1440, + "train_runtime": 11868.4088, + "train_tokens_per_second": 1986.908 + }, + { + "epoch": 0.3991689750692521, + "grad_norm": 0.10112018883228302, + "learning_rate": 9.977526420860307e-05, + "loss": 0.015175897628068924, + "num_input_tokens_seen": 23597816, + "step": 1441, + "train_runtime": 11876.6383, + "train_tokens_per_second": 1986.91 + }, + { + "epoch": 0.3994459833795014, + "grad_norm": 0.09462549537420273, + "learning_rate": 9.977484776553339e-05, + "loss": 0.01745697297155857, + "num_input_tokens_seen": 23614192, + "step": 1442, + "train_runtime": 11884.8665, + "train_tokens_per_second": 1986.913 + }, + { + "epoch": 0.3997229916897507, + "grad_norm": 0.09717104583978653, + "learning_rate": 9.977443093784987e-05, + "loss": 0.01577497087419033, + "num_input_tokens_seen": 23630568, + "step": 1443, + "train_runtime": 11893.0942, + "train_tokens_per_second": 1986.915 + }, + { + "epoch": 0.4, + "grad_norm": 0.14647357165813446, + "learning_rate": 9.977401372555575e-05, + "loss": 0.02124577946960926, + "num_input_tokens_seen": 23646944, + "step": 1444, + "train_runtime": 11901.3252, + "train_tokens_per_second": 1986.917 + }, + { + "epoch": 0.4002770083102493, + "grad_norm": 0.16520008444786072, + "learning_rate": 9.977359612865423e-05, + "loss": 0.022104062139987946, + "num_input_tokens_seen": 23663320, + "step": 1445, + "train_runtime": 11909.5587, + "train_tokens_per_second": 1986.918 + }, + { + "epoch": 0.4005540166204986, + "grad_norm": 0.0857977569103241, + "learning_rate": 9.977317814714857e-05, + "loss": 0.019577205181121826, + "num_input_tokens_seen": 23679696, + "step": 1446, + "train_runtime": 11917.785, + "train_tokens_per_second": 1986.921 + }, + { + "epoch": 0.4008310249307479, + "grad_norm": 0.09424147009849548, + "learning_rate": 9.977275978104199e-05, + "loss": 0.01711466908454895, + "num_input_tokens_seen": 23696072, + "step": 1447, + "train_runtime": 11926.0135, + "train_tokens_per_second": 1986.923 + }, + { + "epoch": 0.40110803324099725, + "grad_norm": 0.11239974200725555, + "learning_rate": 9.97723410303377e-05, + "loss": 0.01549205370247364, + "num_input_tokens_seen": 23712448, + "step": 1448, + "train_runtime": 11934.2353, + "train_tokens_per_second": 1986.926 + }, + { + "epoch": 0.40138504155124655, + "grad_norm": 0.13988250494003296, + "learning_rate": 9.977192189503896e-05, + "loss": 0.02351415902376175, + "num_input_tokens_seen": 23728824, + "step": 1449, + "train_runtime": 11942.4656, + "train_tokens_per_second": 1986.928 + }, + { + "epoch": 0.40166204986149584, + "grad_norm": 0.10761825740337372, + "learning_rate": 9.977150237514901e-05, + "loss": 0.017015304416418076, + "num_input_tokens_seen": 23745200, + "step": 1450, + "train_runtime": 11950.6789, + "train_tokens_per_second": 1986.933 + }, + { + "epoch": 0.40193905817174513, + "grad_norm": 0.09676454961299896, + "learning_rate": 9.977108247067108e-05, + "loss": 0.017625859007239342, + "num_input_tokens_seen": 23761576, + "step": 1451, + "train_runtime": 11958.8919, + "train_tokens_per_second": 1986.938 + }, + { + "epoch": 0.4022160664819945, + "grad_norm": 0.14228302240371704, + "learning_rate": 9.977066218160842e-05, + "loss": 0.019920777529478073, + "num_input_tokens_seen": 23777952, + "step": 1452, + "train_runtime": 11967.1106, + "train_tokens_per_second": 1986.942 + }, + { + "epoch": 0.4024930747922438, + "grad_norm": 0.15242205560207367, + "learning_rate": 9.977024150796426e-05, + "loss": 0.016867468133568764, + "num_input_tokens_seen": 23794328, + "step": 1453, + "train_runtime": 11975.3387, + "train_tokens_per_second": 1986.944 + }, + { + "epoch": 0.40277008310249307, + "grad_norm": 0.10874484479427338, + "learning_rate": 9.976982044974188e-05, + "loss": 0.018069732934236526, + "num_input_tokens_seen": 23810704, + "step": 1454, + "train_runtime": 11983.5585, + "train_tokens_per_second": 1986.948 + }, + { + "epoch": 0.40304709141274236, + "grad_norm": 0.08135002106428146, + "learning_rate": 9.976939900694454e-05, + "loss": 0.017346663400530815, + "num_input_tokens_seen": 23827080, + "step": 1455, + "train_runtime": 11991.788, + "train_tokens_per_second": 1986.95 + }, + { + "epoch": 0.4033240997229917, + "grad_norm": 0.12163814902305603, + "learning_rate": 9.976897717957544e-05, + "loss": 0.017803296446800232, + "num_input_tokens_seen": 23843456, + "step": 1456, + "train_runtime": 12000.0117, + "train_tokens_per_second": 1986.953 + }, + { + "epoch": 0.403601108033241, + "grad_norm": 0.10590958595275879, + "learning_rate": 9.976855496763788e-05, + "loss": 0.015565024688839912, + "num_input_tokens_seen": 23859832, + "step": 1457, + "train_runtime": 12008.2317, + "train_tokens_per_second": 1986.956 + }, + { + "epoch": 0.4038781163434903, + "grad_norm": 0.08834324032068253, + "learning_rate": 9.976813237113514e-05, + "loss": 0.01741892658174038, + "num_input_tokens_seen": 23876208, + "step": 1458, + "train_runtime": 12016.4555, + "train_tokens_per_second": 1986.959 + }, + { + "epoch": 0.4041551246537396, + "grad_norm": 0.12535062432289124, + "learning_rate": 9.976770939007046e-05, + "loss": 0.025454003363847733, + "num_input_tokens_seen": 23892584, + "step": 1459, + "train_runtime": 12024.684, + "train_tokens_per_second": 1986.961 + }, + { + "epoch": 0.40443213296398894, + "grad_norm": 0.14856038987636566, + "learning_rate": 9.97672860244471e-05, + "loss": 0.015978828072547913, + "num_input_tokens_seen": 23908960, + "step": 1460, + "train_runtime": 12032.9093, + "train_tokens_per_second": 1986.964 + }, + { + "epoch": 0.40470914127423824, + "grad_norm": 0.07467913627624512, + "learning_rate": 9.976686227426835e-05, + "loss": 0.017224032431840897, + "num_input_tokens_seen": 23925336, + "step": 1461, + "train_runtime": 12041.1343, + "train_tokens_per_second": 1986.967 + }, + { + "epoch": 0.40498614958448753, + "grad_norm": 0.14212077856063843, + "learning_rate": 9.976643813953747e-05, + "loss": 0.023701708763837814, + "num_input_tokens_seen": 23941712, + "step": 1462, + "train_runtime": 12049.3657, + "train_tokens_per_second": 1986.969 + }, + { + "epoch": 0.4052631578947368, + "grad_norm": 0.09268060326576233, + "learning_rate": 9.976601362025777e-05, + "loss": 0.022813329473137856, + "num_input_tokens_seen": 23958088, + "step": 1463, + "train_runtime": 12057.5913, + "train_tokens_per_second": 1986.971 + }, + { + "epoch": 0.4055401662049862, + "grad_norm": 0.10124783217906952, + "learning_rate": 9.976558871643249e-05, + "loss": 0.019459903240203857, + "num_input_tokens_seen": 23974464, + "step": 1464, + "train_runtime": 12065.819, + "train_tokens_per_second": 1986.974 + }, + { + "epoch": 0.40581717451523547, + "grad_norm": 0.09711147844791412, + "learning_rate": 9.976516342806492e-05, + "loss": 0.015619718469679356, + "num_input_tokens_seen": 23990840, + "step": 1465, + "train_runtime": 12074.0402, + "train_tokens_per_second": 1986.977 + }, + { + "epoch": 0.40609418282548476, + "grad_norm": 0.10615287721157074, + "learning_rate": 9.976473775515837e-05, + "loss": 0.019735613837838173, + "num_input_tokens_seen": 24007216, + "step": 1466, + "train_runtime": 12082.2689, + "train_tokens_per_second": 1986.979 + }, + { + "epoch": 0.40637119113573406, + "grad_norm": 0.1439591348171234, + "learning_rate": 9.97643116977161e-05, + "loss": 0.019362367689609528, + "num_input_tokens_seen": 24023592, + "step": 1467, + "train_runtime": 12090.4886, + "train_tokens_per_second": 1986.983 + }, + { + "epoch": 0.4066481994459834, + "grad_norm": 0.10864324122667313, + "learning_rate": 9.976388525574143e-05, + "loss": 0.020369775593280792, + "num_input_tokens_seen": 24039968, + "step": 1468, + "train_runtime": 12098.7013, + "train_tokens_per_second": 1986.987 + }, + { + "epoch": 0.4069252077562327, + "grad_norm": 0.10514485836029053, + "learning_rate": 9.976345842923763e-05, + "loss": 0.01761981099843979, + "num_input_tokens_seen": 24056344, + "step": 1469, + "train_runtime": 12106.9269, + "train_tokens_per_second": 1986.99 + }, + { + "epoch": 0.407202216066482, + "grad_norm": 0.1198030412197113, + "learning_rate": 9.976303121820803e-05, + "loss": 0.019808705896139145, + "num_input_tokens_seen": 24072720, + "step": 1470, + "train_runtime": 12115.1558, + "train_tokens_per_second": 1986.992 + }, + { + "epoch": 0.4074792243767313, + "grad_norm": 0.12656067311763763, + "learning_rate": 9.976260362265589e-05, + "loss": 0.017924414947628975, + "num_input_tokens_seen": 24089096, + "step": 1471, + "train_runtime": 12123.3749, + "train_tokens_per_second": 1986.996 + }, + { + "epoch": 0.40775623268698064, + "grad_norm": 0.12035638839006424, + "learning_rate": 9.976217564258454e-05, + "loss": 0.01969754695892334, + "num_input_tokens_seen": 24105472, + "step": 1472, + "train_runtime": 12131.595, + "train_tokens_per_second": 1986.999 + }, + { + "epoch": 0.40803324099722993, + "grad_norm": 0.09629876166582108, + "learning_rate": 9.976174727799728e-05, + "loss": 0.017929431051015854, + "num_input_tokens_seen": 24121848, + "step": 1473, + "train_runtime": 12139.8246, + "train_tokens_per_second": 1987.001 + }, + { + "epoch": 0.4083102493074792, + "grad_norm": 0.15497882664203644, + "learning_rate": 9.976131852889743e-05, + "loss": 0.01881234347820282, + "num_input_tokens_seen": 24138224, + "step": 1474, + "train_runtime": 12148.044, + "train_tokens_per_second": 1987.005 + }, + { + "epoch": 0.4085872576177285, + "grad_norm": 0.11241941154003143, + "learning_rate": 9.976088939528829e-05, + "loss": 0.020264316350221634, + "num_input_tokens_seen": 24154600, + "step": 1475, + "train_runtime": 12156.275, + "train_tokens_per_second": 1987.007 + }, + { + "epoch": 0.40886426592797787, + "grad_norm": 0.10788419842720032, + "learning_rate": 9.976045987717317e-05, + "loss": 0.018632281571626663, + "num_input_tokens_seen": 24170976, + "step": 1476, + "train_runtime": 12164.5008, + "train_tokens_per_second": 1987.009 + }, + { + "epoch": 0.40914127423822716, + "grad_norm": 0.12213849276304245, + "learning_rate": 9.976002997455542e-05, + "loss": 0.017743250355124474, + "num_input_tokens_seen": 24187352, + "step": 1477, + "train_runtime": 12172.7253, + "train_tokens_per_second": 1987.012 + }, + { + "epoch": 0.40941828254847645, + "grad_norm": 0.14651905000209808, + "learning_rate": 9.975959968743833e-05, + "loss": 0.01861337013542652, + "num_input_tokens_seen": 24203728, + "step": 1478, + "train_runtime": 12180.9558, + "train_tokens_per_second": 1987.014 + }, + { + "epoch": 0.40969529085872575, + "grad_norm": 0.10905776172876358, + "learning_rate": 9.975916901582525e-05, + "loss": 0.016033615916967392, + "num_input_tokens_seen": 24220104, + "step": 1479, + "train_runtime": 12189.1782, + "train_tokens_per_second": 1987.017 + }, + { + "epoch": 0.4099722991689751, + "grad_norm": 0.17321863770484924, + "learning_rate": 9.975873795971949e-05, + "loss": 0.02238447032868862, + "num_input_tokens_seen": 24236480, + "step": 1480, + "train_runtime": 12197.3995, + "train_tokens_per_second": 1987.02 + }, + { + "epoch": 0.4102493074792244, + "grad_norm": 0.14567236602306366, + "learning_rate": 9.975830651912436e-05, + "loss": 0.01872994564473629, + "num_input_tokens_seen": 24252856, + "step": 1481, + "train_runtime": 12205.6238, + "train_tokens_per_second": 1987.023 + }, + { + "epoch": 0.4105263157894737, + "grad_norm": 0.09454458206892014, + "learning_rate": 9.975787469404326e-05, + "loss": 0.016687791794538498, + "num_input_tokens_seen": 24269232, + "step": 1482, + "train_runtime": 12213.8556, + "train_tokens_per_second": 1987.025 + }, + { + "epoch": 0.410803324099723, + "grad_norm": 0.11998715996742249, + "learning_rate": 9.975744248447945e-05, + "loss": 0.01684304140508175, + "num_input_tokens_seen": 24285608, + "step": 1483, + "train_runtime": 12222.0836, + "train_tokens_per_second": 1987.027 + }, + { + "epoch": 0.4110803324099723, + "grad_norm": 0.09721237421035767, + "learning_rate": 9.975700989043633e-05, + "loss": 0.01851879619061947, + "num_input_tokens_seen": 24301984, + "step": 1484, + "train_runtime": 12230.3141, + "train_tokens_per_second": 1987.029 + }, + { + "epoch": 0.4113573407202216, + "grad_norm": 0.08585555851459503, + "learning_rate": 9.97565769119172e-05, + "loss": 0.016712235286831856, + "num_input_tokens_seen": 24318360, + "step": 1485, + "train_runtime": 12238.5432, + "train_tokens_per_second": 1987.031 + }, + { + "epoch": 0.4116343490304709, + "grad_norm": 0.11576534807682037, + "learning_rate": 9.975614354892543e-05, + "loss": 0.01649549789726734, + "num_input_tokens_seen": 24334736, + "step": 1486, + "train_runtime": 12246.777, + "train_tokens_per_second": 1987.032 + }, + { + "epoch": 0.4119113573407202, + "grad_norm": 0.09698686748743057, + "learning_rate": 9.975570980146436e-05, + "loss": 0.021939102560281754, + "num_input_tokens_seen": 24351112, + "step": 1487, + "train_runtime": 12255.0042, + "train_tokens_per_second": 1987.034 + }, + { + "epoch": 0.4121883656509695, + "grad_norm": 0.09843093901872635, + "learning_rate": 9.975527566953735e-05, + "loss": 0.017935797572135925, + "num_input_tokens_seen": 24367488, + "step": 1488, + "train_runtime": 12263.2342, + "train_tokens_per_second": 1987.036 + }, + { + "epoch": 0.41246537396121885, + "grad_norm": 0.12397830188274384, + "learning_rate": 9.975484115314775e-05, + "loss": 0.02303002029657364, + "num_input_tokens_seen": 24383864, + "step": 1489, + "train_runtime": 12271.4608, + "train_tokens_per_second": 1987.038 + }, + { + "epoch": 0.41274238227146814, + "grad_norm": 0.11422710865736008, + "learning_rate": 9.97544062522989e-05, + "loss": 0.02046043612062931, + "num_input_tokens_seen": 24400240, + "step": 1490, + "train_runtime": 12279.6858, + "train_tokens_per_second": 1987.041 + }, + { + "epoch": 0.41301939058171744, + "grad_norm": 0.12034820020198822, + "learning_rate": 9.97539709669942e-05, + "loss": 0.01920977607369423, + "num_input_tokens_seen": 24416616, + "step": 1491, + "train_runtime": 12287.9103, + "train_tokens_per_second": 1987.044 + }, + { + "epoch": 0.41329639889196673, + "grad_norm": 0.09031160175800323, + "learning_rate": 9.975353529723699e-05, + "loss": 0.017306998372077942, + "num_input_tokens_seen": 24432992, + "step": 1492, + "train_runtime": 12296.1417, + "train_tokens_per_second": 1987.045 + }, + { + "epoch": 0.4135734072022161, + "grad_norm": 0.12611347436904907, + "learning_rate": 9.975309924303063e-05, + "loss": 0.020747173577547073, + "num_input_tokens_seen": 24449368, + "step": 1493, + "train_runtime": 12304.3689, + "train_tokens_per_second": 1987.048 + }, + { + "epoch": 0.4138504155124654, + "grad_norm": 0.1171162948012352, + "learning_rate": 9.975266280437848e-05, + "loss": 0.018753645941615105, + "num_input_tokens_seen": 24465744, + "step": 1494, + "train_runtime": 12312.5975, + "train_tokens_per_second": 1987.05 + }, + { + "epoch": 0.41412742382271467, + "grad_norm": 0.11182406544685364, + "learning_rate": 9.975222598128394e-05, + "loss": 0.01824847236275673, + "num_input_tokens_seen": 24482120, + "step": 1495, + "train_runtime": 12320.8192, + "train_tokens_per_second": 1987.053 + }, + { + "epoch": 0.41440443213296396, + "grad_norm": 0.13898223638534546, + "learning_rate": 9.975178877375038e-05, + "loss": 0.022053487598896027, + "num_input_tokens_seen": 24498496, + "step": 1496, + "train_runtime": 12329.0453, + "train_tokens_per_second": 1987.055 + }, + { + "epoch": 0.4146814404432133, + "grad_norm": 0.07403433322906494, + "learning_rate": 9.975135118178117e-05, + "loss": 0.016938677057623863, + "num_input_tokens_seen": 24514872, + "step": 1497, + "train_runtime": 12337.2766, + "train_tokens_per_second": 1987.057 + }, + { + "epoch": 0.4149584487534626, + "grad_norm": 0.1118258386850357, + "learning_rate": 9.975091320537968e-05, + "loss": 0.018670212477445602, + "num_input_tokens_seen": 24531248, + "step": 1498, + "train_runtime": 12345.5043, + "train_tokens_per_second": 1987.059 + }, + { + "epoch": 0.4152354570637119, + "grad_norm": 0.09544995427131653, + "learning_rate": 9.975047484454931e-05, + "loss": 0.017644129693508148, + "num_input_tokens_seen": 24547624, + "step": 1499, + "train_runtime": 12353.7372, + "train_tokens_per_second": 1987.061 + }, + { + "epoch": 0.4155124653739612, + "grad_norm": 0.08185995370149612, + "learning_rate": 9.975003609929346e-05, + "loss": 0.01701216772198677, + "num_input_tokens_seen": 24564000, + "step": 1500, + "train_runtime": 12361.9608, + "train_tokens_per_second": 1987.063 + }, + { + "epoch": 0.41578947368421054, + "grad_norm": 0.09390932321548462, + "learning_rate": 9.97495969696155e-05, + "loss": 0.017619900405406952, + "num_input_tokens_seen": 24580376, + "step": 1501, + "train_runtime": 12371.7727, + "train_tokens_per_second": 1986.811 + }, + { + "epoch": 0.41606648199445984, + "grad_norm": 0.13030026853084564, + "learning_rate": 9.974915745551882e-05, + "loss": 0.019778285175561905, + "num_input_tokens_seen": 24596752, + "step": 1502, + "train_runtime": 12379.982, + "train_tokens_per_second": 1986.816 + }, + { + "epoch": 0.41634349030470913, + "grad_norm": 0.14327822625637054, + "learning_rate": 9.974871755700684e-05, + "loss": 0.020008303225040436, + "num_input_tokens_seen": 24613128, + "step": 1503, + "train_runtime": 12388.1827, + "train_tokens_per_second": 1986.823 + }, + { + "epoch": 0.4166204986149584, + "grad_norm": 0.17894473671913147, + "learning_rate": 9.974827727408293e-05, + "loss": 0.0238664373755455, + "num_input_tokens_seen": 24629504, + "step": 1504, + "train_runtime": 12396.3992, + "train_tokens_per_second": 1986.827 + }, + { + "epoch": 0.4168975069252078, + "grad_norm": 0.09922441095113754, + "learning_rate": 9.974783660675052e-05, + "loss": 0.015153187327086926, + "num_input_tokens_seen": 24645880, + "step": 1505, + "train_runtime": 12404.6171, + "train_tokens_per_second": 1986.831 + }, + { + "epoch": 0.41717451523545707, + "grad_norm": 0.1238904595375061, + "learning_rate": 9.974739555501298e-05, + "loss": 0.015612883493304253, + "num_input_tokens_seen": 24662256, + "step": 1506, + "train_runtime": 12412.8391, + "train_tokens_per_second": 1986.834 + }, + { + "epoch": 0.41745152354570636, + "grad_norm": 0.14022022485733032, + "learning_rate": 9.974695411887376e-05, + "loss": 0.02228214591741562, + "num_input_tokens_seen": 24678632, + "step": 1507, + "train_runtime": 12421.0637, + "train_tokens_per_second": 1986.837 + }, + { + "epoch": 0.41772853185595565, + "grad_norm": 0.11893945932388306, + "learning_rate": 9.974651229833623e-05, + "loss": 0.023093614727258682, + "num_input_tokens_seen": 24695008, + "step": 1508, + "train_runtime": 12429.292, + "train_tokens_per_second": 1986.839 + }, + { + "epoch": 0.418005540166205, + "grad_norm": 0.08481961488723755, + "learning_rate": 9.974607009340384e-05, + "loss": 0.014169312082231045, + "num_input_tokens_seen": 24711384, + "step": 1509, + "train_runtime": 12437.5138, + "train_tokens_per_second": 1986.843 + }, + { + "epoch": 0.4182825484764543, + "grad_norm": 0.09857413172721863, + "learning_rate": 9.974562750407999e-05, + "loss": 0.017760032787919044, + "num_input_tokens_seen": 24727760, + "step": 1510, + "train_runtime": 12445.739, + "train_tokens_per_second": 1986.845 + }, + { + "epoch": 0.4185595567867036, + "grad_norm": 0.10259909927845001, + "learning_rate": 9.974518453036808e-05, + "loss": 0.01507970504462719, + "num_input_tokens_seen": 24744136, + "step": 1511, + "train_runtime": 12453.964, + "train_tokens_per_second": 1986.848 + }, + { + "epoch": 0.4188365650969529, + "grad_norm": 0.11883003264665604, + "learning_rate": 9.974474117227157e-05, + "loss": 0.020981548354029655, + "num_input_tokens_seen": 24760512, + "step": 1512, + "train_runtime": 12462.1884, + "train_tokens_per_second": 1986.851 + }, + { + "epoch": 0.41911357340720223, + "grad_norm": 0.14107069373130798, + "learning_rate": 9.974429742979387e-05, + "loss": 0.021176718175411224, + "num_input_tokens_seen": 24776888, + "step": 1513, + "train_runtime": 12470.4124, + "train_tokens_per_second": 1986.854 + }, + { + "epoch": 0.4193905817174515, + "grad_norm": 0.11886392533779144, + "learning_rate": 9.974385330293841e-05, + "loss": 0.015223896130919456, + "num_input_tokens_seen": 24793264, + "step": 1514, + "train_runtime": 12478.6237, + "train_tokens_per_second": 1986.859 + }, + { + "epoch": 0.4196675900277008, + "grad_norm": 0.1369084268808365, + "learning_rate": 9.97434087917086e-05, + "loss": 0.021829761564731598, + "num_input_tokens_seen": 24809640, + "step": 1515, + "train_runtime": 12486.8558, + "train_tokens_per_second": 1986.86 + }, + { + "epoch": 0.4199445983379501, + "grad_norm": 0.14784511923789978, + "learning_rate": 9.974296389610792e-05, + "loss": 0.021770641207695007, + "num_input_tokens_seen": 24826016, + "step": 1516, + "train_runtime": 12495.0809, + "train_tokens_per_second": 1986.863 + }, + { + "epoch": 0.42022160664819946, + "grad_norm": 0.1427927166223526, + "learning_rate": 9.974251861613977e-05, + "loss": 0.01945625990629196, + "num_input_tokens_seen": 24842392, + "step": 1517, + "train_runtime": 12503.3319, + "train_tokens_per_second": 1986.862 + }, + { + "epoch": 0.42049861495844876, + "grad_norm": 0.10556577891111374, + "learning_rate": 9.974207295180761e-05, + "loss": 0.01871604099869728, + "num_input_tokens_seen": 24858768, + "step": 1518, + "train_runtime": 12511.5687, + "train_tokens_per_second": 1986.863 + }, + { + "epoch": 0.42077562326869805, + "grad_norm": 0.09240654110908508, + "learning_rate": 9.974162690311488e-05, + "loss": 0.014579557813704014, + "num_input_tokens_seen": 24875144, + "step": 1519, + "train_runtime": 12519.8037, + "train_tokens_per_second": 1986.864 + }, + { + "epoch": 0.42105263157894735, + "grad_norm": 0.0932985171675682, + "learning_rate": 9.9741180470065e-05, + "loss": 0.015129639767110348, + "num_input_tokens_seen": 24891520, + "step": 1520, + "train_runtime": 12528.0368, + "train_tokens_per_second": 1986.865 + }, + { + "epoch": 0.4213296398891967, + "grad_norm": 0.1273890882730484, + "learning_rate": 9.974073365266148e-05, + "loss": 0.021944157779216766, + "num_input_tokens_seen": 24907896, + "step": 1521, + "train_runtime": 12536.2666, + "train_tokens_per_second": 1986.867 + }, + { + "epoch": 0.421606648199446, + "grad_norm": 0.08121524751186371, + "learning_rate": 9.97402864509077e-05, + "loss": 0.02198510244488716, + "num_input_tokens_seen": 24924272, + "step": 1522, + "train_runtime": 12544.4989, + "train_tokens_per_second": 1986.869 + }, + { + "epoch": 0.4218836565096953, + "grad_norm": 0.11132524907588959, + "learning_rate": 9.973983886480718e-05, + "loss": 0.018897822126746178, + "num_input_tokens_seen": 24940648, + "step": 1523, + "train_runtime": 12552.7315, + "train_tokens_per_second": 1986.87 + }, + { + "epoch": 0.4221606648199446, + "grad_norm": 0.13040630519390106, + "learning_rate": 9.973939089436332e-05, + "loss": 0.018578460440039635, + "num_input_tokens_seen": 24957024, + "step": 1524, + "train_runtime": 12560.9695, + "train_tokens_per_second": 1986.871 + }, + { + "epoch": 0.4224376731301939, + "grad_norm": 0.09887528419494629, + "learning_rate": 9.973894253957964e-05, + "loss": 0.019314052537083626, + "num_input_tokens_seen": 24973400, + "step": 1525, + "train_runtime": 12569.2035, + "train_tokens_per_second": 1986.872 + }, + { + "epoch": 0.4227146814404432, + "grad_norm": 0.1400228589773178, + "learning_rate": 9.973849380045956e-05, + "loss": 0.01660303771495819, + "num_input_tokens_seen": 24989776, + "step": 1526, + "train_runtime": 12577.4415, + "train_tokens_per_second": 1986.873 + }, + { + "epoch": 0.4229916897506925, + "grad_norm": 0.12698403000831604, + "learning_rate": 9.973804467700657e-05, + "loss": 0.017753584310412407, + "num_input_tokens_seen": 25006152, + "step": 1527, + "train_runtime": 12585.6844, + "train_tokens_per_second": 1986.873 + }, + { + "epoch": 0.4232686980609418, + "grad_norm": 0.12230190634727478, + "learning_rate": 9.973759516922414e-05, + "loss": 0.01637866534292698, + "num_input_tokens_seen": 25022528, + "step": 1528, + "train_runtime": 12593.9177, + "train_tokens_per_second": 1986.874 + }, + { + "epoch": 0.42354570637119116, + "grad_norm": 0.12949851155281067, + "learning_rate": 9.973714527711573e-05, + "loss": 0.019342346116900444, + "num_input_tokens_seen": 25038904, + "step": 1529, + "train_runtime": 12602.1555, + "train_tokens_per_second": 1986.875 + }, + { + "epoch": 0.42382271468144045, + "grad_norm": 0.12811441719532013, + "learning_rate": 9.97366950006848e-05, + "loss": 0.01964392140507698, + "num_input_tokens_seen": 25055280, + "step": 1530, + "train_runtime": 12610.3891, + "train_tokens_per_second": 1986.876 + }, + { + "epoch": 0.42409972299168974, + "grad_norm": 0.1301550418138504, + "learning_rate": 9.973624433993489e-05, + "loss": 0.020306790247559547, + "num_input_tokens_seen": 25071656, + "step": 1531, + "train_runtime": 12618.6292, + "train_tokens_per_second": 1986.876 + }, + { + "epoch": 0.42437673130193904, + "grad_norm": 0.09931158274412155, + "learning_rate": 9.973579329486943e-05, + "loss": 0.01682884432375431, + "num_input_tokens_seen": 25088032, + "step": 1532, + "train_runtime": 12626.861, + "train_tokens_per_second": 1986.878 + }, + { + "epoch": 0.4246537396121884, + "grad_norm": 0.10524309426546097, + "learning_rate": 9.973534186549192e-05, + "loss": 0.016335617750883102, + "num_input_tokens_seen": 25104408, + "step": 1533, + "train_runtime": 12635.099, + "train_tokens_per_second": 1986.879 + }, + { + "epoch": 0.4249307479224377, + "grad_norm": 0.10724035650491714, + "learning_rate": 9.973489005180585e-05, + "loss": 0.015604635700583458, + "num_input_tokens_seen": 25120784, + "step": 1534, + "train_runtime": 12643.3262, + "train_tokens_per_second": 1986.881 + }, + { + "epoch": 0.425207756232687, + "grad_norm": 0.09406310319900513, + "learning_rate": 9.97344378538147e-05, + "loss": 0.018252257257699966, + "num_input_tokens_seen": 25137160, + "step": 1535, + "train_runtime": 12651.5572, + "train_tokens_per_second": 1986.883 + }, + { + "epoch": 0.42548476454293627, + "grad_norm": 0.09816833585500717, + "learning_rate": 9.973398527152199e-05, + "loss": 0.014595930464565754, + "num_input_tokens_seen": 25153536, + "step": 1536, + "train_runtime": 12659.7916, + "train_tokens_per_second": 1986.884 + }, + { + "epoch": 0.4257617728531856, + "grad_norm": 0.08685023337602615, + "learning_rate": 9.973353230493119e-05, + "loss": 0.01729949191212654, + "num_input_tokens_seen": 25169912, + "step": 1537, + "train_runtime": 12668.0237, + "train_tokens_per_second": 1986.885 + }, + { + "epoch": 0.4260387811634349, + "grad_norm": 0.0923730880022049, + "learning_rate": 9.973307895404582e-05, + "loss": 0.015374931506812572, + "num_input_tokens_seen": 25186288, + "step": 1538, + "train_runtime": 12676.259, + "train_tokens_per_second": 1986.887 + }, + { + "epoch": 0.4263157894736842, + "grad_norm": 0.11073173582553864, + "learning_rate": 9.973262521886937e-05, + "loss": 0.013903262093663216, + "num_input_tokens_seen": 25202664, + "step": 1539, + "train_runtime": 12684.4996, + "train_tokens_per_second": 1986.887 + }, + { + "epoch": 0.4265927977839335, + "grad_norm": 0.11924225836992264, + "learning_rate": 9.973217109940534e-05, + "loss": 0.015899095684289932, + "num_input_tokens_seen": 25219040, + "step": 1540, + "train_runtime": 12692.7444, + "train_tokens_per_second": 1986.886 + }, + { + "epoch": 0.42686980609418285, + "grad_norm": 0.11140584945678711, + "learning_rate": 9.973171659565727e-05, + "loss": 0.018729818984866142, + "num_input_tokens_seen": 25235416, + "step": 1541, + "train_runtime": 12700.9791, + "train_tokens_per_second": 1986.887 + }, + { + "epoch": 0.42714681440443214, + "grad_norm": 0.10627549141645432, + "learning_rate": 9.973126170762863e-05, + "loss": 0.018375808373093605, + "num_input_tokens_seen": 25251792, + "step": 1542, + "train_runtime": 12709.2141, + "train_tokens_per_second": 1986.889 + }, + { + "epoch": 0.42742382271468143, + "grad_norm": 0.1008073017001152, + "learning_rate": 9.973080643532297e-05, + "loss": 0.018206287175416946, + "num_input_tokens_seen": 25268168, + "step": 1543, + "train_runtime": 12717.4562, + "train_tokens_per_second": 1986.889 + }, + { + "epoch": 0.4277008310249307, + "grad_norm": 0.11355632543563843, + "learning_rate": 9.97303507787438e-05, + "loss": 0.019367670640349388, + "num_input_tokens_seen": 25284544, + "step": 1544, + "train_runtime": 12725.6844, + "train_tokens_per_second": 1986.891 + }, + { + "epoch": 0.4279778393351801, + "grad_norm": 0.10155728459358215, + "learning_rate": 9.972989473789462e-05, + "loss": 0.01667173206806183, + "num_input_tokens_seen": 25300920, + "step": 1545, + "train_runtime": 12733.9327, + "train_tokens_per_second": 1986.89 + }, + { + "epoch": 0.42825484764542937, + "grad_norm": 0.0810021311044693, + "learning_rate": 9.972943831277897e-05, + "loss": 0.012352019548416138, + "num_input_tokens_seen": 25317296, + "step": 1546, + "train_runtime": 12742.1699, + "train_tokens_per_second": 1986.89 + }, + { + "epoch": 0.42853185595567866, + "grad_norm": 0.08865518867969513, + "learning_rate": 9.972898150340038e-05, + "loss": 0.017111318185925484, + "num_input_tokens_seen": 25333672, + "step": 1547, + "train_runtime": 12750.3947, + "train_tokens_per_second": 1986.893 + }, + { + "epoch": 0.42880886426592796, + "grad_norm": 0.10631067305803299, + "learning_rate": 9.972852430976239e-05, + "loss": 0.01923285610973835, + "num_input_tokens_seen": 25350048, + "step": 1548, + "train_runtime": 12758.6192, + "train_tokens_per_second": 1986.896 + }, + { + "epoch": 0.4290858725761773, + "grad_norm": 0.13701912760734558, + "learning_rate": 9.97280667318685e-05, + "loss": 0.02296541817486286, + "num_input_tokens_seen": 25366424, + "step": 1549, + "train_runtime": 12766.8348, + "train_tokens_per_second": 1986.9 + }, + { + "epoch": 0.4293628808864266, + "grad_norm": 0.07557890564203262, + "learning_rate": 9.972760876972226e-05, + "loss": 0.013895041309297085, + "num_input_tokens_seen": 25382800, + "step": 1550, + "train_runtime": 12775.0469, + "train_tokens_per_second": 1986.905 + }, + { + "epoch": 0.4296398891966759, + "grad_norm": 0.11716300249099731, + "learning_rate": 9.972715042332721e-05, + "loss": 0.020553652197122574, + "num_input_tokens_seen": 25399176, + "step": 1551, + "train_runtime": 12783.2704, + "train_tokens_per_second": 1986.908 + }, + { + "epoch": 0.4299168975069252, + "grad_norm": 0.12344459444284439, + "learning_rate": 9.972669169268692e-05, + "loss": 0.018880700692534447, + "num_input_tokens_seen": 25415552, + "step": 1552, + "train_runtime": 12791.4861, + "train_tokens_per_second": 1986.912 + }, + { + "epoch": 0.43019390581717454, + "grad_norm": 0.10608179122209549, + "learning_rate": 9.972623257780489e-05, + "loss": 0.016719866544008255, + "num_input_tokens_seen": 25431928, + "step": 1553, + "train_runtime": 12799.7155, + "train_tokens_per_second": 1986.914 + }, + { + "epoch": 0.43047091412742383, + "grad_norm": 0.15239013731479645, + "learning_rate": 9.97257730786847e-05, + "loss": 0.01800607331097126, + "num_input_tokens_seen": 25448304, + "step": 1554, + "train_runtime": 12807.9393, + "train_tokens_per_second": 1986.916 + }, + { + "epoch": 0.4307479224376731, + "grad_norm": 0.11699217557907104, + "learning_rate": 9.972531319532986e-05, + "loss": 0.019775712862610817, + "num_input_tokens_seen": 25464680, + "step": 1555, + "train_runtime": 12816.1642, + "train_tokens_per_second": 1986.919 + }, + { + "epoch": 0.4310249307479224, + "grad_norm": 0.09052786231040955, + "learning_rate": 9.972485292774397e-05, + "loss": 0.017511524260044098, + "num_input_tokens_seen": 25481056, + "step": 1556, + "train_runtime": 12824.3915, + "train_tokens_per_second": 1986.921 + }, + { + "epoch": 0.43130193905817177, + "grad_norm": 0.09232236444950104, + "learning_rate": 9.972439227593057e-05, + "loss": 0.015650376677513123, + "num_input_tokens_seen": 25497432, + "step": 1557, + "train_runtime": 12832.613, + "train_tokens_per_second": 1986.924 + }, + { + "epoch": 0.43157894736842106, + "grad_norm": 0.10192456096410751, + "learning_rate": 9.97239312398932e-05, + "loss": 0.014786607585847378, + "num_input_tokens_seen": 25513808, + "step": 1558, + "train_runtime": 12840.8383, + "train_tokens_per_second": 1986.927 + }, + { + "epoch": 0.43185595567867036, + "grad_norm": 0.12328295409679413, + "learning_rate": 9.972346981963546e-05, + "loss": 0.02046160399913788, + "num_input_tokens_seen": 25530184, + "step": 1559, + "train_runtime": 12849.0605, + "train_tokens_per_second": 1986.93 + }, + { + "epoch": 0.43213296398891965, + "grad_norm": 0.12501497566699982, + "learning_rate": 9.972300801516089e-05, + "loss": 0.012646907940506935, + "num_input_tokens_seen": 25546560, + "step": 1560, + "train_runtime": 12857.284, + "train_tokens_per_second": 1986.933 + }, + { + "epoch": 0.432409972299169, + "grad_norm": 0.08934593945741653, + "learning_rate": 9.972254582647305e-05, + "loss": 0.01634339801967144, + "num_input_tokens_seen": 25562936, + "step": 1561, + "train_runtime": 12865.5084, + "train_tokens_per_second": 1986.936 + }, + { + "epoch": 0.4326869806094183, + "grad_norm": 0.14740946888923645, + "learning_rate": 9.972208325357555e-05, + "loss": 0.018298307433724403, + "num_input_tokens_seen": 25579312, + "step": 1562, + "train_runtime": 12873.7199, + "train_tokens_per_second": 1986.94 + }, + { + "epoch": 0.4329639889196676, + "grad_norm": 0.10673544555902481, + "learning_rate": 9.972162029647192e-05, + "loss": 0.016629433259367943, + "num_input_tokens_seen": 25595688, + "step": 1563, + "train_runtime": 12881.9335, + "train_tokens_per_second": 1986.945 + }, + { + "epoch": 0.4332409972299169, + "grad_norm": 0.1630849391222, + "learning_rate": 9.972115695516575e-05, + "loss": 0.022462517023086548, + "num_input_tokens_seen": 25612064, + "step": 1564, + "train_runtime": 12890.14, + "train_tokens_per_second": 1986.95 + }, + { + "epoch": 0.43351800554016623, + "grad_norm": 0.10794494301080704, + "learning_rate": 9.972069322966065e-05, + "loss": 0.020513249561190605, + "num_input_tokens_seen": 25628440, + "step": 1565, + "train_runtime": 12898.3553, + "train_tokens_per_second": 1986.954 + }, + { + "epoch": 0.4337950138504155, + "grad_norm": 0.11724110692739487, + "learning_rate": 9.972022911996015e-05, + "loss": 0.019993262365460396, + "num_input_tokens_seen": 25644816, + "step": 1566, + "train_runtime": 12906.5758, + "train_tokens_per_second": 1986.957 + }, + { + "epoch": 0.4340720221606648, + "grad_norm": 0.10330456495285034, + "learning_rate": 9.971976462606789e-05, + "loss": 0.01991885155439377, + "num_input_tokens_seen": 25661192, + "step": 1567, + "train_runtime": 12914.7977, + "train_tokens_per_second": 1986.96 + }, + { + "epoch": 0.4343490304709141, + "grad_norm": 0.30228114128112793, + "learning_rate": 9.971929974798742e-05, + "loss": 0.027390949428081512, + "num_input_tokens_seen": 25677568, + "step": 1568, + "train_runtime": 12923.0155, + "train_tokens_per_second": 1986.964 + }, + { + "epoch": 0.43462603878116346, + "grad_norm": 0.14542868733406067, + "learning_rate": 9.971883448572234e-05, + "loss": 0.021153803914785385, + "num_input_tokens_seen": 25693944, + "step": 1569, + "train_runtime": 12931.226, + "train_tokens_per_second": 1986.969 + }, + { + "epoch": 0.43490304709141275, + "grad_norm": 0.11644655466079712, + "learning_rate": 9.971836883927628e-05, + "loss": 0.017320796847343445, + "num_input_tokens_seen": 25710320, + "step": 1570, + "train_runtime": 12939.457, + "train_tokens_per_second": 1986.971 + }, + { + "epoch": 0.43518005540166205, + "grad_norm": 0.14358653128147125, + "learning_rate": 9.971790280865278e-05, + "loss": 0.018912091851234436, + "num_input_tokens_seen": 25726696, + "step": 1571, + "train_runtime": 12947.6808, + "train_tokens_per_second": 1986.973 + }, + { + "epoch": 0.43545706371191134, + "grad_norm": 0.08814447373151779, + "learning_rate": 9.971743639385551e-05, + "loss": 0.01876037009060383, + "num_input_tokens_seen": 25743072, + "step": 1572, + "train_runtime": 12955.9033, + "train_tokens_per_second": 1986.976 + }, + { + "epoch": 0.4357340720221607, + "grad_norm": 0.11971133947372437, + "learning_rate": 9.9716969594888e-05, + "loss": 0.019347338005900383, + "num_input_tokens_seen": 25759448, + "step": 1573, + "train_runtime": 12964.126, + "train_tokens_per_second": 1986.979 + }, + { + "epoch": 0.43601108033241, + "grad_norm": 0.1191132590174675, + "learning_rate": 9.97165024117539e-05, + "loss": 0.021990541368722916, + "num_input_tokens_seen": 25775824, + "step": 1574, + "train_runtime": 12972.3558, + "train_tokens_per_second": 1986.981 + }, + { + "epoch": 0.4362880886426593, + "grad_norm": 0.12068365514278412, + "learning_rate": 9.971603484445682e-05, + "loss": 0.018631277605891228, + "num_input_tokens_seen": 25792200, + "step": 1575, + "train_runtime": 12980.5718, + "train_tokens_per_second": 1986.985 + }, + { + "epoch": 0.43656509695290857, + "grad_norm": 0.09791439026594162, + "learning_rate": 9.971556689300037e-05, + "loss": 0.012464242056012154, + "num_input_tokens_seen": 25808576, + "step": 1576, + "train_runtime": 12988.7836, + "train_tokens_per_second": 1986.989 + }, + { + "epoch": 0.4368421052631579, + "grad_norm": 0.13267669081687927, + "learning_rate": 9.971509855738814e-05, + "loss": 0.021933557465672493, + "num_input_tokens_seen": 25824952, + "step": 1577, + "train_runtime": 12996.9984, + "train_tokens_per_second": 1986.994 + }, + { + "epoch": 0.4371191135734072, + "grad_norm": 0.10230008512735367, + "learning_rate": 9.971462983762379e-05, + "loss": 0.022113312035799026, + "num_input_tokens_seen": 25841328, + "step": 1578, + "train_runtime": 13005.2271, + "train_tokens_per_second": 1986.996 + }, + { + "epoch": 0.4373961218836565, + "grad_norm": 0.10596469044685364, + "learning_rate": 9.971416073371092e-05, + "loss": 0.020090797916054726, + "num_input_tokens_seen": 25857704, + "step": 1579, + "train_runtime": 13013.4561, + "train_tokens_per_second": 1986.997 + }, + { + "epoch": 0.4376731301939058, + "grad_norm": 0.08834446966648102, + "learning_rate": 9.971369124565315e-05, + "loss": 0.01697266474366188, + "num_input_tokens_seen": 25874080, + "step": 1580, + "train_runtime": 13021.673, + "train_tokens_per_second": 1987.001 + }, + { + "epoch": 0.43795013850415515, + "grad_norm": 0.12609325349330902, + "learning_rate": 9.971322137345411e-05, + "loss": 0.020721470937132835, + "num_input_tokens_seen": 25890456, + "step": 1581, + "train_runtime": 13029.8812, + "train_tokens_per_second": 1987.006 + }, + { + "epoch": 0.43822714681440444, + "grad_norm": 0.15049059689044952, + "learning_rate": 9.971275111711745e-05, + "loss": 0.01843881607055664, + "num_input_tokens_seen": 25906832, + "step": 1582, + "train_runtime": 13038.0941, + "train_tokens_per_second": 1987.011 + }, + { + "epoch": 0.43850415512465374, + "grad_norm": 0.12569500505924225, + "learning_rate": 9.971228047664677e-05, + "loss": 0.016223682090640068, + "num_input_tokens_seen": 25923208, + "step": 1583, + "train_runtime": 13046.3004, + "train_tokens_per_second": 1987.016 + }, + { + "epoch": 0.43878116343490303, + "grad_norm": 0.09640896320343018, + "learning_rate": 9.971180945204575e-05, + "loss": 0.016037747263908386, + "num_input_tokens_seen": 25939584, + "step": 1584, + "train_runtime": 13054.5102, + "train_tokens_per_second": 1987.021 + }, + { + "epoch": 0.4390581717451524, + "grad_norm": 0.10194913297891617, + "learning_rate": 9.971133804331798e-05, + "loss": 0.01951240934431553, + "num_input_tokens_seen": 25955960, + "step": 1585, + "train_runtime": 13062.7227, + "train_tokens_per_second": 1987.025 + }, + { + "epoch": 0.4393351800554017, + "grad_norm": 0.09362557530403137, + "learning_rate": 9.971086625046716e-05, + "loss": 0.019374364987015724, + "num_input_tokens_seen": 25972336, + "step": 1586, + "train_runtime": 13070.9628, + "train_tokens_per_second": 1987.025 + }, + { + "epoch": 0.43961218836565097, + "grad_norm": 0.1688130646944046, + "learning_rate": 9.971039407349688e-05, + "loss": 0.021863028407096863, + "num_input_tokens_seen": 25988712, + "step": 1587, + "train_runtime": 13079.1869, + "train_tokens_per_second": 1987.028 + }, + { + "epoch": 0.43988919667590026, + "grad_norm": 0.10478191077709198, + "learning_rate": 9.97099215124108e-05, + "loss": 0.018479574471712112, + "num_input_tokens_seen": 26005088, + "step": 1588, + "train_runtime": 13087.4115, + "train_tokens_per_second": 1987.031 + }, + { + "epoch": 0.4401662049861496, + "grad_norm": 0.13631387054920197, + "learning_rate": 9.970944856721261e-05, + "loss": 0.01788567751646042, + "num_input_tokens_seen": 26021464, + "step": 1589, + "train_runtime": 13095.6373, + "train_tokens_per_second": 1987.033 + }, + { + "epoch": 0.4404432132963989, + "grad_norm": 0.10367582738399506, + "learning_rate": 9.970897523790591e-05, + "loss": 0.016262786462903023, + "num_input_tokens_seen": 26037840, + "step": 1590, + "train_runtime": 13103.8634, + "train_tokens_per_second": 1987.035 + }, + { + "epoch": 0.4407202216066482, + "grad_norm": 0.11734354496002197, + "learning_rate": 9.970850152449441e-05, + "loss": 0.02063903585076332, + "num_input_tokens_seen": 26054216, + "step": 1591, + "train_runtime": 13112.0945, + "train_tokens_per_second": 1987.037 + }, + { + "epoch": 0.4409972299168975, + "grad_norm": 0.12453746050596237, + "learning_rate": 9.970802742698173e-05, + "loss": 0.01959729939699173, + "num_input_tokens_seen": 26070592, + "step": 1592, + "train_runtime": 13120.321, + "train_tokens_per_second": 1987.039 + }, + { + "epoch": 0.4412742382271468, + "grad_norm": 0.08755531907081604, + "learning_rate": 9.970755294537155e-05, + "loss": 0.0169521551579237, + "num_input_tokens_seen": 26086968, + "step": 1593, + "train_runtime": 13128.5559, + "train_tokens_per_second": 1987.04 + }, + { + "epoch": 0.44155124653739614, + "grad_norm": 0.10193091630935669, + "learning_rate": 9.970707807966755e-05, + "loss": 0.01633201539516449, + "num_input_tokens_seen": 26103344, + "step": 1594, + "train_runtime": 13136.7789, + "train_tokens_per_second": 1987.043 + }, + { + "epoch": 0.44182825484764543, + "grad_norm": 0.1256265789270401, + "learning_rate": 9.970660282987338e-05, + "loss": 0.02275225892663002, + "num_input_tokens_seen": 26119720, + "step": 1595, + "train_runtime": 13145.0056, + "train_tokens_per_second": 1987.045 + }, + { + "epoch": 0.4421052631578947, + "grad_norm": 0.1325414478778839, + "learning_rate": 9.970612719599271e-05, + "loss": 0.01918572001159191, + "num_input_tokens_seen": 26136096, + "step": 1596, + "train_runtime": 13153.2276, + "train_tokens_per_second": 1987.048 + }, + { + "epoch": 0.442382271468144, + "grad_norm": 0.09451834112405777, + "learning_rate": 9.970565117802922e-05, + "loss": 0.015897445380687714, + "num_input_tokens_seen": 26152472, + "step": 1597, + "train_runtime": 13161.4417, + "train_tokens_per_second": 1987.052 + }, + { + "epoch": 0.44265927977839337, + "grad_norm": 0.12764009833335876, + "learning_rate": 9.970517477598659e-05, + "loss": 0.02160155028104782, + "num_input_tokens_seen": 26168848, + "step": 1598, + "train_runtime": 13169.6569, + "train_tokens_per_second": 1987.056 + }, + { + "epoch": 0.44293628808864266, + "grad_norm": 0.10901941359043121, + "learning_rate": 9.970469798986852e-05, + "loss": 0.01577366702258587, + "num_input_tokens_seen": 26185224, + "step": 1599, + "train_runtime": 13177.863, + "train_tokens_per_second": 1987.061 + }, + { + "epoch": 0.44321329639889195, + "grad_norm": 0.09186606854200363, + "learning_rate": 9.970422081967866e-05, + "loss": 0.017964370548725128, + "num_input_tokens_seen": 26201600, + "step": 1600, + "train_runtime": 13186.0734, + "train_tokens_per_second": 1987.066 + }, + { + "epoch": 0.44349030470914125, + "grad_norm": 0.11299016326665878, + "learning_rate": 9.970374326542072e-05, + "loss": 0.019685551524162292, + "num_input_tokens_seen": 26217976, + "step": 1601, + "train_runtime": 13195.9434, + "train_tokens_per_second": 1986.821 + }, + { + "epoch": 0.4437673130193906, + "grad_norm": 0.09262750297784805, + "learning_rate": 9.970326532709838e-05, + "loss": 0.01733386516571045, + "num_input_tokens_seen": 26234352, + "step": 1602, + "train_runtime": 13204.1699, + "train_tokens_per_second": 1986.823 + }, + { + "epoch": 0.4440443213296399, + "grad_norm": 0.13206543028354645, + "learning_rate": 9.970278700471533e-05, + "loss": 0.019003072753548622, + "num_input_tokens_seen": 26250728, + "step": 1603, + "train_runtime": 13212.3924, + "train_tokens_per_second": 1986.826 + }, + { + "epoch": 0.4443213296398892, + "grad_norm": 0.12989354133605957, + "learning_rate": 9.97023082982753e-05, + "loss": 0.023152286186814308, + "num_input_tokens_seen": 26267104, + "step": 1604, + "train_runtime": 13220.6134, + "train_tokens_per_second": 1986.829 + }, + { + "epoch": 0.4445983379501385, + "grad_norm": 0.11354223638772964, + "learning_rate": 9.970182920778193e-05, + "loss": 0.020905418321490288, + "num_input_tokens_seen": 26283480, + "step": 1605, + "train_runtime": 13228.8289, + "train_tokens_per_second": 1986.833 + }, + { + "epoch": 0.4448753462603878, + "grad_norm": 0.10191630572080612, + "learning_rate": 9.970134973323896e-05, + "loss": 0.021601535379886627, + "num_input_tokens_seen": 26299856, + "step": 1606, + "train_runtime": 13237.048, + "train_tokens_per_second": 1986.837 + }, + { + "epoch": 0.4451523545706371, + "grad_norm": 0.05375562608242035, + "learning_rate": 9.970086987465009e-05, + "loss": 0.01611861027777195, + "num_input_tokens_seen": 26316232, + "step": 1607, + "train_runtime": 13245.2745, + "train_tokens_per_second": 1986.839 + }, + { + "epoch": 0.4454293628808864, + "grad_norm": 0.12895795702934265, + "learning_rate": 9.970038963201903e-05, + "loss": 0.01818884164094925, + "num_input_tokens_seen": 26332608, + "step": 1608, + "train_runtime": 13253.4935, + "train_tokens_per_second": 1986.843 + }, + { + "epoch": 0.4457063711911357, + "grad_norm": 0.11335770040750504, + "learning_rate": 9.969990900534949e-05, + "loss": 0.018654238432645798, + "num_input_tokens_seen": 26348984, + "step": 1609, + "train_runtime": 13261.7204, + "train_tokens_per_second": 1986.845 + }, + { + "epoch": 0.44598337950138506, + "grad_norm": 0.1268661469221115, + "learning_rate": 9.969942799464517e-05, + "loss": 0.02340175211429596, + "num_input_tokens_seen": 26365360, + "step": 1610, + "train_runtime": 13269.9461, + "train_tokens_per_second": 1986.848 + }, + { + "epoch": 0.44626038781163435, + "grad_norm": 0.10781239718198776, + "learning_rate": 9.96989465999098e-05, + "loss": 0.02008126676082611, + "num_input_tokens_seen": 26381736, + "step": 1611, + "train_runtime": 13278.1641, + "train_tokens_per_second": 1986.851 + }, + { + "epoch": 0.44653739612188365, + "grad_norm": 0.08803955465555191, + "learning_rate": 9.969846482114711e-05, + "loss": 0.018989218398928642, + "num_input_tokens_seen": 26398112, + "step": 1612, + "train_runtime": 13286.3879, + "train_tokens_per_second": 1986.854 + }, + { + "epoch": 0.44681440443213294, + "grad_norm": 0.09620010107755661, + "learning_rate": 9.969798265836079e-05, + "loss": 0.019343072548508644, + "num_input_tokens_seen": 26414488, + "step": 1613, + "train_runtime": 13294.6089, + "train_tokens_per_second": 1986.857 + }, + { + "epoch": 0.4470914127423823, + "grad_norm": 0.09790344536304474, + "learning_rate": 9.96975001115546e-05, + "loss": 0.018055200576782227, + "num_input_tokens_seen": 26430864, + "step": 1614, + "train_runtime": 13302.8187, + "train_tokens_per_second": 1986.862 + }, + { + "epoch": 0.4473684210526316, + "grad_norm": 0.1307222843170166, + "learning_rate": 9.969701718073225e-05, + "loss": 0.020703420042991638, + "num_input_tokens_seen": 26447240, + "step": 1615, + "train_runtime": 13311.0241, + "train_tokens_per_second": 1986.867 + }, + { + "epoch": 0.4476454293628809, + "grad_norm": 0.08279240876436234, + "learning_rate": 9.969653386589748e-05, + "loss": 0.017637716606259346, + "num_input_tokens_seen": 26463616, + "step": 1616, + "train_runtime": 13319.228, + "train_tokens_per_second": 1986.873 + }, + { + "epoch": 0.44792243767313017, + "grad_norm": 0.1378100961446762, + "learning_rate": 9.969605016705401e-05, + "loss": 0.02020157128572464, + "num_input_tokens_seen": 26479992, + "step": 1617, + "train_runtime": 13327.4373, + "train_tokens_per_second": 1986.878 + }, + { + "epoch": 0.4481994459833795, + "grad_norm": 0.12596404552459717, + "learning_rate": 9.969556608420561e-05, + "loss": 0.01794278621673584, + "num_input_tokens_seen": 26496368, + "step": 1618, + "train_runtime": 13335.6553, + "train_tokens_per_second": 1986.882 + }, + { + "epoch": 0.4484764542936288, + "grad_norm": 0.11786355078220367, + "learning_rate": 9.969508161735598e-05, + "loss": 0.017331751063466072, + "num_input_tokens_seen": 26512744, + "step": 1619, + "train_runtime": 13343.8792, + "train_tokens_per_second": 1986.884 + }, + { + "epoch": 0.4487534626038781, + "grad_norm": 0.06476707756519318, + "learning_rate": 9.969459676650889e-05, + "loss": 0.016437696292996407, + "num_input_tokens_seen": 26529120, + "step": 1620, + "train_runtime": 13352.1031, + "train_tokens_per_second": 1986.887 + }, + { + "epoch": 0.4490304709141274, + "grad_norm": 0.07745865732431412, + "learning_rate": 9.969411153166808e-05, + "loss": 0.017975131049752235, + "num_input_tokens_seen": 26545496, + "step": 1621, + "train_runtime": 13360.3308, + "train_tokens_per_second": 1986.889 + }, + { + "epoch": 0.44930747922437675, + "grad_norm": 0.08174668252468109, + "learning_rate": 9.969362591283729e-05, + "loss": 0.01710684411227703, + "num_input_tokens_seen": 26561872, + "step": 1622, + "train_runtime": 13368.5607, + "train_tokens_per_second": 1986.891 + }, + { + "epoch": 0.44958448753462604, + "grad_norm": 0.08652200549840927, + "learning_rate": 9.969313991002028e-05, + "loss": 0.016555944457650185, + "num_input_tokens_seen": 26578248, + "step": 1623, + "train_runtime": 13376.786, + "train_tokens_per_second": 1986.893 + }, + { + "epoch": 0.44986149584487534, + "grad_norm": 0.0722123309969902, + "learning_rate": 9.969265352322082e-05, + "loss": 0.018853362649679184, + "num_input_tokens_seen": 26594624, + "step": 1624, + "train_runtime": 13385.0097, + "train_tokens_per_second": 1986.896 + }, + { + "epoch": 0.45013850415512463, + "grad_norm": 0.113675557076931, + "learning_rate": 9.969216675244264e-05, + "loss": 0.020314306020736694, + "num_input_tokens_seen": 26611000, + "step": 1625, + "train_runtime": 13393.2359, + "train_tokens_per_second": 1986.898 + }, + { + "epoch": 0.450415512465374, + "grad_norm": 0.09458376467227936, + "learning_rate": 9.969167959768953e-05, + "loss": 0.017359895631670952, + "num_input_tokens_seen": 26627376, + "step": 1626, + "train_runtime": 13401.4671, + "train_tokens_per_second": 1986.9 + }, + { + "epoch": 0.4506925207756233, + "grad_norm": 0.11036255955696106, + "learning_rate": 9.969119205896523e-05, + "loss": 0.016041139140725136, + "num_input_tokens_seen": 26643752, + "step": 1627, + "train_runtime": 13409.6954, + "train_tokens_per_second": 1986.902 + }, + { + "epoch": 0.45096952908587257, + "grad_norm": 0.11617142707109451, + "learning_rate": 9.96907041362735e-05, + "loss": 0.017002468928694725, + "num_input_tokens_seen": 26660128, + "step": 1628, + "train_runtime": 13417.9201, + "train_tokens_per_second": 1986.905 + }, + { + "epoch": 0.45124653739612186, + "grad_norm": 0.087308868765831, + "learning_rate": 9.969021582961815e-05, + "loss": 0.016831468790769577, + "num_input_tokens_seen": 26676504, + "step": 1629, + "train_runtime": 13426.1566, + "train_tokens_per_second": 1986.905 + }, + { + "epoch": 0.4515235457063712, + "grad_norm": 0.08549369126558304, + "learning_rate": 9.968972713900293e-05, + "loss": 0.014869563281536102, + "num_input_tokens_seen": 26692880, + "step": 1630, + "train_runtime": 13434.3826, + "train_tokens_per_second": 1986.908 + }, + { + "epoch": 0.4518005540166205, + "grad_norm": 0.1040751188993454, + "learning_rate": 9.968923806443163e-05, + "loss": 0.012636476196348667, + "num_input_tokens_seen": 26709256, + "step": 1631, + "train_runtime": 13442.5939, + "train_tokens_per_second": 1986.912 + }, + { + "epoch": 0.4520775623268698, + "grad_norm": 0.09824266284704208, + "learning_rate": 9.968874860590799e-05, + "loss": 0.01879957504570484, + "num_input_tokens_seen": 26725632, + "step": 1632, + "train_runtime": 13450.8094, + "train_tokens_per_second": 1986.916 + }, + { + "epoch": 0.4523545706371191, + "grad_norm": 0.09462254494428635, + "learning_rate": 9.968825876343582e-05, + "loss": 0.017192209139466286, + "num_input_tokens_seen": 26742008, + "step": 1633, + "train_runtime": 13459.0227, + "train_tokens_per_second": 1986.92 + }, + { + "epoch": 0.45263157894736844, + "grad_norm": 0.10769343376159668, + "learning_rate": 9.96877685370189e-05, + "loss": 0.018526438623666763, + "num_input_tokens_seen": 26758384, + "step": 1634, + "train_runtime": 13467.2392, + "train_tokens_per_second": 1986.924 + }, + { + "epoch": 0.45290858725761773, + "grad_norm": 0.09771836549043655, + "learning_rate": 9.968727792666105e-05, + "loss": 0.015122860670089722, + "num_input_tokens_seen": 26774760, + "step": 1635, + "train_runtime": 13475.4554, + "train_tokens_per_second": 1986.928 + }, + { + "epoch": 0.45318559556786703, + "grad_norm": 0.13923145830631256, + "learning_rate": 9.968678693236601e-05, + "loss": 0.017784101888537407, + "num_input_tokens_seen": 26791136, + "step": 1636, + "train_runtime": 13483.6641, + "train_tokens_per_second": 1986.933 + }, + { + "epoch": 0.4534626038781163, + "grad_norm": 0.11202523857355118, + "learning_rate": 9.96862955541376e-05, + "loss": 0.02113550342619419, + "num_input_tokens_seen": 26807512, + "step": 1637, + "train_runtime": 13491.8775, + "train_tokens_per_second": 1986.937 + }, + { + "epoch": 0.45373961218836567, + "grad_norm": 0.12390346080064774, + "learning_rate": 9.968580379197961e-05, + "loss": 0.023780759423971176, + "num_input_tokens_seen": 26823888, + "step": 1638, + "train_runtime": 13500.1125, + "train_tokens_per_second": 1986.938 + }, + { + "epoch": 0.45401662049861496, + "grad_norm": 0.08532112836837769, + "learning_rate": 9.968531164589585e-05, + "loss": 0.01771438494324684, + "num_input_tokens_seen": 26840264, + "step": 1639, + "train_runtime": 13508.3417, + "train_tokens_per_second": 1986.94 + }, + { + "epoch": 0.45429362880886426, + "grad_norm": 0.08894100040197372, + "learning_rate": 9.968481911589011e-05, + "loss": 0.017170175909996033, + "num_input_tokens_seen": 26856640, + "step": 1640, + "train_runtime": 13516.5662, + "train_tokens_per_second": 1986.943 + }, + { + "epoch": 0.45457063711911355, + "grad_norm": 0.09896495938301086, + "learning_rate": 9.96843262019662e-05, + "loss": 0.015677323564887047, + "num_input_tokens_seen": 26873016, + "step": 1641, + "train_runtime": 13524.7887, + "train_tokens_per_second": 1986.945 + }, + { + "epoch": 0.4548476454293629, + "grad_norm": 0.08906494826078415, + "learning_rate": 9.968383290412794e-05, + "loss": 0.01650339737534523, + "num_input_tokens_seen": 26889392, + "step": 1642, + "train_runtime": 13533.0096, + "train_tokens_per_second": 1986.948 + }, + { + "epoch": 0.4551246537396122, + "grad_norm": 0.10947053134441376, + "learning_rate": 9.968333922237911e-05, + "loss": 0.020125597715377808, + "num_input_tokens_seen": 26905768, + "step": 1643, + "train_runtime": 13541.2319, + "train_tokens_per_second": 1986.951 + }, + { + "epoch": 0.4554016620498615, + "grad_norm": 0.0729607418179512, + "learning_rate": 9.968284515672358e-05, + "loss": 0.015561400912702084, + "num_input_tokens_seen": 26922144, + "step": 1644, + "train_runtime": 13549.4578, + "train_tokens_per_second": 1986.954 + }, + { + "epoch": 0.4556786703601108, + "grad_norm": 0.07974833995103836, + "learning_rate": 9.96823507071651e-05, + "loss": 0.014119237661361694, + "num_input_tokens_seen": 26938520, + "step": 1645, + "train_runtime": 13557.6808, + "train_tokens_per_second": 1986.956 + }, + { + "epoch": 0.45595567867036013, + "grad_norm": 0.16953052580356598, + "learning_rate": 9.968185587370756e-05, + "loss": 0.020756451413035393, + "num_input_tokens_seen": 26954896, + "step": 1646, + "train_runtime": 13565.9158, + "train_tokens_per_second": 1986.957 + }, + { + "epoch": 0.4562326869806094, + "grad_norm": 0.07855205237865448, + "learning_rate": 9.968136065635471e-05, + "loss": 0.016447633504867554, + "num_input_tokens_seen": 26971272, + "step": 1647, + "train_runtime": 13574.1664, + "train_tokens_per_second": 1986.956 + }, + { + "epoch": 0.4565096952908587, + "grad_norm": 0.11368906497955322, + "learning_rate": 9.968086505511045e-05, + "loss": 0.02101805806159973, + "num_input_tokens_seen": 26987648, + "step": 1648, + "train_runtime": 13582.3978, + "train_tokens_per_second": 1986.958 + }, + { + "epoch": 0.456786703601108, + "grad_norm": 0.11490623652935028, + "learning_rate": 9.968036906997855e-05, + "loss": 0.020520616322755814, + "num_input_tokens_seen": 27004024, + "step": 1649, + "train_runtime": 13590.6191, + "train_tokens_per_second": 1986.961 + }, + { + "epoch": 0.45706371191135736, + "grad_norm": 0.11628682166337967, + "learning_rate": 9.967987270096288e-05, + "loss": 0.019881678745150566, + "num_input_tokens_seen": 27020400, + "step": 1650, + "train_runtime": 13598.8575, + "train_tokens_per_second": 1986.961 + }, + { + "epoch": 0.45734072022160666, + "grad_norm": 0.09265205264091492, + "learning_rate": 9.967937594806727e-05, + "loss": 0.017463969066739082, + "num_input_tokens_seen": 27036776, + "step": 1651, + "train_runtime": 13607.0772, + "train_tokens_per_second": 1986.964 + }, + { + "epoch": 0.45761772853185595, + "grad_norm": 0.07122180610895157, + "learning_rate": 9.967887881129553e-05, + "loss": 0.017577437683939934, + "num_input_tokens_seen": 27053152, + "step": 1652, + "train_runtime": 13615.2891, + "train_tokens_per_second": 1986.969 + }, + { + "epoch": 0.45789473684210524, + "grad_norm": 0.10975942760705948, + "learning_rate": 9.967838129065153e-05, + "loss": 0.01877872832119465, + "num_input_tokens_seen": 27069528, + "step": 1653, + "train_runtime": 13623.5058, + "train_tokens_per_second": 1986.972 + }, + { + "epoch": 0.4581717451523546, + "grad_norm": 0.11718037724494934, + "learning_rate": 9.967788338613912e-05, + "loss": 0.020734446123242378, + "num_input_tokens_seen": 27085904, + "step": 1654, + "train_runtime": 13631.7198, + "train_tokens_per_second": 1986.976 + }, + { + "epoch": 0.4584487534626039, + "grad_norm": 0.11796779185533524, + "learning_rate": 9.967738509776212e-05, + "loss": 0.01693706214427948, + "num_input_tokens_seen": 27102280, + "step": 1655, + "train_runtime": 13639.9391, + "train_tokens_per_second": 1986.98 + }, + { + "epoch": 0.4587257617728532, + "grad_norm": 0.10191608220338821, + "learning_rate": 9.96768864255244e-05, + "loss": 0.017241695895791054, + "num_input_tokens_seen": 27118656, + "step": 1656, + "train_runtime": 13648.1558, + "train_tokens_per_second": 1986.983 + }, + { + "epoch": 0.4590027700831025, + "grad_norm": 0.13233573734760284, + "learning_rate": 9.967638736942981e-05, + "loss": 0.019871219992637634, + "num_input_tokens_seen": 27135032, + "step": 1657, + "train_runtime": 13656.3641, + "train_tokens_per_second": 1986.988 + }, + { + "epoch": 0.4592797783933518, + "grad_norm": 0.08449795842170715, + "learning_rate": 9.967588792948219e-05, + "loss": 0.012721863575279713, + "num_input_tokens_seen": 27151408, + "step": 1658, + "train_runtime": 13664.5756, + "train_tokens_per_second": 1986.992 + }, + { + "epoch": 0.4595567867036011, + "grad_norm": 0.10266535729169846, + "learning_rate": 9.967538810568544e-05, + "loss": 0.019538559019565582, + "num_input_tokens_seen": 27167784, + "step": 1659, + "train_runtime": 13672.7898, + "train_tokens_per_second": 1986.996 + }, + { + "epoch": 0.4598337950138504, + "grad_norm": 0.10234329849481583, + "learning_rate": 9.967488789804337e-05, + "loss": 0.018116768449544907, + "num_input_tokens_seen": 27184160, + "step": 1660, + "train_runtime": 13681.0147, + "train_tokens_per_second": 1986.999 + }, + { + "epoch": 0.4601108033240997, + "grad_norm": 0.1054227352142334, + "learning_rate": 9.967438730655989e-05, + "loss": 0.019134018570184708, + "num_input_tokens_seen": 27200536, + "step": 1661, + "train_runtime": 13689.242, + "train_tokens_per_second": 1987.001 + }, + { + "epoch": 0.46038781163434905, + "grad_norm": 0.1207452267408371, + "learning_rate": 9.967388633123884e-05, + "loss": 0.018375808373093605, + "num_input_tokens_seen": 27216912, + "step": 1662, + "train_runtime": 13697.481, + "train_tokens_per_second": 1987.001 + }, + { + "epoch": 0.46066481994459835, + "grad_norm": 0.07500681281089783, + "learning_rate": 9.96733849720841e-05, + "loss": 0.01764960028231144, + "num_input_tokens_seen": 27233288, + "step": 1663, + "train_runtime": 13705.7148, + "train_tokens_per_second": 1987.002 + }, + { + "epoch": 0.46094182825484764, + "grad_norm": 0.08041887730360031, + "learning_rate": 9.967288322909953e-05, + "loss": 0.019199656322598457, + "num_input_tokens_seen": 27249664, + "step": 1664, + "train_runtime": 13713.9388, + "train_tokens_per_second": 1987.005 + }, + { + "epoch": 0.46121883656509693, + "grad_norm": 0.10707356035709381, + "learning_rate": 9.967238110228905e-05, + "loss": 0.018280817195773125, + "num_input_tokens_seen": 27266040, + "step": 1665, + "train_runtime": 13722.1641, + "train_tokens_per_second": 1987.007 + }, + { + "epoch": 0.4614958448753463, + "grad_norm": 0.11938481777906418, + "learning_rate": 9.967187859165649e-05, + "loss": 0.01710333302617073, + "num_input_tokens_seen": 27282416, + "step": 1666, + "train_runtime": 13730.3972, + "train_tokens_per_second": 1987.009 + }, + { + "epoch": 0.4617728531855956, + "grad_norm": 0.12434875965118408, + "learning_rate": 9.967137569720576e-05, + "loss": 0.0190595630556345, + "num_input_tokens_seen": 27298792, + "step": 1667, + "train_runtime": 13738.6171, + "train_tokens_per_second": 1987.012 + }, + { + "epoch": 0.46204986149584487, + "grad_norm": 0.15117813646793365, + "learning_rate": 9.967087241894073e-05, + "loss": 0.02208014763891697, + "num_input_tokens_seen": 27315168, + "step": 1668, + "train_runtime": 13746.858, + "train_tokens_per_second": 1987.012 + }, + { + "epoch": 0.46232686980609417, + "grad_norm": 0.16033653914928436, + "learning_rate": 9.967036875686532e-05, + "loss": 0.021204203367233276, + "num_input_tokens_seen": 27331544, + "step": 1669, + "train_runtime": 13755.0914, + "train_tokens_per_second": 1987.013 + }, + { + "epoch": 0.4626038781163435, + "grad_norm": 0.09504067897796631, + "learning_rate": 9.966986471098338e-05, + "loss": 0.01807369850575924, + "num_input_tokens_seen": 27347920, + "step": 1670, + "train_runtime": 13763.3162, + "train_tokens_per_second": 1987.015 + }, + { + "epoch": 0.4628808864265928, + "grad_norm": 0.0924244374036789, + "learning_rate": 9.966936028129882e-05, + "loss": 0.01454536896198988, + "num_input_tokens_seen": 27364296, + "step": 1671, + "train_runtime": 13771.5272, + "train_tokens_per_second": 1987.02 + }, + { + "epoch": 0.4631578947368421, + "grad_norm": 0.11073523014783859, + "learning_rate": 9.966885546781557e-05, + "loss": 0.015877844765782356, + "num_input_tokens_seen": 27380672, + "step": 1672, + "train_runtime": 13779.7565, + "train_tokens_per_second": 1987.021 + }, + { + "epoch": 0.4634349030470914, + "grad_norm": 0.10145188868045807, + "learning_rate": 9.966835027053749e-05, + "loss": 0.022697243839502335, + "num_input_tokens_seen": 27397048, + "step": 1673, + "train_runtime": 13787.9845, + "train_tokens_per_second": 1987.023 + }, + { + "epoch": 0.46371191135734074, + "grad_norm": 0.1048620194196701, + "learning_rate": 9.96678446894685e-05, + "loss": 0.018016302958130836, + "num_input_tokens_seen": 27413424, + "step": 1674, + "train_runtime": 13796.2063, + "train_tokens_per_second": 1987.026 + }, + { + "epoch": 0.46398891966759004, + "grad_norm": 0.1141623705625534, + "learning_rate": 9.96673387246125e-05, + "loss": 0.01892116479575634, + "num_input_tokens_seen": 27429800, + "step": 1675, + "train_runtime": 13804.4557, + "train_tokens_per_second": 1987.025 + }, + { + "epoch": 0.46426592797783933, + "grad_norm": 0.11812768876552582, + "learning_rate": 9.966683237597341e-05, + "loss": 0.016917023807764053, + "num_input_tokens_seen": 27446176, + "step": 1676, + "train_runtime": 13812.6908, + "train_tokens_per_second": 1987.026 + }, + { + "epoch": 0.4645429362880886, + "grad_norm": 0.09408872574567795, + "learning_rate": 9.966632564355514e-05, + "loss": 0.021035699173808098, + "num_input_tokens_seen": 27462552, + "step": 1677, + "train_runtime": 13820.9263, + "train_tokens_per_second": 1987.027 + }, + { + "epoch": 0.464819944598338, + "grad_norm": 0.15043295919895172, + "learning_rate": 9.966581852736159e-05, + "loss": 0.020646410062909126, + "num_input_tokens_seen": 27478928, + "step": 1678, + "train_runtime": 13829.1714, + "train_tokens_per_second": 1987.026 + }, + { + "epoch": 0.46509695290858727, + "grad_norm": 0.11620227247476578, + "learning_rate": 9.966531102739668e-05, + "loss": 0.022586453706026077, + "num_input_tokens_seen": 27495304, + "step": 1679, + "train_runtime": 13837.4197, + "train_tokens_per_second": 1987.025 + }, + { + "epoch": 0.46537396121883656, + "grad_norm": 0.1099284291267395, + "learning_rate": 9.966480314366435e-05, + "loss": 0.017483215779066086, + "num_input_tokens_seen": 27511680, + "step": 1680, + "train_runtime": 13845.6588, + "train_tokens_per_second": 1987.026 + }, + { + "epoch": 0.46565096952908586, + "grad_norm": 0.09789589792490005, + "learning_rate": 9.966429487616853e-05, + "loss": 0.015302571468055248, + "num_input_tokens_seen": 27528056, + "step": 1681, + "train_runtime": 13853.8958, + "train_tokens_per_second": 1987.026 + }, + { + "epoch": 0.4659279778393352, + "grad_norm": 0.08048471063375473, + "learning_rate": 9.966378622491312e-05, + "loss": 0.016960307955741882, + "num_input_tokens_seen": 27544432, + "step": 1682, + "train_runtime": 13862.1306, + "train_tokens_per_second": 1987.027 + }, + { + "epoch": 0.4662049861495845, + "grad_norm": 0.13261066377162933, + "learning_rate": 9.966327718990206e-05, + "loss": 0.019554832950234413, + "num_input_tokens_seen": 27560808, + "step": 1683, + "train_runtime": 13870.3643, + "train_tokens_per_second": 1987.028 + }, + { + "epoch": 0.4664819944598338, + "grad_norm": 0.11481089144945145, + "learning_rate": 9.966276777113932e-05, + "loss": 0.02115415595471859, + "num_input_tokens_seen": 27577184, + "step": 1684, + "train_runtime": 13878.601, + "train_tokens_per_second": 1987.029 + }, + { + "epoch": 0.4667590027700831, + "grad_norm": 0.10510184615850449, + "learning_rate": 9.966225796862878e-05, + "loss": 0.016269907355308533, + "num_input_tokens_seen": 27593560, + "step": 1685, + "train_runtime": 13886.8563, + "train_tokens_per_second": 1987.027 + }, + { + "epoch": 0.46703601108033244, + "grad_norm": 0.09564338624477386, + "learning_rate": 9.96617477823744e-05, + "loss": 0.019533362239599228, + "num_input_tokens_seen": 27609936, + "step": 1686, + "train_runtime": 13895.0773, + "train_tokens_per_second": 1987.03 + }, + { + "epoch": 0.46731301939058173, + "grad_norm": 0.14080457389354706, + "learning_rate": 9.966123721238013e-05, + "loss": 0.020905762910842896, + "num_input_tokens_seen": 27626312, + "step": 1687, + "train_runtime": 13903.2942, + "train_tokens_per_second": 1987.034 + }, + { + "epoch": 0.467590027700831, + "grad_norm": 0.08564060181379318, + "learning_rate": 9.96607262586499e-05, + "loss": 0.015510285273194313, + "num_input_tokens_seen": 27642688, + "step": 1688, + "train_runtime": 13911.5115, + "train_tokens_per_second": 1987.037 + }, + { + "epoch": 0.4678670360110803, + "grad_norm": 0.08716385066509247, + "learning_rate": 9.966021492118769e-05, + "loss": 0.014042847789824009, + "num_input_tokens_seen": 27659064, + "step": 1689, + "train_runtime": 13919.7266, + "train_tokens_per_second": 1987.041 + }, + { + "epoch": 0.46814404432132967, + "grad_norm": 0.11228643357753754, + "learning_rate": 9.965970319999742e-05, + "loss": 0.0174108874052763, + "num_input_tokens_seen": 27675440, + "step": 1690, + "train_runtime": 13927.9442, + "train_tokens_per_second": 1987.044 + }, + { + "epoch": 0.46842105263157896, + "grad_norm": 0.10028811544179916, + "learning_rate": 9.965919109508305e-05, + "loss": 0.017554279416799545, + "num_input_tokens_seen": 27691816, + "step": 1691, + "train_runtime": 13936.1595, + "train_tokens_per_second": 1987.048 + }, + { + "epoch": 0.46869806094182825, + "grad_norm": 0.07679687440395355, + "learning_rate": 9.965867860644854e-05, + "loss": 0.01701902411878109, + "num_input_tokens_seen": 27708192, + "step": 1692, + "train_runtime": 13944.3899, + "train_tokens_per_second": 1987.049 + }, + { + "epoch": 0.46897506925207755, + "grad_norm": 0.1809258908033371, + "learning_rate": 9.965816573409785e-05, + "loss": 0.019352030009031296, + "num_input_tokens_seen": 27724568, + "step": 1693, + "train_runtime": 13952.6343, + "train_tokens_per_second": 1987.049 + }, + { + "epoch": 0.4692520775623269, + "grad_norm": 0.10434333235025406, + "learning_rate": 9.965765247803495e-05, + "loss": 0.015490951016545296, + "num_input_tokens_seen": 27740944, + "step": 1694, + "train_runtime": 13960.8736, + "train_tokens_per_second": 1987.049 + }, + { + "epoch": 0.4695290858725762, + "grad_norm": 0.10012761503458023, + "learning_rate": 9.96571388382638e-05, + "loss": 0.018265072256326675, + "num_input_tokens_seen": 27757320, + "step": 1695, + "train_runtime": 13969.0938, + "train_tokens_per_second": 1987.052 + }, + { + "epoch": 0.4698060941828255, + "grad_norm": 0.10309416800737381, + "learning_rate": 9.965662481478836e-05, + "loss": 0.01607505977153778, + "num_input_tokens_seen": 27773696, + "step": 1696, + "train_runtime": 13977.3078, + "train_tokens_per_second": 1987.056 + }, + { + "epoch": 0.4700831024930748, + "grad_norm": 0.08073077350854874, + "learning_rate": 9.965611040761263e-05, + "loss": 0.015735311433672905, + "num_input_tokens_seen": 27790072, + "step": 1697, + "train_runtime": 13985.5217, + "train_tokens_per_second": 1987.06 + }, + { + "epoch": 0.4703601108033241, + "grad_norm": 0.09027353674173355, + "learning_rate": 9.965559561674056e-05, + "loss": 0.020143909379839897, + "num_input_tokens_seen": 27806448, + "step": 1698, + "train_runtime": 13993.7552, + "train_tokens_per_second": 1987.061 + }, + { + "epoch": 0.4706371191135734, + "grad_norm": 0.0939023420214653, + "learning_rate": 9.965508044217613e-05, + "loss": 0.017351476475596428, + "num_input_tokens_seen": 27822824, + "step": 1699, + "train_runtime": 14001.9935, + "train_tokens_per_second": 1987.062 + }, + { + "epoch": 0.4709141274238227, + "grad_norm": 0.13391084969043732, + "learning_rate": 9.965456488392331e-05, + "loss": 0.0181913822889328, + "num_input_tokens_seen": 27839200, + "step": 1700, + "train_runtime": 14010.2292, + "train_tokens_per_second": 1987.062 + }, + { + "epoch": 0.471191135734072, + "grad_norm": 0.09825117886066437, + "learning_rate": 9.965404894198612e-05, + "loss": 0.014036037027835846, + "num_input_tokens_seen": 27855576, + "step": 1701, + "train_runtime": 14019.9268, + "train_tokens_per_second": 1986.856 + }, + { + "epoch": 0.4714681440443213, + "grad_norm": 0.08485151827335358, + "learning_rate": 9.965353261636852e-05, + "loss": 0.01684107445180416, + "num_input_tokens_seen": 27871952, + "step": 1702, + "train_runtime": 14028.1389, + "train_tokens_per_second": 1986.86 + }, + { + "epoch": 0.47174515235457065, + "grad_norm": 0.07265803217887878, + "learning_rate": 9.965301590707449e-05, + "loss": 0.015917520970106125, + "num_input_tokens_seen": 27888328, + "step": 1703, + "train_runtime": 14036.3678, + "train_tokens_per_second": 1986.862 + }, + { + "epoch": 0.47202216066481995, + "grad_norm": 0.09551875293254852, + "learning_rate": 9.965249881410805e-05, + "loss": 0.014300229959189892, + "num_input_tokens_seen": 27904704, + "step": 1704, + "train_runtime": 14044.6107, + "train_tokens_per_second": 1986.862 + }, + { + "epoch": 0.47229916897506924, + "grad_norm": 0.12279205769300461, + "learning_rate": 9.965198133747318e-05, + "loss": 0.0205849502235651, + "num_input_tokens_seen": 27921080, + "step": 1705, + "train_runtime": 14052.8341, + "train_tokens_per_second": 1986.865 + }, + { + "epoch": 0.47257617728531853, + "grad_norm": 0.1416158527135849, + "learning_rate": 9.965146347717386e-05, + "loss": 0.01884635165333748, + "num_input_tokens_seen": 27937456, + "step": 1706, + "train_runtime": 14061.0652, + "train_tokens_per_second": 1986.866 + }, + { + "epoch": 0.4728531855955679, + "grad_norm": 0.10546586662530899, + "learning_rate": 9.965094523321414e-05, + "loss": 0.015435227192938328, + "num_input_tokens_seen": 27953832, + "step": 1707, + "train_runtime": 14069.2928, + "train_tokens_per_second": 1986.868 + }, + { + "epoch": 0.4731301939058172, + "grad_norm": 0.10201351344585419, + "learning_rate": 9.9650426605598e-05, + "loss": 0.015448454767465591, + "num_input_tokens_seen": 27970208, + "step": 1708, + "train_runtime": 14077.5155, + "train_tokens_per_second": 1986.871 + }, + { + "epoch": 0.47340720221606647, + "grad_norm": 0.0933891162276268, + "learning_rate": 9.964990759432944e-05, + "loss": 0.01702006533741951, + "num_input_tokens_seen": 27986584, + "step": 1709, + "train_runtime": 14085.7449, + "train_tokens_per_second": 1986.873 + }, + { + "epoch": 0.47368421052631576, + "grad_norm": 0.07841524481773376, + "learning_rate": 9.964938819941247e-05, + "loss": 0.015039760619401932, + "num_input_tokens_seen": 28002960, + "step": 1710, + "train_runtime": 14093.9762, + "train_tokens_per_second": 1986.874 + }, + { + "epoch": 0.4739612188365651, + "grad_norm": 0.09667322039604187, + "learning_rate": 9.96488684208511e-05, + "loss": 0.023432711139321327, + "num_input_tokens_seen": 28019336, + "step": 1711, + "train_runtime": 14102.2088, + "train_tokens_per_second": 1986.876 + }, + { + "epoch": 0.4742382271468144, + "grad_norm": 0.11050653457641602, + "learning_rate": 9.964834825864937e-05, + "loss": 0.017128070816397667, + "num_input_tokens_seen": 28035712, + "step": 1712, + "train_runtime": 14110.4293, + "train_tokens_per_second": 1986.879 + }, + { + "epoch": 0.4745152354570637, + "grad_norm": 0.07871145009994507, + "learning_rate": 9.964782771281127e-05, + "loss": 0.021866997703909874, + "num_input_tokens_seen": 28052088, + "step": 1713, + "train_runtime": 14118.6459, + "train_tokens_per_second": 1986.882 + }, + { + "epoch": 0.474792243767313, + "grad_norm": 0.10300387442111969, + "learning_rate": 9.964730678334085e-05, + "loss": 0.020077181980013847, + "num_input_tokens_seen": 28068464, + "step": 1714, + "train_runtime": 14126.8587, + "train_tokens_per_second": 1986.886 + }, + { + "epoch": 0.47506925207756234, + "grad_norm": 0.1608133167028427, + "learning_rate": 9.964678547024213e-05, + "loss": 0.01600331999361515, + "num_input_tokens_seen": 28084840, + "step": 1715, + "train_runtime": 14135.0741, + "train_tokens_per_second": 1986.89 + }, + { + "epoch": 0.47534626038781164, + "grad_norm": 0.07432222366333008, + "learning_rate": 9.964626377351911e-05, + "loss": 0.01708264835178852, + "num_input_tokens_seen": 28101216, + "step": 1716, + "train_runtime": 14143.289, + "train_tokens_per_second": 1986.894 + }, + { + "epoch": 0.47562326869806093, + "grad_norm": 0.10785902291536331, + "learning_rate": 9.964574169317584e-05, + "loss": 0.01764124631881714, + "num_input_tokens_seen": 28117592, + "step": 1717, + "train_runtime": 14151.5042, + "train_tokens_per_second": 1986.898 + }, + { + "epoch": 0.4759002770083102, + "grad_norm": 0.07810298353433609, + "learning_rate": 9.964521922921636e-05, + "loss": 0.01524444855749607, + "num_input_tokens_seen": 28133968, + "step": 1718, + "train_runtime": 14159.7141, + "train_tokens_per_second": 1986.902 + }, + { + "epoch": 0.4761772853185596, + "grad_norm": 0.1088566780090332, + "learning_rate": 9.964469638164472e-05, + "loss": 0.019766077399253845, + "num_input_tokens_seen": 28150344, + "step": 1719, + "train_runtime": 14167.9297, + "train_tokens_per_second": 1986.906 + }, + { + "epoch": 0.47645429362880887, + "grad_norm": 0.13216689229011536, + "learning_rate": 9.964417315046492e-05, + "loss": 0.01776941306889057, + "num_input_tokens_seen": 28166720, + "step": 1720, + "train_runtime": 14176.1467, + "train_tokens_per_second": 1986.909 + }, + { + "epoch": 0.47673130193905816, + "grad_norm": 0.12866583466529846, + "learning_rate": 9.964364953568104e-05, + "loss": 0.015630587935447693, + "num_input_tokens_seen": 28183096, + "step": 1721, + "train_runtime": 14184.3667, + "train_tokens_per_second": 1986.913 + }, + { + "epoch": 0.47700831024930745, + "grad_norm": 0.10381344705820084, + "learning_rate": 9.96431255372971e-05, + "loss": 0.016472993418574333, + "num_input_tokens_seen": 28199472, + "step": 1722, + "train_runtime": 14192.595, + "train_tokens_per_second": 1986.914 + }, + { + "epoch": 0.4772853185595568, + "grad_norm": 0.08619578182697296, + "learning_rate": 9.964260115531718e-05, + "loss": 0.01750616356730461, + "num_input_tokens_seen": 28215848, + "step": 1723, + "train_runtime": 14200.8098, + "train_tokens_per_second": 1986.918 + }, + { + "epoch": 0.4775623268698061, + "grad_norm": 0.1539287120103836, + "learning_rate": 9.96420763897453e-05, + "loss": 0.02296973578631878, + "num_input_tokens_seen": 28232224, + "step": 1724, + "train_runtime": 14209.0276, + "train_tokens_per_second": 1986.922 + }, + { + "epoch": 0.4778393351800554, + "grad_norm": 0.13359110057353973, + "learning_rate": 9.964155124058554e-05, + "loss": 0.0197739414870739, + "num_input_tokens_seen": 28248600, + "step": 1725, + "train_runtime": 14217.2439, + "train_tokens_per_second": 1986.925 + }, + { + "epoch": 0.4781163434903047, + "grad_norm": 0.08665718883275986, + "learning_rate": 9.964102570784193e-05, + "loss": 0.014194546267390251, + "num_input_tokens_seen": 28264976, + "step": 1726, + "train_runtime": 14225.4835, + "train_tokens_per_second": 1986.926 + }, + { + "epoch": 0.47839335180055403, + "grad_norm": 0.10977083444595337, + "learning_rate": 9.964049979151854e-05, + "loss": 0.015510530211031437, + "num_input_tokens_seen": 28281352, + "step": 1727, + "train_runtime": 14233.7168, + "train_tokens_per_second": 1986.927 + }, + { + "epoch": 0.47867036011080333, + "grad_norm": 0.11334510892629623, + "learning_rate": 9.963997349161945e-05, + "loss": 0.019948994740843773, + "num_input_tokens_seen": 28297728, + "step": 1728, + "train_runtime": 14241.9361, + "train_tokens_per_second": 1986.93 + }, + { + "epoch": 0.4789473684210526, + "grad_norm": 0.1133597195148468, + "learning_rate": 9.963944680814872e-05, + "loss": 0.019952207803726196, + "num_input_tokens_seen": 28314104, + "step": 1729, + "train_runtime": 14250.1552, + "train_tokens_per_second": 1986.933 + }, + { + "epoch": 0.4792243767313019, + "grad_norm": 0.10390126705169678, + "learning_rate": 9.963891974111042e-05, + "loss": 0.01578519679605961, + "num_input_tokens_seen": 28330480, + "step": 1730, + "train_runtime": 14258.3844, + "train_tokens_per_second": 1986.935 + }, + { + "epoch": 0.47950138504155126, + "grad_norm": 0.058603715151548386, + "learning_rate": 9.963839229050861e-05, + "loss": 0.016209399327635765, + "num_input_tokens_seen": 28346856, + "step": 1731, + "train_runtime": 14266.6168, + "train_tokens_per_second": 1986.936 + }, + { + "epoch": 0.47977839335180056, + "grad_norm": 0.09344319254159927, + "learning_rate": 9.963786445634737e-05, + "loss": 0.016988856717944145, + "num_input_tokens_seen": 28363232, + "step": 1732, + "train_runtime": 14274.8364, + "train_tokens_per_second": 1986.939 + }, + { + "epoch": 0.48005540166204985, + "grad_norm": 0.16866588592529297, + "learning_rate": 9.96373362386308e-05, + "loss": 0.021242544054985046, + "num_input_tokens_seen": 28379608, + "step": 1733, + "train_runtime": 14283.0669, + "train_tokens_per_second": 1986.941 + }, + { + "epoch": 0.48033240997229915, + "grad_norm": 0.10322989523410797, + "learning_rate": 9.963680763736296e-05, + "loss": 0.017101280391216278, + "num_input_tokens_seen": 28395984, + "step": 1734, + "train_runtime": 14291.2811, + "train_tokens_per_second": 1986.945 + }, + { + "epoch": 0.4806094182825485, + "grad_norm": 0.11479680985212326, + "learning_rate": 9.963627865254793e-05, + "loss": 0.01861853152513504, + "num_input_tokens_seen": 28412360, + "step": 1735, + "train_runtime": 14299.4996, + "train_tokens_per_second": 1986.948 + }, + { + "epoch": 0.4808864265927978, + "grad_norm": 0.11088168621063232, + "learning_rate": 9.963574928418982e-05, + "loss": 0.016754938289523125, + "num_input_tokens_seen": 28428736, + "step": 1736, + "train_runtime": 14307.7269, + "train_tokens_per_second": 1986.95 + }, + { + "epoch": 0.4811634349030471, + "grad_norm": 0.11306315660476685, + "learning_rate": 9.963521953229268e-05, + "loss": 0.01867087185382843, + "num_input_tokens_seen": 28445112, + "step": 1737, + "train_runtime": 14315.9545, + "train_tokens_per_second": 1986.952 + }, + { + "epoch": 0.4814404432132964, + "grad_norm": 0.15642108023166656, + "learning_rate": 9.963468939686067e-05, + "loss": 0.0231150072067976, + "num_input_tokens_seen": 28461488, + "step": 1738, + "train_runtime": 14324.1681, + "train_tokens_per_second": 1986.956 + }, + { + "epoch": 0.4817174515235457, + "grad_norm": 0.09746941179037094, + "learning_rate": 9.963415887789782e-05, + "loss": 0.02019793540239334, + "num_input_tokens_seen": 28477864, + "step": 1739, + "train_runtime": 14332.3868, + "train_tokens_per_second": 1986.959 + }, + { + "epoch": 0.481994459833795, + "grad_norm": 0.1341215819120407, + "learning_rate": 9.963362797540827e-05, + "loss": 0.016127096489071846, + "num_input_tokens_seen": 28494240, + "step": 1740, + "train_runtime": 14340.6028, + "train_tokens_per_second": 1986.962 + }, + { + "epoch": 0.4822714681440443, + "grad_norm": 0.12085568159818649, + "learning_rate": 9.963309668939611e-05, + "loss": 0.01864861138164997, + "num_input_tokens_seen": 28510616, + "step": 1741, + "train_runtime": 14348.8213, + "train_tokens_per_second": 1986.966 + }, + { + "epoch": 0.4825484764542936, + "grad_norm": 0.1001528799533844, + "learning_rate": 9.963256501986545e-05, + "loss": 0.017599837854504585, + "num_input_tokens_seen": 28526992, + "step": 1742, + "train_runtime": 14357.0379, + "train_tokens_per_second": 1986.969 + }, + { + "epoch": 0.48282548476454296, + "grad_norm": 0.0915159210562706, + "learning_rate": 9.963203296682037e-05, + "loss": 0.016721142455935478, + "num_input_tokens_seen": 28543368, + "step": 1743, + "train_runtime": 14365.2715, + "train_tokens_per_second": 1986.97 + }, + { + "epoch": 0.48310249307479225, + "grad_norm": 0.11745981872081757, + "learning_rate": 9.963150053026502e-05, + "loss": 0.018629197031259537, + "num_input_tokens_seen": 28559744, + "step": 1744, + "train_runtime": 14373.5096, + "train_tokens_per_second": 1986.971 + }, + { + "epoch": 0.48337950138504154, + "grad_norm": 0.09589648991823196, + "learning_rate": 9.963096771020348e-05, + "loss": 0.017227934673428535, + "num_input_tokens_seen": 28576120, + "step": 1745, + "train_runtime": 14381.7404, + "train_tokens_per_second": 1986.972 + }, + { + "epoch": 0.48365650969529084, + "grad_norm": 0.11419370770454407, + "learning_rate": 9.963043450663992e-05, + "loss": 0.02014348655939102, + "num_input_tokens_seen": 28592496, + "step": 1746, + "train_runtime": 14389.964, + "train_tokens_per_second": 1986.975 + }, + { + "epoch": 0.4839335180055402, + "grad_norm": 0.10095401853322983, + "learning_rate": 9.962990091957839e-05, + "loss": 0.016133155673742294, + "num_input_tokens_seen": 28608872, + "step": 1747, + "train_runtime": 14398.1876, + "train_tokens_per_second": 1986.977 + }, + { + "epoch": 0.4842105263157895, + "grad_norm": 0.11471909284591675, + "learning_rate": 9.962936694902307e-05, + "loss": 0.015766242519021034, + "num_input_tokens_seen": 28625248, + "step": 1748, + "train_runtime": 14406.4059, + "train_tokens_per_second": 1986.981 + }, + { + "epoch": 0.4844875346260388, + "grad_norm": 0.08076173812150955, + "learning_rate": 9.962883259497804e-05, + "loss": 0.015744345262646675, + "num_input_tokens_seen": 28641624, + "step": 1749, + "train_runtime": 14414.6279, + "train_tokens_per_second": 1986.983 + }, + { + "epoch": 0.48476454293628807, + "grad_norm": 0.11013410985469818, + "learning_rate": 9.962829785744748e-05, + "loss": 0.0152151919901371, + "num_input_tokens_seen": 28658000, + "step": 1750, + "train_runtime": 14422.8634, + "train_tokens_per_second": 1986.984 + }, + { + "epoch": 0.4850415512465374, + "grad_norm": 0.07041093707084656, + "learning_rate": 9.962776273643548e-05, + "loss": 0.017541132867336273, + "num_input_tokens_seen": 28674376, + "step": 1751, + "train_runtime": 14431.0941, + "train_tokens_per_second": 1986.986 + }, + { + "epoch": 0.4853185595567867, + "grad_norm": 0.10444201529026031, + "learning_rate": 9.962722723194619e-05, + "loss": 0.02010366879403591, + "num_input_tokens_seen": 28690752, + "step": 1752, + "train_runtime": 14439.3216, + "train_tokens_per_second": 1986.988 + }, + { + "epoch": 0.485595567867036, + "grad_norm": 0.11461137980222702, + "learning_rate": 9.962669134398377e-05, + "loss": 0.01992829516530037, + "num_input_tokens_seen": 28707128, + "step": 1753, + "train_runtime": 14447.5447, + "train_tokens_per_second": 1986.99 + }, + { + "epoch": 0.4858725761772853, + "grad_norm": 0.09594380855560303, + "learning_rate": 9.96261550725523e-05, + "loss": 0.017191117629408836, + "num_input_tokens_seen": 28723504, + "step": 1754, + "train_runtime": 14455.7761, + "train_tokens_per_second": 1986.991 + }, + { + "epoch": 0.48614958448753465, + "grad_norm": 0.10380503535270691, + "learning_rate": 9.9625618417656e-05, + "loss": 0.017789579927921295, + "num_input_tokens_seen": 28739880, + "step": 1755, + "train_runtime": 14464.0047, + "train_tokens_per_second": 1986.993 + }, + { + "epoch": 0.48642659279778394, + "grad_norm": 0.09908714145421982, + "learning_rate": 9.962508137929897e-05, + "loss": 0.019406652078032494, + "num_input_tokens_seen": 28756256, + "step": 1756, + "train_runtime": 14472.2357, + "train_tokens_per_second": 1986.995 + }, + { + "epoch": 0.48670360110803323, + "grad_norm": 0.08928915858268738, + "learning_rate": 9.962454395748536e-05, + "loss": 0.0207972452044487, + "num_input_tokens_seen": 28772632, + "step": 1757, + "train_runtime": 14480.4452, + "train_tokens_per_second": 1986.999 + }, + { + "epoch": 0.48698060941828253, + "grad_norm": 0.10347535461187363, + "learning_rate": 9.962400615221934e-05, + "loss": 0.01804092526435852, + "num_input_tokens_seen": 28789008, + "step": 1758, + "train_runtime": 14488.656, + "train_tokens_per_second": 1987.003 + }, + { + "epoch": 0.4872576177285319, + "grad_norm": 0.07383065670728683, + "learning_rate": 9.962346796350504e-05, + "loss": 0.019416365772485733, + "num_input_tokens_seen": 28805384, + "step": 1759, + "train_runtime": 14496.87, + "train_tokens_per_second": 1987.007 + }, + { + "epoch": 0.48753462603878117, + "grad_norm": 0.12035747617483139, + "learning_rate": 9.962292939134665e-05, + "loss": 0.0222672987729311, + "num_input_tokens_seen": 28821760, + "step": 1760, + "train_runtime": 14505.0999, + "train_tokens_per_second": 1987.009 + }, + { + "epoch": 0.48781163434903047, + "grad_norm": 0.1466553658246994, + "learning_rate": 9.962239043574832e-05, + "loss": 0.02620314620435238, + "num_input_tokens_seen": 28838136, + "step": 1761, + "train_runtime": 14513.3386, + "train_tokens_per_second": 1987.009 + }, + { + "epoch": 0.48808864265927976, + "grad_norm": 0.10744550079107285, + "learning_rate": 9.962185109671421e-05, + "loss": 0.019427409395575523, + "num_input_tokens_seen": 28854512, + "step": 1762, + "train_runtime": 14521.5756, + "train_tokens_per_second": 1987.01 + }, + { + "epoch": 0.4883656509695291, + "grad_norm": 0.08561361581087112, + "learning_rate": 9.962131137424849e-05, + "loss": 0.01952640898525715, + "num_input_tokens_seen": 28870888, + "step": 1763, + "train_runtime": 14529.7996, + "train_tokens_per_second": 1987.012 + }, + { + "epoch": 0.4886426592797784, + "grad_norm": 0.08777152746915817, + "learning_rate": 9.962077126835535e-05, + "loss": 0.016802703961730003, + "num_input_tokens_seen": 28887264, + "step": 1764, + "train_runtime": 14538.0185, + "train_tokens_per_second": 1987.015 + }, + { + "epoch": 0.4889196675900277, + "grad_norm": 0.0918930172920227, + "learning_rate": 9.96202307790389e-05, + "loss": 0.019679520279169083, + "num_input_tokens_seen": 28903640, + "step": 1765, + "train_runtime": 14546.2327, + "train_tokens_per_second": 1987.019 + }, + { + "epoch": 0.489196675900277, + "grad_norm": 0.07176651805639267, + "learning_rate": 9.961968990630339e-05, + "loss": 0.014115508645772934, + "num_input_tokens_seen": 28920016, + "step": 1766, + "train_runtime": 14554.4434, + "train_tokens_per_second": 1987.023 + }, + { + "epoch": 0.48947368421052634, + "grad_norm": 0.1134735494852066, + "learning_rate": 9.961914865015296e-05, + "loss": 0.015874730423092842, + "num_input_tokens_seen": 28936392, + "step": 1767, + "train_runtime": 14562.6648, + "train_tokens_per_second": 1987.026 + }, + { + "epoch": 0.48975069252077563, + "grad_norm": 0.08089867234230042, + "learning_rate": 9.961860701059181e-05, + "loss": 0.01937638409435749, + "num_input_tokens_seen": 28952768, + "step": 1768, + "train_runtime": 14570.8949, + "train_tokens_per_second": 1987.027 + }, + { + "epoch": 0.4900277008310249, + "grad_norm": 0.07894433289766312, + "learning_rate": 9.961806498762412e-05, + "loss": 0.015878263860940933, + "num_input_tokens_seen": 28969144, + "step": 1769, + "train_runtime": 14579.1206, + "train_tokens_per_second": 1987.03 + }, + { + "epoch": 0.4903047091412742, + "grad_norm": 0.08942706882953644, + "learning_rate": 9.961752258125406e-05, + "loss": 0.018298525363206863, + "num_input_tokens_seen": 28985520, + "step": 1770, + "train_runtime": 14587.361, + "train_tokens_per_second": 1987.03 + }, + { + "epoch": 0.49058171745152357, + "grad_norm": 0.0839257687330246, + "learning_rate": 9.961697979148585e-05, + "loss": 0.018959974870085716, + "num_input_tokens_seen": 29001896, + "step": 1771, + "train_runtime": 14595.6108, + "train_tokens_per_second": 1987.029 + }, + { + "epoch": 0.49085872576177286, + "grad_norm": 0.07137042284011841, + "learning_rate": 9.961643661832367e-05, + "loss": 0.016180917620658875, + "num_input_tokens_seen": 29018272, + "step": 1772, + "train_runtime": 14603.8598, + "train_tokens_per_second": 1987.028 + }, + { + "epoch": 0.49113573407202216, + "grad_norm": 0.13406138122081757, + "learning_rate": 9.96158930617717e-05, + "loss": 0.020685266703367233, + "num_input_tokens_seen": 29034648, + "step": 1773, + "train_runtime": 14612.0949, + "train_tokens_per_second": 1987.028 + }, + { + "epoch": 0.49141274238227145, + "grad_norm": 0.11311352252960205, + "learning_rate": 9.961534912183417e-05, + "loss": 0.014083188027143478, + "num_input_tokens_seen": 29051024, + "step": 1774, + "train_runtime": 14620.3176, + "train_tokens_per_second": 1987.031 + }, + { + "epoch": 0.4916897506925208, + "grad_norm": 0.08325134217739105, + "learning_rate": 9.961480479851528e-05, + "loss": 0.015964262187480927, + "num_input_tokens_seen": 29067400, + "step": 1775, + "train_runtime": 14628.5464, + "train_tokens_per_second": 1987.033 + }, + { + "epoch": 0.4919667590027701, + "grad_norm": 0.07112376391887665, + "learning_rate": 9.961426009181923e-05, + "loss": 0.01636013388633728, + "num_input_tokens_seen": 29083776, + "step": 1776, + "train_runtime": 14636.7854, + "train_tokens_per_second": 1987.033 + }, + { + "epoch": 0.4922437673130194, + "grad_norm": 0.09669315069913864, + "learning_rate": 9.961371500175021e-05, + "loss": 0.018796579912304878, + "num_input_tokens_seen": 29100152, + "step": 1777, + "train_runtime": 14645.0291, + "train_tokens_per_second": 1987.033 + }, + { + "epoch": 0.4925207756232687, + "grad_norm": 0.07905727624893188, + "learning_rate": 9.961316952831246e-05, + "loss": 0.017402052879333496, + "num_input_tokens_seen": 29116528, + "step": 1778, + "train_runtime": 14653.269, + "train_tokens_per_second": 1987.033 + }, + { + "epoch": 0.49279778393351803, + "grad_norm": 0.08676508069038391, + "learning_rate": 9.961262367151017e-05, + "loss": 0.01920703426003456, + "num_input_tokens_seen": 29132904, + "step": 1779, + "train_runtime": 14661.5009, + "train_tokens_per_second": 1987.034 + }, + { + "epoch": 0.4930747922437673, + "grad_norm": 0.0847424864768982, + "learning_rate": 9.961207743134757e-05, + "loss": 0.019628673791885376, + "num_input_tokens_seen": 29149280, + "step": 1780, + "train_runtime": 14669.724, + "train_tokens_per_second": 1987.037 + }, + { + "epoch": 0.4933518005540166, + "grad_norm": 0.07939790934324265, + "learning_rate": 9.96115308078289e-05, + "loss": 0.012937264516949654, + "num_input_tokens_seen": 29165656, + "step": 1781, + "train_runtime": 14677.9554, + "train_tokens_per_second": 1987.038 + }, + { + "epoch": 0.4936288088642659, + "grad_norm": 0.1497674584388733, + "learning_rate": 9.961098380095835e-05, + "loss": 0.01693170703947544, + "num_input_tokens_seen": 29182032, + "step": 1782, + "train_runtime": 14686.1825, + "train_tokens_per_second": 1987.04 + }, + { + "epoch": 0.49390581717451526, + "grad_norm": 0.08277232944965363, + "learning_rate": 9.961043641074018e-05, + "loss": 0.014847399666905403, + "num_input_tokens_seen": 29198408, + "step": 1783, + "train_runtime": 14694.4133, + "train_tokens_per_second": 1987.041 + }, + { + "epoch": 0.49418282548476455, + "grad_norm": 0.08254247903823853, + "learning_rate": 9.960988863717857e-05, + "loss": 0.017192034050822258, + "num_input_tokens_seen": 29214784, + "step": 1784, + "train_runtime": 14702.634, + "train_tokens_per_second": 1987.044 + }, + { + "epoch": 0.49445983379501385, + "grad_norm": 0.06666335463523865, + "learning_rate": 9.960934048027782e-05, + "loss": 0.015688534826040268, + "num_input_tokens_seen": 29231160, + "step": 1785, + "train_runtime": 14710.8547, + "train_tokens_per_second": 1987.047 + }, + { + "epoch": 0.49473684210526314, + "grad_norm": 0.0778060331940651, + "learning_rate": 9.960879194004211e-05, + "loss": 0.012859832495450974, + "num_input_tokens_seen": 29247536, + "step": 1786, + "train_runtime": 14719.0728, + "train_tokens_per_second": 1987.05 + }, + { + "epoch": 0.4950138504155125, + "grad_norm": 0.06906924396753311, + "learning_rate": 9.960824301647569e-05, + "loss": 0.013277629390358925, + "num_input_tokens_seen": 29263912, + "step": 1787, + "train_runtime": 14727.3076, + "train_tokens_per_second": 1987.051 + }, + { + "epoch": 0.4952908587257618, + "grad_norm": 0.1375819444656372, + "learning_rate": 9.960769370958283e-05, + "loss": 0.0183585062623024, + "num_input_tokens_seen": 29280288, + "step": 1788, + "train_runtime": 14735.5557, + "train_tokens_per_second": 1987.05 + }, + { + "epoch": 0.4955678670360111, + "grad_norm": 0.10213110595941544, + "learning_rate": 9.960714401936774e-05, + "loss": 0.015876969322562218, + "num_input_tokens_seen": 29296664, + "step": 1789, + "train_runtime": 14743.7851, + "train_tokens_per_second": 1987.052 + }, + { + "epoch": 0.49584487534626037, + "grad_norm": 0.12749335169792175, + "learning_rate": 9.960659394583469e-05, + "loss": 0.0231742262840271, + "num_input_tokens_seen": 29313040, + "step": 1790, + "train_runtime": 14752.0125, + "train_tokens_per_second": 1987.054 + }, + { + "epoch": 0.4961218836565097, + "grad_norm": 0.09676851332187653, + "learning_rate": 9.960604348898793e-05, + "loss": 0.017738288268446922, + "num_input_tokens_seen": 29329416, + "step": 1791, + "train_runtime": 14760.2325, + "train_tokens_per_second": 1987.056 + }, + { + "epoch": 0.496398891966759, + "grad_norm": 0.13409994542598724, + "learning_rate": 9.96054926488317e-05, + "loss": 0.018626132979989052, + "num_input_tokens_seen": 29345792, + "step": 1792, + "train_runtime": 14768.4582, + "train_tokens_per_second": 1987.059 + }, + { + "epoch": 0.4966759002770083, + "grad_norm": 0.08550756424665451, + "learning_rate": 9.960494142537027e-05, + "loss": 0.017904290929436684, + "num_input_tokens_seen": 29362168, + "step": 1793, + "train_runtime": 14776.6748, + "train_tokens_per_second": 1987.062 + }, + { + "epoch": 0.4969529085872576, + "grad_norm": 0.06709368526935577, + "learning_rate": 9.960438981860788e-05, + "loss": 0.0150308134034276, + "num_input_tokens_seen": 29378544, + "step": 1794, + "train_runtime": 14784.8905, + "train_tokens_per_second": 1987.065 + }, + { + "epoch": 0.49722991689750695, + "grad_norm": 0.09945196658372879, + "learning_rate": 9.960383782854881e-05, + "loss": 0.015481297858059406, + "num_input_tokens_seen": 29394920, + "step": 1795, + "train_runtime": 14793.1078, + "train_tokens_per_second": 1987.069 + }, + { + "epoch": 0.49750692520775625, + "grad_norm": 0.08576922118663788, + "learning_rate": 9.960328545519732e-05, + "loss": 0.014917549677193165, + "num_input_tokens_seen": 29411296, + "step": 1796, + "train_runtime": 14801.3205, + "train_tokens_per_second": 1987.072 + }, + { + "epoch": 0.49778393351800554, + "grad_norm": 0.0837588757276535, + "learning_rate": 9.960273269855768e-05, + "loss": 0.016401495784521103, + "num_input_tokens_seen": 29427672, + "step": 1797, + "train_runtime": 14809.5307, + "train_tokens_per_second": 1987.077 + }, + { + "epoch": 0.49806094182825483, + "grad_norm": 0.10155192017555237, + "learning_rate": 9.960217955863416e-05, + "loss": 0.013884928077459335, + "num_input_tokens_seen": 29444048, + "step": 1798, + "train_runtime": 14817.7422, + "train_tokens_per_second": 1987.081 + }, + { + "epoch": 0.4983379501385042, + "grad_norm": 0.12648645043373108, + "learning_rate": 9.960162603543101e-05, + "loss": 0.020751213654875755, + "num_input_tokens_seen": 29460424, + "step": 1799, + "train_runtime": 14825.9554, + "train_tokens_per_second": 1987.084 + }, + { + "epoch": 0.4986149584487535, + "grad_norm": 0.06506439298391342, + "learning_rate": 9.960107212895256e-05, + "loss": 0.01663685403764248, + "num_input_tokens_seen": 29476800, + "step": 1800, + "train_runtime": 14834.177, + "train_tokens_per_second": 1987.087 + }, + { + "epoch": 0.49889196675900277, + "grad_norm": 0.09466106444597244, + "learning_rate": 9.960051783920306e-05, + "loss": 0.016057772561907768, + "num_input_tokens_seen": 29493176, + "step": 1801, + "train_runtime": 14843.9148, + "train_tokens_per_second": 1986.887 + }, + { + "epoch": 0.49916897506925206, + "grad_norm": 0.12161450833082199, + "learning_rate": 9.959996316618679e-05, + "loss": 0.020360911265015602, + "num_input_tokens_seen": 29509552, + "step": 1802, + "train_runtime": 14852.1449, + "train_tokens_per_second": 1986.888 + }, + { + "epoch": 0.4994459833795014, + "grad_norm": 0.08105006068944931, + "learning_rate": 9.959940810990802e-05, + "loss": 0.016271885484457016, + "num_input_tokens_seen": 29525928, + "step": 1803, + "train_runtime": 14860.37, + "train_tokens_per_second": 1986.89 + }, + { + "epoch": 0.4997229916897507, + "grad_norm": 0.11636918783187866, + "learning_rate": 9.959885267037107e-05, + "loss": 0.018377164378762245, + "num_input_tokens_seen": 29542304, + "step": 1804, + "train_runtime": 14868.597, + "train_tokens_per_second": 1986.893 + }, + { + "epoch": 0.5, + "grad_norm": 0.09582046419382095, + "learning_rate": 9.959829684758021e-05, + "loss": 0.017652567476034164, + "num_input_tokens_seen": 29558680, + "step": 1805, + "train_runtime": 14876.8051, + "train_tokens_per_second": 1986.897 + }, + { + "epoch": 0.5002770083102493, + "grad_norm": 0.0760175883769989, + "learning_rate": 9.959774064153977e-05, + "loss": 0.016941012814641, + "num_input_tokens_seen": 29575056, + "step": 1806, + "train_runtime": 14885.0202, + "train_tokens_per_second": 1986.901 + }, + { + "epoch": 0.5005540166204986, + "grad_norm": 0.08849949389696121, + "learning_rate": 9.959718405225402e-05, + "loss": 0.018718058243393898, + "num_input_tokens_seen": 29591432, + "step": 1807, + "train_runtime": 14893.2426, + "train_tokens_per_second": 1986.903 + }, + { + "epoch": 0.5008310249307479, + "grad_norm": 0.09530262649059296, + "learning_rate": 9.959662707972724e-05, + "loss": 0.01823011040687561, + "num_input_tokens_seen": 29607808, + "step": 1808, + "train_runtime": 14901.4558, + "train_tokens_per_second": 1986.907 + }, + { + "epoch": 0.5011080332409972, + "grad_norm": 0.05546022579073906, + "learning_rate": 9.959606972396379e-05, + "loss": 0.013518894091248512, + "num_input_tokens_seen": 29624184, + "step": 1809, + "train_runtime": 14909.669, + "train_tokens_per_second": 1986.911 + }, + { + "epoch": 0.5013850415512465, + "grad_norm": 0.09304475039243698, + "learning_rate": 9.959551198496791e-05, + "loss": 0.016529297456145287, + "num_input_tokens_seen": 29640560, + "step": 1810, + "train_runtime": 14917.8762, + "train_tokens_per_second": 1986.916 + }, + { + "epoch": 0.5016620498614959, + "grad_norm": 0.0842980369925499, + "learning_rate": 9.959495386274395e-05, + "loss": 0.019266534596681595, + "num_input_tokens_seen": 29656936, + "step": 1811, + "train_runtime": 14926.0993, + "train_tokens_per_second": 1986.918 + }, + { + "epoch": 0.5019390581717451, + "grad_norm": 0.07599925249814987, + "learning_rate": 9.959439535729624e-05, + "loss": 0.014191603288054466, + "num_input_tokens_seen": 29673312, + "step": 1812, + "train_runtime": 14934.3351, + "train_tokens_per_second": 1986.919 + }, + { + "epoch": 0.5022160664819945, + "grad_norm": 0.06277257949113846, + "learning_rate": 9.959383646862906e-05, + "loss": 0.010919982567429543, + "num_input_tokens_seen": 29689688, + "step": 1813, + "train_runtime": 14942.5706, + "train_tokens_per_second": 1986.92 + }, + { + "epoch": 0.5024930747922438, + "grad_norm": 0.11780548840761185, + "learning_rate": 9.959327719674674e-05, + "loss": 0.020419323816895485, + "num_input_tokens_seen": 29706064, + "step": 1814, + "train_runtime": 14950.7928, + "train_tokens_per_second": 1986.922 + }, + { + "epoch": 0.502770083102493, + "grad_norm": 0.0965733602643013, + "learning_rate": 9.959271754165361e-05, + "loss": 0.018125738948583603, + "num_input_tokens_seen": 29722440, + "step": 1815, + "train_runtime": 14959.0249, + "train_tokens_per_second": 1986.924 + }, + { + "epoch": 0.5030470914127424, + "grad_norm": 0.11318568140268326, + "learning_rate": 9.959215750335398e-05, + "loss": 0.018779734149575233, + "num_input_tokens_seen": 29738816, + "step": 1816, + "train_runtime": 14967.2403, + "train_tokens_per_second": 1986.927 + }, + { + "epoch": 0.5033240997229916, + "grad_norm": 0.14244121313095093, + "learning_rate": 9.959159708185217e-05, + "loss": 0.02238495647907257, + "num_input_tokens_seen": 29755192, + "step": 1817, + "train_runtime": 14975.4542, + "train_tokens_per_second": 1986.931 + }, + { + "epoch": 0.503601108033241, + "grad_norm": 0.09768626093864441, + "learning_rate": 9.959103627715254e-05, + "loss": 0.01720147207379341, + "num_input_tokens_seen": 29771568, + "step": 1818, + "train_runtime": 14983.6699, + "train_tokens_per_second": 1986.934 + }, + { + "epoch": 0.5038781163434903, + "grad_norm": 0.1340266317129135, + "learning_rate": 9.959047508925941e-05, + "loss": 0.015425576828420162, + "num_input_tokens_seen": 29787944, + "step": 1819, + "train_runtime": 14991.8907, + "train_tokens_per_second": 1986.937 + }, + { + "epoch": 0.5041551246537396, + "grad_norm": 0.10067319124937057, + "learning_rate": 9.958991351817712e-05, + "loss": 0.017486702650785446, + "num_input_tokens_seen": 29804320, + "step": 1820, + "train_runtime": 15000.1149, + "train_tokens_per_second": 1986.939 + }, + { + "epoch": 0.5044321329639889, + "grad_norm": 0.07285833358764648, + "learning_rate": 9.958935156390998e-05, + "loss": 0.018001563847064972, + "num_input_tokens_seen": 29820696, + "step": 1821, + "train_runtime": 15008.3376, + "train_tokens_per_second": 1986.942 + }, + { + "epoch": 0.5047091412742383, + "grad_norm": 0.091016486287117, + "learning_rate": 9.958878922646238e-05, + "loss": 0.02118741348385811, + "num_input_tokens_seen": 29837072, + "step": 1822, + "train_runtime": 15016.5545, + "train_tokens_per_second": 1986.945 + }, + { + "epoch": 0.5049861495844875, + "grad_norm": 0.08723022788763046, + "learning_rate": 9.958822650583863e-05, + "loss": 0.01705147512257099, + "num_input_tokens_seen": 29853448, + "step": 1823, + "train_runtime": 15024.7669, + "train_tokens_per_second": 1986.949 + }, + { + "epoch": 0.5052631578947369, + "grad_norm": 0.08798883110284805, + "learning_rate": 9.958766340204309e-05, + "loss": 0.015584133565425873, + "num_input_tokens_seen": 29869824, + "step": 1824, + "train_runtime": 15032.9878, + "train_tokens_per_second": 1986.952 + }, + { + "epoch": 0.5055401662049861, + "grad_norm": 0.10681243985891342, + "learning_rate": 9.958709991508012e-05, + "loss": 0.015908295288681984, + "num_input_tokens_seen": 29886200, + "step": 1825, + "train_runtime": 15041.2163, + "train_tokens_per_second": 1986.954 + }, + { + "epoch": 0.5058171745152354, + "grad_norm": 0.114041268825531, + "learning_rate": 9.958653604495406e-05, + "loss": 0.019659269601106644, + "num_input_tokens_seen": 29902576, + "step": 1826, + "train_runtime": 15049.4546, + "train_tokens_per_second": 1986.954 + }, + { + "epoch": 0.5060941828254848, + "grad_norm": 0.10439081490039825, + "learning_rate": 9.958597179166926e-05, + "loss": 0.018892208114266396, + "num_input_tokens_seen": 29918952, + "step": 1827, + "train_runtime": 15057.6727, + "train_tokens_per_second": 1986.957 + }, + { + "epoch": 0.506371191135734, + "grad_norm": 0.11201824247837067, + "learning_rate": 9.958540715523011e-05, + "loss": 0.01760842092335224, + "num_input_tokens_seen": 29935328, + "step": 1828, + "train_runtime": 15065.8799, + "train_tokens_per_second": 1986.962 + }, + { + "epoch": 0.5066481994459834, + "grad_norm": 0.1030876636505127, + "learning_rate": 9.958484213564094e-05, + "loss": 0.018580788746476173, + "num_input_tokens_seen": 29951704, + "step": 1829, + "train_runtime": 15074.0914, + "train_tokens_per_second": 1986.966 + }, + { + "epoch": 0.5069252077562327, + "grad_norm": 0.10656551271677017, + "learning_rate": 9.958427673290615e-05, + "loss": 0.018199795857071877, + "num_input_tokens_seen": 29968080, + "step": 1830, + "train_runtime": 15082.3129, + "train_tokens_per_second": 1986.968 + }, + { + "epoch": 0.507202216066482, + "grad_norm": 0.11054423451423645, + "learning_rate": 9.958371094703007e-05, + "loss": 0.016640555113554, + "num_input_tokens_seen": 29984456, + "step": 1831, + "train_runtime": 15090.5382, + "train_tokens_per_second": 1986.971 + }, + { + "epoch": 0.5074792243767313, + "grad_norm": 0.10233211517333984, + "learning_rate": 9.958314477801709e-05, + "loss": 0.01721612922847271, + "num_input_tokens_seen": 30000832, + "step": 1832, + "train_runtime": 15098.7552, + "train_tokens_per_second": 1986.974 + }, + { + "epoch": 0.5077562326869806, + "grad_norm": 0.1084696426987648, + "learning_rate": 9.95825782258716e-05, + "loss": 0.01910608448088169, + "num_input_tokens_seen": 30017208, + "step": 1833, + "train_runtime": 15106.9691, + "train_tokens_per_second": 1986.978 + }, + { + "epoch": 0.5080332409972299, + "grad_norm": 0.06818591803312302, + "learning_rate": 9.958201129059797e-05, + "loss": 0.0172406118363142, + "num_input_tokens_seen": 30033584, + "step": 1834, + "train_runtime": 15115.1855, + "train_tokens_per_second": 1986.981 + }, + { + "epoch": 0.5083102493074793, + "grad_norm": 0.11895527690649033, + "learning_rate": 9.958144397220055e-05, + "loss": 0.017787836492061615, + "num_input_tokens_seen": 30049960, + "step": 1835, + "train_runtime": 15123.4094, + "train_tokens_per_second": 1986.983 + }, + { + "epoch": 0.5085872576177285, + "grad_norm": 0.10028067976236343, + "learning_rate": 9.958087627068376e-05, + "loss": 0.020783687010407448, + "num_input_tokens_seen": 30066336, + "step": 1836, + "train_runtime": 15131.6256, + "train_tokens_per_second": 1986.987 + }, + { + "epoch": 0.5088642659279778, + "grad_norm": 0.08059761673212051, + "learning_rate": 9.958030818605199e-05, + "loss": 0.01572716049849987, + "num_input_tokens_seen": 30082712, + "step": 1837, + "train_runtime": 15139.8564, + "train_tokens_per_second": 1986.988 + }, + { + "epoch": 0.5091412742382272, + "grad_norm": 0.08541271090507507, + "learning_rate": 9.957973971830961e-05, + "loss": 0.016107702627778053, + "num_input_tokens_seen": 30099088, + "step": 1838, + "train_runtime": 15148.0742, + "train_tokens_per_second": 1986.991 + }, + { + "epoch": 0.5094182825484764, + "grad_norm": 0.11983771622180939, + "learning_rate": 9.9579170867461e-05, + "loss": 0.022969059646129608, + "num_input_tokens_seen": 30115464, + "step": 1839, + "train_runtime": 15156.288, + "train_tokens_per_second": 1986.995 + }, + { + "epoch": 0.5096952908587258, + "grad_norm": 0.1154041439294815, + "learning_rate": 9.957860163351058e-05, + "loss": 0.015975000336766243, + "num_input_tokens_seen": 30131840, + "step": 1840, + "train_runtime": 15164.4904, + "train_tokens_per_second": 1987.0 + }, + { + "epoch": 0.509972299168975, + "grad_norm": 0.09982512146234512, + "learning_rate": 9.957803201646274e-05, + "loss": 0.015821807086467743, + "num_input_tokens_seen": 30148216, + "step": 1841, + "train_runtime": 15172.6979, + "train_tokens_per_second": 1987.004 + }, + { + "epoch": 0.5102493074792244, + "grad_norm": 0.07706023007631302, + "learning_rate": 9.95774620163219e-05, + "loss": 0.014151174575090408, + "num_input_tokens_seen": 30164592, + "step": 1842, + "train_runtime": 15180.9034, + "train_tokens_per_second": 1987.009 + }, + { + "epoch": 0.5105263157894737, + "grad_norm": 0.05877670273184776, + "learning_rate": 9.957689163309243e-05, + "loss": 0.015434532426297665, + "num_input_tokens_seen": 30180968, + "step": 1843, + "train_runtime": 15189.1085, + "train_tokens_per_second": 1987.014 + }, + { + "epoch": 0.510803324099723, + "grad_norm": 0.11678402125835419, + "learning_rate": 9.957632086677875e-05, + "loss": 0.020991099998354912, + "num_input_tokens_seen": 30197344, + "step": 1844, + "train_runtime": 15197.3194, + "train_tokens_per_second": 1987.018 + }, + { + "epoch": 0.5110803324099723, + "grad_norm": 0.10820344090461731, + "learning_rate": 9.95757497173853e-05, + "loss": 0.014192878268659115, + "num_input_tokens_seen": 30213720, + "step": 1845, + "train_runtime": 15205.5478, + "train_tokens_per_second": 1987.019 + }, + { + "epoch": 0.5113573407202217, + "grad_norm": 0.10554707050323486, + "learning_rate": 9.957517818491644e-05, + "loss": 0.019916312769055367, + "num_input_tokens_seen": 30230096, + "step": 1846, + "train_runtime": 15213.7681, + "train_tokens_per_second": 1987.022 + }, + { + "epoch": 0.5116343490304709, + "grad_norm": 0.07566064596176147, + "learning_rate": 9.957460626937664e-05, + "loss": 0.01792021095752716, + "num_input_tokens_seen": 30246472, + "step": 1847, + "train_runtime": 15222.0001, + "train_tokens_per_second": 1987.024 + }, + { + "epoch": 0.5119113573407202, + "grad_norm": 0.10353709012269974, + "learning_rate": 9.957403397077028e-05, + "loss": 0.01734613999724388, + "num_input_tokens_seen": 30262848, + "step": 1848, + "train_runtime": 15230.2338, + "train_tokens_per_second": 1987.025 + }, + { + "epoch": 0.5121883656509695, + "grad_norm": 0.14828883111476898, + "learning_rate": 9.95734612891018e-05, + "loss": 0.017759909853339195, + "num_input_tokens_seen": 30279224, + "step": 1849, + "train_runtime": 15238.4681, + "train_tokens_per_second": 1987.025 + }, + { + "epoch": 0.5124653739612188, + "grad_norm": 0.10398603230714798, + "learning_rate": 9.957288822437563e-05, + "loss": 0.013792823068797588, + "num_input_tokens_seen": 30295600, + "step": 1850, + "train_runtime": 15246.6888, + "train_tokens_per_second": 1987.028 + }, + { + "epoch": 0.5127423822714682, + "grad_norm": 0.056473106145858765, + "learning_rate": 9.957231477659616e-05, + "loss": 0.01344631239771843, + "num_input_tokens_seen": 30311976, + "step": 1851, + "train_runtime": 15254.8999, + "train_tokens_per_second": 1987.032 + }, + { + "epoch": 0.5130193905817174, + "grad_norm": 0.0821760818362236, + "learning_rate": 9.957174094576787e-05, + "loss": 0.018579229712486267, + "num_input_tokens_seen": 30328352, + "step": 1852, + "train_runtime": 15263.1026, + "train_tokens_per_second": 1987.037 + }, + { + "epoch": 0.5132963988919668, + "grad_norm": 0.10964100062847137, + "learning_rate": 9.957116673189518e-05, + "loss": 0.019274186342954636, + "num_input_tokens_seen": 30344728, + "step": 1853, + "train_runtime": 15271.3224, + "train_tokens_per_second": 1987.04 + }, + { + "epoch": 0.5135734072022161, + "grad_norm": 0.09965291619300842, + "learning_rate": 9.957059213498252e-05, + "loss": 0.014722111634910107, + "num_input_tokens_seen": 30361104, + "step": 1854, + "train_runtime": 15279.5448, + "train_tokens_per_second": 1987.042 + }, + { + "epoch": 0.5138504155124654, + "grad_norm": 0.12586314976215363, + "learning_rate": 9.957001715503433e-05, + "loss": 0.018137628212571144, + "num_input_tokens_seen": 30377480, + "step": 1855, + "train_runtime": 15287.7561, + "train_tokens_per_second": 1987.046 + }, + { + "epoch": 0.5141274238227147, + "grad_norm": 0.09315140545368195, + "learning_rate": 9.956944179205506e-05, + "loss": 0.018545854836702347, + "num_input_tokens_seen": 30393856, + "step": 1856, + "train_runtime": 15295.9612, + "train_tokens_per_second": 1987.051 + }, + { + "epoch": 0.5144044321329639, + "grad_norm": 0.10880604386329651, + "learning_rate": 9.956886604604913e-05, + "loss": 0.017065810039639473, + "num_input_tokens_seen": 30410232, + "step": 1857, + "train_runtime": 15304.1725, + "train_tokens_per_second": 1987.055 + }, + { + "epoch": 0.5146814404432133, + "grad_norm": 0.09081901609897614, + "learning_rate": 9.956828991702103e-05, + "loss": 0.020634371787309647, + "num_input_tokens_seen": 30426608, + "step": 1858, + "train_runtime": 15312.381, + "train_tokens_per_second": 1987.059 + }, + { + "epoch": 0.5149584487534626, + "grad_norm": 0.08800709247589111, + "learning_rate": 9.956771340497518e-05, + "loss": 0.017408641055226326, + "num_input_tokens_seen": 30442984, + "step": 1859, + "train_runtime": 15320.5905, + "train_tokens_per_second": 1987.063 + }, + { + "epoch": 0.5152354570637119, + "grad_norm": 0.09614653140306473, + "learning_rate": 9.956713650991605e-05, + "loss": 0.016811151057481766, + "num_input_tokens_seen": 30459360, + "step": 1860, + "train_runtime": 15328.8047, + "train_tokens_per_second": 1987.067 + }, + { + "epoch": 0.5155124653739612, + "grad_norm": 0.08718228340148926, + "learning_rate": 9.95665592318481e-05, + "loss": 0.01976643316447735, + "num_input_tokens_seen": 30475736, + "step": 1861, + "train_runtime": 15337.0273, + "train_tokens_per_second": 1987.069 + }, + { + "epoch": 0.5157894736842106, + "grad_norm": 0.08785660564899445, + "learning_rate": 9.956598157077576e-05, + "loss": 0.014245755970478058, + "num_input_tokens_seen": 30492112, + "step": 1862, + "train_runtime": 15345.2459, + "train_tokens_per_second": 1987.072 + }, + { + "epoch": 0.5160664819944598, + "grad_norm": 0.11161988228559494, + "learning_rate": 9.956540352670353e-05, + "loss": 0.020042838528752327, + "num_input_tokens_seen": 30508488, + "step": 1863, + "train_runtime": 15353.4749, + "train_tokens_per_second": 1987.074 + }, + { + "epoch": 0.5163434903047092, + "grad_norm": 0.07168211787939072, + "learning_rate": 9.956482509963587e-05, + "loss": 0.017212310805916786, + "num_input_tokens_seen": 30524864, + "step": 1864, + "train_runtime": 15361.7024, + "train_tokens_per_second": 1987.076 + }, + { + "epoch": 0.5166204986149584, + "grad_norm": 0.07717479020357132, + "learning_rate": 9.956424628957726e-05, + "loss": 0.016084248200058937, + "num_input_tokens_seen": 30541240, + "step": 1865, + "train_runtime": 15369.9256, + "train_tokens_per_second": 1987.078 + }, + { + "epoch": 0.5168975069252078, + "grad_norm": 0.07883825898170471, + "learning_rate": 9.956366709653213e-05, + "loss": 0.01539570651948452, + "num_input_tokens_seen": 30557616, + "step": 1866, + "train_runtime": 15378.1554, + "train_tokens_per_second": 1987.079 + }, + { + "epoch": 0.5171745152354571, + "grad_norm": 0.07965642958879471, + "learning_rate": 9.9563087520505e-05, + "loss": 0.015527622774243355, + "num_input_tokens_seen": 30573992, + "step": 1867, + "train_runtime": 15386.3817, + "train_tokens_per_second": 1987.081 + }, + { + "epoch": 0.5174515235457063, + "grad_norm": 0.09325995296239853, + "learning_rate": 9.956250756150032e-05, + "loss": 0.020089277997612953, + "num_input_tokens_seen": 30590368, + "step": 1868, + "train_runtime": 15394.6042, + "train_tokens_per_second": 1987.084 + }, + { + "epoch": 0.5177285318559557, + "grad_norm": 0.10145833343267441, + "learning_rate": 9.956192721952257e-05, + "loss": 0.01750960759818554, + "num_input_tokens_seen": 30606744, + "step": 1869, + "train_runtime": 15402.8083, + "train_tokens_per_second": 1987.089 + }, + { + "epoch": 0.518005540166205, + "grad_norm": 0.14807726442813873, + "learning_rate": 9.956134649457627e-05, + "loss": 0.01901850663125515, + "num_input_tokens_seen": 30623120, + "step": 1870, + "train_runtime": 15411.0156, + "train_tokens_per_second": 1987.093 + }, + { + "epoch": 0.5182825484764543, + "grad_norm": 0.06770645081996918, + "learning_rate": 9.956076538666586e-05, + "loss": 0.018978692591190338, + "num_input_tokens_seen": 30639496, + "step": 1871, + "train_runtime": 15419.2239, + "train_tokens_per_second": 1987.097 + }, + { + "epoch": 0.5185595567867036, + "grad_norm": 0.10699884593486786, + "learning_rate": 9.956018389579586e-05, + "loss": 0.016031131148338318, + "num_input_tokens_seen": 30655872, + "step": 1872, + "train_runtime": 15427.4267, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.5188365650969529, + "grad_norm": 0.08765536546707153, + "learning_rate": 9.955960202197077e-05, + "loss": 0.01559345331043005, + "num_input_tokens_seen": 30672248, + "step": 1873, + "train_runtime": 15435.6382, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.5191135734072022, + "grad_norm": 0.09207893162965775, + "learning_rate": 9.955901976519506e-05, + "loss": 0.017752349376678467, + "num_input_tokens_seen": 30688624, + "step": 1874, + "train_runtime": 15443.8449, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.5193905817174516, + "grad_norm": 0.098100945353508, + "learning_rate": 9.955843712547325e-05, + "loss": 0.018603889271616936, + "num_input_tokens_seen": 30705000, + "step": 1875, + "train_runtime": 15452.0549, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.5196675900277008, + "grad_norm": 0.09118365496397018, + "learning_rate": 9.955785410280984e-05, + "loss": 0.01444798894226551, + "num_input_tokens_seen": 30721376, + "step": 1876, + "train_runtime": 15460.2637, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.5199445983379501, + "grad_norm": 0.10821200907230377, + "learning_rate": 9.955727069720931e-05, + "loss": 0.01787273772060871, + "num_input_tokens_seen": 30737752, + "step": 1877, + "train_runtime": 15468.4704, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.5202216066481995, + "grad_norm": 0.09201613068580627, + "learning_rate": 9.95566869086762e-05, + "loss": 0.016818340867757797, + "num_input_tokens_seen": 30754128, + "step": 1878, + "train_runtime": 15476.6799, + "train_tokens_per_second": 1987.127 + }, + { + "epoch": 0.5204986149584487, + "grad_norm": 0.08667076379060745, + "learning_rate": 9.955610273721501e-05, + "loss": 0.01673009805381298, + "num_input_tokens_seen": 30770504, + "step": 1879, + "train_runtime": 15484.9064, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.5207756232686981, + "grad_norm": 0.07633943855762482, + "learning_rate": 9.955551818283024e-05, + "loss": 0.015070037916302681, + "num_input_tokens_seen": 30786880, + "step": 1880, + "train_runtime": 15493.135, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.5210526315789473, + "grad_norm": 0.07608691602945328, + "learning_rate": 9.955493324552643e-05, + "loss": 0.017158035188913345, + "num_input_tokens_seen": 30803256, + "step": 1881, + "train_runtime": 15501.3548, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.5213296398891967, + "grad_norm": 0.09855539351701736, + "learning_rate": 9.955434792530809e-05, + "loss": 0.019420191645622253, + "num_input_tokens_seen": 30819632, + "step": 1882, + "train_runtime": 15509.561, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.521606648199446, + "grad_norm": 0.10388928651809692, + "learning_rate": 9.955376222217974e-05, + "loss": 0.01658356562256813, + "num_input_tokens_seen": 30836008, + "step": 1883, + "train_runtime": 15517.7723, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.5218836565096953, + "grad_norm": 0.15706059336662292, + "learning_rate": 9.95531761361459e-05, + "loss": 0.018702322617173195, + "num_input_tokens_seen": 30852384, + "step": 1884, + "train_runtime": 15525.9772, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.5221606648199446, + "grad_norm": 0.11971984058618546, + "learning_rate": 9.955258966721111e-05, + "loss": 0.017069589346647263, + "num_input_tokens_seen": 30868760, + "step": 1885, + "train_runtime": 15534.1852, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.522437673130194, + "grad_norm": 0.07281798124313354, + "learning_rate": 9.955200281537991e-05, + "loss": 0.016502631828188896, + "num_input_tokens_seen": 30885136, + "step": 1886, + "train_runtime": 15542.3903, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.5227146814404432, + "grad_norm": 0.18217435479164124, + "learning_rate": 9.95514155806568e-05, + "loss": 0.014246392995119095, + "num_input_tokens_seen": 30901512, + "step": 1887, + "train_runtime": 15550.5987, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 0.5229916897506925, + "grad_norm": 0.09528523683547974, + "learning_rate": 9.955082796304636e-05, + "loss": 0.018458345904946327, + "num_input_tokens_seen": 30917888, + "step": 1888, + "train_runtime": 15558.8102, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.5232686980609418, + "grad_norm": 0.06672415137290955, + "learning_rate": 9.95502399625531e-05, + "loss": 0.018257951363921165, + "num_input_tokens_seen": 30934264, + "step": 1889, + "train_runtime": 15567.0178, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.5235457063711911, + "grad_norm": 0.05562487617135048, + "learning_rate": 9.954965157918157e-05, + "loss": 0.012693054042756557, + "num_input_tokens_seen": 30950640, + "step": 1890, + "train_runtime": 15575.2263, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 0.5238227146814405, + "grad_norm": 0.08407619595527649, + "learning_rate": 9.954906281293634e-05, + "loss": 0.016377776861190796, + "num_input_tokens_seen": 30967016, + "step": 1891, + "train_runtime": 15583.4543, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.5240997229916897, + "grad_norm": 0.11114783585071564, + "learning_rate": 9.954847366382191e-05, + "loss": 0.017977280542254448, + "num_input_tokens_seen": 30983392, + "step": 1892, + "train_runtime": 15591.6873, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.5243767313019391, + "grad_norm": 0.07196640223264694, + "learning_rate": 9.954788413184288e-05, + "loss": 0.015746409073472023, + "num_input_tokens_seen": 30999768, + "step": 1893, + "train_runtime": 15599.933, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.5246537396121884, + "grad_norm": 0.09304969012737274, + "learning_rate": 9.954729421700379e-05, + "loss": 0.015471158549189568, + "num_input_tokens_seen": 31016144, + "step": 1894, + "train_runtime": 15608.1545, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 0.5249307479224377, + "grad_norm": 0.13021861016750336, + "learning_rate": 9.95467039193092e-05, + "loss": 0.01943863555788994, + "num_input_tokens_seen": 31032520, + "step": 1895, + "train_runtime": 15616.3842, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 0.525207756232687, + "grad_norm": 0.14548449218273163, + "learning_rate": 9.954611323876366e-05, + "loss": 0.017590049654245377, + "num_input_tokens_seen": 31048896, + "step": 1896, + "train_runtime": 15624.6125, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.5254847645429362, + "grad_norm": 0.0881691724061966, + "learning_rate": 9.954552217537175e-05, + "loss": 0.01532942894846201, + "num_input_tokens_seen": 31065272, + "step": 1897, + "train_runtime": 15632.8443, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 0.5257617728531856, + "grad_norm": 0.11104818433523178, + "learning_rate": 9.954493072913801e-05, + "loss": 0.017118090763688087, + "num_input_tokens_seen": 31081648, + "step": 1898, + "train_runtime": 15641.075, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 0.5260387811634349, + "grad_norm": 0.08458904922008514, + "learning_rate": 9.954433890006705e-05, + "loss": 0.021176105365157127, + "num_input_tokens_seen": 31098024, + "step": 1899, + "train_runtime": 15649.286, + "train_tokens_per_second": 1987.185 + }, + { + "epoch": 0.5263157894736842, + "grad_norm": 0.08257447183132172, + "learning_rate": 9.954374668816341e-05, + "loss": 0.012254714034497738, + "num_input_tokens_seen": 31114400, + "step": 1900, + "train_runtime": 15657.4991, + "train_tokens_per_second": 1987.188 + }, + { + "epoch": 0.5265927977839335, + "grad_norm": 0.0941753089427948, + "learning_rate": 9.954315409343169e-05, + "loss": 0.014180666767060757, + "num_input_tokens_seen": 31130776, + "step": 1901, + "train_runtime": 15667.2378, + "train_tokens_per_second": 1986.998 + }, + { + "epoch": 0.5268698060941829, + "grad_norm": 0.11605091392993927, + "learning_rate": 9.954256111587645e-05, + "loss": 0.01732417196035385, + "num_input_tokens_seen": 31147152, + "step": 1902, + "train_runtime": 15675.4544, + "train_tokens_per_second": 1987.002 + }, + { + "epoch": 0.5271468144044321, + "grad_norm": 0.09371225535869598, + "learning_rate": 9.954196775550227e-05, + "loss": 0.018582696095108986, + "num_input_tokens_seen": 31163528, + "step": 1903, + "train_runtime": 15683.6758, + "train_tokens_per_second": 1987.004 + }, + { + "epoch": 0.5274238227146815, + "grad_norm": 0.08773181587457657, + "learning_rate": 9.954137401231376e-05, + "loss": 0.013868662528693676, + "num_input_tokens_seen": 31179904, + "step": 1904, + "train_runtime": 15691.9091, + "train_tokens_per_second": 1987.005 + }, + { + "epoch": 0.5277008310249307, + "grad_norm": 0.12424621731042862, + "learning_rate": 9.954077988631548e-05, + "loss": 0.019283972680568695, + "num_input_tokens_seen": 31196280, + "step": 1905, + "train_runtime": 15700.1453, + "train_tokens_per_second": 1987.006 + }, + { + "epoch": 0.52797783933518, + "grad_norm": 0.08476439863443375, + "learning_rate": 9.954018537751205e-05, + "loss": 0.013348141685128212, + "num_input_tokens_seen": 31212656, + "step": 1906, + "train_runtime": 15708.3637, + "train_tokens_per_second": 1987.009 + }, + { + "epoch": 0.5282548476454294, + "grad_norm": 0.08617305755615234, + "learning_rate": 9.953959048590802e-05, + "loss": 0.0179786067456007, + "num_input_tokens_seen": 31229032, + "step": 1907, + "train_runtime": 15716.5716, + "train_tokens_per_second": 1987.013 + }, + { + "epoch": 0.5285318559556786, + "grad_norm": 0.07938087731599808, + "learning_rate": 9.953899521150803e-05, + "loss": 0.013872330076992512, + "num_input_tokens_seen": 31245408, + "step": 1908, + "train_runtime": 15724.7882, + "train_tokens_per_second": 1987.016 + }, + { + "epoch": 0.528808864265928, + "grad_norm": 0.09483970701694489, + "learning_rate": 9.953839955431667e-05, + "loss": 0.017341913655400276, + "num_input_tokens_seen": 31261784, + "step": 1909, + "train_runtime": 15733.0199, + "train_tokens_per_second": 1987.017 + }, + { + "epoch": 0.5290858725761773, + "grad_norm": 0.08118247985839844, + "learning_rate": 9.953780351433853e-05, + "loss": 0.017518574371933937, + "num_input_tokens_seen": 31278160, + "step": 1910, + "train_runtime": 15741.2553, + "train_tokens_per_second": 1987.018 + }, + { + "epoch": 0.5293628808864266, + "grad_norm": 0.070774607360363, + "learning_rate": 9.953720709157821e-05, + "loss": 0.01621435582637787, + "num_input_tokens_seen": 31294536, + "step": 1911, + "train_runtime": 15749.4753, + "train_tokens_per_second": 1987.021 + }, + { + "epoch": 0.5296398891966759, + "grad_norm": 0.13372112810611725, + "learning_rate": 9.953661028604035e-05, + "loss": 0.01922645792365074, + "num_input_tokens_seen": 31310912, + "step": 1912, + "train_runtime": 15757.7075, + "train_tokens_per_second": 1987.022 + }, + { + "epoch": 0.5299168975069252, + "grad_norm": 0.08702892065048218, + "learning_rate": 9.953601309772953e-05, + "loss": 0.014563480392098427, + "num_input_tokens_seen": 31327288, + "step": 1913, + "train_runtime": 15765.955, + "train_tokens_per_second": 1987.021 + }, + { + "epoch": 0.5301939058171745, + "grad_norm": 0.10036574304103851, + "learning_rate": 9.953541552665038e-05, + "loss": 0.016956299543380737, + "num_input_tokens_seen": 31343664, + "step": 1914, + "train_runtime": 15774.1936, + "train_tokens_per_second": 1987.022 + }, + { + "epoch": 0.5304709141274239, + "grad_norm": 0.10305914282798767, + "learning_rate": 9.953481757280751e-05, + "loss": 0.019386250525712967, + "num_input_tokens_seen": 31360040, + "step": 1915, + "train_runtime": 15782.4347, + "train_tokens_per_second": 1987.022 + }, + { + "epoch": 0.5307479224376731, + "grad_norm": 0.05832630768418312, + "learning_rate": 9.953421923620554e-05, + "loss": 0.019148603081703186, + "num_input_tokens_seen": 31376416, + "step": 1916, + "train_runtime": 15790.6547, + "train_tokens_per_second": 1987.024 + }, + { + "epoch": 0.5310249307479225, + "grad_norm": 0.08836077898740768, + "learning_rate": 9.95336205168491e-05, + "loss": 0.019315166398882866, + "num_input_tokens_seen": 31392792, + "step": 1917, + "train_runtime": 15798.8882, + "train_tokens_per_second": 1987.025 + }, + { + "epoch": 0.5313019390581717, + "grad_norm": 0.049589402973651886, + "learning_rate": 9.95330214147428e-05, + "loss": 0.014440997503697872, + "num_input_tokens_seen": 31409168, + "step": 1918, + "train_runtime": 15807.1099, + "train_tokens_per_second": 1987.028 + }, + { + "epoch": 0.531578947368421, + "grad_norm": 0.08768460899591446, + "learning_rate": 9.953242192989132e-05, + "loss": 0.015496996231377125, + "num_input_tokens_seen": 31425544, + "step": 1919, + "train_runtime": 15815.3263, + "train_tokens_per_second": 1987.031 + }, + { + "epoch": 0.5318559556786704, + "grad_norm": 0.09196515381336212, + "learning_rate": 9.953182206229923e-05, + "loss": 0.015704816207289696, + "num_input_tokens_seen": 31441920, + "step": 1920, + "train_runtime": 15823.5603, + "train_tokens_per_second": 1987.032 + }, + { + "epoch": 0.5321329639889196, + "grad_norm": 0.08235734701156616, + "learning_rate": 9.95312218119712e-05, + "loss": 0.015786822885274887, + "num_input_tokens_seen": 31458296, + "step": 1921, + "train_runtime": 15831.7926, + "train_tokens_per_second": 1987.033 + }, + { + "epoch": 0.532409972299169, + "grad_norm": 0.1065826490521431, + "learning_rate": 9.953062117891185e-05, + "loss": 0.017856676131486893, + "num_input_tokens_seen": 31474672, + "step": 1922, + "train_runtime": 15840.0285, + "train_tokens_per_second": 1987.034 + }, + { + "epoch": 0.5326869806094183, + "grad_norm": 0.08060172945261002, + "learning_rate": 9.953002016312584e-05, + "loss": 0.011551488190889359, + "num_input_tokens_seen": 31491048, + "step": 1923, + "train_runtime": 15848.2422, + "train_tokens_per_second": 1987.037 + }, + { + "epoch": 0.5329639889196676, + "grad_norm": 0.08433254063129425, + "learning_rate": 9.952941876461779e-05, + "loss": 0.0148622440174222, + "num_input_tokens_seen": 31507424, + "step": 1924, + "train_runtime": 15856.4742, + "train_tokens_per_second": 1987.038 + }, + { + "epoch": 0.5332409972299169, + "grad_norm": 0.11975816637277603, + "learning_rate": 9.952881698339238e-05, + "loss": 0.01773867942392826, + "num_input_tokens_seen": 31523800, + "step": 1925, + "train_runtime": 15864.6977, + "train_tokens_per_second": 1987.041 + }, + { + "epoch": 0.5335180055401662, + "grad_norm": 0.07914690673351288, + "learning_rate": 9.952821481945423e-05, + "loss": 0.016352171078324318, + "num_input_tokens_seen": 31540176, + "step": 1926, + "train_runtime": 15872.9312, + "train_tokens_per_second": 1987.042 + }, + { + "epoch": 0.5337950138504155, + "grad_norm": 0.07463757693767548, + "learning_rate": 9.9527612272808e-05, + "loss": 0.015346016734838486, + "num_input_tokens_seen": 31556552, + "step": 1927, + "train_runtime": 15881.1633, + "train_tokens_per_second": 1987.043 + }, + { + "epoch": 0.5340720221606648, + "grad_norm": 0.0916297510266304, + "learning_rate": 9.952700934345835e-05, + "loss": 0.015123402699828148, + "num_input_tokens_seen": 31572928, + "step": 1928, + "train_runtime": 15889.3839, + "train_tokens_per_second": 1987.045 + }, + { + "epoch": 0.5343490304709141, + "grad_norm": 0.09082508832216263, + "learning_rate": 9.952640603140995e-05, + "loss": 0.017560554668307304, + "num_input_tokens_seen": 31589304, + "step": 1929, + "train_runtime": 15897.5999, + "train_tokens_per_second": 1987.049 + }, + { + "epoch": 0.5346260387811634, + "grad_norm": 0.08992074429988861, + "learning_rate": 9.952580233666743e-05, + "loss": 0.01747054234147072, + "num_input_tokens_seen": 31605680, + "step": 1930, + "train_runtime": 15905.8222, + "train_tokens_per_second": 1987.051 + }, + { + "epoch": 0.5349030470914128, + "grad_norm": 0.15161144733428955, + "learning_rate": 9.95251982592355e-05, + "loss": 0.016364848241209984, + "num_input_tokens_seen": 31622056, + "step": 1931, + "train_runtime": 15914.0586, + "train_tokens_per_second": 1987.052 + }, + { + "epoch": 0.535180055401662, + "grad_norm": 0.09369586408138275, + "learning_rate": 9.952459379911881e-05, + "loss": 0.015226107090711594, + "num_input_tokens_seen": 31638432, + "step": 1932, + "train_runtime": 15922.2802, + "train_tokens_per_second": 1987.054 + }, + { + "epoch": 0.5354570637119114, + "grad_norm": 0.07605031877756119, + "learning_rate": 9.9523988956322e-05, + "loss": 0.014249451458454132, + "num_input_tokens_seen": 31654808, + "step": 1933, + "train_runtime": 15930.4902, + "train_tokens_per_second": 1987.058 + }, + { + "epoch": 0.5357340720221606, + "grad_norm": 0.07476571202278137, + "learning_rate": 9.952338373084978e-05, + "loss": 0.017641378566622734, + "num_input_tokens_seen": 31671184, + "step": 1934, + "train_runtime": 15938.7177, + "train_tokens_per_second": 1987.06 + }, + { + "epoch": 0.53601108033241, + "grad_norm": 0.07152118533849716, + "learning_rate": 9.952277812270681e-05, + "loss": 0.01476480346173048, + "num_input_tokens_seen": 31687560, + "step": 1935, + "train_runtime": 15946.9388, + "train_tokens_per_second": 1987.062 + }, + { + "epoch": 0.5362880886426593, + "grad_norm": 0.08583308756351471, + "learning_rate": 9.952217213189777e-05, + "loss": 0.014694956131279469, + "num_input_tokens_seen": 31703936, + "step": 1936, + "train_runtime": 15955.1963, + "train_tokens_per_second": 1987.06 + }, + { + "epoch": 0.5365650969529085, + "grad_norm": 0.10640376061201096, + "learning_rate": 9.952156575842736e-05, + "loss": 0.019102102145552635, + "num_input_tokens_seen": 31720312, + "step": 1937, + "train_runtime": 15963.4248, + "train_tokens_per_second": 1987.062 + }, + { + "epoch": 0.5368421052631579, + "grad_norm": 0.09245344251394272, + "learning_rate": 9.952095900230023e-05, + "loss": 0.014089500531554222, + "num_input_tokens_seen": 31736688, + "step": 1938, + "train_runtime": 15971.656, + "train_tokens_per_second": 1987.063 + }, + { + "epoch": 0.5371191135734072, + "grad_norm": 0.07199344784021378, + "learning_rate": 9.95203518635211e-05, + "loss": 0.017307160422205925, + "num_input_tokens_seen": 31753064, + "step": 1939, + "train_runtime": 15979.8802, + "train_tokens_per_second": 1987.065 + }, + { + "epoch": 0.5373961218836565, + "grad_norm": 0.07118202745914459, + "learning_rate": 9.951974434209465e-05, + "loss": 0.014571859501302242, + "num_input_tokens_seen": 31769440, + "step": 1940, + "train_runtime": 15988.1041, + "train_tokens_per_second": 1987.067 + }, + { + "epoch": 0.5376731301939058, + "grad_norm": 0.05573735386133194, + "learning_rate": 9.951913643802558e-05, + "loss": 0.014909368008375168, + "num_input_tokens_seen": 31785816, + "step": 1941, + "train_runtime": 15996.3282, + "train_tokens_per_second": 1987.07 + }, + { + "epoch": 0.5379501385041551, + "grad_norm": 0.08974700421094894, + "learning_rate": 9.95185281513186e-05, + "loss": 0.01534330565482378, + "num_input_tokens_seen": 31802192, + "step": 1942, + "train_runtime": 16004.5453, + "train_tokens_per_second": 1987.073 + }, + { + "epoch": 0.5382271468144044, + "grad_norm": 0.0858231708407402, + "learning_rate": 9.951791948197837e-05, + "loss": 0.014909790828824043, + "num_input_tokens_seen": 31818568, + "step": 1943, + "train_runtime": 16012.7704, + "train_tokens_per_second": 1987.075 + }, + { + "epoch": 0.5385041551246538, + "grad_norm": 0.15102383494377136, + "learning_rate": 9.951731043000962e-05, + "loss": 0.01853618025779724, + "num_input_tokens_seen": 31834944, + "step": 1944, + "train_runtime": 16020.9994, + "train_tokens_per_second": 1987.076 + }, + { + "epoch": 0.538781163434903, + "grad_norm": 0.12447430938482285, + "learning_rate": 9.951670099541706e-05, + "loss": 0.018763680011034012, + "num_input_tokens_seen": 31851320, + "step": 1945, + "train_runtime": 16029.2128, + "train_tokens_per_second": 1987.079 + }, + { + "epoch": 0.5390581717451524, + "grad_norm": 0.09519626945257187, + "learning_rate": 9.951609117820538e-05, + "loss": 0.01639707386493683, + "num_input_tokens_seen": 31867696, + "step": 1946, + "train_runtime": 16037.4372, + "train_tokens_per_second": 1987.082 + }, + { + "epoch": 0.5393351800554017, + "grad_norm": 0.09113302826881409, + "learning_rate": 9.951548097837932e-05, + "loss": 0.017787765711545944, + "num_input_tokens_seen": 31884072, + "step": 1947, + "train_runtime": 16045.6595, + "train_tokens_per_second": 1987.084 + }, + { + "epoch": 0.539612188365651, + "grad_norm": 0.11211571097373962, + "learning_rate": 9.951487039594358e-05, + "loss": 0.018969014286994934, + "num_input_tokens_seen": 31900448, + "step": 1948, + "train_runtime": 16053.8882, + "train_tokens_per_second": 1987.085 + }, + { + "epoch": 0.5398891966759003, + "grad_norm": 0.059441689401865005, + "learning_rate": 9.951425943090286e-05, + "loss": 0.015746448189020157, + "num_input_tokens_seen": 31916824, + "step": 1949, + "train_runtime": 16062.112, + "train_tokens_per_second": 1987.088 + }, + { + "epoch": 0.5401662049861495, + "grad_norm": 0.09777325391769409, + "learning_rate": 9.951364808326191e-05, + "loss": 0.018530311062932014, + "num_input_tokens_seen": 31933200, + "step": 1950, + "train_runtime": 16070.3279, + "train_tokens_per_second": 1987.091 + }, + { + "epoch": 0.5404432132963989, + "grad_norm": 0.07741095870733261, + "learning_rate": 9.951303635302544e-05, + "loss": 0.016749899834394455, + "num_input_tokens_seen": 31949576, + "step": 1951, + "train_runtime": 16078.534, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.5407202216066482, + "grad_norm": 0.10444276034832001, + "learning_rate": 9.951242424019818e-05, + "loss": 0.018938900902867317, + "num_input_tokens_seen": 31965952, + "step": 1952, + "train_runtime": 16086.7458, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.5409972299168975, + "grad_norm": 0.09711914509534836, + "learning_rate": 9.951181174478485e-05, + "loss": 0.016858728602528572, + "num_input_tokens_seen": 31982328, + "step": 1953, + "train_runtime": 16094.9547, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.5412742382271468, + "grad_norm": 0.10052978247404099, + "learning_rate": 9.951119886679021e-05, + "loss": 0.016959430649876595, + "num_input_tokens_seen": 31998704, + "step": 1954, + "train_runtime": 16103.165, + "train_tokens_per_second": 1987.107 + }, + { + "epoch": 0.5415512465373962, + "grad_norm": 0.0966242253780365, + "learning_rate": 9.951058560621898e-05, + "loss": 0.01693958230316639, + "num_input_tokens_seen": 32015080, + "step": 1955, + "train_runtime": 16111.3918, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.5418282548476454, + "grad_norm": 0.09074979275465012, + "learning_rate": 9.950997196307587e-05, + "loss": 0.017931152135133743, + "num_input_tokens_seen": 32031456, + "step": 1956, + "train_runtime": 16119.6073, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.5421052631578948, + "grad_norm": 0.07949608564376831, + "learning_rate": 9.950935793736567e-05, + "loss": 0.019316544756293297, + "num_input_tokens_seen": 32047832, + "step": 1957, + "train_runtime": 16127.8212, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.542382271468144, + "grad_norm": 0.07294397056102753, + "learning_rate": 9.95087435290931e-05, + "loss": 0.01581498607993126, + "num_input_tokens_seen": 32064208, + "step": 1958, + "train_runtime": 16136.0266, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.5426592797783933, + "grad_norm": 0.0811343789100647, + "learning_rate": 9.95081287382629e-05, + "loss": 0.015639251098036766, + "num_input_tokens_seen": 32080584, + "step": 1959, + "train_runtime": 16144.2363, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.5429362880886427, + "grad_norm": 0.10959455370903015, + "learning_rate": 9.950751356487984e-05, + "loss": 0.018461771309375763, + "num_input_tokens_seen": 32096960, + "step": 1960, + "train_runtime": 16152.4401, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.5432132963988919, + "grad_norm": 0.04882406070828438, + "learning_rate": 9.950689800894866e-05, + "loss": 0.013356690295040607, + "num_input_tokens_seen": 32113336, + "step": 1961, + "train_runtime": 16160.6664, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.5434903047091413, + "grad_norm": 0.08415212482213974, + "learning_rate": 9.950628207047412e-05, + "loss": 0.016947150230407715, + "num_input_tokens_seen": 32129712, + "step": 1962, + "train_runtime": 16168.8793, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.5437673130193906, + "grad_norm": 0.07573730498552322, + "learning_rate": 9.950566574946099e-05, + "loss": 0.018183935433626175, + "num_input_tokens_seen": 32146088, + "step": 1963, + "train_runtime": 16177.0876, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.5440443213296399, + "grad_norm": 0.08026133477687836, + "learning_rate": 9.9505049045914e-05, + "loss": 0.01720396988093853, + "num_input_tokens_seen": 32162464, + "step": 1964, + "train_runtime": 16185.2926, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.5443213296398892, + "grad_norm": 0.07292509824037552, + "learning_rate": 9.950443195983796e-05, + "loss": 0.01566523313522339, + "num_input_tokens_seen": 32178840, + "step": 1965, + "train_runtime": 16193.5123, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.5445983379501385, + "grad_norm": 0.11802003532648087, + "learning_rate": 9.95038144912376e-05, + "loss": 0.017303738743066788, + "num_input_tokens_seen": 32195216, + "step": 1966, + "train_runtime": 16201.7563, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.5448753462603878, + "grad_norm": 0.08714399486780167, + "learning_rate": 9.950319664011772e-05, + "loss": 0.020565573126077652, + "num_input_tokens_seen": 32211592, + "step": 1967, + "train_runtime": 16209.9859, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.5451523545706372, + "grad_norm": 0.07025052607059479, + "learning_rate": 9.950257840648307e-05, + "loss": 0.01393582858145237, + "num_input_tokens_seen": 32227968, + "step": 1968, + "train_runtime": 16218.1968, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 0.5454293628808864, + "grad_norm": 0.13161218166351318, + "learning_rate": 9.950195979033846e-05, + "loss": 0.020445559173822403, + "num_input_tokens_seen": 32244344, + "step": 1969, + "train_runtime": 16226.4228, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.5457063711911357, + "grad_norm": 0.07352477312088013, + "learning_rate": 9.950134079168862e-05, + "loss": 0.013248994946479797, + "num_input_tokens_seen": 32260720, + "step": 1970, + "train_runtime": 16234.6585, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.5459833795013851, + "grad_norm": 0.08981064707040787, + "learning_rate": 9.950072141053838e-05, + "loss": 0.018133947625756264, + "num_input_tokens_seen": 32277096, + "step": 1971, + "train_runtime": 16242.8811, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.5462603878116343, + "grad_norm": 0.16845254600048065, + "learning_rate": 9.95001016468925e-05, + "loss": 0.020357206463813782, + "num_input_tokens_seen": 32293472, + "step": 1972, + "train_runtime": 16251.1119, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.5465373961218837, + "grad_norm": 0.06353282183408737, + "learning_rate": 9.949948150075579e-05, + "loss": 0.01594490371644497, + "num_input_tokens_seen": 32309848, + "step": 1973, + "train_runtime": 16259.3436, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.5468144044321329, + "grad_norm": 0.06423836946487427, + "learning_rate": 9.949886097213301e-05, + "loss": 0.014037410728633404, + "num_input_tokens_seen": 32326224, + "step": 1974, + "train_runtime": 16267.5544, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 0.5470914127423823, + "grad_norm": 0.09436669200658798, + "learning_rate": 9.949824006102899e-05, + "loss": 0.013500726781785488, + "num_input_tokens_seen": 32342600, + "step": 1975, + "train_runtime": 16275.7752, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 0.5473684210526316, + "grad_norm": 0.09069113433361053, + "learning_rate": 9.949761876744849e-05, + "loss": 0.018061533570289612, + "num_input_tokens_seen": 32358976, + "step": 1976, + "train_runtime": 16284.0099, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.5476454293628809, + "grad_norm": 0.07036572694778442, + "learning_rate": 9.949699709139634e-05, + "loss": 0.012950531207025051, + "num_input_tokens_seen": 32375352, + "step": 1977, + "train_runtime": 16292.2451, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.5479224376731302, + "grad_norm": 0.10522942990064621, + "learning_rate": 9.949637503287735e-05, + "loss": 0.013676893897354603, + "num_input_tokens_seen": 32391728, + "step": 1978, + "train_runtime": 16300.4784, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.5481994459833796, + "grad_norm": 0.07373467832803726, + "learning_rate": 9.94957525918963e-05, + "loss": 0.014903361909091473, + "num_input_tokens_seen": 32408104, + "step": 1979, + "train_runtime": 16308.7086, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 0.5484764542936288, + "grad_norm": 0.05882371589541435, + "learning_rate": 9.949512976845802e-05, + "loss": 0.015456289984285831, + "num_input_tokens_seen": 32424480, + "step": 1980, + "train_runtime": 16316.9467, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 0.5487534626038781, + "grad_norm": 0.08631424605846405, + "learning_rate": 9.949450656256732e-05, + "loss": 0.014146016910672188, + "num_input_tokens_seen": 32440856, + "step": 1981, + "train_runtime": 16325.1804, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.5490304709141274, + "grad_norm": 0.08013299107551575, + "learning_rate": 9.949388297422899e-05, + "loss": 0.013545701280236244, + "num_input_tokens_seen": 32457232, + "step": 1982, + "train_runtime": 16333.4086, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.5493074792243767, + "grad_norm": 0.1645331233739853, + "learning_rate": 9.949325900344788e-05, + "loss": 0.014792662113904953, + "num_input_tokens_seen": 32473608, + "step": 1983, + "train_runtime": 16341.6179, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 0.5495844875346261, + "grad_norm": 0.12270210683345795, + "learning_rate": 9.94926346502288e-05, + "loss": 0.016181424260139465, + "num_input_tokens_seen": 32489984, + "step": 1984, + "train_runtime": 16349.827, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 0.5498614958448753, + "grad_norm": 0.06407630443572998, + "learning_rate": 9.949200991457657e-05, + "loss": 0.013399852439761162, + "num_input_tokens_seen": 32506360, + "step": 1985, + "train_runtime": 16358.0407, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.5501385041551247, + "grad_norm": 0.09056628495454788, + "learning_rate": 9.949138479649602e-05, + "loss": 0.014293679967522621, + "num_input_tokens_seen": 32522736, + "step": 1986, + "train_runtime": 16366.2452, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 0.550415512465374, + "grad_norm": 0.07968532294034958, + "learning_rate": 9.949075929599199e-05, + "loss": 0.01733057014644146, + "num_input_tokens_seen": 32539112, + "step": 1987, + "train_runtime": 16374.455, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.5506925207756233, + "grad_norm": 0.11393287032842636, + "learning_rate": 9.94901334130693e-05, + "loss": 0.016731083393096924, + "num_input_tokens_seen": 32555488, + "step": 1988, + "train_runtime": 16382.6818, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.5509695290858726, + "grad_norm": 0.07614333927631378, + "learning_rate": 9.94895071477328e-05, + "loss": 0.016742395237088203, + "num_input_tokens_seen": 32571864, + "step": 1989, + "train_runtime": 16390.8985, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 0.5512465373961218, + "grad_norm": 0.07732956856489182, + "learning_rate": 9.948888049998731e-05, + "loss": 0.017228420823812485, + "num_input_tokens_seen": 32588240, + "step": 1990, + "train_runtime": 16399.1101, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.5515235457063712, + "grad_norm": 0.1212335154414177, + "learning_rate": 9.94882534698377e-05, + "loss": 0.020951250568032265, + "num_input_tokens_seen": 32604616, + "step": 1991, + "train_runtime": 16407.3402, + "train_tokens_per_second": 1987.197 + }, + { + "epoch": 0.5518005540166205, + "grad_norm": 0.09377948194742203, + "learning_rate": 9.948762605728878e-05, + "loss": 0.019497783854603767, + "num_input_tokens_seen": 32620992, + "step": 1992, + "train_runtime": 16415.5681, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.5520775623268698, + "grad_norm": 0.09644649177789688, + "learning_rate": 9.948699826234542e-05, + "loss": 0.013992421329021454, + "num_input_tokens_seen": 32637368, + "step": 1993, + "train_runtime": 16423.7728, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.5523545706371191, + "grad_norm": 0.10357899218797684, + "learning_rate": 9.948637008501248e-05, + "loss": 0.01733301393687725, + "num_input_tokens_seen": 32653744, + "step": 1994, + "train_runtime": 16431.9951, + "train_tokens_per_second": 1987.205 + }, + { + "epoch": 0.5526315789473685, + "grad_norm": 0.11469104140996933, + "learning_rate": 9.94857415252948e-05, + "loss": 0.01931411772966385, + "num_input_tokens_seen": 32670120, + "step": 1995, + "train_runtime": 16440.2247, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.5529085872576177, + "grad_norm": 0.07322665303945541, + "learning_rate": 9.948511258319721e-05, + "loss": 0.014291154220700264, + "num_input_tokens_seen": 32686496, + "step": 1996, + "train_runtime": 16448.4454, + "train_tokens_per_second": 1987.209 + }, + { + "epoch": 0.5531855955678671, + "grad_norm": 0.08772885799407959, + "learning_rate": 9.948448325872463e-05, + "loss": 0.018961109220981598, + "num_input_tokens_seen": 32702872, + "step": 1997, + "train_runtime": 16456.684, + "train_tokens_per_second": 1987.209 + }, + { + "epoch": 0.5534626038781163, + "grad_norm": 0.06996327638626099, + "learning_rate": 9.948385355188188e-05, + "loss": 0.017235567793250084, + "num_input_tokens_seen": 32719248, + "step": 1998, + "train_runtime": 16464.9167, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.5537396121883656, + "grad_norm": 0.07537481933832169, + "learning_rate": 9.948322346267384e-05, + "loss": 0.012255710549652576, + "num_input_tokens_seen": 32735624, + "step": 1999, + "train_runtime": 16473.1354, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.554016620498615, + "grad_norm": 0.07489829510450363, + "learning_rate": 9.948259299110538e-05, + "loss": 0.014150896109640598, + "num_input_tokens_seen": 32752000, + "step": 2000, + "train_runtime": 16481.3572, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 0.5542936288088642, + "grad_norm": 0.07289014756679535, + "learning_rate": 9.948196213718135e-05, + "loss": 0.016777269542217255, + "num_input_tokens_seen": 32768376, + "step": 2001, + "train_runtime": 16491.1544, + "train_tokens_per_second": 1987.027 + }, + { + "epoch": 0.5545706371191136, + "grad_norm": 0.07195544242858887, + "learning_rate": 9.948133090090666e-05, + "loss": 0.01660575531423092, + "num_input_tokens_seen": 32784752, + "step": 2002, + "train_runtime": 16499.3556, + "train_tokens_per_second": 1987.032 + }, + { + "epoch": 0.5548476454293629, + "grad_norm": 0.060905821621418, + "learning_rate": 9.948069928228616e-05, + "loss": 0.014020208269357681, + "num_input_tokens_seen": 32801128, + "step": 2003, + "train_runtime": 16507.5601, + "train_tokens_per_second": 1987.037 + }, + { + "epoch": 0.5551246537396122, + "grad_norm": 0.06543374806642532, + "learning_rate": 9.948006728132473e-05, + "loss": 0.017307410016655922, + "num_input_tokens_seen": 32817504, + "step": 2004, + "train_runtime": 16515.7621, + "train_tokens_per_second": 1987.041 + }, + { + "epoch": 0.5554016620498615, + "grad_norm": 0.11733537912368774, + "learning_rate": 9.947943489802729e-05, + "loss": 0.016965821385383606, + "num_input_tokens_seen": 32833880, + "step": 2005, + "train_runtime": 16523.9621, + "train_tokens_per_second": 1987.046 + }, + { + "epoch": 0.5556786703601108, + "grad_norm": 0.05257343128323555, + "learning_rate": 9.947880213239868e-05, + "loss": 0.01190627459436655, + "num_input_tokens_seen": 32850256, + "step": 2006, + "train_runtime": 16532.1666, + "train_tokens_per_second": 1987.051 + }, + { + "epoch": 0.5559556786703601, + "grad_norm": 0.10420121997594833, + "learning_rate": 9.947816898444381e-05, + "loss": 0.01661265827715397, + "num_input_tokens_seen": 32866632, + "step": 2007, + "train_runtime": 16540.3915, + "train_tokens_per_second": 1987.053 + }, + { + "epoch": 0.5562326869806095, + "grad_norm": 0.07173888385295868, + "learning_rate": 9.947753545416759e-05, + "loss": 0.014031942933797836, + "num_input_tokens_seen": 32883008, + "step": 2008, + "train_runtime": 16548.6181, + "train_tokens_per_second": 1987.055 + }, + { + "epoch": 0.5565096952908587, + "grad_norm": 0.07939192652702332, + "learning_rate": 9.947690154157487e-05, + "loss": 0.015179196372628212, + "num_input_tokens_seen": 32899384, + "step": 2009, + "train_runtime": 16556.843, + "train_tokens_per_second": 1987.057 + }, + { + "epoch": 0.556786703601108, + "grad_norm": 0.09970901161432266, + "learning_rate": 9.94762672466706e-05, + "loss": 0.016699809581041336, + "num_input_tokens_seen": 32915760, + "step": 2010, + "train_runtime": 16565.0569, + "train_tokens_per_second": 1987.06 + }, + { + "epoch": 0.5570637119113574, + "grad_norm": 0.0590316504240036, + "learning_rate": 9.947563256945964e-05, + "loss": 0.0132759315893054, + "num_input_tokens_seen": 32932136, + "step": 2011, + "train_runtime": 16573.2663, + "train_tokens_per_second": 1987.064 + }, + { + "epoch": 0.5573407202216066, + "grad_norm": 0.06612753868103027, + "learning_rate": 9.94749975099469e-05, + "loss": 0.019153311848640442, + "num_input_tokens_seen": 32948512, + "step": 2012, + "train_runtime": 16581.468, + "train_tokens_per_second": 1987.068 + }, + { + "epoch": 0.557617728531856, + "grad_norm": 0.127000629901886, + "learning_rate": 9.947436206813734e-05, + "loss": 0.017250893637537956, + "num_input_tokens_seen": 32964888, + "step": 2013, + "train_runtime": 16589.6767, + "train_tokens_per_second": 1987.072 + }, + { + "epoch": 0.5578947368421052, + "grad_norm": 0.08910764753818512, + "learning_rate": 9.947372624403579e-05, + "loss": 0.015683522447943687, + "num_input_tokens_seen": 32981264, + "step": 2014, + "train_runtime": 16597.8816, + "train_tokens_per_second": 1987.077 + }, + { + "epoch": 0.5581717451523546, + "grad_norm": 0.0789509192109108, + "learning_rate": 9.947309003764722e-05, + "loss": 0.014504469931125641, + "num_input_tokens_seen": 32997640, + "step": 2015, + "train_runtime": 16606.1001, + "train_tokens_per_second": 1987.079 + }, + { + "epoch": 0.5584487534626039, + "grad_norm": 0.13060970604419708, + "learning_rate": 9.947245344897653e-05, + "loss": 0.021255047991871834, + "num_input_tokens_seen": 33014016, + "step": 2016, + "train_runtime": 16614.3122, + "train_tokens_per_second": 1987.083 + }, + { + "epoch": 0.5587257617728532, + "grad_norm": 0.12965989112854004, + "learning_rate": 9.947181647802863e-05, + "loss": 0.016024865210056305, + "num_input_tokens_seen": 33030392, + "step": 2017, + "train_runtime": 16622.517, + "train_tokens_per_second": 1987.087 + }, + { + "epoch": 0.5590027700831025, + "grad_norm": 0.08025933057069778, + "learning_rate": 9.947117912480843e-05, + "loss": 0.01825520396232605, + "num_input_tokens_seen": 33046768, + "step": 2018, + "train_runtime": 16630.7213, + "train_tokens_per_second": 1987.092 + }, + { + "epoch": 0.5592797783933517, + "grad_norm": 0.0664687380194664, + "learning_rate": 9.94705413893209e-05, + "loss": 0.017378196120262146, + "num_input_tokens_seen": 33063144, + "step": 2019, + "train_runtime": 16638.9314, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.5595567867036011, + "grad_norm": 0.1010994166135788, + "learning_rate": 9.946990327157094e-05, + "loss": 0.01681683398783207, + "num_input_tokens_seen": 33079520, + "step": 2020, + "train_runtime": 16647.1556, + "train_tokens_per_second": 1987.097 + }, + { + "epoch": 0.5598337950138504, + "grad_norm": 0.06476171314716339, + "learning_rate": 9.946926477156346e-05, + "loss": 0.015957074239850044, + "num_input_tokens_seen": 33095896, + "step": 2021, + "train_runtime": 16655.3805, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.5601108033240997, + "grad_norm": 0.07848963141441345, + "learning_rate": 9.946862588930343e-05, + "loss": 0.016619902104139328, + "num_input_tokens_seen": 33112272, + "step": 2022, + "train_runtime": 16663.5909, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.560387811634349, + "grad_norm": 0.09192240983247757, + "learning_rate": 9.946798662479577e-05, + "loss": 0.021507766097784042, + "num_input_tokens_seen": 33128648, + "step": 2023, + "train_runtime": 16671.8087, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.5606648199445984, + "grad_norm": 0.07531498372554779, + "learning_rate": 9.946734697804542e-05, + "loss": 0.013042747043073177, + "num_input_tokens_seen": 33145024, + "step": 2024, + "train_runtime": 16680.0221, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.5609418282548476, + "grad_norm": 0.09018301963806152, + "learning_rate": 9.946670694905732e-05, + "loss": 0.016136135905981064, + "num_input_tokens_seen": 33161400, + "step": 2025, + "train_runtime": 16688.2326, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.561218836565097, + "grad_norm": 0.09093818813562393, + "learning_rate": 9.946606653783644e-05, + "loss": 0.020574774593114853, + "num_input_tokens_seen": 33177776, + "step": 2026, + "train_runtime": 16696.4417, + "train_tokens_per_second": 1987.117 + }, + { + "epoch": 0.5614958448753462, + "grad_norm": 0.1229904517531395, + "learning_rate": 9.946542574438769e-05, + "loss": 0.019672786816954613, + "num_input_tokens_seen": 33194152, + "step": 2027, + "train_runtime": 16704.6601, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.5617728531855956, + "grad_norm": 0.10301195830106735, + "learning_rate": 9.946478456871605e-05, + "loss": 0.01775110699236393, + "num_input_tokens_seen": 33210528, + "step": 2028, + "train_runtime": 16712.8726, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.5620498614958449, + "grad_norm": 0.050880931317806244, + "learning_rate": 9.946414301082644e-05, + "loss": 0.012934915721416473, + "num_input_tokens_seen": 33226904, + "step": 2029, + "train_runtime": 16721.0843, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.5623268698060941, + "grad_norm": 0.09030082076787949, + "learning_rate": 9.946350107072386e-05, + "loss": 0.01641332544386387, + "num_input_tokens_seen": 33243280, + "step": 2030, + "train_runtime": 16729.2891, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.5626038781163435, + "grad_norm": 0.10841050744056702, + "learning_rate": 9.946285874841326e-05, + "loss": 0.018907198682427406, + "num_input_tokens_seen": 33259656, + "step": 2031, + "train_runtime": 16737.5022, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.5628808864265928, + "grad_norm": 0.06484673172235489, + "learning_rate": 9.946221604389958e-05, + "loss": 0.016217142343521118, + "num_input_tokens_seen": 33276032, + "step": 2032, + "train_runtime": 16745.7276, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 0.5631578947368421, + "grad_norm": 0.09382252395153046, + "learning_rate": 9.946157295718781e-05, + "loss": 0.020303839817643166, + "num_input_tokens_seen": 33292408, + "step": 2033, + "train_runtime": 16753.9568, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.5634349030470914, + "grad_norm": 0.0824592337012291, + "learning_rate": 9.946092948828289e-05, + "loss": 0.020212672650814056, + "num_input_tokens_seen": 33308784, + "step": 2034, + "train_runtime": 16762.1886, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.5637119113573407, + "grad_norm": 0.10817951709032059, + "learning_rate": 9.946028563718984e-05, + "loss": 0.01600152626633644, + "num_input_tokens_seen": 33325160, + "step": 2035, + "train_runtime": 16770.4186, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.56398891966759, + "grad_norm": 0.08264970034360886, + "learning_rate": 9.94596414039136e-05, + "loss": 0.016023986041545868, + "num_input_tokens_seen": 33341536, + "step": 2036, + "train_runtime": 16778.6576, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.5642659279778394, + "grad_norm": 0.07150854170322418, + "learning_rate": 9.945899678845916e-05, + "loss": 0.013909805566072464, + "num_input_tokens_seen": 33357912, + "step": 2037, + "train_runtime": 16786.8931, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.5645429362880886, + "grad_norm": 0.08239323645830154, + "learning_rate": 9.94583517908315e-05, + "loss": 0.017493758350610733, + "num_input_tokens_seen": 33374288, + "step": 2038, + "train_runtime": 16795.1356, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.564819944598338, + "grad_norm": 0.10127869248390198, + "learning_rate": 9.945770641103558e-05, + "loss": 0.014520109631121159, + "num_input_tokens_seen": 33390664, + "step": 2039, + "train_runtime": 16803.3545, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.5650969529085873, + "grad_norm": 0.06544627994298935, + "learning_rate": 9.945706064907641e-05, + "loss": 0.015341592952609062, + "num_input_tokens_seen": 33407040, + "step": 2040, + "train_runtime": 16811.5878, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.5653739612188365, + "grad_norm": 0.09262189269065857, + "learning_rate": 9.9456414504959e-05, + "loss": 0.01912400871515274, + "num_input_tokens_seen": 33423416, + "step": 2041, + "train_runtime": 16819.8121, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.5656509695290859, + "grad_norm": 0.09583679586648941, + "learning_rate": 9.94557679786883e-05, + "loss": 0.017208188772201538, + "num_input_tokens_seen": 33439792, + "step": 2042, + "train_runtime": 16828.0259, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 0.5659279778393351, + "grad_norm": 0.09018665552139282, + "learning_rate": 9.945512107026933e-05, + "loss": 0.01470563467592001, + "num_input_tokens_seen": 33456168, + "step": 2043, + "train_runtime": 16836.2431, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.5662049861495845, + "grad_norm": 0.15051010251045227, + "learning_rate": 9.945447377970709e-05, + "loss": 0.018192840740084648, + "num_input_tokens_seen": 33472544, + "step": 2044, + "train_runtime": 16844.4818, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.5664819944598338, + "grad_norm": 0.09831003099679947, + "learning_rate": 9.945382610700657e-05, + "loss": 0.019183633849024773, + "num_input_tokens_seen": 33488920, + "step": 2045, + "train_runtime": 16852.722, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.5667590027700831, + "grad_norm": 0.08251814544200897, + "learning_rate": 9.94531780521728e-05, + "loss": 0.014048844575881958, + "num_input_tokens_seen": 33505296, + "step": 2046, + "train_runtime": 16860.9579, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.5670360110803324, + "grad_norm": 0.07679334282875061, + "learning_rate": 9.945252961521075e-05, + "loss": 0.011070278473198414, + "num_input_tokens_seen": 33521672, + "step": 2047, + "train_runtime": 16869.2002, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.5673130193905818, + "grad_norm": 0.08362238109111786, + "learning_rate": 9.945188079612545e-05, + "loss": 0.016134927049279213, + "num_input_tokens_seen": 33538048, + "step": 2048, + "train_runtime": 16877.4202, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.567590027700831, + "grad_norm": 0.060841675847768784, + "learning_rate": 9.945123159492192e-05, + "loss": 0.012675528414547443, + "num_input_tokens_seen": 33554424, + "step": 2049, + "train_runtime": 16885.6608, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.5678670360110804, + "grad_norm": 0.10795112699270248, + "learning_rate": 9.945058201160516e-05, + "loss": 0.01531983818858862, + "num_input_tokens_seen": 33570800, + "step": 2050, + "train_runtime": 16893.8966, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.5681440443213296, + "grad_norm": 0.07719950377941132, + "learning_rate": 9.94499320461802e-05, + "loss": 0.013697127811610699, + "num_input_tokens_seen": 33587176, + "step": 2051, + "train_runtime": 16902.1324, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.5684210526315789, + "grad_norm": 0.08880996704101562, + "learning_rate": 9.944928169865206e-05, + "loss": 0.017546208575367928, + "num_input_tokens_seen": 33603552, + "step": 2052, + "train_runtime": 16910.3706, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.5686980609418283, + "grad_norm": 0.0918957069516182, + "learning_rate": 9.944863096902578e-05, + "loss": 0.018676087260246277, + "num_input_tokens_seen": 33619928, + "step": 2053, + "train_runtime": 16918.6087, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.5689750692520775, + "grad_norm": 0.12257854640483856, + "learning_rate": 9.944797985730636e-05, + "loss": 0.014722314663231373, + "num_input_tokens_seen": 33636304, + "step": 2054, + "train_runtime": 16926.8426, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.5692520775623269, + "grad_norm": 0.09732627123594284, + "learning_rate": 9.944732836349887e-05, + "loss": 0.014147480018436909, + "num_input_tokens_seen": 33652680, + "step": 2055, + "train_runtime": 16935.075, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 0.5695290858725762, + "grad_norm": 0.12540754675865173, + "learning_rate": 9.944667648760828e-05, + "loss": 0.015071257017552853, + "num_input_tokens_seen": 33669056, + "step": 2056, + "train_runtime": 16943.2948, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.5698060941828255, + "grad_norm": 0.1099901795387268, + "learning_rate": 9.94460242296397e-05, + "loss": 0.018006348982453346, + "num_input_tokens_seen": 33685432, + "step": 2057, + "train_runtime": 16951.5097, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.5700831024930748, + "grad_norm": 0.138932004570961, + "learning_rate": 9.944537158959812e-05, + "loss": 0.017575882375240326, + "num_input_tokens_seen": 33701808, + "step": 2058, + "train_runtime": 16959.7247, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.570360110803324, + "grad_norm": 0.07946386188268661, + "learning_rate": 9.94447185674886e-05, + "loss": 0.016513589769601822, + "num_input_tokens_seen": 33718184, + "step": 2059, + "train_runtime": 16967.9409, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.5706371191135734, + "grad_norm": 0.1253076195716858, + "learning_rate": 9.94440651633162e-05, + "loss": 0.01861470937728882, + "num_input_tokens_seen": 33734560, + "step": 2060, + "train_runtime": 16976.169, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 0.5709141274238227, + "grad_norm": 0.10415740311145782, + "learning_rate": 9.944341137708592e-05, + "loss": 0.016478730365633965, + "num_input_tokens_seen": 33750936, + "step": 2061, + "train_runtime": 16984.3981, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.571191135734072, + "grad_norm": 0.07228225469589233, + "learning_rate": 9.944275720880288e-05, + "loss": 0.01349769625812769, + "num_input_tokens_seen": 33767312, + "step": 2062, + "train_runtime": 16992.6138, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 0.5714681440443213, + "grad_norm": 0.12917876243591309, + "learning_rate": 9.94421026584721e-05, + "loss": 0.019585639238357544, + "num_input_tokens_seen": 33783688, + "step": 2063, + "train_runtime": 17000.8286, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.5717451523545707, + "grad_norm": 0.08635582774877548, + "learning_rate": 9.944144772609863e-05, + "loss": 0.017835838720202446, + "num_input_tokens_seen": 33800064, + "step": 2064, + "train_runtime": 17009.0384, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 0.5720221606648199, + "grad_norm": 0.09668773412704468, + "learning_rate": 9.944079241168753e-05, + "loss": 0.018478713929653168, + "num_input_tokens_seen": 33816440, + "step": 2065, + "train_runtime": 17017.2555, + "train_tokens_per_second": 1987.185 + }, + { + "epoch": 0.5722991689750693, + "grad_norm": 0.06770260632038116, + "learning_rate": 9.944013671524389e-05, + "loss": 0.014112668111920357, + "num_input_tokens_seen": 33832816, + "step": 2066, + "train_runtime": 17025.4841, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.5725761772853185, + "grad_norm": 0.0619015246629715, + "learning_rate": 9.943948063677274e-05, + "loss": 0.015527596697211266, + "num_input_tokens_seen": 33849192, + "step": 2067, + "train_runtime": 17033.7157, + "train_tokens_per_second": 1987.188 + }, + { + "epoch": 0.5728531855955679, + "grad_norm": 0.06738870590925217, + "learning_rate": 9.943882417627919e-05, + "loss": 0.014975002966821194, + "num_input_tokens_seen": 33865568, + "step": 2068, + "train_runtime": 17041.9281, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.5731301939058172, + "grad_norm": 0.10329417139291763, + "learning_rate": 9.943816733376827e-05, + "loss": 0.017806634306907654, + "num_input_tokens_seen": 33881944, + "step": 2069, + "train_runtime": 17050.1382, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.5734072022160664, + "grad_norm": 0.0819372907280922, + "learning_rate": 9.943751010924509e-05, + "loss": 0.018918341025710106, + "num_input_tokens_seen": 33898320, + "step": 2070, + "train_runtime": 17058.3544, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.5736842105263158, + "grad_norm": 0.08632764220237732, + "learning_rate": 9.943685250271473e-05, + "loss": 0.01748986914753914, + "num_input_tokens_seen": 33914696, + "step": 2071, + "train_runtime": 17066.5748, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.5739612188365651, + "grad_norm": 0.08628295361995697, + "learning_rate": 9.943619451418224e-05, + "loss": 0.015241606160998344, + "num_input_tokens_seen": 33931072, + "step": 2072, + "train_runtime": 17074.804, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.5742382271468144, + "grad_norm": 0.0764111801981926, + "learning_rate": 9.943553614365273e-05, + "loss": 0.016579538583755493, + "num_input_tokens_seen": 33947448, + "step": 2073, + "train_runtime": 17083.0357, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 0.5745152354570637, + "grad_norm": 0.0626486986875534, + "learning_rate": 9.943487739113126e-05, + "loss": 0.012230983003973961, + "num_input_tokens_seen": 33963824, + "step": 2074, + "train_runtime": 17091.2693, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.574792243767313, + "grad_norm": 0.08768294006586075, + "learning_rate": 9.943421825662296e-05, + "loss": 0.014580314978957176, + "num_input_tokens_seen": 33980200, + "step": 2075, + "train_runtime": 17099.4853, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.5750692520775623, + "grad_norm": 0.09184559434652328, + "learning_rate": 9.94335587401329e-05, + "loss": 0.01764187030494213, + "num_input_tokens_seen": 33996576, + "step": 2076, + "train_runtime": 17107.6922, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.5753462603878117, + "grad_norm": 0.08221520483493805, + "learning_rate": 9.943289884166618e-05, + "loss": 0.01606881245970726, + "num_input_tokens_seen": 34012952, + "step": 2077, + "train_runtime": 17115.9026, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.5756232686980609, + "grad_norm": 0.09520773589611053, + "learning_rate": 9.943223856122788e-05, + "loss": 0.01697063073515892, + "num_input_tokens_seen": 34029328, + "step": 2078, + "train_runtime": 17124.1225, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 0.5759002770083103, + "grad_norm": 0.09960732609033585, + "learning_rate": 9.943157789882313e-05, + "loss": 0.018337858840823174, + "num_input_tokens_seen": 34045704, + "step": 2079, + "train_runtime": 17132.3441, + "train_tokens_per_second": 1987.218 + }, + { + "epoch": 0.5761772853185596, + "grad_norm": 0.13288827240467072, + "learning_rate": 9.943091685445705e-05, + "loss": 0.015325398184359074, + "num_input_tokens_seen": 34062080, + "step": 2080, + "train_runtime": 17140.5704, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 0.5764542936288088, + "grad_norm": 0.11800529062747955, + "learning_rate": 9.943025542813469e-05, + "loss": 0.017974909394979477, + "num_input_tokens_seen": 34078456, + "step": 2081, + "train_runtime": 17148.8122, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 0.5767313019390582, + "grad_norm": 0.0879470705986023, + "learning_rate": 9.942959361986119e-05, + "loss": 0.014782303012907505, + "num_input_tokens_seen": 34094832, + "step": 2082, + "train_runtime": 17157.0358, + "train_tokens_per_second": 1987.222 + }, + { + "epoch": 0.5770083102493074, + "grad_norm": 0.08500342816114426, + "learning_rate": 9.942893142964169e-05, + "loss": 0.01752835139632225, + "num_input_tokens_seen": 34111208, + "step": 2083, + "train_runtime": 17165.2551, + "train_tokens_per_second": 1987.224 + }, + { + "epoch": 0.5772853185595568, + "grad_norm": 0.07261700183153152, + "learning_rate": 9.942826885748127e-05, + "loss": 0.015188219025731087, + "num_input_tokens_seen": 34127584, + "step": 2084, + "train_runtime": 17173.4808, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.5775623268698061, + "grad_norm": 0.06685040891170502, + "learning_rate": 9.942760590338506e-05, + "loss": 0.014520355500280857, + "num_input_tokens_seen": 34143960, + "step": 2085, + "train_runtime": 17181.7325, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 0.5778393351800554, + "grad_norm": 0.10012686997652054, + "learning_rate": 9.942694256735821e-05, + "loss": 0.01392953097820282, + "num_input_tokens_seen": 34160336, + "step": 2086, + "train_runtime": 17189.9677, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 0.5781163434903047, + "grad_norm": 0.08537033200263977, + "learning_rate": 9.942627884940581e-05, + "loss": 0.012747320346534252, + "num_input_tokens_seen": 34176712, + "step": 2087, + "train_runtime": 17198.1871, + "train_tokens_per_second": 1987.228 + }, + { + "epoch": 0.5783933518005541, + "grad_norm": 0.09641183167695999, + "learning_rate": 9.942561474953298e-05, + "loss": 0.01592171937227249, + "num_input_tokens_seen": 34193088, + "step": 2088, + "train_runtime": 17206.4031, + "train_tokens_per_second": 1987.23 + }, + { + "epoch": 0.5786703601108033, + "grad_norm": 0.061392005532979965, + "learning_rate": 9.942495026774489e-05, + "loss": 0.013644108548760414, + "num_input_tokens_seen": 34209464, + "step": 2089, + "train_runtime": 17214.6369, + "train_tokens_per_second": 1987.231 + }, + { + "epoch": 0.5789473684210527, + "grad_norm": 0.13113358616828918, + "learning_rate": 9.942428540404667e-05, + "loss": 0.01838780753314495, + "num_input_tokens_seen": 34225840, + "step": 2090, + "train_runtime": 17222.8616, + "train_tokens_per_second": 1987.233 + }, + { + "epoch": 0.5792243767313019, + "grad_norm": 0.07764401286840439, + "learning_rate": 9.942362015844342e-05, + "loss": 0.01491974201053381, + "num_input_tokens_seen": 34242216, + "step": 2091, + "train_runtime": 17231.0863, + "train_tokens_per_second": 1987.235 + }, + { + "epoch": 0.5795013850415512, + "grad_norm": 0.06776473671197891, + "learning_rate": 9.942295453094032e-05, + "loss": 0.013621268793940544, + "num_input_tokens_seen": 34258592, + "step": 2092, + "train_runtime": 17239.3021, + "train_tokens_per_second": 1987.238 + }, + { + "epoch": 0.5797783933518006, + "grad_norm": 0.09405867755413055, + "learning_rate": 9.94222885215425e-05, + "loss": 0.015831707045435905, + "num_input_tokens_seen": 34274968, + "step": 2093, + "train_runtime": 17247.5384, + "train_tokens_per_second": 1987.238 + }, + { + "epoch": 0.5800554016620498, + "grad_norm": 0.09941450506448746, + "learning_rate": 9.942162213025508e-05, + "loss": 0.015160268172621727, + "num_input_tokens_seen": 34291344, + "step": 2094, + "train_runtime": 17255.759, + "train_tokens_per_second": 1987.241 + }, + { + "epoch": 0.5803324099722992, + "grad_norm": 0.07711614668369293, + "learning_rate": 9.942095535708325e-05, + "loss": 0.01426868885755539, + "num_input_tokens_seen": 34307720, + "step": 2095, + "train_runtime": 17263.9746, + "train_tokens_per_second": 1987.243 + }, + { + "epoch": 0.5806094182825485, + "grad_norm": 0.09398162364959717, + "learning_rate": 9.942028820203215e-05, + "loss": 0.013013315387070179, + "num_input_tokens_seen": 34324096, + "step": 2096, + "train_runtime": 17272.1934, + "train_tokens_per_second": 1987.246 + }, + { + "epoch": 0.5808864265927978, + "grad_norm": 0.10778792202472687, + "learning_rate": 9.941962066510693e-05, + "loss": 0.016402974724769592, + "num_input_tokens_seen": 34340472, + "step": 2097, + "train_runtime": 17280.4075, + "train_tokens_per_second": 1987.249 + }, + { + "epoch": 0.5811634349030471, + "grad_norm": 0.08213140070438385, + "learning_rate": 9.941895274631274e-05, + "loss": 0.014696250669658184, + "num_input_tokens_seen": 34356848, + "step": 2098, + "train_runtime": 17288.6391, + "train_tokens_per_second": 1987.25 + }, + { + "epoch": 0.5814404432132964, + "grad_norm": 0.09819025546312332, + "learning_rate": 9.941828444565475e-05, + "loss": 0.017754850909113884, + "num_input_tokens_seen": 34373224, + "step": 2099, + "train_runtime": 17296.8745, + "train_tokens_per_second": 1987.251 + }, + { + "epoch": 0.5817174515235457, + "grad_norm": 0.07120045274496078, + "learning_rate": 9.941761576313812e-05, + "loss": 0.016865864396095276, + "num_input_tokens_seen": 34389600, + "step": 2100, + "train_runtime": 17305.1177, + "train_tokens_per_second": 1987.25 + }, + { + "epoch": 0.581994459833795, + "grad_norm": 0.10060396790504456, + "learning_rate": 9.941694669876804e-05, + "loss": 0.015393567271530628, + "num_input_tokens_seen": 34405976, + "step": 2101, + "train_runtime": 17314.9402, + "train_tokens_per_second": 1987.069 + }, + { + "epoch": 0.5822714681440443, + "grad_norm": 0.08542817831039429, + "learning_rate": 9.941627725254965e-05, + "loss": 0.016537373885512352, + "num_input_tokens_seen": 34422352, + "step": 2102, + "train_runtime": 17323.1592, + "train_tokens_per_second": 1987.071 + }, + { + "epoch": 0.5825484764542936, + "grad_norm": 0.07421402633190155, + "learning_rate": 9.941560742448815e-05, + "loss": 0.013574734330177307, + "num_input_tokens_seen": 34438728, + "step": 2103, + "train_runtime": 17331.3957, + "train_tokens_per_second": 1987.072 + }, + { + "epoch": 0.582825484764543, + "grad_norm": 0.06698696315288544, + "learning_rate": 9.941493721458867e-05, + "loss": 0.012773135676980019, + "num_input_tokens_seen": 34455104, + "step": 2104, + "train_runtime": 17339.6234, + "train_tokens_per_second": 1987.073 + }, + { + "epoch": 0.5831024930747922, + "grad_norm": 0.11437013000249863, + "learning_rate": 9.941426662285645e-05, + "loss": 0.01744963601231575, + "num_input_tokens_seen": 34471480, + "step": 2105, + "train_runtime": 17347.8467, + "train_tokens_per_second": 1987.075 + }, + { + "epoch": 0.5833795013850416, + "grad_norm": 0.132111594080925, + "learning_rate": 9.941359564929662e-05, + "loss": 0.016521312296390533, + "num_input_tokens_seen": 34487856, + "step": 2106, + "train_runtime": 17356.0841, + "train_tokens_per_second": 1987.076 + }, + { + "epoch": 0.5836565096952908, + "grad_norm": 0.05906875059008598, + "learning_rate": 9.941292429391437e-05, + "loss": 0.01565786823630333, + "num_input_tokens_seen": 34504232, + "step": 2107, + "train_runtime": 17364.3213, + "train_tokens_per_second": 1987.076 + }, + { + "epoch": 0.5839335180055402, + "grad_norm": 0.07817839831113815, + "learning_rate": 9.941225255671494e-05, + "loss": 0.01654141955077648, + "num_input_tokens_seen": 34520608, + "step": 2108, + "train_runtime": 17372.5338, + "train_tokens_per_second": 1987.08 + }, + { + "epoch": 0.5842105263157895, + "grad_norm": 0.07641485333442688, + "learning_rate": 9.941158043770345e-05, + "loss": 0.016774535179138184, + "num_input_tokens_seen": 34536984, + "step": 2109, + "train_runtime": 17380.7439, + "train_tokens_per_second": 1987.083 + }, + { + "epoch": 0.5844875346260388, + "grad_norm": 0.09288601577281952, + "learning_rate": 9.941090793688514e-05, + "loss": 0.016895538195967674, + "num_input_tokens_seen": 34553360, + "step": 2110, + "train_runtime": 17388.9578, + "train_tokens_per_second": 1987.086 + }, + { + "epoch": 0.5847645429362881, + "grad_norm": 0.08092936128377914, + "learning_rate": 9.94102350542652e-05, + "loss": 0.015276138670742512, + "num_input_tokens_seen": 34569736, + "step": 2111, + "train_runtime": 17397.1669, + "train_tokens_per_second": 1987.09 + }, + { + "epoch": 0.5850415512465375, + "grad_norm": 0.12148188799619675, + "learning_rate": 9.940956178984881e-05, + "loss": 0.015456028282642365, + "num_input_tokens_seen": 34586112, + "step": 2112, + "train_runtime": 17405.3762, + "train_tokens_per_second": 1987.094 + }, + { + "epoch": 0.5853185595567867, + "grad_norm": 0.07623276114463806, + "learning_rate": 9.940888814364119e-05, + "loss": 0.01601051166653633, + "num_input_tokens_seen": 34602488, + "step": 2113, + "train_runtime": 17413.6145, + "train_tokens_per_second": 1987.094 + }, + { + "epoch": 0.585595567867036, + "grad_norm": 0.09564834088087082, + "learning_rate": 9.940821411564753e-05, + "loss": 0.016697479411959648, + "num_input_tokens_seen": 34618864, + "step": 2114, + "train_runtime": 17421.8571, + "train_tokens_per_second": 1987.094 + }, + { + "epoch": 0.5858725761772853, + "grad_norm": 0.05428877845406532, + "learning_rate": 9.940753970587307e-05, + "loss": 0.018592530861496925, + "num_input_tokens_seen": 34635240, + "step": 2115, + "train_runtime": 17430.0818, + "train_tokens_per_second": 1987.096 + }, + { + "epoch": 0.5861495844875346, + "grad_norm": 0.09089221060276031, + "learning_rate": 9.940686491432299e-05, + "loss": 0.018406206741929054, + "num_input_tokens_seen": 34651616, + "step": 2116, + "train_runtime": 17438.2882, + "train_tokens_per_second": 1987.1 + }, + { + "epoch": 0.586426592797784, + "grad_norm": 0.06803187727928162, + "learning_rate": 9.94061897410025e-05, + "loss": 0.016766831278800964, + "num_input_tokens_seen": 34667992, + "step": 2117, + "train_runtime": 17446.4973, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.5867036011080332, + "grad_norm": 0.1148122102022171, + "learning_rate": 9.940551418591684e-05, + "loss": 0.015121682547032833, + "num_input_tokens_seen": 34684368, + "step": 2118, + "train_runtime": 17454.7058, + "train_tokens_per_second": 1987.107 + }, + { + "epoch": 0.5869806094182826, + "grad_norm": 0.0855252668261528, + "learning_rate": 9.940483824907122e-05, + "loss": 0.01603671722114086, + "num_input_tokens_seen": 34700744, + "step": 2119, + "train_runtime": 17462.9387, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.5872576177285319, + "grad_norm": 0.0595712885260582, + "learning_rate": 9.940416193047086e-05, + "loss": 0.012584332376718521, + "num_input_tokens_seen": 34717120, + "step": 2120, + "train_runtime": 17471.1711, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.5875346260387811, + "grad_norm": 0.07593487948179245, + "learning_rate": 9.9403485230121e-05, + "loss": 0.01730220951139927, + "num_input_tokens_seen": 34733496, + "step": 2121, + "train_runtime": 17479.4005, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.5878116343490305, + "grad_norm": 0.0707104504108429, + "learning_rate": 9.940280814802685e-05, + "loss": 0.01555198896676302, + "num_input_tokens_seen": 34749872, + "step": 2122, + "train_runtime": 17487.6344, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.5880886426592797, + "grad_norm": 0.08739277720451355, + "learning_rate": 9.940213068419366e-05, + "loss": 0.016562171280384064, + "num_input_tokens_seen": 34766248, + "step": 2123, + "train_runtime": 17495.8587, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.5883656509695291, + "grad_norm": 0.04369286820292473, + "learning_rate": 9.940145283862666e-05, + "loss": 0.011739500798285007, + "num_input_tokens_seen": 34782624, + "step": 2124, + "train_runtime": 17504.0692, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.5886426592797784, + "grad_norm": 0.0697917491197586, + "learning_rate": 9.940077461133106e-05, + "loss": 0.015692664310336113, + "num_input_tokens_seen": 34799000, + "step": 2125, + "train_runtime": 17512.2786, + "train_tokens_per_second": 1987.12 + }, + { + "epoch": 0.5889196675900277, + "grad_norm": 0.07757745683193207, + "learning_rate": 9.940009600231213e-05, + "loss": 0.01334304641932249, + "num_input_tokens_seen": 34815376, + "step": 2126, + "train_runtime": 17520.4866, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.589196675900277, + "grad_norm": 0.06246872618794441, + "learning_rate": 9.939941701157511e-05, + "loss": 0.013325977139174938, + "num_input_tokens_seen": 34831752, + "step": 2127, + "train_runtime": 17528.704, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.5894736842105263, + "grad_norm": 0.057860080152750015, + "learning_rate": 9.939873763912526e-05, + "loss": 0.012500537559390068, + "num_input_tokens_seen": 34848128, + "step": 2128, + "train_runtime": 17536.9352, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.5897506925207756, + "grad_norm": 0.1348852515220642, + "learning_rate": 9.939805788496779e-05, + "loss": 0.017121801152825356, + "num_input_tokens_seen": 34864504, + "step": 2129, + "train_runtime": 17545.1652, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.590027700831025, + "grad_norm": 0.11328700929880142, + "learning_rate": 9.939737774910799e-05, + "loss": 0.016583099961280823, + "num_input_tokens_seen": 34880880, + "step": 2130, + "train_runtime": 17553.3937, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.5903047091412742, + "grad_norm": 0.08025941252708435, + "learning_rate": 9.93966972315511e-05, + "loss": 0.015208225697278976, + "num_input_tokens_seen": 34897256, + "step": 2131, + "train_runtime": 17561.5977, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.5905817174515235, + "grad_norm": 0.10152803361415863, + "learning_rate": 9.939601633230238e-05, + "loss": 0.016485029831528664, + "num_input_tokens_seen": 34913632, + "step": 2132, + "train_runtime": 17569.8096, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.5908587257617729, + "grad_norm": 0.09250335395336151, + "learning_rate": 9.939533505136708e-05, + "loss": 0.01951049081981182, + "num_input_tokens_seen": 34930008, + "step": 2133, + "train_runtime": 17578.0328, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.5911357340720221, + "grad_norm": 0.1222112849354744, + "learning_rate": 9.93946533887505e-05, + "loss": 0.01716788299381733, + "num_input_tokens_seen": 34946384, + "step": 2134, + "train_runtime": 17586.2649, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.5914127423822715, + "grad_norm": 0.0690528154373169, + "learning_rate": 9.939397134445787e-05, + "loss": 0.01295439712703228, + "num_input_tokens_seen": 34962760, + "step": 2135, + "train_runtime": 17594.4887, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.5916897506925207, + "grad_norm": 0.07266223430633545, + "learning_rate": 9.939328891849446e-05, + "loss": 0.01531321369111538, + "num_input_tokens_seen": 34979136, + "step": 2136, + "train_runtime": 17602.7158, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.5919667590027701, + "grad_norm": 0.056812603026628494, + "learning_rate": 9.939260611086557e-05, + "loss": 0.013076983392238617, + "num_input_tokens_seen": 34995512, + "step": 2137, + "train_runtime": 17610.9588, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.5922437673130194, + "grad_norm": 0.06403439491987228, + "learning_rate": 9.939192292157647e-05, + "loss": 0.01755025051534176, + "num_input_tokens_seen": 35011888, + "step": 2138, + "train_runtime": 17619.1901, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.5925207756232687, + "grad_norm": 0.2924961745738983, + "learning_rate": 9.939123935063241e-05, + "loss": 0.02040422335267067, + "num_input_tokens_seen": 35028264, + "step": 2139, + "train_runtime": 17627.4213, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.592797783933518, + "grad_norm": 0.06864126026630402, + "learning_rate": 9.939055539803871e-05, + "loss": 0.01657227799296379, + "num_input_tokens_seen": 35044640, + "step": 2140, + "train_runtime": 17635.6599, + "train_tokens_per_second": 1987.147 + }, + { + "epoch": 0.5930747922437674, + "grad_norm": 0.07944225519895554, + "learning_rate": 9.938987106380063e-05, + "loss": 0.016660036519169807, + "num_input_tokens_seen": 35061016, + "step": 2141, + "train_runtime": 17643.8742, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.5933518005540166, + "grad_norm": 0.1320721060037613, + "learning_rate": 9.938918634792347e-05, + "loss": 0.017126066610217094, + "num_input_tokens_seen": 35077392, + "step": 2142, + "train_runtime": 17652.0987, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.5936288088642659, + "grad_norm": 0.09361735731363297, + "learning_rate": 9.938850125041252e-05, + "loss": 0.014089793898165226, + "num_input_tokens_seen": 35093768, + "step": 2143, + "train_runtime": 17660.3342, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.5939058171745152, + "grad_norm": 0.068679578602314, + "learning_rate": 9.938781577127306e-05, + "loss": 0.01790490932762623, + "num_input_tokens_seen": 35110144, + "step": 2144, + "train_runtime": 17668.5675, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.5941828254847645, + "grad_norm": 0.06704232096672058, + "learning_rate": 9.93871299105104e-05, + "loss": 0.016735319048166275, + "num_input_tokens_seen": 35126520, + "step": 2145, + "train_runtime": 17676.8046, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.5944598337950139, + "grad_norm": 0.09206346422433853, + "learning_rate": 9.938644366812986e-05, + "loss": 0.01891777664422989, + "num_input_tokens_seen": 35142896, + "step": 2146, + "train_runtime": 17685.0351, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.5947368421052631, + "grad_norm": 0.08925329148769379, + "learning_rate": 9.93857570441367e-05, + "loss": 0.015556924045085907, + "num_input_tokens_seen": 35159272, + "step": 2147, + "train_runtime": 17693.2672, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.5950138504155125, + "grad_norm": 0.10765012353658676, + "learning_rate": 9.938507003853625e-05, + "loss": 0.015501763671636581, + "num_input_tokens_seen": 35175648, + "step": 2148, + "train_runtime": 17701.4891, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.5952908587257618, + "grad_norm": 0.07443853467702866, + "learning_rate": 9.93843826513338e-05, + "loss": 0.01762426272034645, + "num_input_tokens_seen": 35192024, + "step": 2149, + "train_runtime": 17709.6975, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.5955678670360111, + "grad_norm": 0.07382841408252716, + "learning_rate": 9.93836948825347e-05, + "loss": 0.015586127527058125, + "num_input_tokens_seen": 35208400, + "step": 2150, + "train_runtime": 17717.9068, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.5958448753462604, + "grad_norm": 0.09151880443096161, + "learning_rate": 9.938300673214423e-05, + "loss": 0.01839885115623474, + "num_input_tokens_seen": 35224776, + "step": 2151, + "train_runtime": 17726.1293, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.5961218836565096, + "grad_norm": 0.08883056789636612, + "learning_rate": 9.93823182001677e-05, + "loss": 0.017701406031847, + "num_input_tokens_seen": 35241152, + "step": 2152, + "train_runtime": 17734.3569, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.596398891966759, + "grad_norm": 0.12391877919435501, + "learning_rate": 9.938162928661047e-05, + "loss": 0.01628074422478676, + "num_input_tokens_seen": 35257528, + "step": 2153, + "train_runtime": 17742.57, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 0.5966759002770083, + "grad_norm": 0.05106211081147194, + "learning_rate": 9.938093999147784e-05, + "loss": 0.01530489232391119, + "num_input_tokens_seen": 35273904, + "step": 2154, + "train_runtime": 17750.7819, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.5969529085872576, + "grad_norm": 0.0682961642742157, + "learning_rate": 9.938025031477512e-05, + "loss": 0.015724465250968933, + "num_input_tokens_seen": 35290280, + "step": 2155, + "train_runtime": 17758.9873, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.5972299168975069, + "grad_norm": 0.10135135054588318, + "learning_rate": 9.937956025650768e-05, + "loss": 0.01645495742559433, + "num_input_tokens_seen": 35306656, + "step": 2156, + "train_runtime": 17767.1978, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.5975069252077563, + "grad_norm": 0.08373355120420456, + "learning_rate": 9.937886981668081e-05, + "loss": 0.01802564412355423, + "num_input_tokens_seen": 35323032, + "step": 2157, + "train_runtime": 17775.4041, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.5977839335180055, + "grad_norm": 0.09520959109067917, + "learning_rate": 9.937817899529986e-05, + "loss": 0.012276925146579742, + "num_input_tokens_seen": 35339408, + "step": 2158, + "train_runtime": 17783.6146, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.5980609418282549, + "grad_norm": 0.06354434043169022, + "learning_rate": 9.93774877923702e-05, + "loss": 0.01655401475727558, + "num_input_tokens_seen": 35355784, + "step": 2159, + "train_runtime": 17791.8216, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.5983379501385041, + "grad_norm": 0.0981563851237297, + "learning_rate": 9.937679620789712e-05, + "loss": 0.015978194773197174, + "num_input_tokens_seen": 35372160, + "step": 2160, + "train_runtime": 17800.0349, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.5986149584487535, + "grad_norm": 0.06168895214796066, + "learning_rate": 9.937610424188599e-05, + "loss": 0.01356567069888115, + "num_input_tokens_seen": 35388536, + "step": 2161, + "train_runtime": 17808.2438, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.5988919667590028, + "grad_norm": 0.12315115332603455, + "learning_rate": 9.937541189434215e-05, + "loss": 0.01673678494989872, + "num_input_tokens_seen": 35404912, + "step": 2162, + "train_runtime": 17816.4567, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.599168975069252, + "grad_norm": 0.049011629074811935, + "learning_rate": 9.937471916527096e-05, + "loss": 0.013037977740168571, + "num_input_tokens_seen": 35421288, + "step": 2163, + "train_runtime": 17824.6713, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.5994459833795014, + "grad_norm": 0.07606693357229233, + "learning_rate": 9.937402605467776e-05, + "loss": 0.017334073781967163, + "num_input_tokens_seen": 35437664, + "step": 2164, + "train_runtime": 17832.8955, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.5997229916897507, + "grad_norm": 0.097477488219738, + "learning_rate": 9.937333256256791e-05, + "loss": 0.018788520246744156, + "num_input_tokens_seen": 35454040, + "step": 2165, + "train_runtime": 17841.1278, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.6, + "grad_norm": 0.08818960934877396, + "learning_rate": 9.937263868894678e-05, + "loss": 0.015571302734315395, + "num_input_tokens_seen": 35470416, + "step": 2166, + "train_runtime": 17849.3582, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.6002770083102493, + "grad_norm": 0.08229344338178635, + "learning_rate": 9.937194443381972e-05, + "loss": 0.017032012343406677, + "num_input_tokens_seen": 35486792, + "step": 2167, + "train_runtime": 17857.5876, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 0.6005540166204986, + "grad_norm": 0.08286096900701523, + "learning_rate": 9.93712497971921e-05, + "loss": 0.010395605117082596, + "num_input_tokens_seen": 35503168, + "step": 2168, + "train_runtime": 17865.8119, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.6008310249307479, + "grad_norm": 0.1042458564043045, + "learning_rate": 9.937055477906927e-05, + "loss": 0.022007502615451813, + "num_input_tokens_seen": 35519544, + "step": 2169, + "train_runtime": 17874.0326, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 0.6011080332409973, + "grad_norm": 0.10747610777616501, + "learning_rate": 9.936985937945663e-05, + "loss": 0.01658964902162552, + "num_input_tokens_seen": 35535920, + "step": 2170, + "train_runtime": 17882.2569, + "train_tokens_per_second": 1987.217 + }, + { + "epoch": 0.6013850415512465, + "grad_norm": 0.07120274007320404, + "learning_rate": 9.936916359835953e-05, + "loss": 0.014869150705635548, + "num_input_tokens_seen": 35552296, + "step": 2171, + "train_runtime": 17890.4784, + "train_tokens_per_second": 1987.219 + }, + { + "epoch": 0.6016620498614959, + "grad_norm": 0.07167871296405792, + "learning_rate": 9.936846743578336e-05, + "loss": 0.015259330160915852, + "num_input_tokens_seen": 35568672, + "step": 2172, + "train_runtime": 17898.7049, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 0.6019390581717452, + "grad_norm": 0.09186062961816788, + "learning_rate": 9.936777089173348e-05, + "loss": 0.018542001023888588, + "num_input_tokens_seen": 35585048, + "step": 2173, + "train_runtime": 17906.9253, + "train_tokens_per_second": 1987.223 + }, + { + "epoch": 0.6022160664819944, + "grad_norm": 0.057892411947250366, + "learning_rate": 9.93670739662153e-05, + "loss": 0.0167789775878191, + "num_input_tokens_seen": 35601424, + "step": 2174, + "train_runtime": 17915.1482, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 0.6024930747922438, + "grad_norm": 0.3230501711368561, + "learning_rate": 9.936637665923418e-05, + "loss": 0.016659829765558243, + "num_input_tokens_seen": 35617800, + "step": 2175, + "train_runtime": 17923.3686, + "train_tokens_per_second": 1987.227 + }, + { + "epoch": 0.602770083102493, + "grad_norm": 0.08701565861701965, + "learning_rate": 9.936567897079554e-05, + "loss": 0.01554142590612173, + "num_input_tokens_seen": 35634176, + "step": 2176, + "train_runtime": 17931.5915, + "train_tokens_per_second": 1987.229 + }, + { + "epoch": 0.6030470914127424, + "grad_norm": 0.05846947804093361, + "learning_rate": 9.936498090090474e-05, + "loss": 0.014291154220700264, + "num_input_tokens_seen": 35650552, + "step": 2177, + "train_runtime": 17939.8184, + "train_tokens_per_second": 1987.23 + }, + { + "epoch": 0.6033240997229917, + "grad_norm": 0.08242463320493698, + "learning_rate": 9.936428244956717e-05, + "loss": 0.017125431448221207, + "num_input_tokens_seen": 35666928, + "step": 2178, + "train_runtime": 17948.045, + "train_tokens_per_second": 1987.232 + }, + { + "epoch": 0.603601108033241, + "grad_norm": 0.042663365602493286, + "learning_rate": 9.936358361678826e-05, + "loss": 0.013502622954547405, + "num_input_tokens_seen": 35683304, + "step": 2179, + "train_runtime": 17956.2683, + "train_tokens_per_second": 1987.234 + }, + { + "epoch": 0.6038781163434903, + "grad_norm": 0.08072412759065628, + "learning_rate": 9.936288440257338e-05, + "loss": 0.017050372436642647, + "num_input_tokens_seen": 35699680, + "step": 2180, + "train_runtime": 17964.4868, + "train_tokens_per_second": 1987.236 + }, + { + "epoch": 0.6041551246537397, + "grad_norm": 0.08167660981416702, + "learning_rate": 9.936218480692794e-05, + "loss": 0.016144171357154846, + "num_input_tokens_seen": 35716056, + "step": 2181, + "train_runtime": 17972.712, + "train_tokens_per_second": 1987.238 + }, + { + "epoch": 0.6044321329639889, + "grad_norm": 0.08994349837303162, + "learning_rate": 9.936148482985736e-05, + "loss": 0.017972050234675407, + "num_input_tokens_seen": 35732432, + "step": 2182, + "train_runtime": 17980.9307, + "train_tokens_per_second": 1987.24 + }, + { + "epoch": 0.6047091412742382, + "grad_norm": 0.08929406106472015, + "learning_rate": 9.936078447136703e-05, + "loss": 0.015731584280729294, + "num_input_tokens_seen": 35748808, + "step": 2183, + "train_runtime": 17989.1585, + "train_tokens_per_second": 1987.242 + }, + { + "epoch": 0.6049861495844875, + "grad_norm": 0.07365606725215912, + "learning_rate": 9.936008373146237e-05, + "loss": 0.013847388327121735, + "num_input_tokens_seen": 35765184, + "step": 2184, + "train_runtime": 17997.385, + "train_tokens_per_second": 1987.243 + }, + { + "epoch": 0.6052631578947368, + "grad_norm": 0.06892859190702438, + "learning_rate": 9.935938261014881e-05, + "loss": 0.014669876545667648, + "num_input_tokens_seen": 35781560, + "step": 2185, + "train_runtime": 18005.6038, + "train_tokens_per_second": 1987.246 + }, + { + "epoch": 0.6055401662049862, + "grad_norm": 0.05776514112949371, + "learning_rate": 9.935868110743173e-05, + "loss": 0.013526909053325653, + "num_input_tokens_seen": 35797936, + "step": 2186, + "train_runtime": 18013.823, + "train_tokens_per_second": 1987.248 + }, + { + "epoch": 0.6058171745152354, + "grad_norm": 0.1104588583111763, + "learning_rate": 9.93579792233166e-05, + "loss": 0.017843514680862427, + "num_input_tokens_seen": 35814312, + "step": 2187, + "train_runtime": 18022.0342, + "train_tokens_per_second": 1987.251 + }, + { + "epoch": 0.6060941828254848, + "grad_norm": 0.09247991442680359, + "learning_rate": 9.935727695780881e-05, + "loss": 0.023176247254014015, + "num_input_tokens_seen": 35830688, + "step": 2188, + "train_runtime": 18030.2485, + "train_tokens_per_second": 1987.254 + }, + { + "epoch": 0.6063711911357341, + "grad_norm": 0.06866825371980667, + "learning_rate": 9.935657431091378e-05, + "loss": 0.015991894528269768, + "num_input_tokens_seen": 35847064, + "step": 2189, + "train_runtime": 18038.475, + "train_tokens_per_second": 1987.256 + }, + { + "epoch": 0.6066481994459834, + "grad_norm": 0.053704459220170975, + "learning_rate": 9.935587128263697e-05, + "loss": 0.017589982599020004, + "num_input_tokens_seen": 35863440, + "step": 2190, + "train_runtime": 18046.6974, + "train_tokens_per_second": 1987.258 + }, + { + "epoch": 0.6069252077562327, + "grad_norm": 0.09982097893953323, + "learning_rate": 9.935516787298378e-05, + "loss": 0.013691702857613564, + "num_input_tokens_seen": 35879816, + "step": 2191, + "train_runtime": 18054.9211, + "train_tokens_per_second": 1987.26 + }, + { + "epoch": 0.607202216066482, + "grad_norm": 0.07779620587825775, + "learning_rate": 9.935446408195967e-05, + "loss": 0.014904849231243134, + "num_input_tokens_seen": 35896192, + "step": 2192, + "train_runtime": 18063.1593, + "train_tokens_per_second": 1987.26 + }, + { + "epoch": 0.6074792243767313, + "grad_norm": 0.1353635936975479, + "learning_rate": 9.935375990957005e-05, + "loss": 0.019513092935085297, + "num_input_tokens_seen": 35912568, + "step": 2193, + "train_runtime": 18071.4024, + "train_tokens_per_second": 1987.26 + }, + { + "epoch": 0.6077562326869806, + "grad_norm": 0.08569759130477905, + "learning_rate": 9.93530553558204e-05, + "loss": 0.015894342213869095, + "num_input_tokens_seen": 35928944, + "step": 2194, + "train_runtime": 18079.6275, + "train_tokens_per_second": 1987.261 + }, + { + "epoch": 0.6080332409972299, + "grad_norm": 0.060006991028785706, + "learning_rate": 9.935235042071613e-05, + "loss": 0.012772277928888798, + "num_input_tokens_seen": 35945320, + "step": 2195, + "train_runtime": 18087.8566, + "train_tokens_per_second": 1987.263 + }, + { + "epoch": 0.6083102493074792, + "grad_norm": 0.06452345848083496, + "learning_rate": 9.935164510426271e-05, + "loss": 0.015256601385772228, + "num_input_tokens_seen": 35961696, + "step": 2196, + "train_runtime": 18096.0755, + "train_tokens_per_second": 1987.265 + }, + { + "epoch": 0.6085872576177286, + "grad_norm": 0.0774771198630333, + "learning_rate": 9.935093940646558e-05, + "loss": 0.013259366154670715, + "num_input_tokens_seen": 35978072, + "step": 2197, + "train_runtime": 18104.2835, + "train_tokens_per_second": 1987.268 + }, + { + "epoch": 0.6088642659279778, + "grad_norm": 0.07224270701408386, + "learning_rate": 9.93502333273302e-05, + "loss": 0.014552820473909378, + "num_input_tokens_seen": 35994448, + "step": 2198, + "train_runtime": 18112.4963, + "train_tokens_per_second": 1987.272 + }, + { + "epoch": 0.6091412742382272, + "grad_norm": 0.10365764051675797, + "learning_rate": 9.934952686686201e-05, + "loss": 0.016368234530091286, + "num_input_tokens_seen": 36010824, + "step": 2199, + "train_runtime": 18120.7194, + "train_tokens_per_second": 1987.273 + }, + { + "epoch": 0.6094182825484764, + "grad_norm": 0.07692506909370422, + "learning_rate": 9.93488200250665e-05, + "loss": 0.015429836697876453, + "num_input_tokens_seen": 36027200, + "step": 2200, + "train_runtime": 18128.934, + "train_tokens_per_second": 1987.276 + }, + { + "epoch": 0.6096952908587258, + "grad_norm": 0.12364072352647781, + "learning_rate": 9.934811280194908e-05, + "loss": 0.016636859625577927, + "num_input_tokens_seen": 36043576, + "step": 2201, + "train_runtime": 18139.101, + "train_tokens_per_second": 1987.065 + }, + { + "epoch": 0.6099722991689751, + "grad_norm": 0.0677676573395729, + "learning_rate": 9.934740519751525e-05, + "loss": 0.015273338183760643, + "num_input_tokens_seen": 36059952, + "step": 2202, + "train_runtime": 18147.3199, + "train_tokens_per_second": 1987.068 + }, + { + "epoch": 0.6102493074792243, + "grad_norm": 0.07892414182424545, + "learning_rate": 9.934669721177048e-05, + "loss": 0.014748954214155674, + "num_input_tokens_seen": 36076328, + "step": 2203, + "train_runtime": 18155.5414, + "train_tokens_per_second": 1987.07 + }, + { + "epoch": 0.6105263157894737, + "grad_norm": 0.07255963981151581, + "learning_rate": 9.934598884472024e-05, + "loss": 0.015295884571969509, + "num_input_tokens_seen": 36092704, + "step": 2204, + "train_runtime": 18163.7681, + "train_tokens_per_second": 1987.071 + }, + { + "epoch": 0.610803324099723, + "grad_norm": 0.06428027898073196, + "learning_rate": 9.934528009636999e-05, + "loss": 0.01627238839864731, + "num_input_tokens_seen": 36109080, + "step": 2205, + "train_runtime": 18171.9818, + "train_tokens_per_second": 1987.074 + }, + { + "epoch": 0.6110803324099723, + "grad_norm": 0.05702930688858032, + "learning_rate": 9.934457096672522e-05, + "loss": 0.010966709814965725, + "num_input_tokens_seen": 36125456, + "step": 2206, + "train_runtime": 18180.1933, + "train_tokens_per_second": 1987.078 + }, + { + "epoch": 0.6113573407202216, + "grad_norm": 0.09972251951694489, + "learning_rate": 9.93438614557914e-05, + "loss": 0.01818973198533058, + "num_input_tokens_seen": 36141832, + "step": 2207, + "train_runtime": 18188.4159, + "train_tokens_per_second": 1987.08 + }, + { + "epoch": 0.6116343490304709, + "grad_norm": 0.08622497320175171, + "learning_rate": 9.934315156357402e-05, + "loss": 0.017875781282782555, + "num_input_tokens_seen": 36158208, + "step": 2208, + "train_runtime": 18196.6431, + "train_tokens_per_second": 1987.081 + }, + { + "epoch": 0.6119113573407202, + "grad_norm": 0.09230081737041473, + "learning_rate": 9.934244129007855e-05, + "loss": 0.016219420358538628, + "num_input_tokens_seen": 36174584, + "step": 2209, + "train_runtime": 18204.866, + "train_tokens_per_second": 1987.083 + }, + { + "epoch": 0.6121883656509696, + "grad_norm": 0.09193071722984314, + "learning_rate": 9.93417306353105e-05, + "loss": 0.01655418798327446, + "num_input_tokens_seen": 36190960, + "step": 2210, + "train_runtime": 18213.0872, + "train_tokens_per_second": 1987.085 + }, + { + "epoch": 0.6124653739612188, + "grad_norm": 0.06119684875011444, + "learning_rate": 9.934101959927534e-05, + "loss": 0.014277486130595207, + "num_input_tokens_seen": 36207336, + "step": 2211, + "train_runtime": 18221.3062, + "train_tokens_per_second": 1987.088 + }, + { + "epoch": 0.6127423822714682, + "grad_norm": 0.05864091217517853, + "learning_rate": 9.934030818197857e-05, + "loss": 0.01471198070794344, + "num_input_tokens_seen": 36223712, + "step": 2212, + "train_runtime": 18229.5453, + "train_tokens_per_second": 1987.088 + }, + { + "epoch": 0.6130193905817175, + "grad_norm": 0.07907599210739136, + "learning_rate": 9.933959638342571e-05, + "loss": 0.016819076612591743, + "num_input_tokens_seen": 36240088, + "step": 2213, + "train_runtime": 18237.7793, + "train_tokens_per_second": 1987.089 + }, + { + "epoch": 0.6132963988919667, + "grad_norm": 0.11411405354738235, + "learning_rate": 9.933888420362226e-05, + "loss": 0.015446207486093044, + "num_input_tokens_seen": 36256464, + "step": 2214, + "train_runtime": 18246.0083, + "train_tokens_per_second": 1987.09 + }, + { + "epoch": 0.6135734072022161, + "grad_norm": 0.10317786782979965, + "learning_rate": 9.933817164257367e-05, + "loss": 0.015380132012069225, + "num_input_tokens_seen": 36272840, + "step": 2215, + "train_runtime": 18254.2418, + "train_tokens_per_second": 1987.091 + }, + { + "epoch": 0.6138504155124653, + "grad_norm": 0.0750286728143692, + "learning_rate": 9.933745870028548e-05, + "loss": 0.017121048644185066, + "num_input_tokens_seen": 36289216, + "step": 2216, + "train_runtime": 18262.4783, + "train_tokens_per_second": 1987.092 + }, + { + "epoch": 0.6141274238227147, + "grad_norm": 0.07031214237213135, + "learning_rate": 9.933674537676321e-05, + "loss": 0.018248137086629868, + "num_input_tokens_seen": 36305592, + "step": 2217, + "train_runtime": 18270.7041, + "train_tokens_per_second": 1987.093 + }, + { + "epoch": 0.614404432132964, + "grad_norm": 0.07243289798498154, + "learning_rate": 9.933603167201237e-05, + "loss": 0.014233789406716824, + "num_input_tokens_seen": 36321968, + "step": 2218, + "train_runtime": 18278.9328, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.6146814404432133, + "grad_norm": 0.09764569997787476, + "learning_rate": 9.933531758603846e-05, + "loss": 0.012348017655313015, + "num_input_tokens_seen": 36338344, + "step": 2219, + "train_runtime": 18287.1572, + "train_tokens_per_second": 1987.096 + }, + { + "epoch": 0.6149584487534626, + "grad_norm": 0.0620412603020668, + "learning_rate": 9.933460311884701e-05, + "loss": 0.012826143763959408, + "num_input_tokens_seen": 36354720, + "step": 2220, + "train_runtime": 18295.372, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.615235457063712, + "grad_norm": 0.08343780785799026, + "learning_rate": 9.933388827044355e-05, + "loss": 0.016140980646014214, + "num_input_tokens_seen": 36371096, + "step": 2221, + "train_runtime": 18303.5865, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.6155124653739612, + "grad_norm": 0.09128685295581818, + "learning_rate": 9.933317304083357e-05, + "loss": 0.016229623928666115, + "num_input_tokens_seen": 36387472, + "step": 2222, + "train_runtime": 18311.7992, + "train_tokens_per_second": 1987.105 + }, + { + "epoch": 0.6157894736842106, + "grad_norm": 0.06460173428058624, + "learning_rate": 9.933245743002262e-05, + "loss": 0.014480577781796455, + "num_input_tokens_seen": 36403848, + "step": 2223, + "train_runtime": 18320.0112, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.6160664819944598, + "grad_norm": 0.0539022758603096, + "learning_rate": 9.933174143801621e-05, + "loss": 0.013605144806206226, + "num_input_tokens_seen": 36420224, + "step": 2224, + "train_runtime": 18328.2318, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.6163434903047091, + "grad_norm": 0.08540330827236176, + "learning_rate": 9.93310250648199e-05, + "loss": 0.015964874997735023, + "num_input_tokens_seen": 36436600, + "step": 2225, + "train_runtime": 18336.4606, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.6166204986149585, + "grad_norm": 0.0790826678276062, + "learning_rate": 9.933030831043923e-05, + "loss": 0.01387317106127739, + "num_input_tokens_seen": 36452976, + "step": 2226, + "train_runtime": 18344.692, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.6168975069252077, + "grad_norm": 0.09122329205274582, + "learning_rate": 9.93295911748797e-05, + "loss": 0.01933823525905609, + "num_input_tokens_seen": 36469352, + "step": 2227, + "train_runtime": 18352.9218, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.6171745152354571, + "grad_norm": 0.09946245700120926, + "learning_rate": 9.932887365814688e-05, + "loss": 0.01546566653996706, + "num_input_tokens_seen": 36485728, + "step": 2228, + "train_runtime": 18361.1588, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.6174515235457064, + "grad_norm": 0.09303653985261917, + "learning_rate": 9.932815576024632e-05, + "loss": 0.017490819096565247, + "num_input_tokens_seen": 36502104, + "step": 2229, + "train_runtime": 18369.3926, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.6177285318559557, + "grad_norm": 0.0902031660079956, + "learning_rate": 9.932743748118354e-05, + "loss": 0.016192276030778885, + "num_input_tokens_seen": 36518480, + "step": 2230, + "train_runtime": 18377.6153, + "train_tokens_per_second": 1987.117 + }, + { + "epoch": 0.618005540166205, + "grad_norm": 0.07139287889003754, + "learning_rate": 9.932671882096409e-05, + "loss": 0.011705871671438217, + "num_input_tokens_seen": 36534856, + "step": 2231, + "train_runtime": 18385.8276, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.6182825484764543, + "grad_norm": 0.07938949018716812, + "learning_rate": 9.932599977959356e-05, + "loss": 0.014055703766644001, + "num_input_tokens_seen": 36551232, + "step": 2232, + "train_runtime": 18394.0461, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.6185595567867036, + "grad_norm": 0.06091950461268425, + "learning_rate": 9.93252803570775e-05, + "loss": 0.0179049801081419, + "num_input_tokens_seen": 36567608, + "step": 2233, + "train_runtime": 18402.272, + "train_tokens_per_second": 1987.125 + }, + { + "epoch": 0.618836565096953, + "grad_norm": 0.08532632142305374, + "learning_rate": 9.932456055342142e-05, + "loss": 0.018907945603132248, + "num_input_tokens_seen": 36583984, + "step": 2234, + "train_runtime": 18410.5059, + "train_tokens_per_second": 1987.125 + }, + { + "epoch": 0.6191135734072022, + "grad_norm": 0.06822917610406876, + "learning_rate": 9.932384036863094e-05, + "loss": 0.016019968315958977, + "num_input_tokens_seen": 36600360, + "step": 2235, + "train_runtime": 18418.7328, + "train_tokens_per_second": 1987.127 + }, + { + "epoch": 0.6193905817174515, + "grad_norm": 0.10850462317466736, + "learning_rate": 9.932311980271159e-05, + "loss": 0.016822075471282005, + "num_input_tokens_seen": 36616736, + "step": 2236, + "train_runtime": 18426.9569, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.6196675900277008, + "grad_norm": 0.08018793910741806, + "learning_rate": 9.932239885566895e-05, + "loss": 0.013833295553922653, + "num_input_tokens_seen": 36633112, + "step": 2237, + "train_runtime": 18435.1805, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 0.6199445983379501, + "grad_norm": 0.10587601363658905, + "learning_rate": 9.932167752750858e-05, + "loss": 0.01596056856215, + "num_input_tokens_seen": 36649488, + "step": 2238, + "train_runtime": 18443.4049, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.6202216066481995, + "grad_norm": 0.043902523815631866, + "learning_rate": 9.932095581823606e-05, + "loss": 0.011371411383152008, + "num_input_tokens_seen": 36665864, + "step": 2239, + "train_runtime": 18451.6297, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.6204986149584487, + "grad_norm": 0.06731446087360382, + "learning_rate": 9.932023372785698e-05, + "loss": 0.015917237848043442, + "num_input_tokens_seen": 36682240, + "step": 2240, + "train_runtime": 18459.8596, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.6207756232686981, + "grad_norm": 0.067887082695961, + "learning_rate": 9.93195112563769e-05, + "loss": 0.01415182277560234, + "num_input_tokens_seen": 36698616, + "step": 2241, + "train_runtime": 18468.0862, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.6210526315789474, + "grad_norm": 0.07491838186979294, + "learning_rate": 9.931878840380142e-05, + "loss": 0.01699545979499817, + "num_input_tokens_seen": 36714992, + "step": 2242, + "train_runtime": 18476.3208, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.6213296398891966, + "grad_norm": 0.0716409757733345, + "learning_rate": 9.931806517013612e-05, + "loss": 0.01369576808065176, + "num_input_tokens_seen": 36731368, + "step": 2243, + "train_runtime": 18484.561, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.621606648199446, + "grad_norm": 0.12608838081359863, + "learning_rate": 9.931734155538659e-05, + "loss": 0.02329796925187111, + "num_input_tokens_seen": 36747744, + "step": 2244, + "train_runtime": 18492.789, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.6218836565096952, + "grad_norm": 0.07940283417701721, + "learning_rate": 9.93166175595584e-05, + "loss": 0.01575656794011593, + "num_input_tokens_seen": 36764120, + "step": 2245, + "train_runtime": 18501.0255, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.6221606648199446, + "grad_norm": 0.07854501157999039, + "learning_rate": 9.931589318265717e-05, + "loss": 0.015226101502776146, + "num_input_tokens_seen": 36780496, + "step": 2246, + "train_runtime": 18509.2429, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.6224376731301939, + "grad_norm": 0.07686841487884521, + "learning_rate": 9.931516842468848e-05, + "loss": 0.014962132088840008, + "num_input_tokens_seen": 36796872, + "step": 2247, + "train_runtime": 18517.4568, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.6227146814404432, + "grad_norm": 0.08178262412548065, + "learning_rate": 9.931444328565795e-05, + "loss": 0.01513838954269886, + "num_input_tokens_seen": 36813248, + "step": 2248, + "train_runtime": 18525.6715, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.6229916897506925, + "grad_norm": 0.10193391889333725, + "learning_rate": 9.931371776557118e-05, + "loss": 0.018587525933980942, + "num_input_tokens_seen": 36829624, + "step": 2249, + "train_runtime": 18533.8993, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 0.6232686980609419, + "grad_norm": 0.07410042732954025, + "learning_rate": 9.931299186443375e-05, + "loss": 0.013374735601246357, + "num_input_tokens_seen": 36846000, + "step": 2250, + "train_runtime": 18542.1257, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.6235457063711911, + "grad_norm": 0.06876391917467117, + "learning_rate": 9.93122655822513e-05, + "loss": 0.014169575646519661, + "num_input_tokens_seen": 36862376, + "step": 2251, + "train_runtime": 18550.4038, + "train_tokens_per_second": 1987.147 + }, + { + "epoch": 0.6238227146814405, + "grad_norm": 0.09852499514818192, + "learning_rate": 9.931153891902942e-05, + "loss": 0.015418807044625282, + "num_input_tokens_seen": 36878752, + "step": 2252, + "train_runtime": 18558.6368, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.6240997229916897, + "grad_norm": 0.09990391135215759, + "learning_rate": 9.931081187477375e-05, + "loss": 0.01736169122159481, + "num_input_tokens_seen": 36895128, + "step": 2253, + "train_runtime": 18566.8569, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.624376731301939, + "grad_norm": 0.06732690334320068, + "learning_rate": 9.931008444948988e-05, + "loss": 0.014748628251254559, + "num_input_tokens_seen": 36911504, + "step": 2254, + "train_runtime": 18575.0848, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.6246537396121884, + "grad_norm": 0.08119776099920273, + "learning_rate": 9.930935664318343e-05, + "loss": 0.014692639000713825, + "num_input_tokens_seen": 36927880, + "step": 2255, + "train_runtime": 18583.3196, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.6249307479224376, + "grad_norm": 0.11127012968063354, + "learning_rate": 9.930862845586007e-05, + "loss": 0.01755707710981369, + "num_input_tokens_seen": 36944256, + "step": 2256, + "train_runtime": 18591.5473, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.625207756232687, + "grad_norm": 0.08838969469070435, + "learning_rate": 9.930789988752537e-05, + "loss": 0.014997825026512146, + "num_input_tokens_seen": 36960632, + "step": 2257, + "train_runtime": 18599.7731, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.6254847645429363, + "grad_norm": 0.09436649084091187, + "learning_rate": 9.930717093818498e-05, + "loss": 0.018886003643274307, + "num_input_tokens_seen": 36977008, + "step": 2258, + "train_runtime": 18607.9932, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.6257617728531856, + "grad_norm": 0.05544178560376167, + "learning_rate": 9.930644160784455e-05, + "loss": 0.012907825410366058, + "num_input_tokens_seen": 36993384, + "step": 2259, + "train_runtime": 18616.2118, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.6260387811634349, + "grad_norm": 0.09430685639381409, + "learning_rate": 9.93057118965097e-05, + "loss": 0.017565058544278145, + "num_input_tokens_seen": 37009760, + "step": 2260, + "train_runtime": 18624.4243, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.6263157894736842, + "grad_norm": 0.05366181954741478, + "learning_rate": 9.930498180418606e-05, + "loss": 0.012894481420516968, + "num_input_tokens_seen": 37026136, + "step": 2261, + "train_runtime": 18632.6299, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 0.6265927977839335, + "grad_norm": 0.07981177419424057, + "learning_rate": 9.930425133087928e-05, + "loss": 0.013992700725793839, + "num_input_tokens_seen": 37042512, + "step": 2262, + "train_runtime": 18640.8435, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 0.6268698060941829, + "grad_norm": 0.06609068810939789, + "learning_rate": 9.930352047659503e-05, + "loss": 0.01451890915632248, + "num_input_tokens_seen": 37058888, + "step": 2263, + "train_runtime": 18649.0649, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 0.6271468144044321, + "grad_norm": 0.047599922865629196, + "learning_rate": 9.930278924133891e-05, + "loss": 0.014524778351187706, + "num_input_tokens_seen": 37075264, + "step": 2264, + "train_runtime": 18657.2958, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 0.6274238227146814, + "grad_norm": 0.04987109825015068, + "learning_rate": 9.93020576251166e-05, + "loss": 0.015286060981452465, + "num_input_tokens_seen": 37091640, + "step": 2265, + "train_runtime": 18665.531, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.6277008310249308, + "grad_norm": 0.09223100543022156, + "learning_rate": 9.930132562793375e-05, + "loss": 0.018712984398007393, + "num_input_tokens_seen": 37108016, + "step": 2266, + "train_runtime": 18673.7598, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.62797783933518, + "grad_norm": 0.08452875912189484, + "learning_rate": 9.930059324979601e-05, + "loss": 0.016695793718099594, + "num_input_tokens_seen": 37124392, + "step": 2267, + "train_runtime": 18681.9914, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.6282548476454294, + "grad_norm": 0.07250675559043884, + "learning_rate": 9.929986049070905e-05, + "loss": 0.0170960184186697, + "num_input_tokens_seen": 37140768, + "step": 2268, + "train_runtime": 18690.2171, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 0.6285318559556786, + "grad_norm": 0.07805902510881424, + "learning_rate": 9.929912735067852e-05, + "loss": 0.01619689166545868, + "num_input_tokens_seen": 37157144, + "step": 2269, + "train_runtime": 18698.4251, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 0.628808864265928, + "grad_norm": 0.09094987064599991, + "learning_rate": 9.929839382971008e-05, + "loss": 0.01494566723704338, + "num_input_tokens_seen": 37173520, + "step": 2270, + "train_runtime": 18706.6328, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 0.6290858725761773, + "grad_norm": 0.08418678492307663, + "learning_rate": 9.929765992780942e-05, + "loss": 0.012249733321368694, + "num_input_tokens_seen": 37189896, + "step": 2271, + "train_runtime": 18714.841, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.6293628808864266, + "grad_norm": 0.07958530634641647, + "learning_rate": 9.929692564498219e-05, + "loss": 0.013934667222201824, + "num_input_tokens_seen": 37206272, + "step": 2272, + "train_runtime": 18723.0481, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.6296398891966759, + "grad_norm": 0.053358420729637146, + "learning_rate": 9.929619098123409e-05, + "loss": 0.013928474858403206, + "num_input_tokens_seen": 37222648, + "step": 2273, + "train_runtime": 18731.2621, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 0.6299168975069253, + "grad_norm": 0.08559989184141159, + "learning_rate": 9.929545593657074e-05, + "loss": 0.014965766109526157, + "num_input_tokens_seen": 37239024, + "step": 2274, + "train_runtime": 18739.4913, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.6301939058171745, + "grad_norm": 0.08385762572288513, + "learning_rate": 9.92947205109979e-05, + "loss": 0.012418823316693306, + "num_input_tokens_seen": 37255400, + "step": 2275, + "train_runtime": 18747.7221, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.6304709141274238, + "grad_norm": 0.07652783393859863, + "learning_rate": 9.929398470452118e-05, + "loss": 0.015083552338182926, + "num_input_tokens_seen": 37271776, + "step": 2276, + "train_runtime": 18755.9481, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.6307479224376731, + "grad_norm": 0.08129309862852097, + "learning_rate": 9.929324851714631e-05, + "loss": 0.011295096948742867, + "num_input_tokens_seen": 37288152, + "step": 2277, + "train_runtime": 18764.1949, + "train_tokens_per_second": 1987.197 + }, + { + "epoch": 0.6310249307479224, + "grad_norm": 0.08098883181810379, + "learning_rate": 9.929251194887898e-05, + "loss": 0.013306712731719017, + "num_input_tokens_seen": 37304528, + "step": 2278, + "train_runtime": 18772.4275, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.6313019390581718, + "grad_norm": 0.057320717722177505, + "learning_rate": 9.929177499972484e-05, + "loss": 0.01232556626200676, + "num_input_tokens_seen": 37320904, + "step": 2279, + "train_runtime": 18780.6582, + "train_tokens_per_second": 1987.199 + }, + { + "epoch": 0.631578947368421, + "grad_norm": 0.08917135000228882, + "learning_rate": 9.929103766968963e-05, + "loss": 0.014600366353988647, + "num_input_tokens_seen": 37337280, + "step": 2280, + "train_runtime": 18788.881, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.6318559556786704, + "grad_norm": 0.09413734078407288, + "learning_rate": 9.929029995877903e-05, + "loss": 0.016670797020196915, + "num_input_tokens_seen": 37353656, + "step": 2281, + "train_runtime": 18797.0966, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.6321329639889197, + "grad_norm": 0.10146411508321762, + "learning_rate": 9.928956186699873e-05, + "loss": 0.014510337263345718, + "num_input_tokens_seen": 37370032, + "step": 2282, + "train_runtime": 18805.3205, + "train_tokens_per_second": 1987.205 + }, + { + "epoch": 0.632409972299169, + "grad_norm": 0.08856786787509918, + "learning_rate": 9.928882339435446e-05, + "loss": 0.017323391512036324, + "num_input_tokens_seen": 37386408, + "step": 2283, + "train_runtime": 18813.5558, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.6326869806094183, + "grad_norm": 0.08109819144010544, + "learning_rate": 9.928808454085189e-05, + "loss": 0.012362709268927574, + "num_input_tokens_seen": 37402784, + "step": 2284, + "train_runtime": 18821.7908, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.6329639889196675, + "grad_norm": 0.05836997181177139, + "learning_rate": 9.928734530649675e-05, + "loss": 0.017328403890132904, + "num_input_tokens_seen": 37419160, + "step": 2285, + "train_runtime": 18830.02, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.6332409972299169, + "grad_norm": 0.06989683210849762, + "learning_rate": 9.928660569129475e-05, + "loss": 0.012980884872376919, + "num_input_tokens_seen": 37435536, + "step": 2286, + "train_runtime": 18838.2329, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 0.6335180055401662, + "grad_norm": 0.05454591289162636, + "learning_rate": 9.928586569525162e-05, + "loss": 0.015392155386507511, + "num_input_tokens_seen": 37451912, + "step": 2287, + "train_runtime": 18846.4574, + "train_tokens_per_second": 1987.212 + }, + { + "epoch": 0.6337950138504155, + "grad_norm": 0.08651084452867508, + "learning_rate": 9.928512531837305e-05, + "loss": 0.013871249742805958, + "num_input_tokens_seen": 37468288, + "step": 2288, + "train_runtime": 18854.6903, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.6340720221606648, + "grad_norm": 0.0857497826218605, + "learning_rate": 9.928438456066477e-05, + "loss": 0.014448794536292553, + "num_input_tokens_seen": 37484664, + "step": 2289, + "train_runtime": 18862.914, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 0.6343490304709142, + "grad_norm": 0.05455608665943146, + "learning_rate": 9.928364342213253e-05, + "loss": 0.012808405794203281, + "num_input_tokens_seen": 37501040, + "step": 2290, + "train_runtime": 18871.1258, + "train_tokens_per_second": 1987.218 + }, + { + "epoch": 0.6346260387811634, + "grad_norm": 0.0953046977519989, + "learning_rate": 9.928290190278201e-05, + "loss": 0.015571840107440948, + "num_input_tokens_seen": 37517416, + "step": 2291, + "train_runtime": 18879.3572, + "train_tokens_per_second": 1987.219 + }, + { + "epoch": 0.6349030470914128, + "grad_norm": 0.06781233847141266, + "learning_rate": 9.928216000261899e-05, + "loss": 0.009304466657340527, + "num_input_tokens_seen": 37533792, + "step": 2292, + "train_runtime": 18887.5909, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 0.635180055401662, + "grad_norm": 0.0751669779419899, + "learning_rate": 9.928141772164915e-05, + "loss": 0.01661655865609646, + "num_input_tokens_seen": 37550168, + "step": 2293, + "train_runtime": 18895.8182, + "train_tokens_per_second": 1987.221 + }, + { + "epoch": 0.6354570637119114, + "grad_norm": 0.06922679394483566, + "learning_rate": 9.928067505987826e-05, + "loss": 0.014155697077512741, + "num_input_tokens_seen": 37566544, + "step": 2294, + "train_runtime": 18904.0412, + "train_tokens_per_second": 1987.223 + }, + { + "epoch": 0.6357340720221607, + "grad_norm": 0.0739351138472557, + "learning_rate": 9.927993201731206e-05, + "loss": 0.014102019369602203, + "num_input_tokens_seen": 37582920, + "step": 2295, + "train_runtime": 18912.2682, + "train_tokens_per_second": 1987.224 + }, + { + "epoch": 0.6360110803324099, + "grad_norm": 0.07934904843568802, + "learning_rate": 9.927918859395628e-05, + "loss": 0.017440414056181908, + "num_input_tokens_seen": 37599296, + "step": 2296, + "train_runtime": 18920.495, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.6362880886426593, + "grad_norm": 0.07134152203798294, + "learning_rate": 9.927844478981667e-05, + "loss": 0.0169854573905468, + "num_input_tokens_seen": 37615672, + "step": 2297, + "train_runtime": 18928.714, + "train_tokens_per_second": 1987.228 + }, + { + "epoch": 0.6365650969529086, + "grad_norm": 0.0881340354681015, + "learning_rate": 9.927770060489897e-05, + "loss": 0.01651441864669323, + "num_input_tokens_seen": 37632048, + "step": 2298, + "train_runtime": 18936.9293, + "train_tokens_per_second": 1987.231 + }, + { + "epoch": 0.6368421052631579, + "grad_norm": 0.05554116517305374, + "learning_rate": 9.927695603920893e-05, + "loss": 0.014534235931932926, + "num_input_tokens_seen": 37648424, + "step": 2299, + "train_runtime": 18945.1379, + "train_tokens_per_second": 1987.234 + }, + { + "epoch": 0.6371191135734072, + "grad_norm": 0.10566544532775879, + "learning_rate": 9.927621109275233e-05, + "loss": 0.018640711903572083, + "num_input_tokens_seen": 37664800, + "step": 2300, + "train_runtime": 18953.3415, + "train_tokens_per_second": 1987.238 + }, + { + "epoch": 0.6373961218836565, + "grad_norm": 0.12053089588880539, + "learning_rate": 9.927546576553488e-05, + "loss": 0.015595117583870888, + "num_input_tokens_seen": 37681176, + "step": 2301, + "train_runtime": 18963.1777, + "train_tokens_per_second": 1987.071 + }, + { + "epoch": 0.6376731301939058, + "grad_norm": 0.07837548106908798, + "learning_rate": 9.927472005756238e-05, + "loss": 0.012434537522494793, + "num_input_tokens_seen": 37697552, + "step": 2302, + "train_runtime": 18971.3797, + "train_tokens_per_second": 1987.075 + }, + { + "epoch": 0.6379501385041552, + "grad_norm": 0.06169973313808441, + "learning_rate": 9.927397396884057e-05, + "loss": 0.01331840455532074, + "num_input_tokens_seen": 37713928, + "step": 2303, + "train_runtime": 18979.5978, + "train_tokens_per_second": 1987.077 + }, + { + "epoch": 0.6382271468144044, + "grad_norm": 0.09426392614841461, + "learning_rate": 9.927322749937522e-05, + "loss": 0.015343316830694675, + "num_input_tokens_seen": 37730304, + "step": 2304, + "train_runtime": 18987.8346, + "train_tokens_per_second": 1987.078 + }, + { + "epoch": 0.6385041551246537, + "grad_norm": 0.0697467178106308, + "learning_rate": 9.927248064917212e-05, + "loss": 0.013129360973834991, + "num_input_tokens_seen": 37746680, + "step": 2305, + "train_runtime": 18996.057, + "train_tokens_per_second": 1987.08 + }, + { + "epoch": 0.6387811634349031, + "grad_norm": 0.0823335200548172, + "learning_rate": 9.9271733418237e-05, + "loss": 0.015058435499668121, + "num_input_tokens_seen": 37763056, + "step": 2306, + "train_runtime": 19004.2759, + "train_tokens_per_second": 1987.082 + }, + { + "epoch": 0.6390581717451523, + "grad_norm": 0.05603457987308502, + "learning_rate": 9.927098580657566e-05, + "loss": 0.011922684498131275, + "num_input_tokens_seen": 37779432, + "step": 2307, + "train_runtime": 19012.4938, + "train_tokens_per_second": 1987.085 + }, + { + "epoch": 0.6393351800554017, + "grad_norm": 0.09820527583360672, + "learning_rate": 9.927023781419386e-05, + "loss": 0.01585111767053604, + "num_input_tokens_seen": 37795808, + "step": 2308, + "train_runtime": 19020.7162, + "train_tokens_per_second": 1987.086 + }, + { + "epoch": 0.6396121883656509, + "grad_norm": 0.07591965794563293, + "learning_rate": 9.92694894410974e-05, + "loss": 0.01627643033862114, + "num_input_tokens_seen": 37812184, + "step": 2309, + "train_runtime": 19028.9308, + "train_tokens_per_second": 1987.089 + }, + { + "epoch": 0.6398891966759003, + "grad_norm": 0.08136788755655289, + "learning_rate": 9.926874068729206e-05, + "loss": 0.013488716445863247, + "num_input_tokens_seen": 37828560, + "step": 2310, + "train_runtime": 19037.1486, + "train_tokens_per_second": 1987.092 + }, + { + "epoch": 0.6401662049861496, + "grad_norm": 0.08228448778390884, + "learning_rate": 9.926799155278362e-05, + "loss": 0.013084692880511284, + "num_input_tokens_seen": 37844936, + "step": 2311, + "train_runtime": 19045.3615, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.6404432132963989, + "grad_norm": 0.07741934806108475, + "learning_rate": 9.926724203757784e-05, + "loss": 0.014987883158028126, + "num_input_tokens_seen": 37861312, + "step": 2312, + "train_runtime": 19053.5829, + "train_tokens_per_second": 1987.097 + }, + { + "epoch": 0.6407202216066482, + "grad_norm": 0.05734337866306305, + "learning_rate": 9.926649214168057e-05, + "loss": 0.012392019852995872, + "num_input_tokens_seen": 37877688, + "step": 2313, + "train_runtime": 19061.8061, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.6409972299168976, + "grad_norm": 0.05560939013957977, + "learning_rate": 9.926574186509756e-05, + "loss": 0.012963881716132164, + "num_input_tokens_seen": 37894064, + "step": 2314, + "train_runtime": 19070.0303, + "train_tokens_per_second": 1987.1 + }, + { + "epoch": 0.6412742382271468, + "grad_norm": 0.07569949328899384, + "learning_rate": 9.926499120783463e-05, + "loss": 0.01367438118904829, + "num_input_tokens_seen": 37910440, + "step": 2315, + "train_runtime": 19078.2488, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.6415512465373961, + "grad_norm": 0.1263088583946228, + "learning_rate": 9.926424016989758e-05, + "loss": 0.016824908554553986, + "num_input_tokens_seen": 37926816, + "step": 2316, + "train_runtime": 19086.4665, + "train_tokens_per_second": 1987.105 + }, + { + "epoch": 0.6418282548476454, + "grad_norm": 0.10550690442323685, + "learning_rate": 9.926348875129218e-05, + "loss": 0.016789911314845085, + "num_input_tokens_seen": 37943192, + "step": 2317, + "train_runtime": 19094.6829, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.6421052631578947, + "grad_norm": 0.18597762286663055, + "learning_rate": 9.926273695202428e-05, + "loss": 0.017452774569392204, + "num_input_tokens_seen": 37959568, + "step": 2318, + "train_runtime": 19102.9102, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.6423822714681441, + "grad_norm": 0.06964525580406189, + "learning_rate": 9.926198477209966e-05, + "loss": 0.0144856758415699, + "num_input_tokens_seen": 37975944, + "step": 2319, + "train_runtime": 19111.1295, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.6426592797783933, + "grad_norm": 0.09196405857801437, + "learning_rate": 9.926123221152415e-05, + "loss": 0.0137498052790761, + "num_input_tokens_seen": 37992320, + "step": 2320, + "train_runtime": 19119.3646, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.6429362880886427, + "grad_norm": 0.11658690869808197, + "learning_rate": 9.926047927030355e-05, + "loss": 0.014972791075706482, + "num_input_tokens_seen": 38008696, + "step": 2321, + "train_runtime": 19127.5931, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.643213296398892, + "grad_norm": 0.07050884515047073, + "learning_rate": 9.92597259484437e-05, + "loss": 0.015382845886051655, + "num_input_tokens_seen": 38025072, + "step": 2322, + "train_runtime": 19135.8198, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.6434903047091413, + "grad_norm": 0.060021933168172836, + "learning_rate": 9.92589722459504e-05, + "loss": 0.01412512082606554, + "num_input_tokens_seen": 38041448, + "step": 2323, + "train_runtime": 19144.0577, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.6437673130193906, + "grad_norm": 0.10336330533027649, + "learning_rate": 9.925821816282948e-05, + "loss": 0.01745663583278656, + "num_input_tokens_seen": 38057824, + "step": 2324, + "train_runtime": 19152.2814, + "train_tokens_per_second": 1987.117 + }, + { + "epoch": 0.6440443213296398, + "grad_norm": 0.8055450916290283, + "learning_rate": 9.925746369908677e-05, + "loss": 0.014780683442950249, + "num_input_tokens_seen": 38074200, + "step": 2325, + "train_runtime": 19160.5067, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.6443213296398892, + "grad_norm": 0.24833595752716064, + "learning_rate": 9.925670885472809e-05, + "loss": 0.014092618599534035, + "num_input_tokens_seen": 38090576, + "step": 2326, + "train_runtime": 19168.7346, + "train_tokens_per_second": 1987.12 + }, + { + "epoch": 0.6445983379501385, + "grad_norm": 0.08371973782777786, + "learning_rate": 9.925595362975928e-05, + "loss": 0.016065135598182678, + "num_input_tokens_seen": 38106952, + "step": 2327, + "train_runtime": 19176.9621, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.6448753462603878, + "grad_norm": 0.07887957245111465, + "learning_rate": 9.925519802418618e-05, + "loss": 0.013729535043239594, + "num_input_tokens_seen": 38123328, + "step": 2328, + "train_runtime": 19185.191, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.6451523545706371, + "grad_norm": 0.08180635422468185, + "learning_rate": 9.925444203801463e-05, + "loss": 0.015407980419695377, + "num_input_tokens_seen": 38139704, + "step": 2329, + "train_runtime": 19193.4314, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.6454293628808865, + "grad_norm": 0.08259919285774231, + "learning_rate": 9.925368567125046e-05, + "loss": 0.013971359468996525, + "num_input_tokens_seen": 38156080, + "step": 2330, + "train_runtime": 19201.6596, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.6457063711911357, + "grad_norm": 0.057198092341423035, + "learning_rate": 9.925292892389953e-05, + "loss": 0.012414202094078064, + "num_input_tokens_seen": 38172456, + "step": 2331, + "train_runtime": 19209.8899, + "train_tokens_per_second": 1987.125 + }, + { + "epoch": 0.6459833795013851, + "grad_norm": 0.06418338418006897, + "learning_rate": 9.925217179596766e-05, + "loss": 0.015258543193340302, + "num_input_tokens_seen": 38188832, + "step": 2332, + "train_runtime": 19218.1204, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.6462603878116343, + "grad_norm": 0.06349740922451019, + "learning_rate": 9.925141428746073e-05, + "loss": 0.014363668859004974, + "num_input_tokens_seen": 38205208, + "step": 2333, + "train_runtime": 19226.3458, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.6465373961218837, + "grad_norm": 0.08106314390897751, + "learning_rate": 9.925065639838458e-05, + "loss": 0.015986790880560875, + "num_input_tokens_seen": 38221584, + "step": 2334, + "train_runtime": 19234.5706, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.646814404432133, + "grad_norm": 0.09276460111141205, + "learning_rate": 9.924989812874508e-05, + "loss": 0.015672339126467705, + "num_input_tokens_seen": 38237960, + "step": 2335, + "train_runtime": 19242.7955, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 0.6470914127423822, + "grad_norm": 0.0690341368317604, + "learning_rate": 9.924913947854805e-05, + "loss": 0.01610923558473587, + "num_input_tokens_seen": 38254336, + "step": 2336, + "train_runtime": 19251.0263, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.6473684210526316, + "grad_norm": 0.07801534235477448, + "learning_rate": 9.92483804477994e-05, + "loss": 0.017490576952695847, + "num_input_tokens_seen": 38270712, + "step": 2337, + "train_runtime": 19259.2574, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.6476454293628808, + "grad_norm": 0.10493207722902298, + "learning_rate": 9.924762103650497e-05, + "loss": 0.014039935544133186, + "num_input_tokens_seen": 38287088, + "step": 2338, + "train_runtime": 19267.465, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.6479224376731302, + "grad_norm": 0.06456614285707474, + "learning_rate": 9.924686124467062e-05, + "loss": 0.014914022758603096, + "num_input_tokens_seen": 38303464, + "step": 2339, + "train_runtime": 19275.6767, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.6481994459833795, + "grad_norm": 0.09107351303100586, + "learning_rate": 9.924610107230225e-05, + "loss": 0.014815768226981163, + "num_input_tokens_seen": 38319840, + "step": 2340, + "train_runtime": 19283.8979, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.6484764542936288, + "grad_norm": 0.06873004883527756, + "learning_rate": 9.924534051940571e-05, + "loss": 0.012361451052129269, + "num_input_tokens_seen": 38336216, + "step": 2341, + "train_runtime": 19292.1362, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.6487534626038781, + "grad_norm": 0.0923708975315094, + "learning_rate": 9.92445795859869e-05, + "loss": 0.01576896570622921, + "num_input_tokens_seen": 38352592, + "step": 2342, + "train_runtime": 19300.3674, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.6490304709141275, + "grad_norm": 0.10005738586187363, + "learning_rate": 9.924381827205166e-05, + "loss": 0.016976159065961838, + "num_input_tokens_seen": 38368968, + "step": 2343, + "train_runtime": 19308.5987, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.6493074792243767, + "grad_norm": 0.05379478260874748, + "learning_rate": 9.924305657760591e-05, + "loss": 0.015416833572089672, + "num_input_tokens_seen": 38385344, + "step": 2344, + "train_runtime": 19316.8258, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.649584487534626, + "grad_norm": 0.09580790251493454, + "learning_rate": 9.924229450265552e-05, + "loss": 0.017158566042780876, + "num_input_tokens_seen": 38401720, + "step": 2345, + "train_runtime": 19325.0605, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.6498614958448753, + "grad_norm": 0.06758984923362732, + "learning_rate": 9.924153204720639e-05, + "loss": 0.01501435600221157, + "num_input_tokens_seen": 38418096, + "step": 2346, + "train_runtime": 19333.2903, + "train_tokens_per_second": 1987.147 + }, + { + "epoch": 0.6501385041551246, + "grad_norm": 0.0884396955370903, + "learning_rate": 9.924076921126438e-05, + "loss": 0.01709570549428463, + "num_input_tokens_seen": 38434472, + "step": 2347, + "train_runtime": 19341.5217, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.650415512465374, + "grad_norm": 0.07403832674026489, + "learning_rate": 9.924000599483542e-05, + "loss": 0.016632119193673134, + "num_input_tokens_seen": 38450848, + "step": 2348, + "train_runtime": 19349.7451, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.6506925207756232, + "grad_norm": 0.08641830831766129, + "learning_rate": 9.92392423979254e-05, + "loss": 0.013431813567876816, + "num_input_tokens_seen": 38467224, + "step": 2349, + "train_runtime": 19357.9685, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.6509695290858726, + "grad_norm": 0.1194172278046608, + "learning_rate": 9.923847842054022e-05, + "loss": 0.0134870745241642, + "num_input_tokens_seen": 38483600, + "step": 2350, + "train_runtime": 19366.2056, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.6512465373961219, + "grad_norm": 0.062350522726774216, + "learning_rate": 9.923771406268576e-05, + "loss": 0.015231570228934288, + "num_input_tokens_seen": 38499976, + "step": 2351, + "train_runtime": 19374.4313, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.6515235457063712, + "grad_norm": 0.05562130734324455, + "learning_rate": 9.923694932436796e-05, + "loss": 0.015436230227351189, + "num_input_tokens_seen": 38516352, + "step": 2352, + "train_runtime": 19382.6409, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.6518005540166205, + "grad_norm": 0.10913991928100586, + "learning_rate": 9.923618420559268e-05, + "loss": 0.016925077885389328, + "num_input_tokens_seen": 38532728, + "step": 2353, + "train_runtime": 19390.8708, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.6520775623268698, + "grad_norm": 0.0846082866191864, + "learning_rate": 9.92354187063659e-05, + "loss": 0.017268052324652672, + "num_input_tokens_seen": 38549104, + "step": 2354, + "train_runtime": 19399.1113, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.6523545706371191, + "grad_norm": 0.05907193943858147, + "learning_rate": 9.923465282669349e-05, + "loss": 0.016012078151106834, + "num_input_tokens_seen": 38565480, + "step": 2355, + "train_runtime": 19407.3372, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.6526315789473685, + "grad_norm": 0.07625217735767365, + "learning_rate": 9.92338865665814e-05, + "loss": 0.01569373346865177, + "num_input_tokens_seen": 38581856, + "step": 2356, + "train_runtime": 19415.5706, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.6529085872576177, + "grad_norm": 0.1088777631521225, + "learning_rate": 9.92331199260355e-05, + "loss": 0.018437648192048073, + "num_input_tokens_seen": 38598232, + "step": 2357, + "train_runtime": 19423.8116, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.653185595567867, + "grad_norm": 0.051229432225227356, + "learning_rate": 9.923235290506174e-05, + "loss": 0.01224758755415678, + "num_input_tokens_seen": 38614608, + "step": 2358, + "train_runtime": 19432.0456, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.6534626038781164, + "grad_norm": 0.06651786714792252, + "learning_rate": 9.923158550366607e-05, + "loss": 0.01448697131127119, + "num_input_tokens_seen": 38630984, + "step": 2359, + "train_runtime": 19440.2775, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 0.6537396121883656, + "grad_norm": 0.08515540510416031, + "learning_rate": 9.923081772185439e-05, + "loss": 0.01667901873588562, + "num_input_tokens_seen": 38647360, + "step": 2360, + "train_runtime": 19448.5149, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.654016620498615, + "grad_norm": 0.07727853953838348, + "learning_rate": 9.923004955963265e-05, + "loss": 0.014400518499314785, + "num_input_tokens_seen": 38663736, + "step": 2361, + "train_runtime": 19456.7355, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.6542936288088642, + "grad_norm": 0.0748165175318718, + "learning_rate": 9.922928101700678e-05, + "loss": 0.015013545751571655, + "num_input_tokens_seen": 38680112, + "step": 2362, + "train_runtime": 19464.969, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.6545706371191136, + "grad_norm": 0.07818631082773209, + "learning_rate": 9.922851209398272e-05, + "loss": 0.019243035465478897, + "num_input_tokens_seen": 38696488, + "step": 2363, + "train_runtime": 19473.1952, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.6548476454293629, + "grad_norm": 0.08255145698785782, + "learning_rate": 9.922774279056639e-05, + "loss": 0.013125624507665634, + "num_input_tokens_seen": 38712864, + "step": 2364, + "train_runtime": 19481.4296, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.6551246537396122, + "grad_norm": 0.05959022045135498, + "learning_rate": 9.922697310676376e-05, + "loss": 0.010158970952033997, + "num_input_tokens_seen": 38729240, + "step": 2365, + "train_runtime": 19489.6575, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 0.6554016620498615, + "grad_norm": 0.07420343905687332, + "learning_rate": 9.922620304258078e-05, + "loss": 0.01300303265452385, + "num_input_tokens_seen": 38745616, + "step": 2366, + "train_runtime": 19497.8842, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.6556786703601108, + "grad_norm": 0.06415329873561859, + "learning_rate": 9.922543259802339e-05, + "loss": 0.013702637515962124, + "num_input_tokens_seen": 38761992, + "step": 2367, + "train_runtime": 19506.1011, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.6559556786703601, + "grad_norm": 0.06348177790641785, + "learning_rate": 9.922466177309754e-05, + "loss": 0.014490071684122086, + "num_input_tokens_seen": 38778368, + "step": 2368, + "train_runtime": 19514.3173, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.6562326869806094, + "grad_norm": 0.05554280802607536, + "learning_rate": 9.922389056780919e-05, + "loss": 0.011440515518188477, + "num_input_tokens_seen": 38794744, + "step": 2369, + "train_runtime": 19522.5267, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.6565096952908587, + "grad_norm": 0.0777096152305603, + "learning_rate": 9.92231189821643e-05, + "loss": 0.017670655623078346, + "num_input_tokens_seen": 38811120, + "step": 2370, + "train_runtime": 19530.737, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.656786703601108, + "grad_norm": 0.06398750841617584, + "learning_rate": 9.922234701616883e-05, + "loss": 0.015584224835038185, + "num_input_tokens_seen": 38827496, + "step": 2371, + "train_runtime": 19538.9564, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 0.6570637119113574, + "grad_norm": 0.0542973056435585, + "learning_rate": 9.922157466982874e-05, + "loss": 0.016597939655184746, + "num_input_tokens_seen": 38843872, + "step": 2372, + "train_runtime": 19547.1697, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.6573407202216066, + "grad_norm": 0.06946352869272232, + "learning_rate": 9.922080194315002e-05, + "loss": 0.014118635095655918, + "num_input_tokens_seen": 38860248, + "step": 2373, + "train_runtime": 19555.387, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.657617728531856, + "grad_norm": 0.08375200629234314, + "learning_rate": 9.922002883613861e-05, + "loss": 0.016562728211283684, + "num_input_tokens_seen": 38876624, + "step": 2374, + "train_runtime": 19563.6154, + "train_tokens_per_second": 1987.19 + }, + { + "epoch": 0.6578947368421053, + "grad_norm": 0.0872228667140007, + "learning_rate": 9.921925534880051e-05, + "loss": 0.015106619335711002, + "num_input_tokens_seen": 38893000, + "step": 2375, + "train_runtime": 19571.8282, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.6581717451523545, + "grad_norm": 0.07327134162187576, + "learning_rate": 9.921848148114168e-05, + "loss": 0.015249272808432579, + "num_input_tokens_seen": 38909376, + "step": 2376, + "train_runtime": 19580.0442, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.6584487534626039, + "grad_norm": 0.08414606004953384, + "learning_rate": 9.921770723316812e-05, + "loss": 0.014641388319432735, + "num_input_tokens_seen": 38925752, + "step": 2377, + "train_runtime": 19588.2788, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.6587257617728531, + "grad_norm": 0.07796399295330048, + "learning_rate": 9.921693260488578e-05, + "loss": 0.014513500034809113, + "num_input_tokens_seen": 38942128, + "step": 2378, + "train_runtime": 19596.5104, + "train_tokens_per_second": 1987.197 + }, + { + "epoch": 0.6590027700831025, + "grad_norm": 0.058826200664043427, + "learning_rate": 9.921615759630067e-05, + "loss": 0.015179600566625595, + "num_input_tokens_seen": 38958504, + "step": 2379, + "train_runtime": 19604.7196, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.6592797783933518, + "grad_norm": 0.08183912187814713, + "learning_rate": 9.921538220741877e-05, + "loss": 0.016953421756625175, + "num_input_tokens_seen": 38974880, + "step": 2380, + "train_runtime": 19612.9331, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.6595567867036011, + "grad_norm": 0.09189193695783615, + "learning_rate": 9.92146064382461e-05, + "loss": 0.017742767930030823, + "num_input_tokens_seen": 38991256, + "step": 2381, + "train_runtime": 19621.1446, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.6598337950138504, + "grad_norm": 0.0732617974281311, + "learning_rate": 9.92138302887886e-05, + "loss": 0.011767685413360596, + "num_input_tokens_seen": 39007632, + "step": 2382, + "train_runtime": 19629.3568, + "train_tokens_per_second": 1987.209 + }, + { + "epoch": 0.6601108033240998, + "grad_norm": 0.06219145655632019, + "learning_rate": 9.92130537590523e-05, + "loss": 0.01301621738821268, + "num_input_tokens_seen": 39024008, + "step": 2383, + "train_runtime": 19637.5684, + "train_tokens_per_second": 1987.212 + }, + { + "epoch": 0.660387811634349, + "grad_norm": 0.06668991595506668, + "learning_rate": 9.92122768490432e-05, + "loss": 0.014031288214027882, + "num_input_tokens_seen": 39040384, + "step": 2384, + "train_runtime": 19645.7984, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.6606648199445984, + "grad_norm": 0.08394521474838257, + "learning_rate": 9.92114995587673e-05, + "loss": 0.016818329691886902, + "num_input_tokens_seen": 39056760, + "step": 2385, + "train_runtime": 19654.0295, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 0.6609418282548476, + "grad_norm": 0.08480224758386612, + "learning_rate": 9.92107218882306e-05, + "loss": 0.01749425381422043, + "num_input_tokens_seen": 39073136, + "step": 2386, + "train_runtime": 19662.2484, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 0.661218836565097, + "grad_norm": 0.07212553173303604, + "learning_rate": 9.920994383743912e-05, + "loss": 0.014192241244018078, + "num_input_tokens_seen": 39089512, + "step": 2387, + "train_runtime": 19670.4605, + "train_tokens_per_second": 1987.219 + }, + { + "epoch": 0.6614958448753463, + "grad_norm": 0.087727852165699, + "learning_rate": 9.920916540639887e-05, + "loss": 0.015589611604809761, + "num_input_tokens_seen": 39105888, + "step": 2388, + "train_runtime": 19678.6756, + "train_tokens_per_second": 1987.222 + }, + { + "epoch": 0.6617728531855955, + "grad_norm": 0.09280978888273239, + "learning_rate": 9.920838659511585e-05, + "loss": 0.015675336122512817, + "num_input_tokens_seen": 39122264, + "step": 2389, + "train_runtime": 19686.8848, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 0.6620498614958449, + "grad_norm": 0.08944776654243469, + "learning_rate": 9.92076074035961e-05, + "loss": 0.016695551574230194, + "num_input_tokens_seen": 39138640, + "step": 2390, + "train_runtime": 19695.0942, + "train_tokens_per_second": 1987.228 + }, + { + "epoch": 0.6623268698060942, + "grad_norm": 0.046318184584379196, + "learning_rate": 9.920682783184563e-05, + "loss": 0.012537543661892414, + "num_input_tokens_seen": 39155016, + "step": 2391, + "train_runtime": 19703.3239, + "train_tokens_per_second": 1987.229 + }, + { + "epoch": 0.6626038781163435, + "grad_norm": 0.07492345571517944, + "learning_rate": 9.920604787987046e-05, + "loss": 0.013955362141132355, + "num_input_tokens_seen": 39171392, + "step": 2392, + "train_runtime": 19711.5367, + "train_tokens_per_second": 1987.232 + }, + { + "epoch": 0.6628808864265928, + "grad_norm": 0.12949645519256592, + "learning_rate": 9.920526754767662e-05, + "loss": 0.016360800713300705, + "num_input_tokens_seen": 39187768, + "step": 2393, + "train_runtime": 19719.7666, + "train_tokens_per_second": 1987.233 + }, + { + "epoch": 0.6631578947368421, + "grad_norm": 0.0862983986735344, + "learning_rate": 9.920448683527014e-05, + "loss": 0.014516335912048817, + "num_input_tokens_seen": 39204144, + "step": 2394, + "train_runtime": 19728.0012, + "train_tokens_per_second": 1987.233 + }, + { + "epoch": 0.6634349030470914, + "grad_norm": 0.058595072478055954, + "learning_rate": 9.920370574265707e-05, + "loss": 0.014995590783655643, + "num_input_tokens_seen": 39220520, + "step": 2395, + "train_runtime": 19736.2382, + "train_tokens_per_second": 1987.234 + }, + { + "epoch": 0.6637119113573408, + "grad_norm": 0.07421022653579712, + "learning_rate": 9.920292426984342e-05, + "loss": 0.014439661987125874, + "num_input_tokens_seen": 39236896, + "step": 2396, + "train_runtime": 19744.4649, + "train_tokens_per_second": 1987.235 + }, + { + "epoch": 0.66398891966759, + "grad_norm": 0.05568985641002655, + "learning_rate": 9.920214241683523e-05, + "loss": 0.008386853151023388, + "num_input_tokens_seen": 39253272, + "step": 2397, + "train_runtime": 19752.6996, + "train_tokens_per_second": 1987.236 + }, + { + "epoch": 0.6642659279778393, + "grad_norm": 0.07345324009656906, + "learning_rate": 9.920136018363856e-05, + "loss": 0.015010794624686241, + "num_input_tokens_seen": 39269648, + "step": 2398, + "train_runtime": 19760.9352, + "train_tokens_per_second": 1987.236 + }, + { + "epoch": 0.6645429362880887, + "grad_norm": 0.04989941790699959, + "learning_rate": 9.920057757025944e-05, + "loss": 0.012865474447607994, + "num_input_tokens_seen": 39286024, + "step": 2399, + "train_runtime": 19769.163, + "train_tokens_per_second": 1987.238 + }, + { + "epoch": 0.6648199445983379, + "grad_norm": 0.12138646841049194, + "learning_rate": 9.919979457670392e-05, + "loss": 0.014967430382966995, + "num_input_tokens_seen": 39302400, + "step": 2400, + "train_runtime": 19777.3917, + "train_tokens_per_second": 1987.239 + }, + { + "epoch": 0.6650969529085873, + "grad_norm": 0.08942490071058273, + "learning_rate": 9.919901120297805e-05, + "loss": 0.01637246087193489, + "num_input_tokens_seen": 39318776, + "step": 2401, + "train_runtime": 19787.2383, + "train_tokens_per_second": 1987.078 + }, + { + "epoch": 0.6653739612188365, + "grad_norm": 0.702212393283844, + "learning_rate": 9.919822744908789e-05, + "loss": 0.014940268360078335, + "num_input_tokens_seen": 39335152, + "step": 2402, + "train_runtime": 19795.465, + "train_tokens_per_second": 1987.079 + }, + { + "epoch": 0.6656509695290859, + "grad_norm": 0.09783861041069031, + "learning_rate": 9.919744331503947e-05, + "loss": 0.016860270872712135, + "num_input_tokens_seen": 39351528, + "step": 2403, + "train_runtime": 19803.6809, + "train_tokens_per_second": 1987.082 + }, + { + "epoch": 0.6659279778393352, + "grad_norm": 0.08359774947166443, + "learning_rate": 9.919665880083888e-05, + "loss": 0.015789136290550232, + "num_input_tokens_seen": 39367904, + "step": 2404, + "train_runtime": 19811.9084, + "train_tokens_per_second": 1987.083 + }, + { + "epoch": 0.6662049861495845, + "grad_norm": 0.09288936853408813, + "learning_rate": 9.919587390649219e-05, + "loss": 0.01410446036607027, + "num_input_tokens_seen": 39384280, + "step": 2405, + "train_runtime": 19820.1216, + "train_tokens_per_second": 1987.086 + }, + { + "epoch": 0.6664819944598338, + "grad_norm": 0.13254107534885406, + "learning_rate": 9.919508863200542e-05, + "loss": 0.01975351944565773, + "num_input_tokens_seen": 39400656, + "step": 2406, + "train_runtime": 19828.3289, + "train_tokens_per_second": 1987.089 + }, + { + "epoch": 0.6667590027700832, + "grad_norm": 0.08230464905500412, + "learning_rate": 9.919430297738468e-05, + "loss": 0.016228999942541122, + "num_input_tokens_seen": 39417032, + "step": 2407, + "train_runtime": 19836.5472, + "train_tokens_per_second": 1987.091 + }, + { + "epoch": 0.6670360110803324, + "grad_norm": 0.0942629724740982, + "learning_rate": 9.9193516942636e-05, + "loss": 0.018082039430737495, + "num_input_tokens_seen": 39433408, + "step": 2408, + "train_runtime": 19844.7579, + "train_tokens_per_second": 1987.094 + }, + { + "epoch": 0.6673130193905817, + "grad_norm": 0.07374273985624313, + "learning_rate": 9.919273052776553e-05, + "loss": 0.01504608616232872, + "num_input_tokens_seen": 39449784, + "step": 2409, + "train_runtime": 19852.9712, + "train_tokens_per_second": 1987.097 + }, + { + "epoch": 0.667590027700831, + "grad_norm": 0.07216652482748032, + "learning_rate": 9.919194373277924e-05, + "loss": 0.014146987348794937, + "num_input_tokens_seen": 39466160, + "step": 2410, + "train_runtime": 19861.1861, + "train_tokens_per_second": 1987.1 + }, + { + "epoch": 0.6678670360110803, + "grad_norm": 0.07397284358739853, + "learning_rate": 9.91911565576833e-05, + "loss": 0.01353096216917038, + "num_input_tokens_seen": 39482536, + "step": 2411, + "train_runtime": 19869.3902, + "train_tokens_per_second": 1987.104 + }, + { + "epoch": 0.6681440443213297, + "grad_norm": 0.07430322468280792, + "learning_rate": 9.919036900248375e-05, + "loss": 0.014743147417902946, + "num_input_tokens_seen": 39498912, + "step": 2412, + "train_runtime": 19877.6043, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.6684210526315789, + "grad_norm": 0.14496448636054993, + "learning_rate": 9.918958106718668e-05, + "loss": 0.017568673938512802, + "num_input_tokens_seen": 39515288, + "step": 2413, + "train_runtime": 19885.8116, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.6686980609418283, + "grad_norm": 0.06910529732704163, + "learning_rate": 9.918879275179818e-05, + "loss": 0.015760047361254692, + "num_input_tokens_seen": 39531664, + "step": 2414, + "train_runtime": 19894.0302, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.6689750692520776, + "grad_norm": 0.09376852959394455, + "learning_rate": 9.918800405632434e-05, + "loss": 0.012934878468513489, + "num_input_tokens_seen": 39548040, + "step": 2415, + "train_runtime": 19902.2658, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.6692520775623269, + "grad_norm": 0.06329435110092163, + "learning_rate": 9.918721498077126e-05, + "loss": 0.014038276858627796, + "num_input_tokens_seen": 39564416, + "step": 2416, + "train_runtime": 19910.4931, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.6695290858725762, + "grad_norm": 0.05351203680038452, + "learning_rate": 9.918642552514504e-05, + "loss": 0.012665034271776676, + "num_input_tokens_seen": 39580792, + "step": 2417, + "train_runtime": 19918.718, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.6698060941828254, + "grad_norm": 0.09656383842229843, + "learning_rate": 9.918563568945175e-05, + "loss": 0.017461445182561874, + "num_input_tokens_seen": 39597168, + "step": 2418, + "train_runtime": 19926.9397, + "train_tokens_per_second": 1987.117 + }, + { + "epoch": 0.6700831024930748, + "grad_norm": 0.06145282834768295, + "learning_rate": 9.918484547369755e-05, + "loss": 0.015219825319945812, + "num_input_tokens_seen": 39613544, + "step": 2419, + "train_runtime": 19935.1627, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.6703601108033241, + "grad_norm": 0.08039139956235886, + "learning_rate": 9.918405487788849e-05, + "loss": 0.014105882495641708, + "num_input_tokens_seen": 39629920, + "step": 2420, + "train_runtime": 19943.3837, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.6706371191135734, + "grad_norm": 0.11117056757211685, + "learning_rate": 9.918326390203072e-05, + "loss": 0.014416776597499847, + "num_input_tokens_seen": 39646296, + "step": 2421, + "train_runtime": 19951.6067, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.6709141274238227, + "grad_norm": 0.28140828013420105, + "learning_rate": 9.918247254613033e-05, + "loss": 0.017496054992079735, + "num_input_tokens_seen": 39662672, + "step": 2422, + "train_runtime": 19959.8269, + "train_tokens_per_second": 1987.125 + }, + { + "epoch": 0.6711911357340721, + "grad_norm": 0.22529876232147217, + "learning_rate": 9.918168081019343e-05, + "loss": 0.02195901796221733, + "num_input_tokens_seen": 39679048, + "step": 2423, + "train_runtime": 19968.0595, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.6714681440443213, + "grad_norm": 0.07113590836524963, + "learning_rate": 9.918088869422617e-05, + "loss": 0.01412181556224823, + "num_input_tokens_seen": 39695424, + "step": 2424, + "train_runtime": 19976.2823, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.6717451523545707, + "grad_norm": 0.0976172611117363, + "learning_rate": 9.918009619823462e-05, + "loss": 0.014051509089767933, + "num_input_tokens_seen": 39711800, + "step": 2425, + "train_runtime": 19984.5066, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.6720221606648199, + "grad_norm": 0.12614089250564575, + "learning_rate": 9.917930332222496e-05, + "loss": 0.014605769887566566, + "num_input_tokens_seen": 39728176, + "step": 2426, + "train_runtime": 19992.7296, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 0.6722991689750693, + "grad_norm": 0.10233761370182037, + "learning_rate": 9.917851006620327e-05, + "loss": 0.013296699151396751, + "num_input_tokens_seen": 39744552, + "step": 2427, + "train_runtime": 20000.957, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.6725761772853186, + "grad_norm": 0.085145965218544, + "learning_rate": 9.91777164301757e-05, + "loss": 0.014263231307268143, + "num_input_tokens_seen": 39760928, + "step": 2428, + "train_runtime": 20009.1825, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.6728531855955678, + "grad_norm": 0.10298462212085724, + "learning_rate": 9.917692241414837e-05, + "loss": 0.015789559110999107, + "num_input_tokens_seen": 39777304, + "step": 2429, + "train_runtime": 20017.407, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 0.6731301939058172, + "grad_norm": 0.08895015716552734, + "learning_rate": 9.917612801812744e-05, + "loss": 0.013110371306538582, + "num_input_tokens_seen": 39793680, + "step": 2430, + "train_runtime": 20025.6412, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 0.6734072022160665, + "grad_norm": 0.08040525764226913, + "learning_rate": 9.917533324211904e-05, + "loss": 0.012783890590071678, + "num_input_tokens_seen": 39810056, + "step": 2431, + "train_runtime": 20033.8746, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.6736842105263158, + "grad_norm": 0.06751594692468643, + "learning_rate": 9.917453808612929e-05, + "loss": 0.015007801353931427, + "num_input_tokens_seen": 39826432, + "step": 2432, + "train_runtime": 20042.1036, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.6739612188365651, + "grad_norm": 0.07224646210670471, + "learning_rate": 9.917374255016436e-05, + "loss": 0.01657278463244438, + "num_input_tokens_seen": 39842808, + "step": 2433, + "train_runtime": 20050.3358, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.6742382271468144, + "grad_norm": 0.06503286957740784, + "learning_rate": 9.917294663423037e-05, + "loss": 0.013899327255785465, + "num_input_tokens_seen": 39859184, + "step": 2434, + "train_runtime": 20058.5666, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.6745152354570637, + "grad_norm": 0.10716450959444046, + "learning_rate": 9.917215033833352e-05, + "loss": 0.01603855937719345, + "num_input_tokens_seen": 39875560, + "step": 2435, + "train_runtime": 20066.7974, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.6747922437673131, + "grad_norm": 0.1032087504863739, + "learning_rate": 9.91713536624799e-05, + "loss": 0.01669636182487011, + "num_input_tokens_seen": 39891936, + "step": 2436, + "train_runtime": 20075.0282, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.6750692520775623, + "grad_norm": 0.12053769826889038, + "learning_rate": 9.917055660667571e-05, + "loss": 0.018442949280142784, + "num_input_tokens_seen": 39908312, + "step": 2437, + "train_runtime": 20083.2573, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.6753462603878116, + "grad_norm": 0.11159549653530121, + "learning_rate": 9.916975917092709e-05, + "loss": 0.015096893534064293, + "num_input_tokens_seen": 39924688, + "step": 2438, + "train_runtime": 20091.4874, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.675623268698061, + "grad_norm": 0.08196612447500229, + "learning_rate": 9.91689613552402e-05, + "loss": 0.016377627849578857, + "num_input_tokens_seen": 39941064, + "step": 2439, + "train_runtime": 20099.71, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.6759002770083102, + "grad_norm": 0.06990259885787964, + "learning_rate": 9.916816315962123e-05, + "loss": 0.016532493755221367, + "num_input_tokens_seen": 39957440, + "step": 2440, + "train_runtime": 20107.9288, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.6761772853185596, + "grad_norm": 0.06106731668114662, + "learning_rate": 9.916736458407632e-05, + "loss": 0.013594798743724823, + "num_input_tokens_seen": 39973816, + "step": 2441, + "train_runtime": 20116.1572, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.6764542936288088, + "grad_norm": 0.05248474329710007, + "learning_rate": 9.916656562861164e-05, + "loss": 0.013300424441695213, + "num_input_tokens_seen": 39990192, + "step": 2442, + "train_runtime": 20124.3798, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.6767313019390582, + "grad_norm": 0.05478557571768761, + "learning_rate": 9.916576629323339e-05, + "loss": 0.012153281830251217, + "num_input_tokens_seen": 40006568, + "step": 2443, + "train_runtime": 20132.602, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.6770083102493075, + "grad_norm": 0.09977419674396515, + "learning_rate": 9.916496657794771e-05, + "loss": 0.01840289682149887, + "num_input_tokens_seen": 40022944, + "step": 2444, + "train_runtime": 20140.8345, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.6772853185595568, + "grad_norm": 0.06972428411245346, + "learning_rate": 9.91641664827608e-05, + "loss": 0.014918707311153412, + "num_input_tokens_seen": 40039320, + "step": 2445, + "train_runtime": 20149.0651, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.6775623268698061, + "grad_norm": 0.08310893177986145, + "learning_rate": 9.916336600767888e-05, + "loss": 0.011802599765360355, + "num_input_tokens_seen": 40055696, + "step": 2446, + "train_runtime": 20157.2989, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.6778393351800553, + "grad_norm": 0.05670709162950516, + "learning_rate": 9.916256515270805e-05, + "loss": 0.012951354496181011, + "num_input_tokens_seen": 40072072, + "step": 2447, + "train_runtime": 20165.5268, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.6781163434903047, + "grad_norm": 0.075042225420475, + "learning_rate": 9.916176391785457e-05, + "loss": 0.01575028896331787, + "num_input_tokens_seen": 40088448, + "step": 2448, + "train_runtime": 20173.7573, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.678393351800554, + "grad_norm": 0.08031318336725235, + "learning_rate": 9.91609623031246e-05, + "loss": 0.015659762546420097, + "num_input_tokens_seen": 40104824, + "step": 2449, + "train_runtime": 20181.9862, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 0.6786703601108033, + "grad_norm": 0.05656655877828598, + "learning_rate": 9.916016030852436e-05, + "loss": 0.014515340328216553, + "num_input_tokens_seen": 40121200, + "step": 2450, + "train_runtime": 20190.2112, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.6789473684210526, + "grad_norm": 0.07523903995752335, + "learning_rate": 9.915935793406e-05, + "loss": 0.01676260307431221, + "num_input_tokens_seen": 40137576, + "step": 2451, + "train_runtime": 20198.4367, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.679224376731302, + "grad_norm": 0.0743442252278328, + "learning_rate": 9.915855517973776e-05, + "loss": 0.014852266758680344, + "num_input_tokens_seen": 40153952, + "step": 2452, + "train_runtime": 20206.6461, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 0.6795013850415512, + "grad_norm": 0.13067583739757538, + "learning_rate": 9.915775204556382e-05, + "loss": 0.016780911013484, + "num_input_tokens_seen": 40170328, + "step": 2453, + "train_runtime": 20214.8637, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.6797783933518006, + "grad_norm": 0.07426609098911285, + "learning_rate": 9.915694853154442e-05, + "loss": 0.014333043247461319, + "num_input_tokens_seen": 40186704, + "step": 2454, + "train_runtime": 20223.0905, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 0.6800554016620498, + "grad_norm": 0.10284202545881271, + "learning_rate": 9.915614463768572e-05, + "loss": 0.01884426362812519, + "num_input_tokens_seen": 40203080, + "step": 2455, + "train_runtime": 20231.3132, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 0.6803324099722992, + "grad_norm": 0.09602479636669159, + "learning_rate": 9.915534036399397e-05, + "loss": 0.01607220619916916, + "num_input_tokens_seen": 40219456, + "step": 2456, + "train_runtime": 20239.5459, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 0.6806094182825485, + "grad_norm": 0.07713520526885986, + "learning_rate": 9.915453571047538e-05, + "loss": 0.01593644730746746, + "num_input_tokens_seen": 40235832, + "step": 2457, + "train_runtime": 20247.777, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.6808864265927977, + "grad_norm": 0.07983440160751343, + "learning_rate": 9.915373067713614e-05, + "loss": 0.014400376938283443, + "num_input_tokens_seen": 40252208, + "step": 2458, + "train_runtime": 20256.0078, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.6811634349030471, + "grad_norm": 0.06633429229259491, + "learning_rate": 9.91529252639825e-05, + "loss": 0.013788081705570221, + "num_input_tokens_seen": 40268584, + "step": 2459, + "train_runtime": 20264.2345, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.6814404432132964, + "grad_norm": 0.08546815812587738, + "learning_rate": 9.915211947102067e-05, + "loss": 0.017003316432237625, + "num_input_tokens_seen": 40284960, + "step": 2460, + "train_runtime": 20272.4697, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 0.6817174515235457, + "grad_norm": 0.08727104216814041, + "learning_rate": 9.915131329825689e-05, + "loss": 0.014346038922667503, + "num_input_tokens_seen": 40301336, + "step": 2461, + "train_runtime": 20280.7023, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 0.681994459833795, + "grad_norm": 0.07577130198478699, + "learning_rate": 9.915050674569736e-05, + "loss": 0.012921071611344814, + "num_input_tokens_seen": 40317712, + "step": 2462, + "train_runtime": 20288.9414, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 0.6822714681440443, + "grad_norm": 0.07510500401258469, + "learning_rate": 9.914969981334834e-05, + "loss": 0.014883982948958874, + "num_input_tokens_seen": 40334088, + "step": 2463, + "train_runtime": 20297.1699, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.6825484764542936, + "grad_norm": 0.1015135645866394, + "learning_rate": 9.914889250121606e-05, + "loss": 0.016187790781259537, + "num_input_tokens_seen": 40350464, + "step": 2464, + "train_runtime": 20305.3955, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.682825484764543, + "grad_norm": 0.09017466008663177, + "learning_rate": 9.914808480930676e-05, + "loss": 0.015494744293391705, + "num_input_tokens_seen": 40366840, + "step": 2465, + "train_runtime": 20313.6234, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 0.6831024930747922, + "grad_norm": 0.06289433687925339, + "learning_rate": 9.914727673762667e-05, + "loss": 0.013889764435589314, + "num_input_tokens_seen": 40383216, + "step": 2466, + "train_runtime": 20321.857, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 0.6833795013850416, + "grad_norm": 0.07832712680101395, + "learning_rate": 9.914646828618204e-05, + "loss": 0.01670549251139164, + "num_input_tokens_seen": 40399592, + "step": 2467, + "train_runtime": 20330.0934, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.6836565096952909, + "grad_norm": 0.05259642004966736, + "learning_rate": 9.914565945497912e-05, + "loss": 0.014127533882856369, + "num_input_tokens_seen": 40415968, + "step": 2468, + "train_runtime": 20338.3168, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 0.6839335180055401, + "grad_norm": 0.05069928616285324, + "learning_rate": 9.914485024402415e-05, + "loss": 0.013033144176006317, + "num_input_tokens_seen": 40432344, + "step": 2469, + "train_runtime": 20346.5363, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.6842105263157895, + "grad_norm": 0.08383794128894806, + "learning_rate": 9.91440406533234e-05, + "loss": 0.016869517043232918, + "num_input_tokens_seen": 40448720, + "step": 2470, + "train_runtime": 20354.7569, + "train_tokens_per_second": 1987.188 + }, + { + "epoch": 0.6844875346260387, + "grad_norm": 0.06265851855278015, + "learning_rate": 9.914323068288311e-05, + "loss": 0.013903995044529438, + "num_input_tokens_seen": 40465096, + "step": 2471, + "train_runtime": 20362.986, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.6847645429362881, + "grad_norm": 0.06490408629179001, + "learning_rate": 9.914242033270955e-05, + "loss": 0.015339266508817673, + "num_input_tokens_seen": 40481472, + "step": 2472, + "train_runtime": 20371.2058, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.6850415512465374, + "grad_norm": 0.11331083625555038, + "learning_rate": 9.914160960280897e-05, + "loss": 0.015072988346219063, + "num_input_tokens_seen": 40497848, + "step": 2473, + "train_runtime": 20379.4274, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.6853185595567867, + "grad_norm": 0.09924419224262238, + "learning_rate": 9.914079849318764e-05, + "loss": 0.018034327775239944, + "num_input_tokens_seen": 40514224, + "step": 2474, + "train_runtime": 20387.6568, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 0.685595567867036, + "grad_norm": 0.0569419339299202, + "learning_rate": 9.913998700385183e-05, + "loss": 0.015824802219867706, + "num_input_tokens_seen": 40530600, + "step": 2475, + "train_runtime": 20395.8745, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.6858725761772854, + "grad_norm": 0.3031487464904785, + "learning_rate": 9.91391751348078e-05, + "loss": 0.01697205938398838, + "num_input_tokens_seen": 40546976, + "step": 2476, + "train_runtime": 20404.0905, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.6861495844875346, + "grad_norm": 0.0516100749373436, + "learning_rate": 9.913836288606184e-05, + "loss": 0.01363718044012785, + "num_input_tokens_seen": 40563352, + "step": 2477, + "train_runtime": 20412.3025, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.686426592797784, + "grad_norm": 0.07619713991880417, + "learning_rate": 9.913755025762021e-05, + "loss": 0.014101594686508179, + "num_input_tokens_seen": 40579728, + "step": 2478, + "train_runtime": 20420.5104, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 0.6867036011080332, + "grad_norm": 0.06527069211006165, + "learning_rate": 9.913673724948921e-05, + "loss": 0.016707109287381172, + "num_input_tokens_seen": 40596104, + "step": 2479, + "train_runtime": 20428.7249, + "train_tokens_per_second": 1987.207 + }, + { + "epoch": 0.6869806094182825, + "grad_norm": 0.10014267265796661, + "learning_rate": 9.91359238616751e-05, + "loss": 0.017776012420654297, + "num_input_tokens_seen": 40612480, + "step": 2480, + "train_runtime": 20436.9592, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.6872576177285319, + "grad_norm": 0.08131642639636993, + "learning_rate": 9.913511009418418e-05, + "loss": 0.014226212166249752, + "num_input_tokens_seen": 40628856, + "step": 2481, + "train_runtime": 20445.1911, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.6875346260387811, + "grad_norm": 0.06845735758543015, + "learning_rate": 9.913429594702272e-05, + "loss": 0.016347207129001617, + "num_input_tokens_seen": 40645232, + "step": 2482, + "train_runtime": 20453.4208, + "train_tokens_per_second": 1987.209 + }, + { + "epoch": 0.6878116343490305, + "grad_norm": 0.05818931385874748, + "learning_rate": 9.913348142019704e-05, + "loss": 0.01348055712878704, + "num_input_tokens_seen": 40661608, + "step": 2483, + "train_runtime": 20461.6577, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.6880886426592798, + "grad_norm": 0.055121298879384995, + "learning_rate": 9.913266651371343e-05, + "loss": 0.013723594136536121, + "num_input_tokens_seen": 40677984, + "step": 2484, + "train_runtime": 20470.0366, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.6883656509695291, + "grad_norm": 0.04194130003452301, + "learning_rate": 9.913185122757814e-05, + "loss": 0.011786782182753086, + "num_input_tokens_seen": 40694360, + "step": 2485, + "train_runtime": 20478.2753, + "train_tokens_per_second": 1987.197 + }, + { + "epoch": 0.6886426592797784, + "grad_norm": 0.06775824725627899, + "learning_rate": 9.913103556179752e-05, + "loss": 0.019281666725873947, + "num_input_tokens_seen": 40710736, + "step": 2486, + "train_runtime": 20486.4862, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.6889196675900277, + "grad_norm": 0.0791282132267952, + "learning_rate": 9.913021951637786e-05, + "loss": 0.018188249319791794, + "num_input_tokens_seen": 40727112, + "step": 2487, + "train_runtime": 20494.7059, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 0.689196675900277, + "grad_norm": 0.06787728518247604, + "learning_rate": 9.912940309132546e-05, + "loss": 0.01621941663324833, + "num_input_tokens_seen": 40743488, + "step": 2488, + "train_runtime": 20502.9292, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.6894736842105263, + "grad_norm": 0.07887747138738632, + "learning_rate": 9.912858628664664e-05, + "loss": 0.012362202629446983, + "num_input_tokens_seen": 40759864, + "step": 2489, + "train_runtime": 20511.1643, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 0.6897506925207756, + "grad_norm": 0.0692920908331871, + "learning_rate": 9.912776910234767e-05, + "loss": 0.016079803928732872, + "num_input_tokens_seen": 40776240, + "step": 2490, + "train_runtime": 20519.3942, + "train_tokens_per_second": 1987.205 + }, + { + "epoch": 0.6900277008310249, + "grad_norm": 0.04628564417362213, + "learning_rate": 9.912695153843492e-05, + "loss": 0.014565879479050636, + "num_input_tokens_seen": 40792616, + "step": 2491, + "train_runtime": 20527.6207, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.6903047091412743, + "grad_norm": 0.09891730546951294, + "learning_rate": 9.91261335949147e-05, + "loss": 0.01677347533404827, + "num_input_tokens_seen": 40808992, + "step": 2492, + "train_runtime": 20535.8592, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.6905817174515235, + "grad_norm": 0.10106716305017471, + "learning_rate": 9.91253152717933e-05, + "loss": 0.016485029831528664, + "num_input_tokens_seen": 40825368, + "step": 2493, + "train_runtime": 20544.0801, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.6908587257617729, + "grad_norm": 0.05770794302225113, + "learning_rate": 9.912449656907706e-05, + "loss": 0.012589364312589169, + "num_input_tokens_seen": 40841744, + "step": 2494, + "train_runtime": 20552.301, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.6911357340720221, + "grad_norm": 0.09647738188505173, + "learning_rate": 9.91236774867723e-05, + "loss": 0.01544932834804058, + "num_input_tokens_seen": 40858120, + "step": 2495, + "train_runtime": 20560.521, + "train_tokens_per_second": 1987.212 + }, + { + "epoch": 0.6914127423822715, + "grad_norm": 0.09392660856246948, + "learning_rate": 9.912285802488534e-05, + "loss": 0.016337482258677483, + "num_input_tokens_seen": 40874496, + "step": 2496, + "train_runtime": 20568.7579, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.6916897506925208, + "grad_norm": 0.054690562188625336, + "learning_rate": 9.912203818342253e-05, + "loss": 0.014942296780645847, + "num_input_tokens_seen": 40890872, + "step": 2497, + "train_runtime": 20576.9859, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 0.69196675900277, + "grad_norm": 0.0986272543668747, + "learning_rate": 9.912121796239022e-05, + "loss": 0.016880806535482407, + "num_input_tokens_seen": 40907248, + "step": 2498, + "train_runtime": 20585.2116, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 0.6922437673130194, + "grad_norm": 0.06342771649360657, + "learning_rate": 9.912039736179471e-05, + "loss": 0.011126799508929253, + "num_input_tokens_seen": 40923624, + "step": 2499, + "train_runtime": 20593.4318, + "train_tokens_per_second": 1987.217 + }, + { + "epoch": 0.6925207756232687, + "grad_norm": 0.11686662584543228, + "learning_rate": 9.911957638164238e-05, + "loss": 0.015102099627256393, + "num_input_tokens_seen": 40940000, + "step": 2500, + "train_runtime": 20601.6603, + "train_tokens_per_second": 1987.218 + }, + { + "epoch": 0.692797783933518, + "grad_norm": 0.06868106871843338, + "learning_rate": 9.911875502193952e-05, + "loss": 0.015893545001745224, + "num_input_tokens_seen": 40956376, + "step": 2501, + "train_runtime": 20611.4974, + "train_tokens_per_second": 1987.065 + }, + { + "epoch": 0.6930747922437673, + "grad_norm": 0.07580265402793884, + "learning_rate": 9.911793328269255e-05, + "loss": 0.011470107361674309, + "num_input_tokens_seen": 40972752, + "step": 2502, + "train_runtime": 20619.7207, + "train_tokens_per_second": 1987.066 + }, + { + "epoch": 0.6933518005540166, + "grad_norm": 0.09306291490793228, + "learning_rate": 9.911711116390774e-05, + "loss": 0.013947066850960255, + "num_input_tokens_seen": 40989128, + "step": 2503, + "train_runtime": 20627.945, + "train_tokens_per_second": 1987.068 + }, + { + "epoch": 0.6936288088642659, + "grad_norm": 0.058387640863657, + "learning_rate": 9.91162886655915e-05, + "loss": 0.013345195911824703, + "num_input_tokens_seen": 41005504, + "step": 2504, + "train_runtime": 20636.1708, + "train_tokens_per_second": 1987.069 + }, + { + "epoch": 0.6939058171745153, + "grad_norm": 0.07035304605960846, + "learning_rate": 9.911546578775017e-05, + "loss": 0.012984943576157093, + "num_input_tokens_seen": 41021880, + "step": 2505, + "train_runtime": 20644.3906, + "train_tokens_per_second": 1987.071 + }, + { + "epoch": 0.6941828254847645, + "grad_norm": 0.07874088734388351, + "learning_rate": 9.91146425303901e-05, + "loss": 0.015398085117340088, + "num_input_tokens_seen": 41038256, + "step": 2506, + "train_runtime": 20652.6116, + "train_tokens_per_second": 1987.073 + }, + { + "epoch": 0.6944598337950139, + "grad_norm": 0.10619591921567917, + "learning_rate": 9.911381889351765e-05, + "loss": 0.018092917278409004, + "num_input_tokens_seen": 41054632, + "step": 2507, + "train_runtime": 20660.8346, + "train_tokens_per_second": 1987.075 + }, + { + "epoch": 0.6947368421052632, + "grad_norm": 0.04786757379770279, + "learning_rate": 9.91129948771392e-05, + "loss": 0.011978097259998322, + "num_input_tokens_seen": 41071008, + "step": 2508, + "train_runtime": 20669.0411, + "train_tokens_per_second": 1987.079 + }, + { + "epoch": 0.6950138504155124, + "grad_norm": 0.0660555511713028, + "learning_rate": 9.911217048126112e-05, + "loss": 0.009721033275127411, + "num_input_tokens_seen": 41087384, + "step": 2509, + "train_runtime": 20677.2576, + "train_tokens_per_second": 1987.081 + }, + { + "epoch": 0.6952908587257618, + "grad_norm": 0.08700243383646011, + "learning_rate": 9.911134570588976e-05, + "loss": 0.01689094305038452, + "num_input_tokens_seen": 41103760, + "step": 2510, + "train_runtime": 20685.4801, + "train_tokens_per_second": 1987.083 + }, + { + "epoch": 0.695567867036011, + "grad_norm": 0.06333564221858978, + "learning_rate": 9.91105205510315e-05, + "loss": 0.009912757202982903, + "num_input_tokens_seen": 41120136, + "step": 2511, + "train_runtime": 20693.7056, + "train_tokens_per_second": 1987.084 + }, + { + "epoch": 0.6958448753462604, + "grad_norm": 0.10808621346950531, + "learning_rate": 9.91096950166927e-05, + "loss": 0.015614132396876812, + "num_input_tokens_seen": 41136512, + "step": 2512, + "train_runtime": 20701.9302, + "train_tokens_per_second": 1987.086 + }, + { + "epoch": 0.6961218836565097, + "grad_norm": 0.071184441447258, + "learning_rate": 9.910886910287978e-05, + "loss": 0.014703407883644104, + "num_input_tokens_seen": 41152888, + "step": 2513, + "train_runtime": 20710.1633, + "train_tokens_per_second": 1987.087 + }, + { + "epoch": 0.696398891966759, + "grad_norm": 0.0670214369893074, + "learning_rate": 9.910804280959909e-05, + "loss": 0.012249029241502285, + "num_input_tokens_seen": 41169264, + "step": 2514, + "train_runtime": 20718.3923, + "train_tokens_per_second": 1987.088 + }, + { + "epoch": 0.6966759002770083, + "grad_norm": 0.05987909063696861, + "learning_rate": 9.910721613685703e-05, + "loss": 0.012559862807393074, + "num_input_tokens_seen": 41185640, + "step": 2515, + "train_runtime": 20726.6197, + "train_tokens_per_second": 1987.089 + }, + { + "epoch": 0.6969529085872577, + "grad_norm": 0.09467253088951111, + "learning_rate": 9.910638908465997e-05, + "loss": 0.018063662573695183, + "num_input_tokens_seen": 41202016, + "step": 2516, + "train_runtime": 20734.8463, + "train_tokens_per_second": 1987.09 + }, + { + "epoch": 0.6972299168975069, + "grad_norm": 0.061055973172187805, + "learning_rate": 9.910556165301432e-05, + "loss": 0.015238124877214432, + "num_input_tokens_seen": 41218392, + "step": 2517, + "train_runtime": 20743.0762, + "train_tokens_per_second": 1987.092 + }, + { + "epoch": 0.6975069252077563, + "grad_norm": 0.05718473345041275, + "learning_rate": 9.910473384192647e-05, + "loss": 0.016736453399062157, + "num_input_tokens_seen": 41234768, + "step": 2518, + "train_runtime": 20751.307, + "train_tokens_per_second": 1987.093 + }, + { + "epoch": 0.6977839335180055, + "grad_norm": 0.08443046361207962, + "learning_rate": 9.910390565140279e-05, + "loss": 0.015720995143055916, + "num_input_tokens_seen": 41251144, + "step": 2519, + "train_runtime": 20759.5384, + "train_tokens_per_second": 1987.094 + }, + { + "epoch": 0.6980609418282548, + "grad_norm": 0.0770387202501297, + "learning_rate": 9.910307708144972e-05, + "loss": 0.014351801946759224, + "num_input_tokens_seen": 41267520, + "step": 2520, + "train_runtime": 20767.7665, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.6983379501385042, + "grad_norm": 0.05257358402013779, + "learning_rate": 9.910224813207365e-05, + "loss": 0.01378590613603592, + "num_input_tokens_seen": 41283896, + "step": 2521, + "train_runtime": 20775.9938, + "train_tokens_per_second": 1987.096 + }, + { + "epoch": 0.6986149584487534, + "grad_norm": 0.05678045004606247, + "learning_rate": 9.910141880328097e-05, + "loss": 0.015217442065477371, + "num_input_tokens_seen": 41300272, + "step": 2522, + "train_runtime": 20784.2196, + "train_tokens_per_second": 1987.098 + }, + { + "epoch": 0.6988919667590028, + "grad_norm": 0.056117184460163116, + "learning_rate": 9.91005890950781e-05, + "loss": 0.013427874073386192, + "num_input_tokens_seen": 41316648, + "step": 2523, + "train_runtime": 20792.4644, + "train_tokens_per_second": 1987.097 + }, + { + "epoch": 0.6991689750692521, + "grad_norm": 0.07576458156108856, + "learning_rate": 9.909975900747144e-05, + "loss": 0.012278218753635883, + "num_input_tokens_seen": 41333024, + "step": 2524, + "train_runtime": 20800.7059, + "train_tokens_per_second": 1987.097 + }, + { + "epoch": 0.6994459833795014, + "grad_norm": 0.03744871914386749, + "learning_rate": 9.909892854046742e-05, + "loss": 0.010350801050662994, + "num_input_tokens_seen": 41349400, + "step": 2525, + "train_runtime": 20808.9373, + "train_tokens_per_second": 1987.098 + }, + { + "epoch": 0.6997229916897507, + "grad_norm": 0.045558199286460876, + "learning_rate": 9.909809769407244e-05, + "loss": 0.012474298477172852, + "num_input_tokens_seen": 41365776, + "step": 2526, + "train_runtime": 20817.1651, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.7, + "grad_norm": 0.0459325909614563, + "learning_rate": 9.909726646829295e-05, + "loss": 0.014266681857407093, + "num_input_tokens_seen": 41382152, + "step": 2527, + "train_runtime": 20825.396, + "train_tokens_per_second": 1987.1 + }, + { + "epoch": 0.7002770083102493, + "grad_norm": 0.06726428866386414, + "learning_rate": 9.909643486313533e-05, + "loss": 0.013360065408051014, + "num_input_tokens_seen": 41398528, + "step": 2528, + "train_runtime": 20833.6243, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.7005540166204987, + "grad_norm": 0.04469791799783707, + "learning_rate": 9.909560287860606e-05, + "loss": 0.012123973108828068, + "num_input_tokens_seen": 41414904, + "step": 2529, + "train_runtime": 20841.863, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.7008310249307479, + "grad_norm": 0.04761635512113571, + "learning_rate": 9.90947705147115e-05, + "loss": 0.013601338490843773, + "num_input_tokens_seen": 41431280, + "step": 2530, + "train_runtime": 20850.087, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.7011080332409972, + "grad_norm": 0.06449092924594879, + "learning_rate": 9.909393777145815e-05, + "loss": 0.014457003213465214, + "num_input_tokens_seen": 41447656, + "step": 2531, + "train_runtime": 20858.2996, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.7013850415512466, + "grad_norm": 0.0606805719435215, + "learning_rate": 9.90931046488524e-05, + "loss": 0.015154612250626087, + "num_input_tokens_seen": 41464032, + "step": 2532, + "train_runtime": 20866.5108, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.7016620498614958, + "grad_norm": 0.07382537424564362, + "learning_rate": 9.90922711469007e-05, + "loss": 0.014365563169121742, + "num_input_tokens_seen": 41480408, + "step": 2533, + "train_runtime": 20874.73, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.7019390581717452, + "grad_norm": 0.046769656240940094, + "learning_rate": 9.90914372656095e-05, + "loss": 0.012252395041286945, + "num_input_tokens_seen": 41496784, + "step": 2534, + "train_runtime": 20882.958, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.7022160664819944, + "grad_norm": 0.0679841861128807, + "learning_rate": 9.909060300498523e-05, + "loss": 0.01712052710354328, + "num_input_tokens_seen": 41513160, + "step": 2535, + "train_runtime": 20891.1823, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.7024930747922438, + "grad_norm": 0.06995433568954468, + "learning_rate": 9.908976836503434e-05, + "loss": 0.016286101192235947, + "num_input_tokens_seen": 41529536, + "step": 2536, + "train_runtime": 20899.4093, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.7027700831024931, + "grad_norm": 0.1974060833454132, + "learning_rate": 9.90889333457633e-05, + "loss": 0.011582336388528347, + "num_input_tokens_seen": 41545912, + "step": 2537, + "train_runtime": 20907.6443, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.7030470914127424, + "grad_norm": 0.10475171357393265, + "learning_rate": 9.908809794717852e-05, + "loss": 0.016275182366371155, + "num_input_tokens_seen": 41562288, + "step": 2538, + "train_runtime": 20915.8681, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.7033240997229917, + "grad_norm": 0.12189260125160217, + "learning_rate": 9.908726216928649e-05, + "loss": 0.011770590208470821, + "num_input_tokens_seen": 41578664, + "step": 2539, + "train_runtime": 20924.091, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.703601108033241, + "grad_norm": 0.08833901584148407, + "learning_rate": 9.908642601209366e-05, + "loss": 0.014716255478560925, + "num_input_tokens_seen": 41595040, + "step": 2540, + "train_runtime": 20932.3234, + "train_tokens_per_second": 1987.12 + }, + { + "epoch": 0.7038781163434903, + "grad_norm": 0.06412380933761597, + "learning_rate": 9.908558947560649e-05, + "loss": 0.012792207300662994, + "num_input_tokens_seen": 41611416, + "step": 2541, + "train_runtime": 20940.5577, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.7041551246537396, + "grad_norm": 0.08198963105678558, + "learning_rate": 9.908475255983143e-05, + "loss": 0.01726151630282402, + "num_input_tokens_seen": 41627792, + "step": 2542, + "train_runtime": 20948.7814, + "train_tokens_per_second": 1987.122 + }, + { + "epoch": 0.7044321329639889, + "grad_norm": 0.03745498135685921, + "learning_rate": 9.908391526477497e-05, + "loss": 0.016121745109558105, + "num_input_tokens_seen": 41644168, + "step": 2543, + "train_runtime": 20957.008, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.7047091412742382, + "grad_norm": 0.048249877989292145, + "learning_rate": 9.908307759044355e-05, + "loss": 0.012204395607113838, + "num_input_tokens_seen": 41660544, + "step": 2544, + "train_runtime": 20965.2287, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.7049861495844876, + "grad_norm": 0.08317162841558456, + "learning_rate": 9.908223953684367e-05, + "loss": 0.013684799894690514, + "num_input_tokens_seen": 41676920, + "step": 2545, + "train_runtime": 20973.446, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.7052631578947368, + "grad_norm": 0.07956897467374802, + "learning_rate": 9.908140110398181e-05, + "loss": 0.013699572533369064, + "num_input_tokens_seen": 41693296, + "step": 2546, + "train_runtime": 20981.6598, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.7055401662049862, + "grad_norm": 0.06958579272031784, + "learning_rate": 9.908056229186442e-05, + "loss": 0.012749023735523224, + "num_input_tokens_seen": 41709672, + "step": 2547, + "train_runtime": 20989.8865, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.7058171745152355, + "grad_norm": 0.04416269436478615, + "learning_rate": 9.9079723100498e-05, + "loss": 0.010804373770952225, + "num_input_tokens_seen": 41726048, + "step": 2548, + "train_runtime": 20998.1259, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.7060941828254848, + "grad_norm": 0.06450019776821136, + "learning_rate": 9.907888352988904e-05, + "loss": 0.01430969126522541, + "num_input_tokens_seen": 41742424, + "step": 2549, + "train_runtime": 21006.3674, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.7063711911357341, + "grad_norm": 0.09681873023509979, + "learning_rate": 9.907804358004402e-05, + "loss": 0.015644613653421402, + "num_input_tokens_seen": 41758800, + "step": 2550, + "train_runtime": 21014.6047, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.7066481994459833, + "grad_norm": 0.08126513659954071, + "learning_rate": 9.907720325096943e-05, + "loss": 0.015442796051502228, + "num_input_tokens_seen": 41775176, + "step": 2551, + "train_runtime": 21022.8326, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.7069252077562327, + "grad_norm": 0.07089509814977646, + "learning_rate": 9.907636254267176e-05, + "loss": 0.01374083198606968, + "num_input_tokens_seen": 41791552, + "step": 2552, + "train_runtime": 21031.0568, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.707202216066482, + "grad_norm": 0.08236368745565414, + "learning_rate": 9.90755214551575e-05, + "loss": 0.014298931695520878, + "num_input_tokens_seen": 41807928, + "step": 2553, + "train_runtime": 21039.2797, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.7074792243767313, + "grad_norm": 0.08273398131132126, + "learning_rate": 9.907467998843316e-05, + "loss": 0.01349730882793665, + "num_input_tokens_seen": 41824304, + "step": 2554, + "train_runtime": 21047.5054, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.7077562326869806, + "grad_norm": 0.06439745426177979, + "learning_rate": 9.907383814250526e-05, + "loss": 0.014255448244512081, + "num_input_tokens_seen": 41840680, + "step": 2555, + "train_runtime": 21055.723, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.7080332409972299, + "grad_norm": 0.06368391215801239, + "learning_rate": 9.907299591738026e-05, + "loss": 0.012179024517536163, + "num_input_tokens_seen": 41857056, + "step": 2556, + "train_runtime": 21063.939, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.7083102493074792, + "grad_norm": 0.07326491922140121, + "learning_rate": 9.907215331306471e-05, + "loss": 0.014267454855144024, + "num_input_tokens_seen": 41873432, + "step": 2557, + "train_runtime": 21072.1563, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.7085872576177286, + "grad_norm": 0.07253903895616531, + "learning_rate": 9.907131032956509e-05, + "loss": 0.014779717661440372, + "num_input_tokens_seen": 41889808, + "step": 2558, + "train_runtime": 21080.4041, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.7088642659279778, + "grad_norm": 0.08022894710302353, + "learning_rate": 9.907046696688794e-05, + "loss": 0.01479906402528286, + "num_input_tokens_seen": 41906184, + "step": 2559, + "train_runtime": 21088.6383, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.7091412742382271, + "grad_norm": 0.077599436044693, + "learning_rate": 9.906962322503977e-05, + "loss": 0.013780917972326279, + "num_input_tokens_seen": 41922560, + "step": 2560, + "train_runtime": 21096.8781, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.7094182825484765, + "grad_norm": 0.0827602669596672, + "learning_rate": 9.906877910402708e-05, + "loss": 0.01289704442024231, + "num_input_tokens_seen": 41938936, + "step": 2561, + "train_runtime": 21105.1099, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.7096952908587257, + "grad_norm": 0.07491835206747055, + "learning_rate": 9.906793460385642e-05, + "loss": 0.016090398654341698, + "num_input_tokens_seen": 41955312, + "step": 2562, + "train_runtime": 21113.3442, + "train_tokens_per_second": 1987.147 + }, + { + "epoch": 0.7099722991689751, + "grad_norm": 0.21131345629692078, + "learning_rate": 9.906708972453429e-05, + "loss": 0.01786944828927517, + "num_input_tokens_seen": 41971688, + "step": 2563, + "train_runtime": 21121.5741, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.7102493074792243, + "grad_norm": 0.08644188940525055, + "learning_rate": 9.906624446606723e-05, + "loss": 0.01691097766160965, + "num_input_tokens_seen": 41988064, + "step": 2564, + "train_runtime": 21129.7942, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.7105263157894737, + "grad_norm": 0.045426350086927414, + "learning_rate": 9.906539882846178e-05, + "loss": 0.012936215847730637, + "num_input_tokens_seen": 42004440, + "step": 2565, + "train_runtime": 21138.0259, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.710803324099723, + "grad_norm": 0.061328500509262085, + "learning_rate": 9.906455281172447e-05, + "loss": 0.012518560513854027, + "num_input_tokens_seen": 42020816, + "step": 2566, + "train_runtime": 21146.2598, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.7110803324099723, + "grad_norm": 0.06894192099571228, + "learning_rate": 9.906370641586182e-05, + "loss": 0.012815548107028008, + "num_input_tokens_seen": 42037192, + "step": 2567, + "train_runtime": 21154.4902, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.7113573407202216, + "grad_norm": 0.058242253959178925, + "learning_rate": 9.90628596408804e-05, + "loss": 0.011962542310357094, + "num_input_tokens_seen": 42053568, + "step": 2568, + "train_runtime": 21162.7176, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.711634349030471, + "grad_norm": 0.06168752536177635, + "learning_rate": 9.906201248678673e-05, + "loss": 0.01536400057375431, + "num_input_tokens_seen": 42069944, + "step": 2569, + "train_runtime": 21170.9576, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.7119113573407202, + "grad_norm": 0.06711956858634949, + "learning_rate": 9.906116495358736e-05, + "loss": 0.013356685638427734, + "num_input_tokens_seen": 42086320, + "step": 2570, + "train_runtime": 21179.1889, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.7121883656509695, + "grad_norm": 0.07307901233434677, + "learning_rate": 9.906031704128884e-05, + "loss": 0.01474160049110651, + "num_input_tokens_seen": 42102696, + "step": 2571, + "train_runtime": 21187.4118, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.7124653739612188, + "grad_norm": 0.09166843444108963, + "learning_rate": 9.905946874989773e-05, + "loss": 0.021010953933000565, + "num_input_tokens_seen": 42119072, + "step": 2572, + "train_runtime": 21195.6397, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.7127423822714681, + "grad_norm": 0.07175473123788834, + "learning_rate": 9.905862007942058e-05, + "loss": 0.014223083853721619, + "num_input_tokens_seen": 42135448, + "step": 2573, + "train_runtime": 21203.8729, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.7130193905817175, + "grad_norm": 0.10445842146873474, + "learning_rate": 9.905777102986393e-05, + "loss": 0.015664175152778625, + "num_input_tokens_seen": 42151824, + "step": 2574, + "train_runtime": 21212.0878, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.7132963988919667, + "grad_norm": 0.060705605894327164, + "learning_rate": 9.905692160123438e-05, + "loss": 0.014701305888593197, + "num_input_tokens_seen": 42168200, + "step": 2575, + "train_runtime": 21220.3198, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.7135734072022161, + "grad_norm": 0.05282096192240715, + "learning_rate": 9.905607179353846e-05, + "loss": 0.011930298060178757, + "num_input_tokens_seen": 42184576, + "step": 2576, + "train_runtime": 21228.5444, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.7138504155124654, + "grad_norm": 0.07853282988071442, + "learning_rate": 9.905522160678275e-05, + "loss": 0.016224240884184837, + "num_input_tokens_seen": 42200952, + "step": 2577, + "train_runtime": 21236.7598, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.7141274238227147, + "grad_norm": 0.20324568450450897, + "learning_rate": 9.905437104097381e-05, + "loss": 0.02179066836833954, + "num_input_tokens_seen": 42217328, + "step": 2578, + "train_runtime": 21244.9729, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.714404432132964, + "grad_norm": 0.059591952711343765, + "learning_rate": 9.905352009611823e-05, + "loss": 0.014172961935400963, + "num_input_tokens_seen": 42233704, + "step": 2579, + "train_runtime": 21253.1796, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 0.7146814404432132, + "grad_norm": 0.06382734328508377, + "learning_rate": 9.905266877222256e-05, + "loss": 0.018366139382123947, + "num_input_tokens_seen": 42250080, + "step": 2580, + "train_runtime": 21261.3897, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.7149584487534626, + "grad_norm": 0.10002744942903519, + "learning_rate": 9.90518170692934e-05, + "loss": 0.017407992854714394, + "num_input_tokens_seen": 42266456, + "step": 2581, + "train_runtime": 21269.5976, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 0.7152354570637119, + "grad_norm": 0.07590474933385849, + "learning_rate": 9.905096498733733e-05, + "loss": 0.013747522607445717, + "num_input_tokens_seen": 42282832, + "step": 2582, + "train_runtime": 21277.807, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 0.7155124653739612, + "grad_norm": 0.06545314937829971, + "learning_rate": 9.905011252636093e-05, + "loss": 0.014622601680457592, + "num_input_tokens_seen": 42299208, + "step": 2583, + "train_runtime": 21286.0156, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 0.7157894736842105, + "grad_norm": 0.051503829658031464, + "learning_rate": 9.904925968637078e-05, + "loss": 0.009895860217511654, + "num_input_tokens_seen": 42315584, + "step": 2584, + "train_runtime": 21294.2216, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.7160664819944599, + "grad_norm": 0.09776584059000015, + "learning_rate": 9.904840646737346e-05, + "loss": 0.017693184316158295, + "num_input_tokens_seen": 42331960, + "step": 2585, + "train_runtime": 21302.4373, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.7163434903047091, + "grad_norm": 0.059469521045684814, + "learning_rate": 9.90475528693756e-05, + "loss": 0.012867439538240433, + "num_input_tokens_seen": 42348336, + "step": 2586, + "train_runtime": 21310.6661, + "train_tokens_per_second": 1987.19 + }, + { + "epoch": 0.7166204986149585, + "grad_norm": 0.08092531561851501, + "learning_rate": 9.904669889238376e-05, + "loss": 0.014725293032824993, + "num_input_tokens_seen": 42364712, + "step": 2587, + "train_runtime": 21318.896, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.7168975069252077, + "grad_norm": 0.06114800274372101, + "learning_rate": 9.904584453640455e-05, + "loss": 0.012545449659228325, + "num_input_tokens_seen": 42381088, + "step": 2588, + "train_runtime": 21327.123, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 0.717174515235457, + "grad_norm": 0.05627395957708359, + "learning_rate": 9.904498980144459e-05, + "loss": 0.012739500030875206, + "num_input_tokens_seen": 42397464, + "step": 2589, + "train_runtime": 21335.3557, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.7174515235457064, + "grad_norm": 0.07558786123991013, + "learning_rate": 9.904413468751046e-05, + "loss": 0.012999224476516247, + "num_input_tokens_seen": 42413840, + "step": 2590, + "train_runtime": 21343.585, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 0.7177285318559556, + "grad_norm": 0.05131303146481514, + "learning_rate": 9.904327919460876e-05, + "loss": 0.01190529577434063, + "num_input_tokens_seen": 42430216, + "step": 2591, + "train_runtime": 21351.8148, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.718005540166205, + "grad_norm": 0.061505552381277084, + "learning_rate": 9.904242332274614e-05, + "loss": 0.013639656826853752, + "num_input_tokens_seen": 42446592, + "step": 2592, + "train_runtime": 21360.0574, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.7182825484764543, + "grad_norm": 0.07795015722513199, + "learning_rate": 9.904156707192917e-05, + "loss": 0.013468562625348568, + "num_input_tokens_seen": 42462968, + "step": 2593, + "train_runtime": 21368.2873, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.7185595567867036, + "grad_norm": 0.05298732966184616, + "learning_rate": 9.90407104421645e-05, + "loss": 0.01208474114537239, + "num_input_tokens_seen": 42479344, + "step": 2594, + "train_runtime": 21376.5124, + "train_tokens_per_second": 1987.197 + }, + { + "epoch": 0.7188365650969529, + "grad_norm": 0.07478810846805573, + "learning_rate": 9.903985343345873e-05, + "loss": 0.014119633473455906, + "num_input_tokens_seen": 42495720, + "step": 2595, + "train_runtime": 21384.7268, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.7191135734072022, + "grad_norm": 0.08145356923341751, + "learning_rate": 9.903899604581849e-05, + "loss": 0.016190581023693085, + "num_input_tokens_seen": 42512096, + "step": 2596, + "train_runtime": 21392.9578, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.7193905817174515, + "grad_norm": 0.05902230739593506, + "learning_rate": 9.903813827925042e-05, + "loss": 0.012803674675524235, + "num_input_tokens_seen": 42528472, + "step": 2597, + "train_runtime": 21401.1841, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 0.7196675900277009, + "grad_norm": 0.06683387607336044, + "learning_rate": 9.90372801337611e-05, + "loss": 0.013334661722183228, + "num_input_tokens_seen": 42544848, + "step": 2598, + "train_runtime": 21409.4188, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 0.7199445983379501, + "grad_norm": 0.06901353597640991, + "learning_rate": 9.903642160935723e-05, + "loss": 0.015309279784560204, + "num_input_tokens_seen": 42561224, + "step": 2599, + "train_runtime": 21417.6575, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.7202216066481995, + "grad_norm": 0.07497791200876236, + "learning_rate": 9.903556270604536e-05, + "loss": 0.01491079106926918, + "num_input_tokens_seen": 42577600, + "step": 2600, + "train_runtime": 21425.8885, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.7204986149584488, + "grad_norm": 0.07055418193340302, + "learning_rate": 9.90347034238322e-05, + "loss": 0.012726746499538422, + "num_input_tokens_seen": 42593976, + "step": 2601, + "train_runtime": 21435.7838, + "train_tokens_per_second": 1987.05 + }, + { + "epoch": 0.720775623268698, + "grad_norm": 0.07162605971097946, + "learning_rate": 9.903384376272435e-05, + "loss": 0.015724072232842445, + "num_input_tokens_seen": 42610352, + "step": 2602, + "train_runtime": 21444.0111, + "train_tokens_per_second": 1987.051 + }, + { + "epoch": 0.7210526315789474, + "grad_norm": 0.09725921601057053, + "learning_rate": 9.903298372272846e-05, + "loss": 0.01785125769674778, + "num_input_tokens_seen": 42626728, + "step": 2603, + "train_runtime": 21452.2344, + "train_tokens_per_second": 1987.053 + }, + { + "epoch": 0.7213296398891966, + "grad_norm": 0.07616115361452103, + "learning_rate": 9.903212330385122e-05, + "loss": 0.012897678650915623, + "num_input_tokens_seen": 42643104, + "step": 2604, + "train_runtime": 21460.4669, + "train_tokens_per_second": 1987.054 + }, + { + "epoch": 0.721606648199446, + "grad_norm": 0.05041630566120148, + "learning_rate": 9.90312625060992e-05, + "loss": 0.014788821339607239, + "num_input_tokens_seen": 42659480, + "step": 2605, + "train_runtime": 21468.6963, + "train_tokens_per_second": 1987.055 + }, + { + "epoch": 0.7218836565096953, + "grad_norm": 0.06382815539836884, + "learning_rate": 9.90304013294791e-05, + "loss": 0.014305340126156807, + "num_input_tokens_seen": 42675856, + "step": 2606, + "train_runtime": 21476.9269, + "train_tokens_per_second": 1987.056 + }, + { + "epoch": 0.7221606648199446, + "grad_norm": 0.08835389465093613, + "learning_rate": 9.902953977399756e-05, + "loss": 0.01407462079077959, + "num_input_tokens_seen": 42692232, + "step": 2607, + "train_runtime": 21485.157, + "train_tokens_per_second": 1987.057 + }, + { + "epoch": 0.7224376731301939, + "grad_norm": 0.0752464011311531, + "learning_rate": 9.902867783966126e-05, + "loss": 0.018060998991131783, + "num_input_tokens_seen": 42708608, + "step": 2608, + "train_runtime": 21493.389, + "train_tokens_per_second": 1987.058 + }, + { + "epoch": 0.7227146814404433, + "grad_norm": 0.09055190533399582, + "learning_rate": 9.902781552647682e-05, + "loss": 0.0166783444583416, + "num_input_tokens_seen": 42724984, + "step": 2609, + "train_runtime": 21501.6223, + "train_tokens_per_second": 1987.059 + }, + { + "epoch": 0.7229916897506925, + "grad_norm": 0.05183892697095871, + "learning_rate": 9.902695283445093e-05, + "loss": 0.01407467108219862, + "num_input_tokens_seen": 42741360, + "step": 2610, + "train_runtime": 21509.8462, + "train_tokens_per_second": 1987.06 + }, + { + "epoch": 0.7232686980609419, + "grad_norm": 0.9135137796401978, + "learning_rate": 9.902608976359026e-05, + "loss": 0.012833833694458008, + "num_input_tokens_seen": 42757736, + "step": 2611, + "train_runtime": 21518.0733, + "train_tokens_per_second": 1987.062 + }, + { + "epoch": 0.7235457063711911, + "grad_norm": 0.08803021162748337, + "learning_rate": 9.902522631390147e-05, + "loss": 0.014343911781907082, + "num_input_tokens_seen": 42774112, + "step": 2612, + "train_runtime": 21526.2972, + "train_tokens_per_second": 1987.063 + }, + { + "epoch": 0.7238227146814404, + "grad_norm": 0.07570972293615341, + "learning_rate": 9.902436248539122e-05, + "loss": 0.013493772596120834, + "num_input_tokens_seen": 42790488, + "step": 2613, + "train_runtime": 21534.5224, + "train_tokens_per_second": 1987.065 + }, + { + "epoch": 0.7240997229916898, + "grad_norm": 0.06534922868013382, + "learning_rate": 9.90234982780662e-05, + "loss": 0.01426333375275135, + "num_input_tokens_seen": 42806864, + "step": 2614, + "train_runtime": 21542.7599, + "train_tokens_per_second": 1987.065 + }, + { + "epoch": 0.724376731301939, + "grad_norm": 0.0958356261253357, + "learning_rate": 9.90226336919331e-05, + "loss": 0.013764418661594391, + "num_input_tokens_seen": 42823240, + "step": 2615, + "train_runtime": 21550.981, + "train_tokens_per_second": 1987.067 + }, + { + "epoch": 0.7246537396121884, + "grad_norm": 0.09714631736278534, + "learning_rate": 9.902176872699857e-05, + "loss": 0.012933757156133652, + "num_input_tokens_seen": 42839616, + "step": 2616, + "train_runtime": 21559.1936, + "train_tokens_per_second": 1987.069 + }, + { + "epoch": 0.7249307479224377, + "grad_norm": 0.06505749374628067, + "learning_rate": 9.902090338326932e-05, + "loss": 0.013080387376248837, + "num_input_tokens_seen": 42855992, + "step": 2617, + "train_runtime": 21567.4057, + "train_tokens_per_second": 1987.072 + }, + { + "epoch": 0.725207756232687, + "grad_norm": 0.043604668229818344, + "learning_rate": 9.902003766075202e-05, + "loss": 0.012014374136924744, + "num_input_tokens_seen": 42872368, + "step": 2618, + "train_runtime": 21575.6212, + "train_tokens_per_second": 1987.075 + }, + { + "epoch": 0.7254847645429363, + "grad_norm": 0.11405043303966522, + "learning_rate": 9.901917155945337e-05, + "loss": 0.01450019795447588, + "num_input_tokens_seen": 42888744, + "step": 2619, + "train_runtime": 21583.8343, + "train_tokens_per_second": 1987.077 + }, + { + "epoch": 0.7257617728531855, + "grad_norm": 0.06671954691410065, + "learning_rate": 9.901830507938006e-05, + "loss": 0.016228344291448593, + "num_input_tokens_seen": 42905120, + "step": 2620, + "train_runtime": 21592.056, + "train_tokens_per_second": 1987.079 + }, + { + "epoch": 0.7260387811634349, + "grad_norm": 0.05821807309985161, + "learning_rate": 9.901743822053879e-05, + "loss": 0.011185996234416962, + "num_input_tokens_seen": 42921496, + "step": 2621, + "train_runtime": 21600.2677, + "train_tokens_per_second": 1987.082 + }, + { + "epoch": 0.7263157894736842, + "grad_norm": 0.06922943890094757, + "learning_rate": 9.901657098293624e-05, + "loss": 0.013573682866990566, + "num_input_tokens_seen": 42937872, + "step": 2622, + "train_runtime": 21608.4816, + "train_tokens_per_second": 1987.084 + }, + { + "epoch": 0.7265927977839335, + "grad_norm": 0.07704619318246841, + "learning_rate": 9.901570336657912e-05, + "loss": 0.011221247725188732, + "num_input_tokens_seen": 42954248, + "step": 2623, + "train_runtime": 21616.6937, + "train_tokens_per_second": 1987.087 + }, + { + "epoch": 0.7268698060941828, + "grad_norm": 0.05497593432664871, + "learning_rate": 9.901483537147414e-05, + "loss": 0.016514644026756287, + "num_input_tokens_seen": 42970624, + "step": 2624, + "train_runtime": 21624.9009, + "train_tokens_per_second": 1987.09 + }, + { + "epoch": 0.7271468144044322, + "grad_norm": 0.09205906093120575, + "learning_rate": 9.9013966997628e-05, + "loss": 0.01419711485505104, + "num_input_tokens_seen": 42987000, + "step": 2625, + "train_runtime": 21633.1137, + "train_tokens_per_second": 1987.093 + }, + { + "epoch": 0.7274238227146814, + "grad_norm": 0.0601637177169323, + "learning_rate": 9.901309824504743e-05, + "loss": 0.01242904458194971, + "num_input_tokens_seen": 43003376, + "step": 2626, + "train_runtime": 21641.33, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.7277008310249308, + "grad_norm": 0.06499280780553818, + "learning_rate": 9.901222911373911e-05, + "loss": 0.012703416869044304, + "num_input_tokens_seen": 43019752, + "step": 2627, + "train_runtime": 21649.5606, + "train_tokens_per_second": 1987.096 + }, + { + "epoch": 0.72797783933518, + "grad_norm": 0.07920948415994644, + "learning_rate": 9.901135960370977e-05, + "loss": 0.01203902717679739, + "num_input_tokens_seen": 43036128, + "step": 2628, + "train_runtime": 21657.7868, + "train_tokens_per_second": 1987.097 + }, + { + "epoch": 0.7282548476454294, + "grad_norm": 0.06832709163427353, + "learning_rate": 9.901048971496615e-05, + "loss": 0.012285965494811535, + "num_input_tokens_seen": 43052504, + "step": 2629, + "train_runtime": 21666.0151, + "train_tokens_per_second": 1987.098 + }, + { + "epoch": 0.7285318559556787, + "grad_norm": 0.10855673998594284, + "learning_rate": 9.900961944751494e-05, + "loss": 0.014764301478862762, + "num_input_tokens_seen": 43068880, + "step": 2630, + "train_runtime": 21674.2395, + "train_tokens_per_second": 1987.1 + }, + { + "epoch": 0.728808864265928, + "grad_norm": 0.05765470489859581, + "learning_rate": 9.900874880136289e-05, + "loss": 0.0148309376090765, + "num_input_tokens_seen": 43085256, + "step": 2631, + "train_runtime": 21682.4712, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.7290858725761773, + "grad_norm": 0.06209743395447731, + "learning_rate": 9.90078777765167e-05, + "loss": 0.0139786871150136, + "num_input_tokens_seen": 43101632, + "step": 2632, + "train_runtime": 21690.6936, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.7293628808864266, + "grad_norm": 0.06516801565885544, + "learning_rate": 9.900700637298312e-05, + "loss": 0.01172975916415453, + "num_input_tokens_seen": 43118008, + "step": 2633, + "train_runtime": 21698.9147, + "train_tokens_per_second": 1987.104 + }, + { + "epoch": 0.7296398891966759, + "grad_norm": 0.07721249014139175, + "learning_rate": 9.900613459076889e-05, + "loss": 0.015099594369530678, + "num_input_tokens_seen": 43134384, + "step": 2634, + "train_runtime": 21707.1389, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.7299168975069252, + "grad_norm": 0.05000893771648407, + "learning_rate": 9.900526242988071e-05, + "loss": 0.013153152540326118, + "num_input_tokens_seen": 43150760, + "step": 2635, + "train_runtime": 21715.3626, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.7301939058171745, + "grad_norm": 0.09536460787057877, + "learning_rate": 9.900438989032536e-05, + "loss": 0.016667142510414124, + "num_input_tokens_seen": 43167136, + "step": 2636, + "train_runtime": 21723.5907, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.7304709141274238, + "grad_norm": 0.057016029953956604, + "learning_rate": 9.900351697210959e-05, + "loss": 0.012629246339201927, + "num_input_tokens_seen": 43183512, + "step": 2637, + "train_runtime": 21731.8141, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.7307479224376732, + "grad_norm": 0.06289942562580109, + "learning_rate": 9.900264367524009e-05, + "loss": 0.013683175668120384, + "num_input_tokens_seen": 43199888, + "step": 2638, + "train_runtime": 21740.0307, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.7310249307479224, + "grad_norm": 0.07376905530691147, + "learning_rate": 9.900176999972366e-05, + "loss": 0.01538157369941473, + "num_input_tokens_seen": 43216264, + "step": 2639, + "train_runtime": 21748.2568, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.7313019390581718, + "grad_norm": 0.06799143552780151, + "learning_rate": 9.900089594556701e-05, + "loss": 0.01455591432750225, + "num_input_tokens_seen": 43232640, + "step": 2640, + "train_runtime": 21756.4873, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.7315789473684211, + "grad_norm": 0.06276247650384903, + "learning_rate": 9.900002151277695e-05, + "loss": 0.01575237326323986, + "num_input_tokens_seen": 43249016, + "step": 2641, + "train_runtime": 21764.7111, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.7318559556786703, + "grad_norm": 0.06650171428918839, + "learning_rate": 9.899914670136017e-05, + "loss": 0.015900226309895515, + "num_input_tokens_seen": 43265392, + "step": 2642, + "train_runtime": 21772.9225, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.7321329639889197, + "grad_norm": 0.14138132333755493, + "learning_rate": 9.899827151132346e-05, + "loss": 0.017255425453186035, + "num_input_tokens_seen": 43281768, + "step": 2643, + "train_runtime": 21781.1355, + "train_tokens_per_second": 1987.122 + }, + { + "epoch": 0.7324099722991689, + "grad_norm": 0.09107768535614014, + "learning_rate": 9.89973959426736e-05, + "loss": 0.017035339027643204, + "num_input_tokens_seen": 43298144, + "step": 2644, + "train_runtime": 21789.3561, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.7326869806094183, + "grad_norm": 0.050875402987003326, + "learning_rate": 9.899651999541733e-05, + "loss": 0.015012338757514954, + "num_input_tokens_seen": 43314520, + "step": 2645, + "train_runtime": 21797.5737, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.7329639889196676, + "grad_norm": 0.07043706625699997, + "learning_rate": 9.899564366956143e-05, + "loss": 0.012204976752400398, + "num_input_tokens_seen": 43330896, + "step": 2646, + "train_runtime": 21805.7888, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.7332409972299169, + "grad_norm": 0.05210456624627113, + "learning_rate": 9.899476696511267e-05, + "loss": 0.013118208386003971, + "num_input_tokens_seen": 43347272, + "step": 2647, + "train_runtime": 21813.9997, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 0.7335180055401662, + "grad_norm": 0.0782817155122757, + "learning_rate": 9.899388988207782e-05, + "loss": 0.015137511305510998, + "num_input_tokens_seen": 43363648, + "step": 2648, + "train_runtime": 21822.2258, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.7337950138504156, + "grad_norm": 0.06654171645641327, + "learning_rate": 9.899301242046368e-05, + "loss": 0.016414975747466087, + "num_input_tokens_seen": 43380024, + "step": 2649, + "train_runtime": 21830.4581, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.7340720221606648, + "grad_norm": 0.06461989134550095, + "learning_rate": 9.899213458027701e-05, + "loss": 0.012553581967949867, + "num_input_tokens_seen": 43396400, + "step": 2650, + "train_runtime": 21838.682, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.7343490304709142, + "grad_norm": 0.045477692037820816, + "learning_rate": 9.899125636152458e-05, + "loss": 0.01130886934697628, + "num_input_tokens_seen": 43412776, + "step": 2651, + "train_runtime": 21846.9053, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 0.7346260387811634, + "grad_norm": 0.05932047590613365, + "learning_rate": 9.899037776421318e-05, + "loss": 0.014815169386565685, + "num_input_tokens_seen": 43429152, + "step": 2652, + "train_runtime": 21855.1372, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.7349030470914127, + "grad_norm": 0.0993775799870491, + "learning_rate": 9.898949878834964e-05, + "loss": 0.015535587444901466, + "num_input_tokens_seen": 43445528, + "step": 2653, + "train_runtime": 21863.369, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.7351800554016621, + "grad_norm": 0.07741548866033554, + "learning_rate": 9.89886194339407e-05, + "loss": 0.012742456048727036, + "num_input_tokens_seen": 43461904, + "step": 2654, + "train_runtime": 21871.5965, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.7354570637119113, + "grad_norm": 0.0749627947807312, + "learning_rate": 9.898773970099317e-05, + "loss": 0.014758741483092308, + "num_input_tokens_seen": 43478280, + "step": 2655, + "train_runtime": 21879.8231, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.7357340720221607, + "grad_norm": 0.0736103504896164, + "learning_rate": 9.898685958951388e-05, + "loss": 0.012336365878582, + "num_input_tokens_seen": 43494656, + "step": 2656, + "train_runtime": 21888.057, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.7360110803324099, + "grad_norm": 0.05046707019209862, + "learning_rate": 9.89859790995096e-05, + "loss": 0.011489582248032093, + "num_input_tokens_seen": 43511032, + "step": 2657, + "train_runtime": 21896.2804, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.7362880886426593, + "grad_norm": 0.07161302864551544, + "learning_rate": 9.898509823098712e-05, + "loss": 0.01398652233183384, + "num_input_tokens_seen": 43527408, + "step": 2658, + "train_runtime": 21904.5128, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.7365650969529086, + "grad_norm": 0.06932394951581955, + "learning_rate": 9.898421698395328e-05, + "loss": 0.014173421077430248, + "num_input_tokens_seen": 43543784, + "step": 2659, + "train_runtime": 21912.743, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.7368421052631579, + "grad_norm": 0.07453209161758423, + "learning_rate": 9.898333535841486e-05, + "loss": 0.015811365097761154, + "num_input_tokens_seen": 43560160, + "step": 2660, + "train_runtime": 21920.9663, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.7371191135734072, + "grad_norm": 0.08752705901861191, + "learning_rate": 9.89824533543787e-05, + "loss": 0.017116151750087738, + "num_input_tokens_seen": 43576536, + "step": 2661, + "train_runtime": 21929.1821, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.7373961218836566, + "grad_norm": 0.07091358304023743, + "learning_rate": 9.89815709718516e-05, + "loss": 0.01537756435573101, + "num_input_tokens_seen": 43592912, + "step": 2662, + "train_runtime": 21937.3965, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.7376731301939058, + "grad_norm": 0.08245299756526947, + "learning_rate": 9.898068821084037e-05, + "loss": 0.01217895932495594, + "num_input_tokens_seen": 43609288, + "step": 2663, + "train_runtime": 21945.6049, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.7379501385041551, + "grad_norm": 0.07365156710147858, + "learning_rate": 9.897980507135184e-05, + "loss": 0.013310277834534645, + "num_input_tokens_seen": 43625664, + "step": 2664, + "train_runtime": 21953.8296, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.7382271468144044, + "grad_norm": 0.07886596024036407, + "learning_rate": 9.897892155339285e-05, + "loss": 0.012116189114749432, + "num_input_tokens_seen": 43642040, + "step": 2665, + "train_runtime": 21962.058, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.7385041551246537, + "grad_norm": 0.11818085610866547, + "learning_rate": 9.897803765697022e-05, + "loss": 0.016236521303653717, + "num_input_tokens_seen": 43658416, + "step": 2666, + "train_runtime": 21970.2872, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.7387811634349031, + "grad_norm": 0.07760980725288391, + "learning_rate": 9.897715338209077e-05, + "loss": 0.013315483927726746, + "num_input_tokens_seen": 43674792, + "step": 2667, + "train_runtime": 21978.5174, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.7390581717451523, + "grad_norm": 0.053511541336774826, + "learning_rate": 9.897626872876133e-05, + "loss": 0.01164399553090334, + "num_input_tokens_seen": 43691168, + "step": 2668, + "train_runtime": 21986.749, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 0.7393351800554017, + "grad_norm": 0.09897203743457794, + "learning_rate": 9.897538369698874e-05, + "loss": 0.016194699332118034, + "num_input_tokens_seen": 43707544, + "step": 2669, + "train_runtime": 21994.9739, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.739612188365651, + "grad_norm": 0.06863001734018326, + "learning_rate": 9.897449828677983e-05, + "loss": 0.01447139773517847, + "num_input_tokens_seen": 43723920, + "step": 2670, + "train_runtime": 22003.1922, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.7398891966759003, + "grad_norm": 0.04656248539686203, + "learning_rate": 9.897361249814146e-05, + "loss": 0.01100231148302555, + "num_input_tokens_seen": 43740296, + "step": 2671, + "train_runtime": 22011.4117, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.7401662049861496, + "grad_norm": 0.06876207888126373, + "learning_rate": 9.897272633108046e-05, + "loss": 0.016766194254159927, + "num_input_tokens_seen": 43756672, + "step": 2672, + "train_runtime": 22019.6331, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 0.7404432132963988, + "grad_norm": 0.09537838399410248, + "learning_rate": 9.89718397856037e-05, + "loss": 0.016182225197553635, + "num_input_tokens_seen": 43773048, + "step": 2673, + "train_runtime": 22027.8609, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.7407202216066482, + "grad_norm": 0.053167033940553665, + "learning_rate": 9.8970952861718e-05, + "loss": 0.014847766607999802, + "num_input_tokens_seen": 43789424, + "step": 2674, + "train_runtime": 22036.0878, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 0.7409972299168975, + "grad_norm": 0.0646461620926857, + "learning_rate": 9.897006555943022e-05, + "loss": 0.014065724797546864, + "num_input_tokens_seen": 43805800, + "step": 2675, + "train_runtime": 22044.3127, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.7412742382271468, + "grad_norm": 0.07698734849691391, + "learning_rate": 9.896917787874725e-05, + "loss": 0.012723240070044994, + "num_input_tokens_seen": 43822176, + "step": 2676, + "train_runtime": 22052.5396, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 0.7415512465373961, + "grad_norm": 0.07897045463323593, + "learning_rate": 9.89682898196759e-05, + "loss": 0.018626704812049866, + "num_input_tokens_seen": 43838552, + "step": 2677, + "train_runtime": 22060.7645, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.7418282548476455, + "grad_norm": 0.07665813714265823, + "learning_rate": 9.896740138222306e-05, + "loss": 0.01720437780022621, + "num_input_tokens_seen": 43854928, + "step": 2678, + "train_runtime": 22068.9994, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.7421052631578947, + "grad_norm": 0.09176413714885712, + "learning_rate": 9.896651256639559e-05, + "loss": 0.016844110563397408, + "num_input_tokens_seen": 43871304, + "step": 2679, + "train_runtime": 22077.2245, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.7423822714681441, + "grad_norm": 0.05479644238948822, + "learning_rate": 9.896562337220036e-05, + "loss": 0.011964869685471058, + "num_input_tokens_seen": 43887680, + "step": 2680, + "train_runtime": 22085.4564, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 0.7426592797783933, + "grad_norm": 0.06456651538610458, + "learning_rate": 9.896473379964424e-05, + "loss": 0.012952339835464954, + "num_input_tokens_seen": 43904056, + "step": 2681, + "train_runtime": 22093.6811, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 0.7429362880886426, + "grad_norm": 0.056340817362070084, + "learning_rate": 9.896384384873409e-05, + "loss": 0.013706308789551258, + "num_input_tokens_seen": 43920432, + "step": 2682, + "train_runtime": 22101.9138, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.743213296398892, + "grad_norm": 0.05353933945298195, + "learning_rate": 9.896295351947681e-05, + "loss": 0.013110063970088959, + "num_input_tokens_seen": 43936808, + "step": 2683, + "train_runtime": 22110.1372, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.7434903047091412, + "grad_norm": 0.08324996381998062, + "learning_rate": 9.896206281187928e-05, + "loss": 0.01628609001636505, + "num_input_tokens_seen": 43953184, + "step": 2684, + "train_runtime": 22118.3638, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 0.7437673130193906, + "grad_norm": 0.10476714372634888, + "learning_rate": 9.896117172594835e-05, + "loss": 0.01556359976530075, + "num_input_tokens_seen": 43969560, + "step": 2685, + "train_runtime": 22126.5913, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.7440443213296399, + "grad_norm": 0.14574366807937622, + "learning_rate": 9.896028026169094e-05, + "loss": 0.021650806069374084, + "num_input_tokens_seen": 43985936, + "step": 2686, + "train_runtime": 22134.8287, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.7443213296398892, + "grad_norm": 0.09233483672142029, + "learning_rate": 9.895938841911392e-05, + "loss": 0.01631813868880272, + "num_input_tokens_seen": 44002312, + "step": 2687, + "train_runtime": 22143.0645, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 0.7445983379501385, + "grad_norm": 0.07860902696847916, + "learning_rate": 9.895849619822419e-05, + "loss": 0.013557962141931057, + "num_input_tokens_seen": 44018688, + "step": 2688, + "train_runtime": 22151.2841, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 0.7448753462603878, + "grad_norm": 0.05485935136675835, + "learning_rate": 9.895760359902864e-05, + "loss": 0.013658962212502956, + "num_input_tokens_seen": 44035064, + "step": 2689, + "train_runtime": 22159.5004, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.7451523545706371, + "grad_norm": 0.054403156042099, + "learning_rate": 9.895671062153417e-05, + "loss": 0.012849872931838036, + "num_input_tokens_seen": 44051440, + "step": 2690, + "train_runtime": 22167.7163, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.7454293628808865, + "grad_norm": 0.06215257570147514, + "learning_rate": 9.895581726574768e-05, + "loss": 0.011503093875944614, + "num_input_tokens_seen": 44067816, + "step": 2691, + "train_runtime": 22175.9309, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.7457063711911357, + "grad_norm": 0.07125752419233322, + "learning_rate": 9.895492353167606e-05, + "loss": 0.013242093846201897, + "num_input_tokens_seen": 44084192, + "step": 2692, + "train_runtime": 22184.1412, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 0.745983379501385, + "grad_norm": 0.05393203720450401, + "learning_rate": 9.895402941932623e-05, + "loss": 0.012857906520366669, + "num_input_tokens_seen": 44100568, + "step": 2693, + "train_runtime": 22192.3556, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.7462603878116344, + "grad_norm": 0.052882395684719086, + "learning_rate": 9.89531349287051e-05, + "loss": 0.015481678768992424, + "num_input_tokens_seen": 44116944, + "step": 2694, + "train_runtime": 22200.569, + "train_tokens_per_second": 1987.199 + }, + { + "epoch": 0.7465373961218836, + "grad_norm": 0.10914599150419235, + "learning_rate": 9.895224005981958e-05, + "loss": 0.012406752444803715, + "num_input_tokens_seen": 44133320, + "step": 2695, + "train_runtime": 22208.7948, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.746814404432133, + "grad_norm": 0.1377367526292801, + "learning_rate": 9.895134481267659e-05, + "loss": 0.01879769004881382, + "num_input_tokens_seen": 44149696, + "step": 2696, + "train_runtime": 22217.0085, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.7470914127423822, + "grad_norm": 0.0694916695356369, + "learning_rate": 9.895044918728302e-05, + "loss": 0.012498145923018456, + "num_input_tokens_seen": 44166072, + "step": 2697, + "train_runtime": 22225.2212, + "train_tokens_per_second": 1987.205 + }, + { + "epoch": 0.7473684210526316, + "grad_norm": 0.17793244123458862, + "learning_rate": 9.894955318364583e-05, + "loss": 0.020748278126120567, + "num_input_tokens_seen": 44182448, + "step": 2698, + "train_runtime": 22233.4315, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.7476454293628809, + "grad_norm": 0.07337184250354767, + "learning_rate": 9.89486568017719e-05, + "loss": 0.013852769508957863, + "num_input_tokens_seen": 44198824, + "step": 2699, + "train_runtime": 22241.6409, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 0.7479224376731302, + "grad_norm": 0.08647233992815018, + "learning_rate": 9.89477600416682e-05, + "loss": 0.01722142845392227, + "num_input_tokens_seen": 44215200, + "step": 2700, + "train_runtime": 22249.8568, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.7481994459833795, + "grad_norm": 0.055679503828287125, + "learning_rate": 9.894686290334163e-05, + "loss": 0.01597537286579609, + "num_input_tokens_seen": 44231576, + "step": 2701, + "train_runtime": 22259.6579, + "train_tokens_per_second": 1987.073 + }, + { + "epoch": 0.7484764542936289, + "grad_norm": 0.05741174891591072, + "learning_rate": 9.894596538679914e-05, + "loss": 0.015968510881066322, + "num_input_tokens_seen": 44247952, + "step": 2702, + "train_runtime": 22267.8892, + "train_tokens_per_second": 1987.074 + }, + { + "epoch": 0.7487534626038781, + "grad_norm": 0.05837472155690193, + "learning_rate": 9.894506749204766e-05, + "loss": 0.011958980932831764, + "num_input_tokens_seen": 44264328, + "step": 2703, + "train_runtime": 22276.1146, + "train_tokens_per_second": 1987.076 + }, + { + "epoch": 0.7490304709141274, + "grad_norm": 0.05328691750764847, + "learning_rate": 9.894416921909412e-05, + "loss": 0.014019351452589035, + "num_input_tokens_seen": 44280704, + "step": 2704, + "train_runtime": 22284.3387, + "train_tokens_per_second": 1987.077 + }, + { + "epoch": 0.7493074792243767, + "grad_norm": 0.07131396234035492, + "learning_rate": 9.894327056794547e-05, + "loss": 0.013230985030531883, + "num_input_tokens_seen": 44297080, + "step": 2705, + "train_runtime": 22292.5662, + "train_tokens_per_second": 1987.079 + }, + { + "epoch": 0.749584487534626, + "grad_norm": 0.06430681049823761, + "learning_rate": 9.894237153860865e-05, + "loss": 0.01314451266080141, + "num_input_tokens_seen": 44313456, + "step": 2706, + "train_runtime": 22300.7954, + "train_tokens_per_second": 1987.08 + }, + { + "epoch": 0.7498614958448754, + "grad_norm": 0.09428892284631729, + "learning_rate": 9.89414721310906e-05, + "loss": 0.01360138040035963, + "num_input_tokens_seen": 44329832, + "step": 2707, + "train_runtime": 22309.0231, + "train_tokens_per_second": 1987.081 + }, + { + "epoch": 0.7501385041551246, + "grad_norm": 0.06546631455421448, + "learning_rate": 9.894057234539829e-05, + "loss": 0.0141298184171319, + "num_input_tokens_seen": 44346208, + "step": 2708, + "train_runtime": 22317.2568, + "train_tokens_per_second": 1987.081 + }, + { + "epoch": 0.750415512465374, + "grad_norm": 0.0657321959733963, + "learning_rate": 9.893967218153863e-05, + "loss": 0.011592009104788303, + "num_input_tokens_seen": 44362584, + "step": 2709, + "train_runtime": 22325.4867, + "train_tokens_per_second": 1987.082 + }, + { + "epoch": 0.7506925207756233, + "grad_norm": 0.05897582694888115, + "learning_rate": 9.893877163951863e-05, + "loss": 0.011013180017471313, + "num_input_tokens_seen": 44378960, + "step": 2710, + "train_runtime": 22333.7258, + "train_tokens_per_second": 1987.083 + }, + { + "epoch": 0.7509695290858726, + "grad_norm": 0.06721258163452148, + "learning_rate": 9.893787071934522e-05, + "loss": 0.014163301326334476, + "num_input_tokens_seen": 44395336, + "step": 2711, + "train_runtime": 22341.9569, + "train_tokens_per_second": 1987.084 + }, + { + "epoch": 0.7512465373961219, + "grad_norm": 0.07331104576587677, + "learning_rate": 9.893696942102538e-05, + "loss": 0.015414642170071602, + "num_input_tokens_seen": 44411712, + "step": 2712, + "train_runtime": 22350.189, + "train_tokens_per_second": 1987.084 + }, + { + "epoch": 0.7515235457063711, + "grad_norm": 0.04691077023744583, + "learning_rate": 9.893606774456604e-05, + "loss": 0.013504615984857082, + "num_input_tokens_seen": 44428088, + "step": 2713, + "train_runtime": 22358.4213, + "train_tokens_per_second": 1987.085 + }, + { + "epoch": 0.7518005540166205, + "grad_norm": 0.07702714204788208, + "learning_rate": 9.893516568997418e-05, + "loss": 0.014830422587692738, + "num_input_tokens_seen": 44444464, + "step": 2714, + "train_runtime": 22366.6464, + "train_tokens_per_second": 1987.087 + }, + { + "epoch": 0.7520775623268698, + "grad_norm": 0.06773880124092102, + "learning_rate": 9.893426325725681e-05, + "loss": 0.012943606823682785, + "num_input_tokens_seen": 44460840, + "step": 2715, + "train_runtime": 22374.8792, + "train_tokens_per_second": 1987.087 + }, + { + "epoch": 0.7523545706371191, + "grad_norm": 0.0679863840341568, + "learning_rate": 9.893336044642085e-05, + "loss": 0.013576199300587177, + "num_input_tokens_seen": 44477216, + "step": 2716, + "train_runtime": 22383.1045, + "train_tokens_per_second": 1987.089 + }, + { + "epoch": 0.7526315789473684, + "grad_norm": 0.09793952852487564, + "learning_rate": 9.89324572574733e-05, + "loss": 0.015842769294977188, + "num_input_tokens_seen": 44493592, + "step": 2717, + "train_runtime": 22391.3275, + "train_tokens_per_second": 1987.09 + }, + { + "epoch": 0.7529085872576178, + "grad_norm": 0.08642060309648514, + "learning_rate": 9.893155369042113e-05, + "loss": 0.01596515066921711, + "num_input_tokens_seen": 44509968, + "step": 2718, + "train_runtime": 22399.5407, + "train_tokens_per_second": 1987.093 + }, + { + "epoch": 0.753185595567867, + "grad_norm": 0.07388613373041153, + "learning_rate": 9.893064974527133e-05, + "loss": 0.0135203767567873, + "num_input_tokens_seen": 44526344, + "step": 2719, + "train_runtime": 22407.7561, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.7534626038781164, + "grad_norm": 0.08832784742116928, + "learning_rate": 9.892974542203088e-05, + "loss": 0.014620735310018063, + "num_input_tokens_seen": 44542720, + "step": 2720, + "train_runtime": 22415.964, + "train_tokens_per_second": 1987.098 + }, + { + "epoch": 0.7537396121883656, + "grad_norm": 0.08423144370317459, + "learning_rate": 9.892884072070677e-05, + "loss": 0.01275092177093029, + "num_input_tokens_seen": 44559096, + "step": 2721, + "train_runtime": 22424.1915, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.754016620498615, + "grad_norm": 0.09326353669166565, + "learning_rate": 9.8927935641306e-05, + "loss": 0.013396571390330791, + "num_input_tokens_seen": 44575472, + "step": 2722, + "train_runtime": 22432.4138, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.7542936288088643, + "grad_norm": 0.08296901732683182, + "learning_rate": 9.892703018383554e-05, + "loss": 0.013783978298306465, + "num_input_tokens_seen": 44591848, + "step": 2723, + "train_runtime": 22440.6398, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.7545706371191135, + "grad_norm": 0.06766335666179657, + "learning_rate": 9.892612434830241e-05, + "loss": 0.013095257803797722, + "num_input_tokens_seen": 44608224, + "step": 2724, + "train_runtime": 22448.8685, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.7548476454293629, + "grad_norm": 0.0776444524526596, + "learning_rate": 9.89252181347136e-05, + "loss": 0.01611190289258957, + "num_input_tokens_seen": 44624600, + "step": 2725, + "train_runtime": 22457.0909, + "train_tokens_per_second": 1987.105 + }, + { + "epoch": 0.7551246537396122, + "grad_norm": 0.0697479248046875, + "learning_rate": 9.89243115430761e-05, + "loss": 0.011603694409132004, + "num_input_tokens_seen": 44640976, + "step": 2726, + "train_runtime": 22465.3266, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.7554016620498615, + "grad_norm": 0.0633765161037445, + "learning_rate": 9.892340457339695e-05, + "loss": 0.014517595991492271, + "num_input_tokens_seen": 44657352, + "step": 2727, + "train_runtime": 22473.5483, + "train_tokens_per_second": 1987.107 + }, + { + "epoch": 0.7556786703601108, + "grad_norm": 0.07512036710977554, + "learning_rate": 9.892249722568313e-05, + "loss": 0.016728276386857033, + "num_input_tokens_seen": 44673728, + "step": 2728, + "train_runtime": 22481.7752, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.7559556786703601, + "grad_norm": 0.06261610984802246, + "learning_rate": 9.892158949994166e-05, + "loss": 0.015236278995871544, + "num_input_tokens_seen": 44690104, + "step": 2729, + "train_runtime": 22490.0031, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.7562326869806094, + "grad_norm": 0.08269492536783218, + "learning_rate": 9.892068139617953e-05, + "loss": 0.012960094958543777, + "num_input_tokens_seen": 44706480, + "step": 2730, + "train_runtime": 22498.2334, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.7565096952908588, + "grad_norm": 0.07929860800504684, + "learning_rate": 9.89197729144038e-05, + "loss": 0.017456233501434326, + "num_input_tokens_seen": 44722856, + "step": 2731, + "train_runtime": 22506.4572, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.756786703601108, + "grad_norm": 0.0645526871085167, + "learning_rate": 9.891886405462148e-05, + "loss": 0.015767376869916916, + "num_input_tokens_seen": 44739232, + "step": 2732, + "train_runtime": 22514.6749, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.7570637119113574, + "grad_norm": 0.05226311832666397, + "learning_rate": 9.891795481683955e-05, + "loss": 0.015629468485713005, + "num_input_tokens_seen": 44755608, + "step": 2733, + "train_runtime": 22522.9018, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.7573407202216067, + "grad_norm": 0.06421778351068497, + "learning_rate": 9.891704520106509e-05, + "loss": 0.013572080992162228, + "num_input_tokens_seen": 44771984, + "step": 2734, + "train_runtime": 22531.1256, + "train_tokens_per_second": 1987.117 + }, + { + "epoch": 0.7576177285318559, + "grad_norm": 0.07560236752033234, + "learning_rate": 9.89161352073051e-05, + "loss": 0.016786077991127968, + "num_input_tokens_seen": 44788360, + "step": 2735, + "train_runtime": 22539.3415, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.7578947368421053, + "grad_norm": 0.06426254659891129, + "learning_rate": 9.891522483556661e-05, + "loss": 0.01112865749746561, + "num_input_tokens_seen": 44804736, + "step": 2736, + "train_runtime": 22547.5583, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.7581717451523545, + "grad_norm": 0.064529649913311, + "learning_rate": 9.891431408585666e-05, + "loss": 0.013511287048459053, + "num_input_tokens_seen": 44821112, + "step": 2737, + "train_runtime": 22555.7837, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.7584487534626039, + "grad_norm": 0.08564779907464981, + "learning_rate": 9.89134029581823e-05, + "loss": 0.012112179771065712, + "num_input_tokens_seen": 44837488, + "step": 2738, + "train_runtime": 22564.0098, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.7587257617728532, + "grad_norm": 0.10153686255216599, + "learning_rate": 9.891249145255055e-05, + "loss": 0.01394668873399496, + "num_input_tokens_seen": 44853864, + "step": 2739, + "train_runtime": 22572.224, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.7590027700831025, + "grad_norm": 0.05403409153223038, + "learning_rate": 9.891157956896848e-05, + "loss": 0.011798469349741936, + "num_input_tokens_seen": 44870240, + "step": 2740, + "train_runtime": 22580.4361, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.7592797783933518, + "grad_norm": 0.0587361678481102, + "learning_rate": 9.89106673074431e-05, + "loss": 0.013651175424456596, + "num_input_tokens_seen": 44886616, + "step": 2741, + "train_runtime": 22588.6464, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.7595567867036012, + "grad_norm": 0.044267088174819946, + "learning_rate": 9.890975466798149e-05, + "loss": 0.011554024182260036, + "num_input_tokens_seen": 44902992, + "step": 2742, + "train_runtime": 22596.8563, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.7598337950138504, + "grad_norm": 0.07718314230442047, + "learning_rate": 9.890884165059068e-05, + "loss": 0.015408793464303017, + "num_input_tokens_seen": 44919368, + "step": 2743, + "train_runtime": 22605.0666, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.7601108033240997, + "grad_norm": 0.07950770854949951, + "learning_rate": 9.890792825527773e-05, + "loss": 0.012249999679625034, + "num_input_tokens_seen": 44935744, + "step": 2744, + "train_runtime": 22613.2766, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.760387811634349, + "grad_norm": 0.038075949996709824, + "learning_rate": 9.890701448204972e-05, + "loss": 0.012689215131103992, + "num_input_tokens_seen": 44952120, + "step": 2745, + "train_runtime": 22621.5046, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.7606648199445983, + "grad_norm": 0.09239771962165833, + "learning_rate": 9.89061003309137e-05, + "loss": 0.013491960242390633, + "num_input_tokens_seen": 44968496, + "step": 2746, + "train_runtime": 22629.7254, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.7609418282548477, + "grad_norm": 0.06486044824123383, + "learning_rate": 9.890518580187671e-05, + "loss": 0.013647961430251598, + "num_input_tokens_seen": 44984872, + "step": 2747, + "train_runtime": 22637.957, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.7612188365650969, + "grad_norm": 0.04937870427966118, + "learning_rate": 9.890427089494583e-05, + "loss": 0.012505638413131237, + "num_input_tokens_seen": 45001248, + "step": 2748, + "train_runtime": 22646.1827, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.7614958448753463, + "grad_norm": 0.10843571275472641, + "learning_rate": 9.890335561012815e-05, + "loss": 0.011226803064346313, + "num_input_tokens_seen": 45017624, + "step": 2749, + "train_runtime": 22654.4092, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.7617728531855956, + "grad_norm": 0.0827600508928299, + "learning_rate": 9.890243994743071e-05, + "loss": 0.012426611967384815, + "num_input_tokens_seen": 45034000, + "step": 2750, + "train_runtime": 22662.6166, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 0.7620498614958449, + "grad_norm": 0.07631555944681168, + "learning_rate": 9.890152390686061e-05, + "loss": 0.01676388643682003, + "num_input_tokens_seen": 45050376, + "step": 2751, + "train_runtime": 22670.8285, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.7623268698060942, + "grad_norm": 0.08494363725185394, + "learning_rate": 9.890060748842493e-05, + "loss": 0.015149304643273354, + "num_input_tokens_seen": 45066752, + "step": 2752, + "train_runtime": 22679.0573, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.7626038781163434, + "grad_norm": 0.059370361268520355, + "learning_rate": 9.889969069213072e-05, + "loss": 0.012817101553082466, + "num_input_tokens_seen": 45083128, + "step": 2753, + "train_runtime": 22687.2872, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.7628808864265928, + "grad_norm": 0.05518084391951561, + "learning_rate": 9.88987735179851e-05, + "loss": 0.011805694550275803, + "num_input_tokens_seen": 45099504, + "step": 2754, + "train_runtime": 22695.5143, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.7631578947368421, + "grad_norm": 0.07378622889518738, + "learning_rate": 9.889785596599514e-05, + "loss": 0.016036422923207283, + "num_input_tokens_seen": 45115880, + "step": 2755, + "train_runtime": 22703.7403, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.7634349030470914, + "grad_norm": 0.05293147638440132, + "learning_rate": 9.889693803616793e-05, + "loss": 0.012933339923620224, + "num_input_tokens_seen": 45132256, + "step": 2756, + "train_runtime": 22711.961, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.7637119113573407, + "grad_norm": 0.06563162058591843, + "learning_rate": 9.889601972851057e-05, + "loss": 0.012113811448216438, + "num_input_tokens_seen": 45148632, + "step": 2757, + "train_runtime": 22720.1848, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.7639889196675901, + "grad_norm": 0.07849372178316116, + "learning_rate": 9.889510104303015e-05, + "loss": 0.014203374274075031, + "num_input_tokens_seen": 45165008, + "step": 2758, + "train_runtime": 22728.414, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.7642659279778393, + "grad_norm": 0.0853261947631836, + "learning_rate": 9.889418197973376e-05, + "loss": 0.016850221902132034, + "num_input_tokens_seen": 45181384, + "step": 2759, + "train_runtime": 22736.6381, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 0.7645429362880887, + "grad_norm": 0.056058142334222794, + "learning_rate": 9.889326253862852e-05, + "loss": 0.012028757482767105, + "num_input_tokens_seen": 45197760, + "step": 2760, + "train_runtime": 22744.8658, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.7648199445983379, + "grad_norm": 0.05502932518720627, + "learning_rate": 9.889234271972151e-05, + "loss": 0.009812496602535248, + "num_input_tokens_seen": 45214136, + "step": 2761, + "train_runtime": 22753.0954, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.7650969529085873, + "grad_norm": 0.0579071044921875, + "learning_rate": 9.889142252301988e-05, + "loss": 0.015719633549451828, + "num_input_tokens_seen": 45230512, + "step": 2762, + "train_runtime": 22761.3273, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.7653739612188366, + "grad_norm": 0.06684819608926773, + "learning_rate": 9.88905019485307e-05, + "loss": 0.012526476755738258, + "num_input_tokens_seen": 45246888, + "step": 2763, + "train_runtime": 22769.5578, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 0.7656509695290858, + "grad_norm": 0.04516398161649704, + "learning_rate": 9.888958099626109e-05, + "loss": 0.014189965091645718, + "num_input_tokens_seen": 45263264, + "step": 2764, + "train_runtime": 22777.7705, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.7659279778393352, + "grad_norm": 0.0754794031381607, + "learning_rate": 9.888865966621818e-05, + "loss": 0.012158693745732307, + "num_input_tokens_seen": 45279640, + "step": 2765, + "train_runtime": 22785.9897, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.7662049861495844, + "grad_norm": 0.06674270331859589, + "learning_rate": 9.888773795840909e-05, + "loss": 0.010676402598619461, + "num_input_tokens_seen": 45296016, + "step": 2766, + "train_runtime": 22794.2161, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 0.7664819944598338, + "grad_norm": 0.06160201504826546, + "learning_rate": 9.888681587284093e-05, + "loss": 0.013873063027858734, + "num_input_tokens_seen": 45312392, + "step": 2767, + "train_runtime": 22802.4442, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.7667590027700831, + "grad_norm": 0.07393747568130493, + "learning_rate": 9.888589340952084e-05, + "loss": 0.013682560995221138, + "num_input_tokens_seen": 45328768, + "step": 2768, + "train_runtime": 22810.673, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.7670360110803324, + "grad_norm": 0.07236498594284058, + "learning_rate": 9.888497056845591e-05, + "loss": 0.014375913888216019, + "num_input_tokens_seen": 45345144, + "step": 2769, + "train_runtime": 22818.9053, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.7673130193905817, + "grad_norm": 0.0749197006225586, + "learning_rate": 9.888404734965333e-05, + "loss": 0.014565852470695972, + "num_input_tokens_seen": 45361520, + "step": 2770, + "train_runtime": 22827.1262, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 0.7675900277008311, + "grad_norm": 0.05496964603662491, + "learning_rate": 9.888312375312019e-05, + "loss": 0.013889294117689133, + "num_input_tokens_seen": 45377896, + "step": 2771, + "train_runtime": 22835.3394, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.7678670360110803, + "grad_norm": 0.05488632991909981, + "learning_rate": 9.888219977886364e-05, + "loss": 0.012798105366528034, + "num_input_tokens_seen": 45394272, + "step": 2772, + "train_runtime": 22843.5445, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.7681440443213297, + "grad_norm": 0.06514378637075424, + "learning_rate": 9.888127542689081e-05, + "loss": 0.013274408876895905, + "num_input_tokens_seen": 45410648, + "step": 2773, + "train_runtime": 22851.7563, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 0.7684210526315789, + "grad_norm": 0.05824143812060356, + "learning_rate": 9.888035069720886e-05, + "loss": 0.013703704811632633, + "num_input_tokens_seen": 45427024, + "step": 2774, + "train_runtime": 22859.9664, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.7686980609418282, + "grad_norm": 0.07398642599582672, + "learning_rate": 9.887942558982492e-05, + "loss": 0.016180574893951416, + "num_input_tokens_seen": 45443400, + "step": 2775, + "train_runtime": 22868.1787, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.7689750692520776, + "grad_norm": 0.19704467058181763, + "learning_rate": 9.887850010474615e-05, + "loss": 0.015991060063242912, + "num_input_tokens_seen": 45459776, + "step": 2776, + "train_runtime": 22876.3877, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 0.7692520775623268, + "grad_norm": 0.1390792280435562, + "learning_rate": 9.88775742419797e-05, + "loss": 0.010920211672782898, + "num_input_tokens_seen": 45476152, + "step": 2777, + "train_runtime": 22884.5949, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.7695290858725762, + "grad_norm": 0.05958801507949829, + "learning_rate": 9.887664800153272e-05, + "loss": 0.01406643446534872, + "num_input_tokens_seen": 45492528, + "step": 2778, + "train_runtime": 22892.8047, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.7698060941828255, + "grad_norm": 0.048678066581487656, + "learning_rate": 9.887572138341237e-05, + "loss": 0.013518884778022766, + "num_input_tokens_seen": 45508904, + "step": 2779, + "train_runtime": 22901.0124, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.7700831024930748, + "grad_norm": 0.054797954857349396, + "learning_rate": 9.887479438762579e-05, + "loss": 0.015930859372019768, + "num_input_tokens_seen": 45525280, + "step": 2780, + "train_runtime": 22909.2207, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.7703601108033241, + "grad_norm": 0.06551222503185272, + "learning_rate": 9.887386701418018e-05, + "loss": 0.009932538494467735, + "num_input_tokens_seen": 45541656, + "step": 2781, + "train_runtime": 22917.4289, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.7706371191135734, + "grad_norm": 0.0805068388581276, + "learning_rate": 9.887293926308267e-05, + "loss": 0.012950525619089603, + "num_input_tokens_seen": 45558032, + "step": 2782, + "train_runtime": 22925.6386, + "train_tokens_per_second": 1987.209 + }, + { + "epoch": 0.7709141274238227, + "grad_norm": 0.06686072796583176, + "learning_rate": 9.887201113434046e-05, + "loss": 0.016433564946055412, + "num_input_tokens_seen": 45574408, + "step": 2783, + "train_runtime": 22933.8425, + "train_tokens_per_second": 1987.212 + }, + { + "epoch": 0.771191135734072, + "grad_norm": 0.08797693997621536, + "learning_rate": 9.88710826279607e-05, + "loss": 0.014689648523926735, + "num_input_tokens_seen": 45590784, + "step": 2784, + "train_runtime": 22942.0723, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.7714681440443213, + "grad_norm": 0.0815708190202713, + "learning_rate": 9.887015374395057e-05, + "loss": 0.013193057850003242, + "num_input_tokens_seen": 45607160, + "step": 2785, + "train_runtime": 22950.3043, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 0.7717451523545706, + "grad_norm": 0.033343005925416946, + "learning_rate": 9.886922448231726e-05, + "loss": 0.007998672313988209, + "num_input_tokens_seen": 45623536, + "step": 2786, + "train_runtime": 22958.5235, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 0.77202216066482, + "grad_norm": 0.05724345147609711, + "learning_rate": 9.886829484306795e-05, + "loss": 0.013635605573654175, + "num_input_tokens_seen": 45639912, + "step": 2787, + "train_runtime": 22966.7404, + "train_tokens_per_second": 1987.218 + }, + { + "epoch": 0.7722991689750692, + "grad_norm": 0.0998334139585495, + "learning_rate": 9.886736482620978e-05, + "loss": 0.017659282311797142, + "num_input_tokens_seen": 45656288, + "step": 2788, + "train_runtime": 22974.959, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 0.7725761772853186, + "grad_norm": 0.06426281481981277, + "learning_rate": 9.886643443174999e-05, + "loss": 0.015152068808674812, + "num_input_tokens_seen": 45672664, + "step": 2789, + "train_runtime": 22983.1794, + "train_tokens_per_second": 1987.221 + }, + { + "epoch": 0.7728531855955678, + "grad_norm": 0.07714379578828812, + "learning_rate": 9.886550365969575e-05, + "loss": 0.01605403982102871, + "num_input_tokens_seen": 45689040, + "step": 2790, + "train_runtime": 22991.3956, + "train_tokens_per_second": 1987.223 + }, + { + "epoch": 0.7731301939058172, + "grad_norm": 0.0955212414264679, + "learning_rate": 9.886457251005424e-05, + "loss": 0.01516078319400549, + "num_input_tokens_seen": 45705416, + "step": 2791, + "train_runtime": 22999.6244, + "train_tokens_per_second": 1987.224 + }, + { + "epoch": 0.7734072022160665, + "grad_norm": 0.06887014955282211, + "learning_rate": 9.886364098283267e-05, + "loss": 0.013291779905557632, + "num_input_tokens_seen": 45721792, + "step": 2792, + "train_runtime": 23007.8597, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 0.7736842105263158, + "grad_norm": 0.07382757216691971, + "learning_rate": 9.886270907803823e-05, + "loss": 0.014200017787516117, + "num_input_tokens_seen": 45738168, + "step": 2793, + "train_runtime": 23016.0861, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.7739612188365651, + "grad_norm": 0.06921607255935669, + "learning_rate": 9.886177679567814e-05, + "loss": 0.014491732232272625, + "num_input_tokens_seen": 45754544, + "step": 2794, + "train_runtime": 23024.3153, + "train_tokens_per_second": 1987.227 + }, + { + "epoch": 0.7742382271468145, + "grad_norm": 0.06817793101072311, + "learning_rate": 9.886084413575958e-05, + "loss": 0.013675055466592312, + "num_input_tokens_seen": 45770920, + "step": 2795, + "train_runtime": 23032.5457, + "train_tokens_per_second": 1987.228 + }, + { + "epoch": 0.7745152354570637, + "grad_norm": 0.0665111243724823, + "learning_rate": 9.885991109828976e-05, + "loss": 0.013087740167975426, + "num_input_tokens_seen": 45787296, + "step": 2796, + "train_runtime": 23040.7745, + "train_tokens_per_second": 1987.229 + }, + { + "epoch": 0.774792243767313, + "grad_norm": 0.09187150746583939, + "learning_rate": 9.88589776832759e-05, + "loss": 0.017633620649576187, + "num_input_tokens_seen": 45803672, + "step": 2797, + "train_runtime": 23048.9953, + "train_tokens_per_second": 1987.231 + }, + { + "epoch": 0.7750692520775623, + "grad_norm": 0.07038883864879608, + "learning_rate": 9.885804389072521e-05, + "loss": 0.01685161329805851, + "num_input_tokens_seen": 45820048, + "step": 2798, + "train_runtime": 23057.226, + "train_tokens_per_second": 1987.232 + }, + { + "epoch": 0.7753462603878116, + "grad_norm": 0.08633486181497574, + "learning_rate": 9.88571097206449e-05, + "loss": 0.017398901283740997, + "num_input_tokens_seen": 45836424, + "step": 2799, + "train_runtime": 23065.457, + "train_tokens_per_second": 1987.232 + }, + { + "epoch": 0.775623268698061, + "grad_norm": 0.06576185673475266, + "learning_rate": 9.885617517304219e-05, + "loss": 0.013295881450176239, + "num_input_tokens_seen": 45852800, + "step": 2800, + "train_runtime": 23073.6882, + "train_tokens_per_second": 1987.233 + }, + { + "epoch": 0.7759002770083102, + "grad_norm": 0.05579021945595741, + "learning_rate": 9.88552402479243e-05, + "loss": 0.015601390041410923, + "num_input_tokens_seen": 45869176, + "step": 2801, + "train_runtime": 23083.7054, + "train_tokens_per_second": 1987.08 + }, + { + "epoch": 0.7761772853185596, + "grad_norm": 0.0390811413526535, + "learning_rate": 9.885430494529846e-05, + "loss": 0.010463660582900047, + "num_input_tokens_seen": 45885552, + "step": 2802, + "train_runtime": 23091.9294, + "train_tokens_per_second": 1987.082 + }, + { + "epoch": 0.7764542936288089, + "grad_norm": 0.050751153379678726, + "learning_rate": 9.88533692651719e-05, + "loss": 0.011760728433728218, + "num_input_tokens_seen": 45901928, + "step": 2803, + "train_runtime": 23100.1438, + "train_tokens_per_second": 1987.084 + }, + { + "epoch": 0.7767313019390581, + "grad_norm": 0.05926036462187767, + "learning_rate": 9.885243320755184e-05, + "loss": 0.01207482535392046, + "num_input_tokens_seen": 45918304, + "step": 2804, + "train_runtime": 23108.363, + "train_tokens_per_second": 1987.086 + }, + { + "epoch": 0.7770083102493075, + "grad_norm": 0.05781317502260208, + "learning_rate": 9.885149677244552e-05, + "loss": 0.01218110416084528, + "num_input_tokens_seen": 45934680, + "step": 2805, + "train_runtime": 23116.5815, + "train_tokens_per_second": 1987.088 + }, + { + "epoch": 0.7772853185595567, + "grad_norm": 0.05474071204662323, + "learning_rate": 9.885055995986017e-05, + "loss": 0.012093819677829742, + "num_input_tokens_seen": 45951056, + "step": 2806, + "train_runtime": 23124.805, + "train_tokens_per_second": 1987.089 + }, + { + "epoch": 0.7775623268698061, + "grad_norm": 0.05122913420200348, + "learning_rate": 9.884962276980303e-05, + "loss": 0.01225178875029087, + "num_input_tokens_seen": 45967432, + "step": 2807, + "train_runtime": 23133.0272, + "train_tokens_per_second": 1987.091 + }, + { + "epoch": 0.7778393351800554, + "grad_norm": 0.08627456426620483, + "learning_rate": 9.884868520228135e-05, + "loss": 0.01678445003926754, + "num_input_tokens_seen": 45983808, + "step": 2808, + "train_runtime": 23141.2576, + "train_tokens_per_second": 1987.092 + }, + { + "epoch": 0.7781163434903047, + "grad_norm": 0.08339132368564606, + "learning_rate": 9.884774725730235e-05, + "loss": 0.015786180272698402, + "num_input_tokens_seen": 46000184, + "step": 2809, + "train_runtime": 23149.4928, + "train_tokens_per_second": 1987.093 + }, + { + "epoch": 0.778393351800554, + "grad_norm": 0.0667184367775917, + "learning_rate": 9.884680893487332e-05, + "loss": 0.013192696496844292, + "num_input_tokens_seen": 46016560, + "step": 2810, + "train_runtime": 23157.7225, + "train_tokens_per_second": 1987.094 + }, + { + "epoch": 0.7786703601108034, + "grad_norm": 0.08784326910972595, + "learning_rate": 9.884587023500147e-05, + "loss": 0.016590718179941177, + "num_input_tokens_seen": 46032936, + "step": 2811, + "train_runtime": 23165.9573, + "train_tokens_per_second": 1987.094 + }, + { + "epoch": 0.7789473684210526, + "grad_norm": 0.05569564178586006, + "learning_rate": 9.884493115769407e-05, + "loss": 0.013175075873732567, + "num_input_tokens_seen": 46049312, + "step": 2812, + "train_runtime": 23174.1866, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.779224376731302, + "grad_norm": 0.15218155086040497, + "learning_rate": 9.884399170295839e-05, + "loss": 0.016826849430799484, + "num_input_tokens_seen": 46065688, + "step": 2813, + "train_runtime": 23182.418, + "train_tokens_per_second": 1987.096 + }, + { + "epoch": 0.7795013850415512, + "grad_norm": 0.05978861823678017, + "learning_rate": 9.884305187080167e-05, + "loss": 0.014073027297854424, + "num_input_tokens_seen": 46082064, + "step": 2814, + "train_runtime": 23190.6412, + "train_tokens_per_second": 1987.097 + }, + { + "epoch": 0.7797783933518005, + "grad_norm": 0.05468841269612312, + "learning_rate": 9.884211166123116e-05, + "loss": 0.012916183099150658, + "num_input_tokens_seen": 46098440, + "step": 2815, + "train_runtime": 23198.8715, + "train_tokens_per_second": 1987.098 + }, + { + "epoch": 0.7800554016620499, + "grad_norm": 0.07112068682909012, + "learning_rate": 9.884117107425417e-05, + "loss": 0.0105360709130764, + "num_input_tokens_seen": 46114816, + "step": 2816, + "train_runtime": 23207.1003, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.7803324099722991, + "grad_norm": 0.056254900991916656, + "learning_rate": 9.884023010987793e-05, + "loss": 0.012282677926123142, + "num_input_tokens_seen": 46131192, + "step": 2817, + "train_runtime": 23215.3305, + "train_tokens_per_second": 1987.1 + }, + { + "epoch": 0.7806094182825485, + "grad_norm": 0.09987155348062515, + "learning_rate": 9.88392887681097e-05, + "loss": 0.020707866176962852, + "num_input_tokens_seen": 46147568, + "step": 2818, + "train_runtime": 23223.5631, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.7808864265927978, + "grad_norm": 0.08026977628469467, + "learning_rate": 9.88383470489568e-05, + "loss": 0.015117183327674866, + "num_input_tokens_seen": 46163944, + "step": 2819, + "train_runtime": 23231.7902, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.7811634349030471, + "grad_norm": 0.0660996213555336, + "learning_rate": 9.883740495242648e-05, + "loss": 0.01480012945830822, + "num_input_tokens_seen": 46180320, + "step": 2820, + "train_runtime": 23240.0288, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.7814404432132964, + "grad_norm": 0.10647059231996536, + "learning_rate": 9.883646247852603e-05, + "loss": 0.015001215040683746, + "num_input_tokens_seen": 46196696, + "step": 2821, + "train_runtime": 23248.2667, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.7817174515235457, + "grad_norm": 0.06998641788959503, + "learning_rate": 9.883551962726271e-05, + "loss": 0.013782117515802383, + "num_input_tokens_seen": 46213072, + "step": 2822, + "train_runtime": 23256.4965, + "train_tokens_per_second": 1987.104 + }, + { + "epoch": 0.781994459833795, + "grad_norm": 0.04741765558719635, + "learning_rate": 9.883457639864382e-05, + "loss": 0.01170451007783413, + "num_input_tokens_seen": 46229448, + "step": 2823, + "train_runtime": 23264.7194, + "train_tokens_per_second": 1987.105 + }, + { + "epoch": 0.7822714681440444, + "grad_norm": 0.05626140907406807, + "learning_rate": 9.883363279267665e-05, + "loss": 0.013108796440064907, + "num_input_tokens_seen": 46245824, + "step": 2824, + "train_runtime": 23272.9576, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.7825484764542936, + "grad_norm": 0.04470635950565338, + "learning_rate": 9.883268880936849e-05, + "loss": 0.014121513813734055, + "num_input_tokens_seen": 46262200, + "step": 2825, + "train_runtime": 23281.1893, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.782825484764543, + "grad_norm": 0.11646144837141037, + "learning_rate": 9.883174444872663e-05, + "loss": 0.0151287242770195, + "num_input_tokens_seen": 46278576, + "step": 2826, + "train_runtime": 23289.4173, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.7831024930747923, + "grad_norm": 0.11701042950153351, + "learning_rate": 9.883079971075838e-05, + "loss": 0.014013620093464851, + "num_input_tokens_seen": 46294952, + "step": 2827, + "train_runtime": 23297.6584, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.7833795013850415, + "grad_norm": 0.07504182308912277, + "learning_rate": 9.882985459547102e-05, + "loss": 0.012170948088169098, + "num_input_tokens_seen": 46311328, + "step": 2828, + "train_runtime": 23305.8902, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.7836565096952909, + "grad_norm": 0.060448866337537766, + "learning_rate": 9.882890910287186e-05, + "loss": 0.015278724953532219, + "num_input_tokens_seen": 46327704, + "step": 2829, + "train_runtime": 23314.1155, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.7839335180055401, + "grad_norm": 0.108954519033432, + "learning_rate": 9.882796323296821e-05, + "loss": 0.013161188922822475, + "num_input_tokens_seen": 46344080, + "step": 2830, + "train_runtime": 23322.3364, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.7842105263157895, + "grad_norm": 0.06146970018744469, + "learning_rate": 9.88270169857674e-05, + "loss": 0.015496176667511463, + "num_input_tokens_seen": 46360456, + "step": 2831, + "train_runtime": 23330.5624, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.7844875346260388, + "grad_norm": 0.07926181703805923, + "learning_rate": 9.88260703612767e-05, + "loss": 0.013908729888498783, + "num_input_tokens_seen": 46376832, + "step": 2832, + "train_runtime": 23338.7884, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.7847645429362881, + "grad_norm": 0.05042561516165733, + "learning_rate": 9.882512335950344e-05, + "loss": 0.013696755282580853, + "num_input_tokens_seen": 46393208, + "step": 2833, + "train_runtime": 23347.0054, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.7850415512465374, + "grad_norm": 0.0670090764760971, + "learning_rate": 9.882417598045497e-05, + "loss": 0.012861893512308598, + "num_input_tokens_seen": 46409584, + "step": 2834, + "train_runtime": 23355.2264, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.7853185595567868, + "grad_norm": 0.03133324533700943, + "learning_rate": 9.882322822413856e-05, + "loss": 0.011997041292488575, + "num_input_tokens_seen": 46425960, + "step": 2835, + "train_runtime": 23363.4584, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.785595567867036, + "grad_norm": 0.06928440928459167, + "learning_rate": 9.882228009056155e-05, + "loss": 0.01703544333577156, + "num_input_tokens_seen": 46442336, + "step": 2836, + "train_runtime": 23371.6806, + "train_tokens_per_second": 1987.12 + }, + { + "epoch": 0.7858725761772853, + "grad_norm": 0.05118201673030853, + "learning_rate": 9.88213315797313e-05, + "loss": 0.014578202739357948, + "num_input_tokens_seen": 46458712, + "step": 2837, + "train_runtime": 23379.9105, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.7861495844875346, + "grad_norm": 0.04841691628098488, + "learning_rate": 9.88203826916551e-05, + "loss": 0.014649301767349243, + "num_input_tokens_seen": 46475088, + "step": 2838, + "train_runtime": 23388.1411, + "train_tokens_per_second": 1987.122 + }, + { + "epoch": 0.7864265927977839, + "grad_norm": 0.06578049808740616, + "learning_rate": 9.881943342634029e-05, + "loss": 0.011576908640563488, + "num_input_tokens_seen": 46491464, + "step": 2839, + "train_runtime": 23396.3711, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.7867036011080333, + "grad_norm": 0.06757106631994247, + "learning_rate": 9.881848378379421e-05, + "loss": 0.013114707544445992, + "num_input_tokens_seen": 46507840, + "step": 2840, + "train_runtime": 23404.5871, + "train_tokens_per_second": 1987.125 + }, + { + "epoch": 0.7869806094182825, + "grad_norm": 0.08043552190065384, + "learning_rate": 9.881753376402419e-05, + "loss": 0.014028497971594334, + "num_input_tokens_seen": 46524216, + "step": 2841, + "train_runtime": 23412.7988, + "train_tokens_per_second": 1987.127 + }, + { + "epoch": 0.7872576177285319, + "grad_norm": 0.06198875978589058, + "learning_rate": 9.881658336703759e-05, + "loss": 0.013255506753921509, + "num_input_tokens_seen": 46540592, + "step": 2842, + "train_runtime": 23421.0134, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.7875346260387812, + "grad_norm": 0.07566777616739273, + "learning_rate": 9.881563259284174e-05, + "loss": 0.013458599336445332, + "num_input_tokens_seen": 46556968, + "step": 2843, + "train_runtime": 23429.2418, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 0.7878116343490305, + "grad_norm": 0.06536279618740082, + "learning_rate": 9.881468144144397e-05, + "loss": 0.01429362129420042, + "num_input_tokens_seen": 46573344, + "step": 2844, + "train_runtime": 23437.4701, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.7880886426592798, + "grad_norm": 0.042149618268013, + "learning_rate": 9.881372991285167e-05, + "loss": 0.011720485053956509, + "num_input_tokens_seen": 46589720, + "step": 2845, + "train_runtime": 23445.697, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.788365650969529, + "grad_norm": 0.06333933025598526, + "learning_rate": 9.881277800707217e-05, + "loss": 0.013400847092270851, + "num_input_tokens_seen": 46606096, + "step": 2846, + "train_runtime": 23453.9266, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.7886426592797784, + "grad_norm": 0.06017325073480606, + "learning_rate": 9.881182572411281e-05, + "loss": 0.014783472754061222, + "num_input_tokens_seen": 46622472, + "step": 2847, + "train_runtime": 23462.141, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 0.7889196675900277, + "grad_norm": 0.06713388860225677, + "learning_rate": 9.881087306398097e-05, + "loss": 0.012811650522053242, + "num_input_tokens_seen": 46638848, + "step": 2848, + "train_runtime": 23470.3678, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.789196675900277, + "grad_norm": 0.051195383071899414, + "learning_rate": 9.880992002668401e-05, + "loss": 0.013099513947963715, + "num_input_tokens_seen": 46655224, + "step": 2849, + "train_runtime": 23478.5978, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.7894736842105263, + "grad_norm": 0.07428339868783951, + "learning_rate": 9.880896661222928e-05, + "loss": 0.01608060859143734, + "num_input_tokens_seen": 46671600, + "step": 2850, + "train_runtime": 23486.8288, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.7897506925207757, + "grad_norm": 0.08585640788078308, + "learning_rate": 9.880801282062417e-05, + "loss": 0.014679157175123692, + "num_input_tokens_seen": 46687976, + "step": 2851, + "train_runtime": 23495.0577, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.7900277008310249, + "grad_norm": 0.042084772139787674, + "learning_rate": 9.880705865187603e-05, + "loss": 0.014499365352094173, + "num_input_tokens_seen": 46704352, + "step": 2852, + "train_runtime": 23503.2856, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.7903047091412743, + "grad_norm": 0.04892360791563988, + "learning_rate": 9.880610410599223e-05, + "loss": 0.011819222010672092, + "num_input_tokens_seen": 46720728, + "step": 2853, + "train_runtime": 23511.5, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.7905817174515235, + "grad_norm": 0.048397425562143326, + "learning_rate": 9.880514918298018e-05, + "loss": 0.009615667164325714, + "num_input_tokens_seen": 46737104, + "step": 2854, + "train_runtime": 23519.7097, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.7908587257617729, + "grad_norm": 0.05376385897397995, + "learning_rate": 9.880419388284721e-05, + "loss": 0.012258943170309067, + "num_input_tokens_seen": 46753480, + "step": 2855, + "train_runtime": 23527.914, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 0.7911357340720222, + "grad_norm": 0.0552612729370594, + "learning_rate": 9.880323820560073e-05, + "loss": 0.01163687277585268, + "num_input_tokens_seen": 46769856, + "step": 2856, + "train_runtime": 23536.1263, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.7914127423822714, + "grad_norm": 0.07662267237901688, + "learning_rate": 9.880228215124813e-05, + "loss": 0.011909357272088528, + "num_input_tokens_seen": 46786232, + "step": 2857, + "train_runtime": 23544.3459, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.7916897506925208, + "grad_norm": 0.07497550547122955, + "learning_rate": 9.880132571979679e-05, + "loss": 0.012643708847463131, + "num_input_tokens_seen": 46802608, + "step": 2858, + "train_runtime": 23552.5617, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.7919667590027701, + "grad_norm": 0.05915795639157295, + "learning_rate": 9.880036891125409e-05, + "loss": 0.013676398433744907, + "num_input_tokens_seen": 46818984, + "step": 2859, + "train_runtime": 23560.7815, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.7922437673130194, + "grad_norm": 0.05733364820480347, + "learning_rate": 9.879941172562743e-05, + "loss": 0.013504689559340477, + "num_input_tokens_seen": 46835360, + "step": 2860, + "train_runtime": 23568.9951, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.7925207756232687, + "grad_norm": 0.0739421397447586, + "learning_rate": 9.879845416292421e-05, + "loss": 0.015287507325410843, + "num_input_tokens_seen": 46851736, + "step": 2861, + "train_runtime": 23577.2001, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.792797783933518, + "grad_norm": 0.07445746660232544, + "learning_rate": 9.879749622315183e-05, + "loss": 0.01184770092368126, + "num_input_tokens_seen": 46868112, + "step": 2862, + "train_runtime": 23585.4105, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.7930747922437673, + "grad_norm": 0.034144673496484756, + "learning_rate": 9.87965379063177e-05, + "loss": 0.010324498638510704, + "num_input_tokens_seen": 46884488, + "step": 2863, + "train_runtime": 23593.6185, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.7933518005540167, + "grad_norm": 0.08053354173898697, + "learning_rate": 9.879557921242919e-05, + "loss": 0.01837514154613018, + "num_input_tokens_seen": 46900864, + "step": 2864, + "train_runtime": 23601.8326, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.7936288088642659, + "grad_norm": 0.11746218055486679, + "learning_rate": 9.879462014149374e-05, + "loss": 0.017635224387049675, + "num_input_tokens_seen": 46917240, + "step": 2865, + "train_runtime": 23610.0574, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 0.7939058171745152, + "grad_norm": 0.05795081704854965, + "learning_rate": 9.879366069351876e-05, + "loss": 0.011660165153443813, + "num_input_tokens_seen": 46933616, + "step": 2866, + "train_runtime": 23618.2841, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.7941828254847645, + "grad_norm": 0.06551480293273926, + "learning_rate": 9.879270086851166e-05, + "loss": 0.015215912833809853, + "num_input_tokens_seen": 46949992, + "step": 2867, + "train_runtime": 23626.5086, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.7944598337950138, + "grad_norm": 0.09152200073003769, + "learning_rate": 9.879174066647985e-05, + "loss": 0.013185704126954079, + "num_input_tokens_seen": 46966368, + "step": 2868, + "train_runtime": 23634.7401, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.7947368421052632, + "grad_norm": 0.0577688030898571, + "learning_rate": 9.879078008743075e-05, + "loss": 0.012386184185743332, + "num_input_tokens_seen": 46982744, + "step": 2869, + "train_runtime": 23642.9661, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 0.7950138504155124, + "grad_norm": 0.06271252036094666, + "learning_rate": 9.878981913137179e-05, + "loss": 0.013184133917093277, + "num_input_tokens_seen": 46999120, + "step": 2870, + "train_runtime": 23651.1979, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 0.7952908587257618, + "grad_norm": 0.05307069420814514, + "learning_rate": 9.878885779831037e-05, + "loss": 0.013062546029686928, + "num_input_tokens_seen": 47015496, + "step": 2871, + "train_runtime": 23659.4312, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.7955678670360111, + "grad_norm": 0.08226348459720612, + "learning_rate": 9.878789608825396e-05, + "loss": 0.010223444551229477, + "num_input_tokens_seen": 47031872, + "step": 2872, + "train_runtime": 23667.6674, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.7958448753462604, + "grad_norm": 0.04944685474038124, + "learning_rate": 9.878693400120998e-05, + "loss": 0.012486201710999012, + "num_input_tokens_seen": 47048248, + "step": 2873, + "train_runtime": 23675.8924, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.7961218836565097, + "grad_norm": 0.08463560789823532, + "learning_rate": 9.878597153718584e-05, + "loss": 0.015941686928272247, + "num_input_tokens_seen": 47064624, + "step": 2874, + "train_runtime": 23684.1152, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 0.796398891966759, + "grad_norm": 0.05565151572227478, + "learning_rate": 9.878500869618899e-05, + "loss": 0.012054910883307457, + "num_input_tokens_seen": 47081000, + "step": 2875, + "train_runtime": 23692.3465, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.7966759002770083, + "grad_norm": 0.07578530162572861, + "learning_rate": 9.878404547822686e-05, + "loss": 0.01349600125104189, + "num_input_tokens_seen": 47097376, + "step": 2876, + "train_runtime": 23700.5724, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 0.7969529085872576, + "grad_norm": 0.05166560783982277, + "learning_rate": 9.878308188330692e-05, + "loss": 0.011415988206863403, + "num_input_tokens_seen": 47113752, + "step": 2877, + "train_runtime": 23708.8025, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 0.7972299168975069, + "grad_norm": 0.0550517737865448, + "learning_rate": 9.87821179114366e-05, + "loss": 0.013856110163033009, + "num_input_tokens_seen": 47130128, + "step": 2878, + "train_runtime": 23717.0342, + "train_tokens_per_second": 1987.185 + }, + { + "epoch": 0.7975069252077562, + "grad_norm": 0.07319103926420212, + "learning_rate": 9.878115356262334e-05, + "loss": 0.00991845317184925, + "num_input_tokens_seen": 47146504, + "step": 2879, + "train_runtime": 23725.2599, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.7977839335180056, + "grad_norm": 0.06909043341875076, + "learning_rate": 9.87801888368746e-05, + "loss": 0.013154987245798111, + "num_input_tokens_seen": 47162880, + "step": 2880, + "train_runtime": 23733.4822, + "train_tokens_per_second": 1987.188 + }, + { + "epoch": 0.7980609418282548, + "grad_norm": 0.06619327515363693, + "learning_rate": 9.877922373419786e-05, + "loss": 0.01630993001163006, + "num_input_tokens_seen": 47179256, + "step": 2881, + "train_runtime": 23741.7055, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.7983379501385042, + "grad_norm": 0.07801077514886856, + "learning_rate": 9.877825825460052e-05, + "loss": 0.014293648302555084, + "num_input_tokens_seen": 47195632, + "step": 2882, + "train_runtime": 23749.9333, + "train_tokens_per_second": 1987.19 + }, + { + "epoch": 0.7986149584487534, + "grad_norm": 0.08295145630836487, + "learning_rate": 9.877729239809008e-05, + "loss": 0.016281355172395706, + "num_input_tokens_seen": 47212008, + "step": 2883, + "train_runtime": 23758.1641, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.7988919667590028, + "grad_norm": 0.07122042775154114, + "learning_rate": 9.877632616467401e-05, + "loss": 0.010667161084711552, + "num_input_tokens_seen": 47228384, + "step": 2884, + "train_runtime": 23766.3942, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 0.7991689750692521, + "grad_norm": 0.05051134526729584, + "learning_rate": 9.877535955435974e-05, + "loss": 0.011693127453327179, + "num_input_tokens_seen": 47244760, + "step": 2885, + "train_runtime": 23774.6139, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 0.7994459833795013, + "grad_norm": 0.05448444187641144, + "learning_rate": 9.877439256715477e-05, + "loss": 0.01640714332461357, + "num_input_tokens_seen": 47261136, + "step": 2886, + "train_runtime": 23782.8561, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.7997229916897507, + "grad_norm": 0.08443493396043777, + "learning_rate": 9.877342520306659e-05, + "loss": 0.01363967452198267, + "num_input_tokens_seen": 47277512, + "step": 2887, + "train_runtime": 23791.0685, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.8, + "grad_norm": 0.08439821749925613, + "learning_rate": 9.87724574621026e-05, + "loss": 0.014266222715377808, + "num_input_tokens_seen": 47293888, + "step": 2888, + "train_runtime": 23799.2779, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.8002770083102493, + "grad_norm": 0.055484071373939514, + "learning_rate": 9.877148934427037e-05, + "loss": 0.012283045798540115, + "num_input_tokens_seen": 47310264, + "step": 2889, + "train_runtime": 23807.4905, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.8005540166204986, + "grad_norm": 0.04857178032398224, + "learning_rate": 9.87705208495773e-05, + "loss": 0.014403664506971836, + "num_input_tokens_seen": 47326640, + "step": 2890, + "train_runtime": 23815.718, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 0.8008310249307479, + "grad_norm": 0.07632966339588165, + "learning_rate": 9.876955197803093e-05, + "loss": 0.011455991305410862, + "num_input_tokens_seen": 47343016, + "step": 2891, + "train_runtime": 23823.9434, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.8011080332409972, + "grad_norm": 0.04542739689350128, + "learning_rate": 9.876858272963871e-05, + "loss": 0.011000685393810272, + "num_input_tokens_seen": 47359392, + "step": 2892, + "train_runtime": 23832.1774, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 0.8013850415512466, + "grad_norm": 0.08244240283966064, + "learning_rate": 9.876761310440817e-05, + "loss": 0.014721140265464783, + "num_input_tokens_seen": 47375768, + "step": 2893, + "train_runtime": 23840.4095, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 0.8016620498614958, + "grad_norm": 0.0444711335003376, + "learning_rate": 9.876664310234676e-05, + "loss": 0.011675555258989334, + "num_input_tokens_seen": 47392144, + "step": 2894, + "train_runtime": 23848.6397, + "train_tokens_per_second": 1987.205 + }, + { + "epoch": 0.8019390581717452, + "grad_norm": 0.05989229679107666, + "learning_rate": 9.8765672723462e-05, + "loss": 0.014574292115867138, + "num_input_tokens_seen": 47408520, + "step": 2895, + "train_runtime": 23856.8681, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.8022160664819945, + "grad_norm": 0.07370567321777344, + "learning_rate": 9.876470196776138e-05, + "loss": 0.01634531468153, + "num_input_tokens_seen": 47424896, + "step": 2896, + "train_runtime": 23865.0887, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.8024930747922437, + "grad_norm": 0.07012755423784256, + "learning_rate": 9.876373083525241e-05, + "loss": 0.013254844583570957, + "num_input_tokens_seen": 47441272, + "step": 2897, + "train_runtime": 23873.3084, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.8027700831024931, + "grad_norm": 0.04606231674551964, + "learning_rate": 9.876275932594256e-05, + "loss": 0.01308049913495779, + "num_input_tokens_seen": 47457648, + "step": 2898, + "train_runtime": 23881.5305, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 0.8030470914127423, + "grad_norm": 0.08210572600364685, + "learning_rate": 9.876178743983939e-05, + "loss": 0.013386963866651058, + "num_input_tokens_seen": 47474024, + "step": 2899, + "train_runtime": 23889.7465, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.8033240997229917, + "grad_norm": 0.059339240193367004, + "learning_rate": 9.876081517695038e-05, + "loss": 0.013525092974305153, + "num_input_tokens_seen": 47490400, + "step": 2900, + "train_runtime": 23897.9773, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 0.803601108033241, + "grad_norm": 0.04134247079491615, + "learning_rate": 9.875984253728306e-05, + "loss": 0.01088234968483448, + "num_input_tokens_seen": 47506776, + "step": 2901, + "train_runtime": 23907.7836, + "train_tokens_per_second": 1987.084 + }, + { + "epoch": 0.8038781163434903, + "grad_norm": 0.047109924256801605, + "learning_rate": 9.87588695208449e-05, + "loss": 0.01168452762067318, + "num_input_tokens_seen": 47523152, + "step": 2902, + "train_runtime": 23916.0107, + "train_tokens_per_second": 1987.085 + }, + { + "epoch": 0.8041551246537396, + "grad_norm": 0.05751391127705574, + "learning_rate": 9.875789612764346e-05, + "loss": 0.013956570997834206, + "num_input_tokens_seen": 47539528, + "step": 2903, + "train_runtime": 23924.2375, + "train_tokens_per_second": 1987.086 + }, + { + "epoch": 0.804432132963989, + "grad_norm": 0.062054816633462906, + "learning_rate": 9.875692235768627e-05, + "loss": 0.012268681079149246, + "num_input_tokens_seen": 47555904, + "step": 2904, + "train_runtime": 23932.4662, + "train_tokens_per_second": 1987.087 + }, + { + "epoch": 0.8047091412742382, + "grad_norm": 0.06056230515241623, + "learning_rate": 9.875594821098082e-05, + "loss": 0.012730496004223824, + "num_input_tokens_seen": 47572280, + "step": 2905, + "train_runtime": 23940.6871, + "train_tokens_per_second": 1987.089 + }, + { + "epoch": 0.8049861495844876, + "grad_norm": 0.04588078707456589, + "learning_rate": 9.875497368753466e-05, + "loss": 0.012542827986180782, + "num_input_tokens_seen": 47588656, + "step": 2906, + "train_runtime": 23948.9107, + "train_tokens_per_second": 1987.091 + }, + { + "epoch": 0.8052631578947368, + "grad_norm": 0.08242198079824448, + "learning_rate": 9.875399878735532e-05, + "loss": 0.014930852688848972, + "num_input_tokens_seen": 47605032, + "step": 2907, + "train_runtime": 23957.1354, + "train_tokens_per_second": 1987.092 + }, + { + "epoch": 0.8055401662049861, + "grad_norm": 0.08316002786159515, + "learning_rate": 9.875302351045031e-05, + "loss": 0.015284392051398754, + "num_input_tokens_seen": 47621408, + "step": 2908, + "train_runtime": 23965.3607, + "train_tokens_per_second": 1987.093 + }, + { + "epoch": 0.8058171745152355, + "grad_norm": 0.07534466683864594, + "learning_rate": 9.875204785682719e-05, + "loss": 0.01350906491279602, + "num_input_tokens_seen": 47637784, + "step": 2909, + "train_runtime": 23973.5889, + "train_tokens_per_second": 1987.094 + }, + { + "epoch": 0.8060941828254847, + "grad_norm": 0.08721563965082169, + "learning_rate": 9.87510718264935e-05, + "loss": 0.014860879629850388, + "num_input_tokens_seen": 47654160, + "step": 2910, + "train_runtime": 23981.8128, + "train_tokens_per_second": 1987.096 + }, + { + "epoch": 0.8063711911357341, + "grad_norm": 0.10294568538665771, + "learning_rate": 9.875009541945676e-05, + "loss": 0.01682443916797638, + "num_input_tokens_seen": 47670536, + "step": 2911, + "train_runtime": 23990.0412, + "train_tokens_per_second": 1987.097 + }, + { + "epoch": 0.8066481994459834, + "grad_norm": 0.13096745312213898, + "learning_rate": 9.874911863572453e-05, + "loss": 0.013389802537858486, + "num_input_tokens_seen": 47686912, + "step": 2912, + "train_runtime": 23998.2711, + "train_tokens_per_second": 1987.098 + }, + { + "epoch": 0.8069252077562327, + "grad_norm": 0.06820385903120041, + "learning_rate": 9.874814147530438e-05, + "loss": 0.013030856847763062, + "num_input_tokens_seen": 47703288, + "step": 2913, + "train_runtime": 24006.5006, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.807202216066482, + "grad_norm": 0.0658283606171608, + "learning_rate": 9.874716393820383e-05, + "loss": 0.014482216909527779, + "num_input_tokens_seen": 47719664, + "step": 2914, + "train_runtime": 24014.7212, + "train_tokens_per_second": 1987.1 + }, + { + "epoch": 0.8074792243767313, + "grad_norm": 0.05210626870393753, + "learning_rate": 9.874618602443042e-05, + "loss": 0.014995323494076729, + "num_input_tokens_seen": 47736040, + "step": 2915, + "train_runtime": 24022.9593, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.8077562326869806, + "grad_norm": 0.07398993521928787, + "learning_rate": 9.874520773399174e-05, + "loss": 0.010480135679244995, + "num_input_tokens_seen": 47752416, + "step": 2916, + "train_runtime": 24031.1819, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.80803324099723, + "grad_norm": 0.04654492065310478, + "learning_rate": 9.874422906689534e-05, + "loss": 0.010835009627044201, + "num_input_tokens_seen": 47768792, + "step": 2917, + "train_runtime": 24039.4143, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.8083102493074792, + "grad_norm": 0.075646311044693, + "learning_rate": 9.874325002314876e-05, + "loss": 0.0181724950671196, + "num_input_tokens_seen": 47785168, + "step": 2918, + "train_runtime": 24047.6488, + "train_tokens_per_second": 1987.104 + }, + { + "epoch": 0.8085872576177285, + "grad_norm": 0.05887891352176666, + "learning_rate": 9.87422706027596e-05, + "loss": 0.012919277884066105, + "num_input_tokens_seen": 47801544, + "step": 2919, + "train_runtime": 24055.8773, + "train_tokens_per_second": 1987.105 + }, + { + "epoch": 0.8088642659279779, + "grad_norm": 0.05624835938215256, + "learning_rate": 9.87412908057354e-05, + "loss": 0.01342066377401352, + "num_input_tokens_seen": 47817920, + "step": 2920, + "train_runtime": 24064.0987, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.8091412742382271, + "grad_norm": 0.05543890967965126, + "learning_rate": 9.874031063208376e-05, + "loss": 0.013258450664579868, + "num_input_tokens_seen": 47834296, + "step": 2921, + "train_runtime": 24072.3208, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.8094182825484765, + "grad_norm": 0.04409690573811531, + "learning_rate": 9.873933008181222e-05, + "loss": 0.013812362216413021, + "num_input_tokens_seen": 47850672, + "step": 2922, + "train_runtime": 24080.5571, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.8096952908587257, + "grad_norm": 0.046456534415483475, + "learning_rate": 9.873834915492837e-05, + "loss": 0.012990177609026432, + "num_input_tokens_seen": 47867048, + "step": 2923, + "train_runtime": 24088.7857, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.8099722991689751, + "grad_norm": 0.07826020568609238, + "learning_rate": 9.87373678514398e-05, + "loss": 0.016857344657182693, + "num_input_tokens_seen": 47883424, + "step": 2924, + "train_runtime": 24097.015, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.8102493074792244, + "grad_norm": 0.06654863804578781, + "learning_rate": 9.87363861713541e-05, + "loss": 0.01571049727499485, + "num_input_tokens_seen": 47899800, + "step": 2925, + "train_runtime": 24105.2376, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.8105263157894737, + "grad_norm": 0.05053775757551193, + "learning_rate": 9.873540411467884e-05, + "loss": 0.014327656477689743, + "num_input_tokens_seen": 47916176, + "step": 2926, + "train_runtime": 24113.468, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.810803324099723, + "grad_norm": 0.07217305153608322, + "learning_rate": 9.873442168142159e-05, + "loss": 0.012460192665457726, + "num_input_tokens_seen": 47932552, + "step": 2927, + "train_runtime": 24121.6921, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.8110803324099723, + "grad_norm": 0.06835464388132095, + "learning_rate": 9.873343887158997e-05, + "loss": 0.012812630273401737, + "num_input_tokens_seen": 47948928, + "step": 2928, + "train_runtime": 24129.9129, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.8113573407202216, + "grad_norm": 0.07079863548278809, + "learning_rate": 9.873245568519156e-05, + "loss": 0.013095445930957794, + "num_input_tokens_seen": 47965304, + "step": 2929, + "train_runtime": 24138.1413, + "train_tokens_per_second": 1987.117 + }, + { + "epoch": 0.8116343490304709, + "grad_norm": 0.048217982053756714, + "learning_rate": 9.873147212223395e-05, + "loss": 0.010614442639052868, + "num_input_tokens_seen": 47981680, + "step": 2930, + "train_runtime": 24146.3699, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.8119113573407202, + "grad_norm": 0.05497929826378822, + "learning_rate": 9.873048818272478e-05, + "loss": 0.01330361794680357, + "num_input_tokens_seen": 47998056, + "step": 2931, + "train_runtime": 24154.595, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.8121883656509695, + "grad_norm": 0.05858958885073662, + "learning_rate": 9.872950386667161e-05, + "loss": 0.01414954662322998, + "num_input_tokens_seen": 48014432, + "step": 2932, + "train_runtime": 24162.8112, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.8124653739612189, + "grad_norm": 0.07439182698726654, + "learning_rate": 9.872851917408207e-05, + "loss": 0.01263683196157217, + "num_input_tokens_seen": 48030808, + "step": 2933, + "train_runtime": 24171.0265, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.8127423822714681, + "grad_norm": 0.08261517435312271, + "learning_rate": 9.872753410496375e-05, + "loss": 0.013222528621554375, + "num_input_tokens_seen": 48047184, + "step": 2934, + "train_runtime": 24179.2579, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.8130193905817175, + "grad_norm": 0.046216726303100586, + "learning_rate": 9.872654865932428e-05, + "loss": 0.011102077551186085, + "num_input_tokens_seen": 48063560, + "step": 2935, + "train_runtime": 24187.4857, + "train_tokens_per_second": 1987.125 + }, + { + "epoch": 0.8132963988919668, + "grad_norm": 0.06009708344936371, + "learning_rate": 9.872556283717125e-05, + "loss": 0.017142467200756073, + "num_input_tokens_seen": 48079936, + "step": 2936, + "train_runtime": 24195.7153, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.813573407202216, + "grad_norm": 0.09720524400472641, + "learning_rate": 9.872457663851232e-05, + "loss": 0.013226914219558239, + "num_input_tokens_seen": 48096312, + "step": 2937, + "train_runtime": 24203.9455, + "train_tokens_per_second": 1987.127 + }, + { + "epoch": 0.8138504155124654, + "grad_norm": 0.06290486454963684, + "learning_rate": 9.872359006335505e-05, + "loss": 0.016530092805624008, + "num_input_tokens_seen": 48112688, + "step": 2938, + "train_runtime": 24212.1732, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.8141274238227146, + "grad_norm": 0.07236675918102264, + "learning_rate": 9.872260311170714e-05, + "loss": 0.014536003582179546, + "num_input_tokens_seen": 48129064, + "step": 2939, + "train_runtime": 24220.3951, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.814404432132964, + "grad_norm": 0.06957624852657318, + "learning_rate": 9.872161578357614e-05, + "loss": 0.012640129774808884, + "num_input_tokens_seen": 48145440, + "step": 2940, + "train_runtime": 24228.6235, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 0.8146814404432133, + "grad_norm": 0.04019584879279137, + "learning_rate": 9.872062807896972e-05, + "loss": 0.013637568801641464, + "num_input_tokens_seen": 48161816, + "step": 2941, + "train_runtime": 24236.8453, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.8149584487534626, + "grad_norm": 0.08107120543718338, + "learning_rate": 9.87196399978955e-05, + "loss": 0.015465661883354187, + "num_input_tokens_seen": 48178192, + "step": 2942, + "train_runtime": 24245.0667, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.8152354570637119, + "grad_norm": 0.06039109453558922, + "learning_rate": 9.871865154036114e-05, + "loss": 0.00981525145471096, + "num_input_tokens_seen": 48194568, + "step": 2943, + "train_runtime": 24253.2889, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.8155124653739613, + "grad_norm": 0.06848680227994919, + "learning_rate": 9.871766270637424e-05, + "loss": 0.016045574098825455, + "num_input_tokens_seen": 48210944, + "step": 2944, + "train_runtime": 24261.5109, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.8157894736842105, + "grad_norm": 0.05902143940329552, + "learning_rate": 9.871667349594247e-05, + "loss": 0.013178948312997818, + "num_input_tokens_seen": 48227320, + "step": 2945, + "train_runtime": 24269.7235, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.8160664819944599, + "grad_norm": 0.11810272186994553, + "learning_rate": 9.871568390907346e-05, + "loss": 0.010002417489886284, + "num_input_tokens_seen": 48243696, + "step": 2946, + "train_runtime": 24277.9476, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.8163434903047091, + "grad_norm": 0.07261879742145538, + "learning_rate": 9.871469394577484e-05, + "loss": 0.015200951136648655, + "num_input_tokens_seen": 48260072, + "step": 2947, + "train_runtime": 24286.164, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.8166204986149584, + "grad_norm": 0.09052418172359467, + "learning_rate": 9.87137036060543e-05, + "loss": 0.013822418637573719, + "num_input_tokens_seen": 48276448, + "step": 2948, + "train_runtime": 24294.3839, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.8168975069252078, + "grad_norm": 0.077455073595047, + "learning_rate": 9.871271288991947e-05, + "loss": 0.015047365799546242, + "num_input_tokens_seen": 48292824, + "step": 2949, + "train_runtime": 24302.6066, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.817174515235457, + "grad_norm": 0.06605514883995056, + "learning_rate": 9.8711721797378e-05, + "loss": 0.016811756417155266, + "num_input_tokens_seen": 48309200, + "step": 2950, + "train_runtime": 24310.8318, + "train_tokens_per_second": 1987.147 + }, + { + "epoch": 0.8174515235457064, + "grad_norm": 0.06290590018033981, + "learning_rate": 9.871073032843756e-05, + "loss": 0.012276452966034412, + "num_input_tokens_seen": 48325576, + "step": 2951, + "train_runtime": 24319.0578, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.8177285318559557, + "grad_norm": 0.04442431405186653, + "learning_rate": 9.87097384831058e-05, + "loss": 0.010292697697877884, + "num_input_tokens_seen": 48341952, + "step": 2952, + "train_runtime": 24327.2821, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.818005540166205, + "grad_norm": 0.0619485005736351, + "learning_rate": 9.870874626139039e-05, + "loss": 0.012767677195370197, + "num_input_tokens_seen": 48358328, + "step": 2953, + "train_runtime": 24335.4952, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.8182825484764543, + "grad_norm": 0.07950824499130249, + "learning_rate": 9.8707753663299e-05, + "loss": 0.010954173281788826, + "num_input_tokens_seen": 48374704, + "step": 2954, + "train_runtime": 24343.7036, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.8185595567867036, + "grad_norm": 0.09837134927511215, + "learning_rate": 9.870676068883928e-05, + "loss": 0.012330879457294941, + "num_input_tokens_seen": 48391080, + "step": 2955, + "train_runtime": 24351.9194, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.8188365650969529, + "grad_norm": 0.04437301307916641, + "learning_rate": 9.870576733801894e-05, + "loss": 0.012652511708438396, + "num_input_tokens_seen": 48407456, + "step": 2956, + "train_runtime": 24360.1283, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 0.8191135734072023, + "grad_norm": 0.052784476429224014, + "learning_rate": 9.870477361084563e-05, + "loss": 0.011049040593206882, + "num_input_tokens_seen": 48423832, + "step": 2957, + "train_runtime": 24368.3438, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.8193905817174515, + "grad_norm": 0.053318724036216736, + "learning_rate": 9.870377950732703e-05, + "loss": 0.0126597099006176, + "num_input_tokens_seen": 48440208, + "step": 2958, + "train_runtime": 24376.5731, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 0.8196675900277008, + "grad_norm": 0.05852421373128891, + "learning_rate": 9.870278502747082e-05, + "loss": 0.01388319581747055, + "num_input_tokens_seen": 48456584, + "step": 2959, + "train_runtime": 24384.8039, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.8199445983379502, + "grad_norm": 0.05899207666516304, + "learning_rate": 9.87017901712847e-05, + "loss": 0.01190374419093132, + "num_input_tokens_seen": 48472960, + "step": 2960, + "train_runtime": 24393.0326, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.8202216066481994, + "grad_norm": 0.09446212649345398, + "learning_rate": 9.870079493877635e-05, + "loss": 0.01354297436773777, + "num_input_tokens_seen": 48489336, + "step": 2961, + "train_runtime": 24401.2637, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.8204986149584488, + "grad_norm": 0.08778124302625656, + "learning_rate": 9.869979932995343e-05, + "loss": 0.01488038245588541, + "num_input_tokens_seen": 48505712, + "step": 2962, + "train_runtime": 24409.4915, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 0.820775623268698, + "grad_norm": 0.04730156809091568, + "learning_rate": 9.869880334482368e-05, + "loss": 0.01390691939741373, + "num_input_tokens_seen": 48522088, + "step": 2963, + "train_runtime": 24417.724, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.8210526315789474, + "grad_norm": 0.07742829620838165, + "learning_rate": 9.869780698339479e-05, + "loss": 0.011650009080767632, + "num_input_tokens_seen": 48538464, + "step": 2964, + "train_runtime": 24425.9479, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.8213296398891967, + "grad_norm": 0.0953574851155281, + "learning_rate": 9.869681024567443e-05, + "loss": 0.014555558562278748, + "num_input_tokens_seen": 48554840, + "step": 2965, + "train_runtime": 24434.1645, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.821606648199446, + "grad_norm": 0.0792686864733696, + "learning_rate": 9.869581313167032e-05, + "loss": 0.01599268428981304, + "num_input_tokens_seen": 48571216, + "step": 2966, + "train_runtime": 24442.378, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 0.8218836565096953, + "grad_norm": 0.061101965606212616, + "learning_rate": 9.869481564139016e-05, + "loss": 0.012498898431658745, + "num_input_tokens_seen": 48587592, + "step": 2967, + "train_runtime": 24450.598, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.8221606648199447, + "grad_norm": 0.06542983651161194, + "learning_rate": 9.869381777484166e-05, + "loss": 0.014572751708328724, + "num_input_tokens_seen": 48603968, + "step": 2968, + "train_runtime": 24458.8144, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 0.8224376731301939, + "grad_norm": 0.10918153822422028, + "learning_rate": 9.869281953203254e-05, + "loss": 0.012087294831871986, + "num_input_tokens_seen": 48620344, + "step": 2969, + "train_runtime": 24467.0277, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.8227146814404432, + "grad_norm": 0.08845154196023941, + "learning_rate": 9.869182091297048e-05, + "loss": 0.011611311696469784, + "num_input_tokens_seen": 48636720, + "step": 2970, + "train_runtime": 24475.2385, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 0.8229916897506925, + "grad_norm": 0.06015957519412041, + "learning_rate": 9.869082191766323e-05, + "loss": 0.012920280918478966, + "num_input_tokens_seen": 48653096, + "step": 2971, + "train_runtime": 24483.4467, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 0.8232686980609418, + "grad_norm": 0.06467743217945099, + "learning_rate": 9.868982254611849e-05, + "loss": 0.013469625264406204, + "num_input_tokens_seen": 48669472, + "step": 2972, + "train_runtime": 24491.6598, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.8235457063711912, + "grad_norm": 0.06239588186144829, + "learning_rate": 9.868882279834401e-05, + "loss": 0.01244464609771967, + "num_input_tokens_seen": 48685848, + "step": 2973, + "train_runtime": 24499.8669, + "train_tokens_per_second": 1987.188 + }, + { + "epoch": 0.8238227146814404, + "grad_norm": 0.059167515486478806, + "learning_rate": 9.86878226743475e-05, + "loss": 0.01258029229938984, + "num_input_tokens_seen": 48702224, + "step": 2974, + "train_runtime": 24508.0787, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.8240997229916898, + "grad_norm": 0.06263389438390732, + "learning_rate": 9.868682217413667e-05, + "loss": 0.012478876858949661, + "num_input_tokens_seen": 48718600, + "step": 2975, + "train_runtime": 24516.2963, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 0.824376731301939, + "grad_norm": 0.05005008354783058, + "learning_rate": 9.868582129771927e-05, + "loss": 0.013253957964479923, + "num_input_tokens_seen": 48734976, + "step": 2976, + "train_runtime": 24524.5206, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 0.8246537396121884, + "grad_norm": 0.08083988726139069, + "learning_rate": 9.868482004510302e-05, + "loss": 0.015278982929885387, + "num_input_tokens_seen": 48751352, + "step": 2977, + "train_runtime": 24532.7349, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.8249307479224377, + "grad_norm": 0.04520134627819061, + "learning_rate": 9.868381841629567e-05, + "loss": 0.01173260435461998, + "num_input_tokens_seen": 48767728, + "step": 2978, + "train_runtime": 24540.9566, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.8252077562326869, + "grad_norm": 0.09654716402292252, + "learning_rate": 9.868281641130496e-05, + "loss": 0.01318840216845274, + "num_input_tokens_seen": 48784104, + "step": 2979, + "train_runtime": 24549.17, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.8254847645429363, + "grad_norm": 0.05483449995517731, + "learning_rate": 9.868181403013865e-05, + "loss": 0.013280758634209633, + "num_input_tokens_seen": 48800480, + "step": 2980, + "train_runtime": 24557.3882, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 0.8257617728531856, + "grad_norm": 0.07687953114509583, + "learning_rate": 9.868081127280444e-05, + "loss": 0.016584446653723717, + "num_input_tokens_seen": 48816856, + "step": 2981, + "train_runtime": 24565.6106, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.8260387811634349, + "grad_norm": 0.12211718410253525, + "learning_rate": 9.86798081393101e-05, + "loss": 0.014376027509570122, + "num_input_tokens_seen": 48833232, + "step": 2982, + "train_runtime": 24573.8261, + "train_tokens_per_second": 1987.205 + }, + { + "epoch": 0.8263157894736842, + "grad_norm": 0.06787188351154327, + "learning_rate": 9.867880462966339e-05, + "loss": 0.01592555083334446, + "num_input_tokens_seen": 48849608, + "step": 2983, + "train_runtime": 24582.0405, + "train_tokens_per_second": 1987.207 + }, + { + "epoch": 0.8265927977839335, + "grad_norm": 0.0709359347820282, + "learning_rate": 9.867780074387207e-05, + "loss": 0.014562766067683697, + "num_input_tokens_seen": 48865984, + "step": 2984, + "train_runtime": 24590.2713, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.8268698060941828, + "grad_norm": 0.05647069588303566, + "learning_rate": 9.867679648194385e-05, + "loss": 0.013594910502433777, + "num_input_tokens_seen": 48882360, + "step": 2985, + "train_runtime": 24598.4966, + "train_tokens_per_second": 1987.209 + }, + { + "epoch": 0.8271468144044322, + "grad_norm": 0.08298798650503159, + "learning_rate": 9.867579184388656e-05, + "loss": 0.0152199175208807, + "num_input_tokens_seen": 48898736, + "step": 2986, + "train_runtime": 24606.7296, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.8274238227146814, + "grad_norm": 0.07122445851564407, + "learning_rate": 9.867478682970793e-05, + "loss": 0.013718029484152794, + "num_input_tokens_seen": 48915112, + "step": 2987, + "train_runtime": 24614.9569, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 0.8277008310249307, + "grad_norm": 0.036974791437387466, + "learning_rate": 9.86737814394157e-05, + "loss": 0.011513177305459976, + "num_input_tokens_seen": 48931488, + "step": 2988, + "train_runtime": 24623.1889, + "train_tokens_per_second": 1987.212 + }, + { + "epoch": 0.8279778393351801, + "grad_norm": 0.07113412767648697, + "learning_rate": 9.867277567301767e-05, + "loss": 0.01059110090136528, + "num_input_tokens_seen": 48947864, + "step": 2989, + "train_runtime": 24631.4065, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 0.8282548476454293, + "grad_norm": 0.051904935389757156, + "learning_rate": 9.86717695305216e-05, + "loss": 0.009909174405038357, + "num_input_tokens_seen": 48964240, + "step": 2990, + "train_runtime": 24639.616, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 0.8285318559556787, + "grad_norm": 0.054237671196460724, + "learning_rate": 9.867076301193528e-05, + "loss": 0.012174868024885654, + "num_input_tokens_seen": 48980616, + "step": 2991, + "train_runtime": 24647.8244, + "train_tokens_per_second": 1987.219 + }, + { + "epoch": 0.8288088642659279, + "grad_norm": 0.06834293901920319, + "learning_rate": 9.866975611726645e-05, + "loss": 0.012183640152215958, + "num_input_tokens_seen": 48996992, + "step": 2992, + "train_runtime": 24656.0373, + "train_tokens_per_second": 1987.221 + }, + { + "epoch": 0.8290858725761773, + "grad_norm": 0.07256858795881271, + "learning_rate": 9.866874884652296e-05, + "loss": 0.014687005430459976, + "num_input_tokens_seen": 49013368, + "step": 2993, + "train_runtime": 24664.2626, + "train_tokens_per_second": 1987.222 + }, + { + "epoch": 0.8293628808864266, + "grad_norm": 0.07846427708864212, + "learning_rate": 9.866774119971251e-05, + "loss": 0.011997952125966549, + "num_input_tokens_seen": 49029744, + "step": 2994, + "train_runtime": 24672.4862, + "train_tokens_per_second": 1987.223 + }, + { + "epoch": 0.8296398891966759, + "grad_norm": 0.04601418972015381, + "learning_rate": 9.866673317684293e-05, + "loss": 0.013471005484461784, + "num_input_tokens_seen": 49046120, + "step": 2995, + "train_runtime": 24680.7137, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 0.8299168975069252, + "grad_norm": 0.05742669105529785, + "learning_rate": 9.866572477792202e-05, + "loss": 0.011459847912192345, + "num_input_tokens_seen": 49062496, + "step": 2996, + "train_runtime": 24688.9415, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.8301939058171746, + "grad_norm": 0.056804168969392776, + "learning_rate": 9.866471600295754e-05, + "loss": 0.013342165388166904, + "num_input_tokens_seen": 49078872, + "step": 2997, + "train_runtime": 24697.1776, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.8304709141274238, + "grad_norm": 0.07538998126983643, + "learning_rate": 9.866370685195732e-05, + "loss": 0.012359441258013248, + "num_input_tokens_seen": 49095248, + "step": 2998, + "train_runtime": 24705.4138, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.8307479224376731, + "grad_norm": 0.07016482949256897, + "learning_rate": 9.866269732492915e-05, + "loss": 0.011980826035141945, + "num_input_tokens_seen": 49111624, + "step": 2999, + "train_runtime": 24713.6432, + "train_tokens_per_second": 1987.227 + }, + { + "epoch": 0.8310249307479224, + "grad_norm": 0.06557415425777435, + "learning_rate": 9.86616874218808e-05, + "loss": 0.01504882238805294, + "num_input_tokens_seen": 49128000, + "step": 3000, + "train_runtime": 24721.8576, + "train_tokens_per_second": 1987.229 + }, + { + "epoch": 0.8313019390581717, + "grad_norm": 0.05925697460770607, + "learning_rate": 9.866067714282008e-05, + "loss": 0.01449844054877758, + "num_input_tokens_seen": 49144376, + "step": 3001, + "train_runtime": 24731.8232, + "train_tokens_per_second": 1987.091 + }, + { + "epoch": 0.8315789473684211, + "grad_norm": 0.06919239461421967, + "learning_rate": 9.865966648775483e-05, + "loss": 0.01415999699383974, + "num_input_tokens_seen": 49160752, + "step": 3002, + "train_runtime": 24740.0479, + "train_tokens_per_second": 1987.092 + }, + { + "epoch": 0.8318559556786703, + "grad_norm": 0.05943150445818901, + "learning_rate": 9.865865545669284e-05, + "loss": 0.013996552675962448, + "num_input_tokens_seen": 49177128, + "step": 3003, + "train_runtime": 24748.2633, + "train_tokens_per_second": 1987.094 + }, + { + "epoch": 0.8321329639889197, + "grad_norm": 0.04716562107205391, + "learning_rate": 9.86576440496419e-05, + "loss": 0.009949605911970139, + "num_input_tokens_seen": 49193504, + "step": 3004, + "train_runtime": 24756.4923, + "train_tokens_per_second": 1987.095 + }, + { + "epoch": 0.832409972299169, + "grad_norm": 0.05927328020334244, + "learning_rate": 9.865663226660989e-05, + "loss": 0.013242846354842186, + "num_input_tokens_seen": 49209880, + "step": 3005, + "train_runtime": 24764.716, + "train_tokens_per_second": 1987.096 + }, + { + "epoch": 0.8326869806094183, + "grad_norm": 0.0858074277639389, + "learning_rate": 9.865562010760454e-05, + "loss": 0.0147493164986372, + "num_input_tokens_seen": 49226256, + "step": 3006, + "train_runtime": 24772.9309, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.8329639889196676, + "grad_norm": 0.06434524804353714, + "learning_rate": 9.865460757263375e-05, + "loss": 0.014569351449608803, + "num_input_tokens_seen": 49242632, + "step": 3007, + "train_runtime": 24781.1581, + "train_tokens_per_second": 1987.1 + }, + { + "epoch": 0.8332409972299168, + "grad_norm": 0.06801465898752213, + "learning_rate": 9.865359466170529e-05, + "loss": 0.016760580241680145, + "num_input_tokens_seen": 49259008, + "step": 3008, + "train_runtime": 24789.3869, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.8335180055401662, + "grad_norm": 0.04917032644152641, + "learning_rate": 9.8652581374827e-05, + "loss": 0.015257865190505981, + "num_input_tokens_seen": 49275384, + "step": 3009, + "train_runtime": 24797.6087, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.8337950138504155, + "grad_norm": 0.059890445321798325, + "learning_rate": 9.865156771200675e-05, + "loss": 0.013866892084479332, + "num_input_tokens_seen": 49291760, + "step": 3010, + "train_runtime": 24805.8321, + "train_tokens_per_second": 1987.104 + }, + { + "epoch": 0.8340720221606648, + "grad_norm": 0.07455237954854965, + "learning_rate": 9.86505536732523e-05, + "loss": 0.013296687975525856, + "num_input_tokens_seen": 49308136, + "step": 3011, + "train_runtime": 24814.0622, + "train_tokens_per_second": 1987.105 + }, + { + "epoch": 0.8343490304709141, + "grad_norm": 0.05863918364048004, + "learning_rate": 9.864953925857155e-05, + "loss": 0.010536023415625095, + "num_input_tokens_seen": 49324512, + "step": 3012, + "train_runtime": 24822.2883, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.8346260387811635, + "grad_norm": 0.07561648637056351, + "learning_rate": 9.86485244679723e-05, + "loss": 0.011203763075172901, + "num_input_tokens_seen": 49340888, + "step": 3013, + "train_runtime": 24830.5152, + "train_tokens_per_second": 1987.107 + }, + { + "epoch": 0.8349030470914127, + "grad_norm": 0.08122837543487549, + "learning_rate": 9.864750930146242e-05, + "loss": 0.018406057730317116, + "num_input_tokens_seen": 49357264, + "step": 3014, + "train_runtime": 24838.7324, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.8351800554016621, + "grad_norm": 0.06891181319952011, + "learning_rate": 9.864649375904972e-05, + "loss": 0.015971068292856216, + "num_input_tokens_seen": 49373640, + "step": 3015, + "train_runtime": 24846.9579, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.8354570637119113, + "grad_norm": 0.1637900173664093, + "learning_rate": 9.864547784074209e-05, + "loss": 0.01612158492207527, + "num_input_tokens_seen": 49390016, + "step": 3016, + "train_runtime": 24855.1882, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.8357340720221607, + "grad_norm": 0.07998530566692352, + "learning_rate": 9.864446154654735e-05, + "loss": 0.012939668260514736, + "num_input_tokens_seen": 49406392, + "step": 3017, + "train_runtime": 24863.4165, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.83601108033241, + "grad_norm": 0.05037128925323486, + "learning_rate": 9.864344487647333e-05, + "loss": 0.011689994484186172, + "num_input_tokens_seen": 49422768, + "step": 3018, + "train_runtime": 24871.6392, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.8362880886426592, + "grad_norm": 0.07810845971107483, + "learning_rate": 9.864242783052794e-05, + "loss": 0.0133891049772501, + "num_input_tokens_seen": 49439144, + "step": 3019, + "train_runtime": 24879.8672, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.8365650969529086, + "grad_norm": 0.03781251981854439, + "learning_rate": 9.864141040871902e-05, + "loss": 0.01059484388679266, + "num_input_tokens_seen": 49455520, + "step": 3020, + "train_runtime": 24888.0977, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.8368421052631579, + "grad_norm": 0.04760425165295601, + "learning_rate": 9.864039261105442e-05, + "loss": 0.011386038735508919, + "num_input_tokens_seen": 49471896, + "step": 3021, + "train_runtime": 24896.3238, + "train_tokens_per_second": 1987.117 + }, + { + "epoch": 0.8371191135734072, + "grad_norm": 0.0685766413807869, + "learning_rate": 9.8639374437542e-05, + "loss": 0.01437688060104847, + "num_input_tokens_seen": 49488272, + "step": 3022, + "train_runtime": 24904.5412, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.8373961218836565, + "grad_norm": 0.07490769773721695, + "learning_rate": 9.863835588818963e-05, + "loss": 0.015434373170137405, + "num_input_tokens_seen": 49504648, + "step": 3023, + "train_runtime": 24912.7604, + "train_tokens_per_second": 1987.12 + }, + { + "epoch": 0.8376731301939058, + "grad_norm": 0.09010086208581924, + "learning_rate": 9.863733696300521e-05, + "loss": 0.014394453726708889, + "num_input_tokens_seen": 49521024, + "step": 3024, + "train_runtime": 24920.9913, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.8379501385041551, + "grad_norm": 0.07203367352485657, + "learning_rate": 9.86363176619966e-05, + "loss": 0.012225371785461903, + "num_input_tokens_seen": 49537400, + "step": 3025, + "train_runtime": 24929.2088, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.8382271468144045, + "grad_norm": 0.05396263301372528, + "learning_rate": 9.863529798517163e-05, + "loss": 0.014289984479546547, + "num_input_tokens_seen": 49553776, + "step": 3026, + "train_runtime": 24937.4371, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.8385041551246537, + "grad_norm": 0.06477943807840347, + "learning_rate": 9.863427793253825e-05, + "loss": 0.014300175942480564, + "num_input_tokens_seen": 49570152, + "step": 3027, + "train_runtime": 24945.6611, + "train_tokens_per_second": 1987.125 + }, + { + "epoch": 0.838781163434903, + "grad_norm": 0.045806825160980225, + "learning_rate": 9.863325750410428e-05, + "loss": 0.011391891166567802, + "num_input_tokens_seen": 49586528, + "step": 3028, + "train_runtime": 24953.8834, + "train_tokens_per_second": 1987.127 + }, + { + "epoch": 0.8390581717451524, + "grad_norm": 0.08632927387952805, + "learning_rate": 9.863223669987766e-05, + "loss": 0.012630072422325611, + "num_input_tokens_seen": 49602904, + "step": 3029, + "train_runtime": 24962.113, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.8393351800554016, + "grad_norm": 0.057690203189849854, + "learning_rate": 9.863121551986623e-05, + "loss": 0.015211204066872597, + "num_input_tokens_seen": 49619280, + "step": 3030, + "train_runtime": 24970.3418, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.839612188365651, + "grad_norm": 0.06978486478328705, + "learning_rate": 9.863019396407792e-05, + "loss": 0.017446676269173622, + "num_input_tokens_seen": 49635656, + "step": 3031, + "train_runtime": 24978.5736, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.8398891966759002, + "grad_norm": 0.03770345076918602, + "learning_rate": 9.86291720325206e-05, + "loss": 0.01057517435401678, + "num_input_tokens_seen": 49652032, + "step": 3032, + "train_runtime": 24986.7892, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 0.8401662049861496, + "grad_norm": 0.07199680805206299, + "learning_rate": 9.862814972520219e-05, + "loss": 0.016247710213065147, + "num_input_tokens_seen": 49668408, + "step": 3033, + "train_runtime": 24995.0108, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.8404432132963989, + "grad_norm": 0.0636885017156601, + "learning_rate": 9.862712704213053e-05, + "loss": 0.012099074199795723, + "num_input_tokens_seen": 49684784, + "step": 3034, + "train_runtime": 25003.2192, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.8407202216066482, + "grad_norm": 0.060257069766521454, + "learning_rate": 9.86261039833136e-05, + "loss": 0.013842999935150146, + "num_input_tokens_seen": 49701160, + "step": 3035, + "train_runtime": 25011.4328, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.8409972299168975, + "grad_norm": 0.1045432984828949, + "learning_rate": 9.862508054875926e-05, + "loss": 0.01748289354145527, + "num_input_tokens_seen": 49717536, + "step": 3036, + "train_runtime": 25019.6576, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.8412742382271469, + "grad_norm": 0.04544979706406593, + "learning_rate": 9.862405673847542e-05, + "loss": 0.013244090601801872, + "num_input_tokens_seen": 49733912, + "step": 3037, + "train_runtime": 25027.8713, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.8415512465373961, + "grad_norm": 0.07914943248033524, + "learning_rate": 9.862303255247e-05, + "loss": 0.016345825046300888, + "num_input_tokens_seen": 49750288, + "step": 3038, + "train_runtime": 25036.0852, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.8418282548476455, + "grad_norm": 0.048129718750715256, + "learning_rate": 9.862200799075092e-05, + "loss": 0.01436719112098217, + "num_input_tokens_seen": 49766664, + "step": 3039, + "train_runtime": 25044.2966, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.8421052631578947, + "grad_norm": 0.06547839939594269, + "learning_rate": 9.862098305332608e-05, + "loss": 0.015023446641862392, + "num_input_tokens_seen": 49783040, + "step": 3040, + "train_runtime": 25052.5154, + "train_tokens_per_second": 1987.147 + }, + { + "epoch": 0.842382271468144, + "grad_norm": 0.03844577446579933, + "learning_rate": 9.86199577402034e-05, + "loss": 0.011963002383708954, + "num_input_tokens_seen": 49799416, + "step": 3041, + "train_runtime": 25060.7413, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 0.8426592797783934, + "grad_norm": 0.08050607144832611, + "learning_rate": 9.861893205139083e-05, + "loss": 0.01354434434324503, + "num_input_tokens_seen": 49815792, + "step": 3042, + "train_runtime": 25068.9708, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 0.8429362880886426, + "grad_norm": 0.05916636809706688, + "learning_rate": 9.861790598689628e-05, + "loss": 0.011026655323803425, + "num_input_tokens_seen": 49832168, + "step": 3043, + "train_runtime": 25077.1957, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.843213296398892, + "grad_norm": 0.04126787558197975, + "learning_rate": 9.861687954672767e-05, + "loss": 0.01194753684103489, + "num_input_tokens_seen": 49848544, + "step": 3044, + "train_runtime": 25085.413, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.8434903047091413, + "grad_norm": 0.07867301255464554, + "learning_rate": 9.861585273089293e-05, + "loss": 0.013125015422701836, + "num_input_tokens_seen": 49864920, + "step": 3045, + "train_runtime": 25093.6362, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.8437673130193906, + "grad_norm": 0.07400617748498917, + "learning_rate": 9.861482553940003e-05, + "loss": 0.014708655886352062, + "num_input_tokens_seen": 49881296, + "step": 3046, + "train_runtime": 25101.8686, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.8440443213296399, + "grad_norm": 0.057840581983327866, + "learning_rate": 9.861379797225685e-05, + "loss": 0.011017046868801117, + "num_input_tokens_seen": 49897672, + "step": 3047, + "train_runtime": 25110.0943, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.8443213296398892, + "grad_norm": 0.0603223517537117, + "learning_rate": 9.861277002947136e-05, + "loss": 0.011370315216481686, + "num_input_tokens_seen": 49914048, + "step": 3048, + "train_runtime": 25118.3269, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.8445983379501385, + "grad_norm": 0.060171712189912796, + "learning_rate": 9.861174171105151e-05, + "loss": 0.015077225863933563, + "num_input_tokens_seen": 49930424, + "step": 3049, + "train_runtime": 25126.5482, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.8448753462603878, + "grad_norm": 0.10776327550411224, + "learning_rate": 9.861071301700523e-05, + "loss": 0.012440809048712254, + "num_input_tokens_seen": 49946800, + "step": 3050, + "train_runtime": 25134.7716, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 0.8451523545706371, + "grad_norm": 0.07708487659692764, + "learning_rate": 9.860968394734049e-05, + "loss": 0.014781074598431587, + "num_input_tokens_seen": 49963176, + "step": 3051, + "train_runtime": 25142.9944, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.8454293628808864, + "grad_norm": 0.05231643095612526, + "learning_rate": 9.860865450206522e-05, + "loss": 0.011863634921610355, + "num_input_tokens_seen": 49979552, + "step": 3052, + "train_runtime": 25151.2165, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 0.8457063711911358, + "grad_norm": 0.125255286693573, + "learning_rate": 9.860762468118738e-05, + "loss": 0.015021195635199547, + "num_input_tokens_seen": 49995928, + "step": 3053, + "train_runtime": 25159.4383, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.845983379501385, + "grad_norm": 0.06625112891197205, + "learning_rate": 9.860659448471492e-05, + "loss": 0.012551926076412201, + "num_input_tokens_seen": 50012304, + "step": 3054, + "train_runtime": 25167.6602, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.8462603878116344, + "grad_norm": 0.056840814650058746, + "learning_rate": 9.860556391265582e-05, + "loss": 0.011161139234900475, + "num_input_tokens_seen": 50028680, + "step": 3055, + "train_runtime": 25175.8864, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.8465373961218836, + "grad_norm": 0.06039707362651825, + "learning_rate": 9.860453296501805e-05, + "loss": 0.01025974191725254, + "num_input_tokens_seen": 50045056, + "step": 3056, + "train_runtime": 25184.1101, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.846814404432133, + "grad_norm": 0.05672851577401161, + "learning_rate": 9.860350164180954e-05, + "loss": 0.01115566398948431, + "num_input_tokens_seen": 50061432, + "step": 3057, + "train_runtime": 25192.3371, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 0.8470914127423823, + "grad_norm": 0.05223054438829422, + "learning_rate": 9.86024699430383e-05, + "loss": 0.01173850242048502, + "num_input_tokens_seen": 50077808, + "step": 3058, + "train_runtime": 25200.5627, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.8473684210526315, + "grad_norm": 0.07969946414232254, + "learning_rate": 9.860143786871225e-05, + "loss": 0.013442116789519787, + "num_input_tokens_seen": 50094184, + "step": 3059, + "train_runtime": 25208.7809, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 0.8476454293628809, + "grad_norm": 0.0534239299595356, + "learning_rate": 9.860040541883942e-05, + "loss": 0.011453073471784592, + "num_input_tokens_seen": 50110560, + "step": 3060, + "train_runtime": 25217.0035, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.8479224376731302, + "grad_norm": 0.11767929792404175, + "learning_rate": 9.859937259342776e-05, + "loss": 0.011844459921121597, + "num_input_tokens_seen": 50126936, + "step": 3061, + "train_runtime": 25225.235, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.8481994459833795, + "grad_norm": 0.06263735145330429, + "learning_rate": 9.859833939248526e-05, + "loss": 0.013057825155556202, + "num_input_tokens_seen": 50143312, + "step": 3062, + "train_runtime": 25233.461, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.8484764542936288, + "grad_norm": 0.06809918582439423, + "learning_rate": 9.859730581601989e-05, + "loss": 0.01434639748185873, + "num_input_tokens_seen": 50159688, + "step": 3063, + "train_runtime": 25241.6873, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 0.8487534626038781, + "grad_norm": 0.08020938187837601, + "learning_rate": 9.859627186403965e-05, + "loss": 0.01412825658917427, + "num_input_tokens_seen": 50176064, + "step": 3064, + "train_runtime": 25249.9152, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.8490304709141274, + "grad_norm": 0.09274015575647354, + "learning_rate": 9.859523753655252e-05, + "loss": 0.015933627262711525, + "num_input_tokens_seen": 50192440, + "step": 3065, + "train_runtime": 25258.1423, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.8493074792243768, + "grad_norm": 0.04071243479847908, + "learning_rate": 9.85942028335665e-05, + "loss": 0.01279393583536148, + "num_input_tokens_seen": 50208816, + "step": 3066, + "train_runtime": 25266.3711, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 0.849584487534626, + "grad_norm": 0.08359570801258087, + "learning_rate": 9.859316775508957e-05, + "loss": 0.012898696586489677, + "num_input_tokens_seen": 50225192, + "step": 3067, + "train_runtime": 25274.5961, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 0.8498614958448754, + "grad_norm": 0.0931970477104187, + "learning_rate": 9.859213230112976e-05, + "loss": 0.015218539163470268, + "num_input_tokens_seen": 50241568, + "step": 3068, + "train_runtime": 25282.8238, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.8501385041551247, + "grad_norm": 0.05731836333870888, + "learning_rate": 9.859109647169503e-05, + "loss": 0.013773124665021896, + "num_input_tokens_seen": 50257944, + "step": 3069, + "train_runtime": 25291.0483, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 0.850415512465374, + "grad_norm": 0.042284559458494186, + "learning_rate": 9.859006026679343e-05, + "loss": 0.011244196444749832, + "num_input_tokens_seen": 50274320, + "step": 3070, + "train_runtime": 25299.2725, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 0.8506925207756233, + "grad_norm": 0.05142008885741234, + "learning_rate": 9.858902368643293e-05, + "loss": 0.014157164841890335, + "num_input_tokens_seen": 50290696, + "step": 3071, + "train_runtime": 25307.5005, + "train_tokens_per_second": 1987.185 + }, + { + "epoch": 0.8509695290858725, + "grad_norm": 0.08321494609117508, + "learning_rate": 9.858798673062152e-05, + "loss": 0.01632140763103962, + "num_input_tokens_seen": 50307072, + "step": 3072, + "train_runtime": 25315.7276, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.8512465373961219, + "grad_norm": 0.04843628779053688, + "learning_rate": 9.858694939936729e-05, + "loss": 0.014289340004324913, + "num_input_tokens_seen": 50323448, + "step": 3073, + "train_runtime": 25323.9572, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.8515235457063712, + "grad_norm": 0.05385051667690277, + "learning_rate": 9.858591169267819e-05, + "loss": 0.013341542333364487, + "num_input_tokens_seen": 50339824, + "step": 3074, + "train_runtime": 25332.1819, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.8518005540166205, + "grad_norm": 0.10584016889333725, + "learning_rate": 9.858487361056226e-05, + "loss": 0.015536860562860966, + "num_input_tokens_seen": 50356200, + "step": 3075, + "train_runtime": 25340.3911, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.8520775623268698, + "grad_norm": 0.05063769966363907, + "learning_rate": 9.858383515302752e-05, + "loss": 0.011026791296899319, + "num_input_tokens_seen": 50372576, + "step": 3076, + "train_runtime": 25348.6046, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.8523545706371192, + "grad_norm": 0.08219396322965622, + "learning_rate": 9.858279632008199e-05, + "loss": 0.019005075097084045, + "num_input_tokens_seen": 50388952, + "step": 3077, + "train_runtime": 25356.8191, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.8526315789473684, + "grad_norm": 0.07072747498750687, + "learning_rate": 9.858175711173371e-05, + "loss": 0.012669466435909271, + "num_input_tokens_seen": 50405328, + "step": 3078, + "train_runtime": 25365.0247, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.8529085872576178, + "grad_norm": 0.06224488839507103, + "learning_rate": 9.85807175279907e-05, + "loss": 0.014296507462859154, + "num_input_tokens_seen": 50421704, + "step": 3079, + "train_runtime": 25373.2382, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.853185595567867, + "grad_norm": 0.053624097257852554, + "learning_rate": 9.857967756886098e-05, + "loss": 0.014394709840416908, + "num_input_tokens_seen": 50438080, + "step": 3080, + "train_runtime": 25381.457, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 0.8534626038781163, + "grad_norm": 0.07263583689928055, + "learning_rate": 9.857863723435261e-05, + "loss": 0.011426431126892567, + "num_input_tokens_seen": 50454456, + "step": 3081, + "train_runtime": 25389.6798, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.8537396121883657, + "grad_norm": 0.06282375007867813, + "learning_rate": 9.857759652447363e-05, + "loss": 0.01346889790147543, + "num_input_tokens_seen": 50470832, + "step": 3082, + "train_runtime": 25397.9077, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 0.8540166204986149, + "grad_norm": 0.06536664068698883, + "learning_rate": 9.857655543923206e-05, + "loss": 0.01287558488547802, + "num_input_tokens_seen": 50487208, + "step": 3083, + "train_runtime": 25406.1308, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.8542936288088643, + "grad_norm": 0.05091387778520584, + "learning_rate": 9.857551397863594e-05, + "loss": 0.010835446417331696, + "num_input_tokens_seen": 50503584, + "step": 3084, + "train_runtime": 25414.3579, + "train_tokens_per_second": 1987.207 + }, + { + "epoch": 0.8545706371191135, + "grad_norm": 0.0720706433057785, + "learning_rate": 9.857447214269335e-05, + "loss": 0.016326168552041054, + "num_input_tokens_seen": 50519960, + "step": 3085, + "train_runtime": 25422.5807, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.8548476454293629, + "grad_norm": 0.06803520023822784, + "learning_rate": 9.857342993141231e-05, + "loss": 0.01422247476875782, + "num_input_tokens_seen": 50536336, + "step": 3086, + "train_runtime": 25430.8041, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.8551246537396122, + "grad_norm": 0.07990429550409317, + "learning_rate": 9.85723873448009e-05, + "loss": 0.014507900923490524, + "num_input_tokens_seen": 50552712, + "step": 3087, + "train_runtime": 25439.0181, + "train_tokens_per_second": 1987.212 + }, + { + "epoch": 0.8554016620498615, + "grad_norm": 0.0686531737446785, + "learning_rate": 9.857134438286714e-05, + "loss": 0.012536874040961266, + "num_input_tokens_seen": 50569088, + "step": 3088, + "train_runtime": 25447.2262, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 0.8556786703601108, + "grad_norm": 0.0564347468316555, + "learning_rate": 9.857030104561912e-05, + "loss": 0.01140422560274601, + "num_input_tokens_seen": 50585464, + "step": 3089, + "train_runtime": 25455.4404, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 0.8559556786703602, + "grad_norm": 0.0803319662809372, + "learning_rate": 9.85692573330649e-05, + "loss": 0.013182707130908966, + "num_input_tokens_seen": 50601840, + "step": 3090, + "train_runtime": 25463.658, + "train_tokens_per_second": 1987.218 + }, + { + "epoch": 0.8562326869806094, + "grad_norm": 0.032709743827581406, + "learning_rate": 9.856821324521254e-05, + "loss": 0.009937363676726818, + "num_input_tokens_seen": 50618216, + "step": 3091, + "train_runtime": 25471.8715, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 0.8565096952908587, + "grad_norm": 0.054287686944007874, + "learning_rate": 9.85671687820701e-05, + "loss": 0.013114724308252335, + "num_input_tokens_seen": 50634592, + "step": 3092, + "train_runtime": 25480.0831, + "train_tokens_per_second": 1987.222 + }, + { + "epoch": 0.856786703601108, + "grad_norm": 0.05949670821428299, + "learning_rate": 9.856612394364566e-05, + "loss": 0.012097060680389404, + "num_input_tokens_seen": 50650968, + "step": 3093, + "train_runtime": 25488.3007, + "train_tokens_per_second": 1987.224 + }, + { + "epoch": 0.8570637119113573, + "grad_norm": 0.0598759800195694, + "learning_rate": 9.856507872994727e-05, + "loss": 0.01142276730388403, + "num_input_tokens_seen": 50667344, + "step": 3094, + "train_runtime": 25496.5353, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 0.8573407202216067, + "grad_norm": 0.06968191266059875, + "learning_rate": 9.856403314098304e-05, + "loss": 0.013131371699273586, + "num_input_tokens_seen": 50683720, + "step": 3095, + "train_runtime": 25504.7641, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.8576177285318559, + "grad_norm": 0.07283341139554977, + "learning_rate": 9.856298717676104e-05, + "loss": 0.014153454452753067, + "num_input_tokens_seen": 50700096, + "step": 3096, + "train_runtime": 25512.9962, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.8578947368421053, + "grad_norm": 0.06299787759780884, + "learning_rate": 9.856194083728936e-05, + "loss": 0.012234027497470379, + "num_input_tokens_seen": 50716472, + "step": 3097, + "train_runtime": 25521.2262, + "train_tokens_per_second": 1987.227 + }, + { + "epoch": 0.8581717451523546, + "grad_norm": 0.06719633936882019, + "learning_rate": 9.856089412257606e-05, + "loss": 0.012358734384179115, + "num_input_tokens_seen": 50732848, + "step": 3098, + "train_runtime": 25529.4569, + "train_tokens_per_second": 1987.228 + }, + { + "epoch": 0.8584487534626039, + "grad_norm": 0.05340227484703064, + "learning_rate": 9.855984703262922e-05, + "loss": 0.013679315336048603, + "num_input_tokens_seen": 50749224, + "step": 3099, + "train_runtime": 25537.6865, + "train_tokens_per_second": 1987.229 + }, + { + "epoch": 0.8587257617728532, + "grad_norm": 0.048111408948898315, + "learning_rate": 9.855879956745698e-05, + "loss": 0.01359168067574501, + "num_input_tokens_seen": 50765600, + "step": 3100, + "train_runtime": 25545.9155, + "train_tokens_per_second": 1987.23 + }, + { + "epoch": 0.8590027700831024, + "grad_norm": 0.08432472497224808, + "learning_rate": 9.855775172706738e-05, + "loss": 0.012653413228690624, + "num_input_tokens_seen": 50781976, + "step": 3101, + "train_runtime": 25555.8573, + "train_tokens_per_second": 1987.097 + }, + { + "epoch": 0.8592797783933518, + "grad_norm": 0.06551084667444229, + "learning_rate": 9.855670351146856e-05, + "loss": 0.013816132210195065, + "num_input_tokens_seen": 50798352, + "step": 3102, + "train_runtime": 25564.0777, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.8595567867036011, + "grad_norm": 0.037773385643959045, + "learning_rate": 9.855565492066861e-05, + "loss": 0.009711747989058495, + "num_input_tokens_seen": 50814728, + "step": 3103, + "train_runtime": 25572.2972, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.8598337950138504, + "grad_norm": 0.07774832844734192, + "learning_rate": 9.85546059546756e-05, + "loss": 0.010535942390561104, + "num_input_tokens_seen": 50831104, + "step": 3104, + "train_runtime": 25580.5183, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.8601108033240997, + "grad_norm": 0.09331648051738739, + "learning_rate": 9.855355661349767e-05, + "loss": 0.014629561454057693, + "num_input_tokens_seen": 50847480, + "step": 3105, + "train_runtime": 25588.7358, + "train_tokens_per_second": 1987.104 + }, + { + "epoch": 0.8603878116343491, + "grad_norm": 0.070989690721035, + "learning_rate": 9.855250689714293e-05, + "loss": 0.011694293469190598, + "num_input_tokens_seen": 50863856, + "step": 3106, + "train_runtime": 25596.9584, + "train_tokens_per_second": 1987.105 + }, + { + "epoch": 0.8606648199445983, + "grad_norm": 0.044308681041002274, + "learning_rate": 9.855145680561945e-05, + "loss": 0.011714459396898746, + "num_input_tokens_seen": 50880232, + "step": 3107, + "train_runtime": 25605.1826, + "train_tokens_per_second": 1987.107 + }, + { + "epoch": 0.8609418282548477, + "grad_norm": 0.07524628937244415, + "learning_rate": 9.85504063389354e-05, + "loss": 0.01263779029250145, + "num_input_tokens_seen": 50896608, + "step": 3108, + "train_runtime": 25613.4067, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.8612188365650969, + "grad_norm": 0.06308847665786743, + "learning_rate": 9.854935549709888e-05, + "loss": 0.015501119196414948, + "num_input_tokens_seen": 50912984, + "step": 3109, + "train_runtime": 25621.6314, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.8614958448753463, + "grad_norm": 0.051080554723739624, + "learning_rate": 9.854830428011797e-05, + "loss": 0.010461809113621712, + "num_input_tokens_seen": 50929360, + "step": 3110, + "train_runtime": 25629.8615, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.8617728531855956, + "grad_norm": 0.07114709168672562, + "learning_rate": 9.854725268800084e-05, + "loss": 0.016756659373641014, + "num_input_tokens_seen": 50945736, + "step": 3111, + "train_runtime": 25638.0828, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.8620498614958448, + "grad_norm": 0.06538508087396622, + "learning_rate": 9.85462007207556e-05, + "loss": 0.012189443223178387, + "num_input_tokens_seen": 50962112, + "step": 3112, + "train_runtime": 25646.304, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.8623268698060942, + "grad_norm": 0.07003022730350494, + "learning_rate": 9.854514837839037e-05, + "loss": 0.01338217779994011, + "num_input_tokens_seen": 50978488, + "step": 3113, + "train_runtime": 25654.5328, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.8626038781163435, + "grad_norm": 0.06560472398996353, + "learning_rate": 9.85440956609133e-05, + "loss": 0.015453977510333061, + "num_input_tokens_seen": 50994864, + "step": 3114, + "train_runtime": 25662.7743, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.8628808864265928, + "grad_norm": 0.03903624787926674, + "learning_rate": 9.85430425683325e-05, + "loss": 0.01201903447508812, + "num_input_tokens_seen": 51011240, + "step": 3115, + "train_runtime": 25670.9872, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.8631578947368421, + "grad_norm": 0.0832379162311554, + "learning_rate": 9.854198910065613e-05, + "loss": 0.010840998031198978, + "num_input_tokens_seen": 51027616, + "step": 3116, + "train_runtime": 25679.2116, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.8634349030470914, + "grad_norm": 0.07386861741542816, + "learning_rate": 9.854093525789232e-05, + "loss": 0.013505513779819012, + "num_input_tokens_seen": 51043992, + "step": 3117, + "train_runtime": 25687.4369, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.8637119113573407, + "grad_norm": 0.05573912709951401, + "learning_rate": 9.853988104004921e-05, + "loss": 0.01148900669068098, + "num_input_tokens_seen": 51060368, + "step": 3118, + "train_runtime": 25695.6582, + "train_tokens_per_second": 1987.12 + }, + { + "epoch": 0.8639889196675901, + "grad_norm": 0.10032981634140015, + "learning_rate": 9.853882644713494e-05, + "loss": 0.016129784286022186, + "num_input_tokens_seen": 51076744, + "step": 3119, + "train_runtime": 25703.8727, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.8642659279778393, + "grad_norm": 0.09093724936246872, + "learning_rate": 9.853777147915769e-05, + "loss": 0.013716045767068863, + "num_input_tokens_seen": 51093120, + "step": 3120, + "train_runtime": 25712.0906, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.8645429362880886, + "grad_norm": 0.04573426395654678, + "learning_rate": 9.853671613612559e-05, + "loss": 0.012540996074676514, + "num_input_tokens_seen": 51109496, + "step": 3121, + "train_runtime": 25720.3067, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.864819944598338, + "grad_norm": 0.04531633108854294, + "learning_rate": 9.853566041804678e-05, + "loss": 0.011887227185070515, + "num_input_tokens_seen": 51125872, + "step": 3122, + "train_runtime": 25728.5334, + "train_tokens_per_second": 1987.127 + }, + { + "epoch": 0.8650969529085872, + "grad_norm": 0.06930217146873474, + "learning_rate": 9.853460432492944e-05, + "loss": 0.013211116194725037, + "num_input_tokens_seen": 51142248, + "step": 3123, + "train_runtime": 25736.7634, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.8653739612188366, + "grad_norm": 0.040190599858760834, + "learning_rate": 9.853354785678174e-05, + "loss": 0.012819112278521061, + "num_input_tokens_seen": 51158624, + "step": 3124, + "train_runtime": 25744.9935, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.8656509695290858, + "grad_norm": 0.08357550948858261, + "learning_rate": 9.853249101361182e-05, + "loss": 0.01522712130099535, + "num_input_tokens_seen": 51175000, + "step": 3125, + "train_runtime": 25753.2159, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.8659279778393352, + "grad_norm": 0.05221496894955635, + "learning_rate": 9.853143379542784e-05, + "loss": 0.010888982564210892, + "num_input_tokens_seen": 51191376, + "step": 3126, + "train_runtime": 25761.4484, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 0.8662049861495845, + "grad_norm": 0.10823532938957214, + "learning_rate": 9.853037620223799e-05, + "loss": 0.01252523623406887, + "num_input_tokens_seen": 51207752, + "step": 3127, + "train_runtime": 25769.6701, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.8664819944598338, + "grad_norm": 0.04314491152763367, + "learning_rate": 9.852931823405043e-05, + "loss": 0.014054740779101849, + "num_input_tokens_seen": 51224128, + "step": 3128, + "train_runtime": 25777.8897, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.8667590027700831, + "grad_norm": 0.07857277989387512, + "learning_rate": 9.852825989087335e-05, + "loss": 0.01153540425002575, + "num_input_tokens_seen": 51240504, + "step": 3129, + "train_runtime": 25786.1152, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.8670360110803325, + "grad_norm": 0.10624659061431885, + "learning_rate": 9.85272011727149e-05, + "loss": 0.014674916863441467, + "num_input_tokens_seen": 51256880, + "step": 3130, + "train_runtime": 25794.3393, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.8673130193905817, + "grad_norm": 0.05740472301840782, + "learning_rate": 9.85261420795833e-05, + "loss": 0.014019926078617573, + "num_input_tokens_seen": 51273256, + "step": 3131, + "train_runtime": 25802.5659, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.867590027700831, + "grad_norm": 0.048766814172267914, + "learning_rate": 9.85250826114867e-05, + "loss": 0.014000961557030678, + "num_input_tokens_seen": 51289632, + "step": 3132, + "train_runtime": 25810.7965, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.8678670360110803, + "grad_norm": 0.04561290889978409, + "learning_rate": 9.85240227684333e-05, + "loss": 0.014953956939280033, + "num_input_tokens_seen": 51306008, + "step": 3133, + "train_runtime": 25819.0263, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.8681440443213296, + "grad_norm": 0.07715948671102524, + "learning_rate": 9.852296255043129e-05, + "loss": 0.014273329637944698, + "num_input_tokens_seen": 51322384, + "step": 3134, + "train_runtime": 25827.2573, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.868421052631579, + "grad_norm": 0.07195008546113968, + "learning_rate": 9.852190195748885e-05, + "loss": 0.014478351920843124, + "num_input_tokens_seen": 51338760, + "step": 3135, + "train_runtime": 25835.4857, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.8686980609418282, + "grad_norm": 0.054678574204444885, + "learning_rate": 9.852084098961421e-05, + "loss": 0.011246562004089355, + "num_input_tokens_seen": 51355136, + "step": 3136, + "train_runtime": 25843.7203, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.8689750692520776, + "grad_norm": 0.06665392965078354, + "learning_rate": 9.851977964681554e-05, + "loss": 0.015509036369621754, + "num_input_tokens_seen": 51371512, + "step": 3137, + "train_runtime": 25851.9601, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.8692520775623269, + "grad_norm": 0.05466733127832413, + "learning_rate": 9.851871792910102e-05, + "loss": 0.013000753708183765, + "num_input_tokens_seen": 51387888, + "step": 3138, + "train_runtime": 25860.1955, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.8695290858725762, + "grad_norm": 0.044090185314416885, + "learning_rate": 9.85176558364789e-05, + "loss": 0.011785978451371193, + "num_input_tokens_seen": 51404264, + "step": 3139, + "train_runtime": 25868.4205, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.8698060941828255, + "grad_norm": 0.059843286871910095, + "learning_rate": 9.851659336895735e-05, + "loss": 0.01203276589512825, + "num_input_tokens_seen": 51420640, + "step": 3140, + "train_runtime": 25876.6578, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.8700831024930747, + "grad_norm": 0.05476228520274162, + "learning_rate": 9.851553052654463e-05, + "loss": 0.008655362762510777, + "num_input_tokens_seen": 51437016, + "step": 3141, + "train_runtime": 25884.8797, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.8703601108033241, + "grad_norm": 0.05489419773221016, + "learning_rate": 9.851446730924889e-05, + "loss": 0.012026580050587654, + "num_input_tokens_seen": 51453392, + "step": 3142, + "train_runtime": 25893.1115, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.8706371191135734, + "grad_norm": 0.07878071814775467, + "learning_rate": 9.851340371707837e-05, + "loss": 0.01271924376487732, + "num_input_tokens_seen": 51469768, + "step": 3143, + "train_runtime": 25901.3308, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.8709141274238227, + "grad_norm": 0.04488513618707657, + "learning_rate": 9.85123397500413e-05, + "loss": 0.011001655831933022, + "num_input_tokens_seen": 51486144, + "step": 3144, + "train_runtime": 25909.5617, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.871191135734072, + "grad_norm": 0.07004415988922119, + "learning_rate": 9.85112754081459e-05, + "loss": 0.012438801117241383, + "num_input_tokens_seen": 51502520, + "step": 3145, + "train_runtime": 25917.7814, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.8714681440443214, + "grad_norm": 0.06701209396123886, + "learning_rate": 9.851021069140038e-05, + "loss": 0.012158999219536781, + "num_input_tokens_seen": 51518896, + "step": 3146, + "train_runtime": 25926.0068, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.8717451523545706, + "grad_norm": 0.09108272939920425, + "learning_rate": 9.850914559981298e-05, + "loss": 0.014192432165145874, + "num_input_tokens_seen": 51535272, + "step": 3147, + "train_runtime": 25934.2342, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.87202216066482, + "grad_norm": 0.06179644167423248, + "learning_rate": 9.850808013339192e-05, + "loss": 0.01345607079565525, + "num_input_tokens_seen": 51551648, + "step": 3148, + "train_runtime": 25942.4708, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.8722991689750692, + "grad_norm": 0.06002107262611389, + "learning_rate": 9.850701429214544e-05, + "loss": 0.013048871420323849, + "num_input_tokens_seen": 51568024, + "step": 3149, + "train_runtime": 25950.7125, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.8725761772853186, + "grad_norm": 0.049038562923669815, + "learning_rate": 9.850594807608177e-05, + "loss": 0.011078553274273872, + "num_input_tokens_seen": 51584400, + "step": 3150, + "train_runtime": 25958.9473, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.8728531855955679, + "grad_norm": 0.0711856409907341, + "learning_rate": 9.850488148520918e-05, + "loss": 0.012100485153496265, + "num_input_tokens_seen": 51600776, + "step": 3151, + "train_runtime": 25967.1797, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.8731301939058171, + "grad_norm": 0.05176782235503197, + "learning_rate": 9.850381451953586e-05, + "loss": 0.012101370841264725, + "num_input_tokens_seen": 51617152, + "step": 3152, + "train_runtime": 25975.4106, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.8734072022160665, + "grad_norm": 0.0709516704082489, + "learning_rate": 9.850274717907008e-05, + "loss": 0.01217200793325901, + "num_input_tokens_seen": 51633528, + "step": 3153, + "train_runtime": 25983.6409, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.8736842105263158, + "grad_norm": 0.1283256560564041, + "learning_rate": 9.850167946382009e-05, + "loss": 0.014775131829082966, + "num_input_tokens_seen": 51649904, + "step": 3154, + "train_runtime": 25991.8692, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.8739612188365651, + "grad_norm": 0.08657892793416977, + "learning_rate": 9.850061137379413e-05, + "loss": 0.014632993377745152, + "num_input_tokens_seen": 51666280, + "step": 3155, + "train_runtime": 26000.0835, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.8742382271468144, + "grad_norm": 0.0637960210442543, + "learning_rate": 9.849954290900046e-05, + "loss": 0.015842869877815247, + "num_input_tokens_seen": 51682656, + "step": 3156, + "train_runtime": 26008.3034, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.8745152354570637, + "grad_norm": 0.0470198430120945, + "learning_rate": 9.849847406944734e-05, + "loss": 0.012224909849464893, + "num_input_tokens_seen": 51699032, + "step": 3157, + "train_runtime": 26016.5402, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.874792243767313, + "grad_norm": 0.037548620253801346, + "learning_rate": 9.849740485514302e-05, + "loss": 0.010497041046619415, + "num_input_tokens_seen": 51715408, + "step": 3158, + "train_runtime": 26024.7769, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.8750692520775624, + "grad_norm": 0.04966398701071739, + "learning_rate": 9.849633526609578e-05, + "loss": 0.01227495539933443, + "num_input_tokens_seen": 51731784, + "step": 3159, + "train_runtime": 26033.0074, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.8753462603878116, + "grad_norm": 0.06592658162117004, + "learning_rate": 9.849526530231386e-05, + "loss": 0.013125133700668812, + "num_input_tokens_seen": 51748160, + "step": 3160, + "train_runtime": 26041.2158, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.875623268698061, + "grad_norm": 0.07043174654245377, + "learning_rate": 9.849419496380554e-05, + "loss": 0.016416672617197037, + "num_input_tokens_seen": 51764536, + "step": 3161, + "train_runtime": 26049.4242, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 0.8759002770083103, + "grad_norm": 0.05382814630866051, + "learning_rate": 9.849312425057911e-05, + "loss": 0.00994250737130642, + "num_input_tokens_seen": 51780912, + "step": 3162, + "train_runtime": 26057.6319, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 0.8761772853185595, + "grad_norm": 0.05013863742351532, + "learning_rate": 9.849205316264279e-05, + "loss": 0.010965641587972641, + "num_input_tokens_seen": 51797288, + "step": 3163, + "train_runtime": 26065.8417, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 0.8764542936288089, + "grad_norm": 0.04905688017606735, + "learning_rate": 9.849098170000491e-05, + "loss": 0.011893926188349724, + "num_input_tokens_seen": 51813664, + "step": 3164, + "train_runtime": 26074.0565, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.8767313019390581, + "grad_norm": 0.08128813654184341, + "learning_rate": 9.848990986267373e-05, + "loss": 0.014162128791213036, + "num_input_tokens_seen": 51830040, + "step": 3165, + "train_runtime": 26082.2664, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.8770083102493075, + "grad_norm": 0.08510101586580276, + "learning_rate": 9.848883765065753e-05, + "loss": 0.01265966147184372, + "num_input_tokens_seen": 51846416, + "step": 3166, + "train_runtime": 26090.4782, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.8772853185595568, + "grad_norm": 0.0634651631116867, + "learning_rate": 9.848776506396458e-05, + "loss": 0.012518322095274925, + "num_input_tokens_seen": 51862792, + "step": 3167, + "train_runtime": 26098.6901, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 0.8775623268698061, + "grad_norm": 0.04080342873930931, + "learning_rate": 9.848669210260319e-05, + "loss": 0.013427142053842545, + "num_input_tokens_seen": 51879168, + "step": 3168, + "train_runtime": 26106.9011, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.8778393351800554, + "grad_norm": 0.06644147634506226, + "learning_rate": 9.848561876658166e-05, + "loss": 0.013027895241975784, + "num_input_tokens_seen": 51895544, + "step": 3169, + "train_runtime": 26115.11, + "train_tokens_per_second": 1987.185 + }, + { + "epoch": 0.8781163434903048, + "grad_norm": 0.050840843468904495, + "learning_rate": 9.848454505590826e-05, + "loss": 0.012723173946142197, + "num_input_tokens_seen": 51911920, + "step": 3170, + "train_runtime": 26123.3241, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.878393351800554, + "grad_norm": 0.09269953519105911, + "learning_rate": 9.84834709705913e-05, + "loss": 0.009675766341388226, + "num_input_tokens_seen": 51928296, + "step": 3171, + "train_runtime": 26131.5354, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.8786703601108034, + "grad_norm": 0.04480402171611786, + "learning_rate": 9.848239651063906e-05, + "loss": 0.009877492673695087, + "num_input_tokens_seen": 51944672, + "step": 3172, + "train_runtime": 26139.745, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.8789473684210526, + "grad_norm": 0.05521195009350777, + "learning_rate": 9.848132167605987e-05, + "loss": 0.011677587404847145, + "num_input_tokens_seen": 51961048, + "step": 3173, + "train_runtime": 26147.9589, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.8792243767313019, + "grad_norm": 0.055711373686790466, + "learning_rate": 9.848024646686202e-05, + "loss": 0.013274816796183586, + "num_input_tokens_seen": 51977424, + "step": 3174, + "train_runtime": 26156.1746, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.8795013850415513, + "grad_norm": 0.08308987319469452, + "learning_rate": 9.847917088305383e-05, + "loss": 0.01115440484136343, + "num_input_tokens_seen": 51993800, + "step": 3175, + "train_runtime": 26164.4196, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.8797783933518005, + "grad_norm": 0.05179900676012039, + "learning_rate": 9.84780949246436e-05, + "loss": 0.01264860201627016, + "num_input_tokens_seen": 52010176, + "step": 3176, + "train_runtime": 26172.6329, + "train_tokens_per_second": 1987.197 + }, + { + "epoch": 0.8800554016620499, + "grad_norm": 0.04238082468509674, + "learning_rate": 9.847701859163962e-05, + "loss": 0.009498205967247486, + "num_input_tokens_seen": 52026552, + "step": 3177, + "train_runtime": 26180.8457, + "train_tokens_per_second": 1987.199 + }, + { + "epoch": 0.8803324099722992, + "grad_norm": 0.0768587589263916, + "learning_rate": 9.847594188405027e-05, + "loss": 0.014724044129252434, + "num_input_tokens_seen": 52042928, + "step": 3178, + "train_runtime": 26189.0567, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.8806094182825485, + "grad_norm": 0.05762647092342377, + "learning_rate": 9.847486480188381e-05, + "loss": 0.01359645090997219, + "num_input_tokens_seen": 52059304, + "step": 3179, + "train_runtime": 26197.2756, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.8808864265927978, + "grad_norm": 0.05034008249640465, + "learning_rate": 9.847378734514858e-05, + "loss": 0.011081263422966003, + "num_input_tokens_seen": 52075680, + "step": 3180, + "train_runtime": 26205.5056, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 0.881163434903047, + "grad_norm": 0.08567168563604355, + "learning_rate": 9.847270951385292e-05, + "loss": 0.013190810568630695, + "num_input_tokens_seen": 52092056, + "step": 3181, + "train_runtime": 26213.7292, + "train_tokens_per_second": 1987.205 + }, + { + "epoch": 0.8814404432132964, + "grad_norm": 0.11297842115163803, + "learning_rate": 9.847163130800517e-05, + "loss": 0.013200638815760612, + "num_input_tokens_seen": 52108432, + "step": 3182, + "train_runtime": 26221.96, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.8817174515235457, + "grad_norm": 0.03989402577280998, + "learning_rate": 9.847055272761361e-05, + "loss": 0.011130109429359436, + "num_input_tokens_seen": 52124808, + "step": 3183, + "train_runtime": 26230.1922, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.881994459833795, + "grad_norm": 0.07305870950222015, + "learning_rate": 9.846947377268663e-05, + "loss": 0.014002839103341103, + "num_input_tokens_seen": 52141184, + "step": 3184, + "train_runtime": 26238.4182, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.8822714681440443, + "grad_norm": 0.06854282319545746, + "learning_rate": 9.846839444323253e-05, + "loss": 0.012761496938765049, + "num_input_tokens_seen": 52157560, + "step": 3185, + "train_runtime": 26246.6423, + "train_tokens_per_second": 1987.209 + }, + { + "epoch": 0.8825484764542936, + "grad_norm": 0.050872135907411575, + "learning_rate": 9.846731473925966e-05, + "loss": 0.012961538508534431, + "num_input_tokens_seen": 52173936, + "step": 3186, + "train_runtime": 26254.8565, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 0.8828254847645429, + "grad_norm": 0.0575326606631279, + "learning_rate": 9.846623466077635e-05, + "loss": 0.012572428211569786, + "num_input_tokens_seen": 52190312, + "step": 3187, + "train_runtime": 26263.0662, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.8831024930747923, + "grad_norm": 0.05907263234257698, + "learning_rate": 9.846515420779099e-05, + "loss": 0.012694556266069412, + "num_input_tokens_seen": 52206688, + "step": 3188, + "train_runtime": 26271.2704, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 0.8833795013850415, + "grad_norm": 0.06384424865245819, + "learning_rate": 9.846407338031189e-05, + "loss": 0.011155618354678154, + "num_input_tokens_seen": 52223064, + "step": 3189, + "train_runtime": 26279.4796, + "train_tokens_per_second": 1987.218 + }, + { + "epoch": 0.8836565096952909, + "grad_norm": 0.09353955835103989, + "learning_rate": 9.84629921783474e-05, + "loss": 0.013418620452284813, + "num_input_tokens_seen": 52239440, + "step": 3190, + "train_runtime": 26287.7109, + "train_tokens_per_second": 1987.219 + }, + { + "epoch": 0.8839335180055402, + "grad_norm": 0.05476946011185646, + "learning_rate": 9.846191060190591e-05, + "loss": 0.012835712172091007, + "num_input_tokens_seen": 52255816, + "step": 3191, + "train_runtime": 26295.9264, + "train_tokens_per_second": 1987.221 + }, + { + "epoch": 0.8842105263157894, + "grad_norm": 0.09897098690271378, + "learning_rate": 9.846082865099572e-05, + "loss": 0.014130430296063423, + "num_input_tokens_seen": 52272192, + "step": 3192, + "train_runtime": 26304.1357, + "train_tokens_per_second": 1987.223 + }, + { + "epoch": 0.8844875346260388, + "grad_norm": 0.06992805004119873, + "learning_rate": 9.845974632562524e-05, + "loss": 0.011673254892230034, + "num_input_tokens_seen": 52288568, + "step": 3193, + "train_runtime": 26312.3391, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.884764542936288, + "grad_norm": 0.07601481676101685, + "learning_rate": 9.845866362580283e-05, + "loss": 0.011460145004093647, + "num_input_tokens_seen": 52304944, + "step": 3194, + "train_runtime": 26320.5558, + "train_tokens_per_second": 1987.228 + }, + { + "epoch": 0.8850415512465374, + "grad_norm": 0.10082302987575531, + "learning_rate": 9.845758055153683e-05, + "loss": 0.01710282452404499, + "num_input_tokens_seen": 52321320, + "step": 3195, + "train_runtime": 26328.7686, + "train_tokens_per_second": 1987.23 + }, + { + "epoch": 0.8853185595567867, + "grad_norm": 0.05721540376543999, + "learning_rate": 9.845649710283563e-05, + "loss": 0.011549131944775581, + "num_input_tokens_seen": 52337696, + "step": 3196, + "train_runtime": 26336.9738, + "train_tokens_per_second": 1987.233 + }, + { + "epoch": 0.885595567867036, + "grad_norm": 0.078179731965065, + "learning_rate": 9.845541327970758e-05, + "loss": 0.01191410981118679, + "num_input_tokens_seen": 52354072, + "step": 3197, + "train_runtime": 26345.184, + "train_tokens_per_second": 1987.235 + }, + { + "epoch": 0.8858725761772853, + "grad_norm": 0.07521672546863556, + "learning_rate": 9.845432908216107e-05, + "loss": 0.013526412658393383, + "num_input_tokens_seen": 52370448, + "step": 3198, + "train_runtime": 26353.4109, + "train_tokens_per_second": 1987.236 + }, + { + "epoch": 0.8861495844875347, + "grad_norm": 0.05857445299625397, + "learning_rate": 9.84532445102045e-05, + "loss": 0.013361010700464249, + "num_input_tokens_seen": 52386824, + "step": 3199, + "train_runtime": 26361.634, + "train_tokens_per_second": 1987.237 + }, + { + "epoch": 0.8864265927977839, + "grad_norm": 0.09340082854032516, + "learning_rate": 9.84521595638462e-05, + "loss": 0.012338138185441494, + "num_input_tokens_seen": 52403200, + "step": 3200, + "train_runtime": 26369.8567, + "train_tokens_per_second": 1987.239 + }, + { + "epoch": 0.8867036011080333, + "grad_norm": 0.08583322167396545, + "learning_rate": 9.845107424309459e-05, + "loss": 0.013119630515575409, + "num_input_tokens_seen": 52419576, + "step": 3201, + "train_runtime": 26379.8416, + "train_tokens_per_second": 1987.107 + }, + { + "epoch": 0.8869806094182825, + "grad_norm": 0.062004707753658295, + "learning_rate": 9.844998854795806e-05, + "loss": 0.01061323843896389, + "num_input_tokens_seen": 52435952, + "step": 3202, + "train_runtime": 26388.0607, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.8872576177285318, + "grad_norm": 0.051840052008628845, + "learning_rate": 9.844890247844497e-05, + "loss": 0.00897449441254139, + "num_input_tokens_seen": 52452328, + "step": 3203, + "train_runtime": 26396.2849, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.8875346260387812, + "grad_norm": 0.06774044036865234, + "learning_rate": 9.844781603456372e-05, + "loss": 0.014472698792815208, + "num_input_tokens_seen": 52468704, + "step": 3204, + "train_runtime": 26404.4994, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.8878116343490304, + "grad_norm": 0.0698368176817894, + "learning_rate": 9.844672921632274e-05, + "loss": 0.014143327251076698, + "num_input_tokens_seen": 52485080, + "step": 3205, + "train_runtime": 26412.7213, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.8880886426592798, + "grad_norm": 0.09101536124944687, + "learning_rate": 9.844564202373039e-05, + "loss": 0.016152270138263702, + "num_input_tokens_seen": 52501456, + "step": 3206, + "train_runtime": 26420.9456, + "train_tokens_per_second": 1987.115 + }, + { + "epoch": 0.8883656509695291, + "grad_norm": 0.0771101787686348, + "learning_rate": 9.844455445679507e-05, + "loss": 0.014451547525823116, + "num_input_tokens_seen": 52517832, + "step": 3207, + "train_runtime": 26429.1738, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.8886426592797784, + "grad_norm": 0.045235585421323776, + "learning_rate": 9.84434665155252e-05, + "loss": 0.01341484859585762, + "num_input_tokens_seen": 52534208, + "step": 3208, + "train_runtime": 26437.3996, + "train_tokens_per_second": 1987.117 + }, + { + "epoch": 0.8889196675900277, + "grad_norm": 0.04472764953970909, + "learning_rate": 9.844237819992918e-05, + "loss": 0.01045423373579979, + "num_input_tokens_seen": 52550584, + "step": 3209, + "train_runtime": 26445.6287, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.889196675900277, + "grad_norm": 0.08165770024061203, + "learning_rate": 9.844128951001544e-05, + "loss": 0.011845387518405914, + "num_input_tokens_seen": 52566960, + "step": 3210, + "train_runtime": 26453.8568, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.8894736842105263, + "grad_norm": 0.09034226089715958, + "learning_rate": 9.844020044579237e-05, + "loss": 0.014074466191232204, + "num_input_tokens_seen": 52583336, + "step": 3211, + "train_runtime": 26462.0886, + "train_tokens_per_second": 1987.12 + }, + { + "epoch": 0.8897506925207757, + "grad_norm": 0.06279339641332626, + "learning_rate": 9.843911100726837e-05, + "loss": 0.013494543731212616, + "num_input_tokens_seen": 52599712, + "step": 3212, + "train_runtime": 26470.3145, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.8900277008310249, + "grad_norm": 0.07301562279462814, + "learning_rate": 9.84380211944519e-05, + "loss": 0.013673895969986916, + "num_input_tokens_seen": 52616088, + "step": 3213, + "train_runtime": 26478.5475, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.8903047091412742, + "grad_norm": 0.11745452880859375, + "learning_rate": 9.843693100735134e-05, + "loss": 0.01267430279403925, + "num_input_tokens_seen": 52632464, + "step": 3214, + "train_runtime": 26486.7709, + "train_tokens_per_second": 1987.123 + }, + { + "epoch": 0.8905817174515236, + "grad_norm": 0.07619405537843704, + "learning_rate": 9.843584044597513e-05, + "loss": 0.012970438227057457, + "num_input_tokens_seen": 52648840, + "step": 3215, + "train_runtime": 26494.994, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.8908587257617728, + "grad_norm": 0.05832138657569885, + "learning_rate": 9.843474951033171e-05, + "loss": 0.011677877977490425, + "num_input_tokens_seen": 52665216, + "step": 3216, + "train_runtime": 26503.2099, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.8911357340720222, + "grad_norm": 0.09040416032075882, + "learning_rate": 9.84336582004295e-05, + "loss": 0.01670926623046398, + "num_input_tokens_seen": 52681592, + "step": 3217, + "train_runtime": 26511.4368, + "train_tokens_per_second": 1987.127 + }, + { + "epoch": 0.8914127423822714, + "grad_norm": 0.06677490472793579, + "learning_rate": 9.843256651627693e-05, + "loss": 0.011260585859417915, + "num_input_tokens_seen": 52697968, + "step": 3218, + "train_runtime": 26519.6645, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.8916897506925208, + "grad_norm": 0.060873162001371384, + "learning_rate": 9.843147445788244e-05, + "loss": 0.011093074455857277, + "num_input_tokens_seen": 52714344, + "step": 3219, + "train_runtime": 26527.8973, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.8919667590027701, + "grad_norm": 0.052379049360752106, + "learning_rate": 9.843038202525447e-05, + "loss": 0.01049839612096548, + "num_input_tokens_seen": 52730720, + "step": 3220, + "train_runtime": 26536.1258, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.8922437673130194, + "grad_norm": 0.06261297315359116, + "learning_rate": 9.842928921840144e-05, + "loss": 0.013568068854510784, + "num_input_tokens_seen": 52747096, + "step": 3221, + "train_runtime": 26544.3579, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.8925207756232687, + "grad_norm": 0.052830588072538376, + "learning_rate": 9.842819603733182e-05, + "loss": 0.012382186017930508, + "num_input_tokens_seen": 52763472, + "step": 3222, + "train_runtime": 26552.5756, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.892797783933518, + "grad_norm": 0.060427721589803696, + "learning_rate": 9.842710248205405e-05, + "loss": 0.01153691578656435, + "num_input_tokens_seen": 52779848, + "step": 3223, + "train_runtime": 26560.8053, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.8930747922437673, + "grad_norm": 0.06282243877649307, + "learning_rate": 9.842600855257658e-05, + "loss": 0.009244867600500584, + "num_input_tokens_seen": 52796224, + "step": 3224, + "train_runtime": 26569.0376, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.8933518005540166, + "grad_norm": 0.06610157340765, + "learning_rate": 9.842491424890787e-05, + "loss": 0.011176707223057747, + "num_input_tokens_seen": 52812600, + "step": 3225, + "train_runtime": 26577.2688, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.8936288088642659, + "grad_norm": 0.08808278292417526, + "learning_rate": 9.842381957105636e-05, + "loss": 0.012866740114986897, + "num_input_tokens_seen": 52828976, + "step": 3226, + "train_runtime": 26585.4965, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.8939058171745152, + "grad_norm": 0.09301361441612244, + "learning_rate": 9.842272451903049e-05, + "loss": 0.013836421072483063, + "num_input_tokens_seen": 52845352, + "step": 3227, + "train_runtime": 26593.7321, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 0.8941828254847646, + "grad_norm": 0.06689035147428513, + "learning_rate": 9.842162909283879e-05, + "loss": 0.013617895543575287, + "num_input_tokens_seen": 52861728, + "step": 3228, + "train_runtime": 26601.9661, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 0.8944598337950138, + "grad_norm": 0.078070729970932, + "learning_rate": 9.842053329248966e-05, + "loss": 0.011860020458698273, + "num_input_tokens_seen": 52878104, + "step": 3229, + "train_runtime": 26610.1952, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.8947368421052632, + "grad_norm": 0.03946702554821968, + "learning_rate": 9.841943711799158e-05, + "loss": 0.01436656154692173, + "num_input_tokens_seen": 52894480, + "step": 3230, + "train_runtime": 26618.4334, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.8950138504155125, + "grad_norm": 0.07142075896263123, + "learning_rate": 9.841834056935304e-05, + "loss": 0.012369544245302677, + "num_input_tokens_seen": 52910856, + "step": 3231, + "train_runtime": 26626.669, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.8952908587257618, + "grad_norm": 0.05603375285863876, + "learning_rate": 9.84172436465825e-05, + "loss": 0.010334598831832409, + "num_input_tokens_seen": 52927232, + "step": 3232, + "train_runtime": 26634.8848, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.8955678670360111, + "grad_norm": 0.07648472487926483, + "learning_rate": 9.841614634968843e-05, + "loss": 0.014453429728746414, + "num_input_tokens_seen": 52943608, + "step": 3233, + "train_runtime": 26643.1142, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.8958448753462603, + "grad_norm": 0.04335494711995125, + "learning_rate": 9.841504867867933e-05, + "loss": 0.016218531876802444, + "num_input_tokens_seen": 52959984, + "step": 3234, + "train_runtime": 26651.3385, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.8961218836565097, + "grad_norm": 0.06676878780126572, + "learning_rate": 9.841395063356367e-05, + "loss": 0.014549464918673038, + "num_input_tokens_seen": 52976360, + "step": 3235, + "train_runtime": 26659.5668, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.896398891966759, + "grad_norm": 0.12459925562143326, + "learning_rate": 9.841285221434993e-05, + "loss": 0.01478135958313942, + "num_input_tokens_seen": 52992736, + "step": 3236, + "train_runtime": 26667.7966, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.8966759002770083, + "grad_norm": 0.12076133489608765, + "learning_rate": 9.841175342104661e-05, + "loss": 0.012324195355176926, + "num_input_tokens_seen": 53009112, + "step": 3237, + "train_runtime": 26676.0193, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.8969529085872576, + "grad_norm": 0.05907126143574715, + "learning_rate": 9.841065425366217e-05, + "loss": 0.011883776634931564, + "num_input_tokens_seen": 53025488, + "step": 3238, + "train_runtime": 26684.2285, + "train_tokens_per_second": 1987.147 + }, + { + "epoch": 0.897229916897507, + "grad_norm": 0.087920643389225, + "learning_rate": 9.840955471220514e-05, + "loss": 0.016818637028336525, + "num_input_tokens_seen": 53041864, + "step": 3239, + "train_runtime": 26692.4385, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 0.8975069252077562, + "grad_norm": 0.05174710601568222, + "learning_rate": 9.840845479668399e-05, + "loss": 0.008841082453727722, + "num_input_tokens_seen": 53058240, + "step": 3240, + "train_runtime": 26700.6655, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.8977839335180056, + "grad_norm": 0.06760042160749435, + "learning_rate": 9.840735450710724e-05, + "loss": 0.01392355002462864, + "num_input_tokens_seen": 53074616, + "step": 3241, + "train_runtime": 26708.8964, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.8980609418282548, + "grad_norm": 0.05747731029987335, + "learning_rate": 9.840625384348339e-05, + "loss": 0.014696772210299969, + "num_input_tokens_seen": 53090992, + "step": 3242, + "train_runtime": 26717.1293, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.8983379501385041, + "grad_norm": 0.07307559996843338, + "learning_rate": 9.840515280582092e-05, + "loss": 0.014424911700189114, + "num_input_tokens_seen": 53107368, + "step": 3243, + "train_runtime": 26725.3604, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.8986149584487535, + "grad_norm": 0.06936611235141754, + "learning_rate": 9.840405139412836e-05, + "loss": 0.014613274484872818, + "num_input_tokens_seen": 53123744, + "step": 3244, + "train_runtime": 26733.5986, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.8988919667590027, + "grad_norm": 0.05342936888337135, + "learning_rate": 9.840294960841423e-05, + "loss": 0.011432912200689316, + "num_input_tokens_seen": 53140120, + "step": 3245, + "train_runtime": 26741.8291, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.8991689750692521, + "grad_norm": 0.06893877685070038, + "learning_rate": 9.840184744868701e-05, + "loss": 0.012158434838056564, + "num_input_tokens_seen": 53156496, + "step": 3246, + "train_runtime": 26750.0638, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.8994459833795014, + "grad_norm": 0.05573328956961632, + "learning_rate": 9.840074491495523e-05, + "loss": 0.014063281938433647, + "num_input_tokens_seen": 53172872, + "step": 3247, + "train_runtime": 26758.2943, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.8997229916897507, + "grad_norm": 0.043872538954019547, + "learning_rate": 9.839964200722743e-05, + "loss": 0.01311839185655117, + "num_input_tokens_seen": 53189248, + "step": 3248, + "train_runtime": 26766.5175, + "train_tokens_per_second": 1987.156 + }, + { + "epoch": 0.9, + "grad_norm": 0.06560183316469193, + "learning_rate": 9.839853872551212e-05, + "loss": 0.014082060195505619, + "num_input_tokens_seen": 53205624, + "step": 3249, + "train_runtime": 26774.7434, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.9002770083102493, + "grad_norm": 0.04893656447529793, + "learning_rate": 9.839743506981782e-05, + "loss": 0.011561529710888863, + "num_input_tokens_seen": 53222000, + "step": 3250, + "train_runtime": 26782.9732, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.9005540166204986, + "grad_norm": 0.08122606575489044, + "learning_rate": 9.839633104015305e-05, + "loss": 0.012077066116034985, + "num_input_tokens_seen": 53238376, + "step": 3251, + "train_runtime": 26791.1981, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 0.900831024930748, + "grad_norm": 0.05817558243870735, + "learning_rate": 9.839522663652635e-05, + "loss": 0.015231628902256489, + "num_input_tokens_seen": 53254752, + "step": 3252, + "train_runtime": 26799.4352, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 0.9011080332409972, + "grad_norm": 0.06634731590747833, + "learning_rate": 9.839412185894628e-05, + "loss": 0.015181874856352806, + "num_input_tokens_seen": 53271128, + "step": 3253, + "train_runtime": 26807.6587, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.9013850415512465, + "grad_norm": 0.03624751418828964, + "learning_rate": 9.839301670742134e-05, + "loss": 0.011698653921484947, + "num_input_tokens_seen": 53287504, + "step": 3254, + "train_runtime": 26815.8859, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 0.9016620498614959, + "grad_norm": 0.07120251655578613, + "learning_rate": 9.839191118196007e-05, + "loss": 0.016394540667533875, + "num_input_tokens_seen": 53303880, + "step": 3255, + "train_runtime": 26824.1032, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.9019390581717451, + "grad_norm": 0.0683327317237854, + "learning_rate": 9.839080528257104e-05, + "loss": 0.01460427325218916, + "num_input_tokens_seen": 53320256, + "step": 3256, + "train_runtime": 26832.3196, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.9022160664819945, + "grad_norm": 0.05673768371343613, + "learning_rate": 9.838969900926277e-05, + "loss": 0.011751855723559856, + "num_input_tokens_seen": 53336632, + "step": 3257, + "train_runtime": 26840.531, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.9024930747922437, + "grad_norm": 0.08588428050279617, + "learning_rate": 9.838859236204382e-05, + "loss": 0.01286538876593113, + "num_input_tokens_seen": 53353008, + "step": 3258, + "train_runtime": 26848.7388, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.9027700831024931, + "grad_norm": 0.12261991947889328, + "learning_rate": 9.838748534092274e-05, + "loss": 0.01154380850493908, + "num_input_tokens_seen": 53369384, + "step": 3259, + "train_runtime": 26856.9558, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 0.9030470914127424, + "grad_norm": 0.04910409450531006, + "learning_rate": 9.838637794590808e-05, + "loss": 0.010512185283005238, + "num_input_tokens_seen": 53385760, + "step": 3260, + "train_runtime": 26865.1741, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.9033240997229917, + "grad_norm": 0.049258243292570114, + "learning_rate": 9.83852701770084e-05, + "loss": 0.010818629525601864, + "num_input_tokens_seen": 53402136, + "step": 3261, + "train_runtime": 26873.3874, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.903601108033241, + "grad_norm": 0.04757596552371979, + "learning_rate": 9.838416203423226e-05, + "loss": 0.01332101970911026, + "num_input_tokens_seen": 53418512, + "step": 3262, + "train_runtime": 26881.5964, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.9038781163434904, + "grad_norm": 0.11364039033651352, + "learning_rate": 9.838305351758823e-05, + "loss": 0.013193977065384388, + "num_input_tokens_seen": 53434888, + "step": 3263, + "train_runtime": 26889.8092, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 0.9041551246537396, + "grad_norm": 0.0716327354311943, + "learning_rate": 9.838194462708485e-05, + "loss": 0.01614002138376236, + "num_input_tokens_seen": 53451264, + "step": 3264, + "train_runtime": 26898.0243, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.9044321329639889, + "grad_norm": 0.07672341912984848, + "learning_rate": 9.838083536273073e-05, + "loss": 0.011599855497479439, + "num_input_tokens_seen": 53467640, + "step": 3265, + "train_runtime": 26906.235, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 0.9047091412742382, + "grad_norm": 0.08764645457267761, + "learning_rate": 9.83797257245344e-05, + "loss": 0.00911495927721262, + "num_input_tokens_seen": 53484016, + "step": 3266, + "train_runtime": 26914.4431, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.9049861495844875, + "grad_norm": 0.03328318893909454, + "learning_rate": 9.837861571250445e-05, + "loss": 0.009278730489313602, + "num_input_tokens_seen": 53500392, + "step": 3267, + "train_runtime": 26922.671, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.9052631578947369, + "grad_norm": 0.06963762640953064, + "learning_rate": 9.837750532664947e-05, + "loss": 0.012943669222295284, + "num_input_tokens_seen": 53516768, + "step": 3268, + "train_runtime": 26930.9044, + "train_tokens_per_second": 1987.188 + }, + { + "epoch": 0.9055401662049861, + "grad_norm": 0.04102727770805359, + "learning_rate": 9.837639456697803e-05, + "loss": 0.013433863408863544, + "num_input_tokens_seen": 53533144, + "step": 3269, + "train_runtime": 26939.1353, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.9058171745152355, + "grad_norm": 0.04699486121535301, + "learning_rate": 9.837528343349868e-05, + "loss": 0.011308424174785614, + "num_input_tokens_seen": 53549520, + "step": 3270, + "train_runtime": 26947.3629, + "train_tokens_per_second": 1987.19 + }, + { + "epoch": 0.9060941828254848, + "grad_norm": 0.07727546989917755, + "learning_rate": 9.837417192622008e-05, + "loss": 0.016293026506900787, + "num_input_tokens_seen": 53565896, + "step": 3271, + "train_runtime": 26955.5885, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.9063711911357341, + "grad_norm": 0.10956088453531265, + "learning_rate": 9.837306004515076e-05, + "loss": 0.0148494653403759, + "num_input_tokens_seen": 53582272, + "step": 3272, + "train_runtime": 26963.8139, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 0.9066481994459834, + "grad_norm": 0.06799192726612091, + "learning_rate": 9.837194779029933e-05, + "loss": 0.014452828094363213, + "num_input_tokens_seen": 53598648, + "step": 3273, + "train_runtime": 26972.0415, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.9069252077562326, + "grad_norm": 0.051187749952077866, + "learning_rate": 9.837083516167438e-05, + "loss": 0.012632009573280811, + "num_input_tokens_seen": 53615024, + "step": 3274, + "train_runtime": 26980.2656, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 0.907202216066482, + "grad_norm": 0.07660068571567535, + "learning_rate": 9.83697221592845e-05, + "loss": 0.010875511914491653, + "num_input_tokens_seen": 53631400, + "step": 3275, + "train_runtime": 26988.4992, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.9074792243767313, + "grad_norm": 0.05309059098362923, + "learning_rate": 9.836860878313831e-05, + "loss": 0.013445901684463024, + "num_input_tokens_seen": 53647776, + "step": 3276, + "train_runtime": 26996.7315, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.9077562326869806, + "grad_norm": 0.041371721774339676, + "learning_rate": 9.836749503324442e-05, + "loss": 0.011613600887358189, + "num_input_tokens_seen": 53664152, + "step": 3277, + "train_runtime": 27004.9587, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.9080332409972299, + "grad_norm": 0.07921940833330154, + "learning_rate": 9.83663809096114e-05, + "loss": 0.012895813211798668, + "num_input_tokens_seen": 53680528, + "step": 3278, + "train_runtime": 27013.1804, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.9083102493074793, + "grad_norm": 0.07560623437166214, + "learning_rate": 9.836526641224788e-05, + "loss": 0.013682825490832329, + "num_input_tokens_seen": 53696904, + "step": 3279, + "train_runtime": 27021.3942, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.9085872576177285, + "grad_norm": 0.05941716581583023, + "learning_rate": 9.836415154116248e-05, + "loss": 0.011469023302197456, + "num_input_tokens_seen": 53713280, + "step": 3280, + "train_runtime": 27029.6208, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.9088642659279779, + "grad_norm": 0.033809587359428406, + "learning_rate": 9.836303629636379e-05, + "loss": 0.011571134440600872, + "num_input_tokens_seen": 53729656, + "step": 3281, + "train_runtime": 27037.8552, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.9091412742382271, + "grad_norm": 0.05035177618265152, + "learning_rate": 9.836192067786045e-05, + "loss": 0.01212283968925476, + "num_input_tokens_seen": 53746032, + "step": 3282, + "train_runtime": 27046.079, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 0.9094182825484765, + "grad_norm": 0.055411551147699356, + "learning_rate": 9.836080468566107e-05, + "loss": 0.01433553360402584, + "num_input_tokens_seen": 53762408, + "step": 3283, + "train_runtime": 27054.3154, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.9096952908587258, + "grad_norm": 0.07064571231603622, + "learning_rate": 9.835968831977428e-05, + "loss": 0.013533788733184338, + "num_input_tokens_seen": 53778784, + "step": 3284, + "train_runtime": 27062.5562, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.909972299168975, + "grad_norm": 0.06179019808769226, + "learning_rate": 9.83585715802087e-05, + "loss": 0.01385747455060482, + "num_input_tokens_seen": 53795160, + "step": 3285, + "train_runtime": 27070.7733, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 0.9102493074792244, + "grad_norm": 0.09797976911067963, + "learning_rate": 9.835745446697296e-05, + "loss": 0.016143254935741425, + "num_input_tokens_seen": 53811536, + "step": 3286, + "train_runtime": 27078.9874, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.9105263157894737, + "grad_norm": 0.06096421927213669, + "learning_rate": 9.83563369800757e-05, + "loss": 0.01548079028725624, + "num_input_tokens_seen": 53827912, + "step": 3287, + "train_runtime": 27087.2004, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.910803324099723, + "grad_norm": 0.07985594123601913, + "learning_rate": 9.835521911952555e-05, + "loss": 0.016479207202792168, + "num_input_tokens_seen": 53844288, + "step": 3288, + "train_runtime": 27095.4152, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.9110803324099723, + "grad_norm": 0.05087704211473465, + "learning_rate": 9.835410088533112e-05, + "loss": 0.011400139890611172, + "num_input_tokens_seen": 53860664, + "step": 3289, + "train_runtime": 27103.6244, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.9113573407202216, + "grad_norm": 0.05508463829755783, + "learning_rate": 9.835298227750111e-05, + "loss": 0.012788405641913414, + "num_input_tokens_seen": 53877040, + "step": 3290, + "train_runtime": 27111.836, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 0.9116343490304709, + "grad_norm": 0.060181859880685806, + "learning_rate": 9.835186329604412e-05, + "loss": 0.011950353160500526, + "num_input_tokens_seen": 53893416, + "step": 3291, + "train_runtime": 27120.0554, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 0.9119113573407203, + "grad_norm": 0.06228428706526756, + "learning_rate": 9.83507439409688e-05, + "loss": 0.012884742580354214, + "num_input_tokens_seen": 53909792, + "step": 3292, + "train_runtime": 27128.2658, + "train_tokens_per_second": 1987.219 + }, + { + "epoch": 0.9121883656509695, + "grad_norm": 0.06818456202745438, + "learning_rate": 9.834962421228381e-05, + "loss": 0.011429829522967339, + "num_input_tokens_seen": 53926168, + "step": 3293, + "train_runtime": 27136.4735, + "train_tokens_per_second": 1987.221 + }, + { + "epoch": 0.9124653739612189, + "grad_norm": 0.043972063809633255, + "learning_rate": 9.83485041099978e-05, + "loss": 0.01333948690444231, + "num_input_tokens_seen": 53942544, + "step": 3294, + "train_runtime": 27144.7015, + "train_tokens_per_second": 1987.222 + }, + { + "epoch": 0.9127423822714681, + "grad_norm": 0.058974117040634155, + "learning_rate": 9.834738363411941e-05, + "loss": 0.012472731992602348, + "num_input_tokens_seen": 53958920, + "step": 3295, + "train_runtime": 27152.9334, + "train_tokens_per_second": 1987.222 + }, + { + "epoch": 0.9130193905817174, + "grad_norm": 0.08808322995901108, + "learning_rate": 9.834626278465733e-05, + "loss": 0.01286469865590334, + "num_input_tokens_seen": 53975296, + "step": 3296, + "train_runtime": 27161.1611, + "train_tokens_per_second": 1987.223 + }, + { + "epoch": 0.9132963988919668, + "grad_norm": 0.045647986233234406, + "learning_rate": 9.834514156162021e-05, + "loss": 0.013595884665846825, + "num_input_tokens_seen": 53991672, + "step": 3297, + "train_runtime": 27169.3894, + "train_tokens_per_second": 1987.224 + }, + { + "epoch": 0.913573407202216, + "grad_norm": 0.055740032345056534, + "learning_rate": 9.834401996501669e-05, + "loss": 0.012963525950908661, + "num_input_tokens_seen": 54008048, + "step": 3298, + "train_runtime": 27177.6203, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 0.9138504155124654, + "grad_norm": 0.05277996510267258, + "learning_rate": 9.834289799485545e-05, + "loss": 0.012285958044230938, + "num_input_tokens_seen": 54024424, + "step": 3299, + "train_runtime": 27185.8419, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.9141274238227147, + "grad_norm": 0.09178784489631653, + "learning_rate": 9.834177565114517e-05, + "loss": 0.01185586303472519, + "num_input_tokens_seen": 54040800, + "step": 3300, + "train_runtime": 27194.0685, + "train_tokens_per_second": 1987.227 + }, + { + "epoch": 0.914404432132964, + "grad_norm": 0.03408125787973404, + "learning_rate": 9.834065293389452e-05, + "loss": 0.01089492067694664, + "num_input_tokens_seen": 54057176, + "step": 3301, + "train_runtime": 27204.0434, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.9146814404432133, + "grad_norm": 0.055461522191762924, + "learning_rate": 9.833952984311216e-05, + "loss": 0.013150589540600777, + "num_input_tokens_seen": 54073552, + "step": 3302, + "train_runtime": 27212.2728, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.9149584487534625, + "grad_norm": 0.042485687881708145, + "learning_rate": 9.833840637880678e-05, + "loss": 0.01164222788065672, + "num_input_tokens_seen": 54089928, + "step": 3303, + "train_runtime": 27220.4941, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.9152354570637119, + "grad_norm": 0.06109447777271271, + "learning_rate": 9.833728254098706e-05, + "loss": 0.01412368007004261, + "num_input_tokens_seen": 54106304, + "step": 3304, + "train_runtime": 27228.7263, + "train_tokens_per_second": 1987.104 + }, + { + "epoch": 0.9155124653739612, + "grad_norm": 0.048097629100084305, + "learning_rate": 9.833615832966168e-05, + "loss": 0.010440709069371223, + "num_input_tokens_seen": 54122680, + "step": 3305, + "train_runtime": 27236.9578, + "train_tokens_per_second": 1987.104 + }, + { + "epoch": 0.9157894736842105, + "grad_norm": 0.052636370062828064, + "learning_rate": 9.833503374483933e-05, + "loss": 0.013599002733826637, + "num_input_tokens_seen": 54139056, + "step": 3306, + "train_runtime": 27245.1884, + "train_tokens_per_second": 1987.105 + }, + { + "epoch": 0.9160664819944598, + "grad_norm": 0.049276042729616165, + "learning_rate": 9.83339087865287e-05, + "loss": 0.012597579509019852, + "num_input_tokens_seen": 54155432, + "step": 3307, + "train_runtime": 27253.416, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.9163434903047092, + "grad_norm": 0.07358361780643463, + "learning_rate": 9.83327834547385e-05, + "loss": 0.012537290342152119, + "num_input_tokens_seen": 54171808, + "step": 3308, + "train_runtime": 27261.6403, + "train_tokens_per_second": 1987.107 + }, + { + "epoch": 0.9166204986149584, + "grad_norm": 0.05128917470574379, + "learning_rate": 9.833165774947739e-05, + "loss": 0.014516521245241165, + "num_input_tokens_seen": 54188184, + "step": 3309, + "train_runtime": 27269.8655, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.9168975069252078, + "grad_norm": 0.08900585770606995, + "learning_rate": 9.83305316707541e-05, + "loss": 0.014075737446546555, + "num_input_tokens_seen": 54204560, + "step": 3310, + "train_runtime": 27278.0932, + "train_tokens_per_second": 1987.11 + }, + { + "epoch": 0.917174515235457, + "grad_norm": 0.05611032992601395, + "learning_rate": 9.83294052185773e-05, + "loss": 0.014109053649008274, + "num_input_tokens_seen": 54220936, + "step": 3311, + "train_runtime": 27286.3205, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.9174515235457064, + "grad_norm": 0.07576622068881989, + "learning_rate": 9.832827839295573e-05, + "loss": 0.013502559624612331, + "num_input_tokens_seen": 54237312, + "step": 3312, + "train_runtime": 27294.5582, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.9177285318559557, + "grad_norm": 0.03428316116333008, + "learning_rate": 9.832715119389805e-05, + "loss": 0.00976739451289177, + "num_input_tokens_seen": 54253688, + "step": 3313, + "train_runtime": 27302.7807, + "train_tokens_per_second": 1987.112 + }, + { + "epoch": 0.918005540166205, + "grad_norm": 0.1164865717291832, + "learning_rate": 9.832602362141303e-05, + "loss": 0.014435664750635624, + "num_input_tokens_seen": 54270064, + "step": 3314, + "train_runtime": 27310.9994, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.9182825484764543, + "grad_norm": 0.06861692667007446, + "learning_rate": 9.832489567550935e-05, + "loss": 0.01319881621748209, + "num_input_tokens_seen": 54286440, + "step": 3315, + "train_runtime": 27319.2133, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.9185595567867036, + "grad_norm": 0.0551634319126606, + "learning_rate": 9.832376735619572e-05, + "loss": 0.01538463681936264, + "num_input_tokens_seen": 54302816, + "step": 3316, + "train_runtime": 27327.4226, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.9188365650969529, + "grad_norm": 0.0666244700551033, + "learning_rate": 9.832263866348086e-05, + "loss": 0.010357881896197796, + "num_input_tokens_seen": 54319192, + "step": 3317, + "train_runtime": 27335.6361, + "train_tokens_per_second": 1987.12 + }, + { + "epoch": 0.9191135734072022, + "grad_norm": 0.03848222270607948, + "learning_rate": 9.83215095973735e-05, + "loss": 0.012389197945594788, + "num_input_tokens_seen": 54335568, + "step": 3318, + "train_runtime": 27343.8552, + "train_tokens_per_second": 1987.122 + }, + { + "epoch": 0.9193905817174515, + "grad_norm": 0.054118745028972626, + "learning_rate": 9.832038015788238e-05, + "loss": 0.009524069726467133, + "num_input_tokens_seen": 54351944, + "step": 3319, + "train_runtime": 27352.0676, + "train_tokens_per_second": 1987.124 + }, + { + "epoch": 0.9196675900277008, + "grad_norm": 0.09011952579021454, + "learning_rate": 9.831925034501619e-05, + "loss": 0.012214712798595428, + "num_input_tokens_seen": 54368320, + "step": 3320, + "train_runtime": 27360.2905, + "train_tokens_per_second": 1987.125 + }, + { + "epoch": 0.9199445983379502, + "grad_norm": 0.04507458209991455, + "learning_rate": 9.831812015878368e-05, + "loss": 0.011899257078766823, + "num_input_tokens_seen": 54384696, + "step": 3321, + "train_runtime": 27368.5071, + "train_tokens_per_second": 1987.127 + }, + { + "epoch": 0.9202216066481994, + "grad_norm": 0.0656532570719719, + "learning_rate": 9.831698959919359e-05, + "loss": 0.011285647749900818, + "num_input_tokens_seen": 54401072, + "step": 3322, + "train_runtime": 27376.7375, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.9204986149584488, + "grad_norm": 0.08386757224798203, + "learning_rate": 9.831585866625465e-05, + "loss": 0.010682555846869946, + "num_input_tokens_seen": 54417448, + "step": 3323, + "train_runtime": 27384.9682, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.9207756232686981, + "grad_norm": 0.08345803618431091, + "learning_rate": 9.831472735997559e-05, + "loss": 0.01205068826675415, + "num_input_tokens_seen": 54433824, + "step": 3324, + "train_runtime": 27393.1987, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.9210526315789473, + "grad_norm": 0.06436320394277573, + "learning_rate": 9.831359568036516e-05, + "loss": 0.010692065581679344, + "num_input_tokens_seen": 54450200, + "step": 3325, + "train_runtime": 27401.4282, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 0.9213296398891967, + "grad_norm": 0.05759985372424126, + "learning_rate": 9.83124636274321e-05, + "loss": 0.013111688196659088, + "num_input_tokens_seen": 54466576, + "step": 3326, + "train_runtime": 27409.6565, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 0.9216066481994459, + "grad_norm": 0.07954785227775574, + "learning_rate": 9.831133120118516e-05, + "loss": 0.010524911805987358, + "num_input_tokens_seen": 54482952, + "step": 3327, + "train_runtime": 27417.8876, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 0.9218836565096953, + "grad_norm": 0.056869156658649445, + "learning_rate": 9.831019840163311e-05, + "loss": 0.013516183942556381, + "num_input_tokens_seen": 54499328, + "step": 3328, + "train_runtime": 27426.1148, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 0.9221606648199446, + "grad_norm": 0.056787945330142975, + "learning_rate": 9.830906522878466e-05, + "loss": 0.011057994328439236, + "num_input_tokens_seen": 54515704, + "step": 3329, + "train_runtime": 27434.34, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.9224376731301939, + "grad_norm": 0.05950992554426193, + "learning_rate": 9.830793168264861e-05, + "loss": 0.012139319442212582, + "num_input_tokens_seen": 54532080, + "step": 3330, + "train_runtime": 27442.5695, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.9227146814404432, + "grad_norm": 0.0692841038107872, + "learning_rate": 9.83067977632337e-05, + "loss": 0.011683146469295025, + "num_input_tokens_seen": 54548456, + "step": 3331, + "train_runtime": 27450.7992, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 0.9229916897506926, + "grad_norm": 0.06614711880683899, + "learning_rate": 9.830566347054868e-05, + "loss": 0.013213127851486206, + "num_input_tokens_seen": 54564832, + "step": 3332, + "train_runtime": 27459.0253, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.9232686980609418, + "grad_norm": 0.03920547664165497, + "learning_rate": 9.830452880460232e-05, + "loss": 0.012382379733026028, + "num_input_tokens_seen": 54581208, + "step": 3333, + "train_runtime": 27467.2555, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.9235457063711912, + "grad_norm": 0.09828086197376251, + "learning_rate": 9.83033937654034e-05, + "loss": 0.01276595238596201, + "num_input_tokens_seen": 54597584, + "step": 3334, + "train_runtime": 27475.4812, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 0.9238227146814404, + "grad_norm": 0.07624299079179764, + "learning_rate": 9.83022583529607e-05, + "loss": 0.012607828713953495, + "num_input_tokens_seen": 54613960, + "step": 3335, + "train_runtime": 27483.713, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.9240997229916897, + "grad_norm": 0.07213998585939407, + "learning_rate": 9.830112256728296e-05, + "loss": 0.012488245032727718, + "num_input_tokens_seen": 54630336, + "step": 3336, + "train_runtime": 27491.9405, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.9243767313019391, + "grad_norm": 0.08435960859060287, + "learning_rate": 9.829998640837898e-05, + "loss": 0.013253322802484035, + "num_input_tokens_seen": 54646712, + "step": 3337, + "train_runtime": 27500.1707, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.9246537396121883, + "grad_norm": 0.04758675396442413, + "learning_rate": 9.829884987625754e-05, + "loss": 0.012498872354626656, + "num_input_tokens_seen": 54663088, + "step": 3338, + "train_runtime": 27508.397, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.9249307479224377, + "grad_norm": 0.046000465750694275, + "learning_rate": 9.829771297092741e-05, + "loss": 0.012264952063560486, + "num_input_tokens_seen": 54679464, + "step": 3339, + "train_runtime": 27516.6246, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.925207756232687, + "grad_norm": 0.0569562129676342, + "learning_rate": 9.829657569239738e-05, + "loss": 0.012959408573806286, + "num_input_tokens_seen": 54695840, + "step": 3340, + "train_runtime": 27524.861, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.9254847645429363, + "grad_norm": 0.06347034871578217, + "learning_rate": 9.829543804067625e-05, + "loss": 0.012992125935852528, + "num_input_tokens_seen": 54712216, + "step": 3341, + "train_runtime": 27533.0937, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.9257617728531856, + "grad_norm": 0.05122828111052513, + "learning_rate": 9.829430001577278e-05, + "loss": 0.01262674480676651, + "num_input_tokens_seen": 54728592, + "step": 3342, + "train_runtime": 27541.3105, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.9260387811634349, + "grad_norm": 0.06730969250202179, + "learning_rate": 9.829316161769578e-05, + "loss": 0.011713473126292229, + "num_input_tokens_seen": 54744968, + "step": 3343, + "train_runtime": 27549.5267, + "train_tokens_per_second": 1987.147 + }, + { + "epoch": 0.9263157894736842, + "grad_norm": 0.058426372706890106, + "learning_rate": 9.829202284645407e-05, + "loss": 0.012086832895874977, + "num_input_tokens_seen": 54761344, + "step": 3344, + "train_runtime": 27557.756, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.9265927977839336, + "grad_norm": 0.0813402459025383, + "learning_rate": 9.829088370205642e-05, + "loss": 0.013964684680104256, + "num_input_tokens_seen": 54777720, + "step": 3345, + "train_runtime": 27565.9842, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 0.9268698060941828, + "grad_norm": 0.0682549700140953, + "learning_rate": 9.828974418451163e-05, + "loss": 0.012469902634620667, + "num_input_tokens_seen": 54794096, + "step": 3346, + "train_runtime": 27574.2126, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.9271468144044321, + "grad_norm": 0.05033661797642708, + "learning_rate": 9.828860429382851e-05, + "loss": 0.012716328725218773, + "num_input_tokens_seen": 54810472, + "step": 3347, + "train_runtime": 27582.4351, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.9274238227146815, + "grad_norm": 0.057988278567790985, + "learning_rate": 9.82874640300159e-05, + "loss": 0.010466784238815308, + "num_input_tokens_seen": 54826848, + "step": 3348, + "train_runtime": 27590.6724, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.9277008310249307, + "grad_norm": 0.03326844424009323, + "learning_rate": 9.828632339308256e-05, + "loss": 0.011953074485063553, + "num_input_tokens_seen": 54843224, + "step": 3349, + "train_runtime": 27598.9007, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.9279778393351801, + "grad_norm": 0.07778773456811905, + "learning_rate": 9.828518238303734e-05, + "loss": 0.012425011023879051, + "num_input_tokens_seen": 54859600, + "step": 3350, + "train_runtime": 27607.126, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.9282548476454293, + "grad_norm": 0.04833591729402542, + "learning_rate": 9.828404099988905e-05, + "loss": 0.012577132321894169, + "num_input_tokens_seen": 54875976, + "step": 3351, + "train_runtime": 27615.3594, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.9285318559556787, + "grad_norm": 0.08412452787160873, + "learning_rate": 9.82828992436465e-05, + "loss": 0.0123710623010993, + "num_input_tokens_seen": 54892352, + "step": 3352, + "train_runtime": 27623.5844, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.928808864265928, + "grad_norm": 0.08355466276407242, + "learning_rate": 9.828175711431851e-05, + "loss": 0.012313381768763065, + "num_input_tokens_seen": 54908728, + "step": 3353, + "train_runtime": 27631.8085, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.9290858725761773, + "grad_norm": 0.07217024266719818, + "learning_rate": 9.828061461191392e-05, + "loss": 0.013224618509411812, + "num_input_tokens_seen": 54925104, + "step": 3354, + "train_runtime": 27640.0335, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.9293628808864266, + "grad_norm": 0.05232220143079758, + "learning_rate": 9.827947173644155e-05, + "loss": 0.012466439977288246, + "num_input_tokens_seen": 54941480, + "step": 3355, + "train_runtime": 27648.2632, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.929639889196676, + "grad_norm": 0.04567185044288635, + "learning_rate": 9.827832848791024e-05, + "loss": 0.010248366743326187, + "num_input_tokens_seen": 54957856, + "step": 3356, + "train_runtime": 27656.4871, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.9299168975069252, + "grad_norm": 0.055245641618967056, + "learning_rate": 9.82771848663288e-05, + "loss": 0.01622382551431656, + "num_input_tokens_seen": 54974232, + "step": 3357, + "train_runtime": 27664.703, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 0.9301939058171745, + "grad_norm": 0.0569559708237648, + "learning_rate": 9.827604087170609e-05, + "loss": 0.012228285893797874, + "num_input_tokens_seen": 54990608, + "step": 3358, + "train_runtime": 27672.9236, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.9304709141274238, + "grad_norm": 0.06863977015018463, + "learning_rate": 9.827489650405094e-05, + "loss": 0.012697705067694187, + "num_input_tokens_seen": 55006984, + "step": 3359, + "train_runtime": 27681.1597, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 0.9307479224376731, + "grad_norm": 0.07793543487787247, + "learning_rate": 9.827375176337219e-05, + "loss": 0.013455644249916077, + "num_input_tokens_seen": 55023360, + "step": 3360, + "train_runtime": 27689.3849, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.9310249307479225, + "grad_norm": 0.08132289350032806, + "learning_rate": 9.82726066496787e-05, + "loss": 0.01125197485089302, + "num_input_tokens_seen": 55039736, + "step": 3361, + "train_runtime": 27697.6135, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 0.9313019390581717, + "grad_norm": 0.046843271702528, + "learning_rate": 9.82714611629793e-05, + "loss": 0.013387376442551613, + "num_input_tokens_seen": 55056112, + "step": 3362, + "train_runtime": 27705.8301, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.9315789473684211, + "grad_norm": 0.05536904186010361, + "learning_rate": 9.827031530328285e-05, + "loss": 0.014170343056321144, + "num_input_tokens_seen": 55072488, + "step": 3363, + "train_runtime": 27714.0425, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 0.9318559556786704, + "grad_norm": 0.032034117728471756, + "learning_rate": 9.826916907059822e-05, + "loss": 0.012788631953299046, + "num_input_tokens_seen": 55088864, + "step": 3364, + "train_runtime": 27722.2568, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 0.9321329639889196, + "grad_norm": 0.08892276883125305, + "learning_rate": 9.826802246493425e-05, + "loss": 0.012188506312668324, + "num_input_tokens_seen": 55105240, + "step": 3365, + "train_runtime": 27730.4705, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.932409972299169, + "grad_norm": 0.05539855733513832, + "learning_rate": 9.826687548629979e-05, + "loss": 0.01228314358741045, + "num_input_tokens_seen": 55121616, + "step": 3366, + "train_runtime": 27738.6786, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.9326869806094182, + "grad_norm": 0.05084408447146416, + "learning_rate": 9.826572813470372e-05, + "loss": 0.013576891273260117, + "num_input_tokens_seen": 55137992, + "step": 3367, + "train_runtime": 27746.8915, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 0.9329639889196676, + "grad_norm": 0.08634687215089798, + "learning_rate": 9.826458041015488e-05, + "loss": 0.013151008635759354, + "num_input_tokens_seen": 55154368, + "step": 3368, + "train_runtime": 27755.1145, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.9332409972299169, + "grad_norm": 0.04506591334939003, + "learning_rate": 9.826343231266217e-05, + "loss": 0.00967298075556755, + "num_input_tokens_seen": 55170744, + "step": 3369, + "train_runtime": 27763.3371, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 0.9335180055401662, + "grad_norm": 0.05024629458785057, + "learning_rate": 9.826228384223445e-05, + "loss": 0.012139078229665756, + "num_input_tokens_seen": 55187120, + "step": 3370, + "train_runtime": 27771.5753, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 0.9337950138504155, + "grad_norm": 0.06748451292514801, + "learning_rate": 9.82611349988806e-05, + "loss": 0.013985328376293182, + "num_input_tokens_seen": 55203496, + "step": 3371, + "train_runtime": 27779.8059, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 0.9340720221606649, + "grad_norm": 0.05073055997490883, + "learning_rate": 9.825998578260948e-05, + "loss": 0.00994561705738306, + "num_input_tokens_seen": 55219872, + "step": 3372, + "train_runtime": 27788.0417, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 0.9343490304709141, + "grad_norm": 0.07822690159082413, + "learning_rate": 9.825883619342999e-05, + "loss": 0.012608082965016365, + "num_input_tokens_seen": 55236248, + "step": 3373, + "train_runtime": 27796.2692, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.9346260387811635, + "grad_norm": 0.07832124084234238, + "learning_rate": 9.8257686231351e-05, + "loss": 0.011642718687653542, + "num_input_tokens_seen": 55252624, + "step": 3374, + "train_runtime": 27804.4988, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 0.9349030470914127, + "grad_norm": 0.05949614569544792, + "learning_rate": 9.825653589638142e-05, + "loss": 0.010672066360712051, + "num_input_tokens_seen": 55269000, + "step": 3375, + "train_runtime": 27812.7233, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 0.935180055401662, + "grad_norm": 0.05316091328859329, + "learning_rate": 9.825538518853009e-05, + "loss": 0.012215740978717804, + "num_input_tokens_seen": 55285376, + "step": 3376, + "train_runtime": 27820.9412, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.9354570637119114, + "grad_norm": 0.07454487681388855, + "learning_rate": 9.825423410780595e-05, + "loss": 0.010901959612965584, + "num_input_tokens_seen": 55301752, + "step": 3377, + "train_runtime": 27829.1575, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.9357340720221606, + "grad_norm": 0.10558325052261353, + "learning_rate": 9.825308265421786e-05, + "loss": 0.011615180410444736, + "num_input_tokens_seen": 55318128, + "step": 3378, + "train_runtime": 27837.3693, + "train_tokens_per_second": 1987.19 + }, + { + "epoch": 0.93601108033241, + "grad_norm": 0.08313065022230148, + "learning_rate": 9.825193082777473e-05, + "loss": 0.013049179688096046, + "num_input_tokens_seen": 55334504, + "step": 3379, + "train_runtime": 27845.5812, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 0.9362880886426593, + "grad_norm": 0.06275617331266403, + "learning_rate": 9.825077862848547e-05, + "loss": 0.013408762402832508, + "num_input_tokens_seen": 55350880, + "step": 3380, + "train_runtime": 27853.7927, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 0.9365650969529086, + "grad_norm": 0.06615035235881805, + "learning_rate": 9.824962605635898e-05, + "loss": 0.015220900066196918, + "num_input_tokens_seen": 55367256, + "step": 3381, + "train_runtime": 27861.999, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.9368421052631579, + "grad_norm": 0.06574691087007523, + "learning_rate": 9.824847311140415e-05, + "loss": 0.011286848224699497, + "num_input_tokens_seen": 55383632, + "step": 3382, + "train_runtime": 27870.2104, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.9371191135734072, + "grad_norm": 0.0617886520922184, + "learning_rate": 9.824731979362991e-05, + "loss": 0.01300980243831873, + "num_input_tokens_seen": 55400008, + "step": 3383, + "train_runtime": 27878.4242, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.9373961218836565, + "grad_norm": 0.0646289810538292, + "learning_rate": 9.824616610304516e-05, + "loss": 0.013404738157987595, + "num_input_tokens_seen": 55416384, + "step": 3384, + "train_runtime": 27886.6413, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 0.9376731301939059, + "grad_norm": 0.043762318789958954, + "learning_rate": 9.824501203965881e-05, + "loss": 0.014247558079659939, + "num_input_tokens_seen": 55432760, + "step": 3385, + "train_runtime": 27894.8646, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.9379501385041551, + "grad_norm": 0.06400191783905029, + "learning_rate": 9.82438576034798e-05, + "loss": 0.011373890563845634, + "num_input_tokens_seen": 55449136, + "step": 3386, + "train_runtime": 27903.0744, + "train_tokens_per_second": 1987.205 + }, + { + "epoch": 0.9382271468144044, + "grad_norm": 0.04339992254972458, + "learning_rate": 9.824270279451701e-05, + "loss": 0.012140309438109398, + "num_input_tokens_seen": 55465512, + "step": 3387, + "train_runtime": 27911.2918, + "train_tokens_per_second": 1987.207 + }, + { + "epoch": 0.9385041551246538, + "grad_norm": 0.07018207013607025, + "learning_rate": 9.82415476127794e-05, + "loss": 0.010768821462988853, + "num_input_tokens_seen": 55481888, + "step": 3388, + "train_runtime": 27919.5167, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.938781163434903, + "grad_norm": 0.050933077931404114, + "learning_rate": 9.82403920582759e-05, + "loss": 0.0120734553784132, + "num_input_tokens_seen": 55498264, + "step": 3389, + "train_runtime": 27927.7383, + "train_tokens_per_second": 1987.209 + }, + { + "epoch": 0.9390581717451524, + "grad_norm": 0.046167973428964615, + "learning_rate": 9.82392361310154e-05, + "loss": 0.013507128693163395, + "num_input_tokens_seen": 55514640, + "step": 3390, + "train_runtime": 27935.9638, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.9393351800554016, + "grad_norm": 0.06369786709547043, + "learning_rate": 9.823807983100687e-05, + "loss": 0.010931135155260563, + "num_input_tokens_seen": 55531016, + "step": 3391, + "train_runtime": 27944.1947, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 0.939612188365651, + "grad_norm": 0.07525492459535599, + "learning_rate": 9.823692315825921e-05, + "loss": 0.008187425322830677, + "num_input_tokens_seen": 55547392, + "step": 3392, + "train_runtime": 27952.425, + "train_tokens_per_second": 1987.212 + }, + { + "epoch": 0.9398891966759003, + "grad_norm": 0.06977739930152893, + "learning_rate": 9.82357661127814e-05, + "loss": 0.013100111857056618, + "num_input_tokens_seen": 55563768, + "step": 3393, + "train_runtime": 27960.6581, + "train_tokens_per_second": 1987.212 + }, + { + "epoch": 0.9401662049861496, + "grad_norm": 0.16442346572875977, + "learning_rate": 9.823460869458235e-05, + "loss": 0.014933119527995586, + "num_input_tokens_seen": 55580144, + "step": 3394, + "train_runtime": 27968.8809, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 0.9404432132963989, + "grad_norm": 0.05430838465690613, + "learning_rate": 9.823345090367102e-05, + "loss": 0.009761069901287556, + "num_input_tokens_seen": 55596520, + "step": 3395, + "train_runtime": 27977.1085, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 0.9407202216066483, + "grad_norm": 0.04637477546930313, + "learning_rate": 9.823229274005633e-05, + "loss": 0.012392906472086906, + "num_input_tokens_seen": 55612896, + "step": 3396, + "train_runtime": 27985.3403, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 0.9409972299168975, + "grad_norm": 0.09154167771339417, + "learning_rate": 9.823113420374725e-05, + "loss": 0.014010224491357803, + "num_input_tokens_seen": 55629272, + "step": 3397, + "train_runtime": 27993.5714, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 0.9412742382271468, + "grad_norm": 0.04891818016767502, + "learning_rate": 9.822997529475275e-05, + "loss": 0.011250577867031097, + "num_input_tokens_seen": 55645648, + "step": 3398, + "train_runtime": 28001.8015, + "train_tokens_per_second": 1987.217 + }, + { + "epoch": 0.9415512465373961, + "grad_norm": 0.04555664211511612, + "learning_rate": 9.822881601308174e-05, + "loss": 0.012126728892326355, + "num_input_tokens_seen": 55662024, + "step": 3399, + "train_runtime": 28010.0351, + "train_tokens_per_second": 1987.217 + }, + { + "epoch": 0.9418282548476454, + "grad_norm": 0.07293099164962769, + "learning_rate": 9.822765635874323e-05, + "loss": 0.013750373385846615, + "num_input_tokens_seen": 55678400, + "step": 3400, + "train_runtime": 28018.267, + "train_tokens_per_second": 1987.218 + }, + { + "epoch": 0.9421052631578948, + "grad_norm": 0.07768960297107697, + "learning_rate": 9.822649633174612e-05, + "loss": 0.013715889304876328, + "num_input_tokens_seen": 55694776, + "step": 3401, + "train_runtime": 28028.2015, + "train_tokens_per_second": 1987.098 + }, + { + "epoch": 0.942382271468144, + "grad_norm": 0.038909364491701126, + "learning_rate": 9.822533593209941e-05, + "loss": 0.011951399967074394, + "num_input_tokens_seen": 55711152, + "step": 3402, + "train_runtime": 28036.4282, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.9426592797783934, + "grad_norm": 0.098070427775383, + "learning_rate": 9.822417515981208e-05, + "loss": 0.01564546674489975, + "num_input_tokens_seen": 55727528, + "step": 3403, + "train_runtime": 28044.6594, + "train_tokens_per_second": 1987.099 + }, + { + "epoch": 0.9429362880886426, + "grad_norm": 0.06666504591703415, + "learning_rate": 9.822301401489309e-05, + "loss": 0.013370029628276825, + "num_input_tokens_seen": 55743904, + "step": 3404, + "train_runtime": 28052.8902, + "train_tokens_per_second": 1987.1 + }, + { + "epoch": 0.943213296398892, + "grad_norm": 0.06510673463344574, + "learning_rate": 9.822185249735139e-05, + "loss": 0.014183426275849342, + "num_input_tokens_seen": 55760280, + "step": 3405, + "train_runtime": 28061.1206, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.9434903047091413, + "grad_norm": 0.05917630344629288, + "learning_rate": 9.822069060719596e-05, + "loss": 0.011521373875439167, + "num_input_tokens_seen": 55776656, + "step": 3406, + "train_runtime": 28069.3556, + "train_tokens_per_second": 1987.101 + }, + { + "epoch": 0.9437673130193905, + "grad_norm": 0.06658991426229477, + "learning_rate": 9.82195283444358e-05, + "loss": 0.014143912121653557, + "num_input_tokens_seen": 55793032, + "step": 3407, + "train_runtime": 28077.5889, + "train_tokens_per_second": 1987.102 + }, + { + "epoch": 0.9440443213296399, + "grad_norm": 0.06584969907999039, + "learning_rate": 9.821836570907987e-05, + "loss": 0.01339448057115078, + "num_input_tokens_seen": 55809408, + "step": 3408, + "train_runtime": 28085.8151, + "train_tokens_per_second": 1987.103 + }, + { + "epoch": 0.9443213296398892, + "grad_norm": 0.05384904518723488, + "learning_rate": 9.821720270113718e-05, + "loss": 0.012943413108587265, + "num_input_tokens_seen": 55825784, + "step": 3409, + "train_runtime": 28094.0375, + "train_tokens_per_second": 1987.104 + }, + { + "epoch": 0.9445983379501385, + "grad_norm": 0.05215821787714958, + "learning_rate": 9.821603932061669e-05, + "loss": 0.013279331848025322, + "num_input_tokens_seen": 55842160, + "step": 3410, + "train_runtime": 28102.2548, + "train_tokens_per_second": 1987.106 + }, + { + "epoch": 0.9448753462603878, + "grad_norm": 0.08122871816158295, + "learning_rate": 9.82148755675274e-05, + "loss": 0.014326483011245728, + "num_input_tokens_seen": 55858536, + "step": 3411, + "train_runtime": 28110.4678, + "train_tokens_per_second": 1987.108 + }, + { + "epoch": 0.9451523545706371, + "grad_norm": 0.05222857743501663, + "learning_rate": 9.82137114418783e-05, + "loss": 0.011275701224803925, + "num_input_tokens_seen": 55874912, + "step": 3412, + "train_runtime": 28118.6885, + "train_tokens_per_second": 1987.109 + }, + { + "epoch": 0.9454293628808864, + "grad_norm": 0.05132470279932022, + "learning_rate": 9.821254694367839e-05, + "loss": 0.012799404561519623, + "num_input_tokens_seen": 55891288, + "step": 3413, + "train_runtime": 28126.901, + "train_tokens_per_second": 1987.111 + }, + { + "epoch": 0.9457063711911358, + "grad_norm": 0.0486428439617157, + "learning_rate": 9.821138207293666e-05, + "loss": 0.01215145830065012, + "num_input_tokens_seen": 55907664, + "step": 3414, + "train_runtime": 28135.1274, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.945983379501385, + "grad_norm": 0.07185772806406021, + "learning_rate": 9.821021682966209e-05, + "loss": 0.012780224904417992, + "num_input_tokens_seen": 55924040, + "step": 3415, + "train_runtime": 28143.3634, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.9462603878116344, + "grad_norm": 0.04633599892258644, + "learning_rate": 9.820905121386375e-05, + "loss": 0.01330717746168375, + "num_input_tokens_seen": 55940416, + "step": 3416, + "train_runtime": 28151.6013, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.9465373961218837, + "grad_norm": 0.05347038432955742, + "learning_rate": 9.820788522555058e-05, + "loss": 0.010736946016550064, + "num_input_tokens_seen": 55956792, + "step": 3417, + "train_runtime": 28159.8384, + "train_tokens_per_second": 1987.113 + }, + { + "epoch": 0.9468144044321329, + "grad_norm": 0.06911235302686691, + "learning_rate": 9.820671886473163e-05, + "loss": 0.012011191807687283, + "num_input_tokens_seen": 55973168, + "step": 3418, + "train_runtime": 28168.0703, + "train_tokens_per_second": 1987.114 + }, + { + "epoch": 0.9470914127423823, + "grad_norm": 0.07857958227396011, + "learning_rate": 9.820555213141589e-05, + "loss": 0.012562636286020279, + "num_input_tokens_seen": 55989544, + "step": 3419, + "train_runtime": 28176.2798, + "train_tokens_per_second": 1987.116 + }, + { + "epoch": 0.9473684210526315, + "grad_norm": 0.05561184883117676, + "learning_rate": 9.820438502561238e-05, + "loss": 0.012161044403910637, + "num_input_tokens_seen": 56005920, + "step": 3420, + "train_runtime": 28184.4992, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.9476454293628809, + "grad_norm": 0.05171430855989456, + "learning_rate": 9.820321754733013e-05, + "loss": 0.010443482547998428, + "num_input_tokens_seen": 56022296, + "step": 3421, + "train_runtime": 28192.7309, + "train_tokens_per_second": 1987.118 + }, + { + "epoch": 0.9479224376731302, + "grad_norm": 0.08281467109918594, + "learning_rate": 9.820204969657816e-05, + "loss": 0.009100619703531265, + "num_input_tokens_seen": 56038672, + "step": 3422, + "train_runtime": 28200.9572, + "train_tokens_per_second": 1987.119 + }, + { + "epoch": 0.9481994459833795, + "grad_norm": 0.08245860785245895, + "learning_rate": 9.820088147336548e-05, + "loss": 0.014360588043928146, + "num_input_tokens_seen": 56055048, + "step": 3423, + "train_runtime": 28209.1908, + "train_tokens_per_second": 1987.12 + }, + { + "epoch": 0.9484764542936288, + "grad_norm": 0.10243845731019974, + "learning_rate": 9.819971287770112e-05, + "loss": 0.0144706005230546, + "num_input_tokens_seen": 56071424, + "step": 3424, + "train_runtime": 28217.4274, + "train_tokens_per_second": 1987.12 + }, + { + "epoch": 0.9487534626038782, + "grad_norm": 0.06164037436246872, + "learning_rate": 9.819854390959412e-05, + "loss": 0.013450069352984428, + "num_input_tokens_seen": 56087800, + "step": 3425, + "train_runtime": 28225.6578, + "train_tokens_per_second": 1987.121 + }, + { + "epoch": 0.9490304709141274, + "grad_norm": 0.04964802414178848, + "learning_rate": 9.819737456905352e-05, + "loss": 0.0118025541305542, + "num_input_tokens_seen": 56104176, + "step": 3426, + "train_runtime": 28233.8795, + "train_tokens_per_second": 1987.122 + }, + { + "epoch": 0.9493074792243767, + "grad_norm": 0.07917213439941406, + "learning_rate": 9.819620485608832e-05, + "loss": 0.012825772166252136, + "num_input_tokens_seen": 56120552, + "step": 3427, + "train_runtime": 28242.0901, + "train_tokens_per_second": 1987.125 + }, + { + "epoch": 0.949584487534626, + "grad_norm": 0.049628302454948425, + "learning_rate": 9.819503477070758e-05, + "loss": 0.009393664076924324, + "num_input_tokens_seen": 56136928, + "step": 3428, + "train_runtime": 28250.3155, + "train_tokens_per_second": 1987.126 + }, + { + "epoch": 0.9498614958448753, + "grad_norm": 0.06897792965173721, + "learning_rate": 9.819386431292035e-05, + "loss": 0.01304903905838728, + "num_input_tokens_seen": 56153304, + "step": 3429, + "train_runtime": 28258.5432, + "train_tokens_per_second": 1987.127 + }, + { + "epoch": 0.9501385041551247, + "grad_norm": 0.06628378480672836, + "learning_rate": 9.819269348273567e-05, + "loss": 0.011392151936888695, + "num_input_tokens_seen": 56169680, + "step": 3430, + "train_runtime": 28266.7677, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.9504155124653739, + "grad_norm": 0.052351322025060654, + "learning_rate": 9.819152228016257e-05, + "loss": 0.0124359717592597, + "num_input_tokens_seen": 56186056, + "step": 3431, + "train_runtime": 28274.995, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.9506925207756233, + "grad_norm": 0.09306387603282928, + "learning_rate": 9.819035070521013e-05, + "loss": 0.012263515964150429, + "num_input_tokens_seen": 56202432, + "step": 3432, + "train_runtime": 28283.2465, + "train_tokens_per_second": 1987.128 + }, + { + "epoch": 0.9509695290858726, + "grad_norm": 0.0380416065454483, + "learning_rate": 9.818917875788738e-05, + "loss": 0.012844188138842583, + "num_input_tokens_seen": 56218808, + "step": 3433, + "train_runtime": 28291.4672, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 0.9512465373961219, + "grad_norm": 0.051456037908792496, + "learning_rate": 9.818800643820338e-05, + "loss": 0.01063609030097723, + "num_input_tokens_seen": 56235184, + "step": 3434, + "train_runtime": 28299.6828, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 0.9515235457063712, + "grad_norm": 0.04176431894302368, + "learning_rate": 9.81868337461672e-05, + "loss": 0.010963656939566135, + "num_input_tokens_seen": 56251560, + "step": 3435, + "train_runtime": 28307.8886, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.9518005540166204, + "grad_norm": 0.04463617131114006, + "learning_rate": 9.818566068178788e-05, + "loss": 0.013263936154544353, + "num_input_tokens_seen": 56267936, + "step": 3436, + "train_runtime": 28316.0989, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 0.9520775623268698, + "grad_norm": 0.0710156038403511, + "learning_rate": 9.81844872450745e-05, + "loss": 0.013552050106227398, + "num_input_tokens_seen": 56284312, + "step": 3437, + "train_runtime": 28324.3187, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 0.9523545706371191, + "grad_norm": 0.049234386533498764, + "learning_rate": 9.818331343603612e-05, + "loss": 0.010073045268654823, + "num_input_tokens_seen": 56300688, + "step": 3438, + "train_runtime": 28332.5376, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.9526315789473684, + "grad_norm": 0.06111782416701317, + "learning_rate": 9.818213925468183e-05, + "loss": 0.011924738064408302, + "num_input_tokens_seen": 56317064, + "step": 3439, + "train_runtime": 28340.758, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.9529085872576177, + "grad_norm": 0.03914536535739899, + "learning_rate": 9.818096470102067e-05, + "loss": 0.011774462647736073, + "num_input_tokens_seen": 56333440, + "step": 3440, + "train_runtime": 28348.9954, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 0.9531855955678671, + "grad_norm": 0.032777708023786545, + "learning_rate": 9.817978977506174e-05, + "loss": 0.008260452188551426, + "num_input_tokens_seen": 56349816, + "step": 3441, + "train_runtime": 28357.2382, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 0.9534626038781163, + "grad_norm": 0.06321536004543304, + "learning_rate": 9.817861447681411e-05, + "loss": 0.012097081169486046, + "num_input_tokens_seen": 56366192, + "step": 3442, + "train_runtime": 28365.4642, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.9537396121883657, + "grad_norm": 0.04064768925309181, + "learning_rate": 9.817743880628686e-05, + "loss": 0.008594024926424026, + "num_input_tokens_seen": 56382568, + "step": 3443, + "train_runtime": 28373.6863, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.9540166204986149, + "grad_norm": 0.046092212200164795, + "learning_rate": 9.817626276348908e-05, + "loss": 0.013315455988049507, + "num_input_tokens_seen": 56398944, + "step": 3444, + "train_runtime": 28381.9242, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 0.9542936288088643, + "grad_norm": 0.03866634517908096, + "learning_rate": 9.817508634842985e-05, + "loss": 0.009488417766988277, + "num_input_tokens_seen": 56415320, + "step": 3445, + "train_runtime": 28390.1554, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.9545706371191136, + "grad_norm": 0.05358671024441719, + "learning_rate": 9.817390956111827e-05, + "loss": 0.010045185685157776, + "num_input_tokens_seen": 56431696, + "step": 3446, + "train_runtime": 28398.3826, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.9548476454293628, + "grad_norm": 0.08092974126338959, + "learning_rate": 9.817273240156344e-05, + "loss": 0.013992191292345524, + "num_input_tokens_seen": 56448072, + "step": 3447, + "train_runtime": 28406.6143, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 0.9551246537396122, + "grad_norm": 0.1196431964635849, + "learning_rate": 9.817155486977443e-05, + "loss": 0.012416848912835121, + "num_input_tokens_seen": 56464448, + "step": 3448, + "train_runtime": 28414.8437, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.9554016620498615, + "grad_norm": 0.052506010979413986, + "learning_rate": 9.817037696576034e-05, + "loss": 0.014370227232575417, + "num_input_tokens_seen": 56480824, + "step": 3449, + "train_runtime": 28423.0556, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.9556786703601108, + "grad_norm": 0.08310600370168686, + "learning_rate": 9.816919868953031e-05, + "loss": 0.011498485691845417, + "num_input_tokens_seen": 56497200, + "step": 3450, + "train_runtime": 28431.267, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.9559556786703601, + "grad_norm": 0.081356942653656, + "learning_rate": 9.81680200410934e-05, + "loss": 0.012788455933332443, + "num_input_tokens_seen": 56513576, + "step": 3451, + "train_runtime": 28439.4736, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 0.9562326869806094, + "grad_norm": 0.05950438603758812, + "learning_rate": 9.816684102045874e-05, + "loss": 0.013020639307796955, + "num_input_tokens_seen": 56529952, + "step": 3452, + "train_runtime": 28447.6843, + "train_tokens_per_second": 1987.155 + }, + { + "epoch": 0.9565096952908587, + "grad_norm": 0.061506059020757675, + "learning_rate": 9.816566162763546e-05, + "loss": 0.01412620022892952, + "num_input_tokens_seen": 56546328, + "step": 3453, + "train_runtime": 28455.8975, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.9567867036011081, + "grad_norm": 0.03970945253968239, + "learning_rate": 9.816448186263263e-05, + "loss": 0.009836520999670029, + "num_input_tokens_seen": 56562704, + "step": 3454, + "train_runtime": 28464.1255, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.9570637119113573, + "grad_norm": 0.08041775226593018, + "learning_rate": 9.816330172545938e-05, + "loss": 0.014120589941740036, + "num_input_tokens_seen": 56579080, + "step": 3455, + "train_runtime": 28472.3575, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.9573407202216067, + "grad_norm": 0.0669625997543335, + "learning_rate": 9.816212121612485e-05, + "loss": 0.013172806240618229, + "num_input_tokens_seen": 56595456, + "step": 3456, + "train_runtime": 28480.5746, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.957617728531856, + "grad_norm": 0.039886265993118286, + "learning_rate": 9.816094033463815e-05, + "loss": 0.011480771005153656, + "num_input_tokens_seen": 56611832, + "step": 3457, + "train_runtime": 28488.7848, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 0.9578947368421052, + "grad_norm": 0.04913703352212906, + "learning_rate": 9.81597590810084e-05, + "loss": 0.012552808038890362, + "num_input_tokens_seen": 56628208, + "step": 3458, + "train_runtime": 28496.9934, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.9581717451523546, + "grad_norm": 0.04041992500424385, + "learning_rate": 9.815857745524473e-05, + "loss": 0.010824098251760006, + "num_input_tokens_seen": 56644584, + "step": 3459, + "train_runtime": 28505.2034, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 0.9584487534626038, + "grad_norm": 0.045338764786720276, + "learning_rate": 9.815739545735626e-05, + "loss": 0.010921325534582138, + "num_input_tokens_seen": 56660960, + "step": 3460, + "train_runtime": 28513.4333, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.9587257617728532, + "grad_norm": 0.052006568759679794, + "learning_rate": 9.815621308735214e-05, + "loss": 0.009369282983243465, + "num_input_tokens_seen": 56677336, + "step": 3461, + "train_runtime": 28521.6559, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 0.9590027700831025, + "grad_norm": 0.050047989934682846, + "learning_rate": 9.81550303452415e-05, + "loss": 0.012649165466427803, + "num_input_tokens_seen": 56693712, + "step": 3462, + "train_runtime": 28529.8708, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 0.9592797783933518, + "grad_norm": 0.034299153834581375, + "learning_rate": 9.815384723103348e-05, + "loss": 0.012025951407849789, + "num_input_tokens_seen": 56710088, + "step": 3463, + "train_runtime": 28538.0822, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 0.9595567867036011, + "grad_norm": 0.07221338897943497, + "learning_rate": 9.815266374473721e-05, + "loss": 0.009412750601768494, + "num_input_tokens_seen": 56726464, + "step": 3464, + "train_runtime": 28546.285, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.9598337950138505, + "grad_norm": 0.07054927200078964, + "learning_rate": 9.815147988636187e-05, + "loss": 0.012935085222125053, + "num_input_tokens_seen": 56742840, + "step": 3465, + "train_runtime": 28554.495, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 0.9601108033240997, + "grad_norm": 0.05585706606507301, + "learning_rate": 9.815029565591657e-05, + "loss": 0.012076294049620628, + "num_input_tokens_seen": 56759216, + "step": 3466, + "train_runtime": 28562.6985, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 0.960387811634349, + "grad_norm": 0.04889178276062012, + "learning_rate": 9.814911105341047e-05, + "loss": 0.011662596836686134, + "num_input_tokens_seen": 56775592, + "step": 3467, + "train_runtime": 28570.907, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.9606648199445983, + "grad_norm": 0.03151857480406761, + "learning_rate": 9.814792607885274e-05, + "loss": 0.011370552703738213, + "num_input_tokens_seen": 56791968, + "step": 3468, + "train_runtime": 28579.1326, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 0.9609418282548476, + "grad_norm": 0.06077367812395096, + "learning_rate": 9.814674073225252e-05, + "loss": 0.009975409135222435, + "num_input_tokens_seen": 56808344, + "step": 3469, + "train_runtime": 28587.3369, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.961218836565097, + "grad_norm": 0.03761344403028488, + "learning_rate": 9.814555501361897e-05, + "loss": 0.010968383401632309, + "num_input_tokens_seen": 56824720, + "step": 3470, + "train_runtime": 28595.5553, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.9614958448753462, + "grad_norm": 0.06465107202529907, + "learning_rate": 9.814436892296126e-05, + "loss": 0.014676619321107864, + "num_input_tokens_seen": 56841096, + "step": 3471, + "train_runtime": 28603.7881, + "train_tokens_per_second": 1987.188 + }, + { + "epoch": 0.9617728531855956, + "grad_norm": 0.05545935407280922, + "learning_rate": 9.814318246028855e-05, + "loss": 0.01078350655734539, + "num_input_tokens_seen": 56857472, + "step": 3472, + "train_runtime": 28612.0163, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.9620498614958449, + "grad_norm": 0.08530715852975845, + "learning_rate": 9.814199562561e-05, + "loss": 0.014865799807012081, + "num_input_tokens_seen": 56873848, + "step": 3473, + "train_runtime": 28620.2318, + "train_tokens_per_second": 1987.19 + }, + { + "epoch": 0.9623268698060942, + "grad_norm": 0.07943485677242279, + "learning_rate": 9.81408084189348e-05, + "loss": 0.00995702762156725, + "num_input_tokens_seen": 56890224, + "step": 3474, + "train_runtime": 28628.4608, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.9626038781163435, + "grad_norm": 0.04757952317595482, + "learning_rate": 9.813962084027211e-05, + "loss": 0.01281618420034647, + "num_input_tokens_seen": 56906600, + "step": 3475, + "train_runtime": 28636.6825, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 0.9628808864265928, + "grad_norm": 0.10049233585596085, + "learning_rate": 9.813843288963111e-05, + "loss": 0.016295071691274643, + "num_input_tokens_seen": 56922976, + "step": 3476, + "train_runtime": 28644.8979, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 0.9631578947368421, + "grad_norm": 0.060008060187101364, + "learning_rate": 9.813724456702099e-05, + "loss": 0.013922999612987041, + "num_input_tokens_seen": 56939352, + "step": 3477, + "train_runtime": 28653.1081, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.9634349030470915, + "grad_norm": 0.04164751619100571, + "learning_rate": 9.81360558724509e-05, + "loss": 0.01027719397097826, + "num_input_tokens_seen": 56955728, + "step": 3478, + "train_runtime": 28661.318, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 0.9637119113573407, + "grad_norm": 0.05442468076944351, + "learning_rate": 9.813486680593004e-05, + "loss": 0.013029193505644798, + "num_input_tokens_seen": 56972104, + "step": 3479, + "train_runtime": 28669.5425, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.96398891966759, + "grad_norm": 0.04537346959114075, + "learning_rate": 9.813367736746764e-05, + "loss": 0.012310046702623367, + "num_input_tokens_seen": 56988480, + "step": 3480, + "train_runtime": 28677.7593, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.9642659279778394, + "grad_norm": 0.06302908062934875, + "learning_rate": 9.813248755707283e-05, + "loss": 0.01558053120970726, + "num_input_tokens_seen": 57004856, + "step": 3481, + "train_runtime": 28685.9674, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 0.9645429362880886, + "grad_norm": 0.04345203936100006, + "learning_rate": 9.813129737475484e-05, + "loss": 0.011289955116808414, + "num_input_tokens_seen": 57021232, + "step": 3482, + "train_runtime": 28694.1778, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.964819944598338, + "grad_norm": 0.06902728229761124, + "learning_rate": 9.813010682052284e-05, + "loss": 0.012497722171247005, + "num_input_tokens_seen": 57037608, + "step": 3483, + "train_runtime": 28702.4054, + "train_tokens_per_second": 1987.207 + }, + { + "epoch": 0.9650969529085872, + "grad_norm": 0.05735690891742706, + "learning_rate": 9.812891589438607e-05, + "loss": 0.013603639788925648, + "num_input_tokens_seen": 57053984, + "step": 3484, + "train_runtime": 28710.6222, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.9653739612188366, + "grad_norm": 0.07752981036901474, + "learning_rate": 9.81277245963537e-05, + "loss": 0.011832530610263348, + "num_input_tokens_seen": 57070360, + "step": 3485, + "train_runtime": 28718.8358, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.9656509695290859, + "grad_norm": 0.03405062109231949, + "learning_rate": 9.812653292643492e-05, + "loss": 0.01044989563524723, + "num_input_tokens_seen": 57086736, + "step": 3486, + "train_runtime": 28727.0401, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.9659279778393352, + "grad_norm": 0.0550898015499115, + "learning_rate": 9.812534088463897e-05, + "loss": 0.011966297402977943, + "num_input_tokens_seen": 57103112, + "step": 3487, + "train_runtime": 28735.2547, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 0.9662049861495845, + "grad_norm": 0.0555434450507164, + "learning_rate": 9.812414847097507e-05, + "loss": 0.010928276926279068, + "num_input_tokens_seen": 57119488, + "step": 3488, + "train_runtime": 28743.4663, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 0.9664819944598338, + "grad_norm": 0.044945891946554184, + "learning_rate": 9.81229556854524e-05, + "loss": 0.011329125612974167, + "num_input_tokens_seen": 57135864, + "step": 3489, + "train_runtime": 28751.671, + "train_tokens_per_second": 1987.219 + }, + { + "epoch": 0.9667590027700831, + "grad_norm": 0.056438952684402466, + "learning_rate": 9.812176252808018e-05, + "loss": 0.014014297164976597, + "num_input_tokens_seen": 57152240, + "step": 3490, + "train_runtime": 28759.8774, + "train_tokens_per_second": 1987.221 + }, + { + "epoch": 0.9670360110803324, + "grad_norm": 0.06014394387602806, + "learning_rate": 9.812056899886768e-05, + "loss": 0.013051506131887436, + "num_input_tokens_seen": 57168616, + "step": 3491, + "train_runtime": 28768.0832, + "train_tokens_per_second": 1987.224 + }, + { + "epoch": 0.9673130193905817, + "grad_norm": 0.12539424002170563, + "learning_rate": 9.811937509782404e-05, + "loss": 0.014424408785998821, + "num_input_tokens_seen": 57184992, + "step": 3492, + "train_runtime": 28776.289, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.967590027700831, + "grad_norm": 0.07920383661985397, + "learning_rate": 9.811818082495856e-05, + "loss": 0.015743495896458626, + "num_input_tokens_seen": 57201368, + "step": 3493, + "train_runtime": 28784.5062, + "train_tokens_per_second": 1987.228 + }, + { + "epoch": 0.9678670360110804, + "grad_norm": 0.11441051959991455, + "learning_rate": 9.811698618028044e-05, + "loss": 0.012444263324141502, + "num_input_tokens_seen": 57217744, + "step": 3494, + "train_runtime": 28792.7188, + "train_tokens_per_second": 1987.23 + }, + { + "epoch": 0.9681440443213296, + "grad_norm": 0.05763565003871918, + "learning_rate": 9.81157911637989e-05, + "loss": 0.014669590629637241, + "num_input_tokens_seen": 57234120, + "step": 3495, + "train_runtime": 28800.9353, + "train_tokens_per_second": 1987.231 + }, + { + "epoch": 0.968421052631579, + "grad_norm": 0.0492832288146019, + "learning_rate": 9.811459577552317e-05, + "loss": 0.01360265351831913, + "num_input_tokens_seen": 57250496, + "step": 3496, + "train_runtime": 28809.1411, + "train_tokens_per_second": 1987.234 + }, + { + "epoch": 0.9686980609418283, + "grad_norm": 0.07859674096107483, + "learning_rate": 9.811340001546251e-05, + "loss": 0.0151074742898345, + "num_input_tokens_seen": 57266872, + "step": 3497, + "train_runtime": 28817.354, + "train_tokens_per_second": 1987.236 + }, + { + "epoch": 0.9689750692520775, + "grad_norm": 0.04693932831287384, + "learning_rate": 9.811220388362617e-05, + "loss": 0.012341799214482307, + "num_input_tokens_seen": 57283248, + "step": 3498, + "train_runtime": 28825.5569, + "train_tokens_per_second": 1987.238 + }, + { + "epoch": 0.9692520775623269, + "grad_norm": 0.08377372473478317, + "learning_rate": 9.811100738002336e-05, + "loss": 0.012339861132204533, + "num_input_tokens_seen": 57299624, + "step": 3499, + "train_runtime": 28833.7683, + "train_tokens_per_second": 1987.24 + }, + { + "epoch": 0.9695290858725761, + "grad_norm": 0.03805753216147423, + "learning_rate": 9.810981050466332e-05, + "loss": 0.010506808757781982, + "num_input_tokens_seen": 57316000, + "step": 3500, + "train_runtime": 28841.9782, + "train_tokens_per_second": 1987.242 + }, + { + "epoch": 0.9698060941828255, + "grad_norm": 0.04249967262148857, + "learning_rate": 9.810861325755534e-05, + "loss": 0.011508291587233543, + "num_input_tokens_seen": 57332376, + "step": 3501, + "train_runtime": 28851.841, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 0.9700831024930748, + "grad_norm": 0.06870055198669434, + "learning_rate": 9.810741563870863e-05, + "loss": 0.012952628545463085, + "num_input_tokens_seen": 57348752, + "step": 3502, + "train_runtime": 28860.0396, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 0.9703601108033241, + "grad_norm": 0.05929458886384964, + "learning_rate": 9.810621764813248e-05, + "loss": 0.012653995305299759, + "num_input_tokens_seen": 57365128, + "step": 3503, + "train_runtime": 28868.2377, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 0.9706371191135734, + "grad_norm": 0.04316656291484833, + "learning_rate": 9.810501928583611e-05, + "loss": 0.012243291363120079, + "num_input_tokens_seen": 57381504, + "step": 3504, + "train_runtime": 28876.4376, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 0.9709141274238227, + "grad_norm": 0.1439526230096817, + "learning_rate": 9.81038205518288e-05, + "loss": 0.01758175529539585, + "num_input_tokens_seen": 57397880, + "step": 3505, + "train_runtime": 28884.6406, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.971191135734072, + "grad_norm": 0.06404208391904831, + "learning_rate": 9.810262144611983e-05, + "loss": 0.011914060451090336, + "num_input_tokens_seen": 57414256, + "step": 3506, + "train_runtime": 28892.8552, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.9714681440443214, + "grad_norm": 0.052072830498218536, + "learning_rate": 9.810142196871843e-05, + "loss": 0.010149682871997356, + "num_input_tokens_seen": 57430632, + "step": 3507, + "train_runtime": 28901.1231, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.9717451523545706, + "grad_norm": 0.060674965381622314, + "learning_rate": 9.810022211963388e-05, + "loss": 0.012780903838574886, + "num_input_tokens_seen": 57447008, + "step": 3508, + "train_runtime": 28909.3652, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 0.97202216066482, + "grad_norm": 0.08173175156116486, + "learning_rate": 9.809902189887548e-05, + "loss": 0.011868325062096119, + "num_input_tokens_seen": 57463384, + "step": 3509, + "train_runtime": 28917.5728, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 0.9722991689750693, + "grad_norm": 0.06077544018626213, + "learning_rate": 9.809782130645245e-05, + "loss": 0.011623235419392586, + "num_input_tokens_seen": 57479760, + "step": 3510, + "train_runtime": 28925.7852, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 0.9725761772853185, + "grad_norm": 0.10130877792835236, + "learning_rate": 9.809662034237413e-05, + "loss": 0.014847049489617348, + "num_input_tokens_seen": 57496136, + "step": 3511, + "train_runtime": 28934.0034, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.9728531855955679, + "grad_norm": 0.0352676622569561, + "learning_rate": 9.809541900664973e-05, + "loss": 0.010382642969489098, + "num_input_tokens_seen": 57512512, + "step": 3512, + "train_runtime": 28942.2318, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 0.9731301939058171, + "grad_norm": 0.04925035685300827, + "learning_rate": 9.809421729928859e-05, + "loss": 0.012544576078653336, + "num_input_tokens_seen": 57528888, + "step": 3513, + "train_runtime": 28950.4578, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 0.9734072022160665, + "grad_norm": 0.058656223118305206, + "learning_rate": 9.80930152203e-05, + "loss": 0.013158746995031834, + "num_input_tokens_seen": 57545264, + "step": 3514, + "train_runtime": 28958.6817, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 0.9736842105263158, + "grad_norm": 0.054754797369241714, + "learning_rate": 9.809181276969319e-05, + "loss": 0.010154408402740955, + "num_input_tokens_seen": 57561640, + "step": 3515, + "train_runtime": 28966.9034, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 0.9739612188365651, + "grad_norm": 0.05000312626361847, + "learning_rate": 9.80906099474775e-05, + "loss": 0.012546788901090622, + "num_input_tokens_seen": 57578016, + "step": 3516, + "train_runtime": 28975.1118, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 0.9742382271468144, + "grad_norm": 0.04047946259379387, + "learning_rate": 9.808940675366221e-05, + "loss": 0.011204661801457405, + "num_input_tokens_seen": 57594392, + "step": 3517, + "train_runtime": 28983.3172, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 0.9745152354570638, + "grad_norm": 0.05040128529071808, + "learning_rate": 9.80882031882566e-05, + "loss": 0.013417194597423077, + "num_input_tokens_seen": 57610768, + "step": 3518, + "train_runtime": 28991.5345, + "train_tokens_per_second": 1987.158 + }, + { + "epoch": 0.974792243767313, + "grad_norm": 0.06377498060464859, + "learning_rate": 9.808699925127001e-05, + "loss": 0.015714287757873535, + "num_input_tokens_seen": 57627144, + "step": 3519, + "train_runtime": 28999.7566, + "train_tokens_per_second": 1987.16 + }, + { + "epoch": 0.9750692520775623, + "grad_norm": 0.04537203907966614, + "learning_rate": 9.808579494271171e-05, + "loss": 0.010880155488848686, + "num_input_tokens_seen": 57643520, + "step": 3520, + "train_runtime": 29007.9669, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 0.9753462603878116, + "grad_norm": 0.04477611929178238, + "learning_rate": 9.808459026259102e-05, + "loss": 0.013445360586047173, + "num_input_tokens_seen": 57659896, + "step": 3521, + "train_runtime": 29016.177, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 0.9756232686980609, + "grad_norm": 0.05654842033982277, + "learning_rate": 9.808338521091724e-05, + "loss": 0.012946882285177708, + "num_input_tokens_seen": 57676272, + "step": 3522, + "train_runtime": 29024.3791, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 0.9759002770083103, + "grad_norm": 0.055486198514699936, + "learning_rate": 9.808217978769969e-05, + "loss": 0.01192509289830923, + "num_input_tokens_seen": 57692648, + "step": 3523, + "train_runtime": 29032.5787, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 0.9761772853185595, + "grad_norm": 0.055459164083004, + "learning_rate": 9.808097399294769e-05, + "loss": 0.01299977209419012, + "num_input_tokens_seen": 57709024, + "step": 3524, + "train_runtime": 29040.797, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 0.9764542936288089, + "grad_norm": 0.047453004866838455, + "learning_rate": 9.807976782667054e-05, + "loss": 0.013906329870223999, + "num_input_tokens_seen": 57725400, + "step": 3525, + "train_runtime": 29049.0087, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.9767313019390582, + "grad_norm": 0.05867801979184151, + "learning_rate": 9.807856128887755e-05, + "loss": 0.01100163348019123, + "num_input_tokens_seen": 57741776, + "step": 3526, + "train_runtime": 29057.2189, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.9770083102493075, + "grad_norm": 0.07391180098056793, + "learning_rate": 9.807735437957808e-05, + "loss": 0.018187399953603745, + "num_input_tokens_seen": 57758152, + "step": 3527, + "train_runtime": 29065.4539, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.9772853185595568, + "grad_norm": 0.061065346002578735, + "learning_rate": 9.807614709878144e-05, + "loss": 0.012108325958251953, + "num_input_tokens_seen": 57774528, + "step": 3528, + "train_runtime": 29073.6791, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 0.977562326869806, + "grad_norm": 0.0830519050359726, + "learning_rate": 9.807493944649695e-05, + "loss": 0.010602802969515324, + "num_input_tokens_seen": 57790904, + "step": 3529, + "train_runtime": 29081.9013, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 0.9778393351800554, + "grad_norm": 0.03905424475669861, + "learning_rate": 9.807373142273395e-05, + "loss": 0.011764029040932655, + "num_input_tokens_seen": 57807280, + "step": 3530, + "train_runtime": 29090.1286, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.9781163434903047, + "grad_norm": 0.05176712945103645, + "learning_rate": 9.807252302750177e-05, + "loss": 0.011963315308094025, + "num_input_tokens_seen": 57823656, + "step": 3531, + "train_runtime": 29098.3573, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 0.978393351800554, + "grad_norm": 0.04544339329004288, + "learning_rate": 9.807131426080976e-05, + "loss": 0.011782919988036156, + "num_input_tokens_seen": 57840032, + "step": 3532, + "train_runtime": 29106.5824, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 0.9786703601108033, + "grad_norm": 0.05496210604906082, + "learning_rate": 9.807010512266723e-05, + "loss": 0.013051475398242474, + "num_input_tokens_seen": 57856408, + "step": 3533, + "train_runtime": 29114.8072, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 0.9789473684210527, + "grad_norm": 0.051471028476953506, + "learning_rate": 9.806889561308354e-05, + "loss": 0.012552494183182716, + "num_input_tokens_seen": 57872784, + "step": 3534, + "train_runtime": 29123.0278, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 0.9792243767313019, + "grad_norm": 0.05279483646154404, + "learning_rate": 9.806768573206806e-05, + "loss": 0.009547565132379532, + "num_input_tokens_seen": 57889160, + "step": 3535, + "train_runtime": 29131.2438, + "train_tokens_per_second": 1987.185 + }, + { + "epoch": 0.9795013850415513, + "grad_norm": 0.06543261557817459, + "learning_rate": 9.806647547963011e-05, + "loss": 0.011450964026153088, + "num_input_tokens_seen": 57905536, + "step": 3536, + "train_runtime": 29139.4685, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 0.9797783933518005, + "grad_norm": 0.05051161348819733, + "learning_rate": 9.806526485577905e-05, + "loss": 0.012190014123916626, + "num_input_tokens_seen": 57921912, + "step": 3537, + "train_runtime": 29147.6854, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 0.9800554016620499, + "grad_norm": 0.03386096656322479, + "learning_rate": 9.806405386052422e-05, + "loss": 0.00974328350275755, + "num_input_tokens_seen": 57938288, + "step": 3538, + "train_runtime": 29155.8959, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 0.9803324099722992, + "grad_norm": 0.06570097804069519, + "learning_rate": 9.806284249387501e-05, + "loss": 0.012072603218257427, + "num_input_tokens_seen": 57954664, + "step": 3539, + "train_runtime": 29164.1173, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 0.9806094182825484, + "grad_norm": 0.22964294254779816, + "learning_rate": 9.806163075584074e-05, + "loss": 0.01301700808107853, + "num_input_tokens_seen": 57971040, + "step": 3540, + "train_runtime": 29172.3424, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 0.9808864265927978, + "grad_norm": 0.05063426122069359, + "learning_rate": 9.80604186464308e-05, + "loss": 0.010952799580991268, + "num_input_tokens_seen": 57987416, + "step": 3541, + "train_runtime": 29180.5627, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 0.9811634349030471, + "grad_norm": 0.04558515176177025, + "learning_rate": 9.805920616565457e-05, + "loss": 0.012271598912775517, + "num_input_tokens_seen": 58003792, + "step": 3542, + "train_runtime": 29188.7817, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 0.9814404432132964, + "grad_norm": 0.05897901579737663, + "learning_rate": 9.805799331352139e-05, + "loss": 0.012617039494216442, + "num_input_tokens_seen": 58020168, + "step": 3543, + "train_runtime": 29197.0033, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 0.9817174515235457, + "grad_norm": 0.08006145060062408, + "learning_rate": 9.805678009004063e-05, + "loss": 0.012429030612111092, + "num_input_tokens_seen": 58036544, + "step": 3544, + "train_runtime": 29205.2238, + "train_tokens_per_second": 1987.197 + }, + { + "epoch": 0.981994459833795, + "grad_norm": 0.05568476766347885, + "learning_rate": 9.80555664952217e-05, + "loss": 0.012626906856894493, + "num_input_tokens_seen": 58052920, + "step": 3545, + "train_runtime": 29213.4422, + "train_tokens_per_second": 1987.199 + }, + { + "epoch": 0.9822714681440443, + "grad_norm": 0.06290298700332642, + "learning_rate": 9.805435252907393e-05, + "loss": 0.013109629042446613, + "num_input_tokens_seen": 58069296, + "step": 3546, + "train_runtime": 29221.6646, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 0.9825484764542937, + "grad_norm": 0.03895458206534386, + "learning_rate": 9.805313819160674e-05, + "loss": 0.010025415569543839, + "num_input_tokens_seen": 58085672, + "step": 3547, + "train_runtime": 29229.8859, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 0.9828254847645429, + "grad_norm": 0.037749603390693665, + "learning_rate": 9.805192348282947e-05, + "loss": 0.012669747695326805, + "num_input_tokens_seen": 58102048, + "step": 3548, + "train_runtime": 29238.1106, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 0.9831024930747922, + "grad_norm": 0.0424899086356163, + "learning_rate": 9.805070840275156e-05, + "loss": 0.012665703892707825, + "num_input_tokens_seen": 58118424, + "step": 3549, + "train_runtime": 29246.3241, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 0.9833795013850416, + "grad_norm": 0.05152192711830139, + "learning_rate": 9.804949295138237e-05, + "loss": 0.010612081736326218, + "num_input_tokens_seen": 58134800, + "step": 3550, + "train_runtime": 29254.5346, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 0.9836565096952908, + "grad_norm": 0.05263574421405792, + "learning_rate": 9.804827712873128e-05, + "loss": 0.014614326879382133, + "num_input_tokens_seen": 58151176, + "step": 3551, + "train_runtime": 29262.7592, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.9839335180055402, + "grad_norm": 0.05505939573049545, + "learning_rate": 9.804706093480771e-05, + "loss": 0.015549803152680397, + "num_input_tokens_seen": 58167552, + "step": 3552, + "train_runtime": 29270.9868, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 0.9842105263157894, + "grad_norm": 0.053688228130340576, + "learning_rate": 9.804584436962106e-05, + "loss": 0.011995253153145313, + "num_input_tokens_seen": 58183928, + "step": 3553, + "train_runtime": 29279.2052, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 0.9844875346260388, + "grad_norm": 0.053766656666994095, + "learning_rate": 9.804462743318069e-05, + "loss": 0.015370335429906845, + "num_input_tokens_seen": 58200304, + "step": 3554, + "train_runtime": 29287.424, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 0.9847645429362881, + "grad_norm": 0.04762713611125946, + "learning_rate": 9.804341012549605e-05, + "loss": 0.011873982846736908, + "num_input_tokens_seen": 58216680, + "step": 3555, + "train_runtime": 29295.6434, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 0.9850415512465374, + "grad_norm": 0.08654700964689255, + "learning_rate": 9.80421924465765e-05, + "loss": 0.012657077983021736, + "num_input_tokens_seen": 58233056, + "step": 3556, + "train_runtime": 29303.8561, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 0.9853185595567867, + "grad_norm": 0.06321166455745697, + "learning_rate": 9.80409743964315e-05, + "loss": 0.015939580276608467, + "num_input_tokens_seen": 58249432, + "step": 3557, + "train_runtime": 29312.0667, + "train_tokens_per_second": 1987.217 + }, + { + "epoch": 0.9855955678670361, + "grad_norm": 0.04419441148638725, + "learning_rate": 9.803975597507044e-05, + "loss": 0.011320279911160469, + "num_input_tokens_seen": 58265808, + "step": 3558, + "train_runtime": 29320.2689, + "train_tokens_per_second": 1987.219 + }, + { + "epoch": 0.9858725761772853, + "grad_norm": 0.04741780459880829, + "learning_rate": 9.803853718250273e-05, + "loss": 0.011466274037957191, + "num_input_tokens_seen": 58282184, + "step": 3559, + "train_runtime": 29328.4769, + "train_tokens_per_second": 1987.222 + }, + { + "epoch": 0.9861495844875346, + "grad_norm": 0.055950090289115906, + "learning_rate": 9.803731801873779e-05, + "loss": 0.011598552577197552, + "num_input_tokens_seen": 58298560, + "step": 3560, + "train_runtime": 29336.6917, + "train_tokens_per_second": 1987.223 + }, + { + "epoch": 0.9864265927977839, + "grad_norm": 0.055575475096702576, + "learning_rate": 9.803609848378503e-05, + "loss": 0.013288592919707298, + "num_input_tokens_seen": 58314936, + "step": 3561, + "train_runtime": 29344.9185, + "train_tokens_per_second": 1987.224 + }, + { + "epoch": 0.9867036011080332, + "grad_norm": 0.04304628446698189, + "learning_rate": 9.803487857765388e-05, + "loss": 0.009056463837623596, + "num_input_tokens_seen": 58331312, + "step": 3562, + "train_runtime": 29353.1363, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 0.9869806094182826, + "grad_norm": 0.048590902239084244, + "learning_rate": 9.803365830035379e-05, + "loss": 0.00995295774191618, + "num_input_tokens_seen": 58347688, + "step": 3563, + "train_runtime": 29361.3547, + "train_tokens_per_second": 1987.227 + }, + { + "epoch": 0.9872576177285318, + "grad_norm": 0.04665222391486168, + "learning_rate": 9.803243765189416e-05, + "loss": 0.011374130845069885, + "num_input_tokens_seen": 58364064, + "step": 3564, + "train_runtime": 29369.5733, + "train_tokens_per_second": 1987.229 + }, + { + "epoch": 0.9875346260387812, + "grad_norm": 0.045688778162002563, + "learning_rate": 9.803121663228443e-05, + "loss": 0.011431408114731312, + "num_input_tokens_seen": 58380440, + "step": 3565, + "train_runtime": 29377.8022, + "train_tokens_per_second": 1987.23 + }, + { + "epoch": 0.9878116343490305, + "grad_norm": 0.07607708126306534, + "learning_rate": 9.802999524153405e-05, + "loss": 0.014894556254148483, + "num_input_tokens_seen": 58396816, + "step": 3566, + "train_runtime": 29386.0185, + "train_tokens_per_second": 1987.231 + }, + { + "epoch": 0.9880886426592798, + "grad_norm": 0.031372953206300735, + "learning_rate": 9.802877347965243e-05, + "loss": 0.011428062804043293, + "num_input_tokens_seen": 58413192, + "step": 3567, + "train_runtime": 29394.2434, + "train_tokens_per_second": 1987.232 + }, + { + "epoch": 0.9883656509695291, + "grad_norm": 0.05824067071080208, + "learning_rate": 9.802755134664903e-05, + "loss": 0.01193675585091114, + "num_input_tokens_seen": 58429568, + "step": 3568, + "train_runtime": 29402.4541, + "train_tokens_per_second": 1987.234 + }, + { + "epoch": 0.9886426592797783, + "grad_norm": 0.03277715668082237, + "learning_rate": 9.802632884253328e-05, + "loss": 0.008907095529139042, + "num_input_tokens_seen": 58445944, + "step": 3569, + "train_runtime": 29410.6593, + "train_tokens_per_second": 1987.237 + }, + { + "epoch": 0.9889196675900277, + "grad_norm": 0.04990538954734802, + "learning_rate": 9.802510596731465e-05, + "loss": 0.012981057167053223, + "num_input_tokens_seen": 58462320, + "step": 3570, + "train_runtime": 29418.864, + "train_tokens_per_second": 1987.239 + }, + { + "epoch": 0.989196675900277, + "grad_norm": 0.04359464347362518, + "learning_rate": 9.802388272100256e-05, + "loss": 0.0113340700045228, + "num_input_tokens_seen": 58478696, + "step": 3571, + "train_runtime": 29427.0733, + "train_tokens_per_second": 1987.241 + }, + { + "epoch": 0.9894736842105263, + "grad_norm": 0.04117942228913307, + "learning_rate": 9.802265910360648e-05, + "loss": 0.01058569923043251, + "num_input_tokens_seen": 58495072, + "step": 3572, + "train_runtime": 29435.2749, + "train_tokens_per_second": 1987.244 + }, + { + "epoch": 0.9897506925207756, + "grad_norm": 0.06251958012580872, + "learning_rate": 9.802143511513587e-05, + "loss": 0.008505920879542828, + "num_input_tokens_seen": 58511448, + "step": 3573, + "train_runtime": 29443.4801, + "train_tokens_per_second": 1987.246 + }, + { + "epoch": 0.990027700831025, + "grad_norm": 0.05992871895432472, + "learning_rate": 9.802021075560017e-05, + "loss": 0.013195968233048916, + "num_input_tokens_seen": 58527824, + "step": 3574, + "train_runtime": 29451.6878, + "train_tokens_per_second": 1987.249 + }, + { + "epoch": 0.9903047091412742, + "grad_norm": 0.04848679527640343, + "learning_rate": 9.801898602500886e-05, + "loss": 0.014333275146782398, + "num_input_tokens_seen": 58544200, + "step": 3575, + "train_runtime": 29459.8895, + "train_tokens_per_second": 1987.251 + }, + { + "epoch": 0.9905817174515236, + "grad_norm": 0.1596071869134903, + "learning_rate": 9.801776092337138e-05, + "loss": 0.016077933833003044, + "num_input_tokens_seen": 58560576, + "step": 3576, + "train_runtime": 29468.0977, + "train_tokens_per_second": 1987.253 + }, + { + "epoch": 0.9908587257617728, + "grad_norm": 0.05320886895060539, + "learning_rate": 9.801653545069723e-05, + "loss": 0.01008276455104351, + "num_input_tokens_seen": 58576952, + "step": 3577, + "train_runtime": 29476.3033, + "train_tokens_per_second": 1987.256 + }, + { + "epoch": 0.9911357340720222, + "grad_norm": 0.06471644341945648, + "learning_rate": 9.801530960699585e-05, + "loss": 0.01486801728606224, + "num_input_tokens_seen": 58593328, + "step": 3578, + "train_runtime": 29484.5102, + "train_tokens_per_second": 1987.258 + }, + { + "epoch": 0.9914127423822715, + "grad_norm": 0.044260505586862564, + "learning_rate": 9.801408339227671e-05, + "loss": 0.012415457516908646, + "num_input_tokens_seen": 58609704, + "step": 3579, + "train_runtime": 29492.7181, + "train_tokens_per_second": 1987.26 + }, + { + "epoch": 0.9916897506925207, + "grad_norm": 0.0364578440785408, + "learning_rate": 9.80128568065493e-05, + "loss": 0.010742820799350739, + "num_input_tokens_seen": 58626080, + "step": 3580, + "train_runtime": 29500.9201, + "train_tokens_per_second": 1987.263 + }, + { + "epoch": 0.9919667590027701, + "grad_norm": 0.04277382045984268, + "learning_rate": 9.80116298498231e-05, + "loss": 0.0115016745403409, + "num_input_tokens_seen": 58642456, + "step": 3581, + "train_runtime": 29509.13, + "train_tokens_per_second": 1987.265 + }, + { + "epoch": 0.9922437673130194, + "grad_norm": 0.04253993183374405, + "learning_rate": 9.80104025221076e-05, + "loss": 0.011778700165450573, + "num_input_tokens_seen": 58658832, + "step": 3582, + "train_runtime": 29517.3317, + "train_tokens_per_second": 1987.267 + }, + { + "epoch": 0.9925207756232687, + "grad_norm": 0.03615740314126015, + "learning_rate": 9.800917482341225e-05, + "loss": 0.011401854455471039, + "num_input_tokens_seen": 58675208, + "step": 3583, + "train_runtime": 29525.5347, + "train_tokens_per_second": 1987.27 + }, + { + "epoch": 0.992797783933518, + "grad_norm": 0.06385451555252075, + "learning_rate": 9.800794675374655e-05, + "loss": 0.011426274664700031, + "num_input_tokens_seen": 58691584, + "step": 3584, + "train_runtime": 29533.7427, + "train_tokens_per_second": 1987.272 + }, + { + "epoch": 0.9930747922437673, + "grad_norm": 0.05834570899605751, + "learning_rate": 9.800671831312e-05, + "loss": 0.014154543168842793, + "num_input_tokens_seen": 58707960, + "step": 3585, + "train_runtime": 29541.954, + "train_tokens_per_second": 1987.274 + }, + { + "epoch": 0.9933518005540166, + "grad_norm": 0.06575719267129898, + "learning_rate": 9.800548950154209e-05, + "loss": 0.010712189599871635, + "num_input_tokens_seen": 58724336, + "step": 3586, + "train_runtime": 29550.163, + "train_tokens_per_second": 1987.276 + }, + { + "epoch": 0.993628808864266, + "grad_norm": 0.05421176925301552, + "learning_rate": 9.800426031902233e-05, + "loss": 0.012380987405776978, + "num_input_tokens_seen": 58740712, + "step": 3587, + "train_runtime": 29558.3837, + "train_tokens_per_second": 1987.278 + }, + { + "epoch": 0.9939058171745152, + "grad_norm": 0.05724744871258736, + "learning_rate": 9.800303076557018e-05, + "loss": 0.010328252799808979, + "num_input_tokens_seen": 58757088, + "step": 3588, + "train_runtime": 29566.6063, + "train_tokens_per_second": 1987.279 + }, + { + "epoch": 0.9941828254847646, + "grad_norm": 0.04298464208841324, + "learning_rate": 9.800180084119514e-05, + "loss": 0.009979402646422386, + "num_input_tokens_seen": 58773464, + "step": 3589, + "train_runtime": 29574.8353, + "train_tokens_per_second": 1987.28 + }, + { + "epoch": 0.9944598337950139, + "grad_norm": 0.033676303923130035, + "learning_rate": 9.800057054590677e-05, + "loss": 0.012552227824926376, + "num_input_tokens_seen": 58789840, + "step": 3590, + "train_runtime": 29583.055, + "train_tokens_per_second": 1987.281 + }, + { + "epoch": 0.9947368421052631, + "grad_norm": 0.08989022672176361, + "learning_rate": 9.799933987971452e-05, + "loss": 0.014045273885130882, + "num_input_tokens_seen": 58806216, + "step": 3591, + "train_runtime": 29591.2847, + "train_tokens_per_second": 1987.282 + }, + { + "epoch": 0.9950138504155125, + "grad_norm": 0.07480644434690475, + "learning_rate": 9.799810884262795e-05, + "loss": 0.015467949211597443, + "num_input_tokens_seen": 58822592, + "step": 3592, + "train_runtime": 29599.515, + "train_tokens_per_second": 1987.282 + }, + { + "epoch": 0.9952908587257617, + "grad_norm": 0.058785099536180496, + "learning_rate": 9.799687743465651e-05, + "loss": 0.011726289987564087, + "num_input_tokens_seen": 58838968, + "step": 3593, + "train_runtime": 29607.7315, + "train_tokens_per_second": 1987.284 + }, + { + "epoch": 0.9955678670360111, + "grad_norm": 0.04994065314531326, + "learning_rate": 9.799564565580977e-05, + "loss": 0.009945927187800407, + "num_input_tokens_seen": 58855344, + "step": 3594, + "train_runtime": 29615.9429, + "train_tokens_per_second": 1987.286 + }, + { + "epoch": 0.9958448753462604, + "grad_norm": 0.05616902932524681, + "learning_rate": 9.799441350609721e-05, + "loss": 0.011445460841059685, + "num_input_tokens_seen": 58871720, + "step": 3595, + "train_runtime": 29624.1614, + "train_tokens_per_second": 1987.287 + }, + { + "epoch": 0.9961218836565097, + "grad_norm": 0.08027923852205276, + "learning_rate": 9.799318098552837e-05, + "loss": 0.010700725950300694, + "num_input_tokens_seen": 58888096, + "step": 3596, + "train_runtime": 29632.3876, + "train_tokens_per_second": 1987.288 + }, + { + "epoch": 0.996398891966759, + "grad_norm": 0.03615880012512207, + "learning_rate": 9.799194809411278e-05, + "loss": 0.01192178949713707, + "num_input_tokens_seen": 58904472, + "step": 3597, + "train_runtime": 29640.619, + "train_tokens_per_second": 1987.289 + }, + { + "epoch": 0.9966759002770084, + "grad_norm": 0.0679144635796547, + "learning_rate": 9.799071483185995e-05, + "loss": 0.013820894062519073, + "num_input_tokens_seen": 58920848, + "step": 3598, + "train_runtime": 29648.8414, + "train_tokens_per_second": 1987.29 + }, + { + "epoch": 0.9969529085872576, + "grad_norm": 0.05050761625170708, + "learning_rate": 9.798948119877942e-05, + "loss": 0.012016582302749157, + "num_input_tokens_seen": 58937224, + "step": 3599, + "train_runtime": 29657.0608, + "train_tokens_per_second": 1987.291 + }, + { + "epoch": 0.997229916897507, + "grad_norm": 0.1637479066848755, + "learning_rate": 9.79882471948807e-05, + "loss": 0.014494596049189568, + "num_input_tokens_seen": 58953600, + "step": 3600, + "train_runtime": 29665.2758, + "train_tokens_per_second": 1987.293 + }, + { + "epoch": 0.9975069252077562, + "grad_norm": 0.053163763135671616, + "learning_rate": 9.798701282017339e-05, + "loss": 0.013217378407716751, + "num_input_tokens_seen": 58969976, + "step": 3601, + "train_runtime": 29675.4017, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 0.9977839335180055, + "grad_norm": 0.06746426224708557, + "learning_rate": 9.798577807466696e-05, + "loss": 0.013985666446387768, + "num_input_tokens_seen": 58986352, + "step": 3602, + "train_runtime": 29683.6139, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 0.9980609418282549, + "grad_norm": 0.04083525016903877, + "learning_rate": 9.798454295837096e-05, + "loss": 0.010846462100744247, + "num_input_tokens_seen": 59002728, + "step": 3603, + "train_runtime": 29691.8264, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 0.9983379501385041, + "grad_norm": 0.044289544224739075, + "learning_rate": 9.798330747129497e-05, + "loss": 0.011826488189399242, + "num_input_tokens_seen": 59019104, + "step": 3604, + "train_runtime": 29700.054, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 0.9986149584487535, + "grad_norm": 0.07787182927131653, + "learning_rate": 9.798207161344849e-05, + "loss": 0.012863989919424057, + "num_input_tokens_seen": 59035480, + "step": 3605, + "train_runtime": 29708.281, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.9988919667590028, + "grad_norm": 0.09375032037496567, + "learning_rate": 9.79808353848411e-05, + "loss": 0.013716299086809158, + "num_input_tokens_seen": 59051856, + "step": 3606, + "train_runtime": 29716.513, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 0.9991689750692521, + "grad_norm": 0.04643777385354042, + "learning_rate": 9.797959878548236e-05, + "loss": 0.010601370595395565, + "num_input_tokens_seen": 59068232, + "step": 3607, + "train_runtime": 29724.7409, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 0.9994459833795014, + "grad_norm": 0.04147808998823166, + "learning_rate": 9.79783618153818e-05, + "loss": 0.011542043648660183, + "num_input_tokens_seen": 59084608, + "step": 3608, + "train_runtime": 29732.9644, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 0.9997229916897507, + "grad_norm": 0.07933824509382248, + "learning_rate": 9.797712447454898e-05, + "loss": 0.013554045930504799, + "num_input_tokens_seen": 59100984, + "step": 3609, + "train_runtime": 29741.1901, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 1.0, + "grad_norm": 0.06676732748746872, + "learning_rate": 9.79758867629935e-05, + "loss": 0.01176518015563488, + "num_input_tokens_seen": 59117360, + "step": 3610, + "train_runtime": 29749.4113, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 1.0002770083102492, + "grad_norm": 0.053820449858903885, + "learning_rate": 9.797464868072488e-05, + "loss": 0.010963501408696175, + "num_input_tokens_seen": 59133736, + "step": 3611, + "train_runtime": 29757.6394, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 1.0005540166204987, + "grad_norm": 0.0514831617474556, + "learning_rate": 9.797341022775269e-05, + "loss": 0.012422848492860794, + "num_input_tokens_seen": 59150112, + "step": 3612, + "train_runtime": 29765.8729, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 1.000831024930748, + "grad_norm": 0.09170650690793991, + "learning_rate": 9.797217140408652e-05, + "loss": 0.014467915520071983, + "num_input_tokens_seen": 59166488, + "step": 3613, + "train_runtime": 29774.0988, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 1.0011080332409972, + "grad_norm": 0.0591764822602272, + "learning_rate": 9.797093220973594e-05, + "loss": 0.010695117525756359, + "num_input_tokens_seen": 59182864, + "step": 3614, + "train_runtime": 29782.3089, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 1.0013850415512466, + "grad_norm": 0.07048264890909195, + "learning_rate": 9.79696926447105e-05, + "loss": 0.010956053622066975, + "num_input_tokens_seen": 59199240, + "step": 3615, + "train_runtime": 29790.5202, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 1.0016620498614959, + "grad_norm": 0.04215631261467934, + "learning_rate": 9.796845270901982e-05, + "loss": 0.008792894892394543, + "num_input_tokens_seen": 59215616, + "step": 3616, + "train_runtime": 29798.7261, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 1.0019390581717451, + "grad_norm": 0.027204686775803566, + "learning_rate": 9.796721240267344e-05, + "loss": 0.009420130401849747, + "num_input_tokens_seen": 59231992, + "step": 3617, + "train_runtime": 29806.9371, + "train_tokens_per_second": 1987.188 + }, + { + "epoch": 1.0022160664819943, + "grad_norm": 0.04251737892627716, + "learning_rate": 9.796597172568099e-05, + "loss": 0.010300591588020325, + "num_input_tokens_seen": 59248368, + "step": 3618, + "train_runtime": 29815.1432, + "train_tokens_per_second": 1987.19 + }, + { + "epoch": 1.0024930747922438, + "grad_norm": 0.06796582043170929, + "learning_rate": 9.7964730678052e-05, + "loss": 0.014340371824800968, + "num_input_tokens_seen": 59264744, + "step": 3619, + "train_runtime": 29823.3621, + "train_tokens_per_second": 1987.192 + }, + { + "epoch": 1.002770083102493, + "grad_norm": 0.047593310475349426, + "learning_rate": 9.79634892597961e-05, + "loss": 0.011024976149201393, + "num_input_tokens_seen": 59281120, + "step": 3620, + "train_runtime": 29831.5877, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 1.0030470914127423, + "grad_norm": 0.09672874957323074, + "learning_rate": 9.796224747092286e-05, + "loss": 0.011057173833251, + "num_input_tokens_seen": 59297496, + "step": 3621, + "train_runtime": 29839.8139, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 1.0033240997229917, + "grad_norm": 0.06326260417699814, + "learning_rate": 9.79610053114419e-05, + "loss": 0.009650329127907753, + "num_input_tokens_seen": 59313872, + "step": 3622, + "train_runtime": 29848.0391, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 1.003601108033241, + "grad_norm": 0.02980826422572136, + "learning_rate": 9.79597627813628e-05, + "loss": 0.013285397551953793, + "num_input_tokens_seen": 59330248, + "step": 3623, + "train_runtime": 29856.2625, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 1.0038781163434902, + "grad_norm": 0.03432188928127289, + "learning_rate": 9.795851988069516e-05, + "loss": 0.012662193737924099, + "num_input_tokens_seen": 59346624, + "step": 3624, + "train_runtime": 29864.489, + "train_tokens_per_second": 1987.197 + }, + { + "epoch": 1.0041551246537397, + "grad_norm": 0.04214664548635483, + "learning_rate": 9.79572766094486e-05, + "loss": 0.011141316965222359, + "num_input_tokens_seen": 59363000, + "step": 3625, + "train_runtime": 29872.7256, + "train_tokens_per_second": 1987.197 + }, + { + "epoch": 1.004432132963989, + "grad_norm": 0.0657803863286972, + "learning_rate": 9.79560329676327e-05, + "loss": 0.012650681659579277, + "num_input_tokens_seen": 59379376, + "step": 3626, + "train_runtime": 29880.944, + "train_tokens_per_second": 1987.199 + }, + { + "epoch": 1.0047091412742382, + "grad_norm": 0.06750412285327911, + "learning_rate": 9.79547889552571e-05, + "loss": 0.011028921231627464, + "num_input_tokens_seen": 59395752, + "step": 3627, + "train_runtime": 29889.1597, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 1.0049861495844876, + "grad_norm": 0.04837886244058609, + "learning_rate": 9.79535445723314e-05, + "loss": 0.011656643822789192, + "num_input_tokens_seen": 59412128, + "step": 3628, + "train_runtime": 29897.376, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 1.0052631578947369, + "grad_norm": 0.07418709993362427, + "learning_rate": 9.795229981886521e-05, + "loss": 0.009956968948245049, + "num_input_tokens_seen": 59428504, + "step": 3629, + "train_runtime": 29905.5841, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 1.005540166204986, + "grad_norm": 0.05586574971675873, + "learning_rate": 9.795105469486817e-05, + "loss": 0.010344769805669785, + "num_input_tokens_seen": 59444880, + "step": 3630, + "train_runtime": 29913.791, + "train_tokens_per_second": 1987.207 + }, + { + "epoch": 1.0058171745152356, + "grad_norm": 0.03934964910149574, + "learning_rate": 9.794980920034985e-05, + "loss": 0.010277891531586647, + "num_input_tokens_seen": 59461256, + "step": 3631, + "train_runtime": 29921.9952, + "train_tokens_per_second": 1987.209 + }, + { + "epoch": 1.0060941828254848, + "grad_norm": 0.04919102042913437, + "learning_rate": 9.794856333531993e-05, + "loss": 0.010134895332157612, + "num_input_tokens_seen": 59477632, + "step": 3632, + "train_runtime": 29930.2052, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 1.006371191135734, + "grad_norm": 0.06252193450927734, + "learning_rate": 9.7947317099788e-05, + "loss": 0.012678338214755058, + "num_input_tokens_seen": 59494008, + "step": 3633, + "train_runtime": 29938.4087, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 1.0066481994459833, + "grad_norm": 0.07511600106954575, + "learning_rate": 9.794607049376371e-05, + "loss": 0.009843099862337112, + "num_input_tokens_seen": 59510384, + "step": 3634, + "train_runtime": 29946.6377, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 1.0069252077562327, + "grad_norm": 0.03944838047027588, + "learning_rate": 9.79448235172567e-05, + "loss": 0.010693670250475407, + "num_input_tokens_seen": 59526760, + "step": 3635, + "train_runtime": 29954.8697, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 1.007202216066482, + "grad_norm": 0.07836639881134033, + "learning_rate": 9.794357617027659e-05, + "loss": 0.010531067848205566, + "num_input_tokens_seen": 59543136, + "step": 3636, + "train_runtime": 29963.1005, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 1.0074792243767312, + "grad_norm": 0.050196126103401184, + "learning_rate": 9.7942328452833e-05, + "loss": 0.00795952882617712, + "num_input_tokens_seen": 59559512, + "step": 3637, + "train_runtime": 29971.3366, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 1.0077562326869807, + "grad_norm": 0.060406945645809174, + "learning_rate": 9.794108036493562e-05, + "loss": 0.012395146302878857, + "num_input_tokens_seen": 59575888, + "step": 3638, + "train_runtime": 29979.5649, + "train_tokens_per_second": 1987.217 + }, + { + "epoch": 1.00803324099723, + "grad_norm": 0.06720355153083801, + "learning_rate": 9.793983190659404e-05, + "loss": 0.01191509235650301, + "num_input_tokens_seen": 59592264, + "step": 3639, + "train_runtime": 29987.7871, + "train_tokens_per_second": 1987.218 + }, + { + "epoch": 1.0083102493074791, + "grad_norm": 0.07782931625843048, + "learning_rate": 9.793858307781796e-05, + "loss": 0.011590758338570595, + "num_input_tokens_seen": 59608640, + "step": 3640, + "train_runtime": 29996.0138, + "train_tokens_per_second": 1987.219 + }, + { + "epoch": 1.0085872576177286, + "grad_norm": 0.09533634036779404, + "learning_rate": 9.793733387861698e-05, + "loss": 0.01419934630393982, + "num_input_tokens_seen": 59625016, + "step": 3641, + "train_runtime": 30004.2387, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 1.0088642659279778, + "grad_norm": 0.0443292111158371, + "learning_rate": 9.793608430900079e-05, + "loss": 0.010199371725320816, + "num_input_tokens_seen": 59641392, + "step": 3642, + "train_runtime": 30012.4687, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 1.009141274238227, + "grad_norm": 0.05916454643011093, + "learning_rate": 9.7934834368979e-05, + "loss": 0.011833828873932362, + "num_input_tokens_seen": 59657768, + "step": 3643, + "train_runtime": 30020.7043, + "train_tokens_per_second": 1987.221 + }, + { + "epoch": 1.0094182825484765, + "grad_norm": 0.042115677148103714, + "learning_rate": 9.793358405856135e-05, + "loss": 0.01303466409444809, + "num_input_tokens_seen": 59674144, + "step": 3644, + "train_runtime": 30028.9198, + "train_tokens_per_second": 1987.222 + }, + { + "epoch": 1.0096952908587258, + "grad_norm": 0.06406417489051819, + "learning_rate": 9.793233337775742e-05, + "loss": 0.012344742193818092, + "num_input_tokens_seen": 59690520, + "step": 3645, + "train_runtime": 30037.1393, + "train_tokens_per_second": 1987.224 + }, + { + "epoch": 1.009972299168975, + "grad_norm": 0.049749113619327545, + "learning_rate": 9.79310823265769e-05, + "loss": 0.01259616669267416, + "num_input_tokens_seen": 59706896, + "step": 3646, + "train_runtime": 30045.3604, + "train_tokens_per_second": 1987.225 + }, + { + "epoch": 1.0102493074792245, + "grad_norm": 0.046490930020809174, + "learning_rate": 9.792983090502948e-05, + "loss": 0.00851383712142706, + "num_input_tokens_seen": 59723272, + "step": 3647, + "train_runtime": 30053.5742, + "train_tokens_per_second": 1987.227 + }, + { + "epoch": 1.0105263157894737, + "grad_norm": 0.048397574573755264, + "learning_rate": 9.792857911312479e-05, + "loss": 0.012150234542787075, + "num_input_tokens_seen": 59739648, + "step": 3648, + "train_runtime": 30061.7836, + "train_tokens_per_second": 1987.229 + }, + { + "epoch": 1.010803324099723, + "grad_norm": 0.05693655461072922, + "learning_rate": 9.792732695087254e-05, + "loss": 0.011699183844029903, + "num_input_tokens_seen": 59756024, + "step": 3649, + "train_runtime": 30069.9951, + "train_tokens_per_second": 1987.231 + }, + { + "epoch": 1.0110803324099722, + "grad_norm": 0.056131087243556976, + "learning_rate": 9.792607441828239e-05, + "loss": 0.012289492413401604, + "num_input_tokens_seen": 59772400, + "step": 3650, + "train_runtime": 30078.2333, + "train_tokens_per_second": 1987.231 + }, + { + "epoch": 1.0113573407202217, + "grad_norm": 0.050599899142980576, + "learning_rate": 9.792482151536402e-05, + "loss": 0.009951387532055378, + "num_input_tokens_seen": 59788776, + "step": 3651, + "train_runtime": 30086.4574, + "train_tokens_per_second": 1987.232 + }, + { + "epoch": 1.011634349030471, + "grad_norm": 0.03378968685865402, + "learning_rate": 9.792356824212709e-05, + "loss": 0.01028426643460989, + "num_input_tokens_seen": 59805152, + "step": 3652, + "train_runtime": 30094.6855, + "train_tokens_per_second": 1987.233 + }, + { + "epoch": 1.0119113573407201, + "grad_norm": 0.0450938455760479, + "learning_rate": 9.792231459858132e-05, + "loss": 0.01001026015728712, + "num_input_tokens_seen": 59821528, + "step": 3653, + "train_runtime": 30102.9105, + "train_tokens_per_second": 1987.234 + }, + { + "epoch": 1.0121883656509696, + "grad_norm": 0.04889840632677078, + "learning_rate": 9.792106058473638e-05, + "loss": 0.010782317258417606, + "num_input_tokens_seen": 59837904, + "step": 3654, + "train_runtime": 30111.134, + "train_tokens_per_second": 1987.235 + }, + { + "epoch": 1.0124653739612188, + "grad_norm": 0.035153210163116455, + "learning_rate": 9.791980620060197e-05, + "loss": 0.010975798591971397, + "num_input_tokens_seen": 59854280, + "step": 3655, + "train_runtime": 30119.3556, + "train_tokens_per_second": 1987.236 + }, + { + "epoch": 1.012742382271468, + "grad_norm": 0.08780215680599213, + "learning_rate": 9.791855144618775e-05, + "loss": 0.010949173010885715, + "num_input_tokens_seen": 59870656, + "step": 3656, + "train_runtime": 30127.5822, + "train_tokens_per_second": 1987.237 + }, + { + "epoch": 1.0130193905817175, + "grad_norm": 0.06677771359682083, + "learning_rate": 9.791729632150346e-05, + "loss": 0.012678389437496662, + "num_input_tokens_seen": 59887032, + "step": 3657, + "train_runtime": 30135.807, + "train_tokens_per_second": 1987.238 + }, + { + "epoch": 1.0132963988919668, + "grad_norm": 0.06531117856502533, + "learning_rate": 9.791604082655877e-05, + "loss": 0.01282273419201374, + "num_input_tokens_seen": 59903408, + "step": 3658, + "train_runtime": 30144.0391, + "train_tokens_per_second": 1987.239 + }, + { + "epoch": 1.013573407202216, + "grad_norm": 0.04700224846601486, + "learning_rate": 9.791478496136338e-05, + "loss": 0.011431659571826458, + "num_input_tokens_seen": 59919784, + "step": 3659, + "train_runtime": 30152.2689, + "train_tokens_per_second": 1987.24 + }, + { + "epoch": 1.0138504155124655, + "grad_norm": 0.07276435941457748, + "learning_rate": 9.791352872592701e-05, + "loss": 0.010256974026560783, + "num_input_tokens_seen": 59936160, + "step": 3660, + "train_runtime": 30160.4973, + "train_tokens_per_second": 1987.24 + }, + { + "epoch": 1.0141274238227147, + "grad_norm": 0.032287344336509705, + "learning_rate": 9.791227212025936e-05, + "loss": 0.0089475242421031, + "num_input_tokens_seen": 59952536, + "step": 3661, + "train_runtime": 30168.7188, + "train_tokens_per_second": 1987.242 + }, + { + "epoch": 1.014404432132964, + "grad_norm": 0.05229182168841362, + "learning_rate": 9.791101514437014e-05, + "loss": 0.010817248374223709, + "num_input_tokens_seen": 59968912, + "step": 3662, + "train_runtime": 30176.943, + "train_tokens_per_second": 1987.243 + }, + { + "epoch": 1.0146814404432134, + "grad_norm": 0.04277021065354347, + "learning_rate": 9.790975779826906e-05, + "loss": 0.00959568377584219, + "num_input_tokens_seen": 59985288, + "step": 3663, + "train_runtime": 30185.1675, + "train_tokens_per_second": 1987.244 + }, + { + "epoch": 1.0149584487534626, + "grad_norm": 0.042264699935913086, + "learning_rate": 9.790850008196584e-05, + "loss": 0.012967845425009727, + "num_input_tokens_seen": 60001664, + "step": 3664, + "train_runtime": 30193.4023, + "train_tokens_per_second": 1987.244 + }, + { + "epoch": 1.0152354570637119, + "grad_norm": 0.07100909948348999, + "learning_rate": 9.79072419954702e-05, + "loss": 0.010396511293947697, + "num_input_tokens_seen": 60018040, + "step": 3665, + "train_runtime": 30201.6335, + "train_tokens_per_second": 1987.245 + }, + { + "epoch": 1.0155124653739611, + "grad_norm": 0.047444842755794525, + "learning_rate": 9.790598353879184e-05, + "loss": 0.011391282081604004, + "num_input_tokens_seen": 60034416, + "step": 3666, + "train_runtime": 30209.8611, + "train_tokens_per_second": 1987.246 + }, + { + "epoch": 1.0157894736842106, + "grad_norm": 0.07132462412118912, + "learning_rate": 9.790472471194053e-05, + "loss": 0.013723359443247318, + "num_input_tokens_seen": 60050792, + "step": 3667, + "train_runtime": 30218.0907, + "train_tokens_per_second": 1987.246 + }, + { + "epoch": 1.0160664819944598, + "grad_norm": 0.045108117163181305, + "learning_rate": 9.790346551492592e-05, + "loss": 0.010554434731602669, + "num_input_tokens_seen": 60067168, + "step": 3668, + "train_runtime": 30226.319, + "train_tokens_per_second": 1987.247 + }, + { + "epoch": 1.016343490304709, + "grad_norm": 0.07059457153081894, + "learning_rate": 9.790220594775783e-05, + "loss": 0.011221875436604023, + "num_input_tokens_seen": 60083544, + "step": 3669, + "train_runtime": 30234.5457, + "train_tokens_per_second": 1987.248 + }, + { + "epoch": 1.0166204986149585, + "grad_norm": 0.049399860203266144, + "learning_rate": 9.790094601044594e-05, + "loss": 0.012618140317499638, + "num_input_tokens_seen": 60099920, + "step": 3670, + "train_runtime": 30242.7755, + "train_tokens_per_second": 1987.249 + }, + { + "epoch": 1.0168975069252078, + "grad_norm": 0.07034856826066971, + "learning_rate": 9.789968570300002e-05, + "loss": 0.013265222311019897, + "num_input_tokens_seen": 60116296, + "step": 3671, + "train_runtime": 30250.9948, + "train_tokens_per_second": 1987.25 + }, + { + "epoch": 1.017174515235457, + "grad_norm": 0.049390584230422974, + "learning_rate": 9.789842502542976e-05, + "loss": 0.011406631208956242, + "num_input_tokens_seen": 60132672, + "step": 3672, + "train_runtime": 30259.2115, + "train_tokens_per_second": 1987.252 + }, + { + "epoch": 1.0174515235457064, + "grad_norm": 0.04167110100388527, + "learning_rate": 9.789716397774493e-05, + "loss": 0.009719545021653175, + "num_input_tokens_seen": 60149048, + "step": 3673, + "train_runtime": 30267.4398, + "train_tokens_per_second": 1987.253 + }, + { + "epoch": 1.0177285318559557, + "grad_norm": 0.044832851737737656, + "learning_rate": 9.789590255995526e-05, + "loss": 0.00949248019605875, + "num_input_tokens_seen": 60165424, + "step": 3674, + "train_runtime": 30275.6702, + "train_tokens_per_second": 1987.253 + }, + { + "epoch": 1.018005540166205, + "grad_norm": 0.05766524374485016, + "learning_rate": 9.789464077207053e-05, + "loss": 0.013615123927593231, + "num_input_tokens_seen": 60181800, + "step": 3675, + "train_runtime": 30283.894, + "train_tokens_per_second": 1987.254 + }, + { + "epoch": 1.0182825484764544, + "grad_norm": 0.043441858142614365, + "learning_rate": 9.789337861410046e-05, + "loss": 0.008945224806666374, + "num_input_tokens_seen": 60198176, + "step": 3676, + "train_runtime": 30292.1175, + "train_tokens_per_second": 1987.255 + }, + { + "epoch": 1.0185595567867036, + "grad_norm": 0.03616614267230034, + "learning_rate": 9.789211608605482e-05, + "loss": 0.011450565420091152, + "num_input_tokens_seen": 60214552, + "step": 3677, + "train_runtime": 30300.6561, + "train_tokens_per_second": 1987.236 + }, + { + "epoch": 1.0188365650969529, + "grad_norm": 0.04441287741065025, + "learning_rate": 9.789085318794335e-05, + "loss": 0.01162717118859291, + "num_input_tokens_seen": 60230928, + "step": 3678, + "train_runtime": 30308.9086, + "train_tokens_per_second": 1987.235 + }, + { + "epoch": 1.0191135734072023, + "grad_norm": 0.03945942595601082, + "learning_rate": 9.78895899197758e-05, + "loss": 0.010811977088451385, + "num_input_tokens_seen": 60247304, + "step": 3679, + "train_runtime": 30317.1373, + "train_tokens_per_second": 1987.236 + }, + { + "epoch": 1.0193905817174516, + "grad_norm": 0.05216655135154724, + "learning_rate": 9.788832628156197e-05, + "loss": 0.010767295025289059, + "num_input_tokens_seen": 60263680, + "step": 3680, + "train_runtime": 30325.3561, + "train_tokens_per_second": 1987.237 + }, + { + "epoch": 1.0196675900277008, + "grad_norm": 0.06829164922237396, + "learning_rate": 9.78870622733116e-05, + "loss": 0.014049794524908066, + "num_input_tokens_seen": 60280056, + "step": 3681, + "train_runtime": 30333.5832, + "train_tokens_per_second": 1987.238 + }, + { + "epoch": 1.01994459833795, + "grad_norm": 0.08486425876617432, + "learning_rate": 9.788579789503444e-05, + "loss": 0.012118814513087273, + "num_input_tokens_seen": 60296432, + "step": 3682, + "train_runtime": 30341.8043, + "train_tokens_per_second": 1987.239 + }, + { + "epoch": 1.0202216066481995, + "grad_norm": 0.06619695574045181, + "learning_rate": 9.788453314674028e-05, + "loss": 0.010997056029736996, + "num_input_tokens_seen": 60312808, + "step": 3683, + "train_runtime": 30350.0323, + "train_tokens_per_second": 1987.24 + }, + { + "epoch": 1.0204986149584487, + "grad_norm": 0.06674300134181976, + "learning_rate": 9.78832680284389e-05, + "loss": 0.009978181682527065, + "num_input_tokens_seen": 60329184, + "step": 3684, + "train_runtime": 30358.2599, + "train_tokens_per_second": 1987.241 + }, + { + "epoch": 1.020775623268698, + "grad_norm": 0.06078317016363144, + "learning_rate": 9.788200254014006e-05, + "loss": 0.01309856679290533, + "num_input_tokens_seen": 60345560, + "step": 3685, + "train_runtime": 30366.5088, + "train_tokens_per_second": 1987.241 + }, + { + "epoch": 1.0210526315789474, + "grad_norm": 0.09005976468324661, + "learning_rate": 9.788073668185354e-05, + "loss": 0.012789195403456688, + "num_input_tokens_seen": 60361936, + "step": 3686, + "train_runtime": 30374.7386, + "train_tokens_per_second": 1987.241 + }, + { + "epoch": 1.0213296398891967, + "grad_norm": 0.0734124556183815, + "learning_rate": 9.787947045358913e-05, + "loss": 0.011376534588634968, + "num_input_tokens_seen": 60378312, + "step": 3687, + "train_runtime": 30382.9539, + "train_tokens_per_second": 1987.243 + }, + { + "epoch": 1.021606648199446, + "grad_norm": 0.07224555313587189, + "learning_rate": 9.787820385535663e-05, + "loss": 0.011718581430613995, + "num_input_tokens_seen": 60394688, + "step": 3688, + "train_runtime": 30391.1676, + "train_tokens_per_second": 1987.245 + }, + { + "epoch": 1.0218836565096954, + "grad_norm": 0.04958103224635124, + "learning_rate": 9.787693688716578e-05, + "loss": 0.012680839747190475, + "num_input_tokens_seen": 60411064, + "step": 3689, + "train_runtime": 30399.3751, + "train_tokens_per_second": 1987.247 + }, + { + "epoch": 1.0221606648199446, + "grad_norm": 0.05930076166987419, + "learning_rate": 9.78756695490264e-05, + "loss": 0.011680078692734241, + "num_input_tokens_seen": 60427440, + "step": 3690, + "train_runtime": 30407.5932, + "train_tokens_per_second": 1987.248 + }, + { + "epoch": 1.0224376731301938, + "grad_norm": 0.04488557577133179, + "learning_rate": 9.78744018409483e-05, + "loss": 0.010207741521298885, + "num_input_tokens_seen": 60443816, + "step": 3691, + "train_runtime": 30415.8201, + "train_tokens_per_second": 1987.249 + }, + { + "epoch": 1.0227146814404433, + "grad_norm": 0.051360681653022766, + "learning_rate": 9.787313376294125e-05, + "loss": 0.011010023765265942, + "num_input_tokens_seen": 60460192, + "step": 3692, + "train_runtime": 30424.0393, + "train_tokens_per_second": 1987.251 + }, + { + "epoch": 1.0229916897506925, + "grad_norm": 0.06421680003404617, + "learning_rate": 9.787186531501505e-05, + "loss": 0.010802016593515873, + "num_input_tokens_seen": 60476568, + "step": 3693, + "train_runtime": 30432.2771, + "train_tokens_per_second": 1987.251 + }, + { + "epoch": 1.0232686980609418, + "grad_norm": 0.060446903109550476, + "learning_rate": 9.78705964971795e-05, + "loss": 0.011222767643630505, + "num_input_tokens_seen": 60492944, + "step": 3694, + "train_runtime": 30440.5073, + "train_tokens_per_second": 1987.252 + }, + { + "epoch": 1.0235457063711912, + "grad_norm": 0.06490682065486908, + "learning_rate": 9.786932730944441e-05, + "loss": 0.011992533691227436, + "num_input_tokens_seen": 60509320, + "step": 3695, + "train_runtime": 30448.737, + "train_tokens_per_second": 1987.252 + }, + { + "epoch": 1.0238227146814405, + "grad_norm": 0.08036469668149948, + "learning_rate": 9.78680577518196e-05, + "loss": 0.013147721067070961, + "num_input_tokens_seen": 60525696, + "step": 3696, + "train_runtime": 30456.9661, + "train_tokens_per_second": 1987.253 + }, + { + "epoch": 1.0240997229916897, + "grad_norm": 0.06541439145803452, + "learning_rate": 9.786678782431486e-05, + "loss": 0.012844950892031193, + "num_input_tokens_seen": 60542072, + "step": 3697, + "train_runtime": 30465.1921, + "train_tokens_per_second": 1987.254 + }, + { + "epoch": 1.024376731301939, + "grad_norm": 0.06265795975923538, + "learning_rate": 9.786551752694e-05, + "loss": 0.010767239145934582, + "num_input_tokens_seen": 60558448, + "step": 3698, + "train_runtime": 30473.4225, + "train_tokens_per_second": 1987.255 + }, + { + "epoch": 1.0246537396121884, + "grad_norm": 0.056019335985183716, + "learning_rate": 9.786424685970486e-05, + "loss": 0.011096064001321793, + "num_input_tokens_seen": 60574824, + "step": 3699, + "train_runtime": 30481.6564, + "train_tokens_per_second": 1987.255 + }, + { + "epoch": 1.0249307479224377, + "grad_norm": 0.08501900732517242, + "learning_rate": 9.786297582261923e-05, + "loss": 0.010261152870953083, + "num_input_tokens_seen": 60591200, + "step": 3700, + "train_runtime": 30489.8743, + "train_tokens_per_second": 1987.256 + }, + { + "epoch": 1.025207756232687, + "grad_norm": 0.061464823782444, + "learning_rate": 9.786170441569296e-05, + "loss": 0.011175259947776794, + "num_input_tokens_seen": 60607576, + "step": 3701, + "train_runtime": 30500.055, + "train_tokens_per_second": 1987.13 + }, + { + "epoch": 1.0254847645429364, + "grad_norm": 0.0559670589864254, + "learning_rate": 9.786043263893585e-05, + "loss": 0.012587152421474457, + "num_input_tokens_seen": 60623952, + "step": 3702, + "train_runtime": 30508.274, + "train_tokens_per_second": 1987.131 + }, + { + "epoch": 1.0257617728531856, + "grad_norm": 0.058664631098508835, + "learning_rate": 9.785916049235775e-05, + "loss": 0.01207727286964655, + "num_input_tokens_seen": 60640328, + "step": 3703, + "train_runtime": 30516.5039, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 1.0260387811634348, + "grad_norm": 0.04845740646123886, + "learning_rate": 9.785788797596847e-05, + "loss": 0.0098965372890234, + "num_input_tokens_seen": 60656704, + "step": 3704, + "train_runtime": 30524.7336, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 1.0263157894736843, + "grad_norm": 0.046639710664749146, + "learning_rate": 9.785661508977784e-05, + "loss": 0.010422957129776478, + "num_input_tokens_seen": 60673080, + "step": 3705, + "train_runtime": 30532.9704, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 1.0265927977839335, + "grad_norm": 0.03775177523493767, + "learning_rate": 9.785534183379572e-05, + "loss": 0.010988934896886349, + "num_input_tokens_seen": 60689456, + "step": 3706, + "train_runtime": 30541.2031, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 1.0268698060941828, + "grad_norm": 0.07257435470819473, + "learning_rate": 9.785406820803192e-05, + "loss": 0.009688058868050575, + "num_input_tokens_seen": 60705832, + "step": 3707, + "train_runtime": 30549.4279, + "train_tokens_per_second": 1987.135 + }, + { + "epoch": 1.0271468144044322, + "grad_norm": 0.04892987757921219, + "learning_rate": 9.78527942124963e-05, + "loss": 0.010717351920902729, + "num_input_tokens_seen": 60722208, + "step": 3708, + "train_runtime": 30557.6565, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 1.0274238227146815, + "grad_norm": 0.05803246051073074, + "learning_rate": 9.78515198471987e-05, + "loss": 0.01088717021048069, + "num_input_tokens_seen": 60738584, + "step": 3709, + "train_runtime": 30565.883, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 1.0277008310249307, + "grad_norm": 0.04442107677459717, + "learning_rate": 9.785024511214897e-05, + "loss": 0.0100160026922822, + "num_input_tokens_seen": 60754960, + "step": 3710, + "train_runtime": 30574.117, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 1.02797783933518, + "grad_norm": 0.05717245116829872, + "learning_rate": 9.784897000735695e-05, + "loss": 0.009621836245059967, + "num_input_tokens_seen": 60771336, + "step": 3711, + "train_runtime": 30582.3541, + "train_tokens_per_second": 1987.137 + }, + { + "epoch": 1.0282548476454294, + "grad_norm": 0.04262928292155266, + "learning_rate": 9.784769453283252e-05, + "loss": 0.011739905923604965, + "num_input_tokens_seen": 60787712, + "step": 3712, + "train_runtime": 30590.5855, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 1.0285318559556786, + "grad_norm": 0.06055309996008873, + "learning_rate": 9.784641868858548e-05, + "loss": 0.011729092337191105, + "num_input_tokens_seen": 60804088, + "step": 3713, + "train_runtime": 30598.8063, + "train_tokens_per_second": 1987.139 + }, + { + "epoch": 1.0288088642659279, + "grad_norm": 0.04016323387622833, + "learning_rate": 9.784514247462575e-05, + "loss": 0.010123098269104958, + "num_input_tokens_seen": 60820464, + "step": 3714, + "train_runtime": 30607.0294, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 1.0290858725761773, + "grad_norm": 0.03826018422842026, + "learning_rate": 9.784386589096316e-05, + "loss": 0.010425163432955742, + "num_input_tokens_seen": 60836840, + "step": 3715, + "train_runtime": 30615.2561, + "train_tokens_per_second": 1987.141 + }, + { + "epoch": 1.0293628808864266, + "grad_norm": 0.060615140944719315, + "learning_rate": 9.784258893760759e-05, + "loss": 0.011432434432208538, + "num_input_tokens_seen": 60853216, + "step": 3716, + "train_runtime": 30623.4855, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 1.0296398891966758, + "grad_norm": 0.052922558039426804, + "learning_rate": 9.784131161456888e-05, + "loss": 0.010403708554804325, + "num_input_tokens_seen": 60869592, + "step": 3717, + "train_runtime": 30631.711, + "train_tokens_per_second": 1987.143 + }, + { + "epoch": 1.0299168975069253, + "grad_norm": 0.05775102600455284, + "learning_rate": 9.784003392185693e-05, + "loss": 0.01153525523841381, + "num_input_tokens_seen": 60885968, + "step": 3718, + "train_runtime": 30639.925, + "train_tokens_per_second": 1987.145 + }, + { + "epoch": 1.0301939058171745, + "grad_norm": 0.058432869613170624, + "learning_rate": 9.783875585948157e-05, + "loss": 0.011149376630783081, + "num_input_tokens_seen": 60902344, + "step": 3719, + "train_runtime": 30648.1376, + "train_tokens_per_second": 1987.147 + }, + { + "epoch": 1.0304709141274238, + "grad_norm": 0.032725173979997635, + "learning_rate": 9.783747742745273e-05, + "loss": 0.009583801962435246, + "num_input_tokens_seen": 60918720, + "step": 3720, + "train_runtime": 30656.3563, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 1.0307479224376732, + "grad_norm": 0.061142679303884506, + "learning_rate": 9.783619862578027e-05, + "loss": 0.009698698297142982, + "num_input_tokens_seen": 60935096, + "step": 3721, + "train_runtime": 30664.5803, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 1.0310249307479225, + "grad_norm": 0.06012491509318352, + "learning_rate": 9.783491945447404e-05, + "loss": 0.011399386450648308, + "num_input_tokens_seen": 60951472, + "step": 3722, + "train_runtime": 30672.8151, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 1.0313019390581717, + "grad_norm": 0.04691271483898163, + "learning_rate": 9.783363991354396e-05, + "loss": 0.010484754107892513, + "num_input_tokens_seen": 60967848, + "step": 3723, + "train_runtime": 30681.0554, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 1.0315789473684212, + "grad_norm": 0.051526911556720734, + "learning_rate": 9.783236000299991e-05, + "loss": 0.011047436855733395, + "num_input_tokens_seen": 60984224, + "step": 3724, + "train_runtime": 30689.2838, + "train_tokens_per_second": 1987.15 + }, + { + "epoch": 1.0318559556786704, + "grad_norm": 0.04462570697069168, + "learning_rate": 9.783107972285177e-05, + "loss": 0.009913088753819466, + "num_input_tokens_seen": 61000600, + "step": 3725, + "train_runtime": 30697.5093, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 1.0321329639889196, + "grad_norm": 0.06855393201112747, + "learning_rate": 9.782979907310945e-05, + "loss": 0.012754350900650024, + "num_input_tokens_seen": 61016976, + "step": 3726, + "train_runtime": 30705.7339, + "train_tokens_per_second": 1987.153 + }, + { + "epoch": 1.032409972299169, + "grad_norm": 0.06689037382602692, + "learning_rate": 9.782851805378283e-05, + "loss": 0.013249976560473442, + "num_input_tokens_seen": 61033352, + "step": 3727, + "train_runtime": 30713.945, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 1.0326869806094183, + "grad_norm": 0.08854853361845016, + "learning_rate": 9.782723666488181e-05, + "loss": 0.01087472215294838, + "num_input_tokens_seen": 61049728, + "step": 3728, + "train_runtime": 30722.1538, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 1.0329639889196676, + "grad_norm": 0.04691392183303833, + "learning_rate": 9.782595490641629e-05, + "loss": 0.01221020333468914, + "num_input_tokens_seen": 61066104, + "step": 3729, + "train_runtime": 30730.3626, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 1.0332409972299168, + "grad_norm": 0.03914070501923561, + "learning_rate": 9.78246727783962e-05, + "loss": 0.011943979188799858, + "num_input_tokens_seen": 61082480, + "step": 3730, + "train_runtime": 30738.5679, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 1.0335180055401663, + "grad_norm": 0.04972892999649048, + "learning_rate": 9.78233902808314e-05, + "loss": 0.013439825735986233, + "num_input_tokens_seen": 61098856, + "step": 3731, + "train_runtime": 30746.7834, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 1.0337950138504155, + "grad_norm": 0.03473243489861488, + "learning_rate": 9.782210741373184e-05, + "loss": 0.011745716445147991, + "num_input_tokens_seen": 61115232, + "step": 3732, + "train_runtime": 30755.0078, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 1.0340720221606647, + "grad_norm": 0.04043061286211014, + "learning_rate": 9.78208241771074e-05, + "loss": 0.009562662802636623, + "num_input_tokens_seen": 61131608, + "step": 3733, + "train_runtime": 30763.2333, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 1.0343490304709142, + "grad_norm": 0.03830944374203682, + "learning_rate": 9.781954057096805e-05, + "loss": 0.009983079507946968, + "num_input_tokens_seen": 61147984, + "step": 3734, + "train_runtime": 30771.4627, + "train_tokens_per_second": 1987.165 + }, + { + "epoch": 1.0346260387811634, + "grad_norm": 0.04605558514595032, + "learning_rate": 9.781825659532365e-05, + "loss": 0.013882147148251534, + "num_input_tokens_seen": 61164360, + "step": 3735, + "train_runtime": 30779.6911, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 1.0349030470914127, + "grad_norm": 0.060307785868644714, + "learning_rate": 9.781697225018413e-05, + "loss": 0.010406117886304855, + "num_input_tokens_seen": 61180736, + "step": 3736, + "train_runtime": 30787.9157, + "train_tokens_per_second": 1987.167 + }, + { + "epoch": 1.0351800554016621, + "grad_norm": 0.04631492495536804, + "learning_rate": 9.781568753555945e-05, + "loss": 0.011098505929112434, + "num_input_tokens_seen": 61197112, + "step": 3737, + "train_runtime": 30796.1459, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 1.0354570637119114, + "grad_norm": 0.046586982905864716, + "learning_rate": 9.781440245145952e-05, + "loss": 0.01199441496282816, + "num_input_tokens_seen": 61213488, + "step": 3738, + "train_runtime": 30804.3739, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 1.0357340720221606, + "grad_norm": 0.042752109467983246, + "learning_rate": 9.781311699789426e-05, + "loss": 0.010556434281170368, + "num_input_tokens_seen": 61229864, + "step": 3739, + "train_runtime": 30812.608, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 1.03601108033241, + "grad_norm": 0.07734595984220505, + "learning_rate": 9.78118311748736e-05, + "loss": 0.011261317878961563, + "num_input_tokens_seen": 61246240, + "step": 3740, + "train_runtime": 30820.8445, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 1.0362880886426593, + "grad_norm": 0.02813613787293434, + "learning_rate": 9.78105449824075e-05, + "loss": 0.010506111197173595, + "num_input_tokens_seen": 61262616, + "step": 3741, + "train_runtime": 30829.0765, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 1.0365650969529085, + "grad_norm": 0.03174810856580734, + "learning_rate": 9.780925842050586e-05, + "loss": 0.008601982146501541, + "num_input_tokens_seen": 61278992, + "step": 3742, + "train_runtime": 30837.3031, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 1.0368421052631578, + "grad_norm": 0.04199977591633797, + "learning_rate": 9.780797148917865e-05, + "loss": 0.009148536249995232, + "num_input_tokens_seen": 61295368, + "step": 3743, + "train_runtime": 30845.5374, + "train_tokens_per_second": 1987.171 + }, + { + "epoch": 1.0371191135734072, + "grad_norm": 0.07485923171043396, + "learning_rate": 9.780668418843581e-05, + "loss": 0.012233344838023186, + "num_input_tokens_seen": 61311744, + "step": 3744, + "train_runtime": 30853.7674, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 1.0373961218836565, + "grad_norm": 0.058560196310281754, + "learning_rate": 9.780539651828728e-05, + "loss": 0.012456191703677177, + "num_input_tokens_seen": 61328120, + "step": 3745, + "train_runtime": 30862.008, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 1.0376731301939057, + "grad_norm": 0.05709508806467056, + "learning_rate": 9.780410847874304e-05, + "loss": 0.01067947968840599, + "num_input_tokens_seen": 61344496, + "step": 3746, + "train_runtime": 30870.2471, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 1.0379501385041552, + "grad_norm": 0.051771316677331924, + "learning_rate": 9.7802820069813e-05, + "loss": 0.011675294488668442, + "num_input_tokens_seen": 61360872, + "step": 3747, + "train_runtime": 30878.4787, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 1.0382271468144044, + "grad_norm": 0.057265184819698334, + "learning_rate": 9.780153129150713e-05, + "loss": 0.009978496469557285, + "num_input_tokens_seen": 61377248, + "step": 3748, + "train_runtime": 30886.7124, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 1.0385041551246537, + "grad_norm": 0.05738464742898941, + "learning_rate": 9.780024214383538e-05, + "loss": 0.012085766531527042, + "num_input_tokens_seen": 61393624, + "step": 3749, + "train_runtime": 30894.9423, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 1.0387811634349031, + "grad_norm": 0.05900377407670021, + "learning_rate": 9.779895262680775e-05, + "loss": 0.012183531187474728, + "num_input_tokens_seen": 61410000, + "step": 3750, + "train_runtime": 30903.1734, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 1.0390581717451524, + "grad_norm": 0.025397704914212227, + "learning_rate": 9.779766274043416e-05, + "loss": 0.00938949454575777, + "num_input_tokens_seen": 61426376, + "step": 3751, + "train_runtime": 30911.4039, + "train_tokens_per_second": 1987.175 + }, + { + "epoch": 1.0393351800554016, + "grad_norm": 0.0699508935213089, + "learning_rate": 9.779637248472461e-05, + "loss": 0.01285634282976389, + "num_input_tokens_seen": 61442752, + "step": 3752, + "train_runtime": 30919.6326, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 1.039612188365651, + "grad_norm": 0.045774850994348526, + "learning_rate": 9.779508185968904e-05, + "loss": 0.01148825604468584, + "num_input_tokens_seen": 61459128, + "step": 3753, + "train_runtime": 30927.8577, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 1.0398891966759003, + "grad_norm": 0.0671430379152298, + "learning_rate": 9.779379086533744e-05, + "loss": 0.013432170264422894, + "num_input_tokens_seen": 61475504, + "step": 3754, + "train_runtime": 30936.0827, + "train_tokens_per_second": 1987.178 + }, + { + "epoch": 1.0401662049861495, + "grad_norm": 0.034271471202373505, + "learning_rate": 9.779249950167979e-05, + "loss": 0.011979158967733383, + "num_input_tokens_seen": 61491880, + "step": 3755, + "train_runtime": 30944.3096, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 1.040443213296399, + "grad_norm": 0.084877148270607, + "learning_rate": 9.779120776872605e-05, + "loss": 0.01280397828668356, + "num_input_tokens_seen": 61508256, + "step": 3756, + "train_runtime": 30952.5394, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 1.0407202216066482, + "grad_norm": 0.06519736349582672, + "learning_rate": 9.778991566648622e-05, + "loss": 0.01007282454520464, + "num_input_tokens_seen": 61524632, + "step": 3757, + "train_runtime": 30960.7687, + "train_tokens_per_second": 1987.18 + }, + { + "epoch": 1.0409972299168975, + "grad_norm": 0.02952883020043373, + "learning_rate": 9.778862319497027e-05, + "loss": 0.008219470269978046, + "num_input_tokens_seen": 61541008, + "step": 3758, + "train_runtime": 30968.9991, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 1.0412742382271467, + "grad_norm": 0.06788607686758041, + "learning_rate": 9.77873303541882e-05, + "loss": 0.01322537288069725, + "num_input_tokens_seen": 61557384, + "step": 3759, + "train_runtime": 30977.2319, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 1.0415512465373962, + "grad_norm": 0.03813086450099945, + "learning_rate": 9.778603714415e-05, + "loss": 0.01216287910938263, + "num_input_tokens_seen": 61573760, + "step": 3760, + "train_runtime": 30985.4625, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 1.0418282548476454, + "grad_norm": 0.053595319390296936, + "learning_rate": 9.778474356486564e-05, + "loss": 0.008322312496602535, + "num_input_tokens_seen": 61590136, + "step": 3761, + "train_runtime": 30993.6906, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 1.0421052631578946, + "grad_norm": 0.08930396288633347, + "learning_rate": 9.778344961634514e-05, + "loss": 0.00945754162967205, + "num_input_tokens_seen": 61606512, + "step": 3762, + "train_runtime": 31001.9245, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 1.042382271468144, + "grad_norm": 0.043400079011917114, + "learning_rate": 9.778215529859849e-05, + "loss": 0.01094425655901432, + "num_input_tokens_seen": 61622888, + "step": 3763, + "train_runtime": 31010.1575, + "train_tokens_per_second": 1987.184 + }, + { + "epoch": 1.0426592797783933, + "grad_norm": 0.04966172203421593, + "learning_rate": 9.77808606116357e-05, + "loss": 0.008884826675057411, + "num_input_tokens_seen": 61639264, + "step": 3764, + "train_runtime": 31018.3729, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 1.0429362880886426, + "grad_norm": 0.07539623230695724, + "learning_rate": 9.777956555546674e-05, + "loss": 0.010173491202294827, + "num_input_tokens_seen": 61655640, + "step": 3765, + "train_runtime": 31026.5864, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 1.043213296398892, + "grad_norm": 0.04602622985839844, + "learning_rate": 9.777827013010166e-05, + "loss": 0.008305387571454048, + "num_input_tokens_seen": 61672016, + "step": 3766, + "train_runtime": 31034.8118, + "train_tokens_per_second": 1987.188 + }, + { + "epoch": 1.0434903047091413, + "grad_norm": 0.051836974918842316, + "learning_rate": 9.777697433555047e-05, + "loss": 0.010582228191196918, + "num_input_tokens_seen": 61688392, + "step": 3767, + "train_runtime": 31043.0386, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 1.0437673130193905, + "grad_norm": 0.04897290840744972, + "learning_rate": 9.777567817182313e-05, + "loss": 0.00779558252543211, + "num_input_tokens_seen": 61704768, + "step": 3768, + "train_runtime": 31051.2547, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 1.04404432132964, + "grad_norm": 0.07583406567573547, + "learning_rate": 9.777438163892972e-05, + "loss": 0.01102156937122345, + "num_input_tokens_seen": 61721144, + "step": 3769, + "train_runtime": 31059.4651, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 1.0443213296398892, + "grad_norm": 0.06081093102693558, + "learning_rate": 9.777308473688022e-05, + "loss": 0.015072943642735481, + "num_input_tokens_seen": 61737520, + "step": 3770, + "train_runtime": 31067.6798, + "train_tokens_per_second": 1987.194 + }, + { + "epoch": 1.0445983379501385, + "grad_norm": 0.06086336076259613, + "learning_rate": 9.777178746568466e-05, + "loss": 0.011803150177001953, + "num_input_tokens_seen": 61753896, + "step": 3771, + "train_runtime": 31075.8918, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 1.044875346260388, + "grad_norm": 0.058224134147167206, + "learning_rate": 9.777048982535306e-05, + "loss": 0.012882440350949764, + "num_input_tokens_seen": 61770272, + "step": 3772, + "train_runtime": 31084.1104, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 1.0451523545706372, + "grad_norm": 0.05346133932471275, + "learning_rate": 9.776919181589546e-05, + "loss": 0.010661358945071697, + "num_input_tokens_seen": 61786648, + "step": 3773, + "train_runtime": 31092.3394, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 1.0454293628808864, + "grad_norm": 0.04824414104223251, + "learning_rate": 9.776789343732186e-05, + "loss": 0.01093046274036169, + "num_input_tokens_seen": 61803024, + "step": 3774, + "train_runtime": 31100.5805, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 1.0457063711911356, + "grad_norm": 0.051292773336172104, + "learning_rate": 9.776659468964234e-05, + "loss": 0.011679714545607567, + "num_input_tokens_seen": 61819400, + "step": 3775, + "train_runtime": 31108.8113, + "train_tokens_per_second": 1987.199 + }, + { + "epoch": 1.045983379501385, + "grad_norm": 0.23070378601551056, + "learning_rate": 9.776529557286689e-05, + "loss": 0.012169376015663147, + "num_input_tokens_seen": 61835776, + "step": 3776, + "train_runtime": 31117.0391, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 1.0462603878116343, + "grad_norm": 0.06754060089588165, + "learning_rate": 9.776399608700557e-05, + "loss": 0.010546913370490074, + "num_input_tokens_seen": 61852152, + "step": 3777, + "train_runtime": 31125.2652, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 1.0465373961218836, + "grad_norm": 0.08960000425577164, + "learning_rate": 9.776269623206843e-05, + "loss": 0.010381987318396568, + "num_input_tokens_seen": 61868528, + "step": 3778, + "train_runtime": 31133.5096, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 1.046814404432133, + "grad_norm": 0.07647223025560379, + "learning_rate": 9.77613960080655e-05, + "loss": 0.011068910360336304, + "num_input_tokens_seen": 61884904, + "step": 3779, + "train_runtime": 31141.7449, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 1.0470914127423823, + "grad_norm": 0.040842942893505096, + "learning_rate": 9.776009541500684e-05, + "loss": 0.014251110143959522, + "num_input_tokens_seen": 61901280, + "step": 3780, + "train_runtime": 31149.9708, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 1.0473684210526315, + "grad_norm": 0.04225728288292885, + "learning_rate": 9.775879445290247e-05, + "loss": 0.00884745828807354, + "num_input_tokens_seen": 61917656, + "step": 3781, + "train_runtime": 31158.1826, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 1.047645429362881, + "grad_norm": 0.07172433286905289, + "learning_rate": 9.775749312176248e-05, + "loss": 0.010176447220146656, + "num_input_tokens_seen": 61934032, + "step": 3782, + "train_runtime": 31166.3918, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 1.0479224376731302, + "grad_norm": 0.058885496109724045, + "learning_rate": 9.77561914215969e-05, + "loss": 0.01193478424102068, + "num_input_tokens_seen": 61950408, + "step": 3783, + "train_runtime": 31174.6032, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 1.0481994459833794, + "grad_norm": 0.06504636257886887, + "learning_rate": 9.77548893524158e-05, + "loss": 0.013142688199877739, + "num_input_tokens_seen": 61966784, + "step": 3784, + "train_runtime": 31182.8125, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 1.048476454293629, + "grad_norm": 0.047887224704027176, + "learning_rate": 9.775358691422922e-05, + "loss": 0.008157146163284779, + "num_input_tokens_seen": 61983160, + "step": 3785, + "train_runtime": 31191.0257, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 1.0487534626038781, + "grad_norm": 0.024107590317726135, + "learning_rate": 9.775228410704725e-05, + "loss": 0.009500520303845406, + "num_input_tokens_seen": 61999536, + "step": 3786, + "train_runtime": 31199.2382, + "train_tokens_per_second": 1987.213 + }, + { + "epoch": 1.0490304709141274, + "grad_norm": 0.05491548404097557, + "learning_rate": 9.775098093087996e-05, + "loss": 0.010792186483740807, + "num_input_tokens_seen": 62015912, + "step": 3787, + "train_runtime": 31207.4416, + "train_tokens_per_second": 1987.216 + }, + { + "epoch": 1.0493074792243768, + "grad_norm": 0.05610409379005432, + "learning_rate": 9.774967738573739e-05, + "loss": 0.011645480990409851, + "num_input_tokens_seen": 62032288, + "step": 3788, + "train_runtime": 31215.6632, + "train_tokens_per_second": 1987.217 + }, + { + "epoch": 1.049584487534626, + "grad_norm": 0.04042061045765877, + "learning_rate": 9.774837347162964e-05, + "loss": 0.01034472230821848, + "num_input_tokens_seen": 62048664, + "step": 3789, + "train_runtime": 31223.8767, + "train_tokens_per_second": 1987.218 + }, + { + "epoch": 1.0498614958448753, + "grad_norm": 0.05468621477484703, + "learning_rate": 9.774706918856679e-05, + "loss": 0.010311242192983627, + "num_input_tokens_seen": 62065040, + "step": 3790, + "train_runtime": 31232.0857, + "train_tokens_per_second": 1987.22 + }, + { + "epoch": 1.0501385041551246, + "grad_norm": 0.13691551983356476, + "learning_rate": 9.774576453655888e-05, + "loss": 0.014181950129568577, + "num_input_tokens_seen": 62081416, + "step": 3791, + "train_runtime": 31240.2953, + "train_tokens_per_second": 1987.222 + }, + { + "epoch": 1.050415512465374, + "grad_norm": 0.05216197296977043, + "learning_rate": 9.774445951561603e-05, + "loss": 0.01007768977433443, + "num_input_tokens_seen": 62097792, + "step": 3792, + "train_runtime": 31248.5135, + "train_tokens_per_second": 1987.224 + }, + { + "epoch": 1.0506925207756233, + "grad_norm": 0.054140180349349976, + "learning_rate": 9.77431541257483e-05, + "loss": 0.012014862149953842, + "num_input_tokens_seen": 62114168, + "step": 3793, + "train_runtime": 31256.7282, + "train_tokens_per_second": 1987.226 + }, + { + "epoch": 1.0509695290858725, + "grad_norm": 0.041986070573329926, + "learning_rate": 9.77418483669658e-05, + "loss": 0.010511999018490314, + "num_input_tokens_seen": 62130544, + "step": 3794, + "train_runtime": 31264.9393, + "train_tokens_per_second": 1987.227 + }, + { + "epoch": 1.051246537396122, + "grad_norm": 0.057239748537540436, + "learning_rate": 9.77405422392786e-05, + "loss": 0.012372869998216629, + "num_input_tokens_seen": 62146920, + "step": 3795, + "train_runtime": 31273.1449, + "train_tokens_per_second": 1987.23 + }, + { + "epoch": 1.0515235457063712, + "grad_norm": 0.18706531822681427, + "learning_rate": 9.77392357426968e-05, + "loss": 0.013176257722079754, + "num_input_tokens_seen": 62163296, + "step": 3796, + "train_runtime": 31281.3709, + "train_tokens_per_second": 1987.231 + }, + { + "epoch": 1.0518005540166204, + "grad_norm": 0.040477264672517776, + "learning_rate": 9.77379288772305e-05, + "loss": 0.011299707926809788, + "num_input_tokens_seen": 62179672, + "step": 3797, + "train_runtime": 31289.5812, + "train_tokens_per_second": 1987.232 + }, + { + "epoch": 1.0520775623268699, + "grad_norm": 0.041915178298950195, + "learning_rate": 9.773662164288978e-05, + "loss": 0.012125221081078053, + "num_input_tokens_seen": 62196048, + "step": 3798, + "train_runtime": 31297.7994, + "train_tokens_per_second": 1987.234 + }, + { + "epoch": 1.0523545706371191, + "grad_norm": 0.035921331495046616, + "learning_rate": 9.773531403968476e-05, + "loss": 0.01233083289116621, + "num_input_tokens_seen": 62212424, + "step": 3799, + "train_runtime": 31306.0175, + "train_tokens_per_second": 1987.235 + }, + { + "epoch": 1.0526315789473684, + "grad_norm": 0.030279694125056267, + "learning_rate": 9.773400606762552e-05, + "loss": 0.008982674218714237, + "num_input_tokens_seen": 62228800, + "step": 3800, + "train_runtime": 31314.2377, + "train_tokens_per_second": 1987.237 + }, + { + "epoch": 1.0529085872576178, + "grad_norm": 0.03175919130444527, + "learning_rate": 9.773269772672222e-05, + "loss": 0.011729382909834385, + "num_input_tokens_seen": 62245176, + "step": 3801, + "train_runtime": 31324.1693, + "train_tokens_per_second": 1987.129 + }, + { + "epoch": 1.053185595567867, + "grad_norm": 0.0916648581624031, + "learning_rate": 9.773138901698492e-05, + "loss": 0.011235182173550129, + "num_input_tokens_seen": 62261552, + "step": 3802, + "train_runtime": 31332.3682, + "train_tokens_per_second": 1987.132 + }, + { + "epoch": 1.0534626038781163, + "grad_norm": 0.035021521151065826, + "learning_rate": 9.773007993842372e-05, + "loss": 0.009924542158842087, + "num_input_tokens_seen": 62277928, + "step": 3803, + "train_runtime": 31340.5881, + "train_tokens_per_second": 1987.133 + }, + { + "epoch": 1.0537396121883655, + "grad_norm": 0.054930541664361954, + "learning_rate": 9.772877049104878e-05, + "loss": 0.012695014476776123, + "num_input_tokens_seen": 62294304, + "step": 3804, + "train_runtime": 31348.8206, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 1.054016620498615, + "grad_norm": 0.07430452108383179, + "learning_rate": 9.77274606748702e-05, + "loss": 0.011438942514359951, + "num_input_tokens_seen": 62310680, + "step": 3805, + "train_runtime": 31357.0553, + "train_tokens_per_second": 1987.134 + }, + { + "epoch": 1.0542936288088642, + "grad_norm": 0.08182372152805328, + "learning_rate": 9.772615048989811e-05, + "loss": 0.012292424216866493, + "num_input_tokens_seen": 62327056, + "step": 3806, + "train_runtime": 31365.2706, + "train_tokens_per_second": 1987.136 + }, + { + "epoch": 1.0545706371191135, + "grad_norm": 0.056593578308820724, + "learning_rate": 9.772483993614261e-05, + "loss": 0.012162161991000175, + "num_input_tokens_seen": 62343432, + "step": 3807, + "train_runtime": 31373.4838, + "train_tokens_per_second": 1987.138 + }, + { + "epoch": 1.054847645429363, + "grad_norm": 0.06572683155536652, + "learning_rate": 9.772352901361385e-05, + "loss": 0.01052769459784031, + "num_input_tokens_seen": 62359808, + "step": 3808, + "train_runtime": 31381.6926, + "train_tokens_per_second": 1987.14 + }, + { + "epoch": 1.0551246537396122, + "grad_norm": 0.048856306821107864, + "learning_rate": 9.772221772232195e-05, + "loss": 0.010813510976731777, + "num_input_tokens_seen": 62376184, + "step": 3809, + "train_runtime": 31389.8945, + "train_tokens_per_second": 1987.142 + }, + { + "epoch": 1.0554016620498614, + "grad_norm": 0.059242624789476395, + "learning_rate": 9.772090606227704e-05, + "loss": 0.012859644368290901, + "num_input_tokens_seen": 62392560, + "step": 3810, + "train_runtime": 31398.1079, + "train_tokens_per_second": 1987.144 + }, + { + "epoch": 1.0556786703601109, + "grad_norm": 0.026177067309617996, + "learning_rate": 9.771959403348926e-05, + "loss": 0.008809025399386883, + "num_input_tokens_seen": 62408936, + "step": 3811, + "train_runtime": 31406.313, + "train_tokens_per_second": 1987.146 + }, + { + "epoch": 1.05595567867036, + "grad_norm": 0.03967634215950966, + "learning_rate": 9.771828163596874e-05, + "loss": 0.010029998607933521, + "num_input_tokens_seen": 62425312, + "step": 3812, + "train_runtime": 31414.5258, + "train_tokens_per_second": 1987.148 + }, + { + "epoch": 1.0562326869806093, + "grad_norm": 0.06888782232999802, + "learning_rate": 9.771696886972564e-05, + "loss": 0.013888946734368801, + "num_input_tokens_seen": 62441688, + "step": 3813, + "train_runtime": 31422.7435, + "train_tokens_per_second": 1987.149 + }, + { + "epoch": 1.0565096952908588, + "grad_norm": 0.03313596546649933, + "learning_rate": 9.771565573477008e-05, + "loss": 0.010433310642838478, + "num_input_tokens_seen": 62458064, + "step": 3814, + "train_runtime": 31430.9675, + "train_tokens_per_second": 1987.151 + }, + { + "epoch": 1.056786703601108, + "grad_norm": 0.09022916853427887, + "learning_rate": 9.771434223111224e-05, + "loss": 0.014645840972661972, + "num_input_tokens_seen": 62474440, + "step": 3815, + "train_runtime": 31439.182, + "train_tokens_per_second": 1987.152 + }, + { + "epoch": 1.0570637119113573, + "grad_norm": 0.03852362558245659, + "learning_rate": 9.771302835876224e-05, + "loss": 0.011675200425088406, + "num_input_tokens_seen": 62490816, + "step": 3816, + "train_runtime": 31447.391, + "train_tokens_per_second": 1987.154 + }, + { + "epoch": 1.0573407202216067, + "grad_norm": 0.03366321325302124, + "learning_rate": 9.771171411773024e-05, + "loss": 0.011568721383810043, + "num_input_tokens_seen": 62507192, + "step": 3817, + "train_runtime": 31455.5951, + "train_tokens_per_second": 1987.157 + }, + { + "epoch": 1.057617728531856, + "grad_norm": 0.04268166050314903, + "learning_rate": 9.771039950802639e-05, + "loss": 0.012001033872365952, + "num_input_tokens_seen": 62523568, + "step": 3818, + "train_runtime": 31463.7992, + "train_tokens_per_second": 1987.159 + }, + { + "epoch": 1.0578947368421052, + "grad_norm": 0.06675996631383896, + "learning_rate": 9.770908452966083e-05, + "loss": 0.013070369139313698, + "num_input_tokens_seen": 62539944, + "step": 3819, + "train_runtime": 31472.0101, + "train_tokens_per_second": 1987.161 + }, + { + "epoch": 1.0581717451523547, + "grad_norm": 0.050722699612379074, + "learning_rate": 9.770776918264377e-05, + "loss": 0.010815337300300598, + "num_input_tokens_seen": 62556320, + "step": 3820, + "train_runtime": 31480.2276, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 1.058448753462604, + "grad_norm": 0.031914353370666504, + "learning_rate": 9.770645346698535e-05, + "loss": 0.00963571760803461, + "num_input_tokens_seen": 62572696, + "step": 3821, + "train_runtime": 31488.4646, + "train_tokens_per_second": 1987.162 + }, + { + "epoch": 1.0587257617728532, + "grad_norm": 0.05777985602617264, + "learning_rate": 9.770513738269573e-05, + "loss": 0.01211148127913475, + "num_input_tokens_seen": 62589072, + "step": 3822, + "train_runtime": 31496.701, + "train_tokens_per_second": 1987.163 + }, + { + "epoch": 1.0590027700831024, + "grad_norm": 0.0421951524913311, + "learning_rate": 9.77038209297851e-05, + "loss": 0.00925691332668066, + "num_input_tokens_seen": 62605448, + "step": 3823, + "train_runtime": 31504.9196, + "train_tokens_per_second": 1987.164 + }, + { + "epoch": 1.0592797783933519, + "grad_norm": 0.04974353313446045, + "learning_rate": 9.77025041082636e-05, + "loss": 0.01144323404878378, + "num_input_tokens_seen": 62621824, + "step": 3824, + "train_runtime": 31513.1273, + "train_tokens_per_second": 1987.166 + }, + { + "epoch": 1.059556786703601, + "grad_norm": 0.0444025993347168, + "learning_rate": 9.770118691814142e-05, + "loss": 0.014861421659588814, + "num_input_tokens_seen": 62638200, + "step": 3825, + "train_runtime": 31521.3409, + "train_tokens_per_second": 1987.168 + }, + { + "epoch": 1.0598337950138503, + "grad_norm": 0.04374004527926445, + "learning_rate": 9.769986935942874e-05, + "loss": 0.011330538429319859, + "num_input_tokens_seen": 62654576, + "step": 3826, + "train_runtime": 31529.5645, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 1.0601108033240998, + "grad_norm": 0.047110170125961304, + "learning_rate": 9.769855143213575e-05, + "loss": 0.009779145941138268, + "num_input_tokens_seen": 62670952, + "step": 3827, + "train_runtime": 31537.8072, + "train_tokens_per_second": 1987.169 + }, + { + "epoch": 1.060387811634349, + "grad_norm": 0.03762749955058098, + "learning_rate": 9.769723313627261e-05, + "loss": 0.012372635304927826, + "num_input_tokens_seen": 62687328, + "step": 3828, + "train_runtime": 31546.0343, + "train_tokens_per_second": 1987.17 + }, + { + "epoch": 1.0606648199445983, + "grad_norm": 0.04534434154629707, + "learning_rate": 9.769591447184954e-05, + "loss": 0.009565849788486958, + "num_input_tokens_seen": 62703704, + "step": 3829, + "train_runtime": 31554.2396, + "train_tokens_per_second": 1987.172 + }, + { + "epoch": 1.0609418282548477, + "grad_norm": 0.03922650218009949, + "learning_rate": 9.769459543887672e-05, + "loss": 0.009292518720030785, + "num_input_tokens_seen": 62720080, + "step": 3830, + "train_runtime": 31562.4667, + "train_tokens_per_second": 1987.173 + }, + { + "epoch": 1.061218836565097, + "grad_norm": 0.03571050614118576, + "learning_rate": 9.76932760373643e-05, + "loss": 0.011348854750394821, + "num_input_tokens_seen": 62736456, + "step": 3831, + "train_runtime": 31570.693, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 1.0614958448753462, + "grad_norm": 0.04949600622057915, + "learning_rate": 9.769195626732253e-05, + "loss": 0.010828062891960144, + "num_input_tokens_seen": 62752832, + "step": 3832, + "train_runtime": 31578.9281, + "train_tokens_per_second": 1987.174 + }, + { + "epoch": 1.0617728531855957, + "grad_norm": 0.04381706938147545, + "learning_rate": 9.769063612876157e-05, + "loss": 0.01084198895841837, + "num_input_tokens_seen": 62769208, + "step": 3833, + "train_runtime": 31587.1441, + "train_tokens_per_second": 1987.176 + }, + { + "epoch": 1.062049861495845, + "grad_norm": 0.06027359515428543, + "learning_rate": 9.768931562169166e-05, + "loss": 0.010325999930500984, + "num_input_tokens_seen": 62785584, + "step": 3834, + "train_runtime": 31595.3652, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 1.0623268698060941, + "grad_norm": 0.07169921696186066, + "learning_rate": 9.768799474612296e-05, + "loss": 0.009747426025569439, + "num_input_tokens_seen": 62801960, + "step": 3835, + "train_runtime": 31603.6012, + "train_tokens_per_second": 1987.177 + }, + { + "epoch": 1.0626038781163434, + "grad_norm": 0.053846824914216995, + "learning_rate": 9.768667350206571e-05, + "loss": 0.013729888945817947, + "num_input_tokens_seen": 62818336, + "step": 3836, + "train_runtime": 31611.821, + "train_tokens_per_second": 1987.179 + }, + { + "epoch": 1.0628808864265928, + "grad_norm": 0.07333219796419144, + "learning_rate": 9.768535188953009e-05, + "loss": 0.009528185240924358, + "num_input_tokens_seen": 62834712, + "step": 3837, + "train_runtime": 31620.0322, + "train_tokens_per_second": 1987.181 + }, + { + "epoch": 1.063157894736842, + "grad_norm": 0.055927034467458725, + "learning_rate": 9.768402990852635e-05, + "loss": 0.0094844289124012, + "num_input_tokens_seen": 62851088, + "step": 3838, + "train_runtime": 31628.254, + "train_tokens_per_second": 1987.182 + }, + { + "epoch": 1.0634349030470913, + "grad_norm": 0.06691182404756546, + "learning_rate": 9.768270755906466e-05, + "loss": 0.011597083881497383, + "num_input_tokens_seen": 62867464, + "step": 3839, + "train_runtime": 31636.4682, + "train_tokens_per_second": 1987.183 + }, + { + "epoch": 1.0637119113573408, + "grad_norm": 0.04631033539772034, + "learning_rate": 9.768138484115528e-05, + "loss": 0.01043386198580265, + "num_input_tokens_seen": 62883840, + "step": 3840, + "train_runtime": 31644.6748, + "train_tokens_per_second": 1987.186 + }, + { + "epoch": 1.06398891966759, + "grad_norm": 0.04103139787912369, + "learning_rate": 9.768006175480839e-05, + "loss": 0.011702612042427063, + "num_input_tokens_seen": 62900216, + "step": 3841, + "train_runtime": 31652.8855, + "train_tokens_per_second": 1987.187 + }, + { + "epoch": 1.0642659279778393, + "grad_norm": 0.048169657588005066, + "learning_rate": 9.767873830003425e-05, + "loss": 0.012123513035476208, + "num_input_tokens_seen": 62916592, + "step": 3842, + "train_runtime": 31661.1038, + "train_tokens_per_second": 1987.189 + }, + { + "epoch": 1.0645429362880887, + "grad_norm": 0.04943675175309181, + "learning_rate": 9.767741447684307e-05, + "loss": 0.010591447353363037, + "num_input_tokens_seen": 62932968, + "step": 3843, + "train_runtime": 31669.3299, + "train_tokens_per_second": 1987.19 + }, + { + "epoch": 1.064819944598338, + "grad_norm": 0.0785728171467781, + "learning_rate": 9.767609028524509e-05, + "loss": 0.00987450685352087, + "num_input_tokens_seen": 62949344, + "step": 3844, + "train_runtime": 31677.545, + "train_tokens_per_second": 1987.191 + }, + { + "epoch": 1.0650969529085872, + "grad_norm": 0.07251870632171631, + "learning_rate": 9.767476572525053e-05, + "loss": 0.009953147731721401, + "num_input_tokens_seen": 62965720, + "step": 3845, + "train_runtime": 31685.7657, + "train_tokens_per_second": 1987.193 + }, + { + "epoch": 1.0653739612188367, + "grad_norm": 0.07177263498306274, + "learning_rate": 9.767344079686963e-05, + "loss": 0.010782013647258282, + "num_input_tokens_seen": 62982096, + "step": 3846, + "train_runtime": 31693.9765, + "train_tokens_per_second": 1987.195 + }, + { + "epoch": 1.065650969529086, + "grad_norm": 0.06286007165908813, + "learning_rate": 9.767211550011262e-05, + "loss": 0.012742618098855019, + "num_input_tokens_seen": 62998472, + "step": 3847, + "train_runtime": 31702.1933, + "train_tokens_per_second": 1987.196 + }, + { + "epoch": 1.0659279778393351, + "grad_norm": 0.03198860213160515, + "learning_rate": 9.767078983498973e-05, + "loss": 0.0126805379986763, + "num_input_tokens_seen": 63014848, + "step": 3848, + "train_runtime": 31710.4096, + "train_tokens_per_second": 1987.198 + }, + { + "epoch": 1.0662049861495846, + "grad_norm": 0.04286915808916092, + "learning_rate": 9.766946380151125e-05, + "loss": 0.010690629482269287, + "num_input_tokens_seen": 63031224, + "step": 3849, + "train_runtime": 31718.629, + "train_tokens_per_second": 1987.199 + }, + { + "epoch": 1.0664819944598338, + "grad_norm": 0.038369517773389816, + "learning_rate": 9.766813739968739e-05, + "loss": 0.011491477489471436, + "num_input_tokens_seen": 63047600, + "step": 3850, + "train_runtime": 31726.8572, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 1.066759002770083, + "grad_norm": 0.057249944657087326, + "learning_rate": 9.766681062952841e-05, + "loss": 0.011435069143772125, + "num_input_tokens_seen": 63063976, + "step": 3851, + "train_runtime": 31735.0854, + "train_tokens_per_second": 1987.2 + }, + { + "epoch": 1.0670360110803325, + "grad_norm": 0.03448889032006264, + "learning_rate": 9.766548349104454e-05, + "loss": 0.010743675753474236, + "num_input_tokens_seen": 63080352, + "step": 3852, + "train_runtime": 31743.3139, + "train_tokens_per_second": 1987.201 + }, + { + "epoch": 1.0673130193905818, + "grad_norm": 0.03729412704706192, + "learning_rate": 9.766415598424607e-05, + "loss": 0.012580791488289833, + "num_input_tokens_seen": 63096728, + "step": 3853, + "train_runtime": 31751.5447, + "train_tokens_per_second": 1987.202 + }, + { + "epoch": 1.067590027700831, + "grad_norm": 0.04147167131304741, + "learning_rate": 9.766282810914323e-05, + "loss": 0.011534934863448143, + "num_input_tokens_seen": 63113104, + "step": 3854, + "train_runtime": 31759.7727, + "train_tokens_per_second": 1987.203 + }, + { + "epoch": 1.0678670360110802, + "grad_norm": 0.05814177170395851, + "learning_rate": 9.76614998657463e-05, + "loss": 0.011214611120522022, + "num_input_tokens_seen": 63129480, + "step": 3855, + "train_runtime": 31767.9947, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 1.0681440443213297, + "grad_norm": 0.09496350586414337, + "learning_rate": 9.766017125406552e-05, + "loss": 0.009861648082733154, + "num_input_tokens_seen": 63145856, + "step": 3856, + "train_runtime": 31776.2286, + "train_tokens_per_second": 1987.204 + }, + { + "epoch": 1.068421052631579, + "grad_norm": 0.03903531655669212, + "learning_rate": 9.76588422741112e-05, + "loss": 0.010182204656302929, + "num_input_tokens_seen": 63162232, + "step": 3857, + "train_runtime": 31784.456, + "train_tokens_per_second": 1987.205 + }, + { + "epoch": 1.0686980609418282, + "grad_norm": 0.06425604224205017, + "learning_rate": 9.765751292589356e-05, + "loss": 0.010507369413971901, + "num_input_tokens_seen": 63178608, + "step": 3858, + "train_runtime": 31792.6834, + "train_tokens_per_second": 1987.206 + }, + { + "epoch": 1.0689750692520776, + "grad_norm": 0.15594805777072906, + "learning_rate": 9.76561832094229e-05, + "loss": 0.011533216573297977, + "num_input_tokens_seen": 63194984, + "step": 3859, + "train_runtime": 31800.8917, + "train_tokens_per_second": 1987.208 + }, + { + "epoch": 1.0692520775623269, + "grad_norm": 0.035520751029253006, + "learning_rate": 9.765485312470946e-05, + "loss": 0.011485876515507698, + "num_input_tokens_seen": 63211360, + "step": 3860, + "train_runtime": 31809.105, + "train_tokens_per_second": 1987.21 + }, + { + "epoch": 1.0695290858725761, + "grad_norm": 0.04820150509476662, + "learning_rate": 9.765352267176357e-05, + "loss": 0.012007521465420723, + "num_input_tokens_seen": 63227736, + "step": 3861, + "train_runtime": 31817.3297, + "train_tokens_per_second": 1987.211 + }, + { + "epoch": 1.0698060941828256, + "grad_norm": 0.06896115839481354, + "learning_rate": 9.765219185059549e-05, + "loss": 0.011377292685210705, + "num_input_tokens_seen": 63244112, + "step": 3862, + "train_runtime": 31825.5445, + "train_tokens_per_second": 1987.212 + }, + { + "epoch": 1.0700831024930748, + "grad_norm": 0.06700178235769272, + "learning_rate": 9.765086066121547e-05, + "loss": 0.012104992754757404, + "num_input_tokens_seen": 63260488, + "step": 3863, + "train_runtime": 31833.7594, + "train_tokens_per_second": 1987.214 + }, + { + "epoch": 1.070360110803324, + "grad_norm": 0.04237519949674606, + "learning_rate": 9.764952910363386e-05, + "loss": 0.011029201559722424, + "num_input_tokens_seen": 63276864, + "step": 3864, + "train_runtime": 31841.9803, + "train_tokens_per_second": 1987.215 + }, + { + "epoch": 1.0706371191135735, + "grad_norm": 0.06617611646652222, + "learning_rate": 9.76481971778609e-05, + "loss": 0.011543436907231808, + "num_input_tokens_seen": 63293240, + "step": 3865, + "train_runtime": 31850.1926, + "train_tokens_per_second": 1987.217 + }, + { + "epoch": 1.0709141274238227, + "grad_norm": 0.08085663616657257, + "learning_rate": 9.764686488390689e-05, + "loss": 0.014464478939771652, + "num_input_tokens_seen": 63309616, + "step": 3866, + "train_runtime": 31858.4111, + "train_tokens_per_second": 1987.218 + }, + { + "epoch": 1.071191135734072, + "grad_norm": 0.09040825068950653, + "learning_rate": 9.764553222178213e-05, + "loss": 0.009780142456293106, + "num_input_tokens_seen": 63325992, + "step": 3867, + "train_runtime": 31866.6334, + "train_tokens_per_second": 1987.219 + }, + { + "epoch": 1.0714681440443212, + "grad_norm": 0.04832965135574341, + "learning_rate": 9.764419919149692e-05, + "loss": 0.010144880972802639, + "num_input_tokens_seen": 63342368, + "step": 3868, + "train_runtime": 31879.9723, + "train_tokens_per_second": 1986.902 + }, + { + "epoch": 1.0717451523545707, + "grad_norm": 0.05060167610645294, + "learning_rate": 9.764286579306155e-05, + "loss": 0.010272592306137085, + "num_input_tokens_seen": 63358744, + "step": 3869, + "train_runtime": 31888.2208, + "train_tokens_per_second": 1986.901 + }, + { + "epoch": 1.07202216066482, + "grad_norm": 0.03904157131910324, + "learning_rate": 9.764153202648634e-05, + "loss": 0.01111189741641283, + "num_input_tokens_seen": 63375120, + "step": 3870, + "train_runtime": 31896.4829, + "train_tokens_per_second": 1986.9 + }, + { + "epoch": 1.0722991689750692, + "grad_norm": 0.0691773071885109, + "learning_rate": 9.76401978917816e-05, + "loss": 0.009883713908493519, + "num_input_tokens_seen": 63391496, + "step": 3871, + "train_runtime": 31904.7079, + "train_tokens_per_second": 1986.901 + }, + { + "epoch": 1.0725761772853186, + "grad_norm": 0.084663987159729, + "learning_rate": 9.763886338895761e-05, + "loss": 0.011864541098475456, + "num_input_tokens_seen": 63407872, + "step": 3872, + "train_runtime": 31914.8007, + "train_tokens_per_second": 1986.786 + }, + { + "epoch": 1.0728531855955679, + "grad_norm": 0.04254892095923424, + "learning_rate": 9.76375285180247e-05, + "loss": 0.011586755514144897, + "num_input_tokens_seen": 63424248, + "step": 3873, + "train_runtime": 31924.0725, + "train_tokens_per_second": 1986.722 + }, + { + "epoch": 1.073130193905817, + "grad_norm": 0.06327681243419647, + "learning_rate": 9.763619327899318e-05, + "loss": 0.011475075036287308, + "num_input_tokens_seen": 63440624, + "step": 3874, + "train_runtime": 31939.8146, + "train_tokens_per_second": 1986.255 + }, + { + "epoch": 1.0734072022160666, + "grad_norm": 0.05412222072482109, + "learning_rate": 9.763485767187338e-05, + "loss": 0.009704059921205044, + "num_input_tokens_seen": 63457000, + "step": 3875, + "train_runtime": 31948.0548, + "train_tokens_per_second": 1986.256 + }, + { + "epoch": 1.0736842105263158, + "grad_norm": 0.04104563593864441, + "learning_rate": 9.763352169667561e-05, + "loss": 0.011450214311480522, + "num_input_tokens_seen": 63473376, + "step": 3876, + "train_runtime": 31956.6609, + "train_tokens_per_second": 1986.233 + }, + { + "epoch": 1.073961218836565, + "grad_norm": 0.04146042466163635, + "learning_rate": 9.76321853534102e-05, + "loss": 0.010541372932493687, + "num_input_tokens_seen": 63489752, + "step": 3877, + "train_runtime": 31964.9039, + "train_tokens_per_second": 1986.233 + }, + { + "epoch": 1.0742382271468145, + "grad_norm": 0.024387512356042862, + "learning_rate": 9.763084864208744e-05, + "loss": 0.010215009562671185, + "num_input_tokens_seen": 63506128, + "step": 3878, + "train_runtime": 31975.8388, + "train_tokens_per_second": 1986.066 + }, + { + "epoch": 1.0745152354570637, + "grad_norm": 0.03432224318385124, + "learning_rate": 9.762951156271772e-05, + "loss": 0.009528063237667084, + "num_input_tokens_seen": 63522504, + "step": 3879, + "train_runtime": 31984.0899, + "train_tokens_per_second": 1986.066 + }, + { + "epoch": 1.074792243767313, + "grad_norm": 0.05098291486501694, + "learning_rate": 9.762817411531133e-05, + "loss": 0.011346708983182907, + "num_input_tokens_seen": 63538880, + "step": 3880, + "train_runtime": 31992.3102, + "train_tokens_per_second": 1986.067 + }, + { + "epoch": 1.0750692520775624, + "grad_norm": 0.050853196531534195, + "learning_rate": 9.762683629987862e-05, + "loss": 0.009834840893745422, + "num_input_tokens_seen": 63555256, + "step": 3881, + "train_runtime": 32000.5359, + "train_tokens_per_second": 1986.068 + }, + { + "epoch": 1.0753462603878117, + "grad_norm": 0.048197176307439804, + "learning_rate": 9.762549811642991e-05, + "loss": 0.009986807592213154, + "num_input_tokens_seen": 63571632, + "step": 3882, + "train_runtime": 32010.1649, + "train_tokens_per_second": 1985.983 + }, + { + "epoch": 1.075623268698061, + "grad_norm": 0.036002520471811295, + "learning_rate": 9.762415956497557e-05, + "loss": 0.009914222173392773, + "num_input_tokens_seen": 63588008, + "step": 3883, + "train_runtime": 32018.4289, + "train_tokens_per_second": 1985.982 + }, + { + "epoch": 1.0759002770083101, + "grad_norm": 0.0560290701687336, + "learning_rate": 9.762282064552592e-05, + "loss": 0.011616087518632412, + "num_input_tokens_seen": 63604384, + "step": 3884, + "train_runtime": 32026.6424, + "train_tokens_per_second": 1985.984 + }, + { + "epoch": 1.0761772853185596, + "grad_norm": 0.09052721410989761, + "learning_rate": 9.76214813580913e-05, + "loss": 0.010977070778608322, + "num_input_tokens_seen": 63620760, + "step": 3885, + "train_runtime": 32034.8541, + "train_tokens_per_second": 1985.986 + }, + { + "epoch": 1.0764542936288088, + "grad_norm": 0.05518944934010506, + "learning_rate": 9.76201417026821e-05, + "loss": 0.009846169501543045, + "num_input_tokens_seen": 63637136, + "step": 3886, + "train_runtime": 32043.0687, + "train_tokens_per_second": 1985.988 + }, + { + "epoch": 1.076731301939058, + "grad_norm": 0.07793617248535156, + "learning_rate": 9.761880167930862e-05, + "loss": 0.011297058314085007, + "num_input_tokens_seen": 63653512, + "step": 3887, + "train_runtime": 32051.2833, + "train_tokens_per_second": 1985.989 + }, + { + "epoch": 1.0770083102493075, + "grad_norm": 0.03952991962432861, + "learning_rate": 9.761746128798124e-05, + "loss": 0.009759500622749329, + "num_input_tokens_seen": 63669888, + "step": 3888, + "train_runtime": 32059.49, + "train_tokens_per_second": 1985.992 + }, + { + "epoch": 1.0772853185595568, + "grad_norm": 0.0553169921040535, + "learning_rate": 9.761612052871031e-05, + "loss": 0.009453658014535904, + "num_input_tokens_seen": 63686264, + "step": 3889, + "train_runtime": 32067.7022, + "train_tokens_per_second": 1985.994 + }, + { + "epoch": 1.077562326869806, + "grad_norm": 0.049456171691417694, + "learning_rate": 9.76147794015062e-05, + "loss": 0.010982786305248737, + "num_input_tokens_seen": 63702640, + "step": 3890, + "train_runtime": 32075.9193, + "train_tokens_per_second": 1985.996 + }, + { + "epoch": 1.0778393351800555, + "grad_norm": 0.0442131869494915, + "learning_rate": 9.761343790637925e-05, + "loss": 0.011613182723522186, + "num_input_tokens_seen": 63719016, + "step": 3891, + "train_runtime": 32084.1548, + "train_tokens_per_second": 1985.996 + }, + { + "epoch": 1.0781163434903047, + "grad_norm": 0.058825936168432236, + "learning_rate": 9.761209604333988e-05, + "loss": 0.010905595496296883, + "num_input_tokens_seen": 63735392, + "step": 3892, + "train_runtime": 32092.3974, + "train_tokens_per_second": 1985.997 + }, + { + "epoch": 1.078393351800554, + "grad_norm": 0.03172140195965767, + "learning_rate": 9.761075381239839e-05, + "loss": 0.010524607263505459, + "num_input_tokens_seen": 63751768, + "step": 3893, + "train_runtime": 32100.6201, + "train_tokens_per_second": 1985.998 + }, + { + "epoch": 1.0786703601108034, + "grad_norm": 0.06452421098947525, + "learning_rate": 9.76094112135652e-05, + "loss": 0.012433002702891827, + "num_input_tokens_seen": 63768144, + "step": 3894, + "train_runtime": 32108.8408, + "train_tokens_per_second": 1986.0 + }, + { + "epoch": 1.0789473684210527, + "grad_norm": 0.05798755958676338, + "learning_rate": 9.760806824685065e-05, + "loss": 0.010006838478147984, + "num_input_tokens_seen": 63784520, + "step": 3895, + "train_runtime": 32117.0732, + "train_tokens_per_second": 1986.0 + }, + { + "epoch": 1.079224376731302, + "grad_norm": 0.06845668703317642, + "learning_rate": 9.760672491226514e-05, + "loss": 0.013823593966662884, + "num_input_tokens_seen": 63800896, + "step": 3896, + "train_runtime": 32125.3086, + "train_tokens_per_second": 1986.001 + }, + { + "epoch": 1.0795013850415511, + "grad_norm": 0.03614844009280205, + "learning_rate": 9.760538120981905e-05, + "loss": 0.0108848437666893, + "num_input_tokens_seen": 63817272, + "step": 3897, + "train_runtime": 32133.5304, + "train_tokens_per_second": 1986.003 + }, + { + "epoch": 1.0797783933518006, + "grad_norm": 0.07330393046140671, + "learning_rate": 9.760403713952275e-05, + "loss": 0.00978956650942564, + "num_input_tokens_seen": 63833648, + "step": 3898, + "train_runtime": 32141.756, + "train_tokens_per_second": 1986.004 + }, + { + "epoch": 1.0800554016620498, + "grad_norm": 0.059426017105579376, + "learning_rate": 9.760269270138664e-05, + "loss": 0.015146909281611443, + "num_input_tokens_seen": 63850024, + "step": 3899, + "train_runtime": 32149.9968, + "train_tokens_per_second": 1986.004 + }, + { + "epoch": 1.080332409972299, + "grad_norm": 0.056600965559482574, + "learning_rate": 9.760134789542112e-05, + "loss": 0.011519484221935272, + "num_input_tokens_seen": 63866400, + "step": 3900, + "train_runtime": 32158.2318, + "train_tokens_per_second": 1986.005 + }, + { + "epoch": 1.0806094182825485, + "grad_norm": 0.08221009373664856, + "learning_rate": 9.760000272163654e-05, + "loss": 0.011002538725733757, + "num_input_tokens_seen": 63882776, + "step": 3901, + "train_runtime": 32168.0148, + "train_tokens_per_second": 1985.91 + }, + { + "epoch": 1.0808864265927978, + "grad_norm": 0.06398721784353256, + "learning_rate": 9.759865718004332e-05, + "loss": 0.012484809383749962, + "num_input_tokens_seen": 63899152, + "step": 3902, + "train_runtime": 32176.2462, + "train_tokens_per_second": 1985.911 + }, + { + "epoch": 1.081163434903047, + "grad_norm": 0.0739598423242569, + "learning_rate": 9.759731127065185e-05, + "loss": 0.010267623700201511, + "num_input_tokens_seen": 63915528, + "step": 3903, + "train_runtime": 32184.4606, + "train_tokens_per_second": 1985.913 + }, + { + "epoch": 1.0814404432132965, + "grad_norm": 0.037815943360328674, + "learning_rate": 9.759596499347254e-05, + "loss": 0.011906595900654793, + "num_input_tokens_seen": 63931904, + "step": 3904, + "train_runtime": 32192.6774, + "train_tokens_per_second": 1985.914 + }, + { + "epoch": 1.0817174515235457, + "grad_norm": 0.033408474177122116, + "learning_rate": 9.75946183485158e-05, + "loss": 0.009389925748109818, + "num_input_tokens_seen": 63948280, + "step": 3905, + "train_runtime": 32200.8865, + "train_tokens_per_second": 1985.917 + }, + { + "epoch": 1.081994459833795, + "grad_norm": 0.07680845260620117, + "learning_rate": 9.759327133579201e-05, + "loss": 0.009318969212472439, + "num_input_tokens_seen": 63964656, + "step": 3906, + "train_runtime": 32209.0993, + "train_tokens_per_second": 1985.919 + }, + { + "epoch": 1.0822714681440444, + "grad_norm": 0.08003868162631989, + "learning_rate": 9.75919239553116e-05, + "loss": 0.013129858300089836, + "num_input_tokens_seen": 63981032, + "step": 3907, + "train_runtime": 32217.3358, + "train_tokens_per_second": 1985.919 + }, + { + "epoch": 1.0825484764542936, + "grad_norm": 0.07967372983694077, + "learning_rate": 9.759057620708495e-05, + "loss": 0.012650794349610806, + "num_input_tokens_seen": 63997408, + "step": 3908, + "train_runtime": 32225.5718, + "train_tokens_per_second": 1985.92 + }, + { + "epoch": 1.0828254847645429, + "grad_norm": 0.05211906507611275, + "learning_rate": 9.758922809112252e-05, + "loss": 0.011589614674448967, + "num_input_tokens_seen": 64013784, + "step": 3909, + "train_runtime": 32233.8056, + "train_tokens_per_second": 1985.921 + }, + { + "epoch": 1.0831024930747923, + "grad_norm": 0.038513585925102234, + "learning_rate": 9.758787960743471e-05, + "loss": 0.012146752327680588, + "num_input_tokens_seen": 64030160, + "step": 3910, + "train_runtime": 32242.0335, + "train_tokens_per_second": 1985.922 + }, + { + "epoch": 1.0833795013850416, + "grad_norm": 0.041664522141218185, + "learning_rate": 9.758653075603192e-05, + "loss": 0.009669951163232327, + "num_input_tokens_seen": 64046536, + "step": 3911, + "train_runtime": 32250.2557, + "train_tokens_per_second": 1985.923 + }, + { + "epoch": 1.0836565096952908, + "grad_norm": 0.08830210566520691, + "learning_rate": 9.758518153692458e-05, + "loss": 0.011248597875237465, + "num_input_tokens_seen": 64062912, + "step": 3912, + "train_runtime": 32258.4742, + "train_tokens_per_second": 1985.925 + }, + { + "epoch": 1.0839335180055403, + "grad_norm": 0.06756860017776489, + "learning_rate": 9.758383195012312e-05, + "loss": 0.015915801748633385, + "num_input_tokens_seen": 64079288, + "step": 3913, + "train_runtime": 32266.713, + "train_tokens_per_second": 1985.925 + }, + { + "epoch": 1.0842105263157895, + "grad_norm": 0.047008078545331955, + "learning_rate": 9.7582481995638e-05, + "loss": 0.009842833504080772, + "num_input_tokens_seen": 64095664, + "step": 3914, + "train_runtime": 32274.932, + "train_tokens_per_second": 1985.927 + }, + { + "epoch": 1.0844875346260388, + "grad_norm": 0.08491893112659454, + "learning_rate": 9.75811316734796e-05, + "loss": 0.013793270103633404, + "num_input_tokens_seen": 64112040, + "step": 3915, + "train_runtime": 32283.1447, + "train_tokens_per_second": 1985.929 + }, + { + "epoch": 1.084764542936288, + "grad_norm": 0.06696759164333344, + "learning_rate": 9.757978098365838e-05, + "loss": 0.010947792790830135, + "num_input_tokens_seen": 64128416, + "step": 3916, + "train_runtime": 32291.3538, + "train_tokens_per_second": 1985.931 + }, + { + "epoch": 1.0850415512465375, + "grad_norm": 0.0718206837773323, + "learning_rate": 9.757842992618476e-05, + "loss": 0.014809751883149147, + "num_input_tokens_seen": 64144792, + "step": 3917, + "train_runtime": 32299.5873, + "train_tokens_per_second": 1985.932 + }, + { + "epoch": 1.0853185595567867, + "grad_norm": 0.042879849672317505, + "learning_rate": 9.75770785010692e-05, + "loss": 0.0097948107868433, + "num_input_tokens_seen": 64161168, + "step": 3918, + "train_runtime": 32307.8189, + "train_tokens_per_second": 1985.933 + }, + { + "epoch": 1.085595567867036, + "grad_norm": 0.028894279152154922, + "learning_rate": 9.757572670832214e-05, + "loss": 0.010568168945610523, + "num_input_tokens_seen": 64177544, + "step": 3919, + "train_runtime": 32316.0345, + "train_tokens_per_second": 1985.935 + }, + { + "epoch": 1.0858725761772854, + "grad_norm": 0.10753391683101654, + "learning_rate": 9.757437454795402e-05, + "loss": 0.010730267502367496, + "num_input_tokens_seen": 64193920, + "step": 3920, + "train_runtime": 32324.2387, + "train_tokens_per_second": 1985.938 + }, + { + "epoch": 1.0861495844875346, + "grad_norm": 0.060874685645103455, + "learning_rate": 9.757302201997529e-05, + "loss": 0.009066342376172543, + "num_input_tokens_seen": 64210296, + "step": 3921, + "train_runtime": 32332.4542, + "train_tokens_per_second": 1985.939 + }, + { + "epoch": 1.0864265927977839, + "grad_norm": 0.06014513596892357, + "learning_rate": 9.75716691243964e-05, + "loss": 0.012714618816971779, + "num_input_tokens_seen": 64226672, + "step": 3922, + "train_runtime": 32340.67, + "train_tokens_per_second": 1985.941 + }, + { + "epoch": 1.0867036011080333, + "grad_norm": 0.055210039019584656, + "learning_rate": 9.757031586122777e-05, + "loss": 0.011078319512307644, + "num_input_tokens_seen": 64243048, + "step": 3923, + "train_runtime": 32348.9027, + "train_tokens_per_second": 1985.942 + }, + { + "epoch": 1.0869806094182826, + "grad_norm": 0.05078956484794617, + "learning_rate": 9.756896223047993e-05, + "loss": 0.00989421084523201, + "num_input_tokens_seen": 64259424, + "step": 3924, + "train_runtime": 32357.1401, + "train_tokens_per_second": 1985.943 + }, + { + "epoch": 1.0872576177285318, + "grad_norm": 0.04667045176029205, + "learning_rate": 9.75676082321633e-05, + "loss": 0.012321023270487785, + "num_input_tokens_seen": 64275800, + "step": 3925, + "train_runtime": 32365.3703, + "train_tokens_per_second": 1985.944 + }, + { + "epoch": 1.0875346260387813, + "grad_norm": 0.0673670768737793, + "learning_rate": 9.756625386628832e-05, + "loss": 0.011315872892737389, + "num_input_tokens_seen": 64292176, + "step": 3926, + "train_runtime": 32373.5967, + "train_tokens_per_second": 1985.945 + }, + { + "epoch": 1.0878116343490305, + "grad_norm": 0.052999112755060196, + "learning_rate": 9.756489913286548e-05, + "loss": 0.008933230303227901, + "num_input_tokens_seen": 64308552, + "step": 3927, + "train_runtime": 32381.8228, + "train_tokens_per_second": 1985.946 + }, + { + "epoch": 1.0880886426592797, + "grad_norm": 0.05974114313721657, + "learning_rate": 9.756354403190526e-05, + "loss": 0.0079395342618227, + "num_input_tokens_seen": 64324928, + "step": 3928, + "train_runtime": 32390.0568, + "train_tokens_per_second": 1985.947 + }, + { + "epoch": 1.088365650969529, + "grad_norm": 0.06333409249782562, + "learning_rate": 9.75621885634181e-05, + "loss": 0.011551974341273308, + "num_input_tokens_seen": 64341304, + "step": 3929, + "train_runtime": 32398.2915, + "train_tokens_per_second": 1985.947 + }, + { + "epoch": 1.0886426592797784, + "grad_norm": 0.0640920028090477, + "learning_rate": 9.756083272741451e-05, + "loss": 0.013180970214307308, + "num_input_tokens_seen": 64357680, + "step": 3930, + "train_runtime": 32406.5196, + "train_tokens_per_second": 1985.949 + }, + { + "epoch": 1.0889196675900277, + "grad_norm": 0.047560639679431915, + "learning_rate": 9.755947652390494e-05, + "loss": 0.010250380262732506, + "num_input_tokens_seen": 64374056, + "step": 3931, + "train_runtime": 32414.7595, + "train_tokens_per_second": 1985.949 + }, + { + "epoch": 1.089196675900277, + "grad_norm": 0.08192902058362961, + "learning_rate": 9.755811995289987e-05, + "loss": 0.01094529777765274, + "num_input_tokens_seen": 64390432, + "step": 3932, + "train_runtime": 32422.9873, + "train_tokens_per_second": 1985.95 + }, + { + "epoch": 1.0894736842105264, + "grad_norm": 0.047558967024087906, + "learning_rate": 9.755676301440979e-05, + "loss": 0.012842399999499321, + "num_input_tokens_seen": 64406808, + "step": 3933, + "train_runtime": 32431.2222, + "train_tokens_per_second": 1985.951 + }, + { + "epoch": 1.0897506925207756, + "grad_norm": 0.1860976368188858, + "learning_rate": 9.755540570844519e-05, + "loss": 0.013516038656234741, + "num_input_tokens_seen": 64423184, + "step": 3934, + "train_runtime": 32439.4564, + "train_tokens_per_second": 1985.951 + }, + { + "epoch": 1.0900277008310248, + "grad_norm": 0.05106062442064285, + "learning_rate": 9.755404803501654e-05, + "loss": 0.010687476955354214, + "num_input_tokens_seen": 64439560, + "step": 3935, + "train_runtime": 32447.6641, + "train_tokens_per_second": 1985.954 + }, + { + "epoch": 1.0903047091412743, + "grad_norm": 0.05101123824715614, + "learning_rate": 9.755268999413436e-05, + "loss": 0.00914676208049059, + "num_input_tokens_seen": 64455936, + "step": 3936, + "train_runtime": 32455.8749, + "train_tokens_per_second": 1985.956 + }, + { + "epoch": 1.0905817174515235, + "grad_norm": 0.03874104097485542, + "learning_rate": 9.755133158580912e-05, + "loss": 0.011186009272933006, + "num_input_tokens_seen": 64472312, + "step": 3937, + "train_runtime": 32464.0801, + "train_tokens_per_second": 1985.958 + }, + { + "epoch": 1.0908587257617728, + "grad_norm": 0.038825519382953644, + "learning_rate": 9.754997281005132e-05, + "loss": 0.0082972701638937, + "num_input_tokens_seen": 64488688, + "step": 3938, + "train_runtime": 32472.2848, + "train_tokens_per_second": 1985.961 + }, + { + "epoch": 1.0911357340720222, + "grad_norm": 0.03532016649842262, + "learning_rate": 9.754861366687145e-05, + "loss": 0.009606897830963135, + "num_input_tokens_seen": 64505064, + "step": 3939, + "train_runtime": 32480.4929, + "train_tokens_per_second": 1985.963 + }, + { + "epoch": 1.0914127423822715, + "grad_norm": 0.06442486494779587, + "learning_rate": 9.754725415628003e-05, + "loss": 0.009804212488234043, + "num_input_tokens_seen": 64521440, + "step": 3940, + "train_runtime": 32488.6945, + "train_tokens_per_second": 1985.966 + }, + { + "epoch": 1.0916897506925207, + "grad_norm": 0.05649049952626228, + "learning_rate": 9.754589427828757e-05, + "loss": 0.011266477406024933, + "num_input_tokens_seen": 64537816, + "step": 3941, + "train_runtime": 32496.9044, + "train_tokens_per_second": 1985.968 + }, + { + "epoch": 1.0919667590027702, + "grad_norm": 0.030758557841181755, + "learning_rate": 9.754453403290457e-05, + "loss": 0.011244909837841988, + "num_input_tokens_seen": 64554192, + "step": 3942, + "train_runtime": 32505.1102, + "train_tokens_per_second": 1985.971 + }, + { + "epoch": 1.0922437673130194, + "grad_norm": 0.08550435304641724, + "learning_rate": 9.754317342014153e-05, + "loss": 0.013735475018620491, + "num_input_tokens_seen": 64570568, + "step": 3943, + "train_runtime": 32513.3325, + "train_tokens_per_second": 1985.972 + }, + { + "epoch": 1.0925207756232687, + "grad_norm": 0.0556330680847168, + "learning_rate": 9.754181244000898e-05, + "loss": 0.009950787760317326, + "num_input_tokens_seen": 64586944, + "step": 3944, + "train_runtime": 32521.5542, + "train_tokens_per_second": 1985.973 + }, + { + "epoch": 1.0927977839335181, + "grad_norm": 0.026025669649243355, + "learning_rate": 9.754045109251742e-05, + "loss": 0.008202224969863892, + "num_input_tokens_seen": 64603320, + "step": 3945, + "train_runtime": 32529.762, + "train_tokens_per_second": 1985.976 + }, + { + "epoch": 1.0930747922437674, + "grad_norm": 0.05075647309422493, + "learning_rate": 9.753908937767737e-05, + "loss": 0.012160813435912132, + "num_input_tokens_seen": 64619696, + "step": 3946, + "train_runtime": 32537.9685, + "train_tokens_per_second": 1985.978 + }, + { + "epoch": 1.0933518005540166, + "grad_norm": 0.03548528254032135, + "learning_rate": 9.753772729549938e-05, + "loss": 0.010579400695860386, + "num_input_tokens_seen": 64636072, + "step": 3947, + "train_runtime": 32546.1766, + "train_tokens_per_second": 1985.98 + }, + { + "epoch": 1.0936288088642658, + "grad_norm": 0.06287726014852524, + "learning_rate": 9.753636484599393e-05, + "loss": 0.011452563107013702, + "num_input_tokens_seen": 64652448, + "step": 3948, + "train_runtime": 32554.3836, + "train_tokens_per_second": 1985.983 + }, + { + "epoch": 1.0939058171745153, + "grad_norm": 0.06502392888069153, + "learning_rate": 9.753500202917158e-05, + "loss": 0.011983844451606274, + "num_input_tokens_seen": 64668824, + "step": 3949, + "train_runtime": 32562.5921, + "train_tokens_per_second": 1985.985 + }, + { + "epoch": 1.0941828254847645, + "grad_norm": 0.05855856090784073, + "learning_rate": 9.753363884504287e-05, + "loss": 0.012103581801056862, + "num_input_tokens_seen": 64685200, + "step": 3950, + "train_runtime": 32570.8094, + "train_tokens_per_second": 1985.987 + }, + { + "epoch": 1.0944598337950138, + "grad_norm": 0.06585437804460526, + "learning_rate": 9.75322752936183e-05, + "loss": 0.008522842079401016, + "num_input_tokens_seen": 64701576, + "step": 3951, + "train_runtime": 32579.0547, + "train_tokens_per_second": 1985.987 + }, + { + "epoch": 1.0947368421052632, + "grad_norm": 0.05132242292165756, + "learning_rate": 9.753091137490841e-05, + "loss": 0.009853649884462357, + "num_input_tokens_seen": 64717952, + "step": 3952, + "train_runtime": 32587.2672, + "train_tokens_per_second": 1985.989 + }, + { + "epoch": 1.0950138504155125, + "grad_norm": 0.031111976131796837, + "learning_rate": 9.752954708892377e-05, + "loss": 0.009096575900912285, + "num_input_tokens_seen": 64734328, + "step": 3953, + "train_runtime": 32595.486, + "train_tokens_per_second": 1985.991 + }, + { + "epoch": 1.0952908587257617, + "grad_norm": 0.07105430960655212, + "learning_rate": 9.75281824356749e-05, + "loss": 0.012475431896746159, + "num_input_tokens_seen": 64750704, + "step": 3954, + "train_runtime": 32603.6942, + "train_tokens_per_second": 1985.993 + }, + { + "epoch": 1.0955678670360112, + "grad_norm": 0.05959070846438408, + "learning_rate": 9.752681741517236e-05, + "loss": 0.011475129052996635, + "num_input_tokens_seen": 64767080, + "step": 3955, + "train_runtime": 32611.9064, + "train_tokens_per_second": 1985.995 + }, + { + "epoch": 1.0958448753462604, + "grad_norm": 0.07649648189544678, + "learning_rate": 9.752545202742666e-05, + "loss": 0.010325485840439796, + "num_input_tokens_seen": 64783456, + "step": 3956, + "train_runtime": 32620.1166, + "train_tokens_per_second": 1985.997 + }, + { + "epoch": 1.0961218836565096, + "grad_norm": 0.057398464530706406, + "learning_rate": 9.75240862724484e-05, + "loss": 0.009820683859288692, + "num_input_tokens_seen": 64799832, + "step": 3957, + "train_runtime": 32628.3172, + "train_tokens_per_second": 1986.0 + }, + { + "epoch": 1.096398891966759, + "grad_norm": 0.03188711404800415, + "learning_rate": 9.75227201502481e-05, + "loss": 0.010472026653587818, + "num_input_tokens_seen": 64816208, + "step": 3958, + "train_runtime": 32636.5248, + "train_tokens_per_second": 1986.002 + }, + { + "epoch": 1.0966759002770083, + "grad_norm": 0.05913674458861351, + "learning_rate": 9.752135366083632e-05, + "loss": 0.013791779987514019, + "num_input_tokens_seen": 64832584, + "step": 3959, + "train_runtime": 32644.7333, + "train_tokens_per_second": 1986.004 + }, + { + "epoch": 1.0969529085872576, + "grad_norm": 0.07679981738328934, + "learning_rate": 9.751998680422363e-05, + "loss": 0.01191805861890316, + "num_input_tokens_seen": 64848960, + "step": 3960, + "train_runtime": 32652.9535, + "train_tokens_per_second": 1986.006 + }, + { + "epoch": 1.0972299168975068, + "grad_norm": 0.08013710379600525, + "learning_rate": 9.751861958042059e-05, + "loss": 0.013473497703671455, + "num_input_tokens_seen": 64865336, + "step": 3961, + "train_runtime": 32661.1818, + "train_tokens_per_second": 1986.007 + }, + { + "epoch": 1.0975069252077563, + "grad_norm": 0.03650819882750511, + "learning_rate": 9.751725198943775e-05, + "loss": 0.008501586504280567, + "num_input_tokens_seen": 64881712, + "step": 3962, + "train_runtime": 32669.4095, + "train_tokens_per_second": 1986.008 + }, + { + "epoch": 1.0977839335180055, + "grad_norm": 0.07991637289524078, + "learning_rate": 9.751588403128569e-05, + "loss": 0.014258540235459805, + "num_input_tokens_seen": 64898088, + "step": 3963, + "train_runtime": 32677.6345, + "train_tokens_per_second": 1986.009 + }, + { + "epoch": 1.0980609418282548, + "grad_norm": 0.05771110579371452, + "learning_rate": 9.7514515705975e-05, + "loss": 0.013020124286413193, + "num_input_tokens_seen": 64914464, + "step": 3964, + "train_runtime": 32685.8544, + "train_tokens_per_second": 1986.011 + }, + { + "epoch": 1.0983379501385042, + "grad_norm": 0.06264960765838623, + "learning_rate": 9.751314701351621e-05, + "loss": 0.01000777818262577, + "num_input_tokens_seen": 64930840, + "step": 3965, + "train_runtime": 32694.0862, + "train_tokens_per_second": 1986.012 + }, + { + "epoch": 1.0986149584487535, + "grad_norm": 0.07297331094741821, + "learning_rate": 9.751177795391992e-05, + "loss": 0.011762741021811962, + "num_input_tokens_seen": 64947216, + "step": 3966, + "train_runtime": 32702.3193, + "train_tokens_per_second": 1986.013 + }, + { + "epoch": 1.0988919667590027, + "grad_norm": 0.06484247744083405, + "learning_rate": 9.751040852719671e-05, + "loss": 0.011443344876170158, + "num_input_tokens_seen": 64963592, + "step": 3967, + "train_runtime": 32710.5553, + "train_tokens_per_second": 1986.013 + }, + { + "epoch": 1.0991689750692522, + "grad_norm": 0.05539792776107788, + "learning_rate": 9.750903873335716e-05, + "loss": 0.011187000200152397, + "num_input_tokens_seen": 64979968, + "step": 3968, + "train_runtime": 32718.768, + "train_tokens_per_second": 1986.015 + }, + { + "epoch": 1.0994459833795014, + "grad_norm": 0.11963348090648651, + "learning_rate": 9.750766857241184e-05, + "loss": 0.011555004864931107, + "num_input_tokens_seen": 64996344, + "step": 3969, + "train_runtime": 32726.9716, + "train_tokens_per_second": 1986.018 + }, + { + "epoch": 1.0997229916897506, + "grad_norm": 0.03107762150466442, + "learning_rate": 9.750629804437137e-05, + "loss": 0.00999031774699688, + "num_input_tokens_seen": 65012720, + "step": 3970, + "train_runtime": 32735.1855, + "train_tokens_per_second": 1986.02 + }, + { + "epoch": 1.1, + "grad_norm": 0.05008799582719803, + "learning_rate": 9.750492714924631e-05, + "loss": 0.012455718591809273, + "num_input_tokens_seen": 65029096, + "step": 3971, + "train_runtime": 32743.3965, + "train_tokens_per_second": 1986.022 + }, + { + "epoch": 1.1002770083102493, + "grad_norm": 0.04417363181710243, + "learning_rate": 9.750355588704727e-05, + "loss": 0.009494811296463013, + "num_input_tokens_seen": 65045472, + "step": 3972, + "train_runtime": 32751.607, + "train_tokens_per_second": 1986.024 + }, + { + "epoch": 1.1005540166204986, + "grad_norm": 0.0921197235584259, + "learning_rate": 9.750218425778484e-05, + "loss": 0.014285549521446228, + "num_input_tokens_seen": 65061848, + "step": 3973, + "train_runtime": 32759.8188, + "train_tokens_per_second": 1986.026 + }, + { + "epoch": 1.100831024930748, + "grad_norm": 0.039303846657276154, + "learning_rate": 9.75008122614696e-05, + "loss": 0.009734856896102428, + "num_input_tokens_seen": 65078224, + "step": 3974, + "train_runtime": 32768.0277, + "train_tokens_per_second": 1986.028 + }, + { + "epoch": 1.1011080332409973, + "grad_norm": 0.0808226615190506, + "learning_rate": 9.749943989811218e-05, + "loss": 0.012161336839199066, + "num_input_tokens_seen": 65094600, + "step": 3975, + "train_runtime": 32776.238, + "train_tokens_per_second": 1986.03 + }, + { + "epoch": 1.1013850415512465, + "grad_norm": 0.07028472423553467, + "learning_rate": 9.749806716772319e-05, + "loss": 0.01156681589782238, + "num_input_tokens_seen": 65110976, + "step": 3976, + "train_runtime": 32784.4729, + "train_tokens_per_second": 1986.031 + }, + { + "epoch": 1.101662049861496, + "grad_norm": 0.03685639798641205, + "learning_rate": 9.749669407031322e-05, + "loss": 0.009174413979053497, + "num_input_tokens_seen": 65127352, + "step": 3977, + "train_runtime": 32792.7121, + "train_tokens_per_second": 1986.031 + }, + { + "epoch": 1.1019390581717452, + "grad_norm": 0.04553087428212166, + "learning_rate": 9.749532060589285e-05, + "loss": 0.011422062292695045, + "num_input_tokens_seen": 65143728, + "step": 3978, + "train_runtime": 32800.9323, + "train_tokens_per_second": 1986.033 + }, + { + "epoch": 1.1022160664819944, + "grad_norm": 0.03748774528503418, + "learning_rate": 9.749394677447274e-05, + "loss": 0.009966040030121803, + "num_input_tokens_seen": 65160104, + "step": 3979, + "train_runtime": 32809.1669, + "train_tokens_per_second": 1986.033 + }, + { + "epoch": 1.1024930747922437, + "grad_norm": 0.06512326747179031, + "learning_rate": 9.74925725760635e-05, + "loss": 0.012334256432950497, + "num_input_tokens_seen": 65176480, + "step": 3980, + "train_runtime": 32817.3889, + "train_tokens_per_second": 1986.035 + }, + { + "epoch": 1.1027700831024931, + "grad_norm": 0.0650954619050026, + "learning_rate": 9.749119801067572e-05, + "loss": 0.01050098892301321, + "num_input_tokens_seen": 65192856, + "step": 3981, + "train_runtime": 32825.6136, + "train_tokens_per_second": 1986.036 + }, + { + "epoch": 1.1030470914127424, + "grad_norm": 0.043077558279037476, + "learning_rate": 9.748982307832005e-05, + "loss": 0.010049271397292614, + "num_input_tokens_seen": 65209232, + "step": 3982, + "train_runtime": 32833.84, + "train_tokens_per_second": 1986.037 + }, + { + "epoch": 1.1033240997229916, + "grad_norm": 0.10845879465341568, + "learning_rate": 9.748844777900711e-05, + "loss": 0.011600939556956291, + "num_input_tokens_seen": 65225608, + "step": 3983, + "train_runtime": 32842.0696, + "train_tokens_per_second": 1986.038 + }, + { + "epoch": 1.103601108033241, + "grad_norm": 0.0666482001543045, + "learning_rate": 9.74870721127475e-05, + "loss": 0.011724908836185932, + "num_input_tokens_seen": 65241984, + "step": 3984, + "train_runtime": 32850.2947, + "train_tokens_per_second": 1986.04 + }, + { + "epoch": 1.1038781163434903, + "grad_norm": 0.048221491277217865, + "learning_rate": 9.748569607955189e-05, + "loss": 0.010609282180666924, + "num_input_tokens_seen": 65258360, + "step": 3985, + "train_runtime": 32858.5811, + "train_tokens_per_second": 1986.037 + }, + { + "epoch": 1.1041551246537396, + "grad_norm": 0.0323600098490715, + "learning_rate": 9.74843196794309e-05, + "loss": 0.009555723518133163, + "num_input_tokens_seen": 65274736, + "step": 3986, + "train_runtime": 32866.8222, + "train_tokens_per_second": 1986.037 + }, + { + "epoch": 1.104432132963989, + "grad_norm": 0.028697334229946136, + "learning_rate": 9.748294291239512e-05, + "loss": 0.00956262368708849, + "num_input_tokens_seen": 65291112, + "step": 3987, + "train_runtime": 32875.0403, + "train_tokens_per_second": 1986.039 + }, + { + "epoch": 1.1047091412742382, + "grad_norm": 0.04093416407704353, + "learning_rate": 9.748156577845527e-05, + "loss": 0.0105076152831316, + "num_input_tokens_seen": 65307488, + "step": 3988, + "train_runtime": 32883.2623, + "train_tokens_per_second": 1986.04 + }, + { + "epoch": 1.1049861495844875, + "grad_norm": 0.06901216506958008, + "learning_rate": 9.748018827762194e-05, + "loss": 0.011186670511960983, + "num_input_tokens_seen": 65323864, + "step": 3989, + "train_runtime": 32891.4876, + "train_tokens_per_second": 1986.042 + }, + { + "epoch": 1.1052631578947367, + "grad_norm": 0.08178779482841492, + "learning_rate": 9.747881040990576e-05, + "loss": 0.012268928810954094, + "num_input_tokens_seen": 65340240, + "step": 3990, + "train_runtime": 32899.7002, + "train_tokens_per_second": 1986.044 + }, + { + "epoch": 1.1055401662049862, + "grad_norm": 0.06610453128814697, + "learning_rate": 9.747743217531741e-05, + "loss": 0.015912320464849472, + "num_input_tokens_seen": 65356616, + "step": 3991, + "train_runtime": 32907.906, + "train_tokens_per_second": 1986.046 + }, + { + "epoch": 1.1058171745152354, + "grad_norm": 0.061390627175569534, + "learning_rate": 9.747605357386754e-05, + "loss": 0.01464213989675045, + "num_input_tokens_seen": 65372992, + "step": 3992, + "train_runtime": 32916.1357, + "train_tokens_per_second": 1986.047 + }, + { + "epoch": 1.1060941828254847, + "grad_norm": 0.0840650275349617, + "learning_rate": 9.747467460556678e-05, + "loss": 0.013576631434261799, + "num_input_tokens_seen": 65389368, + "step": 3993, + "train_runtime": 32924.3577, + "train_tokens_per_second": 1986.048 + }, + { + "epoch": 1.1063711911357341, + "grad_norm": 0.040830716490745544, + "learning_rate": 9.747329527042579e-05, + "loss": 0.01133173517882824, + "num_input_tokens_seen": 65405744, + "step": 3994, + "train_runtime": 32932.584, + "train_tokens_per_second": 1986.05 + }, + { + "epoch": 1.1066481994459834, + "grad_norm": 0.0484037809073925, + "learning_rate": 9.747191556845525e-05, + "loss": 0.0068367901258170605, + "num_input_tokens_seen": 65422120, + "step": 3995, + "train_runtime": 32940.7986, + "train_tokens_per_second": 1986.051 + }, + { + "epoch": 1.1069252077562326, + "grad_norm": 0.04795311018824577, + "learning_rate": 9.747053549966581e-05, + "loss": 0.012635704129934311, + "num_input_tokens_seen": 65438496, + "step": 3996, + "train_runtime": 32949.0222, + "train_tokens_per_second": 1986.053 + }, + { + "epoch": 1.107202216066482, + "grad_norm": 0.04021509364247322, + "learning_rate": 9.746915506406811e-05, + "loss": 0.012350187636911869, + "num_input_tokens_seen": 65454872, + "step": 3997, + "train_runtime": 32957.2543, + "train_tokens_per_second": 1986.054 + }, + { + "epoch": 1.1074792243767313, + "grad_norm": 0.03398369252681732, + "learning_rate": 9.746777426167285e-05, + "loss": 0.01073765940964222, + "num_input_tokens_seen": 65471248, + "step": 3998, + "train_runtime": 32965.4774, + "train_tokens_per_second": 1986.055 + }, + { + "epoch": 1.1077562326869805, + "grad_norm": 0.03685345500707626, + "learning_rate": 9.746639309249068e-05, + "loss": 0.011731450445950031, + "num_input_tokens_seen": 65487624, + "step": 3999, + "train_runtime": 32973.7025, + "train_tokens_per_second": 1986.056 + }, + { + "epoch": 1.10803324099723, + "grad_norm": 0.0552346408367157, + "learning_rate": 9.746501155653229e-05, + "loss": 0.012177822180092335, + "num_input_tokens_seen": 65504000, + "step": 4000, + "train_runtime": 32981.9223, + "train_tokens_per_second": 1986.058 + }, + { + "epoch": 1.1083102493074792, + "grad_norm": 0.049838319420814514, + "learning_rate": 9.746362965380834e-05, + "loss": 0.01073554065078497, + "num_input_tokens_seen": 65520376, + "step": 4001, + "train_runtime": 32991.7244, + "train_tokens_per_second": 1985.964 + }, + { + "epoch": 1.1085872576177285, + "grad_norm": 0.05608271807432175, + "learning_rate": 9.74622473843295e-05, + "loss": 0.013869187794625759, + "num_input_tokens_seen": 65536752, + "step": 4002, + "train_runtime": 32999.9545, + "train_tokens_per_second": 1985.965 + }, + { + "epoch": 1.108864265927978, + "grad_norm": 0.04227830842137337, + "learning_rate": 9.746086474810649e-05, + "loss": 0.011273231357336044, + "num_input_tokens_seen": 65553128, + "step": 4003, + "train_runtime": 33008.182, + "train_tokens_per_second": 1985.966 + }, + { + "epoch": 1.1091412742382272, + "grad_norm": 0.06623886525630951, + "learning_rate": 9.745948174514993e-05, + "loss": 0.012609990313649178, + "num_input_tokens_seen": 65569504, + "step": 4004, + "train_runtime": 33016.4113, + "train_tokens_per_second": 1985.967 + }, + { + "epoch": 1.1094182825484764, + "grad_norm": 0.057367898523807526, + "learning_rate": 9.745809837547057e-05, + "loss": 0.011776949279010296, + "num_input_tokens_seen": 65585880, + "step": 4005, + "train_runtime": 33024.6159, + "train_tokens_per_second": 1985.97 + }, + { + "epoch": 1.1096952908587259, + "grad_norm": 0.051905881613492966, + "learning_rate": 9.745671463907906e-05, + "loss": 0.011016003787517548, + "num_input_tokens_seen": 65602256, + "step": 4006, + "train_runtime": 33032.821, + "train_tokens_per_second": 1985.972 + }, + { + "epoch": 1.109972299168975, + "grad_norm": 0.04372883215546608, + "learning_rate": 9.745533053598612e-05, + "loss": 0.011578713543713093, + "num_input_tokens_seen": 65618632, + "step": 4007, + "train_runtime": 33041.0436, + "train_tokens_per_second": 1985.973 + }, + { + "epoch": 1.1102493074792243, + "grad_norm": 0.0642542615532875, + "learning_rate": 9.745394606620242e-05, + "loss": 0.010820561088621616, + "num_input_tokens_seen": 65635008, + "step": 4008, + "train_runtime": 33049.2648, + "train_tokens_per_second": 1985.975 + }, + { + "epoch": 1.1105263157894736, + "grad_norm": 0.06480871886014938, + "learning_rate": 9.745256122973867e-05, + "loss": 0.010889872908592224, + "num_input_tokens_seen": 65651384, + "step": 4009, + "train_runtime": 33057.4891, + "train_tokens_per_second": 1985.976 + }, + { + "epoch": 1.110803324099723, + "grad_norm": 0.05971669778227806, + "learning_rate": 9.745117602660556e-05, + "loss": 0.009653652086853981, + "num_input_tokens_seen": 65667760, + "step": 4010, + "train_runtime": 33065.7158, + "train_tokens_per_second": 1985.977 + }, + { + "epoch": 1.1110803324099723, + "grad_norm": 0.044014494866132736, + "learning_rate": 9.744979045681382e-05, + "loss": 0.009702913463115692, + "num_input_tokens_seen": 65684136, + "step": 4011, + "train_runtime": 33073.9537, + "train_tokens_per_second": 1985.978 + }, + { + "epoch": 1.1113573407202215, + "grad_norm": 0.06325758993625641, + "learning_rate": 9.744840452037411e-05, + "loss": 0.011276943609118462, + "num_input_tokens_seen": 65700512, + "step": 4012, + "train_runtime": 33082.1771, + "train_tokens_per_second": 1985.979 + }, + { + "epoch": 1.111634349030471, + "grad_norm": 0.06472689658403397, + "learning_rate": 9.744701821729721e-05, + "loss": 0.01094317901879549, + "num_input_tokens_seen": 65716888, + "step": 4013, + "train_runtime": 33090.391, + "train_tokens_per_second": 1985.981 + }, + { + "epoch": 1.1119113573407202, + "grad_norm": 0.035991113632917404, + "learning_rate": 9.744563154759375e-05, + "loss": 0.011273406445980072, + "num_input_tokens_seen": 65733264, + "step": 4014, + "train_runtime": 33098.5981, + "train_tokens_per_second": 1985.983 + }, + { + "epoch": 1.1121883656509695, + "grad_norm": 0.07049426436424255, + "learning_rate": 9.744424451127449e-05, + "loss": 0.011366420425474644, + "num_input_tokens_seen": 65749640, + "step": 4015, + "train_runtime": 33106.807, + "train_tokens_per_second": 1985.986 + }, + { + "epoch": 1.112465373961219, + "grad_norm": 0.4823474884033203, + "learning_rate": 9.744285710835016e-05, + "loss": 0.011052596382796764, + "num_input_tokens_seen": 65766016, + "step": 4016, + "train_runtime": 33115.02, + "train_tokens_per_second": 1985.988 + }, + { + "epoch": 1.1127423822714682, + "grad_norm": 0.04562072828412056, + "learning_rate": 9.744146933883146e-05, + "loss": 0.011495563201606274, + "num_input_tokens_seen": 65782392, + "step": 4017, + "train_runtime": 33123.2439, + "train_tokens_per_second": 1985.989 + }, + { + "epoch": 1.1130193905817174, + "grad_norm": 0.05590519309043884, + "learning_rate": 9.744008120272911e-05, + "loss": 0.010949376970529556, + "num_input_tokens_seen": 65798768, + "step": 4018, + "train_runtime": 33131.4706, + "train_tokens_per_second": 1985.99 + }, + { + "epoch": 1.1132963988919669, + "grad_norm": 0.061936523765325546, + "learning_rate": 9.743869270005384e-05, + "loss": 0.01151655986905098, + "num_input_tokens_seen": 65815144, + "step": 4019, + "train_runtime": 33139.7065, + "train_tokens_per_second": 1985.991 + }, + { + "epoch": 1.113573407202216, + "grad_norm": 0.040510617196559906, + "learning_rate": 9.743730383081639e-05, + "loss": 0.010523208416998386, + "num_input_tokens_seen": 65831520, + "step": 4020, + "train_runtime": 33147.9422, + "train_tokens_per_second": 1985.991 + }, + { + "epoch": 1.1138504155124653, + "grad_norm": 0.04722712188959122, + "learning_rate": 9.743591459502748e-05, + "loss": 0.010383661836385727, + "num_input_tokens_seen": 65847896, + "step": 4021, + "train_runtime": 33156.173, + "train_tokens_per_second": 1985.992 + }, + { + "epoch": 1.1141274238227146, + "grad_norm": 0.03956179693341255, + "learning_rate": 9.743452499269785e-05, + "loss": 0.010341591201722622, + "num_input_tokens_seen": 65864272, + "step": 4022, + "train_runtime": 33164.3835, + "train_tokens_per_second": 1985.994 + }, + { + "epoch": 1.114404432132964, + "grad_norm": 0.04550495743751526, + "learning_rate": 9.743313502383823e-05, + "loss": 0.010196794755756855, + "num_input_tokens_seen": 65880648, + "step": 4023, + "train_runtime": 33172.5934, + "train_tokens_per_second": 1985.996 + }, + { + "epoch": 1.1146814404432133, + "grad_norm": 0.06681609898805618, + "learning_rate": 9.743174468845937e-05, + "loss": 0.010472347028553486, + "num_input_tokens_seen": 65897024, + "step": 4024, + "train_runtime": 33180.84, + "train_tokens_per_second": 1985.996 + }, + { + "epoch": 1.1149584487534625, + "grad_norm": 0.06505830585956573, + "learning_rate": 9.743035398657201e-05, + "loss": 0.0107891159132123, + "num_input_tokens_seen": 65913400, + "step": 4025, + "train_runtime": 33189.0837, + "train_tokens_per_second": 1985.996 + }, + { + "epoch": 1.115235457063712, + "grad_norm": 0.04183874651789665, + "learning_rate": 9.74289629181869e-05, + "loss": 0.009968935512006283, + "num_input_tokens_seen": 65929776, + "step": 4026, + "train_runtime": 33197.3132, + "train_tokens_per_second": 1985.997 + }, + { + "epoch": 1.1155124653739612, + "grad_norm": 0.04919213056564331, + "learning_rate": 9.742757148331479e-05, + "loss": 0.00882916059345007, + "num_input_tokens_seen": 65946152, + "step": 4027, + "train_runtime": 33205.5419, + "train_tokens_per_second": 1985.998 + }, + { + "epoch": 1.1157894736842104, + "grad_norm": 0.029691744595766068, + "learning_rate": 9.742617968196641e-05, + "loss": 0.008594071492552757, + "num_input_tokens_seen": 65962528, + "step": 4028, + "train_runtime": 33213.7678, + "train_tokens_per_second": 1986.0 + }, + { + "epoch": 1.11606648199446, + "grad_norm": 0.0333486869931221, + "learning_rate": 9.742478751415253e-05, + "loss": 0.008881728164851665, + "num_input_tokens_seen": 65978904, + "step": 4029, + "train_runtime": 33221.9817, + "train_tokens_per_second": 1986.001 + }, + { + "epoch": 1.1163434903047091, + "grad_norm": 0.04225469380617142, + "learning_rate": 9.74233949798839e-05, + "loss": 0.011871159076690674, + "num_input_tokens_seen": 65995280, + "step": 4030, + "train_runtime": 33230.1921, + "train_tokens_per_second": 1986.004 + }, + { + "epoch": 1.1166204986149584, + "grad_norm": 0.04473339021205902, + "learning_rate": 9.742200207917131e-05, + "loss": 0.012608485296368599, + "num_input_tokens_seen": 66011656, + "step": 4031, + "train_runtime": 33238.3967, + "train_tokens_per_second": 1986.006 + }, + { + "epoch": 1.1168975069252078, + "grad_norm": 0.06547077000141144, + "learning_rate": 9.74206088120255e-05, + "loss": 0.010165911167860031, + "num_input_tokens_seen": 66028032, + "step": 4032, + "train_runtime": 33246.5971, + "train_tokens_per_second": 1986.009 + }, + { + "epoch": 1.117174515235457, + "grad_norm": 0.07198271155357361, + "learning_rate": 9.741921517845723e-05, + "loss": 0.011455098167061806, + "num_input_tokens_seen": 66044408, + "step": 4033, + "train_runtime": 33254.817, + "train_tokens_per_second": 1986.01 + }, + { + "epoch": 1.1174515235457063, + "grad_norm": 0.07047975063323975, + "learning_rate": 9.741782117847729e-05, + "loss": 0.01065729558467865, + "num_input_tokens_seen": 66060784, + "step": 4034, + "train_runtime": 33263.0261, + "train_tokens_per_second": 1986.012 + }, + { + "epoch": 1.1177285318559558, + "grad_norm": 0.058896198868751526, + "learning_rate": 9.741642681209641e-05, + "loss": 0.009564632549881935, + "num_input_tokens_seen": 66077160, + "step": 4035, + "train_runtime": 33271.23, + "train_tokens_per_second": 1986.015 + }, + { + "epoch": 1.118005540166205, + "grad_norm": 0.03420407697558403, + "learning_rate": 9.74150320793254e-05, + "loss": 0.012660757638514042, + "num_input_tokens_seen": 66093536, + "step": 4036, + "train_runtime": 33279.4349, + "train_tokens_per_second": 1986.017 + }, + { + "epoch": 1.1182825484764543, + "grad_norm": 0.04087977111339569, + "learning_rate": 9.741363698017505e-05, + "loss": 0.01192148495465517, + "num_input_tokens_seen": 66109912, + "step": 4037, + "train_runtime": 33287.668, + "train_tokens_per_second": 1986.018 + }, + { + "epoch": 1.1185595567867037, + "grad_norm": 0.04130358621478081, + "learning_rate": 9.74122415146561e-05, + "loss": 0.011621425859630108, + "num_input_tokens_seen": 66126288, + "step": 4038, + "train_runtime": 33295.8922, + "train_tokens_per_second": 1986.019 + }, + { + "epoch": 1.118836565096953, + "grad_norm": 0.030736565589904785, + "learning_rate": 9.741084568277936e-05, + "loss": 0.011516756378114223, + "num_input_tokens_seen": 66142664, + "step": 4039, + "train_runtime": 33304.1131, + "train_tokens_per_second": 1986.021 + }, + { + "epoch": 1.1191135734072022, + "grad_norm": 0.03731580451130867, + "learning_rate": 9.74094494845556e-05, + "loss": 0.011613672599196434, + "num_input_tokens_seen": 66159040, + "step": 4040, + "train_runtime": 33312.3256, + "train_tokens_per_second": 1986.023 + }, + { + "epoch": 1.1193905817174514, + "grad_norm": 0.04750248044729233, + "learning_rate": 9.740805291999562e-05, + "loss": 0.011405542492866516, + "num_input_tokens_seen": 66175416, + "step": 4041, + "train_runtime": 33320.5573, + "train_tokens_per_second": 1986.024 + }, + { + "epoch": 1.1196675900277009, + "grad_norm": 0.055649060755968094, + "learning_rate": 9.740665598911021e-05, + "loss": 0.011862995103001595, + "num_input_tokens_seen": 66191792, + "step": 4042, + "train_runtime": 33328.7679, + "train_tokens_per_second": 1986.026 + }, + { + "epoch": 1.1199445983379501, + "grad_norm": 0.0728328675031662, + "learning_rate": 9.740525869191016e-05, + "loss": 0.009016567841172218, + "num_input_tokens_seen": 66208168, + "step": 4043, + "train_runtime": 33336.9916, + "train_tokens_per_second": 1986.027 + }, + { + "epoch": 1.1202216066481994, + "grad_norm": 0.025136591866612434, + "learning_rate": 9.740386102840626e-05, + "loss": 0.010186895728111267, + "num_input_tokens_seen": 66224544, + "step": 4044, + "train_runtime": 33345.2313, + "train_tokens_per_second": 1986.027 + }, + { + "epoch": 1.1204986149584488, + "grad_norm": 0.05015212297439575, + "learning_rate": 9.740246299860933e-05, + "loss": 0.011069195345044136, + "num_input_tokens_seen": 66240920, + "step": 4045, + "train_runtime": 33353.4548, + "train_tokens_per_second": 1986.029 + }, + { + "epoch": 1.120775623268698, + "grad_norm": 0.06021752581000328, + "learning_rate": 9.740106460253015e-05, + "loss": 0.010323741473257542, + "num_input_tokens_seen": 66257296, + "step": 4046, + "train_runtime": 33361.6673, + "train_tokens_per_second": 1986.031 + }, + { + "epoch": 1.1210526315789473, + "grad_norm": 0.04783134162425995, + "learning_rate": 9.739966584017956e-05, + "loss": 0.009537484496831894, + "num_input_tokens_seen": 66273672, + "step": 4047, + "train_runtime": 33369.8772, + "train_tokens_per_second": 1986.033 + }, + { + "epoch": 1.1213296398891968, + "grad_norm": 0.05997356399893761, + "learning_rate": 9.739826671156834e-05, + "loss": 0.008394863456487656, + "num_input_tokens_seen": 66290048, + "step": 4048, + "train_runtime": 33378.1008, + "train_tokens_per_second": 1986.034 + }, + { + "epoch": 1.121606648199446, + "grad_norm": 0.058388348668813705, + "learning_rate": 9.739686721670729e-05, + "loss": 0.012526223435997963, + "num_input_tokens_seen": 66306424, + "step": 4049, + "train_runtime": 33386.3076, + "train_tokens_per_second": 1986.036 + }, + { + "epoch": 1.1218836565096952, + "grad_norm": 0.051906876266002655, + "learning_rate": 9.739546735560724e-05, + "loss": 0.01148817129433155, + "num_input_tokens_seen": 66322800, + "step": 4050, + "train_runtime": 33394.5182, + "train_tokens_per_second": 1986.039 + }, + { + "epoch": 1.1221606648199447, + "grad_norm": 0.06963014602661133, + "learning_rate": 9.7394067128279e-05, + "loss": 0.015494692139327526, + "num_input_tokens_seen": 66339176, + "step": 4051, + "train_runtime": 33402.7255, + "train_tokens_per_second": 1986.041 + }, + { + "epoch": 1.122437673130194, + "grad_norm": 0.0463855154812336, + "learning_rate": 9.739266653473342e-05, + "loss": 0.011756431311368942, + "num_input_tokens_seen": 66355552, + "step": 4052, + "train_runtime": 33410.9432, + "train_tokens_per_second": 1986.042 + }, + { + "epoch": 1.1227146814404432, + "grad_norm": 0.04784267023205757, + "learning_rate": 9.73912655749813e-05, + "loss": 0.009188334457576275, + "num_input_tokens_seen": 66371928, + "step": 4053, + "train_runtime": 33419.1619, + "train_tokens_per_second": 1986.044 + }, + { + "epoch": 1.1229916897506924, + "grad_norm": 0.06265944987535477, + "learning_rate": 9.738986424903345e-05, + "loss": 0.01086884643882513, + "num_input_tokens_seen": 66388304, + "step": 4054, + "train_runtime": 33427.3709, + "train_tokens_per_second": 1986.046 + }, + { + "epoch": 1.1232686980609419, + "grad_norm": 0.05938506871461868, + "learning_rate": 9.738846255690071e-05, + "loss": 0.012159900739789009, + "num_input_tokens_seen": 66404680, + "step": 4055, + "train_runtime": 33435.5753, + "train_tokens_per_second": 1986.049 + }, + { + "epoch": 1.123545706371191, + "grad_norm": 0.061071623116731644, + "learning_rate": 9.738706049859392e-05, + "loss": 0.012135719880461693, + "num_input_tokens_seen": 66421056, + "step": 4056, + "train_runtime": 33443.7862, + "train_tokens_per_second": 1986.051 + }, + { + "epoch": 1.1238227146814403, + "grad_norm": 0.09185332804918289, + "learning_rate": 9.738565807412392e-05, + "loss": 0.014282917603850365, + "num_input_tokens_seen": 66437432, + "step": 4057, + "train_runtime": 33451.9927, + "train_tokens_per_second": 1986.053 + }, + { + "epoch": 1.1240997229916898, + "grad_norm": 0.05976944789290428, + "learning_rate": 9.738425528350152e-05, + "loss": 0.015050239861011505, + "num_input_tokens_seen": 66453808, + "step": 4058, + "train_runtime": 33460.2028, + "train_tokens_per_second": 1986.055 + }, + { + "epoch": 1.124376731301939, + "grad_norm": 0.03661668300628662, + "learning_rate": 9.738285212673756e-05, + "loss": 0.011681092903017998, + "num_input_tokens_seen": 66470184, + "step": 4059, + "train_runtime": 33468.4091, + "train_tokens_per_second": 1986.057 + }, + { + "epoch": 1.1246537396121883, + "grad_norm": 0.07029136270284653, + "learning_rate": 9.738144860384292e-05, + "loss": 0.01157437078654766, + "num_input_tokens_seen": 66486560, + "step": 4060, + "train_runtime": 33476.6317, + "train_tokens_per_second": 1986.059 + }, + { + "epoch": 1.1249307479224377, + "grad_norm": 0.04390070214867592, + "learning_rate": 9.738004471482842e-05, + "loss": 0.009217829443514347, + "num_input_tokens_seen": 66502936, + "step": 4061, + "train_runtime": 33484.8559, + "train_tokens_per_second": 1986.06 + }, + { + "epoch": 1.125207756232687, + "grad_norm": 0.04590623453259468, + "learning_rate": 9.73786404597049e-05, + "loss": 0.011067504994571209, + "num_input_tokens_seen": 66519312, + "step": 4062, + "train_runtime": 33493.0882, + "train_tokens_per_second": 1986.061 + }, + { + "epoch": 1.1254847645429362, + "grad_norm": 0.032110486179590225, + "learning_rate": 9.737723583848322e-05, + "loss": 0.010671042837202549, + "num_input_tokens_seen": 66535688, + "step": 4063, + "train_runtime": 33501.3141, + "train_tokens_per_second": 1986.062 + }, + { + "epoch": 1.1257617728531857, + "grad_norm": 0.044971417635679245, + "learning_rate": 9.737583085117423e-05, + "loss": 0.010024555027484894, + "num_input_tokens_seen": 66552064, + "step": 4064, + "train_runtime": 33509.5377, + "train_tokens_per_second": 1986.063 + }, + { + "epoch": 1.126038781163435, + "grad_norm": 0.05717610567808151, + "learning_rate": 9.73744254977888e-05, + "loss": 0.011990437284111977, + "num_input_tokens_seen": 66568440, + "step": 4065, + "train_runtime": 33517.7667, + "train_tokens_per_second": 1986.064 + }, + { + "epoch": 1.1263157894736842, + "grad_norm": 0.06051238626241684, + "learning_rate": 9.737301977833777e-05, + "loss": 0.0100700156763196, + "num_input_tokens_seen": 66584816, + "step": 4066, + "train_runtime": 33525.9905, + "train_tokens_per_second": 1986.066 + }, + { + "epoch": 1.1265927977839336, + "grad_norm": 0.03516576811671257, + "learning_rate": 9.737161369283201e-05, + "loss": 0.011201897636055946, + "num_input_tokens_seen": 66601192, + "step": 4067, + "train_runtime": 33534.2029, + "train_tokens_per_second": 1986.068 + }, + { + "epoch": 1.1268698060941829, + "grad_norm": 0.03638621047139168, + "learning_rate": 9.737020724128239e-05, + "loss": 0.011424185708165169, + "num_input_tokens_seen": 66617568, + "step": 4068, + "train_runtime": 33542.4092, + "train_tokens_per_second": 1986.07 + }, + { + "epoch": 1.127146814404432, + "grad_norm": 0.028047088533639908, + "learning_rate": 9.736880042369978e-05, + "loss": 0.007177075371146202, + "num_input_tokens_seen": 66633944, + "step": 4069, + "train_runtime": 33550.6187, + "train_tokens_per_second": 1986.072 + }, + { + "epoch": 1.1274238227146816, + "grad_norm": 0.04351355880498886, + "learning_rate": 9.736739324009503e-05, + "loss": 0.01057787798345089, + "num_input_tokens_seen": 66650320, + "step": 4070, + "train_runtime": 33558.8236, + "train_tokens_per_second": 1986.074 + }, + { + "epoch": 1.1277008310249308, + "grad_norm": 0.08440288156270981, + "learning_rate": 9.736598569047903e-05, + "loss": 0.01431445311754942, + "num_input_tokens_seen": 66666696, + "step": 4071, + "train_runtime": 33567.0328, + "train_tokens_per_second": 1986.077 + }, + { + "epoch": 1.12797783933518, + "grad_norm": 0.04625235125422478, + "learning_rate": 9.736457777486267e-05, + "loss": 0.010972598567605019, + "num_input_tokens_seen": 66683072, + "step": 4072, + "train_runtime": 33575.237, + "train_tokens_per_second": 1986.079 + }, + { + "epoch": 1.1282548476454293, + "grad_norm": 0.0642000287771225, + "learning_rate": 9.73631694932568e-05, + "loss": 0.011743303388357162, + "num_input_tokens_seen": 66699448, + "step": 4073, + "train_runtime": 33583.454, + "train_tokens_per_second": 1986.081 + }, + { + "epoch": 1.1285318559556787, + "grad_norm": 0.07170519232749939, + "learning_rate": 9.736176084567232e-05, + "loss": 0.012197182513773441, + "num_input_tokens_seen": 66715824, + "step": 4074, + "train_runtime": 33591.6656, + "train_tokens_per_second": 1986.083 + }, + { + "epoch": 1.128808864265928, + "grad_norm": 0.05239381641149521, + "learning_rate": 9.736035183212011e-05, + "loss": 0.01083369366824627, + "num_input_tokens_seen": 66732200, + "step": 4075, + "train_runtime": 33599.8725, + "train_tokens_per_second": 1986.085 + }, + { + "epoch": 1.1290858725761772, + "grad_norm": 0.07442901283502579, + "learning_rate": 9.735894245261105e-05, + "loss": 0.007707607466727495, + "num_input_tokens_seen": 66748576, + "step": 4076, + "train_runtime": 33608.0822, + "train_tokens_per_second": 1986.087 + }, + { + "epoch": 1.1293628808864267, + "grad_norm": 0.0549372062087059, + "learning_rate": 9.735753270715604e-05, + "loss": 0.011305682361125946, + "num_input_tokens_seen": 66764952, + "step": 4077, + "train_runtime": 33616.2978, + "train_tokens_per_second": 1986.089 + }, + { + "epoch": 1.129639889196676, + "grad_norm": 0.04166809096932411, + "learning_rate": 9.735612259576597e-05, + "loss": 0.011091365478932858, + "num_input_tokens_seen": 66781328, + "step": 4078, + "train_runtime": 33624.5032, + "train_tokens_per_second": 1986.091 + }, + { + "epoch": 1.1299168975069251, + "grad_norm": 0.04501907527446747, + "learning_rate": 9.735471211845175e-05, + "loss": 0.010777709074318409, + "num_input_tokens_seen": 66797704, + "step": 4079, + "train_runtime": 33632.7111, + "train_tokens_per_second": 1986.093 + }, + { + "epoch": 1.1301939058171746, + "grad_norm": 0.03931240364909172, + "learning_rate": 9.735330127522425e-05, + "loss": 0.01090767327696085, + "num_input_tokens_seen": 66814080, + "step": 4080, + "train_runtime": 33640.9163, + "train_tokens_per_second": 1986.096 + }, + { + "epoch": 1.1304709141274238, + "grad_norm": 0.03888137266039848, + "learning_rate": 9.735189006609439e-05, + "loss": 0.013163149356842041, + "num_input_tokens_seen": 66830456, + "step": 4081, + "train_runtime": 33649.1207, + "train_tokens_per_second": 1986.098 + }, + { + "epoch": 1.130747922437673, + "grad_norm": 0.08276280760765076, + "learning_rate": 9.735047849107308e-05, + "loss": 0.012398244813084602, + "num_input_tokens_seen": 66846832, + "step": 4082, + "train_runtime": 33657.3316, + "train_tokens_per_second": 1986.1 + }, + { + "epoch": 1.1310249307479223, + "grad_norm": 0.0378122478723526, + "learning_rate": 9.734906655017122e-05, + "loss": 0.010936222970485687, + "num_input_tokens_seen": 66863208, + "step": 4083, + "train_runtime": 33665.5355, + "train_tokens_per_second": 1986.103 + }, + { + "epoch": 1.1313019390581718, + "grad_norm": 0.042381372302770615, + "learning_rate": 9.73476542433997e-05, + "loss": 0.01143976952880621, + "num_input_tokens_seen": 66879584, + "step": 4084, + "train_runtime": 33673.754, + "train_tokens_per_second": 1986.104 + }, + { + "epoch": 1.131578947368421, + "grad_norm": 0.07909480482339859, + "learning_rate": 9.734624157076948e-05, + "loss": 0.011790547519922256, + "num_input_tokens_seen": 66895960, + "step": 4085, + "train_runtime": 33681.977, + "train_tokens_per_second": 1986.106 + }, + { + "epoch": 1.1318559556786703, + "grad_norm": 0.04400653764605522, + "learning_rate": 9.734482853229144e-05, + "loss": 0.008566685952246189, + "num_input_tokens_seen": 66912336, + "step": 4086, + "train_runtime": 33690.2063, + "train_tokens_per_second": 1986.106 + }, + { + "epoch": 1.1321329639889197, + "grad_norm": 0.025791049003601074, + "learning_rate": 9.734341512797649e-05, + "loss": 0.007338285446166992, + "num_input_tokens_seen": 66928712, + "step": 4087, + "train_runtime": 33698.4308, + "train_tokens_per_second": 1986.108 + }, + { + "epoch": 1.132409972299169, + "grad_norm": 0.031480174511671066, + "learning_rate": 9.734200135783557e-05, + "loss": 0.010056400671601295, + "num_input_tokens_seen": 66945088, + "step": 4088, + "train_runtime": 33706.6596, + "train_tokens_per_second": 1986.109 + }, + { + "epoch": 1.1326869806094182, + "grad_norm": 0.07143156230449677, + "learning_rate": 9.734058722187962e-05, + "loss": 0.0107087017968297, + "num_input_tokens_seen": 66961464, + "step": 4089, + "train_runtime": 33714.8721, + "train_tokens_per_second": 1986.111 + }, + { + "epoch": 1.1329639889196677, + "grad_norm": 0.06293357163667679, + "learning_rate": 9.733917272011955e-05, + "loss": 0.009001286700367928, + "num_input_tokens_seen": 66977840, + "step": 4090, + "train_runtime": 33723.0864, + "train_tokens_per_second": 1986.112 + }, + { + "epoch": 1.133240997229917, + "grad_norm": 0.0444890595972538, + "learning_rate": 9.733775785256629e-05, + "loss": 0.012123160995543003, + "num_input_tokens_seen": 66994216, + "step": 4091, + "train_runtime": 33731.3191, + "train_tokens_per_second": 1986.113 + }, + { + "epoch": 1.1335180055401661, + "grad_norm": 0.03675323352217674, + "learning_rate": 9.733634261923075e-05, + "loss": 0.012360094115138054, + "num_input_tokens_seen": 67010592, + "step": 4092, + "train_runtime": 33739.5533, + "train_tokens_per_second": 1986.114 + }, + { + "epoch": 1.1337950138504156, + "grad_norm": 0.05709363520145416, + "learning_rate": 9.73349270201239e-05, + "loss": 0.011416299268603325, + "num_input_tokens_seen": 67026968, + "step": 4093, + "train_runtime": 33747.7812, + "train_tokens_per_second": 1986.115 + }, + { + "epoch": 1.1340720221606648, + "grad_norm": 0.07884740084409714, + "learning_rate": 9.733351105525668e-05, + "loss": 0.01494819950312376, + "num_input_tokens_seen": 67043344, + "step": 4094, + "train_runtime": 33755.9937, + "train_tokens_per_second": 1986.117 + }, + { + "epoch": 1.134349030470914, + "grad_norm": 0.05507196485996246, + "learning_rate": 9.733209472463999e-05, + "loss": 0.00884011760354042, + "num_input_tokens_seen": 67059720, + "step": 4095, + "train_runtime": 33764.2291, + "train_tokens_per_second": 1986.117 + }, + { + "epoch": 1.1346260387811635, + "grad_norm": 0.06191180646419525, + "learning_rate": 9.733067802828484e-05, + "loss": 0.009239506907761097, + "num_input_tokens_seen": 67076096, + "step": 4096, + "train_runtime": 33772.4704, + "train_tokens_per_second": 1986.118 + }, + { + "epoch": 1.1349030470914128, + "grad_norm": 0.058104608207941055, + "learning_rate": 9.73292609662021e-05, + "loss": 0.013602585531771183, + "num_input_tokens_seen": 67092472, + "step": 4097, + "train_runtime": 33780.7012, + "train_tokens_per_second": 1986.118 + }, + { + "epoch": 1.135180055401662, + "grad_norm": 0.05283571034669876, + "learning_rate": 9.732784353840278e-05, + "loss": 0.01230963971465826, + "num_input_tokens_seen": 67108848, + "step": 4098, + "train_runtime": 33788.916, + "train_tokens_per_second": 1986.12 + }, + { + "epoch": 1.1354570637119115, + "grad_norm": 0.04138341546058655, + "learning_rate": 9.732642574489781e-05, + "loss": 0.011597235687077045, + "num_input_tokens_seen": 67125224, + "step": 4099, + "train_runtime": 33797.1324, + "train_tokens_per_second": 1986.122 + }, + { + "epoch": 1.1357340720221607, + "grad_norm": 0.0641930103302002, + "learning_rate": 9.732500758569815e-05, + "loss": 0.008522730320692062, + "num_input_tokens_seen": 67141600, + "step": 4100, + "train_runtime": 33805.337, + "train_tokens_per_second": 1986.124 + }, + { + "epoch": 1.13601108033241, + "grad_norm": 0.054487720131874084, + "learning_rate": 9.732358906081475e-05, + "loss": 0.011536253616213799, + "num_input_tokens_seen": 67157976, + "step": 4101, + "train_runtime": 33815.0764, + "train_tokens_per_second": 1986.037 + }, + { + "epoch": 1.1362880886426594, + "grad_norm": 0.04934472590684891, + "learning_rate": 9.732217017025858e-05, + "loss": 0.010765564627945423, + "num_input_tokens_seen": 67174352, + "step": 4102, + "train_runtime": 33823.2786, + "train_tokens_per_second": 1986.039 + }, + { + "epoch": 1.1365650969529086, + "grad_norm": 0.06488984823226929, + "learning_rate": 9.732075091404058e-05, + "loss": 0.011146784760057926, + "num_input_tokens_seen": 67190728, + "step": 4103, + "train_runtime": 33831.4868, + "train_tokens_per_second": 1986.041 + }, + { + "epoch": 1.1368421052631579, + "grad_norm": 0.052206359803676605, + "learning_rate": 9.731933129217175e-05, + "loss": 0.011856123805046082, + "num_input_tokens_seen": 67207104, + "step": 4104, + "train_runtime": 33839.6946, + "train_tokens_per_second": 1986.043 + }, + { + "epoch": 1.1371191135734071, + "grad_norm": 0.15318043529987335, + "learning_rate": 9.731791130466306e-05, + "loss": 0.011166319251060486, + "num_input_tokens_seen": 67223480, + "step": 4105, + "train_runtime": 33847.904, + "train_tokens_per_second": 1986.046 + }, + { + "epoch": 1.1373961218836566, + "grad_norm": 0.044827237725257874, + "learning_rate": 9.731649095152545e-05, + "loss": 0.009195072576403618, + "num_input_tokens_seen": 67239856, + "step": 4106, + "train_runtime": 33856.1089, + "train_tokens_per_second": 1986.048 + }, + { + "epoch": 1.1376731301939058, + "grad_norm": 0.03953041508793831, + "learning_rate": 9.731507023276992e-05, + "loss": 0.010594122111797333, + "num_input_tokens_seen": 67256232, + "step": 4107, + "train_runtime": 33864.3248, + "train_tokens_per_second": 1986.05 + }, + { + "epoch": 1.137950138504155, + "grad_norm": 0.06516125053167343, + "learning_rate": 9.731364914840744e-05, + "loss": 0.013200429268181324, + "num_input_tokens_seen": 67272608, + "step": 4108, + "train_runtime": 33872.5545, + "train_tokens_per_second": 1986.051 + }, + { + "epoch": 1.1382271468144045, + "grad_norm": 0.055844683200120926, + "learning_rate": 9.731222769844898e-05, + "loss": 0.010753070935606956, + "num_input_tokens_seen": 67288984, + "step": 4109, + "train_runtime": 33880.7786, + "train_tokens_per_second": 1986.052 + }, + { + "epoch": 1.1385041551246537, + "grad_norm": 0.04587159305810928, + "learning_rate": 9.731080588290555e-05, + "loss": 0.010373927652835846, + "num_input_tokens_seen": 67305360, + "step": 4110, + "train_runtime": 33888.9936, + "train_tokens_per_second": 1986.054 + }, + { + "epoch": 1.138781163434903, + "grad_norm": 0.06767014414072037, + "learning_rate": 9.730938370178813e-05, + "loss": 0.010462703183293343, + "num_input_tokens_seen": 67321736, + "step": 4111, + "train_runtime": 33897.2009, + "train_tokens_per_second": 1986.056 + }, + { + "epoch": 1.1390581717451524, + "grad_norm": 0.0705600380897522, + "learning_rate": 9.73079611551077e-05, + "loss": 0.012470019981265068, + "num_input_tokens_seen": 67338112, + "step": 4112, + "train_runtime": 33905.4088, + "train_tokens_per_second": 1986.058 + }, + { + "epoch": 1.1393351800554017, + "grad_norm": 0.05840851366519928, + "learning_rate": 9.730653824287523e-05, + "loss": 0.013107694685459137, + "num_input_tokens_seen": 67354488, + "step": 4113, + "train_runtime": 33913.6336, + "train_tokens_per_second": 1986.059 + }, + { + "epoch": 1.139612188365651, + "grad_norm": 0.07573088258504868, + "learning_rate": 9.730511496510175e-05, + "loss": 0.013002810068428516, + "num_input_tokens_seen": 67370864, + "step": 4114, + "train_runtime": 33921.8631, + "train_tokens_per_second": 1986.06 + }, + { + "epoch": 1.1398891966759002, + "grad_norm": 0.37635427713394165, + "learning_rate": 9.730369132179827e-05, + "loss": 0.01792166940867901, + "num_input_tokens_seen": 67387240, + "step": 4115, + "train_runtime": 33930.0907, + "train_tokens_per_second": 1986.061 + }, + { + "epoch": 1.1401662049861496, + "grad_norm": 0.06326958537101746, + "learning_rate": 9.730226731297573e-05, + "loss": 0.012012366205453873, + "num_input_tokens_seen": 67403616, + "step": 4116, + "train_runtime": 33938.3127, + "train_tokens_per_second": 1986.063 + }, + { + "epoch": 1.1404432132963989, + "grad_norm": 0.04787744581699371, + "learning_rate": 9.73008429386452e-05, + "loss": 0.010509149171411991, + "num_input_tokens_seen": 67419992, + "step": 4117, + "train_runtime": 33946.5209, + "train_tokens_per_second": 1986.065 + }, + { + "epoch": 1.140720221606648, + "grad_norm": 0.05305388569831848, + "learning_rate": 9.729941819881765e-05, + "loss": 0.010794123634696007, + "num_input_tokens_seen": 67436368, + "step": 4118, + "train_runtime": 33954.7387, + "train_tokens_per_second": 1986.066 + }, + { + "epoch": 1.1409972299168976, + "grad_norm": 0.049200639128685, + "learning_rate": 9.729799309350408e-05, + "loss": 0.012412425130605698, + "num_input_tokens_seen": 67452744, + "step": 4119, + "train_runtime": 33962.9544, + "train_tokens_per_second": 1986.068 + }, + { + "epoch": 1.1412742382271468, + "grad_norm": 0.04155737906694412, + "learning_rate": 9.729656762271553e-05, + "loss": 0.010212821885943413, + "num_input_tokens_seen": 67469120, + "step": 4120, + "train_runtime": 33971.1638, + "train_tokens_per_second": 1986.07 + }, + { + "epoch": 1.141551246537396, + "grad_norm": 0.05324947088956833, + "learning_rate": 9.729514178646299e-05, + "loss": 0.010610063560307026, + "num_input_tokens_seen": 67485496, + "step": 4121, + "train_runtime": 33979.3709, + "train_tokens_per_second": 1986.073 + }, + { + "epoch": 1.1418282548476455, + "grad_norm": 0.050661832094192505, + "learning_rate": 9.729371558475751e-05, + "loss": 0.01152029912918806, + "num_input_tokens_seen": 67501872, + "step": 4122, + "train_runtime": 33987.5788, + "train_tokens_per_second": 1986.075 + }, + { + "epoch": 1.1421052631578947, + "grad_norm": 0.06439058482646942, + "learning_rate": 9.729228901761009e-05, + "loss": 0.008855707943439484, + "num_input_tokens_seen": 67518248, + "step": 4123, + "train_runtime": 33995.7863, + "train_tokens_per_second": 1986.077 + }, + { + "epoch": 1.142382271468144, + "grad_norm": 0.0684034451842308, + "learning_rate": 9.729086208503174e-05, + "loss": 0.01136317290365696, + "num_input_tokens_seen": 67534624, + "step": 4124, + "train_runtime": 34003.9952, + "train_tokens_per_second": 1986.079 + }, + { + "epoch": 1.1426592797783934, + "grad_norm": 0.04804167523980141, + "learning_rate": 9.72894347870335e-05, + "loss": 0.01202933769673109, + "num_input_tokens_seen": 67551000, + "step": 4125, + "train_runtime": 34012.2011, + "train_tokens_per_second": 1986.081 + }, + { + "epoch": 1.1429362880886427, + "grad_norm": 0.05014107748866081, + "learning_rate": 9.72880071236264e-05, + "loss": 0.010309175588190556, + "num_input_tokens_seen": 67567376, + "step": 4126, + "train_runtime": 34020.4284, + "train_tokens_per_second": 1986.082 + }, + { + "epoch": 1.143213296398892, + "grad_norm": 0.04035840928554535, + "learning_rate": 9.728657909482148e-05, + "loss": 0.009175826795399189, + "num_input_tokens_seen": 67583752, + "step": 4127, + "train_runtime": 34028.6542, + "train_tokens_per_second": 1986.084 + }, + { + "epoch": 1.1434903047091414, + "grad_norm": 0.07041165232658386, + "learning_rate": 9.728515070062975e-05, + "loss": 0.01338300108909607, + "num_input_tokens_seen": 67600128, + "step": 4128, + "train_runtime": 34036.8791, + "train_tokens_per_second": 1986.085 + }, + { + "epoch": 1.1437673130193906, + "grad_norm": 0.04477645456790924, + "learning_rate": 9.728372194106228e-05, + "loss": 0.010486126877367496, + "num_input_tokens_seen": 67616504, + "step": 4129, + "train_runtime": 34045.0959, + "train_tokens_per_second": 1986.086 + }, + { + "epoch": 1.1440443213296398, + "grad_norm": 0.049009934067726135, + "learning_rate": 9.728229281613009e-05, + "loss": 0.010523505508899689, + "num_input_tokens_seen": 67632880, + "step": 4130, + "train_runtime": 34053.3103, + "train_tokens_per_second": 1986.088 + }, + { + "epoch": 1.1443213296398893, + "grad_norm": 0.03582550957798958, + "learning_rate": 9.728086332584421e-05, + "loss": 0.009708449244499207, + "num_input_tokens_seen": 67649256, + "step": 4131, + "train_runtime": 34061.5364, + "train_tokens_per_second": 1986.089 + }, + { + "epoch": 1.1445983379501385, + "grad_norm": 0.06776738166809082, + "learning_rate": 9.72794334702157e-05, + "loss": 0.008523965254426003, + "num_input_tokens_seen": 67665632, + "step": 4132, + "train_runtime": 34069.7674, + "train_tokens_per_second": 1986.09 + }, + { + "epoch": 1.1448753462603878, + "grad_norm": 0.047860175371170044, + "learning_rate": 9.727800324925561e-05, + "loss": 0.011839190497994423, + "num_input_tokens_seen": 67682008, + "step": 4133, + "train_runtime": 34078.0016, + "train_tokens_per_second": 1986.091 + }, + { + "epoch": 1.1451523545706372, + "grad_norm": 0.03954118490219116, + "learning_rate": 9.727657266297499e-05, + "loss": 0.013298607431352139, + "num_input_tokens_seen": 67698384, + "step": 4134, + "train_runtime": 34086.225, + "train_tokens_per_second": 1986.092 + }, + { + "epoch": 1.1454293628808865, + "grad_norm": 0.03998711705207825, + "learning_rate": 9.727514171138492e-05, + "loss": 0.009616847150027752, + "num_input_tokens_seen": 67714760, + "step": 4135, + "train_runtime": 34094.4383, + "train_tokens_per_second": 1986.094 + }, + { + "epoch": 1.1457063711911357, + "grad_norm": 0.03357881307601929, + "learning_rate": 9.72737103944964e-05, + "loss": 0.011857526376843452, + "num_input_tokens_seen": 67731136, + "step": 4136, + "train_runtime": 34102.6415, + "train_tokens_per_second": 1986.096 + }, + { + "epoch": 1.145983379501385, + "grad_norm": 0.06272073835134506, + "learning_rate": 9.727227871232054e-05, + "loss": 0.012103923596441746, + "num_input_tokens_seen": 67747512, + "step": 4137, + "train_runtime": 34110.8534, + "train_tokens_per_second": 1986.098 + }, + { + "epoch": 1.1462603878116344, + "grad_norm": 0.029836971312761307, + "learning_rate": 9.727084666486838e-05, + "loss": 0.00839879922568798, + "num_input_tokens_seen": 67763888, + "step": 4138, + "train_runtime": 34119.056, + "train_tokens_per_second": 1986.101 + }, + { + "epoch": 1.1465373961218837, + "grad_norm": 0.0386880561709404, + "learning_rate": 9.7269414252151e-05, + "loss": 0.008777551352977753, + "num_input_tokens_seen": 67780264, + "step": 4139, + "train_runtime": 34127.2615, + "train_tokens_per_second": 1986.103 + }, + { + "epoch": 1.146814404432133, + "grad_norm": 0.07995076477527618, + "learning_rate": 9.726798147417944e-05, + "loss": 0.011476365849375725, + "num_input_tokens_seen": 67796640, + "step": 4140, + "train_runtime": 34135.4763, + "train_tokens_per_second": 1986.105 + }, + { + "epoch": 1.1470914127423824, + "grad_norm": 0.042528726160526276, + "learning_rate": 9.72665483309648e-05, + "loss": 0.009634330868721008, + "num_input_tokens_seen": 67813016, + "step": 4141, + "train_runtime": 34143.6866, + "train_tokens_per_second": 1986.107 + }, + { + "epoch": 1.1473684210526316, + "grad_norm": 0.05069376155734062, + "learning_rate": 9.726511482251814e-05, + "loss": 0.010032218880951405, + "num_input_tokens_seen": 67829392, + "step": 4142, + "train_runtime": 34151.8905, + "train_tokens_per_second": 1986.109 + }, + { + "epoch": 1.1476454293628808, + "grad_norm": 0.05556261166930199, + "learning_rate": 9.726368094885055e-05, + "loss": 0.01182524487376213, + "num_input_tokens_seen": 67845768, + "step": 4143, + "train_runtime": 34160.1018, + "train_tokens_per_second": 1986.111 + }, + { + "epoch": 1.14792243767313, + "grad_norm": 0.044867463409900665, + "learning_rate": 9.726224670997308e-05, + "loss": 0.008422324433922768, + "num_input_tokens_seen": 67862144, + "step": 4144, + "train_runtime": 34168.3051, + "train_tokens_per_second": 1986.114 + }, + { + "epoch": 1.1481994459833795, + "grad_norm": 0.0735117495059967, + "learning_rate": 9.726081210589684e-05, + "loss": 0.010919502004981041, + "num_input_tokens_seen": 67878520, + "step": 4145, + "train_runtime": 34176.53, + "train_tokens_per_second": 1986.115 + }, + { + "epoch": 1.1484764542936288, + "grad_norm": 0.062124237418174744, + "learning_rate": 9.725937713663292e-05, + "loss": 0.012715650722384453, + "num_input_tokens_seen": 67894896, + "step": 4146, + "train_runtime": 34184.7634, + "train_tokens_per_second": 1986.116 + }, + { + "epoch": 1.148753462603878, + "grad_norm": 0.036421965807676315, + "learning_rate": 9.72579418021924e-05, + "loss": 0.011427892372012138, + "num_input_tokens_seen": 67911272, + "step": 4147, + "train_runtime": 34192.9864, + "train_tokens_per_second": 1986.117 + }, + { + "epoch": 1.1490304709141275, + "grad_norm": 0.058613792061805725, + "learning_rate": 9.725650610258634e-05, + "loss": 0.010121659375727177, + "num_input_tokens_seen": 67927648, + "step": 4148, + "train_runtime": 34201.2115, + "train_tokens_per_second": 1986.118 + }, + { + "epoch": 1.1493074792243767, + "grad_norm": 0.060149502009153366, + "learning_rate": 9.725507003782588e-05, + "loss": 0.011890695430338383, + "num_input_tokens_seen": 67944024, + "step": 4149, + "train_runtime": 34209.4352, + "train_tokens_per_second": 1986.119 + }, + { + "epoch": 1.149584487534626, + "grad_norm": 0.07278253138065338, + "learning_rate": 9.725363360792208e-05, + "loss": 0.011022266931831837, + "num_input_tokens_seen": 67960400, + "step": 4150, + "train_runtime": 34217.6608, + "train_tokens_per_second": 1986.121 + }, + { + "epoch": 1.1498614958448754, + "grad_norm": 0.04353613406419754, + "learning_rate": 9.725219681288608e-05, + "loss": 0.0073887938633561134, + "num_input_tokens_seen": 67976776, + "step": 4151, + "train_runtime": 34225.8836, + "train_tokens_per_second": 1986.122 + }, + { + "epoch": 1.1501385041551246, + "grad_norm": 0.06123727932572365, + "learning_rate": 9.725075965272894e-05, + "loss": 0.010977404192090034, + "num_input_tokens_seen": 67993152, + "step": 4152, + "train_runtime": 34234.1089, + "train_tokens_per_second": 1986.123 + }, + { + "epoch": 1.1504155124653739, + "grad_norm": 0.04718281701207161, + "learning_rate": 9.72493221274618e-05, + "loss": 0.007516940124332905, + "num_input_tokens_seen": 68009528, + "step": 4153, + "train_runtime": 34242.3362, + "train_tokens_per_second": 1986.124 + }, + { + "epoch": 1.1506925207756233, + "grad_norm": 0.04307832568883896, + "learning_rate": 9.724788423709575e-05, + "loss": 0.008345176465809345, + "num_input_tokens_seen": 68025904, + "step": 4154, + "train_runtime": 34250.566, + "train_tokens_per_second": 1986.125 + }, + { + "epoch": 1.1509695290858726, + "grad_norm": 0.03764527291059494, + "learning_rate": 9.724644598164189e-05, + "loss": 0.008850798942148685, + "num_input_tokens_seen": 68042280, + "step": 4155, + "train_runtime": 34258.7973, + "train_tokens_per_second": 1986.126 + }, + { + "epoch": 1.1512465373961218, + "grad_norm": 0.04951438680291176, + "learning_rate": 9.724500736111135e-05, + "loss": 0.010371102951467037, + "num_input_tokens_seen": 68058656, + "step": 4156, + "train_runtime": 34267.0199, + "train_tokens_per_second": 1986.127 + }, + { + "epoch": 1.1515235457063713, + "grad_norm": 0.032940369099378586, + "learning_rate": 9.724356837551525e-05, + "loss": 0.01059846580028534, + "num_input_tokens_seen": 68075032, + "step": 4157, + "train_runtime": 34275.2385, + "train_tokens_per_second": 1986.129 + }, + { + "epoch": 1.1518005540166205, + "grad_norm": 0.10426712781190872, + "learning_rate": 9.724212902486469e-05, + "loss": 0.012047878466546535, + "num_input_tokens_seen": 68091408, + "step": 4158, + "train_runtime": 34283.4578, + "train_tokens_per_second": 1986.13 + }, + { + "epoch": 1.1520775623268698, + "grad_norm": 0.04483957588672638, + "learning_rate": 9.72406893091708e-05, + "loss": 0.00874206144362688, + "num_input_tokens_seen": 68107784, + "step": 4159, + "train_runtime": 34291.6827, + "train_tokens_per_second": 1986.131 + }, + { + "epoch": 1.1523545706371192, + "grad_norm": 0.05258503556251526, + "learning_rate": 9.723924922844473e-05, + "loss": 0.009630528278648853, + "num_input_tokens_seen": 68124160, + "step": 4160, + "train_runtime": 34299.9088, + "train_tokens_per_second": 1986.132 + }, + { + "epoch": 1.1526315789473685, + "grad_norm": 0.04423118010163307, + "learning_rate": 9.723780878269757e-05, + "loss": 0.010992344468832016, + "num_input_tokens_seen": 68140536, + "step": 4161, + "train_runtime": 34308.1205, + "train_tokens_per_second": 1986.134 + }, + { + "epoch": 1.1529085872576177, + "grad_norm": 0.05230216309428215, + "learning_rate": 9.723636797194046e-05, + "loss": 0.01208721473813057, + "num_input_tokens_seen": 68156912, + "step": 4162, + "train_runtime": 34316.329, + "train_tokens_per_second": 1986.136 + }, + { + "epoch": 1.1531855955678671, + "grad_norm": 0.041244037449359894, + "learning_rate": 9.723492679618455e-05, + "loss": 0.010370147414505482, + "num_input_tokens_seen": 68173288, + "step": 4163, + "train_runtime": 34324.5531, + "train_tokens_per_second": 1986.138 + }, + { + "epoch": 1.1534626038781164, + "grad_norm": 0.04505473002791405, + "learning_rate": 9.723348525544098e-05, + "loss": 0.01166712585836649, + "num_input_tokens_seen": 68189664, + "step": 4164, + "train_runtime": 34332.7642, + "train_tokens_per_second": 1986.14 + }, + { + "epoch": 1.1537396121883656, + "grad_norm": 0.03473668172955513, + "learning_rate": 9.723204334972085e-05, + "loss": 0.009927507489919662, + "num_input_tokens_seen": 68206040, + "step": 4165, + "train_runtime": 34340.9771, + "train_tokens_per_second": 1986.142 + }, + { + "epoch": 1.1540166204986149, + "grad_norm": 0.04547987878322601, + "learning_rate": 9.723060107903534e-05, + "loss": 0.01033610850572586, + "num_input_tokens_seen": 68222416, + "step": 4166, + "train_runtime": 34349.1815, + "train_tokens_per_second": 1986.144 + }, + { + "epoch": 1.1542936288088643, + "grad_norm": 0.04301692545413971, + "learning_rate": 9.722915844339557e-05, + "loss": 0.01168130524456501, + "num_input_tokens_seen": 68238792, + "step": 4167, + "train_runtime": 34357.4036, + "train_tokens_per_second": 1986.145 + }, + { + "epoch": 1.1545706371191136, + "grad_norm": 0.0389796644449234, + "learning_rate": 9.722771544281271e-05, + "loss": 0.009142384864389896, + "num_input_tokens_seen": 68255168, + "step": 4168, + "train_runtime": 34365.6375, + "train_tokens_per_second": 1986.146 + }, + { + "epoch": 1.1548476454293628, + "grad_norm": 0.03682183846831322, + "learning_rate": 9.722627207729789e-05, + "loss": 0.010371272452175617, + "num_input_tokens_seen": 68271544, + "step": 4169, + "train_runtime": 34373.873, + "train_tokens_per_second": 1986.146 + }, + { + "epoch": 1.1551246537396123, + "grad_norm": 0.0663997158408165, + "learning_rate": 9.722482834686227e-05, + "loss": 0.012004458345472813, + "num_input_tokens_seen": 68287920, + "step": 4170, + "train_runtime": 34382.0869, + "train_tokens_per_second": 1986.148 + }, + { + "epoch": 1.1554016620498615, + "grad_norm": 0.06192312389612198, + "learning_rate": 9.722338425151701e-05, + "loss": 0.01262248121201992, + "num_input_tokens_seen": 68304296, + "step": 4171, + "train_runtime": 34390.2928, + "train_tokens_per_second": 1986.15 + }, + { + "epoch": 1.1556786703601107, + "grad_norm": 0.05084363371133804, + "learning_rate": 9.722193979127328e-05, + "loss": 0.012341287918388844, + "num_input_tokens_seen": 68320672, + "step": 4172, + "train_runtime": 34398.5038, + "train_tokens_per_second": 1986.152 + }, + { + "epoch": 1.1559556786703602, + "grad_norm": 0.05471033230423927, + "learning_rate": 9.72204949661422e-05, + "loss": 0.009551827795803547, + "num_input_tokens_seen": 68337048, + "step": 4173, + "train_runtime": 34406.7192, + "train_tokens_per_second": 1986.154 + }, + { + "epoch": 1.1562326869806094, + "grad_norm": 0.02729158103466034, + "learning_rate": 9.721904977613497e-05, + "loss": 0.008663509972393513, + "num_input_tokens_seen": 68353424, + "step": 4174, + "train_runtime": 34414.9241, + "train_tokens_per_second": 1986.156 + }, + { + "epoch": 1.1565096952908587, + "grad_norm": 0.07023870944976807, + "learning_rate": 9.721760422126276e-05, + "loss": 0.010249251499772072, + "num_input_tokens_seen": 68369800, + "step": 4175, + "train_runtime": 34423.1313, + "train_tokens_per_second": 1986.159 + }, + { + "epoch": 1.156786703601108, + "grad_norm": 0.06328748166561127, + "learning_rate": 9.721615830153671e-05, + "loss": 0.01273371372371912, + "num_input_tokens_seen": 68386176, + "step": 4176, + "train_runtime": 34431.3339, + "train_tokens_per_second": 1986.161 + }, + { + "epoch": 1.1570637119113574, + "grad_norm": 0.04910686984658241, + "learning_rate": 9.721471201696803e-05, + "loss": 0.009261350147426128, + "num_input_tokens_seen": 68402552, + "step": 4177, + "train_runtime": 34439.5359, + "train_tokens_per_second": 1986.164 + }, + { + "epoch": 1.1573407202216066, + "grad_norm": 0.06443687528371811, + "learning_rate": 9.721326536756788e-05, + "loss": 0.010815195739269257, + "num_input_tokens_seen": 68418928, + "step": 4178, + "train_runtime": 34447.7532, + "train_tokens_per_second": 1986.165 + }, + { + "epoch": 1.1576177285318558, + "grad_norm": 0.028931250795722008, + "learning_rate": 9.721181835334741e-05, + "loss": 0.01048198901116848, + "num_input_tokens_seen": 68435304, + "step": 4179, + "train_runtime": 34455.9621, + "train_tokens_per_second": 1986.167 + }, + { + "epoch": 1.1578947368421053, + "grad_norm": 0.047065891325473785, + "learning_rate": 9.721037097431785e-05, + "loss": 0.010855134576559067, + "num_input_tokens_seen": 68451680, + "step": 4180, + "train_runtime": 34464.1648, + "train_tokens_per_second": 1986.17 + }, + { + "epoch": 1.1581717451523545, + "grad_norm": 0.07794886082410812, + "learning_rate": 9.720892323049033e-05, + "loss": 0.01180336158722639, + "num_input_tokens_seen": 68468056, + "step": 4181, + "train_runtime": 34472.3794, + "train_tokens_per_second": 1986.171 + }, + { + "epoch": 1.1584487534626038, + "grad_norm": 0.03545091301202774, + "learning_rate": 9.72074751218761e-05, + "loss": 0.008737481199204922, + "num_input_tokens_seen": 68484432, + "step": 4182, + "train_runtime": 34480.6049, + "train_tokens_per_second": 1986.173 + }, + { + "epoch": 1.1587257617728532, + "grad_norm": 0.0774664357304573, + "learning_rate": 9.72060266484863e-05, + "loss": 0.013448329642415047, + "num_input_tokens_seen": 68500808, + "step": 4183, + "train_runtime": 34488.8193, + "train_tokens_per_second": 1986.174 + }, + { + "epoch": 1.1590027700831025, + "grad_norm": 0.048766765743494034, + "learning_rate": 9.720457781033215e-05, + "loss": 0.01200693380087614, + "num_input_tokens_seen": 68517184, + "step": 4184, + "train_runtime": 34497.0323, + "train_tokens_per_second": 1986.176 + }, + { + "epoch": 1.1592797783933517, + "grad_norm": 0.06424888223409653, + "learning_rate": 9.720312860742482e-05, + "loss": 0.011402271687984467, + "num_input_tokens_seen": 68533560, + "step": 4185, + "train_runtime": 34505.2593, + "train_tokens_per_second": 1986.177 + }, + { + "epoch": 1.1595567867036012, + "grad_norm": 0.07045385241508484, + "learning_rate": 9.720167903977553e-05, + "loss": 0.009864402003586292, + "num_input_tokens_seen": 68549936, + "step": 4186, + "train_runtime": 34513.4916, + "train_tokens_per_second": 1986.178 + }, + { + "epoch": 1.1598337950138504, + "grad_norm": 0.027461223304271698, + "learning_rate": 9.720022910739545e-05, + "loss": 0.01165740191936493, + "num_input_tokens_seen": 68566312, + "step": 4187, + "train_runtime": 34521.7242, + "train_tokens_per_second": 1986.179 + }, + { + "epoch": 1.1601108033240997, + "grad_norm": 0.0733179897069931, + "learning_rate": 9.719877881029585e-05, + "loss": 0.013792186975479126, + "num_input_tokens_seen": 68582688, + "step": 4188, + "train_runtime": 34529.9534, + "train_tokens_per_second": 1986.18 + }, + { + "epoch": 1.1603878116343491, + "grad_norm": 0.036322902888059616, + "learning_rate": 9.719732814848786e-05, + "loss": 0.009671422652900219, + "num_input_tokens_seen": 68599064, + "step": 4189, + "train_runtime": 34538.1825, + "train_tokens_per_second": 1986.18 + }, + { + "epoch": 1.1606648199445984, + "grad_norm": 0.047622181475162506, + "learning_rate": 9.719587712198275e-05, + "loss": 0.011872388422489166, + "num_input_tokens_seen": 68615440, + "step": 4190, + "train_runtime": 34546.4003, + "train_tokens_per_second": 1986.182 + }, + { + "epoch": 1.1609418282548476, + "grad_norm": 0.07502725720405579, + "learning_rate": 9.719442573079169e-05, + "loss": 0.011366970837116241, + "num_input_tokens_seen": 68631816, + "step": 4191, + "train_runtime": 34554.6096, + "train_tokens_per_second": 1986.184 + }, + { + "epoch": 1.161218836565097, + "grad_norm": 0.05483747273683548, + "learning_rate": 9.71929739749259e-05, + "loss": 0.01103995367884636, + "num_input_tokens_seen": 68648192, + "step": 4192, + "train_runtime": 34562.8196, + "train_tokens_per_second": 1986.186 + }, + { + "epoch": 1.1614958448753463, + "grad_norm": 0.025356821715831757, + "learning_rate": 9.719152185439662e-05, + "loss": 0.010946668684482574, + "num_input_tokens_seen": 68664568, + "step": 4193, + "train_runtime": 34571.0214, + "train_tokens_per_second": 1986.189 + }, + { + "epoch": 1.1617728531855955, + "grad_norm": 0.037221331149339676, + "learning_rate": 9.719006936921507e-05, + "loss": 0.010752509348094463, + "num_input_tokens_seen": 68680944, + "step": 4194, + "train_runtime": 34579.2445, + "train_tokens_per_second": 1986.19 + }, + { + "epoch": 1.162049861495845, + "grad_norm": 0.04084762558341026, + "learning_rate": 9.718861651939244e-05, + "loss": 0.01102987676858902, + "num_input_tokens_seen": 68697320, + "step": 4195, + "train_runtime": 34587.4736, + "train_tokens_per_second": 1986.191 + }, + { + "epoch": 1.1623268698060942, + "grad_norm": 0.08193482458591461, + "learning_rate": 9.718716330493998e-05, + "loss": 0.012811103835701942, + "num_input_tokens_seen": 68713696, + "step": 4196, + "train_runtime": 34595.6939, + "train_tokens_per_second": 1986.192 + }, + { + "epoch": 1.1626038781163435, + "grad_norm": 0.046466853469610214, + "learning_rate": 9.718570972586893e-05, + "loss": 0.012023267336189747, + "num_input_tokens_seen": 68730072, + "step": 4197, + "train_runtime": 34603.901, + "train_tokens_per_second": 1986.194 + }, + { + "epoch": 1.1628808864265927, + "grad_norm": 0.05665383115410805, + "learning_rate": 9.718425578219051e-05, + "loss": 0.012425830587744713, + "num_input_tokens_seen": 68746448, + "step": 4198, + "train_runtime": 34612.1078, + "train_tokens_per_second": 1986.197 + }, + { + "epoch": 1.1631578947368422, + "grad_norm": 0.053611934185028076, + "learning_rate": 9.718280147391594e-05, + "loss": 0.009844831191003323, + "num_input_tokens_seen": 68762824, + "step": 4199, + "train_runtime": 34620.3265, + "train_tokens_per_second": 1986.198 + }, + { + "epoch": 1.1634349030470914, + "grad_norm": 0.06615272164344788, + "learning_rate": 9.718134680105648e-05, + "loss": 0.011262536980211735, + "num_input_tokens_seen": 68779200, + "step": 4200, + "train_runtime": 34628.5717, + "train_tokens_per_second": 1986.198 + } + ], + "logging_steps": 1, + "max_steps": 36100, + "num_input_tokens_seen": 68779200, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.159317304144691e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}