diff --git "a/v127rc_exp2/B_rep/checkpoint-1300/trainer_state.json" "b/v127rc_exp2/B_rep/checkpoint-1300/trainer_state.json" new file mode 100644--- /dev/null +++ "b/v127rc_exp2/B_rep/checkpoint-1300/trainer_state.json" @@ -0,0 +1,13034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.7878787878787878, + "eval_steps": 500, + "global_step": 1300, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006060606060606061, + "grad_norm": 0.35123783349990845, + "learning_rate": 0.0, + "loss": 1.6639432907104492, + "num_input_tokens_seen": 16376, + "step": 1, + "train_runtime": 9.7703, + "train_tokens_per_second": 1676.104 + }, + { + "epoch": 0.0012121212121212121, + "grad_norm": 0.39342227578163147, + "learning_rate": 6.060606060606061e-07, + "loss": 1.6057767868041992, + "num_input_tokens_seen": 32752, + "step": 2, + "train_runtime": 17.8325, + "train_tokens_per_second": 1836.647 + }, + { + "epoch": 0.0018181818181818182, + "grad_norm": 0.3597555458545685, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.6560568809509277, + "num_input_tokens_seen": 49128, + "step": 3, + "train_runtime": 25.8927, + "train_tokens_per_second": 1897.372 + }, + { + "epoch": 0.0024242424242424242, + "grad_norm": 0.3463701009750366, + "learning_rate": 1.818181818181818e-06, + "loss": 1.6540638208389282, + "num_input_tokens_seen": 65504, + "step": 4, + "train_runtime": 33.9566, + "train_tokens_per_second": 1929.051 + }, + { + "epoch": 0.0030303030303030303, + "grad_norm": 0.34733158349990845, + "learning_rate": 2.4242424242424244e-06, + "loss": 1.664928913116455, + "num_input_tokens_seen": 81880, + "step": 5, + "train_runtime": 42.0394, + "train_tokens_per_second": 1947.697 + }, + { + "epoch": 0.0036363636363636364, + "grad_norm": 0.36326366662979126, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.6352522373199463, + "num_input_tokens_seen": 98256, + "step": 6, + "train_runtime": 50.1229, + "train_tokens_per_second": 1960.302 + }, + { + "epoch": 0.004242424242424243, + "grad_norm": 0.351137638092041, + "learning_rate": 3.636363636363636e-06, + "loss": 1.660022497177124, + "num_input_tokens_seen": 114632, + "step": 7, + "train_runtime": 58.2137, + "train_tokens_per_second": 1969.159 + }, + { + "epoch": 0.0048484848484848485, + "grad_norm": 0.353691428899765, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6757584810256958, + "num_input_tokens_seen": 131008, + "step": 8, + "train_runtime": 66.311, + "train_tokens_per_second": 1975.66 + }, + { + "epoch": 0.005454545454545455, + "grad_norm": 0.3630884885787964, + "learning_rate": 4.848484848484849e-06, + "loss": 1.6366666555404663, + "num_input_tokens_seen": 147384, + "step": 9, + "train_runtime": 74.4151, + "train_tokens_per_second": 1980.565 + }, + { + "epoch": 0.006060606060606061, + "grad_norm": 0.354055255651474, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.6339915990829468, + "num_input_tokens_seen": 163760, + "step": 10, + "train_runtime": 82.5209, + "train_tokens_per_second": 1984.468 + }, + { + "epoch": 0.006666666666666667, + "grad_norm": 0.3574777841567993, + "learning_rate": 6.060606060606061e-06, + "loss": 1.6360563039779663, + "num_input_tokens_seen": 180136, + "step": 11, + "train_runtime": 90.6349, + "train_tokens_per_second": 1987.491 + }, + { + "epoch": 0.007272727272727273, + "grad_norm": 0.3561362028121948, + "learning_rate": 6.666666666666667e-06, + "loss": 1.6641417741775513, + "num_input_tokens_seen": 196512, + "step": 12, + "train_runtime": 98.7492, + "train_tokens_per_second": 1990.012 + }, + { + "epoch": 0.00787878787878788, + "grad_norm": 0.3659680485725403, + "learning_rate": 7.272727272727272e-06, + "loss": 1.6375828981399536, + "num_input_tokens_seen": 212888, + "step": 13, + "train_runtime": 106.8626, + "train_tokens_per_second": 1992.165 + }, + { + "epoch": 0.008484848484848486, + "grad_norm": 0.37148839235305786, + "learning_rate": 7.878787878787878e-06, + "loss": 1.6246858835220337, + "num_input_tokens_seen": 229264, + "step": 14, + "train_runtime": 114.9785, + "train_tokens_per_second": 1993.973 + }, + { + "epoch": 0.00909090909090909, + "grad_norm": 0.38491716980934143, + "learning_rate": 8.484848484848486e-06, + "loss": 1.5969434976577759, + "num_input_tokens_seen": 245640, + "step": 15, + "train_runtime": 123.0953, + "train_tokens_per_second": 1995.526 + }, + { + "epoch": 0.009696969696969697, + "grad_norm": 0.37805187702178955, + "learning_rate": 9.090909090909091e-06, + "loss": 1.6518127918243408, + "num_input_tokens_seen": 262016, + "step": 16, + "train_runtime": 131.2123, + "train_tokens_per_second": 1996.886 + }, + { + "epoch": 0.010303030303030303, + "grad_norm": 0.3775594234466553, + "learning_rate": 9.696969696969698e-06, + "loss": 1.6409087181091309, + "num_input_tokens_seen": 278392, + "step": 17, + "train_runtime": 139.3368, + "train_tokens_per_second": 1997.979 + }, + { + "epoch": 0.01090909090909091, + "grad_norm": 0.39896833896636963, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.5989718437194824, + "num_input_tokens_seen": 294768, + "step": 18, + "train_runtime": 147.4538, + "train_tokens_per_second": 1999.054 + }, + { + "epoch": 0.011515151515151515, + "grad_norm": 0.386406272649765, + "learning_rate": 1.0909090909090909e-05, + "loss": 1.6259583234786987, + "num_input_tokens_seen": 311144, + "step": 19, + "train_runtime": 155.5716, + "train_tokens_per_second": 2000.005 + }, + { + "epoch": 0.012121212121212121, + "grad_norm": 0.3878491520881653, + "learning_rate": 1.1515151515151517e-05, + "loss": 1.5945892333984375, + "num_input_tokens_seen": 327520, + "step": 20, + "train_runtime": 163.6898, + "train_tokens_per_second": 2000.858 + }, + { + "epoch": 0.012727272727272728, + "grad_norm": 0.4080464839935303, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.5983033180236816, + "num_input_tokens_seen": 343896, + "step": 21, + "train_runtime": 171.8117, + "train_tokens_per_second": 2001.587 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.41852834820747375, + "learning_rate": 1.2727272727272727e-05, + "loss": 1.5769346952438354, + "num_input_tokens_seen": 360272, + "step": 22, + "train_runtime": 179.9341, + "train_tokens_per_second": 2002.244 + }, + { + "epoch": 0.013939393939393939, + "grad_norm": 0.4324847459793091, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.5699174404144287, + "num_input_tokens_seen": 376648, + "step": 23, + "train_runtime": 188.0563, + "train_tokens_per_second": 2002.847 + }, + { + "epoch": 0.014545454545454545, + "grad_norm": 0.4219138026237488, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.5589112043380737, + "num_input_tokens_seen": 393024, + "step": 24, + "train_runtime": 196.1758, + "train_tokens_per_second": 2003.427 + }, + { + "epoch": 0.015151515151515152, + "grad_norm": 0.42980635166168213, + "learning_rate": 1.4545454545454545e-05, + "loss": 1.5662312507629395, + "num_input_tokens_seen": 409400, + "step": 25, + "train_runtime": 204.2941, + "train_tokens_per_second": 2003.974 + }, + { + "epoch": 0.01575757575757576, + "grad_norm": 0.4569622576236725, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.4885609149932861, + "num_input_tokens_seen": 425776, + "step": 26, + "train_runtime": 212.4141, + "train_tokens_per_second": 2004.462 + }, + { + "epoch": 0.016363636363636365, + "grad_norm": 0.4413582384586334, + "learning_rate": 1.5757575757575756e-05, + "loss": 1.4823509454727173, + "num_input_tokens_seen": 442152, + "step": 27, + "train_runtime": 220.5358, + "train_tokens_per_second": 2004.899 + }, + { + "epoch": 0.01696969696969697, + "grad_norm": 0.45630744099617004, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.4595903158187866, + "num_input_tokens_seen": 458528, + "step": 28, + "train_runtime": 228.6622, + "train_tokens_per_second": 2005.264 + }, + { + "epoch": 0.017575757575757574, + "grad_norm": 0.457793653011322, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.4525277614593506, + "num_input_tokens_seen": 474904, + "step": 29, + "train_runtime": 236.7808, + "train_tokens_per_second": 2005.67 + }, + { + "epoch": 0.01818181818181818, + "grad_norm": 0.4766552150249481, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.425432801246643, + "num_input_tokens_seen": 491280, + "step": 30, + "train_runtime": 244.8928, + "train_tokens_per_second": 2006.102 + }, + { + "epoch": 0.018787878787878787, + "grad_norm": 0.5165067911148071, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.3646923303604126, + "num_input_tokens_seen": 507656, + "step": 31, + "train_runtime": 253.0027, + "train_tokens_per_second": 2006.524 + }, + { + "epoch": 0.019393939393939394, + "grad_norm": 0.4833853244781494, + "learning_rate": 1.878787878787879e-05, + "loss": 1.3608993291854858, + "num_input_tokens_seen": 524032, + "step": 32, + "train_runtime": 261.1128, + "train_tokens_per_second": 2006.918 + }, + { + "epoch": 0.02, + "grad_norm": 0.49612611532211304, + "learning_rate": 1.9393939393939395e-05, + "loss": 1.350702166557312, + "num_input_tokens_seen": 540408, + "step": 33, + "train_runtime": 269.2241, + "train_tokens_per_second": 2007.28 + }, + { + "epoch": 0.020606060606060607, + "grad_norm": 0.5136600732803345, + "learning_rate": 2e-05, + "loss": 1.291304349899292, + "num_input_tokens_seen": 556784, + "step": 34, + "train_runtime": 277.336, + "train_tokens_per_second": 2007.615 + }, + { + "epoch": 0.021212121212121213, + "grad_norm": 0.5192011594772339, + "learning_rate": 2.0606060606060608e-05, + "loss": 1.2744120359420776, + "num_input_tokens_seen": 573160, + "step": 35, + "train_runtime": 285.4446, + "train_tokens_per_second": 2007.956 + }, + { + "epoch": 0.02181818181818182, + "grad_norm": 0.5397765636444092, + "learning_rate": 2.1212121212121215e-05, + "loss": 1.208145022392273, + "num_input_tokens_seen": 589536, + "step": 36, + "train_runtime": 293.5567, + "train_tokens_per_second": 2008.253 + }, + { + "epoch": 0.022424242424242423, + "grad_norm": 0.5493120551109314, + "learning_rate": 2.1818181818181818e-05, + "loss": 1.2057533264160156, + "num_input_tokens_seen": 605912, + "step": 37, + "train_runtime": 301.6704, + "train_tokens_per_second": 2008.523 + }, + { + "epoch": 0.02303030303030303, + "grad_norm": 0.5603742599487305, + "learning_rate": 2.2424242424242424e-05, + "loss": 1.1387653350830078, + "num_input_tokens_seen": 622288, + "step": 38, + "train_runtime": 309.7816, + "train_tokens_per_second": 2008.796 + }, + { + "epoch": 0.023636363636363636, + "grad_norm": 0.581070601940155, + "learning_rate": 2.3030303030303034e-05, + "loss": 1.138227939605713, + "num_input_tokens_seen": 638664, + "step": 39, + "train_runtime": 317.8926, + "train_tokens_per_second": 2009.056 + }, + { + "epoch": 0.024242424242424242, + "grad_norm": 0.5650333762168884, + "learning_rate": 2.3636363636363637e-05, + "loss": 1.1126341819763184, + "num_input_tokens_seen": 655040, + "step": 40, + "train_runtime": 326.0006, + "train_tokens_per_second": 2009.321 + }, + { + "epoch": 0.02484848484848485, + "grad_norm": 0.6228408813476562, + "learning_rate": 2.4242424242424244e-05, + "loss": 1.0580966472625732, + "num_input_tokens_seen": 671416, + "step": 41, + "train_runtime": 334.1133, + "train_tokens_per_second": 2009.546 + }, + { + "epoch": 0.025454545454545455, + "grad_norm": 0.7027150392532349, + "learning_rate": 2.4848484848484847e-05, + "loss": 1.0436644554138184, + "num_input_tokens_seen": 687792, + "step": 42, + "train_runtime": 342.2227, + "train_tokens_per_second": 2009.779 + }, + { + "epoch": 0.026060606060606062, + "grad_norm": 0.876166045665741, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.9523745775222778, + "num_input_tokens_seen": 704168, + "step": 43, + "train_runtime": 350.3357, + "train_tokens_per_second": 2009.981 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.5786903500556946, + "learning_rate": 2.6060606060606063e-05, + "loss": 0.9218084812164307, + "num_input_tokens_seen": 720544, + "step": 44, + "train_runtime": 358.4444, + "train_tokens_per_second": 2010.198 + }, + { + "epoch": 0.02727272727272727, + "grad_norm": 0.6627383828163147, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.8746504187583923, + "num_input_tokens_seen": 736920, + "step": 45, + "train_runtime": 366.5519, + "train_tokens_per_second": 2010.411 + }, + { + "epoch": 0.027878787878787878, + "grad_norm": 0.6991789937019348, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.8592554926872253, + "num_input_tokens_seen": 753296, + "step": 46, + "train_runtime": 374.6632, + "train_tokens_per_second": 2010.595 + }, + { + "epoch": 0.028484848484848484, + "grad_norm": 0.6843043565750122, + "learning_rate": 2.7878787878787883e-05, + "loss": 0.7838267683982849, + "num_input_tokens_seen": 769672, + "step": 47, + "train_runtime": 382.7718, + "train_tokens_per_second": 2010.786 + }, + { + "epoch": 0.02909090909090909, + "grad_norm": 0.6203355193138123, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7517961263656616, + "num_input_tokens_seen": 786048, + "step": 48, + "train_runtime": 390.883, + "train_tokens_per_second": 2010.955 + }, + { + "epoch": 0.029696969696969697, + "grad_norm": 0.6031985878944397, + "learning_rate": 2.909090909090909e-05, + "loss": 0.7074779272079468, + "num_input_tokens_seen": 802424, + "step": 49, + "train_runtime": 398.992, + "train_tokens_per_second": 2011.128 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 0.6645159125328064, + "learning_rate": 2.96969696969697e-05, + "loss": 0.6244415044784546, + "num_input_tokens_seen": 818800, + "step": 50, + "train_runtime": 407.1007, + "train_tokens_per_second": 2011.296 + }, + { + "epoch": 0.03090909090909091, + "grad_norm": 0.6037282943725586, + "learning_rate": 3.0303030303030306e-05, + "loss": 0.6258814334869385, + "num_input_tokens_seen": 835176, + "step": 51, + "train_runtime": 415.209, + "train_tokens_per_second": 2011.459 + }, + { + "epoch": 0.03151515151515152, + "grad_norm": 0.7840785980224609, + "learning_rate": 3.090909090909091e-05, + "loss": 0.5502547025680542, + "num_input_tokens_seen": 851552, + "step": 52, + "train_runtime": 423.3167, + "train_tokens_per_second": 2011.619 + }, + { + "epoch": 0.03212121212121212, + "grad_norm": 0.5410464406013489, + "learning_rate": 3.151515151515151e-05, + "loss": 0.4808294475078583, + "num_input_tokens_seen": 867928, + "step": 53, + "train_runtime": 431.4326, + "train_tokens_per_second": 2011.735 + }, + { + "epoch": 0.03272727272727273, + "grad_norm": 0.5532175898551941, + "learning_rate": 3.212121212121212e-05, + "loss": 0.4808656871318817, + "num_input_tokens_seen": 884304, + "step": 54, + "train_runtime": 439.5458, + "train_tokens_per_second": 2011.859 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 0.6308758854866028, + "learning_rate": 3.272727272727273e-05, + "loss": 0.4137771427631378, + "num_input_tokens_seen": 900680, + "step": 55, + "train_runtime": 447.6539, + "train_tokens_per_second": 2012.001 + }, + { + "epoch": 0.03393939393939394, + "grad_norm": 0.492653489112854, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.3654894530773163, + "num_input_tokens_seen": 917056, + "step": 56, + "train_runtime": 455.7624, + "train_tokens_per_second": 2012.136 + }, + { + "epoch": 0.034545454545454546, + "grad_norm": 0.5767380595207214, + "learning_rate": 3.3939393939393945e-05, + "loss": 0.342722088098526, + "num_input_tokens_seen": 933432, + "step": 57, + "train_runtime": 463.8713, + "train_tokens_per_second": 2012.265 + }, + { + "epoch": 0.03515151515151515, + "grad_norm": 0.5243986248970032, + "learning_rate": 3.454545454545455e-05, + "loss": 0.2960652709007263, + "num_input_tokens_seen": 949808, + "step": 58, + "train_runtime": 471.9805, + "train_tokens_per_second": 2012.388 + }, + { + "epoch": 0.03575757575757576, + "grad_norm": 0.4490169882774353, + "learning_rate": 3.515151515151515e-05, + "loss": 0.26675525307655334, + "num_input_tokens_seen": 966184, + "step": 59, + "train_runtime": 480.0912, + "train_tokens_per_second": 2012.501 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 0.4677429795265198, + "learning_rate": 3.575757575757576e-05, + "loss": 0.2512170076370239, + "num_input_tokens_seen": 982560, + "step": 60, + "train_runtime": 488.2016, + "train_tokens_per_second": 2012.611 + }, + { + "epoch": 0.03696969696969697, + "grad_norm": 0.37272387742996216, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.19348715245723724, + "num_input_tokens_seen": 998936, + "step": 61, + "train_runtime": 496.3104, + "train_tokens_per_second": 2012.724 + }, + { + "epoch": 0.037575757575757575, + "grad_norm": 0.36983442306518555, + "learning_rate": 3.6969696969696974e-05, + "loss": 0.18563911318778992, + "num_input_tokens_seen": 1015312, + "step": 62, + "train_runtime": 504.419, + "train_tokens_per_second": 2012.835 + }, + { + "epoch": 0.038181818181818185, + "grad_norm": 0.37516751885414124, + "learning_rate": 3.757575757575758e-05, + "loss": 0.16986083984375, + "num_input_tokens_seen": 1031688, + "step": 63, + "train_runtime": 512.5348, + "train_tokens_per_second": 2012.913 + }, + { + "epoch": 0.03878787878787879, + "grad_norm": 0.3174577057361603, + "learning_rate": 3.818181818181819e-05, + "loss": 0.1534540057182312, + "num_input_tokens_seen": 1048064, + "step": 64, + "train_runtime": 520.644, + "train_tokens_per_second": 2013.015 + }, + { + "epoch": 0.03939393939393939, + "grad_norm": 0.30689847469329834, + "learning_rate": 3.878787878787879e-05, + "loss": 0.14156833291053772, + "num_input_tokens_seen": 1064440, + "step": 65, + "train_runtime": 528.7787, + "train_tokens_per_second": 2013.016 + }, + { + "epoch": 0.04, + "grad_norm": 0.2671639621257782, + "learning_rate": 3.939393939393939e-05, + "loss": 0.12481589615345001, + "num_input_tokens_seen": 1080816, + "step": 66, + "train_runtime": 536.8903, + "train_tokens_per_second": 2013.104 + }, + { + "epoch": 0.040606060606060604, + "grad_norm": 0.2459305375814438, + "learning_rate": 4e-05, + "loss": 0.12609152495861053, + "num_input_tokens_seen": 1097192, + "step": 67, + "train_runtime": 545.0023, + "train_tokens_per_second": 2013.188 + }, + { + "epoch": 0.041212121212121214, + "grad_norm": 0.23298931121826172, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.10923294723033905, + "num_input_tokens_seen": 1113568, + "step": 68, + "train_runtime": 553.1113, + "train_tokens_per_second": 2013.28 + }, + { + "epoch": 0.04181818181818182, + "grad_norm": 0.22864830493927002, + "learning_rate": 4.1212121212121216e-05, + "loss": 0.10794200748205185, + "num_input_tokens_seen": 1129944, + "step": 69, + "train_runtime": 561.2215, + "train_tokens_per_second": 2013.365 + }, + { + "epoch": 0.04242424242424243, + "grad_norm": 0.2130967080593109, + "learning_rate": 4.181818181818182e-05, + "loss": 0.09509418904781342, + "num_input_tokens_seen": 1146320, + "step": 70, + "train_runtime": 569.3343, + "train_tokens_per_second": 2013.439 + }, + { + "epoch": 0.04303030303030303, + "grad_norm": 0.19734057784080505, + "learning_rate": 4.242424242424243e-05, + "loss": 0.08767769485712051, + "num_input_tokens_seen": 1162696, + "step": 71, + "train_runtime": 577.4461, + "train_tokens_per_second": 2013.514 + }, + { + "epoch": 0.04363636363636364, + "grad_norm": 0.2512868344783783, + "learning_rate": 4.303030303030303e-05, + "loss": 0.08520924299955368, + "num_input_tokens_seen": 1179072, + "step": 72, + "train_runtime": 585.5562, + "train_tokens_per_second": 2013.593 + }, + { + "epoch": 0.04424242424242424, + "grad_norm": 0.18867339193820953, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.08193657547235489, + "num_input_tokens_seen": 1195448, + "step": 73, + "train_runtime": 593.6659, + "train_tokens_per_second": 2013.671 + }, + { + "epoch": 0.044848484848484846, + "grad_norm": 0.17708271741867065, + "learning_rate": 4.4242424242424246e-05, + "loss": 0.07861079275608063, + "num_input_tokens_seen": 1211824, + "step": 74, + "train_runtime": 601.778, + "train_tokens_per_second": 2013.739 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 0.16671743988990784, + "learning_rate": 4.484848484848485e-05, + "loss": 0.07204174995422363, + "num_input_tokens_seen": 1228200, + "step": 75, + "train_runtime": 609.889, + "train_tokens_per_second": 2013.809 + }, + { + "epoch": 0.04606060606060606, + "grad_norm": 0.17388567328453064, + "learning_rate": 4.545454545454546e-05, + "loss": 0.05977003276348114, + "num_input_tokens_seen": 1244576, + "step": 76, + "train_runtime": 617.9973, + "train_tokens_per_second": 2013.886 + }, + { + "epoch": 0.04666666666666667, + "grad_norm": 0.14751967787742615, + "learning_rate": 4.606060606060607e-05, + "loss": 0.06652094423770905, + "num_input_tokens_seen": 1260952, + "step": 77, + "train_runtime": 626.1063, + "train_tokens_per_second": 2013.958 + }, + { + "epoch": 0.04727272727272727, + "grad_norm": 0.1427117884159088, + "learning_rate": 4.666666666666667e-05, + "loss": 0.05981641262769699, + "num_input_tokens_seen": 1277328, + "step": 78, + "train_runtime": 634.2178, + "train_tokens_per_second": 2014.021 + }, + { + "epoch": 0.04787878787878788, + "grad_norm": 0.16328735649585724, + "learning_rate": 4.7272727272727275e-05, + "loss": 0.059813786298036575, + "num_input_tokens_seen": 1293704, + "step": 79, + "train_runtime": 642.3361, + "train_tokens_per_second": 2014.061 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 0.15144814550876617, + "learning_rate": 4.787878787878788e-05, + "loss": 0.05687074735760689, + "num_input_tokens_seen": 1310080, + "step": 80, + "train_runtime": 650.4589, + "train_tokens_per_second": 2014.086 + }, + { + "epoch": 0.04909090909090909, + "grad_norm": 0.19531840085983276, + "learning_rate": 4.848484848484849e-05, + "loss": 0.06199571490287781, + "num_input_tokens_seen": 1326456, + "step": 81, + "train_runtime": 658.5803, + "train_tokens_per_second": 2014.114 + }, + { + "epoch": 0.0496969696969697, + "grad_norm": 0.11535873264074326, + "learning_rate": 4.909090909090909e-05, + "loss": 0.05434288829565048, + "num_input_tokens_seen": 1342832, + "step": 82, + "train_runtime": 666.7006, + "train_tokens_per_second": 2014.145 + }, + { + "epoch": 0.0503030303030303, + "grad_norm": 0.17366129159927368, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.0584072507917881, + "num_input_tokens_seen": 1359208, + "step": 83, + "train_runtime": 674.8206, + "train_tokens_per_second": 2014.177 + }, + { + "epoch": 0.05090909090909091, + "grad_norm": 0.16601437330245972, + "learning_rate": 5.030303030303031e-05, + "loss": 0.055472493171691895, + "num_input_tokens_seen": 1375584, + "step": 84, + "train_runtime": 682.9407, + "train_tokens_per_second": 2014.207 + }, + { + "epoch": 0.051515151515151514, + "grad_norm": 0.12125150859355927, + "learning_rate": 5.090909090909091e-05, + "loss": 0.04972580820322037, + "num_input_tokens_seen": 1391960, + "step": 85, + "train_runtime": 691.0602, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.052121212121212124, + "grad_norm": 0.10404529422521591, + "learning_rate": 5.151515151515152e-05, + "loss": 0.04972917586565018, + "num_input_tokens_seen": 1408336, + "step": 86, + "train_runtime": 699.177, + "train_tokens_per_second": 2014.277 + }, + { + "epoch": 0.05272727272727273, + "grad_norm": 0.19109457731246948, + "learning_rate": 5.212121212121213e-05, + "loss": 0.04995625838637352, + "num_input_tokens_seen": 1424712, + "step": 87, + "train_runtime": 707.2957, + "train_tokens_per_second": 2014.309 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.14529068768024445, + "learning_rate": 5.272727272727272e-05, + "loss": 0.044690582901239395, + "num_input_tokens_seen": 1441088, + "step": 88, + "train_runtime": 715.4144, + "train_tokens_per_second": 2014.34 + }, + { + "epoch": 0.05393939393939394, + "grad_norm": 0.12216632813215256, + "learning_rate": 5.333333333333333e-05, + "loss": 0.04490099102258682, + "num_input_tokens_seen": 1457464, + "step": 89, + "train_runtime": 723.5369, + "train_tokens_per_second": 2014.36 + }, + { + "epoch": 0.05454545454545454, + "grad_norm": 0.09520085901021957, + "learning_rate": 5.393939393939394e-05, + "loss": 0.039979420602321625, + "num_input_tokens_seen": 1473840, + "step": 90, + "train_runtime": 731.6566, + "train_tokens_per_second": 2014.388 + }, + { + "epoch": 0.05515151515151515, + "grad_norm": 0.13766801357269287, + "learning_rate": 5.4545454545454546e-05, + "loss": 0.04609033092856407, + "num_input_tokens_seen": 1490216, + "step": 91, + "train_runtime": 739.7761, + "train_tokens_per_second": 2014.415 + }, + { + "epoch": 0.055757575757575756, + "grad_norm": 0.13074332475662231, + "learning_rate": 5.5151515151515156e-05, + "loss": 0.040276553481817245, + "num_input_tokens_seen": 1506592, + "step": 92, + "train_runtime": 747.8977, + "train_tokens_per_second": 2014.436 + }, + { + "epoch": 0.056363636363636366, + "grad_norm": 0.11333464086055756, + "learning_rate": 5.5757575757575766e-05, + "loss": 0.03974860906600952, + "num_input_tokens_seen": 1522968, + "step": 93, + "train_runtime": 756.018, + "train_tokens_per_second": 2014.46 + }, + { + "epoch": 0.05696969696969697, + "grad_norm": 0.09708438813686371, + "learning_rate": 5.636363636363636e-05, + "loss": 0.03745771571993828, + "num_input_tokens_seen": 1539344, + "step": 94, + "train_runtime": 764.1373, + "train_tokens_per_second": 2014.486 + }, + { + "epoch": 0.05757575757575758, + "grad_norm": 0.13791343569755554, + "learning_rate": 5.696969696969697e-05, + "loss": 0.04385356977581978, + "num_input_tokens_seen": 1555720, + "step": 95, + "train_runtime": 772.256, + "train_tokens_per_second": 2014.513 + }, + { + "epoch": 0.05818181818181818, + "grad_norm": 0.15427744388580322, + "learning_rate": 5.757575757575758e-05, + "loss": 0.0388864129781723, + "num_input_tokens_seen": 1572096, + "step": 96, + "train_runtime": 780.3755, + "train_tokens_per_second": 2014.538 + }, + { + "epoch": 0.058787878787878785, + "grad_norm": 0.11847083270549774, + "learning_rate": 5.818181818181818e-05, + "loss": 0.033506229519844055, + "num_input_tokens_seen": 1588472, + "step": 97, + "train_runtime": 788.4951, + "train_tokens_per_second": 2014.562 + }, + { + "epoch": 0.059393939393939395, + "grad_norm": 0.10092757642269135, + "learning_rate": 5.878787878787879e-05, + "loss": 0.03343300521373749, + "num_input_tokens_seen": 1604848, + "step": 98, + "train_runtime": 796.6166, + "train_tokens_per_second": 2014.58 + }, + { + "epoch": 0.06, + "grad_norm": 0.10452481359243393, + "learning_rate": 5.93939393939394e-05, + "loss": 0.036986708641052246, + "num_input_tokens_seen": 1621224, + "step": 99, + "train_runtime": 804.7379, + "train_tokens_per_second": 2014.599 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 0.08679923415184021, + "learning_rate": 6e-05, + "loss": 0.03295439854264259, + "num_input_tokens_seen": 1637600, + "step": 100, + "train_runtime": 812.8578, + "train_tokens_per_second": 2014.62 + }, + { + "epoch": 0.06121212121212121, + "grad_norm": 0.1115456148982048, + "learning_rate": 6.060606060606061e-05, + "loss": 0.03657374531030655, + "num_input_tokens_seen": 1653976, + "step": 101, + "train_runtime": 821.8569, + "train_tokens_per_second": 2012.487 + }, + { + "epoch": 0.06181818181818182, + "grad_norm": 0.08771228045225143, + "learning_rate": 6.121212121212121e-05, + "loss": 0.0364333875477314, + "num_input_tokens_seen": 1670352, + "step": 102, + "train_runtime": 829.9743, + "train_tokens_per_second": 2012.535 + }, + { + "epoch": 0.062424242424242424, + "grad_norm": 0.08961863070726395, + "learning_rate": 6.181818181818182e-05, + "loss": 0.03239607438445091, + "num_input_tokens_seen": 1686728, + "step": 103, + "train_runtime": 838.0926, + "train_tokens_per_second": 2012.58 + }, + { + "epoch": 0.06303030303030303, + "grad_norm": 0.10658557713031769, + "learning_rate": 6.242424242424243e-05, + "loss": 0.035685982555150986, + "num_input_tokens_seen": 1703104, + "step": 104, + "train_runtime": 846.2114, + "train_tokens_per_second": 2012.622 + }, + { + "epoch": 0.06363636363636363, + "grad_norm": 0.07003116607666016, + "learning_rate": 6.303030303030302e-05, + "loss": 0.03269325941801071, + "num_input_tokens_seen": 1719480, + "step": 105, + "train_runtime": 854.3347, + "train_tokens_per_second": 2012.654 + }, + { + "epoch": 0.06424242424242424, + "grad_norm": 0.0889090895652771, + "learning_rate": 6.363636363636364e-05, + "loss": 0.030469391494989395, + "num_input_tokens_seen": 1735856, + "step": 106, + "train_runtime": 862.4518, + "train_tokens_per_second": 2012.699 + }, + { + "epoch": 0.06484848484848485, + "grad_norm": 0.12026192247867584, + "learning_rate": 6.424242424242424e-05, + "loss": 0.032258037477731705, + "num_input_tokens_seen": 1752232, + "step": 107, + "train_runtime": 870.5683, + "train_tokens_per_second": 2012.745 + }, + { + "epoch": 0.06545454545454546, + "grad_norm": 0.06484470516443253, + "learning_rate": 6.484848484848485e-05, + "loss": 0.026622053235769272, + "num_input_tokens_seen": 1768608, + "step": 108, + "train_runtime": 878.6857, + "train_tokens_per_second": 2012.788 + }, + { + "epoch": 0.06606060606060606, + "grad_norm": 0.09636206179857254, + "learning_rate": 6.545454545454546e-05, + "loss": 0.03460235893726349, + "num_input_tokens_seen": 1784984, + "step": 109, + "train_runtime": 886.8033, + "train_tokens_per_second": 2012.83 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.10380304604768753, + "learning_rate": 6.606060606060607e-05, + "loss": 0.030300751328468323, + "num_input_tokens_seen": 1801360, + "step": 110, + "train_runtime": 894.9204, + "train_tokens_per_second": 2012.872 + }, + { + "epoch": 0.06727272727272728, + "grad_norm": 0.07361245900392532, + "learning_rate": 6.666666666666667e-05, + "loss": 0.03334670513868332, + "num_input_tokens_seen": 1817736, + "step": 111, + "train_runtime": 903.0383, + "train_tokens_per_second": 2012.911 + }, + { + "epoch": 0.06787878787878789, + "grad_norm": 0.06159133464097977, + "learning_rate": 6.727272727272727e-05, + "loss": 0.026774805039167404, + "num_input_tokens_seen": 1834112, + "step": 112, + "train_runtime": 911.1548, + "train_tokens_per_second": 2012.953 + }, + { + "epoch": 0.06848484848484848, + "grad_norm": 0.08236563950777054, + "learning_rate": 6.787878787878789e-05, + "loss": 0.02836509235203266, + "num_input_tokens_seen": 1850488, + "step": 113, + "train_runtime": 919.2722, + "train_tokens_per_second": 2012.992 + }, + { + "epoch": 0.06909090909090909, + "grad_norm": 0.06620238721370697, + "learning_rate": 6.848484848484848e-05, + "loss": 0.027467701584100723, + "num_input_tokens_seen": 1866864, + "step": 114, + "train_runtime": 927.3888, + "train_tokens_per_second": 2013.033 + }, + { + "epoch": 0.0696969696969697, + "grad_norm": 0.06323213130235672, + "learning_rate": 6.90909090909091e-05, + "loss": 0.02602136880159378, + "num_input_tokens_seen": 1883240, + "step": 115, + "train_runtime": 935.5053, + "train_tokens_per_second": 2013.072 + }, + { + "epoch": 0.0703030303030303, + "grad_norm": 0.06442830711603165, + "learning_rate": 6.96969696969697e-05, + "loss": 0.024133116006851196, + "num_input_tokens_seen": 1899616, + "step": 116, + "train_runtime": 943.6216, + "train_tokens_per_second": 2013.112 + }, + { + "epoch": 0.07090909090909091, + "grad_norm": 0.057056326419115067, + "learning_rate": 7.03030303030303e-05, + "loss": 0.029189810156822205, + "num_input_tokens_seen": 1915992, + "step": 117, + "train_runtime": 951.74, + "train_tokens_per_second": 2013.147 + }, + { + "epoch": 0.07151515151515152, + "grad_norm": 0.067554771900177, + "learning_rate": 7.090909090909092e-05, + "loss": 0.026694156229496002, + "num_input_tokens_seen": 1932368, + "step": 118, + "train_runtime": 959.8558, + "train_tokens_per_second": 2013.186 + }, + { + "epoch": 0.07212121212121213, + "grad_norm": 0.14906729757785797, + "learning_rate": 7.151515151515152e-05, + "loss": 0.027481166645884514, + "num_input_tokens_seen": 1948744, + "step": 119, + "train_runtime": 967.9726, + "train_tokens_per_second": 2013.222 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 0.08957181125879288, + "learning_rate": 7.212121212121213e-05, + "loss": 0.026221584528684616, + "num_input_tokens_seen": 1965120, + "step": 120, + "train_runtime": 976.0892, + "train_tokens_per_second": 2013.259 + }, + { + "epoch": 0.07333333333333333, + "grad_norm": 0.06401059031486511, + "learning_rate": 7.272727272727273e-05, + "loss": 0.024882640689611435, + "num_input_tokens_seen": 1981496, + "step": 121, + "train_runtime": 984.2063, + "train_tokens_per_second": 2013.293 + }, + { + "epoch": 0.07393939393939394, + "grad_norm": 0.08041027188301086, + "learning_rate": 7.333333333333333e-05, + "loss": 0.02306070551276207, + "num_input_tokens_seen": 1997872, + "step": 122, + "train_runtime": 992.3345, + "train_tokens_per_second": 2013.305 + }, + { + "epoch": 0.07454545454545454, + "grad_norm": 0.12150601297616959, + "learning_rate": 7.393939393939395e-05, + "loss": 0.024561185389757156, + "num_input_tokens_seen": 2014248, + "step": 123, + "train_runtime": 1000.452, + "train_tokens_per_second": 2013.338 + }, + { + "epoch": 0.07515151515151515, + "grad_norm": 0.24074473977088928, + "learning_rate": 7.454545454545455e-05, + "loss": 0.027396628633141518, + "num_input_tokens_seen": 2030624, + "step": 124, + "train_runtime": 1008.5688, + "train_tokens_per_second": 2013.372 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.05276267230510712, + "learning_rate": 7.515151515151515e-05, + "loss": 0.024067046120762825, + "num_input_tokens_seen": 2047000, + "step": 125, + "train_runtime": 1016.6862, + "train_tokens_per_second": 2013.404 + }, + { + "epoch": 0.07636363636363637, + "grad_norm": 0.17272238433361053, + "learning_rate": 7.575757575757576e-05, + "loss": 0.023468442261219025, + "num_input_tokens_seen": 2063376, + "step": 126, + "train_runtime": 1024.8042, + "train_tokens_per_second": 2013.434 + }, + { + "epoch": 0.07696969696969697, + "grad_norm": 0.3582988977432251, + "learning_rate": 7.636363636363637e-05, + "loss": 0.027403943240642548, + "num_input_tokens_seen": 2079752, + "step": 127, + "train_runtime": 1032.9345, + "train_tokens_per_second": 2013.44 + }, + { + "epoch": 0.07757575757575758, + "grad_norm": 0.0781882107257843, + "learning_rate": 7.696969696969696e-05, + "loss": 0.023713622242212296, + "num_input_tokens_seen": 2096128, + "step": 128, + "train_runtime": 1041.056, + "train_tokens_per_second": 2013.463 + }, + { + "epoch": 0.07818181818181819, + "grad_norm": 0.07272130995988846, + "learning_rate": 7.757575757575758e-05, + "loss": 0.022761020809412003, + "num_input_tokens_seen": 2112504, + "step": 129, + "train_runtime": 1049.1772, + "train_tokens_per_second": 2013.486 + }, + { + "epoch": 0.07878787878787878, + "grad_norm": 0.2158210277557373, + "learning_rate": 7.818181818181818e-05, + "loss": 0.024013228714466095, + "num_input_tokens_seen": 2128880, + "step": 130, + "train_runtime": 1057.2975, + "train_tokens_per_second": 2013.511 + }, + { + "epoch": 0.07939393939393939, + "grad_norm": 0.586162269115448, + "learning_rate": 7.878787878787879e-05, + "loss": 0.022834377363324165, + "num_input_tokens_seen": 2145256, + "step": 131, + "train_runtime": 1065.4164, + "train_tokens_per_second": 2013.538 + }, + { + "epoch": 0.08, + "grad_norm": 0.323000431060791, + "learning_rate": 7.93939393939394e-05, + "loss": 0.022654253989458084, + "num_input_tokens_seen": 2161632, + "step": 132, + "train_runtime": 1073.5352, + "train_tokens_per_second": 2013.564 + }, + { + "epoch": 0.08060606060606061, + "grad_norm": 0.08159562945365906, + "learning_rate": 8e-05, + "loss": 0.02390367165207863, + "num_input_tokens_seen": 2178008, + "step": 133, + "train_runtime": 1081.6528, + "train_tokens_per_second": 2013.593 + }, + { + "epoch": 0.08121212121212121, + "grad_norm": 0.7155167460441589, + "learning_rate": 8.060606060606061e-05, + "loss": 0.022787289693951607, + "num_input_tokens_seen": 2194384, + "step": 134, + "train_runtime": 1089.7709, + "train_tokens_per_second": 2013.619 + }, + { + "epoch": 0.08181818181818182, + "grad_norm": 0.08167142421007156, + "learning_rate": 8.121212121212121e-05, + "loss": 0.02184353396296501, + "num_input_tokens_seen": 2210760, + "step": 135, + "train_runtime": 1097.8902, + "train_tokens_per_second": 2013.644 + }, + { + "epoch": 0.08242424242424243, + "grad_norm": 0.47277864813804626, + "learning_rate": 8.181818181818183e-05, + "loss": 0.02624150738120079, + "num_input_tokens_seen": 2227136, + "step": 136, + "train_runtime": 1106.0079, + "train_tokens_per_second": 2013.671 + }, + { + "epoch": 0.08303030303030302, + "grad_norm": 0.07428373396396637, + "learning_rate": 8.242424242424243e-05, + "loss": 0.02352747693657875, + "num_input_tokens_seen": 2243512, + "step": 137, + "train_runtime": 1114.1326, + "train_tokens_per_second": 2013.685 + }, + { + "epoch": 0.08363636363636363, + "grad_norm": 0.47124460339546204, + "learning_rate": 8.303030303030304e-05, + "loss": 0.025087552145123482, + "num_input_tokens_seen": 2259888, + "step": 138, + "train_runtime": 1122.2501, + "train_tokens_per_second": 2013.712 + }, + { + "epoch": 0.08424242424242424, + "grad_norm": 0.2430545538663864, + "learning_rate": 8.363636363636364e-05, + "loss": 0.024803292006254196, + "num_input_tokens_seen": 2276264, + "step": 139, + "train_runtime": 1130.3676, + "train_tokens_per_second": 2013.738 + }, + { + "epoch": 0.08484848484848485, + "grad_norm": 0.08046893775463104, + "learning_rate": 8.424242424242424e-05, + "loss": 0.022827964276075363, + "num_input_tokens_seen": 2292640, + "step": 140, + "train_runtime": 1138.4851, + "train_tokens_per_second": 2013.764 + }, + { + "epoch": 0.08545454545454545, + "grad_norm": 0.15526282787322998, + "learning_rate": 8.484848484848486e-05, + "loss": 0.02164369635283947, + "num_input_tokens_seen": 2309016, + "step": 141, + "train_runtime": 1146.6046, + "train_tokens_per_second": 2013.786 + }, + { + "epoch": 0.08606060606060606, + "grad_norm": 0.0912376195192337, + "learning_rate": 8.545454545454545e-05, + "loss": 0.0223920289427042, + "num_input_tokens_seen": 2325392, + "step": 142, + "train_runtime": 1154.7226, + "train_tokens_per_second": 2013.81 + }, + { + "epoch": 0.08666666666666667, + "grad_norm": 0.08407703042030334, + "learning_rate": 8.606060606060606e-05, + "loss": 0.022693689912557602, + "num_input_tokens_seen": 2341768, + "step": 143, + "train_runtime": 1162.8406, + "train_tokens_per_second": 2013.834 + }, + { + "epoch": 0.08727272727272728, + "grad_norm": 0.07187625020742416, + "learning_rate": 8.666666666666667e-05, + "loss": 0.020523108541965485, + "num_input_tokens_seen": 2358144, + "step": 144, + "train_runtime": 1170.9602, + "train_tokens_per_second": 2013.855 + }, + { + "epoch": 0.08787878787878788, + "grad_norm": 0.08785762637853622, + "learning_rate": 8.727272727272727e-05, + "loss": 0.023188354447484016, + "num_input_tokens_seen": 2374520, + "step": 145, + "train_runtime": 1179.0803, + "train_tokens_per_second": 2013.875 + }, + { + "epoch": 0.08848484848484849, + "grad_norm": 0.06223875284194946, + "learning_rate": 8.787878787878789e-05, + "loss": 0.019059190526604652, + "num_input_tokens_seen": 2390896, + "step": 146, + "train_runtime": 1187.2017, + "train_tokens_per_second": 2013.892 + }, + { + "epoch": 0.0890909090909091, + "grad_norm": 0.09552452713251114, + "learning_rate": 8.848484848484849e-05, + "loss": 0.020222101360559464, + "num_input_tokens_seen": 2407272, + "step": 147, + "train_runtime": 1195.3217, + "train_tokens_per_second": 2013.911 + }, + { + "epoch": 0.08969696969696969, + "grad_norm": 0.07248228043317795, + "learning_rate": 8.90909090909091e-05, + "loss": 0.020538993179798126, + "num_input_tokens_seen": 2423648, + "step": 148, + "train_runtime": 1203.4411, + "train_tokens_per_second": 2013.932 + }, + { + "epoch": 0.0903030303030303, + "grad_norm": 0.08636505901813507, + "learning_rate": 8.96969696969697e-05, + "loss": 0.020172201097011566, + "num_input_tokens_seen": 2440024, + "step": 149, + "train_runtime": 1211.5609, + "train_tokens_per_second": 2013.951 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 0.0678800642490387, + "learning_rate": 9.030303030303031e-05, + "loss": 0.01839592307806015, + "num_input_tokens_seen": 2456400, + "step": 150, + "train_runtime": 1219.679, + "train_tokens_per_second": 2013.973 + }, + { + "epoch": 0.09151515151515152, + "grad_norm": 0.08543987572193146, + "learning_rate": 9.090909090909092e-05, + "loss": 0.02213234454393387, + "num_input_tokens_seen": 2472776, + "step": 151, + "train_runtime": 1227.7971, + "train_tokens_per_second": 2013.994 + }, + { + "epoch": 0.09212121212121212, + "grad_norm": 0.06894785910844803, + "learning_rate": 9.151515151515152e-05, + "loss": 0.019493641331791878, + "num_input_tokens_seen": 2489152, + "step": 152, + "train_runtime": 1235.9161, + "train_tokens_per_second": 2014.014 + }, + { + "epoch": 0.09272727272727273, + "grad_norm": 0.0796777755022049, + "learning_rate": 9.212121212121214e-05, + "loss": 0.019212841987609863, + "num_input_tokens_seen": 2505528, + "step": 153, + "train_runtime": 1244.0335, + "train_tokens_per_second": 2014.036 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.03816372528672218, + "learning_rate": 9.272727272727273e-05, + "loss": 0.018845168873667717, + "num_input_tokens_seen": 2521904, + "step": 154, + "train_runtime": 1252.1501, + "train_tokens_per_second": 2014.059 + }, + { + "epoch": 0.09393939393939393, + "grad_norm": 0.05867328122258186, + "learning_rate": 9.333333333333334e-05, + "loss": 0.020137080922722816, + "num_input_tokens_seen": 2538280, + "step": 155, + "train_runtime": 1260.2669, + "train_tokens_per_second": 2014.081 + }, + { + "epoch": 0.09454545454545454, + "grad_norm": 0.12616179883480072, + "learning_rate": 9.393939393939395e-05, + "loss": 0.023685304448008537, + "num_input_tokens_seen": 2554656, + "step": 156, + "train_runtime": 1268.385, + "train_tokens_per_second": 2014.101 + }, + { + "epoch": 0.09515151515151515, + "grad_norm": 0.06801550090312958, + "learning_rate": 9.454545454545455e-05, + "loss": 0.021116768941283226, + "num_input_tokens_seen": 2571032, + "step": 157, + "train_runtime": 1276.5029, + "train_tokens_per_second": 2014.122 + }, + { + "epoch": 0.09575757575757576, + "grad_norm": 0.05668250098824501, + "learning_rate": 9.515151515151515e-05, + "loss": 0.019319312646985054, + "num_input_tokens_seen": 2587408, + "step": 158, + "train_runtime": 1284.6181, + "train_tokens_per_second": 2014.146 + }, + { + "epoch": 0.09636363636363636, + "grad_norm": 0.05750446021556854, + "learning_rate": 9.575757575757576e-05, + "loss": 0.01928100548684597, + "num_input_tokens_seen": 2603784, + "step": 159, + "train_runtime": 1292.7386, + "train_tokens_per_second": 2014.161 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 0.08826832473278046, + "learning_rate": 9.636363636363637e-05, + "loss": 0.02036631852388382, + "num_input_tokens_seen": 2620160, + "step": 160, + "train_runtime": 1300.8562, + "train_tokens_per_second": 2014.181 + }, + { + "epoch": 0.09757575757575758, + "grad_norm": 0.05680972710251808, + "learning_rate": 9.696969696969698e-05, + "loss": 0.017789499834179878, + "num_input_tokens_seen": 2636536, + "step": 161, + "train_runtime": 1308.9737, + "train_tokens_per_second": 2014.201 + }, + { + "epoch": 0.09818181818181818, + "grad_norm": 0.04641514644026756, + "learning_rate": 9.757575757575758e-05, + "loss": 0.02048567123711109, + "num_input_tokens_seen": 2652912, + "step": 162, + "train_runtime": 1317.092, + "train_tokens_per_second": 2014.219 + }, + { + "epoch": 0.09878787878787879, + "grad_norm": 0.04058675095438957, + "learning_rate": 9.818181818181818e-05, + "loss": 0.019105076789855957, + "num_input_tokens_seen": 2669288, + "step": 163, + "train_runtime": 1325.2097, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.0993939393939394, + "grad_norm": 0.08786831051111221, + "learning_rate": 9.87878787878788e-05, + "loss": 0.020488332957029343, + "num_input_tokens_seen": 2685664, + "step": 164, + "train_runtime": 1333.3352, + "train_tokens_per_second": 2014.245 + }, + { + "epoch": 0.1, + "grad_norm": 0.05097790062427521, + "learning_rate": 9.939393939393939e-05, + "loss": 0.018979694694280624, + "num_input_tokens_seen": 2702040, + "step": 165, + "train_runtime": 1341.4534, + "train_tokens_per_second": 2014.263 + }, + { + "epoch": 0.1006060606060606, + "grad_norm": 0.05220174416899681, + "learning_rate": 0.0001, + "loss": 0.017788853496313095, + "num_input_tokens_seen": 2718416, + "step": 166, + "train_runtime": 1349.5711, + "train_tokens_per_second": 2014.281 + }, + { + "epoch": 0.10121212121212121, + "grad_norm": 0.07084593176841736, + "learning_rate": 9.999999907529869e-05, + "loss": 0.017644576728343964, + "num_input_tokens_seen": 2734792, + "step": 167, + "train_runtime": 1357.6892, + "train_tokens_per_second": 2014.299 + }, + { + "epoch": 0.10181818181818182, + "grad_norm": 0.058325134217739105, + "learning_rate": 9.999999630119479e-05, + "loss": 0.01890077441930771, + "num_input_tokens_seen": 2751168, + "step": 168, + "train_runtime": 1365.8058, + "train_tokens_per_second": 2014.319 + }, + { + "epoch": 0.10242424242424242, + "grad_norm": 0.06277347356081009, + "learning_rate": 9.999999167768837e-05, + "loss": 0.020100781694054604, + "num_input_tokens_seen": 2767544, + "step": 169, + "train_runtime": 1373.9351, + "train_tokens_per_second": 2014.319 + }, + { + "epoch": 0.10303030303030303, + "grad_norm": 0.07524619996547699, + "learning_rate": 9.999998520477966e-05, + "loss": 0.016615130007267, + "num_input_tokens_seen": 2783920, + "step": 170, + "train_runtime": 1382.0536, + "train_tokens_per_second": 2014.336 + }, + { + "epoch": 0.10363636363636364, + "grad_norm": 0.07865840196609497, + "learning_rate": 9.999997688246885e-05, + "loss": 0.02175009250640869, + "num_input_tokens_seen": 2800296, + "step": 171, + "train_runtime": 1390.173, + "train_tokens_per_second": 2014.351 + }, + { + "epoch": 0.10424242424242425, + "grad_norm": 0.10437590628862381, + "learning_rate": 9.999996671075626e-05, + "loss": 0.021732885390520096, + "num_input_tokens_seen": 2816672, + "step": 172, + "train_runtime": 1398.29, + "train_tokens_per_second": 2014.369 + }, + { + "epoch": 0.10484848484848484, + "grad_norm": 0.09102741628885269, + "learning_rate": 9.99999546896423e-05, + "loss": 0.019160069525241852, + "num_input_tokens_seen": 2833048, + "step": 173, + "train_runtime": 1406.4092, + "train_tokens_per_second": 2014.384 + }, + { + "epoch": 0.10545454545454545, + "grad_norm": 0.09274180978536606, + "learning_rate": 9.999994081912736e-05, + "loss": 0.020909177139401436, + "num_input_tokens_seen": 2849424, + "step": 174, + "train_runtime": 1414.5329, + "train_tokens_per_second": 2014.392 + }, + { + "epoch": 0.10606060606060606, + "grad_norm": 0.0448119193315506, + "learning_rate": 9.999992509921199e-05, + "loss": 0.018382754176855087, + "num_input_tokens_seen": 2865800, + "step": 175, + "train_runtime": 1422.6511, + "train_tokens_per_second": 2014.408 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.04945825785398483, + "learning_rate": 9.999990752989675e-05, + "loss": 0.01783941313624382, + "num_input_tokens_seen": 2882176, + "step": 176, + "train_runtime": 1430.7704, + "train_tokens_per_second": 2014.422 + }, + { + "epoch": 0.10727272727272727, + "grad_norm": 0.04921802878379822, + "learning_rate": 9.999988811118231e-05, + "loss": 0.01793338730931282, + "num_input_tokens_seen": 2898552, + "step": 177, + "train_runtime": 1438.89, + "train_tokens_per_second": 2014.436 + }, + { + "epoch": 0.10787878787878788, + "grad_norm": 0.05301757901906967, + "learning_rate": 9.999986684306937e-05, + "loss": 0.01700768433511257, + "num_input_tokens_seen": 2914928, + "step": 178, + "train_runtime": 1447.011, + "train_tokens_per_second": 2014.448 + }, + { + "epoch": 0.10848484848484849, + "grad_norm": 0.0539541132748127, + "learning_rate": 9.999984372555874e-05, + "loss": 0.01774643547832966, + "num_input_tokens_seen": 2931304, + "step": 179, + "train_runtime": 1455.1319, + "train_tokens_per_second": 2014.459 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 0.046017974615097046, + "learning_rate": 9.999981875865125e-05, + "loss": 0.016473708674311638, + "num_input_tokens_seen": 2947680, + "step": 180, + "train_runtime": 1463.2551, + "train_tokens_per_second": 2014.468 + }, + { + "epoch": 0.1096969696969697, + "grad_norm": 0.05201786011457443, + "learning_rate": 9.999979194234786e-05, + "loss": 0.019079631194472313, + "num_input_tokens_seen": 2964056, + "step": 181, + "train_runtime": 1471.3776, + "train_tokens_per_second": 2014.477 + }, + { + "epoch": 0.1103030303030303, + "grad_norm": 0.07819167524576187, + "learning_rate": 9.99997632766495e-05, + "loss": 0.018508095294237137, + "num_input_tokens_seen": 2980432, + "step": 182, + "train_runtime": 1479.496, + "train_tokens_per_second": 2014.491 + }, + { + "epoch": 0.11090909090909092, + "grad_norm": 0.04773807153105736, + "learning_rate": 9.999973276155727e-05, + "loss": 0.016029708087444305, + "num_input_tokens_seen": 2996808, + "step": 183, + "train_runtime": 1487.6149, + "train_tokens_per_second": 2014.505 + }, + { + "epoch": 0.11151515151515151, + "grad_norm": 0.054091572761535645, + "learning_rate": 9.999970039707232e-05, + "loss": 0.01906082220375538, + "num_input_tokens_seen": 3013184, + "step": 184, + "train_runtime": 1495.7326, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.11212121212121212, + "grad_norm": 0.03870342671871185, + "learning_rate": 9.999966618319581e-05, + "loss": 0.01634303852915764, + "num_input_tokens_seen": 3029560, + "step": 185, + "train_runtime": 1503.8521, + "train_tokens_per_second": 2014.533 + }, + { + "epoch": 0.11272727272727273, + "grad_norm": 0.04409291222691536, + "learning_rate": 9.999963011992902e-05, + "loss": 0.016504261642694473, + "num_input_tokens_seen": 3045936, + "step": 186, + "train_runtime": 1511.9705, + "train_tokens_per_second": 2014.547 + }, + { + "epoch": 0.11333333333333333, + "grad_norm": 0.037538424134254456, + "learning_rate": 9.999959220727327e-05, + "loss": 0.016254613175988197, + "num_input_tokens_seen": 3062312, + "step": 187, + "train_runtime": 1520.0898, + "train_tokens_per_second": 2014.56 + }, + { + "epoch": 0.11393939393939394, + "grad_norm": 0.0896935984492302, + "learning_rate": 9.999955244522999e-05, + "loss": 0.016761597245931625, + "num_input_tokens_seen": 3078688, + "step": 188, + "train_runtime": 1528.2094, + "train_tokens_per_second": 2014.572 + }, + { + "epoch": 0.11454545454545455, + "grad_norm": 0.10176566988229752, + "learning_rate": 9.999951083380062e-05, + "loss": 0.01988411694765091, + "num_input_tokens_seen": 3095064, + "step": 189, + "train_runtime": 1536.333, + "train_tokens_per_second": 2014.579 + }, + { + "epoch": 0.11515151515151516, + "grad_norm": 0.039956171065568924, + "learning_rate": 9.999946737298674e-05, + "loss": 0.015326369553804398, + "num_input_tokens_seen": 3111440, + "step": 190, + "train_runtime": 1544.4503, + "train_tokens_per_second": 2014.594 + }, + { + "epoch": 0.11575757575757575, + "grad_norm": 0.06942013651132584, + "learning_rate": 9.99994220627899e-05, + "loss": 0.017792224884033203, + "num_input_tokens_seen": 3127816, + "step": 191, + "train_runtime": 1552.5689, + "train_tokens_per_second": 2014.607 + }, + { + "epoch": 0.11636363636363636, + "grad_norm": 0.06119908019900322, + "learning_rate": 9.999937490321182e-05, + "loss": 0.016535507515072823, + "num_input_tokens_seen": 3144192, + "step": 192, + "train_runtime": 1560.6857, + "train_tokens_per_second": 2014.622 + }, + { + "epoch": 0.11696969696969697, + "grad_norm": 0.07336534559726715, + "learning_rate": 9.999932589425423e-05, + "loss": 0.015493718907237053, + "num_input_tokens_seen": 3160568, + "step": 193, + "train_runtime": 1568.8033, + "train_tokens_per_second": 2014.636 + }, + { + "epoch": 0.11757575757575757, + "grad_norm": 0.03818663954734802, + "learning_rate": 9.999927503591896e-05, + "loss": 0.017348209396004677, + "num_input_tokens_seen": 3176944, + "step": 194, + "train_runtime": 1576.9206, + "train_tokens_per_second": 2014.651 + }, + { + "epoch": 0.11818181818181818, + "grad_norm": 0.028583593666553497, + "learning_rate": 9.999922232820785e-05, + "loss": 0.014952014200389385, + "num_input_tokens_seen": 3193320, + "step": 195, + "train_runtime": 1585.0393, + "train_tokens_per_second": 2014.663 + }, + { + "epoch": 0.11878787878787879, + "grad_norm": 0.04163753613829613, + "learning_rate": 9.999916777112288e-05, + "loss": 0.017875926569104195, + "num_input_tokens_seen": 3209696, + "step": 196, + "train_runtime": 1593.159, + "train_tokens_per_second": 2014.674 + }, + { + "epoch": 0.1193939393939394, + "grad_norm": 0.03779582679271698, + "learning_rate": 9.999911136466608e-05, + "loss": 0.01648208498954773, + "num_input_tokens_seen": 3226072, + "step": 197, + "train_runtime": 1601.2758, + "train_tokens_per_second": 2014.689 + }, + { + "epoch": 0.12, + "grad_norm": 0.06097209453582764, + "learning_rate": 9.99990531088395e-05, + "loss": 0.017982497811317444, + "num_input_tokens_seen": 3242448, + "step": 198, + "train_runtime": 1609.4726, + "train_tokens_per_second": 2014.603 + }, + { + "epoch": 0.1206060606060606, + "grad_norm": 0.07450928539037704, + "learning_rate": 9.999899300364532e-05, + "loss": 0.015351779758930206, + "num_input_tokens_seen": 3258824, + "step": 199, + "train_runtime": 1617.5877, + "train_tokens_per_second": 2014.62 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 0.06301674991846085, + "learning_rate": 9.999893104908577e-05, + "loss": 0.018576189875602722, + "num_input_tokens_seen": 3275200, + "step": 200, + "train_runtime": 1625.7153, + "train_tokens_per_second": 2014.621 + }, + { + "epoch": 0.12181818181818181, + "grad_norm": 0.05599730834364891, + "learning_rate": 9.999886724516312e-05, + "loss": 0.018099110573530197, + "num_input_tokens_seen": 3291576, + "step": 201, + "train_runtime": 1635.3633, + "train_tokens_per_second": 2012.749 + }, + { + "epoch": 0.12242424242424242, + "grad_norm": 0.040753431618213654, + "learning_rate": 9.999880159187975e-05, + "loss": 0.015437884256243706, + "num_input_tokens_seen": 3307952, + "step": 202, + "train_runtime": 1643.4859, + "train_tokens_per_second": 2012.766 + }, + { + "epoch": 0.12303030303030303, + "grad_norm": 0.03280268982052803, + "learning_rate": 9.999873408923806e-05, + "loss": 0.01625344157218933, + "num_input_tokens_seen": 3324328, + "step": 203, + "train_runtime": 1651.609, + "train_tokens_per_second": 2012.781 + }, + { + "epoch": 0.12363636363636364, + "grad_norm": 0.058769796043634415, + "learning_rate": 9.999866473724057e-05, + "loss": 0.019040308892726898, + "num_input_tokens_seen": 3340704, + "step": 204, + "train_runtime": 1659.7319, + "train_tokens_per_second": 2012.797 + }, + { + "epoch": 0.12424242424242424, + "grad_norm": 0.07302497327327728, + "learning_rate": 9.999859353588984e-05, + "loss": 0.015959227457642555, + "num_input_tokens_seen": 3357080, + "step": 205, + "train_runtime": 1667.8511, + "train_tokens_per_second": 2012.818 + }, + { + "epoch": 0.12484848484848485, + "grad_norm": 0.038392290472984314, + "learning_rate": 9.999852048518849e-05, + "loss": 0.015184870921075344, + "num_input_tokens_seen": 3373456, + "step": 206, + "train_runtime": 1675.97, + "train_tokens_per_second": 2012.838 + }, + { + "epoch": 0.12545454545454546, + "grad_norm": 0.057108521461486816, + "learning_rate": 9.999844558513926e-05, + "loss": 0.018102547153830528, + "num_input_tokens_seen": 3389832, + "step": 207, + "train_runtime": 1684.0874, + "train_tokens_per_second": 2012.86 + }, + { + "epoch": 0.12606060606060607, + "grad_norm": 0.05192007124423981, + "learning_rate": 9.999836883574488e-05, + "loss": 0.016045067459344864, + "num_input_tokens_seen": 3406208, + "step": 208, + "train_runtime": 1692.2048, + "train_tokens_per_second": 2012.882 + }, + { + "epoch": 0.12666666666666668, + "grad_norm": 0.05115659907460213, + "learning_rate": 9.99982902370082e-05, + "loss": 0.016623271629214287, + "num_input_tokens_seen": 3422584, + "step": 209, + "train_runtime": 1700.3232, + "train_tokens_per_second": 2012.902 + }, + { + "epoch": 0.12727272727272726, + "grad_norm": 0.07258911430835724, + "learning_rate": 9.999820978893216e-05, + "loss": 0.020482556894421577, + "num_input_tokens_seen": 3438960, + "step": 210, + "train_runtime": 1708.4412, + "train_tokens_per_second": 2012.923 + }, + { + "epoch": 0.12787878787878787, + "grad_norm": 0.1083996444940567, + "learning_rate": 9.999812749151966e-05, + "loss": 0.020862706005573273, + "num_input_tokens_seen": 3455336, + "step": 211, + "train_runtime": 1716.5608, + "train_tokens_per_second": 2012.941 + }, + { + "epoch": 0.12848484848484848, + "grad_norm": 0.04957745969295502, + "learning_rate": 9.999804334477383e-05, + "loss": 0.019352620467543602, + "num_input_tokens_seen": 3471712, + "step": 212, + "train_runtime": 1724.679, + "train_tokens_per_second": 2012.961 + }, + { + "epoch": 0.1290909090909091, + "grad_norm": 0.05110868439078331, + "learning_rate": 9.999795734869772e-05, + "loss": 0.01801101304590702, + "num_input_tokens_seen": 3488088, + "step": 213, + "train_runtime": 1732.7974, + "train_tokens_per_second": 2012.981 + }, + { + "epoch": 0.1296969696969697, + "grad_norm": 0.03656603768467903, + "learning_rate": 9.999786950329454e-05, + "loss": 0.014664572663605213, + "num_input_tokens_seen": 3504464, + "step": 214, + "train_runtime": 1740.9181, + "train_tokens_per_second": 2012.998 + }, + { + "epoch": 0.1303030303030303, + "grad_norm": 0.06225895509123802, + "learning_rate": 9.999777980856754e-05, + "loss": 0.01811577007174492, + "num_input_tokens_seen": 3520840, + "step": 215, + "train_runtime": 1749.0394, + "train_tokens_per_second": 2013.014 + }, + { + "epoch": 0.13090909090909092, + "grad_norm": 0.06217541545629501, + "learning_rate": 9.999768826452004e-05, + "loss": 0.015230846591293812, + "num_input_tokens_seen": 3537216, + "step": 216, + "train_runtime": 1757.1603, + "train_tokens_per_second": 2013.03 + }, + { + "epoch": 0.1315151515151515, + "grad_norm": 0.0395430289208889, + "learning_rate": 9.999759487115541e-05, + "loss": 0.017680658027529716, + "num_input_tokens_seen": 3553592, + "step": 217, + "train_runtime": 1765.2799, + "train_tokens_per_second": 2013.047 + }, + { + "epoch": 0.1321212121212121, + "grad_norm": 0.04460732638835907, + "learning_rate": 9.999749962847711e-05, + "loss": 0.015775006264448166, + "num_input_tokens_seen": 3569968, + "step": 218, + "train_runtime": 1773.4008, + "train_tokens_per_second": 2013.063 + }, + { + "epoch": 0.13272727272727272, + "grad_norm": 0.026493152603507042, + "learning_rate": 9.999740253648866e-05, + "loss": 0.016286678612232208, + "num_input_tokens_seen": 3586344, + "step": 219, + "train_runtime": 1781.5181, + "train_tokens_per_second": 2013.083 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.05032551288604736, + "learning_rate": 9.999730359519366e-05, + "loss": 0.01735139824450016, + "num_input_tokens_seen": 3602720, + "step": 220, + "train_runtime": 1789.6355, + "train_tokens_per_second": 2013.103 + }, + { + "epoch": 0.13393939393939394, + "grad_norm": 0.10480339080095291, + "learning_rate": 9.999720280459576e-05, + "loss": 0.0164189450442791, + "num_input_tokens_seen": 3619096, + "step": 221, + "train_runtime": 1797.7553, + "train_tokens_per_second": 2013.119 + }, + { + "epoch": 0.13454545454545455, + "grad_norm": 0.05456702038645744, + "learning_rate": 9.99971001646987e-05, + "loss": 0.018650280311703682, + "num_input_tokens_seen": 3635472, + "step": 222, + "train_runtime": 1805.876, + "train_tokens_per_second": 2013.135 + }, + { + "epoch": 0.13515151515151516, + "grad_norm": 0.03562236949801445, + "learning_rate": 9.999699567550627e-05, + "loss": 0.014892566949129105, + "num_input_tokens_seen": 3651848, + "step": 223, + "train_runtime": 1813.9965, + "train_tokens_per_second": 2013.151 + }, + { + "epoch": 0.13575757575757577, + "grad_norm": 0.09293515980243683, + "learning_rate": 9.999688933702232e-05, + "loss": 0.019074441865086555, + "num_input_tokens_seen": 3668224, + "step": 224, + "train_runtime": 1822.1164, + "train_tokens_per_second": 2013.167 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 0.04311508685350418, + "learning_rate": 9.99967811492508e-05, + "loss": 0.016122177243232727, + "num_input_tokens_seen": 3684600, + "step": 225, + "train_runtime": 1830.237, + "train_tokens_per_second": 2013.182 + }, + { + "epoch": 0.13696969696969696, + "grad_norm": 0.0684700533747673, + "learning_rate": 9.999667111219573e-05, + "loss": 0.016784384846687317, + "num_input_tokens_seen": 3700976, + "step": 226, + "train_runtime": 1838.3572, + "train_tokens_per_second": 2013.197 + }, + { + "epoch": 0.13757575757575757, + "grad_norm": 0.051709555089473724, + "learning_rate": 9.999655922586116e-05, + "loss": 0.01756284013390541, + "num_input_tokens_seen": 3717352, + "step": 227, + "train_runtime": 1846.4811, + "train_tokens_per_second": 2013.209 + }, + { + "epoch": 0.13818181818181818, + "grad_norm": 0.06800346821546555, + "learning_rate": 9.99964454902512e-05, + "loss": 0.018563883379101753, + "num_input_tokens_seen": 3733728, + "step": 228, + "train_runtime": 1854.6021, + "train_tokens_per_second": 2013.223 + }, + { + "epoch": 0.1387878787878788, + "grad_norm": 0.04645644128322601, + "learning_rate": 9.99963299053701e-05, + "loss": 0.017479516565799713, + "num_input_tokens_seen": 3750104, + "step": 229, + "train_runtime": 1862.7316, + "train_tokens_per_second": 2013.228 + }, + { + "epoch": 0.1393939393939394, + "grad_norm": 0.07372930645942688, + "learning_rate": 9.999621247122213e-05, + "loss": 0.017878303304314613, + "num_input_tokens_seen": 3766480, + "step": 230, + "train_runtime": 1870.8516, + "train_tokens_per_second": 2013.244 + }, + { + "epoch": 0.14, + "grad_norm": 0.1514655202627182, + "learning_rate": 9.99960931878116e-05, + "loss": 0.015512627549469471, + "num_input_tokens_seen": 3782856, + "step": 231, + "train_runtime": 1878.9708, + "train_tokens_per_second": 2013.26 + }, + { + "epoch": 0.1406060606060606, + "grad_norm": 0.04524844512343407, + "learning_rate": 9.999597205514297e-05, + "loss": 0.01565626822412014, + "num_input_tokens_seen": 3799232, + "step": 232, + "train_runtime": 1887.094, + "train_tokens_per_second": 2013.271 + }, + { + "epoch": 0.1412121212121212, + "grad_norm": 0.03657226637005806, + "learning_rate": 9.999584907322069e-05, + "loss": 0.014475165866315365, + "num_input_tokens_seen": 3815608, + "step": 233, + "train_runtime": 1895.2138, + "train_tokens_per_second": 2013.286 + }, + { + "epoch": 0.14181818181818182, + "grad_norm": 0.10837068408727646, + "learning_rate": 9.99957242420493e-05, + "loss": 0.016292275860905647, + "num_input_tokens_seen": 3831984, + "step": 234, + "train_runtime": 1903.3349, + "train_tokens_per_second": 2013.3 + }, + { + "epoch": 0.14242424242424243, + "grad_norm": 0.06915906816720963, + "learning_rate": 9.999559756163346e-05, + "loss": 0.01956966333091259, + "num_input_tokens_seen": 3848360, + "step": 235, + "train_runtime": 1911.4546, + "train_tokens_per_second": 2013.315 + }, + { + "epoch": 0.14303030303030304, + "grad_norm": 0.03815745189785957, + "learning_rate": 9.99954690319778e-05, + "loss": 0.01515297032892704, + "num_input_tokens_seen": 3864736, + "step": 236, + "train_runtime": 1919.5751, + "train_tokens_per_second": 2013.329 + }, + { + "epoch": 0.14363636363636365, + "grad_norm": 0.04804231598973274, + "learning_rate": 9.999533865308712e-05, + "loss": 0.017410308122634888, + "num_input_tokens_seen": 3881112, + "step": 237, + "train_runtime": 1927.6957, + "train_tokens_per_second": 2013.343 + }, + { + "epoch": 0.14424242424242426, + "grad_norm": 0.10351648926734924, + "learning_rate": 9.999520642496623e-05, + "loss": 0.01582871936261654, + "num_input_tokens_seen": 3897488, + "step": 238, + "train_runtime": 1935.8176, + "train_tokens_per_second": 2013.355 + }, + { + "epoch": 0.14484848484848484, + "grad_norm": 0.06399150937795639, + "learning_rate": 9.999507234762e-05, + "loss": 0.015461472794413567, + "num_input_tokens_seen": 3913864, + "step": 239, + "train_runtime": 1943.945, + "train_tokens_per_second": 2013.361 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 0.027640361338853836, + "learning_rate": 9.999493642105342e-05, + "loss": 0.01647048071026802, + "num_input_tokens_seen": 3930240, + "step": 240, + "train_runtime": 1952.0688, + "train_tokens_per_second": 2013.372 + }, + { + "epoch": 0.14606060606060606, + "grad_norm": 0.07313567399978638, + "learning_rate": 9.999479864527148e-05, + "loss": 0.015903417021036148, + "num_input_tokens_seen": 3946616, + "step": 241, + "train_runtime": 1960.1915, + "train_tokens_per_second": 2013.383 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.09255962073802948, + "learning_rate": 9.999465902027931e-05, + "loss": 0.01633605733513832, + "num_input_tokens_seen": 3962992, + "step": 242, + "train_runtime": 1968.3145, + "train_tokens_per_second": 2013.394 + }, + { + "epoch": 0.14727272727272728, + "grad_norm": 0.06311100721359253, + "learning_rate": 9.999451754608207e-05, + "loss": 0.018459340557456017, + "num_input_tokens_seen": 3979368, + "step": 243, + "train_runtime": 1976.4343, + "train_tokens_per_second": 2013.408 + }, + { + "epoch": 0.1478787878787879, + "grad_norm": 0.04240158200263977, + "learning_rate": 9.999437422268498e-05, + "loss": 0.01432002056390047, + "num_input_tokens_seen": 3995744, + "step": 244, + "train_runtime": 1984.5577, + "train_tokens_per_second": 2013.418 + }, + { + "epoch": 0.1484848484848485, + "grad_norm": 0.05550538748502731, + "learning_rate": 9.999422905009335e-05, + "loss": 0.014518518932163715, + "num_input_tokens_seen": 4012120, + "step": 245, + "train_runtime": 1992.685, + "train_tokens_per_second": 2013.424 + }, + { + "epoch": 0.14909090909090908, + "grad_norm": 0.037221502512693405, + "learning_rate": 9.999408202831255e-05, + "loss": 0.014823012985289097, + "num_input_tokens_seen": 4028496, + "step": 246, + "train_runtime": 2000.8075, + "train_tokens_per_second": 2013.435 + }, + { + "epoch": 0.1496969696969697, + "grad_norm": 0.06923341751098633, + "learning_rate": 9.999393315734801e-05, + "loss": 0.018903765827417374, + "num_input_tokens_seen": 4044872, + "step": 247, + "train_runtime": 2008.9335, + "train_tokens_per_second": 2013.443 + }, + { + "epoch": 0.1503030303030303, + "grad_norm": 0.07023045420646667, + "learning_rate": 9.999378243720523e-05, + "loss": 0.01768019236624241, + "num_input_tokens_seen": 4061248, + "step": 248, + "train_runtime": 2017.0572, + "train_tokens_per_second": 2013.452 + }, + { + "epoch": 0.1509090909090909, + "grad_norm": 0.04301533102989197, + "learning_rate": 9.999362986788981e-05, + "loss": 0.016754839569330215, + "num_input_tokens_seen": 4077624, + "step": 249, + "train_runtime": 2025.1771, + "train_tokens_per_second": 2013.465 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.08630920946598053, + "learning_rate": 9.999347544940739e-05, + "loss": 0.014999642968177795, + "num_input_tokens_seen": 4094000, + "step": 250, + "train_runtime": 2033.2978, + "train_tokens_per_second": 2013.478 + }, + { + "epoch": 0.15212121212121213, + "grad_norm": 0.03872856870293617, + "learning_rate": 9.999331918176365e-05, + "loss": 0.015648486092686653, + "num_input_tokens_seen": 4110376, + "step": 251, + "train_runtime": 2041.4306, + "train_tokens_per_second": 2013.478 + }, + { + "epoch": 0.15272727272727274, + "grad_norm": 0.0624275766313076, + "learning_rate": 9.999316106496439e-05, + "loss": 0.015371391549706459, + "num_input_tokens_seen": 4126752, + "step": 252, + "train_runtime": 2049.5498, + "train_tokens_per_second": 2013.492 + }, + { + "epoch": 0.15333333333333332, + "grad_norm": 0.03090560808777809, + "learning_rate": 9.999300109901548e-05, + "loss": 0.013192292302846909, + "num_input_tokens_seen": 4143128, + "step": 253, + "train_runtime": 2057.6702, + "train_tokens_per_second": 2013.504 + }, + { + "epoch": 0.15393939393939393, + "grad_norm": 0.5114591121673584, + "learning_rate": 9.99928392839228e-05, + "loss": 0.018224472180008888, + "num_input_tokens_seen": 4159504, + "step": 254, + "train_runtime": 2065.8007, + "train_tokens_per_second": 2013.507 + }, + { + "epoch": 0.15454545454545454, + "grad_norm": 0.05735045298933983, + "learning_rate": 9.999267561969235e-05, + "loss": 0.017389601096510887, + "num_input_tokens_seen": 4175880, + "step": 255, + "train_runtime": 2073.9307, + "train_tokens_per_second": 2013.51 + }, + { + "epoch": 0.15515151515151515, + "grad_norm": 0.13113801181316376, + "learning_rate": 9.99925101063302e-05, + "loss": 0.015801645815372467, + "num_input_tokens_seen": 4192256, + "step": 256, + "train_runtime": 2082.0503, + "train_tokens_per_second": 2013.523 + }, + { + "epoch": 0.15575757575757576, + "grad_norm": 0.1659373939037323, + "learning_rate": 9.999234274384244e-05, + "loss": 0.016719762235879898, + "num_input_tokens_seen": 4208632, + "step": 257, + "train_runtime": 2090.1723, + "train_tokens_per_second": 2013.534 + }, + { + "epoch": 0.15636363636363637, + "grad_norm": 0.09268343448638916, + "learning_rate": 9.99921735322353e-05, + "loss": 0.01958809420466423, + "num_input_tokens_seen": 4225008, + "step": 258, + "train_runtime": 2098.2916, + "train_tokens_per_second": 2013.547 + }, + { + "epoch": 0.15696969696969698, + "grad_norm": 0.08097874373197556, + "learning_rate": 9.999200247151499e-05, + "loss": 0.01584583893418312, + "num_input_tokens_seen": 4241384, + "step": 259, + "train_runtime": 2106.4305, + "train_tokens_per_second": 2013.541 + }, + { + "epoch": 0.15757575757575756, + "grad_norm": 0.072023406624794, + "learning_rate": 9.999182956168787e-05, + "loss": 0.0168259609490633, + "num_input_tokens_seen": 4257760, + "step": 260, + "train_runtime": 2114.5526, + "train_tokens_per_second": 2013.551 + }, + { + "epoch": 0.15818181818181817, + "grad_norm": 0.038404542952775955, + "learning_rate": 9.999165480276034e-05, + "loss": 0.014127206057310104, + "num_input_tokens_seen": 4274136, + "step": 261, + "train_runtime": 2122.6772, + "train_tokens_per_second": 2013.559 + }, + { + "epoch": 0.15878787878787878, + "grad_norm": 0.03950539231300354, + "learning_rate": 9.999147819473884e-05, + "loss": 0.016822200268507004, + "num_input_tokens_seen": 4290512, + "step": 262, + "train_runtime": 2130.7967, + "train_tokens_per_second": 2013.572 + }, + { + "epoch": 0.1593939393939394, + "grad_norm": 0.04290624335408211, + "learning_rate": 9.999129973762992e-05, + "loss": 0.016068218275904655, + "num_input_tokens_seen": 4306888, + "step": 263, + "train_runtime": 2138.9172, + "train_tokens_per_second": 2013.583 + }, + { + "epoch": 0.16, + "grad_norm": 0.05928179994225502, + "learning_rate": 9.99911194314402e-05, + "loss": 0.016628028824925423, + "num_input_tokens_seen": 4323264, + "step": 264, + "train_runtime": 2147.039, + "train_tokens_per_second": 2013.594 + }, + { + "epoch": 0.1606060606060606, + "grad_norm": 0.04302699863910675, + "learning_rate": 9.99909372761763e-05, + "loss": 0.014704343862831593, + "num_input_tokens_seen": 4339640, + "step": 265, + "train_runtime": 2155.1707, + "train_tokens_per_second": 2013.595 + }, + { + "epoch": 0.16121212121212122, + "grad_norm": 0.047466881573200226, + "learning_rate": 9.999075327184499e-05, + "loss": 0.016627237200737, + "num_input_tokens_seen": 4356016, + "step": 266, + "train_runtime": 2163.294, + "train_tokens_per_second": 2013.603 + }, + { + "epoch": 0.1618181818181818, + "grad_norm": 0.04007207974791527, + "learning_rate": 9.999056741845305e-05, + "loss": 0.01723393052816391, + "num_input_tokens_seen": 4372392, + "step": 267, + "train_runtime": 2171.417, + "train_tokens_per_second": 2013.612 + }, + { + "epoch": 0.16242424242424242, + "grad_norm": 0.04319130629301071, + "learning_rate": 9.99903797160074e-05, + "loss": 0.014541544020175934, + "num_input_tokens_seen": 4388768, + "step": 268, + "train_runtime": 2179.5352, + "train_tokens_per_second": 2013.626 + }, + { + "epoch": 0.16303030303030303, + "grad_norm": 0.02772807702422142, + "learning_rate": 9.999019016451494e-05, + "loss": 0.01326832640916109, + "num_input_tokens_seen": 4405144, + "step": 269, + "train_runtime": 2187.6543, + "train_tokens_per_second": 2013.638 + }, + { + "epoch": 0.16363636363636364, + "grad_norm": 0.03225944936275482, + "learning_rate": 9.998999876398271e-05, + "loss": 0.013814960606396198, + "num_input_tokens_seen": 4421520, + "step": 270, + "train_runtime": 2195.7724, + "train_tokens_per_second": 2013.651 + }, + { + "epoch": 0.16424242424242425, + "grad_norm": 0.03607013449072838, + "learning_rate": 9.998980551441776e-05, + "loss": 0.01566735841333866, + "num_input_tokens_seen": 4437896, + "step": 271, + "train_runtime": 2203.8921, + "train_tokens_per_second": 2013.663 + }, + { + "epoch": 0.16484848484848486, + "grad_norm": 0.02214481309056282, + "learning_rate": 9.998961041582727e-05, + "loss": 0.014288516715168953, + "num_input_tokens_seen": 4454272, + "step": 272, + "train_runtime": 2212.0309, + "train_tokens_per_second": 2013.657 + }, + { + "epoch": 0.16545454545454547, + "grad_norm": 0.03539419174194336, + "learning_rate": 9.998941346821844e-05, + "loss": 0.016615379601716995, + "num_input_tokens_seen": 4470648, + "step": 273, + "train_runtime": 2220.1513, + "train_tokens_per_second": 2013.668 + }, + { + "epoch": 0.16606060606060605, + "grad_norm": 0.02361457794904709, + "learning_rate": 9.998921467159855e-05, + "loss": 0.015559839084744453, + "num_input_tokens_seen": 4487024, + "step": 274, + "train_runtime": 2228.2688, + "train_tokens_per_second": 2013.682 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.029787061735987663, + "learning_rate": 9.998901402597496e-05, + "loss": 0.014054241590201855, + "num_input_tokens_seen": 4503400, + "step": 275, + "train_runtime": 2236.3965, + "train_tokens_per_second": 2013.686 + }, + { + "epoch": 0.16727272727272727, + "grad_norm": 0.08080027997493744, + "learning_rate": 9.99888115313551e-05, + "loss": 0.01626443862915039, + "num_input_tokens_seen": 4519776, + "step": 276, + "train_runtime": 2244.5116, + "train_tokens_per_second": 2013.701 + }, + { + "epoch": 0.16787878787878788, + "grad_norm": 0.04751146212220192, + "learning_rate": 9.998860718774643e-05, + "loss": 0.015646975487470627, + "num_input_tokens_seen": 4536152, + "step": 277, + "train_runtime": 2252.6223, + "train_tokens_per_second": 2013.721 + }, + { + "epoch": 0.1684848484848485, + "grad_norm": 0.11396390199661255, + "learning_rate": 9.998840099515655e-05, + "loss": 0.01626933366060257, + "num_input_tokens_seen": 4552528, + "step": 278, + "train_runtime": 2260.7374, + "train_tokens_per_second": 2013.736 + }, + { + "epoch": 0.1690909090909091, + "grad_norm": 0.03807124122977257, + "learning_rate": 9.998819295359305e-05, + "loss": 0.01517193578183651, + "num_input_tokens_seen": 4568904, + "step": 279, + "train_runtime": 2268.8455, + "train_tokens_per_second": 2013.757 + }, + { + "epoch": 0.1696969696969697, + "grad_norm": 0.07842900604009628, + "learning_rate": 9.998798306306366e-05, + "loss": 0.016375314444303513, + "num_input_tokens_seen": 4585280, + "step": 280, + "train_runtime": 2276.9581, + "train_tokens_per_second": 2013.774 + }, + { + "epoch": 0.1703030303030303, + "grad_norm": 0.12316741049289703, + "learning_rate": 9.99877713235761e-05, + "loss": 0.0158452857285738, + "num_input_tokens_seen": 4601656, + "step": 281, + "train_runtime": 2285.07, + "train_tokens_per_second": 2013.792 + }, + { + "epoch": 0.1709090909090909, + "grad_norm": 0.035711321979761124, + "learning_rate": 9.998755773513824e-05, + "loss": 0.014004937373101711, + "num_input_tokens_seen": 4618032, + "step": 282, + "train_runtime": 2293.1794, + "train_tokens_per_second": 2013.812 + }, + { + "epoch": 0.1715151515151515, + "grad_norm": 0.04513373225927353, + "learning_rate": 9.998734229775794e-05, + "loss": 0.015064300037920475, + "num_input_tokens_seen": 4634408, + "step": 283, + "train_runtime": 2301.2911, + "train_tokens_per_second": 2013.83 + }, + { + "epoch": 0.17212121212121212, + "grad_norm": 0.04803522303700447, + "learning_rate": 9.998712501144323e-05, + "loss": 0.015632454305887222, + "num_input_tokens_seen": 4650784, + "step": 284, + "train_runtime": 2309.4064, + "train_tokens_per_second": 2013.844 + }, + { + "epoch": 0.17272727272727273, + "grad_norm": 0.0677453801035881, + "learning_rate": 9.99869058762021e-05, + "loss": 0.01668519154191017, + "num_input_tokens_seen": 4667160, + "step": 285, + "train_runtime": 2317.5195, + "train_tokens_per_second": 2013.86 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.06408604979515076, + "learning_rate": 9.998668489204266e-05, + "loss": 0.016011208295822144, + "num_input_tokens_seen": 4683536, + "step": 286, + "train_runtime": 2325.6311, + "train_tokens_per_second": 2013.877 + }, + { + "epoch": 0.17393939393939395, + "grad_norm": 0.049628015607595444, + "learning_rate": 9.998646205897309e-05, + "loss": 0.015140787698328495, + "num_input_tokens_seen": 4699912, + "step": 287, + "train_runtime": 2333.7425, + "train_tokens_per_second": 2013.895 + }, + { + "epoch": 0.17454545454545456, + "grad_norm": 0.05506971478462219, + "learning_rate": 9.998623737700163e-05, + "loss": 0.014441089704632759, + "num_input_tokens_seen": 4716288, + "step": 288, + "train_runtime": 2341.8537, + "train_tokens_per_second": 2013.912 + }, + { + "epoch": 0.17515151515151514, + "grad_norm": 0.04357004538178444, + "learning_rate": 9.99860108461366e-05, + "loss": 0.014559566974639893, + "num_input_tokens_seen": 4732664, + "step": 289, + "train_runtime": 2349.9687, + "train_tokens_per_second": 2013.926 + }, + { + "epoch": 0.17575757575757575, + "grad_norm": 0.03436315059661865, + "learning_rate": 9.998578246638637e-05, + "loss": 0.014904836192727089, + "num_input_tokens_seen": 4749040, + "step": 290, + "train_runtime": 2358.082, + "train_tokens_per_second": 2013.942 + }, + { + "epoch": 0.17636363636363636, + "grad_norm": 0.030473578721284866, + "learning_rate": 9.99855522377594e-05, + "loss": 0.013786690309643745, + "num_input_tokens_seen": 4765416, + "step": 291, + "train_runtime": 2366.1924, + "train_tokens_per_second": 2013.96 + }, + { + "epoch": 0.17696969696969697, + "grad_norm": 0.033072736114263535, + "learning_rate": 9.998532016026418e-05, + "loss": 0.016431497409939766, + "num_input_tokens_seen": 4781792, + "step": 292, + "train_runtime": 2374.3035, + "train_tokens_per_second": 2013.977 + }, + { + "epoch": 0.17757575757575758, + "grad_norm": 0.03811201453208923, + "learning_rate": 9.998508623390932e-05, + "loss": 0.014959779568016529, + "num_input_tokens_seen": 4798168, + "step": 293, + "train_runtime": 2382.4135, + "train_tokens_per_second": 2013.995 + }, + { + "epoch": 0.1781818181818182, + "grad_norm": 0.04069237411022186, + "learning_rate": 9.998485045870344e-05, + "loss": 0.016118772327899933, + "num_input_tokens_seen": 4814544, + "step": 294, + "train_runtime": 2390.5227, + "train_tokens_per_second": 2014.013 + }, + { + "epoch": 0.1787878787878788, + "grad_norm": 0.031989723443984985, + "learning_rate": 9.99846128346553e-05, + "loss": 0.01669073849916458, + "num_input_tokens_seen": 4830920, + "step": 295, + "train_runtime": 2398.6348, + "train_tokens_per_second": 2014.029 + }, + { + "epoch": 0.17939393939393938, + "grad_norm": 0.03683701902627945, + "learning_rate": 9.998437336177369e-05, + "loss": 0.014967912808060646, + "num_input_tokens_seen": 4847296, + "step": 296, + "train_runtime": 2406.7421, + "train_tokens_per_second": 2014.049 + }, + { + "epoch": 0.18, + "grad_norm": 0.057917602360248566, + "learning_rate": 9.998413204006742e-05, + "loss": 0.018314681947231293, + "num_input_tokens_seen": 4863672, + "step": 297, + "train_runtime": 2414.8505, + "train_tokens_per_second": 2014.068 + }, + { + "epoch": 0.1806060606060606, + "grad_norm": 0.042889710515737534, + "learning_rate": 9.998388886954547e-05, + "loss": 0.014539923518896103, + "num_input_tokens_seen": 4880048, + "step": 298, + "train_runtime": 2422.9583, + "train_tokens_per_second": 2014.087 + }, + { + "epoch": 0.1812121212121212, + "grad_norm": 0.04697619378566742, + "learning_rate": 9.998364385021679e-05, + "loss": 0.01652900129556656, + "num_input_tokens_seen": 4896424, + "step": 299, + "train_runtime": 2431.0701, + "train_tokens_per_second": 2014.102 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.038388364017009735, + "learning_rate": 9.998339698209046e-05, + "loss": 0.013660457916557789, + "num_input_tokens_seen": 4912800, + "step": 300, + "train_runtime": 2439.1822, + "train_tokens_per_second": 2014.118 + }, + { + "epoch": 0.18242424242424243, + "grad_norm": 0.026958242058753967, + "learning_rate": 9.998314826517563e-05, + "loss": 0.015251623466610909, + "num_input_tokens_seen": 4929176, + "step": 301, + "train_runtime": 2448.2631, + "train_tokens_per_second": 2013.336 + }, + { + "epoch": 0.18303030303030304, + "grad_norm": 0.04779147729277611, + "learning_rate": 9.998289769948147e-05, + "loss": 0.012775855138897896, + "num_input_tokens_seen": 4945552, + "step": 302, + "train_runtime": 2456.368, + "train_tokens_per_second": 2013.36 + }, + { + "epoch": 0.18363636363636363, + "grad_norm": 0.03123384155333042, + "learning_rate": 9.998264528501727e-05, + "loss": 0.015583731234073639, + "num_input_tokens_seen": 4961928, + "step": 303, + "train_runtime": 2464.4763, + "train_tokens_per_second": 2013.38 + }, + { + "epoch": 0.18424242424242424, + "grad_norm": 0.05030890926718712, + "learning_rate": 9.998239102179236e-05, + "loss": 0.013868209905922413, + "num_input_tokens_seen": 4978304, + "step": 304, + "train_runtime": 2472.5834, + "train_tokens_per_second": 2013.402 + }, + { + "epoch": 0.18484848484848485, + "grad_norm": 0.033021751791238785, + "learning_rate": 9.998213490981614e-05, + "loss": 0.016501927748322487, + "num_input_tokens_seen": 4994680, + "step": 305, + "train_runtime": 2480.6921, + "train_tokens_per_second": 2013.422 + }, + { + "epoch": 0.18545454545454546, + "grad_norm": 0.050541143864393234, + "learning_rate": 9.998187694909807e-05, + "loss": 0.01771150343120098, + "num_input_tokens_seen": 5011056, + "step": 306, + "train_runtime": 2488.7992, + "train_tokens_per_second": 2013.443 + }, + { + "epoch": 0.18606060606060607, + "grad_norm": 0.04063250124454498, + "learning_rate": 9.998161713964774e-05, + "loss": 0.015554912388324738, + "num_input_tokens_seen": 5027432, + "step": 307, + "train_runtime": 2496.9044, + "train_tokens_per_second": 2013.466 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.02722395956516266, + "learning_rate": 9.998135548147469e-05, + "loss": 0.013613277114927769, + "num_input_tokens_seen": 5043808, + "step": 308, + "train_runtime": 2505.0089, + "train_tokens_per_second": 2013.489 + }, + { + "epoch": 0.18727272727272729, + "grad_norm": 0.02678558975458145, + "learning_rate": 9.998109197458865e-05, + "loss": 0.014953495003283024, + "num_input_tokens_seen": 5060184, + "step": 309, + "train_runtime": 2513.1157, + "train_tokens_per_second": 2013.51 + }, + { + "epoch": 0.18787878787878787, + "grad_norm": 0.02857518568634987, + "learning_rate": 9.998082661899935e-05, + "loss": 0.013844496570527554, + "num_input_tokens_seen": 5076560, + "step": 310, + "train_runtime": 2521.2304, + "train_tokens_per_second": 2013.525 + }, + { + "epoch": 0.18848484848484848, + "grad_norm": 0.2615605294704437, + "learning_rate": 9.998055941471662e-05, + "loss": 0.01809251680970192, + "num_input_tokens_seen": 5092936, + "step": 311, + "train_runtime": 2529.3408, + "train_tokens_per_second": 2013.543 + }, + { + "epoch": 0.1890909090909091, + "grad_norm": 0.029859403148293495, + "learning_rate": 9.998029036175031e-05, + "loss": 0.015970397740602493, + "num_input_tokens_seen": 5109312, + "step": 312, + "train_runtime": 2537.4488, + "train_tokens_per_second": 2013.563 + }, + { + "epoch": 0.1896969696969697, + "grad_norm": 0.03636668995022774, + "learning_rate": 9.99800194601104e-05, + "loss": 0.01580364629626274, + "num_input_tokens_seen": 5125688, + "step": 313, + "train_runtime": 2545.553, + "train_tokens_per_second": 2013.585 + }, + { + "epoch": 0.1903030303030303, + "grad_norm": 0.0684208944439888, + "learning_rate": 9.997974670980691e-05, + "loss": 0.017103755846619606, + "num_input_tokens_seen": 5142064, + "step": 314, + "train_runtime": 2553.6615, + "train_tokens_per_second": 2013.604 + }, + { + "epoch": 0.19090909090909092, + "grad_norm": 0.028665577992796898, + "learning_rate": 9.997947211084991e-05, + "loss": 0.014511539600789547, + "num_input_tokens_seen": 5158440, + "step": 315, + "train_runtime": 2561.7735, + "train_tokens_per_second": 2013.621 + }, + { + "epoch": 0.19151515151515153, + "grad_norm": 0.09884219616651535, + "learning_rate": 9.997919566324959e-05, + "loss": 0.014168107882142067, + "num_input_tokens_seen": 5174816, + "step": 316, + "train_runtime": 2569.8855, + "train_tokens_per_second": 2013.637 + }, + { + "epoch": 0.1921212121212121, + "grad_norm": 0.1779116839170456, + "learning_rate": 9.997891736701613e-05, + "loss": 0.014995518140494823, + "num_input_tokens_seen": 5191192, + "step": 317, + "train_runtime": 2577.9971, + "train_tokens_per_second": 2013.653 + }, + { + "epoch": 0.19272727272727272, + "grad_norm": 0.030352341011166573, + "learning_rate": 9.997863722215983e-05, + "loss": 0.014715241268277168, + "num_input_tokens_seen": 5207568, + "step": 318, + "train_runtime": 2586.1052, + "train_tokens_per_second": 2013.672 + }, + { + "epoch": 0.19333333333333333, + "grad_norm": 0.03511129692196846, + "learning_rate": 9.99783552286911e-05, + "loss": 0.01499946229159832, + "num_input_tokens_seen": 5223944, + "step": 319, + "train_runtime": 2594.2129, + "train_tokens_per_second": 2013.691 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 0.04475672170519829, + "learning_rate": 9.997807138662033e-05, + "loss": 0.014523375779390335, + "num_input_tokens_seen": 5240320, + "step": 320, + "train_runtime": 2602.3206, + "train_tokens_per_second": 2013.71 + }, + { + "epoch": 0.19454545454545455, + "grad_norm": 0.02900783158838749, + "learning_rate": 9.997778569595801e-05, + "loss": 0.015447665005922318, + "num_input_tokens_seen": 5256696, + "step": 321, + "train_runtime": 2610.4318, + "train_tokens_per_second": 2013.727 + }, + { + "epoch": 0.19515151515151516, + "grad_norm": 0.022910727187991142, + "learning_rate": 9.997749815671473e-05, + "loss": 0.013799930922687054, + "num_input_tokens_seen": 5273072, + "step": 322, + "train_runtime": 2618.541, + "train_tokens_per_second": 2013.744 + }, + { + "epoch": 0.19575757575757577, + "grad_norm": 0.03925245255231857, + "learning_rate": 9.997720876890113e-05, + "loss": 0.013741591945290565, + "num_input_tokens_seen": 5289448, + "step": 323, + "train_runtime": 2626.6511, + "train_tokens_per_second": 2013.761 + }, + { + "epoch": 0.19636363636363635, + "grad_norm": 0.029477456584572792, + "learning_rate": 9.997691753252791e-05, + "loss": 0.013831754215061665, + "num_input_tokens_seen": 5305824, + "step": 324, + "train_runtime": 2634.7586, + "train_tokens_per_second": 2013.78 + }, + { + "epoch": 0.19696969696969696, + "grad_norm": 0.0368235781788826, + "learning_rate": 9.997662444760583e-05, + "loss": 0.014774560928344727, + "num_input_tokens_seen": 5322200, + "step": 325, + "train_runtime": 2642.8689, + "train_tokens_per_second": 2013.796 + }, + { + "epoch": 0.19757575757575757, + "grad_norm": 0.04399452731013298, + "learning_rate": 9.997632951414573e-05, + "loss": 0.014160547405481339, + "num_input_tokens_seen": 5338576, + "step": 326, + "train_runtime": 2650.978, + "train_tokens_per_second": 2013.814 + }, + { + "epoch": 0.19818181818181818, + "grad_norm": 0.02241128869354725, + "learning_rate": 9.997603273215853e-05, + "loss": 0.013626255095005035, + "num_input_tokens_seen": 5354952, + "step": 327, + "train_runtime": 2659.0857, + "train_tokens_per_second": 2013.832 + }, + { + "epoch": 0.1987878787878788, + "grad_norm": 0.022924182936549187, + "learning_rate": 9.99757341016552e-05, + "loss": 0.013918038457632065, + "num_input_tokens_seen": 5371328, + "step": 328, + "train_runtime": 2667.1942, + "train_tokens_per_second": 2013.85 + }, + { + "epoch": 0.1993939393939394, + "grad_norm": 0.0384218692779541, + "learning_rate": 9.99754336226468e-05, + "loss": 0.01543221715837717, + "num_input_tokens_seen": 5387704, + "step": 329, + "train_runtime": 2675.3051, + "train_tokens_per_second": 2013.865 + }, + { + "epoch": 0.2, + "grad_norm": 0.024983001872897148, + "learning_rate": 9.997513129514442e-05, + "loss": 0.014143919572234154, + "num_input_tokens_seen": 5404080, + "step": 330, + "train_runtime": 2683.4136, + "train_tokens_per_second": 2013.883 + }, + { + "epoch": 0.2006060606060606, + "grad_norm": 0.036509182304143906, + "learning_rate": 9.997482711915927e-05, + "loss": 0.017176145687699318, + "num_input_tokens_seen": 5420456, + "step": 331, + "train_runtime": 2691.5304, + "train_tokens_per_second": 2013.894 + }, + { + "epoch": 0.2012121212121212, + "grad_norm": 0.02530326321721077, + "learning_rate": 9.997452109470257e-05, + "loss": 0.01395807322114706, + "num_input_tokens_seen": 5436832, + "step": 332, + "train_runtime": 2699.6383, + "train_tokens_per_second": 2013.911 + }, + { + "epoch": 0.2018181818181818, + "grad_norm": 0.026743337512016296, + "learning_rate": 9.997421322178566e-05, + "loss": 0.015008356422185898, + "num_input_tokens_seen": 5453208, + "step": 333, + "train_runtime": 2707.7479, + "train_tokens_per_second": 2013.928 + }, + { + "epoch": 0.20242424242424242, + "grad_norm": 0.03141747787594795, + "learning_rate": 9.997390350041993e-05, + "loss": 0.014487622305750847, + "num_input_tokens_seen": 5469584, + "step": 334, + "train_runtime": 2715.8554, + "train_tokens_per_second": 2013.945 + }, + { + "epoch": 0.20303030303030303, + "grad_norm": 0.03556372597813606, + "learning_rate": 9.997359193061681e-05, + "loss": 0.014322612434625626, + "num_input_tokens_seen": 5485960, + "step": 335, + "train_runtime": 2723.964, + "train_tokens_per_second": 2013.962 + }, + { + "epoch": 0.20363636363636364, + "grad_norm": 0.05319400504231453, + "learning_rate": 9.997327851238788e-05, + "loss": 0.015110835433006287, + "num_input_tokens_seen": 5502336, + "step": 336, + "train_runtime": 2732.0746, + "train_tokens_per_second": 2013.977 + }, + { + "epoch": 0.20424242424242425, + "grad_norm": 0.05987285077571869, + "learning_rate": 9.997296324574467e-05, + "loss": 0.015784846618771553, + "num_input_tokens_seen": 5518712, + "step": 337, + "train_runtime": 2740.1837, + "train_tokens_per_second": 2013.993 + }, + { + "epoch": 0.20484848484848484, + "grad_norm": 0.05444290488958359, + "learning_rate": 9.997264613069887e-05, + "loss": 0.016434665769338608, + "num_input_tokens_seen": 5535088, + "step": 338, + "train_runtime": 2748.2918, + "train_tokens_per_second": 2014.01 + }, + { + "epoch": 0.20545454545454545, + "grad_norm": 0.03842825070023537, + "learning_rate": 9.997232716726222e-05, + "loss": 0.01436456385999918, + "num_input_tokens_seen": 5551464, + "step": 339, + "train_runtime": 2756.4036, + "train_tokens_per_second": 2014.024 + }, + { + "epoch": 0.20606060606060606, + "grad_norm": 0.0297915730625391, + "learning_rate": 9.997200635544648e-05, + "loss": 0.014456460252404213, + "num_input_tokens_seen": 5567840, + "step": 340, + "train_runtime": 2764.5114, + "train_tokens_per_second": 2014.041 + }, + { + "epoch": 0.20666666666666667, + "grad_norm": 0.030197616666555405, + "learning_rate": 9.997168369526355e-05, + "loss": 0.013316805474460125, + "num_input_tokens_seen": 5584216, + "step": 341, + "train_runtime": 2772.6201, + "train_tokens_per_second": 2014.057 + }, + { + "epoch": 0.20727272727272728, + "grad_norm": 0.04718567803502083, + "learning_rate": 9.997135918672536e-05, + "loss": 0.014915217645466328, + "num_input_tokens_seen": 5600592, + "step": 342, + "train_runtime": 2780.7298, + "train_tokens_per_second": 2014.073 + }, + { + "epoch": 0.20787878787878789, + "grad_norm": 0.04453250393271446, + "learning_rate": 9.997103282984391e-05, + "loss": 0.013720309361815453, + "num_input_tokens_seen": 5616968, + "step": 343, + "train_runtime": 2788.839, + "train_tokens_per_second": 2014.088 + }, + { + "epoch": 0.2084848484848485, + "grad_norm": 0.028496714308857918, + "learning_rate": 9.997070462463127e-05, + "loss": 0.015428826212882996, + "num_input_tokens_seen": 5633344, + "step": 344, + "train_runtime": 2796.9478, + "train_tokens_per_second": 2014.104 + }, + { + "epoch": 0.20909090909090908, + "grad_norm": 0.025575809180736542, + "learning_rate": 9.99703745710996e-05, + "loss": 0.014485862106084824, + "num_input_tokens_seen": 5649720, + "step": 345, + "train_runtime": 2805.0546, + "train_tokens_per_second": 2014.121 + }, + { + "epoch": 0.2096969696969697, + "grad_norm": 0.03871789202094078, + "learning_rate": 9.997004266926105e-05, + "loss": 0.013593616895377636, + "num_input_tokens_seen": 5666096, + "step": 346, + "train_runtime": 2813.1609, + "train_tokens_per_second": 2014.139 + }, + { + "epoch": 0.2103030303030303, + "grad_norm": 0.07384062558412552, + "learning_rate": 9.996970891912794e-05, + "loss": 0.015072252601385117, + "num_input_tokens_seen": 5682472, + "step": 347, + "train_runtime": 2821.2688, + "train_tokens_per_second": 2014.155 + }, + { + "epoch": 0.2109090909090909, + "grad_norm": 0.041799403727054596, + "learning_rate": 9.996937332071263e-05, + "loss": 0.014217150397598743, + "num_input_tokens_seen": 5698848, + "step": 348, + "train_runtime": 2829.3783, + "train_tokens_per_second": 2014.17 + }, + { + "epoch": 0.21151515151515152, + "grad_norm": 0.04895857349038124, + "learning_rate": 9.99690358740275e-05, + "loss": 0.017368610948324203, + "num_input_tokens_seen": 5715224, + "step": 349, + "train_runtime": 2837.4869, + "train_tokens_per_second": 2014.185 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 0.03166350722312927, + "learning_rate": 9.996869657908504e-05, + "loss": 0.014376340433955193, + "num_input_tokens_seen": 5731600, + "step": 350, + "train_runtime": 2845.6047, + "train_tokens_per_second": 2014.194 + }, + { + "epoch": 0.21272727272727274, + "grad_norm": 0.06105640158057213, + "learning_rate": 9.996835543589781e-05, + "loss": 0.01661105453968048, + "num_input_tokens_seen": 5747976, + "step": 351, + "train_runtime": 2853.7303, + "train_tokens_per_second": 2014.197 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.038000039756298065, + "learning_rate": 9.996801244447842e-05, + "loss": 0.013641721569001675, + "num_input_tokens_seen": 5764352, + "step": 352, + "train_runtime": 2861.847, + "train_tokens_per_second": 2014.207 + }, + { + "epoch": 0.21393939393939393, + "grad_norm": 0.033811476081609726, + "learning_rate": 9.996766760483956e-05, + "loss": 0.01525929756462574, + "num_input_tokens_seen": 5780728, + "step": 353, + "train_runtime": 2869.9635, + "train_tokens_per_second": 2014.217 + }, + { + "epoch": 0.21454545454545454, + "grad_norm": 0.01919690892100334, + "learning_rate": 9.996732091699396e-05, + "loss": 0.013008120469748974, + "num_input_tokens_seen": 5797104, + "step": 354, + "train_runtime": 2878.0782, + "train_tokens_per_second": 2014.227 + }, + { + "epoch": 0.21515151515151515, + "grad_norm": 0.03718187287449837, + "learning_rate": 9.99669723809545e-05, + "loss": 0.015754155814647675, + "num_input_tokens_seen": 5813480, + "step": 355, + "train_runtime": 2886.1934, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.21575757575757576, + "grad_norm": 0.03534379228949547, + "learning_rate": 9.996662199673401e-05, + "loss": 0.014936118386685848, + "num_input_tokens_seen": 5829856, + "step": 356, + "train_runtime": 2894.3081, + "train_tokens_per_second": 2014.249 + }, + { + "epoch": 0.21636363636363637, + "grad_norm": 0.024305060505867004, + "learning_rate": 9.99662697643455e-05, + "loss": 0.01359601877629757, + "num_input_tokens_seen": 5846232, + "step": 357, + "train_runtime": 2902.4301, + "train_tokens_per_second": 2014.254 + }, + { + "epoch": 0.21696969696969698, + "grad_norm": 0.027639245614409447, + "learning_rate": 9.996591568380196e-05, + "loss": 0.014319726265966892, + "num_input_tokens_seen": 5862608, + "step": 358, + "train_runtime": 2910.5461, + "train_tokens_per_second": 2014.264 + }, + { + "epoch": 0.2175757575757576, + "grad_norm": 0.06455444544553757, + "learning_rate": 9.996555975511652e-05, + "loss": 0.013829253613948822, + "num_input_tokens_seen": 5878984, + "step": 359, + "train_runtime": 2918.6646, + "train_tokens_per_second": 2014.272 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 0.02637428045272827, + "learning_rate": 9.996520197830231e-05, + "loss": 0.01420363038778305, + "num_input_tokens_seen": 5895360, + "step": 360, + "train_runtime": 2926.7812, + "train_tokens_per_second": 2014.281 + }, + { + "epoch": 0.21878787878787878, + "grad_norm": 0.06233112886548042, + "learning_rate": 9.99648423533726e-05, + "loss": 0.017570551484823227, + "num_input_tokens_seen": 5911736, + "step": 361, + "train_runtime": 2934.9002, + "train_tokens_per_second": 2014.289 + }, + { + "epoch": 0.2193939393939394, + "grad_norm": 0.04012456163764, + "learning_rate": 9.996448088034065e-05, + "loss": 0.015336515381932259, + "num_input_tokens_seen": 5928112, + "step": 362, + "train_runtime": 2943.0179, + "train_tokens_per_second": 2014.297 + }, + { + "epoch": 0.22, + "grad_norm": 0.029959173873066902, + "learning_rate": 9.996411755921987e-05, + "loss": 0.013176209293305874, + "num_input_tokens_seen": 5944488, + "step": 363, + "train_runtime": 2951.1353, + "train_tokens_per_second": 2014.305 + }, + { + "epoch": 0.2206060606060606, + "grad_norm": 0.045539602637290955, + "learning_rate": 9.996375239002369e-05, + "loss": 0.017476335167884827, + "num_input_tokens_seen": 5960864, + "step": 364, + "train_runtime": 2959.2526, + "train_tokens_per_second": 2014.314 + }, + { + "epoch": 0.22121212121212122, + "grad_norm": 0.04066498950123787, + "learning_rate": 9.996338537276559e-05, + "loss": 0.015315013006329536, + "num_input_tokens_seen": 5977240, + "step": 365, + "train_runtime": 2967.3711, + "train_tokens_per_second": 2014.322 + }, + { + "epoch": 0.22181818181818183, + "grad_norm": 0.055071763694286346, + "learning_rate": 9.996301650745917e-05, + "loss": 0.013316687196493149, + "num_input_tokens_seen": 5993616, + "step": 366, + "train_runtime": 2975.4906, + "train_tokens_per_second": 2014.329 + }, + { + "epoch": 0.2224242424242424, + "grad_norm": 0.020134275779128075, + "learning_rate": 9.996264579411807e-05, + "loss": 0.012931122444570065, + "num_input_tokens_seen": 6009992, + "step": 367, + "train_runtime": 2983.6081, + "train_tokens_per_second": 2014.337 + }, + { + "epoch": 0.22303030303030302, + "grad_norm": 0.0290455874055624, + "learning_rate": 9.9962273232756e-05, + "loss": 0.013352105394005775, + "num_input_tokens_seen": 6026368, + "step": 368, + "train_runtime": 2991.7313, + "train_tokens_per_second": 2014.341 + }, + { + "epoch": 0.22363636363636363, + "grad_norm": 0.03161335363984108, + "learning_rate": 9.996189882338675e-05, + "loss": 0.012487310916185379, + "num_input_tokens_seen": 6042744, + "step": 369, + "train_runtime": 2999.8498, + "train_tokens_per_second": 2014.349 + }, + { + "epoch": 0.22424242424242424, + "grad_norm": 0.05878787115216255, + "learning_rate": 9.996152256602414e-05, + "loss": 0.014912744984030724, + "num_input_tokens_seen": 6059120, + "step": 370, + "train_runtime": 3007.9654, + "train_tokens_per_second": 2014.358 + }, + { + "epoch": 0.22484848484848485, + "grad_norm": 0.029024092480540276, + "learning_rate": 9.996114446068212e-05, + "loss": 0.012249596416950226, + "num_input_tokens_seen": 6075496, + "step": 371, + "train_runtime": 3016.083, + "train_tokens_per_second": 2014.366 + }, + { + "epoch": 0.22545454545454546, + "grad_norm": 0.023940905928611755, + "learning_rate": 9.996076450737465e-05, + "loss": 0.014684991911053658, + "num_input_tokens_seen": 6091872, + "step": 372, + "train_runtime": 3024.1999, + "train_tokens_per_second": 2014.375 + }, + { + "epoch": 0.22606060606060607, + "grad_norm": 0.07777219265699387, + "learning_rate": 9.99603827061158e-05, + "loss": 0.01571383886039257, + "num_input_tokens_seen": 6108248, + "step": 373, + "train_runtime": 3032.3166, + "train_tokens_per_second": 2014.383 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.030761806294322014, + "learning_rate": 9.99599990569197e-05, + "loss": 0.013848803006112576, + "num_input_tokens_seen": 6124624, + "step": 374, + "train_runtime": 3040.4333, + "train_tokens_per_second": 2014.392 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.0438305102288723, + "learning_rate": 9.995961355980051e-05, + "loss": 0.014024798758327961, + "num_input_tokens_seen": 6141000, + "step": 375, + "train_runtime": 3048.549, + "train_tokens_per_second": 2014.401 + }, + { + "epoch": 0.22787878787878788, + "grad_norm": 0.04035346210002899, + "learning_rate": 9.995922621477252e-05, + "loss": 0.014576055109500885, + "num_input_tokens_seen": 6157376, + "step": 376, + "train_runtime": 3056.6655, + "train_tokens_per_second": 2014.41 + }, + { + "epoch": 0.22848484848484849, + "grad_norm": 0.09497886896133423, + "learning_rate": 9.995883702185003e-05, + "loss": 0.014249450527131557, + "num_input_tokens_seen": 6173752, + "step": 377, + "train_runtime": 3064.7824, + "train_tokens_per_second": 2014.418 + }, + { + "epoch": 0.2290909090909091, + "grad_norm": 0.03223222866654396, + "learning_rate": 9.995844598104746e-05, + "loss": 0.013723311945796013, + "num_input_tokens_seen": 6190128, + "step": 378, + "train_runtime": 3072.8984, + "train_tokens_per_second": 2014.426 + }, + { + "epoch": 0.2296969696969697, + "grad_norm": 0.023603513836860657, + "learning_rate": 9.995805309237926e-05, + "loss": 0.015003862790763378, + "num_input_tokens_seen": 6206504, + "step": 379, + "train_runtime": 3081.015, + "train_tokens_per_second": 2014.435 + }, + { + "epoch": 0.23030303030303031, + "grad_norm": 0.07697781920433044, + "learning_rate": 9.995765835585995e-05, + "loss": 0.01642550155520439, + "num_input_tokens_seen": 6222880, + "step": 380, + "train_runtime": 3089.1312, + "train_tokens_per_second": 2014.443 + }, + { + "epoch": 0.2309090909090909, + "grad_norm": 0.06212541460990906, + "learning_rate": 9.995726177150418e-05, + "loss": 0.013186133466660976, + "num_input_tokens_seen": 6239256, + "step": 381, + "train_runtime": 3097.2484, + "train_tokens_per_second": 2014.451 + }, + { + "epoch": 0.2315151515151515, + "grad_norm": 0.04135077819228172, + "learning_rate": 9.995686333932655e-05, + "loss": 0.015075747855007648, + "num_input_tokens_seen": 6255632, + "step": 382, + "train_runtime": 3105.3662, + "train_tokens_per_second": 2014.459 + }, + { + "epoch": 0.23212121212121212, + "grad_norm": 0.03373231366276741, + "learning_rate": 9.995646305934184e-05, + "loss": 0.015022508800029755, + "num_input_tokens_seen": 6272008, + "step": 383, + "train_runtime": 3113.4845, + "train_tokens_per_second": 2014.466 + }, + { + "epoch": 0.23272727272727273, + "grad_norm": 0.052756134420633316, + "learning_rate": 9.995606093156485e-05, + "loss": 0.016195476055145264, + "num_input_tokens_seen": 6288384, + "step": 384, + "train_runtime": 3121.6016, + "train_tokens_per_second": 2014.474 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 0.04732633754611015, + "learning_rate": 9.995565695601045e-05, + "loss": 0.015717167407274246, + "num_input_tokens_seen": 6304760, + "step": 385, + "train_runtime": 3129.7191, + "train_tokens_per_second": 2014.481 + }, + { + "epoch": 0.23393939393939395, + "grad_norm": 0.050964321941137314, + "learning_rate": 9.99552511326936e-05, + "loss": 0.013431689701974392, + "num_input_tokens_seen": 6321136, + "step": 386, + "train_runtime": 3137.836, + "train_tokens_per_second": 2014.489 + }, + { + "epoch": 0.23454545454545456, + "grad_norm": 0.029031990095973015, + "learning_rate": 9.995484346162926e-05, + "loss": 0.013563702814280987, + "num_input_tokens_seen": 6337512, + "step": 387, + "train_runtime": 3145.953, + "train_tokens_per_second": 2014.497 + }, + { + "epoch": 0.23515151515151514, + "grad_norm": 0.03224366530776024, + "learning_rate": 9.995443394283257e-05, + "loss": 0.01605670340359211, + "num_input_tokens_seen": 6353888, + "step": 388, + "train_runtime": 3154.0724, + "train_tokens_per_second": 2014.503 + }, + { + "epoch": 0.23575757575757575, + "grad_norm": 0.03045693039894104, + "learning_rate": 9.995402257631865e-05, + "loss": 0.015148544684052467, + "num_input_tokens_seen": 6370264, + "step": 389, + "train_runtime": 3162.1889, + "train_tokens_per_second": 2014.511 + }, + { + "epoch": 0.23636363636363636, + "grad_norm": 0.027332261204719543, + "learning_rate": 9.995360936210271e-05, + "loss": 0.014781562611460686, + "num_input_tokens_seen": 6386640, + "step": 390, + "train_runtime": 3170.3051, + "train_tokens_per_second": 2014.519 + }, + { + "epoch": 0.23696969696969697, + "grad_norm": 0.023009251803159714, + "learning_rate": 9.995319430020003e-05, + "loss": 0.013627824373543262, + "num_input_tokens_seen": 6403016, + "step": 391, + "train_runtime": 3178.43, + "train_tokens_per_second": 2014.522 + }, + { + "epoch": 0.23757575757575758, + "grad_norm": 0.035416360944509506, + "learning_rate": 9.995277739062599e-05, + "loss": 0.01493286807090044, + "num_input_tokens_seen": 6419392, + "step": 392, + "train_runtime": 3186.5451, + "train_tokens_per_second": 2014.53 + }, + { + "epoch": 0.2381818181818182, + "grad_norm": 0.04003625363111496, + "learning_rate": 9.995235863339598e-05, + "loss": 0.016020091250538826, + "num_input_tokens_seen": 6435768, + "step": 393, + "train_runtime": 3194.6612, + "train_tokens_per_second": 2014.539 + }, + { + "epoch": 0.2387878787878788, + "grad_norm": 0.024710826575756073, + "learning_rate": 9.995193802852552e-05, + "loss": 0.015763292089104652, + "num_input_tokens_seen": 6452144, + "step": 394, + "train_runtime": 3202.7765, + "train_tokens_per_second": 2014.547 + }, + { + "epoch": 0.23939393939393938, + "grad_norm": 0.05250145494937897, + "learning_rate": 9.995151557603013e-05, + "loss": 0.017301952466368675, + "num_input_tokens_seen": 6468520, + "step": 395, + "train_runtime": 3210.893, + "train_tokens_per_second": 2014.555 + }, + { + "epoch": 0.24, + "grad_norm": 0.037685710936784744, + "learning_rate": 9.995109127592546e-05, + "loss": 0.014692970551550388, + "num_input_tokens_seen": 6484896, + "step": 396, + "train_runtime": 3219.0101, + "train_tokens_per_second": 2014.562 + }, + { + "epoch": 0.2406060606060606, + "grad_norm": 0.03617233410477638, + "learning_rate": 9.99506651282272e-05, + "loss": 0.015763459727168083, + "num_input_tokens_seen": 6501272, + "step": 397, + "train_runtime": 3227.1302, + "train_tokens_per_second": 2014.568 + }, + { + "epoch": 0.2412121212121212, + "grad_norm": 0.026065215468406677, + "learning_rate": 9.995023713295111e-05, + "loss": 0.013620332814753056, + "num_input_tokens_seen": 6517648, + "step": 398, + "train_runtime": 3235.2472, + "train_tokens_per_second": 2014.575 + }, + { + "epoch": 0.24181818181818182, + "grad_norm": 0.045087747275829315, + "learning_rate": 9.994980729011303e-05, + "loss": 0.015572777949273586, + "num_input_tokens_seen": 6534024, + "step": 399, + "train_runtime": 3243.3644, + "train_tokens_per_second": 2014.582 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.02911469154059887, + "learning_rate": 9.994937559972884e-05, + "loss": 0.014463523402810097, + "num_input_tokens_seen": 6550400, + "step": 400, + "train_runtime": 3251.4815, + "train_tokens_per_second": 2014.589 + }, + { + "epoch": 0.24303030303030304, + "grad_norm": 0.09026223421096802, + "learning_rate": 9.994894206181452e-05, + "loss": 0.015273511409759521, + "num_input_tokens_seen": 6566776, + "step": 401, + "train_runtime": 3260.5529, + "train_tokens_per_second": 2014.007 + }, + { + "epoch": 0.24363636363636362, + "grad_norm": 0.059329140931367874, + "learning_rate": 9.994850667638611e-05, + "loss": 0.017180006951093674, + "num_input_tokens_seen": 6583152, + "step": 402, + "train_runtime": 3268.6733, + "train_tokens_per_second": 2014.013 + }, + { + "epoch": 0.24424242424242423, + "grad_norm": 0.05259858816862106, + "learning_rate": 9.99480694434597e-05, + "loss": 0.01665383018553257, + "num_input_tokens_seen": 6599528, + "step": 403, + "train_runtime": 3276.7926, + "train_tokens_per_second": 2014.021 + }, + { + "epoch": 0.24484848484848484, + "grad_norm": 0.046337101608514786, + "learning_rate": 9.994763036305148e-05, + "loss": 0.01817156933248043, + "num_input_tokens_seen": 6615904, + "step": 404, + "train_runtime": 3284.9091, + "train_tokens_per_second": 2014.03 + }, + { + "epoch": 0.24545454545454545, + "grad_norm": 0.023166598752141, + "learning_rate": 9.994718943517768e-05, + "loss": 0.012105523608624935, + "num_input_tokens_seen": 6632280, + "step": 405, + "train_runtime": 3293.0293, + "train_tokens_per_second": 2014.036 + }, + { + "epoch": 0.24606060606060606, + "grad_norm": 0.044385019689798355, + "learning_rate": 9.994674665985461e-05, + "loss": 0.01413038745522499, + "num_input_tokens_seen": 6648656, + "step": 406, + "train_runtime": 3301.1473, + "train_tokens_per_second": 2014.044 + }, + { + "epoch": 0.24666666666666667, + "grad_norm": 0.038354646414518356, + "learning_rate": 9.994630203709865e-05, + "loss": 0.015764841809868813, + "num_input_tokens_seen": 6665032, + "step": 407, + "train_runtime": 3309.2652, + "train_tokens_per_second": 2014.052 + }, + { + "epoch": 0.24727272727272728, + "grad_norm": 0.026519082486629486, + "learning_rate": 9.994585556692624e-05, + "loss": 0.015617020428180695, + "num_input_tokens_seen": 6681408, + "step": 408, + "train_runtime": 3317.3836, + "train_tokens_per_second": 2014.06 + }, + { + "epoch": 0.24787878787878787, + "grad_norm": 0.07033390551805496, + "learning_rate": 9.994540724935389e-05, + "loss": 0.01747780106961727, + "num_input_tokens_seen": 6697784, + "step": 409, + "train_runtime": 3325.5001, + "train_tokens_per_second": 2014.068 + }, + { + "epoch": 0.24848484848484848, + "grad_norm": 0.02514197863638401, + "learning_rate": 9.994495708439819e-05, + "loss": 0.01398993656039238, + "num_input_tokens_seen": 6714160, + "step": 410, + "train_runtime": 3333.618, + "train_tokens_per_second": 2014.076 + }, + { + "epoch": 0.24909090909090909, + "grad_norm": 0.023313792422413826, + "learning_rate": 9.99445050720758e-05, + "loss": 0.013531757518649101, + "num_input_tokens_seen": 6730536, + "step": 411, + "train_runtime": 3341.7359, + "train_tokens_per_second": 2014.084 + }, + { + "epoch": 0.2496969696969697, + "grad_norm": 0.04927172139286995, + "learning_rate": 9.994405121240344e-05, + "loss": 0.014407115057110786, + "num_input_tokens_seen": 6746912, + "step": 412, + "train_runtime": 3349.8514, + "train_tokens_per_second": 2014.093 + }, + { + "epoch": 0.2503030303030303, + "grad_norm": 0.03376639634370804, + "learning_rate": 9.994359550539787e-05, + "loss": 0.015590015798807144, + "num_input_tokens_seen": 6763288, + "step": 413, + "train_runtime": 3357.9682, + "train_tokens_per_second": 2014.101 + }, + { + "epoch": 0.2509090909090909, + "grad_norm": 0.026951145380735397, + "learning_rate": 9.994313795107597e-05, + "loss": 0.013218428939580917, + "num_input_tokens_seen": 6779664, + "step": 414, + "train_runtime": 3366.0858, + "train_tokens_per_second": 2014.109 + }, + { + "epoch": 0.2515151515151515, + "grad_norm": 0.028939809650182724, + "learning_rate": 9.994267854945465e-05, + "loss": 0.013945825397968292, + "num_input_tokens_seen": 6796040, + "step": 415, + "train_runtime": 3374.204, + "train_tokens_per_second": 2014.116 + }, + { + "epoch": 0.25212121212121213, + "grad_norm": 0.048603300005197525, + "learning_rate": 9.994221730055091e-05, + "loss": 0.014013823121786118, + "num_input_tokens_seen": 6812416, + "step": 416, + "train_runtime": 3382.3201, + "train_tokens_per_second": 2014.125 + }, + { + "epoch": 0.25272727272727274, + "grad_norm": 0.03397737815976143, + "learning_rate": 9.994175420438182e-05, + "loss": 0.016459740698337555, + "num_input_tokens_seen": 6828792, + "step": 417, + "train_runtime": 3390.4376, + "train_tokens_per_second": 2014.133 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.09882687032222748, + "learning_rate": 9.99412892609645e-05, + "loss": 0.019090697169303894, + "num_input_tokens_seen": 6845168, + "step": 418, + "train_runtime": 3398.5545, + "train_tokens_per_second": 2014.141 + }, + { + "epoch": 0.25393939393939396, + "grad_norm": 0.02406393364071846, + "learning_rate": 9.994082247031613e-05, + "loss": 0.01460934616625309, + "num_input_tokens_seen": 6861544, + "step": 419, + "train_runtime": 3406.6729, + "train_tokens_per_second": 2014.148 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 0.05103567615151405, + "learning_rate": 9.994035383245401e-05, + "loss": 0.014737242832779884, + "num_input_tokens_seen": 6877920, + "step": 420, + "train_runtime": 3414.7913, + "train_tokens_per_second": 2014.155 + }, + { + "epoch": 0.25515151515151513, + "grad_norm": 0.040553025901317596, + "learning_rate": 9.993988334739544e-05, + "loss": 0.015402523800730705, + "num_input_tokens_seen": 6894296, + "step": 421, + "train_runtime": 3422.91, + "train_tokens_per_second": 2014.162 + }, + { + "epoch": 0.25575757575757574, + "grad_norm": 0.038083747029304504, + "learning_rate": 9.993941101515786e-05, + "loss": 0.014769435860216618, + "num_input_tokens_seen": 6910672, + "step": 422, + "train_runtime": 3431.0293, + "train_tokens_per_second": 2014.169 + }, + { + "epoch": 0.25636363636363635, + "grad_norm": 0.018217189237475395, + "learning_rate": 9.99389368357587e-05, + "loss": 0.01357343327254057, + "num_input_tokens_seen": 6927048, + "step": 423, + "train_runtime": 3439.1473, + "train_tokens_per_second": 2014.176 + }, + { + "epoch": 0.25696969696969696, + "grad_norm": 0.04052957519888878, + "learning_rate": 9.993846080921552e-05, + "loss": 0.01406765729188919, + "num_input_tokens_seen": 6943424, + "step": 424, + "train_runtime": 3447.2634, + "train_tokens_per_second": 2014.184 + }, + { + "epoch": 0.25757575757575757, + "grad_norm": 0.02357480488717556, + "learning_rate": 9.993798293554593e-05, + "loss": 0.013200477696955204, + "num_input_tokens_seen": 6959800, + "step": 425, + "train_runtime": 3455.3793, + "train_tokens_per_second": 2014.193 + }, + { + "epoch": 0.2581818181818182, + "grad_norm": 0.02221427671611309, + "learning_rate": 9.99375032147676e-05, + "loss": 0.014044541865587234, + "num_input_tokens_seen": 6976176, + "step": 426, + "train_runtime": 3463.4954, + "train_tokens_per_second": 2014.201 + }, + { + "epoch": 0.2587878787878788, + "grad_norm": 0.03215425834059715, + "learning_rate": 9.993702164689829e-05, + "loss": 0.013318242505192757, + "num_input_tokens_seen": 6992552, + "step": 427, + "train_runtime": 3471.613, + "train_tokens_per_second": 2014.208 + }, + { + "epoch": 0.2593939393939394, + "grad_norm": 0.049007292836904526, + "learning_rate": 9.993653823195578e-05, + "loss": 0.014676532708108425, + "num_input_tokens_seen": 7008928, + "step": 428, + "train_runtime": 3479.731, + "train_tokens_per_second": 2014.215 + }, + { + "epoch": 0.26, + "grad_norm": 0.029083114117383957, + "learning_rate": 9.993605296995796e-05, + "loss": 0.013533808290958405, + "num_input_tokens_seen": 7025304, + "step": 429, + "train_runtime": 3487.8493, + "train_tokens_per_second": 2014.222 + }, + { + "epoch": 0.2606060606060606, + "grad_norm": 0.03159458562731743, + "learning_rate": 9.993556586092281e-05, + "loss": 0.015523270703852177, + "num_input_tokens_seen": 7041680, + "step": 430, + "train_runtime": 3495.9654, + "train_tokens_per_second": 2014.23 + }, + { + "epoch": 0.26121212121212123, + "grad_norm": 0.023704880848526955, + "learning_rate": 9.993507690486831e-05, + "loss": 0.014423849992454052, + "num_input_tokens_seen": 7058056, + "step": 431, + "train_runtime": 3504.0833, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.26181818181818184, + "grad_norm": 0.061435069888830185, + "learning_rate": 9.993458610181256e-05, + "loss": 0.01381218247115612, + "num_input_tokens_seen": 7074432, + "step": 432, + "train_runtime": 3512.2002, + "train_tokens_per_second": 2014.245 + }, + { + "epoch": 0.26242424242424245, + "grad_norm": 0.027623331174254417, + "learning_rate": 9.993409345177371e-05, + "loss": 0.013529473915696144, + "num_input_tokens_seen": 7090808, + "step": 433, + "train_runtime": 3520.3183, + "train_tokens_per_second": 2014.252 + }, + { + "epoch": 0.263030303030303, + "grad_norm": 0.02938493713736534, + "learning_rate": 9.993359895477e-05, + "loss": 0.014209594577550888, + "num_input_tokens_seen": 7107184, + "step": 434, + "train_runtime": 3528.4347, + "train_tokens_per_second": 2014.26 + }, + { + "epoch": 0.2636363636363636, + "grad_norm": 0.05708494782447815, + "learning_rate": 9.993310261081968e-05, + "loss": 0.01838802546262741, + "num_input_tokens_seen": 7123560, + "step": 435, + "train_runtime": 3536.5523, + "train_tokens_per_second": 2014.267 + }, + { + "epoch": 0.2642424242424242, + "grad_norm": 0.01653749868273735, + "learning_rate": 9.993260441994116e-05, + "loss": 0.014132829383015633, + "num_input_tokens_seen": 7139936, + "step": 436, + "train_runtime": 3544.6693, + "train_tokens_per_second": 2014.274 + }, + { + "epoch": 0.26484848484848483, + "grad_norm": 0.06222791597247124, + "learning_rate": 9.993210438215284e-05, + "loss": 0.017560908570885658, + "num_input_tokens_seen": 7156312, + "step": 437, + "train_runtime": 3552.7886, + "train_tokens_per_second": 2014.28 + }, + { + "epoch": 0.26545454545454544, + "grad_norm": 0.023168306797742844, + "learning_rate": 9.993160249747319e-05, + "loss": 0.014680145308375359, + "num_input_tokens_seen": 7172688, + "step": 438, + "train_runtime": 3560.9057, + "train_tokens_per_second": 2014.288 + }, + { + "epoch": 0.26606060606060605, + "grad_norm": 0.03977813571691513, + "learning_rate": 9.993109876592083e-05, + "loss": 0.01688549481332302, + "num_input_tokens_seen": 7189064, + "step": 439, + "train_runtime": 3569.029, + "train_tokens_per_second": 2014.291 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.027993550524115562, + "learning_rate": 9.993059318751435e-05, + "loss": 0.012989813461899757, + "num_input_tokens_seen": 7205440, + "step": 440, + "train_runtime": 3577.1458, + "train_tokens_per_second": 2014.299 + }, + { + "epoch": 0.2672727272727273, + "grad_norm": 0.051551882177591324, + "learning_rate": 9.993008576227247e-05, + "loss": 0.016102567315101624, + "num_input_tokens_seen": 7221816, + "step": 441, + "train_runtime": 3585.2621, + "train_tokens_per_second": 2014.306 + }, + { + "epoch": 0.2678787878787879, + "grad_norm": 0.03278960660099983, + "learning_rate": 9.992957649021395e-05, + "loss": 0.014773263595998287, + "num_input_tokens_seen": 7238192, + "step": 442, + "train_runtime": 3593.378, + "train_tokens_per_second": 2014.314 + }, + { + "epoch": 0.2684848484848485, + "grad_norm": 0.030394606292247772, + "learning_rate": 9.992906537135762e-05, + "loss": 0.015549161471426487, + "num_input_tokens_seen": 7254568, + "step": 443, + "train_runtime": 3601.4945, + "train_tokens_per_second": 2014.322 + }, + { + "epoch": 0.2690909090909091, + "grad_norm": 0.027792129665613174, + "learning_rate": 9.992855240572241e-05, + "loss": 0.01473160833120346, + "num_input_tokens_seen": 7270944, + "step": 444, + "train_runtime": 3609.6111, + "train_tokens_per_second": 2014.329 + }, + { + "epoch": 0.2696969696969697, + "grad_norm": 0.01833016611635685, + "learning_rate": 9.992803759332728e-05, + "loss": 0.013827802613377571, + "num_input_tokens_seen": 7287320, + "step": 445, + "train_runtime": 3617.7304, + "train_tokens_per_second": 2014.335 + }, + { + "epoch": 0.2703030303030303, + "grad_norm": 0.021910199895501137, + "learning_rate": 9.992752093419124e-05, + "loss": 0.014088256284594536, + "num_input_tokens_seen": 7303696, + "step": 446, + "train_runtime": 3625.8455, + "train_tokens_per_second": 2014.343 + }, + { + "epoch": 0.27090909090909093, + "grad_norm": 0.03614957630634308, + "learning_rate": 9.992700242833346e-05, + "loss": 0.014040564186871052, + "num_input_tokens_seen": 7320072, + "step": 447, + "train_runtime": 3633.9607, + "train_tokens_per_second": 2014.351 + }, + { + "epoch": 0.27151515151515154, + "grad_norm": 0.03147033974528313, + "learning_rate": 9.992648207577308e-05, + "loss": 0.01510291825979948, + "num_input_tokens_seen": 7336448, + "step": 448, + "train_runtime": 3642.0772, + "train_tokens_per_second": 2014.358 + }, + { + "epoch": 0.2721212121212121, + "grad_norm": 0.01757362298667431, + "learning_rate": 9.992595987652935e-05, + "loss": 0.01235952414572239, + "num_input_tokens_seen": 7352824, + "step": 449, + "train_runtime": 3650.194, + "train_tokens_per_second": 2014.365 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 0.018810704350471497, + "learning_rate": 9.99254358306216e-05, + "loss": 0.01245784480124712, + "num_input_tokens_seen": 7369200, + "step": 450, + "train_runtime": 3658.3109, + "train_tokens_per_second": 2014.372 + }, + { + "epoch": 0.2733333333333333, + "grad_norm": 0.02399486117064953, + "learning_rate": 9.99249099380692e-05, + "loss": 0.012906970456242561, + "num_input_tokens_seen": 7385576, + "step": 451, + "train_runtime": 3666.429, + "train_tokens_per_second": 2014.379 + }, + { + "epoch": 0.2739393939393939, + "grad_norm": 0.07980017364025116, + "learning_rate": 9.99243821988916e-05, + "loss": 0.017233727499842644, + "num_input_tokens_seen": 7401952, + "step": 452, + "train_runtime": 3674.5475, + "train_tokens_per_second": 2014.385 + }, + { + "epoch": 0.27454545454545454, + "grad_norm": 0.019096143543720245, + "learning_rate": 9.992385261310833e-05, + "loss": 0.013073702342808247, + "num_input_tokens_seen": 7418328, + "step": 453, + "train_runtime": 3682.6645, + "train_tokens_per_second": 2014.392 + }, + { + "epoch": 0.27515151515151515, + "grad_norm": 0.055766504257917404, + "learning_rate": 9.992332118073897e-05, + "loss": 0.014186715707182884, + "num_input_tokens_seen": 7434704, + "step": 454, + "train_runtime": 3690.7797, + "train_tokens_per_second": 2014.399 + }, + { + "epoch": 0.27575757575757576, + "grad_norm": 0.02542242966592312, + "learning_rate": 9.992278790180318e-05, + "loss": 0.016023358330130577, + "num_input_tokens_seen": 7451080, + "step": 455, + "train_runtime": 3698.8942, + "train_tokens_per_second": 2014.407 + }, + { + "epoch": 0.27636363636363637, + "grad_norm": 0.020465506240725517, + "learning_rate": 9.99222527763207e-05, + "loss": 0.013445570133626461, + "num_input_tokens_seen": 7467456, + "step": 456, + "train_runtime": 3707.0085, + "train_tokens_per_second": 2014.416 + }, + { + "epoch": 0.276969696969697, + "grad_norm": 0.022726397961378098, + "learning_rate": 9.992171580431129e-05, + "loss": 0.013883800245821476, + "num_input_tokens_seen": 7483832, + "step": 457, + "train_runtime": 3715.1297, + "train_tokens_per_second": 2014.42 + }, + { + "epoch": 0.2775757575757576, + "grad_norm": 0.06926342844963074, + "learning_rate": 9.992117698579484e-05, + "loss": 0.016109909862279892, + "num_input_tokens_seen": 7500208, + "step": 458, + "train_runtime": 3723.2461, + "train_tokens_per_second": 2014.427 + }, + { + "epoch": 0.2781818181818182, + "grad_norm": 0.03352541849017143, + "learning_rate": 9.992063632079127e-05, + "loss": 0.01359601691365242, + "num_input_tokens_seen": 7516584, + "step": 459, + "train_runtime": 3731.3653, + "train_tokens_per_second": 2014.433 + }, + { + "epoch": 0.2787878787878788, + "grad_norm": 0.046891167759895325, + "learning_rate": 9.992009380932059e-05, + "loss": 0.014447907917201519, + "num_input_tokens_seen": 7532960, + "step": 460, + "train_runtime": 3739.4829, + "train_tokens_per_second": 2014.439 + }, + { + "epoch": 0.2793939393939394, + "grad_norm": 0.05756726115942001, + "learning_rate": 9.991954945140284e-05, + "loss": 0.012774428352713585, + "num_input_tokens_seen": 7549336, + "step": 461, + "train_runtime": 3747.5996, + "train_tokens_per_second": 2014.446 + }, + { + "epoch": 0.28, + "grad_norm": 0.06149715185165405, + "learning_rate": 9.991900324705817e-05, + "loss": 0.015111779794096947, + "num_input_tokens_seen": 7565712, + "step": 462, + "train_runtime": 3755.7151, + "train_tokens_per_second": 2014.453 + }, + { + "epoch": 0.2806060606060606, + "grad_norm": 0.03807002305984497, + "learning_rate": 9.991845519630678e-05, + "loss": 0.014264722354710102, + "num_input_tokens_seen": 7582088, + "step": 463, + "train_runtime": 3763.8316, + "train_tokens_per_second": 2014.46 + }, + { + "epoch": 0.2812121212121212, + "grad_norm": 0.038672804832458496, + "learning_rate": 9.991790529916896e-05, + "loss": 0.014925600029528141, + "num_input_tokens_seen": 7598464, + "step": 464, + "train_runtime": 3771.9486, + "train_tokens_per_second": 2014.466 + }, + { + "epoch": 0.2818181818181818, + "grad_norm": 0.04409286752343178, + "learning_rate": 9.991735355566502e-05, + "loss": 0.01355639100074768, + "num_input_tokens_seen": 7614840, + "step": 465, + "train_runtime": 3780.0654, + "train_tokens_per_second": 2014.473 + }, + { + "epoch": 0.2824242424242424, + "grad_norm": 0.05239715427160263, + "learning_rate": 9.991679996581539e-05, + "loss": 0.01419782917946577, + "num_input_tokens_seen": 7631216, + "step": 466, + "train_runtime": 3788.182, + "train_tokens_per_second": 2014.48 + }, + { + "epoch": 0.283030303030303, + "grad_norm": 0.04078468307852745, + "learning_rate": 9.991624452964054e-05, + "loss": 0.014365100301802158, + "num_input_tokens_seen": 7647592, + "step": 467, + "train_runtime": 3796.2972, + "train_tokens_per_second": 2014.487 + }, + { + "epoch": 0.28363636363636363, + "grad_norm": 0.05068361386656761, + "learning_rate": 9.9915687247161e-05, + "loss": 0.016069650650024414, + "num_input_tokens_seen": 7663968, + "step": 468, + "train_runtime": 3804.4156, + "train_tokens_per_second": 2014.493 + }, + { + "epoch": 0.28424242424242424, + "grad_norm": 0.028354912996292114, + "learning_rate": 9.991512811839741e-05, + "loss": 0.01326735783368349, + "num_input_tokens_seen": 7680344, + "step": 469, + "train_runtime": 3812.5326, + "train_tokens_per_second": 2014.499 + }, + { + "epoch": 0.28484848484848485, + "grad_norm": 0.018959172070026398, + "learning_rate": 9.991456714337041e-05, + "loss": 0.01290344912558794, + "num_input_tokens_seen": 7696720, + "step": 470, + "train_runtime": 3820.6476, + "train_tokens_per_second": 2014.507 + }, + { + "epoch": 0.28545454545454546, + "grad_norm": 0.03419540822505951, + "learning_rate": 9.99140043221008e-05, + "loss": 0.015551136806607246, + "num_input_tokens_seen": 7713096, + "step": 471, + "train_runtime": 3828.7623, + "train_tokens_per_second": 2014.514 + }, + { + "epoch": 0.28606060606060607, + "grad_norm": 0.0427350290119648, + "learning_rate": 9.991343965460937e-05, + "loss": 0.014623988419771194, + "num_input_tokens_seen": 7729472, + "step": 472, + "train_runtime": 3836.8784, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.2866666666666667, + "grad_norm": 0.030883153900504112, + "learning_rate": 9.991287314091699e-05, + "loss": 0.013778546825051308, + "num_input_tokens_seen": 7745848, + "step": 473, + "train_runtime": 3844.9946, + "train_tokens_per_second": 2014.528 + }, + { + "epoch": 0.2872727272727273, + "grad_norm": 0.021236877888441086, + "learning_rate": 9.991230478104466e-05, + "loss": 0.013353691436350346, + "num_input_tokens_seen": 7762224, + "step": 474, + "train_runtime": 3853.1121, + "train_tokens_per_second": 2014.534 + }, + { + "epoch": 0.2878787878787879, + "grad_norm": 0.019137293100357056, + "learning_rate": 9.991173457501337e-05, + "loss": 0.013228803873062134, + "num_input_tokens_seen": 7778600, + "step": 475, + "train_runtime": 3861.229, + "train_tokens_per_second": 2014.54 + }, + { + "epoch": 0.2884848484848485, + "grad_norm": 0.01902465894818306, + "learning_rate": 9.991116252284421e-05, + "loss": 0.013284035958349705, + "num_input_tokens_seen": 7794976, + "step": 476, + "train_runtime": 3869.3457, + "train_tokens_per_second": 2014.546 + }, + { + "epoch": 0.28909090909090907, + "grad_norm": 0.028947357088327408, + "learning_rate": 9.991058862455833e-05, + "loss": 0.01423730794340372, + "num_input_tokens_seen": 7811352, + "step": 477, + "train_runtime": 3877.4643, + "train_tokens_per_second": 2014.552 + }, + { + "epoch": 0.2896969696969697, + "grad_norm": 0.024383556097745895, + "learning_rate": 9.991001288017701e-05, + "loss": 0.013436602428555489, + "num_input_tokens_seen": 7827728, + "step": 478, + "train_runtime": 3885.5822, + "train_tokens_per_second": 2014.557 + }, + { + "epoch": 0.2903030303030303, + "grad_norm": 0.04384802654385567, + "learning_rate": 9.990943528972147e-05, + "loss": 0.013107577338814735, + "num_input_tokens_seen": 7844104, + "step": 479, + "train_runtime": 3893.6976, + "train_tokens_per_second": 2014.564 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 0.020900119096040726, + "learning_rate": 9.990885585321315e-05, + "loss": 0.015309646725654602, + "num_input_tokens_seen": 7860480, + "step": 480, + "train_runtime": 3901.8179, + "train_tokens_per_second": 2014.569 + }, + { + "epoch": 0.2915151515151515, + "grad_norm": 0.018041405826807022, + "learning_rate": 9.990827457067343e-05, + "loss": 0.012978669255971909, + "num_input_tokens_seen": 7876856, + "step": 481, + "train_runtime": 3909.935, + "train_tokens_per_second": 2014.575 + }, + { + "epoch": 0.2921212121212121, + "grad_norm": 0.02291363663971424, + "learning_rate": 9.99076914421238e-05, + "loss": 0.014085205271840096, + "num_input_tokens_seen": 7893232, + "step": 482, + "train_runtime": 3918.0537, + "train_tokens_per_second": 2014.58 + }, + { + "epoch": 0.2927272727272727, + "grad_norm": 0.023675069212913513, + "learning_rate": 9.990710646758589e-05, + "loss": 0.014468826353549957, + "num_input_tokens_seen": 7909608, + "step": 483, + "train_runtime": 3926.1718, + "train_tokens_per_second": 2014.585 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.021886734291911125, + "learning_rate": 9.990651964708128e-05, + "loss": 0.014159688726067543, + "num_input_tokens_seen": 7925984, + "step": 484, + "train_runtime": 3934.2949, + "train_tokens_per_second": 2014.588 + }, + { + "epoch": 0.29393939393939394, + "grad_norm": 0.019282542169094086, + "learning_rate": 9.99059309806317e-05, + "loss": 0.013447335921227932, + "num_input_tokens_seen": 7942360, + "step": 485, + "train_runtime": 3942.412, + "train_tokens_per_second": 2014.594 + }, + { + "epoch": 0.29454545454545455, + "grad_norm": 0.021736539900302887, + "learning_rate": 9.990534046825893e-05, + "loss": 0.014465593732893467, + "num_input_tokens_seen": 7958736, + "step": 486, + "train_runtime": 3950.5289, + "train_tokens_per_second": 2014.6 + }, + { + "epoch": 0.29515151515151516, + "grad_norm": 0.058480676263570786, + "learning_rate": 9.99047481099848e-05, + "loss": 0.015324249863624573, + "num_input_tokens_seen": 7975112, + "step": 487, + "train_runtime": 3958.6459, + "train_tokens_per_second": 2014.606 + }, + { + "epoch": 0.2957575757575758, + "grad_norm": 0.04795762896537781, + "learning_rate": 9.990415390583122e-05, + "loss": 0.015603849664330482, + "num_input_tokens_seen": 7991488, + "step": 488, + "train_runtime": 3966.7616, + "train_tokens_per_second": 2014.613 + }, + { + "epoch": 0.2963636363636364, + "grad_norm": 0.045595213770866394, + "learning_rate": 9.990355785582017e-05, + "loss": 0.013210836797952652, + "num_input_tokens_seen": 8007864, + "step": 489, + "train_runtime": 3974.8769, + "train_tokens_per_second": 2014.619 + }, + { + "epoch": 0.296969696969697, + "grad_norm": 0.03191044181585312, + "learning_rate": 9.99029599599737e-05, + "loss": 0.0133978221565485, + "num_input_tokens_seen": 8024240, + "step": 490, + "train_runtime": 3982.9927, + "train_tokens_per_second": 2014.626 + }, + { + "epoch": 0.29757575757575755, + "grad_norm": 0.03503177687525749, + "learning_rate": 9.990236021831391e-05, + "loss": 0.01524767093360424, + "num_input_tokens_seen": 8040616, + "step": 491, + "train_runtime": 3991.1095, + "train_tokens_per_second": 2014.632 + }, + { + "epoch": 0.29818181818181816, + "grad_norm": 0.021688032895326614, + "learning_rate": 9.990175863086302e-05, + "loss": 0.013602089136838913, + "num_input_tokens_seen": 8056992, + "step": 492, + "train_runtime": 3999.2294, + "train_tokens_per_second": 2014.636 + }, + { + "epoch": 0.29878787878787877, + "grad_norm": 0.02230294793844223, + "learning_rate": 9.990115519764325e-05, + "loss": 0.01378709264099598, + "num_input_tokens_seen": 8073368, + "step": 493, + "train_runtime": 4007.3438, + "train_tokens_per_second": 2014.643 + }, + { + "epoch": 0.2993939393939394, + "grad_norm": 0.0244484543800354, + "learning_rate": 9.990054991867692e-05, + "loss": 0.01362735964357853, + "num_input_tokens_seen": 8089744, + "step": 494, + "train_runtime": 4015.461, + "train_tokens_per_second": 2014.649 + }, + { + "epoch": 0.3, + "grad_norm": 0.021698100492358208, + "learning_rate": 9.989994279398642e-05, + "loss": 0.01317393034696579, + "num_input_tokens_seen": 8106120, + "step": 495, + "train_runtime": 4023.5779, + "train_tokens_per_second": 2014.655 + }, + { + "epoch": 0.3006060606060606, + "grad_norm": 0.04310522973537445, + "learning_rate": 9.989933382359422e-05, + "loss": 0.014429607428610325, + "num_input_tokens_seen": 8122496, + "step": 496, + "train_runtime": 4031.6942, + "train_tokens_per_second": 2014.661 + }, + { + "epoch": 0.3012121212121212, + "grad_norm": 0.018435562029480934, + "learning_rate": 9.989872300752283e-05, + "loss": 0.013920141384005547, + "num_input_tokens_seen": 8138872, + "step": 497, + "train_runtime": 4039.8107, + "train_tokens_per_second": 2014.667 + }, + { + "epoch": 0.3018181818181818, + "grad_norm": 0.023063285276293755, + "learning_rate": 9.989811034579486e-05, + "loss": 0.0139535591006279, + "num_input_tokens_seen": 8155248, + "step": 498, + "train_runtime": 4047.9289, + "train_tokens_per_second": 2014.672 + }, + { + "epoch": 0.30242424242424243, + "grad_norm": 0.0432952381670475, + "learning_rate": 9.989749583843296e-05, + "loss": 0.014083024114370346, + "num_input_tokens_seen": 8171624, + "step": 499, + "train_runtime": 4056.0475, + "train_tokens_per_second": 2014.677 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.0212725643068552, + "learning_rate": 9.989687948545985e-05, + "loss": 0.013151183724403381, + "num_input_tokens_seen": 8188000, + "step": 500, + "train_runtime": 4064.1628, + "train_tokens_per_second": 2014.683 + }, + { + "epoch": 0.30363636363636365, + "grad_norm": 0.03599437326192856, + "learning_rate": 9.989626128689835e-05, + "loss": 0.016130445525050163, + "num_input_tokens_seen": 8204376, + "step": 501, + "train_runtime": 4073.1872, + "train_tokens_per_second": 2014.24 + }, + { + "epoch": 0.30424242424242426, + "grad_norm": 0.021969085559248924, + "learning_rate": 9.98956412427713e-05, + "loss": 0.013816497288644314, + "num_input_tokens_seen": 8220752, + "step": 502, + "train_runtime": 4081.3029, + "train_tokens_per_second": 2014.247 + }, + { + "epoch": 0.30484848484848487, + "grad_norm": 0.032568175345659256, + "learning_rate": 9.989501935310166e-05, + "loss": 0.015003332868218422, + "num_input_tokens_seen": 8237128, + "step": 503, + "train_runtime": 4089.4198, + "train_tokens_per_second": 2014.253 + }, + { + "epoch": 0.3054545454545455, + "grad_norm": 0.0263565294444561, + "learning_rate": 9.98943956179124e-05, + "loss": 0.014254853129386902, + "num_input_tokens_seen": 8253504, + "step": 504, + "train_runtime": 4097.5386, + "train_tokens_per_second": 2014.259 + }, + { + "epoch": 0.30606060606060603, + "grad_norm": 0.019410574808716774, + "learning_rate": 9.989377003722664e-05, + "loss": 0.012152588926255703, + "num_input_tokens_seen": 8269880, + "step": 505, + "train_runtime": 4105.6623, + "train_tokens_per_second": 2014.262 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.043787118047475815, + "learning_rate": 9.989314261106749e-05, + "loss": 0.013709669932723045, + "num_input_tokens_seen": 8286256, + "step": 506, + "train_runtime": 4113.7771, + "train_tokens_per_second": 2014.27 + }, + { + "epoch": 0.30727272727272725, + "grad_norm": 0.04813135415315628, + "learning_rate": 9.989251333945813e-05, + "loss": 0.014608191326260567, + "num_input_tokens_seen": 8302632, + "step": 507, + "train_runtime": 4121.8917, + "train_tokens_per_second": 2014.277 + }, + { + "epoch": 0.30787878787878786, + "grad_norm": 0.021107302978634834, + "learning_rate": 9.989188222242188e-05, + "loss": 0.012715778313577175, + "num_input_tokens_seen": 8319008, + "step": 508, + "train_runtime": 4130.0145, + "train_tokens_per_second": 2014.281 + }, + { + "epoch": 0.3084848484848485, + "grad_norm": 0.019778916612267494, + "learning_rate": 9.989124925998205e-05, + "loss": 0.012830444611608982, + "num_input_tokens_seen": 8335384, + "step": 509, + "train_runtime": 4138.13, + "train_tokens_per_second": 2014.288 + }, + { + "epoch": 0.3090909090909091, + "grad_norm": 0.019679522141814232, + "learning_rate": 9.989061445216208e-05, + "loss": 0.013841142877936363, + "num_input_tokens_seen": 8351760, + "step": 510, + "train_runtime": 4146.2492, + "train_tokens_per_second": 2014.293 + }, + { + "epoch": 0.3096969696969697, + "grad_norm": 0.023106170818209648, + "learning_rate": 9.988997779898545e-05, + "loss": 0.013808130286633968, + "num_input_tokens_seen": 8368136, + "step": 511, + "train_runtime": 4154.3635, + "train_tokens_per_second": 2014.3 + }, + { + "epoch": 0.3103030303030303, + "grad_norm": 0.02681031823158264, + "learning_rate": 9.988933930047569e-05, + "loss": 0.015086129307746887, + "num_input_tokens_seen": 8384512, + "step": 512, + "train_runtime": 4162.4819, + "train_tokens_per_second": 2014.306 + }, + { + "epoch": 0.3109090909090909, + "grad_norm": 0.044101521372795105, + "learning_rate": 9.988869895665642e-05, + "loss": 0.01502022985368967, + "num_input_tokens_seen": 8400888, + "step": 513, + "train_runtime": 4170.6029, + "train_tokens_per_second": 2014.31 + }, + { + "epoch": 0.3115151515151515, + "grad_norm": 0.016393663361668587, + "learning_rate": 9.988805676755133e-05, + "loss": 0.01283847913146019, + "num_input_tokens_seen": 8417264, + "step": 514, + "train_runtime": 4178.7186, + "train_tokens_per_second": 2014.317 + }, + { + "epoch": 0.31212121212121213, + "grad_norm": 0.04226645827293396, + "learning_rate": 9.988741273318416e-05, + "loss": 0.01453358493745327, + "num_input_tokens_seen": 8433640, + "step": 515, + "train_runtime": 4186.8351, + "train_tokens_per_second": 2014.323 + }, + { + "epoch": 0.31272727272727274, + "grad_norm": 0.021670697256922722, + "learning_rate": 9.988676685357876e-05, + "loss": 0.014670845121145248, + "num_input_tokens_seen": 8450016, + "step": 516, + "train_runtime": 4194.9601, + "train_tokens_per_second": 2014.326 + }, + { + "epoch": 0.31333333333333335, + "grad_norm": 0.02870395965874195, + "learning_rate": 9.988611912875901e-05, + "loss": 0.013808513060212135, + "num_input_tokens_seen": 8466392, + "step": 517, + "train_runtime": 4203.0848, + "train_tokens_per_second": 2014.328 + }, + { + "epoch": 0.31393939393939396, + "grad_norm": 0.02202719636261463, + "learning_rate": 9.988546955874885e-05, + "loss": 0.014270287938416004, + "num_input_tokens_seen": 8482768, + "step": 518, + "train_runtime": 4211.2073, + "train_tokens_per_second": 2014.332 + }, + { + "epoch": 0.3145454545454546, + "grad_norm": 0.03838543966412544, + "learning_rate": 9.988481814357233e-05, + "loss": 0.016241563484072685, + "num_input_tokens_seen": 8499144, + "step": 519, + "train_runtime": 4219.3304, + "train_tokens_per_second": 2014.335 + }, + { + "epoch": 0.3151515151515151, + "grad_norm": 0.024275533854961395, + "learning_rate": 9.988416488325352e-05, + "loss": 0.012701138854026794, + "num_input_tokens_seen": 8515520, + "step": 520, + "train_runtime": 4227.4491, + "train_tokens_per_second": 2014.34 + }, + { + "epoch": 0.31575757575757574, + "grad_norm": 0.019648293033242226, + "learning_rate": 9.98835097778166e-05, + "loss": 0.014066259376704693, + "num_input_tokens_seen": 8531896, + "step": 521, + "train_runtime": 4235.5669, + "train_tokens_per_second": 2014.346 + }, + { + "epoch": 0.31636363636363635, + "grad_norm": 0.03942210599780083, + "learning_rate": 9.98828528272858e-05, + "loss": 0.015006550587713718, + "num_input_tokens_seen": 8548272, + "step": 522, + "train_runtime": 4243.6821, + "train_tokens_per_second": 2014.353 + }, + { + "epoch": 0.31696969696969696, + "grad_norm": 0.01995157264173031, + "learning_rate": 9.988219403168542e-05, + "loss": 0.014066948555409908, + "num_input_tokens_seen": 8564648, + "step": 523, + "train_runtime": 4251.7984, + "train_tokens_per_second": 2014.359 + }, + { + "epoch": 0.31757575757575757, + "grad_norm": 0.05812249332666397, + "learning_rate": 9.988153339103983e-05, + "loss": 0.01575363054871559, + "num_input_tokens_seen": 8581024, + "step": 524, + "train_runtime": 4259.9171, + "train_tokens_per_second": 2014.364 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 0.02528631128370762, + "learning_rate": 9.988087090537344e-05, + "loss": 0.013741475529968739, + "num_input_tokens_seen": 8597400, + "step": 525, + "train_runtime": 4268.0355, + "train_tokens_per_second": 2014.369 + }, + { + "epoch": 0.3187878787878788, + "grad_norm": 0.015316633507609367, + "learning_rate": 9.988020657471077e-05, + "loss": 0.01343776285648346, + "num_input_tokens_seen": 8613776, + "step": 526, + "train_runtime": 4276.154, + "train_tokens_per_second": 2014.375 + }, + { + "epoch": 0.3193939393939394, + "grad_norm": 0.0239357091486454, + "learning_rate": 9.987954039907642e-05, + "loss": 0.013446596451103687, + "num_input_tokens_seen": 8630152, + "step": 527, + "train_runtime": 4284.2698, + "train_tokens_per_second": 2014.381 + }, + { + "epoch": 0.32, + "grad_norm": 0.023286571726202965, + "learning_rate": 9.9878872378495e-05, + "loss": 0.012851690873503685, + "num_input_tokens_seen": 8646528, + "step": 528, + "train_runtime": 4292.3857, + "train_tokens_per_second": 2014.387 + }, + { + "epoch": 0.3206060606060606, + "grad_norm": 0.03030410036444664, + "learning_rate": 9.987820251299122e-05, + "loss": 0.014106137678027153, + "num_input_tokens_seen": 8662904, + "step": 529, + "train_runtime": 4300.5021, + "train_tokens_per_second": 2014.394 + }, + { + "epoch": 0.3212121212121212, + "grad_norm": 0.018672285601496696, + "learning_rate": 9.987753080258986e-05, + "loss": 0.013117408379912376, + "num_input_tokens_seen": 8679280, + "step": 530, + "train_runtime": 4308.6186, + "train_tokens_per_second": 2014.4 + }, + { + "epoch": 0.32181818181818184, + "grad_norm": 0.032513462007045746, + "learning_rate": 9.987685724731577e-05, + "loss": 0.01231987215578556, + "num_input_tokens_seen": 8695656, + "step": 531, + "train_runtime": 4316.7369, + "train_tokens_per_second": 2014.405 + }, + { + "epoch": 0.32242424242424245, + "grad_norm": 0.11805391311645508, + "learning_rate": 9.987618184719386e-05, + "loss": 0.013388572260737419, + "num_input_tokens_seen": 8712032, + "step": 532, + "train_runtime": 4324.8544, + "train_tokens_per_second": 2014.41 + }, + { + "epoch": 0.32303030303030306, + "grad_norm": 0.02607562392950058, + "learning_rate": 9.987550460224912e-05, + "loss": 0.014675582759082317, + "num_input_tokens_seen": 8728408, + "step": 533, + "train_runtime": 4332.9699, + "train_tokens_per_second": 2014.417 + }, + { + "epoch": 0.3236363636363636, + "grad_norm": 0.03229625150561333, + "learning_rate": 9.987482551250659e-05, + "loss": 0.014730843715369701, + "num_input_tokens_seen": 8744784, + "step": 534, + "train_runtime": 4341.0862, + "train_tokens_per_second": 2014.423 + }, + { + "epoch": 0.3242424242424242, + "grad_norm": 0.02484363690018654, + "learning_rate": 9.987414457799138e-05, + "loss": 0.01373380795121193, + "num_input_tokens_seen": 8761160, + "step": 535, + "train_runtime": 4349.2033, + "train_tokens_per_second": 2014.429 + }, + { + "epoch": 0.32484848484848483, + "grad_norm": 0.06518429517745972, + "learning_rate": 9.987346179872869e-05, + "loss": 0.01318280678242445, + "num_input_tokens_seen": 8777536, + "step": 536, + "train_runtime": 4357.3294, + "train_tokens_per_second": 2014.43 + }, + { + "epoch": 0.32545454545454544, + "grad_norm": 0.023426007479429245, + "learning_rate": 9.98727771747438e-05, + "loss": 0.013221761211752892, + "num_input_tokens_seen": 8793912, + "step": 537, + "train_runtime": 4365.4504, + "train_tokens_per_second": 2014.434 + }, + { + "epoch": 0.32606060606060605, + "grad_norm": 0.017606353387236595, + "learning_rate": 9.987209070606199e-05, + "loss": 0.013325998559594154, + "num_input_tokens_seen": 8810288, + "step": 538, + "train_runtime": 4373.5723, + "train_tokens_per_second": 2014.437 + }, + { + "epoch": 0.32666666666666666, + "grad_norm": 0.01875401847064495, + "learning_rate": 9.987140239270865e-05, + "loss": 0.012510064989328384, + "num_input_tokens_seen": 8826664, + "step": 539, + "train_runtime": 4381.6917, + "train_tokens_per_second": 2014.442 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 0.01927015371620655, + "learning_rate": 9.987071223470926e-05, + "loss": 0.012322126887738705, + "num_input_tokens_seen": 8843040, + "step": 540, + "train_runtime": 4389.8115, + "train_tokens_per_second": 2014.446 + }, + { + "epoch": 0.3278787878787879, + "grad_norm": 0.021669652312994003, + "learning_rate": 9.987002023208935e-05, + "loss": 0.013479230925440788, + "num_input_tokens_seen": 8859416, + "step": 541, + "train_runtime": 4397.9315, + "train_tokens_per_second": 2014.451 + }, + { + "epoch": 0.3284848484848485, + "grad_norm": 0.021821271628141403, + "learning_rate": 9.98693263848745e-05, + "loss": 0.013202294707298279, + "num_input_tokens_seen": 8875792, + "step": 542, + "train_runtime": 4406.0521, + "train_tokens_per_second": 2014.455 + }, + { + "epoch": 0.3290909090909091, + "grad_norm": 0.04035639762878418, + "learning_rate": 9.98686306930904e-05, + "loss": 0.014951585792005062, + "num_input_tokens_seen": 8892168, + "step": 543, + "train_runtime": 4414.1742, + "train_tokens_per_second": 2014.458 + }, + { + "epoch": 0.3296969696969697, + "grad_norm": 0.01868710108101368, + "learning_rate": 9.986793315676276e-05, + "loss": 0.012716731987893581, + "num_input_tokens_seen": 8908544, + "step": 544, + "train_runtime": 4422.2924, + "train_tokens_per_second": 2014.463 + }, + { + "epoch": 0.3303030303030303, + "grad_norm": 0.030803462490439415, + "learning_rate": 9.986723377591738e-05, + "loss": 0.012449722737073898, + "num_input_tokens_seen": 8924920, + "step": 545, + "train_runtime": 4430.4191, + "train_tokens_per_second": 2014.464 + }, + { + "epoch": 0.33090909090909093, + "grad_norm": 0.031005537137389183, + "learning_rate": 9.986653255058014e-05, + "loss": 0.014312123879790306, + "num_input_tokens_seen": 8941296, + "step": 546, + "train_runtime": 4438.5386, + "train_tokens_per_second": 2014.468 + }, + { + "epoch": 0.33151515151515154, + "grad_norm": 0.0480731725692749, + "learning_rate": 9.986582948077696e-05, + "loss": 0.015260567888617516, + "num_input_tokens_seen": 8957672, + "step": 547, + "train_runtime": 4446.6634, + "train_tokens_per_second": 2014.47 + }, + { + "epoch": 0.3321212121212121, + "grad_norm": 0.031962063163518906, + "learning_rate": 9.986512456653388e-05, + "loss": 0.01442326046526432, + "num_input_tokens_seen": 8974048, + "step": 548, + "train_runtime": 4454.7823, + "train_tokens_per_second": 2014.475 + }, + { + "epoch": 0.3327272727272727, + "grad_norm": 0.026429401710629463, + "learning_rate": 9.986441780787692e-05, + "loss": 0.014029188081622124, + "num_input_tokens_seen": 8990424, + "step": 549, + "train_runtime": 4462.9013, + "train_tokens_per_second": 2014.48 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.025757839903235435, + "learning_rate": 9.986370920483227e-05, + "loss": 0.013720030896365643, + "num_input_tokens_seen": 9006800, + "step": 550, + "train_runtime": 4471.0287, + "train_tokens_per_second": 2014.48 + }, + { + "epoch": 0.3339393939393939, + "grad_norm": 0.029027970507740974, + "learning_rate": 9.986299875742613e-05, + "loss": 0.014392418786883354, + "num_input_tokens_seen": 9023176, + "step": 551, + "train_runtime": 4479.1476, + "train_tokens_per_second": 2014.485 + }, + { + "epoch": 0.33454545454545453, + "grad_norm": 0.02587730623781681, + "learning_rate": 9.986228646568475e-05, + "loss": 0.014536920003592968, + "num_input_tokens_seen": 9039552, + "step": 552, + "train_runtime": 4487.2645, + "train_tokens_per_second": 2014.491 + }, + { + "epoch": 0.33515151515151514, + "grad_norm": 0.024850307032465935, + "learning_rate": 9.986157232963452e-05, + "loss": 0.014528162777423859, + "num_input_tokens_seen": 9055928, + "step": 553, + "train_runtime": 4495.3823, + "train_tokens_per_second": 2014.496 + }, + { + "epoch": 0.33575757575757575, + "grad_norm": 0.03375309333205223, + "learning_rate": 9.98608563493018e-05, + "loss": 0.01345045492053032, + "num_input_tokens_seen": 9072304, + "step": 554, + "train_runtime": 4503.5123, + "train_tokens_per_second": 2014.495 + }, + { + "epoch": 0.33636363636363636, + "grad_norm": 0.034519318491220474, + "learning_rate": 9.986013852471313e-05, + "loss": 0.016201037913560867, + "num_input_tokens_seen": 9088680, + "step": 555, + "train_runtime": 4511.6315, + "train_tokens_per_second": 2014.5 + }, + { + "epoch": 0.336969696969697, + "grad_norm": 0.025029929354786873, + "learning_rate": 9.985941885589502e-05, + "loss": 0.013687830418348312, + "num_input_tokens_seen": 9105056, + "step": 556, + "train_runtime": 4519.7498, + "train_tokens_per_second": 2014.504 + }, + { + "epoch": 0.3375757575757576, + "grad_norm": 0.02109324000775814, + "learning_rate": 9.98586973428741e-05, + "loss": 0.013876669108867645, + "num_input_tokens_seen": 9121432, + "step": 557, + "train_runtime": 4527.8676, + "train_tokens_per_second": 2014.509 + }, + { + "epoch": 0.3381818181818182, + "grad_norm": 0.017437269911170006, + "learning_rate": 9.985797398567707e-05, + "loss": 0.013100878335535526, + "num_input_tokens_seen": 9137808, + "step": 558, + "train_runtime": 4535.9928, + "train_tokens_per_second": 2014.511 + }, + { + "epoch": 0.3387878787878788, + "grad_norm": 0.04041491076350212, + "learning_rate": 9.985724878433066e-05, + "loss": 0.014973807148635387, + "num_input_tokens_seen": 9154184, + "step": 559, + "train_runtime": 4544.113, + "train_tokens_per_second": 2014.515 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 0.02034146897494793, + "learning_rate": 9.985652173886174e-05, + "loss": 0.012258726172149181, + "num_input_tokens_seen": 9170560, + "step": 560, + "train_runtime": 4552.2371, + "train_tokens_per_second": 2014.517 + }, + { + "epoch": 0.34, + "grad_norm": 0.016358409076929092, + "learning_rate": 9.985579284929715e-05, + "loss": 0.014534495770931244, + "num_input_tokens_seen": 9186936, + "step": 561, + "train_runtime": 4560.3582, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.3406060606060606, + "grad_norm": 0.017970645800232887, + "learning_rate": 9.985506211566388e-05, + "loss": 0.013168847188353539, + "num_input_tokens_seen": 9203312, + "step": 562, + "train_runtime": 4568.4702, + "train_tokens_per_second": 2014.528 + }, + { + "epoch": 0.3412121212121212, + "grad_norm": 0.02478228323161602, + "learning_rate": 9.985432953798895e-05, + "loss": 0.016286451369524002, + "num_input_tokens_seen": 9219688, + "step": 563, + "train_runtime": 4576.5846, + "train_tokens_per_second": 2014.535 + }, + { + "epoch": 0.3418181818181818, + "grad_norm": 0.023158971220254898, + "learning_rate": 9.985359511629944e-05, + "loss": 0.014812255278229713, + "num_input_tokens_seen": 9236064, + "step": 564, + "train_runtime": 4584.6914, + "train_tokens_per_second": 2014.544 + }, + { + "epoch": 0.3424242424242424, + "grad_norm": 0.017976826056838036, + "learning_rate": 9.985285885062257e-05, + "loss": 0.013011513277888298, + "num_input_tokens_seen": 9252440, + "step": 565, + "train_runtime": 4592.801, + "train_tokens_per_second": 2014.553 + }, + { + "epoch": 0.343030303030303, + "grad_norm": 0.022492917254567146, + "learning_rate": 9.98521207409855e-05, + "loss": 0.014015360735356808, + "num_input_tokens_seen": 9268816, + "step": 566, + "train_runtime": 4600.9112, + "train_tokens_per_second": 2014.561 + }, + { + "epoch": 0.34363636363636363, + "grad_norm": 0.05375469848513603, + "learning_rate": 9.985138078741559e-05, + "loss": 0.013538680039346218, + "num_input_tokens_seen": 9285192, + "step": 567, + "train_runtime": 4609.0183, + "train_tokens_per_second": 2014.57 + }, + { + "epoch": 0.34424242424242424, + "grad_norm": 0.011526068672537804, + "learning_rate": 9.985063898994016e-05, + "loss": 0.012446783483028412, + "num_input_tokens_seen": 9301568, + "step": 568, + "train_runtime": 4617.1293, + "train_tokens_per_second": 2014.578 + }, + { + "epoch": 0.34484848484848485, + "grad_norm": 0.015349720604717731, + "learning_rate": 9.984989534858669e-05, + "loss": 0.012871544808149338, + "num_input_tokens_seen": 9317944, + "step": 569, + "train_runtime": 4625.2366, + "train_tokens_per_second": 2014.588 + }, + { + "epoch": 0.34545454545454546, + "grad_norm": 0.03799523040652275, + "learning_rate": 9.984914986338268e-05, + "loss": 0.014556103385984898, + "num_input_tokens_seen": 9334320, + "step": 570, + "train_runtime": 4633.3464, + "train_tokens_per_second": 2014.596 + }, + { + "epoch": 0.34606060606060607, + "grad_norm": 0.042935777455568314, + "learning_rate": 9.984840253435568e-05, + "loss": 0.015330069698393345, + "num_input_tokens_seen": 9350696, + "step": 571, + "train_runtime": 4641.4533, + "train_tokens_per_second": 2014.605 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.026697825640439987, + "learning_rate": 9.984765336153334e-05, + "loss": 0.01370144821703434, + "num_input_tokens_seen": 9367072, + "step": 572, + "train_runtime": 4649.5653, + "train_tokens_per_second": 2014.612 + }, + { + "epoch": 0.3472727272727273, + "grad_norm": 0.04093024507164955, + "learning_rate": 9.984690234494339e-05, + "loss": 0.01424380298703909, + "num_input_tokens_seen": 9383448, + "step": 573, + "train_runtime": 4657.6738, + "train_tokens_per_second": 2014.621 + }, + { + "epoch": 0.3478787878787879, + "grad_norm": 0.03236076980829239, + "learning_rate": 9.984614948461358e-05, + "loss": 0.014988360926508904, + "num_input_tokens_seen": 9399824, + "step": 574, + "train_runtime": 4665.7816, + "train_tokens_per_second": 2014.63 + }, + { + "epoch": 0.3484848484848485, + "grad_norm": 0.016026047989726067, + "learning_rate": 9.984539478057178e-05, + "loss": 0.013162180781364441, + "num_input_tokens_seen": 9416200, + "step": 575, + "train_runtime": 4673.8904, + "train_tokens_per_second": 2014.639 + }, + { + "epoch": 0.3490909090909091, + "grad_norm": 0.03273920342326164, + "learning_rate": 9.984463823284589e-05, + "loss": 0.015174154192209244, + "num_input_tokens_seen": 9432576, + "step": 576, + "train_runtime": 4682.0015, + "train_tokens_per_second": 2014.646 + }, + { + "epoch": 0.3496969696969697, + "grad_norm": 0.03933154046535492, + "learning_rate": 9.98438798414639e-05, + "loss": 0.014418127946555614, + "num_input_tokens_seen": 9448952, + "step": 577, + "train_runtime": 4690.1128, + "train_tokens_per_second": 2014.653 + }, + { + "epoch": 0.3503030303030303, + "grad_norm": 0.02570173889398575, + "learning_rate": 9.984311960645388e-05, + "loss": 0.01333607453852892, + "num_input_tokens_seen": 9465328, + "step": 578, + "train_runtime": 4698.2293, + "train_tokens_per_second": 2014.659 + }, + { + "epoch": 0.3509090909090909, + "grad_norm": 0.024147065356373787, + "learning_rate": 9.984235752784392e-05, + "loss": 0.013619362376630306, + "num_input_tokens_seen": 9481704, + "step": 579, + "train_runtime": 4706.3371, + "train_tokens_per_second": 2014.667 + }, + { + "epoch": 0.3515151515151515, + "grad_norm": 0.04005376994609833, + "learning_rate": 9.98415936056622e-05, + "loss": 0.014414026401937008, + "num_input_tokens_seen": 9498080, + "step": 580, + "train_runtime": 4714.4455, + "train_tokens_per_second": 2014.676 + }, + { + "epoch": 0.3521212121212121, + "grad_norm": 0.03428025171160698, + "learning_rate": 9.984082783993703e-05, + "loss": 0.01436635572463274, + "num_input_tokens_seen": 9514456, + "step": 581, + "train_runtime": 4722.5545, + "train_tokens_per_second": 2014.684 + }, + { + "epoch": 0.3527272727272727, + "grad_norm": 0.02205795608460903, + "learning_rate": 9.984006023069666e-05, + "loss": 0.013060957193374634, + "num_input_tokens_seen": 9530832, + "step": 582, + "train_runtime": 4730.6633, + "train_tokens_per_second": 2014.693 + }, + { + "epoch": 0.35333333333333333, + "grad_norm": 0.020862819626927376, + "learning_rate": 9.983929077796954e-05, + "loss": 0.013365531340241432, + "num_input_tokens_seen": 9547208, + "step": 583, + "train_runtime": 4738.7746, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.35393939393939394, + "grad_norm": 0.012693438678979874, + "learning_rate": 9.983851948178412e-05, + "loss": 0.012265143916010857, + "num_input_tokens_seen": 9563584, + "step": 584, + "train_runtime": 4746.884, + "train_tokens_per_second": 2014.708 + }, + { + "epoch": 0.35454545454545455, + "grad_norm": 0.03995286300778389, + "learning_rate": 9.983774634216892e-05, + "loss": 0.014887749217450619, + "num_input_tokens_seen": 9579960, + "step": 585, + "train_runtime": 4754.9935, + "train_tokens_per_second": 2014.716 + }, + { + "epoch": 0.35515151515151516, + "grad_norm": 0.02919401042163372, + "learning_rate": 9.983697135915252e-05, + "loss": 0.01471506617963314, + "num_input_tokens_seen": 9596336, + "step": 586, + "train_runtime": 4763.1041, + "train_tokens_per_second": 2014.723 + }, + { + "epoch": 0.3557575757575758, + "grad_norm": 0.03058960661292076, + "learning_rate": 9.98361945327636e-05, + "loss": 0.014638346619904041, + "num_input_tokens_seen": 9612712, + "step": 587, + "train_runtime": 4771.2155, + "train_tokens_per_second": 2014.73 + }, + { + "epoch": 0.3563636363636364, + "grad_norm": 0.03899887949228287, + "learning_rate": 9.983541586303091e-05, + "loss": 0.015173106454312801, + "num_input_tokens_seen": 9629088, + "step": 588, + "train_runtime": 4779.3321, + "train_tokens_per_second": 2014.735 + }, + { + "epoch": 0.356969696969697, + "grad_norm": 0.34171223640441895, + "learning_rate": 9.983463534998326e-05, + "loss": 0.01584211364388466, + "num_input_tokens_seen": 9645464, + "step": 589, + "train_runtime": 4787.4435, + "train_tokens_per_second": 2014.742 + }, + { + "epoch": 0.3575757575757576, + "grad_norm": 0.025424521416425705, + "learning_rate": 9.983385299364946e-05, + "loss": 0.01455459464341402, + "num_input_tokens_seen": 9661840, + "step": 590, + "train_runtime": 4795.5546, + "train_tokens_per_second": 2014.749 + }, + { + "epoch": 0.35818181818181816, + "grad_norm": 0.032859109342098236, + "learning_rate": 9.98330687940585e-05, + "loss": 0.0146177988499403, + "num_input_tokens_seen": 9678216, + "step": 591, + "train_runtime": 4803.6648, + "train_tokens_per_second": 2014.757 + }, + { + "epoch": 0.35878787878787877, + "grad_norm": 0.038725532591342926, + "learning_rate": 9.983228275123938e-05, + "loss": 0.014792557805776596, + "num_input_tokens_seen": 9694592, + "step": 592, + "train_runtime": 4811.7746, + "train_tokens_per_second": 2014.764 + }, + { + "epoch": 0.3593939393939394, + "grad_norm": 0.020830297842621803, + "learning_rate": 9.983149486522115e-05, + "loss": 0.014553902670741081, + "num_input_tokens_seen": 9710968, + "step": 593, + "train_runtime": 4819.8876, + "train_tokens_per_second": 2014.771 + }, + { + "epoch": 0.36, + "grad_norm": 0.01844129152595997, + "learning_rate": 9.983070513603295e-05, + "loss": 0.014042770490050316, + "num_input_tokens_seen": 9727344, + "step": 594, + "train_runtime": 4827.9961, + "train_tokens_per_second": 2014.779 + }, + { + "epoch": 0.3606060606060606, + "grad_norm": 0.2604560852050781, + "learning_rate": 9.982991356370404e-05, + "loss": 0.01581915095448494, + "num_input_tokens_seen": 9743720, + "step": 595, + "train_runtime": 4836.1086, + "train_tokens_per_second": 2014.785 + }, + { + "epoch": 0.3612121212121212, + "grad_norm": 0.03814680501818657, + "learning_rate": 9.982912014826365e-05, + "loss": 0.016680167987942696, + "num_input_tokens_seen": 9760096, + "step": 596, + "train_runtime": 4844.2153, + "train_tokens_per_second": 2014.794 + }, + { + "epoch": 0.3618181818181818, + "grad_norm": 0.02060728892683983, + "learning_rate": 9.982832488974115e-05, + "loss": 0.014381843619048595, + "num_input_tokens_seen": 9776472, + "step": 597, + "train_runtime": 4852.3306, + "train_tokens_per_second": 2014.799 + }, + { + "epoch": 0.3624242424242424, + "grad_norm": 0.028759043663740158, + "learning_rate": 9.982752778816595e-05, + "loss": 0.014019730500876904, + "num_input_tokens_seen": 9792848, + "step": 598, + "train_runtime": 4860.4404, + "train_tokens_per_second": 2014.807 + }, + { + "epoch": 0.36303030303030304, + "grad_norm": 0.05189267545938492, + "learning_rate": 9.982672884356752e-05, + "loss": 0.01498887874186039, + "num_input_tokens_seen": 9809224, + "step": 599, + "train_runtime": 4868.5548, + "train_tokens_per_second": 2014.812 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.01455398928374052, + "learning_rate": 9.982592805597544e-05, + "loss": 0.011788399890065193, + "num_input_tokens_seen": 9825600, + "step": 600, + "train_runtime": 4876.663, + "train_tokens_per_second": 2014.82 + }, + { + "epoch": 0.36424242424242426, + "grad_norm": 0.046520307660102844, + "learning_rate": 9.982512542541929e-05, + "loss": 0.012856653891503811, + "num_input_tokens_seen": 9841976, + "step": 601, + "train_runtime": 4885.6882, + "train_tokens_per_second": 2014.45 + }, + { + "epoch": 0.36484848484848487, + "grad_norm": 0.017443792894482613, + "learning_rate": 9.98243209519288e-05, + "loss": 0.013804689049720764, + "num_input_tokens_seen": 9858352, + "step": 602, + "train_runtime": 4893.7966, + "train_tokens_per_second": 2014.459 + }, + { + "epoch": 0.3654545454545455, + "grad_norm": 0.016950292512774467, + "learning_rate": 9.98235146355337e-05, + "loss": 0.01243675872683525, + "num_input_tokens_seen": 9874728, + "step": 603, + "train_runtime": 4901.9018, + "train_tokens_per_second": 2014.469 + }, + { + "epoch": 0.3660606060606061, + "grad_norm": 0.017681090161204338, + "learning_rate": 9.982270647626382e-05, + "loss": 0.011940497905015945, + "num_input_tokens_seen": 9891104, + "step": 604, + "train_runtime": 4910.0066, + "train_tokens_per_second": 2014.479 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 0.018248707056045532, + "learning_rate": 9.982189647414906e-05, + "loss": 0.012673230841755867, + "num_input_tokens_seen": 9907480, + "step": 605, + "train_runtime": 4918.1184, + "train_tokens_per_second": 2014.486 + }, + { + "epoch": 0.36727272727272725, + "grad_norm": 0.020540893077850342, + "learning_rate": 9.982108462921937e-05, + "loss": 0.014132777228951454, + "num_input_tokens_seen": 9923856, + "step": 606, + "train_runtime": 4926.2283, + "train_tokens_per_second": 2014.494 + }, + { + "epoch": 0.36787878787878786, + "grad_norm": 0.023124700412154198, + "learning_rate": 9.982027094150478e-05, + "loss": 0.012684160843491554, + "num_input_tokens_seen": 9940232, + "step": 607, + "train_runtime": 4934.3331, + "train_tokens_per_second": 2014.504 + }, + { + "epoch": 0.36848484848484847, + "grad_norm": 0.020409781485795975, + "learning_rate": 9.98194554110354e-05, + "loss": 0.014147626236081123, + "num_input_tokens_seen": 9956608, + "step": 608, + "train_runtime": 4942.4391, + "train_tokens_per_second": 2014.513 + }, + { + "epoch": 0.3690909090909091, + "grad_norm": 0.015636246651411057, + "learning_rate": 9.981863803784136e-05, + "loss": 0.014182131737470627, + "num_input_tokens_seen": 9972984, + "step": 609, + "train_runtime": 4950.5477, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.3696969696969697, + "grad_norm": 0.0192013718187809, + "learning_rate": 9.981781882195292e-05, + "loss": 0.013808063231408596, + "num_input_tokens_seen": 9989360, + "step": 610, + "train_runtime": 4958.6543, + "train_tokens_per_second": 2014.53 + }, + { + "epoch": 0.3703030303030303, + "grad_norm": 0.017762696370482445, + "learning_rate": 9.981699776340039e-05, + "loss": 0.013210650533437729, + "num_input_tokens_seen": 10005736, + "step": 611, + "train_runtime": 4966.7598, + "train_tokens_per_second": 2014.54 + }, + { + "epoch": 0.3709090909090909, + "grad_norm": 0.025030212476849556, + "learning_rate": 9.981617486221413e-05, + "loss": 0.01400088518857956, + "num_input_tokens_seen": 10022112, + "step": 612, + "train_runtime": 4974.8675, + "train_tokens_per_second": 2014.549 + }, + { + "epoch": 0.3715151515151515, + "grad_norm": 0.030215473845601082, + "learning_rate": 9.981535011842456e-05, + "loss": 0.01368585042655468, + "num_input_tokens_seen": 10038488, + "step": 613, + "train_runtime": 4982.9771, + "train_tokens_per_second": 2014.556 + }, + { + "epoch": 0.37212121212121213, + "grad_norm": 0.021045658737421036, + "learning_rate": 9.981452353206222e-05, + "loss": 0.014398960396647453, + "num_input_tokens_seen": 10054864, + "step": 614, + "train_runtime": 4991.0863, + "train_tokens_per_second": 2014.564 + }, + { + "epoch": 0.37272727272727274, + "grad_norm": 0.01661411114037037, + "learning_rate": 9.981369510315764e-05, + "loss": 0.0135966120287776, + "num_input_tokens_seen": 10071240, + "step": 615, + "train_runtime": 4999.1912, + "train_tokens_per_second": 2014.574 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.013987096957862377, + "learning_rate": 9.98128648317415e-05, + "loss": 0.011429902166128159, + "num_input_tokens_seen": 10087616, + "step": 616, + "train_runtime": 5007.2986, + "train_tokens_per_second": 2014.582 + }, + { + "epoch": 0.37393939393939396, + "grad_norm": 0.01872987300157547, + "learning_rate": 9.981203271784449e-05, + "loss": 0.011292507871985435, + "num_input_tokens_seen": 10103992, + "step": 617, + "train_runtime": 5015.406, + "train_tokens_per_second": 2014.591 + }, + { + "epoch": 0.37454545454545457, + "grad_norm": 0.013638158328831196, + "learning_rate": 9.98111987614974e-05, + "loss": 0.012537346221506596, + "num_input_tokens_seen": 10120368, + "step": 618, + "train_runtime": 5023.5134, + "train_tokens_per_second": 2014.6 + }, + { + "epoch": 0.3751515151515151, + "grad_norm": 0.012727733701467514, + "learning_rate": 9.981036296273106e-05, + "loss": 0.012531593441963196, + "num_input_tokens_seen": 10136744, + "step": 619, + "train_runtime": 5031.6191, + "train_tokens_per_second": 2014.609 + }, + { + "epoch": 0.37575757575757573, + "grad_norm": 0.017707858234643936, + "learning_rate": 9.98095253215764e-05, + "loss": 0.012445853091776371, + "num_input_tokens_seen": 10153120, + "step": 620, + "train_runtime": 5039.7288, + "train_tokens_per_second": 2014.616 + }, + { + "epoch": 0.37636363636363634, + "grad_norm": 0.02095656655728817, + "learning_rate": 9.98086858380644e-05, + "loss": 0.01246220339089632, + "num_input_tokens_seen": 10169496, + "step": 621, + "train_runtime": 5047.8343, + "train_tokens_per_second": 2014.626 + }, + { + "epoch": 0.37696969696969695, + "grad_norm": 0.0194542296230793, + "learning_rate": 9.980784451222612e-05, + "loss": 0.012840205803513527, + "num_input_tokens_seen": 10185872, + "step": 622, + "train_runtime": 5055.9398, + "train_tokens_per_second": 2014.635 + }, + { + "epoch": 0.37757575757575756, + "grad_norm": 0.045034874230623245, + "learning_rate": 9.980700134409266e-05, + "loss": 0.01571492850780487, + "num_input_tokens_seen": 10202248, + "step": 623, + "train_runtime": 5064.0515, + "train_tokens_per_second": 2014.641 + }, + { + "epoch": 0.3781818181818182, + "grad_norm": 0.017045883461833, + "learning_rate": 9.980615633369522e-05, + "loss": 0.013137969188392162, + "num_input_tokens_seen": 10218624, + "step": 624, + "train_runtime": 5072.1723, + "train_tokens_per_second": 2014.644 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.01485395897179842, + "learning_rate": 9.980530948106504e-05, + "loss": 0.01213077548891306, + "num_input_tokens_seen": 10235000, + "step": 625, + "train_runtime": 5080.2996, + "train_tokens_per_second": 2014.645 + }, + { + "epoch": 0.3793939393939394, + "grad_norm": 0.014804039150476456, + "learning_rate": 9.980446078623345e-05, + "loss": 0.012899467721581459, + "num_input_tokens_seen": 10251376, + "step": 626, + "train_runtime": 5088.4306, + "train_tokens_per_second": 2014.644 + }, + { + "epoch": 0.38, + "grad_norm": 0.02651570737361908, + "learning_rate": 9.980361024923185e-05, + "loss": 0.012421991676092148, + "num_input_tokens_seen": 10267752, + "step": 627, + "train_runtime": 5096.5501, + "train_tokens_per_second": 2014.648 + }, + { + "epoch": 0.3806060606060606, + "grad_norm": 0.018621394410729408, + "learning_rate": 9.98027578700917e-05, + "loss": 0.01267517451196909, + "num_input_tokens_seen": 10284128, + "step": 628, + "train_runtime": 5104.6689, + "train_tokens_per_second": 2014.651 + }, + { + "epoch": 0.3812121212121212, + "grad_norm": 0.0398629792034626, + "learning_rate": 9.980190364884452e-05, + "loss": 0.014264339581131935, + "num_input_tokens_seen": 10300504, + "step": 629, + "train_runtime": 5112.783, + "train_tokens_per_second": 2014.657 + }, + { + "epoch": 0.38181818181818183, + "grad_norm": 0.014866935089230537, + "learning_rate": 9.98010475855219e-05, + "loss": 0.01269571203738451, + "num_input_tokens_seen": 10316880, + "step": 630, + "train_runtime": 5120.8995, + "train_tokens_per_second": 2014.662 + }, + { + "epoch": 0.38242424242424244, + "grad_norm": 0.02409232407808304, + "learning_rate": 9.980018968015552e-05, + "loss": 0.01351371593773365, + "num_input_tokens_seen": 10333256, + "step": 631, + "train_runtime": 5129.0287, + "train_tokens_per_second": 2014.661 + }, + { + "epoch": 0.38303030303030305, + "grad_norm": 0.01822233758866787, + "learning_rate": 9.979932993277711e-05, + "loss": 0.011882105842232704, + "num_input_tokens_seen": 10349632, + "step": 632, + "train_runtime": 5137.1531, + "train_tokens_per_second": 2014.663 + }, + { + "epoch": 0.3836363636363636, + "grad_norm": 0.030663253739476204, + "learning_rate": 9.979846834341846e-05, + "loss": 0.014444777742028236, + "num_input_tokens_seen": 10366008, + "step": 633, + "train_runtime": 5145.2769, + "train_tokens_per_second": 2014.665 + }, + { + "epoch": 0.3842424242424242, + "grad_norm": 0.013876891694962978, + "learning_rate": 9.979760491211146e-05, + "loss": 0.012167233973741531, + "num_input_tokens_seen": 10382384, + "step": 634, + "train_runtime": 5153.3942, + "train_tokens_per_second": 2014.669 + }, + { + "epoch": 0.38484848484848483, + "grad_norm": 0.03647688776254654, + "learning_rate": 9.979673963888801e-05, + "loss": 0.013262891210615635, + "num_input_tokens_seen": 10398760, + "step": 635, + "train_runtime": 5161.5119, + "train_tokens_per_second": 2014.673 + }, + { + "epoch": 0.38545454545454544, + "grad_norm": 0.02617211639881134, + "learning_rate": 9.979587252378013e-05, + "loss": 0.014726457186043262, + "num_input_tokens_seen": 10415136, + "step": 636, + "train_runtime": 5169.6294, + "train_tokens_per_second": 2014.678 + }, + { + "epoch": 0.38606060606060605, + "grad_norm": 0.01650061085820198, + "learning_rate": 9.979500356681992e-05, + "loss": 0.014401402324438095, + "num_input_tokens_seen": 10431512, + "step": 637, + "train_runtime": 5177.7469, + "train_tokens_per_second": 2014.682 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.017912236973643303, + "learning_rate": 9.979413276803948e-05, + "loss": 0.011410839855670929, + "num_input_tokens_seen": 10447888, + "step": 638, + "train_runtime": 5185.8616, + "train_tokens_per_second": 2014.687 + }, + { + "epoch": 0.38727272727272727, + "grad_norm": 0.02133595198392868, + "learning_rate": 9.979326012747106e-05, + "loss": 0.01264719758182764, + "num_input_tokens_seen": 10464264, + "step": 639, + "train_runtime": 5193.9789, + "train_tokens_per_second": 2014.691 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 0.011059283278882504, + "learning_rate": 9.97923856451469e-05, + "loss": 0.011714452877640724, + "num_input_tokens_seen": 10480640, + "step": 640, + "train_runtime": 5202.0952, + "train_tokens_per_second": 2014.696 + }, + { + "epoch": 0.3884848484848485, + "grad_norm": 0.01679043099284172, + "learning_rate": 9.979150932109937e-05, + "loss": 0.012356593273580074, + "num_input_tokens_seen": 10497016, + "step": 641, + "train_runtime": 5210.2129, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.3890909090909091, + "grad_norm": 0.017658302560448647, + "learning_rate": 9.979063115536086e-05, + "loss": 0.014303645119071007, + "num_input_tokens_seen": 10513392, + "step": 642, + "train_runtime": 5218.3299, + "train_tokens_per_second": 2014.704 + }, + { + "epoch": 0.3896969696969697, + "grad_norm": 0.037931594997644424, + "learning_rate": 9.978975114796389e-05, + "loss": 0.015233817510306835, + "num_input_tokens_seen": 10529768, + "step": 643, + "train_runtime": 5226.4474, + "train_tokens_per_second": 2014.709 + }, + { + "epoch": 0.3903030303030303, + "grad_norm": 0.024847477674484253, + "learning_rate": 9.978886929894096e-05, + "loss": 0.011363557539880276, + "num_input_tokens_seen": 10546144, + "step": 644, + "train_runtime": 5234.5646, + "train_tokens_per_second": 2014.713 + }, + { + "epoch": 0.39090909090909093, + "grad_norm": 0.025633033365011215, + "learning_rate": 9.978798560832474e-05, + "loss": 0.01591489464044571, + "num_input_tokens_seen": 10562520, + "step": 645, + "train_runtime": 5242.6796, + "train_tokens_per_second": 2014.718 + }, + { + "epoch": 0.39151515151515154, + "grad_norm": 0.01618288829922676, + "learning_rate": 9.978710007614786e-05, + "loss": 0.012586476281285286, + "num_input_tokens_seen": 10578896, + "step": 646, + "train_runtime": 5250.7993, + "train_tokens_per_second": 2014.721 + }, + { + "epoch": 0.39212121212121215, + "grad_norm": 0.02201761119067669, + "learning_rate": 9.978621270244313e-05, + "loss": 0.015117557719349861, + "num_input_tokens_seen": 10595272, + "step": 647, + "train_runtime": 5258.9174, + "train_tokens_per_second": 2014.725 + }, + { + "epoch": 0.3927272727272727, + "grad_norm": 0.0371362566947937, + "learning_rate": 9.978532348724335e-05, + "loss": 0.014719461090862751, + "num_input_tokens_seen": 10611648, + "step": 648, + "train_runtime": 5267.0377, + "train_tokens_per_second": 2014.728 + }, + { + "epoch": 0.3933333333333333, + "grad_norm": 0.02168305590748787, + "learning_rate": 9.978443243058139e-05, + "loss": 0.01353619247674942, + "num_input_tokens_seen": 10628024, + "step": 649, + "train_runtime": 5275.1562, + "train_tokens_per_second": 2014.732 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 0.019228238612413406, + "learning_rate": 9.978353953249022e-05, + "loss": 0.013856697827577591, + "num_input_tokens_seen": 10644400, + "step": 650, + "train_runtime": 5283.2715, + "train_tokens_per_second": 2014.737 + }, + { + "epoch": 0.39454545454545453, + "grad_norm": 0.027308976277709007, + "learning_rate": 9.978264479300289e-05, + "loss": 0.013041336089372635, + "num_input_tokens_seen": 10660776, + "step": 651, + "train_runtime": 5291.3911, + "train_tokens_per_second": 2014.74 + }, + { + "epoch": 0.39515151515151514, + "grad_norm": 0.016961168497800827, + "learning_rate": 9.978174821215247e-05, + "loss": 0.012095801532268524, + "num_input_tokens_seen": 10677152, + "step": 652, + "train_runtime": 5299.5022, + "train_tokens_per_second": 2014.746 + }, + { + "epoch": 0.39575757575757575, + "grad_norm": 0.030550425872206688, + "learning_rate": 9.978084978997212e-05, + "loss": 0.014912940561771393, + "num_input_tokens_seen": 10693528, + "step": 653, + "train_runtime": 5307.6115, + "train_tokens_per_second": 2014.753 + }, + { + "epoch": 0.39636363636363636, + "grad_norm": 0.035802144557237625, + "learning_rate": 9.977994952649509e-05, + "loss": 0.014338945969939232, + "num_input_tokens_seen": 10709904, + "step": 654, + "train_runtime": 5315.7289, + "train_tokens_per_second": 2014.757 + }, + { + "epoch": 0.396969696969697, + "grad_norm": 0.016549181193113327, + "learning_rate": 9.977904742175466e-05, + "loss": 0.013156197033822536, + "num_input_tokens_seen": 10726280, + "step": 655, + "train_runtime": 5323.8353, + "train_tokens_per_second": 2014.766 + }, + { + "epoch": 0.3975757575757576, + "grad_norm": 0.020908519625663757, + "learning_rate": 9.977814347578421e-05, + "loss": 0.012832121923565865, + "num_input_tokens_seen": 10742656, + "step": 656, + "train_runtime": 5331.9419, + "train_tokens_per_second": 2014.774 + }, + { + "epoch": 0.3981818181818182, + "grad_norm": 0.0449579656124115, + "learning_rate": 9.977723768861718e-05, + "loss": 0.011967733502388, + "num_input_tokens_seen": 10759032, + "step": 657, + "train_runtime": 5340.0518, + "train_tokens_per_second": 2014.78 + }, + { + "epoch": 0.3987878787878788, + "grad_norm": 0.01602446660399437, + "learning_rate": 9.977633006028706e-05, + "loss": 0.012816080823540688, + "num_input_tokens_seen": 10775408, + "step": 658, + "train_runtime": 5348.1597, + "train_tokens_per_second": 2014.788 + }, + { + "epoch": 0.3993939393939394, + "grad_norm": 0.028448155149817467, + "learning_rate": 9.977542059082742e-05, + "loss": 0.014847241342067719, + "num_input_tokens_seen": 10791784, + "step": 659, + "train_runtime": 5356.2671, + "train_tokens_per_second": 2014.796 + }, + { + "epoch": 0.4, + "grad_norm": 0.011783472262322903, + "learning_rate": 9.977450928027191e-05, + "loss": 0.013164190575480461, + "num_input_tokens_seen": 10808160, + "step": 660, + "train_runtime": 5364.3761, + "train_tokens_per_second": 2014.803 + }, + { + "epoch": 0.40060606060606063, + "grad_norm": 0.026984520256519318, + "learning_rate": 9.977359612865423e-05, + "loss": 0.013657883740961552, + "num_input_tokens_seen": 10824536, + "step": 661, + "train_runtime": 5372.4863, + "train_tokens_per_second": 2014.809 + }, + { + "epoch": 0.4012121212121212, + "grad_norm": 0.022077390924096107, + "learning_rate": 9.977268113600817e-05, + "loss": 0.014578605070710182, + "num_input_tokens_seen": 10840912, + "step": 662, + "train_runtime": 5380.5934, + "train_tokens_per_second": 2014.817 + }, + { + "epoch": 0.4018181818181818, + "grad_norm": 0.01575160026550293, + "learning_rate": 9.977176430236755e-05, + "loss": 0.013932663947343826, + "num_input_tokens_seen": 10857288, + "step": 663, + "train_runtime": 5388.7195, + "train_tokens_per_second": 2014.818 + }, + { + "epoch": 0.4024242424242424, + "grad_norm": 0.029406050220131874, + "learning_rate": 9.977084562776631e-05, + "loss": 0.015834983438253403, + "num_input_tokens_seen": 10873664, + "step": 664, + "train_runtime": 5396.8304, + "train_tokens_per_second": 2014.824 + }, + { + "epoch": 0.403030303030303, + "grad_norm": 0.028436392545700073, + "learning_rate": 9.976992511223839e-05, + "loss": 0.014038406312465668, + "num_input_tokens_seen": 10890040, + "step": 665, + "train_runtime": 5404.9444, + "train_tokens_per_second": 2014.829 + }, + { + "epoch": 0.4036363636363636, + "grad_norm": 0.029235292226076126, + "learning_rate": 9.976900275581789e-05, + "loss": 0.015379410237073898, + "num_input_tokens_seen": 10906416, + "step": 666, + "train_runtime": 5413.0528, + "train_tokens_per_second": 2014.836 + }, + { + "epoch": 0.40424242424242424, + "grad_norm": 0.03774306923151016, + "learning_rate": 9.976807855853886e-05, + "loss": 0.014895454980432987, + "num_input_tokens_seen": 10922792, + "step": 667, + "train_runtime": 5421.159, + "train_tokens_per_second": 2014.844 + }, + { + "epoch": 0.40484848484848485, + "grad_norm": 0.01916997693479061, + "learning_rate": 9.976715252043555e-05, + "loss": 0.0143886748701334, + "num_input_tokens_seen": 10939168, + "step": 668, + "train_runtime": 5429.2675, + "train_tokens_per_second": 2014.852 + }, + { + "epoch": 0.40545454545454546, + "grad_norm": 0.021564677357673645, + "learning_rate": 9.976622464154219e-05, + "loss": 0.013210933655500412, + "num_input_tokens_seen": 10955544, + "step": 669, + "train_runtime": 5437.3809, + "train_tokens_per_second": 2014.857 + }, + { + "epoch": 0.40606060606060607, + "grad_norm": 0.02249998040497303, + "learning_rate": 9.976529492189309e-05, + "loss": 0.013446344994008541, + "num_input_tokens_seen": 10971920, + "step": 670, + "train_runtime": 5445.4997, + "train_tokens_per_second": 2014.86 + }, + { + "epoch": 0.4066666666666667, + "grad_norm": 0.03089592047035694, + "learning_rate": 9.976436336152265e-05, + "loss": 0.014300989918410778, + "num_input_tokens_seen": 10988296, + "step": 671, + "train_runtime": 5453.615, + "train_tokens_per_second": 2014.865 + }, + { + "epoch": 0.4072727272727273, + "grad_norm": 0.01742340438067913, + "learning_rate": 9.976342996046532e-05, + "loss": 0.012858121655881405, + "num_input_tokens_seen": 11004672, + "step": 672, + "train_runtime": 5461.7321, + "train_tokens_per_second": 2014.869 + }, + { + "epoch": 0.4078787878787879, + "grad_norm": 0.0165674090385437, + "learning_rate": 9.976249471875561e-05, + "loss": 0.013976114802062511, + "num_input_tokens_seen": 11021048, + "step": 673, + "train_runtime": 5469.8479, + "train_tokens_per_second": 2014.873 + }, + { + "epoch": 0.4084848484848485, + "grad_norm": 0.013970437459647655, + "learning_rate": 9.976155763642813e-05, + "loss": 0.013127206824719906, + "num_input_tokens_seen": 11037424, + "step": 674, + "train_runtime": 5477.9644, + "train_tokens_per_second": 2014.877 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 0.028073744848370552, + "learning_rate": 9.976061871351756e-05, + "loss": 0.013469989411532879, + "num_input_tokens_seen": 11053800, + "step": 675, + "train_runtime": 5486.0804, + "train_tokens_per_second": 2014.881 + }, + { + "epoch": 0.40969696969696967, + "grad_norm": 0.02016565017402172, + "learning_rate": 9.975967795005859e-05, + "loss": 0.013997921720147133, + "num_input_tokens_seen": 11070176, + "step": 676, + "train_runtime": 5494.197, + "train_tokens_per_second": 2014.885 + }, + { + "epoch": 0.4103030303030303, + "grad_norm": 0.01767519861459732, + "learning_rate": 9.975873534608604e-05, + "loss": 0.013824408873915672, + "num_input_tokens_seen": 11086552, + "step": 677, + "train_runtime": 5502.3132, + "train_tokens_per_second": 2014.889 + }, + { + "epoch": 0.4109090909090909, + "grad_norm": 0.02294917404651642, + "learning_rate": 9.975779090163478e-05, + "loss": 0.013364237733185291, + "num_input_tokens_seen": 11102928, + "step": 678, + "train_runtime": 5510.4298, + "train_tokens_per_second": 2014.893 + }, + { + "epoch": 0.4115151515151515, + "grad_norm": 0.015453618951141834, + "learning_rate": 9.975684461673972e-05, + "loss": 0.011895030736923218, + "num_input_tokens_seen": 11119304, + "step": 679, + "train_runtime": 5518.5467, + "train_tokens_per_second": 2014.897 + }, + { + "epoch": 0.4121212121212121, + "grad_norm": 0.02744610421359539, + "learning_rate": 9.975589649143588e-05, + "loss": 0.01399244274944067, + "num_input_tokens_seen": 11135680, + "step": 680, + "train_runtime": 5526.6634, + "train_tokens_per_second": 2014.901 + }, + { + "epoch": 0.4127272727272727, + "grad_norm": 0.0141525249928236, + "learning_rate": 9.975494652575832e-05, + "loss": 0.012226445600390434, + "num_input_tokens_seen": 11152056, + "step": 681, + "train_runtime": 5534.7831, + "train_tokens_per_second": 2014.904 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.05674010142683983, + "learning_rate": 9.975399471974218e-05, + "loss": 0.013092868961393833, + "num_input_tokens_seen": 11168432, + "step": 682, + "train_runtime": 5542.8995, + "train_tokens_per_second": 2014.908 + }, + { + "epoch": 0.41393939393939394, + "grad_norm": 0.014718937687575817, + "learning_rate": 9.975304107342268e-05, + "loss": 0.012982090935111046, + "num_input_tokens_seen": 11184808, + "step": 683, + "train_runtime": 5551.0179, + "train_tokens_per_second": 2014.911 + }, + { + "epoch": 0.41454545454545455, + "grad_norm": 0.017596984282135963, + "learning_rate": 9.975208558683508e-05, + "loss": 0.013058310374617577, + "num_input_tokens_seen": 11201184, + "step": 684, + "train_runtime": 5559.1335, + "train_tokens_per_second": 2014.915 + }, + { + "epoch": 0.41515151515151516, + "grad_norm": 0.05556584894657135, + "learning_rate": 9.975112826001471e-05, + "loss": 0.013223481364548206, + "num_input_tokens_seen": 11217560, + "step": 685, + "train_runtime": 5567.2483, + "train_tokens_per_second": 2014.92 + }, + { + "epoch": 0.41575757575757577, + "grad_norm": 0.039875905960798264, + "learning_rate": 9.9750169092997e-05, + "loss": 0.016192132607102394, + "num_input_tokens_seen": 11233936, + "step": 686, + "train_runtime": 5575.3629, + "train_tokens_per_second": 2014.925 + }, + { + "epoch": 0.4163636363636364, + "grad_norm": 0.04174409061670303, + "learning_rate": 9.97492080858174e-05, + "loss": 0.013733956962823868, + "num_input_tokens_seen": 11250312, + "step": 687, + "train_runtime": 5583.4823, + "train_tokens_per_second": 2014.928 + }, + { + "epoch": 0.416969696969697, + "grad_norm": 0.018462834879755974, + "learning_rate": 9.97482452385115e-05, + "loss": 0.011940184980630875, + "num_input_tokens_seen": 11266688, + "step": 688, + "train_runtime": 5591.5984, + "train_tokens_per_second": 2014.932 + }, + { + "epoch": 0.4175757575757576, + "grad_norm": 0.021226534619927406, + "learning_rate": 9.974728055111487e-05, + "loss": 0.013460342772305012, + "num_input_tokens_seen": 11283064, + "step": 689, + "train_runtime": 5599.7136, + "train_tokens_per_second": 2014.936 + }, + { + "epoch": 0.41818181818181815, + "grad_norm": 0.017722534015774727, + "learning_rate": 9.974631402366322e-05, + "loss": 0.013120359741151333, + "num_input_tokens_seen": 11299440, + "step": 690, + "train_runtime": 5607.8302, + "train_tokens_per_second": 2014.94 + }, + { + "epoch": 0.41878787878787876, + "grad_norm": 0.04932510480284691, + "learning_rate": 9.97453456561923e-05, + "loss": 0.014747078530490398, + "num_input_tokens_seen": 11315816, + "step": 691, + "train_runtime": 5615.9471, + "train_tokens_per_second": 2014.943 + }, + { + "epoch": 0.4193939393939394, + "grad_norm": 0.014801602810621262, + "learning_rate": 9.974437544873791e-05, + "loss": 0.012634863145649433, + "num_input_tokens_seen": 11332192, + "step": 692, + "train_runtime": 5624.0643, + "train_tokens_per_second": 2014.947 + }, + { + "epoch": 0.42, + "grad_norm": 0.01846308819949627, + "learning_rate": 9.974340340133595e-05, + "loss": 0.013980153016746044, + "num_input_tokens_seen": 11348568, + "step": 693, + "train_runtime": 5632.1816, + "train_tokens_per_second": 2014.951 + }, + { + "epoch": 0.4206060606060606, + "grad_norm": 0.022268032655119896, + "learning_rate": 9.974242951402235e-05, + "loss": 0.013369940221309662, + "num_input_tokens_seen": 11364944, + "step": 694, + "train_runtime": 5640.2993, + "train_tokens_per_second": 2014.954 + }, + { + "epoch": 0.4212121212121212, + "grad_norm": 0.017928361892700195, + "learning_rate": 9.974145378683318e-05, + "loss": 0.012236877344548702, + "num_input_tokens_seen": 11381320, + "step": 695, + "train_runtime": 5648.4187, + "train_tokens_per_second": 2014.957 + }, + { + "epoch": 0.4218181818181818, + "grad_norm": 0.026991484686732292, + "learning_rate": 9.974047621980447e-05, + "loss": 0.013161352835595608, + "num_input_tokens_seen": 11397696, + "step": 696, + "train_runtime": 5656.5432, + "train_tokens_per_second": 2014.958 + }, + { + "epoch": 0.4224242424242424, + "grad_norm": 0.016671424731612206, + "learning_rate": 9.973949681297244e-05, + "loss": 0.013532438315451145, + "num_input_tokens_seen": 11414072, + "step": 697, + "train_runtime": 5664.6671, + "train_tokens_per_second": 2014.959 + }, + { + "epoch": 0.42303030303030303, + "grad_norm": 0.04440519958734512, + "learning_rate": 9.973851556637326e-05, + "loss": 0.014023078605532646, + "num_input_tokens_seen": 11430448, + "step": 698, + "train_runtime": 5672.7922, + "train_tokens_per_second": 2014.96 + }, + { + "epoch": 0.42363636363636364, + "grad_norm": 0.01818687841296196, + "learning_rate": 9.973753248004326e-05, + "loss": 0.012776060961186886, + "num_input_tokens_seen": 11446824, + "step": 699, + "train_runtime": 5680.9115, + "train_tokens_per_second": 2014.963 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 0.03709911182522774, + "learning_rate": 9.97365475540188e-05, + "loss": 0.013938689604401588, + "num_input_tokens_seen": 11463200, + "step": 700, + "train_runtime": 5689.0323, + "train_tokens_per_second": 2014.965 + }, + { + "epoch": 0.42484848484848486, + "grad_norm": 0.02871977910399437, + "learning_rate": 9.97355607883363e-05, + "loss": 0.015867041423916817, + "num_input_tokens_seen": 11479576, + "step": 701, + "train_runtime": 5698.2647, + "train_tokens_per_second": 2014.574 + }, + { + "epoch": 0.4254545454545455, + "grad_norm": 0.023145193234086037, + "learning_rate": 9.973457218303226e-05, + "loss": 0.01401555072516203, + "num_input_tokens_seen": 11495952, + "step": 702, + "train_runtime": 5706.3816, + "train_tokens_per_second": 2014.578 + }, + { + "epoch": 0.4260606060606061, + "grad_norm": 0.015238692052662373, + "learning_rate": 9.973358173814324e-05, + "loss": 0.01140027865767479, + "num_input_tokens_seen": 11512328, + "step": 703, + "train_runtime": 5714.5032, + "train_tokens_per_second": 2014.581 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.017513658851385117, + "learning_rate": 9.97325894537059e-05, + "loss": 0.01290590688586235, + "num_input_tokens_seen": 11528704, + "step": 704, + "train_runtime": 5722.6292, + "train_tokens_per_second": 2014.582 + }, + { + "epoch": 0.42727272727272725, + "grad_norm": 0.02398119866847992, + "learning_rate": 9.973159532975691e-05, + "loss": 0.013042651116847992, + "num_input_tokens_seen": 11545080, + "step": 705, + "train_runtime": 5730.753, + "train_tokens_per_second": 2014.583 + }, + { + "epoch": 0.42787878787878786, + "grad_norm": 0.01669715717434883, + "learning_rate": 9.973059936633306e-05, + "loss": 0.011862633749842644, + "num_input_tokens_seen": 11561456, + "step": 706, + "train_runtime": 5738.8701, + "train_tokens_per_second": 2014.588 + }, + { + "epoch": 0.42848484848484847, + "grad_norm": 0.0743919089436531, + "learning_rate": 9.97296015634712e-05, + "loss": 0.012939982116222382, + "num_input_tokens_seen": 11577832, + "step": 707, + "train_runtime": 5746.9879, + "train_tokens_per_second": 2014.591 + }, + { + "epoch": 0.4290909090909091, + "grad_norm": 0.014302635565400124, + "learning_rate": 9.972860192120821e-05, + "loss": 0.01308290846645832, + "num_input_tokens_seen": 11594208, + "step": 708, + "train_runtime": 5755.1051, + "train_tokens_per_second": 2014.595 + }, + { + "epoch": 0.4296969696969697, + "grad_norm": 0.03461941331624985, + "learning_rate": 9.972760043958109e-05, + "loss": 0.01451612077653408, + "num_input_tokens_seen": 11610584, + "step": 709, + "train_runtime": 5763.2288, + "train_tokens_per_second": 2014.597 + }, + { + "epoch": 0.4303030303030303, + "grad_norm": 0.026271218433976173, + "learning_rate": 9.972659711862687e-05, + "loss": 0.012233047746121883, + "num_input_tokens_seen": 11626960, + "step": 710, + "train_runtime": 5771.3444, + "train_tokens_per_second": 2014.602 + }, + { + "epoch": 0.4309090909090909, + "grad_norm": 0.03146032616496086, + "learning_rate": 9.972559195838263e-05, + "loss": 0.012203723192214966, + "num_input_tokens_seen": 11643336, + "step": 711, + "train_runtime": 5779.4615, + "train_tokens_per_second": 2014.606 + }, + { + "epoch": 0.4315151515151515, + "grad_norm": 0.023236479610204697, + "learning_rate": 9.97245849588856e-05, + "loss": 0.014339424669742584, + "num_input_tokens_seen": 11659712, + "step": 712, + "train_runtime": 5787.5789, + "train_tokens_per_second": 2014.61 + }, + { + "epoch": 0.43212121212121213, + "grad_norm": 0.016745924949645996, + "learning_rate": 9.972357612017302e-05, + "loss": 0.012629512697458267, + "num_input_tokens_seen": 11676088, + "step": 713, + "train_runtime": 5795.6981, + "train_tokens_per_second": 2014.613 + }, + { + "epoch": 0.43272727272727274, + "grad_norm": 0.028602320700883865, + "learning_rate": 9.972256544228217e-05, + "loss": 0.01239441242069006, + "num_input_tokens_seen": 11692464, + "step": 714, + "train_runtime": 5803.8136, + "train_tokens_per_second": 2014.617 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 0.04347382113337517, + "learning_rate": 9.972155292525046e-05, + "loss": 0.013399597257375717, + "num_input_tokens_seen": 11708840, + "step": 715, + "train_runtime": 5811.9326, + "train_tokens_per_second": 2014.621 + }, + { + "epoch": 0.43393939393939396, + "grad_norm": 0.027413364499807358, + "learning_rate": 9.972053856911534e-05, + "loss": 0.014752673916518688, + "num_input_tokens_seen": 11725216, + "step": 716, + "train_runtime": 5820.0498, + "train_tokens_per_second": 2014.625 + }, + { + "epoch": 0.43454545454545457, + "grad_norm": 0.034208860248327255, + "learning_rate": 9.971952237391433e-05, + "loss": 0.013670345768332481, + "num_input_tokens_seen": 11741592, + "step": 717, + "train_runtime": 5828.1669, + "train_tokens_per_second": 2014.629 + }, + { + "epoch": 0.4351515151515152, + "grad_norm": 0.08834357559680939, + "learning_rate": 9.971850433968499e-05, + "loss": 0.01636839471757412, + "num_input_tokens_seen": 11757968, + "step": 718, + "train_runtime": 5836.2889, + "train_tokens_per_second": 2014.631 + }, + { + "epoch": 0.43575757575757573, + "grad_norm": 0.09180225431919098, + "learning_rate": 9.971748446646503e-05, + "loss": 0.013547438196837902, + "num_input_tokens_seen": 11774344, + "step": 719, + "train_runtime": 5844.4057, + "train_tokens_per_second": 2014.635 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 0.021431786939501762, + "learning_rate": 9.971646275429211e-05, + "loss": 0.014424419030547142, + "num_input_tokens_seen": 11790720, + "step": 720, + "train_runtime": 5852.5291, + "train_tokens_per_second": 2014.637 + }, + { + "epoch": 0.43696969696969695, + "grad_norm": 0.014504344202578068, + "learning_rate": 9.971543920320407e-05, + "loss": 0.012794758193194866, + "num_input_tokens_seen": 11807096, + "step": 721, + "train_runtime": 5860.6452, + "train_tokens_per_second": 2014.641 + }, + { + "epoch": 0.43757575757575756, + "grad_norm": 0.04303886368870735, + "learning_rate": 9.971441381323874e-05, + "loss": 0.014037848450243473, + "num_input_tokens_seen": 11823472, + "step": 722, + "train_runtime": 5868.7615, + "train_tokens_per_second": 2014.645 + }, + { + "epoch": 0.4381818181818182, + "grad_norm": 0.028946641832590103, + "learning_rate": 9.971338658443406e-05, + "loss": 0.012954017147421837, + "num_input_tokens_seen": 11839848, + "step": 723, + "train_runtime": 5876.878, + "train_tokens_per_second": 2014.649 + }, + { + "epoch": 0.4387878787878788, + "grad_norm": 0.02165861800312996, + "learning_rate": 9.971235751682802e-05, + "loss": 0.012219181284308434, + "num_input_tokens_seen": 11856224, + "step": 724, + "train_runtime": 5884.9934, + "train_tokens_per_second": 2014.654 + }, + { + "epoch": 0.4393939393939394, + "grad_norm": 0.023574933409690857, + "learning_rate": 9.971132661045868e-05, + "loss": 0.014860106632113457, + "num_input_tokens_seen": 11872600, + "step": 725, + "train_runtime": 5893.1105, + "train_tokens_per_second": 2014.658 + }, + { + "epoch": 0.44, + "grad_norm": 0.05360223352909088, + "learning_rate": 9.971029386536419e-05, + "loss": 0.014855952933430672, + "num_input_tokens_seen": 11888976, + "step": 726, + "train_runtime": 5901.2285, + "train_tokens_per_second": 2014.661 + }, + { + "epoch": 0.4406060606060606, + "grad_norm": 0.03671532869338989, + "learning_rate": 9.970925928158274e-05, + "loss": 0.015136584639549255, + "num_input_tokens_seen": 11905352, + "step": 727, + "train_runtime": 5909.3465, + "train_tokens_per_second": 2014.665 + }, + { + "epoch": 0.4412121212121212, + "grad_norm": 0.012548093684017658, + "learning_rate": 9.970822285915257e-05, + "loss": 0.012122916989028454, + "num_input_tokens_seen": 11921728, + "step": 728, + "train_runtime": 5917.4638, + "train_tokens_per_second": 2014.669 + }, + { + "epoch": 0.44181818181818183, + "grad_norm": 0.02257922850549221, + "learning_rate": 9.970718459811206e-05, + "loss": 0.013802756555378437, + "num_input_tokens_seen": 11938104, + "step": 729, + "train_runtime": 5925.5783, + "train_tokens_per_second": 2014.673 + }, + { + "epoch": 0.44242424242424244, + "grad_norm": 0.014075133018195629, + "learning_rate": 9.97061444984996e-05, + "loss": 0.012838860973715782, + "num_input_tokens_seen": 11954480, + "step": 730, + "train_runtime": 5933.6937, + "train_tokens_per_second": 2014.678 + }, + { + "epoch": 0.44303030303030305, + "grad_norm": 0.022020021453499794, + "learning_rate": 9.970510256035364e-05, + "loss": 0.01375649869441986, + "num_input_tokens_seen": 11970856, + "step": 731, + "train_runtime": 5941.8106, + "train_tokens_per_second": 2014.682 + }, + { + "epoch": 0.44363636363636366, + "grad_norm": 0.01787860319018364, + "learning_rate": 9.970405878371273e-05, + "loss": 0.012008238583803177, + "num_input_tokens_seen": 11987232, + "step": 732, + "train_runtime": 5949.9292, + "train_tokens_per_second": 2014.685 + }, + { + "epoch": 0.4442424242424242, + "grad_norm": 0.019049983471632004, + "learning_rate": 9.970301316861548e-05, + "loss": 0.012502388097345829, + "num_input_tokens_seen": 12003608, + "step": 733, + "train_runtime": 5958.0503, + "train_tokens_per_second": 2014.687 + }, + { + "epoch": 0.4448484848484848, + "grad_norm": 0.02835710346698761, + "learning_rate": 9.970196571510057e-05, + "loss": 0.012223845347762108, + "num_input_tokens_seen": 12019984, + "step": 734, + "train_runtime": 5966.1707, + "train_tokens_per_second": 2014.69 + }, + { + "epoch": 0.44545454545454544, + "grad_norm": 0.04534858092665672, + "learning_rate": 9.970091642320674e-05, + "loss": 0.01531003974378109, + "num_input_tokens_seen": 12036360, + "step": 735, + "train_runtime": 5974.2918, + "train_tokens_per_second": 2014.692 + }, + { + "epoch": 0.44606060606060605, + "grad_norm": 0.02770829014480114, + "learning_rate": 9.96998652929728e-05, + "loss": 0.014202866703271866, + "num_input_tokens_seen": 12052736, + "step": 736, + "train_runtime": 5982.4163, + "train_tokens_per_second": 2014.694 + }, + { + "epoch": 0.44666666666666666, + "grad_norm": 0.01627975143492222, + "learning_rate": 9.969881232443761e-05, + "loss": 0.013593195006251335, + "num_input_tokens_seen": 12069112, + "step": 737, + "train_runtime": 5990.5422, + "train_tokens_per_second": 2014.694 + }, + { + "epoch": 0.44727272727272727, + "grad_norm": 0.02013089507818222, + "learning_rate": 9.969775751764015e-05, + "loss": 0.012935129925608635, + "num_input_tokens_seen": 12085488, + "step": 738, + "train_runtime": 5998.6638, + "train_tokens_per_second": 2014.697 + }, + { + "epoch": 0.4478787878787879, + "grad_norm": 0.03128223493695259, + "learning_rate": 9.969670087261942e-05, + "loss": 0.014752635732293129, + "num_input_tokens_seen": 12101864, + "step": 739, + "train_runtime": 6006.7832, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.4484848484848485, + "grad_norm": 0.08356563001871109, + "learning_rate": 9.969564238941452e-05, + "loss": 0.012013277038931847, + "num_input_tokens_seen": 12118240, + "step": 740, + "train_runtime": 6014.9037, + "train_tokens_per_second": 2014.702 + }, + { + "epoch": 0.4490909090909091, + "grad_norm": 0.04240264743566513, + "learning_rate": 9.969458206806456e-05, + "loss": 0.013846787624061108, + "num_input_tokens_seen": 12134616, + "step": 741, + "train_runtime": 6023.0287, + "train_tokens_per_second": 2014.703 + }, + { + "epoch": 0.4496969696969697, + "grad_norm": 0.020833732560276985, + "learning_rate": 9.96935199086088e-05, + "loss": 0.014301668852567673, + "num_input_tokens_seen": 12150992, + "step": 742, + "train_runtime": 6031.1472, + "train_tokens_per_second": 2014.707 + }, + { + "epoch": 0.4503030303030303, + "grad_norm": 0.021045729517936707, + "learning_rate": 9.969245591108652e-05, + "loss": 0.013184930197894573, + "num_input_tokens_seen": 12167368, + "step": 743, + "train_runtime": 6039.2669, + "train_tokens_per_second": 2014.709 + }, + { + "epoch": 0.4509090909090909, + "grad_norm": 0.014139235951006413, + "learning_rate": 9.969139007553705e-05, + "loss": 0.013327041640877724, + "num_input_tokens_seen": 12183744, + "step": 744, + "train_runtime": 6047.3846, + "train_tokens_per_second": 2014.713 + }, + { + "epoch": 0.45151515151515154, + "grad_norm": 0.7923178672790527, + "learning_rate": 9.969032240199983e-05, + "loss": 0.012914719060063362, + "num_input_tokens_seen": 12200120, + "step": 745, + "train_runtime": 6055.5018, + "train_tokens_per_second": 2014.717 + }, + { + "epoch": 0.45212121212121215, + "grad_norm": 0.033203721046447754, + "learning_rate": 9.968925289051436e-05, + "loss": 0.013039352372288704, + "num_input_tokens_seen": 12216496, + "step": 746, + "train_runtime": 6063.6194, + "train_tokens_per_second": 2014.72 + }, + { + "epoch": 0.4527272727272727, + "grad_norm": 0.02019328624010086, + "learning_rate": 9.96881815411202e-05, + "loss": 0.012438374571502209, + "num_input_tokens_seen": 12232872, + "step": 747, + "train_runtime": 6071.7363, + "train_tokens_per_second": 2014.724 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.03482190519571304, + "learning_rate": 9.968710835385696e-05, + "loss": 0.015620945952832699, + "num_input_tokens_seen": 12249248, + "step": 748, + "train_runtime": 6079.8541, + "train_tokens_per_second": 2014.727 + }, + { + "epoch": 0.4539393939393939, + "grad_norm": 0.053270891308784485, + "learning_rate": 9.968603332876434e-05, + "loss": 0.012819363735616207, + "num_input_tokens_seen": 12265624, + "step": 749, + "train_runtime": 6087.9704, + "train_tokens_per_second": 2014.731 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.013719640672206879, + "learning_rate": 9.968495646588211e-05, + "loss": 0.013314586132764816, + "num_input_tokens_seen": 12282000, + "step": 750, + "train_runtime": 6096.0915, + "train_tokens_per_second": 2014.734 + }, + { + "epoch": 0.45515151515151514, + "grad_norm": 0.020413396880030632, + "learning_rate": 9.96838777652501e-05, + "loss": 0.012559941038489342, + "num_input_tokens_seen": 12298376, + "step": 751, + "train_runtime": 6104.2072, + "train_tokens_per_second": 2014.738 + }, + { + "epoch": 0.45575757575757575, + "grad_norm": 0.02567451260983944, + "learning_rate": 9.968279722690819e-05, + "loss": 0.013514967635273933, + "num_input_tokens_seen": 12314752, + "step": 752, + "train_runtime": 6112.3314, + "train_tokens_per_second": 2014.739 + }, + { + "epoch": 0.45636363636363636, + "grad_norm": 0.015409312210977077, + "learning_rate": 9.968171485089638e-05, + "loss": 0.012808658182621002, + "num_input_tokens_seen": 12331128, + "step": 753, + "train_runtime": 6120.4491, + "train_tokens_per_second": 2014.742 + }, + { + "epoch": 0.45696969696969697, + "grad_norm": 0.02095264568924904, + "learning_rate": 9.968063063725468e-05, + "loss": 0.014174265787005424, + "num_input_tokens_seen": 12347504, + "step": 754, + "train_runtime": 6128.5679, + "train_tokens_per_second": 2014.745 + }, + { + "epoch": 0.4575757575757576, + "grad_norm": 0.020611796528100967, + "learning_rate": 9.96795445860232e-05, + "loss": 0.011881090700626373, + "num_input_tokens_seen": 12363880, + "step": 755, + "train_runtime": 6136.6868, + "train_tokens_per_second": 2014.748 + }, + { + "epoch": 0.4581818181818182, + "grad_norm": 0.018243003636598587, + "learning_rate": 9.967845669724212e-05, + "loss": 0.012596143409609795, + "num_input_tokens_seen": 12380256, + "step": 756, + "train_runtime": 6144.8042, + "train_tokens_per_second": 2014.752 + }, + { + "epoch": 0.4587878787878788, + "grad_norm": 0.016125964000821114, + "learning_rate": 9.967736697095167e-05, + "loss": 0.013951683416962624, + "num_input_tokens_seen": 12396632, + "step": 757, + "train_runtime": 6152.9288, + "train_tokens_per_second": 2014.753 + }, + { + "epoch": 0.4593939393939394, + "grad_norm": 0.019307058304548264, + "learning_rate": 9.967627540719215e-05, + "loss": 0.013310304842889309, + "num_input_tokens_seen": 12413008, + "step": 758, + "train_runtime": 6161.047, + "train_tokens_per_second": 2014.756 + }, + { + "epoch": 0.46, + "grad_norm": 0.0198148675262928, + "learning_rate": 9.967518200600396e-05, + "loss": 0.013110843487083912, + "num_input_tokens_seen": 12429384, + "step": 759, + "train_runtime": 6169.1657, + "train_tokens_per_second": 2014.759 + }, + { + "epoch": 0.46060606060606063, + "grad_norm": 0.02929919771850109, + "learning_rate": 9.967408676742751e-05, + "loss": 0.015073966234922409, + "num_input_tokens_seen": 12445760, + "step": 760, + "train_runtime": 6177.2831, + "train_tokens_per_second": 2014.763 + }, + { + "epoch": 0.4612121212121212, + "grad_norm": 0.015382593497633934, + "learning_rate": 9.967298969150334e-05, + "loss": 0.012051237747073174, + "num_input_tokens_seen": 12462136, + "step": 761, + "train_runtime": 6185.4001, + "train_tokens_per_second": 2014.766 + }, + { + "epoch": 0.4618181818181818, + "grad_norm": 0.02371540106832981, + "learning_rate": 9.9671890778272e-05, + "loss": 0.015372917987406254, + "num_input_tokens_seen": 12478512, + "step": 762, + "train_runtime": 6193.5166, + "train_tokens_per_second": 2014.77 + }, + { + "epoch": 0.4624242424242424, + "grad_norm": 0.02178136259317398, + "learning_rate": 9.967079002777417e-05, + "loss": 0.013376548886299133, + "num_input_tokens_seen": 12494888, + "step": 763, + "train_runtime": 6201.6342, + "train_tokens_per_second": 2014.773 + }, + { + "epoch": 0.463030303030303, + "grad_norm": 0.01065842155367136, + "learning_rate": 9.966968744005052e-05, + "loss": 0.012219875119626522, + "num_input_tokens_seen": 12511264, + "step": 764, + "train_runtime": 6209.7525, + "train_tokens_per_second": 2014.777 + }, + { + "epoch": 0.4636363636363636, + "grad_norm": 0.013287489302456379, + "learning_rate": 9.966858301514188e-05, + "loss": 0.011538016609847546, + "num_input_tokens_seen": 12527640, + "step": 765, + "train_runtime": 6217.8691, + "train_tokens_per_second": 2014.78 + }, + { + "epoch": 0.46424242424242423, + "grad_norm": 0.013882887549698353, + "learning_rate": 9.966747675308907e-05, + "loss": 0.012349468655884266, + "num_input_tokens_seen": 12544016, + "step": 766, + "train_runtime": 6225.9864, + "train_tokens_per_second": 2014.784 + }, + { + "epoch": 0.46484848484848484, + "grad_norm": 0.018599022179841995, + "learning_rate": 9.966636865393301e-05, + "loss": 0.012744025327265263, + "num_input_tokens_seen": 12560392, + "step": 767, + "train_runtime": 6234.1026, + "train_tokens_per_second": 2014.787 + }, + { + "epoch": 0.46545454545454545, + "grad_norm": 0.012023529969155788, + "learning_rate": 9.966525871771472e-05, + "loss": 0.01199167687445879, + "num_input_tokens_seen": 12576768, + "step": 768, + "train_runtime": 6242.2199, + "train_tokens_per_second": 2014.791 + }, + { + "epoch": 0.46606060606060606, + "grad_norm": 0.01650414615869522, + "learning_rate": 9.966414694447521e-05, + "loss": 0.012927195057272911, + "num_input_tokens_seen": 12593144, + "step": 769, + "train_runtime": 6250.3375, + "train_tokens_per_second": 2014.794 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.034085940569639206, + "learning_rate": 9.966303333425563e-05, + "loss": 0.012202315032482147, + "num_input_tokens_seen": 12609520, + "step": 770, + "train_runtime": 6258.4532, + "train_tokens_per_second": 2014.798 + }, + { + "epoch": 0.4672727272727273, + "grad_norm": 0.013827620074152946, + "learning_rate": 9.966191788709716e-05, + "loss": 0.013147883117198944, + "num_input_tokens_seen": 12625896, + "step": 771, + "train_runtime": 6266.5701, + "train_tokens_per_second": 2014.802 + }, + { + "epoch": 0.4678787878787879, + "grad_norm": 0.0181913860142231, + "learning_rate": 9.966080060304105e-05, + "loss": 0.013427773490548134, + "num_input_tokens_seen": 12642272, + "step": 772, + "train_runtime": 6274.6886, + "train_tokens_per_second": 2014.805 + }, + { + "epoch": 0.4684848484848485, + "grad_norm": 0.07882755249738693, + "learning_rate": 9.965968148212864e-05, + "loss": 0.017075341194868088, + "num_input_tokens_seen": 12658648, + "step": 773, + "train_runtime": 6282.8062, + "train_tokens_per_second": 2014.808 + }, + { + "epoch": 0.4690909090909091, + "grad_norm": 0.007325070444494486, + "learning_rate": 9.965856052440132e-05, + "loss": 0.011197097599506378, + "num_input_tokens_seen": 12675024, + "step": 774, + "train_runtime": 6290.929, + "train_tokens_per_second": 2014.81 + }, + { + "epoch": 0.4696969696969697, + "grad_norm": 0.030580898746848106, + "learning_rate": 9.965743772990054e-05, + "loss": 0.012808885425329208, + "num_input_tokens_seen": 12691400, + "step": 775, + "train_runtime": 6299.0468, + "train_tokens_per_second": 2014.813 + }, + { + "epoch": 0.4703030303030303, + "grad_norm": 0.027805298566818237, + "learning_rate": 9.965631309866788e-05, + "loss": 0.012805595062673092, + "num_input_tokens_seen": 12707776, + "step": 776, + "train_runtime": 6307.1647, + "train_tokens_per_second": 2014.816 + }, + { + "epoch": 0.4709090909090909, + "grad_norm": 0.01449024397879839, + "learning_rate": 9.965518663074487e-05, + "loss": 0.013110213913023472, + "num_input_tokens_seen": 12724152, + "step": 777, + "train_runtime": 6315.2824, + "train_tokens_per_second": 2014.819 + }, + { + "epoch": 0.4715151515151515, + "grad_norm": 0.013304144144058228, + "learning_rate": 9.96540583261732e-05, + "loss": 0.012666239403188229, + "num_input_tokens_seen": 12740528, + "step": 778, + "train_runtime": 6323.3995, + "train_tokens_per_second": 2014.823 + }, + { + "epoch": 0.4721212121212121, + "grad_norm": 0.01922908052802086, + "learning_rate": 9.965292818499463e-05, + "loss": 0.012315730564296246, + "num_input_tokens_seen": 12756904, + "step": 779, + "train_runtime": 6331.5179, + "train_tokens_per_second": 2014.826 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 0.042174387723207474, + "learning_rate": 9.965179620725093e-05, + "loss": 0.015461819246411324, + "num_input_tokens_seen": 12773280, + "step": 780, + "train_runtime": 6339.636, + "train_tokens_per_second": 2014.829 + }, + { + "epoch": 0.47333333333333333, + "grad_norm": 0.02851157635450363, + "learning_rate": 9.965066239298398e-05, + "loss": 0.012629134580492973, + "num_input_tokens_seen": 12789656, + "step": 781, + "train_runtime": 6347.7537, + "train_tokens_per_second": 2014.832 + }, + { + "epoch": 0.47393939393939394, + "grad_norm": 0.10219256579875946, + "learning_rate": 9.96495267422357e-05, + "loss": 0.014288711361587048, + "num_input_tokens_seen": 12806032, + "step": 782, + "train_runtime": 6355.8718, + "train_tokens_per_second": 2014.835 + }, + { + "epoch": 0.47454545454545455, + "grad_norm": 0.012413585558533669, + "learning_rate": 9.964838925504816e-05, + "loss": 0.012026645243167877, + "num_input_tokens_seen": 12822408, + "step": 783, + "train_runtime": 6363.9912, + "train_tokens_per_second": 2014.837 + }, + { + "epoch": 0.47515151515151516, + "grad_norm": 0.019600611180067062, + "learning_rate": 9.964724993146335e-05, + "loss": 0.012678924947977066, + "num_input_tokens_seen": 12838784, + "step": 784, + "train_runtime": 6372.1105, + "train_tokens_per_second": 2014.84 + }, + { + "epoch": 0.47575757575757577, + "grad_norm": 0.021761193871498108, + "learning_rate": 9.964610877152346e-05, + "loss": 0.012011994607746601, + "num_input_tokens_seen": 12855160, + "step": 785, + "train_runtime": 6380.2296, + "train_tokens_per_second": 2014.843 + }, + { + "epoch": 0.4763636363636364, + "grad_norm": 0.016564620658755302, + "learning_rate": 9.964496577527069e-05, + "loss": 0.01261131465435028, + "num_input_tokens_seen": 12871536, + "step": 786, + "train_runtime": 6388.348, + "train_tokens_per_second": 2014.846 + }, + { + "epoch": 0.476969696969697, + "grad_norm": 0.009226581081748009, + "learning_rate": 9.964382094274732e-05, + "loss": 0.012591596692800522, + "num_input_tokens_seen": 12887912, + "step": 787, + "train_runtime": 6396.4664, + "train_tokens_per_second": 2014.849 + }, + { + "epoch": 0.4775757575757576, + "grad_norm": 0.017386259511113167, + "learning_rate": 9.964267427399568e-05, + "loss": 0.012936464510858059, + "num_input_tokens_seen": 12904288, + "step": 788, + "train_runtime": 6404.5838, + "train_tokens_per_second": 2014.852 + }, + { + "epoch": 0.4781818181818182, + "grad_norm": 0.023312706500291824, + "learning_rate": 9.964152576905819e-05, + "loss": 0.012287257239222527, + "num_input_tokens_seen": 12920664, + "step": 789, + "train_runtime": 6412.7014, + "train_tokens_per_second": 2014.855 + }, + { + "epoch": 0.47878787878787876, + "grad_norm": 0.03517942875623703, + "learning_rate": 9.964037542797735e-05, + "loss": 0.014132940210402012, + "num_input_tokens_seen": 12937040, + "step": 790, + "train_runtime": 6420.8203, + "train_tokens_per_second": 2014.858 + }, + { + "epoch": 0.4793939393939394, + "grad_norm": 0.03619959577918053, + "learning_rate": 9.963922325079567e-05, + "loss": 0.014860968105494976, + "num_input_tokens_seen": 12953416, + "step": 791, + "train_runtime": 6428.9382, + "train_tokens_per_second": 2014.861 + }, + { + "epoch": 0.48, + "grad_norm": 0.03862093389034271, + "learning_rate": 9.96380692375558e-05, + "loss": 0.012788870371878147, + "num_input_tokens_seen": 12969792, + "step": 792, + "train_runtime": 6437.0552, + "train_tokens_per_second": 2014.864 + }, + { + "epoch": 0.4806060606060606, + "grad_norm": 0.014955422841012478, + "learning_rate": 9.963691338830044e-05, + "loss": 0.012180945836007595, + "num_input_tokens_seen": 12986168, + "step": 793, + "train_runtime": 6445.1731, + "train_tokens_per_second": 2014.867 + }, + { + "epoch": 0.4812121212121212, + "grad_norm": 0.02255093678832054, + "learning_rate": 9.963575570307228e-05, + "loss": 0.015188801102340221, + "num_input_tokens_seen": 13002544, + "step": 794, + "train_runtime": 6453.2915, + "train_tokens_per_second": 2014.87 + }, + { + "epoch": 0.4818181818181818, + "grad_norm": 0.023307740688323975, + "learning_rate": 9.96345961819142e-05, + "loss": 0.012430655770003796, + "num_input_tokens_seen": 13018920, + "step": 795, + "train_runtime": 6461.4033, + "train_tokens_per_second": 2014.875 + }, + { + "epoch": 0.4824242424242424, + "grad_norm": 0.015535326674580574, + "learning_rate": 9.963343482486906e-05, + "loss": 0.013036166317760944, + "num_input_tokens_seen": 13035296, + "step": 796, + "train_runtime": 6469.51, + "train_tokens_per_second": 2014.882 + }, + { + "epoch": 0.48303030303030303, + "grad_norm": 0.015238570980727673, + "learning_rate": 9.963227163197982e-05, + "loss": 0.012019358575344086, + "num_input_tokens_seen": 13051672, + "step": 797, + "train_runtime": 6477.6153, + "train_tokens_per_second": 2014.888 + }, + { + "epoch": 0.48363636363636364, + "grad_norm": 0.033798947930336, + "learning_rate": 9.963110660328952e-05, + "loss": 0.013339506462216377, + "num_input_tokens_seen": 13068048, + "step": 798, + "train_runtime": 6485.7294, + "train_tokens_per_second": 2014.893 + }, + { + "epoch": 0.48424242424242425, + "grad_norm": 0.019505798816680908, + "learning_rate": 9.962993973884122e-05, + "loss": 0.012281915172934532, + "num_input_tokens_seen": 13084424, + "step": 799, + "train_runtime": 6493.8366, + "train_tokens_per_second": 2014.899 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.010988899506628513, + "learning_rate": 9.96287710386781e-05, + "loss": 0.011865864507853985, + "num_input_tokens_seen": 13100800, + "step": 800, + "train_runtime": 6501.9444, + "train_tokens_per_second": 2014.905 + }, + { + "epoch": 0.48545454545454547, + "grad_norm": 0.031102674081921577, + "learning_rate": 9.96276005028434e-05, + "loss": 0.013372216373682022, + "num_input_tokens_seen": 13117176, + "step": 801, + "train_runtime": 6511.057, + "train_tokens_per_second": 2014.6 + }, + { + "epoch": 0.4860606060606061, + "grad_norm": 0.009399918839335442, + "learning_rate": 9.962642813138039e-05, + "loss": 0.012573515065014362, + "num_input_tokens_seen": 13133552, + "step": 802, + "train_runtime": 6519.1656, + "train_tokens_per_second": 2014.606 + }, + { + "epoch": 0.4866666666666667, + "grad_norm": 0.06464923918247223, + "learning_rate": 9.962525392433246e-05, + "loss": 0.014730310998857021, + "num_input_tokens_seen": 13149928, + "step": 803, + "train_runtime": 6527.273, + "train_tokens_per_second": 2014.613 + }, + { + "epoch": 0.48727272727272725, + "grad_norm": 0.028241781517863274, + "learning_rate": 9.962407788174301e-05, + "loss": 0.01580268330872059, + "num_input_tokens_seen": 13166304, + "step": 804, + "train_runtime": 6535.3821, + "train_tokens_per_second": 2014.619 + }, + { + "epoch": 0.48787878787878786, + "grad_norm": 0.008157139644026756, + "learning_rate": 9.962290000365558e-05, + "loss": 0.011951067484915257, + "num_input_tokens_seen": 13182680, + "step": 805, + "train_runtime": 6543.4933, + "train_tokens_per_second": 2014.624 + }, + { + "epoch": 0.48848484848484847, + "grad_norm": 0.017825007438659668, + "learning_rate": 9.96217202901137e-05, + "loss": 0.01247593853622675, + "num_input_tokens_seen": 13199056, + "step": 806, + "train_runtime": 6551.599, + "train_tokens_per_second": 2014.631 + }, + { + "epoch": 0.4890909090909091, + "grad_norm": 0.03140291944146156, + "learning_rate": 9.962053874116102e-05, + "loss": 0.013065744191408157, + "num_input_tokens_seen": 13215432, + "step": 807, + "train_runtime": 6559.707, + "train_tokens_per_second": 2014.638 + }, + { + "epoch": 0.4896969696969697, + "grad_norm": 0.020545680075883865, + "learning_rate": 9.961935535684127e-05, + "loss": 0.013503405265510082, + "num_input_tokens_seen": 13231808, + "step": 808, + "train_runtime": 6567.8172, + "train_tokens_per_second": 2014.643 + }, + { + "epoch": 0.4903030303030303, + "grad_norm": 0.010955904610455036, + "learning_rate": 9.961817013719815e-05, + "loss": 0.011936129070818424, + "num_input_tokens_seen": 13248184, + "step": 809, + "train_runtime": 6575.9284, + "train_tokens_per_second": 2014.648 + }, + { + "epoch": 0.4909090909090909, + "grad_norm": 0.01849379763007164, + "learning_rate": 9.961698308227557e-05, + "loss": 0.012791337445378304, + "num_input_tokens_seen": 13264560, + "step": 810, + "train_runtime": 6584.0343, + "train_tokens_per_second": 2014.655 + }, + { + "epoch": 0.4915151515151515, + "grad_norm": 0.014219888485968113, + "learning_rate": 9.961579419211741e-05, + "loss": 0.01348559744656086, + "num_input_tokens_seen": 13280936, + "step": 811, + "train_runtime": 6592.1415, + "train_tokens_per_second": 2014.662 + }, + { + "epoch": 0.4921212121212121, + "grad_norm": 0.02992507442831993, + "learning_rate": 9.961460346676763e-05, + "loss": 0.013612410053610802, + "num_input_tokens_seen": 13297312, + "step": 812, + "train_runtime": 6600.2507, + "train_tokens_per_second": 2014.668 + }, + { + "epoch": 0.49272727272727274, + "grad_norm": 0.029259268194437027, + "learning_rate": 9.961341090627031e-05, + "loss": 0.014138033613562584, + "num_input_tokens_seen": 13313688, + "step": 813, + "train_runtime": 6608.362, + "train_tokens_per_second": 2014.673 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.016515251249074936, + "learning_rate": 9.961221651066952e-05, + "loss": 0.013446497730910778, + "num_input_tokens_seen": 13330064, + "step": 814, + "train_runtime": 6616.47, + "train_tokens_per_second": 2014.679 + }, + { + "epoch": 0.49393939393939396, + "grad_norm": 0.019002556800842285, + "learning_rate": 9.961102028000948e-05, + "loss": 0.013769666664302349, + "num_input_tokens_seen": 13346440, + "step": 815, + "train_runtime": 6624.5765, + "train_tokens_per_second": 2014.686 + }, + { + "epoch": 0.49454545454545457, + "grad_norm": 0.023732759058475494, + "learning_rate": 9.960982221433439e-05, + "loss": 0.01219931710511446, + "num_input_tokens_seen": 13362816, + "step": 816, + "train_runtime": 6632.6975, + "train_tokens_per_second": 2014.688 + }, + { + "epoch": 0.4951515151515152, + "grad_norm": 0.012622934766113758, + "learning_rate": 9.960862231368859e-05, + "loss": 0.012783626094460487, + "num_input_tokens_seen": 13379192, + "step": 817, + "train_runtime": 6640.8076, + "train_tokens_per_second": 2014.694 + }, + { + "epoch": 0.49575757575757573, + "grad_norm": 0.014281938783824444, + "learning_rate": 9.960742057811648e-05, + "loss": 0.012687593698501587, + "num_input_tokens_seen": 13395568, + "step": 818, + "train_runtime": 6648.9137, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.49636363636363634, + "grad_norm": 0.053434181958436966, + "learning_rate": 9.960621700766246e-05, + "loss": 0.013879223726689816, + "num_input_tokens_seen": 13411944, + "step": 819, + "train_runtime": 6657.0289, + "train_tokens_per_second": 2014.704 + }, + { + "epoch": 0.49696969696969695, + "grad_norm": 0.014049537479877472, + "learning_rate": 9.960501160237107e-05, + "loss": 0.011275812052190304, + "num_input_tokens_seen": 13428320, + "step": 820, + "train_runtime": 6665.1394, + "train_tokens_per_second": 2014.71 + }, + { + "epoch": 0.49757575757575756, + "grad_norm": 0.02216215617954731, + "learning_rate": 9.960380436228693e-05, + "loss": 0.01345481164753437, + "num_input_tokens_seen": 13444696, + "step": 821, + "train_runtime": 6673.2486, + "train_tokens_per_second": 2014.715 + }, + { + "epoch": 0.49818181818181817, + "grad_norm": 0.01626548357307911, + "learning_rate": 9.960259528745466e-05, + "loss": 0.01268689427524805, + "num_input_tokens_seen": 13461072, + "step": 822, + "train_runtime": 6681.3546, + "train_tokens_per_second": 2014.722 + }, + { + "epoch": 0.4987878787878788, + "grad_norm": 0.029701311141252518, + "learning_rate": 9.960138437791899e-05, + "loss": 0.013831757940351963, + "num_input_tokens_seen": 13477448, + "step": 823, + "train_runtime": 6689.465, + "train_tokens_per_second": 2014.727 + }, + { + "epoch": 0.4993939393939394, + "grad_norm": 0.01778031513094902, + "learning_rate": 9.96001716337247e-05, + "loss": 0.012985551729798317, + "num_input_tokens_seen": 13493824, + "step": 824, + "train_runtime": 6697.5721, + "train_tokens_per_second": 2014.734 + }, + { + "epoch": 0.5, + "grad_norm": 0.011812685988843441, + "learning_rate": 9.959895705491664e-05, + "loss": 0.013474401086568832, + "num_input_tokens_seen": 13510200, + "step": 825, + "train_runtime": 6705.6803, + "train_tokens_per_second": 2014.74 + }, + { + "epoch": 0.5006060606060606, + "grad_norm": 0.024887410923838615, + "learning_rate": 9.959774064153977e-05, + "loss": 0.012352567166090012, + "num_input_tokens_seen": 13526576, + "step": 826, + "train_runtime": 6713.7875, + "train_tokens_per_second": 2014.746 + }, + { + "epoch": 0.5012121212121212, + "grad_norm": 0.02427525445818901, + "learning_rate": 9.959652239363906e-05, + "loss": 0.01411970891058445, + "num_input_tokens_seen": 13542952, + "step": 827, + "train_runtime": 6721.8992, + "train_tokens_per_second": 2014.751 + }, + { + "epoch": 0.5018181818181818, + "grad_norm": 0.02203851006925106, + "learning_rate": 9.959530231125955e-05, + "loss": 0.01270216703414917, + "num_input_tokens_seen": 13559328, + "step": 828, + "train_runtime": 6730.0067, + "train_tokens_per_second": 2014.757 + }, + { + "epoch": 0.5024242424242424, + "grad_norm": 0.033256348222494125, + "learning_rate": 9.959408039444641e-05, + "loss": 0.013468440622091293, + "num_input_tokens_seen": 13575704, + "step": 829, + "train_runtime": 6738.1159, + "train_tokens_per_second": 2014.763 + }, + { + "epoch": 0.503030303030303, + "grad_norm": 0.030981307849287987, + "learning_rate": 9.95928566432448e-05, + "loss": 0.013072172179818153, + "num_input_tokens_seen": 13592080, + "step": 830, + "train_runtime": 6746.2296, + "train_tokens_per_second": 2014.767 + }, + { + "epoch": 0.5036363636363637, + "grad_norm": 0.019473901018500328, + "learning_rate": 9.959163105770002e-05, + "loss": 0.01263860147446394, + "num_input_tokens_seen": 13608456, + "step": 831, + "train_runtime": 6754.3387, + "train_tokens_per_second": 2014.773 + }, + { + "epoch": 0.5042424242424243, + "grad_norm": 0.023273654282093048, + "learning_rate": 9.959040363785736e-05, + "loss": 0.014287668280303478, + "num_input_tokens_seen": 13624832, + "step": 832, + "train_runtime": 6762.4478, + "train_tokens_per_second": 2014.778 + }, + { + "epoch": 0.5048484848484849, + "grad_norm": 0.0494939386844635, + "learning_rate": 9.958917438376226e-05, + "loss": 0.013972645625472069, + "num_input_tokens_seen": 13641208, + "step": 833, + "train_runtime": 6770.5557, + "train_tokens_per_second": 2014.784 + }, + { + "epoch": 0.5054545454545455, + "grad_norm": 0.0583622045814991, + "learning_rate": 9.958794329546017e-05, + "loss": 0.015316938981413841, + "num_input_tokens_seen": 13657584, + "step": 834, + "train_runtime": 6778.6628, + "train_tokens_per_second": 2014.79 + }, + { + "epoch": 0.5060606060606061, + "grad_norm": 0.022303935140371323, + "learning_rate": 9.958671037299662e-05, + "loss": 0.012674327939748764, + "num_input_tokens_seen": 13673960, + "step": 835, + "train_runtime": 6786.7703, + "train_tokens_per_second": 2014.796 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.033000800758600235, + "learning_rate": 9.958547561641722e-05, + "loss": 0.013727420009672642, + "num_input_tokens_seen": 13690336, + "step": 836, + "train_runtime": 6794.8756, + "train_tokens_per_second": 2014.803 + }, + { + "epoch": 0.5072727272727273, + "grad_norm": 0.015586488880217075, + "learning_rate": 9.958423902576763e-05, + "loss": 0.015323062427341938, + "num_input_tokens_seen": 13706712, + "step": 837, + "train_runtime": 6802.9828, + "train_tokens_per_second": 2014.809 + }, + { + "epoch": 0.5078787878787879, + "grad_norm": 0.022322285920381546, + "learning_rate": 9.958300060109362e-05, + "loss": 0.014234354719519615, + "num_input_tokens_seen": 13723088, + "step": 838, + "train_runtime": 6811.0938, + "train_tokens_per_second": 2014.814 + }, + { + "epoch": 0.5084848484848485, + "grad_norm": 0.008347253315150738, + "learning_rate": 9.958176034244097e-05, + "loss": 0.012262934818863869, + "num_input_tokens_seen": 13739464, + "step": 839, + "train_runtime": 6819.2024, + "train_tokens_per_second": 2014.82 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 0.02393462508916855, + "learning_rate": 9.958051824985555e-05, + "loss": 0.01308400183916092, + "num_input_tokens_seen": 13755840, + "step": 840, + "train_runtime": 6827.3112, + "train_tokens_per_second": 2014.825 + }, + { + "epoch": 0.5096969696969696, + "grad_norm": 0.01569426991045475, + "learning_rate": 9.957927432338332e-05, + "loss": 0.012214584276080132, + "num_input_tokens_seen": 13772216, + "step": 841, + "train_runtime": 6835.4183, + "train_tokens_per_second": 2014.831 + }, + { + "epoch": 0.5103030303030303, + "grad_norm": 0.026208873838186264, + "learning_rate": 9.957802856307029e-05, + "loss": 0.014355281367897987, + "num_input_tokens_seen": 13788592, + "step": 842, + "train_runtime": 6843.5292, + "train_tokens_per_second": 2014.836 + }, + { + "epoch": 0.5109090909090909, + "grad_norm": 0.016047121956944466, + "learning_rate": 9.957678096896252e-05, + "loss": 0.012238034047186375, + "num_input_tokens_seen": 13804968, + "step": 843, + "train_runtime": 6851.6374, + "train_tokens_per_second": 2014.842 + }, + { + "epoch": 0.5115151515151515, + "grad_norm": 0.04430484399199486, + "learning_rate": 9.957553154110617e-05, + "loss": 0.013455298729240894, + "num_input_tokens_seen": 13821344, + "step": 844, + "train_runtime": 6859.7446, + "train_tokens_per_second": 2014.848 + }, + { + "epoch": 0.5121212121212121, + "grad_norm": 0.01514506246894598, + "learning_rate": 9.957428027954746e-05, + "loss": 0.014497831463813782, + "num_input_tokens_seen": 13837720, + "step": 845, + "train_runtime": 6867.8522, + "train_tokens_per_second": 2014.854 + }, + { + "epoch": 0.5127272727272727, + "grad_norm": 0.11227481067180634, + "learning_rate": 9.957302718433266e-05, + "loss": 0.01227258238941431, + "num_input_tokens_seen": 13854096, + "step": 846, + "train_runtime": 6875.9627, + "train_tokens_per_second": 2014.859 + }, + { + "epoch": 0.5133333333333333, + "grad_norm": 0.02800634503364563, + "learning_rate": 9.957177225550813e-05, + "loss": 0.013792254962027073, + "num_input_tokens_seen": 13870472, + "step": 847, + "train_runtime": 6884.0675, + "train_tokens_per_second": 2014.866 + }, + { + "epoch": 0.5139393939393939, + "grad_norm": 0.029475996270775795, + "learning_rate": 9.957051549312027e-05, + "loss": 0.013554091565310955, + "num_input_tokens_seen": 13886848, + "step": 848, + "train_runtime": 6892.1731, + "train_tokens_per_second": 2014.872 + }, + { + "epoch": 0.5145454545454545, + "grad_norm": 0.019583873450756073, + "learning_rate": 9.956925689721559e-05, + "loss": 0.014205913059413433, + "num_input_tokens_seen": 13903224, + "step": 849, + "train_runtime": 6900.2826, + "train_tokens_per_second": 2014.877 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 0.015864579007029533, + "learning_rate": 9.95679964678406e-05, + "loss": 0.01432622317224741, + "num_input_tokens_seen": 13919600, + "step": 850, + "train_runtime": 6908.3907, + "train_tokens_per_second": 2014.883 + }, + { + "epoch": 0.5157575757575757, + "grad_norm": 0.01455528661608696, + "learning_rate": 9.9566734205042e-05, + "loss": 0.015681616961956024, + "num_input_tokens_seen": 13935976, + "step": 851, + "train_runtime": 6916.5024, + "train_tokens_per_second": 2014.888 + }, + { + "epoch": 0.5163636363636364, + "grad_norm": 0.02918148599565029, + "learning_rate": 9.956547010886639e-05, + "loss": 0.012535885907709599, + "num_input_tokens_seen": 13952352, + "step": 852, + "train_runtime": 6924.6094, + "train_tokens_per_second": 2014.894 + }, + { + "epoch": 0.516969696969697, + "grad_norm": 0.0162571519613266, + "learning_rate": 9.956420417936056e-05, + "loss": 0.012905891984701157, + "num_input_tokens_seen": 13968728, + "step": 853, + "train_runtime": 6932.7194, + "train_tokens_per_second": 2014.899 + }, + { + "epoch": 0.5175757575757576, + "grad_norm": 0.01789519377052784, + "learning_rate": 9.956293641657137e-05, + "loss": 0.01288038119673729, + "num_input_tokens_seen": 13985104, + "step": 854, + "train_runtime": 6940.8319, + "train_tokens_per_second": 2014.903 + }, + { + "epoch": 0.5181818181818182, + "grad_norm": 0.01946009323000908, + "learning_rate": 9.956166682054566e-05, + "loss": 0.013123282231390476, + "num_input_tokens_seen": 14001480, + "step": 855, + "train_runtime": 6948.9381, + "train_tokens_per_second": 2014.909 + }, + { + "epoch": 0.5187878787878788, + "grad_norm": 0.02161416970193386, + "learning_rate": 9.956039539133042e-05, + "loss": 0.011395135894417763, + "num_input_tokens_seen": 14017856, + "step": 856, + "train_runtime": 6957.048, + "train_tokens_per_second": 2014.914 + }, + { + "epoch": 0.5193939393939394, + "grad_norm": 0.01752905547618866, + "learning_rate": 9.955912212897267e-05, + "loss": 0.014676744118332863, + "num_input_tokens_seen": 14034232, + "step": 857, + "train_runtime": 6965.1559, + "train_tokens_per_second": 2014.92 + }, + { + "epoch": 0.52, + "grad_norm": 0.012038851156830788, + "learning_rate": 9.955784703351949e-05, + "loss": 0.012578791007399559, + "num_input_tokens_seen": 14050608, + "step": 858, + "train_runtime": 6973.2666, + "train_tokens_per_second": 2014.925 + }, + { + "epoch": 0.5206060606060606, + "grad_norm": 0.01986696757376194, + "learning_rate": 9.955657010501806e-05, + "loss": 0.012446455657482147, + "num_input_tokens_seen": 14066984, + "step": 859, + "train_runtime": 6981.3718, + "train_tokens_per_second": 2014.931 + }, + { + "epoch": 0.5212121212121212, + "grad_norm": 0.020363394170999527, + "learning_rate": 9.955529134351563e-05, + "loss": 0.012604762800037861, + "num_input_tokens_seen": 14083360, + "step": 860, + "train_runtime": 6989.4801, + "train_tokens_per_second": 2014.937 + }, + { + "epoch": 0.5218181818181818, + "grad_norm": 0.010133441537618637, + "learning_rate": 9.955401074905945e-05, + "loss": 0.01250852644443512, + "num_input_tokens_seen": 14099736, + "step": 861, + "train_runtime": 6997.5889, + "train_tokens_per_second": 2014.942 + }, + { + "epoch": 0.5224242424242425, + "grad_norm": 0.012160439044237137, + "learning_rate": 9.955272832169694e-05, + "loss": 0.013129970990121365, + "num_input_tokens_seen": 14116112, + "step": 862, + "train_runtime": 7005.6941, + "train_tokens_per_second": 2014.948 + }, + { + "epoch": 0.5230303030303031, + "grad_norm": 0.0197035763412714, + "learning_rate": 9.95514440614755e-05, + "loss": 0.012795310467481613, + "num_input_tokens_seen": 14132488, + "step": 863, + "train_runtime": 7013.8017, + "train_tokens_per_second": 2014.954 + }, + { + "epoch": 0.5236363636363637, + "grad_norm": 0.029051663354039192, + "learning_rate": 9.955015796844263e-05, + "loss": 0.012731630355119705, + "num_input_tokens_seen": 14148864, + "step": 864, + "train_runtime": 7021.913, + "train_tokens_per_second": 2014.959 + }, + { + "epoch": 0.5242424242424243, + "grad_norm": 0.01819092035293579, + "learning_rate": 9.954887004264591e-05, + "loss": 0.012530642561614513, + "num_input_tokens_seen": 14165240, + "step": 865, + "train_runtime": 7030.031, + "train_tokens_per_second": 2014.961 + }, + { + "epoch": 0.5248484848484849, + "grad_norm": 0.012354613281786442, + "learning_rate": 9.9547580284133e-05, + "loss": 0.012999298982322216, + "num_input_tokens_seen": 14181616, + "step": 866, + "train_runtime": 7038.1398, + "train_tokens_per_second": 2014.966 + }, + { + "epoch": 0.5254545454545455, + "grad_norm": 0.009374301880598068, + "learning_rate": 9.954628869295157e-05, + "loss": 0.012080837972462177, + "num_input_tokens_seen": 14197992, + "step": 867, + "train_runtime": 7046.2489, + "train_tokens_per_second": 2014.972 + }, + { + "epoch": 0.526060606060606, + "grad_norm": 0.04844909533858299, + "learning_rate": 9.954499526914941e-05, + "loss": 0.014849531464278698, + "num_input_tokens_seen": 14214368, + "step": 868, + "train_runtime": 7054.3586, + "train_tokens_per_second": 2014.977 + }, + { + "epoch": 0.5266666666666666, + "grad_norm": 0.0264375489205122, + "learning_rate": 9.954370001277435e-05, + "loss": 0.013595725409686565, + "num_input_tokens_seen": 14230744, + "step": 869, + "train_runtime": 7062.4663, + "train_tokens_per_second": 2014.982 + }, + { + "epoch": 0.5272727272727272, + "grad_norm": 0.011517049744725227, + "learning_rate": 9.954240292387434e-05, + "loss": 0.012497092597186565, + "num_input_tokens_seen": 14247120, + "step": 870, + "train_runtime": 7070.5718, + "train_tokens_per_second": 2014.988 + }, + { + "epoch": 0.5278787878787878, + "grad_norm": 0.012493406422436237, + "learning_rate": 9.95411040024973e-05, + "loss": 0.01143716461956501, + "num_input_tokens_seen": 14263496, + "step": 871, + "train_runtime": 7078.6807, + "train_tokens_per_second": 2014.994 + }, + { + "epoch": 0.5284848484848484, + "grad_norm": 0.04269085079431534, + "learning_rate": 9.95398032486913e-05, + "loss": 0.013632988557219505, + "num_input_tokens_seen": 14279872, + "step": 872, + "train_runtime": 7086.7887, + "train_tokens_per_second": 2014.999 + }, + { + "epoch": 0.5290909090909091, + "grad_norm": 0.04483538493514061, + "learning_rate": 9.953850066250445e-05, + "loss": 0.013953006826341152, + "num_input_tokens_seen": 14296248, + "step": 873, + "train_runtime": 7094.8962, + "train_tokens_per_second": 2015.005 + }, + { + "epoch": 0.5296969696969697, + "grad_norm": 0.05677570030093193, + "learning_rate": 9.953719624398495e-05, + "loss": 0.012957635335624218, + "num_input_tokens_seen": 14312624, + "step": 874, + "train_runtime": 7103.0013, + "train_tokens_per_second": 2015.011 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.038775816559791565, + "learning_rate": 9.953588999318101e-05, + "loss": 0.01283508911728859, + "num_input_tokens_seen": 14329000, + "step": 875, + "train_runtime": 7111.1121, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 0.5309090909090909, + "grad_norm": 0.032757148146629333, + "learning_rate": 9.953458191014098e-05, + "loss": 0.013316294178366661, + "num_input_tokens_seen": 14345376, + "step": 876, + "train_runtime": 7119.2189, + "train_tokens_per_second": 2015.021 + }, + { + "epoch": 0.5315151515151515, + "grad_norm": 0.022632509469985962, + "learning_rate": 9.953327199491323e-05, + "loss": 0.011890828609466553, + "num_input_tokens_seen": 14361752, + "step": 877, + "train_runtime": 7127.3324, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 0.5321212121212121, + "grad_norm": 0.013239112682640553, + "learning_rate": 9.953196024754621e-05, + "loss": 0.011631186120212078, + "num_input_tokens_seen": 14378128, + "step": 878, + "train_runtime": 7135.4381, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 0.5327272727272727, + "grad_norm": 0.012772745452821255, + "learning_rate": 9.953064666808843e-05, + "loss": 0.011507662013173103, + "num_input_tokens_seen": 14394504, + "step": 879, + "train_runtime": 7143.5516, + "train_tokens_per_second": 2015.035 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.02860845811665058, + "learning_rate": 9.952933125658849e-05, + "loss": 0.013187154196202755, + "num_input_tokens_seen": 14410880, + "step": 880, + "train_runtime": 7151.6624, + "train_tokens_per_second": 2015.039 + }, + { + "epoch": 0.5339393939393939, + "grad_norm": 0.011422947980463505, + "learning_rate": 9.952801401309503e-05, + "loss": 0.012076064944267273, + "num_input_tokens_seen": 14427256, + "step": 881, + "train_runtime": 7159.772, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 0.5345454545454545, + "grad_norm": 0.00976222101598978, + "learning_rate": 9.95266949376568e-05, + "loss": 0.011884449049830437, + "num_input_tokens_seen": 14443632, + "step": 882, + "train_runtime": 7167.8808, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 0.5351515151515152, + "grad_norm": 0.017465714365243912, + "learning_rate": 9.952537403032258e-05, + "loss": 0.012587850913405418, + "num_input_tokens_seen": 14460008, + "step": 883, + "train_runtime": 7175.9926, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 0.5357575757575758, + "grad_norm": 0.01686178520321846, + "learning_rate": 9.952405129114119e-05, + "loss": 0.01267196424305439, + "num_input_tokens_seen": 14476384, + "step": 884, + "train_runtime": 7184.0994, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 0.5363636363636364, + "grad_norm": 0.021161451935768127, + "learning_rate": 9.952272672016161e-05, + "loss": 0.012368117459118366, + "num_input_tokens_seen": 14492760, + "step": 885, + "train_runtime": 7192.2074, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 0.536969696969697, + "grad_norm": 0.018734315410256386, + "learning_rate": 9.95214003174328e-05, + "loss": 0.013907104730606079, + "num_input_tokens_seen": 14509136, + "step": 886, + "train_runtime": 7200.3181, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 0.5375757575757576, + "grad_norm": 0.017368443310260773, + "learning_rate": 9.952007208300384e-05, + "loss": 0.013688186183571815, + "num_input_tokens_seen": 14525512, + "step": 887, + "train_runtime": 7208.4306, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 0.5381818181818182, + "grad_norm": 0.014055633917450905, + "learning_rate": 9.951874201692386e-05, + "loss": 0.011441092006862164, + "num_input_tokens_seen": 14541888, + "step": 888, + "train_runtime": 7216.5403, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 0.5387878787878788, + "grad_norm": 0.014830189757049084, + "learning_rate": 9.951741011924202e-05, + "loss": 0.012659481726586819, + "num_input_tokens_seen": 14558264, + "step": 889, + "train_runtime": 7224.6486, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 0.5393939393939394, + "grad_norm": 0.04141494259238243, + "learning_rate": 9.951607639000763e-05, + "loss": 0.014267532154917717, + "num_input_tokens_seen": 14574640, + "step": 890, + "train_runtime": 7232.7595, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 0.54, + "grad_norm": 0.026582296937704086, + "learning_rate": 9.951474082927e-05, + "loss": 0.01351410336792469, + "num_input_tokens_seen": 14591016, + "step": 891, + "train_runtime": 7240.8673, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 0.5406060606060606, + "grad_norm": 0.029941242188215256, + "learning_rate": 9.951340343707852e-05, + "loss": 0.013386565260589123, + "num_input_tokens_seen": 14607392, + "step": 892, + "train_runtime": 7248.9795, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 0.5412121212121213, + "grad_norm": 0.01376877911388874, + "learning_rate": 9.951206421348267e-05, + "loss": 0.012590361759066582, + "num_input_tokens_seen": 14623768, + "step": 893, + "train_runtime": 7257.0885, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 0.5418181818181819, + "grad_norm": 0.015015073120594025, + "learning_rate": 9.9510723158532e-05, + "loss": 0.012574484571814537, + "num_input_tokens_seen": 14640144, + "step": 894, + "train_runtime": 7265.1975, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 0.5424242424242425, + "grad_norm": 0.013042068108916283, + "learning_rate": 9.950938027227608e-05, + "loss": 0.01163259893655777, + "num_input_tokens_seen": 14656520, + "step": 895, + "train_runtime": 7273.3074, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 0.5430303030303031, + "grad_norm": 0.2448950558900833, + "learning_rate": 9.950803555476463e-05, + "loss": 0.029144512489438057, + "num_input_tokens_seen": 14672896, + "step": 896, + "train_runtime": 7281.4158, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 0.5436363636363636, + "grad_norm": 0.015140167437493801, + "learning_rate": 9.950668900604733e-05, + "loss": 0.012354775331914425, + "num_input_tokens_seen": 14689272, + "step": 897, + "train_runtime": 7289.5307, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 0.5442424242424242, + "grad_norm": 0.014910165220499039, + "learning_rate": 9.950534062617401e-05, + "loss": 0.013464296236634254, + "num_input_tokens_seen": 14705648, + "step": 898, + "train_runtime": 7297.6408, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 0.5448484848484848, + "grad_norm": 0.025381648913025856, + "learning_rate": 9.950399041519456e-05, + "loss": 0.01381002739071846, + "num_input_tokens_seen": 14722024, + "step": 899, + "train_runtime": 7305.7486, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.016502218320965767, + "learning_rate": 9.950263837315891e-05, + "loss": 0.014580944553017616, + "num_input_tokens_seen": 14738400, + "step": 900, + "train_runtime": 7313.8574, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 0.546060606060606, + "grad_norm": 0.036798711866140366, + "learning_rate": 9.950128450011706e-05, + "loss": 0.01336810551583767, + "num_input_tokens_seen": 14754776, + "step": 901, + "train_runtime": 7322.836, + "train_tokens_per_second": 2014.899 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.03919834643602371, + "learning_rate": 9.949992879611911e-05, + "loss": 0.013614124618470669, + "num_input_tokens_seen": 14771152, + "step": 902, + "train_runtime": 7330.9449, + "train_tokens_per_second": 2014.904 + }, + { + "epoch": 0.5472727272727272, + "grad_norm": 0.015492623671889305, + "learning_rate": 9.949857126121517e-05, + "loss": 0.01262598019093275, + "num_input_tokens_seen": 14787528, + "step": 903, + "train_runtime": 7339.051, + "train_tokens_per_second": 2014.91 + }, + { + "epoch": 0.5478787878787879, + "grad_norm": 0.04381313920021057, + "learning_rate": 9.949721189545549e-05, + "loss": 0.012830916792154312, + "num_input_tokens_seen": 14803904, + "step": 904, + "train_runtime": 7347.1591, + "train_tokens_per_second": 2014.915 + }, + { + "epoch": 0.5484848484848485, + "grad_norm": 0.012728218920528889, + "learning_rate": 9.949585069889033e-05, + "loss": 0.012215669266879559, + "num_input_tokens_seen": 14820280, + "step": 905, + "train_runtime": 7355.2671, + "train_tokens_per_second": 2014.921 + }, + { + "epoch": 0.5490909090909091, + "grad_norm": 0.02701408974826336, + "learning_rate": 9.949448767157003e-05, + "loss": 0.014799817465245724, + "num_input_tokens_seen": 14836656, + "step": 906, + "train_runtime": 7363.3735, + "train_tokens_per_second": 2014.926 + }, + { + "epoch": 0.5496969696969697, + "grad_norm": 0.01919523999094963, + "learning_rate": 9.949312281354504e-05, + "loss": 0.012729383073747158, + "num_input_tokens_seen": 14853032, + "step": 907, + "train_runtime": 7371.4797, + "train_tokens_per_second": 2014.932 + }, + { + "epoch": 0.5503030303030303, + "grad_norm": 0.017987912520766258, + "learning_rate": 9.94917561248658e-05, + "loss": 0.011925067752599716, + "num_input_tokens_seen": 14869408, + "step": 908, + "train_runtime": 7379.5909, + "train_tokens_per_second": 2014.937 + }, + { + "epoch": 0.5509090909090909, + "grad_norm": 0.016029933467507362, + "learning_rate": 9.94903876055829e-05, + "loss": 0.014640429988503456, + "num_input_tokens_seen": 14885784, + "step": 909, + "train_runtime": 7387.701, + "train_tokens_per_second": 2014.941 + }, + { + "epoch": 0.5515151515151515, + "grad_norm": 0.02371898479759693, + "learning_rate": 9.948901725574692e-05, + "loss": 0.013192545622587204, + "num_input_tokens_seen": 14902160, + "step": 910, + "train_runtime": 7395.8127, + "train_tokens_per_second": 2014.946 + }, + { + "epoch": 0.5521212121212121, + "grad_norm": 0.028052695095539093, + "learning_rate": 9.948764507540858e-05, + "loss": 0.014127026312053204, + "num_input_tokens_seen": 14918536, + "step": 911, + "train_runtime": 7403.9308, + "train_tokens_per_second": 2014.948 + }, + { + "epoch": 0.5527272727272727, + "grad_norm": 0.022900646552443504, + "learning_rate": 9.94862710646186e-05, + "loss": 0.01369861327111721, + "num_input_tokens_seen": 14934912, + "step": 912, + "train_runtime": 7412.0474, + "train_tokens_per_second": 2014.951 + }, + { + "epoch": 0.5533333333333333, + "grad_norm": 0.024493444710969925, + "learning_rate": 9.948489522342786e-05, + "loss": 0.012475069612264633, + "num_input_tokens_seen": 14951288, + "step": 913, + "train_runtime": 7420.164, + "train_tokens_per_second": 2014.954 + }, + { + "epoch": 0.553939393939394, + "grad_norm": 0.009486420080065727, + "learning_rate": 9.948351755188718e-05, + "loss": 0.011415514163672924, + "num_input_tokens_seen": 14967664, + "step": 914, + "train_runtime": 7428.2787, + "train_tokens_per_second": 2014.957 + }, + { + "epoch": 0.5545454545454546, + "grad_norm": 0.02638114243745804, + "learning_rate": 9.948213805004758e-05, + "loss": 0.014981718733906746, + "num_input_tokens_seen": 14984040, + "step": 915, + "train_runtime": 7436.3836, + "train_tokens_per_second": 2014.963 + }, + { + "epoch": 0.5551515151515152, + "grad_norm": 0.024289410561323166, + "learning_rate": 9.948075671796004e-05, + "loss": 0.013489319942891598, + "num_input_tokens_seen": 15000416, + "step": 916, + "train_runtime": 7444.4934, + "train_tokens_per_second": 2014.968 + }, + { + "epoch": 0.5557575757575758, + "grad_norm": 0.019992362707853317, + "learning_rate": 9.947937355567566e-05, + "loss": 0.013457294553518295, + "num_input_tokens_seen": 15016792, + "step": 917, + "train_runtime": 7452.6, + "train_tokens_per_second": 2014.974 + }, + { + "epoch": 0.5563636363636364, + "grad_norm": 0.01874268427491188, + "learning_rate": 9.947798856324562e-05, + "loss": 0.014019965194165707, + "num_input_tokens_seen": 15033168, + "step": 918, + "train_runtime": 7460.7075, + "train_tokens_per_second": 2014.979 + }, + { + "epoch": 0.556969696969697, + "grad_norm": 0.006537168752402067, + "learning_rate": 9.947660174072113e-05, + "loss": 0.01211620308458805, + "num_input_tokens_seen": 15049544, + "step": 919, + "train_runtime": 7468.8162, + "train_tokens_per_second": 2014.984 + }, + { + "epoch": 0.5575757575757576, + "grad_norm": 0.014149926602840424, + "learning_rate": 9.94752130881535e-05, + "loss": 0.01367366872727871, + "num_input_tokens_seen": 15065920, + "step": 920, + "train_runtime": 7476.9293, + "train_tokens_per_second": 2014.988 + }, + { + "epoch": 0.5581818181818182, + "grad_norm": 0.02201610431075096, + "learning_rate": 9.947382260559408e-05, + "loss": 0.014585314318537712, + "num_input_tokens_seen": 15082296, + "step": 921, + "train_runtime": 7485.0355, + "train_tokens_per_second": 2014.993 + }, + { + "epoch": 0.5587878787878788, + "grad_norm": 0.016061201691627502, + "learning_rate": 9.947243029309433e-05, + "loss": 0.012058419175446033, + "num_input_tokens_seen": 15098672, + "step": 922, + "train_runtime": 7493.1729, + "train_tokens_per_second": 2014.99 + }, + { + "epoch": 0.5593939393939394, + "grad_norm": 0.014283844269812107, + "learning_rate": 9.94710361507057e-05, + "loss": 0.013241814449429512, + "num_input_tokens_seen": 15115048, + "step": 923, + "train_runtime": 7501.2834, + "train_tokens_per_second": 2014.995 + }, + { + "epoch": 0.56, + "grad_norm": 0.014411736279726028, + "learning_rate": 9.94696401784798e-05, + "loss": 0.011771513149142265, + "num_input_tokens_seen": 15131424, + "step": 924, + "train_runtime": 7509.3934, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 0.5606060606060606, + "grad_norm": 0.015076170675456524, + "learning_rate": 9.946824237646824e-05, + "loss": 0.012921320274472237, + "num_input_tokens_seen": 15147800, + "step": 925, + "train_runtime": 7517.5023, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 0.5612121212121212, + "grad_norm": 0.019479839131236076, + "learning_rate": 9.94668427447227e-05, + "loss": 0.01365247555077076, + "num_input_tokens_seen": 15164176, + "step": 926, + "train_runtime": 7525.6088, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 0.5618181818181818, + "grad_norm": 0.015186650678515434, + "learning_rate": 9.946544128329502e-05, + "loss": 0.011964188888669014, + "num_input_tokens_seen": 15180552, + "step": 927, + "train_runtime": 7533.7184, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 0.5624242424242424, + "grad_norm": 0.01884932816028595, + "learning_rate": 9.9464037992237e-05, + "loss": 0.013231384567916393, + "num_input_tokens_seen": 15196928, + "step": 928, + "train_runtime": 7541.8298, + "train_tokens_per_second": 2015.019 + }, + { + "epoch": 0.563030303030303, + "grad_norm": 0.024524593725800514, + "learning_rate": 9.946263287160051e-05, + "loss": 0.013677388429641724, + "num_input_tokens_seen": 15213304, + "step": 929, + "train_runtime": 7549.9392, + "train_tokens_per_second": 2015.023 + }, + { + "epoch": 0.5636363636363636, + "grad_norm": 0.017896726727485657, + "learning_rate": 9.946122592143758e-05, + "loss": 0.012685752473771572, + "num_input_tokens_seen": 15229680, + "step": 930, + "train_runtime": 7558.0487, + "train_tokens_per_second": 2015.028 + }, + { + "epoch": 0.5642424242424242, + "grad_norm": 0.02456982247531414, + "learning_rate": 9.945981714180021e-05, + "loss": 0.012439090758562088, + "num_input_tokens_seen": 15246056, + "step": 931, + "train_runtime": 7566.1626, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 0.5648484848484848, + "grad_norm": 0.011778507381677628, + "learning_rate": 9.945840653274052e-05, + "loss": 0.01277371309697628, + "num_input_tokens_seen": 15262432, + "step": 932, + "train_runtime": 7574.272, + "train_tokens_per_second": 2015.036 + }, + { + "epoch": 0.5654545454545454, + "grad_norm": 0.00871087983250618, + "learning_rate": 9.945699409431071e-05, + "loss": 0.012337596155703068, + "num_input_tokens_seen": 15278808, + "step": 933, + "train_runtime": 7582.3801, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 0.566060606060606, + "grad_norm": 0.02395842783153057, + "learning_rate": 9.945557982656299e-05, + "loss": 0.013987423852086067, + "num_input_tokens_seen": 15295184, + "step": 934, + "train_runtime": 7590.493, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 0.014825602062046528, + "learning_rate": 9.945416372954968e-05, + "loss": 0.013695470988750458, + "num_input_tokens_seen": 15311560, + "step": 935, + "train_runtime": 7598.6032, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 0.5672727272727273, + "grad_norm": 0.034912459552288055, + "learning_rate": 9.945274580332316e-05, + "loss": 0.014644785784184933, + "num_input_tokens_seen": 15327936, + "step": 936, + "train_runtime": 7606.7121, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 0.5678787878787879, + "grad_norm": 0.015183918178081512, + "learning_rate": 9.945132604793588e-05, + "loss": 0.013066308572888374, + "num_input_tokens_seen": 15344312, + "step": 937, + "train_runtime": 7614.8211, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 0.5684848484848485, + "grad_norm": 0.015175413340330124, + "learning_rate": 9.944990446344033e-05, + "loss": 0.012659816071391106, + "num_input_tokens_seen": 15360688, + "step": 938, + "train_runtime": 7622.9305, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 0.5690909090909091, + "grad_norm": 0.00944305956363678, + "learning_rate": 9.944848104988915e-05, + "loss": 0.012941330671310425, + "num_input_tokens_seen": 15377064, + "step": 939, + "train_runtime": 7631.041, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 0.5696969696969697, + "grad_norm": 0.008134279400110245, + "learning_rate": 9.944705580733493e-05, + "loss": 0.012083706445991993, + "num_input_tokens_seen": 15393440, + "step": 940, + "train_runtime": 7639.151, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 0.5703030303030303, + "grad_norm": 0.01920422352850437, + "learning_rate": 9.944562873583042e-05, + "loss": 0.012228092178702354, + "num_input_tokens_seen": 15409816, + "step": 941, + "train_runtime": 7647.2582, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 0.5709090909090909, + "grad_norm": 0.02532947063446045, + "learning_rate": 9.944419983542839e-05, + "loss": 0.014129354618489742, + "num_input_tokens_seen": 15426192, + "step": 942, + "train_runtime": 7655.3689, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 0.5715151515151515, + "grad_norm": 0.014770124107599258, + "learning_rate": 9.944276910618168e-05, + "loss": 0.01307615451514721, + "num_input_tokens_seen": 15442568, + "step": 943, + "train_runtime": 7663.4788, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 0.5721212121212121, + "grad_norm": 0.04172991216182709, + "learning_rate": 9.944133654814325e-05, + "loss": 0.01433885470032692, + "num_input_tokens_seen": 15458944, + "step": 944, + "train_runtime": 7671.5887, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 0.5727272727272728, + "grad_norm": 0.02282462641596794, + "learning_rate": 9.943990216136605e-05, + "loss": 0.012092739343643188, + "num_input_tokens_seen": 15475320, + "step": 945, + "train_runtime": 7679.6999, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.0323781855404377, + "learning_rate": 9.943846594590316e-05, + "loss": 0.014233306050300598, + "num_input_tokens_seen": 15491696, + "step": 946, + "train_runtime": 7687.8075, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.573939393939394, + "grad_norm": 0.016390513628721237, + "learning_rate": 9.943702790180769e-05, + "loss": 0.01384427584707737, + "num_input_tokens_seen": 15508072, + "step": 947, + "train_runtime": 7695.9168, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 0.5745454545454546, + "grad_norm": 0.017519650980830193, + "learning_rate": 9.943558802913282e-05, + "loss": 0.013568704016506672, + "num_input_tokens_seen": 15524448, + "step": 948, + "train_runtime": 7704.0297, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 0.5751515151515152, + "grad_norm": 0.012753440998494625, + "learning_rate": 9.943414632793184e-05, + "loss": 0.012147994711995125, + "num_input_tokens_seen": 15540824, + "step": 949, + "train_runtime": 7712.145, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 0.011699757538735867, + "learning_rate": 9.943270279825803e-05, + "loss": 0.013070912100374699, + "num_input_tokens_seen": 15557200, + "step": 950, + "train_runtime": 7720.2554, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 0.5763636363636364, + "grad_norm": 0.01527287345379591, + "learning_rate": 9.943125744016483e-05, + "loss": 0.011352474801242352, + "num_input_tokens_seen": 15573576, + "step": 951, + "train_runtime": 7728.3625, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 0.576969696969697, + "grad_norm": 0.025451278313994408, + "learning_rate": 9.942981025370568e-05, + "loss": 0.013020837679505348, + "num_input_tokens_seen": 15589952, + "step": 952, + "train_runtime": 7736.4706, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 0.5775757575757576, + "grad_norm": 0.021832725033164024, + "learning_rate": 9.942836123893408e-05, + "loss": 0.015131472609937191, + "num_input_tokens_seen": 15606328, + "step": 953, + "train_runtime": 7744.5854, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 0.5781818181818181, + "grad_norm": 0.022370878607034683, + "learning_rate": 9.942691039590369e-05, + "loss": 0.012688050046563148, + "num_input_tokens_seen": 15622704, + "step": 954, + "train_runtime": 7752.6981, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 0.5787878787878787, + "grad_norm": 0.021051136776804924, + "learning_rate": 9.942545772466814e-05, + "loss": 0.012345478869974613, + "num_input_tokens_seen": 15639080, + "step": 955, + "train_runtime": 7760.8061, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 0.5793939393939394, + "grad_norm": 0.01372633595019579, + "learning_rate": 9.942400322528114e-05, + "loss": 0.012315414845943451, + "num_input_tokens_seen": 15655456, + "step": 956, + "train_runtime": 7768.9154, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 0.58, + "grad_norm": 0.028729503974318504, + "learning_rate": 9.942254689779651e-05, + "loss": 0.013109761290252209, + "num_input_tokens_seen": 15671832, + "step": 957, + "train_runtime": 7777.0294, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 0.5806060606060606, + "grad_norm": 0.029019076377153397, + "learning_rate": 9.942108874226811e-05, + "loss": 0.013196980580687523, + "num_input_tokens_seen": 15688208, + "step": 958, + "train_runtime": 7785.1364, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 0.5812121212121212, + "grad_norm": 0.011110197752714157, + "learning_rate": 9.94196287587499e-05, + "loss": 0.012527218088507652, + "num_input_tokens_seen": 15704584, + "step": 959, + "train_runtime": 7793.2454, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 0.012445122934877872, + "learning_rate": 9.941816694729586e-05, + "loss": 0.013050834648311138, + "num_input_tokens_seen": 15720960, + "step": 960, + "train_runtime": 7801.3578, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.5824242424242424, + "grad_norm": 0.01324465125799179, + "learning_rate": 9.941670330796007e-05, + "loss": 0.012385859154164791, + "num_input_tokens_seen": 15737336, + "step": 961, + "train_runtime": 7809.4681, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 0.583030303030303, + "grad_norm": 0.020351726561784744, + "learning_rate": 9.941523784079665e-05, + "loss": 0.013481922447681427, + "num_input_tokens_seen": 15753712, + "step": 962, + "train_runtime": 7817.5774, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.5836363636363636, + "grad_norm": 0.017218874767422676, + "learning_rate": 9.94137705458598e-05, + "loss": 0.011243843473494053, + "num_input_tokens_seen": 15770088, + "step": 963, + "train_runtime": 7825.6869, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 0.5842424242424242, + "grad_norm": 0.020052634179592133, + "learning_rate": 9.941230142320381e-05, + "loss": 0.01419176533818245, + "num_input_tokens_seen": 15786464, + "step": 964, + "train_runtime": 7833.7989, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 0.5848484848484848, + "grad_norm": 0.01865479350090027, + "learning_rate": 9.941083047288305e-05, + "loss": 0.013855772092938423, + "num_input_tokens_seen": 15802840, + "step": 965, + "train_runtime": 7841.9087, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.5854545454545454, + "grad_norm": 0.019557680934667587, + "learning_rate": 9.940935769495186e-05, + "loss": 0.014046021737158298, + "num_input_tokens_seen": 15819216, + "step": 966, + "train_runtime": 7850.0169, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 0.5860606060606061, + "grad_norm": 0.01921168901026249, + "learning_rate": 9.940788308946476e-05, + "loss": 0.013276162557303905, + "num_input_tokens_seen": 15835592, + "step": 967, + "train_runtime": 7858.13, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.015911763533949852, + "learning_rate": 9.940640665647626e-05, + "loss": 0.012454750947654247, + "num_input_tokens_seen": 15851968, + "step": 968, + "train_runtime": 7866.2398, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 0.5872727272727273, + "grad_norm": 0.020958999171853065, + "learning_rate": 9.940492839604103e-05, + "loss": 0.01228359155356884, + "num_input_tokens_seen": 15868344, + "step": 969, + "train_runtime": 7874.3484, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 0.5878787878787879, + "grad_norm": 0.017634913325309753, + "learning_rate": 9.940344830821368e-05, + "loss": 0.013240614905953407, + "num_input_tokens_seen": 15884720, + "step": 970, + "train_runtime": 7882.4581, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 0.5884848484848485, + "grad_norm": 0.018232690170407295, + "learning_rate": 9.9401966393049e-05, + "loss": 0.01443801261484623, + "num_input_tokens_seen": 15901096, + "step": 971, + "train_runtime": 7890.5672, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 0.5890909090909091, + "grad_norm": 0.021868707612156868, + "learning_rate": 9.94004826506018e-05, + "loss": 0.014730443246662617, + "num_input_tokens_seen": 15917472, + "step": 972, + "train_runtime": 7898.6753, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 0.5896969696969697, + "grad_norm": 0.015589121729135513, + "learning_rate": 9.939899708092692e-05, + "loss": 0.011880002915859222, + "num_input_tokens_seen": 15933848, + "step": 973, + "train_runtime": 7906.7854, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 0.5903030303030303, + "grad_norm": 0.010916100814938545, + "learning_rate": 9.939750968407938e-05, + "loss": 0.011822294443845749, + "num_input_tokens_seen": 15950224, + "step": 974, + "train_runtime": 7914.891, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 0.5909090909090909, + "grad_norm": 0.014051892794668674, + "learning_rate": 9.939602046011412e-05, + "loss": 0.012878884561359882, + "num_input_tokens_seen": 15966600, + "step": 975, + "train_runtime": 7923.0019, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 0.5915151515151515, + "grad_norm": 0.032839711755514145, + "learning_rate": 9.939452940908626e-05, + "loss": 0.014527475461363792, + "num_input_tokens_seen": 15982976, + "step": 976, + "train_runtime": 7931.1131, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 0.5921212121212122, + "grad_norm": 0.020389258861541748, + "learning_rate": 9.939303653105096e-05, + "loss": 0.013167984783649445, + "num_input_tokens_seen": 15999352, + "step": 977, + "train_runtime": 7939.2338, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 0.5927272727272728, + "grad_norm": 0.025760652497410774, + "learning_rate": 9.939154182606341e-05, + "loss": 0.01562490500509739, + "num_input_tokens_seen": 16015728, + "step": 978, + "train_runtime": 7947.343, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 0.5933333333333334, + "grad_norm": 0.017900720238685608, + "learning_rate": 9.939004529417894e-05, + "loss": 0.011635327711701393, + "num_input_tokens_seen": 16032104, + "step": 979, + "train_runtime": 7955.4555, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 0.593939393939394, + "grad_norm": 0.018658578395843506, + "learning_rate": 9.938854693545285e-05, + "loss": 0.011654762551188469, + "num_input_tokens_seen": 16048480, + "step": 980, + "train_runtime": 7963.5661, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 0.5945454545454546, + "grad_norm": 0.01790103130042553, + "learning_rate": 9.938704674994062e-05, + "loss": 0.013270128518342972, + "num_input_tokens_seen": 16064856, + "step": 981, + "train_runtime": 7971.6756, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 0.5951515151515151, + "grad_norm": 0.039879657328128815, + "learning_rate": 9.938554473769768e-05, + "loss": 0.01646546646952629, + "num_input_tokens_seen": 16081232, + "step": 982, + "train_runtime": 7979.7879, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 0.5957575757575757, + "grad_norm": 0.013998485170304775, + "learning_rate": 9.938404089877961e-05, + "loss": 0.012206289917230606, + "num_input_tokens_seen": 16097608, + "step": 983, + "train_runtime": 7987.8964, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 0.5963636363636363, + "grad_norm": 0.006746624130755663, + "learning_rate": 9.938253523324206e-05, + "loss": 0.012235766276717186, + "num_input_tokens_seen": 16113984, + "step": 984, + "train_runtime": 7996.0036, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 0.5969696969696969, + "grad_norm": 0.022575756534934044, + "learning_rate": 9.93810277411407e-05, + "loss": 0.012963814660906792, + "num_input_tokens_seen": 16130360, + "step": 985, + "train_runtime": 8004.1105, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 0.5975757575757575, + "grad_norm": 0.007626754697412252, + "learning_rate": 9.937951842253127e-05, + "loss": 0.01213219203054905, + "num_input_tokens_seen": 16146736, + "step": 986, + "train_runtime": 8012.2202, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 0.5981818181818181, + "grad_norm": 0.013599387370049953, + "learning_rate": 9.937800727746964e-05, + "loss": 0.012984167784452438, + "num_input_tokens_seen": 16163112, + "step": 987, + "train_runtime": 8020.337, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 0.5987878787878788, + "grad_norm": 0.010270299389958382, + "learning_rate": 9.937649430601166e-05, + "loss": 0.011544723995029926, + "num_input_tokens_seen": 16179488, + "step": 988, + "train_runtime": 8028.447, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 0.5993939393939394, + "grad_norm": 0.03377272188663483, + "learning_rate": 9.937497950821332e-05, + "loss": 0.01466489490121603, + "num_input_tokens_seen": 16195864, + "step": 989, + "train_runtime": 8036.5629, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 0.6, + "grad_norm": 0.012808220461010933, + "learning_rate": 9.937346288413064e-05, + "loss": 0.014080810360610485, + "num_input_tokens_seen": 16212240, + "step": 990, + "train_runtime": 8044.6741, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.6006060606060606, + "grad_norm": 0.022888874635100365, + "learning_rate": 9.937194443381972e-05, + "loss": 0.012964661233127117, + "num_input_tokens_seen": 16228616, + "step": 991, + "train_runtime": 8052.7845, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 0.6012121212121212, + "grad_norm": 0.028279505670070648, + "learning_rate": 9.937042415733673e-05, + "loss": 0.012717594392597675, + "num_input_tokens_seen": 16244992, + "step": 992, + "train_runtime": 8060.8929, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 0.6018181818181818, + "grad_norm": 0.09445340186357498, + "learning_rate": 9.936890205473787e-05, + "loss": 0.013668234460055828, + "num_input_tokens_seen": 16261368, + "step": 993, + "train_runtime": 8069.0044, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 0.6024242424242424, + "grad_norm": 0.008610354736447334, + "learning_rate": 9.936737812607949e-05, + "loss": 0.011679118499159813, + "num_input_tokens_seen": 16277744, + "step": 994, + "train_runtime": 8077.1154, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 0.603030303030303, + "grad_norm": 0.017112495377659798, + "learning_rate": 9.936585237141792e-05, + "loss": 0.012689062394201756, + "num_input_tokens_seen": 16294120, + "step": 995, + "train_runtime": 8085.2294, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 0.6036363636363636, + "grad_norm": 0.0271944347769022, + "learning_rate": 9.936432479080961e-05, + "loss": 0.014213870279490948, + "num_input_tokens_seen": 16310496, + "step": 996, + "train_runtime": 8093.3362, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 0.6042424242424242, + "grad_norm": 0.012547393329441547, + "learning_rate": 9.936279538431106e-05, + "loss": 0.012523166835308075, + "num_input_tokens_seen": 16326872, + "step": 997, + "train_runtime": 8101.4449, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 0.6048484848484849, + "grad_norm": 0.02419351600110531, + "learning_rate": 9.936126415197884e-05, + "loss": 0.014308387413620949, + "num_input_tokens_seen": 16343248, + "step": 998, + "train_runtime": 8109.5556, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 0.6054545454545455, + "grad_norm": 0.015599401667714119, + "learning_rate": 9.935973109386958e-05, + "loss": 0.012808605097234249, + "num_input_tokens_seen": 16359624, + "step": 999, + "train_runtime": 8117.6633, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.021892806515097618, + "learning_rate": 9.935819621003999e-05, + "loss": 0.013939116150140762, + "num_input_tokens_seen": 16376000, + "step": 1000, + "train_runtime": 8125.7712, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 0.6066666666666667, + "grad_norm": 0.01672331802546978, + "learning_rate": 9.935665950054684e-05, + "loss": 0.014093529433012009, + "num_input_tokens_seen": 16392376, + "step": 1001, + "train_runtime": 8134.7177, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 0.6072727272727273, + "grad_norm": 0.009217355400323868, + "learning_rate": 9.9355120965447e-05, + "loss": 0.01290955115109682, + "num_input_tokens_seen": 16408752, + "step": 1002, + "train_runtime": 8142.8295, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 0.6078787878787879, + "grad_norm": 0.01524933148175478, + "learning_rate": 9.935358060479731e-05, + "loss": 0.012339223176240921, + "num_input_tokens_seen": 16425128, + "step": 1003, + "train_runtime": 8150.9365, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 0.6084848484848485, + "grad_norm": 0.02360517345368862, + "learning_rate": 9.935203841865482e-05, + "loss": 0.012756834737956524, + "num_input_tokens_seen": 16441504, + "step": 1004, + "train_runtime": 8159.0458, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 0.6090909090909091, + "grad_norm": 0.020947473123669624, + "learning_rate": 9.93504944070765e-05, + "loss": 0.012582367286086082, + "num_input_tokens_seen": 16457880, + "step": 1005, + "train_runtime": 8167.1644, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 0.6096969696969697, + "grad_norm": 0.01945319212973118, + "learning_rate": 9.934894857011953e-05, + "loss": 0.012788314372301102, + "num_input_tokens_seen": 16474256, + "step": 1006, + "train_runtime": 8175.2812, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 0.6103030303030303, + "grad_norm": 0.0219440758228302, + "learning_rate": 9.934740090784103e-05, + "loss": 0.013707922771573067, + "num_input_tokens_seen": 16490632, + "step": 1007, + "train_runtime": 8183.3988, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 0.610909090909091, + "grad_norm": 0.012798693962395191, + "learning_rate": 9.934585142029828e-05, + "loss": 0.013069421984255314, + "num_input_tokens_seen": 16507008, + "step": 1008, + "train_runtime": 8191.5179, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 0.6115151515151516, + "grad_norm": 0.012583008036017418, + "learning_rate": 9.934430010754861e-05, + "loss": 0.011966132558882236, + "num_input_tokens_seen": 16523384, + "step": 1009, + "train_runtime": 8199.6359, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 0.6121212121212121, + "grad_norm": 0.03669752925634384, + "learning_rate": 9.934274696964934e-05, + "loss": 0.014166103675961494, + "num_input_tokens_seen": 16539760, + "step": 1010, + "train_runtime": 8207.7511, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.6127272727272727, + "grad_norm": 0.019834555685520172, + "learning_rate": 9.934119200665795e-05, + "loss": 0.011456426233053207, + "num_input_tokens_seen": 16556136, + "step": 1011, + "train_runtime": 8215.8683, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.017150534316897392, + "learning_rate": 9.933963521863196e-05, + "loss": 0.012325924821197987, + "num_input_tokens_seen": 16572512, + "step": 1012, + "train_runtime": 8223.99, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 0.6139393939393939, + "grad_norm": 0.013030534610152245, + "learning_rate": 9.933807660562898e-05, + "loss": 0.012827505357563496, + "num_input_tokens_seen": 16588888, + "step": 1013, + "train_runtime": 8232.106, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 0.6145454545454545, + "grad_norm": 0.01751735992729664, + "learning_rate": 9.933651616770658e-05, + "loss": 0.012782123871147633, + "num_input_tokens_seen": 16605264, + "step": 1014, + "train_runtime": 8240.2294, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 0.6151515151515151, + "grad_norm": 0.013464527204632759, + "learning_rate": 9.933495390492256e-05, + "loss": 0.014123444445431232, + "num_input_tokens_seen": 16621640, + "step": 1015, + "train_runtime": 8248.3463, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 0.6157575757575757, + "grad_norm": 0.026679445058107376, + "learning_rate": 9.933338981733464e-05, + "loss": 0.012160470709204674, + "num_input_tokens_seen": 16638016, + "step": 1016, + "train_runtime": 8256.4635, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 0.6163636363636363, + "grad_norm": 0.010502724908292294, + "learning_rate": 9.933182390500073e-05, + "loss": 0.011820110492408276, + "num_input_tokens_seen": 16654392, + "step": 1017, + "train_runtime": 8264.5788, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 0.616969696969697, + "grad_norm": 0.013210924342274666, + "learning_rate": 9.93302561679787e-05, + "loss": 0.013029432855546474, + "num_input_tokens_seen": 16670768, + "step": 1018, + "train_runtime": 8272.6927, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.6175757575757576, + "grad_norm": 0.032258208841085434, + "learning_rate": 9.932868660632659e-05, + "loss": 0.012911350466310978, + "num_input_tokens_seen": 16687144, + "step": 1019, + "train_runtime": 8280.8162, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 0.03345981612801552, + "learning_rate": 9.932711522010241e-05, + "loss": 0.01444256491959095, + "num_input_tokens_seen": 16703520, + "step": 1020, + "train_runtime": 8288.934, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.6187878787878788, + "grad_norm": 0.023281559348106384, + "learning_rate": 9.932554200936429e-05, + "loss": 0.014297975227236748, + "num_input_tokens_seen": 16719896, + "step": 1021, + "train_runtime": 8297.0522, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 0.6193939393939394, + "grad_norm": 0.02298637479543686, + "learning_rate": 9.932396697417044e-05, + "loss": 0.012052800506353378, + "num_input_tokens_seen": 16736272, + "step": 1022, + "train_runtime": 8305.1688, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.62, + "grad_norm": 0.01204346027225256, + "learning_rate": 9.932239011457909e-05, + "loss": 0.012858795002102852, + "num_input_tokens_seen": 16752648, + "step": 1023, + "train_runtime": 8313.2898, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.6206060606060606, + "grad_norm": 0.018114762380719185, + "learning_rate": 9.93208114306486e-05, + "loss": 0.013215101324021816, + "num_input_tokens_seen": 16769024, + "step": 1024, + "train_runtime": 8321.4063, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 0.6212121212121212, + "grad_norm": 0.009015897288918495, + "learning_rate": 9.931923092243733e-05, + "loss": 0.013312953524291515, + "num_input_tokens_seen": 16785400, + "step": 1025, + "train_runtime": 8329.5303, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 0.6218181818181818, + "grad_norm": 0.011126898229122162, + "learning_rate": 9.931764859000375e-05, + "loss": 0.011524452827870846, + "num_input_tokens_seen": 16801776, + "step": 1026, + "train_runtime": 8337.647, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 0.6224242424242424, + "grad_norm": 0.021657567471265793, + "learning_rate": 9.93160644334064e-05, + "loss": 0.012531260028481483, + "num_input_tokens_seen": 16818152, + "step": 1027, + "train_runtime": 8345.7666, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 0.623030303030303, + "grad_norm": 0.05316740646958351, + "learning_rate": 9.931447845270388e-05, + "loss": 0.013248222880065441, + "num_input_tokens_seen": 16834528, + "step": 1028, + "train_runtime": 8353.8829, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 0.6236363636363637, + "grad_norm": 0.012917754240334034, + "learning_rate": 9.931289064795482e-05, + "loss": 0.013202149420976639, + "num_input_tokens_seen": 16850904, + "step": 1029, + "train_runtime": 8362.0006, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 0.6242424242424243, + "grad_norm": 0.021064553409814835, + "learning_rate": 9.931130101921795e-05, + "loss": 0.013943769969046116, + "num_input_tokens_seen": 16867280, + "step": 1030, + "train_runtime": 8370.1194, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.6248484848484849, + "grad_norm": 0.012005737982690334, + "learning_rate": 9.930970956655212e-05, + "loss": 0.012500936165452003, + "num_input_tokens_seen": 16883656, + "step": 1031, + "train_runtime": 8378.2369, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 0.6254545454545455, + "grad_norm": 0.02506149373948574, + "learning_rate": 9.930811629001613e-05, + "loss": 0.014318128116428852, + "num_input_tokens_seen": 16900032, + "step": 1032, + "train_runtime": 8386.3552, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 0.6260606060606061, + "grad_norm": 0.03320576995611191, + "learning_rate": 9.930652118966895e-05, + "loss": 0.010508203878998756, + "num_input_tokens_seen": 16916408, + "step": 1033, + "train_runtime": 8394.4718, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.03429649397730827, + "learning_rate": 9.93049242655696e-05, + "loss": 0.012183441780507565, + "num_input_tokens_seen": 16932784, + "step": 1034, + "train_runtime": 8402.5875, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 0.6272727272727273, + "grad_norm": 0.01607862487435341, + "learning_rate": 9.930332551777708e-05, + "loss": 0.013750139623880386, + "num_input_tokens_seen": 16949160, + "step": 1035, + "train_runtime": 8410.7043, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 0.6278787878787879, + "grad_norm": 0.01341179572045803, + "learning_rate": 9.930172494635057e-05, + "loss": 0.012538340874016285, + "num_input_tokens_seen": 16965536, + "step": 1036, + "train_runtime": 8418.8297, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 0.6284848484848485, + "grad_norm": 0.00997228641062975, + "learning_rate": 9.930012255134928e-05, + "loss": 0.012722784653306007, + "num_input_tokens_seen": 16981912, + "step": 1037, + "train_runtime": 8426.9482, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 0.6290909090909091, + "grad_norm": 0.00990308728069067, + "learning_rate": 9.929851833283245e-05, + "loss": 0.013942928053438663, + "num_input_tokens_seen": 16998288, + "step": 1038, + "train_runtime": 8435.0672, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 0.6296969696969696, + "grad_norm": 0.011313795112073421, + "learning_rate": 9.929691229085944e-05, + "loss": 0.011238223873078823, + "num_input_tokens_seen": 17014664, + "step": 1039, + "train_runtime": 8443.1862, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 0.6303030303030303, + "grad_norm": 0.010831150226294994, + "learning_rate": 9.929530442548965e-05, + "loss": 0.012601799331605434, + "num_input_tokens_seen": 17031040, + "step": 1040, + "train_runtime": 8451.3035, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 0.6309090909090909, + "grad_norm": 0.014783729799091816, + "learning_rate": 9.929369473678253e-05, + "loss": 0.013956460170447826, + "num_input_tokens_seen": 17047416, + "step": 1041, + "train_runtime": 8459.4295, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 0.6315151515151515, + "grad_norm": 0.01627667248249054, + "learning_rate": 9.929208322479764e-05, + "loss": 0.013232799246907234, + "num_input_tokens_seen": 17063792, + "step": 1042, + "train_runtime": 8467.5479, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 0.6321212121212121, + "grad_norm": 0.011055609211325645, + "learning_rate": 9.92904698895946e-05, + "loss": 0.01293270569294691, + "num_input_tokens_seen": 17080168, + "step": 1043, + "train_runtime": 8475.665, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 0.6327272727272727, + "grad_norm": 0.03507707267999649, + "learning_rate": 9.928885473123306e-05, + "loss": 0.012113180011510849, + "num_input_tokens_seen": 17096544, + "step": 1044, + "train_runtime": 8483.7839, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 0.01946045272052288, + "learning_rate": 9.928723774977275e-05, + "loss": 0.013142693787813187, + "num_input_tokens_seen": 17112920, + "step": 1045, + "train_runtime": 8491.9041, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 0.6339393939393939, + "grad_norm": 0.021705901250243187, + "learning_rate": 9.928561894527353e-05, + "loss": 0.012501654215157032, + "num_input_tokens_seen": 17129296, + "step": 1046, + "train_runtime": 8500.0295, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 0.6345454545454545, + "grad_norm": 0.019804542884230614, + "learning_rate": 9.928399831779523e-05, + "loss": 0.012758147902786732, + "num_input_tokens_seen": 17145672, + "step": 1047, + "train_runtime": 8508.1486, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 0.6351515151515151, + "grad_norm": 0.011929893866181374, + "learning_rate": 9.928237586739781e-05, + "loss": 0.013042271137237549, + "num_input_tokens_seen": 17162048, + "step": 1048, + "train_runtime": 8516.2673, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 0.6357575757575757, + "grad_norm": 0.028489001095294952, + "learning_rate": 9.928075159414128e-05, + "loss": 0.013056590221822262, + "num_input_tokens_seen": 17178424, + "step": 1049, + "train_runtime": 8524.3858, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.01078235823661089, + "learning_rate": 9.927912549808572e-05, + "loss": 0.012080740183591843, + "num_input_tokens_seen": 17194800, + "step": 1050, + "train_runtime": 8532.5029, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 0.636969696969697, + "grad_norm": 0.021545223891735077, + "learning_rate": 9.927749757929125e-05, + "loss": 0.015170791186392307, + "num_input_tokens_seen": 17211176, + "step": 1051, + "train_runtime": 8540.6203, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 0.6375757575757576, + "grad_norm": 0.021686149761080742, + "learning_rate": 9.927586783781814e-05, + "loss": 0.013388474471867085, + "num_input_tokens_seen": 17227552, + "step": 1052, + "train_runtime": 8548.7393, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 0.6381818181818182, + "grad_norm": 0.019198935478925705, + "learning_rate": 9.927423627372663e-05, + "loss": 0.013151840306818485, + "num_input_tokens_seen": 17243928, + "step": 1053, + "train_runtime": 8556.8572, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 0.6387878787878788, + "grad_norm": 0.026876596733927727, + "learning_rate": 9.927260288707707e-05, + "loss": 0.01568884216248989, + "num_input_tokens_seen": 17260304, + "step": 1054, + "train_runtime": 8564.9754, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 0.6393939393939394, + "grad_norm": 0.02315112017095089, + "learning_rate": 9.92709676779299e-05, + "loss": 0.013643411919474602, + "num_input_tokens_seen": 17276680, + "step": 1055, + "train_runtime": 8573.0936, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 0.64, + "grad_norm": 0.013450577855110168, + "learning_rate": 9.926933064634558e-05, + "loss": 0.011888994835317135, + "num_input_tokens_seen": 17293056, + "step": 1056, + "train_runtime": 8581.213, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 0.6406060606060606, + "grad_norm": 0.038361355662345886, + "learning_rate": 9.926769179238466e-05, + "loss": 0.01497360784560442, + "num_input_tokens_seen": 17309432, + "step": 1057, + "train_runtime": 8589.3331, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 0.6412121212121212, + "grad_norm": 0.019271399825811386, + "learning_rate": 9.926605111610776e-05, + "loss": 0.014056256040930748, + "num_input_tokens_seen": 17325808, + "step": 1058, + "train_runtime": 8597.4511, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 0.6418181818181818, + "grad_norm": 0.01557596493512392, + "learning_rate": 9.926440861757557e-05, + "loss": 0.012796062976121902, + "num_input_tokens_seen": 17342184, + "step": 1059, + "train_runtime": 8605.5697, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 0.6424242424242425, + "grad_norm": 0.005278696306049824, + "learning_rate": 9.926276429684886e-05, + "loss": 0.011402487754821777, + "num_input_tokens_seen": 17358560, + "step": 1060, + "train_runtime": 8613.6883, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 0.6430303030303031, + "grad_norm": 0.015694163739681244, + "learning_rate": 9.926111815398843e-05, + "loss": 0.013192391023039818, + "num_input_tokens_seen": 17374936, + "step": 1061, + "train_runtime": 8621.8068, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 0.6436363636363637, + "grad_norm": 0.01900624856352806, + "learning_rate": 9.925947018905516e-05, + "loss": 0.013219461776316166, + "num_input_tokens_seen": 17391312, + "step": 1062, + "train_runtime": 8629.9293, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 0.6442424242424243, + "grad_norm": 0.013446804136037827, + "learning_rate": 9.925782040211002e-05, + "loss": 0.011763139627873898, + "num_input_tokens_seen": 17407688, + "step": 1063, + "train_runtime": 8638.0493, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 0.6448484848484849, + "grad_norm": 0.01933007501065731, + "learning_rate": 9.925616879321404e-05, + "loss": 0.011931811459362507, + "num_input_tokens_seen": 17424064, + "step": 1064, + "train_runtime": 8646.1674, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 0.6454545454545455, + "grad_norm": 0.016764989122748375, + "learning_rate": 9.925451536242829e-05, + "loss": 0.013410956598818302, + "num_input_tokens_seen": 17440440, + "step": 1065, + "train_runtime": 8654.2855, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 0.6460606060606061, + "grad_norm": 0.019174639135599136, + "learning_rate": 9.925286010981394e-05, + "loss": 0.014691396616399288, + "num_input_tokens_seen": 17456816, + "step": 1066, + "train_runtime": 8662.4024, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 0.6466666666666666, + "grad_norm": 0.0077021801844239235, + "learning_rate": 9.925120303543219e-05, + "loss": 0.012529893778264523, + "num_input_tokens_seen": 17473192, + "step": 1067, + "train_runtime": 8670.5209, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 0.6472727272727272, + "grad_norm": 0.014966354705393314, + "learning_rate": 9.924954413934438e-05, + "loss": 0.013215701095759869, + "num_input_tokens_seen": 17489568, + "step": 1068, + "train_runtime": 8678.6394, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 0.6478787878787878, + "grad_norm": 0.020852232351899147, + "learning_rate": 9.924788342161182e-05, + "loss": 0.013355967588722706, + "num_input_tokens_seen": 17505944, + "step": 1069, + "train_runtime": 8686.7585, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 0.6484848484848484, + "grad_norm": 0.017107227817177773, + "learning_rate": 9.924622088229597e-05, + "loss": 0.014044157229363918, + "num_input_tokens_seen": 17522320, + "step": 1070, + "train_runtime": 8694.8787, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 0.649090909090909, + "grad_norm": 0.015282119624316692, + "learning_rate": 9.924455652145831e-05, + "loss": 0.01387142762541771, + "num_input_tokens_seen": 17538696, + "step": 1071, + "train_runtime": 8702.997, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 0.6496969696969697, + "grad_norm": 0.010007917881011963, + "learning_rate": 9.92428903391604e-05, + "loss": 0.01257625874131918, + "num_input_tokens_seen": 17555072, + "step": 1072, + "train_runtime": 8711.1152, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 0.6503030303030303, + "grad_norm": 0.009446706622838974, + "learning_rate": 9.924122233546386e-05, + "loss": 0.013552306219935417, + "num_input_tokens_seen": 17571448, + "step": 1073, + "train_runtime": 8719.2328, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 0.6509090909090909, + "grad_norm": 0.012225381098687649, + "learning_rate": 9.923955251043042e-05, + "loss": 0.011776247061789036, + "num_input_tokens_seen": 17587824, + "step": 1074, + "train_runtime": 8727.35, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 0.6515151515151515, + "grad_norm": 0.020964186638593674, + "learning_rate": 9.923788086412182e-05, + "loss": 0.012502472847700119, + "num_input_tokens_seen": 17604200, + "step": 1075, + "train_runtime": 8735.4695, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 0.6521212121212121, + "grad_norm": 0.017575940117239952, + "learning_rate": 9.923620739659989e-05, + "loss": 0.012096179649233818, + "num_input_tokens_seen": 17620576, + "step": 1076, + "train_runtime": 8743.5854, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 0.6527272727272727, + "grad_norm": 0.013330096378922462, + "learning_rate": 9.923453210792653e-05, + "loss": 0.013803805224597454, + "num_input_tokens_seen": 17636952, + "step": 1077, + "train_runtime": 8751.7034, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.011349702253937721, + "learning_rate": 9.92328549981637e-05, + "loss": 0.013793877325952053, + "num_input_tokens_seen": 17653328, + "step": 1078, + "train_runtime": 8759.8296, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 0.6539393939393939, + "grad_norm": 0.015959061682224274, + "learning_rate": 9.923117606737346e-05, + "loss": 0.013326899148523808, + "num_input_tokens_seen": 17669704, + "step": 1079, + "train_runtime": 8767.9523, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 0.014492125250399113, + "learning_rate": 9.922949531561788e-05, + "loss": 0.01288958266377449, + "num_input_tokens_seen": 17686080, + "step": 1080, + "train_runtime": 8776.0735, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 0.6551515151515152, + "grad_norm": 0.013345365412533283, + "learning_rate": 9.922781274295913e-05, + "loss": 0.012366179376840591, + "num_input_tokens_seen": 17702456, + "step": 1081, + "train_runtime": 8784.1923, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 0.6557575757575758, + "grad_norm": 0.010763085447251797, + "learning_rate": 9.922612834945947e-05, + "loss": 0.01264217309653759, + "num_input_tokens_seen": 17718832, + "step": 1082, + "train_runtime": 8792.3102, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 0.6563636363636364, + "grad_norm": 0.011818567290902138, + "learning_rate": 9.922444213518117e-05, + "loss": 0.013193395920097828, + "num_input_tokens_seen": 17735208, + "step": 1083, + "train_runtime": 8800.4295, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 0.656969696969697, + "grad_norm": 0.010724381543695927, + "learning_rate": 9.922275410018663e-05, + "loss": 0.012857016175985336, + "num_input_tokens_seen": 17751584, + "step": 1084, + "train_runtime": 8808.5474, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 0.6575757575757576, + "grad_norm": 0.017108984291553497, + "learning_rate": 9.922106424453826e-05, + "loss": 0.013113675639033318, + "num_input_tokens_seen": 17767960, + "step": 1085, + "train_runtime": 8816.6647, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 0.6581818181818182, + "grad_norm": 0.022697484120726585, + "learning_rate": 9.921937256829859e-05, + "loss": 0.012546958401799202, + "num_input_tokens_seen": 17784336, + "step": 1086, + "train_runtime": 8824.7847, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 0.6587878787878788, + "grad_norm": 0.014008583500981331, + "learning_rate": 9.921767907153016e-05, + "loss": 0.011740295216441154, + "num_input_tokens_seen": 17800712, + "step": 1087, + "train_runtime": 8832.904, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 0.6593939393939394, + "grad_norm": 0.011233743280172348, + "learning_rate": 9.921598375429564e-05, + "loss": 0.011731310747563839, + "num_input_tokens_seen": 17817088, + "step": 1088, + "train_runtime": 8841.0299, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 0.66, + "grad_norm": 0.011883188039064407, + "learning_rate": 9.921428661665772e-05, + "loss": 0.012650273740291595, + "num_input_tokens_seen": 17833464, + "step": 1089, + "train_runtime": 8849.1483, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 0.6606060606060606, + "grad_norm": 0.010079750791192055, + "learning_rate": 9.921258765867919e-05, + "loss": 0.012131286785006523, + "num_input_tokens_seen": 17849840, + "step": 1090, + "train_runtime": 8857.2661, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.6612121212121213, + "grad_norm": 0.013724222779273987, + "learning_rate": 9.921088688042287e-05, + "loss": 0.012973928824067116, + "num_input_tokens_seen": 17866216, + "step": 1091, + "train_runtime": 8865.3859, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 0.6618181818181819, + "grad_norm": 0.019831640645861626, + "learning_rate": 9.920918428195168e-05, + "loss": 0.01297835074365139, + "num_input_tokens_seen": 17882592, + "step": 1092, + "train_runtime": 8873.5052, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 0.6624242424242425, + "grad_norm": 0.011757400818169117, + "learning_rate": 9.920747986332858e-05, + "loss": 0.013069117441773415, + "num_input_tokens_seen": 17898968, + "step": 1093, + "train_runtime": 8881.6295, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 0.6630303030303031, + "grad_norm": 0.013741742819547653, + "learning_rate": 9.920577362461665e-05, + "loss": 0.013204855844378471, + "num_input_tokens_seen": 17915344, + "step": 1094, + "train_runtime": 8889.749, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 0.6636363636363637, + "grad_norm": 0.02447706274688244, + "learning_rate": 9.920406556587897e-05, + "loss": 0.011999960988759995, + "num_input_tokens_seen": 17931720, + "step": 1095, + "train_runtime": 8897.8668, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 0.6642424242424242, + "grad_norm": 0.03095782734453678, + "learning_rate": 9.920235568717873e-05, + "loss": 0.01361205242574215, + "num_input_tokens_seen": 17948096, + "step": 1096, + "train_runtime": 8905.9871, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 0.6648484848484848, + "grad_norm": 0.037076305598020554, + "learning_rate": 9.920064398857916e-05, + "loss": 0.012342737056314945, + "num_input_tokens_seen": 17964472, + "step": 1097, + "train_runtime": 8914.1084, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 0.6654545454545454, + "grad_norm": 0.053048014640808105, + "learning_rate": 9.91989304701436e-05, + "loss": 0.012850755825638771, + "num_input_tokens_seen": 17980848, + "step": 1098, + "train_runtime": 8922.229, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 0.666060606060606, + "grad_norm": 0.018742846325039864, + "learning_rate": 9.919721513193538e-05, + "loss": 0.012020561844110489, + "num_input_tokens_seen": 17997224, + "step": 1099, + "train_runtime": 8930.3477, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.013778687454760075, + "learning_rate": 9.919549797401802e-05, + "loss": 0.014269824139773846, + "num_input_tokens_seen": 18013600, + "step": 1100, + "train_runtime": 8938.4671, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 0.6672727272727272, + "grad_norm": 0.06041925400495529, + "learning_rate": 9.919377899645497e-05, + "loss": 0.013500120490789413, + "num_input_tokens_seen": 18029976, + "step": 1101, + "train_runtime": 8947.5154, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 0.6678787878787878, + "grad_norm": 0.006662312895059586, + "learning_rate": 9.919205819930983e-05, + "loss": 0.011903712525963783, + "num_input_tokens_seen": 18046352, + "step": 1102, + "train_runtime": 8955.6388, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 0.6684848484848485, + "grad_norm": 0.014133021235466003, + "learning_rate": 9.919033558264627e-05, + "loss": 0.013043178245425224, + "num_input_tokens_seen": 18062728, + "step": 1103, + "train_runtime": 8963.7612, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 0.6690909090909091, + "grad_norm": 0.018031738698482513, + "learning_rate": 9.918861114652798e-05, + "loss": 0.012816919945180416, + "num_input_tokens_seen": 18079104, + "step": 1104, + "train_runtime": 8971.8826, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 0.6696969696969697, + "grad_norm": 0.030864031985402107, + "learning_rate": 9.918688489101875e-05, + "loss": 0.011915095150470734, + "num_input_tokens_seen": 18095480, + "step": 1105, + "train_runtime": 8980.0063, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 0.6703030303030303, + "grad_norm": 0.859399676322937, + "learning_rate": 9.918515681618246e-05, + "loss": 0.014253467321395874, + "num_input_tokens_seen": 18111856, + "step": 1106, + "train_runtime": 8988.1298, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 0.6709090909090909, + "grad_norm": 0.009849797002971172, + "learning_rate": 9.918342692208297e-05, + "loss": 0.012211693450808525, + "num_input_tokens_seen": 18128232, + "step": 1107, + "train_runtime": 8996.2594, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 0.6715151515151515, + "grad_norm": 0.008677136152982712, + "learning_rate": 9.918169520878432e-05, + "loss": 0.013050990179181099, + "num_input_tokens_seen": 18144608, + "step": 1108, + "train_runtime": 9004.3806, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 0.6721212121212121, + "grad_norm": 0.020974334329366684, + "learning_rate": 9.917996167635053e-05, + "loss": 0.013656461611390114, + "num_input_tokens_seen": 18160984, + "step": 1109, + "train_runtime": 9012.5058, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 0.6727272727272727, + "grad_norm": 0.013642716221511364, + "learning_rate": 9.917822632484575e-05, + "loss": 0.012185771018266678, + "num_input_tokens_seen": 18177360, + "step": 1110, + "train_runtime": 9020.6295, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 0.6733333333333333, + "grad_norm": 0.01303725503385067, + "learning_rate": 9.917648915433413e-05, + "loss": 0.012668903917074203, + "num_input_tokens_seen": 18193736, + "step": 1111, + "train_runtime": 9028.755, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 0.673939393939394, + "grad_norm": 0.02112429402768612, + "learning_rate": 9.917475016487993e-05, + "loss": 0.014089099131524563, + "num_input_tokens_seen": 18210112, + "step": 1112, + "train_runtime": 9036.8746, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 0.6745454545454546, + "grad_norm": 0.016523541882634163, + "learning_rate": 9.917300935654751e-05, + "loss": 0.012728005647659302, + "num_input_tokens_seen": 18226488, + "step": 1113, + "train_runtime": 9044.9946, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 0.6751515151515152, + "grad_norm": 0.0112396739423275, + "learning_rate": 9.917126672940124e-05, + "loss": 0.013019783422350883, + "num_input_tokens_seen": 18242864, + "step": 1114, + "train_runtime": 9053.1208, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 0.6757575757575758, + "grad_norm": 0.21001896262168884, + "learning_rate": 9.916952228350556e-05, + "loss": 0.019040443003177643, + "num_input_tokens_seen": 18259240, + "step": 1115, + "train_runtime": 9061.2411, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 0.6763636363636364, + "grad_norm": 0.015162148512899876, + "learning_rate": 9.916777601892499e-05, + "loss": 0.011509026400744915, + "num_input_tokens_seen": 18275616, + "step": 1116, + "train_runtime": 9069.361, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 0.676969696969697, + "grad_norm": 0.018534110859036446, + "learning_rate": 9.916602793572415e-05, + "loss": 0.012472787871956825, + "num_input_tokens_seen": 18291992, + "step": 1117, + "train_runtime": 9077.4851, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 0.6775757575757576, + "grad_norm": 0.08402104675769806, + "learning_rate": 9.916427803396769e-05, + "loss": 0.014569929800927639, + "num_input_tokens_seen": 18308368, + "step": 1118, + "train_runtime": 9085.6102, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 0.6781818181818182, + "grad_norm": 0.018771981820464134, + "learning_rate": 9.91625263137203e-05, + "loss": 0.011995847336947918, + "num_input_tokens_seen": 18324744, + "step": 1119, + "train_runtime": 9093.7324, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 0.03660675883293152, + "learning_rate": 9.916077277504683e-05, + "loss": 0.013902310281991959, + "num_input_tokens_seen": 18341120, + "step": 1120, + "train_runtime": 9101.8526, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 0.6793939393939394, + "grad_norm": 0.02395397052168846, + "learning_rate": 9.91590174180121e-05, + "loss": 0.012367844581604004, + "num_input_tokens_seen": 18357496, + "step": 1121, + "train_runtime": 9109.9726, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.68, + "grad_norm": 0.019227512180805206, + "learning_rate": 9.915726024268104e-05, + "loss": 0.012134227901697159, + "num_input_tokens_seen": 18373872, + "step": 1122, + "train_runtime": 9118.0998, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.6806060606060607, + "grad_norm": 0.01857166923582554, + "learning_rate": 9.915550124911866e-05, + "loss": 0.013478003442287445, + "num_input_tokens_seen": 18390248, + "step": 1123, + "train_runtime": 9126.2293, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 0.6812121212121212, + "grad_norm": 0.04824969545006752, + "learning_rate": 9.915374043739003e-05, + "loss": 0.012269456870853901, + "num_input_tokens_seen": 18406624, + "step": 1124, + "train_runtime": 9134.35, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 3.1688060760498047, + "learning_rate": 9.915197780756025e-05, + "loss": 0.02297493815422058, + "num_input_tokens_seen": 18423000, + "step": 1125, + "train_runtime": 9142.4746, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 0.6824242424242424, + "grad_norm": 13.637248992919922, + "learning_rate": 9.915021335969452e-05, + "loss": 0.03535247966647148, + "num_input_tokens_seen": 18439376, + "step": 1126, + "train_runtime": 9150.5959, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 0.683030303030303, + "grad_norm": 0.018440239131450653, + "learning_rate": 9.914844709385813e-05, + "loss": 0.014308687299489975, + "num_input_tokens_seen": 18455752, + "step": 1127, + "train_runtime": 9158.7198, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 0.6836363636363636, + "grad_norm": 0.017091959714889526, + "learning_rate": 9.914667901011638e-05, + "loss": 0.012615025043487549, + "num_input_tokens_seen": 18472128, + "step": 1128, + "train_runtime": 9166.8428, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 0.6842424242424242, + "grad_norm": 0.040168218314647675, + "learning_rate": 9.91449091085347e-05, + "loss": 0.013721957802772522, + "num_input_tokens_seen": 18488504, + "step": 1129, + "train_runtime": 9174.9646, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 0.6848484848484848, + "grad_norm": 0.01958506926894188, + "learning_rate": 9.914313738917853e-05, + "loss": 0.015058807097375393, + "num_input_tokens_seen": 18504880, + "step": 1130, + "train_runtime": 9183.0766, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 0.6854545454545454, + "grad_norm": 0.041311051696538925, + "learning_rate": 9.914136385211341e-05, + "loss": 0.011465203016996384, + "num_input_tokens_seen": 18521256, + "step": 1131, + "train_runtime": 9191.1874, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.686060606060606, + "grad_norm": 0.029558753594756126, + "learning_rate": 9.913958849740493e-05, + "loss": 0.013997621834278107, + "num_input_tokens_seen": 18537632, + "step": 1132, + "train_runtime": 9199.2987, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 0.6866666666666666, + "grad_norm": 0.01560160145163536, + "learning_rate": 9.913781132511877e-05, + "loss": 0.01135623175650835, + "num_input_tokens_seen": 18554008, + "step": 1133, + "train_runtime": 9207.4109, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 0.6872727272727273, + "grad_norm": 0.026331115514039993, + "learning_rate": 9.913603233532067e-05, + "loss": 0.014213286340236664, + "num_input_tokens_seen": 18570384, + "step": 1134, + "train_runtime": 9215.5295, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 0.6878787878787879, + "grad_norm": 0.012758780270814896, + "learning_rate": 9.913425152807642e-05, + "loss": 0.013095496222376823, + "num_input_tokens_seen": 18586760, + "step": 1135, + "train_runtime": 9223.6386, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 0.6884848484848485, + "grad_norm": 0.02692464366555214, + "learning_rate": 9.913246890345189e-05, + "loss": 0.014479240402579308, + "num_input_tokens_seen": 18603136, + "step": 1136, + "train_runtime": 9231.7499, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 0.6890909090909091, + "grad_norm": 0.023674434050917625, + "learning_rate": 9.913068446151302e-05, + "loss": 0.01468647737056017, + "num_input_tokens_seen": 18619512, + "step": 1137, + "train_runtime": 9239.8624, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 0.6896969696969697, + "grad_norm": 0.043436527252197266, + "learning_rate": 9.912889820232578e-05, + "loss": 0.013666333630681038, + "num_input_tokens_seen": 18635888, + "step": 1138, + "train_runtime": 9247.9735, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 0.6903030303030303, + "grad_norm": 0.010912930592894554, + "learning_rate": 9.91271101259563e-05, + "loss": 0.013306580483913422, + "num_input_tokens_seen": 18652264, + "step": 1139, + "train_runtime": 9256.0853, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 0.027857549488544464, + "learning_rate": 9.912532023247068e-05, + "loss": 0.01315208338201046, + "num_input_tokens_seen": 18668640, + "step": 1140, + "train_runtime": 9264.193, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.6915151515151515, + "grad_norm": 0.014686026610434055, + "learning_rate": 9.912352852193514e-05, + "loss": 0.012413710355758667, + "num_input_tokens_seen": 18685016, + "step": 1141, + "train_runtime": 9272.3053, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 0.6921212121212121, + "grad_norm": 0.16849519312381744, + "learning_rate": 9.912173499441593e-05, + "loss": 0.013621876947581768, + "num_input_tokens_seen": 18701392, + "step": 1142, + "train_runtime": 9280.4143, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 0.6927272727272727, + "grad_norm": 0.025766436010599136, + "learning_rate": 9.91199396499794e-05, + "loss": 0.014693841338157654, + "num_input_tokens_seen": 18717768, + "step": 1143, + "train_runtime": 9288.5292, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.03636888787150383, + "learning_rate": 9.911814248869198e-05, + "loss": 0.015230114571750164, + "num_input_tokens_seen": 18734144, + "step": 1144, + "train_runtime": 9296.6386, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 0.693939393939394, + "grad_norm": 0.02268008515238762, + "learning_rate": 9.91163435106201e-05, + "loss": 0.014965626411139965, + "num_input_tokens_seen": 18750520, + "step": 1145, + "train_runtime": 9304.7495, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.6945454545454546, + "grad_norm": 0.02825307659804821, + "learning_rate": 9.911454271583034e-05, + "loss": 0.013202480971813202, + "num_input_tokens_seen": 18766896, + "step": 1146, + "train_runtime": 9312.8608, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.6951515151515152, + "grad_norm": 0.0277263056486845, + "learning_rate": 9.911274010438928e-05, + "loss": 0.014979338273406029, + "num_input_tokens_seen": 18783272, + "step": 1147, + "train_runtime": 9320.9729, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 0.6957575757575758, + "grad_norm": 0.03655631095170975, + "learning_rate": 9.91109356763636e-05, + "loss": 0.01276368834078312, + "num_input_tokens_seen": 18799648, + "step": 1148, + "train_runtime": 9329.0854, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.6963636363636364, + "grad_norm": 0.017650572583079338, + "learning_rate": 9.910912943182007e-05, + "loss": 0.013225570321083069, + "num_input_tokens_seen": 18816024, + "step": 1149, + "train_runtime": 9337.1951, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 0.029844503849744797, + "learning_rate": 9.910732137082547e-05, + "loss": 0.012919209897518158, + "num_input_tokens_seen": 18832400, + "step": 1150, + "train_runtime": 9345.3036, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 0.6975757575757576, + "grad_norm": 0.022128146141767502, + "learning_rate": 9.910551149344669e-05, + "loss": 0.013780666515231133, + "num_input_tokens_seen": 18848776, + "step": 1151, + "train_runtime": 9353.4141, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 0.6981818181818182, + "grad_norm": 0.02025616727769375, + "learning_rate": 9.910369979975065e-05, + "loss": 0.014601497910916805, + "num_input_tokens_seen": 18865152, + "step": 1152, + "train_runtime": 9361.5308, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.6987878787878787, + "grad_norm": 0.01940023899078369, + "learning_rate": 9.910188628980439e-05, + "loss": 0.01339776162058115, + "num_input_tokens_seen": 18881528, + "step": 1153, + "train_runtime": 9369.6441, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 0.6993939393939393, + "grad_norm": 0.022027693688869476, + "learning_rate": 9.910007096367497e-05, + "loss": 0.01376222725957632, + "num_input_tokens_seen": 18897904, + "step": 1154, + "train_runtime": 9377.7542, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 0.7, + "grad_norm": 0.006554140709340572, + "learning_rate": 9.909825382142955e-05, + "loss": 0.012087719514966011, + "num_input_tokens_seen": 18914280, + "step": 1155, + "train_runtime": 9385.8634, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 0.7006060606060606, + "grad_norm": 0.011244562454521656, + "learning_rate": 9.909643486313533e-05, + "loss": 0.011743160896003246, + "num_input_tokens_seen": 18930656, + "step": 1156, + "train_runtime": 9393.9756, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 0.7012121212121212, + "grad_norm": 0.015718987211585045, + "learning_rate": 9.909461408885961e-05, + "loss": 0.015649257227778435, + "num_input_tokens_seen": 18947032, + "step": 1157, + "train_runtime": 9402.0879, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 0.7018181818181818, + "grad_norm": 0.014524322003126144, + "learning_rate": 9.909279149866971e-05, + "loss": 0.012584694661200047, + "num_input_tokens_seen": 18963408, + "step": 1158, + "train_runtime": 9410.1978, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 0.7024242424242424, + "grad_norm": 0.01179551426321268, + "learning_rate": 9.909096709263305e-05, + "loss": 0.01177270244807005, + "num_input_tokens_seen": 18979784, + "step": 1159, + "train_runtime": 9418.3067, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 0.703030303030303, + "grad_norm": 0.3294766843318939, + "learning_rate": 9.908914087081714e-05, + "loss": 0.013622680678963661, + "num_input_tokens_seen": 18996160, + "step": 1160, + "train_runtime": 9426.418, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 0.7036363636363636, + "grad_norm": 0.019340241327881813, + "learning_rate": 9.908731283328949e-05, + "loss": 0.013781043700873852, + "num_input_tokens_seen": 19012536, + "step": 1161, + "train_runtime": 9434.5376, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 0.7042424242424242, + "grad_norm": 0.31950604915618896, + "learning_rate": 9.908548298011774e-05, + "loss": 0.013624520972371101, + "num_input_tokens_seen": 19028912, + "step": 1162, + "train_runtime": 9442.6474, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 0.7048484848484848, + "grad_norm": 0.01044798456132412, + "learning_rate": 9.908365131136957e-05, + "loss": 0.013481276109814644, + "num_input_tokens_seen": 19045288, + "step": 1163, + "train_runtime": 9450.7603, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 0.7054545454545454, + "grad_norm": 0.08119679987430573, + "learning_rate": 9.90818178271127e-05, + "loss": 0.01282893493771553, + "num_input_tokens_seen": 19061664, + "step": 1164, + "train_runtime": 9458.8734, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 0.706060606060606, + "grad_norm": 0.013537311926484108, + "learning_rate": 9.907998252741498e-05, + "loss": 0.013240115717053413, + "num_input_tokens_seen": 19078040, + "step": 1165, + "train_runtime": 9466.9849, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.015183590352535248, + "learning_rate": 9.907814541234429e-05, + "loss": 0.01356966607272625, + "num_input_tokens_seen": 19094416, + "step": 1166, + "train_runtime": 9475.0931, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 0.7072727272727273, + "grad_norm": 0.01905563659965992, + "learning_rate": 9.907630648196857e-05, + "loss": 0.011865122243762016, + "num_input_tokens_seen": 19110792, + "step": 1167, + "train_runtime": 9483.2064, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 0.7078787878787879, + "grad_norm": 0.01771489344537258, + "learning_rate": 9.907446573635586e-05, + "loss": 0.014323254115879536, + "num_input_tokens_seen": 19127168, + "step": 1168, + "train_runtime": 9491.3179, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 0.7084848484848485, + "grad_norm": 0.013392560184001923, + "learning_rate": 9.907262317557422e-05, + "loss": 0.014154933393001556, + "num_input_tokens_seen": 19143544, + "step": 1169, + "train_runtime": 9499.4298, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 0.7090909090909091, + "grad_norm": 0.01917138509452343, + "learning_rate": 9.907077879969182e-05, + "loss": 0.014620376750826836, + "num_input_tokens_seen": 19159920, + "step": 1170, + "train_runtime": 9507.5424, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 0.7096969696969697, + "grad_norm": 0.023388303816318512, + "learning_rate": 9.906893260877686e-05, + "loss": 0.013931838795542717, + "num_input_tokens_seen": 19176296, + "step": 1171, + "train_runtime": 9515.6549, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 0.7103030303030303, + "grad_norm": 0.014943883754312992, + "learning_rate": 9.906708460289765e-05, + "loss": 0.012756659649312496, + "num_input_tokens_seen": 19192672, + "step": 1172, + "train_runtime": 9523.7631, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 0.7109090909090909, + "grad_norm": 0.011030408553779125, + "learning_rate": 9.906523478212252e-05, + "loss": 0.01190275140106678, + "num_input_tokens_seen": 19209048, + "step": 1173, + "train_runtime": 9531.8735, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 0.7115151515151515, + "grad_norm": 0.008161013014614582, + "learning_rate": 9.906338314651993e-05, + "loss": 0.012577732093632221, + "num_input_tokens_seen": 19225424, + "step": 1174, + "train_runtime": 9539.9859, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 0.7121212121212122, + "grad_norm": 0.02119288221001625, + "learning_rate": 9.906152969615833e-05, + "loss": 0.012449773959815502, + "num_input_tokens_seen": 19241800, + "step": 1175, + "train_runtime": 9548.0982, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 0.7127272727272728, + "grad_norm": 0.017582163214683533, + "learning_rate": 9.90596744311063e-05, + "loss": 0.011529134586453438, + "num_input_tokens_seen": 19258176, + "step": 1176, + "train_runtime": 9556.2087, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 0.7133333333333334, + "grad_norm": 0.04412311315536499, + "learning_rate": 9.905781735143245e-05, + "loss": 0.014292292296886444, + "num_input_tokens_seen": 19274552, + "step": 1177, + "train_runtime": 9564.3204, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 0.713939393939394, + "grad_norm": 0.07766410708427429, + "learning_rate": 9.905595845720545e-05, + "loss": 0.011792981065809727, + "num_input_tokens_seen": 19290928, + "step": 1178, + "train_runtime": 9572.4335, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 0.7145454545454546, + "grad_norm": 0.020279264077544212, + "learning_rate": 9.90540977484941e-05, + "loss": 0.014193961396813393, + "num_input_tokens_seen": 19307304, + "step": 1179, + "train_runtime": 9580.5444, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 0.7151515151515152, + "grad_norm": 0.023957345634698868, + "learning_rate": 9.905223522536719e-05, + "loss": 0.01391246635466814, + "num_input_tokens_seen": 19323680, + "step": 1180, + "train_runtime": 9588.6548, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 0.7157575757575757, + "grad_norm": 0.02165958844125271, + "learning_rate": 9.905037088789363e-05, + "loss": 0.014714146964251995, + "num_input_tokens_seen": 19340056, + "step": 1181, + "train_runtime": 9596.7692, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 0.7163636363636363, + "grad_norm": 0.014883043244481087, + "learning_rate": 9.904850473614237e-05, + "loss": 0.013630779460072517, + "num_input_tokens_seen": 19356432, + "step": 1182, + "train_runtime": 9604.8799, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 0.7169696969696969, + "grad_norm": 0.012120597995817661, + "learning_rate": 9.904663677018245e-05, + "loss": 0.013401714153587818, + "num_input_tokens_seen": 19372808, + "step": 1183, + "train_runtime": 9612.9913, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 0.7175757575757575, + "grad_norm": 0.024704404175281525, + "learning_rate": 9.904476699008293e-05, + "loss": 0.015781283378601074, + "num_input_tokens_seen": 19389184, + "step": 1184, + "train_runtime": 9621.1054, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.7181818181818181, + "grad_norm": 0.015950346365571022, + "learning_rate": 9.9042895395913e-05, + "loss": 0.012905421666800976, + "num_input_tokens_seen": 19405560, + "step": 1185, + "train_runtime": 9629.2186, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 0.7187878787878788, + "grad_norm": 0.021412916481494904, + "learning_rate": 9.904102198774188e-05, + "loss": 0.012717105448246002, + "num_input_tokens_seen": 19421936, + "step": 1186, + "train_runtime": 9637.3311, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 0.7193939393939394, + "grad_norm": 0.024673737585544586, + "learning_rate": 9.903914676563885e-05, + "loss": 0.012580260634422302, + "num_input_tokens_seen": 19438312, + "step": 1187, + "train_runtime": 9645.4427, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 0.72, + "grad_norm": 0.07743503898382187, + "learning_rate": 9.90372697296733e-05, + "loss": 0.013859845697879791, + "num_input_tokens_seen": 19454688, + "step": 1188, + "train_runtime": 9653.5584, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 0.7206060606060606, + "grad_norm": 0.014397671446204185, + "learning_rate": 9.903539087991462e-05, + "loss": 0.013244936242699623, + "num_input_tokens_seen": 19471064, + "step": 1189, + "train_runtime": 9661.6716, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 0.7212121212121212, + "grad_norm": 0.027382057160139084, + "learning_rate": 9.903351021643233e-05, + "loss": 0.014433873817324638, + "num_input_tokens_seen": 19487440, + "step": 1190, + "train_runtime": 9669.7828, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 0.7218181818181818, + "grad_norm": 0.013371971435844898, + "learning_rate": 9.903162773929599e-05, + "loss": 0.014319634065032005, + "num_input_tokens_seen": 19503816, + "step": 1191, + "train_runtime": 9677.8954, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 0.7224242424242424, + "grad_norm": 0.02415373921394348, + "learning_rate": 9.902974344857521e-05, + "loss": 0.01522553525865078, + "num_input_tokens_seen": 19520192, + "step": 1192, + "train_runtime": 9686.0046, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 0.723030303030303, + "grad_norm": 0.013075731694698334, + "learning_rate": 9.902785734433971e-05, + "loss": 0.012145644053816795, + "num_input_tokens_seen": 19536568, + "step": 1193, + "train_runtime": 9694.1175, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 0.7236363636363636, + "grad_norm": 0.02217678166925907, + "learning_rate": 9.902596942665925e-05, + "loss": 0.013490047305822372, + "num_input_tokens_seen": 19552944, + "step": 1194, + "train_runtime": 9702.2306, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 0.7242424242424242, + "grad_norm": 0.014989197254180908, + "learning_rate": 9.902407969560364e-05, + "loss": 0.015374877490103245, + "num_input_tokens_seen": 19569320, + "step": 1195, + "train_runtime": 9710.3384, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 0.7248484848484849, + "grad_norm": 0.010880461893975735, + "learning_rate": 9.90221881512428e-05, + "loss": 0.010911534540355206, + "num_input_tokens_seen": 19585696, + "step": 1196, + "train_runtime": 9718.4475, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.7254545454545455, + "grad_norm": 0.0177223589271307, + "learning_rate": 9.90202947936467e-05, + "loss": 0.01328328251838684, + "num_input_tokens_seen": 19602072, + "step": 1197, + "train_runtime": 9726.5574, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 0.7260606060606061, + "grad_norm": 0.015080858021974564, + "learning_rate": 9.901839962288533e-05, + "loss": 0.013248666189610958, + "num_input_tokens_seen": 19618448, + "step": 1198, + "train_runtime": 9734.6668, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 0.7266666666666667, + "grad_norm": 0.01892446167767048, + "learning_rate": 9.901650263902884e-05, + "loss": 0.012533879838883877, + "num_input_tokens_seen": 19634824, + "step": 1199, + "train_runtime": 9742.776, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.0085715651512146, + "learning_rate": 9.901460384214736e-05, + "loss": 0.011274173855781555, + "num_input_tokens_seen": 19651200, + "step": 1200, + "train_runtime": 9750.8874, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 0.7278787878787879, + "grad_norm": 0.030662082135677338, + "learning_rate": 9.901270323231115e-05, + "loss": 0.012586663477122784, + "num_input_tokens_seen": 19667576, + "step": 1201, + "train_runtime": 9759.9377, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 0.7284848484848485, + "grad_norm": 0.012625769712030888, + "learning_rate": 9.901080080959048e-05, + "loss": 0.013224436901509762, + "num_input_tokens_seen": 19683952, + "step": 1202, + "train_runtime": 9768.0467, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 0.7290909090909091, + "grad_norm": 0.012317335233092308, + "learning_rate": 9.900889657405573e-05, + "loss": 0.012883040122687817, + "num_input_tokens_seen": 19700328, + "step": 1203, + "train_runtime": 9776.155, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.7296969696969697, + "grad_norm": 0.012403651140630245, + "learning_rate": 9.900699052577736e-05, + "loss": 0.012290080077946186, + "num_input_tokens_seen": 19716704, + "step": 1204, + "train_runtime": 9784.2649, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 0.7303030303030303, + "grad_norm": 0.01588149555027485, + "learning_rate": 9.900508266482582e-05, + "loss": 0.011603264138102531, + "num_input_tokens_seen": 19733080, + "step": 1205, + "train_runtime": 9792.3778, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 0.730909090909091, + "grad_norm": 0.014620691537857056, + "learning_rate": 9.900317299127171e-05, + "loss": 0.012423778884112835, + "num_input_tokens_seen": 19749456, + "step": 1206, + "train_runtime": 9800.4881, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 0.7315151515151516, + "grad_norm": 0.012740055099129677, + "learning_rate": 9.900126150518567e-05, + "loss": 0.013299481943249702, + "num_input_tokens_seen": 19765832, + "step": 1207, + "train_runtime": 9808.599, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 0.7321212121212122, + "grad_norm": 0.015813497826457024, + "learning_rate": 9.899934820663839e-05, + "loss": 0.014216665178537369, + "num_input_tokens_seen": 19782208, + "step": 1208, + "train_runtime": 9816.7097, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.7327272727272728, + "grad_norm": 0.023462215438485146, + "learning_rate": 9.899743309570065e-05, + "loss": 0.014444109052419662, + "num_input_tokens_seen": 19798584, + "step": 1209, + "train_runtime": 9824.8204, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.016535522416234016, + "learning_rate": 9.899551617244326e-05, + "loss": 0.012044892646372318, + "num_input_tokens_seen": 19814960, + "step": 1210, + "train_runtime": 9832.9302, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.7339393939393939, + "grad_norm": 0.01581740379333496, + "learning_rate": 9.899359743693714e-05, + "loss": 0.014411653392016888, + "num_input_tokens_seen": 19831336, + "step": 1211, + "train_runtime": 9841.0417, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.7345454545454545, + "grad_norm": 0.01694261096417904, + "learning_rate": 9.899167688925328e-05, + "loss": 0.01339998934417963, + "num_input_tokens_seen": 19847712, + "step": 1212, + "train_runtime": 9849.1549, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 0.7351515151515151, + "grad_norm": 0.011397319845855236, + "learning_rate": 9.898975452946268e-05, + "loss": 0.013992566615343094, + "num_input_tokens_seen": 19864088, + "step": 1213, + "train_runtime": 9857.2628, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 0.7357575757575757, + "grad_norm": 0.009932632558047771, + "learning_rate": 9.898783035763648e-05, + "loss": 0.013121276162564754, + "num_input_tokens_seen": 19880464, + "step": 1214, + "train_runtime": 9865.3743, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 0.7363636363636363, + "grad_norm": 0.039875004440546036, + "learning_rate": 9.898590437384583e-05, + "loss": 0.013154653832316399, + "num_input_tokens_seen": 19896840, + "step": 1215, + "train_runtime": 9873.4892, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.7369696969696969, + "grad_norm": 0.014247733168303967, + "learning_rate": 9.898397657816198e-05, + "loss": 0.012165211141109467, + "num_input_tokens_seen": 19913216, + "step": 1216, + "train_runtime": 9881.6008, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 0.7375757575757576, + "grad_norm": 0.020671000704169273, + "learning_rate": 9.89820469706562e-05, + "loss": 0.012851119041442871, + "num_input_tokens_seen": 19929592, + "step": 1217, + "train_runtime": 9889.711, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 0.7381818181818182, + "grad_norm": 0.01268229354172945, + "learning_rate": 9.898011555139991e-05, + "loss": 0.011670916341245174, + "num_input_tokens_seen": 19945968, + "step": 1218, + "train_runtime": 9897.8448, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 0.7387878787878788, + "grad_norm": 0.014971123076975346, + "learning_rate": 9.897818232046454e-05, + "loss": 0.012817314825952053, + "num_input_tokens_seen": 19962344, + "step": 1219, + "train_runtime": 9905.9579, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 0.7393939393939394, + "grad_norm": 0.03158552944660187, + "learning_rate": 9.897624727792159e-05, + "loss": 0.01493182685226202, + "num_input_tokens_seen": 19978720, + "step": 1220, + "train_runtime": 9914.0699, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 0.74, + "grad_norm": 0.013837055303156376, + "learning_rate": 9.897431042384261e-05, + "loss": 0.01410394161939621, + "num_input_tokens_seen": 19995096, + "step": 1221, + "train_runtime": 9922.1811, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 0.7406060606060606, + "grad_norm": 0.02035367488861084, + "learning_rate": 9.897237175829926e-05, + "loss": 0.014466963708400726, + "num_input_tokens_seen": 20011472, + "step": 1222, + "train_runtime": 9930.2933, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 0.7412121212121212, + "grad_norm": 0.03811359778046608, + "learning_rate": 9.897043128136325e-05, + "loss": 0.013205880299210548, + "num_input_tokens_seen": 20027848, + "step": 1223, + "train_runtime": 9938.4061, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 0.7418181818181818, + "grad_norm": 0.018652835860848427, + "learning_rate": 9.896848899310636e-05, + "loss": 0.013042958453297615, + "num_input_tokens_seen": 20044224, + "step": 1224, + "train_runtime": 9946.5152, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 0.7424242424242424, + "grad_norm": 0.017733843997120857, + "learning_rate": 9.896654489360042e-05, + "loss": 0.012684517540037632, + "num_input_tokens_seen": 20060600, + "step": 1225, + "train_runtime": 9954.6306, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 0.743030303030303, + "grad_norm": 0.010155964642763138, + "learning_rate": 9.896459898291734e-05, + "loss": 0.011605635285377502, + "num_input_tokens_seen": 20076976, + "step": 1226, + "train_runtime": 9962.74, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 0.7436363636363637, + "grad_norm": 0.02421714924275875, + "learning_rate": 9.896265126112911e-05, + "loss": 0.015139145776629448, + "num_input_tokens_seen": 20093352, + "step": 1227, + "train_runtime": 9970.8484, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 0.7442424242424243, + "grad_norm": 0.02827371098101139, + "learning_rate": 9.896070172830776e-05, + "loss": 0.013175873085856438, + "num_input_tokens_seen": 20109728, + "step": 1228, + "train_runtime": 9978.9575, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 0.7448484848484849, + "grad_norm": 0.012187021784484386, + "learning_rate": 9.895875038452539e-05, + "loss": 0.013465436175465584, + "num_input_tokens_seen": 20126104, + "step": 1229, + "train_runtime": 9987.0668, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 0.7454545454545455, + "grad_norm": 0.011740162037312984, + "learning_rate": 9.895679722985419e-05, + "loss": 0.013261547312140465, + "num_input_tokens_seen": 20142480, + "step": 1230, + "train_runtime": 9995.1753, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 0.7460606060606061, + "grad_norm": 0.02706027776002884, + "learning_rate": 9.89548422643664e-05, + "loss": 0.013440998271107674, + "num_input_tokens_seen": 20158856, + "step": 1231, + "train_runtime": 10003.284, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.05222317576408386, + "learning_rate": 9.895288548813432e-05, + "loss": 0.014066273346543312, + "num_input_tokens_seen": 20175232, + "step": 1232, + "train_runtime": 10011.3904, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 0.7472727272727273, + "grad_norm": 0.011138387955725193, + "learning_rate": 9.895092690123035e-05, + "loss": 0.012343725189566612, + "num_input_tokens_seen": 20191608, + "step": 1233, + "train_runtime": 10019.4992, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 0.7478787878787879, + "grad_norm": 0.019493183121085167, + "learning_rate": 9.894896650372692e-05, + "loss": 0.014319119974970818, + "num_input_tokens_seen": 20207984, + "step": 1234, + "train_runtime": 10027.6078, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 0.7484848484848485, + "grad_norm": 0.010399113409221172, + "learning_rate": 9.894700429569653e-05, + "loss": 0.013344192877411842, + "num_input_tokens_seen": 20224360, + "step": 1235, + "train_runtime": 10035.7155, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 0.7490909090909091, + "grad_norm": 0.013207124546170235, + "learning_rate": 9.894504027721179e-05, + "loss": 0.012579311616718769, + "num_input_tokens_seen": 20240736, + "step": 1236, + "train_runtime": 10043.8304, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 0.7496969696969698, + "grad_norm": 0.007676406297832727, + "learning_rate": 9.89430744483453e-05, + "loss": 0.012105286121368408, + "num_input_tokens_seen": 20257112, + "step": 1237, + "train_runtime": 10051.9411, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 0.7503030303030302, + "grad_norm": 0.26611316204071045, + "learning_rate": 9.894110680916981e-05, + "loss": 0.012751906178891659, + "num_input_tokens_seen": 20273488, + "step": 1238, + "train_runtime": 10060.0502, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 0.7509090909090909, + "grad_norm": 0.016328565776348114, + "learning_rate": 9.89391373597581e-05, + "loss": 0.013627522625029087, + "num_input_tokens_seen": 20289864, + "step": 1239, + "train_runtime": 10068.1581, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 0.7515151515151515, + "grad_norm": 0.018115442246198654, + "learning_rate": 9.8937166100183e-05, + "loss": 0.014619875699281693, + "num_input_tokens_seen": 20306240, + "step": 1240, + "train_runtime": 10076.2632, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 0.7521212121212121, + "grad_norm": 0.047225791960954666, + "learning_rate": 9.893519303051742e-05, + "loss": 0.012407291680574417, + "num_input_tokens_seen": 20322616, + "step": 1241, + "train_runtime": 10084.3737, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 0.7527272727272727, + "grad_norm": 0.00958853680640459, + "learning_rate": 9.893321815083435e-05, + "loss": 0.012367008253932, + "num_input_tokens_seen": 20338992, + "step": 1242, + "train_runtime": 10092.4834, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 0.7533333333333333, + "grad_norm": 0.01551489531993866, + "learning_rate": 9.893124146120684e-05, + "loss": 0.011828011833131313, + "num_input_tokens_seen": 20355368, + "step": 1243, + "train_runtime": 10100.5915, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 0.7539393939393939, + "grad_norm": 0.015479539521038532, + "learning_rate": 9.892926296170799e-05, + "loss": 0.013003758154809475, + "num_input_tokens_seen": 20371744, + "step": 1244, + "train_runtime": 10108.6986, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 0.7545454545454545, + "grad_norm": 0.018905159085989, + "learning_rate": 9.892728265241098e-05, + "loss": 0.013263228349387646, + "num_input_tokens_seen": 20388120, + "step": 1245, + "train_runtime": 10116.8092, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 0.7551515151515151, + "grad_norm": 0.02863249182701111, + "learning_rate": 9.892530053338909e-05, + "loss": 0.0130619453266263, + "num_input_tokens_seen": 20404496, + "step": 1246, + "train_runtime": 10124.9156, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.7557575757575757, + "grad_norm": 0.016296787187457085, + "learning_rate": 9.892331660471559e-05, + "loss": 0.012045785784721375, + "num_input_tokens_seen": 20420872, + "step": 1247, + "train_runtime": 10133.0202, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 0.7563636363636363, + "grad_norm": 0.016199452802538872, + "learning_rate": 9.892133086646389e-05, + "loss": 0.012048415839672089, + "num_input_tokens_seen": 20437248, + "step": 1248, + "train_runtime": 10141.1305, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 0.756969696969697, + "grad_norm": 0.012741641141474247, + "learning_rate": 9.891934331870743e-05, + "loss": 0.01335767563432455, + "num_input_tokens_seen": 20453624, + "step": 1249, + "train_runtime": 10149.2473, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.03929731622338295, + "learning_rate": 9.891735396151972e-05, + "loss": 0.01206697802990675, + "num_input_tokens_seen": 20470000, + "step": 1250, + "train_runtime": 10157.3657, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 0.7581818181818182, + "grad_norm": 0.007868324406445026, + "learning_rate": 9.891536279497436e-05, + "loss": 0.011791637167334557, + "num_input_tokens_seen": 20486376, + "step": 1251, + "train_runtime": 10165.4828, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 0.7587878787878788, + "grad_norm": 0.013859824277460575, + "learning_rate": 9.891336981914499e-05, + "loss": 0.014204591512680054, + "num_input_tokens_seen": 20502752, + "step": 1252, + "train_runtime": 10173.6015, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 0.7593939393939394, + "grad_norm": 0.03682630881667137, + "learning_rate": 9.891137503410531e-05, + "loss": 0.01157104317098856, + "num_input_tokens_seen": 20519128, + "step": 1253, + "train_runtime": 10181.7191, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 0.76, + "grad_norm": 0.015358424745500088, + "learning_rate": 9.890937843992913e-05, + "loss": 0.013172848150134087, + "num_input_tokens_seen": 20535504, + "step": 1254, + "train_runtime": 10189.8374, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 0.7606060606060606, + "grad_norm": 0.01969468779861927, + "learning_rate": 9.890738003669029e-05, + "loss": 0.013599451631307602, + "num_input_tokens_seen": 20551880, + "step": 1255, + "train_runtime": 10197.9553, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 0.7612121212121212, + "grad_norm": 0.01678163930773735, + "learning_rate": 9.89053798244627e-05, + "loss": 0.013114574365317822, + "num_input_tokens_seen": 20568256, + "step": 1256, + "train_runtime": 10206.0749, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 0.7618181818181818, + "grad_norm": 0.0193489920347929, + "learning_rate": 9.890337780332035e-05, + "loss": 0.011934047564864159, + "num_input_tokens_seen": 20584632, + "step": 1257, + "train_runtime": 10214.1928, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 0.7624242424242424, + "grad_norm": 0.011665060184895992, + "learning_rate": 9.890137397333729e-05, + "loss": 0.012188711203634739, + "num_input_tokens_seen": 20601008, + "step": 1258, + "train_runtime": 10222.3099, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 0.7630303030303031, + "grad_norm": 0.005775026045739651, + "learning_rate": 9.889936833458763e-05, + "loss": 0.011419412679970264, + "num_input_tokens_seen": 20617384, + "step": 1259, + "train_runtime": 10230.4305, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 0.023811450228095055, + "learning_rate": 9.889736088714558e-05, + "loss": 0.01227609720081091, + "num_input_tokens_seen": 20633760, + "step": 1260, + "train_runtime": 10238.5502, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 0.7642424242424243, + "grad_norm": 0.0233469195663929, + "learning_rate": 9.889535163108537e-05, + "loss": 0.012738242745399475, + "num_input_tokens_seen": 20650136, + "step": 1261, + "train_runtime": 10246.6696, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 0.7648484848484849, + "grad_norm": 0.01263290736824274, + "learning_rate": 9.889334056648131e-05, + "loss": 0.01269836351275444, + "num_input_tokens_seen": 20666512, + "step": 1262, + "train_runtime": 10254.7884, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 0.7654545454545455, + "grad_norm": 0.014581980183720589, + "learning_rate": 9.889132769340781e-05, + "loss": 0.013540278188884258, + "num_input_tokens_seen": 20682888, + "step": 1263, + "train_runtime": 10262.9085, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 0.7660606060606061, + "grad_norm": 0.014391904696822166, + "learning_rate": 9.88893130119393e-05, + "loss": 0.012283596210181713, + "num_input_tokens_seen": 20699264, + "step": 1264, + "train_runtime": 10271.0298, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 0.015524503774940968, + "learning_rate": 9.888729652215032e-05, + "loss": 0.012001638300716877, + "num_input_tokens_seen": 20715640, + "step": 1265, + "train_runtime": 10279.149, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 0.7672727272727272, + "grad_norm": 0.011605373583734035, + "learning_rate": 9.888527822411543e-05, + "loss": 0.012554067187011242, + "num_input_tokens_seen": 20732016, + "step": 1266, + "train_runtime": 10287.2682, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 0.7678787878787878, + "grad_norm": 0.017037956044077873, + "learning_rate": 9.888325811790931e-05, + "loss": 0.013448834419250488, + "num_input_tokens_seen": 20748392, + "step": 1267, + "train_runtime": 10295.3877, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 0.7684848484848484, + "grad_norm": 0.028556402772665024, + "learning_rate": 9.888123620360666e-05, + "loss": 0.012878211215138435, + "num_input_tokens_seen": 20764768, + "step": 1268, + "train_runtime": 10303.5056, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.769090909090909, + "grad_norm": 0.014958829618990421, + "learning_rate": 9.887921248128228e-05, + "loss": 0.013986572623252869, + "num_input_tokens_seen": 20781144, + "step": 1269, + "train_runtime": 10311.6315, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.7696969696969697, + "grad_norm": 0.021999262273311615, + "learning_rate": 9.887718695101102e-05, + "loss": 0.01611473597586155, + "num_input_tokens_seen": 20797520, + "step": 1270, + "train_runtime": 10319.7512, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 0.7703030303030303, + "grad_norm": 0.01434963196516037, + "learning_rate": 9.88751596128678e-05, + "loss": 0.012239954434335232, + "num_input_tokens_seen": 20813896, + "step": 1271, + "train_runtime": 10327.8713, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 0.7709090909090909, + "grad_norm": 0.02051941119134426, + "learning_rate": 9.887313046692761e-05, + "loss": 0.013740262016654015, + "num_input_tokens_seen": 20830272, + "step": 1272, + "train_runtime": 10335.9914, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 0.7715151515151515, + "grad_norm": 0.00836126133799553, + "learning_rate": 9.88710995132655e-05, + "loss": 0.011003411374986172, + "num_input_tokens_seen": 20846648, + "step": 1273, + "train_runtime": 10344.1113, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 0.7721212121212121, + "grad_norm": 0.009217855520546436, + "learning_rate": 9.886906675195657e-05, + "loss": 0.012320063076913357, + "num_input_tokens_seen": 20863024, + "step": 1274, + "train_runtime": 10352.2308, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 0.7727272727272727, + "grad_norm": 0.00831685308367014, + "learning_rate": 9.886703218307604e-05, + "loss": 0.013156922534108162, + "num_input_tokens_seen": 20879400, + "step": 1275, + "train_runtime": 10360.351, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.01840154640376568, + "learning_rate": 9.886499580669917e-05, + "loss": 0.01196813490241766, + "num_input_tokens_seen": 20895776, + "step": 1276, + "train_runtime": 10368.4707, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 0.7739393939393939, + "grad_norm": 0.016405558213591576, + "learning_rate": 9.886295762290125e-05, + "loss": 0.013263520784676075, + "num_input_tokens_seen": 20912152, + "step": 1277, + "train_runtime": 10376.5894, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 0.7745454545454545, + "grad_norm": 0.017034931108355522, + "learning_rate": 9.886091763175769e-05, + "loss": 0.013993248343467712, + "num_input_tokens_seen": 20928528, + "step": 1278, + "train_runtime": 10384.7083, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 0.7751515151515151, + "grad_norm": 0.03572826832532883, + "learning_rate": 9.885887583334393e-05, + "loss": 0.012332772836089134, + "num_input_tokens_seen": 20944904, + "step": 1279, + "train_runtime": 10392.8323, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 0.02001163735985756, + "learning_rate": 9.885683222773551e-05, + "loss": 0.012113104574382305, + "num_input_tokens_seen": 20961280, + "step": 1280, + "train_runtime": 10400.9501, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 0.7763636363636364, + "grad_norm": 0.04807475954294205, + "learning_rate": 9.8854786815008e-05, + "loss": 0.011850223876535892, + "num_input_tokens_seen": 20977656, + "step": 1281, + "train_runtime": 10409.0709, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 0.776969696969697, + "grad_norm": 0.007331644184887409, + "learning_rate": 9.885273959523707e-05, + "loss": 0.011687932536005974, + "num_input_tokens_seen": 20994032, + "step": 1282, + "train_runtime": 10417.1889, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 0.7775757575757576, + "grad_norm": 0.013896801508963108, + "learning_rate": 9.885069056849845e-05, + "loss": 0.01239155326038599, + "num_input_tokens_seen": 21010408, + "step": 1283, + "train_runtime": 10425.3081, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 0.7781818181818182, + "grad_norm": 0.009068959392607212, + "learning_rate": 9.88486397348679e-05, + "loss": 0.01141006126999855, + "num_input_tokens_seen": 21026784, + "step": 1284, + "train_runtime": 10433.4338, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 0.7787878787878788, + "grad_norm": 0.01311533898115158, + "learning_rate": 9.884658709442132e-05, + "loss": 0.011742614209651947, + "num_input_tokens_seen": 21043160, + "step": 1285, + "train_runtime": 10441.5524, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 0.7793939393939394, + "grad_norm": 0.01562919095158577, + "learning_rate": 9.884453264723459e-05, + "loss": 0.012607906013727188, + "num_input_tokens_seen": 21059536, + "step": 1286, + "train_runtime": 10449.6697, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 0.78, + "grad_norm": 0.017651278525590897, + "learning_rate": 9.884247639338373e-05, + "loss": 0.01244867779314518, + "num_input_tokens_seen": 21075912, + "step": 1287, + "train_runtime": 10457.7881, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 0.7806060606060606, + "grad_norm": 0.020198311656713486, + "learning_rate": 9.884041833294476e-05, + "loss": 0.013492776080965996, + "num_input_tokens_seen": 21092288, + "step": 1288, + "train_runtime": 10465.9062, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 0.7812121212121212, + "grad_norm": 0.009970282204449177, + "learning_rate": 9.883835846599386e-05, + "loss": 0.013857762329280376, + "num_input_tokens_seen": 21108664, + "step": 1289, + "train_runtime": 10474.1537, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 0.7818181818181819, + "grad_norm": 0.011340651661157608, + "learning_rate": 9.883629679260715e-05, + "loss": 0.011344236321747303, + "num_input_tokens_seen": 21125040, + "step": 1290, + "train_runtime": 10482.2705, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.7824242424242425, + "grad_norm": 0.03793201595544815, + "learning_rate": 9.883423331286096e-05, + "loss": 0.015287358313798904, + "num_input_tokens_seen": 21141416, + "step": 1291, + "train_runtime": 10490.3857, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 0.7830303030303031, + "grad_norm": 0.02402154542505741, + "learning_rate": 9.883216802683158e-05, + "loss": 0.013735389336943626, + "num_input_tokens_seen": 21157792, + "step": 1292, + "train_runtime": 10498.5012, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 0.7836363636363637, + "grad_norm": 0.016549425199627876, + "learning_rate": 9.883010093459537e-05, + "loss": 0.01311381347477436, + "num_input_tokens_seen": 21174168, + "step": 1293, + "train_runtime": 10506.618, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 0.7842424242424243, + "grad_norm": 0.0236363522708416, + "learning_rate": 9.882803203622884e-05, + "loss": 0.01185927726328373, + "num_input_tokens_seen": 21190544, + "step": 1294, + "train_runtime": 10514.7333, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 0.7848484848484848, + "grad_norm": 0.015482014045119286, + "learning_rate": 9.882596133180849e-05, + "loss": 0.012073281221091747, + "num_input_tokens_seen": 21206920, + "step": 1295, + "train_runtime": 10522.8502, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 0.7854545454545454, + "grad_norm": 0.01528620719909668, + "learning_rate": 9.882388882141092e-05, + "loss": 0.012514740228652954, + "num_input_tokens_seen": 21223296, + "step": 1296, + "train_runtime": 10530.9675, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 0.786060606060606, + "grad_norm": 0.01590045541524887, + "learning_rate": 9.882181450511278e-05, + "loss": 0.014040066860616207, + "num_input_tokens_seen": 21239672, + "step": 1297, + "train_runtime": 10539.085, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.026240071281790733, + "learning_rate": 9.88197383829908e-05, + "loss": 0.012822052463889122, + "num_input_tokens_seen": 21256048, + "step": 1298, + "train_runtime": 10547.2019, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 0.7872727272727272, + "grad_norm": 0.014810437336564064, + "learning_rate": 9.881766045512176e-05, + "loss": 0.01398603618144989, + "num_input_tokens_seen": 21272424, + "step": 1299, + "train_runtime": 10555.3212, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 0.0264164749532938, + "learning_rate": 9.881558072158252e-05, + "loss": 0.012693504802882671, + "num_input_tokens_seen": 21288800, + "step": 1300, + "train_runtime": 10563.4399, + "train_tokens_per_second": 2015.328 + } + ], + "logging_steps": 1, + "max_steps": 16500, + "num_input_tokens_seen": 21288800, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 9.778839274733568e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}