diff --git "a/v127rc_exp2/B_rep/checkpoint-8700/trainer_state.json" "b/v127rc_exp2/B_rep/checkpoint-8700/trainer_state.json" new file mode 100644--- /dev/null +++ "b/v127rc_exp2/B_rep/checkpoint-8700/trainer_state.json" @@ -0,0 +1,87034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 5.2727272727272725, + "eval_steps": 500, + "global_step": 8700, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006060606060606061, + "grad_norm": 0.35123783349990845, + "learning_rate": 0.0, + "loss": 1.6639432907104492, + "num_input_tokens_seen": 16376, + "step": 1, + "train_runtime": 9.7703, + "train_tokens_per_second": 1676.104 + }, + { + "epoch": 0.0012121212121212121, + "grad_norm": 0.39342227578163147, + "learning_rate": 6.060606060606061e-07, + "loss": 1.6057767868041992, + "num_input_tokens_seen": 32752, + "step": 2, + "train_runtime": 17.8325, + "train_tokens_per_second": 1836.647 + }, + { + "epoch": 0.0018181818181818182, + "grad_norm": 0.3597555458545685, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.6560568809509277, + "num_input_tokens_seen": 49128, + "step": 3, + "train_runtime": 25.8927, + "train_tokens_per_second": 1897.372 + }, + { + "epoch": 0.0024242424242424242, + "grad_norm": 0.3463701009750366, + "learning_rate": 1.818181818181818e-06, + "loss": 1.6540638208389282, + "num_input_tokens_seen": 65504, + "step": 4, + "train_runtime": 33.9566, + "train_tokens_per_second": 1929.051 + }, + { + "epoch": 0.0030303030303030303, + "grad_norm": 0.34733158349990845, + "learning_rate": 2.4242424242424244e-06, + "loss": 1.664928913116455, + "num_input_tokens_seen": 81880, + "step": 5, + "train_runtime": 42.0394, + "train_tokens_per_second": 1947.697 + }, + { + "epoch": 0.0036363636363636364, + "grad_norm": 0.36326366662979126, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.6352522373199463, + "num_input_tokens_seen": 98256, + "step": 6, + "train_runtime": 50.1229, + "train_tokens_per_second": 1960.302 + }, + { + "epoch": 0.004242424242424243, + "grad_norm": 0.351137638092041, + "learning_rate": 3.636363636363636e-06, + "loss": 1.660022497177124, + "num_input_tokens_seen": 114632, + "step": 7, + "train_runtime": 58.2137, + "train_tokens_per_second": 1969.159 + }, + { + "epoch": 0.0048484848484848485, + "grad_norm": 0.353691428899765, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6757584810256958, + "num_input_tokens_seen": 131008, + "step": 8, + "train_runtime": 66.311, + "train_tokens_per_second": 1975.66 + }, + { + "epoch": 0.005454545454545455, + "grad_norm": 0.3630884885787964, + "learning_rate": 4.848484848484849e-06, + "loss": 1.6366666555404663, + "num_input_tokens_seen": 147384, + "step": 9, + "train_runtime": 74.4151, + "train_tokens_per_second": 1980.565 + }, + { + "epoch": 0.006060606060606061, + "grad_norm": 0.354055255651474, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.6339915990829468, + "num_input_tokens_seen": 163760, + "step": 10, + "train_runtime": 82.5209, + "train_tokens_per_second": 1984.468 + }, + { + "epoch": 0.006666666666666667, + "grad_norm": 0.3574777841567993, + "learning_rate": 6.060606060606061e-06, + "loss": 1.6360563039779663, + "num_input_tokens_seen": 180136, + "step": 11, + "train_runtime": 90.6349, + "train_tokens_per_second": 1987.491 + }, + { + "epoch": 0.007272727272727273, + "grad_norm": 0.3561362028121948, + "learning_rate": 6.666666666666667e-06, + "loss": 1.6641417741775513, + "num_input_tokens_seen": 196512, + "step": 12, + "train_runtime": 98.7492, + "train_tokens_per_second": 1990.012 + }, + { + "epoch": 0.00787878787878788, + "grad_norm": 0.3659680485725403, + "learning_rate": 7.272727272727272e-06, + "loss": 1.6375828981399536, + "num_input_tokens_seen": 212888, + "step": 13, + "train_runtime": 106.8626, + "train_tokens_per_second": 1992.165 + }, + { + "epoch": 0.008484848484848486, + "grad_norm": 0.37148839235305786, + "learning_rate": 7.878787878787878e-06, + "loss": 1.6246858835220337, + "num_input_tokens_seen": 229264, + "step": 14, + "train_runtime": 114.9785, + "train_tokens_per_second": 1993.973 + }, + { + "epoch": 0.00909090909090909, + "grad_norm": 0.38491716980934143, + "learning_rate": 8.484848484848486e-06, + "loss": 1.5969434976577759, + "num_input_tokens_seen": 245640, + "step": 15, + "train_runtime": 123.0953, + "train_tokens_per_second": 1995.526 + }, + { + "epoch": 0.009696969696969697, + "grad_norm": 0.37805187702178955, + "learning_rate": 9.090909090909091e-06, + "loss": 1.6518127918243408, + "num_input_tokens_seen": 262016, + "step": 16, + "train_runtime": 131.2123, + "train_tokens_per_second": 1996.886 + }, + { + "epoch": 0.010303030303030303, + "grad_norm": 0.3775594234466553, + "learning_rate": 9.696969696969698e-06, + "loss": 1.6409087181091309, + "num_input_tokens_seen": 278392, + "step": 17, + "train_runtime": 139.3368, + "train_tokens_per_second": 1997.979 + }, + { + "epoch": 0.01090909090909091, + "grad_norm": 0.39896833896636963, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.5989718437194824, + "num_input_tokens_seen": 294768, + "step": 18, + "train_runtime": 147.4538, + "train_tokens_per_second": 1999.054 + }, + { + "epoch": 0.011515151515151515, + "grad_norm": 0.386406272649765, + "learning_rate": 1.0909090909090909e-05, + "loss": 1.6259583234786987, + "num_input_tokens_seen": 311144, + "step": 19, + "train_runtime": 155.5716, + "train_tokens_per_second": 2000.005 + }, + { + "epoch": 0.012121212121212121, + "grad_norm": 0.3878491520881653, + "learning_rate": 1.1515151515151517e-05, + "loss": 1.5945892333984375, + "num_input_tokens_seen": 327520, + "step": 20, + "train_runtime": 163.6898, + "train_tokens_per_second": 2000.858 + }, + { + "epoch": 0.012727272727272728, + "grad_norm": 0.4080464839935303, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.5983033180236816, + "num_input_tokens_seen": 343896, + "step": 21, + "train_runtime": 171.8117, + "train_tokens_per_second": 2001.587 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.41852834820747375, + "learning_rate": 1.2727272727272727e-05, + "loss": 1.5769346952438354, + "num_input_tokens_seen": 360272, + "step": 22, + "train_runtime": 179.9341, + "train_tokens_per_second": 2002.244 + }, + { + "epoch": 0.013939393939393939, + "grad_norm": 0.4324847459793091, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.5699174404144287, + "num_input_tokens_seen": 376648, + "step": 23, + "train_runtime": 188.0563, + "train_tokens_per_second": 2002.847 + }, + { + "epoch": 0.014545454545454545, + "grad_norm": 0.4219138026237488, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.5589112043380737, + "num_input_tokens_seen": 393024, + "step": 24, + "train_runtime": 196.1758, + "train_tokens_per_second": 2003.427 + }, + { + "epoch": 0.015151515151515152, + "grad_norm": 0.42980635166168213, + "learning_rate": 1.4545454545454545e-05, + "loss": 1.5662312507629395, + "num_input_tokens_seen": 409400, + "step": 25, + "train_runtime": 204.2941, + "train_tokens_per_second": 2003.974 + }, + { + "epoch": 0.01575757575757576, + "grad_norm": 0.4569622576236725, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.4885609149932861, + "num_input_tokens_seen": 425776, + "step": 26, + "train_runtime": 212.4141, + "train_tokens_per_second": 2004.462 + }, + { + "epoch": 0.016363636363636365, + "grad_norm": 0.4413582384586334, + "learning_rate": 1.5757575757575756e-05, + "loss": 1.4823509454727173, + "num_input_tokens_seen": 442152, + "step": 27, + "train_runtime": 220.5358, + "train_tokens_per_second": 2004.899 + }, + { + "epoch": 0.01696969696969697, + "grad_norm": 0.45630744099617004, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.4595903158187866, + "num_input_tokens_seen": 458528, + "step": 28, + "train_runtime": 228.6622, + "train_tokens_per_second": 2005.264 + }, + { + "epoch": 0.017575757575757574, + "grad_norm": 0.457793653011322, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.4525277614593506, + "num_input_tokens_seen": 474904, + "step": 29, + "train_runtime": 236.7808, + "train_tokens_per_second": 2005.67 + }, + { + "epoch": 0.01818181818181818, + "grad_norm": 0.4766552150249481, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.425432801246643, + "num_input_tokens_seen": 491280, + "step": 30, + "train_runtime": 244.8928, + "train_tokens_per_second": 2006.102 + }, + { + "epoch": 0.018787878787878787, + "grad_norm": 0.5165067911148071, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.3646923303604126, + "num_input_tokens_seen": 507656, + "step": 31, + "train_runtime": 253.0027, + "train_tokens_per_second": 2006.524 + }, + { + "epoch": 0.019393939393939394, + "grad_norm": 0.4833853244781494, + "learning_rate": 1.878787878787879e-05, + "loss": 1.3608993291854858, + "num_input_tokens_seen": 524032, + "step": 32, + "train_runtime": 261.1128, + "train_tokens_per_second": 2006.918 + }, + { + "epoch": 0.02, + "grad_norm": 0.49612611532211304, + "learning_rate": 1.9393939393939395e-05, + "loss": 1.350702166557312, + "num_input_tokens_seen": 540408, + "step": 33, + "train_runtime": 269.2241, + "train_tokens_per_second": 2007.28 + }, + { + "epoch": 0.020606060606060607, + "grad_norm": 0.5136600732803345, + "learning_rate": 2e-05, + "loss": 1.291304349899292, + "num_input_tokens_seen": 556784, + "step": 34, + "train_runtime": 277.336, + "train_tokens_per_second": 2007.615 + }, + { + "epoch": 0.021212121212121213, + "grad_norm": 0.5192011594772339, + "learning_rate": 2.0606060606060608e-05, + "loss": 1.2744120359420776, + "num_input_tokens_seen": 573160, + "step": 35, + "train_runtime": 285.4446, + "train_tokens_per_second": 2007.956 + }, + { + "epoch": 0.02181818181818182, + "grad_norm": 0.5397765636444092, + "learning_rate": 2.1212121212121215e-05, + "loss": 1.208145022392273, + "num_input_tokens_seen": 589536, + "step": 36, + "train_runtime": 293.5567, + "train_tokens_per_second": 2008.253 + }, + { + "epoch": 0.022424242424242423, + "grad_norm": 0.5493120551109314, + "learning_rate": 2.1818181818181818e-05, + "loss": 1.2057533264160156, + "num_input_tokens_seen": 605912, + "step": 37, + "train_runtime": 301.6704, + "train_tokens_per_second": 2008.523 + }, + { + "epoch": 0.02303030303030303, + "grad_norm": 0.5603742599487305, + "learning_rate": 2.2424242424242424e-05, + "loss": 1.1387653350830078, + "num_input_tokens_seen": 622288, + "step": 38, + "train_runtime": 309.7816, + "train_tokens_per_second": 2008.796 + }, + { + "epoch": 0.023636363636363636, + "grad_norm": 0.581070601940155, + "learning_rate": 2.3030303030303034e-05, + "loss": 1.138227939605713, + "num_input_tokens_seen": 638664, + "step": 39, + "train_runtime": 317.8926, + "train_tokens_per_second": 2009.056 + }, + { + "epoch": 0.024242424242424242, + "grad_norm": 0.5650333762168884, + "learning_rate": 2.3636363636363637e-05, + "loss": 1.1126341819763184, + "num_input_tokens_seen": 655040, + "step": 40, + "train_runtime": 326.0006, + "train_tokens_per_second": 2009.321 + }, + { + "epoch": 0.02484848484848485, + "grad_norm": 0.6228408813476562, + "learning_rate": 2.4242424242424244e-05, + "loss": 1.0580966472625732, + "num_input_tokens_seen": 671416, + "step": 41, + "train_runtime": 334.1133, + "train_tokens_per_second": 2009.546 + }, + { + "epoch": 0.025454545454545455, + "grad_norm": 0.7027150392532349, + "learning_rate": 2.4848484848484847e-05, + "loss": 1.0436644554138184, + "num_input_tokens_seen": 687792, + "step": 42, + "train_runtime": 342.2227, + "train_tokens_per_second": 2009.779 + }, + { + "epoch": 0.026060606060606062, + "grad_norm": 0.876166045665741, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.9523745775222778, + "num_input_tokens_seen": 704168, + "step": 43, + "train_runtime": 350.3357, + "train_tokens_per_second": 2009.981 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.5786903500556946, + "learning_rate": 2.6060606060606063e-05, + "loss": 0.9218084812164307, + "num_input_tokens_seen": 720544, + "step": 44, + "train_runtime": 358.4444, + "train_tokens_per_second": 2010.198 + }, + { + "epoch": 0.02727272727272727, + "grad_norm": 0.6627383828163147, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.8746504187583923, + "num_input_tokens_seen": 736920, + "step": 45, + "train_runtime": 366.5519, + "train_tokens_per_second": 2010.411 + }, + { + "epoch": 0.027878787878787878, + "grad_norm": 0.6991789937019348, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.8592554926872253, + "num_input_tokens_seen": 753296, + "step": 46, + "train_runtime": 374.6632, + "train_tokens_per_second": 2010.595 + }, + { + "epoch": 0.028484848484848484, + "grad_norm": 0.6843043565750122, + "learning_rate": 2.7878787878787883e-05, + "loss": 0.7838267683982849, + "num_input_tokens_seen": 769672, + "step": 47, + "train_runtime": 382.7718, + "train_tokens_per_second": 2010.786 + }, + { + "epoch": 0.02909090909090909, + "grad_norm": 0.6203355193138123, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7517961263656616, + "num_input_tokens_seen": 786048, + "step": 48, + "train_runtime": 390.883, + "train_tokens_per_second": 2010.955 + }, + { + "epoch": 0.029696969696969697, + "grad_norm": 0.6031985878944397, + "learning_rate": 2.909090909090909e-05, + "loss": 0.7074779272079468, + "num_input_tokens_seen": 802424, + "step": 49, + "train_runtime": 398.992, + "train_tokens_per_second": 2011.128 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 0.6645159125328064, + "learning_rate": 2.96969696969697e-05, + "loss": 0.6244415044784546, + "num_input_tokens_seen": 818800, + "step": 50, + "train_runtime": 407.1007, + "train_tokens_per_second": 2011.296 + }, + { + "epoch": 0.03090909090909091, + "grad_norm": 0.6037282943725586, + "learning_rate": 3.0303030303030306e-05, + "loss": 0.6258814334869385, + "num_input_tokens_seen": 835176, + "step": 51, + "train_runtime": 415.209, + "train_tokens_per_second": 2011.459 + }, + { + "epoch": 0.03151515151515152, + "grad_norm": 0.7840785980224609, + "learning_rate": 3.090909090909091e-05, + "loss": 0.5502547025680542, + "num_input_tokens_seen": 851552, + "step": 52, + "train_runtime": 423.3167, + "train_tokens_per_second": 2011.619 + }, + { + "epoch": 0.03212121212121212, + "grad_norm": 0.5410464406013489, + "learning_rate": 3.151515151515151e-05, + "loss": 0.4808294475078583, + "num_input_tokens_seen": 867928, + "step": 53, + "train_runtime": 431.4326, + "train_tokens_per_second": 2011.735 + }, + { + "epoch": 0.03272727272727273, + "grad_norm": 0.5532175898551941, + "learning_rate": 3.212121212121212e-05, + "loss": 0.4808656871318817, + "num_input_tokens_seen": 884304, + "step": 54, + "train_runtime": 439.5458, + "train_tokens_per_second": 2011.859 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 0.6308758854866028, + "learning_rate": 3.272727272727273e-05, + "loss": 0.4137771427631378, + "num_input_tokens_seen": 900680, + "step": 55, + "train_runtime": 447.6539, + "train_tokens_per_second": 2012.001 + }, + { + "epoch": 0.03393939393939394, + "grad_norm": 0.492653489112854, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.3654894530773163, + "num_input_tokens_seen": 917056, + "step": 56, + "train_runtime": 455.7624, + "train_tokens_per_second": 2012.136 + }, + { + "epoch": 0.034545454545454546, + "grad_norm": 0.5767380595207214, + "learning_rate": 3.3939393939393945e-05, + "loss": 0.342722088098526, + "num_input_tokens_seen": 933432, + "step": 57, + "train_runtime": 463.8713, + "train_tokens_per_second": 2012.265 + }, + { + "epoch": 0.03515151515151515, + "grad_norm": 0.5243986248970032, + "learning_rate": 3.454545454545455e-05, + "loss": 0.2960652709007263, + "num_input_tokens_seen": 949808, + "step": 58, + "train_runtime": 471.9805, + "train_tokens_per_second": 2012.388 + }, + { + "epoch": 0.03575757575757576, + "grad_norm": 0.4490169882774353, + "learning_rate": 3.515151515151515e-05, + "loss": 0.26675525307655334, + "num_input_tokens_seen": 966184, + "step": 59, + "train_runtime": 480.0912, + "train_tokens_per_second": 2012.501 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 0.4677429795265198, + "learning_rate": 3.575757575757576e-05, + "loss": 0.2512170076370239, + "num_input_tokens_seen": 982560, + "step": 60, + "train_runtime": 488.2016, + "train_tokens_per_second": 2012.611 + }, + { + "epoch": 0.03696969696969697, + "grad_norm": 0.37272387742996216, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.19348715245723724, + "num_input_tokens_seen": 998936, + "step": 61, + "train_runtime": 496.3104, + "train_tokens_per_second": 2012.724 + }, + { + "epoch": 0.037575757575757575, + "grad_norm": 0.36983442306518555, + "learning_rate": 3.6969696969696974e-05, + "loss": 0.18563911318778992, + "num_input_tokens_seen": 1015312, + "step": 62, + "train_runtime": 504.419, + "train_tokens_per_second": 2012.835 + }, + { + "epoch": 0.038181818181818185, + "grad_norm": 0.37516751885414124, + "learning_rate": 3.757575757575758e-05, + "loss": 0.16986083984375, + "num_input_tokens_seen": 1031688, + "step": 63, + "train_runtime": 512.5348, + "train_tokens_per_second": 2012.913 + }, + { + "epoch": 0.03878787878787879, + "grad_norm": 0.3174577057361603, + "learning_rate": 3.818181818181819e-05, + "loss": 0.1534540057182312, + "num_input_tokens_seen": 1048064, + "step": 64, + "train_runtime": 520.644, + "train_tokens_per_second": 2013.015 + }, + { + "epoch": 0.03939393939393939, + "grad_norm": 0.30689847469329834, + "learning_rate": 3.878787878787879e-05, + "loss": 0.14156833291053772, + "num_input_tokens_seen": 1064440, + "step": 65, + "train_runtime": 528.7787, + "train_tokens_per_second": 2013.016 + }, + { + "epoch": 0.04, + "grad_norm": 0.2671639621257782, + "learning_rate": 3.939393939393939e-05, + "loss": 0.12481589615345001, + "num_input_tokens_seen": 1080816, + "step": 66, + "train_runtime": 536.8903, + "train_tokens_per_second": 2013.104 + }, + { + "epoch": 0.040606060606060604, + "grad_norm": 0.2459305375814438, + "learning_rate": 4e-05, + "loss": 0.12609152495861053, + "num_input_tokens_seen": 1097192, + "step": 67, + "train_runtime": 545.0023, + "train_tokens_per_second": 2013.188 + }, + { + "epoch": 0.041212121212121214, + "grad_norm": 0.23298931121826172, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.10923294723033905, + "num_input_tokens_seen": 1113568, + "step": 68, + "train_runtime": 553.1113, + "train_tokens_per_second": 2013.28 + }, + { + "epoch": 0.04181818181818182, + "grad_norm": 0.22864830493927002, + "learning_rate": 4.1212121212121216e-05, + "loss": 0.10794200748205185, + "num_input_tokens_seen": 1129944, + "step": 69, + "train_runtime": 561.2215, + "train_tokens_per_second": 2013.365 + }, + { + "epoch": 0.04242424242424243, + "grad_norm": 0.2130967080593109, + "learning_rate": 4.181818181818182e-05, + "loss": 0.09509418904781342, + "num_input_tokens_seen": 1146320, + "step": 70, + "train_runtime": 569.3343, + "train_tokens_per_second": 2013.439 + }, + { + "epoch": 0.04303030303030303, + "grad_norm": 0.19734057784080505, + "learning_rate": 4.242424242424243e-05, + "loss": 0.08767769485712051, + "num_input_tokens_seen": 1162696, + "step": 71, + "train_runtime": 577.4461, + "train_tokens_per_second": 2013.514 + }, + { + "epoch": 0.04363636363636364, + "grad_norm": 0.2512868344783783, + "learning_rate": 4.303030303030303e-05, + "loss": 0.08520924299955368, + "num_input_tokens_seen": 1179072, + "step": 72, + "train_runtime": 585.5562, + "train_tokens_per_second": 2013.593 + }, + { + "epoch": 0.04424242424242424, + "grad_norm": 0.18867339193820953, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.08193657547235489, + "num_input_tokens_seen": 1195448, + "step": 73, + "train_runtime": 593.6659, + "train_tokens_per_second": 2013.671 + }, + { + "epoch": 0.044848484848484846, + "grad_norm": 0.17708271741867065, + "learning_rate": 4.4242424242424246e-05, + "loss": 0.07861079275608063, + "num_input_tokens_seen": 1211824, + "step": 74, + "train_runtime": 601.778, + "train_tokens_per_second": 2013.739 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 0.16671743988990784, + "learning_rate": 4.484848484848485e-05, + "loss": 0.07204174995422363, + "num_input_tokens_seen": 1228200, + "step": 75, + "train_runtime": 609.889, + "train_tokens_per_second": 2013.809 + }, + { + "epoch": 0.04606060606060606, + "grad_norm": 0.17388567328453064, + "learning_rate": 4.545454545454546e-05, + "loss": 0.05977003276348114, + "num_input_tokens_seen": 1244576, + "step": 76, + "train_runtime": 617.9973, + "train_tokens_per_second": 2013.886 + }, + { + "epoch": 0.04666666666666667, + "grad_norm": 0.14751967787742615, + "learning_rate": 4.606060606060607e-05, + "loss": 0.06652094423770905, + "num_input_tokens_seen": 1260952, + "step": 77, + "train_runtime": 626.1063, + "train_tokens_per_second": 2013.958 + }, + { + "epoch": 0.04727272727272727, + "grad_norm": 0.1427117884159088, + "learning_rate": 4.666666666666667e-05, + "loss": 0.05981641262769699, + "num_input_tokens_seen": 1277328, + "step": 78, + "train_runtime": 634.2178, + "train_tokens_per_second": 2014.021 + }, + { + "epoch": 0.04787878787878788, + "grad_norm": 0.16328735649585724, + "learning_rate": 4.7272727272727275e-05, + "loss": 0.059813786298036575, + "num_input_tokens_seen": 1293704, + "step": 79, + "train_runtime": 642.3361, + "train_tokens_per_second": 2014.061 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 0.15144814550876617, + "learning_rate": 4.787878787878788e-05, + "loss": 0.05687074735760689, + "num_input_tokens_seen": 1310080, + "step": 80, + "train_runtime": 650.4589, + "train_tokens_per_second": 2014.086 + }, + { + "epoch": 0.04909090909090909, + "grad_norm": 0.19531840085983276, + "learning_rate": 4.848484848484849e-05, + "loss": 0.06199571490287781, + "num_input_tokens_seen": 1326456, + "step": 81, + "train_runtime": 658.5803, + "train_tokens_per_second": 2014.114 + }, + { + "epoch": 0.0496969696969697, + "grad_norm": 0.11535873264074326, + "learning_rate": 4.909090909090909e-05, + "loss": 0.05434288829565048, + "num_input_tokens_seen": 1342832, + "step": 82, + "train_runtime": 666.7006, + "train_tokens_per_second": 2014.145 + }, + { + "epoch": 0.0503030303030303, + "grad_norm": 0.17366129159927368, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.0584072507917881, + "num_input_tokens_seen": 1359208, + "step": 83, + "train_runtime": 674.8206, + "train_tokens_per_second": 2014.177 + }, + { + "epoch": 0.05090909090909091, + "grad_norm": 0.16601437330245972, + "learning_rate": 5.030303030303031e-05, + "loss": 0.055472493171691895, + "num_input_tokens_seen": 1375584, + "step": 84, + "train_runtime": 682.9407, + "train_tokens_per_second": 2014.207 + }, + { + "epoch": 0.051515151515151514, + "grad_norm": 0.12125150859355927, + "learning_rate": 5.090909090909091e-05, + "loss": 0.04972580820322037, + "num_input_tokens_seen": 1391960, + "step": 85, + "train_runtime": 691.0602, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.052121212121212124, + "grad_norm": 0.10404529422521591, + "learning_rate": 5.151515151515152e-05, + "loss": 0.04972917586565018, + "num_input_tokens_seen": 1408336, + "step": 86, + "train_runtime": 699.177, + "train_tokens_per_second": 2014.277 + }, + { + "epoch": 0.05272727272727273, + "grad_norm": 0.19109457731246948, + "learning_rate": 5.212121212121213e-05, + "loss": 0.04995625838637352, + "num_input_tokens_seen": 1424712, + "step": 87, + "train_runtime": 707.2957, + "train_tokens_per_second": 2014.309 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.14529068768024445, + "learning_rate": 5.272727272727272e-05, + "loss": 0.044690582901239395, + "num_input_tokens_seen": 1441088, + "step": 88, + "train_runtime": 715.4144, + "train_tokens_per_second": 2014.34 + }, + { + "epoch": 0.05393939393939394, + "grad_norm": 0.12216632813215256, + "learning_rate": 5.333333333333333e-05, + "loss": 0.04490099102258682, + "num_input_tokens_seen": 1457464, + "step": 89, + "train_runtime": 723.5369, + "train_tokens_per_second": 2014.36 + }, + { + "epoch": 0.05454545454545454, + "grad_norm": 0.09520085901021957, + "learning_rate": 5.393939393939394e-05, + "loss": 0.039979420602321625, + "num_input_tokens_seen": 1473840, + "step": 90, + "train_runtime": 731.6566, + "train_tokens_per_second": 2014.388 + }, + { + "epoch": 0.05515151515151515, + "grad_norm": 0.13766801357269287, + "learning_rate": 5.4545454545454546e-05, + "loss": 0.04609033092856407, + "num_input_tokens_seen": 1490216, + "step": 91, + "train_runtime": 739.7761, + "train_tokens_per_second": 2014.415 + }, + { + "epoch": 0.055757575757575756, + "grad_norm": 0.13074332475662231, + "learning_rate": 5.5151515151515156e-05, + "loss": 0.040276553481817245, + "num_input_tokens_seen": 1506592, + "step": 92, + "train_runtime": 747.8977, + "train_tokens_per_second": 2014.436 + }, + { + "epoch": 0.056363636363636366, + "grad_norm": 0.11333464086055756, + "learning_rate": 5.5757575757575766e-05, + "loss": 0.03974860906600952, + "num_input_tokens_seen": 1522968, + "step": 93, + "train_runtime": 756.018, + "train_tokens_per_second": 2014.46 + }, + { + "epoch": 0.05696969696969697, + "grad_norm": 0.09708438813686371, + "learning_rate": 5.636363636363636e-05, + "loss": 0.03745771571993828, + "num_input_tokens_seen": 1539344, + "step": 94, + "train_runtime": 764.1373, + "train_tokens_per_second": 2014.486 + }, + { + "epoch": 0.05757575757575758, + "grad_norm": 0.13791343569755554, + "learning_rate": 5.696969696969697e-05, + "loss": 0.04385356977581978, + "num_input_tokens_seen": 1555720, + "step": 95, + "train_runtime": 772.256, + "train_tokens_per_second": 2014.513 + }, + { + "epoch": 0.05818181818181818, + "grad_norm": 0.15427744388580322, + "learning_rate": 5.757575757575758e-05, + "loss": 0.0388864129781723, + "num_input_tokens_seen": 1572096, + "step": 96, + "train_runtime": 780.3755, + "train_tokens_per_second": 2014.538 + }, + { + "epoch": 0.058787878787878785, + "grad_norm": 0.11847083270549774, + "learning_rate": 5.818181818181818e-05, + "loss": 0.033506229519844055, + "num_input_tokens_seen": 1588472, + "step": 97, + "train_runtime": 788.4951, + "train_tokens_per_second": 2014.562 + }, + { + "epoch": 0.059393939393939395, + "grad_norm": 0.10092757642269135, + "learning_rate": 5.878787878787879e-05, + "loss": 0.03343300521373749, + "num_input_tokens_seen": 1604848, + "step": 98, + "train_runtime": 796.6166, + "train_tokens_per_second": 2014.58 + }, + { + "epoch": 0.06, + "grad_norm": 0.10452481359243393, + "learning_rate": 5.93939393939394e-05, + "loss": 0.036986708641052246, + "num_input_tokens_seen": 1621224, + "step": 99, + "train_runtime": 804.7379, + "train_tokens_per_second": 2014.599 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 0.08679923415184021, + "learning_rate": 6e-05, + "loss": 0.03295439854264259, + "num_input_tokens_seen": 1637600, + "step": 100, + "train_runtime": 812.8578, + "train_tokens_per_second": 2014.62 + }, + { + "epoch": 0.06121212121212121, + "grad_norm": 0.1115456148982048, + "learning_rate": 6.060606060606061e-05, + "loss": 0.03657374531030655, + "num_input_tokens_seen": 1653976, + "step": 101, + "train_runtime": 821.8569, + "train_tokens_per_second": 2012.487 + }, + { + "epoch": 0.06181818181818182, + "grad_norm": 0.08771228045225143, + "learning_rate": 6.121212121212121e-05, + "loss": 0.0364333875477314, + "num_input_tokens_seen": 1670352, + "step": 102, + "train_runtime": 829.9743, + "train_tokens_per_second": 2012.535 + }, + { + "epoch": 0.062424242424242424, + "grad_norm": 0.08961863070726395, + "learning_rate": 6.181818181818182e-05, + "loss": 0.03239607438445091, + "num_input_tokens_seen": 1686728, + "step": 103, + "train_runtime": 838.0926, + "train_tokens_per_second": 2012.58 + }, + { + "epoch": 0.06303030303030303, + "grad_norm": 0.10658557713031769, + "learning_rate": 6.242424242424243e-05, + "loss": 0.035685982555150986, + "num_input_tokens_seen": 1703104, + "step": 104, + "train_runtime": 846.2114, + "train_tokens_per_second": 2012.622 + }, + { + "epoch": 0.06363636363636363, + "grad_norm": 0.07003116607666016, + "learning_rate": 6.303030303030302e-05, + "loss": 0.03269325941801071, + "num_input_tokens_seen": 1719480, + "step": 105, + "train_runtime": 854.3347, + "train_tokens_per_second": 2012.654 + }, + { + "epoch": 0.06424242424242424, + "grad_norm": 0.0889090895652771, + "learning_rate": 6.363636363636364e-05, + "loss": 0.030469391494989395, + "num_input_tokens_seen": 1735856, + "step": 106, + "train_runtime": 862.4518, + "train_tokens_per_second": 2012.699 + }, + { + "epoch": 0.06484848484848485, + "grad_norm": 0.12026192247867584, + "learning_rate": 6.424242424242424e-05, + "loss": 0.032258037477731705, + "num_input_tokens_seen": 1752232, + "step": 107, + "train_runtime": 870.5683, + "train_tokens_per_second": 2012.745 + }, + { + "epoch": 0.06545454545454546, + "grad_norm": 0.06484470516443253, + "learning_rate": 6.484848484848485e-05, + "loss": 0.026622053235769272, + "num_input_tokens_seen": 1768608, + "step": 108, + "train_runtime": 878.6857, + "train_tokens_per_second": 2012.788 + }, + { + "epoch": 0.06606060606060606, + "grad_norm": 0.09636206179857254, + "learning_rate": 6.545454545454546e-05, + "loss": 0.03460235893726349, + "num_input_tokens_seen": 1784984, + "step": 109, + "train_runtime": 886.8033, + "train_tokens_per_second": 2012.83 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.10380304604768753, + "learning_rate": 6.606060606060607e-05, + "loss": 0.030300751328468323, + "num_input_tokens_seen": 1801360, + "step": 110, + "train_runtime": 894.9204, + "train_tokens_per_second": 2012.872 + }, + { + "epoch": 0.06727272727272728, + "grad_norm": 0.07361245900392532, + "learning_rate": 6.666666666666667e-05, + "loss": 0.03334670513868332, + "num_input_tokens_seen": 1817736, + "step": 111, + "train_runtime": 903.0383, + "train_tokens_per_second": 2012.911 + }, + { + "epoch": 0.06787878787878789, + "grad_norm": 0.06159133464097977, + "learning_rate": 6.727272727272727e-05, + "loss": 0.026774805039167404, + "num_input_tokens_seen": 1834112, + "step": 112, + "train_runtime": 911.1548, + "train_tokens_per_second": 2012.953 + }, + { + "epoch": 0.06848484848484848, + "grad_norm": 0.08236563950777054, + "learning_rate": 6.787878787878789e-05, + "loss": 0.02836509235203266, + "num_input_tokens_seen": 1850488, + "step": 113, + "train_runtime": 919.2722, + "train_tokens_per_second": 2012.992 + }, + { + "epoch": 0.06909090909090909, + "grad_norm": 0.06620238721370697, + "learning_rate": 6.848484848484848e-05, + "loss": 0.027467701584100723, + "num_input_tokens_seen": 1866864, + "step": 114, + "train_runtime": 927.3888, + "train_tokens_per_second": 2013.033 + }, + { + "epoch": 0.0696969696969697, + "grad_norm": 0.06323213130235672, + "learning_rate": 6.90909090909091e-05, + "loss": 0.02602136880159378, + "num_input_tokens_seen": 1883240, + "step": 115, + "train_runtime": 935.5053, + "train_tokens_per_second": 2013.072 + }, + { + "epoch": 0.0703030303030303, + "grad_norm": 0.06442830711603165, + "learning_rate": 6.96969696969697e-05, + "loss": 0.024133116006851196, + "num_input_tokens_seen": 1899616, + "step": 116, + "train_runtime": 943.6216, + "train_tokens_per_second": 2013.112 + }, + { + "epoch": 0.07090909090909091, + "grad_norm": 0.057056326419115067, + "learning_rate": 7.03030303030303e-05, + "loss": 0.029189810156822205, + "num_input_tokens_seen": 1915992, + "step": 117, + "train_runtime": 951.74, + "train_tokens_per_second": 2013.147 + }, + { + "epoch": 0.07151515151515152, + "grad_norm": 0.067554771900177, + "learning_rate": 7.090909090909092e-05, + "loss": 0.026694156229496002, + "num_input_tokens_seen": 1932368, + "step": 118, + "train_runtime": 959.8558, + "train_tokens_per_second": 2013.186 + }, + { + "epoch": 0.07212121212121213, + "grad_norm": 0.14906729757785797, + "learning_rate": 7.151515151515152e-05, + "loss": 0.027481166645884514, + "num_input_tokens_seen": 1948744, + "step": 119, + "train_runtime": 967.9726, + "train_tokens_per_second": 2013.222 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 0.08957181125879288, + "learning_rate": 7.212121212121213e-05, + "loss": 0.026221584528684616, + "num_input_tokens_seen": 1965120, + "step": 120, + "train_runtime": 976.0892, + "train_tokens_per_second": 2013.259 + }, + { + "epoch": 0.07333333333333333, + "grad_norm": 0.06401059031486511, + "learning_rate": 7.272727272727273e-05, + "loss": 0.024882640689611435, + "num_input_tokens_seen": 1981496, + "step": 121, + "train_runtime": 984.2063, + "train_tokens_per_second": 2013.293 + }, + { + "epoch": 0.07393939393939394, + "grad_norm": 0.08041027188301086, + "learning_rate": 7.333333333333333e-05, + "loss": 0.02306070551276207, + "num_input_tokens_seen": 1997872, + "step": 122, + "train_runtime": 992.3345, + "train_tokens_per_second": 2013.305 + }, + { + "epoch": 0.07454545454545454, + "grad_norm": 0.12150601297616959, + "learning_rate": 7.393939393939395e-05, + "loss": 0.024561185389757156, + "num_input_tokens_seen": 2014248, + "step": 123, + "train_runtime": 1000.452, + "train_tokens_per_second": 2013.338 + }, + { + "epoch": 0.07515151515151515, + "grad_norm": 0.24074473977088928, + "learning_rate": 7.454545454545455e-05, + "loss": 0.027396628633141518, + "num_input_tokens_seen": 2030624, + "step": 124, + "train_runtime": 1008.5688, + "train_tokens_per_second": 2013.372 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.05276267230510712, + "learning_rate": 7.515151515151515e-05, + "loss": 0.024067046120762825, + "num_input_tokens_seen": 2047000, + "step": 125, + "train_runtime": 1016.6862, + "train_tokens_per_second": 2013.404 + }, + { + "epoch": 0.07636363636363637, + "grad_norm": 0.17272238433361053, + "learning_rate": 7.575757575757576e-05, + "loss": 0.023468442261219025, + "num_input_tokens_seen": 2063376, + "step": 126, + "train_runtime": 1024.8042, + "train_tokens_per_second": 2013.434 + }, + { + "epoch": 0.07696969696969697, + "grad_norm": 0.3582988977432251, + "learning_rate": 7.636363636363637e-05, + "loss": 0.027403943240642548, + "num_input_tokens_seen": 2079752, + "step": 127, + "train_runtime": 1032.9345, + "train_tokens_per_second": 2013.44 + }, + { + "epoch": 0.07757575757575758, + "grad_norm": 0.0781882107257843, + "learning_rate": 7.696969696969696e-05, + "loss": 0.023713622242212296, + "num_input_tokens_seen": 2096128, + "step": 128, + "train_runtime": 1041.056, + "train_tokens_per_second": 2013.463 + }, + { + "epoch": 0.07818181818181819, + "grad_norm": 0.07272130995988846, + "learning_rate": 7.757575757575758e-05, + "loss": 0.022761020809412003, + "num_input_tokens_seen": 2112504, + "step": 129, + "train_runtime": 1049.1772, + "train_tokens_per_second": 2013.486 + }, + { + "epoch": 0.07878787878787878, + "grad_norm": 0.2158210277557373, + "learning_rate": 7.818181818181818e-05, + "loss": 0.024013228714466095, + "num_input_tokens_seen": 2128880, + "step": 130, + "train_runtime": 1057.2975, + "train_tokens_per_second": 2013.511 + }, + { + "epoch": 0.07939393939393939, + "grad_norm": 0.586162269115448, + "learning_rate": 7.878787878787879e-05, + "loss": 0.022834377363324165, + "num_input_tokens_seen": 2145256, + "step": 131, + "train_runtime": 1065.4164, + "train_tokens_per_second": 2013.538 + }, + { + "epoch": 0.08, + "grad_norm": 0.323000431060791, + "learning_rate": 7.93939393939394e-05, + "loss": 0.022654253989458084, + "num_input_tokens_seen": 2161632, + "step": 132, + "train_runtime": 1073.5352, + "train_tokens_per_second": 2013.564 + }, + { + "epoch": 0.08060606060606061, + "grad_norm": 0.08159562945365906, + "learning_rate": 8e-05, + "loss": 0.02390367165207863, + "num_input_tokens_seen": 2178008, + "step": 133, + "train_runtime": 1081.6528, + "train_tokens_per_second": 2013.593 + }, + { + "epoch": 0.08121212121212121, + "grad_norm": 0.7155167460441589, + "learning_rate": 8.060606060606061e-05, + "loss": 0.022787289693951607, + "num_input_tokens_seen": 2194384, + "step": 134, + "train_runtime": 1089.7709, + "train_tokens_per_second": 2013.619 + }, + { + "epoch": 0.08181818181818182, + "grad_norm": 0.08167142421007156, + "learning_rate": 8.121212121212121e-05, + "loss": 0.02184353396296501, + "num_input_tokens_seen": 2210760, + "step": 135, + "train_runtime": 1097.8902, + "train_tokens_per_second": 2013.644 + }, + { + "epoch": 0.08242424242424243, + "grad_norm": 0.47277864813804626, + "learning_rate": 8.181818181818183e-05, + "loss": 0.02624150738120079, + "num_input_tokens_seen": 2227136, + "step": 136, + "train_runtime": 1106.0079, + "train_tokens_per_second": 2013.671 + }, + { + "epoch": 0.08303030303030302, + "grad_norm": 0.07428373396396637, + "learning_rate": 8.242424242424243e-05, + "loss": 0.02352747693657875, + "num_input_tokens_seen": 2243512, + "step": 137, + "train_runtime": 1114.1326, + "train_tokens_per_second": 2013.685 + }, + { + "epoch": 0.08363636363636363, + "grad_norm": 0.47124460339546204, + "learning_rate": 8.303030303030304e-05, + "loss": 0.025087552145123482, + "num_input_tokens_seen": 2259888, + "step": 138, + "train_runtime": 1122.2501, + "train_tokens_per_second": 2013.712 + }, + { + "epoch": 0.08424242424242424, + "grad_norm": 0.2430545538663864, + "learning_rate": 8.363636363636364e-05, + "loss": 0.024803292006254196, + "num_input_tokens_seen": 2276264, + "step": 139, + "train_runtime": 1130.3676, + "train_tokens_per_second": 2013.738 + }, + { + "epoch": 0.08484848484848485, + "grad_norm": 0.08046893775463104, + "learning_rate": 8.424242424242424e-05, + "loss": 0.022827964276075363, + "num_input_tokens_seen": 2292640, + "step": 140, + "train_runtime": 1138.4851, + "train_tokens_per_second": 2013.764 + }, + { + "epoch": 0.08545454545454545, + "grad_norm": 0.15526282787322998, + "learning_rate": 8.484848484848486e-05, + "loss": 0.02164369635283947, + "num_input_tokens_seen": 2309016, + "step": 141, + "train_runtime": 1146.6046, + "train_tokens_per_second": 2013.786 + }, + { + "epoch": 0.08606060606060606, + "grad_norm": 0.0912376195192337, + "learning_rate": 8.545454545454545e-05, + "loss": 0.0223920289427042, + "num_input_tokens_seen": 2325392, + "step": 142, + "train_runtime": 1154.7226, + "train_tokens_per_second": 2013.81 + }, + { + "epoch": 0.08666666666666667, + "grad_norm": 0.08407703042030334, + "learning_rate": 8.606060606060606e-05, + "loss": 0.022693689912557602, + "num_input_tokens_seen": 2341768, + "step": 143, + "train_runtime": 1162.8406, + "train_tokens_per_second": 2013.834 + }, + { + "epoch": 0.08727272727272728, + "grad_norm": 0.07187625020742416, + "learning_rate": 8.666666666666667e-05, + "loss": 0.020523108541965485, + "num_input_tokens_seen": 2358144, + "step": 144, + "train_runtime": 1170.9602, + "train_tokens_per_second": 2013.855 + }, + { + "epoch": 0.08787878787878788, + "grad_norm": 0.08785762637853622, + "learning_rate": 8.727272727272727e-05, + "loss": 0.023188354447484016, + "num_input_tokens_seen": 2374520, + "step": 145, + "train_runtime": 1179.0803, + "train_tokens_per_second": 2013.875 + }, + { + "epoch": 0.08848484848484849, + "grad_norm": 0.06223875284194946, + "learning_rate": 8.787878787878789e-05, + "loss": 0.019059190526604652, + "num_input_tokens_seen": 2390896, + "step": 146, + "train_runtime": 1187.2017, + "train_tokens_per_second": 2013.892 + }, + { + "epoch": 0.0890909090909091, + "grad_norm": 0.09552452713251114, + "learning_rate": 8.848484848484849e-05, + "loss": 0.020222101360559464, + "num_input_tokens_seen": 2407272, + "step": 147, + "train_runtime": 1195.3217, + "train_tokens_per_second": 2013.911 + }, + { + "epoch": 0.08969696969696969, + "grad_norm": 0.07248228043317795, + "learning_rate": 8.90909090909091e-05, + "loss": 0.020538993179798126, + "num_input_tokens_seen": 2423648, + "step": 148, + "train_runtime": 1203.4411, + "train_tokens_per_second": 2013.932 + }, + { + "epoch": 0.0903030303030303, + "grad_norm": 0.08636505901813507, + "learning_rate": 8.96969696969697e-05, + "loss": 0.020172201097011566, + "num_input_tokens_seen": 2440024, + "step": 149, + "train_runtime": 1211.5609, + "train_tokens_per_second": 2013.951 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 0.0678800642490387, + "learning_rate": 9.030303030303031e-05, + "loss": 0.01839592307806015, + "num_input_tokens_seen": 2456400, + "step": 150, + "train_runtime": 1219.679, + "train_tokens_per_second": 2013.973 + }, + { + "epoch": 0.09151515151515152, + "grad_norm": 0.08543987572193146, + "learning_rate": 9.090909090909092e-05, + "loss": 0.02213234454393387, + "num_input_tokens_seen": 2472776, + "step": 151, + "train_runtime": 1227.7971, + "train_tokens_per_second": 2013.994 + }, + { + "epoch": 0.09212121212121212, + "grad_norm": 0.06894785910844803, + "learning_rate": 9.151515151515152e-05, + "loss": 0.019493641331791878, + "num_input_tokens_seen": 2489152, + "step": 152, + "train_runtime": 1235.9161, + "train_tokens_per_second": 2014.014 + }, + { + "epoch": 0.09272727272727273, + "grad_norm": 0.0796777755022049, + "learning_rate": 9.212121212121214e-05, + "loss": 0.019212841987609863, + "num_input_tokens_seen": 2505528, + "step": 153, + "train_runtime": 1244.0335, + "train_tokens_per_second": 2014.036 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.03816372528672218, + "learning_rate": 9.272727272727273e-05, + "loss": 0.018845168873667717, + "num_input_tokens_seen": 2521904, + "step": 154, + "train_runtime": 1252.1501, + "train_tokens_per_second": 2014.059 + }, + { + "epoch": 0.09393939393939393, + "grad_norm": 0.05867328122258186, + "learning_rate": 9.333333333333334e-05, + "loss": 0.020137080922722816, + "num_input_tokens_seen": 2538280, + "step": 155, + "train_runtime": 1260.2669, + "train_tokens_per_second": 2014.081 + }, + { + "epoch": 0.09454545454545454, + "grad_norm": 0.12616179883480072, + "learning_rate": 9.393939393939395e-05, + "loss": 0.023685304448008537, + "num_input_tokens_seen": 2554656, + "step": 156, + "train_runtime": 1268.385, + "train_tokens_per_second": 2014.101 + }, + { + "epoch": 0.09515151515151515, + "grad_norm": 0.06801550090312958, + "learning_rate": 9.454545454545455e-05, + "loss": 0.021116768941283226, + "num_input_tokens_seen": 2571032, + "step": 157, + "train_runtime": 1276.5029, + "train_tokens_per_second": 2014.122 + }, + { + "epoch": 0.09575757575757576, + "grad_norm": 0.05668250098824501, + "learning_rate": 9.515151515151515e-05, + "loss": 0.019319312646985054, + "num_input_tokens_seen": 2587408, + "step": 158, + "train_runtime": 1284.6181, + "train_tokens_per_second": 2014.146 + }, + { + "epoch": 0.09636363636363636, + "grad_norm": 0.05750446021556854, + "learning_rate": 9.575757575757576e-05, + "loss": 0.01928100548684597, + "num_input_tokens_seen": 2603784, + "step": 159, + "train_runtime": 1292.7386, + "train_tokens_per_second": 2014.161 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 0.08826832473278046, + "learning_rate": 9.636363636363637e-05, + "loss": 0.02036631852388382, + "num_input_tokens_seen": 2620160, + "step": 160, + "train_runtime": 1300.8562, + "train_tokens_per_second": 2014.181 + }, + { + "epoch": 0.09757575757575758, + "grad_norm": 0.05680972710251808, + "learning_rate": 9.696969696969698e-05, + "loss": 0.017789499834179878, + "num_input_tokens_seen": 2636536, + "step": 161, + "train_runtime": 1308.9737, + "train_tokens_per_second": 2014.201 + }, + { + "epoch": 0.09818181818181818, + "grad_norm": 0.04641514644026756, + "learning_rate": 9.757575757575758e-05, + "loss": 0.02048567123711109, + "num_input_tokens_seen": 2652912, + "step": 162, + "train_runtime": 1317.092, + "train_tokens_per_second": 2014.219 + }, + { + "epoch": 0.09878787878787879, + "grad_norm": 0.04058675095438957, + "learning_rate": 9.818181818181818e-05, + "loss": 0.019105076789855957, + "num_input_tokens_seen": 2669288, + "step": 163, + "train_runtime": 1325.2097, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.0993939393939394, + "grad_norm": 0.08786831051111221, + "learning_rate": 9.87878787878788e-05, + "loss": 0.020488332957029343, + "num_input_tokens_seen": 2685664, + "step": 164, + "train_runtime": 1333.3352, + "train_tokens_per_second": 2014.245 + }, + { + "epoch": 0.1, + "grad_norm": 0.05097790062427521, + "learning_rate": 9.939393939393939e-05, + "loss": 0.018979694694280624, + "num_input_tokens_seen": 2702040, + "step": 165, + "train_runtime": 1341.4534, + "train_tokens_per_second": 2014.263 + }, + { + "epoch": 0.1006060606060606, + "grad_norm": 0.05220174416899681, + "learning_rate": 0.0001, + "loss": 0.017788853496313095, + "num_input_tokens_seen": 2718416, + "step": 166, + "train_runtime": 1349.5711, + "train_tokens_per_second": 2014.281 + }, + { + "epoch": 0.10121212121212121, + "grad_norm": 0.07084593176841736, + "learning_rate": 9.999999907529869e-05, + "loss": 0.017644576728343964, + "num_input_tokens_seen": 2734792, + "step": 167, + "train_runtime": 1357.6892, + "train_tokens_per_second": 2014.299 + }, + { + "epoch": 0.10181818181818182, + "grad_norm": 0.058325134217739105, + "learning_rate": 9.999999630119479e-05, + "loss": 0.01890077441930771, + "num_input_tokens_seen": 2751168, + "step": 168, + "train_runtime": 1365.8058, + "train_tokens_per_second": 2014.319 + }, + { + "epoch": 0.10242424242424242, + "grad_norm": 0.06277347356081009, + "learning_rate": 9.999999167768837e-05, + "loss": 0.020100781694054604, + "num_input_tokens_seen": 2767544, + "step": 169, + "train_runtime": 1373.9351, + "train_tokens_per_second": 2014.319 + }, + { + "epoch": 0.10303030303030303, + "grad_norm": 0.07524619996547699, + "learning_rate": 9.999998520477966e-05, + "loss": 0.016615130007267, + "num_input_tokens_seen": 2783920, + "step": 170, + "train_runtime": 1382.0536, + "train_tokens_per_second": 2014.336 + }, + { + "epoch": 0.10363636363636364, + "grad_norm": 0.07865840196609497, + "learning_rate": 9.999997688246885e-05, + "loss": 0.02175009250640869, + "num_input_tokens_seen": 2800296, + "step": 171, + "train_runtime": 1390.173, + "train_tokens_per_second": 2014.351 + }, + { + "epoch": 0.10424242424242425, + "grad_norm": 0.10437590628862381, + "learning_rate": 9.999996671075626e-05, + "loss": 0.021732885390520096, + "num_input_tokens_seen": 2816672, + "step": 172, + "train_runtime": 1398.29, + "train_tokens_per_second": 2014.369 + }, + { + "epoch": 0.10484848484848484, + "grad_norm": 0.09102741628885269, + "learning_rate": 9.99999546896423e-05, + "loss": 0.019160069525241852, + "num_input_tokens_seen": 2833048, + "step": 173, + "train_runtime": 1406.4092, + "train_tokens_per_second": 2014.384 + }, + { + "epoch": 0.10545454545454545, + "grad_norm": 0.09274180978536606, + "learning_rate": 9.999994081912736e-05, + "loss": 0.020909177139401436, + "num_input_tokens_seen": 2849424, + "step": 174, + "train_runtime": 1414.5329, + "train_tokens_per_second": 2014.392 + }, + { + "epoch": 0.10606060606060606, + "grad_norm": 0.0448119193315506, + "learning_rate": 9.999992509921199e-05, + "loss": 0.018382754176855087, + "num_input_tokens_seen": 2865800, + "step": 175, + "train_runtime": 1422.6511, + "train_tokens_per_second": 2014.408 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.04945825785398483, + "learning_rate": 9.999990752989675e-05, + "loss": 0.01783941313624382, + "num_input_tokens_seen": 2882176, + "step": 176, + "train_runtime": 1430.7704, + "train_tokens_per_second": 2014.422 + }, + { + "epoch": 0.10727272727272727, + "grad_norm": 0.04921802878379822, + "learning_rate": 9.999988811118231e-05, + "loss": 0.01793338730931282, + "num_input_tokens_seen": 2898552, + "step": 177, + "train_runtime": 1438.89, + "train_tokens_per_second": 2014.436 + }, + { + "epoch": 0.10787878787878788, + "grad_norm": 0.05301757901906967, + "learning_rate": 9.999986684306937e-05, + "loss": 0.01700768433511257, + "num_input_tokens_seen": 2914928, + "step": 178, + "train_runtime": 1447.011, + "train_tokens_per_second": 2014.448 + }, + { + "epoch": 0.10848484848484849, + "grad_norm": 0.0539541132748127, + "learning_rate": 9.999984372555874e-05, + "loss": 0.01774643547832966, + "num_input_tokens_seen": 2931304, + "step": 179, + "train_runtime": 1455.1319, + "train_tokens_per_second": 2014.459 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 0.046017974615097046, + "learning_rate": 9.999981875865125e-05, + "loss": 0.016473708674311638, + "num_input_tokens_seen": 2947680, + "step": 180, + "train_runtime": 1463.2551, + "train_tokens_per_second": 2014.468 + }, + { + "epoch": 0.1096969696969697, + "grad_norm": 0.05201786011457443, + "learning_rate": 9.999979194234786e-05, + "loss": 0.019079631194472313, + "num_input_tokens_seen": 2964056, + "step": 181, + "train_runtime": 1471.3776, + "train_tokens_per_second": 2014.477 + }, + { + "epoch": 0.1103030303030303, + "grad_norm": 0.07819167524576187, + "learning_rate": 9.99997632766495e-05, + "loss": 0.018508095294237137, + "num_input_tokens_seen": 2980432, + "step": 182, + "train_runtime": 1479.496, + "train_tokens_per_second": 2014.491 + }, + { + "epoch": 0.11090909090909092, + "grad_norm": 0.04773807153105736, + "learning_rate": 9.999973276155727e-05, + "loss": 0.016029708087444305, + "num_input_tokens_seen": 2996808, + "step": 183, + "train_runtime": 1487.6149, + "train_tokens_per_second": 2014.505 + }, + { + "epoch": 0.11151515151515151, + "grad_norm": 0.054091572761535645, + "learning_rate": 9.999970039707232e-05, + "loss": 0.01906082220375538, + "num_input_tokens_seen": 3013184, + "step": 184, + "train_runtime": 1495.7326, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.11212121212121212, + "grad_norm": 0.03870342671871185, + "learning_rate": 9.999966618319581e-05, + "loss": 0.01634303852915764, + "num_input_tokens_seen": 3029560, + "step": 185, + "train_runtime": 1503.8521, + "train_tokens_per_second": 2014.533 + }, + { + "epoch": 0.11272727272727273, + "grad_norm": 0.04409291222691536, + "learning_rate": 9.999963011992902e-05, + "loss": 0.016504261642694473, + "num_input_tokens_seen": 3045936, + "step": 186, + "train_runtime": 1511.9705, + "train_tokens_per_second": 2014.547 + }, + { + "epoch": 0.11333333333333333, + "grad_norm": 0.037538424134254456, + "learning_rate": 9.999959220727327e-05, + "loss": 0.016254613175988197, + "num_input_tokens_seen": 3062312, + "step": 187, + "train_runtime": 1520.0898, + "train_tokens_per_second": 2014.56 + }, + { + "epoch": 0.11393939393939394, + "grad_norm": 0.0896935984492302, + "learning_rate": 9.999955244522999e-05, + "loss": 0.016761597245931625, + "num_input_tokens_seen": 3078688, + "step": 188, + "train_runtime": 1528.2094, + "train_tokens_per_second": 2014.572 + }, + { + "epoch": 0.11454545454545455, + "grad_norm": 0.10176566988229752, + "learning_rate": 9.999951083380062e-05, + "loss": 0.01988411694765091, + "num_input_tokens_seen": 3095064, + "step": 189, + "train_runtime": 1536.333, + "train_tokens_per_second": 2014.579 + }, + { + "epoch": 0.11515151515151516, + "grad_norm": 0.039956171065568924, + "learning_rate": 9.999946737298674e-05, + "loss": 0.015326369553804398, + "num_input_tokens_seen": 3111440, + "step": 190, + "train_runtime": 1544.4503, + "train_tokens_per_second": 2014.594 + }, + { + "epoch": 0.11575757575757575, + "grad_norm": 0.06942013651132584, + "learning_rate": 9.99994220627899e-05, + "loss": 0.017792224884033203, + "num_input_tokens_seen": 3127816, + "step": 191, + "train_runtime": 1552.5689, + "train_tokens_per_second": 2014.607 + }, + { + "epoch": 0.11636363636363636, + "grad_norm": 0.06119908019900322, + "learning_rate": 9.999937490321182e-05, + "loss": 0.016535507515072823, + "num_input_tokens_seen": 3144192, + "step": 192, + "train_runtime": 1560.6857, + "train_tokens_per_second": 2014.622 + }, + { + "epoch": 0.11696969696969697, + "grad_norm": 0.07336534559726715, + "learning_rate": 9.999932589425423e-05, + "loss": 0.015493718907237053, + "num_input_tokens_seen": 3160568, + "step": 193, + "train_runtime": 1568.8033, + "train_tokens_per_second": 2014.636 + }, + { + "epoch": 0.11757575757575757, + "grad_norm": 0.03818663954734802, + "learning_rate": 9.999927503591896e-05, + "loss": 0.017348209396004677, + "num_input_tokens_seen": 3176944, + "step": 194, + "train_runtime": 1576.9206, + "train_tokens_per_second": 2014.651 + }, + { + "epoch": 0.11818181818181818, + "grad_norm": 0.028583593666553497, + "learning_rate": 9.999922232820785e-05, + "loss": 0.014952014200389385, + "num_input_tokens_seen": 3193320, + "step": 195, + "train_runtime": 1585.0393, + "train_tokens_per_second": 2014.663 + }, + { + "epoch": 0.11878787878787879, + "grad_norm": 0.04163753613829613, + "learning_rate": 9.999916777112288e-05, + "loss": 0.017875926569104195, + "num_input_tokens_seen": 3209696, + "step": 196, + "train_runtime": 1593.159, + "train_tokens_per_second": 2014.674 + }, + { + "epoch": 0.1193939393939394, + "grad_norm": 0.03779582679271698, + "learning_rate": 9.999911136466608e-05, + "loss": 0.01648208498954773, + "num_input_tokens_seen": 3226072, + "step": 197, + "train_runtime": 1601.2758, + "train_tokens_per_second": 2014.689 + }, + { + "epoch": 0.12, + "grad_norm": 0.06097209453582764, + "learning_rate": 9.99990531088395e-05, + "loss": 0.017982497811317444, + "num_input_tokens_seen": 3242448, + "step": 198, + "train_runtime": 1609.4726, + "train_tokens_per_second": 2014.603 + }, + { + "epoch": 0.1206060606060606, + "grad_norm": 0.07450928539037704, + "learning_rate": 9.999899300364532e-05, + "loss": 0.015351779758930206, + "num_input_tokens_seen": 3258824, + "step": 199, + "train_runtime": 1617.5877, + "train_tokens_per_second": 2014.62 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 0.06301674991846085, + "learning_rate": 9.999893104908577e-05, + "loss": 0.018576189875602722, + "num_input_tokens_seen": 3275200, + "step": 200, + "train_runtime": 1625.7153, + "train_tokens_per_second": 2014.621 + }, + { + "epoch": 0.12181818181818181, + "grad_norm": 0.05599730834364891, + "learning_rate": 9.999886724516312e-05, + "loss": 0.018099110573530197, + "num_input_tokens_seen": 3291576, + "step": 201, + "train_runtime": 1635.3633, + "train_tokens_per_second": 2012.749 + }, + { + "epoch": 0.12242424242424242, + "grad_norm": 0.040753431618213654, + "learning_rate": 9.999880159187975e-05, + "loss": 0.015437884256243706, + "num_input_tokens_seen": 3307952, + "step": 202, + "train_runtime": 1643.4859, + "train_tokens_per_second": 2012.766 + }, + { + "epoch": 0.12303030303030303, + "grad_norm": 0.03280268982052803, + "learning_rate": 9.999873408923806e-05, + "loss": 0.01625344157218933, + "num_input_tokens_seen": 3324328, + "step": 203, + "train_runtime": 1651.609, + "train_tokens_per_second": 2012.781 + }, + { + "epoch": 0.12363636363636364, + "grad_norm": 0.058769796043634415, + "learning_rate": 9.999866473724057e-05, + "loss": 0.019040308892726898, + "num_input_tokens_seen": 3340704, + "step": 204, + "train_runtime": 1659.7319, + "train_tokens_per_second": 2012.797 + }, + { + "epoch": 0.12424242424242424, + "grad_norm": 0.07302497327327728, + "learning_rate": 9.999859353588984e-05, + "loss": 0.015959227457642555, + "num_input_tokens_seen": 3357080, + "step": 205, + "train_runtime": 1667.8511, + "train_tokens_per_second": 2012.818 + }, + { + "epoch": 0.12484848484848485, + "grad_norm": 0.038392290472984314, + "learning_rate": 9.999852048518849e-05, + "loss": 0.015184870921075344, + "num_input_tokens_seen": 3373456, + "step": 206, + "train_runtime": 1675.97, + "train_tokens_per_second": 2012.838 + }, + { + "epoch": 0.12545454545454546, + "grad_norm": 0.057108521461486816, + "learning_rate": 9.999844558513926e-05, + "loss": 0.018102547153830528, + "num_input_tokens_seen": 3389832, + "step": 207, + "train_runtime": 1684.0874, + "train_tokens_per_second": 2012.86 + }, + { + "epoch": 0.12606060606060607, + "grad_norm": 0.05192007124423981, + "learning_rate": 9.999836883574488e-05, + "loss": 0.016045067459344864, + "num_input_tokens_seen": 3406208, + "step": 208, + "train_runtime": 1692.2048, + "train_tokens_per_second": 2012.882 + }, + { + "epoch": 0.12666666666666668, + "grad_norm": 0.05115659907460213, + "learning_rate": 9.99982902370082e-05, + "loss": 0.016623271629214287, + "num_input_tokens_seen": 3422584, + "step": 209, + "train_runtime": 1700.3232, + "train_tokens_per_second": 2012.902 + }, + { + "epoch": 0.12727272727272726, + "grad_norm": 0.07258911430835724, + "learning_rate": 9.999820978893216e-05, + "loss": 0.020482556894421577, + "num_input_tokens_seen": 3438960, + "step": 210, + "train_runtime": 1708.4412, + "train_tokens_per_second": 2012.923 + }, + { + "epoch": 0.12787878787878787, + "grad_norm": 0.1083996444940567, + "learning_rate": 9.999812749151966e-05, + "loss": 0.020862706005573273, + "num_input_tokens_seen": 3455336, + "step": 211, + "train_runtime": 1716.5608, + "train_tokens_per_second": 2012.941 + }, + { + "epoch": 0.12848484848484848, + "grad_norm": 0.04957745969295502, + "learning_rate": 9.999804334477383e-05, + "loss": 0.019352620467543602, + "num_input_tokens_seen": 3471712, + "step": 212, + "train_runtime": 1724.679, + "train_tokens_per_second": 2012.961 + }, + { + "epoch": 0.1290909090909091, + "grad_norm": 0.05110868439078331, + "learning_rate": 9.999795734869772e-05, + "loss": 0.01801101304590702, + "num_input_tokens_seen": 3488088, + "step": 213, + "train_runtime": 1732.7974, + "train_tokens_per_second": 2012.981 + }, + { + "epoch": 0.1296969696969697, + "grad_norm": 0.03656603768467903, + "learning_rate": 9.999786950329454e-05, + "loss": 0.014664572663605213, + "num_input_tokens_seen": 3504464, + "step": 214, + "train_runtime": 1740.9181, + "train_tokens_per_second": 2012.998 + }, + { + "epoch": 0.1303030303030303, + "grad_norm": 0.06225895509123802, + "learning_rate": 9.999777980856754e-05, + "loss": 0.01811577007174492, + "num_input_tokens_seen": 3520840, + "step": 215, + "train_runtime": 1749.0394, + "train_tokens_per_second": 2013.014 + }, + { + "epoch": 0.13090909090909092, + "grad_norm": 0.06217541545629501, + "learning_rate": 9.999768826452004e-05, + "loss": 0.015230846591293812, + "num_input_tokens_seen": 3537216, + "step": 216, + "train_runtime": 1757.1603, + "train_tokens_per_second": 2013.03 + }, + { + "epoch": 0.1315151515151515, + "grad_norm": 0.0395430289208889, + "learning_rate": 9.999759487115541e-05, + "loss": 0.017680658027529716, + "num_input_tokens_seen": 3553592, + "step": 217, + "train_runtime": 1765.2799, + "train_tokens_per_second": 2013.047 + }, + { + "epoch": 0.1321212121212121, + "grad_norm": 0.04460732638835907, + "learning_rate": 9.999749962847711e-05, + "loss": 0.015775006264448166, + "num_input_tokens_seen": 3569968, + "step": 218, + "train_runtime": 1773.4008, + "train_tokens_per_second": 2013.063 + }, + { + "epoch": 0.13272727272727272, + "grad_norm": 0.026493152603507042, + "learning_rate": 9.999740253648866e-05, + "loss": 0.016286678612232208, + "num_input_tokens_seen": 3586344, + "step": 219, + "train_runtime": 1781.5181, + "train_tokens_per_second": 2013.083 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.05032551288604736, + "learning_rate": 9.999730359519366e-05, + "loss": 0.01735139824450016, + "num_input_tokens_seen": 3602720, + "step": 220, + "train_runtime": 1789.6355, + "train_tokens_per_second": 2013.103 + }, + { + "epoch": 0.13393939393939394, + "grad_norm": 0.10480339080095291, + "learning_rate": 9.999720280459576e-05, + "loss": 0.0164189450442791, + "num_input_tokens_seen": 3619096, + "step": 221, + "train_runtime": 1797.7553, + "train_tokens_per_second": 2013.119 + }, + { + "epoch": 0.13454545454545455, + "grad_norm": 0.05456702038645744, + "learning_rate": 9.99971001646987e-05, + "loss": 0.018650280311703682, + "num_input_tokens_seen": 3635472, + "step": 222, + "train_runtime": 1805.876, + "train_tokens_per_second": 2013.135 + }, + { + "epoch": 0.13515151515151516, + "grad_norm": 0.03562236949801445, + "learning_rate": 9.999699567550627e-05, + "loss": 0.014892566949129105, + "num_input_tokens_seen": 3651848, + "step": 223, + "train_runtime": 1813.9965, + "train_tokens_per_second": 2013.151 + }, + { + "epoch": 0.13575757575757577, + "grad_norm": 0.09293515980243683, + "learning_rate": 9.999688933702232e-05, + "loss": 0.019074441865086555, + "num_input_tokens_seen": 3668224, + "step": 224, + "train_runtime": 1822.1164, + "train_tokens_per_second": 2013.167 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 0.04311508685350418, + "learning_rate": 9.99967811492508e-05, + "loss": 0.016122177243232727, + "num_input_tokens_seen": 3684600, + "step": 225, + "train_runtime": 1830.237, + "train_tokens_per_second": 2013.182 + }, + { + "epoch": 0.13696969696969696, + "grad_norm": 0.0684700533747673, + "learning_rate": 9.999667111219573e-05, + "loss": 0.016784384846687317, + "num_input_tokens_seen": 3700976, + "step": 226, + "train_runtime": 1838.3572, + "train_tokens_per_second": 2013.197 + }, + { + "epoch": 0.13757575757575757, + "grad_norm": 0.051709555089473724, + "learning_rate": 9.999655922586116e-05, + "loss": 0.01756284013390541, + "num_input_tokens_seen": 3717352, + "step": 227, + "train_runtime": 1846.4811, + "train_tokens_per_second": 2013.209 + }, + { + "epoch": 0.13818181818181818, + "grad_norm": 0.06800346821546555, + "learning_rate": 9.99964454902512e-05, + "loss": 0.018563883379101753, + "num_input_tokens_seen": 3733728, + "step": 228, + "train_runtime": 1854.6021, + "train_tokens_per_second": 2013.223 + }, + { + "epoch": 0.1387878787878788, + "grad_norm": 0.04645644128322601, + "learning_rate": 9.99963299053701e-05, + "loss": 0.017479516565799713, + "num_input_tokens_seen": 3750104, + "step": 229, + "train_runtime": 1862.7316, + "train_tokens_per_second": 2013.228 + }, + { + "epoch": 0.1393939393939394, + "grad_norm": 0.07372930645942688, + "learning_rate": 9.999621247122213e-05, + "loss": 0.017878303304314613, + "num_input_tokens_seen": 3766480, + "step": 230, + "train_runtime": 1870.8516, + "train_tokens_per_second": 2013.244 + }, + { + "epoch": 0.14, + "grad_norm": 0.1514655202627182, + "learning_rate": 9.99960931878116e-05, + "loss": 0.015512627549469471, + "num_input_tokens_seen": 3782856, + "step": 231, + "train_runtime": 1878.9708, + "train_tokens_per_second": 2013.26 + }, + { + "epoch": 0.1406060606060606, + "grad_norm": 0.04524844512343407, + "learning_rate": 9.999597205514297e-05, + "loss": 0.01565626822412014, + "num_input_tokens_seen": 3799232, + "step": 232, + "train_runtime": 1887.094, + "train_tokens_per_second": 2013.271 + }, + { + "epoch": 0.1412121212121212, + "grad_norm": 0.03657226637005806, + "learning_rate": 9.999584907322069e-05, + "loss": 0.014475165866315365, + "num_input_tokens_seen": 3815608, + "step": 233, + "train_runtime": 1895.2138, + "train_tokens_per_second": 2013.286 + }, + { + "epoch": 0.14181818181818182, + "grad_norm": 0.10837068408727646, + "learning_rate": 9.99957242420493e-05, + "loss": 0.016292275860905647, + "num_input_tokens_seen": 3831984, + "step": 234, + "train_runtime": 1903.3349, + "train_tokens_per_second": 2013.3 + }, + { + "epoch": 0.14242424242424243, + "grad_norm": 0.06915906816720963, + "learning_rate": 9.999559756163346e-05, + "loss": 0.01956966333091259, + "num_input_tokens_seen": 3848360, + "step": 235, + "train_runtime": 1911.4546, + "train_tokens_per_second": 2013.315 + }, + { + "epoch": 0.14303030303030304, + "grad_norm": 0.03815745189785957, + "learning_rate": 9.99954690319778e-05, + "loss": 0.01515297032892704, + "num_input_tokens_seen": 3864736, + "step": 236, + "train_runtime": 1919.5751, + "train_tokens_per_second": 2013.329 + }, + { + "epoch": 0.14363636363636365, + "grad_norm": 0.04804231598973274, + "learning_rate": 9.999533865308712e-05, + "loss": 0.017410308122634888, + "num_input_tokens_seen": 3881112, + "step": 237, + "train_runtime": 1927.6957, + "train_tokens_per_second": 2013.343 + }, + { + "epoch": 0.14424242424242426, + "grad_norm": 0.10351648926734924, + "learning_rate": 9.999520642496623e-05, + "loss": 0.01582871936261654, + "num_input_tokens_seen": 3897488, + "step": 238, + "train_runtime": 1935.8176, + "train_tokens_per_second": 2013.355 + }, + { + "epoch": 0.14484848484848484, + "grad_norm": 0.06399150937795639, + "learning_rate": 9.999507234762e-05, + "loss": 0.015461472794413567, + "num_input_tokens_seen": 3913864, + "step": 239, + "train_runtime": 1943.945, + "train_tokens_per_second": 2013.361 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 0.027640361338853836, + "learning_rate": 9.999493642105342e-05, + "loss": 0.01647048071026802, + "num_input_tokens_seen": 3930240, + "step": 240, + "train_runtime": 1952.0688, + "train_tokens_per_second": 2013.372 + }, + { + "epoch": 0.14606060606060606, + "grad_norm": 0.07313567399978638, + "learning_rate": 9.999479864527148e-05, + "loss": 0.015903417021036148, + "num_input_tokens_seen": 3946616, + "step": 241, + "train_runtime": 1960.1915, + "train_tokens_per_second": 2013.383 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.09255962073802948, + "learning_rate": 9.999465902027931e-05, + "loss": 0.01633605733513832, + "num_input_tokens_seen": 3962992, + "step": 242, + "train_runtime": 1968.3145, + "train_tokens_per_second": 2013.394 + }, + { + "epoch": 0.14727272727272728, + "grad_norm": 0.06311100721359253, + "learning_rate": 9.999451754608207e-05, + "loss": 0.018459340557456017, + "num_input_tokens_seen": 3979368, + "step": 243, + "train_runtime": 1976.4343, + "train_tokens_per_second": 2013.408 + }, + { + "epoch": 0.1478787878787879, + "grad_norm": 0.04240158200263977, + "learning_rate": 9.999437422268498e-05, + "loss": 0.01432002056390047, + "num_input_tokens_seen": 3995744, + "step": 244, + "train_runtime": 1984.5577, + "train_tokens_per_second": 2013.418 + }, + { + "epoch": 0.1484848484848485, + "grad_norm": 0.05550538748502731, + "learning_rate": 9.999422905009335e-05, + "loss": 0.014518518932163715, + "num_input_tokens_seen": 4012120, + "step": 245, + "train_runtime": 1992.685, + "train_tokens_per_second": 2013.424 + }, + { + "epoch": 0.14909090909090908, + "grad_norm": 0.037221502512693405, + "learning_rate": 9.999408202831255e-05, + "loss": 0.014823012985289097, + "num_input_tokens_seen": 4028496, + "step": 246, + "train_runtime": 2000.8075, + "train_tokens_per_second": 2013.435 + }, + { + "epoch": 0.1496969696969697, + "grad_norm": 0.06923341751098633, + "learning_rate": 9.999393315734801e-05, + "loss": 0.018903765827417374, + "num_input_tokens_seen": 4044872, + "step": 247, + "train_runtime": 2008.9335, + "train_tokens_per_second": 2013.443 + }, + { + "epoch": 0.1503030303030303, + "grad_norm": 0.07023045420646667, + "learning_rate": 9.999378243720523e-05, + "loss": 0.01768019236624241, + "num_input_tokens_seen": 4061248, + "step": 248, + "train_runtime": 2017.0572, + "train_tokens_per_second": 2013.452 + }, + { + "epoch": 0.1509090909090909, + "grad_norm": 0.04301533102989197, + "learning_rate": 9.999362986788981e-05, + "loss": 0.016754839569330215, + "num_input_tokens_seen": 4077624, + "step": 249, + "train_runtime": 2025.1771, + "train_tokens_per_second": 2013.465 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.08630920946598053, + "learning_rate": 9.999347544940739e-05, + "loss": 0.014999642968177795, + "num_input_tokens_seen": 4094000, + "step": 250, + "train_runtime": 2033.2978, + "train_tokens_per_second": 2013.478 + }, + { + "epoch": 0.15212121212121213, + "grad_norm": 0.03872856870293617, + "learning_rate": 9.999331918176365e-05, + "loss": 0.015648486092686653, + "num_input_tokens_seen": 4110376, + "step": 251, + "train_runtime": 2041.4306, + "train_tokens_per_second": 2013.478 + }, + { + "epoch": 0.15272727272727274, + "grad_norm": 0.0624275766313076, + "learning_rate": 9.999316106496439e-05, + "loss": 0.015371391549706459, + "num_input_tokens_seen": 4126752, + "step": 252, + "train_runtime": 2049.5498, + "train_tokens_per_second": 2013.492 + }, + { + "epoch": 0.15333333333333332, + "grad_norm": 0.03090560808777809, + "learning_rate": 9.999300109901548e-05, + "loss": 0.013192292302846909, + "num_input_tokens_seen": 4143128, + "step": 253, + "train_runtime": 2057.6702, + "train_tokens_per_second": 2013.504 + }, + { + "epoch": 0.15393939393939393, + "grad_norm": 0.5114591121673584, + "learning_rate": 9.99928392839228e-05, + "loss": 0.018224472180008888, + "num_input_tokens_seen": 4159504, + "step": 254, + "train_runtime": 2065.8007, + "train_tokens_per_second": 2013.507 + }, + { + "epoch": 0.15454545454545454, + "grad_norm": 0.05735045298933983, + "learning_rate": 9.999267561969235e-05, + "loss": 0.017389601096510887, + "num_input_tokens_seen": 4175880, + "step": 255, + "train_runtime": 2073.9307, + "train_tokens_per_second": 2013.51 + }, + { + "epoch": 0.15515151515151515, + "grad_norm": 0.13113801181316376, + "learning_rate": 9.99925101063302e-05, + "loss": 0.015801645815372467, + "num_input_tokens_seen": 4192256, + "step": 256, + "train_runtime": 2082.0503, + "train_tokens_per_second": 2013.523 + }, + { + "epoch": 0.15575757575757576, + "grad_norm": 0.1659373939037323, + "learning_rate": 9.999234274384244e-05, + "loss": 0.016719762235879898, + "num_input_tokens_seen": 4208632, + "step": 257, + "train_runtime": 2090.1723, + "train_tokens_per_second": 2013.534 + }, + { + "epoch": 0.15636363636363637, + "grad_norm": 0.09268343448638916, + "learning_rate": 9.99921735322353e-05, + "loss": 0.01958809420466423, + "num_input_tokens_seen": 4225008, + "step": 258, + "train_runtime": 2098.2916, + "train_tokens_per_second": 2013.547 + }, + { + "epoch": 0.15696969696969698, + "grad_norm": 0.08097874373197556, + "learning_rate": 9.999200247151499e-05, + "loss": 0.01584583893418312, + "num_input_tokens_seen": 4241384, + "step": 259, + "train_runtime": 2106.4305, + "train_tokens_per_second": 2013.541 + }, + { + "epoch": 0.15757575757575756, + "grad_norm": 0.072023406624794, + "learning_rate": 9.999182956168787e-05, + "loss": 0.0168259609490633, + "num_input_tokens_seen": 4257760, + "step": 260, + "train_runtime": 2114.5526, + "train_tokens_per_second": 2013.551 + }, + { + "epoch": 0.15818181818181817, + "grad_norm": 0.038404542952775955, + "learning_rate": 9.999165480276034e-05, + "loss": 0.014127206057310104, + "num_input_tokens_seen": 4274136, + "step": 261, + "train_runtime": 2122.6772, + "train_tokens_per_second": 2013.559 + }, + { + "epoch": 0.15878787878787878, + "grad_norm": 0.03950539231300354, + "learning_rate": 9.999147819473884e-05, + "loss": 0.016822200268507004, + "num_input_tokens_seen": 4290512, + "step": 262, + "train_runtime": 2130.7967, + "train_tokens_per_second": 2013.572 + }, + { + "epoch": 0.1593939393939394, + "grad_norm": 0.04290624335408211, + "learning_rate": 9.999129973762992e-05, + "loss": 0.016068218275904655, + "num_input_tokens_seen": 4306888, + "step": 263, + "train_runtime": 2138.9172, + "train_tokens_per_second": 2013.583 + }, + { + "epoch": 0.16, + "grad_norm": 0.05928179994225502, + "learning_rate": 9.99911194314402e-05, + "loss": 0.016628028824925423, + "num_input_tokens_seen": 4323264, + "step": 264, + "train_runtime": 2147.039, + "train_tokens_per_second": 2013.594 + }, + { + "epoch": 0.1606060606060606, + "grad_norm": 0.04302699863910675, + "learning_rate": 9.99909372761763e-05, + "loss": 0.014704343862831593, + "num_input_tokens_seen": 4339640, + "step": 265, + "train_runtime": 2155.1707, + "train_tokens_per_second": 2013.595 + }, + { + "epoch": 0.16121212121212122, + "grad_norm": 0.047466881573200226, + "learning_rate": 9.999075327184499e-05, + "loss": 0.016627237200737, + "num_input_tokens_seen": 4356016, + "step": 266, + "train_runtime": 2163.294, + "train_tokens_per_second": 2013.603 + }, + { + "epoch": 0.1618181818181818, + "grad_norm": 0.04007207974791527, + "learning_rate": 9.999056741845305e-05, + "loss": 0.01723393052816391, + "num_input_tokens_seen": 4372392, + "step": 267, + "train_runtime": 2171.417, + "train_tokens_per_second": 2013.612 + }, + { + "epoch": 0.16242424242424242, + "grad_norm": 0.04319130629301071, + "learning_rate": 9.99903797160074e-05, + "loss": 0.014541544020175934, + "num_input_tokens_seen": 4388768, + "step": 268, + "train_runtime": 2179.5352, + "train_tokens_per_second": 2013.626 + }, + { + "epoch": 0.16303030303030303, + "grad_norm": 0.02772807702422142, + "learning_rate": 9.999019016451494e-05, + "loss": 0.01326832640916109, + "num_input_tokens_seen": 4405144, + "step": 269, + "train_runtime": 2187.6543, + "train_tokens_per_second": 2013.638 + }, + { + "epoch": 0.16363636363636364, + "grad_norm": 0.03225944936275482, + "learning_rate": 9.998999876398271e-05, + "loss": 0.013814960606396198, + "num_input_tokens_seen": 4421520, + "step": 270, + "train_runtime": 2195.7724, + "train_tokens_per_second": 2013.651 + }, + { + "epoch": 0.16424242424242425, + "grad_norm": 0.03607013449072838, + "learning_rate": 9.998980551441776e-05, + "loss": 0.01566735841333866, + "num_input_tokens_seen": 4437896, + "step": 271, + "train_runtime": 2203.8921, + "train_tokens_per_second": 2013.663 + }, + { + "epoch": 0.16484848484848486, + "grad_norm": 0.02214481309056282, + "learning_rate": 9.998961041582727e-05, + "loss": 0.014288516715168953, + "num_input_tokens_seen": 4454272, + "step": 272, + "train_runtime": 2212.0309, + "train_tokens_per_second": 2013.657 + }, + { + "epoch": 0.16545454545454547, + "grad_norm": 0.03539419174194336, + "learning_rate": 9.998941346821844e-05, + "loss": 0.016615379601716995, + "num_input_tokens_seen": 4470648, + "step": 273, + "train_runtime": 2220.1513, + "train_tokens_per_second": 2013.668 + }, + { + "epoch": 0.16606060606060605, + "grad_norm": 0.02361457794904709, + "learning_rate": 9.998921467159855e-05, + "loss": 0.015559839084744453, + "num_input_tokens_seen": 4487024, + "step": 274, + "train_runtime": 2228.2688, + "train_tokens_per_second": 2013.682 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.029787061735987663, + "learning_rate": 9.998901402597496e-05, + "loss": 0.014054241590201855, + "num_input_tokens_seen": 4503400, + "step": 275, + "train_runtime": 2236.3965, + "train_tokens_per_second": 2013.686 + }, + { + "epoch": 0.16727272727272727, + "grad_norm": 0.08080027997493744, + "learning_rate": 9.99888115313551e-05, + "loss": 0.01626443862915039, + "num_input_tokens_seen": 4519776, + "step": 276, + "train_runtime": 2244.5116, + "train_tokens_per_second": 2013.701 + }, + { + "epoch": 0.16787878787878788, + "grad_norm": 0.04751146212220192, + "learning_rate": 9.998860718774643e-05, + "loss": 0.015646975487470627, + "num_input_tokens_seen": 4536152, + "step": 277, + "train_runtime": 2252.6223, + "train_tokens_per_second": 2013.721 + }, + { + "epoch": 0.1684848484848485, + "grad_norm": 0.11396390199661255, + "learning_rate": 9.998840099515655e-05, + "loss": 0.01626933366060257, + "num_input_tokens_seen": 4552528, + "step": 278, + "train_runtime": 2260.7374, + "train_tokens_per_second": 2013.736 + }, + { + "epoch": 0.1690909090909091, + "grad_norm": 0.03807124122977257, + "learning_rate": 9.998819295359305e-05, + "loss": 0.01517193578183651, + "num_input_tokens_seen": 4568904, + "step": 279, + "train_runtime": 2268.8455, + "train_tokens_per_second": 2013.757 + }, + { + "epoch": 0.1696969696969697, + "grad_norm": 0.07842900604009628, + "learning_rate": 9.998798306306366e-05, + "loss": 0.016375314444303513, + "num_input_tokens_seen": 4585280, + "step": 280, + "train_runtime": 2276.9581, + "train_tokens_per_second": 2013.774 + }, + { + "epoch": 0.1703030303030303, + "grad_norm": 0.12316741049289703, + "learning_rate": 9.99877713235761e-05, + "loss": 0.0158452857285738, + "num_input_tokens_seen": 4601656, + "step": 281, + "train_runtime": 2285.07, + "train_tokens_per_second": 2013.792 + }, + { + "epoch": 0.1709090909090909, + "grad_norm": 0.035711321979761124, + "learning_rate": 9.998755773513824e-05, + "loss": 0.014004937373101711, + "num_input_tokens_seen": 4618032, + "step": 282, + "train_runtime": 2293.1794, + "train_tokens_per_second": 2013.812 + }, + { + "epoch": 0.1715151515151515, + "grad_norm": 0.04513373225927353, + "learning_rate": 9.998734229775794e-05, + "loss": 0.015064300037920475, + "num_input_tokens_seen": 4634408, + "step": 283, + "train_runtime": 2301.2911, + "train_tokens_per_second": 2013.83 + }, + { + "epoch": 0.17212121212121212, + "grad_norm": 0.04803522303700447, + "learning_rate": 9.998712501144323e-05, + "loss": 0.015632454305887222, + "num_input_tokens_seen": 4650784, + "step": 284, + "train_runtime": 2309.4064, + "train_tokens_per_second": 2013.844 + }, + { + "epoch": 0.17272727272727273, + "grad_norm": 0.0677453801035881, + "learning_rate": 9.99869058762021e-05, + "loss": 0.01668519154191017, + "num_input_tokens_seen": 4667160, + "step": 285, + "train_runtime": 2317.5195, + "train_tokens_per_second": 2013.86 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.06408604979515076, + "learning_rate": 9.998668489204266e-05, + "loss": 0.016011208295822144, + "num_input_tokens_seen": 4683536, + "step": 286, + "train_runtime": 2325.6311, + "train_tokens_per_second": 2013.877 + }, + { + "epoch": 0.17393939393939395, + "grad_norm": 0.049628015607595444, + "learning_rate": 9.998646205897309e-05, + "loss": 0.015140787698328495, + "num_input_tokens_seen": 4699912, + "step": 287, + "train_runtime": 2333.7425, + "train_tokens_per_second": 2013.895 + }, + { + "epoch": 0.17454545454545456, + "grad_norm": 0.05506971478462219, + "learning_rate": 9.998623737700163e-05, + "loss": 0.014441089704632759, + "num_input_tokens_seen": 4716288, + "step": 288, + "train_runtime": 2341.8537, + "train_tokens_per_second": 2013.912 + }, + { + "epoch": 0.17515151515151514, + "grad_norm": 0.04357004538178444, + "learning_rate": 9.99860108461366e-05, + "loss": 0.014559566974639893, + "num_input_tokens_seen": 4732664, + "step": 289, + "train_runtime": 2349.9687, + "train_tokens_per_second": 2013.926 + }, + { + "epoch": 0.17575757575757575, + "grad_norm": 0.03436315059661865, + "learning_rate": 9.998578246638637e-05, + "loss": 0.014904836192727089, + "num_input_tokens_seen": 4749040, + "step": 290, + "train_runtime": 2358.082, + "train_tokens_per_second": 2013.942 + }, + { + "epoch": 0.17636363636363636, + "grad_norm": 0.030473578721284866, + "learning_rate": 9.99855522377594e-05, + "loss": 0.013786690309643745, + "num_input_tokens_seen": 4765416, + "step": 291, + "train_runtime": 2366.1924, + "train_tokens_per_second": 2013.96 + }, + { + "epoch": 0.17696969696969697, + "grad_norm": 0.033072736114263535, + "learning_rate": 9.998532016026418e-05, + "loss": 0.016431497409939766, + "num_input_tokens_seen": 4781792, + "step": 292, + "train_runtime": 2374.3035, + "train_tokens_per_second": 2013.977 + }, + { + "epoch": 0.17757575757575758, + "grad_norm": 0.03811201453208923, + "learning_rate": 9.998508623390932e-05, + "loss": 0.014959779568016529, + "num_input_tokens_seen": 4798168, + "step": 293, + "train_runtime": 2382.4135, + "train_tokens_per_second": 2013.995 + }, + { + "epoch": 0.1781818181818182, + "grad_norm": 0.04069237411022186, + "learning_rate": 9.998485045870344e-05, + "loss": 0.016118772327899933, + "num_input_tokens_seen": 4814544, + "step": 294, + "train_runtime": 2390.5227, + "train_tokens_per_second": 2014.013 + }, + { + "epoch": 0.1787878787878788, + "grad_norm": 0.031989723443984985, + "learning_rate": 9.99846128346553e-05, + "loss": 0.01669073849916458, + "num_input_tokens_seen": 4830920, + "step": 295, + "train_runtime": 2398.6348, + "train_tokens_per_second": 2014.029 + }, + { + "epoch": 0.17939393939393938, + "grad_norm": 0.03683701902627945, + "learning_rate": 9.998437336177369e-05, + "loss": 0.014967912808060646, + "num_input_tokens_seen": 4847296, + "step": 296, + "train_runtime": 2406.7421, + "train_tokens_per_second": 2014.049 + }, + { + "epoch": 0.18, + "grad_norm": 0.057917602360248566, + "learning_rate": 9.998413204006742e-05, + "loss": 0.018314681947231293, + "num_input_tokens_seen": 4863672, + "step": 297, + "train_runtime": 2414.8505, + "train_tokens_per_second": 2014.068 + }, + { + "epoch": 0.1806060606060606, + "grad_norm": 0.042889710515737534, + "learning_rate": 9.998388886954547e-05, + "loss": 0.014539923518896103, + "num_input_tokens_seen": 4880048, + "step": 298, + "train_runtime": 2422.9583, + "train_tokens_per_second": 2014.087 + }, + { + "epoch": 0.1812121212121212, + "grad_norm": 0.04697619378566742, + "learning_rate": 9.998364385021679e-05, + "loss": 0.01652900129556656, + "num_input_tokens_seen": 4896424, + "step": 299, + "train_runtime": 2431.0701, + "train_tokens_per_second": 2014.102 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.038388364017009735, + "learning_rate": 9.998339698209046e-05, + "loss": 0.013660457916557789, + "num_input_tokens_seen": 4912800, + "step": 300, + "train_runtime": 2439.1822, + "train_tokens_per_second": 2014.118 + }, + { + "epoch": 0.18242424242424243, + "grad_norm": 0.026958242058753967, + "learning_rate": 9.998314826517563e-05, + "loss": 0.015251623466610909, + "num_input_tokens_seen": 4929176, + "step": 301, + "train_runtime": 2448.2631, + "train_tokens_per_second": 2013.336 + }, + { + "epoch": 0.18303030303030304, + "grad_norm": 0.04779147729277611, + "learning_rate": 9.998289769948147e-05, + "loss": 0.012775855138897896, + "num_input_tokens_seen": 4945552, + "step": 302, + "train_runtime": 2456.368, + "train_tokens_per_second": 2013.36 + }, + { + "epoch": 0.18363636363636363, + "grad_norm": 0.03123384155333042, + "learning_rate": 9.998264528501727e-05, + "loss": 0.015583731234073639, + "num_input_tokens_seen": 4961928, + "step": 303, + "train_runtime": 2464.4763, + "train_tokens_per_second": 2013.38 + }, + { + "epoch": 0.18424242424242424, + "grad_norm": 0.05030890926718712, + "learning_rate": 9.998239102179236e-05, + "loss": 0.013868209905922413, + "num_input_tokens_seen": 4978304, + "step": 304, + "train_runtime": 2472.5834, + "train_tokens_per_second": 2013.402 + }, + { + "epoch": 0.18484848484848485, + "grad_norm": 0.033021751791238785, + "learning_rate": 9.998213490981614e-05, + "loss": 0.016501927748322487, + "num_input_tokens_seen": 4994680, + "step": 305, + "train_runtime": 2480.6921, + "train_tokens_per_second": 2013.422 + }, + { + "epoch": 0.18545454545454546, + "grad_norm": 0.050541143864393234, + "learning_rate": 9.998187694909807e-05, + "loss": 0.01771150343120098, + "num_input_tokens_seen": 5011056, + "step": 306, + "train_runtime": 2488.7992, + "train_tokens_per_second": 2013.443 + }, + { + "epoch": 0.18606060606060607, + "grad_norm": 0.04063250124454498, + "learning_rate": 9.998161713964774e-05, + "loss": 0.015554912388324738, + "num_input_tokens_seen": 5027432, + "step": 307, + "train_runtime": 2496.9044, + "train_tokens_per_second": 2013.466 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.02722395956516266, + "learning_rate": 9.998135548147469e-05, + "loss": 0.013613277114927769, + "num_input_tokens_seen": 5043808, + "step": 308, + "train_runtime": 2505.0089, + "train_tokens_per_second": 2013.489 + }, + { + "epoch": 0.18727272727272729, + "grad_norm": 0.02678558975458145, + "learning_rate": 9.998109197458865e-05, + "loss": 0.014953495003283024, + "num_input_tokens_seen": 5060184, + "step": 309, + "train_runtime": 2513.1157, + "train_tokens_per_second": 2013.51 + }, + { + "epoch": 0.18787878787878787, + "grad_norm": 0.02857518568634987, + "learning_rate": 9.998082661899935e-05, + "loss": 0.013844496570527554, + "num_input_tokens_seen": 5076560, + "step": 310, + "train_runtime": 2521.2304, + "train_tokens_per_second": 2013.525 + }, + { + "epoch": 0.18848484848484848, + "grad_norm": 0.2615605294704437, + "learning_rate": 9.998055941471662e-05, + "loss": 0.01809251680970192, + "num_input_tokens_seen": 5092936, + "step": 311, + "train_runtime": 2529.3408, + "train_tokens_per_second": 2013.543 + }, + { + "epoch": 0.1890909090909091, + "grad_norm": 0.029859403148293495, + "learning_rate": 9.998029036175031e-05, + "loss": 0.015970397740602493, + "num_input_tokens_seen": 5109312, + "step": 312, + "train_runtime": 2537.4488, + "train_tokens_per_second": 2013.563 + }, + { + "epoch": 0.1896969696969697, + "grad_norm": 0.03636668995022774, + "learning_rate": 9.99800194601104e-05, + "loss": 0.01580364629626274, + "num_input_tokens_seen": 5125688, + "step": 313, + "train_runtime": 2545.553, + "train_tokens_per_second": 2013.585 + }, + { + "epoch": 0.1903030303030303, + "grad_norm": 0.0684208944439888, + "learning_rate": 9.997974670980691e-05, + "loss": 0.017103755846619606, + "num_input_tokens_seen": 5142064, + "step": 314, + "train_runtime": 2553.6615, + "train_tokens_per_second": 2013.604 + }, + { + "epoch": 0.19090909090909092, + "grad_norm": 0.028665577992796898, + "learning_rate": 9.997947211084991e-05, + "loss": 0.014511539600789547, + "num_input_tokens_seen": 5158440, + "step": 315, + "train_runtime": 2561.7735, + "train_tokens_per_second": 2013.621 + }, + { + "epoch": 0.19151515151515153, + "grad_norm": 0.09884219616651535, + "learning_rate": 9.997919566324959e-05, + "loss": 0.014168107882142067, + "num_input_tokens_seen": 5174816, + "step": 316, + "train_runtime": 2569.8855, + "train_tokens_per_second": 2013.637 + }, + { + "epoch": 0.1921212121212121, + "grad_norm": 0.1779116839170456, + "learning_rate": 9.997891736701613e-05, + "loss": 0.014995518140494823, + "num_input_tokens_seen": 5191192, + "step": 317, + "train_runtime": 2577.9971, + "train_tokens_per_second": 2013.653 + }, + { + "epoch": 0.19272727272727272, + "grad_norm": 0.030352341011166573, + "learning_rate": 9.997863722215983e-05, + "loss": 0.014715241268277168, + "num_input_tokens_seen": 5207568, + "step": 318, + "train_runtime": 2586.1052, + "train_tokens_per_second": 2013.672 + }, + { + "epoch": 0.19333333333333333, + "grad_norm": 0.03511129692196846, + "learning_rate": 9.99783552286911e-05, + "loss": 0.01499946229159832, + "num_input_tokens_seen": 5223944, + "step": 319, + "train_runtime": 2594.2129, + "train_tokens_per_second": 2013.691 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 0.04475672170519829, + "learning_rate": 9.997807138662033e-05, + "loss": 0.014523375779390335, + "num_input_tokens_seen": 5240320, + "step": 320, + "train_runtime": 2602.3206, + "train_tokens_per_second": 2013.71 + }, + { + "epoch": 0.19454545454545455, + "grad_norm": 0.02900783158838749, + "learning_rate": 9.997778569595801e-05, + "loss": 0.015447665005922318, + "num_input_tokens_seen": 5256696, + "step": 321, + "train_runtime": 2610.4318, + "train_tokens_per_second": 2013.727 + }, + { + "epoch": 0.19515151515151516, + "grad_norm": 0.022910727187991142, + "learning_rate": 9.997749815671473e-05, + "loss": 0.013799930922687054, + "num_input_tokens_seen": 5273072, + "step": 322, + "train_runtime": 2618.541, + "train_tokens_per_second": 2013.744 + }, + { + "epoch": 0.19575757575757577, + "grad_norm": 0.03925245255231857, + "learning_rate": 9.997720876890113e-05, + "loss": 0.013741591945290565, + "num_input_tokens_seen": 5289448, + "step": 323, + "train_runtime": 2626.6511, + "train_tokens_per_second": 2013.761 + }, + { + "epoch": 0.19636363636363635, + "grad_norm": 0.029477456584572792, + "learning_rate": 9.997691753252791e-05, + "loss": 0.013831754215061665, + "num_input_tokens_seen": 5305824, + "step": 324, + "train_runtime": 2634.7586, + "train_tokens_per_second": 2013.78 + }, + { + "epoch": 0.19696969696969696, + "grad_norm": 0.0368235781788826, + "learning_rate": 9.997662444760583e-05, + "loss": 0.014774560928344727, + "num_input_tokens_seen": 5322200, + "step": 325, + "train_runtime": 2642.8689, + "train_tokens_per_second": 2013.796 + }, + { + "epoch": 0.19757575757575757, + "grad_norm": 0.04399452731013298, + "learning_rate": 9.997632951414573e-05, + "loss": 0.014160547405481339, + "num_input_tokens_seen": 5338576, + "step": 326, + "train_runtime": 2650.978, + "train_tokens_per_second": 2013.814 + }, + { + "epoch": 0.19818181818181818, + "grad_norm": 0.02241128869354725, + "learning_rate": 9.997603273215853e-05, + "loss": 0.013626255095005035, + "num_input_tokens_seen": 5354952, + "step": 327, + "train_runtime": 2659.0857, + "train_tokens_per_second": 2013.832 + }, + { + "epoch": 0.1987878787878788, + "grad_norm": 0.022924182936549187, + "learning_rate": 9.99757341016552e-05, + "loss": 0.013918038457632065, + "num_input_tokens_seen": 5371328, + "step": 328, + "train_runtime": 2667.1942, + "train_tokens_per_second": 2013.85 + }, + { + "epoch": 0.1993939393939394, + "grad_norm": 0.0384218692779541, + "learning_rate": 9.99754336226468e-05, + "loss": 0.01543221715837717, + "num_input_tokens_seen": 5387704, + "step": 329, + "train_runtime": 2675.3051, + "train_tokens_per_second": 2013.865 + }, + { + "epoch": 0.2, + "grad_norm": 0.024983001872897148, + "learning_rate": 9.997513129514442e-05, + "loss": 0.014143919572234154, + "num_input_tokens_seen": 5404080, + "step": 330, + "train_runtime": 2683.4136, + "train_tokens_per_second": 2013.883 + }, + { + "epoch": 0.2006060606060606, + "grad_norm": 0.036509182304143906, + "learning_rate": 9.997482711915927e-05, + "loss": 0.017176145687699318, + "num_input_tokens_seen": 5420456, + "step": 331, + "train_runtime": 2691.5304, + "train_tokens_per_second": 2013.894 + }, + { + "epoch": 0.2012121212121212, + "grad_norm": 0.02530326321721077, + "learning_rate": 9.997452109470257e-05, + "loss": 0.01395807322114706, + "num_input_tokens_seen": 5436832, + "step": 332, + "train_runtime": 2699.6383, + "train_tokens_per_second": 2013.911 + }, + { + "epoch": 0.2018181818181818, + "grad_norm": 0.026743337512016296, + "learning_rate": 9.997421322178566e-05, + "loss": 0.015008356422185898, + "num_input_tokens_seen": 5453208, + "step": 333, + "train_runtime": 2707.7479, + "train_tokens_per_second": 2013.928 + }, + { + "epoch": 0.20242424242424242, + "grad_norm": 0.03141747787594795, + "learning_rate": 9.997390350041993e-05, + "loss": 0.014487622305750847, + "num_input_tokens_seen": 5469584, + "step": 334, + "train_runtime": 2715.8554, + "train_tokens_per_second": 2013.945 + }, + { + "epoch": 0.20303030303030303, + "grad_norm": 0.03556372597813606, + "learning_rate": 9.997359193061681e-05, + "loss": 0.014322612434625626, + "num_input_tokens_seen": 5485960, + "step": 335, + "train_runtime": 2723.964, + "train_tokens_per_second": 2013.962 + }, + { + "epoch": 0.20363636363636364, + "grad_norm": 0.05319400504231453, + "learning_rate": 9.997327851238788e-05, + "loss": 0.015110835433006287, + "num_input_tokens_seen": 5502336, + "step": 336, + "train_runtime": 2732.0746, + "train_tokens_per_second": 2013.977 + }, + { + "epoch": 0.20424242424242425, + "grad_norm": 0.05987285077571869, + "learning_rate": 9.997296324574467e-05, + "loss": 0.015784846618771553, + "num_input_tokens_seen": 5518712, + "step": 337, + "train_runtime": 2740.1837, + "train_tokens_per_second": 2013.993 + }, + { + "epoch": 0.20484848484848484, + "grad_norm": 0.05444290488958359, + "learning_rate": 9.997264613069887e-05, + "loss": 0.016434665769338608, + "num_input_tokens_seen": 5535088, + "step": 338, + "train_runtime": 2748.2918, + "train_tokens_per_second": 2014.01 + }, + { + "epoch": 0.20545454545454545, + "grad_norm": 0.03842825070023537, + "learning_rate": 9.997232716726222e-05, + "loss": 0.01436456385999918, + "num_input_tokens_seen": 5551464, + "step": 339, + "train_runtime": 2756.4036, + "train_tokens_per_second": 2014.024 + }, + { + "epoch": 0.20606060606060606, + "grad_norm": 0.0297915730625391, + "learning_rate": 9.997200635544648e-05, + "loss": 0.014456460252404213, + "num_input_tokens_seen": 5567840, + "step": 340, + "train_runtime": 2764.5114, + "train_tokens_per_second": 2014.041 + }, + { + "epoch": 0.20666666666666667, + "grad_norm": 0.030197616666555405, + "learning_rate": 9.997168369526355e-05, + "loss": 0.013316805474460125, + "num_input_tokens_seen": 5584216, + "step": 341, + "train_runtime": 2772.6201, + "train_tokens_per_second": 2014.057 + }, + { + "epoch": 0.20727272727272728, + "grad_norm": 0.04718567803502083, + "learning_rate": 9.997135918672536e-05, + "loss": 0.014915217645466328, + "num_input_tokens_seen": 5600592, + "step": 342, + "train_runtime": 2780.7298, + "train_tokens_per_second": 2014.073 + }, + { + "epoch": 0.20787878787878789, + "grad_norm": 0.04453250393271446, + "learning_rate": 9.997103282984391e-05, + "loss": 0.013720309361815453, + "num_input_tokens_seen": 5616968, + "step": 343, + "train_runtime": 2788.839, + "train_tokens_per_second": 2014.088 + }, + { + "epoch": 0.2084848484848485, + "grad_norm": 0.028496714308857918, + "learning_rate": 9.997070462463127e-05, + "loss": 0.015428826212882996, + "num_input_tokens_seen": 5633344, + "step": 344, + "train_runtime": 2796.9478, + "train_tokens_per_second": 2014.104 + }, + { + "epoch": 0.20909090909090908, + "grad_norm": 0.025575809180736542, + "learning_rate": 9.99703745710996e-05, + "loss": 0.014485862106084824, + "num_input_tokens_seen": 5649720, + "step": 345, + "train_runtime": 2805.0546, + "train_tokens_per_second": 2014.121 + }, + { + "epoch": 0.2096969696969697, + "grad_norm": 0.03871789202094078, + "learning_rate": 9.997004266926105e-05, + "loss": 0.013593616895377636, + "num_input_tokens_seen": 5666096, + "step": 346, + "train_runtime": 2813.1609, + "train_tokens_per_second": 2014.139 + }, + { + "epoch": 0.2103030303030303, + "grad_norm": 0.07384062558412552, + "learning_rate": 9.996970891912794e-05, + "loss": 0.015072252601385117, + "num_input_tokens_seen": 5682472, + "step": 347, + "train_runtime": 2821.2688, + "train_tokens_per_second": 2014.155 + }, + { + "epoch": 0.2109090909090909, + "grad_norm": 0.041799403727054596, + "learning_rate": 9.996937332071263e-05, + "loss": 0.014217150397598743, + "num_input_tokens_seen": 5698848, + "step": 348, + "train_runtime": 2829.3783, + "train_tokens_per_second": 2014.17 + }, + { + "epoch": 0.21151515151515152, + "grad_norm": 0.04895857349038124, + "learning_rate": 9.99690358740275e-05, + "loss": 0.017368610948324203, + "num_input_tokens_seen": 5715224, + "step": 349, + "train_runtime": 2837.4869, + "train_tokens_per_second": 2014.185 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 0.03166350722312927, + "learning_rate": 9.996869657908504e-05, + "loss": 0.014376340433955193, + "num_input_tokens_seen": 5731600, + "step": 350, + "train_runtime": 2845.6047, + "train_tokens_per_second": 2014.194 + }, + { + "epoch": 0.21272727272727274, + "grad_norm": 0.06105640158057213, + "learning_rate": 9.996835543589781e-05, + "loss": 0.01661105453968048, + "num_input_tokens_seen": 5747976, + "step": 351, + "train_runtime": 2853.7303, + "train_tokens_per_second": 2014.197 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.038000039756298065, + "learning_rate": 9.996801244447842e-05, + "loss": 0.013641721569001675, + "num_input_tokens_seen": 5764352, + "step": 352, + "train_runtime": 2861.847, + "train_tokens_per_second": 2014.207 + }, + { + "epoch": 0.21393939393939393, + "grad_norm": 0.033811476081609726, + "learning_rate": 9.996766760483956e-05, + "loss": 0.01525929756462574, + "num_input_tokens_seen": 5780728, + "step": 353, + "train_runtime": 2869.9635, + "train_tokens_per_second": 2014.217 + }, + { + "epoch": 0.21454545454545454, + "grad_norm": 0.01919690892100334, + "learning_rate": 9.996732091699396e-05, + "loss": 0.013008120469748974, + "num_input_tokens_seen": 5797104, + "step": 354, + "train_runtime": 2878.0782, + "train_tokens_per_second": 2014.227 + }, + { + "epoch": 0.21515151515151515, + "grad_norm": 0.03718187287449837, + "learning_rate": 9.99669723809545e-05, + "loss": 0.015754155814647675, + "num_input_tokens_seen": 5813480, + "step": 355, + "train_runtime": 2886.1934, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.21575757575757576, + "grad_norm": 0.03534379228949547, + "learning_rate": 9.996662199673401e-05, + "loss": 0.014936118386685848, + "num_input_tokens_seen": 5829856, + "step": 356, + "train_runtime": 2894.3081, + "train_tokens_per_second": 2014.249 + }, + { + "epoch": 0.21636363636363637, + "grad_norm": 0.024305060505867004, + "learning_rate": 9.99662697643455e-05, + "loss": 0.01359601877629757, + "num_input_tokens_seen": 5846232, + "step": 357, + "train_runtime": 2902.4301, + "train_tokens_per_second": 2014.254 + }, + { + "epoch": 0.21696969696969698, + "grad_norm": 0.027639245614409447, + "learning_rate": 9.996591568380196e-05, + "loss": 0.014319726265966892, + "num_input_tokens_seen": 5862608, + "step": 358, + "train_runtime": 2910.5461, + "train_tokens_per_second": 2014.264 + }, + { + "epoch": 0.2175757575757576, + "grad_norm": 0.06455444544553757, + "learning_rate": 9.996555975511652e-05, + "loss": 0.013829253613948822, + "num_input_tokens_seen": 5878984, + "step": 359, + "train_runtime": 2918.6646, + "train_tokens_per_second": 2014.272 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 0.02637428045272827, + "learning_rate": 9.996520197830231e-05, + "loss": 0.01420363038778305, + "num_input_tokens_seen": 5895360, + "step": 360, + "train_runtime": 2926.7812, + "train_tokens_per_second": 2014.281 + }, + { + "epoch": 0.21878787878787878, + "grad_norm": 0.06233112886548042, + "learning_rate": 9.99648423533726e-05, + "loss": 0.017570551484823227, + "num_input_tokens_seen": 5911736, + "step": 361, + "train_runtime": 2934.9002, + "train_tokens_per_second": 2014.289 + }, + { + "epoch": 0.2193939393939394, + "grad_norm": 0.04012456163764, + "learning_rate": 9.996448088034065e-05, + "loss": 0.015336515381932259, + "num_input_tokens_seen": 5928112, + "step": 362, + "train_runtime": 2943.0179, + "train_tokens_per_second": 2014.297 + }, + { + "epoch": 0.22, + "grad_norm": 0.029959173873066902, + "learning_rate": 9.996411755921987e-05, + "loss": 0.013176209293305874, + "num_input_tokens_seen": 5944488, + "step": 363, + "train_runtime": 2951.1353, + "train_tokens_per_second": 2014.305 + }, + { + "epoch": 0.2206060606060606, + "grad_norm": 0.045539602637290955, + "learning_rate": 9.996375239002369e-05, + "loss": 0.017476335167884827, + "num_input_tokens_seen": 5960864, + "step": 364, + "train_runtime": 2959.2526, + "train_tokens_per_second": 2014.314 + }, + { + "epoch": 0.22121212121212122, + "grad_norm": 0.04066498950123787, + "learning_rate": 9.996338537276559e-05, + "loss": 0.015315013006329536, + "num_input_tokens_seen": 5977240, + "step": 365, + "train_runtime": 2967.3711, + "train_tokens_per_second": 2014.322 + }, + { + "epoch": 0.22181818181818183, + "grad_norm": 0.055071763694286346, + "learning_rate": 9.996301650745917e-05, + "loss": 0.013316687196493149, + "num_input_tokens_seen": 5993616, + "step": 366, + "train_runtime": 2975.4906, + "train_tokens_per_second": 2014.329 + }, + { + "epoch": 0.2224242424242424, + "grad_norm": 0.020134275779128075, + "learning_rate": 9.996264579411807e-05, + "loss": 0.012931122444570065, + "num_input_tokens_seen": 6009992, + "step": 367, + "train_runtime": 2983.6081, + "train_tokens_per_second": 2014.337 + }, + { + "epoch": 0.22303030303030302, + "grad_norm": 0.0290455874055624, + "learning_rate": 9.9962273232756e-05, + "loss": 0.013352105394005775, + "num_input_tokens_seen": 6026368, + "step": 368, + "train_runtime": 2991.7313, + "train_tokens_per_second": 2014.341 + }, + { + "epoch": 0.22363636363636363, + "grad_norm": 0.03161335363984108, + "learning_rate": 9.996189882338675e-05, + "loss": 0.012487310916185379, + "num_input_tokens_seen": 6042744, + "step": 369, + "train_runtime": 2999.8498, + "train_tokens_per_second": 2014.349 + }, + { + "epoch": 0.22424242424242424, + "grad_norm": 0.05878787115216255, + "learning_rate": 9.996152256602414e-05, + "loss": 0.014912744984030724, + "num_input_tokens_seen": 6059120, + "step": 370, + "train_runtime": 3007.9654, + "train_tokens_per_second": 2014.358 + }, + { + "epoch": 0.22484848484848485, + "grad_norm": 0.029024092480540276, + "learning_rate": 9.996114446068212e-05, + "loss": 0.012249596416950226, + "num_input_tokens_seen": 6075496, + "step": 371, + "train_runtime": 3016.083, + "train_tokens_per_second": 2014.366 + }, + { + "epoch": 0.22545454545454546, + "grad_norm": 0.023940905928611755, + "learning_rate": 9.996076450737465e-05, + "loss": 0.014684991911053658, + "num_input_tokens_seen": 6091872, + "step": 372, + "train_runtime": 3024.1999, + "train_tokens_per_second": 2014.375 + }, + { + "epoch": 0.22606060606060607, + "grad_norm": 0.07777219265699387, + "learning_rate": 9.99603827061158e-05, + "loss": 0.01571383886039257, + "num_input_tokens_seen": 6108248, + "step": 373, + "train_runtime": 3032.3166, + "train_tokens_per_second": 2014.383 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.030761806294322014, + "learning_rate": 9.99599990569197e-05, + "loss": 0.013848803006112576, + "num_input_tokens_seen": 6124624, + "step": 374, + "train_runtime": 3040.4333, + "train_tokens_per_second": 2014.392 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.0438305102288723, + "learning_rate": 9.995961355980051e-05, + "loss": 0.014024798758327961, + "num_input_tokens_seen": 6141000, + "step": 375, + "train_runtime": 3048.549, + "train_tokens_per_second": 2014.401 + }, + { + "epoch": 0.22787878787878788, + "grad_norm": 0.04035346210002899, + "learning_rate": 9.995922621477252e-05, + "loss": 0.014576055109500885, + "num_input_tokens_seen": 6157376, + "step": 376, + "train_runtime": 3056.6655, + "train_tokens_per_second": 2014.41 + }, + { + "epoch": 0.22848484848484849, + "grad_norm": 0.09497886896133423, + "learning_rate": 9.995883702185003e-05, + "loss": 0.014249450527131557, + "num_input_tokens_seen": 6173752, + "step": 377, + "train_runtime": 3064.7824, + "train_tokens_per_second": 2014.418 + }, + { + "epoch": 0.2290909090909091, + "grad_norm": 0.03223222866654396, + "learning_rate": 9.995844598104746e-05, + "loss": 0.013723311945796013, + "num_input_tokens_seen": 6190128, + "step": 378, + "train_runtime": 3072.8984, + "train_tokens_per_second": 2014.426 + }, + { + "epoch": 0.2296969696969697, + "grad_norm": 0.023603513836860657, + "learning_rate": 9.995805309237926e-05, + "loss": 0.015003862790763378, + "num_input_tokens_seen": 6206504, + "step": 379, + "train_runtime": 3081.015, + "train_tokens_per_second": 2014.435 + }, + { + "epoch": 0.23030303030303031, + "grad_norm": 0.07697781920433044, + "learning_rate": 9.995765835585995e-05, + "loss": 0.01642550155520439, + "num_input_tokens_seen": 6222880, + "step": 380, + "train_runtime": 3089.1312, + "train_tokens_per_second": 2014.443 + }, + { + "epoch": 0.2309090909090909, + "grad_norm": 0.06212541460990906, + "learning_rate": 9.995726177150418e-05, + "loss": 0.013186133466660976, + "num_input_tokens_seen": 6239256, + "step": 381, + "train_runtime": 3097.2484, + "train_tokens_per_second": 2014.451 + }, + { + "epoch": 0.2315151515151515, + "grad_norm": 0.04135077819228172, + "learning_rate": 9.995686333932655e-05, + "loss": 0.015075747855007648, + "num_input_tokens_seen": 6255632, + "step": 382, + "train_runtime": 3105.3662, + "train_tokens_per_second": 2014.459 + }, + { + "epoch": 0.23212121212121212, + "grad_norm": 0.03373231366276741, + "learning_rate": 9.995646305934184e-05, + "loss": 0.015022508800029755, + "num_input_tokens_seen": 6272008, + "step": 383, + "train_runtime": 3113.4845, + "train_tokens_per_second": 2014.466 + }, + { + "epoch": 0.23272727272727273, + "grad_norm": 0.052756134420633316, + "learning_rate": 9.995606093156485e-05, + "loss": 0.016195476055145264, + "num_input_tokens_seen": 6288384, + "step": 384, + "train_runtime": 3121.6016, + "train_tokens_per_second": 2014.474 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 0.04732633754611015, + "learning_rate": 9.995565695601045e-05, + "loss": 0.015717167407274246, + "num_input_tokens_seen": 6304760, + "step": 385, + "train_runtime": 3129.7191, + "train_tokens_per_second": 2014.481 + }, + { + "epoch": 0.23393939393939395, + "grad_norm": 0.050964321941137314, + "learning_rate": 9.99552511326936e-05, + "loss": 0.013431689701974392, + "num_input_tokens_seen": 6321136, + "step": 386, + "train_runtime": 3137.836, + "train_tokens_per_second": 2014.489 + }, + { + "epoch": 0.23454545454545456, + "grad_norm": 0.029031990095973015, + "learning_rate": 9.995484346162926e-05, + "loss": 0.013563702814280987, + "num_input_tokens_seen": 6337512, + "step": 387, + "train_runtime": 3145.953, + "train_tokens_per_second": 2014.497 + }, + { + "epoch": 0.23515151515151514, + "grad_norm": 0.03224366530776024, + "learning_rate": 9.995443394283257e-05, + "loss": 0.01605670340359211, + "num_input_tokens_seen": 6353888, + "step": 388, + "train_runtime": 3154.0724, + "train_tokens_per_second": 2014.503 + }, + { + "epoch": 0.23575757575757575, + "grad_norm": 0.03045693039894104, + "learning_rate": 9.995402257631865e-05, + "loss": 0.015148544684052467, + "num_input_tokens_seen": 6370264, + "step": 389, + "train_runtime": 3162.1889, + "train_tokens_per_second": 2014.511 + }, + { + "epoch": 0.23636363636363636, + "grad_norm": 0.027332261204719543, + "learning_rate": 9.995360936210271e-05, + "loss": 0.014781562611460686, + "num_input_tokens_seen": 6386640, + "step": 390, + "train_runtime": 3170.3051, + "train_tokens_per_second": 2014.519 + }, + { + "epoch": 0.23696969696969697, + "grad_norm": 0.023009251803159714, + "learning_rate": 9.995319430020003e-05, + "loss": 0.013627824373543262, + "num_input_tokens_seen": 6403016, + "step": 391, + "train_runtime": 3178.43, + "train_tokens_per_second": 2014.522 + }, + { + "epoch": 0.23757575757575758, + "grad_norm": 0.035416360944509506, + "learning_rate": 9.995277739062599e-05, + "loss": 0.01493286807090044, + "num_input_tokens_seen": 6419392, + "step": 392, + "train_runtime": 3186.5451, + "train_tokens_per_second": 2014.53 + }, + { + "epoch": 0.2381818181818182, + "grad_norm": 0.04003625363111496, + "learning_rate": 9.995235863339598e-05, + "loss": 0.016020091250538826, + "num_input_tokens_seen": 6435768, + "step": 393, + "train_runtime": 3194.6612, + "train_tokens_per_second": 2014.539 + }, + { + "epoch": 0.2387878787878788, + "grad_norm": 0.024710826575756073, + "learning_rate": 9.995193802852552e-05, + "loss": 0.015763292089104652, + "num_input_tokens_seen": 6452144, + "step": 394, + "train_runtime": 3202.7765, + "train_tokens_per_second": 2014.547 + }, + { + "epoch": 0.23939393939393938, + "grad_norm": 0.05250145494937897, + "learning_rate": 9.995151557603013e-05, + "loss": 0.017301952466368675, + "num_input_tokens_seen": 6468520, + "step": 395, + "train_runtime": 3210.893, + "train_tokens_per_second": 2014.555 + }, + { + "epoch": 0.24, + "grad_norm": 0.037685710936784744, + "learning_rate": 9.995109127592546e-05, + "loss": 0.014692970551550388, + "num_input_tokens_seen": 6484896, + "step": 396, + "train_runtime": 3219.0101, + "train_tokens_per_second": 2014.562 + }, + { + "epoch": 0.2406060606060606, + "grad_norm": 0.03617233410477638, + "learning_rate": 9.99506651282272e-05, + "loss": 0.015763459727168083, + "num_input_tokens_seen": 6501272, + "step": 397, + "train_runtime": 3227.1302, + "train_tokens_per_second": 2014.568 + }, + { + "epoch": 0.2412121212121212, + "grad_norm": 0.026065215468406677, + "learning_rate": 9.995023713295111e-05, + "loss": 0.013620332814753056, + "num_input_tokens_seen": 6517648, + "step": 398, + "train_runtime": 3235.2472, + "train_tokens_per_second": 2014.575 + }, + { + "epoch": 0.24181818181818182, + "grad_norm": 0.045087747275829315, + "learning_rate": 9.994980729011303e-05, + "loss": 0.015572777949273586, + "num_input_tokens_seen": 6534024, + "step": 399, + "train_runtime": 3243.3644, + "train_tokens_per_second": 2014.582 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.02911469154059887, + "learning_rate": 9.994937559972884e-05, + "loss": 0.014463523402810097, + "num_input_tokens_seen": 6550400, + "step": 400, + "train_runtime": 3251.4815, + "train_tokens_per_second": 2014.589 + }, + { + "epoch": 0.24303030303030304, + "grad_norm": 0.09026223421096802, + "learning_rate": 9.994894206181452e-05, + "loss": 0.015273511409759521, + "num_input_tokens_seen": 6566776, + "step": 401, + "train_runtime": 3260.5529, + "train_tokens_per_second": 2014.007 + }, + { + "epoch": 0.24363636363636362, + "grad_norm": 0.059329140931367874, + "learning_rate": 9.994850667638611e-05, + "loss": 0.017180006951093674, + "num_input_tokens_seen": 6583152, + "step": 402, + "train_runtime": 3268.6733, + "train_tokens_per_second": 2014.013 + }, + { + "epoch": 0.24424242424242423, + "grad_norm": 0.05259858816862106, + "learning_rate": 9.99480694434597e-05, + "loss": 0.01665383018553257, + "num_input_tokens_seen": 6599528, + "step": 403, + "train_runtime": 3276.7926, + "train_tokens_per_second": 2014.021 + }, + { + "epoch": 0.24484848484848484, + "grad_norm": 0.046337101608514786, + "learning_rate": 9.994763036305148e-05, + "loss": 0.01817156933248043, + "num_input_tokens_seen": 6615904, + "step": 404, + "train_runtime": 3284.9091, + "train_tokens_per_second": 2014.03 + }, + { + "epoch": 0.24545454545454545, + "grad_norm": 0.023166598752141, + "learning_rate": 9.994718943517768e-05, + "loss": 0.012105523608624935, + "num_input_tokens_seen": 6632280, + "step": 405, + "train_runtime": 3293.0293, + "train_tokens_per_second": 2014.036 + }, + { + "epoch": 0.24606060606060606, + "grad_norm": 0.044385019689798355, + "learning_rate": 9.994674665985461e-05, + "loss": 0.01413038745522499, + "num_input_tokens_seen": 6648656, + "step": 406, + "train_runtime": 3301.1473, + "train_tokens_per_second": 2014.044 + }, + { + "epoch": 0.24666666666666667, + "grad_norm": 0.038354646414518356, + "learning_rate": 9.994630203709865e-05, + "loss": 0.015764841809868813, + "num_input_tokens_seen": 6665032, + "step": 407, + "train_runtime": 3309.2652, + "train_tokens_per_second": 2014.052 + }, + { + "epoch": 0.24727272727272728, + "grad_norm": 0.026519082486629486, + "learning_rate": 9.994585556692624e-05, + "loss": 0.015617020428180695, + "num_input_tokens_seen": 6681408, + "step": 408, + "train_runtime": 3317.3836, + "train_tokens_per_second": 2014.06 + }, + { + "epoch": 0.24787878787878787, + "grad_norm": 0.07033390551805496, + "learning_rate": 9.994540724935389e-05, + "loss": 0.01747780106961727, + "num_input_tokens_seen": 6697784, + "step": 409, + "train_runtime": 3325.5001, + "train_tokens_per_second": 2014.068 + }, + { + "epoch": 0.24848484848484848, + "grad_norm": 0.02514197863638401, + "learning_rate": 9.994495708439819e-05, + "loss": 0.01398993656039238, + "num_input_tokens_seen": 6714160, + "step": 410, + "train_runtime": 3333.618, + "train_tokens_per_second": 2014.076 + }, + { + "epoch": 0.24909090909090909, + "grad_norm": 0.023313792422413826, + "learning_rate": 9.99445050720758e-05, + "loss": 0.013531757518649101, + "num_input_tokens_seen": 6730536, + "step": 411, + "train_runtime": 3341.7359, + "train_tokens_per_second": 2014.084 + }, + { + "epoch": 0.2496969696969697, + "grad_norm": 0.04927172139286995, + "learning_rate": 9.994405121240344e-05, + "loss": 0.014407115057110786, + "num_input_tokens_seen": 6746912, + "step": 412, + "train_runtime": 3349.8514, + "train_tokens_per_second": 2014.093 + }, + { + "epoch": 0.2503030303030303, + "grad_norm": 0.03376639634370804, + "learning_rate": 9.994359550539787e-05, + "loss": 0.015590015798807144, + "num_input_tokens_seen": 6763288, + "step": 413, + "train_runtime": 3357.9682, + "train_tokens_per_second": 2014.101 + }, + { + "epoch": 0.2509090909090909, + "grad_norm": 0.026951145380735397, + "learning_rate": 9.994313795107597e-05, + "loss": 0.013218428939580917, + "num_input_tokens_seen": 6779664, + "step": 414, + "train_runtime": 3366.0858, + "train_tokens_per_second": 2014.109 + }, + { + "epoch": 0.2515151515151515, + "grad_norm": 0.028939809650182724, + "learning_rate": 9.994267854945465e-05, + "loss": 0.013945825397968292, + "num_input_tokens_seen": 6796040, + "step": 415, + "train_runtime": 3374.204, + "train_tokens_per_second": 2014.116 + }, + { + "epoch": 0.25212121212121213, + "grad_norm": 0.048603300005197525, + "learning_rate": 9.994221730055091e-05, + "loss": 0.014013823121786118, + "num_input_tokens_seen": 6812416, + "step": 416, + "train_runtime": 3382.3201, + "train_tokens_per_second": 2014.125 + }, + { + "epoch": 0.25272727272727274, + "grad_norm": 0.03397737815976143, + "learning_rate": 9.994175420438182e-05, + "loss": 0.016459740698337555, + "num_input_tokens_seen": 6828792, + "step": 417, + "train_runtime": 3390.4376, + "train_tokens_per_second": 2014.133 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.09882687032222748, + "learning_rate": 9.99412892609645e-05, + "loss": 0.019090697169303894, + "num_input_tokens_seen": 6845168, + "step": 418, + "train_runtime": 3398.5545, + "train_tokens_per_second": 2014.141 + }, + { + "epoch": 0.25393939393939396, + "grad_norm": 0.02406393364071846, + "learning_rate": 9.994082247031613e-05, + "loss": 0.01460934616625309, + "num_input_tokens_seen": 6861544, + "step": 419, + "train_runtime": 3406.6729, + "train_tokens_per_second": 2014.148 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 0.05103567615151405, + "learning_rate": 9.994035383245401e-05, + "loss": 0.014737242832779884, + "num_input_tokens_seen": 6877920, + "step": 420, + "train_runtime": 3414.7913, + "train_tokens_per_second": 2014.155 + }, + { + "epoch": 0.25515151515151513, + "grad_norm": 0.040553025901317596, + "learning_rate": 9.993988334739544e-05, + "loss": 0.015402523800730705, + "num_input_tokens_seen": 6894296, + "step": 421, + "train_runtime": 3422.91, + "train_tokens_per_second": 2014.162 + }, + { + "epoch": 0.25575757575757574, + "grad_norm": 0.038083747029304504, + "learning_rate": 9.993941101515786e-05, + "loss": 0.014769435860216618, + "num_input_tokens_seen": 6910672, + "step": 422, + "train_runtime": 3431.0293, + "train_tokens_per_second": 2014.169 + }, + { + "epoch": 0.25636363636363635, + "grad_norm": 0.018217189237475395, + "learning_rate": 9.99389368357587e-05, + "loss": 0.01357343327254057, + "num_input_tokens_seen": 6927048, + "step": 423, + "train_runtime": 3439.1473, + "train_tokens_per_second": 2014.176 + }, + { + "epoch": 0.25696969696969696, + "grad_norm": 0.04052957519888878, + "learning_rate": 9.993846080921552e-05, + "loss": 0.01406765729188919, + "num_input_tokens_seen": 6943424, + "step": 424, + "train_runtime": 3447.2634, + "train_tokens_per_second": 2014.184 + }, + { + "epoch": 0.25757575757575757, + "grad_norm": 0.02357480488717556, + "learning_rate": 9.993798293554593e-05, + "loss": 0.013200477696955204, + "num_input_tokens_seen": 6959800, + "step": 425, + "train_runtime": 3455.3793, + "train_tokens_per_second": 2014.193 + }, + { + "epoch": 0.2581818181818182, + "grad_norm": 0.02221427671611309, + "learning_rate": 9.99375032147676e-05, + "loss": 0.014044541865587234, + "num_input_tokens_seen": 6976176, + "step": 426, + "train_runtime": 3463.4954, + "train_tokens_per_second": 2014.201 + }, + { + "epoch": 0.2587878787878788, + "grad_norm": 0.03215425834059715, + "learning_rate": 9.993702164689829e-05, + "loss": 0.013318242505192757, + "num_input_tokens_seen": 6992552, + "step": 427, + "train_runtime": 3471.613, + "train_tokens_per_second": 2014.208 + }, + { + "epoch": 0.2593939393939394, + "grad_norm": 0.049007292836904526, + "learning_rate": 9.993653823195578e-05, + "loss": 0.014676532708108425, + "num_input_tokens_seen": 7008928, + "step": 428, + "train_runtime": 3479.731, + "train_tokens_per_second": 2014.215 + }, + { + "epoch": 0.26, + "grad_norm": 0.029083114117383957, + "learning_rate": 9.993605296995796e-05, + "loss": 0.013533808290958405, + "num_input_tokens_seen": 7025304, + "step": 429, + "train_runtime": 3487.8493, + "train_tokens_per_second": 2014.222 + }, + { + "epoch": 0.2606060606060606, + "grad_norm": 0.03159458562731743, + "learning_rate": 9.993556586092281e-05, + "loss": 0.015523270703852177, + "num_input_tokens_seen": 7041680, + "step": 430, + "train_runtime": 3495.9654, + "train_tokens_per_second": 2014.23 + }, + { + "epoch": 0.26121212121212123, + "grad_norm": 0.023704880848526955, + "learning_rate": 9.993507690486831e-05, + "loss": 0.014423849992454052, + "num_input_tokens_seen": 7058056, + "step": 431, + "train_runtime": 3504.0833, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.26181818181818184, + "grad_norm": 0.061435069888830185, + "learning_rate": 9.993458610181256e-05, + "loss": 0.01381218247115612, + "num_input_tokens_seen": 7074432, + "step": 432, + "train_runtime": 3512.2002, + "train_tokens_per_second": 2014.245 + }, + { + "epoch": 0.26242424242424245, + "grad_norm": 0.027623331174254417, + "learning_rate": 9.993409345177371e-05, + "loss": 0.013529473915696144, + "num_input_tokens_seen": 7090808, + "step": 433, + "train_runtime": 3520.3183, + "train_tokens_per_second": 2014.252 + }, + { + "epoch": 0.263030303030303, + "grad_norm": 0.02938493713736534, + "learning_rate": 9.993359895477e-05, + "loss": 0.014209594577550888, + "num_input_tokens_seen": 7107184, + "step": 434, + "train_runtime": 3528.4347, + "train_tokens_per_second": 2014.26 + }, + { + "epoch": 0.2636363636363636, + "grad_norm": 0.05708494782447815, + "learning_rate": 9.993310261081968e-05, + "loss": 0.01838802546262741, + "num_input_tokens_seen": 7123560, + "step": 435, + "train_runtime": 3536.5523, + "train_tokens_per_second": 2014.267 + }, + { + "epoch": 0.2642424242424242, + "grad_norm": 0.01653749868273735, + "learning_rate": 9.993260441994116e-05, + "loss": 0.014132829383015633, + "num_input_tokens_seen": 7139936, + "step": 436, + "train_runtime": 3544.6693, + "train_tokens_per_second": 2014.274 + }, + { + "epoch": 0.26484848484848483, + "grad_norm": 0.06222791597247124, + "learning_rate": 9.993210438215284e-05, + "loss": 0.017560908570885658, + "num_input_tokens_seen": 7156312, + "step": 437, + "train_runtime": 3552.7886, + "train_tokens_per_second": 2014.28 + }, + { + "epoch": 0.26545454545454544, + "grad_norm": 0.023168306797742844, + "learning_rate": 9.993160249747319e-05, + "loss": 0.014680145308375359, + "num_input_tokens_seen": 7172688, + "step": 438, + "train_runtime": 3560.9057, + "train_tokens_per_second": 2014.288 + }, + { + "epoch": 0.26606060606060605, + "grad_norm": 0.03977813571691513, + "learning_rate": 9.993109876592083e-05, + "loss": 0.01688549481332302, + "num_input_tokens_seen": 7189064, + "step": 439, + "train_runtime": 3569.029, + "train_tokens_per_second": 2014.291 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.027993550524115562, + "learning_rate": 9.993059318751435e-05, + "loss": 0.012989813461899757, + "num_input_tokens_seen": 7205440, + "step": 440, + "train_runtime": 3577.1458, + "train_tokens_per_second": 2014.299 + }, + { + "epoch": 0.2672727272727273, + "grad_norm": 0.051551882177591324, + "learning_rate": 9.993008576227247e-05, + "loss": 0.016102567315101624, + "num_input_tokens_seen": 7221816, + "step": 441, + "train_runtime": 3585.2621, + "train_tokens_per_second": 2014.306 + }, + { + "epoch": 0.2678787878787879, + "grad_norm": 0.03278960660099983, + "learning_rate": 9.992957649021395e-05, + "loss": 0.014773263595998287, + "num_input_tokens_seen": 7238192, + "step": 442, + "train_runtime": 3593.378, + "train_tokens_per_second": 2014.314 + }, + { + "epoch": 0.2684848484848485, + "grad_norm": 0.030394606292247772, + "learning_rate": 9.992906537135762e-05, + "loss": 0.015549161471426487, + "num_input_tokens_seen": 7254568, + "step": 443, + "train_runtime": 3601.4945, + "train_tokens_per_second": 2014.322 + }, + { + "epoch": 0.2690909090909091, + "grad_norm": 0.027792129665613174, + "learning_rate": 9.992855240572241e-05, + "loss": 0.01473160833120346, + "num_input_tokens_seen": 7270944, + "step": 444, + "train_runtime": 3609.6111, + "train_tokens_per_second": 2014.329 + }, + { + "epoch": 0.2696969696969697, + "grad_norm": 0.01833016611635685, + "learning_rate": 9.992803759332728e-05, + "loss": 0.013827802613377571, + "num_input_tokens_seen": 7287320, + "step": 445, + "train_runtime": 3617.7304, + "train_tokens_per_second": 2014.335 + }, + { + "epoch": 0.2703030303030303, + "grad_norm": 0.021910199895501137, + "learning_rate": 9.992752093419124e-05, + "loss": 0.014088256284594536, + "num_input_tokens_seen": 7303696, + "step": 446, + "train_runtime": 3625.8455, + "train_tokens_per_second": 2014.343 + }, + { + "epoch": 0.27090909090909093, + "grad_norm": 0.03614957630634308, + "learning_rate": 9.992700242833346e-05, + "loss": 0.014040564186871052, + "num_input_tokens_seen": 7320072, + "step": 447, + "train_runtime": 3633.9607, + "train_tokens_per_second": 2014.351 + }, + { + "epoch": 0.27151515151515154, + "grad_norm": 0.03147033974528313, + "learning_rate": 9.992648207577308e-05, + "loss": 0.01510291825979948, + "num_input_tokens_seen": 7336448, + "step": 448, + "train_runtime": 3642.0772, + "train_tokens_per_second": 2014.358 + }, + { + "epoch": 0.2721212121212121, + "grad_norm": 0.01757362298667431, + "learning_rate": 9.992595987652935e-05, + "loss": 0.01235952414572239, + "num_input_tokens_seen": 7352824, + "step": 449, + "train_runtime": 3650.194, + "train_tokens_per_second": 2014.365 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 0.018810704350471497, + "learning_rate": 9.99254358306216e-05, + "loss": 0.01245784480124712, + "num_input_tokens_seen": 7369200, + "step": 450, + "train_runtime": 3658.3109, + "train_tokens_per_second": 2014.372 + }, + { + "epoch": 0.2733333333333333, + "grad_norm": 0.02399486117064953, + "learning_rate": 9.99249099380692e-05, + "loss": 0.012906970456242561, + "num_input_tokens_seen": 7385576, + "step": 451, + "train_runtime": 3666.429, + "train_tokens_per_second": 2014.379 + }, + { + "epoch": 0.2739393939393939, + "grad_norm": 0.07980017364025116, + "learning_rate": 9.99243821988916e-05, + "loss": 0.017233727499842644, + "num_input_tokens_seen": 7401952, + "step": 452, + "train_runtime": 3674.5475, + "train_tokens_per_second": 2014.385 + }, + { + "epoch": 0.27454545454545454, + "grad_norm": 0.019096143543720245, + "learning_rate": 9.992385261310833e-05, + "loss": 0.013073702342808247, + "num_input_tokens_seen": 7418328, + "step": 453, + "train_runtime": 3682.6645, + "train_tokens_per_second": 2014.392 + }, + { + "epoch": 0.27515151515151515, + "grad_norm": 0.055766504257917404, + "learning_rate": 9.992332118073897e-05, + "loss": 0.014186715707182884, + "num_input_tokens_seen": 7434704, + "step": 454, + "train_runtime": 3690.7797, + "train_tokens_per_second": 2014.399 + }, + { + "epoch": 0.27575757575757576, + "grad_norm": 0.02542242966592312, + "learning_rate": 9.992278790180318e-05, + "loss": 0.016023358330130577, + "num_input_tokens_seen": 7451080, + "step": 455, + "train_runtime": 3698.8942, + "train_tokens_per_second": 2014.407 + }, + { + "epoch": 0.27636363636363637, + "grad_norm": 0.020465506240725517, + "learning_rate": 9.99222527763207e-05, + "loss": 0.013445570133626461, + "num_input_tokens_seen": 7467456, + "step": 456, + "train_runtime": 3707.0085, + "train_tokens_per_second": 2014.416 + }, + { + "epoch": 0.276969696969697, + "grad_norm": 0.022726397961378098, + "learning_rate": 9.992171580431129e-05, + "loss": 0.013883800245821476, + "num_input_tokens_seen": 7483832, + "step": 457, + "train_runtime": 3715.1297, + "train_tokens_per_second": 2014.42 + }, + { + "epoch": 0.2775757575757576, + "grad_norm": 0.06926342844963074, + "learning_rate": 9.992117698579484e-05, + "loss": 0.016109909862279892, + "num_input_tokens_seen": 7500208, + "step": 458, + "train_runtime": 3723.2461, + "train_tokens_per_second": 2014.427 + }, + { + "epoch": 0.2781818181818182, + "grad_norm": 0.03352541849017143, + "learning_rate": 9.992063632079127e-05, + "loss": 0.01359601691365242, + "num_input_tokens_seen": 7516584, + "step": 459, + "train_runtime": 3731.3653, + "train_tokens_per_second": 2014.433 + }, + { + "epoch": 0.2787878787878788, + "grad_norm": 0.046891167759895325, + "learning_rate": 9.992009380932059e-05, + "loss": 0.014447907917201519, + "num_input_tokens_seen": 7532960, + "step": 460, + "train_runtime": 3739.4829, + "train_tokens_per_second": 2014.439 + }, + { + "epoch": 0.2793939393939394, + "grad_norm": 0.05756726115942001, + "learning_rate": 9.991954945140284e-05, + "loss": 0.012774428352713585, + "num_input_tokens_seen": 7549336, + "step": 461, + "train_runtime": 3747.5996, + "train_tokens_per_second": 2014.446 + }, + { + "epoch": 0.28, + "grad_norm": 0.06149715185165405, + "learning_rate": 9.991900324705817e-05, + "loss": 0.015111779794096947, + "num_input_tokens_seen": 7565712, + "step": 462, + "train_runtime": 3755.7151, + "train_tokens_per_second": 2014.453 + }, + { + "epoch": 0.2806060606060606, + "grad_norm": 0.03807002305984497, + "learning_rate": 9.991845519630678e-05, + "loss": 0.014264722354710102, + "num_input_tokens_seen": 7582088, + "step": 463, + "train_runtime": 3763.8316, + "train_tokens_per_second": 2014.46 + }, + { + "epoch": 0.2812121212121212, + "grad_norm": 0.038672804832458496, + "learning_rate": 9.991790529916896e-05, + "loss": 0.014925600029528141, + "num_input_tokens_seen": 7598464, + "step": 464, + "train_runtime": 3771.9486, + "train_tokens_per_second": 2014.466 + }, + { + "epoch": 0.2818181818181818, + "grad_norm": 0.04409286752343178, + "learning_rate": 9.991735355566502e-05, + "loss": 0.01355639100074768, + "num_input_tokens_seen": 7614840, + "step": 465, + "train_runtime": 3780.0654, + "train_tokens_per_second": 2014.473 + }, + { + "epoch": 0.2824242424242424, + "grad_norm": 0.05239715427160263, + "learning_rate": 9.991679996581539e-05, + "loss": 0.01419782917946577, + "num_input_tokens_seen": 7631216, + "step": 466, + "train_runtime": 3788.182, + "train_tokens_per_second": 2014.48 + }, + { + "epoch": 0.283030303030303, + "grad_norm": 0.04078468307852745, + "learning_rate": 9.991624452964054e-05, + "loss": 0.014365100301802158, + "num_input_tokens_seen": 7647592, + "step": 467, + "train_runtime": 3796.2972, + "train_tokens_per_second": 2014.487 + }, + { + "epoch": 0.28363636363636363, + "grad_norm": 0.05068361386656761, + "learning_rate": 9.9915687247161e-05, + "loss": 0.016069650650024414, + "num_input_tokens_seen": 7663968, + "step": 468, + "train_runtime": 3804.4156, + "train_tokens_per_second": 2014.493 + }, + { + "epoch": 0.28424242424242424, + "grad_norm": 0.028354912996292114, + "learning_rate": 9.991512811839741e-05, + "loss": 0.01326735783368349, + "num_input_tokens_seen": 7680344, + "step": 469, + "train_runtime": 3812.5326, + "train_tokens_per_second": 2014.499 + }, + { + "epoch": 0.28484848484848485, + "grad_norm": 0.018959172070026398, + "learning_rate": 9.991456714337041e-05, + "loss": 0.01290344912558794, + "num_input_tokens_seen": 7696720, + "step": 470, + "train_runtime": 3820.6476, + "train_tokens_per_second": 2014.507 + }, + { + "epoch": 0.28545454545454546, + "grad_norm": 0.03419540822505951, + "learning_rate": 9.99140043221008e-05, + "loss": 0.015551136806607246, + "num_input_tokens_seen": 7713096, + "step": 471, + "train_runtime": 3828.7623, + "train_tokens_per_second": 2014.514 + }, + { + "epoch": 0.28606060606060607, + "grad_norm": 0.0427350290119648, + "learning_rate": 9.991343965460937e-05, + "loss": 0.014623988419771194, + "num_input_tokens_seen": 7729472, + "step": 472, + "train_runtime": 3836.8784, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.2866666666666667, + "grad_norm": 0.030883153900504112, + "learning_rate": 9.991287314091699e-05, + "loss": 0.013778546825051308, + "num_input_tokens_seen": 7745848, + "step": 473, + "train_runtime": 3844.9946, + "train_tokens_per_second": 2014.528 + }, + { + "epoch": 0.2872727272727273, + "grad_norm": 0.021236877888441086, + "learning_rate": 9.991230478104466e-05, + "loss": 0.013353691436350346, + "num_input_tokens_seen": 7762224, + "step": 474, + "train_runtime": 3853.1121, + "train_tokens_per_second": 2014.534 + }, + { + "epoch": 0.2878787878787879, + "grad_norm": 0.019137293100357056, + "learning_rate": 9.991173457501337e-05, + "loss": 0.013228803873062134, + "num_input_tokens_seen": 7778600, + "step": 475, + "train_runtime": 3861.229, + "train_tokens_per_second": 2014.54 + }, + { + "epoch": 0.2884848484848485, + "grad_norm": 0.01902465894818306, + "learning_rate": 9.991116252284421e-05, + "loss": 0.013284035958349705, + "num_input_tokens_seen": 7794976, + "step": 476, + "train_runtime": 3869.3457, + "train_tokens_per_second": 2014.546 + }, + { + "epoch": 0.28909090909090907, + "grad_norm": 0.028947357088327408, + "learning_rate": 9.991058862455833e-05, + "loss": 0.01423730794340372, + "num_input_tokens_seen": 7811352, + "step": 477, + "train_runtime": 3877.4643, + "train_tokens_per_second": 2014.552 + }, + { + "epoch": 0.2896969696969697, + "grad_norm": 0.024383556097745895, + "learning_rate": 9.991001288017701e-05, + "loss": 0.013436602428555489, + "num_input_tokens_seen": 7827728, + "step": 478, + "train_runtime": 3885.5822, + "train_tokens_per_second": 2014.557 + }, + { + "epoch": 0.2903030303030303, + "grad_norm": 0.04384802654385567, + "learning_rate": 9.990943528972147e-05, + "loss": 0.013107577338814735, + "num_input_tokens_seen": 7844104, + "step": 479, + "train_runtime": 3893.6976, + "train_tokens_per_second": 2014.564 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 0.020900119096040726, + "learning_rate": 9.990885585321315e-05, + "loss": 0.015309646725654602, + "num_input_tokens_seen": 7860480, + "step": 480, + "train_runtime": 3901.8179, + "train_tokens_per_second": 2014.569 + }, + { + "epoch": 0.2915151515151515, + "grad_norm": 0.018041405826807022, + "learning_rate": 9.990827457067343e-05, + "loss": 0.012978669255971909, + "num_input_tokens_seen": 7876856, + "step": 481, + "train_runtime": 3909.935, + "train_tokens_per_second": 2014.575 + }, + { + "epoch": 0.2921212121212121, + "grad_norm": 0.02291363663971424, + "learning_rate": 9.99076914421238e-05, + "loss": 0.014085205271840096, + "num_input_tokens_seen": 7893232, + "step": 482, + "train_runtime": 3918.0537, + "train_tokens_per_second": 2014.58 + }, + { + "epoch": 0.2927272727272727, + "grad_norm": 0.023675069212913513, + "learning_rate": 9.990710646758589e-05, + "loss": 0.014468826353549957, + "num_input_tokens_seen": 7909608, + "step": 483, + "train_runtime": 3926.1718, + "train_tokens_per_second": 2014.585 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.021886734291911125, + "learning_rate": 9.990651964708128e-05, + "loss": 0.014159688726067543, + "num_input_tokens_seen": 7925984, + "step": 484, + "train_runtime": 3934.2949, + "train_tokens_per_second": 2014.588 + }, + { + "epoch": 0.29393939393939394, + "grad_norm": 0.019282542169094086, + "learning_rate": 9.99059309806317e-05, + "loss": 0.013447335921227932, + "num_input_tokens_seen": 7942360, + "step": 485, + "train_runtime": 3942.412, + "train_tokens_per_second": 2014.594 + }, + { + "epoch": 0.29454545454545455, + "grad_norm": 0.021736539900302887, + "learning_rate": 9.990534046825893e-05, + "loss": 0.014465593732893467, + "num_input_tokens_seen": 7958736, + "step": 486, + "train_runtime": 3950.5289, + "train_tokens_per_second": 2014.6 + }, + { + "epoch": 0.29515151515151516, + "grad_norm": 0.058480676263570786, + "learning_rate": 9.99047481099848e-05, + "loss": 0.015324249863624573, + "num_input_tokens_seen": 7975112, + "step": 487, + "train_runtime": 3958.6459, + "train_tokens_per_second": 2014.606 + }, + { + "epoch": 0.2957575757575758, + "grad_norm": 0.04795762896537781, + "learning_rate": 9.990415390583122e-05, + "loss": 0.015603849664330482, + "num_input_tokens_seen": 7991488, + "step": 488, + "train_runtime": 3966.7616, + "train_tokens_per_second": 2014.613 + }, + { + "epoch": 0.2963636363636364, + "grad_norm": 0.045595213770866394, + "learning_rate": 9.990355785582017e-05, + "loss": 0.013210836797952652, + "num_input_tokens_seen": 8007864, + "step": 489, + "train_runtime": 3974.8769, + "train_tokens_per_second": 2014.619 + }, + { + "epoch": 0.296969696969697, + "grad_norm": 0.03191044181585312, + "learning_rate": 9.99029599599737e-05, + "loss": 0.0133978221565485, + "num_input_tokens_seen": 8024240, + "step": 490, + "train_runtime": 3982.9927, + "train_tokens_per_second": 2014.626 + }, + { + "epoch": 0.29757575757575755, + "grad_norm": 0.03503177687525749, + "learning_rate": 9.990236021831391e-05, + "loss": 0.01524767093360424, + "num_input_tokens_seen": 8040616, + "step": 491, + "train_runtime": 3991.1095, + "train_tokens_per_second": 2014.632 + }, + { + "epoch": 0.29818181818181816, + "grad_norm": 0.021688032895326614, + "learning_rate": 9.990175863086302e-05, + "loss": 0.013602089136838913, + "num_input_tokens_seen": 8056992, + "step": 492, + "train_runtime": 3999.2294, + "train_tokens_per_second": 2014.636 + }, + { + "epoch": 0.29878787878787877, + "grad_norm": 0.02230294793844223, + "learning_rate": 9.990115519764325e-05, + "loss": 0.01378709264099598, + "num_input_tokens_seen": 8073368, + "step": 493, + "train_runtime": 4007.3438, + "train_tokens_per_second": 2014.643 + }, + { + "epoch": 0.2993939393939394, + "grad_norm": 0.0244484543800354, + "learning_rate": 9.990054991867692e-05, + "loss": 0.01362735964357853, + "num_input_tokens_seen": 8089744, + "step": 494, + "train_runtime": 4015.461, + "train_tokens_per_second": 2014.649 + }, + { + "epoch": 0.3, + "grad_norm": 0.021698100492358208, + "learning_rate": 9.989994279398642e-05, + "loss": 0.01317393034696579, + "num_input_tokens_seen": 8106120, + "step": 495, + "train_runtime": 4023.5779, + "train_tokens_per_second": 2014.655 + }, + { + "epoch": 0.3006060606060606, + "grad_norm": 0.04310522973537445, + "learning_rate": 9.989933382359422e-05, + "loss": 0.014429607428610325, + "num_input_tokens_seen": 8122496, + "step": 496, + "train_runtime": 4031.6942, + "train_tokens_per_second": 2014.661 + }, + { + "epoch": 0.3012121212121212, + "grad_norm": 0.018435562029480934, + "learning_rate": 9.989872300752283e-05, + "loss": 0.013920141384005547, + "num_input_tokens_seen": 8138872, + "step": 497, + "train_runtime": 4039.8107, + "train_tokens_per_second": 2014.667 + }, + { + "epoch": 0.3018181818181818, + "grad_norm": 0.023063285276293755, + "learning_rate": 9.989811034579486e-05, + "loss": 0.0139535591006279, + "num_input_tokens_seen": 8155248, + "step": 498, + "train_runtime": 4047.9289, + "train_tokens_per_second": 2014.672 + }, + { + "epoch": 0.30242424242424243, + "grad_norm": 0.0432952381670475, + "learning_rate": 9.989749583843296e-05, + "loss": 0.014083024114370346, + "num_input_tokens_seen": 8171624, + "step": 499, + "train_runtime": 4056.0475, + "train_tokens_per_second": 2014.677 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.0212725643068552, + "learning_rate": 9.989687948545985e-05, + "loss": 0.013151183724403381, + "num_input_tokens_seen": 8188000, + "step": 500, + "train_runtime": 4064.1628, + "train_tokens_per_second": 2014.683 + }, + { + "epoch": 0.30363636363636365, + "grad_norm": 0.03599437326192856, + "learning_rate": 9.989626128689835e-05, + "loss": 0.016130445525050163, + "num_input_tokens_seen": 8204376, + "step": 501, + "train_runtime": 4073.1872, + "train_tokens_per_second": 2014.24 + }, + { + "epoch": 0.30424242424242426, + "grad_norm": 0.021969085559248924, + "learning_rate": 9.98956412427713e-05, + "loss": 0.013816497288644314, + "num_input_tokens_seen": 8220752, + "step": 502, + "train_runtime": 4081.3029, + "train_tokens_per_second": 2014.247 + }, + { + "epoch": 0.30484848484848487, + "grad_norm": 0.032568175345659256, + "learning_rate": 9.989501935310166e-05, + "loss": 0.015003332868218422, + "num_input_tokens_seen": 8237128, + "step": 503, + "train_runtime": 4089.4198, + "train_tokens_per_second": 2014.253 + }, + { + "epoch": 0.3054545454545455, + "grad_norm": 0.0263565294444561, + "learning_rate": 9.98943956179124e-05, + "loss": 0.014254853129386902, + "num_input_tokens_seen": 8253504, + "step": 504, + "train_runtime": 4097.5386, + "train_tokens_per_second": 2014.259 + }, + { + "epoch": 0.30606060606060603, + "grad_norm": 0.019410574808716774, + "learning_rate": 9.989377003722664e-05, + "loss": 0.012152588926255703, + "num_input_tokens_seen": 8269880, + "step": 505, + "train_runtime": 4105.6623, + "train_tokens_per_second": 2014.262 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.043787118047475815, + "learning_rate": 9.989314261106749e-05, + "loss": 0.013709669932723045, + "num_input_tokens_seen": 8286256, + "step": 506, + "train_runtime": 4113.7771, + "train_tokens_per_second": 2014.27 + }, + { + "epoch": 0.30727272727272725, + "grad_norm": 0.04813135415315628, + "learning_rate": 9.989251333945813e-05, + "loss": 0.014608191326260567, + "num_input_tokens_seen": 8302632, + "step": 507, + "train_runtime": 4121.8917, + "train_tokens_per_second": 2014.277 + }, + { + "epoch": 0.30787878787878786, + "grad_norm": 0.021107302978634834, + "learning_rate": 9.989188222242188e-05, + "loss": 0.012715778313577175, + "num_input_tokens_seen": 8319008, + "step": 508, + "train_runtime": 4130.0145, + "train_tokens_per_second": 2014.281 + }, + { + "epoch": 0.3084848484848485, + "grad_norm": 0.019778916612267494, + "learning_rate": 9.989124925998205e-05, + "loss": 0.012830444611608982, + "num_input_tokens_seen": 8335384, + "step": 509, + "train_runtime": 4138.13, + "train_tokens_per_second": 2014.288 + }, + { + "epoch": 0.3090909090909091, + "grad_norm": 0.019679522141814232, + "learning_rate": 9.989061445216208e-05, + "loss": 0.013841142877936363, + "num_input_tokens_seen": 8351760, + "step": 510, + "train_runtime": 4146.2492, + "train_tokens_per_second": 2014.293 + }, + { + "epoch": 0.3096969696969697, + "grad_norm": 0.023106170818209648, + "learning_rate": 9.988997779898545e-05, + "loss": 0.013808130286633968, + "num_input_tokens_seen": 8368136, + "step": 511, + "train_runtime": 4154.3635, + "train_tokens_per_second": 2014.3 + }, + { + "epoch": 0.3103030303030303, + "grad_norm": 0.02681031823158264, + "learning_rate": 9.988933930047569e-05, + "loss": 0.015086129307746887, + "num_input_tokens_seen": 8384512, + "step": 512, + "train_runtime": 4162.4819, + "train_tokens_per_second": 2014.306 + }, + { + "epoch": 0.3109090909090909, + "grad_norm": 0.044101521372795105, + "learning_rate": 9.988869895665642e-05, + "loss": 0.01502022985368967, + "num_input_tokens_seen": 8400888, + "step": 513, + "train_runtime": 4170.6029, + "train_tokens_per_second": 2014.31 + }, + { + "epoch": 0.3115151515151515, + "grad_norm": 0.016393663361668587, + "learning_rate": 9.988805676755133e-05, + "loss": 0.01283847913146019, + "num_input_tokens_seen": 8417264, + "step": 514, + "train_runtime": 4178.7186, + "train_tokens_per_second": 2014.317 + }, + { + "epoch": 0.31212121212121213, + "grad_norm": 0.04226645827293396, + "learning_rate": 9.988741273318416e-05, + "loss": 0.01453358493745327, + "num_input_tokens_seen": 8433640, + "step": 515, + "train_runtime": 4186.8351, + "train_tokens_per_second": 2014.323 + }, + { + "epoch": 0.31272727272727274, + "grad_norm": 0.021670697256922722, + "learning_rate": 9.988676685357876e-05, + "loss": 0.014670845121145248, + "num_input_tokens_seen": 8450016, + "step": 516, + "train_runtime": 4194.9601, + "train_tokens_per_second": 2014.326 + }, + { + "epoch": 0.31333333333333335, + "grad_norm": 0.02870395965874195, + "learning_rate": 9.988611912875901e-05, + "loss": 0.013808513060212135, + "num_input_tokens_seen": 8466392, + "step": 517, + "train_runtime": 4203.0848, + "train_tokens_per_second": 2014.328 + }, + { + "epoch": 0.31393939393939396, + "grad_norm": 0.02202719636261463, + "learning_rate": 9.988546955874885e-05, + "loss": 0.014270287938416004, + "num_input_tokens_seen": 8482768, + "step": 518, + "train_runtime": 4211.2073, + "train_tokens_per_second": 2014.332 + }, + { + "epoch": 0.3145454545454546, + "grad_norm": 0.03838543966412544, + "learning_rate": 9.988481814357233e-05, + "loss": 0.016241563484072685, + "num_input_tokens_seen": 8499144, + "step": 519, + "train_runtime": 4219.3304, + "train_tokens_per_second": 2014.335 + }, + { + "epoch": 0.3151515151515151, + "grad_norm": 0.024275533854961395, + "learning_rate": 9.988416488325352e-05, + "loss": 0.012701138854026794, + "num_input_tokens_seen": 8515520, + "step": 520, + "train_runtime": 4227.4491, + "train_tokens_per_second": 2014.34 + }, + { + "epoch": 0.31575757575757574, + "grad_norm": 0.019648293033242226, + "learning_rate": 9.98835097778166e-05, + "loss": 0.014066259376704693, + "num_input_tokens_seen": 8531896, + "step": 521, + "train_runtime": 4235.5669, + "train_tokens_per_second": 2014.346 + }, + { + "epoch": 0.31636363636363635, + "grad_norm": 0.03942210599780083, + "learning_rate": 9.98828528272858e-05, + "loss": 0.015006550587713718, + "num_input_tokens_seen": 8548272, + "step": 522, + "train_runtime": 4243.6821, + "train_tokens_per_second": 2014.353 + }, + { + "epoch": 0.31696969696969696, + "grad_norm": 0.01995157264173031, + "learning_rate": 9.988219403168542e-05, + "loss": 0.014066948555409908, + "num_input_tokens_seen": 8564648, + "step": 523, + "train_runtime": 4251.7984, + "train_tokens_per_second": 2014.359 + }, + { + "epoch": 0.31757575757575757, + "grad_norm": 0.05812249332666397, + "learning_rate": 9.988153339103983e-05, + "loss": 0.01575363054871559, + "num_input_tokens_seen": 8581024, + "step": 524, + "train_runtime": 4259.9171, + "train_tokens_per_second": 2014.364 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 0.02528631128370762, + "learning_rate": 9.988087090537344e-05, + "loss": 0.013741475529968739, + "num_input_tokens_seen": 8597400, + "step": 525, + "train_runtime": 4268.0355, + "train_tokens_per_second": 2014.369 + }, + { + "epoch": 0.3187878787878788, + "grad_norm": 0.015316633507609367, + "learning_rate": 9.988020657471077e-05, + "loss": 0.01343776285648346, + "num_input_tokens_seen": 8613776, + "step": 526, + "train_runtime": 4276.154, + "train_tokens_per_second": 2014.375 + }, + { + "epoch": 0.3193939393939394, + "grad_norm": 0.0239357091486454, + "learning_rate": 9.987954039907642e-05, + "loss": 0.013446596451103687, + "num_input_tokens_seen": 8630152, + "step": 527, + "train_runtime": 4284.2698, + "train_tokens_per_second": 2014.381 + }, + { + "epoch": 0.32, + "grad_norm": 0.023286571726202965, + "learning_rate": 9.9878872378495e-05, + "loss": 0.012851690873503685, + "num_input_tokens_seen": 8646528, + "step": 528, + "train_runtime": 4292.3857, + "train_tokens_per_second": 2014.387 + }, + { + "epoch": 0.3206060606060606, + "grad_norm": 0.03030410036444664, + "learning_rate": 9.987820251299122e-05, + "loss": 0.014106137678027153, + "num_input_tokens_seen": 8662904, + "step": 529, + "train_runtime": 4300.5021, + "train_tokens_per_second": 2014.394 + }, + { + "epoch": 0.3212121212121212, + "grad_norm": 0.018672285601496696, + "learning_rate": 9.987753080258986e-05, + "loss": 0.013117408379912376, + "num_input_tokens_seen": 8679280, + "step": 530, + "train_runtime": 4308.6186, + "train_tokens_per_second": 2014.4 + }, + { + "epoch": 0.32181818181818184, + "grad_norm": 0.032513462007045746, + "learning_rate": 9.987685724731577e-05, + "loss": 0.01231987215578556, + "num_input_tokens_seen": 8695656, + "step": 531, + "train_runtime": 4316.7369, + "train_tokens_per_second": 2014.405 + }, + { + "epoch": 0.32242424242424245, + "grad_norm": 0.11805391311645508, + "learning_rate": 9.987618184719386e-05, + "loss": 0.013388572260737419, + "num_input_tokens_seen": 8712032, + "step": 532, + "train_runtime": 4324.8544, + "train_tokens_per_second": 2014.41 + }, + { + "epoch": 0.32303030303030306, + "grad_norm": 0.02607562392950058, + "learning_rate": 9.987550460224912e-05, + "loss": 0.014675582759082317, + "num_input_tokens_seen": 8728408, + "step": 533, + "train_runtime": 4332.9699, + "train_tokens_per_second": 2014.417 + }, + { + "epoch": 0.3236363636363636, + "grad_norm": 0.03229625150561333, + "learning_rate": 9.987482551250659e-05, + "loss": 0.014730843715369701, + "num_input_tokens_seen": 8744784, + "step": 534, + "train_runtime": 4341.0862, + "train_tokens_per_second": 2014.423 + }, + { + "epoch": 0.3242424242424242, + "grad_norm": 0.02484363690018654, + "learning_rate": 9.987414457799138e-05, + "loss": 0.01373380795121193, + "num_input_tokens_seen": 8761160, + "step": 535, + "train_runtime": 4349.2033, + "train_tokens_per_second": 2014.429 + }, + { + "epoch": 0.32484848484848483, + "grad_norm": 0.06518429517745972, + "learning_rate": 9.987346179872869e-05, + "loss": 0.01318280678242445, + "num_input_tokens_seen": 8777536, + "step": 536, + "train_runtime": 4357.3294, + "train_tokens_per_second": 2014.43 + }, + { + "epoch": 0.32545454545454544, + "grad_norm": 0.023426007479429245, + "learning_rate": 9.98727771747438e-05, + "loss": 0.013221761211752892, + "num_input_tokens_seen": 8793912, + "step": 537, + "train_runtime": 4365.4504, + "train_tokens_per_second": 2014.434 + }, + { + "epoch": 0.32606060606060605, + "grad_norm": 0.017606353387236595, + "learning_rate": 9.987209070606199e-05, + "loss": 0.013325998559594154, + "num_input_tokens_seen": 8810288, + "step": 538, + "train_runtime": 4373.5723, + "train_tokens_per_second": 2014.437 + }, + { + "epoch": 0.32666666666666666, + "grad_norm": 0.01875401847064495, + "learning_rate": 9.987140239270865e-05, + "loss": 0.012510064989328384, + "num_input_tokens_seen": 8826664, + "step": 539, + "train_runtime": 4381.6917, + "train_tokens_per_second": 2014.442 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 0.01927015371620655, + "learning_rate": 9.987071223470926e-05, + "loss": 0.012322126887738705, + "num_input_tokens_seen": 8843040, + "step": 540, + "train_runtime": 4389.8115, + "train_tokens_per_second": 2014.446 + }, + { + "epoch": 0.3278787878787879, + "grad_norm": 0.021669652312994003, + "learning_rate": 9.987002023208935e-05, + "loss": 0.013479230925440788, + "num_input_tokens_seen": 8859416, + "step": 541, + "train_runtime": 4397.9315, + "train_tokens_per_second": 2014.451 + }, + { + "epoch": 0.3284848484848485, + "grad_norm": 0.021821271628141403, + "learning_rate": 9.98693263848745e-05, + "loss": 0.013202294707298279, + "num_input_tokens_seen": 8875792, + "step": 542, + "train_runtime": 4406.0521, + "train_tokens_per_second": 2014.455 + }, + { + "epoch": 0.3290909090909091, + "grad_norm": 0.04035639762878418, + "learning_rate": 9.98686306930904e-05, + "loss": 0.014951585792005062, + "num_input_tokens_seen": 8892168, + "step": 543, + "train_runtime": 4414.1742, + "train_tokens_per_second": 2014.458 + }, + { + "epoch": 0.3296969696969697, + "grad_norm": 0.01868710108101368, + "learning_rate": 9.986793315676276e-05, + "loss": 0.012716731987893581, + "num_input_tokens_seen": 8908544, + "step": 544, + "train_runtime": 4422.2924, + "train_tokens_per_second": 2014.463 + }, + { + "epoch": 0.3303030303030303, + "grad_norm": 0.030803462490439415, + "learning_rate": 9.986723377591738e-05, + "loss": 0.012449722737073898, + "num_input_tokens_seen": 8924920, + "step": 545, + "train_runtime": 4430.4191, + "train_tokens_per_second": 2014.464 + }, + { + "epoch": 0.33090909090909093, + "grad_norm": 0.031005537137389183, + "learning_rate": 9.986653255058014e-05, + "loss": 0.014312123879790306, + "num_input_tokens_seen": 8941296, + "step": 546, + "train_runtime": 4438.5386, + "train_tokens_per_second": 2014.468 + }, + { + "epoch": 0.33151515151515154, + "grad_norm": 0.0480731725692749, + "learning_rate": 9.986582948077696e-05, + "loss": 0.015260567888617516, + "num_input_tokens_seen": 8957672, + "step": 547, + "train_runtime": 4446.6634, + "train_tokens_per_second": 2014.47 + }, + { + "epoch": 0.3321212121212121, + "grad_norm": 0.031962063163518906, + "learning_rate": 9.986512456653388e-05, + "loss": 0.01442326046526432, + "num_input_tokens_seen": 8974048, + "step": 548, + "train_runtime": 4454.7823, + "train_tokens_per_second": 2014.475 + }, + { + "epoch": 0.3327272727272727, + "grad_norm": 0.026429401710629463, + "learning_rate": 9.986441780787692e-05, + "loss": 0.014029188081622124, + "num_input_tokens_seen": 8990424, + "step": 549, + "train_runtime": 4462.9013, + "train_tokens_per_second": 2014.48 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.025757839903235435, + "learning_rate": 9.986370920483227e-05, + "loss": 0.013720030896365643, + "num_input_tokens_seen": 9006800, + "step": 550, + "train_runtime": 4471.0287, + "train_tokens_per_second": 2014.48 + }, + { + "epoch": 0.3339393939393939, + "grad_norm": 0.029027970507740974, + "learning_rate": 9.986299875742613e-05, + "loss": 0.014392418786883354, + "num_input_tokens_seen": 9023176, + "step": 551, + "train_runtime": 4479.1476, + "train_tokens_per_second": 2014.485 + }, + { + "epoch": 0.33454545454545453, + "grad_norm": 0.02587730623781681, + "learning_rate": 9.986228646568475e-05, + "loss": 0.014536920003592968, + "num_input_tokens_seen": 9039552, + "step": 552, + "train_runtime": 4487.2645, + "train_tokens_per_second": 2014.491 + }, + { + "epoch": 0.33515151515151514, + "grad_norm": 0.024850307032465935, + "learning_rate": 9.986157232963452e-05, + "loss": 0.014528162777423859, + "num_input_tokens_seen": 9055928, + "step": 553, + "train_runtime": 4495.3823, + "train_tokens_per_second": 2014.496 + }, + { + "epoch": 0.33575757575757575, + "grad_norm": 0.03375309333205223, + "learning_rate": 9.98608563493018e-05, + "loss": 0.01345045492053032, + "num_input_tokens_seen": 9072304, + "step": 554, + "train_runtime": 4503.5123, + "train_tokens_per_second": 2014.495 + }, + { + "epoch": 0.33636363636363636, + "grad_norm": 0.034519318491220474, + "learning_rate": 9.986013852471313e-05, + "loss": 0.016201037913560867, + "num_input_tokens_seen": 9088680, + "step": 555, + "train_runtime": 4511.6315, + "train_tokens_per_second": 2014.5 + }, + { + "epoch": 0.336969696969697, + "grad_norm": 0.025029929354786873, + "learning_rate": 9.985941885589502e-05, + "loss": 0.013687830418348312, + "num_input_tokens_seen": 9105056, + "step": 556, + "train_runtime": 4519.7498, + "train_tokens_per_second": 2014.504 + }, + { + "epoch": 0.3375757575757576, + "grad_norm": 0.02109324000775814, + "learning_rate": 9.98586973428741e-05, + "loss": 0.013876669108867645, + "num_input_tokens_seen": 9121432, + "step": 557, + "train_runtime": 4527.8676, + "train_tokens_per_second": 2014.509 + }, + { + "epoch": 0.3381818181818182, + "grad_norm": 0.017437269911170006, + "learning_rate": 9.985797398567707e-05, + "loss": 0.013100878335535526, + "num_input_tokens_seen": 9137808, + "step": 558, + "train_runtime": 4535.9928, + "train_tokens_per_second": 2014.511 + }, + { + "epoch": 0.3387878787878788, + "grad_norm": 0.04041491076350212, + "learning_rate": 9.985724878433066e-05, + "loss": 0.014973807148635387, + "num_input_tokens_seen": 9154184, + "step": 559, + "train_runtime": 4544.113, + "train_tokens_per_second": 2014.515 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 0.02034146897494793, + "learning_rate": 9.985652173886174e-05, + "loss": 0.012258726172149181, + "num_input_tokens_seen": 9170560, + "step": 560, + "train_runtime": 4552.2371, + "train_tokens_per_second": 2014.517 + }, + { + "epoch": 0.34, + "grad_norm": 0.016358409076929092, + "learning_rate": 9.985579284929715e-05, + "loss": 0.014534495770931244, + "num_input_tokens_seen": 9186936, + "step": 561, + "train_runtime": 4560.3582, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.3406060606060606, + "grad_norm": 0.017970645800232887, + "learning_rate": 9.985506211566388e-05, + "loss": 0.013168847188353539, + "num_input_tokens_seen": 9203312, + "step": 562, + "train_runtime": 4568.4702, + "train_tokens_per_second": 2014.528 + }, + { + "epoch": 0.3412121212121212, + "grad_norm": 0.02478228323161602, + "learning_rate": 9.985432953798895e-05, + "loss": 0.016286451369524002, + "num_input_tokens_seen": 9219688, + "step": 563, + "train_runtime": 4576.5846, + "train_tokens_per_second": 2014.535 + }, + { + "epoch": 0.3418181818181818, + "grad_norm": 0.023158971220254898, + "learning_rate": 9.985359511629944e-05, + "loss": 0.014812255278229713, + "num_input_tokens_seen": 9236064, + "step": 564, + "train_runtime": 4584.6914, + "train_tokens_per_second": 2014.544 + }, + { + "epoch": 0.3424242424242424, + "grad_norm": 0.017976826056838036, + "learning_rate": 9.985285885062257e-05, + "loss": 0.013011513277888298, + "num_input_tokens_seen": 9252440, + "step": 565, + "train_runtime": 4592.801, + "train_tokens_per_second": 2014.553 + }, + { + "epoch": 0.343030303030303, + "grad_norm": 0.022492917254567146, + "learning_rate": 9.98521207409855e-05, + "loss": 0.014015360735356808, + "num_input_tokens_seen": 9268816, + "step": 566, + "train_runtime": 4600.9112, + "train_tokens_per_second": 2014.561 + }, + { + "epoch": 0.34363636363636363, + "grad_norm": 0.05375469848513603, + "learning_rate": 9.985138078741559e-05, + "loss": 0.013538680039346218, + "num_input_tokens_seen": 9285192, + "step": 567, + "train_runtime": 4609.0183, + "train_tokens_per_second": 2014.57 + }, + { + "epoch": 0.34424242424242424, + "grad_norm": 0.011526068672537804, + "learning_rate": 9.985063898994016e-05, + "loss": 0.012446783483028412, + "num_input_tokens_seen": 9301568, + "step": 568, + "train_runtime": 4617.1293, + "train_tokens_per_second": 2014.578 + }, + { + "epoch": 0.34484848484848485, + "grad_norm": 0.015349720604717731, + "learning_rate": 9.984989534858669e-05, + "loss": 0.012871544808149338, + "num_input_tokens_seen": 9317944, + "step": 569, + "train_runtime": 4625.2366, + "train_tokens_per_second": 2014.588 + }, + { + "epoch": 0.34545454545454546, + "grad_norm": 0.03799523040652275, + "learning_rate": 9.984914986338268e-05, + "loss": 0.014556103385984898, + "num_input_tokens_seen": 9334320, + "step": 570, + "train_runtime": 4633.3464, + "train_tokens_per_second": 2014.596 + }, + { + "epoch": 0.34606060606060607, + "grad_norm": 0.042935777455568314, + "learning_rate": 9.984840253435568e-05, + "loss": 0.015330069698393345, + "num_input_tokens_seen": 9350696, + "step": 571, + "train_runtime": 4641.4533, + "train_tokens_per_second": 2014.605 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.026697825640439987, + "learning_rate": 9.984765336153334e-05, + "loss": 0.01370144821703434, + "num_input_tokens_seen": 9367072, + "step": 572, + "train_runtime": 4649.5653, + "train_tokens_per_second": 2014.612 + }, + { + "epoch": 0.3472727272727273, + "grad_norm": 0.04093024507164955, + "learning_rate": 9.984690234494339e-05, + "loss": 0.01424380298703909, + "num_input_tokens_seen": 9383448, + "step": 573, + "train_runtime": 4657.6738, + "train_tokens_per_second": 2014.621 + }, + { + "epoch": 0.3478787878787879, + "grad_norm": 0.03236076980829239, + "learning_rate": 9.984614948461358e-05, + "loss": 0.014988360926508904, + "num_input_tokens_seen": 9399824, + "step": 574, + "train_runtime": 4665.7816, + "train_tokens_per_second": 2014.63 + }, + { + "epoch": 0.3484848484848485, + "grad_norm": 0.016026047989726067, + "learning_rate": 9.984539478057178e-05, + "loss": 0.013162180781364441, + "num_input_tokens_seen": 9416200, + "step": 575, + "train_runtime": 4673.8904, + "train_tokens_per_second": 2014.639 + }, + { + "epoch": 0.3490909090909091, + "grad_norm": 0.03273920342326164, + "learning_rate": 9.984463823284589e-05, + "loss": 0.015174154192209244, + "num_input_tokens_seen": 9432576, + "step": 576, + "train_runtime": 4682.0015, + "train_tokens_per_second": 2014.646 + }, + { + "epoch": 0.3496969696969697, + "grad_norm": 0.03933154046535492, + "learning_rate": 9.98438798414639e-05, + "loss": 0.014418127946555614, + "num_input_tokens_seen": 9448952, + "step": 577, + "train_runtime": 4690.1128, + "train_tokens_per_second": 2014.653 + }, + { + "epoch": 0.3503030303030303, + "grad_norm": 0.02570173889398575, + "learning_rate": 9.984311960645388e-05, + "loss": 0.01333607453852892, + "num_input_tokens_seen": 9465328, + "step": 578, + "train_runtime": 4698.2293, + "train_tokens_per_second": 2014.659 + }, + { + "epoch": 0.3509090909090909, + "grad_norm": 0.024147065356373787, + "learning_rate": 9.984235752784392e-05, + "loss": 0.013619362376630306, + "num_input_tokens_seen": 9481704, + "step": 579, + "train_runtime": 4706.3371, + "train_tokens_per_second": 2014.667 + }, + { + "epoch": 0.3515151515151515, + "grad_norm": 0.04005376994609833, + "learning_rate": 9.98415936056622e-05, + "loss": 0.014414026401937008, + "num_input_tokens_seen": 9498080, + "step": 580, + "train_runtime": 4714.4455, + "train_tokens_per_second": 2014.676 + }, + { + "epoch": 0.3521212121212121, + "grad_norm": 0.03428025171160698, + "learning_rate": 9.984082783993703e-05, + "loss": 0.01436635572463274, + "num_input_tokens_seen": 9514456, + "step": 581, + "train_runtime": 4722.5545, + "train_tokens_per_second": 2014.684 + }, + { + "epoch": 0.3527272727272727, + "grad_norm": 0.02205795608460903, + "learning_rate": 9.984006023069666e-05, + "loss": 0.013060957193374634, + "num_input_tokens_seen": 9530832, + "step": 582, + "train_runtime": 4730.6633, + "train_tokens_per_second": 2014.693 + }, + { + "epoch": 0.35333333333333333, + "grad_norm": 0.020862819626927376, + "learning_rate": 9.983929077796954e-05, + "loss": 0.013365531340241432, + "num_input_tokens_seen": 9547208, + "step": 583, + "train_runtime": 4738.7746, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.35393939393939394, + "grad_norm": 0.012693438678979874, + "learning_rate": 9.983851948178412e-05, + "loss": 0.012265143916010857, + "num_input_tokens_seen": 9563584, + "step": 584, + "train_runtime": 4746.884, + "train_tokens_per_second": 2014.708 + }, + { + "epoch": 0.35454545454545455, + "grad_norm": 0.03995286300778389, + "learning_rate": 9.983774634216892e-05, + "loss": 0.014887749217450619, + "num_input_tokens_seen": 9579960, + "step": 585, + "train_runtime": 4754.9935, + "train_tokens_per_second": 2014.716 + }, + { + "epoch": 0.35515151515151516, + "grad_norm": 0.02919401042163372, + "learning_rate": 9.983697135915252e-05, + "loss": 0.01471506617963314, + "num_input_tokens_seen": 9596336, + "step": 586, + "train_runtime": 4763.1041, + "train_tokens_per_second": 2014.723 + }, + { + "epoch": 0.3557575757575758, + "grad_norm": 0.03058960661292076, + "learning_rate": 9.98361945327636e-05, + "loss": 0.014638346619904041, + "num_input_tokens_seen": 9612712, + "step": 587, + "train_runtime": 4771.2155, + "train_tokens_per_second": 2014.73 + }, + { + "epoch": 0.3563636363636364, + "grad_norm": 0.03899887949228287, + "learning_rate": 9.983541586303091e-05, + "loss": 0.015173106454312801, + "num_input_tokens_seen": 9629088, + "step": 588, + "train_runtime": 4779.3321, + "train_tokens_per_second": 2014.735 + }, + { + "epoch": 0.356969696969697, + "grad_norm": 0.34171223640441895, + "learning_rate": 9.983463534998326e-05, + "loss": 0.01584211364388466, + "num_input_tokens_seen": 9645464, + "step": 589, + "train_runtime": 4787.4435, + "train_tokens_per_second": 2014.742 + }, + { + "epoch": 0.3575757575757576, + "grad_norm": 0.025424521416425705, + "learning_rate": 9.983385299364946e-05, + "loss": 0.01455459464341402, + "num_input_tokens_seen": 9661840, + "step": 590, + "train_runtime": 4795.5546, + "train_tokens_per_second": 2014.749 + }, + { + "epoch": 0.35818181818181816, + "grad_norm": 0.032859109342098236, + "learning_rate": 9.98330687940585e-05, + "loss": 0.0146177988499403, + "num_input_tokens_seen": 9678216, + "step": 591, + "train_runtime": 4803.6648, + "train_tokens_per_second": 2014.757 + }, + { + "epoch": 0.35878787878787877, + "grad_norm": 0.038725532591342926, + "learning_rate": 9.983228275123938e-05, + "loss": 0.014792557805776596, + "num_input_tokens_seen": 9694592, + "step": 592, + "train_runtime": 4811.7746, + "train_tokens_per_second": 2014.764 + }, + { + "epoch": 0.3593939393939394, + "grad_norm": 0.020830297842621803, + "learning_rate": 9.983149486522115e-05, + "loss": 0.014553902670741081, + "num_input_tokens_seen": 9710968, + "step": 593, + "train_runtime": 4819.8876, + "train_tokens_per_second": 2014.771 + }, + { + "epoch": 0.36, + "grad_norm": 0.01844129152595997, + "learning_rate": 9.983070513603295e-05, + "loss": 0.014042770490050316, + "num_input_tokens_seen": 9727344, + "step": 594, + "train_runtime": 4827.9961, + "train_tokens_per_second": 2014.779 + }, + { + "epoch": 0.3606060606060606, + "grad_norm": 0.2604560852050781, + "learning_rate": 9.982991356370404e-05, + "loss": 0.01581915095448494, + "num_input_tokens_seen": 9743720, + "step": 595, + "train_runtime": 4836.1086, + "train_tokens_per_second": 2014.785 + }, + { + "epoch": 0.3612121212121212, + "grad_norm": 0.03814680501818657, + "learning_rate": 9.982912014826365e-05, + "loss": 0.016680167987942696, + "num_input_tokens_seen": 9760096, + "step": 596, + "train_runtime": 4844.2153, + "train_tokens_per_second": 2014.794 + }, + { + "epoch": 0.3618181818181818, + "grad_norm": 0.02060728892683983, + "learning_rate": 9.982832488974115e-05, + "loss": 0.014381843619048595, + "num_input_tokens_seen": 9776472, + "step": 597, + "train_runtime": 4852.3306, + "train_tokens_per_second": 2014.799 + }, + { + "epoch": 0.3624242424242424, + "grad_norm": 0.028759043663740158, + "learning_rate": 9.982752778816595e-05, + "loss": 0.014019730500876904, + "num_input_tokens_seen": 9792848, + "step": 598, + "train_runtime": 4860.4404, + "train_tokens_per_second": 2014.807 + }, + { + "epoch": 0.36303030303030304, + "grad_norm": 0.05189267545938492, + "learning_rate": 9.982672884356752e-05, + "loss": 0.01498887874186039, + "num_input_tokens_seen": 9809224, + "step": 599, + "train_runtime": 4868.5548, + "train_tokens_per_second": 2014.812 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.01455398928374052, + "learning_rate": 9.982592805597544e-05, + "loss": 0.011788399890065193, + "num_input_tokens_seen": 9825600, + "step": 600, + "train_runtime": 4876.663, + "train_tokens_per_second": 2014.82 + }, + { + "epoch": 0.36424242424242426, + "grad_norm": 0.046520307660102844, + "learning_rate": 9.982512542541929e-05, + "loss": 0.012856653891503811, + "num_input_tokens_seen": 9841976, + "step": 601, + "train_runtime": 4885.6882, + "train_tokens_per_second": 2014.45 + }, + { + "epoch": 0.36484848484848487, + "grad_norm": 0.017443792894482613, + "learning_rate": 9.98243209519288e-05, + "loss": 0.013804689049720764, + "num_input_tokens_seen": 9858352, + "step": 602, + "train_runtime": 4893.7966, + "train_tokens_per_second": 2014.459 + }, + { + "epoch": 0.3654545454545455, + "grad_norm": 0.016950292512774467, + "learning_rate": 9.98235146355337e-05, + "loss": 0.01243675872683525, + "num_input_tokens_seen": 9874728, + "step": 603, + "train_runtime": 4901.9018, + "train_tokens_per_second": 2014.469 + }, + { + "epoch": 0.3660606060606061, + "grad_norm": 0.017681090161204338, + "learning_rate": 9.982270647626382e-05, + "loss": 0.011940497905015945, + "num_input_tokens_seen": 9891104, + "step": 604, + "train_runtime": 4910.0066, + "train_tokens_per_second": 2014.479 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 0.018248707056045532, + "learning_rate": 9.982189647414906e-05, + "loss": 0.012673230841755867, + "num_input_tokens_seen": 9907480, + "step": 605, + "train_runtime": 4918.1184, + "train_tokens_per_second": 2014.486 + }, + { + "epoch": 0.36727272727272725, + "grad_norm": 0.020540893077850342, + "learning_rate": 9.982108462921937e-05, + "loss": 0.014132777228951454, + "num_input_tokens_seen": 9923856, + "step": 606, + "train_runtime": 4926.2283, + "train_tokens_per_second": 2014.494 + }, + { + "epoch": 0.36787878787878786, + "grad_norm": 0.023124700412154198, + "learning_rate": 9.982027094150478e-05, + "loss": 0.012684160843491554, + "num_input_tokens_seen": 9940232, + "step": 607, + "train_runtime": 4934.3331, + "train_tokens_per_second": 2014.504 + }, + { + "epoch": 0.36848484848484847, + "grad_norm": 0.020409781485795975, + "learning_rate": 9.98194554110354e-05, + "loss": 0.014147626236081123, + "num_input_tokens_seen": 9956608, + "step": 608, + "train_runtime": 4942.4391, + "train_tokens_per_second": 2014.513 + }, + { + "epoch": 0.3690909090909091, + "grad_norm": 0.015636246651411057, + "learning_rate": 9.981863803784136e-05, + "loss": 0.014182131737470627, + "num_input_tokens_seen": 9972984, + "step": 609, + "train_runtime": 4950.5477, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.3696969696969697, + "grad_norm": 0.0192013718187809, + "learning_rate": 9.981781882195292e-05, + "loss": 0.013808063231408596, + "num_input_tokens_seen": 9989360, + "step": 610, + "train_runtime": 4958.6543, + "train_tokens_per_second": 2014.53 + }, + { + "epoch": 0.3703030303030303, + "grad_norm": 0.017762696370482445, + "learning_rate": 9.981699776340039e-05, + "loss": 0.013210650533437729, + "num_input_tokens_seen": 10005736, + "step": 611, + "train_runtime": 4966.7598, + "train_tokens_per_second": 2014.54 + }, + { + "epoch": 0.3709090909090909, + "grad_norm": 0.025030212476849556, + "learning_rate": 9.981617486221413e-05, + "loss": 0.01400088518857956, + "num_input_tokens_seen": 10022112, + "step": 612, + "train_runtime": 4974.8675, + "train_tokens_per_second": 2014.549 + }, + { + "epoch": 0.3715151515151515, + "grad_norm": 0.030215473845601082, + "learning_rate": 9.981535011842456e-05, + "loss": 0.01368585042655468, + "num_input_tokens_seen": 10038488, + "step": 613, + "train_runtime": 4982.9771, + "train_tokens_per_second": 2014.556 + }, + { + "epoch": 0.37212121212121213, + "grad_norm": 0.021045658737421036, + "learning_rate": 9.981452353206222e-05, + "loss": 0.014398960396647453, + "num_input_tokens_seen": 10054864, + "step": 614, + "train_runtime": 4991.0863, + "train_tokens_per_second": 2014.564 + }, + { + "epoch": 0.37272727272727274, + "grad_norm": 0.01661411114037037, + "learning_rate": 9.981369510315764e-05, + "loss": 0.0135966120287776, + "num_input_tokens_seen": 10071240, + "step": 615, + "train_runtime": 4999.1912, + "train_tokens_per_second": 2014.574 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.013987096957862377, + "learning_rate": 9.98128648317415e-05, + "loss": 0.011429902166128159, + "num_input_tokens_seen": 10087616, + "step": 616, + "train_runtime": 5007.2986, + "train_tokens_per_second": 2014.582 + }, + { + "epoch": 0.37393939393939396, + "grad_norm": 0.01872987300157547, + "learning_rate": 9.981203271784449e-05, + "loss": 0.011292507871985435, + "num_input_tokens_seen": 10103992, + "step": 617, + "train_runtime": 5015.406, + "train_tokens_per_second": 2014.591 + }, + { + "epoch": 0.37454545454545457, + "grad_norm": 0.013638158328831196, + "learning_rate": 9.98111987614974e-05, + "loss": 0.012537346221506596, + "num_input_tokens_seen": 10120368, + "step": 618, + "train_runtime": 5023.5134, + "train_tokens_per_second": 2014.6 + }, + { + "epoch": 0.3751515151515151, + "grad_norm": 0.012727733701467514, + "learning_rate": 9.981036296273106e-05, + "loss": 0.012531593441963196, + "num_input_tokens_seen": 10136744, + "step": 619, + "train_runtime": 5031.6191, + "train_tokens_per_second": 2014.609 + }, + { + "epoch": 0.37575757575757573, + "grad_norm": 0.017707858234643936, + "learning_rate": 9.98095253215764e-05, + "loss": 0.012445853091776371, + "num_input_tokens_seen": 10153120, + "step": 620, + "train_runtime": 5039.7288, + "train_tokens_per_second": 2014.616 + }, + { + "epoch": 0.37636363636363634, + "grad_norm": 0.02095656655728817, + "learning_rate": 9.98086858380644e-05, + "loss": 0.01246220339089632, + "num_input_tokens_seen": 10169496, + "step": 621, + "train_runtime": 5047.8343, + "train_tokens_per_second": 2014.626 + }, + { + "epoch": 0.37696969696969695, + "grad_norm": 0.0194542296230793, + "learning_rate": 9.980784451222612e-05, + "loss": 0.012840205803513527, + "num_input_tokens_seen": 10185872, + "step": 622, + "train_runtime": 5055.9398, + "train_tokens_per_second": 2014.635 + }, + { + "epoch": 0.37757575757575756, + "grad_norm": 0.045034874230623245, + "learning_rate": 9.980700134409266e-05, + "loss": 0.01571492850780487, + "num_input_tokens_seen": 10202248, + "step": 623, + "train_runtime": 5064.0515, + "train_tokens_per_second": 2014.641 + }, + { + "epoch": 0.3781818181818182, + "grad_norm": 0.017045883461833, + "learning_rate": 9.980615633369522e-05, + "loss": 0.013137969188392162, + "num_input_tokens_seen": 10218624, + "step": 624, + "train_runtime": 5072.1723, + "train_tokens_per_second": 2014.644 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.01485395897179842, + "learning_rate": 9.980530948106504e-05, + "loss": 0.01213077548891306, + "num_input_tokens_seen": 10235000, + "step": 625, + "train_runtime": 5080.2996, + "train_tokens_per_second": 2014.645 + }, + { + "epoch": 0.3793939393939394, + "grad_norm": 0.014804039150476456, + "learning_rate": 9.980446078623345e-05, + "loss": 0.012899467721581459, + "num_input_tokens_seen": 10251376, + "step": 626, + "train_runtime": 5088.4306, + "train_tokens_per_second": 2014.644 + }, + { + "epoch": 0.38, + "grad_norm": 0.02651570737361908, + "learning_rate": 9.980361024923185e-05, + "loss": 0.012421991676092148, + "num_input_tokens_seen": 10267752, + "step": 627, + "train_runtime": 5096.5501, + "train_tokens_per_second": 2014.648 + }, + { + "epoch": 0.3806060606060606, + "grad_norm": 0.018621394410729408, + "learning_rate": 9.98027578700917e-05, + "loss": 0.01267517451196909, + "num_input_tokens_seen": 10284128, + "step": 628, + "train_runtime": 5104.6689, + "train_tokens_per_second": 2014.651 + }, + { + "epoch": 0.3812121212121212, + "grad_norm": 0.0398629792034626, + "learning_rate": 9.980190364884452e-05, + "loss": 0.014264339581131935, + "num_input_tokens_seen": 10300504, + "step": 629, + "train_runtime": 5112.783, + "train_tokens_per_second": 2014.657 + }, + { + "epoch": 0.38181818181818183, + "grad_norm": 0.014866935089230537, + "learning_rate": 9.98010475855219e-05, + "loss": 0.01269571203738451, + "num_input_tokens_seen": 10316880, + "step": 630, + "train_runtime": 5120.8995, + "train_tokens_per_second": 2014.662 + }, + { + "epoch": 0.38242424242424244, + "grad_norm": 0.02409232407808304, + "learning_rate": 9.980018968015552e-05, + "loss": 0.01351371593773365, + "num_input_tokens_seen": 10333256, + "step": 631, + "train_runtime": 5129.0287, + "train_tokens_per_second": 2014.661 + }, + { + "epoch": 0.38303030303030305, + "grad_norm": 0.01822233758866787, + "learning_rate": 9.979932993277711e-05, + "loss": 0.011882105842232704, + "num_input_tokens_seen": 10349632, + "step": 632, + "train_runtime": 5137.1531, + "train_tokens_per_second": 2014.663 + }, + { + "epoch": 0.3836363636363636, + "grad_norm": 0.030663253739476204, + "learning_rate": 9.979846834341846e-05, + "loss": 0.014444777742028236, + "num_input_tokens_seen": 10366008, + "step": 633, + "train_runtime": 5145.2769, + "train_tokens_per_second": 2014.665 + }, + { + "epoch": 0.3842424242424242, + "grad_norm": 0.013876891694962978, + "learning_rate": 9.979760491211146e-05, + "loss": 0.012167233973741531, + "num_input_tokens_seen": 10382384, + "step": 634, + "train_runtime": 5153.3942, + "train_tokens_per_second": 2014.669 + }, + { + "epoch": 0.38484848484848483, + "grad_norm": 0.03647688776254654, + "learning_rate": 9.979673963888801e-05, + "loss": 0.013262891210615635, + "num_input_tokens_seen": 10398760, + "step": 635, + "train_runtime": 5161.5119, + "train_tokens_per_second": 2014.673 + }, + { + "epoch": 0.38545454545454544, + "grad_norm": 0.02617211639881134, + "learning_rate": 9.979587252378013e-05, + "loss": 0.014726457186043262, + "num_input_tokens_seen": 10415136, + "step": 636, + "train_runtime": 5169.6294, + "train_tokens_per_second": 2014.678 + }, + { + "epoch": 0.38606060606060605, + "grad_norm": 0.01650061085820198, + "learning_rate": 9.979500356681992e-05, + "loss": 0.014401402324438095, + "num_input_tokens_seen": 10431512, + "step": 637, + "train_runtime": 5177.7469, + "train_tokens_per_second": 2014.682 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.017912236973643303, + "learning_rate": 9.979413276803948e-05, + "loss": 0.011410839855670929, + "num_input_tokens_seen": 10447888, + "step": 638, + "train_runtime": 5185.8616, + "train_tokens_per_second": 2014.687 + }, + { + "epoch": 0.38727272727272727, + "grad_norm": 0.02133595198392868, + "learning_rate": 9.979326012747106e-05, + "loss": 0.01264719758182764, + "num_input_tokens_seen": 10464264, + "step": 639, + "train_runtime": 5193.9789, + "train_tokens_per_second": 2014.691 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 0.011059283278882504, + "learning_rate": 9.97923856451469e-05, + "loss": 0.011714452877640724, + "num_input_tokens_seen": 10480640, + "step": 640, + "train_runtime": 5202.0952, + "train_tokens_per_second": 2014.696 + }, + { + "epoch": 0.3884848484848485, + "grad_norm": 0.01679043099284172, + "learning_rate": 9.979150932109937e-05, + "loss": 0.012356593273580074, + "num_input_tokens_seen": 10497016, + "step": 641, + "train_runtime": 5210.2129, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.3890909090909091, + "grad_norm": 0.017658302560448647, + "learning_rate": 9.979063115536086e-05, + "loss": 0.014303645119071007, + "num_input_tokens_seen": 10513392, + "step": 642, + "train_runtime": 5218.3299, + "train_tokens_per_second": 2014.704 + }, + { + "epoch": 0.3896969696969697, + "grad_norm": 0.037931594997644424, + "learning_rate": 9.978975114796389e-05, + "loss": 0.015233817510306835, + "num_input_tokens_seen": 10529768, + "step": 643, + "train_runtime": 5226.4474, + "train_tokens_per_second": 2014.709 + }, + { + "epoch": 0.3903030303030303, + "grad_norm": 0.024847477674484253, + "learning_rate": 9.978886929894096e-05, + "loss": 0.011363557539880276, + "num_input_tokens_seen": 10546144, + "step": 644, + "train_runtime": 5234.5646, + "train_tokens_per_second": 2014.713 + }, + { + "epoch": 0.39090909090909093, + "grad_norm": 0.025633033365011215, + "learning_rate": 9.978798560832474e-05, + "loss": 0.01591489464044571, + "num_input_tokens_seen": 10562520, + "step": 645, + "train_runtime": 5242.6796, + "train_tokens_per_second": 2014.718 + }, + { + "epoch": 0.39151515151515154, + "grad_norm": 0.01618288829922676, + "learning_rate": 9.978710007614786e-05, + "loss": 0.012586476281285286, + "num_input_tokens_seen": 10578896, + "step": 646, + "train_runtime": 5250.7993, + "train_tokens_per_second": 2014.721 + }, + { + "epoch": 0.39212121212121215, + "grad_norm": 0.02201761119067669, + "learning_rate": 9.978621270244313e-05, + "loss": 0.015117557719349861, + "num_input_tokens_seen": 10595272, + "step": 647, + "train_runtime": 5258.9174, + "train_tokens_per_second": 2014.725 + }, + { + "epoch": 0.3927272727272727, + "grad_norm": 0.0371362566947937, + "learning_rate": 9.978532348724335e-05, + "loss": 0.014719461090862751, + "num_input_tokens_seen": 10611648, + "step": 648, + "train_runtime": 5267.0377, + "train_tokens_per_second": 2014.728 + }, + { + "epoch": 0.3933333333333333, + "grad_norm": 0.02168305590748787, + "learning_rate": 9.978443243058139e-05, + "loss": 0.01353619247674942, + "num_input_tokens_seen": 10628024, + "step": 649, + "train_runtime": 5275.1562, + "train_tokens_per_second": 2014.732 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 0.019228238612413406, + "learning_rate": 9.978353953249022e-05, + "loss": 0.013856697827577591, + "num_input_tokens_seen": 10644400, + "step": 650, + "train_runtime": 5283.2715, + "train_tokens_per_second": 2014.737 + }, + { + "epoch": 0.39454545454545453, + "grad_norm": 0.027308976277709007, + "learning_rate": 9.978264479300289e-05, + "loss": 0.013041336089372635, + "num_input_tokens_seen": 10660776, + "step": 651, + "train_runtime": 5291.3911, + "train_tokens_per_second": 2014.74 + }, + { + "epoch": 0.39515151515151514, + "grad_norm": 0.016961168497800827, + "learning_rate": 9.978174821215247e-05, + "loss": 0.012095801532268524, + "num_input_tokens_seen": 10677152, + "step": 652, + "train_runtime": 5299.5022, + "train_tokens_per_second": 2014.746 + }, + { + "epoch": 0.39575757575757575, + "grad_norm": 0.030550425872206688, + "learning_rate": 9.978084978997212e-05, + "loss": 0.014912940561771393, + "num_input_tokens_seen": 10693528, + "step": 653, + "train_runtime": 5307.6115, + "train_tokens_per_second": 2014.753 + }, + { + "epoch": 0.39636363636363636, + "grad_norm": 0.035802144557237625, + "learning_rate": 9.977994952649509e-05, + "loss": 0.014338945969939232, + "num_input_tokens_seen": 10709904, + "step": 654, + "train_runtime": 5315.7289, + "train_tokens_per_second": 2014.757 + }, + { + "epoch": 0.396969696969697, + "grad_norm": 0.016549181193113327, + "learning_rate": 9.977904742175466e-05, + "loss": 0.013156197033822536, + "num_input_tokens_seen": 10726280, + "step": 655, + "train_runtime": 5323.8353, + "train_tokens_per_second": 2014.766 + }, + { + "epoch": 0.3975757575757576, + "grad_norm": 0.020908519625663757, + "learning_rate": 9.977814347578421e-05, + "loss": 0.012832121923565865, + "num_input_tokens_seen": 10742656, + "step": 656, + "train_runtime": 5331.9419, + "train_tokens_per_second": 2014.774 + }, + { + "epoch": 0.3981818181818182, + "grad_norm": 0.0449579656124115, + "learning_rate": 9.977723768861718e-05, + "loss": 0.011967733502388, + "num_input_tokens_seen": 10759032, + "step": 657, + "train_runtime": 5340.0518, + "train_tokens_per_second": 2014.78 + }, + { + "epoch": 0.3987878787878788, + "grad_norm": 0.01602446660399437, + "learning_rate": 9.977633006028706e-05, + "loss": 0.012816080823540688, + "num_input_tokens_seen": 10775408, + "step": 658, + "train_runtime": 5348.1597, + "train_tokens_per_second": 2014.788 + }, + { + "epoch": 0.3993939393939394, + "grad_norm": 0.028448155149817467, + "learning_rate": 9.977542059082742e-05, + "loss": 0.014847241342067719, + "num_input_tokens_seen": 10791784, + "step": 659, + "train_runtime": 5356.2671, + "train_tokens_per_second": 2014.796 + }, + { + "epoch": 0.4, + "grad_norm": 0.011783472262322903, + "learning_rate": 9.977450928027191e-05, + "loss": 0.013164190575480461, + "num_input_tokens_seen": 10808160, + "step": 660, + "train_runtime": 5364.3761, + "train_tokens_per_second": 2014.803 + }, + { + "epoch": 0.40060606060606063, + "grad_norm": 0.026984520256519318, + "learning_rate": 9.977359612865423e-05, + "loss": 0.013657883740961552, + "num_input_tokens_seen": 10824536, + "step": 661, + "train_runtime": 5372.4863, + "train_tokens_per_second": 2014.809 + }, + { + "epoch": 0.4012121212121212, + "grad_norm": 0.022077390924096107, + "learning_rate": 9.977268113600817e-05, + "loss": 0.014578605070710182, + "num_input_tokens_seen": 10840912, + "step": 662, + "train_runtime": 5380.5934, + "train_tokens_per_second": 2014.817 + }, + { + "epoch": 0.4018181818181818, + "grad_norm": 0.01575160026550293, + "learning_rate": 9.977176430236755e-05, + "loss": 0.013932663947343826, + "num_input_tokens_seen": 10857288, + "step": 663, + "train_runtime": 5388.7195, + "train_tokens_per_second": 2014.818 + }, + { + "epoch": 0.4024242424242424, + "grad_norm": 0.029406050220131874, + "learning_rate": 9.977084562776631e-05, + "loss": 0.015834983438253403, + "num_input_tokens_seen": 10873664, + "step": 664, + "train_runtime": 5396.8304, + "train_tokens_per_second": 2014.824 + }, + { + "epoch": 0.403030303030303, + "grad_norm": 0.028436392545700073, + "learning_rate": 9.976992511223839e-05, + "loss": 0.014038406312465668, + "num_input_tokens_seen": 10890040, + "step": 665, + "train_runtime": 5404.9444, + "train_tokens_per_second": 2014.829 + }, + { + "epoch": 0.4036363636363636, + "grad_norm": 0.029235292226076126, + "learning_rate": 9.976900275581789e-05, + "loss": 0.015379410237073898, + "num_input_tokens_seen": 10906416, + "step": 666, + "train_runtime": 5413.0528, + "train_tokens_per_second": 2014.836 + }, + { + "epoch": 0.40424242424242424, + "grad_norm": 0.03774306923151016, + "learning_rate": 9.976807855853886e-05, + "loss": 0.014895454980432987, + "num_input_tokens_seen": 10922792, + "step": 667, + "train_runtime": 5421.159, + "train_tokens_per_second": 2014.844 + }, + { + "epoch": 0.40484848484848485, + "grad_norm": 0.01916997693479061, + "learning_rate": 9.976715252043555e-05, + "loss": 0.0143886748701334, + "num_input_tokens_seen": 10939168, + "step": 668, + "train_runtime": 5429.2675, + "train_tokens_per_second": 2014.852 + }, + { + "epoch": 0.40545454545454546, + "grad_norm": 0.021564677357673645, + "learning_rate": 9.976622464154219e-05, + "loss": 0.013210933655500412, + "num_input_tokens_seen": 10955544, + "step": 669, + "train_runtime": 5437.3809, + "train_tokens_per_second": 2014.857 + }, + { + "epoch": 0.40606060606060607, + "grad_norm": 0.02249998040497303, + "learning_rate": 9.976529492189309e-05, + "loss": 0.013446344994008541, + "num_input_tokens_seen": 10971920, + "step": 670, + "train_runtime": 5445.4997, + "train_tokens_per_second": 2014.86 + }, + { + "epoch": 0.4066666666666667, + "grad_norm": 0.03089592047035694, + "learning_rate": 9.976436336152265e-05, + "loss": 0.014300989918410778, + "num_input_tokens_seen": 10988296, + "step": 671, + "train_runtime": 5453.615, + "train_tokens_per_second": 2014.865 + }, + { + "epoch": 0.4072727272727273, + "grad_norm": 0.01742340438067913, + "learning_rate": 9.976342996046532e-05, + "loss": 0.012858121655881405, + "num_input_tokens_seen": 11004672, + "step": 672, + "train_runtime": 5461.7321, + "train_tokens_per_second": 2014.869 + }, + { + "epoch": 0.4078787878787879, + "grad_norm": 0.0165674090385437, + "learning_rate": 9.976249471875561e-05, + "loss": 0.013976114802062511, + "num_input_tokens_seen": 11021048, + "step": 673, + "train_runtime": 5469.8479, + "train_tokens_per_second": 2014.873 + }, + { + "epoch": 0.4084848484848485, + "grad_norm": 0.013970437459647655, + "learning_rate": 9.976155763642813e-05, + "loss": 0.013127206824719906, + "num_input_tokens_seen": 11037424, + "step": 674, + "train_runtime": 5477.9644, + "train_tokens_per_second": 2014.877 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 0.028073744848370552, + "learning_rate": 9.976061871351756e-05, + "loss": 0.013469989411532879, + "num_input_tokens_seen": 11053800, + "step": 675, + "train_runtime": 5486.0804, + "train_tokens_per_second": 2014.881 + }, + { + "epoch": 0.40969696969696967, + "grad_norm": 0.02016565017402172, + "learning_rate": 9.975967795005859e-05, + "loss": 0.013997921720147133, + "num_input_tokens_seen": 11070176, + "step": 676, + "train_runtime": 5494.197, + "train_tokens_per_second": 2014.885 + }, + { + "epoch": 0.4103030303030303, + "grad_norm": 0.01767519861459732, + "learning_rate": 9.975873534608604e-05, + "loss": 0.013824408873915672, + "num_input_tokens_seen": 11086552, + "step": 677, + "train_runtime": 5502.3132, + "train_tokens_per_second": 2014.889 + }, + { + "epoch": 0.4109090909090909, + "grad_norm": 0.02294917404651642, + "learning_rate": 9.975779090163478e-05, + "loss": 0.013364237733185291, + "num_input_tokens_seen": 11102928, + "step": 678, + "train_runtime": 5510.4298, + "train_tokens_per_second": 2014.893 + }, + { + "epoch": 0.4115151515151515, + "grad_norm": 0.015453618951141834, + "learning_rate": 9.975684461673972e-05, + "loss": 0.011895030736923218, + "num_input_tokens_seen": 11119304, + "step": 679, + "train_runtime": 5518.5467, + "train_tokens_per_second": 2014.897 + }, + { + "epoch": 0.4121212121212121, + "grad_norm": 0.02744610421359539, + "learning_rate": 9.975589649143588e-05, + "loss": 0.01399244274944067, + "num_input_tokens_seen": 11135680, + "step": 680, + "train_runtime": 5526.6634, + "train_tokens_per_second": 2014.901 + }, + { + "epoch": 0.4127272727272727, + "grad_norm": 0.0141525249928236, + "learning_rate": 9.975494652575832e-05, + "loss": 0.012226445600390434, + "num_input_tokens_seen": 11152056, + "step": 681, + "train_runtime": 5534.7831, + "train_tokens_per_second": 2014.904 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.05674010142683983, + "learning_rate": 9.975399471974218e-05, + "loss": 0.013092868961393833, + "num_input_tokens_seen": 11168432, + "step": 682, + "train_runtime": 5542.8995, + "train_tokens_per_second": 2014.908 + }, + { + "epoch": 0.41393939393939394, + "grad_norm": 0.014718937687575817, + "learning_rate": 9.975304107342268e-05, + "loss": 0.012982090935111046, + "num_input_tokens_seen": 11184808, + "step": 683, + "train_runtime": 5551.0179, + "train_tokens_per_second": 2014.911 + }, + { + "epoch": 0.41454545454545455, + "grad_norm": 0.017596984282135963, + "learning_rate": 9.975208558683508e-05, + "loss": 0.013058310374617577, + "num_input_tokens_seen": 11201184, + "step": 684, + "train_runtime": 5559.1335, + "train_tokens_per_second": 2014.915 + }, + { + "epoch": 0.41515151515151516, + "grad_norm": 0.05556584894657135, + "learning_rate": 9.975112826001471e-05, + "loss": 0.013223481364548206, + "num_input_tokens_seen": 11217560, + "step": 685, + "train_runtime": 5567.2483, + "train_tokens_per_second": 2014.92 + }, + { + "epoch": 0.41575757575757577, + "grad_norm": 0.039875905960798264, + "learning_rate": 9.9750169092997e-05, + "loss": 0.016192132607102394, + "num_input_tokens_seen": 11233936, + "step": 686, + "train_runtime": 5575.3629, + "train_tokens_per_second": 2014.925 + }, + { + "epoch": 0.4163636363636364, + "grad_norm": 0.04174409061670303, + "learning_rate": 9.97492080858174e-05, + "loss": 0.013733956962823868, + "num_input_tokens_seen": 11250312, + "step": 687, + "train_runtime": 5583.4823, + "train_tokens_per_second": 2014.928 + }, + { + "epoch": 0.416969696969697, + "grad_norm": 0.018462834879755974, + "learning_rate": 9.97482452385115e-05, + "loss": 0.011940184980630875, + "num_input_tokens_seen": 11266688, + "step": 688, + "train_runtime": 5591.5984, + "train_tokens_per_second": 2014.932 + }, + { + "epoch": 0.4175757575757576, + "grad_norm": 0.021226534619927406, + "learning_rate": 9.974728055111487e-05, + "loss": 0.013460342772305012, + "num_input_tokens_seen": 11283064, + "step": 689, + "train_runtime": 5599.7136, + "train_tokens_per_second": 2014.936 + }, + { + "epoch": 0.41818181818181815, + "grad_norm": 0.017722534015774727, + "learning_rate": 9.974631402366322e-05, + "loss": 0.013120359741151333, + "num_input_tokens_seen": 11299440, + "step": 690, + "train_runtime": 5607.8302, + "train_tokens_per_second": 2014.94 + }, + { + "epoch": 0.41878787878787876, + "grad_norm": 0.04932510480284691, + "learning_rate": 9.97453456561923e-05, + "loss": 0.014747078530490398, + "num_input_tokens_seen": 11315816, + "step": 691, + "train_runtime": 5615.9471, + "train_tokens_per_second": 2014.943 + }, + { + "epoch": 0.4193939393939394, + "grad_norm": 0.014801602810621262, + "learning_rate": 9.974437544873791e-05, + "loss": 0.012634863145649433, + "num_input_tokens_seen": 11332192, + "step": 692, + "train_runtime": 5624.0643, + "train_tokens_per_second": 2014.947 + }, + { + "epoch": 0.42, + "grad_norm": 0.01846308819949627, + "learning_rate": 9.974340340133595e-05, + "loss": 0.013980153016746044, + "num_input_tokens_seen": 11348568, + "step": 693, + "train_runtime": 5632.1816, + "train_tokens_per_second": 2014.951 + }, + { + "epoch": 0.4206060606060606, + "grad_norm": 0.022268032655119896, + "learning_rate": 9.974242951402235e-05, + "loss": 0.013369940221309662, + "num_input_tokens_seen": 11364944, + "step": 694, + "train_runtime": 5640.2993, + "train_tokens_per_second": 2014.954 + }, + { + "epoch": 0.4212121212121212, + "grad_norm": 0.017928361892700195, + "learning_rate": 9.974145378683318e-05, + "loss": 0.012236877344548702, + "num_input_tokens_seen": 11381320, + "step": 695, + "train_runtime": 5648.4187, + "train_tokens_per_second": 2014.957 + }, + { + "epoch": 0.4218181818181818, + "grad_norm": 0.026991484686732292, + "learning_rate": 9.974047621980447e-05, + "loss": 0.013161352835595608, + "num_input_tokens_seen": 11397696, + "step": 696, + "train_runtime": 5656.5432, + "train_tokens_per_second": 2014.958 + }, + { + "epoch": 0.4224242424242424, + "grad_norm": 0.016671424731612206, + "learning_rate": 9.973949681297244e-05, + "loss": 0.013532438315451145, + "num_input_tokens_seen": 11414072, + "step": 697, + "train_runtime": 5664.6671, + "train_tokens_per_second": 2014.959 + }, + { + "epoch": 0.42303030303030303, + "grad_norm": 0.04440519958734512, + "learning_rate": 9.973851556637326e-05, + "loss": 0.014023078605532646, + "num_input_tokens_seen": 11430448, + "step": 698, + "train_runtime": 5672.7922, + "train_tokens_per_second": 2014.96 + }, + { + "epoch": 0.42363636363636364, + "grad_norm": 0.01818687841296196, + "learning_rate": 9.973753248004326e-05, + "loss": 0.012776060961186886, + "num_input_tokens_seen": 11446824, + "step": 699, + "train_runtime": 5680.9115, + "train_tokens_per_second": 2014.963 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 0.03709911182522774, + "learning_rate": 9.97365475540188e-05, + "loss": 0.013938689604401588, + "num_input_tokens_seen": 11463200, + "step": 700, + "train_runtime": 5689.0323, + "train_tokens_per_second": 2014.965 + }, + { + "epoch": 0.42484848484848486, + "grad_norm": 0.02871977910399437, + "learning_rate": 9.97355607883363e-05, + "loss": 0.015867041423916817, + "num_input_tokens_seen": 11479576, + "step": 701, + "train_runtime": 5698.2647, + "train_tokens_per_second": 2014.574 + }, + { + "epoch": 0.4254545454545455, + "grad_norm": 0.023145193234086037, + "learning_rate": 9.973457218303226e-05, + "loss": 0.01401555072516203, + "num_input_tokens_seen": 11495952, + "step": 702, + "train_runtime": 5706.3816, + "train_tokens_per_second": 2014.578 + }, + { + "epoch": 0.4260606060606061, + "grad_norm": 0.015238692052662373, + "learning_rate": 9.973358173814324e-05, + "loss": 0.01140027865767479, + "num_input_tokens_seen": 11512328, + "step": 703, + "train_runtime": 5714.5032, + "train_tokens_per_second": 2014.581 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.017513658851385117, + "learning_rate": 9.97325894537059e-05, + "loss": 0.01290590688586235, + "num_input_tokens_seen": 11528704, + "step": 704, + "train_runtime": 5722.6292, + "train_tokens_per_second": 2014.582 + }, + { + "epoch": 0.42727272727272725, + "grad_norm": 0.02398119866847992, + "learning_rate": 9.973159532975691e-05, + "loss": 0.013042651116847992, + "num_input_tokens_seen": 11545080, + "step": 705, + "train_runtime": 5730.753, + "train_tokens_per_second": 2014.583 + }, + { + "epoch": 0.42787878787878786, + "grad_norm": 0.01669715717434883, + "learning_rate": 9.973059936633306e-05, + "loss": 0.011862633749842644, + "num_input_tokens_seen": 11561456, + "step": 706, + "train_runtime": 5738.8701, + "train_tokens_per_second": 2014.588 + }, + { + "epoch": 0.42848484848484847, + "grad_norm": 0.0743919089436531, + "learning_rate": 9.97296015634712e-05, + "loss": 0.012939982116222382, + "num_input_tokens_seen": 11577832, + "step": 707, + "train_runtime": 5746.9879, + "train_tokens_per_second": 2014.591 + }, + { + "epoch": 0.4290909090909091, + "grad_norm": 0.014302635565400124, + "learning_rate": 9.972860192120821e-05, + "loss": 0.01308290846645832, + "num_input_tokens_seen": 11594208, + "step": 708, + "train_runtime": 5755.1051, + "train_tokens_per_second": 2014.595 + }, + { + "epoch": 0.4296969696969697, + "grad_norm": 0.03461941331624985, + "learning_rate": 9.972760043958109e-05, + "loss": 0.01451612077653408, + "num_input_tokens_seen": 11610584, + "step": 709, + "train_runtime": 5763.2288, + "train_tokens_per_second": 2014.597 + }, + { + "epoch": 0.4303030303030303, + "grad_norm": 0.026271218433976173, + "learning_rate": 9.972659711862687e-05, + "loss": 0.012233047746121883, + "num_input_tokens_seen": 11626960, + "step": 710, + "train_runtime": 5771.3444, + "train_tokens_per_second": 2014.602 + }, + { + "epoch": 0.4309090909090909, + "grad_norm": 0.03146032616496086, + "learning_rate": 9.972559195838263e-05, + "loss": 0.012203723192214966, + "num_input_tokens_seen": 11643336, + "step": 711, + "train_runtime": 5779.4615, + "train_tokens_per_second": 2014.606 + }, + { + "epoch": 0.4315151515151515, + "grad_norm": 0.023236479610204697, + "learning_rate": 9.97245849588856e-05, + "loss": 0.014339424669742584, + "num_input_tokens_seen": 11659712, + "step": 712, + "train_runtime": 5787.5789, + "train_tokens_per_second": 2014.61 + }, + { + "epoch": 0.43212121212121213, + "grad_norm": 0.016745924949645996, + "learning_rate": 9.972357612017302e-05, + "loss": 0.012629512697458267, + "num_input_tokens_seen": 11676088, + "step": 713, + "train_runtime": 5795.6981, + "train_tokens_per_second": 2014.613 + }, + { + "epoch": 0.43272727272727274, + "grad_norm": 0.028602320700883865, + "learning_rate": 9.972256544228217e-05, + "loss": 0.01239441242069006, + "num_input_tokens_seen": 11692464, + "step": 714, + "train_runtime": 5803.8136, + "train_tokens_per_second": 2014.617 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 0.04347382113337517, + "learning_rate": 9.972155292525046e-05, + "loss": 0.013399597257375717, + "num_input_tokens_seen": 11708840, + "step": 715, + "train_runtime": 5811.9326, + "train_tokens_per_second": 2014.621 + }, + { + "epoch": 0.43393939393939396, + "grad_norm": 0.027413364499807358, + "learning_rate": 9.972053856911534e-05, + "loss": 0.014752673916518688, + "num_input_tokens_seen": 11725216, + "step": 716, + "train_runtime": 5820.0498, + "train_tokens_per_second": 2014.625 + }, + { + "epoch": 0.43454545454545457, + "grad_norm": 0.034208860248327255, + "learning_rate": 9.971952237391433e-05, + "loss": 0.013670345768332481, + "num_input_tokens_seen": 11741592, + "step": 717, + "train_runtime": 5828.1669, + "train_tokens_per_second": 2014.629 + }, + { + "epoch": 0.4351515151515152, + "grad_norm": 0.08834357559680939, + "learning_rate": 9.971850433968499e-05, + "loss": 0.01636839471757412, + "num_input_tokens_seen": 11757968, + "step": 718, + "train_runtime": 5836.2889, + "train_tokens_per_second": 2014.631 + }, + { + "epoch": 0.43575757575757573, + "grad_norm": 0.09180225431919098, + "learning_rate": 9.971748446646503e-05, + "loss": 0.013547438196837902, + "num_input_tokens_seen": 11774344, + "step": 719, + "train_runtime": 5844.4057, + "train_tokens_per_second": 2014.635 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 0.021431786939501762, + "learning_rate": 9.971646275429211e-05, + "loss": 0.014424419030547142, + "num_input_tokens_seen": 11790720, + "step": 720, + "train_runtime": 5852.5291, + "train_tokens_per_second": 2014.637 + }, + { + "epoch": 0.43696969696969695, + "grad_norm": 0.014504344202578068, + "learning_rate": 9.971543920320407e-05, + "loss": 0.012794758193194866, + "num_input_tokens_seen": 11807096, + "step": 721, + "train_runtime": 5860.6452, + "train_tokens_per_second": 2014.641 + }, + { + "epoch": 0.43757575757575756, + "grad_norm": 0.04303886368870735, + "learning_rate": 9.971441381323874e-05, + "loss": 0.014037848450243473, + "num_input_tokens_seen": 11823472, + "step": 722, + "train_runtime": 5868.7615, + "train_tokens_per_second": 2014.645 + }, + { + "epoch": 0.4381818181818182, + "grad_norm": 0.028946641832590103, + "learning_rate": 9.971338658443406e-05, + "loss": 0.012954017147421837, + "num_input_tokens_seen": 11839848, + "step": 723, + "train_runtime": 5876.878, + "train_tokens_per_second": 2014.649 + }, + { + "epoch": 0.4387878787878788, + "grad_norm": 0.02165861800312996, + "learning_rate": 9.971235751682802e-05, + "loss": 0.012219181284308434, + "num_input_tokens_seen": 11856224, + "step": 724, + "train_runtime": 5884.9934, + "train_tokens_per_second": 2014.654 + }, + { + "epoch": 0.4393939393939394, + "grad_norm": 0.023574933409690857, + "learning_rate": 9.971132661045868e-05, + "loss": 0.014860106632113457, + "num_input_tokens_seen": 11872600, + "step": 725, + "train_runtime": 5893.1105, + "train_tokens_per_second": 2014.658 + }, + { + "epoch": 0.44, + "grad_norm": 0.05360223352909088, + "learning_rate": 9.971029386536419e-05, + "loss": 0.014855952933430672, + "num_input_tokens_seen": 11888976, + "step": 726, + "train_runtime": 5901.2285, + "train_tokens_per_second": 2014.661 + }, + { + "epoch": 0.4406060606060606, + "grad_norm": 0.03671532869338989, + "learning_rate": 9.970925928158274e-05, + "loss": 0.015136584639549255, + "num_input_tokens_seen": 11905352, + "step": 727, + "train_runtime": 5909.3465, + "train_tokens_per_second": 2014.665 + }, + { + "epoch": 0.4412121212121212, + "grad_norm": 0.012548093684017658, + "learning_rate": 9.970822285915257e-05, + "loss": 0.012122916989028454, + "num_input_tokens_seen": 11921728, + "step": 728, + "train_runtime": 5917.4638, + "train_tokens_per_second": 2014.669 + }, + { + "epoch": 0.44181818181818183, + "grad_norm": 0.02257922850549221, + "learning_rate": 9.970718459811206e-05, + "loss": 0.013802756555378437, + "num_input_tokens_seen": 11938104, + "step": 729, + "train_runtime": 5925.5783, + "train_tokens_per_second": 2014.673 + }, + { + "epoch": 0.44242424242424244, + "grad_norm": 0.014075133018195629, + "learning_rate": 9.97061444984996e-05, + "loss": 0.012838860973715782, + "num_input_tokens_seen": 11954480, + "step": 730, + "train_runtime": 5933.6937, + "train_tokens_per_second": 2014.678 + }, + { + "epoch": 0.44303030303030305, + "grad_norm": 0.022020021453499794, + "learning_rate": 9.970510256035364e-05, + "loss": 0.01375649869441986, + "num_input_tokens_seen": 11970856, + "step": 731, + "train_runtime": 5941.8106, + "train_tokens_per_second": 2014.682 + }, + { + "epoch": 0.44363636363636366, + "grad_norm": 0.01787860319018364, + "learning_rate": 9.970405878371273e-05, + "loss": 0.012008238583803177, + "num_input_tokens_seen": 11987232, + "step": 732, + "train_runtime": 5949.9292, + "train_tokens_per_second": 2014.685 + }, + { + "epoch": 0.4442424242424242, + "grad_norm": 0.019049983471632004, + "learning_rate": 9.970301316861548e-05, + "loss": 0.012502388097345829, + "num_input_tokens_seen": 12003608, + "step": 733, + "train_runtime": 5958.0503, + "train_tokens_per_second": 2014.687 + }, + { + "epoch": 0.4448484848484848, + "grad_norm": 0.02835710346698761, + "learning_rate": 9.970196571510057e-05, + "loss": 0.012223845347762108, + "num_input_tokens_seen": 12019984, + "step": 734, + "train_runtime": 5966.1707, + "train_tokens_per_second": 2014.69 + }, + { + "epoch": 0.44545454545454544, + "grad_norm": 0.04534858092665672, + "learning_rate": 9.970091642320674e-05, + "loss": 0.01531003974378109, + "num_input_tokens_seen": 12036360, + "step": 735, + "train_runtime": 5974.2918, + "train_tokens_per_second": 2014.692 + }, + { + "epoch": 0.44606060606060605, + "grad_norm": 0.02770829014480114, + "learning_rate": 9.96998652929728e-05, + "loss": 0.014202866703271866, + "num_input_tokens_seen": 12052736, + "step": 736, + "train_runtime": 5982.4163, + "train_tokens_per_second": 2014.694 + }, + { + "epoch": 0.44666666666666666, + "grad_norm": 0.01627975143492222, + "learning_rate": 9.969881232443761e-05, + "loss": 0.013593195006251335, + "num_input_tokens_seen": 12069112, + "step": 737, + "train_runtime": 5990.5422, + "train_tokens_per_second": 2014.694 + }, + { + "epoch": 0.44727272727272727, + "grad_norm": 0.02013089507818222, + "learning_rate": 9.969775751764015e-05, + "loss": 0.012935129925608635, + "num_input_tokens_seen": 12085488, + "step": 738, + "train_runtime": 5998.6638, + "train_tokens_per_second": 2014.697 + }, + { + "epoch": 0.4478787878787879, + "grad_norm": 0.03128223493695259, + "learning_rate": 9.969670087261942e-05, + "loss": 0.014752635732293129, + "num_input_tokens_seen": 12101864, + "step": 739, + "train_runtime": 6006.7832, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.4484848484848485, + "grad_norm": 0.08356563001871109, + "learning_rate": 9.969564238941452e-05, + "loss": 0.012013277038931847, + "num_input_tokens_seen": 12118240, + "step": 740, + "train_runtime": 6014.9037, + "train_tokens_per_second": 2014.702 + }, + { + "epoch": 0.4490909090909091, + "grad_norm": 0.04240264743566513, + "learning_rate": 9.969458206806456e-05, + "loss": 0.013846787624061108, + "num_input_tokens_seen": 12134616, + "step": 741, + "train_runtime": 6023.0287, + "train_tokens_per_second": 2014.703 + }, + { + "epoch": 0.4496969696969697, + "grad_norm": 0.020833732560276985, + "learning_rate": 9.96935199086088e-05, + "loss": 0.014301668852567673, + "num_input_tokens_seen": 12150992, + "step": 742, + "train_runtime": 6031.1472, + "train_tokens_per_second": 2014.707 + }, + { + "epoch": 0.4503030303030303, + "grad_norm": 0.021045729517936707, + "learning_rate": 9.969245591108652e-05, + "loss": 0.013184930197894573, + "num_input_tokens_seen": 12167368, + "step": 743, + "train_runtime": 6039.2669, + "train_tokens_per_second": 2014.709 + }, + { + "epoch": 0.4509090909090909, + "grad_norm": 0.014139235951006413, + "learning_rate": 9.969139007553705e-05, + "loss": 0.013327041640877724, + "num_input_tokens_seen": 12183744, + "step": 744, + "train_runtime": 6047.3846, + "train_tokens_per_second": 2014.713 + }, + { + "epoch": 0.45151515151515154, + "grad_norm": 0.7923178672790527, + "learning_rate": 9.969032240199983e-05, + "loss": 0.012914719060063362, + "num_input_tokens_seen": 12200120, + "step": 745, + "train_runtime": 6055.5018, + "train_tokens_per_second": 2014.717 + }, + { + "epoch": 0.45212121212121215, + "grad_norm": 0.033203721046447754, + "learning_rate": 9.968925289051436e-05, + "loss": 0.013039352372288704, + "num_input_tokens_seen": 12216496, + "step": 746, + "train_runtime": 6063.6194, + "train_tokens_per_second": 2014.72 + }, + { + "epoch": 0.4527272727272727, + "grad_norm": 0.02019328624010086, + "learning_rate": 9.96881815411202e-05, + "loss": 0.012438374571502209, + "num_input_tokens_seen": 12232872, + "step": 747, + "train_runtime": 6071.7363, + "train_tokens_per_second": 2014.724 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.03482190519571304, + "learning_rate": 9.968710835385696e-05, + "loss": 0.015620945952832699, + "num_input_tokens_seen": 12249248, + "step": 748, + "train_runtime": 6079.8541, + "train_tokens_per_second": 2014.727 + }, + { + "epoch": 0.4539393939393939, + "grad_norm": 0.053270891308784485, + "learning_rate": 9.968603332876434e-05, + "loss": 0.012819363735616207, + "num_input_tokens_seen": 12265624, + "step": 749, + "train_runtime": 6087.9704, + "train_tokens_per_second": 2014.731 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.013719640672206879, + "learning_rate": 9.968495646588211e-05, + "loss": 0.013314586132764816, + "num_input_tokens_seen": 12282000, + "step": 750, + "train_runtime": 6096.0915, + "train_tokens_per_second": 2014.734 + }, + { + "epoch": 0.45515151515151514, + "grad_norm": 0.020413396880030632, + "learning_rate": 9.96838777652501e-05, + "loss": 0.012559941038489342, + "num_input_tokens_seen": 12298376, + "step": 751, + "train_runtime": 6104.2072, + "train_tokens_per_second": 2014.738 + }, + { + "epoch": 0.45575757575757575, + "grad_norm": 0.02567451260983944, + "learning_rate": 9.968279722690819e-05, + "loss": 0.013514967635273933, + "num_input_tokens_seen": 12314752, + "step": 752, + "train_runtime": 6112.3314, + "train_tokens_per_second": 2014.739 + }, + { + "epoch": 0.45636363636363636, + "grad_norm": 0.015409312210977077, + "learning_rate": 9.968171485089638e-05, + "loss": 0.012808658182621002, + "num_input_tokens_seen": 12331128, + "step": 753, + "train_runtime": 6120.4491, + "train_tokens_per_second": 2014.742 + }, + { + "epoch": 0.45696969696969697, + "grad_norm": 0.02095264568924904, + "learning_rate": 9.968063063725468e-05, + "loss": 0.014174265787005424, + "num_input_tokens_seen": 12347504, + "step": 754, + "train_runtime": 6128.5679, + "train_tokens_per_second": 2014.745 + }, + { + "epoch": 0.4575757575757576, + "grad_norm": 0.020611796528100967, + "learning_rate": 9.96795445860232e-05, + "loss": 0.011881090700626373, + "num_input_tokens_seen": 12363880, + "step": 755, + "train_runtime": 6136.6868, + "train_tokens_per_second": 2014.748 + }, + { + "epoch": 0.4581818181818182, + "grad_norm": 0.018243003636598587, + "learning_rate": 9.967845669724212e-05, + "loss": 0.012596143409609795, + "num_input_tokens_seen": 12380256, + "step": 756, + "train_runtime": 6144.8042, + "train_tokens_per_second": 2014.752 + }, + { + "epoch": 0.4587878787878788, + "grad_norm": 0.016125964000821114, + "learning_rate": 9.967736697095167e-05, + "loss": 0.013951683416962624, + "num_input_tokens_seen": 12396632, + "step": 757, + "train_runtime": 6152.9288, + "train_tokens_per_second": 2014.753 + }, + { + "epoch": 0.4593939393939394, + "grad_norm": 0.019307058304548264, + "learning_rate": 9.967627540719215e-05, + "loss": 0.013310304842889309, + "num_input_tokens_seen": 12413008, + "step": 758, + "train_runtime": 6161.047, + "train_tokens_per_second": 2014.756 + }, + { + "epoch": 0.46, + "grad_norm": 0.0198148675262928, + "learning_rate": 9.967518200600396e-05, + "loss": 0.013110843487083912, + "num_input_tokens_seen": 12429384, + "step": 759, + "train_runtime": 6169.1657, + "train_tokens_per_second": 2014.759 + }, + { + "epoch": 0.46060606060606063, + "grad_norm": 0.02929919771850109, + "learning_rate": 9.967408676742751e-05, + "loss": 0.015073966234922409, + "num_input_tokens_seen": 12445760, + "step": 760, + "train_runtime": 6177.2831, + "train_tokens_per_second": 2014.763 + }, + { + "epoch": 0.4612121212121212, + "grad_norm": 0.015382593497633934, + "learning_rate": 9.967298969150334e-05, + "loss": 0.012051237747073174, + "num_input_tokens_seen": 12462136, + "step": 761, + "train_runtime": 6185.4001, + "train_tokens_per_second": 2014.766 + }, + { + "epoch": 0.4618181818181818, + "grad_norm": 0.02371540106832981, + "learning_rate": 9.9671890778272e-05, + "loss": 0.015372917987406254, + "num_input_tokens_seen": 12478512, + "step": 762, + "train_runtime": 6193.5166, + "train_tokens_per_second": 2014.77 + }, + { + "epoch": 0.4624242424242424, + "grad_norm": 0.02178136259317398, + "learning_rate": 9.967079002777417e-05, + "loss": 0.013376548886299133, + "num_input_tokens_seen": 12494888, + "step": 763, + "train_runtime": 6201.6342, + "train_tokens_per_second": 2014.773 + }, + { + "epoch": 0.463030303030303, + "grad_norm": 0.01065842155367136, + "learning_rate": 9.966968744005052e-05, + "loss": 0.012219875119626522, + "num_input_tokens_seen": 12511264, + "step": 764, + "train_runtime": 6209.7525, + "train_tokens_per_second": 2014.777 + }, + { + "epoch": 0.4636363636363636, + "grad_norm": 0.013287489302456379, + "learning_rate": 9.966858301514188e-05, + "loss": 0.011538016609847546, + "num_input_tokens_seen": 12527640, + "step": 765, + "train_runtime": 6217.8691, + "train_tokens_per_second": 2014.78 + }, + { + "epoch": 0.46424242424242423, + "grad_norm": 0.013882887549698353, + "learning_rate": 9.966747675308907e-05, + "loss": 0.012349468655884266, + "num_input_tokens_seen": 12544016, + "step": 766, + "train_runtime": 6225.9864, + "train_tokens_per_second": 2014.784 + }, + { + "epoch": 0.46484848484848484, + "grad_norm": 0.018599022179841995, + "learning_rate": 9.966636865393301e-05, + "loss": 0.012744025327265263, + "num_input_tokens_seen": 12560392, + "step": 767, + "train_runtime": 6234.1026, + "train_tokens_per_second": 2014.787 + }, + { + "epoch": 0.46545454545454545, + "grad_norm": 0.012023529969155788, + "learning_rate": 9.966525871771472e-05, + "loss": 0.01199167687445879, + "num_input_tokens_seen": 12576768, + "step": 768, + "train_runtime": 6242.2199, + "train_tokens_per_second": 2014.791 + }, + { + "epoch": 0.46606060606060606, + "grad_norm": 0.01650414615869522, + "learning_rate": 9.966414694447521e-05, + "loss": 0.012927195057272911, + "num_input_tokens_seen": 12593144, + "step": 769, + "train_runtime": 6250.3375, + "train_tokens_per_second": 2014.794 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.034085940569639206, + "learning_rate": 9.966303333425563e-05, + "loss": 0.012202315032482147, + "num_input_tokens_seen": 12609520, + "step": 770, + "train_runtime": 6258.4532, + "train_tokens_per_second": 2014.798 + }, + { + "epoch": 0.4672727272727273, + "grad_norm": 0.013827620074152946, + "learning_rate": 9.966191788709716e-05, + "loss": 0.013147883117198944, + "num_input_tokens_seen": 12625896, + "step": 771, + "train_runtime": 6266.5701, + "train_tokens_per_second": 2014.802 + }, + { + "epoch": 0.4678787878787879, + "grad_norm": 0.0181913860142231, + "learning_rate": 9.966080060304105e-05, + "loss": 0.013427773490548134, + "num_input_tokens_seen": 12642272, + "step": 772, + "train_runtime": 6274.6886, + "train_tokens_per_second": 2014.805 + }, + { + "epoch": 0.4684848484848485, + "grad_norm": 0.07882755249738693, + "learning_rate": 9.965968148212864e-05, + "loss": 0.017075341194868088, + "num_input_tokens_seen": 12658648, + "step": 773, + "train_runtime": 6282.8062, + "train_tokens_per_second": 2014.808 + }, + { + "epoch": 0.4690909090909091, + "grad_norm": 0.007325070444494486, + "learning_rate": 9.965856052440132e-05, + "loss": 0.011197097599506378, + "num_input_tokens_seen": 12675024, + "step": 774, + "train_runtime": 6290.929, + "train_tokens_per_second": 2014.81 + }, + { + "epoch": 0.4696969696969697, + "grad_norm": 0.030580898746848106, + "learning_rate": 9.965743772990054e-05, + "loss": 0.012808885425329208, + "num_input_tokens_seen": 12691400, + "step": 775, + "train_runtime": 6299.0468, + "train_tokens_per_second": 2014.813 + }, + { + "epoch": 0.4703030303030303, + "grad_norm": 0.027805298566818237, + "learning_rate": 9.965631309866788e-05, + "loss": 0.012805595062673092, + "num_input_tokens_seen": 12707776, + "step": 776, + "train_runtime": 6307.1647, + "train_tokens_per_second": 2014.816 + }, + { + "epoch": 0.4709090909090909, + "grad_norm": 0.01449024397879839, + "learning_rate": 9.965518663074487e-05, + "loss": 0.013110213913023472, + "num_input_tokens_seen": 12724152, + "step": 777, + "train_runtime": 6315.2824, + "train_tokens_per_second": 2014.819 + }, + { + "epoch": 0.4715151515151515, + "grad_norm": 0.013304144144058228, + "learning_rate": 9.96540583261732e-05, + "loss": 0.012666239403188229, + "num_input_tokens_seen": 12740528, + "step": 778, + "train_runtime": 6323.3995, + "train_tokens_per_second": 2014.823 + }, + { + "epoch": 0.4721212121212121, + "grad_norm": 0.01922908052802086, + "learning_rate": 9.965292818499463e-05, + "loss": 0.012315730564296246, + "num_input_tokens_seen": 12756904, + "step": 779, + "train_runtime": 6331.5179, + "train_tokens_per_second": 2014.826 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 0.042174387723207474, + "learning_rate": 9.965179620725093e-05, + "loss": 0.015461819246411324, + "num_input_tokens_seen": 12773280, + "step": 780, + "train_runtime": 6339.636, + "train_tokens_per_second": 2014.829 + }, + { + "epoch": 0.47333333333333333, + "grad_norm": 0.02851157635450363, + "learning_rate": 9.965066239298398e-05, + "loss": 0.012629134580492973, + "num_input_tokens_seen": 12789656, + "step": 781, + "train_runtime": 6347.7537, + "train_tokens_per_second": 2014.832 + }, + { + "epoch": 0.47393939393939394, + "grad_norm": 0.10219256579875946, + "learning_rate": 9.96495267422357e-05, + "loss": 0.014288711361587048, + "num_input_tokens_seen": 12806032, + "step": 782, + "train_runtime": 6355.8718, + "train_tokens_per_second": 2014.835 + }, + { + "epoch": 0.47454545454545455, + "grad_norm": 0.012413585558533669, + "learning_rate": 9.964838925504816e-05, + "loss": 0.012026645243167877, + "num_input_tokens_seen": 12822408, + "step": 783, + "train_runtime": 6363.9912, + "train_tokens_per_second": 2014.837 + }, + { + "epoch": 0.47515151515151516, + "grad_norm": 0.019600611180067062, + "learning_rate": 9.964724993146335e-05, + "loss": 0.012678924947977066, + "num_input_tokens_seen": 12838784, + "step": 784, + "train_runtime": 6372.1105, + "train_tokens_per_second": 2014.84 + }, + { + "epoch": 0.47575757575757577, + "grad_norm": 0.021761193871498108, + "learning_rate": 9.964610877152346e-05, + "loss": 0.012011994607746601, + "num_input_tokens_seen": 12855160, + "step": 785, + "train_runtime": 6380.2296, + "train_tokens_per_second": 2014.843 + }, + { + "epoch": 0.4763636363636364, + "grad_norm": 0.016564620658755302, + "learning_rate": 9.964496577527069e-05, + "loss": 0.01261131465435028, + "num_input_tokens_seen": 12871536, + "step": 786, + "train_runtime": 6388.348, + "train_tokens_per_second": 2014.846 + }, + { + "epoch": 0.476969696969697, + "grad_norm": 0.009226581081748009, + "learning_rate": 9.964382094274732e-05, + "loss": 0.012591596692800522, + "num_input_tokens_seen": 12887912, + "step": 787, + "train_runtime": 6396.4664, + "train_tokens_per_second": 2014.849 + }, + { + "epoch": 0.4775757575757576, + "grad_norm": 0.017386259511113167, + "learning_rate": 9.964267427399568e-05, + "loss": 0.012936464510858059, + "num_input_tokens_seen": 12904288, + "step": 788, + "train_runtime": 6404.5838, + "train_tokens_per_second": 2014.852 + }, + { + "epoch": 0.4781818181818182, + "grad_norm": 0.023312706500291824, + "learning_rate": 9.964152576905819e-05, + "loss": 0.012287257239222527, + "num_input_tokens_seen": 12920664, + "step": 789, + "train_runtime": 6412.7014, + "train_tokens_per_second": 2014.855 + }, + { + "epoch": 0.47878787878787876, + "grad_norm": 0.03517942875623703, + "learning_rate": 9.964037542797735e-05, + "loss": 0.014132940210402012, + "num_input_tokens_seen": 12937040, + "step": 790, + "train_runtime": 6420.8203, + "train_tokens_per_second": 2014.858 + }, + { + "epoch": 0.4793939393939394, + "grad_norm": 0.03619959577918053, + "learning_rate": 9.963922325079567e-05, + "loss": 0.014860968105494976, + "num_input_tokens_seen": 12953416, + "step": 791, + "train_runtime": 6428.9382, + "train_tokens_per_second": 2014.861 + }, + { + "epoch": 0.48, + "grad_norm": 0.03862093389034271, + "learning_rate": 9.96380692375558e-05, + "loss": 0.012788870371878147, + "num_input_tokens_seen": 12969792, + "step": 792, + "train_runtime": 6437.0552, + "train_tokens_per_second": 2014.864 + }, + { + "epoch": 0.4806060606060606, + "grad_norm": 0.014955422841012478, + "learning_rate": 9.963691338830044e-05, + "loss": 0.012180945836007595, + "num_input_tokens_seen": 12986168, + "step": 793, + "train_runtime": 6445.1731, + "train_tokens_per_second": 2014.867 + }, + { + "epoch": 0.4812121212121212, + "grad_norm": 0.02255093678832054, + "learning_rate": 9.963575570307228e-05, + "loss": 0.015188801102340221, + "num_input_tokens_seen": 13002544, + "step": 794, + "train_runtime": 6453.2915, + "train_tokens_per_second": 2014.87 + }, + { + "epoch": 0.4818181818181818, + "grad_norm": 0.023307740688323975, + "learning_rate": 9.96345961819142e-05, + "loss": 0.012430655770003796, + "num_input_tokens_seen": 13018920, + "step": 795, + "train_runtime": 6461.4033, + "train_tokens_per_second": 2014.875 + }, + { + "epoch": 0.4824242424242424, + "grad_norm": 0.015535326674580574, + "learning_rate": 9.963343482486906e-05, + "loss": 0.013036166317760944, + "num_input_tokens_seen": 13035296, + "step": 796, + "train_runtime": 6469.51, + "train_tokens_per_second": 2014.882 + }, + { + "epoch": 0.48303030303030303, + "grad_norm": 0.015238570980727673, + "learning_rate": 9.963227163197982e-05, + "loss": 0.012019358575344086, + "num_input_tokens_seen": 13051672, + "step": 797, + "train_runtime": 6477.6153, + "train_tokens_per_second": 2014.888 + }, + { + "epoch": 0.48363636363636364, + "grad_norm": 0.033798947930336, + "learning_rate": 9.963110660328952e-05, + "loss": 0.013339506462216377, + "num_input_tokens_seen": 13068048, + "step": 798, + "train_runtime": 6485.7294, + "train_tokens_per_second": 2014.893 + }, + { + "epoch": 0.48424242424242425, + "grad_norm": 0.019505798816680908, + "learning_rate": 9.962993973884122e-05, + "loss": 0.012281915172934532, + "num_input_tokens_seen": 13084424, + "step": 799, + "train_runtime": 6493.8366, + "train_tokens_per_second": 2014.899 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.010988899506628513, + "learning_rate": 9.96287710386781e-05, + "loss": 0.011865864507853985, + "num_input_tokens_seen": 13100800, + "step": 800, + "train_runtime": 6501.9444, + "train_tokens_per_second": 2014.905 + }, + { + "epoch": 0.48545454545454547, + "grad_norm": 0.031102674081921577, + "learning_rate": 9.96276005028434e-05, + "loss": 0.013372216373682022, + "num_input_tokens_seen": 13117176, + "step": 801, + "train_runtime": 6511.057, + "train_tokens_per_second": 2014.6 + }, + { + "epoch": 0.4860606060606061, + "grad_norm": 0.009399918839335442, + "learning_rate": 9.962642813138039e-05, + "loss": 0.012573515065014362, + "num_input_tokens_seen": 13133552, + "step": 802, + "train_runtime": 6519.1656, + "train_tokens_per_second": 2014.606 + }, + { + "epoch": 0.4866666666666667, + "grad_norm": 0.06464923918247223, + "learning_rate": 9.962525392433246e-05, + "loss": 0.014730310998857021, + "num_input_tokens_seen": 13149928, + "step": 803, + "train_runtime": 6527.273, + "train_tokens_per_second": 2014.613 + }, + { + "epoch": 0.48727272727272725, + "grad_norm": 0.028241781517863274, + "learning_rate": 9.962407788174301e-05, + "loss": 0.01580268330872059, + "num_input_tokens_seen": 13166304, + "step": 804, + "train_runtime": 6535.3821, + "train_tokens_per_second": 2014.619 + }, + { + "epoch": 0.48787878787878786, + "grad_norm": 0.008157139644026756, + "learning_rate": 9.962290000365558e-05, + "loss": 0.011951067484915257, + "num_input_tokens_seen": 13182680, + "step": 805, + "train_runtime": 6543.4933, + "train_tokens_per_second": 2014.624 + }, + { + "epoch": 0.48848484848484847, + "grad_norm": 0.017825007438659668, + "learning_rate": 9.96217202901137e-05, + "loss": 0.01247593853622675, + "num_input_tokens_seen": 13199056, + "step": 806, + "train_runtime": 6551.599, + "train_tokens_per_second": 2014.631 + }, + { + "epoch": 0.4890909090909091, + "grad_norm": 0.03140291944146156, + "learning_rate": 9.962053874116102e-05, + "loss": 0.013065744191408157, + "num_input_tokens_seen": 13215432, + "step": 807, + "train_runtime": 6559.707, + "train_tokens_per_second": 2014.638 + }, + { + "epoch": 0.4896969696969697, + "grad_norm": 0.020545680075883865, + "learning_rate": 9.961935535684127e-05, + "loss": 0.013503405265510082, + "num_input_tokens_seen": 13231808, + "step": 808, + "train_runtime": 6567.8172, + "train_tokens_per_second": 2014.643 + }, + { + "epoch": 0.4903030303030303, + "grad_norm": 0.010955904610455036, + "learning_rate": 9.961817013719815e-05, + "loss": 0.011936129070818424, + "num_input_tokens_seen": 13248184, + "step": 809, + "train_runtime": 6575.9284, + "train_tokens_per_second": 2014.648 + }, + { + "epoch": 0.4909090909090909, + "grad_norm": 0.01849379763007164, + "learning_rate": 9.961698308227557e-05, + "loss": 0.012791337445378304, + "num_input_tokens_seen": 13264560, + "step": 810, + "train_runtime": 6584.0343, + "train_tokens_per_second": 2014.655 + }, + { + "epoch": 0.4915151515151515, + "grad_norm": 0.014219888485968113, + "learning_rate": 9.961579419211741e-05, + "loss": 0.01348559744656086, + "num_input_tokens_seen": 13280936, + "step": 811, + "train_runtime": 6592.1415, + "train_tokens_per_second": 2014.662 + }, + { + "epoch": 0.4921212121212121, + "grad_norm": 0.02992507442831993, + "learning_rate": 9.961460346676763e-05, + "loss": 0.013612410053610802, + "num_input_tokens_seen": 13297312, + "step": 812, + "train_runtime": 6600.2507, + "train_tokens_per_second": 2014.668 + }, + { + "epoch": 0.49272727272727274, + "grad_norm": 0.029259268194437027, + "learning_rate": 9.961341090627031e-05, + "loss": 0.014138033613562584, + "num_input_tokens_seen": 13313688, + "step": 813, + "train_runtime": 6608.362, + "train_tokens_per_second": 2014.673 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.016515251249074936, + "learning_rate": 9.961221651066952e-05, + "loss": 0.013446497730910778, + "num_input_tokens_seen": 13330064, + "step": 814, + "train_runtime": 6616.47, + "train_tokens_per_second": 2014.679 + }, + { + "epoch": 0.49393939393939396, + "grad_norm": 0.019002556800842285, + "learning_rate": 9.961102028000948e-05, + "loss": 0.013769666664302349, + "num_input_tokens_seen": 13346440, + "step": 815, + "train_runtime": 6624.5765, + "train_tokens_per_second": 2014.686 + }, + { + "epoch": 0.49454545454545457, + "grad_norm": 0.023732759058475494, + "learning_rate": 9.960982221433439e-05, + "loss": 0.01219931710511446, + "num_input_tokens_seen": 13362816, + "step": 816, + "train_runtime": 6632.6975, + "train_tokens_per_second": 2014.688 + }, + { + "epoch": 0.4951515151515152, + "grad_norm": 0.012622934766113758, + "learning_rate": 9.960862231368859e-05, + "loss": 0.012783626094460487, + "num_input_tokens_seen": 13379192, + "step": 817, + "train_runtime": 6640.8076, + "train_tokens_per_second": 2014.694 + }, + { + "epoch": 0.49575757575757573, + "grad_norm": 0.014281938783824444, + "learning_rate": 9.960742057811648e-05, + "loss": 0.012687593698501587, + "num_input_tokens_seen": 13395568, + "step": 818, + "train_runtime": 6648.9137, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.49636363636363634, + "grad_norm": 0.053434181958436966, + "learning_rate": 9.960621700766246e-05, + "loss": 0.013879223726689816, + "num_input_tokens_seen": 13411944, + "step": 819, + "train_runtime": 6657.0289, + "train_tokens_per_second": 2014.704 + }, + { + "epoch": 0.49696969696969695, + "grad_norm": 0.014049537479877472, + "learning_rate": 9.960501160237107e-05, + "loss": 0.011275812052190304, + "num_input_tokens_seen": 13428320, + "step": 820, + "train_runtime": 6665.1394, + "train_tokens_per_second": 2014.71 + }, + { + "epoch": 0.49757575757575756, + "grad_norm": 0.02216215617954731, + "learning_rate": 9.960380436228693e-05, + "loss": 0.01345481164753437, + "num_input_tokens_seen": 13444696, + "step": 821, + "train_runtime": 6673.2486, + "train_tokens_per_second": 2014.715 + }, + { + "epoch": 0.49818181818181817, + "grad_norm": 0.01626548357307911, + "learning_rate": 9.960259528745466e-05, + "loss": 0.01268689427524805, + "num_input_tokens_seen": 13461072, + "step": 822, + "train_runtime": 6681.3546, + "train_tokens_per_second": 2014.722 + }, + { + "epoch": 0.4987878787878788, + "grad_norm": 0.029701311141252518, + "learning_rate": 9.960138437791899e-05, + "loss": 0.013831757940351963, + "num_input_tokens_seen": 13477448, + "step": 823, + "train_runtime": 6689.465, + "train_tokens_per_second": 2014.727 + }, + { + "epoch": 0.4993939393939394, + "grad_norm": 0.01778031513094902, + "learning_rate": 9.96001716337247e-05, + "loss": 0.012985551729798317, + "num_input_tokens_seen": 13493824, + "step": 824, + "train_runtime": 6697.5721, + "train_tokens_per_second": 2014.734 + }, + { + "epoch": 0.5, + "grad_norm": 0.011812685988843441, + "learning_rate": 9.959895705491664e-05, + "loss": 0.013474401086568832, + "num_input_tokens_seen": 13510200, + "step": 825, + "train_runtime": 6705.6803, + "train_tokens_per_second": 2014.74 + }, + { + "epoch": 0.5006060606060606, + "grad_norm": 0.024887410923838615, + "learning_rate": 9.959774064153977e-05, + "loss": 0.012352567166090012, + "num_input_tokens_seen": 13526576, + "step": 826, + "train_runtime": 6713.7875, + "train_tokens_per_second": 2014.746 + }, + { + "epoch": 0.5012121212121212, + "grad_norm": 0.02427525445818901, + "learning_rate": 9.959652239363906e-05, + "loss": 0.01411970891058445, + "num_input_tokens_seen": 13542952, + "step": 827, + "train_runtime": 6721.8992, + "train_tokens_per_second": 2014.751 + }, + { + "epoch": 0.5018181818181818, + "grad_norm": 0.02203851006925106, + "learning_rate": 9.959530231125955e-05, + "loss": 0.01270216703414917, + "num_input_tokens_seen": 13559328, + "step": 828, + "train_runtime": 6730.0067, + "train_tokens_per_second": 2014.757 + }, + { + "epoch": 0.5024242424242424, + "grad_norm": 0.033256348222494125, + "learning_rate": 9.959408039444641e-05, + "loss": 0.013468440622091293, + "num_input_tokens_seen": 13575704, + "step": 829, + "train_runtime": 6738.1159, + "train_tokens_per_second": 2014.763 + }, + { + "epoch": 0.503030303030303, + "grad_norm": 0.030981307849287987, + "learning_rate": 9.95928566432448e-05, + "loss": 0.013072172179818153, + "num_input_tokens_seen": 13592080, + "step": 830, + "train_runtime": 6746.2296, + "train_tokens_per_second": 2014.767 + }, + { + "epoch": 0.5036363636363637, + "grad_norm": 0.019473901018500328, + "learning_rate": 9.959163105770002e-05, + "loss": 0.01263860147446394, + "num_input_tokens_seen": 13608456, + "step": 831, + "train_runtime": 6754.3387, + "train_tokens_per_second": 2014.773 + }, + { + "epoch": 0.5042424242424243, + "grad_norm": 0.023273654282093048, + "learning_rate": 9.959040363785736e-05, + "loss": 0.014287668280303478, + "num_input_tokens_seen": 13624832, + "step": 832, + "train_runtime": 6762.4478, + "train_tokens_per_second": 2014.778 + }, + { + "epoch": 0.5048484848484849, + "grad_norm": 0.0494939386844635, + "learning_rate": 9.958917438376226e-05, + "loss": 0.013972645625472069, + "num_input_tokens_seen": 13641208, + "step": 833, + "train_runtime": 6770.5557, + "train_tokens_per_second": 2014.784 + }, + { + "epoch": 0.5054545454545455, + "grad_norm": 0.0583622045814991, + "learning_rate": 9.958794329546017e-05, + "loss": 0.015316938981413841, + "num_input_tokens_seen": 13657584, + "step": 834, + "train_runtime": 6778.6628, + "train_tokens_per_second": 2014.79 + }, + { + "epoch": 0.5060606060606061, + "grad_norm": 0.022303935140371323, + "learning_rate": 9.958671037299662e-05, + "loss": 0.012674327939748764, + "num_input_tokens_seen": 13673960, + "step": 835, + "train_runtime": 6786.7703, + "train_tokens_per_second": 2014.796 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.033000800758600235, + "learning_rate": 9.958547561641722e-05, + "loss": 0.013727420009672642, + "num_input_tokens_seen": 13690336, + "step": 836, + "train_runtime": 6794.8756, + "train_tokens_per_second": 2014.803 + }, + { + "epoch": 0.5072727272727273, + "grad_norm": 0.015586488880217075, + "learning_rate": 9.958423902576763e-05, + "loss": 0.015323062427341938, + "num_input_tokens_seen": 13706712, + "step": 837, + "train_runtime": 6802.9828, + "train_tokens_per_second": 2014.809 + }, + { + "epoch": 0.5078787878787879, + "grad_norm": 0.022322285920381546, + "learning_rate": 9.958300060109362e-05, + "loss": 0.014234354719519615, + "num_input_tokens_seen": 13723088, + "step": 838, + "train_runtime": 6811.0938, + "train_tokens_per_second": 2014.814 + }, + { + "epoch": 0.5084848484848485, + "grad_norm": 0.008347253315150738, + "learning_rate": 9.958176034244097e-05, + "loss": 0.012262934818863869, + "num_input_tokens_seen": 13739464, + "step": 839, + "train_runtime": 6819.2024, + "train_tokens_per_second": 2014.82 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 0.02393462508916855, + "learning_rate": 9.958051824985555e-05, + "loss": 0.01308400183916092, + "num_input_tokens_seen": 13755840, + "step": 840, + "train_runtime": 6827.3112, + "train_tokens_per_second": 2014.825 + }, + { + "epoch": 0.5096969696969696, + "grad_norm": 0.01569426991045475, + "learning_rate": 9.957927432338332e-05, + "loss": 0.012214584276080132, + "num_input_tokens_seen": 13772216, + "step": 841, + "train_runtime": 6835.4183, + "train_tokens_per_second": 2014.831 + }, + { + "epoch": 0.5103030303030303, + "grad_norm": 0.026208873838186264, + "learning_rate": 9.957802856307029e-05, + "loss": 0.014355281367897987, + "num_input_tokens_seen": 13788592, + "step": 842, + "train_runtime": 6843.5292, + "train_tokens_per_second": 2014.836 + }, + { + "epoch": 0.5109090909090909, + "grad_norm": 0.016047121956944466, + "learning_rate": 9.957678096896252e-05, + "loss": 0.012238034047186375, + "num_input_tokens_seen": 13804968, + "step": 843, + "train_runtime": 6851.6374, + "train_tokens_per_second": 2014.842 + }, + { + "epoch": 0.5115151515151515, + "grad_norm": 0.04430484399199486, + "learning_rate": 9.957553154110617e-05, + "loss": 0.013455298729240894, + "num_input_tokens_seen": 13821344, + "step": 844, + "train_runtime": 6859.7446, + "train_tokens_per_second": 2014.848 + }, + { + "epoch": 0.5121212121212121, + "grad_norm": 0.01514506246894598, + "learning_rate": 9.957428027954746e-05, + "loss": 0.014497831463813782, + "num_input_tokens_seen": 13837720, + "step": 845, + "train_runtime": 6867.8522, + "train_tokens_per_second": 2014.854 + }, + { + "epoch": 0.5127272727272727, + "grad_norm": 0.11227481067180634, + "learning_rate": 9.957302718433266e-05, + "loss": 0.01227258238941431, + "num_input_tokens_seen": 13854096, + "step": 846, + "train_runtime": 6875.9627, + "train_tokens_per_second": 2014.859 + }, + { + "epoch": 0.5133333333333333, + "grad_norm": 0.02800634503364563, + "learning_rate": 9.957177225550813e-05, + "loss": 0.013792254962027073, + "num_input_tokens_seen": 13870472, + "step": 847, + "train_runtime": 6884.0675, + "train_tokens_per_second": 2014.866 + }, + { + "epoch": 0.5139393939393939, + "grad_norm": 0.029475996270775795, + "learning_rate": 9.957051549312027e-05, + "loss": 0.013554091565310955, + "num_input_tokens_seen": 13886848, + "step": 848, + "train_runtime": 6892.1731, + "train_tokens_per_second": 2014.872 + }, + { + "epoch": 0.5145454545454545, + "grad_norm": 0.019583873450756073, + "learning_rate": 9.956925689721559e-05, + "loss": 0.014205913059413433, + "num_input_tokens_seen": 13903224, + "step": 849, + "train_runtime": 6900.2826, + "train_tokens_per_second": 2014.877 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 0.015864579007029533, + "learning_rate": 9.95679964678406e-05, + "loss": 0.01432622317224741, + "num_input_tokens_seen": 13919600, + "step": 850, + "train_runtime": 6908.3907, + "train_tokens_per_second": 2014.883 + }, + { + "epoch": 0.5157575757575757, + "grad_norm": 0.01455528661608696, + "learning_rate": 9.9566734205042e-05, + "loss": 0.015681616961956024, + "num_input_tokens_seen": 13935976, + "step": 851, + "train_runtime": 6916.5024, + "train_tokens_per_second": 2014.888 + }, + { + "epoch": 0.5163636363636364, + "grad_norm": 0.02918148599565029, + "learning_rate": 9.956547010886639e-05, + "loss": 0.012535885907709599, + "num_input_tokens_seen": 13952352, + "step": 852, + "train_runtime": 6924.6094, + "train_tokens_per_second": 2014.894 + }, + { + "epoch": 0.516969696969697, + "grad_norm": 0.0162571519613266, + "learning_rate": 9.956420417936056e-05, + "loss": 0.012905891984701157, + "num_input_tokens_seen": 13968728, + "step": 853, + "train_runtime": 6932.7194, + "train_tokens_per_second": 2014.899 + }, + { + "epoch": 0.5175757575757576, + "grad_norm": 0.01789519377052784, + "learning_rate": 9.956293641657137e-05, + "loss": 0.01288038119673729, + "num_input_tokens_seen": 13985104, + "step": 854, + "train_runtime": 6940.8319, + "train_tokens_per_second": 2014.903 + }, + { + "epoch": 0.5181818181818182, + "grad_norm": 0.01946009323000908, + "learning_rate": 9.956166682054566e-05, + "loss": 0.013123282231390476, + "num_input_tokens_seen": 14001480, + "step": 855, + "train_runtime": 6948.9381, + "train_tokens_per_second": 2014.909 + }, + { + "epoch": 0.5187878787878788, + "grad_norm": 0.02161416970193386, + "learning_rate": 9.956039539133042e-05, + "loss": 0.011395135894417763, + "num_input_tokens_seen": 14017856, + "step": 856, + "train_runtime": 6957.048, + "train_tokens_per_second": 2014.914 + }, + { + "epoch": 0.5193939393939394, + "grad_norm": 0.01752905547618866, + "learning_rate": 9.955912212897267e-05, + "loss": 0.014676744118332863, + "num_input_tokens_seen": 14034232, + "step": 857, + "train_runtime": 6965.1559, + "train_tokens_per_second": 2014.92 + }, + { + "epoch": 0.52, + "grad_norm": 0.012038851156830788, + "learning_rate": 9.955784703351949e-05, + "loss": 0.012578791007399559, + "num_input_tokens_seen": 14050608, + "step": 858, + "train_runtime": 6973.2666, + "train_tokens_per_second": 2014.925 + }, + { + "epoch": 0.5206060606060606, + "grad_norm": 0.01986696757376194, + "learning_rate": 9.955657010501806e-05, + "loss": 0.012446455657482147, + "num_input_tokens_seen": 14066984, + "step": 859, + "train_runtime": 6981.3718, + "train_tokens_per_second": 2014.931 + }, + { + "epoch": 0.5212121212121212, + "grad_norm": 0.020363394170999527, + "learning_rate": 9.955529134351563e-05, + "loss": 0.012604762800037861, + "num_input_tokens_seen": 14083360, + "step": 860, + "train_runtime": 6989.4801, + "train_tokens_per_second": 2014.937 + }, + { + "epoch": 0.5218181818181818, + "grad_norm": 0.010133441537618637, + "learning_rate": 9.955401074905945e-05, + "loss": 0.01250852644443512, + "num_input_tokens_seen": 14099736, + "step": 861, + "train_runtime": 6997.5889, + "train_tokens_per_second": 2014.942 + }, + { + "epoch": 0.5224242424242425, + "grad_norm": 0.012160439044237137, + "learning_rate": 9.955272832169694e-05, + "loss": 0.013129970990121365, + "num_input_tokens_seen": 14116112, + "step": 862, + "train_runtime": 7005.6941, + "train_tokens_per_second": 2014.948 + }, + { + "epoch": 0.5230303030303031, + "grad_norm": 0.0197035763412714, + "learning_rate": 9.95514440614755e-05, + "loss": 0.012795310467481613, + "num_input_tokens_seen": 14132488, + "step": 863, + "train_runtime": 7013.8017, + "train_tokens_per_second": 2014.954 + }, + { + "epoch": 0.5236363636363637, + "grad_norm": 0.029051663354039192, + "learning_rate": 9.955015796844263e-05, + "loss": 0.012731630355119705, + "num_input_tokens_seen": 14148864, + "step": 864, + "train_runtime": 7021.913, + "train_tokens_per_second": 2014.959 + }, + { + "epoch": 0.5242424242424243, + "grad_norm": 0.01819092035293579, + "learning_rate": 9.954887004264591e-05, + "loss": 0.012530642561614513, + "num_input_tokens_seen": 14165240, + "step": 865, + "train_runtime": 7030.031, + "train_tokens_per_second": 2014.961 + }, + { + "epoch": 0.5248484848484849, + "grad_norm": 0.012354613281786442, + "learning_rate": 9.9547580284133e-05, + "loss": 0.012999298982322216, + "num_input_tokens_seen": 14181616, + "step": 866, + "train_runtime": 7038.1398, + "train_tokens_per_second": 2014.966 + }, + { + "epoch": 0.5254545454545455, + "grad_norm": 0.009374301880598068, + "learning_rate": 9.954628869295157e-05, + "loss": 0.012080837972462177, + "num_input_tokens_seen": 14197992, + "step": 867, + "train_runtime": 7046.2489, + "train_tokens_per_second": 2014.972 + }, + { + "epoch": 0.526060606060606, + "grad_norm": 0.04844909533858299, + "learning_rate": 9.954499526914941e-05, + "loss": 0.014849531464278698, + "num_input_tokens_seen": 14214368, + "step": 868, + "train_runtime": 7054.3586, + "train_tokens_per_second": 2014.977 + }, + { + "epoch": 0.5266666666666666, + "grad_norm": 0.0264375489205122, + "learning_rate": 9.954370001277435e-05, + "loss": 0.013595725409686565, + "num_input_tokens_seen": 14230744, + "step": 869, + "train_runtime": 7062.4663, + "train_tokens_per_second": 2014.982 + }, + { + "epoch": 0.5272727272727272, + "grad_norm": 0.011517049744725227, + "learning_rate": 9.954240292387434e-05, + "loss": 0.012497092597186565, + "num_input_tokens_seen": 14247120, + "step": 870, + "train_runtime": 7070.5718, + "train_tokens_per_second": 2014.988 + }, + { + "epoch": 0.5278787878787878, + "grad_norm": 0.012493406422436237, + "learning_rate": 9.95411040024973e-05, + "loss": 0.01143716461956501, + "num_input_tokens_seen": 14263496, + "step": 871, + "train_runtime": 7078.6807, + "train_tokens_per_second": 2014.994 + }, + { + "epoch": 0.5284848484848484, + "grad_norm": 0.04269085079431534, + "learning_rate": 9.95398032486913e-05, + "loss": 0.013632988557219505, + "num_input_tokens_seen": 14279872, + "step": 872, + "train_runtime": 7086.7887, + "train_tokens_per_second": 2014.999 + }, + { + "epoch": 0.5290909090909091, + "grad_norm": 0.04483538493514061, + "learning_rate": 9.953850066250445e-05, + "loss": 0.013953006826341152, + "num_input_tokens_seen": 14296248, + "step": 873, + "train_runtime": 7094.8962, + "train_tokens_per_second": 2015.005 + }, + { + "epoch": 0.5296969696969697, + "grad_norm": 0.05677570030093193, + "learning_rate": 9.953719624398495e-05, + "loss": 0.012957635335624218, + "num_input_tokens_seen": 14312624, + "step": 874, + "train_runtime": 7103.0013, + "train_tokens_per_second": 2015.011 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.038775816559791565, + "learning_rate": 9.953588999318101e-05, + "loss": 0.01283508911728859, + "num_input_tokens_seen": 14329000, + "step": 875, + "train_runtime": 7111.1121, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 0.5309090909090909, + "grad_norm": 0.032757148146629333, + "learning_rate": 9.953458191014098e-05, + "loss": 0.013316294178366661, + "num_input_tokens_seen": 14345376, + "step": 876, + "train_runtime": 7119.2189, + "train_tokens_per_second": 2015.021 + }, + { + "epoch": 0.5315151515151515, + "grad_norm": 0.022632509469985962, + "learning_rate": 9.953327199491323e-05, + "loss": 0.011890828609466553, + "num_input_tokens_seen": 14361752, + "step": 877, + "train_runtime": 7127.3324, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 0.5321212121212121, + "grad_norm": 0.013239112682640553, + "learning_rate": 9.953196024754621e-05, + "loss": 0.011631186120212078, + "num_input_tokens_seen": 14378128, + "step": 878, + "train_runtime": 7135.4381, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 0.5327272727272727, + "grad_norm": 0.012772745452821255, + "learning_rate": 9.953064666808843e-05, + "loss": 0.011507662013173103, + "num_input_tokens_seen": 14394504, + "step": 879, + "train_runtime": 7143.5516, + "train_tokens_per_second": 2015.035 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.02860845811665058, + "learning_rate": 9.952933125658849e-05, + "loss": 0.013187154196202755, + "num_input_tokens_seen": 14410880, + "step": 880, + "train_runtime": 7151.6624, + "train_tokens_per_second": 2015.039 + }, + { + "epoch": 0.5339393939393939, + "grad_norm": 0.011422947980463505, + "learning_rate": 9.952801401309503e-05, + "loss": 0.012076064944267273, + "num_input_tokens_seen": 14427256, + "step": 881, + "train_runtime": 7159.772, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 0.5345454545454545, + "grad_norm": 0.00976222101598978, + "learning_rate": 9.95266949376568e-05, + "loss": 0.011884449049830437, + "num_input_tokens_seen": 14443632, + "step": 882, + "train_runtime": 7167.8808, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 0.5351515151515152, + "grad_norm": 0.017465714365243912, + "learning_rate": 9.952537403032258e-05, + "loss": 0.012587850913405418, + "num_input_tokens_seen": 14460008, + "step": 883, + "train_runtime": 7175.9926, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 0.5357575757575758, + "grad_norm": 0.01686178520321846, + "learning_rate": 9.952405129114119e-05, + "loss": 0.01267196424305439, + "num_input_tokens_seen": 14476384, + "step": 884, + "train_runtime": 7184.0994, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 0.5363636363636364, + "grad_norm": 0.021161451935768127, + "learning_rate": 9.952272672016161e-05, + "loss": 0.012368117459118366, + "num_input_tokens_seen": 14492760, + "step": 885, + "train_runtime": 7192.2074, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 0.536969696969697, + "grad_norm": 0.018734315410256386, + "learning_rate": 9.95214003174328e-05, + "loss": 0.013907104730606079, + "num_input_tokens_seen": 14509136, + "step": 886, + "train_runtime": 7200.3181, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 0.5375757575757576, + "grad_norm": 0.017368443310260773, + "learning_rate": 9.952007208300384e-05, + "loss": 0.013688186183571815, + "num_input_tokens_seen": 14525512, + "step": 887, + "train_runtime": 7208.4306, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 0.5381818181818182, + "grad_norm": 0.014055633917450905, + "learning_rate": 9.951874201692386e-05, + "loss": 0.011441092006862164, + "num_input_tokens_seen": 14541888, + "step": 888, + "train_runtime": 7216.5403, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 0.5387878787878788, + "grad_norm": 0.014830189757049084, + "learning_rate": 9.951741011924202e-05, + "loss": 0.012659481726586819, + "num_input_tokens_seen": 14558264, + "step": 889, + "train_runtime": 7224.6486, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 0.5393939393939394, + "grad_norm": 0.04141494259238243, + "learning_rate": 9.951607639000763e-05, + "loss": 0.014267532154917717, + "num_input_tokens_seen": 14574640, + "step": 890, + "train_runtime": 7232.7595, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 0.54, + "grad_norm": 0.026582296937704086, + "learning_rate": 9.951474082927e-05, + "loss": 0.01351410336792469, + "num_input_tokens_seen": 14591016, + "step": 891, + "train_runtime": 7240.8673, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 0.5406060606060606, + "grad_norm": 0.029941242188215256, + "learning_rate": 9.951340343707852e-05, + "loss": 0.013386565260589123, + "num_input_tokens_seen": 14607392, + "step": 892, + "train_runtime": 7248.9795, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 0.5412121212121213, + "grad_norm": 0.01376877911388874, + "learning_rate": 9.951206421348267e-05, + "loss": 0.012590361759066582, + "num_input_tokens_seen": 14623768, + "step": 893, + "train_runtime": 7257.0885, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 0.5418181818181819, + "grad_norm": 0.015015073120594025, + "learning_rate": 9.9510723158532e-05, + "loss": 0.012574484571814537, + "num_input_tokens_seen": 14640144, + "step": 894, + "train_runtime": 7265.1975, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 0.5424242424242425, + "grad_norm": 0.013042068108916283, + "learning_rate": 9.950938027227608e-05, + "loss": 0.01163259893655777, + "num_input_tokens_seen": 14656520, + "step": 895, + "train_runtime": 7273.3074, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 0.5430303030303031, + "grad_norm": 0.2448950558900833, + "learning_rate": 9.950803555476463e-05, + "loss": 0.029144512489438057, + "num_input_tokens_seen": 14672896, + "step": 896, + "train_runtime": 7281.4158, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 0.5436363636363636, + "grad_norm": 0.015140167437493801, + "learning_rate": 9.950668900604733e-05, + "loss": 0.012354775331914425, + "num_input_tokens_seen": 14689272, + "step": 897, + "train_runtime": 7289.5307, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 0.5442424242424242, + "grad_norm": 0.014910165220499039, + "learning_rate": 9.950534062617401e-05, + "loss": 0.013464296236634254, + "num_input_tokens_seen": 14705648, + "step": 898, + "train_runtime": 7297.6408, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 0.5448484848484848, + "grad_norm": 0.025381648913025856, + "learning_rate": 9.950399041519456e-05, + "loss": 0.01381002739071846, + "num_input_tokens_seen": 14722024, + "step": 899, + "train_runtime": 7305.7486, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.016502218320965767, + "learning_rate": 9.950263837315891e-05, + "loss": 0.014580944553017616, + "num_input_tokens_seen": 14738400, + "step": 900, + "train_runtime": 7313.8574, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 0.546060606060606, + "grad_norm": 0.036798711866140366, + "learning_rate": 9.950128450011706e-05, + "loss": 0.01336810551583767, + "num_input_tokens_seen": 14754776, + "step": 901, + "train_runtime": 7322.836, + "train_tokens_per_second": 2014.899 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.03919834643602371, + "learning_rate": 9.949992879611911e-05, + "loss": 0.013614124618470669, + "num_input_tokens_seen": 14771152, + "step": 902, + "train_runtime": 7330.9449, + "train_tokens_per_second": 2014.904 + }, + { + "epoch": 0.5472727272727272, + "grad_norm": 0.015492623671889305, + "learning_rate": 9.949857126121517e-05, + "loss": 0.01262598019093275, + "num_input_tokens_seen": 14787528, + "step": 903, + "train_runtime": 7339.051, + "train_tokens_per_second": 2014.91 + }, + { + "epoch": 0.5478787878787879, + "grad_norm": 0.04381313920021057, + "learning_rate": 9.949721189545549e-05, + "loss": 0.012830916792154312, + "num_input_tokens_seen": 14803904, + "step": 904, + "train_runtime": 7347.1591, + "train_tokens_per_second": 2014.915 + }, + { + "epoch": 0.5484848484848485, + "grad_norm": 0.012728218920528889, + "learning_rate": 9.949585069889033e-05, + "loss": 0.012215669266879559, + "num_input_tokens_seen": 14820280, + "step": 905, + "train_runtime": 7355.2671, + "train_tokens_per_second": 2014.921 + }, + { + "epoch": 0.5490909090909091, + "grad_norm": 0.02701408974826336, + "learning_rate": 9.949448767157003e-05, + "loss": 0.014799817465245724, + "num_input_tokens_seen": 14836656, + "step": 906, + "train_runtime": 7363.3735, + "train_tokens_per_second": 2014.926 + }, + { + "epoch": 0.5496969696969697, + "grad_norm": 0.01919523999094963, + "learning_rate": 9.949312281354504e-05, + "loss": 0.012729383073747158, + "num_input_tokens_seen": 14853032, + "step": 907, + "train_runtime": 7371.4797, + "train_tokens_per_second": 2014.932 + }, + { + "epoch": 0.5503030303030303, + "grad_norm": 0.017987912520766258, + "learning_rate": 9.94917561248658e-05, + "loss": 0.011925067752599716, + "num_input_tokens_seen": 14869408, + "step": 908, + "train_runtime": 7379.5909, + "train_tokens_per_second": 2014.937 + }, + { + "epoch": 0.5509090909090909, + "grad_norm": 0.016029933467507362, + "learning_rate": 9.94903876055829e-05, + "loss": 0.014640429988503456, + "num_input_tokens_seen": 14885784, + "step": 909, + "train_runtime": 7387.701, + "train_tokens_per_second": 2014.941 + }, + { + "epoch": 0.5515151515151515, + "grad_norm": 0.02371898479759693, + "learning_rate": 9.948901725574692e-05, + "loss": 0.013192545622587204, + "num_input_tokens_seen": 14902160, + "step": 910, + "train_runtime": 7395.8127, + "train_tokens_per_second": 2014.946 + }, + { + "epoch": 0.5521212121212121, + "grad_norm": 0.028052695095539093, + "learning_rate": 9.948764507540858e-05, + "loss": 0.014127026312053204, + "num_input_tokens_seen": 14918536, + "step": 911, + "train_runtime": 7403.9308, + "train_tokens_per_second": 2014.948 + }, + { + "epoch": 0.5527272727272727, + "grad_norm": 0.022900646552443504, + "learning_rate": 9.94862710646186e-05, + "loss": 0.01369861327111721, + "num_input_tokens_seen": 14934912, + "step": 912, + "train_runtime": 7412.0474, + "train_tokens_per_second": 2014.951 + }, + { + "epoch": 0.5533333333333333, + "grad_norm": 0.024493444710969925, + "learning_rate": 9.948489522342786e-05, + "loss": 0.012475069612264633, + "num_input_tokens_seen": 14951288, + "step": 913, + "train_runtime": 7420.164, + "train_tokens_per_second": 2014.954 + }, + { + "epoch": 0.553939393939394, + "grad_norm": 0.009486420080065727, + "learning_rate": 9.948351755188718e-05, + "loss": 0.011415514163672924, + "num_input_tokens_seen": 14967664, + "step": 914, + "train_runtime": 7428.2787, + "train_tokens_per_second": 2014.957 + }, + { + "epoch": 0.5545454545454546, + "grad_norm": 0.02638114243745804, + "learning_rate": 9.948213805004758e-05, + "loss": 0.014981718733906746, + "num_input_tokens_seen": 14984040, + "step": 915, + "train_runtime": 7436.3836, + "train_tokens_per_second": 2014.963 + }, + { + "epoch": 0.5551515151515152, + "grad_norm": 0.024289410561323166, + "learning_rate": 9.948075671796004e-05, + "loss": 0.013489319942891598, + "num_input_tokens_seen": 15000416, + "step": 916, + "train_runtime": 7444.4934, + "train_tokens_per_second": 2014.968 + }, + { + "epoch": 0.5557575757575758, + "grad_norm": 0.019992362707853317, + "learning_rate": 9.947937355567566e-05, + "loss": 0.013457294553518295, + "num_input_tokens_seen": 15016792, + "step": 917, + "train_runtime": 7452.6, + "train_tokens_per_second": 2014.974 + }, + { + "epoch": 0.5563636363636364, + "grad_norm": 0.01874268427491188, + "learning_rate": 9.947798856324562e-05, + "loss": 0.014019965194165707, + "num_input_tokens_seen": 15033168, + "step": 918, + "train_runtime": 7460.7075, + "train_tokens_per_second": 2014.979 + }, + { + "epoch": 0.556969696969697, + "grad_norm": 0.006537168752402067, + "learning_rate": 9.947660174072113e-05, + "loss": 0.01211620308458805, + "num_input_tokens_seen": 15049544, + "step": 919, + "train_runtime": 7468.8162, + "train_tokens_per_second": 2014.984 + }, + { + "epoch": 0.5575757575757576, + "grad_norm": 0.014149926602840424, + "learning_rate": 9.94752130881535e-05, + "loss": 0.01367366872727871, + "num_input_tokens_seen": 15065920, + "step": 920, + "train_runtime": 7476.9293, + "train_tokens_per_second": 2014.988 + }, + { + "epoch": 0.5581818181818182, + "grad_norm": 0.02201610431075096, + "learning_rate": 9.947382260559408e-05, + "loss": 0.014585314318537712, + "num_input_tokens_seen": 15082296, + "step": 921, + "train_runtime": 7485.0355, + "train_tokens_per_second": 2014.993 + }, + { + "epoch": 0.5587878787878788, + "grad_norm": 0.016061201691627502, + "learning_rate": 9.947243029309433e-05, + "loss": 0.012058419175446033, + "num_input_tokens_seen": 15098672, + "step": 922, + "train_runtime": 7493.1729, + "train_tokens_per_second": 2014.99 + }, + { + "epoch": 0.5593939393939394, + "grad_norm": 0.014283844269812107, + "learning_rate": 9.94710361507057e-05, + "loss": 0.013241814449429512, + "num_input_tokens_seen": 15115048, + "step": 923, + "train_runtime": 7501.2834, + "train_tokens_per_second": 2014.995 + }, + { + "epoch": 0.56, + "grad_norm": 0.014411736279726028, + "learning_rate": 9.94696401784798e-05, + "loss": 0.011771513149142265, + "num_input_tokens_seen": 15131424, + "step": 924, + "train_runtime": 7509.3934, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 0.5606060606060606, + "grad_norm": 0.015076170675456524, + "learning_rate": 9.946824237646824e-05, + "loss": 0.012921320274472237, + "num_input_tokens_seen": 15147800, + "step": 925, + "train_runtime": 7517.5023, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 0.5612121212121212, + "grad_norm": 0.019479839131236076, + "learning_rate": 9.94668427447227e-05, + "loss": 0.01365247555077076, + "num_input_tokens_seen": 15164176, + "step": 926, + "train_runtime": 7525.6088, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 0.5618181818181818, + "grad_norm": 0.015186650678515434, + "learning_rate": 9.946544128329502e-05, + "loss": 0.011964188888669014, + "num_input_tokens_seen": 15180552, + "step": 927, + "train_runtime": 7533.7184, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 0.5624242424242424, + "grad_norm": 0.01884932816028595, + "learning_rate": 9.9464037992237e-05, + "loss": 0.013231384567916393, + "num_input_tokens_seen": 15196928, + "step": 928, + "train_runtime": 7541.8298, + "train_tokens_per_second": 2015.019 + }, + { + "epoch": 0.563030303030303, + "grad_norm": 0.024524593725800514, + "learning_rate": 9.946263287160051e-05, + "loss": 0.013677388429641724, + "num_input_tokens_seen": 15213304, + "step": 929, + "train_runtime": 7549.9392, + "train_tokens_per_second": 2015.023 + }, + { + "epoch": 0.5636363636363636, + "grad_norm": 0.017896726727485657, + "learning_rate": 9.946122592143758e-05, + "loss": 0.012685752473771572, + "num_input_tokens_seen": 15229680, + "step": 930, + "train_runtime": 7558.0487, + "train_tokens_per_second": 2015.028 + }, + { + "epoch": 0.5642424242424242, + "grad_norm": 0.02456982247531414, + "learning_rate": 9.945981714180021e-05, + "loss": 0.012439090758562088, + "num_input_tokens_seen": 15246056, + "step": 931, + "train_runtime": 7566.1626, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 0.5648484848484848, + "grad_norm": 0.011778507381677628, + "learning_rate": 9.945840653274052e-05, + "loss": 0.01277371309697628, + "num_input_tokens_seen": 15262432, + "step": 932, + "train_runtime": 7574.272, + "train_tokens_per_second": 2015.036 + }, + { + "epoch": 0.5654545454545454, + "grad_norm": 0.00871087983250618, + "learning_rate": 9.945699409431071e-05, + "loss": 0.012337596155703068, + "num_input_tokens_seen": 15278808, + "step": 933, + "train_runtime": 7582.3801, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 0.566060606060606, + "grad_norm": 0.02395842783153057, + "learning_rate": 9.945557982656299e-05, + "loss": 0.013987423852086067, + "num_input_tokens_seen": 15295184, + "step": 934, + "train_runtime": 7590.493, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 0.014825602062046528, + "learning_rate": 9.945416372954968e-05, + "loss": 0.013695470988750458, + "num_input_tokens_seen": 15311560, + "step": 935, + "train_runtime": 7598.6032, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 0.5672727272727273, + "grad_norm": 0.034912459552288055, + "learning_rate": 9.945274580332316e-05, + "loss": 0.014644785784184933, + "num_input_tokens_seen": 15327936, + "step": 936, + "train_runtime": 7606.7121, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 0.5678787878787879, + "grad_norm": 0.015183918178081512, + "learning_rate": 9.945132604793588e-05, + "loss": 0.013066308572888374, + "num_input_tokens_seen": 15344312, + "step": 937, + "train_runtime": 7614.8211, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 0.5684848484848485, + "grad_norm": 0.015175413340330124, + "learning_rate": 9.944990446344033e-05, + "loss": 0.012659816071391106, + "num_input_tokens_seen": 15360688, + "step": 938, + "train_runtime": 7622.9305, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 0.5690909090909091, + "grad_norm": 0.00944305956363678, + "learning_rate": 9.944848104988915e-05, + "loss": 0.012941330671310425, + "num_input_tokens_seen": 15377064, + "step": 939, + "train_runtime": 7631.041, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 0.5696969696969697, + "grad_norm": 0.008134279400110245, + "learning_rate": 9.944705580733493e-05, + "loss": 0.012083706445991993, + "num_input_tokens_seen": 15393440, + "step": 940, + "train_runtime": 7639.151, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 0.5703030303030303, + "grad_norm": 0.01920422352850437, + "learning_rate": 9.944562873583042e-05, + "loss": 0.012228092178702354, + "num_input_tokens_seen": 15409816, + "step": 941, + "train_runtime": 7647.2582, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 0.5709090909090909, + "grad_norm": 0.02532947063446045, + "learning_rate": 9.944419983542839e-05, + "loss": 0.014129354618489742, + "num_input_tokens_seen": 15426192, + "step": 942, + "train_runtime": 7655.3689, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 0.5715151515151515, + "grad_norm": 0.014770124107599258, + "learning_rate": 9.944276910618168e-05, + "loss": 0.01307615451514721, + "num_input_tokens_seen": 15442568, + "step": 943, + "train_runtime": 7663.4788, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 0.5721212121212121, + "grad_norm": 0.04172991216182709, + "learning_rate": 9.944133654814325e-05, + "loss": 0.01433885470032692, + "num_input_tokens_seen": 15458944, + "step": 944, + "train_runtime": 7671.5887, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 0.5727272727272728, + "grad_norm": 0.02282462641596794, + "learning_rate": 9.943990216136605e-05, + "loss": 0.012092739343643188, + "num_input_tokens_seen": 15475320, + "step": 945, + "train_runtime": 7679.6999, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.0323781855404377, + "learning_rate": 9.943846594590316e-05, + "loss": 0.014233306050300598, + "num_input_tokens_seen": 15491696, + "step": 946, + "train_runtime": 7687.8075, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.573939393939394, + "grad_norm": 0.016390513628721237, + "learning_rate": 9.943702790180769e-05, + "loss": 0.01384427584707737, + "num_input_tokens_seen": 15508072, + "step": 947, + "train_runtime": 7695.9168, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 0.5745454545454546, + "grad_norm": 0.017519650980830193, + "learning_rate": 9.943558802913282e-05, + "loss": 0.013568704016506672, + "num_input_tokens_seen": 15524448, + "step": 948, + "train_runtime": 7704.0297, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 0.5751515151515152, + "grad_norm": 0.012753440998494625, + "learning_rate": 9.943414632793184e-05, + "loss": 0.012147994711995125, + "num_input_tokens_seen": 15540824, + "step": 949, + "train_runtime": 7712.145, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 0.011699757538735867, + "learning_rate": 9.943270279825803e-05, + "loss": 0.013070912100374699, + "num_input_tokens_seen": 15557200, + "step": 950, + "train_runtime": 7720.2554, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 0.5763636363636364, + "grad_norm": 0.01527287345379591, + "learning_rate": 9.943125744016483e-05, + "loss": 0.011352474801242352, + "num_input_tokens_seen": 15573576, + "step": 951, + "train_runtime": 7728.3625, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 0.576969696969697, + "grad_norm": 0.025451278313994408, + "learning_rate": 9.942981025370568e-05, + "loss": 0.013020837679505348, + "num_input_tokens_seen": 15589952, + "step": 952, + "train_runtime": 7736.4706, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 0.5775757575757576, + "grad_norm": 0.021832725033164024, + "learning_rate": 9.942836123893408e-05, + "loss": 0.015131472609937191, + "num_input_tokens_seen": 15606328, + "step": 953, + "train_runtime": 7744.5854, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 0.5781818181818181, + "grad_norm": 0.022370878607034683, + "learning_rate": 9.942691039590369e-05, + "loss": 0.012688050046563148, + "num_input_tokens_seen": 15622704, + "step": 954, + "train_runtime": 7752.6981, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 0.5787878787878787, + "grad_norm": 0.021051136776804924, + "learning_rate": 9.942545772466814e-05, + "loss": 0.012345478869974613, + "num_input_tokens_seen": 15639080, + "step": 955, + "train_runtime": 7760.8061, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 0.5793939393939394, + "grad_norm": 0.01372633595019579, + "learning_rate": 9.942400322528114e-05, + "loss": 0.012315414845943451, + "num_input_tokens_seen": 15655456, + "step": 956, + "train_runtime": 7768.9154, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 0.58, + "grad_norm": 0.028729503974318504, + "learning_rate": 9.942254689779651e-05, + "loss": 0.013109761290252209, + "num_input_tokens_seen": 15671832, + "step": 957, + "train_runtime": 7777.0294, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 0.5806060606060606, + "grad_norm": 0.029019076377153397, + "learning_rate": 9.942108874226811e-05, + "loss": 0.013196980580687523, + "num_input_tokens_seen": 15688208, + "step": 958, + "train_runtime": 7785.1364, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 0.5812121212121212, + "grad_norm": 0.011110197752714157, + "learning_rate": 9.94196287587499e-05, + "loss": 0.012527218088507652, + "num_input_tokens_seen": 15704584, + "step": 959, + "train_runtime": 7793.2454, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 0.012445122934877872, + "learning_rate": 9.941816694729586e-05, + "loss": 0.013050834648311138, + "num_input_tokens_seen": 15720960, + "step": 960, + "train_runtime": 7801.3578, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.5824242424242424, + "grad_norm": 0.01324465125799179, + "learning_rate": 9.941670330796007e-05, + "loss": 0.012385859154164791, + "num_input_tokens_seen": 15737336, + "step": 961, + "train_runtime": 7809.4681, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 0.583030303030303, + "grad_norm": 0.020351726561784744, + "learning_rate": 9.941523784079665e-05, + "loss": 0.013481922447681427, + "num_input_tokens_seen": 15753712, + "step": 962, + "train_runtime": 7817.5774, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.5836363636363636, + "grad_norm": 0.017218874767422676, + "learning_rate": 9.94137705458598e-05, + "loss": 0.011243843473494053, + "num_input_tokens_seen": 15770088, + "step": 963, + "train_runtime": 7825.6869, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 0.5842424242424242, + "grad_norm": 0.020052634179592133, + "learning_rate": 9.941230142320381e-05, + "loss": 0.01419176533818245, + "num_input_tokens_seen": 15786464, + "step": 964, + "train_runtime": 7833.7989, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 0.5848484848484848, + "grad_norm": 0.01865479350090027, + "learning_rate": 9.941083047288305e-05, + "loss": 0.013855772092938423, + "num_input_tokens_seen": 15802840, + "step": 965, + "train_runtime": 7841.9087, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.5854545454545454, + "grad_norm": 0.019557680934667587, + "learning_rate": 9.940935769495186e-05, + "loss": 0.014046021737158298, + "num_input_tokens_seen": 15819216, + "step": 966, + "train_runtime": 7850.0169, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 0.5860606060606061, + "grad_norm": 0.01921168901026249, + "learning_rate": 9.940788308946476e-05, + "loss": 0.013276162557303905, + "num_input_tokens_seen": 15835592, + "step": 967, + "train_runtime": 7858.13, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.015911763533949852, + "learning_rate": 9.940640665647626e-05, + "loss": 0.012454750947654247, + "num_input_tokens_seen": 15851968, + "step": 968, + "train_runtime": 7866.2398, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 0.5872727272727273, + "grad_norm": 0.020958999171853065, + "learning_rate": 9.940492839604103e-05, + "loss": 0.01228359155356884, + "num_input_tokens_seen": 15868344, + "step": 969, + "train_runtime": 7874.3484, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 0.5878787878787879, + "grad_norm": 0.017634913325309753, + "learning_rate": 9.940344830821368e-05, + "loss": 0.013240614905953407, + "num_input_tokens_seen": 15884720, + "step": 970, + "train_runtime": 7882.4581, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 0.5884848484848485, + "grad_norm": 0.018232690170407295, + "learning_rate": 9.9401966393049e-05, + "loss": 0.01443801261484623, + "num_input_tokens_seen": 15901096, + "step": 971, + "train_runtime": 7890.5672, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 0.5890909090909091, + "grad_norm": 0.021868707612156868, + "learning_rate": 9.94004826506018e-05, + "loss": 0.014730443246662617, + "num_input_tokens_seen": 15917472, + "step": 972, + "train_runtime": 7898.6753, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 0.5896969696969697, + "grad_norm": 0.015589121729135513, + "learning_rate": 9.939899708092692e-05, + "loss": 0.011880002915859222, + "num_input_tokens_seen": 15933848, + "step": 973, + "train_runtime": 7906.7854, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 0.5903030303030303, + "grad_norm": 0.010916100814938545, + "learning_rate": 9.939750968407938e-05, + "loss": 0.011822294443845749, + "num_input_tokens_seen": 15950224, + "step": 974, + "train_runtime": 7914.891, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 0.5909090909090909, + "grad_norm": 0.014051892794668674, + "learning_rate": 9.939602046011412e-05, + "loss": 0.012878884561359882, + "num_input_tokens_seen": 15966600, + "step": 975, + "train_runtime": 7923.0019, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 0.5915151515151515, + "grad_norm": 0.032839711755514145, + "learning_rate": 9.939452940908626e-05, + "loss": 0.014527475461363792, + "num_input_tokens_seen": 15982976, + "step": 976, + "train_runtime": 7931.1131, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 0.5921212121212122, + "grad_norm": 0.020389258861541748, + "learning_rate": 9.939303653105096e-05, + "loss": 0.013167984783649445, + "num_input_tokens_seen": 15999352, + "step": 977, + "train_runtime": 7939.2338, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 0.5927272727272728, + "grad_norm": 0.025760652497410774, + "learning_rate": 9.939154182606341e-05, + "loss": 0.01562490500509739, + "num_input_tokens_seen": 16015728, + "step": 978, + "train_runtime": 7947.343, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 0.5933333333333334, + "grad_norm": 0.017900720238685608, + "learning_rate": 9.939004529417894e-05, + "loss": 0.011635327711701393, + "num_input_tokens_seen": 16032104, + "step": 979, + "train_runtime": 7955.4555, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 0.593939393939394, + "grad_norm": 0.018658578395843506, + "learning_rate": 9.938854693545285e-05, + "loss": 0.011654762551188469, + "num_input_tokens_seen": 16048480, + "step": 980, + "train_runtime": 7963.5661, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 0.5945454545454546, + "grad_norm": 0.01790103130042553, + "learning_rate": 9.938704674994062e-05, + "loss": 0.013270128518342972, + "num_input_tokens_seen": 16064856, + "step": 981, + "train_runtime": 7971.6756, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 0.5951515151515151, + "grad_norm": 0.039879657328128815, + "learning_rate": 9.938554473769768e-05, + "loss": 0.01646546646952629, + "num_input_tokens_seen": 16081232, + "step": 982, + "train_runtime": 7979.7879, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 0.5957575757575757, + "grad_norm": 0.013998485170304775, + "learning_rate": 9.938404089877961e-05, + "loss": 0.012206289917230606, + "num_input_tokens_seen": 16097608, + "step": 983, + "train_runtime": 7987.8964, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 0.5963636363636363, + "grad_norm": 0.006746624130755663, + "learning_rate": 9.938253523324206e-05, + "loss": 0.012235766276717186, + "num_input_tokens_seen": 16113984, + "step": 984, + "train_runtime": 7996.0036, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 0.5969696969696969, + "grad_norm": 0.022575756534934044, + "learning_rate": 9.93810277411407e-05, + "loss": 0.012963814660906792, + "num_input_tokens_seen": 16130360, + "step": 985, + "train_runtime": 8004.1105, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 0.5975757575757575, + "grad_norm": 0.007626754697412252, + "learning_rate": 9.937951842253127e-05, + "loss": 0.01213219203054905, + "num_input_tokens_seen": 16146736, + "step": 986, + "train_runtime": 8012.2202, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 0.5981818181818181, + "grad_norm": 0.013599387370049953, + "learning_rate": 9.937800727746964e-05, + "loss": 0.012984167784452438, + "num_input_tokens_seen": 16163112, + "step": 987, + "train_runtime": 8020.337, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 0.5987878787878788, + "grad_norm": 0.010270299389958382, + "learning_rate": 9.937649430601166e-05, + "loss": 0.011544723995029926, + "num_input_tokens_seen": 16179488, + "step": 988, + "train_runtime": 8028.447, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 0.5993939393939394, + "grad_norm": 0.03377272188663483, + "learning_rate": 9.937497950821332e-05, + "loss": 0.01466489490121603, + "num_input_tokens_seen": 16195864, + "step": 989, + "train_runtime": 8036.5629, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 0.6, + "grad_norm": 0.012808220461010933, + "learning_rate": 9.937346288413064e-05, + "loss": 0.014080810360610485, + "num_input_tokens_seen": 16212240, + "step": 990, + "train_runtime": 8044.6741, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.6006060606060606, + "grad_norm": 0.022888874635100365, + "learning_rate": 9.937194443381972e-05, + "loss": 0.012964661233127117, + "num_input_tokens_seen": 16228616, + "step": 991, + "train_runtime": 8052.7845, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 0.6012121212121212, + "grad_norm": 0.028279505670070648, + "learning_rate": 9.937042415733673e-05, + "loss": 0.012717594392597675, + "num_input_tokens_seen": 16244992, + "step": 992, + "train_runtime": 8060.8929, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 0.6018181818181818, + "grad_norm": 0.09445340186357498, + "learning_rate": 9.936890205473787e-05, + "loss": 0.013668234460055828, + "num_input_tokens_seen": 16261368, + "step": 993, + "train_runtime": 8069.0044, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 0.6024242424242424, + "grad_norm": 0.008610354736447334, + "learning_rate": 9.936737812607949e-05, + "loss": 0.011679118499159813, + "num_input_tokens_seen": 16277744, + "step": 994, + "train_runtime": 8077.1154, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 0.603030303030303, + "grad_norm": 0.017112495377659798, + "learning_rate": 9.936585237141792e-05, + "loss": 0.012689062394201756, + "num_input_tokens_seen": 16294120, + "step": 995, + "train_runtime": 8085.2294, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 0.6036363636363636, + "grad_norm": 0.0271944347769022, + "learning_rate": 9.936432479080961e-05, + "loss": 0.014213870279490948, + "num_input_tokens_seen": 16310496, + "step": 996, + "train_runtime": 8093.3362, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 0.6042424242424242, + "grad_norm": 0.012547393329441547, + "learning_rate": 9.936279538431106e-05, + "loss": 0.012523166835308075, + "num_input_tokens_seen": 16326872, + "step": 997, + "train_runtime": 8101.4449, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 0.6048484848484849, + "grad_norm": 0.02419351600110531, + "learning_rate": 9.936126415197884e-05, + "loss": 0.014308387413620949, + "num_input_tokens_seen": 16343248, + "step": 998, + "train_runtime": 8109.5556, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 0.6054545454545455, + "grad_norm": 0.015599401667714119, + "learning_rate": 9.935973109386958e-05, + "loss": 0.012808605097234249, + "num_input_tokens_seen": 16359624, + "step": 999, + "train_runtime": 8117.6633, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.021892806515097618, + "learning_rate": 9.935819621003999e-05, + "loss": 0.013939116150140762, + "num_input_tokens_seen": 16376000, + "step": 1000, + "train_runtime": 8125.7712, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 0.6066666666666667, + "grad_norm": 0.01672331802546978, + "learning_rate": 9.935665950054684e-05, + "loss": 0.014093529433012009, + "num_input_tokens_seen": 16392376, + "step": 1001, + "train_runtime": 8134.7177, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 0.6072727272727273, + "grad_norm": 0.009217355400323868, + "learning_rate": 9.9355120965447e-05, + "loss": 0.01290955115109682, + "num_input_tokens_seen": 16408752, + "step": 1002, + "train_runtime": 8142.8295, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 0.6078787878787879, + "grad_norm": 0.01524933148175478, + "learning_rate": 9.935358060479731e-05, + "loss": 0.012339223176240921, + "num_input_tokens_seen": 16425128, + "step": 1003, + "train_runtime": 8150.9365, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 0.6084848484848485, + "grad_norm": 0.02360517345368862, + "learning_rate": 9.935203841865482e-05, + "loss": 0.012756834737956524, + "num_input_tokens_seen": 16441504, + "step": 1004, + "train_runtime": 8159.0458, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 0.6090909090909091, + "grad_norm": 0.020947473123669624, + "learning_rate": 9.93504944070765e-05, + "loss": 0.012582367286086082, + "num_input_tokens_seen": 16457880, + "step": 1005, + "train_runtime": 8167.1644, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 0.6096969696969697, + "grad_norm": 0.01945319212973118, + "learning_rate": 9.934894857011953e-05, + "loss": 0.012788314372301102, + "num_input_tokens_seen": 16474256, + "step": 1006, + "train_runtime": 8175.2812, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 0.6103030303030303, + "grad_norm": 0.0219440758228302, + "learning_rate": 9.934740090784103e-05, + "loss": 0.013707922771573067, + "num_input_tokens_seen": 16490632, + "step": 1007, + "train_runtime": 8183.3988, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 0.610909090909091, + "grad_norm": 0.012798693962395191, + "learning_rate": 9.934585142029828e-05, + "loss": 0.013069421984255314, + "num_input_tokens_seen": 16507008, + "step": 1008, + "train_runtime": 8191.5179, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 0.6115151515151516, + "grad_norm": 0.012583008036017418, + "learning_rate": 9.934430010754861e-05, + "loss": 0.011966132558882236, + "num_input_tokens_seen": 16523384, + "step": 1009, + "train_runtime": 8199.6359, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 0.6121212121212121, + "grad_norm": 0.03669752925634384, + "learning_rate": 9.934274696964934e-05, + "loss": 0.014166103675961494, + "num_input_tokens_seen": 16539760, + "step": 1010, + "train_runtime": 8207.7511, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.6127272727272727, + "grad_norm": 0.019834555685520172, + "learning_rate": 9.934119200665795e-05, + "loss": 0.011456426233053207, + "num_input_tokens_seen": 16556136, + "step": 1011, + "train_runtime": 8215.8683, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.017150534316897392, + "learning_rate": 9.933963521863196e-05, + "loss": 0.012325924821197987, + "num_input_tokens_seen": 16572512, + "step": 1012, + "train_runtime": 8223.99, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 0.6139393939393939, + "grad_norm": 0.013030534610152245, + "learning_rate": 9.933807660562898e-05, + "loss": 0.012827505357563496, + "num_input_tokens_seen": 16588888, + "step": 1013, + "train_runtime": 8232.106, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 0.6145454545454545, + "grad_norm": 0.01751735992729664, + "learning_rate": 9.933651616770658e-05, + "loss": 0.012782123871147633, + "num_input_tokens_seen": 16605264, + "step": 1014, + "train_runtime": 8240.2294, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 0.6151515151515151, + "grad_norm": 0.013464527204632759, + "learning_rate": 9.933495390492256e-05, + "loss": 0.014123444445431232, + "num_input_tokens_seen": 16621640, + "step": 1015, + "train_runtime": 8248.3463, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 0.6157575757575757, + "grad_norm": 0.026679445058107376, + "learning_rate": 9.933338981733464e-05, + "loss": 0.012160470709204674, + "num_input_tokens_seen": 16638016, + "step": 1016, + "train_runtime": 8256.4635, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 0.6163636363636363, + "grad_norm": 0.010502724908292294, + "learning_rate": 9.933182390500073e-05, + "loss": 0.011820110492408276, + "num_input_tokens_seen": 16654392, + "step": 1017, + "train_runtime": 8264.5788, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 0.616969696969697, + "grad_norm": 0.013210924342274666, + "learning_rate": 9.93302561679787e-05, + "loss": 0.013029432855546474, + "num_input_tokens_seen": 16670768, + "step": 1018, + "train_runtime": 8272.6927, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.6175757575757576, + "grad_norm": 0.032258208841085434, + "learning_rate": 9.932868660632659e-05, + "loss": 0.012911350466310978, + "num_input_tokens_seen": 16687144, + "step": 1019, + "train_runtime": 8280.8162, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 0.03345981612801552, + "learning_rate": 9.932711522010241e-05, + "loss": 0.01444256491959095, + "num_input_tokens_seen": 16703520, + "step": 1020, + "train_runtime": 8288.934, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.6187878787878788, + "grad_norm": 0.023281559348106384, + "learning_rate": 9.932554200936429e-05, + "loss": 0.014297975227236748, + "num_input_tokens_seen": 16719896, + "step": 1021, + "train_runtime": 8297.0522, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 0.6193939393939394, + "grad_norm": 0.02298637479543686, + "learning_rate": 9.932396697417044e-05, + "loss": 0.012052800506353378, + "num_input_tokens_seen": 16736272, + "step": 1022, + "train_runtime": 8305.1688, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.62, + "grad_norm": 0.01204346027225256, + "learning_rate": 9.932239011457909e-05, + "loss": 0.012858795002102852, + "num_input_tokens_seen": 16752648, + "step": 1023, + "train_runtime": 8313.2898, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.6206060606060606, + "grad_norm": 0.018114762380719185, + "learning_rate": 9.93208114306486e-05, + "loss": 0.013215101324021816, + "num_input_tokens_seen": 16769024, + "step": 1024, + "train_runtime": 8321.4063, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 0.6212121212121212, + "grad_norm": 0.009015897288918495, + "learning_rate": 9.931923092243733e-05, + "loss": 0.013312953524291515, + "num_input_tokens_seen": 16785400, + "step": 1025, + "train_runtime": 8329.5303, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 0.6218181818181818, + "grad_norm": 0.011126898229122162, + "learning_rate": 9.931764859000375e-05, + "loss": 0.011524452827870846, + "num_input_tokens_seen": 16801776, + "step": 1026, + "train_runtime": 8337.647, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 0.6224242424242424, + "grad_norm": 0.021657567471265793, + "learning_rate": 9.93160644334064e-05, + "loss": 0.012531260028481483, + "num_input_tokens_seen": 16818152, + "step": 1027, + "train_runtime": 8345.7666, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 0.623030303030303, + "grad_norm": 0.05316740646958351, + "learning_rate": 9.931447845270388e-05, + "loss": 0.013248222880065441, + "num_input_tokens_seen": 16834528, + "step": 1028, + "train_runtime": 8353.8829, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 0.6236363636363637, + "grad_norm": 0.012917754240334034, + "learning_rate": 9.931289064795482e-05, + "loss": 0.013202149420976639, + "num_input_tokens_seen": 16850904, + "step": 1029, + "train_runtime": 8362.0006, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 0.6242424242424243, + "grad_norm": 0.021064553409814835, + "learning_rate": 9.931130101921795e-05, + "loss": 0.013943769969046116, + "num_input_tokens_seen": 16867280, + "step": 1030, + "train_runtime": 8370.1194, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.6248484848484849, + "grad_norm": 0.012005737982690334, + "learning_rate": 9.930970956655212e-05, + "loss": 0.012500936165452003, + "num_input_tokens_seen": 16883656, + "step": 1031, + "train_runtime": 8378.2369, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 0.6254545454545455, + "grad_norm": 0.02506149373948574, + "learning_rate": 9.930811629001613e-05, + "loss": 0.014318128116428852, + "num_input_tokens_seen": 16900032, + "step": 1032, + "train_runtime": 8386.3552, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 0.6260606060606061, + "grad_norm": 0.03320576995611191, + "learning_rate": 9.930652118966895e-05, + "loss": 0.010508203878998756, + "num_input_tokens_seen": 16916408, + "step": 1033, + "train_runtime": 8394.4718, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.03429649397730827, + "learning_rate": 9.93049242655696e-05, + "loss": 0.012183441780507565, + "num_input_tokens_seen": 16932784, + "step": 1034, + "train_runtime": 8402.5875, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 0.6272727272727273, + "grad_norm": 0.01607862487435341, + "learning_rate": 9.930332551777708e-05, + "loss": 0.013750139623880386, + "num_input_tokens_seen": 16949160, + "step": 1035, + "train_runtime": 8410.7043, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 0.6278787878787879, + "grad_norm": 0.01341179572045803, + "learning_rate": 9.930172494635057e-05, + "loss": 0.012538340874016285, + "num_input_tokens_seen": 16965536, + "step": 1036, + "train_runtime": 8418.8297, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 0.6284848484848485, + "grad_norm": 0.00997228641062975, + "learning_rate": 9.930012255134928e-05, + "loss": 0.012722784653306007, + "num_input_tokens_seen": 16981912, + "step": 1037, + "train_runtime": 8426.9482, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 0.6290909090909091, + "grad_norm": 0.00990308728069067, + "learning_rate": 9.929851833283245e-05, + "loss": 0.013942928053438663, + "num_input_tokens_seen": 16998288, + "step": 1038, + "train_runtime": 8435.0672, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 0.6296969696969696, + "grad_norm": 0.011313795112073421, + "learning_rate": 9.929691229085944e-05, + "loss": 0.011238223873078823, + "num_input_tokens_seen": 17014664, + "step": 1039, + "train_runtime": 8443.1862, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 0.6303030303030303, + "grad_norm": 0.010831150226294994, + "learning_rate": 9.929530442548965e-05, + "loss": 0.012601799331605434, + "num_input_tokens_seen": 17031040, + "step": 1040, + "train_runtime": 8451.3035, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 0.6309090909090909, + "grad_norm": 0.014783729799091816, + "learning_rate": 9.929369473678253e-05, + "loss": 0.013956460170447826, + "num_input_tokens_seen": 17047416, + "step": 1041, + "train_runtime": 8459.4295, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 0.6315151515151515, + "grad_norm": 0.01627667248249054, + "learning_rate": 9.929208322479764e-05, + "loss": 0.013232799246907234, + "num_input_tokens_seen": 17063792, + "step": 1042, + "train_runtime": 8467.5479, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 0.6321212121212121, + "grad_norm": 0.011055609211325645, + "learning_rate": 9.92904698895946e-05, + "loss": 0.01293270569294691, + "num_input_tokens_seen": 17080168, + "step": 1043, + "train_runtime": 8475.665, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 0.6327272727272727, + "grad_norm": 0.03507707267999649, + "learning_rate": 9.928885473123306e-05, + "loss": 0.012113180011510849, + "num_input_tokens_seen": 17096544, + "step": 1044, + "train_runtime": 8483.7839, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 0.01946045272052288, + "learning_rate": 9.928723774977275e-05, + "loss": 0.013142693787813187, + "num_input_tokens_seen": 17112920, + "step": 1045, + "train_runtime": 8491.9041, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 0.6339393939393939, + "grad_norm": 0.021705901250243187, + "learning_rate": 9.928561894527353e-05, + "loss": 0.012501654215157032, + "num_input_tokens_seen": 17129296, + "step": 1046, + "train_runtime": 8500.0295, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 0.6345454545454545, + "grad_norm": 0.019804542884230614, + "learning_rate": 9.928399831779523e-05, + "loss": 0.012758147902786732, + "num_input_tokens_seen": 17145672, + "step": 1047, + "train_runtime": 8508.1486, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 0.6351515151515151, + "grad_norm": 0.011929893866181374, + "learning_rate": 9.928237586739781e-05, + "loss": 0.013042271137237549, + "num_input_tokens_seen": 17162048, + "step": 1048, + "train_runtime": 8516.2673, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 0.6357575757575757, + "grad_norm": 0.028489001095294952, + "learning_rate": 9.928075159414128e-05, + "loss": 0.013056590221822262, + "num_input_tokens_seen": 17178424, + "step": 1049, + "train_runtime": 8524.3858, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.01078235823661089, + "learning_rate": 9.927912549808572e-05, + "loss": 0.012080740183591843, + "num_input_tokens_seen": 17194800, + "step": 1050, + "train_runtime": 8532.5029, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 0.636969696969697, + "grad_norm": 0.021545223891735077, + "learning_rate": 9.927749757929125e-05, + "loss": 0.015170791186392307, + "num_input_tokens_seen": 17211176, + "step": 1051, + "train_runtime": 8540.6203, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 0.6375757575757576, + "grad_norm": 0.021686149761080742, + "learning_rate": 9.927586783781814e-05, + "loss": 0.013388474471867085, + "num_input_tokens_seen": 17227552, + "step": 1052, + "train_runtime": 8548.7393, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 0.6381818181818182, + "grad_norm": 0.019198935478925705, + "learning_rate": 9.927423627372663e-05, + "loss": 0.013151840306818485, + "num_input_tokens_seen": 17243928, + "step": 1053, + "train_runtime": 8556.8572, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 0.6387878787878788, + "grad_norm": 0.026876596733927727, + "learning_rate": 9.927260288707707e-05, + "loss": 0.01568884216248989, + "num_input_tokens_seen": 17260304, + "step": 1054, + "train_runtime": 8564.9754, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 0.6393939393939394, + "grad_norm": 0.02315112017095089, + "learning_rate": 9.92709676779299e-05, + "loss": 0.013643411919474602, + "num_input_tokens_seen": 17276680, + "step": 1055, + "train_runtime": 8573.0936, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 0.64, + "grad_norm": 0.013450577855110168, + "learning_rate": 9.926933064634558e-05, + "loss": 0.011888994835317135, + "num_input_tokens_seen": 17293056, + "step": 1056, + "train_runtime": 8581.213, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 0.6406060606060606, + "grad_norm": 0.038361355662345886, + "learning_rate": 9.926769179238466e-05, + "loss": 0.01497360784560442, + "num_input_tokens_seen": 17309432, + "step": 1057, + "train_runtime": 8589.3331, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 0.6412121212121212, + "grad_norm": 0.019271399825811386, + "learning_rate": 9.926605111610776e-05, + "loss": 0.014056256040930748, + "num_input_tokens_seen": 17325808, + "step": 1058, + "train_runtime": 8597.4511, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 0.6418181818181818, + "grad_norm": 0.01557596493512392, + "learning_rate": 9.926440861757557e-05, + "loss": 0.012796062976121902, + "num_input_tokens_seen": 17342184, + "step": 1059, + "train_runtime": 8605.5697, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 0.6424242424242425, + "grad_norm": 0.005278696306049824, + "learning_rate": 9.926276429684886e-05, + "loss": 0.011402487754821777, + "num_input_tokens_seen": 17358560, + "step": 1060, + "train_runtime": 8613.6883, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 0.6430303030303031, + "grad_norm": 0.015694163739681244, + "learning_rate": 9.926111815398843e-05, + "loss": 0.013192391023039818, + "num_input_tokens_seen": 17374936, + "step": 1061, + "train_runtime": 8621.8068, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 0.6436363636363637, + "grad_norm": 0.01900624856352806, + "learning_rate": 9.925947018905516e-05, + "loss": 0.013219461776316166, + "num_input_tokens_seen": 17391312, + "step": 1062, + "train_runtime": 8629.9293, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 0.6442424242424243, + "grad_norm": 0.013446804136037827, + "learning_rate": 9.925782040211002e-05, + "loss": 0.011763139627873898, + "num_input_tokens_seen": 17407688, + "step": 1063, + "train_runtime": 8638.0493, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 0.6448484848484849, + "grad_norm": 0.01933007501065731, + "learning_rate": 9.925616879321404e-05, + "loss": 0.011931811459362507, + "num_input_tokens_seen": 17424064, + "step": 1064, + "train_runtime": 8646.1674, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 0.6454545454545455, + "grad_norm": 0.016764989122748375, + "learning_rate": 9.925451536242829e-05, + "loss": 0.013410956598818302, + "num_input_tokens_seen": 17440440, + "step": 1065, + "train_runtime": 8654.2855, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 0.6460606060606061, + "grad_norm": 0.019174639135599136, + "learning_rate": 9.925286010981394e-05, + "loss": 0.014691396616399288, + "num_input_tokens_seen": 17456816, + "step": 1066, + "train_runtime": 8662.4024, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 0.6466666666666666, + "grad_norm": 0.0077021801844239235, + "learning_rate": 9.925120303543219e-05, + "loss": 0.012529893778264523, + "num_input_tokens_seen": 17473192, + "step": 1067, + "train_runtime": 8670.5209, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 0.6472727272727272, + "grad_norm": 0.014966354705393314, + "learning_rate": 9.924954413934438e-05, + "loss": 0.013215701095759869, + "num_input_tokens_seen": 17489568, + "step": 1068, + "train_runtime": 8678.6394, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 0.6478787878787878, + "grad_norm": 0.020852232351899147, + "learning_rate": 9.924788342161182e-05, + "loss": 0.013355967588722706, + "num_input_tokens_seen": 17505944, + "step": 1069, + "train_runtime": 8686.7585, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 0.6484848484848484, + "grad_norm": 0.017107227817177773, + "learning_rate": 9.924622088229597e-05, + "loss": 0.014044157229363918, + "num_input_tokens_seen": 17522320, + "step": 1070, + "train_runtime": 8694.8787, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 0.649090909090909, + "grad_norm": 0.015282119624316692, + "learning_rate": 9.924455652145831e-05, + "loss": 0.01387142762541771, + "num_input_tokens_seen": 17538696, + "step": 1071, + "train_runtime": 8702.997, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 0.6496969696969697, + "grad_norm": 0.010007917881011963, + "learning_rate": 9.92428903391604e-05, + "loss": 0.01257625874131918, + "num_input_tokens_seen": 17555072, + "step": 1072, + "train_runtime": 8711.1152, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 0.6503030303030303, + "grad_norm": 0.009446706622838974, + "learning_rate": 9.924122233546386e-05, + "loss": 0.013552306219935417, + "num_input_tokens_seen": 17571448, + "step": 1073, + "train_runtime": 8719.2328, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 0.6509090909090909, + "grad_norm": 0.012225381098687649, + "learning_rate": 9.923955251043042e-05, + "loss": 0.011776247061789036, + "num_input_tokens_seen": 17587824, + "step": 1074, + "train_runtime": 8727.35, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 0.6515151515151515, + "grad_norm": 0.020964186638593674, + "learning_rate": 9.923788086412182e-05, + "loss": 0.012502472847700119, + "num_input_tokens_seen": 17604200, + "step": 1075, + "train_runtime": 8735.4695, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 0.6521212121212121, + "grad_norm": 0.017575940117239952, + "learning_rate": 9.923620739659989e-05, + "loss": 0.012096179649233818, + "num_input_tokens_seen": 17620576, + "step": 1076, + "train_runtime": 8743.5854, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 0.6527272727272727, + "grad_norm": 0.013330096378922462, + "learning_rate": 9.923453210792653e-05, + "loss": 0.013803805224597454, + "num_input_tokens_seen": 17636952, + "step": 1077, + "train_runtime": 8751.7034, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.011349702253937721, + "learning_rate": 9.92328549981637e-05, + "loss": 0.013793877325952053, + "num_input_tokens_seen": 17653328, + "step": 1078, + "train_runtime": 8759.8296, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 0.6539393939393939, + "grad_norm": 0.015959061682224274, + "learning_rate": 9.923117606737346e-05, + "loss": 0.013326899148523808, + "num_input_tokens_seen": 17669704, + "step": 1079, + "train_runtime": 8767.9523, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 0.014492125250399113, + "learning_rate": 9.922949531561788e-05, + "loss": 0.01288958266377449, + "num_input_tokens_seen": 17686080, + "step": 1080, + "train_runtime": 8776.0735, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 0.6551515151515152, + "grad_norm": 0.013345365412533283, + "learning_rate": 9.922781274295913e-05, + "loss": 0.012366179376840591, + "num_input_tokens_seen": 17702456, + "step": 1081, + "train_runtime": 8784.1923, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 0.6557575757575758, + "grad_norm": 0.010763085447251797, + "learning_rate": 9.922612834945947e-05, + "loss": 0.01264217309653759, + "num_input_tokens_seen": 17718832, + "step": 1082, + "train_runtime": 8792.3102, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 0.6563636363636364, + "grad_norm": 0.011818567290902138, + "learning_rate": 9.922444213518117e-05, + "loss": 0.013193395920097828, + "num_input_tokens_seen": 17735208, + "step": 1083, + "train_runtime": 8800.4295, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 0.656969696969697, + "grad_norm": 0.010724381543695927, + "learning_rate": 9.922275410018663e-05, + "loss": 0.012857016175985336, + "num_input_tokens_seen": 17751584, + "step": 1084, + "train_runtime": 8808.5474, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 0.6575757575757576, + "grad_norm": 0.017108984291553497, + "learning_rate": 9.922106424453826e-05, + "loss": 0.013113675639033318, + "num_input_tokens_seen": 17767960, + "step": 1085, + "train_runtime": 8816.6647, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 0.6581818181818182, + "grad_norm": 0.022697484120726585, + "learning_rate": 9.921937256829859e-05, + "loss": 0.012546958401799202, + "num_input_tokens_seen": 17784336, + "step": 1086, + "train_runtime": 8824.7847, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 0.6587878787878788, + "grad_norm": 0.014008583500981331, + "learning_rate": 9.921767907153016e-05, + "loss": 0.011740295216441154, + "num_input_tokens_seen": 17800712, + "step": 1087, + "train_runtime": 8832.904, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 0.6593939393939394, + "grad_norm": 0.011233743280172348, + "learning_rate": 9.921598375429564e-05, + "loss": 0.011731310747563839, + "num_input_tokens_seen": 17817088, + "step": 1088, + "train_runtime": 8841.0299, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 0.66, + "grad_norm": 0.011883188039064407, + "learning_rate": 9.921428661665772e-05, + "loss": 0.012650273740291595, + "num_input_tokens_seen": 17833464, + "step": 1089, + "train_runtime": 8849.1483, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 0.6606060606060606, + "grad_norm": 0.010079750791192055, + "learning_rate": 9.921258765867919e-05, + "loss": 0.012131286785006523, + "num_input_tokens_seen": 17849840, + "step": 1090, + "train_runtime": 8857.2661, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.6612121212121213, + "grad_norm": 0.013724222779273987, + "learning_rate": 9.921088688042287e-05, + "loss": 0.012973928824067116, + "num_input_tokens_seen": 17866216, + "step": 1091, + "train_runtime": 8865.3859, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 0.6618181818181819, + "grad_norm": 0.019831640645861626, + "learning_rate": 9.920918428195168e-05, + "loss": 0.01297835074365139, + "num_input_tokens_seen": 17882592, + "step": 1092, + "train_runtime": 8873.5052, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 0.6624242424242425, + "grad_norm": 0.011757400818169117, + "learning_rate": 9.920747986332858e-05, + "loss": 0.013069117441773415, + "num_input_tokens_seen": 17898968, + "step": 1093, + "train_runtime": 8881.6295, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 0.6630303030303031, + "grad_norm": 0.013741742819547653, + "learning_rate": 9.920577362461665e-05, + "loss": 0.013204855844378471, + "num_input_tokens_seen": 17915344, + "step": 1094, + "train_runtime": 8889.749, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 0.6636363636363637, + "grad_norm": 0.02447706274688244, + "learning_rate": 9.920406556587897e-05, + "loss": 0.011999960988759995, + "num_input_tokens_seen": 17931720, + "step": 1095, + "train_runtime": 8897.8668, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 0.6642424242424242, + "grad_norm": 0.03095782734453678, + "learning_rate": 9.920235568717873e-05, + "loss": 0.01361205242574215, + "num_input_tokens_seen": 17948096, + "step": 1096, + "train_runtime": 8905.9871, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 0.6648484848484848, + "grad_norm": 0.037076305598020554, + "learning_rate": 9.920064398857916e-05, + "loss": 0.012342737056314945, + "num_input_tokens_seen": 17964472, + "step": 1097, + "train_runtime": 8914.1084, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 0.6654545454545454, + "grad_norm": 0.053048014640808105, + "learning_rate": 9.91989304701436e-05, + "loss": 0.012850755825638771, + "num_input_tokens_seen": 17980848, + "step": 1098, + "train_runtime": 8922.229, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 0.666060606060606, + "grad_norm": 0.018742846325039864, + "learning_rate": 9.919721513193538e-05, + "loss": 0.012020561844110489, + "num_input_tokens_seen": 17997224, + "step": 1099, + "train_runtime": 8930.3477, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.013778687454760075, + "learning_rate": 9.919549797401802e-05, + "loss": 0.014269824139773846, + "num_input_tokens_seen": 18013600, + "step": 1100, + "train_runtime": 8938.4671, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 0.6672727272727272, + "grad_norm": 0.06041925400495529, + "learning_rate": 9.919377899645497e-05, + "loss": 0.013500120490789413, + "num_input_tokens_seen": 18029976, + "step": 1101, + "train_runtime": 8947.5154, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 0.6678787878787878, + "grad_norm": 0.006662312895059586, + "learning_rate": 9.919205819930983e-05, + "loss": 0.011903712525963783, + "num_input_tokens_seen": 18046352, + "step": 1102, + "train_runtime": 8955.6388, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 0.6684848484848485, + "grad_norm": 0.014133021235466003, + "learning_rate": 9.919033558264627e-05, + "loss": 0.013043178245425224, + "num_input_tokens_seen": 18062728, + "step": 1103, + "train_runtime": 8963.7612, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 0.6690909090909091, + "grad_norm": 0.018031738698482513, + "learning_rate": 9.918861114652798e-05, + "loss": 0.012816919945180416, + "num_input_tokens_seen": 18079104, + "step": 1104, + "train_runtime": 8971.8826, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 0.6696969696969697, + "grad_norm": 0.030864031985402107, + "learning_rate": 9.918688489101875e-05, + "loss": 0.011915095150470734, + "num_input_tokens_seen": 18095480, + "step": 1105, + "train_runtime": 8980.0063, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 0.6703030303030303, + "grad_norm": 0.859399676322937, + "learning_rate": 9.918515681618246e-05, + "loss": 0.014253467321395874, + "num_input_tokens_seen": 18111856, + "step": 1106, + "train_runtime": 8988.1298, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 0.6709090909090909, + "grad_norm": 0.009849797002971172, + "learning_rate": 9.918342692208297e-05, + "loss": 0.012211693450808525, + "num_input_tokens_seen": 18128232, + "step": 1107, + "train_runtime": 8996.2594, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 0.6715151515151515, + "grad_norm": 0.008677136152982712, + "learning_rate": 9.918169520878432e-05, + "loss": 0.013050990179181099, + "num_input_tokens_seen": 18144608, + "step": 1108, + "train_runtime": 9004.3806, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 0.6721212121212121, + "grad_norm": 0.020974334329366684, + "learning_rate": 9.917996167635053e-05, + "loss": 0.013656461611390114, + "num_input_tokens_seen": 18160984, + "step": 1109, + "train_runtime": 9012.5058, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 0.6727272727272727, + "grad_norm": 0.013642716221511364, + "learning_rate": 9.917822632484575e-05, + "loss": 0.012185771018266678, + "num_input_tokens_seen": 18177360, + "step": 1110, + "train_runtime": 9020.6295, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 0.6733333333333333, + "grad_norm": 0.01303725503385067, + "learning_rate": 9.917648915433413e-05, + "loss": 0.012668903917074203, + "num_input_tokens_seen": 18193736, + "step": 1111, + "train_runtime": 9028.755, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 0.673939393939394, + "grad_norm": 0.02112429402768612, + "learning_rate": 9.917475016487993e-05, + "loss": 0.014089099131524563, + "num_input_tokens_seen": 18210112, + "step": 1112, + "train_runtime": 9036.8746, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 0.6745454545454546, + "grad_norm": 0.016523541882634163, + "learning_rate": 9.917300935654751e-05, + "loss": 0.012728005647659302, + "num_input_tokens_seen": 18226488, + "step": 1113, + "train_runtime": 9044.9946, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 0.6751515151515152, + "grad_norm": 0.0112396739423275, + "learning_rate": 9.917126672940124e-05, + "loss": 0.013019783422350883, + "num_input_tokens_seen": 18242864, + "step": 1114, + "train_runtime": 9053.1208, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 0.6757575757575758, + "grad_norm": 0.21001896262168884, + "learning_rate": 9.916952228350556e-05, + "loss": 0.019040443003177643, + "num_input_tokens_seen": 18259240, + "step": 1115, + "train_runtime": 9061.2411, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 0.6763636363636364, + "grad_norm": 0.015162148512899876, + "learning_rate": 9.916777601892499e-05, + "loss": 0.011509026400744915, + "num_input_tokens_seen": 18275616, + "step": 1116, + "train_runtime": 9069.361, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 0.676969696969697, + "grad_norm": 0.018534110859036446, + "learning_rate": 9.916602793572415e-05, + "loss": 0.012472787871956825, + "num_input_tokens_seen": 18291992, + "step": 1117, + "train_runtime": 9077.4851, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 0.6775757575757576, + "grad_norm": 0.08402104675769806, + "learning_rate": 9.916427803396769e-05, + "loss": 0.014569929800927639, + "num_input_tokens_seen": 18308368, + "step": 1118, + "train_runtime": 9085.6102, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 0.6781818181818182, + "grad_norm": 0.018771981820464134, + "learning_rate": 9.91625263137203e-05, + "loss": 0.011995847336947918, + "num_input_tokens_seen": 18324744, + "step": 1119, + "train_runtime": 9093.7324, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 0.03660675883293152, + "learning_rate": 9.916077277504683e-05, + "loss": 0.013902310281991959, + "num_input_tokens_seen": 18341120, + "step": 1120, + "train_runtime": 9101.8526, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 0.6793939393939394, + "grad_norm": 0.02395397052168846, + "learning_rate": 9.91590174180121e-05, + "loss": 0.012367844581604004, + "num_input_tokens_seen": 18357496, + "step": 1121, + "train_runtime": 9109.9726, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.68, + "grad_norm": 0.019227512180805206, + "learning_rate": 9.915726024268104e-05, + "loss": 0.012134227901697159, + "num_input_tokens_seen": 18373872, + "step": 1122, + "train_runtime": 9118.0998, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.6806060606060607, + "grad_norm": 0.01857166923582554, + "learning_rate": 9.915550124911866e-05, + "loss": 0.013478003442287445, + "num_input_tokens_seen": 18390248, + "step": 1123, + "train_runtime": 9126.2293, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 0.6812121212121212, + "grad_norm": 0.04824969545006752, + "learning_rate": 9.915374043739003e-05, + "loss": 0.012269456870853901, + "num_input_tokens_seen": 18406624, + "step": 1124, + "train_runtime": 9134.35, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 3.1688060760498047, + "learning_rate": 9.915197780756025e-05, + "loss": 0.02297493815422058, + "num_input_tokens_seen": 18423000, + "step": 1125, + "train_runtime": 9142.4746, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 0.6824242424242424, + "grad_norm": 13.637248992919922, + "learning_rate": 9.915021335969452e-05, + "loss": 0.03535247966647148, + "num_input_tokens_seen": 18439376, + "step": 1126, + "train_runtime": 9150.5959, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 0.683030303030303, + "grad_norm": 0.018440239131450653, + "learning_rate": 9.914844709385813e-05, + "loss": 0.014308687299489975, + "num_input_tokens_seen": 18455752, + "step": 1127, + "train_runtime": 9158.7198, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 0.6836363636363636, + "grad_norm": 0.017091959714889526, + "learning_rate": 9.914667901011638e-05, + "loss": 0.012615025043487549, + "num_input_tokens_seen": 18472128, + "step": 1128, + "train_runtime": 9166.8428, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 0.6842424242424242, + "grad_norm": 0.040168218314647675, + "learning_rate": 9.91449091085347e-05, + "loss": 0.013721957802772522, + "num_input_tokens_seen": 18488504, + "step": 1129, + "train_runtime": 9174.9646, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 0.6848484848484848, + "grad_norm": 0.01958506926894188, + "learning_rate": 9.914313738917853e-05, + "loss": 0.015058807097375393, + "num_input_tokens_seen": 18504880, + "step": 1130, + "train_runtime": 9183.0766, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 0.6854545454545454, + "grad_norm": 0.041311051696538925, + "learning_rate": 9.914136385211341e-05, + "loss": 0.011465203016996384, + "num_input_tokens_seen": 18521256, + "step": 1131, + "train_runtime": 9191.1874, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.686060606060606, + "grad_norm": 0.029558753594756126, + "learning_rate": 9.913958849740493e-05, + "loss": 0.013997621834278107, + "num_input_tokens_seen": 18537632, + "step": 1132, + "train_runtime": 9199.2987, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 0.6866666666666666, + "grad_norm": 0.01560160145163536, + "learning_rate": 9.913781132511877e-05, + "loss": 0.01135623175650835, + "num_input_tokens_seen": 18554008, + "step": 1133, + "train_runtime": 9207.4109, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 0.6872727272727273, + "grad_norm": 0.026331115514039993, + "learning_rate": 9.913603233532067e-05, + "loss": 0.014213286340236664, + "num_input_tokens_seen": 18570384, + "step": 1134, + "train_runtime": 9215.5295, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 0.6878787878787879, + "grad_norm": 0.012758780270814896, + "learning_rate": 9.913425152807642e-05, + "loss": 0.013095496222376823, + "num_input_tokens_seen": 18586760, + "step": 1135, + "train_runtime": 9223.6386, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 0.6884848484848485, + "grad_norm": 0.02692464366555214, + "learning_rate": 9.913246890345189e-05, + "loss": 0.014479240402579308, + "num_input_tokens_seen": 18603136, + "step": 1136, + "train_runtime": 9231.7499, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 0.6890909090909091, + "grad_norm": 0.023674434050917625, + "learning_rate": 9.913068446151302e-05, + "loss": 0.01468647737056017, + "num_input_tokens_seen": 18619512, + "step": 1137, + "train_runtime": 9239.8624, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 0.6896969696969697, + "grad_norm": 0.043436527252197266, + "learning_rate": 9.912889820232578e-05, + "loss": 0.013666333630681038, + "num_input_tokens_seen": 18635888, + "step": 1138, + "train_runtime": 9247.9735, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 0.6903030303030303, + "grad_norm": 0.010912930592894554, + "learning_rate": 9.91271101259563e-05, + "loss": 0.013306580483913422, + "num_input_tokens_seen": 18652264, + "step": 1139, + "train_runtime": 9256.0853, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 0.027857549488544464, + "learning_rate": 9.912532023247068e-05, + "loss": 0.01315208338201046, + "num_input_tokens_seen": 18668640, + "step": 1140, + "train_runtime": 9264.193, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.6915151515151515, + "grad_norm": 0.014686026610434055, + "learning_rate": 9.912352852193514e-05, + "loss": 0.012413710355758667, + "num_input_tokens_seen": 18685016, + "step": 1141, + "train_runtime": 9272.3053, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 0.6921212121212121, + "grad_norm": 0.16849519312381744, + "learning_rate": 9.912173499441593e-05, + "loss": 0.013621876947581768, + "num_input_tokens_seen": 18701392, + "step": 1142, + "train_runtime": 9280.4143, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 0.6927272727272727, + "grad_norm": 0.025766436010599136, + "learning_rate": 9.91199396499794e-05, + "loss": 0.014693841338157654, + "num_input_tokens_seen": 18717768, + "step": 1143, + "train_runtime": 9288.5292, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.03636888787150383, + "learning_rate": 9.911814248869198e-05, + "loss": 0.015230114571750164, + "num_input_tokens_seen": 18734144, + "step": 1144, + "train_runtime": 9296.6386, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 0.693939393939394, + "grad_norm": 0.02268008515238762, + "learning_rate": 9.91163435106201e-05, + "loss": 0.014965626411139965, + "num_input_tokens_seen": 18750520, + "step": 1145, + "train_runtime": 9304.7495, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.6945454545454546, + "grad_norm": 0.02825307659804821, + "learning_rate": 9.911454271583034e-05, + "loss": 0.013202480971813202, + "num_input_tokens_seen": 18766896, + "step": 1146, + "train_runtime": 9312.8608, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.6951515151515152, + "grad_norm": 0.0277263056486845, + "learning_rate": 9.911274010438928e-05, + "loss": 0.014979338273406029, + "num_input_tokens_seen": 18783272, + "step": 1147, + "train_runtime": 9320.9729, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 0.6957575757575758, + "grad_norm": 0.03655631095170975, + "learning_rate": 9.91109356763636e-05, + "loss": 0.01276368834078312, + "num_input_tokens_seen": 18799648, + "step": 1148, + "train_runtime": 9329.0854, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.6963636363636364, + "grad_norm": 0.017650572583079338, + "learning_rate": 9.910912943182007e-05, + "loss": 0.013225570321083069, + "num_input_tokens_seen": 18816024, + "step": 1149, + "train_runtime": 9337.1951, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 0.029844503849744797, + "learning_rate": 9.910732137082547e-05, + "loss": 0.012919209897518158, + "num_input_tokens_seen": 18832400, + "step": 1150, + "train_runtime": 9345.3036, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 0.6975757575757576, + "grad_norm": 0.022128146141767502, + "learning_rate": 9.910551149344669e-05, + "loss": 0.013780666515231133, + "num_input_tokens_seen": 18848776, + "step": 1151, + "train_runtime": 9353.4141, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 0.6981818181818182, + "grad_norm": 0.02025616727769375, + "learning_rate": 9.910369979975065e-05, + "loss": 0.014601497910916805, + "num_input_tokens_seen": 18865152, + "step": 1152, + "train_runtime": 9361.5308, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.6987878787878787, + "grad_norm": 0.01940023899078369, + "learning_rate": 9.910188628980439e-05, + "loss": 0.01339776162058115, + "num_input_tokens_seen": 18881528, + "step": 1153, + "train_runtime": 9369.6441, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 0.6993939393939393, + "grad_norm": 0.022027693688869476, + "learning_rate": 9.910007096367497e-05, + "loss": 0.01376222725957632, + "num_input_tokens_seen": 18897904, + "step": 1154, + "train_runtime": 9377.7542, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 0.7, + "grad_norm": 0.006554140709340572, + "learning_rate": 9.909825382142955e-05, + "loss": 0.012087719514966011, + "num_input_tokens_seen": 18914280, + "step": 1155, + "train_runtime": 9385.8634, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 0.7006060606060606, + "grad_norm": 0.011244562454521656, + "learning_rate": 9.909643486313533e-05, + "loss": 0.011743160896003246, + "num_input_tokens_seen": 18930656, + "step": 1156, + "train_runtime": 9393.9756, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 0.7012121212121212, + "grad_norm": 0.015718987211585045, + "learning_rate": 9.909461408885961e-05, + "loss": 0.015649257227778435, + "num_input_tokens_seen": 18947032, + "step": 1157, + "train_runtime": 9402.0879, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 0.7018181818181818, + "grad_norm": 0.014524322003126144, + "learning_rate": 9.909279149866971e-05, + "loss": 0.012584694661200047, + "num_input_tokens_seen": 18963408, + "step": 1158, + "train_runtime": 9410.1978, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 0.7024242424242424, + "grad_norm": 0.01179551426321268, + "learning_rate": 9.909096709263305e-05, + "loss": 0.01177270244807005, + "num_input_tokens_seen": 18979784, + "step": 1159, + "train_runtime": 9418.3067, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 0.703030303030303, + "grad_norm": 0.3294766843318939, + "learning_rate": 9.908914087081714e-05, + "loss": 0.013622680678963661, + "num_input_tokens_seen": 18996160, + "step": 1160, + "train_runtime": 9426.418, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 0.7036363636363636, + "grad_norm": 0.019340241327881813, + "learning_rate": 9.908731283328949e-05, + "loss": 0.013781043700873852, + "num_input_tokens_seen": 19012536, + "step": 1161, + "train_runtime": 9434.5376, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 0.7042424242424242, + "grad_norm": 0.31950604915618896, + "learning_rate": 9.908548298011774e-05, + "loss": 0.013624520972371101, + "num_input_tokens_seen": 19028912, + "step": 1162, + "train_runtime": 9442.6474, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 0.7048484848484848, + "grad_norm": 0.01044798456132412, + "learning_rate": 9.908365131136957e-05, + "loss": 0.013481276109814644, + "num_input_tokens_seen": 19045288, + "step": 1163, + "train_runtime": 9450.7603, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 0.7054545454545454, + "grad_norm": 0.08119679987430573, + "learning_rate": 9.90818178271127e-05, + "loss": 0.01282893493771553, + "num_input_tokens_seen": 19061664, + "step": 1164, + "train_runtime": 9458.8734, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 0.706060606060606, + "grad_norm": 0.013537311926484108, + "learning_rate": 9.907998252741498e-05, + "loss": 0.013240115717053413, + "num_input_tokens_seen": 19078040, + "step": 1165, + "train_runtime": 9466.9849, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.015183590352535248, + "learning_rate": 9.907814541234429e-05, + "loss": 0.01356966607272625, + "num_input_tokens_seen": 19094416, + "step": 1166, + "train_runtime": 9475.0931, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 0.7072727272727273, + "grad_norm": 0.01905563659965992, + "learning_rate": 9.907630648196857e-05, + "loss": 0.011865122243762016, + "num_input_tokens_seen": 19110792, + "step": 1167, + "train_runtime": 9483.2064, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 0.7078787878787879, + "grad_norm": 0.01771489344537258, + "learning_rate": 9.907446573635586e-05, + "loss": 0.014323254115879536, + "num_input_tokens_seen": 19127168, + "step": 1168, + "train_runtime": 9491.3179, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 0.7084848484848485, + "grad_norm": 0.013392560184001923, + "learning_rate": 9.907262317557422e-05, + "loss": 0.014154933393001556, + "num_input_tokens_seen": 19143544, + "step": 1169, + "train_runtime": 9499.4298, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 0.7090909090909091, + "grad_norm": 0.01917138509452343, + "learning_rate": 9.907077879969182e-05, + "loss": 0.014620376750826836, + "num_input_tokens_seen": 19159920, + "step": 1170, + "train_runtime": 9507.5424, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 0.7096969696969697, + "grad_norm": 0.023388303816318512, + "learning_rate": 9.906893260877686e-05, + "loss": 0.013931838795542717, + "num_input_tokens_seen": 19176296, + "step": 1171, + "train_runtime": 9515.6549, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 0.7103030303030303, + "grad_norm": 0.014943883754312992, + "learning_rate": 9.906708460289765e-05, + "loss": 0.012756659649312496, + "num_input_tokens_seen": 19192672, + "step": 1172, + "train_runtime": 9523.7631, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 0.7109090909090909, + "grad_norm": 0.011030408553779125, + "learning_rate": 9.906523478212252e-05, + "loss": 0.01190275140106678, + "num_input_tokens_seen": 19209048, + "step": 1173, + "train_runtime": 9531.8735, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 0.7115151515151515, + "grad_norm": 0.008161013014614582, + "learning_rate": 9.906338314651993e-05, + "loss": 0.012577732093632221, + "num_input_tokens_seen": 19225424, + "step": 1174, + "train_runtime": 9539.9859, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 0.7121212121212122, + "grad_norm": 0.02119288221001625, + "learning_rate": 9.906152969615833e-05, + "loss": 0.012449773959815502, + "num_input_tokens_seen": 19241800, + "step": 1175, + "train_runtime": 9548.0982, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 0.7127272727272728, + "grad_norm": 0.017582163214683533, + "learning_rate": 9.90596744311063e-05, + "loss": 0.011529134586453438, + "num_input_tokens_seen": 19258176, + "step": 1176, + "train_runtime": 9556.2087, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 0.7133333333333334, + "grad_norm": 0.04412311315536499, + "learning_rate": 9.905781735143245e-05, + "loss": 0.014292292296886444, + "num_input_tokens_seen": 19274552, + "step": 1177, + "train_runtime": 9564.3204, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 0.713939393939394, + "grad_norm": 0.07766410708427429, + "learning_rate": 9.905595845720545e-05, + "loss": 0.011792981065809727, + "num_input_tokens_seen": 19290928, + "step": 1178, + "train_runtime": 9572.4335, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 0.7145454545454546, + "grad_norm": 0.020279264077544212, + "learning_rate": 9.90540977484941e-05, + "loss": 0.014193961396813393, + "num_input_tokens_seen": 19307304, + "step": 1179, + "train_runtime": 9580.5444, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 0.7151515151515152, + "grad_norm": 0.023957345634698868, + "learning_rate": 9.905223522536719e-05, + "loss": 0.01391246635466814, + "num_input_tokens_seen": 19323680, + "step": 1180, + "train_runtime": 9588.6548, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 0.7157575757575757, + "grad_norm": 0.02165958844125271, + "learning_rate": 9.905037088789363e-05, + "loss": 0.014714146964251995, + "num_input_tokens_seen": 19340056, + "step": 1181, + "train_runtime": 9596.7692, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 0.7163636363636363, + "grad_norm": 0.014883043244481087, + "learning_rate": 9.904850473614237e-05, + "loss": 0.013630779460072517, + "num_input_tokens_seen": 19356432, + "step": 1182, + "train_runtime": 9604.8799, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 0.7169696969696969, + "grad_norm": 0.012120597995817661, + "learning_rate": 9.904663677018245e-05, + "loss": 0.013401714153587818, + "num_input_tokens_seen": 19372808, + "step": 1183, + "train_runtime": 9612.9913, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 0.7175757575757575, + "grad_norm": 0.024704404175281525, + "learning_rate": 9.904476699008293e-05, + "loss": 0.015781283378601074, + "num_input_tokens_seen": 19389184, + "step": 1184, + "train_runtime": 9621.1054, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.7181818181818181, + "grad_norm": 0.015950346365571022, + "learning_rate": 9.9042895395913e-05, + "loss": 0.012905421666800976, + "num_input_tokens_seen": 19405560, + "step": 1185, + "train_runtime": 9629.2186, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 0.7187878787878788, + "grad_norm": 0.021412916481494904, + "learning_rate": 9.904102198774188e-05, + "loss": 0.012717105448246002, + "num_input_tokens_seen": 19421936, + "step": 1186, + "train_runtime": 9637.3311, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 0.7193939393939394, + "grad_norm": 0.024673737585544586, + "learning_rate": 9.903914676563885e-05, + "loss": 0.012580260634422302, + "num_input_tokens_seen": 19438312, + "step": 1187, + "train_runtime": 9645.4427, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 0.72, + "grad_norm": 0.07743503898382187, + "learning_rate": 9.90372697296733e-05, + "loss": 0.013859845697879791, + "num_input_tokens_seen": 19454688, + "step": 1188, + "train_runtime": 9653.5584, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 0.7206060606060606, + "grad_norm": 0.014397671446204185, + "learning_rate": 9.903539087991462e-05, + "loss": 0.013244936242699623, + "num_input_tokens_seen": 19471064, + "step": 1189, + "train_runtime": 9661.6716, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 0.7212121212121212, + "grad_norm": 0.027382057160139084, + "learning_rate": 9.903351021643233e-05, + "loss": 0.014433873817324638, + "num_input_tokens_seen": 19487440, + "step": 1190, + "train_runtime": 9669.7828, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 0.7218181818181818, + "grad_norm": 0.013371971435844898, + "learning_rate": 9.903162773929599e-05, + "loss": 0.014319634065032005, + "num_input_tokens_seen": 19503816, + "step": 1191, + "train_runtime": 9677.8954, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 0.7224242424242424, + "grad_norm": 0.02415373921394348, + "learning_rate": 9.902974344857521e-05, + "loss": 0.01522553525865078, + "num_input_tokens_seen": 19520192, + "step": 1192, + "train_runtime": 9686.0046, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 0.723030303030303, + "grad_norm": 0.013075731694698334, + "learning_rate": 9.902785734433971e-05, + "loss": 0.012145644053816795, + "num_input_tokens_seen": 19536568, + "step": 1193, + "train_runtime": 9694.1175, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 0.7236363636363636, + "grad_norm": 0.02217678166925907, + "learning_rate": 9.902596942665925e-05, + "loss": 0.013490047305822372, + "num_input_tokens_seen": 19552944, + "step": 1194, + "train_runtime": 9702.2306, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 0.7242424242424242, + "grad_norm": 0.014989197254180908, + "learning_rate": 9.902407969560364e-05, + "loss": 0.015374877490103245, + "num_input_tokens_seen": 19569320, + "step": 1195, + "train_runtime": 9710.3384, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 0.7248484848484849, + "grad_norm": 0.010880461893975735, + "learning_rate": 9.90221881512428e-05, + "loss": 0.010911534540355206, + "num_input_tokens_seen": 19585696, + "step": 1196, + "train_runtime": 9718.4475, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.7254545454545455, + "grad_norm": 0.0177223589271307, + "learning_rate": 9.90202947936467e-05, + "loss": 0.01328328251838684, + "num_input_tokens_seen": 19602072, + "step": 1197, + "train_runtime": 9726.5574, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 0.7260606060606061, + "grad_norm": 0.015080858021974564, + "learning_rate": 9.901839962288533e-05, + "loss": 0.013248666189610958, + "num_input_tokens_seen": 19618448, + "step": 1198, + "train_runtime": 9734.6668, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 0.7266666666666667, + "grad_norm": 0.01892446167767048, + "learning_rate": 9.901650263902884e-05, + "loss": 0.012533879838883877, + "num_input_tokens_seen": 19634824, + "step": 1199, + "train_runtime": 9742.776, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.0085715651512146, + "learning_rate": 9.901460384214736e-05, + "loss": 0.011274173855781555, + "num_input_tokens_seen": 19651200, + "step": 1200, + "train_runtime": 9750.8874, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 0.7278787878787879, + "grad_norm": 0.030662082135677338, + "learning_rate": 9.901270323231115e-05, + "loss": 0.012586663477122784, + "num_input_tokens_seen": 19667576, + "step": 1201, + "train_runtime": 9759.9377, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 0.7284848484848485, + "grad_norm": 0.012625769712030888, + "learning_rate": 9.901080080959048e-05, + "loss": 0.013224436901509762, + "num_input_tokens_seen": 19683952, + "step": 1202, + "train_runtime": 9768.0467, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 0.7290909090909091, + "grad_norm": 0.012317335233092308, + "learning_rate": 9.900889657405573e-05, + "loss": 0.012883040122687817, + "num_input_tokens_seen": 19700328, + "step": 1203, + "train_runtime": 9776.155, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.7296969696969697, + "grad_norm": 0.012403651140630245, + "learning_rate": 9.900699052577736e-05, + "loss": 0.012290080077946186, + "num_input_tokens_seen": 19716704, + "step": 1204, + "train_runtime": 9784.2649, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 0.7303030303030303, + "grad_norm": 0.01588149555027485, + "learning_rate": 9.900508266482582e-05, + "loss": 0.011603264138102531, + "num_input_tokens_seen": 19733080, + "step": 1205, + "train_runtime": 9792.3778, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 0.730909090909091, + "grad_norm": 0.014620691537857056, + "learning_rate": 9.900317299127171e-05, + "loss": 0.012423778884112835, + "num_input_tokens_seen": 19749456, + "step": 1206, + "train_runtime": 9800.4881, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 0.7315151515151516, + "grad_norm": 0.012740055099129677, + "learning_rate": 9.900126150518567e-05, + "loss": 0.013299481943249702, + "num_input_tokens_seen": 19765832, + "step": 1207, + "train_runtime": 9808.599, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 0.7321212121212122, + "grad_norm": 0.015813497826457024, + "learning_rate": 9.899934820663839e-05, + "loss": 0.014216665178537369, + "num_input_tokens_seen": 19782208, + "step": 1208, + "train_runtime": 9816.7097, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.7327272727272728, + "grad_norm": 0.023462215438485146, + "learning_rate": 9.899743309570065e-05, + "loss": 0.014444109052419662, + "num_input_tokens_seen": 19798584, + "step": 1209, + "train_runtime": 9824.8204, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.016535522416234016, + "learning_rate": 9.899551617244326e-05, + "loss": 0.012044892646372318, + "num_input_tokens_seen": 19814960, + "step": 1210, + "train_runtime": 9832.9302, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.7339393939393939, + "grad_norm": 0.01581740379333496, + "learning_rate": 9.899359743693714e-05, + "loss": 0.014411653392016888, + "num_input_tokens_seen": 19831336, + "step": 1211, + "train_runtime": 9841.0417, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.7345454545454545, + "grad_norm": 0.01694261096417904, + "learning_rate": 9.899167688925328e-05, + "loss": 0.01339998934417963, + "num_input_tokens_seen": 19847712, + "step": 1212, + "train_runtime": 9849.1549, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 0.7351515151515151, + "grad_norm": 0.011397319845855236, + "learning_rate": 9.898975452946268e-05, + "loss": 0.013992566615343094, + "num_input_tokens_seen": 19864088, + "step": 1213, + "train_runtime": 9857.2628, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 0.7357575757575757, + "grad_norm": 0.009932632558047771, + "learning_rate": 9.898783035763648e-05, + "loss": 0.013121276162564754, + "num_input_tokens_seen": 19880464, + "step": 1214, + "train_runtime": 9865.3743, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 0.7363636363636363, + "grad_norm": 0.039875004440546036, + "learning_rate": 9.898590437384583e-05, + "loss": 0.013154653832316399, + "num_input_tokens_seen": 19896840, + "step": 1215, + "train_runtime": 9873.4892, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.7369696969696969, + "grad_norm": 0.014247733168303967, + "learning_rate": 9.898397657816198e-05, + "loss": 0.012165211141109467, + "num_input_tokens_seen": 19913216, + "step": 1216, + "train_runtime": 9881.6008, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 0.7375757575757576, + "grad_norm": 0.020671000704169273, + "learning_rate": 9.89820469706562e-05, + "loss": 0.012851119041442871, + "num_input_tokens_seen": 19929592, + "step": 1217, + "train_runtime": 9889.711, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 0.7381818181818182, + "grad_norm": 0.01268229354172945, + "learning_rate": 9.898011555139991e-05, + "loss": 0.011670916341245174, + "num_input_tokens_seen": 19945968, + "step": 1218, + "train_runtime": 9897.8448, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 0.7387878787878788, + "grad_norm": 0.014971123076975346, + "learning_rate": 9.897818232046454e-05, + "loss": 0.012817314825952053, + "num_input_tokens_seen": 19962344, + "step": 1219, + "train_runtime": 9905.9579, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 0.7393939393939394, + "grad_norm": 0.03158552944660187, + "learning_rate": 9.897624727792159e-05, + "loss": 0.01493182685226202, + "num_input_tokens_seen": 19978720, + "step": 1220, + "train_runtime": 9914.0699, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 0.74, + "grad_norm": 0.013837055303156376, + "learning_rate": 9.897431042384261e-05, + "loss": 0.01410394161939621, + "num_input_tokens_seen": 19995096, + "step": 1221, + "train_runtime": 9922.1811, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 0.7406060606060606, + "grad_norm": 0.02035367488861084, + "learning_rate": 9.897237175829926e-05, + "loss": 0.014466963708400726, + "num_input_tokens_seen": 20011472, + "step": 1222, + "train_runtime": 9930.2933, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 0.7412121212121212, + "grad_norm": 0.03811359778046608, + "learning_rate": 9.897043128136325e-05, + "loss": 0.013205880299210548, + "num_input_tokens_seen": 20027848, + "step": 1223, + "train_runtime": 9938.4061, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 0.7418181818181818, + "grad_norm": 0.018652835860848427, + "learning_rate": 9.896848899310636e-05, + "loss": 0.013042958453297615, + "num_input_tokens_seen": 20044224, + "step": 1224, + "train_runtime": 9946.5152, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 0.7424242424242424, + "grad_norm": 0.017733843997120857, + "learning_rate": 9.896654489360042e-05, + "loss": 0.012684517540037632, + "num_input_tokens_seen": 20060600, + "step": 1225, + "train_runtime": 9954.6306, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 0.743030303030303, + "grad_norm": 0.010155964642763138, + "learning_rate": 9.896459898291734e-05, + "loss": 0.011605635285377502, + "num_input_tokens_seen": 20076976, + "step": 1226, + "train_runtime": 9962.74, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 0.7436363636363637, + "grad_norm": 0.02421714924275875, + "learning_rate": 9.896265126112911e-05, + "loss": 0.015139145776629448, + "num_input_tokens_seen": 20093352, + "step": 1227, + "train_runtime": 9970.8484, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 0.7442424242424243, + "grad_norm": 0.02827371098101139, + "learning_rate": 9.896070172830776e-05, + "loss": 0.013175873085856438, + "num_input_tokens_seen": 20109728, + "step": 1228, + "train_runtime": 9978.9575, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 0.7448484848484849, + "grad_norm": 0.012187021784484386, + "learning_rate": 9.895875038452539e-05, + "loss": 0.013465436175465584, + "num_input_tokens_seen": 20126104, + "step": 1229, + "train_runtime": 9987.0668, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 0.7454545454545455, + "grad_norm": 0.011740162037312984, + "learning_rate": 9.895679722985419e-05, + "loss": 0.013261547312140465, + "num_input_tokens_seen": 20142480, + "step": 1230, + "train_runtime": 9995.1753, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 0.7460606060606061, + "grad_norm": 0.02706027776002884, + "learning_rate": 9.89548422643664e-05, + "loss": 0.013440998271107674, + "num_input_tokens_seen": 20158856, + "step": 1231, + "train_runtime": 10003.284, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.05222317576408386, + "learning_rate": 9.895288548813432e-05, + "loss": 0.014066273346543312, + "num_input_tokens_seen": 20175232, + "step": 1232, + "train_runtime": 10011.3904, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 0.7472727272727273, + "grad_norm": 0.011138387955725193, + "learning_rate": 9.895092690123035e-05, + "loss": 0.012343725189566612, + "num_input_tokens_seen": 20191608, + "step": 1233, + "train_runtime": 10019.4992, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 0.7478787878787879, + "grad_norm": 0.019493183121085167, + "learning_rate": 9.894896650372692e-05, + "loss": 0.014319119974970818, + "num_input_tokens_seen": 20207984, + "step": 1234, + "train_runtime": 10027.6078, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 0.7484848484848485, + "grad_norm": 0.010399113409221172, + "learning_rate": 9.894700429569653e-05, + "loss": 0.013344192877411842, + "num_input_tokens_seen": 20224360, + "step": 1235, + "train_runtime": 10035.7155, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 0.7490909090909091, + "grad_norm": 0.013207124546170235, + "learning_rate": 9.894504027721179e-05, + "loss": 0.012579311616718769, + "num_input_tokens_seen": 20240736, + "step": 1236, + "train_runtime": 10043.8304, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 0.7496969696969698, + "grad_norm": 0.007676406297832727, + "learning_rate": 9.89430744483453e-05, + "loss": 0.012105286121368408, + "num_input_tokens_seen": 20257112, + "step": 1237, + "train_runtime": 10051.9411, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 0.7503030303030302, + "grad_norm": 0.26611316204071045, + "learning_rate": 9.894110680916981e-05, + "loss": 0.012751906178891659, + "num_input_tokens_seen": 20273488, + "step": 1238, + "train_runtime": 10060.0502, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 0.7509090909090909, + "grad_norm": 0.016328565776348114, + "learning_rate": 9.89391373597581e-05, + "loss": 0.013627522625029087, + "num_input_tokens_seen": 20289864, + "step": 1239, + "train_runtime": 10068.1581, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 0.7515151515151515, + "grad_norm": 0.018115442246198654, + "learning_rate": 9.8937166100183e-05, + "loss": 0.014619875699281693, + "num_input_tokens_seen": 20306240, + "step": 1240, + "train_runtime": 10076.2632, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 0.7521212121212121, + "grad_norm": 0.047225791960954666, + "learning_rate": 9.893519303051742e-05, + "loss": 0.012407291680574417, + "num_input_tokens_seen": 20322616, + "step": 1241, + "train_runtime": 10084.3737, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 0.7527272727272727, + "grad_norm": 0.00958853680640459, + "learning_rate": 9.893321815083435e-05, + "loss": 0.012367008253932, + "num_input_tokens_seen": 20338992, + "step": 1242, + "train_runtime": 10092.4834, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 0.7533333333333333, + "grad_norm": 0.01551489531993866, + "learning_rate": 9.893124146120684e-05, + "loss": 0.011828011833131313, + "num_input_tokens_seen": 20355368, + "step": 1243, + "train_runtime": 10100.5915, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 0.7539393939393939, + "grad_norm": 0.015479539521038532, + "learning_rate": 9.892926296170799e-05, + "loss": 0.013003758154809475, + "num_input_tokens_seen": 20371744, + "step": 1244, + "train_runtime": 10108.6986, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 0.7545454545454545, + "grad_norm": 0.018905159085989, + "learning_rate": 9.892728265241098e-05, + "loss": 0.013263228349387646, + "num_input_tokens_seen": 20388120, + "step": 1245, + "train_runtime": 10116.8092, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 0.7551515151515151, + "grad_norm": 0.02863249182701111, + "learning_rate": 9.892530053338909e-05, + "loss": 0.0130619453266263, + "num_input_tokens_seen": 20404496, + "step": 1246, + "train_runtime": 10124.9156, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.7557575757575757, + "grad_norm": 0.016296787187457085, + "learning_rate": 9.892331660471559e-05, + "loss": 0.012045785784721375, + "num_input_tokens_seen": 20420872, + "step": 1247, + "train_runtime": 10133.0202, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 0.7563636363636363, + "grad_norm": 0.016199452802538872, + "learning_rate": 9.892133086646389e-05, + "loss": 0.012048415839672089, + "num_input_tokens_seen": 20437248, + "step": 1248, + "train_runtime": 10141.1305, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 0.756969696969697, + "grad_norm": 0.012741641141474247, + "learning_rate": 9.891934331870743e-05, + "loss": 0.01335767563432455, + "num_input_tokens_seen": 20453624, + "step": 1249, + "train_runtime": 10149.2473, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.03929731622338295, + "learning_rate": 9.891735396151972e-05, + "loss": 0.01206697802990675, + "num_input_tokens_seen": 20470000, + "step": 1250, + "train_runtime": 10157.3657, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 0.7581818181818182, + "grad_norm": 0.007868324406445026, + "learning_rate": 9.891536279497436e-05, + "loss": 0.011791637167334557, + "num_input_tokens_seen": 20486376, + "step": 1251, + "train_runtime": 10165.4828, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 0.7587878787878788, + "grad_norm": 0.013859824277460575, + "learning_rate": 9.891336981914499e-05, + "loss": 0.014204591512680054, + "num_input_tokens_seen": 20502752, + "step": 1252, + "train_runtime": 10173.6015, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 0.7593939393939394, + "grad_norm": 0.03682630881667137, + "learning_rate": 9.891137503410531e-05, + "loss": 0.01157104317098856, + "num_input_tokens_seen": 20519128, + "step": 1253, + "train_runtime": 10181.7191, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 0.76, + "grad_norm": 0.015358424745500088, + "learning_rate": 9.890937843992913e-05, + "loss": 0.013172848150134087, + "num_input_tokens_seen": 20535504, + "step": 1254, + "train_runtime": 10189.8374, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 0.7606060606060606, + "grad_norm": 0.01969468779861927, + "learning_rate": 9.890738003669029e-05, + "loss": 0.013599451631307602, + "num_input_tokens_seen": 20551880, + "step": 1255, + "train_runtime": 10197.9553, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 0.7612121212121212, + "grad_norm": 0.01678163930773735, + "learning_rate": 9.89053798244627e-05, + "loss": 0.013114574365317822, + "num_input_tokens_seen": 20568256, + "step": 1256, + "train_runtime": 10206.0749, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 0.7618181818181818, + "grad_norm": 0.0193489920347929, + "learning_rate": 9.890337780332035e-05, + "loss": 0.011934047564864159, + "num_input_tokens_seen": 20584632, + "step": 1257, + "train_runtime": 10214.1928, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 0.7624242424242424, + "grad_norm": 0.011665060184895992, + "learning_rate": 9.890137397333729e-05, + "loss": 0.012188711203634739, + "num_input_tokens_seen": 20601008, + "step": 1258, + "train_runtime": 10222.3099, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 0.7630303030303031, + "grad_norm": 0.005775026045739651, + "learning_rate": 9.889936833458763e-05, + "loss": 0.011419412679970264, + "num_input_tokens_seen": 20617384, + "step": 1259, + "train_runtime": 10230.4305, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 0.023811450228095055, + "learning_rate": 9.889736088714558e-05, + "loss": 0.01227609720081091, + "num_input_tokens_seen": 20633760, + "step": 1260, + "train_runtime": 10238.5502, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 0.7642424242424243, + "grad_norm": 0.0233469195663929, + "learning_rate": 9.889535163108537e-05, + "loss": 0.012738242745399475, + "num_input_tokens_seen": 20650136, + "step": 1261, + "train_runtime": 10246.6696, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 0.7648484848484849, + "grad_norm": 0.01263290736824274, + "learning_rate": 9.889334056648131e-05, + "loss": 0.01269836351275444, + "num_input_tokens_seen": 20666512, + "step": 1262, + "train_runtime": 10254.7884, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 0.7654545454545455, + "grad_norm": 0.014581980183720589, + "learning_rate": 9.889132769340781e-05, + "loss": 0.013540278188884258, + "num_input_tokens_seen": 20682888, + "step": 1263, + "train_runtime": 10262.9085, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 0.7660606060606061, + "grad_norm": 0.014391904696822166, + "learning_rate": 9.88893130119393e-05, + "loss": 0.012283596210181713, + "num_input_tokens_seen": 20699264, + "step": 1264, + "train_runtime": 10271.0298, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 0.015524503774940968, + "learning_rate": 9.888729652215032e-05, + "loss": 0.012001638300716877, + "num_input_tokens_seen": 20715640, + "step": 1265, + "train_runtime": 10279.149, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 0.7672727272727272, + "grad_norm": 0.011605373583734035, + "learning_rate": 9.888527822411543e-05, + "loss": 0.012554067187011242, + "num_input_tokens_seen": 20732016, + "step": 1266, + "train_runtime": 10287.2682, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 0.7678787878787878, + "grad_norm": 0.017037956044077873, + "learning_rate": 9.888325811790931e-05, + "loss": 0.013448834419250488, + "num_input_tokens_seen": 20748392, + "step": 1267, + "train_runtime": 10295.3877, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 0.7684848484848484, + "grad_norm": 0.028556402772665024, + "learning_rate": 9.888123620360666e-05, + "loss": 0.012878211215138435, + "num_input_tokens_seen": 20764768, + "step": 1268, + "train_runtime": 10303.5056, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.769090909090909, + "grad_norm": 0.014958829618990421, + "learning_rate": 9.887921248128228e-05, + "loss": 0.013986572623252869, + "num_input_tokens_seen": 20781144, + "step": 1269, + "train_runtime": 10311.6315, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.7696969696969697, + "grad_norm": 0.021999262273311615, + "learning_rate": 9.887718695101102e-05, + "loss": 0.01611473597586155, + "num_input_tokens_seen": 20797520, + "step": 1270, + "train_runtime": 10319.7512, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 0.7703030303030303, + "grad_norm": 0.01434963196516037, + "learning_rate": 9.88751596128678e-05, + "loss": 0.012239954434335232, + "num_input_tokens_seen": 20813896, + "step": 1271, + "train_runtime": 10327.8713, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 0.7709090909090909, + "grad_norm": 0.02051941119134426, + "learning_rate": 9.887313046692761e-05, + "loss": 0.013740262016654015, + "num_input_tokens_seen": 20830272, + "step": 1272, + "train_runtime": 10335.9914, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 0.7715151515151515, + "grad_norm": 0.00836126133799553, + "learning_rate": 9.88710995132655e-05, + "loss": 0.011003411374986172, + "num_input_tokens_seen": 20846648, + "step": 1273, + "train_runtime": 10344.1113, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 0.7721212121212121, + "grad_norm": 0.009217855520546436, + "learning_rate": 9.886906675195657e-05, + "loss": 0.012320063076913357, + "num_input_tokens_seen": 20863024, + "step": 1274, + "train_runtime": 10352.2308, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 0.7727272727272727, + "grad_norm": 0.00831685308367014, + "learning_rate": 9.886703218307604e-05, + "loss": 0.013156922534108162, + "num_input_tokens_seen": 20879400, + "step": 1275, + "train_runtime": 10360.351, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.01840154640376568, + "learning_rate": 9.886499580669917e-05, + "loss": 0.01196813490241766, + "num_input_tokens_seen": 20895776, + "step": 1276, + "train_runtime": 10368.4707, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 0.7739393939393939, + "grad_norm": 0.016405558213591576, + "learning_rate": 9.886295762290125e-05, + "loss": 0.013263520784676075, + "num_input_tokens_seen": 20912152, + "step": 1277, + "train_runtime": 10376.5894, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 0.7745454545454545, + "grad_norm": 0.017034931108355522, + "learning_rate": 9.886091763175769e-05, + "loss": 0.013993248343467712, + "num_input_tokens_seen": 20928528, + "step": 1278, + "train_runtime": 10384.7083, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 0.7751515151515151, + "grad_norm": 0.03572826832532883, + "learning_rate": 9.885887583334393e-05, + "loss": 0.012332772836089134, + "num_input_tokens_seen": 20944904, + "step": 1279, + "train_runtime": 10392.8323, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 0.02001163735985756, + "learning_rate": 9.885683222773551e-05, + "loss": 0.012113104574382305, + "num_input_tokens_seen": 20961280, + "step": 1280, + "train_runtime": 10400.9501, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 0.7763636363636364, + "grad_norm": 0.04807475954294205, + "learning_rate": 9.8854786815008e-05, + "loss": 0.011850223876535892, + "num_input_tokens_seen": 20977656, + "step": 1281, + "train_runtime": 10409.0709, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 0.776969696969697, + "grad_norm": 0.007331644184887409, + "learning_rate": 9.885273959523707e-05, + "loss": 0.011687932536005974, + "num_input_tokens_seen": 20994032, + "step": 1282, + "train_runtime": 10417.1889, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 0.7775757575757576, + "grad_norm": 0.013896801508963108, + "learning_rate": 9.885069056849845e-05, + "loss": 0.01239155326038599, + "num_input_tokens_seen": 21010408, + "step": 1283, + "train_runtime": 10425.3081, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 0.7781818181818182, + "grad_norm": 0.009068959392607212, + "learning_rate": 9.88486397348679e-05, + "loss": 0.01141006126999855, + "num_input_tokens_seen": 21026784, + "step": 1284, + "train_runtime": 10433.4338, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 0.7787878787878788, + "grad_norm": 0.01311533898115158, + "learning_rate": 9.884658709442132e-05, + "loss": 0.011742614209651947, + "num_input_tokens_seen": 21043160, + "step": 1285, + "train_runtime": 10441.5524, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 0.7793939393939394, + "grad_norm": 0.01562919095158577, + "learning_rate": 9.884453264723459e-05, + "loss": 0.012607906013727188, + "num_input_tokens_seen": 21059536, + "step": 1286, + "train_runtime": 10449.6697, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 0.78, + "grad_norm": 0.017651278525590897, + "learning_rate": 9.884247639338373e-05, + "loss": 0.01244867779314518, + "num_input_tokens_seen": 21075912, + "step": 1287, + "train_runtime": 10457.7881, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 0.7806060606060606, + "grad_norm": 0.020198311656713486, + "learning_rate": 9.884041833294476e-05, + "loss": 0.013492776080965996, + "num_input_tokens_seen": 21092288, + "step": 1288, + "train_runtime": 10465.9062, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 0.7812121212121212, + "grad_norm": 0.009970282204449177, + "learning_rate": 9.883835846599386e-05, + "loss": 0.013857762329280376, + "num_input_tokens_seen": 21108664, + "step": 1289, + "train_runtime": 10474.1537, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 0.7818181818181819, + "grad_norm": 0.011340651661157608, + "learning_rate": 9.883629679260715e-05, + "loss": 0.011344236321747303, + "num_input_tokens_seen": 21125040, + "step": 1290, + "train_runtime": 10482.2705, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.7824242424242425, + "grad_norm": 0.03793201595544815, + "learning_rate": 9.883423331286096e-05, + "loss": 0.015287358313798904, + "num_input_tokens_seen": 21141416, + "step": 1291, + "train_runtime": 10490.3857, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 0.7830303030303031, + "grad_norm": 0.02402154542505741, + "learning_rate": 9.883216802683158e-05, + "loss": 0.013735389336943626, + "num_input_tokens_seen": 21157792, + "step": 1292, + "train_runtime": 10498.5012, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 0.7836363636363637, + "grad_norm": 0.016549425199627876, + "learning_rate": 9.883010093459537e-05, + "loss": 0.01311381347477436, + "num_input_tokens_seen": 21174168, + "step": 1293, + "train_runtime": 10506.618, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 0.7842424242424243, + "grad_norm": 0.0236363522708416, + "learning_rate": 9.882803203622884e-05, + "loss": 0.01185927726328373, + "num_input_tokens_seen": 21190544, + "step": 1294, + "train_runtime": 10514.7333, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 0.7848484848484848, + "grad_norm": 0.015482014045119286, + "learning_rate": 9.882596133180849e-05, + "loss": 0.012073281221091747, + "num_input_tokens_seen": 21206920, + "step": 1295, + "train_runtime": 10522.8502, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 0.7854545454545454, + "grad_norm": 0.01528620719909668, + "learning_rate": 9.882388882141092e-05, + "loss": 0.012514740228652954, + "num_input_tokens_seen": 21223296, + "step": 1296, + "train_runtime": 10530.9675, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 0.786060606060606, + "grad_norm": 0.01590045541524887, + "learning_rate": 9.882181450511278e-05, + "loss": 0.014040066860616207, + "num_input_tokens_seen": 21239672, + "step": 1297, + "train_runtime": 10539.085, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.026240071281790733, + "learning_rate": 9.88197383829908e-05, + "loss": 0.012822052463889122, + "num_input_tokens_seen": 21256048, + "step": 1298, + "train_runtime": 10547.2019, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 0.7872727272727272, + "grad_norm": 0.014810437336564064, + "learning_rate": 9.881766045512176e-05, + "loss": 0.01398603618144989, + "num_input_tokens_seen": 21272424, + "step": 1299, + "train_runtime": 10555.3212, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 0.0264164749532938, + "learning_rate": 9.881558072158252e-05, + "loss": 0.012693504802882671, + "num_input_tokens_seen": 21288800, + "step": 1300, + "train_runtime": 10563.4399, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 0.7884848484848485, + "grad_norm": 0.01858045533299446, + "learning_rate": 9.881349918245005e-05, + "loss": 0.013458561152219772, + "num_input_tokens_seen": 21305176, + "step": 1301, + "train_runtime": 10572.4807, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 0.7890909090909091, + "grad_norm": 0.012029891833662987, + "learning_rate": 9.881141583780127e-05, + "loss": 0.014163712970912457, + "num_input_tokens_seen": 21321552, + "step": 1302, + "train_runtime": 10580.5948, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.7896969696969697, + "grad_norm": 0.016712768003344536, + "learning_rate": 9.880933068771329e-05, + "loss": 0.012644865550100803, + "num_input_tokens_seen": 21337928, + "step": 1303, + "train_runtime": 10588.7111, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 0.7903030303030303, + "grad_norm": 0.013986393809318542, + "learning_rate": 9.88072437322632e-05, + "loss": 0.015078244730830193, + "num_input_tokens_seen": 21354304, + "step": 1304, + "train_runtime": 10596.8307, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.7909090909090909, + "grad_norm": 0.012918438762426376, + "learning_rate": 9.880515497152823e-05, + "loss": 0.011986867524683475, + "num_input_tokens_seen": 21370680, + "step": 1305, + "train_runtime": 10604.9468, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 0.7915151515151515, + "grad_norm": 0.0405765101313591, + "learning_rate": 9.880306440558562e-05, + "loss": 0.011655117385089397, + "num_input_tokens_seen": 21387056, + "step": 1306, + "train_runtime": 10613.0635, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.7921212121212121, + "grad_norm": 0.01451539620757103, + "learning_rate": 9.880097203451271e-05, + "loss": 0.012863151729106903, + "num_input_tokens_seen": 21403432, + "step": 1307, + "train_runtime": 10621.1883, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 0.7927272727272727, + "grad_norm": 0.015082642436027527, + "learning_rate": 9.879887785838687e-05, + "loss": 0.013372685760259628, + "num_input_tokens_seen": 21419808, + "step": 1308, + "train_runtime": 10629.3154, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.7933333333333333, + "grad_norm": 0.013615542091429234, + "learning_rate": 9.879678187728557e-05, + "loss": 0.012768305838108063, + "num_input_tokens_seen": 21436184, + "step": 1309, + "train_runtime": 10637.4354, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.793939393939394, + "grad_norm": 0.011857746168971062, + "learning_rate": 9.879468409128632e-05, + "loss": 0.01288798451423645, + "num_input_tokens_seen": 21452560, + "step": 1310, + "train_runtime": 10645.5577, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.7945454545454546, + "grad_norm": 0.014605470933020115, + "learning_rate": 9.879258450046673e-05, + "loss": 0.012226996943354607, + "num_input_tokens_seen": 21468936, + "step": 1311, + "train_runtime": 10653.6811, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.7951515151515152, + "grad_norm": 0.012224176898598671, + "learning_rate": 9.879048310490448e-05, + "loss": 0.012793928384780884, + "num_input_tokens_seen": 21485312, + "step": 1312, + "train_runtime": 10661.8093, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.7957575757575758, + "grad_norm": 0.011239518411457539, + "learning_rate": 9.878837990467725e-05, + "loss": 0.012553832493722439, + "num_input_tokens_seen": 21501688, + "step": 1313, + "train_runtime": 10669.9356, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.7963636363636364, + "grad_norm": 0.007681385613977909, + "learning_rate": 9.878627489986287e-05, + "loss": 0.011498531326651573, + "num_input_tokens_seen": 21518064, + "step": 1314, + "train_runtime": 10678.055, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 0.796969696969697, + "grad_norm": 0.01674646884202957, + "learning_rate": 9.87841680905392e-05, + "loss": 0.012630677781999111, + "num_input_tokens_seen": 21534440, + "step": 1315, + "train_runtime": 10686.1747, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 0.7975757575757576, + "grad_norm": 0.041864536702632904, + "learning_rate": 9.878205947678414e-05, + "loss": 0.012615383602678776, + "num_input_tokens_seen": 21550816, + "step": 1316, + "train_runtime": 10694.294, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 0.7981818181818182, + "grad_norm": 0.02699940651655197, + "learning_rate": 9.877994905867571e-05, + "loss": 0.012835457921028137, + "num_input_tokens_seen": 21567192, + "step": 1317, + "train_runtime": 10702.4144, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 0.7987878787878788, + "grad_norm": 0.014113808050751686, + "learning_rate": 9.877783683629195e-05, + "loss": 0.012954406440258026, + "num_input_tokens_seen": 21583568, + "step": 1318, + "train_runtime": 10710.5348, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 0.7993939393939394, + "grad_norm": 0.021632181480526924, + "learning_rate": 9.8775722809711e-05, + "loss": 0.01311055850237608, + "num_input_tokens_seen": 21599944, + "step": 1319, + "train_runtime": 10718.6528, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 0.8, + "grad_norm": 0.017235929146409035, + "learning_rate": 9.877360697901105e-05, + "loss": 0.01242838054895401, + "num_input_tokens_seen": 21616320, + "step": 1320, + "train_runtime": 10726.7712, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 0.8006060606060607, + "grad_norm": 0.016485046595335007, + "learning_rate": 9.877148934427037e-05, + "loss": 0.012305478565394878, + "num_input_tokens_seen": 21632696, + "step": 1321, + "train_runtime": 10734.8904, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 0.8012121212121213, + "grad_norm": 0.012739230878651142, + "learning_rate": 9.876936990556725e-05, + "loss": 0.012538356706500053, + "num_input_tokens_seen": 21649072, + "step": 1322, + "train_runtime": 10743.0097, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.8018181818181818, + "grad_norm": 0.012361896224319935, + "learning_rate": 9.876724866298012e-05, + "loss": 0.013269990682601929, + "num_input_tokens_seen": 21665448, + "step": 1323, + "train_runtime": 10751.1304, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 0.8024242424242424, + "grad_norm": 0.0066161691211164, + "learning_rate": 9.876512561658745e-05, + "loss": 0.011660989373922348, + "num_input_tokens_seen": 21681824, + "step": 1324, + "train_runtime": 10759.2511, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 0.803030303030303, + "grad_norm": 0.025402076542377472, + "learning_rate": 9.876300076646774e-05, + "loss": 0.012346756644546986, + "num_input_tokens_seen": 21698200, + "step": 1325, + "train_runtime": 10767.3706, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 0.8036363636363636, + "grad_norm": 0.02845195308327675, + "learning_rate": 9.876087411269959e-05, + "loss": 0.0148523710668087, + "num_input_tokens_seen": 21714576, + "step": 1326, + "train_runtime": 10775.4904, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 0.8042424242424242, + "grad_norm": 0.020637033507227898, + "learning_rate": 9.875874565536167e-05, + "loss": 0.013783378526568413, + "num_input_tokens_seen": 21730952, + "step": 1327, + "train_runtime": 10783.6113, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 0.8048484848484848, + "grad_norm": 0.03877370432019234, + "learning_rate": 9.87566153945327e-05, + "loss": 0.01296904031187296, + "num_input_tokens_seen": 21747328, + "step": 1328, + "train_runtime": 10791.7314, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 0.8054545454545454, + "grad_norm": 0.02621079795062542, + "learning_rate": 9.875448333029146e-05, + "loss": 0.015151145868003368, + "num_input_tokens_seen": 21763704, + "step": 1329, + "train_runtime": 10799.8508, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 0.806060606060606, + "grad_norm": 0.012624816037714481, + "learning_rate": 9.875234946271685e-05, + "loss": 0.01100456528365612, + "num_input_tokens_seen": 21780080, + "step": 1330, + "train_runtime": 10807.9703, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 0.8066666666666666, + "grad_norm": 0.04244585335254669, + "learning_rate": 9.875021379188776e-05, + "loss": 0.014457973651587963, + "num_input_tokens_seen": 21796456, + "step": 1331, + "train_runtime": 10816.0902, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 0.8072727272727273, + "grad_norm": 0.012526098638772964, + "learning_rate": 9.87480763178832e-05, + "loss": 0.01147517841309309, + "num_input_tokens_seen": 21812832, + "step": 1332, + "train_runtime": 10824.2096, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 0.8078787878787879, + "grad_norm": 0.01184050552546978, + "learning_rate": 9.874593704078224e-05, + "loss": 0.012551544234156609, + "num_input_tokens_seen": 21829208, + "step": 1333, + "train_runtime": 10832.3319, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 0.8084848484848485, + "grad_norm": 0.01893959939479828, + "learning_rate": 9.874379596066398e-05, + "loss": 0.014782631769776344, + "num_input_tokens_seen": 21845584, + "step": 1334, + "train_runtime": 10840.4505, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 0.8090909090909091, + "grad_norm": 0.014813544228672981, + "learning_rate": 9.874165307760764e-05, + "loss": 0.01277944352477789, + "num_input_tokens_seen": 21861960, + "step": 1335, + "train_runtime": 10848.5696, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 0.8096969696969697, + "grad_norm": 0.011958160437643528, + "learning_rate": 9.873950839169248e-05, + "loss": 0.012058142572641373, + "num_input_tokens_seen": 21878336, + "step": 1336, + "train_runtime": 10856.6889, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 0.8103030303030303, + "grad_norm": 0.016458792611956596, + "learning_rate": 9.87373619029978e-05, + "loss": 0.012132089585065842, + "num_input_tokens_seen": 21894712, + "step": 1337, + "train_runtime": 10864.8105, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 0.8109090909090909, + "grad_norm": 0.027679556980729103, + "learning_rate": 9.873521361160304e-05, + "loss": 0.012615354731678963, + "num_input_tokens_seen": 21911088, + "step": 1338, + "train_runtime": 10872.9347, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 0.8115151515151515, + "grad_norm": 0.01617676578462124, + "learning_rate": 9.873306351758762e-05, + "loss": 0.011802049353718758, + "num_input_tokens_seen": 21927464, + "step": 1339, + "train_runtime": 10881.0546, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 0.8121212121212121, + "grad_norm": 0.009554133750498295, + "learning_rate": 9.87309116210311e-05, + "loss": 0.012184510938823223, + "num_input_tokens_seen": 21943840, + "step": 1340, + "train_runtime": 10889.1741, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 0.8127272727272727, + "grad_norm": 0.01341445092111826, + "learning_rate": 9.872875792201304e-05, + "loss": 0.012919439002871513, + "num_input_tokens_seen": 21960216, + "step": 1341, + "train_runtime": 10897.2932, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.010654855519533157, + "learning_rate": 9.872660242061314e-05, + "loss": 0.013909978792071342, + "num_input_tokens_seen": 21976592, + "step": 1342, + "train_runtime": 10905.4183, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 0.813939393939394, + "grad_norm": 0.011213628575205803, + "learning_rate": 9.872444511691107e-05, + "loss": 0.011805294081568718, + "num_input_tokens_seen": 21992968, + "step": 1343, + "train_runtime": 10913.5364, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 0.8145454545454546, + "grad_norm": 0.011983062140643597, + "learning_rate": 9.87222860109867e-05, + "loss": 0.010815788060426712, + "num_input_tokens_seen": 22009344, + "step": 1344, + "train_runtime": 10921.6544, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 0.8151515151515152, + "grad_norm": 0.011851955205202103, + "learning_rate": 9.872012510291983e-05, + "loss": 0.013886788859963417, + "num_input_tokens_seen": 22025720, + "step": 1345, + "train_runtime": 10929.7729, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 0.8157575757575758, + "grad_norm": 0.012403900735080242, + "learning_rate": 9.871796239279043e-05, + "loss": 0.01268466841429472, + "num_input_tokens_seen": 22042096, + "step": 1346, + "train_runtime": 10937.8919, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 0.8163636363636364, + "grad_norm": 0.016347525641322136, + "learning_rate": 9.871579788067846e-05, + "loss": 0.012477520853281021, + "num_input_tokens_seen": 22058472, + "step": 1347, + "train_runtime": 10946.0113, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 0.816969696969697, + "grad_norm": 0.02359098754823208, + "learning_rate": 9.8713631566664e-05, + "loss": 0.012588823214173317, + "num_input_tokens_seen": 22074848, + "step": 1348, + "train_runtime": 10954.1352, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 0.8175757575757576, + "grad_norm": 0.008759316988289356, + "learning_rate": 9.871146345082716e-05, + "loss": 0.012180456891655922, + "num_input_tokens_seen": 22091224, + "step": 1349, + "train_runtime": 10962.2539, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 0.03136594220995903, + "learning_rate": 9.870929353324817e-05, + "loss": 0.014148636721074581, + "num_input_tokens_seen": 22107600, + "step": 1350, + "train_runtime": 10970.3731, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 0.8187878787878788, + "grad_norm": 0.02214551530778408, + "learning_rate": 9.870712181400726e-05, + "loss": 0.012522125616669655, + "num_input_tokens_seen": 22123976, + "step": 1351, + "train_runtime": 10978.4906, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 0.8193939393939393, + "grad_norm": 0.017318114638328552, + "learning_rate": 9.870494829318478e-05, + "loss": 0.013153801672160625, + "num_input_tokens_seen": 22140352, + "step": 1352, + "train_runtime": 10986.6091, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 0.82, + "grad_norm": 0.014927364885807037, + "learning_rate": 9.87027729708611e-05, + "loss": 0.012346560135483742, + "num_input_tokens_seen": 22156728, + "step": 1353, + "train_runtime": 10994.7304, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 0.8206060606060606, + "grad_norm": 0.011974423192441463, + "learning_rate": 9.870059584711668e-05, + "loss": 0.012083306908607483, + "num_input_tokens_seen": 22173104, + "step": 1354, + "train_runtime": 11002.8485, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 0.8212121212121212, + "grad_norm": 0.014825565740466118, + "learning_rate": 9.869841692203208e-05, + "loss": 0.013183614239096642, + "num_input_tokens_seen": 22189480, + "step": 1355, + "train_runtime": 11010.9676, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 0.8218181818181818, + "grad_norm": 0.01507493481040001, + "learning_rate": 9.869623619568786e-05, + "loss": 0.012968642637133598, + "num_input_tokens_seen": 22205856, + "step": 1356, + "train_runtime": 11019.0872, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 0.8224242424242424, + "grad_norm": 0.009937523864209652, + "learning_rate": 9.86940536681647e-05, + "loss": 0.012275813147425652, + "num_input_tokens_seen": 22222232, + "step": 1357, + "train_runtime": 11027.2052, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 0.823030303030303, + "grad_norm": 0.008918135426938534, + "learning_rate": 9.869186933954331e-05, + "loss": 0.012659851461648941, + "num_input_tokens_seen": 22238608, + "step": 1358, + "train_runtime": 11035.3325, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 0.8236363636363636, + "grad_norm": 0.015751199796795845, + "learning_rate": 9.868968320990452e-05, + "loss": 0.01403406634926796, + "num_input_tokens_seen": 22254984, + "step": 1359, + "train_runtime": 11043.4519, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 0.8242424242424242, + "grad_norm": 0.04037531092762947, + "learning_rate": 9.868749527932914e-05, + "loss": 0.014338891953229904, + "num_input_tokens_seen": 22271360, + "step": 1360, + "train_runtime": 11051.5705, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 0.8248484848484848, + "grad_norm": 0.027882136404514313, + "learning_rate": 9.868530554789815e-05, + "loss": 0.013941345736384392, + "num_input_tokens_seen": 22287736, + "step": 1361, + "train_runtime": 11059.6884, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 0.8254545454545454, + "grad_norm": 0.012381003238260746, + "learning_rate": 9.868311401569251e-05, + "loss": 0.013261671178042889, + "num_input_tokens_seen": 22304112, + "step": 1362, + "train_runtime": 11067.807, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 0.826060606060606, + "grad_norm": 0.019036108627915382, + "learning_rate": 9.868092068279329e-05, + "loss": 0.011298813857138157, + "num_input_tokens_seen": 22320488, + "step": 1363, + "train_runtime": 11075.931, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.017254643142223358, + "learning_rate": 9.86787255492816e-05, + "loss": 0.011698050424456596, + "num_input_tokens_seen": 22336864, + "step": 1364, + "train_runtime": 11084.0513, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 0.8272727272727273, + "grad_norm": 0.012853083200752735, + "learning_rate": 9.867652861523866e-05, + "loss": 0.012597981840372086, + "num_input_tokens_seen": 22353240, + "step": 1365, + "train_runtime": 11092.1694, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 0.8278787878787879, + "grad_norm": 0.01841077208518982, + "learning_rate": 9.867432988074572e-05, + "loss": 0.014389104209840298, + "num_input_tokens_seen": 22369616, + "step": 1366, + "train_runtime": 11100.288, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 0.8284848484848485, + "grad_norm": 0.010745099745690823, + "learning_rate": 9.867212934588411e-05, + "loss": 0.013019641861319542, + "num_input_tokens_seen": 22385992, + "step": 1367, + "train_runtime": 11108.4091, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 0.8290909090909091, + "grad_norm": 0.015590776689350605, + "learning_rate": 9.866992701073522e-05, + "loss": 0.012512456625699997, + "num_input_tokens_seen": 22402368, + "step": 1368, + "train_runtime": 11116.5309, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 0.8296969696969697, + "grad_norm": 0.012689262628555298, + "learning_rate": 9.866772287538051e-05, + "loss": 0.0124176861718297, + "num_input_tokens_seen": 22418744, + "step": 1369, + "train_runtime": 11124.6504, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 0.8303030303030303, + "grad_norm": 0.01711587980389595, + "learning_rate": 9.866551693990151e-05, + "loss": 0.012735790573060513, + "num_input_tokens_seen": 22435120, + "step": 1370, + "train_runtime": 11132.7682, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 0.8309090909090909, + "grad_norm": 0.015730151906609535, + "learning_rate": 9.866330920437979e-05, + "loss": 0.012005583383142948, + "num_input_tokens_seen": 22451496, + "step": 1371, + "train_runtime": 11140.8866, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 0.8315151515151515, + "grad_norm": 0.01519712619483471, + "learning_rate": 9.866109966889705e-05, + "loss": 0.01357693038880825, + "num_input_tokens_seen": 22467872, + "step": 1372, + "train_runtime": 11149.0048, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 0.8321212121212122, + "grad_norm": 0.013813342899084091, + "learning_rate": 9.865888833353499e-05, + "loss": 0.01139441505074501, + "num_input_tokens_seen": 22484248, + "step": 1373, + "train_runtime": 11157.132, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 0.8327272727272728, + "grad_norm": 0.0025392461102455854, + "learning_rate": 9.865667519837541e-05, + "loss": 0.012021156027913094, + "num_input_tokens_seen": 22500624, + "step": 1374, + "train_runtime": 11165.2509, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.02511661872267723, + "learning_rate": 9.865446026350017e-05, + "loss": 0.013405115343630314, + "num_input_tokens_seen": 22517000, + "step": 1375, + "train_runtime": 11173.3711, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 0.833939393939394, + "grad_norm": 0.00909800361841917, + "learning_rate": 9.865224352899119e-05, + "loss": 0.012450836598873138, + "num_input_tokens_seen": 22533376, + "step": 1376, + "train_runtime": 11181.4899, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 0.8345454545454546, + "grad_norm": 0.016653254628181458, + "learning_rate": 9.865002499493048e-05, + "loss": 0.012657481245696545, + "num_input_tokens_seen": 22549752, + "step": 1377, + "train_runtime": 11189.6096, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 0.8351515151515152, + "grad_norm": 0.01992923766374588, + "learning_rate": 9.864780466140009e-05, + "loss": 0.014634167775511742, + "num_input_tokens_seen": 22566128, + "step": 1378, + "train_runtime": 11197.7309, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 0.8357575757575758, + "grad_norm": 0.01114288903772831, + "learning_rate": 9.864558252848213e-05, + "loss": 0.012311486527323723, + "num_input_tokens_seen": 22582504, + "step": 1379, + "train_runtime": 11205.8507, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 0.006412926129996777, + "learning_rate": 9.864335859625879e-05, + "loss": 0.011968771927058697, + "num_input_tokens_seen": 22598880, + "step": 1380, + "train_runtime": 11213.9704, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 0.8369696969696969, + "grad_norm": 0.015292688272893429, + "learning_rate": 9.864113286481237e-05, + "loss": 0.012508758343756199, + "num_input_tokens_seen": 22615256, + "step": 1381, + "train_runtime": 11222.0902, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 0.8375757575757575, + "grad_norm": 0.007994702085852623, + "learning_rate": 9.863890533422516e-05, + "loss": 0.011799611151218414, + "num_input_tokens_seen": 22631632, + "step": 1382, + "train_runtime": 11230.2088, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 0.8381818181818181, + "grad_norm": 0.011203468777239323, + "learning_rate": 9.863667600457957e-05, + "loss": 0.012280134484171867, + "num_input_tokens_seen": 22648008, + "step": 1383, + "train_runtime": 11238.3318, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 0.8387878787878787, + "grad_norm": 0.011974025517702103, + "learning_rate": 9.863444487595803e-05, + "loss": 0.012465615756809711, + "num_input_tokens_seen": 22664384, + "step": 1384, + "train_runtime": 11246.4525, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 0.8393939393939394, + "grad_norm": 0.015972545370459557, + "learning_rate": 9.86322119484431e-05, + "loss": 0.011578064411878586, + "num_input_tokens_seen": 22680760, + "step": 1385, + "train_runtime": 11254.5716, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 0.84, + "grad_norm": 0.015455652959644794, + "learning_rate": 9.862997722211735e-05, + "loss": 0.013119183480739594, + "num_input_tokens_seen": 22697136, + "step": 1386, + "train_runtime": 11262.6896, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 0.8406060606060606, + "grad_norm": 0.02377978526055813, + "learning_rate": 9.862774069706346e-05, + "loss": 0.013906264677643776, + "num_input_tokens_seen": 22713512, + "step": 1387, + "train_runtime": 11270.8064, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 0.8412121212121212, + "grad_norm": 0.012666025198996067, + "learning_rate": 9.862550237336413e-05, + "loss": 0.011985675431787968, + "num_input_tokens_seen": 22729888, + "step": 1388, + "train_runtime": 11278.9313, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 0.8418181818181818, + "grad_norm": 0.011326838284730911, + "learning_rate": 9.862326225110216e-05, + "loss": 0.011968444101512432, + "num_input_tokens_seen": 22746264, + "step": 1389, + "train_runtime": 11287.0513, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 0.8424242424242424, + "grad_norm": 0.0107469717040658, + "learning_rate": 9.862102033036042e-05, + "loss": 0.012955324724316597, + "num_input_tokens_seen": 22762640, + "step": 1390, + "train_runtime": 11295.1711, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 0.843030303030303, + "grad_norm": 0.011529113166034222, + "learning_rate": 9.86187766112218e-05, + "loss": 0.012033510953187943, + "num_input_tokens_seen": 22779016, + "step": 1391, + "train_runtime": 11303.291, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 0.8436363636363636, + "grad_norm": 0.011735017411410809, + "learning_rate": 9.861653109376934e-05, + "loss": 0.012355628423392773, + "num_input_tokens_seen": 22795392, + "step": 1392, + "train_runtime": 11311.4112, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 0.8442424242424242, + "grad_norm": 0.05303164944052696, + "learning_rate": 9.861428377808606e-05, + "loss": 0.011541967280209064, + "num_input_tokens_seen": 22811768, + "step": 1393, + "train_runtime": 11319.5333, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 0.8448484848484848, + "grad_norm": 0.006728252395987511, + "learning_rate": 9.861203466425508e-05, + "loss": 0.013560689054429531, + "num_input_tokens_seen": 22828144, + "step": 1394, + "train_runtime": 11327.6516, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 0.8454545454545455, + "grad_norm": 0.013561035506427288, + "learning_rate": 9.860978375235963e-05, + "loss": 0.01197909377515316, + "num_input_tokens_seen": 22844520, + "step": 1395, + "train_runtime": 11335.7713, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 0.8460606060606061, + "grad_norm": 0.020181827247142792, + "learning_rate": 9.860753104248292e-05, + "loss": 0.013650638982653618, + "num_input_tokens_seen": 22860896, + "step": 1396, + "train_runtime": 11343.891, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 0.8466666666666667, + "grad_norm": 0.014147473499178886, + "learning_rate": 9.860527653470831e-05, + "loss": 0.012558222748339176, + "num_input_tokens_seen": 22877272, + "step": 1397, + "train_runtime": 11352.0104, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 0.8472727272727273, + "grad_norm": 0.02730811946094036, + "learning_rate": 9.860302022911918e-05, + "loss": 0.011438505724072456, + "num_input_tokens_seen": 22893648, + "step": 1398, + "train_runtime": 11360.1326, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 0.8478787878787879, + "grad_norm": 0.008396074175834656, + "learning_rate": 9.860076212579896e-05, + "loss": 0.011421089991927147, + "num_input_tokens_seen": 22910024, + "step": 1399, + "train_runtime": 11368.2516, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 0.015687720850110054, + "learning_rate": 9.859850222483123e-05, + "loss": 0.011946003884077072, + "num_input_tokens_seen": 22926400, + "step": 1400, + "train_runtime": 11376.3701, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 0.8490909090909091, + "grad_norm": 0.016600729897618294, + "learning_rate": 9.859624052629951e-05, + "loss": 0.013347601518034935, + "num_input_tokens_seen": 22942776, + "step": 1401, + "train_runtime": 11385.3698, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.8496969696969697, + "grad_norm": 0.01619286835193634, + "learning_rate": 9.85939770302875e-05, + "loss": 0.01237676665186882, + "num_input_tokens_seen": 22959152, + "step": 1402, + "train_runtime": 11393.4858, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 0.8503030303030303, + "grad_norm": 0.03248157724738121, + "learning_rate": 9.859171173687891e-05, + "loss": 0.011984573677182198, + "num_input_tokens_seen": 22975528, + "step": 1403, + "train_runtime": 11401.6049, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 0.850909090909091, + "grad_norm": 0.015169057063758373, + "learning_rate": 9.858944464615754e-05, + "loss": 0.012047179043293, + "num_input_tokens_seen": 22991904, + "step": 1404, + "train_runtime": 11409.7207, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 0.8515151515151516, + "grad_norm": 0.013567727990448475, + "learning_rate": 9.858717575820723e-05, + "loss": 0.015603979118168354, + "num_input_tokens_seen": 23008280, + "step": 1405, + "train_runtime": 11417.8368, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 0.8521212121212122, + "grad_norm": 0.009488792158663273, + "learning_rate": 9.85849050731119e-05, + "loss": 0.012402615509927273, + "num_input_tokens_seen": 23024656, + "step": 1406, + "train_runtime": 11425.9595, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 0.8527272727272728, + "grad_norm": 0.010693400166928768, + "learning_rate": 9.858263259095557e-05, + "loss": 0.012290366925299168, + "num_input_tokens_seen": 23041032, + "step": 1407, + "train_runtime": 11434.0763, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.033301692456007004, + "learning_rate": 9.858035831182226e-05, + "loss": 0.016457989811897278, + "num_input_tokens_seen": 23057408, + "step": 1408, + "train_runtime": 11442.1926, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 0.8539393939393939, + "grad_norm": 0.021398158743977547, + "learning_rate": 9.85780822357961e-05, + "loss": 0.01473909430205822, + "num_input_tokens_seen": 23073784, + "step": 1409, + "train_runtime": 11450.3097, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 0.8545454545454545, + "grad_norm": 0.024622607976198196, + "learning_rate": 9.857580436296127e-05, + "loss": 0.01464729942381382, + "num_input_tokens_seen": 23090160, + "step": 1410, + "train_runtime": 11458.4341, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 0.8551515151515151, + "grad_norm": 0.010785657912492752, + "learning_rate": 9.857352469340204e-05, + "loss": 0.01193370670080185, + "num_input_tokens_seen": 23106536, + "step": 1411, + "train_runtime": 11466.5619, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 0.8557575757575757, + "grad_norm": 0.00772235170006752, + "learning_rate": 9.857124322720273e-05, + "loss": 0.01089341752231121, + "num_input_tokens_seen": 23122912, + "step": 1412, + "train_runtime": 11474.6825, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 0.8563636363636363, + "grad_norm": 0.013583921827375889, + "learning_rate": 9.856895996444772e-05, + "loss": 0.011918467469513416, + "num_input_tokens_seen": 23139288, + "step": 1413, + "train_runtime": 11482.8028, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 0.8569696969696969, + "grad_norm": 0.010831024497747421, + "learning_rate": 9.856667490522146e-05, + "loss": 0.011809214949607849, + "num_input_tokens_seen": 23155664, + "step": 1414, + "train_runtime": 11490.9203, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 0.8575757575757575, + "grad_norm": 0.010280442424118519, + "learning_rate": 9.856438804960848e-05, + "loss": 0.011262697167694569, + "num_input_tokens_seen": 23172040, + "step": 1415, + "train_runtime": 11499.0452, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 0.8581818181818182, + "grad_norm": 0.017096806317567825, + "learning_rate": 9.856209939769335e-05, + "loss": 0.013108273036777973, + "num_input_tokens_seen": 23188416, + "step": 1416, + "train_runtime": 11507.1661, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 0.8587878787878788, + "grad_norm": 0.011299760080873966, + "learning_rate": 9.855980894956074e-05, + "loss": 0.011602142825722694, + "num_input_tokens_seen": 23204792, + "step": 1417, + "train_runtime": 11515.2871, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 0.8593939393939394, + "grad_norm": 0.016026539728045464, + "learning_rate": 9.855751670529536e-05, + "loss": 0.013274731114506721, + "num_input_tokens_seen": 23221168, + "step": 1418, + "train_runtime": 11523.407, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 0.86, + "grad_norm": 0.012965604662895203, + "learning_rate": 9.8555222664982e-05, + "loss": 0.01300659030675888, + "num_input_tokens_seen": 23237544, + "step": 1419, + "train_runtime": 11531.5321, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 0.8606060606060606, + "grad_norm": 0.02445352077484131, + "learning_rate": 9.855292682870551e-05, + "loss": 0.013643065467476845, + "num_input_tokens_seen": 23253920, + "step": 1420, + "train_runtime": 11539.6549, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 0.8612121212121212, + "grad_norm": 0.01660062186419964, + "learning_rate": 9.855062919655083e-05, + "loss": 0.012771239504218102, + "num_input_tokens_seen": 23270296, + "step": 1421, + "train_runtime": 11547.78, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 0.8618181818181818, + "grad_norm": 0.022184133529663086, + "learning_rate": 9.854832976860289e-05, + "loss": 0.012608212418854237, + "num_input_tokens_seen": 23286672, + "step": 1422, + "train_runtime": 11555.8992, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 0.8624242424242424, + "grad_norm": 0.018696678802371025, + "learning_rate": 9.85460285449468e-05, + "loss": 0.01361760776489973, + "num_input_tokens_seen": 23303048, + "step": 1423, + "train_runtime": 11564.0219, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 0.863030303030303, + "grad_norm": 0.009972603991627693, + "learning_rate": 9.854372552566764e-05, + "loss": 0.012581647373735905, + "num_input_tokens_seen": 23319424, + "step": 1424, + "train_runtime": 11572.1414, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 0.8636363636363636, + "grad_norm": 0.007429645396769047, + "learning_rate": 9.854142071085061e-05, + "loss": 0.011579862795770168, + "num_input_tokens_seen": 23335800, + "step": 1425, + "train_runtime": 11580.2672, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 0.8642424242424243, + "grad_norm": 0.014917800202965736, + "learning_rate": 9.853911410058097e-05, + "loss": 0.014219231903553009, + "num_input_tokens_seen": 23352176, + "step": 1426, + "train_runtime": 11588.3879, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 0.8648484848484849, + "grad_norm": 0.013650372624397278, + "learning_rate": 9.853680569494401e-05, + "loss": 0.011797931976616383, + "num_input_tokens_seen": 23368552, + "step": 1427, + "train_runtime": 11596.5073, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 0.8654545454545455, + "grad_norm": 0.02313203178346157, + "learning_rate": 9.853449549402514e-05, + "loss": 0.012145559303462505, + "num_input_tokens_seen": 23384928, + "step": 1428, + "train_runtime": 11604.6343, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 0.8660606060606061, + "grad_norm": 0.01312107965350151, + "learning_rate": 9.853218349790979e-05, + "loss": 0.012798131443560123, + "num_input_tokens_seen": 23401304, + "step": 1429, + "train_runtime": 11612.7553, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.01796252839267254, + "learning_rate": 9.852986970668349e-05, + "loss": 0.012703349813818932, + "num_input_tokens_seen": 23417680, + "step": 1430, + "train_runtime": 11620.8769, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.8672727272727273, + "grad_norm": 0.008615394122898579, + "learning_rate": 9.85275541204318e-05, + "loss": 0.011092279106378555, + "num_input_tokens_seen": 23434056, + "step": 1431, + "train_runtime": 11628.999, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 0.8678787878787879, + "grad_norm": 0.01242077350616455, + "learning_rate": 9.852523673924042e-05, + "loss": 0.012449410744011402, + "num_input_tokens_seen": 23450432, + "step": 1432, + "train_runtime": 11637.1318, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 0.8684848484848485, + "grad_norm": 0.018718481063842773, + "learning_rate": 9.852291756319501e-05, + "loss": 0.014180365018546581, + "num_input_tokens_seen": 23466808, + "step": 1433, + "train_runtime": 11645.2526, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.8690909090909091, + "grad_norm": 0.015112209133803844, + "learning_rate": 9.852059659238137e-05, + "loss": 0.011895522475242615, + "num_input_tokens_seen": 23483184, + "step": 1434, + "train_runtime": 11653.3718, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.8696969696969697, + "grad_norm": 0.016961101442575455, + "learning_rate": 9.851827382688535e-05, + "loss": 0.013209850527346134, + "num_input_tokens_seen": 23499560, + "step": 1435, + "train_runtime": 11661.4915, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 0.8703030303030304, + "grad_norm": 0.024609530344605446, + "learning_rate": 9.851594926679287e-05, + "loss": 0.013271688483655453, + "num_input_tokens_seen": 23515936, + "step": 1436, + "train_runtime": 11669.6192, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 0.8709090909090909, + "grad_norm": 0.012788881547749043, + "learning_rate": 9.851362291218991e-05, + "loss": 0.014663812704384327, + "num_input_tokens_seen": 23532312, + "step": 1437, + "train_runtime": 11677.741, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 0.8715151515151515, + "grad_norm": 0.012840853072702885, + "learning_rate": 9.851129476316252e-05, + "loss": 0.014140025712549686, + "num_input_tokens_seen": 23548688, + "step": 1438, + "train_runtime": 11685.8601, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 0.8721212121212121, + "grad_norm": 0.008752677589654922, + "learning_rate": 9.85089648197968e-05, + "loss": 0.012541829608380795, + "num_input_tokens_seen": 23565064, + "step": 1439, + "train_runtime": 11694.0189, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 0.019652189686894417, + "learning_rate": 9.850663308217893e-05, + "loss": 0.013279132544994354, + "num_input_tokens_seen": 23581440, + "step": 1440, + "train_runtime": 11702.1396, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.8733333333333333, + "grad_norm": 0.020490368828177452, + "learning_rate": 9.850429955039518e-05, + "loss": 0.013454969972372055, + "num_input_tokens_seen": 23597816, + "step": 1441, + "train_runtime": 11710.2595, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 0.8739393939393939, + "grad_norm": 0.007157603278756142, + "learning_rate": 9.850196422453185e-05, + "loss": 0.012067731469869614, + "num_input_tokens_seen": 23614192, + "step": 1442, + "train_runtime": 11718.3843, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.8745454545454545, + "grad_norm": 0.01431642472743988, + "learning_rate": 9.849962710467531e-05, + "loss": 0.012786502949893475, + "num_input_tokens_seen": 23630568, + "step": 1443, + "train_runtime": 11726.5058, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.8751515151515151, + "grad_norm": 0.019850250333547592, + "learning_rate": 9.849728819091201e-05, + "loss": 0.014681624248623848, + "num_input_tokens_seen": 23646944, + "step": 1444, + "train_runtime": 11734.6344, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.8757575757575757, + "grad_norm": 0.020691564306616783, + "learning_rate": 9.849494748332846e-05, + "loss": 0.013279177248477936, + "num_input_tokens_seen": 23663320, + "step": 1445, + "train_runtime": 11742.7555, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 0.8763636363636363, + "grad_norm": 0.01907418854534626, + "learning_rate": 9.849260498201126e-05, + "loss": 0.013074219226837158, + "num_input_tokens_seen": 23679696, + "step": 1446, + "train_runtime": 11750.8745, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 0.876969696969697, + "grad_norm": 0.015929479151964188, + "learning_rate": 9.849026068704702e-05, + "loss": 0.01318158209323883, + "num_input_tokens_seen": 23696072, + "step": 1447, + "train_runtime": 11758.9957, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 0.8775757575757576, + "grad_norm": 0.008310094475746155, + "learning_rate": 9.848791459852247e-05, + "loss": 0.012736203148961067, + "num_input_tokens_seen": 23712448, + "step": 1448, + "train_runtime": 11767.1144, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 0.8781818181818182, + "grad_norm": 0.014460076577961445, + "learning_rate": 9.848556671652438e-05, + "loss": 0.013328369706869125, + "num_input_tokens_seen": 23728824, + "step": 1449, + "train_runtime": 11775.2427, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 0.03255464881658554, + "learning_rate": 9.84832170411396e-05, + "loss": 0.013615809381008148, + "num_input_tokens_seen": 23745200, + "step": 1450, + "train_runtime": 11783.3648, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 0.8793939393939394, + "grad_norm": 0.012133513577282429, + "learning_rate": 9.848086557245507e-05, + "loss": 0.01337357982993126, + "num_input_tokens_seen": 23761576, + "step": 1451, + "train_runtime": 11791.4852, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 0.88, + "grad_norm": 0.015239196829497814, + "learning_rate": 9.847851231055769e-05, + "loss": 0.014663100242614746, + "num_input_tokens_seen": 23777952, + "step": 1452, + "train_runtime": 11799.6042, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 0.8806060606060606, + "grad_norm": 0.01241212897002697, + "learning_rate": 9.847615725553456e-05, + "loss": 0.010611728765070438, + "num_input_tokens_seen": 23794328, + "step": 1453, + "train_runtime": 11807.7327, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 0.8812121212121212, + "grad_norm": 0.011121004819869995, + "learning_rate": 9.84738004074728e-05, + "loss": 0.01263385359197855, + "num_input_tokens_seen": 23810704, + "step": 1454, + "train_runtime": 11815.8522, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 0.8818181818181818, + "grad_norm": 0.004201293457299471, + "learning_rate": 9.847144176645954e-05, + "loss": 0.011922663077712059, + "num_input_tokens_seen": 23827080, + "step": 1455, + "train_runtime": 11823.9733, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 0.8824242424242424, + "grad_norm": 0.020469272509217262, + "learning_rate": 9.846908133258204e-05, + "loss": 0.014031194150447845, + "num_input_tokens_seen": 23843456, + "step": 1456, + "train_runtime": 11832.0934, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 0.883030303030303, + "grad_norm": 0.011495082639157772, + "learning_rate": 9.846671910592761e-05, + "loss": 0.010416560806334019, + "num_input_tokens_seen": 23859832, + "step": 1457, + "train_runtime": 11840.214, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 0.8836363636363637, + "grad_norm": 0.018274936825037003, + "learning_rate": 9.846435508658364e-05, + "loss": 0.013232077471911907, + "num_input_tokens_seen": 23876208, + "step": 1458, + "train_runtime": 11848.3402, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 0.8842424242424243, + "grad_norm": 0.015522732399404049, + "learning_rate": 9.846198927463754e-05, + "loss": 0.01150945108383894, + "num_input_tokens_seen": 23892584, + "step": 1459, + "train_runtime": 11856.4668, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 0.8848484848484849, + "grad_norm": 0.0219294223934412, + "learning_rate": 9.845962167017684e-05, + "loss": 0.014381872490048409, + "num_input_tokens_seen": 23908960, + "step": 1460, + "train_runtime": 11864.5874, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 0.8854545454545455, + "grad_norm": 0.014889719896018505, + "learning_rate": 9.84572522732891e-05, + "loss": 0.014377344399690628, + "num_input_tokens_seen": 23925336, + "step": 1461, + "train_runtime": 11872.7097, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 0.8860606060606061, + "grad_norm": 0.03352119028568268, + "learning_rate": 9.845488108406198e-05, + "loss": 0.012402249500155449, + "num_input_tokens_seen": 23941712, + "step": 1462, + "train_runtime": 11880.8311, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 0.8866666666666667, + "grad_norm": 0.01767772249877453, + "learning_rate": 9.845250810258315e-05, + "loss": 0.01259274035692215, + "num_input_tokens_seen": 23958088, + "step": 1463, + "train_runtime": 11888.9518, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.8872727272727273, + "grad_norm": 0.02254144847393036, + "learning_rate": 9.845013332894043e-05, + "loss": 0.012254755012691021, + "num_input_tokens_seen": 23974464, + "step": 1464, + "train_runtime": 11897.08, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 0.8878787878787879, + "grad_norm": 0.02848796173930168, + "learning_rate": 9.84477567632216e-05, + "loss": 0.011723476462066174, + "num_input_tokens_seen": 23990840, + "step": 1465, + "train_runtime": 11905.2012, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.8884848484848484, + "grad_norm": 0.010046341456472874, + "learning_rate": 9.844537840551462e-05, + "loss": 0.012512149289250374, + "num_input_tokens_seen": 24007216, + "step": 1466, + "train_runtime": 11913.3224, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.889090909090909, + "grad_norm": 0.010369017720222473, + "learning_rate": 9.844299825590741e-05, + "loss": 0.012070760130882263, + "num_input_tokens_seen": 24023592, + "step": 1467, + "train_runtime": 11921.4458, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 0.8896969696969697, + "grad_norm": 0.013119385577738285, + "learning_rate": 9.844061631448804e-05, + "loss": 0.013083739206194878, + "num_input_tokens_seen": 24039968, + "step": 1468, + "train_runtime": 11929.5648, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.8903030303030303, + "grad_norm": 0.034340064972639084, + "learning_rate": 9.843823258134461e-05, + "loss": 0.012108924798667431, + "num_input_tokens_seen": 24056344, + "step": 1469, + "train_runtime": 11937.7032, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.8909090909090909, + "grad_norm": 0.014157054014503956, + "learning_rate": 9.84358470565653e-05, + "loss": 0.012553790584206581, + "num_input_tokens_seen": 24072720, + "step": 1470, + "train_runtime": 11945.8324, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.8915151515151515, + "grad_norm": 0.045436400920152664, + "learning_rate": 9.843345974023832e-05, + "loss": 0.012693868018686771, + "num_input_tokens_seen": 24089096, + "step": 1471, + "train_runtime": 11953.9552, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.8921212121212121, + "grad_norm": 0.0057262699119746685, + "learning_rate": 9.843107063245199e-05, + "loss": 0.01204710453748703, + "num_input_tokens_seen": 24105472, + "step": 1472, + "train_runtime": 11962.0766, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 0.8927272727272727, + "grad_norm": 0.010710208676755428, + "learning_rate": 9.842867973329466e-05, + "loss": 0.011947019957005978, + "num_input_tokens_seen": 24121848, + "step": 1473, + "train_runtime": 11970.1975, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.01923258602619171, + "learning_rate": 9.842628704285479e-05, + "loss": 0.012753183022141457, + "num_input_tokens_seen": 24138224, + "step": 1474, + "train_runtime": 11978.3191, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 0.8939393939393939, + "grad_norm": 0.01927172765135765, + "learning_rate": 9.842389256122086e-05, + "loss": 0.011474791914224625, + "num_input_tokens_seen": 24154600, + "step": 1475, + "train_runtime": 11986.4408, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 0.8945454545454545, + "grad_norm": 0.008145877160131931, + "learning_rate": 9.842149628848145e-05, + "loss": 0.01207401417195797, + "num_input_tokens_seen": 24170976, + "step": 1476, + "train_runtime": 11994.5627, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 0.8951515151515151, + "grad_norm": 0.00783380214124918, + "learning_rate": 9.841909822472518e-05, + "loss": 0.012457404285669327, + "num_input_tokens_seen": 24187352, + "step": 1477, + "train_runtime": 12002.6852, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 0.8957575757575758, + "grad_norm": 0.0179067924618721, + "learning_rate": 9.841669837004077e-05, + "loss": 0.013264812529087067, + "num_input_tokens_seen": 24203728, + "step": 1478, + "train_runtime": 12010.806, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.8963636363636364, + "grad_norm": 0.013561374507844448, + "learning_rate": 9.841429672451697e-05, + "loss": 0.012999416328966618, + "num_input_tokens_seen": 24220104, + "step": 1479, + "train_runtime": 12018.9318, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.896969696969697, + "grad_norm": 0.017535557970404625, + "learning_rate": 9.84118932882426e-05, + "loss": 0.01261084619909525, + "num_input_tokens_seen": 24236480, + "step": 1480, + "train_runtime": 12027.0548, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.8975757575757576, + "grad_norm": 0.010221567936241627, + "learning_rate": 9.84094880613066e-05, + "loss": 0.012463985942304134, + "num_input_tokens_seen": 24252856, + "step": 1481, + "train_runtime": 12035.177, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 0.8981818181818182, + "grad_norm": 0.015500430017709732, + "learning_rate": 9.84070810437979e-05, + "loss": 0.012875360436737537, + "num_input_tokens_seen": 24269232, + "step": 1482, + "train_runtime": 12043.2972, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.8987878787878788, + "grad_norm": 0.018367871642112732, + "learning_rate": 9.840467223580554e-05, + "loss": 0.013259278610348701, + "num_input_tokens_seen": 24285608, + "step": 1483, + "train_runtime": 12051.4351, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.8993939393939394, + "grad_norm": 0.010412012226879597, + "learning_rate": 9.840226163741862e-05, + "loss": 0.01317787729203701, + "num_input_tokens_seen": 24301984, + "step": 1484, + "train_runtime": 12059.5701, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 0.9, + "grad_norm": 0.014846539124846458, + "learning_rate": 9.83998492487263e-05, + "loss": 0.013309162110090256, + "num_input_tokens_seen": 24318360, + "step": 1485, + "train_runtime": 12067.6971, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 0.9006060606060606, + "grad_norm": 0.015418080613017082, + "learning_rate": 9.839743506981782e-05, + "loss": 0.013252614066004753, + "num_input_tokens_seen": 24334736, + "step": 1486, + "train_runtime": 12075.8171, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.9012121212121212, + "grad_norm": 0.005580899305641651, + "learning_rate": 9.839501910078246e-05, + "loss": 0.011665296740829945, + "num_input_tokens_seen": 24351112, + "step": 1487, + "train_runtime": 12083.9372, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 0.9018181818181819, + "grad_norm": 0.020693181082606316, + "learning_rate": 9.839260134170958e-05, + "loss": 0.012977859936654568, + "num_input_tokens_seen": 24367488, + "step": 1488, + "train_runtime": 12092.0566, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.9024242424242425, + "grad_norm": 0.02792605198919773, + "learning_rate": 9.839018179268862e-05, + "loss": 0.013854194432497025, + "num_input_tokens_seen": 24383864, + "step": 1489, + "train_runtime": 12100.1926, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.9030303030303031, + "grad_norm": 0.017738644033670425, + "learning_rate": 9.838776045380909e-05, + "loss": 0.013810316100716591, + "num_input_tokens_seen": 24400240, + "step": 1490, + "train_runtime": 12108.3165, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 0.9036363636363637, + "grad_norm": 0.013722319155931473, + "learning_rate": 9.838533732516051e-05, + "loss": 0.013590461574494839, + "num_input_tokens_seen": 24416616, + "step": 1491, + "train_runtime": 12116.4361, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.9042424242424243, + "grad_norm": 0.015147789381444454, + "learning_rate": 9.838291240683252e-05, + "loss": 0.01322873868048191, + "num_input_tokens_seen": 24432992, + "step": 1492, + "train_runtime": 12124.5584, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.9048484848484849, + "grad_norm": 0.024722442030906677, + "learning_rate": 9.838048569891485e-05, + "loss": 0.011483176611363888, + "num_input_tokens_seen": 24449368, + "step": 1493, + "train_runtime": 12132.6806, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.9054545454545454, + "grad_norm": 0.011857298202812672, + "learning_rate": 9.837805720149721e-05, + "loss": 0.012423008680343628, + "num_input_tokens_seen": 24465744, + "step": 1494, + "train_runtime": 12140.8021, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 0.906060606060606, + "grad_norm": 0.007093346677720547, + "learning_rate": 9.837562691466946e-05, + "loss": 0.012259125709533691, + "num_input_tokens_seen": 24482120, + "step": 1495, + "train_runtime": 12148.9221, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.014173478819429874, + "learning_rate": 9.837319483852147e-05, + "loss": 0.012404312379658222, + "num_input_tokens_seen": 24498496, + "step": 1496, + "train_runtime": 12157.059, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.9072727272727272, + "grad_norm": 0.008476898074150085, + "learning_rate": 9.837076097314319e-05, + "loss": 0.011994283646345139, + "num_input_tokens_seen": 24514872, + "step": 1497, + "train_runtime": 12165.1807, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 0.9078787878787878, + "grad_norm": 0.008641723543405533, + "learning_rate": 9.836832531862469e-05, + "loss": 0.01290223654359579, + "num_input_tokens_seen": 24531248, + "step": 1498, + "train_runtime": 12173.3042, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 0.9084848484848485, + "grad_norm": 0.011743849143385887, + "learning_rate": 9.836588787505601e-05, + "loss": 0.012031139805912971, + "num_input_tokens_seen": 24547624, + "step": 1499, + "train_runtime": 12181.4387, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.03372018039226532, + "learning_rate": 9.836344864252734e-05, + "loss": 0.013024641200900078, + "num_input_tokens_seen": 24564000, + "step": 1500, + "train_runtime": 12189.5644, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.9096969696969697, + "grad_norm": 0.009321557357907295, + "learning_rate": 9.836100762112888e-05, + "loss": 0.012353789061307907, + "num_input_tokens_seen": 24580376, + "step": 1501, + "train_runtime": 12198.7668, + "train_tokens_per_second": 2014.989 + }, + { + "epoch": 0.9103030303030303, + "grad_norm": 0.033833153545856476, + "learning_rate": 9.835856481095092e-05, + "loss": 0.014163315296173096, + "num_input_tokens_seen": 24596752, + "step": 1502, + "train_runtime": 12206.876, + "train_tokens_per_second": 2014.992 + }, + { + "epoch": 0.9109090909090909, + "grad_norm": 0.008616876788437366, + "learning_rate": 9.835612021208382e-05, + "loss": 0.012016966007649899, + "num_input_tokens_seen": 24613128, + "step": 1503, + "train_runtime": 12214.988, + "train_tokens_per_second": 2014.994 + }, + { + "epoch": 0.9115151515151515, + "grad_norm": 0.01279782596975565, + "learning_rate": 9.835367382461802e-05, + "loss": 0.012753800489008427, + "num_input_tokens_seen": 24629504, + "step": 1504, + "train_runtime": 12223.097, + "train_tokens_per_second": 2014.997 + }, + { + "epoch": 0.9121212121212121, + "grad_norm": 0.01015873346477747, + "learning_rate": 9.835122564864397e-05, + "loss": 0.014264887198805809, + "num_input_tokens_seen": 24645880, + "step": 1505, + "train_runtime": 12231.2086, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 0.9127272727272727, + "grad_norm": 0.01769862323999405, + "learning_rate": 9.834877568425225e-05, + "loss": 0.011948327533900738, + "num_input_tokens_seen": 24662256, + "step": 1506, + "train_runtime": 12239.3149, + "train_tokens_per_second": 2015.003 + }, + { + "epoch": 0.9133333333333333, + "grad_norm": 0.017047662287950516, + "learning_rate": 9.834632393153348e-05, + "loss": 0.013797544874250889, + "num_input_tokens_seen": 24678632, + "step": 1507, + "train_runtime": 12247.4368, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 0.9139393939393939, + "grad_norm": 0.01629558391869068, + "learning_rate": 9.834387039057833e-05, + "loss": 0.011770099401473999, + "num_input_tokens_seen": 24695008, + "step": 1508, + "train_runtime": 12255.5461, + "train_tokens_per_second": 2015.007 + }, + { + "epoch": 0.9145454545454546, + "grad_norm": 0.01035243272781372, + "learning_rate": 9.834141506147756e-05, + "loss": 0.012996964156627655, + "num_input_tokens_seen": 24711384, + "step": 1509, + "train_runtime": 12263.6587, + "train_tokens_per_second": 2015.009 + }, + { + "epoch": 0.9151515151515152, + "grad_norm": 0.010673630982637405, + "learning_rate": 9.833895794432199e-05, + "loss": 0.011348108761012554, + "num_input_tokens_seen": 24727760, + "step": 1510, + "train_runtime": 12271.7697, + "train_tokens_per_second": 2015.012 + }, + { + "epoch": 0.9157575757575758, + "grad_norm": 0.0162323247641325, + "learning_rate": 9.83364990392025e-05, + "loss": 0.011655725538730621, + "num_input_tokens_seen": 24744136, + "step": 1511, + "train_runtime": 12279.878, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 0.9163636363636364, + "grad_norm": 0.011112192645668983, + "learning_rate": 9.833403834621005e-05, + "loss": 0.012943776324391365, + "num_input_tokens_seen": 24760512, + "step": 1512, + "train_runtime": 12287.9854, + "train_tokens_per_second": 2015.018 + }, + { + "epoch": 0.916969696969697, + "grad_norm": 0.008601181209087372, + "learning_rate": 9.833157586543565e-05, + "loss": 0.012646821327507496, + "num_input_tokens_seen": 24776888, + "step": 1513, + "train_runtime": 12296.0916, + "train_tokens_per_second": 2015.021 + }, + { + "epoch": 0.9175757575757576, + "grad_norm": 0.012012478895485401, + "learning_rate": 9.832911159697035e-05, + "loss": 0.012354527600109577, + "num_input_tokens_seen": 24793264, + "step": 1514, + "train_runtime": 12304.2001, + "train_tokens_per_second": 2015.024 + }, + { + "epoch": 0.9181818181818182, + "grad_norm": 0.035613156855106354, + "learning_rate": 9.832664554090536e-05, + "loss": 0.016033384948968887, + "num_input_tokens_seen": 24809640, + "step": 1515, + "train_runtime": 12312.3086, + "train_tokens_per_second": 2015.027 + }, + { + "epoch": 0.9187878787878788, + "grad_norm": 0.006802879273891449, + "learning_rate": 9.832417769733185e-05, + "loss": 0.011312966234982014, + "num_input_tokens_seen": 24826016, + "step": 1516, + "train_runtime": 12320.4184, + "train_tokens_per_second": 2015.03 + }, + { + "epoch": 0.9193939393939394, + "grad_norm": 0.021973978728055954, + "learning_rate": 9.832170806634112e-05, + "loss": 0.014309680089354515, + "num_input_tokens_seen": 24842392, + "step": 1517, + "train_runtime": 12328.5323, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 0.92, + "grad_norm": 0.005967301782220602, + "learning_rate": 9.831923664802452e-05, + "loss": 0.013270399533212185, + "num_input_tokens_seen": 24858768, + "step": 1518, + "train_runtime": 12336.6424, + "train_tokens_per_second": 2015.035 + }, + { + "epoch": 0.9206060606060606, + "grad_norm": 0.009456534869968891, + "learning_rate": 9.831676344247342e-05, + "loss": 0.01195263396948576, + "num_input_tokens_seen": 24875144, + "step": 1519, + "train_runtime": 12344.7519, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 0.9212121212121213, + "grad_norm": 0.007073413580656052, + "learning_rate": 9.831428844977937e-05, + "loss": 0.01261575985699892, + "num_input_tokens_seen": 24891520, + "step": 1520, + "train_runtime": 12352.8597, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 0.9218181818181819, + "grad_norm": 0.015730643644928932, + "learning_rate": 9.831181167003385e-05, + "loss": 0.012905986048281193, + "num_input_tokens_seen": 24907896, + "step": 1521, + "train_runtime": 12360.9698, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 0.9224242424242424, + "grad_norm": 0.018344024196267128, + "learning_rate": 9.830933310332853e-05, + "loss": 0.013406560756266117, + "num_input_tokens_seen": 24924272, + "step": 1522, + "train_runtime": 12369.0808, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 0.923030303030303, + "grad_norm": 0.012346428819000721, + "learning_rate": 9.830685274975504e-05, + "loss": 0.014868221245706081, + "num_input_tokens_seen": 24940648, + "step": 1523, + "train_runtime": 12377.1913, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 0.9236363636363636, + "grad_norm": 0.009414401836693287, + "learning_rate": 9.830437060940513e-05, + "loss": 0.011081631295382977, + "num_input_tokens_seen": 24957024, + "step": 1524, + "train_runtime": 12385.2982, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 0.9242424242424242, + "grad_norm": 0.009916838258504868, + "learning_rate": 9.830188668237063e-05, + "loss": 0.012078795582056046, + "num_input_tokens_seen": 24973400, + "step": 1525, + "train_runtime": 12393.4176, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 0.9248484848484848, + "grad_norm": 0.008649183437228203, + "learning_rate": 9.82994009687434e-05, + "loss": 0.01329115778207779, + "num_input_tokens_seen": 24989776, + "step": 1526, + "train_runtime": 12401.5342, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 0.9254545454545454, + "grad_norm": 0.008059944026172161, + "learning_rate": 9.829691346861539e-05, + "loss": 0.010912570171058178, + "num_input_tokens_seen": 25006152, + "step": 1527, + "train_runtime": 12409.6441, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 0.926060606060606, + "grad_norm": 0.01213870570063591, + "learning_rate": 9.82944241820786e-05, + "loss": 0.013018831610679626, + "num_input_tokens_seen": 25022528, + "step": 1528, + "train_runtime": 12417.7549, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 0.9266666666666666, + "grad_norm": 0.006536056753247976, + "learning_rate": 9.829193310922511e-05, + "loss": 0.012272411026060581, + "num_input_tokens_seen": 25038904, + "step": 1529, + "train_runtime": 12425.8697, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 0.9272727272727272, + "grad_norm": 0.016091682016849518, + "learning_rate": 9.828944025014707e-05, + "loss": 0.013518830761313438, + "num_input_tokens_seen": 25055280, + "step": 1530, + "train_runtime": 12433.9806, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 0.9278787878787879, + "grad_norm": 0.013526062481105328, + "learning_rate": 9.828694560493667e-05, + "loss": 0.011766214855015278, + "num_input_tokens_seen": 25071656, + "step": 1531, + "train_runtime": 12442.099, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 0.9284848484848485, + "grad_norm": 0.012226013466715813, + "learning_rate": 9.828444917368618e-05, + "loss": 0.012446287088096142, + "num_input_tokens_seen": 25088032, + "step": 1532, + "train_runtime": 12450.2199, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 0.9290909090909091, + "grad_norm": 0.008501514792442322, + "learning_rate": 9.828195095648796e-05, + "loss": 0.011350834742188454, + "num_input_tokens_seen": 25104408, + "step": 1533, + "train_runtime": 12458.3354, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 0.9296969696969697, + "grad_norm": 0.015691563487052917, + "learning_rate": 9.827945095343438e-05, + "loss": 0.01239042729139328, + "num_input_tokens_seen": 25120784, + "step": 1534, + "train_runtime": 12466.4613, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 0.9303030303030303, + "grad_norm": 0.010623699985444546, + "learning_rate": 9.827694916461793e-05, + "loss": 0.012479366734623909, + "num_input_tokens_seen": 25137160, + "step": 1535, + "train_runtime": 12474.5831, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 0.9309090909090909, + "grad_norm": 0.016181156039237976, + "learning_rate": 9.827444559013115e-05, + "loss": 0.012995701283216476, + "num_input_tokens_seen": 25153536, + "step": 1536, + "train_runtime": 12482.7136, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 0.9315151515151515, + "grad_norm": 0.011957235634326935, + "learning_rate": 9.827194023006665e-05, + "loss": 0.011975055560469627, + "num_input_tokens_seen": 25169912, + "step": 1537, + "train_runtime": 12490.8442, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 0.9321212121212121, + "grad_norm": 0.006527577061206102, + "learning_rate": 9.826943308451706e-05, + "loss": 0.011895911768078804, + "num_input_tokens_seen": 25186288, + "step": 1538, + "train_runtime": 12498.9672, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 0.9327272727272727, + "grad_norm": 0.01042003184556961, + "learning_rate": 9.826692415357517e-05, + "loss": 0.014506472274661064, + "num_input_tokens_seen": 25202664, + "step": 1539, + "train_runtime": 12507.0894, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.016334760934114456, + "learning_rate": 9.826441343733373e-05, + "loss": 0.0129412692040205, + "num_input_tokens_seen": 25219040, + "step": 1540, + "train_runtime": 12515.2111, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 0.933939393939394, + "grad_norm": 0.014927252195775509, + "learning_rate": 9.826190093588563e-05, + "loss": 0.012800981290638447, + "num_input_tokens_seen": 25235416, + "step": 1541, + "train_runtime": 12523.3336, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 0.9345454545454546, + "grad_norm": 0.029416421428322792, + "learning_rate": 9.825938664932381e-05, + "loss": 0.014405488967895508, + "num_input_tokens_seen": 25251792, + "step": 1542, + "train_runtime": 12531.4527, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 0.9351515151515152, + "grad_norm": 0.02064371295273304, + "learning_rate": 9.825687057774126e-05, + "loss": 0.012185944244265556, + "num_input_tokens_seen": 25268168, + "step": 1543, + "train_runtime": 12539.5726, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 0.9357575757575758, + "grad_norm": 0.02839016169309616, + "learning_rate": 9.825435272123103e-05, + "loss": 0.01328684575855732, + "num_input_tokens_seen": 25284544, + "step": 1544, + "train_runtime": 12547.6922, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 0.9363636363636364, + "grad_norm": 0.00797637552022934, + "learning_rate": 9.825183307988628e-05, + "loss": 0.012131592258810997, + "num_input_tokens_seen": 25300920, + "step": 1545, + "train_runtime": 12555.816, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 0.936969696969697, + "grad_norm": 0.019619282335042953, + "learning_rate": 9.824931165380018e-05, + "loss": 0.012967569753527641, + "num_input_tokens_seen": 25317296, + "step": 1546, + "train_runtime": 12563.9372, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 0.9375757575757576, + "grad_norm": 0.021843625232577324, + "learning_rate": 9.824678844306601e-05, + "loss": 0.01267443411052227, + "num_input_tokens_seen": 25333672, + "step": 1547, + "train_runtime": 12572.06, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 0.9381818181818182, + "grad_norm": 0.017489034682512283, + "learning_rate": 9.824426344777708e-05, + "loss": 0.013256320729851723, + "num_input_tokens_seen": 25350048, + "step": 1548, + "train_runtime": 12580.182, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 0.9387878787878788, + "grad_norm": 0.013636451214551926, + "learning_rate": 9.82417366680268e-05, + "loss": 0.012895656749606133, + "num_input_tokens_seen": 25366424, + "step": 1549, + "train_runtime": 12588.3031, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 0.010681037791073322, + "learning_rate": 9.823920810390864e-05, + "loss": 0.01336823869496584, + "num_input_tokens_seen": 25382800, + "step": 1550, + "train_runtime": 12596.4312, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 0.94, + "grad_norm": 0.008681289851665497, + "learning_rate": 9.823667775551611e-05, + "loss": 0.012347033247351646, + "num_input_tokens_seen": 25399176, + "step": 1551, + "train_runtime": 12604.5524, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 0.9406060606060606, + "grad_norm": 0.014772225171327591, + "learning_rate": 9.82341456229428e-05, + "loss": 0.012610615231096745, + "num_input_tokens_seen": 25415552, + "step": 1552, + "train_runtime": 12612.6729, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 0.9412121212121212, + "grad_norm": 0.009455538354814053, + "learning_rate": 9.823161170628236e-05, + "loss": 0.011409729719161987, + "num_input_tokens_seen": 25431928, + "step": 1553, + "train_runtime": 12620.7947, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 0.9418181818181818, + "grad_norm": 0.007376038935035467, + "learning_rate": 9.822907600562855e-05, + "loss": 0.01010863482952118, + "num_input_tokens_seen": 25448304, + "step": 1554, + "train_runtime": 12628.9187, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 0.9424242424242424, + "grad_norm": 0.018327629193663597, + "learning_rate": 9.822653852107514e-05, + "loss": 0.012985773384571075, + "num_input_tokens_seen": 25464680, + "step": 1555, + "train_runtime": 12637.0405, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 0.943030303030303, + "grad_norm": 0.020945513620972633, + "learning_rate": 9.822399925271598e-05, + "loss": 0.012259690091013908, + "num_input_tokens_seen": 25481056, + "step": 1556, + "train_runtime": 12645.1611, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 0.9436363636363636, + "grad_norm": 0.01777738891541958, + "learning_rate": 9.822145820064501e-05, + "loss": 0.013157106004655361, + "num_input_tokens_seen": 25497432, + "step": 1557, + "train_runtime": 12653.2798, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 0.9442424242424242, + "grad_norm": 0.014833358116447926, + "learning_rate": 9.82189153649562e-05, + "loss": 0.013078153133392334, + "num_input_tokens_seen": 25513808, + "step": 1558, + "train_runtime": 12661.4005, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 0.9448484848484848, + "grad_norm": 0.011480891145765781, + "learning_rate": 9.821637074574362e-05, + "loss": 0.012636776082217693, + "num_input_tokens_seen": 25530184, + "step": 1559, + "train_runtime": 12669.5208, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 0.01719282753765583, + "learning_rate": 9.821382434310136e-05, + "loss": 0.013058073818683624, + "num_input_tokens_seen": 25546560, + "step": 1560, + "train_runtime": 12677.6402, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 0.946060606060606, + "grad_norm": 0.017694855108857155, + "learning_rate": 9.821127615712364e-05, + "loss": 0.012888854369521141, + "num_input_tokens_seen": 25562936, + "step": 1561, + "train_runtime": 12685.76, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.012164749205112457, + "learning_rate": 9.820872618790472e-05, + "loss": 0.011173507198691368, + "num_input_tokens_seen": 25579312, + "step": 1562, + "train_runtime": 12693.8817, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 0.9472727272727273, + "grad_norm": 0.012561215087771416, + "learning_rate": 9.820617443553889e-05, + "loss": 0.012248185463249683, + "num_input_tokens_seen": 25595688, + "step": 1563, + "train_runtime": 12702.0014, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 0.9478787878787879, + "grad_norm": 0.01546258945018053, + "learning_rate": 9.820362090012054e-05, + "loss": 0.013846226967871189, + "num_input_tokens_seen": 25612064, + "step": 1564, + "train_runtime": 12710.1221, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 0.9484848484848485, + "grad_norm": 0.01711091957986355, + "learning_rate": 9.820106558174413e-05, + "loss": 0.012854847125709057, + "num_input_tokens_seen": 25628440, + "step": 1565, + "train_runtime": 12718.245, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 0.9490909090909091, + "grad_norm": 0.014450309798121452, + "learning_rate": 9.819850848050419e-05, + "loss": 0.013084612786769867, + "num_input_tokens_seen": 25644816, + "step": 1566, + "train_runtime": 12726.3658, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 0.9496969696969697, + "grad_norm": 0.019604388624429703, + "learning_rate": 9.819594959649525e-05, + "loss": 0.014434726908802986, + "num_input_tokens_seen": 25661192, + "step": 1567, + "train_runtime": 12734.4867, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 0.9503030303030303, + "grad_norm": 0.010934766381978989, + "learning_rate": 9.819338892981201e-05, + "loss": 0.012038343586027622, + "num_input_tokens_seen": 25677568, + "step": 1568, + "train_runtime": 12742.6071, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 0.9509090909090909, + "grad_norm": 0.023866428062319756, + "learning_rate": 9.819082648054915e-05, + "loss": 0.012406328693032265, + "num_input_tokens_seen": 25693944, + "step": 1569, + "train_runtime": 12750.7319, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 0.9515151515151515, + "grad_norm": 0.021724287420511246, + "learning_rate": 9.81882622488015e-05, + "loss": 0.013142053037881851, + "num_input_tokens_seen": 25710320, + "step": 1570, + "train_runtime": 12758.8522, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 0.9521212121212121, + "grad_norm": 0.013339296914637089, + "learning_rate": 9.818569623466383e-05, + "loss": 0.012692091055214405, + "num_input_tokens_seen": 25726696, + "step": 1571, + "train_runtime": 12766.9728, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 0.9527272727272728, + "grad_norm": 0.01173945888876915, + "learning_rate": 9.818312843823113e-05, + "loss": 0.012100563384592533, + "num_input_tokens_seen": 25743072, + "step": 1572, + "train_runtime": 12775.0957, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 0.9533333333333334, + "grad_norm": 0.024036092683672905, + "learning_rate": 9.818055885959831e-05, + "loss": 0.01412537693977356, + "num_input_tokens_seen": 25759448, + "step": 1573, + "train_runtime": 12783.2155, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.953939393939394, + "grad_norm": 0.017213815823197365, + "learning_rate": 9.817798749886047e-05, + "loss": 0.012354889884591103, + "num_input_tokens_seen": 25775824, + "step": 1574, + "train_runtime": 12791.3381, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 0.9545454545454546, + "grad_norm": 0.00862727127969265, + "learning_rate": 9.817541435611268e-05, + "loss": 0.011515894904732704, + "num_input_tokens_seen": 25792200, + "step": 1575, + "train_runtime": 12799.4582, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 0.9551515151515152, + "grad_norm": 0.015271785669028759, + "learning_rate": 9.817283943145013e-05, + "loss": 0.013525201007723808, + "num_input_tokens_seen": 25808576, + "step": 1576, + "train_runtime": 12807.5792, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 0.9557575757575758, + "grad_norm": 0.011643790639936924, + "learning_rate": 9.817026272496806e-05, + "loss": 0.012195194140076637, + "num_input_tokens_seen": 25824952, + "step": 1577, + "train_runtime": 12815.6998, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 0.9563636363636364, + "grad_norm": 0.014296936802566051, + "learning_rate": 9.81676842367618e-05, + "loss": 0.011279569007456303, + "num_input_tokens_seen": 25841328, + "step": 1578, + "train_runtime": 12823.8181, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 0.9569696969696969, + "grad_norm": 0.0846167728304863, + "learning_rate": 9.816510396692668e-05, + "loss": 0.011400844901800156, + "num_input_tokens_seen": 25857704, + "step": 1579, + "train_runtime": 12831.94, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 0.9575757575757575, + "grad_norm": 0.01264986302703619, + "learning_rate": 9.816252191555818e-05, + "loss": 0.012491201981902122, + "num_input_tokens_seen": 25874080, + "step": 1580, + "train_runtime": 12840.061, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 0.9581818181818181, + "grad_norm": 0.012004262767732143, + "learning_rate": 9.815993808275177e-05, + "loss": 0.012015881948173046, + "num_input_tokens_seen": 25890456, + "step": 1581, + "train_runtime": 12848.1818, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 0.9587878787878787, + "grad_norm": 0.017323743551969528, + "learning_rate": 9.815735246860305e-05, + "loss": 0.012511130422353745, + "num_input_tokens_seen": 25906832, + "step": 1582, + "train_runtime": 12856.3029, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 0.9593939393939394, + "grad_norm": 0.007995963096618652, + "learning_rate": 9.815476507320762e-05, + "loss": 0.012292873114347458, + "num_input_tokens_seen": 25923208, + "step": 1583, + "train_runtime": 12864.4325, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 0.96, + "grad_norm": 0.013012220151722431, + "learning_rate": 9.815217589666124e-05, + "loss": 0.012744070030748844, + "num_input_tokens_seen": 25939584, + "step": 1584, + "train_runtime": 12872.5519, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 0.9606060606060606, + "grad_norm": 0.012448850087821484, + "learning_rate": 9.814958493905963e-05, + "loss": 0.010710782371461391, + "num_input_tokens_seen": 25955960, + "step": 1585, + "train_runtime": 12880.6727, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 0.9612121212121212, + "grad_norm": 0.017448868602514267, + "learning_rate": 9.814699220049863e-05, + "loss": 0.012682373635470867, + "num_input_tokens_seen": 25972336, + "step": 1586, + "train_runtime": 12888.7948, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.9618181818181818, + "grad_norm": 0.009177150204777718, + "learning_rate": 9.814439768107418e-05, + "loss": 0.01149784680455923, + "num_input_tokens_seen": 25988712, + "step": 1587, + "train_runtime": 12896.9201, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.9624242424242424, + "grad_norm": 0.046517495065927505, + "learning_rate": 9.814180138088218e-05, + "loss": 0.012817755341529846, + "num_input_tokens_seen": 26005088, + "step": 1588, + "train_runtime": 12905.0427, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 0.963030303030303, + "grad_norm": 0.04576029255986214, + "learning_rate": 9.813920330001872e-05, + "loss": 0.013101590797305107, + "num_input_tokens_seen": 26021464, + "step": 1589, + "train_runtime": 12913.1645, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 0.9636363636363636, + "grad_norm": 0.005771426483988762, + "learning_rate": 9.813660343857988e-05, + "loss": 0.011728906072676182, + "num_input_tokens_seen": 26037840, + "step": 1590, + "train_runtime": 12921.2861, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 0.9642424242424242, + "grad_norm": 0.019825542345643044, + "learning_rate": 9.813400179666181e-05, + "loss": 0.014890195801854134, + "num_input_tokens_seen": 26054216, + "step": 1591, + "train_runtime": 12929.407, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 0.9648484848484848, + "grad_norm": 0.014512875117361546, + "learning_rate": 9.813139837436076e-05, + "loss": 0.012027891352772713, + "num_input_tokens_seen": 26070592, + "step": 1592, + "train_runtime": 12937.5336, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 0.9654545454545455, + "grad_norm": 0.0058611356653273106, + "learning_rate": 9.8128793171773e-05, + "loss": 0.011675585061311722, + "num_input_tokens_seen": 26086968, + "step": 1593, + "train_runtime": 12945.6547, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 0.9660606060606061, + "grad_norm": 0.007386692333966494, + "learning_rate": 9.812618618899491e-05, + "loss": 0.012009193189442158, + "num_input_tokens_seen": 26103344, + "step": 1594, + "train_runtime": 12953.7762, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 0.006147931329905987, + "learning_rate": 9.812357742612293e-05, + "loss": 0.0111166313290596, + "num_input_tokens_seen": 26119720, + "step": 1595, + "train_runtime": 12961.8976, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 0.9672727272727273, + "grad_norm": 0.014322157017886639, + "learning_rate": 9.812096688325354e-05, + "loss": 0.012861751019954681, + "num_input_tokens_seen": 26136096, + "step": 1596, + "train_runtime": 12970.0197, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 0.9678787878787879, + "grad_norm": 0.027148565277457237, + "learning_rate": 9.811835456048328e-05, + "loss": 0.013231952674686909, + "num_input_tokens_seen": 26152472, + "step": 1597, + "train_runtime": 12978.1419, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 0.9684848484848485, + "grad_norm": 0.010908321477472782, + "learning_rate": 9.811574045790879e-05, + "loss": 0.01282698567956686, + "num_input_tokens_seen": 26168848, + "step": 1598, + "train_runtime": 12986.2629, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 0.9690909090909091, + "grad_norm": 0.011924156919121742, + "learning_rate": 9.811312457562678e-05, + "loss": 0.01284240186214447, + "num_input_tokens_seen": 26185224, + "step": 1599, + "train_runtime": 12994.3852, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.015434959903359413, + "learning_rate": 9.811050691373396e-05, + "loss": 0.012538165785372257, + "num_input_tokens_seen": 26201600, + "step": 1600, + "train_runtime": 13002.5063, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 0.9703030303030303, + "grad_norm": 0.012002578936517239, + "learning_rate": 9.810788747232721e-05, + "loss": 0.013315416872501373, + "num_input_tokens_seen": 26217976, + "step": 1601, + "train_runtime": 13011.6796, + "train_tokens_per_second": 2014.957 + }, + { + "epoch": 0.9709090909090909, + "grad_norm": 0.014069149270653725, + "learning_rate": 9.810526625150337e-05, + "loss": 0.013467980548739433, + "num_input_tokens_seen": 26234352, + "step": 1602, + "train_runtime": 13019.7981, + "train_tokens_per_second": 2014.958 + }, + { + "epoch": 0.9715151515151516, + "grad_norm": 0.014620055444538593, + "learning_rate": 9.810264325135942e-05, + "loss": 0.013381460681557655, + "num_input_tokens_seen": 26250728, + "step": 1603, + "train_runtime": 13027.918, + "train_tokens_per_second": 2014.96 + }, + { + "epoch": 0.9721212121212122, + "grad_norm": 0.01015267800539732, + "learning_rate": 9.810001847199237e-05, + "loss": 0.011481476947665215, + "num_input_tokens_seen": 26267104, + "step": 1604, + "train_runtime": 13036.036, + "train_tokens_per_second": 2014.961 + }, + { + "epoch": 0.9727272727272728, + "grad_norm": 0.010799713432788849, + "learning_rate": 9.80973919134993e-05, + "loss": 0.012389612384140491, + "num_input_tokens_seen": 26283480, + "step": 1605, + "train_runtime": 13044.1551, + "train_tokens_per_second": 2014.962 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.010257367044687271, + "learning_rate": 9.809476357597738e-05, + "loss": 0.01151387207210064, + "num_input_tokens_seen": 26299856, + "step": 1606, + "train_runtime": 13052.2743, + "train_tokens_per_second": 2014.963 + }, + { + "epoch": 0.973939393939394, + "grad_norm": 0.013825331814587116, + "learning_rate": 9.809213345952381e-05, + "loss": 0.011732700280845165, + "num_input_tokens_seen": 26316232, + "step": 1607, + "train_runtime": 13060.3933, + "train_tokens_per_second": 2014.965 + }, + { + "epoch": 0.9745454545454545, + "grad_norm": 0.012632308527827263, + "learning_rate": 9.808950156423588e-05, + "loss": 0.012355693615972996, + "num_input_tokens_seen": 26332608, + "step": 1608, + "train_runtime": 13068.5111, + "train_tokens_per_second": 2014.966 + }, + { + "epoch": 0.9751515151515151, + "grad_norm": 0.002986186882480979, + "learning_rate": 9.808686789021093e-05, + "loss": 0.011115066707134247, + "num_input_tokens_seen": 26348984, + "step": 1609, + "train_runtime": 13076.6315, + "train_tokens_per_second": 2014.967 + }, + { + "epoch": 0.9757575757575757, + "grad_norm": 0.14690753817558289, + "learning_rate": 9.808423243754639e-05, + "loss": 0.011805863119661808, + "num_input_tokens_seen": 26365360, + "step": 1610, + "train_runtime": 13084.7512, + "train_tokens_per_second": 2014.968 + }, + { + "epoch": 0.9763636363636363, + "grad_norm": 0.010931416414678097, + "learning_rate": 9.808159520633973e-05, + "loss": 0.010769207030534744, + "num_input_tokens_seen": 26381736, + "step": 1611, + "train_runtime": 13092.8686, + "train_tokens_per_second": 2014.97 + }, + { + "epoch": 0.9769696969696969, + "grad_norm": 0.005970633123070002, + "learning_rate": 9.80789561966885e-05, + "loss": 0.011445348151028156, + "num_input_tokens_seen": 26398112, + "step": 1612, + "train_runtime": 13101.0028, + "train_tokens_per_second": 2014.969 + }, + { + "epoch": 0.9775757575757575, + "grad_norm": 0.014702706597745419, + "learning_rate": 9.80763154086903e-05, + "loss": 0.012894170358777046, + "num_input_tokens_seen": 26414488, + "step": 1613, + "train_runtime": 13109.121, + "train_tokens_per_second": 2014.97 + }, + { + "epoch": 0.9781818181818182, + "grad_norm": 0.012428919784724712, + "learning_rate": 9.807367284244282e-05, + "loss": 0.012821591459214687, + "num_input_tokens_seen": 26430864, + "step": 1614, + "train_runtime": 13117.2386, + "train_tokens_per_second": 2014.972 + }, + { + "epoch": 0.9787878787878788, + "grad_norm": 0.06160819157958031, + "learning_rate": 9.807102849804381e-05, + "loss": 0.013576450757682323, + "num_input_tokens_seen": 26447240, + "step": 1615, + "train_runtime": 13125.3585, + "train_tokens_per_second": 2014.973 + }, + { + "epoch": 0.9793939393939394, + "grad_norm": 0.017494510859251022, + "learning_rate": 9.806838237559107e-05, + "loss": 0.013401782140135765, + "num_input_tokens_seen": 26463616, + "step": 1616, + "train_runtime": 13133.477, + "train_tokens_per_second": 2014.974 + }, + { + "epoch": 0.98, + "grad_norm": 0.05766647309064865, + "learning_rate": 9.806573447518246e-05, + "loss": 0.014988632872700691, + "num_input_tokens_seen": 26479992, + "step": 1617, + "train_runtime": 13141.5957, + "train_tokens_per_second": 2014.975 + }, + { + "epoch": 0.9806060606060606, + "grad_norm": 0.0317838154733181, + "learning_rate": 9.806308479691595e-05, + "loss": 0.011899848468601704, + "num_input_tokens_seen": 26496368, + "step": 1618, + "train_runtime": 13149.7145, + "train_tokens_per_second": 2014.977 + }, + { + "epoch": 0.9812121212121212, + "grad_norm": 0.0214092880487442, + "learning_rate": 9.806043334088952e-05, + "loss": 0.011899617500603199, + "num_input_tokens_seen": 26512744, + "step": 1619, + "train_runtime": 13157.8348, + "train_tokens_per_second": 2014.978 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 0.020651455968618393, + "learning_rate": 9.805778010720126e-05, + "loss": 0.012952842749655247, + "num_input_tokens_seen": 26529120, + "step": 1620, + "train_runtime": 13165.9527, + "train_tokens_per_second": 2014.979 + }, + { + "epoch": 0.9824242424242424, + "grad_norm": 0.011873727664351463, + "learning_rate": 9.80551250959493e-05, + "loss": 0.012238996103405952, + "num_input_tokens_seen": 26545496, + "step": 1621, + "train_runtime": 13174.0706, + "train_tokens_per_second": 2014.981 + }, + { + "epoch": 0.983030303030303, + "grad_norm": 0.008294295519590378, + "learning_rate": 9.805246830723186e-05, + "loss": 0.013415738008916378, + "num_input_tokens_seen": 26561872, + "step": 1622, + "train_runtime": 13182.1887, + "train_tokens_per_second": 2014.982 + }, + { + "epoch": 0.9836363636363636, + "grad_norm": 0.006087975576519966, + "learning_rate": 9.804980974114719e-05, + "loss": 0.01233192440122366, + "num_input_tokens_seen": 26578248, + "step": 1623, + "train_runtime": 13190.3059, + "train_tokens_per_second": 2014.983 + }, + { + "epoch": 0.9842424242424243, + "grad_norm": 0.014780385419726372, + "learning_rate": 9.804714939779362e-05, + "loss": 0.012391527183353901, + "num_input_tokens_seen": 26594624, + "step": 1624, + "train_runtime": 13198.4311, + "train_tokens_per_second": 2014.984 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.027467984706163406, + "learning_rate": 9.804448727726956e-05, + "loss": 0.013444105163216591, + "num_input_tokens_seen": 26611000, + "step": 1625, + "train_runtime": 13206.5451, + "train_tokens_per_second": 2014.986 + }, + { + "epoch": 0.9854545454545455, + "grad_norm": 0.019479792565107346, + "learning_rate": 9.804182337967349e-05, + "loss": 0.012373380362987518, + "num_input_tokens_seen": 26627376, + "step": 1626, + "train_runtime": 13214.6617, + "train_tokens_per_second": 2014.987 + }, + { + "epoch": 0.9860606060606061, + "grad_norm": 0.01583736762404442, + "learning_rate": 9.803915770510393e-05, + "loss": 0.01365045364946127, + "num_input_tokens_seen": 26643752, + "step": 1627, + "train_runtime": 13222.7801, + "train_tokens_per_second": 2014.989 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.009480198845267296, + "learning_rate": 9.803649025365947e-05, + "loss": 0.012557166628539562, + "num_input_tokens_seen": 26660128, + "step": 1628, + "train_runtime": 13230.8971, + "train_tokens_per_second": 2014.99 + }, + { + "epoch": 0.9872727272727273, + "grad_norm": 0.04188961163163185, + "learning_rate": 9.803382102543879e-05, + "loss": 0.012879314832389355, + "num_input_tokens_seen": 26676504, + "step": 1629, + "train_runtime": 13239.0155, + "train_tokens_per_second": 2014.992 + }, + { + "epoch": 0.9878787878787879, + "grad_norm": 0.025775199756026268, + "learning_rate": 9.80311500205406e-05, + "loss": 0.012588057667016983, + "num_input_tokens_seen": 26692880, + "step": 1630, + "train_runtime": 13247.1337, + "train_tokens_per_second": 2014.993 + }, + { + "epoch": 0.9884848484848485, + "grad_norm": 0.013006957247853279, + "learning_rate": 9.802847723906371e-05, + "loss": 0.012539117597043514, + "num_input_tokens_seen": 26709256, + "step": 1631, + "train_runtime": 13255.2503, + "train_tokens_per_second": 2014.994 + }, + { + "epoch": 0.9890909090909091, + "grad_norm": 0.015388661995530128, + "learning_rate": 9.802580268110699e-05, + "loss": 0.013457395136356354, + "num_input_tokens_seen": 26725632, + "step": 1632, + "train_runtime": 13263.3681, + "train_tokens_per_second": 2014.996 + }, + { + "epoch": 0.9896969696969697, + "grad_norm": 0.015193904750049114, + "learning_rate": 9.802312634676934e-05, + "loss": 0.012364407069981098, + "num_input_tokens_seen": 26742008, + "step": 1633, + "train_runtime": 13271.4872, + "train_tokens_per_second": 2014.997 + }, + { + "epoch": 0.9903030303030304, + "grad_norm": 0.008997797966003418, + "learning_rate": 9.802044823614978e-05, + "loss": 0.011594077572226524, + "num_input_tokens_seen": 26758384, + "step": 1634, + "train_runtime": 13279.6059, + "train_tokens_per_second": 2014.998 + }, + { + "epoch": 0.990909090909091, + "grad_norm": 0.008219058625400066, + "learning_rate": 9.801776834934736e-05, + "loss": 0.011843642219901085, + "num_input_tokens_seen": 26774760, + "step": 1635, + "train_runtime": 13287.7237, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 0.9915151515151515, + "grad_norm": 0.010219755582511425, + "learning_rate": 9.801508668646118e-05, + "loss": 0.013223512098193169, + "num_input_tokens_seen": 26791136, + "step": 1636, + "train_runtime": 13295.8407, + "train_tokens_per_second": 2015.001 + }, + { + "epoch": 0.9921212121212121, + "grad_norm": 0.006228118669241667, + "learning_rate": 9.801240324759045e-05, + "loss": 0.011068768799304962, + "num_input_tokens_seen": 26807512, + "step": 1637, + "train_runtime": 13303.9585, + "train_tokens_per_second": 2015.003 + }, + { + "epoch": 0.9927272727272727, + "grad_norm": 0.02005128748714924, + "learning_rate": 9.800971803283443e-05, + "loss": 0.01461564190685749, + "num_input_tokens_seen": 26823888, + "step": 1638, + "train_runtime": 13312.0757, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 0.9933333333333333, + "grad_norm": 0.01215451955795288, + "learning_rate": 9.800703104229245e-05, + "loss": 0.012672146782279015, + "num_input_tokens_seen": 26840264, + "step": 1639, + "train_runtime": 13320.1936, + "train_tokens_per_second": 2015.006 + }, + { + "epoch": 0.9939393939393939, + "grad_norm": 0.007676825392991304, + "learning_rate": 9.800434227606385e-05, + "loss": 0.011905834078788757, + "num_input_tokens_seen": 26856640, + "step": 1640, + "train_runtime": 13328.3104, + "train_tokens_per_second": 2015.007 + }, + { + "epoch": 0.9945454545454545, + "grad_norm": 0.007540915627032518, + "learning_rate": 9.800165173424814e-05, + "loss": 0.011878485791385174, + "num_input_tokens_seen": 26873016, + "step": 1641, + "train_runtime": 13336.4323, + "train_tokens_per_second": 2015.008 + }, + { + "epoch": 0.9951515151515151, + "grad_norm": 0.00563439354300499, + "learning_rate": 9.799895941694481e-05, + "loss": 0.013368148356676102, + "num_input_tokens_seen": 26889392, + "step": 1642, + "train_runtime": 13344.5508, + "train_tokens_per_second": 2015.009 + }, + { + "epoch": 0.9957575757575757, + "grad_norm": 0.015325483866035938, + "learning_rate": 9.799626532425343e-05, + "loss": 0.012677650898694992, + "num_input_tokens_seen": 26905768, + "step": 1643, + "train_runtime": 13352.671, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 0.9963636363636363, + "grad_norm": 0.021497316658496857, + "learning_rate": 9.799356945627368e-05, + "loss": 0.012580220587551594, + "num_input_tokens_seen": 26922144, + "step": 1644, + "train_runtime": 13360.7875, + "train_tokens_per_second": 2015.012 + }, + { + "epoch": 0.996969696969697, + "grad_norm": 0.012657717801630497, + "learning_rate": 9.799087181310524e-05, + "loss": 0.012727495282888412, + "num_input_tokens_seen": 26938520, + "step": 1645, + "train_runtime": 13368.9042, + "train_tokens_per_second": 2015.013 + }, + { + "epoch": 0.9975757575757576, + "grad_norm": 0.011814654804766178, + "learning_rate": 9.798817239484792e-05, + "loss": 0.012600673362612724, + "num_input_tokens_seen": 26954896, + "step": 1646, + "train_runtime": 13377.0215, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 0.9981818181818182, + "grad_norm": 0.020266059786081314, + "learning_rate": 9.798547120160156e-05, + "loss": 0.01349579356610775, + "num_input_tokens_seen": 26971272, + "step": 1647, + "train_runtime": 13385.1417, + "train_tokens_per_second": 2015.016 + }, + { + "epoch": 0.9987878787878788, + "grad_norm": 0.013499039225280285, + "learning_rate": 9.798276823346606e-05, + "loss": 0.013363813050091267, + "num_input_tokens_seen": 26987648, + "step": 1648, + "train_runtime": 13393.2596, + "train_tokens_per_second": 2015.017 + }, + { + "epoch": 0.9993939393939394, + "grad_norm": 0.013451367616653442, + "learning_rate": 9.79800634905414e-05, + "loss": 0.013179901987314224, + "num_input_tokens_seen": 27004024, + "step": 1649, + "train_runtime": 13401.3766, + "train_tokens_per_second": 2015.019 + }, + { + "epoch": 1.0, + "grad_norm": 0.008038493804633617, + "learning_rate": 9.797735697292765e-05, + "loss": 0.012226985767483711, + "num_input_tokens_seen": 27020400, + "step": 1650, + "train_runtime": 13409.4946, + "train_tokens_per_second": 2015.02 + }, + { + "epoch": 1.0006060606060605, + "grad_norm": 0.012623721733689308, + "learning_rate": 9.797464868072488e-05, + "loss": 0.012360217981040478, + "num_input_tokens_seen": 27036776, + "step": 1651, + "train_runtime": 13417.6145, + "train_tokens_per_second": 2015.021 + }, + { + "epoch": 1.0012121212121212, + "grad_norm": 0.012440846301615238, + "learning_rate": 9.797193861403329e-05, + "loss": 0.01371823437511921, + "num_input_tokens_seen": 27053152, + "step": 1652, + "train_runtime": 13425.7343, + "train_tokens_per_second": 2015.022 + }, + { + "epoch": 1.0018181818181817, + "grad_norm": 0.009919981472194195, + "learning_rate": 9.79692267729531e-05, + "loss": 0.01273531373590231, + "num_input_tokens_seen": 27069528, + "step": 1653, + "train_runtime": 13433.8539, + "train_tokens_per_second": 2015.023 + }, + { + "epoch": 1.0024242424242424, + "grad_norm": 0.00648895651102066, + "learning_rate": 9.796651315758463e-05, + "loss": 0.012470672838389874, + "num_input_tokens_seen": 27085904, + "step": 1654, + "train_runtime": 13441.9722, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.003030303030303, + "grad_norm": 0.010717087425291538, + "learning_rate": 9.796379776802826e-05, + "loss": 0.012379986234009266, + "num_input_tokens_seen": 27102280, + "step": 1655, + "train_runtime": 13450.0852, + "train_tokens_per_second": 2015.027 + }, + { + "epoch": 1.0036363636363637, + "grad_norm": 0.010810550302267075, + "learning_rate": 9.79610806043844e-05, + "loss": 0.012405799701809883, + "num_input_tokens_seen": 27118656, + "step": 1656, + "train_runtime": 13458.1958, + "train_tokens_per_second": 2015.029 + }, + { + "epoch": 1.0042424242424242, + "grad_norm": 0.012775463983416557, + "learning_rate": 9.795836166675358e-05, + "loss": 0.01413954608142376, + "num_input_tokens_seen": 27135032, + "step": 1657, + "train_runtime": 13466.3055, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 1.0048484848484849, + "grad_norm": 0.008526409044861794, + "learning_rate": 9.795564095523635e-05, + "loss": 0.012026500888168812, + "num_input_tokens_seen": 27151408, + "step": 1658, + "train_runtime": 13474.4176, + "train_tokens_per_second": 2015.034 + }, + { + "epoch": 1.0054545454545454, + "grad_norm": 0.02498007006943226, + "learning_rate": 9.795291846993337e-05, + "loss": 0.012756639160215855, + "num_input_tokens_seen": 27167784, + "step": 1659, + "train_runtime": 13482.5333, + "train_tokens_per_second": 2015.036 + }, + { + "epoch": 1.006060606060606, + "grad_norm": 0.007863683626055717, + "learning_rate": 9.79501942109453e-05, + "loss": 0.010186624713242054, + "num_input_tokens_seen": 27184160, + "step": 1660, + "train_runtime": 13490.6437, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.0066666666666666, + "grad_norm": 0.01177004911005497, + "learning_rate": 9.794746817837293e-05, + "loss": 0.01371595449745655, + "num_input_tokens_seen": 27200536, + "step": 1661, + "train_runtime": 13498.7517, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 1.0072727272727273, + "grad_norm": 0.016004914417862892, + "learning_rate": 9.794474037231708e-05, + "loss": 0.015218988992273808, + "num_input_tokens_seen": 27216912, + "step": 1662, + "train_runtime": 13506.8607, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.0078787878787878, + "grad_norm": 0.002983122132718563, + "learning_rate": 9.794201079287865e-05, + "loss": 0.01247552502900362, + "num_input_tokens_seen": 27233288, + "step": 1663, + "train_runtime": 13514.9678, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.0084848484848485, + "grad_norm": 0.007749462965875864, + "learning_rate": 9.793927944015862e-05, + "loss": 0.013320086523890495, + "num_input_tokens_seen": 27249664, + "step": 1664, + "train_runtime": 13523.0773, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.009090909090909, + "grad_norm": 0.008403261192142963, + "learning_rate": 9.7936546314258e-05, + "loss": 0.01121437270194292, + "num_input_tokens_seen": 27266040, + "step": 1665, + "train_runtime": 13531.191, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.0096969696969698, + "grad_norm": 0.009295761585235596, + "learning_rate": 9.793381141527786e-05, + "loss": 0.011189664714038372, + "num_input_tokens_seen": 27282416, + "step": 1666, + "train_runtime": 13539.3003, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.0103030303030303, + "grad_norm": 0.008638166822493076, + "learning_rate": 9.79310747433194e-05, + "loss": 0.012114373967051506, + "num_input_tokens_seen": 27298792, + "step": 1667, + "train_runtime": 13547.4075, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.010909090909091, + "grad_norm": 0.013895823620259762, + "learning_rate": 9.792833629848384e-05, + "loss": 0.012441890314221382, + "num_input_tokens_seen": 27315168, + "step": 1668, + "train_runtime": 13555.5158, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.0115151515151515, + "grad_norm": 0.010214622132480145, + "learning_rate": 9.792559608087243e-05, + "loss": 0.013287513516843319, + "num_input_tokens_seen": 27331544, + "step": 1669, + "train_runtime": 13563.632, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.0121212121212122, + "grad_norm": 0.049505386501550674, + "learning_rate": 9.792285409058657e-05, + "loss": 0.014072421938180923, + "num_input_tokens_seen": 27347920, + "step": 1670, + "train_runtime": 13571.7428, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.0127272727272727, + "grad_norm": 0.0072554643265903, + "learning_rate": 9.792011032772765e-05, + "loss": 0.012629404664039612, + "num_input_tokens_seen": 27364296, + "step": 1671, + "train_runtime": 13579.8507, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 0.03262757137417793, + "learning_rate": 9.791736479239717e-05, + "loss": 0.013759227469563484, + "num_input_tokens_seen": 27380672, + "step": 1672, + "train_runtime": 13587.9608, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.013939393939394, + "grad_norm": 0.012852982617914677, + "learning_rate": 9.791461748469669e-05, + "loss": 0.012652904726564884, + "num_input_tokens_seen": 27397048, + "step": 1673, + "train_runtime": 13596.0726, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.0145454545454546, + "grad_norm": 0.008402649313211441, + "learning_rate": 9.791186840472781e-05, + "loss": 0.012419513426721096, + "num_input_tokens_seen": 27413424, + "step": 1674, + "train_runtime": 13604.1792, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.0151515151515151, + "grad_norm": 0.00858678761869669, + "learning_rate": 9.790911755259223e-05, + "loss": 0.011973465792834759, + "num_input_tokens_seen": 27429800, + "step": 1675, + "train_runtime": 13612.2879, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.0157575757575759, + "grad_norm": 0.00914035551249981, + "learning_rate": 9.79063649283917e-05, + "loss": 0.012171374633908272, + "num_input_tokens_seen": 27446176, + "step": 1676, + "train_runtime": 13620.3961, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.0163636363636364, + "grad_norm": 0.01112990453839302, + "learning_rate": 9.790361053222799e-05, + "loss": 0.013811156153678894, + "num_input_tokens_seen": 27462552, + "step": 1677, + "train_runtime": 13628.5074, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.016969696969697, + "grad_norm": 0.008288376964628696, + "learning_rate": 9.790085436420304e-05, + "loss": 0.012029111385345459, + "num_input_tokens_seen": 27478928, + "step": 1678, + "train_runtime": 13636.6158, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.0175757575757576, + "grad_norm": 0.009814487770199776, + "learning_rate": 9.789809642441877e-05, + "loss": 0.012732294388115406, + "num_input_tokens_seen": 27495304, + "step": 1679, + "train_runtime": 13644.7216, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.018181818181818, + "grad_norm": 0.014307788573205471, + "learning_rate": 9.789533671297719e-05, + "loss": 0.013805416412651539, + "num_input_tokens_seen": 27511680, + "step": 1680, + "train_runtime": 13652.8336, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.0187878787878788, + "grad_norm": 0.012851865030825138, + "learning_rate": 9.789257522998037e-05, + "loss": 0.012965833768248558, + "num_input_tokens_seen": 27528056, + "step": 1681, + "train_runtime": 13660.9411, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.0193939393939393, + "grad_norm": 0.01544270757585764, + "learning_rate": 9.788981197553047e-05, + "loss": 0.012375458143651485, + "num_input_tokens_seen": 27544432, + "step": 1682, + "train_runtime": 13669.0468, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.02, + "grad_norm": 0.017993232235312462, + "learning_rate": 9.788704694972967e-05, + "loss": 0.012339062988758087, + "num_input_tokens_seen": 27560808, + "step": 1683, + "train_runtime": 13677.1562, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.0206060606060605, + "grad_norm": 0.011097032576799393, + "learning_rate": 9.788428015268027e-05, + "loss": 0.01293334923684597, + "num_input_tokens_seen": 27577184, + "step": 1684, + "train_runtime": 13685.2675, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.0212121212121212, + "grad_norm": 0.009788686409592628, + "learning_rate": 9.78815115844846e-05, + "loss": 0.013497721403837204, + "num_input_tokens_seen": 27593560, + "step": 1685, + "train_runtime": 13693.378, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.0218181818181817, + "grad_norm": 0.011068359948694706, + "learning_rate": 9.787874124524505e-05, + "loss": 0.012450134381651878, + "num_input_tokens_seen": 27609936, + "step": 1686, + "train_runtime": 13701.4857, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.0224242424242425, + "grad_norm": 0.010093600489199162, + "learning_rate": 9.78759691350641e-05, + "loss": 0.012252770364284515, + "num_input_tokens_seen": 27626312, + "step": 1687, + "train_runtime": 13709.5946, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.023030303030303, + "grad_norm": 0.0072128004394471645, + "learning_rate": 9.78731952540443e-05, + "loss": 0.011251643300056458, + "num_input_tokens_seen": 27642688, + "step": 1688, + "train_runtime": 13717.7048, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.0236363636363637, + "grad_norm": 0.01251761894673109, + "learning_rate": 9.787041960228823e-05, + "loss": 0.013902283273637295, + "num_input_tokens_seen": 27659064, + "step": 1689, + "train_runtime": 13725.8143, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.0242424242424242, + "grad_norm": 0.011588496156036854, + "learning_rate": 9.786764217989856e-05, + "loss": 0.0121589545160532, + "num_input_tokens_seen": 27675440, + "step": 1690, + "train_runtime": 13733.9208, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.024848484848485, + "grad_norm": 0.013498706743121147, + "learning_rate": 9.786486298697803e-05, + "loss": 0.014338000677525997, + "num_input_tokens_seen": 27691816, + "step": 1691, + "train_runtime": 13742.0332, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.0254545454545454, + "grad_norm": 0.008712450042366982, + "learning_rate": 9.786208202362943e-05, + "loss": 0.011765317060053349, + "num_input_tokens_seen": 27708192, + "step": 1692, + "train_runtime": 13750.1435, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.0260606060606061, + "grad_norm": 0.012163599021732807, + "learning_rate": 9.785929928995561e-05, + "loss": 0.013479816727340221, + "num_input_tokens_seen": 27724568, + "step": 1693, + "train_runtime": 13758.2542, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.0266666666666666, + "grad_norm": 0.011169539764523506, + "learning_rate": 9.785651478605953e-05, + "loss": 0.011647282168269157, + "num_input_tokens_seen": 27740944, + "step": 1694, + "train_runtime": 13766.3635, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.0272727272727273, + "grad_norm": 0.008172113448381424, + "learning_rate": 9.785372851204415e-05, + "loss": 0.013068556785583496, + "num_input_tokens_seen": 27757320, + "step": 1695, + "train_runtime": 13774.4757, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.0278787878787878, + "grad_norm": 0.01855943165719509, + "learning_rate": 9.785094046801256e-05, + "loss": 0.012416105717420578, + "num_input_tokens_seen": 27773696, + "step": 1696, + "train_runtime": 13782.5835, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.0284848484848486, + "grad_norm": 0.011047457344830036, + "learning_rate": 9.784815065406785e-05, + "loss": 0.01277101319283247, + "num_input_tokens_seen": 27790072, + "step": 1697, + "train_runtime": 13790.6921, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.029090909090909, + "grad_norm": 0.015314662829041481, + "learning_rate": 9.784535907031322e-05, + "loss": 0.01302441954612732, + "num_input_tokens_seen": 27806448, + "step": 1698, + "train_runtime": 13798.8029, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.0296969696969698, + "grad_norm": 0.00843130238354206, + "learning_rate": 9.784256571685195e-05, + "loss": 0.012320177629590034, + "num_input_tokens_seen": 27822824, + "step": 1699, + "train_runtime": 13806.9118, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.0303030303030303, + "grad_norm": 0.022686941549181938, + "learning_rate": 9.783977059378734e-05, + "loss": 0.013117888011038303, + "num_input_tokens_seen": 27839200, + "step": 1700, + "train_runtime": 13815.0208, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.030909090909091, + "grad_norm": 0.01333204098045826, + "learning_rate": 9.783697370122278e-05, + "loss": 0.014601165428757668, + "num_input_tokens_seen": 27855576, + "step": 1701, + "train_runtime": 13824.1682, + "train_tokens_per_second": 2014.991 + }, + { + "epoch": 1.0315151515151515, + "grad_norm": 0.014649780467152596, + "learning_rate": 9.783417503926172e-05, + "loss": 0.013181449845433235, + "num_input_tokens_seen": 27871952, + "step": 1702, + "train_runtime": 13832.2803, + "train_tokens_per_second": 2014.993 + }, + { + "epoch": 1.0321212121212122, + "grad_norm": 0.011895393021404743, + "learning_rate": 9.783137460800768e-05, + "loss": 0.012327020056545734, + "num_input_tokens_seen": 27888328, + "step": 1703, + "train_runtime": 13840.3903, + "train_tokens_per_second": 2014.996 + }, + { + "epoch": 1.0327272727272727, + "grad_norm": 0.009198145009577274, + "learning_rate": 9.782857240756423e-05, + "loss": 0.011196177452802658, + "num_input_tokens_seen": 27904704, + "step": 1704, + "train_runtime": 13848.5004, + "train_tokens_per_second": 2014.998 + }, + { + "epoch": 1.0333333333333334, + "grad_norm": 0.008451443165540695, + "learning_rate": 9.782576843803504e-05, + "loss": 0.011635595001280308, + "num_input_tokens_seen": 27921080, + "step": 1705, + "train_runtime": 13856.607, + "train_tokens_per_second": 2015.001 + }, + { + "epoch": 1.033939393939394, + "grad_norm": 0.016875306144356728, + "learning_rate": 9.78229626995238e-05, + "loss": 0.012971418909728527, + "num_input_tokens_seen": 27937456, + "step": 1706, + "train_runtime": 13864.7179, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 1.0345454545454547, + "grad_norm": 0.01164314430207014, + "learning_rate": 9.782015519213433e-05, + "loss": 0.013034064322710037, + "num_input_tokens_seen": 27953832, + "step": 1707, + "train_runtime": 13872.8335, + "train_tokens_per_second": 2015.005 + }, + { + "epoch": 1.0351515151515152, + "grad_norm": 0.010639763437211514, + "learning_rate": 9.78173459159704e-05, + "loss": 0.01193158607929945, + "num_input_tokens_seen": 27970208, + "step": 1708, + "train_runtime": 13880.9435, + "train_tokens_per_second": 2015.008 + }, + { + "epoch": 1.0357575757575757, + "grad_norm": 0.012052073143422604, + "learning_rate": 9.7814534871136e-05, + "loss": 0.012332379817962646, + "num_input_tokens_seen": 27986584, + "step": 1709, + "train_runtime": 13889.0553, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 1.0363636363636364, + "grad_norm": 0.010986050590872765, + "learning_rate": 9.781172205773506e-05, + "loss": 0.011283627711236477, + "num_input_tokens_seen": 28002960, + "step": 1710, + "train_runtime": 13897.165, + "train_tokens_per_second": 2015.012 + }, + { + "epoch": 1.0369696969696969, + "grad_norm": 0.012194296345114708, + "learning_rate": 9.780890747587164e-05, + "loss": 0.012133404612541199, + "num_input_tokens_seen": 28019336, + "step": 1711, + "train_runtime": 13905.2722, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 1.0375757575757576, + "grad_norm": 0.011508403345942497, + "learning_rate": 9.780609112564981e-05, + "loss": 0.012315447442233562, + "num_input_tokens_seen": 28035712, + "step": 1712, + "train_runtime": 13913.3813, + "train_tokens_per_second": 2015.018 + }, + { + "epoch": 1.038181818181818, + "grad_norm": 0.009075857698917389, + "learning_rate": 9.780327300717378e-05, + "loss": 0.013060295023024082, + "num_input_tokens_seen": 28052088, + "step": 1713, + "train_runtime": 13921.4903, + "train_tokens_per_second": 2015.02 + }, + { + "epoch": 1.0387878787878788, + "grad_norm": 0.011064046993851662, + "learning_rate": 9.780045312054778e-05, + "loss": 0.011568753980100155, + "num_input_tokens_seen": 28068464, + "step": 1714, + "train_runtime": 13929.6024, + "train_tokens_per_second": 2015.023 + }, + { + "epoch": 1.0393939393939393, + "grad_norm": 0.006963782943785191, + "learning_rate": 9.77976314658761e-05, + "loss": 0.013147801160812378, + "num_input_tokens_seen": 28084840, + "step": 1715, + "train_runtime": 13937.7105, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.04, + "grad_norm": 0.01637214981019497, + "learning_rate": 9.779480804326313e-05, + "loss": 0.013339829631149769, + "num_input_tokens_seen": 28101216, + "step": 1716, + "train_runtime": 13945.8208, + "train_tokens_per_second": 2015.028 + }, + { + "epoch": 1.0406060606060605, + "grad_norm": 0.017523132264614105, + "learning_rate": 9.779198285281325e-05, + "loss": 0.013437901623547077, + "num_input_tokens_seen": 28117592, + "step": 1717, + "train_runtime": 13953.9357, + "train_tokens_per_second": 2015.029 + }, + { + "epoch": 1.0412121212121213, + "grad_norm": 0.010818637907505035, + "learning_rate": 9.778915589463102e-05, + "loss": 0.012181894853711128, + "num_input_tokens_seen": 28133968, + "step": 1718, + "train_runtime": 13962.0499, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 1.0418181818181818, + "grad_norm": 0.015641039237380028, + "learning_rate": 9.7786327168821e-05, + "loss": 0.012458113953471184, + "num_input_tokens_seen": 28150344, + "step": 1719, + "train_runtime": 13970.1615, + "train_tokens_per_second": 2015.034 + }, + { + "epoch": 1.0424242424242425, + "grad_norm": 0.01187529880553484, + "learning_rate": 9.778349667548776e-05, + "loss": 0.012462708167731762, + "num_input_tokens_seen": 28166720, + "step": 1720, + "train_runtime": 13978.2715, + "train_tokens_per_second": 2015.036 + }, + { + "epoch": 1.043030303030303, + "grad_norm": 0.006183183286339045, + "learning_rate": 9.778066441473604e-05, + "loss": 0.011370932683348656, + "num_input_tokens_seen": 28183096, + "step": 1721, + "train_runtime": 13986.3841, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.0436363636363637, + "grad_norm": 0.004316729959100485, + "learning_rate": 9.777783038667061e-05, + "loss": 0.010927550494670868, + "num_input_tokens_seen": 28199472, + "step": 1722, + "train_runtime": 13994.4946, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.0442424242424242, + "grad_norm": 0.009708013385534286, + "learning_rate": 9.777499459139626e-05, + "loss": 0.01241978257894516, + "num_input_tokens_seen": 28215848, + "step": 1723, + "train_runtime": 14002.6047, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.044848484848485, + "grad_norm": 0.01743965595960617, + "learning_rate": 9.777215702901789e-05, + "loss": 0.012833865359425545, + "num_input_tokens_seen": 28232224, + "step": 1724, + "train_runtime": 14010.7147, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.0454545454545454, + "grad_norm": 0.016349952667951584, + "learning_rate": 9.776931769964049e-05, + "loss": 0.012332115322351456, + "num_input_tokens_seen": 28248600, + "step": 1725, + "train_runtime": 14018.8331, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.0460606060606061, + "grad_norm": 0.012781591154634953, + "learning_rate": 9.776647660336903e-05, + "loss": 0.013009129092097282, + "num_input_tokens_seen": 28264976, + "step": 1726, + "train_runtime": 14026.9411, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.0466666666666666, + "grad_norm": 0.0058420756831765175, + "learning_rate": 9.776363374030864e-05, + "loss": 0.01141081377863884, + "num_input_tokens_seen": 28281352, + "step": 1727, + "train_runtime": 14035.0484, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.0472727272727274, + "grad_norm": 0.010841155424714088, + "learning_rate": 9.776078911056445e-05, + "loss": 0.011902189813554287, + "num_input_tokens_seen": 28297728, + "step": 1728, + "train_runtime": 14043.1635, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.0478787878787879, + "grad_norm": 0.006410760339349508, + "learning_rate": 9.775794271424168e-05, + "loss": 0.011528456583619118, + "num_input_tokens_seen": 28314104, + "step": 1729, + "train_runtime": 14051.2752, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.0484848484848486, + "grad_norm": 0.01570526696741581, + "learning_rate": 9.77550945514456e-05, + "loss": 0.01312203984707594, + "num_input_tokens_seen": 28330480, + "step": 1730, + "train_runtime": 14059.3837, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.049090909090909, + "grad_norm": 0.01310622040182352, + "learning_rate": 9.775224462228159e-05, + "loss": 0.012747850269079208, + "num_input_tokens_seen": 28346856, + "step": 1731, + "train_runtime": 14067.4947, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.0496969696969698, + "grad_norm": 0.01001759059727192, + "learning_rate": 9.774939292685504e-05, + "loss": 0.012231552973389626, + "num_input_tokens_seen": 28363232, + "step": 1732, + "train_runtime": 14075.6078, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.0503030303030303, + "grad_norm": 0.008718959987163544, + "learning_rate": 9.774653946527141e-05, + "loss": 0.012665827758610249, + "num_input_tokens_seen": 28379608, + "step": 1733, + "train_runtime": 14083.7165, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.050909090909091, + "grad_norm": 0.013324756175279617, + "learning_rate": 9.774368423763629e-05, + "loss": 0.012266149744391441, + "num_input_tokens_seen": 28395984, + "step": 1734, + "train_runtime": 14091.8314, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.0515151515151515, + "grad_norm": 0.013658484444022179, + "learning_rate": 9.774082724405526e-05, + "loss": 0.013175414875149727, + "num_input_tokens_seen": 28412360, + "step": 1735, + "train_runtime": 14099.9432, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.0521212121212122, + "grad_norm": 0.005875375587493181, + "learning_rate": 9.773796848463402e-05, + "loss": 0.011622895486652851, + "num_input_tokens_seen": 28428736, + "step": 1736, + "train_runtime": 14108.0531, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.0527272727272727, + "grad_norm": 0.007178007159382105, + "learning_rate": 9.773510795947827e-05, + "loss": 0.010762317106127739, + "num_input_tokens_seen": 28445112, + "step": 1737, + "train_runtime": 14116.1633, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.0533333333333332, + "grad_norm": 0.012299314141273499, + "learning_rate": 9.773224566869385e-05, + "loss": 0.012406258843839169, + "num_input_tokens_seen": 28461488, + "step": 1738, + "train_runtime": 14124.2738, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.053939393939394, + "grad_norm": 0.009009703993797302, + "learning_rate": 9.77293816123866e-05, + "loss": 0.013556526973843575, + "num_input_tokens_seen": 28477864, + "step": 1739, + "train_runtime": 14132.3859, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.0545454545454545, + "grad_norm": 0.014840499497950077, + "learning_rate": 9.772651579066248e-05, + "loss": 0.012394964694976807, + "num_input_tokens_seen": 28494240, + "step": 1740, + "train_runtime": 14140.4952, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.0551515151515152, + "grad_norm": 0.0068387677893042564, + "learning_rate": 9.772364820362749e-05, + "loss": 0.011850697919726372, + "num_input_tokens_seen": 28510616, + "step": 1741, + "train_runtime": 14148.6048, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.0557575757575757, + "grad_norm": 0.03244248777627945, + "learning_rate": 9.772077885138769e-05, + "loss": 0.013322196900844574, + "num_input_tokens_seen": 28526992, + "step": 1742, + "train_runtime": 14156.7153, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.0563636363636364, + "grad_norm": 0.0062013729475438595, + "learning_rate": 9.771790773404921e-05, + "loss": 0.012244854122400284, + "num_input_tokens_seen": 28543368, + "step": 1743, + "train_runtime": 14164.8321, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.056969696969697, + "grad_norm": 0.012492086738348007, + "learning_rate": 9.771503485171824e-05, + "loss": 0.011908994987607002, + "num_input_tokens_seen": 28559744, + "step": 1744, + "train_runtime": 14172.949, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.0575757575757576, + "grad_norm": 0.00868499930948019, + "learning_rate": 9.771216020450108e-05, + "loss": 0.011504937894642353, + "num_input_tokens_seen": 28576120, + "step": 1745, + "train_runtime": 14181.0686, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.0581818181818181, + "grad_norm": 0.006381432060152292, + "learning_rate": 9.770928379250399e-05, + "loss": 0.011398052796721458, + "num_input_tokens_seen": 28592496, + "step": 1746, + "train_runtime": 14189.1889, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.0587878787878788, + "grad_norm": 0.011300310492515564, + "learning_rate": 9.770640561583342e-05, + "loss": 0.013754375278949738, + "num_input_tokens_seen": 28608872, + "step": 1747, + "train_runtime": 14197.3074, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.0593939393939393, + "grad_norm": 0.01168846245855093, + "learning_rate": 9.770352567459582e-05, + "loss": 0.013645244762301445, + "num_input_tokens_seen": 28625248, + "step": 1748, + "train_runtime": 14205.4333, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.06, + "grad_norm": 0.010689773596823215, + "learning_rate": 9.770064396889769e-05, + "loss": 0.012318682856857777, + "num_input_tokens_seen": 28641624, + "step": 1749, + "train_runtime": 14213.5515, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.027544992044568062, + "learning_rate": 9.769776049884563e-05, + "loss": 0.011938882060348988, + "num_input_tokens_seen": 28658000, + "step": 1750, + "train_runtime": 14221.6712, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.0612121212121213, + "grad_norm": 0.014839374460279942, + "learning_rate": 9.769487526454631e-05, + "loss": 0.01188915129750967, + "num_input_tokens_seen": 28674376, + "step": 1751, + "train_runtime": 14229.7903, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.0618181818181818, + "grad_norm": 0.01328266877681017, + "learning_rate": 9.769198826610644e-05, + "loss": 0.013045158237218857, + "num_input_tokens_seen": 28690752, + "step": 1752, + "train_runtime": 14237.9087, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.0624242424242425, + "grad_norm": 0.005636645946651697, + "learning_rate": 9.768909950363278e-05, + "loss": 0.011337255127727985, + "num_input_tokens_seen": 28707128, + "step": 1753, + "train_runtime": 14246.0326, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.063030303030303, + "grad_norm": 0.013464851304888725, + "learning_rate": 9.76862089772322e-05, + "loss": 0.012251244857907295, + "num_input_tokens_seen": 28723504, + "step": 1754, + "train_runtime": 14254.1552, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.0636363636363637, + "grad_norm": 0.016677234321832657, + "learning_rate": 9.768331668701162e-05, + "loss": 0.012644241563975811, + "num_input_tokens_seen": 28739880, + "step": 1755, + "train_runtime": 14262.2737, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.0642424242424242, + "grad_norm": 0.013724375516176224, + "learning_rate": 9.768042263307804e-05, + "loss": 0.013380605727434158, + "num_input_tokens_seen": 28756256, + "step": 1756, + "train_runtime": 14270.3932, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.064848484848485, + "grad_norm": 0.008208510465919971, + "learning_rate": 9.767752681553845e-05, + "loss": 0.011122636497020721, + "num_input_tokens_seen": 28772632, + "step": 1757, + "train_runtime": 14278.513, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.0654545454545454, + "grad_norm": 0.03562194108963013, + "learning_rate": 9.76746292345e-05, + "loss": 0.01469709537923336, + "num_input_tokens_seen": 28789008, + "step": 1758, + "train_runtime": 14286.6323, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.0660606060606062, + "grad_norm": 0.008073823526501656, + "learning_rate": 9.767172989006985e-05, + "loss": 0.012804273515939713, + "num_input_tokens_seen": 28805384, + "step": 1759, + "train_runtime": 14294.7846, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.01726573519408703, + "learning_rate": 9.766882878235526e-05, + "loss": 0.012737629935145378, + "num_input_tokens_seen": 28821760, + "step": 1760, + "train_runtime": 14302.9039, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.0672727272727274, + "grad_norm": 0.009928864426910877, + "learning_rate": 9.766592591146352e-05, + "loss": 0.012318781577050686, + "num_input_tokens_seen": 28838136, + "step": 1761, + "train_runtime": 14311.023, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.0678787878787879, + "grad_norm": 0.010449480265378952, + "learning_rate": 9.7663021277502e-05, + "loss": 0.012027337215840816, + "num_input_tokens_seen": 28854512, + "step": 1762, + "train_runtime": 14319.1405, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.0684848484848484, + "grad_norm": 0.00951461587101221, + "learning_rate": 9.766011488057815e-05, + "loss": 0.012412112206220627, + "num_input_tokens_seen": 28870888, + "step": 1763, + "train_runtime": 14327.2603, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.069090909090909, + "grad_norm": 0.012439711019396782, + "learning_rate": 9.765720672079946e-05, + "loss": 0.012368155643343925, + "num_input_tokens_seen": 28887264, + "step": 1764, + "train_runtime": 14335.3786, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.0696969696969698, + "grad_norm": 0.027021143585443497, + "learning_rate": 9.76542967982735e-05, + "loss": 0.013114279136061668, + "num_input_tokens_seen": 28903640, + "step": 1765, + "train_runtime": 14343.4964, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.0703030303030303, + "grad_norm": 0.017193615436553955, + "learning_rate": 9.765138511310791e-05, + "loss": 0.015285216271877289, + "num_input_tokens_seen": 28920016, + "step": 1766, + "train_runtime": 14351.613, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.0709090909090908, + "grad_norm": 0.012239827774465084, + "learning_rate": 9.764847166541038e-05, + "loss": 0.011103234253823757, + "num_input_tokens_seen": 28936392, + "step": 1767, + "train_runtime": 14359.7327, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.0715151515151515, + "grad_norm": 0.015190470963716507, + "learning_rate": 9.764555645528867e-05, + "loss": 0.012309125624597073, + "num_input_tokens_seen": 28952768, + "step": 1768, + "train_runtime": 14367.8503, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.072121212121212, + "grad_norm": 0.06645633280277252, + "learning_rate": 9.764263948285062e-05, + "loss": 0.010960210114717484, + "num_input_tokens_seen": 28969144, + "step": 1769, + "train_runtime": 14375.9681, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.0727272727272728, + "grad_norm": 0.009598735719919205, + "learning_rate": 9.76397207482041e-05, + "loss": 0.011304397135972977, + "num_input_tokens_seen": 28985520, + "step": 1770, + "train_runtime": 14384.086, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.0733333333333333, + "grad_norm": 0.011472896672785282, + "learning_rate": 9.76368002514571e-05, + "loss": 0.011527102440595627, + "num_input_tokens_seen": 29001896, + "step": 1771, + "train_runtime": 14392.2036, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.073939393939394, + "grad_norm": 0.014919782057404518, + "learning_rate": 9.763387799271761e-05, + "loss": 0.012540474534034729, + "num_input_tokens_seen": 29018272, + "step": 1772, + "train_runtime": 14400.3221, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.0745454545454545, + "grad_norm": 0.015532581135630608, + "learning_rate": 9.763095397209374e-05, + "loss": 0.012600000947713852, + "num_input_tokens_seen": 29034648, + "step": 1773, + "train_runtime": 14408.4457, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.0751515151515152, + "grad_norm": 0.008681225590407848, + "learning_rate": 9.762802818969366e-05, + "loss": 0.0126079972833395, + "num_input_tokens_seen": 29051024, + "step": 1774, + "train_runtime": 14416.5656, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.0757575757575757, + "grad_norm": 0.03641294687986374, + "learning_rate": 9.762510064562556e-05, + "loss": 0.013490713201463223, + "num_input_tokens_seen": 29067400, + "step": 1775, + "train_runtime": 14424.6824, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.0763636363636364, + "grad_norm": 0.13002076745033264, + "learning_rate": 9.762217133999771e-05, + "loss": 0.014682717621326447, + "num_input_tokens_seen": 29083776, + "step": 1776, + "train_runtime": 14432.8024, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.076969696969697, + "grad_norm": 0.01150805689394474, + "learning_rate": 9.76192402729185e-05, + "loss": 0.013302959501743317, + "num_input_tokens_seen": 29100152, + "step": 1777, + "train_runtime": 14440.92, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.0775757575757576, + "grad_norm": 0.0106755206361413, + "learning_rate": 9.761630744449633e-05, + "loss": 0.01287130918353796, + "num_input_tokens_seen": 29116528, + "step": 1778, + "train_runtime": 14449.038, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.0781818181818181, + "grad_norm": 0.01342584379017353, + "learning_rate": 9.761337285483967e-05, + "loss": 0.01304157730191946, + "num_input_tokens_seen": 29132904, + "step": 1779, + "train_runtime": 14457.157, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.0787878787878789, + "grad_norm": 0.012149529531598091, + "learning_rate": 9.761043650405708e-05, + "loss": 0.013586745597422123, + "num_input_tokens_seen": 29149280, + "step": 1780, + "train_runtime": 14465.2764, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.0793939393939394, + "grad_norm": 0.030686961486935616, + "learning_rate": 9.760749839225714e-05, + "loss": 0.011370385065674782, + "num_input_tokens_seen": 29165656, + "step": 1781, + "train_runtime": 14473.3971, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.08, + "grad_norm": 0.011646251194179058, + "learning_rate": 9.760455851954857e-05, + "loss": 0.012405425310134888, + "num_input_tokens_seen": 29182032, + "step": 1782, + "train_runtime": 14481.5154, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.0806060606060606, + "grad_norm": 0.008761495351791382, + "learning_rate": 9.760161688604008e-05, + "loss": 0.012334661558270454, + "num_input_tokens_seen": 29198408, + "step": 1783, + "train_runtime": 14489.6462, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.0812121212121213, + "grad_norm": 0.017235441133379936, + "learning_rate": 9.759867349184046e-05, + "loss": 0.014072883874177933, + "num_input_tokens_seen": 29214784, + "step": 1784, + "train_runtime": 14497.764, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.0818181818181818, + "grad_norm": 0.012654728256165981, + "learning_rate": 9.759572833705864e-05, + "loss": 0.012484287843108177, + "num_input_tokens_seen": 29231160, + "step": 1785, + "train_runtime": 14505.8825, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.0824242424242425, + "grad_norm": 0.013732358813285828, + "learning_rate": 9.759278142180348e-05, + "loss": 0.01238732784986496, + "num_input_tokens_seen": 29247536, + "step": 1786, + "train_runtime": 14514.0013, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.083030303030303, + "grad_norm": 0.012203603982925415, + "learning_rate": 9.758983274618404e-05, + "loss": 0.012388849630951881, + "num_input_tokens_seen": 29263912, + "step": 1787, + "train_runtime": 14522.1208, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.0836363636363637, + "grad_norm": 0.013789234682917595, + "learning_rate": 9.758688231030935e-05, + "loss": 0.011297831311821938, + "num_input_tokens_seen": 29280288, + "step": 1788, + "train_runtime": 14530.2411, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.0842424242424242, + "grad_norm": 0.008793489076197147, + "learning_rate": 9.758393011428857e-05, + "loss": 0.012010754086077213, + "num_input_tokens_seen": 29296664, + "step": 1789, + "train_runtime": 14538.3598, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.084848484848485, + "grad_norm": 0.009672212414443493, + "learning_rate": 9.758097615823088e-05, + "loss": 0.01176269818097353, + "num_input_tokens_seen": 29313040, + "step": 1790, + "train_runtime": 14546.4774, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.0854545454545454, + "grad_norm": 0.05065310373902321, + "learning_rate": 9.757802044224553e-05, + "loss": 0.01442466676235199, + "num_input_tokens_seen": 29329416, + "step": 1791, + "train_runtime": 14554.6003, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.086060606060606, + "grad_norm": 0.013653626665472984, + "learning_rate": 9.757506296644186e-05, + "loss": 0.012618528679013252, + "num_input_tokens_seen": 29345792, + "step": 1792, + "train_runtime": 14562.7191, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.0866666666666667, + "grad_norm": 0.01730596460402012, + "learning_rate": 9.757210373092926e-05, + "loss": 0.012462806887924671, + "num_input_tokens_seen": 29362168, + "step": 1793, + "train_runtime": 14570.8367, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.0872727272727274, + "grad_norm": 0.03405005484819412, + "learning_rate": 9.756914273581718e-05, + "loss": 0.012813026085495949, + "num_input_tokens_seen": 29378544, + "step": 1794, + "train_runtime": 14578.957, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.087878787878788, + "grad_norm": 0.00812508538365364, + "learning_rate": 9.756617998121516e-05, + "loss": 0.012410092167556286, + "num_input_tokens_seen": 29394920, + "step": 1795, + "train_runtime": 14587.076, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.0884848484848484, + "grad_norm": 0.01273594330996275, + "learning_rate": 9.756321546723277e-05, + "loss": 0.014060670509934425, + "num_input_tokens_seen": 29411296, + "step": 1796, + "train_runtime": 14595.1942, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.089090909090909, + "grad_norm": 0.010581238195300102, + "learning_rate": 9.756024919397965e-05, + "loss": 0.013785408809781075, + "num_input_tokens_seen": 29427672, + "step": 1797, + "train_runtime": 14603.3146, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.0896969696969696, + "grad_norm": 0.008253599517047405, + "learning_rate": 9.755728116156555e-05, + "loss": 0.011731135658919811, + "num_input_tokens_seen": 29444048, + "step": 1798, + "train_runtime": 14611.434, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.0903030303030303, + "grad_norm": 0.00944508146494627, + "learning_rate": 9.755431137010023e-05, + "loss": 0.01250201091170311, + "num_input_tokens_seen": 29460424, + "step": 1799, + "train_runtime": 14619.551, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 0.011392886750400066, + "learning_rate": 9.755133981969353e-05, + "loss": 0.011629536747932434, + "num_input_tokens_seen": 29476800, + "step": 1800, + "train_runtime": 14627.6691, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.0915151515151515, + "grad_norm": 0.018327729776501656, + "learning_rate": 9.754836651045538e-05, + "loss": 0.014299526810646057, + "num_input_tokens_seen": 29493176, + "step": 1801, + "train_runtime": 14636.8186, + "train_tokens_per_second": 2014.999 + }, + { + "epoch": 1.092121212121212, + "grad_norm": 0.01780688390135765, + "learning_rate": 9.754539144249574e-05, + "loss": 0.012428405694663525, + "num_input_tokens_seen": 29509552, + "step": 1802, + "train_runtime": 14644.9369, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 1.0927272727272728, + "grad_norm": 0.01762632466852665, + "learning_rate": 9.754241461592468e-05, + "loss": 0.011811019852757454, + "num_input_tokens_seen": 29525928, + "step": 1803, + "train_runtime": 14653.0556, + "train_tokens_per_second": 2015.001 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 0.00704316608607769, + "learning_rate": 9.753943603085227e-05, + "loss": 0.011734440922737122, + "num_input_tokens_seen": 29542304, + "step": 1804, + "train_runtime": 14661.1926, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 1.093939393939394, + "grad_norm": 0.007165444549173117, + "learning_rate": 9.753645568738871e-05, + "loss": 0.012202661484479904, + "num_input_tokens_seen": 29558680, + "step": 1805, + "train_runtime": 14669.3124, + "train_tokens_per_second": 2015.001 + }, + { + "epoch": 1.0945454545454545, + "grad_norm": 0.007894719950854778, + "learning_rate": 9.753347358564423e-05, + "loss": 0.01162760891020298, + "num_input_tokens_seen": 29575056, + "step": 1806, + "train_runtime": 14677.4323, + "train_tokens_per_second": 2015.002 + }, + { + "epoch": 1.0951515151515152, + "grad_norm": 0.015263660810887814, + "learning_rate": 9.753048972572912e-05, + "loss": 0.014112107455730438, + "num_input_tokens_seen": 29591432, + "step": 1807, + "train_runtime": 14685.5514, + "train_tokens_per_second": 2015.003 + }, + { + "epoch": 1.0957575757575757, + "grad_norm": 0.015707993879914284, + "learning_rate": 9.752750410775377e-05, + "loss": 0.011696948669850826, + "num_input_tokens_seen": 29607808, + "step": 1808, + "train_runtime": 14693.6719, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 1.0963636363636364, + "grad_norm": 0.008728215470910072, + "learning_rate": 9.752451673182859e-05, + "loss": 0.01177802961319685, + "num_input_tokens_seen": 29624184, + "step": 1809, + "train_runtime": 14701.7939, + "train_tokens_per_second": 2015.005 + }, + { + "epoch": 1.096969696969697, + "grad_norm": 0.013884173706173897, + "learning_rate": 9.752152759806408e-05, + "loss": 0.012690341100096703, + "num_input_tokens_seen": 29640560, + "step": 1810, + "train_runtime": 14709.9132, + "train_tokens_per_second": 2015.006 + }, + { + "epoch": 1.0975757575757576, + "grad_norm": 0.01112292893230915, + "learning_rate": 9.751853670657081e-05, + "loss": 0.012653153389692307, + "num_input_tokens_seen": 29656936, + "step": 1811, + "train_runtime": 14718.0328, + "train_tokens_per_second": 2015.007 + }, + { + "epoch": 1.0981818181818181, + "grad_norm": 0.010812713764607906, + "learning_rate": 9.751554405745941e-05, + "loss": 0.012738914228975773, + "num_input_tokens_seen": 29673312, + "step": 1812, + "train_runtime": 14726.1501, + "train_tokens_per_second": 2015.008 + }, + { + "epoch": 1.0987878787878789, + "grad_norm": 0.010833236388862133, + "learning_rate": 9.751254965084056e-05, + "loss": 0.012412777170538902, + "num_input_tokens_seen": 29689688, + "step": 1813, + "train_runtime": 14734.2996, + "train_tokens_per_second": 2015.005 + }, + { + "epoch": 1.0993939393939394, + "grad_norm": 0.0037972007412463427, + "learning_rate": 9.750955348682503e-05, + "loss": 0.011926090344786644, + "num_input_tokens_seen": 29706064, + "step": 1814, + "train_runtime": 14742.4187, + "train_tokens_per_second": 2015.006 + }, + { + "epoch": 1.1, + "grad_norm": 0.010151666589081287, + "learning_rate": 9.750655556552364e-05, + "loss": 0.011667672544717789, + "num_input_tokens_seen": 29722440, + "step": 1815, + "train_runtime": 14750.5371, + "train_tokens_per_second": 2015.007 + }, + { + "epoch": 1.1006060606060606, + "grad_norm": 0.004636733792722225, + "learning_rate": 9.750355588704727e-05, + "loss": 0.0114663764834404, + "num_input_tokens_seen": 29738816, + "step": 1816, + "train_runtime": 14758.6548, + "train_tokens_per_second": 2015.009 + }, + { + "epoch": 1.1012121212121213, + "grad_norm": 0.009711910970509052, + "learning_rate": 9.750055445150688e-05, + "loss": 0.012894188985228539, + "num_input_tokens_seen": 29755192, + "step": 1817, + "train_runtime": 14766.7747, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 1.1018181818181818, + "grad_norm": 0.01187776681035757, + "learning_rate": 9.749755125901349e-05, + "loss": 0.013470055535435677, + "num_input_tokens_seen": 29771568, + "step": 1818, + "train_runtime": 14774.8985, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 1.1024242424242425, + "grad_norm": 0.014701352454721928, + "learning_rate": 9.749454630967816e-05, + "loss": 0.013217932544648647, + "num_input_tokens_seen": 29787944, + "step": 1819, + "train_runtime": 14783.0202, + "train_tokens_per_second": 2015.011 + }, + { + "epoch": 1.103030303030303, + "grad_norm": 0.01794923096895218, + "learning_rate": 9.749153960361207e-05, + "loss": 0.013999737799167633, + "num_input_tokens_seen": 29804320, + "step": 1820, + "train_runtime": 14791.1415, + "train_tokens_per_second": 2015.011 + }, + { + "epoch": 1.1036363636363635, + "grad_norm": 0.00407990999519825, + "learning_rate": 9.748853114092639e-05, + "loss": 0.012194567359983921, + "num_input_tokens_seen": 29820696, + "step": 1821, + "train_runtime": 14799.2637, + "train_tokens_per_second": 2015.012 + }, + { + "epoch": 1.1042424242424242, + "grad_norm": 0.015737803652882576, + "learning_rate": 9.748552092173246e-05, + "loss": 0.012318138033151627, + "num_input_tokens_seen": 29837072, + "step": 1822, + "train_runtime": 14807.3879, + "train_tokens_per_second": 2015.013 + }, + { + "epoch": 1.1048484848484847, + "grad_norm": 0.01409347727894783, + "learning_rate": 9.748250894614156e-05, + "loss": 0.013581490144133568, + "num_input_tokens_seen": 29853448, + "step": 1823, + "train_runtime": 14815.5037, + "train_tokens_per_second": 2015.014 + }, + { + "epoch": 1.1054545454545455, + "grad_norm": 0.014121908694505692, + "learning_rate": 9.747949521426514e-05, + "loss": 0.012862889096140862, + "num_input_tokens_seen": 29869824, + "step": 1824, + "train_runtime": 14823.6193, + "train_tokens_per_second": 2015.016 + }, + { + "epoch": 1.106060606060606, + "grad_norm": 0.007615895476192236, + "learning_rate": 9.747647972621463e-05, + "loss": 0.012592250481247902, + "num_input_tokens_seen": 29886200, + "step": 1825, + "train_runtime": 14831.7454, + "train_tokens_per_second": 2015.016 + }, + { + "epoch": 1.1066666666666667, + "grad_norm": 0.00876256637275219, + "learning_rate": 9.747346248210161e-05, + "loss": 0.011891753412783146, + "num_input_tokens_seen": 29902576, + "step": 1826, + "train_runtime": 14839.876, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 1.1072727272727272, + "grad_norm": 0.009708087891340256, + "learning_rate": 9.747044348203766e-05, + "loss": 0.013173967599868774, + "num_input_tokens_seen": 29918952, + "step": 1827, + "train_runtime": 14847.9958, + "train_tokens_per_second": 2015.016 + }, + { + "epoch": 1.107878787878788, + "grad_norm": 0.013138052076101303, + "learning_rate": 9.746742272613443e-05, + "loss": 0.012966789305210114, + "num_input_tokens_seen": 29935328, + "step": 1828, + "train_runtime": 14856.1159, + "train_tokens_per_second": 2015.017 + }, + { + "epoch": 1.1084848484848484, + "grad_norm": 0.008558280766010284, + "learning_rate": 9.74644002145037e-05, + "loss": 0.011896101757884026, + "num_input_tokens_seen": 29951704, + "step": 1829, + "train_runtime": 14864.2363, + "train_tokens_per_second": 2015.018 + }, + { + "epoch": 1.1090909090909091, + "grad_norm": 0.006839067209511995, + "learning_rate": 9.746137594725722e-05, + "loss": 0.013318931683897972, + "num_input_tokens_seen": 29968080, + "step": 1830, + "train_runtime": 14872.3569, + "train_tokens_per_second": 2015.019 + }, + { + "epoch": 1.1096969696969696, + "grad_norm": 0.011994832195341587, + "learning_rate": 9.745834992450689e-05, + "loss": 0.012568656355142593, + "num_input_tokens_seen": 29984456, + "step": 1831, + "train_runtime": 14880.4762, + "train_tokens_per_second": 2015.02 + }, + { + "epoch": 1.1103030303030303, + "grad_norm": 0.014069044031202793, + "learning_rate": 9.745532214636459e-05, + "loss": 0.01205383613705635, + "num_input_tokens_seen": 30000832, + "step": 1832, + "train_runtime": 14888.5973, + "train_tokens_per_second": 2015.021 + }, + { + "epoch": 1.1109090909090908, + "grad_norm": 0.011945140548050404, + "learning_rate": 9.745229261294235e-05, + "loss": 0.01266874186694622, + "num_input_tokens_seen": 30017208, + "step": 1833, + "train_runtime": 14896.7181, + "train_tokens_per_second": 2015.022 + }, + { + "epoch": 1.1115151515151516, + "grad_norm": 0.008917572908103466, + "learning_rate": 9.744926132435223e-05, + "loss": 0.012527624145150185, + "num_input_tokens_seen": 30033584, + "step": 1834, + "train_runtime": 14904.8405, + "train_tokens_per_second": 2015.022 + }, + { + "epoch": 1.112121212121212, + "grad_norm": 0.05755450576543808, + "learning_rate": 9.744622828070632e-05, + "loss": 0.013464560732245445, + "num_input_tokens_seen": 30049960, + "step": 1835, + "train_runtime": 14912.9616, + "train_tokens_per_second": 2015.023 + }, + { + "epoch": 1.1127272727272728, + "grad_norm": 0.08009150624275208, + "learning_rate": 9.744319348211684e-05, + "loss": 0.011920344084501266, + "num_input_tokens_seen": 30066336, + "step": 1836, + "train_runtime": 14921.0823, + "train_tokens_per_second": 2015.024 + }, + { + "epoch": 1.1133333333333333, + "grad_norm": 0.018737800419330597, + "learning_rate": 9.744015692869602e-05, + "loss": 0.013126276433467865, + "num_input_tokens_seen": 30082712, + "step": 1837, + "train_runtime": 14929.2031, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.113939393939394, + "grad_norm": 0.011901168152689934, + "learning_rate": 9.743711862055615e-05, + "loss": 0.013369777239859104, + "num_input_tokens_seen": 30099088, + "step": 1838, + "train_runtime": 14937.3337, + "train_tokens_per_second": 2015.024 + }, + { + "epoch": 1.1145454545454545, + "grad_norm": 0.014852079562842846, + "learning_rate": 9.743407855780969e-05, + "loss": 0.012921641580760479, + "num_input_tokens_seen": 30115464, + "step": 1839, + "train_runtime": 14945.4561, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.1151515151515152, + "grad_norm": 0.02092067152261734, + "learning_rate": 9.7431036740569e-05, + "loss": 0.01345036644488573, + "num_input_tokens_seen": 30131840, + "step": 1840, + "train_runtime": 14953.5781, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.1157575757575757, + "grad_norm": 0.014358977787196636, + "learning_rate": 9.742799316894663e-05, + "loss": 0.012646627612411976, + "num_input_tokens_seen": 30148216, + "step": 1841, + "train_runtime": 14961.699, + "train_tokens_per_second": 2015.026 + }, + { + "epoch": 1.1163636363636364, + "grad_norm": 0.008887351490557194, + "learning_rate": 9.742494784305518e-05, + "loss": 0.01288522221148014, + "num_input_tokens_seen": 30164592, + "step": 1842, + "train_runtime": 14969.8197, + "train_tokens_per_second": 2015.027 + }, + { + "epoch": 1.116969696969697, + "grad_norm": 0.011526895686984062, + "learning_rate": 9.742190076300726e-05, + "loss": 0.013668229803442955, + "num_input_tokens_seen": 30180968, + "step": 1843, + "train_runtime": 14977.9403, + "train_tokens_per_second": 2015.028 + }, + { + "epoch": 1.1175757575757577, + "grad_norm": 0.007548601366579533, + "learning_rate": 9.741885192891556e-05, + "loss": 0.012254860252141953, + "num_input_tokens_seen": 30197344, + "step": 1844, + "train_runtime": 14986.0602, + "train_tokens_per_second": 2015.029 + }, + { + "epoch": 1.1181818181818182, + "grad_norm": 0.008887049742043018, + "learning_rate": 9.74158013408929e-05, + "loss": 0.012711411342024803, + "num_input_tokens_seen": 30213720, + "step": 1845, + "train_runtime": 14994.181, + "train_tokens_per_second": 2015.03 + }, + { + "epoch": 1.1187878787878789, + "grad_norm": 0.009076109156012535, + "learning_rate": 9.741274899905207e-05, + "loss": 0.011722360737621784, + "num_input_tokens_seen": 30230096, + "step": 1846, + "train_runtime": 15002.3018, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 1.1193939393939394, + "grad_norm": 0.00916222669184208, + "learning_rate": 9.740969490350598e-05, + "loss": 0.012270544655621052, + "num_input_tokens_seen": 30246472, + "step": 1847, + "train_runtime": 15010.4324, + "train_tokens_per_second": 2015.03 + }, + { + "epoch": 1.12, + "grad_norm": 0.013789625838398933, + "learning_rate": 9.74066390543676e-05, + "loss": 0.011830583214759827, + "num_input_tokens_seen": 30262848, + "step": 1848, + "train_runtime": 15018.5537, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 1.1206060606060606, + "grad_norm": 0.00897382665425539, + "learning_rate": 9.740358145174998e-05, + "loss": 0.012301658280193806, + "num_input_tokens_seen": 30279224, + "step": 1849, + "train_runtime": 15026.6729, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 1.121212121212121, + "grad_norm": 0.013259979896247387, + "learning_rate": 9.740052209576619e-05, + "loss": 0.013160964474081993, + "num_input_tokens_seen": 30295600, + "step": 1850, + "train_runtime": 15034.7937, + "train_tokens_per_second": 2015.033 + }, + { + "epoch": 1.1218181818181818, + "grad_norm": 0.009648144245147705, + "learning_rate": 9.739746098652939e-05, + "loss": 0.013108627870678902, + "num_input_tokens_seen": 30311976, + "step": 1851, + "train_runtime": 15042.9152, + "train_tokens_per_second": 2015.033 + }, + { + "epoch": 1.1224242424242423, + "grad_norm": 0.006125753745436668, + "learning_rate": 9.739439812415281e-05, + "loss": 0.012194282375276089, + "num_input_tokens_seen": 30328352, + "step": 1852, + "train_runtime": 15051.0341, + "train_tokens_per_second": 2015.034 + }, + { + "epoch": 1.123030303030303, + "grad_norm": 0.01046109851449728, + "learning_rate": 9.739133350874974e-05, + "loss": 0.012304945848882198, + "num_input_tokens_seen": 30344728, + "step": 1853, + "train_runtime": 15059.1522, + "train_tokens_per_second": 2015.036 + }, + { + "epoch": 1.1236363636363635, + "grad_norm": 0.01071660965681076, + "learning_rate": 9.738826714043354e-05, + "loss": 0.011550496332347393, + "num_input_tokens_seen": 30361104, + "step": 1854, + "train_runtime": 15067.2715, + "train_tokens_per_second": 2015.037 + }, + { + "epoch": 1.1242424242424243, + "grad_norm": 0.020824618637561798, + "learning_rate": 9.738519901931762e-05, + "loss": 0.013716255314648151, + "num_input_tokens_seen": 30377480, + "step": 1855, + "train_runtime": 15075.3926, + "train_tokens_per_second": 2015.037 + }, + { + "epoch": 1.1248484848484848, + "grad_norm": 0.010834557004272938, + "learning_rate": 9.738212914551547e-05, + "loss": 0.012743671424686909, + "num_input_tokens_seen": 30393856, + "step": 1856, + "train_runtime": 15083.5149, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.1254545454545455, + "grad_norm": 0.012935217469930649, + "learning_rate": 9.737905751914063e-05, + "loss": 0.014386632479727268, + "num_input_tokens_seen": 30410232, + "step": 1857, + "train_runtime": 15091.6366, + "train_tokens_per_second": 2015.039 + }, + { + "epoch": 1.126060606060606, + "grad_norm": 0.014889142476022243, + "learning_rate": 9.737598414030673e-05, + "loss": 0.012645886279642582, + "num_input_tokens_seen": 30426608, + "step": 1858, + "train_runtime": 15099.7572, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.1266666666666667, + "grad_norm": 0.04802517592906952, + "learning_rate": 9.737290900912743e-05, + "loss": 0.014930625446140766, + "num_input_tokens_seen": 30442984, + "step": 1859, + "train_runtime": 15107.8772, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.1272727272727272, + "grad_norm": 0.02512347884476185, + "learning_rate": 9.736983212571646e-05, + "loss": 0.013300512917339802, + "num_input_tokens_seen": 30459360, + "step": 1860, + "train_runtime": 15115.9968, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 1.127878787878788, + "grad_norm": 0.023098865523934364, + "learning_rate": 9.736675349018767e-05, + "loss": 0.01131696067750454, + "num_input_tokens_seen": 30475736, + "step": 1861, + "train_runtime": 15124.1176, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.1284848484848484, + "grad_norm": 0.014170211739838123, + "learning_rate": 9.736367310265492e-05, + "loss": 0.013279788196086884, + "num_input_tokens_seen": 30492112, + "step": 1862, + "train_runtime": 15132.2381, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.1290909090909091, + "grad_norm": 0.00675050588324666, + "learning_rate": 9.736059096323212e-05, + "loss": 0.012112347409129143, + "num_input_tokens_seen": 30508488, + "step": 1863, + "train_runtime": 15140.3587, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.1296969696969696, + "grad_norm": 0.01739928312599659, + "learning_rate": 9.735750707203331e-05, + "loss": 0.013913700357079506, + "num_input_tokens_seen": 30524864, + "step": 1864, + "train_runtime": 15148.4784, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.1303030303030304, + "grad_norm": 0.005056953988969326, + "learning_rate": 9.73544214291725e-05, + "loss": 0.010809306055307388, + "num_input_tokens_seen": 30541240, + "step": 1865, + "train_runtime": 15156.5973, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.1309090909090909, + "grad_norm": 0.01127872709184885, + "learning_rate": 9.73513340347639e-05, + "loss": 0.011840720660984516, + "num_input_tokens_seen": 30557616, + "step": 1866, + "train_runtime": 15164.7169, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.1315151515151516, + "grad_norm": 0.010311625897884369, + "learning_rate": 9.734824488892164e-05, + "loss": 0.011924706399440765, + "num_input_tokens_seen": 30573992, + "step": 1867, + "train_runtime": 15172.8357, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.132121212121212, + "grad_norm": 0.007677890360355377, + "learning_rate": 9.734515399176003e-05, + "loss": 0.012517043389379978, + "num_input_tokens_seen": 30590368, + "step": 1868, + "train_runtime": 15180.9572, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.1327272727272728, + "grad_norm": 0.004882013890892267, + "learning_rate": 9.734206134339337e-05, + "loss": 0.011892163194715977, + "num_input_tokens_seen": 30606744, + "step": 1869, + "train_runtime": 15189.0791, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.012440717779099941, + "learning_rate": 9.733896694393605e-05, + "loss": 0.012283443473279476, + "num_input_tokens_seen": 30623120, + "step": 1870, + "train_runtime": 15197.1978, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.133939393939394, + "grad_norm": 0.010757396928966045, + "learning_rate": 9.733587079350252e-05, + "loss": 0.011945050209760666, + "num_input_tokens_seen": 30639496, + "step": 1871, + "train_runtime": 15205.3192, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.1345454545454545, + "grad_norm": 0.028685016557574272, + "learning_rate": 9.733277289220733e-05, + "loss": 0.012834792956709862, + "num_input_tokens_seen": 30655872, + "step": 1872, + "train_runtime": 15213.4408, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.1351515151515152, + "grad_norm": 0.009486911818385124, + "learning_rate": 9.732967324016504e-05, + "loss": 0.011870292015373707, + "num_input_tokens_seen": 30672248, + "step": 1873, + "train_runtime": 15221.5618, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.1357575757575757, + "grad_norm": 0.010406363755464554, + "learning_rate": 9.732657183749029e-05, + "loss": 0.014183721505105495, + "num_input_tokens_seen": 30688624, + "step": 1874, + "train_runtime": 15229.6819, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.00912264920771122, + "learning_rate": 9.732346868429784e-05, + "loss": 0.012175610288977623, + "num_input_tokens_seen": 30705000, + "step": 1875, + "train_runtime": 15237.8034, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.136969696969697, + "grad_norm": 0.013175376690924168, + "learning_rate": 9.732036378070243e-05, + "loss": 0.011904444545507431, + "num_input_tokens_seen": 30721376, + "step": 1876, + "train_runtime": 15245.9237, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.1375757575757577, + "grad_norm": 0.019623400643467903, + "learning_rate": 9.731725712681892e-05, + "loss": 0.012222235091030598, + "num_input_tokens_seen": 30737752, + "step": 1877, + "train_runtime": 15254.0434, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.1381818181818182, + "grad_norm": 0.011514107696712017, + "learning_rate": 9.731414872276221e-05, + "loss": 0.012634092941880226, + "num_input_tokens_seen": 30754128, + "step": 1878, + "train_runtime": 15262.1622, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.1387878787878787, + "grad_norm": 0.010799894109368324, + "learning_rate": 9.731103856864728e-05, + "loss": 0.012497657909989357, + "num_input_tokens_seen": 30770504, + "step": 1879, + "train_runtime": 15270.2827, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.1393939393939394, + "grad_norm": 0.006618468556553125, + "learning_rate": 9.730792666458916e-05, + "loss": 0.011121107265353203, + "num_input_tokens_seen": 30786880, + "step": 1880, + "train_runtime": 15278.4036, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 0.01856573298573494, + "learning_rate": 9.730481301070298e-05, + "loss": 0.012096052058041096, + "num_input_tokens_seen": 30803256, + "step": 1881, + "train_runtime": 15286.5234, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.1406060606060606, + "grad_norm": 0.0105481231585145, + "learning_rate": 9.730169760710386e-05, + "loss": 0.012894745916128159, + "num_input_tokens_seen": 30819632, + "step": 1882, + "train_runtime": 15294.6412, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.1412121212121211, + "grad_norm": 0.006541201379150152, + "learning_rate": 9.729858045390708e-05, + "loss": 0.011877333745360374, + "num_input_tokens_seen": 30836008, + "step": 1883, + "train_runtime": 15302.7603, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.1418181818181818, + "grad_norm": 0.011717238463461399, + "learning_rate": 9.729546155122792e-05, + "loss": 0.011121803894639015, + "num_input_tokens_seen": 30852384, + "step": 1884, + "train_runtime": 15310.8784, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.1424242424242423, + "grad_norm": 0.01204200740903616, + "learning_rate": 9.729234089918173e-05, + "loss": 0.012877327390015125, + "num_input_tokens_seen": 30868760, + "step": 1885, + "train_runtime": 15318.9981, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.143030303030303, + "grad_norm": 0.009307028725743294, + "learning_rate": 9.728921849788397e-05, + "loss": 0.011277549900114536, + "num_input_tokens_seen": 30885136, + "step": 1886, + "train_runtime": 15327.1181, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.1436363636363636, + "grad_norm": 0.00815204344689846, + "learning_rate": 9.728609434745009e-05, + "loss": 0.012305430136620998, + "num_input_tokens_seen": 30901512, + "step": 1887, + "train_runtime": 15335.2383, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.1442424242424243, + "grad_norm": 0.013295816257596016, + "learning_rate": 9.728296844799567e-05, + "loss": 0.011643802747130394, + "num_input_tokens_seen": 30917888, + "step": 1888, + "train_runtime": 15343.3603, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.1448484848484848, + "grad_norm": 0.015981702134013176, + "learning_rate": 9.727984079963632e-05, + "loss": 0.011237949132919312, + "num_input_tokens_seen": 30934264, + "step": 1889, + "train_runtime": 15351.4806, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.1454545454545455, + "grad_norm": 0.014281482435762882, + "learning_rate": 9.727671140248775e-05, + "loss": 0.0122231962159276, + "num_input_tokens_seen": 30950640, + "step": 1890, + "train_runtime": 15359.6018, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.146060606060606, + "grad_norm": 0.011495551094412804, + "learning_rate": 9.727358025666568e-05, + "loss": 0.012298443354666233, + "num_input_tokens_seen": 30967016, + "step": 1891, + "train_runtime": 15367.7225, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 0.01308425236493349, + "learning_rate": 9.727044736228594e-05, + "loss": 0.013174796476960182, + "num_input_tokens_seen": 30983392, + "step": 1892, + "train_runtime": 15375.8433, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.1472727272727272, + "grad_norm": 0.01665692962706089, + "learning_rate": 9.726731271946441e-05, + "loss": 0.013839912600815296, + "num_input_tokens_seen": 30999768, + "step": 1893, + "train_runtime": 15383.9648, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.147878787878788, + "grad_norm": 0.011216187849640846, + "learning_rate": 9.726417632831701e-05, + "loss": 0.012092461809515953, + "num_input_tokens_seen": 31016144, + "step": 1894, + "train_runtime": 15392.0887, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.1484848484848484, + "grad_norm": 0.0110325887799263, + "learning_rate": 9.72610381889598e-05, + "loss": 0.013044838793575764, + "num_input_tokens_seen": 31032520, + "step": 1895, + "train_runtime": 15400.2084, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.1490909090909092, + "grad_norm": 0.011509820818901062, + "learning_rate": 9.725789830150882e-05, + "loss": 0.012543351389467716, + "num_input_tokens_seen": 31048896, + "step": 1896, + "train_runtime": 15408.3347, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.1496969696969697, + "grad_norm": 0.016707783564925194, + "learning_rate": 9.725475666608019e-05, + "loss": 0.013459472917020321, + "num_input_tokens_seen": 31065272, + "step": 1897, + "train_runtime": 15416.4555, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.1503030303030304, + "grad_norm": 0.016305526718497276, + "learning_rate": 9.725161328279016e-05, + "loss": 0.012954285368323326, + "num_input_tokens_seen": 31081648, + "step": 1898, + "train_runtime": 15424.5766, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.1509090909090909, + "grad_norm": 0.014199224300682545, + "learning_rate": 9.724846815175495e-05, + "loss": 0.01172240823507309, + "num_input_tokens_seen": 31098024, + "step": 1899, + "train_runtime": 15432.6971, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.1515151515151516, + "grad_norm": 0.014520173892378807, + "learning_rate": 9.724532127309094e-05, + "loss": 0.013250859454274178, + "num_input_tokens_seen": 31114400, + "step": 1900, + "train_runtime": 15440.8165, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.152121212121212, + "grad_norm": 0.014697631821036339, + "learning_rate": 9.724217264691448e-05, + "loss": 0.01238673273473978, + "num_input_tokens_seen": 31130776, + "step": 1901, + "train_runtime": 15449.8926, + "train_tokens_per_second": 2014.951 + }, + { + "epoch": 1.1527272727272728, + "grad_norm": 0.011663121171295643, + "learning_rate": 9.723902227334207e-05, + "loss": 0.012560025788843632, + "num_input_tokens_seen": 31147152, + "step": 1902, + "train_runtime": 15458.0103, + "train_tokens_per_second": 2014.952 + }, + { + "epoch": 1.1533333333333333, + "grad_norm": 0.013661185279488564, + "learning_rate": 9.723587015249021e-05, + "loss": 0.013027054257690907, + "num_input_tokens_seen": 31163528, + "step": 1903, + "train_runtime": 15466.1331, + "train_tokens_per_second": 2014.953 + }, + { + "epoch": 1.1539393939393938, + "grad_norm": 0.01441930141299963, + "learning_rate": 9.72327162844755e-05, + "loss": 0.012912960723042488, + "num_input_tokens_seen": 31179904, + "step": 1904, + "train_runtime": 15474.2486, + "train_tokens_per_second": 2014.954 + }, + { + "epoch": 1.1545454545454545, + "grad_norm": 0.008236902765929699, + "learning_rate": 9.72295606694146e-05, + "loss": 0.01216426957398653, + "num_input_tokens_seen": 31196280, + "step": 1905, + "train_runtime": 15482.3692, + "train_tokens_per_second": 2014.955 + }, + { + "epoch": 1.1551515151515153, + "grad_norm": 0.007718207780271769, + "learning_rate": 9.722640330742423e-05, + "loss": 0.013841992244124413, + "num_input_tokens_seen": 31212656, + "step": 1906, + "train_runtime": 15490.489, + "train_tokens_per_second": 2014.956 + }, + { + "epoch": 1.1557575757575758, + "grad_norm": 0.013511128723621368, + "learning_rate": 9.722324419862116e-05, + "loss": 0.014025630429387093, + "num_input_tokens_seen": 31229032, + "step": 1907, + "train_runtime": 15498.6085, + "train_tokens_per_second": 2014.957 + }, + { + "epoch": 1.1563636363636363, + "grad_norm": 0.01642940379679203, + "learning_rate": 9.722008334312227e-05, + "loss": 0.011956113390624523, + "num_input_tokens_seen": 31245408, + "step": 1908, + "train_runtime": 15506.7329, + "train_tokens_per_second": 2014.958 + }, + { + "epoch": 1.156969696969697, + "grad_norm": 0.0073166899383068085, + "learning_rate": 9.721692074104444e-05, + "loss": 0.011238785460591316, + "num_input_tokens_seen": 31261784, + "step": 1909, + "train_runtime": 15514.852, + "train_tokens_per_second": 2014.959 + }, + { + "epoch": 1.1575757575757575, + "grad_norm": 0.017258716747164726, + "learning_rate": 9.721375639250467e-05, + "loss": 0.012529391795396805, + "num_input_tokens_seen": 31278160, + "step": 1910, + "train_runtime": 15522.9703, + "train_tokens_per_second": 2014.96 + }, + { + "epoch": 1.1581818181818182, + "grad_norm": 0.006015890743583441, + "learning_rate": 9.721059029761999e-05, + "loss": 0.012787789106369019, + "num_input_tokens_seen": 31294536, + "step": 1911, + "train_runtime": 15531.0907, + "train_tokens_per_second": 2014.961 + }, + { + "epoch": 1.1587878787878787, + "grad_norm": 0.01181771419942379, + "learning_rate": 9.720742245650751e-05, + "loss": 0.013402738608419895, + "num_input_tokens_seen": 31310912, + "step": 1912, + "train_runtime": 15539.2103, + "train_tokens_per_second": 2014.962 + }, + { + "epoch": 1.1593939393939394, + "grad_norm": 0.014448689296841621, + "learning_rate": 9.72042528692844e-05, + "loss": 0.012921266257762909, + "num_input_tokens_seen": 31327288, + "step": 1913, + "train_runtime": 15547.3368, + "train_tokens_per_second": 2014.962 + }, + { + "epoch": 1.16, + "grad_norm": 0.01236814260482788, + "learning_rate": 9.720108153606792e-05, + "loss": 0.01249447651207447, + "num_input_tokens_seen": 31343664, + "step": 1914, + "train_runtime": 15555.455, + "train_tokens_per_second": 2014.963 + }, + { + "epoch": 1.1606060606060606, + "grad_norm": 0.012424842454493046, + "learning_rate": 9.719790845697533e-05, + "loss": 0.013374033384025097, + "num_input_tokens_seen": 31360040, + "step": 1915, + "train_runtime": 15563.5726, + "train_tokens_per_second": 2014.964 + }, + { + "epoch": 1.1612121212121211, + "grad_norm": 0.008714928291738033, + "learning_rate": 9.719473363212405e-05, + "loss": 0.013384588994085789, + "num_input_tokens_seen": 31376416, + "step": 1916, + "train_runtime": 15571.6914, + "train_tokens_per_second": 2014.965 + }, + { + "epoch": 1.1618181818181819, + "grad_norm": 0.01234753243625164, + "learning_rate": 9.719155706163145e-05, + "loss": 0.013264678418636322, + "num_input_tokens_seen": 31392792, + "step": 1917, + "train_runtime": 15579.8107, + "train_tokens_per_second": 2014.966 + }, + { + "epoch": 1.1624242424242424, + "grad_norm": 0.008164497092366219, + "learning_rate": 9.718837874561509e-05, + "loss": 0.011975225061178207, + "num_input_tokens_seen": 31409168, + "step": 1918, + "train_runtime": 15587.9331, + "train_tokens_per_second": 2014.967 + }, + { + "epoch": 1.163030303030303, + "grad_norm": 0.01258091814815998, + "learning_rate": 9.718519868419247e-05, + "loss": 0.012896685861051083, + "num_input_tokens_seen": 31425544, + "step": 1919, + "train_runtime": 15596.052, + "train_tokens_per_second": 2014.968 + }, + { + "epoch": 1.1636363636363636, + "grad_norm": 0.014153181575238705, + "learning_rate": 9.718201687748126e-05, + "loss": 0.011028273962438107, + "num_input_tokens_seen": 31441920, + "step": 1920, + "train_runtime": 15604.1723, + "train_tokens_per_second": 2014.969 + }, + { + "epoch": 1.1642424242424243, + "grad_norm": 0.010321607813239098, + "learning_rate": 9.71788333255991e-05, + "loss": 0.012135772034525871, + "num_input_tokens_seen": 31458296, + "step": 1921, + "train_runtime": 15612.2941, + "train_tokens_per_second": 2014.969 + }, + { + "epoch": 1.1648484848484848, + "grad_norm": 0.007093664258718491, + "learning_rate": 9.717564802866379e-05, + "loss": 0.012399922125041485, + "num_input_tokens_seen": 31474672, + "step": 1922, + "train_runtime": 15620.4151, + "train_tokens_per_second": 2014.97 + }, + { + "epoch": 1.1654545454545455, + "grad_norm": 0.023240283131599426, + "learning_rate": 9.717246098679313e-05, + "loss": 0.014058588072657585, + "num_input_tokens_seen": 31491048, + "step": 1923, + "train_runtime": 15628.5367, + "train_tokens_per_second": 2014.971 + }, + { + "epoch": 1.166060606060606, + "grad_norm": 0.009191809222102165, + "learning_rate": 9.716927220010499e-05, + "loss": 0.013260525651276112, + "num_input_tokens_seen": 31507424, + "step": 1924, + "train_runtime": 15636.6557, + "train_tokens_per_second": 2014.972 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.011490467004477978, + "learning_rate": 9.716608166871735e-05, + "loss": 0.011423847638070583, + "num_input_tokens_seen": 31523800, + "step": 1925, + "train_runtime": 15644.7767, + "train_tokens_per_second": 2014.973 + }, + { + "epoch": 1.1672727272727272, + "grad_norm": 0.01001172885298729, + "learning_rate": 9.716288939274819e-05, + "loss": 0.013747838325798512, + "num_input_tokens_seen": 31540176, + "step": 1926, + "train_runtime": 15652.8971, + "train_tokens_per_second": 2014.974 + }, + { + "epoch": 1.167878787878788, + "grad_norm": 0.016452116891741753, + "learning_rate": 9.715969537231559e-05, + "loss": 0.013036968186497688, + "num_input_tokens_seen": 31556552, + "step": 1927, + "train_runtime": 15661.0189, + "train_tokens_per_second": 2014.974 + }, + { + "epoch": 1.1684848484848485, + "grad_norm": 0.04086410999298096, + "learning_rate": 9.71564996075377e-05, + "loss": 0.011422049254179, + "num_input_tokens_seen": 31572928, + "step": 1928, + "train_runtime": 15669.1386, + "train_tokens_per_second": 2014.975 + }, + { + "epoch": 1.1690909090909092, + "grad_norm": 0.01278830785304308, + "learning_rate": 9.715330209853272e-05, + "loss": 0.012607689946889877, + "num_input_tokens_seen": 31589304, + "step": 1929, + "train_runtime": 15677.2541, + "train_tokens_per_second": 2014.977 + }, + { + "epoch": 1.1696969696969697, + "grad_norm": 0.007186457980424166, + "learning_rate": 9.715010284541894e-05, + "loss": 0.012316851876676083, + "num_input_tokens_seen": 31605680, + "step": 1930, + "train_runtime": 15685.3619, + "train_tokens_per_second": 2014.979 + }, + { + "epoch": 1.1703030303030304, + "grad_norm": 0.006031945813447237, + "learning_rate": 9.714690184831465e-05, + "loss": 0.01131855882704258, + "num_input_tokens_seen": 31622056, + "step": 1931, + "train_runtime": 15693.4757, + "train_tokens_per_second": 2014.981 + }, + { + "epoch": 1.170909090909091, + "grad_norm": 0.011418391950428486, + "learning_rate": 9.714369910733829e-05, + "loss": 0.012706535868346691, + "num_input_tokens_seen": 31638432, + "step": 1932, + "train_runtime": 15701.5872, + "train_tokens_per_second": 2014.983 + }, + { + "epoch": 1.1715151515151514, + "grad_norm": 0.019623173400759697, + "learning_rate": 9.714049462260833e-05, + "loss": 0.012899842113256454, + "num_input_tokens_seen": 31654808, + "step": 1933, + "train_runtime": 15709.6966, + "train_tokens_per_second": 2014.985 + }, + { + "epoch": 1.1721212121212121, + "grad_norm": 0.006744697690010071, + "learning_rate": 9.713728839424325e-05, + "loss": 0.011626766063272953, + "num_input_tokens_seen": 31671184, + "step": 1934, + "train_runtime": 15717.8085, + "train_tokens_per_second": 2014.987 + }, + { + "epoch": 1.1727272727272728, + "grad_norm": 0.011822863481938839, + "learning_rate": 9.713408042236166e-05, + "loss": 0.012239542789757252, + "num_input_tokens_seen": 31687560, + "step": 1935, + "train_runtime": 15725.9209, + "train_tokens_per_second": 2014.989 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 0.015078735537827015, + "learning_rate": 9.713087070708224e-05, + "loss": 0.012939630076289177, + "num_input_tokens_seen": 31703936, + "step": 1936, + "train_runtime": 15734.034, + "train_tokens_per_second": 2014.991 + }, + { + "epoch": 1.1739393939393938, + "grad_norm": 0.009683185257017612, + "learning_rate": 9.71276592485237e-05, + "loss": 0.011823873035609722, + "num_input_tokens_seen": 31720312, + "step": 1937, + "train_runtime": 15742.146, + "train_tokens_per_second": 2014.993 + }, + { + "epoch": 1.1745454545454546, + "grad_norm": 0.022101467475295067, + "learning_rate": 9.712444604680481e-05, + "loss": 0.013082625344395638, + "num_input_tokens_seen": 31736688, + "step": 1938, + "train_runtime": 15750.2574, + "train_tokens_per_second": 2014.995 + }, + { + "epoch": 1.175151515151515, + "grad_norm": 0.011323574930429459, + "learning_rate": 9.712123110204442e-05, + "loss": 0.012833611108362675, + "num_input_tokens_seen": 31753064, + "step": 1939, + "train_runtime": 15758.3713, + "train_tokens_per_second": 2014.997 + }, + { + "epoch": 1.1757575757575758, + "grad_norm": 0.008909697644412518, + "learning_rate": 9.711801441436148e-05, + "loss": 0.012643275782465935, + "num_input_tokens_seen": 31769440, + "step": 1940, + "train_runtime": 15766.4811, + "train_tokens_per_second": 2014.999 + }, + { + "epoch": 1.1763636363636363, + "grad_norm": 0.016408922150731087, + "learning_rate": 9.711479598387494e-05, + "loss": 0.011867276392877102, + "num_input_tokens_seen": 31785816, + "step": 1941, + "train_runtime": 15774.5926, + "train_tokens_per_second": 2015.001 + }, + { + "epoch": 1.176969696969697, + "grad_norm": 0.012622256763279438, + "learning_rate": 9.711157581070385e-05, + "loss": 0.011785149574279785, + "num_input_tokens_seen": 31802192, + "step": 1942, + "train_runtime": 15782.7054, + "train_tokens_per_second": 2015.003 + }, + { + "epoch": 1.1775757575757575, + "grad_norm": 0.009545985609292984, + "learning_rate": 9.71083538949673e-05, + "loss": 0.01227510068565607, + "num_input_tokens_seen": 31818568, + "step": 1943, + "train_runtime": 15790.8177, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 1.1781818181818182, + "grad_norm": 0.012797760777175426, + "learning_rate": 9.710513023678449e-05, + "loss": 0.0132676362991333, + "num_input_tokens_seen": 31834944, + "step": 1944, + "train_runtime": 15798.9326, + "train_tokens_per_second": 2015.006 + }, + { + "epoch": 1.1787878787878787, + "grad_norm": 0.005393213592469692, + "learning_rate": 9.710190483627465e-05, + "loss": 0.01224264781922102, + "num_input_tokens_seen": 31851320, + "step": 1945, + "train_runtime": 15807.0427, + "train_tokens_per_second": 2015.008 + }, + { + "epoch": 1.1793939393939394, + "grad_norm": 0.0058806887827813625, + "learning_rate": 9.709867769355707e-05, + "loss": 0.012036345899105072, + "num_input_tokens_seen": 31867696, + "step": 1946, + "train_runtime": 15815.1548, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 1.18, + "grad_norm": 0.012321457266807556, + "learning_rate": 9.709544880875113e-05, + "loss": 0.011586939916014671, + "num_input_tokens_seen": 31884072, + "step": 1947, + "train_runtime": 15823.2685, + "train_tokens_per_second": 2015.012 + }, + { + "epoch": 1.1806060606060607, + "grad_norm": 0.0094247255474329, + "learning_rate": 9.709221818197624e-05, + "loss": 0.012648900970816612, + "num_input_tokens_seen": 31900448, + "step": 1948, + "train_runtime": 15831.3802, + "train_tokens_per_second": 2015.014 + }, + { + "epoch": 1.1812121212121212, + "grad_norm": 0.01918826624751091, + "learning_rate": 9.70889858133519e-05, + "loss": 0.014765182510018349, + "num_input_tokens_seen": 31916824, + "step": 1949, + "train_runtime": 15839.4904, + "train_tokens_per_second": 2015.016 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 0.00749099301174283, + "learning_rate": 9.708575170299771e-05, + "loss": 0.012448623776435852, + "num_input_tokens_seen": 31933200, + "step": 1950, + "train_runtime": 15847.6013, + "train_tokens_per_second": 2015.018 + }, + { + "epoch": 1.1824242424242424, + "grad_norm": 0.012216299772262573, + "learning_rate": 9.708251585103322e-05, + "loss": 0.011859457939863205, + "num_input_tokens_seen": 31949576, + "step": 1951, + "train_runtime": 15855.711, + "train_tokens_per_second": 2015.02 + }, + { + "epoch": 1.183030303030303, + "grad_norm": 0.007972866296768188, + "learning_rate": 9.707927825757819e-05, + "loss": 0.013553624972701073, + "num_input_tokens_seen": 31965952, + "step": 1952, + "train_runtime": 15863.8207, + "train_tokens_per_second": 2015.022 + }, + { + "epoch": 1.1836363636363636, + "grad_norm": 0.009913386777043343, + "learning_rate": 9.707603892275233e-05, + "loss": 0.012638632208108902, + "num_input_tokens_seen": 31982328, + "step": 1953, + "train_runtime": 15871.9335, + "train_tokens_per_second": 2015.024 + }, + { + "epoch": 1.1842424242424243, + "grad_norm": 0.008947964757680893, + "learning_rate": 9.707279784667547e-05, + "loss": 0.011412415653467178, + "num_input_tokens_seen": 31998704, + "step": 1954, + "train_runtime": 15880.0463, + "train_tokens_per_second": 2015.026 + }, + { + "epoch": 1.1848484848484848, + "grad_norm": 0.011816347017884254, + "learning_rate": 9.706955502946748e-05, + "loss": 0.013120824471116066, + "num_input_tokens_seen": 32015080, + "step": 1955, + "train_runtime": 15888.1549, + "train_tokens_per_second": 2015.028 + }, + { + "epoch": 1.1854545454545455, + "grad_norm": 0.01560800801962614, + "learning_rate": 9.706631047124833e-05, + "loss": 0.013406200334429741, + "num_input_tokens_seen": 32031456, + "step": 1956, + "train_runtime": 15896.2624, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 1.186060606060606, + "grad_norm": 0.007744067348539829, + "learning_rate": 9.706306417213798e-05, + "loss": 0.011934707872569561, + "num_input_tokens_seen": 32047832, + "step": 1957, + "train_runtime": 15904.3754, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 1.1866666666666668, + "grad_norm": 0.013483401387929916, + "learning_rate": 9.705981613225656e-05, + "loss": 0.01200819294899702, + "num_input_tokens_seen": 32064208, + "step": 1958, + "train_runtime": 15912.4903, + "train_tokens_per_second": 2015.034 + }, + { + "epoch": 1.1872727272727273, + "grad_norm": 0.008434257470071316, + "learning_rate": 9.705656635172419e-05, + "loss": 0.01099243201315403, + "num_input_tokens_seen": 32080584, + "step": 1959, + "train_runtime": 15920.6011, + "train_tokens_per_second": 2015.036 + }, + { + "epoch": 1.187878787878788, + "grad_norm": 0.007052503060549498, + "learning_rate": 9.705331483066106e-05, + "loss": 0.011562798172235489, + "num_input_tokens_seen": 32096960, + "step": 1960, + "train_runtime": 15928.7124, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.1884848484848485, + "grad_norm": 0.0031905195210129023, + "learning_rate": 9.705006156918744e-05, + "loss": 0.012042374350130558, + "num_input_tokens_seen": 32113336, + "step": 1961, + "train_runtime": 15936.8214, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.189090909090909, + "grad_norm": 0.017608430236577988, + "learning_rate": 9.704680656742368e-05, + "loss": 0.012978605926036835, + "num_input_tokens_seen": 32129712, + "step": 1962, + "train_runtime": 15944.9337, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.1896969696969697, + "grad_norm": 0.011044766753911972, + "learning_rate": 9.704354982549016e-05, + "loss": 0.012821994721889496, + "num_input_tokens_seen": 32146088, + "step": 1963, + "train_runtime": 15953.0436, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.1903030303030304, + "grad_norm": 0.0026662456803023815, + "learning_rate": 9.704029134350735e-05, + "loss": 0.011127783916890621, + "num_input_tokens_seen": 32162464, + "step": 1964, + "train_runtime": 15961.1548, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.190909090909091, + "grad_norm": 0.013216360472142696, + "learning_rate": 9.703703112159576e-05, + "loss": 0.013246205635368824, + "num_input_tokens_seen": 32178840, + "step": 1965, + "train_runtime": 15969.2663, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.1915151515151514, + "grad_norm": 0.005839253775775433, + "learning_rate": 9.703376915987601e-05, + "loss": 0.011579563841223717, + "num_input_tokens_seen": 32195216, + "step": 1966, + "train_runtime": 15977.3772, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.1921212121212121, + "grad_norm": 0.014731714501976967, + "learning_rate": 9.703050545846871e-05, + "loss": 0.012472787871956825, + "num_input_tokens_seen": 32211592, + "step": 1967, + "train_runtime": 15985.4869, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.1927272727272726, + "grad_norm": 0.010836574248969555, + "learning_rate": 9.702724001749461e-05, + "loss": 0.01216835342347622, + "num_input_tokens_seen": 32227968, + "step": 1968, + "train_runtime": 15993.5976, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.1933333333333334, + "grad_norm": 0.008247988298535347, + "learning_rate": 9.702397283707448e-05, + "loss": 0.012309988029301167, + "num_input_tokens_seen": 32244344, + "step": 1969, + "train_runtime": 16001.7113, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.1939393939393939, + "grad_norm": 0.01689152605831623, + "learning_rate": 9.702070391732919e-05, + "loss": 0.013732653111219406, + "num_input_tokens_seen": 32260720, + "step": 1970, + "train_runtime": 16009.8226, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.1945454545454546, + "grad_norm": 0.013673562556505203, + "learning_rate": 9.70174332583796e-05, + "loss": 0.013782327063381672, + "num_input_tokens_seen": 32277096, + "step": 1971, + "train_runtime": 16017.9333, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.195151515151515, + "grad_norm": 0.010520472191274166, + "learning_rate": 9.701416086034672e-05, + "loss": 0.013326936401426792, + "num_input_tokens_seen": 32293472, + "step": 1972, + "train_runtime": 16026.0492, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.1957575757575758, + "grad_norm": 0.007514155004173517, + "learning_rate": 9.70108867233516e-05, + "loss": 0.012479415163397789, + "num_input_tokens_seen": 32309848, + "step": 1973, + "train_runtime": 16034.161, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.1963636363636363, + "grad_norm": 0.004866638220846653, + "learning_rate": 9.700761084751533e-05, + "loss": 0.011496278457343578, + "num_input_tokens_seen": 32326224, + "step": 1974, + "train_runtime": 16042.2706, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.196969696969697, + "grad_norm": 0.008786221966147423, + "learning_rate": 9.700433323295907e-05, + "loss": 0.012215346097946167, + "num_input_tokens_seen": 32342600, + "step": 1975, + "train_runtime": 16050.3809, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.1975757575757575, + "grad_norm": 0.01284866128116846, + "learning_rate": 9.700105387980406e-05, + "loss": 0.012052543461322784, + "num_input_tokens_seen": 32358976, + "step": 1976, + "train_runtime": 16058.492, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.1981818181818182, + "grad_norm": 0.00797280389815569, + "learning_rate": 9.699777278817161e-05, + "loss": 0.01189066469669342, + "num_input_tokens_seen": 32375352, + "step": 1977, + "train_runtime": 16066.6022, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.1987878787878787, + "grad_norm": 0.005341751966625452, + "learning_rate": 9.699448995818306e-05, + "loss": 0.01284201443195343, + "num_input_tokens_seen": 32391728, + "step": 1978, + "train_runtime": 16074.7091, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.1993939393939395, + "grad_norm": 0.008474123664200306, + "learning_rate": 9.699120538995982e-05, + "loss": 0.011298474855720997, + "num_input_tokens_seen": 32408104, + "step": 1979, + "train_runtime": 16082.8191, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.2, + "grad_norm": 0.005944432690739632, + "learning_rate": 9.698791908362344e-05, + "loss": 0.011836757883429527, + "num_input_tokens_seen": 32424480, + "step": 1980, + "train_runtime": 16090.9333, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.2006060606060607, + "grad_norm": 0.04931863769888878, + "learning_rate": 9.698463103929542e-05, + "loss": 0.013664236292243004, + "num_input_tokens_seen": 32440856, + "step": 1981, + "train_runtime": 16099.0461, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.2012121212121212, + "grad_norm": 0.017436610534787178, + "learning_rate": 9.698134125709741e-05, + "loss": 0.012293105944991112, + "num_input_tokens_seen": 32457232, + "step": 1982, + "train_runtime": 16107.1573, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.201818181818182, + "grad_norm": 0.012726429849863052, + "learning_rate": 9.697804973715106e-05, + "loss": 0.0125419395044446, + "num_input_tokens_seen": 32473608, + "step": 1983, + "train_runtime": 16115.2692, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.2024242424242424, + "grad_norm": 0.00739372568204999, + "learning_rate": 9.697475647957814e-05, + "loss": 0.011883998289704323, + "num_input_tokens_seen": 32489984, + "step": 1984, + "train_runtime": 16123.3808, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.2030303030303031, + "grad_norm": 0.009471113793551922, + "learning_rate": 9.697146148450047e-05, + "loss": 0.013606306165456772, + "num_input_tokens_seen": 32506360, + "step": 1985, + "train_runtime": 16131.4911, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.2036363636363636, + "grad_norm": 0.010123181156814098, + "learning_rate": 9.696816475203992e-05, + "loss": 0.013429714366793633, + "num_input_tokens_seen": 32522736, + "step": 1986, + "train_runtime": 16139.6034, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.2042424242424243, + "grad_norm": 0.0061989715322852135, + "learning_rate": 9.69648662823184e-05, + "loss": 0.012474027462303638, + "num_input_tokens_seen": 32539112, + "step": 1987, + "train_runtime": 16147.7149, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.2048484848484848, + "grad_norm": 0.012311330065131187, + "learning_rate": 9.696156607545795e-05, + "loss": 0.01264164038002491, + "num_input_tokens_seen": 32555488, + "step": 1988, + "train_runtime": 16155.8325, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.2054545454545456, + "grad_norm": 0.0055806501768529415, + "learning_rate": 9.69582641315806e-05, + "loss": 0.011141511611640453, + "num_input_tokens_seen": 32571864, + "step": 1989, + "train_runtime": 16163.9452, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.206060606060606, + "grad_norm": 0.005560377612709999, + "learning_rate": 9.695496045080853e-05, + "loss": 0.012061137706041336, + "num_input_tokens_seen": 32588240, + "step": 1990, + "train_runtime": 16172.0604, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.2066666666666666, + "grad_norm": 0.012639972381293774, + "learning_rate": 9.69516550332639e-05, + "loss": 0.012125247158110142, + "num_input_tokens_seen": 32604616, + "step": 1991, + "train_runtime": 16180.1724, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.2072727272727273, + "grad_norm": 0.012606188654899597, + "learning_rate": 9.6948347879069e-05, + "loss": 0.012232892215251923, + "num_input_tokens_seen": 32620992, + "step": 1992, + "train_runtime": 16188.2838, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.207878787878788, + "grad_norm": 0.008077176287770271, + "learning_rate": 9.694503898834612e-05, + "loss": 0.011687630787491798, + "num_input_tokens_seen": 32637368, + "step": 1993, + "train_runtime": 16196.3922, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.2084848484848485, + "grad_norm": 0.006857742555439472, + "learning_rate": 9.694172836121769e-05, + "loss": 0.012620335444808006, + "num_input_tokens_seen": 32653744, + "step": 1994, + "train_runtime": 16204.5031, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.209090909090909, + "grad_norm": 0.007197006605565548, + "learning_rate": 9.693841599780613e-05, + "loss": 0.012056245468556881, + "num_input_tokens_seen": 32670120, + "step": 1995, + "train_runtime": 16212.6138, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.2096969696969697, + "grad_norm": 0.00502738356590271, + "learning_rate": 9.693510189823398e-05, + "loss": 0.012053314596414566, + "num_input_tokens_seen": 32686496, + "step": 1996, + "train_runtime": 16220.723, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.2103030303030302, + "grad_norm": 0.010123145766556263, + "learning_rate": 9.69317860626238e-05, + "loss": 0.012628944590687752, + "num_input_tokens_seen": 32702872, + "step": 1997, + "train_runtime": 16228.8373, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.210909090909091, + "grad_norm": 0.00677911564707756, + "learning_rate": 9.692846849109827e-05, + "loss": 0.01243099570274353, + "num_input_tokens_seen": 32719248, + "step": 1998, + "train_runtime": 16236.9472, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.2115151515151514, + "grad_norm": 0.018878202885389328, + "learning_rate": 9.692514918378006e-05, + "loss": 0.012131169438362122, + "num_input_tokens_seen": 32735624, + "step": 1999, + "train_runtime": 16245.0577, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.009054052643477917, + "learning_rate": 9.692182814079197e-05, + "loss": 0.013145225122570992, + "num_input_tokens_seen": 32752000, + "step": 2000, + "train_runtime": 16253.1657, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.2127272727272727, + "grad_norm": 0.00841708667576313, + "learning_rate": 9.691850536225684e-05, + "loss": 0.013336677104234695, + "num_input_tokens_seen": 32768376, + "step": 2001, + "train_runtime": 16262.2722, + "train_tokens_per_second": 2014.994 + }, + { + "epoch": 1.2133333333333334, + "grad_norm": 0.008042370900511742, + "learning_rate": 9.691518084829756e-05, + "loss": 0.011015270836651325, + "num_input_tokens_seen": 32784752, + "step": 2002, + "train_runtime": 16270.3838, + "train_tokens_per_second": 2014.996 + }, + { + "epoch": 1.2139393939393939, + "grad_norm": 0.014253576286137104, + "learning_rate": 9.691185459903709e-05, + "loss": 0.012948616407811642, + "num_input_tokens_seen": 32801128, + "step": 2003, + "train_runtime": 16278.4941, + "train_tokens_per_second": 2014.998 + }, + { + "epoch": 1.2145454545454546, + "grad_norm": 0.010794151574373245, + "learning_rate": 9.690852661459849e-05, + "loss": 0.012411735020577908, + "num_input_tokens_seen": 32817504, + "step": 2004, + "train_runtime": 16286.6014, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 1.215151515151515, + "grad_norm": 0.018177302554249763, + "learning_rate": 9.690519689510484e-05, + "loss": 0.011491055600345135, + "num_input_tokens_seen": 32833880, + "step": 2005, + "train_runtime": 16294.7116, + "train_tokens_per_second": 2015.002 + }, + { + "epoch": 1.2157575757575758, + "grad_norm": 0.006788145750761032, + "learning_rate": 9.69018654406793e-05, + "loss": 0.012544289231300354, + "num_input_tokens_seen": 32850256, + "step": 2006, + "train_runtime": 16302.8206, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 1.2163636363636363, + "grad_norm": 0.010082833468914032, + "learning_rate": 9.68985322514451e-05, + "loss": 0.011602875776588917, + "num_input_tokens_seen": 32866632, + "step": 2007, + "train_runtime": 16310.9318, + "train_tokens_per_second": 2015.006 + }, + { + "epoch": 1.216969696969697, + "grad_norm": 0.005194054916501045, + "learning_rate": 9.689519732752552e-05, + "loss": 0.012436242774128914, + "num_input_tokens_seen": 32883008, + "step": 2008, + "train_runtime": 16319.0428, + "train_tokens_per_second": 2015.008 + }, + { + "epoch": 1.2175757575757575, + "grad_norm": 0.011215485632419586, + "learning_rate": 9.68918606690439e-05, + "loss": 0.012597991153597832, + "num_input_tokens_seen": 32899384, + "step": 2009, + "train_runtime": 16327.1604, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 1.2181818181818183, + "grad_norm": 0.007134478073567152, + "learning_rate": 9.688852227612369e-05, + "loss": 0.0119805708527565, + "num_input_tokens_seen": 32915760, + "step": 2010, + "train_runtime": 16335.2778, + "train_tokens_per_second": 2015.011 + }, + { + "epoch": 1.2187878787878788, + "grad_norm": 0.005966852884739637, + "learning_rate": 9.688518214888836e-05, + "loss": 0.012521771714091301, + "num_input_tokens_seen": 32932136, + "step": 2011, + "train_runtime": 16343.3899, + "train_tokens_per_second": 2015.013 + }, + { + "epoch": 1.2193939393939395, + "grad_norm": 0.011345195583999157, + "learning_rate": 9.688184028746141e-05, + "loss": 0.012418065220117569, + "num_input_tokens_seen": 32948512, + "step": 2012, + "train_runtime": 16351.5105, + "train_tokens_per_second": 2015.013 + }, + { + "epoch": 1.22, + "grad_norm": 0.10804049670696259, + "learning_rate": 9.687849669196652e-05, + "loss": 0.013680079020559788, + "num_input_tokens_seen": 32964888, + "step": 2013, + "train_runtime": 16359.6328, + "train_tokens_per_second": 2015.014 + }, + { + "epoch": 1.2206060606060607, + "grad_norm": 0.03245476633310318, + "learning_rate": 9.687515136252731e-05, + "loss": 0.013701889663934708, + "num_input_tokens_seen": 32981264, + "step": 2014, + "train_runtime": 16367.748, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 1.2212121212121212, + "grad_norm": 0.011148846708238125, + "learning_rate": 9.687180429926754e-05, + "loss": 0.01261454913765192, + "num_input_tokens_seen": 32997640, + "step": 2015, + "train_runtime": 16375.8622, + "train_tokens_per_second": 2015.017 + }, + { + "epoch": 1.221818181818182, + "grad_norm": 0.01723579317331314, + "learning_rate": 9.686845550231102e-05, + "loss": 0.013511408120393753, + "num_input_tokens_seen": 33014016, + "step": 2016, + "train_runtime": 16383.9829, + "train_tokens_per_second": 2015.018 + }, + { + "epoch": 1.2224242424242424, + "grad_norm": 0.008297464810311794, + "learning_rate": 9.68651049717816e-05, + "loss": 0.012829539366066456, + "num_input_tokens_seen": 33030392, + "step": 2017, + "train_runtime": 16392.0973, + "train_tokens_per_second": 2015.019 + }, + { + "epoch": 1.2230303030303031, + "grad_norm": 0.008225811645388603, + "learning_rate": 9.68617527078032e-05, + "loss": 0.01188596710562706, + "num_input_tokens_seen": 33046768, + "step": 2018, + "train_runtime": 16400.2172, + "train_tokens_per_second": 2015.02 + }, + { + "epoch": 1.2236363636363636, + "grad_norm": 0.01628193072974682, + "learning_rate": 9.685839871049984e-05, + "loss": 0.012169532477855682, + "num_input_tokens_seen": 33063144, + "step": 2019, + "train_runtime": 16408.3356, + "train_tokens_per_second": 2015.021 + }, + { + "epoch": 1.2242424242424241, + "grad_norm": 0.021080298349261284, + "learning_rate": 9.685504297999556e-05, + "loss": 0.012592184357345104, + "num_input_tokens_seen": 33079520, + "step": 2020, + "train_runtime": 16416.4533, + "train_tokens_per_second": 2015.022 + }, + { + "epoch": 1.2248484848484849, + "grad_norm": 0.013393021188676357, + "learning_rate": 9.685168551641448e-05, + "loss": 0.01185669656842947, + "num_input_tokens_seen": 33095896, + "step": 2021, + "train_runtime": 16424.5679, + "train_tokens_per_second": 2015.024 + }, + { + "epoch": 1.2254545454545456, + "grad_norm": 0.011576077900826931, + "learning_rate": 9.68483263198808e-05, + "loss": 0.012582024559378624, + "num_input_tokens_seen": 33112272, + "step": 2022, + "train_runtime": 16432.6854, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.226060606060606, + "grad_norm": 0.015389593318104744, + "learning_rate": 9.684496539051874e-05, + "loss": 0.012190048582851887, + "num_input_tokens_seen": 33128648, + "step": 2023, + "train_runtime": 16440.8056, + "train_tokens_per_second": 2015.026 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 0.005747777409851551, + "learning_rate": 9.684160272845267e-05, + "loss": 0.011490939185023308, + "num_input_tokens_seen": 33145024, + "step": 2024, + "train_runtime": 16448.9214, + "train_tokens_per_second": 2015.027 + }, + { + "epoch": 1.2272727272727273, + "grad_norm": 0.013231366872787476, + "learning_rate": 9.683823833380692e-05, + "loss": 0.01152926217764616, + "num_input_tokens_seen": 33161400, + "step": 2025, + "train_runtime": 16457.0388, + "train_tokens_per_second": 2015.028 + }, + { + "epoch": 1.2278787878787878, + "grad_norm": 0.007870008237659931, + "learning_rate": 9.683487220670595e-05, + "loss": 0.012311085127294064, + "num_input_tokens_seen": 33177776, + "step": 2026, + "train_runtime": 16465.1579, + "train_tokens_per_second": 2015.029 + }, + { + "epoch": 1.2284848484848485, + "grad_norm": 0.0100321713835001, + "learning_rate": 9.683150434727427e-05, + "loss": 0.013347048312425613, + "num_input_tokens_seen": 33194152, + "step": 2027, + "train_runtime": 16473.2735, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 1.229090909090909, + "grad_norm": 0.00871365051716566, + "learning_rate": 9.682813475563643e-05, + "loss": 0.012250279076397419, + "num_input_tokens_seen": 33210528, + "step": 2028, + "train_runtime": 16481.3886, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 1.2296969696969697, + "grad_norm": 0.010484444908797741, + "learning_rate": 9.682476343191708e-05, + "loss": 0.012891886755824089, + "num_input_tokens_seen": 33226904, + "step": 2029, + "train_runtime": 16489.5073, + "train_tokens_per_second": 2015.033 + }, + { + "epoch": 1.2303030303030302, + "grad_norm": 0.015102782286703587, + "learning_rate": 9.682139037624092e-05, + "loss": 0.011657091788947582, + "num_input_tokens_seen": 33243280, + "step": 2030, + "train_runtime": 16497.6346, + "train_tokens_per_second": 2015.033 + }, + { + "epoch": 1.230909090909091, + "grad_norm": 0.015282537788152695, + "learning_rate": 9.681801558873272e-05, + "loss": 0.012187018990516663, + "num_input_tokens_seen": 33259656, + "step": 2031, + "train_runtime": 16505.7523, + "train_tokens_per_second": 2015.034 + }, + { + "epoch": 1.2315151515151515, + "grad_norm": 0.008355207741260529, + "learning_rate": 9.681463906951729e-05, + "loss": 0.011612944304943085, + "num_input_tokens_seen": 33276032, + "step": 2032, + "train_runtime": 16513.8704, + "train_tokens_per_second": 2015.035 + }, + { + "epoch": 1.2321212121212122, + "grad_norm": 0.01107091549783945, + "learning_rate": 9.681126081871955e-05, + "loss": 0.011706216260790825, + "num_input_tokens_seen": 33292408, + "step": 2033, + "train_runtime": 16521.9865, + "train_tokens_per_second": 2015.037 + }, + { + "epoch": 1.2327272727272727, + "grad_norm": 0.0022801703307777643, + "learning_rate": 9.680788083646439e-05, + "loss": 0.011544807814061642, + "num_input_tokens_seen": 33308784, + "step": 2034, + "train_runtime": 16530.102, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.2333333333333334, + "grad_norm": 0.008891911245882511, + "learning_rate": 9.68044991228769e-05, + "loss": 0.01236899383366108, + "num_input_tokens_seen": 33325160, + "step": 2035, + "train_runtime": 16538.2109, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.233939393939394, + "grad_norm": 0.011175681836903095, + "learning_rate": 9.680111567808213e-05, + "loss": 0.011853158473968506, + "num_input_tokens_seen": 33341536, + "step": 2036, + "train_runtime": 16546.3354, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.2345454545454546, + "grad_norm": 0.008549856953322887, + "learning_rate": 9.679773050220524e-05, + "loss": 0.011179388500750065, + "num_input_tokens_seen": 33357912, + "step": 2037, + "train_runtime": 16554.4491, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.2351515151515151, + "grad_norm": 0.005819539073854685, + "learning_rate": 9.67943435953714e-05, + "loss": 0.01257402915507555, + "num_input_tokens_seen": 33374288, + "step": 2038, + "train_runtime": 16562.563, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.2357575757575758, + "grad_norm": 0.01801511086523533, + "learning_rate": 9.679095495770596e-05, + "loss": 0.01320955716073513, + "num_input_tokens_seen": 33390664, + "step": 2039, + "train_runtime": 16570.6848, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.2363636363636363, + "grad_norm": 0.009865757077932358, + "learning_rate": 9.67875645893342e-05, + "loss": 0.012449276633560658, + "num_input_tokens_seen": 33407040, + "step": 2040, + "train_runtime": 16578.8018, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.236969696969697, + "grad_norm": 0.008288033306598663, + "learning_rate": 9.678417249038154e-05, + "loss": 0.01053472887724638, + "num_input_tokens_seen": 33423416, + "step": 2041, + "train_runtime": 16586.9196, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.2375757575757576, + "grad_norm": 0.02465786226093769, + "learning_rate": 9.678077866097345e-05, + "loss": 0.012921673245728016, + "num_input_tokens_seen": 33439792, + "step": 2042, + "train_runtime": 16595.0371, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.2381818181818183, + "grad_norm": 0.013770204037427902, + "learning_rate": 9.677738310123545e-05, + "loss": 0.012475434690713882, + "num_input_tokens_seen": 33456168, + "step": 2043, + "train_runtime": 16603.154, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.2387878787878788, + "grad_norm": 0.01958361268043518, + "learning_rate": 9.677398581129316e-05, + "loss": 0.01197823602706194, + "num_input_tokens_seen": 33472544, + "step": 2044, + "train_runtime": 16611.2666, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.2393939393939393, + "grad_norm": 0.004778360947966576, + "learning_rate": 9.67705867912722e-05, + "loss": 0.012999579310417175, + "num_input_tokens_seen": 33488920, + "step": 2045, + "train_runtime": 16619.3844, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.24, + "grad_norm": 0.015011285431683064, + "learning_rate": 9.676718604129832e-05, + "loss": 0.014176999218761921, + "num_input_tokens_seen": 33505296, + "step": 2046, + "train_runtime": 16627.5012, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.2406060606060607, + "grad_norm": 0.019506732001900673, + "learning_rate": 9.676378356149734e-05, + "loss": 0.012224535457789898, + "num_input_tokens_seen": 33521672, + "step": 2047, + "train_runtime": 16635.6217, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.2412121212121212, + "grad_norm": 0.13394935429096222, + "learning_rate": 9.676037935199505e-05, + "loss": 0.01198052242398262, + "num_input_tokens_seen": 33538048, + "step": 2048, + "train_runtime": 16643.7418, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.2418181818181817, + "grad_norm": 0.010586493648588657, + "learning_rate": 9.675697341291738e-05, + "loss": 0.013710709288716316, + "num_input_tokens_seen": 33554424, + "step": 2049, + "train_runtime": 16651.8585, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.007696137297898531, + "learning_rate": 9.675356574439031e-05, + "loss": 0.013701122254133224, + "num_input_tokens_seen": 33570800, + "step": 2050, + "train_runtime": 16659.9803, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.2430303030303032, + "grad_norm": 0.012039069086313248, + "learning_rate": 9.675015634653992e-05, + "loss": 0.011533101089298725, + "num_input_tokens_seen": 33587176, + "step": 2051, + "train_runtime": 16668.0889, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.2436363636363637, + "grad_norm": 0.011677329428493977, + "learning_rate": 9.674674521949227e-05, + "loss": 0.013059742748737335, + "num_input_tokens_seen": 33603552, + "step": 2052, + "train_runtime": 16676.2054, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.2442424242424241, + "grad_norm": 0.009239686653017998, + "learning_rate": 9.674333236337356e-05, + "loss": 0.012059221975505352, + "num_input_tokens_seen": 33619928, + "step": 2053, + "train_runtime": 16684.3227, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.2448484848484849, + "grad_norm": 0.09366417676210403, + "learning_rate": 9.673991777831001e-05, + "loss": 0.012007784098386765, + "num_input_tokens_seen": 33636304, + "step": 2054, + "train_runtime": 16692.4412, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.2454545454545454, + "grad_norm": 0.009940850548446178, + "learning_rate": 9.673650146442791e-05, + "loss": 0.012801412492990494, + "num_input_tokens_seen": 33652680, + "step": 2055, + "train_runtime": 16700.5558, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.246060606060606, + "grad_norm": 0.011787704192101955, + "learning_rate": 9.673308342185365e-05, + "loss": 0.013499357737600803, + "num_input_tokens_seen": 33669056, + "step": 2056, + "train_runtime": 16708.6666, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.2466666666666666, + "grad_norm": 0.012068502604961395, + "learning_rate": 9.672966365071365e-05, + "loss": 0.013464034534990788, + "num_input_tokens_seen": 33685432, + "step": 2057, + "train_runtime": 16716.7761, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.2472727272727273, + "grad_norm": 0.007820342667400837, + "learning_rate": 9.67262421511344e-05, + "loss": 0.011818510480225086, + "num_input_tokens_seen": 33701808, + "step": 2058, + "train_runtime": 16724.8854, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.2478787878787878, + "grad_norm": 0.011603613384068012, + "learning_rate": 9.672281892324242e-05, + "loss": 0.01426640897989273, + "num_input_tokens_seen": 33718184, + "step": 2059, + "train_runtime": 16732.9925, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.2484848484848485, + "grad_norm": 0.015823999419808388, + "learning_rate": 9.671939396716436e-05, + "loss": 0.013109918683767319, + "num_input_tokens_seen": 33734560, + "step": 2060, + "train_runtime": 16741.1026, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.249090909090909, + "grad_norm": 0.00919675175100565, + "learning_rate": 9.671596728302692e-05, + "loss": 0.01303145196288824, + "num_input_tokens_seen": 33750936, + "step": 2061, + "train_runtime": 16749.2125, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.2496969696969698, + "grad_norm": 0.00808015652000904, + "learning_rate": 9.671253887095681e-05, + "loss": 0.013077626936137676, + "num_input_tokens_seen": 33767312, + "step": 2062, + "train_runtime": 16757.3238, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.2503030303030302, + "grad_norm": 0.007350953761488199, + "learning_rate": 9.670910873108086e-05, + "loss": 0.012963245622813702, + "num_input_tokens_seen": 33783688, + "step": 2063, + "train_runtime": 16765.4356, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.250909090909091, + "grad_norm": 0.011904108338057995, + "learning_rate": 9.670567686352594e-05, + "loss": 0.012116971425712109, + "num_input_tokens_seen": 33800064, + "step": 2064, + "train_runtime": 16773.546, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.2515151515151515, + "grad_norm": 0.011483087204396725, + "learning_rate": 9.6702243268419e-05, + "loss": 0.013674170710146427, + "num_input_tokens_seen": 33816440, + "step": 2065, + "train_runtime": 16781.6557, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.2521212121212122, + "grad_norm": 0.010501409880816936, + "learning_rate": 9.669880794588701e-05, + "loss": 0.011777383275330067, + "num_input_tokens_seen": 33832816, + "step": 2066, + "train_runtime": 16789.766, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.2527272727272727, + "grad_norm": 0.009026569314301014, + "learning_rate": 9.669537089605705e-05, + "loss": 0.012171929702162743, + "num_input_tokens_seen": 33849192, + "step": 2067, + "train_runtime": 16797.8767, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 0.018056530505418777, + "learning_rate": 9.669193211905627e-05, + "loss": 0.012256179004907608, + "num_input_tokens_seen": 33865568, + "step": 2068, + "train_runtime": 16805.9871, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.253939393939394, + "grad_norm": 0.010606756433844566, + "learning_rate": 9.668849161501185e-05, + "loss": 0.012096712365746498, + "num_input_tokens_seen": 33881944, + "step": 2069, + "train_runtime": 16814.095, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.2545454545454544, + "grad_norm": 0.011599649675190449, + "learning_rate": 9.668504938405105e-05, + "loss": 0.013895167037844658, + "num_input_tokens_seen": 33898320, + "step": 2070, + "train_runtime": 16822.2019, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.2551515151515151, + "grad_norm": 0.01043979823589325, + "learning_rate": 9.668160542630118e-05, + "loss": 0.012740423902869225, + "num_input_tokens_seen": 33914696, + "step": 2071, + "train_runtime": 16830.3157, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.2557575757575759, + "grad_norm": 0.006431375164538622, + "learning_rate": 9.667815974188965e-05, + "loss": 0.01092531718313694, + "num_input_tokens_seen": 33931072, + "step": 2072, + "train_runtime": 16838.4336, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.2563636363636363, + "grad_norm": 0.012145663611590862, + "learning_rate": 9.667471233094387e-05, + "loss": 0.011573081836104393, + "num_input_tokens_seen": 33947448, + "step": 2073, + "train_runtime": 16846.5416, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.2569696969696968, + "grad_norm": 0.01263564545661211, + "learning_rate": 9.667126319359139e-05, + "loss": 0.012712339870631695, + "num_input_tokens_seen": 33963824, + "step": 2074, + "train_runtime": 16854.6507, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.2575757575757576, + "grad_norm": 0.010235338471829891, + "learning_rate": 9.666781232995976e-05, + "loss": 0.012764286249876022, + "num_input_tokens_seen": 33980200, + "step": 2075, + "train_runtime": 16862.7613, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.2581818181818183, + "grad_norm": 0.01789509318768978, + "learning_rate": 9.666435974017665e-05, + "loss": 0.012191517278552055, + "num_input_tokens_seen": 33996576, + "step": 2076, + "train_runtime": 16870.8737, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.2587878787878788, + "grad_norm": 0.03132858872413635, + "learning_rate": 9.666090542436974e-05, + "loss": 0.013369154185056686, + "num_input_tokens_seen": 34012952, + "step": 2077, + "train_runtime": 16878.986, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.2593939393939393, + "grad_norm": 0.010015154257416725, + "learning_rate": 9.665744938266681e-05, + "loss": 0.011515887454152107, + "num_input_tokens_seen": 34029328, + "step": 2078, + "train_runtime": 16887.0989, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.26, + "grad_norm": 0.014227609150111675, + "learning_rate": 9.665399161519569e-05, + "loss": 0.011430085636675358, + "num_input_tokens_seen": 34045704, + "step": 2079, + "train_runtime": 16895.2111, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.2606060606060607, + "grad_norm": 0.010706653818488121, + "learning_rate": 9.665053212208426e-05, + "loss": 0.012273017317056656, + "num_input_tokens_seen": 34062080, + "step": 2080, + "train_runtime": 16903.3202, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.2612121212121212, + "grad_norm": 0.011419372633099556, + "learning_rate": 9.66470709034605e-05, + "loss": 0.012077580206096172, + "num_input_tokens_seen": 34078456, + "step": 2081, + "train_runtime": 16911.4345, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.2618181818181817, + "grad_norm": 0.004964667372405529, + "learning_rate": 9.664360795945244e-05, + "loss": 0.011858327314257622, + "num_input_tokens_seen": 34094832, + "step": 2082, + "train_runtime": 16919.5448, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.2624242424242424, + "grad_norm": 0.01158229261636734, + "learning_rate": 9.664014329018813e-05, + "loss": 0.012356961145997047, + "num_input_tokens_seen": 34111208, + "step": 2083, + "train_runtime": 16927.655, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.263030303030303, + "grad_norm": 0.007866512052714825, + "learning_rate": 9.663667689579578e-05, + "loss": 0.012617571279406548, + "num_input_tokens_seen": 34127584, + "step": 2084, + "train_runtime": 16935.7655, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.2636363636363637, + "grad_norm": 0.010511146858334541, + "learning_rate": 9.663320877640355e-05, + "loss": 0.013274503871798515, + "num_input_tokens_seen": 34143960, + "step": 2085, + "train_runtime": 16943.8757, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.2642424242424242, + "grad_norm": 0.010922242887318134, + "learning_rate": 9.662973893213976e-05, + "loss": 0.012496921233832836, + "num_input_tokens_seen": 34160336, + "step": 2086, + "train_runtime": 16951.9865, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.2648484848484849, + "grad_norm": 0.013786193914711475, + "learning_rate": 9.662626736313271e-05, + "loss": 0.011950873769819736, + "num_input_tokens_seen": 34176712, + "step": 2087, + "train_runtime": 16960.1051, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.2654545454545454, + "grad_norm": 0.008805682882666588, + "learning_rate": 9.662279406951084e-05, + "loss": 0.011753477156162262, + "num_input_tokens_seen": 34193088, + "step": 2088, + "train_runtime": 16968.2158, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.266060606060606, + "grad_norm": 0.006958288606256247, + "learning_rate": 9.661931905140263e-05, + "loss": 0.01254260540008545, + "num_input_tokens_seen": 34209464, + "step": 2089, + "train_runtime": 16976.3341, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.015437749214470387, + "learning_rate": 9.661584230893657e-05, + "loss": 0.012971932999789715, + "num_input_tokens_seen": 34225840, + "step": 2090, + "train_runtime": 16984.4439, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.2672727272727273, + "grad_norm": 0.014704076573252678, + "learning_rate": 9.661236384224129e-05, + "loss": 0.013508946634829044, + "num_input_tokens_seen": 34242216, + "step": 2091, + "train_runtime": 16992.5556, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.2678787878787878, + "grad_norm": 0.006109852343797684, + "learning_rate": 9.660888365144545e-05, + "loss": 0.012193357571959496, + "num_input_tokens_seen": 34258592, + "step": 2092, + "train_runtime": 17000.6672, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.2684848484848485, + "grad_norm": 0.008939354680478573, + "learning_rate": 9.660540173667778e-05, + "loss": 0.01164991408586502, + "num_input_tokens_seen": 34274968, + "step": 2093, + "train_runtime": 17008.7775, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.269090909090909, + "grad_norm": 0.012699137441813946, + "learning_rate": 9.660191809806705e-05, + "loss": 0.012199487537145615, + "num_input_tokens_seen": 34291344, + "step": 2094, + "train_runtime": 17016.8855, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.2696969696969698, + "grad_norm": 0.01226895023137331, + "learning_rate": 9.659843273574212e-05, + "loss": 0.0135191073641181, + "num_input_tokens_seen": 34307720, + "step": 2095, + "train_runtime": 17024.9979, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.2703030303030303, + "grad_norm": 0.0103456387296319, + "learning_rate": 9.659494564983191e-05, + "loss": 0.01234687864780426, + "num_input_tokens_seen": 34324096, + "step": 2096, + "train_runtime": 17033.1086, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.270909090909091, + "grad_norm": 0.012149964459240437, + "learning_rate": 9.65914568404654e-05, + "loss": 0.01230283547192812, + "num_input_tokens_seen": 34340472, + "step": 2097, + "train_runtime": 17041.2202, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.2715151515151515, + "grad_norm": 0.012203685007989407, + "learning_rate": 9.658796630777162e-05, + "loss": 0.011648830026388168, + "num_input_tokens_seen": 34356848, + "step": 2098, + "train_runtime": 17049.3365, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.272121212121212, + "grad_norm": 0.015927039086818695, + "learning_rate": 9.658447405187971e-05, + "loss": 0.010989967733621597, + "num_input_tokens_seen": 34373224, + "step": 2099, + "train_runtime": 17057.4473, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 0.008946129120886326, + "learning_rate": 9.658098007291883e-05, + "loss": 0.012766811065375805, + "num_input_tokens_seen": 34389600, + "step": 2100, + "train_runtime": 17065.5571, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.2733333333333334, + "grad_norm": 0.008856577798724174, + "learning_rate": 9.65774843710182e-05, + "loss": 0.012293383479118347, + "num_input_tokens_seen": 34405976, + "step": 2101, + "train_runtime": 17074.6331, + "train_tokens_per_second": 2015.035 + }, + { + "epoch": 1.273939393939394, + "grad_norm": 0.010618072003126144, + "learning_rate": 9.657398694630712e-05, + "loss": 0.01199729647487402, + "num_input_tokens_seen": 34422352, + "step": 2102, + "train_runtime": 17082.7419, + "train_tokens_per_second": 2015.037 + }, + { + "epoch": 1.2745454545454544, + "grad_norm": 0.0050176153890788555, + "learning_rate": 9.657048779891498e-05, + "loss": 0.012176436372101307, + "num_input_tokens_seen": 34438728, + "step": 2103, + "train_runtime": 17090.8518, + "train_tokens_per_second": 2015.039 + }, + { + "epoch": 1.2751515151515151, + "grad_norm": 0.014950458891689777, + "learning_rate": 9.656698692897117e-05, + "loss": 0.01280341949313879, + "num_input_tokens_seen": 34455104, + "step": 2104, + "train_runtime": 17098.9578, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 1.2757575757575759, + "grad_norm": 0.010291986167430878, + "learning_rate": 9.656348433660521e-05, + "loss": 0.012015961110591888, + "num_input_tokens_seen": 34471480, + "step": 2105, + "train_runtime": 17107.0671, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.2763636363636364, + "grad_norm": 0.01197121199220419, + "learning_rate": 9.655998002194663e-05, + "loss": 0.013888024725019932, + "num_input_tokens_seen": 34487856, + "step": 2106, + "train_runtime": 17115.1785, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.2769696969696969, + "grad_norm": 0.006403345614671707, + "learning_rate": 9.655647398512509e-05, + "loss": 0.01155434362590313, + "num_input_tokens_seen": 34504232, + "step": 2107, + "train_runtime": 17123.2944, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.2775757575757576, + "grad_norm": 0.009825125336647034, + "learning_rate": 9.655296622627021e-05, + "loss": 0.012322339229285717, + "num_input_tokens_seen": 34520608, + "step": 2108, + "train_runtime": 17131.4068, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.2781818181818183, + "grad_norm": 0.014537754468619823, + "learning_rate": 9.654945674551177e-05, + "loss": 0.0121865663677454, + "num_input_tokens_seen": 34536984, + "step": 2109, + "train_runtime": 17139.5156, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.2787878787878788, + "grad_norm": 0.01560883317142725, + "learning_rate": 9.65459455429796e-05, + "loss": 0.013027477078139782, + "num_input_tokens_seen": 34553360, + "step": 2110, + "train_runtime": 17147.6232, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.2793939393939393, + "grad_norm": 0.01559220440685749, + "learning_rate": 9.654243261880353e-05, + "loss": 0.012413599528372288, + "num_input_tokens_seen": 34569736, + "step": 2111, + "train_runtime": 17155.7332, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.28, + "grad_norm": 0.009660288691520691, + "learning_rate": 9.653891797311351e-05, + "loss": 0.012785199098289013, + "num_input_tokens_seen": 34586112, + "step": 2112, + "train_runtime": 17163.8439, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.2806060606060605, + "grad_norm": 0.006946479436010122, + "learning_rate": 9.653540160603956e-05, + "loss": 0.01120313722640276, + "num_input_tokens_seen": 34602488, + "step": 2113, + "train_runtime": 17171.9524, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.2812121212121212, + "grad_norm": 0.01230864692479372, + "learning_rate": 9.653188351771172e-05, + "loss": 0.012665695510804653, + "num_input_tokens_seen": 34618864, + "step": 2114, + "train_runtime": 17180.0636, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.2818181818181817, + "grad_norm": 0.011574783362448215, + "learning_rate": 9.652836370826013e-05, + "loss": 0.01255100592970848, + "num_input_tokens_seen": 34635240, + "step": 2115, + "train_runtime": 17188.1736, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.2824242424242425, + "grad_norm": 0.008793197572231293, + "learning_rate": 9.652484217781497e-05, + "loss": 0.013284552842378616, + "num_input_tokens_seen": 34651616, + "step": 2116, + "train_runtime": 17196.2832, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.283030303030303, + "grad_norm": 0.006198557559400797, + "learning_rate": 9.652131892650651e-05, + "loss": 0.010948103852570057, + "num_input_tokens_seen": 34667992, + "step": 2117, + "train_runtime": 17204.394, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.2836363636363637, + "grad_norm": 0.01315808854997158, + "learning_rate": 9.651779395446505e-05, + "loss": 0.012427638284862041, + "num_input_tokens_seen": 34684368, + "step": 2118, + "train_runtime": 17212.5026, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.2842424242424242, + "grad_norm": 0.0191126000136137, + "learning_rate": 9.651426726182098e-05, + "loss": 0.013659548945724964, + "num_input_tokens_seen": 34700744, + "step": 2119, + "train_runtime": 17220.6124, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.284848484848485, + "grad_norm": 0.009408008307218552, + "learning_rate": 9.651073884870473e-05, + "loss": 0.012401404790580273, + "num_input_tokens_seen": 34717120, + "step": 2120, + "train_runtime": 17228.7217, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.2854545454545454, + "grad_norm": 0.011676698923110962, + "learning_rate": 9.650720871524686e-05, + "loss": 0.011847556568682194, + "num_input_tokens_seen": 34733496, + "step": 2121, + "train_runtime": 17236.8342, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.2860606060606061, + "grad_norm": 0.008365709334611893, + "learning_rate": 9.65036768615779e-05, + "loss": 0.011720732785761356, + "num_input_tokens_seen": 34749872, + "step": 2122, + "train_runtime": 17244.9443, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.2866666666666666, + "grad_norm": 0.008611305616796017, + "learning_rate": 9.650014328782848e-05, + "loss": 0.012757069431245327, + "num_input_tokens_seen": 34766248, + "step": 2123, + "train_runtime": 17253.0952, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.2872727272727273, + "grad_norm": 0.015551083721220493, + "learning_rate": 9.649660799412933e-05, + "loss": 0.01262652687728405, + "num_input_tokens_seen": 34782624, + "step": 2124, + "train_runtime": 17261.2065, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.01336025632917881, + "learning_rate": 9.649307098061119e-05, + "loss": 0.013620062731206417, + "num_input_tokens_seen": 34799000, + "step": 2125, + "train_runtime": 17269.3148, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.2884848484848486, + "grad_norm": 0.01245210412889719, + "learning_rate": 9.648953224740489e-05, + "loss": 0.013339771889150143, + "num_input_tokens_seen": 34815376, + "step": 2126, + "train_runtime": 17277.4317, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.289090909090909, + "grad_norm": 0.009113944135606289, + "learning_rate": 9.648599179464134e-05, + "loss": 0.0135605214163661, + "num_input_tokens_seen": 34831752, + "step": 2127, + "train_runtime": 17285.5412, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.2896969696969696, + "grad_norm": 0.009461781941354275, + "learning_rate": 9.648244962245148e-05, + "loss": 0.012702050618827343, + "num_input_tokens_seen": 34848128, + "step": 2128, + "train_runtime": 17293.6537, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.2903030303030303, + "grad_norm": 0.007549144793301821, + "learning_rate": 9.647890573096632e-05, + "loss": 0.011274856515228748, + "num_input_tokens_seen": 34864504, + "step": 2129, + "train_runtime": 17301.7635, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.290909090909091, + "grad_norm": 0.010565084405243397, + "learning_rate": 9.647536012031695e-05, + "loss": 0.011094843037426472, + "num_input_tokens_seen": 34880880, + "step": 2130, + "train_runtime": 17309.8729, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.2915151515151515, + "grad_norm": 0.015499581582844257, + "learning_rate": 9.647181279063453e-05, + "loss": 0.01364554651081562, + "num_input_tokens_seen": 34897256, + "step": 2131, + "train_runtime": 17317.9876, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.292121212121212, + "grad_norm": 0.008567445911467075, + "learning_rate": 9.646826374205022e-05, + "loss": 0.012525910511612892, + "num_input_tokens_seen": 34913632, + "step": 2132, + "train_runtime": 17326.0958, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.2927272727272727, + "grad_norm": 0.013958334922790527, + "learning_rate": 9.646471297469537e-05, + "loss": 0.012882156297564507, + "num_input_tokens_seen": 34930008, + "step": 2133, + "train_runtime": 17334.2117, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.2933333333333334, + "grad_norm": 0.007428262382745743, + "learning_rate": 9.646116048870124e-05, + "loss": 0.011848744936287403, + "num_input_tokens_seen": 34946384, + "step": 2134, + "train_runtime": 17342.3224, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.293939393939394, + "grad_norm": 0.009584290906786919, + "learning_rate": 9.645760628419929e-05, + "loss": 0.011593570932745934, + "num_input_tokens_seen": 34962760, + "step": 2135, + "train_runtime": 17350.435, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.2945454545454544, + "grad_norm": 0.016960185021162033, + "learning_rate": 9.645405036132093e-05, + "loss": 0.013664147816598415, + "num_input_tokens_seen": 34979136, + "step": 2136, + "train_runtime": 17358.55, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.2951515151515152, + "grad_norm": 0.01113409735262394, + "learning_rate": 9.645049272019773e-05, + "loss": 0.013035980984568596, + "num_input_tokens_seen": 34995512, + "step": 2137, + "train_runtime": 17366.6598, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.2957575757575759, + "grad_norm": 0.02672557160258293, + "learning_rate": 9.644693336096125e-05, + "loss": 0.01212849747389555, + "num_input_tokens_seen": 35011888, + "step": 2138, + "train_runtime": 17374.7703, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.2963636363636364, + "grad_norm": 0.015280626714229584, + "learning_rate": 9.644337228374318e-05, + "loss": 0.012423450127243996, + "num_input_tokens_seen": 35028264, + "step": 2139, + "train_runtime": 17382.8807, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.2969696969696969, + "grad_norm": 0.014273280277848244, + "learning_rate": 9.643980948867519e-05, + "loss": 0.011721854098141193, + "num_input_tokens_seen": 35044640, + "step": 2140, + "train_runtime": 17390.9881, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.2975757575757576, + "grad_norm": 0.005597109440714121, + "learning_rate": 9.643624497588908e-05, + "loss": 0.011341876350343227, + "num_input_tokens_seen": 35061016, + "step": 2141, + "train_runtime": 17399.099, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.298181818181818, + "grad_norm": 0.009911877103149891, + "learning_rate": 9.643267874551671e-05, + "loss": 0.013796377927064896, + "num_input_tokens_seen": 35077392, + "step": 2142, + "train_runtime": 17407.2072, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.2987878787878788, + "grad_norm": 0.008413155563175678, + "learning_rate": 9.642911079768999e-05, + "loss": 0.01139981858432293, + "num_input_tokens_seen": 35093768, + "step": 2143, + "train_runtime": 17415.3192, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.2993939393939393, + "grad_norm": 0.00816043745726347, + "learning_rate": 9.642554113254085e-05, + "loss": 0.012404600158333778, + "num_input_tokens_seen": 35110144, + "step": 2144, + "train_runtime": 17423.4327, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.3, + "grad_norm": 0.009998388588428497, + "learning_rate": 9.642196975020137e-05, + "loss": 0.013593346811830997, + "num_input_tokens_seen": 35126520, + "step": 2145, + "train_runtime": 17431.5439, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.3006060606060605, + "grad_norm": 0.008478298783302307, + "learning_rate": 9.641839665080363e-05, + "loss": 0.012044758535921574, + "num_input_tokens_seen": 35142896, + "step": 2146, + "train_runtime": 17439.6567, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.3012121212121213, + "grad_norm": 0.01786281354725361, + "learning_rate": 9.64148218344798e-05, + "loss": 0.01225617527961731, + "num_input_tokens_seen": 35159272, + "step": 2147, + "train_runtime": 17447.7674, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.3018181818181818, + "grad_norm": 0.004462715703994036, + "learning_rate": 9.641124530136209e-05, + "loss": 0.01188221201300621, + "num_input_tokens_seen": 35175648, + "step": 2148, + "train_runtime": 17455.8753, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.3024242424242425, + "grad_norm": 0.0029105639550834894, + "learning_rate": 9.64076670515828e-05, + "loss": 0.012195127084851265, + "num_input_tokens_seen": 35192024, + "step": 2149, + "train_runtime": 17463.9854, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.00953491497784853, + "learning_rate": 9.640408708527429e-05, + "loss": 0.01310074981302023, + "num_input_tokens_seen": 35208400, + "step": 2150, + "train_runtime": 17472.0957, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.3036363636363637, + "grad_norm": 0.018413005396723747, + "learning_rate": 9.640050540256896e-05, + "loss": 0.014166397973895073, + "num_input_tokens_seen": 35224776, + "step": 2151, + "train_runtime": 17480.2096, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.3042424242424242, + "grad_norm": 0.010966022498905659, + "learning_rate": 9.639692200359929e-05, + "loss": 0.013556400313973427, + "num_input_tokens_seen": 35241152, + "step": 2152, + "train_runtime": 17488.3185, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.304848484848485, + "grad_norm": 0.008611932396888733, + "learning_rate": 9.639333688849784e-05, + "loss": 0.010546581819653511, + "num_input_tokens_seen": 35257528, + "step": 2153, + "train_runtime": 17496.4338, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.3054545454545454, + "grad_norm": 0.01247915904968977, + "learning_rate": 9.638975005739719e-05, + "loss": 0.013438762165606022, + "num_input_tokens_seen": 35273904, + "step": 2154, + "train_runtime": 17504.547, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.3060606060606061, + "grad_norm": 0.011569264344871044, + "learning_rate": 9.638616151043003e-05, + "loss": 0.012859106063842773, + "num_input_tokens_seen": 35290280, + "step": 2155, + "train_runtime": 17512.6539, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 0.013723514974117279, + "learning_rate": 9.638257124772909e-05, + "loss": 0.012551365420222282, + "num_input_tokens_seen": 35306656, + "step": 2156, + "train_runtime": 17520.7657, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.3072727272727271, + "grad_norm": 0.008372720330953598, + "learning_rate": 9.637897926942716e-05, + "loss": 0.012505918741226196, + "num_input_tokens_seen": 35323032, + "step": 2157, + "train_runtime": 17528.874, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.3078787878787879, + "grad_norm": 0.008815966546535492, + "learning_rate": 9.637538557565712e-05, + "loss": 0.011104857549071312, + "num_input_tokens_seen": 35339408, + "step": 2158, + "train_runtime": 17536.9823, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.3084848484848486, + "grad_norm": 0.01211005449295044, + "learning_rate": 9.637179016655186e-05, + "loss": 0.01384238712489605, + "num_input_tokens_seen": 35355784, + "step": 2159, + "train_runtime": 17545.0948, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.309090909090909, + "grad_norm": 0.017194336280226707, + "learning_rate": 9.63681930422444e-05, + "loss": 0.014862093143165112, + "num_input_tokens_seen": 35372160, + "step": 2160, + "train_runtime": 17553.2169, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.3096969696969696, + "grad_norm": 0.010664109140634537, + "learning_rate": 9.636459420286779e-05, + "loss": 0.012079211883246899, + "num_input_tokens_seen": 35388536, + "step": 2161, + "train_runtime": 17561.3338, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.3103030303030303, + "grad_norm": 0.01663060300052166, + "learning_rate": 9.636099364855511e-05, + "loss": 0.013289893046021461, + "num_input_tokens_seen": 35404912, + "step": 2162, + "train_runtime": 17569.4461, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.310909090909091, + "grad_norm": 0.012000697664916515, + "learning_rate": 9.635739137943957e-05, + "loss": 0.01403304748237133, + "num_input_tokens_seen": 35421288, + "step": 2163, + "train_runtime": 17577.5567, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.3115151515151515, + "grad_norm": 0.012029038742184639, + "learning_rate": 9.635378739565439e-05, + "loss": 0.013772612437605858, + "num_input_tokens_seen": 35437664, + "step": 2164, + "train_runtime": 17585.6679, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.312121212121212, + "grad_norm": 0.018417729064822197, + "learning_rate": 9.63501816973329e-05, + "loss": 0.013312135823071003, + "num_input_tokens_seen": 35454040, + "step": 2165, + "train_runtime": 17593.7809, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.3127272727272727, + "grad_norm": 0.009397652000188828, + "learning_rate": 9.634657428460844e-05, + "loss": 0.01353902742266655, + "num_input_tokens_seen": 35470416, + "step": 2166, + "train_runtime": 17601.8913, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.3133333333333335, + "grad_norm": 0.01506609097123146, + "learning_rate": 9.634296515761445e-05, + "loss": 0.012555200606584549, + "num_input_tokens_seen": 35486792, + "step": 2167, + "train_runtime": 17610.0027, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.313939393939394, + "grad_norm": 0.008759674616158009, + "learning_rate": 9.633935431648444e-05, + "loss": 0.012441650964319706, + "num_input_tokens_seen": 35503168, + "step": 2168, + "train_runtime": 17618.1138, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.3145454545454545, + "grad_norm": 0.006139460019767284, + "learning_rate": 9.633574176135194e-05, + "loss": 0.011962692253291607, + "num_input_tokens_seen": 35519544, + "step": 2169, + "train_runtime": 17626.2224, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 1.3151515151515152, + "grad_norm": 0.012904582545161247, + "learning_rate": 9.63321274923506e-05, + "loss": 0.013941573910415173, + "num_input_tokens_seen": 35535920, + "step": 2170, + "train_runtime": 17634.3357, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 1.3157575757575757, + "grad_norm": 0.014602463692426682, + "learning_rate": 9.632851150961409e-05, + "loss": 0.01265255268663168, + "num_input_tokens_seen": 35552296, + "step": 2171, + "train_runtime": 17642.4467, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 1.3163636363636364, + "grad_norm": 0.02269907295703888, + "learning_rate": 9.632489381327617e-05, + "loss": 0.012979868799448013, + "num_input_tokens_seen": 35568672, + "step": 2172, + "train_runtime": 17650.5584, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 1.316969696969697, + "grad_norm": 0.005800630897283554, + "learning_rate": 9.632127440347062e-05, + "loss": 0.013988342136144638, + "num_input_tokens_seen": 35585048, + "step": 2173, + "train_runtime": 17658.667, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 1.3175757575757576, + "grad_norm": 0.01643582247197628, + "learning_rate": 9.631765328033134e-05, + "loss": 0.013193177059292793, + "num_input_tokens_seen": 35601424, + "step": 2174, + "train_runtime": 17666.7787, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 1.3181818181818181, + "grad_norm": 0.012858077883720398, + "learning_rate": 9.631403044399227e-05, + "loss": 0.011166015639901161, + "num_input_tokens_seen": 35617800, + "step": 2175, + "train_runtime": 17674.8914, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 1.3187878787878788, + "grad_norm": 0.013520823791623116, + "learning_rate": 9.631040589458741e-05, + "loss": 0.011491977609694004, + "num_input_tokens_seen": 35634176, + "step": 2176, + "train_runtime": 17683.0014, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 1.3193939393939393, + "grad_norm": 0.010789001360535622, + "learning_rate": 9.630677963225082e-05, + "loss": 0.012689574621617794, + "num_input_tokens_seen": 35650552, + "step": 2177, + "train_runtime": 17691.1129, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 1.32, + "grad_norm": 0.02198723889887333, + "learning_rate": 9.630315165711664e-05, + "loss": 0.01258945930749178, + "num_input_tokens_seen": 35666928, + "step": 2178, + "train_runtime": 17699.2219, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 1.3206060606060606, + "grad_norm": 0.009862944483757019, + "learning_rate": 9.629952196931901e-05, + "loss": 0.013526612892746925, + "num_input_tokens_seen": 35683304, + "step": 2179, + "train_runtime": 17707.3338, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 1.3212121212121213, + "grad_norm": 0.010745448991656303, + "learning_rate": 9.629589056899226e-05, + "loss": 0.011776023544371128, + "num_input_tokens_seen": 35699680, + "step": 2180, + "train_runtime": 17715.4454, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 1.3218181818181818, + "grad_norm": 0.011397488415241241, + "learning_rate": 9.629225745627069e-05, + "loss": 0.011914942413568497, + "num_input_tokens_seen": 35716056, + "step": 2181, + "train_runtime": 17723.5593, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 1.3224242424242425, + "grad_norm": 0.006203577853739262, + "learning_rate": 9.628862263128863e-05, + "loss": 0.011155352927744389, + "num_input_tokens_seen": 35732432, + "step": 2182, + "train_runtime": 17731.6685, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 1.323030303030303, + "grad_norm": 0.02225896529853344, + "learning_rate": 9.628498609418058e-05, + "loss": 0.01270115002989769, + "num_input_tokens_seen": 35748808, + "step": 2183, + "train_runtime": 17739.7798, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 1.3236363636363637, + "grad_norm": 0.008858407847583294, + "learning_rate": 9.628134784508104e-05, + "loss": 0.011755815707147121, + "num_input_tokens_seen": 35765184, + "step": 2184, + "train_runtime": 17747.8901, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 1.3242424242424242, + "grad_norm": 0.008333653211593628, + "learning_rate": 9.627770788412455e-05, + "loss": 0.011658458970487118, + "num_input_tokens_seen": 35781560, + "step": 2185, + "train_runtime": 17755.9976, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 1.3248484848484847, + "grad_norm": 0.008924845606088638, + "learning_rate": 9.627406621144578e-05, + "loss": 0.013182871043682098, + "num_input_tokens_seen": 35797936, + "step": 2186, + "train_runtime": 17764.1073, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 1.3254545454545454, + "grad_norm": 0.008099894039332867, + "learning_rate": 9.627042282717942e-05, + "loss": 0.012596973218023777, + "num_input_tokens_seen": 35814312, + "step": 2187, + "train_runtime": 17772.2161, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 1.3260606060606062, + "grad_norm": 0.01450270600616932, + "learning_rate": 9.626677773146022e-05, + "loss": 0.012211315333843231, + "num_input_tokens_seen": 35830688, + "step": 2188, + "train_runtime": 17780.3344, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 1.3266666666666667, + "grad_norm": 0.011908031068742275, + "learning_rate": 9.6263130924423e-05, + "loss": 0.011781557463109493, + "num_input_tokens_seen": 35847064, + "step": 2189, + "train_runtime": 17788.4466, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 1.3272727272727272, + "grad_norm": 0.01242706086486578, + "learning_rate": 9.625948240620269e-05, + "loss": 0.012625298462808132, + "num_input_tokens_seen": 35863440, + "step": 2190, + "train_runtime": 17796.5592, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 1.3278787878787879, + "grad_norm": 0.010798187926411629, + "learning_rate": 9.62558321769342e-05, + "loss": 0.012144774198532104, + "num_input_tokens_seen": 35879816, + "step": 2191, + "train_runtime": 17804.6702, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 1.3284848484848486, + "grad_norm": 0.010587851516902447, + "learning_rate": 9.625218023675255e-05, + "loss": 0.012753032147884369, + "num_input_tokens_seen": 35896192, + "step": 2192, + "train_runtime": 17812.7787, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 1.329090909090909, + "grad_norm": 0.006857162807136774, + "learning_rate": 9.624852658579282e-05, + "loss": 0.012723483145236969, + "num_input_tokens_seen": 35912568, + "step": 2193, + "train_runtime": 17820.8934, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 1.3296969696969696, + "grad_norm": 0.04555951803922653, + "learning_rate": 9.624487122419017e-05, + "loss": 0.011125890538096428, + "num_input_tokens_seen": 35928944, + "step": 2194, + "train_runtime": 17829.0031, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 1.3303030303030303, + "grad_norm": 0.009600553661584854, + "learning_rate": 9.624121415207978e-05, + "loss": 0.01181505061686039, + "num_input_tokens_seen": 35945320, + "step": 2195, + "train_runtime": 17837.1166, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 1.330909090909091, + "grad_norm": 0.006707084830850363, + "learning_rate": 9.623755536959693e-05, + "loss": 0.012222236022353172, + "num_input_tokens_seen": 35961696, + "step": 2196, + "train_runtime": 17845.2322, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 1.3315151515151515, + "grad_norm": 0.04006510227918625, + "learning_rate": 9.623389487687696e-05, + "loss": 0.01126607321202755, + "num_input_tokens_seen": 35978072, + "step": 2197, + "train_runtime": 17853.3441, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 1.332121212121212, + "grad_norm": 0.010973289608955383, + "learning_rate": 9.623023267405525e-05, + "loss": 0.012974373064935207, + "num_input_tokens_seen": 35994448, + "step": 2198, + "train_runtime": 17861.4569, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 1.3327272727272728, + "grad_norm": 0.010835564695298672, + "learning_rate": 9.622656876126726e-05, + "loss": 0.012828577309846878, + "num_input_tokens_seen": 36010824, + "step": 2199, + "train_runtime": 17869.5656, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.04496414214372635, + "learning_rate": 9.622290313864852e-05, + "loss": 0.012135835364460945, + "num_input_tokens_seen": 36027200, + "step": 2200, + "train_runtime": 17877.6728, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 1.333939393939394, + "grad_norm": 0.014242942444980145, + "learning_rate": 9.62192358063346e-05, + "loss": 0.013479562476277351, + "num_input_tokens_seen": 36043576, + "step": 2201, + "train_runtime": 17886.7409, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.3345454545454545, + "grad_norm": 0.013382949866354465, + "learning_rate": 9.621556676446117e-05, + "loss": 0.013178217224776745, + "num_input_tokens_seen": 36059952, + "step": 2202, + "train_runtime": 17894.8493, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.3351515151515152, + "grad_norm": 0.0031381326261907816, + "learning_rate": 9.621189601316391e-05, + "loss": 0.011984724551439285, + "num_input_tokens_seen": 36076328, + "step": 2203, + "train_runtime": 17903.1459, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.3357575757575757, + "grad_norm": 0.01637393608689308, + "learning_rate": 9.620822355257861e-05, + "loss": 0.013623081147670746, + "num_input_tokens_seen": 36092704, + "step": 2204, + "train_runtime": 17911.2629, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.3363636363636364, + "grad_norm": 0.008263975381851196, + "learning_rate": 9.620454938284112e-05, + "loss": 0.012027481570839882, + "num_input_tokens_seen": 36109080, + "step": 2205, + "train_runtime": 17919.3786, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.336969696969697, + "grad_norm": 0.006873487960547209, + "learning_rate": 9.620087350408732e-05, + "loss": 0.011758643202483654, + "num_input_tokens_seen": 36125456, + "step": 2206, + "train_runtime": 17927.4869, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.3375757575757576, + "grad_norm": 0.024665268138051033, + "learning_rate": 9.619719591645317e-05, + "loss": 0.013782843947410583, + "num_input_tokens_seen": 36141832, + "step": 2207, + "train_runtime": 17935.6003, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.3381818181818181, + "grad_norm": 0.02779507078230381, + "learning_rate": 9.619351662007473e-05, + "loss": 0.012481366284191608, + "num_input_tokens_seen": 36158208, + "step": 2208, + "train_runtime": 17943.7156, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.3387878787878789, + "grad_norm": 0.013757690787315369, + "learning_rate": 9.618983561508805e-05, + "loss": 0.01145961880683899, + "num_input_tokens_seen": 36174584, + "step": 2209, + "train_runtime": 17951.8332, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.3393939393939394, + "grad_norm": 0.010363428853452206, + "learning_rate": 9.618615290162931e-05, + "loss": 0.012521384283900261, + "num_input_tokens_seen": 36190960, + "step": 2210, + "train_runtime": 17959.9421, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.34, + "grad_norm": 0.009606797248125076, + "learning_rate": 9.618246847983471e-05, + "loss": 0.011278870515525341, + "num_input_tokens_seen": 36207336, + "step": 2211, + "train_runtime": 17968.0555, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.3406060606060606, + "grad_norm": 0.01277079712599516, + "learning_rate": 9.617878234984055e-05, + "loss": 0.013770547695457935, + "num_input_tokens_seen": 36223712, + "step": 2212, + "train_runtime": 17976.1678, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.3412121212121213, + "grad_norm": 0.03445848822593689, + "learning_rate": 9.617509451178317e-05, + "loss": 0.013981115072965622, + "num_input_tokens_seen": 36240088, + "step": 2213, + "train_runtime": 17984.2782, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.3418181818181818, + "grad_norm": 0.014536652714014053, + "learning_rate": 9.617140496579896e-05, + "loss": 0.012220301665365696, + "num_input_tokens_seen": 36256464, + "step": 2214, + "train_runtime": 17992.387, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.3424242424242423, + "grad_norm": 0.01611790619790554, + "learning_rate": 9.616771371202437e-05, + "loss": 0.013024747371673584, + "num_input_tokens_seen": 36272840, + "step": 2215, + "train_runtime": 18000.4999, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.343030303030303, + "grad_norm": 0.01158247608691454, + "learning_rate": 9.616402075059597e-05, + "loss": 0.012004833668470383, + "num_input_tokens_seen": 36289216, + "step": 2216, + "train_runtime": 18008.6151, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.3436363636363637, + "grad_norm": 0.023385636508464813, + "learning_rate": 9.616032608165034e-05, + "loss": 0.012171284295618534, + "num_input_tokens_seen": 36305592, + "step": 2217, + "train_runtime": 18016.7328, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.3442424242424242, + "grad_norm": 0.0019628936424851418, + "learning_rate": 9.615662970532416e-05, + "loss": 0.011382883414626122, + "num_input_tokens_seen": 36321968, + "step": 2218, + "train_runtime": 18024.8441, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.3448484848484847, + "grad_norm": 0.006762090139091015, + "learning_rate": 9.615293162175412e-05, + "loss": 0.012919168919324875, + "num_input_tokens_seen": 36338344, + "step": 2219, + "train_runtime": 18032.9565, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.3454545454545455, + "grad_norm": 0.009785478003323078, + "learning_rate": 9.6149231831077e-05, + "loss": 0.012674746103584766, + "num_input_tokens_seen": 36354720, + "step": 2220, + "train_runtime": 18041.066, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.3460606060606062, + "grad_norm": 0.010461698286235332, + "learning_rate": 9.614553033342969e-05, + "loss": 0.011975216679275036, + "num_input_tokens_seen": 36371096, + "step": 2221, + "train_runtime": 18049.1775, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.3466666666666667, + "grad_norm": 0.008190472610294819, + "learning_rate": 9.614182712894907e-05, + "loss": 0.012141593731939793, + "num_input_tokens_seen": 36387472, + "step": 2222, + "train_runtime": 18057.2852, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.3472727272727272, + "grad_norm": 0.008826551958918571, + "learning_rate": 9.613812221777212e-05, + "loss": 0.012635836377739906, + "num_input_tokens_seen": 36403848, + "step": 2223, + "train_runtime": 18065.3978, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.347878787878788, + "grad_norm": 0.010394547134637833, + "learning_rate": 9.613441560003588e-05, + "loss": 0.01198198739439249, + "num_input_tokens_seen": 36420224, + "step": 2224, + "train_runtime": 18073.5128, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.3484848484848486, + "grad_norm": 0.010476038791239262, + "learning_rate": 9.613070727587745e-05, + "loss": 0.012501906603574753, + "num_input_tokens_seen": 36436600, + "step": 2225, + "train_runtime": 18081.6231, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.3490909090909091, + "grad_norm": 0.010277483612298965, + "learning_rate": 9.6126997245434e-05, + "loss": 0.01279398612678051, + "num_input_tokens_seen": 36452976, + "step": 2226, + "train_runtime": 18089.7358, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.3496969696969696, + "grad_norm": 0.022543100640177727, + "learning_rate": 9.612328550884274e-05, + "loss": 0.012489145621657372, + "num_input_tokens_seen": 36469352, + "step": 2227, + "train_runtime": 18097.8572, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.3503030303030303, + "grad_norm": 0.012405750341713428, + "learning_rate": 9.611957206624098e-05, + "loss": 0.011780200526118279, + "num_input_tokens_seen": 36485728, + "step": 2228, + "train_runtime": 18105.9862, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.3509090909090908, + "grad_norm": 0.024704037234187126, + "learning_rate": 9.611585691776606e-05, + "loss": 0.012659060768783092, + "num_input_tokens_seen": 36502104, + "step": 2229, + "train_runtime": 18114.1069, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.3515151515151516, + "grad_norm": 0.004990878514945507, + "learning_rate": 9.61121400635554e-05, + "loss": 0.010893161408603191, + "num_input_tokens_seen": 36518480, + "step": 2230, + "train_runtime": 18122.234, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.352121212121212, + "grad_norm": 0.011252249591052532, + "learning_rate": 9.610842150374647e-05, + "loss": 0.013744168914854527, + "num_input_tokens_seen": 36534856, + "step": 2231, + "train_runtime": 18130.3576, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.3527272727272728, + "grad_norm": 0.009260553866624832, + "learning_rate": 9.610470123847682e-05, + "loss": 0.011988443322479725, + "num_input_tokens_seen": 36551232, + "step": 2232, + "train_runtime": 18138.4799, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.3533333333333333, + "grad_norm": 0.011449086479842663, + "learning_rate": 9.610097926788406e-05, + "loss": 0.012442278675734997, + "num_input_tokens_seen": 36567608, + "step": 2233, + "train_runtime": 18146.6009, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.353939393939394, + "grad_norm": 0.012972496449947357, + "learning_rate": 9.609725559210586e-05, + "loss": 0.012432006187736988, + "num_input_tokens_seen": 36583984, + "step": 2234, + "train_runtime": 18154.7212, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.3545454545454545, + "grad_norm": 0.010363031178712845, + "learning_rate": 9.609353021127994e-05, + "loss": 0.011616806499660015, + "num_input_tokens_seen": 36600360, + "step": 2235, + "train_runtime": 18162.8432, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.3551515151515152, + "grad_norm": 0.008639562875032425, + "learning_rate": 9.60898031255441e-05, + "loss": 0.012379256077110767, + "num_input_tokens_seen": 36616736, + "step": 2236, + "train_runtime": 18170.964, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.3557575757575757, + "grad_norm": 0.010502860881388187, + "learning_rate": 9.60860743350362e-05, + "loss": 0.011975327506661415, + "num_input_tokens_seen": 36633112, + "step": 2237, + "train_runtime": 18179.0854, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.3563636363636364, + "grad_norm": 0.007005107123404741, + "learning_rate": 9.608234383989416e-05, + "loss": 0.011983048170804977, + "num_input_tokens_seen": 36649488, + "step": 2238, + "train_runtime": 18187.2075, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.356969696969697, + "grad_norm": 0.005663185380399227, + "learning_rate": 9.607861164025596e-05, + "loss": 0.011925374157726765, + "num_input_tokens_seen": 36665864, + "step": 2239, + "train_runtime": 18195.3344, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.3575757575757577, + "grad_norm": 0.011430823244154453, + "learning_rate": 9.607487773625967e-05, + "loss": 0.011923066340386868, + "num_input_tokens_seen": 36682240, + "step": 2240, + "train_runtime": 18203.4567, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.3581818181818182, + "grad_norm": 0.015044212341308594, + "learning_rate": 9.607114212804335e-05, + "loss": 0.01411314494907856, + "num_input_tokens_seen": 36698616, + "step": 2241, + "train_runtime": 18211.5792, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.3587878787878789, + "grad_norm": 0.01229457464069128, + "learning_rate": 9.606740481574522e-05, + "loss": 0.013101696036756039, + "num_input_tokens_seen": 36714992, + "step": 2242, + "train_runtime": 18219.7, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.3593939393939394, + "grad_norm": 0.010342378169298172, + "learning_rate": 9.606366579950348e-05, + "loss": 0.012473942711949348, + "num_input_tokens_seen": 36731368, + "step": 2243, + "train_runtime": 18227.8219, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.010652092285454273, + "learning_rate": 9.605992507945647e-05, + "loss": 0.012655368074774742, + "num_input_tokens_seen": 36747744, + "step": 2244, + "train_runtime": 18235.9442, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.3606060606060606, + "grad_norm": 0.005535294767469168, + "learning_rate": 9.60561826557425e-05, + "loss": 0.01122352760285139, + "num_input_tokens_seen": 36764120, + "step": 2245, + "train_runtime": 18244.067, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.3612121212121213, + "grad_norm": 0.01733524352312088, + "learning_rate": 9.605243852850006e-05, + "loss": 0.013020082376897335, + "num_input_tokens_seen": 36780496, + "step": 2246, + "train_runtime": 18252.1891, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.3618181818181818, + "grad_norm": 0.018927304074168205, + "learning_rate": 9.604869269786758e-05, + "loss": 0.013990607112646103, + "num_input_tokens_seen": 36796872, + "step": 2247, + "train_runtime": 18260.3112, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.3624242424242423, + "grad_norm": 0.009154443629086018, + "learning_rate": 9.604494516398364e-05, + "loss": 0.010833939537405968, + "num_input_tokens_seen": 36813248, + "step": 2248, + "train_runtime": 18268.4336, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.363030303030303, + "grad_norm": 0.010260475799441338, + "learning_rate": 9.604119592698684e-05, + "loss": 0.011943116784095764, + "num_input_tokens_seen": 36829624, + "step": 2249, + "train_runtime": 18276.5587, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.010942655615508556, + "learning_rate": 9.603744498701585e-05, + "loss": 0.01344153843820095, + "num_input_tokens_seen": 36846000, + "step": 2250, + "train_runtime": 18284.6808, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.3642424242424243, + "grad_norm": 0.01657605543732643, + "learning_rate": 9.603369234420945e-05, + "loss": 0.01295737735927105, + "num_input_tokens_seen": 36862376, + "step": 2251, + "train_runtime": 18292.8034, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.3648484848484848, + "grad_norm": 0.013552656397223473, + "learning_rate": 9.602993799870642e-05, + "loss": 0.011583628132939339, + "num_input_tokens_seen": 36878752, + "step": 2252, + "train_runtime": 18300.9349, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.3654545454545455, + "grad_norm": 0.008103492669761181, + "learning_rate": 9.602618195064558e-05, + "loss": 0.01070526335388422, + "num_input_tokens_seen": 36895128, + "step": 2253, + "train_runtime": 18309.0571, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.3660606060606062, + "grad_norm": 0.01473317015916109, + "learning_rate": 9.602242420016594e-05, + "loss": 0.014281337149441242, + "num_input_tokens_seen": 36911504, + "step": 2254, + "train_runtime": 18317.1808, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.3666666666666667, + "grad_norm": 0.013630642555654049, + "learning_rate": 9.601866474740645e-05, + "loss": 0.013544391840696335, + "num_input_tokens_seen": 36927880, + "step": 2255, + "train_runtime": 18325.3025, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.3672727272727272, + "grad_norm": 0.010557189583778381, + "learning_rate": 9.601490359250615e-05, + "loss": 0.011742150411009789, + "num_input_tokens_seen": 36944256, + "step": 2256, + "train_runtime": 18333.4243, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.367878787878788, + "grad_norm": 0.0053065926767885685, + "learning_rate": 9.60111407356042e-05, + "loss": 0.011019930243492126, + "num_input_tokens_seen": 36960632, + "step": 2257, + "train_runtime": 18341.5509, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.3684848484848484, + "grad_norm": 0.010890625417232513, + "learning_rate": 9.600737617683975e-05, + "loss": 0.01254215557128191, + "num_input_tokens_seen": 36977008, + "step": 2258, + "train_runtime": 18349.6736, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.3690909090909091, + "grad_norm": 0.0021913303062319756, + "learning_rate": 9.600360991635204e-05, + "loss": 0.011373097077012062, + "num_input_tokens_seen": 36993384, + "step": 2259, + "train_runtime": 18357.7964, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.3696969696969696, + "grad_norm": 0.015858152881264687, + "learning_rate": 9.59998419542804e-05, + "loss": 0.01217947993427515, + "num_input_tokens_seen": 37009760, + "step": 2260, + "train_runtime": 18365.9166, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.3703030303030304, + "grad_norm": 0.01206044852733612, + "learning_rate": 9.599607229076418e-05, + "loss": 0.012296038679778576, + "num_input_tokens_seen": 37026136, + "step": 2261, + "train_runtime": 18374.0366, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.3709090909090909, + "grad_norm": 0.016060523688793182, + "learning_rate": 9.599230092594283e-05, + "loss": 0.012836135923862457, + "num_input_tokens_seen": 37042512, + "step": 2262, + "train_runtime": 18382.157, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.3715151515151516, + "grad_norm": 0.012261572293937206, + "learning_rate": 9.598852785995581e-05, + "loss": 0.013203001581132412, + "num_input_tokens_seen": 37058888, + "step": 2263, + "train_runtime": 18390.2794, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.372121212121212, + "grad_norm": 0.011548114009201527, + "learning_rate": 9.598475309294272e-05, + "loss": 0.012114378623664379, + "num_input_tokens_seen": 37075264, + "step": 2264, + "train_runtime": 18398.4008, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.3727272727272728, + "grad_norm": 0.007986263372004032, + "learning_rate": 9.598097662504315e-05, + "loss": 0.012728969566524029, + "num_input_tokens_seen": 37091640, + "step": 2265, + "train_runtime": 18406.5207, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.3733333333333333, + "grad_norm": 0.010519546456634998, + "learning_rate": 9.597719845639682e-05, + "loss": 0.012828621082007885, + "num_input_tokens_seen": 37108016, + "step": 2266, + "train_runtime": 18414.6424, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.373939393939394, + "grad_norm": 0.013495163060724735, + "learning_rate": 9.597341858714343e-05, + "loss": 0.01284085027873516, + "num_input_tokens_seen": 37124392, + "step": 2267, + "train_runtime": 18422.7642, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.3745454545454545, + "grad_norm": 0.016053834930062294, + "learning_rate": 9.596963701742285e-05, + "loss": 0.013004240579903126, + "num_input_tokens_seen": 37140768, + "step": 2268, + "train_runtime": 18430.8856, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.375151515151515, + "grad_norm": 0.012462491169571877, + "learning_rate": 9.59658537473749e-05, + "loss": 0.012112741358578205, + "num_input_tokens_seen": 37157144, + "step": 2269, + "train_runtime": 18439.0076, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.3757575757575757, + "grad_norm": 0.008608606643974781, + "learning_rate": 9.596206877713953e-05, + "loss": 0.011361487209796906, + "num_input_tokens_seen": 37173520, + "step": 2270, + "train_runtime": 18447.1339, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.3763636363636365, + "grad_norm": 0.01020008884370327, + "learning_rate": 9.595828210685675e-05, + "loss": 0.011964777484536171, + "num_input_tokens_seen": 37189896, + "step": 2271, + "train_runtime": 18455.2562, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.376969696969697, + "grad_norm": 0.00379827618598938, + "learning_rate": 9.59544937366666e-05, + "loss": 0.011180602014064789, + "num_input_tokens_seen": 37206272, + "step": 2272, + "train_runtime": 18463.378, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.3775757575757575, + "grad_norm": 0.016575191169977188, + "learning_rate": 9.595070366670924e-05, + "loss": 0.014480410143733025, + "num_input_tokens_seen": 37222648, + "step": 2273, + "train_runtime": 18471.4999, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.3781818181818182, + "grad_norm": 0.013181117363274097, + "learning_rate": 9.594691189712482e-05, + "loss": 0.012310860678553581, + "num_input_tokens_seen": 37239024, + "step": 2274, + "train_runtime": 18479.6203, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.378787878787879, + "grad_norm": 0.013198458589613438, + "learning_rate": 9.594311842805362e-05, + "loss": 0.012114742770791054, + "num_input_tokens_seen": 37255400, + "step": 2275, + "train_runtime": 18487.7432, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.3793939393939394, + "grad_norm": 0.013458278961479664, + "learning_rate": 9.593932325963593e-05, + "loss": 0.013166999444365501, + "num_input_tokens_seen": 37271776, + "step": 2276, + "train_runtime": 18495.8672, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.38, + "grad_norm": 0.003906270023435354, + "learning_rate": 9.593552639201213e-05, + "loss": 0.01128899771720171, + "num_input_tokens_seen": 37288152, + "step": 2277, + "train_runtime": 18503.9911, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.3806060606060606, + "grad_norm": 0.02250552736222744, + "learning_rate": 9.593172782532268e-05, + "loss": 0.012319961562752724, + "num_input_tokens_seen": 37304528, + "step": 2278, + "train_runtime": 18512.1132, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.3812121212121213, + "grad_norm": 0.009309303015470505, + "learning_rate": 9.592792755970806e-05, + "loss": 0.012765881605446339, + "num_input_tokens_seen": 37320904, + "step": 2279, + "train_runtime": 18520.2349, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.3818181818181818, + "grad_norm": 0.010714930482208729, + "learning_rate": 9.592412559530884e-05, + "loss": 0.012022463604807854, + "num_input_tokens_seen": 37337280, + "step": 2280, + "train_runtime": 18528.3557, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.3824242424242423, + "grad_norm": 0.011457400396466255, + "learning_rate": 9.592032193226564e-05, + "loss": 0.01279627624899149, + "num_input_tokens_seen": 37353656, + "step": 2281, + "train_runtime": 18536.4775, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.383030303030303, + "grad_norm": 0.01243502926081419, + "learning_rate": 9.591651657071916e-05, + "loss": 0.011872101575136185, + "num_input_tokens_seen": 37370032, + "step": 2282, + "train_runtime": 18544.6001, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.3836363636363636, + "grad_norm": 0.005702585447579622, + "learning_rate": 9.591270951081016e-05, + "loss": 0.01134478859603405, + "num_input_tokens_seen": 37386408, + "step": 2283, + "train_runtime": 18552.7226, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.3842424242424243, + "grad_norm": 0.021376250311732292, + "learning_rate": 9.590890075267943e-05, + "loss": 0.011881144717335701, + "num_input_tokens_seen": 37402784, + "step": 2284, + "train_runtime": 18560.8455, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.3848484848484848, + "grad_norm": 0.008453167043626308, + "learning_rate": 9.590509029646788e-05, + "loss": 0.012073191814124584, + "num_input_tokens_seen": 37419160, + "step": 2285, + "train_runtime": 18568.9689, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.3854545454545455, + "grad_norm": 0.008096368052065372, + "learning_rate": 9.590127814231642e-05, + "loss": 0.011614611372351646, + "num_input_tokens_seen": 37435536, + "step": 2286, + "train_runtime": 18577.09, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.386060606060606, + "grad_norm": 0.006734437309205532, + "learning_rate": 9.589746429036609e-05, + "loss": 0.012934810481965542, + "num_input_tokens_seen": 37451912, + "step": 2287, + "train_runtime": 18585.2112, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 0.01690874621272087, + "learning_rate": 9.589364874075793e-05, + "loss": 0.012905376963317394, + "num_input_tokens_seen": 37468288, + "step": 2288, + "train_runtime": 18593.3338, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.3872727272727272, + "grad_norm": 0.011003658175468445, + "learning_rate": 9.588983149363306e-05, + "loss": 0.014468666166067123, + "num_input_tokens_seen": 37484664, + "step": 2289, + "train_runtime": 18601.4552, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.387878787878788, + "grad_norm": 0.011732536368072033, + "learning_rate": 9.588601254913272e-05, + "loss": 0.013740056194365025, + "num_input_tokens_seen": 37501040, + "step": 2290, + "train_runtime": 18609.5761, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.3884848484848484, + "grad_norm": 0.005165507551282644, + "learning_rate": 9.588219190739811e-05, + "loss": 0.01104014739394188, + "num_input_tokens_seen": 37517416, + "step": 2291, + "train_runtime": 18617.6996, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.3890909090909092, + "grad_norm": 0.008920264430344105, + "learning_rate": 9.587836956857059e-05, + "loss": 0.011560751125216484, + "num_input_tokens_seen": 37533792, + "step": 2292, + "train_runtime": 18625.8217, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.3896969696969697, + "grad_norm": 0.014716854318976402, + "learning_rate": 9.587454553279152e-05, + "loss": 0.012428359128534794, + "num_input_tokens_seen": 37550168, + "step": 2293, + "train_runtime": 18633.9439, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.3903030303030304, + "grad_norm": 0.005737116560339928, + "learning_rate": 9.587071980020233e-05, + "loss": 0.011969603598117828, + "num_input_tokens_seen": 37566544, + "step": 2294, + "train_runtime": 18642.0653, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.3909090909090909, + "grad_norm": 0.009410570375621319, + "learning_rate": 9.586689237094455e-05, + "loss": 0.012249289080500603, + "num_input_tokens_seen": 37582920, + "step": 2295, + "train_runtime": 18650.1861, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.3915151515151516, + "grad_norm": 0.0060779326595366, + "learning_rate": 9.586306324515976e-05, + "loss": 0.011125440709292889, + "num_input_tokens_seen": 37599296, + "step": 2296, + "train_runtime": 18658.3067, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.392121212121212, + "grad_norm": 0.006624804809689522, + "learning_rate": 9.585923242298955e-05, + "loss": 0.01228361390531063, + "num_input_tokens_seen": 37615672, + "step": 2297, + "train_runtime": 18666.4325, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.3927272727272726, + "grad_norm": 0.009875877760350704, + "learning_rate": 9.585539990457566e-05, + "loss": 0.012501105666160583, + "num_input_tokens_seen": 37632048, + "step": 2298, + "train_runtime": 18674.5538, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.3933333333333333, + "grad_norm": 0.009113671258091927, + "learning_rate": 9.585156569005982e-05, + "loss": 0.011152594350278378, + "num_input_tokens_seen": 37648424, + "step": 2299, + "train_runtime": 18682.676, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.393939393939394, + "grad_norm": 0.007954055443406105, + "learning_rate": 9.584772977958386e-05, + "loss": 0.011372476816177368, + "num_input_tokens_seen": 37664800, + "step": 2300, + "train_runtime": 18690.7976, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.3945454545454545, + "grad_norm": 0.002810339443385601, + "learning_rate": 9.584389217328966e-05, + "loss": 0.012347941286861897, + "num_input_tokens_seen": 37681176, + "step": 2301, + "train_runtime": 18699.9071, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.395151515151515, + "grad_norm": 0.01631283573806286, + "learning_rate": 9.584005287131917e-05, + "loss": 0.01252490933984518, + "num_input_tokens_seen": 37697552, + "step": 2302, + "train_runtime": 18708.0344, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.3957575757575758, + "grad_norm": 0.0038756714202463627, + "learning_rate": 9.583621187381437e-05, + "loss": 0.010715951211750507, + "num_input_tokens_seen": 37713928, + "step": 2303, + "train_runtime": 18716.1541, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.3963636363636365, + "grad_norm": 0.007086980622261763, + "learning_rate": 9.583236918091738e-05, + "loss": 0.012314547784626484, + "num_input_tokens_seen": 37730304, + "step": 2304, + "train_runtime": 18724.2742, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.396969696969697, + "grad_norm": 0.005804756656289101, + "learning_rate": 9.58285247927703e-05, + "loss": 0.010950103402137756, + "num_input_tokens_seen": 37746680, + "step": 2305, + "train_runtime": 18732.3971, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.3975757575757575, + "grad_norm": 0.009648945182561874, + "learning_rate": 9.582467870951533e-05, + "loss": 0.0125240758061409, + "num_input_tokens_seen": 37763056, + "step": 2306, + "train_runtime": 18740.5161, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.3981818181818182, + "grad_norm": 0.009216394275426865, + "learning_rate": 9.582083093129473e-05, + "loss": 0.01159854419529438, + "num_input_tokens_seen": 37779432, + "step": 2307, + "train_runtime": 18748.6358, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.398787878787879, + "grad_norm": 0.007218767423182726, + "learning_rate": 9.581698145825084e-05, + "loss": 0.0115420613437891, + "num_input_tokens_seen": 37795808, + "step": 2308, + "train_runtime": 18756.7561, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.3993939393939394, + "grad_norm": 0.009544998407363892, + "learning_rate": 9.581313029052602e-05, + "loss": 0.01166750118136406, + "num_input_tokens_seen": 37812184, + "step": 2309, + "train_runtime": 18764.8776, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.4, + "grad_norm": 0.006320256739854813, + "learning_rate": 9.580927742826274e-05, + "loss": 0.012228483334183693, + "num_input_tokens_seen": 37828560, + "step": 2310, + "train_runtime": 18773.0022, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.4006060606060606, + "grad_norm": 0.006051518488675356, + "learning_rate": 9.580542287160348e-05, + "loss": 0.013318963348865509, + "num_input_tokens_seen": 37844936, + "step": 2311, + "train_runtime": 18781.1243, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.4012121212121211, + "grad_norm": 0.006172268185764551, + "learning_rate": 9.580156662069084e-05, + "loss": 0.013921252451837063, + "num_input_tokens_seen": 37861312, + "step": 2312, + "train_runtime": 18789.2452, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.4018181818181819, + "grad_norm": 0.011184340342879295, + "learning_rate": 9.579770867566744e-05, + "loss": 0.01270650140941143, + "num_input_tokens_seen": 37877688, + "step": 2313, + "train_runtime": 18797.3677, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.4024242424242424, + "grad_norm": 0.009068330749869347, + "learning_rate": 9.5793849036676e-05, + "loss": 0.012799710035324097, + "num_input_tokens_seen": 37894064, + "step": 2314, + "train_runtime": 18805.4876, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.403030303030303, + "grad_norm": 0.011376739479601383, + "learning_rate": 9.578998770385925e-05, + "loss": 0.012034818530082703, + "num_input_tokens_seen": 37910440, + "step": 2315, + "train_runtime": 18813.609, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.4036363636363636, + "grad_norm": 0.008086096495389938, + "learning_rate": 9.578612467736004e-05, + "loss": 0.01248577143996954, + "num_input_tokens_seen": 37926816, + "step": 2316, + "train_runtime": 18821.7325, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.4042424242424243, + "grad_norm": 0.009844251908361912, + "learning_rate": 9.578225995732123e-05, + "loss": 0.010496463626623154, + "num_input_tokens_seen": 37943192, + "step": 2317, + "train_runtime": 18829.8558, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.4048484848484848, + "grad_norm": 0.013504873029887676, + "learning_rate": 9.577839354388577e-05, + "loss": 0.013046303763985634, + "num_input_tokens_seen": 37959568, + "step": 2318, + "train_runtime": 18837.9767, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.4054545454545455, + "grad_norm": 0.008937161415815353, + "learning_rate": 9.577452543719669e-05, + "loss": 0.01245005801320076, + "num_input_tokens_seen": 37975944, + "step": 2319, + "train_runtime": 18846.0993, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.406060606060606, + "grad_norm": 0.00886614341288805, + "learning_rate": 9.577065563739706e-05, + "loss": 0.012116841971874237, + "num_input_tokens_seen": 37992320, + "step": 2320, + "train_runtime": 18854.221, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.4066666666666667, + "grad_norm": 0.01636064611375332, + "learning_rate": 9.576678414463001e-05, + "loss": 0.012819192372262478, + "num_input_tokens_seen": 38008696, + "step": 2321, + "train_runtime": 18862.3418, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.4072727272727272, + "grad_norm": 0.008886885829269886, + "learning_rate": 9.576291095903875e-05, + "loss": 0.012500411830842495, + "num_input_tokens_seen": 38025072, + "step": 2322, + "train_runtime": 18870.4635, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.407878787878788, + "grad_norm": 0.009175439365208149, + "learning_rate": 9.575903608076652e-05, + "loss": 0.011257543228566647, + "num_input_tokens_seen": 38041448, + "step": 2323, + "train_runtime": 18878.5841, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.4084848484848485, + "grad_norm": 0.008139233104884624, + "learning_rate": 9.575515950995666e-05, + "loss": 0.012008091434836388, + "num_input_tokens_seen": 38057824, + "step": 2324, + "train_runtime": 18886.7035, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.4090909090909092, + "grad_norm": 0.010456080548465252, + "learning_rate": 9.575128124675257e-05, + "loss": 0.012284350581467152, + "num_input_tokens_seen": 38074200, + "step": 2325, + "train_runtime": 18894.8317, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.4096969696969697, + "grad_norm": 0.010791217908263206, + "learning_rate": 9.574740129129767e-05, + "loss": 0.012802988290786743, + "num_input_tokens_seen": 38090576, + "step": 2326, + "train_runtime": 18902.9531, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.4103030303030302, + "grad_norm": 0.010405773296952248, + "learning_rate": 9.574351964373548e-05, + "loss": 0.013286586850881577, + "num_input_tokens_seen": 38106952, + "step": 2327, + "train_runtime": 18911.0757, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.410909090909091, + "grad_norm": 0.013858595862984657, + "learning_rate": 9.573963630420958e-05, + "loss": 0.011597267352044582, + "num_input_tokens_seen": 38123328, + "step": 2328, + "train_runtime": 18919.1967, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.4115151515151516, + "grad_norm": 0.0037960167974233627, + "learning_rate": 9.573575127286361e-05, + "loss": 0.01243551168590784, + "num_input_tokens_seen": 38139704, + "step": 2329, + "train_runtime": 18927.3156, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.412121212121212, + "grad_norm": 0.01110562589019537, + "learning_rate": 9.573186454984127e-05, + "loss": 0.012111995369195938, + "num_input_tokens_seen": 38156080, + "step": 2330, + "train_runtime": 18935.4381, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.4127272727272726, + "grad_norm": 0.013541216030716896, + "learning_rate": 9.572797613528633e-05, + "loss": 0.012958360835909843, + "num_input_tokens_seen": 38172456, + "step": 2331, + "train_runtime": 18943.5591, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 0.008804292418062687, + "learning_rate": 9.572408602934258e-05, + "loss": 0.012310491874814034, + "num_input_tokens_seen": 38188832, + "step": 2332, + "train_runtime": 18951.6803, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.413939393939394, + "grad_norm": 0.00845362152904272, + "learning_rate": 9.572019423215395e-05, + "loss": 0.011901391670107841, + "num_input_tokens_seen": 38205208, + "step": 2333, + "train_runtime": 18959.8011, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.4145454545454546, + "grad_norm": 0.012662705034017563, + "learning_rate": 9.571630074386436e-05, + "loss": 0.01295614056289196, + "num_input_tokens_seen": 38221584, + "step": 2334, + "train_runtime": 18967.9217, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.415151515151515, + "grad_norm": 0.01324890460819006, + "learning_rate": 9.571240556461784e-05, + "loss": 0.012609636411070824, + "num_input_tokens_seen": 38237960, + "step": 2335, + "train_runtime": 18976.0431, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.4157575757575758, + "grad_norm": 0.009312913753092289, + "learning_rate": 9.570850869455845e-05, + "loss": 0.01243036799132824, + "num_input_tokens_seen": 38254336, + "step": 2336, + "train_runtime": 18984.1637, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.4163636363636365, + "grad_norm": 0.009475616738200188, + "learning_rate": 9.570461013383036e-05, + "loss": 0.011987818405032158, + "num_input_tokens_seen": 38270712, + "step": 2337, + "train_runtime": 18992.2853, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.416969696969697, + "grad_norm": 0.017518380656838417, + "learning_rate": 9.570070988257772e-05, + "loss": 0.012605913914740086, + "num_input_tokens_seen": 38287088, + "step": 2338, + "train_runtime": 19000.4061, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.4175757575757575, + "grad_norm": 0.013496988452970982, + "learning_rate": 9.569680794094483e-05, + "loss": 0.012542678974568844, + "num_input_tokens_seen": 38303464, + "step": 2339, + "train_runtime": 19008.5325, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.4181818181818182, + "grad_norm": 0.01436201948672533, + "learning_rate": 9.5692904309076e-05, + "loss": 0.011861974373459816, + "num_input_tokens_seen": 38319840, + "step": 2340, + "train_runtime": 19016.6545, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.4187878787878787, + "grad_norm": 0.009680923074483871, + "learning_rate": 9.568899898711563e-05, + "loss": 0.013890981674194336, + "num_input_tokens_seen": 38336216, + "step": 2341, + "train_runtime": 19024.7747, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.4193939393939394, + "grad_norm": 0.00575306685641408, + "learning_rate": 9.568509197520816e-05, + "loss": 0.01138025987893343, + "num_input_tokens_seen": 38352592, + "step": 2342, + "train_runtime": 19032.8957, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.42, + "grad_norm": 0.008749360218644142, + "learning_rate": 9.568118327349811e-05, + "loss": 0.012537163682281971, + "num_input_tokens_seen": 38368968, + "step": 2343, + "train_runtime": 19041.0165, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.4206060606060606, + "grad_norm": 0.006181587930768728, + "learning_rate": 9.567727288213005e-05, + "loss": 0.011792626231908798, + "num_input_tokens_seen": 38385344, + "step": 2344, + "train_runtime": 19049.1379, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.4212121212121211, + "grad_norm": 0.004062063992023468, + "learning_rate": 9.567336080124861e-05, + "loss": 0.011532892473042011, + "num_input_tokens_seen": 38401720, + "step": 2345, + "train_runtime": 19057.2588, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.4218181818181819, + "grad_norm": 0.007840434089303017, + "learning_rate": 9.566944703099852e-05, + "loss": 0.012082960456609726, + "num_input_tokens_seen": 38418096, + "step": 2346, + "train_runtime": 19065.38, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.4224242424242424, + "grad_norm": 0.006259352900087833, + "learning_rate": 9.56655315715245e-05, + "loss": 0.011257804930210114, + "num_input_tokens_seen": 38434472, + "step": 2347, + "train_runtime": 19073.5015, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.423030303030303, + "grad_norm": 0.018244467675685883, + "learning_rate": 9.56616144229714e-05, + "loss": 0.01377645879983902, + "num_input_tokens_seen": 38450848, + "step": 2348, + "train_runtime": 19081.6232, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.4236363636363636, + "grad_norm": 0.007569839712232351, + "learning_rate": 9.565769558548409e-05, + "loss": 0.010858017019927502, + "num_input_tokens_seen": 38467224, + "step": 2349, + "train_runtime": 19089.7453, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.4242424242424243, + "grad_norm": 0.009098520502448082, + "learning_rate": 9.565377505920756e-05, + "loss": 0.012635212391614914, + "num_input_tokens_seen": 38483600, + "step": 2350, + "train_runtime": 19097.8661, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.4248484848484848, + "grad_norm": 0.011632603593170643, + "learning_rate": 9.564985284428679e-05, + "loss": 0.012759126722812653, + "num_input_tokens_seen": 38499976, + "step": 2351, + "train_runtime": 19105.9874, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.4254545454545455, + "grad_norm": 0.010547060519456863, + "learning_rate": 9.564592894086685e-05, + "loss": 0.01271246001124382, + "num_input_tokens_seen": 38516352, + "step": 2352, + "train_runtime": 19114.1093, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.426060606060606, + "grad_norm": 0.013726288452744484, + "learning_rate": 9.564200334909292e-05, + "loss": 0.013344586826860905, + "num_input_tokens_seen": 38532728, + "step": 2353, + "train_runtime": 19122.2325, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.4266666666666667, + "grad_norm": 0.013462604023516178, + "learning_rate": 9.563807606911015e-05, + "loss": 0.013579159043729305, + "num_input_tokens_seen": 38549104, + "step": 2354, + "train_runtime": 19130.3458, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.4272727272727272, + "grad_norm": 0.008283951319754124, + "learning_rate": 9.563414710106382e-05, + "loss": 0.01217900775372982, + "num_input_tokens_seen": 38565480, + "step": 2355, + "train_runtime": 19138.4556, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.4278787878787877, + "grad_norm": 0.011906127445399761, + "learning_rate": 9.563021644509926e-05, + "loss": 0.012697991915047169, + "num_input_tokens_seen": 38581856, + "step": 2356, + "train_runtime": 19146.5656, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.4284848484848485, + "grad_norm": 0.009647993370890617, + "learning_rate": 9.562628410136186e-05, + "loss": 0.012488002888858318, + "num_input_tokens_seen": 38598232, + "step": 2357, + "train_runtime": 19154.6797, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.4290909090909092, + "grad_norm": 0.00915882084518671, + "learning_rate": 9.562235006999705e-05, + "loss": 0.01235243584960699, + "num_input_tokens_seen": 38614608, + "step": 2358, + "train_runtime": 19162.7899, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.4296969696969697, + "grad_norm": 0.009903871454298496, + "learning_rate": 9.561841435115037e-05, + "loss": 0.012734384275972843, + "num_input_tokens_seen": 38630984, + "step": 2359, + "train_runtime": 19170.901, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.4303030303030302, + "grad_norm": 0.013122179545462132, + "learning_rate": 9.561447694496736e-05, + "loss": 0.01317589357495308, + "num_input_tokens_seen": 38647360, + "step": 2360, + "train_runtime": 19179.0112, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.430909090909091, + "grad_norm": 0.006084715947508812, + "learning_rate": 9.561053785159371e-05, + "loss": 0.011836802586913109, + "num_input_tokens_seen": 38663736, + "step": 2361, + "train_runtime": 19187.1227, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.4315151515151516, + "grad_norm": 0.009425677359104156, + "learning_rate": 9.560659707117507e-05, + "loss": 0.012720235623419285, + "num_input_tokens_seen": 38680112, + "step": 2362, + "train_runtime": 19195.2334, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.4321212121212121, + "grad_norm": 0.01125511433929205, + "learning_rate": 9.560265460385723e-05, + "loss": 0.01049058698117733, + "num_input_tokens_seen": 38696488, + "step": 2363, + "train_runtime": 19203.3454, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.4327272727272726, + "grad_norm": 0.008309018798172474, + "learning_rate": 9.559871044978598e-05, + "loss": 0.012103556655347347, + "num_input_tokens_seen": 38712864, + "step": 2364, + "train_runtime": 19211.4583, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.4333333333333333, + "grad_norm": 0.010823550634086132, + "learning_rate": 9.559476460910725e-05, + "loss": 0.013532055541872978, + "num_input_tokens_seen": 38729240, + "step": 2365, + "train_runtime": 19219.5713, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.433939393939394, + "grad_norm": 0.007804887369275093, + "learning_rate": 9.559081708196696e-05, + "loss": 0.011757384985685349, + "num_input_tokens_seen": 38745616, + "step": 2366, + "train_runtime": 19227.6813, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.4345454545454546, + "grad_norm": 0.011105155572295189, + "learning_rate": 9.558686786851115e-05, + "loss": 0.013056900352239609, + "num_input_tokens_seen": 38761992, + "step": 2367, + "train_runtime": 19235.7941, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.435151515151515, + "grad_norm": 0.006022731773555279, + "learning_rate": 9.558291696888584e-05, + "loss": 0.011985806748270988, + "num_input_tokens_seen": 38778368, + "step": 2368, + "train_runtime": 19243.9058, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.4357575757575758, + "grad_norm": 0.021359071135520935, + "learning_rate": 9.55789643832372e-05, + "loss": 0.013741593807935715, + "num_input_tokens_seen": 38794744, + "step": 2369, + "train_runtime": 19252.0165, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.4363636363636363, + "grad_norm": 0.016318688169121742, + "learning_rate": 9.557501011171145e-05, + "loss": 0.01426868885755539, + "num_input_tokens_seen": 38811120, + "step": 2370, + "train_runtime": 19260.1321, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.436969696969697, + "grad_norm": 0.03291529044508934, + "learning_rate": 9.557105415445484e-05, + "loss": 0.012934263795614243, + "num_input_tokens_seen": 38827496, + "step": 2371, + "train_runtime": 19268.2464, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.4375757575757575, + "grad_norm": 0.007644362282007933, + "learning_rate": 9.556709651161366e-05, + "loss": 0.011257193982601166, + "num_input_tokens_seen": 38843872, + "step": 2372, + "train_runtime": 19276.359, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.4381818181818182, + "grad_norm": 0.009007222019135952, + "learning_rate": 9.556313718333433e-05, + "loss": 0.011448862962424755, + "num_input_tokens_seen": 38860248, + "step": 2373, + "train_runtime": 19284.4725, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.4387878787878787, + "grad_norm": 0.0117417573928833, + "learning_rate": 9.555917616976329e-05, + "loss": 0.013442277908325195, + "num_input_tokens_seen": 38876624, + "step": 2374, + "train_runtime": 19292.5852, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.0057337055914103985, + "learning_rate": 9.555521347104703e-05, + "loss": 0.01220876444131136, + "num_input_tokens_seen": 38893000, + "step": 2375, + "train_runtime": 19300.7, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.44, + "grad_norm": 0.006500967778265476, + "learning_rate": 9.555124908733215e-05, + "loss": 0.013221386820077896, + "num_input_tokens_seen": 38909376, + "step": 2376, + "train_runtime": 19308.8126, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.4406060606060607, + "grad_norm": 0.013432295992970467, + "learning_rate": 9.554728301876526e-05, + "loss": 0.013204741291701794, + "num_input_tokens_seen": 38925752, + "step": 2377, + "train_runtime": 19316.9331, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.4412121212121212, + "grad_norm": 0.015894871205091476, + "learning_rate": 9.554331526549308e-05, + "loss": 0.013291300274431705, + "num_input_tokens_seen": 38942128, + "step": 2378, + "train_runtime": 19325.046, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.4418181818181819, + "grad_norm": 0.020607370883226395, + "learning_rate": 9.553934582766235e-05, + "loss": 0.012713750824332237, + "num_input_tokens_seen": 38958504, + "step": 2379, + "train_runtime": 19333.1603, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.4424242424242424, + "grad_norm": 0.012630708515644073, + "learning_rate": 9.553537470541992e-05, + "loss": 0.011279501020908356, + "num_input_tokens_seen": 38974880, + "step": 2380, + "train_runtime": 19341.2724, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.443030303030303, + "grad_norm": 0.011622477322816849, + "learning_rate": 9.553140189891266e-05, + "loss": 0.011939273215830326, + "num_input_tokens_seen": 38991256, + "step": 2381, + "train_runtime": 19349.3856, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.4436363636363636, + "grad_norm": 0.007888108491897583, + "learning_rate": 9.552742740828748e-05, + "loss": 0.01125436369329691, + "num_input_tokens_seen": 39007632, + "step": 2382, + "train_runtime": 19357.4988, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.4442424242424243, + "grad_norm": 0.012038582935929298, + "learning_rate": 9.552345123369144e-05, + "loss": 0.01231908705085516, + "num_input_tokens_seen": 39024008, + "step": 2383, + "train_runtime": 19365.612, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.4448484848484848, + "grad_norm": 0.006818312220275402, + "learning_rate": 9.551947337527159e-05, + "loss": 0.012319983914494514, + "num_input_tokens_seen": 39040384, + "step": 2384, + "train_runtime": 19373.7348, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.4454545454545453, + "grad_norm": 0.012328113429248333, + "learning_rate": 9.551549383317506e-05, + "loss": 0.012599589303135872, + "num_input_tokens_seen": 39056760, + "step": 2385, + "train_runtime": 19381.851, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.446060606060606, + "grad_norm": 0.00953622069209814, + "learning_rate": 9.551151260754907e-05, + "loss": 0.012549671344459057, + "num_input_tokens_seen": 39073136, + "step": 2386, + "train_runtime": 19389.9639, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.4466666666666668, + "grad_norm": 0.011133255437016487, + "learning_rate": 9.550752969854084e-05, + "loss": 0.013944639824330807, + "num_input_tokens_seen": 39089512, + "step": 2387, + "train_runtime": 19398.0778, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.4472727272727273, + "grad_norm": 0.00964092556387186, + "learning_rate": 9.55035451062977e-05, + "loss": 0.012482261285185814, + "num_input_tokens_seen": 39105888, + "step": 2388, + "train_runtime": 19406.1887, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.4478787878787878, + "grad_norm": 0.006806143093854189, + "learning_rate": 9.549955883096706e-05, + "loss": 0.01177418977022171, + "num_input_tokens_seen": 39122264, + "step": 2389, + "train_runtime": 19414.3007, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.4484848484848485, + "grad_norm": 0.006465624086558819, + "learning_rate": 9.549557087269634e-05, + "loss": 0.013276521116495132, + "num_input_tokens_seen": 39138640, + "step": 2390, + "train_runtime": 19422.414, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.4490909090909092, + "grad_norm": 0.008866474032402039, + "learning_rate": 9.549158123163305e-05, + "loss": 0.012841928750276566, + "num_input_tokens_seen": 39155016, + "step": 2391, + "train_runtime": 19430.5325, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.4496969696969697, + "grad_norm": 0.010618673637509346, + "learning_rate": 9.548758990792477e-05, + "loss": 0.012568192556500435, + "num_input_tokens_seen": 39171392, + "step": 2392, + "train_runtime": 19438.6455, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.4503030303030302, + "grad_norm": 0.008561650291085243, + "learning_rate": 9.548359690171911e-05, + "loss": 0.012789330445230007, + "num_input_tokens_seen": 39187768, + "step": 2393, + "train_runtime": 19446.7579, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.450909090909091, + "grad_norm": 0.0074139004573225975, + "learning_rate": 9.547960221316379e-05, + "loss": 0.012125402688980103, + "num_input_tokens_seen": 39204144, + "step": 2394, + "train_runtime": 19454.8715, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.4515151515151516, + "grad_norm": 0.008938332088291645, + "learning_rate": 9.547560584240653e-05, + "loss": 0.012520406395196915, + "num_input_tokens_seen": 39220520, + "step": 2395, + "train_runtime": 19462.9863, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.4521212121212121, + "grad_norm": 0.010951995849609375, + "learning_rate": 9.547160778959519e-05, + "loss": 0.013598313555121422, + "num_input_tokens_seen": 39236896, + "step": 2396, + "train_runtime": 19471.1016, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.4527272727272726, + "grad_norm": 0.010507755912840366, + "learning_rate": 9.546760805487762e-05, + "loss": 0.011697047390043736, + "num_input_tokens_seen": 39253272, + "step": 2397, + "train_runtime": 19479.2149, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.4533333333333334, + "grad_norm": 0.008784889243543148, + "learning_rate": 9.546360663840177e-05, + "loss": 0.01291731558740139, + "num_input_tokens_seen": 39269648, + "step": 2398, + "train_runtime": 19487.3345, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.4539393939393939, + "grad_norm": 0.008387868292629719, + "learning_rate": 9.545960354031565e-05, + "loss": 0.011412294581532478, + "num_input_tokens_seen": 39286024, + "step": 2399, + "train_runtime": 19495.4456, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.013577899895608425, + "learning_rate": 9.545559876076733e-05, + "loss": 0.012150186114013195, + "num_input_tokens_seen": 39302400, + "step": 2400, + "train_runtime": 19503.5605, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.455151515151515, + "grad_norm": 0.00847565196454525, + "learning_rate": 9.545159229990493e-05, + "loss": 0.011293401941657066, + "num_input_tokens_seen": 39318776, + "step": 2401, + "train_runtime": 19512.6323, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.4557575757575758, + "grad_norm": 0.010008870624005795, + "learning_rate": 9.544758415787662e-05, + "loss": 0.012785600498318672, + "num_input_tokens_seen": 39335152, + "step": 2402, + "train_runtime": 19520.7422, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.4563636363636363, + "grad_norm": 0.007814590819180012, + "learning_rate": 9.544357433483071e-05, + "loss": 0.012637192383408546, + "num_input_tokens_seen": 39351528, + "step": 2403, + "train_runtime": 19528.8556, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.456969696969697, + "grad_norm": 0.006621070671826601, + "learning_rate": 9.543956283091546e-05, + "loss": 0.011230424046516418, + "num_input_tokens_seen": 39367904, + "step": 2404, + "train_runtime": 19536.9652, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.4575757575757575, + "grad_norm": 0.01204211637377739, + "learning_rate": 9.54355496462793e-05, + "loss": 0.012431148439645767, + "num_input_tokens_seen": 39384280, + "step": 2405, + "train_runtime": 19545.0747, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.4581818181818182, + "grad_norm": 0.007015693001449108, + "learning_rate": 9.543153478107061e-05, + "loss": 0.011989517137408257, + "num_input_tokens_seen": 39400656, + "step": 2406, + "train_runtime": 19553.1823, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.4587878787878787, + "grad_norm": 0.011309216730296612, + "learning_rate": 9.542751823543793e-05, + "loss": 0.011417721398174763, + "num_input_tokens_seen": 39417032, + "step": 2407, + "train_runtime": 19561.2927, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.4593939393939395, + "grad_norm": 0.015342672355473042, + "learning_rate": 9.542350000952982e-05, + "loss": 0.013073185458779335, + "num_input_tokens_seen": 39433408, + "step": 2408, + "train_runtime": 19569.4017, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.46, + "grad_norm": 0.019160790368914604, + "learning_rate": 9.541948010349491e-05, + "loss": 0.01287474948912859, + "num_input_tokens_seen": 39449784, + "step": 2409, + "train_runtime": 19577.5113, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.4606060606060607, + "grad_norm": 0.007335372269153595, + "learning_rate": 9.541545851748186e-05, + "loss": 0.012530960142612457, + "num_input_tokens_seen": 39466160, + "step": 2410, + "train_runtime": 19585.6206, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.4612121212121212, + "grad_norm": 0.012553319334983826, + "learning_rate": 9.541143525163946e-05, + "loss": 0.013056598603725433, + "num_input_tokens_seen": 39482536, + "step": 2411, + "train_runtime": 19593.735, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.461818181818182, + "grad_norm": 0.007112372200936079, + "learning_rate": 9.54074103061165e-05, + "loss": 0.012453190051019192, + "num_input_tokens_seen": 39498912, + "step": 2412, + "train_runtime": 19601.8436, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.4624242424242424, + "grad_norm": 0.0041340249590575695, + "learning_rate": 9.540338368106185e-05, + "loss": 0.010426213033497334, + "num_input_tokens_seen": 39515288, + "step": 2413, + "train_runtime": 19609.9527, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.463030303030303, + "grad_norm": 0.010998896323144436, + "learning_rate": 9.539935537662448e-05, + "loss": 0.012171028181910515, + "num_input_tokens_seen": 39531664, + "step": 2414, + "train_runtime": 19618.0619, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.4636363636363636, + "grad_norm": 0.030140312388539314, + "learning_rate": 9.539532539295335e-05, + "loss": 0.012536111287772655, + "num_input_tokens_seen": 39548040, + "step": 2415, + "train_runtime": 19626.1713, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.4642424242424243, + "grad_norm": 0.009954207576811314, + "learning_rate": 9.539129373019754e-05, + "loss": 0.012501617893576622, + "num_input_tokens_seen": 39564416, + "step": 2416, + "train_runtime": 19634.2817, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.4648484848484848, + "grad_norm": 0.014923661015927792, + "learning_rate": 9.538726038850617e-05, + "loss": 0.013546659611165524, + "num_input_tokens_seen": 39580792, + "step": 2417, + "train_runtime": 19642.3932, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.4654545454545453, + "grad_norm": 0.01247413083910942, + "learning_rate": 9.538322536802842e-05, + "loss": 0.012776615098118782, + "num_input_tokens_seen": 39597168, + "step": 2418, + "train_runtime": 19650.5062, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.466060606060606, + "grad_norm": 0.2203642725944519, + "learning_rate": 9.537918866891355e-05, + "loss": 0.01299387775361538, + "num_input_tokens_seen": 39613544, + "step": 2419, + "train_runtime": 19658.6181, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.010403799824416637, + "learning_rate": 9.537515029131086e-05, + "loss": 0.012151832692325115, + "num_input_tokens_seen": 39629920, + "step": 2420, + "train_runtime": 19666.7347, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.4672727272727273, + "grad_norm": 0.010242112912237644, + "learning_rate": 9.537111023536973e-05, + "loss": 0.012043867260217667, + "num_input_tokens_seen": 39646296, + "step": 2421, + "train_runtime": 19674.8455, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.4678787878787878, + "grad_norm": 0.01728324219584465, + "learning_rate": 9.53670685012396e-05, + "loss": 0.01141743827611208, + "num_input_tokens_seen": 39662672, + "step": 2422, + "train_runtime": 19682.9588, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.4684848484848485, + "grad_norm": 0.011234860867261887, + "learning_rate": 9.536302508906993e-05, + "loss": 0.011003116145730019, + "num_input_tokens_seen": 39679048, + "step": 2423, + "train_runtime": 19691.0777, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.4690909090909092, + "grad_norm": 0.016217637807130814, + "learning_rate": 9.535897999901032e-05, + "loss": 0.012819355353713036, + "num_input_tokens_seen": 39695424, + "step": 2424, + "train_runtime": 19699.1987, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.4696969696969697, + "grad_norm": 0.02879735641181469, + "learning_rate": 9.535493323121036e-05, + "loss": 0.012496593408286572, + "num_input_tokens_seen": 39711800, + "step": 2425, + "train_runtime": 19707.3223, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.4703030303030302, + "grad_norm": 0.00734851835295558, + "learning_rate": 9.535088478581975e-05, + "loss": 0.011950269341468811, + "num_input_tokens_seen": 39728176, + "step": 2426, + "train_runtime": 19715.4383, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.470909090909091, + "grad_norm": 0.009754326194524765, + "learning_rate": 9.534683466298823e-05, + "loss": 0.012536708265542984, + "num_input_tokens_seen": 39744552, + "step": 2427, + "train_runtime": 19723.5544, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.4715151515151514, + "grad_norm": 0.009746896103024483, + "learning_rate": 9.53427828628656e-05, + "loss": 0.01178042497485876, + "num_input_tokens_seen": 39760928, + "step": 2428, + "train_runtime": 19731.672, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.4721212121212122, + "grad_norm": 0.00775569211691618, + "learning_rate": 9.533872938560174e-05, + "loss": 0.011952969245612621, + "num_input_tokens_seen": 39777304, + "step": 2429, + "train_runtime": 19739.7884, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.4727272727272727, + "grad_norm": 0.00642067426815629, + "learning_rate": 9.533467423134657e-05, + "loss": 0.013351340778172016, + "num_input_tokens_seen": 39793680, + "step": 2430, + "train_runtime": 19747.9027, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.4733333333333334, + "grad_norm": 0.00757970055565238, + "learning_rate": 9.533061740025008e-05, + "loss": 0.012188777327537537, + "num_input_tokens_seen": 39810056, + "step": 2431, + "train_runtime": 19756.0198, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.4739393939393939, + "grad_norm": 0.010369901545345783, + "learning_rate": 9.532655889246234e-05, + "loss": 0.01212995033711195, + "num_input_tokens_seen": 39826432, + "step": 2432, + "train_runtime": 19764.139, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.4745454545454546, + "grad_norm": 0.0055848038755357265, + "learning_rate": 9.532249870813344e-05, + "loss": 0.012233348563313484, + "num_input_tokens_seen": 39842808, + "step": 2433, + "train_runtime": 19772.2691, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.475151515151515, + "grad_norm": 0.018506374210119247, + "learning_rate": 9.53184368474136e-05, + "loss": 0.01355830393731594, + "num_input_tokens_seen": 39859184, + "step": 2434, + "train_runtime": 19780.4062, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.4757575757575758, + "grad_norm": 0.007064046338200569, + "learning_rate": 9.531437331045301e-05, + "loss": 0.011627360247075558, + "num_input_tokens_seen": 39875560, + "step": 2435, + "train_runtime": 19788.5498, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.4763636363636363, + "grad_norm": 0.013803046196699142, + "learning_rate": 9.5310308097402e-05, + "loss": 0.01383510883897543, + "num_input_tokens_seen": 39891936, + "step": 2436, + "train_runtime": 19796.6824, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.476969696969697, + "grad_norm": 0.006459993310272694, + "learning_rate": 9.530624120841094e-05, + "loss": 0.011402672156691551, + "num_input_tokens_seen": 39908312, + "step": 2437, + "train_runtime": 19804.8017, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.4775757575757575, + "grad_norm": 0.013386828824877739, + "learning_rate": 9.530217264363024e-05, + "loss": 0.012710126116871834, + "num_input_tokens_seen": 39924688, + "step": 2438, + "train_runtime": 19812.9203, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.4781818181818183, + "grad_norm": 0.008386164903640747, + "learning_rate": 9.52981024032104e-05, + "loss": 0.01170976459980011, + "num_input_tokens_seen": 39941064, + "step": 2439, + "train_runtime": 19821.0403, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.4787878787878788, + "grad_norm": 0.019332820549607277, + "learning_rate": 9.529403048730197e-05, + "loss": 0.014887764118611813, + "num_input_tokens_seen": 39957440, + "step": 2440, + "train_runtime": 19829.157, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.4793939393939395, + "grad_norm": 0.01192883588373661, + "learning_rate": 9.528995689605556e-05, + "loss": 0.012767164967954159, + "num_input_tokens_seen": 39973816, + "step": 2441, + "train_runtime": 19837.2767, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.48, + "grad_norm": 0.007754697930067778, + "learning_rate": 9.528588162962184e-05, + "loss": 0.01059720292687416, + "num_input_tokens_seen": 39990192, + "step": 2442, + "train_runtime": 19845.3992, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.4806060606060605, + "grad_norm": 0.0071412501856684685, + "learning_rate": 9.528180468815155e-05, + "loss": 0.0120925884693861, + "num_input_tokens_seen": 40006568, + "step": 2443, + "train_runtime": 19853.5324, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.4812121212121212, + "grad_norm": 0.0038404990918934345, + "learning_rate": 9.527772607179548e-05, + "loss": 0.011214636266231537, + "num_input_tokens_seen": 40022944, + "step": 2444, + "train_runtime": 19861.6572, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.481818181818182, + "grad_norm": 0.006489252671599388, + "learning_rate": 9.52736457807045e-05, + "loss": 0.012799175456166267, + "num_input_tokens_seen": 40039320, + "step": 2445, + "train_runtime": 19869.7777, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.4824242424242424, + "grad_norm": 0.007905665785074234, + "learning_rate": 9.526956381502953e-05, + "loss": 0.01181122101843357, + "num_input_tokens_seen": 40055696, + "step": 2446, + "train_runtime": 19877.9055, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.483030303030303, + "grad_norm": 0.006114371120929718, + "learning_rate": 9.526548017492156e-05, + "loss": 0.01095847599208355, + "num_input_tokens_seen": 40072072, + "step": 2447, + "train_runtime": 19886.0326, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.4836363636363636, + "grad_norm": 0.007537721656262875, + "learning_rate": 9.526139486053162e-05, + "loss": 0.012018000707030296, + "num_input_tokens_seen": 40088448, + "step": 2448, + "train_runtime": 19894.1525, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.4842424242424244, + "grad_norm": 0.01275183167308569, + "learning_rate": 9.525730787201083e-05, + "loss": 0.01350702065974474, + "num_input_tokens_seen": 40104824, + "step": 2449, + "train_runtime": 19902.2701, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 0.006207689177244902, + "learning_rate": 9.525321920951034e-05, + "loss": 0.011248544789850712, + "num_input_tokens_seen": 40121200, + "step": 2450, + "train_runtime": 19910.3904, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.4854545454545454, + "grad_norm": 0.0031499108299613, + "learning_rate": 9.524912887318142e-05, + "loss": 0.011422288604080677, + "num_input_tokens_seen": 40137576, + "step": 2451, + "train_runtime": 19918.5126, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.486060606060606, + "grad_norm": 0.006611378397792578, + "learning_rate": 9.524503686317534e-05, + "loss": 0.012991274707019329, + "num_input_tokens_seen": 40153952, + "step": 2452, + "train_runtime": 19926.6344, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.4866666666666668, + "grad_norm": 0.01270908024162054, + "learning_rate": 9.524094317964345e-05, + "loss": 0.013357768766582012, + "num_input_tokens_seen": 40170328, + "step": 2453, + "train_runtime": 19934.758, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.4872727272727273, + "grad_norm": 0.009194605052471161, + "learning_rate": 9.523684782273718e-05, + "loss": 0.012534737586975098, + "num_input_tokens_seen": 40186704, + "step": 2454, + "train_runtime": 19942.8783, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.4878787878787878, + "grad_norm": 0.006802633870393038, + "learning_rate": 9.523275079260799e-05, + "loss": 0.012076673097908497, + "num_input_tokens_seen": 40203080, + "step": 2455, + "train_runtime": 19951.0021, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.4884848484848485, + "grad_norm": 0.01247409638017416, + "learning_rate": 9.522865208940745e-05, + "loss": 0.012032316997647285, + "num_input_tokens_seen": 40219456, + "step": 2456, + "train_runtime": 19959.1325, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.489090909090909, + "grad_norm": 0.02582702599465847, + "learning_rate": 9.522455171328715e-05, + "loss": 0.011424973607063293, + "num_input_tokens_seen": 40235832, + "step": 2457, + "train_runtime": 19967.2572, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.4896969696969697, + "grad_norm": 0.011868856847286224, + "learning_rate": 9.522044966439873e-05, + "loss": 0.012090028263628483, + "num_input_tokens_seen": 40252208, + "step": 2458, + "train_runtime": 19975.3808, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.4903030303030302, + "grad_norm": 0.009827052243053913, + "learning_rate": 9.521634594289396e-05, + "loss": 0.012720966711640358, + "num_input_tokens_seen": 40268584, + "step": 2459, + "train_runtime": 19983.5029, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.490909090909091, + "grad_norm": 0.007020117249339819, + "learning_rate": 9.52122405489246e-05, + "loss": 0.012172859162092209, + "num_input_tokens_seen": 40284960, + "step": 2460, + "train_runtime": 19991.6205, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.4915151515151515, + "grad_norm": 0.004183088894933462, + "learning_rate": 9.520813348264252e-05, + "loss": 0.012092591263353825, + "num_input_tokens_seen": 40301336, + "step": 2461, + "train_runtime": 19999.7464, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.4921212121212122, + "grad_norm": 0.004259482026100159, + "learning_rate": 9.52040247441996e-05, + "loss": 0.010745976120233536, + "num_input_tokens_seen": 40317712, + "step": 2462, + "train_runtime": 20007.8652, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.4927272727272727, + "grad_norm": 0.010508165694773197, + "learning_rate": 9.519991433374787e-05, + "loss": 0.012759631499648094, + "num_input_tokens_seen": 40334088, + "step": 2463, + "train_runtime": 20015.9813, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 0.008700689300894737, + "learning_rate": 9.51958022514393e-05, + "loss": 0.013472158461809158, + "num_input_tokens_seen": 40350464, + "step": 2464, + "train_runtime": 20024.0943, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.493939393939394, + "grad_norm": 0.007512248121201992, + "learning_rate": 9.519168849742604e-05, + "loss": 0.011116516776382923, + "num_input_tokens_seen": 40366840, + "step": 2465, + "train_runtime": 20032.2166, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.4945454545454546, + "grad_norm": 0.006751928012818098, + "learning_rate": 9.518757307186021e-05, + "loss": 0.012299465015530586, + "num_input_tokens_seen": 40383216, + "step": 2466, + "train_runtime": 20040.3361, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.4951515151515151, + "grad_norm": 0.012251977808773518, + "learning_rate": 9.518345597489406e-05, + "loss": 0.012819000519812107, + "num_input_tokens_seen": 40399592, + "step": 2467, + "train_runtime": 20048.4565, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.4957575757575756, + "grad_norm": 0.016346946358680725, + "learning_rate": 9.517933720667986e-05, + "loss": 0.013819322921335697, + "num_input_tokens_seen": 40415968, + "step": 2468, + "train_runtime": 20056.5803, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.4963636363636363, + "grad_norm": 0.013435586355626583, + "learning_rate": 9.517521676736997e-05, + "loss": 0.011113706976175308, + "num_input_tokens_seen": 40432344, + "step": 2469, + "train_runtime": 20064.702, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.496969696969697, + "grad_norm": 0.010519376955926418, + "learning_rate": 9.517109465711678e-05, + "loss": 0.013021894730627537, + "num_input_tokens_seen": 40448720, + "step": 2470, + "train_runtime": 20072.834, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.4975757575757576, + "grad_norm": 0.007130353711545467, + "learning_rate": 9.516697087607276e-05, + "loss": 0.01177508756518364, + "num_input_tokens_seen": 40465096, + "step": 2471, + "train_runtime": 20080.9578, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.498181818181818, + "grad_norm": 0.0072803315706551075, + "learning_rate": 9.516284542439047e-05, + "loss": 0.012394273653626442, + "num_input_tokens_seen": 40481472, + "step": 2472, + "train_runtime": 20089.0934, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.4987878787878788, + "grad_norm": 0.02864505536854267, + "learning_rate": 9.515871830222244e-05, + "loss": 0.012685790657997131, + "num_input_tokens_seen": 40497848, + "step": 2473, + "train_runtime": 20097.2121, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.4993939393939395, + "grad_norm": 0.007369753438979387, + "learning_rate": 9.51545895097214e-05, + "loss": 0.012503450736403465, + "num_input_tokens_seen": 40514224, + "step": 2474, + "train_runtime": 20105.3349, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.5, + "grad_norm": 0.012454242445528507, + "learning_rate": 9.515045904704001e-05, + "loss": 0.011608883738517761, + "num_input_tokens_seen": 40530600, + "step": 2475, + "train_runtime": 20113.4635, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.5006060606060605, + "grad_norm": 0.007108752615749836, + "learning_rate": 9.514632691433107e-05, + "loss": 0.012201216071844101, + "num_input_tokens_seen": 40546976, + "step": 2476, + "train_runtime": 20121.5906, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.5012121212121212, + "grad_norm": 0.010609873570501804, + "learning_rate": 9.514219311174741e-05, + "loss": 0.011614919640123844, + "num_input_tokens_seen": 40563352, + "step": 2477, + "train_runtime": 20129.7127, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.501818181818182, + "grad_norm": 0.036003436893224716, + "learning_rate": 9.513805763944195e-05, + "loss": 0.014732426032423973, + "num_input_tokens_seen": 40579728, + "step": 2478, + "train_runtime": 20137.8216, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.5024242424242424, + "grad_norm": 0.0038061749655753374, + "learning_rate": 9.513392049756761e-05, + "loss": 0.011091469787061214, + "num_input_tokens_seen": 40596104, + "step": 2479, + "train_runtime": 20145.9325, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.503030303030303, + "grad_norm": 0.016383551061153412, + "learning_rate": 9.512978168627749e-05, + "loss": 0.013081599958240986, + "num_input_tokens_seen": 40612480, + "step": 2480, + "train_runtime": 20154.0449, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.5036363636363637, + "grad_norm": 0.006661078426986933, + "learning_rate": 9.51256412057246e-05, + "loss": 0.012471092864871025, + "num_input_tokens_seen": 40628856, + "step": 2481, + "train_runtime": 20162.1558, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.5042424242424244, + "grad_norm": 0.00888329278677702, + "learning_rate": 9.512149905606213e-05, + "loss": 0.013931803405284882, + "num_input_tokens_seen": 40645232, + "step": 2482, + "train_runtime": 20170.2657, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.5048484848484849, + "grad_norm": 0.009034967981278896, + "learning_rate": 9.511735523744328e-05, + "loss": 0.011597779579460621, + "num_input_tokens_seen": 40661608, + "step": 2483, + "train_runtime": 20178.3784, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.5054545454545454, + "grad_norm": 0.003994882106781006, + "learning_rate": 9.511320975002132e-05, + "loss": 0.012429807335138321, + "num_input_tokens_seen": 40677984, + "step": 2484, + "train_runtime": 20186.489, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.506060606060606, + "grad_norm": 0.005746257957071066, + "learning_rate": 9.510906259394958e-05, + "loss": 0.01171612273901701, + "num_input_tokens_seen": 40694360, + "step": 2485, + "train_runtime": 20194.5999, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.5066666666666668, + "grad_norm": 0.008891885168850422, + "learning_rate": 9.510491376938147e-05, + "loss": 0.01219995692372322, + "num_input_tokens_seen": 40710736, + "step": 2486, + "train_runtime": 20202.7112, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.5072727272727273, + "grad_norm": 0.0058426158502697945, + "learning_rate": 9.510076327647042e-05, + "loss": 0.012936384417116642, + "num_input_tokens_seen": 40727112, + "step": 2487, + "train_runtime": 20210.8213, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.5078787878787878, + "grad_norm": 0.009390783496201038, + "learning_rate": 9.509661111536998e-05, + "loss": 0.012800348922610283, + "num_input_tokens_seen": 40743488, + "step": 2488, + "train_runtime": 20218.932, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.5084848484848485, + "grad_norm": 0.007901819422841072, + "learning_rate": 9.509245728623373e-05, + "loss": 0.012779071927070618, + "num_input_tokens_seen": 40759864, + "step": 2489, + "train_runtime": 20227.0427, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.509090909090909, + "grad_norm": 0.014118324033915997, + "learning_rate": 9.508830178921529e-05, + "loss": 0.013085502199828625, + "num_input_tokens_seen": 40776240, + "step": 2490, + "train_runtime": 20235.1526, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.5096969696969698, + "grad_norm": 0.009893806651234627, + "learning_rate": 9.508414462446835e-05, + "loss": 0.012658249586820602, + "num_input_tokens_seen": 40792616, + "step": 2491, + "train_runtime": 20243.2643, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.5103030303030303, + "grad_norm": 0.009940304793417454, + "learning_rate": 9.507998579214671e-05, + "loss": 0.013717295601963997, + "num_input_tokens_seen": 40808992, + "step": 2492, + "train_runtime": 20251.3756, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.5109090909090908, + "grad_norm": 0.008184517733752728, + "learning_rate": 9.50758252924042e-05, + "loss": 0.01362772099673748, + "num_input_tokens_seen": 40825368, + "step": 2493, + "train_runtime": 20259.4871, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.5115151515151515, + "grad_norm": 0.008952822536230087, + "learning_rate": 9.507166312539468e-05, + "loss": 0.012840436771512032, + "num_input_tokens_seen": 40841744, + "step": 2494, + "train_runtime": 20267.5958, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.5121212121212122, + "grad_norm": 0.006644078996032476, + "learning_rate": 9.506749929127212e-05, + "loss": 0.012481370940804482, + "num_input_tokens_seen": 40858120, + "step": 2495, + "train_runtime": 20275.7082, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.5127272727272727, + "grad_norm": 0.012844868935644627, + "learning_rate": 9.506333379019052e-05, + "loss": 0.012236448004841805, + "num_input_tokens_seen": 40874496, + "step": 2496, + "train_runtime": 20283.8171, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.5133333333333332, + "grad_norm": 0.017544275149703026, + "learning_rate": 9.505916662230397e-05, + "loss": 0.012658393010497093, + "num_input_tokens_seen": 40890872, + "step": 2497, + "train_runtime": 20291.9328, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.513939393939394, + "grad_norm": 0.01012002769857645, + "learning_rate": 9.505499778776658e-05, + "loss": 0.012574484571814537, + "num_input_tokens_seen": 40907248, + "step": 2498, + "train_runtime": 20300.0437, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.5145454545454546, + "grad_norm": 0.018078140914440155, + "learning_rate": 9.505082728673257e-05, + "loss": 0.013375327922403812, + "num_input_tokens_seen": 40923624, + "step": 2499, + "train_runtime": 20308.1567, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 0.007978398352861404, + "learning_rate": 9.50466551193562e-05, + "loss": 0.01290303748100996, + "num_input_tokens_seen": 40940000, + "step": 2500, + "train_runtime": 20316.2678, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.5157575757575756, + "grad_norm": 0.006939285434782505, + "learning_rate": 9.504248128579177e-05, + "loss": 0.011965984478592873, + "num_input_tokens_seen": 40956376, + "step": 2501, + "train_runtime": 20325.3605, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.5163636363636364, + "grad_norm": 0.007044460158795118, + "learning_rate": 9.503830578619368e-05, + "loss": 0.011356959119439125, + "num_input_tokens_seen": 40972752, + "step": 2502, + "train_runtime": 20333.468, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.516969696969697, + "grad_norm": 0.007603634148836136, + "learning_rate": 9.503412862071637e-05, + "loss": 0.011854066513478756, + "num_input_tokens_seen": 40989128, + "step": 2503, + "train_runtime": 20341.576, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.5175757575757576, + "grad_norm": 0.009264685213565826, + "learning_rate": 9.502994978951435e-05, + "loss": 0.012205901555716991, + "num_input_tokens_seen": 41005504, + "step": 2504, + "train_runtime": 20349.6894, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.518181818181818, + "grad_norm": 0.004056988749653101, + "learning_rate": 9.502576929274214e-05, + "loss": 0.01200732309371233, + "num_input_tokens_seen": 41021880, + "step": 2505, + "train_runtime": 20357.7988, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.5187878787878788, + "grad_norm": 0.009046165272593498, + "learning_rate": 9.502158713055444e-05, + "loss": 0.012369451113045216, + "num_input_tokens_seen": 41038256, + "step": 2506, + "train_runtime": 20365.9067, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.5193939393939395, + "grad_norm": 0.0077380104921758175, + "learning_rate": 9.50174033031059e-05, + "loss": 0.012663315050303936, + "num_input_tokens_seen": 41054632, + "step": 2507, + "train_runtime": 20374.0154, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.52, + "grad_norm": 0.012925632297992706, + "learning_rate": 9.501321781055129e-05, + "loss": 0.013452775776386261, + "num_input_tokens_seen": 41071008, + "step": 2508, + "train_runtime": 20382.1317, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.5206060606060605, + "grad_norm": 0.024989936500787735, + "learning_rate": 9.50090306530454e-05, + "loss": 0.012328572571277618, + "num_input_tokens_seen": 41087384, + "step": 2509, + "train_runtime": 20390.2429, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.5212121212121212, + "grad_norm": 0.010371995158493519, + "learning_rate": 9.500484183074312e-05, + "loss": 0.013627678155899048, + "num_input_tokens_seen": 41103760, + "step": 2510, + "train_runtime": 20398.3582, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.521818181818182, + "grad_norm": 0.006825145334005356, + "learning_rate": 9.500065134379939e-05, + "loss": 0.012711216695606709, + "num_input_tokens_seen": 41120136, + "step": 2511, + "train_runtime": 20406.48, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.5224242424242425, + "grad_norm": 0.009489607997238636, + "learning_rate": 9.49964591923692e-05, + "loss": 0.013465148396790028, + "num_input_tokens_seen": 41136512, + "step": 2512, + "train_runtime": 20414.5979, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.523030303030303, + "grad_norm": 0.006394787225872278, + "learning_rate": 9.49922653766076e-05, + "loss": 0.010701936669647694, + "num_input_tokens_seen": 41152888, + "step": 2513, + "train_runtime": 20422.7123, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.5236363636363637, + "grad_norm": 0.029152654111385345, + "learning_rate": 9.498806989666972e-05, + "loss": 0.010741956532001495, + "num_input_tokens_seen": 41169264, + "step": 2514, + "train_runtime": 20430.8349, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.5242424242424244, + "grad_norm": 0.017097560688853264, + "learning_rate": 9.498387275271074e-05, + "loss": 0.011944272555410862, + "num_input_tokens_seen": 41185640, + "step": 2515, + "train_runtime": 20438.952, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.524848484848485, + "grad_norm": 0.008915740065276623, + "learning_rate": 9.497967394488594e-05, + "loss": 0.012338991276919842, + "num_input_tokens_seen": 41202016, + "step": 2516, + "train_runtime": 20447.0655, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.5254545454545454, + "grad_norm": 0.01135164313018322, + "learning_rate": 9.497547347335058e-05, + "loss": 0.012441879138350487, + "num_input_tokens_seen": 41218392, + "step": 2517, + "train_runtime": 20455.1815, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.526060606060606, + "grad_norm": 0.00980446208268404, + "learning_rate": 9.497127133826003e-05, + "loss": 0.011328864842653275, + "num_input_tokens_seen": 41234768, + "step": 2518, + "train_runtime": 20463.2979, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.5266666666666666, + "grad_norm": 0.010182627476751804, + "learning_rate": 9.496706753976974e-05, + "loss": 0.012514796108007431, + "num_input_tokens_seen": 41251144, + "step": 2519, + "train_runtime": 20471.4184, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.5272727272727273, + "grad_norm": 0.012182634323835373, + "learning_rate": 9.49628620780352e-05, + "loss": 0.01241758931428194, + "num_input_tokens_seen": 41267520, + "step": 2520, + "train_runtime": 20479.541, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.5278787878787878, + "grad_norm": 0.0064279730431735516, + "learning_rate": 9.495865495321194e-05, + "loss": 0.010764975100755692, + "num_input_tokens_seen": 41283896, + "step": 2521, + "train_runtime": 20487.6618, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.5284848484848483, + "grad_norm": 0.009709006175398827, + "learning_rate": 9.495444616545559e-05, + "loss": 0.012598009780049324, + "num_input_tokens_seen": 41300272, + "step": 2522, + "train_runtime": 20495.7827, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.529090909090909, + "grad_norm": 0.01092903409153223, + "learning_rate": 9.495023571492181e-05, + "loss": 0.012402733787894249, + "num_input_tokens_seen": 41316648, + "step": 2523, + "train_runtime": 20503.8968, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.5296969696969698, + "grad_norm": 0.009195779450237751, + "learning_rate": 9.494602360176637e-05, + "loss": 0.012332282029092312, + "num_input_tokens_seen": 41333024, + "step": 2524, + "train_runtime": 20512.0154, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.5303030303030303, + "grad_norm": 0.007992182858288288, + "learning_rate": 9.494180982614502e-05, + "loss": 0.012078803032636642, + "num_input_tokens_seen": 41349400, + "step": 2525, + "train_runtime": 20520.134, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.5309090909090908, + "grad_norm": 0.00906646903604269, + "learning_rate": 9.493759438821366e-05, + "loss": 0.01147826574742794, + "num_input_tokens_seen": 41365776, + "step": 2526, + "train_runtime": 20528.2519, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.5315151515151515, + "grad_norm": 0.012746486812829971, + "learning_rate": 9.49333772881282e-05, + "loss": 0.013307849876582623, + "num_input_tokens_seen": 41382152, + "step": 2527, + "train_runtime": 20536.3683, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.5321212121212122, + "grad_norm": 0.02986880950629711, + "learning_rate": 9.49291585260446e-05, + "loss": 0.012137986719608307, + "num_input_tokens_seen": 41398528, + "step": 2528, + "train_runtime": 20544.4827, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.5327272727272727, + "grad_norm": 0.006111033260822296, + "learning_rate": 9.492493810211895e-05, + "loss": 0.012053197249770164, + "num_input_tokens_seen": 41414904, + "step": 2529, + "train_runtime": 20552.6098, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.005969110410660505, + "learning_rate": 9.492071601650731e-05, + "loss": 0.012252414599061012, + "num_input_tokens_seen": 41431280, + "step": 2530, + "train_runtime": 20560.7227, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.533939393939394, + "grad_norm": 0.004376005847007036, + "learning_rate": 9.491649226936585e-05, + "loss": 0.011911889538168907, + "num_input_tokens_seen": 41447656, + "step": 2531, + "train_runtime": 20568.8377, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.5345454545454547, + "grad_norm": 0.005644640419632196, + "learning_rate": 9.491226686085084e-05, + "loss": 0.01167337503284216, + "num_input_tokens_seen": 41464032, + "step": 2532, + "train_runtime": 20576.9582, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.5351515151515152, + "grad_norm": 0.007492442615330219, + "learning_rate": 9.490803979111851e-05, + "loss": 0.011597873643040657, + "num_input_tokens_seen": 41480408, + "step": 2533, + "train_runtime": 20585.0778, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.5357575757575757, + "grad_norm": 0.010739155113697052, + "learning_rate": 9.490381106032526e-05, + "loss": 0.0118883540853858, + "num_input_tokens_seen": 41496784, + "step": 2534, + "train_runtime": 20593.1973, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.5363636363636364, + "grad_norm": 0.010975335724651814, + "learning_rate": 9.48995806686275e-05, + "loss": 0.011686563491821289, + "num_input_tokens_seen": 41513160, + "step": 2535, + "train_runtime": 20601.31, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.536969696969697, + "grad_norm": 0.015743060037493706, + "learning_rate": 9.489534861618166e-05, + "loss": 0.014532624743878841, + "num_input_tokens_seen": 41529536, + "step": 2536, + "train_runtime": 20609.4325, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.5375757575757576, + "grad_norm": 0.006268225610256195, + "learning_rate": 9.489111490314433e-05, + "loss": 0.011723088100552559, + "num_input_tokens_seen": 41545912, + "step": 2537, + "train_runtime": 20617.55, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.538181818181818, + "grad_norm": 0.008138432167470455, + "learning_rate": 9.488687952967207e-05, + "loss": 0.0119707603007555, + "num_input_tokens_seen": 41562288, + "step": 2538, + "train_runtime": 20625.6618, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.5387878787878788, + "grad_norm": 0.010901181027293205, + "learning_rate": 9.488264249592154e-05, + "loss": 0.01287322398275137, + "num_input_tokens_seen": 41578664, + "step": 2539, + "train_runtime": 20633.7742, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.5393939393939395, + "grad_norm": 0.008760156109929085, + "learning_rate": 9.487840380204949e-05, + "loss": 0.012112857773900032, + "num_input_tokens_seen": 41595040, + "step": 2540, + "train_runtime": 20641.8924, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.54, + "grad_norm": 0.014003272168338299, + "learning_rate": 9.487416344821267e-05, + "loss": 0.012452900409698486, + "num_input_tokens_seen": 41611416, + "step": 2541, + "train_runtime": 20650.0123, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.5406060606060605, + "grad_norm": 0.01951734907925129, + "learning_rate": 9.486992143456792e-05, + "loss": 0.01286186370998621, + "num_input_tokens_seen": 41627792, + "step": 2542, + "train_runtime": 20658.1339, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.5412121212121213, + "grad_norm": 0.004000355023890734, + "learning_rate": 9.486567776127218e-05, + "loss": 0.011163925752043724, + "num_input_tokens_seen": 41644168, + "step": 2543, + "train_runtime": 20666.2487, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.541818181818182, + "grad_norm": 0.015947144478559494, + "learning_rate": 9.486143242848238e-05, + "loss": 0.011878188699483871, + "num_input_tokens_seen": 41660544, + "step": 2544, + "train_runtime": 20674.3655, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.5424242424242425, + "grad_norm": 0.01202548760920763, + "learning_rate": 9.485718543635555e-05, + "loss": 0.013055860064923763, + "num_input_tokens_seen": 41676920, + "step": 2545, + "train_runtime": 20682.4826, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.543030303030303, + "grad_norm": 0.02771756984293461, + "learning_rate": 9.485293678504879e-05, + "loss": 0.012415559962391853, + "num_input_tokens_seen": 41693296, + "step": 2546, + "train_runtime": 20690.6025, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.5436363636363635, + "grad_norm": 0.0019140188815072179, + "learning_rate": 9.484868647471926e-05, + "loss": 0.010037734173238277, + "num_input_tokens_seen": 41709672, + "step": 2547, + "train_runtime": 20698.72, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.5442424242424242, + "grad_norm": 0.00840219296514988, + "learning_rate": 9.484443450552413e-05, + "loss": 0.012736542150378227, + "num_input_tokens_seen": 41726048, + "step": 2548, + "train_runtime": 20706.8338, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.544848484848485, + "grad_norm": 0.025348152965307236, + "learning_rate": 9.484018087762072e-05, + "loss": 0.013136149384081364, + "num_input_tokens_seen": 41742424, + "step": 2549, + "train_runtime": 20714.95, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 0.01935630850493908, + "learning_rate": 9.483592559116633e-05, + "loss": 0.012145849876105785, + "num_input_tokens_seen": 41758800, + "step": 2550, + "train_runtime": 20723.0714, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.546060606060606, + "grad_norm": 0.015744099393486977, + "learning_rate": 9.483166864631837e-05, + "loss": 0.011445660144090652, + "num_input_tokens_seen": 41775176, + "step": 2551, + "train_runtime": 20731.188, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 0.009685778059065342, + "learning_rate": 9.48274100432343e-05, + "loss": 0.013055415824055672, + "num_input_tokens_seen": 41791552, + "step": 2552, + "train_runtime": 20739.3039, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.5472727272727274, + "grad_norm": 0.010554308071732521, + "learning_rate": 9.48231497820716e-05, + "loss": 0.01200939528644085, + "num_input_tokens_seen": 41807928, + "step": 2553, + "train_runtime": 20747.4234, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.5478787878787879, + "grad_norm": 0.0028376460541039705, + "learning_rate": 9.481888786298791e-05, + "loss": 0.011462630704045296, + "num_input_tokens_seen": 41824304, + "step": 2554, + "train_runtime": 20755.5419, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.5484848484848484, + "grad_norm": 0.009041995741426945, + "learning_rate": 9.481462428614083e-05, + "loss": 0.012170386500656605, + "num_input_tokens_seen": 41840680, + "step": 2555, + "train_runtime": 20763.6649, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.549090909090909, + "grad_norm": 0.015133941546082497, + "learning_rate": 9.481035905168808e-05, + "loss": 0.01340141985565424, + "num_input_tokens_seen": 41857056, + "step": 2556, + "train_runtime": 20771.7807, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.5496969696969698, + "grad_norm": 0.01299117412418127, + "learning_rate": 9.48060921597874e-05, + "loss": 0.012615041807293892, + "num_input_tokens_seen": 41873432, + "step": 2557, + "train_runtime": 20779.8958, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.5503030303030303, + "grad_norm": 0.013336027972400188, + "learning_rate": 9.480182361059662e-05, + "loss": 0.013393765315413475, + "num_input_tokens_seen": 41889808, + "step": 2558, + "train_runtime": 20788.0093, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.5509090909090908, + "grad_norm": 0.001979593886062503, + "learning_rate": 9.479755340427365e-05, + "loss": 0.011380261741578579, + "num_input_tokens_seen": 41906184, + "step": 2559, + "train_runtime": 20796.1337, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.5515151515151515, + "grad_norm": 0.011270053684711456, + "learning_rate": 9.479328154097642e-05, + "loss": 0.012305236421525478, + "num_input_tokens_seen": 41922560, + "step": 2560, + "train_runtime": 20804.2513, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.5521212121212122, + "grad_norm": 0.0068170237354934216, + "learning_rate": 9.478900802086292e-05, + "loss": 0.011324869468808174, + "num_input_tokens_seen": 41938936, + "step": 2561, + "train_runtime": 20812.3727, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.5527272727272727, + "grad_norm": 0.022808684036135674, + "learning_rate": 9.478473284409124e-05, + "loss": 0.01196727342903614, + "num_input_tokens_seen": 41955312, + "step": 2562, + "train_runtime": 20820.4878, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.5533333333333332, + "grad_norm": 0.007863566279411316, + "learning_rate": 9.478045601081952e-05, + "loss": 0.012731033377349377, + "num_input_tokens_seen": 41971688, + "step": 2563, + "train_runtime": 20828.6152, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.553939393939394, + "grad_norm": 0.007482618093490601, + "learning_rate": 9.477617752120593e-05, + "loss": 0.012603063136339188, + "num_input_tokens_seen": 41988064, + "step": 2564, + "train_runtime": 20836.7363, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.5545454545454547, + "grad_norm": 0.03489001467823982, + "learning_rate": 9.477189737540873e-05, + "loss": 0.012344112619757652, + "num_input_tokens_seen": 42004440, + "step": 2565, + "train_runtime": 20844.8571, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.5551515151515152, + "grad_norm": 0.011057870462536812, + "learning_rate": 9.476761557358623e-05, + "loss": 0.011534559540450573, + "num_input_tokens_seen": 42020816, + "step": 2566, + "train_runtime": 20852.9755, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.5557575757575757, + "grad_norm": 0.009332367219030857, + "learning_rate": 9.476333211589682e-05, + "loss": 0.013054633513092995, + "num_input_tokens_seen": 42037192, + "step": 2567, + "train_runtime": 20861.091, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.5563636363636364, + "grad_norm": 0.006530660204589367, + "learning_rate": 9.475904700249892e-05, + "loss": 0.010158851742744446, + "num_input_tokens_seen": 42053568, + "step": 2568, + "train_runtime": 20869.2092, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.5569696969696971, + "grad_norm": 0.010101820342242718, + "learning_rate": 9.475476023355103e-05, + "loss": 0.012717381119728088, + "num_input_tokens_seen": 42069944, + "step": 2569, + "train_runtime": 20877.3339, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.5575757575757576, + "grad_norm": 0.010927229188382626, + "learning_rate": 9.475047180921172e-05, + "loss": 0.011622841469943523, + "num_input_tokens_seen": 42086320, + "step": 2570, + "train_runtime": 20885.4524, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.5581818181818181, + "grad_norm": 0.006744182202965021, + "learning_rate": 9.474618172963963e-05, + "loss": 0.012462071143090725, + "num_input_tokens_seen": 42102696, + "step": 2571, + "train_runtime": 20893.5671, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.5587878787878788, + "grad_norm": 0.0053069149143993855, + "learning_rate": 9.474188999499339e-05, + "loss": 0.009743815287947655, + "num_input_tokens_seen": 42119072, + "step": 2572, + "train_runtime": 20901.6827, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.5593939393939396, + "grad_norm": 0.009455588646233082, + "learning_rate": 9.473759660543178e-05, + "loss": 0.012206991203129292, + "num_input_tokens_seen": 42135448, + "step": 2573, + "train_runtime": 20909.8014, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.56, + "grad_norm": 0.015813136473298073, + "learning_rate": 9.473330156111358e-05, + "loss": 0.012782268226146698, + "num_input_tokens_seen": 42151824, + "step": 2574, + "train_runtime": 20917.9217, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.5606060606060606, + "grad_norm": 0.011628096923232079, + "learning_rate": 9.472900486219769e-05, + "loss": 0.012355889193713665, + "num_input_tokens_seen": 42168200, + "step": 2575, + "train_runtime": 20926.038, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.561212121212121, + "grad_norm": 0.007116169203072786, + "learning_rate": 9.4724706508843e-05, + "loss": 0.012335915118455887, + "num_input_tokens_seen": 42184576, + "step": 2576, + "train_runtime": 20934.1567, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.5618181818181818, + "grad_norm": 0.007721391040831804, + "learning_rate": 9.472040650120852e-05, + "loss": 0.010860590264201164, + "num_input_tokens_seen": 42200952, + "step": 2577, + "train_runtime": 20942.2738, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.5624242424242425, + "grad_norm": 0.010738436132669449, + "learning_rate": 9.471610483945329e-05, + "loss": 0.012578247115015984, + "num_input_tokens_seen": 42217328, + "step": 2578, + "train_runtime": 20950.3912, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.563030303030303, + "grad_norm": 0.007581173907965422, + "learning_rate": 9.471180152373642e-05, + "loss": 0.011820399202406406, + "num_input_tokens_seen": 42233704, + "step": 2579, + "train_runtime": 20958.5067, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.5636363636363635, + "grad_norm": 0.013069345615804195, + "learning_rate": 9.47074965542171e-05, + "loss": 0.011658655479550362, + "num_input_tokens_seen": 42250080, + "step": 2580, + "train_runtime": 20966.6228, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.5642424242424242, + "grad_norm": 0.012723376974463463, + "learning_rate": 9.470318993105453e-05, + "loss": 0.014138157479465008, + "num_input_tokens_seen": 42266456, + "step": 2581, + "train_runtime": 20974.7366, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.564848484848485, + "grad_norm": 0.010134165175259113, + "learning_rate": 9.469888165440803e-05, + "loss": 0.012676380574703217, + "num_input_tokens_seen": 42282832, + "step": 2582, + "train_runtime": 20982.8521, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.5654545454545454, + "grad_norm": 0.02984500490128994, + "learning_rate": 9.469457172443694e-05, + "loss": 0.013629116117954254, + "num_input_tokens_seen": 42299208, + "step": 2583, + "train_runtime": 20990.9659, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.566060606060606, + "grad_norm": 0.006516328081488609, + "learning_rate": 9.469026014130068e-05, + "loss": 0.012136437930166721, + "num_input_tokens_seen": 42315584, + "step": 2584, + "train_runtime": 20999.0825, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.5666666666666667, + "grad_norm": 0.012075986713171005, + "learning_rate": 9.468594690515873e-05, + "loss": 0.011823137290775776, + "num_input_tokens_seen": 42331960, + "step": 2585, + "train_runtime": 21007.1994, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.5672727272727274, + "grad_norm": 0.018787235021591187, + "learning_rate": 9.468163201617062e-05, + "loss": 0.013657747767865658, + "num_input_tokens_seen": 42348336, + "step": 2586, + "train_runtime": 21015.3203, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.5678787878787879, + "grad_norm": 0.0069130174815654755, + "learning_rate": 9.467731547449596e-05, + "loss": 0.013095945119857788, + "num_input_tokens_seen": 42364712, + "step": 2587, + "train_runtime": 21023.4359, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.5684848484848484, + "grad_norm": 0.01700754649937153, + "learning_rate": 9.46729972802944e-05, + "loss": 0.012791439890861511, + "num_input_tokens_seen": 42381088, + "step": 2588, + "train_runtime": 21031.5541, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.569090909090909, + "grad_norm": 0.009264576248824596, + "learning_rate": 9.466867743372567e-05, + "loss": 0.012931954115629196, + "num_input_tokens_seen": 42397464, + "step": 2589, + "train_runtime": 21039.669, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.5696969696969698, + "grad_norm": 0.009157332591712475, + "learning_rate": 9.466435593494955e-05, + "loss": 0.01335228979587555, + "num_input_tokens_seen": 42413840, + "step": 2590, + "train_runtime": 21047.7829, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.5703030303030303, + "grad_norm": 0.006586118135601282, + "learning_rate": 9.46600327841259e-05, + "loss": 0.011794445104897022, + "num_input_tokens_seen": 42430216, + "step": 2591, + "train_runtime": 21055.8959, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.5709090909090908, + "grad_norm": 0.028860818594694138, + "learning_rate": 9.465570798141459e-05, + "loss": 0.012518493458628654, + "num_input_tokens_seen": 42446592, + "step": 2592, + "train_runtime": 21064.0134, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.5715151515151515, + "grad_norm": 0.008049269206821918, + "learning_rate": 9.46513815269756e-05, + "loss": 0.01254999078810215, + "num_input_tokens_seen": 42462968, + "step": 2593, + "train_runtime": 21072.1324, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.5721212121212123, + "grad_norm": 0.01140893530100584, + "learning_rate": 9.464705342096897e-05, + "loss": 0.012186221778392792, + "num_input_tokens_seen": 42479344, + "step": 2594, + "train_runtime": 21080.2464, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.5727272727272728, + "grad_norm": 0.009714704938232899, + "learning_rate": 9.464272366355479e-05, + "loss": 0.012780411168932915, + "num_input_tokens_seen": 42495720, + "step": 2595, + "train_runtime": 21088.3621, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 0.012686456553637981, + "learning_rate": 9.46383922548932e-05, + "loss": 0.011896700598299503, + "num_input_tokens_seen": 42512096, + "step": 2596, + "train_runtime": 21096.4739, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.573939393939394, + "grad_norm": 0.006144341081380844, + "learning_rate": 9.463405919514438e-05, + "loss": 0.011786655522882938, + "num_input_tokens_seen": 42528472, + "step": 2597, + "train_runtime": 21104.5883, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.5745454545454547, + "grad_norm": 0.006458967924118042, + "learning_rate": 9.462972448446865e-05, + "loss": 0.011068484745919704, + "num_input_tokens_seen": 42544848, + "step": 2598, + "train_runtime": 21112.6999, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.5751515151515152, + "grad_norm": 0.009129184298217297, + "learning_rate": 9.462538812302634e-05, + "loss": 0.011313150636851788, + "num_input_tokens_seen": 42561224, + "step": 2599, + "train_runtime": 21120.8129, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.5757575757575757, + "grad_norm": 0.01139355730265379, + "learning_rate": 9.462105011097781e-05, + "loss": 0.013862118124961853, + "num_input_tokens_seen": 42577600, + "step": 2600, + "train_runtime": 21128.933, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.5763636363636364, + "grad_norm": 0.008616439066827297, + "learning_rate": 9.461671044848352e-05, + "loss": 0.012551181018352509, + "num_input_tokens_seen": 42593976, + "step": 2601, + "train_runtime": 21138.0207, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 1.5769696969696971, + "grad_norm": 0.00892567913979292, + "learning_rate": 9.461236913570403e-05, + "loss": 0.013059779070317745, + "num_input_tokens_seen": 42610352, + "step": 2602, + "train_runtime": 21146.1334, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.5775757575757576, + "grad_norm": 0.009504597634077072, + "learning_rate": 9.460802617279988e-05, + "loss": 0.011425955221056938, + "num_input_tokens_seen": 42626728, + "step": 2603, + "train_runtime": 21154.2477, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.5781818181818181, + "grad_norm": 0.010583021678030491, + "learning_rate": 9.460368155993169e-05, + "loss": 0.011836562305688858, + "num_input_tokens_seen": 42643104, + "step": 2604, + "train_runtime": 21162.3587, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.5787878787878786, + "grad_norm": 0.008107494562864304, + "learning_rate": 9.459933529726018e-05, + "loss": 0.011621729470789433, + "num_input_tokens_seen": 42659480, + "step": 2605, + "train_runtime": 21170.4693, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.5793939393939394, + "grad_norm": 0.004669983871281147, + "learning_rate": 9.459498738494613e-05, + "loss": 0.011748522520065308, + "num_input_tokens_seen": 42675856, + "step": 2606, + "train_runtime": 21178.5793, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.58, + "grad_norm": 0.008583268150687218, + "learning_rate": 9.459063782315032e-05, + "loss": 0.011582738719880581, + "num_input_tokens_seen": 42692232, + "step": 2607, + "train_runtime": 21186.6935, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.5806060606060606, + "grad_norm": 0.0053719752468168736, + "learning_rate": 9.458628661203367e-05, + "loss": 0.011640815064311028, + "num_input_tokens_seen": 42708608, + "step": 2608, + "train_runtime": 21194.8072, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.581212121212121, + "grad_norm": 0.008548380807042122, + "learning_rate": 9.45819337517571e-05, + "loss": 0.013440142385661602, + "num_input_tokens_seen": 42724984, + "step": 2609, + "train_runtime": 21202.923, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.5818181818181818, + "grad_norm": 0.009923234581947327, + "learning_rate": 9.457757924248163e-05, + "loss": 0.012190381065011024, + "num_input_tokens_seen": 42741360, + "step": 2610, + "train_runtime": 21211.0385, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.5824242424242425, + "grad_norm": 0.007502941880375147, + "learning_rate": 9.457322308436828e-05, + "loss": 0.012137582525610924, + "num_input_tokens_seen": 42757736, + "step": 2611, + "train_runtime": 21219.1524, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.583030303030303, + "grad_norm": 0.009094790555536747, + "learning_rate": 9.456886527757825e-05, + "loss": 0.012535467743873596, + "num_input_tokens_seen": 42774112, + "step": 2612, + "train_runtime": 21227.262, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.5836363636363635, + "grad_norm": 0.014347701333463192, + "learning_rate": 9.456450582227267e-05, + "loss": 0.013268754817545414, + "num_input_tokens_seen": 42790488, + "step": 2613, + "train_runtime": 21235.3767, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.5842424242424242, + "grad_norm": 0.01596418023109436, + "learning_rate": 9.456014471861281e-05, + "loss": 0.013029556721448898, + "num_input_tokens_seen": 42806864, + "step": 2614, + "train_runtime": 21243.49, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.584848484848485, + "grad_norm": 0.00878889486193657, + "learning_rate": 9.455578196675999e-05, + "loss": 0.01179637759923935, + "num_input_tokens_seen": 42823240, + "step": 2615, + "train_runtime": 21251.6075, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.5854545454545454, + "grad_norm": 0.0020696879364550114, + "learning_rate": 9.455141756687554e-05, + "loss": 0.012115818448364735, + "num_input_tokens_seen": 42839616, + "step": 2616, + "train_runtime": 21259.7328, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.586060606060606, + "grad_norm": 0.01816830411553383, + "learning_rate": 9.454705151912091e-05, + "loss": 0.012639476917684078, + "num_input_tokens_seen": 42855992, + "step": 2617, + "train_runtime": 21267.8554, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.5866666666666667, + "grad_norm": 0.011715560220181942, + "learning_rate": 9.45426838236576e-05, + "loss": 0.01289452612400055, + "num_input_tokens_seen": 42872368, + "step": 2618, + "train_runtime": 21275.9757, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.5872727272727274, + "grad_norm": 0.00640740105882287, + "learning_rate": 9.453831448064717e-05, + "loss": 0.011819720268249512, + "num_input_tokens_seen": 42888744, + "step": 2619, + "train_runtime": 21284.1025, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.587878787878788, + "grad_norm": 0.00489066680893302, + "learning_rate": 9.453394349025122e-05, + "loss": 0.0112529331818223, + "num_input_tokens_seen": 42905120, + "step": 2620, + "train_runtime": 21292.2323, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.5884848484848484, + "grad_norm": 0.010229917243123055, + "learning_rate": 9.452957085263142e-05, + "loss": 0.01201337669044733, + "num_input_tokens_seen": 42921496, + "step": 2621, + "train_runtime": 21300.3542, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.589090909090909, + "grad_norm": 0.007779464591294527, + "learning_rate": 9.452519656794952e-05, + "loss": 0.012388527393341064, + "num_input_tokens_seen": 42937872, + "step": 2622, + "train_runtime": 21308.4726, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.5896969696969698, + "grad_norm": 0.01498553529381752, + "learning_rate": 9.452082063636729e-05, + "loss": 0.012615111656486988, + "num_input_tokens_seen": 42954248, + "step": 2623, + "train_runtime": 21316.591, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.5903030303030303, + "grad_norm": 0.01064207497984171, + "learning_rate": 9.45164430580466e-05, + "loss": 0.012389152310788631, + "num_input_tokens_seen": 42970624, + "step": 2624, + "train_runtime": 21324.7033, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.019471580162644386, + "learning_rate": 9.451206383314941e-05, + "loss": 0.014277242124080658, + "num_input_tokens_seen": 42987000, + "step": 2625, + "train_runtime": 21332.8185, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.5915151515151515, + "grad_norm": 0.012485023587942123, + "learning_rate": 9.450768296183765e-05, + "loss": 0.011769906617701054, + "num_input_tokens_seen": 43003376, + "step": 2626, + "train_runtime": 21340.9508, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.5921212121212123, + "grad_norm": 0.006313348188996315, + "learning_rate": 9.450330044427336e-05, + "loss": 0.010830282233655453, + "num_input_tokens_seen": 43019752, + "step": 2627, + "train_runtime": 21349.0674, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.5927272727272728, + "grad_norm": 0.005766854155808687, + "learning_rate": 9.449891628061864e-05, + "loss": 0.012779559940099716, + "num_input_tokens_seen": 43036128, + "step": 2628, + "train_runtime": 21357.1866, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.5933333333333333, + "grad_norm": 0.022942470386624336, + "learning_rate": 9.449453047103569e-05, + "loss": 0.012017364613711834, + "num_input_tokens_seen": 43052504, + "step": 2629, + "train_runtime": 21365.3011, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.593939393939394, + "grad_norm": 0.005800590384751558, + "learning_rate": 9.449014301568671e-05, + "loss": 0.012153777293860912, + "num_input_tokens_seen": 43068880, + "step": 2630, + "train_runtime": 21373.4177, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.5945454545454547, + "grad_norm": 0.014468961395323277, + "learning_rate": 9.448575391473396e-05, + "loss": 0.012839552015066147, + "num_input_tokens_seen": 43085256, + "step": 2631, + "train_runtime": 21381.5335, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.5951515151515152, + "grad_norm": 0.020431358367204666, + "learning_rate": 9.448136316833981e-05, + "loss": 0.01263053435832262, + "num_input_tokens_seen": 43101632, + "step": 2632, + "train_runtime": 21389.6484, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.5957575757575757, + "grad_norm": 0.028599435463547707, + "learning_rate": 9.447697077666666e-05, + "loss": 0.013531115837395191, + "num_input_tokens_seen": 43118008, + "step": 2633, + "train_runtime": 21397.7637, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.5963636363636362, + "grad_norm": 0.004123292397707701, + "learning_rate": 9.447257673987697e-05, + "loss": 0.011205705814063549, + "num_input_tokens_seen": 43134384, + "step": 2634, + "train_runtime": 21405.8785, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.596969696969697, + "grad_norm": 0.009270669892430305, + "learning_rate": 9.44681810581333e-05, + "loss": 0.01207477692514658, + "num_input_tokens_seen": 43150760, + "step": 2635, + "train_runtime": 21413.992, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.5975757575757576, + "grad_norm": 0.0058461823500692844, + "learning_rate": 9.446378373159818e-05, + "loss": 0.012760588899254799, + "num_input_tokens_seen": 43167136, + "step": 2636, + "train_runtime": 21422.1076, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.5981818181818181, + "grad_norm": 0.009135591797530651, + "learning_rate": 9.44593847604343e-05, + "loss": 0.012779445387423038, + "num_input_tokens_seen": 43183512, + "step": 2637, + "train_runtime": 21430.2211, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.5987878787878786, + "grad_norm": 0.007859868928790092, + "learning_rate": 9.445498414480436e-05, + "loss": 0.012065579183399677, + "num_input_tokens_seen": 43199888, + "step": 2638, + "train_runtime": 21438.3375, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.5993939393939394, + "grad_norm": 0.0413634367287159, + "learning_rate": 9.445058188487113e-05, + "loss": 0.011168006807565689, + "num_input_tokens_seen": 43216264, + "step": 2639, + "train_runtime": 21446.4552, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.6, + "grad_norm": 0.006517525762319565, + "learning_rate": 9.444617798079744e-05, + "loss": 0.011061888188123703, + "num_input_tokens_seen": 43232640, + "step": 2640, + "train_runtime": 21454.5714, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.6006060606060606, + "grad_norm": 0.007652346510440111, + "learning_rate": 9.444177243274618e-05, + "loss": 0.01161403302103281, + "num_input_tokens_seen": 43249016, + "step": 2641, + "train_runtime": 21462.6866, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.601212121212121, + "grad_norm": 0.008789447136223316, + "learning_rate": 9.44373652408803e-05, + "loss": 0.012821921147406101, + "num_input_tokens_seen": 43265392, + "step": 2642, + "train_runtime": 21470.8059, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.6018181818181818, + "grad_norm": 0.012086639180779457, + "learning_rate": 9.443295640536283e-05, + "loss": 0.011262400075793266, + "num_input_tokens_seen": 43281768, + "step": 2643, + "train_runtime": 21478.9332, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.6024242424242425, + "grad_norm": 0.011036599986255169, + "learning_rate": 9.442854592635681e-05, + "loss": 0.011876849457621574, + "num_input_tokens_seen": 43298144, + "step": 2644, + "train_runtime": 21487.0481, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.603030303030303, + "grad_norm": 0.011125577613711357, + "learning_rate": 9.442413380402541e-05, + "loss": 0.012037638574838638, + "num_input_tokens_seen": 43314520, + "step": 2645, + "train_runtime": 21495.1688, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.6036363636363635, + "grad_norm": 0.007470986340194941, + "learning_rate": 9.441972003853181e-05, + "loss": 0.011867578141391277, + "num_input_tokens_seen": 43330896, + "step": 2646, + "train_runtime": 21503.286, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.6042424242424242, + "grad_norm": 0.009798098355531693, + "learning_rate": 9.441530463003928e-05, + "loss": 0.011247251182794571, + "num_input_tokens_seen": 43347272, + "step": 2647, + "train_runtime": 21511.4064, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.604848484848485, + "grad_norm": 0.05813064053654671, + "learning_rate": 9.441088757871112e-05, + "loss": 0.011350232176482677, + "num_input_tokens_seen": 43363648, + "step": 2648, + "train_runtime": 21519.5234, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.6054545454545455, + "grad_norm": 0.009082259610295296, + "learning_rate": 9.440646888471071e-05, + "loss": 0.013249941170215607, + "num_input_tokens_seen": 43380024, + "step": 2649, + "train_runtime": 21527.6397, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.013390403240919113, + "learning_rate": 9.440204854820149e-05, + "loss": 0.01255058590322733, + "num_input_tokens_seen": 43396400, + "step": 2650, + "train_runtime": 21535.7563, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.6066666666666667, + "grad_norm": 0.007771987933665514, + "learning_rate": 9.439762656934698e-05, + "loss": 0.012910917401313782, + "num_input_tokens_seen": 43412776, + "step": 2651, + "train_runtime": 21543.8783, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.6072727272727274, + "grad_norm": 0.010885335505008698, + "learning_rate": 9.43932029483107e-05, + "loss": 0.011177360080182552, + "num_input_tokens_seen": 43429152, + "step": 2652, + "train_runtime": 21551.994, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.607878787878788, + "grad_norm": 0.022055966779589653, + "learning_rate": 9.438877768525631e-05, + "loss": 0.01327504776418209, + "num_input_tokens_seen": 43445528, + "step": 2653, + "train_runtime": 21560.1083, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.6084848484848484, + "grad_norm": 0.007112656719982624, + "learning_rate": 9.438435078034749e-05, + "loss": 0.012204596772789955, + "num_input_tokens_seen": 43461904, + "step": 2654, + "train_runtime": 21568.235, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.6090909090909091, + "grad_norm": 0.010424617677927017, + "learning_rate": 9.437992223374794e-05, + "loss": 0.012944593094289303, + "num_input_tokens_seen": 43478280, + "step": 2655, + "train_runtime": 21576.3518, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.6096969696969698, + "grad_norm": 0.007999579422175884, + "learning_rate": 9.437549204562151e-05, + "loss": 0.011439507827162743, + "num_input_tokens_seen": 43494656, + "step": 2656, + "train_runtime": 21584.4663, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.6103030303030303, + "grad_norm": 0.009420707821846008, + "learning_rate": 9.437106021613204e-05, + "loss": 0.010938690043985844, + "num_input_tokens_seen": 43511032, + "step": 2657, + "train_runtime": 21592.5817, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.6109090909090908, + "grad_norm": 0.010780644603073597, + "learning_rate": 9.436662674544348e-05, + "loss": 0.013350578024983406, + "num_input_tokens_seen": 43527408, + "step": 2658, + "train_runtime": 21600.699, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.6115151515151516, + "grad_norm": 0.013084693811833858, + "learning_rate": 9.436219163371977e-05, + "loss": 0.013172319158911705, + "num_input_tokens_seen": 43543784, + "step": 2659, + "train_runtime": 21608.814, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.612121212121212, + "grad_norm": 0.012968843802809715, + "learning_rate": 9.4357754881125e-05, + "loss": 0.012867764569818974, + "num_input_tokens_seen": 43560160, + "step": 2660, + "train_runtime": 21616.9359, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.6127272727272728, + "grad_norm": 0.019006604328751564, + "learning_rate": 9.435331648782324e-05, + "loss": 0.011904004961252213, + "num_input_tokens_seen": 43576536, + "step": 2661, + "train_runtime": 21625.0507, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.6133333333333333, + "grad_norm": 0.0019992270972579718, + "learning_rate": 9.43488764539787e-05, + "loss": 0.013505300506949425, + "num_input_tokens_seen": 43592912, + "step": 2662, + "train_runtime": 21633.1713, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.6139393939393938, + "grad_norm": 0.01781928539276123, + "learning_rate": 9.434443477975558e-05, + "loss": 0.012036633677780628, + "num_input_tokens_seen": 43609288, + "step": 2663, + "train_runtime": 21641.2871, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.6145454545454545, + "grad_norm": 0.009235371835529804, + "learning_rate": 9.433999146531815e-05, + "loss": 0.012760656885802746, + "num_input_tokens_seen": 43625664, + "step": 2664, + "train_runtime": 21649.4077, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.6151515151515152, + "grad_norm": 0.010370719246566296, + "learning_rate": 9.433554651083082e-05, + "loss": 0.011637035757303238, + "num_input_tokens_seen": 43642040, + "step": 2665, + "train_runtime": 21657.5339, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.6157575757575757, + "grad_norm": 0.036511119455099106, + "learning_rate": 9.433109991645795e-05, + "loss": 0.011988237500190735, + "num_input_tokens_seen": 43658416, + "step": 2666, + "train_runtime": 21665.6475, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.6163636363636362, + "grad_norm": 0.013008903712034225, + "learning_rate": 9.432665168236401e-05, + "loss": 0.01194752287119627, + "num_input_tokens_seen": 43674792, + "step": 2667, + "train_runtime": 21673.7643, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.616969696969697, + "grad_norm": 0.012565176002681255, + "learning_rate": 9.432220180871358e-05, + "loss": 0.012999428436160088, + "num_input_tokens_seen": 43691168, + "step": 2668, + "train_runtime": 21681.8784, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.6175757575757577, + "grad_norm": 0.005679360590875149, + "learning_rate": 9.43177502956712e-05, + "loss": 0.012297439388930798, + "num_input_tokens_seen": 43707544, + "step": 2669, + "train_runtime": 21689.994, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.6181818181818182, + "grad_norm": 0.015765322372317314, + "learning_rate": 9.431329714340154e-05, + "loss": 0.012844718061387539, + "num_input_tokens_seen": 43723920, + "step": 2670, + "train_runtime": 21698.1079, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.6187878787878787, + "grad_norm": 0.005528231151401997, + "learning_rate": 9.43088423520693e-05, + "loss": 0.012213103473186493, + "num_input_tokens_seen": 43740296, + "step": 2671, + "train_runtime": 21706.2193, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.6193939393939394, + "grad_norm": 0.020404066890478134, + "learning_rate": 9.430438592183928e-05, + "loss": 0.011327740736305714, + "num_input_tokens_seen": 43756672, + "step": 2672, + "train_runtime": 21714.334, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.62, + "grad_norm": 0.00987694226205349, + "learning_rate": 9.429992785287632e-05, + "loss": 0.010421988554298878, + "num_input_tokens_seen": 43773048, + "step": 2673, + "train_runtime": 21722.4492, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.6206060606060606, + "grad_norm": 0.0061499676667153835, + "learning_rate": 9.429546814534529e-05, + "loss": 0.011812202632427216, + "num_input_tokens_seen": 43789424, + "step": 2674, + "train_runtime": 21730.5672, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.621212121212121, + "grad_norm": 0.00769574660807848, + "learning_rate": 9.429100679941114e-05, + "loss": 0.010980362072587013, + "num_input_tokens_seen": 43805800, + "step": 2675, + "train_runtime": 21738.6884, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.6218181818181818, + "grad_norm": 0.011081526055932045, + "learning_rate": 9.428654381523892e-05, + "loss": 0.012496310286223888, + "num_input_tokens_seen": 43822176, + "step": 2676, + "train_runtime": 21746.807, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.6224242424242425, + "grad_norm": 0.013930363580584526, + "learning_rate": 9.428207919299368e-05, + "loss": 0.013375709764659405, + "num_input_tokens_seen": 43838552, + "step": 2677, + "train_runtime": 21754.9233, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.623030303030303, + "grad_norm": 0.016915637999773026, + "learning_rate": 9.427761293284057e-05, + "loss": 0.014272555708885193, + "num_input_tokens_seen": 43854928, + "step": 2678, + "train_runtime": 21763.0382, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.6236363636363635, + "grad_norm": 0.011937938630580902, + "learning_rate": 9.427314503494477e-05, + "loss": 0.012509403750300407, + "num_input_tokens_seen": 43871304, + "step": 2679, + "train_runtime": 21771.1558, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.6242424242424243, + "grad_norm": 0.014566425234079361, + "learning_rate": 9.426867549947158e-05, + "loss": 0.012007796205580235, + "num_input_tokens_seen": 43887680, + "step": 2680, + "train_runtime": 21779.2721, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.624848484848485, + "grad_norm": 0.011659136973321438, + "learning_rate": 9.426420432658627e-05, + "loss": 0.012796234339475632, + "num_input_tokens_seen": 43904056, + "step": 2681, + "train_runtime": 21787.388, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.6254545454545455, + "grad_norm": 0.007594785187393427, + "learning_rate": 9.425973151645426e-05, + "loss": 0.012028402648866177, + "num_input_tokens_seen": 43920432, + "step": 2682, + "train_runtime": 21795.5065, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.626060606060606, + "grad_norm": 0.017702842131257057, + "learning_rate": 9.425525706924096e-05, + "loss": 0.01287344191223383, + "num_input_tokens_seen": 43936808, + "step": 2683, + "train_runtime": 21803.6224, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 0.010299012064933777, + "learning_rate": 9.425078098511188e-05, + "loss": 0.01260048896074295, + "num_input_tokens_seen": 43953184, + "step": 2684, + "train_runtime": 21811.7422, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.6272727272727274, + "grad_norm": 0.009039688855409622, + "learning_rate": 9.424630326423259e-05, + "loss": 0.011654467321932316, + "num_input_tokens_seen": 43969560, + "step": 2685, + "train_runtime": 21819.8614, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.627878787878788, + "grad_norm": 0.006915550213307142, + "learning_rate": 9.424182390676872e-05, + "loss": 0.011898837052285671, + "num_input_tokens_seen": 43985936, + "step": 2686, + "train_runtime": 21827.9726, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.6284848484848484, + "grad_norm": 0.01088861282914877, + "learning_rate": 9.423734291288592e-05, + "loss": 0.012327456846833229, + "num_input_tokens_seen": 44002312, + "step": 2687, + "train_runtime": 21836.0845, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.6290909090909091, + "grad_norm": 0.012999316677451134, + "learning_rate": 9.423286028274997e-05, + "loss": 0.013050897978246212, + "num_input_tokens_seen": 44018688, + "step": 2688, + "train_runtime": 21844.1972, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.6296969696969696, + "grad_norm": 0.009379571303725243, + "learning_rate": 9.422837601652665e-05, + "loss": 0.012682823464274406, + "num_input_tokens_seen": 44035064, + "step": 2689, + "train_runtime": 21852.3117, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.6303030303030304, + "grad_norm": 0.00579515565186739, + "learning_rate": 9.422389011438184e-05, + "loss": 0.01177308801561594, + "num_input_tokens_seen": 44051440, + "step": 2690, + "train_runtime": 21860.4334, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.6309090909090909, + "grad_norm": 0.0018515904666855931, + "learning_rate": 9.421940257648146e-05, + "loss": 0.012456430122256279, + "num_input_tokens_seen": 44067816, + "step": 2691, + "train_runtime": 21868.5457, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.6315151515151514, + "grad_norm": 0.013134176842868328, + "learning_rate": 9.421491340299148e-05, + "loss": 0.012923019006848335, + "num_input_tokens_seen": 44084192, + "step": 2692, + "train_runtime": 21876.6591, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.632121212121212, + "grad_norm": 0.007743083406239748, + "learning_rate": 9.421042259407796e-05, + "loss": 0.011915619485080242, + "num_input_tokens_seen": 44100568, + "step": 2693, + "train_runtime": 21884.772, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.6327272727272728, + "grad_norm": 0.005690127145498991, + "learning_rate": 9.4205930149907e-05, + "loss": 0.010516472160816193, + "num_input_tokens_seen": 44116944, + "step": 2694, + "train_runtime": 21892.8858, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.6333333333333333, + "grad_norm": 0.011323309503495693, + "learning_rate": 9.420143607064478e-05, + "loss": 0.012086287140846252, + "num_input_tokens_seen": 44133320, + "step": 2695, + "train_runtime": 21901.0625, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.6339393939393938, + "grad_norm": 0.007330689113587141, + "learning_rate": 9.419694035645751e-05, + "loss": 0.012611635960638523, + "num_input_tokens_seen": 44149696, + "step": 2696, + "train_runtime": 21909.176, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.6345454545454545, + "grad_norm": 0.006508952938020229, + "learning_rate": 9.41924430075115e-05, + "loss": 0.010217411443591118, + "num_input_tokens_seen": 44166072, + "step": 2697, + "train_runtime": 21917.2911, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.6351515151515152, + "grad_norm": 0.009744730778038502, + "learning_rate": 9.418794402397307e-05, + "loss": 0.01241858210414648, + "num_input_tokens_seen": 44182448, + "step": 2698, + "train_runtime": 21925.4067, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.6357575757575757, + "grad_norm": 0.011063654907047749, + "learning_rate": 9.418344340600865e-05, + "loss": 0.011393502354621887, + "num_input_tokens_seen": 44198824, + "step": 2699, + "train_runtime": 21933.5244, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 0.006254155188798904, + "learning_rate": 9.41789411537847e-05, + "loss": 0.01240509282797575, + "num_input_tokens_seen": 44215200, + "step": 2700, + "train_runtime": 21941.6385, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.636969696969697, + "grad_norm": 0.01012837328016758, + "learning_rate": 9.417443726746776e-05, + "loss": 0.011583870276808739, + "num_input_tokens_seen": 44231576, + "step": 2701, + "train_runtime": 21950.696, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.6375757575757577, + "grad_norm": 0.013250237330794334, + "learning_rate": 9.416993174722439e-05, + "loss": 0.012408467009663582, + "num_input_tokens_seen": 44247952, + "step": 2702, + "train_runtime": 21958.8131, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.6381818181818182, + "grad_norm": 0.006648391485214233, + "learning_rate": 9.416542459322129e-05, + "loss": 0.013180596753954887, + "num_input_tokens_seen": 44264328, + "step": 2703, + "train_runtime": 21966.9333, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.6387878787878787, + "grad_norm": 0.009948927909135818, + "learning_rate": 9.416091580562512e-05, + "loss": 0.012322509661316872, + "num_input_tokens_seen": 44280704, + "step": 2704, + "train_runtime": 21975.0494, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.6393939393939394, + "grad_norm": 0.0077480534091591835, + "learning_rate": 9.415640538460267e-05, + "loss": 0.011874118819832802, + "num_input_tokens_seen": 44297080, + "step": 2705, + "train_runtime": 21983.1643, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.009989479556679726, + "learning_rate": 9.41518933303208e-05, + "loss": 0.013704563491046429, + "num_input_tokens_seen": 44313456, + "step": 2706, + "train_runtime": 21991.2775, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.6406060606060606, + "grad_norm": 0.00627144007012248, + "learning_rate": 9.414737964294636e-05, + "loss": 0.01262811291962862, + "num_input_tokens_seen": 44329832, + "step": 2707, + "train_runtime": 21999.3947, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.6412121212121211, + "grad_norm": 0.009063741192221642, + "learning_rate": 9.414286432264631e-05, + "loss": 0.013369398191571236, + "num_input_tokens_seen": 44346208, + "step": 2708, + "train_runtime": 22007.5139, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.6418181818181818, + "grad_norm": 0.007614328060299158, + "learning_rate": 9.413834736958768e-05, + "loss": 0.012139086611568928, + "num_input_tokens_seen": 44362584, + "step": 2709, + "train_runtime": 22015.6353, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.6424242424242426, + "grad_norm": 0.007654812186956406, + "learning_rate": 9.413382878393754e-05, + "loss": 0.011492074467241764, + "num_input_tokens_seen": 44378960, + "step": 2710, + "train_runtime": 22023.752, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.643030303030303, + "grad_norm": 0.007990519516170025, + "learning_rate": 9.412930856586304e-05, + "loss": 0.012617778033018112, + "num_input_tokens_seen": 44395336, + "step": 2711, + "train_runtime": 22031.8714, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.6436363636363636, + "grad_norm": 0.004427951294928789, + "learning_rate": 9.412478671553134e-05, + "loss": 0.011064354330301285, + "num_input_tokens_seen": 44411712, + "step": 2712, + "train_runtime": 22039.991, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.6442424242424243, + "grad_norm": 0.011509922333061695, + "learning_rate": 9.41202632331097e-05, + "loss": 0.012677370570600033, + "num_input_tokens_seen": 44428088, + "step": 2713, + "train_runtime": 22048.1121, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.644848484848485, + "grad_norm": 0.004903316497802734, + "learning_rate": 9.411573811876544e-05, + "loss": 0.011655345559120178, + "num_input_tokens_seen": 44444464, + "step": 2714, + "train_runtime": 22056.234, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.6454545454545455, + "grad_norm": 0.004879241809248924, + "learning_rate": 9.411121137266595e-05, + "loss": 0.011117602698504925, + "num_input_tokens_seen": 44460840, + "step": 2715, + "train_runtime": 22064.3531, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.646060606060606, + "grad_norm": 0.014790347777307034, + "learning_rate": 9.410668299497864e-05, + "loss": 0.013863930478692055, + "num_input_tokens_seen": 44477216, + "step": 2716, + "train_runtime": 22072.4672, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.6466666666666665, + "grad_norm": 0.01018503587692976, + "learning_rate": 9.410215298587104e-05, + "loss": 0.010815615765750408, + "num_input_tokens_seen": 44493592, + "step": 2717, + "train_runtime": 22080.5833, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.6472727272727272, + "grad_norm": 0.005130484700202942, + "learning_rate": 9.409762134551068e-05, + "loss": 0.012508288025856018, + "num_input_tokens_seen": 44509968, + "step": 2718, + "train_runtime": 22088.7033, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.647878787878788, + "grad_norm": 0.007447066716849804, + "learning_rate": 9.409308807406518e-05, + "loss": 0.011768568307161331, + "num_input_tokens_seen": 44526344, + "step": 2719, + "train_runtime": 22096.8327, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.6484848484848484, + "grad_norm": 0.007597601041197777, + "learning_rate": 9.408855317170222e-05, + "loss": 0.011454282328486443, + "num_input_tokens_seen": 44542720, + "step": 2720, + "train_runtime": 22104.9518, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.649090909090909, + "grad_norm": 0.00861985981464386, + "learning_rate": 9.408401663858953e-05, + "loss": 0.011637267656624317, + "num_input_tokens_seen": 44559096, + "step": 2721, + "train_runtime": 22113.0731, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.6496969696969697, + "grad_norm": 0.008903435431420803, + "learning_rate": 9.407947847489494e-05, + "loss": 0.01246644090861082, + "num_input_tokens_seen": 44575472, + "step": 2722, + "train_runtime": 22121.1881, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.6503030303030304, + "grad_norm": 0.013762188144028187, + "learning_rate": 9.407493868078625e-05, + "loss": 0.012603108771145344, + "num_input_tokens_seen": 44591848, + "step": 2723, + "train_runtime": 22129.3025, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.6509090909090909, + "grad_norm": 0.007037809584289789, + "learning_rate": 9.407039725643142e-05, + "loss": 0.011482897214591503, + "num_input_tokens_seen": 44608224, + "step": 2724, + "train_runtime": 22137.4172, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.6515151515151514, + "grad_norm": 0.0225937832146883, + "learning_rate": 9.406585420199843e-05, + "loss": 0.011697374284267426, + "num_input_tokens_seen": 44624600, + "step": 2725, + "train_runtime": 22145.5353, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.652121212121212, + "grad_norm": 0.004367986228317022, + "learning_rate": 9.406130951765529e-05, + "loss": 0.01129780150949955, + "num_input_tokens_seen": 44640976, + "step": 2726, + "train_runtime": 22153.6512, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.6527272727272728, + "grad_norm": 0.008844173513352871, + "learning_rate": 9.405676320357013e-05, + "loss": 0.010827938094735146, + "num_input_tokens_seen": 44657352, + "step": 2727, + "train_runtime": 22161.7685, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 0.009946335107088089, + "learning_rate": 9.405221525991108e-05, + "loss": 0.01152572687715292, + "num_input_tokens_seen": 44673728, + "step": 2728, + "train_runtime": 22169.8861, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.6539393939393938, + "grad_norm": 0.00656983582302928, + "learning_rate": 9.40476656868464e-05, + "loss": 0.012073036283254623, + "num_input_tokens_seen": 44690104, + "step": 2729, + "train_runtime": 22178.0059, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.6545454545454545, + "grad_norm": 0.008812750689685345, + "learning_rate": 9.404311448454433e-05, + "loss": 0.011705856770277023, + "num_input_tokens_seen": 44706480, + "step": 2730, + "train_runtime": 22186.1189, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.6551515151515153, + "grad_norm": 0.0035067375283688307, + "learning_rate": 9.403856165317321e-05, + "loss": 0.012019923888146877, + "num_input_tokens_seen": 44722856, + "step": 2731, + "train_runtime": 22194.2334, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.6557575757575758, + "grad_norm": 0.015511504374444485, + "learning_rate": 9.403400719290147e-05, + "loss": 0.01346611324697733, + "num_input_tokens_seen": 44739232, + "step": 2732, + "train_runtime": 22202.352, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.6563636363636363, + "grad_norm": 0.007372966967523098, + "learning_rate": 9.402945110389757e-05, + "loss": 0.012887794524431229, + "num_input_tokens_seen": 44755608, + "step": 2733, + "train_runtime": 22210.4716, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.656969696969697, + "grad_norm": 0.023236919194459915, + "learning_rate": 9.402489338633001e-05, + "loss": 0.01442014705389738, + "num_input_tokens_seen": 44771984, + "step": 2734, + "train_runtime": 22218.5872, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.6575757575757577, + "grad_norm": 0.007083198521286249, + "learning_rate": 9.402033404036736e-05, + "loss": 0.012199325487017632, + "num_input_tokens_seen": 44788360, + "step": 2735, + "train_runtime": 22226.7057, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.6581818181818182, + "grad_norm": 0.008631768636405468, + "learning_rate": 9.40157730661783e-05, + "loss": 0.01288297027349472, + "num_input_tokens_seen": 44804736, + "step": 2736, + "train_runtime": 22234.834, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.6587878787878787, + "grad_norm": 0.009771174751222134, + "learning_rate": 9.401121046393151e-05, + "loss": 0.01390773430466652, + "num_input_tokens_seen": 44821112, + "step": 2737, + "train_runtime": 22242.9515, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.6593939393939394, + "grad_norm": 0.007200576830655336, + "learning_rate": 9.400664623379573e-05, + "loss": 0.01203729398548603, + "num_input_tokens_seen": 44837488, + "step": 2738, + "train_runtime": 22251.0674, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 0.006853716913610697, + "learning_rate": 9.400208037593983e-05, + "loss": 0.012496777810156345, + "num_input_tokens_seen": 44853864, + "step": 2739, + "train_runtime": 22259.1856, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.6606060606060606, + "grad_norm": 0.017664391547441483, + "learning_rate": 9.399751289053267e-05, + "loss": 0.013294186443090439, + "num_input_tokens_seen": 44870240, + "step": 2740, + "train_runtime": 22267.3004, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.6612121212121211, + "grad_norm": 0.005868007894605398, + "learning_rate": 9.399294377774318e-05, + "loss": 0.012370945885777473, + "num_input_tokens_seen": 44886616, + "step": 2741, + "train_runtime": 22275.4172, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.6618181818181819, + "grad_norm": 0.014271008782088757, + "learning_rate": 9.398837303774037e-05, + "loss": 0.01195800956338644, + "num_input_tokens_seen": 44902992, + "step": 2742, + "train_runtime": 22283.5344, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.6624242424242426, + "grad_norm": 0.012290251441299915, + "learning_rate": 9.39838006706933e-05, + "loss": 0.0112903518602252, + "num_input_tokens_seen": 44919368, + "step": 2743, + "train_runtime": 22291.6532, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.663030303030303, + "grad_norm": 0.006131591275334358, + "learning_rate": 9.39792266767711e-05, + "loss": 0.012527124024927616, + "num_input_tokens_seen": 44935744, + "step": 2744, + "train_runtime": 22299.7686, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.6636363636363636, + "grad_norm": 0.019052183255553246, + "learning_rate": 9.397465105614296e-05, + "loss": 0.013172638602554798, + "num_input_tokens_seen": 44952120, + "step": 2745, + "train_runtime": 22307.8823, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.664242424242424, + "grad_norm": 0.013884824700653553, + "learning_rate": 9.39700738089781e-05, + "loss": 0.012267852202057838, + "num_input_tokens_seen": 44968496, + "step": 2746, + "train_runtime": 22315.9938, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.6648484848484848, + "grad_norm": 0.00940402876585722, + "learning_rate": 9.396549493544584e-05, + "loss": 0.011282133869826794, + "num_input_tokens_seen": 44984872, + "step": 2747, + "train_runtime": 22324.1106, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.6654545454545455, + "grad_norm": 0.004152586217969656, + "learning_rate": 9.396091443571555e-05, + "loss": 0.010489017702639103, + "num_input_tokens_seen": 45001248, + "step": 2748, + "train_runtime": 22332.2249, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.666060606060606, + "grad_norm": 0.010575785301625729, + "learning_rate": 9.395633230995664e-05, + "loss": 0.011070530861616135, + "num_input_tokens_seen": 45017624, + "step": 2749, + "train_runtime": 22340.3421, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.010641394183039665, + "learning_rate": 9.39517485583386e-05, + "loss": 0.012648802250623703, + "num_input_tokens_seen": 45034000, + "step": 2750, + "train_runtime": 22348.4522, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.6672727272727272, + "grad_norm": 0.008935361169278622, + "learning_rate": 9.394716318103098e-05, + "loss": 0.01304717268794775, + "num_input_tokens_seen": 45050376, + "step": 2751, + "train_runtime": 22356.568, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.667878787878788, + "grad_norm": 0.009867693297564983, + "learning_rate": 9.394257617820336e-05, + "loss": 0.011446228250861168, + "num_input_tokens_seen": 45066752, + "step": 2752, + "train_runtime": 22364.6824, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.6684848484848485, + "grad_norm": 0.007289520464837551, + "learning_rate": 9.393798755002544e-05, + "loss": 0.012150555849075317, + "num_input_tokens_seen": 45083128, + "step": 2753, + "train_runtime": 22372.7957, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.669090909090909, + "grad_norm": 0.01773468591272831, + "learning_rate": 9.393339729666693e-05, + "loss": 0.01132100261747837, + "num_input_tokens_seen": 45099504, + "step": 2754, + "train_runtime": 22380.9119, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.6696969696969697, + "grad_norm": 0.01178825180977583, + "learning_rate": 9.392880541829758e-05, + "loss": 0.013354834169149399, + "num_input_tokens_seen": 45115880, + "step": 2755, + "train_runtime": 22389.0244, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.6703030303030304, + "grad_norm": 0.012685943394899368, + "learning_rate": 9.392421191508729e-05, + "loss": 0.012555164285004139, + "num_input_tokens_seen": 45132256, + "step": 2756, + "train_runtime": 22397.1411, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.670909090909091, + "grad_norm": 0.008436055853962898, + "learning_rate": 9.391961678720594e-05, + "loss": 0.012797040864825249, + "num_input_tokens_seen": 45148632, + "step": 2757, + "train_runtime": 22405.2557, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.6715151515151514, + "grad_norm": 0.011204235255718231, + "learning_rate": 9.391502003482349e-05, + "loss": 0.01226828433573246, + "num_input_tokens_seen": 45165008, + "step": 2758, + "train_runtime": 22413.372, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.6721212121212121, + "grad_norm": 0.007430667523294687, + "learning_rate": 9.391042165810996e-05, + "loss": 0.013038146309554577, + "num_input_tokens_seen": 45181384, + "step": 2759, + "train_runtime": 22421.4906, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.6727272727272728, + "grad_norm": 0.012358262203633785, + "learning_rate": 9.390582165723544e-05, + "loss": 0.012622443027794361, + "num_input_tokens_seen": 45197760, + "step": 2760, + "train_runtime": 22429.6043, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.6733333333333333, + "grad_norm": 0.009910723194479942, + "learning_rate": 9.39012200323701e-05, + "loss": 0.013371050357818604, + "num_input_tokens_seen": 45214136, + "step": 2761, + "train_runtime": 22437.7158, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.6739393939393938, + "grad_norm": 0.006673166062682867, + "learning_rate": 9.389661678368413e-05, + "loss": 0.012958286330103874, + "num_input_tokens_seen": 45230512, + "step": 2762, + "train_runtime": 22445.8348, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.6745454545454546, + "grad_norm": 0.010125633329153061, + "learning_rate": 9.389201191134776e-05, + "loss": 0.011834094300866127, + "num_input_tokens_seen": 45246888, + "step": 2763, + "train_runtime": 22453.9495, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.6751515151515153, + "grad_norm": 0.00810938235372305, + "learning_rate": 9.388740541553138e-05, + "loss": 0.012278936803340912, + "num_input_tokens_seen": 45263264, + "step": 2764, + "train_runtime": 22462.0643, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.6757575757575758, + "grad_norm": 0.009514245204627514, + "learning_rate": 9.388279729640531e-05, + "loss": 0.011925340630114079, + "num_input_tokens_seen": 45279640, + "step": 2765, + "train_runtime": 22470.1772, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.6763636363636363, + "grad_norm": 0.015000533312559128, + "learning_rate": 9.387818755414004e-05, + "loss": 0.011756815947592258, + "num_input_tokens_seen": 45296016, + "step": 2766, + "train_runtime": 22478.2921, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.676969696969697, + "grad_norm": 0.008925878442823887, + "learning_rate": 9.387357618890606e-05, + "loss": 0.011443368159234524, + "num_input_tokens_seen": 45312392, + "step": 2767, + "train_runtime": 22486.4077, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.6775757575757577, + "grad_norm": 0.008592803962528706, + "learning_rate": 9.386896320087392e-05, + "loss": 0.012534118257462978, + "num_input_tokens_seen": 45328768, + "step": 2768, + "train_runtime": 22494.5243, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.6781818181818182, + "grad_norm": 0.007586160209029913, + "learning_rate": 9.386434859021429e-05, + "loss": 0.011447837576270103, + "num_input_tokens_seen": 45345144, + "step": 2769, + "train_runtime": 22502.6471, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.6787878787878787, + "grad_norm": 0.004868637304753065, + "learning_rate": 9.385973235709781e-05, + "loss": 0.011121487244963646, + "num_input_tokens_seen": 45361520, + "step": 2770, + "train_runtime": 22510.7612, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.6793939393939394, + "grad_norm": 0.01013186015188694, + "learning_rate": 9.385511450169525e-05, + "loss": 0.012332988902926445, + "num_input_tokens_seen": 45377896, + "step": 2771, + "train_runtime": 22518.8756, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.013302835635840893, + "learning_rate": 9.385049502417742e-05, + "loss": 0.012212036177515984, + "num_input_tokens_seen": 45394272, + "step": 2772, + "train_runtime": 22526.9886, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.6806060606060607, + "grad_norm": 0.01163007877767086, + "learning_rate": 9.384587392471515e-05, + "loss": 0.011997763998806477, + "num_input_tokens_seen": 45410648, + "step": 2773, + "train_runtime": 22535.105, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.6812121212121212, + "grad_norm": 0.010505879297852516, + "learning_rate": 9.38412512034794e-05, + "loss": 0.013251842930912971, + "num_input_tokens_seen": 45427024, + "step": 2774, + "train_runtime": 22543.222, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.6818181818181817, + "grad_norm": 0.009793641045689583, + "learning_rate": 9.383662686064114e-05, + "loss": 0.012996641919016838, + "num_input_tokens_seen": 45443400, + "step": 2775, + "train_runtime": 22551.3383, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.6824242424242424, + "grad_norm": 0.010234086774289608, + "learning_rate": 9.383200089637143e-05, + "loss": 0.01099113654345274, + "num_input_tokens_seen": 45459776, + "step": 2776, + "train_runtime": 22559.4538, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.683030303030303, + "grad_norm": 0.013903598301112652, + "learning_rate": 9.382737331084137e-05, + "loss": 0.01228981651365757, + "num_input_tokens_seen": 45476152, + "step": 2777, + "train_runtime": 22567.5733, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.6836363636363636, + "grad_norm": 0.010938679799437523, + "learning_rate": 9.382274410422211e-05, + "loss": 0.012848911806941032, + "num_input_tokens_seen": 45492528, + "step": 2778, + "train_runtime": 22575.6879, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.684242424242424, + "grad_norm": 0.022132035344839096, + "learning_rate": 9.381811327668488e-05, + "loss": 0.012318627908825874, + "num_input_tokens_seen": 45508904, + "step": 2779, + "train_runtime": 22583.8014, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.6848484848484848, + "grad_norm": 0.011289408430457115, + "learning_rate": 9.381348082840098e-05, + "loss": 0.011903293430805206, + "num_input_tokens_seen": 45525280, + "step": 2780, + "train_runtime": 22591.9158, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.6854545454545455, + "grad_norm": 0.019458260387182236, + "learning_rate": 9.380884675954176e-05, + "loss": 0.013369610533118248, + "num_input_tokens_seen": 45541656, + "step": 2781, + "train_runtime": 22600.0616, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.686060606060606, + "grad_norm": 0.014678013511002064, + "learning_rate": 9.380421107027859e-05, + "loss": 0.012422224506735802, + "num_input_tokens_seen": 45558032, + "step": 2782, + "train_runtime": 22608.1756, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.6866666666666665, + "grad_norm": 0.008445504121482372, + "learning_rate": 9.379957376078297e-05, + "loss": 0.011338223703205585, + "num_input_tokens_seen": 45574408, + "step": 2783, + "train_runtime": 22616.2904, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.6872727272727273, + "grad_norm": 0.01064318511635065, + "learning_rate": 9.379493483122642e-05, + "loss": 0.01236021425575018, + "num_input_tokens_seen": 45590784, + "step": 2784, + "train_runtime": 22624.4065, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.687878787878788, + "grad_norm": 0.006612302735447884, + "learning_rate": 9.37902942817805e-05, + "loss": 0.011895343661308289, + "num_input_tokens_seen": 45607160, + "step": 2785, + "train_runtime": 22632.5329, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.6884848484848485, + "grad_norm": 0.030377233400940895, + "learning_rate": 9.378565211261687e-05, + "loss": 0.013250184245407581, + "num_input_tokens_seen": 45623536, + "step": 2786, + "train_runtime": 22640.6471, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.689090909090909, + "grad_norm": 0.01889094151556492, + "learning_rate": 9.378100832390727e-05, + "loss": 0.014152915216982365, + "num_input_tokens_seen": 45639912, + "step": 2787, + "train_runtime": 22648.7656, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.6896969696969697, + "grad_norm": 0.008286499418318272, + "learning_rate": 9.377636291582339e-05, + "loss": 0.01310395635664463, + "num_input_tokens_seen": 45656288, + "step": 2788, + "train_runtime": 22656.882, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.6903030303030304, + "grad_norm": 0.01981155388057232, + "learning_rate": 9.377171588853712e-05, + "loss": 0.011624631471931934, + "num_input_tokens_seen": 45672664, + "step": 2789, + "train_runtime": 22664.9985, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.690909090909091, + "grad_norm": 0.007989699020981789, + "learning_rate": 9.376706724222031e-05, + "loss": 0.012425190769135952, + "num_input_tokens_seen": 45689040, + "step": 2790, + "train_runtime": 22673.1109, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.6915151515151514, + "grad_norm": 0.01827121339738369, + "learning_rate": 9.376241697704493e-05, + "loss": 0.012519946321845055, + "num_input_tokens_seen": 45705416, + "step": 2791, + "train_runtime": 22681.2318, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.6921212121212121, + "grad_norm": 0.032639652490615845, + "learning_rate": 9.375776509318296e-05, + "loss": 0.012380331754684448, + "num_input_tokens_seen": 45721792, + "step": 2792, + "train_runtime": 22689.3466, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.6927272727272729, + "grad_norm": 0.02035784162580967, + "learning_rate": 9.375311159080647e-05, + "loss": 0.011968818493187428, + "num_input_tokens_seen": 45738168, + "step": 2793, + "train_runtime": 22697.4604, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.6933333333333334, + "grad_norm": 0.025810981169342995, + "learning_rate": 9.374845647008758e-05, + "loss": 0.012797437608242035, + "num_input_tokens_seen": 45754544, + "step": 2794, + "train_runtime": 22705.5736, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.6939393939393939, + "grad_norm": 0.01363008376210928, + "learning_rate": 9.37437997311985e-05, + "loss": 0.012758086435496807, + "num_input_tokens_seen": 45770920, + "step": 2795, + "train_runtime": 22713.6878, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.6945454545454546, + "grad_norm": 0.013838708400726318, + "learning_rate": 9.373914137431146e-05, + "loss": 0.011740066111087799, + "num_input_tokens_seen": 45787296, + "step": 2796, + "train_runtime": 22721.8028, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.6951515151515153, + "grad_norm": 0.013491389341652393, + "learning_rate": 9.373448139959873e-05, + "loss": 0.011969367042183876, + "num_input_tokens_seen": 45803672, + "step": 2797, + "train_runtime": 22729.921, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.6957575757575758, + "grad_norm": 0.006902933586388826, + "learning_rate": 9.372981980723272e-05, + "loss": 0.01120357122272253, + "num_input_tokens_seen": 45820048, + "step": 2798, + "train_runtime": 22738.0404, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.6963636363636363, + "grad_norm": 0.01884661242365837, + "learning_rate": 9.372515659738583e-05, + "loss": 0.012923416681587696, + "num_input_tokens_seen": 45836424, + "step": 2799, + "train_runtime": 22746.156, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 0.009512597694993019, + "learning_rate": 9.372049177023055e-05, + "loss": 0.012344890274107456, + "num_input_tokens_seen": 45852800, + "step": 2800, + "train_runtime": 22754.2693, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.6975757575757577, + "grad_norm": 0.016322260722517967, + "learning_rate": 9.371582532593943e-05, + "loss": 0.013447512872517109, + "num_input_tokens_seen": 45869176, + "step": 2801, + "train_runtime": 22763.3025, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.6981818181818182, + "grad_norm": 0.01216021366417408, + "learning_rate": 9.371115726468505e-05, + "loss": 0.013008292764425278, + "num_input_tokens_seen": 45885552, + "step": 2802, + "train_runtime": 22771.4191, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.6987878787878787, + "grad_norm": 0.039111893624067307, + "learning_rate": 9.37064875866401e-05, + "loss": 0.014293879270553589, + "num_input_tokens_seen": 45901928, + "step": 2803, + "train_runtime": 22779.5344, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.6993939393939392, + "grad_norm": 0.0064725009724497795, + "learning_rate": 9.370181629197728e-05, + "loss": 0.011476884596049786, + "num_input_tokens_seen": 45918304, + "step": 2804, + "train_runtime": 22787.6455, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.7, + "grad_norm": 0.010711527429521084, + "learning_rate": 9.369714338086939e-05, + "loss": 0.012465567328035831, + "num_input_tokens_seen": 45934680, + "step": 2805, + "train_runtime": 22795.7584, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.7006060606060607, + "grad_norm": 0.006154636852443218, + "learning_rate": 9.369246885348926e-05, + "loss": 0.012477940879762173, + "num_input_tokens_seen": 45951056, + "step": 2806, + "train_runtime": 22803.872, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.7012121212121212, + "grad_norm": 0.008175252936780453, + "learning_rate": 9.368779271000978e-05, + "loss": 0.011836701072752476, + "num_input_tokens_seen": 45967432, + "step": 2807, + "train_runtime": 22811.9839, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.7018181818181817, + "grad_norm": 0.00616717291995883, + "learning_rate": 9.368311495060393e-05, + "loss": 0.010793658904731274, + "num_input_tokens_seen": 45983808, + "step": 2808, + "train_runtime": 22820.0984, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.7024242424242424, + "grad_norm": 0.03607097640633583, + "learning_rate": 9.367843557544474e-05, + "loss": 0.014611356891691685, + "num_input_tokens_seen": 46000184, + "step": 2809, + "train_runtime": 22828.2107, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.7030303030303031, + "grad_norm": 0.018071550875902176, + "learning_rate": 9.367375458470526e-05, + "loss": 0.01233680546283722, + "num_input_tokens_seen": 46016560, + "step": 2810, + "train_runtime": 22836.3246, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.7036363636363636, + "grad_norm": 0.00596799748018384, + "learning_rate": 9.366907197855868e-05, + "loss": 0.011606480926275253, + "num_input_tokens_seen": 46032936, + "step": 2811, + "train_runtime": 22844.4382, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.7042424242424241, + "grad_norm": 0.007533122319728136, + "learning_rate": 9.366438775717814e-05, + "loss": 0.012357478961348534, + "num_input_tokens_seen": 46049312, + "step": 2812, + "train_runtime": 22852.5509, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.7048484848484848, + "grad_norm": 0.01864629052579403, + "learning_rate": 9.365970192073694e-05, + "loss": 0.013025326654314995, + "num_input_tokens_seen": 46065688, + "step": 2813, + "train_runtime": 22860.6639, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.7054545454545456, + "grad_norm": 0.008015927858650684, + "learning_rate": 9.365501446940839e-05, + "loss": 0.01284027099609375, + "num_input_tokens_seen": 46082064, + "step": 2814, + "train_runtime": 22868.7759, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.706060606060606, + "grad_norm": 0.010607431642711163, + "learning_rate": 9.365032540336587e-05, + "loss": 0.012594718486070633, + "num_input_tokens_seen": 46098440, + "step": 2815, + "train_runtime": 22876.8894, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 0.007729121949523687, + "learning_rate": 9.36456347227828e-05, + "loss": 0.013601492159068584, + "num_input_tokens_seen": 46114816, + "step": 2816, + "train_runtime": 22885.0026, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.7072727272727273, + "grad_norm": 0.010761508718132973, + "learning_rate": 9.364094242783272e-05, + "loss": 0.011926738545298576, + "num_input_tokens_seen": 46131192, + "step": 2817, + "train_runtime": 22893.1206, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.707878787878788, + "grad_norm": 0.014684220775961876, + "learning_rate": 9.363624851868916e-05, + "loss": 0.013269368559122086, + "num_input_tokens_seen": 46147568, + "step": 2818, + "train_runtime": 22901.2337, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.7084848484848485, + "grad_norm": 0.008672056719660759, + "learning_rate": 9.363155299552573e-05, + "loss": 0.012723954394459724, + "num_input_tokens_seen": 46163944, + "step": 2819, + "train_runtime": 22909.3467, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.709090909090909, + "grad_norm": 0.007565699517726898, + "learning_rate": 9.362685585851614e-05, + "loss": 0.011728525161743164, + "num_input_tokens_seen": 46180320, + "step": 2820, + "train_runtime": 22917.4588, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.7096969696969697, + "grad_norm": 0.013421568088233471, + "learning_rate": 9.362215710783411e-05, + "loss": 0.010953246615827084, + "num_input_tokens_seen": 46196696, + "step": 2821, + "train_runtime": 22925.5745, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.7103030303030304, + "grad_norm": 0.014821798540651798, + "learning_rate": 9.361745674365345e-05, + "loss": 0.014481105841696262, + "num_input_tokens_seen": 46213072, + "step": 2822, + "train_runtime": 22933.6913, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.710909090909091, + "grad_norm": 0.01066959835588932, + "learning_rate": 9.361275476614798e-05, + "loss": 0.012770322151482105, + "num_input_tokens_seen": 46229448, + "step": 2823, + "train_runtime": 22941.8058, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.7115151515151514, + "grad_norm": 0.02900104783475399, + "learning_rate": 9.360805117549165e-05, + "loss": 0.013311613351106644, + "num_input_tokens_seen": 46245824, + "step": 2824, + "train_runtime": 22949.9216, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.7121212121212122, + "grad_norm": 0.00498326076194644, + "learning_rate": 9.360334597185845e-05, + "loss": 0.011704309843480587, + "num_input_tokens_seen": 46262200, + "step": 2825, + "train_runtime": 22958.0391, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.7127272727272729, + "grad_norm": 0.005870001390576363, + "learning_rate": 9.359863915542238e-05, + "loss": 0.010895316489040852, + "num_input_tokens_seen": 46278576, + "step": 2826, + "train_runtime": 22966.1548, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.7133333333333334, + "grad_norm": 0.008924623019993305, + "learning_rate": 9.359393072635755e-05, + "loss": 0.011977889575064182, + "num_input_tokens_seen": 46294952, + "step": 2827, + "train_runtime": 22974.268, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.7139393939393939, + "grad_norm": 0.006065470166504383, + "learning_rate": 9.358922068483812e-05, + "loss": 0.011861737817525864, + "num_input_tokens_seen": 46311328, + "step": 2828, + "train_runtime": 22982.3833, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.7145454545454546, + "grad_norm": 0.005640857852995396, + "learning_rate": 9.35845090310383e-05, + "loss": 0.011305807158350945, + "num_input_tokens_seen": 46327704, + "step": 2829, + "train_runtime": 22990.5002, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.7151515151515153, + "grad_norm": 0.023728247731924057, + "learning_rate": 9.357979576513238e-05, + "loss": 0.01281740888953209, + "num_input_tokens_seen": 46344080, + "step": 2830, + "train_runtime": 22998.6172, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.7157575757575758, + "grad_norm": 0.006878760643303394, + "learning_rate": 9.357508088729468e-05, + "loss": 0.011280113831162453, + "num_input_tokens_seen": 46360456, + "step": 2831, + "train_runtime": 23006.735, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.7163636363636363, + "grad_norm": 0.012969830073416233, + "learning_rate": 9.35703643976996e-05, + "loss": 0.013124781660735607, + "num_input_tokens_seen": 46376832, + "step": 2832, + "train_runtime": 23014.8512, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.7169696969696968, + "grad_norm": 0.00749437790364027, + "learning_rate": 9.356564629652158e-05, + "loss": 0.011703899130225182, + "num_input_tokens_seen": 46393208, + "step": 2833, + "train_runtime": 23022.9653, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.7175757575757575, + "grad_norm": 0.015355957671999931, + "learning_rate": 9.356092658393514e-05, + "loss": 0.011799749918282032, + "num_input_tokens_seen": 46409584, + "step": 2834, + "train_runtime": 23031.0777, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.7181818181818183, + "grad_norm": 0.006823359522968531, + "learning_rate": 9.355620526011486e-05, + "loss": 0.01280257198959589, + "num_input_tokens_seen": 46425960, + "step": 2835, + "train_runtime": 23039.1904, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.7187878787878788, + "grad_norm": 0.009748456999659538, + "learning_rate": 9.355148232523537e-05, + "loss": 0.013985298573970795, + "num_input_tokens_seen": 46442336, + "step": 2836, + "train_runtime": 23047.3039, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.7193939393939393, + "grad_norm": 0.011919494718313217, + "learning_rate": 9.354675777947138e-05, + "loss": 0.012215827591717243, + "num_input_tokens_seen": 46458712, + "step": 2837, + "train_runtime": 23055.4193, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.72, + "grad_norm": 0.00848611444234848, + "learning_rate": 9.354203162299759e-05, + "loss": 0.012414870783686638, + "num_input_tokens_seen": 46475088, + "step": 2838, + "train_runtime": 23063.5341, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.7206060606060607, + "grad_norm": 0.008148525841534138, + "learning_rate": 9.353730385598887e-05, + "loss": 0.013450459577143192, + "num_input_tokens_seen": 46491464, + "step": 2839, + "train_runtime": 23071.6495, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.7212121212121212, + "grad_norm": 0.006279889028519392, + "learning_rate": 9.353257447862005e-05, + "loss": 0.011724742129445076, + "num_input_tokens_seen": 46507840, + "step": 2840, + "train_runtime": 23079.7611, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.7218181818181817, + "grad_norm": 0.008062539622187614, + "learning_rate": 9.352784349106608e-05, + "loss": 0.01200947817414999, + "num_input_tokens_seen": 46524216, + "step": 2841, + "train_runtime": 23087.8723, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.7224242424242424, + "grad_norm": 0.020648252218961716, + "learning_rate": 9.352311089350195e-05, + "loss": 0.014192345552146435, + "num_input_tokens_seen": 46540592, + "step": 2842, + "train_runtime": 23095.987, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.7230303030303031, + "grad_norm": 0.007957853376865387, + "learning_rate": 9.35183766861027e-05, + "loss": 0.012997648678719997, + "num_input_tokens_seen": 46556968, + "step": 2843, + "train_runtime": 23104.1043, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.7236363636363636, + "grad_norm": 0.011467264033854008, + "learning_rate": 9.351364086904345e-05, + "loss": 0.012560134753584862, + "num_input_tokens_seen": 46573344, + "step": 2844, + "train_runtime": 23112.2178, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.7242424242424241, + "grad_norm": 0.008995643816888332, + "learning_rate": 9.350890344249936e-05, + "loss": 0.012183014303445816, + "num_input_tokens_seen": 46589720, + "step": 2845, + "train_runtime": 23120.3346, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.7248484848484849, + "grad_norm": 0.010562472976744175, + "learning_rate": 9.350416440664566e-05, + "loss": 0.01177982147783041, + "num_input_tokens_seen": 46606096, + "step": 2846, + "train_runtime": 23128.4506, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.7254545454545456, + "grad_norm": 0.009001946076750755, + "learning_rate": 9.349942376165766e-05, + "loss": 0.012112541124224663, + "num_input_tokens_seen": 46622472, + "step": 2847, + "train_runtime": 23136.5642, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.726060606060606, + "grad_norm": 0.009399345144629478, + "learning_rate": 9.349468150771065e-05, + "loss": 0.012279201298952103, + "num_input_tokens_seen": 46638848, + "step": 2848, + "train_runtime": 23144.6777, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.7266666666666666, + "grad_norm": 0.010582569986581802, + "learning_rate": 9.34899376449801e-05, + "loss": 0.012267028912901878, + "num_input_tokens_seen": 46655224, + "step": 2849, + "train_runtime": 23152.7895, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 0.015244287438690662, + "learning_rate": 9.348519217364145e-05, + "loss": 0.013822423294186592, + "num_input_tokens_seen": 46671600, + "step": 2850, + "train_runtime": 23160.9084, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.727878787878788, + "grad_norm": 0.006491030566394329, + "learning_rate": 9.34804450938702e-05, + "loss": 0.012164949439466, + "num_input_tokens_seen": 46687976, + "step": 2851, + "train_runtime": 23169.0229, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.7284848484848485, + "grad_norm": 0.0091371675953269, + "learning_rate": 9.347569640584198e-05, + "loss": 0.012326296418905258, + "num_input_tokens_seen": 46704352, + "step": 2852, + "train_runtime": 23177.1375, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.729090909090909, + "grad_norm": 0.007827811874449253, + "learning_rate": 9.347094610973241e-05, + "loss": 0.010904887691140175, + "num_input_tokens_seen": 46720728, + "step": 2853, + "train_runtime": 23185.2512, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.7296969696969697, + "grad_norm": 0.01404674630612135, + "learning_rate": 9.346619420571721e-05, + "loss": 0.012772872112691402, + "num_input_tokens_seen": 46737104, + "step": 2854, + "train_runtime": 23193.3674, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.7303030303030305, + "grad_norm": 0.008301572874188423, + "learning_rate": 9.346144069397211e-05, + "loss": 0.011670062318444252, + "num_input_tokens_seen": 46753480, + "step": 2855, + "train_runtime": 23201.4816, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.730909090909091, + "grad_norm": 0.06437455862760544, + "learning_rate": 9.345668557467298e-05, + "loss": 0.014140639454126358, + "num_input_tokens_seen": 46769856, + "step": 2856, + "train_runtime": 23209.596, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.7315151515151515, + "grad_norm": 0.014902097173035145, + "learning_rate": 9.345192884799567e-05, + "loss": 0.014665037393569946, + "num_input_tokens_seen": 46786232, + "step": 2857, + "train_runtime": 23217.7111, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.7321212121212122, + "grad_norm": 0.014375180006027222, + "learning_rate": 9.344717051411612e-05, + "loss": 0.01194059569388628, + "num_input_tokens_seen": 46802608, + "step": 2858, + "train_runtime": 23225.8241, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.732727272727273, + "grad_norm": 0.009062650613486767, + "learning_rate": 9.344241057321035e-05, + "loss": 0.01136862114071846, + "num_input_tokens_seen": 46818984, + "step": 2859, + "train_runtime": 23233.938, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.010465494357049465, + "learning_rate": 9.343764902545443e-05, + "loss": 0.011493357829749584, + "num_input_tokens_seen": 46835360, + "step": 2860, + "train_runtime": 23242.0502, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.733939393939394, + "grad_norm": 0.009279366582632065, + "learning_rate": 9.343288587102443e-05, + "loss": 0.011399534530937672, + "num_input_tokens_seen": 46851736, + "step": 2861, + "train_runtime": 23250.1646, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.7345454545454544, + "grad_norm": 0.007747293449938297, + "learning_rate": 9.342812111009658e-05, + "loss": 0.012958042323589325, + "num_input_tokens_seen": 46868112, + "step": 2862, + "train_runtime": 23258.284, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.7351515151515151, + "grad_norm": 0.008354146964848042, + "learning_rate": 9.342335474284711e-05, + "loss": 0.012031560763716698, + "num_input_tokens_seen": 46884488, + "step": 2863, + "train_runtime": 23266.3974, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.7357575757575758, + "grad_norm": 0.00816110149025917, + "learning_rate": 9.34185867694523e-05, + "loss": 0.01127876527607441, + "num_input_tokens_seen": 46900864, + "step": 2864, + "train_runtime": 23274.5083, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.7363636363636363, + "grad_norm": 0.007726035080850124, + "learning_rate": 9.341381719008853e-05, + "loss": 0.013300550170242786, + "num_input_tokens_seen": 46917240, + "step": 2865, + "train_runtime": 23282.6218, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.7369696969696968, + "grad_norm": 0.008016044273972511, + "learning_rate": 9.34090460049322e-05, + "loss": 0.011594901792705059, + "num_input_tokens_seen": 46933616, + "step": 2866, + "train_runtime": 23290.7358, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.7375757575757576, + "grad_norm": 0.013905017636716366, + "learning_rate": 9.340427321415978e-05, + "loss": 0.013001223094761372, + "num_input_tokens_seen": 46949992, + "step": 2867, + "train_runtime": 23298.8518, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.7381818181818183, + "grad_norm": 0.01734462007880211, + "learning_rate": 9.339949881794785e-05, + "loss": 0.013187142089009285, + "num_input_tokens_seen": 46966368, + "step": 2868, + "train_runtime": 23306.9631, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.7387878787878788, + "grad_norm": 0.012361729517579079, + "learning_rate": 9.339472281647294e-05, + "loss": 0.0126962810754776, + "num_input_tokens_seen": 46982744, + "step": 2869, + "train_runtime": 23315.0772, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.7393939393939393, + "grad_norm": 0.009342917241156101, + "learning_rate": 9.338994520991177e-05, + "loss": 0.012031439691781998, + "num_input_tokens_seen": 46999120, + "step": 2870, + "train_runtime": 23323.1954, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.74, + "grad_norm": 0.00756025267764926, + "learning_rate": 9.338516599844101e-05, + "loss": 0.011724259704351425, + "num_input_tokens_seen": 47015496, + "step": 2871, + "train_runtime": 23331.311, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.7406060606060607, + "grad_norm": 0.004233363550156355, + "learning_rate": 9.338038518223747e-05, + "loss": 0.011528807692229748, + "num_input_tokens_seen": 47031872, + "step": 2872, + "train_runtime": 23339.4332, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.7412121212121212, + "grad_norm": 0.0077259112149477005, + "learning_rate": 9.337560276147793e-05, + "loss": 0.011708910576999187, + "num_input_tokens_seen": 47048248, + "step": 2873, + "train_runtime": 23347.5464, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.7418181818181817, + "grad_norm": 0.011294611729681492, + "learning_rate": 9.337081873633934e-05, + "loss": 0.012407934293150902, + "num_input_tokens_seen": 47064624, + "step": 2874, + "train_runtime": 23355.6638, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.007208488415926695, + "learning_rate": 9.33660331069986e-05, + "loss": 0.012143628671765327, + "num_input_tokens_seen": 47081000, + "step": 2875, + "train_runtime": 23363.7804, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.7430303030303032, + "grad_norm": 0.008625108748674393, + "learning_rate": 9.336124587363278e-05, + "loss": 0.012219018302857876, + "num_input_tokens_seen": 47097376, + "step": 2876, + "train_runtime": 23371.9001, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.7436363636363637, + "grad_norm": 0.003514588577672839, + "learning_rate": 9.335645703641889e-05, + "loss": 0.011641733348369598, + "num_input_tokens_seen": 47113752, + "step": 2877, + "train_runtime": 23380.0173, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.7442424242424241, + "grad_norm": 0.00892610289156437, + "learning_rate": 9.33516665955341e-05, + "loss": 0.013232930563390255, + "num_input_tokens_seen": 47130128, + "step": 2878, + "train_runtime": 23388.1336, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.7448484848484849, + "grad_norm": 0.015292099677026272, + "learning_rate": 9.334687455115559e-05, + "loss": 0.012574004009366035, + "num_input_tokens_seen": 47146504, + "step": 2879, + "train_runtime": 23396.2437, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.7454545454545456, + "grad_norm": 0.007786950096487999, + "learning_rate": 9.334208090346058e-05, + "loss": 0.013052877970039845, + "num_input_tokens_seen": 47162880, + "step": 2880, + "train_runtime": 23404.3577, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.746060606060606, + "grad_norm": 0.008695406839251518, + "learning_rate": 9.333728565262642e-05, + "loss": 0.011388403363525867, + "num_input_tokens_seen": 47179256, + "step": 2881, + "train_runtime": 23412.4708, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.7466666666666666, + "grad_norm": 0.01445862464606762, + "learning_rate": 9.333248879883045e-05, + "loss": 0.01251003984361887, + "num_input_tokens_seen": 47195632, + "step": 2882, + "train_runtime": 23420.587, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.7472727272727273, + "grad_norm": 0.00993890967220068, + "learning_rate": 9.332769034225012e-05, + "loss": 0.013060958124697208, + "num_input_tokens_seen": 47212008, + "step": 2883, + "train_runtime": 23428.6997, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.747878787878788, + "grad_norm": 0.007343083154410124, + "learning_rate": 9.332289028306289e-05, + "loss": 0.011927951127290726, + "num_input_tokens_seen": 47228384, + "step": 2884, + "train_runtime": 23436.8135, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.7484848484848485, + "grad_norm": 0.012586996890604496, + "learning_rate": 9.331808862144633e-05, + "loss": 0.014243541285395622, + "num_input_tokens_seen": 47244760, + "step": 2885, + "train_runtime": 23444.9342, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.749090909090909, + "grad_norm": 0.007525481283664703, + "learning_rate": 9.331328535757801e-05, + "loss": 0.012809276580810547, + "num_input_tokens_seen": 47261136, + "step": 2886, + "train_runtime": 23453.0478, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.7496969696969698, + "grad_norm": 0.008755379356443882, + "learning_rate": 9.330848049163562e-05, + "loss": 0.012331864796578884, + "num_input_tokens_seen": 47277512, + "step": 2887, + "train_runtime": 23461.1604, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.7503030303030302, + "grad_norm": 0.007854153402149677, + "learning_rate": 9.33036740237969e-05, + "loss": 0.011172168888151646, + "num_input_tokens_seen": 47293888, + "step": 2888, + "train_runtime": 23469.2714, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.750909090909091, + "grad_norm": 0.007569638080894947, + "learning_rate": 9.329886595423958e-05, + "loss": 0.011403515934944153, + "num_input_tokens_seen": 47310264, + "step": 2889, + "train_runtime": 23477.3853, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.7515151515151515, + "grad_norm": 0.006328298710286617, + "learning_rate": 9.329405628314152e-05, + "loss": 0.011399973183870316, + "num_input_tokens_seen": 47326640, + "step": 2890, + "train_runtime": 23485.5008, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.752121212121212, + "grad_norm": 0.00960509479045868, + "learning_rate": 9.328924501068066e-05, + "loss": 0.013432014733552933, + "num_input_tokens_seen": 47343016, + "step": 2891, + "train_runtime": 23493.6174, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.7527272727272727, + "grad_norm": 0.006631573662161827, + "learning_rate": 9.32844321370349e-05, + "loss": 0.012809229083359241, + "num_input_tokens_seen": 47359392, + "step": 2892, + "train_runtime": 23501.7329, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.7533333333333334, + "grad_norm": 0.008058437146246433, + "learning_rate": 9.327961766238231e-05, + "loss": 0.011357891373336315, + "num_input_tokens_seen": 47375768, + "step": 2893, + "train_runtime": 23509.8459, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.753939393939394, + "grad_norm": 0.004802866373211145, + "learning_rate": 9.327480158690094e-05, + "loss": 0.011158658191561699, + "num_input_tokens_seen": 47392144, + "step": 2894, + "train_runtime": 23517.9586, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.7545454545454544, + "grad_norm": 0.018760213628411293, + "learning_rate": 9.326998391076893e-05, + "loss": 0.01469513401389122, + "num_input_tokens_seen": 47408520, + "step": 2895, + "train_runtime": 23526.0771, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.7551515151515151, + "grad_norm": 0.011593660339713097, + "learning_rate": 9.326516463416448e-05, + "loss": 0.012592260725796223, + "num_input_tokens_seen": 47424896, + "step": 2896, + "train_runtime": 23534.1892, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.7557575757575759, + "grad_norm": 0.00899164192378521, + "learning_rate": 9.326034375726586e-05, + "loss": 0.012454191222786903, + "num_input_tokens_seen": 47441272, + "step": 2897, + "train_runtime": 23542.3037, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.7563636363636363, + "grad_norm": 0.007111812941730022, + "learning_rate": 9.325552128025135e-05, + "loss": 0.012857899069786072, + "num_input_tokens_seen": 47457648, + "step": 2898, + "train_runtime": 23550.4184, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.7569696969696968, + "grad_norm": 0.006486005615442991, + "learning_rate": 9.325069720329936e-05, + "loss": 0.011890999972820282, + "num_input_tokens_seen": 47474024, + "step": 2899, + "train_runtime": 23558.5372, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.7575757575757576, + "grad_norm": 0.004689642693847418, + "learning_rate": 9.324587152658828e-05, + "loss": 0.013131581246852875, + "num_input_tokens_seen": 47490400, + "step": 2900, + "train_runtime": 23566.6511, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 1.7581818181818183, + "grad_norm": 0.008016454055905342, + "learning_rate": 9.324104425029665e-05, + "loss": 0.01247711107134819, + "num_input_tokens_seen": 47506776, + "step": 2901, + "train_runtime": 23575.6961, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.7587878787878788, + "grad_norm": 0.008500835858285427, + "learning_rate": 9.323621537460301e-05, + "loss": 0.011386177502572536, + "num_input_tokens_seen": 47523152, + "step": 2902, + "train_runtime": 23583.8132, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.7593939393939393, + "grad_norm": 0.006690427660942078, + "learning_rate": 9.323138489968595e-05, + "loss": 0.012762854807078838, + "num_input_tokens_seen": 47539528, + "step": 2903, + "train_runtime": 23591.933, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.76, + "grad_norm": 0.016449101269245148, + "learning_rate": 9.322655282572414e-05, + "loss": 0.0136633962392807, + "num_input_tokens_seen": 47555904, + "step": 2904, + "train_runtime": 23600.0479, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.7606060606060607, + "grad_norm": 0.00727116409689188, + "learning_rate": 9.322171915289635e-05, + "loss": 0.011916939169168472, + "num_input_tokens_seen": 47572280, + "step": 2905, + "train_runtime": 23608.163, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.7612121212121212, + "grad_norm": 0.008331136777997017, + "learning_rate": 9.321688388138132e-05, + "loss": 0.011783263646066189, + "num_input_tokens_seen": 47588656, + "step": 2906, + "train_runtime": 23616.2789, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.7618181818181817, + "grad_norm": 0.020789558067917824, + "learning_rate": 9.32120470113579e-05, + "loss": 0.013218401931226254, + "num_input_tokens_seen": 47605032, + "step": 2907, + "train_runtime": 23624.391, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.7624242424242424, + "grad_norm": 0.00496008712798357, + "learning_rate": 9.320720854300504e-05, + "loss": 0.012365386821329594, + "num_input_tokens_seen": 47621408, + "step": 2908, + "train_runtime": 23632.5039, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.7630303030303032, + "grad_norm": 0.006191062740981579, + "learning_rate": 9.320236847650168e-05, + "loss": 0.012062267400324345, + "num_input_tokens_seen": 47637784, + "step": 2909, + "train_runtime": 23640.6186, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.7636363636363637, + "grad_norm": 0.006267681252211332, + "learning_rate": 9.319752681202683e-05, + "loss": 0.01223327498883009, + "num_input_tokens_seen": 47654160, + "step": 2910, + "train_runtime": 23648.7416, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.7642424242424242, + "grad_norm": 0.01227263081818819, + "learning_rate": 9.319268354975959e-05, + "loss": 0.013540641404688358, + "num_input_tokens_seen": 47670536, + "step": 2911, + "train_runtime": 23656.8565, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.7648484848484849, + "grad_norm": 0.005225658882409334, + "learning_rate": 9.31878386898791e-05, + "loss": 0.01202879287302494, + "num_input_tokens_seen": 47686912, + "step": 2912, + "train_runtime": 23664.972, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.7654545454545456, + "grad_norm": 0.02399802766740322, + "learning_rate": 9.318299223256456e-05, + "loss": 0.012255542911589146, + "num_input_tokens_seen": 47703288, + "step": 2913, + "train_runtime": 23673.0935, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.766060606060606, + "grad_norm": 0.014451933093369007, + "learning_rate": 9.317814417799523e-05, + "loss": 0.013469447381794453, + "num_input_tokens_seen": 47719664, + "step": 2914, + "train_runtime": 23681.2101, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.7666666666666666, + "grad_norm": 0.009129231795668602, + "learning_rate": 9.317329452635044e-05, + "loss": 0.011540930718183517, + "num_input_tokens_seen": 47736040, + "step": 2915, + "train_runtime": 23689.3237, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.767272727272727, + "grad_norm": 0.008276116102933884, + "learning_rate": 9.316844327780955e-05, + "loss": 0.012826155871152878, + "num_input_tokens_seen": 47752416, + "step": 2916, + "train_runtime": 23697.4395, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.7678787878787878, + "grad_norm": 0.01835167407989502, + "learning_rate": 9.316359043255201e-05, + "loss": 0.013342682272195816, + "num_input_tokens_seen": 47768792, + "step": 2917, + "train_runtime": 23705.5611, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.7684848484848485, + "grad_norm": 0.009004290215671062, + "learning_rate": 9.315873599075733e-05, + "loss": 0.01241071242839098, + "num_input_tokens_seen": 47785168, + "step": 2918, + "train_runtime": 23713.6782, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.769090909090909, + "grad_norm": 0.018462710082530975, + "learning_rate": 9.315387995260505e-05, + "loss": 0.011892465874552727, + "num_input_tokens_seen": 47801544, + "step": 2919, + "train_runtime": 23721.7922, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.7696969696969695, + "grad_norm": 0.010308179073035717, + "learning_rate": 9.314902231827478e-05, + "loss": 0.012521905824542046, + "num_input_tokens_seen": 47817920, + "step": 2920, + "train_runtime": 23729.9118, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.7703030303030303, + "grad_norm": 0.005858840420842171, + "learning_rate": 9.314416308794621e-05, + "loss": 0.0120368218049407, + "num_input_tokens_seen": 47834296, + "step": 2921, + "train_runtime": 23738.0341, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.770909090909091, + "grad_norm": 0.025842219591140747, + "learning_rate": 9.313930226179908e-05, + "loss": 0.012849163264036179, + "num_input_tokens_seen": 47850672, + "step": 2922, + "train_runtime": 23746.1488, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.7715151515151515, + "grad_norm": 0.006477030925452709, + "learning_rate": 9.313443984001315e-05, + "loss": 0.011147328652441502, + "num_input_tokens_seen": 47867048, + "step": 2923, + "train_runtime": 23754.2656, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.772121212121212, + "grad_norm": 0.00806950218975544, + "learning_rate": 9.312957582276829e-05, + "loss": 0.012119542807340622, + "num_input_tokens_seen": 47883424, + "step": 2924, + "train_runtime": 23762.385, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.7727272727272727, + "grad_norm": 0.010666623711585999, + "learning_rate": 9.312471021024443e-05, + "loss": 0.013124541379511356, + "num_input_tokens_seen": 47899800, + "step": 2925, + "train_runtime": 23770.5067, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.7733333333333334, + "grad_norm": 0.008134602569043636, + "learning_rate": 9.31198430026215e-05, + "loss": 0.012375653721392155, + "num_input_tokens_seen": 47916176, + "step": 2926, + "train_runtime": 23778.6358, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.773939393939394, + "grad_norm": 0.007719589862972498, + "learning_rate": 9.311497420007955e-05, + "loss": 0.011732139624655247, + "num_input_tokens_seen": 47932552, + "step": 2927, + "train_runtime": 23786.7556, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.7745454545454544, + "grad_norm": 0.01229825522750616, + "learning_rate": 9.311010380279868e-05, + "loss": 0.01175294816493988, + "num_input_tokens_seen": 47948928, + "step": 2928, + "train_runtime": 23794.8768, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.7751515151515151, + "grad_norm": 0.0076052104122936726, + "learning_rate": 9.310523181095903e-05, + "loss": 0.012578996829688549, + "num_input_tokens_seen": 47965304, + "step": 2929, + "train_runtime": 23802.9912, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.7757575757575759, + "grad_norm": 0.008468554355204105, + "learning_rate": 9.310035822474076e-05, + "loss": 0.011648007668554783, + "num_input_tokens_seen": 47981680, + "step": 2930, + "train_runtime": 23811.1077, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.7763636363636364, + "grad_norm": 0.010106262750923634, + "learning_rate": 9.309548304432421e-05, + "loss": 0.012974138371646404, + "num_input_tokens_seen": 47998056, + "step": 2931, + "train_runtime": 23819.2234, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.7769696969696969, + "grad_norm": 0.008024870418012142, + "learning_rate": 9.309060626988966e-05, + "loss": 0.0120691554620862, + "num_input_tokens_seen": 48014432, + "step": 2932, + "train_runtime": 23827.3433, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.7775757575757576, + "grad_norm": 0.006827709265053272, + "learning_rate": 9.30857279016175e-05, + "loss": 0.012827214784920216, + "num_input_tokens_seen": 48030808, + "step": 2933, + "train_runtime": 23835.4651, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.7781818181818183, + "grad_norm": 0.005936678033322096, + "learning_rate": 9.308084793968816e-05, + "loss": 0.011023994535207748, + "num_input_tokens_seen": 48047184, + "step": 2934, + "train_runtime": 23843.5821, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.7787878787878788, + "grad_norm": 0.1119927316904068, + "learning_rate": 9.307596638428217e-05, + "loss": 0.011640896089375019, + "num_input_tokens_seen": 48063560, + "step": 2935, + "train_runtime": 23851.6986, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.7793939393939393, + "grad_norm": 0.012935559265315533, + "learning_rate": 9.307108323558005e-05, + "loss": 0.01166920829564333, + "num_input_tokens_seen": 48079936, + "step": 2936, + "train_runtime": 23859.8191, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.78, + "grad_norm": 0.016682934015989304, + "learning_rate": 9.306619849376245e-05, + "loss": 0.012249463237822056, + "num_input_tokens_seen": 48096312, + "step": 2937, + "train_runtime": 23867.9395, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.7806060606060607, + "grad_norm": 0.01278962567448616, + "learning_rate": 9.306131215901003e-05, + "loss": 0.012957882136106491, + "num_input_tokens_seen": 48112688, + "step": 2938, + "train_runtime": 23876.0537, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.7812121212121212, + "grad_norm": 0.009745283052325249, + "learning_rate": 9.305642423150353e-05, + "loss": 0.014513827860355377, + "num_input_tokens_seen": 48129064, + "step": 2939, + "train_runtime": 23884.1711, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.7818181818181817, + "grad_norm": 0.010113997384905815, + "learning_rate": 9.305153471142377e-05, + "loss": 0.0118255615234375, + "num_input_tokens_seen": 48145440, + "step": 2940, + "train_runtime": 23892.2873, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.7824242424242425, + "grad_norm": 0.009374349378049374, + "learning_rate": 9.304664359895155e-05, + "loss": 0.012293674983084202, + "num_input_tokens_seen": 48161816, + "step": 2941, + "train_runtime": 23900.4033, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.7830303030303032, + "grad_norm": 0.021635450422763824, + "learning_rate": 9.30417508942678e-05, + "loss": 0.012411084957420826, + "num_input_tokens_seen": 48178192, + "step": 2942, + "train_runtime": 23908.5186, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.7836363636363637, + "grad_norm": 0.005758213810622692, + "learning_rate": 9.303685659755354e-05, + "loss": 0.011029191315174103, + "num_input_tokens_seen": 48194568, + "step": 2943, + "train_runtime": 23916.6336, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.7842424242424242, + "grad_norm": 0.007983488030731678, + "learning_rate": 9.303196070898975e-05, + "loss": 0.012368512339890003, + "num_input_tokens_seen": 48210944, + "step": 2944, + "train_runtime": 23924.7466, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.7848484848484847, + "grad_norm": 0.011937152594327927, + "learning_rate": 9.302706322875753e-05, + "loss": 0.011813906021416187, + "num_input_tokens_seen": 48227320, + "step": 2945, + "train_runtime": 23932.8634, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.7854545454545454, + "grad_norm": 0.01328189205378294, + "learning_rate": 9.302216415703805e-05, + "loss": 0.01319885067641735, + "num_input_tokens_seen": 48243696, + "step": 2946, + "train_runtime": 23940.9812, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.7860606060606061, + "grad_norm": 0.007090682163834572, + "learning_rate": 9.301726349401249e-05, + "loss": 0.01120240893214941, + "num_input_tokens_seen": 48260072, + "step": 2947, + "train_runtime": 23949.1004, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.7866666666666666, + "grad_norm": 0.012933997437357903, + "learning_rate": 9.301236123986212e-05, + "loss": 0.01253314595669508, + "num_input_tokens_seen": 48276448, + "step": 2948, + "train_runtime": 23957.2143, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.7872727272727271, + "grad_norm": 0.015164041891694069, + "learning_rate": 9.300745739476829e-05, + "loss": 0.012761669233441353, + "num_input_tokens_seen": 48292824, + "step": 2949, + "train_runtime": 23965.3347, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.7878787878787878, + "grad_norm": 0.009341921657323837, + "learning_rate": 9.300255195891233e-05, + "loss": 0.013164439238607883, + "num_input_tokens_seen": 48309200, + "step": 2950, + "train_runtime": 23973.4578, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.7884848484848486, + "grad_norm": 0.010138064622879028, + "learning_rate": 9.299764493247574e-05, + "loss": 0.013836441561579704, + "num_input_tokens_seen": 48325576, + "step": 2951, + "train_runtime": 23981.5762, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.789090909090909, + "grad_norm": 0.009686525911092758, + "learning_rate": 9.299273631563998e-05, + "loss": 0.011630890890955925, + "num_input_tokens_seen": 48341952, + "step": 2952, + "train_runtime": 23989.6919, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.7896969696969696, + "grad_norm": 0.017342818900942802, + "learning_rate": 9.298782610858664e-05, + "loss": 0.013579259626567364, + "num_input_tokens_seen": 48358328, + "step": 2953, + "train_runtime": 23997.8112, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.7903030303030303, + "grad_norm": 0.009860479272902012, + "learning_rate": 9.298291431149733e-05, + "loss": 0.013241427019238472, + "num_input_tokens_seen": 48374704, + "step": 2954, + "train_runtime": 24005.9337, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.790909090909091, + "grad_norm": 0.007178580854088068, + "learning_rate": 9.297800092455373e-05, + "loss": 0.011353488080203533, + "num_input_tokens_seen": 48391080, + "step": 2955, + "train_runtime": 24014.0491, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.7915151515151515, + "grad_norm": 0.007200692314654589, + "learning_rate": 9.297308594793756e-05, + "loss": 0.012314318679273129, + "num_input_tokens_seen": 48407456, + "step": 2956, + "train_runtime": 24022.1626, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.792121212121212, + "grad_norm": 0.006994608324021101, + "learning_rate": 9.296816938183063e-05, + "loss": 0.011929539032280445, + "num_input_tokens_seen": 48423832, + "step": 2957, + "train_runtime": 24030.2772, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.7927272727272727, + "grad_norm": 0.01123084407299757, + "learning_rate": 9.29632512264148e-05, + "loss": 0.01273421198129654, + "num_input_tokens_seen": 48440208, + "step": 2958, + "train_runtime": 24038.3949, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.7933333333333334, + "grad_norm": 0.00830906443297863, + "learning_rate": 9.295833148187197e-05, + "loss": 0.012322529219090939, + "num_input_tokens_seen": 48456584, + "step": 2959, + "train_runtime": 24046.5123, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.793939393939394, + "grad_norm": 0.010710365138947964, + "learning_rate": 9.295341014838412e-05, + "loss": 0.011278321035206318, + "num_input_tokens_seen": 48472960, + "step": 2960, + "train_runtime": 24054.6331, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.7945454545454544, + "grad_norm": 0.010728915221989155, + "learning_rate": 9.294848722613326e-05, + "loss": 0.010784944519400597, + "num_input_tokens_seen": 48489336, + "step": 2961, + "train_runtime": 24062.7468, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.7951515151515152, + "grad_norm": 0.007336590439081192, + "learning_rate": 9.294356271530151e-05, + "loss": 0.011799480766057968, + "num_input_tokens_seen": 48505712, + "step": 2962, + "train_runtime": 24070.8603, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.7957575757575759, + "grad_norm": 0.00758334482088685, + "learning_rate": 9.2938636616071e-05, + "loss": 0.01114749163389206, + "num_input_tokens_seen": 48522088, + "step": 2963, + "train_runtime": 24078.9751, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.7963636363636364, + "grad_norm": 0.010609005577862263, + "learning_rate": 9.293370892862395e-05, + "loss": 0.012277994304895401, + "num_input_tokens_seen": 48538464, + "step": 2964, + "train_runtime": 24087.0912, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.7969696969696969, + "grad_norm": 0.007987729273736477, + "learning_rate": 9.29287796531426e-05, + "loss": 0.012755843810737133, + "num_input_tokens_seen": 48554840, + "step": 2965, + "train_runtime": 24095.2123, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.7975757575757576, + "grad_norm": 0.00924461055546999, + "learning_rate": 9.29238487898093e-05, + "loss": 0.013270684517920017, + "num_input_tokens_seen": 48571216, + "step": 2966, + "train_runtime": 24103.3345, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.7981818181818183, + "grad_norm": 0.0071054198779165745, + "learning_rate": 9.291891633880642e-05, + "loss": 0.012391364201903343, + "num_input_tokens_seen": 48587592, + "step": 2967, + "train_runtime": 24111.4534, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.7987878787878788, + "grad_norm": 0.009188920259475708, + "learning_rate": 9.29139823003164e-05, + "loss": 0.011726969853043556, + "num_input_tokens_seen": 48603968, + "step": 2968, + "train_runtime": 24119.5686, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.7993939393939393, + "grad_norm": 0.005871398374438286, + "learning_rate": 9.290904667452177e-05, + "loss": 0.01141232531517744, + "num_input_tokens_seen": 48620344, + "step": 2969, + "train_runtime": 24127.6861, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.8, + "grad_norm": 0.010933088138699532, + "learning_rate": 9.290410946160504e-05, + "loss": 0.012397650629281998, + "num_input_tokens_seen": 48636720, + "step": 2970, + "train_runtime": 24135.8006, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.8006060606060608, + "grad_norm": 0.008161036297678947, + "learning_rate": 9.289917066174886e-05, + "loss": 0.012152892537415028, + "num_input_tokens_seen": 48653096, + "step": 2971, + "train_runtime": 24143.9187, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.8012121212121213, + "grad_norm": 0.007746783550828695, + "learning_rate": 9.28942302751359e-05, + "loss": 0.013096818700432777, + "num_input_tokens_seen": 48669472, + "step": 2972, + "train_runtime": 24152.0341, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.8018181818181818, + "grad_norm": 0.011512810364365578, + "learning_rate": 9.28892883019489e-05, + "loss": 0.012248929589986801, + "num_input_tokens_seen": 48685848, + "step": 2973, + "train_runtime": 24160.1509, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.8024242424242423, + "grad_norm": 0.009246395900845528, + "learning_rate": 9.288434474237064e-05, + "loss": 0.011867566034197807, + "num_input_tokens_seen": 48702224, + "step": 2974, + "train_runtime": 24168.2635, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.803030303030303, + "grad_norm": 0.017433451488614082, + "learning_rate": 9.287939959658399e-05, + "loss": 0.013736135326325893, + "num_input_tokens_seen": 48718600, + "step": 2975, + "train_runtime": 24176.3771, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.8036363636363637, + "grad_norm": 0.010817728005349636, + "learning_rate": 9.287445286477184e-05, + "loss": 0.011506814509630203, + "num_input_tokens_seen": 48734976, + "step": 2976, + "train_runtime": 24184.4902, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.8042424242424242, + "grad_norm": 0.009098121896386147, + "learning_rate": 9.286950454711717e-05, + "loss": 0.012756666168570518, + "num_input_tokens_seen": 48751352, + "step": 2977, + "train_runtime": 24192.6057, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.8048484848484847, + "grad_norm": 0.009739307686686516, + "learning_rate": 9.286455464380304e-05, + "loss": 0.013063987717032433, + "num_input_tokens_seen": 48767728, + "step": 2978, + "train_runtime": 24200.7221, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.8054545454545454, + "grad_norm": 0.014028403908014297, + "learning_rate": 9.285960315501248e-05, + "loss": 0.01358321774750948, + "num_input_tokens_seen": 48784104, + "step": 2979, + "train_runtime": 24208.8405, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.8060606060606061, + "grad_norm": 0.008561541326344013, + "learning_rate": 9.285465008092868e-05, + "loss": 0.0112238060683012, + "num_input_tokens_seen": 48800480, + "step": 2980, + "train_runtime": 24216.9613, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.8066666666666666, + "grad_norm": 0.014710756950080395, + "learning_rate": 9.284969542173482e-05, + "loss": 0.012869363650679588, + "num_input_tokens_seen": 48816856, + "step": 2981, + "train_runtime": 24225.0787, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.8072727272727271, + "grad_norm": 0.004121546167880297, + "learning_rate": 9.284473917761419e-05, + "loss": 0.011222013272345066, + "num_input_tokens_seen": 48833232, + "step": 2982, + "train_runtime": 24233.1928, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.8078787878787879, + "grad_norm": 0.016711929813027382, + "learning_rate": 9.283978134875006e-05, + "loss": 0.013897864148020744, + "num_input_tokens_seen": 48849608, + "step": 2983, + "train_runtime": 24241.3078, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.8084848484848486, + "grad_norm": 0.010518312454223633, + "learning_rate": 9.283482193532587e-05, + "loss": 0.011868288740515709, + "num_input_tokens_seen": 48865984, + "step": 2984, + "train_runtime": 24249.4339, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.809090909090909, + "grad_norm": 0.006691992282867432, + "learning_rate": 9.282986093752504e-05, + "loss": 0.010689632967114449, + "num_input_tokens_seen": 48882360, + "step": 2985, + "train_runtime": 24257.5472, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.8096969696969696, + "grad_norm": 0.008037862367928028, + "learning_rate": 9.282489835553106e-05, + "loss": 0.011833954602479935, + "num_input_tokens_seen": 48898736, + "step": 2986, + "train_runtime": 24265.6632, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.8103030303030303, + "grad_norm": 0.009776163846254349, + "learning_rate": 9.281993418952746e-05, + "loss": 0.012045754119753838, + "num_input_tokens_seen": 48915112, + "step": 2987, + "train_runtime": 24273.7804, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.810909090909091, + "grad_norm": 0.012512095272541046, + "learning_rate": 9.28149684396979e-05, + "loss": 0.012259161099791527, + "num_input_tokens_seen": 48931488, + "step": 2988, + "train_runtime": 24281.8933, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.8115151515151515, + "grad_norm": 0.010951677337288857, + "learning_rate": 9.281000110622605e-05, + "loss": 0.012102634645998478, + "num_input_tokens_seen": 48947864, + "step": 2989, + "train_runtime": 24290.0107, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.812121212121212, + "grad_norm": 0.004510990809649229, + "learning_rate": 9.28050321892956e-05, + "loss": 0.012496976181864738, + "num_input_tokens_seen": 48964240, + "step": 2990, + "train_runtime": 24298.1351, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.8127272727272727, + "grad_norm": 0.007676657754927874, + "learning_rate": 9.280006168909039e-05, + "loss": 0.011141535826027393, + "num_input_tokens_seen": 48980616, + "step": 2991, + "train_runtime": 24306.2519, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 0.004773823544383049, + "learning_rate": 9.279508960579424e-05, + "loss": 0.011702566407620907, + "num_input_tokens_seen": 48996992, + "step": 2992, + "train_runtime": 24314.3665, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.813939393939394, + "grad_norm": 0.013620059005916119, + "learning_rate": 9.279011593959106e-05, + "loss": 0.013101032935082912, + "num_input_tokens_seen": 49013368, + "step": 2993, + "train_runtime": 24322.4857, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.8145454545454545, + "grad_norm": 0.00769168371334672, + "learning_rate": 9.278514069066483e-05, + "loss": 0.012206891551613808, + "num_input_tokens_seen": 49029744, + "step": 2994, + "train_runtime": 24330.6009, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.8151515151515152, + "grad_norm": 0.017870178446173668, + "learning_rate": 9.278016385919957e-05, + "loss": 0.013382025063037872, + "num_input_tokens_seen": 49046120, + "step": 2995, + "train_runtime": 24338.7174, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.815757575757576, + "grad_norm": 0.024471454322338104, + "learning_rate": 9.277518544537934e-05, + "loss": 0.012662556953728199, + "num_input_tokens_seen": 49062496, + "step": 2996, + "train_runtime": 24346.8339, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.8163636363636364, + "grad_norm": 0.0086158262565732, + "learning_rate": 9.277020544938832e-05, + "loss": 0.012152843177318573, + "num_input_tokens_seen": 49078872, + "step": 2997, + "train_runtime": 24354.9513, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.816969696969697, + "grad_norm": 0.025600312277674675, + "learning_rate": 9.276522387141068e-05, + "loss": 0.015133306384086609, + "num_input_tokens_seen": 49095248, + "step": 2998, + "train_runtime": 24363.07, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.8175757575757576, + "grad_norm": 0.010355425998568535, + "learning_rate": 9.27602407116307e-05, + "loss": 0.012976177968084812, + "num_input_tokens_seen": 49111624, + "step": 2999, + "train_runtime": 24371.1852, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.01108719315379858, + "learning_rate": 9.275525597023267e-05, + "loss": 0.01246652752161026, + "num_input_tokens_seen": 49128000, + "step": 3000, + "train_runtime": 24379.3032, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.8187878787878788, + "grad_norm": 0.010925337672233582, + "learning_rate": 9.275026964740101e-05, + "loss": 0.01247719768434763, + "num_input_tokens_seen": 49144376, + "step": 3001, + "train_runtime": 24388.476, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.8193939393939393, + "grad_norm": 0.005827899090945721, + "learning_rate": 9.274528174332011e-05, + "loss": 0.011851150542497635, + "num_input_tokens_seen": 49160752, + "step": 3002, + "train_runtime": 24396.5903, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 0.006017337553203106, + "learning_rate": 9.274029225817449e-05, + "loss": 0.012704812921583652, + "num_input_tokens_seen": 49177128, + "step": 3003, + "train_runtime": 24404.7062, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.8206060606060606, + "grad_norm": 0.010842292569577694, + "learning_rate": 9.273530119214868e-05, + "loss": 0.012414321303367615, + "num_input_tokens_seen": 49193504, + "step": 3004, + "train_runtime": 24412.8222, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.8212121212121213, + "grad_norm": 0.013193870894610882, + "learning_rate": 9.27303085454273e-05, + "loss": 0.013441788032650948, + "num_input_tokens_seen": 49209880, + "step": 3005, + "train_runtime": 24420.9392, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.8218181818181818, + "grad_norm": 0.009797339327633381, + "learning_rate": 9.272531431819504e-05, + "loss": 0.012301875278353691, + "num_input_tokens_seen": 49226256, + "step": 3006, + "train_runtime": 24429.0554, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.8224242424242423, + "grad_norm": 0.010019945912063122, + "learning_rate": 9.27203185106366e-05, + "loss": 0.01279345341026783, + "num_input_tokens_seen": 49242632, + "step": 3007, + "train_runtime": 24437.167, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.823030303030303, + "grad_norm": 0.011813540011644363, + "learning_rate": 9.271532112293678e-05, + "loss": 0.012321519665420055, + "num_input_tokens_seen": 49259008, + "step": 3008, + "train_runtime": 24445.2808, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.8236363636363637, + "grad_norm": 0.010466402396559715, + "learning_rate": 9.27103221552804e-05, + "loss": 0.012350622564554214, + "num_input_tokens_seen": 49275384, + "step": 3009, + "train_runtime": 24453.3942, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.8242424242424242, + "grad_norm": 0.016839874908328056, + "learning_rate": 9.270532160785238e-05, + "loss": 0.013571641407907009, + "num_input_tokens_seen": 49291760, + "step": 3010, + "train_runtime": 24461.5065, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.8248484848484847, + "grad_norm": 0.009407256729900837, + "learning_rate": 9.270031948083769e-05, + "loss": 0.011951828375458717, + "num_input_tokens_seen": 49308136, + "step": 3011, + "train_runtime": 24469.6211, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.8254545454545454, + "grad_norm": 0.01694776676595211, + "learning_rate": 9.269531577442132e-05, + "loss": 0.012847152538597584, + "num_input_tokens_seen": 49324512, + "step": 3012, + "train_runtime": 24477.7371, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.8260606060606062, + "grad_norm": 0.012286297976970673, + "learning_rate": 9.269031048878838e-05, + "loss": 0.01283589843660593, + "num_input_tokens_seen": 49340888, + "step": 3013, + "train_runtime": 24485.8537, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.8266666666666667, + "grad_norm": 0.003697991603985429, + "learning_rate": 9.268530362412398e-05, + "loss": 0.01160636730492115, + "num_input_tokens_seen": 49357264, + "step": 3014, + "train_runtime": 24493.9669, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.8272727272727272, + "grad_norm": 0.011356630362570286, + "learning_rate": 9.268029518061334e-05, + "loss": 0.013032147660851479, + "num_input_tokens_seen": 49373640, + "step": 3015, + "train_runtime": 24502.0825, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.8278787878787879, + "grad_norm": 0.0060284812934696674, + "learning_rate": 9.267528515844168e-05, + "loss": 0.01141420565545559, + "num_input_tokens_seen": 49390016, + "step": 3016, + "train_runtime": 24510.2002, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.8284848484848486, + "grad_norm": 0.0051521072164177895, + "learning_rate": 9.267027355779434e-05, + "loss": 0.011409430764615536, + "num_input_tokens_seen": 49406392, + "step": 3017, + "train_runtime": 24518.3136, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.829090909090909, + "grad_norm": 0.005750606767833233, + "learning_rate": 9.266526037885668e-05, + "loss": 0.011167693883180618, + "num_input_tokens_seen": 49422768, + "step": 3018, + "train_runtime": 24526.4326, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.8296969696969696, + "grad_norm": 0.011581099592149258, + "learning_rate": 9.26602456218141e-05, + "loss": 0.013885931111872196, + "num_input_tokens_seen": 49439144, + "step": 3019, + "train_runtime": 24534.5512, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.8303030303030303, + "grad_norm": 0.006689704954624176, + "learning_rate": 9.265522928685215e-05, + "loss": 0.011876719072461128, + "num_input_tokens_seen": 49455520, + "step": 3020, + "train_runtime": 24542.6651, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.830909090909091, + "grad_norm": 0.008976902812719345, + "learning_rate": 9.26502113741563e-05, + "loss": 0.0122674610465765, + "num_input_tokens_seen": 49471896, + "step": 3021, + "train_runtime": 24550.7824, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.8315151515151515, + "grad_norm": 0.005990789737552404, + "learning_rate": 9.26451918839122e-05, + "loss": 0.012194567359983921, + "num_input_tokens_seen": 49488272, + "step": 3022, + "train_runtime": 24558.9059, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.832121212121212, + "grad_norm": 0.007173141464591026, + "learning_rate": 9.264017081630551e-05, + "loss": 0.013572081923484802, + "num_input_tokens_seen": 49504648, + "step": 3023, + "train_runtime": 24567.0235, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.8327272727272728, + "grad_norm": 0.008800432085990906, + "learning_rate": 9.263514817152195e-05, + "loss": 0.011637565679848194, + "num_input_tokens_seen": 49521024, + "step": 3024, + "train_runtime": 24575.1469, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.015690628439188004, + "learning_rate": 9.263012394974726e-05, + "loss": 0.012906611897051334, + "num_input_tokens_seen": 49537400, + "step": 3025, + "train_runtime": 24583.2632, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.833939393939394, + "grad_norm": 0.013949367217719555, + "learning_rate": 9.262509815116732e-05, + "loss": 0.011235008016228676, + "num_input_tokens_seen": 49553776, + "step": 3026, + "train_runtime": 24591.3813, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.8345454545454545, + "grad_norm": 0.012325744144618511, + "learning_rate": 9.262007077596799e-05, + "loss": 0.012805722653865814, + "num_input_tokens_seen": 49570152, + "step": 3027, + "train_runtime": 24599.4999, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.8351515151515152, + "grad_norm": 0.009210403077304363, + "learning_rate": 9.261504182433528e-05, + "loss": 0.012145797722041607, + "num_input_tokens_seen": 49586528, + "step": 3028, + "train_runtime": 24607.6184, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.835757575757576, + "grad_norm": 0.00966776255518198, + "learning_rate": 9.261001129645513e-05, + "loss": 0.012410152703523636, + "num_input_tokens_seen": 49602904, + "step": 3029, + "train_runtime": 24615.7372, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.8363636363636364, + "grad_norm": 0.008486943319439888, + "learning_rate": 9.260497919251364e-05, + "loss": 0.01206645742058754, + "num_input_tokens_seen": 49619280, + "step": 3030, + "train_runtime": 24623.8556, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.836969696969697, + "grad_norm": 0.007263388019055128, + "learning_rate": 9.259994551269694e-05, + "loss": 0.011400827206671238, + "num_input_tokens_seen": 49635656, + "step": 3031, + "train_runtime": 24631.9724, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.8375757575757574, + "grad_norm": 0.005560369696468115, + "learning_rate": 9.259491025719122e-05, + "loss": 0.012198167853057384, + "num_input_tokens_seen": 49652032, + "step": 3032, + "train_runtime": 24640.0936, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.8381818181818181, + "grad_norm": 0.008768472820520401, + "learning_rate": 9.258987342618273e-05, + "loss": 0.012726176530122757, + "num_input_tokens_seen": 49668408, + "step": 3033, + "train_runtime": 24648.2093, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.8387878787878789, + "grad_norm": 0.009570861235260963, + "learning_rate": 9.258483501985775e-05, + "loss": 0.011579609476029873, + "num_input_tokens_seen": 49684784, + "step": 3034, + "train_runtime": 24656.324, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.8393939393939394, + "grad_norm": 0.009371986612677574, + "learning_rate": 9.257979503840266e-05, + "loss": 0.012336795218288898, + "num_input_tokens_seen": 49701160, + "step": 3035, + "train_runtime": 24664.4461, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.009010751731693745, + "learning_rate": 9.257475348200387e-05, + "loss": 0.011590475216507912, + "num_input_tokens_seen": 49717536, + "step": 3036, + "train_runtime": 24672.5666, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.8406060606060606, + "grad_norm": 0.008704320527613163, + "learning_rate": 9.256971035084785e-05, + "loss": 0.012646995484828949, + "num_input_tokens_seen": 49733912, + "step": 3037, + "train_runtime": 24680.6929, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.8412121212121213, + "grad_norm": 0.011614583432674408, + "learning_rate": 9.256466564512115e-05, + "loss": 0.012255294248461723, + "num_input_tokens_seen": 49750288, + "step": 3038, + "train_runtime": 24688.816, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.8418181818181818, + "grad_norm": 0.009356755763292313, + "learning_rate": 9.255961936501036e-05, + "loss": 0.012481609359383583, + "num_input_tokens_seen": 49766664, + "step": 3039, + "train_runtime": 24696.9324, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.8424242424242423, + "grad_norm": 0.008717809803783894, + "learning_rate": 9.255457151070213e-05, + "loss": 0.013015971519052982, + "num_input_tokens_seen": 49783040, + "step": 3040, + "train_runtime": 24705.0478, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.843030303030303, + "grad_norm": 0.008320217952132225, + "learning_rate": 9.254952208238318e-05, + "loss": 0.01091097667813301, + "num_input_tokens_seen": 49799416, + "step": 3041, + "train_runtime": 24713.1648, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.8436363636363637, + "grad_norm": 0.003688125405460596, + "learning_rate": 9.254447108024026e-05, + "loss": 0.012057192623615265, + "num_input_tokens_seen": 49815792, + "step": 3042, + "train_runtime": 24721.277, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.8442424242424242, + "grad_norm": 0.004650153685361147, + "learning_rate": 9.25394185044602e-05, + "loss": 0.011981946416199207, + "num_input_tokens_seen": 49832168, + "step": 3043, + "train_runtime": 24729.391, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.8448484848484847, + "grad_norm": 0.006913701072335243, + "learning_rate": 9.253436435522991e-05, + "loss": 0.011715936474502087, + "num_input_tokens_seen": 49848544, + "step": 3044, + "train_runtime": 24737.5075, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.8454545454545455, + "grad_norm": 0.008736771531403065, + "learning_rate": 9.25293086327363e-05, + "loss": 0.012155945412814617, + "num_input_tokens_seen": 49864920, + "step": 3045, + "train_runtime": 24745.6316, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.8460606060606062, + "grad_norm": 0.021866118535399437, + "learning_rate": 9.25242513371664e-05, + "loss": 0.013389071449637413, + "num_input_tokens_seen": 49881296, + "step": 3046, + "train_runtime": 24753.7474, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.8466666666666667, + "grad_norm": 0.009575455449521542, + "learning_rate": 9.251919246870724e-05, + "loss": 0.011815833859145641, + "num_input_tokens_seen": 49897672, + "step": 3047, + "train_runtime": 24761.8582, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.8472727272727272, + "grad_norm": 0.012831861153244972, + "learning_rate": 9.251413202754595e-05, + "loss": 0.012752903625369072, + "num_input_tokens_seen": 49914048, + "step": 3048, + "train_runtime": 24769.9707, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.847878787878788, + "grad_norm": 0.009517887607216835, + "learning_rate": 9.250907001386972e-05, + "loss": 0.011513019911944866, + "num_input_tokens_seen": 49930424, + "step": 3049, + "train_runtime": 24778.082, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.013358705677092075, + "learning_rate": 9.250400642786576e-05, + "loss": 0.012704689055681229, + "num_input_tokens_seen": 49946800, + "step": 3050, + "train_runtime": 24786.1962, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.8490909090909091, + "grad_norm": 0.011109757237136364, + "learning_rate": 9.24989412697214e-05, + "loss": 0.012690899893641472, + "num_input_tokens_seen": 49963176, + "step": 3051, + "train_runtime": 24794.3077, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.8496969696969696, + "grad_norm": 0.0029274390544742346, + "learning_rate": 9.249387453962394e-05, + "loss": 0.011420530267059803, + "num_input_tokens_seen": 49979552, + "step": 3052, + "train_runtime": 24802.4218, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.8503030303030303, + "grad_norm": 0.006933121010661125, + "learning_rate": 9.248880623776081e-05, + "loss": 0.011758833192288876, + "num_input_tokens_seen": 49995928, + "step": 3053, + "train_runtime": 24810.5355, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.850909090909091, + "grad_norm": 0.008510863408446312, + "learning_rate": 9.248373636431951e-05, + "loss": 0.011804106645286083, + "num_input_tokens_seen": 50012304, + "step": 3054, + "train_runtime": 24818.65, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.8515151515151516, + "grad_norm": 0.00596174830570817, + "learning_rate": 9.247866491948752e-05, + "loss": 0.01262554433196783, + "num_input_tokens_seen": 50028680, + "step": 3055, + "train_runtime": 24826.7631, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.852121212121212, + "grad_norm": 0.011812315322458744, + "learning_rate": 9.247359190345243e-05, + "loss": 0.012343344278633595, + "num_input_tokens_seen": 50045056, + "step": 3056, + "train_runtime": 24834.8763, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.8527272727272728, + "grad_norm": 0.010871347971260548, + "learning_rate": 9.24685173164019e-05, + "loss": 0.01399338711053133, + "num_input_tokens_seen": 50061432, + "step": 3057, + "train_runtime": 24842.9898, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.8533333333333335, + "grad_norm": 0.01032578106969595, + "learning_rate": 9.246344115852361e-05, + "loss": 0.011646384373307228, + "num_input_tokens_seen": 50077808, + "step": 3058, + "train_runtime": 24851.1039, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.853939393939394, + "grad_norm": 0.009668087586760521, + "learning_rate": 9.245836343000533e-05, + "loss": 0.01203220710158348, + "num_input_tokens_seen": 50094184, + "step": 3059, + "train_runtime": 24859.2162, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.8545454545454545, + "grad_norm": 0.0030508043710142374, + "learning_rate": 9.245328413103488e-05, + "loss": 0.011559851467609406, + "num_input_tokens_seen": 50110560, + "step": 3060, + "train_runtime": 24867.3359, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.855151515151515, + "grad_norm": 0.012229938060045242, + "learning_rate": 9.244820326180011e-05, + "loss": 0.012419401668012142, + "num_input_tokens_seen": 50126936, + "step": 3061, + "train_runtime": 24875.4521, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.8557575757575757, + "grad_norm": 0.01794649288058281, + "learning_rate": 9.244312082248897e-05, + "loss": 0.013206228613853455, + "num_input_tokens_seen": 50143312, + "step": 3062, + "train_runtime": 24883.5707, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.8563636363636364, + "grad_norm": 0.007639274932444096, + "learning_rate": 9.243803681328943e-05, + "loss": 0.011870847083628178, + "num_input_tokens_seen": 50159688, + "step": 3063, + "train_runtime": 24891.6848, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.856969696969697, + "grad_norm": 0.0070930058136582375, + "learning_rate": 9.243295123438958e-05, + "loss": 0.010470615699887276, + "num_input_tokens_seen": 50176064, + "step": 3064, + "train_runtime": 24899.7968, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.8575757575757574, + "grad_norm": 0.007529653608798981, + "learning_rate": 9.24278640859775e-05, + "loss": 0.013072483241558075, + "num_input_tokens_seen": 50192440, + "step": 3065, + "train_runtime": 24907.9126, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.8581818181818182, + "grad_norm": 0.005360973067581654, + "learning_rate": 9.242277536824134e-05, + "loss": 0.011755731888115406, + "num_input_tokens_seen": 50208816, + "step": 3066, + "train_runtime": 24916.0331, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.8587878787878789, + "grad_norm": 0.00501141045242548, + "learning_rate": 9.241768508136933e-05, + "loss": 0.011487549170851707, + "num_input_tokens_seen": 50225192, + "step": 3067, + "train_runtime": 24924.1492, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.8593939393939394, + "grad_norm": 0.01034004520624876, + "learning_rate": 9.241259322554973e-05, + "loss": 0.012985237874090672, + "num_input_tokens_seen": 50241568, + "step": 3068, + "train_runtime": 24932.2668, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 0.007243257015943527, + "learning_rate": 9.240749980097094e-05, + "loss": 0.012754643335938454, + "num_input_tokens_seen": 50257944, + "step": 3069, + "train_runtime": 24940.3821, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.8606060606060606, + "grad_norm": 0.01349063403904438, + "learning_rate": 9.24024048078213e-05, + "loss": 0.012604167684912682, + "num_input_tokens_seen": 50274320, + "step": 3070, + "train_runtime": 24948.5006, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.8612121212121213, + "grad_norm": 0.004877285100519657, + "learning_rate": 9.23973082462893e-05, + "loss": 0.011970577761530876, + "num_input_tokens_seen": 50290696, + "step": 3071, + "train_runtime": 24956.6188, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.8618181818181818, + "grad_norm": 0.009895937517285347, + "learning_rate": 9.239221011656341e-05, + "loss": 0.01223594881594181, + "num_input_tokens_seen": 50307072, + "step": 3072, + "train_runtime": 24964.7356, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.8624242424242423, + "grad_norm": 0.006175011862069368, + "learning_rate": 9.238711041883222e-05, + "loss": 0.012292975559830666, + "num_input_tokens_seen": 50323448, + "step": 3073, + "train_runtime": 24972.8571, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.863030303030303, + "grad_norm": 0.008000146597623825, + "learning_rate": 9.238200915328438e-05, + "loss": 0.01205262541770935, + "num_input_tokens_seen": 50339824, + "step": 3074, + "train_runtime": 24980.9726, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.8636363636363638, + "grad_norm": 0.007993191480636597, + "learning_rate": 9.237690632010853e-05, + "loss": 0.013416312634944916, + "num_input_tokens_seen": 50356200, + "step": 3075, + "train_runtime": 24989.091, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.8642424242424243, + "grad_norm": 0.005173394922167063, + "learning_rate": 9.237180191949347e-05, + "loss": 0.012492675334215164, + "num_input_tokens_seen": 50372576, + "step": 3076, + "train_runtime": 24997.2059, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.8648484848484848, + "grad_norm": 0.008313057012856007, + "learning_rate": 9.236669595162797e-05, + "loss": 0.012614956125617027, + "num_input_tokens_seen": 50388952, + "step": 3077, + "train_runtime": 25005.3215, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.8654545454545455, + "grad_norm": 0.00968491192907095, + "learning_rate": 9.236158841670088e-05, + "loss": 0.012565825134515762, + "num_input_tokens_seen": 50405328, + "step": 3078, + "train_runtime": 25013.4393, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.8660606060606062, + "grad_norm": 0.007664080243557692, + "learning_rate": 9.235647931490112e-05, + "loss": 0.011506912298500538, + "num_input_tokens_seen": 50421704, + "step": 3079, + "train_runtime": 25021.5561, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.00756417540833354, + "learning_rate": 9.23513686464177e-05, + "loss": 0.01151568628847599, + "num_input_tokens_seen": 50438080, + "step": 3080, + "train_runtime": 25029.6711, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.8672727272727272, + "grad_norm": 0.008650683797895908, + "learning_rate": 9.23462564114396e-05, + "loss": 0.01214786246418953, + "num_input_tokens_seen": 50454456, + "step": 3081, + "train_runtime": 25037.7853, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.867878787878788, + "grad_norm": 0.007602212950587273, + "learning_rate": 9.234114261015597e-05, + "loss": 0.012525323778390884, + "num_input_tokens_seen": 50470832, + "step": 3082, + "train_runtime": 25045.9002, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.8684848484848486, + "grad_norm": 0.011407344602048397, + "learning_rate": 9.233602724275592e-05, + "loss": 0.012371896766126156, + "num_input_tokens_seen": 50487208, + "step": 3083, + "train_runtime": 25054.0193, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.8690909090909091, + "grad_norm": 0.006255041342228651, + "learning_rate": 9.233091030942866e-05, + "loss": 0.012409602291882038, + "num_input_tokens_seen": 50503584, + "step": 3084, + "train_runtime": 25062.1368, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.8696969696969696, + "grad_norm": 0.010671233758330345, + "learning_rate": 9.232579181036347e-05, + "loss": 0.012695337645709515, + "num_input_tokens_seen": 50519960, + "step": 3085, + "train_runtime": 25070.2486, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.8703030303030304, + "grad_norm": 0.009065535850822926, + "learning_rate": 9.232067174574968e-05, + "loss": 0.011437319219112396, + "num_input_tokens_seen": 50536336, + "step": 3086, + "train_runtime": 25078.3618, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.8709090909090909, + "grad_norm": 0.005685082171112299, + "learning_rate": 9.231555011577663e-05, + "loss": 0.011578761972486973, + "num_input_tokens_seen": 50552712, + "step": 3087, + "train_runtime": 25086.4723, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.8715151515151516, + "grad_norm": 0.0075594717636704445, + "learning_rate": 9.23104269206338e-05, + "loss": 0.011918365955352783, + "num_input_tokens_seen": 50569088, + "step": 3088, + "train_runtime": 25094.5849, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.872121212121212, + "grad_norm": 0.006498353555798531, + "learning_rate": 9.230530216051069e-05, + "loss": 0.012700119987130165, + "num_input_tokens_seen": 50585464, + "step": 3089, + "train_runtime": 25102.7055, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.8727272727272726, + "grad_norm": 0.006765829864889383, + "learning_rate": 9.230017583559682e-05, + "loss": 0.011392736807465553, + "num_input_tokens_seen": 50601840, + "step": 3090, + "train_runtime": 25110.8334, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.8733333333333333, + "grad_norm": 0.011798511259257793, + "learning_rate": 9.229504794608182e-05, + "loss": 0.013159212656319141, + "num_input_tokens_seen": 50618216, + "step": 3091, + "train_runtime": 25118.9529, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.873939393939394, + "grad_norm": 0.008059892803430557, + "learning_rate": 9.228991849215538e-05, + "loss": 0.012083720415830612, + "num_input_tokens_seen": 50634592, + "step": 3092, + "train_runtime": 25127.0722, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.8745454545454545, + "grad_norm": 0.006497847847640514, + "learning_rate": 9.22847874740072e-05, + "loss": 0.01243192795664072, + "num_input_tokens_seen": 50650968, + "step": 3093, + "train_runtime": 25135.1934, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.875151515151515, + "grad_norm": 0.008748437277972698, + "learning_rate": 9.227965489182708e-05, + "loss": 0.011547371745109558, + "num_input_tokens_seen": 50667344, + "step": 3094, + "train_runtime": 25143.3106, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.8757575757575757, + "grad_norm": 0.009610083885490894, + "learning_rate": 9.227452074580485e-05, + "loss": 0.01293950341641903, + "num_input_tokens_seen": 50683720, + "step": 3095, + "train_runtime": 25151.4328, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.8763636363636365, + "grad_norm": 0.006703491788357496, + "learning_rate": 9.226938503613043e-05, + "loss": 0.012840256094932556, + "num_input_tokens_seen": 50700096, + "step": 3096, + "train_runtime": 25159.546, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.876969696969697, + "grad_norm": 0.006835185922682285, + "learning_rate": 9.226424776299378e-05, + "loss": 0.012015492655336857, + "num_input_tokens_seen": 50716472, + "step": 3097, + "train_runtime": 25167.6608, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.8775757575757575, + "grad_norm": 0.014235780574381351, + "learning_rate": 9.22591089265849e-05, + "loss": 0.01358760241419077, + "num_input_tokens_seen": 50732848, + "step": 3098, + "train_runtime": 25175.7736, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.8781818181818182, + "grad_norm": 0.007752373814582825, + "learning_rate": 9.225396852709389e-05, + "loss": 0.01264961063861847, + "num_input_tokens_seen": 50749224, + "step": 3099, + "train_runtime": 25183.896, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 0.00946936383843422, + "learning_rate": 9.224882656471086e-05, + "loss": 0.013263813219964504, + "num_input_tokens_seen": 50765600, + "step": 3100, + "train_runtime": 25192.0111, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.8793939393939394, + "grad_norm": 0.00941481627523899, + "learning_rate": 9.2243683039626e-05, + "loss": 0.013559294864535332, + "num_input_tokens_seen": 50781976, + "step": 3101, + "train_runtime": 25201.0372, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.88, + "grad_norm": 0.0050283982418477535, + "learning_rate": 9.22385379520296e-05, + "loss": 0.012278541922569275, + "num_input_tokens_seen": 50798352, + "step": 3102, + "train_runtime": 25209.1467, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.8806060606060606, + "grad_norm": 0.034983959048986435, + "learning_rate": 9.223339130211192e-05, + "loss": 0.013488172553479671, + "num_input_tokens_seen": 50814728, + "step": 3103, + "train_runtime": 25217.2561, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.8812121212121213, + "grad_norm": 0.006300953682512045, + "learning_rate": 9.222824309006335e-05, + "loss": 0.011517742648720741, + "num_input_tokens_seen": 50831104, + "step": 3104, + "train_runtime": 25225.3653, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.8818181818181818, + "grad_norm": 0.009675579145550728, + "learning_rate": 9.222309331607428e-05, + "loss": 0.011878578923642635, + "num_input_tokens_seen": 50847480, + "step": 3105, + "train_runtime": 25233.4811, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.8824242424242423, + "grad_norm": 0.010816797614097595, + "learning_rate": 9.221794198033525e-05, + "loss": 0.011553348042070866, + "num_input_tokens_seen": 50863856, + "step": 3106, + "train_runtime": 25241.5952, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.883030303030303, + "grad_norm": 0.007883163169026375, + "learning_rate": 9.221278908303674e-05, + "loss": 0.012542840093374252, + "num_input_tokens_seen": 50880232, + "step": 3107, + "train_runtime": 25249.7106, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.8836363636363638, + "grad_norm": 0.009454211220145226, + "learning_rate": 9.220763462436937e-05, + "loss": 0.01177777536213398, + "num_input_tokens_seen": 50896608, + "step": 3108, + "train_runtime": 25257.8324, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.8842424242424243, + "grad_norm": 0.012845884077250957, + "learning_rate": 9.220247860452378e-05, + "loss": 0.011665490455925465, + "num_input_tokens_seen": 50912984, + "step": 3109, + "train_runtime": 25265.9507, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.8848484848484848, + "grad_norm": 0.008703289553523064, + "learning_rate": 9.21973210236907e-05, + "loss": 0.012141491286456585, + "num_input_tokens_seen": 50929360, + "step": 3110, + "train_runtime": 25274.0675, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.8854545454545455, + "grad_norm": 0.011421225033700466, + "learning_rate": 9.21921618820609e-05, + "loss": 0.013285665772855282, + "num_input_tokens_seen": 50945736, + "step": 3111, + "train_runtime": 25282.1874, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.8860606060606062, + "grad_norm": 0.009302772581577301, + "learning_rate": 9.218700117982519e-05, + "loss": 0.011570766568183899, + "num_input_tokens_seen": 50962112, + "step": 3112, + "train_runtime": 25290.3063, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.8866666666666667, + "grad_norm": 0.012814724817872047, + "learning_rate": 9.218183891717447e-05, + "loss": 0.011747146025300026, + "num_input_tokens_seen": 50978488, + "step": 3113, + "train_runtime": 25298.4184, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.8872727272727272, + "grad_norm": 0.011627566069364548, + "learning_rate": 9.217667509429966e-05, + "loss": 0.011807992123067379, + "num_input_tokens_seen": 50994864, + "step": 3114, + "train_runtime": 25306.5341, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.887878787878788, + "grad_norm": 0.010376955382525921, + "learning_rate": 9.217150971139178e-05, + "loss": 0.01147371158003807, + "num_input_tokens_seen": 51011240, + "step": 3115, + "train_runtime": 25314.6486, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.8884848484848484, + "grad_norm": 0.012348284013569355, + "learning_rate": 9.216634276864188e-05, + "loss": 0.011648658663034439, + "num_input_tokens_seen": 51027616, + "step": 3116, + "train_runtime": 25322.7595, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.8890909090909092, + "grad_norm": 0.009763936512172222, + "learning_rate": 9.216117426624107e-05, + "loss": 0.013211511075496674, + "num_input_tokens_seen": 51043992, + "step": 3117, + "train_runtime": 25330.8803, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.8896969696969697, + "grad_norm": 0.008509074337780476, + "learning_rate": 9.215600420438054e-05, + "loss": 0.013487070798873901, + "num_input_tokens_seen": 51060368, + "step": 3118, + "train_runtime": 25338.9955, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.8903030303030302, + "grad_norm": 0.0092775272205472, + "learning_rate": 9.215083258325152e-05, + "loss": 0.012261521071195602, + "num_input_tokens_seen": 51076744, + "step": 3119, + "train_runtime": 25347.1108, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.8909090909090909, + "grad_norm": 0.007059205323457718, + "learning_rate": 9.214565940304528e-05, + "loss": 0.011692165397107601, + "num_input_tokens_seen": 51093120, + "step": 3120, + "train_runtime": 25355.2335, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.8915151515151516, + "grad_norm": 0.01579814963042736, + "learning_rate": 9.214048466395316e-05, + "loss": 0.012515694834291935, + "num_input_tokens_seen": 51109496, + "step": 3121, + "train_runtime": 25363.3518, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.892121212121212, + "grad_norm": 0.008557470515370369, + "learning_rate": 9.213530836616657e-05, + "loss": 0.01166028343141079, + "num_input_tokens_seen": 51125872, + "step": 3122, + "train_runtime": 25371.4695, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.8927272727272726, + "grad_norm": 0.009341249242424965, + "learning_rate": 9.2130130509877e-05, + "loss": 0.01161861326545477, + "num_input_tokens_seen": 51142248, + "step": 3123, + "train_runtime": 25379.5857, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.8933333333333333, + "grad_norm": 0.009486070834100246, + "learning_rate": 9.212495109527594e-05, + "loss": 0.012822091579437256, + "num_input_tokens_seen": 51158624, + "step": 3124, + "train_runtime": 25387.6977, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.004312561824917793, + "learning_rate": 9.211977012255498e-05, + "loss": 0.012744562700390816, + "num_input_tokens_seen": 51175000, + "step": 3125, + "train_runtime": 25395.8131, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.8945454545454545, + "grad_norm": 0.008629312738776207, + "learning_rate": 9.211458759190573e-05, + "loss": 0.012353931553661823, + "num_input_tokens_seen": 51191376, + "step": 3126, + "train_runtime": 25403.9329, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.895151515151515, + "grad_norm": 0.005605903919786215, + "learning_rate": 9.210940350351991e-05, + "loss": 0.01195718813687563, + "num_input_tokens_seen": 51207752, + "step": 3127, + "train_runtime": 25412.0511, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.8957575757575758, + "grad_norm": 0.010096116922795773, + "learning_rate": 9.210421785758927e-05, + "loss": 0.01376781240105629, + "num_input_tokens_seen": 51224128, + "step": 3128, + "train_runtime": 25420.1712, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.8963636363636365, + "grad_norm": 0.010645140893757343, + "learning_rate": 9.209903065430558e-05, + "loss": 0.012773418799042702, + "num_input_tokens_seen": 51240504, + "step": 3129, + "train_runtime": 25428.2855, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.896969696969697, + "grad_norm": 0.012686858884990215, + "learning_rate": 9.209384189386075e-05, + "loss": 0.01278688758611679, + "num_input_tokens_seen": 51256880, + "step": 3130, + "train_runtime": 25436.403, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.8975757575757575, + "grad_norm": 0.0073151239193975925, + "learning_rate": 9.208865157644668e-05, + "loss": 0.012328793294727802, + "num_input_tokens_seen": 51273256, + "step": 3131, + "train_runtime": 25444.5158, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.8981818181818182, + "grad_norm": 0.009652036242187023, + "learning_rate": 9.208345970225535e-05, + "loss": 0.012286683544516563, + "num_input_tokens_seen": 51289632, + "step": 3132, + "train_runtime": 25452.6326, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.898787878787879, + "grad_norm": 0.008292704820632935, + "learning_rate": 9.207826627147879e-05, + "loss": 0.01197260431945324, + "num_input_tokens_seen": 51306008, + "step": 3133, + "train_runtime": 25460.7457, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.8993939393939394, + "grad_norm": 0.014060231857001781, + "learning_rate": 9.207307128430913e-05, + "loss": 0.013115906156599522, + "num_input_tokens_seen": 51322384, + "step": 3134, + "train_runtime": 25468.8578, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.9, + "grad_norm": 0.011946297250688076, + "learning_rate": 9.206787474093848e-05, + "loss": 0.011841751635074615, + "num_input_tokens_seen": 51338760, + "step": 3135, + "train_runtime": 25476.9731, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.9006060606060606, + "grad_norm": 0.007455665152519941, + "learning_rate": 9.206267664155907e-05, + "loss": 0.012197930365800858, + "num_input_tokens_seen": 51355136, + "step": 3136, + "train_runtime": 25485.0917, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.9012121212121214, + "grad_norm": 0.026311933994293213, + "learning_rate": 9.205747698636316e-05, + "loss": 0.012256120331585407, + "num_input_tokens_seen": 51371512, + "step": 3137, + "train_runtime": 25493.2028, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.9018181818181819, + "grad_norm": 0.004334684461355209, + "learning_rate": 9.205227577554307e-05, + "loss": 0.011752675287425518, + "num_input_tokens_seen": 51387888, + "step": 3138, + "train_runtime": 25501.3209, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.9024242424242424, + "grad_norm": 0.028817525133490562, + "learning_rate": 9.204707300929121e-05, + "loss": 0.01371039915829897, + "num_input_tokens_seen": 51404264, + "step": 3139, + "train_runtime": 25509.437, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.903030303030303, + "grad_norm": 0.012218418531119823, + "learning_rate": 9.204186868779999e-05, + "loss": 0.012458586134016514, + "num_input_tokens_seen": 51420640, + "step": 3140, + "train_runtime": 25517.548, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.9036363636363638, + "grad_norm": 0.010442850179970264, + "learning_rate": 9.203666281126193e-05, + "loss": 0.012032663449645042, + "num_input_tokens_seen": 51437016, + "step": 3141, + "train_runtime": 25525.6624, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.9042424242424243, + "grad_norm": 0.010629463009536266, + "learning_rate": 9.203145537986957e-05, + "loss": 0.012171917594969273, + "num_input_tokens_seen": 51453392, + "step": 3142, + "train_runtime": 25533.7739, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.9048484848484848, + "grad_norm": 0.004095530137419701, + "learning_rate": 9.202624639381552e-05, + "loss": 0.011937003582715988, + "num_input_tokens_seen": 51469768, + "step": 3143, + "train_runtime": 25541.8863, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.9054545454545453, + "grad_norm": 0.00807954091578722, + "learning_rate": 9.202103585329247e-05, + "loss": 0.012175404466688633, + "num_input_tokens_seen": 51486144, + "step": 3144, + "train_runtime": 25550.002, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.906060606060606, + "grad_norm": 0.012023149989545345, + "learning_rate": 9.201582375849313e-05, + "loss": 0.013452206738293171, + "num_input_tokens_seen": 51502520, + "step": 3145, + "train_runtime": 25558.118, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.9066666666666667, + "grad_norm": 0.006550980266183615, + "learning_rate": 9.20106101096103e-05, + "loss": 0.011639823205769062, + "num_input_tokens_seen": 51518896, + "step": 3146, + "train_runtime": 25566.2333, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.9072727272727272, + "grad_norm": 0.011515556834638119, + "learning_rate": 9.200539490683682e-05, + "loss": 0.0110179977491498, + "num_input_tokens_seen": 51535272, + "step": 3147, + "train_runtime": 25574.3546, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.9078787878787877, + "grad_norm": 0.014057625085115433, + "learning_rate": 9.200017815036557e-05, + "loss": 0.014424681663513184, + "num_input_tokens_seen": 51551648, + "step": 3148, + "train_runtime": 25582.4699, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.9084848484848485, + "grad_norm": 0.00884437095373869, + "learning_rate": 9.199495984038953e-05, + "loss": 0.011430526152253151, + "num_input_tokens_seen": 51568024, + "step": 3149, + "train_runtime": 25590.5833, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.007343432400375605, + "learning_rate": 9.198973997710169e-05, + "loss": 0.01341752428561449, + "num_input_tokens_seen": 51584400, + "step": 3150, + "train_runtime": 25598.6979, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.9096969696969697, + "grad_norm": 0.011537443846464157, + "learning_rate": 9.198451856069515e-05, + "loss": 0.012639664113521576, + "num_input_tokens_seen": 51600776, + "step": 3151, + "train_runtime": 25606.8086, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.9103030303030302, + "grad_norm": 0.006157017312943935, + "learning_rate": 9.197929559136304e-05, + "loss": 0.012154512107372284, + "num_input_tokens_seen": 51617152, + "step": 3152, + "train_runtime": 25614.9229, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.910909090909091, + "grad_norm": 0.014012346975505352, + "learning_rate": 9.197407106929851e-05, + "loss": 0.014261198230087757, + "num_input_tokens_seen": 51633528, + "step": 3153, + "train_runtime": 25623.042, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.9115151515151516, + "grad_norm": 0.011342213489115238, + "learning_rate": 9.196884499469486e-05, + "loss": 0.012025252915918827, + "num_input_tokens_seen": 51649904, + "step": 3154, + "train_runtime": 25631.1577, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.912121212121212, + "grad_norm": 0.008528700098395348, + "learning_rate": 9.196361736774535e-05, + "loss": 0.01199396327137947, + "num_input_tokens_seen": 51666280, + "step": 3155, + "train_runtime": 25639.2724, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.9127272727272726, + "grad_norm": 0.00821691658347845, + "learning_rate": 9.195838818864337e-05, + "loss": 0.012443341314792633, + "num_input_tokens_seen": 51682656, + "step": 3156, + "train_runtime": 25647.3857, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.9133333333333333, + "grad_norm": 0.010422799736261368, + "learning_rate": 9.19531574575823e-05, + "loss": 0.011776940897107124, + "num_input_tokens_seen": 51699032, + "step": 3157, + "train_runtime": 25655.4992, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.913939393939394, + "grad_norm": 0.012758140452206135, + "learning_rate": 9.194792517475565e-05, + "loss": 0.013998530805110931, + "num_input_tokens_seen": 51715408, + "step": 3158, + "train_runtime": 25663.6124, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.9145454545454546, + "grad_norm": 0.0021320863161236048, + "learning_rate": 9.194269134035692e-05, + "loss": 0.011572036892175674, + "num_input_tokens_seen": 51731784, + "step": 3159, + "train_runtime": 25671.7345, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.915151515151515, + "grad_norm": 0.005653473548591137, + "learning_rate": 9.193745595457974e-05, + "loss": 0.01234716922044754, + "num_input_tokens_seen": 51748160, + "step": 3160, + "train_runtime": 25679.8465, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.9157575757575758, + "grad_norm": 0.00827194843441248, + "learning_rate": 9.193221901761772e-05, + "loss": 0.011896461248397827, + "num_input_tokens_seen": 51764536, + "step": 3161, + "train_runtime": 25687.958, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.9163636363636365, + "grad_norm": 0.00640900107100606, + "learning_rate": 9.192698052966457e-05, + "loss": 0.011466245166957378, + "num_input_tokens_seen": 51780912, + "step": 3162, + "train_runtime": 25696.0738, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.916969696969697, + "grad_norm": 0.0063836583867669106, + "learning_rate": 9.192174049091407e-05, + "loss": 0.011264004744589329, + "num_input_tokens_seen": 51797288, + "step": 3163, + "train_runtime": 25704.1858, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.9175757575757575, + "grad_norm": 0.011097591370344162, + "learning_rate": 9.191649890156003e-05, + "loss": 0.014016124419867992, + "num_input_tokens_seen": 51813664, + "step": 3164, + "train_runtime": 25712.3007, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.9181818181818182, + "grad_norm": 0.008603382855653763, + "learning_rate": 9.191125576179634e-05, + "loss": 0.011245546862483025, + "num_input_tokens_seen": 51830040, + "step": 3165, + "train_runtime": 25720.4175, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.918787878787879, + "grad_norm": 0.007882286794483662, + "learning_rate": 9.19060110718169e-05, + "loss": 0.011330833658576012, + "num_input_tokens_seen": 51846416, + "step": 3166, + "train_runtime": 25728.533, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.9193939393939394, + "grad_norm": 0.009839195758104324, + "learning_rate": 9.190076483181572e-05, + "loss": 0.012397222220897675, + "num_input_tokens_seen": 51862792, + "step": 3167, + "train_runtime": 25736.6463, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.92, + "grad_norm": 0.021807054057717323, + "learning_rate": 9.189551704198683e-05, + "loss": 0.014394733123481274, + "num_input_tokens_seen": 51879168, + "step": 3168, + "train_runtime": 25744.7632, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.9206060606060606, + "grad_norm": 0.02324414625763893, + "learning_rate": 9.189026770252436e-05, + "loss": 0.013463632203638554, + "num_input_tokens_seen": 51895544, + "step": 3169, + "train_runtime": 25752.8788, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.9212121212121214, + "grad_norm": 0.008596673607826233, + "learning_rate": 9.18850168136225e-05, + "loss": 0.012573868036270142, + "num_input_tokens_seen": 51911920, + "step": 3170, + "train_runtime": 25760.9907, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.9218181818181819, + "grad_norm": 0.008097982965409756, + "learning_rate": 9.187976437547538e-05, + "loss": 0.013436982408165932, + "num_input_tokens_seen": 51928296, + "step": 3171, + "train_runtime": 25769.11, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.9224242424242424, + "grad_norm": 0.01106534618884325, + "learning_rate": 9.187451038827737e-05, + "loss": 0.012379739433526993, + "num_input_tokens_seen": 51944672, + "step": 3172, + "train_runtime": 25777.2322, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.9230303030303029, + "grad_norm": 0.011437936685979366, + "learning_rate": 9.186925485222276e-05, + "loss": 0.01257653534412384, + "num_input_tokens_seen": 51961048, + "step": 3173, + "train_runtime": 25785.3456, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.9236363636363636, + "grad_norm": 0.0026785824447870255, + "learning_rate": 9.186399776750596e-05, + "loss": 0.010649677366018295, + "num_input_tokens_seen": 51977424, + "step": 3174, + "train_runtime": 25793.4554, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.9242424242424243, + "grad_norm": 0.004529865458607674, + "learning_rate": 9.185873913432139e-05, + "loss": 0.01199475396424532, + "num_input_tokens_seen": 51993800, + "step": 3175, + "train_runtime": 25801.5682, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.9248484848484848, + "grad_norm": 0.008403346873819828, + "learning_rate": 9.185347895286358e-05, + "loss": 0.011764888651669025, + "num_input_tokens_seen": 52010176, + "step": 3176, + "train_runtime": 25809.6829, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.9254545454545453, + "grad_norm": 0.005153805483132601, + "learning_rate": 9.18482172233271e-05, + "loss": 0.011884771287441254, + "num_input_tokens_seen": 52026552, + "step": 3177, + "train_runtime": 25817.798, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.926060606060606, + "grad_norm": 0.008104590699076653, + "learning_rate": 9.184295394590655e-05, + "loss": 0.012589624151587486, + "num_input_tokens_seen": 52042928, + "step": 3178, + "train_runtime": 25825.9099, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.9266666666666667, + "grad_norm": 0.004483804106712341, + "learning_rate": 9.183768912079662e-05, + "loss": 0.011564332991838455, + "num_input_tokens_seen": 52059304, + "step": 3179, + "train_runtime": 25834.0235, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.9272727272727272, + "grad_norm": 0.007661936338990927, + "learning_rate": 9.183242274819205e-05, + "loss": 0.011691214516758919, + "num_input_tokens_seen": 52075680, + "step": 3180, + "train_runtime": 25842.1332, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.9278787878787877, + "grad_norm": 0.006727028172463179, + "learning_rate": 9.182715482828763e-05, + "loss": 0.011927913874387741, + "num_input_tokens_seen": 52092056, + "step": 3181, + "train_runtime": 25850.2436, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.9284848484848485, + "grad_norm": 0.007131942082196474, + "learning_rate": 9.18218853612782e-05, + "loss": 0.01202892605215311, + "num_input_tokens_seen": 52108432, + "step": 3182, + "train_runtime": 25858.354, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.9290909090909092, + "grad_norm": 0.018185211345553398, + "learning_rate": 9.181661434735867e-05, + "loss": 0.011910402216017246, + "num_input_tokens_seen": 52124808, + "step": 3183, + "train_runtime": 25866.4679, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.9296969696969697, + "grad_norm": 0.013236770406365395, + "learning_rate": 9.181134178672401e-05, + "loss": 0.011599000543355942, + "num_input_tokens_seen": 52141184, + "step": 3184, + "train_runtime": 25874.5825, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.9303030303030302, + "grad_norm": 0.007080638315528631, + "learning_rate": 9.180606767956925e-05, + "loss": 0.011230498552322388, + "num_input_tokens_seen": 52157560, + "step": 3185, + "train_runtime": 25882.7011, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.930909090909091, + "grad_norm": 0.006421273574233055, + "learning_rate": 9.180079202608947e-05, + "loss": 0.012318437919020653, + "num_input_tokens_seen": 52173936, + "step": 3186, + "train_runtime": 25890.8138, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.9315151515151516, + "grad_norm": 0.006406648550182581, + "learning_rate": 9.179551482647978e-05, + "loss": 0.012266881763935089, + "num_input_tokens_seen": 52190312, + "step": 3187, + "train_runtime": 25898.9317, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 1.9321212121212121, + "grad_norm": 0.008923706598579884, + "learning_rate": 9.17902360809354e-05, + "loss": 0.011904872953891754, + "num_input_tokens_seen": 52206688, + "step": 3188, + "train_runtime": 25907.0441, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 1.9327272727272726, + "grad_norm": 0.009994618594646454, + "learning_rate": 9.178495578965157e-05, + "loss": 0.01313931867480278, + "num_input_tokens_seen": 52223064, + "step": 3189, + "train_runtime": 25915.1572, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.010433097369968891, + "learning_rate": 9.177967395282359e-05, + "loss": 0.011210711672902107, + "num_input_tokens_seen": 52239440, + "step": 3190, + "train_runtime": 25923.2678, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 1.933939393939394, + "grad_norm": 0.01240938063710928, + "learning_rate": 9.177439057064683e-05, + "loss": 0.012704628519713879, + "num_input_tokens_seen": 52255816, + "step": 3191, + "train_runtime": 25931.3805, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 1.9345454545454546, + "grad_norm": 0.011206640861928463, + "learning_rate": 9.176910564331671e-05, + "loss": 0.012657486833631992, + "num_input_tokens_seen": 52272192, + "step": 3192, + "train_runtime": 25939.4924, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 1.935151515151515, + "grad_norm": 0.01022346131503582, + "learning_rate": 9.176381917102873e-05, + "loss": 0.011253394186496735, + "num_input_tokens_seen": 52288568, + "step": 3193, + "train_runtime": 25947.6051, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 1.9357575757575758, + "grad_norm": 0.01788550242781639, + "learning_rate": 9.17585311539784e-05, + "loss": 0.013412839733064175, + "num_input_tokens_seen": 52304944, + "step": 3194, + "train_runtime": 25955.7207, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 1.9363636363636365, + "grad_norm": 0.01697622984647751, + "learning_rate": 9.175324159236132e-05, + "loss": 0.01345901656895876, + "num_input_tokens_seen": 52321320, + "step": 3195, + "train_runtime": 25963.8327, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 1.936969696969697, + "grad_norm": 0.009710061363875866, + "learning_rate": 9.174795048637316e-05, + "loss": 0.012749395333230495, + "num_input_tokens_seen": 52337696, + "step": 3196, + "train_runtime": 25971.9426, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 1.9375757575757575, + "grad_norm": 0.011495089158415794, + "learning_rate": 9.174265783620961e-05, + "loss": 0.0130428122356534, + "num_input_tokens_seen": 52354072, + "step": 3197, + "train_runtime": 25980.0524, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 1.9381818181818182, + "grad_norm": 0.006617996841669083, + "learning_rate": 9.173736364206642e-05, + "loss": 0.01320140715688467, + "num_input_tokens_seen": 52370448, + "step": 3198, + "train_runtime": 25988.1659, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 1.938787878787879, + "grad_norm": 0.006989014334976673, + "learning_rate": 9.173206790413945e-05, + "loss": 0.011760172434151173, + "num_input_tokens_seen": 52386824, + "step": 3199, + "train_runtime": 25996.2801, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 0.035555772483348846, + "learning_rate": 9.172677062262453e-05, + "loss": 0.01268516480922699, + "num_input_tokens_seen": 52403200, + "step": 3200, + "train_runtime": 26004.3932, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 1.94, + "grad_norm": 0.007417343556880951, + "learning_rate": 9.172147179771765e-05, + "loss": 0.011998838745057583, + "num_input_tokens_seen": 52419576, + "step": 3201, + "train_runtime": 26013.4365, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.9406060606060604, + "grad_norm": 0.007505638990551233, + "learning_rate": 9.171617142961477e-05, + "loss": 0.012394100427627563, + "num_input_tokens_seen": 52435952, + "step": 3202, + "train_runtime": 26021.5523, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.9412121212121212, + "grad_norm": 0.012211352586746216, + "learning_rate": 9.171086951851194e-05, + "loss": 0.012769252061843872, + "num_input_tokens_seen": 52452328, + "step": 3203, + "train_runtime": 26029.6684, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.9418181818181819, + "grad_norm": 0.007897526025772095, + "learning_rate": 9.170556606460527e-05, + "loss": 0.011707163415849209, + "num_input_tokens_seen": 52468704, + "step": 3204, + "train_runtime": 26037.7862, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.9424242424242424, + "grad_norm": 0.012640303000807762, + "learning_rate": 9.170026106809095e-05, + "loss": 0.012289149686694145, + "num_input_tokens_seen": 52485080, + "step": 3205, + "train_runtime": 26045.8995, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.9430303030303029, + "grad_norm": 0.006511483807116747, + "learning_rate": 9.169495452916516e-05, + "loss": 0.01183705311268568, + "num_input_tokens_seen": 52501456, + "step": 3206, + "train_runtime": 26054.0156, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.9436363636363636, + "grad_norm": 0.013782680965960026, + "learning_rate": 9.168964644802422e-05, + "loss": 0.011273516342043877, + "num_input_tokens_seen": 52517832, + "step": 3207, + "train_runtime": 26062.134, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.9442424242424243, + "grad_norm": 0.008283359929919243, + "learning_rate": 9.168433682486444e-05, + "loss": 0.012856019660830498, + "num_input_tokens_seen": 52534208, + "step": 3208, + "train_runtime": 26070.2527, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.9448484848484848, + "grad_norm": 0.006199698895215988, + "learning_rate": 9.16790256598822e-05, + "loss": 0.011374854482710361, + "num_input_tokens_seen": 52550584, + "step": 3209, + "train_runtime": 26078.3698, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.9454545454545453, + "grad_norm": 0.004435536917299032, + "learning_rate": 9.167371295327399e-05, + "loss": 0.011426806449890137, + "num_input_tokens_seen": 52566960, + "step": 3210, + "train_runtime": 26086.4805, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.946060606060606, + "grad_norm": 0.00908664334565401, + "learning_rate": 9.166839870523627e-05, + "loss": 0.011732470244169235, + "num_input_tokens_seen": 52583336, + "step": 3211, + "train_runtime": 26094.5916, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.9466666666666668, + "grad_norm": 0.007592847105115652, + "learning_rate": 9.166308291596563e-05, + "loss": 0.012461712583899498, + "num_input_tokens_seen": 52599712, + "step": 3212, + "train_runtime": 26102.7074, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.9472727272727273, + "grad_norm": 0.013892276212573051, + "learning_rate": 9.16577655856587e-05, + "loss": 0.012362895533442497, + "num_input_tokens_seen": 52616088, + "step": 3213, + "train_runtime": 26110.8219, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.9478787878787878, + "grad_norm": 0.006196176167577505, + "learning_rate": 9.165244671451214e-05, + "loss": 0.011770393699407578, + "num_input_tokens_seen": 52632464, + "step": 3214, + "train_runtime": 26118.9426, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.9484848484848485, + "grad_norm": 0.007588802836835384, + "learning_rate": 9.16471263027227e-05, + "loss": 0.011993967927992344, + "num_input_tokens_seen": 52648840, + "step": 3215, + "train_runtime": 26127.0587, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.9490909090909092, + "grad_norm": 0.006540258880704641, + "learning_rate": 9.164180435048715e-05, + "loss": 0.012559541501104832, + "num_input_tokens_seen": 52665216, + "step": 3216, + "train_runtime": 26135.1757, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.9496969696969697, + "grad_norm": 0.007406895514577627, + "learning_rate": 9.163648085800236e-05, + "loss": 0.012432373128831387, + "num_input_tokens_seen": 52681592, + "step": 3217, + "train_runtime": 26143.2906, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.9503030303030302, + "grad_norm": 0.006238951813429594, + "learning_rate": 9.163115582546522e-05, + "loss": 0.012486881576478481, + "num_input_tokens_seen": 52697968, + "step": 3218, + "train_runtime": 26151.4015, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.950909090909091, + "grad_norm": 0.005955249071121216, + "learning_rate": 9.162582925307271e-05, + "loss": 0.012666616588830948, + "num_input_tokens_seen": 52714344, + "step": 3219, + "train_runtime": 26159.5147, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.9515151515151516, + "grad_norm": 0.01560470461845398, + "learning_rate": 9.162050114102184e-05, + "loss": 0.01188839040696621, + "num_input_tokens_seen": 52730720, + "step": 3220, + "train_runtime": 26167.6327, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.9521212121212121, + "grad_norm": 0.006458323448896408, + "learning_rate": 9.161517148950967e-05, + "loss": 0.011296706274151802, + "num_input_tokens_seen": 52747096, + "step": 3221, + "train_runtime": 26175.7479, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.9527272727272726, + "grad_norm": 0.006805418990552425, + "learning_rate": 9.160984029873334e-05, + "loss": 0.012141970917582512, + "num_input_tokens_seen": 52763472, + "step": 3222, + "train_runtime": 26183.8621, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.9533333333333334, + "grad_norm": 0.007566072512418032, + "learning_rate": 9.160450756889006e-05, + "loss": 0.01246306486427784, + "num_input_tokens_seen": 52779848, + "step": 3223, + "train_runtime": 26191.9843, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.953939393939394, + "grad_norm": 0.008537397719919682, + "learning_rate": 9.159917330017707e-05, + "loss": 0.012608175165951252, + "num_input_tokens_seen": 52796224, + "step": 3224, + "train_runtime": 26200.0967, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.9545454545454546, + "grad_norm": 0.008435552939772606, + "learning_rate": 9.159383749279167e-05, + "loss": 0.012503480538725853, + "num_input_tokens_seen": 52812600, + "step": 3225, + "train_runtime": 26208.214, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.955151515151515, + "grad_norm": 0.00943230465054512, + "learning_rate": 9.158850014693123e-05, + "loss": 0.012536080554127693, + "num_input_tokens_seen": 52828976, + "step": 3226, + "train_runtime": 26216.3339, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.9557575757575758, + "grad_norm": 0.010770905762910843, + "learning_rate": 9.158316126279314e-05, + "loss": 0.012846671044826508, + "num_input_tokens_seen": 52845352, + "step": 3227, + "train_runtime": 26224.4505, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.9563636363636365, + "grad_norm": 0.00734669528901577, + "learning_rate": 9.157782084057491e-05, + "loss": 0.011868518777191639, + "num_input_tokens_seen": 52861728, + "step": 3228, + "train_runtime": 26232.5655, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.956969696969697, + "grad_norm": 0.00950642116367817, + "learning_rate": 9.157247888047405e-05, + "loss": 0.011426198296248913, + "num_input_tokens_seen": 52878104, + "step": 3229, + "train_runtime": 26240.6832, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.9575757575757575, + "grad_norm": 0.0105692557990551, + "learning_rate": 9.156713538268815e-05, + "loss": 0.01276457030326128, + "num_input_tokens_seen": 52894480, + "step": 3230, + "train_runtime": 26248.7985, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.958181818181818, + "grad_norm": 0.012945852242410183, + "learning_rate": 9.156179034741486e-05, + "loss": 0.012674668803811073, + "num_input_tokens_seen": 52910856, + "step": 3231, + "train_runtime": 26256.9107, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.9587878787878787, + "grad_norm": 0.011738558299839497, + "learning_rate": 9.155644377485188e-05, + "loss": 0.012540457770228386, + "num_input_tokens_seen": 52927232, + "step": 3232, + "train_runtime": 26265.0321, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.9593939393939395, + "grad_norm": 0.007134211249649525, + "learning_rate": 9.1551095665197e-05, + "loss": 0.013350131921470165, + "num_input_tokens_seen": 52943608, + "step": 3233, + "train_runtime": 26273.1455, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.96, + "grad_norm": 0.004556073807179928, + "learning_rate": 9.154574601864799e-05, + "loss": 0.012090170755982399, + "num_input_tokens_seen": 52959984, + "step": 3234, + "train_runtime": 26281.2589, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.9606060606060605, + "grad_norm": 0.0194314606487751, + "learning_rate": 9.154039483540273e-05, + "loss": 0.013460388407111168, + "num_input_tokens_seen": 52976360, + "step": 3235, + "train_runtime": 26289.3732, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.9612121212121212, + "grad_norm": 0.011464192532002926, + "learning_rate": 9.153504211565917e-05, + "loss": 0.009950187988579273, + "num_input_tokens_seen": 52992736, + "step": 3236, + "train_runtime": 26297.4864, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.961818181818182, + "grad_norm": 0.00886337086558342, + "learning_rate": 9.152968785961529e-05, + "loss": 0.012892349623143673, + "num_input_tokens_seen": 53009112, + "step": 3237, + "train_runtime": 26305.6, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.9624242424242424, + "grad_norm": 0.008062895387411118, + "learning_rate": 9.152433206746913e-05, + "loss": 0.012557323090732098, + "num_input_tokens_seen": 53025488, + "step": 3238, + "train_runtime": 26313.7176, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.963030303030303, + "grad_norm": 0.009354766458272934, + "learning_rate": 9.151897473941879e-05, + "loss": 0.012314814142882824, + "num_input_tokens_seen": 53041864, + "step": 3239, + "train_runtime": 26321.8323, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.9636363636363636, + "grad_norm": 0.007434674073010683, + "learning_rate": 9.151361587566246e-05, + "loss": 0.012359886430203915, + "num_input_tokens_seen": 53058240, + "step": 3240, + "train_runtime": 26329.9456, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.9642424242424243, + "grad_norm": 0.010088525712490082, + "learning_rate": 9.150825547639827e-05, + "loss": 0.013497358188033104, + "num_input_tokens_seen": 53074616, + "step": 3241, + "train_runtime": 26338.0619, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.9648484848484848, + "grad_norm": 0.010220595635473728, + "learning_rate": 9.150289354182458e-05, + "loss": 0.01281267125159502, + "num_input_tokens_seen": 53090992, + "step": 3242, + "train_runtime": 26346.1755, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.9654545454545453, + "grad_norm": 0.007582931779325008, + "learning_rate": 9.149753007213966e-05, + "loss": 0.011935632675886154, + "num_input_tokens_seen": 53107368, + "step": 3243, + "train_runtime": 26354.2901, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.966060606060606, + "grad_norm": 0.03495870903134346, + "learning_rate": 9.149216506754192e-05, + "loss": 0.013317975215613842, + "num_input_tokens_seen": 53123744, + "step": 3244, + "train_runtime": 26362.4037, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.9666666666666668, + "grad_norm": 0.009395863860845566, + "learning_rate": 9.148679852822981e-05, + "loss": 0.012221533805131912, + "num_input_tokens_seen": 53140120, + "step": 3245, + "train_runtime": 26370.5241, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.9672727272727273, + "grad_norm": 0.011900022625923157, + "learning_rate": 9.14814304544018e-05, + "loss": 0.011450408957898617, + "num_input_tokens_seen": 53156496, + "step": 3246, + "train_runtime": 26378.6407, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.9678787878787878, + "grad_norm": 0.004700829740613699, + "learning_rate": 9.147606084625648e-05, + "loss": 0.012666188180446625, + "num_input_tokens_seen": 53172872, + "step": 3247, + "train_runtime": 26386.7572, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.9684848484848485, + "grad_norm": 0.00924025196582079, + "learning_rate": 9.147068970399242e-05, + "loss": 0.013658061623573303, + "num_input_tokens_seen": 53189248, + "step": 3248, + "train_runtime": 26394.8721, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.9690909090909092, + "grad_norm": 0.009269666858017445, + "learning_rate": 9.146531702780832e-05, + "loss": 0.012272707186639309, + "num_input_tokens_seen": 53205624, + "step": 3249, + "train_runtime": 26402.9859, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.00729750981554389, + "learning_rate": 9.145994281790287e-05, + "loss": 0.011746074073016644, + "num_input_tokens_seen": 53222000, + "step": 3250, + "train_runtime": 26411.1001, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.9703030303030302, + "grad_norm": 0.011676258407533169, + "learning_rate": 9.145456707447491e-05, + "loss": 0.01279502548277378, + "num_input_tokens_seen": 53238376, + "step": 3251, + "train_runtime": 26419.2149, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.970909090909091, + "grad_norm": 0.005959291011095047, + "learning_rate": 9.144918979772322e-05, + "loss": 0.011548520997166634, + "num_input_tokens_seen": 53254752, + "step": 3252, + "train_runtime": 26427.3347, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.9715151515151517, + "grad_norm": 0.010743229649960995, + "learning_rate": 9.144381098784671e-05, + "loss": 0.011153536848723888, + "num_input_tokens_seen": 53271128, + "step": 3253, + "train_runtime": 26435.4522, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.9721212121212122, + "grad_norm": 0.011109214276075363, + "learning_rate": 9.143843064504437e-05, + "loss": 0.011543774977326393, + "num_input_tokens_seen": 53287504, + "step": 3254, + "train_runtime": 26443.568, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.9727272727272727, + "grad_norm": 0.012104131281375885, + "learning_rate": 9.143304876951515e-05, + "loss": 0.012650061398744583, + "num_input_tokens_seen": 53303880, + "step": 3255, + "train_runtime": 26451.684, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.9733333333333334, + "grad_norm": 0.007510365452617407, + "learning_rate": 9.142766536145815e-05, + "loss": 0.01217754278331995, + "num_input_tokens_seen": 53320256, + "step": 3256, + "train_runtime": 26459.8042, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.973939393939394, + "grad_norm": 0.011189469136297703, + "learning_rate": 9.142228042107248e-05, + "loss": 0.012512904591858387, + "num_input_tokens_seen": 53336632, + "step": 3257, + "train_runtime": 26467.918, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.9745454545454546, + "grad_norm": 0.015042346902191639, + "learning_rate": 9.141689394855734e-05, + "loss": 0.012815610505640507, + "num_input_tokens_seen": 53353008, + "step": 3258, + "train_runtime": 26476.0346, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.975151515151515, + "grad_norm": 0.0030142359901219606, + "learning_rate": 9.141150594411195e-05, + "loss": 0.012168426997959614, + "num_input_tokens_seen": 53369384, + "step": 3259, + "train_runtime": 26484.1476, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.9757575757575756, + "grad_norm": 0.0037197330966591835, + "learning_rate": 9.140611640793558e-05, + "loss": 0.012724127620458603, + "num_input_tokens_seen": 53385760, + "step": 3260, + "train_runtime": 26492.262, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.9763636363636363, + "grad_norm": 0.009197385981678963, + "learning_rate": 9.14007253402276e-05, + "loss": 0.011984776705503464, + "num_input_tokens_seen": 53402136, + "step": 3261, + "train_runtime": 26500.3733, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.976969696969697, + "grad_norm": 0.00908108614385128, + "learning_rate": 9.139533274118743e-05, + "loss": 0.013469175435602665, + "num_input_tokens_seen": 53418512, + "step": 3262, + "train_runtime": 26508.4878, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.9775757575757575, + "grad_norm": 0.008263330906629562, + "learning_rate": 9.138993861101452e-05, + "loss": 0.012428317219018936, + "num_input_tokens_seen": 53434888, + "step": 3263, + "train_runtime": 26516.6014, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.978181818181818, + "grad_norm": 0.00935580488294363, + "learning_rate": 9.138454294990837e-05, + "loss": 0.012690423987805843, + "num_input_tokens_seen": 53451264, + "step": 3264, + "train_runtime": 26524.7165, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.9787878787878788, + "grad_norm": 0.006904932204633951, + "learning_rate": 9.137914575806856e-05, + "loss": 0.013632209971547127, + "num_input_tokens_seen": 53467640, + "step": 3265, + "train_runtime": 26532.8344, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.9793939393939395, + "grad_norm": 0.010423954576253891, + "learning_rate": 9.137374703569475e-05, + "loss": 0.012165880762040615, + "num_input_tokens_seen": 53484016, + "step": 3266, + "train_runtime": 26540.948, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.98, + "grad_norm": 0.011065114289522171, + "learning_rate": 9.13683467829866e-05, + "loss": 0.01318406593054533, + "num_input_tokens_seen": 53500392, + "step": 3267, + "train_runtime": 26549.0617, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.9806060606060605, + "grad_norm": 0.007728222757577896, + "learning_rate": 9.136294500014386e-05, + "loss": 0.013370378874242306, + "num_input_tokens_seen": 53516768, + "step": 3268, + "train_runtime": 26557.1762, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 1.9812121212121212, + "grad_norm": 0.010369036346673965, + "learning_rate": 9.135754168736635e-05, + "loss": 0.01195678859949112, + "num_input_tokens_seen": 53533144, + "step": 3269, + "train_runtime": 26565.2909, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 1.981818181818182, + "grad_norm": 0.007326615508645773, + "learning_rate": 9.135213684485389e-05, + "loss": 0.0117134815081954, + "num_input_tokens_seen": 53549520, + "step": 3270, + "train_runtime": 26573.4064, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 1.9824242424242424, + "grad_norm": 0.006437429692596197, + "learning_rate": 9.134673047280645e-05, + "loss": 0.011881757527589798, + "num_input_tokens_seen": 53565896, + "step": 3271, + "train_runtime": 26581.5229, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 1.983030303030303, + "grad_norm": 0.011852375231683254, + "learning_rate": 9.134132257142394e-05, + "loss": 0.013348350301384926, + "num_input_tokens_seen": 53582272, + "step": 3272, + "train_runtime": 26589.6392, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 1.9836363636363636, + "grad_norm": 0.008073097094893456, + "learning_rate": 9.133591314090643e-05, + "loss": 0.011144507676362991, + "num_input_tokens_seen": 53598648, + "step": 3273, + "train_runtime": 26597.7518, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 1.9842424242424244, + "grad_norm": 0.006733953952789307, + "learning_rate": 9.133050218145398e-05, + "loss": 0.012771431356668472, + "num_input_tokens_seen": 53615024, + "step": 3274, + "train_runtime": 26605.8616, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 1.9848484848484849, + "grad_norm": 0.005549773573875427, + "learning_rate": 9.132508969326675e-05, + "loss": 0.012543238699436188, + "num_input_tokens_seen": 53631400, + "step": 3275, + "train_runtime": 26613.98, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 1.9854545454545454, + "grad_norm": 0.006168725900352001, + "learning_rate": 9.131967567654493e-05, + "loss": 0.011709775775671005, + "num_input_tokens_seen": 53647776, + "step": 3276, + "train_runtime": 26622.0908, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 1.986060606060606, + "grad_norm": 0.008207867853343487, + "learning_rate": 9.131426013148876e-05, + "loss": 0.011277429759502411, + "num_input_tokens_seen": 53664152, + "step": 3277, + "train_runtime": 26630.2042, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 1.9866666666666668, + "grad_norm": 0.014408214017748833, + "learning_rate": 9.130884305829859e-05, + "loss": 0.01298760250210762, + "num_input_tokens_seen": 53680528, + "step": 3278, + "train_runtime": 26638.321, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 1.9872727272727273, + "grad_norm": 0.003766058012843132, + "learning_rate": 9.130342445717475e-05, + "loss": 0.011248024180531502, + "num_input_tokens_seen": 53696904, + "step": 3279, + "train_runtime": 26646.4374, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 1.9878787878787878, + "grad_norm": 0.006661458872258663, + "learning_rate": 9.129800432831767e-05, + "loss": 0.013307865709066391, + "num_input_tokens_seen": 53713280, + "step": 3280, + "train_runtime": 26654.5509, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 1.9884848484848485, + "grad_norm": 0.02135315351188183, + "learning_rate": 9.129258267192783e-05, + "loss": 0.012892954051494598, + "num_input_tokens_seen": 53729656, + "step": 3281, + "train_runtime": 26662.6636, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 1.9890909090909092, + "grad_norm": 0.01414970587939024, + "learning_rate": 9.128715948820576e-05, + "loss": 0.01163790188729763, + "num_input_tokens_seen": 53746032, + "step": 3282, + "train_runtime": 26670.7762, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 1.9896969696969697, + "grad_norm": 0.007704513147473335, + "learning_rate": 9.128173477735209e-05, + "loss": 0.01249206718057394, + "num_input_tokens_seen": 53762408, + "step": 3283, + "train_runtime": 26678.8915, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 1.9903030303030302, + "grad_norm": 0.01047444436699152, + "learning_rate": 9.127630853956743e-05, + "loss": 0.011483356356620789, + "num_input_tokens_seen": 53778784, + "step": 3284, + "train_runtime": 26687.0032, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 1.990909090909091, + "grad_norm": 0.013243939727544785, + "learning_rate": 9.12708807750525e-05, + "loss": 0.012570211663842201, + "num_input_tokens_seen": 53795160, + "step": 3285, + "train_runtime": 26695.1198, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 1.9915151515151515, + "grad_norm": 0.008042296394705772, + "learning_rate": 9.126545148400807e-05, + "loss": 0.011773718520998955, + "num_input_tokens_seen": 53811536, + "step": 3286, + "train_runtime": 26703.2386, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 1.9921212121212122, + "grad_norm": 0.010223199613392353, + "learning_rate": 9.126002066663492e-05, + "loss": 0.013065483421087265, + "num_input_tokens_seen": 53827912, + "step": 3287, + "train_runtime": 26711.3547, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 1.9927272727272727, + "grad_norm": 0.007627638056874275, + "learning_rate": 9.125458832313399e-05, + "loss": 0.012292512692511082, + "num_input_tokens_seen": 53844288, + "step": 3288, + "train_runtime": 26719.4688, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 1.9933333333333332, + "grad_norm": 0.006064161658287048, + "learning_rate": 9.124915445370617e-05, + "loss": 0.01392812468111515, + "num_input_tokens_seen": 53860664, + "step": 3289, + "train_runtime": 26727.5878, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 1.993939393939394, + "grad_norm": 0.007689913269132376, + "learning_rate": 9.124371905855244e-05, + "loss": 0.012879272922873497, + "num_input_tokens_seen": 53877040, + "step": 3290, + "train_runtime": 26735.7022, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 1.9945454545454546, + "grad_norm": 0.00797196477651596, + "learning_rate": 9.123828213787389e-05, + "loss": 0.013234483078122139, + "num_input_tokens_seen": 53893416, + "step": 3291, + "train_runtime": 26743.8161, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 1.9951515151515151, + "grad_norm": 0.007768760900944471, + "learning_rate": 9.123284369187157e-05, + "loss": 0.011896961368620396, + "num_input_tokens_seen": 53909792, + "step": 3292, + "train_runtime": 26751.9335, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 1.9957575757575756, + "grad_norm": 0.008472788147628307, + "learning_rate": 9.122740372074665e-05, + "loss": 0.012093445286154747, + "num_input_tokens_seen": 53926168, + "step": 3293, + "train_runtime": 26760.0478, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 1.9963636363636363, + "grad_norm": 0.013480499386787415, + "learning_rate": 9.122196222470036e-05, + "loss": 0.012600832618772984, + "num_input_tokens_seen": 53942544, + "step": 3294, + "train_runtime": 26768.1606, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 1.996969696969697, + "grad_norm": 0.018711727112531662, + "learning_rate": 9.121651920393399e-05, + "loss": 0.012485871091485023, + "num_input_tokens_seen": 53958920, + "step": 3295, + "train_runtime": 26776.2771, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 1.9975757575757576, + "grad_norm": 0.007433328777551651, + "learning_rate": 9.121107465864882e-05, + "loss": 0.01180267333984375, + "num_input_tokens_seen": 53975296, + "step": 3296, + "train_runtime": 26784.391, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 1.998181818181818, + "grad_norm": 0.0183311365544796, + "learning_rate": 9.120562858904624e-05, + "loss": 0.013096505776047707, + "num_input_tokens_seen": 53991672, + "step": 3297, + "train_runtime": 26792.504, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 1.9987878787878788, + "grad_norm": 0.010210291482508183, + "learning_rate": 9.120018099532773e-05, + "loss": 0.012559071183204651, + "num_input_tokens_seen": 54008048, + "step": 3298, + "train_runtime": 26800.6169, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 1.9993939393939395, + "grad_norm": 0.01029531005769968, + "learning_rate": 9.119473187769474e-05, + "loss": 0.011382916942238808, + "num_input_tokens_seen": 54024424, + "step": 3299, + "train_runtime": 26808.7324, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.0, + "grad_norm": 0.011802438646554947, + "learning_rate": 9.118928123634885e-05, + "loss": 0.011201423592865467, + "num_input_tokens_seen": 54040800, + "step": 3300, + "train_runtime": 26816.8424, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.0006060606060605, + "grad_norm": 0.007276556454598904, + "learning_rate": 9.118382907149165e-05, + "loss": 0.012055739760398865, + "num_input_tokens_seen": 54057176, + "step": 3301, + "train_runtime": 26825.9949, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 2.001212121212121, + "grad_norm": 0.010496939532458782, + "learning_rate": 9.117837538332481e-05, + "loss": 0.012779267504811287, + "num_input_tokens_seen": 54073552, + "step": 3302, + "train_runtime": 26834.1084, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 2.001818181818182, + "grad_norm": 0.008761009201407433, + "learning_rate": 9.117292017205007e-05, + "loss": 0.012465574778616428, + "num_input_tokens_seen": 54089928, + "step": 3303, + "train_runtime": 26842.2204, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 2.0024242424242424, + "grad_norm": 0.02535530924797058, + "learning_rate": 9.116746343786919e-05, + "loss": 0.013483214192092419, + "num_input_tokens_seen": 54106304, + "step": 3304, + "train_runtime": 26850.336, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 2.003030303030303, + "grad_norm": 0.009476523846387863, + "learning_rate": 9.1162005180984e-05, + "loss": 0.011629991233348846, + "num_input_tokens_seen": 54122680, + "step": 3305, + "train_runtime": 26858.4529, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 2.0036363636363634, + "grad_norm": 0.006627608090639114, + "learning_rate": 9.115654540159641e-05, + "loss": 0.012818768620491028, + "num_input_tokens_seen": 54139056, + "step": 3306, + "train_runtime": 26866.565, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 2.0042424242424244, + "grad_norm": 0.004400262143462896, + "learning_rate": 9.115108409990833e-05, + "loss": 0.01134046632796526, + "num_input_tokens_seen": 54155432, + "step": 3307, + "train_runtime": 26874.676, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 2.004848484848485, + "grad_norm": 0.0066299899481236935, + "learning_rate": 9.114562127612181e-05, + "loss": 0.011135777458548546, + "num_input_tokens_seen": 54171808, + "step": 3308, + "train_runtime": 26882.7915, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 2.0054545454545454, + "grad_norm": 0.02751355618238449, + "learning_rate": 9.11401569304389e-05, + "loss": 0.01189148798584938, + "num_input_tokens_seen": 54188184, + "step": 3309, + "train_runtime": 26890.9074, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 2.006060606060606, + "grad_norm": 0.008675160817801952, + "learning_rate": 9.113469106306167e-05, + "loss": 0.01138359121978283, + "num_input_tokens_seen": 54204560, + "step": 3310, + "train_runtime": 26899.0207, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 2.006666666666667, + "grad_norm": 0.018375243991613388, + "learning_rate": 9.112922367419234e-05, + "loss": 0.01198117621243, + "num_input_tokens_seen": 54220936, + "step": 3311, + "train_runtime": 26907.1346, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 2.0072727272727273, + "grad_norm": 0.01007237657904625, + "learning_rate": 9.112375476403312e-05, + "loss": 0.011535527184605598, + "num_input_tokens_seen": 54237312, + "step": 3312, + "train_runtime": 26915.2469, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 2.007878787878788, + "grad_norm": 0.010128041729331017, + "learning_rate": 9.111828433278628e-05, + "loss": 0.012508670799434185, + "num_input_tokens_seen": 54253688, + "step": 3313, + "train_runtime": 26923.3578, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 2.0084848484848483, + "grad_norm": 0.026225844398140907, + "learning_rate": 9.11128123806542e-05, + "loss": 0.011080056428909302, + "num_input_tokens_seen": 54270064, + "step": 3314, + "train_runtime": 26931.4726, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 2.0090909090909093, + "grad_norm": 0.00960629153996706, + "learning_rate": 9.110733890783925e-05, + "loss": 0.012581977993249893, + "num_input_tokens_seen": 54286440, + "step": 3315, + "train_runtime": 26939.5865, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 2.0096969696969698, + "grad_norm": 0.005847670137882233, + "learning_rate": 9.110186391454389e-05, + "loss": 0.011724804528057575, + "num_input_tokens_seen": 54302816, + "step": 3316, + "train_runtime": 26947.6966, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 2.0103030303030303, + "grad_norm": 0.026506055146455765, + "learning_rate": 9.109638740097062e-05, + "loss": 0.011649670079350471, + "num_input_tokens_seen": 54319192, + "step": 3317, + "train_runtime": 26955.8123, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 2.0109090909090908, + "grad_norm": 0.009331312030553818, + "learning_rate": 9.1090909367322e-05, + "loss": 0.01132029015570879, + "num_input_tokens_seen": 54335568, + "step": 3318, + "train_runtime": 26963.9355, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 2.0115151515151517, + "grad_norm": 0.007260499056428671, + "learning_rate": 9.108542981380067e-05, + "loss": 0.01217691320925951, + "num_input_tokens_seen": 54351944, + "step": 3319, + "train_runtime": 26972.0564, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 2.012121212121212, + "grad_norm": 0.00819828175008297, + "learning_rate": 9.10799487406093e-05, + "loss": 0.011852500028908253, + "num_input_tokens_seen": 54368320, + "step": 3320, + "train_runtime": 26980.1718, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 2.0127272727272727, + "grad_norm": 0.015189445577561855, + "learning_rate": 9.107446614795063e-05, + "loss": 0.013060184195637703, + "num_input_tokens_seen": 54384696, + "step": 3321, + "train_runtime": 26988.2907, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.013333333333333, + "grad_norm": 0.007456401828676462, + "learning_rate": 9.106898203602745e-05, + "loss": 0.012429913505911827, + "num_input_tokens_seen": 54401072, + "step": 3322, + "train_runtime": 26996.4101, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.013939393939394, + "grad_norm": 0.009956259280443192, + "learning_rate": 9.10634964050426e-05, + "loss": 0.011429233476519585, + "num_input_tokens_seen": 54417448, + "step": 3323, + "train_runtime": 27004.5332, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 2.0145454545454546, + "grad_norm": 0.024274544790387154, + "learning_rate": 9.105800925519898e-05, + "loss": 0.01382430363446474, + "num_input_tokens_seen": 54433824, + "step": 3324, + "train_runtime": 27012.6509, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 2.015151515151515, + "grad_norm": 0.007276281714439392, + "learning_rate": 9.105252058669957e-05, + "loss": 0.012992753647267818, + "num_input_tokens_seen": 54450200, + "step": 3325, + "train_runtime": 27020.7716, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.0157575757575756, + "grad_norm": 0.006913206540048122, + "learning_rate": 9.104703039974736e-05, + "loss": 0.012510275468230247, + "num_input_tokens_seen": 54466576, + "step": 3326, + "train_runtime": 27028.8892, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.0163636363636366, + "grad_norm": 0.011480524204671383, + "learning_rate": 9.104153869454543e-05, + "loss": 0.01283776294440031, + "num_input_tokens_seen": 54482952, + "step": 3327, + "train_runtime": 27037.0067, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.016969696969697, + "grad_norm": 0.006622251123189926, + "learning_rate": 9.10360454712969e-05, + "loss": 0.011710776016116142, + "num_input_tokens_seen": 54499328, + "step": 3328, + "train_runtime": 27045.1199, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.0175757575757576, + "grad_norm": 0.006461408920586109, + "learning_rate": 9.103055073020497e-05, + "loss": 0.011920584365725517, + "num_input_tokens_seen": 54515704, + "step": 3329, + "train_runtime": 27053.2408, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.018181818181818, + "grad_norm": 0.0003032787353731692, + "learning_rate": 9.102505447147287e-05, + "loss": 0.012296212837100029, + "num_input_tokens_seen": 54532080, + "step": 3330, + "train_runtime": 27061.3579, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.0187878787878786, + "grad_norm": 0.009282547980546951, + "learning_rate": 9.101955669530391e-05, + "loss": 0.012170149944722652, + "num_input_tokens_seen": 54548456, + "step": 3331, + "train_runtime": 27069.4762, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.0193939393939395, + "grad_norm": 0.007768985815346241, + "learning_rate": 9.101405740190141e-05, + "loss": 0.011895807459950447, + "num_input_tokens_seen": 54564832, + "step": 3332, + "train_runtime": 27077.5966, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.02, + "grad_norm": 0.024832775816321373, + "learning_rate": 9.10085565914688e-05, + "loss": 0.013282284140586853, + "num_input_tokens_seen": 54581208, + "step": 3333, + "train_runtime": 27085.7138, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.0206060606060605, + "grad_norm": 0.007753964047878981, + "learning_rate": 9.100305426420956e-05, + "loss": 0.012050673365592957, + "num_input_tokens_seen": 54597584, + "step": 3334, + "train_runtime": 27093.833, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.021212121212121, + "grad_norm": 0.04361976683139801, + "learning_rate": 9.099755042032718e-05, + "loss": 0.012513071298599243, + "num_input_tokens_seen": 54613960, + "step": 3335, + "train_runtime": 27101.9434, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.021818181818182, + "grad_norm": 0.010623808018863201, + "learning_rate": 9.099204506002525e-05, + "loss": 0.01084177102893591, + "num_input_tokens_seen": 54630336, + "step": 3336, + "train_runtime": 27110.0619, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.0224242424242425, + "grad_norm": 0.011434576474130154, + "learning_rate": 9.09865381835074e-05, + "loss": 0.012685502879321575, + "num_input_tokens_seen": 54646712, + "step": 3337, + "train_runtime": 27118.1774, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.023030303030303, + "grad_norm": 0.005738785490393639, + "learning_rate": 9.098102979097733e-05, + "loss": 0.011825205758213997, + "num_input_tokens_seen": 54663088, + "step": 3338, + "train_runtime": 27126.296, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.0236363636363635, + "grad_norm": 0.01584089733660221, + "learning_rate": 9.097551988263877e-05, + "loss": 0.013741337694227695, + "num_input_tokens_seen": 54679464, + "step": 3339, + "train_runtime": 27134.4219, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.0242424242424244, + "grad_norm": 0.0060920617543160915, + "learning_rate": 9.097000845869553e-05, + "loss": 0.012348243035376072, + "num_input_tokens_seen": 54695840, + "step": 3340, + "train_runtime": 27142.5426, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.024848484848485, + "grad_norm": 0.005727679468691349, + "learning_rate": 9.096449551935144e-05, + "loss": 0.011096533387899399, + "num_input_tokens_seen": 54712216, + "step": 3341, + "train_runtime": 27150.6608, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.0254545454545454, + "grad_norm": 0.0022178071085363626, + "learning_rate": 9.095898106481045e-05, + "loss": 0.011531295254826546, + "num_input_tokens_seen": 54728592, + "step": 3342, + "train_runtime": 27158.7798, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.026060606060606, + "grad_norm": 0.003265876555815339, + "learning_rate": 9.095346509527652e-05, + "loss": 0.012010122649371624, + "num_input_tokens_seen": 54744968, + "step": 3343, + "train_runtime": 27166.8861, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 0.00931676384061575, + "learning_rate": 9.094794761095366e-05, + "loss": 0.011602475307881832, + "num_input_tokens_seen": 54761344, + "step": 3344, + "train_runtime": 27174.9978, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.0272727272727273, + "grad_norm": 0.029080282896757126, + "learning_rate": 9.094242861204599e-05, + "loss": 0.012969661504030228, + "num_input_tokens_seen": 54777720, + "step": 3345, + "train_runtime": 27183.1125, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.027878787878788, + "grad_norm": 0.013951561413705349, + "learning_rate": 9.093690809875758e-05, + "loss": 0.013166350312530994, + "num_input_tokens_seen": 54794096, + "step": 3346, + "train_runtime": 27191.2325, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.0284848484848483, + "grad_norm": 0.006821845192462206, + "learning_rate": 9.093138607129268e-05, + "loss": 0.011342315934598446, + "num_input_tokens_seen": 54810472, + "step": 3347, + "train_runtime": 27199.35, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 2.0290909090909093, + "grad_norm": 0.010899233631789684, + "learning_rate": 9.092586252985551e-05, + "loss": 0.012500293552875519, + "num_input_tokens_seen": 54826848, + "step": 3348, + "train_runtime": 27207.4661, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.0296969696969698, + "grad_norm": 0.007849505171179771, + "learning_rate": 9.092033747465039e-05, + "loss": 0.012547525577247143, + "num_input_tokens_seen": 54843224, + "step": 3349, + "train_runtime": 27215.5858, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.0303030303030303, + "grad_norm": 0.007524041458964348, + "learning_rate": 9.091481090588166e-05, + "loss": 0.011828714981675148, + "num_input_tokens_seen": 54859600, + "step": 3350, + "train_runtime": 27223.7056, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.0309090909090908, + "grad_norm": 0.009752164594829082, + "learning_rate": 9.090928282375378e-05, + "loss": 0.011578820645809174, + "num_input_tokens_seen": 54875976, + "step": 3351, + "train_runtime": 27231.8332, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.0315151515151517, + "grad_norm": 0.06355761736631393, + "learning_rate": 9.090375322847118e-05, + "loss": 0.011825401335954666, + "num_input_tokens_seen": 54892352, + "step": 3352, + "train_runtime": 27239.9532, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.032121212121212, + "grad_norm": 0.007345013786107302, + "learning_rate": 9.089822212023839e-05, + "loss": 0.011034861207008362, + "num_input_tokens_seen": 54908728, + "step": 3353, + "train_runtime": 27248.0725, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.0327272727272727, + "grad_norm": 0.0073317899368703365, + "learning_rate": 9.089268949926004e-05, + "loss": 0.01266011968255043, + "num_input_tokens_seen": 54925104, + "step": 3354, + "train_runtime": 27256.1914, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.033333333333333, + "grad_norm": 0.0070832595229148865, + "learning_rate": 9.088715536574071e-05, + "loss": 0.011928196996450424, + "num_input_tokens_seen": 54941480, + "step": 3355, + "train_runtime": 27264.3078, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.033939393939394, + "grad_norm": 0.011500328779220581, + "learning_rate": 9.088161971988516e-05, + "loss": 0.011790191754698753, + "num_input_tokens_seen": 54957856, + "step": 3356, + "train_runtime": 27272.4332, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.0345454545454547, + "grad_norm": 0.00981878861784935, + "learning_rate": 9.087608256189808e-05, + "loss": 0.012370465323328972, + "num_input_tokens_seen": 54974232, + "step": 3357, + "train_runtime": 27280.5519, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.035151515151515, + "grad_norm": 0.011719790287315845, + "learning_rate": 9.087054389198432e-05, + "loss": 0.012797150760889053, + "num_input_tokens_seen": 54990608, + "step": 3358, + "train_runtime": 27288.6627, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.0357575757575757, + "grad_norm": 0.005948623642325401, + "learning_rate": 9.086500371034874e-05, + "loss": 0.012527494691312313, + "num_input_tokens_seen": 55006984, + "step": 3359, + "train_runtime": 27296.776, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.036363636363636, + "grad_norm": 0.009340680204331875, + "learning_rate": 9.085946201719625e-05, + "loss": 0.011543444357812405, + "num_input_tokens_seen": 55023360, + "step": 3360, + "train_runtime": 27304.8956, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.036969696969697, + "grad_norm": 0.007699036505073309, + "learning_rate": 9.085391881273182e-05, + "loss": 0.011673328466713428, + "num_input_tokens_seen": 55039736, + "step": 3361, + "train_runtime": 27313.0081, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.0375757575757576, + "grad_norm": 0.008742819540202618, + "learning_rate": 9.084837409716051e-05, + "loss": 0.012066803872585297, + "num_input_tokens_seen": 55056112, + "step": 3362, + "train_runtime": 27321.1237, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.038181818181818, + "grad_norm": 0.006048336159437895, + "learning_rate": 9.084282787068739e-05, + "loss": 0.012774009257555008, + "num_input_tokens_seen": 55072488, + "step": 3363, + "train_runtime": 27329.2452, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.0387878787878786, + "grad_norm": 0.007238797843456268, + "learning_rate": 9.083728013351758e-05, + "loss": 0.011799037456512451, + "num_input_tokens_seen": 55088864, + "step": 3364, + "train_runtime": 27337.3642, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.0393939393939395, + "grad_norm": 0.021580960601568222, + "learning_rate": 9.083173088585632e-05, + "loss": 0.011892813257873058, + "num_input_tokens_seen": 55105240, + "step": 3365, + "train_runtime": 27345.484, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.04, + "grad_norm": 0.009234655648469925, + "learning_rate": 9.082618012790886e-05, + "loss": 0.011318245902657509, + "num_input_tokens_seen": 55121616, + "step": 3366, + "train_runtime": 27353.6041, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.0406060606060605, + "grad_norm": 0.012906300835311413, + "learning_rate": 9.082062785988049e-05, + "loss": 0.012823051773011684, + "num_input_tokens_seen": 55137992, + "step": 3367, + "train_runtime": 27361.723, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.041212121212121, + "grad_norm": 0.0036449427716434, + "learning_rate": 9.08150740819766e-05, + "loss": 0.01129306573420763, + "num_input_tokens_seen": 55154368, + "step": 3368, + "train_runtime": 27369.8385, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.041818181818182, + "grad_norm": 0.012543014250695705, + "learning_rate": 9.08095187944026e-05, + "loss": 0.013440349139273167, + "num_input_tokens_seen": 55170744, + "step": 3369, + "train_runtime": 27377.953, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.0424242424242425, + "grad_norm": 0.0104695875197649, + "learning_rate": 9.080396199736396e-05, + "loss": 0.012156671844422817, + "num_input_tokens_seen": 55187120, + "step": 3370, + "train_runtime": 27386.0679, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.043030303030303, + "grad_norm": 0.0225234292447567, + "learning_rate": 9.079840369106625e-05, + "loss": 0.011554519645869732, + "num_input_tokens_seen": 55203496, + "step": 3371, + "train_runtime": 27394.1921, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.0436363636363635, + "grad_norm": 0.017583031207323074, + "learning_rate": 9.079284387571503e-05, + "loss": 0.012483416125178337, + "num_input_tokens_seen": 55219872, + "step": 3372, + "train_runtime": 27402.313, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.0442424242424244, + "grad_norm": 0.00885077379643917, + "learning_rate": 9.078728255151594e-05, + "loss": 0.011692331172525883, + "num_input_tokens_seen": 55236248, + "step": 3373, + "train_runtime": 27410.433, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.044848484848485, + "grad_norm": 0.004110273905098438, + "learning_rate": 9.078171971867471e-05, + "loss": 0.012116055004298687, + "num_input_tokens_seen": 55252624, + "step": 3374, + "train_runtime": 27418.5477, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.011093060486018658, + "learning_rate": 9.077615537739709e-05, + "loss": 0.01290032360702753, + "num_input_tokens_seen": 55269000, + "step": 3375, + "train_runtime": 27426.6586, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.046060606060606, + "grad_norm": 0.007847296074032784, + "learning_rate": 9.077058952788888e-05, + "loss": 0.013083033263683319, + "num_input_tokens_seen": 55285376, + "step": 3376, + "train_runtime": 27434.772, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.046666666666667, + "grad_norm": 0.006762146949768066, + "learning_rate": 9.076502217035597e-05, + "loss": 0.013601238839328289, + "num_input_tokens_seen": 55301752, + "step": 3377, + "train_runtime": 27442.8879, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.0472727272727274, + "grad_norm": 0.006784932222217321, + "learning_rate": 9.075945330500428e-05, + "loss": 0.01189483143389225, + "num_input_tokens_seen": 55318128, + "step": 3378, + "train_runtime": 27451.0075, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.047878787878788, + "grad_norm": 0.007549419533461332, + "learning_rate": 9.075388293203978e-05, + "loss": 0.01299357507377863, + "num_input_tokens_seen": 55334504, + "step": 3379, + "train_runtime": 27459.1348, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.0484848484848484, + "grad_norm": 0.006521687377244234, + "learning_rate": 9.074831105166852e-05, + "loss": 0.013047239743173122, + "num_input_tokens_seen": 55350880, + "step": 3380, + "train_runtime": 27467.2501, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.0490909090909093, + "grad_norm": 0.008688780479133129, + "learning_rate": 9.074273766409657e-05, + "loss": 0.013033932074904442, + "num_input_tokens_seen": 55367256, + "step": 3381, + "train_runtime": 27475.3662, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.04969696969697, + "grad_norm": 0.005272239912301302, + "learning_rate": 9.073716276953012e-05, + "loss": 0.013080219738185406, + "num_input_tokens_seen": 55383632, + "step": 3382, + "train_runtime": 27483.4813, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.0503030303030303, + "grad_norm": 0.008483667857944965, + "learning_rate": 9.073158636817535e-05, + "loss": 0.010860172100365162, + "num_input_tokens_seen": 55400008, + "step": 3383, + "train_runtime": 27491.6, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.050909090909091, + "grad_norm": 0.0050791422836482525, + "learning_rate": 9.07260084602385e-05, + "loss": 0.011222576722502708, + "num_input_tokens_seen": 55416384, + "step": 3384, + "train_runtime": 27499.717, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.0515151515151517, + "grad_norm": 0.00709482142701745, + "learning_rate": 9.072042904592593e-05, + "loss": 0.011624621227383614, + "num_input_tokens_seen": 55432760, + "step": 3385, + "train_runtime": 27507.8357, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.0521212121212122, + "grad_norm": 0.004702972248196602, + "learning_rate": 9.071484812544398e-05, + "loss": 0.011560735292732716, + "num_input_tokens_seen": 55449136, + "step": 3386, + "train_runtime": 27515.9538, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.0527272727272727, + "grad_norm": 0.008912491612136364, + "learning_rate": 9.070926569899909e-05, + "loss": 0.011354231275618076, + "num_input_tokens_seen": 55465512, + "step": 3387, + "train_runtime": 27524.0687, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.0533333333333332, + "grad_norm": 0.0077764480374753475, + "learning_rate": 9.070368176679774e-05, + "loss": 0.012655006721615791, + "num_input_tokens_seen": 55481888, + "step": 3388, + "train_runtime": 27532.1814, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.0539393939393937, + "grad_norm": 0.015883052721619606, + "learning_rate": 9.069809632904646e-05, + "loss": 0.012338697910308838, + "num_input_tokens_seen": 55498264, + "step": 3389, + "train_runtime": 27540.3008, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.0545454545454547, + "grad_norm": 0.01027887687087059, + "learning_rate": 9.069250938595185e-05, + "loss": 0.012151487171649933, + "num_input_tokens_seen": 55514640, + "step": 3390, + "train_runtime": 27548.4228, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.055151515151515, + "grad_norm": 0.0059898607432842255, + "learning_rate": 9.068692093772058e-05, + "loss": 0.011957871727645397, + "num_input_tokens_seen": 55531016, + "step": 3391, + "train_runtime": 27556.5422, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.0557575757575757, + "grad_norm": 0.01387920044362545, + "learning_rate": 9.068133098455932e-05, + "loss": 0.01215735636651516, + "num_input_tokens_seen": 55547392, + "step": 3392, + "train_runtime": 27564.6577, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.056363636363636, + "grad_norm": 0.006400204263627529, + "learning_rate": 9.067573952667486e-05, + "loss": 0.012007832527160645, + "num_input_tokens_seen": 55563768, + "step": 3393, + "train_runtime": 27572.7756, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.056969696969697, + "grad_norm": 0.009694559499621391, + "learning_rate": 9.067014656427401e-05, + "loss": 0.011804431676864624, + "num_input_tokens_seen": 55580144, + "step": 3394, + "train_runtime": 27580.8899, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.0575757575757576, + "grad_norm": 0.010956378653645515, + "learning_rate": 9.066455209756364e-05, + "loss": 0.012839428149163723, + "num_input_tokens_seen": 55596520, + "step": 3395, + "train_runtime": 27589.0075, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.058181818181818, + "grad_norm": 0.008832655847072601, + "learning_rate": 9.065895612675066e-05, + "loss": 0.011447408236563206, + "num_input_tokens_seen": 55612896, + "step": 3396, + "train_runtime": 27597.1214, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.0587878787878786, + "grad_norm": 0.013783660717308521, + "learning_rate": 9.06533586520421e-05, + "loss": 0.012833976186811924, + "num_input_tokens_seen": 55629272, + "step": 3397, + "train_runtime": 27605.2398, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.0593939393939396, + "grad_norm": 0.00604918971657753, + "learning_rate": 9.064775967364495e-05, + "loss": 0.010695607401430607, + "num_input_tokens_seen": 55645648, + "step": 3398, + "train_runtime": 27613.3528, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.06, + "grad_norm": 0.010326673276722431, + "learning_rate": 9.064215919176634e-05, + "loss": 0.01307748258113861, + "num_input_tokens_seen": 55662024, + "step": 3399, + "train_runtime": 27621.4707, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.0606060606060606, + "grad_norm": 0.00769348070025444, + "learning_rate": 9.06365572066134e-05, + "loss": 0.011743209324777126, + "num_input_tokens_seen": 55678400, + "step": 3400, + "train_runtime": 27629.5868, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.061212121212121, + "grad_norm": 0.019033854827284813, + "learning_rate": 9.063095371839337e-05, + "loss": 0.012079644948244095, + "num_input_tokens_seen": 55694776, + "step": 3401, + "train_runtime": 27638.5839, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 2.061818181818182, + "grad_norm": 0.008042428642511368, + "learning_rate": 9.062534872731346e-05, + "loss": 0.011539160273969173, + "num_input_tokens_seen": 55711152, + "step": 3402, + "train_runtime": 27646.693, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 2.0624242424242425, + "grad_norm": 0.007993514649569988, + "learning_rate": 9.061974223358101e-05, + "loss": 0.012027964927256107, + "num_input_tokens_seen": 55727528, + "step": 3403, + "train_runtime": 27654.8055, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 2.063030303030303, + "grad_norm": 0.0052061243914067745, + "learning_rate": 9.061413423740342e-05, + "loss": 0.011442933231592178, + "num_input_tokens_seen": 55743904, + "step": 3404, + "train_runtime": 27662.9148, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 2.0636363636363635, + "grad_norm": 0.010407760739326477, + "learning_rate": 9.060852473898808e-05, + "loss": 0.012942980974912643, + "num_input_tokens_seen": 55760280, + "step": 3405, + "train_runtime": 27671.0408, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 2.0642424242424244, + "grad_norm": 0.012654558755457401, + "learning_rate": 9.060291373854251e-05, + "loss": 0.013148204423487186, + "num_input_tokens_seen": 55776656, + "step": 3406, + "train_runtime": 27679.1555, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 2.064848484848485, + "grad_norm": 0.012586663477122784, + "learning_rate": 9.05973012362742e-05, + "loss": 0.012279224582016468, + "num_input_tokens_seen": 55793032, + "step": 3407, + "train_runtime": 27687.2688, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 2.0654545454545454, + "grad_norm": 0.011190484277904034, + "learning_rate": 9.059168723239081e-05, + "loss": 0.011963452212512493, + "num_input_tokens_seen": 55809408, + "step": 3408, + "train_runtime": 27695.3819, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 2.066060606060606, + "grad_norm": 0.009318447671830654, + "learning_rate": 9.058607172709994e-05, + "loss": 0.011981315910816193, + "num_input_tokens_seen": 55825784, + "step": 3409, + "train_runtime": 27703.4932, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.009665393270552158, + "learning_rate": 9.058045472060931e-05, + "loss": 0.011912458576261997, + "num_input_tokens_seen": 55842160, + "step": 3410, + "train_runtime": 27711.6106, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 2.0672727272727274, + "grad_norm": 0.005949472542852163, + "learning_rate": 9.057483621312671e-05, + "loss": 0.012273924425244331, + "num_input_tokens_seen": 55858536, + "step": 3411, + "train_runtime": 27719.7219, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 2.067878787878788, + "grad_norm": 0.008634460158646107, + "learning_rate": 9.056921620485992e-05, + "loss": 0.01283192541450262, + "num_input_tokens_seen": 55874912, + "step": 3412, + "train_runtime": 27727.8332, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 2.0684848484848484, + "grad_norm": 0.0065974947065114975, + "learning_rate": 9.056359469601683e-05, + "loss": 0.012668941169977188, + "num_input_tokens_seen": 55891288, + "step": 3413, + "train_runtime": 27735.9456, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 2.0690909090909093, + "grad_norm": 0.007294429000467062, + "learning_rate": 9.055797168680538e-05, + "loss": 0.01187070831656456, + "num_input_tokens_seen": 55907664, + "step": 3414, + "train_runtime": 27744.0589, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.06969696969697, + "grad_norm": 0.009185160510241985, + "learning_rate": 9.055234717743351e-05, + "loss": 0.012446683831512928, + "num_input_tokens_seen": 55924040, + "step": 3415, + "train_runtime": 27752.1797, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.0703030303030303, + "grad_norm": 0.006855017505586147, + "learning_rate": 9.054672116810932e-05, + "loss": 0.011901344172656536, + "num_input_tokens_seen": 55940416, + "step": 3416, + "train_runtime": 27760.2968, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 2.070909090909091, + "grad_norm": 0.011837942525744438, + "learning_rate": 9.054109365904085e-05, + "loss": 0.012001942843198776, + "num_input_tokens_seen": 55956792, + "step": 3417, + "train_runtime": 27768.4105, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.0715151515151513, + "grad_norm": 0.0068351225927472115, + "learning_rate": 9.053546465043629e-05, + "loss": 0.01226651668548584, + "num_input_tokens_seen": 55973168, + "step": 3418, + "train_runtime": 27776.5321, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.0721212121212123, + "grad_norm": 0.007168815471231937, + "learning_rate": 9.052983414250382e-05, + "loss": 0.01221819780766964, + "num_input_tokens_seen": 55989544, + "step": 3419, + "train_runtime": 27784.6461, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.0727272727272728, + "grad_norm": 0.0102562690153718, + "learning_rate": 9.052420213545172e-05, + "loss": 0.014302713796496391, + "num_input_tokens_seen": 56005920, + "step": 3420, + "train_runtime": 27792.7619, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.0733333333333333, + "grad_norm": 0.009133314713835716, + "learning_rate": 9.05185686294883e-05, + "loss": 0.012644648551940918, + "num_input_tokens_seen": 56022296, + "step": 3421, + "train_runtime": 27800.8763, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.0739393939393937, + "grad_norm": 0.0054561379365623, + "learning_rate": 9.051293362482193e-05, + "loss": 0.010259821079671383, + "num_input_tokens_seen": 56038672, + "step": 3422, + "train_runtime": 27808.987, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.0745454545454547, + "grad_norm": 0.015878645703196526, + "learning_rate": 9.050729712166105e-05, + "loss": 0.012173894792795181, + "num_input_tokens_seen": 56055048, + "step": 3423, + "train_runtime": 27817.1057, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.075151515151515, + "grad_norm": 0.007376739289611578, + "learning_rate": 9.050165912021413e-05, + "loss": 0.011926956474781036, + "num_input_tokens_seen": 56071424, + "step": 3424, + "train_runtime": 27825.2176, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.0757575757575757, + "grad_norm": 0.008722481317818165, + "learning_rate": 9.049601962068971e-05, + "loss": 0.010563036426901817, + "num_input_tokens_seen": 56087800, + "step": 3425, + "train_runtime": 27833.3333, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.076363636363636, + "grad_norm": 0.018847903236746788, + "learning_rate": 9.04903786232964e-05, + "loss": 0.012597802095115185, + "num_input_tokens_seen": 56104176, + "step": 3426, + "train_runtime": 27841.5441, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.076969696969697, + "grad_norm": 0.005797175224870443, + "learning_rate": 9.048473612824282e-05, + "loss": 0.012497548013925552, + "num_input_tokens_seen": 56120552, + "step": 3427, + "train_runtime": 27849.6592, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.0775757575757576, + "grad_norm": 0.010824366472661495, + "learning_rate": 9.047909213573769e-05, + "loss": 0.01156754419207573, + "num_input_tokens_seen": 56136928, + "step": 3428, + "train_runtime": 27857.7764, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.078181818181818, + "grad_norm": 0.007866271771490574, + "learning_rate": 9.047344664598978e-05, + "loss": 0.011103162541985512, + "num_input_tokens_seen": 56153304, + "step": 3429, + "train_runtime": 27865.8902, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.0787878787878786, + "grad_norm": 0.009463231079280376, + "learning_rate": 9.046779965920788e-05, + "loss": 0.012735102325677872, + "num_input_tokens_seen": 56169680, + "step": 3430, + "train_runtime": 27874.0089, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.0793939393939396, + "grad_norm": 0.013803756795823574, + "learning_rate": 9.04621511756009e-05, + "loss": 0.012847594916820526, + "num_input_tokens_seen": 56186056, + "step": 3431, + "train_runtime": 27882.1324, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.08, + "grad_norm": 0.00846054870635271, + "learning_rate": 9.045650119537774e-05, + "loss": 0.01081385649740696, + "num_input_tokens_seen": 56202432, + "step": 3432, + "train_runtime": 27890.2506, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.0806060606060606, + "grad_norm": 0.008346304297447205, + "learning_rate": 9.045084971874738e-05, + "loss": 0.012644726783037186, + "num_input_tokens_seen": 56218808, + "step": 3433, + "train_runtime": 27898.3623, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.081212121212121, + "grad_norm": 0.012914376333355904, + "learning_rate": 9.044519674591887e-05, + "loss": 0.012044238857924938, + "num_input_tokens_seen": 56235184, + "step": 3434, + "train_runtime": 27906.4762, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.081818181818182, + "grad_norm": 0.00625306461006403, + "learning_rate": 9.043954227710128e-05, + "loss": 0.009518924169242382, + "num_input_tokens_seen": 56251560, + "step": 3435, + "train_runtime": 27914.5917, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.0824242424242425, + "grad_norm": 0.00748471962288022, + "learning_rate": 9.04338863125038e-05, + "loss": 0.012188691645860672, + "num_input_tokens_seen": 56267936, + "step": 3436, + "train_runtime": 27922.7087, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.083030303030303, + "grad_norm": 0.0038603260181844234, + "learning_rate": 9.042822885233557e-05, + "loss": 0.011931288056075573, + "num_input_tokens_seen": 56284312, + "step": 3437, + "train_runtime": 27930.8323, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.0836363636363635, + "grad_norm": 0.005839409306645393, + "learning_rate": 9.04225698968059e-05, + "loss": 0.011982793919742107, + "num_input_tokens_seen": 56300688, + "step": 3438, + "train_runtime": 27938.9479, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.0842424242424245, + "grad_norm": 0.008254511281847954, + "learning_rate": 9.04169094461241e-05, + "loss": 0.011427072808146477, + "num_input_tokens_seen": 56317064, + "step": 3439, + "train_runtime": 27947.0623, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.084848484848485, + "grad_norm": 0.00821908749639988, + "learning_rate": 9.041124750049955e-05, + "loss": 0.012207668274641037, + "num_input_tokens_seen": 56333440, + "step": 3440, + "train_runtime": 27955.1756, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.0854545454545454, + "grad_norm": 0.021660784259438515, + "learning_rate": 9.040558406014161e-05, + "loss": 0.01299472339451313, + "num_input_tokens_seen": 56349816, + "step": 3441, + "train_runtime": 27963.2951, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.086060606060606, + "grad_norm": 0.007126884069293737, + "learning_rate": 9.039991912525983e-05, + "loss": 0.010887030512094498, + "num_input_tokens_seen": 56366192, + "step": 3442, + "train_runtime": 27971.4076, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.086666666666667, + "grad_norm": 0.0057904645800590515, + "learning_rate": 9.03942526960637e-05, + "loss": 0.011882147751748562, + "num_input_tokens_seen": 56382568, + "step": 3443, + "train_runtime": 27979.5347, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.0872727272727274, + "grad_norm": 0.014943573623895645, + "learning_rate": 9.038858477276282e-05, + "loss": 0.013633402064442635, + "num_input_tokens_seen": 56398944, + "step": 3444, + "train_runtime": 27987.649, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.087878787878788, + "grad_norm": 0.02003590203821659, + "learning_rate": 9.038291535556686e-05, + "loss": 0.011881126090884209, + "num_input_tokens_seen": 56415320, + "step": 3445, + "train_runtime": 27995.7668, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.0884848484848484, + "grad_norm": 0.005910848267376423, + "learning_rate": 9.03772444446855e-05, + "loss": 0.012205943465232849, + "num_input_tokens_seen": 56431696, + "step": 3446, + "train_runtime": 28003.8788, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 2.089090909090909, + "grad_norm": 0.005043689161539078, + "learning_rate": 9.037157204032848e-05, + "loss": 0.01205090619623661, + "num_input_tokens_seen": 56448072, + "step": 3447, + "train_runtime": 28011.9913, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.08969696969697, + "grad_norm": 0.012684042565524578, + "learning_rate": 9.036589814270565e-05, + "loss": 0.012548624537885189, + "num_input_tokens_seen": 56464448, + "step": 3448, + "train_runtime": 28020.1027, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.0903030303030303, + "grad_norm": 0.011274142190814018, + "learning_rate": 9.036022275202686e-05, + "loss": 0.01294254045933485, + "num_input_tokens_seen": 56480824, + "step": 3449, + "train_runtime": 28028.2168, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 0.007525157183408737, + "learning_rate": 9.035454586850202e-05, + "loss": 0.012265852652490139, + "num_input_tokens_seen": 56497200, + "step": 3450, + "train_runtime": 28036.3356, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.0915151515151513, + "grad_norm": 0.007187213283032179, + "learning_rate": 9.034886749234111e-05, + "loss": 0.011971338652074337, + "num_input_tokens_seen": 56513576, + "step": 3451, + "train_runtime": 28044.4535, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.0921212121212123, + "grad_norm": 0.00701161241158843, + "learning_rate": 9.034318762375418e-05, + "loss": 0.012336109764873981, + "num_input_tokens_seen": 56529952, + "step": 3452, + "train_runtime": 28052.5663, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.0927272727272728, + "grad_norm": 0.009412666782736778, + "learning_rate": 9.03375062629513e-05, + "loss": 0.012127682566642761, + "num_input_tokens_seen": 56546328, + "step": 3453, + "train_runtime": 28060.6789, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.0933333333333333, + "grad_norm": 0.00752132898196578, + "learning_rate": 9.033182341014261e-05, + "loss": 0.011383445002138615, + "num_input_tokens_seen": 56562704, + "step": 3454, + "train_runtime": 28068.7885, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.0939393939393938, + "grad_norm": 0.010009681805968285, + "learning_rate": 9.032613906553833e-05, + "loss": 0.01266009733080864, + "num_input_tokens_seen": 56579080, + "step": 3455, + "train_runtime": 28076.9019, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.0945454545454547, + "grad_norm": 0.004542776383459568, + "learning_rate": 9.032045322934868e-05, + "loss": 0.013468949124217033, + "num_input_tokens_seen": 56595456, + "step": 3456, + "train_runtime": 28085.0148, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.095151515151515, + "grad_norm": 0.015904322266578674, + "learning_rate": 9.031476590178399e-05, + "loss": 0.012843945994973183, + "num_input_tokens_seen": 56611832, + "step": 3457, + "train_runtime": 28093.1335, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.0957575757575757, + "grad_norm": 0.007006669882684946, + "learning_rate": 9.030907708305463e-05, + "loss": 0.012989908456802368, + "num_input_tokens_seen": 56628208, + "step": 3458, + "train_runtime": 28101.2497, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.096363636363636, + "grad_norm": 0.004681939259171486, + "learning_rate": 9.0303386773371e-05, + "loss": 0.012774134986102581, + "num_input_tokens_seen": 56644584, + "step": 3459, + "train_runtime": 28109.3682, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.096969696969697, + "grad_norm": 0.004613004624843597, + "learning_rate": 9.029769497294358e-05, + "loss": 0.012811033055186272, + "num_input_tokens_seen": 56660960, + "step": 3460, + "train_runtime": 28117.4807, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.0975757575757576, + "grad_norm": 0.006610489450395107, + "learning_rate": 9.029200168198289e-05, + "loss": 0.012730253860354424, + "num_input_tokens_seen": 56677336, + "step": 3461, + "train_runtime": 28125.5904, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.098181818181818, + "grad_norm": 0.011558209545910358, + "learning_rate": 9.028630690069954e-05, + "loss": 0.013134753331542015, + "num_input_tokens_seen": 56693712, + "step": 3462, + "train_runtime": 28133.7036, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.0987878787878786, + "grad_norm": 0.013601330108940601, + "learning_rate": 9.028061062930414e-05, + "loss": 0.012436976656317711, + "num_input_tokens_seen": 56710088, + "step": 3463, + "train_runtime": 28141.8175, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.0993939393939396, + "grad_norm": 0.01037578471004963, + "learning_rate": 9.02749128680074e-05, + "loss": 0.013326249085366726, + "num_input_tokens_seen": 56726464, + "step": 3464, + "train_runtime": 28149.9323, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.1, + "grad_norm": 0.007674772758036852, + "learning_rate": 9.026921361702007e-05, + "loss": 0.011600622907280922, + "num_input_tokens_seen": 56742840, + "step": 3465, + "train_runtime": 28158.0456, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.1006060606060606, + "grad_norm": 0.014724268577992916, + "learning_rate": 9.026351287655294e-05, + "loss": 0.012907741591334343, + "num_input_tokens_seen": 56759216, + "step": 3466, + "train_runtime": 28166.1554, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.101212121212121, + "grad_norm": 0.00393960764631629, + "learning_rate": 9.025781064681687e-05, + "loss": 0.011136265471577644, + "num_input_tokens_seen": 56775592, + "step": 3467, + "train_runtime": 28174.2678, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.101818181818182, + "grad_norm": 0.01141727901995182, + "learning_rate": 9.02521069280228e-05, + "loss": 0.01337271649390459, + "num_input_tokens_seen": 56791968, + "step": 3468, + "train_runtime": 28182.3794, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.1024242424242425, + "grad_norm": 0.021951353177428246, + "learning_rate": 9.024640172038168e-05, + "loss": 0.012599781155586243, + "num_input_tokens_seen": 56808344, + "step": 3469, + "train_runtime": 28190.4956, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.103030303030303, + "grad_norm": 0.005962764844298363, + "learning_rate": 9.024069502410453e-05, + "loss": 0.01175682246685028, + "num_input_tokens_seen": 56824720, + "step": 3470, + "train_runtime": 28198.606, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.1036363636363635, + "grad_norm": 0.008232819847762585, + "learning_rate": 9.023498683940243e-05, + "loss": 0.01175486296415329, + "num_input_tokens_seen": 56841096, + "step": 3471, + "train_runtime": 28206.717, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.1042424242424245, + "grad_norm": 0.00799788348376751, + "learning_rate": 9.022927716648653e-05, + "loss": 0.013452235609292984, + "num_input_tokens_seen": 56857472, + "step": 3472, + "train_runtime": 28214.8326, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.104848484848485, + "grad_norm": 0.005621060729026794, + "learning_rate": 9.022356600556801e-05, + "loss": 0.011244012042880058, + "num_input_tokens_seen": 56873848, + "step": 3473, + "train_runtime": 28222.9449, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.1054545454545455, + "grad_norm": 0.005224290303885937, + "learning_rate": 9.021785335685813e-05, + "loss": 0.012814436107873917, + "num_input_tokens_seen": 56890224, + "step": 3474, + "train_runtime": 28231.0612, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.106060606060606, + "grad_norm": 0.010674373246729374, + "learning_rate": 9.021213922056815e-05, + "loss": 0.0114644356071949, + "num_input_tokens_seen": 56906600, + "step": 3475, + "train_runtime": 28239.1737, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.1066666666666665, + "grad_norm": 0.0121085736900568, + "learning_rate": 9.020642359690947e-05, + "loss": 0.012265300378203392, + "num_input_tokens_seen": 56922976, + "step": 3476, + "train_runtime": 28247.2869, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.1072727272727274, + "grad_norm": 0.009924935176968575, + "learning_rate": 9.020070648609347e-05, + "loss": 0.013388853520154953, + "num_input_tokens_seen": 56939352, + "step": 3477, + "train_runtime": 28255.3966, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.107878787878788, + "grad_norm": 0.005500199273228645, + "learning_rate": 9.019498788833161e-05, + "loss": 0.011340290307998657, + "num_input_tokens_seen": 56955728, + "step": 3478, + "train_runtime": 28263.5114, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.1084848484848484, + "grad_norm": 0.008234964683651924, + "learning_rate": 9.018926780383545e-05, + "loss": 0.012281153351068497, + "num_input_tokens_seen": 56972104, + "step": 3479, + "train_runtime": 28271.6223, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.109090909090909, + "grad_norm": 0.008555219508707523, + "learning_rate": 9.018354623281653e-05, + "loss": 0.012110285460948944, + "num_input_tokens_seen": 56988480, + "step": 3480, + "train_runtime": 28279.7386, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.10969696969697, + "grad_norm": 0.012038682587444782, + "learning_rate": 9.017782317548649e-05, + "loss": 0.013431099243462086, + "num_input_tokens_seen": 57004856, + "step": 3481, + "train_runtime": 28287.8525, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.1103030303030303, + "grad_norm": 0.010675543919205666, + "learning_rate": 9.0172098632057e-05, + "loss": 0.01298239640891552, + "num_input_tokens_seen": 57021232, + "step": 3482, + "train_runtime": 28295.966, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.110909090909091, + "grad_norm": 0.010105367749929428, + "learning_rate": 9.016637260273983e-05, + "loss": 0.012679103761911392, + "num_input_tokens_seen": 57037608, + "step": 3483, + "train_runtime": 28304.0783, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.1115151515151513, + "grad_norm": 0.006353117059916258, + "learning_rate": 9.016064508774675e-05, + "loss": 0.012139725498855114, + "num_input_tokens_seen": 57053984, + "step": 3484, + "train_runtime": 28312.1911, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.1121212121212123, + "grad_norm": 0.006390564609318972, + "learning_rate": 9.015491608728961e-05, + "loss": 0.012158969417214394, + "num_input_tokens_seen": 57070360, + "step": 3485, + "train_runtime": 28320.303, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.112727272727273, + "grad_norm": 0.006470012944191694, + "learning_rate": 9.014918560158035e-05, + "loss": 0.012223651632666588, + "num_input_tokens_seen": 57086736, + "step": 3486, + "train_runtime": 28328.4147, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.1133333333333333, + "grad_norm": 0.011723276227712631, + "learning_rate": 9.014345363083086e-05, + "loss": 0.012710933573544025, + "num_input_tokens_seen": 57103112, + "step": 3487, + "train_runtime": 28336.5331, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.113939393939394, + "grad_norm": 0.01011462602764368, + "learning_rate": 9.013772017525322e-05, + "loss": 0.011596627533435822, + "num_input_tokens_seen": 57119488, + "step": 3488, + "train_runtime": 28344.6508, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.1145454545454547, + "grad_norm": 0.010281038470566273, + "learning_rate": 9.013198523505948e-05, + "loss": 0.011673036962747574, + "num_input_tokens_seen": 57135864, + "step": 3489, + "train_runtime": 28352.7677, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.1151515151515152, + "grad_norm": 0.01556433830410242, + "learning_rate": 9.012624881046176e-05, + "loss": 0.0127674276009202, + "num_input_tokens_seen": 57152240, + "step": 3490, + "train_runtime": 28360.8807, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.1157575757575757, + "grad_norm": 0.0046404823660850525, + "learning_rate": 9.012051090167222e-05, + "loss": 0.012291817925870419, + "num_input_tokens_seen": 57168616, + "step": 3491, + "train_runtime": 28368.9922, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.1163636363636362, + "grad_norm": 0.010019625537097454, + "learning_rate": 9.011477150890313e-05, + "loss": 0.012228306382894516, + "num_input_tokens_seen": 57184992, + "step": 3492, + "train_runtime": 28377.1083, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.116969696969697, + "grad_norm": 0.006875072605907917, + "learning_rate": 9.010903063236675e-05, + "loss": 0.011554446071386337, + "num_input_tokens_seen": 57201368, + "step": 3493, + "train_runtime": 28385.2224, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.1175757575757577, + "grad_norm": 0.009701329283416271, + "learning_rate": 9.010328827227545e-05, + "loss": 0.012353150174021721, + "num_input_tokens_seen": 57217744, + "step": 3494, + "train_runtime": 28393.3452, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.118181818181818, + "grad_norm": 0.006121743004769087, + "learning_rate": 9.00975444288416e-05, + "loss": 0.011772389523684978, + "num_input_tokens_seen": 57234120, + "step": 3495, + "train_runtime": 28401.4597, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.1187878787878787, + "grad_norm": 0.00822470337152481, + "learning_rate": 9.009179910227768e-05, + "loss": 0.012995040975511074, + "num_input_tokens_seen": 57250496, + "step": 3496, + "train_runtime": 28409.5721, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.1193939393939396, + "grad_norm": 0.01121497992426157, + "learning_rate": 9.008605229279618e-05, + "loss": 0.012147591449320316, + "num_input_tokens_seen": 57266872, + "step": 3497, + "train_runtime": 28417.6857, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.12, + "grad_norm": 0.012420396320521832, + "learning_rate": 9.008030400060967e-05, + "loss": 0.01183453667908907, + "num_input_tokens_seen": 57283248, + "step": 3498, + "train_runtime": 28425.7985, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.1206060606060606, + "grad_norm": 0.009392550215125084, + "learning_rate": 9.007455422593077e-05, + "loss": 0.012206954881548882, + "num_input_tokens_seen": 57299624, + "step": 3499, + "train_runtime": 28433.9121, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.011173112317919731, + "learning_rate": 9.006880296897215e-05, + "loss": 0.013555949553847313, + "num_input_tokens_seen": 57316000, + "step": 3500, + "train_runtime": 28442.0324, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.1218181818181816, + "grad_norm": 0.008368341252207756, + "learning_rate": 9.006305022994654e-05, + "loss": 0.013381442055106163, + "num_input_tokens_seen": 57332376, + "step": 3501, + "train_runtime": 28451.0449, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.1224242424242425, + "grad_norm": 0.004341749008744955, + "learning_rate": 9.005729600906671e-05, + "loss": 0.01216307282447815, + "num_input_tokens_seen": 57348752, + "step": 3502, + "train_runtime": 28459.1587, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.123030303030303, + "grad_norm": 0.00595821114256978, + "learning_rate": 9.005154030654553e-05, + "loss": 0.01141276303678751, + "num_input_tokens_seen": 57365128, + "step": 3503, + "train_runtime": 28467.2732, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.1236363636363635, + "grad_norm": 0.010522489435970783, + "learning_rate": 9.004578312259586e-05, + "loss": 0.014437702484428883, + "num_input_tokens_seen": 57381504, + "step": 3504, + "train_runtime": 28475.3897, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.124242424242424, + "grad_norm": 0.008232937194406986, + "learning_rate": 9.004002445743065e-05, + "loss": 0.011311432346701622, + "num_input_tokens_seen": 57397880, + "step": 3505, + "train_runtime": 28483.5045, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.124848484848485, + "grad_norm": 0.01173914410173893, + "learning_rate": 9.003426431126291e-05, + "loss": 0.011294864118099213, + "num_input_tokens_seen": 57414256, + "step": 3506, + "train_runtime": 28491.6177, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.1254545454545455, + "grad_norm": 0.011773685924708843, + "learning_rate": 9.002850268430572e-05, + "loss": 0.012857058085501194, + "num_input_tokens_seen": 57430632, + "step": 3507, + "train_runtime": 28499.7337, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.126060606060606, + "grad_norm": 0.0063151223585009575, + "learning_rate": 9.002273957677214e-05, + "loss": 0.011314822360873222, + "num_input_tokens_seen": 57447008, + "step": 3508, + "train_runtime": 28507.8479, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.1266666666666665, + "grad_norm": 0.010733768343925476, + "learning_rate": 9.001697498887537e-05, + "loss": 0.01355978474020958, + "num_input_tokens_seen": 57463384, + "step": 3509, + "train_runtime": 28515.9634, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.1272727272727274, + "grad_norm": 0.006729649379849434, + "learning_rate": 9.001120892082864e-05, + "loss": 0.012324851006269455, + "num_input_tokens_seen": 57479760, + "step": 3510, + "train_runtime": 28524.0773, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.127878787878788, + "grad_norm": 0.008031347766518593, + "learning_rate": 9.000544137284519e-05, + "loss": 0.012918076477944851, + "num_input_tokens_seen": 57496136, + "step": 3511, + "train_runtime": 28532.1947, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.1284848484848484, + "grad_norm": 0.00953582301735878, + "learning_rate": 8.999967234513838e-05, + "loss": 0.013045232743024826, + "num_input_tokens_seen": 57512512, + "step": 3512, + "train_runtime": 28540.3099, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.129090909090909, + "grad_norm": 0.005362562369555235, + "learning_rate": 8.999390183792159e-05, + "loss": 0.012626387178897858, + "num_input_tokens_seen": 57528888, + "step": 3513, + "train_runtime": 28548.4227, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.12969696969697, + "grad_norm": 0.008026414550840855, + "learning_rate": 8.998812985140825e-05, + "loss": 0.012113875709474087, + "num_input_tokens_seen": 57545264, + "step": 3514, + "train_runtime": 28556.5366, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.1303030303030304, + "grad_norm": 0.009107707999646664, + "learning_rate": 8.998235638581186e-05, + "loss": 0.012264639139175415, + "num_input_tokens_seen": 57561640, + "step": 3515, + "train_runtime": 28564.651, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.130909090909091, + "grad_norm": 0.008588920347392559, + "learning_rate": 8.997658144134598e-05, + "loss": 0.01172946859151125, + "num_input_tokens_seen": 57578016, + "step": 3516, + "train_runtime": 28572.7705, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.1315151515151514, + "grad_norm": 0.008900281973183155, + "learning_rate": 8.99708050182242e-05, + "loss": 0.013760112226009369, + "num_input_tokens_seen": 57594392, + "step": 3517, + "train_runtime": 28580.8907, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.1321212121212123, + "grad_norm": 0.009173940867185593, + "learning_rate": 8.996502711666016e-05, + "loss": 0.012618775479495525, + "num_input_tokens_seen": 57610768, + "step": 3518, + "train_runtime": 28589.0102, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.132727272727273, + "grad_norm": 0.004674053750932217, + "learning_rate": 8.995924773686761e-05, + "loss": 0.012420371174812317, + "num_input_tokens_seen": 57627144, + "step": 3519, + "train_runtime": 28597.1324, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.015587940812110901, + "learning_rate": 8.99534668790603e-05, + "loss": 0.01104898750782013, + "num_input_tokens_seen": 57643520, + "step": 3520, + "train_runtime": 28605.2483, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.133939393939394, + "grad_norm": 0.008624122478067875, + "learning_rate": 8.994768454345206e-05, + "loss": 0.011609626933932304, + "num_input_tokens_seen": 57659896, + "step": 3521, + "train_runtime": 28613.361, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 2.1345454545454547, + "grad_norm": 0.0049206861294806, + "learning_rate": 8.994190073025676e-05, + "loss": 0.011751390993595123, + "num_input_tokens_seen": 57676272, + "step": 3522, + "train_runtime": 28621.4759, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.1351515151515152, + "grad_norm": 0.01566511020064354, + "learning_rate": 8.993611543968835e-05, + "loss": 0.012831299565732479, + "num_input_tokens_seen": 57692648, + "step": 3523, + "train_runtime": 28629.5909, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.1357575757575757, + "grad_norm": 0.008315403945744038, + "learning_rate": 8.99303286719608e-05, + "loss": 0.012952431105077267, + "num_input_tokens_seen": 57709024, + "step": 3524, + "train_runtime": 28637.7062, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.1363636363636362, + "grad_norm": 0.009655642323195934, + "learning_rate": 8.992454042728813e-05, + "loss": 0.01324331946671009, + "num_input_tokens_seen": 57725400, + "step": 3525, + "train_runtime": 28645.8234, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.1369696969696967, + "grad_norm": 0.006624852307140827, + "learning_rate": 8.991875070588447e-05, + "loss": 0.01158294826745987, + "num_input_tokens_seen": 57741776, + "step": 3526, + "train_runtime": 28653.9459, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.1375757575757577, + "grad_norm": 0.015377560630440712, + "learning_rate": 8.991295950796397e-05, + "loss": 0.013609301298856735, + "num_input_tokens_seen": 57758152, + "step": 3527, + "train_runtime": 28662.0625, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.138181818181818, + "grad_norm": 0.0044901263900101185, + "learning_rate": 8.990716683374082e-05, + "loss": 0.010975447483360767, + "num_input_tokens_seen": 57774528, + "step": 3528, + "train_runtime": 28670.177, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.1387878787878787, + "grad_norm": 0.003714088350534439, + "learning_rate": 8.990137268342929e-05, + "loss": 0.012609384953975677, + "num_input_tokens_seen": 57790904, + "step": 3529, + "train_runtime": 28678.2931, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.1393939393939396, + "grad_norm": 0.007226061541587114, + "learning_rate": 8.989557705724367e-05, + "loss": 0.011608580127358437, + "num_input_tokens_seen": 57807280, + "step": 3530, + "train_runtime": 28686.4065, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.14, + "grad_norm": 0.01645052060484886, + "learning_rate": 8.988977995539837e-05, + "loss": 0.012488780543208122, + "num_input_tokens_seen": 57823656, + "step": 3531, + "train_runtime": 28694.5227, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.1406060606060606, + "grad_norm": 0.0056457314640283585, + "learning_rate": 8.988398137810777e-05, + "loss": 0.012707125395536423, + "num_input_tokens_seen": 57840032, + "step": 3532, + "train_runtime": 28702.6373, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.141212121212121, + "grad_norm": 0.011652040295302868, + "learning_rate": 8.987818132558639e-05, + "loss": 0.012485072016716003, + "num_input_tokens_seen": 57856408, + "step": 3533, + "train_runtime": 28710.7566, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.1418181818181816, + "grad_norm": 0.00488140806555748, + "learning_rate": 8.987237979804872e-05, + "loss": 0.011395161971449852, + "num_input_tokens_seen": 57872784, + "step": 3534, + "train_runtime": 28718.8711, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.1424242424242426, + "grad_norm": 0.006278018932789564, + "learning_rate": 8.986657679570938e-05, + "loss": 0.011353380978107452, + "num_input_tokens_seen": 57889160, + "step": 3535, + "train_runtime": 28726.9829, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.143030303030303, + "grad_norm": 0.012279496528208256, + "learning_rate": 8.9860772318783e-05, + "loss": 0.011512484401464462, + "num_input_tokens_seen": 57905536, + "step": 3536, + "train_runtime": 28735.0966, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.1436363636363636, + "grad_norm": 0.009725527837872505, + "learning_rate": 8.985496636748428e-05, + "loss": 0.012157324701547623, + "num_input_tokens_seen": 57921912, + "step": 3537, + "train_runtime": 28743.2101, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.144242424242424, + "grad_norm": 0.08072280138731003, + "learning_rate": 8.984915894202797e-05, + "loss": 0.01255282387137413, + "num_input_tokens_seen": 57938288, + "step": 3538, + "train_runtime": 28751.3233, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.144848484848485, + "grad_norm": 0.011321510188281536, + "learning_rate": 8.984335004262888e-05, + "loss": 0.012012355960905552, + "num_input_tokens_seen": 57954664, + "step": 3539, + "train_runtime": 28759.4365, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.1454545454545455, + "grad_norm": 0.015953831374645233, + "learning_rate": 8.983753966950185e-05, + "loss": 0.012593870982527733, + "num_input_tokens_seen": 57971040, + "step": 3540, + "train_runtime": 28767.5752, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.146060606060606, + "grad_norm": 0.0007118682260625064, + "learning_rate": 8.98317278228618e-05, + "loss": 0.010947332717478275, + "num_input_tokens_seen": 57987416, + "step": 3541, + "train_runtime": 28775.6895, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.1466666666666665, + "grad_norm": 0.007488769944757223, + "learning_rate": 8.982591450292372e-05, + "loss": 0.011720303446054459, + "num_input_tokens_seen": 58003792, + "step": 3542, + "train_runtime": 28783.8035, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.1472727272727274, + "grad_norm": 0.006082794163376093, + "learning_rate": 8.982009970990261e-05, + "loss": 0.011088498868048191, + "num_input_tokens_seen": 58020168, + "step": 3543, + "train_runtime": 28791.9189, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.147878787878788, + "grad_norm": 0.012090684846043587, + "learning_rate": 8.981428344401359e-05, + "loss": 0.012264646589756012, + "num_input_tokens_seen": 58036544, + "step": 3544, + "train_runtime": 28800.0347, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.1484848484848484, + "grad_norm": 0.006332604214549065, + "learning_rate": 8.980846570547172e-05, + "loss": 0.011855223216116428, + "num_input_tokens_seen": 58052920, + "step": 3545, + "train_runtime": 28808.1468, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.149090909090909, + "grad_norm": 0.00799900759011507, + "learning_rate": 8.980264649449225e-05, + "loss": 0.012117343954741955, + "num_input_tokens_seen": 58069296, + "step": 3546, + "train_runtime": 28816.2577, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.14969696969697, + "grad_norm": 0.006898942403495312, + "learning_rate": 8.979682581129038e-05, + "loss": 0.011957213282585144, + "num_input_tokens_seen": 58085672, + "step": 3547, + "train_runtime": 28824.3734, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.1503030303030304, + "grad_norm": 0.017603494226932526, + "learning_rate": 8.979100365608144e-05, + "loss": 0.012435558252036572, + "num_input_tokens_seen": 58102048, + "step": 3548, + "train_runtime": 28832.4891, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.150909090909091, + "grad_norm": 0.009896202012896538, + "learning_rate": 8.978518002908076e-05, + "loss": 0.01290203258395195, + "num_input_tokens_seen": 58118424, + "step": 3549, + "train_runtime": 28840.6049, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.1515151515151514, + "grad_norm": 0.0274839848279953, + "learning_rate": 8.977935493050375e-05, + "loss": 0.01113799400627613, + "num_input_tokens_seen": 58134800, + "step": 3550, + "train_runtime": 28848.7218, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.1521212121212123, + "grad_norm": 0.012439846992492676, + "learning_rate": 8.977352836056587e-05, + "loss": 0.013506392948329449, + "num_input_tokens_seen": 58151176, + "step": 3551, + "train_runtime": 28856.8415, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.152727272727273, + "grad_norm": 0.0051726060919463634, + "learning_rate": 8.976770031948263e-05, + "loss": 0.011873546056449413, + "num_input_tokens_seen": 58167552, + "step": 3552, + "train_runtime": 28864.9598, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.1533333333333333, + "grad_norm": 0.00895746424794197, + "learning_rate": 8.97618708074696e-05, + "loss": 0.012198572047054768, + "num_input_tokens_seen": 58183928, + "step": 3553, + "train_runtime": 28873.0733, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.153939393939394, + "grad_norm": 0.01121628936380148, + "learning_rate": 8.97560398247424e-05, + "loss": 0.012298112735152245, + "num_input_tokens_seen": 58200304, + "step": 3554, + "train_runtime": 28881.1851, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.1545454545454543, + "grad_norm": 0.007435557898133993, + "learning_rate": 8.975020737151669e-05, + "loss": 0.010877593420445919, + "num_input_tokens_seen": 58216680, + "step": 3555, + "train_runtime": 28889.2984, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.1551515151515153, + "grad_norm": 0.00940319150686264, + "learning_rate": 8.974437344800825e-05, + "loss": 0.012261370196938515, + "num_input_tokens_seen": 58233056, + "step": 3556, + "train_runtime": 28897.4122, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.1557575757575758, + "grad_norm": 0.0037964419461786747, + "learning_rate": 8.973853805443282e-05, + "loss": 0.011090653017163277, + "num_input_tokens_seen": 58249432, + "step": 3557, + "train_runtime": 28905.5327, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.1563636363636363, + "grad_norm": 0.007811735384166241, + "learning_rate": 8.973270119100625e-05, + "loss": 0.012524952180683613, + "num_input_tokens_seen": 58265808, + "step": 3558, + "train_runtime": 28913.6435, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.156969696969697, + "grad_norm": 0.008808665908873081, + "learning_rate": 8.972686285794445e-05, + "loss": 0.013249467127025127, + "num_input_tokens_seen": 58282184, + "step": 3559, + "train_runtime": 28921.7604, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.1575757575757577, + "grad_norm": 0.0032920176163315773, + "learning_rate": 8.972102305546334e-05, + "loss": 0.011471050791442394, + "num_input_tokens_seen": 58298560, + "step": 3560, + "train_runtime": 28929.8725, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.158181818181818, + "grad_norm": 0.008728522807359695, + "learning_rate": 8.971518178377895e-05, + "loss": 0.01315800566226244, + "num_input_tokens_seen": 58314936, + "step": 3561, + "train_runtime": 28937.9839, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.1587878787878787, + "grad_norm": 0.010329267010092735, + "learning_rate": 8.970933904310734e-05, + "loss": 0.012310674414038658, + "num_input_tokens_seen": 58331312, + "step": 3562, + "train_runtime": 28946.0971, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.159393939393939, + "grad_norm": 0.007029344793409109, + "learning_rate": 8.970349483366461e-05, + "loss": 0.011882564052939415, + "num_input_tokens_seen": 58347688, + "step": 3563, + "train_runtime": 28954.2127, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.16, + "grad_norm": 0.009087336249649525, + "learning_rate": 8.96976491556669e-05, + "loss": 0.013293171301484108, + "num_input_tokens_seen": 58364064, + "step": 3564, + "train_runtime": 28962.3346, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.1606060606060606, + "grad_norm": 0.005902925040572882, + "learning_rate": 8.969180200933047e-05, + "loss": 0.012850413098931313, + "num_input_tokens_seen": 58380440, + "step": 3565, + "train_runtime": 28970.451, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.161212121212121, + "grad_norm": 0.00861071515828371, + "learning_rate": 8.968595339487157e-05, + "loss": 0.012767734937369823, + "num_input_tokens_seen": 58396816, + "step": 3566, + "train_runtime": 28978.5649, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.1618181818181816, + "grad_norm": 0.006146362982690334, + "learning_rate": 8.968010331250656e-05, + "loss": 0.011280233040452003, + "num_input_tokens_seen": 58413192, + "step": 3567, + "train_runtime": 28986.6774, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.1624242424242426, + "grad_norm": 0.008694643154740334, + "learning_rate": 8.967425176245178e-05, + "loss": 0.010438431054353714, + "num_input_tokens_seen": 58429568, + "step": 3568, + "train_runtime": 28994.7878, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.163030303030303, + "grad_norm": 0.0005360030918382108, + "learning_rate": 8.966839874492371e-05, + "loss": 0.012298746034502983, + "num_input_tokens_seen": 58445944, + "step": 3569, + "train_runtime": 29002.9014, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.1636363636363636, + "grad_norm": 0.005812949035316706, + "learning_rate": 8.96625442601388e-05, + "loss": 0.011884803883731365, + "num_input_tokens_seen": 58462320, + "step": 3570, + "train_runtime": 29011.0144, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.164242424242424, + "grad_norm": 0.005894954781979322, + "learning_rate": 8.965668830831364e-05, + "loss": 0.013101841323077679, + "num_input_tokens_seen": 58478696, + "step": 3571, + "train_runtime": 29019.1364, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.164848484848485, + "grad_norm": 0.006240121088922024, + "learning_rate": 8.96508308896648e-05, + "loss": 0.013054500333964825, + "num_input_tokens_seen": 58495072, + "step": 3572, + "train_runtime": 29027.2486, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.1654545454545455, + "grad_norm": 0.007768931332975626, + "learning_rate": 8.964497200440894e-05, + "loss": 0.011665784753859043, + "num_input_tokens_seen": 58511448, + "step": 3573, + "train_runtime": 29035.3681, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.166060606060606, + "grad_norm": 0.008867294527590275, + "learning_rate": 8.963911165276275e-05, + "loss": 0.011086254380643368, + "num_input_tokens_seen": 58527824, + "step": 3574, + "train_runtime": 29043.4821, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.008764234371483326, + "learning_rate": 8.963324983494303e-05, + "loss": 0.012631715275347233, + "num_input_tokens_seen": 58544200, + "step": 3575, + "train_runtime": 29051.5973, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.1672727272727275, + "grad_norm": 0.01382221095263958, + "learning_rate": 8.962738655116658e-05, + "loss": 0.013242697343230247, + "num_input_tokens_seen": 58560576, + "step": 3576, + "train_runtime": 29059.7068, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.167878787878788, + "grad_norm": 0.019054502248764038, + "learning_rate": 8.962152180165028e-05, + "loss": 0.01292281411588192, + "num_input_tokens_seen": 58576952, + "step": 3577, + "train_runtime": 29067.8199, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.1684848484848485, + "grad_norm": 0.002178309252485633, + "learning_rate": 8.961565558661104e-05, + "loss": 0.011553612537682056, + "num_input_tokens_seen": 58593328, + "step": 3578, + "train_runtime": 29075.9338, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.169090909090909, + "grad_norm": 0.016054967418313026, + "learning_rate": 8.960978790626587e-05, + "loss": 0.011835544370114803, + "num_input_tokens_seen": 58609704, + "step": 3579, + "train_runtime": 29084.0467, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.16969696969697, + "grad_norm": 0.024970337748527527, + "learning_rate": 8.960391876083174e-05, + "loss": 0.012018397450447083, + "num_input_tokens_seen": 58626080, + "step": 3580, + "train_runtime": 29092.1608, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.1703030303030304, + "grad_norm": 0.007381356321275234, + "learning_rate": 8.959804815052582e-05, + "loss": 0.011703860014677048, + "num_input_tokens_seen": 58642456, + "step": 3581, + "train_runtime": 29100.2831, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.170909090909091, + "grad_norm": 0.008867908269166946, + "learning_rate": 8.959217607556519e-05, + "loss": 0.012440843507647514, + "num_input_tokens_seen": 58658832, + "step": 3582, + "train_runtime": 29108.4007, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.1715151515151514, + "grad_norm": 0.008855480700731277, + "learning_rate": 8.958630253616706e-05, + "loss": 0.01147475279867649, + "num_input_tokens_seen": 58675208, + "step": 3583, + "train_runtime": 29116.5152, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.172121212121212, + "grad_norm": 0.005014322232455015, + "learning_rate": 8.958042753254872e-05, + "loss": 0.011361206881701946, + "num_input_tokens_seen": 58691584, + "step": 3584, + "train_runtime": 29124.6325, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.172727272727273, + "grad_norm": 0.005862353835254908, + "learning_rate": 8.957455106492742e-05, + "loss": 0.012513482943177223, + "num_input_tokens_seen": 58707960, + "step": 3585, + "train_runtime": 29132.8568, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.1733333333333333, + "grad_norm": 0.010451595298945904, + "learning_rate": 8.956867313352056e-05, + "loss": 0.012588118202984333, + "num_input_tokens_seen": 58724336, + "step": 3586, + "train_runtime": 29140.9745, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.173939393939394, + "grad_norm": 0.007304052356630564, + "learning_rate": 8.956279373854552e-05, + "loss": 0.012194668874144554, + "num_input_tokens_seen": 58740712, + "step": 3587, + "train_runtime": 29149.0938, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.174545454545455, + "grad_norm": 0.0036269514821469784, + "learning_rate": 8.95569128802198e-05, + "loss": 0.011555613949894905, + "num_input_tokens_seen": 58757088, + "step": 3588, + "train_runtime": 29157.2151, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.1751515151515153, + "grad_norm": 0.0072369822300970554, + "learning_rate": 8.95510305587609e-05, + "loss": 0.010729311965405941, + "num_input_tokens_seen": 58773464, + "step": 3589, + "train_runtime": 29165.3345, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.175757575757576, + "grad_norm": 0.006867996882647276, + "learning_rate": 8.95451467743864e-05, + "loss": 0.012114688754081726, + "num_input_tokens_seen": 58789840, + "step": 3590, + "train_runtime": 29173.4604, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.1763636363636363, + "grad_norm": 0.0026318137533962727, + "learning_rate": 8.953926152731394e-05, + "loss": 0.012799869291484356, + "num_input_tokens_seen": 58806216, + "step": 3591, + "train_runtime": 29181.5691, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.1769696969696968, + "grad_norm": 0.006512695923447609, + "learning_rate": 8.953337481776119e-05, + "loss": 0.01212363876402378, + "num_input_tokens_seen": 58822592, + "step": 3592, + "train_runtime": 29189.6855, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.1775757575757577, + "grad_norm": 0.02227167598903179, + "learning_rate": 8.95274866459459e-05, + "loss": 0.012120643630623817, + "num_input_tokens_seen": 58838968, + "step": 3593, + "train_runtime": 29197.8002, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.178181818181818, + "grad_norm": 0.006759993266314268, + "learning_rate": 8.952159701208584e-05, + "loss": 0.012031888589262962, + "num_input_tokens_seen": 58855344, + "step": 3594, + "train_runtime": 29205.9148, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.1787878787878787, + "grad_norm": 0.014940389432013035, + "learning_rate": 8.951570591639889e-05, + "loss": 0.012228570878505707, + "num_input_tokens_seen": 58871720, + "step": 3595, + "train_runtime": 29214.036, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.179393939393939, + "grad_norm": 0.009364038705825806, + "learning_rate": 8.950981335910291e-05, + "loss": 0.012020081281661987, + "num_input_tokens_seen": 58888096, + "step": 3596, + "train_runtime": 29222.1557, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.18, + "grad_norm": 0.006528595462441444, + "learning_rate": 8.950391934041589e-05, + "loss": 0.012315641157329082, + "num_input_tokens_seen": 58904472, + "step": 3597, + "train_runtime": 29230.2688, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.1806060606060607, + "grad_norm": 0.01194236520677805, + "learning_rate": 8.949802386055581e-05, + "loss": 0.013185814023017883, + "num_input_tokens_seen": 58920848, + "step": 3598, + "train_runtime": 29238.3888, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.181212121212121, + "grad_norm": 0.008188650012016296, + "learning_rate": 8.949212691974077e-05, + "loss": 0.011144852265715599, + "num_input_tokens_seen": 58937224, + "step": 3599, + "train_runtime": 29246.5055, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 0.008416100405156612, + "learning_rate": 8.948622851818885e-05, + "loss": 0.012679114006459713, + "num_input_tokens_seen": 58953600, + "step": 3600, + "train_runtime": 29254.633, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.1824242424242426, + "grad_norm": 0.004923258442431688, + "learning_rate": 8.948032865611822e-05, + "loss": 0.01115406770259142, + "num_input_tokens_seen": 58969976, + "step": 3601, + "train_runtime": 29263.7377, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 2.183030303030303, + "grad_norm": 0.004624438937753439, + "learning_rate": 8.947442733374714e-05, + "loss": 0.011263374239206314, + "num_input_tokens_seen": 58986352, + "step": 3602, + "train_runtime": 29271.8564, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.1836363636363636, + "grad_norm": 0.0119725801050663, + "learning_rate": 8.946852455129384e-05, + "loss": 0.01155043113976717, + "num_input_tokens_seen": 59002728, + "step": 3603, + "train_runtime": 29279.9724, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.184242424242424, + "grad_norm": 0.01406806893646717, + "learning_rate": 8.94626203089767e-05, + "loss": 0.011634095571935177, + "num_input_tokens_seen": 59019104, + "step": 3604, + "train_runtime": 29288.0849, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 2.184848484848485, + "grad_norm": 0.0035006983671337366, + "learning_rate": 8.945671460701408e-05, + "loss": 0.011123578995466232, + "num_input_tokens_seen": 59035480, + "step": 3605, + "train_runtime": 29296.2014, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.1854545454545455, + "grad_norm": 0.0063458941876888275, + "learning_rate": 8.945080744562442e-05, + "loss": 0.012211378663778305, + "num_input_tokens_seen": 59051856, + "step": 3606, + "train_runtime": 29304.3199, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.186060606060606, + "grad_norm": 0.009857217781245708, + "learning_rate": 8.944489882502623e-05, + "loss": 0.012334001250565052, + "num_input_tokens_seen": 59068232, + "step": 3607, + "train_runtime": 29312.4364, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.1866666666666665, + "grad_norm": 0.013009733520448208, + "learning_rate": 8.943898874543803e-05, + "loss": 0.013556170277297497, + "num_input_tokens_seen": 59084608, + "step": 3608, + "train_runtime": 29320.5544, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.1872727272727275, + "grad_norm": 0.01466745138168335, + "learning_rate": 8.943307720707845e-05, + "loss": 0.011487782001495361, + "num_input_tokens_seen": 59100984, + "step": 3609, + "train_runtime": 29328.6739, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.187878787878788, + "grad_norm": 0.011151660233736038, + "learning_rate": 8.942716421016614e-05, + "loss": 0.012820694595575333, + "num_input_tokens_seen": 59117360, + "step": 3610, + "train_runtime": 29336.7895, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.1884848484848485, + "grad_norm": 0.004317841026932001, + "learning_rate": 8.942124975491981e-05, + "loss": 0.01183843333274126, + "num_input_tokens_seen": 59133736, + "step": 3611, + "train_runtime": 29344.9083, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.189090909090909, + "grad_norm": 0.014631208963692188, + "learning_rate": 8.941533384155822e-05, + "loss": 0.012403767555952072, + "num_input_tokens_seen": 59150112, + "step": 3612, + "train_runtime": 29353.0328, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.1896969696969695, + "grad_norm": 0.004952683579176664, + "learning_rate": 8.940941647030019e-05, + "loss": 0.01236814446747303, + "num_input_tokens_seen": 59166488, + "step": 3613, + "train_runtime": 29361.1816, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.1903030303030304, + "grad_norm": 0.009116880595684052, + "learning_rate": 8.940349764136457e-05, + "loss": 0.013562958687543869, + "num_input_tokens_seen": 59182864, + "step": 3614, + "train_runtime": 29369.301, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.190909090909091, + "grad_norm": 0.0036380335222929716, + "learning_rate": 8.939757735497034e-05, + "loss": 0.011842243373394012, + "num_input_tokens_seen": 59199240, + "step": 3615, + "train_runtime": 29377.4179, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.1915151515151514, + "grad_norm": 0.008418438956141472, + "learning_rate": 8.939165561133642e-05, + "loss": 0.011070848442614079, + "num_input_tokens_seen": 59215616, + "step": 3616, + "train_runtime": 29385.5329, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.192121212121212, + "grad_norm": 0.00455420883372426, + "learning_rate": 8.938573241068189e-05, + "loss": 0.01256749127060175, + "num_input_tokens_seen": 59231992, + "step": 3617, + "train_runtime": 29393.6487, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.192727272727273, + "grad_norm": 0.004541611764580011, + "learning_rate": 8.937980775322581e-05, + "loss": 0.010654402896761894, + "num_input_tokens_seen": 59248368, + "step": 3618, + "train_runtime": 29401.767, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.1933333333333334, + "grad_norm": 0.012203039601445198, + "learning_rate": 8.937388163918731e-05, + "loss": 0.012502270750701427, + "num_input_tokens_seen": 59264744, + "step": 3619, + "train_runtime": 29409.8869, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.193939393939394, + "grad_norm": 0.007232667412608862, + "learning_rate": 8.936795406878564e-05, + "loss": 0.011787505820393562, + "num_input_tokens_seen": 59281120, + "step": 3620, + "train_runtime": 29417.9999, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.1945454545454544, + "grad_norm": 0.011393684893846512, + "learning_rate": 8.936202504224e-05, + "loss": 0.013321079313755035, + "num_input_tokens_seen": 59297496, + "step": 3621, + "train_runtime": 29426.1152, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.1951515151515153, + "grad_norm": 0.01522404607385397, + "learning_rate": 8.93560945597697e-05, + "loss": 0.011272291652858257, + "num_input_tokens_seen": 59313872, + "step": 3622, + "train_runtime": 29434.2338, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.195757575757576, + "grad_norm": 0.006869960110634565, + "learning_rate": 8.935016262159412e-05, + "loss": 0.012107600457966328, + "num_input_tokens_seen": 59330248, + "step": 3623, + "train_runtime": 29442.3517, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.1963636363636363, + "grad_norm": 0.00808816310018301, + "learning_rate": 8.934422922793265e-05, + "loss": 0.011816064827144146, + "num_input_tokens_seen": 59346624, + "step": 3624, + "train_runtime": 29450.463, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.0004978812648914754, + "learning_rate": 8.933829437900475e-05, + "loss": 0.011826543137431145, + "num_input_tokens_seen": 59363000, + "step": 3625, + "train_runtime": 29458.577, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.1975757575757577, + "grad_norm": 0.006791422143578529, + "learning_rate": 8.933235807502996e-05, + "loss": 0.012391680851578712, + "num_input_tokens_seen": 59379376, + "step": 3626, + "train_runtime": 29466.6925, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.1981818181818182, + "grad_norm": 0.0055072130635380745, + "learning_rate": 8.932642031622783e-05, + "loss": 0.011894084513187408, + "num_input_tokens_seen": 59395752, + "step": 3627, + "train_runtime": 29474.8059, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.1987878787878787, + "grad_norm": 0.008963002823293209, + "learning_rate": 8.9320481102818e-05, + "loss": 0.012350327335298061, + "num_input_tokens_seen": 59412128, + "step": 3628, + "train_runtime": 29482.9188, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.1993939393939392, + "grad_norm": 0.010187532752752304, + "learning_rate": 8.931454043502016e-05, + "loss": 0.011566300876438618, + "num_input_tokens_seen": 59428504, + "step": 3629, + "train_runtime": 29491.033, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.2, + "grad_norm": 0.014371749944984913, + "learning_rate": 8.930859831305401e-05, + "loss": 0.011745520867407322, + "num_input_tokens_seen": 59444880, + "step": 3630, + "train_runtime": 29499.148, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 2.2006060606060607, + "grad_norm": 0.004343628883361816, + "learning_rate": 8.930265473713938e-05, + "loss": 0.012125818058848381, + "num_input_tokens_seen": 59461256, + "step": 3631, + "train_runtime": 29507.262, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.201212121212121, + "grad_norm": 0.00850563496351242, + "learning_rate": 8.929670970749608e-05, + "loss": 0.012195354327559471, + "num_input_tokens_seen": 59477632, + "step": 3632, + "train_runtime": 29515.3773, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.2018181818181817, + "grad_norm": 0.01019760686904192, + "learning_rate": 8.929076322434402e-05, + "loss": 0.013076315633952618, + "num_input_tokens_seen": 59494008, + "step": 3633, + "train_runtime": 29523.4923, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.2024242424242426, + "grad_norm": 0.006664637941867113, + "learning_rate": 8.928481528790313e-05, + "loss": 0.012966278940439224, + "num_input_tokens_seen": 59510384, + "step": 3634, + "train_runtime": 29531.6111, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.203030303030303, + "grad_norm": 0.008315959945321083, + "learning_rate": 8.927886589839344e-05, + "loss": 0.012466199696063995, + "num_input_tokens_seen": 59526760, + "step": 3635, + "train_runtime": 29539.7326, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.2036363636363636, + "grad_norm": 0.006912038661539555, + "learning_rate": 8.9272915056035e-05, + "loss": 0.013427576050162315, + "num_input_tokens_seen": 59543136, + "step": 3636, + "train_runtime": 29547.8478, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.204242424242424, + "grad_norm": 0.006813558284193277, + "learning_rate": 8.92669627610479e-05, + "loss": 0.012301689945161343, + "num_input_tokens_seen": 59559512, + "step": 3637, + "train_runtime": 29555.9654, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.204848484848485, + "grad_norm": 0.0047012618742883205, + "learning_rate": 8.92610090136523e-05, + "loss": 0.012508450075984001, + "num_input_tokens_seen": 59575888, + "step": 3638, + "train_runtime": 29564.0806, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.2054545454545456, + "grad_norm": 0.006925490219146013, + "learning_rate": 8.925505381406845e-05, + "loss": 0.012498512864112854, + "num_input_tokens_seen": 59592264, + "step": 3639, + "train_runtime": 29572.1905, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.206060606060606, + "grad_norm": 0.005478878039866686, + "learning_rate": 8.924909716251661e-05, + "loss": 0.011745494790375233, + "num_input_tokens_seen": 59608640, + "step": 3640, + "train_runtime": 29580.3016, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.2066666666666666, + "grad_norm": 0.00699197594076395, + "learning_rate": 8.924313905921709e-05, + "loss": 0.012664221227169037, + "num_input_tokens_seen": 59625016, + "step": 3641, + "train_runtime": 29588.4179, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.207272727272727, + "grad_norm": 0.012702935375273228, + "learning_rate": 8.923717950439029e-05, + "loss": 0.013789419084787369, + "num_input_tokens_seen": 59641392, + "step": 3642, + "train_runtime": 29596.5348, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.207878787878788, + "grad_norm": 0.006319780368357897, + "learning_rate": 8.923121849825662e-05, + "loss": 0.012776189483702183, + "num_input_tokens_seen": 59657768, + "step": 3643, + "train_runtime": 29604.6513, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.2084848484848485, + "grad_norm": 0.0030518770217895508, + "learning_rate": 8.922525604103659e-05, + "loss": 0.011326993815600872, + "num_input_tokens_seen": 59674144, + "step": 3644, + "train_runtime": 29612.7676, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.209090909090909, + "grad_norm": 0.006767469458281994, + "learning_rate": 8.921929213295071e-05, + "loss": 0.012699131853878498, + "num_input_tokens_seen": 59690520, + "step": 3645, + "train_runtime": 29620.8821, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.2096969696969695, + "grad_norm": 0.005254995543509722, + "learning_rate": 8.921332677421961e-05, + "loss": 0.011081083677709103, + "num_input_tokens_seen": 59706896, + "step": 3646, + "train_runtime": 29628.9966, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.2103030303030304, + "grad_norm": 0.007556076627224684, + "learning_rate": 8.92073599650639e-05, + "loss": 0.013109242543578148, + "num_input_tokens_seen": 59723272, + "step": 3647, + "train_runtime": 29637.1096, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.210909090909091, + "grad_norm": 0.010643698275089264, + "learning_rate": 8.920139170570429e-05, + "loss": 0.011104393750429153, + "num_input_tokens_seen": 59739648, + "step": 3648, + "train_runtime": 29645.2332, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.2115151515151514, + "grad_norm": 0.006891094613820314, + "learning_rate": 8.919542199636158e-05, + "loss": 0.012059992179274559, + "num_input_tokens_seen": 59756024, + "step": 3649, + "train_runtime": 29653.3446, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.212121212121212, + "grad_norm": 0.005681055597960949, + "learning_rate": 8.91894508372565e-05, + "loss": 0.012363753281533718, + "num_input_tokens_seen": 59772400, + "step": 3650, + "train_runtime": 29661.4574, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.212727272727273, + "grad_norm": 0.006800381001085043, + "learning_rate": 8.918347822860997e-05, + "loss": 0.012068090960383415, + "num_input_tokens_seen": 59788776, + "step": 3651, + "train_runtime": 29669.5712, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.2133333333333334, + "grad_norm": 0.007353676483035088, + "learning_rate": 8.917750417064289e-05, + "loss": 0.012048767879605293, + "num_input_tokens_seen": 59805152, + "step": 3652, + "train_runtime": 29677.6856, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.213939393939394, + "grad_norm": 0.009312749840319157, + "learning_rate": 8.91715286635762e-05, + "loss": 0.01307837013155222, + "num_input_tokens_seen": 59821528, + "step": 3653, + "train_runtime": 29685.8003, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.2145454545454544, + "grad_norm": 0.005366871133446693, + "learning_rate": 8.916555170763099e-05, + "loss": 0.012519482523202896, + "num_input_tokens_seen": 59837904, + "step": 3654, + "train_runtime": 29693.9134, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.2151515151515153, + "grad_norm": 0.025069156661629677, + "learning_rate": 8.915957330302827e-05, + "loss": 0.013411092571914196, + "num_input_tokens_seen": 59854280, + "step": 3655, + "train_runtime": 29702.0319, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.215757575757576, + "grad_norm": 0.011153807863593102, + "learning_rate": 8.915359344998919e-05, + "loss": 0.013310923241078854, + "num_input_tokens_seen": 59870656, + "step": 3656, + "train_runtime": 29710.1442, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.2163636363636363, + "grad_norm": 0.008448402397334576, + "learning_rate": 8.914761214873493e-05, + "loss": 0.012956599704921246, + "num_input_tokens_seen": 59887032, + "step": 3657, + "train_runtime": 29718.2542, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.216969696969697, + "grad_norm": 0.005610155873000622, + "learning_rate": 8.914162939948676e-05, + "loss": 0.0120665542781353, + "num_input_tokens_seen": 59903408, + "step": 3658, + "train_runtime": 29726.3677, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.2175757575757578, + "grad_norm": 0.009017917327582836, + "learning_rate": 8.913564520246592e-05, + "loss": 0.010684678331017494, + "num_input_tokens_seen": 59919784, + "step": 3659, + "train_runtime": 29734.4873, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.2181818181818183, + "grad_norm": 0.0062376754358410835, + "learning_rate": 8.912965955789378e-05, + "loss": 0.012134547345340252, + "num_input_tokens_seen": 59936160, + "step": 3660, + "train_runtime": 29742.6015, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.2187878787878788, + "grad_norm": 0.02972797304391861, + "learning_rate": 8.912367246599175e-05, + "loss": 0.013637243770062923, + "num_input_tokens_seen": 59952536, + "step": 3661, + "train_runtime": 29750.7121, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.2193939393939393, + "grad_norm": 0.008331749588251114, + "learning_rate": 8.911768392698126e-05, + "loss": 0.011612921953201294, + "num_input_tokens_seen": 59968912, + "step": 3662, + "train_runtime": 29758.8326, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.22, + "grad_norm": 0.006937834434211254, + "learning_rate": 8.91116939410838e-05, + "loss": 0.011773437261581421, + "num_input_tokens_seen": 59985288, + "step": 3663, + "train_runtime": 29766.9488, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.2206060606060607, + "grad_norm": 0.011741766706109047, + "learning_rate": 8.910570250852097e-05, + "loss": 0.014320777729153633, + "num_input_tokens_seen": 60001664, + "step": 3664, + "train_runtime": 29775.063, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.221212121212121, + "grad_norm": 0.00781282875686884, + "learning_rate": 8.909970962951435e-05, + "loss": 0.011964559555053711, + "num_input_tokens_seen": 60018040, + "step": 3665, + "train_runtime": 29783.1785, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.2218181818181817, + "grad_norm": 0.007946248166263103, + "learning_rate": 8.909371530428561e-05, + "loss": 0.012657862156629562, + "num_input_tokens_seen": 60034416, + "step": 3666, + "train_runtime": 29791.2958, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.2224242424242426, + "grad_norm": 0.010118436068296432, + "learning_rate": 8.908771953305648e-05, + "loss": 0.012623686343431473, + "num_input_tokens_seen": 60050792, + "step": 3667, + "train_runtime": 29799.4144, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.223030303030303, + "grad_norm": 0.010533769614994526, + "learning_rate": 8.908172231604873e-05, + "loss": 0.012056194245815277, + "num_input_tokens_seen": 60067168, + "step": 3668, + "train_runtime": 29807.5331, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.2236363636363636, + "grad_norm": 0.030545897781848907, + "learning_rate": 8.907572365348416e-05, + "loss": 0.012916878797113895, + "num_input_tokens_seen": 60083544, + "step": 3669, + "train_runtime": 29815.6463, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.224242424242424, + "grad_norm": 0.005548179615288973, + "learning_rate": 8.906972354558469e-05, + "loss": 0.011496108956634998, + "num_input_tokens_seen": 60099920, + "step": 3670, + "train_runtime": 29823.7611, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.2248484848484846, + "grad_norm": 0.007247095461934805, + "learning_rate": 8.906372199257223e-05, + "loss": 0.01363338902592659, + "num_input_tokens_seen": 60116296, + "step": 3671, + "train_runtime": 29831.875, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.2254545454545456, + "grad_norm": 0.01965804398059845, + "learning_rate": 8.905771899466875e-05, + "loss": 0.01304022315889597, + "num_input_tokens_seen": 60132672, + "step": 3672, + "train_runtime": 29839.99, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.226060606060606, + "grad_norm": 0.014896688051521778, + "learning_rate": 8.905171455209631e-05, + "loss": 0.012952609919011593, + "num_input_tokens_seen": 60149048, + "step": 3673, + "train_runtime": 29848.1095, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.2266666666666666, + "grad_norm": 0.008357114158570766, + "learning_rate": 8.9045708665077e-05, + "loss": 0.011900687590241432, + "num_input_tokens_seen": 60165424, + "step": 3674, + "train_runtime": 29856.2329, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.227272727272727, + "grad_norm": 0.013724857941269875, + "learning_rate": 8.903970133383297e-05, + "loss": 0.011536635458469391, + "num_input_tokens_seen": 60181800, + "step": 3675, + "train_runtime": 29864.3458, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.227878787878788, + "grad_norm": 0.006729908287525177, + "learning_rate": 8.90336925585864e-05, + "loss": 0.012406526133418083, + "num_input_tokens_seen": 60198176, + "step": 3676, + "train_runtime": 29872.4534, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.2284848484848485, + "grad_norm": 0.00586884468793869, + "learning_rate": 8.902768233955958e-05, + "loss": 0.011684720404446125, + "num_input_tokens_seen": 60214552, + "step": 3677, + "train_runtime": 29880.565, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.229090909090909, + "grad_norm": 0.00459505058825016, + "learning_rate": 8.902167067697477e-05, + "loss": 0.012015356682240963, + "num_input_tokens_seen": 60230928, + "step": 3678, + "train_runtime": 29888.6788, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.2296969696969695, + "grad_norm": 0.006937180645763874, + "learning_rate": 8.901565757105437e-05, + "loss": 0.012867008335888386, + "num_input_tokens_seen": 60247304, + "step": 3679, + "train_runtime": 29896.7935, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.2303030303030305, + "grad_norm": 0.008642815984785557, + "learning_rate": 8.900964302202078e-05, + "loss": 0.012027869001030922, + "num_input_tokens_seen": 60263680, + "step": 3680, + "train_runtime": 29904.9075, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.230909090909091, + "grad_norm": 0.009581885300576687, + "learning_rate": 8.900362703009644e-05, + "loss": 0.012776635587215424, + "num_input_tokens_seen": 60280056, + "step": 3681, + "train_runtime": 29913.0217, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.2315151515151515, + "grad_norm": 0.01875944249331951, + "learning_rate": 8.899760959550389e-05, + "loss": 0.013734135776758194, + "num_input_tokens_seen": 60296432, + "step": 3682, + "train_runtime": 29921.1351, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.232121212121212, + "grad_norm": 0.008612217381596565, + "learning_rate": 8.899159071846575e-05, + "loss": 0.012646391056478024, + "num_input_tokens_seen": 60312808, + "step": 3683, + "train_runtime": 29929.2454, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.232727272727273, + "grad_norm": 0.011928489431738853, + "learning_rate": 8.898557039920457e-05, + "loss": 0.011622844263911247, + "num_input_tokens_seen": 60329184, + "step": 3684, + "train_runtime": 29937.3581, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.2333333333333334, + "grad_norm": 0.011898646131157875, + "learning_rate": 8.897954863794305e-05, + "loss": 0.010517679154872894, + "num_input_tokens_seen": 60345560, + "step": 3685, + "train_runtime": 29945.469, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.233939393939394, + "grad_norm": 0.011300486512482166, + "learning_rate": 8.897352543490395e-05, + "loss": 0.013875912874937057, + "num_input_tokens_seen": 60361936, + "step": 3686, + "train_runtime": 29953.5902, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.2345454545454544, + "grad_norm": 0.0071717859245836735, + "learning_rate": 8.896750079031005e-05, + "loss": 0.011511060409247875, + "num_input_tokens_seen": 60378312, + "step": 3687, + "train_runtime": 29961.7012, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.2351515151515153, + "grad_norm": 0.005728852469474077, + "learning_rate": 8.896147470438416e-05, + "loss": 0.012795530259609222, + "num_input_tokens_seen": 60394688, + "step": 3688, + "train_runtime": 29969.8163, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.235757575757576, + "grad_norm": 0.004077422432601452, + "learning_rate": 8.89554471773492e-05, + "loss": 0.01207432895898819, + "num_input_tokens_seen": 60411064, + "step": 3689, + "train_runtime": 29977.9329, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.2363636363636363, + "grad_norm": 0.010106992907822132, + "learning_rate": 8.894941820942813e-05, + "loss": 0.012829555198550224, + "num_input_tokens_seen": 60427440, + "step": 3690, + "train_runtime": 29986.0439, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.236969696969697, + "grad_norm": 0.011969654820859432, + "learning_rate": 8.894338780084392e-05, + "loss": 0.010625853203237057, + "num_input_tokens_seen": 60443816, + "step": 3691, + "train_runtime": 29994.1554, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.2375757575757578, + "grad_norm": 0.011020442470908165, + "learning_rate": 8.893735595181962e-05, + "loss": 0.012517699040472507, + "num_input_tokens_seen": 60460192, + "step": 3692, + "train_runtime": 30002.2734, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.2381818181818183, + "grad_norm": 0.009888879954814911, + "learning_rate": 8.893132266257837e-05, + "loss": 0.011072501540184021, + "num_input_tokens_seen": 60476568, + "step": 3693, + "train_runtime": 30010.3922, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.2387878787878788, + "grad_norm": 0.0004854231374338269, + "learning_rate": 8.89252879333433e-05, + "loss": 0.01135720033198595, + "num_input_tokens_seen": 60492944, + "step": 3694, + "train_runtime": 30018.5065, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.2393939393939393, + "grad_norm": 0.0065515311434865, + "learning_rate": 8.891925176433764e-05, + "loss": 0.011831994168460369, + "num_input_tokens_seen": 60509320, + "step": 3695, + "train_runtime": 30026.6188, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.24, + "grad_norm": 0.006119894795119762, + "learning_rate": 8.891321415578464e-05, + "loss": 0.011481634341180325, + "num_input_tokens_seen": 60525696, + "step": 3696, + "train_runtime": 30034.7353, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.2406060606060607, + "grad_norm": 0.006571260746568441, + "learning_rate": 8.890717510790763e-05, + "loss": 0.013411670923233032, + "num_input_tokens_seen": 60542072, + "step": 3697, + "train_runtime": 30042.8507, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.241212121212121, + "grad_norm": 0.008368059061467648, + "learning_rate": 8.890113462093e-05, + "loss": 0.011599770747125149, + "num_input_tokens_seen": 60558448, + "step": 3698, + "train_runtime": 30050.965, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.2418181818181817, + "grad_norm": 0.0047463481314480305, + "learning_rate": 8.889509269507514e-05, + "loss": 0.010899157263338566, + "num_input_tokens_seen": 60574824, + "step": 3699, + "train_runtime": 30059.0815, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.242424242424242, + "grad_norm": 0.007197657600045204, + "learning_rate": 8.888904933056654e-05, + "loss": 0.012830094434320927, + "num_input_tokens_seen": 60591200, + "step": 3700, + "train_runtime": 30067.1921, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.243030303030303, + "grad_norm": 0.004900816362351179, + "learning_rate": 8.888300452762774e-05, + "loss": 0.011792563833296299, + "num_input_tokens_seen": 60607576, + "step": 3701, + "train_runtime": 30076.1536, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.2436363636363637, + "grad_norm": 0.008804868906736374, + "learning_rate": 8.887695828648232e-05, + "loss": 0.011721835471689701, + "num_input_tokens_seen": 60623952, + "step": 3702, + "train_runtime": 30084.2656, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.244242424242424, + "grad_norm": 0.010449771769344807, + "learning_rate": 8.887091060735395e-05, + "loss": 0.0124953743070364, + "num_input_tokens_seen": 60640328, + "step": 3703, + "train_runtime": 30092.4732, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.2448484848484846, + "grad_norm": 0.008419829420745373, + "learning_rate": 8.886486149046627e-05, + "loss": 0.01311418879777193, + "num_input_tokens_seen": 60656704, + "step": 3704, + "train_runtime": 30100.593, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.2454545454545456, + "grad_norm": 0.0049407086335122585, + "learning_rate": 8.885881093604306e-05, + "loss": 0.012327872216701508, + "num_input_tokens_seen": 60673080, + "step": 3705, + "train_runtime": 30108.7078, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.246060606060606, + "grad_norm": 0.004161890130490065, + "learning_rate": 8.88527589443081e-05, + "loss": 0.011168462224304676, + "num_input_tokens_seen": 60689456, + "step": 3706, + "train_runtime": 30116.8229, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.2466666666666666, + "grad_norm": 0.004563709255307913, + "learning_rate": 8.884670551548525e-05, + "loss": 0.012025438249111176, + "num_input_tokens_seen": 60705832, + "step": 3707, + "train_runtime": 30124.9366, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.247272727272727, + "grad_norm": 0.009686720557510853, + "learning_rate": 8.884065064979841e-05, + "loss": 0.012253142893314362, + "num_input_tokens_seen": 60722208, + "step": 3708, + "train_runtime": 30133.0553, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.247878787878788, + "grad_norm": 0.01068910863250494, + "learning_rate": 8.883459434747154e-05, + "loss": 0.012575153261423111, + "num_input_tokens_seen": 60738584, + "step": 3709, + "train_runtime": 30141.1687, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.2484848484848485, + "grad_norm": 0.006846324075013399, + "learning_rate": 8.882853660872867e-05, + "loss": 0.012148548848927021, + "num_input_tokens_seen": 60754960, + "step": 3710, + "train_runtime": 30149.287, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.249090909090909, + "grad_norm": 0.0058296844363212585, + "learning_rate": 8.882247743379383e-05, + "loss": 0.013228103518486023, + "num_input_tokens_seen": 60771336, + "step": 3711, + "train_runtime": 30157.3998, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.2496969696969695, + "grad_norm": 0.005055832210928202, + "learning_rate": 8.881641682289117e-05, + "loss": 0.01328389160335064, + "num_input_tokens_seen": 60787712, + "step": 3712, + "train_runtime": 30165.5105, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.2503030303030305, + "grad_norm": 0.006351431831717491, + "learning_rate": 8.881035477624483e-05, + "loss": 0.011203351430594921, + "num_input_tokens_seen": 60804088, + "step": 3713, + "train_runtime": 30173.6214, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.250909090909091, + "grad_norm": 0.005891186185181141, + "learning_rate": 8.880429129407904e-05, + "loss": 0.012171884998679161, + "num_input_tokens_seen": 60820464, + "step": 3714, + "train_runtime": 30181.7337, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.2515151515151515, + "grad_norm": 0.010121798142790794, + "learning_rate": 8.879822637661809e-05, + "loss": 0.011959838680922985, + "num_input_tokens_seen": 60836840, + "step": 3715, + "train_runtime": 30189.8449, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.252121212121212, + "grad_norm": 0.008183280937373638, + "learning_rate": 8.879216002408631e-05, + "loss": 0.013505983166396618, + "num_input_tokens_seen": 60853216, + "step": 3716, + "train_runtime": 30197.9576, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.252727272727273, + "grad_norm": 0.006862631533294916, + "learning_rate": 8.878609223670806e-05, + "loss": 0.012125739827752113, + "num_input_tokens_seen": 60869592, + "step": 3717, + "train_runtime": 30206.0714, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.2533333333333334, + "grad_norm": 0.007013807073235512, + "learning_rate": 8.87800230147078e-05, + "loss": 0.012417798861861229, + "num_input_tokens_seen": 60885968, + "step": 3718, + "train_runtime": 30214.1929, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.253939393939394, + "grad_norm": 0.0071312859654426575, + "learning_rate": 8.877395235831001e-05, + "loss": 0.012290849350392818, + "num_input_tokens_seen": 60902344, + "step": 3719, + "train_runtime": 30222.3058, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.2545454545454544, + "grad_norm": 0.009357710368931293, + "learning_rate": 8.876788026773922e-05, + "loss": 0.01159263588488102, + "num_input_tokens_seen": 60918720, + "step": 3720, + "train_runtime": 30230.4202, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.255151515151515, + "grad_norm": 0.016083406284451485, + "learning_rate": 8.876180674322005e-05, + "loss": 0.011838029138743877, + "num_input_tokens_seen": 60935096, + "step": 3721, + "train_runtime": 30238.5359, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.255757575757576, + "grad_norm": 0.0025645827408879995, + "learning_rate": 8.875573178497714e-05, + "loss": 0.010505922138690948, + "num_input_tokens_seen": 60951472, + "step": 3722, + "train_runtime": 30246.6542, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.2563636363636363, + "grad_norm": 0.005660255905240774, + "learning_rate": 8.874965539323517e-05, + "loss": 0.012189293280243874, + "num_input_tokens_seen": 60967848, + "step": 3723, + "train_runtime": 30254.7693, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.256969696969697, + "grad_norm": 0.010649233125150204, + "learning_rate": 8.87435775682189e-05, + "loss": 0.012099821120500565, + "num_input_tokens_seen": 60984224, + "step": 3724, + "train_runtime": 30262.8821, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.257575757575758, + "grad_norm": 0.005941980052739382, + "learning_rate": 8.873749831015315e-05, + "loss": 0.011310269124805927, + "num_input_tokens_seen": 61000600, + "step": 3725, + "train_runtime": 30270.9994, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.2581818181818183, + "grad_norm": 0.010598689317703247, + "learning_rate": 8.87314176192628e-05, + "loss": 0.012694881297647953, + "num_input_tokens_seen": 61016976, + "step": 3726, + "train_runtime": 30279.1172, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.258787878787879, + "grad_norm": 0.01025476586073637, + "learning_rate": 8.872533549577271e-05, + "loss": 0.012136287987232208, + "num_input_tokens_seen": 61033352, + "step": 3727, + "train_runtime": 30287.2334, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.2593939393939393, + "grad_norm": 0.006790067069232464, + "learning_rate": 8.871925193990789e-05, + "loss": 0.013514727354049683, + "num_input_tokens_seen": 61049728, + "step": 3728, + "train_runtime": 30295.3472, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.26, + "grad_norm": 0.007537974044680595, + "learning_rate": 8.871316695189334e-05, + "loss": 0.012649727053940296, + "num_input_tokens_seen": 61066104, + "step": 3729, + "train_runtime": 30303.4656, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.2606060606060607, + "grad_norm": 0.0073478384874761105, + "learning_rate": 8.870708053195413e-05, + "loss": 0.011982829309999943, + "num_input_tokens_seen": 61082480, + "step": 3730, + "train_runtime": 30311.5858, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.2612121212121212, + "grad_norm": 0.028094328939914703, + "learning_rate": 8.87009926803154e-05, + "loss": 0.012677352875471115, + "num_input_tokens_seen": 61098856, + "step": 3731, + "train_runtime": 30319.7025, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.2618181818181817, + "grad_norm": 0.011618182994425297, + "learning_rate": 8.86949033972023e-05, + "loss": 0.012250116094946861, + "num_input_tokens_seen": 61115232, + "step": 3732, + "train_runtime": 30327.8211, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.2624242424242427, + "grad_norm": 0.010365051217377186, + "learning_rate": 8.868881268284008e-05, + "loss": 0.011696823872625828, + "num_input_tokens_seen": 61131608, + "step": 3733, + "train_runtime": 30335.9403, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.263030303030303, + "grad_norm": 0.006496044807136059, + "learning_rate": 8.868272053745403e-05, + "loss": 0.011718837544322014, + "num_input_tokens_seen": 61147984, + "step": 3734, + "train_runtime": 30344.0549, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.2636363636363637, + "grad_norm": 0.005344794597476721, + "learning_rate": 8.867662696126948e-05, + "loss": 0.010989891365170479, + "num_input_tokens_seen": 61164360, + "step": 3735, + "train_runtime": 30352.1698, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.264242424242424, + "grad_norm": 0.012590425089001656, + "learning_rate": 8.867053195451183e-05, + "loss": 0.013337450101971626, + "num_input_tokens_seen": 61180736, + "step": 3736, + "train_runtime": 30360.2893, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.2648484848484847, + "grad_norm": 0.010968165472149849, + "learning_rate": 8.866443551740648e-05, + "loss": 0.013668050989508629, + "num_input_tokens_seen": 61197112, + "step": 3737, + "train_runtime": 30368.4071, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.2654545454545456, + "grad_norm": 0.00999407097697258, + "learning_rate": 8.865833765017899e-05, + "loss": 0.01268429309129715, + "num_input_tokens_seen": 61213488, + "step": 3738, + "train_runtime": 30376.5202, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.266060606060606, + "grad_norm": 0.012314529158174992, + "learning_rate": 8.865223835305485e-05, + "loss": 0.01283974852412939, + "num_input_tokens_seen": 61229864, + "step": 3739, + "train_runtime": 30384.635, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.0056611280888319016, + "learning_rate": 8.864613762625969e-05, + "loss": 0.01165764406323433, + "num_input_tokens_seen": 61246240, + "step": 3740, + "train_runtime": 30392.7519, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.267272727272727, + "grad_norm": 0.0059904055669903755, + "learning_rate": 8.864003547001915e-05, + "loss": 0.011943137273192406, + "num_input_tokens_seen": 61262616, + "step": 3741, + "train_runtime": 30400.8654, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.267878787878788, + "grad_norm": 0.00643067667260766, + "learning_rate": 8.863393188455897e-05, + "loss": 0.01149215642362833, + "num_input_tokens_seen": 61278992, + "step": 3742, + "train_runtime": 30408.9806, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.2684848484848485, + "grad_norm": 0.007379885762929916, + "learning_rate": 8.862782687010487e-05, + "loss": 0.012658249586820602, + "num_input_tokens_seen": 61295368, + "step": 3743, + "train_runtime": 30417.0971, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.269090909090909, + "grad_norm": 0.007340370211750269, + "learning_rate": 8.862172042688268e-05, + "loss": 0.01166062243282795, + "num_input_tokens_seen": 61311744, + "step": 3744, + "train_runtime": 30425.213, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.2696969696969695, + "grad_norm": 0.009426895529031754, + "learning_rate": 8.861561255511826e-05, + "loss": 0.010667637921869755, + "num_input_tokens_seen": 61328120, + "step": 3745, + "train_runtime": 30433.3335, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.2703030303030305, + "grad_norm": 0.009414087980985641, + "learning_rate": 8.860950325503754e-05, + "loss": 0.011794875375926495, + "num_input_tokens_seen": 61344496, + "step": 3746, + "train_runtime": 30441.4479, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.270909090909091, + "grad_norm": 0.003789094975218177, + "learning_rate": 8.860339252686648e-05, + "loss": 0.011640205979347229, + "num_input_tokens_seen": 61360872, + "step": 3747, + "train_runtime": 30449.5634, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.2715151515151515, + "grad_norm": 0.008663547225296497, + "learning_rate": 8.85972803708311e-05, + "loss": 0.011976547539234161, + "num_input_tokens_seen": 61377248, + "step": 3748, + "train_runtime": 30457.6768, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.272121212121212, + "grad_norm": 0.005016832146793604, + "learning_rate": 8.859116678715751e-05, + "loss": 0.011901703663170338, + "num_input_tokens_seen": 61393624, + "step": 3749, + "train_runtime": 30465.787, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.0046637230552732944, + "learning_rate": 8.85850517760718e-05, + "loss": 0.011695494875311852, + "num_input_tokens_seen": 61410000, + "step": 3750, + "train_runtime": 30473.9004, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.2733333333333334, + "grad_norm": 0.007660789415240288, + "learning_rate": 8.857893533780015e-05, + "loss": 0.011524790897965431, + "num_input_tokens_seen": 61426376, + "step": 3751, + "train_runtime": 30482.0201, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.273939393939394, + "grad_norm": 0.005929175764322281, + "learning_rate": 8.857281747256882e-05, + "loss": 0.01320036593824625, + "num_input_tokens_seen": 61442752, + "step": 3752, + "train_runtime": 30490.1361, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.2745454545454544, + "grad_norm": 0.013882546685636044, + "learning_rate": 8.856669818060409e-05, + "loss": 0.01192145049571991, + "num_input_tokens_seen": 61459128, + "step": 3753, + "train_runtime": 30498.2521, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.2751515151515154, + "grad_norm": 0.003313496010378003, + "learning_rate": 8.85605774621323e-05, + "loss": 0.011736012995243073, + "num_input_tokens_seen": 61475504, + "step": 3754, + "train_runtime": 30506.3624, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.275757575757576, + "grad_norm": 0.005379652138799429, + "learning_rate": 8.855445531737985e-05, + "loss": 0.012325488962233067, + "num_input_tokens_seen": 61491880, + "step": 3755, + "train_runtime": 30514.4757, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.2763636363636364, + "grad_norm": 0.012600576505064964, + "learning_rate": 8.854833174657317e-05, + "loss": 0.012256315909326077, + "num_input_tokens_seen": 61508256, + "step": 3756, + "train_runtime": 30522.5913, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.276969696969697, + "grad_norm": 0.011063162237405777, + "learning_rate": 8.854220674993876e-05, + "loss": 0.013093437068164349, + "num_input_tokens_seen": 61524632, + "step": 3757, + "train_runtime": 30530.7003, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.2775757575757574, + "grad_norm": 0.011599640361964703, + "learning_rate": 8.85360803277032e-05, + "loss": 0.011047718115150928, + "num_input_tokens_seen": 61541008, + "step": 3758, + "train_runtime": 30538.8161, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.2781818181818183, + "grad_norm": 0.011033455841243267, + "learning_rate": 8.852995248009305e-05, + "loss": 0.012573250569403172, + "num_input_tokens_seen": 61557384, + "step": 3759, + "train_runtime": 30546.9326, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.278787878787879, + "grad_norm": 0.01250431314110756, + "learning_rate": 8.852382320733501e-05, + "loss": 0.011132653802633286, + "num_input_tokens_seen": 61573760, + "step": 3760, + "train_runtime": 30555.0493, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.2793939393939393, + "grad_norm": 0.008397900499403477, + "learning_rate": 8.851769250965577e-05, + "loss": 0.012294886633753777, + "num_input_tokens_seen": 61590136, + "step": 3761, + "train_runtime": 30563.1648, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 0.0026903818361461163, + "learning_rate": 8.851156038728209e-05, + "loss": 0.012650152668356895, + "num_input_tokens_seen": 61606512, + "step": 3762, + "train_runtime": 30571.2819, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.2806060606060607, + "grad_norm": 0.006723749917000532, + "learning_rate": 8.850542684044078e-05, + "loss": 0.011322797276079655, + "num_input_tokens_seen": 61622888, + "step": 3763, + "train_runtime": 30579.3977, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.2812121212121212, + "grad_norm": 0.006619950756430626, + "learning_rate": 8.849929186935874e-05, + "loss": 0.012670768424868584, + "num_input_tokens_seen": 61639264, + "step": 3764, + "train_runtime": 30587.5168, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.2818181818181817, + "grad_norm": 0.004861629568040371, + "learning_rate": 8.849315547426284e-05, + "loss": 0.011259309016168118, + "num_input_tokens_seen": 61655640, + "step": 3765, + "train_runtime": 30595.6326, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.2824242424242422, + "grad_norm": 0.008465851657092571, + "learning_rate": 8.84870176553801e-05, + "loss": 0.010890805162489414, + "num_input_tokens_seen": 61672016, + "step": 3766, + "train_runtime": 30603.7507, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.283030303030303, + "grad_norm": 0.01856720820069313, + "learning_rate": 8.848087841293753e-05, + "loss": 0.011901823803782463, + "num_input_tokens_seen": 61688392, + "step": 3767, + "train_runtime": 30611.8653, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.2836363636363637, + "grad_norm": 0.006428460590541363, + "learning_rate": 8.84747377471622e-05, + "loss": 0.013656743802130222, + "num_input_tokens_seen": 61704768, + "step": 3768, + "train_runtime": 30619.9787, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.284242424242424, + "grad_norm": 0.008296936750411987, + "learning_rate": 8.846859565828124e-05, + "loss": 0.012134167365729809, + "num_input_tokens_seen": 61721144, + "step": 3769, + "train_runtime": 30628.0958, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.2848484848484847, + "grad_norm": 0.006756153889000416, + "learning_rate": 8.846245214652185e-05, + "loss": 0.011300654150545597, + "num_input_tokens_seen": 61737520, + "step": 3770, + "train_runtime": 30636.2146, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.2854545454545456, + "grad_norm": 0.007698638364672661, + "learning_rate": 8.845630721211124e-05, + "loss": 0.011657550930976868, + "num_input_tokens_seen": 61753896, + "step": 3771, + "train_runtime": 30644.3343, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.286060606060606, + "grad_norm": 0.00994476955384016, + "learning_rate": 8.845016085527673e-05, + "loss": 0.010930661112070084, + "num_input_tokens_seen": 61770272, + "step": 3772, + "train_runtime": 30652.4485, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.2866666666666666, + "grad_norm": 0.009451497346162796, + "learning_rate": 8.844401307624566e-05, + "loss": 0.012202934361994267, + "num_input_tokens_seen": 61786648, + "step": 3773, + "train_runtime": 30660.5626, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.287272727272727, + "grad_norm": 0.010669391602277756, + "learning_rate": 8.84378638752454e-05, + "loss": 0.013062708079814911, + "num_input_tokens_seen": 61803024, + "step": 3774, + "train_runtime": 30668.6819, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.287878787878788, + "grad_norm": 0.005233460105955601, + "learning_rate": 8.843171325250341e-05, + "loss": 0.011414062231779099, + "num_input_tokens_seen": 61819400, + "step": 3775, + "train_runtime": 30676.7958, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.2884848484848486, + "grad_norm": 0.006945099215954542, + "learning_rate": 8.842556120824719e-05, + "loss": 0.013735410757362843, + "num_input_tokens_seen": 61835776, + "step": 3776, + "train_runtime": 30684.9131, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.289090909090909, + "grad_norm": 0.011563980020582676, + "learning_rate": 8.841940774270429e-05, + "loss": 0.011850640177726746, + "num_input_tokens_seen": 61852152, + "step": 3777, + "train_runtime": 30693.0323, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.2896969696969696, + "grad_norm": 0.019558526575565338, + "learning_rate": 8.841325285610232e-05, + "loss": 0.012019994668662548, + "num_input_tokens_seen": 61868528, + "step": 3778, + "train_runtime": 30701.1474, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.29030303030303, + "grad_norm": 0.0072617363184690475, + "learning_rate": 8.840709654866892e-05, + "loss": 0.012482079677283764, + "num_input_tokens_seen": 61884904, + "step": 3779, + "train_runtime": 30709.2577, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.290909090909091, + "grad_norm": 0.008295083418488503, + "learning_rate": 8.840093882063182e-05, + "loss": 0.012043890543282032, + "num_input_tokens_seen": 61901280, + "step": 3780, + "train_runtime": 30717.3684, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.2915151515151515, + "grad_norm": 0.010095818899571896, + "learning_rate": 8.839477967221879e-05, + "loss": 0.0124919218942523, + "num_input_tokens_seen": 61917656, + "step": 3781, + "train_runtime": 30725.4828, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.292121212121212, + "grad_norm": 0.004255611915141344, + "learning_rate": 8.838861910365762e-05, + "loss": 0.01309207733720541, + "num_input_tokens_seen": 61934032, + "step": 3782, + "train_runtime": 30733.5973, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.292727272727273, + "grad_norm": 0.00850534439086914, + "learning_rate": 8.838245711517618e-05, + "loss": 0.012254266068339348, + "num_input_tokens_seen": 61950408, + "step": 3783, + "train_runtime": 30741.7101, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.2933333333333334, + "grad_norm": 0.00831502303481102, + "learning_rate": 8.83762937070024e-05, + "loss": 0.01236623153090477, + "num_input_tokens_seen": 61966784, + "step": 3784, + "train_runtime": 30749.8321, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.293939393939394, + "grad_norm": 0.00466503482311964, + "learning_rate": 8.837012887936426e-05, + "loss": 0.012684816494584084, + "num_input_tokens_seen": 61983160, + "step": 3785, + "train_runtime": 30757.9435, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.2945454545454544, + "grad_norm": 0.007791228126734495, + "learning_rate": 8.836396263248976e-05, + "loss": 0.012480277568101883, + "num_input_tokens_seen": 61999536, + "step": 3786, + "train_runtime": 30766.0566, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.295151515151515, + "grad_norm": 0.008186898194253445, + "learning_rate": 8.835779496660701e-05, + "loss": 0.012753861956298351, + "num_input_tokens_seen": 62015912, + "step": 3787, + "train_runtime": 30774.1735, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.295757575757576, + "grad_norm": 0.008967457339167595, + "learning_rate": 8.835162588194411e-05, + "loss": 0.011731700040400028, + "num_input_tokens_seen": 62032288, + "step": 3788, + "train_runtime": 30782.2879, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.2963636363636364, + "grad_norm": 0.003565243910998106, + "learning_rate": 8.834545537872925e-05, + "loss": 0.013852463103830814, + "num_input_tokens_seen": 62048664, + "step": 3789, + "train_runtime": 30790.4021, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.296969696969697, + "grad_norm": 0.0027453871443867683, + "learning_rate": 8.833928345719069e-05, + "loss": 0.011183127760887146, + "num_input_tokens_seen": 62065040, + "step": 3790, + "train_runtime": 30798.518, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.2975757575757574, + "grad_norm": 0.007668246980756521, + "learning_rate": 8.833311011755668e-05, + "loss": 0.011989946477115154, + "num_input_tokens_seen": 62081416, + "step": 3791, + "train_runtime": 30806.6335, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.2981818181818183, + "grad_norm": 0.025066649541258812, + "learning_rate": 8.832693536005558e-05, + "loss": 0.014158163219690323, + "num_input_tokens_seen": 62097792, + "step": 3792, + "train_runtime": 30814.7498, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 2.298787878787879, + "grad_norm": 0.017292816191911697, + "learning_rate": 8.832075918491579e-05, + "loss": 0.011756017804145813, + "num_input_tokens_seen": 62114168, + "step": 3793, + "train_runtime": 30822.8675, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.2993939393939393, + "grad_norm": 0.0034284384455531836, + "learning_rate": 8.831458159236575e-05, + "loss": 0.012419110164046288, + "num_input_tokens_seen": 62130544, + "step": 3794, + "train_runtime": 30830.9797, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.3, + "grad_norm": 0.00656497897580266, + "learning_rate": 8.830840258263393e-05, + "loss": 0.012269002385437489, + "num_input_tokens_seen": 62146920, + "step": 3795, + "train_runtime": 30839.0917, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 2.3006060606060608, + "grad_norm": 0.009967640973627567, + "learning_rate": 8.83022221559489e-05, + "loss": 0.012140346691012383, + "num_input_tokens_seen": 62163296, + "step": 3796, + "train_runtime": 30847.2027, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 2.3012121212121213, + "grad_norm": 0.007896405644714832, + "learning_rate": 8.829604031253929e-05, + "loss": 0.013535144738852978, + "num_input_tokens_seen": 62179672, + "step": 3797, + "train_runtime": 30855.3143, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.3018181818181818, + "grad_norm": 0.011223090812563896, + "learning_rate": 8.828985705263369e-05, + "loss": 0.012020010501146317, + "num_input_tokens_seen": 62196048, + "step": 3798, + "train_runtime": 30863.4314, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 2.3024242424242423, + "grad_norm": 0.006788720842450857, + "learning_rate": 8.828367237646087e-05, + "loss": 0.012599104084074497, + "num_input_tokens_seen": 62212424, + "step": 3799, + "train_runtime": 30871.5457, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.006533706095069647, + "learning_rate": 8.827748628424956e-05, + "loss": 0.012083176523447037, + "num_input_tokens_seen": 62228800, + "step": 3800, + "train_runtime": 30879.6599, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.3036363636363637, + "grad_norm": 0.008081979118287563, + "learning_rate": 8.827129877622857e-05, + "loss": 0.012634792365133762, + "num_input_tokens_seen": 62245176, + "step": 3801, + "train_runtime": 30888.7316, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.304242424242424, + "grad_norm": 0.0023193880915641785, + "learning_rate": 8.826510985262677e-05, + "loss": 0.011524361558258533, + "num_input_tokens_seen": 62261552, + "step": 3802, + "train_runtime": 30896.8426, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.3048484848484847, + "grad_norm": 0.006892868783324957, + "learning_rate": 8.825891951367307e-05, + "loss": 0.011532147414982319, + "num_input_tokens_seen": 62277928, + "step": 3803, + "train_runtime": 30904.9545, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.3054545454545456, + "grad_norm": 0.0047895400784909725, + "learning_rate": 8.825272775959644e-05, + "loss": 0.013725175522267818, + "num_input_tokens_seen": 62294304, + "step": 3804, + "train_runtime": 30913.0669, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.306060606060606, + "grad_norm": 0.0064620282500982285, + "learning_rate": 8.824653459062591e-05, + "loss": 0.012814794667065144, + "num_input_tokens_seen": 62310680, + "step": 3805, + "train_runtime": 30921.1796, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.3066666666666666, + "grad_norm": 0.008356164209544659, + "learning_rate": 8.824034000699055e-05, + "loss": 0.01243899017572403, + "num_input_tokens_seen": 62327056, + "step": 3806, + "train_runtime": 30929.2943, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.307272727272727, + "grad_norm": 0.004934507422149181, + "learning_rate": 8.823414400891948e-05, + "loss": 0.011569508351385593, + "num_input_tokens_seen": 62343432, + "step": 3807, + "train_runtime": 30937.41, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.3078787878787876, + "grad_norm": 0.031424786895513535, + "learning_rate": 8.822794659664187e-05, + "loss": 0.01256850641220808, + "num_input_tokens_seen": 62359808, + "step": 3808, + "train_runtime": 30945.532, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.3084848484848486, + "grad_norm": 0.011678419075906277, + "learning_rate": 8.822174777038697e-05, + "loss": 0.012379190884530544, + "num_input_tokens_seen": 62376184, + "step": 3809, + "train_runtime": 30953.6454, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.309090909090909, + "grad_norm": 0.009252636693418026, + "learning_rate": 8.821554753038406e-05, + "loss": 0.012269056402146816, + "num_input_tokens_seen": 62392560, + "step": 3810, + "train_runtime": 30961.7611, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.3096969696969696, + "grad_norm": 0.012772388756275177, + "learning_rate": 8.820934587686247e-05, + "loss": 0.013819447718560696, + "num_input_tokens_seen": 62408936, + "step": 3811, + "train_runtime": 30969.8761, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.3103030303030305, + "grad_norm": 0.016963202506303787, + "learning_rate": 8.820314281005158e-05, + "loss": 0.013393501751124859, + "num_input_tokens_seen": 62425312, + "step": 3812, + "train_runtime": 30977.9916, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.310909090909091, + "grad_norm": 0.008827922865748405, + "learning_rate": 8.819693833018083e-05, + "loss": 0.011720137670636177, + "num_input_tokens_seen": 62441688, + "step": 3813, + "train_runtime": 30986.1051, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.3115151515151515, + "grad_norm": 0.012870227918028831, + "learning_rate": 8.81907324374797e-05, + "loss": 0.0119565948843956, + "num_input_tokens_seen": 62458064, + "step": 3814, + "train_runtime": 30994.2197, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.312121212121212, + "grad_norm": 0.010535592213273048, + "learning_rate": 8.818452513217778e-05, + "loss": 0.011577087454497814, + "num_input_tokens_seen": 62474440, + "step": 3815, + "train_runtime": 31002.3329, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.3127272727272725, + "grad_norm": 0.008618311025202274, + "learning_rate": 8.817831641450462e-05, + "loss": 0.011819293722510338, + "num_input_tokens_seen": 62490816, + "step": 3816, + "train_runtime": 31010.4426, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.3133333333333335, + "grad_norm": 0.006642166990786791, + "learning_rate": 8.817210628468991e-05, + "loss": 0.012905126437544823, + "num_input_tokens_seen": 62507192, + "step": 3817, + "train_runtime": 31018.5576, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.313939393939394, + "grad_norm": 0.005279494449496269, + "learning_rate": 8.81658947429633e-05, + "loss": 0.011396235786378384, + "num_input_tokens_seen": 62523568, + "step": 3818, + "train_runtime": 31026.6711, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.3145454545454545, + "grad_norm": 0.005412065424025059, + "learning_rate": 8.815968178955456e-05, + "loss": 0.01221628300845623, + "num_input_tokens_seen": 62539944, + "step": 3819, + "train_runtime": 31034.7855, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.315151515151515, + "grad_norm": 0.010122607462108135, + "learning_rate": 8.815346742469352e-05, + "loss": 0.013018102385103703, + "num_input_tokens_seen": 62556320, + "step": 3820, + "train_runtime": 31042.8967, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.315757575757576, + "grad_norm": 0.0067726317793130875, + "learning_rate": 8.814725164861001e-05, + "loss": 0.011113530024886131, + "num_input_tokens_seen": 62572696, + "step": 3821, + "train_runtime": 31051.0128, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.3163636363636364, + "grad_norm": 0.007007678970694542, + "learning_rate": 8.814103446153396e-05, + "loss": 0.011365599930286407, + "num_input_tokens_seen": 62589072, + "step": 3822, + "train_runtime": 31059.1319, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.316969696969697, + "grad_norm": 0.006877017207443714, + "learning_rate": 8.813481586369532e-05, + "loss": 0.013382461853325367, + "num_input_tokens_seen": 62605448, + "step": 3823, + "train_runtime": 31067.2418, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.3175757575757574, + "grad_norm": 0.009172811172902584, + "learning_rate": 8.812859585532411e-05, + "loss": 0.011877622455358505, + "num_input_tokens_seen": 62621824, + "step": 3824, + "train_runtime": 31075.3563, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.3181818181818183, + "grad_norm": 0.0055466280318796635, + "learning_rate": 8.81223744366504e-05, + "loss": 0.011979153379797935, + "num_input_tokens_seen": 62638200, + "step": 3825, + "train_runtime": 31083.4744, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.318787878787879, + "grad_norm": 0.011318989098072052, + "learning_rate": 8.811615160790427e-05, + "loss": 0.012014471925795078, + "num_input_tokens_seen": 62654576, + "step": 3826, + "train_runtime": 31091.593, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.3193939393939393, + "grad_norm": 0.005611430387943983, + "learning_rate": 8.810992736931594e-05, + "loss": 0.012370433658361435, + "num_input_tokens_seen": 62670952, + "step": 3827, + "train_runtime": 31099.7092, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.32, + "grad_norm": 0.005884826648980379, + "learning_rate": 8.810370172111559e-05, + "loss": 0.011997217312455177, + "num_input_tokens_seen": 62687328, + "step": 3828, + "train_runtime": 31107.8319, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.320606060606061, + "grad_norm": 0.008681188337504864, + "learning_rate": 8.809747466353356e-05, + "loss": 0.011951385997235775, + "num_input_tokens_seen": 62703704, + "step": 3829, + "train_runtime": 31115.946, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.3212121212121213, + "grad_norm": 0.006343021057546139, + "learning_rate": 8.80912461968001e-05, + "loss": 0.012676788493990898, + "num_input_tokens_seen": 62720080, + "step": 3830, + "train_runtime": 31124.0602, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.321818181818182, + "grad_norm": 0.02809157408773899, + "learning_rate": 8.808501632114563e-05, + "loss": 0.012784118764102459, + "num_input_tokens_seen": 62736456, + "step": 3831, + "train_runtime": 31132.1724, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.3224242424242423, + "grad_norm": 0.007597580552101135, + "learning_rate": 8.807878503680056e-05, + "loss": 0.012820257805287838, + "num_input_tokens_seen": 62752832, + "step": 3832, + "train_runtime": 31140.2903, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.3230303030303032, + "grad_norm": 0.011442799121141434, + "learning_rate": 8.80725523439954e-05, + "loss": 0.01274331659078598, + "num_input_tokens_seen": 62769208, + "step": 3833, + "train_runtime": 31148.4048, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.3236363636363637, + "grad_norm": 0.020038971677422523, + "learning_rate": 8.806631824296068e-05, + "loss": 0.012703890912234783, + "num_input_tokens_seen": 62785584, + "step": 3834, + "train_runtime": 31156.521, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.324242424242424, + "grad_norm": 0.005542220082134008, + "learning_rate": 8.806008273392698e-05, + "loss": 0.01139106322079897, + "num_input_tokens_seen": 62801960, + "step": 3835, + "train_runtime": 31164.6348, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.3248484848484847, + "grad_norm": 0.003856452414765954, + "learning_rate": 8.805384581712492e-05, + "loss": 0.011459710076451302, + "num_input_tokens_seen": 62818336, + "step": 3836, + "train_runtime": 31172.7486, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.325454545454545, + "grad_norm": 0.008005725219845772, + "learning_rate": 8.804760749278522e-05, + "loss": 0.01386493630707264, + "num_input_tokens_seen": 62834712, + "step": 3837, + "train_runtime": 31180.8619, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.326060606060606, + "grad_norm": 0.007413184270262718, + "learning_rate": 8.80413677611386e-05, + "loss": 0.012458150275051594, + "num_input_tokens_seen": 62851088, + "step": 3838, + "train_runtime": 31188.9752, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.3266666666666667, + "grad_norm": 0.006525777745991945, + "learning_rate": 8.803512662241589e-05, + "loss": 0.01186311710625887, + "num_input_tokens_seen": 62867464, + "step": 3839, + "train_runtime": 31197.0905, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.327272727272727, + "grad_norm": 0.007110640872269869, + "learning_rate": 8.802888407684791e-05, + "loss": 0.011905853636562824, + "num_input_tokens_seen": 62883840, + "step": 3840, + "train_runtime": 31205.2066, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.327878787878788, + "grad_norm": 0.006256428547203541, + "learning_rate": 8.802264012466557e-05, + "loss": 0.011172623373568058, + "num_input_tokens_seen": 62900216, + "step": 3841, + "train_runtime": 31213.3215, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.3284848484848486, + "grad_norm": 0.008018501102924347, + "learning_rate": 8.801639476609979e-05, + "loss": 0.012479901313781738, + "num_input_tokens_seen": 62916592, + "step": 3842, + "train_runtime": 31221.4325, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.329090909090909, + "grad_norm": 0.01011030375957489, + "learning_rate": 8.801014800138164e-05, + "loss": 0.013398240320384502, + "num_input_tokens_seen": 62932968, + "step": 3843, + "train_runtime": 31229.5477, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.3296969696969696, + "grad_norm": 0.01473653968423605, + "learning_rate": 8.800389983074211e-05, + "loss": 0.01274619810283184, + "num_input_tokens_seen": 62949344, + "step": 3844, + "train_runtime": 31237.6629, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.33030303030303, + "grad_norm": 0.006297265645116568, + "learning_rate": 8.799765025441235e-05, + "loss": 0.011753606610000134, + "num_input_tokens_seen": 62965720, + "step": 3845, + "train_runtime": 31245.7774, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.330909090909091, + "grad_norm": 0.009894706308841705, + "learning_rate": 8.79913992726235e-05, + "loss": 0.013425085693597794, + "num_input_tokens_seen": 62982096, + "step": 3846, + "train_runtime": 31253.8878, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.3315151515151515, + "grad_norm": 0.015656357631087303, + "learning_rate": 8.798514688560678e-05, + "loss": 0.012578295543789864, + "num_input_tokens_seen": 62998472, + "step": 3847, + "train_runtime": 31262.0009, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.332121212121212, + "grad_norm": 0.009474585764110088, + "learning_rate": 8.797889309359343e-05, + "loss": 0.012363191694021225, + "num_input_tokens_seen": 63014848, + "step": 3848, + "train_runtime": 31270.1147, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.3327272727272725, + "grad_norm": 0.007632074877619743, + "learning_rate": 8.79726378968148e-05, + "loss": 0.01290170382708311, + "num_input_tokens_seen": 63031224, + "step": 3849, + "train_runtime": 31278.2312, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.015381796285510063, + "learning_rate": 8.796638129550223e-05, + "loss": 0.013437875546514988, + "num_input_tokens_seen": 63047600, + "step": 3850, + "train_runtime": 31286.345, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.333939393939394, + "grad_norm": 0.0024572880938649178, + "learning_rate": 8.796012328988716e-05, + "loss": 0.012945571914315224, + "num_input_tokens_seen": 63063976, + "step": 3851, + "train_runtime": 31294.4586, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.3345454545454545, + "grad_norm": 0.03190493956208229, + "learning_rate": 8.795386388020106e-05, + "loss": 0.013913745060563087, + "num_input_tokens_seen": 63080352, + "step": 3852, + "train_runtime": 31302.5703, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.335151515151515, + "grad_norm": 0.007880356162786484, + "learning_rate": 8.794760306667544e-05, + "loss": 0.012863239273428917, + "num_input_tokens_seen": 63096728, + "step": 3853, + "train_runtime": 31310.6847, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.335757575757576, + "grad_norm": 0.00890274252742529, + "learning_rate": 8.794134084954189e-05, + "loss": 0.014049514196813107, + "num_input_tokens_seen": 63113104, + "step": 3854, + "train_runtime": 31318.7996, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.3363636363636364, + "grad_norm": 0.010360688902437687, + "learning_rate": 8.793507722903203e-05, + "loss": 0.013474004343152046, + "num_input_tokens_seen": 63129480, + "step": 3855, + "train_runtime": 31326.9153, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.336969696969697, + "grad_norm": 0.009520480409264565, + "learning_rate": 8.792881220537751e-05, + "loss": 0.012026038020849228, + "num_input_tokens_seen": 63145856, + "step": 3856, + "train_runtime": 31335.0317, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.3375757575757574, + "grad_norm": 0.004477637819945812, + "learning_rate": 8.792254577881012e-05, + "loss": 0.011705001816153526, + "num_input_tokens_seen": 63162232, + "step": 3857, + "train_runtime": 31343.1442, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.3381818181818184, + "grad_norm": 0.014203597791492939, + "learning_rate": 8.79162779495616e-05, + "loss": 0.01088397391140461, + "num_input_tokens_seen": 63178608, + "step": 3858, + "train_runtime": 31351.2613, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.338787878787879, + "grad_norm": 0.01523826364427805, + "learning_rate": 8.791000871786381e-05, + "loss": 0.012338997796177864, + "num_input_tokens_seen": 63194984, + "step": 3859, + "train_runtime": 31359.3787, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.3393939393939394, + "grad_norm": 0.009995555505156517, + "learning_rate": 8.790373808394862e-05, + "loss": 0.012309486977756023, + "num_input_tokens_seen": 63211360, + "step": 3860, + "train_runtime": 31367.4914, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.34, + "grad_norm": 0.0040956162847578526, + "learning_rate": 8.789746604804796e-05, + "loss": 0.011944938451051712, + "num_input_tokens_seen": 63227736, + "step": 3861, + "train_runtime": 31375.6055, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.340606060606061, + "grad_norm": 0.00751397805288434, + "learning_rate": 8.789119261039385e-05, + "loss": 0.011225864291191101, + "num_input_tokens_seen": 63244112, + "step": 3862, + "train_runtime": 31383.7176, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.3412121212121213, + "grad_norm": 0.007151484955102205, + "learning_rate": 8.78849177712183e-05, + "loss": 0.011669758707284927, + "num_input_tokens_seen": 63260488, + "step": 3863, + "train_runtime": 31391.8328, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.341818181818182, + "grad_norm": 0.0053044590167701244, + "learning_rate": 8.787864153075342e-05, + "loss": 0.011644741520285606, + "num_input_tokens_seen": 63276864, + "step": 3864, + "train_runtime": 31399.9456, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.3424242424242423, + "grad_norm": 0.003996069077402353, + "learning_rate": 8.787236388923137e-05, + "loss": 0.01181547436863184, + "num_input_tokens_seen": 63293240, + "step": 3865, + "train_runtime": 31408.0627, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.343030303030303, + "grad_norm": 0.009208021685481071, + "learning_rate": 8.786608484688432e-05, + "loss": 0.011496206745505333, + "num_input_tokens_seen": 63309616, + "step": 3866, + "train_runtime": 31416.1761, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.3436363636363637, + "grad_norm": 0.003765063127502799, + "learning_rate": 8.785980440394454e-05, + "loss": 0.012080052867531776, + "num_input_tokens_seen": 63325992, + "step": 3867, + "train_runtime": 31424.2906, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.3442424242424242, + "grad_norm": 0.013859529979526997, + "learning_rate": 8.785352256064432e-05, + "loss": 0.013082655146718025, + "num_input_tokens_seen": 63342368, + "step": 3868, + "train_runtime": 31432.4026, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.3448484848484847, + "grad_norm": 0.006903677247464657, + "learning_rate": 8.784723931721602e-05, + "loss": 0.011762754060328007, + "num_input_tokens_seen": 63358744, + "step": 3869, + "train_runtime": 31440.5151, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.3454545454545457, + "grad_norm": 0.011766036041080952, + "learning_rate": 8.784095467389202e-05, + "loss": 0.011996396817266941, + "num_input_tokens_seen": 63375120, + "step": 3870, + "train_runtime": 31448.632, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.346060606060606, + "grad_norm": 0.006888863630592823, + "learning_rate": 8.783466863090482e-05, + "loss": 0.01209091953933239, + "num_input_tokens_seen": 63391496, + "step": 3871, + "train_runtime": 31456.7487, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 0.005084891337901354, + "learning_rate": 8.78283811884869e-05, + "loss": 0.012265229597687721, + "num_input_tokens_seen": 63407872, + "step": 3872, + "train_runtime": 31464.8643, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.347272727272727, + "grad_norm": 0.00594416493549943, + "learning_rate": 8.782209234687083e-05, + "loss": 0.013370493426918983, + "num_input_tokens_seen": 63424248, + "step": 3873, + "train_runtime": 31472.9765, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 2.3478787878787877, + "grad_norm": 0.007754097227007151, + "learning_rate": 8.781580210628922e-05, + "loss": 0.011244947090744972, + "num_input_tokens_seen": 63440624, + "step": 3874, + "train_runtime": 31481.0909, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.006137073040008545, + "learning_rate": 8.780951046697475e-05, + "loss": 0.011433717794716358, + "num_input_tokens_seen": 63457000, + "step": 3875, + "train_runtime": 31489.2069, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.349090909090909, + "grad_norm": 0.008919673040509224, + "learning_rate": 8.780321742916008e-05, + "loss": 0.012265768833458424, + "num_input_tokens_seen": 63473376, + "step": 3876, + "train_runtime": 31497.3208, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 2.3496969696969696, + "grad_norm": 0.0061377896927297115, + "learning_rate": 8.779692299307804e-05, + "loss": 0.012009664438664913, + "num_input_tokens_seen": 63489752, + "step": 3877, + "train_runtime": 31505.4386, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 2.35030303030303, + "grad_norm": 0.009156587533652782, + "learning_rate": 8.779062715896143e-05, + "loss": 0.012589774094522, + "num_input_tokens_seen": 63506128, + "step": 3878, + "train_runtime": 31513.5564, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 2.350909090909091, + "grad_norm": 0.01206312794238329, + "learning_rate": 8.778432992704311e-05, + "loss": 0.011919211596250534, + "num_input_tokens_seen": 63522504, + "step": 3879, + "train_runtime": 31521.6703, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.3515151515151516, + "grad_norm": 0.003118017455562949, + "learning_rate": 8.777803129755599e-05, + "loss": 0.011527287773787975, + "num_input_tokens_seen": 63538880, + "step": 3880, + "train_runtime": 31529.7853, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 2.352121212121212, + "grad_norm": 0.008360499516129494, + "learning_rate": 8.777173127073308e-05, + "loss": 0.012660115957260132, + "num_input_tokens_seen": 63555256, + "step": 3881, + "train_runtime": 31537.8983, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 2.3527272727272726, + "grad_norm": 0.010115530341863632, + "learning_rate": 8.776542984680738e-05, + "loss": 0.012301959097385406, + "num_input_tokens_seen": 63571632, + "step": 3882, + "train_runtime": 31546.0083, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.3533333333333335, + "grad_norm": 0.00837163906544447, + "learning_rate": 8.7759127026012e-05, + "loss": 0.013043307699263096, + "num_input_tokens_seen": 63588008, + "step": 3883, + "train_runtime": 31554.1174, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.353939393939394, + "grad_norm": 0.006768459919840097, + "learning_rate": 8.775282280858e-05, + "loss": 0.012747212313115597, + "num_input_tokens_seen": 63604384, + "step": 3884, + "train_runtime": 31562.2343, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.3545454545454545, + "grad_norm": 0.007784388028085232, + "learning_rate": 8.774651719474463e-05, + "loss": 0.01133689470589161, + "num_input_tokens_seen": 63620760, + "step": 3885, + "train_runtime": 31570.3465, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 2.355151515151515, + "grad_norm": 0.0044767530634999275, + "learning_rate": 8.77402101847391e-05, + "loss": 0.011376718059182167, + "num_input_tokens_seen": 63637136, + "step": 3886, + "train_runtime": 31578.4583, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 2.355757575757576, + "grad_norm": 0.004322875756770372, + "learning_rate": 8.773390177879668e-05, + "loss": 0.010949796997010708, + "num_input_tokens_seen": 63653512, + "step": 3887, + "train_runtime": 31586.5693, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 2.3563636363636364, + "grad_norm": 0.008982558734714985, + "learning_rate": 8.772759197715073e-05, + "loss": 0.013297686353325844, + "num_input_tokens_seen": 63669888, + "step": 3888, + "train_runtime": 31594.6844, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 2.356969696969697, + "grad_norm": 0.005320236552506685, + "learning_rate": 8.772128078003461e-05, + "loss": 0.011769617907702923, + "num_input_tokens_seen": 63686264, + "step": 3889, + "train_runtime": 31602.7968, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 2.3575757575757574, + "grad_norm": 0.008895252831280231, + "learning_rate": 8.771496818768177e-05, + "loss": 0.011837522499263287, + "num_input_tokens_seen": 63702640, + "step": 3890, + "train_runtime": 31610.9094, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 2.3581818181818184, + "grad_norm": 0.00486038438975811, + "learning_rate": 8.770865420032571e-05, + "loss": 0.011740483343601227, + "num_input_tokens_seen": 63719016, + "step": 3891, + "train_runtime": 31619.0347, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 2.358787878787879, + "grad_norm": 0.004801337141543627, + "learning_rate": 8.770233881819997e-05, + "loss": 0.011615416966378689, + "num_input_tokens_seen": 63735392, + "step": 3892, + "train_runtime": 31627.1484, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 2.3593939393939394, + "grad_norm": 0.007693479303270578, + "learning_rate": 8.769602204153813e-05, + "loss": 0.01139110792428255, + "num_input_tokens_seen": 63751768, + "step": 3893, + "train_runtime": 31635.2584, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 2.36, + "grad_norm": 0.00855227466672659, + "learning_rate": 8.768970387057385e-05, + "loss": 0.012721371836960316, + "num_input_tokens_seen": 63768144, + "step": 3894, + "train_runtime": 31643.3682, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.3606060606060604, + "grad_norm": 0.008411828428506851, + "learning_rate": 8.768338430554082e-05, + "loss": 0.012851890176534653, + "num_input_tokens_seen": 63784520, + "step": 3895, + "train_runtime": 31651.4806, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.3612121212121213, + "grad_norm": 0.009419060312211514, + "learning_rate": 8.767706334667279e-05, + "loss": 0.011950638145208359, + "num_input_tokens_seen": 63800896, + "step": 3896, + "train_runtime": 31659.5904, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 2.361818181818182, + "grad_norm": 0.005601715762168169, + "learning_rate": 8.767074099420356e-05, + "loss": 0.011967363767325878, + "num_input_tokens_seen": 63817272, + "step": 3897, + "train_runtime": 31667.7024, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 2.3624242424242423, + "grad_norm": 0.014286424033343792, + "learning_rate": 8.766441724836698e-05, + "loss": 0.011659272015094757, + "num_input_tokens_seen": 63833648, + "step": 3898, + "train_runtime": 31675.8155, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 2.3630303030303033, + "grad_norm": 0.00412064278498292, + "learning_rate": 8.765809210939697e-05, + "loss": 0.011589322239160538, + "num_input_tokens_seen": 63850024, + "step": 3899, + "train_runtime": 31683.9353, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 0.007143765222281218, + "learning_rate": 8.765176557752744e-05, + "loss": 0.012061070650815964, + "num_input_tokens_seen": 63866400, + "step": 3900, + "train_runtime": 31692.046, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 2.3642424242424243, + "grad_norm": 0.004888126160949469, + "learning_rate": 8.764543765299245e-05, + "loss": 0.011578274890780449, + "num_input_tokens_seen": 63882776, + "step": 3901, + "train_runtime": 31701.0096, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.3648484848484848, + "grad_norm": 0.005478084087371826, + "learning_rate": 8.763910833602601e-05, + "loss": 0.011829855851829052, + "num_input_tokens_seen": 63899152, + "step": 3902, + "train_runtime": 31709.1321, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.3654545454545453, + "grad_norm": 0.007182782515883446, + "learning_rate": 8.763277762686227e-05, + "loss": 0.0127269197255373, + "num_input_tokens_seen": 63915528, + "step": 3903, + "train_runtime": 31717.2513, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.366060606060606, + "grad_norm": 0.007466602139174938, + "learning_rate": 8.762644552573535e-05, + "loss": 0.012824708595871925, + "num_input_tokens_seen": 63931904, + "step": 3904, + "train_runtime": 31725.3651, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.3666666666666667, + "grad_norm": 0.005637146532535553, + "learning_rate": 8.76201120328795e-05, + "loss": 0.011652861721813679, + "num_input_tokens_seen": 63948280, + "step": 3905, + "train_runtime": 31733.4766, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.367272727272727, + "grad_norm": 0.005748794414103031, + "learning_rate": 8.761377714852899e-05, + "loss": 0.011319580487906933, + "num_input_tokens_seen": 63964656, + "step": 3906, + "train_runtime": 31741.5911, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.3678787878787877, + "grad_norm": 0.00832221657037735, + "learning_rate": 8.760744087291808e-05, + "loss": 0.012169033288955688, + "num_input_tokens_seen": 63981032, + "step": 3907, + "train_runtime": 31749.7061, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.3684848484848486, + "grad_norm": 0.018167784437537193, + "learning_rate": 8.760110320628118e-05, + "loss": 0.012270736508071423, + "num_input_tokens_seen": 63997408, + "step": 3908, + "train_runtime": 31757.8211, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.369090909090909, + "grad_norm": 0.010734605602920055, + "learning_rate": 8.759476414885269e-05, + "loss": 0.012558269314467907, + "num_input_tokens_seen": 64013784, + "step": 3909, + "train_runtime": 31765.9363, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.3696969696969696, + "grad_norm": 0.023609034717082977, + "learning_rate": 8.758842370086709e-05, + "loss": 0.012377963401377201, + "num_input_tokens_seen": 64030160, + "step": 3910, + "train_runtime": 31774.0521, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.37030303030303, + "grad_norm": 0.008125863038003445, + "learning_rate": 8.75820818625589e-05, + "loss": 0.012392617762088776, + "num_input_tokens_seen": 64046536, + "step": 3911, + "train_runtime": 31782.1664, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.370909090909091, + "grad_norm": 0.005616354290395975, + "learning_rate": 8.757573863416269e-05, + "loss": 0.011652743443846703, + "num_input_tokens_seen": 64062912, + "step": 3912, + "train_runtime": 31790.2815, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.3715151515151516, + "grad_norm": 0.008864682167768478, + "learning_rate": 8.756939401591309e-05, + "loss": 0.012643544003367424, + "num_input_tokens_seen": 64079288, + "step": 3913, + "train_runtime": 31798.3948, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.372121212121212, + "grad_norm": 0.008480784483253956, + "learning_rate": 8.756304800804475e-05, + "loss": 0.011656440794467926, + "num_input_tokens_seen": 64095664, + "step": 3914, + "train_runtime": 31806.5085, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.3727272727272726, + "grad_norm": 0.001905617187730968, + "learning_rate": 8.755670061079244e-05, + "loss": 0.01306787971407175, + "num_input_tokens_seen": 64112040, + "step": 3915, + "train_runtime": 31814.6328, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.3733333333333335, + "grad_norm": 0.006388600450009108, + "learning_rate": 8.755035182439088e-05, + "loss": 0.011712341569364071, + "num_input_tokens_seen": 64128416, + "step": 3916, + "train_runtime": 31822.7462, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.373939393939394, + "grad_norm": 0.008756665512919426, + "learning_rate": 8.754400164907497e-05, + "loss": 0.012182825244963169, + "num_input_tokens_seen": 64144792, + "step": 3917, + "train_runtime": 31830.8643, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.3745454545454545, + "grad_norm": 0.008914709091186523, + "learning_rate": 8.753765008507953e-05, + "loss": 0.012191008776426315, + "num_input_tokens_seen": 64161168, + "step": 3918, + "train_runtime": 31838.9795, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.375151515151515, + "grad_norm": 0.022416841238737106, + "learning_rate": 8.753129713263951e-05, + "loss": 0.014600102789700031, + "num_input_tokens_seen": 64177544, + "step": 3919, + "train_runtime": 31847.0936, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.375757575757576, + "grad_norm": 0.007959865033626556, + "learning_rate": 8.75249427919899e-05, + "loss": 0.012604182586073875, + "num_input_tokens_seen": 64193920, + "step": 3920, + "train_runtime": 31855.2065, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.3763636363636365, + "grad_norm": 0.007637340109795332, + "learning_rate": 8.751858706336576e-05, + "loss": 0.01179521530866623, + "num_input_tokens_seen": 64210296, + "step": 3921, + "train_runtime": 31863.3188, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.376969696969697, + "grad_norm": 0.007538910489529371, + "learning_rate": 8.751222994700213e-05, + "loss": 0.012452198192477226, + "num_input_tokens_seen": 64226672, + "step": 3922, + "train_runtime": 31871.4351, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.3775757575757575, + "grad_norm": 0.011203823611140251, + "learning_rate": 8.750587144313416e-05, + "loss": 0.012804090976715088, + "num_input_tokens_seen": 64243048, + "step": 3923, + "train_runtime": 31879.5506, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.378181818181818, + "grad_norm": 0.00984243955463171, + "learning_rate": 8.749951155199703e-05, + "loss": 0.011516422033309937, + "num_input_tokens_seen": 64259424, + "step": 3924, + "train_runtime": 31887.6672, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.378787878787879, + "grad_norm": 0.007776329293847084, + "learning_rate": 8.749315027382601e-05, + "loss": 0.012981178238987923, + "num_input_tokens_seen": 64275800, + "step": 3925, + "train_runtime": 31895.7817, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.3793939393939394, + "grad_norm": 0.007024808786809444, + "learning_rate": 8.748678760885638e-05, + "loss": 0.012982901185750961, + "num_input_tokens_seen": 64292176, + "step": 3926, + "train_runtime": 31903.8951, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.38, + "grad_norm": 0.0047778841108083725, + "learning_rate": 8.748042355732349e-05, + "loss": 0.01131907757371664, + "num_input_tokens_seen": 64308552, + "step": 3927, + "train_runtime": 31912.0105, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.380606060606061, + "grad_norm": 0.004458183888345957, + "learning_rate": 8.74740581194627e-05, + "loss": 0.011565866880118847, + "num_input_tokens_seen": 64324928, + "step": 3928, + "train_runtime": 31920.1334, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.3812121212121213, + "grad_norm": 0.0061905584298074245, + "learning_rate": 8.746769129550949e-05, + "loss": 0.011856907047331333, + "num_input_tokens_seen": 64341304, + "step": 3929, + "train_runtime": 31928.2494, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.381818181818182, + "grad_norm": 0.008414180018007755, + "learning_rate": 8.746132308569934e-05, + "loss": 0.011779951862990856, + "num_input_tokens_seen": 64357680, + "step": 3930, + "train_runtime": 31936.3652, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.3824242424242423, + "grad_norm": 0.017430568113923073, + "learning_rate": 8.745495349026781e-05, + "loss": 0.013200096786022186, + "num_input_tokens_seen": 64374056, + "step": 3931, + "train_runtime": 31944.4848, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.383030303030303, + "grad_norm": 0.007815414108335972, + "learning_rate": 8.744858250945049e-05, + "loss": 0.011149406433105469, + "num_input_tokens_seen": 64390432, + "step": 3932, + "train_runtime": 31952.6034, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.3836363636363638, + "grad_norm": 0.011867988854646683, + "learning_rate": 8.744221014348301e-05, + "loss": 0.012829601764678955, + "num_input_tokens_seen": 64406808, + "step": 3933, + "train_runtime": 31960.7217, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.3842424242424243, + "grad_norm": 0.01321521494537592, + "learning_rate": 8.743583639260111e-05, + "loss": 0.011815814301371574, + "num_input_tokens_seen": 64423184, + "step": 3934, + "train_runtime": 31968.8435, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.3848484848484848, + "grad_norm": 0.010845907032489777, + "learning_rate": 8.742946125704052e-05, + "loss": 0.012657510116696358, + "num_input_tokens_seen": 64439560, + "step": 3935, + "train_runtime": 31976.9608, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.3854545454545453, + "grad_norm": 0.007816660217940807, + "learning_rate": 8.742308473703706e-05, + "loss": 0.01170468982309103, + "num_input_tokens_seen": 64455936, + "step": 3936, + "train_runtime": 31985.0806, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.386060606060606, + "grad_norm": 0.005382678937166929, + "learning_rate": 8.741670683282655e-05, + "loss": 0.0122041841968894, + "num_input_tokens_seen": 64472312, + "step": 3937, + "train_runtime": 31993.1961, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.3866666666666667, + "grad_norm": 0.004945850465446711, + "learning_rate": 8.741032754464494e-05, + "loss": 0.011137978173792362, + "num_input_tokens_seen": 64488688, + "step": 3938, + "train_runtime": 32001.3096, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.387272727272727, + "grad_norm": 0.004918837919831276, + "learning_rate": 8.740394687272816e-05, + "loss": 0.011163354851305485, + "num_input_tokens_seen": 64505064, + "step": 3939, + "train_runtime": 32009.4312, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.3878787878787877, + "grad_norm": 0.006628453731536865, + "learning_rate": 8.739756481731223e-05, + "loss": 0.012198293581604958, + "num_input_tokens_seen": 64521440, + "step": 3940, + "train_runtime": 32017.5454, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.3884848484848487, + "grad_norm": 0.006536687724292278, + "learning_rate": 8.73911813786332e-05, + "loss": 0.011898152530193329, + "num_input_tokens_seen": 64537816, + "step": 3941, + "train_runtime": 32025.6611, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.389090909090909, + "grad_norm": 0.011323172599077225, + "learning_rate": 8.738479655692719e-05, + "loss": 0.013106940314173698, + "num_input_tokens_seen": 64554192, + "step": 3942, + "train_runtime": 32033.7775, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.3896969696969697, + "grad_norm": 0.010536973364651203, + "learning_rate": 8.737841035243036e-05, + "loss": 0.012583276256918907, + "num_input_tokens_seen": 64570568, + "step": 3943, + "train_runtime": 32041.8972, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.39030303030303, + "grad_norm": 0.007271362002938986, + "learning_rate": 8.737202276537891e-05, + "loss": 0.012286387383937836, + "num_input_tokens_seen": 64586944, + "step": 3944, + "train_runtime": 32050.0111, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.390909090909091, + "grad_norm": 0.009338966570794582, + "learning_rate": 8.736563379600913e-05, + "loss": 0.012851104140281677, + "num_input_tokens_seen": 64603320, + "step": 3945, + "train_runtime": 32058.1218, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.3915151515151516, + "grad_norm": 0.008454913273453712, + "learning_rate": 8.735924344455732e-05, + "loss": 0.012162717990577221, + "num_input_tokens_seen": 64619696, + "step": 3946, + "train_runtime": 32066.2372, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.392121212121212, + "grad_norm": 0.0019281277200207114, + "learning_rate": 8.735285171125986e-05, + "loss": 0.010228649713099003, + "num_input_tokens_seen": 64636072, + "step": 3947, + "train_runtime": 32074.3517, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.3927272727272726, + "grad_norm": 0.0058963717892766, + "learning_rate": 8.734645859635313e-05, + "loss": 0.011077743954956532, + "num_input_tokens_seen": 64652448, + "step": 3948, + "train_runtime": 32082.4649, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.3933333333333335, + "grad_norm": 0.01275597233325243, + "learning_rate": 8.734006410007365e-05, + "loss": 0.013113114051520824, + "num_input_tokens_seen": 64668824, + "step": 3949, + "train_runtime": 32090.5816, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.393939393939394, + "grad_norm": 0.008671462535858154, + "learning_rate": 8.73336682226579e-05, + "loss": 0.012027603574097157, + "num_input_tokens_seen": 64685200, + "step": 3950, + "train_runtime": 32098.6971, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 2.3945454545454545, + "grad_norm": 0.0048281666822731495, + "learning_rate": 8.732727096434247e-05, + "loss": 0.011461691930890083, + "num_input_tokens_seen": 64701576, + "step": 3951, + "train_runtime": 32106.817, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 2.395151515151515, + "grad_norm": 0.0087870042771101, + "learning_rate": 8.732087232536399e-05, + "loss": 0.011858628131449223, + "num_input_tokens_seen": 64717952, + "step": 3952, + "train_runtime": 32114.933, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.3957575757575755, + "grad_norm": 0.023671137169003487, + "learning_rate": 8.731447230595911e-05, + "loss": 0.013579844497144222, + "num_input_tokens_seen": 64734328, + "step": 3953, + "train_runtime": 32123.0491, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 2.3963636363636365, + "grad_norm": 0.00703487079590559, + "learning_rate": 8.730807090636457e-05, + "loss": 0.011315067298710346, + "num_input_tokens_seen": 64750704, + "step": 3954, + "train_runtime": 32131.1664, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 2.396969696969697, + "grad_norm": 0.009797473438084126, + "learning_rate": 8.730166812681713e-05, + "loss": 0.01242794282734394, + "num_input_tokens_seen": 64767080, + "step": 3955, + "train_runtime": 32139.2798, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 2.3975757575757575, + "grad_norm": 0.009140574373304844, + "learning_rate": 8.729526396755365e-05, + "loss": 0.011765132658183575, + "num_input_tokens_seen": 64783456, + "step": 3956, + "train_runtime": 32147.3949, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.3981818181818184, + "grad_norm": 0.002114366739988327, + "learning_rate": 8.728885842881095e-05, + "loss": 0.01141907088458538, + "num_input_tokens_seen": 64799832, + "step": 3957, + "train_runtime": 32155.5104, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.398787878787879, + "grad_norm": 0.010168755427002907, + "learning_rate": 8.728245151082604e-05, + "loss": 0.012520655058324337, + "num_input_tokens_seen": 64816208, + "step": 3958, + "train_runtime": 32163.6325, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 2.3993939393939394, + "grad_norm": 0.01424007210880518, + "learning_rate": 8.727604321383583e-05, + "loss": 0.013034731149673462, + "num_input_tokens_seen": 64832584, + "step": 3959, + "train_runtime": 32171.7515, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 2.4, + "grad_norm": 0.0068917106837034225, + "learning_rate": 8.726963353807735e-05, + "loss": 0.011889351531863213, + "num_input_tokens_seen": 64848960, + "step": 3960, + "train_runtime": 32179.8659, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 2.4006060606060604, + "grad_norm": 0.009508705697953701, + "learning_rate": 8.726322248378775e-05, + "loss": 0.01212370302528143, + "num_input_tokens_seen": 64865336, + "step": 3961, + "train_runtime": 32187.9824, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.4012121212121214, + "grad_norm": 0.00829673558473587, + "learning_rate": 8.725681005120409e-05, + "loss": 0.011778369545936584, + "num_input_tokens_seen": 64881712, + "step": 3962, + "train_runtime": 32196.0985, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.401818181818182, + "grad_norm": 0.005323054734617472, + "learning_rate": 8.725039624056359e-05, + "loss": 0.012644222006201744, + "num_input_tokens_seen": 64898088, + "step": 3963, + "train_runtime": 32204.2147, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.4024242424242424, + "grad_norm": 0.008517703972756863, + "learning_rate": 8.724398105210345e-05, + "loss": 0.01162803266197443, + "num_input_tokens_seen": 64914464, + "step": 3964, + "train_runtime": 32212.3337, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.403030303030303, + "grad_norm": 0.010649501346051693, + "learning_rate": 8.723756448606101e-05, + "loss": 0.011956385336816311, + "num_input_tokens_seen": 64930840, + "step": 3965, + "train_runtime": 32220.4482, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 2.403636363636364, + "grad_norm": 0.009511959739029408, + "learning_rate": 8.723114654267356e-05, + "loss": 0.011983465403318405, + "num_input_tokens_seen": 64947216, + "step": 3966, + "train_runtime": 32228.5616, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 2.4042424242424243, + "grad_norm": 0.006440349854528904, + "learning_rate": 8.722472722217852e-05, + "loss": 0.012858221307396889, + "num_input_tokens_seen": 64963592, + "step": 3967, + "train_runtime": 32236.6751, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 2.404848484848485, + "grad_norm": 0.007032300345599651, + "learning_rate": 8.721830652481328e-05, + "loss": 0.013298509642481804, + "num_input_tokens_seen": 64979968, + "step": 3968, + "train_runtime": 32244.7854, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 2.4054545454545453, + "grad_norm": 0.005947112571448088, + "learning_rate": 8.72118844508154e-05, + "loss": 0.012420369312167168, + "num_input_tokens_seen": 64996344, + "step": 3969, + "train_runtime": 32252.903, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 2.4060606060606062, + "grad_norm": 0.009514856152236462, + "learning_rate": 8.720546100042235e-05, + "loss": 0.012390246614813805, + "num_input_tokens_seen": 65012720, + "step": 3970, + "train_runtime": 32261.0208, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 2.4066666666666667, + "grad_norm": 0.010288123041391373, + "learning_rate": 8.719903617387178e-05, + "loss": 0.011466922238469124, + "num_input_tokens_seen": 65029096, + "step": 3971, + "train_runtime": 32269.1401, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 2.4072727272727272, + "grad_norm": 0.008854511193931103, + "learning_rate": 8.719260997140128e-05, + "loss": 0.013790170662105083, + "num_input_tokens_seen": 65045472, + "step": 3972, + "train_runtime": 32277.2523, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 2.4078787878787877, + "grad_norm": 0.00761467544361949, + "learning_rate": 8.718618239324858e-05, + "loss": 0.012384867295622826, + "num_input_tokens_seen": 65061848, + "step": 3973, + "train_runtime": 32285.3664, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 2.4084848484848487, + "grad_norm": 0.014865431934595108, + "learning_rate": 8.717975343965141e-05, + "loss": 0.012042179703712463, + "num_input_tokens_seen": 65078224, + "step": 3974, + "train_runtime": 32293.4804, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 2.409090909090909, + "grad_norm": 0.012648499570786953, + "learning_rate": 8.717332311084755e-05, + "loss": 0.013338636606931686, + "num_input_tokens_seen": 65094600, + "step": 3975, + "train_runtime": 32301.595, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 2.4096969696969697, + "grad_norm": 0.010478436946868896, + "learning_rate": 8.716689140707488e-05, + "loss": 0.01220523752272129, + "num_input_tokens_seen": 65110976, + "step": 3976, + "train_runtime": 32309.7103, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.41030303030303, + "grad_norm": 0.005855999421328306, + "learning_rate": 8.716045832857128e-05, + "loss": 0.012362138368189335, + "num_input_tokens_seen": 65127352, + "step": 3977, + "train_runtime": 32317.833, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.410909090909091, + "grad_norm": 0.008465795777738094, + "learning_rate": 8.715402387557467e-05, + "loss": 0.012066630646586418, + "num_input_tokens_seen": 65143728, + "step": 3978, + "train_runtime": 32325.9436, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 2.4115151515151516, + "grad_norm": 0.004939177073538303, + "learning_rate": 8.714758804832309e-05, + "loss": 0.011245203204452991, + "num_input_tokens_seen": 65160104, + "step": 3979, + "train_runtime": 32334.0546, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 2.412121212121212, + "grad_norm": 0.008906640112400055, + "learning_rate": 8.714115084705454e-05, + "loss": 0.012994782999157906, + "num_input_tokens_seen": 65176480, + "step": 3980, + "train_runtime": 32342.1684, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 2.4127272727272726, + "grad_norm": 0.006914692930877209, + "learning_rate": 8.713471227200719e-05, + "loss": 0.011876557022333145, + "num_input_tokens_seen": 65192856, + "step": 3981, + "train_runtime": 32350.2791, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.413333333333333, + "grad_norm": 0.007783967535942793, + "learning_rate": 8.712827232341911e-05, + "loss": 0.01124640740454197, + "num_input_tokens_seen": 65209232, + "step": 3982, + "train_runtime": 32358.3956, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.413939393939394, + "grad_norm": 0.009643170982599258, + "learning_rate": 8.712183100152857e-05, + "loss": 0.01266550924628973, + "num_input_tokens_seen": 65225608, + "step": 3983, + "train_runtime": 32366.5081, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 2.4145454545454546, + "grad_norm": 0.008697593584656715, + "learning_rate": 8.711538830657378e-05, + "loss": 0.011208837851881981, + "num_input_tokens_seen": 65241984, + "step": 3984, + "train_runtime": 32374.6203, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 2.415151515151515, + "grad_norm": 0.008631549775600433, + "learning_rate": 8.710894423879305e-05, + "loss": 0.012429659254848957, + "num_input_tokens_seen": 65258360, + "step": 3985, + "train_runtime": 32382.7329, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 2.415757575757576, + "grad_norm": 0.017001524567604065, + "learning_rate": 8.710249879842476e-05, + "loss": 0.012807359918951988, + "num_input_tokens_seen": 65274736, + "step": 3986, + "train_runtime": 32390.8453, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.4163636363636365, + "grad_norm": 0.008777068927884102, + "learning_rate": 8.709605198570728e-05, + "loss": 0.010934505611658096, + "num_input_tokens_seen": 65291112, + "step": 3987, + "train_runtime": 32398.9606, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.416969696969697, + "grad_norm": 0.005198657512664795, + "learning_rate": 8.708960380087907e-05, + "loss": 0.01222632359713316, + "num_input_tokens_seen": 65307488, + "step": 3988, + "train_runtime": 32407.0754, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 2.4175757575757575, + "grad_norm": 0.010032770223915577, + "learning_rate": 8.708315424417866e-05, + "loss": 0.012328274548053741, + "num_input_tokens_seen": 65323864, + "step": 3989, + "train_runtime": 32415.1872, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 2.418181818181818, + "grad_norm": 0.009887870401144028, + "learning_rate": 8.707670331584459e-05, + "loss": 0.012643869034945965, + "num_input_tokens_seen": 65340240, + "step": 3990, + "train_runtime": 32423.3028, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 2.418787878787879, + "grad_norm": 0.009102854877710342, + "learning_rate": 8.707025101611545e-05, + "loss": 0.012734930962324142, + "num_input_tokens_seen": 65356616, + "step": 3991, + "train_runtime": 32431.4161, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 2.4193939393939394, + "grad_norm": 0.025766532868146896, + "learning_rate": 8.706379734522994e-05, + "loss": 0.01270482037216425, + "num_input_tokens_seen": 65372992, + "step": 3992, + "train_runtime": 32439.5348, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 2.42, + "grad_norm": 0.006967430934309959, + "learning_rate": 8.705734230342672e-05, + "loss": 0.01074296422302723, + "num_input_tokens_seen": 65389368, + "step": 3993, + "train_runtime": 32447.6469, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 2.4206060606060604, + "grad_norm": 0.011303038336336613, + "learning_rate": 8.705088589094459e-05, + "loss": 0.013301991857588291, + "num_input_tokens_seen": 65405744, + "step": 3994, + "train_runtime": 32455.7634, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 2.4212121212121214, + "grad_norm": 0.014830484986305237, + "learning_rate": 8.704442810802234e-05, + "loss": 0.013416048139333725, + "num_input_tokens_seen": 65422120, + "step": 3995, + "train_runtime": 32463.8793, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 2.421818181818182, + "grad_norm": 0.0049521829932928085, + "learning_rate": 8.703796895489883e-05, + "loss": 0.013172317296266556, + "num_input_tokens_seen": 65438496, + "step": 3996, + "train_runtime": 32471.9961, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 2.4224242424242424, + "grad_norm": 0.0015984463971108198, + "learning_rate": 8.7031508431813e-05, + "loss": 0.011939289048314095, + "num_input_tokens_seen": 65454872, + "step": 3997, + "train_runtime": 32480.1114, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 2.423030303030303, + "grad_norm": 0.0038701631128787994, + "learning_rate": 8.702504653900376e-05, + "loss": 0.012130429968237877, + "num_input_tokens_seen": 65471248, + "step": 3998, + "train_runtime": 32488.2337, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 2.423636363636364, + "grad_norm": 0.007171195466071367, + "learning_rate": 8.701858327671016e-05, + "loss": 0.01233917847275734, + "num_input_tokens_seen": 65487624, + "step": 3999, + "train_runtime": 32496.3526, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.0014149510534480214, + "learning_rate": 8.701211864517126e-05, + "loss": 0.010504164732992649, + "num_input_tokens_seen": 65504000, + "step": 4000, + "train_runtime": 32504.4642, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 2.424848484848485, + "grad_norm": 0.0008805702673271298, + "learning_rate": 8.700565264462617e-05, + "loss": 0.011478891596198082, + "num_input_tokens_seen": 65520376, + "step": 4001, + "train_runtime": 32513.4696, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.4254545454545453, + "grad_norm": 0.007924284785985947, + "learning_rate": 8.699918527531404e-05, + "loss": 0.011402283795177937, + "num_input_tokens_seen": 65536752, + "step": 4002, + "train_runtime": 32521.5831, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.4260606060606063, + "grad_norm": 0.009870833717286587, + "learning_rate": 8.699271653747411e-05, + "loss": 0.01156836748123169, + "num_input_tokens_seen": 65553128, + "step": 4003, + "train_runtime": 32529.6943, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.4266666666666667, + "grad_norm": 0.011118307709693909, + "learning_rate": 8.698624643134564e-05, + "loss": 0.013963157311081886, + "num_input_tokens_seen": 65569504, + "step": 4004, + "train_runtime": 32537.8051, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.4272727272727272, + "grad_norm": 0.021435871720314026, + "learning_rate": 8.697977495716793e-05, + "loss": 0.012135976925492287, + "num_input_tokens_seen": 65585880, + "step": 4005, + "train_runtime": 32545.9167, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.4278787878787877, + "grad_norm": 0.009903870522975922, + "learning_rate": 8.697330211518038e-05, + "loss": 0.012505902908742428, + "num_input_tokens_seen": 65602256, + "step": 4006, + "train_runtime": 32554.0318, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.4284848484848487, + "grad_norm": 0.008014468476176262, + "learning_rate": 8.696682790562236e-05, + "loss": 0.012186196632683277, + "num_input_tokens_seen": 65618632, + "step": 4007, + "train_runtime": 32562.1483, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.429090909090909, + "grad_norm": 0.007817745208740234, + "learning_rate": 8.696035232873339e-05, + "loss": 0.012502459809184074, + "num_input_tokens_seen": 65635008, + "step": 4008, + "train_runtime": 32570.2616, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.4296969696969697, + "grad_norm": 0.01054783258587122, + "learning_rate": 8.695387538475295e-05, + "loss": 0.012050812132656574, + "num_input_tokens_seen": 65651384, + "step": 4009, + "train_runtime": 32578.3709, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.43030303030303, + "grad_norm": 0.010385658591985703, + "learning_rate": 8.694739707392063e-05, + "loss": 0.012348880060017109, + "num_input_tokens_seen": 65667760, + "step": 4010, + "train_runtime": 32586.4871, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.4309090909090907, + "grad_norm": 0.006136162206530571, + "learning_rate": 8.694091739647602e-05, + "loss": 0.01128119695931673, + "num_input_tokens_seen": 65684136, + "step": 4011, + "train_runtime": 32594.6042, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.4315151515151516, + "grad_norm": 0.16773830354213715, + "learning_rate": 8.693443635265884e-05, + "loss": 0.019942179322242737, + "num_input_tokens_seen": 65700512, + "step": 4012, + "train_runtime": 32602.7162, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.432121212121212, + "grad_norm": 0.007391555700451136, + "learning_rate": 8.692795394270878e-05, + "loss": 0.011346128769218922, + "num_input_tokens_seen": 65716888, + "step": 4013, + "train_runtime": 32610.8327, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.4327272727272726, + "grad_norm": 0.008623134344816208, + "learning_rate": 8.692147016686562e-05, + "loss": 0.011837894096970558, + "num_input_tokens_seen": 65733264, + "step": 4014, + "train_runtime": 32618.9456, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.4333333333333336, + "grad_norm": 0.017599230632185936, + "learning_rate": 8.691498502536919e-05, + "loss": 0.013225538656115532, + "num_input_tokens_seen": 65749640, + "step": 4015, + "train_runtime": 32627.0611, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.433939393939394, + "grad_norm": 0.008119662292301655, + "learning_rate": 8.690849851845933e-05, + "loss": 0.013446874916553497, + "num_input_tokens_seen": 65766016, + "step": 4016, + "train_runtime": 32635.1768, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.4345454545454546, + "grad_norm": 1.2305080890655518, + "learning_rate": 8.6902010646376e-05, + "loss": 0.02664513699710369, + "num_input_tokens_seen": 65782392, + "step": 4017, + "train_runtime": 32643.2922, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.435151515151515, + "grad_norm": 0.004301746841520071, + "learning_rate": 8.689552140935914e-05, + "loss": 0.011035654693841934, + "num_input_tokens_seen": 65798768, + "step": 4018, + "train_runtime": 32651.4034, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.4357575757575756, + "grad_norm": 0.004877461586147547, + "learning_rate": 8.688903080764883e-05, + "loss": 0.011010750196874142, + "num_input_tokens_seen": 65815144, + "step": 4019, + "train_runtime": 32659.5176, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.4363636363636365, + "grad_norm": 0.00430849427357316, + "learning_rate": 8.688253884148509e-05, + "loss": 0.012497692368924618, + "num_input_tokens_seen": 65831520, + "step": 4020, + "train_runtime": 32667.6349, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.436969696969697, + "grad_norm": 0.04685498774051666, + "learning_rate": 8.687604551110807e-05, + "loss": 0.013098020106554031, + "num_input_tokens_seen": 65847896, + "step": 4021, + "train_runtime": 32675.7503, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.4375757575757575, + "grad_norm": 0.007516586687415838, + "learning_rate": 8.686955081675791e-05, + "loss": 0.011286056600511074, + "num_input_tokens_seen": 65864272, + "step": 4022, + "train_runtime": 32683.8619, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.438181818181818, + "grad_norm": 0.006795146968215704, + "learning_rate": 8.68630547586749e-05, + "loss": 0.011966537684202194, + "num_input_tokens_seen": 65880648, + "step": 4023, + "train_runtime": 32691.9751, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.438787878787879, + "grad_norm": 0.0067265634424984455, + "learning_rate": 8.685655733709928e-05, + "loss": 0.012211540713906288, + "num_input_tokens_seen": 65897024, + "step": 4024, + "train_runtime": 32700.0912, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.4393939393939394, + "grad_norm": 0.08641108870506287, + "learning_rate": 8.685005855227135e-05, + "loss": 0.017108287662267685, + "num_input_tokens_seen": 65913400, + "step": 4025, + "train_runtime": 32708.2163, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.44, + "grad_norm": 0.011075939051806927, + "learning_rate": 8.684355840443155e-05, + "loss": 0.011944500729441643, + "num_input_tokens_seen": 65929776, + "step": 4026, + "train_runtime": 32716.3331, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.4406060606060604, + "grad_norm": 0.069160595536232, + "learning_rate": 8.683705689382024e-05, + "loss": 0.017644496634602547, + "num_input_tokens_seen": 65946152, + "step": 4027, + "train_runtime": 32724.4465, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.4412121212121214, + "grad_norm": 0.009637738578021526, + "learning_rate": 8.683055402067797e-05, + "loss": 0.012353415600955486, + "num_input_tokens_seen": 65962528, + "step": 4028, + "train_runtime": 32732.5617, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.441818181818182, + "grad_norm": 0.005617988295853138, + "learning_rate": 8.682404978524522e-05, + "loss": 0.012186834588646889, + "num_input_tokens_seen": 65978904, + "step": 4029, + "train_runtime": 32740.6745, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 2.4424242424242424, + "grad_norm": 0.010201000608503819, + "learning_rate": 8.681754418776255e-05, + "loss": 0.011051755398511887, + "num_input_tokens_seen": 65995280, + "step": 4030, + "train_runtime": 32748.7901, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.443030303030303, + "grad_norm": 0.008390221744775772, + "learning_rate": 8.681103722847065e-05, + "loss": 0.011525029316544533, + "num_input_tokens_seen": 66011656, + "step": 4031, + "train_runtime": 32756.905, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.443636363636364, + "grad_norm": 0.008890388533473015, + "learning_rate": 8.680452890761016e-05, + "loss": 0.011630352586507797, + "num_input_tokens_seen": 66028032, + "step": 4032, + "train_runtime": 32765.017, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 2.4442424242424243, + "grad_norm": 0.034759897738695145, + "learning_rate": 8.679801922542182e-05, + "loss": 0.014052574522793293, + "num_input_tokens_seen": 66044408, + "step": 4033, + "train_runtime": 32773.1319, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 2.444848484848485, + "grad_norm": 0.007354637607932091, + "learning_rate": 8.67915081821464e-05, + "loss": 0.012096257880330086, + "num_input_tokens_seen": 66060784, + "step": 4034, + "train_runtime": 32781.2426, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.4454545454545453, + "grad_norm": 0.010367256589233875, + "learning_rate": 8.678499577802476e-05, + "loss": 0.012405885383486748, + "num_input_tokens_seen": 66077160, + "step": 4035, + "train_runtime": 32789.3584, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.4460606060606063, + "grad_norm": 0.007225418463349342, + "learning_rate": 8.677848201329774e-05, + "loss": 0.011580531485378742, + "num_input_tokens_seen": 66093536, + "step": 4036, + "train_runtime": 32797.4719, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 2.4466666666666668, + "grad_norm": 0.007067396771162748, + "learning_rate": 8.677196688820631e-05, + "loss": 0.011616021394729614, + "num_input_tokens_seen": 66109912, + "step": 4037, + "train_runtime": 32805.585, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 2.4472727272727273, + "grad_norm": 0.007955334149301052, + "learning_rate": 8.676545040299145e-05, + "loss": 0.01117919571697712, + "num_input_tokens_seen": 66126288, + "step": 4038, + "train_runtime": 32813.6965, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.4478787878787878, + "grad_norm": 0.0061789704486727715, + "learning_rate": 8.675893255789413e-05, + "loss": 0.01187138445675373, + "num_input_tokens_seen": 66142664, + "step": 4039, + "train_runtime": 32821.8121, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.4484848484848483, + "grad_norm": 0.009460385888814926, + "learning_rate": 8.675241335315551e-05, + "loss": 0.0124445129185915, + "num_input_tokens_seen": 66159040, + "step": 4040, + "train_runtime": 32829.9313, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.449090909090909, + "grad_norm": 0.008185412734746933, + "learning_rate": 8.67458927890167e-05, + "loss": 0.011840671300888062, + "num_input_tokens_seen": 66175416, + "step": 4041, + "train_runtime": 32838.0484, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 2.4496969696969697, + "grad_norm": 0.03455425798892975, + "learning_rate": 8.673937086571886e-05, + "loss": 0.015499784611165524, + "num_input_tokens_seen": 66191792, + "step": 4042, + "train_runtime": 32846.1583, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 2.45030303030303, + "grad_norm": 0.006477878894656897, + "learning_rate": 8.673284758350324e-05, + "loss": 0.011704593896865845, + "num_input_tokens_seen": 66208168, + "step": 4043, + "train_runtime": 32854.2708, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 2.450909090909091, + "grad_norm": 0.004375692456960678, + "learning_rate": 8.672632294261114e-05, + "loss": 0.011329500935971737, + "num_input_tokens_seen": 66224544, + "step": 4044, + "train_runtime": 32862.3824, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 2.4515151515151516, + "grad_norm": 0.004360073246061802, + "learning_rate": 8.671979694328385e-05, + "loss": 0.012417464517056942, + "num_input_tokens_seen": 66240920, + "step": 4045, + "train_runtime": 32870.4933, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 2.452121212121212, + "grad_norm": 0.011735597625374794, + "learning_rate": 8.671326958576279e-05, + "loss": 0.01322389580309391, + "num_input_tokens_seen": 66257296, + "step": 4046, + "train_runtime": 32878.6066, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 2.4527272727272726, + "grad_norm": 0.011418039910495281, + "learning_rate": 8.670674087028939e-05, + "loss": 0.012756542302668095, + "num_input_tokens_seen": 66273672, + "step": 4047, + "train_runtime": 32886.7165, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 2.453333333333333, + "grad_norm": 0.032791610807180405, + "learning_rate": 8.67002107971051e-05, + "loss": 0.014766186475753784, + "num_input_tokens_seen": 66290048, + "step": 4048, + "train_runtime": 32894.8314, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 2.453939393939394, + "grad_norm": 0.0517447367310524, + "learning_rate": 8.669367936645151e-05, + "loss": 0.017308732494711876, + "num_input_tokens_seen": 66306424, + "step": 4049, + "train_runtime": 32902.9453, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 0.03526085987687111, + "learning_rate": 8.668714657857018e-05, + "loss": 0.018967142328619957, + "num_input_tokens_seen": 66322800, + "step": 4050, + "train_runtime": 32911.0585, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 2.455151515151515, + "grad_norm": 0.00729320477694273, + "learning_rate": 8.668061243370274e-05, + "loss": 0.012054507620632648, + "num_input_tokens_seen": 66339176, + "step": 4051, + "train_runtime": 32919.1705, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.4557575757575756, + "grad_norm": 0.03337623551487923, + "learning_rate": 8.667407693209087e-05, + "loss": 0.015036912634968758, + "num_input_tokens_seen": 66355552, + "step": 4052, + "train_runtime": 32927.2799, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 2.4563636363636365, + "grad_norm": 0.015217745676636696, + "learning_rate": 8.666754007397632e-05, + "loss": 0.012794998474419117, + "num_input_tokens_seen": 66371928, + "step": 4053, + "train_runtime": 32935.3918, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 2.456969696969697, + "grad_norm": 0.017872435972094536, + "learning_rate": 8.666100185960087e-05, + "loss": 0.017377035692334175, + "num_input_tokens_seen": 66388304, + "step": 4054, + "train_runtime": 32943.5101, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 2.4575757575757575, + "grad_norm": 0.023252859711647034, + "learning_rate": 8.665446228920635e-05, + "loss": 0.013442318886518478, + "num_input_tokens_seen": 66404680, + "step": 4055, + "train_runtime": 32951.6321, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 2.458181818181818, + "grad_norm": 0.0032380518969148397, + "learning_rate": 8.664792136303465e-05, + "loss": 0.010896595194935799, + "num_input_tokens_seen": 66421056, + "step": 4056, + "train_runtime": 32959.7444, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.458787878787879, + "grad_norm": 0.01565861888229847, + "learning_rate": 8.664137908132772e-05, + "loss": 0.01155568566173315, + "num_input_tokens_seen": 66437432, + "step": 4057, + "train_runtime": 32967.8557, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.4593939393939395, + "grad_norm": 0.013617758639156818, + "learning_rate": 8.663483544432751e-05, + "loss": 0.013444559648633003, + "num_input_tokens_seen": 66453808, + "step": 4058, + "train_runtime": 32975.9744, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 2.46, + "grad_norm": 0.009455419145524502, + "learning_rate": 8.662829045227609e-05, + "loss": 0.010722637176513672, + "num_input_tokens_seen": 66470184, + "step": 4059, + "train_runtime": 32984.0875, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 2.4606060606060605, + "grad_norm": 0.00824559573084116, + "learning_rate": 8.662174410541555e-05, + "loss": 0.011758264154195786, + "num_input_tokens_seen": 66486560, + "step": 4060, + "train_runtime": 32992.2057, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 2.461212121212121, + "grad_norm": 0.007522704545408487, + "learning_rate": 8.661519640398801e-05, + "loss": 0.011111623607575893, + "num_input_tokens_seen": 66502936, + "step": 4061, + "train_runtime": 33000.317, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 2.461818181818182, + "grad_norm": 0.003967296797782183, + "learning_rate": 8.660864734823564e-05, + "loss": 0.011284412816166878, + "num_input_tokens_seen": 66519312, + "step": 4062, + "train_runtime": 33008.4349, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.4624242424242424, + "grad_norm": 0.005898671690374613, + "learning_rate": 8.660209693840072e-05, + "loss": 0.012734920717775822, + "num_input_tokens_seen": 66535688, + "step": 4063, + "train_runtime": 33016.5501, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.463030303030303, + "grad_norm": 0.008629771880805492, + "learning_rate": 8.65955451747255e-05, + "loss": 0.012427698820829391, + "num_input_tokens_seen": 66552064, + "step": 4064, + "train_runtime": 33024.6649, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 2.463636363636364, + "grad_norm": 0.006099363323301077, + "learning_rate": 8.658899205745235e-05, + "loss": 0.011295391246676445, + "num_input_tokens_seen": 66568440, + "step": 4065, + "train_runtime": 33032.7814, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 2.4642424242424243, + "grad_norm": 0.008219312876462936, + "learning_rate": 8.658243758682361e-05, + "loss": 0.011767336167395115, + "num_input_tokens_seen": 66584816, + "step": 4066, + "train_runtime": 33040.8935, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 2.464848484848485, + "grad_norm": 0.009053852409124374, + "learning_rate": 8.657588176308176e-05, + "loss": 0.012841441668570042, + "num_input_tokens_seen": 66601192, + "step": 4067, + "train_runtime": 33049.0074, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 2.4654545454545453, + "grad_norm": 0.09069479256868362, + "learning_rate": 8.656932458646927e-05, + "loss": 0.013467703014612198, + "num_input_tokens_seen": 66617568, + "step": 4068, + "train_runtime": 33057.121, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 2.466060606060606, + "grad_norm": 0.009245152585208416, + "learning_rate": 8.656276605722868e-05, + "loss": 0.012219304218888283, + "num_input_tokens_seen": 66633944, + "step": 4069, + "train_runtime": 33065.2343, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.008828767575323582, + "learning_rate": 8.655620617560257e-05, + "loss": 0.012695424258708954, + "num_input_tokens_seen": 66650320, + "step": 4070, + "train_runtime": 33073.3479, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 2.4672727272727273, + "grad_norm": 0.005800986662507057, + "learning_rate": 8.654964494183358e-05, + "loss": 0.012122102081775665, + "num_input_tokens_seen": 66666696, + "step": 4071, + "train_runtime": 33081.4597, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 2.467878787878788, + "grad_norm": 0.005396679975092411, + "learning_rate": 8.654308235616442e-05, + "loss": 0.012633191421627998, + "num_input_tokens_seen": 66683072, + "step": 4072, + "train_runtime": 33089.5751, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 2.4684848484848487, + "grad_norm": 0.007578667718917131, + "learning_rate": 8.653651841883779e-05, + "loss": 0.011809978634119034, + "num_input_tokens_seen": 66699448, + "step": 4073, + "train_runtime": 33097.6868, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 2.4690909090909092, + "grad_norm": 0.0044122799299657345, + "learning_rate": 8.65299531300965e-05, + "loss": 0.011763782240450382, + "num_input_tokens_seen": 66715824, + "step": 4074, + "train_runtime": 33105.7999, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 2.4696969696969697, + "grad_norm": 0.0043112775310873985, + "learning_rate": 8.652338649018339e-05, + "loss": 0.012082591652870178, + "num_input_tokens_seen": 66732200, + "step": 4075, + "train_runtime": 33113.9159, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 2.4703030303030302, + "grad_norm": 0.28410032391548157, + "learning_rate": 8.651681849934134e-05, + "loss": 0.01635185442864895, + "num_input_tokens_seen": 66748576, + "step": 4076, + "train_runtime": 33122.0317, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 2.4709090909090907, + "grad_norm": 0.12259082496166229, + "learning_rate": 8.651024915781327e-05, + "loss": 0.01743101142346859, + "num_input_tokens_seen": 66764952, + "step": 4077, + "train_runtime": 33130.1487, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 2.4715151515151517, + "grad_norm": 0.005474293604493141, + "learning_rate": 8.650367846584219e-05, + "loss": 0.012096352875232697, + "num_input_tokens_seen": 66781328, + "step": 4078, + "train_runtime": 33138.2613, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 2.472121212121212, + "grad_norm": 0.006114025134593248, + "learning_rate": 8.649710642367115e-05, + "loss": 0.013284233398735523, + "num_input_tokens_seen": 66797704, + "step": 4079, + "train_runtime": 33146.3753, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 2.4727272727272727, + "grad_norm": 0.011463556438684464, + "learning_rate": 8.64905330315432e-05, + "loss": 0.012340724468231201, + "num_input_tokens_seen": 66814080, + "step": 4080, + "train_runtime": 33154.491, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 2.473333333333333, + "grad_norm": 0.00733643164858222, + "learning_rate": 8.64839582897015e-05, + "loss": 0.011145420372486115, + "num_input_tokens_seen": 66830456, + "step": 4081, + "train_runtime": 33162.609, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 2.473939393939394, + "grad_norm": 0.007582388818264008, + "learning_rate": 8.647738219838924e-05, + "loss": 0.011645256541669369, + "num_input_tokens_seen": 66846832, + "step": 4082, + "train_runtime": 33170.7236, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 2.4745454545454546, + "grad_norm": 0.006805262062698603, + "learning_rate": 8.647080475784964e-05, + "loss": 0.012549490667879581, + "num_input_tokens_seen": 66863208, + "step": 4083, + "train_runtime": 33178.8401, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 2.475151515151515, + "grad_norm": 0.008055298589169979, + "learning_rate": 8.646422596832599e-05, + "loss": 0.012534864246845245, + "num_input_tokens_seen": 66879584, + "step": 4084, + "train_runtime": 33186.9521, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 2.4757575757575756, + "grad_norm": 0.008632096461951733, + "learning_rate": 8.645764583006165e-05, + "loss": 0.011780787259340286, + "num_input_tokens_seen": 66895960, + "step": 4085, + "train_runtime": 33195.0702, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 2.4763636363636365, + "grad_norm": 0.010974240489304066, + "learning_rate": 8.645106434329996e-05, + "loss": 0.01195211336016655, + "num_input_tokens_seen": 66912336, + "step": 4086, + "train_runtime": 33203.1836, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 2.476969696969697, + "grad_norm": 0.017518963664770126, + "learning_rate": 8.644448150828442e-05, + "loss": 0.012673698365688324, + "num_input_tokens_seen": 66928712, + "step": 4087, + "train_runtime": 33211.2984, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 2.4775757575757575, + "grad_norm": 0.008905385620892048, + "learning_rate": 8.643789732525846e-05, + "loss": 0.012078780680894852, + "num_input_tokens_seen": 66945088, + "step": 4088, + "train_runtime": 33219.4156, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 2.478181818181818, + "grad_norm": 0.004331223201006651, + "learning_rate": 8.643131179446564e-05, + "loss": 0.011904648505151272, + "num_input_tokens_seen": 66961464, + "step": 4089, + "train_runtime": 33227.5336, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 2.4787878787878785, + "grad_norm": 0.0028569402638822794, + "learning_rate": 8.642472491614954e-05, + "loss": 0.011711115948855877, + "num_input_tokens_seen": 66977840, + "step": 4090, + "train_runtime": 33235.6505, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 2.4793939393939395, + "grad_norm": 0.008105210028588772, + "learning_rate": 8.641813669055381e-05, + "loss": 0.011557997204363346, + "num_input_tokens_seen": 66994216, + "step": 4091, + "train_runtime": 33243.7668, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 2.48, + "grad_norm": 0.008874750696122646, + "learning_rate": 8.641154711792212e-05, + "loss": 0.0119530213996768, + "num_input_tokens_seen": 67010592, + "step": 4092, + "train_runtime": 33251.8847, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 2.4806060606060605, + "grad_norm": 0.011586075648665428, + "learning_rate": 8.640495619849821e-05, + "loss": 0.012206172570586205, + "num_input_tokens_seen": 67026968, + "step": 4093, + "train_runtime": 33260.0003, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 2.4812121212121214, + "grad_norm": 0.013051996007561684, + "learning_rate": 8.639836393252587e-05, + "loss": 0.012065069749951363, + "num_input_tokens_seen": 67043344, + "step": 4094, + "train_runtime": 33268.1136, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 2.481818181818182, + "grad_norm": 0.01702691800892353, + "learning_rate": 8.639177032024892e-05, + "loss": 0.011707555502653122, + "num_input_tokens_seen": 67059720, + "step": 4095, + "train_runtime": 33276.2314, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 2.4824242424242424, + "grad_norm": 0.004312279634177685, + "learning_rate": 8.638517536191127e-05, + "loss": 0.012810110114514828, + "num_input_tokens_seen": 67076096, + "step": 4096, + "train_runtime": 33284.348, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 2.483030303030303, + "grad_norm": 0.004693899769335985, + "learning_rate": 8.637857905775684e-05, + "loss": 0.011241926811635494, + "num_input_tokens_seen": 67092472, + "step": 4097, + "train_runtime": 33292.4626, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 2.4836363636363634, + "grad_norm": 0.010167837142944336, + "learning_rate": 8.63719814080296e-05, + "loss": 0.012888771481812, + "num_input_tokens_seen": 67108848, + "step": 4098, + "train_runtime": 33300.5761, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 2.4842424242424244, + "grad_norm": 0.008203212171792984, + "learning_rate": 8.63653824129736e-05, + "loss": 0.01142559852451086, + "num_input_tokens_seen": 67125224, + "step": 4099, + "train_runtime": 33308.688, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.006197168026119471, + "learning_rate": 8.635878207283293e-05, + "loss": 0.012334875762462616, + "num_input_tokens_seen": 67141600, + "step": 4100, + "train_runtime": 33316.8014, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 2.4854545454545454, + "grad_norm": 0.008716695941984653, + "learning_rate": 8.635218038785171e-05, + "loss": 0.011401981115341187, + "num_input_tokens_seen": 67157976, + "step": 4101, + "train_runtime": 33325.8463, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.4860606060606063, + "grad_norm": 0.007376333698630333, + "learning_rate": 8.634557735827415e-05, + "loss": 0.013144776225090027, + "num_input_tokens_seen": 67174352, + "step": 4102, + "train_runtime": 33333.9583, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.486666666666667, + "grad_norm": 0.009919609874486923, + "learning_rate": 8.633897298434443e-05, + "loss": 0.012736881151795387, + "num_input_tokens_seen": 67190728, + "step": 4103, + "train_runtime": 33342.0701, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.4872727272727273, + "grad_norm": 0.01712992414832115, + "learning_rate": 8.633236726630688e-05, + "loss": 0.014519492164254189, + "num_input_tokens_seen": 67207104, + "step": 4104, + "train_runtime": 33350.1853, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.487878787878788, + "grad_norm": 0.007383553311228752, + "learning_rate": 8.632576020440584e-05, + "loss": 0.011516118422150612, + "num_input_tokens_seen": 67223480, + "step": 4105, + "train_runtime": 33358.2982, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.4884848484848483, + "grad_norm": 0.007224396802484989, + "learning_rate": 8.631915179888566e-05, + "loss": 0.012431045062839985, + "num_input_tokens_seen": 67239856, + "step": 4106, + "train_runtime": 33366.4131, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.4890909090909092, + "grad_norm": 0.01208457536995411, + "learning_rate": 8.631254204999077e-05, + "loss": 0.012394698336720467, + "num_input_tokens_seen": 67256232, + "step": 4107, + "train_runtime": 33374.5316, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.4896969696969697, + "grad_norm": 0.010818401351571083, + "learning_rate": 8.630593095796567e-05, + "loss": 0.011898752301931381, + "num_input_tokens_seen": 67272608, + "step": 4108, + "train_runtime": 33382.6414, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 2.4903030303030302, + "grad_norm": 0.012678304687142372, + "learning_rate": 8.629931852305489e-05, + "loss": 0.013062065467238426, + "num_input_tokens_seen": 67288984, + "step": 4109, + "train_runtime": 33390.7564, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.4909090909090907, + "grad_norm": 0.008769886568188667, + "learning_rate": 8.629270474550302e-05, + "loss": 0.011773526668548584, + "num_input_tokens_seen": 67305360, + "step": 4110, + "train_runtime": 33398.868, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 2.4915151515151517, + "grad_norm": 0.2816353440284729, + "learning_rate": 8.628608962555467e-05, + "loss": 0.01199405174702406, + "num_input_tokens_seen": 67321736, + "step": 4111, + "train_runtime": 33406.9782, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 2.492121212121212, + "grad_norm": 0.00666635250672698, + "learning_rate": 8.627947316345452e-05, + "loss": 0.01267234981060028, + "num_input_tokens_seen": 67338112, + "step": 4112, + "train_runtime": 33415.0865, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.4927272727272727, + "grad_norm": 0.004921226762235165, + "learning_rate": 8.627285535944731e-05, + "loss": 0.011555058881640434, + "num_input_tokens_seen": 67354488, + "step": 4113, + "train_runtime": 33423.1999, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 2.493333333333333, + "grad_norm": 0.007291084621101618, + "learning_rate": 8.626623621377782e-05, + "loss": 0.011287565343081951, + "num_input_tokens_seen": 67370864, + "step": 4114, + "train_runtime": 33431.3078, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 2.493939393939394, + "grad_norm": 0.005141217261552811, + "learning_rate": 8.625961572669088e-05, + "loss": 0.012209695763885975, + "num_input_tokens_seen": 67387240, + "step": 4115, + "train_runtime": 33439.4199, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.4945454545454546, + "grad_norm": 0.008479267358779907, + "learning_rate": 8.625299389843137e-05, + "loss": 0.011675290763378143, + "num_input_tokens_seen": 67403616, + "step": 4116, + "train_runtime": 33447.5327, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.495151515151515, + "grad_norm": 0.008358309976756573, + "learning_rate": 8.62463707292442e-05, + "loss": 0.012512709014117718, + "num_input_tokens_seen": 67419992, + "step": 4117, + "train_runtime": 33455.6481, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.4957575757575756, + "grad_norm": 0.010408759117126465, + "learning_rate": 8.623974621937439e-05, + "loss": 0.012065138667821884, + "num_input_tokens_seen": 67436368, + "step": 4118, + "train_runtime": 33463.7671, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 2.496363636363636, + "grad_norm": 0.0021597829181700945, + "learning_rate": 8.623312036906693e-05, + "loss": 0.011656312271952629, + "num_input_tokens_seen": 67452744, + "step": 4119, + "train_runtime": 33471.8793, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 2.496969696969697, + "grad_norm": 0.005352547857910395, + "learning_rate": 8.62264931785669e-05, + "loss": 0.011543437838554382, + "num_input_tokens_seen": 67469120, + "step": 4120, + "train_runtime": 33479.9913, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 2.4975757575757576, + "grad_norm": 0.07341810315847397, + "learning_rate": 8.621986464811943e-05, + "loss": 0.01311152521520853, + "num_input_tokens_seen": 67485496, + "step": 4121, + "train_runtime": 33488.1054, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 2.498181818181818, + "grad_norm": 0.01662714220583439, + "learning_rate": 8.621323477796971e-05, + "loss": 0.012739536352455616, + "num_input_tokens_seen": 67501872, + "step": 4122, + "train_runtime": 33496.2192, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 2.498787878787879, + "grad_norm": 0.008028144016861916, + "learning_rate": 8.620660356836297e-05, + "loss": 0.012385563924908638, + "num_input_tokens_seen": 67518248, + "step": 4123, + "train_runtime": 33504.334, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 2.4993939393939395, + "grad_norm": 0.008046145550906658, + "learning_rate": 8.619997101954444e-05, + "loss": 0.0121089406311512, + "num_input_tokens_seen": 67534624, + "step": 4124, + "train_runtime": 33512.4463, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 2.5, + "grad_norm": 0.010293934494256973, + "learning_rate": 8.619333713175951e-05, + "loss": 0.012157551944255829, + "num_input_tokens_seen": 67551000, + "step": 4125, + "train_runtime": 33520.5584, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 2.5006060606060605, + "grad_norm": 0.013131627812981606, + "learning_rate": 8.618670190525352e-05, + "loss": 0.01264239102602005, + "num_input_tokens_seen": 67567376, + "step": 4126, + "train_runtime": 33528.6726, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 2.501212121212121, + "grad_norm": 0.005999131128191948, + "learning_rate": 8.618006534027188e-05, + "loss": 0.011658573523163795, + "num_input_tokens_seen": 67583752, + "step": 4127, + "train_runtime": 33536.7822, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 2.501818181818182, + "grad_norm": 0.005932522937655449, + "learning_rate": 8.61734274370601e-05, + "loss": 0.012348456308245659, + "num_input_tokens_seen": 67600128, + "step": 4128, + "train_runtime": 33544.896, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.5024242424242424, + "grad_norm": 0.007554478943347931, + "learning_rate": 8.616678819586367e-05, + "loss": 0.012308004312217236, + "num_input_tokens_seen": 67616504, + "step": 4129, + "train_runtime": 33553.0107, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.503030303030303, + "grad_norm": 0.012266907840967178, + "learning_rate": 8.616014761692816e-05, + "loss": 0.012991974130272865, + "num_input_tokens_seen": 67632880, + "step": 4130, + "train_runtime": 33561.1237, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 2.503636363636364, + "grad_norm": 0.009817441925406456, + "learning_rate": 8.615350570049924e-05, + "loss": 0.012946602888405323, + "num_input_tokens_seen": 67649256, + "step": 4131, + "train_runtime": 33569.2416, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 2.5042424242424244, + "grad_norm": 0.008134410716593266, + "learning_rate": 8.614686244682255e-05, + "loss": 0.01143142394721508, + "num_input_tokens_seen": 67665632, + "step": 4132, + "train_runtime": 33577.3553, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 2.504848484848485, + "grad_norm": 0.018325001001358032, + "learning_rate": 8.61402178561438e-05, + "loss": 0.01349793840199709, + "num_input_tokens_seen": 67682008, + "step": 4133, + "train_runtime": 33585.4664, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 2.5054545454545454, + "grad_norm": 0.006841942667961121, + "learning_rate": 8.613357192870876e-05, + "loss": 0.011571954004466534, + "num_input_tokens_seen": 67698384, + "step": 4134, + "train_runtime": 33593.5811, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.506060606060606, + "grad_norm": 0.012761874124407768, + "learning_rate": 8.612692466476328e-05, + "loss": 0.01210363395512104, + "num_input_tokens_seen": 67714760, + "step": 4135, + "train_runtime": 33601.6948, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 2.506666666666667, + "grad_norm": 0.009739668108522892, + "learning_rate": 8.61202760645532e-05, + "loss": 0.012247622944414616, + "num_input_tokens_seen": 67731136, + "step": 4136, + "train_runtime": 33609.8106, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 2.5072727272727273, + "grad_norm": 0.008258581161499023, + "learning_rate": 8.611362612832445e-05, + "loss": 0.011566182598471642, + "num_input_tokens_seen": 67747512, + "step": 4137, + "train_runtime": 33617.9329, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 2.507878787878788, + "grad_norm": 0.010635389015078545, + "learning_rate": 8.610697485632299e-05, + "loss": 0.011706124991178513, + "num_input_tokens_seen": 67763888, + "step": 4138, + "train_runtime": 33626.048, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 2.5084848484848488, + "grad_norm": 0.021257584914565086, + "learning_rate": 8.610032224879486e-05, + "loss": 0.014669377356767654, + "num_input_tokens_seen": 67780264, + "step": 4139, + "train_runtime": 33634.164, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 2.509090909090909, + "grad_norm": 0.008578256703913212, + "learning_rate": 8.60936683059861e-05, + "loss": 0.012150867842137814, + "num_input_tokens_seen": 67796640, + "step": 4140, + "train_runtime": 33642.2754, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.5096969696969698, + "grad_norm": 0.0066306074149906635, + "learning_rate": 8.608701302814286e-05, + "loss": 0.013020064681768417, + "num_input_tokens_seen": 67813016, + "step": 4141, + "train_runtime": 33650.3872, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.5103030303030303, + "grad_norm": 0.013219548389315605, + "learning_rate": 8.608035641551127e-05, + "loss": 0.011568482965230942, + "num_input_tokens_seen": 67829392, + "step": 4142, + "train_runtime": 33658.5003, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 2.5109090909090908, + "grad_norm": 0.04668772220611572, + "learning_rate": 8.607369846833754e-05, + "loss": 0.013515348546206951, + "num_input_tokens_seen": 67845768, + "step": 4143, + "train_runtime": 33666.6156, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 2.5115151515151517, + "grad_norm": 0.009873609058558941, + "learning_rate": 8.606703918686799e-05, + "loss": 0.011558369733393192, + "num_input_tokens_seen": 67862144, + "step": 4144, + "train_runtime": 33674.7326, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 2.512121212121212, + "grad_norm": 0.013589495792984962, + "learning_rate": 8.606037857134887e-05, + "loss": 0.011783171445131302, + "num_input_tokens_seen": 67878520, + "step": 4145, + "train_runtime": 33682.8418, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 2.5127272727272727, + "grad_norm": 0.008833514526486397, + "learning_rate": 8.60537166220266e-05, + "loss": 0.013286241330206394, + "num_input_tokens_seen": 67894896, + "step": 4146, + "train_runtime": 33690.958, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 2.513333333333333, + "grad_norm": 0.006422446109354496, + "learning_rate": 8.604705333914754e-05, + "loss": 0.012631628662347794, + "num_input_tokens_seen": 67911272, + "step": 4147, + "train_runtime": 33699.0703, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 2.5139393939393937, + "grad_norm": 0.006251727230846882, + "learning_rate": 8.604038872295817e-05, + "loss": 0.013666517101228237, + "num_input_tokens_seen": 67927648, + "step": 4148, + "train_runtime": 33707.1783, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 2.5145454545454546, + "grad_norm": 0.00579094560816884, + "learning_rate": 8.603372277370503e-05, + "loss": 0.012197660282254219, + "num_input_tokens_seen": 67944024, + "step": 4149, + "train_runtime": 33715.2903, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 2.515151515151515, + "grad_norm": 0.016107073053717613, + "learning_rate": 8.602705549163464e-05, + "loss": 0.012970278970897198, + "num_input_tokens_seen": 67960400, + "step": 4150, + "train_runtime": 33723.4005, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 2.5157575757575756, + "grad_norm": 0.01195712573826313, + "learning_rate": 8.602038687699364e-05, + "loss": 0.011596700176596642, + "num_input_tokens_seen": 67976776, + "step": 4151, + "train_runtime": 33731.5095, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 2.5163636363636366, + "grad_norm": 0.001594836125150323, + "learning_rate": 8.601371693002865e-05, + "loss": 0.011990657076239586, + "num_input_tokens_seen": 67993152, + "step": 4152, + "train_runtime": 33739.6194, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 2.516969696969697, + "grad_norm": 0.010010171681642532, + "learning_rate": 8.600704565098643e-05, + "loss": 0.012920582666993141, + "num_input_tokens_seen": 68009528, + "step": 4153, + "train_runtime": 33747.7333, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 2.5175757575757576, + "grad_norm": 0.0065169380977749825, + "learning_rate": 8.600037304011371e-05, + "loss": 0.013019610196352005, + "num_input_tokens_seen": 68025904, + "step": 4154, + "train_runtime": 33755.8473, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 2.518181818181818, + "grad_norm": 0.02080150693655014, + "learning_rate": 8.599369909765729e-05, + "loss": 0.013795982114970684, + "num_input_tokens_seen": 68042280, + "step": 4155, + "train_runtime": 33763.9635, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 2.5187878787878786, + "grad_norm": 0.013080280274152756, + "learning_rate": 8.598702382386403e-05, + "loss": 0.013059237971901894, + "num_input_tokens_seen": 68058656, + "step": 4156, + "train_runtime": 33772.0745, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 2.5193939393939395, + "grad_norm": 0.012310740537941456, + "learning_rate": 8.598034721898085e-05, + "loss": 0.012900668196380138, + "num_input_tokens_seen": 68075032, + "step": 4157, + "train_runtime": 33780.1885, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 2.52, + "grad_norm": 0.005335957743227482, + "learning_rate": 8.59736692832547e-05, + "loss": 0.012863250449299812, + "num_input_tokens_seen": 68091408, + "step": 4158, + "train_runtime": 33788.3, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 2.5206060606060605, + "grad_norm": 0.008592368103563786, + "learning_rate": 8.596699001693255e-05, + "loss": 0.011448989622294903, + "num_input_tokens_seen": 68107784, + "step": 4159, + "train_runtime": 33796.4103, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 2.5212121212121215, + "grad_norm": 0.06237168610095978, + "learning_rate": 8.596030942026152e-05, + "loss": 0.014227611944079399, + "num_input_tokens_seen": 68124160, + "step": 4160, + "train_runtime": 33804.5229, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 2.521818181818182, + "grad_norm": 0.007747406605631113, + "learning_rate": 8.595362749348866e-05, + "loss": 0.011239602230489254, + "num_input_tokens_seen": 68140536, + "step": 4161, + "train_runtime": 33812.6408, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 2.5224242424242425, + "grad_norm": 0.008204971440136433, + "learning_rate": 8.594694423686112e-05, + "loss": 0.013253196142613888, + "num_input_tokens_seen": 68156912, + "step": 4162, + "train_runtime": 33820.7552, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 2.523030303030303, + "grad_norm": 0.008401142433285713, + "learning_rate": 8.594025965062613e-05, + "loss": 0.012079546228051186, + "num_input_tokens_seen": 68173288, + "step": 4163, + "train_runtime": 33828.8685, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 2.5236363636363635, + "grad_norm": 0.009007366374135017, + "learning_rate": 8.593357373503093e-05, + "loss": 0.01343685481697321, + "num_input_tokens_seen": 68189664, + "step": 4164, + "train_runtime": 33836.981, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 2.5242424242424244, + "grad_norm": 0.0051362281665205956, + "learning_rate": 8.592688649032282e-05, + "loss": 0.01115061528980732, + "num_input_tokens_seen": 68206040, + "step": 4165, + "train_runtime": 33845.0962, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 2.524848484848485, + "grad_norm": 0.009801853448152542, + "learning_rate": 8.592019791674913e-05, + "loss": 0.012383127585053444, + "num_input_tokens_seen": 68222416, + "step": 4166, + "train_runtime": 33853.2083, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 2.5254545454545454, + "grad_norm": 0.010700262151658535, + "learning_rate": 8.591350801455726e-05, + "loss": 0.012222339399158955, + "num_input_tokens_seen": 68238792, + "step": 4167, + "train_runtime": 33861.3194, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 2.526060606060606, + "grad_norm": 0.05414673313498497, + "learning_rate": 8.590681678399469e-05, + "loss": 0.013189787045121193, + "num_input_tokens_seen": 68255168, + "step": 4168, + "train_runtime": 33869.4378, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 2.5266666666666664, + "grad_norm": 0.010257396847009659, + "learning_rate": 8.590012422530889e-05, + "loss": 0.011694240383803844, + "num_input_tokens_seen": 68271544, + "step": 4169, + "train_runtime": 33877.553, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 2.5272727272727273, + "grad_norm": 0.00906137190759182, + "learning_rate": 8.58934303387474e-05, + "loss": 0.012298794463276863, + "num_input_tokens_seen": 68287920, + "step": 4170, + "train_runtime": 33885.6639, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 2.527878787878788, + "grad_norm": 0.01206654217094183, + "learning_rate": 8.588673512455781e-05, + "loss": 0.011892374604940414, + "num_input_tokens_seen": 68304296, + "step": 4171, + "train_runtime": 33893.7734, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 2.5284848484848483, + "grad_norm": 0.006913334131240845, + "learning_rate": 8.588003858298778e-05, + "loss": 0.013139956630766392, + "num_input_tokens_seen": 68320672, + "step": 4172, + "train_runtime": 33901.8855, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 2.5290909090909093, + "grad_norm": 0.00906689465045929, + "learning_rate": 8.5873340714285e-05, + "loss": 0.01318311132490635, + "num_input_tokens_seen": 68337048, + "step": 4173, + "train_runtime": 33909.996, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 2.5296969696969698, + "grad_norm": 0.006641354411840439, + "learning_rate": 8.586664151869721e-05, + "loss": 0.012888241559267044, + "num_input_tokens_seen": 68353424, + "step": 4174, + "train_runtime": 33918.1078, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 2.5303030303030303, + "grad_norm": 0.012540820986032486, + "learning_rate": 8.585994099647218e-05, + "loss": 0.012079787440598011, + "num_input_tokens_seen": 68369800, + "step": 4175, + "train_runtime": 33926.2183, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 2.5309090909090908, + "grad_norm": 0.012157919816672802, + "learning_rate": 8.58532391478578e-05, + "loss": 0.011909179389476776, + "num_input_tokens_seen": 68386176, + "step": 4176, + "train_runtime": 33934.333, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 2.5315151515151513, + "grad_norm": 0.007573497481644154, + "learning_rate": 8.584653597310191e-05, + "loss": 0.012104995548725128, + "num_input_tokens_seen": 68402552, + "step": 4177, + "train_runtime": 33942.4483, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 2.532121212121212, + "grad_norm": 0.01022687740623951, + "learning_rate": 8.583983147245244e-05, + "loss": 0.012559885159134865, + "num_input_tokens_seen": 68418928, + "step": 4178, + "train_runtime": 33950.5648, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 2.5327272727272727, + "grad_norm": 0.010468428023159504, + "learning_rate": 8.583312564615744e-05, + "loss": 0.011742668226361275, + "num_input_tokens_seen": 68435304, + "step": 4179, + "train_runtime": 33958.6795, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 0.0068651726469397545, + "learning_rate": 8.582641849446487e-05, + "loss": 0.011383350938558578, + "num_input_tokens_seen": 68451680, + "step": 4180, + "train_runtime": 33966.7899, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 2.533939393939394, + "grad_norm": 0.009449625387787819, + "learning_rate": 8.581971001762286e-05, + "loss": 0.013028772547841072, + "num_input_tokens_seen": 68468056, + "step": 4181, + "train_runtime": 33974.9032, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 2.5345454545454547, + "grad_norm": 0.008841153234243393, + "learning_rate": 8.581300021587955e-05, + "loss": 0.012170545756816864, + "num_input_tokens_seen": 68484432, + "step": 4182, + "train_runtime": 33983.0125, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 2.535151515151515, + "grad_norm": 0.007193283643573523, + "learning_rate": 8.580628908948308e-05, + "loss": 0.011749835684895515, + "num_input_tokens_seen": 68500808, + "step": 4183, + "train_runtime": 33991.1323, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 2.5357575757575757, + "grad_norm": 0.004431907087564468, + "learning_rate": 8.57995766386817e-05, + "loss": 0.011753798462450504, + "num_input_tokens_seen": 68517184, + "step": 4184, + "train_runtime": 33999.2431, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 2.536363636363636, + "grad_norm": 0.0062288763001561165, + "learning_rate": 8.579286286372372e-05, + "loss": 0.01223161444067955, + "num_input_tokens_seen": 68533560, + "step": 4185, + "train_runtime": 34007.3525, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 2.536969696969697, + "grad_norm": 0.011136210523545742, + "learning_rate": 8.578614776485743e-05, + "loss": 0.012547209858894348, + "num_input_tokens_seen": 68549936, + "step": 4186, + "train_runtime": 34015.4627, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 2.5375757575757576, + "grad_norm": 0.009545357897877693, + "learning_rate": 8.577943134233124e-05, + "loss": 0.012209147214889526, + "num_input_tokens_seen": 68566312, + "step": 4187, + "train_runtime": 34023.5776, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 2.538181818181818, + "grad_norm": 0.006777395494282246, + "learning_rate": 8.577271359639356e-05, + "loss": 0.012126508168876171, + "num_input_tokens_seen": 68582688, + "step": 4188, + "train_runtime": 34031.6894, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 2.538787878787879, + "grad_norm": 0.008090141229331493, + "learning_rate": 8.576599452729287e-05, + "loss": 0.012071416713297367, + "num_input_tokens_seen": 68599064, + "step": 4189, + "train_runtime": 34039.8004, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 2.5393939393939395, + "grad_norm": 0.009810656309127808, + "learning_rate": 8.575927413527767e-05, + "loss": 0.01236578356474638, + "num_input_tokens_seen": 68615440, + "step": 4190, + "train_runtime": 34047.9137, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 2.54, + "grad_norm": 0.005306133069097996, + "learning_rate": 8.575255242059656e-05, + "loss": 0.011993910185992718, + "num_input_tokens_seen": 68631816, + "step": 4191, + "train_runtime": 34056.0321, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 2.5406060606060605, + "grad_norm": 0.007159395609050989, + "learning_rate": 8.574582938349817e-05, + "loss": 0.011761287227272987, + "num_input_tokens_seen": 68648192, + "step": 4192, + "train_runtime": 34064.1465, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 2.541212121212121, + "grad_norm": 0.009937534108757973, + "learning_rate": 8.573910502423119e-05, + "loss": 0.012713994830846786, + "num_input_tokens_seen": 68664568, + "step": 4193, + "train_runtime": 34072.2598, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 2.541818181818182, + "grad_norm": 0.015779662877321243, + "learning_rate": 8.573237934304428e-05, + "loss": 0.014923243783414364, + "num_input_tokens_seen": 68680944, + "step": 4194, + "train_runtime": 34080.376, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 2.5424242424242425, + "grad_norm": 0.009395881555974483, + "learning_rate": 8.572565234018626e-05, + "loss": 0.010241414420306683, + "num_input_tokens_seen": 68697320, + "step": 4195, + "train_runtime": 34088.4922, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 2.543030303030303, + "grad_norm": 0.013018476776778698, + "learning_rate": 8.571892401590591e-05, + "loss": 0.012208077125251293, + "num_input_tokens_seen": 68713696, + "step": 4196, + "train_runtime": 34096.6057, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 2.5436363636363635, + "grad_norm": 0.0026204294990748167, + "learning_rate": 8.571219437045214e-05, + "loss": 0.012451937422156334, + "num_input_tokens_seen": 68730072, + "step": 4197, + "train_runtime": 34104.7223, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 2.544242424242424, + "grad_norm": 0.0031558230984956026, + "learning_rate": 8.570546340407386e-05, + "loss": 0.011697427369654179, + "num_input_tokens_seen": 68746448, + "step": 4198, + "train_runtime": 34112.8409, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 2.544848484848485, + "grad_norm": 0.013345579616725445, + "learning_rate": 8.569873111702e-05, + "loss": 0.01368304155766964, + "num_input_tokens_seen": 68762824, + "step": 4199, + "train_runtime": 34120.9512, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 2.5454545454545454, + "grad_norm": 0.012526432983577251, + "learning_rate": 8.56919975095396e-05, + "loss": 0.01308903656899929, + "num_input_tokens_seen": 68779200, + "step": 4200, + "train_runtime": 34129.0659, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 2.546060606060606, + "grad_norm": 0.004964140709489584, + "learning_rate": 8.568526258188172e-05, + "loss": 0.01307650189846754, + "num_input_tokens_seen": 68795576, + "step": 4201, + "train_runtime": 34138.0952, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.546666666666667, + "grad_norm": 0.009814539924263954, + "learning_rate": 8.567852633429547e-05, + "loss": 0.012293417938053608, + "num_input_tokens_seen": 68811952, + "step": 4202, + "train_runtime": 34146.2051, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 2.5472727272727274, + "grad_norm": 0.011141370050609112, + "learning_rate": 8.567178876703002e-05, + "loss": 0.011135764420032501, + "num_input_tokens_seen": 68828328, + "step": 4203, + "train_runtime": 34154.3146, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 2.547878787878788, + "grad_norm": 0.008699796162545681, + "learning_rate": 8.566504988033456e-05, + "loss": 0.011189600452780724, + "num_input_tokens_seen": 68844704, + "step": 4204, + "train_runtime": 34162.4319, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 2.5484848484848484, + "grad_norm": 0.009542635641992092, + "learning_rate": 8.565830967445836e-05, + "loss": 0.012678613886237144, + "num_input_tokens_seen": 68861080, + "step": 4205, + "train_runtime": 34170.5461, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 2.549090909090909, + "grad_norm": 0.01241234689950943, + "learning_rate": 8.565156814965074e-05, + "loss": 0.011123303323984146, + "num_input_tokens_seen": 68877456, + "step": 4206, + "train_runtime": 34178.6631, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.54969696969697, + "grad_norm": 0.008778631687164307, + "learning_rate": 8.564482530616103e-05, + "loss": 0.013528062030673027, + "num_input_tokens_seen": 68893832, + "step": 4207, + "train_runtime": 34186.7778, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 2.5503030303030303, + "grad_norm": 0.006002672016620636, + "learning_rate": 8.563808114423864e-05, + "loss": 0.012405400164425373, + "num_input_tokens_seen": 68910208, + "step": 4208, + "train_runtime": 34194.8903, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 2.550909090909091, + "grad_norm": 0.008005453273653984, + "learning_rate": 8.563133566413304e-05, + "loss": 0.011684123426675797, + "num_input_tokens_seen": 68926584, + "step": 4209, + "train_runtime": 34203.0056, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 2.5515151515151517, + "grad_norm": 0.009759655222296715, + "learning_rate": 8.562458886609372e-05, + "loss": 0.012069775722920895, + "num_input_tokens_seen": 68942960, + "step": 4210, + "train_runtime": 34211.121, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 2.5521212121212122, + "grad_norm": 0.007833522744476795, + "learning_rate": 8.561784075037023e-05, + "loss": 0.012013886123895645, + "num_input_tokens_seen": 68959336, + "step": 4211, + "train_runtime": 34219.2324, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.5527272727272727, + "grad_norm": 0.011014553718268871, + "learning_rate": 8.561109131721219e-05, + "loss": 0.01208100188523531, + "num_input_tokens_seen": 68975712, + "step": 4212, + "train_runtime": 34227.3444, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.5533333333333332, + "grad_norm": 0.008966478519141674, + "learning_rate": 8.560434056686921e-05, + "loss": 0.013254774734377861, + "num_input_tokens_seen": 68992088, + "step": 4213, + "train_runtime": 34235.4627, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 2.5539393939393937, + "grad_norm": 0.00642790924757719, + "learning_rate": 8.559758849959103e-05, + "loss": 0.013158031739294529, + "num_input_tokens_seen": 69008464, + "step": 4214, + "train_runtime": 34243.5727, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 2.5545454545454547, + "grad_norm": 0.005826066713780165, + "learning_rate": 8.559083511562735e-05, + "loss": 0.009736889973282814, + "num_input_tokens_seen": 69024840, + "step": 4215, + "train_runtime": 34251.6817, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 2.555151515151515, + "grad_norm": 0.007942257449030876, + "learning_rate": 8.558408041522801e-05, + "loss": 0.012654442340135574, + "num_input_tokens_seen": 69041216, + "step": 4216, + "train_runtime": 34259.7968, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 2.5557575757575757, + "grad_norm": 0.004941877909004688, + "learning_rate": 8.557732439864283e-05, + "loss": 0.011076820082962513, + "num_input_tokens_seen": 69057592, + "step": 4217, + "train_runtime": 34267.9065, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 2.5563636363636366, + "grad_norm": 0.008505112491548061, + "learning_rate": 8.55705670661217e-05, + "loss": 0.012647919356822968, + "num_input_tokens_seen": 69073968, + "step": 4218, + "train_runtime": 34276.0173, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 2.556969696969697, + "grad_norm": 0.007098487112671137, + "learning_rate": 8.556380841791455e-05, + "loss": 0.012106881476938725, + "num_input_tokens_seen": 69090344, + "step": 4219, + "train_runtime": 34284.1318, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 2.5575757575757576, + "grad_norm": 0.006404672283679247, + "learning_rate": 8.555704845427142e-05, + "loss": 0.011900501325726509, + "num_input_tokens_seen": 69106720, + "step": 4220, + "train_runtime": 34292.3835, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 2.558181818181818, + "grad_norm": 0.006173280067741871, + "learning_rate": 8.555028717544227e-05, + "loss": 0.012371562421321869, + "num_input_tokens_seen": 69123096, + "step": 4221, + "train_runtime": 34300.4927, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.5587878787878786, + "grad_norm": 0.0037785377353429794, + "learning_rate": 8.554352458167727e-05, + "loss": 0.011552992276847363, + "num_input_tokens_seen": 69139472, + "step": 4222, + "train_runtime": 34308.6035, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 2.5593939393939396, + "grad_norm": 0.0059936679899692535, + "learning_rate": 8.553676067322649e-05, + "loss": 0.012954373843967915, + "num_input_tokens_seen": 69155848, + "step": 4223, + "train_runtime": 34316.7127, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 2.56, + "grad_norm": 0.010465111583471298, + "learning_rate": 8.552999545034013e-05, + "loss": 0.012163587845861912, + "num_input_tokens_seen": 69172224, + "step": 4224, + "train_runtime": 34324.8327, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 2.5606060606060606, + "grad_norm": 0.008582420647144318, + "learning_rate": 8.552322891326846e-05, + "loss": 0.012270374223589897, + "num_input_tokens_seen": 69188600, + "step": 4225, + "train_runtime": 34332.942, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 2.561212121212121, + "grad_norm": 0.008858418092131615, + "learning_rate": 8.551646106226169e-05, + "loss": 0.012922226451337337, + "num_input_tokens_seen": 69204976, + "step": 4226, + "train_runtime": 34341.0566, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 2.5618181818181816, + "grad_norm": 0.008054768666625023, + "learning_rate": 8.550969189757021e-05, + "loss": 0.011254087090492249, + "num_input_tokens_seen": 69221352, + "step": 4227, + "train_runtime": 34349.1678, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 2.5624242424242425, + "grad_norm": 0.007701630238443613, + "learning_rate": 8.550292141944439e-05, + "loss": 0.011938437819480896, + "num_input_tokens_seen": 69237728, + "step": 4228, + "train_runtime": 34357.2795, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 2.563030303030303, + "grad_norm": 0.003518422832712531, + "learning_rate": 8.549614962813464e-05, + "loss": 0.011110475286841393, + "num_input_tokens_seen": 69254104, + "step": 4229, + "train_runtime": 34365.3918, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 2.5636363636363635, + "grad_norm": 0.00661065336316824, + "learning_rate": 8.548937652389142e-05, + "loss": 0.012568378821015358, + "num_input_tokens_seen": 69270480, + "step": 4230, + "train_runtime": 34373.5045, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 2.5642424242424244, + "grad_norm": 0.012559509836137295, + "learning_rate": 8.54826021069653e-05, + "loss": 0.01257312297821045, + "num_input_tokens_seen": 69286856, + "step": 4231, + "train_runtime": 34381.6197, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 2.564848484848485, + "grad_norm": 0.014508885331451893, + "learning_rate": 8.547582637760681e-05, + "loss": 0.013298265635967255, + "num_input_tokens_seen": 69303232, + "step": 4232, + "train_runtime": 34389.7349, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 2.5654545454545454, + "grad_norm": 0.008601444773375988, + "learning_rate": 8.546904933606661e-05, + "loss": 0.01283743791282177, + "num_input_tokens_seen": 69319608, + "step": 4233, + "train_runtime": 34397.8502, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 2.566060606060606, + "grad_norm": 0.010805055499076843, + "learning_rate": 8.546227098259532e-05, + "loss": 0.011883988976478577, + "num_input_tokens_seen": 69335984, + "step": 4234, + "train_runtime": 34405.9644, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 2.5666666666666664, + "grad_norm": 0.03161879628896713, + "learning_rate": 8.545549131744371e-05, + "loss": 0.01316217053681612, + "num_input_tokens_seen": 69352360, + "step": 4235, + "train_runtime": 34414.0772, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 2.5672727272727274, + "grad_norm": 0.01094900630414486, + "learning_rate": 8.54487103408625e-05, + "loss": 0.012402595020830631, + "num_input_tokens_seen": 69368736, + "step": 4236, + "train_runtime": 34422.1893, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 2.567878787878788, + "grad_norm": 0.0038218225818127394, + "learning_rate": 8.544192805310254e-05, + "loss": 0.011664657853543758, + "num_input_tokens_seen": 69385112, + "step": 4237, + "train_runtime": 34430.3062, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 2.5684848484848484, + "grad_norm": 0.007981471717357635, + "learning_rate": 8.54351444544147e-05, + "loss": 0.011700598523020744, + "num_input_tokens_seen": 69401488, + "step": 4238, + "train_runtime": 34438.4232, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 2.5690909090909093, + "grad_norm": 0.014592055231332779, + "learning_rate": 8.542835954504984e-05, + "loss": 0.01353910006582737, + "num_input_tokens_seen": 69417864, + "step": 4239, + "train_runtime": 34446.535, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 2.56969696969697, + "grad_norm": 0.00856589712202549, + "learning_rate": 8.542157332525898e-05, + "loss": 0.011866646818816662, + "num_input_tokens_seen": 69434240, + "step": 4240, + "train_runtime": 34454.6455, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 2.5703030303030303, + "grad_norm": 0.006065376102924347, + "learning_rate": 8.541478579529308e-05, + "loss": 0.012246459722518921, + "num_input_tokens_seen": 69450616, + "step": 4241, + "train_runtime": 34462.756, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 2.570909090909091, + "grad_norm": 0.007532956078648567, + "learning_rate": 8.540799695540325e-05, + "loss": 0.012833459302783012, + "num_input_tokens_seen": 69466992, + "step": 4242, + "train_runtime": 34470.8686, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 2.5715151515151513, + "grad_norm": 0.006494333501905203, + "learning_rate": 8.540120680584054e-05, + "loss": 0.01119618583470583, + "num_input_tokens_seen": 69483368, + "step": 4243, + "train_runtime": 34478.9822, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 2.5721212121212123, + "grad_norm": 0.007824079133570194, + "learning_rate": 8.539441534685614e-05, + "loss": 0.010887386277318, + "num_input_tokens_seen": 69499744, + "step": 4244, + "train_runtime": 34487.0921, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 2.5727272727272728, + "grad_norm": 0.012008079327642918, + "learning_rate": 8.538762257870124e-05, + "loss": 0.012646320275962353, + "num_input_tokens_seen": 69516120, + "step": 4245, + "train_runtime": 34495.2044, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 2.5733333333333333, + "grad_norm": 0.007631377782672644, + "learning_rate": 8.53808285016271e-05, + "loss": 0.01180965080857277, + "num_input_tokens_seen": 69532496, + "step": 4246, + "train_runtime": 34503.3134, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 2.573939393939394, + "grad_norm": 0.014731809496879578, + "learning_rate": 8.5374033115885e-05, + "loss": 0.01271150540560484, + "num_input_tokens_seen": 69548872, + "step": 4247, + "train_runtime": 34511.4326, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 2.5745454545454547, + "grad_norm": 0.01020353939384222, + "learning_rate": 8.536723642172632e-05, + "loss": 0.01370199117809534, + "num_input_tokens_seen": 69565248, + "step": 4248, + "train_runtime": 34519.5422, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 2.575151515151515, + "grad_norm": 0.009751253761351109, + "learning_rate": 8.536043841940245e-05, + "loss": 0.013536883518099785, + "num_input_tokens_seen": 69581624, + "step": 4249, + "train_runtime": 34527.65, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 2.5757575757575757, + "grad_norm": 0.0058158026076853275, + "learning_rate": 8.535363910916481e-05, + "loss": 0.01220100000500679, + "num_input_tokens_seen": 69598000, + "step": 4250, + "train_runtime": 34535.7639, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 2.576363636363636, + "grad_norm": 0.013153862208127975, + "learning_rate": 8.53468384912649e-05, + "loss": 0.012638058513402939, + "num_input_tokens_seen": 69614376, + "step": 4251, + "train_runtime": 34543.8777, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 2.576969696969697, + "grad_norm": 0.007359377574175596, + "learning_rate": 8.534003656595429e-05, + "loss": 0.012561459094285965, + "num_input_tokens_seen": 69630752, + "step": 4252, + "train_runtime": 34551.9942, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 2.5775757575757576, + "grad_norm": 0.005675836466252804, + "learning_rate": 8.533323333348452e-05, + "loss": 0.011403108946979046, + "num_input_tokens_seen": 69647128, + "step": 4253, + "train_runtime": 34560.1131, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 2.578181818181818, + "grad_norm": 0.010039916262030602, + "learning_rate": 8.532642879410728e-05, + "loss": 0.011742551811039448, + "num_input_tokens_seen": 69663504, + "step": 4254, + "train_runtime": 34568.2215, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 2.5787878787878786, + "grad_norm": 0.02736113779246807, + "learning_rate": 8.531962294807423e-05, + "loss": 0.012846502475440502, + "num_input_tokens_seen": 69679880, + "step": 4255, + "train_runtime": 34576.3364, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 2.579393939393939, + "grad_norm": 0.006057315971702337, + "learning_rate": 8.53128157956371e-05, + "loss": 0.011819146573543549, + "num_input_tokens_seen": 69696256, + "step": 4256, + "train_runtime": 34584.4472, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 2.58, + "grad_norm": 0.003937495406717062, + "learning_rate": 8.53060073370477e-05, + "loss": 0.01222772616893053, + "num_input_tokens_seen": 69712632, + "step": 4257, + "train_runtime": 34592.5597, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 2.5806060606060606, + "grad_norm": 0.009848159737884998, + "learning_rate": 8.529919757255783e-05, + "loss": 0.011999274604022503, + "num_input_tokens_seen": 69729008, + "step": 4258, + "train_runtime": 34600.6717, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 2.581212121212121, + "grad_norm": 0.007803330197930336, + "learning_rate": 8.529238650241938e-05, + "loss": 0.01272343099117279, + "num_input_tokens_seen": 69745384, + "step": 4259, + "train_runtime": 34608.7837, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 2.581818181818182, + "grad_norm": 0.008959654718637466, + "learning_rate": 8.528557412688429e-05, + "loss": 0.012705165892839432, + "num_input_tokens_seen": 69761760, + "step": 4260, + "train_runtime": 34616.8925, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 2.5824242424242425, + "grad_norm": 0.005905602592974901, + "learning_rate": 8.527876044620453e-05, + "loss": 0.01188373938202858, + "num_input_tokens_seen": 69778136, + "step": 4261, + "train_runtime": 34625.0046, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 2.583030303030303, + "grad_norm": 0.03104606829583645, + "learning_rate": 8.527194546063211e-05, + "loss": 0.012363776564598083, + "num_input_tokens_seen": 69794512, + "step": 4262, + "train_runtime": 34633.1144, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 2.5836363636363635, + "grad_norm": 0.013915720395743847, + "learning_rate": 8.526512917041913e-05, + "loss": 0.012315332889556885, + "num_input_tokens_seen": 69810888, + "step": 4263, + "train_runtime": 34641.2316, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 2.584242424242424, + "grad_norm": 0.009097406640648842, + "learning_rate": 8.52583115758177e-05, + "loss": 0.013435162603855133, + "num_input_tokens_seen": 69827264, + "step": 4264, + "train_runtime": 34649.3457, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 2.584848484848485, + "grad_norm": 0.008107993751764297, + "learning_rate": 8.525149267707999e-05, + "loss": 0.012399150058627129, + "num_input_tokens_seen": 69843640, + "step": 4265, + "train_runtime": 34657.4573, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 2.5854545454545454, + "grad_norm": 0.010603228583931923, + "learning_rate": 8.52446724744582e-05, + "loss": 0.0121312802657485, + "num_input_tokens_seen": 69860016, + "step": 4266, + "train_runtime": 34665.5679, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 2.586060606060606, + "grad_norm": 0.006463999394327402, + "learning_rate": 8.523785096820462e-05, + "loss": 0.01217202190309763, + "num_input_tokens_seen": 69876392, + "step": 4267, + "train_runtime": 34673.6812, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 2.586666666666667, + "grad_norm": 0.008864687755703926, + "learning_rate": 8.523102815857154e-05, + "loss": 0.011695782653987408, + "num_input_tokens_seen": 69892768, + "step": 4268, + "train_runtime": 34681.7944, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 2.5872727272727274, + "grad_norm": 0.01097130123525858, + "learning_rate": 8.522420404581136e-05, + "loss": 0.012454645708203316, + "num_input_tokens_seen": 69909144, + "step": 4269, + "train_runtime": 34689.9035, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 2.587878787878788, + "grad_norm": 0.028967060148715973, + "learning_rate": 8.521737863017644e-05, + "loss": 0.012948554009199142, + "num_input_tokens_seen": 69925520, + "step": 4270, + "train_runtime": 34698.0129, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 2.5884848484848484, + "grad_norm": 0.008769636042416096, + "learning_rate": 8.52105519119193e-05, + "loss": 0.01239432767033577, + "num_input_tokens_seen": 69941896, + "step": 4271, + "train_runtime": 34706.121, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 2.589090909090909, + "grad_norm": 0.023006586357951164, + "learning_rate": 8.520372389129241e-05, + "loss": 0.012152011506259441, + "num_input_tokens_seen": 69958272, + "step": 4272, + "train_runtime": 34714.2336, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 2.58969696969697, + "grad_norm": 0.019926972687244415, + "learning_rate": 8.519689456854831e-05, + "loss": 0.012471056543290615, + "num_input_tokens_seen": 69974648, + "step": 4273, + "train_runtime": 34722.3449, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 2.5903030303030303, + "grad_norm": 0.012716429308056831, + "learning_rate": 8.519006394393964e-05, + "loss": 0.01244234200567007, + "num_input_tokens_seen": 69991024, + "step": 4274, + "train_runtime": 34730.4583, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 2.590909090909091, + "grad_norm": 0.005737027153372765, + "learning_rate": 8.518323201771903e-05, + "loss": 0.011021402664482594, + "num_input_tokens_seen": 70007400, + "step": 4275, + "train_runtime": 34738.5701, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 2.5915151515151518, + "grad_norm": 0.007127284538000822, + "learning_rate": 8.517639879013916e-05, + "loss": 0.01087925210595131, + "num_input_tokens_seen": 70023776, + "step": 4276, + "train_runtime": 34746.6823, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 2.5921212121212123, + "grad_norm": 0.009442265145480633, + "learning_rate": 8.516956426145284e-05, + "loss": 0.013851032592356205, + "num_input_tokens_seen": 70040152, + "step": 4277, + "train_runtime": 34754.7971, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 2.5927272727272728, + "grad_norm": 0.009272607043385506, + "learning_rate": 8.51627284319128e-05, + "loss": 0.012256700545549393, + "num_input_tokens_seen": 70056528, + "step": 4278, + "train_runtime": 34762.9135, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 2.5933333333333333, + "grad_norm": 0.01035922672599554, + "learning_rate": 8.515589130177192e-05, + "loss": 0.012722386047244072, + "num_input_tokens_seen": 70072904, + "step": 4279, + "train_runtime": 34771.0331, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 2.5939393939393938, + "grad_norm": 0.015355154871940613, + "learning_rate": 8.51490528712831e-05, + "loss": 0.013812702149152756, + "num_input_tokens_seen": 70089280, + "step": 4280, + "train_runtime": 34779.1467, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 2.5945454545454547, + "grad_norm": 0.008414781652390957, + "learning_rate": 8.514221314069923e-05, + "loss": 0.011847485788166523, + "num_input_tokens_seen": 70105656, + "step": 4281, + "train_runtime": 34787.2616, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 2.595151515151515, + "grad_norm": 0.007625683676451445, + "learning_rate": 8.513537211027336e-05, + "loss": 0.011404497548937798, + "num_input_tokens_seen": 70122032, + "step": 4282, + "train_runtime": 34795.3715, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 2.5957575757575757, + "grad_norm": 0.010032870806753635, + "learning_rate": 8.51285297802585e-05, + "loss": 0.011987663805484772, + "num_input_tokens_seen": 70138408, + "step": 4283, + "train_runtime": 34803.4862, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 2.596363636363636, + "grad_norm": 0.017785673961043358, + "learning_rate": 8.512168615090773e-05, + "loss": 0.01496301218867302, + "num_input_tokens_seen": 70154784, + "step": 4284, + "train_runtime": 34811.6005, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 2.5969696969696967, + "grad_norm": 0.011660014279186726, + "learning_rate": 8.511484122247416e-05, + "loss": 0.013330879621207714, + "num_input_tokens_seen": 70171160, + "step": 4285, + "train_runtime": 34819.7127, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 2.5975757575757576, + "grad_norm": 0.006647603120654821, + "learning_rate": 8.510799499521103e-05, + "loss": 0.01146023441106081, + "num_input_tokens_seen": 70187536, + "step": 4286, + "train_runtime": 34827.832, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 2.598181818181818, + "grad_norm": 0.01858823373913765, + "learning_rate": 8.51011474693715e-05, + "loss": 0.012187170796096325, + "num_input_tokens_seen": 70203912, + "step": 4287, + "train_runtime": 34835.9446, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 2.5987878787878786, + "grad_norm": 0.007660167291760445, + "learning_rate": 8.509429864520891e-05, + "loss": 0.012933144345879555, + "num_input_tokens_seen": 70220288, + "step": 4288, + "train_runtime": 34844.0549, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 2.5993939393939396, + "grad_norm": 0.015724975615739822, + "learning_rate": 8.508744852297654e-05, + "loss": 0.011832614429295063, + "num_input_tokens_seen": 70236664, + "step": 4289, + "train_runtime": 34852.1654, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.6, + "grad_norm": 0.008818217553198338, + "learning_rate": 8.508059710292779e-05, + "loss": 0.011538914404809475, + "num_input_tokens_seen": 70253040, + "step": 4290, + "train_runtime": 34860.2768, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.6006060606060606, + "grad_norm": 0.025440014898777008, + "learning_rate": 8.507374438531607e-05, + "loss": 0.01380870770663023, + "num_input_tokens_seen": 70269416, + "step": 4291, + "train_runtime": 34868.3931, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.601212121212121, + "grad_norm": 0.0081210657954216, + "learning_rate": 8.506689037039485e-05, + "loss": 0.01100747101008892, + "num_input_tokens_seen": 70285792, + "step": 4292, + "train_runtime": 34876.5029, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.6018181818181816, + "grad_norm": 0.00411269161850214, + "learning_rate": 8.506003505841764e-05, + "loss": 0.013029702939093113, + "num_input_tokens_seen": 70302168, + "step": 4293, + "train_runtime": 34884.6115, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 2.6024242424242425, + "grad_norm": 0.007913190871477127, + "learning_rate": 8.5053178449638e-05, + "loss": 0.011994820088148117, + "num_input_tokens_seen": 70318544, + "step": 4294, + "train_runtime": 34892.7325, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.603030303030303, + "grad_norm": 0.005812303628772497, + "learning_rate": 8.504632054430956e-05, + "loss": 0.011586738750338554, + "num_input_tokens_seen": 70334920, + "step": 4295, + "train_runtime": 34900.848, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.6036363636363635, + "grad_norm": 0.016979441046714783, + "learning_rate": 8.503946134268596e-05, + "loss": 0.011731847189366817, + "num_input_tokens_seen": 70351296, + "step": 4296, + "train_runtime": 34908.9584, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.6042424242424245, + "grad_norm": 0.004646490328013897, + "learning_rate": 8.503260084502094e-05, + "loss": 0.012206630781292915, + "num_input_tokens_seen": 70367672, + "step": 4297, + "train_runtime": 34917.0702, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.604848484848485, + "grad_norm": 0.00871951226145029, + "learning_rate": 8.502573905156823e-05, + "loss": 0.011980916373431683, + "num_input_tokens_seen": 70384048, + "step": 4298, + "train_runtime": 34925.1816, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.6054545454545455, + "grad_norm": 0.00996157992631197, + "learning_rate": 8.501887596258165e-05, + "loss": 0.012021244503557682, + "num_input_tokens_seen": 70400424, + "step": 4299, + "train_runtime": 34933.2902, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.606060606060606, + "grad_norm": 0.014308267273008823, + "learning_rate": 8.501201157831504e-05, + "loss": 0.015138473361730576, + "num_input_tokens_seen": 70416800, + "step": 4300, + "train_runtime": 34941.3982, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.6066666666666665, + "grad_norm": 0.008140078745782375, + "learning_rate": 8.50051458990223e-05, + "loss": 0.013001412153244019, + "num_input_tokens_seen": 70433176, + "step": 4301, + "train_runtime": 34950.3094, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 2.6072727272727274, + "grad_norm": 0.00812485720962286, + "learning_rate": 8.499827892495739e-05, + "loss": 0.011935997754335403, + "num_input_tokens_seen": 70449552, + "step": 4302, + "train_runtime": 34958.4207, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 2.607878787878788, + "grad_norm": 0.0021026732865720987, + "learning_rate": 8.499141065637429e-05, + "loss": 0.01220116950571537, + "num_input_tokens_seen": 70465928, + "step": 4303, + "train_runtime": 34966.5323, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 2.6084848484848484, + "grad_norm": 0.0090709263458848, + "learning_rate": 8.498454109352707e-05, + "loss": 0.011682862415909767, + "num_input_tokens_seen": 70482304, + "step": 4304, + "train_runtime": 34974.6412, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 2.6090909090909093, + "grad_norm": 0.002561000408604741, + "learning_rate": 8.497767023666978e-05, + "loss": 0.011235254816710949, + "num_input_tokens_seen": 70498680, + "step": 4305, + "train_runtime": 34982.7535, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 2.60969696969697, + "grad_norm": 0.008073585107922554, + "learning_rate": 8.497079808605659e-05, + "loss": 0.010844714939594269, + "num_input_tokens_seen": 70515056, + "step": 4306, + "train_runtime": 34990.8642, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 2.6103030303030303, + "grad_norm": 0.013203009963035583, + "learning_rate": 8.49639246419417e-05, + "loss": 0.012161492370069027, + "num_input_tokens_seen": 70531432, + "step": 4307, + "train_runtime": 34998.9804, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 2.610909090909091, + "grad_norm": 0.010954346507787704, + "learning_rate": 8.495704990457931e-05, + "loss": 0.011865230277180672, + "num_input_tokens_seen": 70547808, + "step": 4308, + "train_runtime": 35007.0918, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 2.6115151515151513, + "grad_norm": 0.006475602276623249, + "learning_rate": 8.495017387422371e-05, + "loss": 0.012148110195994377, + "num_input_tokens_seen": 70564184, + "step": 4309, + "train_runtime": 35015.2052, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 2.6121212121212123, + "grad_norm": 0.004678139928728342, + "learning_rate": 8.494329655112926e-05, + "loss": 0.013450360856950283, + "num_input_tokens_seen": 70580560, + "step": 4310, + "train_runtime": 35023.3158, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 2.612727272727273, + "grad_norm": 0.0053882477805018425, + "learning_rate": 8.493641793555032e-05, + "loss": 0.01208103820681572, + "num_input_tokens_seen": 70596936, + "step": 4311, + "train_runtime": 35031.4317, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 2.6133333333333333, + "grad_norm": 0.010673929005861282, + "learning_rate": 8.492953802774131e-05, + "loss": 0.013441650196909904, + "num_input_tokens_seen": 70613312, + "step": 4312, + "train_runtime": 35039.55, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 2.613939393939394, + "grad_norm": 0.009238391183316708, + "learning_rate": 8.49226568279567e-05, + "loss": 0.01362934336066246, + "num_input_tokens_seen": 70629688, + "step": 4313, + "train_runtime": 35047.6658, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 2.6145454545454543, + "grad_norm": 0.009982015937566757, + "learning_rate": 8.491577433645102e-05, + "loss": 0.013081942684948444, + "num_input_tokens_seen": 70646064, + "step": 4314, + "train_runtime": 35055.7803, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 2.6151515151515152, + "grad_norm": 0.006326040253043175, + "learning_rate": 8.490889055347887e-05, + "loss": 0.012055287137627602, + "num_input_tokens_seen": 70662440, + "step": 4315, + "train_runtime": 35063.8958, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 2.6157575757575757, + "grad_norm": 0.006414179224520922, + "learning_rate": 8.490200547929481e-05, + "loss": 0.011723064817488194, + "num_input_tokens_seen": 70678816, + "step": 4316, + "train_runtime": 35072.0101, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 2.6163636363636362, + "grad_norm": 0.006143802776932716, + "learning_rate": 8.489511911415356e-05, + "loss": 0.011870152316987514, + "num_input_tokens_seen": 70695192, + "step": 4317, + "train_runtime": 35080.1222, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 2.616969696969697, + "grad_norm": 0.08147447556257248, + "learning_rate": 8.48882314583098e-05, + "loss": 0.013313976116478443, + "num_input_tokens_seen": 70711568, + "step": 4318, + "train_runtime": 35088.2343, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 2.6175757575757577, + "grad_norm": 0.006835119798779488, + "learning_rate": 8.48813425120183e-05, + "loss": 0.012407150119543076, + "num_input_tokens_seen": 70727944, + "step": 4319, + "train_runtime": 35096.346, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 2.618181818181818, + "grad_norm": 0.010249876417219639, + "learning_rate": 8.487445227553387e-05, + "loss": 0.011905526742339134, + "num_input_tokens_seen": 70744320, + "step": 4320, + "train_runtime": 35104.4572, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 2.6187878787878787, + "grad_norm": 0.005235610529780388, + "learning_rate": 8.486756074911137e-05, + "loss": 0.010719122365117073, + "num_input_tokens_seen": 70760696, + "step": 4321, + "train_runtime": 35112.5679, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 2.619393939393939, + "grad_norm": 0.010949133895337582, + "learning_rate": 8.486066793300568e-05, + "loss": 0.014093095436692238, + "num_input_tokens_seen": 70777072, + "step": 4322, + "train_runtime": 35120.6802, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 2.62, + "grad_norm": 0.006062129978090525, + "learning_rate": 8.48537738274718e-05, + "loss": 0.012335565872490406, + "num_input_tokens_seen": 70793448, + "step": 4323, + "train_runtime": 35128.7954, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 2.6206060606060606, + "grad_norm": 0.009427910670638084, + "learning_rate": 8.484687843276469e-05, + "loss": 0.011834895238280296, + "num_input_tokens_seen": 70809824, + "step": 4324, + "train_runtime": 35136.9073, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 2.621212121212121, + "grad_norm": 0.0106477290391922, + "learning_rate": 8.483998174913939e-05, + "loss": 0.012269280850887299, + "num_input_tokens_seen": 70826200, + "step": 4325, + "train_runtime": 35145.02, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 2.621818181818182, + "grad_norm": 0.007401573471724987, + "learning_rate": 8.483308377685104e-05, + "loss": 0.011649166233837605, + "num_input_tokens_seen": 70842576, + "step": 4326, + "train_runtime": 35153.1347, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 2.6224242424242425, + "grad_norm": 0.006271129474043846, + "learning_rate": 8.482618451615473e-05, + "loss": 0.012360481545329094, + "num_input_tokens_seen": 70858952, + "step": 4327, + "train_runtime": 35161.2517, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 2.623030303030303, + "grad_norm": 0.011400998570024967, + "learning_rate": 8.48192839673057e-05, + "loss": 0.012946637347340584, + "num_input_tokens_seen": 70875328, + "step": 4328, + "train_runtime": 35169.3628, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 2.6236363636363635, + "grad_norm": 0.008684597909450531, + "learning_rate": 8.481238213055913e-05, + "loss": 0.012262959964573383, + "num_input_tokens_seen": 70891704, + "step": 4329, + "train_runtime": 35177.4745, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 2.624242424242424, + "grad_norm": 0.006013798993080854, + "learning_rate": 8.480547900617038e-05, + "loss": 0.011853741481900215, + "num_input_tokens_seen": 70908080, + "step": 4330, + "train_runtime": 35185.5852, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 2.624848484848485, + "grad_norm": 0.011710184626281261, + "learning_rate": 8.479857459439471e-05, + "loss": 0.012230598367750645, + "num_input_tokens_seen": 70924456, + "step": 4331, + "train_runtime": 35193.7013, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 2.6254545454545455, + "grad_norm": 0.005434371531009674, + "learning_rate": 8.479166889548755e-05, + "loss": 0.011310860514640808, + "num_input_tokens_seen": 70940832, + "step": 4332, + "train_runtime": 35201.8148, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 2.626060606060606, + "grad_norm": 0.010889391414821148, + "learning_rate": 8.478476190970431e-05, + "loss": 0.01314119715243578, + "num_input_tokens_seen": 70957208, + "step": 4333, + "train_runtime": 35209.9323, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 2.626666666666667, + "grad_norm": 0.01051140297204256, + "learning_rate": 8.477785363730046e-05, + "loss": 0.012584518641233444, + "num_input_tokens_seen": 70973584, + "step": 4334, + "train_runtime": 35218.0473, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 2.6272727272727274, + "grad_norm": 0.009582994505763054, + "learning_rate": 8.477094407853153e-05, + "loss": 0.012380758300423622, + "num_input_tokens_seen": 70989960, + "step": 4335, + "train_runtime": 35226.16, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 2.627878787878788, + "grad_norm": 0.024512801319360733, + "learning_rate": 8.47640332336531e-05, + "loss": 0.013403578661382198, + "num_input_tokens_seen": 71006336, + "step": 4336, + "train_runtime": 35234.2721, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 2.6284848484848484, + "grad_norm": 0.007025278173387051, + "learning_rate": 8.475712110292078e-05, + "loss": 0.011801440268754959, + "num_input_tokens_seen": 71022712, + "step": 4337, + "train_runtime": 35242.3881, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 2.629090909090909, + "grad_norm": 0.009816234931349754, + "learning_rate": 8.475020768659023e-05, + "loss": 0.01296780165284872, + "num_input_tokens_seen": 71039088, + "step": 4338, + "train_runtime": 35250.5053, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 2.62969696969697, + "grad_norm": 0.010255989618599415, + "learning_rate": 8.474329298491717e-05, + "loss": 0.012685622088611126, + "num_input_tokens_seen": 71055464, + "step": 4339, + "train_runtime": 35258.616, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 2.6303030303030304, + "grad_norm": 0.010021884925663471, + "learning_rate": 8.473637699815737e-05, + "loss": 0.011408963240683079, + "num_input_tokens_seen": 71071840, + "step": 4340, + "train_runtime": 35266.7325, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 2.630909090909091, + "grad_norm": 0.005544560961425304, + "learning_rate": 8.472945972656662e-05, + "loss": 0.011756407096982002, + "num_input_tokens_seen": 71088216, + "step": 4341, + "train_runtime": 35274.8414, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 2.6315151515151514, + "grad_norm": 0.012866887263953686, + "learning_rate": 8.472254117040079e-05, + "loss": 0.011519216932356358, + "num_input_tokens_seen": 71104592, + "step": 4342, + "train_runtime": 35282.9524, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 2.632121212121212, + "grad_norm": 0.01217510737478733, + "learning_rate": 8.471562132991579e-05, + "loss": 0.012491295114159584, + "num_input_tokens_seen": 71120968, + "step": 4343, + "train_runtime": 35291.065, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 2.632727272727273, + "grad_norm": 0.008041740395128727, + "learning_rate": 8.470870020536757e-05, + "loss": 0.011530266143381596, + "num_input_tokens_seen": 71137344, + "step": 4344, + "train_runtime": 35299.1776, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 2.6333333333333333, + "grad_norm": 0.01783039979636669, + "learning_rate": 8.47017777970121e-05, + "loss": 0.013242207467556, + "num_input_tokens_seen": 71153720, + "step": 4345, + "train_runtime": 35307.2908, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 2.633939393939394, + "grad_norm": 0.008601765148341656, + "learning_rate": 8.469485410510545e-05, + "loss": 0.012506759725511074, + "num_input_tokens_seen": 71170096, + "step": 4346, + "train_runtime": 35315.4079, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 2.6345454545454547, + "grad_norm": 0.019806895405054092, + "learning_rate": 8.468792912990374e-05, + "loss": 0.01206189300864935, + "num_input_tokens_seen": 71186472, + "step": 4347, + "train_runtime": 35323.522, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 2.6351515151515152, + "grad_norm": 0.055740099400281906, + "learning_rate": 8.468100287166306e-05, + "loss": 0.01260375790297985, + "num_input_tokens_seen": 71202848, + "step": 4348, + "train_runtime": 35331.6379, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 2.6357575757575757, + "grad_norm": 0.006394113413989544, + "learning_rate": 8.467407533063962e-05, + "loss": 0.011575368233025074, + "num_input_tokens_seen": 71219224, + "step": 4349, + "train_runtime": 35339.7564, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 2.6363636363636362, + "grad_norm": 0.0039197527803480625, + "learning_rate": 8.466714650708964e-05, + "loss": 0.011557562276721, + "num_input_tokens_seen": 71235600, + "step": 4350, + "train_runtime": 35347.8725, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 2.6369696969696967, + "grad_norm": 0.00674759317189455, + "learning_rate": 8.466021640126945e-05, + "loss": 0.011535733938217163, + "num_input_tokens_seen": 71251976, + "step": 4351, + "train_runtime": 35355.9917, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 2.6375757575757577, + "grad_norm": 0.00842457264661789, + "learning_rate": 8.465328501343534e-05, + "loss": 0.012026438489556313, + "num_input_tokens_seen": 71268352, + "step": 4352, + "train_runtime": 35364.1077, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.638181818181818, + "grad_norm": 0.005042821168899536, + "learning_rate": 8.464635234384373e-05, + "loss": 0.011582613922655582, + "num_input_tokens_seen": 71284728, + "step": 4353, + "train_runtime": 35372.232, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.6387878787878787, + "grad_norm": 0.009325952269136906, + "learning_rate": 8.463941839275097e-05, + "loss": 0.012684072367846966, + "num_input_tokens_seen": 71301104, + "step": 4354, + "train_runtime": 35380.3422, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.6393939393939396, + "grad_norm": 0.021494105458259583, + "learning_rate": 8.463248316041363e-05, + "loss": 0.01256593968719244, + "num_input_tokens_seen": 71317480, + "step": 4355, + "train_runtime": 35388.4577, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.64, + "grad_norm": 0.01312971767038107, + "learning_rate": 8.462554664708815e-05, + "loss": 0.014445781707763672, + "num_input_tokens_seen": 71333856, + "step": 4356, + "train_runtime": 35396.5712, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.6406060606060606, + "grad_norm": 0.004671615082770586, + "learning_rate": 8.461860885303114e-05, + "loss": 0.01263829879462719, + "num_input_tokens_seen": 71350232, + "step": 4357, + "train_runtime": 35404.6885, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.641212121212121, + "grad_norm": 0.009678158909082413, + "learning_rate": 8.46116697784992e-05, + "loss": 0.011891501955688, + "num_input_tokens_seen": 71366608, + "step": 4358, + "train_runtime": 35412.8011, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 2.6418181818181816, + "grad_norm": 0.024872975423932076, + "learning_rate": 8.460472942374901e-05, + "loss": 0.012431523762643337, + "num_input_tokens_seen": 71382984, + "step": 4359, + "train_runtime": 35420.915, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.6424242424242426, + "grad_norm": 0.0038883527740836143, + "learning_rate": 8.459778778903727e-05, + "loss": 0.011545796878635883, + "num_input_tokens_seen": 71399360, + "step": 4360, + "train_runtime": 35429.0326, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.643030303030303, + "grad_norm": 0.00714039197191596, + "learning_rate": 8.459084487462072e-05, + "loss": 0.0127326101064682, + "num_input_tokens_seen": 71415736, + "step": 4361, + "train_runtime": 35437.145, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.6436363636363636, + "grad_norm": 0.005958594847470522, + "learning_rate": 8.458390068075617e-05, + "loss": 0.012266283854842186, + "num_input_tokens_seen": 71432112, + "step": 4362, + "train_runtime": 35445.2597, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.6442424242424245, + "grad_norm": 0.001327206613495946, + "learning_rate": 8.45769552077005e-05, + "loss": 0.011368953622877598, + "num_input_tokens_seen": 71448488, + "step": 4363, + "train_runtime": 35453.3768, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.644848484848485, + "grad_norm": 0.011309816502034664, + "learning_rate": 8.457000845571059e-05, + "loss": 0.01137612760066986, + "num_input_tokens_seen": 71464864, + "step": 4364, + "train_runtime": 35461.4897, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.6454545454545455, + "grad_norm": 0.008746663108468056, + "learning_rate": 8.456306042504341e-05, + "loss": 0.012663026340305805, + "num_input_tokens_seen": 71481240, + "step": 4365, + "train_runtime": 35469.6024, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.646060606060606, + "grad_norm": 0.024537766352295876, + "learning_rate": 8.455611111595591e-05, + "loss": 0.012257825583219528, + "num_input_tokens_seen": 71497616, + "step": 4366, + "train_runtime": 35477.7121, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.6466666666666665, + "grad_norm": 0.003678740933537483, + "learning_rate": 8.454916052870516e-05, + "loss": 0.011355373077094555, + "num_input_tokens_seen": 71513992, + "step": 4367, + "train_runtime": 35485.823, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.6472727272727274, + "grad_norm": 0.01966022700071335, + "learning_rate": 8.454220866354825e-05, + "loss": 0.013168991543352604, + "num_input_tokens_seen": 71530368, + "step": 4368, + "train_runtime": 35493.9417, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.647878787878788, + "grad_norm": 0.022551169618964195, + "learning_rate": 8.453525552074229e-05, + "loss": 0.013159026391804218, + "num_input_tokens_seen": 71546744, + "step": 4369, + "train_runtime": 35502.0564, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.6484848484848484, + "grad_norm": 0.009290799498558044, + "learning_rate": 8.452830110054451e-05, + "loss": 0.012852588668465614, + "num_input_tokens_seen": 71563120, + "step": 4370, + "train_runtime": 35510.1647, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.649090909090909, + "grad_norm": 0.010224821045994759, + "learning_rate": 8.452134540321208e-05, + "loss": 0.012725301086902618, + "num_input_tokens_seen": 71579496, + "step": 4371, + "train_runtime": 35518.2767, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.6496969696969694, + "grad_norm": 0.006368674803525209, + "learning_rate": 8.451438842900234e-05, + "loss": 0.011845732107758522, + "num_input_tokens_seen": 71595872, + "step": 4372, + "train_runtime": 35526.3904, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.6503030303030304, + "grad_norm": 0.007049901410937309, + "learning_rate": 8.450743017817257e-05, + "loss": 0.011245891451835632, + "num_input_tokens_seen": 71612248, + "step": 4373, + "train_runtime": 35534.5051, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.650909090909091, + "grad_norm": 0.00675829965621233, + "learning_rate": 8.450047065098016e-05, + "loss": 0.011235181242227554, + "num_input_tokens_seen": 71628624, + "step": 4374, + "train_runtime": 35542.6153, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.6515151515151514, + "grad_norm": 0.00808720849454403, + "learning_rate": 8.449350984768252e-05, + "loss": 0.012145287357270718, + "num_input_tokens_seen": 71645000, + "step": 4375, + "train_runtime": 35550.734, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.6521212121212123, + "grad_norm": 0.006317794788628817, + "learning_rate": 8.448654776853714e-05, + "loss": 0.011347971856594086, + "num_input_tokens_seen": 71661376, + "step": 4376, + "train_runtime": 35558.8443, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.652727272727273, + "grad_norm": 0.00903842318803072, + "learning_rate": 8.44795844138015e-05, + "loss": 0.011753967963159084, + "num_input_tokens_seen": 71677752, + "step": 4377, + "train_runtime": 35566.9601, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.6533333333333333, + "grad_norm": 0.008810496889054775, + "learning_rate": 8.447261978373319e-05, + "loss": 0.011386499740183353, + "num_input_tokens_seen": 71694128, + "step": 4378, + "train_runtime": 35575.0699, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.653939393939394, + "grad_norm": 0.010316540487110615, + "learning_rate": 8.446565387858981e-05, + "loss": 0.012241136282682419, + "num_input_tokens_seen": 71710504, + "step": 4379, + "train_runtime": 35583.1874, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.6545454545454543, + "grad_norm": 0.020265327766537666, + "learning_rate": 8.445868669862901e-05, + "loss": 0.01350229512900114, + "num_input_tokens_seen": 71726880, + "step": 4380, + "train_runtime": 35591.3023, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.6551515151515153, + "grad_norm": 0.028402065858244896, + "learning_rate": 8.445171824410848e-05, + "loss": 0.012432043440639973, + "num_input_tokens_seen": 71743256, + "step": 4381, + "train_runtime": 35599.4143, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 2.6557575757575758, + "grad_norm": 0.016863582655787468, + "learning_rate": 8.444474851528601e-05, + "loss": 0.013000136241316795, + "num_input_tokens_seen": 71759632, + "step": 4382, + "train_runtime": 35607.5315, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.6563636363636363, + "grad_norm": 0.0064758602529764175, + "learning_rate": 8.443777751241936e-05, + "loss": 0.011766228824853897, + "num_input_tokens_seen": 71776008, + "step": 4383, + "train_runtime": 35615.6467, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.656969696969697, + "grad_norm": 0.011727205477654934, + "learning_rate": 8.443080523576639e-05, + "loss": 0.011181896552443504, + "num_input_tokens_seen": 71792384, + "step": 4384, + "train_runtime": 35623.7599, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 2.6575757575757577, + "grad_norm": 0.004063058644533157, + "learning_rate": 8.442383168558496e-05, + "loss": 0.01103916484862566, + "num_input_tokens_seen": 71808760, + "step": 4385, + "train_runtime": 35631.8754, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.658181818181818, + "grad_norm": 0.01011338084936142, + "learning_rate": 8.441685686213306e-05, + "loss": 0.012588326819241047, + "num_input_tokens_seen": 71825136, + "step": 4386, + "train_runtime": 35639.9905, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.6587878787878787, + "grad_norm": 0.011045512743294239, + "learning_rate": 8.440988076566862e-05, + "loss": 0.011369702406227589, + "num_input_tokens_seen": 71841512, + "step": 4387, + "train_runtime": 35648.1025, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.659393939393939, + "grad_norm": 0.0074189272709190845, + "learning_rate": 8.440290339644972e-05, + "loss": 0.011855355463922024, + "num_input_tokens_seen": 71857888, + "step": 4388, + "train_runtime": 35656.2136, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.66, + "grad_norm": 0.011250571347773075, + "learning_rate": 8.439592475473443e-05, + "loss": 0.011890600435435772, + "num_input_tokens_seen": 71874264, + "step": 4389, + "train_runtime": 35664.3334, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.6606060606060606, + "grad_norm": 0.009766768664121628, + "learning_rate": 8.438894484078086e-05, + "loss": 0.012095589190721512, + "num_input_tokens_seen": 71890640, + "step": 4390, + "train_runtime": 35672.4534, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.661212121212121, + "grad_norm": 0.013576023280620575, + "learning_rate": 8.438196365484718e-05, + "loss": 0.01422170177102089, + "num_input_tokens_seen": 71907016, + "step": 4391, + "train_runtime": 35680.5665, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 2.661818181818182, + "grad_norm": 0.01201293058693409, + "learning_rate": 8.437498119719163e-05, + "loss": 0.012109003029763699, + "num_input_tokens_seen": 71923392, + "step": 4392, + "train_runtime": 35688.6764, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 2.6624242424242426, + "grad_norm": 0.005783925764262676, + "learning_rate": 8.436799746807245e-05, + "loss": 0.01241049263626337, + "num_input_tokens_seen": 71939768, + "step": 4393, + "train_runtime": 35696.7861, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 2.663030303030303, + "grad_norm": 0.007553341798484325, + "learning_rate": 8.436101246774799e-05, + "loss": 0.012073860503733158, + "num_input_tokens_seen": 71956144, + "step": 4394, + "train_runtime": 35704.902, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.6636363636363636, + "grad_norm": 0.012787474319338799, + "learning_rate": 8.435402619647659e-05, + "loss": 0.012013544328510761, + "num_input_tokens_seen": 71972520, + "step": 4395, + "train_runtime": 35713.0171, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.664242424242424, + "grad_norm": 0.012466519139707088, + "learning_rate": 8.434703865451665e-05, + "loss": 0.01138471718877554, + "num_input_tokens_seen": 71988896, + "step": 4396, + "train_runtime": 35721.1322, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.664848484848485, + "grad_norm": 0.004472682252526283, + "learning_rate": 8.434004984212665e-05, + "loss": 0.011566979810595512, + "num_input_tokens_seen": 72005272, + "step": 4397, + "train_runtime": 35729.2456, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.6654545454545455, + "grad_norm": 0.009981125593185425, + "learning_rate": 8.433305975956507e-05, + "loss": 0.012517772614955902, + "num_input_tokens_seen": 72021648, + "step": 4398, + "train_runtime": 35737.3602, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 2.666060606060606, + "grad_norm": 0.005660703405737877, + "learning_rate": 8.43260684070905e-05, + "loss": 0.012004796415567398, + "num_input_tokens_seen": 72038024, + "step": 4399, + "train_runtime": 35745.4698, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 0.009055431932210922, + "learning_rate": 8.431907578496148e-05, + "loss": 0.011670810170471668, + "num_input_tokens_seen": 72054400, + "step": 4400, + "train_runtime": 35753.5851, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 2.667272727272727, + "grad_norm": 0.00939918216317892, + "learning_rate": 8.43120818934367e-05, + "loss": 0.013822179287672043, + "num_input_tokens_seen": 72070776, + "step": 4401, + "train_runtime": 35762.6046, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 2.667878787878788, + "grad_norm": 0.010527390986680984, + "learning_rate": 8.43050867327748e-05, + "loss": 0.011938024312257767, + "num_input_tokens_seen": 72087152, + "step": 4402, + "train_runtime": 35770.7166, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 2.6684848484848485, + "grad_norm": 0.006208324804902077, + "learning_rate": 8.429809030323456e-05, + "loss": 0.01352614350616932, + "num_input_tokens_seen": 72103528, + "step": 4403, + "train_runtime": 35778.8311, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 2.669090909090909, + "grad_norm": 0.009397996589541435, + "learning_rate": 8.429109260507476e-05, + "loss": 0.011227701790630817, + "num_input_tokens_seen": 72119904, + "step": 4404, + "train_runtime": 35786.9429, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 2.66969696969697, + "grad_norm": 0.006822841241955757, + "learning_rate": 8.428409363855423e-05, + "loss": 0.011959469877183437, + "num_input_tokens_seen": 72136280, + "step": 4405, + "train_runtime": 35795.0559, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 2.6703030303030304, + "grad_norm": 0.007352875079959631, + "learning_rate": 8.427709340393181e-05, + "loss": 0.01244130078703165, + "num_input_tokens_seen": 72152656, + "step": 4406, + "train_runtime": 35803.1645, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 2.670909090909091, + "grad_norm": 0.004418450873345137, + "learning_rate": 8.427009190146649e-05, + "loss": 0.012306888587772846, + "num_input_tokens_seen": 72169032, + "step": 4407, + "train_runtime": 35811.2761, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 2.6715151515151514, + "grad_norm": 0.007639321964234114, + "learning_rate": 8.426308913141719e-05, + "loss": 0.011339564807713032, + "num_input_tokens_seen": 72185408, + "step": 4408, + "train_runtime": 35819.3855, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 2.672121212121212, + "grad_norm": 0.01279147807508707, + "learning_rate": 8.425608509404296e-05, + "loss": 0.012118877843022346, + "num_input_tokens_seen": 72201784, + "step": 4409, + "train_runtime": 35827.4973, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 2.672727272727273, + "grad_norm": 0.004805168602615595, + "learning_rate": 8.424907978960285e-05, + "loss": 0.011280016973614693, + "num_input_tokens_seen": 72218160, + "step": 4410, + "train_runtime": 35835.6113, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 2.6733333333333333, + "grad_norm": 0.0007007545209489763, + "learning_rate": 8.424207321835598e-05, + "loss": 0.011943530291318893, + "num_input_tokens_seen": 72234536, + "step": 4411, + "train_runtime": 35843.7313, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 2.673939393939394, + "grad_norm": 0.00240122782997787, + "learning_rate": 8.42350653805615e-05, + "loss": 0.012038838118314743, + "num_input_tokens_seen": 72250912, + "step": 4412, + "train_runtime": 35851.8506, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 2.674545454545455, + "grad_norm": 0.006492991931736469, + "learning_rate": 8.42280562764786e-05, + "loss": 0.013071507215499878, + "num_input_tokens_seen": 72267288, + "step": 4413, + "train_runtime": 35859.9625, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 2.6751515151515153, + "grad_norm": 0.003527931170538068, + "learning_rate": 8.422104590636659e-05, + "loss": 0.012185542844235897, + "num_input_tokens_seen": 72283664, + "step": 4414, + "train_runtime": 35868.0759, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 2.675757575757576, + "grad_norm": 0.005685761570930481, + "learning_rate": 8.421403427048472e-05, + "loss": 0.011341195553541183, + "num_input_tokens_seen": 72300040, + "step": 4415, + "train_runtime": 35876.1903, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 2.6763636363636363, + "grad_norm": 0.006813893094658852, + "learning_rate": 8.420702136909234e-05, + "loss": 0.01166342943906784, + "num_input_tokens_seen": 72316416, + "step": 4416, + "train_runtime": 35884.3042, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 2.6769696969696968, + "grad_norm": 0.012871243990957737, + "learning_rate": 8.420000720244886e-05, + "loss": 0.01290359441190958, + "num_input_tokens_seen": 72332792, + "step": 4417, + "train_runtime": 35892.4215, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 2.6775757575757577, + "grad_norm": 0.00795845128595829, + "learning_rate": 8.419299177081372e-05, + "loss": 0.01156303845345974, + "num_input_tokens_seen": 72349168, + "step": 4418, + "train_runtime": 35900.5329, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 2.678181818181818, + "grad_norm": 0.005334604065865278, + "learning_rate": 8.41859750744464e-05, + "loss": 0.011066998355090618, + "num_input_tokens_seen": 72365544, + "step": 4419, + "train_runtime": 35908.6476, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 2.6787878787878787, + "grad_norm": 0.0026235533878207207, + "learning_rate": 8.417895711360643e-05, + "loss": 0.012075402773916721, + "num_input_tokens_seen": 72381920, + "step": 4420, + "train_runtime": 35916.7606, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 2.6793939393939397, + "grad_norm": 0.0013224153080955148, + "learning_rate": 8.41719378885534e-05, + "loss": 0.011157059110701084, + "num_input_tokens_seen": 72398296, + "step": 4421, + "train_runtime": 35924.8721, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 2.68, + "grad_norm": 0.0028534450102597475, + "learning_rate": 8.416491739954694e-05, + "loss": 0.011411381885409355, + "num_input_tokens_seen": 72414672, + "step": 4422, + "train_runtime": 35932.9829, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 2.6806060606060607, + "grad_norm": 0.007303812075406313, + "learning_rate": 8.415789564684673e-05, + "loss": 0.011504009366035461, + "num_input_tokens_seen": 72431048, + "step": 4423, + "train_runtime": 35941.0982, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 2.681212121212121, + "grad_norm": 0.023628417402505875, + "learning_rate": 8.415087263071247e-05, + "loss": 0.012358436360955238, + "num_input_tokens_seen": 72447424, + "step": 4424, + "train_runtime": 35949.2125, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 2.6818181818181817, + "grad_norm": 0.03140052407979965, + "learning_rate": 8.414384835140392e-05, + "loss": 0.013035248965024948, + "num_input_tokens_seen": 72463800, + "step": 4425, + "train_runtime": 35957.3329, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 2.6824242424242426, + "grad_norm": 0.003144563641399145, + "learning_rate": 8.413682280918093e-05, + "loss": 0.012425084598362446, + "num_input_tokens_seen": 72480176, + "step": 4426, + "train_runtime": 35965.4491, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 2.683030303030303, + "grad_norm": 0.012414596043527126, + "learning_rate": 8.412979600430333e-05, + "loss": 0.012512211687862873, + "num_input_tokens_seen": 72496552, + "step": 4427, + "train_runtime": 35973.5633, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 2.6836363636363636, + "grad_norm": 0.008683600462973118, + "learning_rate": 8.412276793703106e-05, + "loss": 0.011885604821145535, + "num_input_tokens_seen": 72512928, + "step": 4428, + "train_runtime": 35981.6754, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.684242424242424, + "grad_norm": 0.014704830013215542, + "learning_rate": 8.411573860762404e-05, + "loss": 0.011572681367397308, + "num_input_tokens_seen": 72529304, + "step": 4429, + "train_runtime": 35989.7896, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.6848484848484846, + "grad_norm": 0.012880926951766014, + "learning_rate": 8.410870801634229e-05, + "loss": 0.011459063738584518, + "num_input_tokens_seen": 72545680, + "step": 4430, + "train_runtime": 35997.9024, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.6854545454545455, + "grad_norm": 0.011970085091888905, + "learning_rate": 8.410167616344586e-05, + "loss": 0.013594500720500946, + "num_input_tokens_seen": 72562056, + "step": 4431, + "train_runtime": 36006.0148, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.686060606060606, + "grad_norm": 0.011575406417250633, + "learning_rate": 8.409464304919484e-05, + "loss": 0.012265223078429699, + "num_input_tokens_seen": 72578432, + "step": 4432, + "train_runtime": 36014.1332, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.6866666666666665, + "grad_norm": 0.007652411237359047, + "learning_rate": 8.408760867384936e-05, + "loss": 0.012334804981946945, + "num_input_tokens_seen": 72594808, + "step": 4433, + "train_runtime": 36022.2445, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 2.6872727272727275, + "grad_norm": 0.006646799389272928, + "learning_rate": 8.408057303766961e-05, + "loss": 0.012621132656931877, + "num_input_tokens_seen": 72611184, + "step": 4434, + "train_runtime": 36030.3597, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.687878787878788, + "grad_norm": 0.006309642922133207, + "learning_rate": 8.407353614091585e-05, + "loss": 0.012295114807784557, + "num_input_tokens_seen": 72627560, + "step": 4435, + "train_runtime": 36038.4757, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.6884848484848485, + "grad_norm": 0.0021731348242610693, + "learning_rate": 8.406649798384834e-05, + "loss": 0.010900859721004963, + "num_input_tokens_seen": 72643936, + "step": 4436, + "train_runtime": 36046.5897, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.689090909090909, + "grad_norm": 0.005132134538143873, + "learning_rate": 8.405945856672739e-05, + "loss": 0.010258615016937256, + "num_input_tokens_seen": 72660312, + "step": 4437, + "train_runtime": 36054.7005, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.6896969696969695, + "grad_norm": 0.005098214838653803, + "learning_rate": 8.405241788981341e-05, + "loss": 0.012920193374156952, + "num_input_tokens_seen": 72676688, + "step": 4438, + "train_runtime": 36062.8132, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.6903030303030304, + "grad_norm": 0.006919191684573889, + "learning_rate": 8.404537595336681e-05, + "loss": 0.012241924181580544, + "num_input_tokens_seen": 72693064, + "step": 4439, + "train_runtime": 36070.9319, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.690909090909091, + "grad_norm": 0.007868586108088493, + "learning_rate": 8.403833275764805e-05, + "loss": 0.011545106768608093, + "num_input_tokens_seen": 72709440, + "step": 4440, + "train_runtime": 36079.0457, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.6915151515151514, + "grad_norm": 0.006030546501278877, + "learning_rate": 8.403128830291767e-05, + "loss": 0.011829491704702377, + "num_input_tokens_seen": 72725816, + "step": 4441, + "train_runtime": 36087.1633, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.6921212121212124, + "grad_norm": 0.007370266132056713, + "learning_rate": 8.402424258943618e-05, + "loss": 0.012624634429812431, + "num_input_tokens_seen": 72742192, + "step": 4442, + "train_runtime": 36095.2778, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.692727272727273, + "grad_norm": 0.006004734430462122, + "learning_rate": 8.401719561746422e-05, + "loss": 0.01227719895541668, + "num_input_tokens_seen": 72758568, + "step": 4443, + "train_runtime": 36103.3927, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.6933333333333334, + "grad_norm": 0.014376594685018063, + "learning_rate": 8.401014738726245e-05, + "loss": 0.013458449393510818, + "num_input_tokens_seen": 72774944, + "step": 4444, + "train_runtime": 36111.5057, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.693939393939394, + "grad_norm": 0.00856937374919653, + "learning_rate": 8.400309789909156e-05, + "loss": 0.010445252060890198, + "num_input_tokens_seen": 72791320, + "step": 4445, + "train_runtime": 36119.6222, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.6945454545454544, + "grad_norm": 0.007901309989392757, + "learning_rate": 8.399604715321227e-05, + "loss": 0.01286272332072258, + "num_input_tokens_seen": 72807696, + "step": 4446, + "train_runtime": 36127.7333, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.6951515151515153, + "grad_norm": 0.008734796196222305, + "learning_rate": 8.398899514988543e-05, + "loss": 0.012080827727913857, + "num_input_tokens_seen": 72824072, + "step": 4447, + "train_runtime": 36135.8454, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.695757575757576, + "grad_norm": 0.015319878235459328, + "learning_rate": 8.398194188937184e-05, + "loss": 0.012312346138060093, + "num_input_tokens_seen": 72840448, + "step": 4448, + "train_runtime": 36143.9594, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.6963636363636363, + "grad_norm": 0.005286949686706066, + "learning_rate": 8.39748873719324e-05, + "loss": 0.012197275646030903, + "num_input_tokens_seen": 72856824, + "step": 4449, + "train_runtime": 36152.0726, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.6969696969696972, + "grad_norm": 0.018315957859158516, + "learning_rate": 8.396783159782804e-05, + "loss": 0.01294033881276846, + "num_input_tokens_seen": 72873200, + "step": 4450, + "train_runtime": 36160.1847, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.6975757575757577, + "grad_norm": 0.006518305744975805, + "learning_rate": 8.396077456731974e-05, + "loss": 0.0118323415517807, + "num_input_tokens_seen": 72889576, + "step": 4451, + "train_runtime": 36168.2959, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.6981818181818182, + "grad_norm": 0.007139664608985186, + "learning_rate": 8.39537162806685e-05, + "loss": 0.012888872064650059, + "num_input_tokens_seen": 72905952, + "step": 4452, + "train_runtime": 36176.408, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.6987878787878787, + "grad_norm": 0.005414220504462719, + "learning_rate": 8.394665673813544e-05, + "loss": 0.010829141363501549, + "num_input_tokens_seen": 72922328, + "step": 4453, + "train_runtime": 36184.5219, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.6993939393939392, + "grad_norm": 0.003490846836939454, + "learning_rate": 8.393959593998166e-05, + "loss": 0.012149629183113575, + "num_input_tokens_seen": 72938704, + "step": 4454, + "train_runtime": 36192.6356, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.7, + "grad_norm": 0.005022817756980658, + "learning_rate": 8.393253388646831e-05, + "loss": 0.012500596232712269, + "num_input_tokens_seen": 72955080, + "step": 4455, + "train_runtime": 36200.7498, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.7006060606060607, + "grad_norm": 0.008135463111102581, + "learning_rate": 8.392547057785661e-05, + "loss": 0.013383036479353905, + "num_input_tokens_seen": 72971456, + "step": 4456, + "train_runtime": 36208.8684, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.701212121212121, + "grad_norm": 0.020837651565670967, + "learning_rate": 8.391840601440784e-05, + "loss": 0.012878560461103916, + "num_input_tokens_seen": 72987832, + "step": 4457, + "train_runtime": 36216.9844, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 2.7018181818181817, + "grad_norm": 0.00532251363620162, + "learning_rate": 8.391134019638326e-05, + "loss": 0.01171312015503645, + "num_input_tokens_seen": 73004208, + "step": 4458, + "train_runtime": 36225.098, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.702424242424242, + "grad_norm": 0.01003164891153574, + "learning_rate": 8.390427312404426e-05, + "loss": 0.011207787320017815, + "num_input_tokens_seen": 73020584, + "step": 4459, + "train_runtime": 36233.2104, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.703030303030303, + "grad_norm": 0.0221566092222929, + "learning_rate": 8.389720479765221e-05, + "loss": 0.014188327826559544, + "num_input_tokens_seen": 73036960, + "step": 4460, + "train_runtime": 36241.3353, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.7036363636363636, + "grad_norm": 0.012309076264500618, + "learning_rate": 8.389013521746857e-05, + "loss": 0.0123416967689991, + "num_input_tokens_seen": 73053336, + "step": 4461, + "train_runtime": 36249.447, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 2.704242424242424, + "grad_norm": 0.008402823470532894, + "learning_rate": 8.388306438375483e-05, + "loss": 0.01218133419752121, + "num_input_tokens_seen": 73069712, + "step": 4462, + "train_runtime": 36257.5609, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.704848484848485, + "grad_norm": 0.007609476801007986, + "learning_rate": 8.387599229677252e-05, + "loss": 0.011831454932689667, + "num_input_tokens_seen": 73086088, + "step": 4463, + "train_runtime": 36265.6733, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.7054545454545456, + "grad_norm": 0.00904910359531641, + "learning_rate": 8.386891895678323e-05, + "loss": 0.011985763907432556, + "num_input_tokens_seen": 73102464, + "step": 4464, + "train_runtime": 36273.7864, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.706060606060606, + "grad_norm": 0.005750224459916353, + "learning_rate": 8.386184436404859e-05, + "loss": 0.011789086274802685, + "num_input_tokens_seen": 73118840, + "step": 4465, + "train_runtime": 36281.8979, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.7066666666666666, + "grad_norm": 0.004379808437079191, + "learning_rate": 8.385476851883025e-05, + "loss": 0.01236353162676096, + "num_input_tokens_seen": 73135216, + "step": 4466, + "train_runtime": 36290.0083, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 2.707272727272727, + "grad_norm": 0.004893327597528696, + "learning_rate": 8.384769142138998e-05, + "loss": 0.010962733998894691, + "num_input_tokens_seen": 73151592, + "step": 4467, + "train_runtime": 36298.1208, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 2.707878787878788, + "grad_norm": 0.005924634635448456, + "learning_rate": 8.38406130719895e-05, + "loss": 0.010948587208986282, + "num_input_tokens_seen": 73167968, + "step": 4468, + "train_runtime": 36306.236, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 2.7084848484848485, + "grad_norm": 0.00669765193015337, + "learning_rate": 8.383353347089064e-05, + "loss": 0.011370482854545116, + "num_input_tokens_seen": 73184344, + "step": 4469, + "train_runtime": 36314.3469, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 2.709090909090909, + "grad_norm": 0.01210335735231638, + "learning_rate": 8.382645261835526e-05, + "loss": 0.011705320328474045, + "num_input_tokens_seen": 73200720, + "step": 4470, + "train_runtime": 36322.461, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.70969696969697, + "grad_norm": 0.008841666392982006, + "learning_rate": 8.38193705146453e-05, + "loss": 0.012350142002105713, + "num_input_tokens_seen": 73217096, + "step": 4471, + "train_runtime": 36330.5757, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.7103030303030304, + "grad_norm": 0.006763878278434277, + "learning_rate": 8.381228716002268e-05, + "loss": 0.012824160046875477, + "num_input_tokens_seen": 73233472, + "step": 4472, + "train_runtime": 36338.6855, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.710909090909091, + "grad_norm": 0.009388206526637077, + "learning_rate": 8.380520255474937e-05, + "loss": 0.012399101629853249, + "num_input_tokens_seen": 73249848, + "step": 4473, + "train_runtime": 36346.7972, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 2.7115151515151514, + "grad_norm": 0.005588878411799669, + "learning_rate": 8.379811669908749e-05, + "loss": 0.012563240714371204, + "num_input_tokens_seen": 73266224, + "step": 4474, + "train_runtime": 36354.9075, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.712121212121212, + "grad_norm": 0.0073500629514455795, + "learning_rate": 8.379102959329907e-05, + "loss": 0.012038810178637505, + "num_input_tokens_seen": 73282600, + "step": 4475, + "train_runtime": 36363.0231, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.712727272727273, + "grad_norm": 0.0054277996532619, + "learning_rate": 8.378394123764628e-05, + "loss": 0.012276766821742058, + "num_input_tokens_seen": 73298976, + "step": 4476, + "train_runtime": 36371.137, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 2.7133333333333334, + "grad_norm": 0.009593921713531017, + "learning_rate": 8.377685163239128e-05, + "loss": 0.011807446368038654, + "num_input_tokens_seen": 73315352, + "step": 4477, + "train_runtime": 36379.2506, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 2.713939393939394, + "grad_norm": 0.006743255071341991, + "learning_rate": 8.376976077779633e-05, + "loss": 0.012907275930047035, + "num_input_tokens_seen": 73331728, + "step": 4478, + "train_runtime": 36387.3656, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 2.714545454545455, + "grad_norm": 0.006794107612222433, + "learning_rate": 8.376266867412368e-05, + "loss": 0.013223852030932903, + "num_input_tokens_seen": 73348104, + "step": 4479, + "train_runtime": 36395.4823, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 2.7151515151515153, + "grad_norm": 0.008469006977975368, + "learning_rate": 8.375557532163568e-05, + "loss": 0.013661408796906471, + "num_input_tokens_seen": 73364480, + "step": 4480, + "train_runtime": 36403.5929, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 2.715757575757576, + "grad_norm": 0.009223740547895432, + "learning_rate": 8.374848072059469e-05, + "loss": 0.01242228876799345, + "num_input_tokens_seen": 73380856, + "step": 4481, + "train_runtime": 36411.704, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 2.7163636363636363, + "grad_norm": 0.01576068438589573, + "learning_rate": 8.374138487126311e-05, + "loss": 0.012605620548129082, + "num_input_tokens_seen": 73397232, + "step": 4482, + "train_runtime": 36419.8203, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 2.716969696969697, + "grad_norm": 0.010961165651679039, + "learning_rate": 8.37342877739034e-05, + "loss": 0.01267789676785469, + "num_input_tokens_seen": 73413608, + "step": 4483, + "train_runtime": 36427.9359, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 2.7175757575757578, + "grad_norm": 0.00765617610886693, + "learning_rate": 8.372718942877809e-05, + "loss": 0.012697771191596985, + "num_input_tokens_seen": 73429984, + "step": 4484, + "train_runtime": 36436.0486, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 2.7181818181818183, + "grad_norm": 0.008748726919293404, + "learning_rate": 8.372008983614973e-05, + "loss": 0.012355216778814793, + "num_input_tokens_seen": 73446360, + "step": 4485, + "train_runtime": 36444.1626, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 2.7187878787878788, + "grad_norm": 0.006933788303285837, + "learning_rate": 8.371298899628091e-05, + "loss": 0.012977370992302895, + "num_input_tokens_seen": 73462736, + "step": 4486, + "train_runtime": 36452.2805, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 2.7193939393939393, + "grad_norm": 0.005583157297223806, + "learning_rate": 8.370588690943428e-05, + "loss": 0.012528903782367706, + "num_input_tokens_seen": 73479112, + "step": 4487, + "train_runtime": 36460.3897, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 0.009750589728355408, + "learning_rate": 8.369878357587253e-05, + "loss": 0.0128232566639781, + "num_input_tokens_seen": 73495488, + "step": 4488, + "train_runtime": 36468.4997, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 2.7206060606060607, + "grad_norm": 0.006186305079609156, + "learning_rate": 8.369167899585841e-05, + "loss": 0.013384897261857986, + "num_input_tokens_seen": 73511864, + "step": 4489, + "train_runtime": 36476.6124, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 2.721212121212121, + "grad_norm": 0.016899501904845238, + "learning_rate": 8.368457316965468e-05, + "loss": 0.013230472803115845, + "num_input_tokens_seen": 73528240, + "step": 4490, + "train_runtime": 36484.7322, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 2.7218181818181817, + "grad_norm": 0.006692992523312569, + "learning_rate": 8.367746609752419e-05, + "loss": 0.010813414119184017, + "num_input_tokens_seen": 73544616, + "step": 4491, + "train_runtime": 36492.8444, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 2.7224242424242426, + "grad_norm": 0.011271690018475056, + "learning_rate": 8.367035777972982e-05, + "loss": 0.01094187330454588, + "num_input_tokens_seen": 73560992, + "step": 4492, + "train_runtime": 36500.9553, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 2.723030303030303, + "grad_norm": 0.007674421649426222, + "learning_rate": 8.366324821653449e-05, + "loss": 0.013626289553940296, + "num_input_tokens_seen": 73577368, + "step": 4493, + "train_runtime": 36509.0729, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 2.7236363636363636, + "grad_norm": 0.012051126919686794, + "learning_rate": 8.365613740820115e-05, + "loss": 0.013218015432357788, + "num_input_tokens_seen": 73593744, + "step": 4494, + "train_runtime": 36517.1844, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 2.724242424242424, + "grad_norm": 0.00684850150719285, + "learning_rate": 8.364902535499284e-05, + "loss": 0.010251245461404324, + "num_input_tokens_seen": 73610120, + "step": 4495, + "train_runtime": 36525.3003, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 2.7248484848484846, + "grad_norm": 0.007850105874240398, + "learning_rate": 8.36419120571726e-05, + "loss": 0.013076088391244411, + "num_input_tokens_seen": 73626496, + "step": 4496, + "train_runtime": 36533.4156, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 2.7254545454545456, + "grad_norm": 0.011168957687914371, + "learning_rate": 8.363479751500356e-05, + "loss": 0.012956866063177586, + "num_input_tokens_seen": 73642872, + "step": 4497, + "train_runtime": 36541.5331, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 2.726060606060606, + "grad_norm": 0.00739514222368598, + "learning_rate": 8.362768172874884e-05, + "loss": 0.01177054550498724, + "num_input_tokens_seen": 73659248, + "step": 4498, + "train_runtime": 36549.6446, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 2.7266666666666666, + "grad_norm": 0.005301035940647125, + "learning_rate": 8.362056469867168e-05, + "loss": 0.012158683501183987, + "num_input_tokens_seen": 73675624, + "step": 4499, + "train_runtime": 36557.7563, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 2.7272727272727275, + "grad_norm": 0.004065181594341993, + "learning_rate": 8.36134464250353e-05, + "loss": 0.011116517707705498, + "num_input_tokens_seen": 73692000, + "step": 4500, + "train_runtime": 36565.8663, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 2.727878787878788, + "grad_norm": 0.0071369255892932415, + "learning_rate": 8.3606326908103e-05, + "loss": 0.012784713879227638, + "num_input_tokens_seen": 73708376, + "step": 4501, + "train_runtime": 36574.8464, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.7284848484848485, + "grad_norm": 0.028517471626400948, + "learning_rate": 8.359920614813811e-05, + "loss": 0.011820834130048752, + "num_input_tokens_seen": 73724752, + "step": 4502, + "train_runtime": 36582.9603, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.729090909090909, + "grad_norm": 0.006723679136484861, + "learning_rate": 8.359208414540402e-05, + "loss": 0.011836536228656769, + "num_input_tokens_seen": 73741128, + "step": 4503, + "train_runtime": 36591.0715, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 2.7296969696969695, + "grad_norm": 0.008562642149627209, + "learning_rate": 8.358496090016416e-05, + "loss": 0.011798292398452759, + "num_input_tokens_seen": 73757504, + "step": 4504, + "train_runtime": 36599.1854, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 2.7303030303030305, + "grad_norm": 0.007878309115767479, + "learning_rate": 8.357783641268197e-05, + "loss": 0.012860746122896671, + "num_input_tokens_seen": 73773880, + "step": 4505, + "train_runtime": 36607.2993, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.730909090909091, + "grad_norm": 0.010284132324159145, + "learning_rate": 8.357071068322104e-05, + "loss": 0.011526248417794704, + "num_input_tokens_seen": 73790256, + "step": 4506, + "train_runtime": 36615.4199, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.7315151515151515, + "grad_norm": 0.00934764463454485, + "learning_rate": 8.356358371204487e-05, + "loss": 0.013024148531258106, + "num_input_tokens_seen": 73806632, + "step": 4507, + "train_runtime": 36623.543, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.7321212121212124, + "grad_norm": 0.007866177707910538, + "learning_rate": 8.355645549941711e-05, + "loss": 0.011900964193046093, + "num_input_tokens_seen": 73823008, + "step": 4508, + "train_runtime": 36631.6615, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.732727272727273, + "grad_norm": 0.006257795263081789, + "learning_rate": 8.35493260456014e-05, + "loss": 0.012311553582549095, + "num_input_tokens_seen": 73839384, + "step": 4509, + "train_runtime": 36639.7785, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 0.01223308127373457, + "learning_rate": 8.354219535086147e-05, + "loss": 0.014281507581472397, + "num_input_tokens_seen": 73855760, + "step": 4510, + "train_runtime": 36647.8966, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.733939393939394, + "grad_norm": 0.014745892956852913, + "learning_rate": 8.353506341546104e-05, + "loss": 0.013549687340855598, + "num_input_tokens_seen": 73872136, + "step": 4511, + "train_runtime": 36656.0105, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.7345454545454544, + "grad_norm": 0.013305160216987133, + "learning_rate": 8.352793023966395e-05, + "loss": 0.011918231844902039, + "num_input_tokens_seen": 73888512, + "step": 4512, + "train_runtime": 36664.1325, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.7351515151515153, + "grad_norm": 0.00820198841392994, + "learning_rate": 8.352079582373398e-05, + "loss": 0.012731670401990414, + "num_input_tokens_seen": 73904888, + "step": 4513, + "train_runtime": 36672.2479, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.735757575757576, + "grad_norm": 0.00963593740016222, + "learning_rate": 8.351366016793507e-05, + "loss": 0.009951244108378887, + "num_input_tokens_seen": 73921264, + "step": 4514, + "train_runtime": 36680.3657, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.7363636363636363, + "grad_norm": 0.008508513681590557, + "learning_rate": 8.350652327253112e-05, + "loss": 0.01182563602924347, + "num_input_tokens_seen": 73937640, + "step": 4515, + "train_runtime": 36688.5617, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.736969696969697, + "grad_norm": 0.0079377805814147, + "learning_rate": 8.349938513778613e-05, + "loss": 0.012589650228619576, + "num_input_tokens_seen": 73954016, + "step": 4516, + "train_runtime": 36696.6757, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.7375757575757573, + "grad_norm": 0.006075101438909769, + "learning_rate": 8.349224576396413e-05, + "loss": 0.012828178703784943, + "num_input_tokens_seen": 73970392, + "step": 4517, + "train_runtime": 36704.7905, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.7381818181818183, + "grad_norm": 0.005510466173291206, + "learning_rate": 8.348510515132916e-05, + "loss": 0.01271991990506649, + "num_input_tokens_seen": 73986768, + "step": 4518, + "train_runtime": 36712.9028, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.7387878787878788, + "grad_norm": 0.00571348937228322, + "learning_rate": 8.34779633001454e-05, + "loss": 0.012127124704420567, + "num_input_tokens_seen": 74003144, + "step": 4519, + "train_runtime": 36721.0212, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.7393939393939393, + "grad_norm": 0.007746982388198376, + "learning_rate": 8.347082021067694e-05, + "loss": 0.012520769611001015, + "num_input_tokens_seen": 74019520, + "step": 4520, + "train_runtime": 36729.1374, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.74, + "grad_norm": 0.007642251439392567, + "learning_rate": 8.346367588318804e-05, + "loss": 0.011345982551574707, + "num_input_tokens_seen": 74035896, + "step": 4521, + "train_runtime": 36737.2571, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.7406060606060607, + "grad_norm": 0.009121562354266644, + "learning_rate": 8.345653031794292e-05, + "loss": 0.011867363005876541, + "num_input_tokens_seen": 74052272, + "step": 4522, + "train_runtime": 36745.3695, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.741212121212121, + "grad_norm": 0.012458735145628452, + "learning_rate": 8.34493835152059e-05, + "loss": 0.011831055395305157, + "num_input_tokens_seen": 74068648, + "step": 4523, + "train_runtime": 36753.4845, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.7418181818181817, + "grad_norm": 0.006926266942173243, + "learning_rate": 8.344223547524132e-05, + "loss": 0.012583856470882893, + "num_input_tokens_seen": 74085024, + "step": 4524, + "train_runtime": 36761.5974, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.742424242424242, + "grad_norm": 0.008176716975867748, + "learning_rate": 8.34350861983136e-05, + "loss": 0.011472863145172596, + "num_input_tokens_seen": 74101400, + "step": 4525, + "train_runtime": 36769.7112, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.743030303030303, + "grad_norm": 0.0065798284485936165, + "learning_rate": 8.342793568468713e-05, + "loss": 0.01237739622592926, + "num_input_tokens_seen": 74117776, + "step": 4526, + "train_runtime": 36777.8323, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.7436363636363637, + "grad_norm": 0.011811626143753529, + "learning_rate": 8.342078393462643e-05, + "loss": 0.012651550583541393, + "num_input_tokens_seen": 74134152, + "step": 4527, + "train_runtime": 36785.9456, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.744242424242424, + "grad_norm": 0.018916035071015358, + "learning_rate": 8.341363094839601e-05, + "loss": 0.013918230310082436, + "num_input_tokens_seen": 74150528, + "step": 4528, + "train_runtime": 36794.0609, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.744848484848485, + "grad_norm": 0.008403911255300045, + "learning_rate": 8.340647672626046e-05, + "loss": 0.011243307031691074, + "num_input_tokens_seen": 74166904, + "step": 4529, + "train_runtime": 36802.1753, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.7454545454545456, + "grad_norm": 0.0066023096442222595, + "learning_rate": 8.339932126848437e-05, + "loss": 0.013308960013091564, + "num_input_tokens_seen": 74183280, + "step": 4530, + "train_runtime": 36810.2876, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.746060606060606, + "grad_norm": 0.006932645104825497, + "learning_rate": 8.339216457533244e-05, + "loss": 0.012073111720383167, + "num_input_tokens_seen": 74199656, + "step": 4531, + "train_runtime": 36818.4011, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.7466666666666666, + "grad_norm": 0.0067931716330349445, + "learning_rate": 8.338500664706936e-05, + "loss": 0.012355629354715347, + "num_input_tokens_seen": 74216032, + "step": 4532, + "train_runtime": 36826.513, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.747272727272727, + "grad_norm": 0.007497054990381002, + "learning_rate": 8.337784748395992e-05, + "loss": 0.012315018102526665, + "num_input_tokens_seen": 74232408, + "step": 4533, + "train_runtime": 36834.6318, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.747878787878788, + "grad_norm": 0.0089682312682271, + "learning_rate": 8.33706870862689e-05, + "loss": 0.012521359138190746, + "num_input_tokens_seen": 74248784, + "step": 4534, + "train_runtime": 36842.7493, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.7484848484848485, + "grad_norm": 0.005700670648366213, + "learning_rate": 8.336352545426114e-05, + "loss": 0.010872012004256248, + "num_input_tokens_seen": 74265160, + "step": 4535, + "train_runtime": 36851.0614, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.749090909090909, + "grad_norm": 0.009577528573572636, + "learning_rate": 8.335636258820155e-05, + "loss": 0.011853006668388844, + "num_input_tokens_seen": 74281536, + "step": 4536, + "train_runtime": 36859.1749, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.74969696969697, + "grad_norm": 0.006186668295413256, + "learning_rate": 8.334919848835507e-05, + "loss": 0.011964666657149792, + "num_input_tokens_seen": 74297912, + "step": 4537, + "train_runtime": 36867.2871, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.75030303030303, + "grad_norm": 0.008738999255001545, + "learning_rate": 8.334203315498668e-05, + "loss": 0.011959796771407127, + "num_input_tokens_seen": 74314288, + "step": 4538, + "train_runtime": 36875.4004, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.750909090909091, + "grad_norm": 0.009763142094016075, + "learning_rate": 8.33348665883614e-05, + "loss": 0.01147974468767643, + "num_input_tokens_seen": 74330664, + "step": 4539, + "train_runtime": 36883.5178, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.7515151515151515, + "grad_norm": 0.010166713036596775, + "learning_rate": 8.332769878874434e-05, + "loss": 0.01307996828109026, + "num_input_tokens_seen": 74347040, + "step": 4540, + "train_runtime": 36891.6358, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.752121212121212, + "grad_norm": 0.005661500617861748, + "learning_rate": 8.332052975640061e-05, + "loss": 0.012151244096457958, + "num_input_tokens_seen": 74363416, + "step": 4541, + "train_runtime": 36899.7527, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.752727272727273, + "grad_norm": 0.007195177022367716, + "learning_rate": 8.331335949159535e-05, + "loss": 0.011629585176706314, + "num_input_tokens_seen": 74379792, + "step": 4542, + "train_runtime": 36907.8672, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.7533333333333334, + "grad_norm": 0.015221587382256985, + "learning_rate": 8.330618799459381e-05, + "loss": 0.013828705064952374, + "num_input_tokens_seen": 74396168, + "step": 4543, + "train_runtime": 36915.981, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.753939393939394, + "grad_norm": 0.006161235738545656, + "learning_rate": 8.329901526566124e-05, + "loss": 0.011762632988393307, + "num_input_tokens_seen": 74412544, + "step": 4544, + "train_runtime": 36924.0933, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.7545454545454544, + "grad_norm": 0.013080923818051815, + "learning_rate": 8.329184130506294e-05, + "loss": 0.011559555307030678, + "num_input_tokens_seen": 74428920, + "step": 4545, + "train_runtime": 36932.2105, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.755151515151515, + "grad_norm": 0.008135508745908737, + "learning_rate": 8.328466611306427e-05, + "loss": 0.013216247782111168, + "num_input_tokens_seen": 74445296, + "step": 4546, + "train_runtime": 36940.321, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.755757575757576, + "grad_norm": 0.005981654394418001, + "learning_rate": 8.327748968993061e-05, + "loss": 0.012581274844706059, + "num_input_tokens_seen": 74461672, + "step": 4547, + "train_runtime": 36948.4358, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.7563636363636363, + "grad_norm": 0.007957780733704567, + "learning_rate": 8.32703120359274e-05, + "loss": 0.012196117080748081, + "num_input_tokens_seen": 74478048, + "step": 4548, + "train_runtime": 36956.5511, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.756969696969697, + "grad_norm": 0.008788809180259705, + "learning_rate": 8.326313315132016e-05, + "loss": 0.011585334315896034, + "num_input_tokens_seen": 74494424, + "step": 4549, + "train_runtime": 36964.6672, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.757575757575758, + "grad_norm": 0.01375692430883646, + "learning_rate": 8.325595303637439e-05, + "loss": 0.013039148412644863, + "num_input_tokens_seen": 74510800, + "step": 4550, + "train_runtime": 36972.7849, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.7581818181818183, + "grad_norm": 0.00947599858045578, + "learning_rate": 8.324877169135569e-05, + "loss": 0.012019573710858822, + "num_input_tokens_seen": 74527176, + "step": 4551, + "train_runtime": 36980.9009, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.758787878787879, + "grad_norm": 0.008802402764558792, + "learning_rate": 8.324158911652966e-05, + "loss": 0.012496326118707657, + "num_input_tokens_seen": 74543552, + "step": 4552, + "train_runtime": 36989.0202, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.7593939393939393, + "grad_norm": 0.0026664589531719685, + "learning_rate": 8.323440531216199e-05, + "loss": 0.01244363747537136, + "num_input_tokens_seen": 74559928, + "step": 4553, + "train_runtime": 36997.1357, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.76, + "grad_norm": 0.008980312384665012, + "learning_rate": 8.322722027851839e-05, + "loss": 0.01205851137638092, + "num_input_tokens_seen": 74576304, + "step": 4554, + "train_runtime": 37005.2456, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.7606060606060607, + "grad_norm": 0.007881799712777138, + "learning_rate": 8.322003401586462e-05, + "loss": 0.011951956897974014, + "num_input_tokens_seen": 74592680, + "step": 4555, + "train_runtime": 37013.3608, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.7612121212121212, + "grad_norm": 0.009107083082199097, + "learning_rate": 8.32128465244665e-05, + "loss": 0.012532702647149563, + "num_input_tokens_seen": 74609056, + "step": 4556, + "train_runtime": 37021.4735, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.7618181818181817, + "grad_norm": 0.010470434091985226, + "learning_rate": 8.320565780458984e-05, + "loss": 0.012278139591217041, + "num_input_tokens_seen": 74625432, + "step": 4557, + "train_runtime": 37029.5982, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.7624242424242427, + "grad_norm": 0.009943035431206226, + "learning_rate": 8.319846785650057e-05, + "loss": 0.011647533625364304, + "num_input_tokens_seen": 74641808, + "step": 4558, + "train_runtime": 37037.7121, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.763030303030303, + "grad_norm": 0.011546614579856396, + "learning_rate": 8.319127668046463e-05, + "loss": 0.012851388193666935, + "num_input_tokens_seen": 74658184, + "step": 4559, + "train_runtime": 37045.8315, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.7636363636363637, + "grad_norm": 0.007627122104167938, + "learning_rate": 8.3184084276748e-05, + "loss": 0.012408561073243618, + "num_input_tokens_seen": 74674560, + "step": 4560, + "train_runtime": 37053.9526, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 2.764242424242424, + "grad_norm": 0.005179052706807852, + "learning_rate": 8.317689064561671e-05, + "loss": 0.012707646004855633, + "num_input_tokens_seen": 74690936, + "step": 4561, + "train_runtime": 37062.0674, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 2.7648484848484847, + "grad_norm": 0.004154569003731012, + "learning_rate": 8.316969578733686e-05, + "loss": 0.010566530749201775, + "num_input_tokens_seen": 74707312, + "step": 4562, + "train_runtime": 37070.1778, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.7654545454545456, + "grad_norm": 0.007608859799802303, + "learning_rate": 8.316249970217454e-05, + "loss": 0.012400549836456776, + "num_input_tokens_seen": 74723688, + "step": 4563, + "train_runtime": 37078.2907, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 2.766060606060606, + "grad_norm": 0.008431745693087578, + "learning_rate": 8.315530239039595e-05, + "loss": 0.01226590946316719, + "num_input_tokens_seen": 74740064, + "step": 4564, + "train_runtime": 37086.4041, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.7666666666666666, + "grad_norm": 0.002696508541703224, + "learning_rate": 8.314810385226728e-05, + "loss": 0.011671803891658783, + "num_input_tokens_seen": 74756440, + "step": 4565, + "train_runtime": 37094.5208, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.767272727272727, + "grad_norm": 0.01169643085449934, + "learning_rate": 8.314090408805482e-05, + "loss": 0.013272207230329514, + "num_input_tokens_seen": 74772816, + "step": 4566, + "train_runtime": 37102.6361, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.7678787878787876, + "grad_norm": 0.007161509245634079, + "learning_rate": 8.313370309802483e-05, + "loss": 0.01238691620528698, + "num_input_tokens_seen": 74789192, + "step": 4567, + "train_runtime": 37110.7544, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.7684848484848485, + "grad_norm": 0.013030235655605793, + "learning_rate": 8.312650088244372e-05, + "loss": 0.012166631408035755, + "num_input_tokens_seen": 74805568, + "step": 4568, + "train_runtime": 37118.8718, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.769090909090909, + "grad_norm": 0.007812634110450745, + "learning_rate": 8.311929744157783e-05, + "loss": 0.011776421219110489, + "num_input_tokens_seen": 74821944, + "step": 4569, + "train_runtime": 37126.9848, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.7696969696969695, + "grad_norm": 0.0065947952680289745, + "learning_rate": 8.311209277569364e-05, + "loss": 0.012017328292131424, + "num_input_tokens_seen": 74838320, + "step": 4570, + "train_runtime": 37135.0999, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 2.7703030303030305, + "grad_norm": 0.010814502835273743, + "learning_rate": 8.31048868850576e-05, + "loss": 0.011847824789583683, + "num_input_tokens_seen": 74854696, + "step": 4571, + "train_runtime": 37143.2152, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 2.770909090909091, + "grad_norm": 0.009051995351910591, + "learning_rate": 8.309767976993627e-05, + "loss": 0.011820012703537941, + "num_input_tokens_seen": 74871072, + "step": 4572, + "train_runtime": 37151.336, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 2.7715151515151515, + "grad_norm": 0.006676412653177977, + "learning_rate": 8.309047143059623e-05, + "loss": 0.01197296567261219, + "num_input_tokens_seen": 74887448, + "step": 4573, + "train_runtime": 37159.4484, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 2.772121212121212, + "grad_norm": 0.006181318312883377, + "learning_rate": 8.308326186730409e-05, + "loss": 0.011802428402006626, + "num_input_tokens_seen": 74903824, + "step": 4574, + "train_runtime": 37167.5614, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 2.7727272727272725, + "grad_norm": 0.0034635437186807394, + "learning_rate": 8.30760510803265e-05, + "loss": 0.011104054749011993, + "num_input_tokens_seen": 74920200, + "step": 4575, + "train_runtime": 37175.6773, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.7733333333333334, + "grad_norm": 0.007076766341924667, + "learning_rate": 8.306883906993022e-05, + "loss": 0.01123069878667593, + "num_input_tokens_seen": 74936576, + "step": 4576, + "train_runtime": 37183.7892, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.773939393939394, + "grad_norm": 0.006231664214283228, + "learning_rate": 8.306162583638197e-05, + "loss": 0.012108227238059044, + "num_input_tokens_seen": 74952952, + "step": 4577, + "train_runtime": 37191.9057, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.7745454545454544, + "grad_norm": 0.004386771935969591, + "learning_rate": 8.305441137994856e-05, + "loss": 0.010733839124441147, + "num_input_tokens_seen": 74969328, + "step": 4578, + "train_runtime": 37200.0207, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.7751515151515154, + "grad_norm": 0.0074032628908753395, + "learning_rate": 8.304719570089685e-05, + "loss": 0.012523296289145947, + "num_input_tokens_seen": 74985704, + "step": 4579, + "train_runtime": 37208.1357, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 2.775757575757576, + "grad_norm": 0.006272404920309782, + "learning_rate": 8.303997879949373e-05, + "loss": 0.012356406077742577, + "num_input_tokens_seen": 75002080, + "step": 4580, + "train_runtime": 37216.2473, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.7763636363636364, + "grad_norm": 0.007595263421535492, + "learning_rate": 8.303276067600614e-05, + "loss": 0.011584490537643433, + "num_input_tokens_seen": 75018456, + "step": 4581, + "train_runtime": 37224.3619, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.776969696969697, + "grad_norm": 0.008950237184762955, + "learning_rate": 8.302554133070103e-05, + "loss": 0.011969258077442646, + "num_input_tokens_seen": 75034832, + "step": 4582, + "train_runtime": 37232.4789, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 2.7775757575757574, + "grad_norm": 0.0075401319190859795, + "learning_rate": 8.301832076384551e-05, + "loss": 0.01156130526214838, + "num_input_tokens_seen": 75051208, + "step": 4583, + "train_runtime": 37240.5884, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 2.7781818181818183, + "grad_norm": 0.0088722612708807, + "learning_rate": 8.301109897570657e-05, + "loss": 0.011036232113838196, + "num_input_tokens_seen": 75067584, + "step": 4584, + "train_runtime": 37248.7013, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 2.778787878787879, + "grad_norm": 0.00827216450124979, + "learning_rate": 8.300387596655137e-05, + "loss": 0.01255103386938572, + "num_input_tokens_seen": 75083960, + "step": 4585, + "train_runtime": 37256.8136, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 2.7793939393939393, + "grad_norm": 0.00940687209367752, + "learning_rate": 8.299665173664708e-05, + "loss": 0.010962124913930893, + "num_input_tokens_seen": 75100336, + "step": 4586, + "train_runtime": 37264.932, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 2.7800000000000002, + "grad_norm": 0.00792167242616415, + "learning_rate": 8.298942628626089e-05, + "loss": 0.012415515258908272, + "num_input_tokens_seen": 75116712, + "step": 4587, + "train_runtime": 37273.0444, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 2.7806060606060607, + "grad_norm": 0.007778497412800789, + "learning_rate": 8.298219961566009e-05, + "loss": 0.011703824624419212, + "num_input_tokens_seen": 75133088, + "step": 4588, + "train_runtime": 37281.1557, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 2.7812121212121212, + "grad_norm": 0.010049012489616871, + "learning_rate": 8.297497172511192e-05, + "loss": 0.011274192482233047, + "num_input_tokens_seen": 75149464, + "step": 4589, + "train_runtime": 37289.271, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 2.7818181818181817, + "grad_norm": 0.01315548550337553, + "learning_rate": 8.296774261488378e-05, + "loss": 0.01203350629657507, + "num_input_tokens_seen": 75165840, + "step": 4590, + "train_runtime": 37297.3868, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 2.7824242424242422, + "grad_norm": 0.011911927722394466, + "learning_rate": 8.296051228524305e-05, + "loss": 0.01165513601154089, + "num_input_tokens_seen": 75182216, + "step": 4591, + "train_runtime": 37305.5036, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 2.783030303030303, + "grad_norm": 0.008554724976420403, + "learning_rate": 8.295328073645716e-05, + "loss": 0.011198979802429676, + "num_input_tokens_seen": 75198592, + "step": 4592, + "train_runtime": 37313.6162, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 2.7836363636363637, + "grad_norm": 0.009792471304535866, + "learning_rate": 8.294604796879357e-05, + "loss": 0.012244784273207188, + "num_input_tokens_seen": 75214968, + "step": 4593, + "train_runtime": 37321.7319, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 2.784242424242424, + "grad_norm": 0.005816595163196325, + "learning_rate": 8.293881398251984e-05, + "loss": 0.011864472180604935, + "num_input_tokens_seen": 75231344, + "step": 4594, + "train_runtime": 37329.8426, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 2.7848484848484847, + "grad_norm": 0.011942507699131966, + "learning_rate": 8.293157877790352e-05, + "loss": 0.012738109566271305, + "num_input_tokens_seen": 75247720, + "step": 4595, + "train_runtime": 37337.9542, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 2.785454545454545, + "grad_norm": 0.00609165383502841, + "learning_rate": 8.292434235521222e-05, + "loss": 0.011192462407052517, + "num_input_tokens_seen": 75264096, + "step": 4596, + "train_runtime": 37346.0705, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 2.786060606060606, + "grad_norm": 0.008764318190515041, + "learning_rate": 8.291710471471363e-05, + "loss": 0.012121830135583878, + "num_input_tokens_seen": 75280472, + "step": 4597, + "train_runtime": 37354.1901, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 2.7866666666666666, + "grad_norm": 0.009747988544404507, + "learning_rate": 8.290986585667544e-05, + "loss": 0.011521042324602604, + "num_input_tokens_seen": 75296848, + "step": 4598, + "train_runtime": 37362.3021, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 2.787272727272727, + "grad_norm": 0.0075605809688568115, + "learning_rate": 8.29026257813654e-05, + "loss": 0.012785978615283966, + "num_input_tokens_seen": 75313224, + "step": 4599, + "train_runtime": 37370.4162, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 2.787878787878788, + "grad_norm": 0.007667553145438433, + "learning_rate": 8.289538448905131e-05, + "loss": 0.012494080699980259, + "num_input_tokens_seen": 75329600, + "step": 4600, + "train_runtime": 37378.5328, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 2.7884848484848486, + "grad_norm": 0.006360137369483709, + "learning_rate": 8.2888141980001e-05, + "loss": 0.01090994756668806, + "num_input_tokens_seen": 75345976, + "step": 4601, + "train_runtime": 37387.5601, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 2.789090909090909, + "grad_norm": 0.009333646856248379, + "learning_rate": 8.288089825448237e-05, + "loss": 0.011638118885457516, + "num_input_tokens_seen": 75362352, + "step": 4602, + "train_runtime": 37395.6731, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 2.7896969696969696, + "grad_norm": 0.009527266956865788, + "learning_rate": 8.287365331276335e-05, + "loss": 0.012087761424481869, + "num_input_tokens_seen": 75378728, + "step": 4603, + "train_runtime": 37403.7858, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 2.79030303030303, + "grad_norm": 0.008427105844020844, + "learning_rate": 8.28664071551119e-05, + "loss": 0.012114526703953743, + "num_input_tokens_seen": 75395104, + "step": 4604, + "train_runtime": 37411.8991, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 2.790909090909091, + "grad_norm": 0.006635100580751896, + "learning_rate": 8.285915978179606e-05, + "loss": 0.011045638471841812, + "num_input_tokens_seen": 75411480, + "step": 4605, + "train_runtime": 37420.0111, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 2.7915151515151515, + "grad_norm": 0.009286362677812576, + "learning_rate": 8.285191119308389e-05, + "loss": 0.011749152094125748, + "num_input_tokens_seen": 75427856, + "step": 4606, + "train_runtime": 37428.1221, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 2.792121212121212, + "grad_norm": 0.0053281174041330814, + "learning_rate": 8.28446613892435e-05, + "loss": 0.012263793498277664, + "num_input_tokens_seen": 75444232, + "step": 4607, + "train_runtime": 37436.232, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 2.792727272727273, + "grad_norm": 0.02770264819264412, + "learning_rate": 8.283741037054305e-05, + "loss": 0.01143752969801426, + "num_input_tokens_seen": 75460608, + "step": 4608, + "train_runtime": 37444.3448, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.7933333333333334, + "grad_norm": 0.011117475107312202, + "learning_rate": 8.283015813725074e-05, + "loss": 0.01322878897190094, + "num_input_tokens_seen": 75476984, + "step": 4609, + "train_runtime": 37452.4597, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.793939393939394, + "grad_norm": 0.006448531523346901, + "learning_rate": 8.282290468963481e-05, + "loss": 0.012871074490249157, + "num_input_tokens_seen": 75493360, + "step": 4610, + "train_runtime": 37460.5742, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.7945454545454544, + "grad_norm": 0.014228587038815022, + "learning_rate": 8.281565002796356e-05, + "loss": 0.011354008689522743, + "num_input_tokens_seen": 75509736, + "step": 4611, + "train_runtime": 37468.6907, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.795151515151515, + "grad_norm": 0.006881978362798691, + "learning_rate": 8.280839415250532e-05, + "loss": 0.012481747195124626, + "num_input_tokens_seen": 75526112, + "step": 4612, + "train_runtime": 37476.8043, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.795757575757576, + "grad_norm": 0.008226819336414337, + "learning_rate": 8.280113706352845e-05, + "loss": 0.011904973536729813, + "num_input_tokens_seen": 75542488, + "step": 4613, + "train_runtime": 37484.9153, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 2.7963636363636364, + "grad_norm": 0.008040538057684898, + "learning_rate": 8.279387876130143e-05, + "loss": 0.012113945558667183, + "num_input_tokens_seen": 75558864, + "step": 4614, + "train_runtime": 37493.0315, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.796969696969697, + "grad_norm": 0.006377527490258217, + "learning_rate": 8.278661924609271e-05, + "loss": 0.012096541002392769, + "num_input_tokens_seen": 75575240, + "step": 4615, + "train_runtime": 37501.1498, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.797575757575758, + "grad_norm": 0.0043525369837880135, + "learning_rate": 8.277935851817075e-05, + "loss": 0.012137744575738907, + "num_input_tokens_seen": 75591616, + "step": 4616, + "train_runtime": 37509.2641, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.7981818181818183, + "grad_norm": 0.05825034901499748, + "learning_rate": 8.277209657780417e-05, + "loss": 0.01267202664166689, + "num_input_tokens_seen": 75607992, + "step": 4617, + "train_runtime": 37517.3804, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.798787878787879, + "grad_norm": 0.007179918698966503, + "learning_rate": 8.276483342526155e-05, + "loss": 0.010754615068435669, + "num_input_tokens_seen": 75624368, + "step": 4618, + "train_runtime": 37525.4981, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.7993939393939393, + "grad_norm": 0.0027494365349411964, + "learning_rate": 8.275756906081157e-05, + "loss": 0.011694258078932762, + "num_input_tokens_seen": 75640744, + "step": 4619, + "train_runtime": 37533.6155, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.8, + "grad_norm": 0.004198397044092417, + "learning_rate": 8.275030348472289e-05, + "loss": 0.012074324302375317, + "num_input_tokens_seen": 75657120, + "step": 4620, + "train_runtime": 37541.7318, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.8006060606060608, + "grad_norm": 0.00927363894879818, + "learning_rate": 8.274303669726426e-05, + "loss": 0.012912195175886154, + "num_input_tokens_seen": 75673496, + "step": 4621, + "train_runtime": 37549.8424, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.8012121212121213, + "grad_norm": 0.0049201371148228645, + "learning_rate": 8.273576869870448e-05, + "loss": 0.011908473446965218, + "num_input_tokens_seen": 75689872, + "step": 4622, + "train_runtime": 37557.9579, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.8018181818181818, + "grad_norm": 0.008473035879433155, + "learning_rate": 8.272849948931234e-05, + "loss": 0.01259793620556593, + "num_input_tokens_seen": 75706248, + "step": 4623, + "train_runtime": 37566.0718, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.8024242424242423, + "grad_norm": 0.026559488847851753, + "learning_rate": 8.272122906935675e-05, + "loss": 0.012545714154839516, + "num_input_tokens_seen": 75722624, + "step": 4624, + "train_runtime": 37574.1851, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.8030303030303028, + "grad_norm": 0.008581848815083504, + "learning_rate": 8.271395743910664e-05, + "loss": 0.011335734277963638, + "num_input_tokens_seen": 75739000, + "step": 4625, + "train_runtime": 37582.2967, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.8036363636363637, + "grad_norm": 0.011590315960347652, + "learning_rate": 8.270668459883093e-05, + "loss": 0.012633738107979298, + "num_input_tokens_seen": 75755376, + "step": 4626, + "train_runtime": 37590.4122, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.804242424242424, + "grad_norm": 0.020206259563565254, + "learning_rate": 8.269941054879867e-05, + "loss": 0.013163880445063114, + "num_input_tokens_seen": 75771752, + "step": 4627, + "train_runtime": 37598.5224, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.8048484848484847, + "grad_norm": 0.008124117739498615, + "learning_rate": 8.269213528927886e-05, + "loss": 0.012483615428209305, + "num_input_tokens_seen": 75788128, + "step": 4628, + "train_runtime": 37606.6359, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.8054545454545456, + "grad_norm": 0.0028913484420627356, + "learning_rate": 8.268485882054065e-05, + "loss": 0.012408132664859295, + "num_input_tokens_seen": 75804504, + "step": 4629, + "train_runtime": 37614.7489, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.806060606060606, + "grad_norm": 0.005892369896173477, + "learning_rate": 8.267758114285315e-05, + "loss": 0.010854450985789299, + "num_input_tokens_seen": 75820880, + "step": 4630, + "train_runtime": 37622.8654, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.8066666666666666, + "grad_norm": 0.008197440765798092, + "learning_rate": 8.26703022564856e-05, + "loss": 0.012501079589128494, + "num_input_tokens_seen": 75837256, + "step": 4631, + "train_runtime": 37630.9803, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.807272727272727, + "grad_norm": 0.02512498013675213, + "learning_rate": 8.266302216170715e-05, + "loss": 0.012897662818431854, + "num_input_tokens_seen": 75853632, + "step": 4632, + "train_runtime": 37639.0926, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.8078787878787876, + "grad_norm": 0.011668462306261063, + "learning_rate": 8.265574085878713e-05, + "loss": 0.012918482534587383, + "num_input_tokens_seen": 75870008, + "step": 4633, + "train_runtime": 37647.2041, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.8084848484848486, + "grad_norm": 0.06489795446395874, + "learning_rate": 8.264845834799487e-05, + "loss": 0.012461014091968536, + "num_input_tokens_seen": 75886384, + "step": 4634, + "train_runtime": 37655.3218, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.809090909090909, + "grad_norm": 0.026794573292136192, + "learning_rate": 8.264117462959968e-05, + "loss": 0.012038145214319229, + "num_input_tokens_seen": 75902760, + "step": 4635, + "train_runtime": 37663.436, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.8096969696969696, + "grad_norm": 0.004727222956717014, + "learning_rate": 8.263388970387102e-05, + "loss": 0.011617126874625683, + "num_input_tokens_seen": 75919136, + "step": 4636, + "train_runtime": 37671.9735, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 2.8103030303030305, + "grad_norm": 0.009042751975357533, + "learning_rate": 8.262660357107835e-05, + "loss": 0.011723637580871582, + "num_input_tokens_seen": 75935512, + "step": 4637, + "train_runtime": 37680.0914, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 2.810909090909091, + "grad_norm": 0.005566759500652552, + "learning_rate": 8.261931623149115e-05, + "loss": 0.0113048255443573, + "num_input_tokens_seen": 75951888, + "step": 4638, + "train_runtime": 37688.2072, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 2.8115151515151515, + "grad_norm": 0.02460971102118492, + "learning_rate": 8.261202768537895e-05, + "loss": 0.012707947753369808, + "num_input_tokens_seen": 75968264, + "step": 4639, + "train_runtime": 37696.3195, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 2.812121212121212, + "grad_norm": 0.014103865250945091, + "learning_rate": 8.260473793301135e-05, + "loss": 0.01248024683445692, + "num_input_tokens_seen": 75984640, + "step": 4640, + "train_runtime": 37704.4331, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 2.8127272727272725, + "grad_norm": 0.006093468051403761, + "learning_rate": 8.259744697465799e-05, + "loss": 0.011360524222254753, + "num_input_tokens_seen": 76001016, + "step": 4641, + "train_runtime": 37712.5473, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 2.8133333333333335, + "grad_norm": 0.006964137777686119, + "learning_rate": 8.259015481058856e-05, + "loss": 0.011688297614455223, + "num_input_tokens_seen": 76017392, + "step": 4642, + "train_runtime": 37720.6604, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 2.813939393939394, + "grad_norm": 0.007173947524279356, + "learning_rate": 8.258286144107276e-05, + "loss": 0.011954311281442642, + "num_input_tokens_seen": 76033768, + "step": 4643, + "train_runtime": 37728.7715, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 2.8145454545454545, + "grad_norm": 0.010495497845113277, + "learning_rate": 8.257556686638038e-05, + "loss": 0.011953799985349178, + "num_input_tokens_seen": 76050144, + "step": 4644, + "train_runtime": 37736.8879, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 2.8151515151515154, + "grad_norm": 0.01465742476284504, + "learning_rate": 8.25682710867812e-05, + "loss": 0.012720690108835697, + "num_input_tokens_seen": 76066520, + "step": 4645, + "train_runtime": 37745.004, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.815757575757576, + "grad_norm": 0.00989493913948536, + "learning_rate": 8.256097410254512e-05, + "loss": 0.011029507964849472, + "num_input_tokens_seen": 76082896, + "step": 4646, + "train_runtime": 37753.1147, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.8163636363636364, + "grad_norm": 0.0075317383743822575, + "learning_rate": 8.255367591394201e-05, + "loss": 0.012229567393660545, + "num_input_tokens_seen": 76099272, + "step": 4647, + "train_runtime": 37761.2321, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.816969696969697, + "grad_norm": 0.007885935716331005, + "learning_rate": 8.254637652124182e-05, + "loss": 0.013478076085448265, + "num_input_tokens_seen": 76115648, + "step": 4648, + "train_runtime": 37769.3475, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.8175757575757574, + "grad_norm": 0.008157597854733467, + "learning_rate": 8.253907592471453e-05, + "loss": 0.011275263503193855, + "num_input_tokens_seen": 76132024, + "step": 4649, + "train_runtime": 37777.4613, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.8181818181818183, + "grad_norm": 0.006938634905964136, + "learning_rate": 8.25317741246302e-05, + "loss": 0.01300935260951519, + "num_input_tokens_seen": 76148400, + "step": 4650, + "train_runtime": 37785.5731, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 2.818787878787879, + "grad_norm": 0.005442952737212181, + "learning_rate": 8.252447112125889e-05, + "loss": 0.010733041912317276, + "num_input_tokens_seen": 76164776, + "step": 4651, + "train_runtime": 37793.6834, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.8193939393939393, + "grad_norm": 0.006607817951589823, + "learning_rate": 8.251716691487074e-05, + "loss": 0.011935989372432232, + "num_input_tokens_seen": 76181152, + "step": 4652, + "train_runtime": 37801.7968, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.82, + "grad_norm": 0.006429760251194239, + "learning_rate": 8.250986150573592e-05, + "loss": 0.013100754469633102, + "num_input_tokens_seen": 76197528, + "step": 4653, + "train_runtime": 37809.9113, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.8206060606060603, + "grad_norm": 0.015638621523976326, + "learning_rate": 8.250255489412463e-05, + "loss": 0.012257112190127373, + "num_input_tokens_seen": 76213904, + "step": 4654, + "train_runtime": 37818.0228, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.8212121212121213, + "grad_norm": 0.004669523332268, + "learning_rate": 8.249524708030712e-05, + "loss": 0.012286133132874966, + "num_input_tokens_seen": 76230280, + "step": 4655, + "train_runtime": 37826.1346, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.821818181818182, + "grad_norm": 0.004024967085570097, + "learning_rate": 8.24879380645537e-05, + "loss": 0.011852612718939781, + "num_input_tokens_seen": 76246656, + "step": 4656, + "train_runtime": 37834.2477, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.8224242424242423, + "grad_norm": 0.0016817516880109906, + "learning_rate": 8.248062784713472e-05, + "loss": 0.01089123822748661, + "num_input_tokens_seen": 76263032, + "step": 4657, + "train_runtime": 37842.3664, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.8230303030303032, + "grad_norm": 0.004179390147328377, + "learning_rate": 8.247331642832059e-05, + "loss": 0.012344965711236, + "num_input_tokens_seen": 76279408, + "step": 4658, + "train_runtime": 37850.4795, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.8236363636363637, + "grad_norm": 0.006849796045571566, + "learning_rate": 8.24660038083817e-05, + "loss": 0.012040671892464161, + "num_input_tokens_seen": 76295784, + "step": 4659, + "train_runtime": 37858.5931, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.824242424242424, + "grad_norm": 0.008180524222552776, + "learning_rate": 8.245868998758856e-05, + "loss": 0.011144166812300682, + "num_input_tokens_seen": 76312160, + "step": 4660, + "train_runtime": 37866.7084, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.8248484848484847, + "grad_norm": 0.008782410994172096, + "learning_rate": 8.245137496621169e-05, + "loss": 0.011486460454761982, + "num_input_tokens_seen": 76328536, + "step": 4661, + "train_runtime": 37874.8317, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.825454545454545, + "grad_norm": 0.022730255499482155, + "learning_rate": 8.244405874452166e-05, + "loss": 0.011689424514770508, + "num_input_tokens_seen": 76344912, + "step": 4662, + "train_runtime": 37882.9458, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.826060606060606, + "grad_norm": 0.011878054589033127, + "learning_rate": 8.243674132278908e-05, + "loss": 0.012257306836545467, + "num_input_tokens_seen": 76361288, + "step": 4663, + "train_runtime": 37891.0653, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.8266666666666667, + "grad_norm": 0.008010772988200188, + "learning_rate": 8.24294227012846e-05, + "loss": 0.01344395987689495, + "num_input_tokens_seen": 76377664, + "step": 4664, + "train_runtime": 37899.1779, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.827272727272727, + "grad_norm": 0.00795338861644268, + "learning_rate": 8.242210288027893e-05, + "loss": 0.012415559031069279, + "num_input_tokens_seen": 76394040, + "step": 4665, + "train_runtime": 37907.2866, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.827878787878788, + "grad_norm": 0.008723371662199497, + "learning_rate": 8.24147818600428e-05, + "loss": 0.01241840049624443, + "num_input_tokens_seen": 76410416, + "step": 4666, + "train_runtime": 37915.3996, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.8284848484848486, + "grad_norm": 0.0056407395750284195, + "learning_rate": 8.240745964084703e-05, + "loss": 0.012028871104121208, + "num_input_tokens_seen": 76426792, + "step": 4667, + "train_runtime": 37923.5155, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.829090909090909, + "grad_norm": 0.0066760084591805935, + "learning_rate": 8.240013622296243e-05, + "loss": 0.011801970191299915, + "num_input_tokens_seen": 76443168, + "step": 4668, + "train_runtime": 37931.6316, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.8296969696969696, + "grad_norm": 0.004957540892064571, + "learning_rate": 8.239281160665991e-05, + "loss": 0.012320438399910927, + "num_input_tokens_seen": 76459544, + "step": 4669, + "train_runtime": 37939.7457, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.83030303030303, + "grad_norm": 0.007193710654973984, + "learning_rate": 8.238548579221034e-05, + "loss": 0.012795445509254932, + "num_input_tokens_seen": 76475920, + "step": 4670, + "train_runtime": 37947.8604, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.830909090909091, + "grad_norm": 0.00474508898332715, + "learning_rate": 8.237815877988472e-05, + "loss": 0.012654460035264492, + "num_input_tokens_seen": 76492296, + "step": 4671, + "train_runtime": 37955.9767, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.8315151515151515, + "grad_norm": 0.020529473200440407, + "learning_rate": 8.237083056995408e-05, + "loss": 0.01226736232638359, + "num_input_tokens_seen": 76508672, + "step": 4672, + "train_runtime": 37964.0899, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.832121212121212, + "grad_norm": 0.011525917798280716, + "learning_rate": 8.236350116268944e-05, + "loss": 0.012446033768355846, + "num_input_tokens_seen": 76525048, + "step": 4673, + "train_runtime": 37972.2004, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.832727272727273, + "grad_norm": 0.006290186662226915, + "learning_rate": 8.235617055836193e-05, + "loss": 0.012279867194592953, + "num_input_tokens_seen": 76541424, + "step": 4674, + "train_runtime": 37980.3187, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 0.006467217113822699, + "learning_rate": 8.234883875724269e-05, + "loss": 0.012418560683727264, + "num_input_tokens_seen": 76557800, + "step": 4675, + "train_runtime": 37988.4376, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.833939393939394, + "grad_norm": 0.012123103253543377, + "learning_rate": 8.234150575960288e-05, + "loss": 0.013292765244841576, + "num_input_tokens_seen": 76574176, + "step": 4676, + "train_runtime": 37996.5522, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 2.8345454545454545, + "grad_norm": 0.007779979612678289, + "learning_rate": 8.233417156571377e-05, + "loss": 0.012682919390499592, + "num_input_tokens_seen": 76590552, + "step": 4677, + "train_runtime": 38004.6667, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 2.835151515151515, + "grad_norm": 0.009410477243363857, + "learning_rate": 8.23268361758466e-05, + "loss": 0.01235941145569086, + "num_input_tokens_seen": 76606928, + "step": 4678, + "train_runtime": 38012.7858, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.835757575757576, + "grad_norm": 0.008335885591804981, + "learning_rate": 8.231949959027272e-05, + "loss": 0.013082320801913738, + "num_input_tokens_seen": 76623304, + "step": 4679, + "train_runtime": 38020.9033, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.8363636363636364, + "grad_norm": 0.007242868654429913, + "learning_rate": 8.23121618092635e-05, + "loss": 0.011379387229681015, + "num_input_tokens_seen": 76639680, + "step": 4680, + "train_runtime": 38029.018, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 2.836969696969697, + "grad_norm": 0.006332886405289173, + "learning_rate": 8.230482283309035e-05, + "loss": 0.011234107427299023, + "num_input_tokens_seen": 76656056, + "step": 4681, + "train_runtime": 38037.1362, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 2.8375757575757574, + "grad_norm": 0.008837784640491009, + "learning_rate": 8.229748266202469e-05, + "loss": 0.013117042370140553, + "num_input_tokens_seen": 76672432, + "step": 4682, + "train_runtime": 38045.2545, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.838181818181818, + "grad_norm": 0.0059628235176205635, + "learning_rate": 8.229014129633805e-05, + "loss": 0.011711148545145988, + "num_input_tokens_seen": 76688808, + "step": 4683, + "train_runtime": 38053.3752, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.838787878787879, + "grad_norm": 0.01094250287860632, + "learning_rate": 8.228279873630198e-05, + "loss": 0.012190528213977814, + "num_input_tokens_seen": 76705184, + "step": 4684, + "train_runtime": 38061.4916, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.8393939393939394, + "grad_norm": 0.007205680478364229, + "learning_rate": 8.227545498218804e-05, + "loss": 0.011979609727859497, + "num_input_tokens_seen": 76721560, + "step": 4685, + "train_runtime": 38069.6098, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.84, + "grad_norm": 0.007192616350948811, + "learning_rate": 8.226811003426788e-05, + "loss": 0.0111123938113451, + "num_input_tokens_seen": 76737936, + "step": 4686, + "train_runtime": 38077.7327, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.840606060606061, + "grad_norm": 0.008314704522490501, + "learning_rate": 8.226076389281316e-05, + "loss": 0.011709689162671566, + "num_input_tokens_seen": 76754312, + "step": 4687, + "train_runtime": 38085.8467, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.8412121212121213, + "grad_norm": 0.007341733667999506, + "learning_rate": 8.225341655809562e-05, + "loss": 0.012364760972559452, + "num_input_tokens_seen": 76770688, + "step": 4688, + "train_runtime": 38093.9596, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.841818181818182, + "grad_norm": 0.007388652767986059, + "learning_rate": 8.224606803038699e-05, + "loss": 0.01227816753089428, + "num_input_tokens_seen": 76787064, + "step": 4689, + "train_runtime": 38102.0778, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 2.8424242424242423, + "grad_norm": 0.008731914684176445, + "learning_rate": 8.223871830995913e-05, + "loss": 0.012175517156720161, + "num_input_tokens_seen": 76803440, + "step": 4690, + "train_runtime": 38110.1913, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 2.843030303030303, + "grad_norm": 0.00467828381806612, + "learning_rate": 8.223136739708383e-05, + "loss": 0.011725027114152908, + "num_input_tokens_seen": 76819816, + "step": 4691, + "train_runtime": 38118.3055, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 2.8436363636363637, + "grad_norm": 0.008670373819768429, + "learning_rate": 8.222401529203304e-05, + "loss": 0.013265066780149937, + "num_input_tokens_seen": 76836192, + "step": 4692, + "train_runtime": 38126.4164, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 2.8442424242424242, + "grad_norm": 0.007325474638491869, + "learning_rate": 8.221666199507867e-05, + "loss": 0.011616285890340805, + "num_input_tokens_seen": 76852568, + "step": 4693, + "train_runtime": 38134.5355, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 2.8448484848484847, + "grad_norm": 0.0021694747265428305, + "learning_rate": 8.220930750649272e-05, + "loss": 0.012336771935224533, + "num_input_tokens_seen": 76868944, + "step": 4694, + "train_runtime": 38142.648, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.8454545454545457, + "grad_norm": 0.006484622601419687, + "learning_rate": 8.22019518265472e-05, + "loss": 0.011436006054282188, + "num_input_tokens_seen": 76885320, + "step": 4695, + "train_runtime": 38150.7627, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.846060606060606, + "grad_norm": 0.010106910951435566, + "learning_rate": 8.219459495551421e-05, + "loss": 0.01265712734311819, + "num_input_tokens_seen": 76901696, + "step": 4696, + "train_runtime": 38158.8779, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.8466666666666667, + "grad_norm": 0.009206676855683327, + "learning_rate": 8.218723689366583e-05, + "loss": 0.011726567521691322, + "num_input_tokens_seen": 76918072, + "step": 4697, + "train_runtime": 38166.9932, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.847272727272727, + "grad_norm": 0.007140909321606159, + "learning_rate": 8.217987764127424e-05, + "loss": 0.012025251053273678, + "num_input_tokens_seen": 76934448, + "step": 4698, + "train_runtime": 38175.1048, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 2.8478787878787877, + "grad_norm": 0.01144883967936039, + "learning_rate": 8.217251719861164e-05, + "loss": 0.01268355455249548, + "num_input_tokens_seen": 76950824, + "step": 4699, + "train_runtime": 38183.2169, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.8484848484848486, + "grad_norm": 0.007549237459897995, + "learning_rate": 8.21651555659503e-05, + "loss": 0.012479782104492188, + "num_input_tokens_seen": 76967200, + "step": 4700, + "train_runtime": 38191.3342, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.849090909090909, + "grad_norm": 0.012838720344007015, + "learning_rate": 8.215779274356248e-05, + "loss": 0.013429594226181507, + "num_input_tokens_seen": 76983576, + "step": 4701, + "train_runtime": 38200.3352, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 2.8496969696969696, + "grad_norm": 0.007976187393069267, + "learning_rate": 8.215042873172054e-05, + "loss": 0.012263092212378979, + "num_input_tokens_seen": 76999952, + "step": 4702, + "train_runtime": 38208.4494, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 2.8503030303030306, + "grad_norm": 0.0065808966755867004, + "learning_rate": 8.214306353069685e-05, + "loss": 0.010553563013672829, + "num_input_tokens_seen": 77016328, + "step": 4703, + "train_runtime": 38216.5666, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 2.850909090909091, + "grad_norm": 0.009572876617312431, + "learning_rate": 8.213569714076383e-05, + "loss": 0.012312974780797958, + "num_input_tokens_seen": 77032704, + "step": 4704, + "train_runtime": 38224.6806, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 2.8515151515151516, + "grad_norm": 0.005114784464240074, + "learning_rate": 8.212832956219397e-05, + "loss": 0.013319691643118858, + "num_input_tokens_seen": 77049080, + "step": 4705, + "train_runtime": 38232.7984, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 2.852121212121212, + "grad_norm": 0.009916257113218307, + "learning_rate": 8.212096079525974e-05, + "loss": 0.012637803331017494, + "num_input_tokens_seen": 77065456, + "step": 4706, + "train_runtime": 38240.9114, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 2.8527272727272726, + "grad_norm": 0.008782883174717426, + "learning_rate": 8.211359084023373e-05, + "loss": 0.011659450829029083, + "num_input_tokens_seen": 77081832, + "step": 4707, + "train_runtime": 38249.0346, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 2.8533333333333335, + "grad_norm": 0.006448367144912481, + "learning_rate": 8.210621969738854e-05, + "loss": 0.013088960200548172, + "num_input_tokens_seen": 77098208, + "step": 4708, + "train_runtime": 38257.1533, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 2.853939393939394, + "grad_norm": 0.006844471208751202, + "learning_rate": 8.209884736699681e-05, + "loss": 0.012313634157180786, + "num_input_tokens_seen": 77114584, + "step": 4709, + "train_runtime": 38265.2702, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 2.8545454545454545, + "grad_norm": 0.010077044367790222, + "learning_rate": 8.209147384933123e-05, + "loss": 0.012333729304373264, + "num_input_tokens_seen": 77130960, + "step": 4710, + "train_runtime": 38273.3846, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 2.855151515151515, + "grad_norm": 0.004618333652615547, + "learning_rate": 8.20840991446645e-05, + "loss": 0.011696845293045044, + "num_input_tokens_seen": 77147336, + "step": 4711, + "train_runtime": 38281.5046, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 2.8557575757575755, + "grad_norm": 0.006597245577722788, + "learning_rate": 8.207672325326945e-05, + "loss": 0.012270568870007992, + "num_input_tokens_seen": 77163712, + "step": 4712, + "train_runtime": 38289.6217, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 2.8563636363636364, + "grad_norm": 0.005862189922481775, + "learning_rate": 8.206934617541887e-05, + "loss": 0.012216673232614994, + "num_input_tokens_seen": 77180088, + "step": 4713, + "train_runtime": 38297.7357, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 2.856969696969697, + "grad_norm": 0.006318623665720224, + "learning_rate": 8.206196791138562e-05, + "loss": 0.011606366373598576, + "num_input_tokens_seen": 77196464, + "step": 4714, + "train_runtime": 38305.847, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 2.8575757575757574, + "grad_norm": 0.006274382583796978, + "learning_rate": 8.205458846144263e-05, + "loss": 0.013790265657007694, + "num_input_tokens_seen": 77212840, + "step": 4715, + "train_runtime": 38313.9644, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 2.8581818181818184, + "grad_norm": 0.004328856244683266, + "learning_rate": 8.204720782586281e-05, + "loss": 0.012001307681202888, + "num_input_tokens_seen": 77229216, + "step": 4716, + "train_runtime": 38322.0812, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 2.858787878787879, + "grad_norm": 0.005298782605677843, + "learning_rate": 8.203982600491921e-05, + "loss": 0.011159601621329784, + "num_input_tokens_seen": 77245592, + "step": 4717, + "train_runtime": 38330.1958, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 2.8593939393939394, + "grad_norm": 0.02102711983025074, + "learning_rate": 8.203244299888481e-05, + "loss": 0.013127142563462257, + "num_input_tokens_seen": 77261968, + "step": 4718, + "train_runtime": 38338.3103, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 2.86, + "grad_norm": 0.00992545485496521, + "learning_rate": 8.202505880803275e-05, + "loss": 0.012790859676897526, + "num_input_tokens_seen": 77278344, + "step": 4719, + "train_runtime": 38346.4333, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 2.8606060606060604, + "grad_norm": 0.00433537969365716, + "learning_rate": 8.201767343263612e-05, + "loss": 0.011160307563841343, + "num_input_tokens_seen": 77294720, + "step": 4720, + "train_runtime": 38354.5508, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 2.8612121212121213, + "grad_norm": 0.0033427646849304438, + "learning_rate": 8.20102868729681e-05, + "loss": 0.012500651180744171, + "num_input_tokens_seen": 77311096, + "step": 4721, + "train_runtime": 38362.6683, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 2.861818181818182, + "grad_norm": 0.00928401667624712, + "learning_rate": 8.200289912930191e-05, + "loss": 0.012834717519581318, + "num_input_tokens_seen": 77327472, + "step": 4722, + "train_runtime": 38370.7814, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 2.8624242424242423, + "grad_norm": 0.00823894888162613, + "learning_rate": 8.19955102019108e-05, + "loss": 0.012038108892738819, + "num_input_tokens_seen": 77343848, + "step": 4723, + "train_runtime": 38378.8979, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 2.8630303030303033, + "grad_norm": 0.006094546057283878, + "learning_rate": 8.198812009106809e-05, + "loss": 0.012487877160310745, + "num_input_tokens_seen": 77360224, + "step": 4724, + "train_runtime": 38387.0147, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 2.8636363636363638, + "grad_norm": 0.01044407393783331, + "learning_rate": 8.19807287970471e-05, + "loss": 0.012458986602723598, + "num_input_tokens_seen": 77376600, + "step": 4725, + "train_runtime": 38395.132, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 2.8642424242424243, + "grad_norm": 0.003974167630076408, + "learning_rate": 8.197333632012123e-05, + "loss": 0.011355073191225529, + "num_input_tokens_seen": 77392976, + "step": 4726, + "train_runtime": 38403.2464, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 2.8648484848484848, + "grad_norm": 0.006210191175341606, + "learning_rate": 8.196594266056392e-05, + "loss": 0.01205118466168642, + "num_input_tokens_seen": 77409352, + "step": 4727, + "train_runtime": 38411.3606, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 2.8654545454545453, + "grad_norm": 0.010924349538981915, + "learning_rate": 8.195854781864864e-05, + "loss": 0.01274740882217884, + "num_input_tokens_seen": 77425728, + "step": 4728, + "train_runtime": 38419.4751, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 2.866060606060606, + "grad_norm": 0.03383970260620117, + "learning_rate": 8.19511517946489e-05, + "loss": 0.011538989841938019, + "num_input_tokens_seen": 77442104, + "step": 4729, + "train_runtime": 38427.5873, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 0.018625345081090927, + "learning_rate": 8.19437545888383e-05, + "loss": 0.012814061716198921, + "num_input_tokens_seen": 77458480, + "step": 4730, + "train_runtime": 38435.7009, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.867272727272727, + "grad_norm": 0.006600679364055395, + "learning_rate": 8.19363562014904e-05, + "loss": 0.010461091995239258, + "num_input_tokens_seen": 77474856, + "step": 4731, + "train_runtime": 38443.8141, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.867878787878788, + "grad_norm": 0.00991960521787405, + "learning_rate": 8.192895663287889e-05, + "loss": 0.01293308474123478, + "num_input_tokens_seen": 77491232, + "step": 4732, + "train_runtime": 38451.9312, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.8684848484848486, + "grad_norm": 0.0070592002011835575, + "learning_rate": 8.192155588327747e-05, + "loss": 0.011132686398923397, + "num_input_tokens_seen": 77507608, + "step": 4733, + "train_runtime": 38460.047, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.869090909090909, + "grad_norm": 0.011658566072583199, + "learning_rate": 8.191415395295985e-05, + "loss": 0.014341931790113449, + "num_input_tokens_seen": 77523984, + "step": 4734, + "train_runtime": 38468.1703, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.8696969696969696, + "grad_norm": 0.00843893550336361, + "learning_rate": 8.190675084219981e-05, + "loss": 0.01196410320699215, + "num_input_tokens_seen": 77540360, + "step": 4735, + "train_runtime": 38476.2817, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 2.87030303030303, + "grad_norm": 0.006252128630876541, + "learning_rate": 8.189934655127121e-05, + "loss": 0.011650530621409416, + "num_input_tokens_seen": 77556736, + "step": 4736, + "train_runtime": 38484.396, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 2.870909090909091, + "grad_norm": 0.007708635646849871, + "learning_rate": 8.189194108044788e-05, + "loss": 0.012655800208449364, + "num_input_tokens_seen": 77573112, + "step": 4737, + "train_runtime": 38492.5173, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.8715151515151516, + "grad_norm": 0.006693357136100531, + "learning_rate": 8.18845344300038e-05, + "loss": 0.011742263101041317, + "num_input_tokens_seen": 77589488, + "step": 4738, + "train_runtime": 38500.6322, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.872121212121212, + "grad_norm": 0.008778940886259079, + "learning_rate": 8.187712660021285e-05, + "loss": 0.011361917480826378, + "num_input_tokens_seen": 77605864, + "step": 4739, + "train_runtime": 38508.7455, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.8727272727272726, + "grad_norm": 0.008899624459445477, + "learning_rate": 8.186971759134907e-05, + "loss": 0.01167591568082571, + "num_input_tokens_seen": 77622240, + "step": 4740, + "train_runtime": 38516.8626, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.873333333333333, + "grad_norm": 0.008621900342404842, + "learning_rate": 8.186230740368649e-05, + "loss": 0.013102307915687561, + "num_input_tokens_seen": 77638616, + "step": 4741, + "train_runtime": 38524.9777, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.873939393939394, + "grad_norm": 0.004962034057825804, + "learning_rate": 8.185489603749922e-05, + "loss": 0.012572520412504673, + "num_input_tokens_seen": 77654992, + "step": 4742, + "train_runtime": 38533.0934, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.8745454545454545, + "grad_norm": 0.00656279968097806, + "learning_rate": 8.184748349306137e-05, + "loss": 0.012523069977760315, + "num_input_tokens_seen": 77671368, + "step": 4743, + "train_runtime": 38541.205, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.875151515151515, + "grad_norm": 0.006848558783531189, + "learning_rate": 8.184006977064715e-05, + "loss": 0.012846555560827255, + "num_input_tokens_seen": 77687744, + "step": 4744, + "train_runtime": 38549.3193, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.875757575757576, + "grad_norm": 0.0042163049802184105, + "learning_rate": 8.183265487053074e-05, + "loss": 0.010707122273743153, + "num_input_tokens_seen": 77704120, + "step": 4745, + "train_runtime": 38557.4368, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.8763636363636365, + "grad_norm": 0.012756328098475933, + "learning_rate": 8.18252387929864e-05, + "loss": 0.012497348710894585, + "num_input_tokens_seen": 77720496, + "step": 4746, + "train_runtime": 38565.5482, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.876969696969697, + "grad_norm": 0.006287388503551483, + "learning_rate": 8.181782153828848e-05, + "loss": 0.011731589213013649, + "num_input_tokens_seen": 77736872, + "step": 4747, + "train_runtime": 38573.6617, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.8775757575757575, + "grad_norm": 0.007682175375521183, + "learning_rate": 8.181040310671129e-05, + "loss": 0.01188613660633564, + "num_input_tokens_seen": 77753248, + "step": 4748, + "train_runtime": 38581.7777, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.878181818181818, + "grad_norm": 0.0071826111525297165, + "learning_rate": 8.180298349852924e-05, + "loss": 0.012171470560133457, + "num_input_tokens_seen": 77769624, + "step": 4749, + "train_runtime": 38589.89, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.878787878787879, + "grad_norm": 0.007872107438743114, + "learning_rate": 8.179556271401677e-05, + "loss": 0.012316541746258736, + "num_input_tokens_seen": 77786000, + "step": 4750, + "train_runtime": 38598.0045, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.8793939393939394, + "grad_norm": 0.03132949024438858, + "learning_rate": 8.178814075344836e-05, + "loss": 0.011205391958355904, + "num_input_tokens_seen": 77802376, + "step": 4751, + "train_runtime": 38606.1159, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.88, + "grad_norm": 0.010916111059486866, + "learning_rate": 8.178071761709851e-05, + "loss": 0.012511285953223705, + "num_input_tokens_seen": 77818752, + "step": 4752, + "train_runtime": 38614.2321, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.880606060606061, + "grad_norm": 0.008171255700290203, + "learning_rate": 8.177329330524182e-05, + "loss": 0.013458529487252235, + "num_input_tokens_seen": 77835128, + "step": 4753, + "train_runtime": 38622.3445, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.8812121212121213, + "grad_norm": 0.00846005417406559, + "learning_rate": 8.176586781815287e-05, + "loss": 0.01138240471482277, + "num_input_tokens_seen": 77851504, + "step": 4754, + "train_runtime": 38630.4532, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.881818181818182, + "grad_norm": 0.007493291515856981, + "learning_rate": 8.175844115610634e-05, + "loss": 0.011661755852401257, + "num_input_tokens_seen": 77867880, + "step": 4755, + "train_runtime": 38638.5644, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.8824242424242423, + "grad_norm": 0.009785952046513557, + "learning_rate": 8.175101331937693e-05, + "loss": 0.012685212306678295, + "num_input_tokens_seen": 77884256, + "step": 4756, + "train_runtime": 38646.6747, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.883030303030303, + "grad_norm": 0.009249714203178883, + "learning_rate": 8.174358430823935e-05, + "loss": 0.012614551931619644, + "num_input_tokens_seen": 77900632, + "step": 4757, + "train_runtime": 38654.7896, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.8836363636363638, + "grad_norm": 0.008339757099747658, + "learning_rate": 8.173615412296841e-05, + "loss": 0.012358537875115871, + "num_input_tokens_seen": 77917008, + "step": 4758, + "train_runtime": 38662.9003, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.8842424242424243, + "grad_norm": 0.009891746565699577, + "learning_rate": 8.172872276383893e-05, + "loss": 0.012062979862093925, + "num_input_tokens_seen": 77933384, + "step": 4759, + "train_runtime": 38671.0131, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.8848484848484848, + "grad_norm": 0.009022560901939869, + "learning_rate": 8.172129023112581e-05, + "loss": 0.011840518563985825, + "num_input_tokens_seen": 77949760, + "step": 4760, + "train_runtime": 38679.1319, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.8854545454545457, + "grad_norm": 0.01085427962243557, + "learning_rate": 8.17138565251039e-05, + "loss": 0.012974469922482967, + "num_input_tokens_seen": 77966136, + "step": 4761, + "train_runtime": 38687.2447, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 2.886060606060606, + "grad_norm": 0.005396164488047361, + "learning_rate": 8.170642164604823e-05, + "loss": 0.011595631949603558, + "num_input_tokens_seen": 77982512, + "step": 4762, + "train_runtime": 38695.359, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.8866666666666667, + "grad_norm": 0.0065277437679469585, + "learning_rate": 8.169898559423375e-05, + "loss": 0.012015747837722301, + "num_input_tokens_seen": 77998888, + "step": 4763, + "train_runtime": 38703.4741, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.887272727272727, + "grad_norm": 0.01171449851244688, + "learning_rate": 8.169154836993551e-05, + "loss": 0.01328394003212452, + "num_input_tokens_seen": 78015264, + "step": 4764, + "train_runtime": 38711.585, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 2.8878787878787877, + "grad_norm": 0.007599621079862118, + "learning_rate": 8.168410997342864e-05, + "loss": 0.012459292076528072, + "num_input_tokens_seen": 78031640, + "step": 4765, + "train_runtime": 38719.6969, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.8884848484848487, + "grad_norm": 0.0036309475544840097, + "learning_rate": 8.167667040498823e-05, + "loss": 0.011952606029808521, + "num_input_tokens_seen": 78048016, + "step": 4766, + "train_runtime": 38727.8079, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.889090909090909, + "grad_norm": 0.009230728261172771, + "learning_rate": 8.166922966488948e-05, + "loss": 0.011952600441873074, + "num_input_tokens_seen": 78064392, + "step": 4767, + "train_runtime": 38735.9225, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.8896969696969697, + "grad_norm": 0.00602208124473691, + "learning_rate": 8.166178775340758e-05, + "loss": 0.011412234045565128, + "num_input_tokens_seen": 78080768, + "step": 4768, + "train_runtime": 38744.0347, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.89030303030303, + "grad_norm": 0.005236285272985697, + "learning_rate": 8.165434467081783e-05, + "loss": 0.011104105971753597, + "num_input_tokens_seen": 78097144, + "step": 4769, + "train_runtime": 38752.149, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.8909090909090907, + "grad_norm": 0.004033911973237991, + "learning_rate": 8.16469004173955e-05, + "loss": 0.012001413851976395, + "num_input_tokens_seen": 78113520, + "step": 4770, + "train_runtime": 38760.2628, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 2.8915151515151516, + "grad_norm": 0.0100289611145854, + "learning_rate": 8.163945499341596e-05, + "loss": 0.012626633048057556, + "num_input_tokens_seen": 78129896, + "step": 4771, + "train_runtime": 38768.3766, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 2.892121212121212, + "grad_norm": 0.014887494966387749, + "learning_rate": 8.163200839915459e-05, + "loss": 0.012779121287167072, + "num_input_tokens_seen": 78146272, + "step": 4772, + "train_runtime": 38776.4881, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 2.8927272727272726, + "grad_norm": 0.007808137219399214, + "learning_rate": 8.162456063488684e-05, + "loss": 0.012779667973518372, + "num_input_tokens_seen": 78162648, + "step": 4773, + "train_runtime": 38784.5999, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 2.8933333333333335, + "grad_norm": 0.00499767204746604, + "learning_rate": 8.161711170088818e-05, + "loss": 0.0115958321839571, + "num_input_tokens_seen": 78179024, + "step": 4774, + "train_runtime": 38792.7129, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.893939393939394, + "grad_norm": 0.005851836409419775, + "learning_rate": 8.16096615974341e-05, + "loss": 0.011336658149957657, + "num_input_tokens_seen": 78195400, + "step": 4775, + "train_runtime": 38800.8314, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.8945454545454545, + "grad_norm": 0.026062481105327606, + "learning_rate": 8.160221032480021e-05, + "loss": 0.01450158841907978, + "num_input_tokens_seen": 78211776, + "step": 4776, + "train_runtime": 38808.9442, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.895151515151515, + "grad_norm": 0.001216348959133029, + "learning_rate": 8.159475788326212e-05, + "loss": 0.010902187786996365, + "num_input_tokens_seen": 78228152, + "step": 4777, + "train_runtime": 38817.061, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.8957575757575755, + "grad_norm": 0.006366488989442587, + "learning_rate": 8.158730427309544e-05, + "loss": 0.011272700503468513, + "num_input_tokens_seen": 78244528, + "step": 4778, + "train_runtime": 38825.1769, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 2.8963636363636365, + "grad_norm": 0.006954401731491089, + "learning_rate": 8.15798494945759e-05, + "loss": 0.012355746701359749, + "num_input_tokens_seen": 78260904, + "step": 4779, + "train_runtime": 38833.2889, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.896969696969697, + "grad_norm": 0.007805840112268925, + "learning_rate": 8.157239354797924e-05, + "loss": 0.01202874630689621, + "num_input_tokens_seen": 78277280, + "step": 4780, + "train_runtime": 38841.4, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.8975757575757575, + "grad_norm": 0.007524395361542702, + "learning_rate": 8.156493643358121e-05, + "loss": 0.011659018695354462, + "num_input_tokens_seen": 78293656, + "step": 4781, + "train_runtime": 38849.51, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 2.8981818181818184, + "grad_norm": 0.009618330746889114, + "learning_rate": 8.155747815165765e-05, + "loss": 0.012802318669855595, + "num_input_tokens_seen": 78310032, + "step": 4782, + "train_runtime": 38857.6221, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 2.898787878787879, + "grad_norm": 0.005547967739403248, + "learning_rate": 8.155001870248443e-05, + "loss": 0.012005164287984371, + "num_input_tokens_seen": 78326408, + "step": 4783, + "train_runtime": 38865.7322, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 2.8993939393939394, + "grad_norm": 0.01106483768671751, + "learning_rate": 8.154255808633746e-05, + "loss": 0.011981705203652382, + "num_input_tokens_seen": 78342784, + "step": 4784, + "train_runtime": 38873.8428, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 2.9, + "grad_norm": 0.007454793434590101, + "learning_rate": 8.15350963034927e-05, + "loss": 0.011833623051643372, + "num_input_tokens_seen": 78359160, + "step": 4785, + "train_runtime": 38881.9575, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 2.9006060606060604, + "grad_norm": 0.0029768566600978374, + "learning_rate": 8.152763335422613e-05, + "loss": 0.011730967089533806, + "num_input_tokens_seen": 78375536, + "step": 4786, + "train_runtime": 38890.0719, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 2.9012121212121214, + "grad_norm": 0.012138666585087776, + "learning_rate": 8.15201692388138e-05, + "loss": 0.013305027969181538, + "num_input_tokens_seen": 78391912, + "step": 4787, + "train_runtime": 38898.1847, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 2.901818181818182, + "grad_norm": 0.006695140153169632, + "learning_rate": 8.15127039575318e-05, + "loss": 0.012170565314590931, + "num_input_tokens_seen": 78408288, + "step": 4788, + "train_runtime": 38906.2973, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 2.9024242424242424, + "grad_norm": 0.003637703601270914, + "learning_rate": 8.150523751065624e-05, + "loss": 0.011070928536355495, + "num_input_tokens_seen": 78424664, + "step": 4789, + "train_runtime": 38914.4127, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 2.9030303030303033, + "grad_norm": 0.007801847532391548, + "learning_rate": 8.149776989846331e-05, + "loss": 0.012279287911951542, + "num_input_tokens_seen": 78441040, + "step": 4790, + "train_runtime": 38922.5324, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 2.903636363636364, + "grad_norm": 0.008295193314552307, + "learning_rate": 8.14903011212292e-05, + "loss": 0.011995763517916203, + "num_input_tokens_seen": 78457416, + "step": 4791, + "train_runtime": 38930.6431, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 2.9042424242424243, + "grad_norm": 0.008223768323659897, + "learning_rate": 8.14828311792302e-05, + "loss": 0.01296326145529747, + "num_input_tokens_seen": 78473792, + "step": 4792, + "train_runtime": 38938.7573, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 2.904848484848485, + "grad_norm": 0.00802955124527216, + "learning_rate": 8.147536007274255e-05, + "loss": 0.012813668698072433, + "num_input_tokens_seen": 78490168, + "step": 4793, + "train_runtime": 38946.8676, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 2.9054545454545453, + "grad_norm": 0.009117009118199348, + "learning_rate": 8.146788780204263e-05, + "loss": 0.012199170887470245, + "num_input_tokens_seen": 78506544, + "step": 4794, + "train_runtime": 38954.9773, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 2.9060606060606062, + "grad_norm": 0.009650224819779396, + "learning_rate": 8.146041436740684e-05, + "loss": 0.01164478063583374, + "num_input_tokens_seen": 78522920, + "step": 4795, + "train_runtime": 38963.087, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 2.9066666666666667, + "grad_norm": 0.006193290930241346, + "learning_rate": 8.145293976911158e-05, + "loss": 0.01110462099313736, + "num_input_tokens_seen": 78539296, + "step": 4796, + "train_runtime": 38971.2007, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 2.9072727272727272, + "grad_norm": 0.0059146336279809475, + "learning_rate": 8.144546400743334e-05, + "loss": 0.012170787900686264, + "num_input_tokens_seen": 78555672, + "step": 4797, + "train_runtime": 38979.3125, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 2.9078787878787877, + "grad_norm": 0.01286760251969099, + "learning_rate": 8.143798708264861e-05, + "loss": 0.012644006870687008, + "num_input_tokens_seen": 78572048, + "step": 4798, + "train_runtime": 38987.4313, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 2.9084848484848482, + "grad_norm": 0.011959551833570004, + "learning_rate": 8.143050899503396e-05, + "loss": 0.012189952656626701, + "num_input_tokens_seen": 78588424, + "step": 4799, + "train_runtime": 38995.5447, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 2.909090909090909, + "grad_norm": 0.015677666291594505, + "learning_rate": 8.1423029744866e-05, + "loss": 0.012034375220537186, + "num_input_tokens_seen": 78604800, + "step": 4800, + "train_runtime": 39003.6597, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 2.9096969696969697, + "grad_norm": 0.006909128278493881, + "learning_rate": 8.141554933242135e-05, + "loss": 0.011400085873901844, + "num_input_tokens_seen": 78621176, + "step": 4801, + "train_runtime": 39012.6548, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.91030303030303, + "grad_norm": 0.009060295298695564, + "learning_rate": 8.140806775797671e-05, + "loss": 0.013769303448498249, + "num_input_tokens_seen": 78637552, + "step": 4802, + "train_runtime": 39020.765, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 2.910909090909091, + "grad_norm": 0.002323293825611472, + "learning_rate": 8.140058502180883e-05, + "loss": 0.012060626409947872, + "num_input_tokens_seen": 78653928, + "step": 4803, + "train_runtime": 39028.8784, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 2.9115151515151516, + "grad_norm": 0.005302889738231897, + "learning_rate": 8.139310112419444e-05, + "loss": 0.011929575353860855, + "num_input_tokens_seen": 78670304, + "step": 4804, + "train_runtime": 39036.9906, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.912121212121212, + "grad_norm": 0.010831132531166077, + "learning_rate": 8.138561606541038e-05, + "loss": 0.011364113539457321, + "num_input_tokens_seen": 78686680, + "step": 4805, + "train_runtime": 39045.1096, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.9127272727272726, + "grad_norm": 0.00674369977787137, + "learning_rate": 8.13781298457335e-05, + "loss": 0.012235518544912338, + "num_input_tokens_seen": 78703056, + "step": 4806, + "train_runtime": 39053.2324, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 2.913333333333333, + "grad_norm": 0.0003000107826665044, + "learning_rate": 8.13706424654407e-05, + "loss": 0.010876612737774849, + "num_input_tokens_seen": 78719432, + "step": 4807, + "train_runtime": 39061.3464, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 2.913939393939394, + "grad_norm": 0.005810308735817671, + "learning_rate": 8.136315392480893e-05, + "loss": 0.012176680378615856, + "num_input_tokens_seen": 78735808, + "step": 4808, + "train_runtime": 39069.4602, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.9145454545454546, + "grad_norm": 0.008763106539845467, + "learning_rate": 8.135566422411519e-05, + "loss": 0.01323007419705391, + "num_input_tokens_seen": 78752184, + "step": 4809, + "train_runtime": 39077.5759, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 2.915151515151515, + "grad_norm": 0.006949743255972862, + "learning_rate": 8.134817336363647e-05, + "loss": 0.012575346976518631, + "num_input_tokens_seen": 78768560, + "step": 4810, + "train_runtime": 39085.6945, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.915757575757576, + "grad_norm": 0.007601077202707529, + "learning_rate": 8.134068134364987e-05, + "loss": 0.011841082945466042, + "num_input_tokens_seen": 78784936, + "step": 4811, + "train_runtime": 39093.8106, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.9163636363636365, + "grad_norm": 0.006447995081543922, + "learning_rate": 8.133318816443251e-05, + "loss": 0.011303157545626163, + "num_input_tokens_seen": 78801312, + "step": 4812, + "train_runtime": 39101.9327, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 2.916969696969697, + "grad_norm": 0.006306002847850323, + "learning_rate": 8.132569382626154e-05, + "loss": 0.01237834244966507, + "num_input_tokens_seen": 78817688, + "step": 4813, + "train_runtime": 39110.0454, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.9175757575757575, + "grad_norm": 0.007324448321014643, + "learning_rate": 8.131819832941414e-05, + "loss": 0.012380668893456459, + "num_input_tokens_seen": 78834064, + "step": 4814, + "train_runtime": 39118.1631, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 2.918181818181818, + "grad_norm": 0.010003463365137577, + "learning_rate": 8.131070167416759e-05, + "loss": 0.012885022908449173, + "num_input_tokens_seen": 78850440, + "step": 4815, + "train_runtime": 39126.2777, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.918787878787879, + "grad_norm": 0.006504611577838659, + "learning_rate": 8.130320386079915e-05, + "loss": 0.012051417492330074, + "num_input_tokens_seen": 78866816, + "step": 4816, + "train_runtime": 39134.3955, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 2.9193939393939394, + "grad_norm": 0.006056876853108406, + "learning_rate": 8.129570488958618e-05, + "loss": 0.011992553249001503, + "num_input_tokens_seen": 78883192, + "step": 4817, + "train_runtime": 39142.5071, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.92, + "grad_norm": 0.011380886659026146, + "learning_rate": 8.1288204760806e-05, + "loss": 0.011975511908531189, + "num_input_tokens_seen": 78899568, + "step": 4818, + "train_runtime": 39150.6179, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.920606060606061, + "grad_norm": 0.006082882639020681, + "learning_rate": 8.128070347473609e-05, + "loss": 0.012980255298316479, + "num_input_tokens_seen": 78915944, + "step": 4819, + "train_runtime": 39158.7327, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.9212121212121214, + "grad_norm": 0.004800500348210335, + "learning_rate": 8.127320103165384e-05, + "loss": 0.011077520437538624, + "num_input_tokens_seen": 78932320, + "step": 4820, + "train_runtime": 39166.8462, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.921818181818182, + "grad_norm": 0.0017740195617079735, + "learning_rate": 8.126569743183681e-05, + "loss": 0.012449763715267181, + "num_input_tokens_seen": 78948696, + "step": 4821, + "train_runtime": 39174.9594, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.9224242424242424, + "grad_norm": 0.008002673275768757, + "learning_rate": 8.125819267556252e-05, + "loss": 0.012877354398369789, + "num_input_tokens_seen": 78965072, + "step": 4822, + "train_runtime": 39183.0783, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.923030303030303, + "grad_norm": 0.006293389480561018, + "learning_rate": 8.125068676310854e-05, + "loss": 0.010820485651493073, + "num_input_tokens_seen": 78981448, + "step": 4823, + "train_runtime": 39191.1946, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.923636363636364, + "grad_norm": 0.007727582007646561, + "learning_rate": 8.124317969475252e-05, + "loss": 0.012851117178797722, + "num_input_tokens_seen": 78997824, + "step": 4824, + "train_runtime": 39199.3066, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.9242424242424243, + "grad_norm": 0.010304388590157032, + "learning_rate": 8.123567147077214e-05, + "loss": 0.011944583617150784, + "num_input_tokens_seen": 79014200, + "step": 4825, + "train_runtime": 39207.4198, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.924848484848485, + "grad_norm": 0.008436611853539944, + "learning_rate": 8.122816209144509e-05, + "loss": 0.013189258053898811, + "num_input_tokens_seen": 79030576, + "step": 4826, + "train_runtime": 39215.5323, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.9254545454545453, + "grad_norm": 0.00815647654235363, + "learning_rate": 8.122065155704912e-05, + "loss": 0.012783393263816833, + "num_input_tokens_seen": 79046952, + "step": 4827, + "train_runtime": 39223.6452, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.926060606060606, + "grad_norm": 0.006351114716380835, + "learning_rate": 8.121313986786208e-05, + "loss": 0.011633126065135002, + "num_input_tokens_seen": 79063328, + "step": 4828, + "train_runtime": 39231.7557, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.9266666666666667, + "grad_norm": 0.009325454942882061, + "learning_rate": 8.120562702416175e-05, + "loss": 0.011887112632393837, + "num_input_tokens_seen": 79079704, + "step": 4829, + "train_runtime": 39239.8698, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.9272727272727272, + "grad_norm": 0.0056626941077411175, + "learning_rate": 8.119811302622608e-05, + "loss": 0.011641982942819595, + "num_input_tokens_seen": 79096080, + "step": 4830, + "train_runtime": 39247.9844, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.9278787878787877, + "grad_norm": 0.009380260482430458, + "learning_rate": 8.119059787433294e-05, + "loss": 0.011709613725543022, + "num_input_tokens_seen": 79112456, + "step": 4831, + "train_runtime": 39256.1021, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.9284848484848487, + "grad_norm": 0.010623188689351082, + "learning_rate": 8.118308156876033e-05, + "loss": 0.0120775755494833, + "num_input_tokens_seen": 79128832, + "step": 4832, + "train_runtime": 39264.2144, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.929090909090909, + "grad_norm": 0.006411305163055658, + "learning_rate": 8.117556410978626e-05, + "loss": 0.012766589410603046, + "num_input_tokens_seen": 79145208, + "step": 4833, + "train_runtime": 39272.3332, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.9296969696969697, + "grad_norm": 0.0071541788056492805, + "learning_rate": 8.11680454976888e-05, + "loss": 0.011429469101130962, + "num_input_tokens_seen": 79161584, + "step": 4834, + "train_runtime": 39280.4443, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.93030303030303, + "grad_norm": 0.009859742596745491, + "learning_rate": 8.116052573274602e-05, + "loss": 0.013847287744283676, + "num_input_tokens_seen": 79177960, + "step": 4835, + "train_runtime": 39288.5574, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 2.9309090909090907, + "grad_norm": 0.0038820153567939997, + "learning_rate": 8.115300481523609e-05, + "loss": 0.011019338853657246, + "num_input_tokens_seen": 79194336, + "step": 4836, + "train_runtime": 39296.6712, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.9315151515151516, + "grad_norm": 0.010395511984825134, + "learning_rate": 8.114548274543717e-05, + "loss": 0.013552706688642502, + "num_input_tokens_seen": 79210712, + "step": 4837, + "train_runtime": 39304.7888, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.932121212121212, + "grad_norm": 0.0057226382195949554, + "learning_rate": 8.113795952362748e-05, + "loss": 0.012806729413568974, + "num_input_tokens_seen": 79227088, + "step": 4838, + "train_runtime": 39312.9002, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 2.9327272727272726, + "grad_norm": 0.005786038935184479, + "learning_rate": 8.113043515008532e-05, + "loss": 0.011644753627479076, + "num_input_tokens_seen": 79243464, + "step": 4839, + "train_runtime": 39321.0128, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 0.006874296814203262, + "learning_rate": 8.1122909625089e-05, + "loss": 0.012462293729186058, + "num_input_tokens_seen": 79259840, + "step": 4840, + "train_runtime": 39329.1321, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.933939393939394, + "grad_norm": 0.0042829355224967, + "learning_rate": 8.111538294891684e-05, + "loss": 0.012841212563216686, + "num_input_tokens_seen": 79276216, + "step": 4841, + "train_runtime": 39337.246, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.9345454545454546, + "grad_norm": 0.003657143795862794, + "learning_rate": 8.110785512184727e-05, + "loss": 0.010463301092386246, + "num_input_tokens_seen": 79292592, + "step": 4842, + "train_runtime": 39345.356, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.935151515151515, + "grad_norm": 0.008298623375594616, + "learning_rate": 8.110032614415872e-05, + "loss": 0.012673629447817802, + "num_input_tokens_seen": 79308968, + "step": 4843, + "train_runtime": 39353.4707, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.9357575757575756, + "grad_norm": 0.009869730100035667, + "learning_rate": 8.109279601612967e-05, + "loss": 0.011228544637560844, + "num_input_tokens_seen": 79325344, + "step": 4844, + "train_runtime": 39361.5846, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 2.9363636363636365, + "grad_norm": 0.004542497918009758, + "learning_rate": 8.108526473803863e-05, + "loss": 0.012027280405163765, + "num_input_tokens_seen": 79341720, + "step": 4845, + "train_runtime": 39369.6971, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 2.936969696969697, + "grad_norm": 0.010077869519591331, + "learning_rate": 8.10777323101642e-05, + "loss": 0.012924620881676674, + "num_input_tokens_seen": 79358096, + "step": 4846, + "train_runtime": 39377.8097, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 2.9375757575757575, + "grad_norm": 0.007604667916893959, + "learning_rate": 8.107019873278498e-05, + "loss": 0.012456710450351238, + "num_input_tokens_seen": 79374472, + "step": 4847, + "train_runtime": 39385.9192, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 2.9381818181818184, + "grad_norm": 0.007579051423817873, + "learning_rate": 8.106266400617961e-05, + "loss": 0.012052253820002079, + "num_input_tokens_seen": 79390848, + "step": 4848, + "train_runtime": 39394.032, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 2.938787878787879, + "grad_norm": 0.0078118774108588696, + "learning_rate": 8.105512813062678e-05, + "loss": 0.012326090596616268, + "num_input_tokens_seen": 79407224, + "step": 4849, + "train_runtime": 39402.1477, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.9393939393939394, + "grad_norm": 0.009636261500418186, + "learning_rate": 8.104759110640524e-05, + "loss": 0.012556592002511024, + "num_input_tokens_seen": 79423600, + "step": 4850, + "train_runtime": 39410.2603, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.94, + "grad_norm": 0.004140492994338274, + "learning_rate": 8.104005293379378e-05, + "loss": 0.011732570827007294, + "num_input_tokens_seen": 79439976, + "step": 4851, + "train_runtime": 39418.3741, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.9406060606060604, + "grad_norm": 0.010788233019411564, + "learning_rate": 8.103251361307119e-05, + "loss": 0.013359260745346546, + "num_input_tokens_seen": 79456352, + "step": 4852, + "train_runtime": 39426.4875, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 2.9412121212121214, + "grad_norm": 0.008047981187701225, + "learning_rate": 8.102497314451637e-05, + "loss": 0.010942239314317703, + "num_input_tokens_seen": 79472728, + "step": 4853, + "train_runtime": 39434.6006, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 2.941818181818182, + "grad_norm": 0.006586496718227863, + "learning_rate": 8.101743152840821e-05, + "loss": 0.012033998966217041, + "num_input_tokens_seen": 79489104, + "step": 4854, + "train_runtime": 39442.7152, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.9424242424242424, + "grad_norm": 0.010820439085364342, + "learning_rate": 8.100988876502566e-05, + "loss": 0.013370683416724205, + "num_input_tokens_seen": 79505480, + "step": 4855, + "train_runtime": 39450.8311, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 2.943030303030303, + "grad_norm": 0.007953312247991562, + "learning_rate": 8.100234485464771e-05, + "loss": 0.01249429676681757, + "num_input_tokens_seen": 79521856, + "step": 4856, + "train_runtime": 39458.9451, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 2.9436363636363634, + "grad_norm": 0.007280191406607628, + "learning_rate": 8.09947997975534e-05, + "loss": 0.01287197694182396, + "num_input_tokens_seen": 79538232, + "step": 4857, + "train_runtime": 39467.058, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 2.9442424242424243, + "grad_norm": 0.006243347655981779, + "learning_rate": 8.09872535940218e-05, + "loss": 0.013017448596656322, + "num_input_tokens_seen": 79554608, + "step": 4858, + "train_runtime": 39475.1743, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 2.944848484848485, + "grad_norm": 0.009019375778734684, + "learning_rate": 8.097970624433204e-05, + "loss": 0.010967589914798737, + "num_input_tokens_seen": 79570984, + "step": 4859, + "train_runtime": 39483.2917, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 2.9454545454545453, + "grad_norm": 0.009026739746332169, + "learning_rate": 8.097215774876328e-05, + "loss": 0.011725298129022121, + "num_input_tokens_seen": 79587360, + "step": 4860, + "train_runtime": 39491.4085, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 2.9460606060606063, + "grad_norm": 0.002666132990270853, + "learning_rate": 8.096460810759472e-05, + "loss": 0.011624579317867756, + "num_input_tokens_seen": 79603736, + "step": 4861, + "train_runtime": 39499.5205, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 2.9466666666666668, + "grad_norm": 0.006255737040191889, + "learning_rate": 8.095705732110559e-05, + "loss": 0.01150699332356453, + "num_input_tokens_seen": 79620112, + "step": 4862, + "train_runtime": 39507.6329, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 2.9472727272727273, + "grad_norm": 0.006555578205734491, + "learning_rate": 8.094950538957521e-05, + "loss": 0.011662416160106659, + "num_input_tokens_seen": 79636488, + "step": 4863, + "train_runtime": 39515.7455, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 2.9478787878787878, + "grad_norm": 0.0077291373163461685, + "learning_rate": 8.09419523132829e-05, + "loss": 0.011693781241774559, + "num_input_tokens_seen": 79652864, + "step": 4864, + "train_runtime": 39523.8571, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 2.9484848484848483, + "grad_norm": 0.012921378947794437, + "learning_rate": 8.093439809250802e-05, + "loss": 0.012114960700273514, + "num_input_tokens_seen": 79669240, + "step": 4865, + "train_runtime": 39531.9702, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 2.949090909090909, + "grad_norm": 0.009752864949405193, + "learning_rate": 8.092684272753002e-05, + "loss": 0.011265935376286507, + "num_input_tokens_seen": 79685616, + "step": 4866, + "train_runtime": 39540.0839, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 2.9496969696969697, + "grad_norm": 0.00840400718152523, + "learning_rate": 8.091928621862831e-05, + "loss": 0.010895718820393085, + "num_input_tokens_seen": 79701992, + "step": 4867, + "train_runtime": 39548.2012, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 2.95030303030303, + "grad_norm": 0.005635090172290802, + "learning_rate": 8.091172856608242e-05, + "loss": 0.011656675487756729, + "num_input_tokens_seen": 79718368, + "step": 4868, + "train_runtime": 39556.3159, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 2.950909090909091, + "grad_norm": 0.008805626071989536, + "learning_rate": 8.09041697701719e-05, + "loss": 0.01231548935174942, + "num_input_tokens_seen": 79734744, + "step": 4869, + "train_runtime": 39564.4368, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 2.9515151515151516, + "grad_norm": 0.009158898144960403, + "learning_rate": 8.089660983117631e-05, + "loss": 0.011996923014521599, + "num_input_tokens_seen": 79751120, + "step": 4870, + "train_runtime": 39572.5523, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 2.952121212121212, + "grad_norm": 0.010081618092954159, + "learning_rate": 8.088904874937528e-05, + "loss": 0.012205241248011589, + "num_input_tokens_seen": 79767496, + "step": 4871, + "train_runtime": 39580.6672, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 2.9527272727272726, + "grad_norm": 0.007465929724276066, + "learning_rate": 8.088148652504852e-05, + "loss": 0.012481342069804668, + "num_input_tokens_seen": 79783872, + "step": 4872, + "train_runtime": 39588.7827, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 2.953333333333333, + "grad_norm": 0.005115598905831575, + "learning_rate": 8.08739231584757e-05, + "loss": 0.011542865075170994, + "num_input_tokens_seen": 79800248, + "step": 4873, + "train_runtime": 39596.8968, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 2.953939393939394, + "grad_norm": 0.006625794339925051, + "learning_rate": 8.08663586499366e-05, + "loss": 0.01376316323876381, + "num_input_tokens_seen": 79816624, + "step": 4874, + "train_runtime": 39605.0105, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 2.9545454545454546, + "grad_norm": 0.007995427586138248, + "learning_rate": 8.085879299971097e-05, + "loss": 0.01280970312654972, + "num_input_tokens_seen": 79833000, + "step": 4875, + "train_runtime": 39613.1311, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 2.955151515151515, + "grad_norm": 0.01811821386218071, + "learning_rate": 8.08512262080787e-05, + "loss": 0.011927287094295025, + "num_input_tokens_seen": 79849376, + "step": 4876, + "train_runtime": 39621.2427, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 2.955757575757576, + "grad_norm": 0.0031975284218788147, + "learning_rate": 8.084365827531966e-05, + "loss": 0.012051818892359734, + "num_input_tokens_seen": 79865752, + "step": 4877, + "train_runtime": 39629.3572, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 2.9563636363636365, + "grad_norm": 0.0075789038091897964, + "learning_rate": 8.083608920171375e-05, + "loss": 0.012632708065211773, + "num_input_tokens_seen": 79882128, + "step": 4878, + "train_runtime": 39637.4753, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 2.956969696969697, + "grad_norm": 0.009403679519891739, + "learning_rate": 8.082851898754096e-05, + "loss": 0.012977367267012596, + "num_input_tokens_seen": 79898504, + "step": 4879, + "train_runtime": 39645.5878, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 2.9575757575757575, + "grad_norm": 0.008212127722799778, + "learning_rate": 8.082094763308129e-05, + "loss": 0.012115645222365856, + "num_input_tokens_seen": 79914880, + "step": 4880, + "train_runtime": 39653.7023, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 2.958181818181818, + "grad_norm": 0.011410173960030079, + "learning_rate": 8.081337513861478e-05, + "loss": 0.011854222975671291, + "num_input_tokens_seen": 79931256, + "step": 4881, + "train_runtime": 39661.8168, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 2.958787878787879, + "grad_norm": 0.0053351521492004395, + "learning_rate": 8.080580150442154e-05, + "loss": 0.010878185741603374, + "num_input_tokens_seen": 79947632, + "step": 4882, + "train_runtime": 39669.9314, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 2.9593939393939395, + "grad_norm": 0.012433327734470367, + "learning_rate": 8.079822673078169e-05, + "loss": 0.012915851548314095, + "num_input_tokens_seen": 79964008, + "step": 4883, + "train_runtime": 39678.0495, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 2.96, + "grad_norm": 0.009064363315701485, + "learning_rate": 8.079065081797542e-05, + "loss": 0.012236394919455051, + "num_input_tokens_seen": 79980384, + "step": 4884, + "train_runtime": 39686.1625, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 2.9606060606060605, + "grad_norm": 0.007341083604842424, + "learning_rate": 8.07830737662829e-05, + "loss": 0.0127581050619483, + "num_input_tokens_seen": 79996760, + "step": 4885, + "train_runtime": 39694.282, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 2.961212121212121, + "grad_norm": 0.005861541721969843, + "learning_rate": 8.077549557598448e-05, + "loss": 0.0123793575912714, + "num_input_tokens_seen": 80013136, + "step": 4886, + "train_runtime": 39702.3973, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 2.961818181818182, + "grad_norm": 0.006770276464521885, + "learning_rate": 8.076791624736038e-05, + "loss": 0.011531973257660866, + "num_input_tokens_seen": 80029512, + "step": 4887, + "train_runtime": 39710.5115, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 2.9624242424242424, + "grad_norm": 0.0085370521992445, + "learning_rate": 8.076033578069099e-05, + "loss": 0.011185248382389545, + "num_input_tokens_seen": 80045888, + "step": 4888, + "train_runtime": 39718.6321, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 2.963030303030303, + "grad_norm": 0.00585033418610692, + "learning_rate": 8.075275417625667e-05, + "loss": 0.011876201257109642, + "num_input_tokens_seen": 80062264, + "step": 4889, + "train_runtime": 39726.7471, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 2.963636363636364, + "grad_norm": 0.007977165281772614, + "learning_rate": 8.074517143433786e-05, + "loss": 0.012217435985803604, + "num_input_tokens_seen": 80078640, + "step": 4890, + "train_runtime": 39734.8615, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 2.9642424242424243, + "grad_norm": 0.00461649801582098, + "learning_rate": 8.073758755521505e-05, + "loss": 0.010575024411082268, + "num_input_tokens_seen": 80095016, + "step": 4891, + "train_runtime": 39742.9741, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 2.964848484848485, + "grad_norm": 0.007153033744543791, + "learning_rate": 8.073000253916873e-05, + "loss": 0.012415817938745022, + "num_input_tokens_seen": 80111392, + "step": 4892, + "train_runtime": 39751.0914, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 2.9654545454545453, + "grad_norm": 0.009921128861606121, + "learning_rate": 8.072241638647944e-05, + "loss": 0.011550546623766422, + "num_input_tokens_seen": 80127768, + "step": 4893, + "train_runtime": 39759.2059, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 2.966060606060606, + "grad_norm": 0.010931876488029957, + "learning_rate": 8.071482909742782e-05, + "loss": 0.013049537315964699, + "num_input_tokens_seen": 80144144, + "step": 4894, + "train_runtime": 39767.3176, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 2.966666666666667, + "grad_norm": 0.006309812422841787, + "learning_rate": 8.070724067229448e-05, + "loss": 0.012073435820639133, + "num_input_tokens_seen": 80160520, + "step": 4895, + "train_runtime": 39775.4312, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 2.9672727272727273, + "grad_norm": 0.011260004714131355, + "learning_rate": 8.06996511113601e-05, + "loss": 0.0120348846539855, + "num_input_tokens_seen": 80176896, + "step": 4896, + "train_runtime": 39783.5464, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 2.967878787878788, + "grad_norm": 0.010533444583415985, + "learning_rate": 8.069206041490542e-05, + "loss": 0.013703427277505398, + "num_input_tokens_seen": 80193272, + "step": 4897, + "train_runtime": 39791.6605, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 2.9684848484848487, + "grad_norm": 0.006889041513204575, + "learning_rate": 8.068446858321119e-05, + "loss": 0.012217766605317593, + "num_input_tokens_seen": 80209648, + "step": 4898, + "train_runtime": 39799.7775, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 2.9690909090909092, + "grad_norm": 0.006927069276571274, + "learning_rate": 8.067687561655822e-05, + "loss": 0.012239620089530945, + "num_input_tokens_seen": 80226024, + "step": 4899, + "train_runtime": 39807.8954, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 2.9696969696969697, + "grad_norm": 0.015214423649013042, + "learning_rate": 8.066928151522737e-05, + "loss": 0.013368513435125351, + "num_input_tokens_seen": 80242400, + "step": 4900, + "train_runtime": 39816.0135, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 2.9703030303030302, + "grad_norm": 0.006322086323052645, + "learning_rate": 8.066168627949952e-05, + "loss": 0.011778680607676506, + "num_input_tokens_seen": 80258776, + "step": 4901, + "train_runtime": 39825.0802, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 2.9709090909090907, + "grad_norm": 0.010962334461510181, + "learning_rate": 8.06540899096556e-05, + "loss": 0.013227182440459728, + "num_input_tokens_seen": 80275152, + "step": 4902, + "train_runtime": 39833.1898, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 2.9715151515151517, + "grad_norm": 0.016706550493836403, + "learning_rate": 8.064649240597659e-05, + "loss": 0.013346428982913494, + "num_input_tokens_seen": 80291528, + "step": 4903, + "train_runtime": 39841.2995, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.972121212121212, + "grad_norm": 0.006449924781918526, + "learning_rate": 8.06388937687435e-05, + "loss": 0.011918066069483757, + "num_input_tokens_seen": 80307904, + "step": 4904, + "train_runtime": 39849.4144, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 2.9727272727272727, + "grad_norm": 0.006571634206920862, + "learning_rate": 8.063129399823741e-05, + "loss": 0.012702937237918377, + "num_input_tokens_seen": 80324280, + "step": 4905, + "train_runtime": 39857.5316, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.9733333333333336, + "grad_norm": 0.003930455073714256, + "learning_rate": 8.06236930947394e-05, + "loss": 0.011632885783910751, + "num_input_tokens_seen": 80340656, + "step": 4906, + "train_runtime": 39865.6464, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 2.973939393939394, + "grad_norm": 0.00680804206058383, + "learning_rate": 8.061609105853063e-05, + "loss": 0.01279873214662075, + "num_input_tokens_seen": 80357032, + "step": 4907, + "train_runtime": 39873.7684, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.9745454545454546, + "grad_norm": 0.009840059094130993, + "learning_rate": 8.060848788989226e-05, + "loss": 0.012677513062953949, + "num_input_tokens_seen": 80373408, + "step": 4908, + "train_runtime": 39881.8803, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 2.975151515151515, + "grad_norm": 0.010013958439230919, + "learning_rate": 8.060088358910554e-05, + "loss": 0.01314396783709526, + "num_input_tokens_seen": 80389784, + "step": 4909, + "train_runtime": 39889.9949, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 2.9757575757575756, + "grad_norm": 0.00861060805618763, + "learning_rate": 8.059327815645172e-05, + "loss": 0.013995667919516563, + "num_input_tokens_seen": 80406160, + "step": 4910, + "train_runtime": 39898.1058, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.9763636363636365, + "grad_norm": 0.0041717467829585075, + "learning_rate": 8.058567159221213e-05, + "loss": 0.012105030938982964, + "num_input_tokens_seen": 80422536, + "step": 4911, + "train_runtime": 39906.2315, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.976969696969697, + "grad_norm": 0.01032066997140646, + "learning_rate": 8.057806389666811e-05, + "loss": 0.012619595974683762, + "num_input_tokens_seen": 80438912, + "step": 4912, + "train_runtime": 39914.3462, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 2.9775757575757575, + "grad_norm": 0.008707588538527489, + "learning_rate": 8.057045507010104e-05, + "loss": 0.012493311427533627, + "num_input_tokens_seen": 80455288, + "step": 4913, + "train_runtime": 39922.4644, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.978181818181818, + "grad_norm": 0.008389385417103767, + "learning_rate": 8.056284511279237e-05, + "loss": 0.012712804600596428, + "num_input_tokens_seen": 80471664, + "step": 4914, + "train_runtime": 39930.5809, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 2.9787878787878785, + "grad_norm": 0.006027390249073505, + "learning_rate": 8.05552340250236e-05, + "loss": 0.011198298074305058, + "num_input_tokens_seen": 80488040, + "step": 4915, + "train_runtime": 39938.6921, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.9793939393939395, + "grad_norm": 0.008501900359988213, + "learning_rate": 8.05476218070762e-05, + "loss": 0.012907647527754307, + "num_input_tokens_seen": 80504416, + "step": 4916, + "train_runtime": 39946.8053, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 2.98, + "grad_norm": 0.010556991212069988, + "learning_rate": 8.054000845923178e-05, + "loss": 0.011356150731444359, + "num_input_tokens_seen": 80520792, + "step": 4917, + "train_runtime": 39954.9193, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 2.9806060606060605, + "grad_norm": 0.007868185639381409, + "learning_rate": 8.053239398177191e-05, + "loss": 0.012448453344404697, + "num_input_tokens_seen": 80537168, + "step": 4918, + "train_runtime": 39963.0319, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.9812121212121214, + "grad_norm": 0.005407453048974276, + "learning_rate": 8.052477837497825e-05, + "loss": 0.01242247223854065, + "num_input_tokens_seen": 80553544, + "step": 4919, + "train_runtime": 39971.1447, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 2.981818181818182, + "grad_norm": 0.0066650621592998505, + "learning_rate": 8.051716163913247e-05, + "loss": 0.012247301638126373, + "num_input_tokens_seen": 80569920, + "step": 4920, + "train_runtime": 39979.2558, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 2.9824242424242424, + "grad_norm": 0.006091345567256212, + "learning_rate": 8.050954377451634e-05, + "loss": 0.0118141183629632, + "num_input_tokens_seen": 80586296, + "step": 4921, + "train_runtime": 39987.3719, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.983030303030303, + "grad_norm": 0.015410438179969788, + "learning_rate": 8.050192478141157e-05, + "loss": 0.013607362285256386, + "num_input_tokens_seen": 80602672, + "step": 4922, + "train_runtime": 39995.4854, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 2.9836363636363634, + "grad_norm": 0.011519228108227253, + "learning_rate": 8.049430466010002e-05, + "loss": 0.012264905497431755, + "num_input_tokens_seen": 80619048, + "step": 4923, + "train_runtime": 40003.5957, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 2.9842424242424244, + "grad_norm": 0.004235079046338797, + "learning_rate": 8.048668341086352e-05, + "loss": 0.011862663552165031, + "num_input_tokens_seen": 80635424, + "step": 4924, + "train_runtime": 40011.7089, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.984848484848485, + "grad_norm": 0.005770577583462, + "learning_rate": 8.047906103398396e-05, + "loss": 0.012330323457717896, + "num_input_tokens_seen": 80651800, + "step": 4925, + "train_runtime": 40019.8213, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 2.9854545454545454, + "grad_norm": 0.004001193679869175, + "learning_rate": 8.047143752974331e-05, + "loss": 0.011662803590297699, + "num_input_tokens_seen": 80668176, + "step": 4926, + "train_runtime": 40027.9355, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 2.9860606060606063, + "grad_norm": 0.007290661334991455, + "learning_rate": 8.04638128984235e-05, + "loss": 0.012945979833602905, + "num_input_tokens_seen": 80684552, + "step": 4927, + "train_runtime": 40036.0488, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.986666666666667, + "grad_norm": 0.011220223270356655, + "learning_rate": 8.045618714030659e-05, + "loss": 0.013632332906126976, + "num_input_tokens_seen": 80700928, + "step": 4928, + "train_runtime": 40044.1605, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 2.9872727272727273, + "grad_norm": 0.00916143599897623, + "learning_rate": 8.044856025567463e-05, + "loss": 0.010160490870475769, + "num_input_tokens_seen": 80717304, + "step": 4929, + "train_runtime": 40052.2767, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 2.987878787878788, + "grad_norm": 0.011209206655621529, + "learning_rate": 8.044093224480973e-05, + "loss": 0.012202143669128418, + "num_input_tokens_seen": 80733680, + "step": 4930, + "train_runtime": 40060.3939, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 2.9884848484848483, + "grad_norm": 0.005773114040493965, + "learning_rate": 8.043330310799402e-05, + "loss": 0.012278302572667599, + "num_input_tokens_seen": 80750056, + "step": 4931, + "train_runtime": 40068.5064, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 2.9890909090909092, + "grad_norm": 0.009268639609217644, + "learning_rate": 8.042567284550969e-05, + "loss": 0.01146883424371481, + "num_input_tokens_seen": 80766432, + "step": 4932, + "train_runtime": 40076.618, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 2.9896969696969697, + "grad_norm": 0.00657606078311801, + "learning_rate": 8.041804145763897e-05, + "loss": 0.011813577264547348, + "num_input_tokens_seen": 80782808, + "step": 4933, + "train_runtime": 40084.7351, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 2.9903030303030302, + "grad_norm": 0.0068115731701254845, + "learning_rate": 8.041040894466414e-05, + "loss": 0.011404371820390224, + "num_input_tokens_seen": 80799184, + "step": 4934, + "train_runtime": 40092.8492, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.990909090909091, + "grad_norm": 0.007134865503758192, + "learning_rate": 8.04027753068675e-05, + "loss": 0.012304671108722687, + "num_input_tokens_seen": 80815560, + "step": 4935, + "train_runtime": 40100.9612, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 2.9915151515151512, + "grad_norm": 0.005618153139948845, + "learning_rate": 8.039514054453141e-05, + "loss": 0.01196444034576416, + "num_input_tokens_seen": 80831936, + "step": 4936, + "train_runtime": 40109.0797, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.992121212121212, + "grad_norm": 0.005986324045807123, + "learning_rate": 8.038750465793827e-05, + "loss": 0.012723533436655998, + "num_input_tokens_seen": 80848312, + "step": 4937, + "train_runtime": 40117.1939, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 2.9927272727272727, + "grad_norm": 0.006666071247309446, + "learning_rate": 8.037986764737049e-05, + "loss": 0.011652782559394836, + "num_input_tokens_seen": 80864688, + "step": 4938, + "train_runtime": 40125.3039, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 2.993333333333333, + "grad_norm": 0.00388430361635983, + "learning_rate": 8.037222951311059e-05, + "loss": 0.012460242956876755, + "num_input_tokens_seen": 80881064, + "step": 4939, + "train_runtime": 40133.4138, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.993939393939394, + "grad_norm": 0.00885312631726265, + "learning_rate": 8.036459025544105e-05, + "loss": 0.012784990482032299, + "num_input_tokens_seen": 80897440, + "step": 4940, + "train_runtime": 40141.533, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 2.9945454545454546, + "grad_norm": 0.01166592724621296, + "learning_rate": 8.035694987464446e-05, + "loss": 0.011826218105852604, + "num_input_tokens_seen": 80913816, + "step": 4941, + "train_runtime": 40149.6496, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 2.995151515151515, + "grad_norm": 0.009321312420070171, + "learning_rate": 8.03493083710034e-05, + "loss": 0.013048290275037289, + "num_input_tokens_seen": 80930192, + "step": 4942, + "train_runtime": 40157.7624, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 2.9957575757575756, + "grad_norm": 0.009328039363026619, + "learning_rate": 8.034166574480055e-05, + "loss": 0.011413088999688625, + "num_input_tokens_seen": 80946568, + "step": 4943, + "train_runtime": 40165.8773, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 2.996363636363636, + "grad_norm": 0.01060427911579609, + "learning_rate": 8.033402199631855e-05, + "loss": 0.013176953420042992, + "num_input_tokens_seen": 80962944, + "step": 4944, + "train_runtime": 40173.9929, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 2.996969696969697, + "grad_norm": 0.007758413441479206, + "learning_rate": 8.032637712584016e-05, + "loss": 0.012373056262731552, + "num_input_tokens_seen": 80979320, + "step": 4945, + "train_runtime": 40182.1105, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 2.9975757575757576, + "grad_norm": 0.007142862770706415, + "learning_rate": 8.031873113364814e-05, + "loss": 0.011989555321633816, + "num_input_tokens_seen": 80995696, + "step": 4946, + "train_runtime": 40190.2217, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 2.998181818181818, + "grad_norm": 0.016804706305265427, + "learning_rate": 8.03110840200253e-05, + "loss": 0.013811063021421432, + "num_input_tokens_seen": 81012072, + "step": 4947, + "train_runtime": 40198.3339, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 2.998787878787879, + "grad_norm": 0.0078121027909219265, + "learning_rate": 8.030343578525446e-05, + "loss": 0.012376299127936363, + "num_input_tokens_seen": 81028448, + "step": 4948, + "train_runtime": 40206.4506, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 2.9993939393939395, + "grad_norm": 0.008717039600014687, + "learning_rate": 8.029578642961857e-05, + "loss": 0.011752917431294918, + "num_input_tokens_seen": 81044824, + "step": 4949, + "train_runtime": 40214.5669, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.0, + "grad_norm": 0.011319301091134548, + "learning_rate": 8.028813595340053e-05, + "loss": 0.011984911747276783, + "num_input_tokens_seen": 81061200, + "step": 4950, + "train_runtime": 40222.6795, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.0006060606060605, + "grad_norm": 0.006529376842081547, + "learning_rate": 8.028048435688333e-05, + "loss": 0.011669652536511421, + "num_input_tokens_seen": 81077576, + "step": 4951, + "train_runtime": 40230.7975, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.001212121212121, + "grad_norm": 0.009932905435562134, + "learning_rate": 8.027283164035e-05, + "loss": 0.011645457707345486, + "num_input_tokens_seen": 81093952, + "step": 4952, + "train_runtime": 40238.914, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.001818181818182, + "grad_norm": 0.009132424369454384, + "learning_rate": 8.026517780408355e-05, + "loss": 0.012062609195709229, + "num_input_tokens_seen": 81110328, + "step": 4953, + "train_runtime": 40247.0328, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.0024242424242424, + "grad_norm": 0.007863885723054409, + "learning_rate": 8.025752284836713e-05, + "loss": 0.012253664433956146, + "num_input_tokens_seen": 81126704, + "step": 4954, + "train_runtime": 40255.146, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.003030303030303, + "grad_norm": 0.010760089382529259, + "learning_rate": 8.024986677348385e-05, + "loss": 0.011846562847495079, + "num_input_tokens_seen": 81143080, + "step": 4955, + "train_runtime": 40263.2639, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.0036363636363634, + "grad_norm": 0.007021928206086159, + "learning_rate": 8.024220957971693e-05, + "loss": 0.012105617672204971, + "num_input_tokens_seen": 81159456, + "step": 4956, + "train_runtime": 40271.3806, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.0042424242424244, + "grad_norm": 0.007369599305093288, + "learning_rate": 8.023455126734955e-05, + "loss": 0.012362487614154816, + "num_input_tokens_seen": 81175832, + "step": 4957, + "train_runtime": 40279.4957, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.004848484848485, + "grad_norm": 0.005767274647951126, + "learning_rate": 8.022689183666501e-05, + "loss": 0.011250068433582783, + "num_input_tokens_seen": 81192208, + "step": 4958, + "train_runtime": 40287.6103, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.0054545454545454, + "grad_norm": 0.0044665527530014515, + "learning_rate": 8.02192312879466e-05, + "loss": 0.011691495776176453, + "num_input_tokens_seen": 81208584, + "step": 4959, + "train_runtime": 40295.7318, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.006060606060606, + "grad_norm": 0.007468418683856726, + "learning_rate": 8.021156962147767e-05, + "loss": 0.012513428926467896, + "num_input_tokens_seen": 81224960, + "step": 4960, + "train_runtime": 40303.8452, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.006666666666667, + "grad_norm": 0.01171969622373581, + "learning_rate": 8.020390683754161e-05, + "loss": 0.011917298659682274, + "num_input_tokens_seen": 81241336, + "step": 4961, + "train_runtime": 40311.9603, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.0072727272727273, + "grad_norm": 0.007130028679966927, + "learning_rate": 8.019624293642186e-05, + "loss": 0.011978394351899624, + "num_input_tokens_seen": 81257712, + "step": 4962, + "train_runtime": 40320.0766, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.007878787878788, + "grad_norm": 0.004460524767637253, + "learning_rate": 8.018857791840188e-05, + "loss": 0.011985224671661854, + "num_input_tokens_seen": 81274088, + "step": 4963, + "train_runtime": 40328.1959, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.0084848484848483, + "grad_norm": 0.009568332694470882, + "learning_rate": 8.018091178376521e-05, + "loss": 0.013089433312416077, + "num_input_tokens_seen": 81290464, + "step": 4964, + "train_runtime": 40336.3125, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.0090909090909093, + "grad_norm": 0.00939208921045065, + "learning_rate": 8.017324453279539e-05, + "loss": 0.011690562590956688, + "num_input_tokens_seen": 81306840, + "step": 4965, + "train_runtime": 40344.4323, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.0096969696969698, + "grad_norm": 0.001349581521935761, + "learning_rate": 8.016557616577601e-05, + "loss": 0.011842044070363045, + "num_input_tokens_seen": 81323216, + "step": 4966, + "train_runtime": 40352.5448, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.0103030303030303, + "grad_norm": 0.01166871190071106, + "learning_rate": 8.01579066829907e-05, + "loss": 0.012193872593343258, + "num_input_tokens_seen": 81339592, + "step": 4967, + "train_runtime": 40360.6599, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.0109090909090908, + "grad_norm": 0.0067023844458162785, + "learning_rate": 8.015023608472317e-05, + "loss": 0.011534628458321095, + "num_input_tokens_seen": 81355968, + "step": 4968, + "train_runtime": 40368.7697, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.0115151515151517, + "grad_norm": 0.006843973882496357, + "learning_rate": 8.014256437125712e-05, + "loss": 0.012186324223876, + "num_input_tokens_seen": 81372344, + "step": 4969, + "train_runtime": 40376.8805, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.012121212121212, + "grad_norm": 0.003915082197636366, + "learning_rate": 8.01348915428763e-05, + "loss": 0.012047907337546349, + "num_input_tokens_seen": 81388720, + "step": 4970, + "train_runtime": 40384.9942, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.0127272727272727, + "grad_norm": 0.006388366222381592, + "learning_rate": 8.012721759986452e-05, + "loss": 0.01216863002628088, + "num_input_tokens_seen": 81405096, + "step": 4971, + "train_runtime": 40393.1054, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.013333333333333, + "grad_norm": 0.009649273939430714, + "learning_rate": 8.011954254250564e-05, + "loss": 0.012484976090490818, + "num_input_tokens_seen": 81421472, + "step": 4972, + "train_runtime": 40401.2162, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.013939393939394, + "grad_norm": 0.005378090776503086, + "learning_rate": 8.011186637108355e-05, + "loss": 0.012689990922808647, + "num_input_tokens_seen": 81437848, + "step": 4973, + "train_runtime": 40409.3347, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.0145454545454546, + "grad_norm": 0.03123796544969082, + "learning_rate": 8.010418908588216e-05, + "loss": 0.01335354708135128, + "num_input_tokens_seen": 81454224, + "step": 4974, + "train_runtime": 40417.4478, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.015151515151515, + "grad_norm": 0.008666237816214561, + "learning_rate": 8.009651068718542e-05, + "loss": 0.011685595847666264, + "num_input_tokens_seen": 81470600, + "step": 4975, + "train_runtime": 40425.5592, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.0157575757575756, + "grad_norm": 0.011991309933364391, + "learning_rate": 8.008883117527738e-05, + "loss": 0.012238707393407822, + "num_input_tokens_seen": 81486976, + "step": 4976, + "train_runtime": 40433.6715, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.0163636363636366, + "grad_norm": 0.004189320374280214, + "learning_rate": 8.008115055044208e-05, + "loss": 0.0124836890026927, + "num_input_tokens_seen": 81503352, + "step": 4977, + "train_runtime": 40441.7866, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.016969696969697, + "grad_norm": 0.008140149526298046, + "learning_rate": 8.007346881296357e-05, + "loss": 0.010785011574625969, + "num_input_tokens_seen": 81519728, + "step": 4978, + "train_runtime": 40449.905, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.0175757575757576, + "grad_norm": 0.003690504701808095, + "learning_rate": 8.006578596312604e-05, + "loss": 0.011542819440364838, + "num_input_tokens_seen": 81536104, + "step": 4979, + "train_runtime": 40458.0323, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.018181818181818, + "grad_norm": 0.00906476378440857, + "learning_rate": 8.005810200121363e-05, + "loss": 0.01341304462403059, + "num_input_tokens_seen": 81552480, + "step": 4980, + "train_runtime": 40466.1457, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.0187878787878786, + "grad_norm": 0.011695929802954197, + "learning_rate": 8.005041692751055e-05, + "loss": 0.013208205811679363, + "num_input_tokens_seen": 81568856, + "step": 4981, + "train_runtime": 40474.2589, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.0193939393939395, + "grad_norm": 0.008303516544401646, + "learning_rate": 8.004273074230108e-05, + "loss": 0.012137368321418762, + "num_input_tokens_seen": 81585232, + "step": 4982, + "train_runtime": 40482.3703, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.02, + "grad_norm": 0.008456442505121231, + "learning_rate": 8.00350434458695e-05, + "loss": 0.011716444976627827, + "num_input_tokens_seen": 81601608, + "step": 4983, + "train_runtime": 40490.4825, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.0206060606060605, + "grad_norm": 0.018605031073093414, + "learning_rate": 8.002735503850016e-05, + "loss": 0.012494084425270557, + "num_input_tokens_seen": 81617984, + "step": 4984, + "train_runtime": 40498.5954, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 3.021212121212121, + "grad_norm": 0.007609925698488951, + "learning_rate": 8.001966552047743e-05, + "loss": 0.011657200753688812, + "num_input_tokens_seen": 81634360, + "step": 4985, + "train_runtime": 40506.7089, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 3.021818181818182, + "grad_norm": 0.005220186896622181, + "learning_rate": 8.001197489208572e-05, + "loss": 0.012575463391840458, + "num_input_tokens_seen": 81650736, + "step": 4986, + "train_runtime": 40514.8319, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 3.0224242424242425, + "grad_norm": 0.0093778595328331, + "learning_rate": 8.000428315360953e-05, + "loss": 0.012079675681889057, + "num_input_tokens_seen": 81667112, + "step": 4987, + "train_runtime": 40522.9495, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 3.023030303030303, + "grad_norm": 0.01672329008579254, + "learning_rate": 7.999659030533331e-05, + "loss": 0.013024467043578625, + "num_input_tokens_seen": 81683488, + "step": 4988, + "train_runtime": 40531.0632, + "train_tokens_per_second": 2015.331 + }, + { + "epoch": 3.0236363636363635, + "grad_norm": 0.003408196149393916, + "learning_rate": 7.998889634754165e-05, + "loss": 0.009604381397366524, + "num_input_tokens_seen": 81699864, + "step": 4989, + "train_runtime": 40539.1787, + "train_tokens_per_second": 2015.331 + }, + { + "epoch": 3.0242424242424244, + "grad_norm": 0.0028907994274049997, + "learning_rate": 7.998120128051911e-05, + "loss": 0.011881847865879536, + "num_input_tokens_seen": 81716240, + "step": 4990, + "train_runtime": 40547.2928, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 3.024848484848485, + "grad_norm": 0.00538458488881588, + "learning_rate": 7.997350510455032e-05, + "loss": 0.011096558533608913, + "num_input_tokens_seen": 81732616, + "step": 4991, + "train_runtime": 40555.4048, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 3.0254545454545454, + "grad_norm": 0.006726040970534086, + "learning_rate": 7.996580781991996e-05, + "loss": 0.012707695364952087, + "num_input_tokens_seen": 81748992, + "step": 4992, + "train_runtime": 40563.519, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 3.026060606060606, + "grad_norm": 0.010692209005355835, + "learning_rate": 7.99581094269127e-05, + "loss": 0.011330877430737019, + "num_input_tokens_seen": 81765368, + "step": 4993, + "train_runtime": 40571.6357, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 3.026666666666667, + "grad_norm": 0.00690585607662797, + "learning_rate": 7.995040992581332e-05, + "loss": 0.012908353470265865, + "num_input_tokens_seen": 81781744, + "step": 4994, + "train_runtime": 40579.7476, + "train_tokens_per_second": 2015.334 + }, + { + "epoch": 3.0272727272727273, + "grad_norm": 0.00812698993831873, + "learning_rate": 7.994270931690662e-05, + "loss": 0.012667578645050526, + "num_input_tokens_seen": 81798120, + "step": 4995, + "train_runtime": 40587.862, + "train_tokens_per_second": 2015.335 + }, + { + "epoch": 3.027878787878788, + "grad_norm": 0.04256836697459221, + "learning_rate": 7.993500760047739e-05, + "loss": 0.012861201539635658, + "num_input_tokens_seen": 81814496, + "step": 4996, + "train_runtime": 40595.9746, + "train_tokens_per_second": 2015.335 + }, + { + "epoch": 3.0284848484848483, + "grad_norm": 0.008666256442666054, + "learning_rate": 7.992730477681054e-05, + "loss": 0.012526500038802624, + "num_input_tokens_seen": 81830872, + "step": 4997, + "train_runtime": 40604.0855, + "train_tokens_per_second": 2015.336 + }, + { + "epoch": 3.0290909090909093, + "grad_norm": 0.0030135237611830235, + "learning_rate": 7.991960084619096e-05, + "loss": 0.011259794235229492, + "num_input_tokens_seen": 81847248, + "step": 4998, + "train_runtime": 40612.2, + "train_tokens_per_second": 2015.336 + }, + { + "epoch": 3.0296969696969698, + "grad_norm": 0.007124700583517551, + "learning_rate": 7.991189580890362e-05, + "loss": 0.010558691807091236, + "num_input_tokens_seen": 81863624, + "step": 4999, + "train_runtime": 40620.313, + "train_tokens_per_second": 2015.337 + }, + { + "epoch": 3.0303030303030303, + "grad_norm": 0.004220771603286266, + "learning_rate": 7.99041896652335e-05, + "loss": 0.010593022219836712, + "num_input_tokens_seen": 81880000, + "step": 5000, + "train_runtime": 40628.4317, + "train_tokens_per_second": 2015.337 + }, + { + "epoch": 3.0309090909090908, + "grad_norm": 0.00907983724027872, + "learning_rate": 7.989648241546563e-05, + "loss": 0.013040842488408089, + "num_input_tokens_seen": 81896376, + "step": 5001, + "train_runtime": 40637.5883, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.0315151515151517, + "grad_norm": 0.008579259738326073, + "learning_rate": 7.988877405988511e-05, + "loss": 0.012871500104665756, + "num_input_tokens_seen": 81912752, + "step": 5002, + "train_runtime": 40645.696, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.032121212121212, + "grad_norm": 0.0076529341749846935, + "learning_rate": 7.988106459877703e-05, + "loss": 0.011429394595324993, + "num_input_tokens_seen": 81929128, + "step": 5003, + "train_runtime": 40653.8098, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.0327272727272727, + "grad_norm": 0.008507606573402882, + "learning_rate": 7.987335403242657e-05, + "loss": 0.012636275961995125, + "num_input_tokens_seen": 81945504, + "step": 5004, + "train_runtime": 40661.9322, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.033333333333333, + "grad_norm": 0.00982301402837038, + "learning_rate": 7.986564236111891e-05, + "loss": 0.013790562748908997, + "num_input_tokens_seen": 81961880, + "step": 5005, + "train_runtime": 40670.0498, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.033939393939394, + "grad_norm": 0.009134885855019093, + "learning_rate": 7.985792958513931e-05, + "loss": 0.01261051930487156, + "num_input_tokens_seen": 81978256, + "step": 5006, + "train_runtime": 40678.161, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.0345454545454547, + "grad_norm": 0.00475759943947196, + "learning_rate": 7.985021570477304e-05, + "loss": 0.013154814019799232, + "num_input_tokens_seen": 81994632, + "step": 5007, + "train_runtime": 40686.2751, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.035151515151515, + "grad_norm": 0.00726286880671978, + "learning_rate": 7.984250072030543e-05, + "loss": 0.011552652344107628, + "num_input_tokens_seen": 82011008, + "step": 5008, + "train_runtime": 40694.3911, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.0357575757575757, + "grad_norm": 0.006222061812877655, + "learning_rate": 7.983478463202181e-05, + "loss": 0.011257261037826538, + "num_input_tokens_seen": 82027384, + "step": 5009, + "train_runtime": 40702.5067, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.036363636363636, + "grad_norm": 0.007683869916945696, + "learning_rate": 7.982706744020763e-05, + "loss": 0.012328354641795158, + "num_input_tokens_seen": 82043760, + "step": 5010, + "train_runtime": 40710.6318, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.036969696969697, + "grad_norm": 0.006627171766012907, + "learning_rate": 7.981934914514829e-05, + "loss": 0.010838976129889488, + "num_input_tokens_seen": 82060136, + "step": 5011, + "train_runtime": 40718.7493, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.0375757575757576, + "grad_norm": 0.00595509959384799, + "learning_rate": 7.981162974712931e-05, + "loss": 0.011428937315940857, + "num_input_tokens_seen": 82076512, + "step": 5012, + "train_runtime": 40726.866, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.038181818181818, + "grad_norm": 0.010024442337453365, + "learning_rate": 7.98039092464362e-05, + "loss": 0.011992322281002998, + "num_input_tokens_seen": 82092888, + "step": 5013, + "train_runtime": 40734.9854, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.0387878787878786, + "grad_norm": 0.011965795420110226, + "learning_rate": 7.979618764335453e-05, + "loss": 0.011822822503745556, + "num_input_tokens_seen": 82109264, + "step": 5014, + "train_runtime": 40743.1049, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.0393939393939395, + "grad_norm": 0.005699925124645233, + "learning_rate": 7.978846493816989e-05, + "loss": 0.012378029525279999, + "num_input_tokens_seen": 82125640, + "step": 5015, + "train_runtime": 40751.2181, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.04, + "grad_norm": 0.010358860716223717, + "learning_rate": 7.978074113116796e-05, + "loss": 0.01181040983647108, + "num_input_tokens_seen": 82142016, + "step": 5016, + "train_runtime": 40759.3372, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.0406060606060605, + "grad_norm": 0.009625044651329517, + "learning_rate": 7.97730162226344e-05, + "loss": 0.012521148659288883, + "num_input_tokens_seen": 82158392, + "step": 5017, + "train_runtime": 40767.4525, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.041212121212121, + "grad_norm": 0.0033440766856074333, + "learning_rate": 7.976529021285496e-05, + "loss": 0.00979761965572834, + "num_input_tokens_seen": 82174768, + "step": 5018, + "train_runtime": 40775.5703, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.041818181818182, + "grad_norm": 0.010199353098869324, + "learning_rate": 7.97575631021154e-05, + "loss": 0.012486265040934086, + "num_input_tokens_seen": 82191144, + "step": 5019, + "train_runtime": 40783.6866, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.0424242424242425, + "grad_norm": 0.01120639406144619, + "learning_rate": 7.974983489070151e-05, + "loss": 0.012112337164580822, + "num_input_tokens_seen": 82207520, + "step": 5020, + "train_runtime": 40791.8044, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.043030303030303, + "grad_norm": 0.004718172829598188, + "learning_rate": 7.974210557889919e-05, + "loss": 0.011622084304690361, + "num_input_tokens_seen": 82223896, + "step": 5021, + "train_runtime": 40799.9314, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.0436363636363635, + "grad_norm": 0.002265132497996092, + "learning_rate": 7.973437516699429e-05, + "loss": 0.012322201393544674, + "num_input_tokens_seen": 82240272, + "step": 5022, + "train_runtime": 40808.05, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.0442424242424244, + "grad_norm": 0.007122958078980446, + "learning_rate": 7.972664365527276e-05, + "loss": 0.011614312417805195, + "num_input_tokens_seen": 82256648, + "step": 5023, + "train_runtime": 40816.1656, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.044848484848485, + "grad_norm": 0.006269804202020168, + "learning_rate": 7.971891104402058e-05, + "loss": 0.012160408310592175, + "num_input_tokens_seen": 82273024, + "step": 5024, + "train_runtime": 40824.2782, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.0454545454545454, + "grad_norm": 0.008944250643253326, + "learning_rate": 7.971117733352373e-05, + "loss": 0.012614576146006584, + "num_input_tokens_seen": 82289400, + "step": 5025, + "train_runtime": 40832.3917, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.046060606060606, + "grad_norm": 0.007353902328759432, + "learning_rate": 7.970344252406831e-05, + "loss": 0.011188640259206295, + "num_input_tokens_seen": 82305776, + "step": 5026, + "train_runtime": 40840.5099, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.046666666666667, + "grad_norm": 0.006395937409251928, + "learning_rate": 7.96957066159404e-05, + "loss": 0.011715516448020935, + "num_input_tokens_seen": 82322152, + "step": 5027, + "train_runtime": 40848.632, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.0472727272727274, + "grad_norm": 0.0033540395088493824, + "learning_rate": 7.968796960942613e-05, + "loss": 0.011196743696928024, + "num_input_tokens_seen": 82338528, + "step": 5028, + "train_runtime": 40856.7457, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.047878787878788, + "grad_norm": 0.004545119125396013, + "learning_rate": 7.968023150481168e-05, + "loss": 0.012470544315874577, + "num_input_tokens_seen": 82354904, + "step": 5029, + "train_runtime": 40864.8622, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.0484848484848484, + "grad_norm": 0.009499830193817616, + "learning_rate": 7.967249230238325e-05, + "loss": 0.012202820740640163, + "num_input_tokens_seen": 82371280, + "step": 5030, + "train_runtime": 40872.9807, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.0490909090909093, + "grad_norm": 0.008196836337447166, + "learning_rate": 7.966475200242713e-05, + "loss": 0.013460800051689148, + "num_input_tokens_seen": 82387656, + "step": 5031, + "train_runtime": 40881.1013, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.04969696969697, + "grad_norm": 0.008573905564844608, + "learning_rate": 7.965701060522958e-05, + "loss": 0.011132548563182354, + "num_input_tokens_seen": 82404032, + "step": 5032, + "train_runtime": 40889.2214, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.0503030303030303, + "grad_norm": 0.005743544083088636, + "learning_rate": 7.9649268111077e-05, + "loss": 0.012292323634028435, + "num_input_tokens_seen": 82420408, + "step": 5033, + "train_runtime": 40897.3373, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.050909090909091, + "grad_norm": 0.005372727755457163, + "learning_rate": 7.96415245202557e-05, + "loss": 0.011763568967580795, + "num_input_tokens_seen": 82436784, + "step": 5034, + "train_runtime": 40905.4566, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.0515151515151517, + "grad_norm": 0.00797323603183031, + "learning_rate": 7.963377983305214e-05, + "loss": 0.012424743734300137, + "num_input_tokens_seen": 82453160, + "step": 5035, + "train_runtime": 40913.5699, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.0521212121212122, + "grad_norm": 0.008463975042104721, + "learning_rate": 7.962603404975278e-05, + "loss": 0.011387032456696033, + "num_input_tokens_seen": 82469536, + "step": 5036, + "train_runtime": 40921.6866, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.0527272727272727, + "grad_norm": 0.009456364437937737, + "learning_rate": 7.96182871706441e-05, + "loss": 0.012254003435373306, + "num_input_tokens_seen": 82485912, + "step": 5037, + "train_runtime": 40929.8043, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.0533333333333332, + "grad_norm": 0.006122071761637926, + "learning_rate": 7.961053919601267e-05, + "loss": 0.011528786271810532, + "num_input_tokens_seen": 82502288, + "step": 5038, + "train_runtime": 40937.9315, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.0539393939393937, + "grad_norm": 0.006338039413094521, + "learning_rate": 7.960279012614507e-05, + "loss": 0.012046465650200844, + "num_input_tokens_seen": 82518664, + "step": 5039, + "train_runtime": 40946.0473, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.0545454545454547, + "grad_norm": 0.00785675086081028, + "learning_rate": 7.959503996132789e-05, + "loss": 0.011942458339035511, + "num_input_tokens_seen": 82535040, + "step": 5040, + "train_runtime": 40954.1617, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.055151515151515, + "grad_norm": 0.009984432719647884, + "learning_rate": 7.958728870184782e-05, + "loss": 0.012800004333257675, + "num_input_tokens_seen": 82551416, + "step": 5041, + "train_runtime": 40962.2793, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.0557575757575757, + "grad_norm": 0.007312713656574488, + "learning_rate": 7.957953634799158e-05, + "loss": 0.011666078120470047, + "num_input_tokens_seen": 82567792, + "step": 5042, + "train_runtime": 40970.3977, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.056363636363636, + "grad_norm": 0.010828329250216484, + "learning_rate": 7.957178290004586e-05, + "loss": 0.012796947732567787, + "num_input_tokens_seen": 82584168, + "step": 5043, + "train_runtime": 40978.5165, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.056969696969697, + "grad_norm": 0.008904154412448406, + "learning_rate": 7.956402835829751e-05, + "loss": 0.011869868263602257, + "num_input_tokens_seen": 82600544, + "step": 5044, + "train_runtime": 40986.635, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.0575757575757576, + "grad_norm": 0.003937427885830402, + "learning_rate": 7.95562727230333e-05, + "loss": 0.012504490092396736, + "num_input_tokens_seen": 82616920, + "step": 5045, + "train_runtime": 40994.7476, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.058181818181818, + "grad_norm": 0.006595636252313852, + "learning_rate": 7.954851599454014e-05, + "loss": 0.012058332562446594, + "num_input_tokens_seen": 82633296, + "step": 5046, + "train_runtime": 41002.8654, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.0587878787878786, + "grad_norm": 0.006277075037360191, + "learning_rate": 7.95407581731049e-05, + "loss": 0.011951112188398838, + "num_input_tokens_seen": 82649672, + "step": 5047, + "train_runtime": 41010.9855, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.0593939393939396, + "grad_norm": 0.004333112854510546, + "learning_rate": 7.953299925901456e-05, + "loss": 0.011924955993890762, + "num_input_tokens_seen": 82666048, + "step": 5048, + "train_runtime": 41019.1011, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.06, + "grad_norm": 0.007758693303912878, + "learning_rate": 7.952523925255609e-05, + "loss": 0.01136488001793623, + "num_input_tokens_seen": 82682424, + "step": 5049, + "train_runtime": 41027.2119, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.0606060606060606, + "grad_norm": 0.005722535774111748, + "learning_rate": 7.95174781540165e-05, + "loss": 0.012064126320183277, + "num_input_tokens_seen": 82698800, + "step": 5050, + "train_runtime": 41035.3342, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.061212121212121, + "grad_norm": 0.013080879114568233, + "learning_rate": 7.950971596368289e-05, + "loss": 0.012441691011190414, + "num_input_tokens_seen": 82715176, + "step": 5051, + "train_runtime": 41043.4498, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.061818181818182, + "grad_norm": 0.0034102669451385736, + "learning_rate": 7.950195268184235e-05, + "loss": 0.0118917440995574, + "num_input_tokens_seen": 82731552, + "step": 5052, + "train_runtime": 41051.5656, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.0624242424242425, + "grad_norm": 0.00600830465555191, + "learning_rate": 7.949418830878203e-05, + "loss": 0.013058274053037167, + "num_input_tokens_seen": 82747928, + "step": 5053, + "train_runtime": 41059.6824, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.063030303030303, + "grad_norm": 0.00989510491490364, + "learning_rate": 7.948642284478912e-05, + "loss": 0.012646889314055443, + "num_input_tokens_seen": 82764304, + "step": 5054, + "train_runtime": 41067.8002, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.0636363636363635, + "grad_norm": 0.007937680929899216, + "learning_rate": 7.947865629015086e-05, + "loss": 0.011417629197239876, + "num_input_tokens_seen": 82780680, + "step": 5055, + "train_runtime": 41075.9129, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.0642424242424244, + "grad_norm": 0.007360556162893772, + "learning_rate": 7.94708886451545e-05, + "loss": 0.012504201382398605, + "num_input_tokens_seen": 82797056, + "step": 5056, + "train_runtime": 41084.0229, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.064848484848485, + "grad_norm": 0.004632122348994017, + "learning_rate": 7.946311991008736e-05, + "loss": 0.012084313668310642, + "num_input_tokens_seen": 82813432, + "step": 5057, + "train_runtime": 41092.1352, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.0654545454545454, + "grad_norm": 0.007727290503680706, + "learning_rate": 7.94553500852368e-05, + "loss": 0.012533760629594326, + "num_input_tokens_seen": 82829808, + "step": 5058, + "train_runtime": 41100.2473, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.066060606060606, + "grad_norm": 0.0055076926946640015, + "learning_rate": 7.94475791708902e-05, + "loss": 0.011855492368340492, + "num_input_tokens_seen": 82846184, + "step": 5059, + "train_runtime": 41108.3646, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 0.004027305170893669, + "learning_rate": 7.943980716733499e-05, + "loss": 0.011520362459123135, + "num_input_tokens_seen": 82862560, + "step": 5060, + "train_runtime": 41116.4779, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.0672727272727274, + "grad_norm": 0.013380679301917553, + "learning_rate": 7.943203407485864e-05, + "loss": 0.01223910041153431, + "num_input_tokens_seen": 82878936, + "step": 5061, + "train_runtime": 41124.5892, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.067878787878788, + "grad_norm": 0.006780378520488739, + "learning_rate": 7.942425989374868e-05, + "loss": 0.011735268868505955, + "num_input_tokens_seen": 82895312, + "step": 5062, + "train_runtime": 41132.7046, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.0684848484848484, + "grad_norm": 0.0036255079321563244, + "learning_rate": 7.941648462429264e-05, + "loss": 0.010879157111048698, + "num_input_tokens_seen": 82911688, + "step": 5063, + "train_runtime": 41140.8221, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.0690909090909093, + "grad_norm": 0.006235482171177864, + "learning_rate": 7.940870826677814e-05, + "loss": 0.01206477265805006, + "num_input_tokens_seen": 82928064, + "step": 5064, + "train_runtime": 41148.9346, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.06969696969697, + "grad_norm": 0.007015205454081297, + "learning_rate": 7.940093082149275e-05, + "loss": 0.012744509615004063, + "num_input_tokens_seen": 82944440, + "step": 5065, + "train_runtime": 41157.0489, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.0703030303030303, + "grad_norm": 0.002925195964053273, + "learning_rate": 7.939315228872421e-05, + "loss": 0.011732226237654686, + "num_input_tokens_seen": 82960816, + "step": 5066, + "train_runtime": 41165.1621, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.070909090909091, + "grad_norm": 0.006914564874023199, + "learning_rate": 7.93853726687602e-05, + "loss": 0.011949972249567509, + "num_input_tokens_seen": 82977192, + "step": 5067, + "train_runtime": 41173.2768, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.0715151515151513, + "grad_norm": 0.005173922050744295, + "learning_rate": 7.937759196188849e-05, + "loss": 0.011470858007669449, + "num_input_tokens_seen": 82993568, + "step": 5068, + "train_runtime": 41181.3897, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.0721212121212123, + "grad_norm": 0.007633210625499487, + "learning_rate": 7.936981016839687e-05, + "loss": 0.011128420941531658, + "num_input_tokens_seen": 83009944, + "step": 5069, + "train_runtime": 41189.5068, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.0727272727272728, + "grad_norm": 0.0073098912835121155, + "learning_rate": 7.936202728857315e-05, + "loss": 0.011449356563389301, + "num_input_tokens_seen": 83026320, + "step": 5070, + "train_runtime": 41197.6364, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.0733333333333333, + "grad_norm": 0.006596955005079508, + "learning_rate": 7.935424332270522e-05, + "loss": 0.012056310661137104, + "num_input_tokens_seen": 83042696, + "step": 5071, + "train_runtime": 41205.754, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.0739393939393937, + "grad_norm": 0.01389376726001501, + "learning_rate": 7.9346458271081e-05, + "loss": 0.013683836907148361, + "num_input_tokens_seen": 83059072, + "step": 5072, + "train_runtime": 41213.8722, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.0745454545454547, + "grad_norm": 0.009280980564653873, + "learning_rate": 7.933867213398845e-05, + "loss": 0.01356413122266531, + "num_input_tokens_seen": 83075448, + "step": 5073, + "train_runtime": 41221.9887, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.075151515151515, + "grad_norm": 0.00515192374587059, + "learning_rate": 7.933088491171555e-05, + "loss": 0.011403946205973625, + "num_input_tokens_seen": 83091824, + "step": 5074, + "train_runtime": 41230.1044, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.0757575757575757, + "grad_norm": 0.007790415547788143, + "learning_rate": 7.93230966045503e-05, + "loss": 0.014018058776855469, + "num_input_tokens_seen": 83108200, + "step": 5075, + "train_runtime": 41238.2214, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.076363636363636, + "grad_norm": 0.00923969317227602, + "learning_rate": 7.931530721278084e-05, + "loss": 0.012067675590515137, + "num_input_tokens_seen": 83124576, + "step": 5076, + "train_runtime": 41246.3366, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.076969696969697, + "grad_norm": 0.0040249391458928585, + "learning_rate": 7.930751673669527e-05, + "loss": 0.012174971401691437, + "num_input_tokens_seen": 83140952, + "step": 5077, + "train_runtime": 41254.4518, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.0775757575757576, + "grad_norm": 0.009031681343913078, + "learning_rate": 7.929972517658171e-05, + "loss": 0.012244632467627525, + "num_input_tokens_seen": 83157328, + "step": 5078, + "train_runtime": 41262.5646, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.078181818181818, + "grad_norm": 0.0014552852371707559, + "learning_rate": 7.929193253272838e-05, + "loss": 0.010897099040448666, + "num_input_tokens_seen": 83173704, + "step": 5079, + "train_runtime": 41270.6791, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.0787878787878786, + "grad_norm": 0.008750759065151215, + "learning_rate": 7.928413880542349e-05, + "loss": 0.011758248321712017, + "num_input_tokens_seen": 83190080, + "step": 5080, + "train_runtime": 41278.7945, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.0793939393939396, + "grad_norm": 0.006342112086713314, + "learning_rate": 7.927634399495536e-05, + "loss": 0.010831915773451328, + "num_input_tokens_seen": 83206456, + "step": 5081, + "train_runtime": 41286.9101, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.08, + "grad_norm": 0.012833413667976856, + "learning_rate": 7.926854810161226e-05, + "loss": 0.012967357411980629, + "num_input_tokens_seen": 83222832, + "step": 5082, + "train_runtime": 41295.0229, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.0806060606060606, + "grad_norm": 0.012256626039743423, + "learning_rate": 7.926075112568259e-05, + "loss": 0.012168221175670624, + "num_input_tokens_seen": 83239208, + "step": 5083, + "train_runtime": 41303.1364, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.081212121212121, + "grad_norm": 0.008062608540058136, + "learning_rate": 7.925295306745469e-05, + "loss": 0.012462492100894451, + "num_input_tokens_seen": 83255584, + "step": 5084, + "train_runtime": 41311.2522, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.081818181818182, + "grad_norm": 0.005075621884316206, + "learning_rate": 7.924515392721703e-05, + "loss": 0.012568464502692223, + "num_input_tokens_seen": 83271960, + "step": 5085, + "train_runtime": 41319.3669, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.0824242424242425, + "grad_norm": 0.010546546429395676, + "learning_rate": 7.923735370525809e-05, + "loss": 0.012801365926861763, + "num_input_tokens_seen": 83288336, + "step": 5086, + "train_runtime": 41327.4805, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.083030303030303, + "grad_norm": 0.008668221533298492, + "learning_rate": 7.922955240186635e-05, + "loss": 0.012692100368440151, + "num_input_tokens_seen": 83304712, + "step": 5087, + "train_runtime": 41335.594, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.0836363636363635, + "grad_norm": 0.011252512224018574, + "learning_rate": 7.922175001733042e-05, + "loss": 0.012912561185657978, + "num_input_tokens_seen": 83321088, + "step": 5088, + "train_runtime": 41343.7138, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.0842424242424245, + "grad_norm": 0.008441926911473274, + "learning_rate": 7.921394655193884e-05, + "loss": 0.012918155640363693, + "num_input_tokens_seen": 83337464, + "step": 5089, + "train_runtime": 41351.8322, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.084848484848485, + "grad_norm": 0.007579430006444454, + "learning_rate": 7.920614200598029e-05, + "loss": 0.011566674336791039, + "num_input_tokens_seen": 83353840, + "step": 5090, + "train_runtime": 41359.9519, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.0854545454545454, + "grad_norm": 0.006866890471428633, + "learning_rate": 7.91983363797434e-05, + "loss": 0.012592778541147709, + "num_input_tokens_seen": 83370216, + "step": 5091, + "train_runtime": 41368.0709, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.086060606060606, + "grad_norm": 0.008269276469945908, + "learning_rate": 7.919052967351693e-05, + "loss": 0.012565435841679573, + "num_input_tokens_seen": 83386592, + "step": 5092, + "train_runtime": 41376.1875, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.086666666666667, + "grad_norm": 0.006387703586369753, + "learning_rate": 7.91827218875896e-05, + "loss": 0.011804311536252499, + "num_input_tokens_seen": 83402968, + "step": 5093, + "train_runtime": 41384.302, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 3.0872727272727274, + "grad_norm": 0.004637232515960932, + "learning_rate": 7.917491302225022e-05, + "loss": 0.011333847418427467, + "num_input_tokens_seen": 83419344, + "step": 5094, + "train_runtime": 41392.4154, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 3.087878787878788, + "grad_norm": 0.013361013494431973, + "learning_rate": 7.916710307778762e-05, + "loss": 0.012411284260451794, + "num_input_tokens_seen": 83435720, + "step": 5095, + "train_runtime": 41400.5314, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 3.0884848484848484, + "grad_norm": 0.0006919089355506003, + "learning_rate": 7.915929205449069e-05, + "loss": 0.011356989853084087, + "num_input_tokens_seen": 83452096, + "step": 5096, + "train_runtime": 41408.646, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 3.089090909090909, + "grad_norm": 0.011503004468977451, + "learning_rate": 7.91514799526483e-05, + "loss": 0.012822176329791546, + "num_input_tokens_seen": 83468472, + "step": 5097, + "train_runtime": 41416.7631, + "train_tokens_per_second": 2015.331 + }, + { + "epoch": 3.08969696969697, + "grad_norm": 0.006111727096140385, + "learning_rate": 7.914366677254947e-05, + "loss": 0.012183048762381077, + "num_input_tokens_seen": 83484848, + "step": 5098, + "train_runtime": 41424.8762, + "train_tokens_per_second": 2015.331 + }, + { + "epoch": 3.0903030303030303, + "grad_norm": 0.007792407181113958, + "learning_rate": 7.913585251448315e-05, + "loss": 0.01216356735676527, + "num_input_tokens_seen": 83501224, + "step": 5099, + "train_runtime": 41432.9908, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 3.090909090909091, + "grad_norm": 0.0102054039016366, + "learning_rate": 7.91280371787384e-05, + "loss": 0.01263304241001606, + "num_input_tokens_seen": 83517600, + "step": 5100, + "train_runtime": 41441.1087, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 3.0915151515151513, + "grad_norm": 0.0071181547828018665, + "learning_rate": 7.912022076560426e-05, + "loss": 0.011831587180495262, + "num_input_tokens_seen": 83533976, + "step": 5101, + "train_runtime": 41450.1628, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.0921212121212123, + "grad_norm": 0.00976664200425148, + "learning_rate": 7.911240327536986e-05, + "loss": 0.013129421509802341, + "num_input_tokens_seen": 83550352, + "step": 5102, + "train_runtime": 41458.2761, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.0927272727272728, + "grad_norm": 0.005207084119319916, + "learning_rate": 7.910458470832437e-05, + "loss": 0.012243582867085934, + "num_input_tokens_seen": 83566728, + "step": 5103, + "train_runtime": 41466.3944, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.0933333333333333, + "grad_norm": 0.005988720804452896, + "learning_rate": 7.909676506475697e-05, + "loss": 0.012617035768926144, + "num_input_tokens_seen": 83583104, + "step": 5104, + "train_runtime": 41474.5099, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.0939393939393938, + "grad_norm": 0.00800618901848793, + "learning_rate": 7.908894434495689e-05, + "loss": 0.011513113044202328, + "num_input_tokens_seen": 83599480, + "step": 5105, + "train_runtime": 41482.6317, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.0945454545454547, + "grad_norm": 0.014598195441067219, + "learning_rate": 7.908112254921341e-05, + "loss": 0.013231202960014343, + "num_input_tokens_seen": 83615856, + "step": 5106, + "train_runtime": 41490.7504, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.095151515151515, + "grad_norm": 0.006086903624236584, + "learning_rate": 7.907329967781581e-05, + "loss": 0.013285774737596512, + "num_input_tokens_seen": 83632232, + "step": 5107, + "train_runtime": 41498.8707, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.0957575757575757, + "grad_norm": 0.00650793919339776, + "learning_rate": 7.906547573105351e-05, + "loss": 0.011458156630396843, + "num_input_tokens_seen": 83648608, + "step": 5108, + "train_runtime": 41506.9876, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.096363636363636, + "grad_norm": 0.00880400463938713, + "learning_rate": 7.905765070921585e-05, + "loss": 0.012448299676179886, + "num_input_tokens_seen": 83664984, + "step": 5109, + "train_runtime": 41515.1055, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.096969696969697, + "grad_norm": 0.008767417632043362, + "learning_rate": 7.904982461259229e-05, + "loss": 0.012742777355015278, + "num_input_tokens_seen": 83681360, + "step": 5110, + "train_runtime": 41523.2314, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.0975757575757576, + "grad_norm": 0.012613235041499138, + "learning_rate": 7.904199744147228e-05, + "loss": 0.01224098727107048, + "num_input_tokens_seen": 83697736, + "step": 5111, + "train_runtime": 41531.352, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.098181818181818, + "grad_norm": 0.001237873686477542, + "learning_rate": 7.903416919614532e-05, + "loss": 0.011446903459727764, + "num_input_tokens_seen": 83714112, + "step": 5112, + "train_runtime": 41539.4744, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.0987878787878786, + "grad_norm": 0.017986685037612915, + "learning_rate": 7.902633987690102e-05, + "loss": 0.012180576100945473, + "num_input_tokens_seen": 83730488, + "step": 5113, + "train_runtime": 41547.6003, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.0993939393939396, + "grad_norm": 0.007620835676789284, + "learning_rate": 7.901850948402891e-05, + "loss": 0.012758205644786358, + "num_input_tokens_seen": 83746864, + "step": 5114, + "train_runtime": 41555.7132, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.1, + "grad_norm": 0.005015079863369465, + "learning_rate": 7.901067801781866e-05, + "loss": 0.011803529225289822, + "num_input_tokens_seen": 83763240, + "step": 5115, + "train_runtime": 41563.8314, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.1006060606060606, + "grad_norm": 0.0020823602098971605, + "learning_rate": 7.900284547855991e-05, + "loss": 0.011068684980273247, + "num_input_tokens_seen": 83779616, + "step": 5116, + "train_runtime": 41571.945, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.101212121212121, + "grad_norm": 0.005872323177754879, + "learning_rate": 7.89950118665424e-05, + "loss": 0.01149643026292324, + "num_input_tokens_seen": 83795992, + "step": 5117, + "train_runtime": 41580.0649, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.101818181818182, + "grad_norm": 0.006837104447185993, + "learning_rate": 7.898717718205586e-05, + "loss": 0.01150029432028532, + "num_input_tokens_seen": 83812368, + "step": 5118, + "train_runtime": 41588.1871, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.1024242424242425, + "grad_norm": 0.007935219444334507, + "learning_rate": 7.89793414253901e-05, + "loss": 0.0126097546890378, + "num_input_tokens_seen": 83828744, + "step": 5119, + "train_runtime": 41596.3008, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.103030303030303, + "grad_norm": 0.0038245918694883585, + "learning_rate": 7.897150459683491e-05, + "loss": 0.011973814107477665, + "num_input_tokens_seen": 83845120, + "step": 5120, + "train_runtime": 41604.4192, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.1036363636363635, + "grad_norm": 0.008117848075926304, + "learning_rate": 7.89636666966802e-05, + "loss": 0.012624162249267101, + "num_input_tokens_seen": 83861496, + "step": 5121, + "train_runtime": 41612.5394, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.1042424242424245, + "grad_norm": 0.0033736233599483967, + "learning_rate": 7.895582772521586e-05, + "loss": 0.011142382398247719, + "num_input_tokens_seen": 83877872, + "step": 5122, + "train_runtime": 41620.6568, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.104848484848485, + "grad_norm": 0.01163144689053297, + "learning_rate": 7.894798768273184e-05, + "loss": 0.012628186494112015, + "num_input_tokens_seen": 83894248, + "step": 5123, + "train_runtime": 41628.7721, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.1054545454545455, + "grad_norm": 0.0070470902137458324, + "learning_rate": 7.894014656951813e-05, + "loss": 0.012801703065633774, + "num_input_tokens_seen": 83910624, + "step": 5124, + "train_runtime": 41636.8876, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.106060606060606, + "grad_norm": 0.00643567880615592, + "learning_rate": 7.893230438586476e-05, + "loss": 0.013806324452161789, + "num_input_tokens_seen": 83927000, + "step": 5125, + "train_runtime": 41645.0014, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.1066666666666665, + "grad_norm": 0.01251074206084013, + "learning_rate": 7.89244611320618e-05, + "loss": 0.012455376796424389, + "num_input_tokens_seen": 83943376, + "step": 5126, + "train_runtime": 41653.1147, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.1072727272727274, + "grad_norm": 0.008938591927289963, + "learning_rate": 7.891661680839932e-05, + "loss": 0.012542535550892353, + "num_input_tokens_seen": 83959752, + "step": 5127, + "train_runtime": 41661.2313, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.107878787878788, + "grad_norm": 0.008622155524790287, + "learning_rate": 7.890877141516753e-05, + "loss": 0.012785638682544231, + "num_input_tokens_seen": 83976128, + "step": 5128, + "train_runtime": 41669.3519, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.1084848484848484, + "grad_norm": 0.007751957047730684, + "learning_rate": 7.890092495265657e-05, + "loss": 0.012035049498081207, + "num_input_tokens_seen": 83992504, + "step": 5129, + "train_runtime": 41677.4679, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.109090909090909, + "grad_norm": 0.007923007011413574, + "learning_rate": 7.889307742115668e-05, + "loss": 0.011671951971948147, + "num_input_tokens_seen": 84008880, + "step": 5130, + "train_runtime": 41685.5825, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.10969696969697, + "grad_norm": 0.00512312026694417, + "learning_rate": 7.888522882095813e-05, + "loss": 0.011519131250679493, + "num_input_tokens_seen": 84025256, + "step": 5131, + "train_runtime": 41693.7011, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.1103030303030303, + "grad_norm": 0.008939514867961407, + "learning_rate": 7.887737915235121e-05, + "loss": 0.013339599594473839, + "num_input_tokens_seen": 84041632, + "step": 5132, + "train_runtime": 41701.8196, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.110909090909091, + "grad_norm": 0.008887813426554203, + "learning_rate": 7.886952841562627e-05, + "loss": 0.012748410925269127, + "num_input_tokens_seen": 84058008, + "step": 5133, + "train_runtime": 41709.9379, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.1115151515151513, + "grad_norm": 0.008092070929706097, + "learning_rate": 7.886167661107369e-05, + "loss": 0.01242149155586958, + "num_input_tokens_seen": 84074384, + "step": 5134, + "train_runtime": 41718.0522, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.1121212121212123, + "grad_norm": 0.005466654431074858, + "learning_rate": 7.88538237389839e-05, + "loss": 0.011800389736890793, + "num_input_tokens_seen": 84090760, + "step": 5135, + "train_runtime": 41726.1644, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.112727272727273, + "grad_norm": 0.0087019894272089, + "learning_rate": 7.884596979964736e-05, + "loss": 0.012434134259819984, + "num_input_tokens_seen": 84107136, + "step": 5136, + "train_runtime": 41734.2806, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.1133333333333333, + "grad_norm": 0.00563916377723217, + "learning_rate": 7.883811479335458e-05, + "loss": 0.012274009175598621, + "num_input_tokens_seen": 84123512, + "step": 5137, + "train_runtime": 41742.3958, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.113939393939394, + "grad_norm": 0.0055172680877149105, + "learning_rate": 7.883025872039609e-05, + "loss": 0.011265186592936516, + "num_input_tokens_seen": 84139888, + "step": 5138, + "train_runtime": 41750.5126, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.1145454545454547, + "grad_norm": 0.010379090905189514, + "learning_rate": 7.882240158106247e-05, + "loss": 0.01238405704498291, + "num_input_tokens_seen": 84156264, + "step": 5139, + "train_runtime": 41758.6328, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.1151515151515152, + "grad_norm": 0.006833005230873823, + "learning_rate": 7.881454337564436e-05, + "loss": 0.012351302430033684, + "num_input_tokens_seen": 84172640, + "step": 5140, + "train_runtime": 41766.7501, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.1157575757575757, + "grad_norm": 0.006575712934136391, + "learning_rate": 7.880668410443238e-05, + "loss": 0.011763148941099644, + "num_input_tokens_seen": 84189016, + "step": 5141, + "train_runtime": 41774.8689, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.1163636363636362, + "grad_norm": 0.007217420265078545, + "learning_rate": 7.879882376771727e-05, + "loss": 0.01220768317580223, + "num_input_tokens_seen": 84205392, + "step": 5142, + "train_runtime": 41782.9833, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.116969696969697, + "grad_norm": 0.004965866915881634, + "learning_rate": 7.879096236578974e-05, + "loss": 0.011544787324965, + "num_input_tokens_seen": 84221768, + "step": 5143, + "train_runtime": 41791.1012, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.1175757575757577, + "grad_norm": 0.00712276017293334, + "learning_rate": 7.87830998989406e-05, + "loss": 0.011951303109526634, + "num_input_tokens_seen": 84238144, + "step": 5144, + "train_runtime": 41799.2163, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.118181818181818, + "grad_norm": 0.00812452007085085, + "learning_rate": 7.877523636746063e-05, + "loss": 0.011689219623804092, + "num_input_tokens_seen": 84254520, + "step": 5145, + "train_runtime": 41807.3327, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.1187878787878787, + "grad_norm": 0.010325971990823746, + "learning_rate": 7.876737177164071e-05, + "loss": 0.012078515253961086, + "num_input_tokens_seen": 84270896, + "step": 5146, + "train_runtime": 41815.4478, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.1193939393939396, + "grad_norm": 0.004387114197015762, + "learning_rate": 7.875950611177172e-05, + "loss": 0.011316593736410141, + "num_input_tokens_seen": 84287272, + "step": 5147, + "train_runtime": 41823.5626, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.12, + "grad_norm": 0.012871846556663513, + "learning_rate": 7.875163938814462e-05, + "loss": 0.012524784542620182, + "num_input_tokens_seen": 84303648, + "step": 5148, + "train_runtime": 41831.6793, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.1206060606060606, + "grad_norm": 0.006912142038345337, + "learning_rate": 7.874377160105036e-05, + "loss": 0.01104238722473383, + "num_input_tokens_seen": 84320024, + "step": 5149, + "train_runtime": 41839.7917, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.121212121212121, + "grad_norm": 0.007153605110943317, + "learning_rate": 7.873590275077998e-05, + "loss": 0.011791144497692585, + "num_input_tokens_seen": 84336400, + "step": 5150, + "train_runtime": 41847.9061, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.1218181818181816, + "grad_norm": 0.0036277177277952433, + "learning_rate": 7.87280328376245e-05, + "loss": 0.011163576506078243, + "num_input_tokens_seen": 84352776, + "step": 5151, + "train_runtime": 41856.022, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.1224242424242425, + "grad_norm": 0.005414872895926237, + "learning_rate": 7.872016186187504e-05, + "loss": 0.010902917943894863, + "num_input_tokens_seen": 84369152, + "step": 5152, + "train_runtime": 41864.1339, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.123030303030303, + "grad_norm": 0.002707183128222823, + "learning_rate": 7.871228982382271e-05, + "loss": 0.010938930325210094, + "num_input_tokens_seen": 84385528, + "step": 5153, + "train_runtime": 41872.2507, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.1236363636363635, + "grad_norm": 0.004463442601263523, + "learning_rate": 7.870441672375873e-05, + "loss": 0.012558909133076668, + "num_input_tokens_seen": 84401904, + "step": 5154, + "train_runtime": 41880.3631, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.124242424242424, + "grad_norm": 0.011483821086585522, + "learning_rate": 7.869654256197422e-05, + "loss": 0.012831505388021469, + "num_input_tokens_seen": 84418280, + "step": 5155, + "train_runtime": 41888.4774, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.124848484848485, + "grad_norm": 0.010141262784600258, + "learning_rate": 7.868866733876052e-05, + "loss": 0.012616346590220928, + "num_input_tokens_seen": 84434656, + "step": 5156, + "train_runtime": 41896.5912, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.1254545454545455, + "grad_norm": 0.007358132861554623, + "learning_rate": 7.868079105440889e-05, + "loss": 0.012558026239275932, + "num_input_tokens_seen": 84451032, + "step": 5157, + "train_runtime": 41904.7047, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.126060606060606, + "grad_norm": 0.006527206394821405, + "learning_rate": 7.867291370921064e-05, + "loss": 0.012400537729263306, + "num_input_tokens_seen": 84467408, + "step": 5158, + "train_runtime": 41912.8199, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.1266666666666665, + "grad_norm": 0.008728621527552605, + "learning_rate": 7.866503530345715e-05, + "loss": 0.012100527994334698, + "num_input_tokens_seen": 84483784, + "step": 5159, + "train_runtime": 41920.9469, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.1272727272727274, + "grad_norm": 0.009332158602774143, + "learning_rate": 7.865715583743982e-05, + "loss": 0.011438591405749321, + "num_input_tokens_seen": 84500160, + "step": 5160, + "train_runtime": 41929.0587, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.127878787878788, + "grad_norm": 0.009722975082695484, + "learning_rate": 7.864927531145011e-05, + "loss": 0.01156856119632721, + "num_input_tokens_seen": 84516536, + "step": 5161, + "train_runtime": 41937.1727, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.1284848484848484, + "grad_norm": 0.009311624802649021, + "learning_rate": 7.86413937257795e-05, + "loss": 0.012138459831476212, + "num_input_tokens_seen": 84532912, + "step": 5162, + "train_runtime": 41945.2862, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.129090909090909, + "grad_norm": 0.0059304614551365376, + "learning_rate": 7.86335110807195e-05, + "loss": 0.011830560863018036, + "num_input_tokens_seen": 84549288, + "step": 5163, + "train_runtime": 41953.4021, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.12969696969697, + "grad_norm": 0.0070556155405938625, + "learning_rate": 7.86256273765617e-05, + "loss": 0.01258085947483778, + "num_input_tokens_seen": 84565664, + "step": 5164, + "train_runtime": 41961.5179, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.1303030303030304, + "grad_norm": 0.006310723256319761, + "learning_rate": 7.861774261359769e-05, + "loss": 0.011715739034116268, + "num_input_tokens_seen": 84582040, + "step": 5165, + "train_runtime": 41969.6377, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.130909090909091, + "grad_norm": 0.008160227909684181, + "learning_rate": 7.86098567921191e-05, + "loss": 0.01123846136033535, + "num_input_tokens_seen": 84598416, + "step": 5166, + "train_runtime": 41977.7531, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.1315151515151514, + "grad_norm": 0.005683892406523228, + "learning_rate": 7.860196991241764e-05, + "loss": 0.011757885105907917, + "num_input_tokens_seen": 84614792, + "step": 5167, + "train_runtime": 41985.8668, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.1321212121212123, + "grad_norm": 0.03434290736913681, + "learning_rate": 7.859408197478499e-05, + "loss": 0.012472064234316349, + "num_input_tokens_seen": 84631168, + "step": 5168, + "train_runtime": 41993.9792, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.132727272727273, + "grad_norm": 0.00959369819611311, + "learning_rate": 7.858619297951295e-05, + "loss": 0.01262537483125925, + "num_input_tokens_seen": 84647544, + "step": 5169, + "train_runtime": 42002.0931, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 0.009751481004059315, + "learning_rate": 7.857830292689329e-05, + "loss": 0.013263939879834652, + "num_input_tokens_seen": 84663920, + "step": 5170, + "train_runtime": 42010.2106, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.133939393939394, + "grad_norm": 0.007035667076706886, + "learning_rate": 7.857041181721787e-05, + "loss": 0.010719198733568192, + "num_input_tokens_seen": 84680296, + "step": 5171, + "train_runtime": 42018.3328, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.1345454545454547, + "grad_norm": 0.011088959872722626, + "learning_rate": 7.856251965077854e-05, + "loss": 0.012980067171156406, + "num_input_tokens_seen": 84696672, + "step": 5172, + "train_runtime": 42026.4465, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.1351515151515152, + "grad_norm": 0.009992634877562523, + "learning_rate": 7.855462642786724e-05, + "loss": 0.011668424122035503, + "num_input_tokens_seen": 84713048, + "step": 5173, + "train_runtime": 42034.5663, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.1357575757575757, + "grad_norm": 0.004791597370058298, + "learning_rate": 7.854673214877592e-05, + "loss": 0.011503913439810276, + "num_input_tokens_seen": 84729424, + "step": 5174, + "train_runtime": 42042.6791, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.1363636363636362, + "grad_norm": 0.007880473509430885, + "learning_rate": 7.853883681379657e-05, + "loss": 0.01226319931447506, + "num_input_tokens_seen": 84745800, + "step": 5175, + "train_runtime": 42050.7904, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.1369696969696967, + "grad_norm": 0.0051353685557842255, + "learning_rate": 7.853094042322121e-05, + "loss": 0.011764869093894958, + "num_input_tokens_seen": 84762176, + "step": 5176, + "train_runtime": 42058.9047, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.1375757575757577, + "grad_norm": 0.010507145896553993, + "learning_rate": 7.852304297734195e-05, + "loss": 0.012899585999548435, + "num_input_tokens_seen": 84778552, + "step": 5177, + "train_runtime": 42067.0163, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.138181818181818, + "grad_norm": 0.025372665375471115, + "learning_rate": 7.851514447645085e-05, + "loss": 0.012355811893939972, + "num_input_tokens_seen": 84794928, + "step": 5178, + "train_runtime": 42075.1312, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.1387878787878787, + "grad_norm": 0.009011749178171158, + "learning_rate": 7.850724492084009e-05, + "loss": 0.01081522461026907, + "num_input_tokens_seen": 84811304, + "step": 5179, + "train_runtime": 42083.2455, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.1393939393939396, + "grad_norm": 0.028803173452615738, + "learning_rate": 7.849934431080187e-05, + "loss": 0.011944063007831573, + "num_input_tokens_seen": 84827680, + "step": 5180, + "train_runtime": 42091.36, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.14, + "grad_norm": 0.0068590701557695866, + "learning_rate": 7.84914426466284e-05, + "loss": 0.011643407866358757, + "num_input_tokens_seen": 84844056, + "step": 5181, + "train_runtime": 42099.4751, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.1406060606060606, + "grad_norm": 0.00392129784449935, + "learning_rate": 7.848353992861195e-05, + "loss": 0.01264530885964632, + "num_input_tokens_seen": 84860432, + "step": 5182, + "train_runtime": 42107.587, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.141212121212121, + "grad_norm": 0.008084416389465332, + "learning_rate": 7.847563615704482e-05, + "loss": 0.012670534662902355, + "num_input_tokens_seen": 84876808, + "step": 5183, + "train_runtime": 42115.6992, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.1418181818181816, + "grad_norm": 0.004640469327569008, + "learning_rate": 7.846773133221936e-05, + "loss": 0.012905479408800602, + "num_input_tokens_seen": 84893184, + "step": 5184, + "train_runtime": 42123.8181, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.1424242424242426, + "grad_norm": 0.008622902445495129, + "learning_rate": 7.845982545442796e-05, + "loss": 0.012059461325407028, + "num_input_tokens_seen": 84909560, + "step": 5185, + "train_runtime": 42131.9346, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.143030303030303, + "grad_norm": 0.007753988727927208, + "learning_rate": 7.845191852396305e-05, + "loss": 0.012392483651638031, + "num_input_tokens_seen": 84925936, + "step": 5186, + "train_runtime": 42140.0475, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.1436363636363636, + "grad_norm": 0.009077763184905052, + "learning_rate": 7.844401054111707e-05, + "loss": 0.01178478728979826, + "num_input_tokens_seen": 84942312, + "step": 5187, + "train_runtime": 42148.2572, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.144242424242424, + "grad_norm": 0.004836606327444315, + "learning_rate": 7.843610150618255e-05, + "loss": 0.012236197479069233, + "num_input_tokens_seen": 84958688, + "step": 5188, + "train_runtime": 42156.3746, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.144848484848485, + "grad_norm": 0.01035410538315773, + "learning_rate": 7.842819141945199e-05, + "loss": 0.011785104870796204, + "num_input_tokens_seen": 84975064, + "step": 5189, + "train_runtime": 42164.4887, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.1454545454545455, + "grad_norm": 0.005500508937984705, + "learning_rate": 7.842028028121799e-05, + "loss": 0.011833153665065765, + "num_input_tokens_seen": 84991440, + "step": 5190, + "train_runtime": 42172.5993, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.146060606060606, + "grad_norm": 0.003851423505693674, + "learning_rate": 7.841236809177317e-05, + "loss": 0.011710869148373604, + "num_input_tokens_seen": 85007816, + "step": 5191, + "train_runtime": 42180.7139, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.1466666666666665, + "grad_norm": 0.0047545949928462505, + "learning_rate": 7.840445485141022e-05, + "loss": 0.012008590623736382, + "num_input_tokens_seen": 85024192, + "step": 5192, + "train_runtime": 42188.8314, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.1472727272727274, + "grad_norm": 0.004845188464969397, + "learning_rate": 7.839654056042176e-05, + "loss": 0.01172076165676117, + "num_input_tokens_seen": 85040568, + "step": 5193, + "train_runtime": 42196.948, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.147878787878788, + "grad_norm": 0.009915243834257126, + "learning_rate": 7.838862521910058e-05, + "loss": 0.01095246896147728, + "num_input_tokens_seen": 85056944, + "step": 5194, + "train_runtime": 42205.0624, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.1484848484848484, + "grad_norm": 0.007844122126698494, + "learning_rate": 7.838070882773943e-05, + "loss": 0.011876475065946579, + "num_input_tokens_seen": 85073320, + "step": 5195, + "train_runtime": 42213.1767, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.149090909090909, + "grad_norm": 0.009019284509122372, + "learning_rate": 7.837279138663114e-05, + "loss": 0.012739112600684166, + "num_input_tokens_seen": 85089696, + "step": 5196, + "train_runtime": 42221.2881, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.14969696969697, + "grad_norm": 0.006925021763890982, + "learning_rate": 7.836487289606853e-05, + "loss": 0.012848568148911, + "num_input_tokens_seen": 85106072, + "step": 5197, + "train_runtime": 42229.401, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.1503030303030304, + "grad_norm": 0.010892736725509167, + "learning_rate": 7.835695335634455e-05, + "loss": 0.012538321316242218, + "num_input_tokens_seen": 85122448, + "step": 5198, + "train_runtime": 42237.512, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.150909090909091, + "grad_norm": 0.015291296876966953, + "learning_rate": 7.834903276775204e-05, + "loss": 0.013082976453006268, + "num_input_tokens_seen": 85138824, + "step": 5199, + "train_runtime": 42245.6314, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.1515151515151514, + "grad_norm": 0.007720866706222296, + "learning_rate": 7.834111113058404e-05, + "loss": 0.01157902181148529, + "num_input_tokens_seen": 85155200, + "step": 5200, + "train_runtime": 42253.7503, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 3.1521212121212123, + "grad_norm": 0.004605120979249477, + "learning_rate": 7.833318844513353e-05, + "loss": 0.01053633727133274, + "num_input_tokens_seen": 85171576, + "step": 5201, + "train_runtime": 42262.9323, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 3.152727272727273, + "grad_norm": 0.004056462552398443, + "learning_rate": 7.832526471169356e-05, + "loss": 0.011966883204877377, + "num_input_tokens_seen": 85187952, + "step": 5202, + "train_runtime": 42271.0479, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 3.1533333333333333, + "grad_norm": 0.00564974220469594, + "learning_rate": 7.83173399305572e-05, + "loss": 0.012125165201723576, + "num_input_tokens_seen": 85204328, + "step": 5203, + "train_runtime": 42279.1632, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 3.153939393939394, + "grad_norm": 0.009710774756968021, + "learning_rate": 7.830941410201758e-05, + "loss": 0.012245293706655502, + "num_input_tokens_seen": 85220704, + "step": 5204, + "train_runtime": 42287.2773, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.1545454545454543, + "grad_norm": 0.00870461855083704, + "learning_rate": 7.830148722636787e-05, + "loss": 0.012516462244093418, + "num_input_tokens_seen": 85237080, + "step": 5205, + "train_runtime": 42295.3943, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.1551515151515153, + "grad_norm": 0.006455944385379553, + "learning_rate": 7.829355930390125e-05, + "loss": 0.012578926980495453, + "num_input_tokens_seen": 85253456, + "step": 5206, + "train_runtime": 42303.5123, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.1557575757575758, + "grad_norm": 0.006689351052045822, + "learning_rate": 7.828563033491099e-05, + "loss": 0.012533603236079216, + "num_input_tokens_seen": 85269832, + "step": 5207, + "train_runtime": 42311.6323, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.1563636363636363, + "grad_norm": 0.018662212416529655, + "learning_rate": 7.827770031969032e-05, + "loss": 0.012819463387131691, + "num_input_tokens_seen": 85286208, + "step": 5208, + "train_runtime": 42319.7487, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.156969696969697, + "grad_norm": 0.005386484786868095, + "learning_rate": 7.826976925853263e-05, + "loss": 0.01189794298261404, + "num_input_tokens_seen": 85302584, + "step": 5209, + "train_runtime": 42327.8673, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.1575757575757577, + "grad_norm": 0.00562541838735342, + "learning_rate": 7.826183715173118e-05, + "loss": 0.011717134155333042, + "num_input_tokens_seen": 85318960, + "step": 5210, + "train_runtime": 42335.991, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.158181818181818, + "grad_norm": 0.006123247090727091, + "learning_rate": 7.825390399957944e-05, + "loss": 0.011720769107341766, + "num_input_tokens_seen": 85335336, + "step": 5211, + "train_runtime": 42344.1055, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.1587878787878787, + "grad_norm": 0.004591134376823902, + "learning_rate": 7.82459698023708e-05, + "loss": 0.011776016093790531, + "num_input_tokens_seen": 85351712, + "step": 5212, + "train_runtime": 42352.216, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.159393939393939, + "grad_norm": 0.006868307013064623, + "learning_rate": 7.823803456039875e-05, + "loss": 0.012516679242253304, + "num_input_tokens_seen": 85368088, + "step": 5213, + "train_runtime": 42360.3337, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.16, + "grad_norm": 0.0040158540941774845, + "learning_rate": 7.823009827395678e-05, + "loss": 0.011936765164136887, + "num_input_tokens_seen": 85384464, + "step": 5214, + "train_runtime": 42368.4546, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.1606060606060606, + "grad_norm": 0.02153616212308407, + "learning_rate": 7.822216094333847e-05, + "loss": 0.01125241443514824, + "num_input_tokens_seen": 85400840, + "step": 5215, + "train_runtime": 42376.5762, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.161212121212121, + "grad_norm": 0.005792237352579832, + "learning_rate": 7.821422256883736e-05, + "loss": 0.011181306093931198, + "num_input_tokens_seen": 85417216, + "step": 5216, + "train_runtime": 42384.6945, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.1618181818181816, + "grad_norm": 0.004828534554690123, + "learning_rate": 7.820628315074714e-05, + "loss": 0.012275940738618374, + "num_input_tokens_seen": 85433592, + "step": 5217, + "train_runtime": 42392.8127, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.1624242424242426, + "grad_norm": 0.004352622665464878, + "learning_rate": 7.81983426893614e-05, + "loss": 0.01277929451316595, + "num_input_tokens_seen": 85449968, + "step": 5218, + "train_runtime": 42400.9323, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.163030303030303, + "grad_norm": 0.008392254821956158, + "learning_rate": 7.81904011849739e-05, + "loss": 0.012788921594619751, + "num_input_tokens_seen": 85466344, + "step": 5219, + "train_runtime": 42409.0449, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.1636363636363636, + "grad_norm": 0.009440755471587181, + "learning_rate": 7.818245863787834e-05, + "loss": 0.013028008863329887, + "num_input_tokens_seen": 85482720, + "step": 5220, + "train_runtime": 42417.1614, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.164242424242424, + "grad_norm": 0.010263587348163128, + "learning_rate": 7.817451504836852e-05, + "loss": 0.01092161051928997, + "num_input_tokens_seen": 85499096, + "step": 5221, + "train_runtime": 42425.2804, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.164848484848485, + "grad_norm": 0.008552046492695808, + "learning_rate": 7.816657041673828e-05, + "loss": 0.011946198530495167, + "num_input_tokens_seen": 85515472, + "step": 5222, + "train_runtime": 42433.4045, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.1654545454545455, + "grad_norm": 0.00744917057454586, + "learning_rate": 7.815862474328143e-05, + "loss": 0.011258787475526333, + "num_input_tokens_seen": 85531848, + "step": 5223, + "train_runtime": 42441.5313, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.166060606060606, + "grad_norm": 0.021711429581046104, + "learning_rate": 7.815067802829191e-05, + "loss": 0.013013795018196106, + "num_input_tokens_seen": 85548224, + "step": 5224, + "train_runtime": 42449.6516, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 0.006761034019291401, + "learning_rate": 7.81427302720636e-05, + "loss": 0.011210066266357899, + "num_input_tokens_seen": 85564600, + "step": 5225, + "train_runtime": 42457.7704, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.1672727272727275, + "grad_norm": 0.012877918779850006, + "learning_rate": 7.813478147489052e-05, + "loss": 0.012005215510725975, + "num_input_tokens_seen": 85580976, + "step": 5226, + "train_runtime": 42465.8903, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.167878787878788, + "grad_norm": 0.01090269349515438, + "learning_rate": 7.812683163706666e-05, + "loss": 0.01267540268599987, + "num_input_tokens_seen": 85597352, + "step": 5227, + "train_runtime": 42474.0099, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.1684848484848485, + "grad_norm": 0.00392185477539897, + "learning_rate": 7.811888075888607e-05, + "loss": 0.011277430690824986, + "num_input_tokens_seen": 85613728, + "step": 5228, + "train_runtime": 42482.1332, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.169090909090909, + "grad_norm": 0.009097250178456306, + "learning_rate": 7.811092884064284e-05, + "loss": 0.01278465986251831, + "num_input_tokens_seen": 85630104, + "step": 5229, + "train_runtime": 42490.2518, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.16969696969697, + "grad_norm": 0.006668399088084698, + "learning_rate": 7.810297588263108e-05, + "loss": 0.0134787792339921, + "num_input_tokens_seen": 85646480, + "step": 5230, + "train_runtime": 42498.3697, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.1703030303030304, + "grad_norm": 0.006416443735361099, + "learning_rate": 7.8095021885145e-05, + "loss": 0.012145865708589554, + "num_input_tokens_seen": 85662856, + "step": 5231, + "train_runtime": 42506.4915, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.170909090909091, + "grad_norm": 0.007501483894884586, + "learning_rate": 7.808706684847876e-05, + "loss": 0.012655824422836304, + "num_input_tokens_seen": 85679232, + "step": 5232, + "train_runtime": 42514.6113, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.1715151515151514, + "grad_norm": 0.004501063376665115, + "learning_rate": 7.807911077292662e-05, + "loss": 0.012081027962267399, + "num_input_tokens_seen": 85695608, + "step": 5233, + "train_runtime": 42522.7335, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.172121212121212, + "grad_norm": 0.009771152399480343, + "learning_rate": 7.807115365878285e-05, + "loss": 0.011455008760094643, + "num_input_tokens_seen": 85711984, + "step": 5234, + "train_runtime": 42530.8505, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.172727272727273, + "grad_norm": 0.004385191015899181, + "learning_rate": 7.806319550634176e-05, + "loss": 0.011270713992416859, + "num_input_tokens_seen": 85728360, + "step": 5235, + "train_runtime": 42538.9807, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.1733333333333333, + "grad_norm": 0.010677985846996307, + "learning_rate": 7.805523631589774e-05, + "loss": 0.012973921373486519, + "num_input_tokens_seen": 85744736, + "step": 5236, + "train_runtime": 42547.1004, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.173939393939394, + "grad_norm": 0.013292881660163403, + "learning_rate": 7.804727608774516e-05, + "loss": 0.012711204588413239, + "num_input_tokens_seen": 85761112, + "step": 5237, + "train_runtime": 42555.218, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.174545454545455, + "grad_norm": 0.022009989246726036, + "learning_rate": 7.803931482217845e-05, + "loss": 0.015031242743134499, + "num_input_tokens_seen": 85777488, + "step": 5238, + "train_runtime": 42563.3346, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.1751515151515153, + "grad_norm": 0.005530593916773796, + "learning_rate": 7.803135251949207e-05, + "loss": 0.01126721128821373, + "num_input_tokens_seen": 85793864, + "step": 5239, + "train_runtime": 42571.4529, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.175757575757576, + "grad_norm": 0.006890534423291683, + "learning_rate": 7.802338917998058e-05, + "loss": 0.011510295793414116, + "num_input_tokens_seen": 85810240, + "step": 5240, + "train_runtime": 42579.5714, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.1763636363636363, + "grad_norm": 0.008393880911171436, + "learning_rate": 7.801542480393849e-05, + "loss": 0.012440541759133339, + "num_input_tokens_seen": 85826616, + "step": 5241, + "train_runtime": 42587.6897, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.1769696969696968, + "grad_norm": 0.006428099237382412, + "learning_rate": 7.800745939166039e-05, + "loss": 0.012294844724237919, + "num_input_tokens_seen": 85842992, + "step": 5242, + "train_runtime": 42595.8049, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.1775757575757577, + "grad_norm": 0.019595114514231682, + "learning_rate": 7.79994929434409e-05, + "loss": 0.013041215017437935, + "num_input_tokens_seen": 85859368, + "step": 5243, + "train_runtime": 42603.9342, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.178181818181818, + "grad_norm": 0.007731567602604628, + "learning_rate": 7.799152545957472e-05, + "loss": 0.012966437265276909, + "num_input_tokens_seen": 85875744, + "step": 5244, + "train_runtime": 42612.0507, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.1787878787878787, + "grad_norm": 0.0077772801741957664, + "learning_rate": 7.798355694035649e-05, + "loss": 0.012168207205832005, + "num_input_tokens_seen": 85892120, + "step": 5245, + "train_runtime": 42620.1644, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.179393939393939, + "grad_norm": 0.009484034031629562, + "learning_rate": 7.797558738608099e-05, + "loss": 0.012143932282924652, + "num_input_tokens_seen": 85908496, + "step": 5246, + "train_runtime": 42628.2788, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.18, + "grad_norm": 0.006290920544415712, + "learning_rate": 7.796761679704301e-05, + "loss": 0.012149149551987648, + "num_input_tokens_seen": 85924872, + "step": 5247, + "train_runtime": 42636.3927, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.1806060606060607, + "grad_norm": 0.010773113928735256, + "learning_rate": 7.795964517353735e-05, + "loss": 0.012508148327469826, + "num_input_tokens_seen": 85941248, + "step": 5248, + "train_runtime": 42644.5072, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.181212121212121, + "grad_norm": 0.009889576584100723, + "learning_rate": 7.795167251585886e-05, + "loss": 0.012305041775107384, + "num_input_tokens_seen": 85957624, + "step": 5249, + "train_runtime": 42652.6201, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.1818181818181817, + "grad_norm": 0.009313871152698994, + "learning_rate": 7.794369882430243e-05, + "loss": 0.012120736762881279, + "num_input_tokens_seen": 85974000, + "step": 5250, + "train_runtime": 42660.7328, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.1824242424242426, + "grad_norm": 0.008882635273039341, + "learning_rate": 7.7935724099163e-05, + "loss": 0.012331506237387657, + "num_input_tokens_seen": 85990376, + "step": 5251, + "train_runtime": 42668.8489, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.183030303030303, + "grad_norm": 0.01780118979513645, + "learning_rate": 7.792774834073553e-05, + "loss": 0.012708324939012527, + "num_input_tokens_seen": 86006752, + "step": 5252, + "train_runtime": 42676.9662, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.1836363636363636, + "grad_norm": 0.008553601801395416, + "learning_rate": 7.791977154931505e-05, + "loss": 0.012197354808449745, + "num_input_tokens_seen": 86023128, + "step": 5253, + "train_runtime": 42685.0825, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.184242424242424, + "grad_norm": 0.005463483277708292, + "learning_rate": 7.791179372519659e-05, + "loss": 0.011639590375125408, + "num_input_tokens_seen": 86039504, + "step": 5254, + "train_runtime": 42693.2015, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.184848484848485, + "grad_norm": 0.009717253036797047, + "learning_rate": 7.790381486867522e-05, + "loss": 0.011988072656095028, + "num_input_tokens_seen": 86055880, + "step": 5255, + "train_runtime": 42701.3208, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.1854545454545455, + "grad_norm": 0.0026770310942083597, + "learning_rate": 7.78958349800461e-05, + "loss": 0.01169260498136282, + "num_input_tokens_seen": 86072256, + "step": 5256, + "train_runtime": 42709.4381, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.186060606060606, + "grad_norm": 0.00585093442350626, + "learning_rate": 7.788785405960436e-05, + "loss": 0.012223422527313232, + "num_input_tokens_seen": 86088632, + "step": 5257, + "train_runtime": 42717.5573, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.1866666666666665, + "grad_norm": 0.00566079979762435, + "learning_rate": 7.78798721076452e-05, + "loss": 0.012459107674658298, + "num_input_tokens_seen": 86105008, + "step": 5258, + "train_runtime": 42725.6725, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.1872727272727275, + "grad_norm": 0.007028930354863405, + "learning_rate": 7.787188912446389e-05, + "loss": 0.011840027756989002, + "num_input_tokens_seen": 86121384, + "step": 5259, + "train_runtime": 42733.7894, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.187878787878788, + "grad_norm": 0.008420264348387718, + "learning_rate": 7.786390511035564e-05, + "loss": 0.01269557885825634, + "num_input_tokens_seen": 86137760, + "step": 5260, + "train_runtime": 42741.9079, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.1884848484848485, + "grad_norm": 0.006593767553567886, + "learning_rate": 7.785592006561582e-05, + "loss": 0.01272114273160696, + "num_input_tokens_seen": 86154136, + "step": 5261, + "train_runtime": 42750.0331, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.189090909090909, + "grad_norm": 0.00321990461088717, + "learning_rate": 7.784793399053978e-05, + "loss": 0.011445445939898491, + "num_input_tokens_seen": 86170512, + "step": 5262, + "train_runtime": 42758.1541, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.1896969696969695, + "grad_norm": 0.008353577926754951, + "learning_rate": 7.783994688542288e-05, + "loss": 0.011499913409352303, + "num_input_tokens_seen": 86186888, + "step": 5263, + "train_runtime": 42766.2688, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.1903030303030304, + "grad_norm": 0.00946095772087574, + "learning_rate": 7.783195875056056e-05, + "loss": 0.011337222531437874, + "num_input_tokens_seen": 86203264, + "step": 5264, + "train_runtime": 42774.387, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.190909090909091, + "grad_norm": 0.01103484071791172, + "learning_rate": 7.782396958624829e-05, + "loss": 0.013117716647684574, + "num_input_tokens_seen": 86219640, + "step": 5265, + "train_runtime": 42782.5011, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.1915151515151514, + "grad_norm": 0.008302144706249237, + "learning_rate": 7.781597939278156e-05, + "loss": 0.012658239342272282, + "num_input_tokens_seen": 86236016, + "step": 5266, + "train_runtime": 42790.6163, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.192121212121212, + "grad_norm": 0.01597311906516552, + "learning_rate": 7.780798817045593e-05, + "loss": 0.01138401497155428, + "num_input_tokens_seen": 86252392, + "step": 5267, + "train_runtime": 42798.7321, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.192727272727273, + "grad_norm": 0.003429886419326067, + "learning_rate": 7.779999591956697e-05, + "loss": 0.012237127870321274, + "num_input_tokens_seen": 86268768, + "step": 5268, + "train_runtime": 42806.8457, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.1933333333333334, + "grad_norm": 0.007669529411941767, + "learning_rate": 7.779200264041029e-05, + "loss": 0.012243789620697498, + "num_input_tokens_seen": 86285144, + "step": 5269, + "train_runtime": 42814.9656, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.193939393939394, + "grad_norm": 0.006597874686121941, + "learning_rate": 7.778400833328156e-05, + "loss": 0.012067180126905441, + "num_input_tokens_seen": 86301520, + "step": 5270, + "train_runtime": 42823.0852, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.1945454545454544, + "grad_norm": 0.019583402201533318, + "learning_rate": 7.777601299847648e-05, + "loss": 0.012912326492369175, + "num_input_tokens_seen": 86317896, + "step": 5271, + "train_runtime": 42831.1971, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.1951515151515153, + "grad_norm": 0.0081368088722229, + "learning_rate": 7.776801663629077e-05, + "loss": 0.012012864463031292, + "num_input_tokens_seen": 86334272, + "step": 5272, + "train_runtime": 42839.3142, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.195757575757576, + "grad_norm": 0.009910166263580322, + "learning_rate": 7.776001924702017e-05, + "loss": 0.011483779177069664, + "num_input_tokens_seen": 86350648, + "step": 5273, + "train_runtime": 42847.4315, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.1963636363636363, + "grad_norm": 0.0065487949177622795, + "learning_rate": 7.775202083096054e-05, + "loss": 0.01277802512049675, + "num_input_tokens_seen": 86367024, + "step": 5274, + "train_runtime": 42855.5497, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.196969696969697, + "grad_norm": 0.011777360923588276, + "learning_rate": 7.774402138840771e-05, + "loss": 0.012683426029980183, + "num_input_tokens_seen": 86383400, + "step": 5275, + "train_runtime": 42863.6734, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.1975757575757577, + "grad_norm": 0.005514588672667742, + "learning_rate": 7.773602091965754e-05, + "loss": 0.010970336385071278, + "num_input_tokens_seen": 86399776, + "step": 5276, + "train_runtime": 42871.7921, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.1981818181818182, + "grad_norm": 0.010076678358018398, + "learning_rate": 7.7728019425006e-05, + "loss": 0.011914110742509365, + "num_input_tokens_seen": 86416152, + "step": 5277, + "train_runtime": 42879.9089, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.1987878787878787, + "grad_norm": 0.007916816510260105, + "learning_rate": 7.7720016904749e-05, + "loss": 0.011118197813630104, + "num_input_tokens_seen": 86432528, + "step": 5278, + "train_runtime": 42888.0323, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.1993939393939392, + "grad_norm": 0.010973465628921986, + "learning_rate": 7.771201335918254e-05, + "loss": 0.012102299369871616, + "num_input_tokens_seen": 86448904, + "step": 5279, + "train_runtime": 42896.1453, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.2, + "grad_norm": 0.001266568317078054, + "learning_rate": 7.77040087886027e-05, + "loss": 0.011844750493764877, + "num_input_tokens_seen": 86465280, + "step": 5280, + "train_runtime": 42904.2604, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.2006060606060607, + "grad_norm": 0.0060650077648460865, + "learning_rate": 7.769600319330552e-05, + "loss": 0.012039451859891415, + "num_input_tokens_seen": 86481656, + "step": 5281, + "train_runtime": 42912.3743, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.201212121212121, + "grad_norm": 0.0061028823256492615, + "learning_rate": 7.768799657358713e-05, + "loss": 0.011353418231010437, + "num_input_tokens_seen": 86498032, + "step": 5282, + "train_runtime": 42920.488, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.2018181818181817, + "grad_norm": 0.00689903786405921, + "learning_rate": 7.767998892974364e-05, + "loss": 0.012870377860963345, + "num_input_tokens_seen": 86514408, + "step": 5283, + "train_runtime": 42928.6023, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.2024242424242426, + "grad_norm": 0.004847546573728323, + "learning_rate": 7.767198026207127e-05, + "loss": 0.011542870663106441, + "num_input_tokens_seen": 86530784, + "step": 5284, + "train_runtime": 42936.7174, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.203030303030303, + "grad_norm": 0.004718760494142771, + "learning_rate": 7.766397057086624e-05, + "loss": 0.011087102815508842, + "num_input_tokens_seen": 86547160, + "step": 5285, + "train_runtime": 42944.8316, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.2036363636363636, + "grad_norm": 0.008538652211427689, + "learning_rate": 7.765595985642483e-05, + "loss": 0.011677338741719723, + "num_input_tokens_seen": 86563536, + "step": 5286, + "train_runtime": 42952.9466, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.204242424242424, + "grad_norm": 0.007054450921714306, + "learning_rate": 7.764794811904329e-05, + "loss": 0.01375613547861576, + "num_input_tokens_seen": 86579912, + "step": 5287, + "train_runtime": 42961.0593, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.204848484848485, + "grad_norm": 0.008712433278560638, + "learning_rate": 7.763993535901802e-05, + "loss": 0.012831787578761578, + "num_input_tokens_seen": 86596288, + "step": 5288, + "train_runtime": 42969.1743, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.2054545454545456, + "grad_norm": 0.00729356100782752, + "learning_rate": 7.763192157664535e-05, + "loss": 0.011344632133841515, + "num_input_tokens_seen": 86612664, + "step": 5289, + "train_runtime": 42977.2919, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.206060606060606, + "grad_norm": 0.007807271089404821, + "learning_rate": 7.762390677222171e-05, + "loss": 0.011946570128202438, + "num_input_tokens_seen": 86629040, + "step": 5290, + "train_runtime": 42985.4051, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.2066666666666666, + "grad_norm": 0.0030004698783159256, + "learning_rate": 7.761589094604357e-05, + "loss": 0.01230529323220253, + "num_input_tokens_seen": 86645416, + "step": 5291, + "train_runtime": 42993.523, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.207272727272727, + "grad_norm": 0.003309942316263914, + "learning_rate": 7.76078740984074e-05, + "loss": 0.01205356977880001, + "num_input_tokens_seen": 86661792, + "step": 5292, + "train_runtime": 43001.6372, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.207878787878788, + "grad_norm": 0.009860580787062645, + "learning_rate": 7.759985622960973e-05, + "loss": 0.013056493364274502, + "num_input_tokens_seen": 86678168, + "step": 5293, + "train_runtime": 43009.7493, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.2084848484848485, + "grad_norm": 0.004878977779299021, + "learning_rate": 7.75918373399471e-05, + "loss": 0.011190814897418022, + "num_input_tokens_seen": 86694544, + "step": 5294, + "train_runtime": 43017.8659, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.209090909090909, + "grad_norm": 0.007417880930006504, + "learning_rate": 7.758381742971617e-05, + "loss": 0.012997215613722801, + "num_input_tokens_seen": 86710920, + "step": 5295, + "train_runtime": 43025.9813, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.2096969696969695, + "grad_norm": 0.007833002135157585, + "learning_rate": 7.757579649921354e-05, + "loss": 0.012064127251505852, + "num_input_tokens_seen": 86727296, + "step": 5296, + "train_runtime": 43034.0993, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.2103030303030304, + "grad_norm": 0.005606712307780981, + "learning_rate": 7.75677745487359e-05, + "loss": 0.012654859572649002, + "num_input_tokens_seen": 86743672, + "step": 5297, + "train_runtime": 43042.2171, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.210909090909091, + "grad_norm": 0.008390406146645546, + "learning_rate": 7.755975157857995e-05, + "loss": 0.013293388299643993, + "num_input_tokens_seen": 86760048, + "step": 5298, + "train_runtime": 43050.336, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.2115151515151514, + "grad_norm": 0.0089068952947855, + "learning_rate": 7.755172758904249e-05, + "loss": 0.01230605598539114, + "num_input_tokens_seen": 86776424, + "step": 5299, + "train_runtime": 43058.4551, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.212121212121212, + "grad_norm": 0.007790511008352041, + "learning_rate": 7.754370258042025e-05, + "loss": 0.011736944317817688, + "num_input_tokens_seen": 86792800, + "step": 5300, + "train_runtime": 43066.5672, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.212727272727273, + "grad_norm": 0.009128447622060776, + "learning_rate": 7.753567655301012e-05, + "loss": 0.012180618941783905, + "num_input_tokens_seen": 86809176, + "step": 5301, + "train_runtime": 43075.8067, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 3.2133333333333334, + "grad_norm": 0.004686056170612574, + "learning_rate": 7.752764950710891e-05, + "loss": 0.01208141352981329, + "num_input_tokens_seen": 86825552, + "step": 5302, + "train_runtime": 43083.9218, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 3.213939393939394, + "grad_norm": 0.011579180136322975, + "learning_rate": 7.751962144301357e-05, + "loss": 0.012394418008625507, + "num_input_tokens_seen": 86841928, + "step": 5303, + "train_runtime": 43092.0328, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 3.2145454545454544, + "grad_norm": 0.008243214339017868, + "learning_rate": 7.751159236102103e-05, + "loss": 0.011955955065786839, + "num_input_tokens_seen": 86858304, + "step": 5304, + "train_runtime": 43100.146, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 3.2151515151515153, + "grad_norm": 0.005036482121795416, + "learning_rate": 7.750356226142826e-05, + "loss": 0.01322841551154852, + "num_input_tokens_seen": 86874680, + "step": 5305, + "train_runtime": 43108.2558, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 3.215757575757576, + "grad_norm": 0.0039468444883823395, + "learning_rate": 7.749553114453228e-05, + "loss": 0.010403268039226532, + "num_input_tokens_seen": 86891056, + "step": 5306, + "train_runtime": 43116.3694, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 3.2163636363636363, + "grad_norm": 0.007880299352109432, + "learning_rate": 7.748749901063014e-05, + "loss": 0.011753161437809467, + "num_input_tokens_seen": 86907432, + "step": 5307, + "train_runtime": 43124.4895, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 3.216969696969697, + "grad_norm": 0.005228618625551462, + "learning_rate": 7.747946586001894e-05, + "loss": 0.012010081671178341, + "num_input_tokens_seen": 86923808, + "step": 5308, + "train_runtime": 43132.6108, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 3.2175757575757578, + "grad_norm": 0.007711697835475206, + "learning_rate": 7.747143169299582e-05, + "loss": 0.012208916246891022, + "num_input_tokens_seen": 86940184, + "step": 5309, + "train_runtime": 43140.7315, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 3.2181818181818183, + "grad_norm": 0.00970266479998827, + "learning_rate": 7.746339650985795e-05, + "loss": 0.014195114374160767, + "num_input_tokens_seen": 86956560, + "step": 5310, + "train_runtime": 43148.8467, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 3.2187878787878788, + "grad_norm": 0.005777402780950069, + "learning_rate": 7.745536031090252e-05, + "loss": 0.013544959016144276, + "num_input_tokens_seen": 86972936, + "step": 5311, + "train_runtime": 43156.9614, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 3.2193939393939393, + "grad_norm": 0.012430955655872822, + "learning_rate": 7.744732309642678e-05, + "loss": 0.013212257996201515, + "num_input_tokens_seen": 86989312, + "step": 5312, + "train_runtime": 43165.0718, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 3.22, + "grad_norm": 0.0076065948233008385, + "learning_rate": 7.743928486672799e-05, + "loss": 0.012111306190490723, + "num_input_tokens_seen": 87005688, + "step": 5313, + "train_runtime": 43173.1877, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 3.2206060606060607, + "grad_norm": 0.006419904995709658, + "learning_rate": 7.74312456221035e-05, + "loss": 0.011076908558607101, + "num_input_tokens_seen": 87022064, + "step": 5314, + "train_runtime": 43181.3047, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 3.221212121212121, + "grad_norm": 0.005693621467798948, + "learning_rate": 7.742320536285066e-05, + "loss": 0.01254379190504551, + "num_input_tokens_seen": 87038440, + "step": 5315, + "train_runtime": 43189.4198, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 3.2218181818181817, + "grad_norm": 0.0037814192473888397, + "learning_rate": 7.741516408926686e-05, + "loss": 0.01170978881418705, + "num_input_tokens_seen": 87054816, + "step": 5316, + "train_runtime": 43197.5361, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 3.2224242424242426, + "grad_norm": 0.011709507554769516, + "learning_rate": 7.740712180164952e-05, + "loss": 0.012792158871889114, + "num_input_tokens_seen": 87071192, + "step": 5317, + "train_runtime": 43205.6547, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 3.223030303030303, + "grad_norm": 0.0018288815626874566, + "learning_rate": 7.739907850029612e-05, + "loss": 0.013272666372358799, + "num_input_tokens_seen": 87087568, + "step": 5318, + "train_runtime": 43213.766, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 3.2236363636363636, + "grad_norm": 0.002453134162351489, + "learning_rate": 7.739103418550416e-05, + "loss": 0.012618345208466053, + "num_input_tokens_seen": 87103944, + "step": 5319, + "train_runtime": 43221.8813, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 3.224242424242424, + "grad_norm": 0.007189453113824129, + "learning_rate": 7.73829888575712e-05, + "loss": 0.01104170735925436, + "num_input_tokens_seen": 87120320, + "step": 5320, + "train_runtime": 43229.9971, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 3.2248484848484846, + "grad_norm": 0.006664356216788292, + "learning_rate": 7.737494251679479e-05, + "loss": 0.011220650747418404, + "num_input_tokens_seen": 87136696, + "step": 5321, + "train_runtime": 43238.1099, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 3.2254545454545456, + "grad_norm": 0.029378481209278107, + "learning_rate": 7.736689516347258e-05, + "loss": 0.0126795070245862, + "num_input_tokens_seen": 87153072, + "step": 5322, + "train_runtime": 43246.2311, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 3.226060606060606, + "grad_norm": 0.006079778540879488, + "learning_rate": 7.73588467979022e-05, + "loss": 0.011886589229106903, + "num_input_tokens_seen": 87169448, + "step": 5323, + "train_runtime": 43254.3458, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 3.2266666666666666, + "grad_norm": 0.00680855568498373, + "learning_rate": 7.735079742038138e-05, + "loss": 0.011918196454644203, + "num_input_tokens_seen": 87185824, + "step": 5324, + "train_runtime": 43262.4618, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.227272727272727, + "grad_norm": 0.0083842808380723, + "learning_rate": 7.73427470312078e-05, + "loss": 0.01237604022026062, + "num_input_tokens_seen": 87202200, + "step": 5325, + "train_runtime": 43270.5765, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.227878787878788, + "grad_norm": 0.0039710090495646, + "learning_rate": 7.733469563067928e-05, + "loss": 0.012126031331717968, + "num_input_tokens_seen": 87218576, + "step": 5326, + "train_runtime": 43278.6914, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 3.2284848484848485, + "grad_norm": 0.0046617332845926285, + "learning_rate": 7.732664321909357e-05, + "loss": 0.011709000915288925, + "num_input_tokens_seen": 87234952, + "step": 5327, + "train_runtime": 43286.8115, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 3.229090909090909, + "grad_norm": 0.007795905694365501, + "learning_rate": 7.731858979674857e-05, + "loss": 0.012107725255191326, + "num_input_tokens_seen": 87251328, + "step": 5328, + "train_runtime": 43294.933, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 3.2296969696969695, + "grad_norm": 0.008231372572481632, + "learning_rate": 7.731053536394212e-05, + "loss": 0.01321522518992424, + "num_input_tokens_seen": 87267704, + "step": 5329, + "train_runtime": 43303.0489, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 3.2303030303030305, + "grad_norm": 0.005577977746725082, + "learning_rate": 7.730247992097214e-05, + "loss": 0.012189840897917747, + "num_input_tokens_seen": 87284080, + "step": 5330, + "train_runtime": 43311.1616, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 3.230909090909091, + "grad_norm": 0.008244067430496216, + "learning_rate": 7.729442346813662e-05, + "loss": 0.012310037389397621, + "num_input_tokens_seen": 87300456, + "step": 5331, + "train_runtime": 43319.2746, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.2315151515151515, + "grad_norm": 0.0049143703654408455, + "learning_rate": 7.728636600573354e-05, + "loss": 0.010644476860761642, + "num_input_tokens_seen": 87316832, + "step": 5332, + "train_runtime": 43327.3899, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.232121212121212, + "grad_norm": 0.0055672465823590755, + "learning_rate": 7.72783075340609e-05, + "loss": 0.012934127822518349, + "num_input_tokens_seen": 87333208, + "step": 5333, + "train_runtime": 43335.5077, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.232727272727273, + "grad_norm": 0.011581599712371826, + "learning_rate": 7.727024805341678e-05, + "loss": 0.01170682068914175, + "num_input_tokens_seen": 87349584, + "step": 5334, + "train_runtime": 43343.6209, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.2333333333333334, + "grad_norm": 0.008286160416901112, + "learning_rate": 7.72621875640993e-05, + "loss": 0.0123090585693717, + "num_input_tokens_seen": 87365960, + "step": 5335, + "train_runtime": 43351.7427, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.233939393939394, + "grad_norm": 0.006604601163417101, + "learning_rate": 7.725412606640658e-05, + "loss": 0.011820787563920021, + "num_input_tokens_seen": 87382336, + "step": 5336, + "train_runtime": 43359.8603, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.2345454545454544, + "grad_norm": 0.007266121916472912, + "learning_rate": 7.724606356063684e-05, + "loss": 0.012546796351671219, + "num_input_tokens_seen": 87398712, + "step": 5337, + "train_runtime": 43367.9764, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.2351515151515153, + "grad_norm": 0.005835233721882105, + "learning_rate": 7.723800004708825e-05, + "loss": 0.010815868154168129, + "num_input_tokens_seen": 87415088, + "step": 5338, + "train_runtime": 43376.0892, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.235757575757576, + "grad_norm": 0.005590966437011957, + "learning_rate": 7.722993552605909e-05, + "loss": 0.012126067653298378, + "num_input_tokens_seen": 87431464, + "step": 5339, + "train_runtime": 43384.2078, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.2363636363636363, + "grad_norm": 0.004200849682092667, + "learning_rate": 7.722186999784762e-05, + "loss": 0.011630792170763016, + "num_input_tokens_seen": 87447840, + "step": 5340, + "train_runtime": 43392.3213, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.236969696969697, + "grad_norm": 0.024852680042386055, + "learning_rate": 7.721380346275222e-05, + "loss": 0.01237387116998434, + "num_input_tokens_seen": 87464216, + "step": 5341, + "train_runtime": 43400.4376, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.2375757575757578, + "grad_norm": 0.003985259681940079, + "learning_rate": 7.720573592107121e-05, + "loss": 0.011438102461397648, + "num_input_tokens_seen": 87480592, + "step": 5342, + "train_runtime": 43408.5535, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.2381818181818183, + "grad_norm": 0.004417522810399532, + "learning_rate": 7.719766737310301e-05, + "loss": 0.012218753807246685, + "num_input_tokens_seen": 87496968, + "step": 5343, + "train_runtime": 43416.6686, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.2387878787878788, + "grad_norm": 0.007038233336061239, + "learning_rate": 7.718959781914606e-05, + "loss": 0.012582189403474331, + "num_input_tokens_seen": 87513344, + "step": 5344, + "train_runtime": 43424.7802, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.2393939393939393, + "grad_norm": 0.006807398982346058, + "learning_rate": 7.718152725949883e-05, + "loss": 0.012334870174527168, + "num_input_tokens_seen": 87529720, + "step": 5345, + "train_runtime": 43432.8927, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.24, + "grad_norm": 0.01821281388401985, + "learning_rate": 7.717345569445986e-05, + "loss": 0.01265687495470047, + "num_input_tokens_seen": 87546096, + "step": 5346, + "train_runtime": 43441.0057, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.2406060606060607, + "grad_norm": 0.007850013673305511, + "learning_rate": 7.716538312432766e-05, + "loss": 0.012506591156125069, + "num_input_tokens_seen": 87562472, + "step": 5347, + "train_runtime": 43449.1196, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.241212121212121, + "grad_norm": 0.006626442540436983, + "learning_rate": 7.715730954940084e-05, + "loss": 0.012080012820661068, + "num_input_tokens_seen": 87578848, + "step": 5348, + "train_runtime": 43457.2326, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.2418181818181817, + "grad_norm": 0.027083097025752068, + "learning_rate": 7.714923496997805e-05, + "loss": 0.013666247017681599, + "num_input_tokens_seen": 87595224, + "step": 5349, + "train_runtime": 43465.3446, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.242424242424242, + "grad_norm": 0.008418912068009377, + "learning_rate": 7.714115938635791e-05, + "loss": 0.01146827545017004, + "num_input_tokens_seen": 87611600, + "step": 5350, + "train_runtime": 43473.4655, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.243030303030303, + "grad_norm": 0.006113213486969471, + "learning_rate": 7.713308279883915e-05, + "loss": 0.01078350655734539, + "num_input_tokens_seen": 87627976, + "step": 5351, + "train_runtime": 43481.5841, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.2436363636363637, + "grad_norm": 0.013689510524272919, + "learning_rate": 7.712500520772048e-05, + "loss": 0.012107168324291706, + "num_input_tokens_seen": 87644352, + "step": 5352, + "train_runtime": 43489.6961, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.244242424242424, + "grad_norm": 0.005991401616483927, + "learning_rate": 7.71169266133007e-05, + "loss": 0.012074320577085018, + "num_input_tokens_seen": 87660728, + "step": 5353, + "train_runtime": 43497.8114, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.2448484848484846, + "grad_norm": 0.00826292484998703, + "learning_rate": 7.710884701587861e-05, + "loss": 0.012735782191157341, + "num_input_tokens_seen": 87677104, + "step": 5354, + "train_runtime": 43505.9324, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.2454545454545456, + "grad_norm": 0.010154642164707184, + "learning_rate": 7.710076641575308e-05, + "loss": 0.01338373776525259, + "num_input_tokens_seen": 87693480, + "step": 5355, + "train_runtime": 43514.0494, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.246060606060606, + "grad_norm": 0.011363768018782139, + "learning_rate": 7.709268481322296e-05, + "loss": 0.011802969500422478, + "num_input_tokens_seen": 87709856, + "step": 5356, + "train_runtime": 43522.1628, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.2466666666666666, + "grad_norm": 0.004688591696321964, + "learning_rate": 7.708460220858719e-05, + "loss": 0.011524985544383526, + "num_input_tokens_seen": 87726232, + "step": 5357, + "train_runtime": 43530.2817, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.247272727272727, + "grad_norm": 0.00529836118221283, + "learning_rate": 7.707651860214473e-05, + "loss": 0.0113229313865304, + "num_input_tokens_seen": 87742608, + "step": 5358, + "train_runtime": 43538.3999, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.247878787878788, + "grad_norm": 0.006621887441724539, + "learning_rate": 7.706843399419456e-05, + "loss": 0.011427229270339012, + "num_input_tokens_seen": 87758984, + "step": 5359, + "train_runtime": 43546.5147, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.2484848484848485, + "grad_norm": 0.00808833446353674, + "learning_rate": 7.706034838503577e-05, + "loss": 0.012164801359176636, + "num_input_tokens_seen": 87775360, + "step": 5360, + "train_runtime": 43554.6321, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.249090909090909, + "grad_norm": 0.007826986722648144, + "learning_rate": 7.705226177496736e-05, + "loss": 0.012866008095443249, + "num_input_tokens_seen": 87791736, + "step": 5361, + "train_runtime": 43562.7487, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.2496969696969695, + "grad_norm": 0.008997034281492233, + "learning_rate": 7.704417416428848e-05, + "loss": 0.012195097282528877, + "num_input_tokens_seen": 87808112, + "step": 5362, + "train_runtime": 43570.8645, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.2503030303030305, + "grad_norm": 0.007358288858085871, + "learning_rate": 7.703608555329825e-05, + "loss": 0.012331872247159481, + "num_input_tokens_seen": 87824488, + "step": 5363, + "train_runtime": 43578.9792, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.250909090909091, + "grad_norm": 0.005960812792181969, + "learning_rate": 7.702799594229588e-05, + "loss": 0.011205797083675861, + "num_input_tokens_seen": 87840864, + "step": 5364, + "train_runtime": 43587.0975, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.2515151515151515, + "grad_norm": 0.003501742845401168, + "learning_rate": 7.701990533158057e-05, + "loss": 0.011403818614780903, + "num_input_tokens_seen": 87857240, + "step": 5365, + "train_runtime": 43595.2123, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.252121212121212, + "grad_norm": 0.0071466476656496525, + "learning_rate": 7.701181372145159e-05, + "loss": 0.012449138797819614, + "num_input_tokens_seen": 87873616, + "step": 5366, + "train_runtime": 43603.3327, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.252727272727273, + "grad_norm": 0.00786797795444727, + "learning_rate": 7.700372111220819e-05, + "loss": 0.013322414830327034, + "num_input_tokens_seen": 87889992, + "step": 5367, + "train_runtime": 43611.4491, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.2533333333333334, + "grad_norm": 0.013827753253281116, + "learning_rate": 7.699562750414977e-05, + "loss": 0.012144897133111954, + "num_input_tokens_seen": 87906368, + "step": 5368, + "train_runtime": 43619.5621, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.253939393939394, + "grad_norm": 0.010182446800172329, + "learning_rate": 7.698753289757565e-05, + "loss": 0.01119221467524767, + "num_input_tokens_seen": 87922744, + "step": 5369, + "train_runtime": 43627.6764, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.2545454545454544, + "grad_norm": 0.008027137257158756, + "learning_rate": 7.697943729278524e-05, + "loss": 0.012467332184314728, + "num_input_tokens_seen": 87939120, + "step": 5370, + "train_runtime": 43635.7904, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.255151515151515, + "grad_norm": 0.00801102351397276, + "learning_rate": 7.697134069007799e-05, + "loss": 0.012449929490685463, + "num_input_tokens_seen": 87955496, + "step": 5371, + "train_runtime": 43643.9073, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.255757575757576, + "grad_norm": 0.010629222728312016, + "learning_rate": 7.696324308975335e-05, + "loss": 0.012017110362648964, + "num_input_tokens_seen": 87971872, + "step": 5372, + "train_runtime": 43652.0321, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.2563636363636363, + "grad_norm": 0.00643131835386157, + "learning_rate": 7.69551444921109e-05, + "loss": 0.011855978518724442, + "num_input_tokens_seen": 87988248, + "step": 5373, + "train_runtime": 43660.1502, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.256969696969697, + "grad_norm": 0.004472612868994474, + "learning_rate": 7.694704489745012e-05, + "loss": 0.012562030926346779, + "num_input_tokens_seen": 88004624, + "step": 5374, + "train_runtime": 43668.2604, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.257575757575758, + "grad_norm": 0.007085151504725218, + "learning_rate": 7.693894430607062e-05, + "loss": 0.013197656720876694, + "num_input_tokens_seen": 88021000, + "step": 5375, + "train_runtime": 43676.3754, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.2581818181818183, + "grad_norm": 0.005621409974992275, + "learning_rate": 7.693084271827205e-05, + "loss": 0.012143141590058804, + "num_input_tokens_seen": 88037376, + "step": 5376, + "train_runtime": 43684.4912, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.258787878787879, + "grad_norm": 0.0070752911269664764, + "learning_rate": 7.692274013435403e-05, + "loss": 0.013047544285655022, + "num_input_tokens_seen": 88053752, + "step": 5377, + "train_runtime": 43692.6105, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.2593939393939393, + "grad_norm": 0.006566370837390423, + "learning_rate": 7.69146365546163e-05, + "loss": 0.012332148849964142, + "num_input_tokens_seen": 88070128, + "step": 5378, + "train_runtime": 43700.7229, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.26, + "grad_norm": 0.008157361298799515, + "learning_rate": 7.69065319793586e-05, + "loss": 0.013099174946546555, + "num_input_tokens_seen": 88086504, + "step": 5379, + "train_runtime": 43708.8423, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.2606060606060607, + "grad_norm": 0.018293771892786026, + "learning_rate": 7.689842640888063e-05, + "loss": 0.0133364237844944, + "num_input_tokens_seen": 88102880, + "step": 5380, + "train_runtime": 43716.9556, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.2612121212121212, + "grad_norm": 0.005958425346761942, + "learning_rate": 7.689031984348227e-05, + "loss": 0.01234077475965023, + "num_input_tokens_seen": 88119256, + "step": 5381, + "train_runtime": 43725.07, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.2618181818181817, + "grad_norm": 0.0089218495413661, + "learning_rate": 7.688221228346335e-05, + "loss": 0.012801197357475758, + "num_input_tokens_seen": 88135632, + "step": 5382, + "train_runtime": 43733.1815, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.2624242424242427, + "grad_norm": 0.008774648420512676, + "learning_rate": 7.687410372912376e-05, + "loss": 0.011589978821575642, + "num_input_tokens_seen": 88152008, + "step": 5383, + "train_runtime": 43741.2949, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.263030303030303, + "grad_norm": 0.028292754665017128, + "learning_rate": 7.686599418076339e-05, + "loss": 0.01291901245713234, + "num_input_tokens_seen": 88168384, + "step": 5384, + "train_runtime": 43749.4093, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.2636363636363637, + "grad_norm": 0.007374802604317665, + "learning_rate": 7.685788363868223e-05, + "loss": 0.012309200130403042, + "num_input_tokens_seen": 88184760, + "step": 5385, + "train_runtime": 43757.5316, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.264242424242424, + "grad_norm": 0.005209075752645731, + "learning_rate": 7.684977210318024e-05, + "loss": 0.012631790712475777, + "num_input_tokens_seen": 88201136, + "step": 5386, + "train_runtime": 43765.6472, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.2648484848484847, + "grad_norm": 0.007359870243817568, + "learning_rate": 7.684165957455748e-05, + "loss": 0.011531918309628963, + "num_input_tokens_seen": 88217512, + "step": 5387, + "train_runtime": 43773.7644, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.2654545454545456, + "grad_norm": 0.006204367149621248, + "learning_rate": 7.6833546053114e-05, + "loss": 0.012570096179842949, + "num_input_tokens_seen": 88233888, + "step": 5388, + "train_runtime": 43781.878, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.266060606060606, + "grad_norm": 0.005969122517853975, + "learning_rate": 7.68254315391499e-05, + "loss": 0.011657875962555408, + "num_input_tokens_seen": 88250264, + "step": 5389, + "train_runtime": 43789.995, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 0.005659397691488266, + "learning_rate": 7.681731603296532e-05, + "loss": 0.01221960037946701, + "num_input_tokens_seen": 88266640, + "step": 5390, + "train_runtime": 43798.1111, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.267272727272727, + "grad_norm": 0.0069372644647955894, + "learning_rate": 7.680919953486048e-05, + "loss": 0.011819953098893166, + "num_input_tokens_seen": 88283016, + "step": 5391, + "train_runtime": 43806.2318, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.267878787878788, + "grad_norm": 0.007142509333789349, + "learning_rate": 7.680108204513552e-05, + "loss": 0.012626123614609241, + "num_input_tokens_seen": 88299392, + "step": 5392, + "train_runtime": 43814.3428, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.2684848484848485, + "grad_norm": 0.007783539593219757, + "learning_rate": 7.679296356409075e-05, + "loss": 0.011469131335616112, + "num_input_tokens_seen": 88315768, + "step": 5393, + "train_runtime": 43822.4562, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.269090909090909, + "grad_norm": 0.005784064996987581, + "learning_rate": 7.678484409202642e-05, + "loss": 0.011921831406652927, + "num_input_tokens_seen": 88332144, + "step": 5394, + "train_runtime": 43830.5701, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.2696969696969695, + "grad_norm": 0.009018114767968655, + "learning_rate": 7.677672362924288e-05, + "loss": 0.012291139923036098, + "num_input_tokens_seen": 88348520, + "step": 5395, + "train_runtime": 43838.6821, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.2703030303030305, + "grad_norm": 0.00671779690310359, + "learning_rate": 7.676860217604047e-05, + "loss": 0.012143897823989391, + "num_input_tokens_seen": 88364896, + "step": 5396, + "train_runtime": 43846.7946, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.270909090909091, + "grad_norm": 0.006585702300071716, + "learning_rate": 7.67604797327196e-05, + "loss": 0.01225357223302126, + "num_input_tokens_seen": 88381272, + "step": 5397, + "train_runtime": 43854.908, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.2715151515151515, + "grad_norm": 0.008013072423636913, + "learning_rate": 7.67523562995807e-05, + "loss": 0.012338386848568916, + "num_input_tokens_seen": 88397648, + "step": 5398, + "train_runtime": 43863.0214, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.272121212121212, + "grad_norm": 0.006657259538769722, + "learning_rate": 7.674423187692423e-05, + "loss": 0.011830288916826248, + "num_input_tokens_seen": 88414024, + "step": 5399, + "train_runtime": 43871.1395, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.2727272727272725, + "grad_norm": 0.009124872274696827, + "learning_rate": 7.673610646505072e-05, + "loss": 0.012330468744039536, + "num_input_tokens_seen": 88430400, + "step": 5400, + "train_runtime": 43879.2604, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.2733333333333334, + "grad_norm": 0.010633951053023338, + "learning_rate": 7.672798006426069e-05, + "loss": 0.012701372615993023, + "num_input_tokens_seen": 88446776, + "step": 5401, + "train_runtime": 43888.349, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 3.273939393939394, + "grad_norm": 0.007358280010521412, + "learning_rate": 7.671985267485474e-05, + "loss": 0.011870061047375202, + "num_input_tokens_seen": 88463152, + "step": 5402, + "train_runtime": 43896.4643, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 3.2745454545454544, + "grad_norm": 0.0025108593981713057, + "learning_rate": 7.671172429713345e-05, + "loss": 0.011547740548849106, + "num_input_tokens_seen": 88479528, + "step": 5403, + "train_runtime": 43904.5744, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 3.2751515151515154, + "grad_norm": 0.00886785052716732, + "learning_rate": 7.670359493139751e-05, + "loss": 0.012280113063752651, + "num_input_tokens_seen": 88495904, + "step": 5404, + "train_runtime": 43912.686, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 3.275757575757576, + "grad_norm": 0.011375620029866695, + "learning_rate": 7.66954645779476e-05, + "loss": 0.012922976166009903, + "num_input_tokens_seen": 88512280, + "step": 5405, + "train_runtime": 43920.7995, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 3.2763636363636364, + "grad_norm": 0.010332721285521984, + "learning_rate": 7.668733323708443e-05, + "loss": 0.01262161135673523, + "num_input_tokens_seen": 88528656, + "step": 5406, + "train_runtime": 43928.9121, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 3.276969696969697, + "grad_norm": 0.007661209441721439, + "learning_rate": 7.667920090910878e-05, + "loss": 0.012061371468007565, + "num_input_tokens_seen": 88545032, + "step": 5407, + "train_runtime": 43937.033, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 3.2775757575757574, + "grad_norm": 0.007089585531502962, + "learning_rate": 7.667106759432144e-05, + "loss": 0.012217340059578419, + "num_input_tokens_seen": 88561408, + "step": 5408, + "train_runtime": 43945.1477, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 3.2781818181818183, + "grad_norm": 0.005344622768461704, + "learning_rate": 7.666293329302326e-05, + "loss": 0.013070804998278618, + "num_input_tokens_seen": 88577784, + "step": 5409, + "train_runtime": 43953.2686, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 3.278787878787879, + "grad_norm": 0.008005596697330475, + "learning_rate": 7.665479800551509e-05, + "loss": 0.012912550009787083, + "num_input_tokens_seen": 88594160, + "step": 5410, + "train_runtime": 43961.3839, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 3.2793939393939393, + "grad_norm": 0.0043099066242575645, + "learning_rate": 7.664666173209787e-05, + "loss": 0.011820114217698574, + "num_input_tokens_seen": 88610536, + "step": 5411, + "train_runtime": 43969.5006, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 3.2800000000000002, + "grad_norm": 0.009404663927853107, + "learning_rate": 7.663852447307251e-05, + "loss": 0.012228131294250488, + "num_input_tokens_seen": 88626912, + "step": 5412, + "train_runtime": 43977.6119, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 3.2806060606060607, + "grad_norm": 0.010114171542227268, + "learning_rate": 7.663038622873999e-05, + "loss": 0.012550673447549343, + "num_input_tokens_seen": 88643288, + "step": 5413, + "train_runtime": 43985.7234, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 3.2812121212121212, + "grad_norm": 0.0058823456056416035, + "learning_rate": 7.662224699940137e-05, + "loss": 0.012166665866971016, + "num_input_tokens_seen": 88659664, + "step": 5414, + "train_runtime": 43993.8354, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 3.2818181818181817, + "grad_norm": 0.00713913980871439, + "learning_rate": 7.661410678535766e-05, + "loss": 0.01272317674010992, + "num_input_tokens_seen": 88676040, + "step": 5415, + "train_runtime": 44001.9492, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 3.2824242424242422, + "grad_norm": 0.012900741770863533, + "learning_rate": 7.660596558690998e-05, + "loss": 0.012671135365962982, + "num_input_tokens_seen": 88692416, + "step": 5416, + "train_runtime": 44010.0654, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 3.283030303030303, + "grad_norm": 0.004588556010276079, + "learning_rate": 7.659782340435944e-05, + "loss": 0.011600610800087452, + "num_input_tokens_seen": 88708792, + "step": 5417, + "train_runtime": 44018.1801, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 3.2836363636363637, + "grad_norm": 0.011387187987565994, + "learning_rate": 7.658968023800722e-05, + "loss": 0.012007078155875206, + "num_input_tokens_seen": 88725168, + "step": 5418, + "train_runtime": 44026.2933, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.284242424242424, + "grad_norm": 0.007149253506213427, + "learning_rate": 7.658153608815449e-05, + "loss": 0.011718599125742912, + "num_input_tokens_seen": 88741544, + "step": 5419, + "train_runtime": 44034.4046, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.2848484848484847, + "grad_norm": 0.007949737831950188, + "learning_rate": 7.657339095510252e-05, + "loss": 0.012249804101884365, + "num_input_tokens_seen": 88757920, + "step": 5420, + "train_runtime": 44042.5201, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 3.2854545454545456, + "grad_norm": 0.00583261251449585, + "learning_rate": 7.656524483915256e-05, + "loss": 0.011780316941440105, + "num_input_tokens_seen": 88774296, + "step": 5421, + "train_runtime": 44050.6395, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 3.286060606060606, + "grad_norm": 0.002191282343119383, + "learning_rate": 7.655709774060594e-05, + "loss": 0.011504745110869408, + "num_input_tokens_seen": 88790672, + "step": 5422, + "train_runtime": 44058.7562, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 3.2866666666666666, + "grad_norm": 0.011732435785233974, + "learning_rate": 7.654894965976398e-05, + "loss": 0.012860682792961597, + "num_input_tokens_seen": 88807048, + "step": 5423, + "train_runtime": 44066.8708, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 3.287272727272727, + "grad_norm": 0.011170846410095692, + "learning_rate": 7.654080059692808e-05, + "loss": 0.012251395732164383, + "num_input_tokens_seen": 88823424, + "step": 5424, + "train_runtime": 44074.9883, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.287878787878788, + "grad_norm": 0.009080127812922001, + "learning_rate": 7.653265055239965e-05, + "loss": 0.012889822944998741, + "num_input_tokens_seen": 88839800, + "step": 5425, + "train_runtime": 44083.101, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.2884848484848486, + "grad_norm": 0.007992480881512165, + "learning_rate": 7.652449952648013e-05, + "loss": 0.012583511881530285, + "num_input_tokens_seen": 88856176, + "step": 5426, + "train_runtime": 44091.2127, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.289090909090909, + "grad_norm": 0.0043755206279456615, + "learning_rate": 7.651634751947104e-05, + "loss": 0.011505357921123505, + "num_input_tokens_seen": 88872552, + "step": 5427, + "train_runtime": 44099.3566, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.2896969696969696, + "grad_norm": 0.003119753673672676, + "learning_rate": 7.65081945316739e-05, + "loss": 0.010851009748876095, + "num_input_tokens_seen": 88888928, + "step": 5428, + "train_runtime": 44107.4723, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.29030303030303, + "grad_norm": 0.0066917650401592255, + "learning_rate": 7.650004056339027e-05, + "loss": 0.011794503778219223, + "num_input_tokens_seen": 88905304, + "step": 5429, + "train_runtime": 44115.5844, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.290909090909091, + "grad_norm": 0.0061232890002429485, + "learning_rate": 7.649188561492173e-05, + "loss": 0.011317582800984383, + "num_input_tokens_seen": 88921680, + "step": 5430, + "train_runtime": 44123.6982, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.2915151515151515, + "grad_norm": 0.011330051347613335, + "learning_rate": 7.648372968656993e-05, + "loss": 0.011889325454831123, + "num_input_tokens_seen": 88938056, + "step": 5431, + "train_runtime": 44131.8154, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.292121212121212, + "grad_norm": 0.011863240972161293, + "learning_rate": 7.647557277863655e-05, + "loss": 0.012785956263542175, + "num_input_tokens_seen": 88954432, + "step": 5432, + "train_runtime": 44139.9321, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.292727272727273, + "grad_norm": 0.006013842299580574, + "learning_rate": 7.646741489142331e-05, + "loss": 0.011312728747725487, + "num_input_tokens_seen": 88970808, + "step": 5433, + "train_runtime": 44148.0489, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.2933333333333334, + "grad_norm": 0.009653451852500439, + "learning_rate": 7.64592560252319e-05, + "loss": 0.012427195906639099, + "num_input_tokens_seen": 88987184, + "step": 5434, + "train_runtime": 44156.16, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.293939393939394, + "grad_norm": 0.00774805573746562, + "learning_rate": 7.645109618036417e-05, + "loss": 0.01239108294248581, + "num_input_tokens_seen": 89003560, + "step": 5435, + "train_runtime": 44164.2743, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.2945454545454544, + "grad_norm": 0.007514607161283493, + "learning_rate": 7.644293535712189e-05, + "loss": 0.011899247765541077, + "num_input_tokens_seen": 89019936, + "step": 5436, + "train_runtime": 44172.3857, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.295151515151515, + "grad_norm": 0.005474135745316744, + "learning_rate": 7.643477355580693e-05, + "loss": 0.011997153982520103, + "num_input_tokens_seen": 89036312, + "step": 5437, + "train_runtime": 44180.5017, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.295757575757576, + "grad_norm": 0.006788759026676416, + "learning_rate": 7.642661077672117e-05, + "loss": 0.011526230722665787, + "num_input_tokens_seen": 89052688, + "step": 5438, + "train_runtime": 44188.6152, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.2963636363636364, + "grad_norm": 0.006606207229197025, + "learning_rate": 7.641844702016654e-05, + "loss": 0.012010307982563972, + "num_input_tokens_seen": 89069064, + "step": 5439, + "train_runtime": 44196.7326, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.296969696969697, + "grad_norm": 0.007282166741788387, + "learning_rate": 7.6410282286445e-05, + "loss": 0.012526284903287888, + "num_input_tokens_seen": 89085440, + "step": 5440, + "train_runtime": 44204.8447, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.2975757575757574, + "grad_norm": 0.0042368690483272076, + "learning_rate": 7.640211657585856e-05, + "loss": 0.011039022356271744, + "num_input_tokens_seen": 89101816, + "step": 5441, + "train_runtime": 44212.9582, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.2981818181818183, + "grad_norm": 0.0067785559222102165, + "learning_rate": 7.639394988870923e-05, + "loss": 0.012586561031639576, + "num_input_tokens_seen": 89118192, + "step": 5442, + "train_runtime": 44221.0702, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.298787878787879, + "grad_norm": 0.00926483329385519, + "learning_rate": 7.638578222529911e-05, + "loss": 0.012717369012534618, + "num_input_tokens_seen": 89134568, + "step": 5443, + "train_runtime": 44229.1859, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.2993939393939393, + "grad_norm": 0.0044740610755980015, + "learning_rate": 7.637761358593028e-05, + "loss": 0.011249896138906479, + "num_input_tokens_seen": 89150944, + "step": 5444, + "train_runtime": 44237.3011, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.3, + "grad_norm": 0.009143206290900707, + "learning_rate": 7.636944397090488e-05, + "loss": 0.011410893872380257, + "num_input_tokens_seen": 89167320, + "step": 5445, + "train_runtime": 44245.4173, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.3006060606060608, + "grad_norm": 0.006007570773363113, + "learning_rate": 7.636127338052512e-05, + "loss": 0.01222436036914587, + "num_input_tokens_seen": 89183696, + "step": 5446, + "train_runtime": 44253.5321, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.3012121212121213, + "grad_norm": 0.006948873400688171, + "learning_rate": 7.635310181509319e-05, + "loss": 0.011651809327304363, + "num_input_tokens_seen": 89200072, + "step": 5447, + "train_runtime": 44261.6429, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.3018181818181818, + "grad_norm": 0.005531075410544872, + "learning_rate": 7.634492927491135e-05, + "loss": 0.012211989611387253, + "num_input_tokens_seen": 89216448, + "step": 5448, + "train_runtime": 44269.7538, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.3024242424242423, + "grad_norm": 0.00866343267261982, + "learning_rate": 7.633675576028187e-05, + "loss": 0.012626084499061108, + "num_input_tokens_seen": 89232824, + "step": 5449, + "train_runtime": 44277.8693, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.303030303030303, + "grad_norm": 0.011473002843558788, + "learning_rate": 7.632858127150709e-05, + "loss": 0.013654201291501522, + "num_input_tokens_seen": 89249200, + "step": 5450, + "train_runtime": 44285.9807, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.3036363636363637, + "grad_norm": 0.009682814590632915, + "learning_rate": 7.632040580888936e-05, + "loss": 0.011822786182165146, + "num_input_tokens_seen": 89265576, + "step": 5451, + "train_runtime": 44294.0903, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.304242424242424, + "grad_norm": 0.005223688203841448, + "learning_rate": 7.631222937273107e-05, + "loss": 0.012679046019911766, + "num_input_tokens_seen": 89281952, + "step": 5452, + "train_runtime": 44302.2015, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.3048484848484847, + "grad_norm": 0.005150394048541784, + "learning_rate": 7.630405196333463e-05, + "loss": 0.011960092931985855, + "num_input_tokens_seen": 89298328, + "step": 5453, + "train_runtime": 44310.319, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.3054545454545456, + "grad_norm": 0.007385487202554941, + "learning_rate": 7.629587358100258e-05, + "loss": 0.010840906761586666, + "num_input_tokens_seen": 89314704, + "step": 5454, + "train_runtime": 44318.4363, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.306060606060606, + "grad_norm": 0.003970773424953222, + "learning_rate": 7.628769422603736e-05, + "loss": 0.013060454279184341, + "num_input_tokens_seen": 89331080, + "step": 5455, + "train_runtime": 44326.5489, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.3066666666666666, + "grad_norm": 0.004920857027173042, + "learning_rate": 7.62795138987415e-05, + "loss": 0.013290351256728172, + "num_input_tokens_seen": 89347456, + "step": 5456, + "train_runtime": 44334.6598, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.307272727272727, + "grad_norm": 0.004438877105712891, + "learning_rate": 7.627133259941763e-05, + "loss": 0.01109031680971384, + "num_input_tokens_seen": 89363832, + "step": 5457, + "train_runtime": 44342.7724, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.3078787878787876, + "grad_norm": 0.007805404718965292, + "learning_rate": 7.626315032836831e-05, + "loss": 0.012621323578059673, + "num_input_tokens_seen": 89380208, + "step": 5458, + "train_runtime": 44350.8889, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.3084848484848486, + "grad_norm": 0.005603905767202377, + "learning_rate": 7.62549670858962e-05, + "loss": 0.010705428197979927, + "num_input_tokens_seen": 89396584, + "step": 5459, + "train_runtime": 44359.006, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.309090909090909, + "grad_norm": 0.008148038759827614, + "learning_rate": 7.624678287230401e-05, + "loss": 0.013161386363208294, + "num_input_tokens_seen": 89412960, + "step": 5460, + "train_runtime": 44367.1202, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.3096969696969696, + "grad_norm": 0.006805382203310728, + "learning_rate": 7.623859768789441e-05, + "loss": 0.011652662418782711, + "num_input_tokens_seen": 89429336, + "step": 5461, + "train_runtime": 44375.2366, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.3103030303030305, + "grad_norm": 0.0067623755894601345, + "learning_rate": 7.62304115329702e-05, + "loss": 0.012203022837638855, + "num_input_tokens_seen": 89445712, + "step": 5462, + "train_runtime": 44383.3491, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.310909090909091, + "grad_norm": 0.007818473502993584, + "learning_rate": 7.622222440783414e-05, + "loss": 0.01248757541179657, + "num_input_tokens_seen": 89462088, + "step": 5463, + "train_runtime": 44391.461, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.3115151515151515, + "grad_norm": 0.008648247458040714, + "learning_rate": 7.621403631278908e-05, + "loss": 0.012562514282763004, + "num_input_tokens_seen": 89478464, + "step": 5464, + "train_runtime": 44399.5766, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.312121212121212, + "grad_norm": 0.010686001740396023, + "learning_rate": 7.620584724813782e-05, + "loss": 0.012101012282073498, + "num_input_tokens_seen": 89494840, + "step": 5465, + "train_runtime": 44407.6903, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.3127272727272725, + "grad_norm": 0.006840110290795565, + "learning_rate": 7.619765721418335e-05, + "loss": 0.012574302963912487, + "num_input_tokens_seen": 89511216, + "step": 5466, + "train_runtime": 44415.8038, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.3133333333333335, + "grad_norm": 0.007532029412686825, + "learning_rate": 7.618946621122853e-05, + "loss": 0.012426842004060745, + "num_input_tokens_seen": 89527592, + "step": 5467, + "train_runtime": 44423.9181, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.313939393939394, + "grad_norm": 0.009044677019119263, + "learning_rate": 7.618127423957637e-05, + "loss": 0.012309052981436253, + "num_input_tokens_seen": 89543968, + "step": 5468, + "train_runtime": 44432.0339, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.3145454545454545, + "grad_norm": 0.004497412592172623, + "learning_rate": 7.617308129952987e-05, + "loss": 0.012093428522348404, + "num_input_tokens_seen": 89560344, + "step": 5469, + "train_runtime": 44440.1511, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.315151515151515, + "grad_norm": 0.00786635559052229, + "learning_rate": 7.616488739139204e-05, + "loss": 0.012925980612635612, + "num_input_tokens_seen": 89576720, + "step": 5470, + "train_runtime": 44448.2699, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.315757575757576, + "grad_norm": 0.010471811518073082, + "learning_rate": 7.6156692515466e-05, + "loss": 0.0120413051918149, + "num_input_tokens_seen": 89593096, + "step": 5471, + "train_runtime": 44456.3851, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.3163636363636364, + "grad_norm": 0.006649843417108059, + "learning_rate": 7.614849667205482e-05, + "loss": 0.011431869119405746, + "num_input_tokens_seen": 89609472, + "step": 5472, + "train_runtime": 44464.4997, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.316969696969697, + "grad_norm": 0.0007087925914674997, + "learning_rate": 7.61402998614617e-05, + "loss": 0.01054468099027872, + "num_input_tokens_seen": 89625848, + "step": 5473, + "train_runtime": 44472.6153, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.3175757575757574, + "grad_norm": 0.0068687740713357925, + "learning_rate": 7.613210208398976e-05, + "loss": 0.012217010371387005, + "num_input_tokens_seen": 89642224, + "step": 5474, + "train_runtime": 44480.7335, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.3181818181818183, + "grad_norm": 0.005529994610697031, + "learning_rate": 7.612390333994228e-05, + "loss": 0.012402606196701527, + "num_input_tokens_seen": 89658600, + "step": 5475, + "train_runtime": 44488.8505, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.318787878787879, + "grad_norm": 0.006812646519392729, + "learning_rate": 7.611570362962248e-05, + "loss": 0.011882617138326168, + "num_input_tokens_seen": 89674976, + "step": 5476, + "train_runtime": 44496.9621, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.3193939393939393, + "grad_norm": 0.004153987392783165, + "learning_rate": 7.610750295333365e-05, + "loss": 0.011140245012938976, + "num_input_tokens_seen": 89691352, + "step": 5477, + "train_runtime": 44505.0765, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.32, + "grad_norm": 0.005511437077075243, + "learning_rate": 7.609930131137914e-05, + "loss": 0.0114361010491848, + "num_input_tokens_seen": 89707728, + "step": 5478, + "train_runtime": 44513.189, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.320606060606061, + "grad_norm": 0.008803537115454674, + "learning_rate": 7.60910987040623e-05, + "loss": 0.012866645120084286, + "num_input_tokens_seen": 89724104, + "step": 5479, + "train_runtime": 44521.3065, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.3212121212121213, + "grad_norm": 0.0073803626000881195, + "learning_rate": 7.608289513168653e-05, + "loss": 0.012501185759902, + "num_input_tokens_seen": 89740480, + "step": 5480, + "train_runtime": 44529.4316, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.321818181818182, + "grad_norm": 0.005053266882896423, + "learning_rate": 7.607469059455526e-05, + "loss": 0.012799685820937157, + "num_input_tokens_seen": 89756856, + "step": 5481, + "train_runtime": 44537.5449, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.3224242424242423, + "grad_norm": 0.00786762498319149, + "learning_rate": 7.606648509297196e-05, + "loss": 0.011066557839512825, + "num_input_tokens_seen": 89773232, + "step": 5482, + "train_runtime": 44545.6624, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.3230303030303032, + "grad_norm": 0.009618595242500305, + "learning_rate": 7.605827862724016e-05, + "loss": 0.012139074504375458, + "num_input_tokens_seen": 89789608, + "step": 5483, + "train_runtime": 44553.7765, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.3236363636363637, + "grad_norm": 0.004869706463068724, + "learning_rate": 7.605007119766334e-05, + "loss": 0.011102267540991306, + "num_input_tokens_seen": 89805984, + "step": 5484, + "train_runtime": 44561.8868, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.324242424242424, + "grad_norm": 0.015996413305401802, + "learning_rate": 7.604186280454515e-05, + "loss": 0.013029851950705051, + "num_input_tokens_seen": 89822360, + "step": 5485, + "train_runtime": 44569.9971, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.3248484848484847, + "grad_norm": 0.009245653636753559, + "learning_rate": 7.603365344818916e-05, + "loss": 0.011547653935849667, + "num_input_tokens_seen": 89838736, + "step": 5486, + "train_runtime": 44578.1102, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.325454545454545, + "grad_norm": 0.006920334883034229, + "learning_rate": 7.602544312889903e-05, + "loss": 0.01293699350208044, + "num_input_tokens_seen": 89855112, + "step": 5487, + "train_runtime": 44586.2219, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.326060606060606, + "grad_norm": 0.0052245729602873325, + "learning_rate": 7.601723184697842e-05, + "loss": 0.012247685343027115, + "num_input_tokens_seen": 89871488, + "step": 5488, + "train_runtime": 44594.3332, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.3266666666666667, + "grad_norm": 0.003991789650171995, + "learning_rate": 7.60090196027311e-05, + "loss": 0.012869923375546932, + "num_input_tokens_seen": 89887864, + "step": 5489, + "train_runtime": 44602.4506, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.327272727272727, + "grad_norm": 0.005501984618604183, + "learning_rate": 7.600080639646077e-05, + "loss": 0.012117445468902588, + "num_input_tokens_seen": 89904240, + "step": 5490, + "train_runtime": 44610.5667, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.327878787878788, + "grad_norm": 0.010951736941933632, + "learning_rate": 7.599259222847127e-05, + "loss": 0.01285905484110117, + "num_input_tokens_seen": 89920616, + "step": 5491, + "train_runtime": 44618.6799, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.3284848484848486, + "grad_norm": 0.004330459516495466, + "learning_rate": 7.598437709906638e-05, + "loss": 0.010762191377580166, + "num_input_tokens_seen": 89936992, + "step": 5492, + "train_runtime": 44626.798, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.329090909090909, + "grad_norm": 0.010865326039493084, + "learning_rate": 7.597616100854999e-05, + "loss": 0.012417128309607506, + "num_input_tokens_seen": 89953368, + "step": 5493, + "train_runtime": 44634.9117, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.3296969696969696, + "grad_norm": 0.0008989815833047032, + "learning_rate": 7.5967943957226e-05, + "loss": 0.012299996800720692, + "num_input_tokens_seen": 89969744, + "step": 5494, + "train_runtime": 44643.0346, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.33030303030303, + "grad_norm": 0.01129222009330988, + "learning_rate": 7.595972594539831e-05, + "loss": 0.011564287357032299, + "num_input_tokens_seen": 89986120, + "step": 5495, + "train_runtime": 44651.1464, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.330909090909091, + "grad_norm": 0.006419788580387831, + "learning_rate": 7.595150697337095e-05, + "loss": 0.01219707727432251, + "num_input_tokens_seen": 90002496, + "step": 5496, + "train_runtime": 44659.259, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.3315151515151515, + "grad_norm": 0.010371673852205276, + "learning_rate": 7.594328704144786e-05, + "loss": 0.012352915480732918, + "num_input_tokens_seen": 90018872, + "step": 5497, + "train_runtime": 44667.3717, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.332121212121212, + "grad_norm": 0.010289951227605343, + "learning_rate": 7.59350661499331e-05, + "loss": 0.012071865610778332, + "num_input_tokens_seen": 90035248, + "step": 5498, + "train_runtime": 44675.4894, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.3327272727272725, + "grad_norm": 0.010006491094827652, + "learning_rate": 7.592684429913075e-05, + "loss": 0.010838519781827927, + "num_input_tokens_seen": 90051624, + "step": 5499, + "train_runtime": 44683.6056, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 0.009875924326479435, + "learning_rate": 7.591862148934495e-05, + "loss": 0.013600432313978672, + "num_input_tokens_seen": 90068000, + "step": 5500, + "train_runtime": 44691.7188, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.333939393939394, + "grad_norm": 0.01028907485306263, + "learning_rate": 7.591039772087977e-05, + "loss": 0.012168627232313156, + "num_input_tokens_seen": 90084376, + "step": 5501, + "train_runtime": 44700.6759, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.3345454545454545, + "grad_norm": 0.008811515755951405, + "learning_rate": 7.590217299403948e-05, + "loss": 0.01146846916526556, + "num_input_tokens_seen": 90100752, + "step": 5502, + "train_runtime": 44708.7961, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.335151515151515, + "grad_norm": 0.0011186763877049088, + "learning_rate": 7.589394730912822e-05, + "loss": 0.011563530191779137, + "num_input_tokens_seen": 90117128, + "step": 5503, + "train_runtime": 44716.9148, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.335757575757576, + "grad_norm": 0.005841893143951893, + "learning_rate": 7.588572066645027e-05, + "loss": 0.011638449504971504, + "num_input_tokens_seen": 90133504, + "step": 5504, + "train_runtime": 44725.0335, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.3363636363636364, + "grad_norm": 0.007100232876837254, + "learning_rate": 7.587749306630995e-05, + "loss": 0.010972381569445133, + "num_input_tokens_seen": 90149880, + "step": 5505, + "train_runtime": 44733.1569, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.336969696969697, + "grad_norm": 0.006332727149128914, + "learning_rate": 7.586926450901155e-05, + "loss": 0.012634308077394962, + "num_input_tokens_seen": 90166256, + "step": 5506, + "train_runtime": 44741.2726, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.3375757575757574, + "grad_norm": 0.01088823564350605, + "learning_rate": 7.586103499485942e-05, + "loss": 0.013737642206251621, + "num_input_tokens_seen": 90182632, + "step": 5507, + "train_runtime": 44749.3873, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.3381818181818184, + "grad_norm": 0.0035653922241181135, + "learning_rate": 7.585280452415798e-05, + "loss": 0.011491003446280956, + "num_input_tokens_seen": 90199008, + "step": 5508, + "train_runtime": 44757.4997, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.338787878787879, + "grad_norm": 0.0076635368168354034, + "learning_rate": 7.584457309721164e-05, + "loss": 0.012830449268221855, + "num_input_tokens_seen": 90215384, + "step": 5509, + "train_runtime": 44765.6114, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.3393939393939394, + "grad_norm": 0.008027713745832443, + "learning_rate": 7.583634071432489e-05, + "loss": 0.013112273998558521, + "num_input_tokens_seen": 90231760, + "step": 5510, + "train_runtime": 44773.7314, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.34, + "grad_norm": 0.009944655001163483, + "learning_rate": 7.58281073758022e-05, + "loss": 0.012386097572743893, + "num_input_tokens_seen": 90248136, + "step": 5511, + "train_runtime": 44781.8443, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.340606060606061, + "grad_norm": 0.004994191229343414, + "learning_rate": 7.58198730819481e-05, + "loss": 0.011621551588177681, + "num_input_tokens_seen": 90264512, + "step": 5512, + "train_runtime": 44789.9569, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.3412121212121213, + "grad_norm": 0.012782320380210876, + "learning_rate": 7.581163783306719e-05, + "loss": 0.011238468810915947, + "num_input_tokens_seen": 90280888, + "step": 5513, + "train_runtime": 44798.0716, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.341818181818182, + "grad_norm": 0.006190773099660873, + "learning_rate": 7.580340162946407e-05, + "loss": 0.012123371474444866, + "num_input_tokens_seen": 90297264, + "step": 5514, + "train_runtime": 44806.186, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.3424242424242423, + "grad_norm": 0.00681741489097476, + "learning_rate": 7.579516447144336e-05, + "loss": 0.012508861720561981, + "num_input_tokens_seen": 90313640, + "step": 5515, + "train_runtime": 44814.3003, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.343030303030303, + "grad_norm": 0.012876826338469982, + "learning_rate": 7.578692635930975e-05, + "loss": 0.012768741697072983, + "num_input_tokens_seen": 90330016, + "step": 5516, + "train_runtime": 44822.4195, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.3436363636363637, + "grad_norm": 0.005768692586570978, + "learning_rate": 7.577868729336796e-05, + "loss": 0.012756479904055595, + "num_input_tokens_seen": 90346392, + "step": 5517, + "train_runtime": 44830.5359, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.3442424242424242, + "grad_norm": 0.008382749743759632, + "learning_rate": 7.577044727392273e-05, + "loss": 0.010987639427185059, + "num_input_tokens_seen": 90362768, + "step": 5518, + "train_runtime": 44838.6497, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.3448484848484847, + "grad_norm": 0.0062371958047151566, + "learning_rate": 7.576220630127883e-05, + "loss": 0.012474359944462776, + "num_input_tokens_seen": 90379144, + "step": 5519, + "train_runtime": 44846.7716, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.3454545454545457, + "grad_norm": 0.00679647084325552, + "learning_rate": 7.575396437574109e-05, + "loss": 0.013044663704931736, + "num_input_tokens_seen": 90395520, + "step": 5520, + "train_runtime": 44854.8882, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.346060606060606, + "grad_norm": 0.011369622312486172, + "learning_rate": 7.574572149761437e-05, + "loss": 0.01333982590585947, + "num_input_tokens_seen": 90411896, + "step": 5521, + "train_runtime": 44863.0019, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.3466666666666667, + "grad_norm": 0.008409282192587852, + "learning_rate": 7.573747766720354e-05, + "loss": 0.012235783040523529, + "num_input_tokens_seen": 90428272, + "step": 5522, + "train_runtime": 44871.1205, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.347272727272727, + "grad_norm": 0.005691043101251125, + "learning_rate": 7.572923288481354e-05, + "loss": 0.011727415025234222, + "num_input_tokens_seen": 90444648, + "step": 5523, + "train_runtime": 44879.2364, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.3478787878787877, + "grad_norm": 0.010564975440502167, + "learning_rate": 7.572098715074931e-05, + "loss": 0.011707552708685398, + "num_input_tokens_seen": 90461024, + "step": 5524, + "train_runtime": 44887.3573, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.3484848484848486, + "grad_norm": 0.010726746171712875, + "learning_rate": 7.571274046531586e-05, + "loss": 0.012426982633769512, + "num_input_tokens_seen": 90477400, + "step": 5525, + "train_runtime": 44895.4742, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.349090909090909, + "grad_norm": 0.006085506174713373, + "learning_rate": 7.570449282881822e-05, + "loss": 0.011125419288873672, + "num_input_tokens_seen": 90493776, + "step": 5526, + "train_runtime": 44903.5906, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.3496969696969696, + "grad_norm": 0.009037315845489502, + "learning_rate": 7.569624424156144e-05, + "loss": 0.012100107967853546, + "num_input_tokens_seen": 90510152, + "step": 5527, + "train_runtime": 44911.7106, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.35030303030303, + "grad_norm": 0.010065731592476368, + "learning_rate": 7.568799470385064e-05, + "loss": 0.01247998233884573, + "num_input_tokens_seen": 90526528, + "step": 5528, + "train_runtime": 44919.8327, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.350909090909091, + "grad_norm": 0.005243572406470776, + "learning_rate": 7.567974421599094e-05, + "loss": 0.012201013043522835, + "num_input_tokens_seen": 90542904, + "step": 5529, + "train_runtime": 44927.9504, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.3515151515151516, + "grad_norm": 0.007568024564534426, + "learning_rate": 7.56714927782875e-05, + "loss": 0.011319687590003014, + "num_input_tokens_seen": 90559280, + "step": 5530, + "train_runtime": 44936.0664, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.352121212121212, + "grad_norm": 0.013508440926671028, + "learning_rate": 7.566324039104553e-05, + "loss": 0.012454374693334103, + "num_input_tokens_seen": 90575656, + "step": 5531, + "train_runtime": 44944.188, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.3527272727272726, + "grad_norm": 0.00613227766007185, + "learning_rate": 7.565498705457028e-05, + "loss": 0.012941415421664715, + "num_input_tokens_seen": 90592032, + "step": 5532, + "train_runtime": 44952.3072, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.3533333333333335, + "grad_norm": 0.008166263811290264, + "learning_rate": 7.564673276916703e-05, + "loss": 0.013165703974664211, + "num_input_tokens_seen": 90608408, + "step": 5533, + "train_runtime": 44960.4311, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.353939393939394, + "grad_norm": 0.009492557495832443, + "learning_rate": 7.563847753514107e-05, + "loss": 0.012590172700583935, + "num_input_tokens_seen": 90624784, + "step": 5534, + "train_runtime": 44968.5491, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.3545454545454545, + "grad_norm": 0.00820946041494608, + "learning_rate": 7.563022135279775e-05, + "loss": 0.013918361626565456, + "num_input_tokens_seen": 90641160, + "step": 5535, + "train_runtime": 44976.6632, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.355151515151515, + "grad_norm": 0.007774450350552797, + "learning_rate": 7.562196422244245e-05, + "loss": 0.013481343165040016, + "num_input_tokens_seen": 90657536, + "step": 5536, + "train_runtime": 44984.7812, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.355757575757576, + "grad_norm": 0.0047575500793755054, + "learning_rate": 7.561370614438061e-05, + "loss": 0.01132679171860218, + "num_input_tokens_seen": 90673912, + "step": 5537, + "train_runtime": 44992.8986, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.3563636363636364, + "grad_norm": 0.009751240722835064, + "learning_rate": 7.560544711891766e-05, + "loss": 0.012721298262476921, + "num_input_tokens_seen": 90690288, + "step": 5538, + "train_runtime": 45001.0175, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.356969696969697, + "grad_norm": 0.0052260602824389935, + "learning_rate": 7.559718714635907e-05, + "loss": 0.012540977448225021, + "num_input_tokens_seen": 90706664, + "step": 5539, + "train_runtime": 45009.133, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.3575757575757574, + "grad_norm": 0.006648395676165819, + "learning_rate": 7.558892622701037e-05, + "loss": 0.012352634221315384, + "num_input_tokens_seen": 90723040, + "step": 5540, + "train_runtime": 45017.2508, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.3581818181818184, + "grad_norm": 0.00514795258641243, + "learning_rate": 7.558066436117715e-05, + "loss": 0.012124727480113506, + "num_input_tokens_seen": 90739416, + "step": 5541, + "train_runtime": 45025.3696, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.358787878787879, + "grad_norm": 0.0060890414752066135, + "learning_rate": 7.557240154916495e-05, + "loss": 0.012679407373070717, + "num_input_tokens_seen": 90755792, + "step": 5542, + "train_runtime": 45033.4844, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.3593939393939394, + "grad_norm": 0.0034894358832389116, + "learning_rate": 7.556413779127941e-05, + "loss": 0.012408466078341007, + "num_input_tokens_seen": 90772168, + "step": 5543, + "train_runtime": 45041.5952, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.36, + "grad_norm": 0.0054548694752156734, + "learning_rate": 7.555587308782622e-05, + "loss": 0.01283508725464344, + "num_input_tokens_seen": 90788544, + "step": 5544, + "train_runtime": 45049.7083, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.3606060606060604, + "grad_norm": 0.006968807894736528, + "learning_rate": 7.554760743911103e-05, + "loss": 0.012311786413192749, + "num_input_tokens_seen": 90804920, + "step": 5545, + "train_runtime": 45057.8199, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.3612121212121213, + "grad_norm": 0.004087213426828384, + "learning_rate": 7.553934084543961e-05, + "loss": 0.011944867670536041, + "num_input_tokens_seen": 90821296, + "step": 5546, + "train_runtime": 45065.9398, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.361818181818182, + "grad_norm": 0.006908513139933348, + "learning_rate": 7.553107330711769e-05, + "loss": 0.01202747318893671, + "num_input_tokens_seen": 90837672, + "step": 5547, + "train_runtime": 45074.0549, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.3624242424242423, + "grad_norm": 0.0025824864860624075, + "learning_rate": 7.552280482445112e-05, + "loss": 0.010443884879350662, + "num_input_tokens_seen": 90854048, + "step": 5548, + "train_runtime": 45082.1715, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.3630303030303033, + "grad_norm": 0.0101697389036417, + "learning_rate": 7.551453539774565e-05, + "loss": 0.012192212045192719, + "num_input_tokens_seen": 90870424, + "step": 5549, + "train_runtime": 45090.2857, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.3636363636363638, + "grad_norm": 0.008115961216390133, + "learning_rate": 7.550626502730726e-05, + "loss": 0.013131178915500641, + "num_input_tokens_seen": 90886800, + "step": 5550, + "train_runtime": 45098.3955, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.3642424242424243, + "grad_norm": 0.008823497220873833, + "learning_rate": 7.549799371344175e-05, + "loss": 0.012136719189584255, + "num_input_tokens_seen": 90903176, + "step": 5551, + "train_runtime": 45106.5127, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.3648484848484848, + "grad_norm": 0.006723793223500252, + "learning_rate": 7.548972145645515e-05, + "loss": 0.012134970165789127, + "num_input_tokens_seen": 90919552, + "step": 5552, + "train_runtime": 45114.6326, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.3654545454545453, + "grad_norm": 0.005425069481134415, + "learning_rate": 7.548144825665336e-05, + "loss": 0.011454436928033829, + "num_input_tokens_seen": 90935928, + "step": 5553, + "train_runtime": 45122.7495, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.366060606060606, + "grad_norm": 0.008128207176923752, + "learning_rate": 7.547317411434242e-05, + "loss": 0.01192244328558445, + "num_input_tokens_seen": 90952304, + "step": 5554, + "train_runtime": 45130.8629, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.3666666666666667, + "grad_norm": 0.013343941420316696, + "learning_rate": 7.54648990298284e-05, + "loss": 0.012931494042277336, + "num_input_tokens_seen": 90968680, + "step": 5555, + "train_runtime": 45138.9738, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.367272727272727, + "grad_norm": 0.00944167748093605, + "learning_rate": 7.545662300341736e-05, + "loss": 0.01212040800601244, + "num_input_tokens_seen": 90985056, + "step": 5556, + "train_runtime": 45147.0863, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.3678787878787877, + "grad_norm": 0.010796057991683483, + "learning_rate": 7.544834603541537e-05, + "loss": 0.01223970390856266, + "num_input_tokens_seen": 91001432, + "step": 5557, + "train_runtime": 45155.2053, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.3684848484848486, + "grad_norm": 0.0064018405973911285, + "learning_rate": 7.544006812612865e-05, + "loss": 0.011668344028294086, + "num_input_tokens_seen": 91017808, + "step": 5558, + "train_runtime": 45163.3176, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.369090909090909, + "grad_norm": 0.005864448379725218, + "learning_rate": 7.543178927586335e-05, + "loss": 0.011180905625224113, + "num_input_tokens_seen": 91034184, + "step": 5559, + "train_runtime": 45171.434, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.3696969696969696, + "grad_norm": 0.008967139758169651, + "learning_rate": 7.54235094849257e-05, + "loss": 0.01195902843028307, + "num_input_tokens_seen": 91050560, + "step": 5560, + "train_runtime": 45179.5463, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.37030303030303, + "grad_norm": 0.0041519273072481155, + "learning_rate": 7.541522875362193e-05, + "loss": 0.012691115029156208, + "num_input_tokens_seen": 91066936, + "step": 5561, + "train_runtime": 45187.6651, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.370909090909091, + "grad_norm": 0.013196374289691448, + "learning_rate": 7.540694708225832e-05, + "loss": 0.012026030570268631, + "num_input_tokens_seen": 91083312, + "step": 5562, + "train_runtime": 45195.7809, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.3715151515151516, + "grad_norm": 0.0056176562793552876, + "learning_rate": 7.539866447114126e-05, + "loss": 0.012482086196541786, + "num_input_tokens_seen": 91099688, + "step": 5563, + "train_runtime": 45203.8979, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.372121212121212, + "grad_norm": 0.0059923818334937096, + "learning_rate": 7.539038092057704e-05, + "loss": 0.011928334832191467, + "num_input_tokens_seen": 91116064, + "step": 5564, + "train_runtime": 45212.0135, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.3727272727272726, + "grad_norm": 0.00853908434510231, + "learning_rate": 7.538209643087207e-05, + "loss": 0.013977022841572762, + "num_input_tokens_seen": 91132440, + "step": 5565, + "train_runtime": 45220.1314, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.3733333333333335, + "grad_norm": 0.005957504268735647, + "learning_rate": 7.537381100233278e-05, + "loss": 0.010648944415152073, + "num_input_tokens_seen": 91148816, + "step": 5566, + "train_runtime": 45228.2413, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.373939393939394, + "grad_norm": 0.007915256544947624, + "learning_rate": 7.536552463526564e-05, + "loss": 0.012727542780339718, + "num_input_tokens_seen": 91165192, + "step": 5567, + "train_runtime": 45236.3528, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.3745454545454545, + "grad_norm": 0.008650321513414383, + "learning_rate": 7.535723732997715e-05, + "loss": 0.012222620658576488, + "num_input_tokens_seen": 91181568, + "step": 5568, + "train_runtime": 45244.4679, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.375151515151515, + "grad_norm": 0.010193414054811, + "learning_rate": 7.534894908677384e-05, + "loss": 0.01114997360855341, + "num_input_tokens_seen": 91197944, + "step": 5569, + "train_runtime": 45252.5862, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.375757575757576, + "grad_norm": 0.0083761690184474, + "learning_rate": 7.534065990596224e-05, + "loss": 0.013518362306058407, + "num_input_tokens_seen": 91214320, + "step": 5570, + "train_runtime": 45260.7007, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.3763636363636365, + "grad_norm": 0.007823417894542217, + "learning_rate": 7.5332369787849e-05, + "loss": 0.012948406860232353, + "num_input_tokens_seen": 91230696, + "step": 5571, + "train_runtime": 45268.8172, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.376969696969697, + "grad_norm": 0.0020584461744874716, + "learning_rate": 7.532407873274072e-05, + "loss": 0.010550681501626968, + "num_input_tokens_seen": 91247072, + "step": 5572, + "train_runtime": 45276.931, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.3775757575757575, + "grad_norm": 0.010576609522104263, + "learning_rate": 7.53157867409441e-05, + "loss": 0.012465088628232479, + "num_input_tokens_seen": 91263448, + "step": 5573, + "train_runtime": 45285.0454, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.378181818181818, + "grad_norm": 0.00603002542629838, + "learning_rate": 7.530749381276581e-05, + "loss": 0.012353937141597271, + "num_input_tokens_seen": 91279824, + "step": 5574, + "train_runtime": 45293.157, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.378787878787879, + "grad_norm": 0.00942918285727501, + "learning_rate": 7.529919994851262e-05, + "loss": 0.012542503885924816, + "num_input_tokens_seen": 91296200, + "step": 5575, + "train_runtime": 45301.2762, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.3793939393939394, + "grad_norm": 0.016185086220502853, + "learning_rate": 7.529090514849128e-05, + "loss": 0.012185894884169102, + "num_input_tokens_seen": 91312576, + "step": 5576, + "train_runtime": 45309.3921, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.38, + "grad_norm": 0.009853019379079342, + "learning_rate": 7.528260941300864e-05, + "loss": 0.012557181529700756, + "num_input_tokens_seen": 91328952, + "step": 5577, + "train_runtime": 45317.5061, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.380606060606061, + "grad_norm": 0.00715734763070941, + "learning_rate": 7.52743127423715e-05, + "loss": 0.01062626764178276, + "num_input_tokens_seen": 91345328, + "step": 5578, + "train_runtime": 45325.6211, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.3812121212121213, + "grad_norm": 0.006232213228940964, + "learning_rate": 7.526601513688673e-05, + "loss": 0.012660540640354156, + "num_input_tokens_seen": 91361704, + "step": 5579, + "train_runtime": 45333.7399, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.381818181818182, + "grad_norm": 0.007364703342318535, + "learning_rate": 7.525771659686128e-05, + "loss": 0.011443682946264744, + "num_input_tokens_seen": 91378080, + "step": 5580, + "train_runtime": 45341.8524, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.3824242424242423, + "grad_norm": 0.011822480708360672, + "learning_rate": 7.524941712260207e-05, + "loss": 0.014163168147206306, + "num_input_tokens_seen": 91394456, + "step": 5581, + "train_runtime": 45349.9645, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.383030303030303, + "grad_norm": 0.003918302245438099, + "learning_rate": 7.524111671441612e-05, + "loss": 0.012874097563326359, + "num_input_tokens_seen": 91410832, + "step": 5582, + "train_runtime": 45358.0795, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.3836363636363638, + "grad_norm": 0.004396955017000437, + "learning_rate": 7.523281537261039e-05, + "loss": 0.012388463132083416, + "num_input_tokens_seen": 91427208, + "step": 5583, + "train_runtime": 45366.1925, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.3842424242424243, + "grad_norm": 0.006505928002297878, + "learning_rate": 7.522451309749197e-05, + "loss": 0.011709807440638542, + "num_input_tokens_seen": 91443584, + "step": 5584, + "train_runtime": 45374.307, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.3848484848484848, + "grad_norm": 0.0010119694052264094, + "learning_rate": 7.521620988936792e-05, + "loss": 0.012415178120136261, + "num_input_tokens_seen": 91459960, + "step": 5585, + "train_runtime": 45382.4189, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.3854545454545453, + "grad_norm": 0.009552408941090107, + "learning_rate": 7.520790574854538e-05, + "loss": 0.012354734353721142, + "num_input_tokens_seen": 91476336, + "step": 5586, + "train_runtime": 45390.5333, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.386060606060606, + "grad_norm": 0.011787908151745796, + "learning_rate": 7.519960067533149e-05, + "loss": 0.0130911348387599, + "num_input_tokens_seen": 91492712, + "step": 5587, + "train_runtime": 45398.6462, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.3866666666666667, + "grad_norm": 0.0063934726640582085, + "learning_rate": 7.519129467003347e-05, + "loss": 0.01184793934226036, + "num_input_tokens_seen": 91509088, + "step": 5588, + "train_runtime": 45406.759, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.387272727272727, + "grad_norm": 0.00759939244017005, + "learning_rate": 7.518298773295849e-05, + "loss": 0.012232743203639984, + "num_input_tokens_seen": 91525464, + "step": 5589, + "train_runtime": 45414.8697, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.3878787878787877, + "grad_norm": 0.005053234286606312, + "learning_rate": 7.517467986441384e-05, + "loss": 0.012983040884137154, + "num_input_tokens_seen": 91541840, + "step": 5590, + "train_runtime": 45422.9823, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.3884848484848487, + "grad_norm": 0.007972417399287224, + "learning_rate": 7.516637106470683e-05, + "loss": 0.012427736073732376, + "num_input_tokens_seen": 91558216, + "step": 5591, + "train_runtime": 45431.0954, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.389090909090909, + "grad_norm": 0.0044271498918533325, + "learning_rate": 7.515806133414474e-05, + "loss": 0.012203115038573742, + "num_input_tokens_seen": 91574592, + "step": 5592, + "train_runtime": 45439.2079, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.3896969696969697, + "grad_norm": 0.007849742658436298, + "learning_rate": 7.514975067303496e-05, + "loss": 0.011317530646920204, + "num_input_tokens_seen": 91590968, + "step": 5593, + "train_runtime": 45447.3214, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.39030303030303, + "grad_norm": 0.007042233366519213, + "learning_rate": 7.514143908168487e-05, + "loss": 0.011524077504873276, + "num_input_tokens_seen": 91607344, + "step": 5594, + "train_runtime": 45455.4395, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.390909090909091, + "grad_norm": 0.0061455052345991135, + "learning_rate": 7.513312656040193e-05, + "loss": 0.011707305908203125, + "num_input_tokens_seen": 91623720, + "step": 5595, + "train_runtime": 45463.5522, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.3915151515151516, + "grad_norm": 0.005690615624189377, + "learning_rate": 7.512481310949358e-05, + "loss": 0.012467317283153534, + "num_input_tokens_seen": 91640096, + "step": 5596, + "train_runtime": 45471.6663, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.392121212121212, + "grad_norm": 0.006550784222781658, + "learning_rate": 7.51164987292673e-05, + "loss": 0.012659874744713306, + "num_input_tokens_seen": 91656472, + "step": 5597, + "train_runtime": 45479.7802, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.3927272727272726, + "grad_norm": 0.006821752060204744, + "learning_rate": 7.510818342003067e-05, + "loss": 0.011081205680966377, + "num_input_tokens_seen": 91672848, + "step": 5598, + "train_runtime": 45487.8915, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.3933333333333335, + "grad_norm": 0.011554849334061146, + "learning_rate": 7.509986718209121e-05, + "loss": 0.012107428163290024, + "num_input_tokens_seen": 91689224, + "step": 5599, + "train_runtime": 45496.001, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.393939393939394, + "grad_norm": 0.010039112530648708, + "learning_rate": 7.509155001575656e-05, + "loss": 0.01155742909759283, + "num_input_tokens_seen": 91705600, + "step": 5600, + "train_runtime": 45504.1124, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.3945454545454545, + "grad_norm": 0.009561151266098022, + "learning_rate": 7.508323192133432e-05, + "loss": 0.012796069495379925, + "num_input_tokens_seen": 91721976, + "step": 5601, + "train_runtime": 45513.1773, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.395151515151515, + "grad_norm": 0.006783085409551859, + "learning_rate": 7.50749128991322e-05, + "loss": 0.013654295355081558, + "num_input_tokens_seen": 91738352, + "step": 5602, + "train_runtime": 45521.2917, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.3957575757575755, + "grad_norm": 0.008901482447981834, + "learning_rate": 7.506659294945786e-05, + "loss": 0.012411155737936497, + "num_input_tokens_seen": 91754728, + "step": 5603, + "train_runtime": 45529.4024, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.3963636363636365, + "grad_norm": 0.008738926611840725, + "learning_rate": 7.505827207261908e-05, + "loss": 0.011707296594977379, + "num_input_tokens_seen": 91771104, + "step": 5604, + "train_runtime": 45537.5146, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.396969696969697, + "grad_norm": 0.014351113699376583, + "learning_rate": 7.504995026892361e-05, + "loss": 0.01276414468884468, + "num_input_tokens_seen": 91787480, + "step": 5605, + "train_runtime": 45545.6324, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.3975757575757575, + "grad_norm": 0.004262860864400864, + "learning_rate": 7.504162753867927e-05, + "loss": 0.011838957667350769, + "num_input_tokens_seen": 91803856, + "step": 5606, + "train_runtime": 45553.7476, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.3981818181818184, + "grad_norm": 0.005677108187228441, + "learning_rate": 7.503330388219389e-05, + "loss": 0.011324195191264153, + "num_input_tokens_seen": 91820232, + "step": 5607, + "train_runtime": 45561.8578, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.398787878787879, + "grad_norm": 0.00824224017560482, + "learning_rate": 7.502497929977532e-05, + "loss": 0.011128949001431465, + "num_input_tokens_seen": 91836608, + "step": 5608, + "train_runtime": 45569.9751, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.3993939393939394, + "grad_norm": 0.003958118613809347, + "learning_rate": 7.501665379173153e-05, + "loss": 0.011757533997297287, + "num_input_tokens_seen": 91852984, + "step": 5609, + "train_runtime": 45578.0896, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.4, + "grad_norm": 0.008513413369655609, + "learning_rate": 7.500832735837043e-05, + "loss": 0.011217262595891953, + "num_input_tokens_seen": 91869360, + "step": 5610, + "train_runtime": 45586.2019, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.4006060606060604, + "grad_norm": 0.007474839687347412, + "learning_rate": 7.500000000000001e-05, + "loss": 0.0122438445687294, + "num_input_tokens_seen": 91885736, + "step": 5611, + "train_runtime": 45594.313, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.4012121212121214, + "grad_norm": 0.009220748208463192, + "learning_rate": 7.499167171692826e-05, + "loss": 0.01283283717930317, + "num_input_tokens_seen": 91902112, + "step": 5612, + "train_runtime": 45602.4327, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.401818181818182, + "grad_norm": 0.00467658182606101, + "learning_rate": 7.498334250946325e-05, + "loss": 0.011454892344772816, + "num_input_tokens_seen": 91918488, + "step": 5613, + "train_runtime": 45610.5455, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.4024242424242424, + "grad_norm": 0.009933494962751865, + "learning_rate": 7.497501237791305e-05, + "loss": 0.012375656515359879, + "num_input_tokens_seen": 91934864, + "step": 5614, + "train_runtime": 45618.6595, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.403030303030303, + "grad_norm": 0.0027569003868848085, + "learning_rate": 7.496668132258578e-05, + "loss": 0.01163573283702135, + "num_input_tokens_seen": 91951240, + "step": 5615, + "train_runtime": 45626.7742, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.403636363636364, + "grad_norm": 0.003297042101621628, + "learning_rate": 7.495834934378958e-05, + "loss": 0.011613598093390465, + "num_input_tokens_seen": 91967616, + "step": 5616, + "train_runtime": 45634.8897, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.4042424242424243, + "grad_norm": 0.007942317984998226, + "learning_rate": 7.495001644183266e-05, + "loss": 0.012186195701360703, + "num_input_tokens_seen": 91983992, + "step": 5617, + "train_runtime": 45643.0045, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.404848484848485, + "grad_norm": 0.004564151167869568, + "learning_rate": 7.494168261702321e-05, + "loss": 0.011610557325184345, + "num_input_tokens_seen": 92000368, + "step": 5618, + "train_runtime": 45651.1175, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.4054545454545453, + "grad_norm": 0.011751524172723293, + "learning_rate": 7.493334786966951e-05, + "loss": 0.01282505877315998, + "num_input_tokens_seen": 92016744, + "step": 5619, + "train_runtime": 45659.2344, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.4060606060606062, + "grad_norm": 0.004201585426926613, + "learning_rate": 7.492501220007979e-05, + "loss": 0.012132086791098118, + "num_input_tokens_seen": 92033120, + "step": 5620, + "train_runtime": 45667.3494, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.4066666666666667, + "grad_norm": 0.004615694284439087, + "learning_rate": 7.491667560856242e-05, + "loss": 0.011308884248137474, + "num_input_tokens_seen": 92049496, + "step": 5621, + "train_runtime": 45675.4672, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.4072727272727272, + "grad_norm": 0.005300286691635847, + "learning_rate": 7.490833809542576e-05, + "loss": 0.011118387803435326, + "num_input_tokens_seen": 92065872, + "step": 5622, + "train_runtime": 45683.5819, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.4078787878787877, + "grad_norm": 0.009646954014897346, + "learning_rate": 7.489999966097817e-05, + "loss": 0.011952558532357216, + "num_input_tokens_seen": 92082248, + "step": 5623, + "train_runtime": 45691.6999, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.4084848484848487, + "grad_norm": 0.005909522529691458, + "learning_rate": 7.489166030552808e-05, + "loss": 0.011699676513671875, + "num_input_tokens_seen": 92098624, + "step": 5624, + "train_runtime": 45699.8116, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.409090909090909, + "grad_norm": 0.005958529654890299, + "learning_rate": 7.488332002938396e-05, + "loss": 0.011789480224251747, + "num_input_tokens_seen": 92115000, + "step": 5625, + "train_runtime": 45707.9321, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.4096969696969697, + "grad_norm": 0.002923007123172283, + "learning_rate": 7.487497883285428e-05, + "loss": 0.01139673963189125, + "num_input_tokens_seen": 92131376, + "step": 5626, + "train_runtime": 45716.0459, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.41030303030303, + "grad_norm": 0.0059952265582978725, + "learning_rate": 7.486663671624758e-05, + "loss": 0.010942156426608562, + "num_input_tokens_seen": 92147752, + "step": 5627, + "train_runtime": 45724.1629, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.410909090909091, + "grad_norm": 0.007945683784782887, + "learning_rate": 7.485829367987242e-05, + "loss": 0.012194421142339706, + "num_input_tokens_seen": 92164128, + "step": 5628, + "train_runtime": 45732.2804, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.4115151515151516, + "grad_norm": 0.005121713038533926, + "learning_rate": 7.484994972403736e-05, + "loss": 0.012372178956866264, + "num_input_tokens_seen": 92180504, + "step": 5629, + "train_runtime": 45740.3955, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.412121212121212, + "grad_norm": 0.009022644720971584, + "learning_rate": 7.484160484905108e-05, + "loss": 0.012644966132938862, + "num_input_tokens_seen": 92196880, + "step": 5630, + "train_runtime": 45748.5112, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.4127272727272726, + "grad_norm": 0.028164712712168694, + "learning_rate": 7.48332590552222e-05, + "loss": 0.012545616365969181, + "num_input_tokens_seen": 92213256, + "step": 5631, + "train_runtime": 45756.6317, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.413333333333333, + "grad_norm": 0.009564214386045933, + "learning_rate": 7.482491234285944e-05, + "loss": 0.013073266483843327, + "num_input_tokens_seen": 92229632, + "step": 5632, + "train_runtime": 45764.7472, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.413939393939394, + "grad_norm": 0.008958321996033192, + "learning_rate": 7.48165647122715e-05, + "loss": 0.012865670025348663, + "num_input_tokens_seen": 92246008, + "step": 5633, + "train_runtime": 45772.8619, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.4145454545454546, + "grad_norm": 0.0076562645845115185, + "learning_rate": 7.480821616376718e-05, + "loss": 0.011684519238770008, + "num_input_tokens_seen": 92262384, + "step": 5634, + "train_runtime": 45780.9794, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.415151515151515, + "grad_norm": 0.0036698610056191683, + "learning_rate": 7.479986669765523e-05, + "loss": 0.011633518151938915, + "num_input_tokens_seen": 92278760, + "step": 5635, + "train_runtime": 45789.0959, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.415757575757576, + "grad_norm": 0.014856724999845028, + "learning_rate": 7.479151631424453e-05, + "loss": 0.012845352292060852, + "num_input_tokens_seen": 92295136, + "step": 5636, + "train_runtime": 45797.2089, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.4163636363636365, + "grad_norm": 0.00892296340316534, + "learning_rate": 7.47831650138439e-05, + "loss": 0.011484719812870026, + "num_input_tokens_seen": 92311512, + "step": 5637, + "train_runtime": 45805.3344, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.416969696969697, + "grad_norm": 0.011974598281085491, + "learning_rate": 7.477481279676228e-05, + "loss": 0.013029697351157665, + "num_input_tokens_seen": 92327888, + "step": 5638, + "train_runtime": 45813.4492, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.4175757575757575, + "grad_norm": 0.012427419424057007, + "learning_rate": 7.476645966330856e-05, + "loss": 0.011405151337385178, + "num_input_tokens_seen": 92344264, + "step": 5639, + "train_runtime": 45821.566, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.418181818181818, + "grad_norm": 0.053284477442502975, + "learning_rate": 7.475810561379175e-05, + "loss": 0.013145934790372849, + "num_input_tokens_seen": 92360640, + "step": 5640, + "train_runtime": 45829.6809, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.418787878787879, + "grad_norm": 0.004110480193048716, + "learning_rate": 7.474975064852081e-05, + "loss": 0.010771472938358784, + "num_input_tokens_seen": 92377016, + "step": 5641, + "train_runtime": 45837.7969, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.4193939393939394, + "grad_norm": 0.008155697025358677, + "learning_rate": 7.474139476780481e-05, + "loss": 0.013157295063138008, + "num_input_tokens_seen": 92393392, + "step": 5642, + "train_runtime": 45845.914, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.42, + "grad_norm": 0.012406818568706512, + "learning_rate": 7.473303797195276e-05, + "loss": 0.012970956973731518, + "num_input_tokens_seen": 92409768, + "step": 5643, + "train_runtime": 45854.0313, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.4206060606060604, + "grad_norm": 0.009518361650407314, + "learning_rate": 7.472468026127385e-05, + "loss": 0.013033135794103146, + "num_input_tokens_seen": 92426144, + "step": 5644, + "train_runtime": 45862.1424, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.4212121212121214, + "grad_norm": 0.004814252257347107, + "learning_rate": 7.471632163607714e-05, + "loss": 0.011133428663015366, + "num_input_tokens_seen": 92442520, + "step": 5645, + "train_runtime": 45870.2611, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.421818181818182, + "grad_norm": 0.0062969340942800045, + "learning_rate": 7.470796209667184e-05, + "loss": 0.011497458443045616, + "num_input_tokens_seen": 92458896, + "step": 5646, + "train_runtime": 45878.3768, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.4224242424242424, + "grad_norm": 0.009826857596635818, + "learning_rate": 7.469960164336711e-05, + "loss": 0.01284845545887947, + "num_input_tokens_seen": 92475272, + "step": 5647, + "train_runtime": 45886.491, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.423030303030303, + "grad_norm": 0.008992847986519337, + "learning_rate": 7.469124027647224e-05, + "loss": 0.01120210811495781, + "num_input_tokens_seen": 92491648, + "step": 5648, + "train_runtime": 45894.6049, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.423636363636364, + "grad_norm": 0.012295125983655453, + "learning_rate": 7.468287799629648e-05, + "loss": 0.012950480915606022, + "num_input_tokens_seen": 92508024, + "step": 5649, + "train_runtime": 45902.7197, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.4242424242424243, + "grad_norm": 0.009868228808045387, + "learning_rate": 7.467451480314912e-05, + "loss": 0.012300976552069187, + "num_input_tokens_seen": 92524400, + "step": 5650, + "train_runtime": 45910.8369, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.424848484848485, + "grad_norm": 0.007319753523916006, + "learning_rate": 7.466615069733951e-05, + "loss": 0.011811192147433758, + "num_input_tokens_seen": 92540776, + "step": 5651, + "train_runtime": 45918.9551, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.4254545454545453, + "grad_norm": 0.008323924615979195, + "learning_rate": 7.465778567917702e-05, + "loss": 0.013353224843740463, + "num_input_tokens_seen": 92557152, + "step": 5652, + "train_runtime": 45927.075, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.4260606060606063, + "grad_norm": 0.005229050759226084, + "learning_rate": 7.464941974897106e-05, + "loss": 0.011186614632606506, + "num_input_tokens_seen": 92573528, + "step": 5653, + "train_runtime": 45935.193, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.4266666666666667, + "grad_norm": 0.0063659125007689, + "learning_rate": 7.464105290703106e-05, + "loss": 0.013129210099577904, + "num_input_tokens_seen": 92589904, + "step": 5654, + "train_runtime": 45943.3095, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.4272727272727272, + "grad_norm": 0.005025164689868689, + "learning_rate": 7.463268515366651e-05, + "loss": 0.011970991268754005, + "num_input_tokens_seen": 92606280, + "step": 5655, + "train_runtime": 45951.431, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.4278787878787877, + "grad_norm": 0.04118833690881729, + "learning_rate": 7.462431648918689e-05, + "loss": 0.011796675622463226, + "num_input_tokens_seen": 92622656, + "step": 5656, + "train_runtime": 45959.5466, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.4284848484848487, + "grad_norm": 0.005679294466972351, + "learning_rate": 7.461594691390177e-05, + "loss": 0.010558663867413998, + "num_input_tokens_seen": 92639032, + "step": 5657, + "train_runtime": 45967.6604, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.429090909090909, + "grad_norm": 0.006505673751235008, + "learning_rate": 7.46075764281207e-05, + "loss": 0.01141516026109457, + "num_input_tokens_seen": 92655408, + "step": 5658, + "train_runtime": 45975.7787, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.4296969696969697, + "grad_norm": 0.006326816510409117, + "learning_rate": 7.459920503215332e-05, + "loss": 0.011857026256620884, + "num_input_tokens_seen": 92671784, + "step": 5659, + "train_runtime": 45983.8947, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.43030303030303, + "grad_norm": 0.007779103238135576, + "learning_rate": 7.459083272630923e-05, + "loss": 0.013331515714526176, + "num_input_tokens_seen": 92688160, + "step": 5660, + "train_runtime": 45992.0134, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.4309090909090907, + "grad_norm": 0.006578861735761166, + "learning_rate": 7.458245951089813e-05, + "loss": 0.013023799285292625, + "num_input_tokens_seen": 92704536, + "step": 5661, + "train_runtime": 46000.1328, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.4315151515151516, + "grad_norm": 0.011803492903709412, + "learning_rate": 7.457408538622972e-05, + "loss": 0.013459639623761177, + "num_input_tokens_seen": 92720912, + "step": 5662, + "train_runtime": 46008.2445, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.432121212121212, + "grad_norm": 0.007298639044165611, + "learning_rate": 7.456571035261376e-05, + "loss": 0.011962136253714561, + "num_input_tokens_seen": 92737288, + "step": 5663, + "train_runtime": 46016.3573, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.4327272727272726, + "grad_norm": 0.005935965571552515, + "learning_rate": 7.455733441036e-05, + "loss": 0.011353859677910805, + "num_input_tokens_seen": 92753664, + "step": 5664, + "train_runtime": 46024.474, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.4333333333333336, + "grad_norm": 0.011586221866309643, + "learning_rate": 7.454895755977827e-05, + "loss": 0.013800525106489658, + "num_input_tokens_seen": 92770040, + "step": 5665, + "train_runtime": 46032.5887, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.433939393939394, + "grad_norm": 0.008750084787607193, + "learning_rate": 7.454057980117841e-05, + "loss": 0.012785893864929676, + "num_input_tokens_seen": 92786416, + "step": 5666, + "train_runtime": 46040.7033, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.4345454545454546, + "grad_norm": 0.010341456159949303, + "learning_rate": 7.45322011348703e-05, + "loss": 0.011114949360489845, + "num_input_tokens_seen": 92802792, + "step": 5667, + "train_runtime": 46048.8227, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.435151515151515, + "grad_norm": 0.00952091719955206, + "learning_rate": 7.452382156116383e-05, + "loss": 0.014661101624369621, + "num_input_tokens_seen": 92819168, + "step": 5668, + "train_runtime": 46056.9358, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.4357575757575756, + "grad_norm": 0.007992749102413654, + "learning_rate": 7.451544108036897e-05, + "loss": 0.013792257755994797, + "num_input_tokens_seen": 92835544, + "step": 5669, + "train_runtime": 46065.0498, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.4363636363636365, + "grad_norm": 0.010410956107079983, + "learning_rate": 7.450705969279568e-05, + "loss": 0.01291271485388279, + "num_input_tokens_seen": 92851920, + "step": 5670, + "train_runtime": 46073.1606, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.436969696969697, + "grad_norm": 0.018489185720682144, + "learning_rate": 7.449867739875397e-05, + "loss": 0.012213547714054585, + "num_input_tokens_seen": 92868296, + "step": 5671, + "train_runtime": 46081.2758, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.4375757575757575, + "grad_norm": 0.011361268348991871, + "learning_rate": 7.44902941985539e-05, + "loss": 0.012068888172507286, + "num_input_tokens_seen": 92884672, + "step": 5672, + "train_runtime": 46089.3895, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.438181818181818, + "grad_norm": 0.007942626252770424, + "learning_rate": 7.448191009250554e-05, + "loss": 0.012637203559279442, + "num_input_tokens_seen": 92901048, + "step": 5673, + "train_runtime": 46097.5025, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.438787878787879, + "grad_norm": 0.008537056855857372, + "learning_rate": 7.447352508091902e-05, + "loss": 0.012600860558450222, + "num_input_tokens_seen": 92917424, + "step": 5674, + "train_runtime": 46105.6159, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.4393939393939394, + "grad_norm": 0.00900659617036581, + "learning_rate": 7.446513916410443e-05, + "loss": 0.012105975300073624, + "num_input_tokens_seen": 92933800, + "step": 5675, + "train_runtime": 46113.7366, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.44, + "grad_norm": 0.006960330065339804, + "learning_rate": 7.445675234237202e-05, + "loss": 0.012462804093956947, + "num_input_tokens_seen": 92950176, + "step": 5676, + "train_runtime": 46121.848, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.4406060606060604, + "grad_norm": 0.0076194508001208305, + "learning_rate": 7.444836461603195e-05, + "loss": 0.012984546832740307, + "num_input_tokens_seen": 92966552, + "step": 5677, + "train_runtime": 46129.9622, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.4412121212121214, + "grad_norm": 0.006087950896471739, + "learning_rate": 7.44399759853945e-05, + "loss": 0.012162751518189907, + "num_input_tokens_seen": 92982928, + "step": 5678, + "train_runtime": 46138.078, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.441818181818182, + "grad_norm": 0.005816651042550802, + "learning_rate": 7.443158645076991e-05, + "loss": 0.012019617483019829, + "num_input_tokens_seen": 92999304, + "step": 5679, + "train_runtime": 46146.1948, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.4424242424242424, + "grad_norm": 0.0022751176729798317, + "learning_rate": 7.442319601246852e-05, + "loss": 0.012464880011975765, + "num_input_tokens_seen": 93015680, + "step": 5680, + "train_runtime": 46154.3079, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.443030303030303, + "grad_norm": 0.007649595849215984, + "learning_rate": 7.441480467080066e-05, + "loss": 0.012837440706789494, + "num_input_tokens_seen": 93032056, + "step": 5681, + "train_runtime": 46162.4597, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.443636363636364, + "grad_norm": 0.00949709303677082, + "learning_rate": 7.440641242607675e-05, + "loss": 0.011738740839064121, + "num_input_tokens_seen": 93048432, + "step": 5682, + "train_runtime": 46170.5763, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.4442424242424243, + "grad_norm": 0.00678859231993556, + "learning_rate": 7.439801927860717e-05, + "loss": 0.012019388377666473, + "num_input_tokens_seen": 93064808, + "step": 5683, + "train_runtime": 46178.6919, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.444848484848485, + "grad_norm": 0.009626498445868492, + "learning_rate": 7.438962522870237e-05, + "loss": 0.012465447187423706, + "num_input_tokens_seen": 93081184, + "step": 5684, + "train_runtime": 46186.8048, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.4454545454545453, + "grad_norm": 0.004472990985959768, + "learning_rate": 7.43812302766728e-05, + "loss": 0.01124614104628563, + "num_input_tokens_seen": 93097560, + "step": 5685, + "train_runtime": 46194.9204, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.4460606060606063, + "grad_norm": 0.010441385209560394, + "learning_rate": 7.437283442282904e-05, + "loss": 0.013200185261666775, + "num_input_tokens_seen": 93113936, + "step": 5686, + "train_runtime": 46203.0373, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.4466666666666668, + "grad_norm": 0.006401207763701677, + "learning_rate": 7.436443766748158e-05, + "loss": 0.012713635340332985, + "num_input_tokens_seen": 93130312, + "step": 5687, + "train_runtime": 46211.1495, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.4472727272727273, + "grad_norm": 0.0055511207319796085, + "learning_rate": 7.435604001094102e-05, + "loss": 0.011288444511592388, + "num_input_tokens_seen": 93146688, + "step": 5688, + "train_runtime": 46219.2625, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.4478787878787878, + "grad_norm": 0.024597343057394028, + "learning_rate": 7.434764145351796e-05, + "loss": 0.01280852872878313, + "num_input_tokens_seen": 93163064, + "step": 5689, + "train_runtime": 46227.3767, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.4484848484848483, + "grad_norm": 0.009699016809463501, + "learning_rate": 7.433924199552307e-05, + "loss": 0.011964798904955387, + "num_input_tokens_seen": 93179440, + "step": 5690, + "train_runtime": 46235.4977, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.449090909090909, + "grad_norm": 0.008894070982933044, + "learning_rate": 7.433084163726703e-05, + "loss": 0.012337209656834602, + "num_input_tokens_seen": 93195816, + "step": 5691, + "train_runtime": 46243.6151, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.4496969696969697, + "grad_norm": 0.005355211440473795, + "learning_rate": 7.432244037906053e-05, + "loss": 0.011590537615120411, + "num_input_tokens_seen": 93212192, + "step": 5692, + "train_runtime": 46251.7337, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.45030303030303, + "grad_norm": 0.0014480111422017217, + "learning_rate": 7.431403822121431e-05, + "loss": 0.010893017053604126, + "num_input_tokens_seen": 93228568, + "step": 5693, + "train_runtime": 46259.8537, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.450909090909091, + "grad_norm": 0.0035000210627913475, + "learning_rate": 7.430563516403918e-05, + "loss": 0.011136114597320557, + "num_input_tokens_seen": 93244944, + "step": 5694, + "train_runtime": 46267.9716, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.4515151515151516, + "grad_norm": 0.010011779144406319, + "learning_rate": 7.429723120784594e-05, + "loss": 0.013940184377133846, + "num_input_tokens_seen": 93261320, + "step": 5695, + "train_runtime": 46276.0861, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.452121212121212, + "grad_norm": 0.004295106045901775, + "learning_rate": 7.428882635294543e-05, + "loss": 0.011277815327048302, + "num_input_tokens_seen": 93277696, + "step": 5696, + "train_runtime": 46284.1994, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.4527272727272726, + "grad_norm": 0.010794086381793022, + "learning_rate": 7.428042059964853e-05, + "loss": 0.012796160764992237, + "num_input_tokens_seen": 93294072, + "step": 5697, + "train_runtime": 46292.317, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.453333333333333, + "grad_norm": 0.00434096809476614, + "learning_rate": 7.427201394826616e-05, + "loss": 0.01212590467184782, + "num_input_tokens_seen": 93310448, + "step": 5698, + "train_runtime": 46300.4332, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.453939393939394, + "grad_norm": 0.005192629527300596, + "learning_rate": 7.426360639910926e-05, + "loss": 0.010218428447842598, + "num_input_tokens_seen": 93326824, + "step": 5699, + "train_runtime": 46308.5475, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.4545454545454546, + "grad_norm": 0.007969344034790993, + "learning_rate": 7.425519795248882e-05, + "loss": 0.012883758172392845, + "num_input_tokens_seen": 93343200, + "step": 5700, + "train_runtime": 46316.6625, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.455151515151515, + "grad_norm": 0.006714339833706617, + "learning_rate": 7.424678860871584e-05, + "loss": 0.011554012075066566, + "num_input_tokens_seen": 93359576, + "step": 5701, + "train_runtime": 46325.6746, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.4557575757575756, + "grad_norm": 0.01265820860862732, + "learning_rate": 7.423837836810135e-05, + "loss": 0.01254359632730484, + "num_input_tokens_seen": 93375952, + "step": 5702, + "train_runtime": 46333.7851, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.4563636363636365, + "grad_norm": 0.016725221648812294, + "learning_rate": 7.422996723095647e-05, + "loss": 0.01151125505566597, + "num_input_tokens_seen": 93392328, + "step": 5703, + "train_runtime": 46341.8959, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.456969696969697, + "grad_norm": 0.0065767657943069935, + "learning_rate": 7.422155519759228e-05, + "loss": 0.011213088408112526, + "num_input_tokens_seen": 93408704, + "step": 5704, + "train_runtime": 46350.0109, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.4575757575757575, + "grad_norm": 0.008244355209171772, + "learning_rate": 7.421314226831993e-05, + "loss": 0.013280751183629036, + "num_input_tokens_seen": 93425080, + "step": 5705, + "train_runtime": 46358.1655, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.458181818181818, + "grad_norm": 0.003261887701228261, + "learning_rate": 7.420472844345059e-05, + "loss": 0.010885167866945267, + "num_input_tokens_seen": 93441456, + "step": 5706, + "train_runtime": 46366.2779, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.458787878787879, + "grad_norm": 0.0074326652102172375, + "learning_rate": 7.419631372329549e-05, + "loss": 0.011909465305507183, + "num_input_tokens_seen": 93457832, + "step": 5707, + "train_runtime": 46374.3926, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.4593939393939395, + "grad_norm": 0.008344646543264389, + "learning_rate": 7.418789810816587e-05, + "loss": 0.012697060592472553, + "num_input_tokens_seen": 93474208, + "step": 5708, + "train_runtime": 46382.5112, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.46, + "grad_norm": 0.00898341927677393, + "learning_rate": 7.4179481598373e-05, + "loss": 0.012666013091802597, + "num_input_tokens_seen": 93490584, + "step": 5709, + "train_runtime": 46390.6318, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.4606060606060605, + "grad_norm": 0.005887191742658615, + "learning_rate": 7.417106419422819e-05, + "loss": 0.011772170662879944, + "num_input_tokens_seen": 93506960, + "step": 5710, + "train_runtime": 46398.7475, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.461212121212121, + "grad_norm": 0.007113631349056959, + "learning_rate": 7.41626458960428e-05, + "loss": 0.011800704523921013, + "num_input_tokens_seen": 93523336, + "step": 5711, + "train_runtime": 46406.8613, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.461818181818182, + "grad_norm": 0.004423167090862989, + "learning_rate": 7.415422670412818e-05, + "loss": 0.011215953156352043, + "num_input_tokens_seen": 93539712, + "step": 5712, + "train_runtime": 46414.9777, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.4624242424242424, + "grad_norm": 0.003702617483213544, + "learning_rate": 7.414580661879575e-05, + "loss": 0.011041272431612015, + "num_input_tokens_seen": 93556088, + "step": 5713, + "train_runtime": 46423.0947, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.463030303030303, + "grad_norm": 0.008982347324490547, + "learning_rate": 7.413738564035695e-05, + "loss": 0.012591475620865822, + "num_input_tokens_seen": 93572464, + "step": 5714, + "train_runtime": 46431.2093, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.463636363636364, + "grad_norm": 0.007844550535082817, + "learning_rate": 7.412896376912328e-05, + "loss": 0.012164908461272717, + "num_input_tokens_seen": 93588840, + "step": 5715, + "train_runtime": 46439.3415, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.4642424242424243, + "grad_norm": 0.008633282035589218, + "learning_rate": 7.412054100540623e-05, + "loss": 0.012181570753455162, + "num_input_tokens_seen": 93605216, + "step": 5716, + "train_runtime": 46447.4573, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.464848484848485, + "grad_norm": 0.010241689160466194, + "learning_rate": 7.411211734951732e-05, + "loss": 0.013211209326982498, + "num_input_tokens_seen": 93621592, + "step": 5717, + "train_runtime": 46455.5728, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.4654545454545453, + "grad_norm": 0.007290650624781847, + "learning_rate": 7.410369280176816e-05, + "loss": 0.01144292950630188, + "num_input_tokens_seen": 93637968, + "step": 5718, + "train_runtime": 46463.6905, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.466060606060606, + "grad_norm": 0.00424894830211997, + "learning_rate": 7.409526736247034e-05, + "loss": 0.01089190412312746, + "num_input_tokens_seen": 93654344, + "step": 5719, + "train_runtime": 46471.8161, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 0.009827720001339912, + "learning_rate": 7.408684103193551e-05, + "loss": 0.01270249206572771, + "num_input_tokens_seen": 93670720, + "step": 5720, + "train_runtime": 46479.9344, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.4672727272727273, + "grad_norm": 0.00946971494704485, + "learning_rate": 7.407841381047532e-05, + "loss": 0.01244182325899601, + "num_input_tokens_seen": 93687096, + "step": 5721, + "train_runtime": 46488.0548, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.467878787878788, + "grad_norm": 0.005102105438709259, + "learning_rate": 7.406998569840151e-05, + "loss": 0.011259316466748714, + "num_input_tokens_seen": 93703472, + "step": 5722, + "train_runtime": 46496.1755, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.4684848484848487, + "grad_norm": 0.00581786222755909, + "learning_rate": 7.406155669602579e-05, + "loss": 0.01192222349345684, + "num_input_tokens_seen": 93719848, + "step": 5723, + "train_runtime": 46504.3017, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.4690909090909092, + "grad_norm": 0.005428736563771963, + "learning_rate": 7.405312680365996e-05, + "loss": 0.010069424286484718, + "num_input_tokens_seen": 93736224, + "step": 5724, + "train_runtime": 46512.4336, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.4696969696969697, + "grad_norm": 0.006517233792692423, + "learning_rate": 7.404469602161579e-05, + "loss": 0.011620840057730675, + "num_input_tokens_seen": 93752600, + "step": 5725, + "train_runtime": 46520.5497, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.4703030303030302, + "grad_norm": 0.006483083125203848, + "learning_rate": 7.403626435020516e-05, + "loss": 0.0127715440467, + "num_input_tokens_seen": 93768976, + "step": 5726, + "train_runtime": 46528.6631, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.4709090909090907, + "grad_norm": 0.007359194569289684, + "learning_rate": 7.402783178973991e-05, + "loss": 0.012299592606723309, + "num_input_tokens_seen": 93785352, + "step": 5727, + "train_runtime": 46536.7753, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.4715151515151517, + "grad_norm": 0.006959372665733099, + "learning_rate": 7.401939834053197e-05, + "loss": 0.011287372559309006, + "num_input_tokens_seen": 93801728, + "step": 5728, + "train_runtime": 46544.892, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.472121212121212, + "grad_norm": 0.007229479495435953, + "learning_rate": 7.401096400289324e-05, + "loss": 0.011434637941420078, + "num_input_tokens_seen": 93818104, + "step": 5729, + "train_runtime": 46553.006, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.4727272727272727, + "grad_norm": 0.0081328721717, + "learning_rate": 7.400252877713571e-05, + "loss": 0.011495150625705719, + "num_input_tokens_seen": 93834480, + "step": 5730, + "train_runtime": 46561.1327, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.473333333333333, + "grad_norm": 0.007916068658232689, + "learning_rate": 7.399409266357139e-05, + "loss": 0.012190028093755245, + "num_input_tokens_seen": 93850856, + "step": 5731, + "train_runtime": 46569.2441, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.473939393939394, + "grad_norm": 0.008217734284698963, + "learning_rate": 7.398565566251232e-05, + "loss": 0.012017288245260715, + "num_input_tokens_seen": 93867232, + "step": 5732, + "train_runtime": 46577.3594, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.4745454545454546, + "grad_norm": 0.012743283994495869, + "learning_rate": 7.397721777427054e-05, + "loss": 0.01196483988314867, + "num_input_tokens_seen": 93883608, + "step": 5733, + "train_runtime": 46585.4766, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.475151515151515, + "grad_norm": 0.009506470523774624, + "learning_rate": 7.396877899915818e-05, + "loss": 0.012446848675608635, + "num_input_tokens_seen": 93899984, + "step": 5734, + "train_runtime": 46593.5926, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.4757575757575756, + "grad_norm": 0.007567623630166054, + "learning_rate": 7.396033933748735e-05, + "loss": 0.01220247894525528, + "num_input_tokens_seen": 93916360, + "step": 5735, + "train_runtime": 46601.7087, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.4763636363636365, + "grad_norm": 0.009169073775410652, + "learning_rate": 7.395189878957025e-05, + "loss": 0.013506796211004257, + "num_input_tokens_seen": 93932736, + "step": 5736, + "train_runtime": 46609.8322, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.476969696969697, + "grad_norm": 0.005508784204721451, + "learning_rate": 7.394345735571903e-05, + "loss": 0.012259161099791527, + "num_input_tokens_seen": 93949112, + "step": 5737, + "train_runtime": 46617.947, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.4775757575757575, + "grad_norm": 0.010150251910090446, + "learning_rate": 7.393501503624597e-05, + "loss": 0.011637775227427483, + "num_input_tokens_seen": 93965488, + "step": 5738, + "train_runtime": 46626.0657, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.478181818181818, + "grad_norm": 0.008901430293917656, + "learning_rate": 7.392657183146331e-05, + "loss": 0.012503727339208126, + "num_input_tokens_seen": 93981864, + "step": 5739, + "train_runtime": 46634.1823, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.4787878787878785, + "grad_norm": 0.013003533706068993, + "learning_rate": 7.391812774168334e-05, + "loss": 0.01290416345000267, + "num_input_tokens_seen": 93998240, + "step": 5740, + "train_runtime": 46642.2961, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.4793939393939395, + "grad_norm": 0.00936222355812788, + "learning_rate": 7.390968276721844e-05, + "loss": 0.012101269327104092, + "num_input_tokens_seen": 94014616, + "step": 5741, + "train_runtime": 46650.4151, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.48, + "grad_norm": 0.009190194308757782, + "learning_rate": 7.39012369083809e-05, + "loss": 0.013375822454690933, + "num_input_tokens_seen": 94030992, + "step": 5742, + "train_runtime": 46658.5339, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.4806060606060605, + "grad_norm": 0.008457150310277939, + "learning_rate": 7.389279016548316e-05, + "loss": 0.01176963746547699, + "num_input_tokens_seen": 94047368, + "step": 5743, + "train_runtime": 46666.6501, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.4812121212121214, + "grad_norm": 0.008041429333388805, + "learning_rate": 7.388434253883765e-05, + "loss": 0.011248382739722729, + "num_input_tokens_seen": 94063744, + "step": 5744, + "train_runtime": 46674.7638, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.481818181818182, + "grad_norm": 0.007893534377217293, + "learning_rate": 7.387589402875681e-05, + "loss": 0.011968870647251606, + "num_input_tokens_seen": 94080120, + "step": 5745, + "train_runtime": 46682.8823, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.4824242424242424, + "grad_norm": 0.006555131170898676, + "learning_rate": 7.386744463555316e-05, + "loss": 0.011590119451284409, + "num_input_tokens_seen": 94096496, + "step": 5746, + "train_runtime": 46690.9961, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.483030303030303, + "grad_norm": 0.010505225509405136, + "learning_rate": 7.38589943595392e-05, + "loss": 0.012342461384832859, + "num_input_tokens_seen": 94112872, + "step": 5747, + "train_runtime": 46699.11, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.4836363636363634, + "grad_norm": 0.007533901371061802, + "learning_rate": 7.385054320102751e-05, + "loss": 0.012379327788949013, + "num_input_tokens_seen": 94129248, + "step": 5748, + "train_runtime": 46707.2226, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.4842424242424244, + "grad_norm": 0.005872008856385946, + "learning_rate": 7.384209116033067e-05, + "loss": 0.012513642199337482, + "num_input_tokens_seen": 94145624, + "step": 5749, + "train_runtime": 46715.3387, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.484848484848485, + "grad_norm": 0.007267140783369541, + "learning_rate": 7.383363823776132e-05, + "loss": 0.013211112469434738, + "num_input_tokens_seen": 94162000, + "step": 5750, + "train_runtime": 46723.4523, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.4854545454545454, + "grad_norm": 0.004977668635547161, + "learning_rate": 7.382518443363208e-05, + "loss": 0.010892066173255444, + "num_input_tokens_seen": 94178376, + "step": 5751, + "train_runtime": 46731.5646, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.4860606060606063, + "grad_norm": 0.007567159365862608, + "learning_rate": 7.381672974825569e-05, + "loss": 0.012525063008069992, + "num_input_tokens_seen": 94194752, + "step": 5752, + "train_runtime": 46739.6798, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.486666666666667, + "grad_norm": 0.0047128573060035706, + "learning_rate": 7.380827418194483e-05, + "loss": 0.01172194629907608, + "num_input_tokens_seen": 94211128, + "step": 5753, + "train_runtime": 46747.7978, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.4872727272727273, + "grad_norm": 0.007110376842319965, + "learning_rate": 7.379981773501229e-05, + "loss": 0.011042311787605286, + "num_input_tokens_seen": 94227504, + "step": 5754, + "train_runtime": 46755.913, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.487878787878788, + "grad_norm": 0.006516760680824518, + "learning_rate": 7.379136040777083e-05, + "loss": 0.01209244504570961, + "num_input_tokens_seen": 94243880, + "step": 5755, + "train_runtime": 46764.0312, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.4884848484848483, + "grad_norm": 0.005064810160547495, + "learning_rate": 7.378290220053328e-05, + "loss": 0.01227223128080368, + "num_input_tokens_seen": 94260256, + "step": 5756, + "train_runtime": 46772.1475, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.4890909090909092, + "grad_norm": 0.006867413874715567, + "learning_rate": 7.37744431136125e-05, + "loss": 0.012030337937176228, + "num_input_tokens_seen": 94276632, + "step": 5757, + "train_runtime": 46780.2607, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.4896969696969697, + "grad_norm": 0.005449771415442228, + "learning_rate": 7.376598314732135e-05, + "loss": 0.012352284044027328, + "num_input_tokens_seen": 94293008, + "step": 5758, + "train_runtime": 46788.3715, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.4903030303030302, + "grad_norm": 0.003147744806483388, + "learning_rate": 7.375752230197278e-05, + "loss": 0.011739364825189114, + "num_input_tokens_seen": 94309384, + "step": 5759, + "train_runtime": 46796.4814, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.4909090909090907, + "grad_norm": 0.010479380376636982, + "learning_rate": 7.374906057787971e-05, + "loss": 0.013562586158514023, + "num_input_tokens_seen": 94325760, + "step": 5760, + "train_runtime": 46804.5944, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.4915151515151517, + "grad_norm": 0.007373539265245199, + "learning_rate": 7.374059797535517e-05, + "loss": 0.011339793913066387, + "num_input_tokens_seen": 94342136, + "step": 5761, + "train_runtime": 46812.7066, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.492121212121212, + "grad_norm": 0.008328596130013466, + "learning_rate": 7.373213449471213e-05, + "loss": 0.012237218208611012, + "num_input_tokens_seen": 94358512, + "step": 5762, + "train_runtime": 46820.8188, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.4927272727272727, + "grad_norm": 0.008913877420127392, + "learning_rate": 7.372367013626362e-05, + "loss": 0.012517942115664482, + "num_input_tokens_seen": 94374888, + "step": 5763, + "train_runtime": 46828.9322, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.493333333333333, + "grad_norm": 0.009435540065169334, + "learning_rate": 7.37152049003228e-05, + "loss": 0.012177466414868832, + "num_input_tokens_seen": 94391264, + "step": 5764, + "train_runtime": 46837.0515, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.493939393939394, + "grad_norm": 0.012269715778529644, + "learning_rate": 7.37067387872027e-05, + "loss": 0.012865628115832806, + "num_input_tokens_seen": 94407640, + "step": 5765, + "train_runtime": 46845.1618, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.4945454545454546, + "grad_norm": 0.007109838537871838, + "learning_rate": 7.369827179721651e-05, + "loss": 0.011837600730359554, + "num_input_tokens_seen": 94424016, + "step": 5766, + "train_runtime": 46853.2741, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.495151515151515, + "grad_norm": 0.010053800418972969, + "learning_rate": 7.368980393067739e-05, + "loss": 0.012629539705812931, + "num_input_tokens_seen": 94440392, + "step": 5767, + "train_runtime": 46861.3905, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.4957575757575756, + "grad_norm": 0.007064797915518284, + "learning_rate": 7.368133518789857e-05, + "loss": 0.011487978510558605, + "num_input_tokens_seen": 94456768, + "step": 5768, + "train_runtime": 46869.5063, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.496363636363636, + "grad_norm": 0.008290157653391361, + "learning_rate": 7.367286556919327e-05, + "loss": 0.01161447074264288, + "num_input_tokens_seen": 94473144, + "step": 5769, + "train_runtime": 46877.6317, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.496969696969697, + "grad_norm": 0.006640593986958265, + "learning_rate": 7.366439507487478e-05, + "loss": 0.011869044043123722, + "num_input_tokens_seen": 94489520, + "step": 5770, + "train_runtime": 46885.7441, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.4975757575757576, + "grad_norm": 0.005989828612655401, + "learning_rate": 7.365592370525639e-05, + "loss": 0.011852039024233818, + "num_input_tokens_seen": 94505896, + "step": 5771, + "train_runtime": 46893.8616, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.498181818181818, + "grad_norm": 0.019068827852606773, + "learning_rate": 7.364745146065146e-05, + "loss": 0.013244143687188625, + "num_input_tokens_seen": 94522272, + "step": 5772, + "train_runtime": 46901.9746, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.498787878787879, + "grad_norm": 0.005840938538312912, + "learning_rate": 7.363897834137334e-05, + "loss": 0.012880822643637657, + "num_input_tokens_seen": 94538648, + "step": 5773, + "train_runtime": 46910.091, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.4993939393939395, + "grad_norm": 0.008264121599495411, + "learning_rate": 7.363050434773546e-05, + "loss": 0.01202466618269682, + "num_input_tokens_seen": 94555024, + "step": 5774, + "train_runtime": 46918.2042, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.5, + "grad_norm": 0.009254860691726208, + "learning_rate": 7.362202948005123e-05, + "loss": 0.012589479796588421, + "num_input_tokens_seen": 94571400, + "step": 5775, + "train_runtime": 46926.3199, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.5006060606060605, + "grad_norm": 0.0061925603076815605, + "learning_rate": 7.361355373863414e-05, + "loss": 0.011458649300038815, + "num_input_tokens_seen": 94587776, + "step": 5776, + "train_runtime": 46934.4365, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.501212121212121, + "grad_norm": 0.004910873249173164, + "learning_rate": 7.360507712379769e-05, + "loss": 0.010929007083177567, + "num_input_tokens_seen": 94604152, + "step": 5777, + "train_runtime": 46942.5521, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.501818181818182, + "grad_norm": 0.012952589429914951, + "learning_rate": 7.359659963585539e-05, + "loss": 0.012447602115571499, + "num_input_tokens_seen": 94620528, + "step": 5778, + "train_runtime": 46950.6667, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.5024242424242424, + "grad_norm": 0.004770078230649233, + "learning_rate": 7.358812127512082e-05, + "loss": 0.01144139003008604, + "num_input_tokens_seen": 94636904, + "step": 5779, + "train_runtime": 46958.7804, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.503030303030303, + "grad_norm": 0.007522217463701963, + "learning_rate": 7.357964204190759e-05, + "loss": 0.012275812216103077, + "num_input_tokens_seen": 94653280, + "step": 5780, + "train_runtime": 46966.8935, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.503636363636364, + "grad_norm": 0.00546399736776948, + "learning_rate": 7.357116193652931e-05, + "loss": 0.013187265023589134, + "num_input_tokens_seen": 94669656, + "step": 5781, + "train_runtime": 46975.0061, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.5042424242424244, + "grad_norm": 0.005577375181019306, + "learning_rate": 7.356268095929966e-05, + "loss": 0.010674688965082169, + "num_input_tokens_seen": 94686032, + "step": 5782, + "train_runtime": 46983.1304, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.504848484848485, + "grad_norm": 0.011865477077662945, + "learning_rate": 7.355419911053232e-05, + "loss": 0.012380022555589676, + "num_input_tokens_seen": 94702408, + "step": 5783, + "train_runtime": 46991.246, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.5054545454545454, + "grad_norm": 0.006339898332953453, + "learning_rate": 7.354571639054105e-05, + "loss": 0.012048576027154922, + "num_input_tokens_seen": 94718784, + "step": 5784, + "train_runtime": 46999.3637, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.506060606060606, + "grad_norm": 0.00936094205826521, + "learning_rate": 7.353723279963956e-05, + "loss": 0.012387973256409168, + "num_input_tokens_seen": 94735160, + "step": 5785, + "train_runtime": 47007.4789, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.506666666666667, + "grad_norm": 0.008000703528523445, + "learning_rate": 7.352874833814168e-05, + "loss": 0.012093688361346722, + "num_input_tokens_seen": 94751536, + "step": 5786, + "train_runtime": 47015.5944, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.5072727272727273, + "grad_norm": 0.007156278472393751, + "learning_rate": 7.352026300636121e-05, + "loss": 0.010478870943188667, + "num_input_tokens_seen": 94767912, + "step": 5787, + "train_runtime": 47023.7074, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.507878787878788, + "grad_norm": 0.006965792737901211, + "learning_rate": 7.3511776804612e-05, + "loss": 0.012945892289280891, + "num_input_tokens_seen": 94784288, + "step": 5788, + "train_runtime": 47031.8193, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.5084848484848488, + "grad_norm": 0.0025677899830043316, + "learning_rate": 7.350328973320798e-05, + "loss": 0.013109579682350159, + "num_input_tokens_seen": 94800664, + "step": 5789, + "train_runtime": 47039.9338, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.509090909090909, + "grad_norm": 0.006909017916768789, + "learning_rate": 7.349480179246303e-05, + "loss": 0.01141324546188116, + "num_input_tokens_seen": 94817040, + "step": 5790, + "train_runtime": 47048.0477, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.5096969696969698, + "grad_norm": 0.008132491260766983, + "learning_rate": 7.348631298269114e-05, + "loss": 0.011673266999423504, + "num_input_tokens_seen": 94833416, + "step": 5791, + "train_runtime": 47056.1657, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.5103030303030303, + "grad_norm": 0.010527039878070354, + "learning_rate": 7.347782330420622e-05, + "loss": 0.012838568538427353, + "num_input_tokens_seen": 94849792, + "step": 5792, + "train_runtime": 47064.2825, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.5109090909090908, + "grad_norm": 0.008363801054656506, + "learning_rate": 7.346933275732237e-05, + "loss": 0.012484709732234478, + "num_input_tokens_seen": 94866168, + "step": 5793, + "train_runtime": 47072.399, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.5115151515151517, + "grad_norm": 0.007673931308090687, + "learning_rate": 7.34608413423536e-05, + "loss": 0.013201752677559853, + "num_input_tokens_seen": 94882544, + "step": 5794, + "train_runtime": 47080.5124, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.512121212121212, + "grad_norm": 0.002579211490228772, + "learning_rate": 7.3452349059614e-05, + "loss": 0.011163623072206974, + "num_input_tokens_seen": 94898920, + "step": 5795, + "train_runtime": 47088.6328, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.5127272727272727, + "grad_norm": 0.005312573164701462, + "learning_rate": 7.344385590941768e-05, + "loss": 0.012041897512972355, + "num_input_tokens_seen": 94915296, + "step": 5796, + "train_runtime": 47096.7459, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.513333333333333, + "grad_norm": 0.007416535168886185, + "learning_rate": 7.343536189207878e-05, + "loss": 0.01099968422204256, + "num_input_tokens_seen": 94931672, + "step": 5797, + "train_runtime": 47104.8614, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.5139393939393937, + "grad_norm": 0.007762560620903969, + "learning_rate": 7.342686700791148e-05, + "loss": 0.012042861431837082, + "num_input_tokens_seen": 94948048, + "step": 5798, + "train_runtime": 47112.9748, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.5145454545454546, + "grad_norm": 0.008070698007941246, + "learning_rate": 7.341837125723e-05, + "loss": 0.012285090982913971, + "num_input_tokens_seen": 94964424, + "step": 5799, + "train_runtime": 47121.0879, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.515151515151515, + "grad_norm": 0.0092160077765584, + "learning_rate": 7.340987464034857e-05, + "loss": 0.013058573007583618, + "num_input_tokens_seen": 94980800, + "step": 5800, + "train_runtime": 47129.2017, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.5157575757575756, + "grad_norm": 0.0058651152066886425, + "learning_rate": 7.340137715758146e-05, + "loss": 0.010406364686787128, + "num_input_tokens_seen": 94997176, + "step": 5801, + "train_runtime": 47138.2366, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.5163636363636366, + "grad_norm": 0.007012000307440758, + "learning_rate": 7.339287880924298e-05, + "loss": 0.012935097329318523, + "num_input_tokens_seen": 95013552, + "step": 5802, + "train_runtime": 47146.346, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.516969696969697, + "grad_norm": 0.008293623104691505, + "learning_rate": 7.338437959564748e-05, + "loss": 0.012179568409919739, + "num_input_tokens_seen": 95029928, + "step": 5803, + "train_runtime": 47154.4596, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.5175757575757576, + "grad_norm": 0.004262713715434074, + "learning_rate": 7.337587951710931e-05, + "loss": 0.010824104771018028, + "num_input_tokens_seen": 95046304, + "step": 5804, + "train_runtime": 47162.5767, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.518181818181818, + "grad_norm": 0.002482967684045434, + "learning_rate": 7.336737857394288e-05, + "loss": 0.011907706037163734, + "num_input_tokens_seen": 95062680, + "step": 5805, + "train_runtime": 47170.6906, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.5187878787878786, + "grad_norm": 0.006973763927817345, + "learning_rate": 7.335887676646263e-05, + "loss": 0.012270115315914154, + "num_input_tokens_seen": 95079056, + "step": 5806, + "train_runtime": 47178.8057, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.5193939393939395, + "grad_norm": 0.004857184831053019, + "learning_rate": 7.3350374094983e-05, + "loss": 0.011801487766206264, + "num_input_tokens_seen": 95095432, + "step": 5807, + "train_runtime": 47186.9187, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.52, + "grad_norm": 0.004443611484020948, + "learning_rate": 7.334187055981852e-05, + "loss": 0.011630197986960411, + "num_input_tokens_seen": 95111808, + "step": 5808, + "train_runtime": 47195.0327, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.5206060606060605, + "grad_norm": 0.004154758527874947, + "learning_rate": 7.333336616128369e-05, + "loss": 0.011535628698766232, + "num_input_tokens_seen": 95128184, + "step": 5809, + "train_runtime": 47203.1474, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.5212121212121215, + "grad_norm": 0.011085142381489277, + "learning_rate": 7.33248608996931e-05, + "loss": 0.013205901719629765, + "num_input_tokens_seen": 95144560, + "step": 5810, + "train_runtime": 47211.2594, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.521818181818182, + "grad_norm": 0.003917771857231855, + "learning_rate": 7.331635477536131e-05, + "loss": 0.011967899277806282, + "num_input_tokens_seen": 95160936, + "step": 5811, + "train_runtime": 47219.3741, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.5224242424242425, + "grad_norm": 0.009839467704296112, + "learning_rate": 7.330784778860297e-05, + "loss": 0.012451402842998505, + "num_input_tokens_seen": 95177312, + "step": 5812, + "train_runtime": 47227.4894, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.523030303030303, + "grad_norm": 0.0082631204277277, + "learning_rate": 7.329933993973275e-05, + "loss": 0.012474270537495613, + "num_input_tokens_seen": 95193688, + "step": 5813, + "train_runtime": 47235.6041, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.5236363636363635, + "grad_norm": 0.009430996142327785, + "learning_rate": 7.32908312290653e-05, + "loss": 0.011832127347588539, + "num_input_tokens_seen": 95210064, + "step": 5814, + "train_runtime": 47243.7175, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.5242424242424244, + "grad_norm": 0.0054839313961565495, + "learning_rate": 7.328232165691537e-05, + "loss": 0.012708758004009724, + "num_input_tokens_seen": 95226440, + "step": 5815, + "train_runtime": 47251.8367, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.524848484848485, + "grad_norm": 0.006591468118131161, + "learning_rate": 7.32738112235977e-05, + "loss": 0.013282454572618008, + "num_input_tokens_seen": 95242816, + "step": 5816, + "train_runtime": 47259.9529, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.5254545454545454, + "grad_norm": 0.005360343027859926, + "learning_rate": 7.326529992942706e-05, + "loss": 0.011001105420291424, + "num_input_tokens_seen": 95259192, + "step": 5817, + "train_runtime": 47268.0673, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.526060606060606, + "grad_norm": 0.007344765122979879, + "learning_rate": 7.32567877747183e-05, + "loss": 0.011359146796166897, + "num_input_tokens_seen": 95275568, + "step": 5818, + "train_runtime": 47276.1776, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.5266666666666664, + "grad_norm": 0.0058997273445129395, + "learning_rate": 7.324827475978625e-05, + "loss": 0.012606981210410595, + "num_input_tokens_seen": 95291944, + "step": 5819, + "train_runtime": 47284.2919, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.5272727272727273, + "grad_norm": 0.009089338593184948, + "learning_rate": 7.32397608849458e-05, + "loss": 0.012643213383853436, + "num_input_tokens_seen": 95308320, + "step": 5820, + "train_runtime": 47292.4105, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.527878787878788, + "grad_norm": 0.006619580090045929, + "learning_rate": 7.323124615051183e-05, + "loss": 0.012998654507100582, + "num_input_tokens_seen": 95324696, + "step": 5821, + "train_runtime": 47300.5319, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.5284848484848483, + "grad_norm": 0.00978800654411316, + "learning_rate": 7.322273055679931e-05, + "loss": 0.012150097638368607, + "num_input_tokens_seen": 95341072, + "step": 5822, + "train_runtime": 47308.6465, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.5290909090909093, + "grad_norm": 0.005609934683889151, + "learning_rate": 7.321421410412322e-05, + "loss": 0.011422100476920605, + "num_input_tokens_seen": 95357448, + "step": 5823, + "train_runtime": 47316.7642, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.5296969696969698, + "grad_norm": 0.009185352362692356, + "learning_rate": 7.320569679279856e-05, + "loss": 0.012563257478177547, + "num_input_tokens_seen": 95373824, + "step": 5824, + "train_runtime": 47324.8812, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.5303030303030303, + "grad_norm": 0.005365482997149229, + "learning_rate": 7.319717862314035e-05, + "loss": 0.012712497264146805, + "num_input_tokens_seen": 95390200, + "step": 5825, + "train_runtime": 47332.9996, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.5309090909090908, + "grad_norm": 0.008052932098507881, + "learning_rate": 7.318865959546369e-05, + "loss": 0.011759700253605843, + "num_input_tokens_seen": 95406576, + "step": 5826, + "train_runtime": 47341.1191, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.5315151515151513, + "grad_norm": 0.014464687556028366, + "learning_rate": 7.318013971008367e-05, + "loss": 0.012470792979001999, + "num_input_tokens_seen": 95422952, + "step": 5827, + "train_runtime": 47349.233, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.532121212121212, + "grad_norm": 0.006349163595587015, + "learning_rate": 7.317161896731543e-05, + "loss": 0.012775963172316551, + "num_input_tokens_seen": 95439328, + "step": 5828, + "train_runtime": 47357.3459, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.5327272727272727, + "grad_norm": 0.008344019763171673, + "learning_rate": 7.316309736747413e-05, + "loss": 0.012420687824487686, + "num_input_tokens_seen": 95455704, + "step": 5829, + "train_runtime": 47365.4602, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 0.0070110224187374115, + "learning_rate": 7.315457491087494e-05, + "loss": 0.013306763954460621, + "num_input_tokens_seen": 95472080, + "step": 5830, + "train_runtime": 47373.5751, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.533939393939394, + "grad_norm": 0.011985666118562222, + "learning_rate": 7.314605159783314e-05, + "loss": 0.01180567592382431, + "num_input_tokens_seen": 95488456, + "step": 5831, + "train_runtime": 47381.6917, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.5345454545454547, + "grad_norm": 0.005882276687771082, + "learning_rate": 7.313752742866395e-05, + "loss": 0.012185905128717422, + "num_input_tokens_seen": 95504832, + "step": 5832, + "train_runtime": 47389.8073, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.535151515151515, + "grad_norm": 0.00307760969735682, + "learning_rate": 7.312900240368269e-05, + "loss": 0.012165137566626072, + "num_input_tokens_seen": 95521208, + "step": 5833, + "train_runtime": 47397.9317, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.5357575757575757, + "grad_norm": 0.0073172119446098804, + "learning_rate": 7.312047652320466e-05, + "loss": 0.012314935214817524, + "num_input_tokens_seen": 95537584, + "step": 5834, + "train_runtime": 47406.05, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.536363636363636, + "grad_norm": 0.004917105659842491, + "learning_rate": 7.311194978754526e-05, + "loss": 0.011149460449814796, + "num_input_tokens_seen": 95553960, + "step": 5835, + "train_runtime": 47414.1652, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.536969696969697, + "grad_norm": 0.008531511761248112, + "learning_rate": 7.310342219701981e-05, + "loss": 0.011887168511748314, + "num_input_tokens_seen": 95570336, + "step": 5836, + "train_runtime": 47422.2809, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.5375757575757576, + "grad_norm": 0.008818835951387882, + "learning_rate": 7.309489375194378e-05, + "loss": 0.012990172952413559, + "num_input_tokens_seen": 95586712, + "step": 5837, + "train_runtime": 47430.3997, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.538181818181818, + "grad_norm": 0.005654378794133663, + "learning_rate": 7.308636445263261e-05, + "loss": 0.011832312680780888, + "num_input_tokens_seen": 95603088, + "step": 5838, + "train_runtime": 47438.5173, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.538787878787879, + "grad_norm": 0.008869743905961514, + "learning_rate": 7.307783429940176e-05, + "loss": 0.0126182921230793, + "num_input_tokens_seen": 95619464, + "step": 5839, + "train_runtime": 47446.6349, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.5393939393939395, + "grad_norm": 0.008088440634310246, + "learning_rate": 7.306930329256678e-05, + "loss": 0.012163275852799416, + "num_input_tokens_seen": 95635840, + "step": 5840, + "train_runtime": 47454.7493, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.54, + "grad_norm": 0.003895081114023924, + "learning_rate": 7.30607714324432e-05, + "loss": 0.011562881991267204, + "num_input_tokens_seen": 95652216, + "step": 5841, + "train_runtime": 47462.8707, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.5406060606060605, + "grad_norm": 0.006944611202925444, + "learning_rate": 7.305223871934657e-05, + "loss": 0.012268252670764923, + "num_input_tokens_seen": 95668592, + "step": 5842, + "train_runtime": 47470.988, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.541212121212121, + "grad_norm": 0.007934746332466602, + "learning_rate": 7.304370515359254e-05, + "loss": 0.013790624216198921, + "num_input_tokens_seen": 95684968, + "step": 5843, + "train_runtime": 47479.1087, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.541818181818182, + "grad_norm": 0.002435459056869149, + "learning_rate": 7.303517073549671e-05, + "loss": 0.01117650419473648, + "num_input_tokens_seen": 95701344, + "step": 5844, + "train_runtime": 47487.2227, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.5424242424242425, + "grad_norm": 0.010012850165367126, + "learning_rate": 7.302663546537481e-05, + "loss": 0.012758877128362656, + "num_input_tokens_seen": 95717720, + "step": 5845, + "train_runtime": 47495.3443, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.543030303030303, + "grad_norm": 0.011153246276080608, + "learning_rate": 7.301809934354248e-05, + "loss": 0.010956776328384876, + "num_input_tokens_seen": 95734096, + "step": 5846, + "train_runtime": 47503.4573, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.5436363636363635, + "grad_norm": 0.006225211080163717, + "learning_rate": 7.300956237031548e-05, + "loss": 0.011708579026162624, + "num_input_tokens_seen": 95750472, + "step": 5847, + "train_runtime": 47511.5722, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.544242424242424, + "grad_norm": 0.0034785049501806498, + "learning_rate": 7.30010245460096e-05, + "loss": 0.01110053900629282, + "num_input_tokens_seen": 95766848, + "step": 5848, + "train_runtime": 47519.6856, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.544848484848485, + "grad_norm": 0.007381196599453688, + "learning_rate": 7.299248587094059e-05, + "loss": 0.012920059263706207, + "num_input_tokens_seen": 95783224, + "step": 5849, + "train_runtime": 47527.8027, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.5454545454545454, + "grad_norm": 0.0066797444596886635, + "learning_rate": 7.298394634542431e-05, + "loss": 0.01271806750446558, + "num_input_tokens_seen": 95799600, + "step": 5850, + "train_runtime": 47535.9156, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.546060606060606, + "grad_norm": 0.010639653541147709, + "learning_rate": 7.297540596977662e-05, + "loss": 0.011682888492941856, + "num_input_tokens_seen": 95815976, + "step": 5851, + "train_runtime": 47544.0352, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.546666666666667, + "grad_norm": 0.0049572717398405075, + "learning_rate": 7.29668647443134e-05, + "loss": 0.012953163124620914, + "num_input_tokens_seen": 95832352, + "step": 5852, + "train_runtime": 47552.1517, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.5472727272727274, + "grad_norm": 0.007160555105656385, + "learning_rate": 7.295832266935059e-05, + "loss": 0.012354250065982342, + "num_input_tokens_seen": 95848728, + "step": 5853, + "train_runtime": 47560.2646, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.547878787878788, + "grad_norm": 0.005190347321331501, + "learning_rate": 7.294977974520411e-05, + "loss": 0.011620163917541504, + "num_input_tokens_seen": 95865104, + "step": 5854, + "train_runtime": 47568.3774, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.5484848484848484, + "grad_norm": 0.0055112955160439014, + "learning_rate": 7.294123597219001e-05, + "loss": 0.011973883956670761, + "num_input_tokens_seen": 95881480, + "step": 5855, + "train_runtime": 47576.4938, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.549090909090909, + "grad_norm": 0.004323835484683514, + "learning_rate": 7.293269135062424e-05, + "loss": 0.01267464179545641, + "num_input_tokens_seen": 95897856, + "step": 5856, + "train_runtime": 47584.6094, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.54969696969697, + "grad_norm": 0.010749808512628078, + "learning_rate": 7.292414588082287e-05, + "loss": 0.011362884193658829, + "num_input_tokens_seen": 95914232, + "step": 5857, + "train_runtime": 47592.7319, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.5503030303030303, + "grad_norm": 0.0074259378015995026, + "learning_rate": 7.2915599563102e-05, + "loss": 0.012775878421962261, + "num_input_tokens_seen": 95930608, + "step": 5858, + "train_runtime": 47600.8431, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.550909090909091, + "grad_norm": 0.00586421275511384, + "learning_rate": 7.290705239777772e-05, + "loss": 0.012351606041193008, + "num_input_tokens_seen": 95946984, + "step": 5859, + "train_runtime": 47608.9603, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.5515151515151517, + "grad_norm": 0.0021972949616611004, + "learning_rate": 7.28985043851662e-05, + "loss": 0.010230764746665955, + "num_input_tokens_seen": 95963360, + "step": 5860, + "train_runtime": 47617.0786, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.5521212121212122, + "grad_norm": 0.011982802301645279, + "learning_rate": 7.288995552558357e-05, + "loss": 0.01211756095290184, + "num_input_tokens_seen": 95979736, + "step": 5861, + "train_runtime": 47625.1907, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.5527272727272727, + "grad_norm": 0.007210193667560816, + "learning_rate": 7.288140581934607e-05, + "loss": 0.01247687079012394, + "num_input_tokens_seen": 95996112, + "step": 5862, + "train_runtime": 47633.3026, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.5533333333333332, + "grad_norm": 0.014237412251532078, + "learning_rate": 7.287285526676994e-05, + "loss": 0.013054870069026947, + "num_input_tokens_seen": 96012488, + "step": 5863, + "train_runtime": 47641.4576, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.5539393939393937, + "grad_norm": 0.022380806505680084, + "learning_rate": 7.286430386817143e-05, + "loss": 0.011601369827985764, + "num_input_tokens_seen": 96028864, + "step": 5864, + "train_runtime": 47649.5757, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.5545454545454547, + "grad_norm": 0.008671924471855164, + "learning_rate": 7.285575162386682e-05, + "loss": 0.01161205768585205, + "num_input_tokens_seen": 96045240, + "step": 5865, + "train_runtime": 47657.6928, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.555151515151515, + "grad_norm": 0.010434572584927082, + "learning_rate": 7.28471985341725e-05, + "loss": 0.012368510477244854, + "num_input_tokens_seen": 96061616, + "step": 5866, + "train_runtime": 47665.806, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.5557575757575757, + "grad_norm": 0.008612114936113358, + "learning_rate": 7.283864459940479e-05, + "loss": 0.011871944181621075, + "num_input_tokens_seen": 96077992, + "step": 5867, + "train_runtime": 47673.9314, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.5563636363636366, + "grad_norm": 0.008478858508169651, + "learning_rate": 7.283008981988008e-05, + "loss": 0.012215369381010532, + "num_input_tokens_seen": 96094368, + "step": 5868, + "train_runtime": 47682.0518, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.556969696969697, + "grad_norm": 0.006399418227374554, + "learning_rate": 7.282153419591482e-05, + "loss": 0.012022614479064941, + "num_input_tokens_seen": 96110744, + "step": 5869, + "train_runtime": 47690.1685, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.5575757575757576, + "grad_norm": 0.004086225759238005, + "learning_rate": 7.281297772782547e-05, + "loss": 0.012686099857091904, + "num_input_tokens_seen": 96127120, + "step": 5870, + "train_runtime": 47698.2876, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.558181818181818, + "grad_norm": 0.006345798727124929, + "learning_rate": 7.280442041592846e-05, + "loss": 0.012352866120636463, + "num_input_tokens_seen": 96143496, + "step": 5871, + "train_runtime": 47706.4015, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.5587878787878786, + "grad_norm": 0.0025422079488635063, + "learning_rate": 7.279586226054038e-05, + "loss": 0.010226168669760227, + "num_input_tokens_seen": 96159872, + "step": 5872, + "train_runtime": 47714.52, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.5593939393939396, + "grad_norm": 0.00414996687322855, + "learning_rate": 7.278730326197774e-05, + "loss": 0.011924789287149906, + "num_input_tokens_seen": 96176248, + "step": 5873, + "train_runtime": 47722.64, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.56, + "grad_norm": 0.010338885709643364, + "learning_rate": 7.277874342055713e-05, + "loss": 0.0118539584800601, + "num_input_tokens_seen": 96192624, + "step": 5874, + "train_runtime": 47730.7594, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.5606060606060606, + "grad_norm": 0.007873056456446648, + "learning_rate": 7.277018273659517e-05, + "loss": 0.012136629782617092, + "num_input_tokens_seen": 96209000, + "step": 5875, + "train_runtime": 47738.8747, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.561212121212121, + "grad_norm": 0.005036206915974617, + "learning_rate": 7.276162121040846e-05, + "loss": 0.011856719851493835, + "num_input_tokens_seen": 96225376, + "step": 5876, + "train_runtime": 47746.9909, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.5618181818181816, + "grad_norm": 0.007103745359927416, + "learning_rate": 7.275305884231374e-05, + "loss": 0.012372402474284172, + "num_input_tokens_seen": 96241752, + "step": 5877, + "train_runtime": 47755.102, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.5624242424242425, + "grad_norm": 0.005591919645667076, + "learning_rate": 7.274449563262767e-05, + "loss": 0.011951525695621967, + "num_input_tokens_seen": 96258128, + "step": 5878, + "train_runtime": 47763.215, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.563030303030303, + "grad_norm": 0.010070479474961758, + "learning_rate": 7.273593158166699e-05, + "loss": 0.012515030801296234, + "num_input_tokens_seen": 96274504, + "step": 5879, + "train_runtime": 47771.3339, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.5636363636363635, + "grad_norm": 0.00037109048571437597, + "learning_rate": 7.27273666897485e-05, + "loss": 0.011346704326570034, + "num_input_tokens_seen": 96290880, + "step": 5880, + "train_runtime": 47779.4529, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.5642424242424244, + "grad_norm": 0.0058873919770121574, + "learning_rate": 7.271880095718895e-05, + "loss": 0.011149340309202671, + "num_input_tokens_seen": 96307256, + "step": 5881, + "train_runtime": 47787.5686, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.564848484848485, + "grad_norm": 0.006068386137485504, + "learning_rate": 7.271023438430522e-05, + "loss": 0.01098193135112524, + "num_input_tokens_seen": 96323632, + "step": 5882, + "train_runtime": 47795.6844, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.5654545454545454, + "grad_norm": 0.009908095002174377, + "learning_rate": 7.270166697141414e-05, + "loss": 0.011690868996083736, + "num_input_tokens_seen": 96340008, + "step": 5883, + "train_runtime": 47803.8003, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.566060606060606, + "grad_norm": 0.00926489382982254, + "learning_rate": 7.269309871883259e-05, + "loss": 0.01357988454401493, + "num_input_tokens_seen": 96356384, + "step": 5884, + "train_runtime": 47811.9175, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.5666666666666664, + "grad_norm": 0.006965457461774349, + "learning_rate": 7.268452962687754e-05, + "loss": 0.010554589331150055, + "num_input_tokens_seen": 96372760, + "step": 5885, + "train_runtime": 47820.0373, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.5672727272727274, + "grad_norm": 0.012735276482999325, + "learning_rate": 7.267595969586589e-05, + "loss": 0.013246860355138779, + "num_input_tokens_seen": 96389136, + "step": 5886, + "train_runtime": 47828.1541, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.567878787878788, + "grad_norm": 0.006725442595779896, + "learning_rate": 7.266738892611466e-05, + "loss": 0.01173440646380186, + "num_input_tokens_seen": 96405512, + "step": 5887, + "train_runtime": 47836.275, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.5684848484848484, + "grad_norm": 0.007369717117398977, + "learning_rate": 7.265881731794085e-05, + "loss": 0.011743596754968166, + "num_input_tokens_seen": 96421888, + "step": 5888, + "train_runtime": 47844.3886, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.5690909090909093, + "grad_norm": 0.015418984927237034, + "learning_rate": 7.265024487166153e-05, + "loss": 0.01261256355792284, + "num_input_tokens_seen": 96438264, + "step": 5889, + "train_runtime": 47852.5089, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.56969696969697, + "grad_norm": 0.005158932879567146, + "learning_rate": 7.264167158759374e-05, + "loss": 0.013125834986567497, + "num_input_tokens_seen": 96454640, + "step": 5890, + "train_runtime": 47860.6323, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.5703030303030303, + "grad_norm": 0.006621691398322582, + "learning_rate": 7.263309746605463e-05, + "loss": 0.011445289477705956, + "num_input_tokens_seen": 96471016, + "step": 5891, + "train_runtime": 47868.747, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.570909090909091, + "grad_norm": 0.006739482283592224, + "learning_rate": 7.26245225073613e-05, + "loss": 0.011213039048016071, + "num_input_tokens_seen": 96487392, + "step": 5892, + "train_runtime": 47876.8592, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.5715151515151513, + "grad_norm": 0.0055887773633003235, + "learning_rate": 7.261594671183097e-05, + "loss": 0.011708712205290794, + "num_input_tokens_seen": 96503768, + "step": 5893, + "train_runtime": 47884.9742, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.5721212121212123, + "grad_norm": 0.0071156942285597324, + "learning_rate": 7.260737007978078e-05, + "loss": 0.011691002175211906, + "num_input_tokens_seen": 96520144, + "step": 5894, + "train_runtime": 47893.0885, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.5727272727272728, + "grad_norm": 0.01133033074438572, + "learning_rate": 7.259879261152802e-05, + "loss": 0.012508073821663857, + "num_input_tokens_seen": 96536520, + "step": 5895, + "train_runtime": 47901.2026, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.5733333333333333, + "grad_norm": 0.00948979239910841, + "learning_rate": 7.259021430738993e-05, + "loss": 0.012429313734173775, + "num_input_tokens_seen": 96552896, + "step": 5896, + "train_runtime": 47909.3174, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.573939393939394, + "grad_norm": 0.010181117802858353, + "learning_rate": 7.25816351676838e-05, + "loss": 0.012198460288345814, + "num_input_tokens_seen": 96569272, + "step": 5897, + "train_runtime": 47917.4361, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.5745454545454547, + "grad_norm": 0.009311644360423088, + "learning_rate": 7.257305519272697e-05, + "loss": 0.012919502332806587, + "num_input_tokens_seen": 96585648, + "step": 5898, + "train_runtime": 47925.5493, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.575151515151515, + "grad_norm": 0.006472766399383545, + "learning_rate": 7.256447438283677e-05, + "loss": 0.011392352171242237, + "num_input_tokens_seen": 96602024, + "step": 5899, + "train_runtime": 47933.6641, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.5757575757575757, + "grad_norm": 0.00794739555567503, + "learning_rate": 7.25558927383306e-05, + "loss": 0.011913099326193333, + "num_input_tokens_seen": 96618400, + "step": 5900, + "train_runtime": 47941.7813, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.576363636363636, + "grad_norm": 0.009121609851717949, + "learning_rate": 7.254731025952591e-05, + "loss": 0.012828334234654903, + "num_input_tokens_seen": 96634776, + "step": 5901, + "train_runtime": 47950.8868, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.576969696969697, + "grad_norm": 0.004465658217668533, + "learning_rate": 7.253872694674011e-05, + "loss": 0.012094475328922272, + "num_input_tokens_seen": 96651152, + "step": 5902, + "train_runtime": 47958.9997, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.5775757575757576, + "grad_norm": 0.010618271306157112, + "learning_rate": 7.253014280029069e-05, + "loss": 0.011291446164250374, + "num_input_tokens_seen": 96667528, + "step": 5903, + "train_runtime": 47967.1113, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.578181818181818, + "grad_norm": 0.0044867792166769505, + "learning_rate": 7.252155782049516e-05, + "loss": 0.012184605933725834, + "num_input_tokens_seen": 96683904, + "step": 5904, + "train_runtime": 47975.2324, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.5787878787878786, + "grad_norm": 0.008549047634005547, + "learning_rate": 7.251297200767107e-05, + "loss": 0.013134010136127472, + "num_input_tokens_seen": 96700280, + "step": 5905, + "train_runtime": 47983.3492, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.579393939393939, + "grad_norm": 0.003035058965906501, + "learning_rate": 7.2504385362136e-05, + "loss": 0.011084550060331821, + "num_input_tokens_seen": 96716656, + "step": 5906, + "train_runtime": 47991.4655, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.58, + "grad_norm": 0.00707911467179656, + "learning_rate": 7.249579788420752e-05, + "loss": 0.011327555403113365, + "num_input_tokens_seen": 96733032, + "step": 5907, + "train_runtime": 47999.5859, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.5806060606060606, + "grad_norm": 0.00750761991366744, + "learning_rate": 7.24872095742033e-05, + "loss": 0.012483515776693821, + "num_input_tokens_seen": 96749408, + "step": 5908, + "train_runtime": 48007.7024, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.581212121212121, + "grad_norm": 0.005679203197360039, + "learning_rate": 7.247862043244098e-05, + "loss": 0.011277184821665287, + "num_input_tokens_seen": 96765784, + "step": 5909, + "train_runtime": 48015.8164, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.581818181818182, + "grad_norm": 0.007926653139293194, + "learning_rate": 7.247003045923827e-05, + "loss": 0.012194707058370113, + "num_input_tokens_seen": 96782160, + "step": 5910, + "train_runtime": 48023.9316, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.5824242424242425, + "grad_norm": 0.010826398618519306, + "learning_rate": 7.246143965491288e-05, + "loss": 0.011727727949619293, + "num_input_tokens_seen": 96798536, + "step": 5911, + "train_runtime": 48032.0469, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.583030303030303, + "grad_norm": 0.004161892458796501, + "learning_rate": 7.245284801978259e-05, + "loss": 0.012194670736789703, + "num_input_tokens_seen": 96814912, + "step": 5912, + "train_runtime": 48040.1617, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.5836363636363635, + "grad_norm": 0.005710763391107321, + "learning_rate": 7.244425555416518e-05, + "loss": 0.011207321658730507, + "num_input_tokens_seen": 96831288, + "step": 5913, + "train_runtime": 48048.2715, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.584242424242424, + "grad_norm": 0.008601192384958267, + "learning_rate": 7.243566225837846e-05, + "loss": 0.011591836810112, + "num_input_tokens_seen": 96847664, + "step": 5914, + "train_runtime": 48056.3799, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.584848484848485, + "grad_norm": 0.008475406095385551, + "learning_rate": 7.242706813274027e-05, + "loss": 0.011459856294095516, + "num_input_tokens_seen": 96864040, + "step": 5915, + "train_runtime": 48064.4963, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.5854545454545454, + "grad_norm": 0.0059858947061002254, + "learning_rate": 7.241847317756854e-05, + "loss": 0.012064672075212002, + "num_input_tokens_seen": 96880416, + "step": 5916, + "train_runtime": 48072.6099, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.586060606060606, + "grad_norm": 0.0073598697781562805, + "learning_rate": 7.24098773931811e-05, + "loss": 0.01285725086927414, + "num_input_tokens_seen": 96896792, + "step": 5917, + "train_runtime": 48080.7315, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.586666666666667, + "grad_norm": 0.010541504248976707, + "learning_rate": 7.240128077989598e-05, + "loss": 0.012244059704244137, + "num_input_tokens_seen": 96913168, + "step": 5918, + "train_runtime": 48088.8472, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.5872727272727274, + "grad_norm": 0.008749601431190968, + "learning_rate": 7.239268333803109e-05, + "loss": 0.012211959809064865, + "num_input_tokens_seen": 96929544, + "step": 5919, + "train_runtime": 48096.9604, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.587878787878788, + "grad_norm": 0.004194013774394989, + "learning_rate": 7.238408506790444e-05, + "loss": 0.012766794301569462, + "num_input_tokens_seen": 96945920, + "step": 5920, + "train_runtime": 48105.0708, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.5884848484848484, + "grad_norm": 0.0032958786468952894, + "learning_rate": 7.237548596983409e-05, + "loss": 0.011120064184069633, + "num_input_tokens_seen": 96962296, + "step": 5921, + "train_runtime": 48113.1855, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.589090909090909, + "grad_norm": 0.0033376135397702456, + "learning_rate": 7.236688604413809e-05, + "loss": 0.01202797144651413, + "num_input_tokens_seen": 96978672, + "step": 5922, + "train_runtime": 48121.2989, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.58969696969697, + "grad_norm": 0.008107118308544159, + "learning_rate": 7.235828529113453e-05, + "loss": 0.0110987089574337, + "num_input_tokens_seen": 96995048, + "step": 5923, + "train_runtime": 48129.4122, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.5903030303030303, + "grad_norm": 0.003823991399258375, + "learning_rate": 7.234968371114153e-05, + "loss": 0.011178378947079182, + "num_input_tokens_seen": 97011424, + "step": 5924, + "train_runtime": 48137.5309, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.590909090909091, + "grad_norm": 0.006605386734008789, + "learning_rate": 7.234108130447725e-05, + "loss": 0.012045325711369514, + "num_input_tokens_seen": 97027800, + "step": 5925, + "train_runtime": 48145.6472, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.5915151515151518, + "grad_norm": 0.009213857352733612, + "learning_rate": 7.233247807145989e-05, + "loss": 0.012750063091516495, + "num_input_tokens_seen": 97044176, + "step": 5926, + "train_runtime": 48153.7648, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.5921212121212123, + "grad_norm": 0.010158281773328781, + "learning_rate": 7.232387401240765e-05, + "loss": 0.012540701776742935, + "num_input_tokens_seen": 97060552, + "step": 5927, + "train_runtime": 48161.8805, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.5927272727272728, + "grad_norm": 0.005980766378343105, + "learning_rate": 7.231526912763878e-05, + "loss": 0.012314237654209137, + "num_input_tokens_seen": 97076928, + "step": 5928, + "train_runtime": 48169.9964, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.5933333333333333, + "grad_norm": 0.005870536435395479, + "learning_rate": 7.230666341747157e-05, + "loss": 0.012657422572374344, + "num_input_tokens_seen": 97093304, + "step": 5929, + "train_runtime": 48178.1091, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.5939393939393938, + "grad_norm": 0.011821609921753407, + "learning_rate": 7.229805688222432e-05, + "loss": 0.012953449040651321, + "num_input_tokens_seen": 97109680, + "step": 5930, + "train_runtime": 48186.2222, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.5945454545454547, + "grad_norm": 0.00882563553750515, + "learning_rate": 7.228944952221538e-05, + "loss": 0.011960305273532867, + "num_input_tokens_seen": 97126056, + "step": 5931, + "train_runtime": 48194.3334, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.595151515151515, + "grad_norm": 0.00669354572892189, + "learning_rate": 7.22808413377631e-05, + "loss": 0.011936339549720287, + "num_input_tokens_seen": 97142432, + "step": 5932, + "train_runtime": 48202.4497, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.5957575757575757, + "grad_norm": 0.00657305121421814, + "learning_rate": 7.227223232918588e-05, + "loss": 0.012866399250924587, + "num_input_tokens_seen": 97158808, + "step": 5933, + "train_runtime": 48210.5674, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.596363636363636, + "grad_norm": 0.004907965660095215, + "learning_rate": 7.226362249680216e-05, + "loss": 0.011865566484630108, + "num_input_tokens_seen": 97175184, + "step": 5934, + "train_runtime": 48218.6835, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.5969696969696967, + "grad_norm": 0.01201409101486206, + "learning_rate": 7.225501184093042e-05, + "loss": 0.012895400635898113, + "num_input_tokens_seen": 97191560, + "step": 5935, + "train_runtime": 48226.797, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.5975757575757576, + "grad_norm": 0.007334982976317406, + "learning_rate": 7.224640036188912e-05, + "loss": 0.012086250819265842, + "num_input_tokens_seen": 97207936, + "step": 5936, + "train_runtime": 48234.9109, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.598181818181818, + "grad_norm": 0.004307164344936609, + "learning_rate": 7.22377880599968e-05, + "loss": 0.011558006517589092, + "num_input_tokens_seen": 97224312, + "step": 5937, + "train_runtime": 48243.0321, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.5987878787878786, + "grad_norm": 0.005386605858802795, + "learning_rate": 7.222917493557197e-05, + "loss": 0.011355818249285221, + "num_input_tokens_seen": 97240688, + "step": 5938, + "train_runtime": 48251.1458, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.5993939393939396, + "grad_norm": 0.008641676045954227, + "learning_rate": 7.222056098893328e-05, + "loss": 0.01234667282551527, + "num_input_tokens_seen": 97257064, + "step": 5939, + "train_runtime": 48259.2614, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.6, + "grad_norm": 0.006817467510700226, + "learning_rate": 7.22119462203993e-05, + "loss": 0.012077373452484608, + "num_input_tokens_seen": 97273440, + "step": 5940, + "train_runtime": 48267.3734, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.6006060606060606, + "grad_norm": 0.009236082434654236, + "learning_rate": 7.220333063028872e-05, + "loss": 0.012414231896400452, + "num_input_tokens_seen": 97289816, + "step": 5941, + "train_runtime": 48275.49, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.601212121212121, + "grad_norm": 0.0010832896223291755, + "learning_rate": 7.219471421892015e-05, + "loss": 0.011659754440188408, + "num_input_tokens_seen": 97306192, + "step": 5942, + "train_runtime": 48283.6025, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.6018181818181816, + "grad_norm": 0.004627120215445757, + "learning_rate": 7.218609698661232e-05, + "loss": 0.012482816353440285, + "num_input_tokens_seen": 97322568, + "step": 5943, + "train_runtime": 48291.7141, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.6024242424242425, + "grad_norm": 0.004989259410649538, + "learning_rate": 7.217747893368397e-05, + "loss": 0.012546809390187263, + "num_input_tokens_seen": 97338944, + "step": 5944, + "train_runtime": 48299.8334, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.603030303030303, + "grad_norm": 0.0076448568142950535, + "learning_rate": 7.216886006045386e-05, + "loss": 0.01259324885904789, + "num_input_tokens_seen": 97355320, + "step": 5945, + "train_runtime": 48307.9449, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.6036363636363635, + "grad_norm": 0.0075515941716730595, + "learning_rate": 7.21602403672408e-05, + "loss": 0.01279412116855383, + "num_input_tokens_seen": 97371696, + "step": 5946, + "train_runtime": 48316.0592, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.6042424242424245, + "grad_norm": 0.005580172408372164, + "learning_rate": 7.21516198543636e-05, + "loss": 0.012541270814836025, + "num_input_tokens_seen": 97388072, + "step": 5947, + "train_runtime": 48324.1737, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.604848484848485, + "grad_norm": 0.008175063878297806, + "learning_rate": 7.214299852214113e-05, + "loss": 0.01269839983433485, + "num_input_tokens_seen": 97404448, + "step": 5948, + "train_runtime": 48332.2871, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.6054545454545455, + "grad_norm": 0.008326445706188679, + "learning_rate": 7.213437637089224e-05, + "loss": 0.013253039680421352, + "num_input_tokens_seen": 97420824, + "step": 5949, + "train_runtime": 48340.402, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.606060606060606, + "grad_norm": 0.00773974834010005, + "learning_rate": 7.212575340093591e-05, + "loss": 0.01318864431232214, + "num_input_tokens_seen": 97437200, + "step": 5950, + "train_runtime": 48348.5157, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.6066666666666665, + "grad_norm": 0.007150116376578808, + "learning_rate": 7.2117129612591e-05, + "loss": 0.01147664338350296, + "num_input_tokens_seen": 97453576, + "step": 5951, + "train_runtime": 48356.6338, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.6072727272727274, + "grad_norm": 0.007812732830643654, + "learning_rate": 7.210850500617657e-05, + "loss": 0.013079877011477947, + "num_input_tokens_seen": 97469952, + "step": 5952, + "train_runtime": 48364.7521, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.607878787878788, + "grad_norm": 0.00500169862061739, + "learning_rate": 7.209987958201158e-05, + "loss": 0.012586192227900028, + "num_input_tokens_seen": 97486328, + "step": 5953, + "train_runtime": 48372.8698, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.6084848484848484, + "grad_norm": 0.01068690326064825, + "learning_rate": 7.20912533404151e-05, + "loss": 0.012317171320319176, + "num_input_tokens_seen": 97502704, + "step": 5954, + "train_runtime": 48380.9835, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.6090909090909093, + "grad_norm": 0.007679889444261789, + "learning_rate": 7.208262628170616e-05, + "loss": 0.011284763924777508, + "num_input_tokens_seen": 97519080, + "step": 5955, + "train_runtime": 48389.0978, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.60969696969697, + "grad_norm": 0.004508465062826872, + "learning_rate": 7.20739984062039e-05, + "loss": 0.010057495906949043, + "num_input_tokens_seen": 97535456, + "step": 5956, + "train_runtime": 48397.2154, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.6103030303030303, + "grad_norm": 0.013704543933272362, + "learning_rate": 7.206536971422741e-05, + "loss": 0.012025773525238037, + "num_input_tokens_seen": 97551832, + "step": 5957, + "train_runtime": 48405.3339, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.610909090909091, + "grad_norm": 0.007847911678254604, + "learning_rate": 7.205674020609587e-05, + "loss": 0.01151137612760067, + "num_input_tokens_seen": 97568208, + "step": 5958, + "train_runtime": 48413.4481, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.6115151515151513, + "grad_norm": 0.009745624847710133, + "learning_rate": 7.204810988212846e-05, + "loss": 0.011379195377230644, + "num_input_tokens_seen": 97584584, + "step": 5959, + "train_runtime": 48421.567, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.6121212121212123, + "grad_norm": 0.008306813426315784, + "learning_rate": 7.203947874264441e-05, + "loss": 0.012223472818732262, + "num_input_tokens_seen": 97600960, + "step": 5960, + "train_runtime": 48429.6838, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.612727272727273, + "grad_norm": 0.005358349531888962, + "learning_rate": 7.203084678796295e-05, + "loss": 0.012783629819750786, + "num_input_tokens_seen": 97617336, + "step": 5961, + "train_runtime": 48437.7972, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.6133333333333333, + "grad_norm": 0.004735131748020649, + "learning_rate": 7.202221401840336e-05, + "loss": 0.010903502814471722, + "num_input_tokens_seen": 97633712, + "step": 5962, + "train_runtime": 48445.9123, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.613939393939394, + "grad_norm": 0.004694829694926739, + "learning_rate": 7.201358043428499e-05, + "loss": 0.01226576417684555, + "num_input_tokens_seen": 97650088, + "step": 5963, + "train_runtime": 48454.032, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.6145454545454543, + "grad_norm": 0.003633148269727826, + "learning_rate": 7.200494603592714e-05, + "loss": 0.012599844485521317, + "num_input_tokens_seen": 97666464, + "step": 5964, + "train_runtime": 48462.1457, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.6151515151515152, + "grad_norm": 0.008198810741305351, + "learning_rate": 7.199631082364917e-05, + "loss": 0.01147422008216381, + "num_input_tokens_seen": 97682840, + "step": 5965, + "train_runtime": 48470.2626, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.6157575757575757, + "grad_norm": 0.011272166855633259, + "learning_rate": 7.19876747977705e-05, + "loss": 0.011915856972336769, + "num_input_tokens_seen": 97699216, + "step": 5966, + "train_runtime": 48478.379, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.6163636363636362, + "grad_norm": 0.009361744858324528, + "learning_rate": 7.197903795861059e-05, + "loss": 0.012170983478426933, + "num_input_tokens_seen": 97715592, + "step": 5967, + "train_runtime": 48486.4953, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.616969696969697, + "grad_norm": 0.003466364461928606, + "learning_rate": 7.197040030648885e-05, + "loss": 0.011990480124950409, + "num_input_tokens_seen": 97731968, + "step": 5968, + "train_runtime": 48494.6154, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.6175757575757577, + "grad_norm": 0.00693540507927537, + "learning_rate": 7.196176184172478e-05, + "loss": 0.012302807532250881, + "num_input_tokens_seen": 97748344, + "step": 5969, + "train_runtime": 48502.7334, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.618181818181818, + "grad_norm": 0.006436810363084078, + "learning_rate": 7.19531225646379e-05, + "loss": 0.011995306238532066, + "num_input_tokens_seen": 97764720, + "step": 5970, + "train_runtime": 48510.8563, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.6187878787878787, + "grad_norm": 0.006393983494490385, + "learning_rate": 7.19444824755478e-05, + "loss": 0.011661283671855927, + "num_input_tokens_seen": 97781096, + "step": 5971, + "train_runtime": 48518.9747, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.619393939393939, + "grad_norm": 0.006695556920021772, + "learning_rate": 7.1935841574774e-05, + "loss": 0.011117118410766125, + "num_input_tokens_seen": 97797472, + "step": 5972, + "train_runtime": 48527.0901, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.62, + "grad_norm": 0.0050604804418981075, + "learning_rate": 7.192719986263616e-05, + "loss": 0.012257568538188934, + "num_input_tokens_seen": 97813848, + "step": 5973, + "train_runtime": 48535.2056, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.6206060606060606, + "grad_norm": 0.008141054771840572, + "learning_rate": 7.191855733945387e-05, + "loss": 0.013042940758168697, + "num_input_tokens_seen": 97830224, + "step": 5974, + "train_runtime": 48543.3328, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.621212121212121, + "grad_norm": 0.007235284894704819, + "learning_rate": 7.190991400554686e-05, + "loss": 0.01172514446079731, + "num_input_tokens_seen": 97846600, + "step": 5975, + "train_runtime": 48551.4536, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.621818181818182, + "grad_norm": 0.012224501930177212, + "learning_rate": 7.190126986123476e-05, + "loss": 0.012536799535155296, + "num_input_tokens_seen": 97862976, + "step": 5976, + "train_runtime": 48559.5689, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.6224242424242425, + "grad_norm": 0.008914670906960964, + "learning_rate": 7.189262490683737e-05, + "loss": 0.01172355655580759, + "num_input_tokens_seen": 97879352, + "step": 5977, + "train_runtime": 48567.6839, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.623030303030303, + "grad_norm": 0.008433090522885323, + "learning_rate": 7.188397914267441e-05, + "loss": 0.010266945697367191, + "num_input_tokens_seen": 97895728, + "step": 5978, + "train_runtime": 48575.7977, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.6236363636363635, + "grad_norm": 0.007765430957078934, + "learning_rate": 7.18753325690657e-05, + "loss": 0.01251099444925785, + "num_input_tokens_seen": 97912104, + "step": 5979, + "train_runtime": 48583.9148, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.624242424242424, + "grad_norm": 0.007677598390728235, + "learning_rate": 7.186668518633099e-05, + "loss": 0.012416304089128971, + "num_input_tokens_seen": 97928480, + "step": 5980, + "train_runtime": 48592.0322, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.624848484848485, + "grad_norm": 0.009832225739955902, + "learning_rate": 7.185803699479022e-05, + "loss": 0.01244290266185999, + "num_input_tokens_seen": 97944856, + "step": 5981, + "train_runtime": 48600.1466, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.6254545454545455, + "grad_norm": 0.006494112778455019, + "learning_rate": 7.184938799476321e-05, + "loss": 0.010885640047490597, + "num_input_tokens_seen": 97961232, + "step": 5982, + "train_runtime": 48608.259, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.626060606060606, + "grad_norm": 0.008364114910364151, + "learning_rate": 7.18407381865699e-05, + "loss": 0.01314442977309227, + "num_input_tokens_seen": 97977608, + "step": 5983, + "train_runtime": 48616.3723, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.626666666666667, + "grad_norm": 0.006673536263406277, + "learning_rate": 7.18320875705302e-05, + "loss": 0.012444613501429558, + "num_input_tokens_seen": 97993984, + "step": 5984, + "train_runtime": 48624.487, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.6272727272727274, + "grad_norm": 0.00534111587330699, + "learning_rate": 7.182343614696412e-05, + "loss": 0.0119509007781744, + "num_input_tokens_seen": 98010360, + "step": 5985, + "train_runtime": 48632.6073, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.627878787878788, + "grad_norm": 0.008285642601549625, + "learning_rate": 7.181478391619162e-05, + "loss": 0.011928752064704895, + "num_input_tokens_seen": 98026736, + "step": 5986, + "train_runtime": 48640.7315, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.6284848484848484, + "grad_norm": 0.007268223911523819, + "learning_rate": 7.180613087853275e-05, + "loss": 0.012210365384817123, + "num_input_tokens_seen": 98043112, + "step": 5987, + "train_runtime": 48648.8486, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.629090909090909, + "grad_norm": 0.008645938709378242, + "learning_rate": 7.179747703430757e-05, + "loss": 0.013656264171004295, + "num_input_tokens_seen": 98059488, + "step": 5988, + "train_runtime": 48656.965, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.62969696969697, + "grad_norm": 0.0030822623521089554, + "learning_rate": 7.178882238383614e-05, + "loss": 0.011116075329482555, + "num_input_tokens_seen": 98075864, + "step": 5989, + "train_runtime": 48665.0794, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.6303030303030304, + "grad_norm": 0.008546039462089539, + "learning_rate": 7.178016692743862e-05, + "loss": 0.012996817007660866, + "num_input_tokens_seen": 98092240, + "step": 5990, + "train_runtime": 48673.1888, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.630909090909091, + "grad_norm": 0.009508387185633183, + "learning_rate": 7.177151066543515e-05, + "loss": 0.013646122068166733, + "num_input_tokens_seen": 98108616, + "step": 5991, + "train_runtime": 48681.3004, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.6315151515151514, + "grad_norm": 0.010440315119922161, + "learning_rate": 7.176285359814588e-05, + "loss": 0.012341325171291828, + "num_input_tokens_seen": 98124992, + "step": 5992, + "train_runtime": 48689.4109, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.632121212121212, + "grad_norm": 0.003384822513908148, + "learning_rate": 7.175419572589104e-05, + "loss": 0.01292372401803732, + "num_input_tokens_seen": 98141368, + "step": 5993, + "train_runtime": 48697.5321, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.632727272727273, + "grad_norm": 0.007386879064142704, + "learning_rate": 7.174553704899086e-05, + "loss": 0.012308338657021523, + "num_input_tokens_seen": 98157744, + "step": 5994, + "train_runtime": 48705.6451, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.6333333333333333, + "grad_norm": 0.006014546845108271, + "learning_rate": 7.173687756776563e-05, + "loss": 0.011594224721193314, + "num_input_tokens_seen": 98174120, + "step": 5995, + "train_runtime": 48713.7602, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.633939393939394, + "grad_norm": 0.00886901468038559, + "learning_rate": 7.172821728253562e-05, + "loss": 0.012894167564809322, + "num_input_tokens_seen": 98190496, + "step": 5996, + "train_runtime": 48721.8759, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.6345454545454547, + "grad_norm": 0.005130333360284567, + "learning_rate": 7.171955619362116e-05, + "loss": 0.011047592386603355, + "num_input_tokens_seen": 98206872, + "step": 5997, + "train_runtime": 48729.9923, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.6351515151515152, + "grad_norm": 0.004651688039302826, + "learning_rate": 7.171089430134262e-05, + "loss": 0.011271205730736256, + "num_input_tokens_seen": 98223248, + "step": 5998, + "train_runtime": 48738.11, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.6357575757575757, + "grad_norm": 0.006117654498666525, + "learning_rate": 7.170223160602036e-05, + "loss": 0.011832252144813538, + "num_input_tokens_seen": 98239624, + "step": 5999, + "train_runtime": 48746.2324, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.6363636363636362, + "grad_norm": 0.008248839527368546, + "learning_rate": 7.169356810797485e-05, + "loss": 0.012528887018561363, + "num_input_tokens_seen": 98256000, + "step": 6000, + "train_runtime": 48754.3542, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.6369696969696967, + "grad_norm": 0.008471083827316761, + "learning_rate": 7.168490380752649e-05, + "loss": 0.012415273115038872, + "num_input_tokens_seen": 98272376, + "step": 6001, + "train_runtime": 48763.4052, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.6375757575757577, + "grad_norm": 0.010201534256339073, + "learning_rate": 7.167623870499576e-05, + "loss": 0.01219233125448227, + "num_input_tokens_seen": 98288752, + "step": 6002, + "train_runtime": 48771.52, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.638181818181818, + "grad_norm": 0.00201614061370492, + "learning_rate": 7.166757280070318e-05, + "loss": 0.012617104686796665, + "num_input_tokens_seen": 98305128, + "step": 6003, + "train_runtime": 48779.6344, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.6387878787878787, + "grad_norm": 0.0121644651517272, + "learning_rate": 7.165890609496928e-05, + "loss": 0.012308474630117416, + "num_input_tokens_seen": 98321504, + "step": 6004, + "train_runtime": 48787.7465, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.6393939393939396, + "grad_norm": 0.026779375970363617, + "learning_rate": 7.165023858811462e-05, + "loss": 0.012913781218230724, + "num_input_tokens_seen": 98337880, + "step": 6005, + "train_runtime": 48795.8587, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.64, + "grad_norm": 0.00477451179176569, + "learning_rate": 7.164157028045979e-05, + "loss": 0.012724547646939754, + "num_input_tokens_seen": 98354256, + "step": 6006, + "train_runtime": 48803.9694, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.6406060606060606, + "grad_norm": 0.006358371116220951, + "learning_rate": 7.163290117232542e-05, + "loss": 0.012128914706408978, + "num_input_tokens_seen": 98370632, + "step": 6007, + "train_runtime": 48812.0857, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.641212121212121, + "grad_norm": 0.008677128702402115, + "learning_rate": 7.162423126403217e-05, + "loss": 0.010942158289253712, + "num_input_tokens_seen": 98387008, + "step": 6008, + "train_runtime": 48820.198, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.6418181818181816, + "grad_norm": 0.008646576665341854, + "learning_rate": 7.161556055590071e-05, + "loss": 0.010778071358799934, + "num_input_tokens_seen": 98403384, + "step": 6009, + "train_runtime": 48828.3173, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.6424242424242426, + "grad_norm": 0.009995764121413231, + "learning_rate": 7.160688904825177e-05, + "loss": 0.01314915157854557, + "num_input_tokens_seen": 98419760, + "step": 6010, + "train_runtime": 48836.433, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.643030303030303, + "grad_norm": 0.002854242455214262, + "learning_rate": 7.159821674140607e-05, + "loss": 0.012515944428741932, + "num_input_tokens_seen": 98436136, + "step": 6011, + "train_runtime": 48844.5495, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.6436363636363636, + "grad_norm": 0.007694566156715155, + "learning_rate": 7.15895436356844e-05, + "loss": 0.012820071540772915, + "num_input_tokens_seen": 98452512, + "step": 6012, + "train_runtime": 48852.6653, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.6442424242424245, + "grad_norm": 0.005492750555276871, + "learning_rate": 7.158086973140756e-05, + "loss": 0.011673707515001297, + "num_input_tokens_seen": 98468888, + "step": 6013, + "train_runtime": 48860.7773, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.644848484848485, + "grad_norm": 0.008452641777694225, + "learning_rate": 7.157219502889636e-05, + "loss": 0.012209806591272354, + "num_input_tokens_seen": 98485264, + "step": 6014, + "train_runtime": 48868.8909, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.6454545454545455, + "grad_norm": 0.007447637151926756, + "learning_rate": 7.15635195284717e-05, + "loss": 0.01129322312772274, + "num_input_tokens_seen": 98501640, + "step": 6015, + "train_runtime": 48877.0062, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.646060606060606, + "grad_norm": 0.008995541371405125, + "learning_rate": 7.155484323045441e-05, + "loss": 0.012981800362467766, + "num_input_tokens_seen": 98518016, + "step": 6016, + "train_runtime": 48885.1209, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.6466666666666665, + "grad_norm": 0.005966581404209137, + "learning_rate": 7.154616613516548e-05, + "loss": 0.011928796768188477, + "num_input_tokens_seen": 98534392, + "step": 6017, + "train_runtime": 48893.2361, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.6472727272727274, + "grad_norm": 0.006047350354492664, + "learning_rate": 7.153748824292581e-05, + "loss": 0.011867531575262547, + "num_input_tokens_seen": 98550768, + "step": 6018, + "train_runtime": 48901.3513, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.647878787878788, + "grad_norm": 0.009243872947990894, + "learning_rate": 7.152880955405638e-05, + "loss": 0.012442519888281822, + "num_input_tokens_seen": 98567144, + "step": 6019, + "train_runtime": 48909.4623, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.6484848484848484, + "grad_norm": 0.006562298629432917, + "learning_rate": 7.152013006887823e-05, + "loss": 0.013065743260085583, + "num_input_tokens_seen": 98583520, + "step": 6020, + "train_runtime": 48917.5761, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.649090909090909, + "grad_norm": 0.0034189948346465826, + "learning_rate": 7.151144978771237e-05, + "loss": 0.011747035197913647, + "num_input_tokens_seen": 98599896, + "step": 6021, + "train_runtime": 48925.6982, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.6496969696969694, + "grad_norm": 0.005699932109564543, + "learning_rate": 7.150276871087987e-05, + "loss": 0.011496735736727715, + "num_input_tokens_seen": 98616272, + "step": 6022, + "train_runtime": 48933.8328, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.6503030303030304, + "grad_norm": 0.0005929829785600305, + "learning_rate": 7.149408683870183e-05, + "loss": 0.012868471443653107, + "num_input_tokens_seen": 98632648, + "step": 6023, + "train_runtime": 48941.9491, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.650909090909091, + "grad_norm": 0.009052216075360775, + "learning_rate": 7.148540417149938e-05, + "loss": 0.01339772343635559, + "num_input_tokens_seen": 98649024, + "step": 6024, + "train_runtime": 48950.0652, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.6515151515151514, + "grad_norm": 0.012863198295235634, + "learning_rate": 7.147672070959367e-05, + "loss": 0.01292307861149311, + "num_input_tokens_seen": 98665400, + "step": 6025, + "train_runtime": 48958.179, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.6521212121212123, + "grad_norm": 0.00936830323189497, + "learning_rate": 7.146803645330587e-05, + "loss": 0.012493688613176346, + "num_input_tokens_seen": 98681776, + "step": 6026, + "train_runtime": 48966.2909, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.652727272727273, + "grad_norm": 0.00684219179674983, + "learning_rate": 7.145935140295724e-05, + "loss": 0.013306375592947006, + "num_input_tokens_seen": 98698152, + "step": 6027, + "train_runtime": 48974.4055, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.6533333333333333, + "grad_norm": 0.006948802154511213, + "learning_rate": 7.145066555886897e-05, + "loss": 0.011734046041965485, + "num_input_tokens_seen": 98714528, + "step": 6028, + "train_runtime": 48982.5169, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.653939393939394, + "grad_norm": 0.004060723818838596, + "learning_rate": 7.144197892136236e-05, + "loss": 0.01118649821728468, + "num_input_tokens_seen": 98730904, + "step": 6029, + "train_runtime": 48990.6335, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.6545454545454543, + "grad_norm": 0.005371112376451492, + "learning_rate": 7.14332914907587e-05, + "loss": 0.012133692391216755, + "num_input_tokens_seen": 98747280, + "step": 6030, + "train_runtime": 48998.7476, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.6551515151515153, + "grad_norm": 0.00899979006499052, + "learning_rate": 7.142460326737933e-05, + "loss": 0.011642271652817726, + "num_input_tokens_seen": 98763656, + "step": 6031, + "train_runtime": 49006.8643, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.6557575757575758, + "grad_norm": 0.01003950648009777, + "learning_rate": 7.141591425154562e-05, + "loss": 0.012990344315767288, + "num_input_tokens_seen": 98780032, + "step": 6032, + "train_runtime": 49014.979, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.6563636363636363, + "grad_norm": 0.007446490693837404, + "learning_rate": 7.140722444357893e-05, + "loss": 0.01267696637660265, + "num_input_tokens_seen": 98796408, + "step": 6033, + "train_runtime": 49023.0987, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.656969696969697, + "grad_norm": 0.006436123512685299, + "learning_rate": 7.13985338438007e-05, + "loss": 0.011776996776461601, + "num_input_tokens_seen": 98812784, + "step": 6034, + "train_runtime": 49031.2156, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.6575757575757577, + "grad_norm": 0.004802078474313021, + "learning_rate": 7.138984245253238e-05, + "loss": 0.012874181382358074, + "num_input_tokens_seen": 98829160, + "step": 6035, + "train_runtime": 49039.3347, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.658181818181818, + "grad_norm": 0.020184827968478203, + "learning_rate": 7.138115027009544e-05, + "loss": 0.012180539779365063, + "num_input_tokens_seen": 98845536, + "step": 6036, + "train_runtime": 49047.4489, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.6587878787878787, + "grad_norm": 0.006393152289092541, + "learning_rate": 7.13724572968114e-05, + "loss": 0.01233154907822609, + "num_input_tokens_seen": 98861912, + "step": 6037, + "train_runtime": 49055.5653, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.659393939393939, + "grad_norm": 0.009826752357184887, + "learning_rate": 7.136376353300179e-05, + "loss": 0.012414640747010708, + "num_input_tokens_seen": 98878288, + "step": 6038, + "train_runtime": 49063.6775, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.66, + "grad_norm": 0.0020025684498250484, + "learning_rate": 7.135506897898814e-05, + "loss": 0.012105888687074184, + "num_input_tokens_seen": 98894664, + "step": 6039, + "train_runtime": 49071.7891, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.6606060606060606, + "grad_norm": 0.0033968070056289434, + "learning_rate": 7.13463736350921e-05, + "loss": 0.01147972047328949, + "num_input_tokens_seen": 98911040, + "step": 6040, + "train_runtime": 49079.9055, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.661212121212121, + "grad_norm": 0.00744658475741744, + "learning_rate": 7.133767750163526e-05, + "loss": 0.013104964047670364, + "num_input_tokens_seen": 98927416, + "step": 6041, + "train_runtime": 49088.0185, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.661818181818182, + "grad_norm": 0.007308576721698046, + "learning_rate": 7.132898057893929e-05, + "loss": 0.011652044951915741, + "num_input_tokens_seen": 98943792, + "step": 6042, + "train_runtime": 49096.1346, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.6624242424242426, + "grad_norm": 0.006826466415077448, + "learning_rate": 7.132028286732585e-05, + "loss": 0.013032132759690285, + "num_input_tokens_seen": 98960168, + "step": 6043, + "train_runtime": 49104.2479, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.663030303030303, + "grad_norm": 0.005819359794259071, + "learning_rate": 7.131158436711668e-05, + "loss": 0.011946756392717361, + "num_input_tokens_seen": 98976544, + "step": 6044, + "train_runtime": 49112.37, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.6636363636363636, + "grad_norm": 0.012134311720728874, + "learning_rate": 7.13028850786335e-05, + "loss": 0.011372080072760582, + "num_input_tokens_seen": 98992920, + "step": 6045, + "train_runtime": 49120.4859, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.664242424242424, + "grad_norm": 0.00799193512648344, + "learning_rate": 7.129418500219809e-05, + "loss": 0.012703660875558853, + "num_input_tokens_seen": 99009296, + "step": 6046, + "train_runtime": 49128.5984, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.664848484848485, + "grad_norm": 0.009202188812196255, + "learning_rate": 7.128548413813225e-05, + "loss": 0.011977000162005424, + "num_input_tokens_seen": 99025672, + "step": 6047, + "train_runtime": 49136.712, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.6654545454545455, + "grad_norm": 0.007550109177827835, + "learning_rate": 7.127678248675779e-05, + "loss": 0.012299132533371449, + "num_input_tokens_seen": 99042048, + "step": 6048, + "train_runtime": 49144.8349, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.666060606060606, + "grad_norm": 0.008233691565692425, + "learning_rate": 7.126808004839658e-05, + "loss": 0.011315702460706234, + "num_input_tokens_seen": 99058424, + "step": 6049, + "train_runtime": 49152.9533, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 0.012306444346904755, + "learning_rate": 7.125937682337052e-05, + "loss": 0.014014697633683681, + "num_input_tokens_seen": 99074800, + "step": 6050, + "train_runtime": 49161.0691, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.667272727272727, + "grad_norm": 0.008057020604610443, + "learning_rate": 7.12506728120015e-05, + "loss": 0.011323971673846245, + "num_input_tokens_seen": 99091176, + "step": 6051, + "train_runtime": 49169.1887, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.667878787878788, + "grad_norm": 0.00504477322101593, + "learning_rate": 7.12419680146115e-05, + "loss": 0.010937046259641647, + "num_input_tokens_seen": 99107552, + "step": 6052, + "train_runtime": 49177.3092, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.6684848484848485, + "grad_norm": 0.009694903157651424, + "learning_rate": 7.123326243152244e-05, + "loss": 0.013486252166330814, + "num_input_tokens_seen": 99123928, + "step": 6053, + "train_runtime": 49185.4312, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.669090909090909, + "grad_norm": 0.0037320698611438274, + "learning_rate": 7.122455606305637e-05, + "loss": 0.010877583175897598, + "num_input_tokens_seen": 99140304, + "step": 6054, + "train_runtime": 49193.5473, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.66969696969697, + "grad_norm": 0.008396755903959274, + "learning_rate": 7.12158489095353e-05, + "loss": 0.012185371480882168, + "num_input_tokens_seen": 99156680, + "step": 6055, + "train_runtime": 49201.6612, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.6703030303030304, + "grad_norm": 0.005162857007235289, + "learning_rate": 7.120714097128129e-05, + "loss": 0.011676938273012638, + "num_input_tokens_seen": 99173056, + "step": 6056, + "train_runtime": 49209.7793, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.670909090909091, + "grad_norm": 0.010337937623262405, + "learning_rate": 7.119843224861645e-05, + "loss": 0.011439969763159752, + "num_input_tokens_seen": 99189432, + "step": 6057, + "train_runtime": 49217.8941, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.6715151515151514, + "grad_norm": 0.005314069800078869, + "learning_rate": 7.118972274186286e-05, + "loss": 0.011886910535395145, + "num_input_tokens_seen": 99205808, + "step": 6058, + "train_runtime": 49226.0095, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.672121212121212, + "grad_norm": 0.006004201248288155, + "learning_rate": 7.118101245134271e-05, + "loss": 0.012565717101097107, + "num_input_tokens_seen": 99222184, + "step": 6059, + "train_runtime": 49234.1315, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.672727272727273, + "grad_norm": 0.0071060252375900745, + "learning_rate": 7.117230137737815e-05, + "loss": 0.012319165281951427, + "num_input_tokens_seen": 99238560, + "step": 6060, + "train_runtime": 49242.243, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.6733333333333333, + "grad_norm": 0.007341449614614248, + "learning_rate": 7.11635895202914e-05, + "loss": 0.011326941661536694, + "num_input_tokens_seen": 99254936, + "step": 6061, + "train_runtime": 49250.3545, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.673939393939394, + "grad_norm": 0.007920407690107822, + "learning_rate": 7.115487688040468e-05, + "loss": 0.01276029460132122, + "num_input_tokens_seen": 99271312, + "step": 6062, + "train_runtime": 49258.4692, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.674545454545455, + "grad_norm": 0.0043686022982001305, + "learning_rate": 7.114616345804026e-05, + "loss": 0.012116245925426483, + "num_input_tokens_seen": 99287688, + "step": 6063, + "train_runtime": 49266.5848, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.6751515151515153, + "grad_norm": 0.007974096573889256, + "learning_rate": 7.113744925352043e-05, + "loss": 0.012273788452148438, + "num_input_tokens_seen": 99304064, + "step": 6064, + "train_runtime": 49274.6986, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.675757575757576, + "grad_norm": 0.014010514132678509, + "learning_rate": 7.112873426716754e-05, + "loss": 0.010899647139012814, + "num_input_tokens_seen": 99320440, + "step": 6065, + "train_runtime": 49282.8139, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.6763636363636363, + "grad_norm": 0.006286607123911381, + "learning_rate": 7.112001849930388e-05, + "loss": 0.011665970087051392, + "num_input_tokens_seen": 99336816, + "step": 6066, + "train_runtime": 49290.9334, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.6769696969696968, + "grad_norm": 0.008172346279025078, + "learning_rate": 7.111130195025189e-05, + "loss": 0.012696857564151287, + "num_input_tokens_seen": 99353192, + "step": 6067, + "train_runtime": 49299.0441, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.6775757575757577, + "grad_norm": 0.008489429019391537, + "learning_rate": 7.110258462033394e-05, + "loss": 0.011572959832847118, + "num_input_tokens_seen": 99369568, + "step": 6068, + "train_runtime": 49307.1553, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.678181818181818, + "grad_norm": 0.004255587700754404, + "learning_rate": 7.10938665098725e-05, + "loss": 0.01205289363861084, + "num_input_tokens_seen": 99385944, + "step": 6069, + "train_runtime": 49315.2648, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.6787878787878787, + "grad_norm": 0.007641472388058901, + "learning_rate": 7.108514761918999e-05, + "loss": 0.012084404937922955, + "num_input_tokens_seen": 99402320, + "step": 6070, + "train_runtime": 49323.3778, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.6793939393939397, + "grad_norm": 0.007658126298338175, + "learning_rate": 7.107642794860895e-05, + "loss": 0.012432295829057693, + "num_input_tokens_seen": 99418696, + "step": 6071, + "train_runtime": 49331.4954, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.68, + "grad_norm": 0.007507559843361378, + "learning_rate": 7.106770749845189e-05, + "loss": 0.012327196076512337, + "num_input_tokens_seen": 99435072, + "step": 6072, + "train_runtime": 49339.6065, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.6806060606060607, + "grad_norm": 0.00621723523363471, + "learning_rate": 7.105898626904134e-05, + "loss": 0.011722536757588387, + "num_input_tokens_seen": 99451448, + "step": 6073, + "train_runtime": 49347.7189, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.681212121212121, + "grad_norm": 0.0023858556523919106, + "learning_rate": 7.105026426069989e-05, + "loss": 0.011853603646159172, + "num_input_tokens_seen": 99467824, + "step": 6074, + "train_runtime": 49355.8349, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.6818181818181817, + "grad_norm": 0.005720058921724558, + "learning_rate": 7.104154147375018e-05, + "loss": 0.011282936669886112, + "num_input_tokens_seen": 99484200, + "step": 6075, + "train_runtime": 49363.9488, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.6824242424242426, + "grad_norm": 0.006661038845777512, + "learning_rate": 7.103281790851482e-05, + "loss": 0.011736983433365822, + "num_input_tokens_seen": 99500576, + "step": 6076, + "train_runtime": 49372.0607, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.683030303030303, + "grad_norm": 0.009576331824064255, + "learning_rate": 7.10240935653165e-05, + "loss": 0.012589178048074245, + "num_input_tokens_seen": 99516952, + "step": 6077, + "train_runtime": 49380.1692, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.6836363636363636, + "grad_norm": 0.0047458321787416935, + "learning_rate": 7.101536844447789e-05, + "loss": 0.010931774973869324, + "num_input_tokens_seen": 99533328, + "step": 6078, + "train_runtime": 49388.2812, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.684242424242424, + "grad_norm": 0.009446113370358944, + "learning_rate": 7.10066425463217e-05, + "loss": 0.010143307037651539, + "num_input_tokens_seen": 99549704, + "step": 6079, + "train_runtime": 49396.3937, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.6848484848484846, + "grad_norm": 0.006444686558097601, + "learning_rate": 7.099791587117074e-05, + "loss": 0.011781896464526653, + "num_input_tokens_seen": 99566080, + "step": 6080, + "train_runtime": 49404.5054, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.6854545454545455, + "grad_norm": 0.01002875342965126, + "learning_rate": 7.098918841934775e-05, + "loss": 0.011732235550880432, + "num_input_tokens_seen": 99582456, + "step": 6081, + "train_runtime": 49412.6169, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.686060606060606, + "grad_norm": 0.006969011388719082, + "learning_rate": 7.098046019117557e-05, + "loss": 0.011486491188406944, + "num_input_tokens_seen": 99598832, + "step": 6082, + "train_runtime": 49420.7316, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.6866666666666665, + "grad_norm": 0.004786635749042034, + "learning_rate": 7.097173118697702e-05, + "loss": 0.012051347643136978, + "num_input_tokens_seen": 99615208, + "step": 6083, + "train_runtime": 49428.8447, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.6872727272727275, + "grad_norm": 0.013967028819024563, + "learning_rate": 7.096300140707497e-05, + "loss": 0.01182099711149931, + "num_input_tokens_seen": 99631584, + "step": 6084, + "train_runtime": 49436.954, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.687878787878788, + "grad_norm": 0.009063187055289745, + "learning_rate": 7.095427085179231e-05, + "loss": 0.012995096854865551, + "num_input_tokens_seen": 99647960, + "step": 6085, + "train_runtime": 49445.0709, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.6884848484848485, + "grad_norm": 0.01060736458748579, + "learning_rate": 7.094553952145202e-05, + "loss": 0.012583239935338497, + "num_input_tokens_seen": 99664336, + "step": 6086, + "train_runtime": 49453.1839, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.689090909090909, + "grad_norm": 0.008590192534029484, + "learning_rate": 7.093680741637698e-05, + "loss": 0.01221366599202156, + "num_input_tokens_seen": 99680712, + "step": 6087, + "train_runtime": 49461.296, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.6896969696969695, + "grad_norm": 0.0057416618801653385, + "learning_rate": 7.09280745368902e-05, + "loss": 0.01309233345091343, + "num_input_tokens_seen": 99697088, + "step": 6088, + "train_runtime": 49469.4087, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.6903030303030304, + "grad_norm": 0.009168618358671665, + "learning_rate": 7.091934088331472e-05, + "loss": 0.012509659864008427, + "num_input_tokens_seen": 99713464, + "step": 6089, + "train_runtime": 49477.5207, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 3.690909090909091, + "grad_norm": 0.004785375203937292, + "learning_rate": 7.091060645597354e-05, + "loss": 0.012110128998756409, + "num_input_tokens_seen": 99729840, + "step": 6090, + "train_runtime": 49485.6352, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 3.6915151515151514, + "grad_norm": 0.006008240394294262, + "learning_rate": 7.090187125518976e-05, + "loss": 0.011102152056992054, + "num_input_tokens_seen": 99746216, + "step": 6091, + "train_runtime": 49493.7466, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 3.6921212121212124, + "grad_norm": 0.006574293598532677, + "learning_rate": 7.089313528128646e-05, + "loss": 0.011857850477099419, + "num_input_tokens_seen": 99762592, + "step": 6092, + "train_runtime": 49501.8614, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 3.692727272727273, + "grad_norm": 0.0076755499467253685, + "learning_rate": 7.088439853458677e-05, + "loss": 0.010987117886543274, + "num_input_tokens_seen": 99778968, + "step": 6093, + "train_runtime": 49509.9768, + "train_tokens_per_second": 2015.331 + }, + { + "epoch": 3.6933333333333334, + "grad_norm": 0.007850540801882744, + "learning_rate": 7.087566101541386e-05, + "loss": 0.011388396844267845, + "num_input_tokens_seen": 99795344, + "step": 6094, + "train_runtime": 49518.0893, + "train_tokens_per_second": 2015.331 + }, + { + "epoch": 3.693939393939394, + "grad_norm": 0.004209704231470823, + "learning_rate": 7.08669227240909e-05, + "loss": 0.012166139669716358, + "num_input_tokens_seen": 99811720, + "step": 6095, + "train_runtime": 49526.2034, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 3.6945454545454544, + "grad_norm": 0.010567103512585163, + "learning_rate": 7.08581836609411e-05, + "loss": 0.011990534141659737, + "num_input_tokens_seen": 99828096, + "step": 6096, + "train_runtime": 49534.3149, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 3.6951515151515153, + "grad_norm": 0.0044053709134459496, + "learning_rate": 7.08494438262877e-05, + "loss": 0.01174754835665226, + "num_input_tokens_seen": 99844472, + "step": 6097, + "train_runtime": 49542.4309, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 3.695757575757576, + "grad_norm": 0.012768080458045006, + "learning_rate": 7.0840703220454e-05, + "loss": 0.013282276690006256, + "num_input_tokens_seen": 99860848, + "step": 6098, + "train_runtime": 49550.5441, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 3.6963636363636363, + "grad_norm": 0.0072809867560863495, + "learning_rate": 7.083196184376326e-05, + "loss": 0.01209140196442604, + "num_input_tokens_seen": 99877224, + "step": 6099, + "train_runtime": 49558.6597, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 3.6969696969696972, + "grad_norm": 0.004790001083165407, + "learning_rate": 7.082321969653882e-05, + "loss": 0.010578993707895279, + "num_input_tokens_seen": 99893600, + "step": 6100, + "train_runtime": 49566.7762, + "train_tokens_per_second": 2015.334 + }, + { + "epoch": 3.6975757575757577, + "grad_norm": 0.009768215008080006, + "learning_rate": 7.081447677910403e-05, + "loss": 0.011625876650214195, + "num_input_tokens_seen": 99909976, + "step": 6101, + "train_runtime": 49575.7884, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.6981818181818182, + "grad_norm": 0.006306584924459457, + "learning_rate": 7.080573309178228e-05, + "loss": 0.013317365199327469, + "num_input_tokens_seen": 99926352, + "step": 6102, + "train_runtime": 49583.8971, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.6987878787878787, + "grad_norm": 0.00731854559853673, + "learning_rate": 7.079698863489697e-05, + "loss": 0.011600594967603683, + "num_input_tokens_seen": 99942728, + "step": 6103, + "train_runtime": 49592.0091, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.6993939393939392, + "grad_norm": 0.006959907244890928, + "learning_rate": 7.078824340877156e-05, + "loss": 0.012481660582125187, + "num_input_tokens_seen": 99959104, + "step": 6104, + "train_runtime": 49600.1314, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.7, + "grad_norm": 0.005241766571998596, + "learning_rate": 7.077949741372952e-05, + "loss": 0.011339916847646236, + "num_input_tokens_seen": 99975480, + "step": 6105, + "train_runtime": 49608.2512, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.7006060606060607, + "grad_norm": 0.003554681781679392, + "learning_rate": 7.077075065009433e-05, + "loss": 0.011468221433460712, + "num_input_tokens_seen": 99991856, + "step": 6106, + "train_runtime": 49616.3659, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.701212121212121, + "grad_norm": 0.010734444484114647, + "learning_rate": 7.076200311818953e-05, + "loss": 0.012592852115631104, + "num_input_tokens_seen": 100008232, + "step": 6107, + "train_runtime": 49624.4778, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.7018181818181817, + "grad_norm": 0.005460778716951609, + "learning_rate": 7.075325481833864e-05, + "loss": 0.01201942190527916, + "num_input_tokens_seen": 100024608, + "step": 6108, + "train_runtime": 49632.5895, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.702424242424242, + "grad_norm": 0.01087447814643383, + "learning_rate": 7.07445057508653e-05, + "loss": 0.012521288357675076, + "num_input_tokens_seen": 100040984, + "step": 6109, + "train_runtime": 49640.704, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.703030303030303, + "grad_norm": 0.0069513460621237755, + "learning_rate": 7.073575591609307e-05, + "loss": 0.012009191326797009, + "num_input_tokens_seen": 100057360, + "step": 6110, + "train_runtime": 49648.8519, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.7036363636363636, + "grad_norm": 0.010641966946423054, + "learning_rate": 7.072700531434562e-05, + "loss": 0.012477566488087177, + "num_input_tokens_seen": 100073736, + "step": 6111, + "train_runtime": 49656.972, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.704242424242424, + "grad_norm": 0.007244534324854612, + "learning_rate": 7.07182539459466e-05, + "loss": 0.011816626414656639, + "num_input_tokens_seen": 100090112, + "step": 6112, + "train_runtime": 49665.087, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.704848484848485, + "grad_norm": 0.004940944258123636, + "learning_rate": 7.070950181121971e-05, + "loss": 0.011662865057587624, + "num_input_tokens_seen": 100106488, + "step": 6113, + "train_runtime": 49673.2039, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.7054545454545456, + "grad_norm": 0.009497140534222126, + "learning_rate": 7.070074891048869e-05, + "loss": 0.013099215924739838, + "num_input_tokens_seen": 100122864, + "step": 6114, + "train_runtime": 49681.3325, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.706060606060606, + "grad_norm": 0.006224688608199358, + "learning_rate": 7.069199524407729e-05, + "loss": 0.012911731377243996, + "num_input_tokens_seen": 100139240, + "step": 6115, + "train_runtime": 49689.4514, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.7066666666666666, + "grad_norm": 0.004790599923580885, + "learning_rate": 7.068324081230926e-05, + "loss": 0.012193256989121437, + "num_input_tokens_seen": 100155616, + "step": 6116, + "train_runtime": 49697.564, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.707272727272727, + "grad_norm": 0.010161981917917728, + "learning_rate": 7.067448561550844e-05, + "loss": 0.011556712910532951, + "num_input_tokens_seen": 100171992, + "step": 6117, + "train_runtime": 49705.6757, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.707878787878788, + "grad_norm": 0.009129038080573082, + "learning_rate": 7.066572965399865e-05, + "loss": 0.010842759162187576, + "num_input_tokens_seen": 100188368, + "step": 6118, + "train_runtime": 49713.7951, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.7084848484848485, + "grad_norm": 0.006416292395442724, + "learning_rate": 7.065697292810379e-05, + "loss": 0.01169460266828537, + "num_input_tokens_seen": 100204744, + "step": 6119, + "train_runtime": 49721.9072, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.709090909090909, + "grad_norm": 0.006567268166691065, + "learning_rate": 7.06482154381477e-05, + "loss": 0.01260090060532093, + "num_input_tokens_seen": 100221120, + "step": 6120, + "train_runtime": 49730.0203, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.70969696969697, + "grad_norm": 0.014589307829737663, + "learning_rate": 7.063945718445434e-05, + "loss": 0.012710961513221264, + "num_input_tokens_seen": 100237496, + "step": 6121, + "train_runtime": 49738.1324, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.7103030303030304, + "grad_norm": 0.018595868721604347, + "learning_rate": 7.063069816734766e-05, + "loss": 0.0131887998431921, + "num_input_tokens_seen": 100253872, + "step": 6122, + "train_runtime": 49746.2435, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.710909090909091, + "grad_norm": 0.004063964355736971, + "learning_rate": 7.062193838715163e-05, + "loss": 0.011781821958720684, + "num_input_tokens_seen": 100270248, + "step": 6123, + "train_runtime": 49754.3596, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.7115151515151514, + "grad_norm": 0.0060160052962601185, + "learning_rate": 7.061317784419026e-05, + "loss": 0.010919107124209404, + "num_input_tokens_seen": 100286624, + "step": 6124, + "train_runtime": 49762.4719, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.712121212121212, + "grad_norm": 0.006648882292211056, + "learning_rate": 7.060441653878757e-05, + "loss": 0.010862770490348339, + "num_input_tokens_seen": 100303000, + "step": 6125, + "train_runtime": 49770.5861, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.712727272727273, + "grad_norm": 0.010435924865305424, + "learning_rate": 7.059565447126765e-05, + "loss": 0.012973819859325886, + "num_input_tokens_seen": 100319376, + "step": 6126, + "train_runtime": 49778.7053, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.7133333333333334, + "grad_norm": 0.010175895877182484, + "learning_rate": 7.058689164195458e-05, + "loss": 0.011099039576947689, + "num_input_tokens_seen": 100335752, + "step": 6127, + "train_runtime": 49786.8309, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.713939393939394, + "grad_norm": 0.008159595541656017, + "learning_rate": 7.057812805117248e-05, + "loss": 0.013320503756403923, + "num_input_tokens_seen": 100352128, + "step": 6128, + "train_runtime": 49794.9417, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.714545454545455, + "grad_norm": 0.008256862871348858, + "learning_rate": 7.056936369924548e-05, + "loss": 0.012284916825592518, + "num_input_tokens_seen": 100368504, + "step": 6129, + "train_runtime": 49803.0572, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.7151515151515153, + "grad_norm": 0.005253160838037729, + "learning_rate": 7.05605985864978e-05, + "loss": 0.012717574834823608, + "num_input_tokens_seen": 100384880, + "step": 6130, + "train_runtime": 49811.1699, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.715757575757576, + "grad_norm": 0.005987518932670355, + "learning_rate": 7.055183271325359e-05, + "loss": 0.011760477907955647, + "num_input_tokens_seen": 100401256, + "step": 6131, + "train_runtime": 49819.2856, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.7163636363636363, + "grad_norm": 0.0044915336184203625, + "learning_rate": 7.054306607983714e-05, + "loss": 0.013111140578985214, + "num_input_tokens_seen": 100417632, + "step": 6132, + "train_runtime": 49827.3977, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.716969696969697, + "grad_norm": 0.008920470252633095, + "learning_rate": 7.053429868657265e-05, + "loss": 0.012774036265909672, + "num_input_tokens_seen": 100434008, + "step": 6133, + "train_runtime": 49835.5119, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.7175757575757578, + "grad_norm": 0.0033695001620799303, + "learning_rate": 7.052553053378447e-05, + "loss": 0.011837953701615334, + "num_input_tokens_seen": 100450384, + "step": 6134, + "train_runtime": 49843.6316, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.7181818181818183, + "grad_norm": 0.006853393279016018, + "learning_rate": 7.051676162179685e-05, + "loss": 0.012261848896741867, + "num_input_tokens_seen": 100466760, + "step": 6135, + "train_runtime": 49851.7464, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.7187878787878788, + "grad_norm": 0.05167530104517937, + "learning_rate": 7.05079919509342e-05, + "loss": 0.013621787540614605, + "num_input_tokens_seen": 100483136, + "step": 6136, + "train_runtime": 49859.8607, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.7193939393939393, + "grad_norm": 0.005209414288401604, + "learning_rate": 7.049922152152087e-05, + "loss": 0.012447446584701538, + "num_input_tokens_seen": 100499512, + "step": 6137, + "train_runtime": 49867.9754, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.7199999999999998, + "grad_norm": 0.006675231270492077, + "learning_rate": 7.049045033388127e-05, + "loss": 0.011370368301868439, + "num_input_tokens_seen": 100515888, + "step": 6138, + "train_runtime": 49876.0907, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.7206060606060607, + "grad_norm": 0.004050488118082285, + "learning_rate": 7.048167838833977e-05, + "loss": 0.011599705554544926, + "num_input_tokens_seen": 100532264, + "step": 6139, + "train_runtime": 49884.2006, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.721212121212121, + "grad_norm": 0.009823303669691086, + "learning_rate": 7.04729056852209e-05, + "loss": 0.012846910394728184, + "num_input_tokens_seen": 100548640, + "step": 6140, + "train_runtime": 49892.3146, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.7218181818181817, + "grad_norm": 0.005695359315723181, + "learning_rate": 7.046413222484912e-05, + "loss": 0.01155055407434702, + "num_input_tokens_seen": 100565016, + "step": 6141, + "train_runtime": 49900.431, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.7224242424242426, + "grad_norm": 0.026027467101812363, + "learning_rate": 7.045535800754894e-05, + "loss": 0.012161691673099995, + "num_input_tokens_seen": 100581392, + "step": 6142, + "train_runtime": 49908.545, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.723030303030303, + "grad_norm": 0.005927909631282091, + "learning_rate": 7.044658303364489e-05, + "loss": 0.01181608997285366, + "num_input_tokens_seen": 100597768, + "step": 6143, + "train_runtime": 49916.6559, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.7236363636363636, + "grad_norm": 0.004832983948290348, + "learning_rate": 7.043780730346155e-05, + "loss": 0.012428647838532925, + "num_input_tokens_seen": 100614144, + "step": 6144, + "train_runtime": 49924.7686, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.724242424242424, + "grad_norm": 0.00798672903329134, + "learning_rate": 7.042903081732353e-05, + "loss": 0.012029257602989674, + "num_input_tokens_seen": 100630520, + "step": 6145, + "train_runtime": 49932.8788, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.7248484848484846, + "grad_norm": 0.0037467426154762506, + "learning_rate": 7.042025357555546e-05, + "loss": 0.012248550541698933, + "num_input_tokens_seen": 100646896, + "step": 6146, + "train_runtime": 49940.9914, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.7254545454545456, + "grad_norm": 0.006329103838652372, + "learning_rate": 7.041147557848195e-05, + "loss": 0.013356457464396954, + "num_input_tokens_seen": 100663272, + "step": 6147, + "train_runtime": 49949.1111, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.726060606060606, + "grad_norm": 0.007191803306341171, + "learning_rate": 7.040269682642772e-05, + "loss": 0.012202011421322823, + "num_input_tokens_seen": 100679648, + "step": 6148, + "train_runtime": 49957.2308, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.7266666666666666, + "grad_norm": 0.012146863155066967, + "learning_rate": 7.039391731971746e-05, + "loss": 0.01120641641318798, + "num_input_tokens_seen": 100696024, + "step": 6149, + "train_runtime": 49965.3433, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.7272727272727275, + "grad_norm": 0.007628040388226509, + "learning_rate": 7.038513705867592e-05, + "loss": 0.012934167869389057, + "num_input_tokens_seen": 100712400, + "step": 6150, + "train_runtime": 49973.4543, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.727878787878788, + "grad_norm": 0.00526001350954175, + "learning_rate": 7.037635604362785e-05, + "loss": 0.013145286589860916, + "num_input_tokens_seen": 100728776, + "step": 6151, + "train_runtime": 49981.5653, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.7284848484848485, + "grad_norm": 0.007276763673871756, + "learning_rate": 7.036757427489806e-05, + "loss": 0.011824960820376873, + "num_input_tokens_seen": 100745152, + "step": 6152, + "train_runtime": 49989.6816, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.729090909090909, + "grad_norm": 0.006176415365189314, + "learning_rate": 7.035879175281136e-05, + "loss": 0.011389459483325481, + "num_input_tokens_seen": 100761528, + "step": 6153, + "train_runtime": 49997.7992, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.7296969696969695, + "grad_norm": 0.006907724775373936, + "learning_rate": 7.03500084776926e-05, + "loss": 0.011938108131289482, + "num_input_tokens_seen": 100777904, + "step": 6154, + "train_runtime": 50005.914, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.7303030303030305, + "grad_norm": 0.006327489856630564, + "learning_rate": 7.034122444986666e-05, + "loss": 0.012253811582922935, + "num_input_tokens_seen": 100794280, + "step": 6155, + "train_runtime": 50014.0334, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.730909090909091, + "grad_norm": 0.0038651106879115105, + "learning_rate": 7.033243966965842e-05, + "loss": 0.011343946680426598, + "num_input_tokens_seen": 100810656, + "step": 6156, + "train_runtime": 50022.1502, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.7315151515151515, + "grad_norm": 0.006951532326638699, + "learning_rate": 7.032365413739286e-05, + "loss": 0.012608840130269527, + "num_input_tokens_seen": 100827032, + "step": 6157, + "train_runtime": 50030.2652, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.7321212121212124, + "grad_norm": 0.002559596672654152, + "learning_rate": 7.031486785339488e-05, + "loss": 0.011989197693765163, + "num_input_tokens_seen": 100843408, + "step": 6158, + "train_runtime": 50038.3762, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 3.732727272727273, + "grad_norm": 0.0035041761584579945, + "learning_rate": 7.030608081798954e-05, + "loss": 0.011986854486167431, + "num_input_tokens_seen": 100859784, + "step": 6159, + "train_runtime": 50046.4878, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 0.0061879889108240604, + "learning_rate": 7.029729303150178e-05, + "loss": 0.011915595270693302, + "num_input_tokens_seen": 100876160, + "step": 6160, + "train_runtime": 50054.6022, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 3.733939393939394, + "grad_norm": 0.0062517523765563965, + "learning_rate": 7.02885044942567e-05, + "loss": 0.01267078798264265, + "num_input_tokens_seen": 100892536, + "step": 6161, + "train_runtime": 50062.7127, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.7345454545454544, + "grad_norm": 0.00880285445600748, + "learning_rate": 7.027971520657933e-05, + "loss": 0.011717623099684715, + "num_input_tokens_seen": 100908912, + "step": 6162, + "train_runtime": 50070.8313, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 3.7351515151515153, + "grad_norm": 0.008314872160553932, + "learning_rate": 7.02709251687948e-05, + "loss": 0.011546156369149685, + "num_input_tokens_seen": 100925288, + "step": 6163, + "train_runtime": 50078.9468, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.735757575757576, + "grad_norm": 0.005738316103816032, + "learning_rate": 7.026213438122822e-05, + "loss": 0.011595218442380428, + "num_input_tokens_seen": 100941664, + "step": 6164, + "train_runtime": 50087.0611, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 3.7363636363636363, + "grad_norm": 0.0035557630471885204, + "learning_rate": 7.025334284420475e-05, + "loss": 0.011444474570453167, + "num_input_tokens_seen": 100958040, + "step": 6165, + "train_runtime": 50095.176, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.736969696969697, + "grad_norm": 0.007933280430734158, + "learning_rate": 7.024455055804958e-05, + "loss": 0.011498530395328999, + "num_input_tokens_seen": 100974416, + "step": 6166, + "train_runtime": 50103.2916, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 3.7375757575757573, + "grad_norm": 0.008891841396689415, + "learning_rate": 7.023575752308789e-05, + "loss": 0.011515995487570763, + "num_input_tokens_seen": 100990792, + "step": 6167, + "train_runtime": 50111.4037, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.7381818181818183, + "grad_norm": 0.006845598109066486, + "learning_rate": 7.022696373964495e-05, + "loss": 0.011527330614626408, + "num_input_tokens_seen": 101007168, + "step": 6168, + "train_runtime": 50119.516, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 3.7387878787878788, + "grad_norm": 0.009364659897983074, + "learning_rate": 7.0218169208046e-05, + "loss": 0.011629846878349781, + "num_input_tokens_seen": 101023544, + "step": 6169, + "train_runtime": 50127.631, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.7393939393939393, + "grad_norm": 0.0070641119964420795, + "learning_rate": 7.020937392861635e-05, + "loss": 0.011849287897348404, + "num_input_tokens_seen": 101039920, + "step": 6170, + "train_runtime": 50135.7477, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.74, + "grad_norm": 0.005642692092806101, + "learning_rate": 7.020057790168131e-05, + "loss": 0.011664772406220436, + "num_input_tokens_seen": 101056296, + "step": 6171, + "train_runtime": 50143.8638, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 3.7406060606060607, + "grad_norm": 0.00877367053180933, + "learning_rate": 7.019178112756624e-05, + "loss": 0.011600376106798649, + "num_input_tokens_seen": 101072672, + "step": 6172, + "train_runtime": 50151.9784, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.741212121212121, + "grad_norm": 0.0064525906927883625, + "learning_rate": 7.018298360659651e-05, + "loss": 0.011062362231314182, + "num_input_tokens_seen": 101089048, + "step": 6173, + "train_runtime": 50160.0944, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 3.7418181818181817, + "grad_norm": 0.005572921130806208, + "learning_rate": 7.017418533909753e-05, + "loss": 0.01225447840988636, + "num_input_tokens_seen": 101105424, + "step": 6174, + "train_runtime": 50168.2095, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 3.742424242424242, + "grad_norm": 0.00930140819400549, + "learning_rate": 7.016538632539469e-05, + "loss": 0.012088697403669357, + "num_input_tokens_seen": 101121800, + "step": 6175, + "train_runtime": 50176.3324, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 3.743030303030303, + "grad_norm": 0.004761696793138981, + "learning_rate": 7.01565865658135e-05, + "loss": 0.012576943263411522, + "num_input_tokens_seen": 101138176, + "step": 6176, + "train_runtime": 50184.4432, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 3.7436363636363637, + "grad_norm": 0.006412998307496309, + "learning_rate": 7.014778606067942e-05, + "loss": 0.011582130566239357, + "num_input_tokens_seen": 101154552, + "step": 6177, + "train_runtime": 50192.5612, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 3.744242424242424, + "grad_norm": 0.0038372015114873648, + "learning_rate": 7.013898481031798e-05, + "loss": 0.011392990127205849, + "num_input_tokens_seen": 101170928, + "step": 6178, + "train_runtime": 50200.6744, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 3.744848484848485, + "grad_norm": 0.006285810377448797, + "learning_rate": 7.01301828150547e-05, + "loss": 0.01059443224221468, + "num_input_tokens_seen": 101187304, + "step": 6179, + "train_runtime": 50208.7893, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 3.7454545454545456, + "grad_norm": 0.0013831730466336012, + "learning_rate": 7.012138007521516e-05, + "loss": 0.01197945885360241, + "num_input_tokens_seen": 101203680, + "step": 6180, + "train_runtime": 50216.9047, + "train_tokens_per_second": 2015.331 + }, + { + "epoch": 3.746060606060606, + "grad_norm": 0.006065284367650747, + "learning_rate": 7.011257659112495e-05, + "loss": 0.012310333549976349, + "num_input_tokens_seen": 101220056, + "step": 6181, + "train_runtime": 50225.019, + "train_tokens_per_second": 2015.331 + }, + { + "epoch": 3.7466666666666666, + "grad_norm": 0.00903791468590498, + "learning_rate": 7.010377236310974e-05, + "loss": 0.012551707215607166, + "num_input_tokens_seen": 101236432, + "step": 6182, + "train_runtime": 50233.1315, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 3.747272727272727, + "grad_norm": 0.00444659311324358, + "learning_rate": 7.009496739149509e-05, + "loss": 0.010280096903443336, + "num_input_tokens_seen": 101252808, + "step": 6183, + "train_runtime": 50241.246, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 3.747878787878788, + "grad_norm": 0.007244815118610859, + "learning_rate": 7.008616167660676e-05, + "loss": 0.011820579878985882, + "num_input_tokens_seen": 101269184, + "step": 6184, + "train_runtime": 50249.3586, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 3.7484848484848485, + "grad_norm": 0.00674406997859478, + "learning_rate": 7.00773552187704e-05, + "loss": 0.011130580678582191, + "num_input_tokens_seen": 101285560, + "step": 6185, + "train_runtime": 50257.4705, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 3.749090909090909, + "grad_norm": 0.006520967930555344, + "learning_rate": 7.00685480183118e-05, + "loss": 0.011374793015420437, + "num_input_tokens_seen": 101301936, + "step": 6186, + "train_runtime": 50265.5863, + "train_tokens_per_second": 2015.334 + }, + { + "epoch": 3.74969696969697, + "grad_norm": 0.004546190146356821, + "learning_rate": 7.005974007555667e-05, + "loss": 0.011290723457932472, + "num_input_tokens_seen": 101318312, + "step": 6187, + "train_runtime": 50273.6997, + "train_tokens_per_second": 2015.334 + }, + { + "epoch": 3.75030303030303, + "grad_norm": 0.007600209675729275, + "learning_rate": 7.005093139083082e-05, + "loss": 0.01197967678308487, + "num_input_tokens_seen": 101334688, + "step": 6188, + "train_runtime": 50281.8154, + "train_tokens_per_second": 2015.335 + }, + { + "epoch": 3.750909090909091, + "grad_norm": 0.008863379247486591, + "learning_rate": 7.004212196446007e-05, + "loss": 0.012247136794030666, + "num_input_tokens_seen": 101351064, + "step": 6189, + "train_runtime": 50289.9324, + "train_tokens_per_second": 2015.335 + }, + { + "epoch": 3.7515151515151515, + "grad_norm": 0.005255416966974735, + "learning_rate": 7.003331179677025e-05, + "loss": 0.010582678951323032, + "num_input_tokens_seen": 101367440, + "step": 6190, + "train_runtime": 50298.0445, + "train_tokens_per_second": 2015.336 + }, + { + "epoch": 3.752121212121212, + "grad_norm": 0.005283467937260866, + "learning_rate": 7.002450088808725e-05, + "loss": 0.011309179477393627, + "num_input_tokens_seen": 101383816, + "step": 6191, + "train_runtime": 50306.1555, + "train_tokens_per_second": 2015.336 + }, + { + "epoch": 3.752727272727273, + "grad_norm": 0.005632340908050537, + "learning_rate": 7.001568923873697e-05, + "loss": 0.011790482327342033, + "num_input_tokens_seen": 101400192, + "step": 6192, + "train_runtime": 50314.2692, + "train_tokens_per_second": 2015.337 + }, + { + "epoch": 3.7533333333333334, + "grad_norm": 0.008353588171303272, + "learning_rate": 7.00068768490453e-05, + "loss": 0.011613572016358376, + "num_input_tokens_seen": 101416568, + "step": 6193, + "train_runtime": 50322.3794, + "train_tokens_per_second": 2015.337 + }, + { + "epoch": 3.753939393939394, + "grad_norm": 0.00661675538867712, + "learning_rate": 6.999806371933821e-05, + "loss": 0.011216689832508564, + "num_input_tokens_seen": 101432944, + "step": 6194, + "train_runtime": 50330.4954, + "train_tokens_per_second": 2015.338 + }, + { + "epoch": 3.7545454545454544, + "grad_norm": 0.00700417160987854, + "learning_rate": 6.99892498499417e-05, + "loss": 0.012164724059402943, + "num_input_tokens_seen": 101449320, + "step": 6195, + "train_runtime": 50338.6105, + "train_tokens_per_second": 2015.338 + }, + { + "epoch": 3.755151515151515, + "grad_norm": 0.01226690411567688, + "learning_rate": 6.998043524118179e-05, + "loss": 0.012744542211294174, + "num_input_tokens_seen": 101465696, + "step": 6196, + "train_runtime": 50346.7319, + "train_tokens_per_second": 2015.338 + }, + { + "epoch": 3.755757575757576, + "grad_norm": 0.006545317359268665, + "learning_rate": 6.997161989338447e-05, + "loss": 0.013349631801247597, + "num_input_tokens_seen": 101482072, + "step": 6197, + "train_runtime": 50354.8472, + "train_tokens_per_second": 2015.339 + }, + { + "epoch": 3.7563636363636363, + "grad_norm": 0.002004138194024563, + "learning_rate": 6.996280380687582e-05, + "loss": 0.011462580412626266, + "num_input_tokens_seen": 101498448, + "step": 6198, + "train_runtime": 50362.9641, + "train_tokens_per_second": 2015.339 + }, + { + "epoch": 3.756969696969697, + "grad_norm": 0.005299938376992941, + "learning_rate": 6.995398698198193e-05, + "loss": 0.011646542698144913, + "num_input_tokens_seen": 101514824, + "step": 6199, + "train_runtime": 50371.0845, + "train_tokens_per_second": 2015.339 + }, + { + "epoch": 3.757575757575758, + "grad_norm": 0.00693443464115262, + "learning_rate": 6.994516941902892e-05, + "loss": 0.011881626211106777, + "num_input_tokens_seen": 101531200, + "step": 6200, + "train_runtime": 50379.1995, + "train_tokens_per_second": 2015.34 + }, + { + "epoch": 3.7581818181818183, + "grad_norm": 0.006186201702803373, + "learning_rate": 6.993635111834294e-05, + "loss": 0.01320985984057188, + "num_input_tokens_seen": 101547576, + "step": 6201, + "train_runtime": 50388.4068, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.758787878787879, + "grad_norm": 0.007361643947660923, + "learning_rate": 6.992753208025016e-05, + "loss": 0.0114086102694273, + "num_input_tokens_seen": 101563952, + "step": 6202, + "train_runtime": 50396.5317, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.7593939393939393, + "grad_norm": 0.005118601955473423, + "learning_rate": 6.991871230507677e-05, + "loss": 0.011720122769474983, + "num_input_tokens_seen": 101580328, + "step": 6203, + "train_runtime": 50404.6445, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.76, + "grad_norm": 0.009265156462788582, + "learning_rate": 6.990989179314901e-05, + "loss": 0.01224612072110176, + "num_input_tokens_seen": 101596704, + "step": 6204, + "train_runtime": 50412.7583, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.7606060606060607, + "grad_norm": 0.009933829307556152, + "learning_rate": 6.990107054479312e-05, + "loss": 0.012603172101080418, + "num_input_tokens_seen": 101613080, + "step": 6205, + "train_runtime": 50420.8714, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.7612121212121212, + "grad_norm": 0.0014932660851627588, + "learning_rate": 6.989224856033539e-05, + "loss": 0.01061934232711792, + "num_input_tokens_seen": 101629456, + "step": 6206, + "train_runtime": 50428.9838, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.7618181818181817, + "grad_norm": 0.004755176603794098, + "learning_rate": 6.988342584010211e-05, + "loss": 0.012208852916955948, + "num_input_tokens_seen": 101645832, + "step": 6207, + "train_runtime": 50437.0945, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.7624242424242427, + "grad_norm": 0.005914865992963314, + "learning_rate": 6.987460238441962e-05, + "loss": 0.012417688965797424, + "num_input_tokens_seen": 101662208, + "step": 6208, + "train_runtime": 50445.2036, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.763030303030303, + "grad_norm": 0.01143135316669941, + "learning_rate": 6.986577819361433e-05, + "loss": 0.011047020554542542, + "num_input_tokens_seen": 101678584, + "step": 6209, + "train_runtime": 50453.3145, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.7636363636363637, + "grad_norm": 0.003401104360818863, + "learning_rate": 6.985695326801257e-05, + "loss": 0.011690821498632431, + "num_input_tokens_seen": 101694960, + "step": 6210, + "train_runtime": 50461.4317, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.764242424242424, + "grad_norm": 0.007185343187302351, + "learning_rate": 6.984812760794079e-05, + "loss": 0.01228499785065651, + "num_input_tokens_seen": 101711336, + "step": 6211, + "train_runtime": 50469.5451, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.7648484848484847, + "grad_norm": 0.004432342015206814, + "learning_rate": 6.983930121372541e-05, + "loss": 0.012384985573589802, + "num_input_tokens_seen": 101727712, + "step": 6212, + "train_runtime": 50477.6559, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.7654545454545456, + "grad_norm": 0.007363414391875267, + "learning_rate": 6.98304740856929e-05, + "loss": 0.013010870665311813, + "num_input_tokens_seen": 101744088, + "step": 6213, + "train_runtime": 50485.7753, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.766060606060606, + "grad_norm": 0.003437815001234412, + "learning_rate": 6.98216462241698e-05, + "loss": 0.012327833101153374, + "num_input_tokens_seen": 101760464, + "step": 6214, + "train_runtime": 50493.8894, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.7666666666666666, + "grad_norm": 0.014301762916147709, + "learning_rate": 6.98128176294826e-05, + "loss": 0.01271708495914936, + "num_input_tokens_seen": 101776840, + "step": 6215, + "train_runtime": 50502.0054, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.767272727272727, + "grad_norm": 0.007811973802745342, + "learning_rate": 6.980398830195785e-05, + "loss": 0.013009576126933098, + "num_input_tokens_seen": 101793216, + "step": 6216, + "train_runtime": 50510.1168, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.7678787878787876, + "grad_norm": 0.006871999241411686, + "learning_rate": 6.979515824192213e-05, + "loss": 0.011495106853544712, + "num_input_tokens_seen": 101809592, + "step": 6217, + "train_runtime": 50518.2316, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.7684848484848485, + "grad_norm": 0.01024109497666359, + "learning_rate": 6.978632744970208e-05, + "loss": 0.013111365959048271, + "num_input_tokens_seen": 101825968, + "step": 6218, + "train_runtime": 50526.3443, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.769090909090909, + "grad_norm": 0.008795710280537605, + "learning_rate": 6.97774959256243e-05, + "loss": 0.012000957503914833, + "num_input_tokens_seen": 101842344, + "step": 6219, + "train_runtime": 50534.4623, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.7696969696969695, + "grad_norm": 0.007226441986858845, + "learning_rate": 6.976866367001547e-05, + "loss": 0.010619294829666615, + "num_input_tokens_seen": 101858720, + "step": 6220, + "train_runtime": 50542.5702, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.7703030303030305, + "grad_norm": 0.008143438026309013, + "learning_rate": 6.975983068320224e-05, + "loss": 0.012904131785035133, + "num_input_tokens_seen": 101875096, + "step": 6221, + "train_runtime": 50550.6841, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.770909090909091, + "grad_norm": 0.0037744438741356134, + "learning_rate": 6.975099696551137e-05, + "loss": 0.011135710403323174, + "num_input_tokens_seen": 101891472, + "step": 6222, + "train_runtime": 50558.7944, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.7715151515151515, + "grad_norm": 0.0052701570093631744, + "learning_rate": 6.974216251726959e-05, + "loss": 0.011534260585904121, + "num_input_tokens_seen": 101907848, + "step": 6223, + "train_runtime": 50566.9076, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.772121212121212, + "grad_norm": 0.006530883722007275, + "learning_rate": 6.973332733880366e-05, + "loss": 0.012807786464691162, + "num_input_tokens_seen": 101924224, + "step": 6224, + "train_runtime": 50575.0184, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.7727272727272725, + "grad_norm": 0.01617208495736122, + "learning_rate": 6.972449143044038e-05, + "loss": 0.012696351855993271, + "num_input_tokens_seen": 101940600, + "step": 6225, + "train_runtime": 50583.1315, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.7733333333333334, + "grad_norm": 0.005134264938533306, + "learning_rate": 6.971565479250659e-05, + "loss": 0.011527364142239094, + "num_input_tokens_seen": 101956976, + "step": 6226, + "train_runtime": 50591.2411, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.773939393939394, + "grad_norm": 0.010241348296403885, + "learning_rate": 6.970681742532911e-05, + "loss": 0.012190565466880798, + "num_input_tokens_seen": 101973352, + "step": 6227, + "train_runtime": 50599.352, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.7745454545454544, + "grad_norm": 0.006757514551281929, + "learning_rate": 6.969797932923483e-05, + "loss": 0.01201988011598587, + "num_input_tokens_seen": 101989728, + "step": 6228, + "train_runtime": 50607.4633, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.7751515151515154, + "grad_norm": 0.004881802015006542, + "learning_rate": 6.968914050455064e-05, + "loss": 0.010866053402423859, + "num_input_tokens_seen": 102006104, + "step": 6229, + "train_runtime": 50615.5755, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.775757575757576, + "grad_norm": 0.007667435798794031, + "learning_rate": 6.968030095160352e-05, + "loss": 0.012708387337625027, + "num_input_tokens_seen": 102022480, + "step": 6230, + "train_runtime": 50623.6906, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.7763636363636364, + "grad_norm": 0.006477579474449158, + "learning_rate": 6.967146067072037e-05, + "loss": 0.012421897612512112, + "num_input_tokens_seen": 102038856, + "step": 6231, + "train_runtime": 50631.8064, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.776969696969697, + "grad_norm": 0.005117279477417469, + "learning_rate": 6.96626196622282e-05, + "loss": 0.012911765836179256, + "num_input_tokens_seen": 102055232, + "step": 6232, + "train_runtime": 50639.9207, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.7775757575757574, + "grad_norm": 0.00917720329016447, + "learning_rate": 6.965377792645403e-05, + "loss": 0.012036660686135292, + "num_input_tokens_seen": 102071608, + "step": 6233, + "train_runtime": 50648.0362, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.7781818181818183, + "grad_norm": 0.011608646251261234, + "learning_rate": 6.964493546372489e-05, + "loss": 0.013109451159834862, + "num_input_tokens_seen": 102087984, + "step": 6234, + "train_runtime": 50656.1464, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.778787878787879, + "grad_norm": 0.006358626298606396, + "learning_rate": 6.963609227436783e-05, + "loss": 0.012244734913110733, + "num_input_tokens_seen": 102104360, + "step": 6235, + "train_runtime": 50664.2571, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.7793939393939393, + "grad_norm": 0.00882328674197197, + "learning_rate": 6.962724835870996e-05, + "loss": 0.011628245003521442, + "num_input_tokens_seen": 102120736, + "step": 6236, + "train_runtime": 50672.3733, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.7800000000000002, + "grad_norm": 0.006372332572937012, + "learning_rate": 6.96184037170784e-05, + "loss": 0.012618664652109146, + "num_input_tokens_seen": 102137112, + "step": 6237, + "train_runtime": 50680.4852, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.7806060606060607, + "grad_norm": 0.00975918211042881, + "learning_rate": 6.960955834980028e-05, + "loss": 0.012569336220622063, + "num_input_tokens_seen": 102153488, + "step": 6238, + "train_runtime": 50688.5985, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.7812121212121212, + "grad_norm": 0.006028352305293083, + "learning_rate": 6.96007122572028e-05, + "loss": 0.012872707098722458, + "num_input_tokens_seen": 102169864, + "step": 6239, + "train_runtime": 50696.7078, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.7818181818181817, + "grad_norm": 0.0055358633399009705, + "learning_rate": 6.959186543961313e-05, + "loss": 0.011887052096426487, + "num_input_tokens_seen": 102186240, + "step": 6240, + "train_runtime": 50704.8316, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.7824242424242422, + "grad_norm": 0.0038938550278544426, + "learning_rate": 6.958301789735852e-05, + "loss": 0.011469402350485325, + "num_input_tokens_seen": 102202616, + "step": 6241, + "train_runtime": 50712.958, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.783030303030303, + "grad_norm": 0.007196472957730293, + "learning_rate": 6.95741696307662e-05, + "loss": 0.013860711827874184, + "num_input_tokens_seen": 102218992, + "step": 6242, + "train_runtime": 50721.0908, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.7836363636363637, + "grad_norm": 0.00775848189368844, + "learning_rate": 6.956532064016348e-05, + "loss": 0.011535650119185448, + "num_input_tokens_seen": 102235368, + "step": 6243, + "train_runtime": 50729.2168, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.784242424242424, + "grad_norm": 0.005473027005791664, + "learning_rate": 6.955647092587765e-05, + "loss": 0.012289511039853096, + "num_input_tokens_seen": 102251744, + "step": 6244, + "train_runtime": 50737.3433, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.7848484848484847, + "grad_norm": 0.006196952424943447, + "learning_rate": 6.954762048823604e-05, + "loss": 0.010886055417358875, + "num_input_tokens_seen": 102268120, + "step": 6245, + "train_runtime": 50745.4745, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.785454545454545, + "grad_norm": 0.0035442921798676252, + "learning_rate": 6.953876932756602e-05, + "loss": 0.010779273696243763, + "num_input_tokens_seen": 102284496, + "step": 6246, + "train_runtime": 50753.6075, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.786060606060606, + "grad_norm": 0.008348997682332993, + "learning_rate": 6.952991744419499e-05, + "loss": 0.012499358505010605, + "num_input_tokens_seen": 102300872, + "step": 6247, + "train_runtime": 50761.7446, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.7866666666666666, + "grad_norm": 0.011172893457114697, + "learning_rate": 6.952106483845031e-05, + "loss": 0.011979687958955765, + "num_input_tokens_seen": 102317248, + "step": 6248, + "train_runtime": 50769.8826, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.787272727272727, + "grad_norm": 0.006730454508215189, + "learning_rate": 6.951221151065947e-05, + "loss": 0.011870699934661388, + "num_input_tokens_seen": 102333624, + "step": 6249, + "train_runtime": 50778.0048, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.787878787878788, + "grad_norm": 0.005516422912478447, + "learning_rate": 6.950335746114993e-05, + "loss": 0.011424764059484005, + "num_input_tokens_seen": 102350000, + "step": 6250, + "train_runtime": 50786.1341, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.7884848484848486, + "grad_norm": 0.008181494660675526, + "learning_rate": 6.949450269024919e-05, + "loss": 0.012610391713678837, + "num_input_tokens_seen": 102366376, + "step": 6251, + "train_runtime": 50794.2553, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.789090909090909, + "grad_norm": 0.0063712880946695805, + "learning_rate": 6.948564719828473e-05, + "loss": 0.011726764030754566, + "num_input_tokens_seen": 102382752, + "step": 6252, + "train_runtime": 50802.3928, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.7896969696969696, + "grad_norm": 0.006658040452748537, + "learning_rate": 6.947679098558414e-05, + "loss": 0.011346281506121159, + "num_input_tokens_seen": 102399128, + "step": 6253, + "train_runtime": 50810.5324, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.79030303030303, + "grad_norm": 0.005373132880777121, + "learning_rate": 6.9467934052475e-05, + "loss": 0.012088925577700138, + "num_input_tokens_seen": 102415504, + "step": 6254, + "train_runtime": 50818.6668, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.790909090909091, + "grad_norm": 0.00888933427631855, + "learning_rate": 6.945907639928488e-05, + "loss": 0.012270631268620491, + "num_input_tokens_seen": 102431880, + "step": 6255, + "train_runtime": 50826.8018, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.7915151515151515, + "grad_norm": 0.0037128084804862738, + "learning_rate": 6.945021802634141e-05, + "loss": 0.011445199139416218, + "num_input_tokens_seen": 102448256, + "step": 6256, + "train_runtime": 50834.9338, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.792121212121212, + "grad_norm": 0.008714632131159306, + "learning_rate": 6.944135893397225e-05, + "loss": 0.01311169657856226, + "num_input_tokens_seen": 102464632, + "step": 6257, + "train_runtime": 50843.0668, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.792727272727273, + "grad_norm": 0.008877130225300789, + "learning_rate": 6.94324991225051e-05, + "loss": 0.011372465640306473, + "num_input_tokens_seen": 102481008, + "step": 6258, + "train_runtime": 50851.1967, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.7933333333333334, + "grad_norm": 0.004365301225334406, + "learning_rate": 6.942363859226764e-05, + "loss": 0.0108201764523983, + "num_input_tokens_seen": 102497384, + "step": 6259, + "train_runtime": 50859.3319, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.793939393939394, + "grad_norm": 0.006937627214938402, + "learning_rate": 6.941477734358762e-05, + "loss": 0.011296378448605537, + "num_input_tokens_seen": 102513760, + "step": 6260, + "train_runtime": 50867.4617, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.7945454545454544, + "grad_norm": 0.0049238745123147964, + "learning_rate": 6.940591537679279e-05, + "loss": 0.011746073141694069, + "num_input_tokens_seen": 102530136, + "step": 6261, + "train_runtime": 50875.5903, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.795151515151515, + "grad_norm": 0.003748950781300664, + "learning_rate": 6.939705269221093e-05, + "loss": 0.011998830363154411, + "num_input_tokens_seen": 102546512, + "step": 6262, + "train_runtime": 50883.7485, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.795757575757576, + "grad_norm": 0.005771557800471783, + "learning_rate": 6.938818929016988e-05, + "loss": 0.011723121628165245, + "num_input_tokens_seen": 102562888, + "step": 6263, + "train_runtime": 50891.8851, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.7963636363636364, + "grad_norm": 0.0069015431217849255, + "learning_rate": 6.937932517099747e-05, + "loss": 0.012203088961541653, + "num_input_tokens_seen": 102579264, + "step": 6264, + "train_runtime": 50900.0302, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.796969696969697, + "grad_norm": 0.006898272782564163, + "learning_rate": 6.937046033502155e-05, + "loss": 0.010199167765676975, + "num_input_tokens_seen": 102595640, + "step": 6265, + "train_runtime": 50908.1544, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.797575757575758, + "grad_norm": 0.0032226622570306063, + "learning_rate": 6.936159478257003e-05, + "loss": 0.01159664522856474, + "num_input_tokens_seen": 102612016, + "step": 6266, + "train_runtime": 50916.2813, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.7981818181818183, + "grad_norm": 0.008912288583815098, + "learning_rate": 6.935272851397081e-05, + "loss": 0.011811792850494385, + "num_input_tokens_seen": 102628392, + "step": 6267, + "train_runtime": 50924.3943, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.798787878787879, + "grad_norm": 0.01008197758346796, + "learning_rate": 6.934386152955189e-05, + "loss": 0.013175941072404385, + "num_input_tokens_seen": 102644768, + "step": 6268, + "train_runtime": 50932.5092, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.7993939393939393, + "grad_norm": 0.007799266371876001, + "learning_rate": 6.933499382964115e-05, + "loss": 0.010943270288407803, + "num_input_tokens_seen": 102661144, + "step": 6269, + "train_runtime": 50940.6219, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.8, + "grad_norm": 0.0077049885876476765, + "learning_rate": 6.932612541456666e-05, + "loss": 0.012552403844892979, + "num_input_tokens_seen": 102677520, + "step": 6270, + "train_runtime": 50948.7391, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.8006060606060608, + "grad_norm": 0.0050292289815843105, + "learning_rate": 6.931725628465643e-05, + "loss": 0.01115911453962326, + "num_input_tokens_seen": 102693896, + "step": 6271, + "train_runtime": 50956.8537, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.8012121212121213, + "grad_norm": 0.008714534342288971, + "learning_rate": 6.930838644023851e-05, + "loss": 0.012213939800858498, + "num_input_tokens_seen": 102710272, + "step": 6272, + "train_runtime": 50964.9724, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.8018181818181818, + "grad_norm": 0.007284838706254959, + "learning_rate": 6.929951588164098e-05, + "loss": 0.012722796760499477, + "num_input_tokens_seen": 102726648, + "step": 6273, + "train_runtime": 50973.094, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.8024242424242423, + "grad_norm": 0.004358161240816116, + "learning_rate": 6.929064460919195e-05, + "loss": 0.01252426952123642, + "num_input_tokens_seen": 102743024, + "step": 6274, + "train_runtime": 50981.2175, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 3.8030303030303028, + "grad_norm": 0.008258230984210968, + "learning_rate": 6.928177262321952e-05, + "loss": 0.012353505939245224, + "num_input_tokens_seen": 102759400, + "step": 6275, + "train_runtime": 50989.3322, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.8036363636363637, + "grad_norm": 0.006422831676900387, + "learning_rate": 6.927289992405189e-05, + "loss": 0.012414581142365932, + "num_input_tokens_seen": 102775776, + "step": 6276, + "train_runtime": 50997.4439, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.804242424242424, + "grad_norm": 0.007514102850109339, + "learning_rate": 6.926402651201722e-05, + "loss": 0.013054956682026386, + "num_input_tokens_seen": 102792152, + "step": 6277, + "train_runtime": 51005.5586, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.8048484848484847, + "grad_norm": 0.008135175332427025, + "learning_rate": 6.925515238744373e-05, + "loss": 0.012813123874366283, + "num_input_tokens_seen": 102808528, + "step": 6278, + "train_runtime": 51013.6711, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.8054545454545456, + "grad_norm": 0.0029677569400519133, + "learning_rate": 6.924627755065963e-05, + "loss": 0.010926828719675541, + "num_input_tokens_seen": 102824904, + "step": 6279, + "train_runtime": 51021.7917, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.806060606060606, + "grad_norm": 0.006423068232834339, + "learning_rate": 6.923740200199322e-05, + "loss": 0.012639821507036686, + "num_input_tokens_seen": 102841280, + "step": 6280, + "train_runtime": 51029.9454, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 3.8066666666666666, + "grad_norm": 0.0018232105066999793, + "learning_rate": 6.922852574177277e-05, + "loss": 0.010540521703660488, + "num_input_tokens_seen": 102857656, + "step": 6281, + "train_runtime": 51038.0609, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.807272727272727, + "grad_norm": 0.009339011274278164, + "learning_rate": 6.92196487703266e-05, + "loss": 0.012143468484282494, + "num_input_tokens_seen": 102874032, + "step": 6282, + "train_runtime": 51046.1735, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 3.8078787878787876, + "grad_norm": 0.005848820321261883, + "learning_rate": 6.921077108798304e-05, + "loss": 0.01168083306401968, + "num_input_tokens_seen": 102890408, + "step": 6283, + "train_runtime": 51054.286, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.8084848484848486, + "grad_norm": 0.009962841868400574, + "learning_rate": 6.920189269507047e-05, + "loss": 0.011030889116227627, + "num_input_tokens_seen": 102906784, + "step": 6284, + "train_runtime": 51062.4016, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 3.809090909090909, + "grad_norm": 0.006550271529704332, + "learning_rate": 6.91930135919173e-05, + "loss": 0.011285737156867981, + "num_input_tokens_seen": 102923160, + "step": 6285, + "train_runtime": 51070.5191, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.8096969696969696, + "grad_norm": 0.0048977104015648365, + "learning_rate": 6.918413377885192e-05, + "loss": 0.012660562992095947, + "num_input_tokens_seen": 102939536, + "step": 6286, + "train_runtime": 51078.6363, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.8103030303030305, + "grad_norm": 0.008043184876441956, + "learning_rate": 6.91752532562028e-05, + "loss": 0.011686675250530243, + "num_input_tokens_seen": 102955912, + "step": 6287, + "train_runtime": 51086.7547, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.810909090909091, + "grad_norm": 0.006581114139407873, + "learning_rate": 6.916637202429839e-05, + "loss": 0.011970548890531063, + "num_input_tokens_seen": 102972288, + "step": 6288, + "train_runtime": 51094.8733, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 3.8115151515151515, + "grad_norm": 0.009300352074205875, + "learning_rate": 6.915749008346722e-05, + "loss": 0.013120735064148903, + "num_input_tokens_seen": 102988664, + "step": 6289, + "train_runtime": 51102.9887, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.812121212121212, + "grad_norm": 0.004947901237756014, + "learning_rate": 6.914860743403777e-05, + "loss": 0.011821339838206768, + "num_input_tokens_seen": 103005040, + "step": 6290, + "train_runtime": 51111.0986, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 3.8127272727272725, + "grad_norm": 0.007652816828340292, + "learning_rate": 6.913972407633866e-05, + "loss": 0.012305631302297115, + "num_input_tokens_seen": 103021416, + "step": 6291, + "train_runtime": 51119.2116, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.8133333333333335, + "grad_norm": 0.005294968374073505, + "learning_rate": 6.913084001069841e-05, + "loss": 0.01281295157968998, + "num_input_tokens_seen": 103037792, + "step": 6292, + "train_runtime": 51127.3353, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.813939393939394, + "grad_norm": 0.008702762424945831, + "learning_rate": 6.912195523744564e-05, + "loss": 0.012220567092299461, + "num_input_tokens_seen": 103054168, + "step": 6293, + "train_runtime": 51135.4505, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 3.8145454545454545, + "grad_norm": 0.0065480382181704044, + "learning_rate": 6.911306975690899e-05, + "loss": 0.011853879317641258, + "num_input_tokens_seen": 103070544, + "step": 6294, + "train_runtime": 51143.5616, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 3.8151515151515154, + "grad_norm": 0.00338232540525496, + "learning_rate": 6.910418356941711e-05, + "loss": 0.011576632969081402, + "num_input_tokens_seen": 103086920, + "step": 6295, + "train_runtime": 51151.6764, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.815757575757576, + "grad_norm": 0.01020827703177929, + "learning_rate": 6.909529667529868e-05, + "loss": 0.012126404792070389, + "num_input_tokens_seen": 103103296, + "step": 6296, + "train_runtime": 51159.7933, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.8163636363636364, + "grad_norm": 0.010046305134892464, + "learning_rate": 6.908640907488243e-05, + "loss": 0.011948558501899242, + "num_input_tokens_seen": 103119672, + "step": 6297, + "train_runtime": 51167.9078, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 3.816969696969697, + "grad_norm": 0.004263310227543116, + "learning_rate": 6.907752076849705e-05, + "loss": 0.011777518317103386, + "num_input_tokens_seen": 103136048, + "step": 6298, + "train_runtime": 51176.0217, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.8175757575757574, + "grad_norm": 0.005382035858929157, + "learning_rate": 6.906863175647135e-05, + "loss": 0.011934838257730007, + "num_input_tokens_seen": 103152424, + "step": 6299, + "train_runtime": 51184.1422, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.8181818181818183, + "grad_norm": 0.006527692545205355, + "learning_rate": 6.90597420391341e-05, + "loss": 0.012705227360129356, + "num_input_tokens_seen": 103168800, + "step": 6300, + "train_runtime": 51192.2557, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 3.818787878787879, + "grad_norm": 0.004525860771536827, + "learning_rate": 6.905085161681408e-05, + "loss": 0.012897767126560211, + "num_input_tokens_seen": 103185176, + "step": 6301, + "train_runtime": 51201.3347, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.8193939393939393, + "grad_norm": 0.0029775453731417656, + "learning_rate": 6.904196048984019e-05, + "loss": 0.011589577421545982, + "num_input_tokens_seen": 103201552, + "step": 6302, + "train_runtime": 51209.4504, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.82, + "grad_norm": 0.006643456872552633, + "learning_rate": 6.903306865854124e-05, + "loss": 0.01271983701735735, + "num_input_tokens_seen": 103217928, + "step": 6303, + "train_runtime": 51217.5633, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.8206060606060603, + "grad_norm": 0.003929393831640482, + "learning_rate": 6.902417612324615e-05, + "loss": 0.01150571834295988, + "num_input_tokens_seen": 103234304, + "step": 6304, + "train_runtime": 51225.677, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.8212121212121213, + "grad_norm": 0.016976293176412582, + "learning_rate": 6.901528288428384e-05, + "loss": 0.011471095494925976, + "num_input_tokens_seen": 103250680, + "step": 6305, + "train_runtime": 51233.7908, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.821818181818182, + "grad_norm": 0.01078625489026308, + "learning_rate": 6.900638894198326e-05, + "loss": 0.012768702581524849, + "num_input_tokens_seen": 103267056, + "step": 6306, + "train_runtime": 51241.9031, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.8224242424242423, + "grad_norm": 0.007635517977178097, + "learning_rate": 6.899749429667334e-05, + "loss": 0.012750457040965557, + "num_input_tokens_seen": 103283432, + "step": 6307, + "train_runtime": 51250.0182, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.8230303030303032, + "grad_norm": 0.0076364390552043915, + "learning_rate": 6.898859894868311e-05, + "loss": 0.01268252544105053, + "num_input_tokens_seen": 103299808, + "step": 6308, + "train_runtime": 51258.1311, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.8236363636363637, + "grad_norm": 0.013346695341169834, + "learning_rate": 6.897970289834159e-05, + "loss": 0.012609517201781273, + "num_input_tokens_seen": 103316184, + "step": 6309, + "train_runtime": 51266.2505, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.824242424242424, + "grad_norm": 0.007399523165076971, + "learning_rate": 6.897080614597782e-05, + "loss": 0.011911976151168346, + "num_input_tokens_seen": 103332560, + "step": 6310, + "train_runtime": 51274.3738, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.8248484848484847, + "grad_norm": 0.007801682688295841, + "learning_rate": 6.896190869192087e-05, + "loss": 0.012502849102020264, + "num_input_tokens_seen": 103348936, + "step": 6311, + "train_runtime": 51282.4845, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.825454545454545, + "grad_norm": 0.008030702359974384, + "learning_rate": 6.895301053649986e-05, + "loss": 0.012551628984510899, + "num_input_tokens_seen": 103365312, + "step": 6312, + "train_runtime": 51290.5996, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.826060606060606, + "grad_norm": 0.007297629490494728, + "learning_rate": 6.894411168004387e-05, + "loss": 0.013094923458993435, + "num_input_tokens_seen": 103381688, + "step": 6313, + "train_runtime": 51298.7138, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.8266666666666667, + "grad_norm": 0.004709959030151367, + "learning_rate": 6.893521212288212e-05, + "loss": 0.010728122666478157, + "num_input_tokens_seen": 103398064, + "step": 6314, + "train_runtime": 51306.8378, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.827272727272727, + "grad_norm": 0.007531205657869577, + "learning_rate": 6.89263118653437e-05, + "loss": 0.013046185486018658, + "num_input_tokens_seen": 103414440, + "step": 6315, + "train_runtime": 51314.9587, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.827878787878788, + "grad_norm": 0.004712633788585663, + "learning_rate": 6.89174109077579e-05, + "loss": 0.011675922200083733, + "num_input_tokens_seen": 103430816, + "step": 6316, + "train_runtime": 51323.0899, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.8284848484848486, + "grad_norm": 0.009135774336755276, + "learning_rate": 6.890850925045388e-05, + "loss": 0.011904616840183735, + "num_input_tokens_seen": 103447192, + "step": 6317, + "train_runtime": 51331.2091, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.829090909090909, + "grad_norm": 0.0040552509017288685, + "learning_rate": 6.889960689376096e-05, + "loss": 0.011980404146015644, + "num_input_tokens_seen": 103463568, + "step": 6318, + "train_runtime": 51339.3327, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.8296969696969696, + "grad_norm": 0.007523080334067345, + "learning_rate": 6.889070383800837e-05, + "loss": 0.012550567276775837, + "num_input_tokens_seen": 103479944, + "step": 6319, + "train_runtime": 51347.4491, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.83030303030303, + "grad_norm": 0.0059631941840052605, + "learning_rate": 6.888180008352543e-05, + "loss": 0.012197245843708515, + "num_input_tokens_seen": 103496320, + "step": 6320, + "train_runtime": 51355.5626, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.830909090909091, + "grad_norm": 0.009930959902703762, + "learning_rate": 6.887289563064147e-05, + "loss": 0.012333657592535019, + "num_input_tokens_seen": 103512696, + "step": 6321, + "train_runtime": 51363.679, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.8315151515151515, + "grad_norm": 0.006942151114344597, + "learning_rate": 6.886399047968585e-05, + "loss": 0.011899762786924839, + "num_input_tokens_seen": 103529072, + "step": 6322, + "train_runtime": 51371.7947, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.832121212121212, + "grad_norm": 0.00793468113988638, + "learning_rate": 6.8855084630988e-05, + "loss": 0.01224648766219616, + "num_input_tokens_seen": 103545448, + "step": 6323, + "train_runtime": 51379.9137, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.832727272727273, + "grad_norm": 0.006337353494018316, + "learning_rate": 6.884617808487725e-05, + "loss": 0.013225030153989792, + "num_input_tokens_seen": 103561824, + "step": 6324, + "train_runtime": 51388.0428, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 0.009311126545071602, + "learning_rate": 6.883727084168307e-05, + "loss": 0.01117705274373293, + "num_input_tokens_seen": 103578200, + "step": 6325, + "train_runtime": 51396.1638, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.833939393939394, + "grad_norm": 0.009820795617997646, + "learning_rate": 6.882836290173493e-05, + "loss": 0.012306584976613522, + "num_input_tokens_seen": 103594576, + "step": 6326, + "train_runtime": 51404.2922, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 3.8345454545454545, + "grad_norm": 0.008414201438426971, + "learning_rate": 6.881945426536234e-05, + "loss": 0.011296059004962444, + "num_input_tokens_seen": 103610952, + "step": 6327, + "train_runtime": 51412.41, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.835151515151515, + "grad_norm": 0.005937041714787483, + "learning_rate": 6.881054493289476e-05, + "loss": 0.01272441353648901, + "num_input_tokens_seen": 103627328, + "step": 6328, + "train_runtime": 51420.5219, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.835757575757576, + "grad_norm": 0.013918966986238956, + "learning_rate": 6.880163490466176e-05, + "loss": 0.013206088915467262, + "num_input_tokens_seen": 103643704, + "step": 6329, + "train_runtime": 51428.6569, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.8363636363636364, + "grad_norm": 0.009284058585762978, + "learning_rate": 6.87927241809929e-05, + "loss": 0.01275438442826271, + "num_input_tokens_seen": 103660080, + "step": 6330, + "train_runtime": 51436.7763, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.836969696969697, + "grad_norm": 0.009225012734532356, + "learning_rate": 6.878381276221777e-05, + "loss": 0.012789211235940456, + "num_input_tokens_seen": 103676456, + "step": 6331, + "train_runtime": 51444.9053, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.8375757575757574, + "grad_norm": 0.006569364108145237, + "learning_rate": 6.877490064866599e-05, + "loss": 0.012138286605477333, + "num_input_tokens_seen": 103692832, + "step": 6332, + "train_runtime": 51453.0311, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.838181818181818, + "grad_norm": 0.0029423313681036234, + "learning_rate": 6.876598784066719e-05, + "loss": 0.011543401516973972, + "num_input_tokens_seen": 103709208, + "step": 6333, + "train_runtime": 51461.1603, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.838787878787879, + "grad_norm": 0.006257644854485989, + "learning_rate": 6.875707433855104e-05, + "loss": 0.012940492480993271, + "num_input_tokens_seen": 103725584, + "step": 6334, + "train_runtime": 51469.2801, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 3.8393939393939394, + "grad_norm": 0.008554238826036453, + "learning_rate": 6.874816014264724e-05, + "loss": 0.012283507734537125, + "num_input_tokens_seen": 103741960, + "step": 6335, + "train_runtime": 51477.3961, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.84, + "grad_norm": 0.0045417980290949345, + "learning_rate": 6.873924525328553e-05, + "loss": 0.012042131274938583, + "num_input_tokens_seen": 103758336, + "step": 6336, + "train_runtime": 51485.514, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.840606060606061, + "grad_norm": 0.0038913278840482235, + "learning_rate": 6.873032967079561e-05, + "loss": 0.011663387529551983, + "num_input_tokens_seen": 103774712, + "step": 6337, + "train_runtime": 51493.6315, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 3.8412121212121213, + "grad_norm": 0.009882887825369835, + "learning_rate": 6.872141339550727e-05, + "loss": 0.013153274543583393, + "num_input_tokens_seen": 103791088, + "step": 6338, + "train_runtime": 51501.7432, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.841818181818182, + "grad_norm": 0.007413309067487717, + "learning_rate": 6.871249642775032e-05, + "loss": 0.011152937076985836, + "num_input_tokens_seen": 103807464, + "step": 6339, + "train_runtime": 51509.8589, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.8424242424242423, + "grad_norm": 0.00392877496778965, + "learning_rate": 6.870357876785455e-05, + "loss": 0.01119923498481512, + "num_input_tokens_seen": 103823840, + "step": 6340, + "train_runtime": 51517.9812, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.843030303030303, + "grad_norm": 0.006815628614276648, + "learning_rate": 6.869466041614984e-05, + "loss": 0.011933619156479836, + "num_input_tokens_seen": 103840216, + "step": 6341, + "train_runtime": 51526.1054, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 3.8436363636363637, + "grad_norm": 0.005715903826057911, + "learning_rate": 6.868574137296604e-05, + "loss": 0.012202315032482147, + "num_input_tokens_seen": 103856592, + "step": 6342, + "train_runtime": 51534.2223, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.8442424242424242, + "grad_norm": 0.0059768082574009895, + "learning_rate": 6.867682163863306e-05, + "loss": 0.011637840420007706, + "num_input_tokens_seen": 103872968, + "step": 6343, + "train_runtime": 51542.3348, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 3.8448484848484847, + "grad_norm": 0.0067747351713478565, + "learning_rate": 6.86679012134808e-05, + "loss": 0.012454125098884106, + "num_input_tokens_seen": 103889344, + "step": 6344, + "train_runtime": 51550.45, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.8454545454545457, + "grad_norm": 0.013112933374941349, + "learning_rate": 6.865898009783925e-05, + "loss": 0.012568272650241852, + "num_input_tokens_seen": 103905720, + "step": 6345, + "train_runtime": 51558.564, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.846060606060606, + "grad_norm": 0.02602527104318142, + "learning_rate": 6.865005829203838e-05, + "loss": 0.011992212384939194, + "num_input_tokens_seen": 103922096, + "step": 6346, + "train_runtime": 51566.6776, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.8466666666666667, + "grad_norm": 0.008191145025193691, + "learning_rate": 6.864113579640814e-05, + "loss": 0.012303100898861885, + "num_input_tokens_seen": 103938472, + "step": 6347, + "train_runtime": 51574.7945, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.847272727272727, + "grad_norm": 0.007829583249986172, + "learning_rate": 6.86322126112786e-05, + "loss": 0.011761456727981567, + "num_input_tokens_seen": 103954848, + "step": 6348, + "train_runtime": 51582.9315, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 3.8478787878787877, + "grad_norm": 0.011124903336167336, + "learning_rate": 6.862328873697978e-05, + "loss": 0.013220653869211674, + "num_input_tokens_seen": 103971224, + "step": 6349, + "train_runtime": 51591.0515, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.8484848484848486, + "grad_norm": 0.005647443700581789, + "learning_rate": 6.861436417384182e-05, + "loss": 0.01169199962168932, + "num_input_tokens_seen": 103987600, + "step": 6350, + "train_runtime": 51599.1654, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 3.849090909090909, + "grad_norm": 0.006080362945795059, + "learning_rate": 6.860543892219476e-05, + "loss": 0.011823397129774094, + "num_input_tokens_seen": 104003976, + "step": 6351, + "train_runtime": 51607.2756, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.8496969696969696, + "grad_norm": 0.007333891000598669, + "learning_rate": 6.859651298236873e-05, + "loss": 0.01262492872774601, + "num_input_tokens_seen": 104020352, + "step": 6352, + "train_runtime": 51615.3913, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 3.8503030303030306, + "grad_norm": 0.005763235967606306, + "learning_rate": 6.858758635469391e-05, + "loss": 0.01165561843663454, + "num_input_tokens_seen": 104036728, + "step": 6353, + "train_runtime": 51623.5022, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.850909090909091, + "grad_norm": 0.005852431058883667, + "learning_rate": 6.857865903950047e-05, + "loss": 0.012789268046617508, + "num_input_tokens_seen": 104053104, + "step": 6354, + "train_runtime": 51631.6149, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.8515151515151516, + "grad_norm": 0.008734244853258133, + "learning_rate": 6.856973103711862e-05, + "loss": 0.011357048526406288, + "num_input_tokens_seen": 104069480, + "step": 6355, + "train_runtime": 51639.7473, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.852121212121212, + "grad_norm": 0.007288855966180563, + "learning_rate": 6.856080234787857e-05, + "loss": 0.012908563949167728, + "num_input_tokens_seen": 104085856, + "step": 6356, + "train_runtime": 51647.8697, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.8527272727272726, + "grad_norm": 0.007564263883978128, + "learning_rate": 6.855187297211059e-05, + "loss": 0.01242908276617527, + "num_input_tokens_seen": 104102232, + "step": 6357, + "train_runtime": 51656.0012, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.8533333333333335, + "grad_norm": 0.00795820914208889, + "learning_rate": 6.854294291014496e-05, + "loss": 0.012540027499198914, + "num_input_tokens_seen": 104118608, + "step": 6358, + "train_runtime": 51664.1204, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 3.853939393939394, + "grad_norm": 0.0042033325880765915, + "learning_rate": 6.853401216231197e-05, + "loss": 0.012046794407069683, + "num_input_tokens_seen": 104134984, + "step": 6359, + "train_runtime": 51672.2325, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.8545454545454545, + "grad_norm": 0.008583133108913898, + "learning_rate": 6.852508072894199e-05, + "loss": 0.012888654135167599, + "num_input_tokens_seen": 104151360, + "step": 6360, + "train_runtime": 51680.3449, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 3.855151515151515, + "grad_norm": 0.0042343223467469215, + "learning_rate": 6.851614861036533e-05, + "loss": 0.011959347873926163, + "num_input_tokens_seen": 104167736, + "step": 6361, + "train_runtime": 51688.4584, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.8557575757575755, + "grad_norm": 0.0066378978081047535, + "learning_rate": 6.850721580691239e-05, + "loss": 0.012275846675038338, + "num_input_tokens_seen": 104184112, + "step": 6362, + "train_runtime": 51696.5742, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 3.8563636363636364, + "grad_norm": 0.007078374270349741, + "learning_rate": 6.849828231891359e-05, + "loss": 0.011625605635344982, + "num_input_tokens_seen": 104200488, + "step": 6363, + "train_runtime": 51704.6851, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.856969696969697, + "grad_norm": 0.007989338599145412, + "learning_rate": 6.848934814669934e-05, + "loss": 0.011937669478356838, + "num_input_tokens_seen": 104216864, + "step": 6364, + "train_runtime": 51712.7985, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 3.8575757575757574, + "grad_norm": 0.005685557145625353, + "learning_rate": 6.848041329060012e-05, + "loss": 0.011166485957801342, + "num_input_tokens_seen": 104233240, + "step": 6365, + "train_runtime": 51720.9096, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.8581818181818184, + "grad_norm": 0.00899150874465704, + "learning_rate": 6.847147775094639e-05, + "loss": 0.012788347899913788, + "num_input_tokens_seen": 104249616, + "step": 6366, + "train_runtime": 51729.0306, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.858787878787879, + "grad_norm": 0.005294622387737036, + "learning_rate": 6.846254152806865e-05, + "loss": 0.011247401125729084, + "num_input_tokens_seen": 104265992, + "step": 6367, + "train_runtime": 51737.1439, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 3.8593939393939394, + "grad_norm": 0.007399433758109808, + "learning_rate": 6.84536046222975e-05, + "loss": 0.011965127661824226, + "num_input_tokens_seen": 104282368, + "step": 6368, + "train_runtime": 51745.2589, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.86, + "grad_norm": 0.006079590879380703, + "learning_rate": 6.844466703396343e-05, + "loss": 0.012255802750587463, + "num_input_tokens_seen": 104298744, + "step": 6369, + "train_runtime": 51753.375, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 3.8606060606060604, + "grad_norm": 0.012702642008662224, + "learning_rate": 6.843572876339705e-05, + "loss": 0.011705173179507256, + "num_input_tokens_seen": 104315120, + "step": 6370, + "train_runtime": 51761.4918, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.8612121212121213, + "grad_norm": 0.006723165512084961, + "learning_rate": 6.842678981092896e-05, + "loss": 0.013258319348096848, + "num_input_tokens_seen": 104331496, + "step": 6371, + "train_runtime": 51769.6088, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.861818181818182, + "grad_norm": 0.009000041522085667, + "learning_rate": 6.841785017688979e-05, + "loss": 0.012553805485367775, + "num_input_tokens_seen": 104347872, + "step": 6372, + "train_runtime": 51777.7314, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.8624242424242423, + "grad_norm": 0.006972875911742449, + "learning_rate": 6.840890986161022e-05, + "loss": 0.012424489483237267, + "num_input_tokens_seen": 104364248, + "step": 6373, + "train_runtime": 51785.8458, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 3.8630303030303033, + "grad_norm": 0.009140817448496819, + "learning_rate": 6.839996886542092e-05, + "loss": 0.01227598451077938, + "num_input_tokens_seen": 104380624, + "step": 6374, + "train_runtime": 51793.9636, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.8636363636363638, + "grad_norm": 0.002054952085018158, + "learning_rate": 6.839102718865262e-05, + "loss": 0.011070946231484413, + "num_input_tokens_seen": 104397000, + "step": 6375, + "train_runtime": 51802.0791, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.8642424242424243, + "grad_norm": 0.005203670356422663, + "learning_rate": 6.8382084831636e-05, + "loss": 0.011194390244781971, + "num_input_tokens_seen": 104413376, + "step": 6376, + "train_runtime": 51810.1988, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 3.8648484848484848, + "grad_norm": 0.005339459050446749, + "learning_rate": 6.837314179470189e-05, + "loss": 0.01167476549744606, + "num_input_tokens_seen": 104429752, + "step": 6377, + "train_runtime": 51818.3141, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.8654545454545453, + "grad_norm": 0.007752533070743084, + "learning_rate": 6.836419807818104e-05, + "loss": 0.013203997164964676, + "num_input_tokens_seen": 104446128, + "step": 6378, + "train_runtime": 51826.4322, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 3.866060606060606, + "grad_norm": 0.007299668621271849, + "learning_rate": 6.835525368240427e-05, + "loss": 0.012742544524371624, + "num_input_tokens_seen": 104462504, + "step": 6379, + "train_runtime": 51834.549, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 0.007718703243881464, + "learning_rate": 6.834630860770241e-05, + "loss": 0.011910384520888329, + "num_input_tokens_seen": 104478880, + "step": 6380, + "train_runtime": 51842.667, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.867272727272727, + "grad_norm": 0.007600679062306881, + "learning_rate": 6.833736285440632e-05, + "loss": 0.011816552840173244, + "num_input_tokens_seen": 104495256, + "step": 6381, + "train_runtime": 51850.7771, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 3.867878787878788, + "grad_norm": 0.005711911711841822, + "learning_rate": 6.832841642284689e-05, + "loss": 0.01151470560580492, + "num_input_tokens_seen": 104511632, + "step": 6382, + "train_runtime": 51858.887, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 3.8684848484848486, + "grad_norm": 0.008412271738052368, + "learning_rate": 6.831946931335502e-05, + "loss": 0.011796337552368641, + "num_input_tokens_seen": 104528008, + "step": 6383, + "train_runtime": 51866.9989, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.869090909090909, + "grad_norm": 0.009208939969539642, + "learning_rate": 6.831052152626166e-05, + "loss": 0.011944383382797241, + "num_input_tokens_seen": 104544384, + "step": 6384, + "train_runtime": 51875.1184, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.8696969696969696, + "grad_norm": 0.003875693306326866, + "learning_rate": 6.830157306189778e-05, + "loss": 0.01161068957298994, + "num_input_tokens_seen": 104560760, + "step": 6385, + "train_runtime": 51883.2341, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.87030303030303, + "grad_norm": 0.005553916096687317, + "learning_rate": 6.829262392059435e-05, + "loss": 0.01192825473845005, + "num_input_tokens_seen": 104577136, + "step": 6386, + "train_runtime": 51891.3502, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.870909090909091, + "grad_norm": 0.004811456892639399, + "learning_rate": 6.828367410268238e-05, + "loss": 0.011178002692759037, + "num_input_tokens_seen": 104593512, + "step": 6387, + "train_runtime": 51899.4637, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.8715151515151516, + "grad_norm": 0.006555966567248106, + "learning_rate": 6.82747236084929e-05, + "loss": 0.01272119302302599, + "num_input_tokens_seen": 104609888, + "step": 6388, + "train_runtime": 51907.582, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.872121212121212, + "grad_norm": 0.003667944809421897, + "learning_rate": 6.8265772438357e-05, + "loss": 0.011908757500350475, + "num_input_tokens_seen": 104626264, + "step": 6389, + "train_runtime": 51915.7132, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.8727272727272726, + "grad_norm": 0.005447254981845617, + "learning_rate": 6.825682059260573e-05, + "loss": 0.010992276482284069, + "num_input_tokens_seen": 104642640, + "step": 6390, + "train_runtime": 51923.8325, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.873333333333333, + "grad_norm": 0.004332001321017742, + "learning_rate": 6.824786807157022e-05, + "loss": 0.011789652518928051, + "num_input_tokens_seen": 104659016, + "step": 6391, + "train_runtime": 51931.964, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.873939393939394, + "grad_norm": 0.00913639459758997, + "learning_rate": 6.823891487558163e-05, + "loss": 0.01141613069921732, + "num_input_tokens_seen": 104675392, + "step": 6392, + "train_runtime": 51940.0922, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 3.8745454545454545, + "grad_norm": 0.0069297198206186295, + "learning_rate": 6.822996100497106e-05, + "loss": 0.012909138575196266, + "num_input_tokens_seen": 104691768, + "step": 6393, + "train_runtime": 51948.2369, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.875151515151515, + "grad_norm": 0.0031112697906792164, + "learning_rate": 6.822100646006977e-05, + "loss": 0.010947699658572674, + "num_input_tokens_seen": 104708144, + "step": 6394, + "train_runtime": 51956.3653, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.875757575757576, + "grad_norm": 0.0049369907937943935, + "learning_rate": 6.821205124120892e-05, + "loss": 0.012022101320326328, + "num_input_tokens_seen": 104724520, + "step": 6395, + "train_runtime": 51964.488, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.8763636363636365, + "grad_norm": 0.005992490332573652, + "learning_rate": 6.820309534871976e-05, + "loss": 0.011946536600589752, + "num_input_tokens_seen": 104740896, + "step": 6396, + "train_runtime": 51972.6142, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.876969696969697, + "grad_norm": 0.006365968380123377, + "learning_rate": 6.819413878293354e-05, + "loss": 0.011827570386230946, + "num_input_tokens_seen": 104757272, + "step": 6397, + "train_runtime": 51980.7401, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.8775757575757575, + "grad_norm": 0.014044038020074368, + "learning_rate": 6.818518154418158e-05, + "loss": 0.012429771013557911, + "num_input_tokens_seen": 104773648, + "step": 6398, + "train_runtime": 51988.869, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.878181818181818, + "grad_norm": 0.006978933699429035, + "learning_rate": 6.817622363279515e-05, + "loss": 0.012561517767608166, + "num_input_tokens_seen": 104790024, + "step": 6399, + "train_runtime": 51996.9944, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.878787878787879, + "grad_norm": 0.013232513330876827, + "learning_rate": 6.816726504910563e-05, + "loss": 0.013023456558585167, + "num_input_tokens_seen": 104806400, + "step": 6400, + "train_runtime": 52005.1313, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 3.8793939393939394, + "grad_norm": 0.004022569861263037, + "learning_rate": 6.815830579344434e-05, + "loss": 0.012012426741421223, + "num_input_tokens_seen": 104822776, + "step": 6401, + "train_runtime": 52014.2544, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 3.88, + "grad_norm": 0.005915552377700806, + "learning_rate": 6.814934586614268e-05, + "loss": 0.012157132849097252, + "num_input_tokens_seen": 104839152, + "step": 6402, + "train_runtime": 52022.3761, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 3.880606060606061, + "grad_norm": 0.008586011826992035, + "learning_rate": 6.814038526753205e-05, + "loss": 0.013256721198558807, + "num_input_tokens_seen": 104855528, + "step": 6403, + "train_runtime": 52030.4897, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 3.8812121212121213, + "grad_norm": 0.010426871478557587, + "learning_rate": 6.813142399794391e-05, + "loss": 0.012904987670481205, + "num_input_tokens_seen": 104871904, + "step": 6404, + "train_runtime": 52038.6062, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 3.881818181818182, + "grad_norm": 0.009036079049110413, + "learning_rate": 6.81224620577097e-05, + "loss": 0.012641535140573978, + "num_input_tokens_seen": 104888280, + "step": 6405, + "train_runtime": 52046.7223, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 3.8824242424242423, + "grad_norm": 0.0039884429425001144, + "learning_rate": 6.811349944716092e-05, + "loss": 0.013186887837946415, + "num_input_tokens_seen": 104904656, + "step": 6406, + "train_runtime": 52054.835, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 3.883030303030303, + "grad_norm": 0.00914225447922945, + "learning_rate": 6.810453616662905e-05, + "loss": 0.012560753151774406, + "num_input_tokens_seen": 104921032, + "step": 6407, + "train_runtime": 52062.9463, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 3.8836363636363638, + "grad_norm": 0.011460991576313972, + "learning_rate": 6.809557221644567e-05, + "loss": 0.013554160483181477, + "num_input_tokens_seen": 104937408, + "step": 6408, + "train_runtime": 52071.0547, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 3.8842424242424243, + "grad_norm": 0.0044858260080218315, + "learning_rate": 6.80866075969423e-05, + "loss": 0.011682888492941856, + "num_input_tokens_seen": 104953784, + "step": 6409, + "train_runtime": 52079.1712, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 3.8848484848484848, + "grad_norm": 0.005311533343046904, + "learning_rate": 6.807764230845053e-05, + "loss": 0.011333282105624676, + "num_input_tokens_seen": 104970160, + "step": 6410, + "train_runtime": 52087.2836, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 3.8854545454545457, + "grad_norm": 0.0043438877910375595, + "learning_rate": 6.806867635130199e-05, + "loss": 0.012564531527459621, + "num_input_tokens_seen": 104986536, + "step": 6411, + "train_runtime": 52095.4014, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 3.886060606060606, + "grad_norm": 0.007464016787707806, + "learning_rate": 6.805970972582829e-05, + "loss": 0.01323576457798481, + "num_input_tokens_seen": 105002912, + "step": 6412, + "train_runtime": 52103.5144, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 3.8866666666666667, + "grad_norm": 0.007808825001120567, + "learning_rate": 6.805074243236109e-05, + "loss": 0.012675788253545761, + "num_input_tokens_seen": 105019288, + "step": 6413, + "train_runtime": 52111.6304, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 3.887272727272727, + "grad_norm": 0.005568093154579401, + "learning_rate": 6.80417744712321e-05, + "loss": 0.0123039111495018, + "num_input_tokens_seen": 105035664, + "step": 6414, + "train_runtime": 52119.7491, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 3.8878787878787877, + "grad_norm": 0.004107446409761906, + "learning_rate": 6.803280584277298e-05, + "loss": 0.0110032819211483, + "num_input_tokens_seen": 105052040, + "step": 6415, + "train_runtime": 52127.8785, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 3.8884848484848487, + "grad_norm": 0.007254080846905708, + "learning_rate": 6.80238365473155e-05, + "loss": 0.012588896788656712, + "num_input_tokens_seen": 105068416, + "step": 6416, + "train_runtime": 52135.9936, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 3.889090909090909, + "grad_norm": 0.006639031693339348, + "learning_rate": 6.801486658519141e-05, + "loss": 0.011394668370485306, + "num_input_tokens_seen": 105084792, + "step": 6417, + "train_runtime": 52144.1102, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 3.8896969696969697, + "grad_norm": 0.003557785414159298, + "learning_rate": 6.800589595673251e-05, + "loss": 0.012601395137608051, + "num_input_tokens_seen": 105101168, + "step": 6418, + "train_runtime": 52152.2319, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.89030303030303, + "grad_norm": 0.006912960670888424, + "learning_rate": 6.799692466227056e-05, + "loss": 0.011831115931272507, + "num_input_tokens_seen": 105117544, + "step": 6419, + "train_runtime": 52160.3491, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.8909090909090907, + "grad_norm": 0.007772157434374094, + "learning_rate": 6.79879527021374e-05, + "loss": 0.012751709669828415, + "num_input_tokens_seen": 105133920, + "step": 6420, + "train_runtime": 52168.4768, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.8915151515151516, + "grad_norm": 0.021773459389805794, + "learning_rate": 6.797898007666492e-05, + "loss": 0.012764612212777138, + "num_input_tokens_seen": 105150296, + "step": 6421, + "train_runtime": 52176.604, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.892121212121212, + "grad_norm": 0.004894225392490625, + "learning_rate": 6.797000678618496e-05, + "loss": 0.011554277502000332, + "num_input_tokens_seen": 105166672, + "step": 6422, + "train_runtime": 52184.7204, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.8927272727272726, + "grad_norm": 0.006482499185949564, + "learning_rate": 6.796103283102948e-05, + "loss": 0.012733708135783672, + "num_input_tokens_seen": 105183048, + "step": 6423, + "train_runtime": 52192.8396, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.8933333333333335, + "grad_norm": 0.005839161574840546, + "learning_rate": 6.795205821153034e-05, + "loss": 0.010899435728788376, + "num_input_tokens_seen": 105199424, + "step": 6424, + "train_runtime": 52200.9562, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 3.893939393939394, + "grad_norm": 0.00740810576826334, + "learning_rate": 6.794308292801954e-05, + "loss": 0.012887225486338139, + "num_input_tokens_seen": 105215800, + "step": 6425, + "train_runtime": 52209.07, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 3.8945454545454545, + "grad_norm": 0.006400708574801683, + "learning_rate": 6.793410698082903e-05, + "loss": 0.011559346690773964, + "num_input_tokens_seen": 105232176, + "step": 6426, + "train_runtime": 52217.1872, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 3.895151515151515, + "grad_norm": 0.011151198297739029, + "learning_rate": 6.792513037029084e-05, + "loss": 0.012091712094843388, + "num_input_tokens_seen": 105248552, + "step": 6427, + "train_runtime": 52225.3088, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 3.8957575757575755, + "grad_norm": 0.00812437105923891, + "learning_rate": 6.791615309673701e-05, + "loss": 0.012318650260567665, + "num_input_tokens_seen": 105264928, + "step": 6428, + "train_runtime": 52233.4354, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 3.8963636363636365, + "grad_norm": 0.006974482908844948, + "learning_rate": 6.790717516049953e-05, + "loss": 0.011715607717633247, + "num_input_tokens_seen": 105281304, + "step": 6429, + "train_runtime": 52241.5507, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 3.896969696969697, + "grad_norm": 0.003282958874478936, + "learning_rate": 6.789819656191053e-05, + "loss": 0.011212784796953201, + "num_input_tokens_seen": 105297680, + "step": 6430, + "train_runtime": 52249.6632, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.8975757575757575, + "grad_norm": 0.0062975506298244, + "learning_rate": 6.788921730130209e-05, + "loss": 0.011493801139295101, + "num_input_tokens_seen": 105314056, + "step": 6431, + "train_runtime": 52257.7765, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.8981818181818184, + "grad_norm": 0.000841248722281307, + "learning_rate": 6.788023737900632e-05, + "loss": 0.01135325524955988, + "num_input_tokens_seen": 105330432, + "step": 6432, + "train_runtime": 52265.8895, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.898787878787879, + "grad_norm": 0.007343157194554806, + "learning_rate": 6.787125679535543e-05, + "loss": 0.012223826721310616, + "num_input_tokens_seen": 105346808, + "step": 6433, + "train_runtime": 52274.005, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.8993939393939394, + "grad_norm": 0.008465384133160114, + "learning_rate": 6.786227555068152e-05, + "loss": 0.013353883288800716, + "num_input_tokens_seen": 105363184, + "step": 6434, + "train_runtime": 52282.1187, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.9, + "grad_norm": 0.006089374888688326, + "learning_rate": 6.785329364531685e-05, + "loss": 0.012924669310450554, + "num_input_tokens_seen": 105379560, + "step": 6435, + "train_runtime": 52290.2358, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.9006060606060604, + "grad_norm": 0.007352704182267189, + "learning_rate": 6.784431107959359e-05, + "loss": 0.013641802594065666, + "num_input_tokens_seen": 105395936, + "step": 6436, + "train_runtime": 52298.354, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.9012121212121214, + "grad_norm": 0.006390717811882496, + "learning_rate": 6.783532785384404e-05, + "loss": 0.011246193200349808, + "num_input_tokens_seen": 105412312, + "step": 6437, + "train_runtime": 52306.4664, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.901818181818182, + "grad_norm": 0.005773629527539015, + "learning_rate": 6.782634396840043e-05, + "loss": 0.011313146911561489, + "num_input_tokens_seen": 105428688, + "step": 6438, + "train_runtime": 52314.5914, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.9024242424242424, + "grad_norm": 0.006826214957982302, + "learning_rate": 6.781735942359506e-05, + "loss": 0.011850197799503803, + "num_input_tokens_seen": 105445064, + "step": 6439, + "train_runtime": 52322.7332, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.9030303030303033, + "grad_norm": 0.0051016537472605705, + "learning_rate": 6.780837421976028e-05, + "loss": 0.012787675485014915, + "num_input_tokens_seen": 105461440, + "step": 6440, + "train_runtime": 52330.8755, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.903636363636364, + "grad_norm": 0.007134228944778442, + "learning_rate": 6.779938835722841e-05, + "loss": 0.011355869472026825, + "num_input_tokens_seen": 105477816, + "step": 6441, + "train_runtime": 52338.9995, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.9042424242424243, + "grad_norm": 0.007024009712040424, + "learning_rate": 6.779040183633184e-05, + "loss": 0.012895681895315647, + "num_input_tokens_seen": 105494192, + "step": 6442, + "train_runtime": 52347.1457, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.904848484848485, + "grad_norm": 0.014631570316851139, + "learning_rate": 6.778141465740293e-05, + "loss": 0.013918263837695122, + "num_input_tokens_seen": 105510568, + "step": 6443, + "train_runtime": 52355.2734, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.9054545454545453, + "grad_norm": 0.008308978751301765, + "learning_rate": 6.777242682077413e-05, + "loss": 0.011238180100917816, + "num_input_tokens_seen": 105526944, + "step": 6444, + "train_runtime": 52363.3933, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.9060606060606062, + "grad_norm": 0.005170789547264576, + "learning_rate": 6.776343832677788e-05, + "loss": 0.011283459141850471, + "num_input_tokens_seen": 105543320, + "step": 6445, + "train_runtime": 52371.5205, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.9066666666666667, + "grad_norm": 0.0068200635723769665, + "learning_rate": 6.775444917574663e-05, + "loss": 0.010923531837761402, + "num_input_tokens_seen": 105559696, + "step": 6446, + "train_runtime": 52379.6349, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.9072727272727272, + "grad_norm": 0.007320295087993145, + "learning_rate": 6.774545936801289e-05, + "loss": 0.012196102179586887, + "num_input_tokens_seen": 105576072, + "step": 6447, + "train_runtime": 52387.7455, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.9078787878787877, + "grad_norm": 0.006725286599248648, + "learning_rate": 6.773646890390915e-05, + "loss": 0.012775799259543419, + "num_input_tokens_seen": 105592448, + "step": 6448, + "train_runtime": 52395.8548, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.9084848484848482, + "grad_norm": 0.001916606561280787, + "learning_rate": 6.772747778376798e-05, + "loss": 0.01150575652718544, + "num_input_tokens_seen": 105608824, + "step": 6449, + "train_runtime": 52403.9684, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.909090909090909, + "grad_norm": 0.009450610727071762, + "learning_rate": 6.771848600792193e-05, + "loss": 0.012322013266384602, + "num_input_tokens_seen": 105625200, + "step": 6450, + "train_runtime": 52412.0951, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.9096969696969697, + "grad_norm": 0.003345611272379756, + "learning_rate": 6.770949357670358e-05, + "loss": 0.011517246253788471, + "num_input_tokens_seen": 105641576, + "step": 6451, + "train_runtime": 52420.2098, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.91030303030303, + "grad_norm": 0.004177030175924301, + "learning_rate": 6.770050049044553e-05, + "loss": 0.01244444865733385, + "num_input_tokens_seen": 105657952, + "step": 6452, + "train_runtime": 52428.3218, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.910909090909091, + "grad_norm": 0.009247804991900921, + "learning_rate": 6.769150674948046e-05, + "loss": 0.013496419414877892, + "num_input_tokens_seen": 105674328, + "step": 6453, + "train_runtime": 52436.4385, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.9115151515151516, + "grad_norm": 0.007861394435167313, + "learning_rate": 6.768251235414101e-05, + "loss": 0.011463810689747334, + "num_input_tokens_seen": 105690704, + "step": 6454, + "train_runtime": 52444.5518, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.912121212121212, + "grad_norm": 0.008108093403279781, + "learning_rate": 6.767351730475987e-05, + "loss": 0.012271877378225327, + "num_input_tokens_seen": 105707080, + "step": 6455, + "train_runtime": 52452.6673, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.9127272727272726, + "grad_norm": 0.009381504729390144, + "learning_rate": 6.766452160166974e-05, + "loss": 0.012597257271409035, + "num_input_tokens_seen": 105723456, + "step": 6456, + "train_runtime": 52460.782, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.913333333333333, + "grad_norm": 0.005490145646035671, + "learning_rate": 6.765552524520333e-05, + "loss": 0.011732572689652443, + "num_input_tokens_seen": 105739832, + "step": 6457, + "train_runtime": 52468.8954, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.913939393939394, + "grad_norm": 0.009379802271723747, + "learning_rate": 6.764652823569344e-05, + "loss": 0.013085374608635902, + "num_input_tokens_seen": 105756208, + "step": 6458, + "train_runtime": 52477.0074, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.9145454545454546, + "grad_norm": 0.005488468334078789, + "learning_rate": 6.763753057347284e-05, + "loss": 0.011455253697931767, + "num_input_tokens_seen": 105772584, + "step": 6459, + "train_runtime": 52485.1221, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.915151515151515, + "grad_norm": 0.010084799490869045, + "learning_rate": 6.762853225887433e-05, + "loss": 0.01138480193912983, + "num_input_tokens_seen": 105788960, + "step": 6460, + "train_runtime": 52493.2348, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.915757575757576, + "grad_norm": 0.008909350261092186, + "learning_rate": 6.761953329223075e-05, + "loss": 0.011331785470247269, + "num_input_tokens_seen": 105805336, + "step": 6461, + "train_runtime": 52501.3503, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.9163636363636365, + "grad_norm": 0.005921931006014347, + "learning_rate": 6.761053367387493e-05, + "loss": 0.010896963067352772, + "num_input_tokens_seen": 105821712, + "step": 6462, + "train_runtime": 52509.5093, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.916969696969697, + "grad_norm": 0.001961705507710576, + "learning_rate": 6.760153340413978e-05, + "loss": 0.011161498725414276, + "num_input_tokens_seen": 105838088, + "step": 6463, + "train_runtime": 52517.6316, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.9175757575757575, + "grad_norm": 0.007815143093466759, + "learning_rate": 6.759253248335817e-05, + "loss": 0.011892015114426613, + "num_input_tokens_seen": 105854464, + "step": 6464, + "train_runtime": 52525.7454, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.918181818181818, + "grad_norm": 0.008270752616226673, + "learning_rate": 6.758353091186306e-05, + "loss": 0.013711986131966114, + "num_input_tokens_seen": 105870840, + "step": 6465, + "train_runtime": 52533.8638, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.918787878787879, + "grad_norm": 0.00747511675581336, + "learning_rate": 6.757452868998737e-05, + "loss": 0.012306815013289452, + "num_input_tokens_seen": 105887216, + "step": 6466, + "train_runtime": 52541.979, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.9193939393939394, + "grad_norm": 0.005521410144865513, + "learning_rate": 6.756552581806409e-05, + "loss": 0.013175075873732567, + "num_input_tokens_seen": 105903592, + "step": 6467, + "train_runtime": 52550.0964, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.92, + "grad_norm": 0.0027819282840937376, + "learning_rate": 6.755652229642623e-05, + "loss": 0.011434970423579216, + "num_input_tokens_seen": 105919968, + "step": 6468, + "train_runtime": 52558.2112, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.920606060606061, + "grad_norm": 0.006007140502333641, + "learning_rate": 6.754751812540679e-05, + "loss": 0.011644032783806324, + "num_input_tokens_seen": 105936344, + "step": 6469, + "train_runtime": 52566.3322, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.9212121212121214, + "grad_norm": 0.0076867761090397835, + "learning_rate": 6.753851330533883e-05, + "loss": 0.012008686549961567, + "num_input_tokens_seen": 105952720, + "step": 6470, + "train_runtime": 52574.4551, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.921818181818182, + "grad_norm": 0.002013271674513817, + "learning_rate": 6.752950783655541e-05, + "loss": 0.012147719971835613, + "num_input_tokens_seen": 105969096, + "step": 6471, + "train_runtime": 52582.5869, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.9224242424242424, + "grad_norm": 0.0025515782181173563, + "learning_rate": 6.752050171938964e-05, + "loss": 0.011507188901305199, + "num_input_tokens_seen": 105985472, + "step": 6472, + "train_runtime": 52590.7018, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.923030303030303, + "grad_norm": 0.00877495389431715, + "learning_rate": 6.751149495417464e-05, + "loss": 0.01312980242073536, + "num_input_tokens_seen": 106001848, + "step": 6473, + "train_runtime": 52598.8449, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 3.923636363636364, + "grad_norm": 0.005692520644515753, + "learning_rate": 6.75024875412435e-05, + "loss": 0.012474870309233665, + "num_input_tokens_seen": 106018224, + "step": 6474, + "train_runtime": 52606.9796, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.9242424242424243, + "grad_norm": 0.007139783352613449, + "learning_rate": 6.749347948092947e-05, + "loss": 0.012518535368144512, + "num_input_tokens_seen": 106034600, + "step": 6475, + "train_runtime": 52615.1203, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 3.924848484848485, + "grad_norm": 0.006498995702713728, + "learning_rate": 6.74844707735657e-05, + "loss": 0.012302231974899769, + "num_input_tokens_seen": 106050976, + "step": 6476, + "train_runtime": 52623.2645, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 3.9254545454545453, + "grad_norm": 0.006589885801076889, + "learning_rate": 6.747546141948541e-05, + "loss": 0.011309774592518806, + "num_input_tokens_seen": 106067352, + "step": 6477, + "train_runtime": 52631.4034, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.926060606060606, + "grad_norm": 0.006797242909669876, + "learning_rate": 6.746645141902181e-05, + "loss": 0.012881754897534847, + "num_input_tokens_seen": 106083728, + "step": 6478, + "train_runtime": 52639.5374, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 3.9266666666666667, + "grad_norm": 0.004559420980513096, + "learning_rate": 6.74574407725082e-05, + "loss": 0.011311120353639126, + "num_input_tokens_seen": 106100104, + "step": 6479, + "train_runtime": 52647.6793, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.9272727272727272, + "grad_norm": 0.0061518121510744095, + "learning_rate": 6.744842948027786e-05, + "loss": 0.01222110353410244, + "num_input_tokens_seen": 106116480, + "step": 6480, + "train_runtime": 52655.821, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 3.9278787878787877, + "grad_norm": 0.01024465449154377, + "learning_rate": 6.743941754266408e-05, + "loss": 0.013755915686488152, + "num_input_tokens_seen": 106132856, + "step": 6481, + "train_runtime": 52663.9604, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.9284848484848487, + "grad_norm": 0.005509643815457821, + "learning_rate": 6.743040496000022e-05, + "loss": 0.011449964717030525, + "num_input_tokens_seen": 106149232, + "step": 6482, + "train_runtime": 52672.0956, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.929090909090909, + "grad_norm": 0.0058688754215836525, + "learning_rate": 6.742139173261961e-05, + "loss": 0.011872684583067894, + "num_input_tokens_seen": 106165608, + "step": 6483, + "train_runtime": 52680.2341, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 3.9296969696969697, + "grad_norm": 0.010958590544760227, + "learning_rate": 6.741237786085565e-05, + "loss": 0.01297104824334383, + "num_input_tokens_seen": 106181984, + "step": 6484, + "train_runtime": 52688.3733, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 3.93030303030303, + "grad_norm": 0.007707979995757341, + "learning_rate": 6.740336334504175e-05, + "loss": 0.012075908482074738, + "num_input_tokens_seen": 106198360, + "step": 6485, + "train_runtime": 52696.5167, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.9309090909090907, + "grad_norm": 0.005613269750028849, + "learning_rate": 6.739434818551132e-05, + "loss": 0.010788967832922935, + "num_input_tokens_seen": 106214736, + "step": 6486, + "train_runtime": 52704.6586, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 3.9315151515151516, + "grad_norm": 0.006677389144897461, + "learning_rate": 6.738533238259784e-05, + "loss": 0.013803089037537575, + "num_input_tokens_seen": 106231112, + "step": 6487, + "train_runtime": 52712.8008, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 3.932121212121212, + "grad_norm": 0.0058305589482188225, + "learning_rate": 6.737631593663476e-05, + "loss": 0.011969965882599354, + "num_input_tokens_seen": 106247488, + "step": 6488, + "train_runtime": 52720.9429, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.9327272727272726, + "grad_norm": 0.009964353404939175, + "learning_rate": 6.73672988479556e-05, + "loss": 0.011583378538489342, + "num_input_tokens_seen": 106263864, + "step": 6489, + "train_runtime": 52729.0884, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 0.00428033946081996, + "learning_rate": 6.735828111689386e-05, + "loss": 0.012437012046575546, + "num_input_tokens_seen": 106280240, + "step": 6490, + "train_runtime": 52737.2329, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 3.933939393939394, + "grad_norm": 0.008684828877449036, + "learning_rate": 6.734926274378312e-05, + "loss": 0.01261742040514946, + "num_input_tokens_seen": 106296616, + "step": 6491, + "train_runtime": 52745.3754, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 3.9345454545454546, + "grad_norm": 0.008188855834305286, + "learning_rate": 6.734024372895694e-05, + "loss": 0.010879598557949066, + "num_input_tokens_seen": 106312992, + "step": 6492, + "train_runtime": 52753.5194, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 3.935151515151515, + "grad_norm": 0.00736241927370429, + "learning_rate": 6.73312240727489e-05, + "loss": 0.012937352061271667, + "num_input_tokens_seen": 106329368, + "step": 6493, + "train_runtime": 52761.6605, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.9357575757575756, + "grad_norm": 0.008125126361846924, + "learning_rate": 6.732220377549264e-05, + "loss": 0.013051779940724373, + "num_input_tokens_seen": 106345744, + "step": 6494, + "train_runtime": 52769.7993, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 3.9363636363636365, + "grad_norm": 0.005721298977732658, + "learning_rate": 6.73131828375218e-05, + "loss": 0.011094626039266586, + "num_input_tokens_seen": 106362120, + "step": 6495, + "train_runtime": 52777.9721, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 3.936969696969697, + "grad_norm": 0.007404430769383907, + "learning_rate": 6.730416125917006e-05, + "loss": 0.012684456072747707, + "num_input_tokens_seen": 106378496, + "step": 6496, + "train_runtime": 52786.1225, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 3.9375757575757575, + "grad_norm": 0.00823443103581667, + "learning_rate": 6.729513904077106e-05, + "loss": 0.012651203200221062, + "num_input_tokens_seen": 106394872, + "step": 6497, + "train_runtime": 52794.2811, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 3.9381818181818184, + "grad_norm": 0.009745478630065918, + "learning_rate": 6.728611618265855e-05, + "loss": 0.01294238492846489, + "num_input_tokens_seen": 106411248, + "step": 6498, + "train_runtime": 52802.4309, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 3.938787878787879, + "grad_norm": 0.0038547185249626637, + "learning_rate": 6.727709268516626e-05, + "loss": 0.013034317642450333, + "num_input_tokens_seen": 106427624, + "step": 6499, + "train_runtime": 52810.5824, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 3.9393939393939394, + "grad_norm": 0.006211794447153807, + "learning_rate": 6.726806854862799e-05, + "loss": 0.012010117992758751, + "num_input_tokens_seen": 106444000, + "step": 6500, + "train_runtime": 52818.7315, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 3.94, + "grad_norm": 0.010988638736307621, + "learning_rate": 6.725904377337747e-05, + "loss": 0.012165017426013947, + "num_input_tokens_seen": 106460376, + "step": 6501, + "train_runtime": 52827.8318, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 3.9406060606060604, + "grad_norm": 0.00883723795413971, + "learning_rate": 6.725001835974853e-05, + "loss": 0.011251474730670452, + "num_input_tokens_seen": 106476752, + "step": 6502, + "train_runtime": 52835.9886, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 3.9412121212121214, + "grad_norm": 0.00840666051954031, + "learning_rate": 6.724099230807502e-05, + "loss": 0.013662639074027538, + "num_input_tokens_seen": 106493128, + "step": 6503, + "train_runtime": 52844.1423, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 3.941818181818182, + "grad_norm": 0.00553783169016242, + "learning_rate": 6.723196561869077e-05, + "loss": 0.01225048117339611, + "num_input_tokens_seen": 106509504, + "step": 6504, + "train_runtime": 52852.2995, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 3.9424242424242424, + "grad_norm": 0.009378945454955101, + "learning_rate": 6.722293829192967e-05, + "loss": 0.01132411789149046, + "num_input_tokens_seen": 106525880, + "step": 6505, + "train_runtime": 52860.4663, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 3.943030303030303, + "grad_norm": 0.004780018702149391, + "learning_rate": 6.721391032812562e-05, + "loss": 0.01158009935170412, + "num_input_tokens_seen": 106542256, + "step": 6506, + "train_runtime": 52868.6125, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 3.9436363636363634, + "grad_norm": 0.004121264908462763, + "learning_rate": 6.720488172761254e-05, + "loss": 0.012397784739732742, + "num_input_tokens_seen": 106558632, + "step": 6507, + "train_runtime": 52876.7675, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 3.9442424242424243, + "grad_norm": 0.007843032479286194, + "learning_rate": 6.719585249072441e-05, + "loss": 0.012046072632074356, + "num_input_tokens_seen": 106575008, + "step": 6508, + "train_runtime": 52884.9142, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.944848484848485, + "grad_norm": 0.006719939410686493, + "learning_rate": 6.718682261779519e-05, + "loss": 0.012912501581013203, + "num_input_tokens_seen": 106591384, + "step": 6509, + "train_runtime": 52893.0644, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 3.9454545454545453, + "grad_norm": 0.021725714206695557, + "learning_rate": 6.717779210915885e-05, + "loss": 0.011549402959644794, + "num_input_tokens_seen": 106607760, + "step": 6510, + "train_runtime": 52901.1999, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 3.9460606060606063, + "grad_norm": 0.008273362182080746, + "learning_rate": 6.716876096514943e-05, + "loss": 0.011496607214212418, + "num_input_tokens_seen": 106624136, + "step": 6511, + "train_runtime": 52909.3328, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 3.9466666666666668, + "grad_norm": 0.010103907436132431, + "learning_rate": 6.715972918610098e-05, + "loss": 0.012423638254404068, + "num_input_tokens_seen": 106640512, + "step": 6512, + "train_runtime": 52917.4554, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 3.9472727272727273, + "grad_norm": 0.009598693810403347, + "learning_rate": 6.715069677234758e-05, + "loss": 0.012506000697612762, + "num_input_tokens_seen": 106656888, + "step": 6513, + "train_runtime": 52925.5754, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 3.9478787878787878, + "grad_norm": 0.003666672622784972, + "learning_rate": 6.714166372422329e-05, + "loss": 0.011248378083109856, + "num_input_tokens_seen": 106673264, + "step": 6514, + "train_runtime": 52933.6942, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 3.9484848484848483, + "grad_norm": 0.008831469342112541, + "learning_rate": 6.713263004206224e-05, + "loss": 0.013473467901349068, + "num_input_tokens_seen": 106689640, + "step": 6515, + "train_runtime": 52941.8142, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 3.949090909090909, + "grad_norm": 0.008161789737641811, + "learning_rate": 6.712359572619857e-05, + "loss": 0.01249471865594387, + "num_input_tokens_seen": 106706016, + "step": 6516, + "train_runtime": 52949.9389, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.9496969696969697, + "grad_norm": 0.0024966206401586533, + "learning_rate": 6.711456077696646e-05, + "loss": 0.012160527519881725, + "num_input_tokens_seen": 106722392, + "step": 6517, + "train_runtime": 52958.0746, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 3.95030303030303, + "grad_norm": 0.005908643361181021, + "learning_rate": 6.710552519470006e-05, + "loss": 0.013484635390341282, + "num_input_tokens_seen": 106738768, + "step": 6518, + "train_runtime": 52966.1949, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 3.950909090909091, + "grad_norm": 0.007787010632455349, + "learning_rate": 6.70964889797336e-05, + "loss": 0.013033602386713028, + "num_input_tokens_seen": 106755144, + "step": 6519, + "train_runtime": 52974.3212, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 3.9515151515151516, + "grad_norm": 0.011320007964968681, + "learning_rate": 6.708745213240129e-05, + "loss": 0.012367863208055496, + "num_input_tokens_seen": 106771520, + "step": 6520, + "train_runtime": 52982.4438, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.952121212121212, + "grad_norm": 0.0053223189897835255, + "learning_rate": 6.707841465303741e-05, + "loss": 0.011887633241713047, + "num_input_tokens_seen": 106787896, + "step": 6521, + "train_runtime": 52990.5703, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.9527272727272726, + "grad_norm": 0.005421468988060951, + "learning_rate": 6.706937654197625e-05, + "loss": 0.011847789399325848, + "num_input_tokens_seen": 106804272, + "step": 6522, + "train_runtime": 52998.6953, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.953333333333333, + "grad_norm": 0.005767639260739088, + "learning_rate": 6.706033779955208e-05, + "loss": 0.011867949739098549, + "num_input_tokens_seen": 106820648, + "step": 6523, + "train_runtime": 53006.8188, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.953939393939394, + "grad_norm": 0.008243339136242867, + "learning_rate": 6.705129842609923e-05, + "loss": 0.011928509920835495, + "num_input_tokens_seen": 106837024, + "step": 6524, + "train_runtime": 53014.9417, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.9545454545454546, + "grad_norm": 0.008509257808327675, + "learning_rate": 6.704225842195206e-05, + "loss": 0.012889298610389233, + "num_input_tokens_seen": 106853400, + "step": 6525, + "train_runtime": 53023.0635, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.955151515151515, + "grad_norm": 0.006973634473979473, + "learning_rate": 6.703321778744495e-05, + "loss": 0.012101742438971996, + "num_input_tokens_seen": 106869776, + "step": 6526, + "train_runtime": 53031.1822, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.955757575757576, + "grad_norm": 0.01658928208053112, + "learning_rate": 6.702417652291227e-05, + "loss": 0.012578731402754784, + "num_input_tokens_seen": 106886152, + "step": 6527, + "train_runtime": 53039.3049, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.9563636363636365, + "grad_norm": 0.005565772298723459, + "learning_rate": 6.701513462868847e-05, + "loss": 0.011648627929389477, + "num_input_tokens_seen": 106902528, + "step": 6528, + "train_runtime": 53047.4301, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.956969696969697, + "grad_norm": 0.005399870686233044, + "learning_rate": 6.700609210510795e-05, + "loss": 0.012053496204316616, + "num_input_tokens_seen": 106918904, + "step": 6529, + "train_runtime": 53055.558, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.9575757575757575, + "grad_norm": 0.006925160065293312, + "learning_rate": 6.699704895250523e-05, + "loss": 0.013775073923170567, + "num_input_tokens_seen": 106935280, + "step": 6530, + "train_runtime": 53063.6814, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.958181818181818, + "grad_norm": 0.00795537605881691, + "learning_rate": 6.698800517121475e-05, + "loss": 0.011371806263923645, + "num_input_tokens_seen": 106951656, + "step": 6531, + "train_runtime": 53071.8075, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.958787878787879, + "grad_norm": 0.039060328155756, + "learning_rate": 6.697896076157106e-05, + "loss": 0.011892224662005901, + "num_input_tokens_seen": 106968032, + "step": 6532, + "train_runtime": 53079.9331, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.9593939393939395, + "grad_norm": 0.006919092033058405, + "learning_rate": 6.696991572390865e-05, + "loss": 0.011820215731859207, + "num_input_tokens_seen": 106984408, + "step": 6533, + "train_runtime": 53088.0618, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.96, + "grad_norm": 0.008267408236861229, + "learning_rate": 6.696087005856213e-05, + "loss": 0.012296671979129314, + "num_input_tokens_seen": 107000784, + "step": 6534, + "train_runtime": 53096.1901, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.9606060606060605, + "grad_norm": 0.005555623676627874, + "learning_rate": 6.695182376586603e-05, + "loss": 0.01222485862672329, + "num_input_tokens_seen": 107017160, + "step": 6535, + "train_runtime": 53104.3132, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 3.961212121212121, + "grad_norm": 0.008479633368551731, + "learning_rate": 6.6942776846155e-05, + "loss": 0.013767086900770664, + "num_input_tokens_seen": 107033536, + "step": 6536, + "train_runtime": 53112.4341, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 3.961818181818182, + "grad_norm": 0.008189888671040535, + "learning_rate": 6.693372929976365e-05, + "loss": 0.011569686233997345, + "num_input_tokens_seen": 107049912, + "step": 6537, + "train_runtime": 53120.556, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 3.9624242424242424, + "grad_norm": 0.009570608846843243, + "learning_rate": 6.692468112702663e-05, + "loss": 0.012308922596275806, + "num_input_tokens_seen": 107066288, + "step": 6538, + "train_runtime": 53128.673, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 3.963030303030303, + "grad_norm": 0.0053880163468420506, + "learning_rate": 6.69156323282786e-05, + "loss": 0.012178171426057816, + "num_input_tokens_seen": 107082664, + "step": 6539, + "train_runtime": 53136.7885, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 3.963636363636364, + "grad_norm": 0.007569990120828152, + "learning_rate": 6.690658290385428e-05, + "loss": 0.013003873638808727, + "num_input_tokens_seen": 107099040, + "step": 6540, + "train_runtime": 53144.9036, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 3.9642424242424243, + "grad_norm": 0.14633625745773315, + "learning_rate": 6.68975328540884e-05, + "loss": 0.012509978376328945, + "num_input_tokens_seen": 107115416, + "step": 6541, + "train_runtime": 53153.0207, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 3.964848484848485, + "grad_norm": 0.001463325577788055, + "learning_rate": 6.688848217931567e-05, + "loss": 0.011499492451548576, + "num_input_tokens_seen": 107131792, + "step": 6542, + "train_runtime": 53161.1356, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 3.9654545454545453, + "grad_norm": 0.006117834243923426, + "learning_rate": 6.687943087987087e-05, + "loss": 0.012589800171554089, + "num_input_tokens_seen": 107148168, + "step": 6543, + "train_runtime": 53169.2557, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 3.966060606060606, + "grad_norm": 0.007931923493742943, + "learning_rate": 6.687037895608879e-05, + "loss": 0.01133272610604763, + "num_input_tokens_seen": 107164544, + "step": 6544, + "train_runtime": 53177.3796, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 3.966666666666667, + "grad_norm": 0.00783488154411316, + "learning_rate": 6.686132640830426e-05, + "loss": 0.013222197070717812, + "num_input_tokens_seen": 107180920, + "step": 6545, + "train_runtime": 53185.5008, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 3.9672727272727273, + "grad_norm": 0.007313394919037819, + "learning_rate": 6.685227323685209e-05, + "loss": 0.012269326485693455, + "num_input_tokens_seen": 107197296, + "step": 6546, + "train_runtime": 53193.6136, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 3.967878787878788, + "grad_norm": 0.011161161586642265, + "learning_rate": 6.684321944206715e-05, + "loss": 0.011687691323459148, + "num_input_tokens_seen": 107213672, + "step": 6547, + "train_runtime": 53201.7325, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 3.9684848484848487, + "grad_norm": 0.007492467295378447, + "learning_rate": 6.683416502428434e-05, + "loss": 0.012194931507110596, + "num_input_tokens_seen": 107230048, + "step": 6548, + "train_runtime": 53209.8514, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 3.9690909090909092, + "grad_norm": 0.006346522830426693, + "learning_rate": 6.682510998383854e-05, + "loss": 0.01259647123515606, + "num_input_tokens_seen": 107246424, + "step": 6549, + "train_runtime": 53217.9708, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 3.9696969696969697, + "grad_norm": 0.0065385340712964535, + "learning_rate": 6.681605432106467e-05, + "loss": 0.012597965076565742, + "num_input_tokens_seen": 107262800, + "step": 6550, + "train_runtime": 53226.0878, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 3.9703030303030302, + "grad_norm": 0.0026469065342098475, + "learning_rate": 6.680699803629773e-05, + "loss": 0.011563367210328579, + "num_input_tokens_seen": 107279176, + "step": 6551, + "train_runtime": 53234.2043, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 3.9709090909090907, + "grad_norm": 0.010749312117695808, + "learning_rate": 6.679794112987265e-05, + "loss": 0.011805864050984383, + "num_input_tokens_seen": 107295552, + "step": 6552, + "train_runtime": 53242.3323, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 3.9715151515151517, + "grad_norm": 0.007881419733166695, + "learning_rate": 6.678888360212444e-05, + "loss": 0.011975004337728024, + "num_input_tokens_seen": 107311928, + "step": 6553, + "train_runtime": 53250.4506, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 3.972121212121212, + "grad_norm": 0.004547844175249338, + "learning_rate": 6.677982545338812e-05, + "loss": 0.011275541968643665, + "num_input_tokens_seen": 107328304, + "step": 6554, + "train_runtime": 53258.5666, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 3.9727272727272727, + "grad_norm": 0.008543103002011776, + "learning_rate": 6.677076668399872e-05, + "loss": 0.010835188440978527, + "num_input_tokens_seen": 107344680, + "step": 6555, + "train_runtime": 53266.6796, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 3.9733333333333336, + "grad_norm": 0.006401778198778629, + "learning_rate": 6.676170729429133e-05, + "loss": 0.011799863539636135, + "num_input_tokens_seen": 107361056, + "step": 6556, + "train_runtime": 53274.7901, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 3.973939393939394, + "grad_norm": 0.0037875298876315355, + "learning_rate": 6.675264728460103e-05, + "loss": 0.012104997411370277, + "num_input_tokens_seen": 107377432, + "step": 6557, + "train_runtime": 53282.9044, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 3.9745454545454546, + "grad_norm": 0.005360961891710758, + "learning_rate": 6.674358665526292e-05, + "loss": 0.01124458760023117, + "num_input_tokens_seen": 107393808, + "step": 6558, + "train_runtime": 53291.019, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 3.975151515151515, + "grad_norm": 0.006977549288421869, + "learning_rate": 6.673452540661217e-05, + "loss": 0.011890798807144165, + "num_input_tokens_seen": 107410184, + "step": 6559, + "train_runtime": 53299.1337, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 3.9757575757575756, + "grad_norm": 0.006494495086371899, + "learning_rate": 6.67254635389839e-05, + "loss": 0.011936687864363194, + "num_input_tokens_seen": 107426560, + "step": 6560, + "train_runtime": 53307.2482, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 3.9763636363636365, + "grad_norm": 0.008515489287674427, + "learning_rate": 6.671640105271331e-05, + "loss": 0.01286142598837614, + "num_input_tokens_seen": 107442936, + "step": 6561, + "train_runtime": 53315.3618, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 3.976969696969697, + "grad_norm": 0.008803606033325195, + "learning_rate": 6.670733794813558e-05, + "loss": 0.01312145497649908, + "num_input_tokens_seen": 107459312, + "step": 6562, + "train_runtime": 53323.474, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 3.9775757575757575, + "grad_norm": 0.0055271293967962265, + "learning_rate": 6.669827422558598e-05, + "loss": 0.011393346823751926, + "num_input_tokens_seen": 107475688, + "step": 6563, + "train_runtime": 53331.5883, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 3.978181818181818, + "grad_norm": 0.007407458499073982, + "learning_rate": 6.668920988539973e-05, + "loss": 0.013487188145518303, + "num_input_tokens_seen": 107492064, + "step": 6564, + "train_runtime": 53339.7037, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 3.9787878787878785, + "grad_norm": 0.004199409857392311, + "learning_rate": 6.66801449279121e-05, + "loss": 0.011213931255042553, + "num_input_tokens_seen": 107508440, + "step": 6565, + "train_runtime": 53347.8162, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 3.9793939393939395, + "grad_norm": 0.007545046042650938, + "learning_rate": 6.667107935345839e-05, + "loss": 0.012426524423062801, + "num_input_tokens_seen": 107524816, + "step": 6566, + "train_runtime": 53355.9303, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 3.98, + "grad_norm": 0.007890257053077221, + "learning_rate": 6.666201316237392e-05, + "loss": 0.01217838004231453, + "num_input_tokens_seen": 107541192, + "step": 6567, + "train_runtime": 53364.0509, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 3.9806060606060605, + "grad_norm": 0.007131775841116905, + "learning_rate": 6.665294635499404e-05, + "loss": 0.012047805823385715, + "num_input_tokens_seen": 107557568, + "step": 6568, + "train_runtime": 53372.1645, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 3.9812121212121214, + "grad_norm": 0.011000075377523899, + "learning_rate": 6.664387893165411e-05, + "loss": 0.01179931964725256, + "num_input_tokens_seen": 107573944, + "step": 6569, + "train_runtime": 53380.2785, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 3.981818181818182, + "grad_norm": 0.0030005166772753, + "learning_rate": 6.66348108926895e-05, + "loss": 0.012781776487827301, + "num_input_tokens_seen": 107590320, + "step": 6570, + "train_runtime": 53388.3898, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 3.9824242424242424, + "grad_norm": 0.008846698328852654, + "learning_rate": 6.662574223843562e-05, + "loss": 0.010764230974018574, + "num_input_tokens_seen": 107606696, + "step": 6571, + "train_runtime": 53396.5064, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 3.983030303030303, + "grad_norm": 0.008736904710531235, + "learning_rate": 6.661667296922794e-05, + "loss": 0.01231537014245987, + "num_input_tokens_seen": 107623072, + "step": 6572, + "train_runtime": 53404.6209, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 3.9836363636363634, + "grad_norm": 0.006713247857987881, + "learning_rate": 6.660760308540186e-05, + "loss": 0.01137951947748661, + "num_input_tokens_seen": 107639448, + "step": 6573, + "train_runtime": 53412.7319, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 3.9842424242424244, + "grad_norm": 0.005238441750407219, + "learning_rate": 6.65985325872929e-05, + "loss": 0.011133607476949692, + "num_input_tokens_seen": 107655824, + "step": 6574, + "train_runtime": 53420.8439, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 3.984848484848485, + "grad_norm": 0.007687445729970932, + "learning_rate": 6.658946147523652e-05, + "loss": 0.012262530624866486, + "num_input_tokens_seen": 107672200, + "step": 6575, + "train_runtime": 53428.9587, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 3.9854545454545454, + "grad_norm": 0.0073543572798371315, + "learning_rate": 6.658038974956829e-05, + "loss": 0.012886738404631615, + "num_input_tokens_seen": 107688576, + "step": 6576, + "train_runtime": 53437.0764, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 3.9860606060606063, + "grad_norm": 0.008610925637185574, + "learning_rate": 6.657131741062373e-05, + "loss": 0.01309305801987648, + "num_input_tokens_seen": 107704952, + "step": 6577, + "train_runtime": 53445.193, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 3.986666666666667, + "grad_norm": 0.00890811812132597, + "learning_rate": 6.656224445873841e-05, + "loss": 0.012318043038249016, + "num_input_tokens_seen": 107721328, + "step": 6578, + "train_runtime": 53453.308, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 3.9872727272727273, + "grad_norm": 0.006160541903227568, + "learning_rate": 6.65531708942479e-05, + "loss": 0.013037342578172684, + "num_input_tokens_seen": 107737704, + "step": 6579, + "train_runtime": 53461.4314, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 3.987878787878788, + "grad_norm": 0.0072151911444962025, + "learning_rate": 6.654409671748786e-05, + "loss": 0.01261911727488041, + "num_input_tokens_seen": 107754080, + "step": 6580, + "train_runtime": 53469.5439, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 3.9884848484848483, + "grad_norm": 0.00529538793489337, + "learning_rate": 6.653502192879389e-05, + "loss": 0.0121742133051157, + "num_input_tokens_seen": 107770456, + "step": 6581, + "train_runtime": 53477.6602, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 3.9890909090909092, + "grad_norm": 0.008605719543993473, + "learning_rate": 6.652594652850167e-05, + "loss": 0.010680843144655228, + "num_input_tokens_seen": 107786832, + "step": 6582, + "train_runtime": 53485.7729, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 3.9896969696969697, + "grad_norm": 0.018976807594299316, + "learning_rate": 6.651687051694686e-05, + "loss": 0.012957514263689518, + "num_input_tokens_seen": 107803208, + "step": 6583, + "train_runtime": 53493.884, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 3.9903030303030302, + "grad_norm": 0.006058837287127972, + "learning_rate": 6.650779389446517e-05, + "loss": 0.01278536207973957, + "num_input_tokens_seen": 107819584, + "step": 6584, + "train_runtime": 53501.9965, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 3.990909090909091, + "grad_norm": 0.004997039679437876, + "learning_rate": 6.649871666139234e-05, + "loss": 0.011976274661719799, + "num_input_tokens_seen": 107835960, + "step": 6585, + "train_runtime": 53510.1137, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 3.9915151515151512, + "grad_norm": 0.006643078755587339, + "learning_rate": 6.64896388180641e-05, + "loss": 0.012317109853029251, + "num_input_tokens_seen": 107852336, + "step": 6586, + "train_runtime": 53518.2312, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 3.992121212121212, + "grad_norm": 0.006515262182801962, + "learning_rate": 6.648056036481625e-05, + "loss": 0.011833401396870613, + "num_input_tokens_seen": 107868712, + "step": 6587, + "train_runtime": 53526.3456, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 3.9927272727272727, + "grad_norm": 0.0014909404562786222, + "learning_rate": 6.647148130198455e-05, + "loss": 0.010653373785316944, + "num_input_tokens_seen": 107885088, + "step": 6588, + "train_runtime": 53534.4581, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 3.993333333333333, + "grad_norm": 0.008263724856078625, + "learning_rate": 6.646240162990483e-05, + "loss": 0.010792214423418045, + "num_input_tokens_seen": 107901464, + "step": 6589, + "train_runtime": 53542.5702, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 3.993939393939394, + "grad_norm": 0.0031657519284635782, + "learning_rate": 6.645332134891296e-05, + "loss": 0.012245271354913712, + "num_input_tokens_seen": 107917840, + "step": 6590, + "train_runtime": 53550.6853, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 3.9945454545454546, + "grad_norm": 0.008295486681163311, + "learning_rate": 6.644424045934475e-05, + "loss": 0.013156255707144737, + "num_input_tokens_seen": 107934216, + "step": 6591, + "train_runtime": 53558.799, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 3.995151515151515, + "grad_norm": 0.0070218718610703945, + "learning_rate": 6.643515896153612e-05, + "loss": 0.011916803196072578, + "num_input_tokens_seen": 107950592, + "step": 6592, + "train_runtime": 53566.9134, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 3.9957575757575756, + "grad_norm": 0.010510032065212727, + "learning_rate": 6.642607685582297e-05, + "loss": 0.012985512614250183, + "num_input_tokens_seen": 107966968, + "step": 6593, + "train_runtime": 53575.0329, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 3.996363636363636, + "grad_norm": 0.006769612431526184, + "learning_rate": 6.64169941425412e-05, + "loss": 0.01392223872244358, + "num_input_tokens_seen": 107983344, + "step": 6594, + "train_runtime": 53583.1489, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 3.996969696969697, + "grad_norm": 0.005791705101728439, + "learning_rate": 6.640791082202683e-05, + "loss": 0.010839714668691158, + "num_input_tokens_seen": 107999720, + "step": 6595, + "train_runtime": 53591.2604, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 3.9975757575757576, + "grad_norm": 0.007450724486261606, + "learning_rate": 6.639882689461576e-05, + "loss": 0.011279881000518799, + "num_input_tokens_seen": 108016096, + "step": 6596, + "train_runtime": 53599.3735, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 3.998181818181818, + "grad_norm": 0.007003985345363617, + "learning_rate": 6.638974236064403e-05, + "loss": 0.01161157712340355, + "num_input_tokens_seen": 108032472, + "step": 6597, + "train_runtime": 53607.4887, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 3.998787878787879, + "grad_norm": 0.004140882752835751, + "learning_rate": 6.638065722044763e-05, + "loss": 0.011944196186959743, + "num_input_tokens_seen": 108048848, + "step": 6598, + "train_runtime": 53615.605, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 3.9993939393939395, + "grad_norm": 0.006332708988338709, + "learning_rate": 6.637157147436264e-05, + "loss": 0.012131761759519577, + "num_input_tokens_seen": 108065224, + "step": 6599, + "train_runtime": 53623.7194, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.0, + "grad_norm": 0.0039033929351717234, + "learning_rate": 6.63624851227251e-05, + "loss": 0.011713503859937191, + "num_input_tokens_seen": 108081600, + "step": 6600, + "train_runtime": 53631.8334, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.000606060606061, + "grad_norm": 0.009881045669317245, + "learning_rate": 6.635339816587109e-05, + "loss": 0.013164076954126358, + "num_input_tokens_seen": 108097976, + "step": 6601, + "train_runtime": 53640.9798, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.001212121212121, + "grad_norm": 0.0047542802058160305, + "learning_rate": 6.634431060413673e-05, + "loss": 0.012297019362449646, + "num_input_tokens_seen": 108114352, + "step": 6602, + "train_runtime": 53649.0907, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.001818181818182, + "grad_norm": 0.005976487882435322, + "learning_rate": 6.633522243785815e-05, + "loss": 0.011966832913458347, + "num_input_tokens_seen": 108130728, + "step": 6603, + "train_runtime": 53657.2015, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.002424242424242, + "grad_norm": 0.009098093025386333, + "learning_rate": 6.632613366737151e-05, + "loss": 0.012944905087351799, + "num_input_tokens_seen": 108147104, + "step": 6604, + "train_runtime": 53665.315, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.003030303030303, + "grad_norm": 0.005986894015222788, + "learning_rate": 6.631704429301299e-05, + "loss": 0.011274673976004124, + "num_input_tokens_seen": 108163480, + "step": 6605, + "train_runtime": 53673.4316, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.003636363636364, + "grad_norm": 0.01032146904617548, + "learning_rate": 6.630795431511876e-05, + "loss": 0.012444308958947659, + "num_input_tokens_seen": 108179856, + "step": 6606, + "train_runtime": 53681.5401, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.004242424242424, + "grad_norm": 0.009597398340702057, + "learning_rate": 6.629886373402505e-05, + "loss": 0.012027861550450325, + "num_input_tokens_seen": 108196232, + "step": 6607, + "train_runtime": 53689.6533, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.004848484848485, + "grad_norm": 0.0036844569258391857, + "learning_rate": 6.628977255006812e-05, + "loss": 0.011003529652953148, + "num_input_tokens_seen": 108212608, + "step": 6608, + "train_runtime": 53697.7709, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.005454545454546, + "grad_norm": 0.007400908507406712, + "learning_rate": 6.628068076358425e-05, + "loss": 0.010667973197996616, + "num_input_tokens_seen": 108228984, + "step": 6609, + "train_runtime": 53705.887, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.006060606060606, + "grad_norm": 0.0029918153304606676, + "learning_rate": 6.627158837490968e-05, + "loss": 0.012130324728786945, + "num_input_tokens_seen": 108245360, + "step": 6610, + "train_runtime": 53714.0001, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.006666666666667, + "grad_norm": 0.0039725457318127155, + "learning_rate": 6.626249538438074e-05, + "loss": 0.012393374927341938, + "num_input_tokens_seen": 108261736, + "step": 6611, + "train_runtime": 53722.1184, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.007272727272727, + "grad_norm": 0.006581656634807587, + "learning_rate": 6.625340179233377e-05, + "loss": 0.011652831919491291, + "num_input_tokens_seen": 108278112, + "step": 6612, + "train_runtime": 53730.2314, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.007878787878788, + "grad_norm": 0.00787278451025486, + "learning_rate": 6.624430759910512e-05, + "loss": 0.01055954210460186, + "num_input_tokens_seen": 108294488, + "step": 6613, + "train_runtime": 53738.3458, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.008484848484849, + "grad_norm": 0.007201340980827808, + "learning_rate": 6.623521280503117e-05, + "loss": 0.012225551530718803, + "num_input_tokens_seen": 108310864, + "step": 6614, + "train_runtime": 53746.4585, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.009090909090909, + "grad_norm": 0.005736085586249828, + "learning_rate": 6.622611741044832e-05, + "loss": 0.013558316975831985, + "num_input_tokens_seen": 108327240, + "step": 6615, + "train_runtime": 53754.575, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.00969696969697, + "grad_norm": 0.01177095714956522, + "learning_rate": 6.621702141569298e-05, + "loss": 0.012200889177620411, + "num_input_tokens_seen": 108343616, + "step": 6616, + "train_runtime": 53762.6913, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.010303030303031, + "grad_norm": 0.004176735877990723, + "learning_rate": 6.62079248211016e-05, + "loss": 0.011988180689513683, + "num_input_tokens_seen": 108359992, + "step": 6617, + "train_runtime": 53770.8096, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.010909090909091, + "grad_norm": 0.009272183291614056, + "learning_rate": 6.619882762701064e-05, + "loss": 0.010586785152554512, + "num_input_tokens_seen": 108376368, + "step": 6618, + "train_runtime": 53778.9211, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.011515151515152, + "grad_norm": 0.005420962814241648, + "learning_rate": 6.61897298337566e-05, + "loss": 0.011043875478208065, + "num_input_tokens_seen": 108392744, + "step": 6619, + "train_runtime": 53787.0384, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.012121212121212, + "grad_norm": 0.0049265362322330475, + "learning_rate": 6.618063144167597e-05, + "loss": 0.011725038290023804, + "num_input_tokens_seen": 108409120, + "step": 6620, + "train_runtime": 53795.1584, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.012727272727273, + "grad_norm": 0.01013525016605854, + "learning_rate": 6.61715324511053e-05, + "loss": 0.012159437872469425, + "num_input_tokens_seen": 108425496, + "step": 6621, + "train_runtime": 53803.2702, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.013333333333334, + "grad_norm": 0.007110030390322208, + "learning_rate": 6.616243286238115e-05, + "loss": 0.012408256530761719, + "num_input_tokens_seen": 108441872, + "step": 6622, + "train_runtime": 53811.3799, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.013939393939394, + "grad_norm": 0.008874112740159035, + "learning_rate": 6.615333267584006e-05, + "loss": 0.010794769041240215, + "num_input_tokens_seen": 108458248, + "step": 6623, + "train_runtime": 53819.4979, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.014545454545455, + "grad_norm": 0.005947944708168507, + "learning_rate": 6.614423189181866e-05, + "loss": 0.01310049556195736, + "num_input_tokens_seen": 108474624, + "step": 6624, + "train_runtime": 53827.6141, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.015151515151516, + "grad_norm": 0.004656651522964239, + "learning_rate": 6.613513051065357e-05, + "loss": 0.012744910083711147, + "num_input_tokens_seen": 108491000, + "step": 6625, + "train_runtime": 53835.734, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.015757575757576, + "grad_norm": 0.0010660771513357759, + "learning_rate": 6.612602853268142e-05, + "loss": 0.011014383286237717, + "num_input_tokens_seen": 108507376, + "step": 6626, + "train_runtime": 53843.8515, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.016363636363637, + "grad_norm": 0.006187736056745052, + "learning_rate": 6.611692595823888e-05, + "loss": 0.012588555924594402, + "num_input_tokens_seen": 108523752, + "step": 6627, + "train_runtime": 53851.9669, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.016969696969697, + "grad_norm": 0.00965807493776083, + "learning_rate": 6.610782278766264e-05, + "loss": 0.012436773627996445, + "num_input_tokens_seen": 108540128, + "step": 6628, + "train_runtime": 53860.0798, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.017575757575758, + "grad_norm": 0.0038265727926045656, + "learning_rate": 6.609871902128937e-05, + "loss": 0.011527267284691334, + "num_input_tokens_seen": 108556504, + "step": 6629, + "train_runtime": 53868.1916, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.0181818181818185, + "grad_norm": 0.008593316189944744, + "learning_rate": 6.608961465945587e-05, + "loss": 0.012637588195502758, + "num_input_tokens_seen": 108572880, + "step": 6630, + "train_runtime": 53876.3049, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.018787878787879, + "grad_norm": 0.0031953139696270227, + "learning_rate": 6.608050970249884e-05, + "loss": 0.012620000168681145, + "num_input_tokens_seen": 108589256, + "step": 6631, + "train_runtime": 53884.4214, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.0193939393939395, + "grad_norm": 0.007425799500197172, + "learning_rate": 6.607140415075508e-05, + "loss": 0.012627394869923592, + "num_input_tokens_seen": 108605632, + "step": 6632, + "train_runtime": 53892.5338, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.02, + "grad_norm": 0.006978773046284914, + "learning_rate": 6.606229800456137e-05, + "loss": 0.010098385624587536, + "num_input_tokens_seen": 108622008, + "step": 6633, + "train_runtime": 53900.6449, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.0206060606060605, + "grad_norm": 0.00918311532586813, + "learning_rate": 6.605319126425454e-05, + "loss": 0.011539066210389137, + "num_input_tokens_seen": 108638384, + "step": 6634, + "train_runtime": 53908.7607, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.0212121212121215, + "grad_norm": 0.008134511299431324, + "learning_rate": 6.604408393017143e-05, + "loss": 0.011699188500642776, + "num_input_tokens_seen": 108654760, + "step": 6635, + "train_runtime": 53916.8727, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.0218181818181815, + "grad_norm": 0.012781014665961266, + "learning_rate": 6.603497600264892e-05, + "loss": 0.012668032199144363, + "num_input_tokens_seen": 108671136, + "step": 6636, + "train_runtime": 53924.9862, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.0224242424242425, + "grad_norm": 0.007021512370556593, + "learning_rate": 6.602586748202384e-05, + "loss": 0.012430474162101746, + "num_input_tokens_seen": 108687512, + "step": 6637, + "train_runtime": 53933.1005, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.023030303030303, + "grad_norm": 0.007708024699240923, + "learning_rate": 6.601675836863315e-05, + "loss": 0.012040534988045692, + "num_input_tokens_seen": 108703888, + "step": 6638, + "train_runtime": 53941.2179, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.0236363636363635, + "grad_norm": 0.004515843465924263, + "learning_rate": 6.600764866281375e-05, + "loss": 0.012569981627166271, + "num_input_tokens_seen": 108720264, + "step": 6639, + "train_runtime": 53949.3428, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.024242424242424, + "grad_norm": 0.005732097662985325, + "learning_rate": 6.59985383649026e-05, + "loss": 0.012136969715356827, + "num_input_tokens_seen": 108736640, + "step": 6640, + "train_runtime": 53957.4548, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.0248484848484845, + "grad_norm": 0.006303023546934128, + "learning_rate": 6.598942747523668e-05, + "loss": 0.01229165680706501, + "num_input_tokens_seen": 108753016, + "step": 6641, + "train_runtime": 53965.5679, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.025454545454545, + "grad_norm": 0.006611714139580727, + "learning_rate": 6.598031599415298e-05, + "loss": 0.01384616270661354, + "num_input_tokens_seen": 108769392, + "step": 6642, + "train_runtime": 53973.6832, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.026060606060606, + "grad_norm": 0.004815991967916489, + "learning_rate": 6.59712039219885e-05, + "loss": 0.012132415547966957, + "num_input_tokens_seen": 108785768, + "step": 6643, + "train_runtime": 53981.7935, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.026666666666666, + "grad_norm": 0.011119725182652473, + "learning_rate": 6.596209125908028e-05, + "loss": 0.012316416017711163, + "num_input_tokens_seen": 108802144, + "step": 6644, + "train_runtime": 53989.9045, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.027272727272727, + "grad_norm": 0.006024129688739777, + "learning_rate": 6.59529780057654e-05, + "loss": 0.012201709672808647, + "num_input_tokens_seen": 108818520, + "step": 6645, + "train_runtime": 53998.0302, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.027878787878788, + "grad_norm": 0.0074819717556238174, + "learning_rate": 6.594386416238094e-05, + "loss": 0.011587687768042088, + "num_input_tokens_seen": 108834896, + "step": 6646, + "train_runtime": 54006.143, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.028484848484848, + "grad_norm": 0.007597306277602911, + "learning_rate": 6.593474972926399e-05, + "loss": 0.012362157925963402, + "num_input_tokens_seen": 108851272, + "step": 6647, + "train_runtime": 54014.2596, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.029090909090909, + "grad_norm": 0.006195442751049995, + "learning_rate": 6.592563470675166e-05, + "loss": 0.011100806295871735, + "num_input_tokens_seen": 108867648, + "step": 6648, + "train_runtime": 54022.3739, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.029696969696969, + "grad_norm": 0.007398346904665232, + "learning_rate": 6.591651909518113e-05, + "loss": 0.012423758395016193, + "num_input_tokens_seen": 108884024, + "step": 6649, + "train_runtime": 54030.4904, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.03030303030303, + "grad_norm": 0.006924089975655079, + "learning_rate": 6.590740289488956e-05, + "loss": 0.01178696658462286, + "num_input_tokens_seen": 108900400, + "step": 6650, + "train_runtime": 54038.6042, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.030909090909091, + "grad_norm": 0.005651682615280151, + "learning_rate": 6.589828610621413e-05, + "loss": 0.012482096441090107, + "num_input_tokens_seen": 108916776, + "step": 6651, + "train_runtime": 54046.7183, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.031515151515151, + "grad_norm": 0.0069713713601231575, + "learning_rate": 6.588916872949204e-05, + "loss": 0.013284913264214993, + "num_input_tokens_seen": 108933152, + "step": 6652, + "train_runtime": 54054.8357, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.032121212121212, + "grad_norm": 0.008618372492492199, + "learning_rate": 6.588005076506057e-05, + "loss": 0.012459351681172848, + "num_input_tokens_seen": 108949528, + "step": 6653, + "train_runtime": 54062.9535, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.032727272727273, + "grad_norm": 0.007754590827971697, + "learning_rate": 6.587093221325694e-05, + "loss": 0.01096649281680584, + "num_input_tokens_seen": 108965904, + "step": 6654, + "train_runtime": 54071.0679, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.033333333333333, + "grad_norm": 0.007360334042459726, + "learning_rate": 6.586181307441843e-05, + "loss": 0.011917858384549618, + "num_input_tokens_seen": 108982280, + "step": 6655, + "train_runtime": 54079.1823, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.033939393939394, + "grad_norm": 0.008598837070167065, + "learning_rate": 6.585269334888234e-05, + "loss": 0.011998066678643227, + "num_input_tokens_seen": 108998656, + "step": 6656, + "train_runtime": 54087.3019, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.034545454545454, + "grad_norm": 0.005692543461918831, + "learning_rate": 6.584357303698601e-05, + "loss": 0.011975091882050037, + "num_input_tokens_seen": 109015032, + "step": 6657, + "train_runtime": 54095.4168, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.035151515151515, + "grad_norm": 0.005801883060485125, + "learning_rate": 6.583445213906674e-05, + "loss": 0.011816740967333317, + "num_input_tokens_seen": 109031408, + "step": 6658, + "train_runtime": 54103.5321, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.035757575757576, + "grad_norm": 0.005026319995522499, + "learning_rate": 6.582533065546196e-05, + "loss": 0.01234055869281292, + "num_input_tokens_seen": 109047784, + "step": 6659, + "train_runtime": 54111.649, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.036363636363636, + "grad_norm": 0.008256614208221436, + "learning_rate": 6.581620858650898e-05, + "loss": 0.011411121115088463, + "num_input_tokens_seen": 109064160, + "step": 6660, + "train_runtime": 54119.7638, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.036969696969697, + "grad_norm": 0.007858906872570515, + "learning_rate": 6.580708593254526e-05, + "loss": 0.011611396446824074, + "num_input_tokens_seen": 109080536, + "step": 6661, + "train_runtime": 54127.8756, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.037575757575757, + "grad_norm": 0.009773586876690388, + "learning_rate": 6.579796269390823e-05, + "loss": 0.011992339044809341, + "num_input_tokens_seen": 109096912, + "step": 6662, + "train_runtime": 54135.9887, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.038181818181818, + "grad_norm": 0.009351823478937149, + "learning_rate": 6.57888388709353e-05, + "loss": 0.011521403677761555, + "num_input_tokens_seen": 109113288, + "step": 6663, + "train_runtime": 54144.1022, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.038787878787879, + "grad_norm": 0.005667380057275295, + "learning_rate": 6.577971446396398e-05, + "loss": 0.0116248968988657, + "num_input_tokens_seen": 109129664, + "step": 6664, + "train_runtime": 54152.2158, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.039393939393939, + "grad_norm": 0.007068190257996321, + "learning_rate": 6.577058947333175e-05, + "loss": 0.013068378902971745, + "num_input_tokens_seen": 109146040, + "step": 6665, + "train_runtime": 54160.3315, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.04, + "grad_norm": 0.0073351748287677765, + "learning_rate": 6.576146389937613e-05, + "loss": 0.011988786049187183, + "num_input_tokens_seen": 109162416, + "step": 6666, + "train_runtime": 54168.4456, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.040606060606061, + "grad_norm": 0.009006847627460957, + "learning_rate": 6.575233774243465e-05, + "loss": 0.012093137949705124, + "num_input_tokens_seen": 109178792, + "step": 6667, + "train_runtime": 54176.5606, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.041212121212121, + "grad_norm": 0.005302943754941225, + "learning_rate": 6.574321100284486e-05, + "loss": 0.012514556758105755, + "num_input_tokens_seen": 109195168, + "step": 6668, + "train_runtime": 54184.6735, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.041818181818182, + "grad_norm": 0.009446139447391033, + "learning_rate": 6.573408368094438e-05, + "loss": 0.012095226906239986, + "num_input_tokens_seen": 109211544, + "step": 6669, + "train_runtime": 54192.7851, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.042424242424242, + "grad_norm": 0.005150752142071724, + "learning_rate": 6.572495577707078e-05, + "loss": 0.010624253191053867, + "num_input_tokens_seen": 109227920, + "step": 6670, + "train_runtime": 54200.8991, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.043030303030303, + "grad_norm": 0.011307463981211185, + "learning_rate": 6.571582729156168e-05, + "loss": 0.011881678365170956, + "num_input_tokens_seen": 109244296, + "step": 6671, + "train_runtime": 54209.0155, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.043636363636364, + "grad_norm": 0.005510623566806316, + "learning_rate": 6.570669822475473e-05, + "loss": 0.012312116101384163, + "num_input_tokens_seen": 109260672, + "step": 6672, + "train_runtime": 54217.1306, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.044242424242424, + "grad_norm": 0.00725524453446269, + "learning_rate": 6.569756857698761e-05, + "loss": 0.012830747291445732, + "num_input_tokens_seen": 109277048, + "step": 6673, + "train_runtime": 54225.2403, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.044848484848485, + "grad_norm": 0.010276025161147118, + "learning_rate": 6.568843834859799e-05, + "loss": 0.012600078247487545, + "num_input_tokens_seen": 109293424, + "step": 6674, + "train_runtime": 54233.3534, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.045454545454546, + "grad_norm": 0.006128996144980192, + "learning_rate": 6.567930753992359e-05, + "loss": 0.011292570270597935, + "num_input_tokens_seen": 109309800, + "step": 6675, + "train_runtime": 54241.4692, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.046060606060606, + "grad_norm": 0.005186568945646286, + "learning_rate": 6.567017615130214e-05, + "loss": 0.011593570001423359, + "num_input_tokens_seen": 109326176, + "step": 6676, + "train_runtime": 54249.5827, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.046666666666667, + "grad_norm": 0.011170115321874619, + "learning_rate": 6.566104418307138e-05, + "loss": 0.011976173147559166, + "num_input_tokens_seen": 109342552, + "step": 6677, + "train_runtime": 54257.6945, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.047272727272727, + "grad_norm": 0.008233655244112015, + "learning_rate": 6.565191163556912e-05, + "loss": 0.012446227483451366, + "num_input_tokens_seen": 109358928, + "step": 6678, + "train_runtime": 54265.8041, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.047878787878788, + "grad_norm": 0.0072286734357476234, + "learning_rate": 6.56427785091331e-05, + "loss": 0.012182018719613552, + "num_input_tokens_seen": 109375304, + "step": 6679, + "train_runtime": 54273.9151, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.048484848484849, + "grad_norm": 0.0027835702057927847, + "learning_rate": 6.563364480410119e-05, + "loss": 0.011755697429180145, + "num_input_tokens_seen": 109391680, + "step": 6680, + "train_runtime": 54282.0302, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.049090909090909, + "grad_norm": 0.005155585240572691, + "learning_rate": 6.562451052081118e-05, + "loss": 0.010823588818311691, + "num_input_tokens_seen": 109408056, + "step": 6681, + "train_runtime": 54290.1398, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.04969696969697, + "grad_norm": 0.005723264999687672, + "learning_rate": 6.561537565960098e-05, + "loss": 0.011575661599636078, + "num_input_tokens_seen": 109424432, + "step": 6682, + "train_runtime": 54298.2557, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.050303030303031, + "grad_norm": 0.004240747075527906, + "learning_rate": 6.560624022080842e-05, + "loss": 0.012635022401809692, + "num_input_tokens_seen": 109440808, + "step": 6683, + "train_runtime": 54306.3752, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.050909090909091, + "grad_norm": 0.0038895977195352316, + "learning_rate": 6.559710420477143e-05, + "loss": 0.012097565457224846, + "num_input_tokens_seen": 109457184, + "step": 6684, + "train_runtime": 54314.489, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.051515151515152, + "grad_norm": 0.0069579132832586765, + "learning_rate": 6.558796761182794e-05, + "loss": 0.01237529143691063, + "num_input_tokens_seen": 109473560, + "step": 6685, + "train_runtime": 54322.6036, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.052121212121212, + "grad_norm": 0.006064839661121368, + "learning_rate": 6.557883044231588e-05, + "loss": 0.012685137800872326, + "num_input_tokens_seen": 109489936, + "step": 6686, + "train_runtime": 54330.7215, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.052727272727273, + "grad_norm": 0.009405514225363731, + "learning_rate": 6.556969269657323e-05, + "loss": 0.013490073382854462, + "num_input_tokens_seen": 109506312, + "step": 6687, + "train_runtime": 54338.8328, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.053333333333334, + "grad_norm": 0.007618871051818132, + "learning_rate": 6.556055437493794e-05, + "loss": 0.012695473618805408, + "num_input_tokens_seen": 109522688, + "step": 6688, + "train_runtime": 54346.9549, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.053939393939394, + "grad_norm": 0.008057769387960434, + "learning_rate": 6.555141547774807e-05, + "loss": 0.011798663064837456, + "num_input_tokens_seen": 109539064, + "step": 6689, + "train_runtime": 54355.0654, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.054545454545455, + "grad_norm": 0.007940909825265408, + "learning_rate": 6.554227600534162e-05, + "loss": 0.011824453249573708, + "num_input_tokens_seen": 109555440, + "step": 6690, + "train_runtime": 54363.1771, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.055151515151515, + "grad_norm": 0.007136265281587839, + "learning_rate": 6.553313595805666e-05, + "loss": 0.011803132481873035, + "num_input_tokens_seen": 109571816, + "step": 6691, + "train_runtime": 54371.285, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.055757575757576, + "grad_norm": 0.009932974353432655, + "learning_rate": 6.552399533623123e-05, + "loss": 0.012506055645644665, + "num_input_tokens_seen": 109588192, + "step": 6692, + "train_runtime": 54379.3946, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.056363636363637, + "grad_norm": 0.0069695282727479935, + "learning_rate": 6.551485414020345e-05, + "loss": 0.012191887944936752, + "num_input_tokens_seen": 109604568, + "step": 6693, + "train_runtime": 54387.5103, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.056969696969697, + "grad_norm": 0.005228510592132807, + "learning_rate": 6.550571237031143e-05, + "loss": 0.01191610749810934, + "num_input_tokens_seen": 109620944, + "step": 6694, + "train_runtime": 54395.6211, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.057575757575758, + "grad_norm": 0.008023296482861042, + "learning_rate": 6.54965700268933e-05, + "loss": 0.011114236898720264, + "num_input_tokens_seen": 109637320, + "step": 6695, + "train_runtime": 54403.7332, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.058181818181819, + "grad_norm": 0.0048049199394881725, + "learning_rate": 6.548742711028723e-05, + "loss": 0.010686934925615788, + "num_input_tokens_seen": 109653696, + "step": 6696, + "train_runtime": 54411.8436, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.058787878787879, + "grad_norm": 0.009145306423306465, + "learning_rate": 6.547828362083141e-05, + "loss": 0.012301169335842133, + "num_input_tokens_seen": 109670072, + "step": 6697, + "train_runtime": 54419.9585, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.0593939393939396, + "grad_norm": 0.006722769234329462, + "learning_rate": 6.546913955886398e-05, + "loss": 0.011034268885850906, + "num_input_tokens_seen": 109686448, + "step": 6698, + "train_runtime": 54428.0698, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.06, + "grad_norm": 0.003316167974844575, + "learning_rate": 6.545999492472324e-05, + "loss": 0.01238674484193325, + "num_input_tokens_seen": 109702824, + "step": 6699, + "train_runtime": 54436.1842, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.0606060606060606, + "grad_norm": 0.006658466532826424, + "learning_rate": 6.545084971874738e-05, + "loss": 0.011695478111505508, + "num_input_tokens_seen": 109719200, + "step": 6700, + "train_runtime": 54444.3012, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.0612121212121215, + "grad_norm": 0.009386797435581684, + "learning_rate": 6.544170394127468e-05, + "loss": 0.013375617563724518, + "num_input_tokens_seen": 109735576, + "step": 6701, + "train_runtime": 54453.3692, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.0618181818181816, + "grad_norm": 0.005323267541825771, + "learning_rate": 6.543255759264341e-05, + "loss": 0.011256490834057331, + "num_input_tokens_seen": 109751952, + "step": 6702, + "train_runtime": 54461.478, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.0624242424242425, + "grad_norm": 0.006388998590409756, + "learning_rate": 6.54234106731919e-05, + "loss": 0.01113734021782875, + "num_input_tokens_seen": 109768328, + "step": 6703, + "train_runtime": 54469.5867, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.063030303030303, + "grad_norm": 0.006404177285730839, + "learning_rate": 6.541426318325848e-05, + "loss": 0.011481456458568573, + "num_input_tokens_seen": 109784704, + "step": 6704, + "train_runtime": 54477.6955, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.0636363636363635, + "grad_norm": 0.004181183874607086, + "learning_rate": 6.540511512318146e-05, + "loss": 0.012225048616528511, + "num_input_tokens_seen": 109801080, + "step": 6705, + "train_runtime": 54485.8081, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.064242424242424, + "grad_norm": 0.011784178204834461, + "learning_rate": 6.539596649329924e-05, + "loss": 0.01374695636332035, + "num_input_tokens_seen": 109817456, + "step": 6706, + "train_runtime": 54493.9213, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.0648484848484845, + "grad_norm": 0.007962322793900967, + "learning_rate": 6.53868172939502e-05, + "loss": 0.013038030825555325, + "num_input_tokens_seen": 109833832, + "step": 6707, + "train_runtime": 54502.0358, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.065454545454545, + "grad_norm": 0.011079595424234867, + "learning_rate": 6.537766752547274e-05, + "loss": 0.012054698541760445, + "num_input_tokens_seen": 109850208, + "step": 6708, + "train_runtime": 54510.1483, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.066060606060606, + "grad_norm": 0.005833815783262253, + "learning_rate": 6.536851718820532e-05, + "loss": 0.011351658962666988, + "num_input_tokens_seen": 109866584, + "step": 6709, + "train_runtime": 54518.2643, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.066666666666666, + "grad_norm": 0.005686975549906492, + "learning_rate": 6.535936628248639e-05, + "loss": 0.012339255772531033, + "num_input_tokens_seen": 109882960, + "step": 6710, + "train_runtime": 54526.3765, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.067272727272727, + "grad_norm": 0.007766516879200935, + "learning_rate": 6.535021480865439e-05, + "loss": 0.012319867499172688, + "num_input_tokens_seen": 109899336, + "step": 6711, + "train_runtime": 54534.4869, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.067878787878788, + "grad_norm": 0.004162651486694813, + "learning_rate": 6.534106276704785e-05, + "loss": 0.012555945664644241, + "num_input_tokens_seen": 109915712, + "step": 6712, + "train_runtime": 54542.6032, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.068484848484848, + "grad_norm": 0.005044703837484121, + "learning_rate": 6.533191015800527e-05, + "loss": 0.011303329840302467, + "num_input_tokens_seen": 109932088, + "step": 6713, + "train_runtime": 54550.7144, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.069090909090909, + "grad_norm": 0.0058623687364161015, + "learning_rate": 6.532275698186518e-05, + "loss": 0.01236221194267273, + "num_input_tokens_seen": 109948464, + "step": 6714, + "train_runtime": 54558.8311, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.069696969696969, + "grad_norm": 0.004394982010126114, + "learning_rate": 6.531360323896616e-05, + "loss": 0.010746819898486137, + "num_input_tokens_seen": 109964840, + "step": 6715, + "train_runtime": 54566.9478, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.07030303030303, + "grad_norm": 0.006570020224899054, + "learning_rate": 6.530444892964678e-05, + "loss": 0.011535318568348885, + "num_input_tokens_seen": 109981216, + "step": 6716, + "train_runtime": 54575.0685, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.070909090909091, + "grad_norm": 0.0028879763558506966, + "learning_rate": 6.529529405424562e-05, + "loss": 0.010635592974722385, + "num_input_tokens_seen": 109997592, + "step": 6717, + "train_runtime": 54583.1864, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.071515151515151, + "grad_norm": 0.0060540009289979935, + "learning_rate": 6.528613861310136e-05, + "loss": 0.011461170390248299, + "num_input_tokens_seen": 110013968, + "step": 6718, + "train_runtime": 54591.2988, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.072121212121212, + "grad_norm": 0.007831357419490814, + "learning_rate": 6.527698260655256e-05, + "loss": 0.011775614693760872, + "num_input_tokens_seen": 110030344, + "step": 6719, + "train_runtime": 54599.4147, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.072727272727272, + "grad_norm": 0.00578015111386776, + "learning_rate": 6.526782603493794e-05, + "loss": 0.01026417687535286, + "num_input_tokens_seen": 110046720, + "step": 6720, + "train_runtime": 54607.5317, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.073333333333333, + "grad_norm": 0.004453750792890787, + "learning_rate": 6.525866889859617e-05, + "loss": 0.011076618917286396, + "num_input_tokens_seen": 110063096, + "step": 6721, + "train_runtime": 54615.647, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.073939393939394, + "grad_norm": 0.007201549131423235, + "learning_rate": 6.524951119786594e-05, + "loss": 0.012169227935373783, + "num_input_tokens_seen": 110079472, + "step": 6722, + "train_runtime": 54623.7607, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.074545454545454, + "grad_norm": 0.0066411541774868965, + "learning_rate": 6.5240352933086e-05, + "loss": 0.011333543807268143, + "num_input_tokens_seen": 110095848, + "step": 6723, + "train_runtime": 54631.8755, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.075151515151515, + "grad_norm": 0.005136855412274599, + "learning_rate": 6.523119410459508e-05, + "loss": 0.011927546001970768, + "num_input_tokens_seen": 110112224, + "step": 6724, + "train_runtime": 54639.9915, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.075757575757576, + "grad_norm": 0.009815352968871593, + "learning_rate": 6.522203471273195e-05, + "loss": 0.012241547927260399, + "num_input_tokens_seen": 110128600, + "step": 6725, + "train_runtime": 54648.1084, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.076363636363636, + "grad_norm": 0.0047886548563838005, + "learning_rate": 6.52128747578354e-05, + "loss": 0.011591303162276745, + "num_input_tokens_seen": 110144976, + "step": 6726, + "train_runtime": 54656.2299, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.076969696969697, + "grad_norm": 0.005385174881666899, + "learning_rate": 6.520371424024425e-05, + "loss": 0.011221512220799923, + "num_input_tokens_seen": 110161352, + "step": 6727, + "train_runtime": 54664.3437, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.077575757575757, + "grad_norm": 0.005324078723788261, + "learning_rate": 6.519455316029731e-05, + "loss": 0.012865553610026836, + "num_input_tokens_seen": 110177728, + "step": 6728, + "train_runtime": 54672.4536, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.078181818181818, + "grad_norm": 0.0035979487001895905, + "learning_rate": 6.518539151833344e-05, + "loss": 0.012009773403406143, + "num_input_tokens_seen": 110194104, + "step": 6729, + "train_runtime": 54680.564, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.078787878787879, + "grad_norm": 0.0035360578913241625, + "learning_rate": 6.517622931469149e-05, + "loss": 0.012264511547982693, + "num_input_tokens_seen": 110210480, + "step": 6730, + "train_runtime": 54688.6829, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.079393939393939, + "grad_norm": 0.007671200670301914, + "learning_rate": 6.516706654971041e-05, + "loss": 0.012945150956511497, + "num_input_tokens_seen": 110226856, + "step": 6731, + "train_runtime": 54696.7968, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.08, + "grad_norm": 0.001291109248995781, + "learning_rate": 6.515790322372906e-05, + "loss": 0.01288651954382658, + "num_input_tokens_seen": 110243232, + "step": 6732, + "train_runtime": 54704.9081, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.080606060606061, + "grad_norm": 0.006544061470776796, + "learning_rate": 6.514873933708638e-05, + "loss": 0.011770959012210369, + "num_input_tokens_seen": 110259608, + "step": 6733, + "train_runtime": 54713.0316, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.081212121212121, + "grad_norm": 0.0017245948547497392, + "learning_rate": 6.513957489012132e-05, + "loss": 0.01143400464206934, + "num_input_tokens_seen": 110275984, + "step": 6734, + "train_runtime": 54721.1476, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.081818181818182, + "grad_norm": 0.008340776897966862, + "learning_rate": 6.513040988317289e-05, + "loss": 0.013612993061542511, + "num_input_tokens_seen": 110292360, + "step": 6735, + "train_runtime": 54729.2578, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.082424242424242, + "grad_norm": 0.009151041507720947, + "learning_rate": 6.512124431658006e-05, + "loss": 0.01290676649659872, + "num_input_tokens_seen": 110308736, + "step": 6736, + "train_runtime": 54737.3728, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.083030303030303, + "grad_norm": 0.002449723659083247, + "learning_rate": 6.511207819068184e-05, + "loss": 0.01165770273655653, + "num_input_tokens_seen": 110325112, + "step": 6737, + "train_runtime": 54745.4869, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.083636363636364, + "grad_norm": 0.01070158090442419, + "learning_rate": 6.510291150581729e-05, + "loss": 0.012031187303364277, + "num_input_tokens_seen": 110341488, + "step": 6738, + "train_runtime": 54753.6015, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.084242424242424, + "grad_norm": 0.0068915849551558495, + "learning_rate": 6.509374426232544e-05, + "loss": 0.0121897142380476, + "num_input_tokens_seen": 110357864, + "step": 6739, + "train_runtime": 54761.717, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.084848484848485, + "grad_norm": 0.005337721202522516, + "learning_rate": 6.508457646054538e-05, + "loss": 0.011306582018733025, + "num_input_tokens_seen": 110374240, + "step": 6740, + "train_runtime": 54769.8305, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.085454545454546, + "grad_norm": 0.005481821484863758, + "learning_rate": 6.507540810081625e-05, + "loss": 0.012838033027946949, + "num_input_tokens_seen": 110390616, + "step": 6741, + "train_runtime": 54777.9443, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.086060606060606, + "grad_norm": 0.007203513756394386, + "learning_rate": 6.506623918347709e-05, + "loss": 0.012041829526424408, + "num_input_tokens_seen": 110406992, + "step": 6742, + "train_runtime": 54786.059, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.086666666666667, + "grad_norm": 0.006981387734413147, + "learning_rate": 6.505706970886711e-05, + "loss": 0.012450854294002056, + "num_input_tokens_seen": 110423368, + "step": 6743, + "train_runtime": 54794.1703, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.087272727272727, + "grad_norm": 0.0075949933379888535, + "learning_rate": 6.504789967732543e-05, + "loss": 0.011361047625541687, + "num_input_tokens_seen": 110439744, + "step": 6744, + "train_runtime": 54802.2813, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.087878787878788, + "grad_norm": 0.007561851758509874, + "learning_rate": 6.503872908919125e-05, + "loss": 0.013872135430574417, + "num_input_tokens_seen": 110456120, + "step": 6745, + "train_runtime": 54810.399, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.088484848484849, + "grad_norm": 0.0032198019325733185, + "learning_rate": 6.502955794480377e-05, + "loss": 0.011963226832449436, + "num_input_tokens_seen": 110472496, + "step": 6746, + "train_runtime": 54818.5082, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.089090909090909, + "grad_norm": 0.006146654486656189, + "learning_rate": 6.502038624450221e-05, + "loss": 0.012939522042870522, + "num_input_tokens_seen": 110488872, + "step": 6747, + "train_runtime": 54826.6203, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.08969696969697, + "grad_norm": 0.005497810896486044, + "learning_rate": 6.50112139886258e-05, + "loss": 0.012659750878810883, + "num_input_tokens_seen": 110505248, + "step": 6748, + "train_runtime": 54834.7316, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.09030303030303, + "grad_norm": 0.0023341402411460876, + "learning_rate": 6.500204117751383e-05, + "loss": 0.011343980208039284, + "num_input_tokens_seen": 110521624, + "step": 6749, + "train_runtime": 54842.8442, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.090909090909091, + "grad_norm": 0.005757177714258432, + "learning_rate": 6.499286781150558e-05, + "loss": 0.012286894023418427, + "num_input_tokens_seen": 110538000, + "step": 6750, + "train_runtime": 54850.9582, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.091515151515152, + "grad_norm": 0.006162974052131176, + "learning_rate": 6.498369389094034e-05, + "loss": 0.011888248845934868, + "num_input_tokens_seen": 110554376, + "step": 6751, + "train_runtime": 54859.0701, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.092121212121212, + "grad_norm": 0.005235807504504919, + "learning_rate": 6.497451941615744e-05, + "loss": 0.011202432215213776, + "num_input_tokens_seen": 110570752, + "step": 6752, + "train_runtime": 54867.1827, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.092727272727273, + "grad_norm": 0.010189136490225792, + "learning_rate": 6.496534438749622e-05, + "loss": 0.015134453773498535, + "num_input_tokens_seen": 110587128, + "step": 6753, + "train_runtime": 54875.2984, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.093333333333334, + "grad_norm": 0.006050738971680403, + "learning_rate": 6.495616880529607e-05, + "loss": 0.011340834200382233, + "num_input_tokens_seen": 110603504, + "step": 6754, + "train_runtime": 54883.4093, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.093939393939394, + "grad_norm": 0.008403439074754715, + "learning_rate": 6.494699266989635e-05, + "loss": 0.013286009430885315, + "num_input_tokens_seen": 110619880, + "step": 6755, + "train_runtime": 54891.5311, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.094545454545455, + "grad_norm": 0.006676103454083204, + "learning_rate": 6.493781598163649e-05, + "loss": 0.01179078035056591, + "num_input_tokens_seen": 110636256, + "step": 6756, + "train_runtime": 54899.6462, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.095151515151515, + "grad_norm": 0.009173565544188023, + "learning_rate": 6.492863874085589e-05, + "loss": 0.012925084680318832, + "num_input_tokens_seen": 110652632, + "step": 6757, + "train_runtime": 54907.7604, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.095757575757576, + "grad_norm": 0.01022444386035204, + "learning_rate": 6.491946094789402e-05, + "loss": 0.011954806745052338, + "num_input_tokens_seen": 110669008, + "step": 6758, + "train_runtime": 54915.8755, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.096363636363637, + "grad_norm": 0.005185700487345457, + "learning_rate": 6.491028260309036e-05, + "loss": 0.012397116050124168, + "num_input_tokens_seen": 110685384, + "step": 6759, + "train_runtime": 54923.9903, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.096969696969697, + "grad_norm": 0.008497139438986778, + "learning_rate": 6.490110370678438e-05, + "loss": 0.01226062048226595, + "num_input_tokens_seen": 110701760, + "step": 6760, + "train_runtime": 54932.1044, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.097575757575758, + "grad_norm": 0.00684884749352932, + "learning_rate": 6.489192425931558e-05, + "loss": 0.01244722306728363, + "num_input_tokens_seen": 110718136, + "step": 6761, + "train_runtime": 54940.2149, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.098181818181819, + "grad_norm": 0.006906589958816767, + "learning_rate": 6.48827442610235e-05, + "loss": 0.01229185052216053, + "num_input_tokens_seen": 110734512, + "step": 6762, + "train_runtime": 54948.3319, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.098787878787879, + "grad_norm": 0.0029716002754867077, + "learning_rate": 6.487356371224771e-05, + "loss": 0.01157145481556654, + "num_input_tokens_seen": 110750888, + "step": 6763, + "train_runtime": 54956.4453, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.09939393939394, + "grad_norm": 0.006014301907271147, + "learning_rate": 6.486438261332776e-05, + "loss": 0.012414783239364624, + "num_input_tokens_seen": 110767264, + "step": 6764, + "train_runtime": 54964.5598, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.1, + "grad_norm": 0.007912810891866684, + "learning_rate": 6.485520096460322e-05, + "loss": 0.012721954844892025, + "num_input_tokens_seen": 110783640, + "step": 6765, + "train_runtime": 54972.6689, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.100606060606061, + "grad_norm": 0.008372180163860321, + "learning_rate": 6.484601876641375e-05, + "loss": 0.01331938337534666, + "num_input_tokens_seen": 110800016, + "step": 6766, + "train_runtime": 54980.7796, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.1012121212121215, + "grad_norm": 0.005612993612885475, + "learning_rate": 6.483683601909893e-05, + "loss": 0.010410083457827568, + "num_input_tokens_seen": 110816392, + "step": 6767, + "train_runtime": 54988.8939, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.101818181818182, + "grad_norm": 0.010537705384194851, + "learning_rate": 6.482765272299849e-05, + "loss": 0.012821241281926632, + "num_input_tokens_seen": 110832768, + "step": 6768, + "train_runtime": 54997.0081, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.1024242424242425, + "grad_norm": 0.007041152101010084, + "learning_rate": 6.4818468878452e-05, + "loss": 0.011709140613675117, + "num_input_tokens_seen": 110849144, + "step": 6769, + "train_runtime": 55005.1186, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.1030303030303035, + "grad_norm": 0.00892180111259222, + "learning_rate": 6.480928448579924e-05, + "loss": 0.012480473145842552, + "num_input_tokens_seen": 110865520, + "step": 6770, + "train_runtime": 55013.2335, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.1036363636363635, + "grad_norm": 0.00915713794529438, + "learning_rate": 6.480009954537985e-05, + "loss": 0.013021890074014664, + "num_input_tokens_seen": 110881896, + "step": 6771, + "train_runtime": 55021.3505, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.1042424242424245, + "grad_norm": 0.0076211863197386265, + "learning_rate": 6.479091405753364e-05, + "loss": 0.011296453885734081, + "num_input_tokens_seen": 110898272, + "step": 6772, + "train_runtime": 55029.465, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.1048484848484845, + "grad_norm": 0.00768258236348629, + "learning_rate": 6.478172802260031e-05, + "loss": 0.011906886473298073, + "num_input_tokens_seen": 110914648, + "step": 6773, + "train_runtime": 55037.5808, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.1054545454545455, + "grad_norm": 0.00959132332354784, + "learning_rate": 6.477254144091963e-05, + "loss": 0.013075464405119419, + "num_input_tokens_seen": 110931024, + "step": 6774, + "train_runtime": 55045.6964, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.106060606060606, + "grad_norm": 0.005565129220485687, + "learning_rate": 6.476335431283142e-05, + "loss": 0.011814418248832226, + "num_input_tokens_seen": 110947400, + "step": 6775, + "train_runtime": 55053.818, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.1066666666666665, + "grad_norm": 0.005992848891764879, + "learning_rate": 6.475416663867548e-05, + "loss": 0.013738968409597874, + "num_input_tokens_seen": 110963776, + "step": 6776, + "train_runtime": 55061.9332, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.107272727272727, + "grad_norm": 0.006619342137128115, + "learning_rate": 6.474497841879165e-05, + "loss": 0.013067110441625118, + "num_input_tokens_seen": 110980152, + "step": 6777, + "train_runtime": 55070.0436, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.1078787878787875, + "grad_norm": 0.008347662165760994, + "learning_rate": 6.47357896535198e-05, + "loss": 0.012699134647846222, + "num_input_tokens_seen": 110996528, + "step": 6778, + "train_runtime": 55078.1538, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.108484848484848, + "grad_norm": 0.011149074882268906, + "learning_rate": 6.472660034319977e-05, + "loss": 0.011981187388300896, + "num_input_tokens_seen": 111012904, + "step": 6779, + "train_runtime": 55086.2642, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.109090909090909, + "grad_norm": 0.008265865035355091, + "learning_rate": 6.471741048817147e-05, + "loss": 0.013211892917752266, + "num_input_tokens_seen": 111029280, + "step": 6780, + "train_runtime": 55094.3781, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.109696969696969, + "grad_norm": 0.008559384383261204, + "learning_rate": 6.470822008877482e-05, + "loss": 0.013182109221816063, + "num_input_tokens_seen": 111045656, + "step": 6781, + "train_runtime": 55102.4911, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.11030303030303, + "grad_norm": 0.00919069442898035, + "learning_rate": 6.469902914534976e-05, + "loss": 0.01302702259272337, + "num_input_tokens_seen": 111062032, + "step": 6782, + "train_runtime": 55110.6039, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.110909090909091, + "grad_norm": 0.006796200294047594, + "learning_rate": 6.468983765823624e-05, + "loss": 0.012176006101071835, + "num_input_tokens_seen": 111078408, + "step": 6783, + "train_runtime": 55118.7172, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.111515151515151, + "grad_norm": 0.007507765665650368, + "learning_rate": 6.468064562777421e-05, + "loss": 0.012639706954360008, + "num_input_tokens_seen": 111094784, + "step": 6784, + "train_runtime": 55126.8302, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 4.112121212121212, + "grad_norm": 0.003989376127719879, + "learning_rate": 6.467145305430371e-05, + "loss": 0.012173742055892944, + "num_input_tokens_seen": 111111160, + "step": 6785, + "train_runtime": 55134.9405, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 4.112727272727272, + "grad_norm": 0.006851447746157646, + "learning_rate": 6.466225993816473e-05, + "loss": 0.012527129612863064, + "num_input_tokens_seen": 111127536, + "step": 6786, + "train_runtime": 55143.0548, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 4.113333333333333, + "grad_norm": 0.010228479281067848, + "learning_rate": 6.46530662796973e-05, + "loss": 0.012966207228600979, + "num_input_tokens_seen": 111143912, + "step": 6787, + "train_runtime": 55151.1668, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 4.113939393939394, + "grad_norm": 0.002193754306063056, + "learning_rate": 6.464387207924147e-05, + "loss": 0.0124110858887434, + "num_input_tokens_seen": 111160288, + "step": 6788, + "train_runtime": 55159.2764, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 4.114545454545454, + "grad_norm": 0.006738306954503059, + "learning_rate": 6.463467733713735e-05, + "loss": 0.01244714017957449, + "num_input_tokens_seen": 111176664, + "step": 6789, + "train_runtime": 55167.39, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 4.115151515151515, + "grad_norm": 0.00839934404939413, + "learning_rate": 6.4625482053725e-05, + "loss": 0.012884236872196198, + "num_input_tokens_seen": 111193040, + "step": 6790, + "train_runtime": 55175.5031, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 4.115757575757576, + "grad_norm": 0.005825200118124485, + "learning_rate": 6.461628622934458e-05, + "loss": 0.012674493715167046, + "num_input_tokens_seen": 111209416, + "step": 6791, + "train_runtime": 55183.6183, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 4.116363636363636, + "grad_norm": 0.014781314879655838, + "learning_rate": 6.460708986433617e-05, + "loss": 0.013461627066135406, + "num_input_tokens_seen": 111225792, + "step": 6792, + "train_runtime": 55191.7305, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 4.116969696969697, + "grad_norm": 0.007691128645092249, + "learning_rate": 6.459789295903997e-05, + "loss": 0.012506977654993534, + "num_input_tokens_seen": 111242168, + "step": 6793, + "train_runtime": 55199.8445, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 4.117575757575757, + "grad_norm": 0.007941563613712788, + "learning_rate": 6.458869551379612e-05, + "loss": 0.012372801080346107, + "num_input_tokens_seen": 111258544, + "step": 6794, + "train_runtime": 55207.958, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 4.118181818181818, + "grad_norm": 0.004344481509178877, + "learning_rate": 6.457949752894485e-05, + "loss": 0.012550491839647293, + "num_input_tokens_seen": 111274920, + "step": 6795, + "train_runtime": 55216.0734, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 4.118787878787879, + "grad_norm": 0.0024770034942775965, + "learning_rate": 6.457029900482634e-05, + "loss": 0.012168929912149906, + "num_input_tokens_seen": 111291296, + "step": 6796, + "train_runtime": 55224.189, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 4.119393939393939, + "grad_norm": 0.006387229077517986, + "learning_rate": 6.456109994178085e-05, + "loss": 0.011289354413747787, + "num_input_tokens_seen": 111307672, + "step": 6797, + "train_runtime": 55232.3045, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 4.12, + "grad_norm": 0.007669675163924694, + "learning_rate": 6.455190034014864e-05, + "loss": 0.011611681431531906, + "num_input_tokens_seen": 111324048, + "step": 6798, + "train_runtime": 55240.419, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 4.120606060606061, + "grad_norm": 0.009855726733803749, + "learning_rate": 6.454270020026995e-05, + "loss": 0.012846332043409348, + "num_input_tokens_seen": 111340424, + "step": 6799, + "train_runtime": 55248.5383, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.121212121212121, + "grad_norm": 0.004810289479792118, + "learning_rate": 6.453349952248513e-05, + "loss": 0.012066160328686237, + "num_input_tokens_seen": 111356800, + "step": 6800, + "train_runtime": 55256.6513, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.121818181818182, + "grad_norm": 0.0035376311279833317, + "learning_rate": 6.452429830713444e-05, + "loss": 0.0125046381726861, + "num_input_tokens_seen": 111373176, + "step": 6801, + "train_runtime": 55265.748, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.122424242424242, + "grad_norm": 0.006885630544275045, + "learning_rate": 6.451509655455827e-05, + "loss": 0.011477639898657799, + "num_input_tokens_seen": 111389552, + "step": 6802, + "train_runtime": 55273.8631, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.123030303030303, + "grad_norm": 0.005448077339679003, + "learning_rate": 6.450589426509692e-05, + "loss": 0.01139042992144823, + "num_input_tokens_seen": 111405928, + "step": 6803, + "train_runtime": 55281.9791, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.123636363636364, + "grad_norm": 0.008229555562138557, + "learning_rate": 6.44966914390908e-05, + "loss": 0.012289345264434814, + "num_input_tokens_seen": 111422304, + "step": 6804, + "train_runtime": 55290.1012, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.124242424242424, + "grad_norm": 0.0026268225628882647, + "learning_rate": 6.44874880768803e-05, + "loss": 0.011629512533545494, + "num_input_tokens_seen": 111438680, + "step": 6805, + "train_runtime": 55298.2163, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.124848484848485, + "grad_norm": 0.007514363154768944, + "learning_rate": 6.447828417880581e-05, + "loss": 0.012247199192643166, + "num_input_tokens_seen": 111455056, + "step": 6806, + "train_runtime": 55306.3322, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.125454545454545, + "grad_norm": 0.009518838487565517, + "learning_rate": 6.446907974520779e-05, + "loss": 0.013008835725486279, + "num_input_tokens_seen": 111471432, + "step": 6807, + "train_runtime": 55314.443, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.126060606060606, + "grad_norm": 0.0015840057749301195, + "learning_rate": 6.44598747764267e-05, + "loss": 0.011546186171472073, + "num_input_tokens_seen": 111487808, + "step": 6808, + "train_runtime": 55322.5547, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.126666666666667, + "grad_norm": 0.00760845560580492, + "learning_rate": 6.4450669272803e-05, + "loss": 0.011701042763888836, + "num_input_tokens_seen": 111504184, + "step": 6809, + "train_runtime": 55330.6693, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.127272727272727, + "grad_norm": 0.008152773603796959, + "learning_rate": 6.44414632346772e-05, + "loss": 0.010762261226773262, + "num_input_tokens_seen": 111520560, + "step": 6810, + "train_runtime": 55338.7849, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.127878787878788, + "grad_norm": 0.007568493951112032, + "learning_rate": 6.443225666238976e-05, + "loss": 0.013148179277777672, + "num_input_tokens_seen": 111536936, + "step": 6811, + "train_runtime": 55346.9036, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.128484848484849, + "grad_norm": 0.006007296033203602, + "learning_rate": 6.442304955628127e-05, + "loss": 0.012178211472928524, + "num_input_tokens_seen": 111553312, + "step": 6812, + "train_runtime": 55355.0193, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.129090909090909, + "grad_norm": 0.006546088494360447, + "learning_rate": 6.441384191669227e-05, + "loss": 0.010544861666858196, + "num_input_tokens_seen": 111569688, + "step": 6813, + "train_runtime": 55363.1347, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.12969696969697, + "grad_norm": 0.004689306020736694, + "learning_rate": 6.440463374396332e-05, + "loss": 0.011589367873966694, + "num_input_tokens_seen": 111586064, + "step": 6814, + "train_runtime": 55371.2482, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.13030303030303, + "grad_norm": 0.00758829852566123, + "learning_rate": 6.439542503843501e-05, + "loss": 0.012704254128038883, + "num_input_tokens_seen": 111602440, + "step": 6815, + "train_runtime": 55379.365, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.130909090909091, + "grad_norm": 0.0001683660375420004, + "learning_rate": 6.438621580044798e-05, + "loss": 0.012290287762880325, + "num_input_tokens_seen": 111618816, + "step": 6816, + "train_runtime": 55387.4764, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.131515151515152, + "grad_norm": 0.006449244916439056, + "learning_rate": 6.437700603034283e-05, + "loss": 0.01201656460762024, + "num_input_tokens_seen": 111635192, + "step": 6817, + "train_runtime": 55395.591, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.132121212121212, + "grad_norm": 0.0051890951581299305, + "learning_rate": 6.436779572846023e-05, + "loss": 0.012193800881505013, + "num_input_tokens_seen": 111651568, + "step": 6818, + "train_runtime": 55403.7056, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.132727272727273, + "grad_norm": 0.0071116043254733086, + "learning_rate": 6.435858489514086e-05, + "loss": 0.012657828629016876, + "num_input_tokens_seen": 111667944, + "step": 6819, + "train_runtime": 55411.8215, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.133333333333334, + "grad_norm": 0.004109963309019804, + "learning_rate": 6.434937353072537e-05, + "loss": 0.01196388341486454, + "num_input_tokens_seen": 111684320, + "step": 6820, + "train_runtime": 55419.9337, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.133939393939394, + "grad_norm": 0.007665098179131746, + "learning_rate": 6.434016163555452e-05, + "loss": 0.011870422400534153, + "num_input_tokens_seen": 111700696, + "step": 6821, + "train_runtime": 55428.0452, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.134545454545455, + "grad_norm": 0.00537806237116456, + "learning_rate": 6.433094920996901e-05, + "loss": 0.012221486307680607, + "num_input_tokens_seen": 111717072, + "step": 6822, + "train_runtime": 55436.159, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.135151515151515, + "grad_norm": 0.006745706312358379, + "learning_rate": 6.432173625430959e-05, + "loss": 0.01313024666160345, + "num_input_tokens_seen": 111733448, + "step": 6823, + "train_runtime": 55444.2745, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.135757575757576, + "grad_norm": 0.003918980713933706, + "learning_rate": 6.431252276891704e-05, + "loss": 0.01211816817522049, + "num_input_tokens_seen": 111749824, + "step": 6824, + "train_runtime": 55452.3932, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.136363636363637, + "grad_norm": 0.007002395577728748, + "learning_rate": 6.430330875413215e-05, + "loss": 0.01162298396229744, + "num_input_tokens_seen": 111766200, + "step": 6825, + "train_runtime": 55460.5102, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.136969696969697, + "grad_norm": 0.008254042826592922, + "learning_rate": 6.429409421029572e-05, + "loss": 0.012040762230753899, + "num_input_tokens_seen": 111782576, + "step": 6826, + "train_runtime": 55468.6314, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.137575757575758, + "grad_norm": 0.006543914787471294, + "learning_rate": 6.428487913774857e-05, + "loss": 0.011130653321743011, + "num_input_tokens_seen": 111798952, + "step": 6827, + "train_runtime": 55476.7452, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.138181818181819, + "grad_norm": 0.004626874811947346, + "learning_rate": 6.427566353683159e-05, + "loss": 0.012704134918749332, + "num_input_tokens_seen": 111815328, + "step": 6828, + "train_runtime": 55484.861, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.138787878787879, + "grad_norm": 0.00413492089137435, + "learning_rate": 6.426644740788559e-05, + "loss": 0.011694412678480148, + "num_input_tokens_seen": 111831704, + "step": 6829, + "train_runtime": 55492.9737, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.13939393939394, + "grad_norm": 0.007618648931384087, + "learning_rate": 6.425723075125149e-05, + "loss": 0.012701701372861862, + "num_input_tokens_seen": 111848080, + "step": 6830, + "train_runtime": 55501.0897, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.14, + "grad_norm": 0.00866475235670805, + "learning_rate": 6.424801356727019e-05, + "loss": 0.012836270034313202, + "num_input_tokens_seen": 111864456, + "step": 6831, + "train_runtime": 55509.2039, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.140606060606061, + "grad_norm": 0.005367221310734749, + "learning_rate": 6.423879585628261e-05, + "loss": 0.012096544727683067, + "num_input_tokens_seen": 111880832, + "step": 6832, + "train_runtime": 55517.3181, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.141212121212122, + "grad_norm": 0.013401288539171219, + "learning_rate": 6.422957761862971e-05, + "loss": 0.012532380409538746, + "num_input_tokens_seen": 111897208, + "step": 6833, + "train_runtime": 55525.4312, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.141818181818182, + "grad_norm": 0.0060661048628389835, + "learning_rate": 6.422035885465243e-05, + "loss": 0.012720838189125061, + "num_input_tokens_seen": 111913584, + "step": 6834, + "train_runtime": 55533.5458, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.142424242424243, + "grad_norm": 0.007214708253741264, + "learning_rate": 6.421113956469179e-05, + "loss": 0.01246915478259325, + "num_input_tokens_seen": 111929960, + "step": 6835, + "train_runtime": 55541.6567, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.143030303030303, + "grad_norm": 0.007770147640258074, + "learning_rate": 6.420191974908876e-05, + "loss": 0.01219092309474945, + "num_input_tokens_seen": 111946336, + "step": 6836, + "train_runtime": 55549.7696, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.143636363636364, + "grad_norm": 0.006288686767220497, + "learning_rate": 6.419269940818437e-05, + "loss": 0.012528445571660995, + "num_input_tokens_seen": 111962712, + "step": 6837, + "train_runtime": 55557.8849, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.1442424242424245, + "grad_norm": 0.009935424663126469, + "learning_rate": 6.418347854231968e-05, + "loss": 0.012631678022444248, + "num_input_tokens_seen": 111979088, + "step": 6838, + "train_runtime": 55566.0015, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.144848484848485, + "grad_norm": 0.007493012119084597, + "learning_rate": 6.417425715183574e-05, + "loss": 0.01116294413805008, + "num_input_tokens_seen": 111995464, + "step": 6839, + "train_runtime": 55574.1205, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.1454545454545455, + "grad_norm": 0.004762480966746807, + "learning_rate": 6.416503523707363e-05, + "loss": 0.011398454196751118, + "num_input_tokens_seen": 112011840, + "step": 6840, + "train_runtime": 55582.2352, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.1460606060606064, + "grad_norm": 0.0021754212211817503, + "learning_rate": 6.415581279837443e-05, + "loss": 0.012141967192292213, + "num_input_tokens_seen": 112028216, + "step": 6841, + "train_runtime": 55590.3532, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.1466666666666665, + "grad_norm": 0.009951834566891193, + "learning_rate": 6.41465898360793e-05, + "loss": 0.011857830919325352, + "num_input_tokens_seen": 112044592, + "step": 6842, + "train_runtime": 55598.4676, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.147272727272727, + "grad_norm": 0.008700813166797161, + "learning_rate": 6.413736635052937e-05, + "loss": 0.010885066352784634, + "num_input_tokens_seen": 112060968, + "step": 6843, + "train_runtime": 55606.5849, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.1478787878787875, + "grad_norm": 0.009598327800631523, + "learning_rate": 6.412814234206578e-05, + "loss": 0.012121143750846386, + "num_input_tokens_seen": 112077344, + "step": 6844, + "train_runtime": 55614.7013, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.148484848484848, + "grad_norm": 0.008523743599653244, + "learning_rate": 6.411891781102972e-05, + "loss": 0.0119834179058671, + "num_input_tokens_seen": 112093720, + "step": 6845, + "train_runtime": 55622.8192, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.149090909090909, + "grad_norm": 0.010733280330896378, + "learning_rate": 6.410969275776239e-05, + "loss": 0.011993002146482468, + "num_input_tokens_seen": 112110096, + "step": 6846, + "train_runtime": 55630.9354, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.149696969696969, + "grad_norm": 0.006049610208719969, + "learning_rate": 6.410046718260498e-05, + "loss": 0.012041687034070492, + "num_input_tokens_seen": 112126472, + "step": 6847, + "train_runtime": 55639.0559, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.15030303030303, + "grad_norm": 0.007391077000647783, + "learning_rate": 6.409124108589877e-05, + "loss": 0.012022452428936958, + "num_input_tokens_seen": 112142848, + "step": 6848, + "train_runtime": 55647.1718, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.150909090909091, + "grad_norm": 0.010761240497231483, + "learning_rate": 6.408201446798497e-05, + "loss": 0.012358885258436203, + "num_input_tokens_seen": 112159224, + "step": 6849, + "train_runtime": 55655.2922, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.151515151515151, + "grad_norm": 0.006437539588660002, + "learning_rate": 6.407278732920492e-05, + "loss": 0.011649228632450104, + "num_input_tokens_seen": 112175600, + "step": 6850, + "train_runtime": 55663.4046, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.152121212121212, + "grad_norm": 0.00893746130168438, + "learning_rate": 6.406355966989983e-05, + "loss": 0.012525797821581364, + "num_input_tokens_seen": 112191976, + "step": 6851, + "train_runtime": 55671.5189, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.152727272727272, + "grad_norm": 0.005367055535316467, + "learning_rate": 6.405433149041108e-05, + "loss": 0.012465769425034523, + "num_input_tokens_seen": 112208352, + "step": 6852, + "train_runtime": 55679.6359, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.153333333333333, + "grad_norm": 0.008020800538361073, + "learning_rate": 6.404510279107996e-05, + "loss": 0.010815596207976341, + "num_input_tokens_seen": 112224728, + "step": 6853, + "train_runtime": 55687.7557, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.153939393939394, + "grad_norm": 0.010503056459128857, + "learning_rate": 6.403587357224786e-05, + "loss": 0.013064950704574585, + "num_input_tokens_seen": 112241104, + "step": 6854, + "train_runtime": 55695.8698, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.154545454545454, + "grad_norm": 0.01006402913480997, + "learning_rate": 6.402664383425612e-05, + "loss": 0.013327029533684254, + "num_input_tokens_seen": 112257480, + "step": 6855, + "train_runtime": 55703.9857, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.155151515151515, + "grad_norm": 0.013930359855294228, + "learning_rate": 6.401741357744616e-05, + "loss": 0.013200460001826286, + "num_input_tokens_seen": 112273856, + "step": 6856, + "train_runtime": 55712.1058, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.155757575757576, + "grad_norm": 0.010643518529832363, + "learning_rate": 6.400818280215932e-05, + "loss": 0.010992737486958504, + "num_input_tokens_seen": 112290232, + "step": 6857, + "train_runtime": 55720.2301, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.156363636363636, + "grad_norm": 0.0031094676814973354, + "learning_rate": 6.399895150873711e-05, + "loss": 0.012058880180120468, + "num_input_tokens_seen": 112306608, + "step": 6858, + "train_runtime": 55728.347, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.156969696969697, + "grad_norm": 0.008688523434102535, + "learning_rate": 6.398971969752095e-05, + "loss": 0.013433623127639294, + "num_input_tokens_seen": 112322984, + "step": 6859, + "train_runtime": 55736.4613, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.157575757575757, + "grad_norm": 0.006928227376192808, + "learning_rate": 6.39804873688523e-05, + "loss": 0.011825178749859333, + "num_input_tokens_seen": 112339360, + "step": 6860, + "train_runtime": 55744.5785, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.158181818181818, + "grad_norm": 0.008281384594738483, + "learning_rate": 6.397125452307265e-05, + "loss": 0.01184145174920559, + "num_input_tokens_seen": 112355736, + "step": 6861, + "train_runtime": 55752.6938, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.158787878787879, + "grad_norm": 0.008309024386107922, + "learning_rate": 6.396202116052348e-05, + "loss": 0.013322478160262108, + "num_input_tokens_seen": 112372112, + "step": 6862, + "train_runtime": 55760.8096, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.159393939393939, + "grad_norm": 0.02604571171104908, + "learning_rate": 6.395278728154637e-05, + "loss": 0.012350162491202354, + "num_input_tokens_seen": 112388488, + "step": 6863, + "train_runtime": 55768.9304, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.16, + "grad_norm": 0.007994724437594414, + "learning_rate": 6.394355288648282e-05, + "loss": 0.012545488774776459, + "num_input_tokens_seen": 112404864, + "step": 6864, + "train_runtime": 55777.0489, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.16060606060606, + "grad_norm": 0.006420474965125322, + "learning_rate": 6.39343179756744e-05, + "loss": 0.011941478587687016, + "num_input_tokens_seen": 112421240, + "step": 6865, + "train_runtime": 55785.1615, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.161212121212121, + "grad_norm": 0.006263738963752985, + "learning_rate": 6.392508254946268e-05, + "loss": 0.010190822184085846, + "num_input_tokens_seen": 112437616, + "step": 6866, + "train_runtime": 55793.28, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.161818181818182, + "grad_norm": 0.005178635474294424, + "learning_rate": 6.391584660818927e-05, + "loss": 0.011581411585211754, + "num_input_tokens_seen": 112453992, + "step": 6867, + "train_runtime": 55801.4027, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.162424242424242, + "grad_norm": 0.004705625120550394, + "learning_rate": 6.390661015219582e-05, + "loss": 0.01197720505297184, + "num_input_tokens_seen": 112470368, + "step": 6868, + "train_runtime": 55809.5197, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.163030303030303, + "grad_norm": 0.007268523331731558, + "learning_rate": 6.389737318182393e-05, + "loss": 0.012524503283202648, + "num_input_tokens_seen": 112486744, + "step": 6869, + "train_runtime": 55817.6358, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.163636363636364, + "grad_norm": 0.007445086725056171, + "learning_rate": 6.388813569741527e-05, + "loss": 0.011772837489843369, + "num_input_tokens_seen": 112503120, + "step": 6870, + "train_runtime": 55825.7535, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.164242424242424, + "grad_norm": 0.00626804493367672, + "learning_rate": 6.387889769931152e-05, + "loss": 0.01236027106642723, + "num_input_tokens_seen": 112519496, + "step": 6871, + "train_runtime": 55833.8716, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.164848484848485, + "grad_norm": 0.009195455349981785, + "learning_rate": 6.386965918785436e-05, + "loss": 0.012178661301732063, + "num_input_tokens_seen": 112535872, + "step": 6872, + "train_runtime": 55841.9929, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.165454545454545, + "grad_norm": 0.012282638810575008, + "learning_rate": 6.386042016338553e-05, + "loss": 0.012049874290823936, + "num_input_tokens_seen": 112552248, + "step": 6873, + "train_runtime": 55850.1095, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.166060606060606, + "grad_norm": 0.007594170514494181, + "learning_rate": 6.385118062624676e-05, + "loss": 0.011906994506716728, + "num_input_tokens_seen": 112568624, + "step": 6874, + "train_runtime": 55858.2304, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 0.008155268616974354, + "learning_rate": 6.384194057677977e-05, + "loss": 0.010960728861391544, + "num_input_tokens_seen": 112585000, + "step": 6875, + "train_runtime": 55866.3446, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.167272727272727, + "grad_norm": 0.007708406541496515, + "learning_rate": 6.383270001532635e-05, + "loss": 0.011630121618509293, + "num_input_tokens_seen": 112601376, + "step": 6876, + "train_runtime": 55874.4586, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.167878787878788, + "grad_norm": 0.0040500713512301445, + "learning_rate": 6.382345894222833e-05, + "loss": 0.011397692374885082, + "num_input_tokens_seen": 112617752, + "step": 6877, + "train_runtime": 55882.5761, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.168484848484849, + "grad_norm": 0.010942202992737293, + "learning_rate": 6.381421735782744e-05, + "loss": 0.012059425935149193, + "num_input_tokens_seen": 112634128, + "step": 6878, + "train_runtime": 55890.6979, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.169090909090909, + "grad_norm": 0.007086894009262323, + "learning_rate": 6.380497526246558e-05, + "loss": 0.012042596936225891, + "num_input_tokens_seen": 112650504, + "step": 6879, + "train_runtime": 55898.8148, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.16969696969697, + "grad_norm": 0.003615280147641897, + "learning_rate": 6.379573265648455e-05, + "loss": 0.013053813017904758, + "num_input_tokens_seen": 112666880, + "step": 6880, + "train_runtime": 55906.9303, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 4.17030303030303, + "grad_norm": 0.0056443954817950726, + "learning_rate": 6.378648954022626e-05, + "loss": 0.01048743724822998, + "num_input_tokens_seen": 112683256, + "step": 6881, + "train_runtime": 55915.0417, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 4.170909090909091, + "grad_norm": 0.007874313741922379, + "learning_rate": 6.377724591403256e-05, + "loss": 0.01297399215400219, + "num_input_tokens_seen": 112699632, + "step": 6882, + "train_runtime": 55923.1591, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 4.171515151515152, + "grad_norm": 0.004798084031790495, + "learning_rate": 6.376800177824536e-05, + "loss": 0.01228130143135786, + "num_input_tokens_seen": 112716008, + "step": 6883, + "train_runtime": 55931.2751, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 4.172121212121212, + "grad_norm": 0.007694761268794537, + "learning_rate": 6.375875713320658e-05, + "loss": 0.012400890700519085, + "num_input_tokens_seen": 112732384, + "step": 6884, + "train_runtime": 55939.3848, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 4.172727272727273, + "grad_norm": 0.006167495157569647, + "learning_rate": 6.374951197925817e-05, + "loss": 0.01230345293879509, + "num_input_tokens_seen": 112748760, + "step": 6885, + "train_runtime": 55947.4984, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 4.173333333333334, + "grad_norm": 0.007597097661346197, + "learning_rate": 6.37402663167421e-05, + "loss": 0.013156382367014885, + "num_input_tokens_seen": 112765136, + "step": 6886, + "train_runtime": 55955.6134, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 4.173939393939394, + "grad_norm": 0.025040749460458755, + "learning_rate": 6.373102014600034e-05, + "loss": 0.010919317603111267, + "num_input_tokens_seen": 112781512, + "step": 6887, + "train_runtime": 55963.7305, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 4.174545454545455, + "grad_norm": 0.006732979789376259, + "learning_rate": 6.372177346737487e-05, + "loss": 0.012952979654073715, + "num_input_tokens_seen": 112797888, + "step": 6888, + "train_runtime": 55971.8412, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 4.175151515151515, + "grad_norm": 0.006781100295484066, + "learning_rate": 6.371252628120772e-05, + "loss": 0.011718238703906536, + "num_input_tokens_seen": 112814264, + "step": 6889, + "train_runtime": 55979.9548, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 4.175757575757576, + "grad_norm": 0.0014034959021955729, + "learning_rate": 6.370327858784093e-05, + "loss": 0.012795984745025635, + "num_input_tokens_seen": 112830640, + "step": 6890, + "train_runtime": 55988.0688, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 4.176363636363637, + "grad_norm": 0.0063669574446976185, + "learning_rate": 6.369403038761655e-05, + "loss": 0.011950397863984108, + "num_input_tokens_seen": 112847016, + "step": 6891, + "train_runtime": 55996.1826, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 4.176969696969697, + "grad_norm": 0.007749716751277447, + "learning_rate": 6.368478168087667e-05, + "loss": 0.012205716222524643, + "num_input_tokens_seen": 112863392, + "step": 6892, + "train_runtime": 56004.2947, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 4.177575757575758, + "grad_norm": 0.0053219273686409, + "learning_rate": 6.367553246796333e-05, + "loss": 0.012319987639784813, + "num_input_tokens_seen": 112879768, + "step": 6893, + "train_runtime": 56012.4101, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 4.178181818181818, + "grad_norm": 0.006619678344577551, + "learning_rate": 6.36662827492187e-05, + "loss": 0.012759028933942318, + "num_input_tokens_seen": 112896144, + "step": 6894, + "train_runtime": 56020.5305, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 4.178787878787879, + "grad_norm": 0.0049835797399282455, + "learning_rate": 6.365703252498489e-05, + "loss": 0.012652397155761719, + "num_input_tokens_seen": 112912520, + "step": 6895, + "train_runtime": 56028.6438, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 4.17939393939394, + "grad_norm": 0.006797394249588251, + "learning_rate": 6.364778179560404e-05, + "loss": 0.01199650950729847, + "num_input_tokens_seen": 112928896, + "step": 6896, + "train_runtime": 56036.7546, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.18, + "grad_norm": 0.006719675380736589, + "learning_rate": 6.36385305614183e-05, + "loss": 0.012254391796886921, + "num_input_tokens_seen": 112945272, + "step": 6897, + "train_runtime": 56044.8737, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.180606060606061, + "grad_norm": 0.0055427346378564835, + "learning_rate": 6.36292788227699e-05, + "loss": 0.011112435720860958, + "num_input_tokens_seen": 112961648, + "step": 6898, + "train_runtime": 56052.9856, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.181212121212122, + "grad_norm": 0.008325114846229553, + "learning_rate": 6.3620026580001e-05, + "loss": 0.01242219191044569, + "num_input_tokens_seen": 112978024, + "step": 6899, + "train_runtime": 56061.0978, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 4.181818181818182, + "grad_norm": 0.008892298676073551, + "learning_rate": 6.361077383345387e-05, + "loss": 0.01192412804812193, + "num_input_tokens_seen": 112994400, + "step": 6900, + "train_runtime": 56069.217, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 4.182424242424243, + "grad_norm": 0.005727548152208328, + "learning_rate": 6.360152058347069e-05, + "loss": 0.011437574401497841, + "num_input_tokens_seen": 113010776, + "step": 6901, + "train_runtime": 56078.5087, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.183030303030303, + "grad_norm": 0.004121846053749323, + "learning_rate": 6.359226683039376e-05, + "loss": 0.011375869624316692, + "num_input_tokens_seen": 113027152, + "step": 6902, + "train_runtime": 56086.6302, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.183636363636364, + "grad_norm": 0.008612724021077156, + "learning_rate": 6.358301257456534e-05, + "loss": 0.010676328092813492, + "num_input_tokens_seen": 113043528, + "step": 6903, + "train_runtime": 56094.7457, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.1842424242424245, + "grad_norm": 0.005382446572184563, + "learning_rate": 6.357375781632775e-05, + "loss": 0.010867252014577389, + "num_input_tokens_seen": 113059904, + "step": 6904, + "train_runtime": 56102.8668, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.184848484848485, + "grad_norm": 0.01983838900923729, + "learning_rate": 6.356450255602329e-05, + "loss": 0.012318271212279797, + "num_input_tokens_seen": 113076280, + "step": 6905, + "train_runtime": 56110.9857, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.1854545454545455, + "grad_norm": 0.0065947650000452995, + "learning_rate": 6.35552467939943e-05, + "loss": 0.01265876553952694, + "num_input_tokens_seen": 113092656, + "step": 6906, + "train_runtime": 56119.1016, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.1860606060606065, + "grad_norm": 0.0054896315559744835, + "learning_rate": 6.354599053058312e-05, + "loss": 0.012822289951145649, + "num_input_tokens_seen": 113109032, + "step": 6907, + "train_runtime": 56127.2156, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.1866666666666665, + "grad_norm": 0.006131387315690517, + "learning_rate": 6.353673376613213e-05, + "loss": 0.011441155336797237, + "num_input_tokens_seen": 113125408, + "step": 6908, + "train_runtime": 56135.3359, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.1872727272727275, + "grad_norm": 0.0036953664384782314, + "learning_rate": 6.352747650098373e-05, + "loss": 0.0115741565823555, + "num_input_tokens_seen": 113141784, + "step": 6909, + "train_runtime": 56143.4475, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.1878787878787875, + "grad_norm": 0.00935975369066, + "learning_rate": 6.351821873548031e-05, + "loss": 0.011312738060951233, + "num_input_tokens_seen": 113158160, + "step": 6910, + "train_runtime": 56151.5678, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.1884848484848485, + "grad_norm": 0.010734181851148605, + "learning_rate": 6.35089604699643e-05, + "loss": 0.012171557173132896, + "num_input_tokens_seen": 113174536, + "step": 6911, + "train_runtime": 56159.6844, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.189090909090909, + "grad_norm": 0.00899182353168726, + "learning_rate": 6.349970170477816e-05, + "loss": 0.011152748949825764, + "num_input_tokens_seen": 113190912, + "step": 6912, + "train_runtime": 56167.8054, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.1896969696969695, + "grad_norm": 0.008243264630436897, + "learning_rate": 6.349044244026435e-05, + "loss": 0.012796985916793346, + "num_input_tokens_seen": 113207288, + "step": 6913, + "train_runtime": 56175.9309, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.19030303030303, + "grad_norm": 0.006306067109107971, + "learning_rate": 6.348118267676533e-05, + "loss": 0.011744976043701172, + "num_input_tokens_seen": 113223664, + "step": 6914, + "train_runtime": 56184.0568, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.190909090909091, + "grad_norm": 0.0042811669409275055, + "learning_rate": 6.347192241462362e-05, + "loss": 0.011202718131244183, + "num_input_tokens_seen": 113240040, + "step": 6915, + "train_runtime": 56192.185, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.191515151515151, + "grad_norm": 0.007300754077732563, + "learning_rate": 6.346266165418173e-05, + "loss": 0.01290104165673256, + "num_input_tokens_seen": 113256416, + "step": 6916, + "train_runtime": 56200.3025, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.192121212121212, + "grad_norm": 0.00411928491666913, + "learning_rate": 6.345340039578221e-05, + "loss": 0.011840835213661194, + "num_input_tokens_seen": 113272792, + "step": 6917, + "train_runtime": 56208.4297, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.192727272727272, + "grad_norm": 0.009779136627912521, + "learning_rate": 6.344413863976762e-05, + "loss": 0.012496448121964931, + "num_input_tokens_seen": 113289168, + "step": 6918, + "train_runtime": 56216.5469, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.193333333333333, + "grad_norm": 0.005786315072327852, + "learning_rate": 6.34348763864805e-05, + "loss": 0.012539643794298172, + "num_input_tokens_seen": 113305544, + "step": 6919, + "train_runtime": 56224.6625, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.193939393939394, + "grad_norm": 0.006871899589896202, + "learning_rate": 6.342561363626348e-05, + "loss": 0.012295959517359734, + "num_input_tokens_seen": 113321920, + "step": 6920, + "train_runtime": 56232.7794, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.194545454545454, + "grad_norm": 0.00906828697770834, + "learning_rate": 6.341635038945915e-05, + "loss": 0.011613850481808186, + "num_input_tokens_seen": 113338296, + "step": 6921, + "train_runtime": 56240.8937, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.195151515151515, + "grad_norm": 0.006740303244441748, + "learning_rate": 6.340708664641015e-05, + "loss": 0.011418634094297886, + "num_input_tokens_seen": 113354672, + "step": 6922, + "train_runtime": 56249.0114, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.195757575757575, + "grad_norm": 0.006289518903940916, + "learning_rate": 6.339782240745913e-05, + "loss": 0.0128248305991292, + "num_input_tokens_seen": 113371048, + "step": 6923, + "train_runtime": 56257.1299, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.196363636363636, + "grad_norm": 0.006296861916780472, + "learning_rate": 6.338855767294874e-05, + "loss": 0.011796567589044571, + "num_input_tokens_seen": 113387424, + "step": 6924, + "train_runtime": 56265.2415, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.196969696969697, + "grad_norm": 0.009081852622330189, + "learning_rate": 6.337929244322167e-05, + "loss": 0.011584213003516197, + "num_input_tokens_seen": 113403800, + "step": 6925, + "train_runtime": 56273.3597, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.197575757575757, + "grad_norm": 0.008771180175244808, + "learning_rate": 6.337002671862063e-05, + "loss": 0.012407020665705204, + "num_input_tokens_seen": 113420176, + "step": 6926, + "train_runtime": 56281.4797, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.198181818181818, + "grad_norm": 0.00600493373349309, + "learning_rate": 6.336076049948837e-05, + "loss": 0.012145531363785267, + "num_input_tokens_seen": 113436552, + "step": 6927, + "train_runtime": 56289.5974, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.198787878787879, + "grad_norm": 0.009007000364363194, + "learning_rate": 6.335149378616755e-05, + "loss": 0.012149602174758911, + "num_input_tokens_seen": 113452928, + "step": 6928, + "train_runtime": 56297.7084, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.199393939393939, + "grad_norm": 0.001996503910049796, + "learning_rate": 6.3342226579001e-05, + "loss": 0.011852467432618141, + "num_input_tokens_seen": 113469304, + "step": 6929, + "train_runtime": 56305.831, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.2, + "grad_norm": 0.0070578414015471935, + "learning_rate": 6.333295887833147e-05, + "loss": 0.012259211391210556, + "num_input_tokens_seen": 113485680, + "step": 6930, + "train_runtime": 56313.9506, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.20060606060606, + "grad_norm": 0.008240797556936741, + "learning_rate": 6.332369068450174e-05, + "loss": 0.011915069073438644, + "num_input_tokens_seen": 113502056, + "step": 6931, + "train_runtime": 56322.0686, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.201212121212121, + "grad_norm": 0.004783863667398691, + "learning_rate": 6.331442199785468e-05, + "loss": 0.011720610782504082, + "num_input_tokens_seen": 113518432, + "step": 6932, + "train_runtime": 56330.1837, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.201818181818182, + "grad_norm": 0.0063272034749388695, + "learning_rate": 6.330515281873304e-05, + "loss": 0.011412794701755047, + "num_input_tokens_seen": 113534808, + "step": 6933, + "train_runtime": 56338.3011, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.202424242424242, + "grad_norm": 0.0015554563142359257, + "learning_rate": 6.329588314747973e-05, + "loss": 0.011514253914356232, + "num_input_tokens_seen": 113551184, + "step": 6934, + "train_runtime": 56346.4203, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.203030303030303, + "grad_norm": 0.005393885541707277, + "learning_rate": 6.328661298443757e-05, + "loss": 0.012200485914945602, + "num_input_tokens_seen": 113567560, + "step": 6935, + "train_runtime": 56354.539, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.203636363636364, + "grad_norm": 0.009804018773138523, + "learning_rate": 6.327734232994949e-05, + "loss": 0.012293459847569466, + "num_input_tokens_seen": 113583936, + "step": 6936, + "train_runtime": 56362.6553, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.204242424242424, + "grad_norm": 0.007719079963862896, + "learning_rate": 6.326807118435838e-05, + "loss": 0.011696969158947468, + "num_input_tokens_seen": 113600312, + "step": 6937, + "train_runtime": 56370.7744, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.204848484848485, + "grad_norm": 0.003786789020523429, + "learning_rate": 6.325879954800714e-05, + "loss": 0.011894923634827137, + "num_input_tokens_seen": 113616688, + "step": 6938, + "train_runtime": 56378.8877, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.205454545454545, + "grad_norm": 0.006776104215532541, + "learning_rate": 6.324952742123871e-05, + "loss": 0.011331185698509216, + "num_input_tokens_seen": 113633064, + "step": 6939, + "train_runtime": 56387.0053, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.206060606060606, + "grad_norm": 0.007738078013062477, + "learning_rate": 6.32402548043961e-05, + "loss": 0.01325385831296444, + "num_input_tokens_seen": 113649440, + "step": 6940, + "train_runtime": 56395.1206, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.206666666666667, + "grad_norm": 0.0076738144271075726, + "learning_rate": 6.323098169782224e-05, + "loss": 0.012597830966114998, + "num_input_tokens_seen": 113665816, + "step": 6941, + "train_runtime": 56403.2392, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.207272727272727, + "grad_norm": 0.008525153622031212, + "learning_rate": 6.322170810186012e-05, + "loss": 0.01181227620691061, + "num_input_tokens_seen": 113682192, + "step": 6942, + "train_runtime": 56411.3583, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.207878787878788, + "grad_norm": 0.0017069973982870579, + "learning_rate": 6.321243401685276e-05, + "loss": 0.011385987512767315, + "num_input_tokens_seen": 113698568, + "step": 6943, + "train_runtime": 56419.474, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.208484848484849, + "grad_norm": 0.010382258333265781, + "learning_rate": 6.320315944314322e-05, + "loss": 0.01158243976533413, + "num_input_tokens_seen": 113714944, + "step": 6944, + "train_runtime": 56427.5932, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.209090909090909, + "grad_norm": 0.009311951696872711, + "learning_rate": 6.31938843810745e-05, + "loss": 0.01153288222849369, + "num_input_tokens_seen": 113731320, + "step": 6945, + "train_runtime": 56435.7107, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.20969696969697, + "grad_norm": 0.0072789303958415985, + "learning_rate": 6.318460883098972e-05, + "loss": 0.013572175055742264, + "num_input_tokens_seen": 113747696, + "step": 6946, + "train_runtime": 56443.8299, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.21030303030303, + "grad_norm": 0.0010107951238751411, + "learning_rate": 6.317533279323189e-05, + "loss": 0.012304224073886871, + "num_input_tokens_seen": 113764072, + "step": 6947, + "train_runtime": 56451.9451, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.210909090909091, + "grad_norm": 0.006396109703928232, + "learning_rate": 6.31660562681442e-05, + "loss": 0.01247490756213665, + "num_input_tokens_seen": 113780448, + "step": 6948, + "train_runtime": 56460.0626, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.211515151515152, + "grad_norm": 0.0077385250478982925, + "learning_rate": 6.31567792560697e-05, + "loss": 0.011569447815418243, + "num_input_tokens_seen": 113796824, + "step": 6949, + "train_runtime": 56468.1806, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.212121212121212, + "grad_norm": 0.010486487299203873, + "learning_rate": 6.314750175735158e-05, + "loss": 0.015078081749379635, + "num_input_tokens_seen": 113813200, + "step": 6950, + "train_runtime": 56476.2944, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.212727272727273, + "grad_norm": 0.007240509148687124, + "learning_rate": 6.313822377233296e-05, + "loss": 0.013657202944159508, + "num_input_tokens_seen": 113829576, + "step": 6951, + "train_runtime": 56484.4109, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.213333333333333, + "grad_norm": 0.005837657023221254, + "learning_rate": 6.312894530135702e-05, + "loss": 0.012736777774989605, + "num_input_tokens_seen": 113845952, + "step": 6952, + "train_runtime": 56492.5311, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.213939393939394, + "grad_norm": 0.0038744015619158745, + "learning_rate": 6.311966634476698e-05, + "loss": 0.011188481003046036, + "num_input_tokens_seen": 113862328, + "step": 6953, + "train_runtime": 56500.6445, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.214545454545455, + "grad_norm": 0.004136449657380581, + "learning_rate": 6.311038690290602e-05, + "loss": 0.01187935285270214, + "num_input_tokens_seen": 113878704, + "step": 6954, + "train_runtime": 56508.7584, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.215151515151515, + "grad_norm": 0.008448158390820026, + "learning_rate": 6.31011069761174e-05, + "loss": 0.012274873442947865, + "num_input_tokens_seen": 113895080, + "step": 6955, + "train_runtime": 56516.8748, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.215757575757576, + "grad_norm": 0.009429436177015305, + "learning_rate": 6.309182656474431e-05, + "loss": 0.011716051958501339, + "num_input_tokens_seen": 113911456, + "step": 6956, + "train_runtime": 56524.9914, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.216363636363637, + "grad_norm": 0.003401412395760417, + "learning_rate": 6.308254566913008e-05, + "loss": 0.011963395401835442, + "num_input_tokens_seen": 113927832, + "step": 6957, + "train_runtime": 56533.1085, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.216969696969697, + "grad_norm": 0.00571412593126297, + "learning_rate": 6.307326428961794e-05, + "loss": 0.011297233402729034, + "num_input_tokens_seen": 113944208, + "step": 6958, + "train_runtime": 56541.2211, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.217575757575758, + "grad_norm": 0.007147027645260096, + "learning_rate": 6.306398242655123e-05, + "loss": 0.012589799240231514, + "num_input_tokens_seen": 113960584, + "step": 6959, + "train_runtime": 56549.3369, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.218181818181818, + "grad_norm": 0.009451452642679214, + "learning_rate": 6.305470008027325e-05, + "loss": 0.013604702427983284, + "num_input_tokens_seen": 113976960, + "step": 6960, + "train_runtime": 56557.457, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.218787878787879, + "grad_norm": 0.0050103128887712955, + "learning_rate": 6.304541725112734e-05, + "loss": 0.011822208762168884, + "num_input_tokens_seen": 113993336, + "step": 6961, + "train_runtime": 56565.5728, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.21939393939394, + "grad_norm": 0.008943802677094936, + "learning_rate": 6.303613393945683e-05, + "loss": 0.011891870759427547, + "num_input_tokens_seen": 114009712, + "step": 6962, + "train_runtime": 56573.6892, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.22, + "grad_norm": 0.006873782258480787, + "learning_rate": 6.302685014560513e-05, + "loss": 0.012867502868175507, + "num_input_tokens_seen": 114026088, + "step": 6963, + "train_runtime": 56581.805, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.220606060606061, + "grad_norm": 0.006290758028626442, + "learning_rate": 6.30175658699156e-05, + "loss": 0.012314972467720509, + "num_input_tokens_seen": 114042464, + "step": 6964, + "train_runtime": 56589.9317, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.221212121212122, + "grad_norm": 0.005471923388540745, + "learning_rate": 6.300828111273169e-05, + "loss": 0.01145384181290865, + "num_input_tokens_seen": 114058840, + "step": 6965, + "train_runtime": 56598.0504, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.221818181818182, + "grad_norm": 0.00581837585195899, + "learning_rate": 6.299899587439675e-05, + "loss": 0.011366224847733974, + "num_input_tokens_seen": 114075216, + "step": 6966, + "train_runtime": 56606.1678, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.222424242424243, + "grad_norm": 0.005598251707851887, + "learning_rate": 6.29897101552543e-05, + "loss": 0.011897770687937737, + "num_input_tokens_seen": 114091592, + "step": 6967, + "train_runtime": 56614.2843, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.223030303030303, + "grad_norm": 0.00859918911010027, + "learning_rate": 6.298042395564775e-05, + "loss": 0.010953141376376152, + "num_input_tokens_seen": 114107968, + "step": 6968, + "train_runtime": 56622.399, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.223636363636364, + "grad_norm": 0.006870792713016272, + "learning_rate": 6.297113727592062e-05, + "loss": 0.011282357387244701, + "num_input_tokens_seen": 114124344, + "step": 6969, + "train_runtime": 56630.5136, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.224242424242425, + "grad_norm": 0.008055748417973518, + "learning_rate": 6.296185011641634e-05, + "loss": 0.012273905798792839, + "num_input_tokens_seen": 114140720, + "step": 6970, + "train_runtime": 56638.6304, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.224848484848485, + "grad_norm": 0.009190967306494713, + "learning_rate": 6.29525624774785e-05, + "loss": 0.012722629122436047, + "num_input_tokens_seen": 114157096, + "step": 6971, + "train_runtime": 56646.7458, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.225454545454546, + "grad_norm": 0.009504413232207298, + "learning_rate": 6.294327435945059e-05, + "loss": 0.011953562498092651, + "num_input_tokens_seen": 114173472, + "step": 6972, + "train_runtime": 56654.8603, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.2260606060606065, + "grad_norm": 0.009004352614283562, + "learning_rate": 6.293398576267616e-05, + "loss": 0.011462122201919556, + "num_input_tokens_seen": 114189848, + "step": 6973, + "train_runtime": 56662.9731, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.226666666666667, + "grad_norm": 0.007056871894747019, + "learning_rate": 6.292469668749878e-05, + "loss": 0.012293221428990364, + "num_input_tokens_seen": 114206224, + "step": 6974, + "train_runtime": 56671.0909, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.2272727272727275, + "grad_norm": 0.004407619591802359, + "learning_rate": 6.291540713426206e-05, + "loss": 0.01103981677442789, + "num_input_tokens_seen": 114222600, + "step": 6975, + "train_runtime": 56679.2082, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.227878787878788, + "grad_norm": 0.012969191186130047, + "learning_rate": 6.290611710330956e-05, + "loss": 0.013793570920825005, + "num_input_tokens_seen": 114238976, + "step": 6976, + "train_runtime": 56687.3306, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.2284848484848485, + "grad_norm": 0.010489287786185741, + "learning_rate": 6.289682659498495e-05, + "loss": 0.01159415952861309, + "num_input_tokens_seen": 114255352, + "step": 6977, + "train_runtime": 56695.4438, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.2290909090909095, + "grad_norm": 0.003411774057894945, + "learning_rate": 6.288753560963182e-05, + "loss": 0.011920591816306114, + "num_input_tokens_seen": 114271728, + "step": 6978, + "train_runtime": 56703.583, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.2296969696969695, + "grad_norm": 0.012952803634107113, + "learning_rate": 6.287824414759384e-05, + "loss": 0.012558434158563614, + "num_input_tokens_seen": 114288104, + "step": 6979, + "train_runtime": 56711.7011, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.2303030303030305, + "grad_norm": 0.0038320599123835564, + "learning_rate": 6.28689522092147e-05, + "loss": 0.012165351770818233, + "num_input_tokens_seen": 114304480, + "step": 6980, + "train_runtime": 56719.8145, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.2309090909090905, + "grad_norm": 0.019258933141827583, + "learning_rate": 6.285965979483807e-05, + "loss": 0.012608307413756847, + "num_input_tokens_seen": 114320856, + "step": 6981, + "train_runtime": 56727.9342, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.2315151515151515, + "grad_norm": 0.004359701182693243, + "learning_rate": 6.285036690480768e-05, + "loss": 0.010833397507667542, + "num_input_tokens_seen": 114337232, + "step": 6982, + "train_runtime": 56736.0529, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.232121212121212, + "grad_norm": 0.009556837379932404, + "learning_rate": 6.284107353946723e-05, + "loss": 0.014237478375434875, + "num_input_tokens_seen": 114353608, + "step": 6983, + "train_runtime": 56744.1653, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.2327272727272724, + "grad_norm": 0.007545378990471363, + "learning_rate": 6.283177969916048e-05, + "loss": 0.012095949612557888, + "num_input_tokens_seen": 114369984, + "step": 6984, + "train_runtime": 56752.2786, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.233333333333333, + "grad_norm": 0.010570069774985313, + "learning_rate": 6.282248538423118e-05, + "loss": 0.011665193364024162, + "num_input_tokens_seen": 114386360, + "step": 6985, + "train_runtime": 56760.3959, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.233939393939394, + "grad_norm": 0.006071086041629314, + "learning_rate": 6.281319059502313e-05, + "loss": 0.011309342458844185, + "num_input_tokens_seen": 114402736, + "step": 6986, + "train_runtime": 56768.5151, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.234545454545454, + "grad_norm": 0.007659859023988247, + "learning_rate": 6.280389533188012e-05, + "loss": 0.012674129568040371, + "num_input_tokens_seen": 114419112, + "step": 6987, + "train_runtime": 56776.6331, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.235151515151515, + "grad_norm": 0.0076020280830562115, + "learning_rate": 6.279459959514594e-05, + "loss": 0.012313363142311573, + "num_input_tokens_seen": 114435488, + "step": 6988, + "train_runtime": 56784.7467, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.235757575757575, + "grad_norm": 0.007304595783352852, + "learning_rate": 6.278530338516445e-05, + "loss": 0.011973433196544647, + "num_input_tokens_seen": 114451864, + "step": 6989, + "train_runtime": 56792.8635, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.236363636363636, + "grad_norm": 0.00873358454555273, + "learning_rate": 6.277600670227946e-05, + "loss": 0.01197432167828083, + "num_input_tokens_seen": 114468240, + "step": 6990, + "train_runtime": 56800.9797, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.236969696969697, + "grad_norm": 0.007471440825611353, + "learning_rate": 6.276670954683489e-05, + "loss": 0.012459768913686275, + "num_input_tokens_seen": 114484616, + "step": 6991, + "train_runtime": 56809.0968, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.237575757575757, + "grad_norm": 0.006253303028643131, + "learning_rate": 6.275741191917459e-05, + "loss": 0.012189668603241444, + "num_input_tokens_seen": 114500992, + "step": 6992, + "train_runtime": 56817.2114, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.238181818181818, + "grad_norm": 0.012081111781299114, + "learning_rate": 6.274811381964245e-05, + "loss": 0.013250622898340225, + "num_input_tokens_seen": 114517368, + "step": 6993, + "train_runtime": 56825.3327, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.238787878787879, + "grad_norm": 0.007617360446602106, + "learning_rate": 6.273881524858242e-05, + "loss": 0.011512791737914085, + "num_input_tokens_seen": 114533744, + "step": 6994, + "train_runtime": 56833.4458, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.239393939393939, + "grad_norm": 0.0063877408392727375, + "learning_rate": 6.272951620633842e-05, + "loss": 0.012485964223742485, + "num_input_tokens_seen": 114550120, + "step": 6995, + "train_runtime": 56841.5624, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.24, + "grad_norm": 0.003952125087380409, + "learning_rate": 6.272021669325441e-05, + "loss": 0.010304677300155163, + "num_input_tokens_seen": 114566496, + "step": 6996, + "train_runtime": 56849.6815, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.24060606060606, + "grad_norm": 0.008169122971594334, + "learning_rate": 6.271091670967436e-05, + "loss": 0.013117880560457706, + "num_input_tokens_seen": 114582872, + "step": 6997, + "train_runtime": 56857.7947, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.241212121212121, + "grad_norm": 0.005350527353584766, + "learning_rate": 6.270161625594224e-05, + "loss": 0.012066859751939774, + "num_input_tokens_seen": 114599248, + "step": 6998, + "train_runtime": 56865.9079, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.241818181818182, + "grad_norm": 0.006741023622453213, + "learning_rate": 6.269231533240208e-05, + "loss": 0.010940724052488804, + "num_input_tokens_seen": 114615624, + "step": 6999, + "train_runtime": 56874.0182, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.242424242424242, + "grad_norm": 0.00875039678066969, + "learning_rate": 6.268301393939788e-05, + "loss": 0.011783924885094166, + "num_input_tokens_seen": 114632000, + "step": 7000, + "train_runtime": 56882.1314, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.243030303030303, + "grad_norm": 0.008273002691566944, + "learning_rate": 6.26737120772737e-05, + "loss": 0.010525410063564777, + "num_input_tokens_seen": 114648376, + "step": 7001, + "train_runtime": 56891.2734, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.243636363636363, + "grad_norm": 0.019098544493317604, + "learning_rate": 6.26644097463736e-05, + "loss": 0.011429384350776672, + "num_input_tokens_seen": 114664752, + "step": 7002, + "train_runtime": 56899.3873, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.244242424242424, + "grad_norm": 0.00866622757166624, + "learning_rate": 6.265510694704164e-05, + "loss": 0.012530849315226078, + "num_input_tokens_seen": 114681128, + "step": 7003, + "train_runtime": 56907.5001, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.244848484848485, + "grad_norm": 0.005355302710086107, + "learning_rate": 6.264580367962191e-05, + "loss": 0.011091120541095734, + "num_input_tokens_seen": 114697504, + "step": 7004, + "train_runtime": 56915.6146, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.245454545454545, + "grad_norm": 0.011985565535724163, + "learning_rate": 6.263649994445855e-05, + "loss": 0.012584619224071503, + "num_input_tokens_seen": 114713880, + "step": 7005, + "train_runtime": 56923.7306, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.246060606060606, + "grad_norm": 0.007521754130721092, + "learning_rate": 6.262719574189564e-05, + "loss": 0.012445366010069847, + "num_input_tokens_seen": 114730256, + "step": 7006, + "train_runtime": 56931.8473, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.246666666666667, + "grad_norm": 0.004379868041723967, + "learning_rate": 6.261789107227737e-05, + "loss": 0.011506016366183758, + "num_input_tokens_seen": 114746632, + "step": 7007, + "train_runtime": 56939.963, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.247272727272727, + "grad_norm": 0.010419159196317196, + "learning_rate": 6.260858593594787e-05, + "loss": 0.01151613611727953, + "num_input_tokens_seen": 114763008, + "step": 7008, + "train_runtime": 56948.0785, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.247878787878788, + "grad_norm": 0.007216797675937414, + "learning_rate": 6.259928033325134e-05, + "loss": 0.012054355815052986, + "num_input_tokens_seen": 114779384, + "step": 7009, + "train_runtime": 56956.1961, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.248484848484848, + "grad_norm": 0.007438914850354195, + "learning_rate": 6.258997426453194e-05, + "loss": 0.012344637885689735, + "num_input_tokens_seen": 114795760, + "step": 7010, + "train_runtime": 56964.3112, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.249090909090909, + "grad_norm": 0.0074223605915904045, + "learning_rate": 6.258066773013395e-05, + "loss": 0.012204492464661598, + "num_input_tokens_seen": 114812136, + "step": 7011, + "train_runtime": 56972.4306, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.24969696969697, + "grad_norm": 0.005668258760124445, + "learning_rate": 6.257136073040153e-05, + "loss": 0.011727429926395416, + "num_input_tokens_seen": 114828512, + "step": 7012, + "train_runtime": 56980.5473, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.25030303030303, + "grad_norm": 0.005656961817294359, + "learning_rate": 6.256205326567897e-05, + "loss": 0.011963551864027977, + "num_input_tokens_seen": 114844888, + "step": 7013, + "train_runtime": 56988.6602, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.250909090909091, + "grad_norm": 0.00523189315572381, + "learning_rate": 6.255274533631053e-05, + "loss": 0.011732065118849277, + "num_input_tokens_seen": 114861264, + "step": 7014, + "train_runtime": 56996.7731, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.251515151515152, + "grad_norm": 0.00956094078719616, + "learning_rate": 6.254343694264046e-05, + "loss": 0.01144226361066103, + "num_input_tokens_seen": 114877640, + "step": 7015, + "train_runtime": 57004.8845, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.252121212121212, + "grad_norm": 0.013415546156466007, + "learning_rate": 6.253412808501312e-05, + "loss": 0.0110838757827878, + "num_input_tokens_seen": 114894016, + "step": 7016, + "train_runtime": 57013.0005, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.252727272727273, + "grad_norm": 0.009008154273033142, + "learning_rate": 6.252481876377276e-05, + "loss": 0.012268247082829475, + "num_input_tokens_seen": 114910392, + "step": 7017, + "train_runtime": 57021.1116, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.253333333333333, + "grad_norm": 0.010166886262595654, + "learning_rate": 6.251550897926376e-05, + "loss": 0.01331732701510191, + "num_input_tokens_seen": 114926768, + "step": 7018, + "train_runtime": 57029.2298, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.253939393939394, + "grad_norm": 0.009320500306785107, + "learning_rate": 6.250619873183046e-05, + "loss": 0.011530434712767601, + "num_input_tokens_seen": 114943144, + "step": 7019, + "train_runtime": 57037.3461, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.254545454545455, + "grad_norm": 0.005053642205893993, + "learning_rate": 6.249688802181722e-05, + "loss": 0.012037829495966434, + "num_input_tokens_seen": 114959520, + "step": 7020, + "train_runtime": 57045.4596, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.255151515151515, + "grad_norm": 0.006445298902690411, + "learning_rate": 6.248757684956842e-05, + "loss": 0.012320589274168015, + "num_input_tokens_seen": 114975896, + "step": 7021, + "train_runtime": 57053.5741, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.255757575757576, + "grad_norm": 0.006640312727540731, + "learning_rate": 6.247826521542848e-05, + "loss": 0.01181582547724247, + "num_input_tokens_seen": 114992272, + "step": 7022, + "train_runtime": 57061.6924, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.256363636363636, + "grad_norm": 0.008987942710518837, + "learning_rate": 6.246895311974181e-05, + "loss": 0.012465059757232666, + "num_input_tokens_seen": 115008648, + "step": 7023, + "train_runtime": 57069.8096, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.256969696969697, + "grad_norm": 0.004419003613293171, + "learning_rate": 6.245964056285283e-05, + "loss": 0.011583373881876469, + "num_input_tokens_seen": 115025024, + "step": 7024, + "train_runtime": 57077.9306, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.257575757575758, + "grad_norm": 0.005611793138086796, + "learning_rate": 6.245032754510603e-05, + "loss": 0.012189830653369427, + "num_input_tokens_seen": 115041400, + "step": 7025, + "train_runtime": 57086.0443, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.258181818181818, + "grad_norm": 0.006690877489745617, + "learning_rate": 6.244101406684585e-05, + "loss": 0.010996975935995579, + "num_input_tokens_seen": 115057776, + "step": 7026, + "train_runtime": 57094.1605, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.258787878787879, + "grad_norm": 0.009129433892667294, + "learning_rate": 6.243170012841679e-05, + "loss": 0.012605855241417885, + "num_input_tokens_seen": 115074152, + "step": 7027, + "train_runtime": 57102.2756, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.25939393939394, + "grad_norm": 0.0068779378198087215, + "learning_rate": 6.242238573016335e-05, + "loss": 0.011709502898156643, + "num_input_tokens_seen": 115090528, + "step": 7028, + "train_runtime": 57110.3885, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.26, + "grad_norm": 0.003739926964044571, + "learning_rate": 6.241307087243006e-05, + "loss": 0.011604883708059788, + "num_input_tokens_seen": 115106904, + "step": 7029, + "train_runtime": 57118.5094, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.260606060606061, + "grad_norm": 0.00747557170689106, + "learning_rate": 6.240375555556145e-05, + "loss": 0.012814539484679699, + "num_input_tokens_seen": 115123280, + "step": 7030, + "train_runtime": 57126.6305, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.261212121212122, + "grad_norm": 0.009239137172698975, + "learning_rate": 6.239443977990206e-05, + "loss": 0.012747636064887047, + "num_input_tokens_seen": 115139656, + "step": 7031, + "train_runtime": 57134.7486, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.261818181818182, + "grad_norm": 0.004802403040230274, + "learning_rate": 6.238512354579651e-05, + "loss": 0.011349259875714779, + "num_input_tokens_seen": 115156032, + "step": 7032, + "train_runtime": 57142.8666, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.262424242424243, + "grad_norm": 0.007698389235883951, + "learning_rate": 6.237580685358934e-05, + "loss": 0.011988100595772266, + "num_input_tokens_seen": 115172408, + "step": 7033, + "train_runtime": 57150.9835, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.263030303030303, + "grad_norm": 0.006380777806043625, + "learning_rate": 6.236648970362518e-05, + "loss": 0.012665089219808578, + "num_input_tokens_seen": 115188784, + "step": 7034, + "train_runtime": 57159.0946, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.263636363636364, + "grad_norm": 0.0045127542689442635, + "learning_rate": 6.235717209624864e-05, + "loss": 0.011683829128742218, + "num_input_tokens_seen": 115205160, + "step": 7035, + "train_runtime": 57167.2074, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.264242424242425, + "grad_norm": 0.005375554785132408, + "learning_rate": 6.234785403180437e-05, + "loss": 0.012305976822972298, + "num_input_tokens_seen": 115221536, + "step": 7036, + "train_runtime": 57175.3213, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.264848484848485, + "grad_norm": 0.0066563137806952, + "learning_rate": 6.233853551063704e-05, + "loss": 0.012018634006381035, + "num_input_tokens_seen": 115237912, + "step": 7037, + "train_runtime": 57183.4405, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.265454545454546, + "grad_norm": 0.006394990254193544, + "learning_rate": 6.23292165330913e-05, + "loss": 0.011943633668124676, + "num_input_tokens_seen": 115254288, + "step": 7038, + "train_runtime": 57191.5552, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.266060606060606, + "grad_norm": 0.007291150279343128, + "learning_rate": 6.231989709951185e-05, + "loss": 0.012094919569790363, + "num_input_tokens_seen": 115270664, + "step": 7039, + "train_runtime": 57199.6694, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.266666666666667, + "grad_norm": 0.006792329717427492, + "learning_rate": 6.231057721024339e-05, + "loss": 0.011991500854492188, + "num_input_tokens_seen": 115287040, + "step": 7040, + "train_runtime": 57207.7875, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.2672727272727276, + "grad_norm": 0.0001569887244841084, + "learning_rate": 6.230125686563068e-05, + "loss": 0.011343223974108696, + "num_input_tokens_seen": 115303416, + "step": 7041, + "train_runtime": 57215.9031, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.267878787878788, + "grad_norm": 0.0052423798479139805, + "learning_rate": 6.22919360660184e-05, + "loss": 0.010923538357019424, + "num_input_tokens_seen": 115319792, + "step": 7042, + "train_runtime": 57224.0159, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.2684848484848485, + "grad_norm": 0.003345941659063101, + "learning_rate": 6.228261481175137e-05, + "loss": 0.011419293470680714, + "num_input_tokens_seen": 115336168, + "step": 7043, + "train_runtime": 57232.1347, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.2690909090909095, + "grad_norm": 0.0017331767594441772, + "learning_rate": 6.227329310317432e-05, + "loss": 0.011972655542194843, + "num_input_tokens_seen": 115352544, + "step": 7044, + "train_runtime": 57240.2514, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.2696969696969695, + "grad_norm": 0.022322241216897964, + "learning_rate": 6.226397094063206e-05, + "loss": 0.01238132081925869, + "num_input_tokens_seen": 115368920, + "step": 7045, + "train_runtime": 57248.3745, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.2703030303030305, + "grad_norm": 0.006001575384289026, + "learning_rate": 6.225464832446942e-05, + "loss": 0.012040253728628159, + "num_input_tokens_seen": 115385296, + "step": 7046, + "train_runtime": 57256.4907, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.2709090909090905, + "grad_norm": 0.007208582013845444, + "learning_rate": 6.224532525503119e-05, + "loss": 0.011571655981242657, + "num_input_tokens_seen": 115401672, + "step": 7047, + "train_runtime": 57264.6047, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.2715151515151515, + "grad_norm": 0.004761922173202038, + "learning_rate": 6.223600173266221e-05, + "loss": 0.012444381602108479, + "num_input_tokens_seen": 115418048, + "step": 7048, + "train_runtime": 57272.7194, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.272121212121212, + "grad_norm": 0.006144651677459478, + "learning_rate": 6.222667775770736e-05, + "loss": 0.012797334231436253, + "num_input_tokens_seen": 115434424, + "step": 7049, + "train_runtime": 57280.8355, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.2727272727272725, + "grad_norm": 0.0021460603456944227, + "learning_rate": 6.221735333051153e-05, + "loss": 0.011058458127081394, + "num_input_tokens_seen": 115450800, + "step": 7050, + "train_runtime": 57288.948, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.273333333333333, + "grad_norm": 0.003946735057979822, + "learning_rate": 6.220802845141958e-05, + "loss": 0.011460155248641968, + "num_input_tokens_seen": 115467176, + "step": 7051, + "train_runtime": 57297.0608, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.2739393939393935, + "grad_norm": 0.005678087007254362, + "learning_rate": 6.219870312077642e-05, + "loss": 0.011610078625380993, + "num_input_tokens_seen": 115483552, + "step": 7052, + "train_runtime": 57305.1748, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.274545454545454, + "grad_norm": 0.008718382567167282, + "learning_rate": 6.2189377338927e-05, + "loss": 0.011915487237274647, + "num_input_tokens_seen": 115499928, + "step": 7053, + "train_runtime": 57313.2905, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.275151515151515, + "grad_norm": 0.006730965804308653, + "learning_rate": 6.218005110621625e-05, + "loss": 0.012607129290699959, + "num_input_tokens_seen": 115516304, + "step": 7054, + "train_runtime": 57321.4067, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.275757575757575, + "grad_norm": 0.008321049623191357, + "learning_rate": 6.217072442298913e-05, + "loss": 0.012712669558823109, + "num_input_tokens_seen": 115532680, + "step": 7055, + "train_runtime": 57329.5317, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.276363636363636, + "grad_norm": 0.008415148593485355, + "learning_rate": 6.21613972895906e-05, + "loss": 0.012447581626474857, + "num_input_tokens_seen": 115549056, + "step": 7056, + "train_runtime": 57337.6464, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.276969696969697, + "grad_norm": 0.01083427481353283, + "learning_rate": 6.215206970636569e-05, + "loss": 0.011994468048214912, + "num_input_tokens_seen": 115565432, + "step": 7057, + "train_runtime": 57345.7608, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.277575757575757, + "grad_norm": 0.007787155918776989, + "learning_rate": 6.214274167365937e-05, + "loss": 0.012173672206699848, + "num_input_tokens_seen": 115581808, + "step": 7058, + "train_runtime": 57353.8774, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.278181818181818, + "grad_norm": 0.008098658174276352, + "learning_rate": 6.213341319181671e-05, + "loss": 0.012648637406527996, + "num_input_tokens_seen": 115598184, + "step": 7059, + "train_runtime": 57361.9941, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.278787878787879, + "grad_norm": 0.007351825479418039, + "learning_rate": 6.21240842611827e-05, + "loss": 0.011899099685251713, + "num_input_tokens_seen": 115614560, + "step": 7060, + "train_runtime": 57370.1161, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.279393939393939, + "grad_norm": 0.004098340403288603, + "learning_rate": 6.211475488210242e-05, + "loss": 0.011624746955931187, + "num_input_tokens_seen": 115630936, + "step": 7061, + "train_runtime": 57378.2319, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.28, + "grad_norm": 0.013220075517892838, + "learning_rate": 6.210542505492094e-05, + "loss": 0.012292067520320415, + "num_input_tokens_seen": 115647312, + "step": 7062, + "train_runtime": 57386.3461, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.28060606060606, + "grad_norm": 0.004186710342764854, + "learning_rate": 6.209609477998338e-05, + "loss": 0.011074480600655079, + "num_input_tokens_seen": 115663688, + "step": 7063, + "train_runtime": 57394.4622, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.281212121212121, + "grad_norm": 0.009804705157876015, + "learning_rate": 6.208676405763484e-05, + "loss": 0.01257538516074419, + "num_input_tokens_seen": 115680064, + "step": 7064, + "train_runtime": 57402.5744, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.281818181818182, + "grad_norm": 0.04843483492732048, + "learning_rate": 6.207743288822042e-05, + "loss": 0.012642706744372845, + "num_input_tokens_seen": 115696440, + "step": 7065, + "train_runtime": 57410.6865, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.282424242424242, + "grad_norm": 0.0038906498812139034, + "learning_rate": 6.206810127208528e-05, + "loss": 0.011654186062514782, + "num_input_tokens_seen": 115712816, + "step": 7066, + "train_runtime": 57418.8023, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.283030303030303, + "grad_norm": 0.003161244560033083, + "learning_rate": 6.205876920957457e-05, + "loss": 0.012299271300435066, + "num_input_tokens_seen": 115729192, + "step": 7067, + "train_runtime": 57426.9159, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.283636363636363, + "grad_norm": 0.007809331640601158, + "learning_rate": 6.204943670103349e-05, + "loss": 0.012160527519881725, + "num_input_tokens_seen": 115745568, + "step": 7068, + "train_runtime": 57435.0314, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.284242424242424, + "grad_norm": 0.010698672384023666, + "learning_rate": 6.20401037468072e-05, + "loss": 0.01363366935402155, + "num_input_tokens_seen": 115761944, + "step": 7069, + "train_runtime": 57443.1488, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.284848484848485, + "grad_norm": 0.006482341326773167, + "learning_rate": 6.203077034724091e-05, + "loss": 0.01233337726444006, + "num_input_tokens_seen": 115778320, + "step": 7070, + "train_runtime": 57451.2687, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.285454545454545, + "grad_norm": 0.007693901192396879, + "learning_rate": 6.202143650267986e-05, + "loss": 0.01125475112348795, + "num_input_tokens_seen": 115794696, + "step": 7071, + "train_runtime": 57459.3794, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.286060606060606, + "grad_norm": 0.009733419865369797, + "learning_rate": 6.20121022134693e-05, + "loss": 0.0127379996702075, + "num_input_tokens_seen": 115811072, + "step": 7072, + "train_runtime": 57467.4898, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.286666666666667, + "grad_norm": 0.006343206390738487, + "learning_rate": 6.200276747995445e-05, + "loss": 0.011312388814985752, + "num_input_tokens_seen": 115827448, + "step": 7073, + "train_runtime": 57475.6049, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.287272727272727, + "grad_norm": 0.010990336537361145, + "learning_rate": 6.199343230248061e-05, + "loss": 0.012799900956451893, + "num_input_tokens_seen": 115843824, + "step": 7074, + "train_runtime": 57483.7295, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.287878787878788, + "grad_norm": 0.03686479479074478, + "learning_rate": 6.198409668139307e-05, + "loss": 0.013017743825912476, + "num_input_tokens_seen": 115860200, + "step": 7075, + "train_runtime": 57491.8413, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.288484848484848, + "grad_norm": 0.008655303157866001, + "learning_rate": 6.197476061703713e-05, + "loss": 0.012335910461843014, + "num_input_tokens_seen": 115876576, + "step": 7076, + "train_runtime": 57499.9591, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.289090909090909, + "grad_norm": 0.004752609878778458, + "learning_rate": 6.196542410975812e-05, + "loss": 0.01184118539094925, + "num_input_tokens_seen": 115892952, + "step": 7077, + "train_runtime": 57508.0779, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.28969696969697, + "grad_norm": 0.002925660228356719, + "learning_rate": 6.195608715990137e-05, + "loss": 0.01249387301504612, + "num_input_tokens_seen": 115909328, + "step": 7078, + "train_runtime": 57516.1959, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.29030303030303, + "grad_norm": 0.009888779371976852, + "learning_rate": 6.194674976781223e-05, + "loss": 0.0119596216827631, + "num_input_tokens_seen": 115925704, + "step": 7079, + "train_runtime": 57524.3118, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.290909090909091, + "grad_norm": 0.0055937208235263824, + "learning_rate": 6.193741193383608e-05, + "loss": 0.012585856020450592, + "num_input_tokens_seen": 115942080, + "step": 7080, + "train_runtime": 57532.43, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.291515151515151, + "grad_norm": 0.0037546379026025534, + "learning_rate": 6.192807365831833e-05, + "loss": 0.011060674674808979, + "num_input_tokens_seen": 115958456, + "step": 7081, + "train_runtime": 57540.5481, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.292121212121212, + "grad_norm": 0.010672503150999546, + "learning_rate": 6.191873494160436e-05, + "loss": 0.013187172822654247, + "num_input_tokens_seen": 115974832, + "step": 7082, + "train_runtime": 57548.6641, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.292727272727273, + "grad_norm": 0.005368201527744532, + "learning_rate": 6.190939578403958e-05, + "loss": 0.010882314294576645, + "num_input_tokens_seen": 115991208, + "step": 7083, + "train_runtime": 57556.7822, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.293333333333333, + "grad_norm": 0.005029626656323671, + "learning_rate": 6.190005618596945e-05, + "loss": 0.01125819981098175, + "num_input_tokens_seen": 116007584, + "step": 7084, + "train_runtime": 57564.8999, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.293939393939394, + "grad_norm": 0.00843080598860979, + "learning_rate": 6.189071614773941e-05, + "loss": 0.012240355834364891, + "num_input_tokens_seen": 116023960, + "step": 7085, + "train_runtime": 57573.0215, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.294545454545455, + "grad_norm": 0.0074979402124881744, + "learning_rate": 6.188137566969495e-05, + "loss": 0.012528466992080212, + "num_input_tokens_seen": 116040336, + "step": 7086, + "train_runtime": 57581.1349, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.295151515151515, + "grad_norm": 0.006487895268946886, + "learning_rate": 6.187203475218154e-05, + "loss": 0.0109635004773736, + "num_input_tokens_seen": 116056712, + "step": 7087, + "train_runtime": 57589.2486, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.295757575757576, + "grad_norm": 0.004345133434981108, + "learning_rate": 6.186269339554467e-05, + "loss": 0.011105494573712349, + "num_input_tokens_seen": 116073088, + "step": 7088, + "train_runtime": 57597.3659, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.296363636363637, + "grad_norm": 0.01352035254240036, + "learning_rate": 6.185335160012989e-05, + "loss": 0.013007155619561672, + "num_input_tokens_seen": 116089464, + "step": 7089, + "train_runtime": 57605.4857, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.296969696969697, + "grad_norm": 0.01189270056784153, + "learning_rate": 6.184400936628269e-05, + "loss": 0.012472981587052345, + "num_input_tokens_seen": 116105840, + "step": 7090, + "train_runtime": 57613.6008, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.297575757575758, + "grad_norm": 0.001196831464767456, + "learning_rate": 6.183466669434869e-05, + "loss": 0.011495397426187992, + "num_input_tokens_seen": 116122216, + "step": 7091, + "train_runtime": 57621.7133, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.298181818181818, + "grad_norm": 0.0077866786159574986, + "learning_rate": 6.182532358467338e-05, + "loss": 0.012848013080656528, + "num_input_tokens_seen": 116138592, + "step": 7092, + "train_runtime": 57629.8344, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.298787878787879, + "grad_norm": 0.007301247213035822, + "learning_rate": 6.18159800376024e-05, + "loss": 0.012300428003072739, + "num_input_tokens_seen": 116154968, + "step": 7093, + "train_runtime": 57637.9479, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.29939393939394, + "grad_norm": 0.004718161188066006, + "learning_rate": 6.180663605348131e-05, + "loss": 0.01072913408279419, + "num_input_tokens_seen": 116171344, + "step": 7094, + "train_runtime": 57646.0575, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.3, + "grad_norm": 0.006432102993130684, + "learning_rate": 6.179729163265576e-05, + "loss": 0.01131734810769558, + "num_input_tokens_seen": 116187720, + "step": 7095, + "train_runtime": 57654.1687, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.300606060606061, + "grad_norm": 0.00925859622657299, + "learning_rate": 6.178794677547137e-05, + "loss": 0.011475511826574802, + "num_input_tokens_seen": 116204096, + "step": 7096, + "train_runtime": 57662.2868, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.301212121212121, + "grad_norm": 0.020881356671452522, + "learning_rate": 6.177860148227378e-05, + "loss": 0.013252904638648033, + "num_input_tokens_seen": 116220472, + "step": 7097, + "train_runtime": 57670.4053, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.301818181818182, + "grad_norm": 0.009923583827912807, + "learning_rate": 6.176925575340866e-05, + "loss": 0.01288150716573, + "num_input_tokens_seen": 116236848, + "step": 7098, + "train_runtime": 57678.5224, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.302424242424243, + "grad_norm": 0.01202578004449606, + "learning_rate": 6.175990958922168e-05, + "loss": 0.01247438881546259, + "num_input_tokens_seen": 116253224, + "step": 7099, + "train_runtime": 57686.6351, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.303030303030303, + "grad_norm": 0.007544725202023983, + "learning_rate": 6.175056299005854e-05, + "loss": 0.01285773329436779, + "num_input_tokens_seen": 116269600, + "step": 7100, + "train_runtime": 57694.7462, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.303636363636364, + "grad_norm": 0.006852264516055584, + "learning_rate": 6.174121595626499e-05, + "loss": 0.012255582958459854, + "num_input_tokens_seen": 116285976, + "step": 7101, + "train_runtime": 57703.8151, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.304242424242425, + "grad_norm": 0.006080076564103365, + "learning_rate": 6.173186848818669e-05, + "loss": 0.01297931931912899, + "num_input_tokens_seen": 116302352, + "step": 7102, + "train_runtime": 57711.9298, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.304848484848485, + "grad_norm": 0.010080605745315552, + "learning_rate": 6.172252058616944e-05, + "loss": 0.01262187771499157, + "num_input_tokens_seen": 116318728, + "step": 7103, + "train_runtime": 57720.0417, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.305454545454546, + "grad_norm": 0.009907773695886135, + "learning_rate": 6.171317225055899e-05, + "loss": 0.011983519420027733, + "num_input_tokens_seen": 116335104, + "step": 7104, + "train_runtime": 57728.156, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.306060606060606, + "grad_norm": 0.00731120677664876, + "learning_rate": 6.17038234817011e-05, + "loss": 0.010787931270897388, + "num_input_tokens_seen": 116351480, + "step": 7105, + "train_runtime": 57736.2676, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.306666666666667, + "grad_norm": 0.004306876566261053, + "learning_rate": 6.169447427994156e-05, + "loss": 0.012032481841742992, + "num_input_tokens_seen": 116367856, + "step": 7106, + "train_runtime": 57744.3803, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.307272727272728, + "grad_norm": 0.005444811191409826, + "learning_rate": 6.168512464562621e-05, + "loss": 0.011895950883626938, + "num_input_tokens_seen": 116384232, + "step": 7107, + "train_runtime": 57752.4928, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.307878787878788, + "grad_norm": 0.005840490106493235, + "learning_rate": 6.167577457910083e-05, + "loss": 0.011669824831187725, + "num_input_tokens_seen": 116400608, + "step": 7108, + "train_runtime": 57760.6047, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.308484848484849, + "grad_norm": 0.007028792519122362, + "learning_rate": 6.166642408071132e-05, + "loss": 0.012772906571626663, + "num_input_tokens_seen": 116416984, + "step": 7109, + "train_runtime": 57768.7206, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.309090909090909, + "grad_norm": 0.011694235727190971, + "learning_rate": 6.165707315080349e-05, + "loss": 0.012639481574296951, + "num_input_tokens_seen": 116433360, + "step": 7110, + "train_runtime": 57776.839, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.30969696969697, + "grad_norm": 0.003264394821599126, + "learning_rate": 6.16477217897232e-05, + "loss": 0.010976719669997692, + "num_input_tokens_seen": 116449736, + "step": 7111, + "train_runtime": 57784.9518, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.3103030303030305, + "grad_norm": 0.008053337223827839, + "learning_rate": 6.16383699978164e-05, + "loss": 0.012405035085976124, + "num_input_tokens_seen": 116466112, + "step": 7112, + "train_runtime": 57793.0715, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.310909090909091, + "grad_norm": 0.004885806702077389, + "learning_rate": 6.162901777542893e-05, + "loss": 0.011873934417963028, + "num_input_tokens_seen": 116482488, + "step": 7113, + "train_runtime": 57801.1875, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.3115151515151515, + "grad_norm": 0.009304334409534931, + "learning_rate": 6.161966512290676e-05, + "loss": 0.012499305419623852, + "num_input_tokens_seen": 116498864, + "step": 7114, + "train_runtime": 57809.3039, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.3121212121212125, + "grad_norm": 0.008511301130056381, + "learning_rate": 6.16103120405958e-05, + "loss": 0.013281782157719135, + "num_input_tokens_seen": 116515240, + "step": 7115, + "train_runtime": 57817.4185, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.3127272727272725, + "grad_norm": 0.00979678425937891, + "learning_rate": 6.1600958528842e-05, + "loss": 0.012387886643409729, + "num_input_tokens_seen": 116531616, + "step": 7116, + "train_runtime": 57825.5357, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.3133333333333335, + "grad_norm": 0.00988131295889616, + "learning_rate": 6.159160458799134e-05, + "loss": 0.010953946970403194, + "num_input_tokens_seen": 116547992, + "step": 7117, + "train_runtime": 57833.6518, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.313939393939394, + "grad_norm": 0.0059661781415343285, + "learning_rate": 6.15822502183898e-05, + "loss": 0.012735003605484962, + "num_input_tokens_seen": 116564368, + "step": 7118, + "train_runtime": 57841.7688, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.3145454545454545, + "grad_norm": 0.005830070469528437, + "learning_rate": 6.157289542038337e-05, + "loss": 0.012090307660400867, + "num_input_tokens_seen": 116580744, + "step": 7119, + "train_runtime": 57849.883, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.315151515151515, + "grad_norm": 0.007775646634399891, + "learning_rate": 6.156354019431809e-05, + "loss": 0.013155699707567692, + "num_input_tokens_seen": 116597120, + "step": 7120, + "train_runtime": 57857.9974, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.3157575757575755, + "grad_norm": 0.007464043330401182, + "learning_rate": 6.155418454053997e-05, + "loss": 0.011528918519616127, + "num_input_tokens_seen": 116613496, + "step": 7121, + "train_runtime": 57866.1146, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.316363636363636, + "grad_norm": 0.008275194093585014, + "learning_rate": 6.154482845939506e-05, + "loss": 0.013133621774613857, + "num_input_tokens_seen": 116629872, + "step": 7122, + "train_runtime": 57874.2332, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.316969696969697, + "grad_norm": 0.010607543401420116, + "learning_rate": 6.153547195122945e-05, + "loss": 0.012760238721966743, + "num_input_tokens_seen": 116646248, + "step": 7123, + "train_runtime": 57882.3477, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.317575757575757, + "grad_norm": 0.005157352425158024, + "learning_rate": 6.152611501638919e-05, + "loss": 0.012125248089432716, + "num_input_tokens_seen": 116662624, + "step": 7124, + "train_runtime": 57890.4631, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.318181818181818, + "grad_norm": 0.00467009749263525, + "learning_rate": 6.151675765522035e-05, + "loss": 0.011563179083168507, + "num_input_tokens_seen": 116679000, + "step": 7125, + "train_runtime": 57898.5813, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.318787878787878, + "grad_norm": 0.008738767355680466, + "learning_rate": 6.150739986806911e-05, + "loss": 0.012740285135805607, + "num_input_tokens_seen": 116695376, + "step": 7126, + "train_runtime": 57906.6988, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.319393939393939, + "grad_norm": 0.005527618806809187, + "learning_rate": 6.149804165528155e-05, + "loss": 0.012944851070642471, + "num_input_tokens_seen": 116711752, + "step": 7127, + "train_runtime": 57914.8136, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.32, + "grad_norm": 0.006876682862639427, + "learning_rate": 6.148868301720381e-05, + "loss": 0.011930827982723713, + "num_input_tokens_seen": 116728128, + "step": 7128, + "train_runtime": 57922.9305, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.32060606060606, + "grad_norm": 0.0056288777850568295, + "learning_rate": 6.147932395418205e-05, + "loss": 0.012054294347763062, + "num_input_tokens_seen": 116744504, + "step": 7129, + "train_runtime": 57931.0492, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.321212121212121, + "grad_norm": 0.005000430159270763, + "learning_rate": 6.146996446656249e-05, + "loss": 0.01131961029022932, + "num_input_tokens_seen": 116760880, + "step": 7130, + "train_runtime": 57939.1661, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.321818181818182, + "grad_norm": 0.005094943568110466, + "learning_rate": 6.146060455469125e-05, + "loss": 0.012715357355773449, + "num_input_tokens_seen": 116777256, + "step": 7131, + "train_runtime": 57947.2807, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.322424242424242, + "grad_norm": 0.0075663793832063675, + "learning_rate": 6.145124421891457e-05, + "loss": 0.010765984654426575, + "num_input_tokens_seen": 116793632, + "step": 7132, + "train_runtime": 57955.3958, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.323030303030303, + "grad_norm": 0.006041654851287603, + "learning_rate": 6.144188345957867e-05, + "loss": 0.013014983385801315, + "num_input_tokens_seen": 116810008, + "step": 7133, + "train_runtime": 57963.5131, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.323636363636363, + "grad_norm": 0.007971227169036865, + "learning_rate": 6.143252227702977e-05, + "loss": 0.011985797435045242, + "num_input_tokens_seen": 116826384, + "step": 7134, + "train_runtime": 57971.6302, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.324242424242424, + "grad_norm": 0.004352702759206295, + "learning_rate": 6.142316067161415e-05, + "loss": 0.012903328984975815, + "num_input_tokens_seen": 116842760, + "step": 7135, + "train_runtime": 57979.7428, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.324848484848485, + "grad_norm": 0.006083239801228046, + "learning_rate": 6.141379864367806e-05, + "loss": 0.01231124997138977, + "num_input_tokens_seen": 116859136, + "step": 7136, + "train_runtime": 57987.8616, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.325454545454545, + "grad_norm": 0.007134910672903061, + "learning_rate": 6.14044361935678e-05, + "loss": 0.013265438377857208, + "num_input_tokens_seen": 116875512, + "step": 7137, + "train_runtime": 57995.9754, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.326060606060606, + "grad_norm": 0.008015308529138565, + "learning_rate": 6.139507332162963e-05, + "loss": 0.012194725684821606, + "num_input_tokens_seen": 116891888, + "step": 7138, + "train_runtime": 58004.0896, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.326666666666666, + "grad_norm": 0.007689644116908312, + "learning_rate": 6.13857100282099e-05, + "loss": 0.012823798693716526, + "num_input_tokens_seen": 116908264, + "step": 7139, + "train_runtime": 58012.2045, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.327272727272727, + "grad_norm": 0.009412404149770737, + "learning_rate": 6.137634631365491e-05, + "loss": 0.012296564877033234, + "num_input_tokens_seen": 116924640, + "step": 7140, + "train_runtime": 58020.3317, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.327878787878788, + "grad_norm": 0.007101648021489382, + "learning_rate": 6.136698217831106e-05, + "loss": 0.012637555599212646, + "num_input_tokens_seen": 116941016, + "step": 7141, + "train_runtime": 58028.454, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.328484848484848, + "grad_norm": 0.0034633802715688944, + "learning_rate": 6.135761762252465e-05, + "loss": 0.012181626632809639, + "num_input_tokens_seen": 116957392, + "step": 7142, + "train_runtime": 58036.5736, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.329090909090909, + "grad_norm": 0.008207234553992748, + "learning_rate": 6.134825264664207e-05, + "loss": 0.010518132708966732, + "num_input_tokens_seen": 116973768, + "step": 7143, + "train_runtime": 58044.6895, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.32969696969697, + "grad_norm": 0.007240200415253639, + "learning_rate": 6.133888725100975e-05, + "loss": 0.012700878083705902, + "num_input_tokens_seen": 116990144, + "step": 7144, + "train_runtime": 58052.8056, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.33030303030303, + "grad_norm": 0.006169852335005999, + "learning_rate": 6.132952143597407e-05, + "loss": 0.012940004467964172, + "num_input_tokens_seen": 117006520, + "step": 7145, + "train_runtime": 58060.9188, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.330909090909091, + "grad_norm": 0.008109772577881813, + "learning_rate": 6.132015520188145e-05, + "loss": 0.011342469602823257, + "num_input_tokens_seen": 117022896, + "step": 7146, + "train_runtime": 58069.0314, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.331515151515152, + "grad_norm": 0.009627211838960648, + "learning_rate": 6.131078854907834e-05, + "loss": 0.01313413679599762, + "num_input_tokens_seen": 117039272, + "step": 7147, + "train_runtime": 58077.1424, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.332121212121212, + "grad_norm": 0.004285393748432398, + "learning_rate": 6.130142147791117e-05, + "loss": 0.011199244298040867, + "num_input_tokens_seen": 117055648, + "step": 7148, + "train_runtime": 58085.2544, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.332727272727273, + "grad_norm": 0.00814051739871502, + "learning_rate": 6.129205398872644e-05, + "loss": 0.012249046936631203, + "num_input_tokens_seen": 117072024, + "step": 7149, + "train_runtime": 58093.3733, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.333333333333333, + "grad_norm": 0.002098645316436887, + "learning_rate": 6.128268608187064e-05, + "loss": 0.01294752024114132, + "num_input_tokens_seen": 117088400, + "step": 7150, + "train_runtime": 58101.4881, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.333939393939394, + "grad_norm": 0.00887890625745058, + "learning_rate": 6.127331775769023e-05, + "loss": 0.0126563161611557, + "num_input_tokens_seen": 117104776, + "step": 7151, + "train_runtime": 58109.6042, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.334545454545455, + "grad_norm": 0.004041509702801704, + "learning_rate": 6.126394901653175e-05, + "loss": 0.011478863656520844, + "num_input_tokens_seen": 117121152, + "step": 7152, + "train_runtime": 58117.7293, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.335151515151515, + "grad_norm": 0.007244465872645378, + "learning_rate": 6.125457985874175e-05, + "loss": 0.011689812876284122, + "num_input_tokens_seen": 117137528, + "step": 7153, + "train_runtime": 58125.8451, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.335757575757576, + "grad_norm": 0.007253593765199184, + "learning_rate": 6.124521028466674e-05, + "loss": 0.012835622765123844, + "num_input_tokens_seen": 117153904, + "step": 7154, + "train_runtime": 58133.9603, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.336363636363636, + "grad_norm": 0.007165629882365465, + "learning_rate": 6.123584029465331e-05, + "loss": 0.012713047675788403, + "num_input_tokens_seen": 117170280, + "step": 7155, + "train_runtime": 58142.0774, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.336969696969697, + "grad_norm": 0.005582911893725395, + "learning_rate": 6.122646988904803e-05, + "loss": 0.01270977407693863, + "num_input_tokens_seen": 117186656, + "step": 7156, + "train_runtime": 58150.1918, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.337575757575758, + "grad_norm": 0.006019964348524809, + "learning_rate": 6.121709906819749e-05, + "loss": 0.011248370632529259, + "num_input_tokens_seen": 117203032, + "step": 7157, + "train_runtime": 58158.3058, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.338181818181818, + "grad_norm": 0.006351790390908718, + "learning_rate": 6.12077278324483e-05, + "loss": 0.012030570767819881, + "num_input_tokens_seen": 117219408, + "step": 7158, + "train_runtime": 58166.42, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.338787878787879, + "grad_norm": 0.00458839675411582, + "learning_rate": 6.119835618214707e-05, + "loss": 0.011903731152415276, + "num_input_tokens_seen": 117235784, + "step": 7159, + "train_runtime": 58174.5329, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.33939393939394, + "grad_norm": 0.006791803054511547, + "learning_rate": 6.118898411764047e-05, + "loss": 0.01253083348274231, + "num_input_tokens_seen": 117252160, + "step": 7160, + "train_runtime": 58182.6484, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.34, + "grad_norm": 0.006374249700456858, + "learning_rate": 6.117961163927511e-05, + "loss": 0.011418222449719906, + "num_input_tokens_seen": 117268536, + "step": 7161, + "train_runtime": 58190.7655, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.340606060606061, + "grad_norm": 0.008549930527806282, + "learning_rate": 6.117023874739772e-05, + "loss": 0.012289385311305523, + "num_input_tokens_seen": 117284912, + "step": 7162, + "train_runtime": 58198.8878, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.341212121212121, + "grad_norm": 0.008244951255619526, + "learning_rate": 6.116086544235494e-05, + "loss": 0.012566682882606983, + "num_input_tokens_seen": 117301288, + "step": 7163, + "train_runtime": 58207.0062, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.341818181818182, + "grad_norm": 0.00939315464347601, + "learning_rate": 6.115149172449348e-05, + "loss": 0.011334998533129692, + "num_input_tokens_seen": 117317664, + "step": 7164, + "train_runtime": 58215.1316, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.342424242424243, + "grad_norm": 0.007422175724059343, + "learning_rate": 6.114211759416005e-05, + "loss": 0.013200083747506142, + "num_input_tokens_seen": 117334040, + "step": 7165, + "train_runtime": 58223.2524, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.343030303030303, + "grad_norm": 0.004493348300457001, + "learning_rate": 6.11327430517014e-05, + "loss": 0.010764127597212791, + "num_input_tokens_seen": 117350416, + "step": 7166, + "train_runtime": 58231.374, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.343636363636364, + "grad_norm": 0.006506971083581448, + "learning_rate": 6.112336809746426e-05, + "loss": 0.011035479605197906, + "num_input_tokens_seen": 117366792, + "step": 7167, + "train_runtime": 58239.4903, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.344242424242424, + "grad_norm": 0.006404386833310127, + "learning_rate": 6.11139927317954e-05, + "loss": 0.011676344089210033, + "num_input_tokens_seen": 117383168, + "step": 7168, + "train_runtime": 58247.6049, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.344848484848485, + "grad_norm": 0.006434170063585043, + "learning_rate": 6.11046169550416e-05, + "loss": 0.01162758469581604, + "num_input_tokens_seen": 117399544, + "step": 7169, + "train_runtime": 58255.7208, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.345454545454546, + "grad_norm": 0.005619818344712257, + "learning_rate": 6.109524076754963e-05, + "loss": 0.0116783007979393, + "num_input_tokens_seen": 117415920, + "step": 7170, + "train_runtime": 58263.838, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.346060606060606, + "grad_norm": 0.006239919923245907, + "learning_rate": 6.10858641696663e-05, + "loss": 0.011380848474800587, + "num_input_tokens_seen": 117432296, + "step": 7171, + "train_runtime": 58271.9534, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.346666666666667, + "grad_norm": 0.005026215687394142, + "learning_rate": 6.107648716173846e-05, + "loss": 0.012374107725918293, + "num_input_tokens_seen": 117448672, + "step": 7172, + "train_runtime": 58280.0655, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.347272727272728, + "grad_norm": 0.00518860574811697, + "learning_rate": 6.106710974411294e-05, + "loss": 0.01158462930470705, + "num_input_tokens_seen": 117465048, + "step": 7173, + "train_runtime": 58288.1793, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.347878787878788, + "grad_norm": 0.009328094311058521, + "learning_rate": 6.105773191713658e-05, + "loss": 0.011841312050819397, + "num_input_tokens_seen": 117481424, + "step": 7174, + "train_runtime": 58296.2991, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.348484848484849, + "grad_norm": 0.005876198876649141, + "learning_rate": 6.104835368115622e-05, + "loss": 0.01184016652405262, + "num_input_tokens_seen": 117497800, + "step": 7175, + "train_runtime": 58304.4174, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.34909090909091, + "grad_norm": 0.007847006432712078, + "learning_rate": 6.10389750365188e-05, + "loss": 0.012148797512054443, + "num_input_tokens_seen": 117514176, + "step": 7176, + "train_runtime": 58312.5339, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.34969696969697, + "grad_norm": 0.007863972336053848, + "learning_rate": 6.1029595983571184e-05, + "loss": 0.012457039207220078, + "num_input_tokens_seen": 117530552, + "step": 7177, + "train_runtime": 58320.6494, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.350303030303031, + "grad_norm": 0.014064669609069824, + "learning_rate": 6.1020216522660304e-05, + "loss": 0.013485535979270935, + "num_input_tokens_seen": 117546928, + "step": 7178, + "train_runtime": 58328.7643, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.350909090909091, + "grad_norm": 0.0060282438062131405, + "learning_rate": 6.1010836654133066e-05, + "loss": 0.012241779826581478, + "num_input_tokens_seen": 117563304, + "step": 7179, + "train_runtime": 58336.8838, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.351515151515152, + "grad_norm": 0.009684165939688683, + "learning_rate": 6.100145637833641e-05, + "loss": 0.012097659520804882, + "num_input_tokens_seen": 117579680, + "step": 7180, + "train_runtime": 58345.0033, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.3521212121212125, + "grad_norm": 0.005650395527482033, + "learning_rate": 6.0992075695617324e-05, + "loss": 0.011246384121477604, + "num_input_tokens_seen": 117596056, + "step": 7181, + "train_runtime": 58353.1205, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.352727272727273, + "grad_norm": 0.010508819483220577, + "learning_rate": 6.098269460632276e-05, + "loss": 0.011656949296593666, + "num_input_tokens_seen": 117612432, + "step": 7182, + "train_runtime": 58361.2352, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.3533333333333335, + "grad_norm": 0.0062281424179673195, + "learning_rate": 6.097331311079971e-05, + "loss": 0.012274224311113358, + "num_input_tokens_seen": 117628808, + "step": 7183, + "train_runtime": 58369.3502, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.3539393939393936, + "grad_norm": 0.0041151209734380245, + "learning_rate": 6.096393120939516e-05, + "loss": 0.010719363577663898, + "num_input_tokens_seen": 117645184, + "step": 7184, + "train_runtime": 58377.4661, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.3545454545454545, + "grad_norm": 0.00897220242768526, + "learning_rate": 6.0954548902456175e-05, + "loss": 0.011122182011604309, + "num_input_tokens_seen": 117661560, + "step": 7185, + "train_runtime": 58385.5849, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.355151515151515, + "grad_norm": 0.004321888089179993, + "learning_rate": 6.094516619032975e-05, + "loss": 0.01120646484196186, + "num_input_tokens_seen": 117677936, + "step": 7186, + "train_runtime": 58393.7052, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.3557575757575755, + "grad_norm": 0.007508778013288975, + "learning_rate": 6.093578307336294e-05, + "loss": 0.011796561069786549, + "num_input_tokens_seen": 117694312, + "step": 7187, + "train_runtime": 58401.8299, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.356363636363636, + "grad_norm": 0.0061297244392335415, + "learning_rate": 6.09263995519028e-05, + "loss": 0.011672699823975563, + "num_input_tokens_seen": 117710688, + "step": 7188, + "train_runtime": 58409.9531, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.356969696969697, + "grad_norm": 0.009549552574753761, + "learning_rate": 6.091701562629644e-05, + "loss": 0.012633584439754486, + "num_input_tokens_seen": 117727064, + "step": 7189, + "train_runtime": 58418.0801, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.357575757575757, + "grad_norm": 0.0044892760924994946, + "learning_rate": 6.09076312968909e-05, + "loss": 0.011252457275986671, + "num_input_tokens_seen": 117743440, + "step": 7190, + "train_runtime": 58426.198, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.358181818181818, + "grad_norm": 0.007916966453194618, + "learning_rate": 6.0898246564033356e-05, + "loss": 0.012092695571482182, + "num_input_tokens_seen": 117759816, + "step": 7191, + "train_runtime": 58434.3166, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.358787878787878, + "grad_norm": 0.0034057556185871363, + "learning_rate": 6.0888861428070874e-05, + "loss": 0.012294942513108253, + "num_input_tokens_seen": 117776192, + "step": 7192, + "train_runtime": 58442.4412, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.359393939393939, + "grad_norm": 0.006505809724330902, + "learning_rate": 6.0879475889350615e-05, + "loss": 0.012546290643513203, + "num_input_tokens_seen": 117792568, + "step": 7193, + "train_runtime": 58450.5664, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.36, + "grad_norm": 0.004849424120038748, + "learning_rate": 6.087008994821972e-05, + "loss": 0.011646192520856857, + "num_input_tokens_seen": 117808944, + "step": 7194, + "train_runtime": 58458.6866, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.36060606060606, + "grad_norm": 0.008047061040997505, + "learning_rate": 6.0860703605025395e-05, + "loss": 0.012185913510620594, + "num_input_tokens_seen": 117825320, + "step": 7195, + "train_runtime": 58466.8087, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.361212121212121, + "grad_norm": 0.009238085709512234, + "learning_rate": 6.085131686011477e-05, + "loss": 0.01114228181540966, + "num_input_tokens_seen": 117841696, + "step": 7196, + "train_runtime": 58474.9327, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.361818181818181, + "grad_norm": 0.0041327523067593575, + "learning_rate": 6.0841929713835075e-05, + "loss": 0.011655157431960106, + "num_input_tokens_seen": 117858072, + "step": 7197, + "train_runtime": 58483.06, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.362424242424242, + "grad_norm": 0.004987915977835655, + "learning_rate": 6.0832542166533514e-05, + "loss": 0.012835238128900528, + "num_input_tokens_seen": 117874448, + "step": 7198, + "train_runtime": 58491.1883, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.363030303030303, + "grad_norm": 0.005309248808771372, + "learning_rate": 6.082315421855731e-05, + "loss": 0.011565460823476315, + "num_input_tokens_seen": 117890824, + "step": 7199, + "train_runtime": 58499.3138, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.363636363636363, + "grad_norm": 0.002711846958845854, + "learning_rate": 6.081376587025373e-05, + "loss": 0.010815818794071674, + "num_input_tokens_seen": 117907200, + "step": 7200, + "train_runtime": 58507.4357, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.364242424242424, + "grad_norm": 0.005186888389289379, + "learning_rate": 6.080437712196998e-05, + "loss": 0.011693778447806835, + "num_input_tokens_seen": 117923576, + "step": 7201, + "train_runtime": 58516.5105, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.364848484848485, + "grad_norm": 0.004702520556747913, + "learning_rate": 6.079498797405337e-05, + "loss": 0.010667283087968826, + "num_input_tokens_seen": 117939952, + "step": 7202, + "train_runtime": 58524.6332, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.365454545454545, + "grad_norm": 0.007813850417733192, + "learning_rate": 6.078559842685118e-05, + "loss": 0.011973759159445763, + "num_input_tokens_seen": 117956328, + "step": 7203, + "train_runtime": 58532.7535, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.366060606060606, + "grad_norm": 0.009814882650971413, + "learning_rate": 6.0776208480710704e-05, + "loss": 0.01179689820855856, + "num_input_tokens_seen": 117972704, + "step": 7204, + "train_runtime": 58540.8767, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.366666666666666, + "grad_norm": 0.007886679843068123, + "learning_rate": 6.0766818135979266e-05, + "loss": 0.011776899918913841, + "num_input_tokens_seen": 117989080, + "step": 7205, + "train_runtime": 58549.0014, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.367272727272727, + "grad_norm": 0.0041811843402683735, + "learning_rate": 6.0757427393004195e-05, + "loss": 0.01260486338287592, + "num_input_tokens_seen": 118005456, + "step": 7206, + "train_runtime": 58557.1308, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.367878787878788, + "grad_norm": 0.005688157398253679, + "learning_rate": 6.074803625213282e-05, + "loss": 0.011584609746932983, + "num_input_tokens_seen": 118021832, + "step": 7207, + "train_runtime": 58565.2533, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.368484848484848, + "grad_norm": 0.005325486417859793, + "learning_rate": 6.0738644713712525e-05, + "loss": 0.011342920362949371, + "num_input_tokens_seen": 118038208, + "step": 7208, + "train_runtime": 58573.3746, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.369090909090909, + "grad_norm": 0.006310905795544386, + "learning_rate": 6.0729252778090676e-05, + "loss": 0.010811830870807171, + "num_input_tokens_seen": 118054584, + "step": 7209, + "train_runtime": 58581.4936, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.36969696969697, + "grad_norm": 0.004467066377401352, + "learning_rate": 6.071986044561466e-05, + "loss": 0.011954842135310173, + "num_input_tokens_seen": 118070960, + "step": 7210, + "train_runtime": 58589.6141, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.37030303030303, + "grad_norm": 0.007562848273664713, + "learning_rate": 6.0710467716631883e-05, + "loss": 0.01089263241738081, + "num_input_tokens_seen": 118087336, + "step": 7211, + "train_runtime": 58597.7364, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.370909090909091, + "grad_norm": 0.006508200895041227, + "learning_rate": 6.070107459148976e-05, + "loss": 0.012080973945558071, + "num_input_tokens_seen": 118103712, + "step": 7212, + "train_runtime": 58605.8593, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.371515151515151, + "grad_norm": 0.00851151067763567, + "learning_rate": 6.0691681070535735e-05, + "loss": 0.011878878809511662, + "num_input_tokens_seen": 118120088, + "step": 7213, + "train_runtime": 58613.9799, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.372121212121212, + "grad_norm": 0.007017012219876051, + "learning_rate": 6.0682287154117236e-05, + "loss": 0.012641869485378265, + "num_input_tokens_seen": 118136464, + "step": 7214, + "train_runtime": 58622.1061, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.372727272727273, + "grad_norm": 0.0059426454827189445, + "learning_rate": 6.067289284258174e-05, + "loss": 0.011902403086423874, + "num_input_tokens_seen": 118152840, + "step": 7215, + "train_runtime": 58630.2307, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.373333333333333, + "grad_norm": 0.008807255886495113, + "learning_rate": 6.066349813627673e-05, + "loss": 0.011977688409388065, + "num_input_tokens_seen": 118169216, + "step": 7216, + "train_runtime": 58638.3582, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.373939393939394, + "grad_norm": 0.010130614973604679, + "learning_rate": 6.065410303554968e-05, + "loss": 0.01231562253087759, + "num_input_tokens_seen": 118185592, + "step": 7217, + "train_runtime": 58646.499, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.374545454545455, + "grad_norm": 0.010925298556685448, + "learning_rate": 6.064470754074812e-05, + "loss": 0.011123151518404484, + "num_input_tokens_seen": 118201968, + "step": 7218, + "train_runtime": 58654.6379, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.375151515151515, + "grad_norm": 0.003569727996364236, + "learning_rate": 6.063531165221954e-05, + "loss": 0.01123263780027628, + "num_input_tokens_seen": 118218344, + "step": 7219, + "train_runtime": 58662.7549, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.375757575757576, + "grad_norm": 0.010865558870136738, + "learning_rate": 6.0625915370311515e-05, + "loss": 0.012797839939594269, + "num_input_tokens_seen": 118234720, + "step": 7220, + "train_runtime": 58670.8907, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.376363636363636, + "grad_norm": 0.005289940629154444, + "learning_rate": 6.061651869537155e-05, + "loss": 0.011514516547322273, + "num_input_tokens_seen": 118251096, + "step": 7221, + "train_runtime": 58679.0316, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.376969696969697, + "grad_norm": 0.007462408859282732, + "learning_rate": 6.0607121627747246e-05, + "loss": 0.0135116558521986, + "num_input_tokens_seen": 118267472, + "step": 7222, + "train_runtime": 58687.1706, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.377575757575758, + "grad_norm": 0.007127333432435989, + "learning_rate": 6.0597724167786185e-05, + "loss": 0.011233345605432987, + "num_input_tokens_seen": 118283848, + "step": 7223, + "train_runtime": 58695.3105, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.378181818181818, + "grad_norm": 0.005211701616644859, + "learning_rate": 6.058832631583592e-05, + "loss": 0.011554128490388393, + "num_input_tokens_seen": 118300224, + "step": 7224, + "train_runtime": 58703.4493, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.378787878787879, + "grad_norm": 0.005085487384349108, + "learning_rate": 6.0578928072244104e-05, + "loss": 0.012560759671032429, + "num_input_tokens_seen": 118316600, + "step": 7225, + "train_runtime": 58711.5874, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.379393939393939, + "grad_norm": 0.0057559143751859665, + "learning_rate": 6.0569529437358316e-05, + "loss": 0.012516390532255173, + "num_input_tokens_seen": 118332976, + "step": 7226, + "train_runtime": 58719.7215, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.38, + "grad_norm": 0.0063735563308000565, + "learning_rate": 6.056013041152624e-05, + "loss": 0.0129209840670228, + "num_input_tokens_seen": 118349352, + "step": 7227, + "train_runtime": 58727.8613, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.380606060606061, + "grad_norm": 0.008082505315542221, + "learning_rate": 6.05507309950955e-05, + "loss": 0.011956842616200447, + "num_input_tokens_seen": 118365728, + "step": 7228, + "train_runtime": 58736.0044, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.381212121212121, + "grad_norm": 0.00577812734991312, + "learning_rate": 6.0541331188413766e-05, + "loss": 0.011361909098923206, + "num_input_tokens_seen": 118382104, + "step": 7229, + "train_runtime": 58744.144, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.381818181818182, + "grad_norm": 0.00605149706825614, + "learning_rate": 6.05319309918287e-05, + "loss": 0.012704472988843918, + "num_input_tokens_seen": 118398480, + "step": 7230, + "train_runtime": 58752.2806, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.382424242424243, + "grad_norm": 0.0064490488730371, + "learning_rate": 6.052253040568804e-05, + "loss": 0.011716480366885662, + "num_input_tokens_seen": 118414856, + "step": 7231, + "train_runtime": 58760.4182, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.383030303030303, + "grad_norm": 0.005647360812872648, + "learning_rate": 6.051312943033947e-05, + "loss": 0.012212228029966354, + "num_input_tokens_seen": 118431232, + "step": 7232, + "train_runtime": 58768.5574, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.383636363636364, + "grad_norm": 0.006885737180709839, + "learning_rate": 6.0503728066130706e-05, + "loss": 0.012540744617581367, + "num_input_tokens_seen": 118447608, + "step": 7233, + "train_runtime": 58776.6888, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.384242424242424, + "grad_norm": 0.004875736776739359, + "learning_rate": 6.04943263134095e-05, + "loss": 0.01229823101311922, + "num_input_tokens_seen": 118463984, + "step": 7234, + "train_runtime": 58784.8302, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.384848484848485, + "grad_norm": 0.008576028048992157, + "learning_rate": 6.04849241725236e-05, + "loss": 0.011705592274665833, + "num_input_tokens_seen": 118480360, + "step": 7235, + "train_runtime": 58792.9679, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.385454545454546, + "grad_norm": 0.004102800507098436, + "learning_rate": 6.0475521643820776e-05, + "loss": 0.01163457427173853, + "num_input_tokens_seen": 118496736, + "step": 7236, + "train_runtime": 58801.1027, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.386060606060606, + "grad_norm": 0.0071503412909805775, + "learning_rate": 6.046611872764881e-05, + "loss": 0.011884487234055996, + "num_input_tokens_seen": 118513112, + "step": 7237, + "train_runtime": 58809.2402, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 4.386666666666667, + "grad_norm": 0.006931076291948557, + "learning_rate": 6.0456715424355485e-05, + "loss": 0.012213222682476044, + "num_input_tokens_seen": 118529488, + "step": 7238, + "train_runtime": 58817.378, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 4.387272727272728, + "grad_norm": 0.0059312740340828896, + "learning_rate": 6.044731173428862e-05, + "loss": 0.011341746896505356, + "num_input_tokens_seen": 118545864, + "step": 7239, + "train_runtime": 58825.4965, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 4.387878787878788, + "grad_norm": 0.007848402485251427, + "learning_rate": 6.0437907657796034e-05, + "loss": 0.011040493845939636, + "num_input_tokens_seen": 118562240, + "step": 7240, + "train_runtime": 58833.6104, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.388484848484849, + "grad_norm": 0.007083002012223005, + "learning_rate": 6.042850319522559e-05, + "loss": 0.011027893051505089, + "num_input_tokens_seen": 118578616, + "step": 7241, + "train_runtime": 58841.7299, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.389090909090909, + "grad_norm": 0.010316570289433002, + "learning_rate": 6.0419098346925105e-05, + "loss": 0.011667700484395027, + "num_input_tokens_seen": 118594992, + "step": 7242, + "train_runtime": 58849.8442, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.38969696969697, + "grad_norm": 0.004619286861270666, + "learning_rate": 6.040969311324247e-05, + "loss": 0.011923962272703648, + "num_input_tokens_seen": 118611368, + "step": 7243, + "train_runtime": 58857.9577, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.390303030303031, + "grad_norm": 0.004962217528373003, + "learning_rate": 6.0400287494525566e-05, + "loss": 0.011734114959836006, + "num_input_tokens_seen": 118627744, + "step": 7244, + "train_runtime": 58866.0704, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.390909090909091, + "grad_norm": 0.00763348862528801, + "learning_rate": 6.039088149112227e-05, + "loss": 0.011668592691421509, + "num_input_tokens_seen": 118644120, + "step": 7245, + "train_runtime": 58874.183, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.391515151515152, + "grad_norm": 0.006173167377710342, + "learning_rate": 6.0381475103380516e-05, + "loss": 0.011588460765779018, + "num_input_tokens_seen": 118660496, + "step": 7246, + "train_runtime": 58882.3022, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.3921212121212125, + "grad_norm": 0.00831946823745966, + "learning_rate": 6.037206833164819e-05, + "loss": 0.012441151775419712, + "num_input_tokens_seen": 118676872, + "step": 7247, + "train_runtime": 58890.421, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.392727272727273, + "grad_norm": 0.0087212473154068, + "learning_rate": 6.0362661176273286e-05, + "loss": 0.011881284415721893, + "num_input_tokens_seen": 118693248, + "step": 7248, + "train_runtime": 58898.5355, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.3933333333333335, + "grad_norm": 0.008253974840044975, + "learning_rate": 6.0353253637603694e-05, + "loss": 0.012515665031969547, + "num_input_tokens_seen": 118709624, + "step": 7249, + "train_runtime": 58906.6444, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.393939393939394, + "grad_norm": 0.004785845056176186, + "learning_rate": 6.0343845715987436e-05, + "loss": 0.011519313789904118, + "num_input_tokens_seen": 118726000, + "step": 7250, + "train_runtime": 58914.7587, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.3945454545454545, + "grad_norm": 0.00639704754576087, + "learning_rate": 6.033443741177246e-05, + "loss": 0.011105354875326157, + "num_input_tokens_seen": 118742376, + "step": 7251, + "train_runtime": 58922.8724, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.3951515151515155, + "grad_norm": 0.006021281238645315, + "learning_rate": 6.032502872530676e-05, + "loss": 0.011858705431222916, + "num_input_tokens_seen": 118758752, + "step": 7252, + "train_runtime": 58930.9859, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.3957575757575755, + "grad_norm": 0.005480845924466848, + "learning_rate": 6.031561965693837e-05, + "loss": 0.011752075515687466, + "num_input_tokens_seen": 118775128, + "step": 7253, + "train_runtime": 58939.1013, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.3963636363636365, + "grad_norm": 0.007310271728783846, + "learning_rate": 6.030621020701529e-05, + "loss": 0.012252524495124817, + "num_input_tokens_seen": 118791504, + "step": 7254, + "train_runtime": 58947.2152, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.3969696969696965, + "grad_norm": 0.006573469843715429, + "learning_rate": 6.029680037588556e-05, + "loss": 0.011239140294492245, + "num_input_tokens_seen": 118807880, + "step": 7255, + "train_runtime": 58955.3316, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.3975757575757575, + "grad_norm": 0.006990095134824514, + "learning_rate": 6.028739016389725e-05, + "loss": 0.012867686338722706, + "num_input_tokens_seen": 118824256, + "step": 7256, + "train_runtime": 58963.4429, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.398181818181818, + "grad_norm": 0.005547692067921162, + "learning_rate": 6.02779795713984e-05, + "loss": 0.012718934565782547, + "num_input_tokens_seen": 118840632, + "step": 7257, + "train_runtime": 58971.5876, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.3987878787878785, + "grad_norm": 0.0031898976303637028, + "learning_rate": 6.0268568598737104e-05, + "loss": 0.011669829487800598, + "num_input_tokens_seen": 118857008, + "step": 7258, + "train_runtime": 58979.7706, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.399393939393939, + "grad_norm": 0.009200220927596092, + "learning_rate": 6.025915724626146e-05, + "loss": 0.01265160832554102, + "num_input_tokens_seen": 118873384, + "step": 7259, + "train_runtime": 58987.8985, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.4, + "grad_norm": 0.009129141457378864, + "learning_rate": 6.024974551431957e-05, + "loss": 0.012117268517613411, + "num_input_tokens_seen": 118889760, + "step": 7260, + "train_runtime": 58996.0687, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.40060606060606, + "grad_norm": 0.005083995871245861, + "learning_rate": 6.024033340325954e-05, + "loss": 0.012132816016674042, + "num_input_tokens_seen": 118906136, + "step": 7261, + "train_runtime": 59004.2125, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.401212121212121, + "grad_norm": 0.008084363304078579, + "learning_rate": 6.023092091342952e-05, + "loss": 0.011245117522776127, + "num_input_tokens_seen": 118922512, + "step": 7262, + "train_runtime": 59012.3512, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.401818181818181, + "grad_norm": 0.005938296671956778, + "learning_rate": 6.0221508045177676e-05, + "loss": 0.01155833713710308, + "num_input_tokens_seen": 118938888, + "step": 7263, + "train_runtime": 59020.4931, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.402424242424242, + "grad_norm": 0.005889672785997391, + "learning_rate": 6.021209479885215e-05, + "loss": 0.012114031240344048, + "num_input_tokens_seen": 118955264, + "step": 7264, + "train_runtime": 59028.6389, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.403030303030303, + "grad_norm": 0.013918892480432987, + "learning_rate": 6.02026811748011e-05, + "loss": 0.012156056240200996, + "num_input_tokens_seen": 118971640, + "step": 7265, + "train_runtime": 59036.7583, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.403636363636363, + "grad_norm": 0.010068940930068493, + "learning_rate": 6.019326717337277e-05, + "loss": 0.011936378665268421, + "num_input_tokens_seen": 118988016, + "step": 7266, + "train_runtime": 59044.8743, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.404242424242424, + "grad_norm": 0.0064084939658641815, + "learning_rate": 6.018385279491533e-05, + "loss": 0.012081338092684746, + "num_input_tokens_seen": 119004392, + "step": 7267, + "train_runtime": 59052.9905, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.404848484848485, + "grad_norm": 0.009791910648345947, + "learning_rate": 6.0174438039777e-05, + "loss": 0.012573338113725185, + "num_input_tokens_seen": 119020768, + "step": 7268, + "train_runtime": 59061.1065, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.405454545454545, + "grad_norm": 0.003459475003182888, + "learning_rate": 6.0165022908306034e-05, + "loss": 0.01307886652648449, + "num_input_tokens_seen": 119037144, + "step": 7269, + "train_runtime": 59069.2315, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.406060606060606, + "grad_norm": 0.0027003767900168896, + "learning_rate": 6.0155607400850654e-05, + "loss": 0.010920899920165539, + "num_input_tokens_seen": 119053520, + "step": 7270, + "train_runtime": 59077.3464, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.406666666666666, + "grad_norm": 0.006103599909693003, + "learning_rate": 6.0146191517759134e-05, + "loss": 0.013341980054974556, + "num_input_tokens_seen": 119069896, + "step": 7271, + "train_runtime": 59085.459, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.407272727272727, + "grad_norm": 0.0045531680807471275, + "learning_rate": 6.013677525937975e-05, + "loss": 0.012853332795202732, + "num_input_tokens_seen": 119086272, + "step": 7272, + "train_runtime": 59093.5682, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.407878787878788, + "grad_norm": 0.007756724953651428, + "learning_rate": 6.01273586260608e-05, + "loss": 0.01239074021577835, + "num_input_tokens_seen": 119102648, + "step": 7273, + "train_runtime": 59101.6835, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.408484848484848, + "grad_norm": 0.0031985167879611254, + "learning_rate": 6.0117941618150565e-05, + "loss": 0.011468161828815937, + "num_input_tokens_seen": 119119024, + "step": 7274, + "train_runtime": 59109.7973, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.409090909090909, + "grad_norm": 0.007962129078805447, + "learning_rate": 6.010852423599737e-05, + "loss": 0.011629986576735973, + "num_input_tokens_seen": 119135400, + "step": 7275, + "train_runtime": 59117.9137, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.40969696969697, + "grad_norm": 0.00784618966281414, + "learning_rate": 6.009910647994956e-05, + "loss": 0.01149400882422924, + "num_input_tokens_seen": 119151776, + "step": 7276, + "train_runtime": 59126.0303, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.41030303030303, + "grad_norm": 0.007237464189529419, + "learning_rate": 6.008968835035547e-05, + "loss": 0.011653260327875614, + "num_input_tokens_seen": 119168152, + "step": 7277, + "train_runtime": 59134.1515, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.410909090909091, + "grad_norm": 0.004889533389359713, + "learning_rate": 6.008026984756345e-05, + "loss": 0.012148137204349041, + "num_input_tokens_seen": 119184528, + "step": 7278, + "train_runtime": 59142.2662, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.411515151515151, + "grad_norm": 0.007904613390564919, + "learning_rate": 6.0070850971921875e-05, + "loss": 0.012482091784477234, + "num_input_tokens_seen": 119200904, + "step": 7279, + "train_runtime": 59150.3778, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.412121212121212, + "grad_norm": 0.0069750710390508175, + "learning_rate": 6.0061431723779135e-05, + "loss": 0.01190265454351902, + "num_input_tokens_seen": 119217280, + "step": 7280, + "train_runtime": 59158.488, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.412727272727273, + "grad_norm": 0.004490159451961517, + "learning_rate": 6.0052012103483635e-05, + "loss": 0.011587965302169323, + "num_input_tokens_seen": 119233656, + "step": 7281, + "train_runtime": 59166.5988, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.413333333333333, + "grad_norm": 0.0032428090926259756, + "learning_rate": 6.004259211138379e-05, + "loss": 0.011475803330540657, + "num_input_tokens_seen": 119250032, + "step": 7282, + "train_runtime": 59174.7145, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.413939393939394, + "grad_norm": 0.008812963962554932, + "learning_rate": 6.003317174782801e-05, + "loss": 0.011676282621920109, + "num_input_tokens_seen": 119266408, + "step": 7283, + "train_runtime": 59182.8305, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.414545454545454, + "grad_norm": 0.007113215979188681, + "learning_rate": 6.002375101316474e-05, + "loss": 0.013444902375340462, + "num_input_tokens_seen": 119282784, + "step": 7284, + "train_runtime": 59190.9492, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.415151515151515, + "grad_norm": 0.0030981325544416904, + "learning_rate": 6.001432990774245e-05, + "loss": 0.01188813615590334, + "num_input_tokens_seen": 119299160, + "step": 7285, + "train_runtime": 59199.0707, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.415757575757576, + "grad_norm": 0.007099981885403395, + "learning_rate": 6.00049084319096e-05, + "loss": 0.013032372109591961, + "num_input_tokens_seen": 119315536, + "step": 7286, + "train_runtime": 59207.1909, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.416363636363636, + "grad_norm": 0.005193258170038462, + "learning_rate": 5.999548658601467e-05, + "loss": 0.011954553425312042, + "num_input_tokens_seen": 119331912, + "step": 7287, + "train_runtime": 59215.3086, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.416969696969697, + "grad_norm": 0.006578485947102308, + "learning_rate": 5.9986064370406145e-05, + "loss": 0.012884486466646194, + "num_input_tokens_seen": 119348288, + "step": 7288, + "train_runtime": 59223.4308, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.417575757575758, + "grad_norm": 0.005842322949320078, + "learning_rate": 5.9976641785432554e-05, + "loss": 0.012681360356509686, + "num_input_tokens_seen": 119364664, + "step": 7289, + "train_runtime": 59231.5511, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.418181818181818, + "grad_norm": 0.005509698297828436, + "learning_rate": 5.996721883144242e-05, + "loss": 0.01148185320198536, + "num_input_tokens_seen": 119381040, + "step": 7290, + "train_runtime": 59239.6699, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.418787878787879, + "grad_norm": 0.004123019985854626, + "learning_rate": 5.995779550878426e-05, + "loss": 0.011784854345023632, + "num_input_tokens_seen": 119397416, + "step": 7291, + "train_runtime": 59247.8645, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.419393939393939, + "grad_norm": 0.011002606712281704, + "learning_rate": 5.994837181780665e-05, + "loss": 0.011998128145933151, + "num_input_tokens_seen": 119413792, + "step": 7292, + "train_runtime": 59256.0005, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.42, + "grad_norm": 0.012277049943804741, + "learning_rate": 5.9938947758858123e-05, + "loss": 0.012745257467031479, + "num_input_tokens_seen": 119430168, + "step": 7293, + "train_runtime": 59264.1562, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.420606060606061, + "grad_norm": 0.011734656989574432, + "learning_rate": 5.992952333228728e-05, + "loss": 0.012304229661822319, + "num_input_tokens_seen": 119446544, + "step": 7294, + "train_runtime": 59272.2946, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.421212121212121, + "grad_norm": 0.006237247493118048, + "learning_rate": 5.9920098538442714e-05, + "loss": 0.011959647759795189, + "num_input_tokens_seen": 119462920, + "step": 7295, + "train_runtime": 59280.4329, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.421818181818182, + "grad_norm": 0.0001426615344826132, + "learning_rate": 5.991067337767301e-05, + "loss": 0.01181588601320982, + "num_input_tokens_seen": 119479296, + "step": 7296, + "train_runtime": 59288.5701, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.422424242424243, + "grad_norm": 0.005273718852549791, + "learning_rate": 5.990124785032679e-05, + "loss": 0.011043763719499111, + "num_input_tokens_seen": 119495672, + "step": 7297, + "train_runtime": 59296.7006, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.423030303030303, + "grad_norm": 0.007604257669299841, + "learning_rate": 5.9891821956752715e-05, + "loss": 0.011540810577571392, + "num_input_tokens_seen": 119512048, + "step": 7298, + "train_runtime": 59304.8137, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.423636363636364, + "grad_norm": 0.007080035749822855, + "learning_rate": 5.988239569729939e-05, + "loss": 0.012555969879031181, + "num_input_tokens_seen": 119528424, + "step": 7299, + "train_runtime": 59312.9317, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.424242424242424, + "grad_norm": 0.00582153582945466, + "learning_rate": 5.987296907231552e-05, + "loss": 0.010732193477451801, + "num_input_tokens_seen": 119544800, + "step": 7300, + "train_runtime": 59321.0466, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.424848484848485, + "grad_norm": 0.01020594872534275, + "learning_rate": 5.986354208214973e-05, + "loss": 0.01270096655935049, + "num_input_tokens_seen": 119561176, + "step": 7301, + "train_runtime": 59330.0495, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.425454545454546, + "grad_norm": 0.005412591155618429, + "learning_rate": 5.9854114727150726e-05, + "loss": 0.011017434298992157, + "num_input_tokens_seen": 119577552, + "step": 7302, + "train_runtime": 59338.1637, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.426060606060606, + "grad_norm": 0.007728938478976488, + "learning_rate": 5.984468700766721e-05, + "loss": 0.012044589035212994, + "num_input_tokens_seen": 119593928, + "step": 7303, + "train_runtime": 59346.2794, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.426666666666667, + "grad_norm": 0.007457805331796408, + "learning_rate": 5.98352589240479e-05, + "loss": 0.012521137483417988, + "num_input_tokens_seen": 119610304, + "step": 7304, + "train_runtime": 59354.3999, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.427272727272728, + "grad_norm": 0.004503787495195866, + "learning_rate": 5.982583047664151e-05, + "loss": 0.011712935753166676, + "num_input_tokens_seen": 119626680, + "step": 7305, + "train_runtime": 59362.5153, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.427878787878788, + "grad_norm": 0.004555953200906515, + "learning_rate": 5.981640166579679e-05, + "loss": 0.010867418721318245, + "num_input_tokens_seen": 119643056, + "step": 7306, + "train_runtime": 59370.6334, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.428484848484849, + "grad_norm": 0.005478996783494949, + "learning_rate": 5.980697249186248e-05, + "loss": 0.012282976880669594, + "num_input_tokens_seen": 119659432, + "step": 7307, + "train_runtime": 59378.7499, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.429090909090909, + "grad_norm": 0.005615952890366316, + "learning_rate": 5.979754295518737e-05, + "loss": 0.01135013997554779, + "num_input_tokens_seen": 119675808, + "step": 7308, + "train_runtime": 59386.8618, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.42969696969697, + "grad_norm": 0.00641809543594718, + "learning_rate": 5.978811305612022e-05, + "loss": 0.013058959506452084, + "num_input_tokens_seen": 119692184, + "step": 7309, + "train_runtime": 59394.9746, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.430303030303031, + "grad_norm": 0.006531985010951757, + "learning_rate": 5.977868279500982e-05, + "loss": 0.012858068570494652, + "num_input_tokens_seen": 119708560, + "step": 7310, + "train_runtime": 59403.0917, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.430909090909091, + "grad_norm": 0.00737941125407815, + "learning_rate": 5.9769252172204996e-05, + "loss": 0.013202918693423271, + "num_input_tokens_seen": 119724936, + "step": 7311, + "train_runtime": 59411.2054, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.431515151515152, + "grad_norm": 0.008979957550764084, + "learning_rate": 5.9759821188054556e-05, + "loss": 0.01098744384944439, + "num_input_tokens_seen": 119741312, + "step": 7312, + "train_runtime": 59419.3201, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.432121212121212, + "grad_norm": 0.009940141811966896, + "learning_rate": 5.975038984290734e-05, + "loss": 0.012160352431237698, + "num_input_tokens_seen": 119757688, + "step": 7313, + "train_runtime": 59427.438, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.432727272727273, + "grad_norm": 0.008732708171010017, + "learning_rate": 5.974095813711219e-05, + "loss": 0.012783216312527657, + "num_input_tokens_seen": 119774064, + "step": 7314, + "train_runtime": 59435.5524, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.433333333333334, + "grad_norm": 0.005274198018014431, + "learning_rate": 5.973152607101797e-05, + "loss": 0.01213132031261921, + "num_input_tokens_seen": 119790440, + "step": 7315, + "train_runtime": 59443.663, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.433939393939394, + "grad_norm": 0.0029921685345470905, + "learning_rate": 5.9722093644973546e-05, + "loss": 0.011431408114731312, + "num_input_tokens_seen": 119806816, + "step": 7316, + "train_runtime": 59451.7764, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.434545454545455, + "grad_norm": 0.007820919156074524, + "learning_rate": 5.971266085932782e-05, + "loss": 0.012600250542163849, + "num_input_tokens_seen": 119823192, + "step": 7317, + "train_runtime": 59459.8897, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.4351515151515155, + "grad_norm": 0.005385654512792826, + "learning_rate": 5.970322771442968e-05, + "loss": 0.011444559320807457, + "num_input_tokens_seen": 119839568, + "step": 7318, + "train_runtime": 59468.0031, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.435757575757576, + "grad_norm": 0.013531357049942017, + "learning_rate": 5.969379421062804e-05, + "loss": 0.01224389299750328, + "num_input_tokens_seen": 119855944, + "step": 7319, + "train_runtime": 59476.1186, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.4363636363636365, + "grad_norm": 0.0036949303466826677, + "learning_rate": 5.968436034827183e-05, + "loss": 0.011928722262382507, + "num_input_tokens_seen": 119872320, + "step": 7320, + "train_runtime": 59484.2317, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.436969696969697, + "grad_norm": 0.01010495238006115, + "learning_rate": 5.967492612770999e-05, + "loss": 0.01154239196330309, + "num_input_tokens_seen": 119888696, + "step": 7321, + "train_runtime": 59492.3925, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.4375757575757575, + "grad_norm": 0.009180162101984024, + "learning_rate": 5.9665491549291476e-05, + "loss": 0.011736301705241203, + "num_input_tokens_seen": 119905072, + "step": 7322, + "train_runtime": 59500.5322, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.4381818181818184, + "grad_norm": 0.0027842118870466948, + "learning_rate": 5.9656056613365254e-05, + "loss": 0.010307535529136658, + "num_input_tokens_seen": 119921448, + "step": 7323, + "train_runtime": 59508.6917, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.4387878787878785, + "grad_norm": 0.006131322588771582, + "learning_rate": 5.964662132028029e-05, + "loss": 0.011557911522686481, + "num_input_tokens_seen": 119937824, + "step": 7324, + "train_runtime": 59516.8329, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.4393939393939394, + "grad_norm": 0.006486036814749241, + "learning_rate": 5.96371856703856e-05, + "loss": 0.011281915940344334, + "num_input_tokens_seen": 119954200, + "step": 7325, + "train_runtime": 59524.9716, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.44, + "grad_norm": 0.006153865251690149, + "learning_rate": 5.962774966403018e-05, + "loss": 0.011531590484082699, + "num_input_tokens_seen": 119970576, + "step": 7326, + "train_runtime": 59533.1095, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.4406060606060604, + "grad_norm": 0.00563497468829155, + "learning_rate": 5.9618313301563055e-05, + "loss": 0.011338308453559875, + "num_input_tokens_seen": 119986952, + "step": 7327, + "train_runtime": 59541.2558, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.441212121212121, + "grad_norm": 0.007326111663132906, + "learning_rate": 5.9608876583333226e-05, + "loss": 0.012699738144874573, + "num_input_tokens_seen": 120003328, + "step": 7328, + "train_runtime": 59549.3781, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.441818181818181, + "grad_norm": 0.005719674751162529, + "learning_rate": 5.9599439509689795e-05, + "loss": 0.011628130450844765, + "num_input_tokens_seen": 120019704, + "step": 7329, + "train_runtime": 59557.498, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.442424242424242, + "grad_norm": 0.010847226716578007, + "learning_rate": 5.9590002080981767e-05, + "loss": 0.012606369331479073, + "num_input_tokens_seen": 120036080, + "step": 7330, + "train_runtime": 59565.6145, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.443030303030303, + "grad_norm": 0.004233528394252062, + "learning_rate": 5.958056429755825e-05, + "loss": 0.011736981570720673, + "num_input_tokens_seen": 120052456, + "step": 7331, + "train_runtime": 59573.7327, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.443636363636363, + "grad_norm": 0.010928942821919918, + "learning_rate": 5.957112615976832e-05, + "loss": 0.01278550922870636, + "num_input_tokens_seen": 120068832, + "step": 7332, + "train_runtime": 59581.8501, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.444242424242424, + "grad_norm": 0.012251533567905426, + "learning_rate": 5.9561687667961075e-05, + "loss": 0.012723489664494991, + "num_input_tokens_seen": 120085208, + "step": 7333, + "train_runtime": 59589.9664, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.444848484848485, + "grad_norm": 0.008412362076342106, + "learning_rate": 5.955224882248562e-05, + "loss": 0.011086911894381046, + "num_input_tokens_seen": 120101584, + "step": 7334, + "train_runtime": 59598.081, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.445454545454545, + "grad_norm": 0.009295827709138393, + "learning_rate": 5.954280962369107e-05, + "loss": 0.011910863220691681, + "num_input_tokens_seen": 120117960, + "step": 7335, + "train_runtime": 59606.1937, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.446060606060606, + "grad_norm": 0.00490005174651742, + "learning_rate": 5.953337007192659e-05, + "loss": 0.011638964526355267, + "num_input_tokens_seen": 120134336, + "step": 7336, + "train_runtime": 59614.3092, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.446666666666666, + "grad_norm": 0.006535337772220373, + "learning_rate": 5.952393016754131e-05, + "loss": 0.011356716975569725, + "num_input_tokens_seen": 120150712, + "step": 7337, + "train_runtime": 59622.42, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.447272727272727, + "grad_norm": 0.008359244093298912, + "learning_rate": 5.951448991088441e-05, + "loss": 0.0129659753292799, + "num_input_tokens_seen": 120167088, + "step": 7338, + "train_runtime": 59630.5352, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.447878787878788, + "grad_norm": 0.009195860475301743, + "learning_rate": 5.950504930230504e-05, + "loss": 0.012021773494780064, + "num_input_tokens_seen": 120183464, + "step": 7339, + "train_runtime": 59638.6513, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.448484848484848, + "grad_norm": 0.004311786033213139, + "learning_rate": 5.949560834215242e-05, + "loss": 0.012464710511267185, + "num_input_tokens_seen": 120199840, + "step": 7340, + "train_runtime": 59646.7628, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.449090909090909, + "grad_norm": 0.00536229507997632, + "learning_rate": 5.948616703077574e-05, + "loss": 0.011605312116444111, + "num_input_tokens_seen": 120216216, + "step": 7341, + "train_runtime": 59654.8761, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.449696969696969, + "grad_norm": 0.006746332626789808, + "learning_rate": 5.947672536852421e-05, + "loss": 0.012064961716532707, + "num_input_tokens_seen": 120232592, + "step": 7342, + "train_runtime": 59662.9915, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.45030303030303, + "grad_norm": 0.006410657893866301, + "learning_rate": 5.946728335574705e-05, + "loss": 0.011672263033688068, + "num_input_tokens_seen": 120248968, + "step": 7343, + "train_runtime": 59671.1054, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.450909090909091, + "grad_norm": 0.00583629310131073, + "learning_rate": 5.9457840992793536e-05, + "loss": 0.012719416059553623, + "num_input_tokens_seen": 120265344, + "step": 7344, + "train_runtime": 59679.2214, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.451515151515151, + "grad_norm": 0.00861100759357214, + "learning_rate": 5.9448398280012897e-05, + "loss": 0.012038299813866615, + "num_input_tokens_seen": 120281720, + "step": 7345, + "train_runtime": 59687.3404, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.452121212121212, + "grad_norm": 0.003563522594049573, + "learning_rate": 5.943895521775441e-05, + "loss": 0.012734402902424335, + "num_input_tokens_seen": 120298096, + "step": 7346, + "train_runtime": 59695.4568, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.452727272727273, + "grad_norm": 0.004082544706761837, + "learning_rate": 5.942951180636733e-05, + "loss": 0.011163691990077496, + "num_input_tokens_seen": 120314472, + "step": 7347, + "train_runtime": 59703.5751, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.453333333333333, + "grad_norm": 0.006159675773233175, + "learning_rate": 5.9420068046200995e-05, + "loss": 0.012916757725179195, + "num_input_tokens_seen": 120330848, + "step": 7348, + "train_runtime": 59711.6905, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.453939393939394, + "grad_norm": 0.004007155075669289, + "learning_rate": 5.941062393760467e-05, + "loss": 0.010154800489544868, + "num_input_tokens_seen": 120347224, + "step": 7349, + "train_runtime": 59719.8084, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.454545454545454, + "grad_norm": 0.008164721541106701, + "learning_rate": 5.9401179480927715e-05, + "loss": 0.012357574887573719, + "num_input_tokens_seen": 120363600, + "step": 7350, + "train_runtime": 59727.9304, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.455151515151515, + "grad_norm": 0.005045021418482065, + "learning_rate": 5.939173467651942e-05, + "loss": 0.011513304896652699, + "num_input_tokens_seen": 120379976, + "step": 7351, + "train_runtime": 59736.0528, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.455757575757576, + "grad_norm": 0.007720267865806818, + "learning_rate": 5.938228952472915e-05, + "loss": 0.012232200242578983, + "num_input_tokens_seen": 120396352, + "step": 7352, + "train_runtime": 59744.1645, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.456363636363636, + "grad_norm": 0.006106114946305752, + "learning_rate": 5.937284402590626e-05, + "loss": 0.011908757500350475, + "num_input_tokens_seen": 120412728, + "step": 7353, + "train_runtime": 59752.281, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.456969696969697, + "grad_norm": 0.017284460365772247, + "learning_rate": 5.936339818040013e-05, + "loss": 0.011362496763467789, + "num_input_tokens_seen": 120429104, + "step": 7354, + "train_runtime": 59760.3982, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.457575757575758, + "grad_norm": 0.004650470335036516, + "learning_rate": 5.935395198856013e-05, + "loss": 0.01263531856238842, + "num_input_tokens_seen": 120445480, + "step": 7355, + "train_runtime": 59768.5177, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.458181818181818, + "grad_norm": 0.006015765015035868, + "learning_rate": 5.934450545073567e-05, + "loss": 0.012254966422915459, + "num_input_tokens_seen": 120461856, + "step": 7356, + "train_runtime": 59776.6329, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.458787878787879, + "grad_norm": 0.002620214829221368, + "learning_rate": 5.9335058567276144e-05, + "loss": 0.011074674315750599, + "num_input_tokens_seen": 120478232, + "step": 7357, + "train_runtime": 59784.7478, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.459393939393939, + "grad_norm": 0.008171136491000652, + "learning_rate": 5.9325611338530985e-05, + "loss": 0.013259831815958023, + "num_input_tokens_seen": 120494608, + "step": 7358, + "train_runtime": 59792.861, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.46, + "grad_norm": 0.007186796050518751, + "learning_rate": 5.931616376484962e-05, + "loss": 0.01243534404784441, + "num_input_tokens_seen": 120510984, + "step": 7359, + "train_runtime": 59800.9707, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.460606060606061, + "grad_norm": 0.008412345312535763, + "learning_rate": 5.9306715846581506e-05, + "loss": 0.011509754694998264, + "num_input_tokens_seen": 120527360, + "step": 7360, + "train_runtime": 59809.0823, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.461212121212121, + "grad_norm": 0.002701538847759366, + "learning_rate": 5.929726758407609e-05, + "loss": 0.012514428235590458, + "num_input_tokens_seen": 120543736, + "step": 7361, + "train_runtime": 59817.1941, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.461818181818182, + "grad_norm": 0.006582144182175398, + "learning_rate": 5.9287818977682854e-05, + "loss": 0.01247149333357811, + "num_input_tokens_seen": 120560112, + "step": 7362, + "train_runtime": 59825.3088, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.462424242424243, + "grad_norm": 0.0033069169148802757, + "learning_rate": 5.9278370027751275e-05, + "loss": 0.011486891657114029, + "num_input_tokens_seen": 120576488, + "step": 7363, + "train_runtime": 59833.4307, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.463030303030303, + "grad_norm": 0.0048654102720320225, + "learning_rate": 5.9268920734630875e-05, + "loss": 0.012298233807086945, + "num_input_tokens_seen": 120592864, + "step": 7364, + "train_runtime": 59841.5464, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.463636363636364, + "grad_norm": 0.005774909630417824, + "learning_rate": 5.925947109867114e-05, + "loss": 0.01244383491575718, + "num_input_tokens_seen": 120609240, + "step": 7365, + "train_runtime": 59849.6604, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.464242424242424, + "grad_norm": 0.003622927237302065, + "learning_rate": 5.9250021120221576e-05, + "loss": 0.011402766220271587, + "num_input_tokens_seen": 120625616, + "step": 7366, + "train_runtime": 59857.7745, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.464848484848485, + "grad_norm": 0.007719808723777533, + "learning_rate": 5.9240570799631765e-05, + "loss": 0.011214827187359333, + "num_input_tokens_seen": 120641992, + "step": 7367, + "train_runtime": 59865.8879, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.465454545454546, + "grad_norm": 0.009988962672650814, + "learning_rate": 5.923112013725125e-05, + "loss": 0.01253876369446516, + "num_input_tokens_seen": 120658368, + "step": 7368, + "train_runtime": 59874.0065, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 4.466060606060606, + "grad_norm": 0.0067720115184783936, + "learning_rate": 5.922166913342956e-05, + "loss": 0.01097847055643797, + "num_input_tokens_seen": 120674744, + "step": 7369, + "train_runtime": 59882.1472, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.466666666666667, + "grad_norm": 0.005748811177909374, + "learning_rate": 5.921221778851629e-05, + "loss": 0.010877182707190514, + "num_input_tokens_seen": 120691120, + "step": 7370, + "train_runtime": 59890.2623, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.467272727272727, + "grad_norm": 0.006522682495415211, + "learning_rate": 5.920276610286102e-05, + "loss": 0.012215021066367626, + "num_input_tokens_seen": 120707496, + "step": 7371, + "train_runtime": 59898.4462, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.467878787878788, + "grad_norm": 0.005444098263978958, + "learning_rate": 5.9193314076813365e-05, + "loss": 0.01274205930531025, + "num_input_tokens_seen": 120723872, + "step": 7372, + "train_runtime": 59906.5792, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.468484848484849, + "grad_norm": 0.005387032404541969, + "learning_rate": 5.9183861710722924e-05, + "loss": 0.011913316324353218, + "num_input_tokens_seen": 120740248, + "step": 7373, + "train_runtime": 59914.76, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.469090909090909, + "grad_norm": 0.007351752370595932, + "learning_rate": 5.9174409004939305e-05, + "loss": 0.010862283408641815, + "num_input_tokens_seen": 120756624, + "step": 7374, + "train_runtime": 59922.9125, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.46969696969697, + "grad_norm": 0.006846481002867222, + "learning_rate": 5.9164955959812176e-05, + "loss": 0.011668909341096878, + "num_input_tokens_seen": 120773000, + "step": 7375, + "train_runtime": 59931.0511, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.470303030303031, + "grad_norm": 0.015766877681016922, + "learning_rate": 5.9155502575691166e-05, + "loss": 0.012032456696033478, + "num_input_tokens_seen": 120789376, + "step": 7376, + "train_runtime": 59939.1911, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.470909090909091, + "grad_norm": 0.006479022093117237, + "learning_rate": 5.914604885292596e-05, + "loss": 0.011621473357081413, + "num_input_tokens_seen": 120805752, + "step": 7377, + "train_runtime": 59947.3357, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.471515151515152, + "grad_norm": 0.0057898834347724915, + "learning_rate": 5.913659479186621e-05, + "loss": 0.010827083140611649, + "num_input_tokens_seen": 120822128, + "step": 7378, + "train_runtime": 59955.4553, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.472121212121212, + "grad_norm": 0.008127969689667225, + "learning_rate": 5.9127140392861614e-05, + "loss": 0.011490117758512497, + "num_input_tokens_seen": 120838504, + "step": 7379, + "train_runtime": 59963.5754, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.472727272727273, + "grad_norm": 0.0066016786731779575, + "learning_rate": 5.9117685656261866e-05, + "loss": 0.012192782945930958, + "num_input_tokens_seen": 120854880, + "step": 7380, + "train_runtime": 59971.6949, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.473333333333334, + "grad_norm": 0.006692313589155674, + "learning_rate": 5.9108230582416676e-05, + "loss": 0.011416204273700714, + "num_input_tokens_seen": 120871256, + "step": 7381, + "train_runtime": 59979.8122, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.473939393939394, + "grad_norm": 0.007159905973821878, + "learning_rate": 5.90987751716758e-05, + "loss": 0.010931414552032948, + "num_input_tokens_seen": 120887632, + "step": 7382, + "train_runtime": 59987.9302, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.474545454545455, + "grad_norm": 0.007059791591018438, + "learning_rate": 5.908931942438892e-05, + "loss": 0.011360105127096176, + "num_input_tokens_seen": 120904008, + "step": 7383, + "train_runtime": 59996.0423, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.4751515151515155, + "grad_norm": 0.005092640873044729, + "learning_rate": 5.907986334090583e-05, + "loss": 0.012762497179210186, + "num_input_tokens_seen": 120920384, + "step": 7384, + "train_runtime": 60004.1581, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.475757575757576, + "grad_norm": 0.009130202233791351, + "learning_rate": 5.907040692157626e-05, + "loss": 0.01393795758485794, + "num_input_tokens_seen": 120936760, + "step": 7385, + "train_runtime": 60012.2719, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.4763636363636365, + "grad_norm": 0.005537665914744139, + "learning_rate": 5.9060950166750015e-05, + "loss": 0.011060119606554508, + "num_input_tokens_seen": 120953136, + "step": 7386, + "train_runtime": 60020.3908, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.476969696969697, + "grad_norm": 0.005927036516368389, + "learning_rate": 5.905149307677688e-05, + "loss": 0.012143654748797417, + "num_input_tokens_seen": 120969512, + "step": 7387, + "train_runtime": 60028.5405, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.4775757575757575, + "grad_norm": 0.00925386417657137, + "learning_rate": 5.904203565200663e-05, + "loss": 0.011489068157970905, + "num_input_tokens_seen": 120985888, + "step": 7388, + "train_runtime": 60036.6768, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.4781818181818185, + "grad_norm": 0.006231648847460747, + "learning_rate": 5.903257789278909e-05, + "loss": 0.013152793049812317, + "num_input_tokens_seen": 121002264, + "step": 7389, + "train_runtime": 60044.8353, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.4787878787878785, + "grad_norm": 0.008495653979480267, + "learning_rate": 5.902311979947408e-05, + "loss": 0.012974532321095467, + "num_input_tokens_seen": 121018640, + "step": 7390, + "train_runtime": 60052.9717, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.4793939393939395, + "grad_norm": 0.008892616257071495, + "learning_rate": 5.901366137241145e-05, + "loss": 0.012889361940324306, + "num_input_tokens_seen": 121035016, + "step": 7391, + "train_runtime": 60061.1098, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.48, + "grad_norm": 0.008123672567307949, + "learning_rate": 5.900420261195104e-05, + "loss": 0.01263979822397232, + "num_input_tokens_seen": 121051392, + "step": 7392, + "train_runtime": 60069.2483, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.4806060606060605, + "grad_norm": 0.008833922445774078, + "learning_rate": 5.8994743518442694e-05, + "loss": 0.012719920836389065, + "num_input_tokens_seen": 121067768, + "step": 7393, + "train_runtime": 60077.3764, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.481212121212121, + "grad_norm": 0.007458955515176058, + "learning_rate": 5.898528409223631e-05, + "loss": 0.012645816430449486, + "num_input_tokens_seen": 121084144, + "step": 7394, + "train_runtime": 60085.4913, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.4818181818181815, + "grad_norm": 0.007055291905999184, + "learning_rate": 5.897582433368177e-05, + "loss": 0.012438235804438591, + "num_input_tokens_seen": 121100520, + "step": 7395, + "train_runtime": 60093.6107, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.482424242424242, + "grad_norm": 0.006025281269103289, + "learning_rate": 5.896636424312898e-05, + "loss": 0.011006435379385948, + "num_input_tokens_seen": 121116896, + "step": 7396, + "train_runtime": 60101.7314, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.483030303030303, + "grad_norm": 0.0054452368058264256, + "learning_rate": 5.895690382092781e-05, + "loss": 0.012290939688682556, + "num_input_tokens_seen": 121133272, + "step": 7397, + "train_runtime": 60109.8467, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.483636363636363, + "grad_norm": 0.00545475585386157, + "learning_rate": 5.894744306742823e-05, + "loss": 0.011973683722317219, + "num_input_tokens_seen": 121149648, + "step": 7398, + "train_runtime": 60117.9643, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.484242424242424, + "grad_norm": 0.007788896095007658, + "learning_rate": 5.893798198298014e-05, + "loss": 0.011684827506542206, + "num_input_tokens_seen": 121166024, + "step": 7399, + "train_runtime": 60126.0787, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.484848484848484, + "grad_norm": 0.00698448158800602, + "learning_rate": 5.892852056793352e-05, + "loss": 0.01079865824431181, + "num_input_tokens_seen": 121182400, + "step": 7400, + "train_runtime": 60134.2209, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.485454545454545, + "grad_norm": 0.004109036643058062, + "learning_rate": 5.89190588226383e-05, + "loss": 0.011913914233446121, + "num_input_tokens_seen": 121198776, + "step": 7401, + "train_runtime": 60143.4204, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 4.486060606060606, + "grad_norm": 0.00594255281612277, + "learning_rate": 5.8909596747444477e-05, + "loss": 0.0116860531270504, + "num_input_tokens_seen": 121215152, + "step": 7402, + "train_runtime": 60151.535, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 4.486666666666666, + "grad_norm": 0.002688049338757992, + "learning_rate": 5.8900134342702004e-05, + "loss": 0.011077359318733215, + "num_input_tokens_seen": 121231528, + "step": 7403, + "train_runtime": 60159.6453, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.487272727272727, + "grad_norm": 0.005429972894489765, + "learning_rate": 5.889067160876091e-05, + "loss": 0.0114585654810071, + "num_input_tokens_seen": 121247904, + "step": 7404, + "train_runtime": 60167.7563, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.487878787878788, + "grad_norm": 0.0066853235475718975, + "learning_rate": 5.8881208545971176e-05, + "loss": 0.012389697134494781, + "num_input_tokens_seen": 121264280, + "step": 7405, + "train_runtime": 60175.8656, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 4.488484848484848, + "grad_norm": 0.006963841617107391, + "learning_rate": 5.887174515468282e-05, + "loss": 0.012653941288590431, + "num_input_tokens_seen": 121280656, + "step": 7406, + "train_runtime": 60183.9974, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 4.489090909090909, + "grad_norm": 0.008474506437778473, + "learning_rate": 5.886228143524592e-05, + "loss": 0.011983315460383892, + "num_input_tokens_seen": 121297032, + "step": 7407, + "train_runtime": 60192.1301, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.489696969696969, + "grad_norm": 0.006914717610925436, + "learning_rate": 5.885281738801045e-05, + "loss": 0.012009972706437111, + "num_input_tokens_seen": 121313408, + "step": 7408, + "train_runtime": 60200.259, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.49030303030303, + "grad_norm": 0.005322455428540707, + "learning_rate": 5.884335301332654e-05, + "loss": 0.011630887165665627, + "num_input_tokens_seen": 121329784, + "step": 7409, + "train_runtime": 60208.3901, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.490909090909091, + "grad_norm": 0.0059000179171562195, + "learning_rate": 5.8833888311544207e-05, + "loss": 0.012081107124686241, + "num_input_tokens_seen": 121346160, + "step": 7410, + "train_runtime": 60216.5315, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.491515151515151, + "grad_norm": 0.004761508200317621, + "learning_rate": 5.882442328301355e-05, + "loss": 0.012602065689861774, + "num_input_tokens_seen": 121362536, + "step": 7411, + "train_runtime": 60224.6524, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.492121212121212, + "grad_norm": 0.003793605137616396, + "learning_rate": 5.881495792808464e-05, + "loss": 0.012878164649009705, + "num_input_tokens_seen": 121378912, + "step": 7412, + "train_runtime": 60232.7698, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.492727272727273, + "grad_norm": 0.00781884603202343, + "learning_rate": 5.880549224710764e-05, + "loss": 0.013222620822489262, + "num_input_tokens_seen": 121395288, + "step": 7413, + "train_runtime": 60240.8926, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.493333333333333, + "grad_norm": 0.004618137609213591, + "learning_rate": 5.879602624043261e-05, + "loss": 0.012478463351726532, + "num_input_tokens_seen": 121411664, + "step": 7414, + "train_runtime": 60249.0123, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.493939393939394, + "grad_norm": 0.0025459351018071175, + "learning_rate": 5.878655990840971e-05, + "loss": 0.011055423878133297, + "num_input_tokens_seen": 121428040, + "step": 7415, + "train_runtime": 60257.1322, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 4.494545454545454, + "grad_norm": 0.008038830943405628, + "learning_rate": 5.877709325138905e-05, + "loss": 0.011643639765679836, + "num_input_tokens_seen": 121444416, + "step": 7416, + "train_runtime": 60265.2448, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 4.495151515151515, + "grad_norm": 0.005348806735128164, + "learning_rate": 5.8767626269720824e-05, + "loss": 0.01219367515295744, + "num_input_tokens_seen": 121460792, + "step": 7417, + "train_runtime": 60273.3613, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 4.495757575757576, + "grad_norm": 0.0068849534727633, + "learning_rate": 5.875815896375517e-05, + "loss": 0.012276221998035908, + "num_input_tokens_seen": 121477168, + "step": 7418, + "train_runtime": 60281.479, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 4.496363636363636, + "grad_norm": 0.007638324983417988, + "learning_rate": 5.8748691333842265e-05, + "loss": 0.011335955932736397, + "num_input_tokens_seen": 121493544, + "step": 7419, + "train_runtime": 60289.5982, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 4.496969696969697, + "grad_norm": 0.0038147957529872656, + "learning_rate": 5.8739223380332306e-05, + "loss": 0.01159148570150137, + "num_input_tokens_seen": 121509920, + "step": 7420, + "train_runtime": 60297.7163, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 4.497575757575758, + "grad_norm": 0.00419682776555419, + "learning_rate": 5.87297551035755e-05, + "loss": 0.012533755972981453, + "num_input_tokens_seen": 121526296, + "step": 7421, + "train_runtime": 60305.8424, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 4.498181818181818, + "grad_norm": 0.006443911697715521, + "learning_rate": 5.872028650392205e-05, + "loss": 0.011310091242194176, + "num_input_tokens_seen": 121542672, + "step": 7422, + "train_runtime": 60313.9678, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 4.498787878787879, + "grad_norm": 0.006715926807373762, + "learning_rate": 5.871081758172219e-05, + "loss": 0.011908059939742088, + "num_input_tokens_seen": 121559048, + "step": 7423, + "train_runtime": 60322.0924, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 4.499393939393939, + "grad_norm": 0.009620782919228077, + "learning_rate": 5.8701348337326146e-05, + "loss": 0.013648133724927902, + "num_input_tokens_seen": 121575424, + "step": 7424, + "train_runtime": 60330.2104, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 4.5, + "grad_norm": 0.009638079442083836, + "learning_rate": 5.8691878771084164e-05, + "loss": 0.012543809600174427, + "num_input_tokens_seen": 121591800, + "step": 7425, + "train_runtime": 60338.3314, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 4.500606060606061, + "grad_norm": 0.0046216025948524475, + "learning_rate": 5.868240888334653e-05, + "loss": 0.012280633673071861, + "num_input_tokens_seen": 121608176, + "step": 7426, + "train_runtime": 60346.4485, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 4.501212121212121, + "grad_norm": 0.004622709937393665, + "learning_rate": 5.8672938674463485e-05, + "loss": 0.01158909685909748, + "num_input_tokens_seen": 121624552, + "step": 7427, + "train_runtime": 60354.5651, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 4.501818181818182, + "grad_norm": 0.006485712248831987, + "learning_rate": 5.866346814478534e-05, + "loss": 0.012570836581289768, + "num_input_tokens_seen": 121640928, + "step": 7428, + "train_runtime": 60362.6831, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 4.502424242424242, + "grad_norm": 0.007876059971749783, + "learning_rate": 5.865399729466237e-05, + "loss": 0.012640135362744331, + "num_input_tokens_seen": 121657304, + "step": 7429, + "train_runtime": 60370.8007, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 4.503030303030303, + "grad_norm": 0.003252696478739381, + "learning_rate": 5.864452612444489e-05, + "loss": 0.012019352056086063, + "num_input_tokens_seen": 121673680, + "step": 7430, + "train_runtime": 60378.9178, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 4.503636363636364, + "grad_norm": 0.005443859379738569, + "learning_rate": 5.863505463448322e-05, + "loss": 0.012280452996492386, + "num_input_tokens_seen": 121690056, + "step": 7431, + "train_runtime": 60387.0322, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 4.504242424242424, + "grad_norm": 0.00831444002687931, + "learning_rate": 5.862558282512772e-05, + "loss": 0.013229894451797009, + "num_input_tokens_seen": 121706432, + "step": 7432, + "train_runtime": 60395.1478, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 4.504848484848485, + "grad_norm": 0.00859881192445755, + "learning_rate": 5.861611069672869e-05, + "loss": 0.012600138783454895, + "num_input_tokens_seen": 121722808, + "step": 7433, + "train_runtime": 60403.2635, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 4.505454545454546, + "grad_norm": 0.03655974566936493, + "learning_rate": 5.8606638249636506e-05, + "loss": 0.012091470882296562, + "num_input_tokens_seen": 121739184, + "step": 7434, + "train_runtime": 60411.3783, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 4.506060606060606, + "grad_norm": 0.009546743705868721, + "learning_rate": 5.859716548420152e-05, + "loss": 0.013217727653682232, + "num_input_tokens_seen": 121755560, + "step": 7435, + "train_runtime": 60419.4929, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 4.506666666666667, + "grad_norm": 0.004069843795150518, + "learning_rate": 5.8587692400774154e-05, + "loss": 0.011197997257113457, + "num_input_tokens_seen": 121771936, + "step": 7436, + "train_runtime": 60427.6134, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 4.507272727272728, + "grad_norm": 0.007265013176947832, + "learning_rate": 5.857821899970475e-05, + "loss": 0.011313153430819511, + "num_input_tokens_seen": 121788312, + "step": 7437, + "train_runtime": 60435.7316, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 4.507878787878788, + "grad_norm": 0.007473419886082411, + "learning_rate": 5.856874528134374e-05, + "loss": 0.012974688783288002, + "num_input_tokens_seen": 121804688, + "step": 7438, + "train_runtime": 60443.8481, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 4.508484848484849, + "grad_norm": 0.0035666124895215034, + "learning_rate": 5.855927124604152e-05, + "loss": 0.010531166568398476, + "num_input_tokens_seen": 121821064, + "step": 7439, + "train_runtime": 60451.9685, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 4.509090909090909, + "grad_norm": 0.006321573164314032, + "learning_rate": 5.8549796894148534e-05, + "loss": 0.010938674211502075, + "num_input_tokens_seen": 121837440, + "step": 7440, + "train_runtime": 60460.0902, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 4.50969696969697, + "grad_norm": 0.007823825813829899, + "learning_rate": 5.854032222601521e-05, + "loss": 0.01219155639410019, + "num_input_tokens_seen": 121853816, + "step": 7441, + "train_runtime": 60468.2056, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 4.51030303030303, + "grad_norm": 0.003070801729336381, + "learning_rate": 5.8530847241991994e-05, + "loss": 0.011093913577497005, + "num_input_tokens_seen": 121870192, + "step": 7442, + "train_runtime": 60476.3181, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 4.510909090909091, + "grad_norm": 0.010207605548202991, + "learning_rate": 5.852137194242935e-05, + "loss": 0.01260855607688427, + "num_input_tokens_seen": 121886568, + "step": 7443, + "train_runtime": 60484.4351, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 4.511515151515152, + "grad_norm": 0.00880705937743187, + "learning_rate": 5.851189632767775e-05, + "loss": 0.01077430322766304, + "num_input_tokens_seen": 121902944, + "step": 7444, + "train_runtime": 60492.5494, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 4.512121212121212, + "grad_norm": 0.0020832845475524664, + "learning_rate": 5.850242039808769e-05, + "loss": 0.011859538964927197, + "num_input_tokens_seen": 121919320, + "step": 7445, + "train_runtime": 60500.6681, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 4.512727272727273, + "grad_norm": 0.007662924937903881, + "learning_rate": 5.8492944154009656e-05, + "loss": 0.012502728961408138, + "num_input_tokens_seen": 121935696, + "step": 7446, + "train_runtime": 60508.7825, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 4.513333333333334, + "grad_norm": 0.004233106970787048, + "learning_rate": 5.848346759579416e-05, + "loss": 0.012202632613480091, + "num_input_tokens_seen": 121952072, + "step": 7447, + "train_runtime": 60516.8984, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 4.513939393939394, + "grad_norm": 0.006590844597667456, + "learning_rate": 5.84739907237917e-05, + "loss": 0.012571950443089008, + "num_input_tokens_seen": 121968448, + "step": 7448, + "train_runtime": 60525.0127, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 4.514545454545455, + "grad_norm": 0.005834270268678665, + "learning_rate": 5.846451353835285e-05, + "loss": 0.011259973049163818, + "num_input_tokens_seen": 121984824, + "step": 7449, + "train_runtime": 60533.1302, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 4.515151515151516, + "grad_norm": 0.0035212747752666473, + "learning_rate": 5.845503603982811e-05, + "loss": 0.011555373668670654, + "num_input_tokens_seen": 122001200, + "step": 7450, + "train_runtime": 60541.2461, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 4.515757575757576, + "grad_norm": 0.016510434448719025, + "learning_rate": 5.8445558228568066e-05, + "loss": 0.01349032111465931, + "num_input_tokens_seen": 122017576, + "step": 7451, + "train_runtime": 60549.3598, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 4.516363636363637, + "grad_norm": 0.01796264946460724, + "learning_rate": 5.843608010492325e-05, + "loss": 0.011624453589320183, + "num_input_tokens_seen": 122033952, + "step": 7452, + "train_runtime": 60557.4745, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 4.516969696969697, + "grad_norm": 0.00503959646448493, + "learning_rate": 5.8426601669244286e-05, + "loss": 0.011968092992901802, + "num_input_tokens_seen": 122050328, + "step": 7453, + "train_runtime": 60565.5903, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 4.517575757575758, + "grad_norm": 0.0061596897430717945, + "learning_rate": 5.841712292188172e-05, + "loss": 0.011514652520418167, + "num_input_tokens_seen": 122066704, + "step": 7454, + "train_runtime": 60573.7044, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 4.5181818181818185, + "grad_norm": 0.005164189264178276, + "learning_rate": 5.840764386318618e-05, + "loss": 0.01272825337946415, + "num_input_tokens_seen": 122083080, + "step": 7455, + "train_runtime": 60581.8188, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 4.518787878787879, + "grad_norm": 0.00570727838203311, + "learning_rate": 5.8398164493508244e-05, + "loss": 0.011025605723261833, + "num_input_tokens_seen": 122099456, + "step": 7456, + "train_runtime": 60589.9334, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 4.5193939393939395, + "grad_norm": 0.008791424334049225, + "learning_rate": 5.838868481319858e-05, + "loss": 0.0116766057908535, + "num_input_tokens_seen": 122115832, + "step": 7457, + "train_runtime": 60598.0457, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 4.52, + "grad_norm": 0.00338292820379138, + "learning_rate": 5.837920482260778e-05, + "loss": 0.010959308594465256, + "num_input_tokens_seen": 122132208, + "step": 7458, + "train_runtime": 60606.16, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 4.5206060606060605, + "grad_norm": 0.006840405520051718, + "learning_rate": 5.836972452208654e-05, + "loss": 0.014136707410216331, + "num_input_tokens_seen": 122148584, + "step": 7459, + "train_runtime": 60614.2712, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 4.5212121212121215, + "grad_norm": 0.007938550785183907, + "learning_rate": 5.836024391198547e-05, + "loss": 0.012110092677175999, + "num_input_tokens_seen": 122164960, + "step": 7460, + "train_runtime": 60622.3864, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 4.5218181818181815, + "grad_norm": 0.006932188291102648, + "learning_rate": 5.835076299265526e-05, + "loss": 0.011745572090148926, + "num_input_tokens_seen": 122181336, + "step": 7461, + "train_runtime": 60630.5009, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 4.5224242424242425, + "grad_norm": 0.008883235044777393, + "learning_rate": 5.834128176444659e-05, + "loss": 0.013290399685502052, + "num_input_tokens_seen": 122197712, + "step": 7462, + "train_runtime": 60638.6127, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 4.523030303030303, + "grad_norm": 0.009117498993873596, + "learning_rate": 5.8331800227710154e-05, + "loss": 0.011728959158062935, + "num_input_tokens_seen": 122214088, + "step": 7463, + "train_runtime": 60646.7297, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 4.5236363636363635, + "grad_norm": 0.01338648796081543, + "learning_rate": 5.832231838279665e-05, + "loss": 0.01240079291164875, + "num_input_tokens_seen": 122230464, + "step": 7464, + "train_runtime": 60654.8422, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 4.524242424242424, + "grad_norm": 0.009684640914201736, + "learning_rate": 5.83128362300568e-05, + "loss": 0.012569054961204529, + "num_input_tokens_seen": 122246840, + "step": 7465, + "train_runtime": 60662.956, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 4.524848484848485, + "grad_norm": 0.005550570320338011, + "learning_rate": 5.8303353769841316e-05, + "loss": 0.011641517281532288, + "num_input_tokens_seen": 122263216, + "step": 7466, + "train_runtime": 60671.0693, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 4.525454545454545, + "grad_norm": 0.007068332750350237, + "learning_rate": 5.8293871002500965e-05, + "loss": 0.013180263340473175, + "num_input_tokens_seen": 122279592, + "step": 7467, + "train_runtime": 60679.1838, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 4.526060606060606, + "grad_norm": 0.008101638406515121, + "learning_rate": 5.828438792838646e-05, + "loss": 0.012949595227837563, + "num_input_tokens_seen": 122295968, + "step": 7468, + "train_runtime": 60687.3006, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 4.526666666666666, + "grad_norm": 0.00924333743751049, + "learning_rate": 5.8274904547848595e-05, + "loss": 0.01064026728272438, + "num_input_tokens_seen": 122312344, + "step": 7469, + "train_runtime": 60695.4179, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 4.527272727272727, + "grad_norm": 0.009208728559315205, + "learning_rate": 5.826542086123812e-05, + "loss": 0.012602102942764759, + "num_input_tokens_seen": 122328720, + "step": 7470, + "train_runtime": 60703.5314, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 4.527878787878787, + "grad_norm": 0.0025503060314804316, + "learning_rate": 5.82559368689058e-05, + "loss": 0.01304345391690731, + "num_input_tokens_seen": 122345096, + "step": 7471, + "train_runtime": 60711.6461, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 4.528484848484848, + "grad_norm": 0.008224641904234886, + "learning_rate": 5.824645257120248e-05, + "loss": 0.013123159296810627, + "num_input_tokens_seen": 122361472, + "step": 7472, + "train_runtime": 60719.7605, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 4.529090909090909, + "grad_norm": 0.008876552805304527, + "learning_rate": 5.823696796847894e-05, + "loss": 0.012610513716936111, + "num_input_tokens_seen": 122377848, + "step": 7473, + "train_runtime": 60727.8712, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 4.529696969696969, + "grad_norm": 0.01077653095126152, + "learning_rate": 5.822748306108599e-05, + "loss": 0.012082655914127827, + "num_input_tokens_seen": 122394224, + "step": 7474, + "train_runtime": 60735.9835, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 4.53030303030303, + "grad_norm": 0.007568391505628824, + "learning_rate": 5.8217997849374454e-05, + "loss": 0.011551016010344028, + "num_input_tokens_seen": 122410600, + "step": 7475, + "train_runtime": 60744.095, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 4.530909090909091, + "grad_norm": 0.008432717062532902, + "learning_rate": 5.8208512333695185e-05, + "loss": 0.012234884314239025, + "num_input_tokens_seen": 122426976, + "step": 7476, + "train_runtime": 60752.209, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 4.531515151515151, + "grad_norm": 0.002261258428916335, + "learning_rate": 5.819902651439904e-05, + "loss": 0.0115195382386446, + "num_input_tokens_seen": 122443352, + "step": 7477, + "train_runtime": 60760.331, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 4.532121212121212, + "grad_norm": 0.0038170013576745987, + "learning_rate": 5.8189540391836864e-05, + "loss": 0.011260981671512127, + "num_input_tokens_seen": 122459728, + "step": 7478, + "train_runtime": 60768.4423, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 4.532727272727273, + "grad_norm": 0.004033150151371956, + "learning_rate": 5.818005396635953e-05, + "loss": 0.011318730190396309, + "num_input_tokens_seen": 122476104, + "step": 7479, + "train_runtime": 60776.5533, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 4.533333333333333, + "grad_norm": 0.003319941461086273, + "learning_rate": 5.817056723831793e-05, + "loss": 0.011951145716011524, + "num_input_tokens_seen": 122492480, + "step": 7480, + "train_runtime": 60784.665, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 4.533939393939394, + "grad_norm": 0.003065842902287841, + "learning_rate": 5.816108020806297e-05, + "loss": 0.012411411851644516, + "num_input_tokens_seen": 122508856, + "step": 7481, + "train_runtime": 60792.7784, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.534545454545454, + "grad_norm": 0.007712319493293762, + "learning_rate": 5.815159287594555e-05, + "loss": 0.011345027945935726, + "num_input_tokens_seen": 122525232, + "step": 7482, + "train_runtime": 60800.8903, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.535151515151515, + "grad_norm": 0.006968351546674967, + "learning_rate": 5.814210524231657e-05, + "loss": 0.011736880987882614, + "num_input_tokens_seen": 122541608, + "step": 7483, + "train_runtime": 60809.0032, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.535757575757576, + "grad_norm": 0.007816396653652191, + "learning_rate": 5.813261730752698e-05, + "loss": 0.012120930477976799, + "num_input_tokens_seen": 122557984, + "step": 7484, + "train_runtime": 60817.1139, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.536363636363636, + "grad_norm": 0.003324455814436078, + "learning_rate": 5.8123129071927705e-05, + "loss": 0.011433332227170467, + "num_input_tokens_seen": 122574360, + "step": 7485, + "train_runtime": 60825.2303, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.536969696969697, + "grad_norm": 0.011048461310565472, + "learning_rate": 5.811364053586973e-05, + "loss": 0.012288298457860947, + "num_input_tokens_seen": 122590736, + "step": 7486, + "train_runtime": 60833.3431, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.537575757575757, + "grad_norm": 0.005114708561450243, + "learning_rate": 5.810415169970397e-05, + "loss": 0.011468739248812199, + "num_input_tokens_seen": 122607112, + "step": 7487, + "train_runtime": 60841.4556, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.538181818181818, + "grad_norm": 0.004638183396309614, + "learning_rate": 5.8094662563781424e-05, + "loss": 0.012670084834098816, + "num_input_tokens_seen": 122623488, + "step": 7488, + "train_runtime": 60849.566, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.538787878787879, + "grad_norm": 0.008664246648550034, + "learning_rate": 5.8085173128453065e-05, + "loss": 0.01239110715687275, + "num_input_tokens_seen": 122639864, + "step": 7489, + "train_runtime": 60857.6776, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.539393939393939, + "grad_norm": 0.008941346779465675, + "learning_rate": 5.807568339406991e-05, + "loss": 0.011571883223950863, + "num_input_tokens_seen": 122656240, + "step": 7490, + "train_runtime": 60865.7885, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.54, + "grad_norm": 0.008338113315403461, + "learning_rate": 5.8066193360982956e-05, + "loss": 0.01143043115735054, + "num_input_tokens_seen": 122672616, + "step": 7491, + "train_runtime": 60873.9032, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.540606060606061, + "grad_norm": 0.006822745781391859, + "learning_rate": 5.805670302954321e-05, + "loss": 0.01354019995778799, + "num_input_tokens_seen": 122688992, + "step": 7492, + "train_runtime": 60882.0143, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.541212121212121, + "grad_norm": 0.0036079545971006155, + "learning_rate": 5.804721240010171e-05, + "loss": 0.011718206107616425, + "num_input_tokens_seen": 122705368, + "step": 7493, + "train_runtime": 60890.1306, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.541818181818182, + "grad_norm": 0.0040779272094368935, + "learning_rate": 5.803772147300949e-05, + "loss": 0.011626984924077988, + "num_input_tokens_seen": 122721744, + "step": 7494, + "train_runtime": 60898.2424, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.542424242424243, + "grad_norm": 0.006363391876220703, + "learning_rate": 5.802823024861761e-05, + "loss": 0.012588887475430965, + "num_input_tokens_seen": 122738120, + "step": 7495, + "train_runtime": 60906.356, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.543030303030303, + "grad_norm": 0.004723140504211187, + "learning_rate": 5.801873872727713e-05, + "loss": 0.013052528724074364, + "num_input_tokens_seen": 122754496, + "step": 7496, + "train_runtime": 60914.4689, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.543636363636364, + "grad_norm": 0.007399110589176416, + "learning_rate": 5.800924690933912e-05, + "loss": 0.01345279160887003, + "num_input_tokens_seen": 122770872, + "step": 7497, + "train_runtime": 60922.5817, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.544242424242424, + "grad_norm": 0.007962867617607117, + "learning_rate": 5.799975479515466e-05, + "loss": 0.012895972467958927, + "num_input_tokens_seen": 122787248, + "step": 7498, + "train_runtime": 60930.6959, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.544848484848485, + "grad_norm": 0.008479689247906208, + "learning_rate": 5.7990262385074854e-05, + "loss": 0.012084566988050938, + "num_input_tokens_seen": 122803624, + "step": 7499, + "train_runtime": 60938.8103, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.545454545454545, + "grad_norm": 0.007886005565524101, + "learning_rate": 5.79807696794508e-05, + "loss": 0.010978351347148418, + "num_input_tokens_seen": 122820000, + "step": 7500, + "train_runtime": 60946.93, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.546060606060606, + "grad_norm": 0.007541271857917309, + "learning_rate": 5.797127667863362e-05, + "loss": 0.011396707966923714, + "num_input_tokens_seen": 122836376, + "step": 7501, + "train_runtime": 60956.0822, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 4.546666666666667, + "grad_norm": 0.04153669998049736, + "learning_rate": 5.796178338297443e-05, + "loss": 0.011829433962702751, + "num_input_tokens_seen": 122852752, + "step": 7502, + "train_runtime": 60964.1909, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 4.547272727272727, + "grad_norm": 0.007724971044808626, + "learning_rate": 5.795228979282439e-05, + "loss": 0.011533616110682487, + "num_input_tokens_seen": 122869128, + "step": 7503, + "train_runtime": 60972.2981, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 4.547878787878788, + "grad_norm": 0.010146350599825382, + "learning_rate": 5.794279590853463e-05, + "loss": 0.011492525227367878, + "num_input_tokens_seen": 122885504, + "step": 7504, + "train_runtime": 60980.4167, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 4.548484848484849, + "grad_norm": 0.00520604383200407, + "learning_rate": 5.7933301730456324e-05, + "loss": 0.012541307136416435, + "num_input_tokens_seen": 122901880, + "step": 7505, + "train_runtime": 60988.5312, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.549090909090909, + "grad_norm": 0.008148075081408024, + "learning_rate": 5.792380725894062e-05, + "loss": 0.012225640006363392, + "num_input_tokens_seen": 122918256, + "step": 7506, + "train_runtime": 60996.6416, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 4.54969696969697, + "grad_norm": 0.007797275669872761, + "learning_rate": 5.791431249433873e-05, + "loss": 0.011669299565255642, + "num_input_tokens_seen": 122934632, + "step": 7507, + "train_runtime": 61004.7477, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 4.550303030303031, + "grad_norm": 0.007242224179208279, + "learning_rate": 5.790481743700182e-05, + "loss": 0.011845477856695652, + "num_input_tokens_seen": 122951008, + "step": 7508, + "train_runtime": 61012.8563, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 4.550909090909091, + "grad_norm": 0.0045040445402264595, + "learning_rate": 5.7895322087281136e-05, + "loss": 0.012223348952829838, + "num_input_tokens_seen": 122967384, + "step": 7509, + "train_runtime": 61020.9702, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 4.551515151515152, + "grad_norm": 0.010100853629410267, + "learning_rate": 5.788582644552782e-05, + "loss": 0.01269073411822319, + "num_input_tokens_seen": 122983760, + "step": 7510, + "train_runtime": 61029.0839, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 4.552121212121212, + "grad_norm": 0.006079551763832569, + "learning_rate": 5.787633051209318e-05, + "loss": 0.012154240161180496, + "num_input_tokens_seen": 123000136, + "step": 7511, + "train_runtime": 61037.1978, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 4.552727272727273, + "grad_norm": 0.00767027772963047, + "learning_rate": 5.786683428732839e-05, + "loss": 0.012795902788639069, + "num_input_tokens_seen": 123016512, + "step": 7512, + "train_runtime": 61045.3113, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 4.553333333333334, + "grad_norm": 0.008745123632252216, + "learning_rate": 5.785733777158473e-05, + "loss": 0.011535726487636566, + "num_input_tokens_seen": 123032888, + "step": 7513, + "train_runtime": 61053.4297, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 4.553939393939394, + "grad_norm": 0.003242170438170433, + "learning_rate": 5.784784096521345e-05, + "loss": 0.01161230355501175, + "num_input_tokens_seen": 123049264, + "step": 7514, + "train_runtime": 61061.5415, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 4.554545454545455, + "grad_norm": 0.0015124240890145302, + "learning_rate": 5.7838343868565815e-05, + "loss": 0.011500673368573189, + "num_input_tokens_seen": 123065640, + "step": 7515, + "train_runtime": 61069.653, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 4.555151515151515, + "grad_norm": 0.012184390798211098, + "learning_rate": 5.782884648199309e-05, + "loss": 0.01287949550896883, + "num_input_tokens_seen": 123082016, + "step": 7516, + "train_runtime": 61077.7689, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 4.555757575757576, + "grad_norm": 0.005577355623245239, + "learning_rate": 5.781934880584658e-05, + "loss": 0.0132110845297575, + "num_input_tokens_seen": 123098392, + "step": 7517, + "train_runtime": 61085.8823, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 4.556363636363637, + "grad_norm": 0.006200830917805433, + "learning_rate": 5.780985084047761e-05, + "loss": 0.011666100472211838, + "num_input_tokens_seen": 123114768, + "step": 7518, + "train_runtime": 61093.9939, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 4.556969696969697, + "grad_norm": 0.0044780271127820015, + "learning_rate": 5.780035258623745e-05, + "loss": 0.01181648951023817, + "num_input_tokens_seen": 123131144, + "step": 7519, + "train_runtime": 61102.1065, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 4.557575757575758, + "grad_norm": 0.006853947415947914, + "learning_rate": 5.779085404347744e-05, + "loss": 0.01226215809583664, + "num_input_tokens_seen": 123147520, + "step": 7520, + "train_runtime": 61110.2193, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 4.558181818181819, + "grad_norm": 0.00557865621522069, + "learning_rate": 5.7781355212548904e-05, + "loss": 0.011690055951476097, + "num_input_tokens_seen": 123163896, + "step": 7521, + "train_runtime": 61118.3323, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 4.558787878787879, + "grad_norm": 0.006640324369072914, + "learning_rate": 5.77718560938032e-05, + "loss": 0.012518730014562607, + "num_input_tokens_seen": 123180272, + "step": 7522, + "train_runtime": 61126.4442, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 4.5593939393939396, + "grad_norm": 0.01195869967341423, + "learning_rate": 5.776235668759168e-05, + "loss": 0.014316074550151825, + "num_input_tokens_seen": 123196648, + "step": 7523, + "train_runtime": 61134.5525, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 4.5600000000000005, + "grad_norm": 0.00306536047719419, + "learning_rate": 5.775285699426569e-05, + "loss": 0.01094839908182621, + "num_input_tokens_seen": 123213024, + "step": 7524, + "train_runtime": 61142.6647, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 4.5606060606060606, + "grad_norm": 0.007349846884608269, + "learning_rate": 5.7743357014176624e-05, + "loss": 0.012609061785042286, + "num_input_tokens_seen": 123229400, + "step": 7525, + "train_runtime": 61150.7767, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 4.5612121212121215, + "grad_norm": 0.011159392073750496, + "learning_rate": 5.773385674767586e-05, + "loss": 0.01321688387542963, + "num_input_tokens_seen": 123245776, + "step": 7526, + "train_runtime": 61158.8887, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 4.5618181818181816, + "grad_norm": 0.008602376095950603, + "learning_rate": 5.7724356195114804e-05, + "loss": 0.012532188557088375, + "num_input_tokens_seen": 123262152, + "step": 7527, + "train_runtime": 61167.0031, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 4.5624242424242425, + "grad_norm": 0.00945155881345272, + "learning_rate": 5.7714855356844846e-05, + "loss": 0.011462806724011898, + "num_input_tokens_seen": 123278528, + "step": 7528, + "train_runtime": 61175.1186, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 4.5630303030303025, + "grad_norm": 0.006026644725352526, + "learning_rate": 5.7705354233217416e-05, + "loss": 0.011261316016316414, + "num_input_tokens_seen": 123294904, + "step": 7529, + "train_runtime": 61183.2309, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 4.5636363636363635, + "grad_norm": 0.0064174155704677105, + "learning_rate": 5.769585282458393e-05, + "loss": 0.012297688983380795, + "num_input_tokens_seen": 123311280, + "step": 7530, + "train_runtime": 61191.3417, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 4.564242424242424, + "grad_norm": 0.006841061171144247, + "learning_rate": 5.768635113129584e-05, + "loss": 0.012241207994520664, + "num_input_tokens_seen": 123327656, + "step": 7531, + "train_runtime": 61199.4556, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 4.5648484848484845, + "grad_norm": 0.005842931568622589, + "learning_rate": 5.76768491537046e-05, + "loss": 0.01110636256635189, + "num_input_tokens_seen": 123344032, + "step": 7532, + "train_runtime": 61207.5692, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 4.565454545454545, + "grad_norm": 0.0054770284332334995, + "learning_rate": 5.7667346892161645e-05, + "loss": 0.011620347388088703, + "num_input_tokens_seen": 123360408, + "step": 7533, + "train_runtime": 61215.6827, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 4.566060606060606, + "grad_norm": 0.007903819903731346, + "learning_rate": 5.7657844347018464e-05, + "loss": 0.012204878032207489, + "num_input_tokens_seen": 123376784, + "step": 7534, + "train_runtime": 61223.7936, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 4.566666666666666, + "grad_norm": 0.003952059894800186, + "learning_rate": 5.7648341518626524e-05, + "loss": 0.010347911156713963, + "num_input_tokens_seen": 123393160, + "step": 7535, + "train_runtime": 61231.9062, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 4.567272727272727, + "grad_norm": 0.009207941591739655, + "learning_rate": 5.763883840733736e-05, + "loss": 0.01295054703950882, + "num_input_tokens_seen": 123409536, + "step": 7536, + "train_runtime": 61240.0177, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 4.567878787878788, + "grad_norm": 0.006342846900224686, + "learning_rate": 5.762933501350242e-05, + "loss": 0.010783017612993717, + "num_input_tokens_seen": 123425912, + "step": 7537, + "train_runtime": 61248.1297, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 4.568484848484848, + "grad_norm": 0.006415924523025751, + "learning_rate": 5.761983133747322e-05, + "loss": 0.011763885617256165, + "num_input_tokens_seen": 123442288, + "step": 7538, + "train_runtime": 61256.2415, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 4.569090909090909, + "grad_norm": 0.005679226014763117, + "learning_rate": 5.761032737960131e-05, + "loss": 0.011079758405685425, + "num_input_tokens_seen": 123458664, + "step": 7539, + "train_runtime": 61264.3556, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 4.569696969696969, + "grad_norm": 0.013161004520952702, + "learning_rate": 5.7600823140238206e-05, + "loss": 0.013017905876040459, + "num_input_tokens_seen": 123475040, + "step": 7540, + "train_runtime": 61272.4706, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 4.57030303030303, + "grad_norm": 0.004125749692320824, + "learning_rate": 5.759131861973548e-05, + "loss": 0.011940286494791508, + "num_input_tokens_seen": 123491416, + "step": 7541, + "train_runtime": 61280.5842, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 4.570909090909091, + "grad_norm": 0.006007676478475332, + "learning_rate": 5.758181381844463e-05, + "loss": 0.012433272786438465, + "num_input_tokens_seen": 123507792, + "step": 7542, + "train_runtime": 61288.7005, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 4.571515151515151, + "grad_norm": 0.008022299967706203, + "learning_rate": 5.757230873671726e-05, + "loss": 0.012435559183359146, + "num_input_tokens_seen": 123524168, + "step": 7543, + "train_runtime": 61296.8155, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 4.572121212121212, + "grad_norm": 0.007116473745554686, + "learning_rate": 5.756280337490494e-05, + "loss": 0.011889253742992878, + "num_input_tokens_seen": 123540544, + "step": 7544, + "train_runtime": 61304.9298, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 4.572727272727272, + "grad_norm": 0.007369240280240774, + "learning_rate": 5.755329773335926e-05, + "loss": 0.011305497027933598, + "num_input_tokens_seen": 123556920, + "step": 7545, + "train_runtime": 61313.0425, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 4.573333333333333, + "grad_norm": 0.007066152989864349, + "learning_rate": 5.754379181243179e-05, + "loss": 0.011710061691701412, + "num_input_tokens_seen": 123573296, + "step": 7546, + "train_runtime": 61321.1559, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 4.573939393939394, + "grad_norm": 0.005575936753302813, + "learning_rate": 5.753428561247416e-05, + "loss": 0.012875554151833057, + "num_input_tokens_seen": 123589672, + "step": 7547, + "train_runtime": 61329.2688, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 4.574545454545454, + "grad_norm": 0.00880954135209322, + "learning_rate": 5.7524779133837966e-05, + "loss": 0.011560854502022266, + "num_input_tokens_seen": 123606048, + "step": 7548, + "train_runtime": 61337.3807, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 4.575151515151515, + "grad_norm": 0.004786048550158739, + "learning_rate": 5.751527237687486e-05, + "loss": 0.011696670204401016, + "num_input_tokens_seen": 123622424, + "step": 7549, + "train_runtime": 61345.4933, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 4.575757575757576, + "grad_norm": 0.006636836566030979, + "learning_rate": 5.7505765341936466e-05, + "loss": 0.012182692997157574, + "num_input_tokens_seen": 123638800, + "step": 7550, + "train_runtime": 61353.6064, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 4.576363636363636, + "grad_norm": 0.008410117588937283, + "learning_rate": 5.749625802937443e-05, + "loss": 0.013071565888822079, + "num_input_tokens_seen": 123655176, + "step": 7551, + "train_runtime": 61361.7192, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 4.576969696969697, + "grad_norm": 0.008039836771786213, + "learning_rate": 5.7486750439540395e-05, + "loss": 0.012802342884242535, + "num_input_tokens_seen": 123671552, + "step": 7552, + "train_runtime": 61369.8316, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 4.577575757575758, + "grad_norm": 0.006448084022849798, + "learning_rate": 5.747724257278605e-05, + "loss": 0.011646611616015434, + "num_input_tokens_seen": 123687928, + "step": 7553, + "train_runtime": 61377.9415, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 4.578181818181818, + "grad_norm": 0.005208458751440048, + "learning_rate": 5.7467734429463063e-05, + "loss": 0.012276512570679188, + "num_input_tokens_seen": 123704304, + "step": 7554, + "train_runtime": 61386.0545, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 4.578787878787879, + "grad_norm": 0.0077039506286382675, + "learning_rate": 5.745822600992312e-05, + "loss": 0.013511152938008308, + "num_input_tokens_seen": 123720680, + "step": 7555, + "train_runtime": 61394.1669, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 4.579393939393939, + "grad_norm": 0.006642493885010481, + "learning_rate": 5.7448717314517907e-05, + "loss": 0.012238163501024246, + "num_input_tokens_seen": 123737056, + "step": 7556, + "train_runtime": 61402.2797, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 4.58, + "grad_norm": 0.004733848385512829, + "learning_rate": 5.743920834359917e-05, + "loss": 0.011704827658832073, + "num_input_tokens_seen": 123753432, + "step": 7557, + "train_runtime": 61410.3925, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 4.58060606060606, + "grad_norm": 0.007169488351792097, + "learning_rate": 5.7429699097518585e-05, + "loss": 0.012581219896674156, + "num_input_tokens_seen": 123769808, + "step": 7558, + "train_runtime": 61418.5035, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.581212121212121, + "grad_norm": 0.004842781461775303, + "learning_rate": 5.7420189576627924e-05, + "loss": 0.011588436551392078, + "num_input_tokens_seen": 123786184, + "step": 7559, + "train_runtime": 61426.6127, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.581818181818182, + "grad_norm": 0.007978571578860283, + "learning_rate": 5.741067978127889e-05, + "loss": 0.012637092731893063, + "num_input_tokens_seen": 123802560, + "step": 7560, + "train_runtime": 61434.7304, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.582424242424242, + "grad_norm": 0.00700564356520772, + "learning_rate": 5.740116971182322e-05, + "loss": 0.011999641545116901, + "num_input_tokens_seen": 123818936, + "step": 7561, + "train_runtime": 61442.8445, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.583030303030303, + "grad_norm": 0.010528091341257095, + "learning_rate": 5.7391659368612715e-05, + "loss": 0.011948628351092339, + "num_input_tokens_seen": 123835312, + "step": 7562, + "train_runtime": 61450.9561, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.583636363636364, + "grad_norm": 0.008890559896826744, + "learning_rate": 5.738214875199912e-05, + "loss": 0.012734299525618553, + "num_input_tokens_seen": 123851688, + "step": 7563, + "train_runtime": 61459.0684, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.584242424242424, + "grad_norm": 0.009525255300104618, + "learning_rate": 5.737263786233423e-05, + "loss": 0.011977331712841988, + "num_input_tokens_seen": 123868064, + "step": 7564, + "train_runtime": 61467.1828, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.584848484848485, + "grad_norm": 0.00434644240885973, + "learning_rate": 5.736312669996982e-05, + "loss": 0.013374016620218754, + "num_input_tokens_seen": 123884440, + "step": 7565, + "train_runtime": 61475.2996, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.585454545454546, + "grad_norm": 0.009756667539477348, + "learning_rate": 5.735361526525768e-05, + "loss": 0.010512007400393486, + "num_input_tokens_seen": 123900816, + "step": 7566, + "train_runtime": 61483.4128, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.586060606060606, + "grad_norm": 0.006827359087765217, + "learning_rate": 5.734410355854963e-05, + "loss": 0.012279028072953224, + "num_input_tokens_seen": 123917192, + "step": 7567, + "train_runtime": 61491.5324, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.586666666666667, + "grad_norm": 0.007012124173343182, + "learning_rate": 5.733459158019752e-05, + "loss": 0.012375103309750557, + "num_input_tokens_seen": 123933568, + "step": 7568, + "train_runtime": 61499.6426, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.587272727272727, + "grad_norm": 0.007851937785744667, + "learning_rate": 5.732507933055311e-05, + "loss": 0.011330787092447281, + "num_input_tokens_seen": 123949944, + "step": 7569, + "train_runtime": 61507.7542, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.587878787878788, + "grad_norm": 0.007283366750925779, + "learning_rate": 5.7315566809968314e-05, + "loss": 0.012636571191251278, + "num_input_tokens_seen": 123966320, + "step": 7570, + "train_runtime": 61515.8663, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.588484848484849, + "grad_norm": 0.0095520606264472, + "learning_rate": 5.730605401879492e-05, + "loss": 0.012906312942504883, + "num_input_tokens_seen": 123982696, + "step": 7571, + "train_runtime": 61523.9788, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.589090909090909, + "grad_norm": 0.004999467637389898, + "learning_rate": 5.729654095738484e-05, + "loss": 0.010737591423094273, + "num_input_tokens_seen": 123999072, + "step": 7572, + "train_runtime": 61532.093, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.58969696969697, + "grad_norm": 0.005521997809410095, + "learning_rate": 5.728702762608991e-05, + "loss": 0.011559804901480675, + "num_input_tokens_seen": 124015448, + "step": 7573, + "train_runtime": 61540.2044, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.59030303030303, + "grad_norm": 0.005899071227759123, + "learning_rate": 5.7277514025262026e-05, + "loss": 0.013387801125645638, + "num_input_tokens_seen": 124031824, + "step": 7574, + "train_runtime": 61548.3161, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.590909090909091, + "grad_norm": 0.007969418540596962, + "learning_rate": 5.726800015525304e-05, + "loss": 0.011740542016923428, + "num_input_tokens_seen": 124048200, + "step": 7575, + "train_runtime": 61556.4296, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.591515151515152, + "grad_norm": 0.004577333573251963, + "learning_rate": 5.725848601641491e-05, + "loss": 0.012206533923745155, + "num_input_tokens_seen": 124064576, + "step": 7576, + "train_runtime": 61564.543, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.592121212121212, + "grad_norm": 0.0032349741086363792, + "learning_rate": 5.724897160909952e-05, + "loss": 0.011748086661100388, + "num_input_tokens_seen": 124080952, + "step": 7577, + "train_runtime": 61572.6556, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.592727272727273, + "grad_norm": 0.009078282862901688, + "learning_rate": 5.723945693365879e-05, + "loss": 0.012471905909478664, + "num_input_tokens_seen": 124097328, + "step": 7578, + "train_runtime": 61580.7676, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.593333333333334, + "grad_norm": 0.007438721135258675, + "learning_rate": 5.722994199044462e-05, + "loss": 0.011527849361300468, + "num_input_tokens_seen": 124113704, + "step": 7579, + "train_runtime": 61588.8829, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.593939393939394, + "grad_norm": 0.009730101563036442, + "learning_rate": 5.7220426779809e-05, + "loss": 0.010931817814707756, + "num_input_tokens_seen": 124130080, + "step": 7580, + "train_runtime": 61596.9962, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.594545454545455, + "grad_norm": 0.00578496465459466, + "learning_rate": 5.721091130210386e-05, + "loss": 0.011340816505253315, + "num_input_tokens_seen": 124146456, + "step": 7581, + "train_runtime": 61605.1084, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.595151515151515, + "grad_norm": 0.005467939656227827, + "learning_rate": 5.7201395557681136e-05, + "loss": 0.012916011735796928, + "num_input_tokens_seen": 124162832, + "step": 7582, + "train_runtime": 61613.2202, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.595757575757576, + "grad_norm": 0.006117896176874638, + "learning_rate": 5.719187954689281e-05, + "loss": 0.012181630358099937, + "num_input_tokens_seen": 124179208, + "step": 7583, + "train_runtime": 61621.3341, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.596363636363637, + "grad_norm": 0.008080813102424145, + "learning_rate": 5.718236327009088e-05, + "loss": 0.0112070944160223, + "num_input_tokens_seen": 124195584, + "step": 7584, + "train_runtime": 61629.4486, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.596969696969697, + "grad_norm": 0.022676199674606323, + "learning_rate": 5.7172846727627304e-05, + "loss": 0.013078692369163036, + "num_input_tokens_seen": 124211960, + "step": 7585, + "train_runtime": 61637.56, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.597575757575758, + "grad_norm": 0.016419341787695885, + "learning_rate": 5.716332991985411e-05, + "loss": 0.012161212973296642, + "num_input_tokens_seen": 124228336, + "step": 7586, + "train_runtime": 61645.6728, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.598181818181818, + "grad_norm": 0.00868157111108303, + "learning_rate": 5.715381284712329e-05, + "loss": 0.012679677456617355, + "num_input_tokens_seen": 124244712, + "step": 7587, + "train_runtime": 61653.789, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.598787878787879, + "grad_norm": 0.00493884040042758, + "learning_rate": 5.714429550978686e-05, + "loss": 0.011592620983719826, + "num_input_tokens_seen": 124261088, + "step": 7588, + "train_runtime": 61661.9009, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.59939393939394, + "grad_norm": 0.0035126456059515476, + "learning_rate": 5.7134777908196855e-05, + "loss": 0.011506317183375359, + "num_input_tokens_seen": 124277464, + "step": 7589, + "train_runtime": 61670.0108, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.6, + "grad_norm": 0.010686330497264862, + "learning_rate": 5.71252600427053e-05, + "loss": 0.01231832429766655, + "num_input_tokens_seen": 124293840, + "step": 7590, + "train_runtime": 61678.1307, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.600606060606061, + "grad_norm": 0.009180238470435143, + "learning_rate": 5.7115741913664264e-05, + "loss": 0.013276875950396061, + "num_input_tokens_seen": 124310216, + "step": 7591, + "train_runtime": 61686.2425, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.6012121212121215, + "grad_norm": 0.008844691328704357, + "learning_rate": 5.7106223521425786e-05, + "loss": 0.011884505860507488, + "num_input_tokens_seen": 124326592, + "step": 7592, + "train_runtime": 61694.3517, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.601818181818182, + "grad_norm": 0.008017466403543949, + "learning_rate": 5.709670486634194e-05, + "loss": 0.011739242821931839, + "num_input_tokens_seen": 124342968, + "step": 7593, + "train_runtime": 61702.4635, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.6024242424242425, + "grad_norm": 0.008123587816953659, + "learning_rate": 5.70871859487648e-05, + "loss": 0.01307905837893486, + "num_input_tokens_seen": 124359344, + "step": 7594, + "train_runtime": 61710.5759, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.6030303030303035, + "grad_norm": 0.009568660520017147, + "learning_rate": 5.707766676904646e-05, + "loss": 0.010958974249660969, + "num_input_tokens_seen": 124375720, + "step": 7595, + "train_runtime": 61718.6879, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.6036363636363635, + "grad_norm": 0.004283589776605368, + "learning_rate": 5.7068147327539e-05, + "loss": 0.012463541701436043, + "num_input_tokens_seen": 124392096, + "step": 7596, + "train_runtime": 61726.7974, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.6042424242424245, + "grad_norm": 0.007651514373719692, + "learning_rate": 5.705862762459454e-05, + "loss": 0.012861331924796104, + "num_input_tokens_seen": 124408472, + "step": 7597, + "train_runtime": 61734.9086, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 4.6048484848484845, + "grad_norm": 0.006102906074374914, + "learning_rate": 5.7049107660565185e-05, + "loss": 0.011348439380526543, + "num_input_tokens_seen": 124424848, + "step": 7598, + "train_runtime": 61743.0316, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 4.6054545454545455, + "grad_norm": 0.005809375550597906, + "learning_rate": 5.7039587435803066e-05, + "loss": 0.01151482667773962, + "num_input_tokens_seen": 124441224, + "step": 7599, + "train_runtime": 61751.1432, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 4.606060606060606, + "grad_norm": 0.005627037957310677, + "learning_rate": 5.7030066950660335e-05, + "loss": 0.011964669451117516, + "num_input_tokens_seen": 124457600, + "step": 7600, + "train_runtime": 61759.2546, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 4.6066666666666665, + "grad_norm": 0.007520811166614294, + "learning_rate": 5.70205462054891e-05, + "loss": 0.012143184430897236, + "num_input_tokens_seen": 124473976, + "step": 7601, + "train_runtime": 61768.4341, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 4.607272727272727, + "grad_norm": 0.0072519853711128235, + "learning_rate": 5.7011025200641534e-05, + "loss": 0.012625779025256634, + "num_input_tokens_seen": 124490352, + "step": 7602, + "train_runtime": 61776.5429, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 4.6078787878787875, + "grad_norm": 0.005353286396712065, + "learning_rate": 5.70015039364698e-05, + "loss": 0.011674061417579651, + "num_input_tokens_seen": 124506728, + "step": 7603, + "train_runtime": 61784.6511, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 4.608484848484848, + "grad_norm": 0.004371905233711004, + "learning_rate": 5.6991982413326074e-05, + "loss": 0.011711842380464077, + "num_input_tokens_seen": 124523104, + "step": 7604, + "train_runtime": 61792.7597, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 4.609090909090909, + "grad_norm": 0.007317705079913139, + "learning_rate": 5.698246063156253e-05, + "loss": 0.011515907943248749, + "num_input_tokens_seen": 124539480, + "step": 7605, + "train_runtime": 61800.8807, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 4.609696969696969, + "grad_norm": 0.007294884417206049, + "learning_rate": 5.6972938591531376e-05, + "loss": 0.012360481545329094, + "num_input_tokens_seen": 124555856, + "step": 7606, + "train_runtime": 61808.9945, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 4.61030303030303, + "grad_norm": 0.00488508865237236, + "learning_rate": 5.696341629358478e-05, + "loss": 0.011719152331352234, + "num_input_tokens_seen": 124572232, + "step": 7607, + "train_runtime": 61817.1064, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 4.610909090909091, + "grad_norm": 0.0062965820543468, + "learning_rate": 5.695389373807499e-05, + "loss": 0.012021880596876144, + "num_input_tokens_seen": 124588608, + "step": 7608, + "train_runtime": 61825.2192, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 4.611515151515151, + "grad_norm": 0.0052826595492661, + "learning_rate": 5.6944370925354216e-05, + "loss": 0.012614961713552475, + "num_input_tokens_seen": 124604984, + "step": 7609, + "train_runtime": 61833.3317, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 4.612121212121212, + "grad_norm": 0.006745902355760336, + "learning_rate": 5.6934847855774684e-05, + "loss": 0.012292834930121899, + "num_input_tokens_seen": 124621360, + "step": 7610, + "train_runtime": 61841.4445, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 4.612727272727272, + "grad_norm": 0.0075298696756362915, + "learning_rate": 5.6925324529688626e-05, + "loss": 0.011930147185921669, + "num_input_tokens_seen": 124637736, + "step": 7611, + "train_runtime": 61849.553, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 4.613333333333333, + "grad_norm": 0.0036476643290370703, + "learning_rate": 5.691580094744832e-05, + "loss": 0.01082488615065813, + "num_input_tokens_seen": 124654112, + "step": 7612, + "train_runtime": 61857.6631, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 4.613939393939394, + "grad_norm": 0.005621652118861675, + "learning_rate": 5.690627710940599e-05, + "loss": 0.011921815574169159, + "num_input_tokens_seen": 124670488, + "step": 7613, + "train_runtime": 61865.7783, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 4.614545454545454, + "grad_norm": 0.006272017024457455, + "learning_rate": 5.689675301591392e-05, + "loss": 0.012271617539227009, + "num_input_tokens_seen": 124686864, + "step": 7614, + "train_runtime": 61873.8878, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 4.615151515151515, + "grad_norm": 0.021431801840662956, + "learning_rate": 5.688722866732438e-05, + "loss": 0.014209000393748283, + "num_input_tokens_seen": 124703240, + "step": 7615, + "train_runtime": 61881.9983, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 4.615757575757575, + "grad_norm": 0.008496491238474846, + "learning_rate": 5.687770406398967e-05, + "loss": 0.011690196581184864, + "num_input_tokens_seen": 124719616, + "step": 7616, + "train_runtime": 61890.1131, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 4.616363636363636, + "grad_norm": 0.007162962108850479, + "learning_rate": 5.686817920626207e-05, + "loss": 0.01188428234308958, + "num_input_tokens_seen": 124735992, + "step": 7617, + "train_runtime": 61898.2301, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 4.616969696969697, + "grad_norm": 0.009311377070844173, + "learning_rate": 5.6858654094493924e-05, + "loss": 0.011711744591593742, + "num_input_tokens_seen": 124752368, + "step": 7618, + "train_runtime": 61906.343, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 4.617575757575757, + "grad_norm": 0.00515404250472784, + "learning_rate": 5.684912872903749e-05, + "loss": 0.012737035751342773, + "num_input_tokens_seen": 124768744, + "step": 7619, + "train_runtime": 61914.4523, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 4.618181818181818, + "grad_norm": 0.004094431642442942, + "learning_rate": 5.6839603110245145e-05, + "loss": 0.011590168811380863, + "num_input_tokens_seen": 124785120, + "step": 7620, + "train_runtime": 61922.5653, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 4.618787878787879, + "grad_norm": 0.009279416874051094, + "learning_rate": 5.6830077238469184e-05, + "loss": 0.010926742106676102, + "num_input_tokens_seen": 124801496, + "step": 7621, + "train_runtime": 61930.6785, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 4.619393939393939, + "grad_norm": 0.007099403999745846, + "learning_rate": 5.682055111406198e-05, + "loss": 0.011119210161268711, + "num_input_tokens_seen": 124817872, + "step": 7622, + "train_runtime": 61938.7911, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 4.62, + "grad_norm": 0.00628677848726511, + "learning_rate": 5.681102473737587e-05, + "loss": 0.012258189730346203, + "num_input_tokens_seen": 124834248, + "step": 7623, + "train_runtime": 61946.9046, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 4.620606060606061, + "grad_norm": 0.009792282246053219, + "learning_rate": 5.680149810876322e-05, + "loss": 0.011850590817630291, + "num_input_tokens_seen": 124850624, + "step": 7624, + "train_runtime": 61955.0189, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 4.621212121212121, + "grad_norm": 0.010142377577722073, + "learning_rate": 5.679197122857638e-05, + "loss": 0.013785515911877155, + "num_input_tokens_seen": 124867000, + "step": 7625, + "train_runtime": 61963.1314, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 4.621818181818182, + "grad_norm": 0.007103605195879936, + "learning_rate": 5.678244409716778e-05, + "loss": 0.012177158147096634, + "num_input_tokens_seen": 124883376, + "step": 7626, + "train_runtime": 61971.2404, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 4.622424242424242, + "grad_norm": 0.006521238014101982, + "learning_rate": 5.677291671488977e-05, + "loss": 0.010466193780303001, + "num_input_tokens_seen": 124899752, + "step": 7627, + "train_runtime": 61979.3551, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 4.623030303030303, + "grad_norm": 0.005035921465605497, + "learning_rate": 5.6763389082094754e-05, + "loss": 0.011477330699563026, + "num_input_tokens_seen": 124916128, + "step": 7628, + "train_runtime": 61987.4681, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 4.623636363636364, + "grad_norm": 0.007608153857290745, + "learning_rate": 5.675386119913516e-05, + "loss": 0.012142511084675789, + "num_input_tokens_seen": 124932504, + "step": 7629, + "train_runtime": 61995.584, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 4.624242424242424, + "grad_norm": 0.006784569937735796, + "learning_rate": 5.674433306636337e-05, + "loss": 0.011965325102210045, + "num_input_tokens_seen": 124948880, + "step": 7630, + "train_runtime": 62003.6984, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 4.624848484848485, + "grad_norm": 0.0037856209091842175, + "learning_rate": 5.673480468413186e-05, + "loss": 0.011286279186606407, + "num_input_tokens_seen": 124965256, + "step": 7631, + "train_runtime": 62011.8125, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 4.625454545454545, + "grad_norm": 0.0043901256285607815, + "learning_rate": 5.672527605279303e-05, + "loss": 0.011589082889258862, + "num_input_tokens_seen": 124981632, + "step": 7632, + "train_runtime": 62019.9293, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 4.626060606060606, + "grad_norm": 0.007125362288206816, + "learning_rate": 5.671574717269933e-05, + "loss": 0.01224651001393795, + "num_input_tokens_seen": 124998008, + "step": 7633, + "train_runtime": 62028.0406, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 4.626666666666667, + "grad_norm": 0.0065633757039904594, + "learning_rate": 5.670621804420322e-05, + "loss": 0.013003852218389511, + "num_input_tokens_seen": 125014384, + "step": 7634, + "train_runtime": 62036.15, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 4.627272727272727, + "grad_norm": 0.007947299629449844, + "learning_rate": 5.669668866765717e-05, + "loss": 0.01231528539210558, + "num_input_tokens_seen": 125030760, + "step": 7635, + "train_runtime": 62044.2651, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 4.627878787878788, + "grad_norm": 0.005645785015076399, + "learning_rate": 5.668715904341365e-05, + "loss": 0.013138765469193459, + "num_input_tokens_seen": 125047136, + "step": 7636, + "train_runtime": 62052.3775, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 4.628484848484849, + "grad_norm": 0.008948879316449165, + "learning_rate": 5.667762917182513e-05, + "loss": 0.012706336565315723, + "num_input_tokens_seen": 125063512, + "step": 7637, + "train_runtime": 62060.488, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 4.629090909090909, + "grad_norm": 0.00908003468066454, + "learning_rate": 5.6668099053244116e-05, + "loss": 0.012634946964681149, + "num_input_tokens_seen": 125079888, + "step": 7638, + "train_runtime": 62068.5999, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.62969696969697, + "grad_norm": 0.0057378485798835754, + "learning_rate": 5.6658568688023104e-05, + "loss": 0.011716208420693874, + "num_input_tokens_seen": 125096264, + "step": 7639, + "train_runtime": 62076.7125, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.63030303030303, + "grad_norm": 0.0065591000020504, + "learning_rate": 5.664903807651459e-05, + "loss": 0.012607289478182793, + "num_input_tokens_seen": 125112640, + "step": 7640, + "train_runtime": 62084.8289, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.630909090909091, + "grad_norm": 0.0039438605308532715, + "learning_rate": 5.6639507219071144e-05, + "loss": 0.011594616807997227, + "num_input_tokens_seen": 125129016, + "step": 7641, + "train_runtime": 62092.9411, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.631515151515152, + "grad_norm": 0.005663083866238594, + "learning_rate": 5.6629976116045214e-05, + "loss": 0.011718695051968098, + "num_input_tokens_seen": 125145392, + "step": 7642, + "train_runtime": 62101.0551, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.632121212121212, + "grad_norm": 0.009910519234836102, + "learning_rate": 5.662044476778941e-05, + "loss": 0.011981477960944176, + "num_input_tokens_seen": 125161768, + "step": 7643, + "train_runtime": 62109.1641, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.632727272727273, + "grad_norm": 0.006463565398007631, + "learning_rate": 5.661091317465622e-05, + "loss": 0.012182418256998062, + "num_input_tokens_seen": 125178144, + "step": 7644, + "train_runtime": 62117.276, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.633333333333333, + "grad_norm": 0.009646481834352016, + "learning_rate": 5.660138133699825e-05, + "loss": 0.012147952802479267, + "num_input_tokens_seen": 125194520, + "step": 7645, + "train_runtime": 62125.389, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.633939393939394, + "grad_norm": 0.00650237500667572, + "learning_rate": 5.6591849255168015e-05, + "loss": 0.01135533582419157, + "num_input_tokens_seen": 125210896, + "step": 7646, + "train_runtime": 62133.5029, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.634545454545455, + "grad_norm": 0.005203657783567905, + "learning_rate": 5.658231692951813e-05, + "loss": 0.011056071147322655, + "num_input_tokens_seen": 125227272, + "step": 7647, + "train_runtime": 62141.6133, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.635151515151515, + "grad_norm": 0.006181515287607908, + "learning_rate": 5.657278436040115e-05, + "loss": 0.012019513174891472, + "num_input_tokens_seen": 125243648, + "step": 7648, + "train_runtime": 62149.7305, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.635757575757576, + "grad_norm": 0.006251920014619827, + "learning_rate": 5.656325154816969e-05, + "loss": 0.011626752093434334, + "num_input_tokens_seen": 125260024, + "step": 7649, + "train_runtime": 62157.8414, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.636363636363637, + "grad_norm": 0.006270985119044781, + "learning_rate": 5.6553718493176344e-05, + "loss": 0.011106343939900398, + "num_input_tokens_seen": 125276400, + "step": 7650, + "train_runtime": 62165.9536, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.636969696969697, + "grad_norm": 0.009044978767633438, + "learning_rate": 5.6544185195773694e-05, + "loss": 0.013336820527911186, + "num_input_tokens_seen": 125292776, + "step": 7651, + "train_runtime": 62174.0659, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.637575757575758, + "grad_norm": 0.005039642099291086, + "learning_rate": 5.6534651656314384e-05, + "loss": 0.012206834740936756, + "num_input_tokens_seen": 125309152, + "step": 7652, + "train_runtime": 62182.1766, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.638181818181819, + "grad_norm": 0.005801711697131395, + "learning_rate": 5.6525117875151024e-05, + "loss": 0.011533577926456928, + "num_input_tokens_seen": 125325528, + "step": 7653, + "train_runtime": 62190.2903, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.638787878787879, + "grad_norm": 0.008097313344478607, + "learning_rate": 5.651558385263628e-05, + "loss": 0.012026939541101456, + "num_input_tokens_seen": 125341904, + "step": 7654, + "train_runtime": 62198.4009, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.63939393939394, + "grad_norm": 0.004161168821156025, + "learning_rate": 5.650604958912277e-05, + "loss": 0.011938352137804031, + "num_input_tokens_seen": 125358280, + "step": 7655, + "train_runtime": 62206.5114, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.64, + "grad_norm": 0.00990255270153284, + "learning_rate": 5.6496515084963156e-05, + "loss": 0.012237548828125, + "num_input_tokens_seen": 125374656, + "step": 7656, + "train_runtime": 62214.6178, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.640606060606061, + "grad_norm": 0.0074542551301419735, + "learning_rate": 5.6486980340510086e-05, + "loss": 0.010302173905074596, + "num_input_tokens_seen": 125391032, + "step": 7657, + "train_runtime": 62222.7322, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.641212121212122, + "grad_norm": 0.0023812619037926197, + "learning_rate": 5.6477445356116265e-05, + "loss": 0.011442835442721844, + "num_input_tokens_seen": 125407408, + "step": 7658, + "train_runtime": 62230.8436, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.641818181818182, + "grad_norm": 0.0057721794582903385, + "learning_rate": 5.646791013213435e-05, + "loss": 0.011488020420074463, + "num_input_tokens_seen": 125423784, + "step": 7659, + "train_runtime": 62238.9534, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.642424242424243, + "grad_norm": 0.019111162051558495, + "learning_rate": 5.645837466891703e-05, + "loss": 0.0126451151445508, + "num_input_tokens_seen": 125440160, + "step": 7660, + "train_runtime": 62247.0698, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.643030303030303, + "grad_norm": 0.004640480503439903, + "learning_rate": 5.644883896681701e-05, + "loss": 0.010987820103764534, + "num_input_tokens_seen": 125456536, + "step": 7661, + "train_runtime": 62255.1833, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.643636363636364, + "grad_norm": 0.006884921807795763, + "learning_rate": 5.643930302618701e-05, + "loss": 0.012829242274165154, + "num_input_tokens_seen": 125472912, + "step": 7662, + "train_runtime": 62263.2949, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.6442424242424245, + "grad_norm": 0.006314435508102179, + "learning_rate": 5.642976684737971e-05, + "loss": 0.012324531562626362, + "num_input_tokens_seen": 125489288, + "step": 7663, + "train_runtime": 62271.4045, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.644848484848485, + "grad_norm": 0.00463859923183918, + "learning_rate": 5.6420230430747866e-05, + "loss": 0.011510903015732765, + "num_input_tokens_seen": 125505664, + "step": 7664, + "train_runtime": 62279.5186, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.6454545454545455, + "grad_norm": 0.0054571256041526794, + "learning_rate": 5.64106937766442e-05, + "loss": 0.011102309450507164, + "num_input_tokens_seen": 125522040, + "step": 7665, + "train_runtime": 62287.6349, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.6460606060606064, + "grad_norm": 0.005168710369616747, + "learning_rate": 5.640115688542145e-05, + "loss": 0.01277313195168972, + "num_input_tokens_seen": 125538416, + "step": 7666, + "train_runtime": 62295.749, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.6466666666666665, + "grad_norm": 0.011695913039147854, + "learning_rate": 5.639161975743237e-05, + "loss": 0.013208121061325073, + "num_input_tokens_seen": 125554792, + "step": 7667, + "train_runtime": 62303.8611, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.647272727272727, + "grad_norm": 0.008692912757396698, + "learning_rate": 5.6382082393029746e-05, + "loss": 0.011568975634872913, + "num_input_tokens_seen": 125571168, + "step": 7668, + "train_runtime": 62311.9749, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.6478787878787875, + "grad_norm": 0.006611556280404329, + "learning_rate": 5.637254479256629e-05, + "loss": 0.011798643507063389, + "num_input_tokens_seen": 125587544, + "step": 7669, + "train_runtime": 62320.0876, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.648484848484848, + "grad_norm": 0.005717176012694836, + "learning_rate": 5.636300695639484e-05, + "loss": 0.012378595769405365, + "num_input_tokens_seen": 125603920, + "step": 7670, + "train_runtime": 62328.1982, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.649090909090909, + "grad_norm": 0.003923137206584215, + "learning_rate": 5.635346888486814e-05, + "loss": 0.011106952093541622, + "num_input_tokens_seen": 125620296, + "step": 7671, + "train_runtime": 62336.3084, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.649696969696969, + "grad_norm": 0.008024441078305244, + "learning_rate": 5.634393057833899e-05, + "loss": 0.011520912870764732, + "num_input_tokens_seen": 125636672, + "step": 7672, + "train_runtime": 62344.4203, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.65030303030303, + "grad_norm": 0.005509206093847752, + "learning_rate": 5.633439203716022e-05, + "loss": 0.0116068534553051, + "num_input_tokens_seen": 125653048, + "step": 7673, + "train_runtime": 62352.533, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.65090909090909, + "grad_norm": 0.0019134001340717077, + "learning_rate": 5.632485326168462e-05, + "loss": 0.011170334182679653, + "num_input_tokens_seen": 125669424, + "step": 7674, + "train_runtime": 62360.644, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.651515151515151, + "grad_norm": 0.011636337265372276, + "learning_rate": 5.6315314252265e-05, + "loss": 0.013255230151116848, + "num_input_tokens_seen": 125685800, + "step": 7675, + "train_runtime": 62368.7574, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.652121212121212, + "grad_norm": 0.005493410862982273, + "learning_rate": 5.6305775009254225e-05, + "loss": 0.012499762699007988, + "num_input_tokens_seen": 125702176, + "step": 7676, + "train_runtime": 62376.8698, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 4.652727272727272, + "grad_norm": 0.008104773238301277, + "learning_rate": 5.6296235533005105e-05, + "loss": 0.012191923335194588, + "num_input_tokens_seen": 125718552, + "step": 7677, + "train_runtime": 62384.9813, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 4.653333333333333, + "grad_norm": 0.010634840466082096, + "learning_rate": 5.6286695823870497e-05, + "loss": 0.013148360885679722, + "num_input_tokens_seen": 125734928, + "step": 7678, + "train_runtime": 62393.0915, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 4.653939393939394, + "grad_norm": 0.007913218811154366, + "learning_rate": 5.627715588220325e-05, + "loss": 0.012802905403077602, + "num_input_tokens_seen": 125751304, + "step": 7679, + "train_runtime": 62401.2024, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 4.654545454545454, + "grad_norm": 0.007733066100627184, + "learning_rate": 5.6267615708356216e-05, + "loss": 0.011036624200642109, + "num_input_tokens_seen": 125767680, + "step": 7680, + "train_runtime": 62409.3128, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 4.655151515151515, + "grad_norm": 0.00851327646523714, + "learning_rate": 5.62580753026823e-05, + "loss": 0.013818138279020786, + "num_input_tokens_seen": 125784056, + "step": 7681, + "train_runtime": 62417.4202, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 4.655757575757576, + "grad_norm": 0.0049270824529230595, + "learning_rate": 5.624853466553437e-05, + "loss": 0.011906759813427925, + "num_input_tokens_seen": 125800432, + "step": 7682, + "train_runtime": 62425.5315, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 4.656363636363636, + "grad_norm": 0.006754441652446985, + "learning_rate": 5.6238993797265305e-05, + "loss": 0.012212786823511124, + "num_input_tokens_seen": 125816808, + "step": 7683, + "train_runtime": 62433.6442, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 4.656969696969697, + "grad_norm": 0.005301730707287788, + "learning_rate": 5.622945269822799e-05, + "loss": 0.012947436422109604, + "num_input_tokens_seen": 125833184, + "step": 7684, + "train_runtime": 62441.7524, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 4.657575757575757, + "grad_norm": 0.003452088451012969, + "learning_rate": 5.621991136877538e-05, + "loss": 0.011605787090957165, + "num_input_tokens_seen": 125849560, + "step": 7685, + "train_runtime": 62449.8647, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 4.658181818181818, + "grad_norm": 0.008070512674748898, + "learning_rate": 5.6210369809260355e-05, + "loss": 0.012263590469956398, + "num_input_tokens_seen": 125865936, + "step": 7686, + "train_runtime": 62457.9788, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 4.658787878787879, + "grad_norm": 0.005163109861314297, + "learning_rate": 5.6200828020035835e-05, + "loss": 0.01280882302671671, + "num_input_tokens_seen": 125882312, + "step": 7687, + "train_runtime": 62466.0926, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 4.659393939393939, + "grad_norm": 0.005203522741794586, + "learning_rate": 5.6191286001454756e-05, + "loss": 0.011088373139500618, + "num_input_tokens_seen": 125898688, + "step": 7688, + "train_runtime": 62474.2032, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 4.66, + "grad_norm": 0.006904542911797762, + "learning_rate": 5.6181743753870086e-05, + "loss": 0.012309123761951923, + "num_input_tokens_seen": 125915064, + "step": 7689, + "train_runtime": 62482.3137, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 4.66060606060606, + "grad_norm": 0.010707304812967777, + "learning_rate": 5.617220127763474e-05, + "loss": 0.012153606861829758, + "num_input_tokens_seen": 125931440, + "step": 7690, + "train_runtime": 62490.4306, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 4.661212121212121, + "grad_norm": 0.008687201887369156, + "learning_rate": 5.61626585731017e-05, + "loss": 0.013051153160631657, + "num_input_tokens_seen": 125947816, + "step": 7691, + "train_runtime": 62498.5443, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 4.661818181818182, + "grad_norm": 0.0066212196834385395, + "learning_rate": 5.61531156406239e-05, + "loss": 0.012061841785907745, + "num_input_tokens_seen": 125964192, + "step": 7692, + "train_runtime": 62506.6547, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.662424242424242, + "grad_norm": 0.0028983724769204855, + "learning_rate": 5.614357248055435e-05, + "loss": 0.010881771333515644, + "num_input_tokens_seen": 125980568, + "step": 7693, + "train_runtime": 62514.7666, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.663030303030303, + "grad_norm": 0.007794314529746771, + "learning_rate": 5.613402909324602e-05, + "loss": 0.011921718716621399, + "num_input_tokens_seen": 125996944, + "step": 7694, + "train_runtime": 62522.8799, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.663636363636364, + "grad_norm": 0.005676658358424902, + "learning_rate": 5.612448547905189e-05, + "loss": 0.012300565838813782, + "num_input_tokens_seen": 126013320, + "step": 7695, + "train_runtime": 62530.9899, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.664242424242424, + "grad_norm": 0.005331929307430983, + "learning_rate": 5.6114941638324984e-05, + "loss": 0.012403048574924469, + "num_input_tokens_seen": 126029696, + "step": 7696, + "train_runtime": 62539.0992, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.664848484848485, + "grad_norm": 0.009708977304399014, + "learning_rate": 5.610539757141827e-05, + "loss": 0.012623783200979233, + "num_input_tokens_seen": 126046072, + "step": 7697, + "train_runtime": 62547.2106, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.665454545454545, + "grad_norm": 0.008059236221015453, + "learning_rate": 5.609585327868481e-05, + "loss": 0.012339089997112751, + "num_input_tokens_seen": 126062448, + "step": 7698, + "train_runtime": 62555.3316, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.666060606060606, + "grad_norm": 0.0070078568533062935, + "learning_rate": 5.608630876047759e-05, + "loss": 0.011981841176748276, + "num_input_tokens_seen": 126078824, + "step": 7699, + "train_runtime": 62563.4405, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.666666666666667, + "grad_norm": 0.012355168350040913, + "learning_rate": 5.60767640171497e-05, + "loss": 0.012825828976929188, + "num_input_tokens_seen": 126095200, + "step": 7700, + "train_runtime": 62571.5506, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.667272727272727, + "grad_norm": 0.0055010938085615635, + "learning_rate": 5.60672190490541e-05, + "loss": 0.012018473818898201, + "num_input_tokens_seen": 126111576, + "step": 7701, + "train_runtime": 62580.6618, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 4.667878787878788, + "grad_norm": 0.005402136128395796, + "learning_rate": 5.6057673856543904e-05, + "loss": 0.012098276056349277, + "num_input_tokens_seen": 126127952, + "step": 7702, + "train_runtime": 62588.7693, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 4.668484848484848, + "grad_norm": 0.003978177439421415, + "learning_rate": 5.6048128439972135e-05, + "loss": 0.012030459940433502, + "num_input_tokens_seen": 126144328, + "step": 7703, + "train_runtime": 62596.8794, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 4.669090909090909, + "grad_norm": 0.007263890001922846, + "learning_rate": 5.603858279969188e-05, + "loss": 0.012006246484816074, + "num_input_tokens_seen": 126160704, + "step": 7704, + "train_runtime": 62604.9894, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 4.66969696969697, + "grad_norm": 0.007516798097640276, + "learning_rate": 5.6029036936056225e-05, + "loss": 0.011555795557796955, + "num_input_tokens_seen": 126177080, + "step": 7705, + "train_runtime": 62613.1008, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 4.67030303030303, + "grad_norm": 0.006984604522585869, + "learning_rate": 5.601949084941821e-05, + "loss": 0.011778357438743114, + "num_input_tokens_seen": 126193456, + "step": 7706, + "train_runtime": 62621.2129, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 4.670909090909091, + "grad_norm": 0.0037061921320855618, + "learning_rate": 5.6009944540130956e-05, + "loss": 0.012402926571667194, + "num_input_tokens_seen": 126209832, + "step": 7707, + "train_runtime": 62629.3333, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 4.671515151515152, + "grad_norm": 0.0061592706479132175, + "learning_rate": 5.600039800854756e-05, + "loss": 0.012600324116647243, + "num_input_tokens_seen": 126226208, + "step": 7708, + "train_runtime": 62637.4437, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.672121212121212, + "grad_norm": 0.006750389467924833, + "learning_rate": 5.5990851255021135e-05, + "loss": 0.011475963518023491, + "num_input_tokens_seen": 126242584, + "step": 7709, + "train_runtime": 62645.5556, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 4.672727272727273, + "grad_norm": 0.009535623714327812, + "learning_rate": 5.598130427990479e-05, + "loss": 0.012446466833353043, + "num_input_tokens_seen": 126258960, + "step": 7710, + "train_runtime": 62653.6659, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.673333333333334, + "grad_norm": 0.006015104707330465, + "learning_rate": 5.597175708355162e-05, + "loss": 0.011753576807677746, + "num_input_tokens_seen": 126275336, + "step": 7711, + "train_runtime": 62661.7782, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 4.673939393939394, + "grad_norm": 0.010641186498105526, + "learning_rate": 5.596220966631481e-05, + "loss": 0.012688640505075455, + "num_input_tokens_seen": 126291712, + "step": 7712, + "train_runtime": 62669.8927, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.674545454545455, + "grad_norm": 0.00641571544110775, + "learning_rate": 5.595266202854748e-05, + "loss": 0.011517995968461037, + "num_input_tokens_seen": 126308088, + "step": 7713, + "train_runtime": 62678.0087, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.675151515151515, + "grad_norm": 0.002296730177477002, + "learning_rate": 5.5943114170602764e-05, + "loss": 0.013043783605098724, + "num_input_tokens_seen": 126324464, + "step": 7714, + "train_runtime": 62686.1209, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 4.675757575757576, + "grad_norm": 0.00494965398684144, + "learning_rate": 5.5933566092833825e-05, + "loss": 0.01210373267531395, + "num_input_tokens_seen": 126340840, + "step": 7715, + "train_runtime": 62694.2326, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.676363636363637, + "grad_norm": 0.010163343511521816, + "learning_rate": 5.592401779559383e-05, + "loss": 0.011575276963412762, + "num_input_tokens_seen": 126357216, + "step": 7716, + "train_runtime": 62702.3477, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 4.676969696969697, + "grad_norm": 0.00719207851216197, + "learning_rate": 5.591446927923596e-05, + "loss": 0.012158653698861599, + "num_input_tokens_seen": 126373592, + "step": 7717, + "train_runtime": 62710.4617, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.677575757575758, + "grad_norm": 0.0074624414555728436, + "learning_rate": 5.5904920544113395e-05, + "loss": 0.012974969111382961, + "num_input_tokens_seen": 126389968, + "step": 7718, + "train_runtime": 62718.5731, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 4.678181818181818, + "grad_norm": 0.004055650904774666, + "learning_rate": 5.589537159057932e-05, + "loss": 0.011743529699742794, + "num_input_tokens_seen": 126406344, + "step": 7719, + "train_runtime": 62726.6832, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.678787878787879, + "grad_norm": 0.006089130416512489, + "learning_rate": 5.58858224189869e-05, + "loss": 0.011823783628642559, + "num_input_tokens_seen": 126422720, + "step": 7720, + "train_runtime": 62734.8003, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.67939393939394, + "grad_norm": 0.004581579938530922, + "learning_rate": 5.58762730296894e-05, + "loss": 0.011635949835181236, + "num_input_tokens_seen": 126439096, + "step": 7721, + "train_runtime": 62742.9107, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.68, + "grad_norm": 0.013395867310464382, + "learning_rate": 5.586672342303999e-05, + "loss": 0.013064623810350895, + "num_input_tokens_seen": 126455472, + "step": 7722, + "train_runtime": 62751.0195, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.680606060606061, + "grad_norm": 0.005444854497909546, + "learning_rate": 5.585717359939192e-05, + "loss": 0.011943895369768143, + "num_input_tokens_seen": 126471848, + "step": 7723, + "train_runtime": 62759.1334, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.681212121212122, + "grad_norm": 0.006839416455477476, + "learning_rate": 5.5847623559098395e-05, + "loss": 0.011963317170739174, + "num_input_tokens_seen": 126488224, + "step": 7724, + "train_runtime": 62767.244, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.681818181818182, + "grad_norm": 0.006717733107507229, + "learning_rate": 5.583807330251266e-05, + "loss": 0.01235231664031744, + "num_input_tokens_seen": 126504600, + "step": 7725, + "train_runtime": 62775.3559, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.682424242424243, + "grad_norm": 0.0016796269919723272, + "learning_rate": 5.5828522829987964e-05, + "loss": 0.01199068408459425, + "num_input_tokens_seen": 126520976, + "step": 7726, + "train_runtime": 62783.4666, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.683030303030303, + "grad_norm": 0.010398292914032936, + "learning_rate": 5.581897214187757e-05, + "loss": 0.01178824808448553, + "num_input_tokens_seen": 126537352, + "step": 7727, + "train_runtime": 62791.5794, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.683636363636364, + "grad_norm": 0.017961395904421806, + "learning_rate": 5.580942123853471e-05, + "loss": 0.01207984983921051, + "num_input_tokens_seen": 126553728, + "step": 7728, + "train_runtime": 62799.6925, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.6842424242424245, + "grad_norm": 0.007925903424620628, + "learning_rate": 5.5799870120312694e-05, + "loss": 0.012823725119233131, + "num_input_tokens_seen": 126570104, + "step": 7729, + "train_runtime": 62807.8049, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.684848484848485, + "grad_norm": 0.00829307734966278, + "learning_rate": 5.579031878756475e-05, + "loss": 0.011407403275370598, + "num_input_tokens_seen": 126586480, + "step": 7730, + "train_runtime": 62815.9142, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.6854545454545455, + "grad_norm": 0.007010846398770809, + "learning_rate": 5.5780767240644204e-05, + "loss": 0.01168019138276577, + "num_input_tokens_seen": 126602856, + "step": 7731, + "train_runtime": 62824.031, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.686060606060606, + "grad_norm": 0.00531670032069087, + "learning_rate": 5.577121547990434e-05, + "loss": 0.012387626804411411, + "num_input_tokens_seen": 126619232, + "step": 7732, + "train_runtime": 62832.1435, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.6866666666666665, + "grad_norm": 0.004709227476269007, + "learning_rate": 5.576166350569846e-05, + "loss": 0.011064683087170124, + "num_input_tokens_seen": 126635608, + "step": 7733, + "train_runtime": 62840.2551, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.6872727272727275, + "grad_norm": 0.006678287871181965, + "learning_rate": 5.575211131837984e-05, + "loss": 0.012107464484870434, + "num_input_tokens_seen": 126651984, + "step": 7734, + "train_runtime": 62848.3679, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.6878787878787875, + "grad_norm": 0.009298877790570259, + "learning_rate": 5.574255891830185e-05, + "loss": 0.012555924244225025, + "num_input_tokens_seen": 126668360, + "step": 7735, + "train_runtime": 62856.4807, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.6884848484848485, + "grad_norm": 0.006215481087565422, + "learning_rate": 5.573300630581778e-05, + "loss": 0.011744514107704163, + "num_input_tokens_seen": 126684736, + "step": 7736, + "train_runtime": 62864.5894, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.689090909090909, + "grad_norm": 0.0052190981805324554, + "learning_rate": 5.572345348128098e-05, + "loss": 0.012277994304895401, + "num_input_tokens_seen": 126701112, + "step": 7737, + "train_runtime": 62872.7017, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.6896969696969695, + "grad_norm": 0.0070575871504843235, + "learning_rate": 5.571390044504477e-05, + "loss": 0.011679389514029026, + "num_input_tokens_seen": 126717488, + "step": 7738, + "train_runtime": 62880.8134, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.69030303030303, + "grad_norm": 0.006225979421287775, + "learning_rate": 5.5704347197462506e-05, + "loss": 0.012162572704255581, + "num_input_tokens_seen": 126733864, + "step": 7739, + "train_runtime": 62888.9311, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.690909090909091, + "grad_norm": 0.008611059747636318, + "learning_rate": 5.569479373888756e-05, + "loss": 0.011751045472919941, + "num_input_tokens_seen": 126750240, + "step": 7740, + "train_runtime": 62897.0449, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.691515151515151, + "grad_norm": 0.005800665821880102, + "learning_rate": 5.568524006967328e-05, + "loss": 0.013195361010730267, + "num_input_tokens_seen": 126766616, + "step": 7741, + "train_runtime": 62905.1543, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.692121212121212, + "grad_norm": 0.05454397201538086, + "learning_rate": 5.567568619017305e-05, + "loss": 0.012747734785079956, + "num_input_tokens_seen": 126782992, + "step": 7742, + "train_runtime": 62913.267, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.692727272727272, + "grad_norm": 0.0083048976957798, + "learning_rate": 5.5666132100740223e-05, + "loss": 0.012259971350431442, + "num_input_tokens_seen": 126799368, + "step": 7743, + "train_runtime": 62921.3767, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.693333333333333, + "grad_norm": 0.004817217588424683, + "learning_rate": 5.5656577801728206e-05, + "loss": 0.012299253605306149, + "num_input_tokens_seen": 126815744, + "step": 7744, + "train_runtime": 62929.4888, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.693939393939394, + "grad_norm": 0.004814384039491415, + "learning_rate": 5.5647023293490405e-05, + "loss": 0.013155316933989525, + "num_input_tokens_seen": 126832120, + "step": 7745, + "train_runtime": 62937.5992, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.694545454545454, + "grad_norm": 0.005327410064637661, + "learning_rate": 5.56374685763802e-05, + "loss": 0.011984745971858501, + "num_input_tokens_seen": 126848496, + "step": 7746, + "train_runtime": 62945.7132, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 4.695151515151515, + "grad_norm": 0.006371151655912399, + "learning_rate": 5.5627913650751004e-05, + "loss": 0.011707944795489311, + "num_input_tokens_seen": 126864872, + "step": 7747, + "train_runtime": 62953.8305, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 4.695757575757575, + "grad_norm": 0.00581658398732543, + "learning_rate": 5.561835851695626e-05, + "loss": 0.011769506148993969, + "num_input_tokens_seen": 126881248, + "step": 7748, + "train_runtime": 62961.946, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 4.696363636363636, + "grad_norm": 0.006657500751316547, + "learning_rate": 5.5608803175349356e-05, + "loss": 0.01321131456643343, + "num_input_tokens_seen": 126897624, + "step": 7749, + "train_runtime": 62970.0594, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 4.696969696969697, + "grad_norm": 0.0010108980350196362, + "learning_rate": 5.559924762628377e-05, + "loss": 0.01049572043120861, + "num_input_tokens_seen": 126914000, + "step": 7750, + "train_runtime": 62978.1737, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 4.697575757575757, + "grad_norm": 0.007261292543262243, + "learning_rate": 5.558969187011289e-05, + "loss": 0.01266803964972496, + "num_input_tokens_seen": 126930376, + "step": 7751, + "train_runtime": 62986.2842, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 4.698181818181818, + "grad_norm": 0.008241045288741589, + "learning_rate": 5.558013590719021e-05, + "loss": 0.013920708559453487, + "num_input_tokens_seen": 126946752, + "step": 7752, + "train_runtime": 62994.3977, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 4.698787878787879, + "grad_norm": 0.007093343883752823, + "learning_rate": 5.5570579737869166e-05, + "loss": 0.010648823343217373, + "num_input_tokens_seen": 126963128, + "step": 7753, + "train_runtime": 63002.5131, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 4.699393939393939, + "grad_norm": 0.0066066281870007515, + "learning_rate": 5.556102336250323e-05, + "loss": 0.012616369873285294, + "num_input_tokens_seen": 126979504, + "step": 7754, + "train_runtime": 63010.6298, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 4.7, + "grad_norm": 0.007260696031153202, + "learning_rate": 5.5551466781445863e-05, + "loss": 0.012117899954319, + "num_input_tokens_seen": 126995880, + "step": 7755, + "train_runtime": 63018.7433, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 4.70060606060606, + "grad_norm": 0.004253680352121592, + "learning_rate": 5.5541909995050554e-05, + "loss": 0.01200677827000618, + "num_input_tokens_seen": 127012256, + "step": 7756, + "train_runtime": 63026.8555, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 4.701212121212121, + "grad_norm": 0.005172673612833023, + "learning_rate": 5.553235300367078e-05, + "loss": 0.011492326855659485, + "num_input_tokens_seen": 127028632, + "step": 7757, + "train_runtime": 63034.9687, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 4.701818181818182, + "grad_norm": 0.006241387687623501, + "learning_rate": 5.552279580766005e-05, + "loss": 0.012247263453900814, + "num_input_tokens_seen": 127045008, + "step": 7758, + "train_runtime": 63043.0835, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 4.702424242424242, + "grad_norm": 0.006264199037104845, + "learning_rate": 5.5513238407371856e-05, + "loss": 0.01293090544641018, + "num_input_tokens_seen": 127061384, + "step": 7759, + "train_runtime": 63051.1959, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 4.703030303030303, + "grad_norm": 0.008050181902945042, + "learning_rate": 5.550368080315972e-05, + "loss": 0.010836660861968994, + "num_input_tokens_seen": 127077760, + "step": 7760, + "train_runtime": 63059.3085, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 4.703636363636363, + "grad_norm": 0.006183304823935032, + "learning_rate": 5.549412299537714e-05, + "loss": 0.012333364225924015, + "num_input_tokens_seen": 127094136, + "step": 7761, + "train_runtime": 63067.4324, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 4.704242424242424, + "grad_norm": 0.004287291783839464, + "learning_rate": 5.548456498437764e-05, + "loss": 0.013658436015248299, + "num_input_tokens_seen": 127110512, + "step": 7762, + "train_runtime": 63075.5442, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 4.704848484848485, + "grad_norm": 0.0041948771104216576, + "learning_rate": 5.547500677051478e-05, + "loss": 0.01175331324338913, + "num_input_tokens_seen": 127126888, + "step": 7763, + "train_runtime": 63083.6566, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 4.705454545454545, + "grad_norm": 0.002994392067193985, + "learning_rate": 5.546544835414207e-05, + "loss": 0.011214029043912888, + "num_input_tokens_seen": 127143264, + "step": 7764, + "train_runtime": 63091.7702, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 4.706060606060606, + "grad_norm": 0.015834223479032516, + "learning_rate": 5.545588973561308e-05, + "loss": 0.013221628963947296, + "num_input_tokens_seen": 127159640, + "step": 7765, + "train_runtime": 63099.8843, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 4.706666666666667, + "grad_norm": 0.007622543256729841, + "learning_rate": 5.544633091528133e-05, + "loss": 0.012780467048287392, + "num_input_tokens_seen": 127176016, + "step": 7766, + "train_runtime": 63107.9957, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 4.707272727272727, + "grad_norm": 0.005849430337548256, + "learning_rate": 5.543677189350043e-05, + "loss": 0.01104824710637331, + "num_input_tokens_seen": 127192392, + "step": 7767, + "train_runtime": 63116.1064, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.707878787878788, + "grad_norm": 0.004920311272144318, + "learning_rate": 5.542721267062392e-05, + "loss": 0.01239200122654438, + "num_input_tokens_seen": 127208768, + "step": 7768, + "train_runtime": 63124.2202, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.708484848484849, + "grad_norm": 0.00489424541592598, + "learning_rate": 5.541765324700537e-05, + "loss": 0.01158895529806614, + "num_input_tokens_seen": 127225144, + "step": 7769, + "train_runtime": 63132.3329, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.709090909090909, + "grad_norm": 0.006797540467232466, + "learning_rate": 5.540809362299838e-05, + "loss": 0.011699507012963295, + "num_input_tokens_seen": 127241520, + "step": 7770, + "train_runtime": 63140.4433, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.70969696969697, + "grad_norm": 0.0032727087382227182, + "learning_rate": 5.5398533798956555e-05, + "loss": 0.011421626433730125, + "num_input_tokens_seen": 127257896, + "step": 7771, + "train_runtime": 63148.555, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.71030303030303, + "grad_norm": 0.003754819044843316, + "learning_rate": 5.538897377523347e-05, + "loss": 0.01223233062773943, + "num_input_tokens_seen": 127274272, + "step": 7772, + "train_runtime": 63156.669, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.710909090909091, + "grad_norm": 0.01253515761345625, + "learning_rate": 5.537941355218273e-05, + "loss": 0.011595639400184155, + "num_input_tokens_seen": 127290648, + "step": 7773, + "train_runtime": 63164.7802, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.711515151515152, + "grad_norm": 0.004491427913308144, + "learning_rate": 5.536985313015797e-05, + "loss": 0.01158834621310234, + "num_input_tokens_seen": 127307024, + "step": 7774, + "train_runtime": 63172.8911, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.712121212121212, + "grad_norm": 0.005442184861749411, + "learning_rate": 5.536029250951279e-05, + "loss": 0.010560455732047558, + "num_input_tokens_seen": 127323400, + "step": 7775, + "train_runtime": 63181.0028, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.712727272727273, + "grad_norm": 0.005135543178766966, + "learning_rate": 5.535073169060083e-05, + "loss": 0.01223057508468628, + "num_input_tokens_seen": 127339776, + "step": 7776, + "train_runtime": 63189.1152, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.713333333333333, + "grad_norm": 0.006932864896953106, + "learning_rate": 5.534117067377574e-05, + "loss": 0.01129085011780262, + "num_input_tokens_seen": 127356152, + "step": 7777, + "train_runtime": 63197.2307, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.713939393939394, + "grad_norm": 0.003931860905140638, + "learning_rate": 5.533160945939113e-05, + "loss": 0.01221106294542551, + "num_input_tokens_seen": 127372528, + "step": 7778, + "train_runtime": 63205.3413, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.714545454545455, + "grad_norm": 0.00748535105958581, + "learning_rate": 5.532204804780068e-05, + "loss": 0.012501387856900692, + "num_input_tokens_seen": 127388904, + "step": 7779, + "train_runtime": 63213.453, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.715151515151515, + "grad_norm": 0.010562242940068245, + "learning_rate": 5.531248643935803e-05, + "loss": 0.014058287255465984, + "num_input_tokens_seen": 127405280, + "step": 7780, + "train_runtime": 63221.5641, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.715757575757576, + "grad_norm": 0.005692309234291315, + "learning_rate": 5.530292463441685e-05, + "loss": 0.011898547410964966, + "num_input_tokens_seen": 127421656, + "step": 7781, + "train_runtime": 63229.6779, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.716363636363637, + "grad_norm": 0.007420817855745554, + "learning_rate": 5.529336263333083e-05, + "loss": 0.011603828519582748, + "num_input_tokens_seen": 127438032, + "step": 7782, + "train_runtime": 63237.7902, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.716969696969697, + "grad_norm": 0.007444572169333696, + "learning_rate": 5.5283800436453615e-05, + "loss": 0.011838075704872608, + "num_input_tokens_seen": 127454408, + "step": 7783, + "train_runtime": 63245.9037, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.717575757575758, + "grad_norm": 0.008741327561438084, + "learning_rate": 5.5274238044138925e-05, + "loss": 0.012546725571155548, + "num_input_tokens_seen": 127470784, + "step": 7784, + "train_runtime": 63254.0165, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.718181818181818, + "grad_norm": 0.007454595994204283, + "learning_rate": 5.526467545674043e-05, + "loss": 0.011966658756136894, + "num_input_tokens_seen": 127487160, + "step": 7785, + "train_runtime": 63262.1301, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.718787878787879, + "grad_norm": 0.012493195943534374, + "learning_rate": 5.5255112674611854e-05, + "loss": 0.012814865447580814, + "num_input_tokens_seen": 127503536, + "step": 7786, + "train_runtime": 63270.2555, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.71939393939394, + "grad_norm": 0.004231905564665794, + "learning_rate": 5.524554969810689e-05, + "loss": 0.011465507559478283, + "num_input_tokens_seen": 127519912, + "step": 7787, + "train_runtime": 63278.3812, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.72, + "grad_norm": 0.010750692337751389, + "learning_rate": 5.523598652757926e-05, + "loss": 0.012726335786283016, + "num_input_tokens_seen": 127536288, + "step": 7788, + "train_runtime": 63286.5035, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.720606060606061, + "grad_norm": 0.0027667072135955095, + "learning_rate": 5.522642316338268e-05, + "loss": 0.011010108515620232, + "num_input_tokens_seen": 127552664, + "step": 7789, + "train_runtime": 63294.629, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.721212121212121, + "grad_norm": 0.008852592669427395, + "learning_rate": 5.521685960587089e-05, + "loss": 0.011157935485243797, + "num_input_tokens_seen": 127569040, + "step": 7790, + "train_runtime": 63302.7487, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.721818181818182, + "grad_norm": 0.0086141312494874, + "learning_rate": 5.520729585539762e-05, + "loss": 0.011918151751160622, + "num_input_tokens_seen": 127585416, + "step": 7791, + "train_runtime": 63310.8636, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.722424242424243, + "grad_norm": 0.005609265994280577, + "learning_rate": 5.519773191231662e-05, + "loss": 0.011756869032979012, + "num_input_tokens_seen": 127601792, + "step": 7792, + "train_runtime": 63318.9784, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.723030303030303, + "grad_norm": 0.003316278336569667, + "learning_rate": 5.5188167776981626e-05, + "loss": 0.011323943734169006, + "num_input_tokens_seen": 127618168, + "step": 7793, + "train_runtime": 63327.0932, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.723636363636364, + "grad_norm": 0.004913877230137587, + "learning_rate": 5.517860344974642e-05, + "loss": 0.012218224816024303, + "num_input_tokens_seen": 127634544, + "step": 7794, + "train_runtime": 63335.2102, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.724242424242425, + "grad_norm": 0.005767126102000475, + "learning_rate": 5.5169038930964755e-05, + "loss": 0.012266877107322216, + "num_input_tokens_seen": 127650920, + "step": 7795, + "train_runtime": 63343.3318, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.724848484848485, + "grad_norm": 0.006699641700834036, + "learning_rate": 5.5159474220990416e-05, + "loss": 0.010816754773259163, + "num_input_tokens_seen": 127667296, + "step": 7796, + "train_runtime": 63351.4458, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.725454545454546, + "grad_norm": 0.009233702905476093, + "learning_rate": 5.514990932017715e-05, + "loss": 0.013005990535020828, + "num_input_tokens_seen": 127683672, + "step": 7797, + "train_runtime": 63359.5604, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.7260606060606065, + "grad_norm": 0.007553061470389366, + "learning_rate": 5.514034422887879e-05, + "loss": 0.011680412106215954, + "num_input_tokens_seen": 127700048, + "step": 7798, + "train_runtime": 63367.6736, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.726666666666667, + "grad_norm": 0.005823295097798109, + "learning_rate": 5.513077894744909e-05, + "loss": 0.012094995006918907, + "num_input_tokens_seen": 127716424, + "step": 7799, + "train_runtime": 63375.785, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.7272727272727275, + "grad_norm": 0.004010437522083521, + "learning_rate": 5.5121213476241895e-05, + "loss": 0.011223355308175087, + "num_input_tokens_seen": 127732800, + "step": 7800, + "train_runtime": 63383.8985, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.727878787878788, + "grad_norm": 0.010801691561937332, + "learning_rate": 5.511164781561096e-05, + "loss": 0.01221414003521204, + "num_input_tokens_seen": 127749176, + "step": 7801, + "train_runtime": 63393.0169, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 4.7284848484848485, + "grad_norm": 0.006782837677747011, + "learning_rate": 5.5102081965910135e-05, + "loss": 0.010960948653519154, + "num_input_tokens_seen": 127765552, + "step": 7802, + "train_runtime": 63401.1308, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.7290909090909095, + "grad_norm": 0.003005251055583358, + "learning_rate": 5.5092515927493226e-05, + "loss": 0.011781456880271435, + "num_input_tokens_seen": 127781928, + "step": 7803, + "train_runtime": 63409.2395, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 4.7296969696969695, + "grad_norm": 0.0066121164709329605, + "learning_rate": 5.508294970071408e-05, + "loss": 0.013491226360201836, + "num_input_tokens_seen": 127798304, + "step": 7804, + "train_runtime": 63417.345, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.7303030303030305, + "grad_norm": 0.005546892061829567, + "learning_rate": 5.507338328592653e-05, + "loss": 0.011029604822397232, + "num_input_tokens_seen": 127814680, + "step": 7805, + "train_runtime": 63425.4537, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 4.7309090909090905, + "grad_norm": 0.007575553376227617, + "learning_rate": 5.506381668348441e-05, + "loss": 0.011726319789886475, + "num_input_tokens_seen": 127831056, + "step": 7806, + "train_runtime": 63433.5653, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.7315151515151515, + "grad_norm": 0.009484547190368176, + "learning_rate": 5.505424989374157e-05, + "loss": 0.012681866064667702, + "num_input_tokens_seen": 127847432, + "step": 7807, + "train_runtime": 63441.6739, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 4.732121212121212, + "grad_norm": 0.009163664653897285, + "learning_rate": 5.504468291705186e-05, + "loss": 0.011042908765375614, + "num_input_tokens_seen": 127863808, + "step": 7808, + "train_runtime": 63449.7947, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.7327272727272724, + "grad_norm": 0.006709173787385225, + "learning_rate": 5.5035115753769164e-05, + "loss": 0.011388827115297318, + "num_input_tokens_seen": 127880184, + "step": 7809, + "train_runtime": 63457.9068, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.733333333333333, + "grad_norm": 0.0061860098503530025, + "learning_rate": 5.502554840424733e-05, + "loss": 0.011722002178430557, + "num_input_tokens_seen": 127896560, + "step": 7810, + "train_runtime": 63466.018, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 4.733939393939394, + "grad_norm": 0.006838430184870958, + "learning_rate": 5.501598086884025e-05, + "loss": 0.01177933532744646, + "num_input_tokens_seen": 127912936, + "step": 7811, + "train_runtime": 63474.1303, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.734545454545454, + "grad_norm": 0.009136213921010494, + "learning_rate": 5.500641314790182e-05, + "loss": 0.01156282052397728, + "num_input_tokens_seen": 127929312, + "step": 7812, + "train_runtime": 63482.2432, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 4.735151515151515, + "grad_norm": 0.007686370052397251, + "learning_rate": 5.49968452417859e-05, + "loss": 0.012194370850920677, + "num_input_tokens_seen": 127945688, + "step": 7813, + "train_runtime": 63490.353, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.735757575757575, + "grad_norm": 0.009169268421828747, + "learning_rate": 5.4987277150846415e-05, + "loss": 0.012323364615440369, + "num_input_tokens_seen": 127962064, + "step": 7814, + "train_runtime": 63498.4616, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 4.736363636363636, + "grad_norm": 0.008774418383836746, + "learning_rate": 5.4977708875437264e-05, + "loss": 0.012090123258531094, + "num_input_tokens_seen": 127978440, + "step": 7815, + "train_runtime": 63506.5693, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 4.736969696969697, + "grad_norm": 0.0031562787480652332, + "learning_rate": 5.4968140415912336e-05, + "loss": 0.011887643486261368, + "num_input_tokens_seen": 127994816, + "step": 7816, + "train_runtime": 63514.6783, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.737575757575757, + "grad_norm": 0.005399527959525585, + "learning_rate": 5.495857177262559e-05, + "loss": 0.011396858841180801, + "num_input_tokens_seen": 128011192, + "step": 7817, + "train_runtime": 63522.7905, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 4.738181818181818, + "grad_norm": 0.006530459970235825, + "learning_rate": 5.494900294593092e-05, + "loss": 0.011100883595645428, + "num_input_tokens_seen": 128027568, + "step": 7818, + "train_runtime": 63530.899, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.738787878787878, + "grad_norm": 0.007835956290364265, + "learning_rate": 5.493943393618227e-05, + "loss": 0.012370230630040169, + "num_input_tokens_seen": 128043944, + "step": 7819, + "train_runtime": 63539.0129, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.739393939393939, + "grad_norm": 0.007147411350160837, + "learning_rate": 5.492986474373357e-05, + "loss": 0.011521507985889912, + "num_input_tokens_seen": 128060320, + "step": 7820, + "train_runtime": 63547.1304, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 4.74, + "grad_norm": 0.008436573669314384, + "learning_rate": 5.492029536893879e-05, + "loss": 0.012130609713494778, + "num_input_tokens_seen": 128076696, + "step": 7821, + "train_runtime": 63555.2424, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.74060606060606, + "grad_norm": 0.012492320500314236, + "learning_rate": 5.4910725812151864e-05, + "loss": 0.012628821656107903, + "num_input_tokens_seen": 128093072, + "step": 7822, + "train_runtime": 63563.3526, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 4.741212121212121, + "grad_norm": 0.010410550981760025, + "learning_rate": 5.4901156073726746e-05, + "loss": 0.011207236908376217, + "num_input_tokens_seen": 128109448, + "step": 7823, + "train_runtime": 63571.463, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.741818181818182, + "grad_norm": 0.00047602399718016386, + "learning_rate": 5.489158615401741e-05, + "loss": 0.010877731256186962, + "num_input_tokens_seen": 128125824, + "step": 7824, + "train_runtime": 63579.5726, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 4.742424242424242, + "grad_norm": 0.006878368556499481, + "learning_rate": 5.4882016053377825e-05, + "loss": 0.012165321037173271, + "num_input_tokens_seen": 128142200, + "step": 7825, + "train_runtime": 63587.6828, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 4.743030303030303, + "grad_norm": 0.009976751171052456, + "learning_rate": 5.4872445772161973e-05, + "loss": 0.012444731779396534, + "num_input_tokens_seen": 128158576, + "step": 7826, + "train_runtime": 63595.7918, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 4.743636363636364, + "grad_norm": 0.012607053853571415, + "learning_rate": 5.486287531072386e-05, + "loss": 0.0133493198081851, + "num_input_tokens_seen": 128174952, + "step": 7827, + "train_runtime": 63603.9046, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 4.744242424242424, + "grad_norm": 0.007742696441709995, + "learning_rate": 5.4853304669417446e-05, + "loss": 0.012487281113862991, + "num_input_tokens_seen": 128191328, + "step": 7828, + "train_runtime": 63612.012, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 4.744848484848485, + "grad_norm": 0.00554098142310977, + "learning_rate": 5.4843733848596734e-05, + "loss": 0.012563329190015793, + "num_input_tokens_seen": 128207704, + "step": 7829, + "train_runtime": 63620.1214, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 4.745454545454545, + "grad_norm": 0.00675031216815114, + "learning_rate": 5.4834162848615754e-05, + "loss": 0.011662516742944717, + "num_input_tokens_seen": 128224080, + "step": 7830, + "train_runtime": 63628.2313, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 4.746060606060606, + "grad_norm": 0.005277880467474461, + "learning_rate": 5.48245916698285e-05, + "loss": 0.012751109898090363, + "num_input_tokens_seen": 128240456, + "step": 7831, + "train_runtime": 63636.3417, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 4.746666666666667, + "grad_norm": 0.005742565728724003, + "learning_rate": 5.4815020312589e-05, + "loss": 0.012464182451367378, + "num_input_tokens_seen": 128256832, + "step": 7832, + "train_runtime": 63644.4493, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 4.747272727272727, + "grad_norm": 0.020113147795200348, + "learning_rate": 5.480544877725127e-05, + "loss": 0.012293745763599873, + "num_input_tokens_seen": 128273208, + "step": 7833, + "train_runtime": 63652.5579, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 4.747878787878788, + "grad_norm": 0.0026031294837594032, + "learning_rate": 5.4795877064169345e-05, + "loss": 0.011134007945656776, + "num_input_tokens_seen": 128289584, + "step": 7834, + "train_runtime": 63660.6669, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 4.748484848484848, + "grad_norm": 0.005192755721509457, + "learning_rate": 5.478630517369726e-05, + "loss": 0.011175487190485, + "num_input_tokens_seen": 128305960, + "step": 7835, + "train_runtime": 63668.7748, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 4.749090909090909, + "grad_norm": 0.012804295867681503, + "learning_rate": 5.477673310618908e-05, + "loss": 0.01156587153673172, + "num_input_tokens_seen": 128322336, + "step": 7836, + "train_runtime": 63676.8825, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 4.74969696969697, + "grad_norm": 0.004947494249790907, + "learning_rate": 5.4767160861998844e-05, + "loss": 0.01314151007682085, + "num_input_tokens_seen": 128338712, + "step": 7837, + "train_runtime": 63684.9896, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 4.75030303030303, + "grad_norm": 0.0049768658354878426, + "learning_rate": 5.475758844148061e-05, + "loss": 0.012619560584425926, + "num_input_tokens_seen": 128355088, + "step": 7838, + "train_runtime": 63693.1027, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 4.750909090909091, + "grad_norm": 0.005821592640131712, + "learning_rate": 5.4748015844988446e-05, + "loss": 0.011053354479372501, + "num_input_tokens_seen": 128371464, + "step": 7839, + "train_runtime": 63701.213, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 4.751515151515152, + "grad_norm": 0.008445676416158676, + "learning_rate": 5.473844307287641e-05, + "loss": 0.012113629840314388, + "num_input_tokens_seen": 128387840, + "step": 7840, + "train_runtime": 63709.3193, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.752121212121212, + "grad_norm": 0.006708419416099787, + "learning_rate": 5.472887012549861e-05, + "loss": 0.012071083299815655, + "num_input_tokens_seen": 128404216, + "step": 7841, + "train_runtime": 63717.4304, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 4.752727272727273, + "grad_norm": 0.009007965214550495, + "learning_rate": 5.471929700320911e-05, + "loss": 0.013365587219595909, + "num_input_tokens_seen": 128420592, + "step": 7842, + "train_runtime": 63725.5386, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.753333333333333, + "grad_norm": 0.0030869257170706987, + "learning_rate": 5.470972370636199e-05, + "loss": 0.012041709385812283, + "num_input_tokens_seen": 128436968, + "step": 7843, + "train_runtime": 63733.6475, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.753939393939394, + "grad_norm": 0.005322783719748259, + "learning_rate": 5.4700150235311375e-05, + "loss": 0.012165552005171776, + "num_input_tokens_seen": 128453344, + "step": 7844, + "train_runtime": 63741.7553, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.754545454545455, + "grad_norm": 0.010225621052086353, + "learning_rate": 5.4690576590411355e-05, + "loss": 0.013318624347448349, + "num_input_tokens_seen": 128469720, + "step": 7845, + "train_runtime": 63749.8657, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.755151515151515, + "grad_norm": 0.007703743875026703, + "learning_rate": 5.468100277201604e-05, + "loss": 0.01229649968445301, + "num_input_tokens_seen": 128486096, + "step": 7846, + "train_runtime": 63757.9768, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.755757575757576, + "grad_norm": 0.008059169165790081, + "learning_rate": 5.467142878047954e-05, + "loss": 0.011372203938663006, + "num_input_tokens_seen": 128502472, + "step": 7847, + "train_runtime": 63766.0845, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.756363636363636, + "grad_norm": 0.00419127382338047, + "learning_rate": 5.466185461615599e-05, + "loss": 0.01242068037390709, + "num_input_tokens_seen": 128518848, + "step": 7848, + "train_runtime": 63774.1941, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.756969696969697, + "grad_norm": 0.005250133108347654, + "learning_rate": 5.465228027939953e-05, + "loss": 0.011685032397508621, + "num_input_tokens_seen": 128535224, + "step": 7849, + "train_runtime": 63782.3042, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.757575757575758, + "grad_norm": 0.007473303470760584, + "learning_rate": 5.4642705770564275e-05, + "loss": 0.011655866168439388, + "num_input_tokens_seen": 128551600, + "step": 7850, + "train_runtime": 63790.4131, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.758181818181818, + "grad_norm": 0.006742627359926701, + "learning_rate": 5.4633131090004374e-05, + "loss": 0.01247443351894617, + "num_input_tokens_seen": 128567976, + "step": 7851, + "train_runtime": 63798.5315, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.758787878787879, + "grad_norm": 0.005883189383894205, + "learning_rate": 5.462355623807397e-05, + "loss": 0.01312369853258133, + "num_input_tokens_seen": 128584352, + "step": 7852, + "train_runtime": 63806.6396, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.75939393939394, + "grad_norm": 0.0056130653247237206, + "learning_rate": 5.461398121512723e-05, + "loss": 0.012823778204619884, + "num_input_tokens_seen": 128600728, + "step": 7853, + "train_runtime": 63814.7496, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.76, + "grad_norm": 0.001260093878954649, + "learning_rate": 5.4604406021518315e-05, + "loss": 0.010924192145466805, + "num_input_tokens_seen": 128617104, + "step": 7854, + "train_runtime": 63822.8606, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.760606060606061, + "grad_norm": 0.0029958172235637903, + "learning_rate": 5.4594830657601384e-05, + "loss": 0.011837403289973736, + "num_input_tokens_seen": 128633480, + "step": 7855, + "train_runtime": 63830.9685, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.761212121212122, + "grad_norm": 0.007814579643309116, + "learning_rate": 5.4585255123730606e-05, + "loss": 0.011465190909802914, + "num_input_tokens_seen": 128649856, + "step": 7856, + "train_runtime": 63839.0788, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.761818181818182, + "grad_norm": 0.006455671973526478, + "learning_rate": 5.457567942026018e-05, + "loss": 0.012518517673015594, + "num_input_tokens_seen": 128666232, + "step": 7857, + "train_runtime": 63847.1917, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.762424242424243, + "grad_norm": 0.005595626309514046, + "learning_rate": 5.4566103547544277e-05, + "loss": 0.012295530177652836, + "num_input_tokens_seen": 128682608, + "step": 7858, + "train_runtime": 63855.3015, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.763030303030303, + "grad_norm": 0.008645884692668915, + "learning_rate": 5.4556527505937115e-05, + "loss": 0.0127254044637084, + "num_input_tokens_seen": 128698984, + "step": 7859, + "train_runtime": 63863.4105, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.763636363636364, + "grad_norm": 0.005677587818354368, + "learning_rate": 5.454695129579285e-05, + "loss": 0.011858783662319183, + "num_input_tokens_seen": 128715360, + "step": 7860, + "train_runtime": 63871.5216, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.764242424242425, + "grad_norm": 0.008096770383417606, + "learning_rate": 5.453737491746572e-05, + "loss": 0.012202859856188297, + "num_input_tokens_seen": 128731736, + "step": 7861, + "train_runtime": 63879.6305, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.764848484848485, + "grad_norm": 0.005977761931717396, + "learning_rate": 5.452779837130992e-05, + "loss": 0.011665412224829197, + "num_input_tokens_seen": 128748112, + "step": 7862, + "train_runtime": 63887.7406, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.765454545454546, + "grad_norm": 0.005513329524546862, + "learning_rate": 5.4518221657679694e-05, + "loss": 0.011158935725688934, + "num_input_tokens_seen": 128764488, + "step": 7863, + "train_runtime": 63895.8517, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.766060606060606, + "grad_norm": 0.008565876632928848, + "learning_rate": 5.450864477692923e-05, + "loss": 0.011134061962366104, + "num_input_tokens_seen": 128780864, + "step": 7864, + "train_runtime": 63903.9632, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.766666666666667, + "grad_norm": 0.004338220227509737, + "learning_rate": 5.449906772941279e-05, + "loss": 0.011255251243710518, + "num_input_tokens_seen": 128797240, + "step": 7865, + "train_runtime": 63912.0735, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.7672727272727276, + "grad_norm": 0.006324665620923042, + "learning_rate": 5.448949051548459e-05, + "loss": 0.011847339570522308, + "num_input_tokens_seen": 128813616, + "step": 7866, + "train_runtime": 63920.1824, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.767878787878788, + "grad_norm": 0.0013418466551229358, + "learning_rate": 5.4479913135498873e-05, + "loss": 0.012430778704583645, + "num_input_tokens_seen": 128829992, + "step": 7867, + "train_runtime": 63928.293, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.7684848484848485, + "grad_norm": 0.006941361352801323, + "learning_rate": 5.447033558980991e-05, + "loss": 0.012676510959863663, + "num_input_tokens_seen": 128846368, + "step": 7868, + "train_runtime": 63936.4074, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.7690909090909095, + "grad_norm": 0.0027285870164632797, + "learning_rate": 5.446075787877192e-05, + "loss": 0.010516476817429066, + "num_input_tokens_seen": 128862744, + "step": 7869, + "train_runtime": 63944.5178, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.7696969696969695, + "grad_norm": 0.005406194366514683, + "learning_rate": 5.4451180002739186e-05, + "loss": 0.011944283731281757, + "num_input_tokens_seen": 128879120, + "step": 7870, + "train_runtime": 63952.632, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.7703030303030305, + "grad_norm": 0.005062499549239874, + "learning_rate": 5.444160196206598e-05, + "loss": 0.012508991174399853, + "num_input_tokens_seen": 128895496, + "step": 7871, + "train_runtime": 63960.744, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.7709090909090905, + "grad_norm": 0.00288582150824368, + "learning_rate": 5.443202375710656e-05, + "loss": 0.012881744652986526, + "num_input_tokens_seen": 128911872, + "step": 7872, + "train_runtime": 63968.8532, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.7715151515151515, + "grad_norm": 0.007698263041675091, + "learning_rate": 5.442244538821523e-05, + "loss": 0.01208783220499754, + "num_input_tokens_seen": 128928248, + "step": 7873, + "train_runtime": 63976.9635, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.772121212121212, + "grad_norm": 0.007546284236013889, + "learning_rate": 5.441286685574625e-05, + "loss": 0.012032289989292622, + "num_input_tokens_seen": 128944624, + "step": 7874, + "train_runtime": 63985.073, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.7727272727272725, + "grad_norm": 0.007669487502425909, + "learning_rate": 5.44032881600539e-05, + "loss": 0.011375678703188896, + "num_input_tokens_seen": 128961000, + "step": 7875, + "train_runtime": 63993.1868, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.773333333333333, + "grad_norm": 0.0036527563352137804, + "learning_rate": 5.439370930149251e-05, + "loss": 0.011167764663696289, + "num_input_tokens_seen": 128977376, + "step": 7876, + "train_runtime": 64001.2942, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.7739393939393935, + "grad_norm": 0.006656865123659372, + "learning_rate": 5.4384130280416365e-05, + "loss": 0.012433795258402824, + "num_input_tokens_seen": 128993752, + "step": 7877, + "train_runtime": 64009.4024, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.774545454545454, + "grad_norm": 0.004450733307749033, + "learning_rate": 5.4374551097179784e-05, + "loss": 0.012112068012356758, + "num_input_tokens_seen": 129010128, + "step": 7878, + "train_runtime": 64017.5125, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.775151515151515, + "grad_norm": 0.007601737976074219, + "learning_rate": 5.4364971752137074e-05, + "loss": 0.011879295110702515, + "num_input_tokens_seen": 129026504, + "step": 7879, + "train_runtime": 64025.6323, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.775757575757575, + "grad_norm": 0.00764523446559906, + "learning_rate": 5.435539224564255e-05, + "loss": 0.011984648182988167, + "num_input_tokens_seen": 129042880, + "step": 7880, + "train_runtime": 64033.7432, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.776363636363636, + "grad_norm": 0.007920248433947563, + "learning_rate": 5.434581257805056e-05, + "loss": 0.012191393412649632, + "num_input_tokens_seen": 129059256, + "step": 7881, + "train_runtime": 64041.8503, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.776969696969697, + "grad_norm": 0.007811750750988722, + "learning_rate": 5.433623274971543e-05, + "loss": 0.011389036662876606, + "num_input_tokens_seen": 129075632, + "step": 7882, + "train_runtime": 64049.9619, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.777575757575757, + "grad_norm": 0.0075828637927770615, + "learning_rate": 5.432665276099148e-05, + "loss": 0.012533457018435001, + "num_input_tokens_seen": 129092008, + "step": 7883, + "train_runtime": 64058.0731, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.778181818181818, + "grad_norm": 0.00599028030410409, + "learning_rate": 5.431707261223309e-05, + "loss": 0.011829948052763939, + "num_input_tokens_seen": 129108384, + "step": 7884, + "train_runtime": 64066.1797, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.778787878787879, + "grad_norm": 0.007583195809274912, + "learning_rate": 5.430749230379457e-05, + "loss": 0.013321954756975174, + "num_input_tokens_seen": 129124760, + "step": 7885, + "train_runtime": 64074.2909, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.779393939393939, + "grad_norm": 0.010045766830444336, + "learning_rate": 5.429791183603031e-05, + "loss": 0.011645630933344364, + "num_input_tokens_seen": 129141136, + "step": 7886, + "train_runtime": 64082.4029, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.78, + "grad_norm": 0.016569865867495537, + "learning_rate": 5.428833120929466e-05, + "loss": 0.011908693239092827, + "num_input_tokens_seen": 129157512, + "step": 7887, + "train_runtime": 64090.5156, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.78060606060606, + "grad_norm": 0.005201397929340601, + "learning_rate": 5.427875042394199e-05, + "loss": 0.011564147658646107, + "num_input_tokens_seen": 129173888, + "step": 7888, + "train_runtime": 64098.6302, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.781212121212121, + "grad_norm": 0.008413280360400677, + "learning_rate": 5.426916948032666e-05, + "loss": 0.011817865073680878, + "num_input_tokens_seen": 129190264, + "step": 7889, + "train_runtime": 64106.7414, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.781818181818182, + "grad_norm": 0.005617856048047543, + "learning_rate": 5.4259588378803094e-05, + "loss": 0.012686707079410553, + "num_input_tokens_seen": 129206640, + "step": 7890, + "train_runtime": 64114.854, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.782424242424242, + "grad_norm": 0.011876258999109268, + "learning_rate": 5.425000711972563e-05, + "loss": 0.012806899845600128, + "num_input_tokens_seen": 129223016, + "step": 7891, + "train_runtime": 64122.9627, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.783030303030303, + "grad_norm": 0.009535231627523899, + "learning_rate": 5.4240425703448684e-05, + "loss": 0.011657465249300003, + "num_input_tokens_seen": 129239392, + "step": 7892, + "train_runtime": 64131.0694, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.783636363636363, + "grad_norm": 0.006543212570250034, + "learning_rate": 5.423084413032664e-05, + "loss": 0.012326833792030811, + "num_input_tokens_seen": 129255768, + "step": 7893, + "train_runtime": 64139.1783, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.784242424242424, + "grad_norm": 0.0009862191509455442, + "learning_rate": 5.4221262400713925e-05, + "loss": 0.01089545153081417, + "num_input_tokens_seen": 129272144, + "step": 7894, + "train_runtime": 64147.284, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.784848484848485, + "grad_norm": 0.008590923622250557, + "learning_rate": 5.421168051496492e-05, + "loss": 0.012281442061066628, + "num_input_tokens_seen": 129288520, + "step": 7895, + "train_runtime": 64155.3928, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.785454545454545, + "grad_norm": 0.005395393818616867, + "learning_rate": 5.420209847343407e-05, + "loss": 0.012172514572739601, + "num_input_tokens_seen": 129304896, + "step": 7896, + "train_runtime": 64163.5019, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.786060606060606, + "grad_norm": 0.011512644588947296, + "learning_rate": 5.419251627647578e-05, + "loss": 0.011817613616585732, + "num_input_tokens_seen": 129321272, + "step": 7897, + "train_runtime": 64171.6125, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.786666666666667, + "grad_norm": 0.010663868859410286, + "learning_rate": 5.418293392444447e-05, + "loss": 0.01106716226786375, + "num_input_tokens_seen": 129337648, + "step": 7898, + "train_runtime": 64179.7303, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.787272727272727, + "grad_norm": 0.01053199265152216, + "learning_rate": 5.417335141769457e-05, + "loss": 0.012141745537519455, + "num_input_tokens_seen": 129354024, + "step": 7899, + "train_runtime": 64187.8374, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.787878787878788, + "grad_norm": 0.007217059843242168, + "learning_rate": 5.4163768756580546e-05, + "loss": 0.013157972134649754, + "num_input_tokens_seen": 129370400, + "step": 7900, + "train_runtime": 64195.9468, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.788484848484848, + "grad_norm": 0.01140880212187767, + "learning_rate": 5.415418594145682e-05, + "loss": 0.012345722876489162, + "num_input_tokens_seen": 129386776, + "step": 7901, + "train_runtime": 64204.9453, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.789090909090909, + "grad_norm": 0.005106334108859301, + "learning_rate": 5.414460297267784e-05, + "loss": 0.01293596625328064, + "num_input_tokens_seen": 129403152, + "step": 7902, + "train_runtime": 64213.0787, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.78969696969697, + "grad_norm": 0.00843972247093916, + "learning_rate": 5.413501985059807e-05, + "loss": 0.011291584931313992, + "num_input_tokens_seen": 129419528, + "step": 7903, + "train_runtime": 64221.2189, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.79030303030303, + "grad_norm": 0.008940930478274822, + "learning_rate": 5.4125436575571966e-05, + "loss": 0.012248530052602291, + "num_input_tokens_seen": 129435904, + "step": 7904, + "train_runtime": 64229.3518, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.790909090909091, + "grad_norm": 0.006940728053450584, + "learning_rate": 5.411585314795401e-05, + "loss": 0.011923219077289104, + "num_input_tokens_seen": 129452280, + "step": 7905, + "train_runtime": 64237.4737, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 4.791515151515151, + "grad_norm": 0.0038105142302811146, + "learning_rate": 5.410626956809863e-05, + "loss": 0.01129075326025486, + "num_input_tokens_seen": 129468656, + "step": 7906, + "train_runtime": 64245.583, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.792121212121212, + "grad_norm": 0.006375696510076523, + "learning_rate": 5.409668583636036e-05, + "loss": 0.013573604635894299, + "num_input_tokens_seen": 129485032, + "step": 7907, + "train_runtime": 64253.6905, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 4.792727272727273, + "grad_norm": 0.010161584243178368, + "learning_rate": 5.408710195309366e-05, + "loss": 0.01205070223659277, + "num_input_tokens_seen": 129501408, + "step": 7908, + "train_runtime": 64261.8035, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.793333333333333, + "grad_norm": 0.00977786909788847, + "learning_rate": 5.407751791865302e-05, + "loss": 0.012276122346520424, + "num_input_tokens_seen": 129517784, + "step": 7909, + "train_runtime": 64269.9131, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 4.793939393939394, + "grad_norm": 0.018898380920290947, + "learning_rate": 5.4067933733392915e-05, + "loss": 0.01276368834078312, + "num_input_tokens_seen": 129534160, + "step": 7910, + "train_runtime": 64278.0293, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.794545454545455, + "grad_norm": 0.00956640113145113, + "learning_rate": 5.4058349397667885e-05, + "loss": 0.01354953832924366, + "num_input_tokens_seen": 129550536, + "step": 7911, + "train_runtime": 64286.135, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 4.795151515151515, + "grad_norm": 0.006277823355048895, + "learning_rate": 5.4048764911832396e-05, + "loss": 0.011808311566710472, + "num_input_tokens_seen": 129566912, + "step": 7912, + "train_runtime": 64294.2461, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.795757575757576, + "grad_norm": 0.008185704238712788, + "learning_rate": 5.403918027624097e-05, + "loss": 0.012253200635313988, + "num_input_tokens_seen": 129583288, + "step": 7913, + "train_runtime": 64302.3537, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 4.796363636363637, + "grad_norm": 0.0076940651051700115, + "learning_rate": 5.4029595491248164e-05, + "loss": 0.011461117304861546, + "num_input_tokens_seen": 129599664, + "step": 7914, + "train_runtime": 64310.4606, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 4.796969696969697, + "grad_norm": 0.0061295172199606895, + "learning_rate": 5.402001055720844e-05, + "loss": 0.010788757354021072, + "num_input_tokens_seen": 129616040, + "step": 7915, + "train_runtime": 64318.5688, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.797575757575758, + "grad_norm": 0.007649904582649469, + "learning_rate": 5.401042547447637e-05, + "loss": 0.01164589636027813, + "num_input_tokens_seen": 129632416, + "step": 7916, + "train_runtime": 64326.6744, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 4.798181818181818, + "grad_norm": 0.005568922031670809, + "learning_rate": 5.400084024340646e-05, + "loss": 0.012134824879467487, + "num_input_tokens_seen": 129648792, + "step": 7917, + "train_runtime": 64334.7815, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.798787878787879, + "grad_norm": 0.007181449327617884, + "learning_rate": 5.399125486435327e-05, + "loss": 0.011540709994733334, + "num_input_tokens_seen": 129665168, + "step": 7918, + "train_runtime": 64342.8905, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 4.79939393939394, + "grad_norm": 0.006770353764295578, + "learning_rate": 5.398166933767134e-05, + "loss": 0.011928034946322441, + "num_input_tokens_seen": 129681544, + "step": 7919, + "train_runtime": 64350.9993, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.8, + "grad_norm": 0.005567293148487806, + "learning_rate": 5.39720836637152e-05, + "loss": 0.011597163043916225, + "num_input_tokens_seen": 129697920, + "step": 7920, + "train_runtime": 64359.1081, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 4.800606060606061, + "grad_norm": 0.005844739731401205, + "learning_rate": 5.396249784283942e-05, + "loss": 0.011704782955348492, + "num_input_tokens_seen": 129714296, + "step": 7921, + "train_runtime": 64367.2154, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.801212121212121, + "grad_norm": 0.007826928049325943, + "learning_rate": 5.395291187539857e-05, + "loss": 0.01186783891171217, + "num_input_tokens_seen": 129730672, + "step": 7922, + "train_runtime": 64375.3316, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 4.801818181818182, + "grad_norm": 0.01044582761824131, + "learning_rate": 5.394332576174721e-05, + "loss": 0.013853304088115692, + "num_input_tokens_seen": 129747048, + "step": 7923, + "train_runtime": 64383.4406, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.802424242424243, + "grad_norm": 0.009032058529555798, + "learning_rate": 5.393373950223991e-05, + "loss": 0.01281433179974556, + "num_input_tokens_seen": 129763424, + "step": 7924, + "train_runtime": 64391.5481, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 4.803030303030303, + "grad_norm": 0.006209979299455881, + "learning_rate": 5.3924153097231237e-05, + "loss": 0.012174495495855808, + "num_input_tokens_seen": 129779800, + "step": 7925, + "train_runtime": 64399.6581, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 4.803636363636364, + "grad_norm": 0.007260410115122795, + "learning_rate": 5.391456654707579e-05, + "loss": 0.01298435777425766, + "num_input_tokens_seen": 129796176, + "step": 7926, + "train_runtime": 64407.7668, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.804242424242425, + "grad_norm": 0.008398675359785557, + "learning_rate": 5.390497985212816e-05, + "loss": 0.012828581966459751, + "num_input_tokens_seen": 129812552, + "step": 7927, + "train_runtime": 64415.8785, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 4.804848484848485, + "grad_norm": 0.005309659987688065, + "learning_rate": 5.389539301274291e-05, + "loss": 0.012582269497215748, + "num_input_tokens_seen": 129828928, + "step": 7928, + "train_runtime": 64423.9858, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.805454545454546, + "grad_norm": 0.004749129060655832, + "learning_rate": 5.3885806029274666e-05, + "loss": 0.011340965516865253, + "num_input_tokens_seen": 129845304, + "step": 7929, + "train_runtime": 64432.092, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 4.806060606060606, + "grad_norm": 0.01118831243366003, + "learning_rate": 5.387621890207803e-05, + "loss": 0.011936484836041927, + "num_input_tokens_seen": 129861680, + "step": 7930, + "train_runtime": 64440.2005, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.806666666666667, + "grad_norm": 0.007600531447678804, + "learning_rate": 5.386663163150759e-05, + "loss": 0.012709558941423893, + "num_input_tokens_seen": 129878056, + "step": 7931, + "train_runtime": 64448.31, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 4.807272727272728, + "grad_norm": 0.00781118543818593, + "learning_rate": 5.385704421791799e-05, + "loss": 0.012919718399643898, + "num_input_tokens_seen": 129894432, + "step": 7932, + "train_runtime": 64456.4201, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.807878787878788, + "grad_norm": 0.0036623107735067606, + "learning_rate": 5.384745666166382e-05, + "loss": 0.011397486552596092, + "num_input_tokens_seen": 129910808, + "step": 7933, + "train_runtime": 64464.5323, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 4.808484848484849, + "grad_norm": 0.00869760662317276, + "learning_rate": 5.383786896309974e-05, + "loss": 0.01225908100605011, + "num_input_tokens_seen": 129927184, + "step": 7934, + "train_runtime": 64472.641, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.809090909090909, + "grad_norm": 0.007894969545304775, + "learning_rate": 5.382828112258035e-05, + "loss": 0.011846808716654778, + "num_input_tokens_seen": 129943560, + "step": 7935, + "train_runtime": 64480.7497, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 4.80969696969697, + "grad_norm": 0.00850802194327116, + "learning_rate": 5.381869314046031e-05, + "loss": 0.01163252629339695, + "num_input_tokens_seen": 129959936, + "step": 7936, + "train_runtime": 64488.8569, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.8103030303030305, + "grad_norm": 0.0087225791066885, + "learning_rate": 5.380910501709423e-05, + "loss": 0.012625991366803646, + "num_input_tokens_seen": 129976312, + "step": 7937, + "train_runtime": 64496.968, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 4.810909090909091, + "grad_norm": 0.007836904376745224, + "learning_rate": 5.3799516752836774e-05, + "loss": 0.011689224280416965, + "num_input_tokens_seen": 129992688, + "step": 7938, + "train_runtime": 64505.079, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.8115151515151515, + "grad_norm": 0.00825989618897438, + "learning_rate": 5.3789928348042594e-05, + "loss": 0.01270642876625061, + "num_input_tokens_seen": 130009064, + "step": 7939, + "train_runtime": 64513.1859, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 4.8121212121212125, + "grad_norm": 0.0050797793082892895, + "learning_rate": 5.378033980306635e-05, + "loss": 0.011020609177649021, + "num_input_tokens_seen": 130025440, + "step": 7940, + "train_runtime": 64521.2936, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.8127272727272725, + "grad_norm": 0.009830729104578495, + "learning_rate": 5.3770751118262716e-05, + "loss": 0.011583277024328709, + "num_input_tokens_seen": 130041816, + "step": 7941, + "train_runtime": 64529.4039, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 4.8133333333333335, + "grad_norm": 0.007956464774906635, + "learning_rate": 5.376116229398631e-05, + "loss": 0.011529582552611828, + "num_input_tokens_seen": 130058192, + "step": 7942, + "train_runtime": 64537.5148, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.813939393939394, + "grad_norm": 0.009491204284131527, + "learning_rate": 5.375157333059185e-05, + "loss": 0.012814349494874477, + "num_input_tokens_seen": 130074568, + "step": 7943, + "train_runtime": 64545.6207, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.8145454545454545, + "grad_norm": 0.0006146999076008797, + "learning_rate": 5.3741984228433996e-05, + "loss": 0.0119182663038373, + "num_input_tokens_seen": 130090944, + "step": 7944, + "train_runtime": 64553.7309, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.815151515151515, + "grad_norm": 0.005846685729920864, + "learning_rate": 5.373239498786743e-05, + "loss": 0.011443963274359703, + "num_input_tokens_seen": 130107320, + "step": 7945, + "train_runtime": 64561.8402, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.8157575757575755, + "grad_norm": 0.004879082087427378, + "learning_rate": 5.372280560924685e-05, + "loss": 0.011678011156618595, + "num_input_tokens_seen": 130123696, + "step": 7946, + "train_runtime": 64569.9476, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.816363636363636, + "grad_norm": 0.004932955838739872, + "learning_rate": 5.3713216092926944e-05, + "loss": 0.010745376348495483, + "num_input_tokens_seen": 130140072, + "step": 7947, + "train_runtime": 64578.0556, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.816969696969697, + "grad_norm": 0.006668645888566971, + "learning_rate": 5.3703626439262386e-05, + "loss": 0.011522757820785046, + "num_input_tokens_seen": 130156448, + "step": 7948, + "train_runtime": 64586.1641, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.817575757575757, + "grad_norm": 0.005249621346592903, + "learning_rate": 5.369403664860791e-05, + "loss": 0.011545016430318356, + "num_input_tokens_seen": 130172824, + "step": 7949, + "train_runtime": 64594.275, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.818181818181818, + "grad_norm": 0.006151698529720306, + "learning_rate": 5.368444672131822e-05, + "loss": 0.011951982043683529, + "num_input_tokens_seen": 130189200, + "step": 7950, + "train_runtime": 64602.3836, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.818787878787878, + "grad_norm": 0.006128935143351555, + "learning_rate": 5.3674856657748015e-05, + "loss": 0.012949834577739239, + "num_input_tokens_seen": 130205576, + "step": 7951, + "train_runtime": 64610.4916, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.819393939393939, + "grad_norm": 0.0071241408586502075, + "learning_rate": 5.3665266458252004e-05, + "loss": 0.011762103997170925, + "num_input_tokens_seen": 130221952, + "step": 7952, + "train_runtime": 64618.6026, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.82, + "grad_norm": 0.007538822945207357, + "learning_rate": 5.365567612318494e-05, + "loss": 0.01192504446953535, + "num_input_tokens_seen": 130238328, + "step": 7953, + "train_runtime": 64626.7118, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.82060606060606, + "grad_norm": 0.006664446089416742, + "learning_rate": 5.364608565290155e-05, + "loss": 0.010920369997620583, + "num_input_tokens_seen": 130254704, + "step": 7954, + "train_runtime": 64634.8204, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.821212121212121, + "grad_norm": 0.009826021268963814, + "learning_rate": 5.363649504775653e-05, + "loss": 0.0113326795399189, + "num_input_tokens_seen": 130271080, + "step": 7955, + "train_runtime": 64642.9318, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.821818181818182, + "grad_norm": 0.007535375654697418, + "learning_rate": 5.3626904308104634e-05, + "loss": 0.01349166501313448, + "num_input_tokens_seen": 130287456, + "step": 7956, + "train_runtime": 64651.0426, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.822424242424242, + "grad_norm": 0.010830850340425968, + "learning_rate": 5.361731343430062e-05, + "loss": 0.012720078229904175, + "num_input_tokens_seen": 130303832, + "step": 7957, + "train_runtime": 64659.1532, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.823030303030303, + "grad_norm": 0.01142242643982172, + "learning_rate": 5.360772242669922e-05, + "loss": 0.011251005344092846, + "num_input_tokens_seen": 130320208, + "step": 7958, + "train_runtime": 64667.2607, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.823636363636363, + "grad_norm": 0.007494893856346607, + "learning_rate": 5.3598131285655216e-05, + "loss": 0.011391369625926018, + "num_input_tokens_seen": 130336584, + "step": 7959, + "train_runtime": 64675.3694, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.824242424242424, + "grad_norm": 0.007962747476994991, + "learning_rate": 5.358854001152333e-05, + "loss": 0.011778423562645912, + "num_input_tokens_seen": 130352960, + "step": 7960, + "train_runtime": 64683.4804, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.824848484848485, + "grad_norm": 0.007866855710744858, + "learning_rate": 5.357894860465833e-05, + "loss": 0.011622886173427105, + "num_input_tokens_seen": 130369336, + "step": 7961, + "train_runtime": 64691.591, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.825454545454545, + "grad_norm": 0.0025869228411465883, + "learning_rate": 5.356935706541499e-05, + "loss": 0.011502083390951157, + "num_input_tokens_seen": 130385712, + "step": 7962, + "train_runtime": 64699.6989, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.826060606060606, + "grad_norm": 0.007607310079038143, + "learning_rate": 5.3559765394148086e-05, + "loss": 0.012961024418473244, + "num_input_tokens_seen": 130402088, + "step": 7963, + "train_runtime": 64707.8103, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.826666666666666, + "grad_norm": 0.01036771945655346, + "learning_rate": 5.3550173591212416e-05, + "loss": 0.012153726071119308, + "num_input_tokens_seen": 130418464, + "step": 7964, + "train_runtime": 64715.9224, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.827272727272727, + "grad_norm": 0.006845572497695684, + "learning_rate": 5.3540581656962706e-05, + "loss": 0.011731993407011032, + "num_input_tokens_seen": 130434840, + "step": 7965, + "train_runtime": 64724.0339, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.827878787878788, + "grad_norm": 0.011055968701839447, + "learning_rate": 5.353098959175379e-05, + "loss": 0.01257930975407362, + "num_input_tokens_seen": 130451216, + "step": 7966, + "train_runtime": 64732.1426, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.828484848484848, + "grad_norm": 0.006228553596884012, + "learning_rate": 5.352139739594044e-05, + "loss": 0.01171241607517004, + "num_input_tokens_seen": 130467592, + "step": 7967, + "train_runtime": 64740.2531, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.829090909090909, + "grad_norm": 0.005879545584321022, + "learning_rate": 5.351180506987747e-05, + "loss": 0.012425174936652184, + "num_input_tokens_seen": 130483968, + "step": 7968, + "train_runtime": 64748.3659, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.82969696969697, + "grad_norm": 0.010968852788209915, + "learning_rate": 5.350221261391966e-05, + "loss": 0.012525176629424095, + "num_input_tokens_seen": 130500344, + "step": 7969, + "train_runtime": 64756.4753, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.83030303030303, + "grad_norm": 0.008048497140407562, + "learning_rate": 5.349262002842182e-05, + "loss": 0.012591879814863205, + "num_input_tokens_seen": 130516720, + "step": 7970, + "train_runtime": 64764.5832, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.830909090909091, + "grad_norm": 0.0038498383946716785, + "learning_rate": 5.3483027313738764e-05, + "loss": 0.012366648763418198, + "num_input_tokens_seen": 130533096, + "step": 7971, + "train_runtime": 64772.6925, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.831515151515152, + "grad_norm": 0.004182217642664909, + "learning_rate": 5.347343447022533e-05, + "loss": 0.011695551685988903, + "num_input_tokens_seen": 130549472, + "step": 7972, + "train_runtime": 64780.8011, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.832121212121212, + "grad_norm": 0.006824258249253035, + "learning_rate": 5.346384149823631e-05, + "loss": 0.01162599865347147, + "num_input_tokens_seen": 130565848, + "step": 7973, + "train_runtime": 64788.9088, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.832727272727273, + "grad_norm": 0.00496063195168972, + "learning_rate": 5.345424839812654e-05, + "loss": 0.011790132150053978, + "num_input_tokens_seen": 130582224, + "step": 7974, + "train_runtime": 64797.0177, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.833333333333333, + "grad_norm": 0.006580311339348555, + "learning_rate": 5.3444655170250834e-05, + "loss": 0.012607484124600887, + "num_input_tokens_seen": 130598600, + "step": 7975, + "train_runtime": 64805.1301, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.833939393939394, + "grad_norm": 0.005542982369661331, + "learning_rate": 5.343506181496405e-05, + "loss": 0.012260831892490387, + "num_input_tokens_seen": 130614976, + "step": 7976, + "train_runtime": 64813.2393, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.834545454545455, + "grad_norm": 0.009147954173386097, + "learning_rate": 5.3425468332621034e-05, + "loss": 0.011921907775104046, + "num_input_tokens_seen": 130631352, + "step": 7977, + "train_runtime": 64821.3477, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.835151515151515, + "grad_norm": 0.013463393785059452, + "learning_rate": 5.34158747235766e-05, + "loss": 0.012884139083325863, + "num_input_tokens_seen": 130647728, + "step": 7978, + "train_runtime": 64829.456, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.835757575757576, + "grad_norm": 0.006192738655954599, + "learning_rate": 5.340628098818561e-05, + "loss": 0.01182072889059782, + "num_input_tokens_seen": 130664104, + "step": 7979, + "train_runtime": 64837.5663, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.836363636363636, + "grad_norm": 0.006606599316000938, + "learning_rate": 5.339668712680293e-05, + "loss": 0.012475390918552876, + "num_input_tokens_seen": 130680480, + "step": 7980, + "train_runtime": 64845.6732, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.836969696969697, + "grad_norm": 0.008340008556842804, + "learning_rate": 5.33870931397834e-05, + "loss": 0.010902278125286102, + "num_input_tokens_seen": 130696856, + "step": 7981, + "train_runtime": 64853.7806, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.837575757575758, + "grad_norm": 0.0035960020031780005, + "learning_rate": 5.3377499027481895e-05, + "loss": 0.011418286710977554, + "num_input_tokens_seen": 130713232, + "step": 7982, + "train_runtime": 64861.8905, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.838181818181818, + "grad_norm": 0.006967714987695217, + "learning_rate": 5.3367904790253265e-05, + "loss": 0.013138656504452229, + "num_input_tokens_seen": 130729608, + "step": 7983, + "train_runtime": 64869.9984, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.838787878787879, + "grad_norm": 0.008774694055318832, + "learning_rate": 5.335831042845242e-05, + "loss": 0.01199450995773077, + "num_input_tokens_seen": 130745984, + "step": 7984, + "train_runtime": 64878.1081, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.83939393939394, + "grad_norm": 0.007884979248046875, + "learning_rate": 5.334871594243418e-05, + "loss": 0.012999688275158405, + "num_input_tokens_seen": 130762360, + "step": 7985, + "train_runtime": 64886.215, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.84, + "grad_norm": 0.008492684923112392, + "learning_rate": 5.33391213325535e-05, + "loss": 0.012459184974431992, + "num_input_tokens_seen": 130778736, + "step": 7986, + "train_runtime": 64894.3321, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.840606060606061, + "grad_norm": 0.007785535417497158, + "learning_rate": 5.3329526599165204e-05, + "loss": 0.012342577800154686, + "num_input_tokens_seen": 130795112, + "step": 7987, + "train_runtime": 64902.4441, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.841212121212121, + "grad_norm": 0.003358351532369852, + "learning_rate": 5.33199317426242e-05, + "loss": 0.01188154798001051, + "num_input_tokens_seen": 130811488, + "step": 7988, + "train_runtime": 64910.5536, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 4.841818181818182, + "grad_norm": 0.0071096885949373245, + "learning_rate": 5.3310336763285386e-05, + "loss": 0.01360334362834692, + "num_input_tokens_seen": 130827864, + "step": 7989, + "train_runtime": 64918.6628, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 4.842424242424243, + "grad_norm": 0.0091654472053051, + "learning_rate": 5.330074166150365e-05, + "loss": 0.012130281887948513, + "num_input_tokens_seen": 130844240, + "step": 7990, + "train_runtime": 64926.7728, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 4.843030303030303, + "grad_norm": 0.0074912989512085915, + "learning_rate": 5.329114643763393e-05, + "loss": 0.011960696429014206, + "num_input_tokens_seen": 130860616, + "step": 7991, + "train_runtime": 64934.8848, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 4.843636363636364, + "grad_norm": 0.005820361897349358, + "learning_rate": 5.32815510920311e-05, + "loss": 0.01132245548069477, + "num_input_tokens_seen": 130876992, + "step": 7992, + "train_runtime": 64942.9937, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 4.844242424242424, + "grad_norm": 0.004559758584946394, + "learning_rate": 5.32719556250501e-05, + "loss": 0.011373481713235378, + "num_input_tokens_seen": 130893368, + "step": 7993, + "train_runtime": 64951.1056, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 4.844848484848485, + "grad_norm": 0.008521380834281445, + "learning_rate": 5.326236003704581e-05, + "loss": 0.012601376511156559, + "num_input_tokens_seen": 130909744, + "step": 7994, + "train_runtime": 64959.2184, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 4.845454545454546, + "grad_norm": 0.006054472643882036, + "learning_rate": 5.3252764328373194e-05, + "loss": 0.011877212673425674, + "num_input_tokens_seen": 130926120, + "step": 7995, + "train_runtime": 64967.333, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 4.846060606060606, + "grad_norm": 0.0072673917748034, + "learning_rate": 5.324316849938715e-05, + "loss": 0.011671725660562515, + "num_input_tokens_seen": 130942496, + "step": 7996, + "train_runtime": 64975.4407, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 4.846666666666667, + "grad_norm": 0.00534982280805707, + "learning_rate": 5.3233572550442634e-05, + "loss": 0.011874779127538204, + "num_input_tokens_seen": 130958872, + "step": 7997, + "train_runtime": 64983.5511, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 4.847272727272728, + "grad_norm": 0.0050572482869029045, + "learning_rate": 5.3223976481894545e-05, + "loss": 0.011770452372729778, + "num_input_tokens_seen": 130975248, + "step": 7998, + "train_runtime": 64991.6608, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 4.847878787878788, + "grad_norm": 0.0054545714519917965, + "learning_rate": 5.321438029409786e-05, + "loss": 0.012070531025528908, + "num_input_tokens_seen": 130991624, + "step": 7999, + "train_runtime": 64999.7706, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 4.848484848484849, + "grad_norm": 0.004684649407863617, + "learning_rate": 5.3204783987407506e-05, + "loss": 0.011580481193959713, + "num_input_tokens_seen": 131008000, + "step": 8000, + "train_runtime": 65007.8802, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 4.84909090909091, + "grad_norm": 0.009122513234615326, + "learning_rate": 5.319518756217843e-05, + "loss": 0.011845143511891365, + "num_input_tokens_seen": 131024376, + "step": 8001, + "train_runtime": 65016.9684, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.84969696969697, + "grad_norm": 0.006773280445486307, + "learning_rate": 5.318559101876558e-05, + "loss": 0.011739861220121384, + "num_input_tokens_seen": 131040752, + "step": 8002, + "train_runtime": 65025.0744, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 4.850303030303031, + "grad_norm": 0.0064363032579422, + "learning_rate": 5.317599435752393e-05, + "loss": 0.010756755247712135, + "num_input_tokens_seen": 131057128, + "step": 8003, + "train_runtime": 65033.1851, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.850909090909091, + "grad_norm": 0.006088779773563147, + "learning_rate": 5.3166397578808436e-05, + "loss": 0.012192950583994389, + "num_input_tokens_seen": 131073504, + "step": 8004, + "train_runtime": 65041.2989, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.851515151515152, + "grad_norm": 0.011268655769526958, + "learning_rate": 5.315680068297406e-05, + "loss": 0.01282171905040741, + "num_input_tokens_seen": 131089880, + "step": 8005, + "train_runtime": 65049.4132, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 4.8521212121212125, + "grad_norm": 0.007705950643867254, + "learning_rate": 5.314720367037577e-05, + "loss": 0.01219512801617384, + "num_input_tokens_seen": 131106256, + "step": 8006, + "train_runtime": 65057.5296, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.852727272727273, + "grad_norm": 0.00904946681112051, + "learning_rate": 5.313760654136856e-05, + "loss": 0.011855382472276688, + "num_input_tokens_seen": 131122632, + "step": 8007, + "train_runtime": 65065.6399, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 4.8533333333333335, + "grad_norm": 0.006815928500145674, + "learning_rate": 5.312800929630738e-05, + "loss": 0.012605056166648865, + "num_input_tokens_seen": 131139008, + "step": 8008, + "train_runtime": 65073.7507, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.8539393939393936, + "grad_norm": 0.006482995580881834, + "learning_rate": 5.311841193554723e-05, + "loss": 0.011864868924021721, + "num_input_tokens_seen": 131155384, + "step": 8009, + "train_runtime": 65081.8635, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 4.8545454545454545, + "grad_norm": 0.007762098219245672, + "learning_rate": 5.31088144594431e-05, + "loss": 0.013511043041944504, + "num_input_tokens_seen": 131171760, + "step": 8010, + "train_runtime": 65089.9725, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.855151515151515, + "grad_norm": 0.005980928428471088, + "learning_rate": 5.3099216868349966e-05, + "loss": 0.01223065610975027, + "num_input_tokens_seen": 131188136, + "step": 8011, + "train_runtime": 65098.082, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 4.8557575757575755, + "grad_norm": 0.006976263597607613, + "learning_rate": 5.308961916262285e-05, + "loss": 0.011290223337709904, + "num_input_tokens_seen": 131204512, + "step": 8012, + "train_runtime": 65106.1938, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.856363636363636, + "grad_norm": 0.008299719542264938, + "learning_rate": 5.3080021342616734e-05, + "loss": 0.011999385431408882, + "num_input_tokens_seen": 131220888, + "step": 8013, + "train_runtime": 65114.3017, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 4.856969696969697, + "grad_norm": 0.008607955649495125, + "learning_rate": 5.307042340868662e-05, + "loss": 0.01192860770970583, + "num_input_tokens_seen": 131237264, + "step": 8014, + "train_runtime": 65122.4091, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.857575757575757, + "grad_norm": 0.008992774412035942, + "learning_rate": 5.306082536118753e-05, + "loss": 0.012305804528295994, + "num_input_tokens_seen": 131253640, + "step": 8015, + "train_runtime": 65130.5196, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 4.858181818181818, + "grad_norm": 0.00709897093474865, + "learning_rate": 5.305122720047446e-05, + "loss": 0.012411996722221375, + "num_input_tokens_seen": 131270016, + "step": 8016, + "train_runtime": 65138.631, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.858787878787878, + "grad_norm": 0.005716558080166578, + "learning_rate": 5.304162892690244e-05, + "loss": 0.011575439013540745, + "num_input_tokens_seen": 131286392, + "step": 8017, + "train_runtime": 65146.7398, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 4.859393939393939, + "grad_norm": 0.004610141739249229, + "learning_rate": 5.3032030540826503e-05, + "loss": 0.011837982572615147, + "num_input_tokens_seen": 131302768, + "step": 8018, + "train_runtime": 65154.8503, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.86, + "grad_norm": 0.005796570330858231, + "learning_rate": 5.3022432042601654e-05, + "loss": 0.011446746997535229, + "num_input_tokens_seen": 131319144, + "step": 8019, + "train_runtime": 65162.9631, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 4.86060606060606, + "grad_norm": 0.003428853815421462, + "learning_rate": 5.301283343258293e-05, + "loss": 0.012079784646630287, + "num_input_tokens_seen": 131335520, + "step": 8020, + "train_runtime": 65171.0718, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.861212121212121, + "grad_norm": 0.0027451820205897093, + "learning_rate": 5.3003234711125346e-05, + "loss": 0.012217097915709019, + "num_input_tokens_seen": 131351896, + "step": 8021, + "train_runtime": 65179.1813, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 4.861818181818181, + "grad_norm": 0.004095582757145166, + "learning_rate": 5.299363587858399e-05, + "loss": 0.011478072963654995, + "num_input_tokens_seen": 131368272, + "step": 8022, + "train_runtime": 65187.2891, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.862424242424242, + "grad_norm": 0.007695556618273258, + "learning_rate": 5.298403693531385e-05, + "loss": 0.01179034449160099, + "num_input_tokens_seen": 131384648, + "step": 8023, + "train_runtime": 65195.3991, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 4.863030303030303, + "grad_norm": 0.005959813483059406, + "learning_rate": 5.2974437881670015e-05, + "loss": 0.010823999531567097, + "num_input_tokens_seen": 131401024, + "step": 8024, + "train_runtime": 65203.5082, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.863636363636363, + "grad_norm": 0.004662640392780304, + "learning_rate": 5.296483871800749e-05, + "loss": 0.012889439240098, + "num_input_tokens_seen": 131417400, + "step": 8025, + "train_runtime": 65211.6208, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 4.864242424242424, + "grad_norm": 0.006192891858518124, + "learning_rate": 5.295523944468137e-05, + "loss": 0.01241276878863573, + "num_input_tokens_seen": 131433776, + "step": 8026, + "train_runtime": 65219.7315, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.864848484848485, + "grad_norm": 0.05126989632844925, + "learning_rate": 5.2945640062046694e-05, + "loss": 0.013079440221190453, + "num_input_tokens_seen": 131450152, + "step": 8027, + "train_runtime": 65227.8434, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 4.865454545454545, + "grad_norm": 0.009196233004331589, + "learning_rate": 5.293604057045854e-05, + "loss": 0.012443510815501213, + "num_input_tokens_seen": 131466528, + "step": 8028, + "train_runtime": 65235.9526, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.866060606060606, + "grad_norm": 0.00664319284260273, + "learning_rate": 5.292644097027195e-05, + "loss": 0.011607900261878967, + "num_input_tokens_seen": 131482904, + "step": 8029, + "train_runtime": 65244.0615, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 4.866666666666667, + "grad_norm": 0.00743938609957695, + "learning_rate": 5.291684126184201e-05, + "loss": 0.012245683930814266, + "num_input_tokens_seen": 131499280, + "step": 8030, + "train_runtime": 65252.1729, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.867272727272727, + "grad_norm": 0.006169603206217289, + "learning_rate": 5.290724144552379e-05, + "loss": 0.012830356135964394, + "num_input_tokens_seen": 131515656, + "step": 8031, + "train_runtime": 65260.2837, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.867878787878788, + "grad_norm": 0.005744394846260548, + "learning_rate": 5.289764152167238e-05, + "loss": 0.012334663420915604, + "num_input_tokens_seen": 131532032, + "step": 8032, + "train_runtime": 65268.3932, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.868484848484848, + "grad_norm": 0.0067197829484939575, + "learning_rate": 5.288804149064285e-05, + "loss": 0.011851930990815163, + "num_input_tokens_seen": 131548408, + "step": 8033, + "train_runtime": 65276.5016, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.869090909090909, + "grad_norm": 0.019180642440915108, + "learning_rate": 5.287844135279028e-05, + "loss": 0.011255649849772453, + "num_input_tokens_seen": 131564784, + "step": 8034, + "train_runtime": 65284.613, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.86969696969697, + "grad_norm": 0.007505272049456835, + "learning_rate": 5.286884110846978e-05, + "loss": 0.011680633760988712, + "num_input_tokens_seen": 131581160, + "step": 8035, + "train_runtime": 65292.7312, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.87030303030303, + "grad_norm": 0.011384771205484867, + "learning_rate": 5.2859240758036435e-05, + "loss": 0.012036658823490143, + "num_input_tokens_seen": 131597536, + "step": 8036, + "train_runtime": 65300.84, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.870909090909091, + "grad_norm": 0.009010239504277706, + "learning_rate": 5.284964030184534e-05, + "loss": 0.01298827026039362, + "num_input_tokens_seen": 131613912, + "step": 8037, + "train_runtime": 65308.9502, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.871515151515151, + "grad_norm": 0.004034006502479315, + "learning_rate": 5.284003974025159e-05, + "loss": 0.013343521393835545, + "num_input_tokens_seen": 131630288, + "step": 8038, + "train_runtime": 65317.0655, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.872121212121212, + "grad_norm": 0.005528433248400688, + "learning_rate": 5.28304390736103e-05, + "loss": 0.01097903586924076, + "num_input_tokens_seen": 131646664, + "step": 8039, + "train_runtime": 65325.1734, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.872727272727273, + "grad_norm": 0.008882706053555012, + "learning_rate": 5.282083830227659e-05, + "loss": 0.012803968042135239, + "num_input_tokens_seen": 131663040, + "step": 8040, + "train_runtime": 65333.2825, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.873333333333333, + "grad_norm": 0.008353727869689465, + "learning_rate": 5.281123742660558e-05, + "loss": 0.011988398618996143, + "num_input_tokens_seen": 131679416, + "step": 8041, + "train_runtime": 65341.3932, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.873939393939394, + "grad_norm": 0.004823810420930386, + "learning_rate": 5.280163644695234e-05, + "loss": 0.012909186072647572, + "num_input_tokens_seen": 131695792, + "step": 8042, + "train_runtime": 65349.5068, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.874545454545455, + "grad_norm": 0.003440561005845666, + "learning_rate": 5.2792035363672045e-05, + "loss": 0.011355619877576828, + "num_input_tokens_seen": 131712168, + "step": 8043, + "train_runtime": 65357.6148, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.875151515151515, + "grad_norm": 0.004587062634527683, + "learning_rate": 5.278243417711979e-05, + "loss": 0.01260740589350462, + "num_input_tokens_seen": 131728544, + "step": 8044, + "train_runtime": 65365.7354, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.875757575757576, + "grad_norm": 0.009701283648610115, + "learning_rate": 5.2772832887650716e-05, + "loss": 0.013316280208528042, + "num_input_tokens_seen": 131744920, + "step": 8045, + "train_runtime": 65373.8499, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.876363636363636, + "grad_norm": 0.0027940247673541307, + "learning_rate": 5.276323149561996e-05, + "loss": 0.011508207768201828, + "num_input_tokens_seen": 131761296, + "step": 8046, + "train_runtime": 65381.9615, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.876969696969697, + "grad_norm": 0.010270636528730392, + "learning_rate": 5.275363000138265e-05, + "loss": 0.011770247481763363, + "num_input_tokens_seen": 131777672, + "step": 8047, + "train_runtime": 65390.0703, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.877575757575758, + "grad_norm": 0.006983809173107147, + "learning_rate": 5.2744028405293913e-05, + "loss": 0.012792632915079594, + "num_input_tokens_seen": 131794048, + "step": 8048, + "train_runtime": 65398.1804, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.878181818181818, + "grad_norm": 0.005246767308562994, + "learning_rate": 5.2734426707708915e-05, + "loss": 0.010900158435106277, + "num_input_tokens_seen": 131810424, + "step": 8049, + "train_runtime": 65406.2903, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.878787878787879, + "grad_norm": 0.00010616699000820518, + "learning_rate": 5.2724824908982815e-05, + "loss": 0.01181704830378294, + "num_input_tokens_seen": 131826800, + "step": 8050, + "train_runtime": 65414.4014, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.879393939393939, + "grad_norm": 0.008218426257371902, + "learning_rate": 5.271522300947074e-05, + "loss": 0.01120320986956358, + "num_input_tokens_seen": 131843176, + "step": 8051, + "train_runtime": 65422.5083, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.88, + "grad_norm": 0.007357695139944553, + "learning_rate": 5.2705621009527836e-05, + "loss": 0.012410789728164673, + "num_input_tokens_seen": 131859552, + "step": 8052, + "train_runtime": 65430.6195, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 4.880606060606061, + "grad_norm": 0.005873918998986483, + "learning_rate": 5.2696018909509306e-05, + "loss": 0.011498578824102879, + "num_input_tokens_seen": 131875928, + "step": 8053, + "train_runtime": 65438.7318, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 4.881212121212121, + "grad_norm": 0.009457487612962723, + "learning_rate": 5.268641670977027e-05, + "loss": 0.013002699241042137, + "num_input_tokens_seen": 131892304, + "step": 8054, + "train_runtime": 65446.8407, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 4.881818181818182, + "grad_norm": 0.010669486597180367, + "learning_rate": 5.267681441066592e-05, + "loss": 0.012837840244174004, + "num_input_tokens_seen": 131908680, + "step": 8055, + "train_runtime": 65454.9493, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 4.882424242424243, + "grad_norm": 0.0058538177981972694, + "learning_rate": 5.266721201255142e-05, + "loss": 0.011773714795708656, + "num_input_tokens_seen": 131925056, + "step": 8056, + "train_runtime": 65463.0588, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 4.883030303030303, + "grad_norm": 0.008141196332871914, + "learning_rate": 5.265760951578192e-05, + "loss": 0.011516539379954338, + "num_input_tokens_seen": 131941432, + "step": 8057, + "train_runtime": 65471.1699, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 4.883636363636364, + "grad_norm": 0.0017173320520669222, + "learning_rate": 5.264800692071265e-05, + "loss": 0.011999749578535557, + "num_input_tokens_seen": 131957808, + "step": 8058, + "train_runtime": 65479.2786, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 4.884242424242425, + "grad_norm": 0.009480279870331287, + "learning_rate": 5.2638404227698745e-05, + "loss": 0.011842275969684124, + "num_input_tokens_seen": 131974184, + "step": 8059, + "train_runtime": 65487.388, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 4.884848484848485, + "grad_norm": 0.002993684494867921, + "learning_rate": 5.262880143709541e-05, + "loss": 0.010945526883006096, + "num_input_tokens_seen": 131990560, + "step": 8060, + "train_runtime": 65495.4984, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 4.885454545454546, + "grad_norm": 0.005822370294481516, + "learning_rate": 5.261919854925782e-05, + "loss": 0.01167953573167324, + "num_input_tokens_seen": 132006936, + "step": 8061, + "train_runtime": 65503.6066, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 4.886060606060606, + "grad_norm": 0.0064583225175738335, + "learning_rate": 5.260959556454118e-05, + "loss": 0.010922025889158249, + "num_input_tokens_seen": 132023312, + "step": 8062, + "train_runtime": 65511.7159, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 4.886666666666667, + "grad_norm": 0.00844068918377161, + "learning_rate": 5.2599992483300685e-05, + "loss": 0.012874438427388668, + "num_input_tokens_seen": 132039688, + "step": 8063, + "train_runtime": 65519.8308, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 4.887272727272728, + "grad_norm": 0.00677742762491107, + "learning_rate": 5.259038930589154e-05, + "loss": 0.012407263740897179, + "num_input_tokens_seen": 132056064, + "step": 8064, + "train_runtime": 65527.9386, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 4.887878787878788, + "grad_norm": 0.004155490081757307, + "learning_rate": 5.2580786032668914e-05, + "loss": 0.01261073537170887, + "num_input_tokens_seen": 132072440, + "step": 8065, + "train_runtime": 65536.0494, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 4.888484848484849, + "grad_norm": 0.005315873771905899, + "learning_rate": 5.257118266398806e-05, + "loss": 0.011724649928510189, + "num_input_tokens_seen": 132088816, + "step": 8066, + "train_runtime": 65544.156, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.889090909090909, + "grad_norm": 0.009931445121765137, + "learning_rate": 5.2561579200204156e-05, + "loss": 0.011667370796203613, + "num_input_tokens_seen": 132105192, + "step": 8067, + "train_runtime": 65552.2659, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.88969696969697, + "grad_norm": 0.0047648572362959385, + "learning_rate": 5.2551975641672444e-05, + "loss": 0.01221371628344059, + "num_input_tokens_seen": 132121568, + "step": 8068, + "train_runtime": 65560.3784, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.890303030303031, + "grad_norm": 0.007149063516408205, + "learning_rate": 5.2542371988748106e-05, + "loss": 0.011925785802304745, + "num_input_tokens_seen": 132137944, + "step": 8069, + "train_runtime": 65568.4875, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 4.890909090909091, + "grad_norm": 0.008924593217670918, + "learning_rate": 5.253276824178638e-05, + "loss": 0.012399137951433659, + "num_input_tokens_seen": 132154320, + "step": 8070, + "train_runtime": 65576.5957, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 4.891515151515152, + "grad_norm": 0.006806310266256332, + "learning_rate": 5.252316440114249e-05, + "loss": 0.011360768228769302, + "num_input_tokens_seen": 132170696, + "step": 8071, + "train_runtime": 65584.7062, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 4.8921212121212125, + "grad_norm": 0.0028571945149451494, + "learning_rate": 5.2513560467171666e-05, + "loss": 0.011292332783341408, + "num_input_tokens_seen": 132187072, + "step": 8072, + "train_runtime": 65592.8147, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 4.892727272727273, + "grad_norm": 0.00656003225594759, + "learning_rate": 5.250395644022913e-05, + "loss": 0.011937571689486504, + "num_input_tokens_seen": 132203448, + "step": 8073, + "train_runtime": 65600.9303, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 4.8933333333333335, + "grad_norm": 0.004083471372723579, + "learning_rate": 5.2494352320670125e-05, + "loss": 0.010419820435345173, + "num_input_tokens_seen": 132219824, + "step": 8074, + "train_runtime": 65609.0437, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 4.893939393939394, + "grad_norm": 0.010222554206848145, + "learning_rate": 5.248474810884988e-05, + "loss": 0.013369755819439888, + "num_input_tokens_seen": 132236200, + "step": 8075, + "train_runtime": 65617.1582, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 4.8945454545454545, + "grad_norm": 0.0072077359072864056, + "learning_rate": 5.247514380512365e-05, + "loss": 0.012270111590623856, + "num_input_tokens_seen": 132252576, + "step": 8076, + "train_runtime": 65625.267, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 4.8951515151515155, + "grad_norm": 0.005681944079697132, + "learning_rate": 5.246553940984668e-05, + "loss": 0.0111971041187644, + "num_input_tokens_seen": 132268952, + "step": 8077, + "train_runtime": 65633.3777, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 4.8957575757575755, + "grad_norm": 0.0068212710320949554, + "learning_rate": 5.24559349233742e-05, + "loss": 0.01299622654914856, + "num_input_tokens_seen": 132285328, + "step": 8078, + "train_runtime": 65641.4867, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 4.8963636363636365, + "grad_norm": 0.009923087432980537, + "learning_rate": 5.244633034606146e-05, + "loss": 0.013858644291758537, + "num_input_tokens_seen": 132301704, + "step": 8079, + "train_runtime": 65649.5975, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 4.8969696969696965, + "grad_norm": 0.003800388891249895, + "learning_rate": 5.243672567826372e-05, + "loss": 0.01211210060864687, + "num_input_tokens_seen": 132318080, + "step": 8080, + "train_runtime": 65657.7065, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 4.8975757575757575, + "grad_norm": 0.0073224143125116825, + "learning_rate": 5.242712092033626e-05, + "loss": 0.01261336263269186, + "num_input_tokens_seen": 132334456, + "step": 8081, + "train_runtime": 65665.8149, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 4.898181818181818, + "grad_norm": 0.009549190290272236, + "learning_rate": 5.241751607263432e-05, + "loss": 0.013199474662542343, + "num_input_tokens_seen": 132350832, + "step": 8082, + "train_runtime": 65673.9305, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 4.8987878787878785, + "grad_norm": 0.006586884148418903, + "learning_rate": 5.240791113551316e-05, + "loss": 0.012012353166937828, + "num_input_tokens_seen": 132367208, + "step": 8083, + "train_runtime": 65682.0434, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 4.899393939393939, + "grad_norm": 0.005681970622390509, + "learning_rate": 5.239830610932805e-05, + "loss": 0.01188092865049839, + "num_input_tokens_seen": 132383584, + "step": 8084, + "train_runtime": 65690.1553, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 4.9, + "grad_norm": 0.0056317877024412155, + "learning_rate": 5.2388700994434294e-05, + "loss": 0.012417290359735489, + "num_input_tokens_seen": 132399960, + "step": 8085, + "train_runtime": 65698.2644, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 4.90060606060606, + "grad_norm": 0.007313921581953764, + "learning_rate": 5.2379095791187124e-05, + "loss": 0.011680453084409237, + "num_input_tokens_seen": 132416336, + "step": 8086, + "train_runtime": 65706.3767, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 4.901212121212121, + "grad_norm": 0.009533536620438099, + "learning_rate": 5.236949049994183e-05, + "loss": 0.011770877055823803, + "num_input_tokens_seen": 132432712, + "step": 8087, + "train_runtime": 65714.4883, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 4.901818181818182, + "grad_norm": 0.0035145781002938747, + "learning_rate": 5.23598851210537e-05, + "loss": 0.010850721038877964, + "num_input_tokens_seen": 132449088, + "step": 8088, + "train_runtime": 65722.599, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 4.902424242424242, + "grad_norm": 0.011024806648492813, + "learning_rate": 5.235027965487802e-05, + "loss": 0.012838889844715595, + "num_input_tokens_seen": 132465464, + "step": 8089, + "train_runtime": 65730.71, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 4.903030303030303, + "grad_norm": 0.007675396744161844, + "learning_rate": 5.234067410177006e-05, + "loss": 0.01224182266741991, + "num_input_tokens_seen": 132481840, + "step": 8090, + "train_runtime": 65738.8206, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 4.903636363636363, + "grad_norm": 0.005004181060940027, + "learning_rate": 5.233106846208514e-05, + "loss": 0.01205337792634964, + "num_input_tokens_seen": 132498216, + "step": 8091, + "train_runtime": 65746.9341, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 4.904242424242424, + "grad_norm": 0.008477747440338135, + "learning_rate": 5.2321462736178527e-05, + "loss": 0.012498623691499233, + "num_input_tokens_seen": 132514592, + "step": 8092, + "train_runtime": 65755.0448, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 4.904848484848485, + "grad_norm": 0.006680623162537813, + "learning_rate": 5.2311856924405534e-05, + "loss": 0.01183308009058237, + "num_input_tokens_seen": 132530968, + "step": 8093, + "train_runtime": 65763.1558, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 4.905454545454545, + "grad_norm": 0.0018793331691995263, + "learning_rate": 5.230225102712144e-05, + "loss": 0.012400808744132519, + "num_input_tokens_seen": 132547344, + "step": 8094, + "train_runtime": 65771.2672, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 4.906060606060606, + "grad_norm": 0.004927518777549267, + "learning_rate": 5.229264504468159e-05, + "loss": 0.01207792479544878, + "num_input_tokens_seen": 132563720, + "step": 8095, + "train_runtime": 65779.3785, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 4.906666666666666, + "grad_norm": 0.008920213207602501, + "learning_rate": 5.228303897744124e-05, + "loss": 0.012407374568283558, + "num_input_tokens_seen": 132580096, + "step": 8096, + "train_runtime": 65787.4892, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 4.907272727272727, + "grad_norm": 0.005075983703136444, + "learning_rate": 5.2273432825755745e-05, + "loss": 0.01059174444526434, + "num_input_tokens_seen": 132596472, + "step": 8097, + "train_runtime": 65795.6007, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 4.907878787878788, + "grad_norm": 0.008156340569257736, + "learning_rate": 5.226382658998038e-05, + "loss": 0.012693047523498535, + "num_input_tokens_seen": 132612848, + "step": 8098, + "train_runtime": 65803.7119, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 4.908484848484848, + "grad_norm": 0.006721834652125835, + "learning_rate": 5.2254220270470475e-05, + "loss": 0.011653993278741837, + "num_input_tokens_seen": 132629224, + "step": 8099, + "train_runtime": 65811.8357, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 4.909090909090909, + "grad_norm": 0.007422847207635641, + "learning_rate": 5.2244613867581384e-05, + "loss": 0.01147537399083376, + "num_input_tokens_seen": 132645600, + "step": 8100, + "train_runtime": 65819.9486, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 4.90969696969697, + "grad_norm": 0.015262533910572529, + "learning_rate": 5.223500738166837e-05, + "loss": 0.014264903031289577, + "num_input_tokens_seen": 132661976, + "step": 8101, + "train_runtime": 65829.0919, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 4.91030303030303, + "grad_norm": 0.006853677798062563, + "learning_rate": 5.222540081308679e-05, + "loss": 0.011980346404016018, + "num_input_tokens_seen": 132678352, + "step": 8102, + "train_runtime": 65837.2003, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.910909090909091, + "grad_norm": 0.005552751012146473, + "learning_rate": 5.2215794162191955e-05, + "loss": 0.01125369779765606, + "num_input_tokens_seen": 132694728, + "step": 8103, + "train_runtime": 65845.313, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 4.911515151515151, + "grad_norm": 0.007500201929360628, + "learning_rate": 5.220618742933923e-05, + "loss": 0.011858447454869747, + "num_input_tokens_seen": 132711104, + "step": 8104, + "train_runtime": 65853.4305, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.912121212121212, + "grad_norm": 0.011710080318152905, + "learning_rate": 5.2196580614883916e-05, + "loss": 0.012466363608837128, + "num_input_tokens_seen": 132727480, + "step": 8105, + "train_runtime": 65861.5431, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 4.912727272727273, + "grad_norm": 0.008549893274903297, + "learning_rate": 5.2186973719181374e-05, + "loss": 0.01274811290204525, + "num_input_tokens_seen": 132743856, + "step": 8106, + "train_runtime": 65869.6512, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.913333333333333, + "grad_norm": 0.006396854761987925, + "learning_rate": 5.217736674258692e-05, + "loss": 0.011344035156071186, + "num_input_tokens_seen": 132760232, + "step": 8107, + "train_runtime": 65877.7625, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.913939393939394, + "grad_norm": 0.010352475568652153, + "learning_rate": 5.216775968545592e-05, + "loss": 0.012297993525862694, + "num_input_tokens_seen": 132776608, + "step": 8108, + "train_runtime": 65885.8752, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 4.914545454545454, + "grad_norm": 0.007753721438348293, + "learning_rate": 5.215815254814371e-05, + "loss": 0.013250057585537434, + "num_input_tokens_seen": 132792984, + "step": 8109, + "train_runtime": 65893.9841, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 4.915151515151515, + "grad_norm": 0.007723256479948759, + "learning_rate": 5.2148545331005636e-05, + "loss": 0.013771533966064453, + "num_input_tokens_seen": 132809360, + "step": 8110, + "train_runtime": 65902.0916, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.915757575757576, + "grad_norm": 0.0053679486736655235, + "learning_rate": 5.213893803439705e-05, + "loss": 0.01226874440908432, + "num_input_tokens_seen": 132825736, + "step": 8111, + "train_runtime": 65910.2008, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 4.916363636363636, + "grad_norm": 0.0034910517279058695, + "learning_rate": 5.212933065867332e-05, + "loss": 0.011916126124560833, + "num_input_tokens_seen": 132842112, + "step": 8112, + "train_runtime": 65918.3113, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.916969696969697, + "grad_norm": 0.007577541284263134, + "learning_rate": 5.21197232041898e-05, + "loss": 0.01402321457862854, + "num_input_tokens_seen": 132858488, + "step": 8113, + "train_runtime": 65926.4298, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.917575757575758, + "grad_norm": 0.005858317948877811, + "learning_rate": 5.211011567130184e-05, + "loss": 0.012297302484512329, + "num_input_tokens_seen": 132874864, + "step": 8114, + "train_runtime": 65934.5385, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 4.918181818181818, + "grad_norm": 0.006826257798820734, + "learning_rate": 5.210050806036481e-05, + "loss": 0.012248987331986427, + "num_input_tokens_seen": 132891240, + "step": 8115, + "train_runtime": 65942.6487, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.918787878787879, + "grad_norm": 0.009027857333421707, + "learning_rate": 5.2090900371734076e-05, + "loss": 0.01258261688053608, + "num_input_tokens_seen": 132907616, + "step": 8116, + "train_runtime": 65950.7596, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 4.91939393939394, + "grad_norm": 0.008358320221304893, + "learning_rate": 5.2081292605764994e-05, + "loss": 0.011128597892820835, + "num_input_tokens_seen": 132923992, + "step": 8117, + "train_runtime": 65958.8709, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.92, + "grad_norm": 0.006578968837857246, + "learning_rate": 5.207168476281299e-05, + "loss": 0.012126540765166283, + "num_input_tokens_seen": 132940368, + "step": 8118, + "train_runtime": 65966.9812, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 4.920606060606061, + "grad_norm": 0.006880009546875954, + "learning_rate": 5.2062076843233366e-05, + "loss": 0.012543991208076477, + "num_input_tokens_seen": 132956744, + "step": 8119, + "train_runtime": 65975.0939, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.921212121212121, + "grad_norm": 0.00601153913885355, + "learning_rate": 5.2052468847381555e-05, + "loss": 0.01195676252245903, + "num_input_tokens_seen": 132973120, + "step": 8120, + "train_runtime": 65983.2046, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 4.921818181818182, + "grad_norm": 0.006278615444898605, + "learning_rate": 5.204286077561291e-05, + "loss": 0.011967274360358715, + "num_input_tokens_seen": 132989496, + "step": 8121, + "train_runtime": 65991.3143, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 4.922424242424242, + "grad_norm": 0.010382657870650291, + "learning_rate": 5.2033252628282823e-05, + "loss": 0.011205856688320637, + "num_input_tokens_seen": 133005872, + "step": 8122, + "train_runtime": 65999.4302, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 4.923030303030303, + "grad_norm": 0.003974278457462788, + "learning_rate": 5.20236444057467e-05, + "loss": 0.011213712394237518, + "num_input_tokens_seen": 133022248, + "step": 8123, + "train_runtime": 66007.5414, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 4.923636363636364, + "grad_norm": 0.008477027527987957, + "learning_rate": 5.2014036108359873e-05, + "loss": 0.013020697049796581, + "num_input_tokens_seen": 133038624, + "step": 8124, + "train_runtime": 66015.6518, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 4.924242424242424, + "grad_norm": 0.00209442968480289, + "learning_rate": 5.20044277364778e-05, + "loss": 0.011622976511716843, + "num_input_tokens_seen": 133055000, + "step": 8125, + "train_runtime": 66023.764, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 4.924848484848485, + "grad_norm": 0.014488437213003635, + "learning_rate": 5.199481929045584e-05, + "loss": 0.013385072350502014, + "num_input_tokens_seen": 133071376, + "step": 8126, + "train_runtime": 66031.8749, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 4.925454545454546, + "grad_norm": 0.0071202171966433525, + "learning_rate": 5.1985210770649406e-05, + "loss": 0.012266871519386768, + "num_input_tokens_seen": 133087752, + "step": 8127, + "train_runtime": 66039.9853, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 4.926060606060606, + "grad_norm": 0.003775449236854911, + "learning_rate": 5.1975602177413886e-05, + "loss": 0.011984240263700485, + "num_input_tokens_seen": 133104128, + "step": 8128, + "train_runtime": 66048.0948, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 4.926666666666667, + "grad_norm": 0.007280190009623766, + "learning_rate": 5.196599351110469e-05, + "loss": 0.011978131718933582, + "num_input_tokens_seen": 133120504, + "step": 8129, + "train_runtime": 66056.2031, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 4.927272727272728, + "grad_norm": 0.005246171727776527, + "learning_rate": 5.195638477207722e-05, + "loss": 0.013196904212236404, + "num_input_tokens_seen": 133136880, + "step": 8130, + "train_runtime": 66064.3139, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 4.927878787878788, + "grad_norm": 0.007404879666864872, + "learning_rate": 5.1946775960686887e-05, + "loss": 0.01205759309232235, + "num_input_tokens_seen": 133153256, + "step": 8131, + "train_runtime": 66072.4302, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 4.928484848484849, + "grad_norm": 0.006935220677405596, + "learning_rate": 5.1937167077289105e-05, + "loss": 0.011360076256096363, + "num_input_tokens_seen": 133169632, + "step": 8132, + "train_runtime": 66080.5398, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 4.929090909090909, + "grad_norm": 0.004061101470142603, + "learning_rate": 5.1927558122239295e-05, + "loss": 0.012517943046987057, + "num_input_tokens_seen": 133186008, + "step": 8133, + "train_runtime": 66088.6497, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 4.92969696969697, + "grad_norm": 0.013044832274317741, + "learning_rate": 5.191794909589285e-05, + "loss": 0.012207560241222382, + "num_input_tokens_seen": 133202384, + "step": 8134, + "train_runtime": 66096.7601, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 4.930303030303031, + "grad_norm": 0.0071950689889490604, + "learning_rate": 5.19083399986052e-05, + "loss": 0.012698134407401085, + "num_input_tokens_seen": 133218760, + "step": 8135, + "train_runtime": 66104.8693, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 4.930909090909091, + "grad_norm": 0.008812281303107738, + "learning_rate": 5.189873083073178e-05, + "loss": 0.011813884600996971, + "num_input_tokens_seen": 133235136, + "step": 8136, + "train_runtime": 66112.9769, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.931515151515152, + "grad_norm": 0.0033081581350415945, + "learning_rate": 5.188912159262801e-05, + "loss": 0.011667264625430107, + "num_input_tokens_seen": 133251512, + "step": 8137, + "train_runtime": 66121.0881, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.932121212121212, + "grad_norm": 0.006372794974595308, + "learning_rate": 5.187951228464929e-05, + "loss": 0.01195809431374073, + "num_input_tokens_seen": 133267888, + "step": 8138, + "train_runtime": 66129.199, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.932727272727273, + "grad_norm": 0.005401939619332552, + "learning_rate": 5.186990290715109e-05, + "loss": 0.012573624961078167, + "num_input_tokens_seen": 133284264, + "step": 8139, + "train_runtime": 66137.308, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 4.933333333333334, + "grad_norm": 0.00702472822740674, + "learning_rate": 5.186029346048882e-05, + "loss": 0.010891092009842396, + "num_input_tokens_seen": 133300640, + "step": 8140, + "train_runtime": 66145.4191, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 4.933939393939394, + "grad_norm": 0.007400828413665295, + "learning_rate": 5.185068394501791e-05, + "loss": 0.01329672709107399, + "num_input_tokens_seen": 133317016, + "step": 8141, + "train_runtime": 66153.532, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 4.934545454545455, + "grad_norm": 0.007016358431428671, + "learning_rate": 5.18410743610938e-05, + "loss": 0.012692013755440712, + "num_input_tokens_seen": 133333392, + "step": 8142, + "train_runtime": 66161.6429, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 4.9351515151515155, + "grad_norm": 0.004883314482867718, + "learning_rate": 5.1831464709071956e-05, + "loss": 0.011992223560810089, + "num_input_tokens_seen": 133349768, + "step": 8143, + "train_runtime": 66169.7497, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 4.935757575757576, + "grad_norm": 0.0039023186545819044, + "learning_rate": 5.1821854989307784e-05, + "loss": 0.012127838097512722, + "num_input_tokens_seen": 133366144, + "step": 8144, + "train_runtime": 66177.8595, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 4.9363636363636365, + "grad_norm": 0.005165124777704477, + "learning_rate": 5.1812245202156754e-05, + "loss": 0.011578134261071682, + "num_input_tokens_seen": 133382520, + "step": 8145, + "train_runtime": 66185.9697, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 4.9369696969696975, + "grad_norm": 0.00545493233948946, + "learning_rate": 5.1802635347974294e-05, + "loss": 0.011704856529831886, + "num_input_tokens_seen": 133398896, + "step": 8146, + "train_runtime": 66194.0801, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 4.9375757575757575, + "grad_norm": 0.00863288901746273, + "learning_rate": 5.179302542711585e-05, + "loss": 0.013420075178146362, + "num_input_tokens_seen": 133415272, + "step": 8147, + "train_runtime": 66202.1861, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 4.9381818181818184, + "grad_norm": 0.005861412268131971, + "learning_rate": 5.178341543993691e-05, + "loss": 0.012080847285687923, + "num_input_tokens_seen": 133431648, + "step": 8148, + "train_runtime": 66210.2991, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 4.9387878787878785, + "grad_norm": 0.0040725236758589745, + "learning_rate": 5.177380538679288e-05, + "loss": 0.010626991279423237, + "num_input_tokens_seen": 133448024, + "step": 8149, + "train_runtime": 66218.4094, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 4.9393939393939394, + "grad_norm": 0.008504878729581833, + "learning_rate": 5.176419526803928e-05, + "loss": 0.011800670064985752, + "num_input_tokens_seen": 133464400, + "step": 8150, + "train_runtime": 66226.5192, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 4.9399999999999995, + "grad_norm": 0.010521248914301395, + "learning_rate": 5.17545850840315e-05, + "loss": 0.012020505033433437, + "num_input_tokens_seen": 133480776, + "step": 8151, + "train_runtime": 66234.6302, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 4.9406060606060604, + "grad_norm": 0.008279401808977127, + "learning_rate": 5.174497483512506e-05, + "loss": 0.012382610701024532, + "num_input_tokens_seen": 133497152, + "step": 8152, + "train_runtime": 66242.7402, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 4.941212121212121, + "grad_norm": 0.00851480569690466, + "learning_rate": 5.1735364521675376e-05, + "loss": 0.012233899906277657, + "num_input_tokens_seen": 133513528, + "step": 8153, + "train_runtime": 66250.8509, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 4.941818181818181, + "grad_norm": 0.006969491019845009, + "learning_rate": 5.172575414403795e-05, + "loss": 0.012245628982782364, + "num_input_tokens_seen": 133529904, + "step": 8154, + "train_runtime": 66258.9579, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 4.942424242424242, + "grad_norm": 0.0030421142000705004, + "learning_rate": 5.171614370256824e-05, + "loss": 0.010616989806294441, + "num_input_tokens_seen": 133546280, + "step": 8155, + "train_runtime": 66267.0687, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 4.943030303030303, + "grad_norm": 0.0020916603971272707, + "learning_rate": 5.170653319762172e-05, + "loss": 0.011467711068689823, + "num_input_tokens_seen": 133562656, + "step": 8156, + "train_runtime": 66275.1787, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 4.943636363636363, + "grad_norm": 0.0034624990075826645, + "learning_rate": 5.1696922629553846e-05, + "loss": 0.011449861340224743, + "num_input_tokens_seen": 133579032, + "step": 8157, + "train_runtime": 66283.2879, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 4.944242424242424, + "grad_norm": 0.004457138944417238, + "learning_rate": 5.168731199872012e-05, + "loss": 0.01155898254364729, + "num_input_tokens_seen": 133595408, + "step": 8158, + "train_runtime": 66291.3948, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 4.944848484848485, + "grad_norm": 0.025031372904777527, + "learning_rate": 5.1677701305476e-05, + "loss": 0.012928782030940056, + "num_input_tokens_seen": 133611784, + "step": 8159, + "train_runtime": 66299.5031, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 4.945454545454545, + "grad_norm": 0.010190303437411785, + "learning_rate": 5.1668090550177e-05, + "loss": 0.012587228789925575, + "num_input_tokens_seen": 133628160, + "step": 8160, + "train_runtime": 66307.611, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 4.946060606060606, + "grad_norm": 0.006739833392202854, + "learning_rate": 5.165847973317854e-05, + "loss": 0.01190144382417202, + "num_input_tokens_seen": 133644536, + "step": 8161, + "train_runtime": 66315.7176, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 4.946666666666666, + "grad_norm": 0.006204592064023018, + "learning_rate": 5.164886885483617e-05, + "loss": 0.010971846990287304, + "num_input_tokens_seen": 133660912, + "step": 8162, + "train_runtime": 66323.8304, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 4.947272727272727, + "grad_norm": 0.005030219443142414, + "learning_rate": 5.163925791550536e-05, + "loss": 0.012612380087375641, + "num_input_tokens_seen": 133677288, + "step": 8163, + "train_runtime": 66331.9428, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 4.947878787878788, + "grad_norm": 0.004897041246294975, + "learning_rate": 5.162964691554157e-05, + "loss": 0.012082865461707115, + "num_input_tokens_seen": 133693664, + "step": 8164, + "train_runtime": 66340.0538, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 4.948484848484848, + "grad_norm": 0.006862598937004805, + "learning_rate": 5.162003585530033e-05, + "loss": 0.012784481979906559, + "num_input_tokens_seen": 133710040, + "step": 8165, + "train_runtime": 66348.1635, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 4.949090909090909, + "grad_norm": 0.003909309394657612, + "learning_rate": 5.1610424735137105e-05, + "loss": 0.012009684927761555, + "num_input_tokens_seen": 133726416, + "step": 8166, + "train_runtime": 66356.27, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 4.949696969696969, + "grad_norm": 0.006514808163046837, + "learning_rate": 5.160081355540741e-05, + "loss": 0.013028763234615326, + "num_input_tokens_seen": 133742792, + "step": 8167, + "train_runtime": 66364.3798, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 4.95030303030303, + "grad_norm": 0.008131210692226887, + "learning_rate": 5.159120231646675e-05, + "loss": 0.012136307545006275, + "num_input_tokens_seen": 133759168, + "step": 8168, + "train_runtime": 66372.4881, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 4.950909090909091, + "grad_norm": 0.007313882000744343, + "learning_rate": 5.158159101867061e-05, + "loss": 0.012021470814943314, + "num_input_tokens_seen": 133775544, + "step": 8169, + "train_runtime": 66380.5982, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 4.951515151515151, + "grad_norm": 0.007594621274620295, + "learning_rate": 5.157197966237448e-05, + "loss": 0.013012335635721684, + "num_input_tokens_seen": 133791920, + "step": 8170, + "train_runtime": 66388.7059, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 4.952121212121212, + "grad_norm": 0.005774260498583317, + "learning_rate": 5.15623682479339e-05, + "loss": 0.012032175436615944, + "num_input_tokens_seen": 133808296, + "step": 8171, + "train_runtime": 66396.8147, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 4.952727272727273, + "grad_norm": 0.008731472305953503, + "learning_rate": 5.155275677570437e-05, + "loss": 0.01252993755042553, + "num_input_tokens_seen": 133824672, + "step": 8172, + "train_runtime": 66404.9222, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 4.953333333333333, + "grad_norm": 0.008154506795108318, + "learning_rate": 5.154314524604138e-05, + "loss": 0.011395303532481194, + "num_input_tokens_seen": 133841048, + "step": 8173, + "train_runtime": 66413.0312, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 4.953939393939394, + "grad_norm": 0.010758318938314915, + "learning_rate": 5.153353365930045e-05, + "loss": 0.012949532829225063, + "num_input_tokens_seen": 133857424, + "step": 8174, + "train_runtime": 66421.1436, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 4.954545454545455, + "grad_norm": 0.005474520847201347, + "learning_rate": 5.15239220158371e-05, + "loss": 0.011371280066668987, + "num_input_tokens_seen": 133873800, + "step": 8175, + "train_runtime": 66429.2533, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 4.955151515151515, + "grad_norm": 0.006483131088316441, + "learning_rate": 5.1514310316006833e-05, + "loss": 0.012709837406873703, + "num_input_tokens_seen": 133890176, + "step": 8176, + "train_runtime": 66437.3616, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 4.955757575757576, + "grad_norm": 0.006264304742217064, + "learning_rate": 5.15046985601652e-05, + "loss": 0.011322492733597755, + "num_input_tokens_seen": 133906552, + "step": 8177, + "train_runtime": 66445.4719, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 4.956363636363636, + "grad_norm": 0.009905769489705563, + "learning_rate": 5.1495086748667664e-05, + "loss": 0.012310869991779327, + "num_input_tokens_seen": 133922928, + "step": 8178, + "train_runtime": 66453.5832, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 4.956969696969697, + "grad_norm": 0.005434688646346331, + "learning_rate": 5.148547488186981e-05, + "loss": 0.013554216362535954, + "num_input_tokens_seen": 133939304, + "step": 8179, + "train_runtime": 66461.6935, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 4.957575757575757, + "grad_norm": 0.006319258362054825, + "learning_rate": 5.147586296012711e-05, + "loss": 0.012179030105471611, + "num_input_tokens_seen": 133955680, + "step": 8180, + "train_runtime": 66469.8032, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 4.958181818181818, + "grad_norm": 0.007375161163508892, + "learning_rate": 5.1466250983795126e-05, + "loss": 0.012186499312520027, + "num_input_tokens_seen": 133972056, + "step": 8181, + "train_runtime": 66477.9134, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 4.958787878787879, + "grad_norm": 0.009513678029179573, + "learning_rate": 5.145663895322937e-05, + "loss": 0.012622924521565437, + "num_input_tokens_seen": 133988432, + "step": 8182, + "train_runtime": 66486.0304, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 4.959393939393939, + "grad_norm": 0.006154630798846483, + "learning_rate": 5.1447026868785385e-05, + "loss": 0.011654047295451164, + "num_input_tokens_seen": 134004808, + "step": 8183, + "train_runtime": 66494.1379, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 4.96, + "grad_norm": 0.010271712206304073, + "learning_rate": 5.143741473081868e-05, + "loss": 0.011748416349291801, + "num_input_tokens_seen": 134021184, + "step": 8184, + "train_runtime": 66502.2486, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 4.960606060606061, + "grad_norm": 0.008918428793549538, + "learning_rate": 5.142780253968481e-05, + "loss": 0.011477678082883358, + "num_input_tokens_seen": 134037560, + "step": 8185, + "train_runtime": 66510.3563, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 4.961212121212121, + "grad_norm": 0.003607304999604821, + "learning_rate": 5.141819029573931e-05, + "loss": 0.011751585640013218, + "num_input_tokens_seen": 134053936, + "step": 8186, + "train_runtime": 66518.4649, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 4.961818181818182, + "grad_norm": 0.005165799520909786, + "learning_rate": 5.1408577999337716e-05, + "loss": 0.011612402275204659, + "num_input_tokens_seen": 134070312, + "step": 8187, + "train_runtime": 66526.5713, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 4.962424242424243, + "grad_norm": 0.0059174904599785805, + "learning_rate": 5.1398965650835564e-05, + "loss": 0.01225368119776249, + "num_input_tokens_seen": 134086688, + "step": 8188, + "train_runtime": 66534.681, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 4.963030303030303, + "grad_norm": 0.012816806323826313, + "learning_rate": 5.138935325058838e-05, + "loss": 0.014541895128786564, + "num_input_tokens_seen": 134103064, + "step": 8189, + "train_runtime": 66542.7907, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 4.963636363636364, + "grad_norm": 0.005472112447023392, + "learning_rate": 5.137974079895173e-05, + "loss": 0.011519595049321651, + "num_input_tokens_seen": 134119440, + "step": 8190, + "train_runtime": 66550.8987, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 4.964242424242424, + "grad_norm": 0.006768065504729748, + "learning_rate": 5.137012829628116e-05, + "loss": 0.012073270976543427, + "num_input_tokens_seen": 134135816, + "step": 8191, + "train_runtime": 66559.0089, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 4.964848484848485, + "grad_norm": 0.003949771635234356, + "learning_rate": 5.1360515742932224e-05, + "loss": 0.013221388682723045, + "num_input_tokens_seen": 134152192, + "step": 8192, + "train_runtime": 66567.1167, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 4.965454545454546, + "grad_norm": 0.007112518884241581, + "learning_rate": 5.1350903139260434e-05, + "loss": 0.012697778642177582, + "num_input_tokens_seen": 134168568, + "step": 8193, + "train_runtime": 66575.2313, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 4.966060606060606, + "grad_norm": 0.00930185429751873, + "learning_rate": 5.1341290485621385e-05, + "loss": 0.01141928881406784, + "num_input_tokens_seen": 134184944, + "step": 8194, + "train_runtime": 66583.3417, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 4.966666666666667, + "grad_norm": 0.006451472174376249, + "learning_rate": 5.1331677782370614e-05, + "loss": 0.011151661165058613, + "num_input_tokens_seen": 134201320, + "step": 8195, + "train_runtime": 66591.4495, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 4.967272727272727, + "grad_norm": 0.005789584014564753, + "learning_rate": 5.132206502986368e-05, + "loss": 0.012418973259627819, + "num_input_tokens_seen": 134217696, + "step": 8196, + "train_runtime": 66599.5605, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 4.967878787878788, + "grad_norm": 0.009585678577423096, + "learning_rate": 5.131245222845611e-05, + "loss": 0.012659024447202682, + "num_input_tokens_seen": 134234072, + "step": 8197, + "train_runtime": 66607.6698, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 4.968484848484849, + "grad_norm": 0.006286195944994688, + "learning_rate": 5.130283937850351e-05, + "loss": 0.011674138717353344, + "num_input_tokens_seen": 134250448, + "step": 8198, + "train_runtime": 66615.7794, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 4.969090909090909, + "grad_norm": 0.007263315841555595, + "learning_rate": 5.12932264803614e-05, + "loss": 0.011815214529633522, + "num_input_tokens_seen": 134266824, + "step": 8199, + "train_runtime": 66623.889, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 4.96969696969697, + "grad_norm": 0.0065478309988975525, + "learning_rate": 5.128361353438539e-05, + "loss": 0.011465795338153839, + "num_input_tokens_seen": 134283200, + "step": 8200, + "train_runtime": 66631.9989, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 4.970303030303031, + "grad_norm": 0.005514166317880154, + "learning_rate": 5.1274000540930985e-05, + "loss": 0.011562936007976532, + "num_input_tokens_seen": 134299576, + "step": 8201, + "train_runtime": 66641.1388, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 4.970909090909091, + "grad_norm": 0.00789317674934864, + "learning_rate": 5.126438750035381e-05, + "loss": 0.013294493779540062, + "num_input_tokens_seen": 134315952, + "step": 8202, + "train_runtime": 66649.2462, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 4.971515151515152, + "grad_norm": 0.00548576470464468, + "learning_rate": 5.1254774413009374e-05, + "loss": 0.012079352512955666, + "num_input_tokens_seen": 134332328, + "step": 8203, + "train_runtime": 66657.3555, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 4.972121212121213, + "grad_norm": 0.006187865976244211, + "learning_rate": 5.1245161279253293e-05, + "loss": 0.011782792396843433, + "num_input_tokens_seen": 134348704, + "step": 8204, + "train_runtime": 66665.4642, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 4.972727272727273, + "grad_norm": 0.0029841228388249874, + "learning_rate": 5.1235548099441125e-05, + "loss": 0.012353507801890373, + "num_input_tokens_seen": 134365080, + "step": 8205, + "train_runtime": 66673.5763, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 4.973333333333334, + "grad_norm": 0.006425634957849979, + "learning_rate": 5.1225934873928425e-05, + "loss": 0.011805751360952854, + "num_input_tokens_seen": 134381456, + "step": 8206, + "train_runtime": 66681.6926, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 4.973939393939394, + "grad_norm": 0.0108135174959898, + "learning_rate": 5.1216321603070786e-05, + "loss": 0.012070368975400925, + "num_input_tokens_seen": 134397832, + "step": 8207, + "train_runtime": 66689.8108, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 4.974545454545455, + "grad_norm": 0.004920314997434616, + "learning_rate": 5.1206708287223784e-05, + "loss": 0.01146393921226263, + "num_input_tokens_seen": 134414208, + "step": 8208, + "train_runtime": 66697.9307, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 4.975151515151515, + "grad_norm": 0.005252638831734657, + "learning_rate": 5.119709492674299e-05, + "loss": 0.011063532903790474, + "num_input_tokens_seen": 134430584, + "step": 8209, + "train_runtime": 66706.0494, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 4.975757575757576, + "grad_norm": 0.008730476722121239, + "learning_rate": 5.118748152198399e-05, + "loss": 0.012078125029802322, + "num_input_tokens_seen": 134446960, + "step": 8210, + "train_runtime": 66714.168, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 4.9763636363636365, + "grad_norm": 0.006354598794132471, + "learning_rate": 5.117786807330237e-05, + "loss": 0.01305452175438404, + "num_input_tokens_seen": 134463336, + "step": 8211, + "train_runtime": 66722.2829, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 4.976969696969697, + "grad_norm": 0.00924699753522873, + "learning_rate": 5.1168254581053675e-05, + "loss": 0.01345380861312151, + "num_input_tokens_seen": 134479712, + "step": 8212, + "train_runtime": 66730.3975, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 4.9775757575757575, + "grad_norm": 0.005171971395611763, + "learning_rate": 5.115864104559355e-05, + "loss": 0.011935876682400703, + "num_input_tokens_seen": 134496088, + "step": 8213, + "train_runtime": 66738.5125, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 4.9781818181818185, + "grad_norm": 0.0066617936827242374, + "learning_rate": 5.114902746727753e-05, + "loss": 0.01221306249499321, + "num_input_tokens_seen": 134512464, + "step": 8214, + "train_runtime": 66746.6299, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 4.9787878787878785, + "grad_norm": 0.0074796779081225395, + "learning_rate": 5.113941384646122e-05, + "loss": 0.012615595012903214, + "num_input_tokens_seen": 134528840, + "step": 8215, + "train_runtime": 66754.7384, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 4.9793939393939395, + "grad_norm": 0.005864432547241449, + "learning_rate": 5.112980018350021e-05, + "loss": 0.011537251994013786, + "num_input_tokens_seen": 134545216, + "step": 8216, + "train_runtime": 66762.8491, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 4.98, + "grad_norm": 0.007792472839355469, + "learning_rate": 5.112018647875011e-05, + "loss": 0.012768533080816269, + "num_input_tokens_seen": 134561592, + "step": 8217, + "train_runtime": 66770.9583, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 4.9806060606060605, + "grad_norm": 0.006808354519307613, + "learning_rate": 5.1110572732566475e-05, + "loss": 0.011315911076962948, + "num_input_tokens_seen": 134577968, + "step": 8218, + "train_runtime": 66779.071, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 4.981212121212121, + "grad_norm": 0.010216504335403442, + "learning_rate": 5.110095894530493e-05, + "loss": 0.01276070810854435, + "num_input_tokens_seen": 134594344, + "step": 8219, + "train_runtime": 66787.1859, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 4.9818181818181815, + "grad_norm": 0.009646744467318058, + "learning_rate": 5.109134511732104e-05, + "loss": 0.011752164922654629, + "num_input_tokens_seen": 134610720, + "step": 8220, + "train_runtime": 66795.2991, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 4.982424242424242, + "grad_norm": 0.005301271099597216, + "learning_rate": 5.1081731248970435e-05, + "loss": 0.011144434101879597, + "num_input_tokens_seen": 134627096, + "step": 8221, + "train_runtime": 66803.4141, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 4.983030303030303, + "grad_norm": 0.010242871008813381, + "learning_rate": 5.1072117340608694e-05, + "loss": 0.012187977321445942, + "num_input_tokens_seen": 134643472, + "step": 8222, + "train_runtime": 66811.5327, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 4.983636363636363, + "grad_norm": 0.005926931276917458, + "learning_rate": 5.106250339259142e-05, + "loss": 0.011991791427135468, + "num_input_tokens_seen": 134659848, + "step": 8223, + "train_runtime": 66819.648, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 4.984242424242424, + "grad_norm": 0.007467166520655155, + "learning_rate": 5.1052889405274207e-05, + "loss": 0.011754586361348629, + "num_input_tokens_seen": 134676224, + "step": 8224, + "train_runtime": 66827.7684, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 4.984848484848484, + "grad_norm": 0.008686749264597893, + "learning_rate": 5.1043275379012676e-05, + "loss": 0.012123912572860718, + "num_input_tokens_seen": 134692600, + "step": 8225, + "train_runtime": 66835.886, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 4.985454545454545, + "grad_norm": 0.006785119883716106, + "learning_rate": 5.1033661314162405e-05, + "loss": 0.012908924371004105, + "num_input_tokens_seen": 134708976, + "step": 8226, + "train_runtime": 66844.0056, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 4.986060606060606, + "grad_norm": 0.004836922977119684, + "learning_rate": 5.102404721107905e-05, + "loss": 0.01173956785351038, + "num_input_tokens_seen": 134725352, + "step": 8227, + "train_runtime": 66852.1191, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 4.986666666666666, + "grad_norm": 0.008307110518217087, + "learning_rate": 5.101443307011815e-05, + "loss": 0.012658069841563702, + "num_input_tokens_seen": 134741728, + "step": 8228, + "train_runtime": 66860.2337, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 4.987272727272727, + "grad_norm": 0.0055992272682487965, + "learning_rate": 5.100481889163535e-05, + "loss": 0.011822273954749107, + "num_input_tokens_seen": 134758104, + "step": 8229, + "train_runtime": 66868.3495, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 4.987878787878788, + "grad_norm": 0.008099223487079144, + "learning_rate": 5.0995204675986265e-05, + "loss": 0.01275227963924408, + "num_input_tokens_seen": 134774480, + "step": 8230, + "train_runtime": 66876.4617, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 4.988484848484848, + "grad_norm": 0.008597818203270435, + "learning_rate": 5.09855904235265e-05, + "loss": 0.012081300839781761, + "num_input_tokens_seen": 134790856, + "step": 8231, + "train_runtime": 66884.575, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 4.989090909090909, + "grad_norm": 0.0038482986856251955, + "learning_rate": 5.097597613461166e-05, + "loss": 0.011605646461248398, + "num_input_tokens_seen": 134807232, + "step": 8232, + "train_runtime": 66892.6903, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 4.98969696969697, + "grad_norm": 0.008430606685578823, + "learning_rate": 5.0966361809597364e-05, + "loss": 0.012093005701899529, + "num_input_tokens_seen": 134823608, + "step": 8233, + "train_runtime": 66900.8062, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 4.99030303030303, + "grad_norm": 0.0066368356347084045, + "learning_rate": 5.095674744883922e-05, + "loss": 0.01195468008518219, + "num_input_tokens_seen": 134839984, + "step": 8234, + "train_runtime": 66908.9333, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 4.990909090909091, + "grad_norm": 0.007892313413321972, + "learning_rate": 5.094713305269285e-05, + "loss": 0.01383884996175766, + "num_input_tokens_seen": 134856360, + "step": 8235, + "train_runtime": 66917.0484, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 4.991515151515151, + "grad_norm": 0.006457024719566107, + "learning_rate": 5.093751862151388e-05, + "loss": 0.012497960589826107, + "num_input_tokens_seen": 134872736, + "step": 8236, + "train_runtime": 66925.1611, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 4.992121212121212, + "grad_norm": 0.002226083306595683, + "learning_rate": 5.0927904155657925e-05, + "loss": 0.010917583480477333, + "num_input_tokens_seen": 134889112, + "step": 8237, + "train_runtime": 66933.2764, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 4.992727272727272, + "grad_norm": 0.009298913180828094, + "learning_rate": 5.09182896554806e-05, + "loss": 0.01119711622595787, + "num_input_tokens_seen": 134905488, + "step": 8238, + "train_runtime": 66941.388, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 4.993333333333333, + "grad_norm": 0.008322327397763729, + "learning_rate": 5.0908675121337525e-05, + "loss": 0.011911619454622269, + "num_input_tokens_seen": 134921864, + "step": 8239, + "train_runtime": 66949.5021, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 4.993939393939394, + "grad_norm": 0.003505369182676077, + "learning_rate": 5.089906055358432e-05, + "loss": 0.011117411777377129, + "num_input_tokens_seen": 134938240, + "step": 8240, + "train_runtime": 66957.6126, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 4.994545454545454, + "grad_norm": 0.005240668077021837, + "learning_rate": 5.088944595257663e-05, + "loss": 0.012150921858847141, + "num_input_tokens_seen": 134954616, + "step": 8241, + "train_runtime": 66965.7304, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 4.995151515151515, + "grad_norm": 0.0030336875934153795, + "learning_rate": 5.087983131867007e-05, + "loss": 0.011063267476856709, + "num_input_tokens_seen": 134970992, + "step": 8242, + "train_runtime": 66973.8414, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 4.995757575757576, + "grad_norm": 0.007543195504695177, + "learning_rate": 5.087021665222024e-05, + "loss": 0.012925291433930397, + "num_input_tokens_seen": 134987368, + "step": 8243, + "train_runtime": 66981.952, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 4.996363636363636, + "grad_norm": 0.004401626531034708, + "learning_rate": 5.086060195358281e-05, + "loss": 0.011347765102982521, + "num_input_tokens_seen": 135003744, + "step": 8244, + "train_runtime": 66990.0645, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 4.996969696969697, + "grad_norm": 0.005982839968055487, + "learning_rate": 5.085098722311339e-05, + "loss": 0.01136541087180376, + "num_input_tokens_seen": 135020120, + "step": 8245, + "train_runtime": 66998.178, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 4.997575757575758, + "grad_norm": 0.0057999491691589355, + "learning_rate": 5.0841372461167604e-05, + "loss": 0.011447794735431671, + "num_input_tokens_seen": 135036496, + "step": 8246, + "train_runtime": 67006.2882, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 4.998181818181818, + "grad_norm": 0.005134247709065676, + "learning_rate": 5.083175766810108e-05, + "loss": 0.010828719474375248, + "num_input_tokens_seen": 135052872, + "step": 8247, + "train_runtime": 67014.4014, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 4.998787878787879, + "grad_norm": 0.007160807494074106, + "learning_rate": 5.082214284426947e-05, + "loss": 0.011811402626335621, + "num_input_tokens_seen": 135069248, + "step": 8248, + "train_runtime": 67022.5131, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 4.999393939393939, + "grad_norm": 0.006427322048693895, + "learning_rate": 5.0812527990028394e-05, + "loss": 0.0125265596434474, + "num_input_tokens_seen": 135085624, + "step": 8249, + "train_runtime": 67030.6306, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 5.0, + "grad_norm": 0.00840055849403143, + "learning_rate": 5.080291310573349e-05, + "loss": 0.013707771897315979, + "num_input_tokens_seen": 135102000, + "step": 8250, + "train_runtime": 67038.7416, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 5.000606060606061, + "grad_norm": 0.00553882448002696, + "learning_rate": 5.0793298191740404e-05, + "loss": 0.01178129855543375, + "num_input_tokens_seen": 135118376, + "step": 8251, + "train_runtime": 67046.8552, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 5.001212121212121, + "grad_norm": 0.006408744491636753, + "learning_rate": 5.078368324840476e-05, + "loss": 0.012557617388665676, + "num_input_tokens_seen": 135134752, + "step": 8252, + "train_runtime": 67054.967, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 5.001818181818182, + "grad_norm": 0.0041474709287285805, + "learning_rate": 5.0774068276082174e-05, + "loss": 0.011947699822485447, + "num_input_tokens_seen": 135151128, + "step": 8253, + "train_runtime": 67063.0807, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 5.002424242424242, + "grad_norm": 0.004736883100122213, + "learning_rate": 5.0764453275128346e-05, + "loss": 0.011652713641524315, + "num_input_tokens_seen": 135167504, + "step": 8254, + "train_runtime": 67071.1907, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 5.003030303030303, + "grad_norm": 0.0061241574585437775, + "learning_rate": 5.075483824589886e-05, + "loss": 0.01168136391788721, + "num_input_tokens_seen": 135183880, + "step": 8255, + "train_runtime": 67079.3034, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 5.003636363636364, + "grad_norm": 0.005269136279821396, + "learning_rate": 5.074522318874938e-05, + "loss": 0.012372460216283798, + "num_input_tokens_seen": 135200256, + "step": 8256, + "train_runtime": 67087.4134, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 5.004242424242424, + "grad_norm": 0.003983024973422289, + "learning_rate": 5.0735608104035546e-05, + "loss": 0.011117767542600632, + "num_input_tokens_seen": 135216632, + "step": 8257, + "train_runtime": 67095.5306, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 5.004848484848485, + "grad_norm": 0.005235120188444853, + "learning_rate": 5.0725992992112994e-05, + "loss": 0.010942426510155201, + "num_input_tokens_seen": 135233008, + "step": 8258, + "train_runtime": 67103.6393, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 5.005454545454546, + "grad_norm": 0.003157511120662093, + "learning_rate": 5.071637785333739e-05, + "loss": 0.012899093329906464, + "num_input_tokens_seen": 135249384, + "step": 8259, + "train_runtime": 67111.7518, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 5.006060606060606, + "grad_norm": 0.008685837499797344, + "learning_rate": 5.0706762688064355e-05, + "loss": 0.011325963772833347, + "num_input_tokens_seen": 135265760, + "step": 8260, + "train_runtime": 67119.8627, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 5.006666666666667, + "grad_norm": 0.007824521511793137, + "learning_rate": 5.0697147496649544e-05, + "loss": 0.0134503822773695, + "num_input_tokens_seen": 135282136, + "step": 8261, + "train_runtime": 67127.9732, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 5.007272727272727, + "grad_norm": 0.0002552693767938763, + "learning_rate": 5.0687532279448603e-05, + "loss": 0.010875185951590538, + "num_input_tokens_seen": 135298512, + "step": 8262, + "train_runtime": 67136.0791, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 5.007878787878788, + "grad_norm": 0.007151304744184017, + "learning_rate": 5.067791703681719e-05, + "loss": 0.011302225291728973, + "num_input_tokens_seen": 135314888, + "step": 8263, + "train_runtime": 67144.192, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 5.008484848484849, + "grad_norm": 0.007161908317357302, + "learning_rate": 5.066830176911094e-05, + "loss": 0.012912624515593052, + "num_input_tokens_seen": 135331264, + "step": 8264, + "train_runtime": 67152.303, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 5.009090909090909, + "grad_norm": 0.011364428326487541, + "learning_rate": 5.0658686476685516e-05, + "loss": 0.012872017920017242, + "num_input_tokens_seen": 135347640, + "step": 8265, + "train_runtime": 67160.4121, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 5.00969696969697, + "grad_norm": 0.003920624498277903, + "learning_rate": 5.064907115989655e-05, + "loss": 0.011807954870164394, + "num_input_tokens_seen": 135364016, + "step": 8266, + "train_runtime": 67168.5304, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 5.010303030303031, + "grad_norm": 0.005252000410109758, + "learning_rate": 5.063945581909971e-05, + "loss": 0.012046176008880138, + "num_input_tokens_seen": 135380392, + "step": 8267, + "train_runtime": 67176.6433, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 5.010909090909091, + "grad_norm": 0.00623297831043601, + "learning_rate": 5.0629840454650644e-05, + "loss": 0.011990412138402462, + "num_input_tokens_seen": 135396768, + "step": 8268, + "train_runtime": 67184.7546, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 5.011515151515152, + "grad_norm": 0.005057190079241991, + "learning_rate": 5.0620225066905014e-05, + "loss": 0.012164930813014507, + "num_input_tokens_seen": 135413144, + "step": 8269, + "train_runtime": 67192.8646, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 5.012121212121212, + "grad_norm": 0.01145374495536089, + "learning_rate": 5.061060965621846e-05, + "loss": 0.012158042751252651, + "num_input_tokens_seen": 135429520, + "step": 8270, + "train_runtime": 67200.9782, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 5.012727272727273, + "grad_norm": 0.005494717042893171, + "learning_rate": 5.060099422294664e-05, + "loss": 0.011690933257341385, + "num_input_tokens_seen": 135445896, + "step": 8271, + "train_runtime": 67209.0906, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 5.013333333333334, + "grad_norm": 0.005364768672734499, + "learning_rate": 5.059137876744523e-05, + "loss": 0.012058689258992672, + "num_input_tokens_seen": 135462272, + "step": 8272, + "train_runtime": 67217.2002, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 5.013939393939394, + "grad_norm": 0.006850802805274725, + "learning_rate": 5.0581763290069865e-05, + "loss": 0.012159507721662521, + "num_input_tokens_seen": 135478648, + "step": 8273, + "train_runtime": 67225.3092, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 5.014545454545455, + "grad_norm": 0.005907886661589146, + "learning_rate": 5.057214779117619e-05, + "loss": 0.012190775014460087, + "num_input_tokens_seen": 135495024, + "step": 8274, + "train_runtime": 67233.4219, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 5.015151515151516, + "grad_norm": 0.003778220620006323, + "learning_rate": 5.05625322711199e-05, + "loss": 0.011668875813484192, + "num_input_tokens_seen": 135511400, + "step": 8275, + "train_runtime": 67241.532, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 5.015757575757576, + "grad_norm": 0.011213399469852448, + "learning_rate": 5.055291673025663e-05, + "loss": 0.011795091442763805, + "num_input_tokens_seen": 135527776, + "step": 8276, + "train_runtime": 67249.6452, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 5.016363636363637, + "grad_norm": 0.008473982103168964, + "learning_rate": 5.054330116894206e-05, + "loss": 0.013449507765471935, + "num_input_tokens_seen": 135544152, + "step": 8277, + "train_runtime": 67257.757, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 5.016969696969697, + "grad_norm": 0.006476051174104214, + "learning_rate": 5.053368558753183e-05, + "loss": 0.012078224681317806, + "num_input_tokens_seen": 135560528, + "step": 8278, + "train_runtime": 67265.8659, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 5.017575757575758, + "grad_norm": 0.014137729071080685, + "learning_rate": 5.0524069986381605e-05, + "loss": 0.012675135396420956, + "num_input_tokens_seen": 135576904, + "step": 8279, + "train_runtime": 67273.9747, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 5.0181818181818185, + "grad_norm": 0.007644779980182648, + "learning_rate": 5.0514454365847055e-05, + "loss": 0.011656026355922222, + "num_input_tokens_seen": 135593280, + "step": 8280, + "train_runtime": 67282.0867, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 5.018787878787879, + "grad_norm": 0.0032483134418725967, + "learning_rate": 5.050483872628383e-05, + "loss": 0.012381074950098991, + "num_input_tokens_seen": 135609656, + "step": 8281, + "train_runtime": 67290.2005, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 5.0193939393939395, + "grad_norm": 0.006981448270380497, + "learning_rate": 5.0495223068047625e-05, + "loss": 0.011840738356113434, + "num_input_tokens_seen": 135626032, + "step": 8282, + "train_runtime": 67298.3105, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 5.02, + "grad_norm": 0.00409040879458189, + "learning_rate": 5.0485607391494063e-05, + "loss": 0.011651817709207535, + "num_input_tokens_seen": 135642408, + "step": 8283, + "train_runtime": 67306.4329, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 5.0206060606060605, + "grad_norm": 0.00870494358241558, + "learning_rate": 5.047599169697884e-05, + "loss": 0.011883814819157124, + "num_input_tokens_seen": 135658784, + "step": 8284, + "train_runtime": 67314.5437, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 5.0212121212121215, + "grad_norm": 0.013102404773235321, + "learning_rate": 5.046637598485761e-05, + "loss": 0.011688578873872757, + "num_input_tokens_seen": 135675160, + "step": 8285, + "train_runtime": 67322.6566, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 5.0218181818181815, + "grad_norm": 0.010791177861392498, + "learning_rate": 5.045676025548603e-05, + "loss": 0.012235326692461967, + "num_input_tokens_seen": 135691536, + "step": 8286, + "train_runtime": 67330.7677, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 5.0224242424242425, + "grad_norm": 0.004826835356652737, + "learning_rate": 5.044714450921979e-05, + "loss": 0.011948313564062119, + "num_input_tokens_seen": 135707912, + "step": 8287, + "train_runtime": 67338.8754, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 5.023030303030303, + "grad_norm": 0.0078011853620409966, + "learning_rate": 5.043752874641454e-05, + "loss": 0.012313607148826122, + "num_input_tokens_seen": 135724288, + "step": 8288, + "train_runtime": 67346.9867, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 5.0236363636363635, + "grad_norm": 0.009011351503431797, + "learning_rate": 5.042791296742595e-05, + "loss": 0.011273661628365517, + "num_input_tokens_seen": 135740664, + "step": 8289, + "train_runtime": 67355.0986, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 5.024242424242424, + "grad_norm": 0.006729157641530037, + "learning_rate": 5.0418297172609705e-05, + "loss": 0.012316697277128696, + "num_input_tokens_seen": 135757040, + "step": 8290, + "train_runtime": 67363.2102, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 5.0248484848484845, + "grad_norm": 0.005637967959046364, + "learning_rate": 5.040868136232145e-05, + "loss": 0.011752767488360405, + "num_input_tokens_seen": 135773416, + "step": 8291, + "train_runtime": 67371.3332, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 5.025454545454545, + "grad_norm": 0.00638741347938776, + "learning_rate": 5.039906553691688e-05, + "loss": 0.012068787589669228, + "num_input_tokens_seen": 135789792, + "step": 8292, + "train_runtime": 67379.4428, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 5.026060606060606, + "grad_norm": 0.006951635703444481, + "learning_rate": 5.038944969675165e-05, + "loss": 0.011950002983212471, + "num_input_tokens_seen": 135806168, + "step": 8293, + "train_runtime": 67387.556, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 5.026666666666666, + "grad_norm": 0.006495848298072815, + "learning_rate": 5.0379833842181426e-05, + "loss": 0.012531060725450516, + "num_input_tokens_seen": 135822544, + "step": 8294, + "train_runtime": 67395.6657, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 5.027272727272727, + "grad_norm": 0.004297948908060789, + "learning_rate": 5.0370217973561896e-05, + "loss": 0.011687916703522205, + "num_input_tokens_seen": 135838920, + "step": 8295, + "train_runtime": 67403.7755, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 5.027878787878788, + "grad_norm": 0.005864521488547325, + "learning_rate": 5.0360602091248724e-05, + "loss": 0.012519733980298042, + "num_input_tokens_seen": 135855296, + "step": 8296, + "train_runtime": 67411.8868, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 5.028484848484848, + "grad_norm": 0.011788390576839447, + "learning_rate": 5.035098619559756e-05, + "loss": 0.012649256736040115, + "num_input_tokens_seen": 135871672, + "step": 8297, + "train_runtime": 67419.9965, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 5.029090909090909, + "grad_norm": 0.005418900866061449, + "learning_rate": 5.034137028696413e-05, + "loss": 0.012380923144519329, + "num_input_tokens_seen": 135888048, + "step": 8298, + "train_runtime": 67428.1092, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.029696969696969, + "grad_norm": 0.010224822908639908, + "learning_rate": 5.033175436570406e-05, + "loss": 0.01156550645828247, + "num_input_tokens_seen": 135904424, + "step": 8299, + "train_runtime": 67436.2209, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.03030303030303, + "grad_norm": 0.00644992059096694, + "learning_rate": 5.0322138432173063e-05, + "loss": 0.013195598497986794, + "num_input_tokens_seen": 135920800, + "step": 8300, + "train_runtime": 67444.334, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.030909090909091, + "grad_norm": 0.006435430143028498, + "learning_rate": 5.031252248672678e-05, + "loss": 0.012008137069642544, + "num_input_tokens_seen": 135937176, + "step": 8301, + "train_runtime": 67453.4605, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 5.031515151515151, + "grad_norm": 0.004527329467236996, + "learning_rate": 5.0302906529720885e-05, + "loss": 0.012301946990191936, + "num_input_tokens_seen": 135953552, + "step": 8302, + "train_runtime": 67461.5686, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 5.032121212121212, + "grad_norm": 0.008144144900143147, + "learning_rate": 5.0293290561511084e-05, + "loss": 0.012775711715221405, + "num_input_tokens_seen": 135969928, + "step": 8303, + "train_runtime": 67469.6771, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 5.032727272727273, + "grad_norm": 0.006322100758552551, + "learning_rate": 5.0283674582453034e-05, + "loss": 0.011964922770857811, + "num_input_tokens_seen": 135986304, + "step": 8304, + "train_runtime": 67477.7855, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 5.033333333333333, + "grad_norm": 0.006746840663254261, + "learning_rate": 5.0274058592902415e-05, + "loss": 0.013213821686804295, + "num_input_tokens_seen": 136002680, + "step": 8305, + "train_runtime": 67485.8942, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 5.033939393939394, + "grad_norm": 0.009822105057537556, + "learning_rate": 5.026444259321489e-05, + "loss": 0.012300253845751286, + "num_input_tokens_seen": 136019056, + "step": 8306, + "train_runtime": 67494.0016, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 5.034545454545454, + "grad_norm": 0.009072283282876015, + "learning_rate": 5.025482658374616e-05, + "loss": 0.01244582049548626, + "num_input_tokens_seen": 136035432, + "step": 8307, + "train_runtime": 67502.1127, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 5.035151515151515, + "grad_norm": 0.004191865213215351, + "learning_rate": 5.0245210564851875e-05, + "loss": 0.01164008118212223, + "num_input_tokens_seen": 136051808, + "step": 8308, + "train_runtime": 67510.2214, + "train_tokens_per_second": 2015.277 + }, + { + "epoch": 5.035757575757576, + "grad_norm": 0.00858981255441904, + "learning_rate": 5.023559453688776e-05, + "loss": 0.01374704297631979, + "num_input_tokens_seen": 136068184, + "step": 8309, + "train_runtime": 67518.3353, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 5.036363636363636, + "grad_norm": 0.003235921263694763, + "learning_rate": 5.0225978500209435e-05, + "loss": 0.011563345789909363, + "num_input_tokens_seen": 136084560, + "step": 8310, + "train_runtime": 67526.4457, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 5.036969696969697, + "grad_norm": 0.00631209509447217, + "learning_rate": 5.0216362455172614e-05, + "loss": 0.012302270159125328, + "num_input_tokens_seen": 136100936, + "step": 8311, + "train_runtime": 67534.5575, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 5.037575757575757, + "grad_norm": 0.006116182077676058, + "learning_rate": 5.0206746402132965e-05, + "loss": 0.011892072856426239, + "num_input_tokens_seen": 136117312, + "step": 8312, + "train_runtime": 67542.6688, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 5.038181818181818, + "grad_norm": 0.006049556657671928, + "learning_rate": 5.0197130341446174e-05, + "loss": 0.011761843226850033, + "num_input_tokens_seen": 136133688, + "step": 8313, + "train_runtime": 67550.7814, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 5.038787878787879, + "grad_norm": 0.005385833326727152, + "learning_rate": 5.018751427346792e-05, + "loss": 0.011372707784175873, + "num_input_tokens_seen": 136150064, + "step": 8314, + "train_runtime": 67558.8918, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 5.039393939393939, + "grad_norm": 0.008138551376760006, + "learning_rate": 5.017789819855386e-05, + "loss": 0.01069559808820486, + "num_input_tokens_seen": 136166440, + "step": 8315, + "train_runtime": 67567.0034, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 5.04, + "grad_norm": 0.004271382000297308, + "learning_rate": 5.0168282117059705e-05, + "loss": 0.012227785773575306, + "num_input_tokens_seen": 136182816, + "step": 8316, + "train_runtime": 67575.1142, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 5.040606060606061, + "grad_norm": 0.007745540235191584, + "learning_rate": 5.015866602934112e-05, + "loss": 0.012568507343530655, + "num_input_tokens_seen": 136199192, + "step": 8317, + "train_runtime": 67583.2306, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 5.041212121212121, + "grad_norm": 0.010733498260378838, + "learning_rate": 5.014904993575379e-05, + "loss": 0.012609539553523064, + "num_input_tokens_seen": 136215568, + "step": 8318, + "train_runtime": 67591.3412, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 5.041818181818182, + "grad_norm": 0.008590422570705414, + "learning_rate": 5.013943383665339e-05, + "loss": 0.012132626958191395, + "num_input_tokens_seen": 136231944, + "step": 8319, + "train_runtime": 67599.4522, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 5.042424242424242, + "grad_norm": 0.00730553362518549, + "learning_rate": 5.01298177323956e-05, + "loss": 0.011441248469054699, + "num_input_tokens_seen": 136248320, + "step": 8320, + "train_runtime": 67607.5604, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 5.043030303030303, + "grad_norm": 0.006355650722980499, + "learning_rate": 5.012020162333612e-05, + "loss": 0.01182939950376749, + "num_input_tokens_seen": 136264696, + "step": 8321, + "train_runtime": 67615.6688, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 5.043636363636364, + "grad_norm": 0.006527552846819162, + "learning_rate": 5.011058550983061e-05, + "loss": 0.010410111397504807, + "num_input_tokens_seen": 136281072, + "step": 8322, + "train_runtime": 67623.7784, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 5.044242424242424, + "grad_norm": 0.006506567355245352, + "learning_rate": 5.0100969392234756e-05, + "loss": 0.012185508385300636, + "num_input_tokens_seen": 136297448, + "step": 8323, + "train_runtime": 67631.8879, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 5.044848484848485, + "grad_norm": 0.006626383867114782, + "learning_rate": 5.009135327090424e-05, + "loss": 0.011658984236419201, + "num_input_tokens_seen": 136313824, + "step": 8324, + "train_runtime": 67640.0, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 5.045454545454546, + "grad_norm": 0.007630739826709032, + "learning_rate": 5.008173714619473e-05, + "loss": 0.012417991645634174, + "num_input_tokens_seen": 136330200, + "step": 8325, + "train_runtime": 67648.1133, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 5.046060606060606, + "grad_norm": 0.009632973931729794, + "learning_rate": 5.007212101846194e-05, + "loss": 0.012361422181129456, + "num_input_tokens_seen": 136346576, + "step": 8326, + "train_runtime": 67656.2308, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 5.046666666666667, + "grad_norm": 0.004495717119425535, + "learning_rate": 5.006250488806154e-05, + "loss": 0.012159018777310848, + "num_input_tokens_seen": 136362952, + "step": 8327, + "train_runtime": 67664.3404, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 5.047272727272727, + "grad_norm": 0.004173946101218462, + "learning_rate": 5.005288875534919e-05, + "loss": 0.010337317362427711, + "num_input_tokens_seen": 136379328, + "step": 8328, + "train_runtime": 67672.4514, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 5.047878787878788, + "grad_norm": 0.010079857893288136, + "learning_rate": 5.004327262068058e-05, + "loss": 0.010337211191654205, + "num_input_tokens_seen": 136395704, + "step": 8329, + "train_runtime": 67680.5742, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 5.048484848484849, + "grad_norm": 0.008880448527634144, + "learning_rate": 5.0033656484411415e-05, + "loss": 0.011353524401783943, + "num_input_tokens_seen": 136412080, + "step": 8330, + "train_runtime": 67688.6855, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 5.049090909090909, + "grad_norm": 0.007502446882426739, + "learning_rate": 5.002404034689736e-05, + "loss": 0.0112943509593606, + "num_input_tokens_seen": 136428456, + "step": 8331, + "train_runtime": 67696.7946, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 5.04969696969697, + "grad_norm": 0.007389831356704235, + "learning_rate": 5.00144242084941e-05, + "loss": 0.01241891086101532, + "num_input_tokens_seen": 136444832, + "step": 8332, + "train_runtime": 67704.9038, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 5.050303030303031, + "grad_norm": 0.007833220064640045, + "learning_rate": 5.000480806955732e-05, + "loss": 0.012798775918781757, + "num_input_tokens_seen": 136461208, + "step": 8333, + "train_runtime": 67713.0157, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 5.050909090909091, + "grad_norm": 0.00411307392641902, + "learning_rate": 4.9995191930442695e-05, + "loss": 0.012639024294912815, + "num_input_tokens_seen": 136477584, + "step": 8334, + "train_runtime": 67721.1318, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 5.051515151515152, + "grad_norm": 0.006849776022136211, + "learning_rate": 4.998557579150591e-05, + "loss": 0.011267402209341526, + "num_input_tokens_seen": 136493960, + "step": 8335, + "train_runtime": 67729.2466, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 5.052121212121212, + "grad_norm": 0.006886915769428015, + "learning_rate": 4.9975959653102645e-05, + "loss": 0.011748380027711391, + "num_input_tokens_seen": 136510336, + "step": 8336, + "train_runtime": 67737.3592, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 5.052727272727273, + "grad_norm": 0.005555429495871067, + "learning_rate": 4.996634351558859e-05, + "loss": 0.012650941498577595, + "num_input_tokens_seen": 136526712, + "step": 8337, + "train_runtime": 67745.4688, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 5.053333333333334, + "grad_norm": 0.007491360418498516, + "learning_rate": 4.995672737931943e-05, + "loss": 0.012909266166388988, + "num_input_tokens_seen": 136543088, + "step": 8338, + "train_runtime": 67753.5802, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 5.053939393939394, + "grad_norm": 0.008174006827175617, + "learning_rate": 4.9947111244650835e-05, + "loss": 0.01289438921958208, + "num_input_tokens_seen": 136559464, + "step": 8339, + "train_runtime": 67761.6894, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 5.054545454545455, + "grad_norm": 0.007791534997522831, + "learning_rate": 4.993749511193848e-05, + "loss": 0.011773689649999142, + "num_input_tokens_seen": 136575840, + "step": 8340, + "train_runtime": 67769.8036, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 5.055151515151515, + "grad_norm": 0.0046482631005346775, + "learning_rate": 4.992787898153807e-05, + "loss": 0.010729984380304813, + "num_input_tokens_seen": 136592216, + "step": 8341, + "train_runtime": 67777.9139, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 5.055757575757576, + "grad_norm": 0.0032164854928851128, + "learning_rate": 4.991826285380528e-05, + "loss": 0.010562192648649216, + "num_input_tokens_seen": 136608592, + "step": 8342, + "train_runtime": 67786.0305, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 5.056363636363637, + "grad_norm": 0.0073778098449110985, + "learning_rate": 4.9908646729095784e-05, + "loss": 0.011173474602401257, + "num_input_tokens_seen": 136624968, + "step": 8343, + "train_runtime": 67794.141, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 5.056969696969697, + "grad_norm": 0.008049871772527695, + "learning_rate": 4.9899030607765255e-05, + "loss": 0.013003354892134666, + "num_input_tokens_seen": 136641344, + "step": 8344, + "train_runtime": 67802.2508, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 5.057575757575758, + "grad_norm": 0.0064498428255319595, + "learning_rate": 4.9889414490169403e-05, + "loss": 0.011912745423614979, + "num_input_tokens_seen": 136657720, + "step": 8345, + "train_runtime": 67810.363, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 5.058181818181819, + "grad_norm": 0.006753233261406422, + "learning_rate": 4.9879798376663894e-05, + "loss": 0.010419188067317009, + "num_input_tokens_seen": 136674096, + "step": 8346, + "train_runtime": 67818.471, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 5.058787878787879, + "grad_norm": 0.007461462635546923, + "learning_rate": 4.9870182267604406e-05, + "loss": 0.011423196643590927, + "num_input_tokens_seen": 136690472, + "step": 8347, + "train_runtime": 67826.5814, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 5.0593939393939396, + "grad_norm": 0.009171088226139545, + "learning_rate": 4.986056616334662e-05, + "loss": 0.012788143940269947, + "num_input_tokens_seen": 136706848, + "step": 8348, + "train_runtime": 67834.6888, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 5.06, + "grad_norm": 0.00620295200496912, + "learning_rate": 4.985095006424621e-05, + "loss": 0.011380964890122414, + "num_input_tokens_seen": 136723224, + "step": 8349, + "train_runtime": 67842.7986, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 5.0606060606060606, + "grad_norm": 0.005861747078597546, + "learning_rate": 4.984133397065889e-05, + "loss": 0.012386595830321312, + "num_input_tokens_seen": 136739600, + "step": 8350, + "train_runtime": 67850.9089, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 5.0612121212121215, + "grad_norm": 0.005825080908834934, + "learning_rate": 4.983171788294031e-05, + "loss": 0.012096296064555645, + "num_input_tokens_seen": 136755976, + "step": 8351, + "train_runtime": 67859.0194, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 5.0618181818181816, + "grad_norm": 0.007503534201532602, + "learning_rate": 4.982210180144615e-05, + "loss": 0.011326254345476627, + "num_input_tokens_seen": 136772352, + "step": 8352, + "train_runtime": 67867.1329, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 5.0624242424242425, + "grad_norm": 0.007871462032198906, + "learning_rate": 4.9812485726532085e-05, + "loss": 0.012511014007031918, + "num_input_tokens_seen": 136788728, + "step": 8353, + "train_runtime": 67875.2431, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 5.063030303030303, + "grad_norm": 0.007302190642803907, + "learning_rate": 4.980286965855384e-05, + "loss": 0.012194191105663776, + "num_input_tokens_seen": 136805104, + "step": 8354, + "train_runtime": 67883.3513, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 5.0636363636363635, + "grad_norm": 0.008489741943776608, + "learning_rate": 4.9793253597867047e-05, + "loss": 0.012533175759017467, + "num_input_tokens_seen": 136821480, + "step": 8355, + "train_runtime": 67891.4627, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 5.064242424242424, + "grad_norm": 0.008080749772489071, + "learning_rate": 4.9783637544827404e-05, + "loss": 0.010332386940717697, + "num_input_tokens_seen": 136837856, + "step": 8356, + "train_runtime": 67899.5722, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 5.0648484848484845, + "grad_norm": 0.012693931348621845, + "learning_rate": 4.977402149979056e-05, + "loss": 0.013704934157431126, + "num_input_tokens_seen": 136854232, + "step": 8357, + "train_runtime": 67907.6803, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 5.065454545454545, + "grad_norm": 0.007884949445724487, + "learning_rate": 4.976440546311225e-05, + "loss": 0.011888192966580391, + "num_input_tokens_seen": 136870608, + "step": 8358, + "train_runtime": 67915.79, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 5.066060606060606, + "grad_norm": 0.008290555328130722, + "learning_rate": 4.975478943514813e-05, + "loss": 0.012344280257821083, + "num_input_tokens_seen": 136886984, + "step": 8359, + "train_runtime": 67923.9018, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 5.066666666666666, + "grad_norm": 0.007990517653524876, + "learning_rate": 4.974517341625386e-05, + "loss": 0.012059589847922325, + "num_input_tokens_seen": 136903360, + "step": 8360, + "train_runtime": 67932.0139, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 5.067272727272727, + "grad_norm": 0.00325188972055912, + "learning_rate": 4.973555740678511e-05, + "loss": 0.011889247223734856, + "num_input_tokens_seen": 136919736, + "step": 8361, + "train_runtime": 67940.1309, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 5.067878787878788, + "grad_norm": 0.009070818312466145, + "learning_rate": 4.97259414070976e-05, + "loss": 0.011300117708742619, + "num_input_tokens_seen": 136936112, + "step": 8362, + "train_runtime": 67948.2428, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 5.068484848484848, + "grad_norm": 0.009019261226058006, + "learning_rate": 4.971632541754698e-05, + "loss": 0.012393585406243801, + "num_input_tokens_seen": 136952488, + "step": 8363, + "train_runtime": 67956.3506, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 5.069090909090909, + "grad_norm": 0.007070454303175211, + "learning_rate": 4.970670943848894e-05, + "loss": 0.012000701390206814, + "num_input_tokens_seen": 136968864, + "step": 8364, + "train_runtime": 67964.4585, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 5.069696969696969, + "grad_norm": 0.01074633002281189, + "learning_rate": 4.969709347027911e-05, + "loss": 0.012346788309514523, + "num_input_tokens_seen": 136985240, + "step": 8365, + "train_runtime": 67972.5671, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 5.07030303030303, + "grad_norm": 0.007930689491331577, + "learning_rate": 4.968747751327323e-05, + "loss": 0.0122322216629982, + "num_input_tokens_seen": 137001616, + "step": 8366, + "train_runtime": 67980.6758, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 5.070909090909091, + "grad_norm": 0.008779794909060001, + "learning_rate": 4.9677861567826955e-05, + "loss": 0.01249274518340826, + "num_input_tokens_seen": 137017992, + "step": 8367, + "train_runtime": 67988.7864, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.071515151515151, + "grad_norm": 0.007347263861447573, + "learning_rate": 4.966824563429594e-05, + "loss": 0.012554696761071682, + "num_input_tokens_seen": 137034368, + "step": 8368, + "train_runtime": 67996.8939, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.072121212121212, + "grad_norm": 0.006086303852498531, + "learning_rate": 4.96586297130359e-05, + "loss": 0.01190988253802061, + "num_input_tokens_seen": 137050744, + "step": 8369, + "train_runtime": 68005.001, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.072727272727272, + "grad_norm": 0.008157070726156235, + "learning_rate": 4.964901380440243e-05, + "loss": 0.012081228196620941, + "num_input_tokens_seen": 137067120, + "step": 8370, + "train_runtime": 68013.1114, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.073333333333333, + "grad_norm": 0.007487762253731489, + "learning_rate": 4.9639397908751294e-05, + "loss": 0.011930234730243683, + "num_input_tokens_seen": 137083496, + "step": 8371, + "train_runtime": 68021.2312, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 5.073939393939394, + "grad_norm": 0.002296298509463668, + "learning_rate": 4.962978202643812e-05, + "loss": 0.011380317620933056, + "num_input_tokens_seen": 137099872, + "step": 8372, + "train_runtime": 68029.3395, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 5.074545454545454, + "grad_norm": 0.004575389437377453, + "learning_rate": 4.96201661578186e-05, + "loss": 0.011559530161321163, + "num_input_tokens_seen": 137116248, + "step": 8373, + "train_runtime": 68037.4464, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.075151515151515, + "grad_norm": 0.0084438007324934, + "learning_rate": 4.961055030324836e-05, + "loss": 0.011866522952914238, + "num_input_tokens_seen": 137132624, + "step": 8374, + "train_runtime": 68045.5567, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.075757575757576, + "grad_norm": 0.007742568850517273, + "learning_rate": 4.9600934463083125e-05, + "loss": 0.0109134865924716, + "num_input_tokens_seen": 137149000, + "step": 8375, + "train_runtime": 68053.6684, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.076363636363636, + "grad_norm": 0.006626576650887728, + "learning_rate": 4.959131863767855e-05, + "loss": 0.012903444468975067, + "num_input_tokens_seen": 137165376, + "step": 8376, + "train_runtime": 68061.7768, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 5.076969696969697, + "grad_norm": 0.00601216871291399, + "learning_rate": 4.9581702827390306e-05, + "loss": 0.01249616127461195, + "num_input_tokens_seen": 137181752, + "step": 8377, + "train_runtime": 68069.8907, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 5.077575757575757, + "grad_norm": 0.007035784889012575, + "learning_rate": 4.9572087032574045e-05, + "loss": 0.011972718872129917, + "num_input_tokens_seen": 137198128, + "step": 8378, + "train_runtime": 68078.0031, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 5.078181818181818, + "grad_norm": 0.007568966597318649, + "learning_rate": 4.956247125358546e-05, + "loss": 0.011879374273121357, + "num_input_tokens_seen": 137214504, + "step": 8379, + "train_runtime": 68086.1081, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 5.078787878787879, + "grad_norm": 0.006793234031647444, + "learning_rate": 4.955285549078022e-05, + "loss": 0.011806230060756207, + "num_input_tokens_seen": 137230880, + "step": 8380, + "train_runtime": 68094.2194, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 5.079393939393939, + "grad_norm": 0.0033036451786756516, + "learning_rate": 4.954323974451398e-05, + "loss": 0.01150903943926096, + "num_input_tokens_seen": 137247256, + "step": 8381, + "train_runtime": 68102.3318, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 5.08, + "grad_norm": 0.008114123716950417, + "learning_rate": 4.9533624015142396e-05, + "loss": 0.011575219221413136, + "num_input_tokens_seen": 137263632, + "step": 8382, + "train_runtime": 68110.4446, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.080606060606061, + "grad_norm": 0.004351661540567875, + "learning_rate": 4.952400830302117e-05, + "loss": 0.0106391292065382, + "num_input_tokens_seen": 137280008, + "step": 8383, + "train_runtime": 68118.5539, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.081212121212121, + "grad_norm": 0.006228295154869556, + "learning_rate": 4.951439260850595e-05, + "loss": 0.01202351413667202, + "num_input_tokens_seen": 137296384, + "step": 8384, + "train_runtime": 68126.6643, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.081818181818182, + "grad_norm": 0.009821774438023567, + "learning_rate": 4.950477693195239e-05, + "loss": 0.012960833497345448, + "num_input_tokens_seen": 137312760, + "step": 8385, + "train_runtime": 68134.7765, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.082424242424242, + "grad_norm": 0.005890066269785166, + "learning_rate": 4.9495161273716176e-05, + "loss": 0.011346753686666489, + "num_input_tokens_seen": 137329136, + "step": 8386, + "train_runtime": 68142.8875, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.083030303030303, + "grad_norm": 0.00649156142026186, + "learning_rate": 4.9485545634152956e-05, + "loss": 0.01069701462984085, + "num_input_tokens_seen": 137345512, + "step": 8387, + "train_runtime": 68151.001, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.083636363636364, + "grad_norm": 0.0039390092715620995, + "learning_rate": 4.947593001361841e-05, + "loss": 0.01272264588624239, + "num_input_tokens_seen": 137361888, + "step": 8388, + "train_runtime": 68159.1129, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.084242424242424, + "grad_norm": 0.00860094279050827, + "learning_rate": 4.946631441246819e-05, + "loss": 0.012006115168333054, + "num_input_tokens_seen": 137378264, + "step": 8389, + "train_runtime": 68167.2322, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.084848484848485, + "grad_norm": 0.0041032349690794945, + "learning_rate": 4.9456698831057965e-05, + "loss": 0.012740520760416985, + "num_input_tokens_seen": 137394640, + "step": 8390, + "train_runtime": 68175.3441, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 5.085454545454546, + "grad_norm": 0.0046050758101046085, + "learning_rate": 4.944708326974337e-05, + "loss": 0.012698102742433548, + "num_input_tokens_seen": 137411016, + "step": 8391, + "train_runtime": 68183.4538, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 5.086060606060606, + "grad_norm": 0.00812074076384306, + "learning_rate": 4.943746772888011e-05, + "loss": 0.012363099493086338, + "num_input_tokens_seen": 137427392, + "step": 8392, + "train_runtime": 68191.5671, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.086666666666667, + "grad_norm": 0.008570141158998013, + "learning_rate": 4.942785220882382e-05, + "loss": 0.011898050084710121, + "num_input_tokens_seen": 137443768, + "step": 8393, + "train_runtime": 68199.6785, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.087272727272727, + "grad_norm": 0.008466197177767754, + "learning_rate": 4.941823670993016e-05, + "loss": 0.011042165569961071, + "num_input_tokens_seen": 137460144, + "step": 8394, + "train_runtime": 68207.7877, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 5.087878787878788, + "grad_norm": 0.00863884948194027, + "learning_rate": 4.9408621232554784e-05, + "loss": 0.012481046840548515, + "num_input_tokens_seen": 137476520, + "step": 8395, + "train_runtime": 68215.8968, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 5.088484848484849, + "grad_norm": 0.0035057072527706623, + "learning_rate": 4.939900577705336e-05, + "loss": 0.012575234286487103, + "num_input_tokens_seen": 137492896, + "step": 8396, + "train_runtime": 68224.0101, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 5.089090909090909, + "grad_norm": 0.005725866183638573, + "learning_rate": 4.9389390343781553e-05, + "loss": 0.010815635323524475, + "num_input_tokens_seen": 137509272, + "step": 8397, + "train_runtime": 68232.1211, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 5.08969696969697, + "grad_norm": 0.004143290221691132, + "learning_rate": 4.9379774933095005e-05, + "loss": 0.010345329530537128, + "num_input_tokens_seen": 137525648, + "step": 8398, + "train_runtime": 68240.2328, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 5.09030303030303, + "grad_norm": 0.001687912386842072, + "learning_rate": 4.937015954534936e-05, + "loss": 0.011004311963915825, + "num_input_tokens_seen": 137542024, + "step": 8399, + "train_runtime": 68248.3448, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 5.090909090909091, + "grad_norm": 0.0063916281796991825, + "learning_rate": 4.9360544180900294e-05, + "loss": 0.012271614745259285, + "num_input_tokens_seen": 137558400, + "step": 8400, + "train_runtime": 68256.4561, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 5.091515151515152, + "grad_norm": 0.010425840504467487, + "learning_rate": 4.9350928840103464e-05, + "loss": 0.012720411643385887, + "num_input_tokens_seen": 137574776, + "step": 8401, + "train_runtime": 68265.5546, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 5.092121212121212, + "grad_norm": 0.006535636261105537, + "learning_rate": 4.934131352331451e-05, + "loss": 0.011779929511249065, + "num_input_tokens_seen": 137591152, + "step": 8402, + "train_runtime": 68273.6659, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 5.092727272727273, + "grad_norm": 0.007370028644800186, + "learning_rate": 4.9331698230889054e-05, + "loss": 0.012454360723495483, + "num_input_tokens_seen": 137607528, + "step": 8403, + "train_runtime": 68281.7767, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 5.093333333333334, + "grad_norm": 0.006226471159607172, + "learning_rate": 4.9322082963182825e-05, + "loss": 0.012282513082027435, + "num_input_tokens_seen": 137623904, + "step": 8404, + "train_runtime": 68289.8869, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 5.093939393939394, + "grad_norm": 0.0005829664296470582, + "learning_rate": 4.931246772055141e-05, + "loss": 0.010963691398501396, + "num_input_tokens_seen": 137640280, + "step": 8405, + "train_runtime": 68297.9977, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 5.094545454545455, + "grad_norm": 0.007711022160947323, + "learning_rate": 4.9302852503350475e-05, + "loss": 0.011244256049394608, + "num_input_tokens_seen": 137656656, + "step": 8406, + "train_runtime": 68306.107, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 5.095151515151515, + "grad_norm": 0.006949673406779766, + "learning_rate": 4.929323731193565e-05, + "loss": 0.012186422944068909, + "num_input_tokens_seen": 137673032, + "step": 8407, + "train_runtime": 68314.23, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 5.095757575757576, + "grad_norm": 0.0070541962049901485, + "learning_rate": 4.928362214666262e-05, + "loss": 0.012510757893323898, + "num_input_tokens_seen": 137689408, + "step": 8408, + "train_runtime": 68322.3443, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 5.096363636363637, + "grad_norm": 0.007550964597612619, + "learning_rate": 4.927400700788702e-05, + "loss": 0.013313863426446915, + "num_input_tokens_seen": 137705784, + "step": 8409, + "train_runtime": 68330.4559, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 5.096969696969697, + "grad_norm": 0.009123636409640312, + "learning_rate": 4.926439189596447e-05, + "loss": 0.012293392792344093, + "num_input_tokens_seen": 137722160, + "step": 8410, + "train_runtime": 68338.5671, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 5.097575757575758, + "grad_norm": 0.003334686392918229, + "learning_rate": 4.925477681125062e-05, + "loss": 0.011680131778120995, + "num_input_tokens_seen": 137738536, + "step": 8411, + "train_runtime": 68346.6788, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 5.098181818181819, + "grad_norm": 0.006911110132932663, + "learning_rate": 4.924516175410114e-05, + "loss": 0.012099497951567173, + "num_input_tokens_seen": 137754912, + "step": 8412, + "train_runtime": 68354.7904, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 5.098787878787879, + "grad_norm": 0.006529695354402065, + "learning_rate": 4.923554672487166e-05, + "loss": 0.01146679650992155, + "num_input_tokens_seen": 137771288, + "step": 8413, + "train_runtime": 68362.8979, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 5.09939393939394, + "grad_norm": 0.007237353827804327, + "learning_rate": 4.922593172391784e-05, + "loss": 0.012284010648727417, + "num_input_tokens_seen": 137787664, + "step": 8414, + "train_runtime": 68371.0115, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 5.1, + "grad_norm": 0.005414677783846855, + "learning_rate": 4.9216316751595274e-05, + "loss": 0.012041687034070492, + "num_input_tokens_seen": 137804040, + "step": 8415, + "train_runtime": 68379.1311, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 5.100606060606061, + "grad_norm": 0.007273050956428051, + "learning_rate": 4.92067018082596e-05, + "loss": 0.012176580727100372, + "num_input_tokens_seen": 137820416, + "step": 8416, + "train_runtime": 68387.2395, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 5.1012121212121215, + "grad_norm": 0.006581771187484264, + "learning_rate": 4.919708689426651e-05, + "loss": 0.011850250884890556, + "num_input_tokens_seen": 137836792, + "step": 8417, + "train_runtime": 68395.3495, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 5.101818181818182, + "grad_norm": 0.006332386750727892, + "learning_rate": 4.918747200997162e-05, + "loss": 0.011502680368721485, + "num_input_tokens_seen": 137853168, + "step": 8418, + "train_runtime": 68403.462, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 5.1024242424242425, + "grad_norm": 0.0059815822169184685, + "learning_rate": 4.917785715573055e-05, + "loss": 0.011529969051480293, + "num_input_tokens_seen": 137869544, + "step": 8419, + "train_runtime": 68411.5733, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 5.1030303030303035, + "grad_norm": 0.006126226857304573, + "learning_rate": 4.916824233189892e-05, + "loss": 0.012681186199188232, + "num_input_tokens_seen": 137885920, + "step": 8420, + "train_runtime": 68419.6828, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 5.1036363636363635, + "grad_norm": 0.007824774831533432, + "learning_rate": 4.915862753883241e-05, + "loss": 0.01275503821671009, + "num_input_tokens_seen": 137902296, + "step": 8421, + "train_runtime": 68427.7963, + "train_tokens_per_second": 2015.296 + }, + { + "epoch": 5.1042424242424245, + "grad_norm": 0.005485007539391518, + "learning_rate": 4.9149012776886625e-05, + "loss": 0.011645370163023472, + "num_input_tokens_seen": 137918672, + "step": 8422, + "train_runtime": 68435.9089, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 5.1048484848484845, + "grad_norm": 0.009094040840864182, + "learning_rate": 4.913939804641721e-05, + "loss": 0.012506123632192612, + "num_input_tokens_seen": 137935048, + "step": 8423, + "train_runtime": 68444.0307, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 5.1054545454545455, + "grad_norm": 0.006860548164695501, + "learning_rate": 4.912978334777975e-05, + "loss": 0.011691117659211159, + "num_input_tokens_seen": 137951424, + "step": 8424, + "train_runtime": 68452.1427, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 5.106060606060606, + "grad_norm": 0.00643687741830945, + "learning_rate": 4.912016868132994e-05, + "loss": 0.011544311419129372, + "num_input_tokens_seen": 137967800, + "step": 8425, + "train_runtime": 68460.2559, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 5.1066666666666665, + "grad_norm": 0.00649132439866662, + "learning_rate": 4.911055404742338e-05, + "loss": 0.011879927478730679, + "num_input_tokens_seen": 137984176, + "step": 8426, + "train_runtime": 68468.3731, + "train_tokens_per_second": 2015.298 + }, + { + "epoch": 5.107272727272727, + "grad_norm": 0.0080309072509408, + "learning_rate": 4.910093944641569e-05, + "loss": 0.012078572995960712, + "num_input_tokens_seen": 138000552, + "step": 8427, + "train_runtime": 68476.4819, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 5.1078787878787875, + "grad_norm": 0.006285647861659527, + "learning_rate": 4.909132487866248e-05, + "loss": 0.011814765632152557, + "num_input_tokens_seen": 138016928, + "step": 8428, + "train_runtime": 68484.5932, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 5.108484848484848, + "grad_norm": 0.0024470568168908358, + "learning_rate": 4.908171034451941e-05, + "loss": 0.01168534904718399, + "num_input_tokens_seen": 138033304, + "step": 8429, + "train_runtime": 68492.7065, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 5.109090909090909, + "grad_norm": 0.005302962847054005, + "learning_rate": 4.9072095844342087e-05, + "loss": 0.012831909582018852, + "num_input_tokens_seen": 138049680, + "step": 8430, + "train_runtime": 68500.8164, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 5.109696969696969, + "grad_norm": 0.007304329890757799, + "learning_rate": 4.906248137848613e-05, + "loss": 0.012005711905658245, + "num_input_tokens_seen": 138066056, + "step": 8431, + "train_runtime": 68508.9307, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 5.11030303030303, + "grad_norm": 0.005268486216664314, + "learning_rate": 4.9052866947307147e-05, + "loss": 0.01165834628045559, + "num_input_tokens_seen": 138082432, + "step": 8432, + "train_runtime": 68517.0398, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 5.110909090909091, + "grad_norm": 0.007820291444659233, + "learning_rate": 4.904325255116079e-05, + "loss": 0.013074830174446106, + "num_input_tokens_seen": 138098808, + "step": 8433, + "train_runtime": 68525.1502, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 5.111515151515151, + "grad_norm": 0.005803828127682209, + "learning_rate": 4.9033638190402654e-05, + "loss": 0.011833534575998783, + "num_input_tokens_seen": 138115184, + "step": 8434, + "train_runtime": 68533.2592, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 5.112121212121212, + "grad_norm": 0.005016200244426727, + "learning_rate": 4.9024023865388354e-05, + "loss": 0.01123079564422369, + "num_input_tokens_seen": 138131560, + "step": 8435, + "train_runtime": 68541.3646, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 5.112727272727272, + "grad_norm": 0.006215351168066263, + "learning_rate": 4.901440957647352e-05, + "loss": 0.011754140257835388, + "num_input_tokens_seen": 138147936, + "step": 8436, + "train_runtime": 68549.475, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.113333333333333, + "grad_norm": 0.007914964109659195, + "learning_rate": 4.900479532401373e-05, + "loss": 0.012089340947568417, + "num_input_tokens_seen": 138164312, + "step": 8437, + "train_runtime": 68557.586, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.113939393939394, + "grad_norm": 0.003324261400848627, + "learning_rate": 4.8995181108364655e-05, + "loss": 0.011731304228305817, + "num_input_tokens_seen": 138180688, + "step": 8438, + "train_runtime": 68565.6928, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.114545454545454, + "grad_norm": 0.00914143305271864, + "learning_rate": 4.898556692988186e-05, + "loss": 0.012389862909913063, + "num_input_tokens_seen": 138197064, + "step": 8439, + "train_runtime": 68573.8038, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.115151515151515, + "grad_norm": 0.005871521309018135, + "learning_rate": 4.897595278892097e-05, + "loss": 0.011237229220569134, + "num_input_tokens_seen": 138213440, + "step": 8440, + "train_runtime": 68581.916, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.115757575757576, + "grad_norm": 0.005657857283949852, + "learning_rate": 4.8966338685837586e-05, + "loss": 0.01288657821714878, + "num_input_tokens_seen": 138229816, + "step": 8441, + "train_runtime": 68590.0312, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 5.116363636363636, + "grad_norm": 0.006155570037662983, + "learning_rate": 4.895672462098733e-05, + "loss": 0.011405151337385178, + "num_input_tokens_seen": 138246192, + "step": 8442, + "train_runtime": 68598.1426, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 5.116969696969697, + "grad_norm": 0.007509998511523008, + "learning_rate": 4.89471105947258e-05, + "loss": 0.012355961836874485, + "num_input_tokens_seen": 138262568, + "step": 8443, + "train_runtime": 68606.2548, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.117575757575757, + "grad_norm": 0.007060471456497908, + "learning_rate": 4.893749660740859e-05, + "loss": 0.012099642306566238, + "num_input_tokens_seen": 138278944, + "step": 8444, + "train_runtime": 68614.3677, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.118181818181818, + "grad_norm": 0.004732118453830481, + "learning_rate": 4.892788265939132e-05, + "loss": 0.011980393901467323, + "num_input_tokens_seen": 138295320, + "step": 8445, + "train_runtime": 68622.4797, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.118787878787879, + "grad_norm": 0.009430364705622196, + "learning_rate": 4.891826875102958e-05, + "loss": 0.013036306016147137, + "num_input_tokens_seen": 138311696, + "step": 8446, + "train_runtime": 68630.5915, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 5.119393939393939, + "grad_norm": 0.009887597523629665, + "learning_rate": 4.890865488267897e-05, + "loss": 0.012413701973855495, + "num_input_tokens_seen": 138328072, + "step": 8447, + "train_runtime": 68638.7034, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 5.12, + "grad_norm": 0.006163928657770157, + "learning_rate": 4.889904105469509e-05, + "loss": 0.012206271290779114, + "num_input_tokens_seen": 138344448, + "step": 8448, + "train_runtime": 68646.8136, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 5.120606060606061, + "grad_norm": 0.0050559923984110355, + "learning_rate": 4.888942726743353e-05, + "loss": 0.011568745598196983, + "num_input_tokens_seen": 138360824, + "step": 8449, + "train_runtime": 68654.9313, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 5.121212121212121, + "grad_norm": 0.006056637968868017, + "learning_rate": 4.887981352124991e-05, + "loss": 0.011100533418357372, + "num_input_tokens_seen": 138377200, + "step": 8450, + "train_runtime": 68663.0406, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 5.121818181818182, + "grad_norm": 0.006313201505690813, + "learning_rate": 4.88701998164998e-05, + "loss": 0.012572193518280983, + "num_input_tokens_seen": 138393576, + "step": 8451, + "train_runtime": 68671.156, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 5.122424242424242, + "grad_norm": 0.0070576430298388, + "learning_rate": 4.8860586153538795e-05, + "loss": 0.010852665640413761, + "num_input_tokens_seen": 138409952, + "step": 8452, + "train_runtime": 68679.2677, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 5.123030303030303, + "grad_norm": 0.00710680428892374, + "learning_rate": 4.8850972532722474e-05, + "loss": 0.01174217090010643, + "num_input_tokens_seen": 138426328, + "step": 8453, + "train_runtime": 68687.3784, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.123636363636364, + "grad_norm": 0.010778145864605904, + "learning_rate": 4.884135895440647e-05, + "loss": 0.0120486319065094, + "num_input_tokens_seen": 138442704, + "step": 8454, + "train_runtime": 68695.4891, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.124242424242424, + "grad_norm": 0.006861309055238962, + "learning_rate": 4.883174541894633e-05, + "loss": 0.012242753058671951, + "num_input_tokens_seen": 138459080, + "step": 8455, + "train_runtime": 68703.6036, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.124848484848485, + "grad_norm": 0.004329788964241743, + "learning_rate": 4.8822131926697656e-05, + "loss": 0.012129007838666439, + "num_input_tokens_seen": 138475456, + "step": 8456, + "train_runtime": 68711.7156, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.125454545454545, + "grad_norm": 0.005628506187349558, + "learning_rate": 4.8812518478016006e-05, + "loss": 0.012305798009037971, + "num_input_tokens_seen": 138491832, + "step": 8457, + "train_runtime": 68719.8304, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.126060606060606, + "grad_norm": 0.0030406410805881023, + "learning_rate": 4.8802905073257014e-05, + "loss": 0.01194281317293644, + "num_input_tokens_seen": 138508208, + "step": 8458, + "train_runtime": 68727.9433, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.126666666666667, + "grad_norm": 0.007934898138046265, + "learning_rate": 4.879329171277622e-05, + "loss": 0.010454187169671059, + "num_input_tokens_seen": 138524584, + "step": 8459, + "train_runtime": 68736.0577, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.127272727272727, + "grad_norm": 0.006787837482988834, + "learning_rate": 4.878367839692922e-05, + "loss": 0.011807831935584545, + "num_input_tokens_seen": 138540960, + "step": 8460, + "train_runtime": 68744.1708, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.127878787878788, + "grad_norm": 0.008294179104268551, + "learning_rate": 4.8774065126071586e-05, + "loss": 0.01169646717607975, + "num_input_tokens_seen": 138557336, + "step": 8461, + "train_runtime": 68752.2836, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.128484848484849, + "grad_norm": 0.007297551725059748, + "learning_rate": 4.876445190055888e-05, + "loss": 0.01320556178689003, + "num_input_tokens_seen": 138573712, + "step": 8462, + "train_runtime": 68760.397, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 5.129090909090909, + "grad_norm": 0.006557761691510677, + "learning_rate": 4.875483872074672e-05, + "loss": 0.013023246079683304, + "num_input_tokens_seen": 138590088, + "step": 8463, + "train_runtime": 68768.51, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 5.12969696969697, + "grad_norm": 0.0077821542508900166, + "learning_rate": 4.874522558699064e-05, + "loss": 0.01098568458110094, + "num_input_tokens_seen": 138606464, + "step": 8464, + "train_runtime": 68776.6192, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.13030303030303, + "grad_norm": 0.00819367915391922, + "learning_rate": 4.873561249964622e-05, + "loss": 0.011966570280492306, + "num_input_tokens_seen": 138622840, + "step": 8465, + "train_runtime": 68784.7304, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.130909090909091, + "grad_norm": 0.006254709791392088, + "learning_rate": 4.872599945906901e-05, + "loss": 0.012690999545156956, + "num_input_tokens_seen": 138639216, + "step": 8466, + "train_runtime": 68792.8436, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 5.131515151515152, + "grad_norm": 0.008464844897389412, + "learning_rate": 4.871638646561462e-05, + "loss": 0.012700291350483894, + "num_input_tokens_seen": 138655592, + "step": 8467, + "train_runtime": 68800.9536, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 5.132121212121212, + "grad_norm": 0.00782747846096754, + "learning_rate": 4.8706773519638605e-05, + "loss": 0.012525822035968304, + "num_input_tokens_seen": 138671968, + "step": 8468, + "train_runtime": 68809.0633, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 5.132727272727273, + "grad_norm": 0.004464113153517246, + "learning_rate": 4.8697160621496516e-05, + "loss": 0.011715728789567947, + "num_input_tokens_seen": 138688344, + "step": 8469, + "train_runtime": 68817.1752, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 5.133333333333334, + "grad_norm": 0.007959314621984959, + "learning_rate": 4.8687547771543887e-05, + "loss": 0.013299328275024891, + "num_input_tokens_seen": 138704720, + "step": 8470, + "train_runtime": 68825.2881, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 5.133939393939394, + "grad_norm": 0.006287924014031887, + "learning_rate": 4.8677934970136335e-05, + "loss": 0.01328364573419094, + "num_input_tokens_seen": 138721096, + "step": 8471, + "train_runtime": 68833.4018, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 5.134545454545455, + "grad_norm": 0.004370466805994511, + "learning_rate": 4.86683222176294e-05, + "loss": 0.011147796176373959, + "num_input_tokens_seen": 138737472, + "step": 8472, + "train_runtime": 68841.513, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 5.135151515151515, + "grad_norm": 0.005385567434132099, + "learning_rate": 4.865870951437863e-05, + "loss": 0.01179320178925991, + "num_input_tokens_seen": 138753848, + "step": 8473, + "train_runtime": 68849.6307, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 5.135757575757576, + "grad_norm": 0.00548461452126503, + "learning_rate": 4.8649096860739564e-05, + "loss": 0.01178848184645176, + "num_input_tokens_seen": 138770224, + "step": 8474, + "train_runtime": 68857.7439, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 5.136363636363637, + "grad_norm": 0.005376025103032589, + "learning_rate": 4.863948425706779e-05, + "loss": 0.01102517545223236, + "num_input_tokens_seen": 138786600, + "step": 8475, + "train_runtime": 68865.8578, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 5.136969696969697, + "grad_norm": 0.007151973433792591, + "learning_rate": 4.862987170371884e-05, + "loss": 0.012838508002460003, + "num_input_tokens_seen": 138802976, + "step": 8476, + "train_runtime": 68873.9703, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 5.137575757575758, + "grad_norm": 0.004434673581272364, + "learning_rate": 4.862025920104828e-05, + "loss": 0.012930973432958126, + "num_input_tokens_seen": 138819352, + "step": 8477, + "train_runtime": 68882.082, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 5.138181818181819, + "grad_norm": 0.005240186117589474, + "learning_rate": 4.8610646749411625e-05, + "loss": 0.01121505256742239, + "num_input_tokens_seen": 138835728, + "step": 8478, + "train_runtime": 68890.1949, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 5.138787878787879, + "grad_norm": 0.009111362509429455, + "learning_rate": 4.8601034349164454e-05, + "loss": 0.012310883961617947, + "num_input_tokens_seen": 138852104, + "step": 8479, + "train_runtime": 68898.3088, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 5.13939393939394, + "grad_norm": 0.0077020167373120785, + "learning_rate": 4.8591422000662296e-05, + "loss": 0.011116789653897285, + "num_input_tokens_seen": 138868480, + "step": 8480, + "train_runtime": 68906.4195, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 5.14, + "grad_norm": 0.007785317953675985, + "learning_rate": 4.85818097042607e-05, + "loss": 0.011453649029135704, + "num_input_tokens_seen": 138884856, + "step": 8481, + "train_runtime": 68914.532, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 5.140606060606061, + "grad_norm": 0.006020388565957546, + "learning_rate": 4.85721974603152e-05, + "loss": 0.011771420016884804, + "num_input_tokens_seen": 138901232, + "step": 8482, + "train_runtime": 68922.6432, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 5.141212121212122, + "grad_norm": 0.005110946949571371, + "learning_rate": 4.856258526918132e-05, + "loss": 0.011589819565415382, + "num_input_tokens_seen": 138917608, + "step": 8483, + "train_runtime": 68930.7541, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 5.141818181818182, + "grad_norm": 0.0018700766377151012, + "learning_rate": 4.8552973131214626e-05, + "loss": 0.011120017617940903, + "num_input_tokens_seen": 138933984, + "step": 8484, + "train_runtime": 68938.867, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 5.142424242424243, + "grad_norm": 0.006462767720222473, + "learning_rate": 4.854336104677064e-05, + "loss": 0.011438325047492981, + "num_input_tokens_seen": 138950360, + "step": 8485, + "train_runtime": 68946.9803, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 5.143030303030303, + "grad_norm": 0.007330893073230982, + "learning_rate": 4.8533749016204885e-05, + "loss": 0.012583397328853607, + "num_input_tokens_seen": 138966736, + "step": 8486, + "train_runtime": 68955.0918, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 5.143636363636364, + "grad_norm": 0.005795520264655352, + "learning_rate": 4.852413703987289e-05, + "loss": 0.011503735557198524, + "num_input_tokens_seen": 138983112, + "step": 8487, + "train_runtime": 68963.2026, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 5.1442424242424245, + "grad_norm": 0.0069251712411642075, + "learning_rate": 4.85145251181302e-05, + "loss": 0.01243771892040968, + "num_input_tokens_seen": 138999488, + "step": 8488, + "train_runtime": 68971.3137, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 5.144848484848485, + "grad_norm": 0.005328953731805086, + "learning_rate": 4.850491325133234e-05, + "loss": 0.013121155090630054, + "num_input_tokens_seen": 139015864, + "step": 8489, + "train_runtime": 68979.4351, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 5.1454545454545455, + "grad_norm": 0.006525726057589054, + "learning_rate": 4.849530143983483e-05, + "loss": 0.011037876829504967, + "num_input_tokens_seen": 139032240, + "step": 8490, + "train_runtime": 68987.5429, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 5.1460606060606064, + "grad_norm": 0.007074307184666395, + "learning_rate": 4.848568968399317e-05, + "loss": 0.01210896484553814, + "num_input_tokens_seen": 139048616, + "step": 8491, + "train_runtime": 68995.6532, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 5.1466666666666665, + "grad_norm": 0.011399013921618462, + "learning_rate": 4.847607798416292e-05, + "loss": 0.01269868016242981, + "num_input_tokens_seen": 139064992, + "step": 8492, + "train_runtime": 69003.767, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 5.147272727272727, + "grad_norm": 0.007042056415230036, + "learning_rate": 4.846646634069957e-05, + "loss": 0.011935079470276833, + "num_input_tokens_seen": 139081368, + "step": 8493, + "train_runtime": 69011.8784, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 5.1478787878787875, + "grad_norm": 0.0017905592685565352, + "learning_rate": 4.8456854753958644e-05, + "loss": 0.01075215358287096, + "num_input_tokens_seen": 139097744, + "step": 8494, + "train_runtime": 69019.9914, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 5.148484848484848, + "grad_norm": 0.00707419915124774, + "learning_rate": 4.844724322429565e-05, + "loss": 0.0123128741979599, + "num_input_tokens_seen": 139114120, + "step": 8495, + "train_runtime": 69028.1048, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 5.149090909090909, + "grad_norm": 0.00721976812928915, + "learning_rate": 4.8437631752066103e-05, + "loss": 0.012307468801736832, + "num_input_tokens_seen": 139130496, + "step": 8496, + "train_runtime": 69036.2154, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 5.149696969696969, + "grad_norm": 0.004948594607412815, + "learning_rate": 4.842802033762553e-05, + "loss": 0.012791519984602928, + "num_input_tokens_seen": 139146872, + "step": 8497, + "train_runtime": 69044.3322, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 5.15030303030303, + "grad_norm": 0.004724833648651838, + "learning_rate": 4.841840898132942e-05, + "loss": 0.01241736114025116, + "num_input_tokens_seen": 139163248, + "step": 8498, + "train_runtime": 69052.4438, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 5.150909090909091, + "grad_norm": 0.004511399660259485, + "learning_rate": 4.840879768353326e-05, + "loss": 0.010619336739182472, + "num_input_tokens_seen": 139179624, + "step": 8499, + "train_runtime": 69060.5576, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 5.151515151515151, + "grad_norm": 0.009481120854616165, + "learning_rate": 4.8399186444592596e-05, + "loss": 0.012699050828814507, + "num_input_tokens_seen": 139196000, + "step": 8500, + "train_runtime": 69068.6698, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 5.152121212121212, + "grad_norm": 0.00604023365303874, + "learning_rate": 4.8389575264862906e-05, + "loss": 0.012287267483770847, + "num_input_tokens_seen": 139212376, + "step": 8501, + "train_runtime": 69077.6894, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 5.152727272727272, + "grad_norm": 0.0032092025503516197, + "learning_rate": 4.837996414469969e-05, + "loss": 0.01242920197546482, + "num_input_tokens_seen": 139228752, + "step": 8502, + "train_runtime": 69085.7992, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 5.153333333333333, + "grad_norm": 0.005717278923839331, + "learning_rate": 4.8370353084458446e-05, + "loss": 0.011972500942647457, + "num_input_tokens_seen": 139245128, + "step": 8503, + "train_runtime": 69093.9115, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 5.153939393939394, + "grad_norm": 0.004982383456081152, + "learning_rate": 4.836074208449466e-05, + "loss": 0.010454116389155388, + "num_input_tokens_seen": 139261504, + "step": 8504, + "train_runtime": 69102.0304, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.154545454545454, + "grad_norm": 0.0033715595491230488, + "learning_rate": 4.835113114516384e-05, + "loss": 0.011099890805780888, + "num_input_tokens_seen": 139277880, + "step": 8505, + "train_runtime": 69110.15, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.155151515151515, + "grad_norm": 0.006629358511418104, + "learning_rate": 4.834152026682147e-05, + "loss": 0.012827780097723007, + "num_input_tokens_seen": 139294256, + "step": 8506, + "train_runtime": 69118.2712, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.155757575757576, + "grad_norm": 0.0008176225237548351, + "learning_rate": 4.8331909449823035e-05, + "loss": 0.011471408419311047, + "num_input_tokens_seen": 139310632, + "step": 8507, + "train_runtime": 69126.3895, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.156363636363636, + "grad_norm": 0.006321595050394535, + "learning_rate": 4.8322298694524005e-05, + "loss": 0.012990044429898262, + "num_input_tokens_seen": 139327008, + "step": 8508, + "train_runtime": 69134.5106, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.156969696969697, + "grad_norm": 0.0021988311782479286, + "learning_rate": 4.831268800127989e-05, + "loss": 0.012117387726902962, + "num_input_tokens_seen": 139343384, + "step": 8509, + "train_runtime": 69142.6309, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.157575757575757, + "grad_norm": 0.005855946335941553, + "learning_rate": 4.8303077370446165e-05, + "loss": 0.011530320160090923, + "num_input_tokens_seen": 139359760, + "step": 8510, + "train_runtime": 69150.7509, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.158181818181818, + "grad_norm": 0.007952646352350712, + "learning_rate": 4.829346680237831e-05, + "loss": 0.012953361496329308, + "num_input_tokens_seen": 139376136, + "step": 8511, + "train_runtime": 69158.8692, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.158787878787879, + "grad_norm": 0.003816489363089204, + "learning_rate": 4.828385629743176e-05, + "loss": 0.011916759423911572, + "num_input_tokens_seen": 139392512, + "step": 8512, + "train_runtime": 69166.9863, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.159393939393939, + "grad_norm": 0.0063067153096199036, + "learning_rate": 4.827424585596206e-05, + "loss": 0.012353610247373581, + "num_input_tokens_seen": 139408888, + "step": 8513, + "train_runtime": 69175.1041, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.16, + "grad_norm": 0.005924342665821314, + "learning_rate": 4.8264635478324636e-05, + "loss": 0.011833631433546543, + "num_input_tokens_seen": 139425264, + "step": 8514, + "train_runtime": 69183.2196, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 5.16060606060606, + "grad_norm": 0.0066050016321241856, + "learning_rate": 4.825502516487497e-05, + "loss": 0.011806696653366089, + "num_input_tokens_seen": 139441640, + "step": 8515, + "train_runtime": 69191.3346, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 5.161212121212121, + "grad_norm": 0.004518529400229454, + "learning_rate": 4.8245414915968496e-05, + "loss": 0.012889519333839417, + "num_input_tokens_seen": 139458016, + "step": 8516, + "train_runtime": 69199.451, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 5.161818181818182, + "grad_norm": 0.008397442288696766, + "learning_rate": 4.823580473196073e-05, + "loss": 0.011356288567185402, + "num_input_tokens_seen": 139474392, + "step": 8517, + "train_runtime": 69207.5655, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.162424242424242, + "grad_norm": 0.007890271954238415, + "learning_rate": 4.822619461320712e-05, + "loss": 0.012711755931377411, + "num_input_tokens_seen": 139490768, + "step": 8518, + "train_runtime": 69215.6756, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.163030303030303, + "grad_norm": 0.006444644182920456, + "learning_rate": 4.8216584560063116e-05, + "loss": 0.011674110777676105, + "num_input_tokens_seen": 139507144, + "step": 8519, + "train_runtime": 69223.7945, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.163636363636364, + "grad_norm": 0.021814431995153427, + "learning_rate": 4.8206974572884144e-05, + "loss": 0.010846326127648354, + "num_input_tokens_seen": 139523520, + "step": 8520, + "train_runtime": 69231.92, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.164242424242424, + "grad_norm": 0.0033865543082356453, + "learning_rate": 4.819736465202572e-05, + "loss": 0.013397188857197762, + "num_input_tokens_seen": 139539896, + "step": 8521, + "train_runtime": 69240.0381, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.164848484848485, + "grad_norm": 0.005809765309095383, + "learning_rate": 4.8187754797843264e-05, + "loss": 0.01173045951873064, + "num_input_tokens_seen": 139556272, + "step": 8522, + "train_runtime": 69248.1556, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 5.165454545454545, + "grad_norm": 0.006946791894733906, + "learning_rate": 4.817814501069224e-05, + "loss": 0.012032832950353622, + "num_input_tokens_seen": 139572648, + "step": 8523, + "train_runtime": 69256.2719, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 5.166060606060606, + "grad_norm": 0.0068573360331356525, + "learning_rate": 4.816853529092805e-05, + "loss": 0.011404208838939667, + "num_input_tokens_seen": 139589024, + "step": 8524, + "train_runtime": 69264.3821, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 5.166666666666667, + "grad_norm": 0.010204959660768509, + "learning_rate": 4.815892563890619e-05, + "loss": 0.013789079152047634, + "num_input_tokens_seen": 139605400, + "step": 8525, + "train_runtime": 69272.5004, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 5.167272727272727, + "grad_norm": 0.012129289098083973, + "learning_rate": 4.8149316054982095e-05, + "loss": 0.01190265268087387, + "num_input_tokens_seen": 139621776, + "step": 8526, + "train_runtime": 69280.6159, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 5.167878787878788, + "grad_norm": 0.003300922457128763, + "learning_rate": 4.813970653951119e-05, + "loss": 0.010240457952022552, + "num_input_tokens_seen": 139638152, + "step": 8527, + "train_runtime": 69288.7345, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 5.168484848484849, + "grad_norm": 0.003528187284246087, + "learning_rate": 4.813009709284893e-05, + "loss": 0.011903305537998676, + "num_input_tokens_seen": 139654528, + "step": 8528, + "train_runtime": 69296.8485, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 5.169090909090909, + "grad_norm": 0.01025655958801508, + "learning_rate": 4.812048771535071e-05, + "loss": 0.013020878657698631, + "num_input_tokens_seen": 139670904, + "step": 8529, + "train_runtime": 69304.9665, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 5.16969696969697, + "grad_norm": 0.00466802017763257, + "learning_rate": 4.8110878407372e-05, + "loss": 0.010772459208965302, + "num_input_tokens_seen": 139687280, + "step": 8530, + "train_runtime": 69313.0906, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 5.17030303030303, + "grad_norm": 0.006714302580803633, + "learning_rate": 4.8101269169268227e-05, + "loss": 0.010969799011945724, + "num_input_tokens_seen": 139703656, + "step": 8531, + "train_runtime": 69321.2001, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 5.170909090909091, + "grad_norm": 0.005746109876781702, + "learning_rate": 4.809166000139481e-05, + "loss": 0.011808010749518871, + "num_input_tokens_seen": 139720032, + "step": 8532, + "train_runtime": 69329.3167, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.171515151515152, + "grad_norm": 0.004156254697591066, + "learning_rate": 4.8082050904107154e-05, + "loss": 0.011638832278549671, + "num_input_tokens_seen": 139736408, + "step": 8533, + "train_runtime": 69337.4383, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.172121212121212, + "grad_norm": 0.007530985865741968, + "learning_rate": 4.807244187776072e-05, + "loss": 0.012333003804087639, + "num_input_tokens_seen": 139752784, + "step": 8534, + "train_runtime": 69345.5509, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.172727272727273, + "grad_norm": 0.0077163283713161945, + "learning_rate": 4.80628329227109e-05, + "loss": 0.011873026378452778, + "num_input_tokens_seen": 139769160, + "step": 8535, + "train_runtime": 69353.669, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.173333333333334, + "grad_norm": 0.005812848452478647, + "learning_rate": 4.8053224039313125e-05, + "loss": 0.011876983568072319, + "num_input_tokens_seen": 139785536, + "step": 8536, + "train_runtime": 69361.7909, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.173939393939394, + "grad_norm": 0.005772212520241737, + "learning_rate": 4.8043615227922786e-05, + "loss": 0.012128006666898727, + "num_input_tokens_seen": 139801912, + "step": 8537, + "train_runtime": 69369.9078, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.174545454545455, + "grad_norm": 0.006835779175162315, + "learning_rate": 4.803400648889532e-05, + "loss": 0.011642688885331154, + "num_input_tokens_seen": 139818288, + "step": 8538, + "train_runtime": 69378.0308, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.175151515151515, + "grad_norm": 0.005803464446216822, + "learning_rate": 4.8024397822586125e-05, + "loss": 0.012720332480967045, + "num_input_tokens_seen": 139834664, + "step": 8539, + "train_runtime": 69386.1432, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.175757575757576, + "grad_norm": 0.006406168453395367, + "learning_rate": 4.801478922935061e-05, + "loss": 0.011971108615398407, + "num_input_tokens_seen": 139851040, + "step": 8540, + "train_runtime": 69394.2639, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.176363636363637, + "grad_norm": 0.010643348097801208, + "learning_rate": 4.800518070954416e-05, + "loss": 0.01310960203409195, + "num_input_tokens_seen": 139867416, + "step": 8541, + "train_runtime": 69402.3778, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.176969696969697, + "grad_norm": 0.008818886242806911, + "learning_rate": 4.799557226352221e-05, + "loss": 0.012243218719959259, + "num_input_tokens_seen": 139883792, + "step": 8542, + "train_runtime": 69410.497, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.177575757575758, + "grad_norm": 0.005613622721284628, + "learning_rate": 4.798596389164013e-05, + "loss": 0.010805216617882252, + "num_input_tokens_seen": 139900168, + "step": 8543, + "train_runtime": 69418.6172, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.178181818181818, + "grad_norm": 0.005223629530519247, + "learning_rate": 4.797635559425333e-05, + "loss": 0.011941425502300262, + "num_input_tokens_seen": 139916544, + "step": 8544, + "train_runtime": 69426.731, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.178787878787879, + "grad_norm": 0.004163004457950592, + "learning_rate": 4.796674737171718e-05, + "loss": 0.011610645800828934, + "num_input_tokens_seen": 139932920, + "step": 8545, + "train_runtime": 69434.8466, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 5.17939393939394, + "grad_norm": 0.0062804087065160275, + "learning_rate": 4.79571392243871e-05, + "loss": 0.012165388092398643, + "num_input_tokens_seen": 139949296, + "step": 8546, + "train_runtime": 69442.9656, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 5.18, + "grad_norm": 0.01117085013538599, + "learning_rate": 4.794753115261846e-05, + "loss": 0.013356831856071949, + "num_input_tokens_seen": 139965672, + "step": 8547, + "train_runtime": 69451.0854, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 5.180606060606061, + "grad_norm": 0.009530738927423954, + "learning_rate": 4.7937923156766646e-05, + "loss": 0.012634611688554287, + "num_input_tokens_seen": 139982048, + "step": 8548, + "train_runtime": 69459.2012, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 5.181212121212122, + "grad_norm": 0.004109921865165234, + "learning_rate": 4.7928315237187034e-05, + "loss": 0.011661773547530174, + "num_input_tokens_seen": 139998424, + "step": 8549, + "train_runtime": 69467.3215, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 5.181818181818182, + "grad_norm": 0.0037713281344622374, + "learning_rate": 4.7918707394235004e-05, + "loss": 0.01101826224476099, + "num_input_tokens_seen": 140014800, + "step": 8550, + "train_runtime": 69475.437, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.182424242424243, + "grad_norm": 0.008089405484497547, + "learning_rate": 4.790909962826594e-05, + "loss": 0.012053949758410454, + "num_input_tokens_seen": 140031176, + "step": 8551, + "train_runtime": 69483.5584, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.183030303030303, + "grad_norm": 0.009735433384776115, + "learning_rate": 4.789949193963521e-05, + "loss": 0.011156149208545685, + "num_input_tokens_seen": 140047552, + "step": 8552, + "train_runtime": 69491.6789, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.183636363636364, + "grad_norm": 0.0067846947349607944, + "learning_rate": 4.788988432869818e-05, + "loss": 0.01192066166549921, + "num_input_tokens_seen": 140063928, + "step": 8553, + "train_runtime": 69499.7971, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.1842424242424245, + "grad_norm": 0.00918307900428772, + "learning_rate": 4.788027679581021e-05, + "loss": 0.012324389070272446, + "num_input_tokens_seen": 140080304, + "step": 8554, + "train_runtime": 69507.937, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.184848484848485, + "grad_norm": 0.004101764876395464, + "learning_rate": 4.787066934132669e-05, + "loss": 0.012349788099527359, + "num_input_tokens_seen": 140096680, + "step": 8555, + "train_runtime": 69516.0566, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.1854545454545455, + "grad_norm": 0.00533085223287344, + "learning_rate": 4.786106196560296e-05, + "loss": 0.011815211735665798, + "num_input_tokens_seen": 140113056, + "step": 8556, + "train_runtime": 69524.1688, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.1860606060606065, + "grad_norm": 0.00608861492946744, + "learning_rate": 4.785145466899438e-05, + "loss": 0.01046735979616642, + "num_input_tokens_seen": 140129432, + "step": 8557, + "train_runtime": 69532.2816, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 5.1866666666666665, + "grad_norm": 0.004049048293381929, + "learning_rate": 4.78418474518563e-05, + "loss": 0.011096538975834846, + "num_input_tokens_seen": 140145808, + "step": 8558, + "train_runtime": 69540.3946, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 5.1872727272727275, + "grad_norm": 0.005977073684334755, + "learning_rate": 4.7832240314544095e-05, + "loss": 0.012320302426815033, + "num_input_tokens_seen": 140162184, + "step": 8559, + "train_runtime": 69548.5077, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 5.1878787878787875, + "grad_norm": 0.008832648396492004, + "learning_rate": 4.782263325741309e-05, + "loss": 0.012641483917832375, + "num_input_tokens_seen": 140178560, + "step": 8560, + "train_runtime": 69556.6209, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 5.1884848484848485, + "grad_norm": 0.0036235267762094736, + "learning_rate": 4.7813026280818645e-05, + "loss": 0.010775732807815075, + "num_input_tokens_seen": 140194936, + "step": 8561, + "train_runtime": 69564.7334, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 5.189090909090909, + "grad_norm": 0.008184530772268772, + "learning_rate": 4.7803419385116075e-05, + "loss": 0.011854710057377815, + "num_input_tokens_seen": 140211312, + "step": 8562, + "train_runtime": 69572.8459, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 5.1896969696969695, + "grad_norm": 0.007103292737156153, + "learning_rate": 4.779381257066078e-05, + "loss": 0.012100107967853546, + "num_input_tokens_seen": 140227688, + "step": 8563, + "train_runtime": 69580.9585, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 5.19030303030303, + "grad_norm": 0.005571722984313965, + "learning_rate": 4.7784205837808056e-05, + "loss": 0.011310772970318794, + "num_input_tokens_seen": 140244064, + "step": 8564, + "train_runtime": 69589.0707, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 5.190909090909091, + "grad_norm": 0.0046736025251448154, + "learning_rate": 4.7774599186913235e-05, + "loss": 0.012413127347826958, + "num_input_tokens_seen": 140260440, + "step": 8565, + "train_runtime": 69597.1842, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 5.191515151515151, + "grad_norm": 0.007212819531559944, + "learning_rate": 4.776499261833163e-05, + "loss": 0.012863670475780964, + "num_input_tokens_seen": 140276816, + "step": 8566, + "train_runtime": 69605.297, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 5.192121212121212, + "grad_norm": 0.004778503440320492, + "learning_rate": 4.775538613241863e-05, + "loss": 0.010700661689043045, + "num_input_tokens_seen": 140293192, + "step": 8567, + "train_runtime": 69613.4118, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 5.192727272727272, + "grad_norm": 0.00777791254222393, + "learning_rate": 4.774577972952953e-05, + "loss": 0.013233277015388012, + "num_input_tokens_seen": 140309568, + "step": 8568, + "train_runtime": 69621.5316, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 5.193333333333333, + "grad_norm": 0.008236539550125599, + "learning_rate": 4.773617341001964e-05, + "loss": 0.012857189401984215, + "num_input_tokens_seen": 140325944, + "step": 8569, + "train_runtime": 69629.6448, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 5.193939393939394, + "grad_norm": 0.004573076032102108, + "learning_rate": 4.772656717424428e-05, + "loss": 0.012121538631618023, + "num_input_tokens_seen": 140342320, + "step": 8570, + "train_runtime": 69637.7593, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 5.194545454545454, + "grad_norm": 0.0036026202142238617, + "learning_rate": 4.771696102255876e-05, + "loss": 0.010814531706273556, + "num_input_tokens_seen": 140358696, + "step": 8571, + "train_runtime": 69645.8703, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 5.195151515151515, + "grad_norm": 0.003985235467553139, + "learning_rate": 4.7707354955318416e-05, + "loss": 0.011654849164187908, + "num_input_tokens_seen": 140375072, + "step": 8572, + "train_runtime": 69653.9802, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 5.195757575757575, + "grad_norm": 0.002963418373838067, + "learning_rate": 4.769774897287857e-05, + "loss": 0.012017029337584972, + "num_input_tokens_seen": 140391448, + "step": 8573, + "train_runtime": 69662.094, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 5.196363636363636, + "grad_norm": 0.0059015690349042416, + "learning_rate": 4.7688143075594485e-05, + "loss": 0.01178357657045126, + "num_input_tokens_seen": 140407824, + "step": 8574, + "train_runtime": 69670.2069, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 5.196969696969697, + "grad_norm": 0.006945443339645863, + "learning_rate": 4.767853726382148e-05, + "loss": 0.011834817938506603, + "num_input_tokens_seen": 140424200, + "step": 8575, + "train_runtime": 69678.3208, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 5.197575757575757, + "grad_norm": 0.0073396493680775166, + "learning_rate": 4.7668931537914865e-05, + "loss": 0.01187639869749546, + "num_input_tokens_seen": 140440576, + "step": 8576, + "train_runtime": 69686.4336, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 5.198181818181818, + "grad_norm": 0.004855509847402573, + "learning_rate": 4.765932589822995e-05, + "loss": 0.011236459948122501, + "num_input_tokens_seen": 140456952, + "step": 8577, + "train_runtime": 69694.5479, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 5.198787878787879, + "grad_norm": 0.006219340488314629, + "learning_rate": 4.764972034512201e-05, + "loss": 0.011676288209855556, + "num_input_tokens_seen": 140473328, + "step": 8578, + "train_runtime": 69702.6628, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 5.199393939393939, + "grad_norm": 0.0039581903256475925, + "learning_rate": 4.76401148789463e-05, + "loss": 0.011576162651181221, + "num_input_tokens_seen": 140489704, + "step": 8579, + "train_runtime": 69710.7749, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 5.2, + "grad_norm": 0.008309734985232353, + "learning_rate": 4.7630509500058174e-05, + "loss": 0.010298409499228, + "num_input_tokens_seen": 140506080, + "step": 8580, + "train_runtime": 69718.889, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 5.20060606060606, + "grad_norm": 0.004117004107683897, + "learning_rate": 4.762090420881289e-05, + "loss": 0.012077968567609787, + "num_input_tokens_seen": 140522456, + "step": 8581, + "train_runtime": 69727.001, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 5.201212121212121, + "grad_norm": 0.0053946468979120255, + "learning_rate": 4.761129900556574e-05, + "loss": 0.011988026089966297, + "num_input_tokens_seen": 140538832, + "step": 8582, + "train_runtime": 69735.1128, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 5.201818181818182, + "grad_norm": 0.0037660219240933657, + "learning_rate": 4.7601693890671945e-05, + "loss": 0.012136503122746944, + "num_input_tokens_seen": 140555208, + "step": 8583, + "train_runtime": 69743.2373, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 5.202424242424242, + "grad_norm": 0.007040717173367739, + "learning_rate": 4.759208886448684e-05, + "loss": 0.011898449622094631, + "num_input_tokens_seen": 140571584, + "step": 8584, + "train_runtime": 69751.3504, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 5.203030303030303, + "grad_norm": 0.0069558327086269855, + "learning_rate": 4.7582483927365695e-05, + "loss": 0.012454009614884853, + "num_input_tokens_seen": 140587960, + "step": 8585, + "train_runtime": 69759.4647, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 5.203636363636364, + "grad_norm": 0.007891633547842503, + "learning_rate": 4.757287907966375e-05, + "loss": 0.01206294633448124, + "num_input_tokens_seen": 140604336, + "step": 8586, + "train_runtime": 69767.5753, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 5.204242424242424, + "grad_norm": 0.007340052165091038, + "learning_rate": 4.756327432173628e-05, + "loss": 0.011942259036004543, + "num_input_tokens_seen": 140620712, + "step": 8587, + "train_runtime": 69775.6892, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 5.204848484848485, + "grad_norm": 0.005451074801385403, + "learning_rate": 4.7553669653938546e-05, + "loss": 0.012007256038486958, + "num_input_tokens_seen": 140637088, + "step": 8588, + "train_runtime": 69783.7995, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 5.205454545454545, + "grad_norm": 0.007575137075036764, + "learning_rate": 4.754406507662582e-05, + "loss": 0.01092148944735527, + "num_input_tokens_seen": 140653464, + "step": 8589, + "train_runtime": 69791.913, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 5.206060606060606, + "grad_norm": 0.003464600071310997, + "learning_rate": 4.753446059015334e-05, + "loss": 0.013013762421905994, + "num_input_tokens_seen": 140669840, + "step": 8590, + "train_runtime": 69800.0308, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 5.206666666666667, + "grad_norm": 0.007005032617598772, + "learning_rate": 4.752485619487635e-05, + "loss": 0.011923864483833313, + "num_input_tokens_seen": 140686216, + "step": 8591, + "train_runtime": 69808.1448, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 5.207272727272727, + "grad_norm": 0.005561599507927895, + "learning_rate": 4.751525189115011e-05, + "loss": 0.010566149838268757, + "num_input_tokens_seen": 140702592, + "step": 8592, + "train_runtime": 69816.2561, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 5.207878787878788, + "grad_norm": 0.006777828559279442, + "learning_rate": 4.750564767932988e-05, + "loss": 0.011943970806896687, + "num_input_tokens_seen": 140718968, + "step": 8593, + "train_runtime": 69824.3679, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 5.208484848484849, + "grad_norm": 0.009158704429864883, + "learning_rate": 4.7496043559770876e-05, + "loss": 0.01265780534595251, + "num_input_tokens_seen": 140735344, + "step": 8594, + "train_runtime": 69832.4791, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 5.209090909090909, + "grad_norm": 0.0038668494671583176, + "learning_rate": 4.7486439532828345e-05, + "loss": 0.011566397733986378, + "num_input_tokens_seen": 140751720, + "step": 8595, + "train_runtime": 69840.592, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 5.20969696969697, + "grad_norm": 0.010255240835249424, + "learning_rate": 4.747683559885751e-05, + "loss": 0.012970581650733948, + "num_input_tokens_seen": 140768096, + "step": 8596, + "train_runtime": 69848.7051, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 5.21030303030303, + "grad_norm": 0.00846676155924797, + "learning_rate": 4.7467231758213625e-05, + "loss": 0.011635896749794483, + "num_input_tokens_seen": 140784472, + "step": 8597, + "train_runtime": 69856.843, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 5.210909090909091, + "grad_norm": 0.004737788811326027, + "learning_rate": 4.745762801125191e-05, + "loss": 0.011674880981445312, + "num_input_tokens_seen": 140800848, + "step": 8598, + "train_runtime": 69864.9532, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 5.211515151515152, + "grad_norm": 0.008748599328100681, + "learning_rate": 4.7448024358327574e-05, + "loss": 0.012194178067147732, + "num_input_tokens_seen": 140817224, + "step": 8599, + "train_runtime": 69873.0669, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 5.212121212121212, + "grad_norm": 0.0031436006538569927, + "learning_rate": 4.743842079979584e-05, + "loss": 0.011021561920642853, + "num_input_tokens_seen": 140833600, + "step": 8600, + "train_runtime": 69881.1799, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 5.212727272727273, + "grad_norm": 0.007335609290748835, + "learning_rate": 4.742881733601195e-05, + "loss": 0.012234616093337536, + "num_input_tokens_seen": 140849976, + "step": 8601, + "train_runtime": 69890.262, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 5.213333333333333, + "grad_norm": 0.0023156842216849327, + "learning_rate": 4.741921396733109e-05, + "loss": 0.011113563552498817, + "num_input_tokens_seen": 140866352, + "step": 8602, + "train_runtime": 69898.3745, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 5.213939393939394, + "grad_norm": 0.008183425292372704, + "learning_rate": 4.740961069410848e-05, + "loss": 0.013082773424685001, + "num_input_tokens_seen": 140882728, + "step": 8603, + "train_runtime": 69906.4836, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.214545454545455, + "grad_norm": 0.012019401416182518, + "learning_rate": 4.740000751669932e-05, + "loss": 0.012346005067229271, + "num_input_tokens_seen": 140899104, + "step": 8604, + "train_runtime": 69914.5945, + "train_tokens_per_second": 2015.303 + }, + { + "epoch": 5.215151515151515, + "grad_norm": 0.005586654879152775, + "learning_rate": 4.739040443545883e-05, + "loss": 0.011713799089193344, + "num_input_tokens_seen": 140915480, + "step": 8605, + "train_runtime": 69922.7048, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.215757575757576, + "grad_norm": 0.006819733418524265, + "learning_rate": 4.7380801450742194e-05, + "loss": 0.011934585869312286, + "num_input_tokens_seen": 140931856, + "step": 8606, + "train_runtime": 69930.8168, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.216363636363637, + "grad_norm": 0.0060556442476809025, + "learning_rate": 4.7371198562904614e-05, + "loss": 0.01305705402046442, + "num_input_tokens_seen": 140948232, + "step": 8607, + "train_runtime": 69938.9313, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 5.216969696969697, + "grad_norm": 0.007495763245970011, + "learning_rate": 4.736159577230127e-05, + "loss": 0.012271364219486713, + "num_input_tokens_seen": 140964608, + "step": 8608, + "train_runtime": 69947.0422, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 5.217575757575758, + "grad_norm": 0.007667028345167637, + "learning_rate": 4.7351993079287364e-05, + "loss": 0.01085108332335949, + "num_input_tokens_seen": 140980984, + "step": 8609, + "train_runtime": 69955.1511, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 5.218181818181818, + "grad_norm": 0.004651032388210297, + "learning_rate": 4.734239048421808e-05, + "loss": 0.01220599003136158, + "num_input_tokens_seen": 140997360, + "step": 8610, + "train_runtime": 69963.2633, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.218787878787879, + "grad_norm": 0.006046372931450605, + "learning_rate": 4.73327879874486e-05, + "loss": 0.011228865012526512, + "num_input_tokens_seen": 141013736, + "step": 8611, + "train_runtime": 69971.3776, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.21939393939394, + "grad_norm": 0.008707521483302116, + "learning_rate": 4.7323185589334076e-05, + "loss": 0.011483223177492619, + "num_input_tokens_seen": 141030112, + "step": 8612, + "train_runtime": 69979.4884, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 5.22, + "grad_norm": 0.009979650378227234, + "learning_rate": 4.731358329022974e-05, + "loss": 0.012626761570572853, + "num_input_tokens_seen": 141046488, + "step": 8613, + "train_runtime": 69987.5982, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 5.220606060606061, + "grad_norm": 0.003531229915097356, + "learning_rate": 4.730398109049071e-05, + "loss": 0.011381605640053749, + "num_input_tokens_seen": 141062864, + "step": 8614, + "train_runtime": 69995.7111, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 5.221212121212122, + "grad_norm": 0.005882117431610823, + "learning_rate": 4.729437899047217e-05, + "loss": 0.012077144347131252, + "num_input_tokens_seen": 141079240, + "step": 8615, + "train_runtime": 70003.8195, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 5.221818181818182, + "grad_norm": 0.007875386625528336, + "learning_rate": 4.7284776990529286e-05, + "loss": 0.012866749428212643, + "num_input_tokens_seen": 141095616, + "step": 8616, + "train_runtime": 70011.9309, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 5.222424242424243, + "grad_norm": 0.006046108435839415, + "learning_rate": 4.727517509101719e-05, + "loss": 0.011964975856244564, + "num_input_tokens_seen": 141111992, + "step": 8617, + "train_runtime": 70020.0442, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 5.223030303030303, + "grad_norm": 0.007475417573004961, + "learning_rate": 4.726557329229109e-05, + "loss": 0.011990996077656746, + "num_input_tokens_seen": 141128368, + "step": 8618, + "train_runtime": 70028.1529, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 5.223636363636364, + "grad_norm": 0.0063964431174099445, + "learning_rate": 4.72559715947061e-05, + "loss": 0.011703303083777428, + "num_input_tokens_seen": 141144744, + "step": 8619, + "train_runtime": 70036.2637, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 5.224242424242425, + "grad_norm": 0.004758751019835472, + "learning_rate": 4.724636999861738e-05, + "loss": 0.011347425170242786, + "num_input_tokens_seen": 141161120, + "step": 8620, + "train_runtime": 70044.3736, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.224848484848485, + "grad_norm": 0.0053651342168450356, + "learning_rate": 4.723676850438005e-05, + "loss": 0.012156398966908455, + "num_input_tokens_seen": 141177496, + "step": 8621, + "train_runtime": 70052.4865, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 5.225454545454546, + "grad_norm": 0.007370494771748781, + "learning_rate": 4.7227167112349295e-05, + "loss": 0.011951715685427189, + "num_input_tokens_seen": 141193872, + "step": 8622, + "train_runtime": 70060.5957, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.2260606060606065, + "grad_norm": 0.006865001283586025, + "learning_rate": 4.721756582288023e-05, + "loss": 0.01268919836729765, + "num_input_tokens_seen": 141210248, + "step": 8623, + "train_runtime": 70068.7066, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 5.226666666666667, + "grad_norm": 0.008433585986495018, + "learning_rate": 4.720796463632798e-05, + "loss": 0.012233284302055836, + "num_input_tokens_seen": 141226624, + "step": 8624, + "train_runtime": 70076.8151, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.2272727272727275, + "grad_norm": 0.007373930886387825, + "learning_rate": 4.7198363553047656e-05, + "loss": 0.01180165633559227, + "num_input_tokens_seen": 141243000, + "step": 8625, + "train_runtime": 70084.9307, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.227878787878788, + "grad_norm": 0.005634380970150232, + "learning_rate": 4.7188762573394435e-05, + "loss": 0.011889902874827385, + "num_input_tokens_seen": 141259376, + "step": 8626, + "train_runtime": 70093.0418, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 5.2284848484848485, + "grad_norm": 0.007529820315539837, + "learning_rate": 4.7179161697723425e-05, + "loss": 0.0122696403414011, + "num_input_tokens_seen": 141275752, + "step": 8627, + "train_runtime": 70101.1518, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 5.2290909090909095, + "grad_norm": 0.010168529115617275, + "learning_rate": 4.7169560926389716e-05, + "loss": 0.012705054134130478, + "num_input_tokens_seen": 141292128, + "step": 8628, + "train_runtime": 70109.2615, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 5.2296969696969695, + "grad_norm": 0.008802075870335102, + "learning_rate": 4.715996025974841e-05, + "loss": 0.011838100850582123, + "num_input_tokens_seen": 141308504, + "step": 8629, + "train_runtime": 70117.3731, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.2303030303030305, + "grad_norm": 0.0035405317321419716, + "learning_rate": 4.715035969815467e-05, + "loss": 0.012546456418931484, + "num_input_tokens_seen": 141324880, + "step": 8630, + "train_runtime": 70125.4858, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 5.2309090909090905, + "grad_norm": 0.004728306550532579, + "learning_rate": 4.7140759241963576e-05, + "loss": 0.010227363556623459, + "num_input_tokens_seen": 141341256, + "step": 8631, + "train_runtime": 70133.5947, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 5.2315151515151515, + "grad_norm": 0.004450496751815081, + "learning_rate": 4.713115889153024e-05, + "loss": 0.011435476131737232, + "num_input_tokens_seen": 141357632, + "step": 8632, + "train_runtime": 70141.7037, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 5.232121212121212, + "grad_norm": 0.007994068786501884, + "learning_rate": 4.712155864720972e-05, + "loss": 0.011982460506260395, + "num_input_tokens_seen": 141374008, + "step": 8633, + "train_runtime": 70149.8108, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 5.2327272727272724, + "grad_norm": 0.005238356068730354, + "learning_rate": 4.7111958509357155e-05, + "loss": 0.011714409105479717, + "num_input_tokens_seen": 141390384, + "step": 8634, + "train_runtime": 70157.9169, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 5.233333333333333, + "grad_norm": 0.009076647460460663, + "learning_rate": 4.710235847832763e-05, + "loss": 0.013900824822485447, + "num_input_tokens_seen": 141406760, + "step": 8635, + "train_runtime": 70166.0302, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 5.233939393939394, + "grad_norm": 0.0069411578588187695, + "learning_rate": 4.709275855447621e-05, + "loss": 0.012496738694608212, + "num_input_tokens_seen": 141423136, + "step": 8636, + "train_runtime": 70174.14, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 5.234545454545454, + "grad_norm": 0.009932984597980976, + "learning_rate": 4.708315873815801e-05, + "loss": 0.013290498405694962, + "num_input_tokens_seen": 141439512, + "step": 8637, + "train_runtime": 70182.2487, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 5.235151515151515, + "grad_norm": 0.006745110731571913, + "learning_rate": 4.7073559029728055e-05, + "loss": 0.012514976784586906, + "num_input_tokens_seen": 141455888, + "step": 8638, + "train_runtime": 70190.3549, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 5.235757575757575, + "grad_norm": 0.008265319280326366, + "learning_rate": 4.706395942954147e-05, + "loss": 0.01239892840385437, + "num_input_tokens_seen": 141472264, + "step": 8639, + "train_runtime": 70198.4644, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 5.236363636363636, + "grad_norm": 0.0067689670249819756, + "learning_rate": 4.705435993795331e-05, + "loss": 0.011342395097017288, + "num_input_tokens_seen": 141488640, + "step": 8640, + "train_runtime": 70206.576, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 5.236969696969697, + "grad_norm": 0.006446485873311758, + "learning_rate": 4.7044760555318633e-05, + "loss": 0.01157057099044323, + "num_input_tokens_seen": 141505016, + "step": 8641, + "train_runtime": 70214.6865, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 5.237575757575757, + "grad_norm": 0.00936856959015131, + "learning_rate": 4.703516128199251e-05, + "loss": 0.01324019767343998, + "num_input_tokens_seen": 141521392, + "step": 8642, + "train_runtime": 70222.7943, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 5.238181818181818, + "grad_norm": 0.006850573234260082, + "learning_rate": 4.7025562118329996e-05, + "loss": 0.012302565388381481, + "num_input_tokens_seen": 141537768, + "step": 8643, + "train_runtime": 70230.9081, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 5.238787878787879, + "grad_norm": 0.005202385131269693, + "learning_rate": 4.7015963064686155e-05, + "loss": 0.01121328491717577, + "num_input_tokens_seen": 141554144, + "step": 8644, + "train_runtime": 70239.0198, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 5.239393939393939, + "grad_norm": 0.005433404352515936, + "learning_rate": 4.7006364121416026e-05, + "loss": 0.01104498840868473, + "num_input_tokens_seen": 141570520, + "step": 8645, + "train_runtime": 70247.1307, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 5.24, + "grad_norm": 0.002536205807700753, + "learning_rate": 4.6996765288874645e-05, + "loss": 0.011410987004637718, + "num_input_tokens_seen": 141586896, + "step": 8646, + "train_runtime": 70255.2405, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 5.24060606060606, + "grad_norm": 0.004226612858474255, + "learning_rate": 4.698716656741708e-05, + "loss": 0.013110432773828506, + "num_input_tokens_seen": 141603272, + "step": 8647, + "train_runtime": 70263.3514, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 5.241212121212121, + "grad_norm": 0.009294162504374981, + "learning_rate": 4.697756795739836e-05, + "loss": 0.012914028018712997, + "num_input_tokens_seen": 141619648, + "step": 8648, + "train_runtime": 70271.4635, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 5.241818181818182, + "grad_norm": 0.00015533431724179536, + "learning_rate": 4.696796945917351e-05, + "loss": 0.01039949245750904, + "num_input_tokens_seen": 141636024, + "step": 8649, + "train_runtime": 70279.5744, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 5.242424242424242, + "grad_norm": 0.007950184866786003, + "learning_rate": 4.6958371073097556e-05, + "loss": 0.012266449630260468, + "num_input_tokens_seen": 141652400, + "step": 8650, + "train_runtime": 70287.6842, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 5.243030303030303, + "grad_norm": 0.006991837173700333, + "learning_rate": 4.694877279952555e-05, + "loss": 0.012200918979942799, + "num_input_tokens_seen": 141668776, + "step": 8651, + "train_runtime": 70295.7978, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 5.243636363636363, + "grad_norm": 0.012479342520236969, + "learning_rate": 4.693917463881249e-05, + "loss": 0.011802447959780693, + "num_input_tokens_seen": 141685152, + "step": 8652, + "train_runtime": 70303.9085, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 5.244242424242424, + "grad_norm": 0.006284487899392843, + "learning_rate": 4.6929576591313395e-05, + "loss": 0.012061214074492455, + "num_input_tokens_seen": 141701528, + "step": 8653, + "train_runtime": 70312.0178, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 5.244848484848485, + "grad_norm": 0.003801667131483555, + "learning_rate": 4.691997865738328e-05, + "loss": 0.011831780895590782, + "num_input_tokens_seen": 141717904, + "step": 8654, + "train_runtime": 70320.1306, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 5.245454545454545, + "grad_norm": 0.006961784325540066, + "learning_rate": 4.691038083737717e-05, + "loss": 0.013058921322226524, + "num_input_tokens_seen": 141734280, + "step": 8655, + "train_runtime": 70328.2414, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 5.246060606060606, + "grad_norm": 0.006005587987601757, + "learning_rate": 4.6900783131650045e-05, + "loss": 0.012664939276874065, + "num_input_tokens_seen": 141750656, + "step": 8656, + "train_runtime": 70336.3498, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 5.246666666666667, + "grad_norm": 0.0032399455085396767, + "learning_rate": 4.689118554055692e-05, + "loss": 0.012156127020716667, + "num_input_tokens_seen": 141767032, + "step": 8657, + "train_runtime": 70344.4598, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 5.247272727272727, + "grad_norm": 0.00738222012296319, + "learning_rate": 4.688158806445278e-05, + "loss": 0.012552447617053986, + "num_input_tokens_seen": 141783408, + "step": 8658, + "train_runtime": 70352.57, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 5.247878787878788, + "grad_norm": 0.006683421786874533, + "learning_rate": 4.687199070369263e-05, + "loss": 0.012498554773628712, + "num_input_tokens_seen": 141799784, + "step": 8659, + "train_runtime": 70360.6814, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 5.248484848484848, + "grad_norm": 0.007810194976627827, + "learning_rate": 4.686239345863146e-05, + "loss": 0.011173412203788757, + "num_input_tokens_seen": 141816160, + "step": 8660, + "train_runtime": 70368.7939, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 5.249090909090909, + "grad_norm": 0.005277764517813921, + "learning_rate": 4.6852796329624236e-05, + "loss": 0.011830788105726242, + "num_input_tokens_seen": 141832536, + "step": 8661, + "train_runtime": 70376.9084, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 5.24969696969697, + "grad_norm": 0.00993553176522255, + "learning_rate": 4.6843199317025956e-05, + "loss": 0.011897126212716103, + "num_input_tokens_seen": 141848912, + "step": 8662, + "train_runtime": 70385.0781, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 5.25030303030303, + "grad_norm": 0.004460988100618124, + "learning_rate": 4.6833602421191575e-05, + "loss": 0.012117322534322739, + "num_input_tokens_seen": 141865288, + "step": 8663, + "train_runtime": 70393.1855, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 5.250909090909091, + "grad_norm": 0.007554213050752878, + "learning_rate": 4.6824005642476074e-05, + "loss": 0.012632821686565876, + "num_input_tokens_seen": 141881664, + "step": 8664, + "train_runtime": 70401.2941, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 5.251515151515152, + "grad_norm": 0.0036673247814178467, + "learning_rate": 4.6814408981234434e-05, + "loss": 0.013058227486908436, + "num_input_tokens_seen": 141898040, + "step": 8665, + "train_runtime": 70409.4055, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 5.252121212121212, + "grad_norm": 0.004639252554625273, + "learning_rate": 4.6804812437821595e-05, + "loss": 0.01089001726359129, + "num_input_tokens_seen": 141914416, + "step": 8666, + "train_runtime": 70417.5178, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 5.252727272727273, + "grad_norm": 0.0066552492789924145, + "learning_rate": 4.6795216012592506e-05, + "loss": 0.01232814509421587, + "num_input_tokens_seen": 141930792, + "step": 8667, + "train_runtime": 70425.6322, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 5.253333333333333, + "grad_norm": 0.00454815523698926, + "learning_rate": 4.6785619705902154e-05, + "loss": 0.012021981179714203, + "num_input_tokens_seen": 141947168, + "step": 8668, + "train_runtime": 70433.7445, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 5.253939393939394, + "grad_norm": 0.008906778879463673, + "learning_rate": 4.6776023518105466e-05, + "loss": 0.012975383549928665, + "num_input_tokens_seen": 141963544, + "step": 8669, + "train_runtime": 70441.8548, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 5.254545454545455, + "grad_norm": 0.006314353086054325, + "learning_rate": 4.676642744955739e-05, + "loss": 0.011165396310389042, + "num_input_tokens_seen": 141979920, + "step": 8670, + "train_runtime": 70449.9651, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 5.255151515151515, + "grad_norm": 0.011566385626792908, + "learning_rate": 4.675683150061285e-05, + "loss": 0.012445725500583649, + "num_input_tokens_seen": 141996296, + "step": 8671, + "train_runtime": 70458.0718, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 5.255757575757576, + "grad_norm": 0.01045412290841341, + "learning_rate": 4.6747235671626824e-05, + "loss": 0.011979715898633003, + "num_input_tokens_seen": 142012672, + "step": 8672, + "train_runtime": 70466.1761, + "train_tokens_per_second": 2015.331 + }, + { + "epoch": 5.256363636363636, + "grad_norm": 0.007532523944973946, + "learning_rate": 4.67376399629542e-05, + "loss": 0.011417550966143608, + "num_input_tokens_seen": 142029048, + "step": 8673, + "train_runtime": 70474.286, + "train_tokens_per_second": 2015.331 + }, + { + "epoch": 5.256969696969697, + "grad_norm": 0.005424617789685726, + "learning_rate": 4.6728044374949926e-05, + "loss": 0.012139094062149525, + "num_input_tokens_seen": 142045424, + "step": 8674, + "train_runtime": 70482.396, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 5.257575757575758, + "grad_norm": 0.003602619282901287, + "learning_rate": 4.67184489079689e-05, + "loss": 0.011941544711589813, + "num_input_tokens_seen": 142061800, + "step": 8675, + "train_runtime": 70490.5061, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 5.258181818181818, + "grad_norm": 0.007120292168110609, + "learning_rate": 4.670885356236607e-05, + "loss": 0.011723346076905727, + "num_input_tokens_seen": 142078176, + "step": 8676, + "train_runtime": 70498.6184, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 5.258787878787879, + "grad_norm": 0.007882661186158657, + "learning_rate": 4.669925833849636e-05, + "loss": 0.012614166364073753, + "num_input_tokens_seen": 142094552, + "step": 8677, + "train_runtime": 70506.7299, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 5.25939393939394, + "grad_norm": 0.006387017201632261, + "learning_rate": 4.668966323671464e-05, + "loss": 0.011329831555485725, + "num_input_tokens_seen": 142110928, + "step": 8678, + "train_runtime": 70514.8388, + "train_tokens_per_second": 2015.334 + }, + { + "epoch": 5.26, + "grad_norm": 0.0047048903070390224, + "learning_rate": 4.6680068257375806e-05, + "loss": 0.012289533391594887, + "num_input_tokens_seen": 142127304, + "step": 8679, + "train_runtime": 70522.9489, + "train_tokens_per_second": 2015.334 + }, + { + "epoch": 5.260606060606061, + "grad_norm": 0.008710606954991817, + "learning_rate": 4.667047340083481e-05, + "loss": 0.012617519125342369, + "num_input_tokens_seen": 142143680, + "step": 8680, + "train_runtime": 70531.0639, + "train_tokens_per_second": 2015.334 + }, + { + "epoch": 5.261212121212122, + "grad_norm": 0.008556870743632317, + "learning_rate": 4.666087866744651e-05, + "loss": 0.012446851469576359, + "num_input_tokens_seen": 142160056, + "step": 8681, + "train_runtime": 70539.1761, + "train_tokens_per_second": 2015.335 + }, + { + "epoch": 5.261818181818182, + "grad_norm": 0.00669449009001255, + "learning_rate": 4.6651284057565824e-05, + "loss": 0.012905669398605824, + "num_input_tokens_seen": 142176432, + "step": 8682, + "train_runtime": 70547.2871, + "train_tokens_per_second": 2015.335 + }, + { + "epoch": 5.262424242424243, + "grad_norm": 0.004976090043783188, + "learning_rate": 4.664168957154761e-05, + "loss": 0.012305125594139099, + "num_input_tokens_seen": 142192808, + "step": 8683, + "train_runtime": 70555.4, + "train_tokens_per_second": 2015.336 + }, + { + "epoch": 5.263030303030303, + "grad_norm": 0.006960708647966385, + "learning_rate": 4.663209520974673e-05, + "loss": 0.012360990978777409, + "num_input_tokens_seen": 142209184, + "step": 8684, + "train_runtime": 70563.5128, + "train_tokens_per_second": 2015.336 + }, + { + "epoch": 5.263636363636364, + "grad_norm": 0.008276723325252533, + "learning_rate": 4.662250097251811e-05, + "loss": 0.012252272106707096, + "num_input_tokens_seen": 142225560, + "step": 8685, + "train_runtime": 70571.6309, + "train_tokens_per_second": 2015.336 + }, + { + "epoch": 5.264242424242425, + "grad_norm": 0.003593289293348789, + "learning_rate": 4.661290686021661e-05, + "loss": 0.010488768108189106, + "num_input_tokens_seen": 142241936, + "step": 8686, + "train_runtime": 70579.7439, + "train_tokens_per_second": 2015.337 + }, + { + "epoch": 5.264848484848485, + "grad_norm": 0.005407534074038267, + "learning_rate": 4.66033128731971e-05, + "loss": 0.012199345044791698, + "num_input_tokens_seen": 142258312, + "step": 8687, + "train_runtime": 70587.8554, + "train_tokens_per_second": 2015.337 + }, + { + "epoch": 5.265454545454546, + "grad_norm": 0.003882840508595109, + "learning_rate": 4.6593719011814395e-05, + "loss": 0.010707040317356586, + "num_input_tokens_seen": 142274688, + "step": 8688, + "train_runtime": 70595.9657, + "train_tokens_per_second": 2015.337 + }, + { + "epoch": 5.266060606060606, + "grad_norm": 0.006020511966198683, + "learning_rate": 4.658412527642341e-05, + "loss": 0.011450082994997501, + "num_input_tokens_seen": 142291064, + "step": 8689, + "train_runtime": 70604.0779, + "train_tokens_per_second": 2015.338 + }, + { + "epoch": 5.266666666666667, + "grad_norm": 0.007638679351657629, + "learning_rate": 4.6574531667378985e-05, + "loss": 0.012199332937598228, + "num_input_tokens_seen": 142307440, + "step": 8690, + "train_runtime": 70612.1897, + "train_tokens_per_second": 2015.338 + }, + { + "epoch": 5.2672727272727276, + "grad_norm": 0.005733525846153498, + "learning_rate": 4.6564938185035956e-05, + "loss": 0.011739004403352737, + "num_input_tokens_seen": 142323816, + "step": 8691, + "train_runtime": 70620.3043, + "train_tokens_per_second": 2015.338 + }, + { + "epoch": 5.267878787878788, + "grad_norm": 0.011077191680669785, + "learning_rate": 4.655534482974917e-05, + "loss": 0.01360364630818367, + "num_input_tokens_seen": 142340192, + "step": 8692, + "train_runtime": 70628.4181, + "train_tokens_per_second": 2015.339 + }, + { + "epoch": 5.2684848484848485, + "grad_norm": 0.009677699767053127, + "learning_rate": 4.654575160187348e-05, + "loss": 0.011700388044118881, + "num_input_tokens_seen": 142356568, + "step": 8693, + "train_runtime": 70636.5334, + "train_tokens_per_second": 2015.339 + }, + { + "epoch": 5.2690909090909095, + "grad_norm": 0.007966621778905392, + "learning_rate": 4.6536158501763705e-05, + "loss": 0.01423229556530714, + "num_input_tokens_seen": 142372944, + "step": 8694, + "train_runtime": 70644.6433, + "train_tokens_per_second": 2015.34 + }, + { + "epoch": 5.2696969696969695, + "grad_norm": 0.004577717278152704, + "learning_rate": 4.6526565529774684e-05, + "loss": 0.0111308041960001, + "num_input_tokens_seen": 142389320, + "step": 8695, + "train_runtime": 70652.7562, + "train_tokens_per_second": 2015.34 + }, + { + "epoch": 5.2703030303030305, + "grad_norm": 0.007321104407310486, + "learning_rate": 4.6516972686261234e-05, + "loss": 0.012705864384770393, + "num_input_tokens_seen": 142405696, + "step": 8696, + "train_runtime": 70660.8658, + "train_tokens_per_second": 2015.34 + }, + { + "epoch": 5.2709090909090905, + "grad_norm": 0.006640320178121328, + "learning_rate": 4.650737997157819e-05, + "loss": 0.011694032698869705, + "num_input_tokens_seen": 142422072, + "step": 8697, + "train_runtime": 70668.9766, + "train_tokens_per_second": 2015.341 + }, + { + "epoch": 5.2715151515151515, + "grad_norm": 0.005849448498338461, + "learning_rate": 4.649778738608036e-05, + "loss": 0.012451408430933952, + "num_input_tokens_seen": 142438448, + "step": 8698, + "train_runtime": 70677.0873, + "train_tokens_per_second": 2015.341 + }, + { + "epoch": 5.272121212121212, + "grad_norm": 0.00410769646987319, + "learning_rate": 4.648819493012255e-05, + "loss": 0.01259559951722622, + "num_input_tokens_seen": 142454824, + "step": 8699, + "train_runtime": 70685.1991, + "train_tokens_per_second": 2015.342 + }, + { + "epoch": 5.2727272727272725, + "grad_norm": 0.007461491972208023, + "learning_rate": 4.6478602604059566e-05, + "loss": 0.0131318224593997, + "num_input_tokens_seen": 142471200, + "step": 8700, + "train_runtime": 70693.308, + "train_tokens_per_second": 2015.342 + } + ], + "logging_steps": 1, + "max_steps": 16500, + "num_input_tokens_seen": 142471200, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6.544300130014003e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}