diff --git "a/v127rc_exp2/B_rep/checkpoint-4100/trainer_state.json" "b/v127rc_exp2/B_rep/checkpoint-4100/trainer_state.json" new file mode 100644--- /dev/null +++ "b/v127rc_exp2/B_rep/checkpoint-4100/trainer_state.json" @@ -0,0 +1,41034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.484848484848485, + "eval_steps": 500, + "global_step": 4100, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006060606060606061, + "grad_norm": 0.35123783349990845, + "learning_rate": 0.0, + "loss": 1.6639432907104492, + "num_input_tokens_seen": 16376, + "step": 1, + "train_runtime": 9.7703, + "train_tokens_per_second": 1676.104 + }, + { + "epoch": 0.0012121212121212121, + "grad_norm": 0.39342227578163147, + "learning_rate": 6.060606060606061e-07, + "loss": 1.6057767868041992, + "num_input_tokens_seen": 32752, + "step": 2, + "train_runtime": 17.8325, + "train_tokens_per_second": 1836.647 + }, + { + "epoch": 0.0018181818181818182, + "grad_norm": 0.3597555458545685, + "learning_rate": 1.2121212121212122e-06, + "loss": 1.6560568809509277, + "num_input_tokens_seen": 49128, + "step": 3, + "train_runtime": 25.8927, + "train_tokens_per_second": 1897.372 + }, + { + "epoch": 0.0024242424242424242, + "grad_norm": 0.3463701009750366, + "learning_rate": 1.818181818181818e-06, + "loss": 1.6540638208389282, + "num_input_tokens_seen": 65504, + "step": 4, + "train_runtime": 33.9566, + "train_tokens_per_second": 1929.051 + }, + { + "epoch": 0.0030303030303030303, + "grad_norm": 0.34733158349990845, + "learning_rate": 2.4242424242424244e-06, + "loss": 1.664928913116455, + "num_input_tokens_seen": 81880, + "step": 5, + "train_runtime": 42.0394, + "train_tokens_per_second": 1947.697 + }, + { + "epoch": 0.0036363636363636364, + "grad_norm": 0.36326366662979126, + "learning_rate": 3.0303030303030305e-06, + "loss": 1.6352522373199463, + "num_input_tokens_seen": 98256, + "step": 6, + "train_runtime": 50.1229, + "train_tokens_per_second": 1960.302 + }, + { + "epoch": 0.004242424242424243, + "grad_norm": 0.351137638092041, + "learning_rate": 3.636363636363636e-06, + "loss": 1.660022497177124, + "num_input_tokens_seen": 114632, + "step": 7, + "train_runtime": 58.2137, + "train_tokens_per_second": 1969.159 + }, + { + "epoch": 0.0048484848484848485, + "grad_norm": 0.353691428899765, + "learning_rate": 4.242424242424243e-06, + "loss": 1.6757584810256958, + "num_input_tokens_seen": 131008, + "step": 8, + "train_runtime": 66.311, + "train_tokens_per_second": 1975.66 + }, + { + "epoch": 0.005454545454545455, + "grad_norm": 0.3630884885787964, + "learning_rate": 4.848484848484849e-06, + "loss": 1.6366666555404663, + "num_input_tokens_seen": 147384, + "step": 9, + "train_runtime": 74.4151, + "train_tokens_per_second": 1980.565 + }, + { + "epoch": 0.006060606060606061, + "grad_norm": 0.354055255651474, + "learning_rate": 5.4545454545454545e-06, + "loss": 1.6339915990829468, + "num_input_tokens_seen": 163760, + "step": 10, + "train_runtime": 82.5209, + "train_tokens_per_second": 1984.468 + }, + { + "epoch": 0.006666666666666667, + "grad_norm": 0.3574777841567993, + "learning_rate": 6.060606060606061e-06, + "loss": 1.6360563039779663, + "num_input_tokens_seen": 180136, + "step": 11, + "train_runtime": 90.6349, + "train_tokens_per_second": 1987.491 + }, + { + "epoch": 0.007272727272727273, + "grad_norm": 0.3561362028121948, + "learning_rate": 6.666666666666667e-06, + "loss": 1.6641417741775513, + "num_input_tokens_seen": 196512, + "step": 12, + "train_runtime": 98.7492, + "train_tokens_per_second": 1990.012 + }, + { + "epoch": 0.00787878787878788, + "grad_norm": 0.3659680485725403, + "learning_rate": 7.272727272727272e-06, + "loss": 1.6375828981399536, + "num_input_tokens_seen": 212888, + "step": 13, + "train_runtime": 106.8626, + "train_tokens_per_second": 1992.165 + }, + { + "epoch": 0.008484848484848486, + "grad_norm": 0.37148839235305786, + "learning_rate": 7.878787878787878e-06, + "loss": 1.6246858835220337, + "num_input_tokens_seen": 229264, + "step": 14, + "train_runtime": 114.9785, + "train_tokens_per_second": 1993.973 + }, + { + "epoch": 0.00909090909090909, + "grad_norm": 0.38491716980934143, + "learning_rate": 8.484848484848486e-06, + "loss": 1.5969434976577759, + "num_input_tokens_seen": 245640, + "step": 15, + "train_runtime": 123.0953, + "train_tokens_per_second": 1995.526 + }, + { + "epoch": 0.009696969696969697, + "grad_norm": 0.37805187702178955, + "learning_rate": 9.090909090909091e-06, + "loss": 1.6518127918243408, + "num_input_tokens_seen": 262016, + "step": 16, + "train_runtime": 131.2123, + "train_tokens_per_second": 1996.886 + }, + { + "epoch": 0.010303030303030303, + "grad_norm": 0.3775594234466553, + "learning_rate": 9.696969696969698e-06, + "loss": 1.6409087181091309, + "num_input_tokens_seen": 278392, + "step": 17, + "train_runtime": 139.3368, + "train_tokens_per_second": 1997.979 + }, + { + "epoch": 0.01090909090909091, + "grad_norm": 0.39896833896636963, + "learning_rate": 1.0303030303030304e-05, + "loss": 1.5989718437194824, + "num_input_tokens_seen": 294768, + "step": 18, + "train_runtime": 147.4538, + "train_tokens_per_second": 1999.054 + }, + { + "epoch": 0.011515151515151515, + "grad_norm": 0.386406272649765, + "learning_rate": 1.0909090909090909e-05, + "loss": 1.6259583234786987, + "num_input_tokens_seen": 311144, + "step": 19, + "train_runtime": 155.5716, + "train_tokens_per_second": 2000.005 + }, + { + "epoch": 0.012121212121212121, + "grad_norm": 0.3878491520881653, + "learning_rate": 1.1515151515151517e-05, + "loss": 1.5945892333984375, + "num_input_tokens_seen": 327520, + "step": 20, + "train_runtime": 163.6898, + "train_tokens_per_second": 2000.858 + }, + { + "epoch": 0.012727272727272728, + "grad_norm": 0.4080464839935303, + "learning_rate": 1.2121212121212122e-05, + "loss": 1.5983033180236816, + "num_input_tokens_seen": 343896, + "step": 21, + "train_runtime": 171.8117, + "train_tokens_per_second": 2001.587 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 0.41852834820747375, + "learning_rate": 1.2727272727272727e-05, + "loss": 1.5769346952438354, + "num_input_tokens_seen": 360272, + "step": 22, + "train_runtime": 179.9341, + "train_tokens_per_second": 2002.244 + }, + { + "epoch": 0.013939393939393939, + "grad_norm": 0.4324847459793091, + "learning_rate": 1.3333333333333333e-05, + "loss": 1.5699174404144287, + "num_input_tokens_seen": 376648, + "step": 23, + "train_runtime": 188.0563, + "train_tokens_per_second": 2002.847 + }, + { + "epoch": 0.014545454545454545, + "grad_norm": 0.4219138026237488, + "learning_rate": 1.3939393939393942e-05, + "loss": 1.5589112043380737, + "num_input_tokens_seen": 393024, + "step": 24, + "train_runtime": 196.1758, + "train_tokens_per_second": 2003.427 + }, + { + "epoch": 0.015151515151515152, + "grad_norm": 0.42980635166168213, + "learning_rate": 1.4545454545454545e-05, + "loss": 1.5662312507629395, + "num_input_tokens_seen": 409400, + "step": 25, + "train_runtime": 204.2941, + "train_tokens_per_second": 2003.974 + }, + { + "epoch": 0.01575757575757576, + "grad_norm": 0.4569622576236725, + "learning_rate": 1.5151515151515153e-05, + "loss": 1.4885609149932861, + "num_input_tokens_seen": 425776, + "step": 26, + "train_runtime": 212.4141, + "train_tokens_per_second": 2004.462 + }, + { + "epoch": 0.016363636363636365, + "grad_norm": 0.4413582384586334, + "learning_rate": 1.5757575757575756e-05, + "loss": 1.4823509454727173, + "num_input_tokens_seen": 442152, + "step": 27, + "train_runtime": 220.5358, + "train_tokens_per_second": 2004.899 + }, + { + "epoch": 0.01696969696969697, + "grad_norm": 0.45630744099617004, + "learning_rate": 1.6363636363636366e-05, + "loss": 1.4595903158187866, + "num_input_tokens_seen": 458528, + "step": 28, + "train_runtime": 228.6622, + "train_tokens_per_second": 2005.264 + }, + { + "epoch": 0.017575757575757574, + "grad_norm": 0.457793653011322, + "learning_rate": 1.6969696969696972e-05, + "loss": 1.4525277614593506, + "num_input_tokens_seen": 474904, + "step": 29, + "train_runtime": 236.7808, + "train_tokens_per_second": 2005.67 + }, + { + "epoch": 0.01818181818181818, + "grad_norm": 0.4766552150249481, + "learning_rate": 1.7575757575757576e-05, + "loss": 1.425432801246643, + "num_input_tokens_seen": 491280, + "step": 30, + "train_runtime": 244.8928, + "train_tokens_per_second": 2006.102 + }, + { + "epoch": 0.018787878787878787, + "grad_norm": 0.5165067911148071, + "learning_rate": 1.8181818181818182e-05, + "loss": 1.3646923303604126, + "num_input_tokens_seen": 507656, + "step": 31, + "train_runtime": 253.0027, + "train_tokens_per_second": 2006.524 + }, + { + "epoch": 0.019393939393939394, + "grad_norm": 0.4833853244781494, + "learning_rate": 1.878787878787879e-05, + "loss": 1.3608993291854858, + "num_input_tokens_seen": 524032, + "step": 32, + "train_runtime": 261.1128, + "train_tokens_per_second": 2006.918 + }, + { + "epoch": 0.02, + "grad_norm": 0.49612611532211304, + "learning_rate": 1.9393939393939395e-05, + "loss": 1.350702166557312, + "num_input_tokens_seen": 540408, + "step": 33, + "train_runtime": 269.2241, + "train_tokens_per_second": 2007.28 + }, + { + "epoch": 0.020606060606060607, + "grad_norm": 0.5136600732803345, + "learning_rate": 2e-05, + "loss": 1.291304349899292, + "num_input_tokens_seen": 556784, + "step": 34, + "train_runtime": 277.336, + "train_tokens_per_second": 2007.615 + }, + { + "epoch": 0.021212121212121213, + "grad_norm": 0.5192011594772339, + "learning_rate": 2.0606060606060608e-05, + "loss": 1.2744120359420776, + "num_input_tokens_seen": 573160, + "step": 35, + "train_runtime": 285.4446, + "train_tokens_per_second": 2007.956 + }, + { + "epoch": 0.02181818181818182, + "grad_norm": 0.5397765636444092, + "learning_rate": 2.1212121212121215e-05, + "loss": 1.208145022392273, + "num_input_tokens_seen": 589536, + "step": 36, + "train_runtime": 293.5567, + "train_tokens_per_second": 2008.253 + }, + { + "epoch": 0.022424242424242423, + "grad_norm": 0.5493120551109314, + "learning_rate": 2.1818181818181818e-05, + "loss": 1.2057533264160156, + "num_input_tokens_seen": 605912, + "step": 37, + "train_runtime": 301.6704, + "train_tokens_per_second": 2008.523 + }, + { + "epoch": 0.02303030303030303, + "grad_norm": 0.5603742599487305, + "learning_rate": 2.2424242424242424e-05, + "loss": 1.1387653350830078, + "num_input_tokens_seen": 622288, + "step": 38, + "train_runtime": 309.7816, + "train_tokens_per_second": 2008.796 + }, + { + "epoch": 0.023636363636363636, + "grad_norm": 0.581070601940155, + "learning_rate": 2.3030303030303034e-05, + "loss": 1.138227939605713, + "num_input_tokens_seen": 638664, + "step": 39, + "train_runtime": 317.8926, + "train_tokens_per_second": 2009.056 + }, + { + "epoch": 0.024242424242424242, + "grad_norm": 0.5650333762168884, + "learning_rate": 2.3636363636363637e-05, + "loss": 1.1126341819763184, + "num_input_tokens_seen": 655040, + "step": 40, + "train_runtime": 326.0006, + "train_tokens_per_second": 2009.321 + }, + { + "epoch": 0.02484848484848485, + "grad_norm": 0.6228408813476562, + "learning_rate": 2.4242424242424244e-05, + "loss": 1.0580966472625732, + "num_input_tokens_seen": 671416, + "step": 41, + "train_runtime": 334.1133, + "train_tokens_per_second": 2009.546 + }, + { + "epoch": 0.025454545454545455, + "grad_norm": 0.7027150392532349, + "learning_rate": 2.4848484848484847e-05, + "loss": 1.0436644554138184, + "num_input_tokens_seen": 687792, + "step": 42, + "train_runtime": 342.2227, + "train_tokens_per_second": 2009.779 + }, + { + "epoch": 0.026060606060606062, + "grad_norm": 0.876166045665741, + "learning_rate": 2.5454545454545454e-05, + "loss": 0.9523745775222778, + "num_input_tokens_seen": 704168, + "step": 43, + "train_runtime": 350.3357, + "train_tokens_per_second": 2009.981 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 0.5786903500556946, + "learning_rate": 2.6060606060606063e-05, + "loss": 0.9218084812164307, + "num_input_tokens_seen": 720544, + "step": 44, + "train_runtime": 358.4444, + "train_tokens_per_second": 2010.198 + }, + { + "epoch": 0.02727272727272727, + "grad_norm": 0.6627383828163147, + "learning_rate": 2.6666666666666667e-05, + "loss": 0.8746504187583923, + "num_input_tokens_seen": 736920, + "step": 45, + "train_runtime": 366.5519, + "train_tokens_per_second": 2010.411 + }, + { + "epoch": 0.027878787878787878, + "grad_norm": 0.6991789937019348, + "learning_rate": 2.7272727272727273e-05, + "loss": 0.8592554926872253, + "num_input_tokens_seen": 753296, + "step": 46, + "train_runtime": 374.6632, + "train_tokens_per_second": 2010.595 + }, + { + "epoch": 0.028484848484848484, + "grad_norm": 0.6843043565750122, + "learning_rate": 2.7878787878787883e-05, + "loss": 0.7838267683982849, + "num_input_tokens_seen": 769672, + "step": 47, + "train_runtime": 382.7718, + "train_tokens_per_second": 2010.786 + }, + { + "epoch": 0.02909090909090909, + "grad_norm": 0.6203355193138123, + "learning_rate": 2.8484848484848486e-05, + "loss": 0.7517961263656616, + "num_input_tokens_seen": 786048, + "step": 48, + "train_runtime": 390.883, + "train_tokens_per_second": 2010.955 + }, + { + "epoch": 0.029696969696969697, + "grad_norm": 0.6031985878944397, + "learning_rate": 2.909090909090909e-05, + "loss": 0.7074779272079468, + "num_input_tokens_seen": 802424, + "step": 49, + "train_runtime": 398.992, + "train_tokens_per_second": 2011.128 + }, + { + "epoch": 0.030303030303030304, + "grad_norm": 0.6645159125328064, + "learning_rate": 2.96969696969697e-05, + "loss": 0.6244415044784546, + "num_input_tokens_seen": 818800, + "step": 50, + "train_runtime": 407.1007, + "train_tokens_per_second": 2011.296 + }, + { + "epoch": 0.03090909090909091, + "grad_norm": 0.6037282943725586, + "learning_rate": 3.0303030303030306e-05, + "loss": 0.6258814334869385, + "num_input_tokens_seen": 835176, + "step": 51, + "train_runtime": 415.209, + "train_tokens_per_second": 2011.459 + }, + { + "epoch": 0.03151515151515152, + "grad_norm": 0.7840785980224609, + "learning_rate": 3.090909090909091e-05, + "loss": 0.5502547025680542, + "num_input_tokens_seen": 851552, + "step": 52, + "train_runtime": 423.3167, + "train_tokens_per_second": 2011.619 + }, + { + "epoch": 0.03212121212121212, + "grad_norm": 0.5410464406013489, + "learning_rate": 3.151515151515151e-05, + "loss": 0.4808294475078583, + "num_input_tokens_seen": 867928, + "step": 53, + "train_runtime": 431.4326, + "train_tokens_per_second": 2011.735 + }, + { + "epoch": 0.03272727272727273, + "grad_norm": 0.5532175898551941, + "learning_rate": 3.212121212121212e-05, + "loss": 0.4808656871318817, + "num_input_tokens_seen": 884304, + "step": 54, + "train_runtime": 439.5458, + "train_tokens_per_second": 2011.859 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 0.6308758854866028, + "learning_rate": 3.272727272727273e-05, + "loss": 0.4137771427631378, + "num_input_tokens_seen": 900680, + "step": 55, + "train_runtime": 447.6539, + "train_tokens_per_second": 2012.001 + }, + { + "epoch": 0.03393939393939394, + "grad_norm": 0.492653489112854, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.3654894530773163, + "num_input_tokens_seen": 917056, + "step": 56, + "train_runtime": 455.7624, + "train_tokens_per_second": 2012.136 + }, + { + "epoch": 0.034545454545454546, + "grad_norm": 0.5767380595207214, + "learning_rate": 3.3939393939393945e-05, + "loss": 0.342722088098526, + "num_input_tokens_seen": 933432, + "step": 57, + "train_runtime": 463.8713, + "train_tokens_per_second": 2012.265 + }, + { + "epoch": 0.03515151515151515, + "grad_norm": 0.5243986248970032, + "learning_rate": 3.454545454545455e-05, + "loss": 0.2960652709007263, + "num_input_tokens_seen": 949808, + "step": 58, + "train_runtime": 471.9805, + "train_tokens_per_second": 2012.388 + }, + { + "epoch": 0.03575757575757576, + "grad_norm": 0.4490169882774353, + "learning_rate": 3.515151515151515e-05, + "loss": 0.26675525307655334, + "num_input_tokens_seen": 966184, + "step": 59, + "train_runtime": 480.0912, + "train_tokens_per_second": 2012.501 + }, + { + "epoch": 0.03636363636363636, + "grad_norm": 0.4677429795265198, + "learning_rate": 3.575757575757576e-05, + "loss": 0.2512170076370239, + "num_input_tokens_seen": 982560, + "step": 60, + "train_runtime": 488.2016, + "train_tokens_per_second": 2012.611 + }, + { + "epoch": 0.03696969696969697, + "grad_norm": 0.37272387742996216, + "learning_rate": 3.6363636363636364e-05, + "loss": 0.19348715245723724, + "num_input_tokens_seen": 998936, + "step": 61, + "train_runtime": 496.3104, + "train_tokens_per_second": 2012.724 + }, + { + "epoch": 0.037575757575757575, + "grad_norm": 0.36983442306518555, + "learning_rate": 3.6969696969696974e-05, + "loss": 0.18563911318778992, + "num_input_tokens_seen": 1015312, + "step": 62, + "train_runtime": 504.419, + "train_tokens_per_second": 2012.835 + }, + { + "epoch": 0.038181818181818185, + "grad_norm": 0.37516751885414124, + "learning_rate": 3.757575757575758e-05, + "loss": 0.16986083984375, + "num_input_tokens_seen": 1031688, + "step": 63, + "train_runtime": 512.5348, + "train_tokens_per_second": 2012.913 + }, + { + "epoch": 0.03878787878787879, + "grad_norm": 0.3174577057361603, + "learning_rate": 3.818181818181819e-05, + "loss": 0.1534540057182312, + "num_input_tokens_seen": 1048064, + "step": 64, + "train_runtime": 520.644, + "train_tokens_per_second": 2013.015 + }, + { + "epoch": 0.03939393939393939, + "grad_norm": 0.30689847469329834, + "learning_rate": 3.878787878787879e-05, + "loss": 0.14156833291053772, + "num_input_tokens_seen": 1064440, + "step": 65, + "train_runtime": 528.7787, + "train_tokens_per_second": 2013.016 + }, + { + "epoch": 0.04, + "grad_norm": 0.2671639621257782, + "learning_rate": 3.939393939393939e-05, + "loss": 0.12481589615345001, + "num_input_tokens_seen": 1080816, + "step": 66, + "train_runtime": 536.8903, + "train_tokens_per_second": 2013.104 + }, + { + "epoch": 0.040606060606060604, + "grad_norm": 0.2459305375814438, + "learning_rate": 4e-05, + "loss": 0.12609152495861053, + "num_input_tokens_seen": 1097192, + "step": 67, + "train_runtime": 545.0023, + "train_tokens_per_second": 2013.188 + }, + { + "epoch": 0.041212121212121214, + "grad_norm": 0.23298931121826172, + "learning_rate": 4.0606060606060606e-05, + "loss": 0.10923294723033905, + "num_input_tokens_seen": 1113568, + "step": 68, + "train_runtime": 553.1113, + "train_tokens_per_second": 2013.28 + }, + { + "epoch": 0.04181818181818182, + "grad_norm": 0.22864830493927002, + "learning_rate": 4.1212121212121216e-05, + "loss": 0.10794200748205185, + "num_input_tokens_seen": 1129944, + "step": 69, + "train_runtime": 561.2215, + "train_tokens_per_second": 2013.365 + }, + { + "epoch": 0.04242424242424243, + "grad_norm": 0.2130967080593109, + "learning_rate": 4.181818181818182e-05, + "loss": 0.09509418904781342, + "num_input_tokens_seen": 1146320, + "step": 70, + "train_runtime": 569.3343, + "train_tokens_per_second": 2013.439 + }, + { + "epoch": 0.04303030303030303, + "grad_norm": 0.19734057784080505, + "learning_rate": 4.242424242424243e-05, + "loss": 0.08767769485712051, + "num_input_tokens_seen": 1162696, + "step": 71, + "train_runtime": 577.4461, + "train_tokens_per_second": 2013.514 + }, + { + "epoch": 0.04363636363636364, + "grad_norm": 0.2512868344783783, + "learning_rate": 4.303030303030303e-05, + "loss": 0.08520924299955368, + "num_input_tokens_seen": 1179072, + "step": 72, + "train_runtime": 585.5562, + "train_tokens_per_second": 2013.593 + }, + { + "epoch": 0.04424242424242424, + "grad_norm": 0.18867339193820953, + "learning_rate": 4.3636363636363636e-05, + "loss": 0.08193657547235489, + "num_input_tokens_seen": 1195448, + "step": 73, + "train_runtime": 593.6659, + "train_tokens_per_second": 2013.671 + }, + { + "epoch": 0.044848484848484846, + "grad_norm": 0.17708271741867065, + "learning_rate": 4.4242424242424246e-05, + "loss": 0.07861079275608063, + "num_input_tokens_seen": 1211824, + "step": 74, + "train_runtime": 601.778, + "train_tokens_per_second": 2013.739 + }, + { + "epoch": 0.045454545454545456, + "grad_norm": 0.16671743988990784, + "learning_rate": 4.484848484848485e-05, + "loss": 0.07204174995422363, + "num_input_tokens_seen": 1228200, + "step": 75, + "train_runtime": 609.889, + "train_tokens_per_second": 2013.809 + }, + { + "epoch": 0.04606060606060606, + "grad_norm": 0.17388567328453064, + "learning_rate": 4.545454545454546e-05, + "loss": 0.05977003276348114, + "num_input_tokens_seen": 1244576, + "step": 76, + "train_runtime": 617.9973, + "train_tokens_per_second": 2013.886 + }, + { + "epoch": 0.04666666666666667, + "grad_norm": 0.14751967787742615, + "learning_rate": 4.606060606060607e-05, + "loss": 0.06652094423770905, + "num_input_tokens_seen": 1260952, + "step": 77, + "train_runtime": 626.1063, + "train_tokens_per_second": 2013.958 + }, + { + "epoch": 0.04727272727272727, + "grad_norm": 0.1427117884159088, + "learning_rate": 4.666666666666667e-05, + "loss": 0.05981641262769699, + "num_input_tokens_seen": 1277328, + "step": 78, + "train_runtime": 634.2178, + "train_tokens_per_second": 2014.021 + }, + { + "epoch": 0.04787878787878788, + "grad_norm": 0.16328735649585724, + "learning_rate": 4.7272727272727275e-05, + "loss": 0.059813786298036575, + "num_input_tokens_seen": 1293704, + "step": 79, + "train_runtime": 642.3361, + "train_tokens_per_second": 2014.061 + }, + { + "epoch": 0.048484848484848485, + "grad_norm": 0.15144814550876617, + "learning_rate": 4.787878787878788e-05, + "loss": 0.05687074735760689, + "num_input_tokens_seen": 1310080, + "step": 80, + "train_runtime": 650.4589, + "train_tokens_per_second": 2014.086 + }, + { + "epoch": 0.04909090909090909, + "grad_norm": 0.19531840085983276, + "learning_rate": 4.848484848484849e-05, + "loss": 0.06199571490287781, + "num_input_tokens_seen": 1326456, + "step": 81, + "train_runtime": 658.5803, + "train_tokens_per_second": 2014.114 + }, + { + "epoch": 0.0496969696969697, + "grad_norm": 0.11535873264074326, + "learning_rate": 4.909090909090909e-05, + "loss": 0.05434288829565048, + "num_input_tokens_seen": 1342832, + "step": 82, + "train_runtime": 666.7006, + "train_tokens_per_second": 2014.145 + }, + { + "epoch": 0.0503030303030303, + "grad_norm": 0.17366129159927368, + "learning_rate": 4.9696969696969694e-05, + "loss": 0.0584072507917881, + "num_input_tokens_seen": 1359208, + "step": 83, + "train_runtime": 674.8206, + "train_tokens_per_second": 2014.177 + }, + { + "epoch": 0.05090909090909091, + "grad_norm": 0.16601437330245972, + "learning_rate": 5.030303030303031e-05, + "loss": 0.055472493171691895, + "num_input_tokens_seen": 1375584, + "step": 84, + "train_runtime": 682.9407, + "train_tokens_per_second": 2014.207 + }, + { + "epoch": 0.051515151515151514, + "grad_norm": 0.12125150859355927, + "learning_rate": 5.090909090909091e-05, + "loss": 0.04972580820322037, + "num_input_tokens_seen": 1391960, + "step": 85, + "train_runtime": 691.0602, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.052121212121212124, + "grad_norm": 0.10404529422521591, + "learning_rate": 5.151515151515152e-05, + "loss": 0.04972917586565018, + "num_input_tokens_seen": 1408336, + "step": 86, + "train_runtime": 699.177, + "train_tokens_per_second": 2014.277 + }, + { + "epoch": 0.05272727272727273, + "grad_norm": 0.19109457731246948, + "learning_rate": 5.212121212121213e-05, + "loss": 0.04995625838637352, + "num_input_tokens_seen": 1424712, + "step": 87, + "train_runtime": 707.2957, + "train_tokens_per_second": 2014.309 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 0.14529068768024445, + "learning_rate": 5.272727272727272e-05, + "loss": 0.044690582901239395, + "num_input_tokens_seen": 1441088, + "step": 88, + "train_runtime": 715.4144, + "train_tokens_per_second": 2014.34 + }, + { + "epoch": 0.05393939393939394, + "grad_norm": 0.12216632813215256, + "learning_rate": 5.333333333333333e-05, + "loss": 0.04490099102258682, + "num_input_tokens_seen": 1457464, + "step": 89, + "train_runtime": 723.5369, + "train_tokens_per_second": 2014.36 + }, + { + "epoch": 0.05454545454545454, + "grad_norm": 0.09520085901021957, + "learning_rate": 5.393939393939394e-05, + "loss": 0.039979420602321625, + "num_input_tokens_seen": 1473840, + "step": 90, + "train_runtime": 731.6566, + "train_tokens_per_second": 2014.388 + }, + { + "epoch": 0.05515151515151515, + "grad_norm": 0.13766801357269287, + "learning_rate": 5.4545454545454546e-05, + "loss": 0.04609033092856407, + "num_input_tokens_seen": 1490216, + "step": 91, + "train_runtime": 739.7761, + "train_tokens_per_second": 2014.415 + }, + { + "epoch": 0.055757575757575756, + "grad_norm": 0.13074332475662231, + "learning_rate": 5.5151515151515156e-05, + "loss": 0.040276553481817245, + "num_input_tokens_seen": 1506592, + "step": 92, + "train_runtime": 747.8977, + "train_tokens_per_second": 2014.436 + }, + { + "epoch": 0.056363636363636366, + "grad_norm": 0.11333464086055756, + "learning_rate": 5.5757575757575766e-05, + "loss": 0.03974860906600952, + "num_input_tokens_seen": 1522968, + "step": 93, + "train_runtime": 756.018, + "train_tokens_per_second": 2014.46 + }, + { + "epoch": 0.05696969696969697, + "grad_norm": 0.09708438813686371, + "learning_rate": 5.636363636363636e-05, + "loss": 0.03745771571993828, + "num_input_tokens_seen": 1539344, + "step": 94, + "train_runtime": 764.1373, + "train_tokens_per_second": 2014.486 + }, + { + "epoch": 0.05757575757575758, + "grad_norm": 0.13791343569755554, + "learning_rate": 5.696969696969697e-05, + "loss": 0.04385356977581978, + "num_input_tokens_seen": 1555720, + "step": 95, + "train_runtime": 772.256, + "train_tokens_per_second": 2014.513 + }, + { + "epoch": 0.05818181818181818, + "grad_norm": 0.15427744388580322, + "learning_rate": 5.757575757575758e-05, + "loss": 0.0388864129781723, + "num_input_tokens_seen": 1572096, + "step": 96, + "train_runtime": 780.3755, + "train_tokens_per_second": 2014.538 + }, + { + "epoch": 0.058787878787878785, + "grad_norm": 0.11847083270549774, + "learning_rate": 5.818181818181818e-05, + "loss": 0.033506229519844055, + "num_input_tokens_seen": 1588472, + "step": 97, + "train_runtime": 788.4951, + "train_tokens_per_second": 2014.562 + }, + { + "epoch": 0.059393939393939395, + "grad_norm": 0.10092757642269135, + "learning_rate": 5.878787878787879e-05, + "loss": 0.03343300521373749, + "num_input_tokens_seen": 1604848, + "step": 98, + "train_runtime": 796.6166, + "train_tokens_per_second": 2014.58 + }, + { + "epoch": 0.06, + "grad_norm": 0.10452481359243393, + "learning_rate": 5.93939393939394e-05, + "loss": 0.036986708641052246, + "num_input_tokens_seen": 1621224, + "step": 99, + "train_runtime": 804.7379, + "train_tokens_per_second": 2014.599 + }, + { + "epoch": 0.06060606060606061, + "grad_norm": 0.08679923415184021, + "learning_rate": 6e-05, + "loss": 0.03295439854264259, + "num_input_tokens_seen": 1637600, + "step": 100, + "train_runtime": 812.8578, + "train_tokens_per_second": 2014.62 + }, + { + "epoch": 0.06121212121212121, + "grad_norm": 0.1115456148982048, + "learning_rate": 6.060606060606061e-05, + "loss": 0.03657374531030655, + "num_input_tokens_seen": 1653976, + "step": 101, + "train_runtime": 821.8569, + "train_tokens_per_second": 2012.487 + }, + { + "epoch": 0.06181818181818182, + "grad_norm": 0.08771228045225143, + "learning_rate": 6.121212121212121e-05, + "loss": 0.0364333875477314, + "num_input_tokens_seen": 1670352, + "step": 102, + "train_runtime": 829.9743, + "train_tokens_per_second": 2012.535 + }, + { + "epoch": 0.062424242424242424, + "grad_norm": 0.08961863070726395, + "learning_rate": 6.181818181818182e-05, + "loss": 0.03239607438445091, + "num_input_tokens_seen": 1686728, + "step": 103, + "train_runtime": 838.0926, + "train_tokens_per_second": 2012.58 + }, + { + "epoch": 0.06303030303030303, + "grad_norm": 0.10658557713031769, + "learning_rate": 6.242424242424243e-05, + "loss": 0.035685982555150986, + "num_input_tokens_seen": 1703104, + "step": 104, + "train_runtime": 846.2114, + "train_tokens_per_second": 2012.622 + }, + { + "epoch": 0.06363636363636363, + "grad_norm": 0.07003116607666016, + "learning_rate": 6.303030303030302e-05, + "loss": 0.03269325941801071, + "num_input_tokens_seen": 1719480, + "step": 105, + "train_runtime": 854.3347, + "train_tokens_per_second": 2012.654 + }, + { + "epoch": 0.06424242424242424, + "grad_norm": 0.0889090895652771, + "learning_rate": 6.363636363636364e-05, + "loss": 0.030469391494989395, + "num_input_tokens_seen": 1735856, + "step": 106, + "train_runtime": 862.4518, + "train_tokens_per_second": 2012.699 + }, + { + "epoch": 0.06484848484848485, + "grad_norm": 0.12026192247867584, + "learning_rate": 6.424242424242424e-05, + "loss": 0.032258037477731705, + "num_input_tokens_seen": 1752232, + "step": 107, + "train_runtime": 870.5683, + "train_tokens_per_second": 2012.745 + }, + { + "epoch": 0.06545454545454546, + "grad_norm": 0.06484470516443253, + "learning_rate": 6.484848484848485e-05, + "loss": 0.026622053235769272, + "num_input_tokens_seen": 1768608, + "step": 108, + "train_runtime": 878.6857, + "train_tokens_per_second": 2012.788 + }, + { + "epoch": 0.06606060606060606, + "grad_norm": 0.09636206179857254, + "learning_rate": 6.545454545454546e-05, + "loss": 0.03460235893726349, + "num_input_tokens_seen": 1784984, + "step": 109, + "train_runtime": 886.8033, + "train_tokens_per_second": 2012.83 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 0.10380304604768753, + "learning_rate": 6.606060606060607e-05, + "loss": 0.030300751328468323, + "num_input_tokens_seen": 1801360, + "step": 110, + "train_runtime": 894.9204, + "train_tokens_per_second": 2012.872 + }, + { + "epoch": 0.06727272727272728, + "grad_norm": 0.07361245900392532, + "learning_rate": 6.666666666666667e-05, + "loss": 0.03334670513868332, + "num_input_tokens_seen": 1817736, + "step": 111, + "train_runtime": 903.0383, + "train_tokens_per_second": 2012.911 + }, + { + "epoch": 0.06787878787878789, + "grad_norm": 0.06159133464097977, + "learning_rate": 6.727272727272727e-05, + "loss": 0.026774805039167404, + "num_input_tokens_seen": 1834112, + "step": 112, + "train_runtime": 911.1548, + "train_tokens_per_second": 2012.953 + }, + { + "epoch": 0.06848484848484848, + "grad_norm": 0.08236563950777054, + "learning_rate": 6.787878787878789e-05, + "loss": 0.02836509235203266, + "num_input_tokens_seen": 1850488, + "step": 113, + "train_runtime": 919.2722, + "train_tokens_per_second": 2012.992 + }, + { + "epoch": 0.06909090909090909, + "grad_norm": 0.06620238721370697, + "learning_rate": 6.848484848484848e-05, + "loss": 0.027467701584100723, + "num_input_tokens_seen": 1866864, + "step": 114, + "train_runtime": 927.3888, + "train_tokens_per_second": 2013.033 + }, + { + "epoch": 0.0696969696969697, + "grad_norm": 0.06323213130235672, + "learning_rate": 6.90909090909091e-05, + "loss": 0.02602136880159378, + "num_input_tokens_seen": 1883240, + "step": 115, + "train_runtime": 935.5053, + "train_tokens_per_second": 2013.072 + }, + { + "epoch": 0.0703030303030303, + "grad_norm": 0.06442830711603165, + "learning_rate": 6.96969696969697e-05, + "loss": 0.024133116006851196, + "num_input_tokens_seen": 1899616, + "step": 116, + "train_runtime": 943.6216, + "train_tokens_per_second": 2013.112 + }, + { + "epoch": 0.07090909090909091, + "grad_norm": 0.057056326419115067, + "learning_rate": 7.03030303030303e-05, + "loss": 0.029189810156822205, + "num_input_tokens_seen": 1915992, + "step": 117, + "train_runtime": 951.74, + "train_tokens_per_second": 2013.147 + }, + { + "epoch": 0.07151515151515152, + "grad_norm": 0.067554771900177, + "learning_rate": 7.090909090909092e-05, + "loss": 0.026694156229496002, + "num_input_tokens_seen": 1932368, + "step": 118, + "train_runtime": 959.8558, + "train_tokens_per_second": 2013.186 + }, + { + "epoch": 0.07212121212121213, + "grad_norm": 0.14906729757785797, + "learning_rate": 7.151515151515152e-05, + "loss": 0.027481166645884514, + "num_input_tokens_seen": 1948744, + "step": 119, + "train_runtime": 967.9726, + "train_tokens_per_second": 2013.222 + }, + { + "epoch": 0.07272727272727272, + "grad_norm": 0.08957181125879288, + "learning_rate": 7.212121212121213e-05, + "loss": 0.026221584528684616, + "num_input_tokens_seen": 1965120, + "step": 120, + "train_runtime": 976.0892, + "train_tokens_per_second": 2013.259 + }, + { + "epoch": 0.07333333333333333, + "grad_norm": 0.06401059031486511, + "learning_rate": 7.272727272727273e-05, + "loss": 0.024882640689611435, + "num_input_tokens_seen": 1981496, + "step": 121, + "train_runtime": 984.2063, + "train_tokens_per_second": 2013.293 + }, + { + "epoch": 0.07393939393939394, + "grad_norm": 0.08041027188301086, + "learning_rate": 7.333333333333333e-05, + "loss": 0.02306070551276207, + "num_input_tokens_seen": 1997872, + "step": 122, + "train_runtime": 992.3345, + "train_tokens_per_second": 2013.305 + }, + { + "epoch": 0.07454545454545454, + "grad_norm": 0.12150601297616959, + "learning_rate": 7.393939393939395e-05, + "loss": 0.024561185389757156, + "num_input_tokens_seen": 2014248, + "step": 123, + "train_runtime": 1000.452, + "train_tokens_per_second": 2013.338 + }, + { + "epoch": 0.07515151515151515, + "grad_norm": 0.24074473977088928, + "learning_rate": 7.454545454545455e-05, + "loss": 0.027396628633141518, + "num_input_tokens_seen": 2030624, + "step": 124, + "train_runtime": 1008.5688, + "train_tokens_per_second": 2013.372 + }, + { + "epoch": 0.07575757575757576, + "grad_norm": 0.05276267230510712, + "learning_rate": 7.515151515151515e-05, + "loss": 0.024067046120762825, + "num_input_tokens_seen": 2047000, + "step": 125, + "train_runtime": 1016.6862, + "train_tokens_per_second": 2013.404 + }, + { + "epoch": 0.07636363636363637, + "grad_norm": 0.17272238433361053, + "learning_rate": 7.575757575757576e-05, + "loss": 0.023468442261219025, + "num_input_tokens_seen": 2063376, + "step": 126, + "train_runtime": 1024.8042, + "train_tokens_per_second": 2013.434 + }, + { + "epoch": 0.07696969696969697, + "grad_norm": 0.3582988977432251, + "learning_rate": 7.636363636363637e-05, + "loss": 0.027403943240642548, + "num_input_tokens_seen": 2079752, + "step": 127, + "train_runtime": 1032.9345, + "train_tokens_per_second": 2013.44 + }, + { + "epoch": 0.07757575757575758, + "grad_norm": 0.0781882107257843, + "learning_rate": 7.696969696969696e-05, + "loss": 0.023713622242212296, + "num_input_tokens_seen": 2096128, + "step": 128, + "train_runtime": 1041.056, + "train_tokens_per_second": 2013.463 + }, + { + "epoch": 0.07818181818181819, + "grad_norm": 0.07272130995988846, + "learning_rate": 7.757575757575758e-05, + "loss": 0.022761020809412003, + "num_input_tokens_seen": 2112504, + "step": 129, + "train_runtime": 1049.1772, + "train_tokens_per_second": 2013.486 + }, + { + "epoch": 0.07878787878787878, + "grad_norm": 0.2158210277557373, + "learning_rate": 7.818181818181818e-05, + "loss": 0.024013228714466095, + "num_input_tokens_seen": 2128880, + "step": 130, + "train_runtime": 1057.2975, + "train_tokens_per_second": 2013.511 + }, + { + "epoch": 0.07939393939393939, + "grad_norm": 0.586162269115448, + "learning_rate": 7.878787878787879e-05, + "loss": 0.022834377363324165, + "num_input_tokens_seen": 2145256, + "step": 131, + "train_runtime": 1065.4164, + "train_tokens_per_second": 2013.538 + }, + { + "epoch": 0.08, + "grad_norm": 0.323000431060791, + "learning_rate": 7.93939393939394e-05, + "loss": 0.022654253989458084, + "num_input_tokens_seen": 2161632, + "step": 132, + "train_runtime": 1073.5352, + "train_tokens_per_second": 2013.564 + }, + { + "epoch": 0.08060606060606061, + "grad_norm": 0.08159562945365906, + "learning_rate": 8e-05, + "loss": 0.02390367165207863, + "num_input_tokens_seen": 2178008, + "step": 133, + "train_runtime": 1081.6528, + "train_tokens_per_second": 2013.593 + }, + { + "epoch": 0.08121212121212121, + "grad_norm": 0.7155167460441589, + "learning_rate": 8.060606060606061e-05, + "loss": 0.022787289693951607, + "num_input_tokens_seen": 2194384, + "step": 134, + "train_runtime": 1089.7709, + "train_tokens_per_second": 2013.619 + }, + { + "epoch": 0.08181818181818182, + "grad_norm": 0.08167142421007156, + "learning_rate": 8.121212121212121e-05, + "loss": 0.02184353396296501, + "num_input_tokens_seen": 2210760, + "step": 135, + "train_runtime": 1097.8902, + "train_tokens_per_second": 2013.644 + }, + { + "epoch": 0.08242424242424243, + "grad_norm": 0.47277864813804626, + "learning_rate": 8.181818181818183e-05, + "loss": 0.02624150738120079, + "num_input_tokens_seen": 2227136, + "step": 136, + "train_runtime": 1106.0079, + "train_tokens_per_second": 2013.671 + }, + { + "epoch": 0.08303030303030302, + "grad_norm": 0.07428373396396637, + "learning_rate": 8.242424242424243e-05, + "loss": 0.02352747693657875, + "num_input_tokens_seen": 2243512, + "step": 137, + "train_runtime": 1114.1326, + "train_tokens_per_second": 2013.685 + }, + { + "epoch": 0.08363636363636363, + "grad_norm": 0.47124460339546204, + "learning_rate": 8.303030303030304e-05, + "loss": 0.025087552145123482, + "num_input_tokens_seen": 2259888, + "step": 138, + "train_runtime": 1122.2501, + "train_tokens_per_second": 2013.712 + }, + { + "epoch": 0.08424242424242424, + "grad_norm": 0.2430545538663864, + "learning_rate": 8.363636363636364e-05, + "loss": 0.024803292006254196, + "num_input_tokens_seen": 2276264, + "step": 139, + "train_runtime": 1130.3676, + "train_tokens_per_second": 2013.738 + }, + { + "epoch": 0.08484848484848485, + "grad_norm": 0.08046893775463104, + "learning_rate": 8.424242424242424e-05, + "loss": 0.022827964276075363, + "num_input_tokens_seen": 2292640, + "step": 140, + "train_runtime": 1138.4851, + "train_tokens_per_second": 2013.764 + }, + { + "epoch": 0.08545454545454545, + "grad_norm": 0.15526282787322998, + "learning_rate": 8.484848484848486e-05, + "loss": 0.02164369635283947, + "num_input_tokens_seen": 2309016, + "step": 141, + "train_runtime": 1146.6046, + "train_tokens_per_second": 2013.786 + }, + { + "epoch": 0.08606060606060606, + "grad_norm": 0.0912376195192337, + "learning_rate": 8.545454545454545e-05, + "loss": 0.0223920289427042, + "num_input_tokens_seen": 2325392, + "step": 142, + "train_runtime": 1154.7226, + "train_tokens_per_second": 2013.81 + }, + { + "epoch": 0.08666666666666667, + "grad_norm": 0.08407703042030334, + "learning_rate": 8.606060606060606e-05, + "loss": 0.022693689912557602, + "num_input_tokens_seen": 2341768, + "step": 143, + "train_runtime": 1162.8406, + "train_tokens_per_second": 2013.834 + }, + { + "epoch": 0.08727272727272728, + "grad_norm": 0.07187625020742416, + "learning_rate": 8.666666666666667e-05, + "loss": 0.020523108541965485, + "num_input_tokens_seen": 2358144, + "step": 144, + "train_runtime": 1170.9602, + "train_tokens_per_second": 2013.855 + }, + { + "epoch": 0.08787878787878788, + "grad_norm": 0.08785762637853622, + "learning_rate": 8.727272727272727e-05, + "loss": 0.023188354447484016, + "num_input_tokens_seen": 2374520, + "step": 145, + "train_runtime": 1179.0803, + "train_tokens_per_second": 2013.875 + }, + { + "epoch": 0.08848484848484849, + "grad_norm": 0.06223875284194946, + "learning_rate": 8.787878787878789e-05, + "loss": 0.019059190526604652, + "num_input_tokens_seen": 2390896, + "step": 146, + "train_runtime": 1187.2017, + "train_tokens_per_second": 2013.892 + }, + { + "epoch": 0.0890909090909091, + "grad_norm": 0.09552452713251114, + "learning_rate": 8.848484848484849e-05, + "loss": 0.020222101360559464, + "num_input_tokens_seen": 2407272, + "step": 147, + "train_runtime": 1195.3217, + "train_tokens_per_second": 2013.911 + }, + { + "epoch": 0.08969696969696969, + "grad_norm": 0.07248228043317795, + "learning_rate": 8.90909090909091e-05, + "loss": 0.020538993179798126, + "num_input_tokens_seen": 2423648, + "step": 148, + "train_runtime": 1203.4411, + "train_tokens_per_second": 2013.932 + }, + { + "epoch": 0.0903030303030303, + "grad_norm": 0.08636505901813507, + "learning_rate": 8.96969696969697e-05, + "loss": 0.020172201097011566, + "num_input_tokens_seen": 2440024, + "step": 149, + "train_runtime": 1211.5609, + "train_tokens_per_second": 2013.951 + }, + { + "epoch": 0.09090909090909091, + "grad_norm": 0.0678800642490387, + "learning_rate": 9.030303030303031e-05, + "loss": 0.01839592307806015, + "num_input_tokens_seen": 2456400, + "step": 150, + "train_runtime": 1219.679, + "train_tokens_per_second": 2013.973 + }, + { + "epoch": 0.09151515151515152, + "grad_norm": 0.08543987572193146, + "learning_rate": 9.090909090909092e-05, + "loss": 0.02213234454393387, + "num_input_tokens_seen": 2472776, + "step": 151, + "train_runtime": 1227.7971, + "train_tokens_per_second": 2013.994 + }, + { + "epoch": 0.09212121212121212, + "grad_norm": 0.06894785910844803, + "learning_rate": 9.151515151515152e-05, + "loss": 0.019493641331791878, + "num_input_tokens_seen": 2489152, + "step": 152, + "train_runtime": 1235.9161, + "train_tokens_per_second": 2014.014 + }, + { + "epoch": 0.09272727272727273, + "grad_norm": 0.0796777755022049, + "learning_rate": 9.212121212121214e-05, + "loss": 0.019212841987609863, + "num_input_tokens_seen": 2505528, + "step": 153, + "train_runtime": 1244.0335, + "train_tokens_per_second": 2014.036 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 0.03816372528672218, + "learning_rate": 9.272727272727273e-05, + "loss": 0.018845168873667717, + "num_input_tokens_seen": 2521904, + "step": 154, + "train_runtime": 1252.1501, + "train_tokens_per_second": 2014.059 + }, + { + "epoch": 0.09393939393939393, + "grad_norm": 0.05867328122258186, + "learning_rate": 9.333333333333334e-05, + "loss": 0.020137080922722816, + "num_input_tokens_seen": 2538280, + "step": 155, + "train_runtime": 1260.2669, + "train_tokens_per_second": 2014.081 + }, + { + "epoch": 0.09454545454545454, + "grad_norm": 0.12616179883480072, + "learning_rate": 9.393939393939395e-05, + "loss": 0.023685304448008537, + "num_input_tokens_seen": 2554656, + "step": 156, + "train_runtime": 1268.385, + "train_tokens_per_second": 2014.101 + }, + { + "epoch": 0.09515151515151515, + "grad_norm": 0.06801550090312958, + "learning_rate": 9.454545454545455e-05, + "loss": 0.021116768941283226, + "num_input_tokens_seen": 2571032, + "step": 157, + "train_runtime": 1276.5029, + "train_tokens_per_second": 2014.122 + }, + { + "epoch": 0.09575757575757576, + "grad_norm": 0.05668250098824501, + "learning_rate": 9.515151515151515e-05, + "loss": 0.019319312646985054, + "num_input_tokens_seen": 2587408, + "step": 158, + "train_runtime": 1284.6181, + "train_tokens_per_second": 2014.146 + }, + { + "epoch": 0.09636363636363636, + "grad_norm": 0.05750446021556854, + "learning_rate": 9.575757575757576e-05, + "loss": 0.01928100548684597, + "num_input_tokens_seen": 2603784, + "step": 159, + "train_runtime": 1292.7386, + "train_tokens_per_second": 2014.161 + }, + { + "epoch": 0.09696969696969697, + "grad_norm": 0.08826832473278046, + "learning_rate": 9.636363636363637e-05, + "loss": 0.02036631852388382, + "num_input_tokens_seen": 2620160, + "step": 160, + "train_runtime": 1300.8562, + "train_tokens_per_second": 2014.181 + }, + { + "epoch": 0.09757575757575758, + "grad_norm": 0.05680972710251808, + "learning_rate": 9.696969696969698e-05, + "loss": 0.017789499834179878, + "num_input_tokens_seen": 2636536, + "step": 161, + "train_runtime": 1308.9737, + "train_tokens_per_second": 2014.201 + }, + { + "epoch": 0.09818181818181818, + "grad_norm": 0.04641514644026756, + "learning_rate": 9.757575757575758e-05, + "loss": 0.02048567123711109, + "num_input_tokens_seen": 2652912, + "step": 162, + "train_runtime": 1317.092, + "train_tokens_per_second": 2014.219 + }, + { + "epoch": 0.09878787878787879, + "grad_norm": 0.04058675095438957, + "learning_rate": 9.818181818181818e-05, + "loss": 0.019105076789855957, + "num_input_tokens_seen": 2669288, + "step": 163, + "train_runtime": 1325.2097, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.0993939393939394, + "grad_norm": 0.08786831051111221, + "learning_rate": 9.87878787878788e-05, + "loss": 0.020488332957029343, + "num_input_tokens_seen": 2685664, + "step": 164, + "train_runtime": 1333.3352, + "train_tokens_per_second": 2014.245 + }, + { + "epoch": 0.1, + "grad_norm": 0.05097790062427521, + "learning_rate": 9.939393939393939e-05, + "loss": 0.018979694694280624, + "num_input_tokens_seen": 2702040, + "step": 165, + "train_runtime": 1341.4534, + "train_tokens_per_second": 2014.263 + }, + { + "epoch": 0.1006060606060606, + "grad_norm": 0.05220174416899681, + "learning_rate": 0.0001, + "loss": 0.017788853496313095, + "num_input_tokens_seen": 2718416, + "step": 166, + "train_runtime": 1349.5711, + "train_tokens_per_second": 2014.281 + }, + { + "epoch": 0.10121212121212121, + "grad_norm": 0.07084593176841736, + "learning_rate": 9.999999907529869e-05, + "loss": 0.017644576728343964, + "num_input_tokens_seen": 2734792, + "step": 167, + "train_runtime": 1357.6892, + "train_tokens_per_second": 2014.299 + }, + { + "epoch": 0.10181818181818182, + "grad_norm": 0.058325134217739105, + "learning_rate": 9.999999630119479e-05, + "loss": 0.01890077441930771, + "num_input_tokens_seen": 2751168, + "step": 168, + "train_runtime": 1365.8058, + "train_tokens_per_second": 2014.319 + }, + { + "epoch": 0.10242424242424242, + "grad_norm": 0.06277347356081009, + "learning_rate": 9.999999167768837e-05, + "loss": 0.020100781694054604, + "num_input_tokens_seen": 2767544, + "step": 169, + "train_runtime": 1373.9351, + "train_tokens_per_second": 2014.319 + }, + { + "epoch": 0.10303030303030303, + "grad_norm": 0.07524619996547699, + "learning_rate": 9.999998520477966e-05, + "loss": 0.016615130007267, + "num_input_tokens_seen": 2783920, + "step": 170, + "train_runtime": 1382.0536, + "train_tokens_per_second": 2014.336 + }, + { + "epoch": 0.10363636363636364, + "grad_norm": 0.07865840196609497, + "learning_rate": 9.999997688246885e-05, + "loss": 0.02175009250640869, + "num_input_tokens_seen": 2800296, + "step": 171, + "train_runtime": 1390.173, + "train_tokens_per_second": 2014.351 + }, + { + "epoch": 0.10424242424242425, + "grad_norm": 0.10437590628862381, + "learning_rate": 9.999996671075626e-05, + "loss": 0.021732885390520096, + "num_input_tokens_seen": 2816672, + "step": 172, + "train_runtime": 1398.29, + "train_tokens_per_second": 2014.369 + }, + { + "epoch": 0.10484848484848484, + "grad_norm": 0.09102741628885269, + "learning_rate": 9.99999546896423e-05, + "loss": 0.019160069525241852, + "num_input_tokens_seen": 2833048, + "step": 173, + "train_runtime": 1406.4092, + "train_tokens_per_second": 2014.384 + }, + { + "epoch": 0.10545454545454545, + "grad_norm": 0.09274180978536606, + "learning_rate": 9.999994081912736e-05, + "loss": 0.020909177139401436, + "num_input_tokens_seen": 2849424, + "step": 174, + "train_runtime": 1414.5329, + "train_tokens_per_second": 2014.392 + }, + { + "epoch": 0.10606060606060606, + "grad_norm": 0.0448119193315506, + "learning_rate": 9.999992509921199e-05, + "loss": 0.018382754176855087, + "num_input_tokens_seen": 2865800, + "step": 175, + "train_runtime": 1422.6511, + "train_tokens_per_second": 2014.408 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 0.04945825785398483, + "learning_rate": 9.999990752989675e-05, + "loss": 0.01783941313624382, + "num_input_tokens_seen": 2882176, + "step": 176, + "train_runtime": 1430.7704, + "train_tokens_per_second": 2014.422 + }, + { + "epoch": 0.10727272727272727, + "grad_norm": 0.04921802878379822, + "learning_rate": 9.999988811118231e-05, + "loss": 0.01793338730931282, + "num_input_tokens_seen": 2898552, + "step": 177, + "train_runtime": 1438.89, + "train_tokens_per_second": 2014.436 + }, + { + "epoch": 0.10787878787878788, + "grad_norm": 0.05301757901906967, + "learning_rate": 9.999986684306937e-05, + "loss": 0.01700768433511257, + "num_input_tokens_seen": 2914928, + "step": 178, + "train_runtime": 1447.011, + "train_tokens_per_second": 2014.448 + }, + { + "epoch": 0.10848484848484849, + "grad_norm": 0.0539541132748127, + "learning_rate": 9.999984372555874e-05, + "loss": 0.01774643547832966, + "num_input_tokens_seen": 2931304, + "step": 179, + "train_runtime": 1455.1319, + "train_tokens_per_second": 2014.459 + }, + { + "epoch": 0.10909090909090909, + "grad_norm": 0.046017974615097046, + "learning_rate": 9.999981875865125e-05, + "loss": 0.016473708674311638, + "num_input_tokens_seen": 2947680, + "step": 180, + "train_runtime": 1463.2551, + "train_tokens_per_second": 2014.468 + }, + { + "epoch": 0.1096969696969697, + "grad_norm": 0.05201786011457443, + "learning_rate": 9.999979194234786e-05, + "loss": 0.019079631194472313, + "num_input_tokens_seen": 2964056, + "step": 181, + "train_runtime": 1471.3776, + "train_tokens_per_second": 2014.477 + }, + { + "epoch": 0.1103030303030303, + "grad_norm": 0.07819167524576187, + "learning_rate": 9.99997632766495e-05, + "loss": 0.018508095294237137, + "num_input_tokens_seen": 2980432, + "step": 182, + "train_runtime": 1479.496, + "train_tokens_per_second": 2014.491 + }, + { + "epoch": 0.11090909090909092, + "grad_norm": 0.04773807153105736, + "learning_rate": 9.999973276155727e-05, + "loss": 0.016029708087444305, + "num_input_tokens_seen": 2996808, + "step": 183, + "train_runtime": 1487.6149, + "train_tokens_per_second": 2014.505 + }, + { + "epoch": 0.11151515151515151, + "grad_norm": 0.054091572761535645, + "learning_rate": 9.999970039707232e-05, + "loss": 0.01906082220375538, + "num_input_tokens_seen": 3013184, + "step": 184, + "train_runtime": 1495.7326, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.11212121212121212, + "grad_norm": 0.03870342671871185, + "learning_rate": 9.999966618319581e-05, + "loss": 0.01634303852915764, + "num_input_tokens_seen": 3029560, + "step": 185, + "train_runtime": 1503.8521, + "train_tokens_per_second": 2014.533 + }, + { + "epoch": 0.11272727272727273, + "grad_norm": 0.04409291222691536, + "learning_rate": 9.999963011992902e-05, + "loss": 0.016504261642694473, + "num_input_tokens_seen": 3045936, + "step": 186, + "train_runtime": 1511.9705, + "train_tokens_per_second": 2014.547 + }, + { + "epoch": 0.11333333333333333, + "grad_norm": 0.037538424134254456, + "learning_rate": 9.999959220727327e-05, + "loss": 0.016254613175988197, + "num_input_tokens_seen": 3062312, + "step": 187, + "train_runtime": 1520.0898, + "train_tokens_per_second": 2014.56 + }, + { + "epoch": 0.11393939393939394, + "grad_norm": 0.0896935984492302, + "learning_rate": 9.999955244522999e-05, + "loss": 0.016761597245931625, + "num_input_tokens_seen": 3078688, + "step": 188, + "train_runtime": 1528.2094, + "train_tokens_per_second": 2014.572 + }, + { + "epoch": 0.11454545454545455, + "grad_norm": 0.10176566988229752, + "learning_rate": 9.999951083380062e-05, + "loss": 0.01988411694765091, + "num_input_tokens_seen": 3095064, + "step": 189, + "train_runtime": 1536.333, + "train_tokens_per_second": 2014.579 + }, + { + "epoch": 0.11515151515151516, + "grad_norm": 0.039956171065568924, + "learning_rate": 9.999946737298674e-05, + "loss": 0.015326369553804398, + "num_input_tokens_seen": 3111440, + "step": 190, + "train_runtime": 1544.4503, + "train_tokens_per_second": 2014.594 + }, + { + "epoch": 0.11575757575757575, + "grad_norm": 0.06942013651132584, + "learning_rate": 9.99994220627899e-05, + "loss": 0.017792224884033203, + "num_input_tokens_seen": 3127816, + "step": 191, + "train_runtime": 1552.5689, + "train_tokens_per_second": 2014.607 + }, + { + "epoch": 0.11636363636363636, + "grad_norm": 0.06119908019900322, + "learning_rate": 9.999937490321182e-05, + "loss": 0.016535507515072823, + "num_input_tokens_seen": 3144192, + "step": 192, + "train_runtime": 1560.6857, + "train_tokens_per_second": 2014.622 + }, + { + "epoch": 0.11696969696969697, + "grad_norm": 0.07336534559726715, + "learning_rate": 9.999932589425423e-05, + "loss": 0.015493718907237053, + "num_input_tokens_seen": 3160568, + "step": 193, + "train_runtime": 1568.8033, + "train_tokens_per_second": 2014.636 + }, + { + "epoch": 0.11757575757575757, + "grad_norm": 0.03818663954734802, + "learning_rate": 9.999927503591896e-05, + "loss": 0.017348209396004677, + "num_input_tokens_seen": 3176944, + "step": 194, + "train_runtime": 1576.9206, + "train_tokens_per_second": 2014.651 + }, + { + "epoch": 0.11818181818181818, + "grad_norm": 0.028583593666553497, + "learning_rate": 9.999922232820785e-05, + "loss": 0.014952014200389385, + "num_input_tokens_seen": 3193320, + "step": 195, + "train_runtime": 1585.0393, + "train_tokens_per_second": 2014.663 + }, + { + "epoch": 0.11878787878787879, + "grad_norm": 0.04163753613829613, + "learning_rate": 9.999916777112288e-05, + "loss": 0.017875926569104195, + "num_input_tokens_seen": 3209696, + "step": 196, + "train_runtime": 1593.159, + "train_tokens_per_second": 2014.674 + }, + { + "epoch": 0.1193939393939394, + "grad_norm": 0.03779582679271698, + "learning_rate": 9.999911136466608e-05, + "loss": 0.01648208498954773, + "num_input_tokens_seen": 3226072, + "step": 197, + "train_runtime": 1601.2758, + "train_tokens_per_second": 2014.689 + }, + { + "epoch": 0.12, + "grad_norm": 0.06097209453582764, + "learning_rate": 9.99990531088395e-05, + "loss": 0.017982497811317444, + "num_input_tokens_seen": 3242448, + "step": 198, + "train_runtime": 1609.4726, + "train_tokens_per_second": 2014.603 + }, + { + "epoch": 0.1206060606060606, + "grad_norm": 0.07450928539037704, + "learning_rate": 9.999899300364532e-05, + "loss": 0.015351779758930206, + "num_input_tokens_seen": 3258824, + "step": 199, + "train_runtime": 1617.5877, + "train_tokens_per_second": 2014.62 + }, + { + "epoch": 0.12121212121212122, + "grad_norm": 0.06301674991846085, + "learning_rate": 9.999893104908577e-05, + "loss": 0.018576189875602722, + "num_input_tokens_seen": 3275200, + "step": 200, + "train_runtime": 1625.7153, + "train_tokens_per_second": 2014.621 + }, + { + "epoch": 0.12181818181818181, + "grad_norm": 0.05599730834364891, + "learning_rate": 9.999886724516312e-05, + "loss": 0.018099110573530197, + "num_input_tokens_seen": 3291576, + "step": 201, + "train_runtime": 1635.3633, + "train_tokens_per_second": 2012.749 + }, + { + "epoch": 0.12242424242424242, + "grad_norm": 0.040753431618213654, + "learning_rate": 9.999880159187975e-05, + "loss": 0.015437884256243706, + "num_input_tokens_seen": 3307952, + "step": 202, + "train_runtime": 1643.4859, + "train_tokens_per_second": 2012.766 + }, + { + "epoch": 0.12303030303030303, + "grad_norm": 0.03280268982052803, + "learning_rate": 9.999873408923806e-05, + "loss": 0.01625344157218933, + "num_input_tokens_seen": 3324328, + "step": 203, + "train_runtime": 1651.609, + "train_tokens_per_second": 2012.781 + }, + { + "epoch": 0.12363636363636364, + "grad_norm": 0.058769796043634415, + "learning_rate": 9.999866473724057e-05, + "loss": 0.019040308892726898, + "num_input_tokens_seen": 3340704, + "step": 204, + "train_runtime": 1659.7319, + "train_tokens_per_second": 2012.797 + }, + { + "epoch": 0.12424242424242424, + "grad_norm": 0.07302497327327728, + "learning_rate": 9.999859353588984e-05, + "loss": 0.015959227457642555, + "num_input_tokens_seen": 3357080, + "step": 205, + "train_runtime": 1667.8511, + "train_tokens_per_second": 2012.818 + }, + { + "epoch": 0.12484848484848485, + "grad_norm": 0.038392290472984314, + "learning_rate": 9.999852048518849e-05, + "loss": 0.015184870921075344, + "num_input_tokens_seen": 3373456, + "step": 206, + "train_runtime": 1675.97, + "train_tokens_per_second": 2012.838 + }, + { + "epoch": 0.12545454545454546, + "grad_norm": 0.057108521461486816, + "learning_rate": 9.999844558513926e-05, + "loss": 0.018102547153830528, + "num_input_tokens_seen": 3389832, + "step": 207, + "train_runtime": 1684.0874, + "train_tokens_per_second": 2012.86 + }, + { + "epoch": 0.12606060606060607, + "grad_norm": 0.05192007124423981, + "learning_rate": 9.999836883574488e-05, + "loss": 0.016045067459344864, + "num_input_tokens_seen": 3406208, + "step": 208, + "train_runtime": 1692.2048, + "train_tokens_per_second": 2012.882 + }, + { + "epoch": 0.12666666666666668, + "grad_norm": 0.05115659907460213, + "learning_rate": 9.99982902370082e-05, + "loss": 0.016623271629214287, + "num_input_tokens_seen": 3422584, + "step": 209, + "train_runtime": 1700.3232, + "train_tokens_per_second": 2012.902 + }, + { + "epoch": 0.12727272727272726, + "grad_norm": 0.07258911430835724, + "learning_rate": 9.999820978893216e-05, + "loss": 0.020482556894421577, + "num_input_tokens_seen": 3438960, + "step": 210, + "train_runtime": 1708.4412, + "train_tokens_per_second": 2012.923 + }, + { + "epoch": 0.12787878787878787, + "grad_norm": 0.1083996444940567, + "learning_rate": 9.999812749151966e-05, + "loss": 0.020862706005573273, + "num_input_tokens_seen": 3455336, + "step": 211, + "train_runtime": 1716.5608, + "train_tokens_per_second": 2012.941 + }, + { + "epoch": 0.12848484848484848, + "grad_norm": 0.04957745969295502, + "learning_rate": 9.999804334477383e-05, + "loss": 0.019352620467543602, + "num_input_tokens_seen": 3471712, + "step": 212, + "train_runtime": 1724.679, + "train_tokens_per_second": 2012.961 + }, + { + "epoch": 0.1290909090909091, + "grad_norm": 0.05110868439078331, + "learning_rate": 9.999795734869772e-05, + "loss": 0.01801101304590702, + "num_input_tokens_seen": 3488088, + "step": 213, + "train_runtime": 1732.7974, + "train_tokens_per_second": 2012.981 + }, + { + "epoch": 0.1296969696969697, + "grad_norm": 0.03656603768467903, + "learning_rate": 9.999786950329454e-05, + "loss": 0.014664572663605213, + "num_input_tokens_seen": 3504464, + "step": 214, + "train_runtime": 1740.9181, + "train_tokens_per_second": 2012.998 + }, + { + "epoch": 0.1303030303030303, + "grad_norm": 0.06225895509123802, + "learning_rate": 9.999777980856754e-05, + "loss": 0.01811577007174492, + "num_input_tokens_seen": 3520840, + "step": 215, + "train_runtime": 1749.0394, + "train_tokens_per_second": 2013.014 + }, + { + "epoch": 0.13090909090909092, + "grad_norm": 0.06217541545629501, + "learning_rate": 9.999768826452004e-05, + "loss": 0.015230846591293812, + "num_input_tokens_seen": 3537216, + "step": 216, + "train_runtime": 1757.1603, + "train_tokens_per_second": 2013.03 + }, + { + "epoch": 0.1315151515151515, + "grad_norm": 0.0395430289208889, + "learning_rate": 9.999759487115541e-05, + "loss": 0.017680658027529716, + "num_input_tokens_seen": 3553592, + "step": 217, + "train_runtime": 1765.2799, + "train_tokens_per_second": 2013.047 + }, + { + "epoch": 0.1321212121212121, + "grad_norm": 0.04460732638835907, + "learning_rate": 9.999749962847711e-05, + "loss": 0.015775006264448166, + "num_input_tokens_seen": 3569968, + "step": 218, + "train_runtime": 1773.4008, + "train_tokens_per_second": 2013.063 + }, + { + "epoch": 0.13272727272727272, + "grad_norm": 0.026493152603507042, + "learning_rate": 9.999740253648866e-05, + "loss": 0.016286678612232208, + "num_input_tokens_seen": 3586344, + "step": 219, + "train_runtime": 1781.5181, + "train_tokens_per_second": 2013.083 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 0.05032551288604736, + "learning_rate": 9.999730359519366e-05, + "loss": 0.01735139824450016, + "num_input_tokens_seen": 3602720, + "step": 220, + "train_runtime": 1789.6355, + "train_tokens_per_second": 2013.103 + }, + { + "epoch": 0.13393939393939394, + "grad_norm": 0.10480339080095291, + "learning_rate": 9.999720280459576e-05, + "loss": 0.0164189450442791, + "num_input_tokens_seen": 3619096, + "step": 221, + "train_runtime": 1797.7553, + "train_tokens_per_second": 2013.119 + }, + { + "epoch": 0.13454545454545455, + "grad_norm": 0.05456702038645744, + "learning_rate": 9.99971001646987e-05, + "loss": 0.018650280311703682, + "num_input_tokens_seen": 3635472, + "step": 222, + "train_runtime": 1805.876, + "train_tokens_per_second": 2013.135 + }, + { + "epoch": 0.13515151515151516, + "grad_norm": 0.03562236949801445, + "learning_rate": 9.999699567550627e-05, + "loss": 0.014892566949129105, + "num_input_tokens_seen": 3651848, + "step": 223, + "train_runtime": 1813.9965, + "train_tokens_per_second": 2013.151 + }, + { + "epoch": 0.13575757575757577, + "grad_norm": 0.09293515980243683, + "learning_rate": 9.999688933702232e-05, + "loss": 0.019074441865086555, + "num_input_tokens_seen": 3668224, + "step": 224, + "train_runtime": 1822.1164, + "train_tokens_per_second": 2013.167 + }, + { + "epoch": 0.13636363636363635, + "grad_norm": 0.04311508685350418, + "learning_rate": 9.99967811492508e-05, + "loss": 0.016122177243232727, + "num_input_tokens_seen": 3684600, + "step": 225, + "train_runtime": 1830.237, + "train_tokens_per_second": 2013.182 + }, + { + "epoch": 0.13696969696969696, + "grad_norm": 0.0684700533747673, + "learning_rate": 9.999667111219573e-05, + "loss": 0.016784384846687317, + "num_input_tokens_seen": 3700976, + "step": 226, + "train_runtime": 1838.3572, + "train_tokens_per_second": 2013.197 + }, + { + "epoch": 0.13757575757575757, + "grad_norm": 0.051709555089473724, + "learning_rate": 9.999655922586116e-05, + "loss": 0.01756284013390541, + "num_input_tokens_seen": 3717352, + "step": 227, + "train_runtime": 1846.4811, + "train_tokens_per_second": 2013.209 + }, + { + "epoch": 0.13818181818181818, + "grad_norm": 0.06800346821546555, + "learning_rate": 9.99964454902512e-05, + "loss": 0.018563883379101753, + "num_input_tokens_seen": 3733728, + "step": 228, + "train_runtime": 1854.6021, + "train_tokens_per_second": 2013.223 + }, + { + "epoch": 0.1387878787878788, + "grad_norm": 0.04645644128322601, + "learning_rate": 9.99963299053701e-05, + "loss": 0.017479516565799713, + "num_input_tokens_seen": 3750104, + "step": 229, + "train_runtime": 1862.7316, + "train_tokens_per_second": 2013.228 + }, + { + "epoch": 0.1393939393939394, + "grad_norm": 0.07372930645942688, + "learning_rate": 9.999621247122213e-05, + "loss": 0.017878303304314613, + "num_input_tokens_seen": 3766480, + "step": 230, + "train_runtime": 1870.8516, + "train_tokens_per_second": 2013.244 + }, + { + "epoch": 0.14, + "grad_norm": 0.1514655202627182, + "learning_rate": 9.99960931878116e-05, + "loss": 0.015512627549469471, + "num_input_tokens_seen": 3782856, + "step": 231, + "train_runtime": 1878.9708, + "train_tokens_per_second": 2013.26 + }, + { + "epoch": 0.1406060606060606, + "grad_norm": 0.04524844512343407, + "learning_rate": 9.999597205514297e-05, + "loss": 0.01565626822412014, + "num_input_tokens_seen": 3799232, + "step": 232, + "train_runtime": 1887.094, + "train_tokens_per_second": 2013.271 + }, + { + "epoch": 0.1412121212121212, + "grad_norm": 0.03657226637005806, + "learning_rate": 9.999584907322069e-05, + "loss": 0.014475165866315365, + "num_input_tokens_seen": 3815608, + "step": 233, + "train_runtime": 1895.2138, + "train_tokens_per_second": 2013.286 + }, + { + "epoch": 0.14181818181818182, + "grad_norm": 0.10837068408727646, + "learning_rate": 9.99957242420493e-05, + "loss": 0.016292275860905647, + "num_input_tokens_seen": 3831984, + "step": 234, + "train_runtime": 1903.3349, + "train_tokens_per_second": 2013.3 + }, + { + "epoch": 0.14242424242424243, + "grad_norm": 0.06915906816720963, + "learning_rate": 9.999559756163346e-05, + "loss": 0.01956966333091259, + "num_input_tokens_seen": 3848360, + "step": 235, + "train_runtime": 1911.4546, + "train_tokens_per_second": 2013.315 + }, + { + "epoch": 0.14303030303030304, + "grad_norm": 0.03815745189785957, + "learning_rate": 9.99954690319778e-05, + "loss": 0.01515297032892704, + "num_input_tokens_seen": 3864736, + "step": 236, + "train_runtime": 1919.5751, + "train_tokens_per_second": 2013.329 + }, + { + "epoch": 0.14363636363636365, + "grad_norm": 0.04804231598973274, + "learning_rate": 9.999533865308712e-05, + "loss": 0.017410308122634888, + "num_input_tokens_seen": 3881112, + "step": 237, + "train_runtime": 1927.6957, + "train_tokens_per_second": 2013.343 + }, + { + "epoch": 0.14424242424242426, + "grad_norm": 0.10351648926734924, + "learning_rate": 9.999520642496623e-05, + "loss": 0.01582871936261654, + "num_input_tokens_seen": 3897488, + "step": 238, + "train_runtime": 1935.8176, + "train_tokens_per_second": 2013.355 + }, + { + "epoch": 0.14484848484848484, + "grad_norm": 0.06399150937795639, + "learning_rate": 9.999507234762e-05, + "loss": 0.015461472794413567, + "num_input_tokens_seen": 3913864, + "step": 239, + "train_runtime": 1943.945, + "train_tokens_per_second": 2013.361 + }, + { + "epoch": 0.14545454545454545, + "grad_norm": 0.027640361338853836, + "learning_rate": 9.999493642105342e-05, + "loss": 0.01647048071026802, + "num_input_tokens_seen": 3930240, + "step": 240, + "train_runtime": 1952.0688, + "train_tokens_per_second": 2013.372 + }, + { + "epoch": 0.14606060606060606, + "grad_norm": 0.07313567399978638, + "learning_rate": 9.999479864527148e-05, + "loss": 0.015903417021036148, + "num_input_tokens_seen": 3946616, + "step": 241, + "train_runtime": 1960.1915, + "train_tokens_per_second": 2013.383 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 0.09255962073802948, + "learning_rate": 9.999465902027931e-05, + "loss": 0.01633605733513832, + "num_input_tokens_seen": 3962992, + "step": 242, + "train_runtime": 1968.3145, + "train_tokens_per_second": 2013.394 + }, + { + "epoch": 0.14727272727272728, + "grad_norm": 0.06311100721359253, + "learning_rate": 9.999451754608207e-05, + "loss": 0.018459340557456017, + "num_input_tokens_seen": 3979368, + "step": 243, + "train_runtime": 1976.4343, + "train_tokens_per_second": 2013.408 + }, + { + "epoch": 0.1478787878787879, + "grad_norm": 0.04240158200263977, + "learning_rate": 9.999437422268498e-05, + "loss": 0.01432002056390047, + "num_input_tokens_seen": 3995744, + "step": 244, + "train_runtime": 1984.5577, + "train_tokens_per_second": 2013.418 + }, + { + "epoch": 0.1484848484848485, + "grad_norm": 0.05550538748502731, + "learning_rate": 9.999422905009335e-05, + "loss": 0.014518518932163715, + "num_input_tokens_seen": 4012120, + "step": 245, + "train_runtime": 1992.685, + "train_tokens_per_second": 2013.424 + }, + { + "epoch": 0.14909090909090908, + "grad_norm": 0.037221502512693405, + "learning_rate": 9.999408202831255e-05, + "loss": 0.014823012985289097, + "num_input_tokens_seen": 4028496, + "step": 246, + "train_runtime": 2000.8075, + "train_tokens_per_second": 2013.435 + }, + { + "epoch": 0.1496969696969697, + "grad_norm": 0.06923341751098633, + "learning_rate": 9.999393315734801e-05, + "loss": 0.018903765827417374, + "num_input_tokens_seen": 4044872, + "step": 247, + "train_runtime": 2008.9335, + "train_tokens_per_second": 2013.443 + }, + { + "epoch": 0.1503030303030303, + "grad_norm": 0.07023045420646667, + "learning_rate": 9.999378243720523e-05, + "loss": 0.01768019236624241, + "num_input_tokens_seen": 4061248, + "step": 248, + "train_runtime": 2017.0572, + "train_tokens_per_second": 2013.452 + }, + { + "epoch": 0.1509090909090909, + "grad_norm": 0.04301533102989197, + "learning_rate": 9.999362986788981e-05, + "loss": 0.016754839569330215, + "num_input_tokens_seen": 4077624, + "step": 249, + "train_runtime": 2025.1771, + "train_tokens_per_second": 2013.465 + }, + { + "epoch": 0.15151515151515152, + "grad_norm": 0.08630920946598053, + "learning_rate": 9.999347544940739e-05, + "loss": 0.014999642968177795, + "num_input_tokens_seen": 4094000, + "step": 250, + "train_runtime": 2033.2978, + "train_tokens_per_second": 2013.478 + }, + { + "epoch": 0.15212121212121213, + "grad_norm": 0.03872856870293617, + "learning_rate": 9.999331918176365e-05, + "loss": 0.015648486092686653, + "num_input_tokens_seen": 4110376, + "step": 251, + "train_runtime": 2041.4306, + "train_tokens_per_second": 2013.478 + }, + { + "epoch": 0.15272727272727274, + "grad_norm": 0.0624275766313076, + "learning_rate": 9.999316106496439e-05, + "loss": 0.015371391549706459, + "num_input_tokens_seen": 4126752, + "step": 252, + "train_runtime": 2049.5498, + "train_tokens_per_second": 2013.492 + }, + { + "epoch": 0.15333333333333332, + "grad_norm": 0.03090560808777809, + "learning_rate": 9.999300109901548e-05, + "loss": 0.013192292302846909, + "num_input_tokens_seen": 4143128, + "step": 253, + "train_runtime": 2057.6702, + "train_tokens_per_second": 2013.504 + }, + { + "epoch": 0.15393939393939393, + "grad_norm": 0.5114591121673584, + "learning_rate": 9.99928392839228e-05, + "loss": 0.018224472180008888, + "num_input_tokens_seen": 4159504, + "step": 254, + "train_runtime": 2065.8007, + "train_tokens_per_second": 2013.507 + }, + { + "epoch": 0.15454545454545454, + "grad_norm": 0.05735045298933983, + "learning_rate": 9.999267561969235e-05, + "loss": 0.017389601096510887, + "num_input_tokens_seen": 4175880, + "step": 255, + "train_runtime": 2073.9307, + "train_tokens_per_second": 2013.51 + }, + { + "epoch": 0.15515151515151515, + "grad_norm": 0.13113801181316376, + "learning_rate": 9.99925101063302e-05, + "loss": 0.015801645815372467, + "num_input_tokens_seen": 4192256, + "step": 256, + "train_runtime": 2082.0503, + "train_tokens_per_second": 2013.523 + }, + { + "epoch": 0.15575757575757576, + "grad_norm": 0.1659373939037323, + "learning_rate": 9.999234274384244e-05, + "loss": 0.016719762235879898, + "num_input_tokens_seen": 4208632, + "step": 257, + "train_runtime": 2090.1723, + "train_tokens_per_second": 2013.534 + }, + { + "epoch": 0.15636363636363637, + "grad_norm": 0.09268343448638916, + "learning_rate": 9.99921735322353e-05, + "loss": 0.01958809420466423, + "num_input_tokens_seen": 4225008, + "step": 258, + "train_runtime": 2098.2916, + "train_tokens_per_second": 2013.547 + }, + { + "epoch": 0.15696969696969698, + "grad_norm": 0.08097874373197556, + "learning_rate": 9.999200247151499e-05, + "loss": 0.01584583893418312, + "num_input_tokens_seen": 4241384, + "step": 259, + "train_runtime": 2106.4305, + "train_tokens_per_second": 2013.541 + }, + { + "epoch": 0.15757575757575756, + "grad_norm": 0.072023406624794, + "learning_rate": 9.999182956168787e-05, + "loss": 0.0168259609490633, + "num_input_tokens_seen": 4257760, + "step": 260, + "train_runtime": 2114.5526, + "train_tokens_per_second": 2013.551 + }, + { + "epoch": 0.15818181818181817, + "grad_norm": 0.038404542952775955, + "learning_rate": 9.999165480276034e-05, + "loss": 0.014127206057310104, + "num_input_tokens_seen": 4274136, + "step": 261, + "train_runtime": 2122.6772, + "train_tokens_per_second": 2013.559 + }, + { + "epoch": 0.15878787878787878, + "grad_norm": 0.03950539231300354, + "learning_rate": 9.999147819473884e-05, + "loss": 0.016822200268507004, + "num_input_tokens_seen": 4290512, + "step": 262, + "train_runtime": 2130.7967, + "train_tokens_per_second": 2013.572 + }, + { + "epoch": 0.1593939393939394, + "grad_norm": 0.04290624335408211, + "learning_rate": 9.999129973762992e-05, + "loss": 0.016068218275904655, + "num_input_tokens_seen": 4306888, + "step": 263, + "train_runtime": 2138.9172, + "train_tokens_per_second": 2013.583 + }, + { + "epoch": 0.16, + "grad_norm": 0.05928179994225502, + "learning_rate": 9.99911194314402e-05, + "loss": 0.016628028824925423, + "num_input_tokens_seen": 4323264, + "step": 264, + "train_runtime": 2147.039, + "train_tokens_per_second": 2013.594 + }, + { + "epoch": 0.1606060606060606, + "grad_norm": 0.04302699863910675, + "learning_rate": 9.99909372761763e-05, + "loss": 0.014704343862831593, + "num_input_tokens_seen": 4339640, + "step": 265, + "train_runtime": 2155.1707, + "train_tokens_per_second": 2013.595 + }, + { + "epoch": 0.16121212121212122, + "grad_norm": 0.047466881573200226, + "learning_rate": 9.999075327184499e-05, + "loss": 0.016627237200737, + "num_input_tokens_seen": 4356016, + "step": 266, + "train_runtime": 2163.294, + "train_tokens_per_second": 2013.603 + }, + { + "epoch": 0.1618181818181818, + "grad_norm": 0.04007207974791527, + "learning_rate": 9.999056741845305e-05, + "loss": 0.01723393052816391, + "num_input_tokens_seen": 4372392, + "step": 267, + "train_runtime": 2171.417, + "train_tokens_per_second": 2013.612 + }, + { + "epoch": 0.16242424242424242, + "grad_norm": 0.04319130629301071, + "learning_rate": 9.99903797160074e-05, + "loss": 0.014541544020175934, + "num_input_tokens_seen": 4388768, + "step": 268, + "train_runtime": 2179.5352, + "train_tokens_per_second": 2013.626 + }, + { + "epoch": 0.16303030303030303, + "grad_norm": 0.02772807702422142, + "learning_rate": 9.999019016451494e-05, + "loss": 0.01326832640916109, + "num_input_tokens_seen": 4405144, + "step": 269, + "train_runtime": 2187.6543, + "train_tokens_per_second": 2013.638 + }, + { + "epoch": 0.16363636363636364, + "grad_norm": 0.03225944936275482, + "learning_rate": 9.998999876398271e-05, + "loss": 0.013814960606396198, + "num_input_tokens_seen": 4421520, + "step": 270, + "train_runtime": 2195.7724, + "train_tokens_per_second": 2013.651 + }, + { + "epoch": 0.16424242424242425, + "grad_norm": 0.03607013449072838, + "learning_rate": 9.998980551441776e-05, + "loss": 0.01566735841333866, + "num_input_tokens_seen": 4437896, + "step": 271, + "train_runtime": 2203.8921, + "train_tokens_per_second": 2013.663 + }, + { + "epoch": 0.16484848484848486, + "grad_norm": 0.02214481309056282, + "learning_rate": 9.998961041582727e-05, + "loss": 0.014288516715168953, + "num_input_tokens_seen": 4454272, + "step": 272, + "train_runtime": 2212.0309, + "train_tokens_per_second": 2013.657 + }, + { + "epoch": 0.16545454545454547, + "grad_norm": 0.03539419174194336, + "learning_rate": 9.998941346821844e-05, + "loss": 0.016615379601716995, + "num_input_tokens_seen": 4470648, + "step": 273, + "train_runtime": 2220.1513, + "train_tokens_per_second": 2013.668 + }, + { + "epoch": 0.16606060606060605, + "grad_norm": 0.02361457794904709, + "learning_rate": 9.998921467159855e-05, + "loss": 0.015559839084744453, + "num_input_tokens_seen": 4487024, + "step": 274, + "train_runtime": 2228.2688, + "train_tokens_per_second": 2013.682 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 0.029787061735987663, + "learning_rate": 9.998901402597496e-05, + "loss": 0.014054241590201855, + "num_input_tokens_seen": 4503400, + "step": 275, + "train_runtime": 2236.3965, + "train_tokens_per_second": 2013.686 + }, + { + "epoch": 0.16727272727272727, + "grad_norm": 0.08080027997493744, + "learning_rate": 9.99888115313551e-05, + "loss": 0.01626443862915039, + "num_input_tokens_seen": 4519776, + "step": 276, + "train_runtime": 2244.5116, + "train_tokens_per_second": 2013.701 + }, + { + "epoch": 0.16787878787878788, + "grad_norm": 0.04751146212220192, + "learning_rate": 9.998860718774643e-05, + "loss": 0.015646975487470627, + "num_input_tokens_seen": 4536152, + "step": 277, + "train_runtime": 2252.6223, + "train_tokens_per_second": 2013.721 + }, + { + "epoch": 0.1684848484848485, + "grad_norm": 0.11396390199661255, + "learning_rate": 9.998840099515655e-05, + "loss": 0.01626933366060257, + "num_input_tokens_seen": 4552528, + "step": 278, + "train_runtime": 2260.7374, + "train_tokens_per_second": 2013.736 + }, + { + "epoch": 0.1690909090909091, + "grad_norm": 0.03807124122977257, + "learning_rate": 9.998819295359305e-05, + "loss": 0.01517193578183651, + "num_input_tokens_seen": 4568904, + "step": 279, + "train_runtime": 2268.8455, + "train_tokens_per_second": 2013.757 + }, + { + "epoch": 0.1696969696969697, + "grad_norm": 0.07842900604009628, + "learning_rate": 9.998798306306366e-05, + "loss": 0.016375314444303513, + "num_input_tokens_seen": 4585280, + "step": 280, + "train_runtime": 2276.9581, + "train_tokens_per_second": 2013.774 + }, + { + "epoch": 0.1703030303030303, + "grad_norm": 0.12316741049289703, + "learning_rate": 9.99877713235761e-05, + "loss": 0.0158452857285738, + "num_input_tokens_seen": 4601656, + "step": 281, + "train_runtime": 2285.07, + "train_tokens_per_second": 2013.792 + }, + { + "epoch": 0.1709090909090909, + "grad_norm": 0.035711321979761124, + "learning_rate": 9.998755773513824e-05, + "loss": 0.014004937373101711, + "num_input_tokens_seen": 4618032, + "step": 282, + "train_runtime": 2293.1794, + "train_tokens_per_second": 2013.812 + }, + { + "epoch": 0.1715151515151515, + "grad_norm": 0.04513373225927353, + "learning_rate": 9.998734229775794e-05, + "loss": 0.015064300037920475, + "num_input_tokens_seen": 4634408, + "step": 283, + "train_runtime": 2301.2911, + "train_tokens_per_second": 2013.83 + }, + { + "epoch": 0.17212121212121212, + "grad_norm": 0.04803522303700447, + "learning_rate": 9.998712501144323e-05, + "loss": 0.015632454305887222, + "num_input_tokens_seen": 4650784, + "step": 284, + "train_runtime": 2309.4064, + "train_tokens_per_second": 2013.844 + }, + { + "epoch": 0.17272727272727273, + "grad_norm": 0.0677453801035881, + "learning_rate": 9.99869058762021e-05, + "loss": 0.01668519154191017, + "num_input_tokens_seen": 4667160, + "step": 285, + "train_runtime": 2317.5195, + "train_tokens_per_second": 2013.86 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 0.06408604979515076, + "learning_rate": 9.998668489204266e-05, + "loss": 0.016011208295822144, + "num_input_tokens_seen": 4683536, + "step": 286, + "train_runtime": 2325.6311, + "train_tokens_per_second": 2013.877 + }, + { + "epoch": 0.17393939393939395, + "grad_norm": 0.049628015607595444, + "learning_rate": 9.998646205897309e-05, + "loss": 0.015140787698328495, + "num_input_tokens_seen": 4699912, + "step": 287, + "train_runtime": 2333.7425, + "train_tokens_per_second": 2013.895 + }, + { + "epoch": 0.17454545454545456, + "grad_norm": 0.05506971478462219, + "learning_rate": 9.998623737700163e-05, + "loss": 0.014441089704632759, + "num_input_tokens_seen": 4716288, + "step": 288, + "train_runtime": 2341.8537, + "train_tokens_per_second": 2013.912 + }, + { + "epoch": 0.17515151515151514, + "grad_norm": 0.04357004538178444, + "learning_rate": 9.99860108461366e-05, + "loss": 0.014559566974639893, + "num_input_tokens_seen": 4732664, + "step": 289, + "train_runtime": 2349.9687, + "train_tokens_per_second": 2013.926 + }, + { + "epoch": 0.17575757575757575, + "grad_norm": 0.03436315059661865, + "learning_rate": 9.998578246638637e-05, + "loss": 0.014904836192727089, + "num_input_tokens_seen": 4749040, + "step": 290, + "train_runtime": 2358.082, + "train_tokens_per_second": 2013.942 + }, + { + "epoch": 0.17636363636363636, + "grad_norm": 0.030473578721284866, + "learning_rate": 9.99855522377594e-05, + "loss": 0.013786690309643745, + "num_input_tokens_seen": 4765416, + "step": 291, + "train_runtime": 2366.1924, + "train_tokens_per_second": 2013.96 + }, + { + "epoch": 0.17696969696969697, + "grad_norm": 0.033072736114263535, + "learning_rate": 9.998532016026418e-05, + "loss": 0.016431497409939766, + "num_input_tokens_seen": 4781792, + "step": 292, + "train_runtime": 2374.3035, + "train_tokens_per_second": 2013.977 + }, + { + "epoch": 0.17757575757575758, + "grad_norm": 0.03811201453208923, + "learning_rate": 9.998508623390932e-05, + "loss": 0.014959779568016529, + "num_input_tokens_seen": 4798168, + "step": 293, + "train_runtime": 2382.4135, + "train_tokens_per_second": 2013.995 + }, + { + "epoch": 0.1781818181818182, + "grad_norm": 0.04069237411022186, + "learning_rate": 9.998485045870344e-05, + "loss": 0.016118772327899933, + "num_input_tokens_seen": 4814544, + "step": 294, + "train_runtime": 2390.5227, + "train_tokens_per_second": 2014.013 + }, + { + "epoch": 0.1787878787878788, + "grad_norm": 0.031989723443984985, + "learning_rate": 9.99846128346553e-05, + "loss": 0.01669073849916458, + "num_input_tokens_seen": 4830920, + "step": 295, + "train_runtime": 2398.6348, + "train_tokens_per_second": 2014.029 + }, + { + "epoch": 0.17939393939393938, + "grad_norm": 0.03683701902627945, + "learning_rate": 9.998437336177369e-05, + "loss": 0.014967912808060646, + "num_input_tokens_seen": 4847296, + "step": 296, + "train_runtime": 2406.7421, + "train_tokens_per_second": 2014.049 + }, + { + "epoch": 0.18, + "grad_norm": 0.057917602360248566, + "learning_rate": 9.998413204006742e-05, + "loss": 0.018314681947231293, + "num_input_tokens_seen": 4863672, + "step": 297, + "train_runtime": 2414.8505, + "train_tokens_per_second": 2014.068 + }, + { + "epoch": 0.1806060606060606, + "grad_norm": 0.042889710515737534, + "learning_rate": 9.998388886954547e-05, + "loss": 0.014539923518896103, + "num_input_tokens_seen": 4880048, + "step": 298, + "train_runtime": 2422.9583, + "train_tokens_per_second": 2014.087 + }, + { + "epoch": 0.1812121212121212, + "grad_norm": 0.04697619378566742, + "learning_rate": 9.998364385021679e-05, + "loss": 0.01652900129556656, + "num_input_tokens_seen": 4896424, + "step": 299, + "train_runtime": 2431.0701, + "train_tokens_per_second": 2014.102 + }, + { + "epoch": 0.18181818181818182, + "grad_norm": 0.038388364017009735, + "learning_rate": 9.998339698209046e-05, + "loss": 0.013660457916557789, + "num_input_tokens_seen": 4912800, + "step": 300, + "train_runtime": 2439.1822, + "train_tokens_per_second": 2014.118 + }, + { + "epoch": 0.18242424242424243, + "grad_norm": 0.026958242058753967, + "learning_rate": 9.998314826517563e-05, + "loss": 0.015251623466610909, + "num_input_tokens_seen": 4929176, + "step": 301, + "train_runtime": 2448.2631, + "train_tokens_per_second": 2013.336 + }, + { + "epoch": 0.18303030303030304, + "grad_norm": 0.04779147729277611, + "learning_rate": 9.998289769948147e-05, + "loss": 0.012775855138897896, + "num_input_tokens_seen": 4945552, + "step": 302, + "train_runtime": 2456.368, + "train_tokens_per_second": 2013.36 + }, + { + "epoch": 0.18363636363636363, + "grad_norm": 0.03123384155333042, + "learning_rate": 9.998264528501727e-05, + "loss": 0.015583731234073639, + "num_input_tokens_seen": 4961928, + "step": 303, + "train_runtime": 2464.4763, + "train_tokens_per_second": 2013.38 + }, + { + "epoch": 0.18424242424242424, + "grad_norm": 0.05030890926718712, + "learning_rate": 9.998239102179236e-05, + "loss": 0.013868209905922413, + "num_input_tokens_seen": 4978304, + "step": 304, + "train_runtime": 2472.5834, + "train_tokens_per_second": 2013.402 + }, + { + "epoch": 0.18484848484848485, + "grad_norm": 0.033021751791238785, + "learning_rate": 9.998213490981614e-05, + "loss": 0.016501927748322487, + "num_input_tokens_seen": 4994680, + "step": 305, + "train_runtime": 2480.6921, + "train_tokens_per_second": 2013.422 + }, + { + "epoch": 0.18545454545454546, + "grad_norm": 0.050541143864393234, + "learning_rate": 9.998187694909807e-05, + "loss": 0.01771150343120098, + "num_input_tokens_seen": 5011056, + "step": 306, + "train_runtime": 2488.7992, + "train_tokens_per_second": 2013.443 + }, + { + "epoch": 0.18606060606060607, + "grad_norm": 0.04063250124454498, + "learning_rate": 9.998161713964774e-05, + "loss": 0.015554912388324738, + "num_input_tokens_seen": 5027432, + "step": 307, + "train_runtime": 2496.9044, + "train_tokens_per_second": 2013.466 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 0.02722395956516266, + "learning_rate": 9.998135548147469e-05, + "loss": 0.013613277114927769, + "num_input_tokens_seen": 5043808, + "step": 308, + "train_runtime": 2505.0089, + "train_tokens_per_second": 2013.489 + }, + { + "epoch": 0.18727272727272729, + "grad_norm": 0.02678558975458145, + "learning_rate": 9.998109197458865e-05, + "loss": 0.014953495003283024, + "num_input_tokens_seen": 5060184, + "step": 309, + "train_runtime": 2513.1157, + "train_tokens_per_second": 2013.51 + }, + { + "epoch": 0.18787878787878787, + "grad_norm": 0.02857518568634987, + "learning_rate": 9.998082661899935e-05, + "loss": 0.013844496570527554, + "num_input_tokens_seen": 5076560, + "step": 310, + "train_runtime": 2521.2304, + "train_tokens_per_second": 2013.525 + }, + { + "epoch": 0.18848484848484848, + "grad_norm": 0.2615605294704437, + "learning_rate": 9.998055941471662e-05, + "loss": 0.01809251680970192, + "num_input_tokens_seen": 5092936, + "step": 311, + "train_runtime": 2529.3408, + "train_tokens_per_second": 2013.543 + }, + { + "epoch": 0.1890909090909091, + "grad_norm": 0.029859403148293495, + "learning_rate": 9.998029036175031e-05, + "loss": 0.015970397740602493, + "num_input_tokens_seen": 5109312, + "step": 312, + "train_runtime": 2537.4488, + "train_tokens_per_second": 2013.563 + }, + { + "epoch": 0.1896969696969697, + "grad_norm": 0.03636668995022774, + "learning_rate": 9.99800194601104e-05, + "loss": 0.01580364629626274, + "num_input_tokens_seen": 5125688, + "step": 313, + "train_runtime": 2545.553, + "train_tokens_per_second": 2013.585 + }, + { + "epoch": 0.1903030303030303, + "grad_norm": 0.0684208944439888, + "learning_rate": 9.997974670980691e-05, + "loss": 0.017103755846619606, + "num_input_tokens_seen": 5142064, + "step": 314, + "train_runtime": 2553.6615, + "train_tokens_per_second": 2013.604 + }, + { + "epoch": 0.19090909090909092, + "grad_norm": 0.028665577992796898, + "learning_rate": 9.997947211084991e-05, + "loss": 0.014511539600789547, + "num_input_tokens_seen": 5158440, + "step": 315, + "train_runtime": 2561.7735, + "train_tokens_per_second": 2013.621 + }, + { + "epoch": 0.19151515151515153, + "grad_norm": 0.09884219616651535, + "learning_rate": 9.997919566324959e-05, + "loss": 0.014168107882142067, + "num_input_tokens_seen": 5174816, + "step": 316, + "train_runtime": 2569.8855, + "train_tokens_per_second": 2013.637 + }, + { + "epoch": 0.1921212121212121, + "grad_norm": 0.1779116839170456, + "learning_rate": 9.997891736701613e-05, + "loss": 0.014995518140494823, + "num_input_tokens_seen": 5191192, + "step": 317, + "train_runtime": 2577.9971, + "train_tokens_per_second": 2013.653 + }, + { + "epoch": 0.19272727272727272, + "grad_norm": 0.030352341011166573, + "learning_rate": 9.997863722215983e-05, + "loss": 0.014715241268277168, + "num_input_tokens_seen": 5207568, + "step": 318, + "train_runtime": 2586.1052, + "train_tokens_per_second": 2013.672 + }, + { + "epoch": 0.19333333333333333, + "grad_norm": 0.03511129692196846, + "learning_rate": 9.99783552286911e-05, + "loss": 0.01499946229159832, + "num_input_tokens_seen": 5223944, + "step": 319, + "train_runtime": 2594.2129, + "train_tokens_per_second": 2013.691 + }, + { + "epoch": 0.19393939393939394, + "grad_norm": 0.04475672170519829, + "learning_rate": 9.997807138662033e-05, + "loss": 0.014523375779390335, + "num_input_tokens_seen": 5240320, + "step": 320, + "train_runtime": 2602.3206, + "train_tokens_per_second": 2013.71 + }, + { + "epoch": 0.19454545454545455, + "grad_norm": 0.02900783158838749, + "learning_rate": 9.997778569595801e-05, + "loss": 0.015447665005922318, + "num_input_tokens_seen": 5256696, + "step": 321, + "train_runtime": 2610.4318, + "train_tokens_per_second": 2013.727 + }, + { + "epoch": 0.19515151515151516, + "grad_norm": 0.022910727187991142, + "learning_rate": 9.997749815671473e-05, + "loss": 0.013799930922687054, + "num_input_tokens_seen": 5273072, + "step": 322, + "train_runtime": 2618.541, + "train_tokens_per_second": 2013.744 + }, + { + "epoch": 0.19575757575757577, + "grad_norm": 0.03925245255231857, + "learning_rate": 9.997720876890113e-05, + "loss": 0.013741591945290565, + "num_input_tokens_seen": 5289448, + "step": 323, + "train_runtime": 2626.6511, + "train_tokens_per_second": 2013.761 + }, + { + "epoch": 0.19636363636363635, + "grad_norm": 0.029477456584572792, + "learning_rate": 9.997691753252791e-05, + "loss": 0.013831754215061665, + "num_input_tokens_seen": 5305824, + "step": 324, + "train_runtime": 2634.7586, + "train_tokens_per_second": 2013.78 + }, + { + "epoch": 0.19696969696969696, + "grad_norm": 0.0368235781788826, + "learning_rate": 9.997662444760583e-05, + "loss": 0.014774560928344727, + "num_input_tokens_seen": 5322200, + "step": 325, + "train_runtime": 2642.8689, + "train_tokens_per_second": 2013.796 + }, + { + "epoch": 0.19757575757575757, + "grad_norm": 0.04399452731013298, + "learning_rate": 9.997632951414573e-05, + "loss": 0.014160547405481339, + "num_input_tokens_seen": 5338576, + "step": 326, + "train_runtime": 2650.978, + "train_tokens_per_second": 2013.814 + }, + { + "epoch": 0.19818181818181818, + "grad_norm": 0.02241128869354725, + "learning_rate": 9.997603273215853e-05, + "loss": 0.013626255095005035, + "num_input_tokens_seen": 5354952, + "step": 327, + "train_runtime": 2659.0857, + "train_tokens_per_second": 2013.832 + }, + { + "epoch": 0.1987878787878788, + "grad_norm": 0.022924182936549187, + "learning_rate": 9.99757341016552e-05, + "loss": 0.013918038457632065, + "num_input_tokens_seen": 5371328, + "step": 328, + "train_runtime": 2667.1942, + "train_tokens_per_second": 2013.85 + }, + { + "epoch": 0.1993939393939394, + "grad_norm": 0.0384218692779541, + "learning_rate": 9.99754336226468e-05, + "loss": 0.01543221715837717, + "num_input_tokens_seen": 5387704, + "step": 329, + "train_runtime": 2675.3051, + "train_tokens_per_second": 2013.865 + }, + { + "epoch": 0.2, + "grad_norm": 0.024983001872897148, + "learning_rate": 9.997513129514442e-05, + "loss": 0.014143919572234154, + "num_input_tokens_seen": 5404080, + "step": 330, + "train_runtime": 2683.4136, + "train_tokens_per_second": 2013.883 + }, + { + "epoch": 0.2006060606060606, + "grad_norm": 0.036509182304143906, + "learning_rate": 9.997482711915927e-05, + "loss": 0.017176145687699318, + "num_input_tokens_seen": 5420456, + "step": 331, + "train_runtime": 2691.5304, + "train_tokens_per_second": 2013.894 + }, + { + "epoch": 0.2012121212121212, + "grad_norm": 0.02530326321721077, + "learning_rate": 9.997452109470257e-05, + "loss": 0.01395807322114706, + "num_input_tokens_seen": 5436832, + "step": 332, + "train_runtime": 2699.6383, + "train_tokens_per_second": 2013.911 + }, + { + "epoch": 0.2018181818181818, + "grad_norm": 0.026743337512016296, + "learning_rate": 9.997421322178566e-05, + "loss": 0.015008356422185898, + "num_input_tokens_seen": 5453208, + "step": 333, + "train_runtime": 2707.7479, + "train_tokens_per_second": 2013.928 + }, + { + "epoch": 0.20242424242424242, + "grad_norm": 0.03141747787594795, + "learning_rate": 9.997390350041993e-05, + "loss": 0.014487622305750847, + "num_input_tokens_seen": 5469584, + "step": 334, + "train_runtime": 2715.8554, + "train_tokens_per_second": 2013.945 + }, + { + "epoch": 0.20303030303030303, + "grad_norm": 0.03556372597813606, + "learning_rate": 9.997359193061681e-05, + "loss": 0.014322612434625626, + "num_input_tokens_seen": 5485960, + "step": 335, + "train_runtime": 2723.964, + "train_tokens_per_second": 2013.962 + }, + { + "epoch": 0.20363636363636364, + "grad_norm": 0.05319400504231453, + "learning_rate": 9.997327851238788e-05, + "loss": 0.015110835433006287, + "num_input_tokens_seen": 5502336, + "step": 336, + "train_runtime": 2732.0746, + "train_tokens_per_second": 2013.977 + }, + { + "epoch": 0.20424242424242425, + "grad_norm": 0.05987285077571869, + "learning_rate": 9.997296324574467e-05, + "loss": 0.015784846618771553, + "num_input_tokens_seen": 5518712, + "step": 337, + "train_runtime": 2740.1837, + "train_tokens_per_second": 2013.993 + }, + { + "epoch": 0.20484848484848484, + "grad_norm": 0.05444290488958359, + "learning_rate": 9.997264613069887e-05, + "loss": 0.016434665769338608, + "num_input_tokens_seen": 5535088, + "step": 338, + "train_runtime": 2748.2918, + "train_tokens_per_second": 2014.01 + }, + { + "epoch": 0.20545454545454545, + "grad_norm": 0.03842825070023537, + "learning_rate": 9.997232716726222e-05, + "loss": 0.01436456385999918, + "num_input_tokens_seen": 5551464, + "step": 339, + "train_runtime": 2756.4036, + "train_tokens_per_second": 2014.024 + }, + { + "epoch": 0.20606060606060606, + "grad_norm": 0.0297915730625391, + "learning_rate": 9.997200635544648e-05, + "loss": 0.014456460252404213, + "num_input_tokens_seen": 5567840, + "step": 340, + "train_runtime": 2764.5114, + "train_tokens_per_second": 2014.041 + }, + { + "epoch": 0.20666666666666667, + "grad_norm": 0.030197616666555405, + "learning_rate": 9.997168369526355e-05, + "loss": 0.013316805474460125, + "num_input_tokens_seen": 5584216, + "step": 341, + "train_runtime": 2772.6201, + "train_tokens_per_second": 2014.057 + }, + { + "epoch": 0.20727272727272728, + "grad_norm": 0.04718567803502083, + "learning_rate": 9.997135918672536e-05, + "loss": 0.014915217645466328, + "num_input_tokens_seen": 5600592, + "step": 342, + "train_runtime": 2780.7298, + "train_tokens_per_second": 2014.073 + }, + { + "epoch": 0.20787878787878789, + "grad_norm": 0.04453250393271446, + "learning_rate": 9.997103282984391e-05, + "loss": 0.013720309361815453, + "num_input_tokens_seen": 5616968, + "step": 343, + "train_runtime": 2788.839, + "train_tokens_per_second": 2014.088 + }, + { + "epoch": 0.2084848484848485, + "grad_norm": 0.028496714308857918, + "learning_rate": 9.997070462463127e-05, + "loss": 0.015428826212882996, + "num_input_tokens_seen": 5633344, + "step": 344, + "train_runtime": 2796.9478, + "train_tokens_per_second": 2014.104 + }, + { + "epoch": 0.20909090909090908, + "grad_norm": 0.025575809180736542, + "learning_rate": 9.99703745710996e-05, + "loss": 0.014485862106084824, + "num_input_tokens_seen": 5649720, + "step": 345, + "train_runtime": 2805.0546, + "train_tokens_per_second": 2014.121 + }, + { + "epoch": 0.2096969696969697, + "grad_norm": 0.03871789202094078, + "learning_rate": 9.997004266926105e-05, + "loss": 0.013593616895377636, + "num_input_tokens_seen": 5666096, + "step": 346, + "train_runtime": 2813.1609, + "train_tokens_per_second": 2014.139 + }, + { + "epoch": 0.2103030303030303, + "grad_norm": 0.07384062558412552, + "learning_rate": 9.996970891912794e-05, + "loss": 0.015072252601385117, + "num_input_tokens_seen": 5682472, + "step": 347, + "train_runtime": 2821.2688, + "train_tokens_per_second": 2014.155 + }, + { + "epoch": 0.2109090909090909, + "grad_norm": 0.041799403727054596, + "learning_rate": 9.996937332071263e-05, + "loss": 0.014217150397598743, + "num_input_tokens_seen": 5698848, + "step": 348, + "train_runtime": 2829.3783, + "train_tokens_per_second": 2014.17 + }, + { + "epoch": 0.21151515151515152, + "grad_norm": 0.04895857349038124, + "learning_rate": 9.99690358740275e-05, + "loss": 0.017368610948324203, + "num_input_tokens_seen": 5715224, + "step": 349, + "train_runtime": 2837.4869, + "train_tokens_per_second": 2014.185 + }, + { + "epoch": 0.21212121212121213, + "grad_norm": 0.03166350722312927, + "learning_rate": 9.996869657908504e-05, + "loss": 0.014376340433955193, + "num_input_tokens_seen": 5731600, + "step": 350, + "train_runtime": 2845.6047, + "train_tokens_per_second": 2014.194 + }, + { + "epoch": 0.21272727272727274, + "grad_norm": 0.06105640158057213, + "learning_rate": 9.996835543589781e-05, + "loss": 0.01661105453968048, + "num_input_tokens_seen": 5747976, + "step": 351, + "train_runtime": 2853.7303, + "train_tokens_per_second": 2014.197 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 0.038000039756298065, + "learning_rate": 9.996801244447842e-05, + "loss": 0.013641721569001675, + "num_input_tokens_seen": 5764352, + "step": 352, + "train_runtime": 2861.847, + "train_tokens_per_second": 2014.207 + }, + { + "epoch": 0.21393939393939393, + "grad_norm": 0.033811476081609726, + "learning_rate": 9.996766760483956e-05, + "loss": 0.01525929756462574, + "num_input_tokens_seen": 5780728, + "step": 353, + "train_runtime": 2869.9635, + "train_tokens_per_second": 2014.217 + }, + { + "epoch": 0.21454545454545454, + "grad_norm": 0.01919690892100334, + "learning_rate": 9.996732091699396e-05, + "loss": 0.013008120469748974, + "num_input_tokens_seen": 5797104, + "step": 354, + "train_runtime": 2878.0782, + "train_tokens_per_second": 2014.227 + }, + { + "epoch": 0.21515151515151515, + "grad_norm": 0.03718187287449837, + "learning_rate": 9.99669723809545e-05, + "loss": 0.015754155814647675, + "num_input_tokens_seen": 5813480, + "step": 355, + "train_runtime": 2886.1934, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.21575757575757576, + "grad_norm": 0.03534379228949547, + "learning_rate": 9.996662199673401e-05, + "loss": 0.014936118386685848, + "num_input_tokens_seen": 5829856, + "step": 356, + "train_runtime": 2894.3081, + "train_tokens_per_second": 2014.249 + }, + { + "epoch": 0.21636363636363637, + "grad_norm": 0.024305060505867004, + "learning_rate": 9.99662697643455e-05, + "loss": 0.01359601877629757, + "num_input_tokens_seen": 5846232, + "step": 357, + "train_runtime": 2902.4301, + "train_tokens_per_second": 2014.254 + }, + { + "epoch": 0.21696969696969698, + "grad_norm": 0.027639245614409447, + "learning_rate": 9.996591568380196e-05, + "loss": 0.014319726265966892, + "num_input_tokens_seen": 5862608, + "step": 358, + "train_runtime": 2910.5461, + "train_tokens_per_second": 2014.264 + }, + { + "epoch": 0.2175757575757576, + "grad_norm": 0.06455444544553757, + "learning_rate": 9.996555975511652e-05, + "loss": 0.013829253613948822, + "num_input_tokens_seen": 5878984, + "step": 359, + "train_runtime": 2918.6646, + "train_tokens_per_second": 2014.272 + }, + { + "epoch": 0.21818181818181817, + "grad_norm": 0.02637428045272827, + "learning_rate": 9.996520197830231e-05, + "loss": 0.01420363038778305, + "num_input_tokens_seen": 5895360, + "step": 360, + "train_runtime": 2926.7812, + "train_tokens_per_second": 2014.281 + }, + { + "epoch": 0.21878787878787878, + "grad_norm": 0.06233112886548042, + "learning_rate": 9.99648423533726e-05, + "loss": 0.017570551484823227, + "num_input_tokens_seen": 5911736, + "step": 361, + "train_runtime": 2934.9002, + "train_tokens_per_second": 2014.289 + }, + { + "epoch": 0.2193939393939394, + "grad_norm": 0.04012456163764, + "learning_rate": 9.996448088034065e-05, + "loss": 0.015336515381932259, + "num_input_tokens_seen": 5928112, + "step": 362, + "train_runtime": 2943.0179, + "train_tokens_per_second": 2014.297 + }, + { + "epoch": 0.22, + "grad_norm": 0.029959173873066902, + "learning_rate": 9.996411755921987e-05, + "loss": 0.013176209293305874, + "num_input_tokens_seen": 5944488, + "step": 363, + "train_runtime": 2951.1353, + "train_tokens_per_second": 2014.305 + }, + { + "epoch": 0.2206060606060606, + "grad_norm": 0.045539602637290955, + "learning_rate": 9.996375239002369e-05, + "loss": 0.017476335167884827, + "num_input_tokens_seen": 5960864, + "step": 364, + "train_runtime": 2959.2526, + "train_tokens_per_second": 2014.314 + }, + { + "epoch": 0.22121212121212122, + "grad_norm": 0.04066498950123787, + "learning_rate": 9.996338537276559e-05, + "loss": 0.015315013006329536, + "num_input_tokens_seen": 5977240, + "step": 365, + "train_runtime": 2967.3711, + "train_tokens_per_second": 2014.322 + }, + { + "epoch": 0.22181818181818183, + "grad_norm": 0.055071763694286346, + "learning_rate": 9.996301650745917e-05, + "loss": 0.013316687196493149, + "num_input_tokens_seen": 5993616, + "step": 366, + "train_runtime": 2975.4906, + "train_tokens_per_second": 2014.329 + }, + { + "epoch": 0.2224242424242424, + "grad_norm": 0.020134275779128075, + "learning_rate": 9.996264579411807e-05, + "loss": 0.012931122444570065, + "num_input_tokens_seen": 6009992, + "step": 367, + "train_runtime": 2983.6081, + "train_tokens_per_second": 2014.337 + }, + { + "epoch": 0.22303030303030302, + "grad_norm": 0.0290455874055624, + "learning_rate": 9.9962273232756e-05, + "loss": 0.013352105394005775, + "num_input_tokens_seen": 6026368, + "step": 368, + "train_runtime": 2991.7313, + "train_tokens_per_second": 2014.341 + }, + { + "epoch": 0.22363636363636363, + "grad_norm": 0.03161335363984108, + "learning_rate": 9.996189882338675e-05, + "loss": 0.012487310916185379, + "num_input_tokens_seen": 6042744, + "step": 369, + "train_runtime": 2999.8498, + "train_tokens_per_second": 2014.349 + }, + { + "epoch": 0.22424242424242424, + "grad_norm": 0.05878787115216255, + "learning_rate": 9.996152256602414e-05, + "loss": 0.014912744984030724, + "num_input_tokens_seen": 6059120, + "step": 370, + "train_runtime": 3007.9654, + "train_tokens_per_second": 2014.358 + }, + { + "epoch": 0.22484848484848485, + "grad_norm": 0.029024092480540276, + "learning_rate": 9.996114446068212e-05, + "loss": 0.012249596416950226, + "num_input_tokens_seen": 6075496, + "step": 371, + "train_runtime": 3016.083, + "train_tokens_per_second": 2014.366 + }, + { + "epoch": 0.22545454545454546, + "grad_norm": 0.023940905928611755, + "learning_rate": 9.996076450737465e-05, + "loss": 0.014684991911053658, + "num_input_tokens_seen": 6091872, + "step": 372, + "train_runtime": 3024.1999, + "train_tokens_per_second": 2014.375 + }, + { + "epoch": 0.22606060606060607, + "grad_norm": 0.07777219265699387, + "learning_rate": 9.99603827061158e-05, + "loss": 0.01571383886039257, + "num_input_tokens_seen": 6108248, + "step": 373, + "train_runtime": 3032.3166, + "train_tokens_per_second": 2014.383 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 0.030761806294322014, + "learning_rate": 9.99599990569197e-05, + "loss": 0.013848803006112576, + "num_input_tokens_seen": 6124624, + "step": 374, + "train_runtime": 3040.4333, + "train_tokens_per_second": 2014.392 + }, + { + "epoch": 0.22727272727272727, + "grad_norm": 0.0438305102288723, + "learning_rate": 9.995961355980051e-05, + "loss": 0.014024798758327961, + "num_input_tokens_seen": 6141000, + "step": 375, + "train_runtime": 3048.549, + "train_tokens_per_second": 2014.401 + }, + { + "epoch": 0.22787878787878788, + "grad_norm": 0.04035346210002899, + "learning_rate": 9.995922621477252e-05, + "loss": 0.014576055109500885, + "num_input_tokens_seen": 6157376, + "step": 376, + "train_runtime": 3056.6655, + "train_tokens_per_second": 2014.41 + }, + { + "epoch": 0.22848484848484849, + "grad_norm": 0.09497886896133423, + "learning_rate": 9.995883702185003e-05, + "loss": 0.014249450527131557, + "num_input_tokens_seen": 6173752, + "step": 377, + "train_runtime": 3064.7824, + "train_tokens_per_second": 2014.418 + }, + { + "epoch": 0.2290909090909091, + "grad_norm": 0.03223222866654396, + "learning_rate": 9.995844598104746e-05, + "loss": 0.013723311945796013, + "num_input_tokens_seen": 6190128, + "step": 378, + "train_runtime": 3072.8984, + "train_tokens_per_second": 2014.426 + }, + { + "epoch": 0.2296969696969697, + "grad_norm": 0.023603513836860657, + "learning_rate": 9.995805309237926e-05, + "loss": 0.015003862790763378, + "num_input_tokens_seen": 6206504, + "step": 379, + "train_runtime": 3081.015, + "train_tokens_per_second": 2014.435 + }, + { + "epoch": 0.23030303030303031, + "grad_norm": 0.07697781920433044, + "learning_rate": 9.995765835585995e-05, + "loss": 0.01642550155520439, + "num_input_tokens_seen": 6222880, + "step": 380, + "train_runtime": 3089.1312, + "train_tokens_per_second": 2014.443 + }, + { + "epoch": 0.2309090909090909, + "grad_norm": 0.06212541460990906, + "learning_rate": 9.995726177150418e-05, + "loss": 0.013186133466660976, + "num_input_tokens_seen": 6239256, + "step": 381, + "train_runtime": 3097.2484, + "train_tokens_per_second": 2014.451 + }, + { + "epoch": 0.2315151515151515, + "grad_norm": 0.04135077819228172, + "learning_rate": 9.995686333932655e-05, + "loss": 0.015075747855007648, + "num_input_tokens_seen": 6255632, + "step": 382, + "train_runtime": 3105.3662, + "train_tokens_per_second": 2014.459 + }, + { + "epoch": 0.23212121212121212, + "grad_norm": 0.03373231366276741, + "learning_rate": 9.995646305934184e-05, + "loss": 0.015022508800029755, + "num_input_tokens_seen": 6272008, + "step": 383, + "train_runtime": 3113.4845, + "train_tokens_per_second": 2014.466 + }, + { + "epoch": 0.23272727272727273, + "grad_norm": 0.052756134420633316, + "learning_rate": 9.995606093156485e-05, + "loss": 0.016195476055145264, + "num_input_tokens_seen": 6288384, + "step": 384, + "train_runtime": 3121.6016, + "train_tokens_per_second": 2014.474 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 0.04732633754611015, + "learning_rate": 9.995565695601045e-05, + "loss": 0.015717167407274246, + "num_input_tokens_seen": 6304760, + "step": 385, + "train_runtime": 3129.7191, + "train_tokens_per_second": 2014.481 + }, + { + "epoch": 0.23393939393939395, + "grad_norm": 0.050964321941137314, + "learning_rate": 9.99552511326936e-05, + "loss": 0.013431689701974392, + "num_input_tokens_seen": 6321136, + "step": 386, + "train_runtime": 3137.836, + "train_tokens_per_second": 2014.489 + }, + { + "epoch": 0.23454545454545456, + "grad_norm": 0.029031990095973015, + "learning_rate": 9.995484346162926e-05, + "loss": 0.013563702814280987, + "num_input_tokens_seen": 6337512, + "step": 387, + "train_runtime": 3145.953, + "train_tokens_per_second": 2014.497 + }, + { + "epoch": 0.23515151515151514, + "grad_norm": 0.03224366530776024, + "learning_rate": 9.995443394283257e-05, + "loss": 0.01605670340359211, + "num_input_tokens_seen": 6353888, + "step": 388, + "train_runtime": 3154.0724, + "train_tokens_per_second": 2014.503 + }, + { + "epoch": 0.23575757575757575, + "grad_norm": 0.03045693039894104, + "learning_rate": 9.995402257631865e-05, + "loss": 0.015148544684052467, + "num_input_tokens_seen": 6370264, + "step": 389, + "train_runtime": 3162.1889, + "train_tokens_per_second": 2014.511 + }, + { + "epoch": 0.23636363636363636, + "grad_norm": 0.027332261204719543, + "learning_rate": 9.995360936210271e-05, + "loss": 0.014781562611460686, + "num_input_tokens_seen": 6386640, + "step": 390, + "train_runtime": 3170.3051, + "train_tokens_per_second": 2014.519 + }, + { + "epoch": 0.23696969696969697, + "grad_norm": 0.023009251803159714, + "learning_rate": 9.995319430020003e-05, + "loss": 0.013627824373543262, + "num_input_tokens_seen": 6403016, + "step": 391, + "train_runtime": 3178.43, + "train_tokens_per_second": 2014.522 + }, + { + "epoch": 0.23757575757575758, + "grad_norm": 0.035416360944509506, + "learning_rate": 9.995277739062599e-05, + "loss": 0.01493286807090044, + "num_input_tokens_seen": 6419392, + "step": 392, + "train_runtime": 3186.5451, + "train_tokens_per_second": 2014.53 + }, + { + "epoch": 0.2381818181818182, + "grad_norm": 0.04003625363111496, + "learning_rate": 9.995235863339598e-05, + "loss": 0.016020091250538826, + "num_input_tokens_seen": 6435768, + "step": 393, + "train_runtime": 3194.6612, + "train_tokens_per_second": 2014.539 + }, + { + "epoch": 0.2387878787878788, + "grad_norm": 0.024710826575756073, + "learning_rate": 9.995193802852552e-05, + "loss": 0.015763292089104652, + "num_input_tokens_seen": 6452144, + "step": 394, + "train_runtime": 3202.7765, + "train_tokens_per_second": 2014.547 + }, + { + "epoch": 0.23939393939393938, + "grad_norm": 0.05250145494937897, + "learning_rate": 9.995151557603013e-05, + "loss": 0.017301952466368675, + "num_input_tokens_seen": 6468520, + "step": 395, + "train_runtime": 3210.893, + "train_tokens_per_second": 2014.555 + }, + { + "epoch": 0.24, + "grad_norm": 0.037685710936784744, + "learning_rate": 9.995109127592546e-05, + "loss": 0.014692970551550388, + "num_input_tokens_seen": 6484896, + "step": 396, + "train_runtime": 3219.0101, + "train_tokens_per_second": 2014.562 + }, + { + "epoch": 0.2406060606060606, + "grad_norm": 0.03617233410477638, + "learning_rate": 9.99506651282272e-05, + "loss": 0.015763459727168083, + "num_input_tokens_seen": 6501272, + "step": 397, + "train_runtime": 3227.1302, + "train_tokens_per_second": 2014.568 + }, + { + "epoch": 0.2412121212121212, + "grad_norm": 0.026065215468406677, + "learning_rate": 9.995023713295111e-05, + "loss": 0.013620332814753056, + "num_input_tokens_seen": 6517648, + "step": 398, + "train_runtime": 3235.2472, + "train_tokens_per_second": 2014.575 + }, + { + "epoch": 0.24181818181818182, + "grad_norm": 0.045087747275829315, + "learning_rate": 9.994980729011303e-05, + "loss": 0.015572777949273586, + "num_input_tokens_seen": 6534024, + "step": 399, + "train_runtime": 3243.3644, + "train_tokens_per_second": 2014.582 + }, + { + "epoch": 0.24242424242424243, + "grad_norm": 0.02911469154059887, + "learning_rate": 9.994937559972884e-05, + "loss": 0.014463523402810097, + "num_input_tokens_seen": 6550400, + "step": 400, + "train_runtime": 3251.4815, + "train_tokens_per_second": 2014.589 + }, + { + "epoch": 0.24303030303030304, + "grad_norm": 0.09026223421096802, + "learning_rate": 9.994894206181452e-05, + "loss": 0.015273511409759521, + "num_input_tokens_seen": 6566776, + "step": 401, + "train_runtime": 3260.5529, + "train_tokens_per_second": 2014.007 + }, + { + "epoch": 0.24363636363636362, + "grad_norm": 0.059329140931367874, + "learning_rate": 9.994850667638611e-05, + "loss": 0.017180006951093674, + "num_input_tokens_seen": 6583152, + "step": 402, + "train_runtime": 3268.6733, + "train_tokens_per_second": 2014.013 + }, + { + "epoch": 0.24424242424242423, + "grad_norm": 0.05259858816862106, + "learning_rate": 9.99480694434597e-05, + "loss": 0.01665383018553257, + "num_input_tokens_seen": 6599528, + "step": 403, + "train_runtime": 3276.7926, + "train_tokens_per_second": 2014.021 + }, + { + "epoch": 0.24484848484848484, + "grad_norm": 0.046337101608514786, + "learning_rate": 9.994763036305148e-05, + "loss": 0.01817156933248043, + "num_input_tokens_seen": 6615904, + "step": 404, + "train_runtime": 3284.9091, + "train_tokens_per_second": 2014.03 + }, + { + "epoch": 0.24545454545454545, + "grad_norm": 0.023166598752141, + "learning_rate": 9.994718943517768e-05, + "loss": 0.012105523608624935, + "num_input_tokens_seen": 6632280, + "step": 405, + "train_runtime": 3293.0293, + "train_tokens_per_second": 2014.036 + }, + { + "epoch": 0.24606060606060606, + "grad_norm": 0.044385019689798355, + "learning_rate": 9.994674665985461e-05, + "loss": 0.01413038745522499, + "num_input_tokens_seen": 6648656, + "step": 406, + "train_runtime": 3301.1473, + "train_tokens_per_second": 2014.044 + }, + { + "epoch": 0.24666666666666667, + "grad_norm": 0.038354646414518356, + "learning_rate": 9.994630203709865e-05, + "loss": 0.015764841809868813, + "num_input_tokens_seen": 6665032, + "step": 407, + "train_runtime": 3309.2652, + "train_tokens_per_second": 2014.052 + }, + { + "epoch": 0.24727272727272728, + "grad_norm": 0.026519082486629486, + "learning_rate": 9.994585556692624e-05, + "loss": 0.015617020428180695, + "num_input_tokens_seen": 6681408, + "step": 408, + "train_runtime": 3317.3836, + "train_tokens_per_second": 2014.06 + }, + { + "epoch": 0.24787878787878787, + "grad_norm": 0.07033390551805496, + "learning_rate": 9.994540724935389e-05, + "loss": 0.01747780106961727, + "num_input_tokens_seen": 6697784, + "step": 409, + "train_runtime": 3325.5001, + "train_tokens_per_second": 2014.068 + }, + { + "epoch": 0.24848484848484848, + "grad_norm": 0.02514197863638401, + "learning_rate": 9.994495708439819e-05, + "loss": 0.01398993656039238, + "num_input_tokens_seen": 6714160, + "step": 410, + "train_runtime": 3333.618, + "train_tokens_per_second": 2014.076 + }, + { + "epoch": 0.24909090909090909, + "grad_norm": 0.023313792422413826, + "learning_rate": 9.99445050720758e-05, + "loss": 0.013531757518649101, + "num_input_tokens_seen": 6730536, + "step": 411, + "train_runtime": 3341.7359, + "train_tokens_per_second": 2014.084 + }, + { + "epoch": 0.2496969696969697, + "grad_norm": 0.04927172139286995, + "learning_rate": 9.994405121240344e-05, + "loss": 0.014407115057110786, + "num_input_tokens_seen": 6746912, + "step": 412, + "train_runtime": 3349.8514, + "train_tokens_per_second": 2014.093 + }, + { + "epoch": 0.2503030303030303, + "grad_norm": 0.03376639634370804, + "learning_rate": 9.994359550539787e-05, + "loss": 0.015590015798807144, + "num_input_tokens_seen": 6763288, + "step": 413, + "train_runtime": 3357.9682, + "train_tokens_per_second": 2014.101 + }, + { + "epoch": 0.2509090909090909, + "grad_norm": 0.026951145380735397, + "learning_rate": 9.994313795107597e-05, + "loss": 0.013218428939580917, + "num_input_tokens_seen": 6779664, + "step": 414, + "train_runtime": 3366.0858, + "train_tokens_per_second": 2014.109 + }, + { + "epoch": 0.2515151515151515, + "grad_norm": 0.028939809650182724, + "learning_rate": 9.994267854945465e-05, + "loss": 0.013945825397968292, + "num_input_tokens_seen": 6796040, + "step": 415, + "train_runtime": 3374.204, + "train_tokens_per_second": 2014.116 + }, + { + "epoch": 0.25212121212121213, + "grad_norm": 0.048603300005197525, + "learning_rate": 9.994221730055091e-05, + "loss": 0.014013823121786118, + "num_input_tokens_seen": 6812416, + "step": 416, + "train_runtime": 3382.3201, + "train_tokens_per_second": 2014.125 + }, + { + "epoch": 0.25272727272727274, + "grad_norm": 0.03397737815976143, + "learning_rate": 9.994175420438182e-05, + "loss": 0.016459740698337555, + "num_input_tokens_seen": 6828792, + "step": 417, + "train_runtime": 3390.4376, + "train_tokens_per_second": 2014.133 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 0.09882687032222748, + "learning_rate": 9.99412892609645e-05, + "loss": 0.019090697169303894, + "num_input_tokens_seen": 6845168, + "step": 418, + "train_runtime": 3398.5545, + "train_tokens_per_second": 2014.141 + }, + { + "epoch": 0.25393939393939396, + "grad_norm": 0.02406393364071846, + "learning_rate": 9.994082247031613e-05, + "loss": 0.01460934616625309, + "num_input_tokens_seen": 6861544, + "step": 419, + "train_runtime": 3406.6729, + "train_tokens_per_second": 2014.148 + }, + { + "epoch": 0.2545454545454545, + "grad_norm": 0.05103567615151405, + "learning_rate": 9.994035383245401e-05, + "loss": 0.014737242832779884, + "num_input_tokens_seen": 6877920, + "step": 420, + "train_runtime": 3414.7913, + "train_tokens_per_second": 2014.155 + }, + { + "epoch": 0.25515151515151513, + "grad_norm": 0.040553025901317596, + "learning_rate": 9.993988334739544e-05, + "loss": 0.015402523800730705, + "num_input_tokens_seen": 6894296, + "step": 421, + "train_runtime": 3422.91, + "train_tokens_per_second": 2014.162 + }, + { + "epoch": 0.25575757575757574, + "grad_norm": 0.038083747029304504, + "learning_rate": 9.993941101515786e-05, + "loss": 0.014769435860216618, + "num_input_tokens_seen": 6910672, + "step": 422, + "train_runtime": 3431.0293, + "train_tokens_per_second": 2014.169 + }, + { + "epoch": 0.25636363636363635, + "grad_norm": 0.018217189237475395, + "learning_rate": 9.99389368357587e-05, + "loss": 0.01357343327254057, + "num_input_tokens_seen": 6927048, + "step": 423, + "train_runtime": 3439.1473, + "train_tokens_per_second": 2014.176 + }, + { + "epoch": 0.25696969696969696, + "grad_norm": 0.04052957519888878, + "learning_rate": 9.993846080921552e-05, + "loss": 0.01406765729188919, + "num_input_tokens_seen": 6943424, + "step": 424, + "train_runtime": 3447.2634, + "train_tokens_per_second": 2014.184 + }, + { + "epoch": 0.25757575757575757, + "grad_norm": 0.02357480488717556, + "learning_rate": 9.993798293554593e-05, + "loss": 0.013200477696955204, + "num_input_tokens_seen": 6959800, + "step": 425, + "train_runtime": 3455.3793, + "train_tokens_per_second": 2014.193 + }, + { + "epoch": 0.2581818181818182, + "grad_norm": 0.02221427671611309, + "learning_rate": 9.99375032147676e-05, + "loss": 0.014044541865587234, + "num_input_tokens_seen": 6976176, + "step": 426, + "train_runtime": 3463.4954, + "train_tokens_per_second": 2014.201 + }, + { + "epoch": 0.2587878787878788, + "grad_norm": 0.03215425834059715, + "learning_rate": 9.993702164689829e-05, + "loss": 0.013318242505192757, + "num_input_tokens_seen": 6992552, + "step": 427, + "train_runtime": 3471.613, + "train_tokens_per_second": 2014.208 + }, + { + "epoch": 0.2593939393939394, + "grad_norm": 0.049007292836904526, + "learning_rate": 9.993653823195578e-05, + "loss": 0.014676532708108425, + "num_input_tokens_seen": 7008928, + "step": 428, + "train_runtime": 3479.731, + "train_tokens_per_second": 2014.215 + }, + { + "epoch": 0.26, + "grad_norm": 0.029083114117383957, + "learning_rate": 9.993605296995796e-05, + "loss": 0.013533808290958405, + "num_input_tokens_seen": 7025304, + "step": 429, + "train_runtime": 3487.8493, + "train_tokens_per_second": 2014.222 + }, + { + "epoch": 0.2606060606060606, + "grad_norm": 0.03159458562731743, + "learning_rate": 9.993556586092281e-05, + "loss": 0.015523270703852177, + "num_input_tokens_seen": 7041680, + "step": 430, + "train_runtime": 3495.9654, + "train_tokens_per_second": 2014.23 + }, + { + "epoch": 0.26121212121212123, + "grad_norm": 0.023704880848526955, + "learning_rate": 9.993507690486831e-05, + "loss": 0.014423849992454052, + "num_input_tokens_seen": 7058056, + "step": 431, + "train_runtime": 3504.0833, + "train_tokens_per_second": 2014.238 + }, + { + "epoch": 0.26181818181818184, + "grad_norm": 0.061435069888830185, + "learning_rate": 9.993458610181256e-05, + "loss": 0.01381218247115612, + "num_input_tokens_seen": 7074432, + "step": 432, + "train_runtime": 3512.2002, + "train_tokens_per_second": 2014.245 + }, + { + "epoch": 0.26242424242424245, + "grad_norm": 0.027623331174254417, + "learning_rate": 9.993409345177371e-05, + "loss": 0.013529473915696144, + "num_input_tokens_seen": 7090808, + "step": 433, + "train_runtime": 3520.3183, + "train_tokens_per_second": 2014.252 + }, + { + "epoch": 0.263030303030303, + "grad_norm": 0.02938493713736534, + "learning_rate": 9.993359895477e-05, + "loss": 0.014209594577550888, + "num_input_tokens_seen": 7107184, + "step": 434, + "train_runtime": 3528.4347, + "train_tokens_per_second": 2014.26 + }, + { + "epoch": 0.2636363636363636, + "grad_norm": 0.05708494782447815, + "learning_rate": 9.993310261081968e-05, + "loss": 0.01838802546262741, + "num_input_tokens_seen": 7123560, + "step": 435, + "train_runtime": 3536.5523, + "train_tokens_per_second": 2014.267 + }, + { + "epoch": 0.2642424242424242, + "grad_norm": 0.01653749868273735, + "learning_rate": 9.993260441994116e-05, + "loss": 0.014132829383015633, + "num_input_tokens_seen": 7139936, + "step": 436, + "train_runtime": 3544.6693, + "train_tokens_per_second": 2014.274 + }, + { + "epoch": 0.26484848484848483, + "grad_norm": 0.06222791597247124, + "learning_rate": 9.993210438215284e-05, + "loss": 0.017560908570885658, + "num_input_tokens_seen": 7156312, + "step": 437, + "train_runtime": 3552.7886, + "train_tokens_per_second": 2014.28 + }, + { + "epoch": 0.26545454545454544, + "grad_norm": 0.023168306797742844, + "learning_rate": 9.993160249747319e-05, + "loss": 0.014680145308375359, + "num_input_tokens_seen": 7172688, + "step": 438, + "train_runtime": 3560.9057, + "train_tokens_per_second": 2014.288 + }, + { + "epoch": 0.26606060606060605, + "grad_norm": 0.03977813571691513, + "learning_rate": 9.993109876592083e-05, + "loss": 0.01688549481332302, + "num_input_tokens_seen": 7189064, + "step": 439, + "train_runtime": 3569.029, + "train_tokens_per_second": 2014.291 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 0.027993550524115562, + "learning_rate": 9.993059318751435e-05, + "loss": 0.012989813461899757, + "num_input_tokens_seen": 7205440, + "step": 440, + "train_runtime": 3577.1458, + "train_tokens_per_second": 2014.299 + }, + { + "epoch": 0.2672727272727273, + "grad_norm": 0.051551882177591324, + "learning_rate": 9.993008576227247e-05, + "loss": 0.016102567315101624, + "num_input_tokens_seen": 7221816, + "step": 441, + "train_runtime": 3585.2621, + "train_tokens_per_second": 2014.306 + }, + { + "epoch": 0.2678787878787879, + "grad_norm": 0.03278960660099983, + "learning_rate": 9.992957649021395e-05, + "loss": 0.014773263595998287, + "num_input_tokens_seen": 7238192, + "step": 442, + "train_runtime": 3593.378, + "train_tokens_per_second": 2014.314 + }, + { + "epoch": 0.2684848484848485, + "grad_norm": 0.030394606292247772, + "learning_rate": 9.992906537135762e-05, + "loss": 0.015549161471426487, + "num_input_tokens_seen": 7254568, + "step": 443, + "train_runtime": 3601.4945, + "train_tokens_per_second": 2014.322 + }, + { + "epoch": 0.2690909090909091, + "grad_norm": 0.027792129665613174, + "learning_rate": 9.992855240572241e-05, + "loss": 0.01473160833120346, + "num_input_tokens_seen": 7270944, + "step": 444, + "train_runtime": 3609.6111, + "train_tokens_per_second": 2014.329 + }, + { + "epoch": 0.2696969696969697, + "grad_norm": 0.01833016611635685, + "learning_rate": 9.992803759332728e-05, + "loss": 0.013827802613377571, + "num_input_tokens_seen": 7287320, + "step": 445, + "train_runtime": 3617.7304, + "train_tokens_per_second": 2014.335 + }, + { + "epoch": 0.2703030303030303, + "grad_norm": 0.021910199895501137, + "learning_rate": 9.992752093419124e-05, + "loss": 0.014088256284594536, + "num_input_tokens_seen": 7303696, + "step": 446, + "train_runtime": 3625.8455, + "train_tokens_per_second": 2014.343 + }, + { + "epoch": 0.27090909090909093, + "grad_norm": 0.03614957630634308, + "learning_rate": 9.992700242833346e-05, + "loss": 0.014040564186871052, + "num_input_tokens_seen": 7320072, + "step": 447, + "train_runtime": 3633.9607, + "train_tokens_per_second": 2014.351 + }, + { + "epoch": 0.27151515151515154, + "grad_norm": 0.03147033974528313, + "learning_rate": 9.992648207577308e-05, + "loss": 0.01510291825979948, + "num_input_tokens_seen": 7336448, + "step": 448, + "train_runtime": 3642.0772, + "train_tokens_per_second": 2014.358 + }, + { + "epoch": 0.2721212121212121, + "grad_norm": 0.01757362298667431, + "learning_rate": 9.992595987652935e-05, + "loss": 0.01235952414572239, + "num_input_tokens_seen": 7352824, + "step": 449, + "train_runtime": 3650.194, + "train_tokens_per_second": 2014.365 + }, + { + "epoch": 0.2727272727272727, + "grad_norm": 0.018810704350471497, + "learning_rate": 9.99254358306216e-05, + "loss": 0.01245784480124712, + "num_input_tokens_seen": 7369200, + "step": 450, + "train_runtime": 3658.3109, + "train_tokens_per_second": 2014.372 + }, + { + "epoch": 0.2733333333333333, + "grad_norm": 0.02399486117064953, + "learning_rate": 9.99249099380692e-05, + "loss": 0.012906970456242561, + "num_input_tokens_seen": 7385576, + "step": 451, + "train_runtime": 3666.429, + "train_tokens_per_second": 2014.379 + }, + { + "epoch": 0.2739393939393939, + "grad_norm": 0.07980017364025116, + "learning_rate": 9.99243821988916e-05, + "loss": 0.017233727499842644, + "num_input_tokens_seen": 7401952, + "step": 452, + "train_runtime": 3674.5475, + "train_tokens_per_second": 2014.385 + }, + { + "epoch": 0.27454545454545454, + "grad_norm": 0.019096143543720245, + "learning_rate": 9.992385261310833e-05, + "loss": 0.013073702342808247, + "num_input_tokens_seen": 7418328, + "step": 453, + "train_runtime": 3682.6645, + "train_tokens_per_second": 2014.392 + }, + { + "epoch": 0.27515151515151515, + "grad_norm": 0.055766504257917404, + "learning_rate": 9.992332118073897e-05, + "loss": 0.014186715707182884, + "num_input_tokens_seen": 7434704, + "step": 454, + "train_runtime": 3690.7797, + "train_tokens_per_second": 2014.399 + }, + { + "epoch": 0.27575757575757576, + "grad_norm": 0.02542242966592312, + "learning_rate": 9.992278790180318e-05, + "loss": 0.016023358330130577, + "num_input_tokens_seen": 7451080, + "step": 455, + "train_runtime": 3698.8942, + "train_tokens_per_second": 2014.407 + }, + { + "epoch": 0.27636363636363637, + "grad_norm": 0.020465506240725517, + "learning_rate": 9.99222527763207e-05, + "loss": 0.013445570133626461, + "num_input_tokens_seen": 7467456, + "step": 456, + "train_runtime": 3707.0085, + "train_tokens_per_second": 2014.416 + }, + { + "epoch": 0.276969696969697, + "grad_norm": 0.022726397961378098, + "learning_rate": 9.992171580431129e-05, + "loss": 0.013883800245821476, + "num_input_tokens_seen": 7483832, + "step": 457, + "train_runtime": 3715.1297, + "train_tokens_per_second": 2014.42 + }, + { + "epoch": 0.2775757575757576, + "grad_norm": 0.06926342844963074, + "learning_rate": 9.992117698579484e-05, + "loss": 0.016109909862279892, + "num_input_tokens_seen": 7500208, + "step": 458, + "train_runtime": 3723.2461, + "train_tokens_per_second": 2014.427 + }, + { + "epoch": 0.2781818181818182, + "grad_norm": 0.03352541849017143, + "learning_rate": 9.992063632079127e-05, + "loss": 0.01359601691365242, + "num_input_tokens_seen": 7516584, + "step": 459, + "train_runtime": 3731.3653, + "train_tokens_per_second": 2014.433 + }, + { + "epoch": 0.2787878787878788, + "grad_norm": 0.046891167759895325, + "learning_rate": 9.992009380932059e-05, + "loss": 0.014447907917201519, + "num_input_tokens_seen": 7532960, + "step": 460, + "train_runtime": 3739.4829, + "train_tokens_per_second": 2014.439 + }, + { + "epoch": 0.2793939393939394, + "grad_norm": 0.05756726115942001, + "learning_rate": 9.991954945140284e-05, + "loss": 0.012774428352713585, + "num_input_tokens_seen": 7549336, + "step": 461, + "train_runtime": 3747.5996, + "train_tokens_per_second": 2014.446 + }, + { + "epoch": 0.28, + "grad_norm": 0.06149715185165405, + "learning_rate": 9.991900324705817e-05, + "loss": 0.015111779794096947, + "num_input_tokens_seen": 7565712, + "step": 462, + "train_runtime": 3755.7151, + "train_tokens_per_second": 2014.453 + }, + { + "epoch": 0.2806060606060606, + "grad_norm": 0.03807002305984497, + "learning_rate": 9.991845519630678e-05, + "loss": 0.014264722354710102, + "num_input_tokens_seen": 7582088, + "step": 463, + "train_runtime": 3763.8316, + "train_tokens_per_second": 2014.46 + }, + { + "epoch": 0.2812121212121212, + "grad_norm": 0.038672804832458496, + "learning_rate": 9.991790529916896e-05, + "loss": 0.014925600029528141, + "num_input_tokens_seen": 7598464, + "step": 464, + "train_runtime": 3771.9486, + "train_tokens_per_second": 2014.466 + }, + { + "epoch": 0.2818181818181818, + "grad_norm": 0.04409286752343178, + "learning_rate": 9.991735355566502e-05, + "loss": 0.01355639100074768, + "num_input_tokens_seen": 7614840, + "step": 465, + "train_runtime": 3780.0654, + "train_tokens_per_second": 2014.473 + }, + { + "epoch": 0.2824242424242424, + "grad_norm": 0.05239715427160263, + "learning_rate": 9.991679996581539e-05, + "loss": 0.01419782917946577, + "num_input_tokens_seen": 7631216, + "step": 466, + "train_runtime": 3788.182, + "train_tokens_per_second": 2014.48 + }, + { + "epoch": 0.283030303030303, + "grad_norm": 0.04078468307852745, + "learning_rate": 9.991624452964054e-05, + "loss": 0.014365100301802158, + "num_input_tokens_seen": 7647592, + "step": 467, + "train_runtime": 3796.2972, + "train_tokens_per_second": 2014.487 + }, + { + "epoch": 0.28363636363636363, + "grad_norm": 0.05068361386656761, + "learning_rate": 9.9915687247161e-05, + "loss": 0.016069650650024414, + "num_input_tokens_seen": 7663968, + "step": 468, + "train_runtime": 3804.4156, + "train_tokens_per_second": 2014.493 + }, + { + "epoch": 0.28424242424242424, + "grad_norm": 0.028354912996292114, + "learning_rate": 9.991512811839741e-05, + "loss": 0.01326735783368349, + "num_input_tokens_seen": 7680344, + "step": 469, + "train_runtime": 3812.5326, + "train_tokens_per_second": 2014.499 + }, + { + "epoch": 0.28484848484848485, + "grad_norm": 0.018959172070026398, + "learning_rate": 9.991456714337041e-05, + "loss": 0.01290344912558794, + "num_input_tokens_seen": 7696720, + "step": 470, + "train_runtime": 3820.6476, + "train_tokens_per_second": 2014.507 + }, + { + "epoch": 0.28545454545454546, + "grad_norm": 0.03419540822505951, + "learning_rate": 9.99140043221008e-05, + "loss": 0.015551136806607246, + "num_input_tokens_seen": 7713096, + "step": 471, + "train_runtime": 3828.7623, + "train_tokens_per_second": 2014.514 + }, + { + "epoch": 0.28606060606060607, + "grad_norm": 0.0427350290119648, + "learning_rate": 9.991343965460937e-05, + "loss": 0.014623988419771194, + "num_input_tokens_seen": 7729472, + "step": 472, + "train_runtime": 3836.8784, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.2866666666666667, + "grad_norm": 0.030883153900504112, + "learning_rate": 9.991287314091699e-05, + "loss": 0.013778546825051308, + "num_input_tokens_seen": 7745848, + "step": 473, + "train_runtime": 3844.9946, + "train_tokens_per_second": 2014.528 + }, + { + "epoch": 0.2872727272727273, + "grad_norm": 0.021236877888441086, + "learning_rate": 9.991230478104466e-05, + "loss": 0.013353691436350346, + "num_input_tokens_seen": 7762224, + "step": 474, + "train_runtime": 3853.1121, + "train_tokens_per_second": 2014.534 + }, + { + "epoch": 0.2878787878787879, + "grad_norm": 0.019137293100357056, + "learning_rate": 9.991173457501337e-05, + "loss": 0.013228803873062134, + "num_input_tokens_seen": 7778600, + "step": 475, + "train_runtime": 3861.229, + "train_tokens_per_second": 2014.54 + }, + { + "epoch": 0.2884848484848485, + "grad_norm": 0.01902465894818306, + "learning_rate": 9.991116252284421e-05, + "loss": 0.013284035958349705, + "num_input_tokens_seen": 7794976, + "step": 476, + "train_runtime": 3869.3457, + "train_tokens_per_second": 2014.546 + }, + { + "epoch": 0.28909090909090907, + "grad_norm": 0.028947357088327408, + "learning_rate": 9.991058862455833e-05, + "loss": 0.01423730794340372, + "num_input_tokens_seen": 7811352, + "step": 477, + "train_runtime": 3877.4643, + "train_tokens_per_second": 2014.552 + }, + { + "epoch": 0.2896969696969697, + "grad_norm": 0.024383556097745895, + "learning_rate": 9.991001288017701e-05, + "loss": 0.013436602428555489, + "num_input_tokens_seen": 7827728, + "step": 478, + "train_runtime": 3885.5822, + "train_tokens_per_second": 2014.557 + }, + { + "epoch": 0.2903030303030303, + "grad_norm": 0.04384802654385567, + "learning_rate": 9.990943528972147e-05, + "loss": 0.013107577338814735, + "num_input_tokens_seen": 7844104, + "step": 479, + "train_runtime": 3893.6976, + "train_tokens_per_second": 2014.564 + }, + { + "epoch": 0.2909090909090909, + "grad_norm": 0.020900119096040726, + "learning_rate": 9.990885585321315e-05, + "loss": 0.015309646725654602, + "num_input_tokens_seen": 7860480, + "step": 480, + "train_runtime": 3901.8179, + "train_tokens_per_second": 2014.569 + }, + { + "epoch": 0.2915151515151515, + "grad_norm": 0.018041405826807022, + "learning_rate": 9.990827457067343e-05, + "loss": 0.012978669255971909, + "num_input_tokens_seen": 7876856, + "step": 481, + "train_runtime": 3909.935, + "train_tokens_per_second": 2014.575 + }, + { + "epoch": 0.2921212121212121, + "grad_norm": 0.02291363663971424, + "learning_rate": 9.99076914421238e-05, + "loss": 0.014085205271840096, + "num_input_tokens_seen": 7893232, + "step": 482, + "train_runtime": 3918.0537, + "train_tokens_per_second": 2014.58 + }, + { + "epoch": 0.2927272727272727, + "grad_norm": 0.023675069212913513, + "learning_rate": 9.990710646758589e-05, + "loss": 0.014468826353549957, + "num_input_tokens_seen": 7909608, + "step": 483, + "train_runtime": 3926.1718, + "train_tokens_per_second": 2014.585 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 0.021886734291911125, + "learning_rate": 9.990651964708128e-05, + "loss": 0.014159688726067543, + "num_input_tokens_seen": 7925984, + "step": 484, + "train_runtime": 3934.2949, + "train_tokens_per_second": 2014.588 + }, + { + "epoch": 0.29393939393939394, + "grad_norm": 0.019282542169094086, + "learning_rate": 9.99059309806317e-05, + "loss": 0.013447335921227932, + "num_input_tokens_seen": 7942360, + "step": 485, + "train_runtime": 3942.412, + "train_tokens_per_second": 2014.594 + }, + { + "epoch": 0.29454545454545455, + "grad_norm": 0.021736539900302887, + "learning_rate": 9.990534046825893e-05, + "loss": 0.014465593732893467, + "num_input_tokens_seen": 7958736, + "step": 486, + "train_runtime": 3950.5289, + "train_tokens_per_second": 2014.6 + }, + { + "epoch": 0.29515151515151516, + "grad_norm": 0.058480676263570786, + "learning_rate": 9.99047481099848e-05, + "loss": 0.015324249863624573, + "num_input_tokens_seen": 7975112, + "step": 487, + "train_runtime": 3958.6459, + "train_tokens_per_second": 2014.606 + }, + { + "epoch": 0.2957575757575758, + "grad_norm": 0.04795762896537781, + "learning_rate": 9.990415390583122e-05, + "loss": 0.015603849664330482, + "num_input_tokens_seen": 7991488, + "step": 488, + "train_runtime": 3966.7616, + "train_tokens_per_second": 2014.613 + }, + { + "epoch": 0.2963636363636364, + "grad_norm": 0.045595213770866394, + "learning_rate": 9.990355785582017e-05, + "loss": 0.013210836797952652, + "num_input_tokens_seen": 8007864, + "step": 489, + "train_runtime": 3974.8769, + "train_tokens_per_second": 2014.619 + }, + { + "epoch": 0.296969696969697, + "grad_norm": 0.03191044181585312, + "learning_rate": 9.99029599599737e-05, + "loss": 0.0133978221565485, + "num_input_tokens_seen": 8024240, + "step": 490, + "train_runtime": 3982.9927, + "train_tokens_per_second": 2014.626 + }, + { + "epoch": 0.29757575757575755, + "grad_norm": 0.03503177687525749, + "learning_rate": 9.990236021831391e-05, + "loss": 0.01524767093360424, + "num_input_tokens_seen": 8040616, + "step": 491, + "train_runtime": 3991.1095, + "train_tokens_per_second": 2014.632 + }, + { + "epoch": 0.29818181818181816, + "grad_norm": 0.021688032895326614, + "learning_rate": 9.990175863086302e-05, + "loss": 0.013602089136838913, + "num_input_tokens_seen": 8056992, + "step": 492, + "train_runtime": 3999.2294, + "train_tokens_per_second": 2014.636 + }, + { + "epoch": 0.29878787878787877, + "grad_norm": 0.02230294793844223, + "learning_rate": 9.990115519764325e-05, + "loss": 0.01378709264099598, + "num_input_tokens_seen": 8073368, + "step": 493, + "train_runtime": 4007.3438, + "train_tokens_per_second": 2014.643 + }, + { + "epoch": 0.2993939393939394, + "grad_norm": 0.0244484543800354, + "learning_rate": 9.990054991867692e-05, + "loss": 0.01362735964357853, + "num_input_tokens_seen": 8089744, + "step": 494, + "train_runtime": 4015.461, + "train_tokens_per_second": 2014.649 + }, + { + "epoch": 0.3, + "grad_norm": 0.021698100492358208, + "learning_rate": 9.989994279398642e-05, + "loss": 0.01317393034696579, + "num_input_tokens_seen": 8106120, + "step": 495, + "train_runtime": 4023.5779, + "train_tokens_per_second": 2014.655 + }, + { + "epoch": 0.3006060606060606, + "grad_norm": 0.04310522973537445, + "learning_rate": 9.989933382359422e-05, + "loss": 0.014429607428610325, + "num_input_tokens_seen": 8122496, + "step": 496, + "train_runtime": 4031.6942, + "train_tokens_per_second": 2014.661 + }, + { + "epoch": 0.3012121212121212, + "grad_norm": 0.018435562029480934, + "learning_rate": 9.989872300752283e-05, + "loss": 0.013920141384005547, + "num_input_tokens_seen": 8138872, + "step": 497, + "train_runtime": 4039.8107, + "train_tokens_per_second": 2014.667 + }, + { + "epoch": 0.3018181818181818, + "grad_norm": 0.023063285276293755, + "learning_rate": 9.989811034579486e-05, + "loss": 0.0139535591006279, + "num_input_tokens_seen": 8155248, + "step": 498, + "train_runtime": 4047.9289, + "train_tokens_per_second": 2014.672 + }, + { + "epoch": 0.30242424242424243, + "grad_norm": 0.0432952381670475, + "learning_rate": 9.989749583843296e-05, + "loss": 0.014083024114370346, + "num_input_tokens_seen": 8171624, + "step": 499, + "train_runtime": 4056.0475, + "train_tokens_per_second": 2014.677 + }, + { + "epoch": 0.30303030303030304, + "grad_norm": 0.0212725643068552, + "learning_rate": 9.989687948545985e-05, + "loss": 0.013151183724403381, + "num_input_tokens_seen": 8188000, + "step": 500, + "train_runtime": 4064.1628, + "train_tokens_per_second": 2014.683 + }, + { + "epoch": 0.30363636363636365, + "grad_norm": 0.03599437326192856, + "learning_rate": 9.989626128689835e-05, + "loss": 0.016130445525050163, + "num_input_tokens_seen": 8204376, + "step": 501, + "train_runtime": 4073.1872, + "train_tokens_per_second": 2014.24 + }, + { + "epoch": 0.30424242424242426, + "grad_norm": 0.021969085559248924, + "learning_rate": 9.98956412427713e-05, + "loss": 0.013816497288644314, + "num_input_tokens_seen": 8220752, + "step": 502, + "train_runtime": 4081.3029, + "train_tokens_per_second": 2014.247 + }, + { + "epoch": 0.30484848484848487, + "grad_norm": 0.032568175345659256, + "learning_rate": 9.989501935310166e-05, + "loss": 0.015003332868218422, + "num_input_tokens_seen": 8237128, + "step": 503, + "train_runtime": 4089.4198, + "train_tokens_per_second": 2014.253 + }, + { + "epoch": 0.3054545454545455, + "grad_norm": 0.0263565294444561, + "learning_rate": 9.98943956179124e-05, + "loss": 0.014254853129386902, + "num_input_tokens_seen": 8253504, + "step": 504, + "train_runtime": 4097.5386, + "train_tokens_per_second": 2014.259 + }, + { + "epoch": 0.30606060606060603, + "grad_norm": 0.019410574808716774, + "learning_rate": 9.989377003722664e-05, + "loss": 0.012152588926255703, + "num_input_tokens_seen": 8269880, + "step": 505, + "train_runtime": 4105.6623, + "train_tokens_per_second": 2014.262 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 0.043787118047475815, + "learning_rate": 9.989314261106749e-05, + "loss": 0.013709669932723045, + "num_input_tokens_seen": 8286256, + "step": 506, + "train_runtime": 4113.7771, + "train_tokens_per_second": 2014.27 + }, + { + "epoch": 0.30727272727272725, + "grad_norm": 0.04813135415315628, + "learning_rate": 9.989251333945813e-05, + "loss": 0.014608191326260567, + "num_input_tokens_seen": 8302632, + "step": 507, + "train_runtime": 4121.8917, + "train_tokens_per_second": 2014.277 + }, + { + "epoch": 0.30787878787878786, + "grad_norm": 0.021107302978634834, + "learning_rate": 9.989188222242188e-05, + "loss": 0.012715778313577175, + "num_input_tokens_seen": 8319008, + "step": 508, + "train_runtime": 4130.0145, + "train_tokens_per_second": 2014.281 + }, + { + "epoch": 0.3084848484848485, + "grad_norm": 0.019778916612267494, + "learning_rate": 9.989124925998205e-05, + "loss": 0.012830444611608982, + "num_input_tokens_seen": 8335384, + "step": 509, + "train_runtime": 4138.13, + "train_tokens_per_second": 2014.288 + }, + { + "epoch": 0.3090909090909091, + "grad_norm": 0.019679522141814232, + "learning_rate": 9.989061445216208e-05, + "loss": 0.013841142877936363, + "num_input_tokens_seen": 8351760, + "step": 510, + "train_runtime": 4146.2492, + "train_tokens_per_second": 2014.293 + }, + { + "epoch": 0.3096969696969697, + "grad_norm": 0.023106170818209648, + "learning_rate": 9.988997779898545e-05, + "loss": 0.013808130286633968, + "num_input_tokens_seen": 8368136, + "step": 511, + "train_runtime": 4154.3635, + "train_tokens_per_second": 2014.3 + }, + { + "epoch": 0.3103030303030303, + "grad_norm": 0.02681031823158264, + "learning_rate": 9.988933930047569e-05, + "loss": 0.015086129307746887, + "num_input_tokens_seen": 8384512, + "step": 512, + "train_runtime": 4162.4819, + "train_tokens_per_second": 2014.306 + }, + { + "epoch": 0.3109090909090909, + "grad_norm": 0.044101521372795105, + "learning_rate": 9.988869895665642e-05, + "loss": 0.01502022985368967, + "num_input_tokens_seen": 8400888, + "step": 513, + "train_runtime": 4170.6029, + "train_tokens_per_second": 2014.31 + }, + { + "epoch": 0.3115151515151515, + "grad_norm": 0.016393663361668587, + "learning_rate": 9.988805676755133e-05, + "loss": 0.01283847913146019, + "num_input_tokens_seen": 8417264, + "step": 514, + "train_runtime": 4178.7186, + "train_tokens_per_second": 2014.317 + }, + { + "epoch": 0.31212121212121213, + "grad_norm": 0.04226645827293396, + "learning_rate": 9.988741273318416e-05, + "loss": 0.01453358493745327, + "num_input_tokens_seen": 8433640, + "step": 515, + "train_runtime": 4186.8351, + "train_tokens_per_second": 2014.323 + }, + { + "epoch": 0.31272727272727274, + "grad_norm": 0.021670697256922722, + "learning_rate": 9.988676685357876e-05, + "loss": 0.014670845121145248, + "num_input_tokens_seen": 8450016, + "step": 516, + "train_runtime": 4194.9601, + "train_tokens_per_second": 2014.326 + }, + { + "epoch": 0.31333333333333335, + "grad_norm": 0.02870395965874195, + "learning_rate": 9.988611912875901e-05, + "loss": 0.013808513060212135, + "num_input_tokens_seen": 8466392, + "step": 517, + "train_runtime": 4203.0848, + "train_tokens_per_second": 2014.328 + }, + { + "epoch": 0.31393939393939396, + "grad_norm": 0.02202719636261463, + "learning_rate": 9.988546955874885e-05, + "loss": 0.014270287938416004, + "num_input_tokens_seen": 8482768, + "step": 518, + "train_runtime": 4211.2073, + "train_tokens_per_second": 2014.332 + }, + { + "epoch": 0.3145454545454546, + "grad_norm": 0.03838543966412544, + "learning_rate": 9.988481814357233e-05, + "loss": 0.016241563484072685, + "num_input_tokens_seen": 8499144, + "step": 519, + "train_runtime": 4219.3304, + "train_tokens_per_second": 2014.335 + }, + { + "epoch": 0.3151515151515151, + "grad_norm": 0.024275533854961395, + "learning_rate": 9.988416488325352e-05, + "loss": 0.012701138854026794, + "num_input_tokens_seen": 8515520, + "step": 520, + "train_runtime": 4227.4491, + "train_tokens_per_second": 2014.34 + }, + { + "epoch": 0.31575757575757574, + "grad_norm": 0.019648293033242226, + "learning_rate": 9.98835097778166e-05, + "loss": 0.014066259376704693, + "num_input_tokens_seen": 8531896, + "step": 521, + "train_runtime": 4235.5669, + "train_tokens_per_second": 2014.346 + }, + { + "epoch": 0.31636363636363635, + "grad_norm": 0.03942210599780083, + "learning_rate": 9.98828528272858e-05, + "loss": 0.015006550587713718, + "num_input_tokens_seen": 8548272, + "step": 522, + "train_runtime": 4243.6821, + "train_tokens_per_second": 2014.353 + }, + { + "epoch": 0.31696969696969696, + "grad_norm": 0.01995157264173031, + "learning_rate": 9.988219403168542e-05, + "loss": 0.014066948555409908, + "num_input_tokens_seen": 8564648, + "step": 523, + "train_runtime": 4251.7984, + "train_tokens_per_second": 2014.359 + }, + { + "epoch": 0.31757575757575757, + "grad_norm": 0.05812249332666397, + "learning_rate": 9.988153339103983e-05, + "loss": 0.01575363054871559, + "num_input_tokens_seen": 8581024, + "step": 524, + "train_runtime": 4259.9171, + "train_tokens_per_second": 2014.364 + }, + { + "epoch": 0.3181818181818182, + "grad_norm": 0.02528631128370762, + "learning_rate": 9.988087090537344e-05, + "loss": 0.013741475529968739, + "num_input_tokens_seen": 8597400, + "step": 525, + "train_runtime": 4268.0355, + "train_tokens_per_second": 2014.369 + }, + { + "epoch": 0.3187878787878788, + "grad_norm": 0.015316633507609367, + "learning_rate": 9.988020657471077e-05, + "loss": 0.01343776285648346, + "num_input_tokens_seen": 8613776, + "step": 526, + "train_runtime": 4276.154, + "train_tokens_per_second": 2014.375 + }, + { + "epoch": 0.3193939393939394, + "grad_norm": 0.0239357091486454, + "learning_rate": 9.987954039907642e-05, + "loss": 0.013446596451103687, + "num_input_tokens_seen": 8630152, + "step": 527, + "train_runtime": 4284.2698, + "train_tokens_per_second": 2014.381 + }, + { + "epoch": 0.32, + "grad_norm": 0.023286571726202965, + "learning_rate": 9.9878872378495e-05, + "loss": 0.012851690873503685, + "num_input_tokens_seen": 8646528, + "step": 528, + "train_runtime": 4292.3857, + "train_tokens_per_second": 2014.387 + }, + { + "epoch": 0.3206060606060606, + "grad_norm": 0.03030410036444664, + "learning_rate": 9.987820251299122e-05, + "loss": 0.014106137678027153, + "num_input_tokens_seen": 8662904, + "step": 529, + "train_runtime": 4300.5021, + "train_tokens_per_second": 2014.394 + }, + { + "epoch": 0.3212121212121212, + "grad_norm": 0.018672285601496696, + "learning_rate": 9.987753080258986e-05, + "loss": 0.013117408379912376, + "num_input_tokens_seen": 8679280, + "step": 530, + "train_runtime": 4308.6186, + "train_tokens_per_second": 2014.4 + }, + { + "epoch": 0.32181818181818184, + "grad_norm": 0.032513462007045746, + "learning_rate": 9.987685724731577e-05, + "loss": 0.01231987215578556, + "num_input_tokens_seen": 8695656, + "step": 531, + "train_runtime": 4316.7369, + "train_tokens_per_second": 2014.405 + }, + { + "epoch": 0.32242424242424245, + "grad_norm": 0.11805391311645508, + "learning_rate": 9.987618184719386e-05, + "loss": 0.013388572260737419, + "num_input_tokens_seen": 8712032, + "step": 532, + "train_runtime": 4324.8544, + "train_tokens_per_second": 2014.41 + }, + { + "epoch": 0.32303030303030306, + "grad_norm": 0.02607562392950058, + "learning_rate": 9.987550460224912e-05, + "loss": 0.014675582759082317, + "num_input_tokens_seen": 8728408, + "step": 533, + "train_runtime": 4332.9699, + "train_tokens_per_second": 2014.417 + }, + { + "epoch": 0.3236363636363636, + "grad_norm": 0.03229625150561333, + "learning_rate": 9.987482551250659e-05, + "loss": 0.014730843715369701, + "num_input_tokens_seen": 8744784, + "step": 534, + "train_runtime": 4341.0862, + "train_tokens_per_second": 2014.423 + }, + { + "epoch": 0.3242424242424242, + "grad_norm": 0.02484363690018654, + "learning_rate": 9.987414457799138e-05, + "loss": 0.01373380795121193, + "num_input_tokens_seen": 8761160, + "step": 535, + "train_runtime": 4349.2033, + "train_tokens_per_second": 2014.429 + }, + { + "epoch": 0.32484848484848483, + "grad_norm": 0.06518429517745972, + "learning_rate": 9.987346179872869e-05, + "loss": 0.01318280678242445, + "num_input_tokens_seen": 8777536, + "step": 536, + "train_runtime": 4357.3294, + "train_tokens_per_second": 2014.43 + }, + { + "epoch": 0.32545454545454544, + "grad_norm": 0.023426007479429245, + "learning_rate": 9.98727771747438e-05, + "loss": 0.013221761211752892, + "num_input_tokens_seen": 8793912, + "step": 537, + "train_runtime": 4365.4504, + "train_tokens_per_second": 2014.434 + }, + { + "epoch": 0.32606060606060605, + "grad_norm": 0.017606353387236595, + "learning_rate": 9.987209070606199e-05, + "loss": 0.013325998559594154, + "num_input_tokens_seen": 8810288, + "step": 538, + "train_runtime": 4373.5723, + "train_tokens_per_second": 2014.437 + }, + { + "epoch": 0.32666666666666666, + "grad_norm": 0.01875401847064495, + "learning_rate": 9.987140239270865e-05, + "loss": 0.012510064989328384, + "num_input_tokens_seen": 8826664, + "step": 539, + "train_runtime": 4381.6917, + "train_tokens_per_second": 2014.442 + }, + { + "epoch": 0.32727272727272727, + "grad_norm": 0.01927015371620655, + "learning_rate": 9.987071223470926e-05, + "loss": 0.012322126887738705, + "num_input_tokens_seen": 8843040, + "step": 540, + "train_runtime": 4389.8115, + "train_tokens_per_second": 2014.446 + }, + { + "epoch": 0.3278787878787879, + "grad_norm": 0.021669652312994003, + "learning_rate": 9.987002023208935e-05, + "loss": 0.013479230925440788, + "num_input_tokens_seen": 8859416, + "step": 541, + "train_runtime": 4397.9315, + "train_tokens_per_second": 2014.451 + }, + { + "epoch": 0.3284848484848485, + "grad_norm": 0.021821271628141403, + "learning_rate": 9.98693263848745e-05, + "loss": 0.013202294707298279, + "num_input_tokens_seen": 8875792, + "step": 542, + "train_runtime": 4406.0521, + "train_tokens_per_second": 2014.455 + }, + { + "epoch": 0.3290909090909091, + "grad_norm": 0.04035639762878418, + "learning_rate": 9.98686306930904e-05, + "loss": 0.014951585792005062, + "num_input_tokens_seen": 8892168, + "step": 543, + "train_runtime": 4414.1742, + "train_tokens_per_second": 2014.458 + }, + { + "epoch": 0.3296969696969697, + "grad_norm": 0.01868710108101368, + "learning_rate": 9.986793315676276e-05, + "loss": 0.012716731987893581, + "num_input_tokens_seen": 8908544, + "step": 544, + "train_runtime": 4422.2924, + "train_tokens_per_second": 2014.463 + }, + { + "epoch": 0.3303030303030303, + "grad_norm": 0.030803462490439415, + "learning_rate": 9.986723377591738e-05, + "loss": 0.012449722737073898, + "num_input_tokens_seen": 8924920, + "step": 545, + "train_runtime": 4430.4191, + "train_tokens_per_second": 2014.464 + }, + { + "epoch": 0.33090909090909093, + "grad_norm": 0.031005537137389183, + "learning_rate": 9.986653255058014e-05, + "loss": 0.014312123879790306, + "num_input_tokens_seen": 8941296, + "step": 546, + "train_runtime": 4438.5386, + "train_tokens_per_second": 2014.468 + }, + { + "epoch": 0.33151515151515154, + "grad_norm": 0.0480731725692749, + "learning_rate": 9.986582948077696e-05, + "loss": 0.015260567888617516, + "num_input_tokens_seen": 8957672, + "step": 547, + "train_runtime": 4446.6634, + "train_tokens_per_second": 2014.47 + }, + { + "epoch": 0.3321212121212121, + "grad_norm": 0.031962063163518906, + "learning_rate": 9.986512456653388e-05, + "loss": 0.01442326046526432, + "num_input_tokens_seen": 8974048, + "step": 548, + "train_runtime": 4454.7823, + "train_tokens_per_second": 2014.475 + }, + { + "epoch": 0.3327272727272727, + "grad_norm": 0.026429401710629463, + "learning_rate": 9.986441780787692e-05, + "loss": 0.014029188081622124, + "num_input_tokens_seen": 8990424, + "step": 549, + "train_runtime": 4462.9013, + "train_tokens_per_second": 2014.48 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 0.025757839903235435, + "learning_rate": 9.986370920483227e-05, + "loss": 0.013720030896365643, + "num_input_tokens_seen": 9006800, + "step": 550, + "train_runtime": 4471.0287, + "train_tokens_per_second": 2014.48 + }, + { + "epoch": 0.3339393939393939, + "grad_norm": 0.029027970507740974, + "learning_rate": 9.986299875742613e-05, + "loss": 0.014392418786883354, + "num_input_tokens_seen": 9023176, + "step": 551, + "train_runtime": 4479.1476, + "train_tokens_per_second": 2014.485 + }, + { + "epoch": 0.33454545454545453, + "grad_norm": 0.02587730623781681, + "learning_rate": 9.986228646568475e-05, + "loss": 0.014536920003592968, + "num_input_tokens_seen": 9039552, + "step": 552, + "train_runtime": 4487.2645, + "train_tokens_per_second": 2014.491 + }, + { + "epoch": 0.33515151515151514, + "grad_norm": 0.024850307032465935, + "learning_rate": 9.986157232963452e-05, + "loss": 0.014528162777423859, + "num_input_tokens_seen": 9055928, + "step": 553, + "train_runtime": 4495.3823, + "train_tokens_per_second": 2014.496 + }, + { + "epoch": 0.33575757575757575, + "grad_norm": 0.03375309333205223, + "learning_rate": 9.98608563493018e-05, + "loss": 0.01345045492053032, + "num_input_tokens_seen": 9072304, + "step": 554, + "train_runtime": 4503.5123, + "train_tokens_per_second": 2014.495 + }, + { + "epoch": 0.33636363636363636, + "grad_norm": 0.034519318491220474, + "learning_rate": 9.986013852471313e-05, + "loss": 0.016201037913560867, + "num_input_tokens_seen": 9088680, + "step": 555, + "train_runtime": 4511.6315, + "train_tokens_per_second": 2014.5 + }, + { + "epoch": 0.336969696969697, + "grad_norm": 0.025029929354786873, + "learning_rate": 9.985941885589502e-05, + "loss": 0.013687830418348312, + "num_input_tokens_seen": 9105056, + "step": 556, + "train_runtime": 4519.7498, + "train_tokens_per_second": 2014.504 + }, + { + "epoch": 0.3375757575757576, + "grad_norm": 0.02109324000775814, + "learning_rate": 9.98586973428741e-05, + "loss": 0.013876669108867645, + "num_input_tokens_seen": 9121432, + "step": 557, + "train_runtime": 4527.8676, + "train_tokens_per_second": 2014.509 + }, + { + "epoch": 0.3381818181818182, + "grad_norm": 0.017437269911170006, + "learning_rate": 9.985797398567707e-05, + "loss": 0.013100878335535526, + "num_input_tokens_seen": 9137808, + "step": 558, + "train_runtime": 4535.9928, + "train_tokens_per_second": 2014.511 + }, + { + "epoch": 0.3387878787878788, + "grad_norm": 0.04041491076350212, + "learning_rate": 9.985724878433066e-05, + "loss": 0.014973807148635387, + "num_input_tokens_seen": 9154184, + "step": 559, + "train_runtime": 4544.113, + "train_tokens_per_second": 2014.515 + }, + { + "epoch": 0.3393939393939394, + "grad_norm": 0.02034146897494793, + "learning_rate": 9.985652173886174e-05, + "loss": 0.012258726172149181, + "num_input_tokens_seen": 9170560, + "step": 560, + "train_runtime": 4552.2371, + "train_tokens_per_second": 2014.517 + }, + { + "epoch": 0.34, + "grad_norm": 0.016358409076929092, + "learning_rate": 9.985579284929715e-05, + "loss": 0.014534495770931244, + "num_input_tokens_seen": 9186936, + "step": 561, + "train_runtime": 4560.3582, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.3406060606060606, + "grad_norm": 0.017970645800232887, + "learning_rate": 9.985506211566388e-05, + "loss": 0.013168847188353539, + "num_input_tokens_seen": 9203312, + "step": 562, + "train_runtime": 4568.4702, + "train_tokens_per_second": 2014.528 + }, + { + "epoch": 0.3412121212121212, + "grad_norm": 0.02478228323161602, + "learning_rate": 9.985432953798895e-05, + "loss": 0.016286451369524002, + "num_input_tokens_seen": 9219688, + "step": 563, + "train_runtime": 4576.5846, + "train_tokens_per_second": 2014.535 + }, + { + "epoch": 0.3418181818181818, + "grad_norm": 0.023158971220254898, + "learning_rate": 9.985359511629944e-05, + "loss": 0.014812255278229713, + "num_input_tokens_seen": 9236064, + "step": 564, + "train_runtime": 4584.6914, + "train_tokens_per_second": 2014.544 + }, + { + "epoch": 0.3424242424242424, + "grad_norm": 0.017976826056838036, + "learning_rate": 9.985285885062257e-05, + "loss": 0.013011513277888298, + "num_input_tokens_seen": 9252440, + "step": 565, + "train_runtime": 4592.801, + "train_tokens_per_second": 2014.553 + }, + { + "epoch": 0.343030303030303, + "grad_norm": 0.022492917254567146, + "learning_rate": 9.98521207409855e-05, + "loss": 0.014015360735356808, + "num_input_tokens_seen": 9268816, + "step": 566, + "train_runtime": 4600.9112, + "train_tokens_per_second": 2014.561 + }, + { + "epoch": 0.34363636363636363, + "grad_norm": 0.05375469848513603, + "learning_rate": 9.985138078741559e-05, + "loss": 0.013538680039346218, + "num_input_tokens_seen": 9285192, + "step": 567, + "train_runtime": 4609.0183, + "train_tokens_per_second": 2014.57 + }, + { + "epoch": 0.34424242424242424, + "grad_norm": 0.011526068672537804, + "learning_rate": 9.985063898994016e-05, + "loss": 0.012446783483028412, + "num_input_tokens_seen": 9301568, + "step": 568, + "train_runtime": 4617.1293, + "train_tokens_per_second": 2014.578 + }, + { + "epoch": 0.34484848484848485, + "grad_norm": 0.015349720604717731, + "learning_rate": 9.984989534858669e-05, + "loss": 0.012871544808149338, + "num_input_tokens_seen": 9317944, + "step": 569, + "train_runtime": 4625.2366, + "train_tokens_per_second": 2014.588 + }, + { + "epoch": 0.34545454545454546, + "grad_norm": 0.03799523040652275, + "learning_rate": 9.984914986338268e-05, + "loss": 0.014556103385984898, + "num_input_tokens_seen": 9334320, + "step": 570, + "train_runtime": 4633.3464, + "train_tokens_per_second": 2014.596 + }, + { + "epoch": 0.34606060606060607, + "grad_norm": 0.042935777455568314, + "learning_rate": 9.984840253435568e-05, + "loss": 0.015330069698393345, + "num_input_tokens_seen": 9350696, + "step": 571, + "train_runtime": 4641.4533, + "train_tokens_per_second": 2014.605 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 0.026697825640439987, + "learning_rate": 9.984765336153334e-05, + "loss": 0.01370144821703434, + "num_input_tokens_seen": 9367072, + "step": 572, + "train_runtime": 4649.5653, + "train_tokens_per_second": 2014.612 + }, + { + "epoch": 0.3472727272727273, + "grad_norm": 0.04093024507164955, + "learning_rate": 9.984690234494339e-05, + "loss": 0.01424380298703909, + "num_input_tokens_seen": 9383448, + "step": 573, + "train_runtime": 4657.6738, + "train_tokens_per_second": 2014.621 + }, + { + "epoch": 0.3478787878787879, + "grad_norm": 0.03236076980829239, + "learning_rate": 9.984614948461358e-05, + "loss": 0.014988360926508904, + "num_input_tokens_seen": 9399824, + "step": 574, + "train_runtime": 4665.7816, + "train_tokens_per_second": 2014.63 + }, + { + "epoch": 0.3484848484848485, + "grad_norm": 0.016026047989726067, + "learning_rate": 9.984539478057178e-05, + "loss": 0.013162180781364441, + "num_input_tokens_seen": 9416200, + "step": 575, + "train_runtime": 4673.8904, + "train_tokens_per_second": 2014.639 + }, + { + "epoch": 0.3490909090909091, + "grad_norm": 0.03273920342326164, + "learning_rate": 9.984463823284589e-05, + "loss": 0.015174154192209244, + "num_input_tokens_seen": 9432576, + "step": 576, + "train_runtime": 4682.0015, + "train_tokens_per_second": 2014.646 + }, + { + "epoch": 0.3496969696969697, + "grad_norm": 0.03933154046535492, + "learning_rate": 9.98438798414639e-05, + "loss": 0.014418127946555614, + "num_input_tokens_seen": 9448952, + "step": 577, + "train_runtime": 4690.1128, + "train_tokens_per_second": 2014.653 + }, + { + "epoch": 0.3503030303030303, + "grad_norm": 0.02570173889398575, + "learning_rate": 9.984311960645388e-05, + "loss": 0.01333607453852892, + "num_input_tokens_seen": 9465328, + "step": 578, + "train_runtime": 4698.2293, + "train_tokens_per_second": 2014.659 + }, + { + "epoch": 0.3509090909090909, + "grad_norm": 0.024147065356373787, + "learning_rate": 9.984235752784392e-05, + "loss": 0.013619362376630306, + "num_input_tokens_seen": 9481704, + "step": 579, + "train_runtime": 4706.3371, + "train_tokens_per_second": 2014.667 + }, + { + "epoch": 0.3515151515151515, + "grad_norm": 0.04005376994609833, + "learning_rate": 9.98415936056622e-05, + "loss": 0.014414026401937008, + "num_input_tokens_seen": 9498080, + "step": 580, + "train_runtime": 4714.4455, + "train_tokens_per_second": 2014.676 + }, + { + "epoch": 0.3521212121212121, + "grad_norm": 0.03428025171160698, + "learning_rate": 9.984082783993703e-05, + "loss": 0.01436635572463274, + "num_input_tokens_seen": 9514456, + "step": 581, + "train_runtime": 4722.5545, + "train_tokens_per_second": 2014.684 + }, + { + "epoch": 0.3527272727272727, + "grad_norm": 0.02205795608460903, + "learning_rate": 9.984006023069666e-05, + "loss": 0.013060957193374634, + "num_input_tokens_seen": 9530832, + "step": 582, + "train_runtime": 4730.6633, + "train_tokens_per_second": 2014.693 + }, + { + "epoch": 0.35333333333333333, + "grad_norm": 0.020862819626927376, + "learning_rate": 9.983929077796954e-05, + "loss": 0.013365531340241432, + "num_input_tokens_seen": 9547208, + "step": 583, + "train_runtime": 4738.7746, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.35393939393939394, + "grad_norm": 0.012693438678979874, + "learning_rate": 9.983851948178412e-05, + "loss": 0.012265143916010857, + "num_input_tokens_seen": 9563584, + "step": 584, + "train_runtime": 4746.884, + "train_tokens_per_second": 2014.708 + }, + { + "epoch": 0.35454545454545455, + "grad_norm": 0.03995286300778389, + "learning_rate": 9.983774634216892e-05, + "loss": 0.014887749217450619, + "num_input_tokens_seen": 9579960, + "step": 585, + "train_runtime": 4754.9935, + "train_tokens_per_second": 2014.716 + }, + { + "epoch": 0.35515151515151516, + "grad_norm": 0.02919401042163372, + "learning_rate": 9.983697135915252e-05, + "loss": 0.01471506617963314, + "num_input_tokens_seen": 9596336, + "step": 586, + "train_runtime": 4763.1041, + "train_tokens_per_second": 2014.723 + }, + { + "epoch": 0.3557575757575758, + "grad_norm": 0.03058960661292076, + "learning_rate": 9.98361945327636e-05, + "loss": 0.014638346619904041, + "num_input_tokens_seen": 9612712, + "step": 587, + "train_runtime": 4771.2155, + "train_tokens_per_second": 2014.73 + }, + { + "epoch": 0.3563636363636364, + "grad_norm": 0.03899887949228287, + "learning_rate": 9.983541586303091e-05, + "loss": 0.015173106454312801, + "num_input_tokens_seen": 9629088, + "step": 588, + "train_runtime": 4779.3321, + "train_tokens_per_second": 2014.735 + }, + { + "epoch": 0.356969696969697, + "grad_norm": 0.34171223640441895, + "learning_rate": 9.983463534998326e-05, + "loss": 0.01584211364388466, + "num_input_tokens_seen": 9645464, + "step": 589, + "train_runtime": 4787.4435, + "train_tokens_per_second": 2014.742 + }, + { + "epoch": 0.3575757575757576, + "grad_norm": 0.025424521416425705, + "learning_rate": 9.983385299364946e-05, + "loss": 0.01455459464341402, + "num_input_tokens_seen": 9661840, + "step": 590, + "train_runtime": 4795.5546, + "train_tokens_per_second": 2014.749 + }, + { + "epoch": 0.35818181818181816, + "grad_norm": 0.032859109342098236, + "learning_rate": 9.98330687940585e-05, + "loss": 0.0146177988499403, + "num_input_tokens_seen": 9678216, + "step": 591, + "train_runtime": 4803.6648, + "train_tokens_per_second": 2014.757 + }, + { + "epoch": 0.35878787878787877, + "grad_norm": 0.038725532591342926, + "learning_rate": 9.983228275123938e-05, + "loss": 0.014792557805776596, + "num_input_tokens_seen": 9694592, + "step": 592, + "train_runtime": 4811.7746, + "train_tokens_per_second": 2014.764 + }, + { + "epoch": 0.3593939393939394, + "grad_norm": 0.020830297842621803, + "learning_rate": 9.983149486522115e-05, + "loss": 0.014553902670741081, + "num_input_tokens_seen": 9710968, + "step": 593, + "train_runtime": 4819.8876, + "train_tokens_per_second": 2014.771 + }, + { + "epoch": 0.36, + "grad_norm": 0.01844129152595997, + "learning_rate": 9.983070513603295e-05, + "loss": 0.014042770490050316, + "num_input_tokens_seen": 9727344, + "step": 594, + "train_runtime": 4827.9961, + "train_tokens_per_second": 2014.779 + }, + { + "epoch": 0.3606060606060606, + "grad_norm": 0.2604560852050781, + "learning_rate": 9.982991356370404e-05, + "loss": 0.01581915095448494, + "num_input_tokens_seen": 9743720, + "step": 595, + "train_runtime": 4836.1086, + "train_tokens_per_second": 2014.785 + }, + { + "epoch": 0.3612121212121212, + "grad_norm": 0.03814680501818657, + "learning_rate": 9.982912014826365e-05, + "loss": 0.016680167987942696, + "num_input_tokens_seen": 9760096, + "step": 596, + "train_runtime": 4844.2153, + "train_tokens_per_second": 2014.794 + }, + { + "epoch": 0.3618181818181818, + "grad_norm": 0.02060728892683983, + "learning_rate": 9.982832488974115e-05, + "loss": 0.014381843619048595, + "num_input_tokens_seen": 9776472, + "step": 597, + "train_runtime": 4852.3306, + "train_tokens_per_second": 2014.799 + }, + { + "epoch": 0.3624242424242424, + "grad_norm": 0.028759043663740158, + "learning_rate": 9.982752778816595e-05, + "loss": 0.014019730500876904, + "num_input_tokens_seen": 9792848, + "step": 598, + "train_runtime": 4860.4404, + "train_tokens_per_second": 2014.807 + }, + { + "epoch": 0.36303030303030304, + "grad_norm": 0.05189267545938492, + "learning_rate": 9.982672884356752e-05, + "loss": 0.01498887874186039, + "num_input_tokens_seen": 9809224, + "step": 599, + "train_runtime": 4868.5548, + "train_tokens_per_second": 2014.812 + }, + { + "epoch": 0.36363636363636365, + "grad_norm": 0.01455398928374052, + "learning_rate": 9.982592805597544e-05, + "loss": 0.011788399890065193, + "num_input_tokens_seen": 9825600, + "step": 600, + "train_runtime": 4876.663, + "train_tokens_per_second": 2014.82 + }, + { + "epoch": 0.36424242424242426, + "grad_norm": 0.046520307660102844, + "learning_rate": 9.982512542541929e-05, + "loss": 0.012856653891503811, + "num_input_tokens_seen": 9841976, + "step": 601, + "train_runtime": 4885.6882, + "train_tokens_per_second": 2014.45 + }, + { + "epoch": 0.36484848484848487, + "grad_norm": 0.017443792894482613, + "learning_rate": 9.98243209519288e-05, + "loss": 0.013804689049720764, + "num_input_tokens_seen": 9858352, + "step": 602, + "train_runtime": 4893.7966, + "train_tokens_per_second": 2014.459 + }, + { + "epoch": 0.3654545454545455, + "grad_norm": 0.016950292512774467, + "learning_rate": 9.98235146355337e-05, + "loss": 0.01243675872683525, + "num_input_tokens_seen": 9874728, + "step": 603, + "train_runtime": 4901.9018, + "train_tokens_per_second": 2014.469 + }, + { + "epoch": 0.3660606060606061, + "grad_norm": 0.017681090161204338, + "learning_rate": 9.982270647626382e-05, + "loss": 0.011940497905015945, + "num_input_tokens_seen": 9891104, + "step": 604, + "train_runtime": 4910.0066, + "train_tokens_per_second": 2014.479 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 0.018248707056045532, + "learning_rate": 9.982189647414906e-05, + "loss": 0.012673230841755867, + "num_input_tokens_seen": 9907480, + "step": 605, + "train_runtime": 4918.1184, + "train_tokens_per_second": 2014.486 + }, + { + "epoch": 0.36727272727272725, + "grad_norm": 0.020540893077850342, + "learning_rate": 9.982108462921937e-05, + "loss": 0.014132777228951454, + "num_input_tokens_seen": 9923856, + "step": 606, + "train_runtime": 4926.2283, + "train_tokens_per_second": 2014.494 + }, + { + "epoch": 0.36787878787878786, + "grad_norm": 0.023124700412154198, + "learning_rate": 9.982027094150478e-05, + "loss": 0.012684160843491554, + "num_input_tokens_seen": 9940232, + "step": 607, + "train_runtime": 4934.3331, + "train_tokens_per_second": 2014.504 + }, + { + "epoch": 0.36848484848484847, + "grad_norm": 0.020409781485795975, + "learning_rate": 9.98194554110354e-05, + "loss": 0.014147626236081123, + "num_input_tokens_seen": 9956608, + "step": 608, + "train_runtime": 4942.4391, + "train_tokens_per_second": 2014.513 + }, + { + "epoch": 0.3690909090909091, + "grad_norm": 0.015636246651411057, + "learning_rate": 9.981863803784136e-05, + "loss": 0.014182131737470627, + "num_input_tokens_seen": 9972984, + "step": 609, + "train_runtime": 4950.5477, + "train_tokens_per_second": 2014.521 + }, + { + "epoch": 0.3696969696969697, + "grad_norm": 0.0192013718187809, + "learning_rate": 9.981781882195292e-05, + "loss": 0.013808063231408596, + "num_input_tokens_seen": 9989360, + "step": 610, + "train_runtime": 4958.6543, + "train_tokens_per_second": 2014.53 + }, + { + "epoch": 0.3703030303030303, + "grad_norm": 0.017762696370482445, + "learning_rate": 9.981699776340039e-05, + "loss": 0.013210650533437729, + "num_input_tokens_seen": 10005736, + "step": 611, + "train_runtime": 4966.7598, + "train_tokens_per_second": 2014.54 + }, + { + "epoch": 0.3709090909090909, + "grad_norm": 0.025030212476849556, + "learning_rate": 9.981617486221413e-05, + "loss": 0.01400088518857956, + "num_input_tokens_seen": 10022112, + "step": 612, + "train_runtime": 4974.8675, + "train_tokens_per_second": 2014.549 + }, + { + "epoch": 0.3715151515151515, + "grad_norm": 0.030215473845601082, + "learning_rate": 9.981535011842456e-05, + "loss": 0.01368585042655468, + "num_input_tokens_seen": 10038488, + "step": 613, + "train_runtime": 4982.9771, + "train_tokens_per_second": 2014.556 + }, + { + "epoch": 0.37212121212121213, + "grad_norm": 0.021045658737421036, + "learning_rate": 9.981452353206222e-05, + "loss": 0.014398960396647453, + "num_input_tokens_seen": 10054864, + "step": 614, + "train_runtime": 4991.0863, + "train_tokens_per_second": 2014.564 + }, + { + "epoch": 0.37272727272727274, + "grad_norm": 0.01661411114037037, + "learning_rate": 9.981369510315764e-05, + "loss": 0.0135966120287776, + "num_input_tokens_seen": 10071240, + "step": 615, + "train_runtime": 4999.1912, + "train_tokens_per_second": 2014.574 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 0.013987096957862377, + "learning_rate": 9.98128648317415e-05, + "loss": 0.011429902166128159, + "num_input_tokens_seen": 10087616, + "step": 616, + "train_runtime": 5007.2986, + "train_tokens_per_second": 2014.582 + }, + { + "epoch": 0.37393939393939396, + "grad_norm": 0.01872987300157547, + "learning_rate": 9.981203271784449e-05, + "loss": 0.011292507871985435, + "num_input_tokens_seen": 10103992, + "step": 617, + "train_runtime": 5015.406, + "train_tokens_per_second": 2014.591 + }, + { + "epoch": 0.37454545454545457, + "grad_norm": 0.013638158328831196, + "learning_rate": 9.98111987614974e-05, + "loss": 0.012537346221506596, + "num_input_tokens_seen": 10120368, + "step": 618, + "train_runtime": 5023.5134, + "train_tokens_per_second": 2014.6 + }, + { + "epoch": 0.3751515151515151, + "grad_norm": 0.012727733701467514, + "learning_rate": 9.981036296273106e-05, + "loss": 0.012531593441963196, + "num_input_tokens_seen": 10136744, + "step": 619, + "train_runtime": 5031.6191, + "train_tokens_per_second": 2014.609 + }, + { + "epoch": 0.37575757575757573, + "grad_norm": 0.017707858234643936, + "learning_rate": 9.98095253215764e-05, + "loss": 0.012445853091776371, + "num_input_tokens_seen": 10153120, + "step": 620, + "train_runtime": 5039.7288, + "train_tokens_per_second": 2014.616 + }, + { + "epoch": 0.37636363636363634, + "grad_norm": 0.02095656655728817, + "learning_rate": 9.98086858380644e-05, + "loss": 0.01246220339089632, + "num_input_tokens_seen": 10169496, + "step": 621, + "train_runtime": 5047.8343, + "train_tokens_per_second": 2014.626 + }, + { + "epoch": 0.37696969696969695, + "grad_norm": 0.0194542296230793, + "learning_rate": 9.980784451222612e-05, + "loss": 0.012840205803513527, + "num_input_tokens_seen": 10185872, + "step": 622, + "train_runtime": 5055.9398, + "train_tokens_per_second": 2014.635 + }, + { + "epoch": 0.37757575757575756, + "grad_norm": 0.045034874230623245, + "learning_rate": 9.980700134409266e-05, + "loss": 0.01571492850780487, + "num_input_tokens_seen": 10202248, + "step": 623, + "train_runtime": 5064.0515, + "train_tokens_per_second": 2014.641 + }, + { + "epoch": 0.3781818181818182, + "grad_norm": 0.017045883461833, + "learning_rate": 9.980615633369522e-05, + "loss": 0.013137969188392162, + "num_input_tokens_seen": 10218624, + "step": 624, + "train_runtime": 5072.1723, + "train_tokens_per_second": 2014.644 + }, + { + "epoch": 0.3787878787878788, + "grad_norm": 0.01485395897179842, + "learning_rate": 9.980530948106504e-05, + "loss": 0.01213077548891306, + "num_input_tokens_seen": 10235000, + "step": 625, + "train_runtime": 5080.2996, + "train_tokens_per_second": 2014.645 + }, + { + "epoch": 0.3793939393939394, + "grad_norm": 0.014804039150476456, + "learning_rate": 9.980446078623345e-05, + "loss": 0.012899467721581459, + "num_input_tokens_seen": 10251376, + "step": 626, + "train_runtime": 5088.4306, + "train_tokens_per_second": 2014.644 + }, + { + "epoch": 0.38, + "grad_norm": 0.02651570737361908, + "learning_rate": 9.980361024923185e-05, + "loss": 0.012421991676092148, + "num_input_tokens_seen": 10267752, + "step": 627, + "train_runtime": 5096.5501, + "train_tokens_per_second": 2014.648 + }, + { + "epoch": 0.3806060606060606, + "grad_norm": 0.018621394410729408, + "learning_rate": 9.98027578700917e-05, + "loss": 0.01267517451196909, + "num_input_tokens_seen": 10284128, + "step": 628, + "train_runtime": 5104.6689, + "train_tokens_per_second": 2014.651 + }, + { + "epoch": 0.3812121212121212, + "grad_norm": 0.0398629792034626, + "learning_rate": 9.980190364884452e-05, + "loss": 0.014264339581131935, + "num_input_tokens_seen": 10300504, + "step": 629, + "train_runtime": 5112.783, + "train_tokens_per_second": 2014.657 + }, + { + "epoch": 0.38181818181818183, + "grad_norm": 0.014866935089230537, + "learning_rate": 9.98010475855219e-05, + "loss": 0.01269571203738451, + "num_input_tokens_seen": 10316880, + "step": 630, + "train_runtime": 5120.8995, + "train_tokens_per_second": 2014.662 + }, + { + "epoch": 0.38242424242424244, + "grad_norm": 0.02409232407808304, + "learning_rate": 9.980018968015552e-05, + "loss": 0.01351371593773365, + "num_input_tokens_seen": 10333256, + "step": 631, + "train_runtime": 5129.0287, + "train_tokens_per_second": 2014.661 + }, + { + "epoch": 0.38303030303030305, + "grad_norm": 0.01822233758866787, + "learning_rate": 9.979932993277711e-05, + "loss": 0.011882105842232704, + "num_input_tokens_seen": 10349632, + "step": 632, + "train_runtime": 5137.1531, + "train_tokens_per_second": 2014.663 + }, + { + "epoch": 0.3836363636363636, + "grad_norm": 0.030663253739476204, + "learning_rate": 9.979846834341846e-05, + "loss": 0.014444777742028236, + "num_input_tokens_seen": 10366008, + "step": 633, + "train_runtime": 5145.2769, + "train_tokens_per_second": 2014.665 + }, + { + "epoch": 0.3842424242424242, + "grad_norm": 0.013876891694962978, + "learning_rate": 9.979760491211146e-05, + "loss": 0.012167233973741531, + "num_input_tokens_seen": 10382384, + "step": 634, + "train_runtime": 5153.3942, + "train_tokens_per_second": 2014.669 + }, + { + "epoch": 0.38484848484848483, + "grad_norm": 0.03647688776254654, + "learning_rate": 9.979673963888801e-05, + "loss": 0.013262891210615635, + "num_input_tokens_seen": 10398760, + "step": 635, + "train_runtime": 5161.5119, + "train_tokens_per_second": 2014.673 + }, + { + "epoch": 0.38545454545454544, + "grad_norm": 0.02617211639881134, + "learning_rate": 9.979587252378013e-05, + "loss": 0.014726457186043262, + "num_input_tokens_seen": 10415136, + "step": 636, + "train_runtime": 5169.6294, + "train_tokens_per_second": 2014.678 + }, + { + "epoch": 0.38606060606060605, + "grad_norm": 0.01650061085820198, + "learning_rate": 9.979500356681992e-05, + "loss": 0.014401402324438095, + "num_input_tokens_seen": 10431512, + "step": 637, + "train_runtime": 5177.7469, + "train_tokens_per_second": 2014.682 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 0.017912236973643303, + "learning_rate": 9.979413276803948e-05, + "loss": 0.011410839855670929, + "num_input_tokens_seen": 10447888, + "step": 638, + "train_runtime": 5185.8616, + "train_tokens_per_second": 2014.687 + }, + { + "epoch": 0.38727272727272727, + "grad_norm": 0.02133595198392868, + "learning_rate": 9.979326012747106e-05, + "loss": 0.01264719758182764, + "num_input_tokens_seen": 10464264, + "step": 639, + "train_runtime": 5193.9789, + "train_tokens_per_second": 2014.691 + }, + { + "epoch": 0.3878787878787879, + "grad_norm": 0.011059283278882504, + "learning_rate": 9.97923856451469e-05, + "loss": 0.011714452877640724, + "num_input_tokens_seen": 10480640, + "step": 640, + "train_runtime": 5202.0952, + "train_tokens_per_second": 2014.696 + }, + { + "epoch": 0.3884848484848485, + "grad_norm": 0.01679043099284172, + "learning_rate": 9.979150932109937e-05, + "loss": 0.012356593273580074, + "num_input_tokens_seen": 10497016, + "step": 641, + "train_runtime": 5210.2129, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.3890909090909091, + "grad_norm": 0.017658302560448647, + "learning_rate": 9.979063115536086e-05, + "loss": 0.014303645119071007, + "num_input_tokens_seen": 10513392, + "step": 642, + "train_runtime": 5218.3299, + "train_tokens_per_second": 2014.704 + }, + { + "epoch": 0.3896969696969697, + "grad_norm": 0.037931594997644424, + "learning_rate": 9.978975114796389e-05, + "loss": 0.015233817510306835, + "num_input_tokens_seen": 10529768, + "step": 643, + "train_runtime": 5226.4474, + "train_tokens_per_second": 2014.709 + }, + { + "epoch": 0.3903030303030303, + "grad_norm": 0.024847477674484253, + "learning_rate": 9.978886929894096e-05, + "loss": 0.011363557539880276, + "num_input_tokens_seen": 10546144, + "step": 644, + "train_runtime": 5234.5646, + "train_tokens_per_second": 2014.713 + }, + { + "epoch": 0.39090909090909093, + "grad_norm": 0.025633033365011215, + "learning_rate": 9.978798560832474e-05, + "loss": 0.01591489464044571, + "num_input_tokens_seen": 10562520, + "step": 645, + "train_runtime": 5242.6796, + "train_tokens_per_second": 2014.718 + }, + { + "epoch": 0.39151515151515154, + "grad_norm": 0.01618288829922676, + "learning_rate": 9.978710007614786e-05, + "loss": 0.012586476281285286, + "num_input_tokens_seen": 10578896, + "step": 646, + "train_runtime": 5250.7993, + "train_tokens_per_second": 2014.721 + }, + { + "epoch": 0.39212121212121215, + "grad_norm": 0.02201761119067669, + "learning_rate": 9.978621270244313e-05, + "loss": 0.015117557719349861, + "num_input_tokens_seen": 10595272, + "step": 647, + "train_runtime": 5258.9174, + "train_tokens_per_second": 2014.725 + }, + { + "epoch": 0.3927272727272727, + "grad_norm": 0.0371362566947937, + "learning_rate": 9.978532348724335e-05, + "loss": 0.014719461090862751, + "num_input_tokens_seen": 10611648, + "step": 648, + "train_runtime": 5267.0377, + "train_tokens_per_second": 2014.728 + }, + { + "epoch": 0.3933333333333333, + "grad_norm": 0.02168305590748787, + "learning_rate": 9.978443243058139e-05, + "loss": 0.01353619247674942, + "num_input_tokens_seen": 10628024, + "step": 649, + "train_runtime": 5275.1562, + "train_tokens_per_second": 2014.732 + }, + { + "epoch": 0.3939393939393939, + "grad_norm": 0.019228238612413406, + "learning_rate": 9.978353953249022e-05, + "loss": 0.013856697827577591, + "num_input_tokens_seen": 10644400, + "step": 650, + "train_runtime": 5283.2715, + "train_tokens_per_second": 2014.737 + }, + { + "epoch": 0.39454545454545453, + "grad_norm": 0.027308976277709007, + "learning_rate": 9.978264479300289e-05, + "loss": 0.013041336089372635, + "num_input_tokens_seen": 10660776, + "step": 651, + "train_runtime": 5291.3911, + "train_tokens_per_second": 2014.74 + }, + { + "epoch": 0.39515151515151514, + "grad_norm": 0.016961168497800827, + "learning_rate": 9.978174821215247e-05, + "loss": 0.012095801532268524, + "num_input_tokens_seen": 10677152, + "step": 652, + "train_runtime": 5299.5022, + "train_tokens_per_second": 2014.746 + }, + { + "epoch": 0.39575757575757575, + "grad_norm": 0.030550425872206688, + "learning_rate": 9.978084978997212e-05, + "loss": 0.014912940561771393, + "num_input_tokens_seen": 10693528, + "step": 653, + "train_runtime": 5307.6115, + "train_tokens_per_second": 2014.753 + }, + { + "epoch": 0.39636363636363636, + "grad_norm": 0.035802144557237625, + "learning_rate": 9.977994952649509e-05, + "loss": 0.014338945969939232, + "num_input_tokens_seen": 10709904, + "step": 654, + "train_runtime": 5315.7289, + "train_tokens_per_second": 2014.757 + }, + { + "epoch": 0.396969696969697, + "grad_norm": 0.016549181193113327, + "learning_rate": 9.977904742175466e-05, + "loss": 0.013156197033822536, + "num_input_tokens_seen": 10726280, + "step": 655, + "train_runtime": 5323.8353, + "train_tokens_per_second": 2014.766 + }, + { + "epoch": 0.3975757575757576, + "grad_norm": 0.020908519625663757, + "learning_rate": 9.977814347578421e-05, + "loss": 0.012832121923565865, + "num_input_tokens_seen": 10742656, + "step": 656, + "train_runtime": 5331.9419, + "train_tokens_per_second": 2014.774 + }, + { + "epoch": 0.3981818181818182, + "grad_norm": 0.0449579656124115, + "learning_rate": 9.977723768861718e-05, + "loss": 0.011967733502388, + "num_input_tokens_seen": 10759032, + "step": 657, + "train_runtime": 5340.0518, + "train_tokens_per_second": 2014.78 + }, + { + "epoch": 0.3987878787878788, + "grad_norm": 0.01602446660399437, + "learning_rate": 9.977633006028706e-05, + "loss": 0.012816080823540688, + "num_input_tokens_seen": 10775408, + "step": 658, + "train_runtime": 5348.1597, + "train_tokens_per_second": 2014.788 + }, + { + "epoch": 0.3993939393939394, + "grad_norm": 0.028448155149817467, + "learning_rate": 9.977542059082742e-05, + "loss": 0.014847241342067719, + "num_input_tokens_seen": 10791784, + "step": 659, + "train_runtime": 5356.2671, + "train_tokens_per_second": 2014.796 + }, + { + "epoch": 0.4, + "grad_norm": 0.011783472262322903, + "learning_rate": 9.977450928027191e-05, + "loss": 0.013164190575480461, + "num_input_tokens_seen": 10808160, + "step": 660, + "train_runtime": 5364.3761, + "train_tokens_per_second": 2014.803 + }, + { + "epoch": 0.40060606060606063, + "grad_norm": 0.026984520256519318, + "learning_rate": 9.977359612865423e-05, + "loss": 0.013657883740961552, + "num_input_tokens_seen": 10824536, + "step": 661, + "train_runtime": 5372.4863, + "train_tokens_per_second": 2014.809 + }, + { + "epoch": 0.4012121212121212, + "grad_norm": 0.022077390924096107, + "learning_rate": 9.977268113600817e-05, + "loss": 0.014578605070710182, + "num_input_tokens_seen": 10840912, + "step": 662, + "train_runtime": 5380.5934, + "train_tokens_per_second": 2014.817 + }, + { + "epoch": 0.4018181818181818, + "grad_norm": 0.01575160026550293, + "learning_rate": 9.977176430236755e-05, + "loss": 0.013932663947343826, + "num_input_tokens_seen": 10857288, + "step": 663, + "train_runtime": 5388.7195, + "train_tokens_per_second": 2014.818 + }, + { + "epoch": 0.4024242424242424, + "grad_norm": 0.029406050220131874, + "learning_rate": 9.977084562776631e-05, + "loss": 0.015834983438253403, + "num_input_tokens_seen": 10873664, + "step": 664, + "train_runtime": 5396.8304, + "train_tokens_per_second": 2014.824 + }, + { + "epoch": 0.403030303030303, + "grad_norm": 0.028436392545700073, + "learning_rate": 9.976992511223839e-05, + "loss": 0.014038406312465668, + "num_input_tokens_seen": 10890040, + "step": 665, + "train_runtime": 5404.9444, + "train_tokens_per_second": 2014.829 + }, + { + "epoch": 0.4036363636363636, + "grad_norm": 0.029235292226076126, + "learning_rate": 9.976900275581789e-05, + "loss": 0.015379410237073898, + "num_input_tokens_seen": 10906416, + "step": 666, + "train_runtime": 5413.0528, + "train_tokens_per_second": 2014.836 + }, + { + "epoch": 0.40424242424242424, + "grad_norm": 0.03774306923151016, + "learning_rate": 9.976807855853886e-05, + "loss": 0.014895454980432987, + "num_input_tokens_seen": 10922792, + "step": 667, + "train_runtime": 5421.159, + "train_tokens_per_second": 2014.844 + }, + { + "epoch": 0.40484848484848485, + "grad_norm": 0.01916997693479061, + "learning_rate": 9.976715252043555e-05, + "loss": 0.0143886748701334, + "num_input_tokens_seen": 10939168, + "step": 668, + "train_runtime": 5429.2675, + "train_tokens_per_second": 2014.852 + }, + { + "epoch": 0.40545454545454546, + "grad_norm": 0.021564677357673645, + "learning_rate": 9.976622464154219e-05, + "loss": 0.013210933655500412, + "num_input_tokens_seen": 10955544, + "step": 669, + "train_runtime": 5437.3809, + "train_tokens_per_second": 2014.857 + }, + { + "epoch": 0.40606060606060607, + "grad_norm": 0.02249998040497303, + "learning_rate": 9.976529492189309e-05, + "loss": 0.013446344994008541, + "num_input_tokens_seen": 10971920, + "step": 670, + "train_runtime": 5445.4997, + "train_tokens_per_second": 2014.86 + }, + { + "epoch": 0.4066666666666667, + "grad_norm": 0.03089592047035694, + "learning_rate": 9.976436336152265e-05, + "loss": 0.014300989918410778, + "num_input_tokens_seen": 10988296, + "step": 671, + "train_runtime": 5453.615, + "train_tokens_per_second": 2014.865 + }, + { + "epoch": 0.4072727272727273, + "grad_norm": 0.01742340438067913, + "learning_rate": 9.976342996046532e-05, + "loss": 0.012858121655881405, + "num_input_tokens_seen": 11004672, + "step": 672, + "train_runtime": 5461.7321, + "train_tokens_per_second": 2014.869 + }, + { + "epoch": 0.4078787878787879, + "grad_norm": 0.0165674090385437, + "learning_rate": 9.976249471875561e-05, + "loss": 0.013976114802062511, + "num_input_tokens_seen": 11021048, + "step": 673, + "train_runtime": 5469.8479, + "train_tokens_per_second": 2014.873 + }, + { + "epoch": 0.4084848484848485, + "grad_norm": 0.013970437459647655, + "learning_rate": 9.976155763642813e-05, + "loss": 0.013127206824719906, + "num_input_tokens_seen": 11037424, + "step": 674, + "train_runtime": 5477.9644, + "train_tokens_per_second": 2014.877 + }, + { + "epoch": 0.4090909090909091, + "grad_norm": 0.028073744848370552, + "learning_rate": 9.976061871351756e-05, + "loss": 0.013469989411532879, + "num_input_tokens_seen": 11053800, + "step": 675, + "train_runtime": 5486.0804, + "train_tokens_per_second": 2014.881 + }, + { + "epoch": 0.40969696969696967, + "grad_norm": 0.02016565017402172, + "learning_rate": 9.975967795005859e-05, + "loss": 0.013997921720147133, + "num_input_tokens_seen": 11070176, + "step": 676, + "train_runtime": 5494.197, + "train_tokens_per_second": 2014.885 + }, + { + "epoch": 0.4103030303030303, + "grad_norm": 0.01767519861459732, + "learning_rate": 9.975873534608604e-05, + "loss": 0.013824408873915672, + "num_input_tokens_seen": 11086552, + "step": 677, + "train_runtime": 5502.3132, + "train_tokens_per_second": 2014.889 + }, + { + "epoch": 0.4109090909090909, + "grad_norm": 0.02294917404651642, + "learning_rate": 9.975779090163478e-05, + "loss": 0.013364237733185291, + "num_input_tokens_seen": 11102928, + "step": 678, + "train_runtime": 5510.4298, + "train_tokens_per_second": 2014.893 + }, + { + "epoch": 0.4115151515151515, + "grad_norm": 0.015453618951141834, + "learning_rate": 9.975684461673972e-05, + "loss": 0.011895030736923218, + "num_input_tokens_seen": 11119304, + "step": 679, + "train_runtime": 5518.5467, + "train_tokens_per_second": 2014.897 + }, + { + "epoch": 0.4121212121212121, + "grad_norm": 0.02744610421359539, + "learning_rate": 9.975589649143588e-05, + "loss": 0.01399244274944067, + "num_input_tokens_seen": 11135680, + "step": 680, + "train_runtime": 5526.6634, + "train_tokens_per_second": 2014.901 + }, + { + "epoch": 0.4127272727272727, + "grad_norm": 0.0141525249928236, + "learning_rate": 9.975494652575832e-05, + "loss": 0.012226445600390434, + "num_input_tokens_seen": 11152056, + "step": 681, + "train_runtime": 5534.7831, + "train_tokens_per_second": 2014.904 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 0.05674010142683983, + "learning_rate": 9.975399471974218e-05, + "loss": 0.013092868961393833, + "num_input_tokens_seen": 11168432, + "step": 682, + "train_runtime": 5542.8995, + "train_tokens_per_second": 2014.908 + }, + { + "epoch": 0.41393939393939394, + "grad_norm": 0.014718937687575817, + "learning_rate": 9.975304107342268e-05, + "loss": 0.012982090935111046, + "num_input_tokens_seen": 11184808, + "step": 683, + "train_runtime": 5551.0179, + "train_tokens_per_second": 2014.911 + }, + { + "epoch": 0.41454545454545455, + "grad_norm": 0.017596984282135963, + "learning_rate": 9.975208558683508e-05, + "loss": 0.013058310374617577, + "num_input_tokens_seen": 11201184, + "step": 684, + "train_runtime": 5559.1335, + "train_tokens_per_second": 2014.915 + }, + { + "epoch": 0.41515151515151516, + "grad_norm": 0.05556584894657135, + "learning_rate": 9.975112826001471e-05, + "loss": 0.013223481364548206, + "num_input_tokens_seen": 11217560, + "step": 685, + "train_runtime": 5567.2483, + "train_tokens_per_second": 2014.92 + }, + { + "epoch": 0.41575757575757577, + "grad_norm": 0.039875905960798264, + "learning_rate": 9.9750169092997e-05, + "loss": 0.016192132607102394, + "num_input_tokens_seen": 11233936, + "step": 686, + "train_runtime": 5575.3629, + "train_tokens_per_second": 2014.925 + }, + { + "epoch": 0.4163636363636364, + "grad_norm": 0.04174409061670303, + "learning_rate": 9.97492080858174e-05, + "loss": 0.013733956962823868, + "num_input_tokens_seen": 11250312, + "step": 687, + "train_runtime": 5583.4823, + "train_tokens_per_second": 2014.928 + }, + { + "epoch": 0.416969696969697, + "grad_norm": 0.018462834879755974, + "learning_rate": 9.97482452385115e-05, + "loss": 0.011940184980630875, + "num_input_tokens_seen": 11266688, + "step": 688, + "train_runtime": 5591.5984, + "train_tokens_per_second": 2014.932 + }, + { + "epoch": 0.4175757575757576, + "grad_norm": 0.021226534619927406, + "learning_rate": 9.974728055111487e-05, + "loss": 0.013460342772305012, + "num_input_tokens_seen": 11283064, + "step": 689, + "train_runtime": 5599.7136, + "train_tokens_per_second": 2014.936 + }, + { + "epoch": 0.41818181818181815, + "grad_norm": 0.017722534015774727, + "learning_rate": 9.974631402366322e-05, + "loss": 0.013120359741151333, + "num_input_tokens_seen": 11299440, + "step": 690, + "train_runtime": 5607.8302, + "train_tokens_per_second": 2014.94 + }, + { + "epoch": 0.41878787878787876, + "grad_norm": 0.04932510480284691, + "learning_rate": 9.97453456561923e-05, + "loss": 0.014747078530490398, + "num_input_tokens_seen": 11315816, + "step": 691, + "train_runtime": 5615.9471, + "train_tokens_per_second": 2014.943 + }, + { + "epoch": 0.4193939393939394, + "grad_norm": 0.014801602810621262, + "learning_rate": 9.974437544873791e-05, + "loss": 0.012634863145649433, + "num_input_tokens_seen": 11332192, + "step": 692, + "train_runtime": 5624.0643, + "train_tokens_per_second": 2014.947 + }, + { + "epoch": 0.42, + "grad_norm": 0.01846308819949627, + "learning_rate": 9.974340340133595e-05, + "loss": 0.013980153016746044, + "num_input_tokens_seen": 11348568, + "step": 693, + "train_runtime": 5632.1816, + "train_tokens_per_second": 2014.951 + }, + { + "epoch": 0.4206060606060606, + "grad_norm": 0.022268032655119896, + "learning_rate": 9.974242951402235e-05, + "loss": 0.013369940221309662, + "num_input_tokens_seen": 11364944, + "step": 694, + "train_runtime": 5640.2993, + "train_tokens_per_second": 2014.954 + }, + { + "epoch": 0.4212121212121212, + "grad_norm": 0.017928361892700195, + "learning_rate": 9.974145378683318e-05, + "loss": 0.012236877344548702, + "num_input_tokens_seen": 11381320, + "step": 695, + "train_runtime": 5648.4187, + "train_tokens_per_second": 2014.957 + }, + { + "epoch": 0.4218181818181818, + "grad_norm": 0.026991484686732292, + "learning_rate": 9.974047621980447e-05, + "loss": 0.013161352835595608, + "num_input_tokens_seen": 11397696, + "step": 696, + "train_runtime": 5656.5432, + "train_tokens_per_second": 2014.958 + }, + { + "epoch": 0.4224242424242424, + "grad_norm": 0.016671424731612206, + "learning_rate": 9.973949681297244e-05, + "loss": 0.013532438315451145, + "num_input_tokens_seen": 11414072, + "step": 697, + "train_runtime": 5664.6671, + "train_tokens_per_second": 2014.959 + }, + { + "epoch": 0.42303030303030303, + "grad_norm": 0.04440519958734512, + "learning_rate": 9.973851556637326e-05, + "loss": 0.014023078605532646, + "num_input_tokens_seen": 11430448, + "step": 698, + "train_runtime": 5672.7922, + "train_tokens_per_second": 2014.96 + }, + { + "epoch": 0.42363636363636364, + "grad_norm": 0.01818687841296196, + "learning_rate": 9.973753248004326e-05, + "loss": 0.012776060961186886, + "num_input_tokens_seen": 11446824, + "step": 699, + "train_runtime": 5680.9115, + "train_tokens_per_second": 2014.963 + }, + { + "epoch": 0.42424242424242425, + "grad_norm": 0.03709911182522774, + "learning_rate": 9.97365475540188e-05, + "loss": 0.013938689604401588, + "num_input_tokens_seen": 11463200, + "step": 700, + "train_runtime": 5689.0323, + "train_tokens_per_second": 2014.965 + }, + { + "epoch": 0.42484848484848486, + "grad_norm": 0.02871977910399437, + "learning_rate": 9.97355607883363e-05, + "loss": 0.015867041423916817, + "num_input_tokens_seen": 11479576, + "step": 701, + "train_runtime": 5698.2647, + "train_tokens_per_second": 2014.574 + }, + { + "epoch": 0.4254545454545455, + "grad_norm": 0.023145193234086037, + "learning_rate": 9.973457218303226e-05, + "loss": 0.01401555072516203, + "num_input_tokens_seen": 11495952, + "step": 702, + "train_runtime": 5706.3816, + "train_tokens_per_second": 2014.578 + }, + { + "epoch": 0.4260606060606061, + "grad_norm": 0.015238692052662373, + "learning_rate": 9.973358173814324e-05, + "loss": 0.01140027865767479, + "num_input_tokens_seen": 11512328, + "step": 703, + "train_runtime": 5714.5032, + "train_tokens_per_second": 2014.581 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 0.017513658851385117, + "learning_rate": 9.97325894537059e-05, + "loss": 0.01290590688586235, + "num_input_tokens_seen": 11528704, + "step": 704, + "train_runtime": 5722.6292, + "train_tokens_per_second": 2014.582 + }, + { + "epoch": 0.42727272727272725, + "grad_norm": 0.02398119866847992, + "learning_rate": 9.973159532975691e-05, + "loss": 0.013042651116847992, + "num_input_tokens_seen": 11545080, + "step": 705, + "train_runtime": 5730.753, + "train_tokens_per_second": 2014.583 + }, + { + "epoch": 0.42787878787878786, + "grad_norm": 0.01669715717434883, + "learning_rate": 9.973059936633306e-05, + "loss": 0.011862633749842644, + "num_input_tokens_seen": 11561456, + "step": 706, + "train_runtime": 5738.8701, + "train_tokens_per_second": 2014.588 + }, + { + "epoch": 0.42848484848484847, + "grad_norm": 0.0743919089436531, + "learning_rate": 9.97296015634712e-05, + "loss": 0.012939982116222382, + "num_input_tokens_seen": 11577832, + "step": 707, + "train_runtime": 5746.9879, + "train_tokens_per_second": 2014.591 + }, + { + "epoch": 0.4290909090909091, + "grad_norm": 0.014302635565400124, + "learning_rate": 9.972860192120821e-05, + "loss": 0.01308290846645832, + "num_input_tokens_seen": 11594208, + "step": 708, + "train_runtime": 5755.1051, + "train_tokens_per_second": 2014.595 + }, + { + "epoch": 0.4296969696969697, + "grad_norm": 0.03461941331624985, + "learning_rate": 9.972760043958109e-05, + "loss": 0.01451612077653408, + "num_input_tokens_seen": 11610584, + "step": 709, + "train_runtime": 5763.2288, + "train_tokens_per_second": 2014.597 + }, + { + "epoch": 0.4303030303030303, + "grad_norm": 0.026271218433976173, + "learning_rate": 9.972659711862687e-05, + "loss": 0.012233047746121883, + "num_input_tokens_seen": 11626960, + "step": 710, + "train_runtime": 5771.3444, + "train_tokens_per_second": 2014.602 + }, + { + "epoch": 0.4309090909090909, + "grad_norm": 0.03146032616496086, + "learning_rate": 9.972559195838263e-05, + "loss": 0.012203723192214966, + "num_input_tokens_seen": 11643336, + "step": 711, + "train_runtime": 5779.4615, + "train_tokens_per_second": 2014.606 + }, + { + "epoch": 0.4315151515151515, + "grad_norm": 0.023236479610204697, + "learning_rate": 9.97245849588856e-05, + "loss": 0.014339424669742584, + "num_input_tokens_seen": 11659712, + "step": 712, + "train_runtime": 5787.5789, + "train_tokens_per_second": 2014.61 + }, + { + "epoch": 0.43212121212121213, + "grad_norm": 0.016745924949645996, + "learning_rate": 9.972357612017302e-05, + "loss": 0.012629512697458267, + "num_input_tokens_seen": 11676088, + "step": 713, + "train_runtime": 5795.6981, + "train_tokens_per_second": 2014.613 + }, + { + "epoch": 0.43272727272727274, + "grad_norm": 0.028602320700883865, + "learning_rate": 9.972256544228217e-05, + "loss": 0.01239441242069006, + "num_input_tokens_seen": 11692464, + "step": 714, + "train_runtime": 5803.8136, + "train_tokens_per_second": 2014.617 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 0.04347382113337517, + "learning_rate": 9.972155292525046e-05, + "loss": 0.013399597257375717, + "num_input_tokens_seen": 11708840, + "step": 715, + "train_runtime": 5811.9326, + "train_tokens_per_second": 2014.621 + }, + { + "epoch": 0.43393939393939396, + "grad_norm": 0.027413364499807358, + "learning_rate": 9.972053856911534e-05, + "loss": 0.014752673916518688, + "num_input_tokens_seen": 11725216, + "step": 716, + "train_runtime": 5820.0498, + "train_tokens_per_second": 2014.625 + }, + { + "epoch": 0.43454545454545457, + "grad_norm": 0.034208860248327255, + "learning_rate": 9.971952237391433e-05, + "loss": 0.013670345768332481, + "num_input_tokens_seen": 11741592, + "step": 717, + "train_runtime": 5828.1669, + "train_tokens_per_second": 2014.629 + }, + { + "epoch": 0.4351515151515152, + "grad_norm": 0.08834357559680939, + "learning_rate": 9.971850433968499e-05, + "loss": 0.01636839471757412, + "num_input_tokens_seen": 11757968, + "step": 718, + "train_runtime": 5836.2889, + "train_tokens_per_second": 2014.631 + }, + { + "epoch": 0.43575757575757573, + "grad_norm": 0.09180225431919098, + "learning_rate": 9.971748446646503e-05, + "loss": 0.013547438196837902, + "num_input_tokens_seen": 11774344, + "step": 719, + "train_runtime": 5844.4057, + "train_tokens_per_second": 2014.635 + }, + { + "epoch": 0.43636363636363634, + "grad_norm": 0.021431786939501762, + "learning_rate": 9.971646275429211e-05, + "loss": 0.014424419030547142, + "num_input_tokens_seen": 11790720, + "step": 720, + "train_runtime": 5852.5291, + "train_tokens_per_second": 2014.637 + }, + { + "epoch": 0.43696969696969695, + "grad_norm": 0.014504344202578068, + "learning_rate": 9.971543920320407e-05, + "loss": 0.012794758193194866, + "num_input_tokens_seen": 11807096, + "step": 721, + "train_runtime": 5860.6452, + "train_tokens_per_second": 2014.641 + }, + { + "epoch": 0.43757575757575756, + "grad_norm": 0.04303886368870735, + "learning_rate": 9.971441381323874e-05, + "loss": 0.014037848450243473, + "num_input_tokens_seen": 11823472, + "step": 722, + "train_runtime": 5868.7615, + "train_tokens_per_second": 2014.645 + }, + { + "epoch": 0.4381818181818182, + "grad_norm": 0.028946641832590103, + "learning_rate": 9.971338658443406e-05, + "loss": 0.012954017147421837, + "num_input_tokens_seen": 11839848, + "step": 723, + "train_runtime": 5876.878, + "train_tokens_per_second": 2014.649 + }, + { + "epoch": 0.4387878787878788, + "grad_norm": 0.02165861800312996, + "learning_rate": 9.971235751682802e-05, + "loss": 0.012219181284308434, + "num_input_tokens_seen": 11856224, + "step": 724, + "train_runtime": 5884.9934, + "train_tokens_per_second": 2014.654 + }, + { + "epoch": 0.4393939393939394, + "grad_norm": 0.023574933409690857, + "learning_rate": 9.971132661045868e-05, + "loss": 0.014860106632113457, + "num_input_tokens_seen": 11872600, + "step": 725, + "train_runtime": 5893.1105, + "train_tokens_per_second": 2014.658 + }, + { + "epoch": 0.44, + "grad_norm": 0.05360223352909088, + "learning_rate": 9.971029386536419e-05, + "loss": 0.014855952933430672, + "num_input_tokens_seen": 11888976, + "step": 726, + "train_runtime": 5901.2285, + "train_tokens_per_second": 2014.661 + }, + { + "epoch": 0.4406060606060606, + "grad_norm": 0.03671532869338989, + "learning_rate": 9.970925928158274e-05, + "loss": 0.015136584639549255, + "num_input_tokens_seen": 11905352, + "step": 727, + "train_runtime": 5909.3465, + "train_tokens_per_second": 2014.665 + }, + { + "epoch": 0.4412121212121212, + "grad_norm": 0.012548093684017658, + "learning_rate": 9.970822285915257e-05, + "loss": 0.012122916989028454, + "num_input_tokens_seen": 11921728, + "step": 728, + "train_runtime": 5917.4638, + "train_tokens_per_second": 2014.669 + }, + { + "epoch": 0.44181818181818183, + "grad_norm": 0.02257922850549221, + "learning_rate": 9.970718459811206e-05, + "loss": 0.013802756555378437, + "num_input_tokens_seen": 11938104, + "step": 729, + "train_runtime": 5925.5783, + "train_tokens_per_second": 2014.673 + }, + { + "epoch": 0.44242424242424244, + "grad_norm": 0.014075133018195629, + "learning_rate": 9.97061444984996e-05, + "loss": 0.012838860973715782, + "num_input_tokens_seen": 11954480, + "step": 730, + "train_runtime": 5933.6937, + "train_tokens_per_second": 2014.678 + }, + { + "epoch": 0.44303030303030305, + "grad_norm": 0.022020021453499794, + "learning_rate": 9.970510256035364e-05, + "loss": 0.01375649869441986, + "num_input_tokens_seen": 11970856, + "step": 731, + "train_runtime": 5941.8106, + "train_tokens_per_second": 2014.682 + }, + { + "epoch": 0.44363636363636366, + "grad_norm": 0.01787860319018364, + "learning_rate": 9.970405878371273e-05, + "loss": 0.012008238583803177, + "num_input_tokens_seen": 11987232, + "step": 732, + "train_runtime": 5949.9292, + "train_tokens_per_second": 2014.685 + }, + { + "epoch": 0.4442424242424242, + "grad_norm": 0.019049983471632004, + "learning_rate": 9.970301316861548e-05, + "loss": 0.012502388097345829, + "num_input_tokens_seen": 12003608, + "step": 733, + "train_runtime": 5958.0503, + "train_tokens_per_second": 2014.687 + }, + { + "epoch": 0.4448484848484848, + "grad_norm": 0.02835710346698761, + "learning_rate": 9.970196571510057e-05, + "loss": 0.012223845347762108, + "num_input_tokens_seen": 12019984, + "step": 734, + "train_runtime": 5966.1707, + "train_tokens_per_second": 2014.69 + }, + { + "epoch": 0.44545454545454544, + "grad_norm": 0.04534858092665672, + "learning_rate": 9.970091642320674e-05, + "loss": 0.01531003974378109, + "num_input_tokens_seen": 12036360, + "step": 735, + "train_runtime": 5974.2918, + "train_tokens_per_second": 2014.692 + }, + { + "epoch": 0.44606060606060605, + "grad_norm": 0.02770829014480114, + "learning_rate": 9.96998652929728e-05, + "loss": 0.014202866703271866, + "num_input_tokens_seen": 12052736, + "step": 736, + "train_runtime": 5982.4163, + "train_tokens_per_second": 2014.694 + }, + { + "epoch": 0.44666666666666666, + "grad_norm": 0.01627975143492222, + "learning_rate": 9.969881232443761e-05, + "loss": 0.013593195006251335, + "num_input_tokens_seen": 12069112, + "step": 737, + "train_runtime": 5990.5422, + "train_tokens_per_second": 2014.694 + }, + { + "epoch": 0.44727272727272727, + "grad_norm": 0.02013089507818222, + "learning_rate": 9.969775751764015e-05, + "loss": 0.012935129925608635, + "num_input_tokens_seen": 12085488, + "step": 738, + "train_runtime": 5998.6638, + "train_tokens_per_second": 2014.697 + }, + { + "epoch": 0.4478787878787879, + "grad_norm": 0.03128223493695259, + "learning_rate": 9.969670087261942e-05, + "loss": 0.014752635732293129, + "num_input_tokens_seen": 12101864, + "step": 739, + "train_runtime": 6006.7832, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.4484848484848485, + "grad_norm": 0.08356563001871109, + "learning_rate": 9.969564238941452e-05, + "loss": 0.012013277038931847, + "num_input_tokens_seen": 12118240, + "step": 740, + "train_runtime": 6014.9037, + "train_tokens_per_second": 2014.702 + }, + { + "epoch": 0.4490909090909091, + "grad_norm": 0.04240264743566513, + "learning_rate": 9.969458206806456e-05, + "loss": 0.013846787624061108, + "num_input_tokens_seen": 12134616, + "step": 741, + "train_runtime": 6023.0287, + "train_tokens_per_second": 2014.703 + }, + { + "epoch": 0.4496969696969697, + "grad_norm": 0.020833732560276985, + "learning_rate": 9.96935199086088e-05, + "loss": 0.014301668852567673, + "num_input_tokens_seen": 12150992, + "step": 742, + "train_runtime": 6031.1472, + "train_tokens_per_second": 2014.707 + }, + { + "epoch": 0.4503030303030303, + "grad_norm": 0.021045729517936707, + "learning_rate": 9.969245591108652e-05, + "loss": 0.013184930197894573, + "num_input_tokens_seen": 12167368, + "step": 743, + "train_runtime": 6039.2669, + "train_tokens_per_second": 2014.709 + }, + { + "epoch": 0.4509090909090909, + "grad_norm": 0.014139235951006413, + "learning_rate": 9.969139007553705e-05, + "loss": 0.013327041640877724, + "num_input_tokens_seen": 12183744, + "step": 744, + "train_runtime": 6047.3846, + "train_tokens_per_second": 2014.713 + }, + { + "epoch": 0.45151515151515154, + "grad_norm": 0.7923178672790527, + "learning_rate": 9.969032240199983e-05, + "loss": 0.012914719060063362, + "num_input_tokens_seen": 12200120, + "step": 745, + "train_runtime": 6055.5018, + "train_tokens_per_second": 2014.717 + }, + { + "epoch": 0.45212121212121215, + "grad_norm": 0.033203721046447754, + "learning_rate": 9.968925289051436e-05, + "loss": 0.013039352372288704, + "num_input_tokens_seen": 12216496, + "step": 746, + "train_runtime": 6063.6194, + "train_tokens_per_second": 2014.72 + }, + { + "epoch": 0.4527272727272727, + "grad_norm": 0.02019328624010086, + "learning_rate": 9.96881815411202e-05, + "loss": 0.012438374571502209, + "num_input_tokens_seen": 12232872, + "step": 747, + "train_runtime": 6071.7363, + "train_tokens_per_second": 2014.724 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 0.03482190519571304, + "learning_rate": 9.968710835385696e-05, + "loss": 0.015620945952832699, + "num_input_tokens_seen": 12249248, + "step": 748, + "train_runtime": 6079.8541, + "train_tokens_per_second": 2014.727 + }, + { + "epoch": 0.4539393939393939, + "grad_norm": 0.053270891308784485, + "learning_rate": 9.968603332876434e-05, + "loss": 0.012819363735616207, + "num_input_tokens_seen": 12265624, + "step": 749, + "train_runtime": 6087.9704, + "train_tokens_per_second": 2014.731 + }, + { + "epoch": 0.45454545454545453, + "grad_norm": 0.013719640672206879, + "learning_rate": 9.968495646588211e-05, + "loss": 0.013314586132764816, + "num_input_tokens_seen": 12282000, + "step": 750, + "train_runtime": 6096.0915, + "train_tokens_per_second": 2014.734 + }, + { + "epoch": 0.45515151515151514, + "grad_norm": 0.020413396880030632, + "learning_rate": 9.96838777652501e-05, + "loss": 0.012559941038489342, + "num_input_tokens_seen": 12298376, + "step": 751, + "train_runtime": 6104.2072, + "train_tokens_per_second": 2014.738 + }, + { + "epoch": 0.45575757575757575, + "grad_norm": 0.02567451260983944, + "learning_rate": 9.968279722690819e-05, + "loss": 0.013514967635273933, + "num_input_tokens_seen": 12314752, + "step": 752, + "train_runtime": 6112.3314, + "train_tokens_per_second": 2014.739 + }, + { + "epoch": 0.45636363636363636, + "grad_norm": 0.015409312210977077, + "learning_rate": 9.968171485089638e-05, + "loss": 0.012808658182621002, + "num_input_tokens_seen": 12331128, + "step": 753, + "train_runtime": 6120.4491, + "train_tokens_per_second": 2014.742 + }, + { + "epoch": 0.45696969696969697, + "grad_norm": 0.02095264568924904, + "learning_rate": 9.968063063725468e-05, + "loss": 0.014174265787005424, + "num_input_tokens_seen": 12347504, + "step": 754, + "train_runtime": 6128.5679, + "train_tokens_per_second": 2014.745 + }, + { + "epoch": 0.4575757575757576, + "grad_norm": 0.020611796528100967, + "learning_rate": 9.96795445860232e-05, + "loss": 0.011881090700626373, + "num_input_tokens_seen": 12363880, + "step": 755, + "train_runtime": 6136.6868, + "train_tokens_per_second": 2014.748 + }, + { + "epoch": 0.4581818181818182, + "grad_norm": 0.018243003636598587, + "learning_rate": 9.967845669724212e-05, + "loss": 0.012596143409609795, + "num_input_tokens_seen": 12380256, + "step": 756, + "train_runtime": 6144.8042, + "train_tokens_per_second": 2014.752 + }, + { + "epoch": 0.4587878787878788, + "grad_norm": 0.016125964000821114, + "learning_rate": 9.967736697095167e-05, + "loss": 0.013951683416962624, + "num_input_tokens_seen": 12396632, + "step": 757, + "train_runtime": 6152.9288, + "train_tokens_per_second": 2014.753 + }, + { + "epoch": 0.4593939393939394, + "grad_norm": 0.019307058304548264, + "learning_rate": 9.967627540719215e-05, + "loss": 0.013310304842889309, + "num_input_tokens_seen": 12413008, + "step": 758, + "train_runtime": 6161.047, + "train_tokens_per_second": 2014.756 + }, + { + "epoch": 0.46, + "grad_norm": 0.0198148675262928, + "learning_rate": 9.967518200600396e-05, + "loss": 0.013110843487083912, + "num_input_tokens_seen": 12429384, + "step": 759, + "train_runtime": 6169.1657, + "train_tokens_per_second": 2014.759 + }, + { + "epoch": 0.46060606060606063, + "grad_norm": 0.02929919771850109, + "learning_rate": 9.967408676742751e-05, + "loss": 0.015073966234922409, + "num_input_tokens_seen": 12445760, + "step": 760, + "train_runtime": 6177.2831, + "train_tokens_per_second": 2014.763 + }, + { + "epoch": 0.4612121212121212, + "grad_norm": 0.015382593497633934, + "learning_rate": 9.967298969150334e-05, + "loss": 0.012051237747073174, + "num_input_tokens_seen": 12462136, + "step": 761, + "train_runtime": 6185.4001, + "train_tokens_per_second": 2014.766 + }, + { + "epoch": 0.4618181818181818, + "grad_norm": 0.02371540106832981, + "learning_rate": 9.9671890778272e-05, + "loss": 0.015372917987406254, + "num_input_tokens_seen": 12478512, + "step": 762, + "train_runtime": 6193.5166, + "train_tokens_per_second": 2014.77 + }, + { + "epoch": 0.4624242424242424, + "grad_norm": 0.02178136259317398, + "learning_rate": 9.967079002777417e-05, + "loss": 0.013376548886299133, + "num_input_tokens_seen": 12494888, + "step": 763, + "train_runtime": 6201.6342, + "train_tokens_per_second": 2014.773 + }, + { + "epoch": 0.463030303030303, + "grad_norm": 0.01065842155367136, + "learning_rate": 9.966968744005052e-05, + "loss": 0.012219875119626522, + "num_input_tokens_seen": 12511264, + "step": 764, + "train_runtime": 6209.7525, + "train_tokens_per_second": 2014.777 + }, + { + "epoch": 0.4636363636363636, + "grad_norm": 0.013287489302456379, + "learning_rate": 9.966858301514188e-05, + "loss": 0.011538016609847546, + "num_input_tokens_seen": 12527640, + "step": 765, + "train_runtime": 6217.8691, + "train_tokens_per_second": 2014.78 + }, + { + "epoch": 0.46424242424242423, + "grad_norm": 0.013882887549698353, + "learning_rate": 9.966747675308907e-05, + "loss": 0.012349468655884266, + "num_input_tokens_seen": 12544016, + "step": 766, + "train_runtime": 6225.9864, + "train_tokens_per_second": 2014.784 + }, + { + "epoch": 0.46484848484848484, + "grad_norm": 0.018599022179841995, + "learning_rate": 9.966636865393301e-05, + "loss": 0.012744025327265263, + "num_input_tokens_seen": 12560392, + "step": 767, + "train_runtime": 6234.1026, + "train_tokens_per_second": 2014.787 + }, + { + "epoch": 0.46545454545454545, + "grad_norm": 0.012023529969155788, + "learning_rate": 9.966525871771472e-05, + "loss": 0.01199167687445879, + "num_input_tokens_seen": 12576768, + "step": 768, + "train_runtime": 6242.2199, + "train_tokens_per_second": 2014.791 + }, + { + "epoch": 0.46606060606060606, + "grad_norm": 0.01650414615869522, + "learning_rate": 9.966414694447521e-05, + "loss": 0.012927195057272911, + "num_input_tokens_seen": 12593144, + "step": 769, + "train_runtime": 6250.3375, + "train_tokens_per_second": 2014.794 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 0.034085940569639206, + "learning_rate": 9.966303333425563e-05, + "loss": 0.012202315032482147, + "num_input_tokens_seen": 12609520, + "step": 770, + "train_runtime": 6258.4532, + "train_tokens_per_second": 2014.798 + }, + { + "epoch": 0.4672727272727273, + "grad_norm": 0.013827620074152946, + "learning_rate": 9.966191788709716e-05, + "loss": 0.013147883117198944, + "num_input_tokens_seen": 12625896, + "step": 771, + "train_runtime": 6266.5701, + "train_tokens_per_second": 2014.802 + }, + { + "epoch": 0.4678787878787879, + "grad_norm": 0.0181913860142231, + "learning_rate": 9.966080060304105e-05, + "loss": 0.013427773490548134, + "num_input_tokens_seen": 12642272, + "step": 772, + "train_runtime": 6274.6886, + "train_tokens_per_second": 2014.805 + }, + { + "epoch": 0.4684848484848485, + "grad_norm": 0.07882755249738693, + "learning_rate": 9.965968148212864e-05, + "loss": 0.017075341194868088, + "num_input_tokens_seen": 12658648, + "step": 773, + "train_runtime": 6282.8062, + "train_tokens_per_second": 2014.808 + }, + { + "epoch": 0.4690909090909091, + "grad_norm": 0.007325070444494486, + "learning_rate": 9.965856052440132e-05, + "loss": 0.011197097599506378, + "num_input_tokens_seen": 12675024, + "step": 774, + "train_runtime": 6290.929, + "train_tokens_per_second": 2014.81 + }, + { + "epoch": 0.4696969696969697, + "grad_norm": 0.030580898746848106, + "learning_rate": 9.965743772990054e-05, + "loss": 0.012808885425329208, + "num_input_tokens_seen": 12691400, + "step": 775, + "train_runtime": 6299.0468, + "train_tokens_per_second": 2014.813 + }, + { + "epoch": 0.4703030303030303, + "grad_norm": 0.027805298566818237, + "learning_rate": 9.965631309866788e-05, + "loss": 0.012805595062673092, + "num_input_tokens_seen": 12707776, + "step": 776, + "train_runtime": 6307.1647, + "train_tokens_per_second": 2014.816 + }, + { + "epoch": 0.4709090909090909, + "grad_norm": 0.01449024397879839, + "learning_rate": 9.965518663074487e-05, + "loss": 0.013110213913023472, + "num_input_tokens_seen": 12724152, + "step": 777, + "train_runtime": 6315.2824, + "train_tokens_per_second": 2014.819 + }, + { + "epoch": 0.4715151515151515, + "grad_norm": 0.013304144144058228, + "learning_rate": 9.96540583261732e-05, + "loss": 0.012666239403188229, + "num_input_tokens_seen": 12740528, + "step": 778, + "train_runtime": 6323.3995, + "train_tokens_per_second": 2014.823 + }, + { + "epoch": 0.4721212121212121, + "grad_norm": 0.01922908052802086, + "learning_rate": 9.965292818499463e-05, + "loss": 0.012315730564296246, + "num_input_tokens_seen": 12756904, + "step": 779, + "train_runtime": 6331.5179, + "train_tokens_per_second": 2014.826 + }, + { + "epoch": 0.4727272727272727, + "grad_norm": 0.042174387723207474, + "learning_rate": 9.965179620725093e-05, + "loss": 0.015461819246411324, + "num_input_tokens_seen": 12773280, + "step": 780, + "train_runtime": 6339.636, + "train_tokens_per_second": 2014.829 + }, + { + "epoch": 0.47333333333333333, + "grad_norm": 0.02851157635450363, + "learning_rate": 9.965066239298398e-05, + "loss": 0.012629134580492973, + "num_input_tokens_seen": 12789656, + "step": 781, + "train_runtime": 6347.7537, + "train_tokens_per_second": 2014.832 + }, + { + "epoch": 0.47393939393939394, + "grad_norm": 0.10219256579875946, + "learning_rate": 9.96495267422357e-05, + "loss": 0.014288711361587048, + "num_input_tokens_seen": 12806032, + "step": 782, + "train_runtime": 6355.8718, + "train_tokens_per_second": 2014.835 + }, + { + "epoch": 0.47454545454545455, + "grad_norm": 0.012413585558533669, + "learning_rate": 9.964838925504816e-05, + "loss": 0.012026645243167877, + "num_input_tokens_seen": 12822408, + "step": 783, + "train_runtime": 6363.9912, + "train_tokens_per_second": 2014.837 + }, + { + "epoch": 0.47515151515151516, + "grad_norm": 0.019600611180067062, + "learning_rate": 9.964724993146335e-05, + "loss": 0.012678924947977066, + "num_input_tokens_seen": 12838784, + "step": 784, + "train_runtime": 6372.1105, + "train_tokens_per_second": 2014.84 + }, + { + "epoch": 0.47575757575757577, + "grad_norm": 0.021761193871498108, + "learning_rate": 9.964610877152346e-05, + "loss": 0.012011994607746601, + "num_input_tokens_seen": 12855160, + "step": 785, + "train_runtime": 6380.2296, + "train_tokens_per_second": 2014.843 + }, + { + "epoch": 0.4763636363636364, + "grad_norm": 0.016564620658755302, + "learning_rate": 9.964496577527069e-05, + "loss": 0.01261131465435028, + "num_input_tokens_seen": 12871536, + "step": 786, + "train_runtime": 6388.348, + "train_tokens_per_second": 2014.846 + }, + { + "epoch": 0.476969696969697, + "grad_norm": 0.009226581081748009, + "learning_rate": 9.964382094274732e-05, + "loss": 0.012591596692800522, + "num_input_tokens_seen": 12887912, + "step": 787, + "train_runtime": 6396.4664, + "train_tokens_per_second": 2014.849 + }, + { + "epoch": 0.4775757575757576, + "grad_norm": 0.017386259511113167, + "learning_rate": 9.964267427399568e-05, + "loss": 0.012936464510858059, + "num_input_tokens_seen": 12904288, + "step": 788, + "train_runtime": 6404.5838, + "train_tokens_per_second": 2014.852 + }, + { + "epoch": 0.4781818181818182, + "grad_norm": 0.023312706500291824, + "learning_rate": 9.964152576905819e-05, + "loss": 0.012287257239222527, + "num_input_tokens_seen": 12920664, + "step": 789, + "train_runtime": 6412.7014, + "train_tokens_per_second": 2014.855 + }, + { + "epoch": 0.47878787878787876, + "grad_norm": 0.03517942875623703, + "learning_rate": 9.964037542797735e-05, + "loss": 0.014132940210402012, + "num_input_tokens_seen": 12937040, + "step": 790, + "train_runtime": 6420.8203, + "train_tokens_per_second": 2014.858 + }, + { + "epoch": 0.4793939393939394, + "grad_norm": 0.03619959577918053, + "learning_rate": 9.963922325079567e-05, + "loss": 0.014860968105494976, + "num_input_tokens_seen": 12953416, + "step": 791, + "train_runtime": 6428.9382, + "train_tokens_per_second": 2014.861 + }, + { + "epoch": 0.48, + "grad_norm": 0.03862093389034271, + "learning_rate": 9.96380692375558e-05, + "loss": 0.012788870371878147, + "num_input_tokens_seen": 12969792, + "step": 792, + "train_runtime": 6437.0552, + "train_tokens_per_second": 2014.864 + }, + { + "epoch": 0.4806060606060606, + "grad_norm": 0.014955422841012478, + "learning_rate": 9.963691338830044e-05, + "loss": 0.012180945836007595, + "num_input_tokens_seen": 12986168, + "step": 793, + "train_runtime": 6445.1731, + "train_tokens_per_second": 2014.867 + }, + { + "epoch": 0.4812121212121212, + "grad_norm": 0.02255093678832054, + "learning_rate": 9.963575570307228e-05, + "loss": 0.015188801102340221, + "num_input_tokens_seen": 13002544, + "step": 794, + "train_runtime": 6453.2915, + "train_tokens_per_second": 2014.87 + }, + { + "epoch": 0.4818181818181818, + "grad_norm": 0.023307740688323975, + "learning_rate": 9.96345961819142e-05, + "loss": 0.012430655770003796, + "num_input_tokens_seen": 13018920, + "step": 795, + "train_runtime": 6461.4033, + "train_tokens_per_second": 2014.875 + }, + { + "epoch": 0.4824242424242424, + "grad_norm": 0.015535326674580574, + "learning_rate": 9.963343482486906e-05, + "loss": 0.013036166317760944, + "num_input_tokens_seen": 13035296, + "step": 796, + "train_runtime": 6469.51, + "train_tokens_per_second": 2014.882 + }, + { + "epoch": 0.48303030303030303, + "grad_norm": 0.015238570980727673, + "learning_rate": 9.963227163197982e-05, + "loss": 0.012019358575344086, + "num_input_tokens_seen": 13051672, + "step": 797, + "train_runtime": 6477.6153, + "train_tokens_per_second": 2014.888 + }, + { + "epoch": 0.48363636363636364, + "grad_norm": 0.033798947930336, + "learning_rate": 9.963110660328952e-05, + "loss": 0.013339506462216377, + "num_input_tokens_seen": 13068048, + "step": 798, + "train_runtime": 6485.7294, + "train_tokens_per_second": 2014.893 + }, + { + "epoch": 0.48424242424242425, + "grad_norm": 0.019505798816680908, + "learning_rate": 9.962993973884122e-05, + "loss": 0.012281915172934532, + "num_input_tokens_seen": 13084424, + "step": 799, + "train_runtime": 6493.8366, + "train_tokens_per_second": 2014.899 + }, + { + "epoch": 0.48484848484848486, + "grad_norm": 0.010988899506628513, + "learning_rate": 9.96287710386781e-05, + "loss": 0.011865864507853985, + "num_input_tokens_seen": 13100800, + "step": 800, + "train_runtime": 6501.9444, + "train_tokens_per_second": 2014.905 + }, + { + "epoch": 0.48545454545454547, + "grad_norm": 0.031102674081921577, + "learning_rate": 9.96276005028434e-05, + "loss": 0.013372216373682022, + "num_input_tokens_seen": 13117176, + "step": 801, + "train_runtime": 6511.057, + "train_tokens_per_second": 2014.6 + }, + { + "epoch": 0.4860606060606061, + "grad_norm": 0.009399918839335442, + "learning_rate": 9.962642813138039e-05, + "loss": 0.012573515065014362, + "num_input_tokens_seen": 13133552, + "step": 802, + "train_runtime": 6519.1656, + "train_tokens_per_second": 2014.606 + }, + { + "epoch": 0.4866666666666667, + "grad_norm": 0.06464923918247223, + "learning_rate": 9.962525392433246e-05, + "loss": 0.014730310998857021, + "num_input_tokens_seen": 13149928, + "step": 803, + "train_runtime": 6527.273, + "train_tokens_per_second": 2014.613 + }, + { + "epoch": 0.48727272727272725, + "grad_norm": 0.028241781517863274, + "learning_rate": 9.962407788174301e-05, + "loss": 0.01580268330872059, + "num_input_tokens_seen": 13166304, + "step": 804, + "train_runtime": 6535.3821, + "train_tokens_per_second": 2014.619 + }, + { + "epoch": 0.48787878787878786, + "grad_norm": 0.008157139644026756, + "learning_rate": 9.962290000365558e-05, + "loss": 0.011951067484915257, + "num_input_tokens_seen": 13182680, + "step": 805, + "train_runtime": 6543.4933, + "train_tokens_per_second": 2014.624 + }, + { + "epoch": 0.48848484848484847, + "grad_norm": 0.017825007438659668, + "learning_rate": 9.96217202901137e-05, + "loss": 0.01247593853622675, + "num_input_tokens_seen": 13199056, + "step": 806, + "train_runtime": 6551.599, + "train_tokens_per_second": 2014.631 + }, + { + "epoch": 0.4890909090909091, + "grad_norm": 0.03140291944146156, + "learning_rate": 9.962053874116102e-05, + "loss": 0.013065744191408157, + "num_input_tokens_seen": 13215432, + "step": 807, + "train_runtime": 6559.707, + "train_tokens_per_second": 2014.638 + }, + { + "epoch": 0.4896969696969697, + "grad_norm": 0.020545680075883865, + "learning_rate": 9.961935535684127e-05, + "loss": 0.013503405265510082, + "num_input_tokens_seen": 13231808, + "step": 808, + "train_runtime": 6567.8172, + "train_tokens_per_second": 2014.643 + }, + { + "epoch": 0.4903030303030303, + "grad_norm": 0.010955904610455036, + "learning_rate": 9.961817013719815e-05, + "loss": 0.011936129070818424, + "num_input_tokens_seen": 13248184, + "step": 809, + "train_runtime": 6575.9284, + "train_tokens_per_second": 2014.648 + }, + { + "epoch": 0.4909090909090909, + "grad_norm": 0.01849379763007164, + "learning_rate": 9.961698308227557e-05, + "loss": 0.012791337445378304, + "num_input_tokens_seen": 13264560, + "step": 810, + "train_runtime": 6584.0343, + "train_tokens_per_second": 2014.655 + }, + { + "epoch": 0.4915151515151515, + "grad_norm": 0.014219888485968113, + "learning_rate": 9.961579419211741e-05, + "loss": 0.01348559744656086, + "num_input_tokens_seen": 13280936, + "step": 811, + "train_runtime": 6592.1415, + "train_tokens_per_second": 2014.662 + }, + { + "epoch": 0.4921212121212121, + "grad_norm": 0.02992507442831993, + "learning_rate": 9.961460346676763e-05, + "loss": 0.013612410053610802, + "num_input_tokens_seen": 13297312, + "step": 812, + "train_runtime": 6600.2507, + "train_tokens_per_second": 2014.668 + }, + { + "epoch": 0.49272727272727274, + "grad_norm": 0.029259268194437027, + "learning_rate": 9.961341090627031e-05, + "loss": 0.014138033613562584, + "num_input_tokens_seen": 13313688, + "step": 813, + "train_runtime": 6608.362, + "train_tokens_per_second": 2014.673 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 0.016515251249074936, + "learning_rate": 9.961221651066952e-05, + "loss": 0.013446497730910778, + "num_input_tokens_seen": 13330064, + "step": 814, + "train_runtime": 6616.47, + "train_tokens_per_second": 2014.679 + }, + { + "epoch": 0.49393939393939396, + "grad_norm": 0.019002556800842285, + "learning_rate": 9.961102028000948e-05, + "loss": 0.013769666664302349, + "num_input_tokens_seen": 13346440, + "step": 815, + "train_runtime": 6624.5765, + "train_tokens_per_second": 2014.686 + }, + { + "epoch": 0.49454545454545457, + "grad_norm": 0.023732759058475494, + "learning_rate": 9.960982221433439e-05, + "loss": 0.01219931710511446, + "num_input_tokens_seen": 13362816, + "step": 816, + "train_runtime": 6632.6975, + "train_tokens_per_second": 2014.688 + }, + { + "epoch": 0.4951515151515152, + "grad_norm": 0.012622934766113758, + "learning_rate": 9.960862231368859e-05, + "loss": 0.012783626094460487, + "num_input_tokens_seen": 13379192, + "step": 817, + "train_runtime": 6640.8076, + "train_tokens_per_second": 2014.694 + }, + { + "epoch": 0.49575757575757573, + "grad_norm": 0.014281938783824444, + "learning_rate": 9.960742057811648e-05, + "loss": 0.012687593698501587, + "num_input_tokens_seen": 13395568, + "step": 818, + "train_runtime": 6648.9137, + "train_tokens_per_second": 2014.7 + }, + { + "epoch": 0.49636363636363634, + "grad_norm": 0.053434181958436966, + "learning_rate": 9.960621700766246e-05, + "loss": 0.013879223726689816, + "num_input_tokens_seen": 13411944, + "step": 819, + "train_runtime": 6657.0289, + "train_tokens_per_second": 2014.704 + }, + { + "epoch": 0.49696969696969695, + "grad_norm": 0.014049537479877472, + "learning_rate": 9.960501160237107e-05, + "loss": 0.011275812052190304, + "num_input_tokens_seen": 13428320, + "step": 820, + "train_runtime": 6665.1394, + "train_tokens_per_second": 2014.71 + }, + { + "epoch": 0.49757575757575756, + "grad_norm": 0.02216215617954731, + "learning_rate": 9.960380436228693e-05, + "loss": 0.01345481164753437, + "num_input_tokens_seen": 13444696, + "step": 821, + "train_runtime": 6673.2486, + "train_tokens_per_second": 2014.715 + }, + { + "epoch": 0.49818181818181817, + "grad_norm": 0.01626548357307911, + "learning_rate": 9.960259528745466e-05, + "loss": 0.01268689427524805, + "num_input_tokens_seen": 13461072, + "step": 822, + "train_runtime": 6681.3546, + "train_tokens_per_second": 2014.722 + }, + { + "epoch": 0.4987878787878788, + "grad_norm": 0.029701311141252518, + "learning_rate": 9.960138437791899e-05, + "loss": 0.013831757940351963, + "num_input_tokens_seen": 13477448, + "step": 823, + "train_runtime": 6689.465, + "train_tokens_per_second": 2014.727 + }, + { + "epoch": 0.4993939393939394, + "grad_norm": 0.01778031513094902, + "learning_rate": 9.96001716337247e-05, + "loss": 0.012985551729798317, + "num_input_tokens_seen": 13493824, + "step": 824, + "train_runtime": 6697.5721, + "train_tokens_per_second": 2014.734 + }, + { + "epoch": 0.5, + "grad_norm": 0.011812685988843441, + "learning_rate": 9.959895705491664e-05, + "loss": 0.013474401086568832, + "num_input_tokens_seen": 13510200, + "step": 825, + "train_runtime": 6705.6803, + "train_tokens_per_second": 2014.74 + }, + { + "epoch": 0.5006060606060606, + "grad_norm": 0.024887410923838615, + "learning_rate": 9.959774064153977e-05, + "loss": 0.012352567166090012, + "num_input_tokens_seen": 13526576, + "step": 826, + "train_runtime": 6713.7875, + "train_tokens_per_second": 2014.746 + }, + { + "epoch": 0.5012121212121212, + "grad_norm": 0.02427525445818901, + "learning_rate": 9.959652239363906e-05, + "loss": 0.01411970891058445, + "num_input_tokens_seen": 13542952, + "step": 827, + "train_runtime": 6721.8992, + "train_tokens_per_second": 2014.751 + }, + { + "epoch": 0.5018181818181818, + "grad_norm": 0.02203851006925106, + "learning_rate": 9.959530231125955e-05, + "loss": 0.01270216703414917, + "num_input_tokens_seen": 13559328, + "step": 828, + "train_runtime": 6730.0067, + "train_tokens_per_second": 2014.757 + }, + { + "epoch": 0.5024242424242424, + "grad_norm": 0.033256348222494125, + "learning_rate": 9.959408039444641e-05, + "loss": 0.013468440622091293, + "num_input_tokens_seen": 13575704, + "step": 829, + "train_runtime": 6738.1159, + "train_tokens_per_second": 2014.763 + }, + { + "epoch": 0.503030303030303, + "grad_norm": 0.030981307849287987, + "learning_rate": 9.95928566432448e-05, + "loss": 0.013072172179818153, + "num_input_tokens_seen": 13592080, + "step": 830, + "train_runtime": 6746.2296, + "train_tokens_per_second": 2014.767 + }, + { + "epoch": 0.5036363636363637, + "grad_norm": 0.019473901018500328, + "learning_rate": 9.959163105770002e-05, + "loss": 0.01263860147446394, + "num_input_tokens_seen": 13608456, + "step": 831, + "train_runtime": 6754.3387, + "train_tokens_per_second": 2014.773 + }, + { + "epoch": 0.5042424242424243, + "grad_norm": 0.023273654282093048, + "learning_rate": 9.959040363785736e-05, + "loss": 0.014287668280303478, + "num_input_tokens_seen": 13624832, + "step": 832, + "train_runtime": 6762.4478, + "train_tokens_per_second": 2014.778 + }, + { + "epoch": 0.5048484848484849, + "grad_norm": 0.0494939386844635, + "learning_rate": 9.958917438376226e-05, + "loss": 0.013972645625472069, + "num_input_tokens_seen": 13641208, + "step": 833, + "train_runtime": 6770.5557, + "train_tokens_per_second": 2014.784 + }, + { + "epoch": 0.5054545454545455, + "grad_norm": 0.0583622045814991, + "learning_rate": 9.958794329546017e-05, + "loss": 0.015316938981413841, + "num_input_tokens_seen": 13657584, + "step": 834, + "train_runtime": 6778.6628, + "train_tokens_per_second": 2014.79 + }, + { + "epoch": 0.5060606060606061, + "grad_norm": 0.022303935140371323, + "learning_rate": 9.958671037299662e-05, + "loss": 0.012674327939748764, + "num_input_tokens_seen": 13673960, + "step": 835, + "train_runtime": 6786.7703, + "train_tokens_per_second": 2014.796 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 0.033000800758600235, + "learning_rate": 9.958547561641722e-05, + "loss": 0.013727420009672642, + "num_input_tokens_seen": 13690336, + "step": 836, + "train_runtime": 6794.8756, + "train_tokens_per_second": 2014.803 + }, + { + "epoch": 0.5072727272727273, + "grad_norm": 0.015586488880217075, + "learning_rate": 9.958423902576763e-05, + "loss": 0.015323062427341938, + "num_input_tokens_seen": 13706712, + "step": 837, + "train_runtime": 6802.9828, + "train_tokens_per_second": 2014.809 + }, + { + "epoch": 0.5078787878787879, + "grad_norm": 0.022322285920381546, + "learning_rate": 9.958300060109362e-05, + "loss": 0.014234354719519615, + "num_input_tokens_seen": 13723088, + "step": 838, + "train_runtime": 6811.0938, + "train_tokens_per_second": 2014.814 + }, + { + "epoch": 0.5084848484848485, + "grad_norm": 0.008347253315150738, + "learning_rate": 9.958176034244097e-05, + "loss": 0.012262934818863869, + "num_input_tokens_seen": 13739464, + "step": 839, + "train_runtime": 6819.2024, + "train_tokens_per_second": 2014.82 + }, + { + "epoch": 0.509090909090909, + "grad_norm": 0.02393462508916855, + "learning_rate": 9.958051824985555e-05, + "loss": 0.01308400183916092, + "num_input_tokens_seen": 13755840, + "step": 840, + "train_runtime": 6827.3112, + "train_tokens_per_second": 2014.825 + }, + { + "epoch": 0.5096969696969696, + "grad_norm": 0.01569426991045475, + "learning_rate": 9.957927432338332e-05, + "loss": 0.012214584276080132, + "num_input_tokens_seen": 13772216, + "step": 841, + "train_runtime": 6835.4183, + "train_tokens_per_second": 2014.831 + }, + { + "epoch": 0.5103030303030303, + "grad_norm": 0.026208873838186264, + "learning_rate": 9.957802856307029e-05, + "loss": 0.014355281367897987, + "num_input_tokens_seen": 13788592, + "step": 842, + "train_runtime": 6843.5292, + "train_tokens_per_second": 2014.836 + }, + { + "epoch": 0.5109090909090909, + "grad_norm": 0.016047121956944466, + "learning_rate": 9.957678096896252e-05, + "loss": 0.012238034047186375, + "num_input_tokens_seen": 13804968, + "step": 843, + "train_runtime": 6851.6374, + "train_tokens_per_second": 2014.842 + }, + { + "epoch": 0.5115151515151515, + "grad_norm": 0.04430484399199486, + "learning_rate": 9.957553154110617e-05, + "loss": 0.013455298729240894, + "num_input_tokens_seen": 13821344, + "step": 844, + "train_runtime": 6859.7446, + "train_tokens_per_second": 2014.848 + }, + { + "epoch": 0.5121212121212121, + "grad_norm": 0.01514506246894598, + "learning_rate": 9.957428027954746e-05, + "loss": 0.014497831463813782, + "num_input_tokens_seen": 13837720, + "step": 845, + "train_runtime": 6867.8522, + "train_tokens_per_second": 2014.854 + }, + { + "epoch": 0.5127272727272727, + "grad_norm": 0.11227481067180634, + "learning_rate": 9.957302718433266e-05, + "loss": 0.01227258238941431, + "num_input_tokens_seen": 13854096, + "step": 846, + "train_runtime": 6875.9627, + "train_tokens_per_second": 2014.859 + }, + { + "epoch": 0.5133333333333333, + "grad_norm": 0.02800634503364563, + "learning_rate": 9.957177225550813e-05, + "loss": 0.013792254962027073, + "num_input_tokens_seen": 13870472, + "step": 847, + "train_runtime": 6884.0675, + "train_tokens_per_second": 2014.866 + }, + { + "epoch": 0.5139393939393939, + "grad_norm": 0.029475996270775795, + "learning_rate": 9.957051549312027e-05, + "loss": 0.013554091565310955, + "num_input_tokens_seen": 13886848, + "step": 848, + "train_runtime": 6892.1731, + "train_tokens_per_second": 2014.872 + }, + { + "epoch": 0.5145454545454545, + "grad_norm": 0.019583873450756073, + "learning_rate": 9.956925689721559e-05, + "loss": 0.014205913059413433, + "num_input_tokens_seen": 13903224, + "step": 849, + "train_runtime": 6900.2826, + "train_tokens_per_second": 2014.877 + }, + { + "epoch": 0.5151515151515151, + "grad_norm": 0.015864579007029533, + "learning_rate": 9.95679964678406e-05, + "loss": 0.01432622317224741, + "num_input_tokens_seen": 13919600, + "step": 850, + "train_runtime": 6908.3907, + "train_tokens_per_second": 2014.883 + }, + { + "epoch": 0.5157575757575757, + "grad_norm": 0.01455528661608696, + "learning_rate": 9.9566734205042e-05, + "loss": 0.015681616961956024, + "num_input_tokens_seen": 13935976, + "step": 851, + "train_runtime": 6916.5024, + "train_tokens_per_second": 2014.888 + }, + { + "epoch": 0.5163636363636364, + "grad_norm": 0.02918148599565029, + "learning_rate": 9.956547010886639e-05, + "loss": 0.012535885907709599, + "num_input_tokens_seen": 13952352, + "step": 852, + "train_runtime": 6924.6094, + "train_tokens_per_second": 2014.894 + }, + { + "epoch": 0.516969696969697, + "grad_norm": 0.0162571519613266, + "learning_rate": 9.956420417936056e-05, + "loss": 0.012905891984701157, + "num_input_tokens_seen": 13968728, + "step": 853, + "train_runtime": 6932.7194, + "train_tokens_per_second": 2014.899 + }, + { + "epoch": 0.5175757575757576, + "grad_norm": 0.01789519377052784, + "learning_rate": 9.956293641657137e-05, + "loss": 0.01288038119673729, + "num_input_tokens_seen": 13985104, + "step": 854, + "train_runtime": 6940.8319, + "train_tokens_per_second": 2014.903 + }, + { + "epoch": 0.5181818181818182, + "grad_norm": 0.01946009323000908, + "learning_rate": 9.956166682054566e-05, + "loss": 0.013123282231390476, + "num_input_tokens_seen": 14001480, + "step": 855, + "train_runtime": 6948.9381, + "train_tokens_per_second": 2014.909 + }, + { + "epoch": 0.5187878787878788, + "grad_norm": 0.02161416970193386, + "learning_rate": 9.956039539133042e-05, + "loss": 0.011395135894417763, + "num_input_tokens_seen": 14017856, + "step": 856, + "train_runtime": 6957.048, + "train_tokens_per_second": 2014.914 + }, + { + "epoch": 0.5193939393939394, + "grad_norm": 0.01752905547618866, + "learning_rate": 9.955912212897267e-05, + "loss": 0.014676744118332863, + "num_input_tokens_seen": 14034232, + "step": 857, + "train_runtime": 6965.1559, + "train_tokens_per_second": 2014.92 + }, + { + "epoch": 0.52, + "grad_norm": 0.012038851156830788, + "learning_rate": 9.955784703351949e-05, + "loss": 0.012578791007399559, + "num_input_tokens_seen": 14050608, + "step": 858, + "train_runtime": 6973.2666, + "train_tokens_per_second": 2014.925 + }, + { + "epoch": 0.5206060606060606, + "grad_norm": 0.01986696757376194, + "learning_rate": 9.955657010501806e-05, + "loss": 0.012446455657482147, + "num_input_tokens_seen": 14066984, + "step": 859, + "train_runtime": 6981.3718, + "train_tokens_per_second": 2014.931 + }, + { + "epoch": 0.5212121212121212, + "grad_norm": 0.020363394170999527, + "learning_rate": 9.955529134351563e-05, + "loss": 0.012604762800037861, + "num_input_tokens_seen": 14083360, + "step": 860, + "train_runtime": 6989.4801, + "train_tokens_per_second": 2014.937 + }, + { + "epoch": 0.5218181818181818, + "grad_norm": 0.010133441537618637, + "learning_rate": 9.955401074905945e-05, + "loss": 0.01250852644443512, + "num_input_tokens_seen": 14099736, + "step": 861, + "train_runtime": 6997.5889, + "train_tokens_per_second": 2014.942 + }, + { + "epoch": 0.5224242424242425, + "grad_norm": 0.012160439044237137, + "learning_rate": 9.955272832169694e-05, + "loss": 0.013129970990121365, + "num_input_tokens_seen": 14116112, + "step": 862, + "train_runtime": 7005.6941, + "train_tokens_per_second": 2014.948 + }, + { + "epoch": 0.5230303030303031, + "grad_norm": 0.0197035763412714, + "learning_rate": 9.95514440614755e-05, + "loss": 0.012795310467481613, + "num_input_tokens_seen": 14132488, + "step": 863, + "train_runtime": 7013.8017, + "train_tokens_per_second": 2014.954 + }, + { + "epoch": 0.5236363636363637, + "grad_norm": 0.029051663354039192, + "learning_rate": 9.955015796844263e-05, + "loss": 0.012731630355119705, + "num_input_tokens_seen": 14148864, + "step": 864, + "train_runtime": 7021.913, + "train_tokens_per_second": 2014.959 + }, + { + "epoch": 0.5242424242424243, + "grad_norm": 0.01819092035293579, + "learning_rate": 9.954887004264591e-05, + "loss": 0.012530642561614513, + "num_input_tokens_seen": 14165240, + "step": 865, + "train_runtime": 7030.031, + "train_tokens_per_second": 2014.961 + }, + { + "epoch": 0.5248484848484849, + "grad_norm": 0.012354613281786442, + "learning_rate": 9.9547580284133e-05, + "loss": 0.012999298982322216, + "num_input_tokens_seen": 14181616, + "step": 866, + "train_runtime": 7038.1398, + "train_tokens_per_second": 2014.966 + }, + { + "epoch": 0.5254545454545455, + "grad_norm": 0.009374301880598068, + "learning_rate": 9.954628869295157e-05, + "loss": 0.012080837972462177, + "num_input_tokens_seen": 14197992, + "step": 867, + "train_runtime": 7046.2489, + "train_tokens_per_second": 2014.972 + }, + { + "epoch": 0.526060606060606, + "grad_norm": 0.04844909533858299, + "learning_rate": 9.954499526914941e-05, + "loss": 0.014849531464278698, + "num_input_tokens_seen": 14214368, + "step": 868, + "train_runtime": 7054.3586, + "train_tokens_per_second": 2014.977 + }, + { + "epoch": 0.5266666666666666, + "grad_norm": 0.0264375489205122, + "learning_rate": 9.954370001277435e-05, + "loss": 0.013595725409686565, + "num_input_tokens_seen": 14230744, + "step": 869, + "train_runtime": 7062.4663, + "train_tokens_per_second": 2014.982 + }, + { + "epoch": 0.5272727272727272, + "grad_norm": 0.011517049744725227, + "learning_rate": 9.954240292387434e-05, + "loss": 0.012497092597186565, + "num_input_tokens_seen": 14247120, + "step": 870, + "train_runtime": 7070.5718, + "train_tokens_per_second": 2014.988 + }, + { + "epoch": 0.5278787878787878, + "grad_norm": 0.012493406422436237, + "learning_rate": 9.95411040024973e-05, + "loss": 0.01143716461956501, + "num_input_tokens_seen": 14263496, + "step": 871, + "train_runtime": 7078.6807, + "train_tokens_per_second": 2014.994 + }, + { + "epoch": 0.5284848484848484, + "grad_norm": 0.04269085079431534, + "learning_rate": 9.95398032486913e-05, + "loss": 0.013632988557219505, + "num_input_tokens_seen": 14279872, + "step": 872, + "train_runtime": 7086.7887, + "train_tokens_per_second": 2014.999 + }, + { + "epoch": 0.5290909090909091, + "grad_norm": 0.04483538493514061, + "learning_rate": 9.953850066250445e-05, + "loss": 0.013953006826341152, + "num_input_tokens_seen": 14296248, + "step": 873, + "train_runtime": 7094.8962, + "train_tokens_per_second": 2015.005 + }, + { + "epoch": 0.5296969696969697, + "grad_norm": 0.05677570030093193, + "learning_rate": 9.953719624398495e-05, + "loss": 0.012957635335624218, + "num_input_tokens_seen": 14312624, + "step": 874, + "train_runtime": 7103.0013, + "train_tokens_per_second": 2015.011 + }, + { + "epoch": 0.5303030303030303, + "grad_norm": 0.038775816559791565, + "learning_rate": 9.953588999318101e-05, + "loss": 0.01283508911728859, + "num_input_tokens_seen": 14329000, + "step": 875, + "train_runtime": 7111.1121, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 0.5309090909090909, + "grad_norm": 0.032757148146629333, + "learning_rate": 9.953458191014098e-05, + "loss": 0.013316294178366661, + "num_input_tokens_seen": 14345376, + "step": 876, + "train_runtime": 7119.2189, + "train_tokens_per_second": 2015.021 + }, + { + "epoch": 0.5315151515151515, + "grad_norm": 0.022632509469985962, + "learning_rate": 9.953327199491323e-05, + "loss": 0.011890828609466553, + "num_input_tokens_seen": 14361752, + "step": 877, + "train_runtime": 7127.3324, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 0.5321212121212121, + "grad_norm": 0.013239112682640553, + "learning_rate": 9.953196024754621e-05, + "loss": 0.011631186120212078, + "num_input_tokens_seen": 14378128, + "step": 878, + "train_runtime": 7135.4381, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 0.5327272727272727, + "grad_norm": 0.012772745452821255, + "learning_rate": 9.953064666808843e-05, + "loss": 0.011507662013173103, + "num_input_tokens_seen": 14394504, + "step": 879, + "train_runtime": 7143.5516, + "train_tokens_per_second": 2015.035 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 0.02860845811665058, + "learning_rate": 9.952933125658849e-05, + "loss": 0.013187154196202755, + "num_input_tokens_seen": 14410880, + "step": 880, + "train_runtime": 7151.6624, + "train_tokens_per_second": 2015.039 + }, + { + "epoch": 0.5339393939393939, + "grad_norm": 0.011422947980463505, + "learning_rate": 9.952801401309503e-05, + "loss": 0.012076064944267273, + "num_input_tokens_seen": 14427256, + "step": 881, + "train_runtime": 7159.772, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 0.5345454545454545, + "grad_norm": 0.00976222101598978, + "learning_rate": 9.95266949376568e-05, + "loss": 0.011884449049830437, + "num_input_tokens_seen": 14443632, + "step": 882, + "train_runtime": 7167.8808, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 0.5351515151515152, + "grad_norm": 0.017465714365243912, + "learning_rate": 9.952537403032258e-05, + "loss": 0.012587850913405418, + "num_input_tokens_seen": 14460008, + "step": 883, + "train_runtime": 7175.9926, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 0.5357575757575758, + "grad_norm": 0.01686178520321846, + "learning_rate": 9.952405129114119e-05, + "loss": 0.01267196424305439, + "num_input_tokens_seen": 14476384, + "step": 884, + "train_runtime": 7184.0994, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 0.5363636363636364, + "grad_norm": 0.021161451935768127, + "learning_rate": 9.952272672016161e-05, + "loss": 0.012368117459118366, + "num_input_tokens_seen": 14492760, + "step": 885, + "train_runtime": 7192.2074, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 0.536969696969697, + "grad_norm": 0.018734315410256386, + "learning_rate": 9.95214003174328e-05, + "loss": 0.013907104730606079, + "num_input_tokens_seen": 14509136, + "step": 886, + "train_runtime": 7200.3181, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 0.5375757575757576, + "grad_norm": 0.017368443310260773, + "learning_rate": 9.952007208300384e-05, + "loss": 0.013688186183571815, + "num_input_tokens_seen": 14525512, + "step": 887, + "train_runtime": 7208.4306, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 0.5381818181818182, + "grad_norm": 0.014055633917450905, + "learning_rate": 9.951874201692386e-05, + "loss": 0.011441092006862164, + "num_input_tokens_seen": 14541888, + "step": 888, + "train_runtime": 7216.5403, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 0.5387878787878788, + "grad_norm": 0.014830189757049084, + "learning_rate": 9.951741011924202e-05, + "loss": 0.012659481726586819, + "num_input_tokens_seen": 14558264, + "step": 889, + "train_runtime": 7224.6486, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 0.5393939393939394, + "grad_norm": 0.04141494259238243, + "learning_rate": 9.951607639000763e-05, + "loss": 0.014267532154917717, + "num_input_tokens_seen": 14574640, + "step": 890, + "train_runtime": 7232.7595, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 0.54, + "grad_norm": 0.026582296937704086, + "learning_rate": 9.951474082927e-05, + "loss": 0.01351410336792469, + "num_input_tokens_seen": 14591016, + "step": 891, + "train_runtime": 7240.8673, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 0.5406060606060606, + "grad_norm": 0.029941242188215256, + "learning_rate": 9.951340343707852e-05, + "loss": 0.013386565260589123, + "num_input_tokens_seen": 14607392, + "step": 892, + "train_runtime": 7248.9795, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 0.5412121212121213, + "grad_norm": 0.01376877911388874, + "learning_rate": 9.951206421348267e-05, + "loss": 0.012590361759066582, + "num_input_tokens_seen": 14623768, + "step": 893, + "train_runtime": 7257.0885, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 0.5418181818181819, + "grad_norm": 0.015015073120594025, + "learning_rate": 9.9510723158532e-05, + "loss": 0.012574484571814537, + "num_input_tokens_seen": 14640144, + "step": 894, + "train_runtime": 7265.1975, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 0.5424242424242425, + "grad_norm": 0.013042068108916283, + "learning_rate": 9.950938027227608e-05, + "loss": 0.01163259893655777, + "num_input_tokens_seen": 14656520, + "step": 895, + "train_runtime": 7273.3074, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 0.5430303030303031, + "grad_norm": 0.2448950558900833, + "learning_rate": 9.950803555476463e-05, + "loss": 0.029144512489438057, + "num_input_tokens_seen": 14672896, + "step": 896, + "train_runtime": 7281.4158, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 0.5436363636363636, + "grad_norm": 0.015140167437493801, + "learning_rate": 9.950668900604733e-05, + "loss": 0.012354775331914425, + "num_input_tokens_seen": 14689272, + "step": 897, + "train_runtime": 7289.5307, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 0.5442424242424242, + "grad_norm": 0.014910165220499039, + "learning_rate": 9.950534062617401e-05, + "loss": 0.013464296236634254, + "num_input_tokens_seen": 14705648, + "step": 898, + "train_runtime": 7297.6408, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 0.5448484848484848, + "grad_norm": 0.025381648913025856, + "learning_rate": 9.950399041519456e-05, + "loss": 0.01381002739071846, + "num_input_tokens_seen": 14722024, + "step": 899, + "train_runtime": 7305.7486, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 0.5454545454545454, + "grad_norm": 0.016502218320965767, + "learning_rate": 9.950263837315891e-05, + "loss": 0.014580944553017616, + "num_input_tokens_seen": 14738400, + "step": 900, + "train_runtime": 7313.8574, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 0.546060606060606, + "grad_norm": 0.036798711866140366, + "learning_rate": 9.950128450011706e-05, + "loss": 0.01336810551583767, + "num_input_tokens_seen": 14754776, + "step": 901, + "train_runtime": 7322.836, + "train_tokens_per_second": 2014.899 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 0.03919834643602371, + "learning_rate": 9.949992879611911e-05, + "loss": 0.013614124618470669, + "num_input_tokens_seen": 14771152, + "step": 902, + "train_runtime": 7330.9449, + "train_tokens_per_second": 2014.904 + }, + { + "epoch": 0.5472727272727272, + "grad_norm": 0.015492623671889305, + "learning_rate": 9.949857126121517e-05, + "loss": 0.01262598019093275, + "num_input_tokens_seen": 14787528, + "step": 903, + "train_runtime": 7339.051, + "train_tokens_per_second": 2014.91 + }, + { + "epoch": 0.5478787878787879, + "grad_norm": 0.04381313920021057, + "learning_rate": 9.949721189545549e-05, + "loss": 0.012830916792154312, + "num_input_tokens_seen": 14803904, + "step": 904, + "train_runtime": 7347.1591, + "train_tokens_per_second": 2014.915 + }, + { + "epoch": 0.5484848484848485, + "grad_norm": 0.012728218920528889, + "learning_rate": 9.949585069889033e-05, + "loss": 0.012215669266879559, + "num_input_tokens_seen": 14820280, + "step": 905, + "train_runtime": 7355.2671, + "train_tokens_per_second": 2014.921 + }, + { + "epoch": 0.5490909090909091, + "grad_norm": 0.02701408974826336, + "learning_rate": 9.949448767157003e-05, + "loss": 0.014799817465245724, + "num_input_tokens_seen": 14836656, + "step": 906, + "train_runtime": 7363.3735, + "train_tokens_per_second": 2014.926 + }, + { + "epoch": 0.5496969696969697, + "grad_norm": 0.01919523999094963, + "learning_rate": 9.949312281354504e-05, + "loss": 0.012729383073747158, + "num_input_tokens_seen": 14853032, + "step": 907, + "train_runtime": 7371.4797, + "train_tokens_per_second": 2014.932 + }, + { + "epoch": 0.5503030303030303, + "grad_norm": 0.017987912520766258, + "learning_rate": 9.94917561248658e-05, + "loss": 0.011925067752599716, + "num_input_tokens_seen": 14869408, + "step": 908, + "train_runtime": 7379.5909, + "train_tokens_per_second": 2014.937 + }, + { + "epoch": 0.5509090909090909, + "grad_norm": 0.016029933467507362, + "learning_rate": 9.94903876055829e-05, + "loss": 0.014640429988503456, + "num_input_tokens_seen": 14885784, + "step": 909, + "train_runtime": 7387.701, + "train_tokens_per_second": 2014.941 + }, + { + "epoch": 0.5515151515151515, + "grad_norm": 0.02371898479759693, + "learning_rate": 9.948901725574692e-05, + "loss": 0.013192545622587204, + "num_input_tokens_seen": 14902160, + "step": 910, + "train_runtime": 7395.8127, + "train_tokens_per_second": 2014.946 + }, + { + "epoch": 0.5521212121212121, + "grad_norm": 0.028052695095539093, + "learning_rate": 9.948764507540858e-05, + "loss": 0.014127026312053204, + "num_input_tokens_seen": 14918536, + "step": 911, + "train_runtime": 7403.9308, + "train_tokens_per_second": 2014.948 + }, + { + "epoch": 0.5527272727272727, + "grad_norm": 0.022900646552443504, + "learning_rate": 9.94862710646186e-05, + "loss": 0.01369861327111721, + "num_input_tokens_seen": 14934912, + "step": 912, + "train_runtime": 7412.0474, + "train_tokens_per_second": 2014.951 + }, + { + "epoch": 0.5533333333333333, + "grad_norm": 0.024493444710969925, + "learning_rate": 9.948489522342786e-05, + "loss": 0.012475069612264633, + "num_input_tokens_seen": 14951288, + "step": 913, + "train_runtime": 7420.164, + "train_tokens_per_second": 2014.954 + }, + { + "epoch": 0.553939393939394, + "grad_norm": 0.009486420080065727, + "learning_rate": 9.948351755188718e-05, + "loss": 0.011415514163672924, + "num_input_tokens_seen": 14967664, + "step": 914, + "train_runtime": 7428.2787, + "train_tokens_per_second": 2014.957 + }, + { + "epoch": 0.5545454545454546, + "grad_norm": 0.02638114243745804, + "learning_rate": 9.948213805004758e-05, + "loss": 0.014981718733906746, + "num_input_tokens_seen": 14984040, + "step": 915, + "train_runtime": 7436.3836, + "train_tokens_per_second": 2014.963 + }, + { + "epoch": 0.5551515151515152, + "grad_norm": 0.024289410561323166, + "learning_rate": 9.948075671796004e-05, + "loss": 0.013489319942891598, + "num_input_tokens_seen": 15000416, + "step": 916, + "train_runtime": 7444.4934, + "train_tokens_per_second": 2014.968 + }, + { + "epoch": 0.5557575757575758, + "grad_norm": 0.019992362707853317, + "learning_rate": 9.947937355567566e-05, + "loss": 0.013457294553518295, + "num_input_tokens_seen": 15016792, + "step": 917, + "train_runtime": 7452.6, + "train_tokens_per_second": 2014.974 + }, + { + "epoch": 0.5563636363636364, + "grad_norm": 0.01874268427491188, + "learning_rate": 9.947798856324562e-05, + "loss": 0.014019965194165707, + "num_input_tokens_seen": 15033168, + "step": 918, + "train_runtime": 7460.7075, + "train_tokens_per_second": 2014.979 + }, + { + "epoch": 0.556969696969697, + "grad_norm": 0.006537168752402067, + "learning_rate": 9.947660174072113e-05, + "loss": 0.01211620308458805, + "num_input_tokens_seen": 15049544, + "step": 919, + "train_runtime": 7468.8162, + "train_tokens_per_second": 2014.984 + }, + { + "epoch": 0.5575757575757576, + "grad_norm": 0.014149926602840424, + "learning_rate": 9.94752130881535e-05, + "loss": 0.01367366872727871, + "num_input_tokens_seen": 15065920, + "step": 920, + "train_runtime": 7476.9293, + "train_tokens_per_second": 2014.988 + }, + { + "epoch": 0.5581818181818182, + "grad_norm": 0.02201610431075096, + "learning_rate": 9.947382260559408e-05, + "loss": 0.014585314318537712, + "num_input_tokens_seen": 15082296, + "step": 921, + "train_runtime": 7485.0355, + "train_tokens_per_second": 2014.993 + }, + { + "epoch": 0.5587878787878788, + "grad_norm": 0.016061201691627502, + "learning_rate": 9.947243029309433e-05, + "loss": 0.012058419175446033, + "num_input_tokens_seen": 15098672, + "step": 922, + "train_runtime": 7493.1729, + "train_tokens_per_second": 2014.99 + }, + { + "epoch": 0.5593939393939394, + "grad_norm": 0.014283844269812107, + "learning_rate": 9.94710361507057e-05, + "loss": 0.013241814449429512, + "num_input_tokens_seen": 15115048, + "step": 923, + "train_runtime": 7501.2834, + "train_tokens_per_second": 2014.995 + }, + { + "epoch": 0.56, + "grad_norm": 0.014411736279726028, + "learning_rate": 9.94696401784798e-05, + "loss": 0.011771513149142265, + "num_input_tokens_seen": 15131424, + "step": 924, + "train_runtime": 7509.3934, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 0.5606060606060606, + "grad_norm": 0.015076170675456524, + "learning_rate": 9.946824237646824e-05, + "loss": 0.012921320274472237, + "num_input_tokens_seen": 15147800, + "step": 925, + "train_runtime": 7517.5023, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 0.5612121212121212, + "grad_norm": 0.019479839131236076, + "learning_rate": 9.94668427447227e-05, + "loss": 0.01365247555077076, + "num_input_tokens_seen": 15164176, + "step": 926, + "train_runtime": 7525.6088, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 0.5618181818181818, + "grad_norm": 0.015186650678515434, + "learning_rate": 9.946544128329502e-05, + "loss": 0.011964188888669014, + "num_input_tokens_seen": 15180552, + "step": 927, + "train_runtime": 7533.7184, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 0.5624242424242424, + "grad_norm": 0.01884932816028595, + "learning_rate": 9.9464037992237e-05, + "loss": 0.013231384567916393, + "num_input_tokens_seen": 15196928, + "step": 928, + "train_runtime": 7541.8298, + "train_tokens_per_second": 2015.019 + }, + { + "epoch": 0.563030303030303, + "grad_norm": 0.024524593725800514, + "learning_rate": 9.946263287160051e-05, + "loss": 0.013677388429641724, + "num_input_tokens_seen": 15213304, + "step": 929, + "train_runtime": 7549.9392, + "train_tokens_per_second": 2015.023 + }, + { + "epoch": 0.5636363636363636, + "grad_norm": 0.017896726727485657, + "learning_rate": 9.946122592143758e-05, + "loss": 0.012685752473771572, + "num_input_tokens_seen": 15229680, + "step": 930, + "train_runtime": 7558.0487, + "train_tokens_per_second": 2015.028 + }, + { + "epoch": 0.5642424242424242, + "grad_norm": 0.02456982247531414, + "learning_rate": 9.945981714180021e-05, + "loss": 0.012439090758562088, + "num_input_tokens_seen": 15246056, + "step": 931, + "train_runtime": 7566.1626, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 0.5648484848484848, + "grad_norm": 0.011778507381677628, + "learning_rate": 9.945840653274052e-05, + "loss": 0.01277371309697628, + "num_input_tokens_seen": 15262432, + "step": 932, + "train_runtime": 7574.272, + "train_tokens_per_second": 2015.036 + }, + { + "epoch": 0.5654545454545454, + "grad_norm": 0.00871087983250618, + "learning_rate": 9.945699409431071e-05, + "loss": 0.012337596155703068, + "num_input_tokens_seen": 15278808, + "step": 933, + "train_runtime": 7582.3801, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 0.566060606060606, + "grad_norm": 0.02395842783153057, + "learning_rate": 9.945557982656299e-05, + "loss": 0.013987423852086067, + "num_input_tokens_seen": 15295184, + "step": 934, + "train_runtime": 7590.493, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 0.014825602062046528, + "learning_rate": 9.945416372954968e-05, + "loss": 0.013695470988750458, + "num_input_tokens_seen": 15311560, + "step": 935, + "train_runtime": 7598.6032, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 0.5672727272727273, + "grad_norm": 0.034912459552288055, + "learning_rate": 9.945274580332316e-05, + "loss": 0.014644785784184933, + "num_input_tokens_seen": 15327936, + "step": 936, + "train_runtime": 7606.7121, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 0.5678787878787879, + "grad_norm": 0.015183918178081512, + "learning_rate": 9.945132604793588e-05, + "loss": 0.013066308572888374, + "num_input_tokens_seen": 15344312, + "step": 937, + "train_runtime": 7614.8211, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 0.5684848484848485, + "grad_norm": 0.015175413340330124, + "learning_rate": 9.944990446344033e-05, + "loss": 0.012659816071391106, + "num_input_tokens_seen": 15360688, + "step": 938, + "train_runtime": 7622.9305, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 0.5690909090909091, + "grad_norm": 0.00944305956363678, + "learning_rate": 9.944848104988915e-05, + "loss": 0.012941330671310425, + "num_input_tokens_seen": 15377064, + "step": 939, + "train_runtime": 7631.041, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 0.5696969696969697, + "grad_norm": 0.008134279400110245, + "learning_rate": 9.944705580733493e-05, + "loss": 0.012083706445991993, + "num_input_tokens_seen": 15393440, + "step": 940, + "train_runtime": 7639.151, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 0.5703030303030303, + "grad_norm": 0.01920422352850437, + "learning_rate": 9.944562873583042e-05, + "loss": 0.012228092178702354, + "num_input_tokens_seen": 15409816, + "step": 941, + "train_runtime": 7647.2582, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 0.5709090909090909, + "grad_norm": 0.02532947063446045, + "learning_rate": 9.944419983542839e-05, + "loss": 0.014129354618489742, + "num_input_tokens_seen": 15426192, + "step": 942, + "train_runtime": 7655.3689, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 0.5715151515151515, + "grad_norm": 0.014770124107599258, + "learning_rate": 9.944276910618168e-05, + "loss": 0.01307615451514721, + "num_input_tokens_seen": 15442568, + "step": 943, + "train_runtime": 7663.4788, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 0.5721212121212121, + "grad_norm": 0.04172991216182709, + "learning_rate": 9.944133654814325e-05, + "loss": 0.01433885470032692, + "num_input_tokens_seen": 15458944, + "step": 944, + "train_runtime": 7671.5887, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 0.5727272727272728, + "grad_norm": 0.02282462641596794, + "learning_rate": 9.943990216136605e-05, + "loss": 0.012092739343643188, + "num_input_tokens_seen": 15475320, + "step": 945, + "train_runtime": 7679.6999, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 0.0323781855404377, + "learning_rate": 9.943846594590316e-05, + "loss": 0.014233306050300598, + "num_input_tokens_seen": 15491696, + "step": 946, + "train_runtime": 7687.8075, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.573939393939394, + "grad_norm": 0.016390513628721237, + "learning_rate": 9.943702790180769e-05, + "loss": 0.01384427584707737, + "num_input_tokens_seen": 15508072, + "step": 947, + "train_runtime": 7695.9168, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 0.5745454545454546, + "grad_norm": 0.017519650980830193, + "learning_rate": 9.943558802913282e-05, + "loss": 0.013568704016506672, + "num_input_tokens_seen": 15524448, + "step": 948, + "train_runtime": 7704.0297, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 0.5751515151515152, + "grad_norm": 0.012753440998494625, + "learning_rate": 9.943414632793184e-05, + "loss": 0.012147994711995125, + "num_input_tokens_seen": 15540824, + "step": 949, + "train_runtime": 7712.145, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.5757575757575758, + "grad_norm": 0.011699757538735867, + "learning_rate": 9.943270279825803e-05, + "loss": 0.013070912100374699, + "num_input_tokens_seen": 15557200, + "step": 950, + "train_runtime": 7720.2554, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 0.5763636363636364, + "grad_norm": 0.01527287345379591, + "learning_rate": 9.943125744016483e-05, + "loss": 0.011352474801242352, + "num_input_tokens_seen": 15573576, + "step": 951, + "train_runtime": 7728.3625, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 0.576969696969697, + "grad_norm": 0.025451278313994408, + "learning_rate": 9.942981025370568e-05, + "loss": 0.013020837679505348, + "num_input_tokens_seen": 15589952, + "step": 952, + "train_runtime": 7736.4706, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 0.5775757575757576, + "grad_norm": 0.021832725033164024, + "learning_rate": 9.942836123893408e-05, + "loss": 0.015131472609937191, + "num_input_tokens_seen": 15606328, + "step": 953, + "train_runtime": 7744.5854, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 0.5781818181818181, + "grad_norm": 0.022370878607034683, + "learning_rate": 9.942691039590369e-05, + "loss": 0.012688050046563148, + "num_input_tokens_seen": 15622704, + "step": 954, + "train_runtime": 7752.6981, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 0.5787878787878787, + "grad_norm": 0.021051136776804924, + "learning_rate": 9.942545772466814e-05, + "loss": 0.012345478869974613, + "num_input_tokens_seen": 15639080, + "step": 955, + "train_runtime": 7760.8061, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 0.5793939393939394, + "grad_norm": 0.01372633595019579, + "learning_rate": 9.942400322528114e-05, + "loss": 0.012315414845943451, + "num_input_tokens_seen": 15655456, + "step": 956, + "train_runtime": 7768.9154, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 0.58, + "grad_norm": 0.028729503974318504, + "learning_rate": 9.942254689779651e-05, + "loss": 0.013109761290252209, + "num_input_tokens_seen": 15671832, + "step": 957, + "train_runtime": 7777.0294, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 0.5806060606060606, + "grad_norm": 0.029019076377153397, + "learning_rate": 9.942108874226811e-05, + "loss": 0.013196980580687523, + "num_input_tokens_seen": 15688208, + "step": 958, + "train_runtime": 7785.1364, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 0.5812121212121212, + "grad_norm": 0.011110197752714157, + "learning_rate": 9.94196287587499e-05, + "loss": 0.012527218088507652, + "num_input_tokens_seen": 15704584, + "step": 959, + "train_runtime": 7793.2454, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 0.5818181818181818, + "grad_norm": 0.012445122934877872, + "learning_rate": 9.941816694729586e-05, + "loss": 0.013050834648311138, + "num_input_tokens_seen": 15720960, + "step": 960, + "train_runtime": 7801.3578, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.5824242424242424, + "grad_norm": 0.01324465125799179, + "learning_rate": 9.941670330796007e-05, + "loss": 0.012385859154164791, + "num_input_tokens_seen": 15737336, + "step": 961, + "train_runtime": 7809.4681, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 0.583030303030303, + "grad_norm": 0.020351726561784744, + "learning_rate": 9.941523784079665e-05, + "loss": 0.013481922447681427, + "num_input_tokens_seen": 15753712, + "step": 962, + "train_runtime": 7817.5774, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.5836363636363636, + "grad_norm": 0.017218874767422676, + "learning_rate": 9.94137705458598e-05, + "loss": 0.011243843473494053, + "num_input_tokens_seen": 15770088, + "step": 963, + "train_runtime": 7825.6869, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 0.5842424242424242, + "grad_norm": 0.020052634179592133, + "learning_rate": 9.941230142320381e-05, + "loss": 0.01419176533818245, + "num_input_tokens_seen": 15786464, + "step": 964, + "train_runtime": 7833.7989, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 0.5848484848484848, + "grad_norm": 0.01865479350090027, + "learning_rate": 9.941083047288305e-05, + "loss": 0.013855772092938423, + "num_input_tokens_seen": 15802840, + "step": 965, + "train_runtime": 7841.9087, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.5854545454545454, + "grad_norm": 0.019557680934667587, + "learning_rate": 9.940935769495186e-05, + "loss": 0.014046021737158298, + "num_input_tokens_seen": 15819216, + "step": 966, + "train_runtime": 7850.0169, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 0.5860606060606061, + "grad_norm": 0.01921168901026249, + "learning_rate": 9.940788308946476e-05, + "loss": 0.013276162557303905, + "num_input_tokens_seen": 15835592, + "step": 967, + "train_runtime": 7858.13, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 0.015911763533949852, + "learning_rate": 9.940640665647626e-05, + "loss": 0.012454750947654247, + "num_input_tokens_seen": 15851968, + "step": 968, + "train_runtime": 7866.2398, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 0.5872727272727273, + "grad_norm": 0.020958999171853065, + "learning_rate": 9.940492839604103e-05, + "loss": 0.01228359155356884, + "num_input_tokens_seen": 15868344, + "step": 969, + "train_runtime": 7874.3484, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 0.5878787878787879, + "grad_norm": 0.017634913325309753, + "learning_rate": 9.940344830821368e-05, + "loss": 0.013240614905953407, + "num_input_tokens_seen": 15884720, + "step": 970, + "train_runtime": 7882.4581, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 0.5884848484848485, + "grad_norm": 0.018232690170407295, + "learning_rate": 9.9401966393049e-05, + "loss": 0.01443801261484623, + "num_input_tokens_seen": 15901096, + "step": 971, + "train_runtime": 7890.5672, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 0.5890909090909091, + "grad_norm": 0.021868707612156868, + "learning_rate": 9.94004826506018e-05, + "loss": 0.014730443246662617, + "num_input_tokens_seen": 15917472, + "step": 972, + "train_runtime": 7898.6753, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 0.5896969696969697, + "grad_norm": 0.015589121729135513, + "learning_rate": 9.939899708092692e-05, + "loss": 0.011880002915859222, + "num_input_tokens_seen": 15933848, + "step": 973, + "train_runtime": 7906.7854, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 0.5903030303030303, + "grad_norm": 0.010916100814938545, + "learning_rate": 9.939750968407938e-05, + "loss": 0.011822294443845749, + "num_input_tokens_seen": 15950224, + "step": 974, + "train_runtime": 7914.891, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 0.5909090909090909, + "grad_norm": 0.014051892794668674, + "learning_rate": 9.939602046011412e-05, + "loss": 0.012878884561359882, + "num_input_tokens_seen": 15966600, + "step": 975, + "train_runtime": 7923.0019, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 0.5915151515151515, + "grad_norm": 0.032839711755514145, + "learning_rate": 9.939452940908626e-05, + "loss": 0.014527475461363792, + "num_input_tokens_seen": 15982976, + "step": 976, + "train_runtime": 7931.1131, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 0.5921212121212122, + "grad_norm": 0.020389258861541748, + "learning_rate": 9.939303653105096e-05, + "loss": 0.013167984783649445, + "num_input_tokens_seen": 15999352, + "step": 977, + "train_runtime": 7939.2338, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 0.5927272727272728, + "grad_norm": 0.025760652497410774, + "learning_rate": 9.939154182606341e-05, + "loss": 0.01562490500509739, + "num_input_tokens_seen": 16015728, + "step": 978, + "train_runtime": 7947.343, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 0.5933333333333334, + "grad_norm": 0.017900720238685608, + "learning_rate": 9.939004529417894e-05, + "loss": 0.011635327711701393, + "num_input_tokens_seen": 16032104, + "step": 979, + "train_runtime": 7955.4555, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 0.593939393939394, + "grad_norm": 0.018658578395843506, + "learning_rate": 9.938854693545285e-05, + "loss": 0.011654762551188469, + "num_input_tokens_seen": 16048480, + "step": 980, + "train_runtime": 7963.5661, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 0.5945454545454546, + "grad_norm": 0.01790103130042553, + "learning_rate": 9.938704674994062e-05, + "loss": 0.013270128518342972, + "num_input_tokens_seen": 16064856, + "step": 981, + "train_runtime": 7971.6756, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 0.5951515151515151, + "grad_norm": 0.039879657328128815, + "learning_rate": 9.938554473769768e-05, + "loss": 0.01646546646952629, + "num_input_tokens_seen": 16081232, + "step": 982, + "train_runtime": 7979.7879, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 0.5957575757575757, + "grad_norm": 0.013998485170304775, + "learning_rate": 9.938404089877961e-05, + "loss": 0.012206289917230606, + "num_input_tokens_seen": 16097608, + "step": 983, + "train_runtime": 7987.8964, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 0.5963636363636363, + "grad_norm": 0.006746624130755663, + "learning_rate": 9.938253523324206e-05, + "loss": 0.012235766276717186, + "num_input_tokens_seen": 16113984, + "step": 984, + "train_runtime": 7996.0036, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 0.5969696969696969, + "grad_norm": 0.022575756534934044, + "learning_rate": 9.93810277411407e-05, + "loss": 0.012963814660906792, + "num_input_tokens_seen": 16130360, + "step": 985, + "train_runtime": 8004.1105, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 0.5975757575757575, + "grad_norm": 0.007626754697412252, + "learning_rate": 9.937951842253127e-05, + "loss": 0.01213219203054905, + "num_input_tokens_seen": 16146736, + "step": 986, + "train_runtime": 8012.2202, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 0.5981818181818181, + "grad_norm": 0.013599387370049953, + "learning_rate": 9.937800727746964e-05, + "loss": 0.012984167784452438, + "num_input_tokens_seen": 16163112, + "step": 987, + "train_runtime": 8020.337, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 0.5987878787878788, + "grad_norm": 0.010270299389958382, + "learning_rate": 9.937649430601166e-05, + "loss": 0.011544723995029926, + "num_input_tokens_seen": 16179488, + "step": 988, + "train_runtime": 8028.447, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 0.5993939393939394, + "grad_norm": 0.03377272188663483, + "learning_rate": 9.937497950821332e-05, + "loss": 0.01466489490121603, + "num_input_tokens_seen": 16195864, + "step": 989, + "train_runtime": 8036.5629, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 0.6, + "grad_norm": 0.012808220461010933, + "learning_rate": 9.937346288413064e-05, + "loss": 0.014080810360610485, + "num_input_tokens_seen": 16212240, + "step": 990, + "train_runtime": 8044.6741, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.6006060606060606, + "grad_norm": 0.022888874635100365, + "learning_rate": 9.937194443381972e-05, + "loss": 0.012964661233127117, + "num_input_tokens_seen": 16228616, + "step": 991, + "train_runtime": 8052.7845, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 0.6012121212121212, + "grad_norm": 0.028279505670070648, + "learning_rate": 9.937042415733673e-05, + "loss": 0.012717594392597675, + "num_input_tokens_seen": 16244992, + "step": 992, + "train_runtime": 8060.8929, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 0.6018181818181818, + "grad_norm": 0.09445340186357498, + "learning_rate": 9.936890205473787e-05, + "loss": 0.013668234460055828, + "num_input_tokens_seen": 16261368, + "step": 993, + "train_runtime": 8069.0044, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 0.6024242424242424, + "grad_norm": 0.008610354736447334, + "learning_rate": 9.936737812607949e-05, + "loss": 0.011679118499159813, + "num_input_tokens_seen": 16277744, + "step": 994, + "train_runtime": 8077.1154, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 0.603030303030303, + "grad_norm": 0.017112495377659798, + "learning_rate": 9.936585237141792e-05, + "loss": 0.012689062394201756, + "num_input_tokens_seen": 16294120, + "step": 995, + "train_runtime": 8085.2294, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 0.6036363636363636, + "grad_norm": 0.0271944347769022, + "learning_rate": 9.936432479080961e-05, + "loss": 0.014213870279490948, + "num_input_tokens_seen": 16310496, + "step": 996, + "train_runtime": 8093.3362, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 0.6042424242424242, + "grad_norm": 0.012547393329441547, + "learning_rate": 9.936279538431106e-05, + "loss": 0.012523166835308075, + "num_input_tokens_seen": 16326872, + "step": 997, + "train_runtime": 8101.4449, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 0.6048484848484849, + "grad_norm": 0.02419351600110531, + "learning_rate": 9.936126415197884e-05, + "loss": 0.014308387413620949, + "num_input_tokens_seen": 16343248, + "step": 998, + "train_runtime": 8109.5556, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 0.6054545454545455, + "grad_norm": 0.015599401667714119, + "learning_rate": 9.935973109386958e-05, + "loss": 0.012808605097234249, + "num_input_tokens_seen": 16359624, + "step": 999, + "train_runtime": 8117.6633, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 0.6060606060606061, + "grad_norm": 0.021892806515097618, + "learning_rate": 9.935819621003999e-05, + "loss": 0.013939116150140762, + "num_input_tokens_seen": 16376000, + "step": 1000, + "train_runtime": 8125.7712, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 0.6066666666666667, + "grad_norm": 0.01672331802546978, + "learning_rate": 9.935665950054684e-05, + "loss": 0.014093529433012009, + "num_input_tokens_seen": 16392376, + "step": 1001, + "train_runtime": 8134.7177, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 0.6072727272727273, + "grad_norm": 0.009217355400323868, + "learning_rate": 9.9355120965447e-05, + "loss": 0.01290955115109682, + "num_input_tokens_seen": 16408752, + "step": 1002, + "train_runtime": 8142.8295, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 0.6078787878787879, + "grad_norm": 0.01524933148175478, + "learning_rate": 9.935358060479731e-05, + "loss": 0.012339223176240921, + "num_input_tokens_seen": 16425128, + "step": 1003, + "train_runtime": 8150.9365, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 0.6084848484848485, + "grad_norm": 0.02360517345368862, + "learning_rate": 9.935203841865482e-05, + "loss": 0.012756834737956524, + "num_input_tokens_seen": 16441504, + "step": 1004, + "train_runtime": 8159.0458, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 0.6090909090909091, + "grad_norm": 0.020947473123669624, + "learning_rate": 9.93504944070765e-05, + "loss": 0.012582367286086082, + "num_input_tokens_seen": 16457880, + "step": 1005, + "train_runtime": 8167.1644, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 0.6096969696969697, + "grad_norm": 0.01945319212973118, + "learning_rate": 9.934894857011953e-05, + "loss": 0.012788314372301102, + "num_input_tokens_seen": 16474256, + "step": 1006, + "train_runtime": 8175.2812, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 0.6103030303030303, + "grad_norm": 0.0219440758228302, + "learning_rate": 9.934740090784103e-05, + "loss": 0.013707922771573067, + "num_input_tokens_seen": 16490632, + "step": 1007, + "train_runtime": 8183.3988, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 0.610909090909091, + "grad_norm": 0.012798693962395191, + "learning_rate": 9.934585142029828e-05, + "loss": 0.013069421984255314, + "num_input_tokens_seen": 16507008, + "step": 1008, + "train_runtime": 8191.5179, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 0.6115151515151516, + "grad_norm": 0.012583008036017418, + "learning_rate": 9.934430010754861e-05, + "loss": 0.011966132558882236, + "num_input_tokens_seen": 16523384, + "step": 1009, + "train_runtime": 8199.6359, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 0.6121212121212121, + "grad_norm": 0.03669752925634384, + "learning_rate": 9.934274696964934e-05, + "loss": 0.014166103675961494, + "num_input_tokens_seen": 16539760, + "step": 1010, + "train_runtime": 8207.7511, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.6127272727272727, + "grad_norm": 0.019834555685520172, + "learning_rate": 9.934119200665795e-05, + "loss": 0.011456426233053207, + "num_input_tokens_seen": 16556136, + "step": 1011, + "train_runtime": 8215.8683, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 0.017150534316897392, + "learning_rate": 9.933963521863196e-05, + "loss": 0.012325924821197987, + "num_input_tokens_seen": 16572512, + "step": 1012, + "train_runtime": 8223.99, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 0.6139393939393939, + "grad_norm": 0.013030534610152245, + "learning_rate": 9.933807660562898e-05, + "loss": 0.012827505357563496, + "num_input_tokens_seen": 16588888, + "step": 1013, + "train_runtime": 8232.106, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 0.6145454545454545, + "grad_norm": 0.01751735992729664, + "learning_rate": 9.933651616770658e-05, + "loss": 0.012782123871147633, + "num_input_tokens_seen": 16605264, + "step": 1014, + "train_runtime": 8240.2294, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 0.6151515151515151, + "grad_norm": 0.013464527204632759, + "learning_rate": 9.933495390492256e-05, + "loss": 0.014123444445431232, + "num_input_tokens_seen": 16621640, + "step": 1015, + "train_runtime": 8248.3463, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 0.6157575757575757, + "grad_norm": 0.026679445058107376, + "learning_rate": 9.933338981733464e-05, + "loss": 0.012160470709204674, + "num_input_tokens_seen": 16638016, + "step": 1016, + "train_runtime": 8256.4635, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 0.6163636363636363, + "grad_norm": 0.010502724908292294, + "learning_rate": 9.933182390500073e-05, + "loss": 0.011820110492408276, + "num_input_tokens_seen": 16654392, + "step": 1017, + "train_runtime": 8264.5788, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 0.616969696969697, + "grad_norm": 0.013210924342274666, + "learning_rate": 9.93302561679787e-05, + "loss": 0.013029432855546474, + "num_input_tokens_seen": 16670768, + "step": 1018, + "train_runtime": 8272.6927, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.6175757575757576, + "grad_norm": 0.032258208841085434, + "learning_rate": 9.932868660632659e-05, + "loss": 0.012911350466310978, + "num_input_tokens_seen": 16687144, + "step": 1019, + "train_runtime": 8280.8162, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.6181818181818182, + "grad_norm": 0.03345981612801552, + "learning_rate": 9.932711522010241e-05, + "loss": 0.01444256491959095, + "num_input_tokens_seen": 16703520, + "step": 1020, + "train_runtime": 8288.934, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.6187878787878788, + "grad_norm": 0.023281559348106384, + "learning_rate": 9.932554200936429e-05, + "loss": 0.014297975227236748, + "num_input_tokens_seen": 16719896, + "step": 1021, + "train_runtime": 8297.0522, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 0.6193939393939394, + "grad_norm": 0.02298637479543686, + "learning_rate": 9.932396697417044e-05, + "loss": 0.012052800506353378, + "num_input_tokens_seen": 16736272, + "step": 1022, + "train_runtime": 8305.1688, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.62, + "grad_norm": 0.01204346027225256, + "learning_rate": 9.932239011457909e-05, + "loss": 0.012858795002102852, + "num_input_tokens_seen": 16752648, + "step": 1023, + "train_runtime": 8313.2898, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.6206060606060606, + "grad_norm": 0.018114762380719185, + "learning_rate": 9.93208114306486e-05, + "loss": 0.013215101324021816, + "num_input_tokens_seen": 16769024, + "step": 1024, + "train_runtime": 8321.4063, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 0.6212121212121212, + "grad_norm": 0.009015897288918495, + "learning_rate": 9.931923092243733e-05, + "loss": 0.013312953524291515, + "num_input_tokens_seen": 16785400, + "step": 1025, + "train_runtime": 8329.5303, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 0.6218181818181818, + "grad_norm": 0.011126898229122162, + "learning_rate": 9.931764859000375e-05, + "loss": 0.011524452827870846, + "num_input_tokens_seen": 16801776, + "step": 1026, + "train_runtime": 8337.647, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 0.6224242424242424, + "grad_norm": 0.021657567471265793, + "learning_rate": 9.93160644334064e-05, + "loss": 0.012531260028481483, + "num_input_tokens_seen": 16818152, + "step": 1027, + "train_runtime": 8345.7666, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 0.623030303030303, + "grad_norm": 0.05316740646958351, + "learning_rate": 9.931447845270388e-05, + "loss": 0.013248222880065441, + "num_input_tokens_seen": 16834528, + "step": 1028, + "train_runtime": 8353.8829, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 0.6236363636363637, + "grad_norm": 0.012917754240334034, + "learning_rate": 9.931289064795482e-05, + "loss": 0.013202149420976639, + "num_input_tokens_seen": 16850904, + "step": 1029, + "train_runtime": 8362.0006, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 0.6242424242424243, + "grad_norm": 0.021064553409814835, + "learning_rate": 9.931130101921795e-05, + "loss": 0.013943769969046116, + "num_input_tokens_seen": 16867280, + "step": 1030, + "train_runtime": 8370.1194, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.6248484848484849, + "grad_norm": 0.012005737982690334, + "learning_rate": 9.930970956655212e-05, + "loss": 0.012500936165452003, + "num_input_tokens_seen": 16883656, + "step": 1031, + "train_runtime": 8378.2369, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 0.6254545454545455, + "grad_norm": 0.02506149373948574, + "learning_rate": 9.930811629001613e-05, + "loss": 0.014318128116428852, + "num_input_tokens_seen": 16900032, + "step": 1032, + "train_runtime": 8386.3552, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 0.6260606060606061, + "grad_norm": 0.03320576995611191, + "learning_rate": 9.930652118966895e-05, + "loss": 0.010508203878998756, + "num_input_tokens_seen": 16916408, + "step": 1033, + "train_runtime": 8394.4718, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 0.03429649397730827, + "learning_rate": 9.93049242655696e-05, + "loss": 0.012183441780507565, + "num_input_tokens_seen": 16932784, + "step": 1034, + "train_runtime": 8402.5875, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 0.6272727272727273, + "grad_norm": 0.01607862487435341, + "learning_rate": 9.930332551777708e-05, + "loss": 0.013750139623880386, + "num_input_tokens_seen": 16949160, + "step": 1035, + "train_runtime": 8410.7043, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 0.6278787878787879, + "grad_norm": 0.01341179572045803, + "learning_rate": 9.930172494635057e-05, + "loss": 0.012538340874016285, + "num_input_tokens_seen": 16965536, + "step": 1036, + "train_runtime": 8418.8297, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 0.6284848484848485, + "grad_norm": 0.00997228641062975, + "learning_rate": 9.930012255134928e-05, + "loss": 0.012722784653306007, + "num_input_tokens_seen": 16981912, + "step": 1037, + "train_runtime": 8426.9482, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 0.6290909090909091, + "grad_norm": 0.00990308728069067, + "learning_rate": 9.929851833283245e-05, + "loss": 0.013942928053438663, + "num_input_tokens_seen": 16998288, + "step": 1038, + "train_runtime": 8435.0672, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 0.6296969696969696, + "grad_norm": 0.011313795112073421, + "learning_rate": 9.929691229085944e-05, + "loss": 0.011238223873078823, + "num_input_tokens_seen": 17014664, + "step": 1039, + "train_runtime": 8443.1862, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 0.6303030303030303, + "grad_norm": 0.010831150226294994, + "learning_rate": 9.929530442548965e-05, + "loss": 0.012601799331605434, + "num_input_tokens_seen": 17031040, + "step": 1040, + "train_runtime": 8451.3035, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 0.6309090909090909, + "grad_norm": 0.014783729799091816, + "learning_rate": 9.929369473678253e-05, + "loss": 0.013956460170447826, + "num_input_tokens_seen": 17047416, + "step": 1041, + "train_runtime": 8459.4295, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 0.6315151515151515, + "grad_norm": 0.01627667248249054, + "learning_rate": 9.929208322479764e-05, + "loss": 0.013232799246907234, + "num_input_tokens_seen": 17063792, + "step": 1042, + "train_runtime": 8467.5479, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 0.6321212121212121, + "grad_norm": 0.011055609211325645, + "learning_rate": 9.92904698895946e-05, + "loss": 0.01293270569294691, + "num_input_tokens_seen": 17080168, + "step": 1043, + "train_runtime": 8475.665, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 0.6327272727272727, + "grad_norm": 0.03507707267999649, + "learning_rate": 9.928885473123306e-05, + "loss": 0.012113180011510849, + "num_input_tokens_seen": 17096544, + "step": 1044, + "train_runtime": 8483.7839, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 0.01946045272052288, + "learning_rate": 9.928723774977275e-05, + "loss": 0.013142693787813187, + "num_input_tokens_seen": 17112920, + "step": 1045, + "train_runtime": 8491.9041, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 0.6339393939393939, + "grad_norm": 0.021705901250243187, + "learning_rate": 9.928561894527353e-05, + "loss": 0.012501654215157032, + "num_input_tokens_seen": 17129296, + "step": 1046, + "train_runtime": 8500.0295, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 0.6345454545454545, + "grad_norm": 0.019804542884230614, + "learning_rate": 9.928399831779523e-05, + "loss": 0.012758147902786732, + "num_input_tokens_seen": 17145672, + "step": 1047, + "train_runtime": 8508.1486, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 0.6351515151515151, + "grad_norm": 0.011929893866181374, + "learning_rate": 9.928237586739781e-05, + "loss": 0.013042271137237549, + "num_input_tokens_seen": 17162048, + "step": 1048, + "train_runtime": 8516.2673, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 0.6357575757575757, + "grad_norm": 0.028489001095294952, + "learning_rate": 9.928075159414128e-05, + "loss": 0.013056590221822262, + "num_input_tokens_seen": 17178424, + "step": 1049, + "train_runtime": 8524.3858, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 0.6363636363636364, + "grad_norm": 0.01078235823661089, + "learning_rate": 9.927912549808572e-05, + "loss": 0.012080740183591843, + "num_input_tokens_seen": 17194800, + "step": 1050, + "train_runtime": 8532.5029, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 0.636969696969697, + "grad_norm": 0.021545223891735077, + "learning_rate": 9.927749757929125e-05, + "loss": 0.015170791186392307, + "num_input_tokens_seen": 17211176, + "step": 1051, + "train_runtime": 8540.6203, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 0.6375757575757576, + "grad_norm": 0.021686149761080742, + "learning_rate": 9.927586783781814e-05, + "loss": 0.013388474471867085, + "num_input_tokens_seen": 17227552, + "step": 1052, + "train_runtime": 8548.7393, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 0.6381818181818182, + "grad_norm": 0.019198935478925705, + "learning_rate": 9.927423627372663e-05, + "loss": 0.013151840306818485, + "num_input_tokens_seen": 17243928, + "step": 1053, + "train_runtime": 8556.8572, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 0.6387878787878788, + "grad_norm": 0.026876596733927727, + "learning_rate": 9.927260288707707e-05, + "loss": 0.01568884216248989, + "num_input_tokens_seen": 17260304, + "step": 1054, + "train_runtime": 8564.9754, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 0.6393939393939394, + "grad_norm": 0.02315112017095089, + "learning_rate": 9.92709676779299e-05, + "loss": 0.013643411919474602, + "num_input_tokens_seen": 17276680, + "step": 1055, + "train_runtime": 8573.0936, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 0.64, + "grad_norm": 0.013450577855110168, + "learning_rate": 9.926933064634558e-05, + "loss": 0.011888994835317135, + "num_input_tokens_seen": 17293056, + "step": 1056, + "train_runtime": 8581.213, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 0.6406060606060606, + "grad_norm": 0.038361355662345886, + "learning_rate": 9.926769179238466e-05, + "loss": 0.01497360784560442, + "num_input_tokens_seen": 17309432, + "step": 1057, + "train_runtime": 8589.3331, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 0.6412121212121212, + "grad_norm": 0.019271399825811386, + "learning_rate": 9.926605111610776e-05, + "loss": 0.014056256040930748, + "num_input_tokens_seen": 17325808, + "step": 1058, + "train_runtime": 8597.4511, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 0.6418181818181818, + "grad_norm": 0.01557596493512392, + "learning_rate": 9.926440861757557e-05, + "loss": 0.012796062976121902, + "num_input_tokens_seen": 17342184, + "step": 1059, + "train_runtime": 8605.5697, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 0.6424242424242425, + "grad_norm": 0.005278696306049824, + "learning_rate": 9.926276429684886e-05, + "loss": 0.011402487754821777, + "num_input_tokens_seen": 17358560, + "step": 1060, + "train_runtime": 8613.6883, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 0.6430303030303031, + "grad_norm": 0.015694163739681244, + "learning_rate": 9.926111815398843e-05, + "loss": 0.013192391023039818, + "num_input_tokens_seen": 17374936, + "step": 1061, + "train_runtime": 8621.8068, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 0.6436363636363637, + "grad_norm": 0.01900624856352806, + "learning_rate": 9.925947018905516e-05, + "loss": 0.013219461776316166, + "num_input_tokens_seen": 17391312, + "step": 1062, + "train_runtime": 8629.9293, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 0.6442424242424243, + "grad_norm": 0.013446804136037827, + "learning_rate": 9.925782040211002e-05, + "loss": 0.011763139627873898, + "num_input_tokens_seen": 17407688, + "step": 1063, + "train_runtime": 8638.0493, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 0.6448484848484849, + "grad_norm": 0.01933007501065731, + "learning_rate": 9.925616879321404e-05, + "loss": 0.011931811459362507, + "num_input_tokens_seen": 17424064, + "step": 1064, + "train_runtime": 8646.1674, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 0.6454545454545455, + "grad_norm": 0.016764989122748375, + "learning_rate": 9.925451536242829e-05, + "loss": 0.013410956598818302, + "num_input_tokens_seen": 17440440, + "step": 1065, + "train_runtime": 8654.2855, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 0.6460606060606061, + "grad_norm": 0.019174639135599136, + "learning_rate": 9.925286010981394e-05, + "loss": 0.014691396616399288, + "num_input_tokens_seen": 17456816, + "step": 1066, + "train_runtime": 8662.4024, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 0.6466666666666666, + "grad_norm": 0.0077021801844239235, + "learning_rate": 9.925120303543219e-05, + "loss": 0.012529893778264523, + "num_input_tokens_seen": 17473192, + "step": 1067, + "train_runtime": 8670.5209, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 0.6472727272727272, + "grad_norm": 0.014966354705393314, + "learning_rate": 9.924954413934438e-05, + "loss": 0.013215701095759869, + "num_input_tokens_seen": 17489568, + "step": 1068, + "train_runtime": 8678.6394, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 0.6478787878787878, + "grad_norm": 0.020852232351899147, + "learning_rate": 9.924788342161182e-05, + "loss": 0.013355967588722706, + "num_input_tokens_seen": 17505944, + "step": 1069, + "train_runtime": 8686.7585, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 0.6484848484848484, + "grad_norm": 0.017107227817177773, + "learning_rate": 9.924622088229597e-05, + "loss": 0.014044157229363918, + "num_input_tokens_seen": 17522320, + "step": 1070, + "train_runtime": 8694.8787, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 0.649090909090909, + "grad_norm": 0.015282119624316692, + "learning_rate": 9.924455652145831e-05, + "loss": 0.01387142762541771, + "num_input_tokens_seen": 17538696, + "step": 1071, + "train_runtime": 8702.997, + "train_tokens_per_second": 2015.248 + }, + { + "epoch": 0.6496969696969697, + "grad_norm": 0.010007917881011963, + "learning_rate": 9.92428903391604e-05, + "loss": 0.01257625874131918, + "num_input_tokens_seen": 17555072, + "step": 1072, + "train_runtime": 8711.1152, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 0.6503030303030303, + "grad_norm": 0.009446706622838974, + "learning_rate": 9.924122233546386e-05, + "loss": 0.013552306219935417, + "num_input_tokens_seen": 17571448, + "step": 1073, + "train_runtime": 8719.2328, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 0.6509090909090909, + "grad_norm": 0.012225381098687649, + "learning_rate": 9.923955251043042e-05, + "loss": 0.011776247061789036, + "num_input_tokens_seen": 17587824, + "step": 1074, + "train_runtime": 8727.35, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 0.6515151515151515, + "grad_norm": 0.020964186638593674, + "learning_rate": 9.923788086412182e-05, + "loss": 0.012502472847700119, + "num_input_tokens_seen": 17604200, + "step": 1075, + "train_runtime": 8735.4695, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 0.6521212121212121, + "grad_norm": 0.017575940117239952, + "learning_rate": 9.923620739659989e-05, + "loss": 0.012096179649233818, + "num_input_tokens_seen": 17620576, + "step": 1076, + "train_runtime": 8743.5854, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 0.6527272727272727, + "grad_norm": 0.013330096378922462, + "learning_rate": 9.923453210792653e-05, + "loss": 0.013803805224597454, + "num_input_tokens_seen": 17636952, + "step": 1077, + "train_runtime": 8751.7034, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 0.011349702253937721, + "learning_rate": 9.92328549981637e-05, + "loss": 0.013793877325952053, + "num_input_tokens_seen": 17653328, + "step": 1078, + "train_runtime": 8759.8296, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 0.6539393939393939, + "grad_norm": 0.015959061682224274, + "learning_rate": 9.923117606737346e-05, + "loss": 0.013326899148523808, + "num_input_tokens_seen": 17669704, + "step": 1079, + "train_runtime": 8767.9523, + "train_tokens_per_second": 2015.26 + }, + { + "epoch": 0.6545454545454545, + "grad_norm": 0.014492125250399113, + "learning_rate": 9.922949531561788e-05, + "loss": 0.01288958266377449, + "num_input_tokens_seen": 17686080, + "step": 1080, + "train_runtime": 8776.0735, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 0.6551515151515152, + "grad_norm": 0.013345365412533283, + "learning_rate": 9.922781274295913e-05, + "loss": 0.012366179376840591, + "num_input_tokens_seen": 17702456, + "step": 1081, + "train_runtime": 8784.1923, + "train_tokens_per_second": 2015.263 + }, + { + "epoch": 0.6557575757575758, + "grad_norm": 0.010763085447251797, + "learning_rate": 9.922612834945947e-05, + "loss": 0.01264217309653759, + "num_input_tokens_seen": 17718832, + "step": 1082, + "train_runtime": 8792.3102, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 0.6563636363636364, + "grad_norm": 0.011818567290902138, + "learning_rate": 9.922444213518117e-05, + "loss": 0.013193395920097828, + "num_input_tokens_seen": 17735208, + "step": 1083, + "train_runtime": 8800.4295, + "train_tokens_per_second": 2015.266 + }, + { + "epoch": 0.656969696969697, + "grad_norm": 0.010724381543695927, + "learning_rate": 9.922275410018663e-05, + "loss": 0.012857016175985336, + "num_input_tokens_seen": 17751584, + "step": 1084, + "train_runtime": 8808.5474, + "train_tokens_per_second": 2015.268 + }, + { + "epoch": 0.6575757575757576, + "grad_norm": 0.017108984291553497, + "learning_rate": 9.922106424453826e-05, + "loss": 0.013113675639033318, + "num_input_tokens_seen": 17767960, + "step": 1085, + "train_runtime": 8816.6647, + "train_tokens_per_second": 2015.27 + }, + { + "epoch": 0.6581818181818182, + "grad_norm": 0.022697484120726585, + "learning_rate": 9.921937256829859e-05, + "loss": 0.012546958401799202, + "num_input_tokens_seen": 17784336, + "step": 1086, + "train_runtime": 8824.7847, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 0.6587878787878788, + "grad_norm": 0.014008583500981331, + "learning_rate": 9.921767907153016e-05, + "loss": 0.011740295216441154, + "num_input_tokens_seen": 17800712, + "step": 1087, + "train_runtime": 8832.904, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 0.6593939393939394, + "grad_norm": 0.011233743280172348, + "learning_rate": 9.921598375429564e-05, + "loss": 0.011731310747563839, + "num_input_tokens_seen": 17817088, + "step": 1088, + "train_runtime": 8841.0299, + "train_tokens_per_second": 2015.273 + }, + { + "epoch": 0.66, + "grad_norm": 0.011883188039064407, + "learning_rate": 9.921428661665772e-05, + "loss": 0.012650273740291595, + "num_input_tokens_seen": 17833464, + "step": 1089, + "train_runtime": 8849.1483, + "train_tokens_per_second": 2015.275 + }, + { + "epoch": 0.6606060606060606, + "grad_norm": 0.010079750791192055, + "learning_rate": 9.921258765867919e-05, + "loss": 0.012131286785006523, + "num_input_tokens_seen": 17849840, + "step": 1090, + "train_runtime": 8857.2661, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.6612121212121213, + "grad_norm": 0.013724222779273987, + "learning_rate": 9.921088688042287e-05, + "loss": 0.012973928824067116, + "num_input_tokens_seen": 17866216, + "step": 1091, + "train_runtime": 8865.3859, + "train_tokens_per_second": 2015.278 + }, + { + "epoch": 0.6618181818181819, + "grad_norm": 0.019831640645861626, + "learning_rate": 9.920918428195168e-05, + "loss": 0.01297835074365139, + "num_input_tokens_seen": 17882592, + "step": 1092, + "train_runtime": 8873.5052, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 0.6624242424242425, + "grad_norm": 0.011757400818169117, + "learning_rate": 9.920747986332858e-05, + "loss": 0.013069117441773415, + "num_input_tokens_seen": 17898968, + "step": 1093, + "train_runtime": 8881.6295, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 0.6630303030303031, + "grad_norm": 0.013741742819547653, + "learning_rate": 9.920577362461665e-05, + "loss": 0.013204855844378471, + "num_input_tokens_seen": 17915344, + "step": 1094, + "train_runtime": 8889.749, + "train_tokens_per_second": 2015.281 + }, + { + "epoch": 0.6636363636363637, + "grad_norm": 0.02447706274688244, + "learning_rate": 9.920406556587897e-05, + "loss": 0.011999960988759995, + "num_input_tokens_seen": 17931720, + "step": 1095, + "train_runtime": 8897.8668, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 0.6642424242424242, + "grad_norm": 0.03095782734453678, + "learning_rate": 9.920235568717873e-05, + "loss": 0.01361205242574215, + "num_input_tokens_seen": 17948096, + "step": 1096, + "train_runtime": 8905.9871, + "train_tokens_per_second": 2015.284 + }, + { + "epoch": 0.6648484848484848, + "grad_norm": 0.037076305598020554, + "learning_rate": 9.920064398857916e-05, + "loss": 0.012342737056314945, + "num_input_tokens_seen": 17964472, + "step": 1097, + "train_runtime": 8914.1084, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 0.6654545454545454, + "grad_norm": 0.053048014640808105, + "learning_rate": 9.91989304701436e-05, + "loss": 0.012850755825638771, + "num_input_tokens_seen": 17980848, + "step": 1098, + "train_runtime": 8922.229, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 0.666060606060606, + "grad_norm": 0.018742846325039864, + "learning_rate": 9.919721513193538e-05, + "loss": 0.012020561844110489, + "num_input_tokens_seen": 17997224, + "step": 1099, + "train_runtime": 8930.3477, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 0.013778687454760075, + "learning_rate": 9.919549797401802e-05, + "loss": 0.014269824139773846, + "num_input_tokens_seen": 18013600, + "step": 1100, + "train_runtime": 8938.4671, + "train_tokens_per_second": 2015.29 + }, + { + "epoch": 0.6672727272727272, + "grad_norm": 0.06041925400495529, + "learning_rate": 9.919377899645497e-05, + "loss": 0.013500120490789413, + "num_input_tokens_seen": 18029976, + "step": 1101, + "train_runtime": 8947.5154, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 0.6678787878787878, + "grad_norm": 0.006662312895059586, + "learning_rate": 9.919205819930983e-05, + "loss": 0.011903712525963783, + "num_input_tokens_seen": 18046352, + "step": 1102, + "train_runtime": 8955.6388, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 0.6684848484848485, + "grad_norm": 0.014133021235466003, + "learning_rate": 9.919033558264627e-05, + "loss": 0.013043178245425224, + "num_input_tokens_seen": 18062728, + "step": 1103, + "train_runtime": 8963.7612, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 0.6690909090909091, + "grad_norm": 0.018031738698482513, + "learning_rate": 9.918861114652798e-05, + "loss": 0.012816919945180416, + "num_input_tokens_seen": 18079104, + "step": 1104, + "train_runtime": 8971.8826, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 0.6696969696969697, + "grad_norm": 0.030864031985402107, + "learning_rate": 9.918688489101875e-05, + "loss": 0.011915095150470734, + "num_input_tokens_seen": 18095480, + "step": 1105, + "train_runtime": 8980.0063, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 0.6703030303030303, + "grad_norm": 0.859399676322937, + "learning_rate": 9.918515681618246e-05, + "loss": 0.014253467321395874, + "num_input_tokens_seen": 18111856, + "step": 1106, + "train_runtime": 8988.1298, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 0.6709090909090909, + "grad_norm": 0.009849797002971172, + "learning_rate": 9.918342692208297e-05, + "loss": 0.012211693450808525, + "num_input_tokens_seen": 18128232, + "step": 1107, + "train_runtime": 8996.2594, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 0.6715151515151515, + "grad_norm": 0.008677136152982712, + "learning_rate": 9.918169520878432e-05, + "loss": 0.013050990179181099, + "num_input_tokens_seen": 18144608, + "step": 1108, + "train_runtime": 9004.3806, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 0.6721212121212121, + "grad_norm": 0.020974334329366684, + "learning_rate": 9.917996167635053e-05, + "loss": 0.013656461611390114, + "num_input_tokens_seen": 18160984, + "step": 1109, + "train_runtime": 9012.5058, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 0.6727272727272727, + "grad_norm": 0.013642716221511364, + "learning_rate": 9.917822632484575e-05, + "loss": 0.012185771018266678, + "num_input_tokens_seen": 18177360, + "step": 1110, + "train_runtime": 9020.6295, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 0.6733333333333333, + "grad_norm": 0.01303725503385067, + "learning_rate": 9.917648915433413e-05, + "loss": 0.012668903917074203, + "num_input_tokens_seen": 18193736, + "step": 1111, + "train_runtime": 9028.755, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 0.673939393939394, + "grad_norm": 0.02112429402768612, + "learning_rate": 9.917475016487993e-05, + "loss": 0.014089099131524563, + "num_input_tokens_seen": 18210112, + "step": 1112, + "train_runtime": 9036.8746, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 0.6745454545454546, + "grad_norm": 0.016523541882634163, + "learning_rate": 9.917300935654751e-05, + "loss": 0.012728005647659302, + "num_input_tokens_seen": 18226488, + "step": 1113, + "train_runtime": 9044.9946, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 0.6751515151515152, + "grad_norm": 0.0112396739423275, + "learning_rate": 9.917126672940124e-05, + "loss": 0.013019783422350883, + "num_input_tokens_seen": 18242864, + "step": 1114, + "train_runtime": 9053.1208, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 0.6757575757575758, + "grad_norm": 0.21001896262168884, + "learning_rate": 9.916952228350556e-05, + "loss": 0.019040443003177643, + "num_input_tokens_seen": 18259240, + "step": 1115, + "train_runtime": 9061.2411, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 0.6763636363636364, + "grad_norm": 0.015162148512899876, + "learning_rate": 9.916777601892499e-05, + "loss": 0.011509026400744915, + "num_input_tokens_seen": 18275616, + "step": 1116, + "train_runtime": 9069.361, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 0.676969696969697, + "grad_norm": 0.018534110859036446, + "learning_rate": 9.916602793572415e-05, + "loss": 0.012472787871956825, + "num_input_tokens_seen": 18291992, + "step": 1117, + "train_runtime": 9077.4851, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 0.6775757575757576, + "grad_norm": 0.08402104675769806, + "learning_rate": 9.916427803396769e-05, + "loss": 0.014569929800927639, + "num_input_tokens_seen": 18308368, + "step": 1118, + "train_runtime": 9085.6102, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 0.6781818181818182, + "grad_norm": 0.018771981820464134, + "learning_rate": 9.91625263137203e-05, + "loss": 0.011995847336947918, + "num_input_tokens_seen": 18324744, + "step": 1119, + "train_runtime": 9093.7324, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 0.6787878787878788, + "grad_norm": 0.03660675883293152, + "learning_rate": 9.916077277504683e-05, + "loss": 0.013902310281991959, + "num_input_tokens_seen": 18341120, + "step": 1120, + "train_runtime": 9101.8526, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 0.6793939393939394, + "grad_norm": 0.02395397052168846, + "learning_rate": 9.91590174180121e-05, + "loss": 0.012367844581604004, + "num_input_tokens_seen": 18357496, + "step": 1121, + "train_runtime": 9109.9726, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.68, + "grad_norm": 0.019227512180805206, + "learning_rate": 9.915726024268104e-05, + "loss": 0.012134227901697159, + "num_input_tokens_seen": 18373872, + "step": 1122, + "train_runtime": 9118.0998, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.6806060606060607, + "grad_norm": 0.01857166923582554, + "learning_rate": 9.915550124911866e-05, + "loss": 0.013478003442287445, + "num_input_tokens_seen": 18390248, + "step": 1123, + "train_runtime": 9126.2293, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 0.6812121212121212, + "grad_norm": 0.04824969545006752, + "learning_rate": 9.915374043739003e-05, + "loss": 0.012269456870853901, + "num_input_tokens_seen": 18406624, + "step": 1124, + "train_runtime": 9134.35, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.6818181818181818, + "grad_norm": 3.1688060760498047, + "learning_rate": 9.915197780756025e-05, + "loss": 0.02297493815422058, + "num_input_tokens_seen": 18423000, + "step": 1125, + "train_runtime": 9142.4746, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 0.6824242424242424, + "grad_norm": 13.637248992919922, + "learning_rate": 9.915021335969452e-05, + "loss": 0.03535247966647148, + "num_input_tokens_seen": 18439376, + "step": 1126, + "train_runtime": 9150.5959, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 0.683030303030303, + "grad_norm": 0.018440239131450653, + "learning_rate": 9.914844709385813e-05, + "loss": 0.014308687299489975, + "num_input_tokens_seen": 18455752, + "step": 1127, + "train_runtime": 9158.7198, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 0.6836363636363636, + "grad_norm": 0.017091959714889526, + "learning_rate": 9.914667901011638e-05, + "loss": 0.012615025043487549, + "num_input_tokens_seen": 18472128, + "step": 1128, + "train_runtime": 9166.8428, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 0.6842424242424242, + "grad_norm": 0.040168218314647675, + "learning_rate": 9.91449091085347e-05, + "loss": 0.013721957802772522, + "num_input_tokens_seen": 18488504, + "step": 1129, + "train_runtime": 9174.9646, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 0.6848484848484848, + "grad_norm": 0.01958506926894188, + "learning_rate": 9.914313738917853e-05, + "loss": 0.015058807097375393, + "num_input_tokens_seen": 18504880, + "step": 1130, + "train_runtime": 9183.0766, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 0.6854545454545454, + "grad_norm": 0.041311051696538925, + "learning_rate": 9.914136385211341e-05, + "loss": 0.011465203016996384, + "num_input_tokens_seen": 18521256, + "step": 1131, + "train_runtime": 9191.1874, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.686060606060606, + "grad_norm": 0.029558753594756126, + "learning_rate": 9.913958849740493e-05, + "loss": 0.013997621834278107, + "num_input_tokens_seen": 18537632, + "step": 1132, + "train_runtime": 9199.2987, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 0.6866666666666666, + "grad_norm": 0.01560160145163536, + "learning_rate": 9.913781132511877e-05, + "loss": 0.01135623175650835, + "num_input_tokens_seen": 18554008, + "step": 1133, + "train_runtime": 9207.4109, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 0.6872727272727273, + "grad_norm": 0.026331115514039993, + "learning_rate": 9.913603233532067e-05, + "loss": 0.014213286340236664, + "num_input_tokens_seen": 18570384, + "step": 1134, + "train_runtime": 9215.5295, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 0.6878787878787879, + "grad_norm": 0.012758780270814896, + "learning_rate": 9.913425152807642e-05, + "loss": 0.013095496222376823, + "num_input_tokens_seen": 18586760, + "step": 1135, + "train_runtime": 9223.6386, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 0.6884848484848485, + "grad_norm": 0.02692464366555214, + "learning_rate": 9.913246890345189e-05, + "loss": 0.014479240402579308, + "num_input_tokens_seen": 18603136, + "step": 1136, + "train_runtime": 9231.7499, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 0.6890909090909091, + "grad_norm": 0.023674434050917625, + "learning_rate": 9.913068446151302e-05, + "loss": 0.01468647737056017, + "num_input_tokens_seen": 18619512, + "step": 1137, + "train_runtime": 9239.8624, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 0.6896969696969697, + "grad_norm": 0.043436527252197266, + "learning_rate": 9.912889820232578e-05, + "loss": 0.013666333630681038, + "num_input_tokens_seen": 18635888, + "step": 1138, + "train_runtime": 9247.9735, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 0.6903030303030303, + "grad_norm": 0.010912930592894554, + "learning_rate": 9.91271101259563e-05, + "loss": 0.013306580483913422, + "num_input_tokens_seen": 18652264, + "step": 1139, + "train_runtime": 9256.0853, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 0.6909090909090909, + "grad_norm": 0.027857549488544464, + "learning_rate": 9.912532023247068e-05, + "loss": 0.01315208338201046, + "num_input_tokens_seen": 18668640, + "step": 1140, + "train_runtime": 9264.193, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.6915151515151515, + "grad_norm": 0.014686026610434055, + "learning_rate": 9.912352852193514e-05, + "loss": 0.012413710355758667, + "num_input_tokens_seen": 18685016, + "step": 1141, + "train_runtime": 9272.3053, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 0.6921212121212121, + "grad_norm": 0.16849519312381744, + "learning_rate": 9.912173499441593e-05, + "loss": 0.013621876947581768, + "num_input_tokens_seen": 18701392, + "step": 1142, + "train_runtime": 9280.4143, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 0.6927272727272727, + "grad_norm": 0.025766436010599136, + "learning_rate": 9.91199396499794e-05, + "loss": 0.014693841338157654, + "num_input_tokens_seen": 18717768, + "step": 1143, + "train_runtime": 9288.5292, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 0.03636888787150383, + "learning_rate": 9.911814248869198e-05, + "loss": 0.015230114571750164, + "num_input_tokens_seen": 18734144, + "step": 1144, + "train_runtime": 9296.6386, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 0.693939393939394, + "grad_norm": 0.02268008515238762, + "learning_rate": 9.91163435106201e-05, + "loss": 0.014965626411139965, + "num_input_tokens_seen": 18750520, + "step": 1145, + "train_runtime": 9304.7495, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.6945454545454546, + "grad_norm": 0.02825307659804821, + "learning_rate": 9.911454271583034e-05, + "loss": 0.013202480971813202, + "num_input_tokens_seen": 18766896, + "step": 1146, + "train_runtime": 9312.8608, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.6951515151515152, + "grad_norm": 0.0277263056486845, + "learning_rate": 9.911274010438928e-05, + "loss": 0.014979338273406029, + "num_input_tokens_seen": 18783272, + "step": 1147, + "train_runtime": 9320.9729, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 0.6957575757575758, + "grad_norm": 0.03655631095170975, + "learning_rate": 9.91109356763636e-05, + "loss": 0.01276368834078312, + "num_input_tokens_seen": 18799648, + "step": 1148, + "train_runtime": 9329.0854, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.6963636363636364, + "grad_norm": 0.017650572583079338, + "learning_rate": 9.910912943182007e-05, + "loss": 0.013225570321083069, + "num_input_tokens_seen": 18816024, + "step": 1149, + "train_runtime": 9337.1951, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 0.696969696969697, + "grad_norm": 0.029844503849744797, + "learning_rate": 9.910732137082547e-05, + "loss": 0.012919209897518158, + "num_input_tokens_seen": 18832400, + "step": 1150, + "train_runtime": 9345.3036, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 0.6975757575757576, + "grad_norm": 0.022128146141767502, + "learning_rate": 9.910551149344669e-05, + "loss": 0.013780666515231133, + "num_input_tokens_seen": 18848776, + "step": 1151, + "train_runtime": 9353.4141, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 0.6981818181818182, + "grad_norm": 0.02025616727769375, + "learning_rate": 9.910369979975065e-05, + "loss": 0.014601497910916805, + "num_input_tokens_seen": 18865152, + "step": 1152, + "train_runtime": 9361.5308, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.6987878787878787, + "grad_norm": 0.01940023899078369, + "learning_rate": 9.910188628980439e-05, + "loss": 0.01339776162058115, + "num_input_tokens_seen": 18881528, + "step": 1153, + "train_runtime": 9369.6441, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 0.6993939393939393, + "grad_norm": 0.022027693688869476, + "learning_rate": 9.910007096367497e-05, + "loss": 0.01376222725957632, + "num_input_tokens_seen": 18897904, + "step": 1154, + "train_runtime": 9377.7542, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 0.7, + "grad_norm": 0.006554140709340572, + "learning_rate": 9.909825382142955e-05, + "loss": 0.012087719514966011, + "num_input_tokens_seen": 18914280, + "step": 1155, + "train_runtime": 9385.8634, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 0.7006060606060606, + "grad_norm": 0.011244562454521656, + "learning_rate": 9.909643486313533e-05, + "loss": 0.011743160896003246, + "num_input_tokens_seen": 18930656, + "step": 1156, + "train_runtime": 9393.9756, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 0.7012121212121212, + "grad_norm": 0.015718987211585045, + "learning_rate": 9.909461408885961e-05, + "loss": 0.015649257227778435, + "num_input_tokens_seen": 18947032, + "step": 1157, + "train_runtime": 9402.0879, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 0.7018181818181818, + "grad_norm": 0.014524322003126144, + "learning_rate": 9.909279149866971e-05, + "loss": 0.012584694661200047, + "num_input_tokens_seen": 18963408, + "step": 1158, + "train_runtime": 9410.1978, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 0.7024242424242424, + "grad_norm": 0.01179551426321268, + "learning_rate": 9.909096709263305e-05, + "loss": 0.01177270244807005, + "num_input_tokens_seen": 18979784, + "step": 1159, + "train_runtime": 9418.3067, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 0.703030303030303, + "grad_norm": 0.3294766843318939, + "learning_rate": 9.908914087081714e-05, + "loss": 0.013622680678963661, + "num_input_tokens_seen": 18996160, + "step": 1160, + "train_runtime": 9426.418, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 0.7036363636363636, + "grad_norm": 0.019340241327881813, + "learning_rate": 9.908731283328949e-05, + "loss": 0.013781043700873852, + "num_input_tokens_seen": 19012536, + "step": 1161, + "train_runtime": 9434.5376, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 0.7042424242424242, + "grad_norm": 0.31950604915618896, + "learning_rate": 9.908548298011774e-05, + "loss": 0.013624520972371101, + "num_input_tokens_seen": 19028912, + "step": 1162, + "train_runtime": 9442.6474, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 0.7048484848484848, + "grad_norm": 0.01044798456132412, + "learning_rate": 9.908365131136957e-05, + "loss": 0.013481276109814644, + "num_input_tokens_seen": 19045288, + "step": 1163, + "train_runtime": 9450.7603, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 0.7054545454545454, + "grad_norm": 0.08119679987430573, + "learning_rate": 9.90818178271127e-05, + "loss": 0.01282893493771553, + "num_input_tokens_seen": 19061664, + "step": 1164, + "train_runtime": 9458.8734, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 0.706060606060606, + "grad_norm": 0.013537311926484108, + "learning_rate": 9.907998252741498e-05, + "loss": 0.013240115717053413, + "num_input_tokens_seen": 19078040, + "step": 1165, + "train_runtime": 9466.9849, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 0.015183590352535248, + "learning_rate": 9.907814541234429e-05, + "loss": 0.01356966607272625, + "num_input_tokens_seen": 19094416, + "step": 1166, + "train_runtime": 9475.0931, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 0.7072727272727273, + "grad_norm": 0.01905563659965992, + "learning_rate": 9.907630648196857e-05, + "loss": 0.011865122243762016, + "num_input_tokens_seen": 19110792, + "step": 1167, + "train_runtime": 9483.2064, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 0.7078787878787879, + "grad_norm": 0.01771489344537258, + "learning_rate": 9.907446573635586e-05, + "loss": 0.014323254115879536, + "num_input_tokens_seen": 19127168, + "step": 1168, + "train_runtime": 9491.3179, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 0.7084848484848485, + "grad_norm": 0.013392560184001923, + "learning_rate": 9.907262317557422e-05, + "loss": 0.014154933393001556, + "num_input_tokens_seen": 19143544, + "step": 1169, + "train_runtime": 9499.4298, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 0.7090909090909091, + "grad_norm": 0.01917138509452343, + "learning_rate": 9.907077879969182e-05, + "loss": 0.014620376750826836, + "num_input_tokens_seen": 19159920, + "step": 1170, + "train_runtime": 9507.5424, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 0.7096969696969697, + "grad_norm": 0.023388303816318512, + "learning_rate": 9.906893260877686e-05, + "loss": 0.013931838795542717, + "num_input_tokens_seen": 19176296, + "step": 1171, + "train_runtime": 9515.6549, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 0.7103030303030303, + "grad_norm": 0.014943883754312992, + "learning_rate": 9.906708460289765e-05, + "loss": 0.012756659649312496, + "num_input_tokens_seen": 19192672, + "step": 1172, + "train_runtime": 9523.7631, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 0.7109090909090909, + "grad_norm": 0.011030408553779125, + "learning_rate": 9.906523478212252e-05, + "loss": 0.01190275140106678, + "num_input_tokens_seen": 19209048, + "step": 1173, + "train_runtime": 9531.8735, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 0.7115151515151515, + "grad_norm": 0.008161013014614582, + "learning_rate": 9.906338314651993e-05, + "loss": 0.012577732093632221, + "num_input_tokens_seen": 19225424, + "step": 1174, + "train_runtime": 9539.9859, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 0.7121212121212122, + "grad_norm": 0.02119288221001625, + "learning_rate": 9.906152969615833e-05, + "loss": 0.012449773959815502, + "num_input_tokens_seen": 19241800, + "step": 1175, + "train_runtime": 9548.0982, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 0.7127272727272728, + "grad_norm": 0.017582163214683533, + "learning_rate": 9.90596744311063e-05, + "loss": 0.011529134586453438, + "num_input_tokens_seen": 19258176, + "step": 1176, + "train_runtime": 9556.2087, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 0.7133333333333334, + "grad_norm": 0.04412311315536499, + "learning_rate": 9.905781735143245e-05, + "loss": 0.014292292296886444, + "num_input_tokens_seen": 19274552, + "step": 1177, + "train_runtime": 9564.3204, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 0.713939393939394, + "grad_norm": 0.07766410708427429, + "learning_rate": 9.905595845720545e-05, + "loss": 0.011792981065809727, + "num_input_tokens_seen": 19290928, + "step": 1178, + "train_runtime": 9572.4335, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 0.7145454545454546, + "grad_norm": 0.020279264077544212, + "learning_rate": 9.90540977484941e-05, + "loss": 0.014193961396813393, + "num_input_tokens_seen": 19307304, + "step": 1179, + "train_runtime": 9580.5444, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 0.7151515151515152, + "grad_norm": 0.023957345634698868, + "learning_rate": 9.905223522536719e-05, + "loss": 0.01391246635466814, + "num_input_tokens_seen": 19323680, + "step": 1180, + "train_runtime": 9588.6548, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 0.7157575757575757, + "grad_norm": 0.02165958844125271, + "learning_rate": 9.905037088789363e-05, + "loss": 0.014714146964251995, + "num_input_tokens_seen": 19340056, + "step": 1181, + "train_runtime": 9596.7692, + "train_tokens_per_second": 2015.267 + }, + { + "epoch": 0.7163636363636363, + "grad_norm": 0.014883043244481087, + "learning_rate": 9.904850473614237e-05, + "loss": 0.013630779460072517, + "num_input_tokens_seen": 19356432, + "step": 1182, + "train_runtime": 9604.8799, + "train_tokens_per_second": 2015.271 + }, + { + "epoch": 0.7169696969696969, + "grad_norm": 0.012120597995817661, + "learning_rate": 9.904663677018245e-05, + "loss": 0.013401714153587818, + "num_input_tokens_seen": 19372808, + "step": 1183, + "train_runtime": 9612.9913, + "train_tokens_per_second": 2015.274 + }, + { + "epoch": 0.7175757575757575, + "grad_norm": 0.024704404175281525, + "learning_rate": 9.904476699008293e-05, + "loss": 0.015781283378601074, + "num_input_tokens_seen": 19389184, + "step": 1184, + "train_runtime": 9621.1054, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.7181818181818181, + "grad_norm": 0.015950346365571022, + "learning_rate": 9.9042895395913e-05, + "loss": 0.012905421666800976, + "num_input_tokens_seen": 19405560, + "step": 1185, + "train_runtime": 9629.2186, + "train_tokens_per_second": 2015.279 + }, + { + "epoch": 0.7187878787878788, + "grad_norm": 0.021412916481494904, + "learning_rate": 9.904102198774188e-05, + "loss": 0.012717105448246002, + "num_input_tokens_seen": 19421936, + "step": 1186, + "train_runtime": 9637.3311, + "train_tokens_per_second": 2015.282 + }, + { + "epoch": 0.7193939393939394, + "grad_norm": 0.024673737585544586, + "learning_rate": 9.903914676563885e-05, + "loss": 0.012580260634422302, + "num_input_tokens_seen": 19438312, + "step": 1187, + "train_runtime": 9645.4427, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 0.72, + "grad_norm": 0.07743503898382187, + "learning_rate": 9.90372697296733e-05, + "loss": 0.013859845697879791, + "num_input_tokens_seen": 19454688, + "step": 1188, + "train_runtime": 9653.5584, + "train_tokens_per_second": 2015.287 + }, + { + "epoch": 0.7206060606060606, + "grad_norm": 0.014397671446204185, + "learning_rate": 9.903539087991462e-05, + "loss": 0.013244936242699623, + "num_input_tokens_seen": 19471064, + "step": 1189, + "train_runtime": 9661.6716, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 0.7212121212121212, + "grad_norm": 0.027382057160139084, + "learning_rate": 9.903351021643233e-05, + "loss": 0.014433873817324638, + "num_input_tokens_seen": 19487440, + "step": 1190, + "train_runtime": 9669.7828, + "train_tokens_per_second": 2015.292 + }, + { + "epoch": 0.7218181818181818, + "grad_norm": 0.013371971435844898, + "learning_rate": 9.903162773929599e-05, + "loss": 0.014319634065032005, + "num_input_tokens_seen": 19503816, + "step": 1191, + "train_runtime": 9677.8954, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 0.7224242424242424, + "grad_norm": 0.02415373921394348, + "learning_rate": 9.902974344857521e-05, + "loss": 0.01522553525865078, + "num_input_tokens_seen": 19520192, + "step": 1192, + "train_runtime": 9686.0046, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 0.723030303030303, + "grad_norm": 0.013075731694698334, + "learning_rate": 9.902785734433971e-05, + "loss": 0.012145644053816795, + "num_input_tokens_seen": 19536568, + "step": 1193, + "train_runtime": 9694.1175, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 0.7236363636363636, + "grad_norm": 0.02217678166925907, + "learning_rate": 9.902596942665925e-05, + "loss": 0.013490047305822372, + "num_input_tokens_seen": 19552944, + "step": 1194, + "train_runtime": 9702.2306, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 0.7242424242424242, + "grad_norm": 0.014989197254180908, + "learning_rate": 9.902407969560364e-05, + "loss": 0.015374877490103245, + "num_input_tokens_seen": 19569320, + "step": 1195, + "train_runtime": 9710.3384, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 0.7248484848484849, + "grad_norm": 0.010880461893975735, + "learning_rate": 9.90221881512428e-05, + "loss": 0.010911534540355206, + "num_input_tokens_seen": 19585696, + "step": 1196, + "train_runtime": 9718.4475, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.7254545454545455, + "grad_norm": 0.0177223589271307, + "learning_rate": 9.90202947936467e-05, + "loss": 0.01328328251838684, + "num_input_tokens_seen": 19602072, + "step": 1197, + "train_runtime": 9726.5574, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 0.7260606060606061, + "grad_norm": 0.015080858021974564, + "learning_rate": 9.901839962288533e-05, + "loss": 0.013248666189610958, + "num_input_tokens_seen": 19618448, + "step": 1198, + "train_runtime": 9734.6668, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 0.7266666666666667, + "grad_norm": 0.01892446167767048, + "learning_rate": 9.901650263902884e-05, + "loss": 0.012533879838883877, + "num_input_tokens_seen": 19634824, + "step": 1199, + "train_runtime": 9742.776, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 0.7272727272727273, + "grad_norm": 0.0085715651512146, + "learning_rate": 9.901460384214736e-05, + "loss": 0.011274173855781555, + "num_input_tokens_seen": 19651200, + "step": 1200, + "train_runtime": 9750.8874, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 0.7278787878787879, + "grad_norm": 0.030662082135677338, + "learning_rate": 9.901270323231115e-05, + "loss": 0.012586663477122784, + "num_input_tokens_seen": 19667576, + "step": 1201, + "train_runtime": 9759.9377, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 0.7284848484848485, + "grad_norm": 0.012625769712030888, + "learning_rate": 9.901080080959048e-05, + "loss": 0.013224436901509762, + "num_input_tokens_seen": 19683952, + "step": 1202, + "train_runtime": 9768.0467, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 0.7290909090909091, + "grad_norm": 0.012317335233092308, + "learning_rate": 9.900889657405573e-05, + "loss": 0.012883040122687817, + "num_input_tokens_seen": 19700328, + "step": 1203, + "train_runtime": 9776.155, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.7296969696969697, + "grad_norm": 0.012403651140630245, + "learning_rate": 9.900699052577736e-05, + "loss": 0.012290080077946186, + "num_input_tokens_seen": 19716704, + "step": 1204, + "train_runtime": 9784.2649, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 0.7303030303030303, + "grad_norm": 0.01588149555027485, + "learning_rate": 9.900508266482582e-05, + "loss": 0.011603264138102531, + "num_input_tokens_seen": 19733080, + "step": 1205, + "train_runtime": 9792.3778, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 0.730909090909091, + "grad_norm": 0.014620691537857056, + "learning_rate": 9.900317299127171e-05, + "loss": 0.012423778884112835, + "num_input_tokens_seen": 19749456, + "step": 1206, + "train_runtime": 9800.4881, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 0.7315151515151516, + "grad_norm": 0.012740055099129677, + "learning_rate": 9.900126150518567e-05, + "loss": 0.013299481943249702, + "num_input_tokens_seen": 19765832, + "step": 1207, + "train_runtime": 9808.599, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 0.7321212121212122, + "grad_norm": 0.015813497826457024, + "learning_rate": 9.899934820663839e-05, + "loss": 0.014216665178537369, + "num_input_tokens_seen": 19782208, + "step": 1208, + "train_runtime": 9816.7097, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.7327272727272728, + "grad_norm": 0.023462215438485146, + "learning_rate": 9.899743309570065e-05, + "loss": 0.014444109052419662, + "num_input_tokens_seen": 19798584, + "step": 1209, + "train_runtime": 9824.8204, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 0.016535522416234016, + "learning_rate": 9.899551617244326e-05, + "loss": 0.012044892646372318, + "num_input_tokens_seen": 19814960, + "step": 1210, + "train_runtime": 9832.9302, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.7339393939393939, + "grad_norm": 0.01581740379333496, + "learning_rate": 9.899359743693714e-05, + "loss": 0.014411653392016888, + "num_input_tokens_seen": 19831336, + "step": 1211, + "train_runtime": 9841.0417, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.7345454545454545, + "grad_norm": 0.01694261096417904, + "learning_rate": 9.899167688925328e-05, + "loss": 0.01339998934417963, + "num_input_tokens_seen": 19847712, + "step": 1212, + "train_runtime": 9849.1549, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 0.7351515151515151, + "grad_norm": 0.011397319845855236, + "learning_rate": 9.898975452946268e-05, + "loss": 0.013992566615343094, + "num_input_tokens_seen": 19864088, + "step": 1213, + "train_runtime": 9857.2628, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 0.7357575757575757, + "grad_norm": 0.009932632558047771, + "learning_rate": 9.898783035763648e-05, + "loss": 0.013121276162564754, + "num_input_tokens_seen": 19880464, + "step": 1214, + "train_runtime": 9865.3743, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 0.7363636363636363, + "grad_norm": 0.039875004440546036, + "learning_rate": 9.898590437384583e-05, + "loss": 0.013154653832316399, + "num_input_tokens_seen": 19896840, + "step": 1215, + "train_runtime": 9873.4892, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.7369696969696969, + "grad_norm": 0.014247733168303967, + "learning_rate": 9.898397657816198e-05, + "loss": 0.012165211141109467, + "num_input_tokens_seen": 19913216, + "step": 1216, + "train_runtime": 9881.6008, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 0.7375757575757576, + "grad_norm": 0.020671000704169273, + "learning_rate": 9.89820469706562e-05, + "loss": 0.012851119041442871, + "num_input_tokens_seen": 19929592, + "step": 1217, + "train_runtime": 9889.711, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 0.7381818181818182, + "grad_norm": 0.01268229354172945, + "learning_rate": 9.898011555139991e-05, + "loss": 0.011670916341245174, + "num_input_tokens_seen": 19945968, + "step": 1218, + "train_runtime": 9897.8448, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 0.7387878787878788, + "grad_norm": 0.014971123076975346, + "learning_rate": 9.897818232046454e-05, + "loss": 0.012817314825952053, + "num_input_tokens_seen": 19962344, + "step": 1219, + "train_runtime": 9905.9579, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 0.7393939393939394, + "grad_norm": 0.03158552944660187, + "learning_rate": 9.897624727792159e-05, + "loss": 0.01493182685226202, + "num_input_tokens_seen": 19978720, + "step": 1220, + "train_runtime": 9914.0699, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 0.74, + "grad_norm": 0.013837055303156376, + "learning_rate": 9.897431042384261e-05, + "loss": 0.01410394161939621, + "num_input_tokens_seen": 19995096, + "step": 1221, + "train_runtime": 9922.1811, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 0.7406060606060606, + "grad_norm": 0.02035367488861084, + "learning_rate": 9.897237175829926e-05, + "loss": 0.014466963708400726, + "num_input_tokens_seen": 20011472, + "step": 1222, + "train_runtime": 9930.2933, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 0.7412121212121212, + "grad_norm": 0.03811359778046608, + "learning_rate": 9.897043128136325e-05, + "loss": 0.013205880299210548, + "num_input_tokens_seen": 20027848, + "step": 1223, + "train_runtime": 9938.4061, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 0.7418181818181818, + "grad_norm": 0.018652835860848427, + "learning_rate": 9.896848899310636e-05, + "loss": 0.013042958453297615, + "num_input_tokens_seen": 20044224, + "step": 1224, + "train_runtime": 9946.5152, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 0.7424242424242424, + "grad_norm": 0.017733843997120857, + "learning_rate": 9.896654489360042e-05, + "loss": 0.012684517540037632, + "num_input_tokens_seen": 20060600, + "step": 1225, + "train_runtime": 9954.6306, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 0.743030303030303, + "grad_norm": 0.010155964642763138, + "learning_rate": 9.896459898291734e-05, + "loss": 0.011605635285377502, + "num_input_tokens_seen": 20076976, + "step": 1226, + "train_runtime": 9962.74, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 0.7436363636363637, + "grad_norm": 0.02421714924275875, + "learning_rate": 9.896265126112911e-05, + "loss": 0.015139145776629448, + "num_input_tokens_seen": 20093352, + "step": 1227, + "train_runtime": 9970.8484, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 0.7442424242424243, + "grad_norm": 0.02827371098101139, + "learning_rate": 9.896070172830776e-05, + "loss": 0.013175873085856438, + "num_input_tokens_seen": 20109728, + "step": 1228, + "train_runtime": 9978.9575, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 0.7448484848484849, + "grad_norm": 0.012187021784484386, + "learning_rate": 9.895875038452539e-05, + "loss": 0.013465436175465584, + "num_input_tokens_seen": 20126104, + "step": 1229, + "train_runtime": 9987.0668, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 0.7454545454545455, + "grad_norm": 0.011740162037312984, + "learning_rate": 9.895679722985419e-05, + "loss": 0.013261547312140465, + "num_input_tokens_seen": 20142480, + "step": 1230, + "train_runtime": 9995.1753, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 0.7460606060606061, + "grad_norm": 0.02706027776002884, + "learning_rate": 9.89548422643664e-05, + "loss": 0.013440998271107674, + "num_input_tokens_seen": 20158856, + "step": 1231, + "train_runtime": 10003.284, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 0.05222317576408386, + "learning_rate": 9.895288548813432e-05, + "loss": 0.014066273346543312, + "num_input_tokens_seen": 20175232, + "step": 1232, + "train_runtime": 10011.3904, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 0.7472727272727273, + "grad_norm": 0.011138387955725193, + "learning_rate": 9.895092690123035e-05, + "loss": 0.012343725189566612, + "num_input_tokens_seen": 20191608, + "step": 1233, + "train_runtime": 10019.4992, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 0.7478787878787879, + "grad_norm": 0.019493183121085167, + "learning_rate": 9.894896650372692e-05, + "loss": 0.014319119974970818, + "num_input_tokens_seen": 20207984, + "step": 1234, + "train_runtime": 10027.6078, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 0.7484848484848485, + "grad_norm": 0.010399113409221172, + "learning_rate": 9.894700429569653e-05, + "loss": 0.013344192877411842, + "num_input_tokens_seen": 20224360, + "step": 1235, + "train_runtime": 10035.7155, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 0.7490909090909091, + "grad_norm": 0.013207124546170235, + "learning_rate": 9.894504027721179e-05, + "loss": 0.012579311616718769, + "num_input_tokens_seen": 20240736, + "step": 1236, + "train_runtime": 10043.8304, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 0.7496969696969698, + "grad_norm": 0.007676406297832727, + "learning_rate": 9.89430744483453e-05, + "loss": 0.012105286121368408, + "num_input_tokens_seen": 20257112, + "step": 1237, + "train_runtime": 10051.9411, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 0.7503030303030302, + "grad_norm": 0.26611316204071045, + "learning_rate": 9.894110680916981e-05, + "loss": 0.012751906178891659, + "num_input_tokens_seen": 20273488, + "step": 1238, + "train_runtime": 10060.0502, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 0.7509090909090909, + "grad_norm": 0.016328565776348114, + "learning_rate": 9.89391373597581e-05, + "loss": 0.013627522625029087, + "num_input_tokens_seen": 20289864, + "step": 1239, + "train_runtime": 10068.1581, + "train_tokens_per_second": 2015.251 + }, + { + "epoch": 0.7515151515151515, + "grad_norm": 0.018115442246198654, + "learning_rate": 9.8937166100183e-05, + "loss": 0.014619875699281693, + "num_input_tokens_seen": 20306240, + "step": 1240, + "train_runtime": 10076.2632, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 0.7521212121212121, + "grad_norm": 0.047225791960954666, + "learning_rate": 9.893519303051742e-05, + "loss": 0.012407291680574417, + "num_input_tokens_seen": 20322616, + "step": 1241, + "train_runtime": 10084.3737, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 0.7527272727272727, + "grad_norm": 0.00958853680640459, + "learning_rate": 9.893321815083435e-05, + "loss": 0.012367008253932, + "num_input_tokens_seen": 20338992, + "step": 1242, + "train_runtime": 10092.4834, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 0.7533333333333333, + "grad_norm": 0.01551489531993866, + "learning_rate": 9.893124146120684e-05, + "loss": 0.011828011833131313, + "num_input_tokens_seen": 20355368, + "step": 1243, + "train_runtime": 10100.5915, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 0.7539393939393939, + "grad_norm": 0.015479539521038532, + "learning_rate": 9.892926296170799e-05, + "loss": 0.013003758154809475, + "num_input_tokens_seen": 20371744, + "step": 1244, + "train_runtime": 10108.6986, + "train_tokens_per_second": 2015.269 + }, + { + "epoch": 0.7545454545454545, + "grad_norm": 0.018905159085989, + "learning_rate": 9.892728265241098e-05, + "loss": 0.013263228349387646, + "num_input_tokens_seen": 20388120, + "step": 1245, + "train_runtime": 10116.8092, + "train_tokens_per_second": 2015.272 + }, + { + "epoch": 0.7551515151515151, + "grad_norm": 0.02863249182701111, + "learning_rate": 9.892530053338909e-05, + "loss": 0.0130619453266263, + "num_input_tokens_seen": 20404496, + "step": 1246, + "train_runtime": 10124.9156, + "train_tokens_per_second": 2015.276 + }, + { + "epoch": 0.7557575757575757, + "grad_norm": 0.016296787187457085, + "learning_rate": 9.892331660471559e-05, + "loss": 0.012045785784721375, + "num_input_tokens_seen": 20420872, + "step": 1247, + "train_runtime": 10133.0202, + "train_tokens_per_second": 2015.28 + }, + { + "epoch": 0.7563636363636363, + "grad_norm": 0.016199452802538872, + "learning_rate": 9.892133086646389e-05, + "loss": 0.012048415839672089, + "num_input_tokens_seen": 20437248, + "step": 1248, + "train_runtime": 10141.1305, + "train_tokens_per_second": 2015.283 + }, + { + "epoch": 0.756969696969697, + "grad_norm": 0.012741641141474247, + "learning_rate": 9.891934331870743e-05, + "loss": 0.01335767563432455, + "num_input_tokens_seen": 20453624, + "step": 1249, + "train_runtime": 10149.2473, + "train_tokens_per_second": 2015.285 + }, + { + "epoch": 0.7575757575757576, + "grad_norm": 0.03929731622338295, + "learning_rate": 9.891735396151972e-05, + "loss": 0.01206697802990675, + "num_input_tokens_seen": 20470000, + "step": 1250, + "train_runtime": 10157.3657, + "train_tokens_per_second": 2015.286 + }, + { + "epoch": 0.7581818181818182, + "grad_norm": 0.007868324406445026, + "learning_rate": 9.891536279497436e-05, + "loss": 0.011791637167334557, + "num_input_tokens_seen": 20486376, + "step": 1251, + "train_runtime": 10165.4828, + "train_tokens_per_second": 2015.288 + }, + { + "epoch": 0.7587878787878788, + "grad_norm": 0.013859824277460575, + "learning_rate": 9.891336981914499e-05, + "loss": 0.014204591512680054, + "num_input_tokens_seen": 20502752, + "step": 1252, + "train_runtime": 10173.6015, + "train_tokens_per_second": 2015.289 + }, + { + "epoch": 0.7593939393939394, + "grad_norm": 0.03682630881667137, + "learning_rate": 9.891137503410531e-05, + "loss": 0.01157104317098856, + "num_input_tokens_seen": 20519128, + "step": 1253, + "train_runtime": 10181.7191, + "train_tokens_per_second": 2015.291 + }, + { + "epoch": 0.76, + "grad_norm": 0.015358424745500088, + "learning_rate": 9.890937843992913e-05, + "loss": 0.013172848150134087, + "num_input_tokens_seen": 20535504, + "step": 1254, + "train_runtime": 10189.8374, + "train_tokens_per_second": 2015.293 + }, + { + "epoch": 0.7606060606060606, + "grad_norm": 0.01969468779861927, + "learning_rate": 9.890738003669029e-05, + "loss": 0.013599451631307602, + "num_input_tokens_seen": 20551880, + "step": 1255, + "train_runtime": 10197.9553, + "train_tokens_per_second": 2015.294 + }, + { + "epoch": 0.7612121212121212, + "grad_norm": 0.01678163930773735, + "learning_rate": 9.89053798244627e-05, + "loss": 0.013114574365317822, + "num_input_tokens_seen": 20568256, + "step": 1256, + "train_runtime": 10206.0749, + "train_tokens_per_second": 2015.295 + }, + { + "epoch": 0.7618181818181818, + "grad_norm": 0.0193489920347929, + "learning_rate": 9.890337780332035e-05, + "loss": 0.011934047564864159, + "num_input_tokens_seen": 20584632, + "step": 1257, + "train_runtime": 10214.1928, + "train_tokens_per_second": 2015.297 + }, + { + "epoch": 0.7624242424242424, + "grad_norm": 0.011665060184895992, + "learning_rate": 9.890137397333729e-05, + "loss": 0.012188711203634739, + "num_input_tokens_seen": 20601008, + "step": 1258, + "train_runtime": 10222.3099, + "train_tokens_per_second": 2015.299 + }, + { + "epoch": 0.7630303030303031, + "grad_norm": 0.005775026045739651, + "learning_rate": 9.889936833458763e-05, + "loss": 0.011419412679970264, + "num_input_tokens_seen": 20617384, + "step": 1259, + "train_runtime": 10230.4305, + "train_tokens_per_second": 2015.3 + }, + { + "epoch": 0.7636363636363637, + "grad_norm": 0.023811450228095055, + "learning_rate": 9.889736088714558e-05, + "loss": 0.01227609720081091, + "num_input_tokens_seen": 20633760, + "step": 1260, + "train_runtime": 10238.5502, + "train_tokens_per_second": 2015.301 + }, + { + "epoch": 0.7642424242424243, + "grad_norm": 0.0233469195663929, + "learning_rate": 9.889535163108537e-05, + "loss": 0.012738242745399475, + "num_input_tokens_seen": 20650136, + "step": 1261, + "train_runtime": 10246.6696, + "train_tokens_per_second": 2015.302 + }, + { + "epoch": 0.7648484848484849, + "grad_norm": 0.01263290736824274, + "learning_rate": 9.889334056648131e-05, + "loss": 0.01269836351275444, + "num_input_tokens_seen": 20666512, + "step": 1262, + "train_runtime": 10254.7884, + "train_tokens_per_second": 2015.304 + }, + { + "epoch": 0.7654545454545455, + "grad_norm": 0.014581980183720589, + "learning_rate": 9.889132769340781e-05, + "loss": 0.013540278188884258, + "num_input_tokens_seen": 20682888, + "step": 1263, + "train_runtime": 10262.9085, + "train_tokens_per_second": 2015.305 + }, + { + "epoch": 0.7660606060606061, + "grad_norm": 0.014391904696822166, + "learning_rate": 9.88893130119393e-05, + "loss": 0.012283596210181713, + "num_input_tokens_seen": 20699264, + "step": 1264, + "train_runtime": 10271.0298, + "train_tokens_per_second": 2015.306 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 0.015524503774940968, + "learning_rate": 9.888729652215032e-05, + "loss": 0.012001638300716877, + "num_input_tokens_seen": 20715640, + "step": 1265, + "train_runtime": 10279.149, + "train_tokens_per_second": 2015.307 + }, + { + "epoch": 0.7672727272727272, + "grad_norm": 0.011605373583734035, + "learning_rate": 9.888527822411543e-05, + "loss": 0.012554067187011242, + "num_input_tokens_seen": 20732016, + "step": 1266, + "train_runtime": 10287.2682, + "train_tokens_per_second": 2015.308 + }, + { + "epoch": 0.7678787878787878, + "grad_norm": 0.017037956044077873, + "learning_rate": 9.888325811790931e-05, + "loss": 0.013448834419250488, + "num_input_tokens_seen": 20748392, + "step": 1267, + "train_runtime": 10295.3877, + "train_tokens_per_second": 2015.309 + }, + { + "epoch": 0.7684848484848484, + "grad_norm": 0.028556402772665024, + "learning_rate": 9.888123620360666e-05, + "loss": 0.012878211215138435, + "num_input_tokens_seen": 20764768, + "step": 1268, + "train_runtime": 10303.5056, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.769090909090909, + "grad_norm": 0.014958829618990421, + "learning_rate": 9.887921248128228e-05, + "loss": 0.013986572623252869, + "num_input_tokens_seen": 20781144, + "step": 1269, + "train_runtime": 10311.6315, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.7696969696969697, + "grad_norm": 0.021999262273311615, + "learning_rate": 9.887718695101102e-05, + "loss": 0.01611473597586155, + "num_input_tokens_seen": 20797520, + "step": 1270, + "train_runtime": 10319.7512, + "train_tokens_per_second": 2015.312 + }, + { + "epoch": 0.7703030303030303, + "grad_norm": 0.01434963196516037, + "learning_rate": 9.88751596128678e-05, + "loss": 0.012239954434335232, + "num_input_tokens_seen": 20813896, + "step": 1271, + "train_runtime": 10327.8713, + "train_tokens_per_second": 2015.313 + }, + { + "epoch": 0.7709090909090909, + "grad_norm": 0.02051941119134426, + "learning_rate": 9.887313046692761e-05, + "loss": 0.013740262016654015, + "num_input_tokens_seen": 20830272, + "step": 1272, + "train_runtime": 10335.9914, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 0.7715151515151515, + "grad_norm": 0.00836126133799553, + "learning_rate": 9.88710995132655e-05, + "loss": 0.011003411374986172, + "num_input_tokens_seen": 20846648, + "step": 1273, + "train_runtime": 10344.1113, + "train_tokens_per_second": 2015.316 + }, + { + "epoch": 0.7721212121212121, + "grad_norm": 0.009217855520546436, + "learning_rate": 9.886906675195657e-05, + "loss": 0.012320063076913357, + "num_input_tokens_seen": 20863024, + "step": 1274, + "train_runtime": 10352.2308, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 0.7727272727272727, + "grad_norm": 0.00831685308367014, + "learning_rate": 9.886703218307604e-05, + "loss": 0.013156922534108162, + "num_input_tokens_seen": 20879400, + "step": 1275, + "train_runtime": 10360.351, + "train_tokens_per_second": 2015.318 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 0.01840154640376568, + "learning_rate": 9.886499580669917e-05, + "loss": 0.01196813490241766, + "num_input_tokens_seen": 20895776, + "step": 1276, + "train_runtime": 10368.4707, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 0.7739393939393939, + "grad_norm": 0.016405558213591576, + "learning_rate": 9.886295762290125e-05, + "loss": 0.013263520784676075, + "num_input_tokens_seen": 20912152, + "step": 1277, + "train_runtime": 10376.5894, + "train_tokens_per_second": 2015.32 + }, + { + "epoch": 0.7745454545454545, + "grad_norm": 0.017034931108355522, + "learning_rate": 9.886091763175769e-05, + "loss": 0.013993248343467712, + "num_input_tokens_seen": 20928528, + "step": 1278, + "train_runtime": 10384.7083, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 0.7751515151515151, + "grad_norm": 0.03572826832532883, + "learning_rate": 9.885887583334393e-05, + "loss": 0.012332772836089134, + "num_input_tokens_seen": 20944904, + "step": 1279, + "train_runtime": 10392.8323, + "train_tokens_per_second": 2015.322 + }, + { + "epoch": 0.7757575757575758, + "grad_norm": 0.02001163735985756, + "learning_rate": 9.885683222773551e-05, + "loss": 0.012113104574382305, + "num_input_tokens_seen": 20961280, + "step": 1280, + "train_runtime": 10400.9501, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 0.7763636363636364, + "grad_norm": 0.04807475954294205, + "learning_rate": 9.8854786815008e-05, + "loss": 0.011850223876535892, + "num_input_tokens_seen": 20977656, + "step": 1281, + "train_runtime": 10409.0709, + "train_tokens_per_second": 2015.325 + }, + { + "epoch": 0.776969696969697, + "grad_norm": 0.007331644184887409, + "learning_rate": 9.885273959523707e-05, + "loss": 0.011687932536005974, + "num_input_tokens_seen": 20994032, + "step": 1282, + "train_runtime": 10417.1889, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 0.7775757575757576, + "grad_norm": 0.013896801508963108, + "learning_rate": 9.885069056849845e-05, + "loss": 0.01239155326038599, + "num_input_tokens_seen": 21010408, + "step": 1283, + "train_runtime": 10425.3081, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 0.7781818181818182, + "grad_norm": 0.009068959392607212, + "learning_rate": 9.88486397348679e-05, + "loss": 0.01141006126999855, + "num_input_tokens_seen": 21026784, + "step": 1284, + "train_runtime": 10433.4338, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 0.7787878787878788, + "grad_norm": 0.01311533898115158, + "learning_rate": 9.884658709442132e-05, + "loss": 0.011742614209651947, + "num_input_tokens_seen": 21043160, + "step": 1285, + "train_runtime": 10441.5524, + "train_tokens_per_second": 2015.329 + }, + { + "epoch": 0.7793939393939394, + "grad_norm": 0.01562919095158577, + "learning_rate": 9.884453264723459e-05, + "loss": 0.012607906013727188, + "num_input_tokens_seen": 21059536, + "step": 1286, + "train_runtime": 10449.6697, + "train_tokens_per_second": 2015.33 + }, + { + "epoch": 0.78, + "grad_norm": 0.017651278525590897, + "learning_rate": 9.884247639338373e-05, + "loss": 0.01244867779314518, + "num_input_tokens_seen": 21075912, + "step": 1287, + "train_runtime": 10457.7881, + "train_tokens_per_second": 2015.332 + }, + { + "epoch": 0.7806060606060606, + "grad_norm": 0.020198311656713486, + "learning_rate": 9.884041833294476e-05, + "loss": 0.013492776080965996, + "num_input_tokens_seen": 21092288, + "step": 1288, + "train_runtime": 10465.9062, + "train_tokens_per_second": 2015.333 + }, + { + "epoch": 0.7812121212121212, + "grad_norm": 0.009970282204449177, + "learning_rate": 9.883835846599386e-05, + "loss": 0.013857762329280376, + "num_input_tokens_seen": 21108664, + "step": 1289, + "train_runtime": 10474.1537, + "train_tokens_per_second": 2015.31 + }, + { + "epoch": 0.7818181818181819, + "grad_norm": 0.011340651661157608, + "learning_rate": 9.883629679260715e-05, + "loss": 0.011344236321747303, + "num_input_tokens_seen": 21125040, + "step": 1290, + "train_runtime": 10482.2705, + "train_tokens_per_second": 2015.311 + }, + { + "epoch": 0.7824242424242425, + "grad_norm": 0.03793201595544815, + "learning_rate": 9.883423331286096e-05, + "loss": 0.015287358313798904, + "num_input_tokens_seen": 21141416, + "step": 1291, + "train_runtime": 10490.3857, + "train_tokens_per_second": 2015.314 + }, + { + "epoch": 0.7830303030303031, + "grad_norm": 0.02402154542505741, + "learning_rate": 9.883216802683158e-05, + "loss": 0.013735389336943626, + "num_input_tokens_seen": 21157792, + "step": 1292, + "train_runtime": 10498.5012, + "train_tokens_per_second": 2015.315 + }, + { + "epoch": 0.7836363636363637, + "grad_norm": 0.016549425199627876, + "learning_rate": 9.883010093459537e-05, + "loss": 0.01311381347477436, + "num_input_tokens_seen": 21174168, + "step": 1293, + "train_runtime": 10506.618, + "train_tokens_per_second": 2015.317 + }, + { + "epoch": 0.7842424242424243, + "grad_norm": 0.0236363522708416, + "learning_rate": 9.882803203622884e-05, + "loss": 0.01185927726328373, + "num_input_tokens_seen": 21190544, + "step": 1294, + "train_runtime": 10514.7333, + "train_tokens_per_second": 2015.319 + }, + { + "epoch": 0.7848484848484848, + "grad_norm": 0.015482014045119286, + "learning_rate": 9.882596133180849e-05, + "loss": 0.012073281221091747, + "num_input_tokens_seen": 21206920, + "step": 1295, + "train_runtime": 10522.8502, + "train_tokens_per_second": 2015.321 + }, + { + "epoch": 0.7854545454545454, + "grad_norm": 0.01528620719909668, + "learning_rate": 9.882388882141092e-05, + "loss": 0.012514740228652954, + "num_input_tokens_seen": 21223296, + "step": 1296, + "train_runtime": 10530.9675, + "train_tokens_per_second": 2015.323 + }, + { + "epoch": 0.786060606060606, + "grad_norm": 0.01590045541524887, + "learning_rate": 9.882181450511278e-05, + "loss": 0.014040066860616207, + "num_input_tokens_seen": 21239672, + "step": 1297, + "train_runtime": 10539.085, + "train_tokens_per_second": 2015.324 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 0.026240071281790733, + "learning_rate": 9.88197383829908e-05, + "loss": 0.012822052463889122, + "num_input_tokens_seen": 21256048, + "step": 1298, + "train_runtime": 10547.2019, + "train_tokens_per_second": 2015.326 + }, + { + "epoch": 0.7872727272727272, + "grad_norm": 0.014810437336564064, + "learning_rate": 9.881766045512176e-05, + "loss": 0.01398603618144989, + "num_input_tokens_seen": 21272424, + "step": 1299, + "train_runtime": 10555.3212, + "train_tokens_per_second": 2015.327 + }, + { + "epoch": 0.7878787878787878, + "grad_norm": 0.0264164749532938, + "learning_rate": 9.881558072158252e-05, + "loss": 0.012693504802882671, + "num_input_tokens_seen": 21288800, + "step": 1300, + "train_runtime": 10563.4399, + "train_tokens_per_second": 2015.328 + }, + { + "epoch": 0.7884848484848485, + "grad_norm": 0.01858045533299446, + "learning_rate": 9.881349918245005e-05, + "loss": 0.013458561152219772, + "num_input_tokens_seen": 21305176, + "step": 1301, + "train_runtime": 10572.4807, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 0.7890909090909091, + "grad_norm": 0.012029891833662987, + "learning_rate": 9.881141583780127e-05, + "loss": 0.014163712970912457, + "num_input_tokens_seen": 21321552, + "step": 1302, + "train_runtime": 10580.5948, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.7896969696969697, + "grad_norm": 0.016712768003344536, + "learning_rate": 9.880933068771329e-05, + "loss": 0.012644865550100803, + "num_input_tokens_seen": 21337928, + "step": 1303, + "train_runtime": 10588.7111, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 0.7903030303030303, + "grad_norm": 0.013986393809318542, + "learning_rate": 9.88072437322632e-05, + "loss": 0.015078244730830193, + "num_input_tokens_seen": 21354304, + "step": 1304, + "train_runtime": 10596.8307, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.7909090909090909, + "grad_norm": 0.012918438762426376, + "learning_rate": 9.880515497152823e-05, + "loss": 0.011986867524683475, + "num_input_tokens_seen": 21370680, + "step": 1305, + "train_runtime": 10604.9468, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 0.7915151515151515, + "grad_norm": 0.0405765101313591, + "learning_rate": 9.880306440558562e-05, + "loss": 0.011655117385089397, + "num_input_tokens_seen": 21387056, + "step": 1306, + "train_runtime": 10613.0635, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.7921212121212121, + "grad_norm": 0.01451539620757103, + "learning_rate": 9.880097203451271e-05, + "loss": 0.012863151729106903, + "num_input_tokens_seen": 21403432, + "step": 1307, + "train_runtime": 10621.1883, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 0.7927272727272727, + "grad_norm": 0.015082642436027527, + "learning_rate": 9.879887785838687e-05, + "loss": 0.013372685760259628, + "num_input_tokens_seen": 21419808, + "step": 1308, + "train_runtime": 10629.3154, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.7933333333333333, + "grad_norm": 0.013615542091429234, + "learning_rate": 9.879678187728557e-05, + "loss": 0.012768305838108063, + "num_input_tokens_seen": 21436184, + "step": 1309, + "train_runtime": 10637.4354, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.793939393939394, + "grad_norm": 0.011857746168971062, + "learning_rate": 9.879468409128632e-05, + "loss": 0.01288798451423645, + "num_input_tokens_seen": 21452560, + "step": 1310, + "train_runtime": 10645.5577, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.7945454545454546, + "grad_norm": 0.014605470933020115, + "learning_rate": 9.879258450046673e-05, + "loss": 0.012226996943354607, + "num_input_tokens_seen": 21468936, + "step": 1311, + "train_runtime": 10653.6811, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.7951515151515152, + "grad_norm": 0.012224176898598671, + "learning_rate": 9.879048310490448e-05, + "loss": 0.012793928384780884, + "num_input_tokens_seen": 21485312, + "step": 1312, + "train_runtime": 10661.8093, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.7957575757575758, + "grad_norm": 0.011239518411457539, + "learning_rate": 9.878837990467725e-05, + "loss": 0.012553832493722439, + "num_input_tokens_seen": 21501688, + "step": 1313, + "train_runtime": 10669.9356, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.7963636363636364, + "grad_norm": 0.007681385613977909, + "learning_rate": 9.878627489986287e-05, + "loss": 0.011498531326651573, + "num_input_tokens_seen": 21518064, + "step": 1314, + "train_runtime": 10678.055, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 0.796969696969697, + "grad_norm": 0.01674646884202957, + "learning_rate": 9.87841680905392e-05, + "loss": 0.012630677781999111, + "num_input_tokens_seen": 21534440, + "step": 1315, + "train_runtime": 10686.1747, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 0.7975757575757576, + "grad_norm": 0.041864536702632904, + "learning_rate": 9.878205947678414e-05, + "loss": 0.012615383602678776, + "num_input_tokens_seen": 21550816, + "step": 1316, + "train_runtime": 10694.294, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 0.7981818181818182, + "grad_norm": 0.02699940651655197, + "learning_rate": 9.877994905867571e-05, + "loss": 0.012835457921028137, + "num_input_tokens_seen": 21567192, + "step": 1317, + "train_runtime": 10702.4144, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 0.7987878787878788, + "grad_norm": 0.014113808050751686, + "learning_rate": 9.877783683629195e-05, + "loss": 0.012954406440258026, + "num_input_tokens_seen": 21583568, + "step": 1318, + "train_runtime": 10710.5348, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 0.7993939393939394, + "grad_norm": 0.021632181480526924, + "learning_rate": 9.8775722809711e-05, + "loss": 0.01311055850237608, + "num_input_tokens_seen": 21599944, + "step": 1319, + "train_runtime": 10718.6528, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 0.8, + "grad_norm": 0.017235929146409035, + "learning_rate": 9.877360697901105e-05, + "loss": 0.01242838054895401, + "num_input_tokens_seen": 21616320, + "step": 1320, + "train_runtime": 10726.7712, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 0.8006060606060607, + "grad_norm": 0.016485046595335007, + "learning_rate": 9.877148934427037e-05, + "loss": 0.012305478565394878, + "num_input_tokens_seen": 21632696, + "step": 1321, + "train_runtime": 10734.8904, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 0.8012121212121213, + "grad_norm": 0.012739230878651142, + "learning_rate": 9.876936990556725e-05, + "loss": 0.012538356706500053, + "num_input_tokens_seen": 21649072, + "step": 1322, + "train_runtime": 10743.0097, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 0.8018181818181818, + "grad_norm": 0.012361896224319935, + "learning_rate": 9.876724866298012e-05, + "loss": 0.013269990682601929, + "num_input_tokens_seen": 21665448, + "step": 1323, + "train_runtime": 10751.1304, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 0.8024242424242424, + "grad_norm": 0.0066161691211164, + "learning_rate": 9.876512561658745e-05, + "loss": 0.011660989373922348, + "num_input_tokens_seen": 21681824, + "step": 1324, + "train_runtime": 10759.2511, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 0.803030303030303, + "grad_norm": 0.025402076542377472, + "learning_rate": 9.876300076646774e-05, + "loss": 0.012346756644546986, + "num_input_tokens_seen": 21698200, + "step": 1325, + "train_runtime": 10767.3706, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 0.8036363636363636, + "grad_norm": 0.02845195308327675, + "learning_rate": 9.876087411269959e-05, + "loss": 0.0148523710668087, + "num_input_tokens_seen": 21714576, + "step": 1326, + "train_runtime": 10775.4904, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 0.8042424242424242, + "grad_norm": 0.020637033507227898, + "learning_rate": 9.875874565536167e-05, + "loss": 0.013783378526568413, + "num_input_tokens_seen": 21730952, + "step": 1327, + "train_runtime": 10783.6113, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 0.8048484848484848, + "grad_norm": 0.03877370432019234, + "learning_rate": 9.87566153945327e-05, + "loss": 0.01296904031187296, + "num_input_tokens_seen": 21747328, + "step": 1328, + "train_runtime": 10791.7314, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 0.8054545454545454, + "grad_norm": 0.02621079795062542, + "learning_rate": 9.875448333029146e-05, + "loss": 0.015151145868003368, + "num_input_tokens_seen": 21763704, + "step": 1329, + "train_runtime": 10799.8508, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 0.806060606060606, + "grad_norm": 0.012624816037714481, + "learning_rate": 9.875234946271685e-05, + "loss": 0.01100456528365612, + "num_input_tokens_seen": 21780080, + "step": 1330, + "train_runtime": 10807.9703, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 0.8066666666666666, + "grad_norm": 0.04244585335254669, + "learning_rate": 9.875021379188776e-05, + "loss": 0.014457973651587963, + "num_input_tokens_seen": 21796456, + "step": 1331, + "train_runtime": 10816.0902, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 0.8072727272727273, + "grad_norm": 0.012526098638772964, + "learning_rate": 9.87480763178832e-05, + "loss": 0.01147517841309309, + "num_input_tokens_seen": 21812832, + "step": 1332, + "train_runtime": 10824.2096, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 0.8078787878787879, + "grad_norm": 0.01184050552546978, + "learning_rate": 9.874593704078224e-05, + "loss": 0.012551544234156609, + "num_input_tokens_seen": 21829208, + "step": 1333, + "train_runtime": 10832.3319, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 0.8084848484848485, + "grad_norm": 0.01893959939479828, + "learning_rate": 9.874379596066398e-05, + "loss": 0.014782631769776344, + "num_input_tokens_seen": 21845584, + "step": 1334, + "train_runtime": 10840.4505, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 0.8090909090909091, + "grad_norm": 0.014813544228672981, + "learning_rate": 9.874165307760764e-05, + "loss": 0.01277944352477789, + "num_input_tokens_seen": 21861960, + "step": 1335, + "train_runtime": 10848.5696, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 0.8096969696969697, + "grad_norm": 0.011958160437643528, + "learning_rate": 9.873950839169248e-05, + "loss": 0.012058142572641373, + "num_input_tokens_seen": 21878336, + "step": 1336, + "train_runtime": 10856.6889, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 0.8103030303030303, + "grad_norm": 0.016458792611956596, + "learning_rate": 9.87373619029978e-05, + "loss": 0.012132089585065842, + "num_input_tokens_seen": 21894712, + "step": 1337, + "train_runtime": 10864.8105, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 0.8109090909090909, + "grad_norm": 0.027679556980729103, + "learning_rate": 9.873521361160304e-05, + "loss": 0.012615354731678963, + "num_input_tokens_seen": 21911088, + "step": 1338, + "train_runtime": 10872.9347, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 0.8115151515151515, + "grad_norm": 0.01617676578462124, + "learning_rate": 9.873306351758762e-05, + "loss": 0.011802049353718758, + "num_input_tokens_seen": 21927464, + "step": 1339, + "train_runtime": 10881.0546, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 0.8121212121212121, + "grad_norm": 0.009554133750498295, + "learning_rate": 9.87309116210311e-05, + "loss": 0.012184510938823223, + "num_input_tokens_seen": 21943840, + "step": 1340, + "train_runtime": 10889.1741, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 0.8127272727272727, + "grad_norm": 0.01341445092111826, + "learning_rate": 9.872875792201304e-05, + "loss": 0.012919439002871513, + "num_input_tokens_seen": 21960216, + "step": 1341, + "train_runtime": 10897.2932, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 0.010654855519533157, + "learning_rate": 9.872660242061314e-05, + "loss": 0.013909978792071342, + "num_input_tokens_seen": 21976592, + "step": 1342, + "train_runtime": 10905.4183, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 0.813939393939394, + "grad_norm": 0.011213628575205803, + "learning_rate": 9.872444511691107e-05, + "loss": 0.011805294081568718, + "num_input_tokens_seen": 21992968, + "step": 1343, + "train_runtime": 10913.5364, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 0.8145454545454546, + "grad_norm": 0.011983062140643597, + "learning_rate": 9.87222860109867e-05, + "loss": 0.010815788060426712, + "num_input_tokens_seen": 22009344, + "step": 1344, + "train_runtime": 10921.6544, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 0.8151515151515152, + "grad_norm": 0.011851955205202103, + "learning_rate": 9.872012510291983e-05, + "loss": 0.013886788859963417, + "num_input_tokens_seen": 22025720, + "step": 1345, + "train_runtime": 10929.7729, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 0.8157575757575758, + "grad_norm": 0.012403900735080242, + "learning_rate": 9.871796239279043e-05, + "loss": 0.01268466841429472, + "num_input_tokens_seen": 22042096, + "step": 1346, + "train_runtime": 10937.8919, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 0.8163636363636364, + "grad_norm": 0.016347525641322136, + "learning_rate": 9.871579788067846e-05, + "loss": 0.012477520853281021, + "num_input_tokens_seen": 22058472, + "step": 1347, + "train_runtime": 10946.0113, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 0.816969696969697, + "grad_norm": 0.02359098754823208, + "learning_rate": 9.8713631566664e-05, + "loss": 0.012588823214173317, + "num_input_tokens_seen": 22074848, + "step": 1348, + "train_runtime": 10954.1352, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 0.8175757575757576, + "grad_norm": 0.008759316988289356, + "learning_rate": 9.871146345082716e-05, + "loss": 0.012180456891655922, + "num_input_tokens_seen": 22091224, + "step": 1349, + "train_runtime": 10962.2539, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 0.8181818181818182, + "grad_norm": 0.03136594220995903, + "learning_rate": 9.870929353324817e-05, + "loss": 0.014148636721074581, + "num_input_tokens_seen": 22107600, + "step": 1350, + "train_runtime": 10970.3731, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 0.8187878787878788, + "grad_norm": 0.02214551530778408, + "learning_rate": 9.870712181400726e-05, + "loss": 0.012522125616669655, + "num_input_tokens_seen": 22123976, + "step": 1351, + "train_runtime": 10978.4906, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 0.8193939393939393, + "grad_norm": 0.017318114638328552, + "learning_rate": 9.870494829318478e-05, + "loss": 0.013153801672160625, + "num_input_tokens_seen": 22140352, + "step": 1352, + "train_runtime": 10986.6091, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 0.82, + "grad_norm": 0.014927364885807037, + "learning_rate": 9.87027729708611e-05, + "loss": 0.012346560135483742, + "num_input_tokens_seen": 22156728, + "step": 1353, + "train_runtime": 10994.7304, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 0.8206060606060606, + "grad_norm": 0.011974423192441463, + "learning_rate": 9.870059584711668e-05, + "loss": 0.012083306908607483, + "num_input_tokens_seen": 22173104, + "step": 1354, + "train_runtime": 11002.8485, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 0.8212121212121212, + "grad_norm": 0.014825565740466118, + "learning_rate": 9.869841692203208e-05, + "loss": 0.013183614239096642, + "num_input_tokens_seen": 22189480, + "step": 1355, + "train_runtime": 11010.9676, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 0.8218181818181818, + "grad_norm": 0.01507493481040001, + "learning_rate": 9.869623619568786e-05, + "loss": 0.012968642637133598, + "num_input_tokens_seen": 22205856, + "step": 1356, + "train_runtime": 11019.0872, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 0.8224242424242424, + "grad_norm": 0.009937523864209652, + "learning_rate": 9.86940536681647e-05, + "loss": 0.012275813147425652, + "num_input_tokens_seen": 22222232, + "step": 1357, + "train_runtime": 11027.2052, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 0.823030303030303, + "grad_norm": 0.008918135426938534, + "learning_rate": 9.869186933954331e-05, + "loss": 0.012659851461648941, + "num_input_tokens_seen": 22238608, + "step": 1358, + "train_runtime": 11035.3325, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 0.8236363636363636, + "grad_norm": 0.015751199796795845, + "learning_rate": 9.868968320990452e-05, + "loss": 0.01403406634926796, + "num_input_tokens_seen": 22254984, + "step": 1359, + "train_runtime": 11043.4519, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 0.8242424242424242, + "grad_norm": 0.04037531092762947, + "learning_rate": 9.868749527932914e-05, + "loss": 0.014338891953229904, + "num_input_tokens_seen": 22271360, + "step": 1360, + "train_runtime": 11051.5705, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 0.8248484848484848, + "grad_norm": 0.027882136404514313, + "learning_rate": 9.868530554789815e-05, + "loss": 0.013941345736384392, + "num_input_tokens_seen": 22287736, + "step": 1361, + "train_runtime": 11059.6884, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 0.8254545454545454, + "grad_norm": 0.012381003238260746, + "learning_rate": 9.868311401569251e-05, + "loss": 0.013261671178042889, + "num_input_tokens_seen": 22304112, + "step": 1362, + "train_runtime": 11067.807, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 0.826060606060606, + "grad_norm": 0.019036108627915382, + "learning_rate": 9.868092068279329e-05, + "loss": 0.011298813857138157, + "num_input_tokens_seen": 22320488, + "step": 1363, + "train_runtime": 11075.931, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 0.017254643142223358, + "learning_rate": 9.86787255492816e-05, + "loss": 0.011698050424456596, + "num_input_tokens_seen": 22336864, + "step": 1364, + "train_runtime": 11084.0513, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 0.8272727272727273, + "grad_norm": 0.012853083200752735, + "learning_rate": 9.867652861523866e-05, + "loss": 0.012597981840372086, + "num_input_tokens_seen": 22353240, + "step": 1365, + "train_runtime": 11092.1694, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 0.8278787878787879, + "grad_norm": 0.01841077208518982, + "learning_rate": 9.867432988074572e-05, + "loss": 0.014389104209840298, + "num_input_tokens_seen": 22369616, + "step": 1366, + "train_runtime": 11100.288, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 0.8284848484848485, + "grad_norm": 0.010745099745690823, + "learning_rate": 9.867212934588411e-05, + "loss": 0.013019641861319542, + "num_input_tokens_seen": 22385992, + "step": 1367, + "train_runtime": 11108.4091, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 0.8290909090909091, + "grad_norm": 0.015590776689350605, + "learning_rate": 9.866992701073522e-05, + "loss": 0.012512456625699997, + "num_input_tokens_seen": 22402368, + "step": 1368, + "train_runtime": 11116.5309, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 0.8296969696969697, + "grad_norm": 0.012689262628555298, + "learning_rate": 9.866772287538051e-05, + "loss": 0.0124176861718297, + "num_input_tokens_seen": 22418744, + "step": 1369, + "train_runtime": 11124.6504, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 0.8303030303030303, + "grad_norm": 0.01711587980389595, + "learning_rate": 9.866551693990151e-05, + "loss": 0.012735790573060513, + "num_input_tokens_seen": 22435120, + "step": 1370, + "train_runtime": 11132.7682, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 0.8309090909090909, + "grad_norm": 0.015730151906609535, + "learning_rate": 9.866330920437979e-05, + "loss": 0.012005583383142948, + "num_input_tokens_seen": 22451496, + "step": 1371, + "train_runtime": 11140.8866, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 0.8315151515151515, + "grad_norm": 0.01519712619483471, + "learning_rate": 9.866109966889705e-05, + "loss": 0.01357693038880825, + "num_input_tokens_seen": 22467872, + "step": 1372, + "train_runtime": 11149.0048, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 0.8321212121212122, + "grad_norm": 0.013813342899084091, + "learning_rate": 9.865888833353499e-05, + "loss": 0.01139441505074501, + "num_input_tokens_seen": 22484248, + "step": 1373, + "train_runtime": 11157.132, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 0.8327272727272728, + "grad_norm": 0.0025392461102455854, + "learning_rate": 9.865667519837541e-05, + "loss": 0.012021156027913094, + "num_input_tokens_seen": 22500624, + "step": 1374, + "train_runtime": 11165.2509, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 0.02511661872267723, + "learning_rate": 9.865446026350017e-05, + "loss": 0.013405115343630314, + "num_input_tokens_seen": 22517000, + "step": 1375, + "train_runtime": 11173.3711, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 0.833939393939394, + "grad_norm": 0.00909800361841917, + "learning_rate": 9.865224352899119e-05, + "loss": 0.012450836598873138, + "num_input_tokens_seen": 22533376, + "step": 1376, + "train_runtime": 11181.4899, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 0.8345454545454546, + "grad_norm": 0.016653254628181458, + "learning_rate": 9.865002499493048e-05, + "loss": 0.012657481245696545, + "num_input_tokens_seen": 22549752, + "step": 1377, + "train_runtime": 11189.6096, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 0.8351515151515152, + "grad_norm": 0.01992923766374588, + "learning_rate": 9.864780466140009e-05, + "loss": 0.014634167775511742, + "num_input_tokens_seen": 22566128, + "step": 1378, + "train_runtime": 11197.7309, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 0.8357575757575758, + "grad_norm": 0.01114288903772831, + "learning_rate": 9.864558252848213e-05, + "loss": 0.012311486527323723, + "num_input_tokens_seen": 22582504, + "step": 1379, + "train_runtime": 11205.8507, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 0.8363636363636363, + "grad_norm": 0.006412926129996777, + "learning_rate": 9.864335859625879e-05, + "loss": 0.011968771927058697, + "num_input_tokens_seen": 22598880, + "step": 1380, + "train_runtime": 11213.9704, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 0.8369696969696969, + "grad_norm": 0.015292688272893429, + "learning_rate": 9.864113286481237e-05, + "loss": 0.012508758343756199, + "num_input_tokens_seen": 22615256, + "step": 1381, + "train_runtime": 11222.0902, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 0.8375757575757575, + "grad_norm": 0.007994702085852623, + "learning_rate": 9.863890533422516e-05, + "loss": 0.011799611151218414, + "num_input_tokens_seen": 22631632, + "step": 1382, + "train_runtime": 11230.2088, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 0.8381818181818181, + "grad_norm": 0.011203468777239323, + "learning_rate": 9.863667600457957e-05, + "loss": 0.012280134484171867, + "num_input_tokens_seen": 22648008, + "step": 1383, + "train_runtime": 11238.3318, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 0.8387878787878787, + "grad_norm": 0.011974025517702103, + "learning_rate": 9.863444487595803e-05, + "loss": 0.012465615756809711, + "num_input_tokens_seen": 22664384, + "step": 1384, + "train_runtime": 11246.4525, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 0.8393939393939394, + "grad_norm": 0.015972545370459557, + "learning_rate": 9.86322119484431e-05, + "loss": 0.011578064411878586, + "num_input_tokens_seen": 22680760, + "step": 1385, + "train_runtime": 11254.5716, + "train_tokens_per_second": 2015.249 + }, + { + "epoch": 0.84, + "grad_norm": 0.015455652959644794, + "learning_rate": 9.862997722211735e-05, + "loss": 0.013119183480739594, + "num_input_tokens_seen": 22697136, + "step": 1386, + "train_runtime": 11262.6896, + "train_tokens_per_second": 2015.25 + }, + { + "epoch": 0.8406060606060606, + "grad_norm": 0.02377978526055813, + "learning_rate": 9.862774069706346e-05, + "loss": 0.013906264677643776, + "num_input_tokens_seen": 22713512, + "step": 1387, + "train_runtime": 11270.8064, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 0.8412121212121212, + "grad_norm": 0.012666025198996067, + "learning_rate": 9.862550237336413e-05, + "loss": 0.011985675431787968, + "num_input_tokens_seen": 22729888, + "step": 1388, + "train_runtime": 11278.9313, + "train_tokens_per_second": 2015.252 + }, + { + "epoch": 0.8418181818181818, + "grad_norm": 0.011326838284730911, + "learning_rate": 9.862326225110216e-05, + "loss": 0.011968444101512432, + "num_input_tokens_seen": 22746264, + "step": 1389, + "train_runtime": 11287.0513, + "train_tokens_per_second": 2015.253 + }, + { + "epoch": 0.8424242424242424, + "grad_norm": 0.0107469717040658, + "learning_rate": 9.862102033036042e-05, + "loss": 0.012955324724316597, + "num_input_tokens_seen": 22762640, + "step": 1390, + "train_runtime": 11295.1711, + "train_tokens_per_second": 2015.254 + }, + { + "epoch": 0.843030303030303, + "grad_norm": 0.011529113166034222, + "learning_rate": 9.86187766112218e-05, + "loss": 0.012033510953187943, + "num_input_tokens_seen": 22779016, + "step": 1391, + "train_runtime": 11303.291, + "train_tokens_per_second": 2015.255 + }, + { + "epoch": 0.8436363636363636, + "grad_norm": 0.011735017411410809, + "learning_rate": 9.861653109376934e-05, + "loss": 0.012355628423392773, + "num_input_tokens_seen": 22795392, + "step": 1392, + "train_runtime": 11311.4112, + "train_tokens_per_second": 2015.256 + }, + { + "epoch": 0.8442424242424242, + "grad_norm": 0.05303164944052696, + "learning_rate": 9.861428377808606e-05, + "loss": 0.011541967280209064, + "num_input_tokens_seen": 22811768, + "step": 1393, + "train_runtime": 11319.5333, + "train_tokens_per_second": 2015.257 + }, + { + "epoch": 0.8448484848484848, + "grad_norm": 0.006728252395987511, + "learning_rate": 9.861203466425508e-05, + "loss": 0.013560689054429531, + "num_input_tokens_seen": 22828144, + "step": 1394, + "train_runtime": 11327.6516, + "train_tokens_per_second": 2015.258 + }, + { + "epoch": 0.8454545454545455, + "grad_norm": 0.013561035506427288, + "learning_rate": 9.860978375235963e-05, + "loss": 0.01197909377515316, + "num_input_tokens_seen": 22844520, + "step": 1395, + "train_runtime": 11335.7713, + "train_tokens_per_second": 2015.259 + }, + { + "epoch": 0.8460606060606061, + "grad_norm": 0.020181827247142792, + "learning_rate": 9.860753104248292e-05, + "loss": 0.013650638982653618, + "num_input_tokens_seen": 22860896, + "step": 1396, + "train_runtime": 11343.891, + "train_tokens_per_second": 2015.261 + }, + { + "epoch": 0.8466666666666667, + "grad_norm": 0.014147473499178886, + "learning_rate": 9.860527653470831e-05, + "loss": 0.012558222748339176, + "num_input_tokens_seen": 22877272, + "step": 1397, + "train_runtime": 11352.0104, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 0.8472727272727273, + "grad_norm": 0.02730811946094036, + "learning_rate": 9.860302022911918e-05, + "loss": 0.011438505724072456, + "num_input_tokens_seen": 22893648, + "step": 1398, + "train_runtime": 11360.1326, + "train_tokens_per_second": 2015.262 + }, + { + "epoch": 0.8478787878787879, + "grad_norm": 0.008396074175834656, + "learning_rate": 9.860076212579896e-05, + "loss": 0.011421089991927147, + "num_input_tokens_seen": 22910024, + "step": 1399, + "train_runtime": 11368.2516, + "train_tokens_per_second": 2015.264 + }, + { + "epoch": 0.8484848484848485, + "grad_norm": 0.015687720850110054, + "learning_rate": 9.859850222483123e-05, + "loss": 0.011946003884077072, + "num_input_tokens_seen": 22926400, + "step": 1400, + "train_runtime": 11376.3701, + "train_tokens_per_second": 2015.265 + }, + { + "epoch": 0.8490909090909091, + "grad_norm": 0.016600729897618294, + "learning_rate": 9.859624052629951e-05, + "loss": 0.013347601518034935, + "num_input_tokens_seen": 22942776, + "step": 1401, + "train_runtime": 11385.3698, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.8496969696969697, + "grad_norm": 0.01619286835193634, + "learning_rate": 9.85939770302875e-05, + "loss": 0.01237676665186882, + "num_input_tokens_seen": 22959152, + "step": 1402, + "train_runtime": 11393.4858, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 0.8503030303030303, + "grad_norm": 0.03248157724738121, + "learning_rate": 9.859171173687891e-05, + "loss": 0.011984573677182198, + "num_input_tokens_seen": 22975528, + "step": 1403, + "train_runtime": 11401.6049, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 0.850909090909091, + "grad_norm": 0.015169057063758373, + "learning_rate": 9.858944464615754e-05, + "loss": 0.012047179043293, + "num_input_tokens_seen": 22991904, + "step": 1404, + "train_runtime": 11409.7207, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 0.8515151515151516, + "grad_norm": 0.013567727990448475, + "learning_rate": 9.858717575820723e-05, + "loss": 0.015603979118168354, + "num_input_tokens_seen": 23008280, + "step": 1405, + "train_runtime": 11417.8368, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 0.8521212121212122, + "grad_norm": 0.009488792158663273, + "learning_rate": 9.85849050731119e-05, + "loss": 0.012402615509927273, + "num_input_tokens_seen": 23024656, + "step": 1406, + "train_runtime": 11425.9595, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 0.8527272727272728, + "grad_norm": 0.010693400166928768, + "learning_rate": 9.858263259095557e-05, + "loss": 0.012290366925299168, + "num_input_tokens_seen": 23041032, + "step": 1407, + "train_runtime": 11434.0763, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 0.033301692456007004, + "learning_rate": 9.858035831182226e-05, + "loss": 0.016457989811897278, + "num_input_tokens_seen": 23057408, + "step": 1408, + "train_runtime": 11442.1926, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 0.8539393939393939, + "grad_norm": 0.021398158743977547, + "learning_rate": 9.85780822357961e-05, + "loss": 0.01473909430205822, + "num_input_tokens_seen": 23073784, + "step": 1409, + "train_runtime": 11450.3097, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 0.8545454545454545, + "grad_norm": 0.024622607976198196, + "learning_rate": 9.857580436296127e-05, + "loss": 0.01464729942381382, + "num_input_tokens_seen": 23090160, + "step": 1410, + "train_runtime": 11458.4341, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 0.8551515151515151, + "grad_norm": 0.010785657912492752, + "learning_rate": 9.857352469340204e-05, + "loss": 0.01193370670080185, + "num_input_tokens_seen": 23106536, + "step": 1411, + "train_runtime": 11466.5619, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 0.8557575757575757, + "grad_norm": 0.00772235170006752, + "learning_rate": 9.857124322720273e-05, + "loss": 0.01089341752231121, + "num_input_tokens_seen": 23122912, + "step": 1412, + "train_runtime": 11474.6825, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 0.8563636363636363, + "grad_norm": 0.013583921827375889, + "learning_rate": 9.856895996444772e-05, + "loss": 0.011918467469513416, + "num_input_tokens_seen": 23139288, + "step": 1413, + "train_runtime": 11482.8028, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 0.8569696969696969, + "grad_norm": 0.010831024497747421, + "learning_rate": 9.856667490522146e-05, + "loss": 0.011809214949607849, + "num_input_tokens_seen": 23155664, + "step": 1414, + "train_runtime": 11490.9203, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 0.8575757575757575, + "grad_norm": 0.010280442424118519, + "learning_rate": 9.856438804960848e-05, + "loss": 0.011262697167694569, + "num_input_tokens_seen": 23172040, + "step": 1415, + "train_runtime": 11499.0452, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 0.8581818181818182, + "grad_norm": 0.017096806317567825, + "learning_rate": 9.856209939769335e-05, + "loss": 0.013108273036777973, + "num_input_tokens_seen": 23188416, + "step": 1416, + "train_runtime": 11507.1661, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 0.8587878787878788, + "grad_norm": 0.011299760080873966, + "learning_rate": 9.855980894956074e-05, + "loss": 0.011602142825722694, + "num_input_tokens_seen": 23204792, + "step": 1417, + "train_runtime": 11515.2871, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 0.8593939393939394, + "grad_norm": 0.016026539728045464, + "learning_rate": 9.855751670529536e-05, + "loss": 0.013274731114506721, + "num_input_tokens_seen": 23221168, + "step": 1418, + "train_runtime": 11523.407, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 0.86, + "grad_norm": 0.012965604662895203, + "learning_rate": 9.8555222664982e-05, + "loss": 0.01300659030675888, + "num_input_tokens_seen": 23237544, + "step": 1419, + "train_runtime": 11531.5321, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 0.8606060606060606, + "grad_norm": 0.02445352077484131, + "learning_rate": 9.855292682870551e-05, + "loss": 0.013643065467476845, + "num_input_tokens_seen": 23253920, + "step": 1420, + "train_runtime": 11539.6549, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 0.8612121212121212, + "grad_norm": 0.01660062186419964, + "learning_rate": 9.855062919655083e-05, + "loss": 0.012771239504218102, + "num_input_tokens_seen": 23270296, + "step": 1421, + "train_runtime": 11547.78, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 0.8618181818181818, + "grad_norm": 0.022184133529663086, + "learning_rate": 9.854832976860289e-05, + "loss": 0.012608212418854237, + "num_input_tokens_seen": 23286672, + "step": 1422, + "train_runtime": 11555.8992, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 0.8624242424242424, + "grad_norm": 0.018696678802371025, + "learning_rate": 9.85460285449468e-05, + "loss": 0.01361760776489973, + "num_input_tokens_seen": 23303048, + "step": 1423, + "train_runtime": 11564.0219, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 0.863030303030303, + "grad_norm": 0.009972603991627693, + "learning_rate": 9.854372552566764e-05, + "loss": 0.012581647373735905, + "num_input_tokens_seen": 23319424, + "step": 1424, + "train_runtime": 11572.1414, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 0.8636363636363636, + "grad_norm": 0.007429645396769047, + "learning_rate": 9.854142071085061e-05, + "loss": 0.011579862795770168, + "num_input_tokens_seen": 23335800, + "step": 1425, + "train_runtime": 11580.2672, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 0.8642424242424243, + "grad_norm": 0.014917800202965736, + "learning_rate": 9.853911410058097e-05, + "loss": 0.014219231903553009, + "num_input_tokens_seen": 23352176, + "step": 1426, + "train_runtime": 11588.3879, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 0.8648484848484849, + "grad_norm": 0.013650372624397278, + "learning_rate": 9.853680569494401e-05, + "loss": 0.011797931976616383, + "num_input_tokens_seen": 23368552, + "step": 1427, + "train_runtime": 11596.5073, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 0.8654545454545455, + "grad_norm": 0.02313203178346157, + "learning_rate": 9.853449549402514e-05, + "loss": 0.012145559303462505, + "num_input_tokens_seen": 23384928, + "step": 1428, + "train_runtime": 11604.6343, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 0.8660606060606061, + "grad_norm": 0.01312107965350151, + "learning_rate": 9.853218349790979e-05, + "loss": 0.012798131443560123, + "num_input_tokens_seen": 23401304, + "step": 1429, + "train_runtime": 11612.7553, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 0.01796252839267254, + "learning_rate": 9.852986970668349e-05, + "loss": 0.012703349813818932, + "num_input_tokens_seen": 23417680, + "step": 1430, + "train_runtime": 11620.8769, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.8672727272727273, + "grad_norm": 0.008615394122898579, + "learning_rate": 9.85275541204318e-05, + "loss": 0.011092279106378555, + "num_input_tokens_seen": 23434056, + "step": 1431, + "train_runtime": 11628.999, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 0.8678787878787879, + "grad_norm": 0.01242077350616455, + "learning_rate": 9.852523673924042e-05, + "loss": 0.012449410744011402, + "num_input_tokens_seen": 23450432, + "step": 1432, + "train_runtime": 11637.1318, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 0.8684848484848485, + "grad_norm": 0.018718481063842773, + "learning_rate": 9.852291756319501e-05, + "loss": 0.014180365018546581, + "num_input_tokens_seen": 23466808, + "step": 1433, + "train_runtime": 11645.2526, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.8690909090909091, + "grad_norm": 0.015112209133803844, + "learning_rate": 9.852059659238137e-05, + "loss": 0.011895522475242615, + "num_input_tokens_seen": 23483184, + "step": 1434, + "train_runtime": 11653.3718, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.8696969696969697, + "grad_norm": 0.016961101442575455, + "learning_rate": 9.851827382688535e-05, + "loss": 0.013209850527346134, + "num_input_tokens_seen": 23499560, + "step": 1435, + "train_runtime": 11661.4915, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 0.8703030303030304, + "grad_norm": 0.024609530344605446, + "learning_rate": 9.851594926679287e-05, + "loss": 0.013271688483655453, + "num_input_tokens_seen": 23515936, + "step": 1436, + "train_runtime": 11669.6192, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 0.8709090909090909, + "grad_norm": 0.012788881547749043, + "learning_rate": 9.851362291218991e-05, + "loss": 0.014663812704384327, + "num_input_tokens_seen": 23532312, + "step": 1437, + "train_runtime": 11677.741, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 0.8715151515151515, + "grad_norm": 0.012840853072702885, + "learning_rate": 9.851129476316252e-05, + "loss": 0.014140025712549686, + "num_input_tokens_seen": 23548688, + "step": 1438, + "train_runtime": 11685.8601, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 0.8721212121212121, + "grad_norm": 0.008752677589654922, + "learning_rate": 9.85089648197968e-05, + "loss": 0.012541829608380795, + "num_input_tokens_seen": 23565064, + "step": 1439, + "train_runtime": 11694.0189, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 0.8727272727272727, + "grad_norm": 0.019652189686894417, + "learning_rate": 9.850663308217893e-05, + "loss": 0.013279132544994354, + "num_input_tokens_seen": 23581440, + "step": 1440, + "train_runtime": 11702.1396, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 0.8733333333333333, + "grad_norm": 0.020490368828177452, + "learning_rate": 9.850429955039518e-05, + "loss": 0.013454969972372055, + "num_input_tokens_seen": 23597816, + "step": 1441, + "train_runtime": 11710.2595, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 0.8739393939393939, + "grad_norm": 0.007157603278756142, + "learning_rate": 9.850196422453185e-05, + "loss": 0.012067731469869614, + "num_input_tokens_seen": 23614192, + "step": 1442, + "train_runtime": 11718.3843, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.8745454545454545, + "grad_norm": 0.01431642472743988, + "learning_rate": 9.849962710467531e-05, + "loss": 0.012786502949893475, + "num_input_tokens_seen": 23630568, + "step": 1443, + "train_runtime": 11726.5058, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.8751515151515151, + "grad_norm": 0.019850250333547592, + "learning_rate": 9.849728819091201e-05, + "loss": 0.014681624248623848, + "num_input_tokens_seen": 23646944, + "step": 1444, + "train_runtime": 11734.6344, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 0.8757575757575757, + "grad_norm": 0.020691564306616783, + "learning_rate": 9.849494748332846e-05, + "loss": 0.013279177248477936, + "num_input_tokens_seen": 23663320, + "step": 1445, + "train_runtime": 11742.7555, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 0.8763636363636363, + "grad_norm": 0.01907418854534626, + "learning_rate": 9.849260498201126e-05, + "loss": 0.013074219226837158, + "num_input_tokens_seen": 23679696, + "step": 1446, + "train_runtime": 11750.8745, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 0.876969696969697, + "grad_norm": 0.015929479151964188, + "learning_rate": 9.849026068704702e-05, + "loss": 0.01318158209323883, + "num_input_tokens_seen": 23696072, + "step": 1447, + "train_runtime": 11758.9957, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 0.8775757575757576, + "grad_norm": 0.008310094475746155, + "learning_rate": 9.848791459852247e-05, + "loss": 0.012736203148961067, + "num_input_tokens_seen": 23712448, + "step": 1448, + "train_runtime": 11767.1144, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 0.8781818181818182, + "grad_norm": 0.014460076577961445, + "learning_rate": 9.848556671652438e-05, + "loss": 0.013328369706869125, + "num_input_tokens_seen": 23728824, + "step": 1449, + "train_runtime": 11775.2427, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 0.8787878787878788, + "grad_norm": 0.03255464881658554, + "learning_rate": 9.84832170411396e-05, + "loss": 0.013615809381008148, + "num_input_tokens_seen": 23745200, + "step": 1450, + "train_runtime": 11783.3648, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 0.8793939393939394, + "grad_norm": 0.012133513577282429, + "learning_rate": 9.848086557245507e-05, + "loss": 0.01337357982993126, + "num_input_tokens_seen": 23761576, + "step": 1451, + "train_runtime": 11791.4852, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 0.88, + "grad_norm": 0.015239196829497814, + "learning_rate": 9.847851231055769e-05, + "loss": 0.014663100242614746, + "num_input_tokens_seen": 23777952, + "step": 1452, + "train_runtime": 11799.6042, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 0.8806060606060606, + "grad_norm": 0.01241212897002697, + "learning_rate": 9.847615725553456e-05, + "loss": 0.010611728765070438, + "num_input_tokens_seen": 23794328, + "step": 1453, + "train_runtime": 11807.7327, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 0.8812121212121212, + "grad_norm": 0.011121004819869995, + "learning_rate": 9.84738004074728e-05, + "loss": 0.01263385359197855, + "num_input_tokens_seen": 23810704, + "step": 1454, + "train_runtime": 11815.8522, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 0.8818181818181818, + "grad_norm": 0.004201293457299471, + "learning_rate": 9.847144176645954e-05, + "loss": 0.011922663077712059, + "num_input_tokens_seen": 23827080, + "step": 1455, + "train_runtime": 11823.9733, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 0.8824242424242424, + "grad_norm": 0.020469272509217262, + "learning_rate": 9.846908133258204e-05, + "loss": 0.014031194150447845, + "num_input_tokens_seen": 23843456, + "step": 1456, + "train_runtime": 11832.0934, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 0.883030303030303, + "grad_norm": 0.011495082639157772, + "learning_rate": 9.846671910592761e-05, + "loss": 0.010416560806334019, + "num_input_tokens_seen": 23859832, + "step": 1457, + "train_runtime": 11840.214, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 0.8836363636363637, + "grad_norm": 0.018274936825037003, + "learning_rate": 9.846435508658364e-05, + "loss": 0.013232077471911907, + "num_input_tokens_seen": 23876208, + "step": 1458, + "train_runtime": 11848.3402, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 0.8842424242424243, + "grad_norm": 0.015522732399404049, + "learning_rate": 9.846198927463754e-05, + "loss": 0.01150945108383894, + "num_input_tokens_seen": 23892584, + "step": 1459, + "train_runtime": 11856.4668, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 0.8848484848484849, + "grad_norm": 0.0219294223934412, + "learning_rate": 9.845962167017684e-05, + "loss": 0.014381872490048409, + "num_input_tokens_seen": 23908960, + "step": 1460, + "train_runtime": 11864.5874, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 0.8854545454545455, + "grad_norm": 0.014889719896018505, + "learning_rate": 9.84572522732891e-05, + "loss": 0.014377344399690628, + "num_input_tokens_seen": 23925336, + "step": 1461, + "train_runtime": 11872.7097, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 0.8860606060606061, + "grad_norm": 0.03352119028568268, + "learning_rate": 9.845488108406198e-05, + "loss": 0.012402249500155449, + "num_input_tokens_seen": 23941712, + "step": 1462, + "train_runtime": 11880.8311, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 0.8866666666666667, + "grad_norm": 0.01767772249877453, + "learning_rate": 9.845250810258315e-05, + "loss": 0.01259274035692215, + "num_input_tokens_seen": 23958088, + "step": 1463, + "train_runtime": 11888.9518, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.8872727272727273, + "grad_norm": 0.02254144847393036, + "learning_rate": 9.845013332894043e-05, + "loss": 0.012254755012691021, + "num_input_tokens_seen": 23974464, + "step": 1464, + "train_runtime": 11897.08, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 0.8878787878787879, + "grad_norm": 0.02848796173930168, + "learning_rate": 9.84477567632216e-05, + "loss": 0.011723476462066174, + "num_input_tokens_seen": 23990840, + "step": 1465, + "train_runtime": 11905.2012, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.8884848484848484, + "grad_norm": 0.010046341456472874, + "learning_rate": 9.844537840551462e-05, + "loss": 0.012512149289250374, + "num_input_tokens_seen": 24007216, + "step": 1466, + "train_runtime": 11913.3224, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.889090909090909, + "grad_norm": 0.010369017720222473, + "learning_rate": 9.844299825590741e-05, + "loss": 0.012070760130882263, + "num_input_tokens_seen": 24023592, + "step": 1467, + "train_runtime": 11921.4458, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 0.8896969696969697, + "grad_norm": 0.013119385577738285, + "learning_rate": 9.844061631448804e-05, + "loss": 0.013083739206194878, + "num_input_tokens_seen": 24039968, + "step": 1468, + "train_runtime": 11929.5648, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.8903030303030303, + "grad_norm": 0.034340064972639084, + "learning_rate": 9.843823258134461e-05, + "loss": 0.012108924798667431, + "num_input_tokens_seen": 24056344, + "step": 1469, + "train_runtime": 11937.7032, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.8909090909090909, + "grad_norm": 0.014157054014503956, + "learning_rate": 9.84358470565653e-05, + "loss": 0.012553790584206581, + "num_input_tokens_seen": 24072720, + "step": 1470, + "train_runtime": 11945.8324, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 0.8915151515151515, + "grad_norm": 0.045436400920152664, + "learning_rate": 9.843345974023832e-05, + "loss": 0.012693868018686771, + "num_input_tokens_seen": 24089096, + "step": 1471, + "train_runtime": 11953.9552, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 0.8921212121212121, + "grad_norm": 0.0057262699119746685, + "learning_rate": 9.843107063245199e-05, + "loss": 0.01204710453748703, + "num_input_tokens_seen": 24105472, + "step": 1472, + "train_runtime": 11962.0766, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 0.8927272727272727, + "grad_norm": 0.010710208676755428, + "learning_rate": 9.842867973329466e-05, + "loss": 0.011947019957005978, + "num_input_tokens_seen": 24121848, + "step": 1473, + "train_runtime": 11970.1975, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 0.01923258602619171, + "learning_rate": 9.842628704285479e-05, + "loss": 0.012753183022141457, + "num_input_tokens_seen": 24138224, + "step": 1474, + "train_runtime": 11978.3191, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 0.8939393939393939, + "grad_norm": 0.01927172765135765, + "learning_rate": 9.842389256122086e-05, + "loss": 0.011474791914224625, + "num_input_tokens_seen": 24154600, + "step": 1475, + "train_runtime": 11986.4408, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 0.8945454545454545, + "grad_norm": 0.008145877160131931, + "learning_rate": 9.842149628848145e-05, + "loss": 0.01207401417195797, + "num_input_tokens_seen": 24170976, + "step": 1476, + "train_runtime": 11994.5627, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 0.8951515151515151, + "grad_norm": 0.00783380214124918, + "learning_rate": 9.841909822472518e-05, + "loss": 0.012457404285669327, + "num_input_tokens_seen": 24187352, + "step": 1477, + "train_runtime": 12002.6852, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 0.8957575757575758, + "grad_norm": 0.0179067924618721, + "learning_rate": 9.841669837004077e-05, + "loss": 0.013264812529087067, + "num_input_tokens_seen": 24203728, + "step": 1478, + "train_runtime": 12010.806, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.8963636363636364, + "grad_norm": 0.013561374507844448, + "learning_rate": 9.841429672451697e-05, + "loss": 0.012999416328966618, + "num_input_tokens_seen": 24220104, + "step": 1479, + "train_runtime": 12018.9318, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.896969696969697, + "grad_norm": 0.017535557970404625, + "learning_rate": 9.84118932882426e-05, + "loss": 0.01261084619909525, + "num_input_tokens_seen": 24236480, + "step": 1480, + "train_runtime": 12027.0548, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.8975757575757576, + "grad_norm": 0.010221567936241627, + "learning_rate": 9.84094880613066e-05, + "loss": 0.012463985942304134, + "num_input_tokens_seen": 24252856, + "step": 1481, + "train_runtime": 12035.177, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 0.8981818181818182, + "grad_norm": 0.015500430017709732, + "learning_rate": 9.84070810437979e-05, + "loss": 0.012875360436737537, + "num_input_tokens_seen": 24269232, + "step": 1482, + "train_runtime": 12043.2972, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.8987878787878788, + "grad_norm": 0.018367871642112732, + "learning_rate": 9.840467223580554e-05, + "loss": 0.013259278610348701, + "num_input_tokens_seen": 24285608, + "step": 1483, + "train_runtime": 12051.4351, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.8993939393939394, + "grad_norm": 0.010412012226879597, + "learning_rate": 9.840226163741862e-05, + "loss": 0.01317787729203701, + "num_input_tokens_seen": 24301984, + "step": 1484, + "train_runtime": 12059.5701, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 0.9, + "grad_norm": 0.014846539124846458, + "learning_rate": 9.83998492487263e-05, + "loss": 0.013309162110090256, + "num_input_tokens_seen": 24318360, + "step": 1485, + "train_runtime": 12067.6971, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 0.9006060606060606, + "grad_norm": 0.015418080613017082, + "learning_rate": 9.839743506981782e-05, + "loss": 0.013252614066004753, + "num_input_tokens_seen": 24334736, + "step": 1486, + "train_runtime": 12075.8171, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.9012121212121212, + "grad_norm": 0.005580899305641651, + "learning_rate": 9.839501910078246e-05, + "loss": 0.011665296740829945, + "num_input_tokens_seen": 24351112, + "step": 1487, + "train_runtime": 12083.9372, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 0.9018181818181819, + "grad_norm": 0.020693181082606316, + "learning_rate": 9.839260134170958e-05, + "loss": 0.012977859936654568, + "num_input_tokens_seen": 24367488, + "step": 1488, + "train_runtime": 12092.0566, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.9024242424242425, + "grad_norm": 0.02792605198919773, + "learning_rate": 9.839018179268862e-05, + "loss": 0.013854194432497025, + "num_input_tokens_seen": 24383864, + "step": 1489, + "train_runtime": 12100.1926, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 0.9030303030303031, + "grad_norm": 0.017738644033670425, + "learning_rate": 9.838776045380909e-05, + "loss": 0.013810316100716591, + "num_input_tokens_seen": 24400240, + "step": 1490, + "train_runtime": 12108.3165, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 0.9036363636363637, + "grad_norm": 0.013722319155931473, + "learning_rate": 9.838533732516051e-05, + "loss": 0.013590461574494839, + "num_input_tokens_seen": 24416616, + "step": 1491, + "train_runtime": 12116.4361, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 0.9042424242424243, + "grad_norm": 0.015147789381444454, + "learning_rate": 9.838291240683252e-05, + "loss": 0.01322873868048191, + "num_input_tokens_seen": 24432992, + "step": 1492, + "train_runtime": 12124.5584, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.9048484848484849, + "grad_norm": 0.024722442030906677, + "learning_rate": 9.838048569891485e-05, + "loss": 0.011483176611363888, + "num_input_tokens_seen": 24449368, + "step": 1493, + "train_runtime": 12132.6806, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.9054545454545454, + "grad_norm": 0.011857298202812672, + "learning_rate": 9.837805720149721e-05, + "loss": 0.012423008680343628, + "num_input_tokens_seen": 24465744, + "step": 1494, + "train_runtime": 12140.8021, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 0.906060606060606, + "grad_norm": 0.007093346677720547, + "learning_rate": 9.837562691466946e-05, + "loss": 0.012259125709533691, + "num_input_tokens_seen": 24482120, + "step": 1495, + "train_runtime": 12148.9221, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 0.014173478819429874, + "learning_rate": 9.837319483852147e-05, + "loss": 0.012404312379658222, + "num_input_tokens_seen": 24498496, + "step": 1496, + "train_runtime": 12157.059, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.9072727272727272, + "grad_norm": 0.008476898074150085, + "learning_rate": 9.837076097314319e-05, + "loss": 0.011994283646345139, + "num_input_tokens_seen": 24514872, + "step": 1497, + "train_runtime": 12165.1807, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 0.9078787878787878, + "grad_norm": 0.008641723543405533, + "learning_rate": 9.836832531862469e-05, + "loss": 0.01290223654359579, + "num_input_tokens_seen": 24531248, + "step": 1498, + "train_runtime": 12173.3042, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 0.9084848484848485, + "grad_norm": 0.011743849143385887, + "learning_rate": 9.836588787505601e-05, + "loss": 0.012031139805912971, + "num_input_tokens_seen": 24547624, + "step": 1499, + "train_runtime": 12181.4387, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.9090909090909091, + "grad_norm": 0.03372018039226532, + "learning_rate": 9.836344864252734e-05, + "loss": 0.013024641200900078, + "num_input_tokens_seen": 24564000, + "step": 1500, + "train_runtime": 12189.5644, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 0.9096969696969697, + "grad_norm": 0.009321557357907295, + "learning_rate": 9.836100762112888e-05, + "loss": 0.012353789061307907, + "num_input_tokens_seen": 24580376, + "step": 1501, + "train_runtime": 12198.7668, + "train_tokens_per_second": 2014.989 + }, + { + "epoch": 0.9103030303030303, + "grad_norm": 0.033833153545856476, + "learning_rate": 9.835856481095092e-05, + "loss": 0.014163315296173096, + "num_input_tokens_seen": 24596752, + "step": 1502, + "train_runtime": 12206.876, + "train_tokens_per_second": 2014.992 + }, + { + "epoch": 0.9109090909090909, + "grad_norm": 0.008616876788437366, + "learning_rate": 9.835612021208382e-05, + "loss": 0.012016966007649899, + "num_input_tokens_seen": 24613128, + "step": 1503, + "train_runtime": 12214.988, + "train_tokens_per_second": 2014.994 + }, + { + "epoch": 0.9115151515151515, + "grad_norm": 0.01279782596975565, + "learning_rate": 9.835367382461802e-05, + "loss": 0.012753800489008427, + "num_input_tokens_seen": 24629504, + "step": 1504, + "train_runtime": 12223.097, + "train_tokens_per_second": 2014.997 + }, + { + "epoch": 0.9121212121212121, + "grad_norm": 0.01015873346477747, + "learning_rate": 9.835122564864397e-05, + "loss": 0.014264887198805809, + "num_input_tokens_seen": 24645880, + "step": 1505, + "train_runtime": 12231.2086, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 0.9127272727272727, + "grad_norm": 0.01769862323999405, + "learning_rate": 9.834877568425225e-05, + "loss": 0.011948327533900738, + "num_input_tokens_seen": 24662256, + "step": 1506, + "train_runtime": 12239.3149, + "train_tokens_per_second": 2015.003 + }, + { + "epoch": 0.9133333333333333, + "grad_norm": 0.017047662287950516, + "learning_rate": 9.834632393153348e-05, + "loss": 0.013797544874250889, + "num_input_tokens_seen": 24678632, + "step": 1507, + "train_runtime": 12247.4368, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 0.9139393939393939, + "grad_norm": 0.01629558391869068, + "learning_rate": 9.834387039057833e-05, + "loss": 0.011770099401473999, + "num_input_tokens_seen": 24695008, + "step": 1508, + "train_runtime": 12255.5461, + "train_tokens_per_second": 2015.007 + }, + { + "epoch": 0.9145454545454546, + "grad_norm": 0.01035243272781372, + "learning_rate": 9.834141506147756e-05, + "loss": 0.012996964156627655, + "num_input_tokens_seen": 24711384, + "step": 1509, + "train_runtime": 12263.6587, + "train_tokens_per_second": 2015.009 + }, + { + "epoch": 0.9151515151515152, + "grad_norm": 0.010673630982637405, + "learning_rate": 9.833895794432199e-05, + "loss": 0.011348108761012554, + "num_input_tokens_seen": 24727760, + "step": 1510, + "train_runtime": 12271.7697, + "train_tokens_per_second": 2015.012 + }, + { + "epoch": 0.9157575757575758, + "grad_norm": 0.0162323247641325, + "learning_rate": 9.83364990392025e-05, + "loss": 0.011655725538730621, + "num_input_tokens_seen": 24744136, + "step": 1511, + "train_runtime": 12279.878, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 0.9163636363636364, + "grad_norm": 0.011112192645668983, + "learning_rate": 9.833403834621005e-05, + "loss": 0.012943776324391365, + "num_input_tokens_seen": 24760512, + "step": 1512, + "train_runtime": 12287.9854, + "train_tokens_per_second": 2015.018 + }, + { + "epoch": 0.916969696969697, + "grad_norm": 0.008601181209087372, + "learning_rate": 9.833157586543565e-05, + "loss": 0.012646821327507496, + "num_input_tokens_seen": 24776888, + "step": 1513, + "train_runtime": 12296.0916, + "train_tokens_per_second": 2015.021 + }, + { + "epoch": 0.9175757575757576, + "grad_norm": 0.012012478895485401, + "learning_rate": 9.832911159697035e-05, + "loss": 0.012354527600109577, + "num_input_tokens_seen": 24793264, + "step": 1514, + "train_runtime": 12304.2001, + "train_tokens_per_second": 2015.024 + }, + { + "epoch": 0.9181818181818182, + "grad_norm": 0.035613156855106354, + "learning_rate": 9.832664554090536e-05, + "loss": 0.016033384948968887, + "num_input_tokens_seen": 24809640, + "step": 1515, + "train_runtime": 12312.3086, + "train_tokens_per_second": 2015.027 + }, + { + "epoch": 0.9187878787878788, + "grad_norm": 0.006802879273891449, + "learning_rate": 9.832417769733185e-05, + "loss": 0.011312966234982014, + "num_input_tokens_seen": 24826016, + "step": 1516, + "train_runtime": 12320.4184, + "train_tokens_per_second": 2015.03 + }, + { + "epoch": 0.9193939393939394, + "grad_norm": 0.021973978728055954, + "learning_rate": 9.832170806634112e-05, + "loss": 0.014309680089354515, + "num_input_tokens_seen": 24842392, + "step": 1517, + "train_runtime": 12328.5323, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 0.92, + "grad_norm": 0.005967301782220602, + "learning_rate": 9.831923664802452e-05, + "loss": 0.013270399533212185, + "num_input_tokens_seen": 24858768, + "step": 1518, + "train_runtime": 12336.6424, + "train_tokens_per_second": 2015.035 + }, + { + "epoch": 0.9206060606060606, + "grad_norm": 0.009456534869968891, + "learning_rate": 9.831676344247342e-05, + "loss": 0.01195263396948576, + "num_input_tokens_seen": 24875144, + "step": 1519, + "train_runtime": 12344.7519, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 0.9212121212121213, + "grad_norm": 0.007073413580656052, + "learning_rate": 9.831428844977937e-05, + "loss": 0.01261575985699892, + "num_input_tokens_seen": 24891520, + "step": 1520, + "train_runtime": 12352.8597, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 0.9218181818181819, + "grad_norm": 0.015730643644928932, + "learning_rate": 9.831181167003385e-05, + "loss": 0.012905986048281193, + "num_input_tokens_seen": 24907896, + "step": 1521, + "train_runtime": 12360.9698, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 0.9224242424242424, + "grad_norm": 0.018344024196267128, + "learning_rate": 9.830933310332853e-05, + "loss": 0.013406560756266117, + "num_input_tokens_seen": 24924272, + "step": 1522, + "train_runtime": 12369.0808, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 0.923030303030303, + "grad_norm": 0.012346428819000721, + "learning_rate": 9.830685274975504e-05, + "loss": 0.014868221245706081, + "num_input_tokens_seen": 24940648, + "step": 1523, + "train_runtime": 12377.1913, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 0.9236363636363636, + "grad_norm": 0.009414401836693287, + "learning_rate": 9.830437060940513e-05, + "loss": 0.011081631295382977, + "num_input_tokens_seen": 24957024, + "step": 1524, + "train_runtime": 12385.2982, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 0.9242424242424242, + "grad_norm": 0.009916838258504868, + "learning_rate": 9.830188668237063e-05, + "loss": 0.012078795582056046, + "num_input_tokens_seen": 24973400, + "step": 1525, + "train_runtime": 12393.4176, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 0.9248484848484848, + "grad_norm": 0.008649183437228203, + "learning_rate": 9.82994009687434e-05, + "loss": 0.01329115778207779, + "num_input_tokens_seen": 24989776, + "step": 1526, + "train_runtime": 12401.5342, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 0.9254545454545454, + "grad_norm": 0.008059944026172161, + "learning_rate": 9.829691346861539e-05, + "loss": 0.010912570171058178, + "num_input_tokens_seen": 25006152, + "step": 1527, + "train_runtime": 12409.6441, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 0.926060606060606, + "grad_norm": 0.01213870570063591, + "learning_rate": 9.82944241820786e-05, + "loss": 0.013018831610679626, + "num_input_tokens_seen": 25022528, + "step": 1528, + "train_runtime": 12417.7549, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 0.9266666666666666, + "grad_norm": 0.006536056753247976, + "learning_rate": 9.829193310922511e-05, + "loss": 0.012272411026060581, + "num_input_tokens_seen": 25038904, + "step": 1529, + "train_runtime": 12425.8697, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 0.9272727272727272, + "grad_norm": 0.016091682016849518, + "learning_rate": 9.828944025014707e-05, + "loss": 0.013518830761313438, + "num_input_tokens_seen": 25055280, + "step": 1530, + "train_runtime": 12433.9806, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 0.9278787878787879, + "grad_norm": 0.013526062481105328, + "learning_rate": 9.828694560493667e-05, + "loss": 0.011766214855015278, + "num_input_tokens_seen": 25071656, + "step": 1531, + "train_runtime": 12442.099, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 0.9284848484848485, + "grad_norm": 0.012226013466715813, + "learning_rate": 9.828444917368618e-05, + "loss": 0.012446287088096142, + "num_input_tokens_seen": 25088032, + "step": 1532, + "train_runtime": 12450.2199, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 0.9290909090909091, + "grad_norm": 0.008501514792442322, + "learning_rate": 9.828195095648796e-05, + "loss": 0.011350834742188454, + "num_input_tokens_seen": 25104408, + "step": 1533, + "train_runtime": 12458.3354, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 0.9296969696969697, + "grad_norm": 0.015691563487052917, + "learning_rate": 9.827945095343438e-05, + "loss": 0.01239042729139328, + "num_input_tokens_seen": 25120784, + "step": 1534, + "train_runtime": 12466.4613, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 0.9303030303030303, + "grad_norm": 0.010623699985444546, + "learning_rate": 9.827694916461793e-05, + "loss": 0.012479366734623909, + "num_input_tokens_seen": 25137160, + "step": 1535, + "train_runtime": 12474.5831, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 0.9309090909090909, + "grad_norm": 0.016181156039237976, + "learning_rate": 9.827444559013115e-05, + "loss": 0.012995701283216476, + "num_input_tokens_seen": 25153536, + "step": 1536, + "train_runtime": 12482.7136, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 0.9315151515151515, + "grad_norm": 0.011957235634326935, + "learning_rate": 9.827194023006665e-05, + "loss": 0.011975055560469627, + "num_input_tokens_seen": 25169912, + "step": 1537, + "train_runtime": 12490.8442, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 0.9321212121212121, + "grad_norm": 0.006527577061206102, + "learning_rate": 9.826943308451706e-05, + "loss": 0.011895911768078804, + "num_input_tokens_seen": 25186288, + "step": 1538, + "train_runtime": 12498.9672, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 0.9327272727272727, + "grad_norm": 0.01042003184556961, + "learning_rate": 9.826692415357517e-05, + "loss": 0.014506472274661064, + "num_input_tokens_seen": 25202664, + "step": 1539, + "train_runtime": 12507.0894, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 0.016334760934114456, + "learning_rate": 9.826441343733373e-05, + "loss": 0.0129412692040205, + "num_input_tokens_seen": 25219040, + "step": 1540, + "train_runtime": 12515.2111, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 0.933939393939394, + "grad_norm": 0.014927252195775509, + "learning_rate": 9.826190093588563e-05, + "loss": 0.012800981290638447, + "num_input_tokens_seen": 25235416, + "step": 1541, + "train_runtime": 12523.3336, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 0.9345454545454546, + "grad_norm": 0.029416421428322792, + "learning_rate": 9.825938664932381e-05, + "loss": 0.014405488967895508, + "num_input_tokens_seen": 25251792, + "step": 1542, + "train_runtime": 12531.4527, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 0.9351515151515152, + "grad_norm": 0.02064371295273304, + "learning_rate": 9.825687057774126e-05, + "loss": 0.012185944244265556, + "num_input_tokens_seen": 25268168, + "step": 1543, + "train_runtime": 12539.5726, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 0.9357575757575758, + "grad_norm": 0.02839016169309616, + "learning_rate": 9.825435272123103e-05, + "loss": 0.01328684575855732, + "num_input_tokens_seen": 25284544, + "step": 1544, + "train_runtime": 12547.6922, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 0.9363636363636364, + "grad_norm": 0.00797637552022934, + "learning_rate": 9.825183307988628e-05, + "loss": 0.012131592258810997, + "num_input_tokens_seen": 25300920, + "step": 1545, + "train_runtime": 12555.816, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 0.936969696969697, + "grad_norm": 0.019619282335042953, + "learning_rate": 9.824931165380018e-05, + "loss": 0.012967569753527641, + "num_input_tokens_seen": 25317296, + "step": 1546, + "train_runtime": 12563.9372, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 0.9375757575757576, + "grad_norm": 0.021843625232577324, + "learning_rate": 9.824678844306601e-05, + "loss": 0.01267443411052227, + "num_input_tokens_seen": 25333672, + "step": 1547, + "train_runtime": 12572.06, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 0.9381818181818182, + "grad_norm": 0.017489034682512283, + "learning_rate": 9.824426344777708e-05, + "loss": 0.013256320729851723, + "num_input_tokens_seen": 25350048, + "step": 1548, + "train_runtime": 12580.182, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 0.9387878787878788, + "grad_norm": 0.013636451214551926, + "learning_rate": 9.82417366680268e-05, + "loss": 0.012895656749606133, + "num_input_tokens_seen": 25366424, + "step": 1549, + "train_runtime": 12588.3031, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 0.9393939393939394, + "grad_norm": 0.010681037791073322, + "learning_rate": 9.823920810390864e-05, + "loss": 0.01336823869496584, + "num_input_tokens_seen": 25382800, + "step": 1550, + "train_runtime": 12596.4312, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 0.94, + "grad_norm": 0.008681289851665497, + "learning_rate": 9.823667775551611e-05, + "loss": 0.012347033247351646, + "num_input_tokens_seen": 25399176, + "step": 1551, + "train_runtime": 12604.5524, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 0.9406060606060606, + "grad_norm": 0.014772225171327591, + "learning_rate": 9.82341456229428e-05, + "loss": 0.012610615231096745, + "num_input_tokens_seen": 25415552, + "step": 1552, + "train_runtime": 12612.6729, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 0.9412121212121212, + "grad_norm": 0.009455538354814053, + "learning_rate": 9.823161170628236e-05, + "loss": 0.011409729719161987, + "num_input_tokens_seen": 25431928, + "step": 1553, + "train_runtime": 12620.7947, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 0.9418181818181818, + "grad_norm": 0.007376038935035467, + "learning_rate": 9.822907600562855e-05, + "loss": 0.01010863482952118, + "num_input_tokens_seen": 25448304, + "step": 1554, + "train_runtime": 12628.9187, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 0.9424242424242424, + "grad_norm": 0.018327629193663597, + "learning_rate": 9.822653852107514e-05, + "loss": 0.012985773384571075, + "num_input_tokens_seen": 25464680, + "step": 1555, + "train_runtime": 12637.0405, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 0.943030303030303, + "grad_norm": 0.020945513620972633, + "learning_rate": 9.822399925271598e-05, + "loss": 0.012259690091013908, + "num_input_tokens_seen": 25481056, + "step": 1556, + "train_runtime": 12645.1611, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 0.9436363636363636, + "grad_norm": 0.01777738891541958, + "learning_rate": 9.822145820064501e-05, + "loss": 0.013157106004655361, + "num_input_tokens_seen": 25497432, + "step": 1557, + "train_runtime": 12653.2798, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 0.9442424242424242, + "grad_norm": 0.014833358116447926, + "learning_rate": 9.82189153649562e-05, + "loss": 0.013078153133392334, + "num_input_tokens_seen": 25513808, + "step": 1558, + "train_runtime": 12661.4005, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 0.9448484848484848, + "grad_norm": 0.011480891145765781, + "learning_rate": 9.821637074574362e-05, + "loss": 0.012636776082217693, + "num_input_tokens_seen": 25530184, + "step": 1559, + "train_runtime": 12669.5208, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 0.9454545454545454, + "grad_norm": 0.01719282753765583, + "learning_rate": 9.821382434310136e-05, + "loss": 0.013058073818683624, + "num_input_tokens_seen": 25546560, + "step": 1560, + "train_runtime": 12677.6402, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 0.946060606060606, + "grad_norm": 0.017694855108857155, + "learning_rate": 9.821127615712364e-05, + "loss": 0.012888854369521141, + "num_input_tokens_seen": 25562936, + "step": 1561, + "train_runtime": 12685.76, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 0.012164749205112457, + "learning_rate": 9.820872618790472e-05, + "loss": 0.011173507198691368, + "num_input_tokens_seen": 25579312, + "step": 1562, + "train_runtime": 12693.8817, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 0.9472727272727273, + "grad_norm": 0.012561215087771416, + "learning_rate": 9.820617443553889e-05, + "loss": 0.012248185463249683, + "num_input_tokens_seen": 25595688, + "step": 1563, + "train_runtime": 12702.0014, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 0.9478787878787879, + "grad_norm": 0.01546258945018053, + "learning_rate": 9.820362090012054e-05, + "loss": 0.013846226967871189, + "num_input_tokens_seen": 25612064, + "step": 1564, + "train_runtime": 12710.1221, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 0.9484848484848485, + "grad_norm": 0.01711091957986355, + "learning_rate": 9.820106558174413e-05, + "loss": 0.012854847125709057, + "num_input_tokens_seen": 25628440, + "step": 1565, + "train_runtime": 12718.245, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 0.9490909090909091, + "grad_norm": 0.014450309798121452, + "learning_rate": 9.819850848050419e-05, + "loss": 0.013084612786769867, + "num_input_tokens_seen": 25644816, + "step": 1566, + "train_runtime": 12726.3658, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 0.9496969696969697, + "grad_norm": 0.019604388624429703, + "learning_rate": 9.819594959649525e-05, + "loss": 0.014434726908802986, + "num_input_tokens_seen": 25661192, + "step": 1567, + "train_runtime": 12734.4867, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 0.9503030303030303, + "grad_norm": 0.010934766381978989, + "learning_rate": 9.819338892981201e-05, + "loss": 0.012038343586027622, + "num_input_tokens_seen": 25677568, + "step": 1568, + "train_runtime": 12742.6071, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 0.9509090909090909, + "grad_norm": 0.023866428062319756, + "learning_rate": 9.819082648054915e-05, + "loss": 0.012406328693032265, + "num_input_tokens_seen": 25693944, + "step": 1569, + "train_runtime": 12750.7319, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 0.9515151515151515, + "grad_norm": 0.021724287420511246, + "learning_rate": 9.81882622488015e-05, + "loss": 0.013142053037881851, + "num_input_tokens_seen": 25710320, + "step": 1570, + "train_runtime": 12758.8522, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 0.9521212121212121, + "grad_norm": 0.013339296914637089, + "learning_rate": 9.818569623466383e-05, + "loss": 0.012692091055214405, + "num_input_tokens_seen": 25726696, + "step": 1571, + "train_runtime": 12766.9728, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 0.9527272727272728, + "grad_norm": 0.01173945888876915, + "learning_rate": 9.818312843823113e-05, + "loss": 0.012100563384592533, + "num_input_tokens_seen": 25743072, + "step": 1572, + "train_runtime": 12775.0957, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 0.9533333333333334, + "grad_norm": 0.024036092683672905, + "learning_rate": 9.818055885959831e-05, + "loss": 0.01412537693977356, + "num_input_tokens_seen": 25759448, + "step": 1573, + "train_runtime": 12783.2155, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 0.953939393939394, + "grad_norm": 0.017213815823197365, + "learning_rate": 9.817798749886047e-05, + "loss": 0.012354889884591103, + "num_input_tokens_seen": 25775824, + "step": 1574, + "train_runtime": 12791.3381, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 0.9545454545454546, + "grad_norm": 0.00862727127969265, + "learning_rate": 9.817541435611268e-05, + "loss": 0.011515894904732704, + "num_input_tokens_seen": 25792200, + "step": 1575, + "train_runtime": 12799.4582, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 0.9551515151515152, + "grad_norm": 0.015271785669028759, + "learning_rate": 9.817283943145013e-05, + "loss": 0.013525201007723808, + "num_input_tokens_seen": 25808576, + "step": 1576, + "train_runtime": 12807.5792, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 0.9557575757575758, + "grad_norm": 0.011643790639936924, + "learning_rate": 9.817026272496806e-05, + "loss": 0.012195194140076637, + "num_input_tokens_seen": 25824952, + "step": 1577, + "train_runtime": 12815.6998, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 0.9563636363636364, + "grad_norm": 0.014296936802566051, + "learning_rate": 9.81676842367618e-05, + "loss": 0.011279569007456303, + "num_input_tokens_seen": 25841328, + "step": 1578, + "train_runtime": 12823.8181, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 0.9569696969696969, + "grad_norm": 0.0846167728304863, + "learning_rate": 9.816510396692668e-05, + "loss": 0.011400844901800156, + "num_input_tokens_seen": 25857704, + "step": 1579, + "train_runtime": 12831.94, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 0.9575757575757575, + "grad_norm": 0.01264986302703619, + "learning_rate": 9.816252191555818e-05, + "loss": 0.012491201981902122, + "num_input_tokens_seen": 25874080, + "step": 1580, + "train_runtime": 12840.061, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 0.9581818181818181, + "grad_norm": 0.012004262767732143, + "learning_rate": 9.815993808275177e-05, + "loss": 0.012015881948173046, + "num_input_tokens_seen": 25890456, + "step": 1581, + "train_runtime": 12848.1818, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 0.9587878787878787, + "grad_norm": 0.017323743551969528, + "learning_rate": 9.815735246860305e-05, + "loss": 0.012511130422353745, + "num_input_tokens_seen": 25906832, + "step": 1582, + "train_runtime": 12856.3029, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 0.9593939393939394, + "grad_norm": 0.007995963096618652, + "learning_rate": 9.815476507320762e-05, + "loss": 0.012292873114347458, + "num_input_tokens_seen": 25923208, + "step": 1583, + "train_runtime": 12864.4325, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 0.96, + "grad_norm": 0.013012220151722431, + "learning_rate": 9.815217589666124e-05, + "loss": 0.012744070030748844, + "num_input_tokens_seen": 25939584, + "step": 1584, + "train_runtime": 12872.5519, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 0.9606060606060606, + "grad_norm": 0.012448850087821484, + "learning_rate": 9.814958493905963e-05, + "loss": 0.010710782371461391, + "num_input_tokens_seen": 25955960, + "step": 1585, + "train_runtime": 12880.6727, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 0.9612121212121212, + "grad_norm": 0.017448868602514267, + "learning_rate": 9.814699220049863e-05, + "loss": 0.012682373635470867, + "num_input_tokens_seen": 25972336, + "step": 1586, + "train_runtime": 12888.7948, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.9618181818181818, + "grad_norm": 0.009177150204777718, + "learning_rate": 9.814439768107418e-05, + "loss": 0.01149784680455923, + "num_input_tokens_seen": 25988712, + "step": 1587, + "train_runtime": 12896.9201, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 0.9624242424242424, + "grad_norm": 0.046517495065927505, + "learning_rate": 9.814180138088218e-05, + "loss": 0.012817755341529846, + "num_input_tokens_seen": 26005088, + "step": 1588, + "train_runtime": 12905.0427, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 0.963030303030303, + "grad_norm": 0.04576029255986214, + "learning_rate": 9.813920330001872e-05, + "loss": 0.013101590797305107, + "num_input_tokens_seen": 26021464, + "step": 1589, + "train_runtime": 12913.1645, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 0.9636363636363636, + "grad_norm": 0.005771426483988762, + "learning_rate": 9.813660343857988e-05, + "loss": 0.011728906072676182, + "num_input_tokens_seen": 26037840, + "step": 1590, + "train_runtime": 12921.2861, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 0.9642424242424242, + "grad_norm": 0.019825542345643044, + "learning_rate": 9.813400179666181e-05, + "loss": 0.014890195801854134, + "num_input_tokens_seen": 26054216, + "step": 1591, + "train_runtime": 12929.407, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 0.9648484848484848, + "grad_norm": 0.014512875117361546, + "learning_rate": 9.813139837436076e-05, + "loss": 0.012027891352772713, + "num_input_tokens_seen": 26070592, + "step": 1592, + "train_runtime": 12937.5336, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 0.9654545454545455, + "grad_norm": 0.0058611356653273106, + "learning_rate": 9.8128793171773e-05, + "loss": 0.011675585061311722, + "num_input_tokens_seen": 26086968, + "step": 1593, + "train_runtime": 12945.6547, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 0.9660606060606061, + "grad_norm": 0.007386692333966494, + "learning_rate": 9.812618618899491e-05, + "loss": 0.012009193189442158, + "num_input_tokens_seen": 26103344, + "step": 1594, + "train_runtime": 12953.7762, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 0.006147931329905987, + "learning_rate": 9.812357742612293e-05, + "loss": 0.0111166313290596, + "num_input_tokens_seen": 26119720, + "step": 1595, + "train_runtime": 12961.8976, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 0.9672727272727273, + "grad_norm": 0.014322157017886639, + "learning_rate": 9.812096688325354e-05, + "loss": 0.012861751019954681, + "num_input_tokens_seen": 26136096, + "step": 1596, + "train_runtime": 12970.0197, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 0.9678787878787879, + "grad_norm": 0.027148565277457237, + "learning_rate": 9.811835456048328e-05, + "loss": 0.013231952674686909, + "num_input_tokens_seen": 26152472, + "step": 1597, + "train_runtime": 12978.1419, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 0.9684848484848485, + "grad_norm": 0.010908321477472782, + "learning_rate": 9.811574045790879e-05, + "loss": 0.01282698567956686, + "num_input_tokens_seen": 26168848, + "step": 1598, + "train_runtime": 12986.2629, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 0.9690909090909091, + "grad_norm": 0.011924156919121742, + "learning_rate": 9.811312457562678e-05, + "loss": 0.01284240186214447, + "num_input_tokens_seen": 26185224, + "step": 1599, + "train_runtime": 12994.3852, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 0.9696969696969697, + "grad_norm": 0.015434959903359413, + "learning_rate": 9.811050691373396e-05, + "loss": 0.012538165785372257, + "num_input_tokens_seen": 26201600, + "step": 1600, + "train_runtime": 13002.5063, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 0.9703030303030303, + "grad_norm": 0.012002578936517239, + "learning_rate": 9.810788747232721e-05, + "loss": 0.013315416872501373, + "num_input_tokens_seen": 26217976, + "step": 1601, + "train_runtime": 13011.6796, + "train_tokens_per_second": 2014.957 + }, + { + "epoch": 0.9709090909090909, + "grad_norm": 0.014069149270653725, + "learning_rate": 9.810526625150337e-05, + "loss": 0.013467980548739433, + "num_input_tokens_seen": 26234352, + "step": 1602, + "train_runtime": 13019.7981, + "train_tokens_per_second": 2014.958 + }, + { + "epoch": 0.9715151515151516, + "grad_norm": 0.014620055444538593, + "learning_rate": 9.810264325135942e-05, + "loss": 0.013381460681557655, + "num_input_tokens_seen": 26250728, + "step": 1603, + "train_runtime": 13027.918, + "train_tokens_per_second": 2014.96 + }, + { + "epoch": 0.9721212121212122, + "grad_norm": 0.01015267800539732, + "learning_rate": 9.810001847199237e-05, + "loss": 0.011481476947665215, + "num_input_tokens_seen": 26267104, + "step": 1604, + "train_runtime": 13036.036, + "train_tokens_per_second": 2014.961 + }, + { + "epoch": 0.9727272727272728, + "grad_norm": 0.010799713432788849, + "learning_rate": 9.80973919134993e-05, + "loss": 0.012389612384140491, + "num_input_tokens_seen": 26283480, + "step": 1605, + "train_runtime": 13044.1551, + "train_tokens_per_second": 2014.962 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 0.010257367044687271, + "learning_rate": 9.809476357597738e-05, + "loss": 0.01151387207210064, + "num_input_tokens_seen": 26299856, + "step": 1606, + "train_runtime": 13052.2743, + "train_tokens_per_second": 2014.963 + }, + { + "epoch": 0.973939393939394, + "grad_norm": 0.013825331814587116, + "learning_rate": 9.809213345952381e-05, + "loss": 0.011732700280845165, + "num_input_tokens_seen": 26316232, + "step": 1607, + "train_runtime": 13060.3933, + "train_tokens_per_second": 2014.965 + }, + { + "epoch": 0.9745454545454545, + "grad_norm": 0.012632308527827263, + "learning_rate": 9.808950156423588e-05, + "loss": 0.012355693615972996, + "num_input_tokens_seen": 26332608, + "step": 1608, + "train_runtime": 13068.5111, + "train_tokens_per_second": 2014.966 + }, + { + "epoch": 0.9751515151515151, + "grad_norm": 0.002986186882480979, + "learning_rate": 9.808686789021093e-05, + "loss": 0.011115066707134247, + "num_input_tokens_seen": 26348984, + "step": 1609, + "train_runtime": 13076.6315, + "train_tokens_per_second": 2014.967 + }, + { + "epoch": 0.9757575757575757, + "grad_norm": 0.14690753817558289, + "learning_rate": 9.808423243754639e-05, + "loss": 0.011805863119661808, + "num_input_tokens_seen": 26365360, + "step": 1610, + "train_runtime": 13084.7512, + "train_tokens_per_second": 2014.968 + }, + { + "epoch": 0.9763636363636363, + "grad_norm": 0.010931416414678097, + "learning_rate": 9.808159520633973e-05, + "loss": 0.010769207030534744, + "num_input_tokens_seen": 26381736, + "step": 1611, + "train_runtime": 13092.8686, + "train_tokens_per_second": 2014.97 + }, + { + "epoch": 0.9769696969696969, + "grad_norm": 0.005970633123070002, + "learning_rate": 9.80789561966885e-05, + "loss": 0.011445348151028156, + "num_input_tokens_seen": 26398112, + "step": 1612, + "train_runtime": 13101.0028, + "train_tokens_per_second": 2014.969 + }, + { + "epoch": 0.9775757575757575, + "grad_norm": 0.014702706597745419, + "learning_rate": 9.80763154086903e-05, + "loss": 0.012894170358777046, + "num_input_tokens_seen": 26414488, + "step": 1613, + "train_runtime": 13109.121, + "train_tokens_per_second": 2014.97 + }, + { + "epoch": 0.9781818181818182, + "grad_norm": 0.012428919784724712, + "learning_rate": 9.807367284244282e-05, + "loss": 0.012821591459214687, + "num_input_tokens_seen": 26430864, + "step": 1614, + "train_runtime": 13117.2386, + "train_tokens_per_second": 2014.972 + }, + { + "epoch": 0.9787878787878788, + "grad_norm": 0.06160819157958031, + "learning_rate": 9.807102849804381e-05, + "loss": 0.013576450757682323, + "num_input_tokens_seen": 26447240, + "step": 1615, + "train_runtime": 13125.3585, + "train_tokens_per_second": 2014.973 + }, + { + "epoch": 0.9793939393939394, + "grad_norm": 0.017494510859251022, + "learning_rate": 9.806838237559107e-05, + "loss": 0.013401782140135765, + "num_input_tokens_seen": 26463616, + "step": 1616, + "train_runtime": 13133.477, + "train_tokens_per_second": 2014.974 + }, + { + "epoch": 0.98, + "grad_norm": 0.05766647309064865, + "learning_rate": 9.806573447518246e-05, + "loss": 0.014988632872700691, + "num_input_tokens_seen": 26479992, + "step": 1617, + "train_runtime": 13141.5957, + "train_tokens_per_second": 2014.975 + }, + { + "epoch": 0.9806060606060606, + "grad_norm": 0.0317838154733181, + "learning_rate": 9.806308479691595e-05, + "loss": 0.011899848468601704, + "num_input_tokens_seen": 26496368, + "step": 1618, + "train_runtime": 13149.7145, + "train_tokens_per_second": 2014.977 + }, + { + "epoch": 0.9812121212121212, + "grad_norm": 0.0214092880487442, + "learning_rate": 9.806043334088952e-05, + "loss": 0.011899617500603199, + "num_input_tokens_seen": 26512744, + "step": 1619, + "train_runtime": 13157.8348, + "train_tokens_per_second": 2014.978 + }, + { + "epoch": 0.9818181818181818, + "grad_norm": 0.020651455968618393, + "learning_rate": 9.805778010720126e-05, + "loss": 0.012952842749655247, + "num_input_tokens_seen": 26529120, + "step": 1620, + "train_runtime": 13165.9527, + "train_tokens_per_second": 2014.979 + }, + { + "epoch": 0.9824242424242424, + "grad_norm": 0.011873727664351463, + "learning_rate": 9.80551250959493e-05, + "loss": 0.012238996103405952, + "num_input_tokens_seen": 26545496, + "step": 1621, + "train_runtime": 13174.0706, + "train_tokens_per_second": 2014.981 + }, + { + "epoch": 0.983030303030303, + "grad_norm": 0.008294295519590378, + "learning_rate": 9.805246830723186e-05, + "loss": 0.013415738008916378, + "num_input_tokens_seen": 26561872, + "step": 1622, + "train_runtime": 13182.1887, + "train_tokens_per_second": 2014.982 + }, + { + "epoch": 0.9836363636363636, + "grad_norm": 0.006087975576519966, + "learning_rate": 9.804980974114719e-05, + "loss": 0.01233192440122366, + "num_input_tokens_seen": 26578248, + "step": 1623, + "train_runtime": 13190.3059, + "train_tokens_per_second": 2014.983 + }, + { + "epoch": 0.9842424242424243, + "grad_norm": 0.014780385419726372, + "learning_rate": 9.804714939779362e-05, + "loss": 0.012391527183353901, + "num_input_tokens_seen": 26594624, + "step": 1624, + "train_runtime": 13198.4311, + "train_tokens_per_second": 2014.984 + }, + { + "epoch": 0.9848484848484849, + "grad_norm": 0.027467984706163406, + "learning_rate": 9.804448727726956e-05, + "loss": 0.013444105163216591, + "num_input_tokens_seen": 26611000, + "step": 1625, + "train_runtime": 13206.5451, + "train_tokens_per_second": 2014.986 + }, + { + "epoch": 0.9854545454545455, + "grad_norm": 0.019479792565107346, + "learning_rate": 9.804182337967349e-05, + "loss": 0.012373380362987518, + "num_input_tokens_seen": 26627376, + "step": 1626, + "train_runtime": 13214.6617, + "train_tokens_per_second": 2014.987 + }, + { + "epoch": 0.9860606060606061, + "grad_norm": 0.01583736762404442, + "learning_rate": 9.803915770510393e-05, + "loss": 0.01365045364946127, + "num_input_tokens_seen": 26643752, + "step": 1627, + "train_runtime": 13222.7801, + "train_tokens_per_second": 2014.989 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 0.009480198845267296, + "learning_rate": 9.803649025365947e-05, + "loss": 0.012557166628539562, + "num_input_tokens_seen": 26660128, + "step": 1628, + "train_runtime": 13230.8971, + "train_tokens_per_second": 2014.99 + }, + { + "epoch": 0.9872727272727273, + "grad_norm": 0.04188961163163185, + "learning_rate": 9.803382102543879e-05, + "loss": 0.012879314832389355, + "num_input_tokens_seen": 26676504, + "step": 1629, + "train_runtime": 13239.0155, + "train_tokens_per_second": 2014.992 + }, + { + "epoch": 0.9878787878787879, + "grad_norm": 0.025775199756026268, + "learning_rate": 9.80311500205406e-05, + "loss": 0.012588057667016983, + "num_input_tokens_seen": 26692880, + "step": 1630, + "train_runtime": 13247.1337, + "train_tokens_per_second": 2014.993 + }, + { + "epoch": 0.9884848484848485, + "grad_norm": 0.013006957247853279, + "learning_rate": 9.802847723906371e-05, + "loss": 0.012539117597043514, + "num_input_tokens_seen": 26709256, + "step": 1631, + "train_runtime": 13255.2503, + "train_tokens_per_second": 2014.994 + }, + { + "epoch": 0.9890909090909091, + "grad_norm": 0.015388661995530128, + "learning_rate": 9.802580268110699e-05, + "loss": 0.013457395136356354, + "num_input_tokens_seen": 26725632, + "step": 1632, + "train_runtime": 13263.3681, + "train_tokens_per_second": 2014.996 + }, + { + "epoch": 0.9896969696969697, + "grad_norm": 0.015193904750049114, + "learning_rate": 9.802312634676934e-05, + "loss": 0.012364407069981098, + "num_input_tokens_seen": 26742008, + "step": 1633, + "train_runtime": 13271.4872, + "train_tokens_per_second": 2014.997 + }, + { + "epoch": 0.9903030303030304, + "grad_norm": 0.008997797966003418, + "learning_rate": 9.802044823614978e-05, + "loss": 0.011594077572226524, + "num_input_tokens_seen": 26758384, + "step": 1634, + "train_runtime": 13279.6059, + "train_tokens_per_second": 2014.998 + }, + { + "epoch": 0.990909090909091, + "grad_norm": 0.008219058625400066, + "learning_rate": 9.801776834934736e-05, + "loss": 0.011843642219901085, + "num_input_tokens_seen": 26774760, + "step": 1635, + "train_runtime": 13287.7237, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 0.9915151515151515, + "grad_norm": 0.010219755582511425, + "learning_rate": 9.801508668646118e-05, + "loss": 0.013223512098193169, + "num_input_tokens_seen": 26791136, + "step": 1636, + "train_runtime": 13295.8407, + "train_tokens_per_second": 2015.001 + }, + { + "epoch": 0.9921212121212121, + "grad_norm": 0.006228118669241667, + "learning_rate": 9.801240324759045e-05, + "loss": 0.011068768799304962, + "num_input_tokens_seen": 26807512, + "step": 1637, + "train_runtime": 13303.9585, + "train_tokens_per_second": 2015.003 + }, + { + "epoch": 0.9927272727272727, + "grad_norm": 0.02005128748714924, + "learning_rate": 9.800971803283443e-05, + "loss": 0.01461564190685749, + "num_input_tokens_seen": 26823888, + "step": 1638, + "train_runtime": 13312.0757, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 0.9933333333333333, + "grad_norm": 0.01215451955795288, + "learning_rate": 9.800703104229245e-05, + "loss": 0.012672146782279015, + "num_input_tokens_seen": 26840264, + "step": 1639, + "train_runtime": 13320.1936, + "train_tokens_per_second": 2015.006 + }, + { + "epoch": 0.9939393939393939, + "grad_norm": 0.007676825392991304, + "learning_rate": 9.800434227606385e-05, + "loss": 0.011905834078788757, + "num_input_tokens_seen": 26856640, + "step": 1640, + "train_runtime": 13328.3104, + "train_tokens_per_second": 2015.007 + }, + { + "epoch": 0.9945454545454545, + "grad_norm": 0.007540915627032518, + "learning_rate": 9.800165173424814e-05, + "loss": 0.011878485791385174, + "num_input_tokens_seen": 26873016, + "step": 1641, + "train_runtime": 13336.4323, + "train_tokens_per_second": 2015.008 + }, + { + "epoch": 0.9951515151515151, + "grad_norm": 0.00563439354300499, + "learning_rate": 9.799895941694481e-05, + "loss": 0.013368148356676102, + "num_input_tokens_seen": 26889392, + "step": 1642, + "train_runtime": 13344.5508, + "train_tokens_per_second": 2015.009 + }, + { + "epoch": 0.9957575757575757, + "grad_norm": 0.015325483866035938, + "learning_rate": 9.799626532425343e-05, + "loss": 0.012677650898694992, + "num_input_tokens_seen": 26905768, + "step": 1643, + "train_runtime": 13352.671, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 0.9963636363636363, + "grad_norm": 0.021497316658496857, + "learning_rate": 9.799356945627368e-05, + "loss": 0.012580220587551594, + "num_input_tokens_seen": 26922144, + "step": 1644, + "train_runtime": 13360.7875, + "train_tokens_per_second": 2015.012 + }, + { + "epoch": 0.996969696969697, + "grad_norm": 0.012657717801630497, + "learning_rate": 9.799087181310524e-05, + "loss": 0.012727495282888412, + "num_input_tokens_seen": 26938520, + "step": 1645, + "train_runtime": 13368.9042, + "train_tokens_per_second": 2015.013 + }, + { + "epoch": 0.9975757575757576, + "grad_norm": 0.011814654804766178, + "learning_rate": 9.798817239484792e-05, + "loss": 0.012600673362612724, + "num_input_tokens_seen": 26954896, + "step": 1646, + "train_runtime": 13377.0215, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 0.9981818181818182, + "grad_norm": 0.020266059786081314, + "learning_rate": 9.798547120160156e-05, + "loss": 0.01349579356610775, + "num_input_tokens_seen": 26971272, + "step": 1647, + "train_runtime": 13385.1417, + "train_tokens_per_second": 2015.016 + }, + { + "epoch": 0.9987878787878788, + "grad_norm": 0.013499039225280285, + "learning_rate": 9.798276823346606e-05, + "loss": 0.013363813050091267, + "num_input_tokens_seen": 26987648, + "step": 1648, + "train_runtime": 13393.2596, + "train_tokens_per_second": 2015.017 + }, + { + "epoch": 0.9993939393939394, + "grad_norm": 0.013451367616653442, + "learning_rate": 9.79800634905414e-05, + "loss": 0.013179901987314224, + "num_input_tokens_seen": 27004024, + "step": 1649, + "train_runtime": 13401.3766, + "train_tokens_per_second": 2015.019 + }, + { + "epoch": 1.0, + "grad_norm": 0.008038493804633617, + "learning_rate": 9.797735697292765e-05, + "loss": 0.012226985767483711, + "num_input_tokens_seen": 27020400, + "step": 1650, + "train_runtime": 13409.4946, + "train_tokens_per_second": 2015.02 + }, + { + "epoch": 1.0006060606060605, + "grad_norm": 0.012623721733689308, + "learning_rate": 9.797464868072488e-05, + "loss": 0.012360217981040478, + "num_input_tokens_seen": 27036776, + "step": 1651, + "train_runtime": 13417.6145, + "train_tokens_per_second": 2015.021 + }, + { + "epoch": 1.0012121212121212, + "grad_norm": 0.012440846301615238, + "learning_rate": 9.797193861403329e-05, + "loss": 0.01371823437511921, + "num_input_tokens_seen": 27053152, + "step": 1652, + "train_runtime": 13425.7343, + "train_tokens_per_second": 2015.022 + }, + { + "epoch": 1.0018181818181817, + "grad_norm": 0.009919981472194195, + "learning_rate": 9.79692267729531e-05, + "loss": 0.01273531373590231, + "num_input_tokens_seen": 27069528, + "step": 1653, + "train_runtime": 13433.8539, + "train_tokens_per_second": 2015.023 + }, + { + "epoch": 1.0024242424242424, + "grad_norm": 0.00648895651102066, + "learning_rate": 9.796651315758463e-05, + "loss": 0.012470672838389874, + "num_input_tokens_seen": 27085904, + "step": 1654, + "train_runtime": 13441.9722, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.003030303030303, + "grad_norm": 0.010717087425291538, + "learning_rate": 9.796379776802826e-05, + "loss": 0.012379986234009266, + "num_input_tokens_seen": 27102280, + "step": 1655, + "train_runtime": 13450.0852, + "train_tokens_per_second": 2015.027 + }, + { + "epoch": 1.0036363636363637, + "grad_norm": 0.010810550302267075, + "learning_rate": 9.79610806043844e-05, + "loss": 0.012405799701809883, + "num_input_tokens_seen": 27118656, + "step": 1656, + "train_runtime": 13458.1958, + "train_tokens_per_second": 2015.029 + }, + { + "epoch": 1.0042424242424242, + "grad_norm": 0.012775463983416557, + "learning_rate": 9.795836166675358e-05, + "loss": 0.01413954608142376, + "num_input_tokens_seen": 27135032, + "step": 1657, + "train_runtime": 13466.3055, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 1.0048484848484849, + "grad_norm": 0.008526409044861794, + "learning_rate": 9.795564095523635e-05, + "loss": 0.012026500888168812, + "num_input_tokens_seen": 27151408, + "step": 1658, + "train_runtime": 13474.4176, + "train_tokens_per_second": 2015.034 + }, + { + "epoch": 1.0054545454545454, + "grad_norm": 0.02498007006943226, + "learning_rate": 9.795291846993337e-05, + "loss": 0.012756639160215855, + "num_input_tokens_seen": 27167784, + "step": 1659, + "train_runtime": 13482.5333, + "train_tokens_per_second": 2015.036 + }, + { + "epoch": 1.006060606060606, + "grad_norm": 0.007863683626055717, + "learning_rate": 9.79501942109453e-05, + "loss": 0.010186624713242054, + "num_input_tokens_seen": 27184160, + "step": 1660, + "train_runtime": 13490.6437, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.0066666666666666, + "grad_norm": 0.01177004911005497, + "learning_rate": 9.794746817837293e-05, + "loss": 0.01371595449745655, + "num_input_tokens_seen": 27200536, + "step": 1661, + "train_runtime": 13498.7517, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 1.0072727272727273, + "grad_norm": 0.016004914417862892, + "learning_rate": 9.794474037231708e-05, + "loss": 0.015218988992273808, + "num_input_tokens_seen": 27216912, + "step": 1662, + "train_runtime": 13506.8607, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.0078787878787878, + "grad_norm": 0.002983122132718563, + "learning_rate": 9.794201079287865e-05, + "loss": 0.01247552502900362, + "num_input_tokens_seen": 27233288, + "step": 1663, + "train_runtime": 13514.9678, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.0084848484848485, + "grad_norm": 0.007749462965875864, + "learning_rate": 9.793927944015862e-05, + "loss": 0.013320086523890495, + "num_input_tokens_seen": 27249664, + "step": 1664, + "train_runtime": 13523.0773, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.009090909090909, + "grad_norm": 0.008403261192142963, + "learning_rate": 9.7936546314258e-05, + "loss": 0.01121437270194292, + "num_input_tokens_seen": 27266040, + "step": 1665, + "train_runtime": 13531.191, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.0096969696969698, + "grad_norm": 0.009295761585235596, + "learning_rate": 9.793381141527786e-05, + "loss": 0.011189664714038372, + "num_input_tokens_seen": 27282416, + "step": 1666, + "train_runtime": 13539.3003, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.0103030303030303, + "grad_norm": 0.008638166822493076, + "learning_rate": 9.79310747433194e-05, + "loss": 0.012114373967051506, + "num_input_tokens_seen": 27298792, + "step": 1667, + "train_runtime": 13547.4075, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.010909090909091, + "grad_norm": 0.013895823620259762, + "learning_rate": 9.792833629848384e-05, + "loss": 0.012441890314221382, + "num_input_tokens_seen": 27315168, + "step": 1668, + "train_runtime": 13555.5158, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.0115151515151515, + "grad_norm": 0.010214622132480145, + "learning_rate": 9.792559608087243e-05, + "loss": 0.013287513516843319, + "num_input_tokens_seen": 27331544, + "step": 1669, + "train_runtime": 13563.632, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.0121212121212122, + "grad_norm": 0.049505386501550674, + "learning_rate": 9.792285409058657e-05, + "loss": 0.014072421938180923, + "num_input_tokens_seen": 27347920, + "step": 1670, + "train_runtime": 13571.7428, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.0127272727272727, + "grad_norm": 0.0072554643265903, + "learning_rate": 9.792011032772765e-05, + "loss": 0.012629404664039612, + "num_input_tokens_seen": 27364296, + "step": 1671, + "train_runtime": 13579.8507, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 0.03262757137417793, + "learning_rate": 9.791736479239717e-05, + "loss": 0.013759227469563484, + "num_input_tokens_seen": 27380672, + "step": 1672, + "train_runtime": 13587.9608, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.013939393939394, + "grad_norm": 0.012852982617914677, + "learning_rate": 9.791461748469669e-05, + "loss": 0.012652904726564884, + "num_input_tokens_seen": 27397048, + "step": 1673, + "train_runtime": 13596.0726, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.0145454545454546, + "grad_norm": 0.008402649313211441, + "learning_rate": 9.791186840472781e-05, + "loss": 0.012419513426721096, + "num_input_tokens_seen": 27413424, + "step": 1674, + "train_runtime": 13604.1792, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.0151515151515151, + "grad_norm": 0.00858678761869669, + "learning_rate": 9.790911755259223e-05, + "loss": 0.011973465792834759, + "num_input_tokens_seen": 27429800, + "step": 1675, + "train_runtime": 13612.2879, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.0157575757575759, + "grad_norm": 0.00914035551249981, + "learning_rate": 9.79063649283917e-05, + "loss": 0.012171374633908272, + "num_input_tokens_seen": 27446176, + "step": 1676, + "train_runtime": 13620.3961, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.0163636363636364, + "grad_norm": 0.01112990453839302, + "learning_rate": 9.790361053222799e-05, + "loss": 0.013811156153678894, + "num_input_tokens_seen": 27462552, + "step": 1677, + "train_runtime": 13628.5074, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.016969696969697, + "grad_norm": 0.008288376964628696, + "learning_rate": 9.790085436420304e-05, + "loss": 0.012029111385345459, + "num_input_tokens_seen": 27478928, + "step": 1678, + "train_runtime": 13636.6158, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.0175757575757576, + "grad_norm": 0.009814487770199776, + "learning_rate": 9.789809642441877e-05, + "loss": 0.012732294388115406, + "num_input_tokens_seen": 27495304, + "step": 1679, + "train_runtime": 13644.7216, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.018181818181818, + "grad_norm": 0.014307788573205471, + "learning_rate": 9.789533671297719e-05, + "loss": 0.013805416412651539, + "num_input_tokens_seen": 27511680, + "step": 1680, + "train_runtime": 13652.8336, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.0187878787878788, + "grad_norm": 0.012851865030825138, + "learning_rate": 9.789257522998037e-05, + "loss": 0.012965833768248558, + "num_input_tokens_seen": 27528056, + "step": 1681, + "train_runtime": 13660.9411, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.0193939393939393, + "grad_norm": 0.01544270757585764, + "learning_rate": 9.788981197553047e-05, + "loss": 0.012375458143651485, + "num_input_tokens_seen": 27544432, + "step": 1682, + "train_runtime": 13669.0468, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.02, + "grad_norm": 0.017993232235312462, + "learning_rate": 9.788704694972967e-05, + "loss": 0.012339062988758087, + "num_input_tokens_seen": 27560808, + "step": 1683, + "train_runtime": 13677.1562, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.0206060606060605, + "grad_norm": 0.011097032576799393, + "learning_rate": 9.788428015268027e-05, + "loss": 0.01293334923684597, + "num_input_tokens_seen": 27577184, + "step": 1684, + "train_runtime": 13685.2675, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.0212121212121212, + "grad_norm": 0.009788686409592628, + "learning_rate": 9.78815115844846e-05, + "loss": 0.013497721403837204, + "num_input_tokens_seen": 27593560, + "step": 1685, + "train_runtime": 13693.378, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.0218181818181817, + "grad_norm": 0.011068359948694706, + "learning_rate": 9.787874124524505e-05, + "loss": 0.012450134381651878, + "num_input_tokens_seen": 27609936, + "step": 1686, + "train_runtime": 13701.4857, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.0224242424242425, + "grad_norm": 0.010093600489199162, + "learning_rate": 9.78759691350641e-05, + "loss": 0.012252770364284515, + "num_input_tokens_seen": 27626312, + "step": 1687, + "train_runtime": 13709.5946, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.023030303030303, + "grad_norm": 0.0072128004394471645, + "learning_rate": 9.78731952540443e-05, + "loss": 0.011251643300056458, + "num_input_tokens_seen": 27642688, + "step": 1688, + "train_runtime": 13717.7048, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.0236363636363637, + "grad_norm": 0.01251761894673109, + "learning_rate": 9.787041960228823e-05, + "loss": 0.013902283273637295, + "num_input_tokens_seen": 27659064, + "step": 1689, + "train_runtime": 13725.8143, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.0242424242424242, + "grad_norm": 0.011588496156036854, + "learning_rate": 9.786764217989856e-05, + "loss": 0.0121589545160532, + "num_input_tokens_seen": 27675440, + "step": 1690, + "train_runtime": 13733.9208, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.024848484848485, + "grad_norm": 0.013498706743121147, + "learning_rate": 9.786486298697803e-05, + "loss": 0.014338000677525997, + "num_input_tokens_seen": 27691816, + "step": 1691, + "train_runtime": 13742.0332, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.0254545454545454, + "grad_norm": 0.008712450042366982, + "learning_rate": 9.786208202362943e-05, + "loss": 0.011765317060053349, + "num_input_tokens_seen": 27708192, + "step": 1692, + "train_runtime": 13750.1435, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.0260606060606061, + "grad_norm": 0.012163599021732807, + "learning_rate": 9.785929928995561e-05, + "loss": 0.013479816727340221, + "num_input_tokens_seen": 27724568, + "step": 1693, + "train_runtime": 13758.2542, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.0266666666666666, + "grad_norm": 0.011169539764523506, + "learning_rate": 9.785651478605953e-05, + "loss": 0.011647282168269157, + "num_input_tokens_seen": 27740944, + "step": 1694, + "train_runtime": 13766.3635, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.0272727272727273, + "grad_norm": 0.008172113448381424, + "learning_rate": 9.785372851204415e-05, + "loss": 0.013068556785583496, + "num_input_tokens_seen": 27757320, + "step": 1695, + "train_runtime": 13774.4757, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.0278787878787878, + "grad_norm": 0.01855943165719509, + "learning_rate": 9.785094046801256e-05, + "loss": 0.012416105717420578, + "num_input_tokens_seen": 27773696, + "step": 1696, + "train_runtime": 13782.5835, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.0284848484848486, + "grad_norm": 0.011047457344830036, + "learning_rate": 9.784815065406785e-05, + "loss": 0.01277101319283247, + "num_input_tokens_seen": 27790072, + "step": 1697, + "train_runtime": 13790.6921, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.029090909090909, + "grad_norm": 0.015314662829041481, + "learning_rate": 9.784535907031322e-05, + "loss": 0.01302441954612732, + "num_input_tokens_seen": 27806448, + "step": 1698, + "train_runtime": 13798.8029, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.0296969696969698, + "grad_norm": 0.00843130238354206, + "learning_rate": 9.784256571685195e-05, + "loss": 0.012320177629590034, + "num_input_tokens_seen": 27822824, + "step": 1699, + "train_runtime": 13806.9118, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.0303030303030303, + "grad_norm": 0.022686941549181938, + "learning_rate": 9.783977059378734e-05, + "loss": 0.013117888011038303, + "num_input_tokens_seen": 27839200, + "step": 1700, + "train_runtime": 13815.0208, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.030909090909091, + "grad_norm": 0.01333204098045826, + "learning_rate": 9.783697370122278e-05, + "loss": 0.014601165428757668, + "num_input_tokens_seen": 27855576, + "step": 1701, + "train_runtime": 13824.1682, + "train_tokens_per_second": 2014.991 + }, + { + "epoch": 1.0315151515151515, + "grad_norm": 0.014649780467152596, + "learning_rate": 9.783417503926172e-05, + "loss": 0.013181449845433235, + "num_input_tokens_seen": 27871952, + "step": 1702, + "train_runtime": 13832.2803, + "train_tokens_per_second": 2014.993 + }, + { + "epoch": 1.0321212121212122, + "grad_norm": 0.011895393021404743, + "learning_rate": 9.783137460800768e-05, + "loss": 0.012327020056545734, + "num_input_tokens_seen": 27888328, + "step": 1703, + "train_runtime": 13840.3903, + "train_tokens_per_second": 2014.996 + }, + { + "epoch": 1.0327272727272727, + "grad_norm": 0.009198145009577274, + "learning_rate": 9.782857240756423e-05, + "loss": 0.011196177452802658, + "num_input_tokens_seen": 27904704, + "step": 1704, + "train_runtime": 13848.5004, + "train_tokens_per_second": 2014.998 + }, + { + "epoch": 1.0333333333333334, + "grad_norm": 0.008451443165540695, + "learning_rate": 9.782576843803504e-05, + "loss": 0.011635595001280308, + "num_input_tokens_seen": 27921080, + "step": 1705, + "train_runtime": 13856.607, + "train_tokens_per_second": 2015.001 + }, + { + "epoch": 1.033939393939394, + "grad_norm": 0.016875306144356728, + "learning_rate": 9.78229626995238e-05, + "loss": 0.012971418909728527, + "num_input_tokens_seen": 27937456, + "step": 1706, + "train_runtime": 13864.7179, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 1.0345454545454547, + "grad_norm": 0.01164314430207014, + "learning_rate": 9.782015519213433e-05, + "loss": 0.013034064322710037, + "num_input_tokens_seen": 27953832, + "step": 1707, + "train_runtime": 13872.8335, + "train_tokens_per_second": 2015.005 + }, + { + "epoch": 1.0351515151515152, + "grad_norm": 0.010639763437211514, + "learning_rate": 9.78173459159704e-05, + "loss": 0.01193158607929945, + "num_input_tokens_seen": 27970208, + "step": 1708, + "train_runtime": 13880.9435, + "train_tokens_per_second": 2015.008 + }, + { + "epoch": 1.0357575757575757, + "grad_norm": 0.012052073143422604, + "learning_rate": 9.7814534871136e-05, + "loss": 0.012332379817962646, + "num_input_tokens_seen": 27986584, + "step": 1709, + "train_runtime": 13889.0553, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 1.0363636363636364, + "grad_norm": 0.010986050590872765, + "learning_rate": 9.781172205773506e-05, + "loss": 0.011283627711236477, + "num_input_tokens_seen": 28002960, + "step": 1710, + "train_runtime": 13897.165, + "train_tokens_per_second": 2015.012 + }, + { + "epoch": 1.0369696969696969, + "grad_norm": 0.012194296345114708, + "learning_rate": 9.780890747587164e-05, + "loss": 0.012133404612541199, + "num_input_tokens_seen": 28019336, + "step": 1711, + "train_runtime": 13905.2722, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 1.0375757575757576, + "grad_norm": 0.011508403345942497, + "learning_rate": 9.780609112564981e-05, + "loss": 0.012315447442233562, + "num_input_tokens_seen": 28035712, + "step": 1712, + "train_runtime": 13913.3813, + "train_tokens_per_second": 2015.018 + }, + { + "epoch": 1.038181818181818, + "grad_norm": 0.009075857698917389, + "learning_rate": 9.780327300717378e-05, + "loss": 0.013060295023024082, + "num_input_tokens_seen": 28052088, + "step": 1713, + "train_runtime": 13921.4903, + "train_tokens_per_second": 2015.02 + }, + { + "epoch": 1.0387878787878788, + "grad_norm": 0.011064046993851662, + "learning_rate": 9.780045312054778e-05, + "loss": 0.011568753980100155, + "num_input_tokens_seen": 28068464, + "step": 1714, + "train_runtime": 13929.6024, + "train_tokens_per_second": 2015.023 + }, + { + "epoch": 1.0393939393939393, + "grad_norm": 0.006963782943785191, + "learning_rate": 9.77976314658761e-05, + "loss": 0.013147801160812378, + "num_input_tokens_seen": 28084840, + "step": 1715, + "train_runtime": 13937.7105, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.04, + "grad_norm": 0.01637214981019497, + "learning_rate": 9.779480804326313e-05, + "loss": 0.013339829631149769, + "num_input_tokens_seen": 28101216, + "step": 1716, + "train_runtime": 13945.8208, + "train_tokens_per_second": 2015.028 + }, + { + "epoch": 1.0406060606060605, + "grad_norm": 0.017523132264614105, + "learning_rate": 9.779198285281325e-05, + "loss": 0.013437901623547077, + "num_input_tokens_seen": 28117592, + "step": 1717, + "train_runtime": 13953.9357, + "train_tokens_per_second": 2015.029 + }, + { + "epoch": 1.0412121212121213, + "grad_norm": 0.010818637907505035, + "learning_rate": 9.778915589463102e-05, + "loss": 0.012181894853711128, + "num_input_tokens_seen": 28133968, + "step": 1718, + "train_runtime": 13962.0499, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 1.0418181818181818, + "grad_norm": 0.015641039237380028, + "learning_rate": 9.7786327168821e-05, + "loss": 0.012458113953471184, + "num_input_tokens_seen": 28150344, + "step": 1719, + "train_runtime": 13970.1615, + "train_tokens_per_second": 2015.034 + }, + { + "epoch": 1.0424242424242425, + "grad_norm": 0.01187529880553484, + "learning_rate": 9.778349667548776e-05, + "loss": 0.012462708167731762, + "num_input_tokens_seen": 28166720, + "step": 1720, + "train_runtime": 13978.2715, + "train_tokens_per_second": 2015.036 + }, + { + "epoch": 1.043030303030303, + "grad_norm": 0.006183183286339045, + "learning_rate": 9.778066441473604e-05, + "loss": 0.011370932683348656, + "num_input_tokens_seen": 28183096, + "step": 1721, + "train_runtime": 13986.3841, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.0436363636363637, + "grad_norm": 0.004316729959100485, + "learning_rate": 9.777783038667061e-05, + "loss": 0.010927550494670868, + "num_input_tokens_seen": 28199472, + "step": 1722, + "train_runtime": 13994.4946, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.0442424242424242, + "grad_norm": 0.009708013385534286, + "learning_rate": 9.777499459139626e-05, + "loss": 0.01241978257894516, + "num_input_tokens_seen": 28215848, + "step": 1723, + "train_runtime": 14002.6047, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.044848484848485, + "grad_norm": 0.01743965595960617, + "learning_rate": 9.777215702901789e-05, + "loss": 0.012833865359425545, + "num_input_tokens_seen": 28232224, + "step": 1724, + "train_runtime": 14010.7147, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.0454545454545454, + "grad_norm": 0.016349952667951584, + "learning_rate": 9.776931769964049e-05, + "loss": 0.012332115322351456, + "num_input_tokens_seen": 28248600, + "step": 1725, + "train_runtime": 14018.8331, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.0460606060606061, + "grad_norm": 0.012781591154634953, + "learning_rate": 9.776647660336903e-05, + "loss": 0.013009129092097282, + "num_input_tokens_seen": 28264976, + "step": 1726, + "train_runtime": 14026.9411, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.0466666666666666, + "grad_norm": 0.0058420756831765175, + "learning_rate": 9.776363374030864e-05, + "loss": 0.01141081377863884, + "num_input_tokens_seen": 28281352, + "step": 1727, + "train_runtime": 14035.0484, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.0472727272727274, + "grad_norm": 0.010841155424714088, + "learning_rate": 9.776078911056445e-05, + "loss": 0.011902189813554287, + "num_input_tokens_seen": 28297728, + "step": 1728, + "train_runtime": 14043.1635, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.0478787878787879, + "grad_norm": 0.006410760339349508, + "learning_rate": 9.775794271424168e-05, + "loss": 0.011528456583619118, + "num_input_tokens_seen": 28314104, + "step": 1729, + "train_runtime": 14051.2752, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.0484848484848486, + "grad_norm": 0.01570526696741581, + "learning_rate": 9.77550945514456e-05, + "loss": 0.01312203984707594, + "num_input_tokens_seen": 28330480, + "step": 1730, + "train_runtime": 14059.3837, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.049090909090909, + "grad_norm": 0.01310622040182352, + "learning_rate": 9.775224462228159e-05, + "loss": 0.012747850269079208, + "num_input_tokens_seen": 28346856, + "step": 1731, + "train_runtime": 14067.4947, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.0496969696969698, + "grad_norm": 0.01001759059727192, + "learning_rate": 9.774939292685504e-05, + "loss": 0.012231552973389626, + "num_input_tokens_seen": 28363232, + "step": 1732, + "train_runtime": 14075.6078, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.0503030303030303, + "grad_norm": 0.008718959987163544, + "learning_rate": 9.774653946527141e-05, + "loss": 0.012665827758610249, + "num_input_tokens_seen": 28379608, + "step": 1733, + "train_runtime": 14083.7165, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.050909090909091, + "grad_norm": 0.013324756175279617, + "learning_rate": 9.774368423763629e-05, + "loss": 0.012266149744391441, + "num_input_tokens_seen": 28395984, + "step": 1734, + "train_runtime": 14091.8314, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.0515151515151515, + "grad_norm": 0.013658484444022179, + "learning_rate": 9.774082724405526e-05, + "loss": 0.013175414875149727, + "num_input_tokens_seen": 28412360, + "step": 1735, + "train_runtime": 14099.9432, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.0521212121212122, + "grad_norm": 0.005875375587493181, + "learning_rate": 9.773796848463402e-05, + "loss": 0.011622895486652851, + "num_input_tokens_seen": 28428736, + "step": 1736, + "train_runtime": 14108.0531, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.0527272727272727, + "grad_norm": 0.007178007159382105, + "learning_rate": 9.773510795947827e-05, + "loss": 0.010762317106127739, + "num_input_tokens_seen": 28445112, + "step": 1737, + "train_runtime": 14116.1633, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.0533333333333332, + "grad_norm": 0.012299314141273499, + "learning_rate": 9.773224566869385e-05, + "loss": 0.012406258843839169, + "num_input_tokens_seen": 28461488, + "step": 1738, + "train_runtime": 14124.2738, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.053939393939394, + "grad_norm": 0.009009703993797302, + "learning_rate": 9.77293816123866e-05, + "loss": 0.013556526973843575, + "num_input_tokens_seen": 28477864, + "step": 1739, + "train_runtime": 14132.3859, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.0545454545454545, + "grad_norm": 0.014840499497950077, + "learning_rate": 9.772651579066248e-05, + "loss": 0.012394964694976807, + "num_input_tokens_seen": 28494240, + "step": 1740, + "train_runtime": 14140.4952, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.0551515151515152, + "grad_norm": 0.0068387677893042564, + "learning_rate": 9.772364820362749e-05, + "loss": 0.011850697919726372, + "num_input_tokens_seen": 28510616, + "step": 1741, + "train_runtime": 14148.6048, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.0557575757575757, + "grad_norm": 0.03244248777627945, + "learning_rate": 9.772077885138769e-05, + "loss": 0.013322196900844574, + "num_input_tokens_seen": 28526992, + "step": 1742, + "train_runtime": 14156.7153, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.0563636363636364, + "grad_norm": 0.0062013729475438595, + "learning_rate": 9.771790773404921e-05, + "loss": 0.012244854122400284, + "num_input_tokens_seen": 28543368, + "step": 1743, + "train_runtime": 14164.8321, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.056969696969697, + "grad_norm": 0.012492086738348007, + "learning_rate": 9.771503485171824e-05, + "loss": 0.011908994987607002, + "num_input_tokens_seen": 28559744, + "step": 1744, + "train_runtime": 14172.949, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.0575757575757576, + "grad_norm": 0.00868499930948019, + "learning_rate": 9.771216020450108e-05, + "loss": 0.011504937894642353, + "num_input_tokens_seen": 28576120, + "step": 1745, + "train_runtime": 14181.0686, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.0581818181818181, + "grad_norm": 0.006381432060152292, + "learning_rate": 9.770928379250399e-05, + "loss": 0.011398052796721458, + "num_input_tokens_seen": 28592496, + "step": 1746, + "train_runtime": 14189.1889, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.0587878787878788, + "grad_norm": 0.011300310492515564, + "learning_rate": 9.770640561583342e-05, + "loss": 0.013754375278949738, + "num_input_tokens_seen": 28608872, + "step": 1747, + "train_runtime": 14197.3074, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.0593939393939393, + "grad_norm": 0.01168846245855093, + "learning_rate": 9.770352567459582e-05, + "loss": 0.013645244762301445, + "num_input_tokens_seen": 28625248, + "step": 1748, + "train_runtime": 14205.4333, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.06, + "grad_norm": 0.010689773596823215, + "learning_rate": 9.770064396889769e-05, + "loss": 0.012318682856857777, + "num_input_tokens_seen": 28641624, + "step": 1749, + "train_runtime": 14213.5515, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.0606060606060606, + "grad_norm": 0.027544992044568062, + "learning_rate": 9.769776049884563e-05, + "loss": 0.011938882060348988, + "num_input_tokens_seen": 28658000, + "step": 1750, + "train_runtime": 14221.6712, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.0612121212121213, + "grad_norm": 0.014839374460279942, + "learning_rate": 9.769487526454631e-05, + "loss": 0.01188915129750967, + "num_input_tokens_seen": 28674376, + "step": 1751, + "train_runtime": 14229.7903, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.0618181818181818, + "grad_norm": 0.01328266877681017, + "learning_rate": 9.769198826610644e-05, + "loss": 0.013045158237218857, + "num_input_tokens_seen": 28690752, + "step": 1752, + "train_runtime": 14237.9087, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.0624242424242425, + "grad_norm": 0.005636645946651697, + "learning_rate": 9.768909950363278e-05, + "loss": 0.011337255127727985, + "num_input_tokens_seen": 28707128, + "step": 1753, + "train_runtime": 14246.0326, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.063030303030303, + "grad_norm": 0.013464851304888725, + "learning_rate": 9.76862089772322e-05, + "loss": 0.012251244857907295, + "num_input_tokens_seen": 28723504, + "step": 1754, + "train_runtime": 14254.1552, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.0636363636363637, + "grad_norm": 0.016677234321832657, + "learning_rate": 9.768331668701162e-05, + "loss": 0.012644241563975811, + "num_input_tokens_seen": 28739880, + "step": 1755, + "train_runtime": 14262.2737, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.0642424242424242, + "grad_norm": 0.013724375516176224, + "learning_rate": 9.768042263307804e-05, + "loss": 0.013380605727434158, + "num_input_tokens_seen": 28756256, + "step": 1756, + "train_runtime": 14270.3932, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.064848484848485, + "grad_norm": 0.008208510465919971, + "learning_rate": 9.767752681553845e-05, + "loss": 0.011122636497020721, + "num_input_tokens_seen": 28772632, + "step": 1757, + "train_runtime": 14278.513, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.0654545454545454, + "grad_norm": 0.03562194108963013, + "learning_rate": 9.76746292345e-05, + "loss": 0.01469709537923336, + "num_input_tokens_seen": 28789008, + "step": 1758, + "train_runtime": 14286.6323, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.0660606060606062, + "grad_norm": 0.008073823526501656, + "learning_rate": 9.767172989006985e-05, + "loss": 0.012804273515939713, + "num_input_tokens_seen": 28805384, + "step": 1759, + "train_runtime": 14294.7846, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 0.01726573519408703, + "learning_rate": 9.766882878235526e-05, + "loss": 0.012737629935145378, + "num_input_tokens_seen": 28821760, + "step": 1760, + "train_runtime": 14302.9039, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.0672727272727274, + "grad_norm": 0.009928864426910877, + "learning_rate": 9.766592591146352e-05, + "loss": 0.012318781577050686, + "num_input_tokens_seen": 28838136, + "step": 1761, + "train_runtime": 14311.023, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.0678787878787879, + "grad_norm": 0.010449480265378952, + "learning_rate": 9.7663021277502e-05, + "loss": 0.012027337215840816, + "num_input_tokens_seen": 28854512, + "step": 1762, + "train_runtime": 14319.1405, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.0684848484848484, + "grad_norm": 0.00951461587101221, + "learning_rate": 9.766011488057815e-05, + "loss": 0.012412112206220627, + "num_input_tokens_seen": 28870888, + "step": 1763, + "train_runtime": 14327.2603, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.069090909090909, + "grad_norm": 0.012439711019396782, + "learning_rate": 9.765720672079946e-05, + "loss": 0.012368155643343925, + "num_input_tokens_seen": 28887264, + "step": 1764, + "train_runtime": 14335.3786, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.0696969696969698, + "grad_norm": 0.027021143585443497, + "learning_rate": 9.76542967982735e-05, + "loss": 0.013114279136061668, + "num_input_tokens_seen": 28903640, + "step": 1765, + "train_runtime": 14343.4964, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.0703030303030303, + "grad_norm": 0.017193615436553955, + "learning_rate": 9.765138511310791e-05, + "loss": 0.015285216271877289, + "num_input_tokens_seen": 28920016, + "step": 1766, + "train_runtime": 14351.613, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.0709090909090908, + "grad_norm": 0.012239827774465084, + "learning_rate": 9.764847166541038e-05, + "loss": 0.011103234253823757, + "num_input_tokens_seen": 28936392, + "step": 1767, + "train_runtime": 14359.7327, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.0715151515151515, + "grad_norm": 0.015190470963716507, + "learning_rate": 9.764555645528867e-05, + "loss": 0.012309125624597073, + "num_input_tokens_seen": 28952768, + "step": 1768, + "train_runtime": 14367.8503, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.072121212121212, + "grad_norm": 0.06645633280277252, + "learning_rate": 9.764263948285062e-05, + "loss": 0.010960210114717484, + "num_input_tokens_seen": 28969144, + "step": 1769, + "train_runtime": 14375.9681, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.0727272727272728, + "grad_norm": 0.009598735719919205, + "learning_rate": 9.76397207482041e-05, + "loss": 0.011304397135972977, + "num_input_tokens_seen": 28985520, + "step": 1770, + "train_runtime": 14384.086, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.0733333333333333, + "grad_norm": 0.011472896672785282, + "learning_rate": 9.76368002514571e-05, + "loss": 0.011527102440595627, + "num_input_tokens_seen": 29001896, + "step": 1771, + "train_runtime": 14392.2036, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.073939393939394, + "grad_norm": 0.014919782057404518, + "learning_rate": 9.763387799271761e-05, + "loss": 0.012540474534034729, + "num_input_tokens_seen": 29018272, + "step": 1772, + "train_runtime": 14400.3221, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.0745454545454545, + "grad_norm": 0.015532581135630608, + "learning_rate": 9.763095397209374e-05, + "loss": 0.012600000947713852, + "num_input_tokens_seen": 29034648, + "step": 1773, + "train_runtime": 14408.4457, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.0751515151515152, + "grad_norm": 0.008681225590407848, + "learning_rate": 9.762802818969366e-05, + "loss": 0.0126079972833395, + "num_input_tokens_seen": 29051024, + "step": 1774, + "train_runtime": 14416.5656, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.0757575757575757, + "grad_norm": 0.03641294687986374, + "learning_rate": 9.762510064562556e-05, + "loss": 0.013490713201463223, + "num_input_tokens_seen": 29067400, + "step": 1775, + "train_runtime": 14424.6824, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.0763636363636364, + "grad_norm": 0.13002076745033264, + "learning_rate": 9.762217133999771e-05, + "loss": 0.014682717621326447, + "num_input_tokens_seen": 29083776, + "step": 1776, + "train_runtime": 14432.8024, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.076969696969697, + "grad_norm": 0.01150805689394474, + "learning_rate": 9.76192402729185e-05, + "loss": 0.013302959501743317, + "num_input_tokens_seen": 29100152, + "step": 1777, + "train_runtime": 14440.92, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.0775757575757576, + "grad_norm": 0.0106755206361413, + "learning_rate": 9.761630744449633e-05, + "loss": 0.01287130918353796, + "num_input_tokens_seen": 29116528, + "step": 1778, + "train_runtime": 14449.038, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.0781818181818181, + "grad_norm": 0.01342584379017353, + "learning_rate": 9.761337285483967e-05, + "loss": 0.01304157730191946, + "num_input_tokens_seen": 29132904, + "step": 1779, + "train_runtime": 14457.157, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.0787878787878789, + "grad_norm": 0.012149529531598091, + "learning_rate": 9.761043650405708e-05, + "loss": 0.013586745597422123, + "num_input_tokens_seen": 29149280, + "step": 1780, + "train_runtime": 14465.2764, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.0793939393939394, + "grad_norm": 0.030686961486935616, + "learning_rate": 9.760749839225714e-05, + "loss": 0.011370385065674782, + "num_input_tokens_seen": 29165656, + "step": 1781, + "train_runtime": 14473.3971, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.08, + "grad_norm": 0.011646251194179058, + "learning_rate": 9.760455851954857e-05, + "loss": 0.012405425310134888, + "num_input_tokens_seen": 29182032, + "step": 1782, + "train_runtime": 14481.5154, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.0806060606060606, + "grad_norm": 0.008761495351791382, + "learning_rate": 9.760161688604008e-05, + "loss": 0.012334661558270454, + "num_input_tokens_seen": 29198408, + "step": 1783, + "train_runtime": 14489.6462, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.0812121212121213, + "grad_norm": 0.017235441133379936, + "learning_rate": 9.759867349184046e-05, + "loss": 0.014072883874177933, + "num_input_tokens_seen": 29214784, + "step": 1784, + "train_runtime": 14497.764, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.0818181818181818, + "grad_norm": 0.012654728256165981, + "learning_rate": 9.759572833705864e-05, + "loss": 0.012484287843108177, + "num_input_tokens_seen": 29231160, + "step": 1785, + "train_runtime": 14505.8825, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.0824242424242425, + "grad_norm": 0.013732358813285828, + "learning_rate": 9.759278142180348e-05, + "loss": 0.01238732784986496, + "num_input_tokens_seen": 29247536, + "step": 1786, + "train_runtime": 14514.0013, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.083030303030303, + "grad_norm": 0.012203603982925415, + "learning_rate": 9.758983274618404e-05, + "loss": 0.012388849630951881, + "num_input_tokens_seen": 29263912, + "step": 1787, + "train_runtime": 14522.1208, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.0836363636363637, + "grad_norm": 0.013789234682917595, + "learning_rate": 9.758688231030935e-05, + "loss": 0.011297831311821938, + "num_input_tokens_seen": 29280288, + "step": 1788, + "train_runtime": 14530.2411, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.0842424242424242, + "grad_norm": 0.008793489076197147, + "learning_rate": 9.758393011428857e-05, + "loss": 0.012010754086077213, + "num_input_tokens_seen": 29296664, + "step": 1789, + "train_runtime": 14538.3598, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.084848484848485, + "grad_norm": 0.009672212414443493, + "learning_rate": 9.758097615823088e-05, + "loss": 0.01176269818097353, + "num_input_tokens_seen": 29313040, + "step": 1790, + "train_runtime": 14546.4774, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.0854545454545454, + "grad_norm": 0.05065310373902321, + "learning_rate": 9.757802044224553e-05, + "loss": 0.01442466676235199, + "num_input_tokens_seen": 29329416, + "step": 1791, + "train_runtime": 14554.6003, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.086060606060606, + "grad_norm": 0.013653626665472984, + "learning_rate": 9.757506296644186e-05, + "loss": 0.012618528679013252, + "num_input_tokens_seen": 29345792, + "step": 1792, + "train_runtime": 14562.7191, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.0866666666666667, + "grad_norm": 0.01730596460402012, + "learning_rate": 9.757210373092926e-05, + "loss": 0.012462806887924671, + "num_input_tokens_seen": 29362168, + "step": 1793, + "train_runtime": 14570.8367, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.0872727272727274, + "grad_norm": 0.03405005484819412, + "learning_rate": 9.756914273581718e-05, + "loss": 0.012813026085495949, + "num_input_tokens_seen": 29378544, + "step": 1794, + "train_runtime": 14578.957, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.087878787878788, + "grad_norm": 0.00812508538365364, + "learning_rate": 9.756617998121516e-05, + "loss": 0.012410092167556286, + "num_input_tokens_seen": 29394920, + "step": 1795, + "train_runtime": 14587.076, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.0884848484848484, + "grad_norm": 0.01273594330996275, + "learning_rate": 9.756321546723277e-05, + "loss": 0.014060670509934425, + "num_input_tokens_seen": 29411296, + "step": 1796, + "train_runtime": 14595.1942, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.089090909090909, + "grad_norm": 0.010581238195300102, + "learning_rate": 9.756024919397965e-05, + "loss": 0.013785408809781075, + "num_input_tokens_seen": 29427672, + "step": 1797, + "train_runtime": 14603.3146, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.0896969696969696, + "grad_norm": 0.008253599517047405, + "learning_rate": 9.755728116156555e-05, + "loss": 0.011731135658919811, + "num_input_tokens_seen": 29444048, + "step": 1798, + "train_runtime": 14611.434, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.0903030303030303, + "grad_norm": 0.00944508146494627, + "learning_rate": 9.755431137010023e-05, + "loss": 0.01250201091170311, + "num_input_tokens_seen": 29460424, + "step": 1799, + "train_runtime": 14619.551, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.0909090909090908, + "grad_norm": 0.011392886750400066, + "learning_rate": 9.755133981969353e-05, + "loss": 0.011629536747932434, + "num_input_tokens_seen": 29476800, + "step": 1800, + "train_runtime": 14627.6691, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.0915151515151515, + "grad_norm": 0.018327729776501656, + "learning_rate": 9.754836651045538e-05, + "loss": 0.014299526810646057, + "num_input_tokens_seen": 29493176, + "step": 1801, + "train_runtime": 14636.8186, + "train_tokens_per_second": 2014.999 + }, + { + "epoch": 1.092121212121212, + "grad_norm": 0.01780688390135765, + "learning_rate": 9.754539144249574e-05, + "loss": 0.012428405694663525, + "num_input_tokens_seen": 29509552, + "step": 1802, + "train_runtime": 14644.9369, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 1.0927272727272728, + "grad_norm": 0.01762632466852665, + "learning_rate": 9.754241461592468e-05, + "loss": 0.011811019852757454, + "num_input_tokens_seen": 29525928, + "step": 1803, + "train_runtime": 14653.0556, + "train_tokens_per_second": 2015.001 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 0.00704316608607769, + "learning_rate": 9.753943603085227e-05, + "loss": 0.011734440922737122, + "num_input_tokens_seen": 29542304, + "step": 1804, + "train_runtime": 14661.1926, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 1.093939393939394, + "grad_norm": 0.007165444549173117, + "learning_rate": 9.753645568738871e-05, + "loss": 0.012202661484479904, + "num_input_tokens_seen": 29558680, + "step": 1805, + "train_runtime": 14669.3124, + "train_tokens_per_second": 2015.001 + }, + { + "epoch": 1.0945454545454545, + "grad_norm": 0.007894719950854778, + "learning_rate": 9.753347358564423e-05, + "loss": 0.01162760891020298, + "num_input_tokens_seen": 29575056, + "step": 1806, + "train_runtime": 14677.4323, + "train_tokens_per_second": 2015.002 + }, + { + "epoch": 1.0951515151515152, + "grad_norm": 0.015263660810887814, + "learning_rate": 9.753048972572912e-05, + "loss": 0.014112107455730438, + "num_input_tokens_seen": 29591432, + "step": 1807, + "train_runtime": 14685.5514, + "train_tokens_per_second": 2015.003 + }, + { + "epoch": 1.0957575757575757, + "grad_norm": 0.015707993879914284, + "learning_rate": 9.752750410775377e-05, + "loss": 0.011696948669850826, + "num_input_tokens_seen": 29607808, + "step": 1808, + "train_runtime": 14693.6719, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 1.0963636363636364, + "grad_norm": 0.008728215470910072, + "learning_rate": 9.752451673182859e-05, + "loss": 0.01177802961319685, + "num_input_tokens_seen": 29624184, + "step": 1809, + "train_runtime": 14701.7939, + "train_tokens_per_second": 2015.005 + }, + { + "epoch": 1.096969696969697, + "grad_norm": 0.013884173706173897, + "learning_rate": 9.752152759806408e-05, + "loss": 0.012690341100096703, + "num_input_tokens_seen": 29640560, + "step": 1810, + "train_runtime": 14709.9132, + "train_tokens_per_second": 2015.006 + }, + { + "epoch": 1.0975757575757576, + "grad_norm": 0.01112292893230915, + "learning_rate": 9.751853670657081e-05, + "loss": 0.012653153389692307, + "num_input_tokens_seen": 29656936, + "step": 1811, + "train_runtime": 14718.0328, + "train_tokens_per_second": 2015.007 + }, + { + "epoch": 1.0981818181818181, + "grad_norm": 0.010812713764607906, + "learning_rate": 9.751554405745941e-05, + "loss": 0.012738914228975773, + "num_input_tokens_seen": 29673312, + "step": 1812, + "train_runtime": 14726.1501, + "train_tokens_per_second": 2015.008 + }, + { + "epoch": 1.0987878787878789, + "grad_norm": 0.010833236388862133, + "learning_rate": 9.751254965084056e-05, + "loss": 0.012412777170538902, + "num_input_tokens_seen": 29689688, + "step": 1813, + "train_runtime": 14734.2996, + "train_tokens_per_second": 2015.005 + }, + { + "epoch": 1.0993939393939394, + "grad_norm": 0.0037972007412463427, + "learning_rate": 9.750955348682503e-05, + "loss": 0.011926090344786644, + "num_input_tokens_seen": 29706064, + "step": 1814, + "train_runtime": 14742.4187, + "train_tokens_per_second": 2015.006 + }, + { + "epoch": 1.1, + "grad_norm": 0.010151666589081287, + "learning_rate": 9.750655556552364e-05, + "loss": 0.011667672544717789, + "num_input_tokens_seen": 29722440, + "step": 1815, + "train_runtime": 14750.5371, + "train_tokens_per_second": 2015.007 + }, + { + "epoch": 1.1006060606060606, + "grad_norm": 0.004636733792722225, + "learning_rate": 9.750355588704727e-05, + "loss": 0.0114663764834404, + "num_input_tokens_seen": 29738816, + "step": 1816, + "train_runtime": 14758.6548, + "train_tokens_per_second": 2015.009 + }, + { + "epoch": 1.1012121212121213, + "grad_norm": 0.009711910970509052, + "learning_rate": 9.750055445150688e-05, + "loss": 0.012894188985228539, + "num_input_tokens_seen": 29755192, + "step": 1817, + "train_runtime": 14766.7747, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 1.1018181818181818, + "grad_norm": 0.01187776681035757, + "learning_rate": 9.749755125901349e-05, + "loss": 0.013470055535435677, + "num_input_tokens_seen": 29771568, + "step": 1818, + "train_runtime": 14774.8985, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 1.1024242424242425, + "grad_norm": 0.014701352454721928, + "learning_rate": 9.749454630967816e-05, + "loss": 0.013217932544648647, + "num_input_tokens_seen": 29787944, + "step": 1819, + "train_runtime": 14783.0202, + "train_tokens_per_second": 2015.011 + }, + { + "epoch": 1.103030303030303, + "grad_norm": 0.01794923096895218, + "learning_rate": 9.749153960361207e-05, + "loss": 0.013999737799167633, + "num_input_tokens_seen": 29804320, + "step": 1820, + "train_runtime": 14791.1415, + "train_tokens_per_second": 2015.011 + }, + { + "epoch": 1.1036363636363635, + "grad_norm": 0.00407990999519825, + "learning_rate": 9.748853114092639e-05, + "loss": 0.012194567359983921, + "num_input_tokens_seen": 29820696, + "step": 1821, + "train_runtime": 14799.2637, + "train_tokens_per_second": 2015.012 + }, + { + "epoch": 1.1042424242424242, + "grad_norm": 0.015737803652882576, + "learning_rate": 9.748552092173246e-05, + "loss": 0.012318138033151627, + "num_input_tokens_seen": 29837072, + "step": 1822, + "train_runtime": 14807.3879, + "train_tokens_per_second": 2015.013 + }, + { + "epoch": 1.1048484848484847, + "grad_norm": 0.01409347727894783, + "learning_rate": 9.748250894614156e-05, + "loss": 0.013581490144133568, + "num_input_tokens_seen": 29853448, + "step": 1823, + "train_runtime": 14815.5037, + "train_tokens_per_second": 2015.014 + }, + { + "epoch": 1.1054545454545455, + "grad_norm": 0.014121908694505692, + "learning_rate": 9.747949521426514e-05, + "loss": 0.012862889096140862, + "num_input_tokens_seen": 29869824, + "step": 1824, + "train_runtime": 14823.6193, + "train_tokens_per_second": 2015.016 + }, + { + "epoch": 1.106060606060606, + "grad_norm": 0.007615895476192236, + "learning_rate": 9.747647972621463e-05, + "loss": 0.012592250481247902, + "num_input_tokens_seen": 29886200, + "step": 1825, + "train_runtime": 14831.7454, + "train_tokens_per_second": 2015.016 + }, + { + "epoch": 1.1066666666666667, + "grad_norm": 0.00876256637275219, + "learning_rate": 9.747346248210161e-05, + "loss": 0.011891753412783146, + "num_input_tokens_seen": 29902576, + "step": 1826, + "train_runtime": 14839.876, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 1.1072727272727272, + "grad_norm": 0.009708087891340256, + "learning_rate": 9.747044348203766e-05, + "loss": 0.013173967599868774, + "num_input_tokens_seen": 29918952, + "step": 1827, + "train_runtime": 14847.9958, + "train_tokens_per_second": 2015.016 + }, + { + "epoch": 1.107878787878788, + "grad_norm": 0.013138052076101303, + "learning_rate": 9.746742272613443e-05, + "loss": 0.012966789305210114, + "num_input_tokens_seen": 29935328, + "step": 1828, + "train_runtime": 14856.1159, + "train_tokens_per_second": 2015.017 + }, + { + "epoch": 1.1084848484848484, + "grad_norm": 0.008558280766010284, + "learning_rate": 9.74644002145037e-05, + "loss": 0.011896101757884026, + "num_input_tokens_seen": 29951704, + "step": 1829, + "train_runtime": 14864.2363, + "train_tokens_per_second": 2015.018 + }, + { + "epoch": 1.1090909090909091, + "grad_norm": 0.006839067209511995, + "learning_rate": 9.746137594725722e-05, + "loss": 0.013318931683897972, + "num_input_tokens_seen": 29968080, + "step": 1830, + "train_runtime": 14872.3569, + "train_tokens_per_second": 2015.019 + }, + { + "epoch": 1.1096969696969696, + "grad_norm": 0.011994832195341587, + "learning_rate": 9.745834992450689e-05, + "loss": 0.012568656355142593, + "num_input_tokens_seen": 29984456, + "step": 1831, + "train_runtime": 14880.4762, + "train_tokens_per_second": 2015.02 + }, + { + "epoch": 1.1103030303030303, + "grad_norm": 0.014069044031202793, + "learning_rate": 9.745532214636459e-05, + "loss": 0.01205383613705635, + "num_input_tokens_seen": 30000832, + "step": 1832, + "train_runtime": 14888.5973, + "train_tokens_per_second": 2015.021 + }, + { + "epoch": 1.1109090909090908, + "grad_norm": 0.011945140548050404, + "learning_rate": 9.745229261294235e-05, + "loss": 0.01266874186694622, + "num_input_tokens_seen": 30017208, + "step": 1833, + "train_runtime": 14896.7181, + "train_tokens_per_second": 2015.022 + }, + { + "epoch": 1.1115151515151516, + "grad_norm": 0.008917572908103466, + "learning_rate": 9.744926132435223e-05, + "loss": 0.012527624145150185, + "num_input_tokens_seen": 30033584, + "step": 1834, + "train_runtime": 14904.8405, + "train_tokens_per_second": 2015.022 + }, + { + "epoch": 1.112121212121212, + "grad_norm": 0.05755450576543808, + "learning_rate": 9.744622828070632e-05, + "loss": 0.013464560732245445, + "num_input_tokens_seen": 30049960, + "step": 1835, + "train_runtime": 14912.9616, + "train_tokens_per_second": 2015.023 + }, + { + "epoch": 1.1127272727272728, + "grad_norm": 0.08009150624275208, + "learning_rate": 9.744319348211684e-05, + "loss": 0.011920344084501266, + "num_input_tokens_seen": 30066336, + "step": 1836, + "train_runtime": 14921.0823, + "train_tokens_per_second": 2015.024 + }, + { + "epoch": 1.1133333333333333, + "grad_norm": 0.018737800419330597, + "learning_rate": 9.744015692869602e-05, + "loss": 0.013126276433467865, + "num_input_tokens_seen": 30082712, + "step": 1837, + "train_runtime": 14929.2031, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.113939393939394, + "grad_norm": 0.011901168152689934, + "learning_rate": 9.743711862055615e-05, + "loss": 0.013369777239859104, + "num_input_tokens_seen": 30099088, + "step": 1838, + "train_runtime": 14937.3337, + "train_tokens_per_second": 2015.024 + }, + { + "epoch": 1.1145454545454545, + "grad_norm": 0.014852079562842846, + "learning_rate": 9.743407855780969e-05, + "loss": 0.012921641580760479, + "num_input_tokens_seen": 30115464, + "step": 1839, + "train_runtime": 14945.4561, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.1151515151515152, + "grad_norm": 0.02092067152261734, + "learning_rate": 9.7431036740569e-05, + "loss": 0.01345036644488573, + "num_input_tokens_seen": 30131840, + "step": 1840, + "train_runtime": 14953.5781, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.1157575757575757, + "grad_norm": 0.014358977787196636, + "learning_rate": 9.742799316894663e-05, + "loss": 0.012646627612411976, + "num_input_tokens_seen": 30148216, + "step": 1841, + "train_runtime": 14961.699, + "train_tokens_per_second": 2015.026 + }, + { + "epoch": 1.1163636363636364, + "grad_norm": 0.008887351490557194, + "learning_rate": 9.742494784305518e-05, + "loss": 0.01288522221148014, + "num_input_tokens_seen": 30164592, + "step": 1842, + "train_runtime": 14969.8197, + "train_tokens_per_second": 2015.027 + }, + { + "epoch": 1.116969696969697, + "grad_norm": 0.011526895686984062, + "learning_rate": 9.742190076300726e-05, + "loss": 0.013668229803442955, + "num_input_tokens_seen": 30180968, + "step": 1843, + "train_runtime": 14977.9403, + "train_tokens_per_second": 2015.028 + }, + { + "epoch": 1.1175757575757577, + "grad_norm": 0.007548601366579533, + "learning_rate": 9.741885192891556e-05, + "loss": 0.012254860252141953, + "num_input_tokens_seen": 30197344, + "step": 1844, + "train_runtime": 14986.0602, + "train_tokens_per_second": 2015.029 + }, + { + "epoch": 1.1181818181818182, + "grad_norm": 0.008887049742043018, + "learning_rate": 9.74158013408929e-05, + "loss": 0.012711411342024803, + "num_input_tokens_seen": 30213720, + "step": 1845, + "train_runtime": 14994.181, + "train_tokens_per_second": 2015.03 + }, + { + "epoch": 1.1187878787878789, + "grad_norm": 0.009076109156012535, + "learning_rate": 9.741274899905207e-05, + "loss": 0.011722360737621784, + "num_input_tokens_seen": 30230096, + "step": 1846, + "train_runtime": 15002.3018, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 1.1193939393939394, + "grad_norm": 0.00916222669184208, + "learning_rate": 9.740969490350598e-05, + "loss": 0.012270544655621052, + "num_input_tokens_seen": 30246472, + "step": 1847, + "train_runtime": 15010.4324, + "train_tokens_per_second": 2015.03 + }, + { + "epoch": 1.12, + "grad_norm": 0.013789625838398933, + "learning_rate": 9.74066390543676e-05, + "loss": 0.011830583214759827, + "num_input_tokens_seen": 30262848, + "step": 1848, + "train_runtime": 15018.5537, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 1.1206060606060606, + "grad_norm": 0.00897382665425539, + "learning_rate": 9.740358145174998e-05, + "loss": 0.012301658280193806, + "num_input_tokens_seen": 30279224, + "step": 1849, + "train_runtime": 15026.6729, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 1.121212121212121, + "grad_norm": 0.013259979896247387, + "learning_rate": 9.740052209576619e-05, + "loss": 0.013160964474081993, + "num_input_tokens_seen": 30295600, + "step": 1850, + "train_runtime": 15034.7937, + "train_tokens_per_second": 2015.033 + }, + { + "epoch": 1.1218181818181818, + "grad_norm": 0.009648144245147705, + "learning_rate": 9.739746098652939e-05, + "loss": 0.013108627870678902, + "num_input_tokens_seen": 30311976, + "step": 1851, + "train_runtime": 15042.9152, + "train_tokens_per_second": 2015.033 + }, + { + "epoch": 1.1224242424242423, + "grad_norm": 0.006125753745436668, + "learning_rate": 9.739439812415281e-05, + "loss": 0.012194282375276089, + "num_input_tokens_seen": 30328352, + "step": 1852, + "train_runtime": 15051.0341, + "train_tokens_per_second": 2015.034 + }, + { + "epoch": 1.123030303030303, + "grad_norm": 0.01046109851449728, + "learning_rate": 9.739133350874974e-05, + "loss": 0.012304945848882198, + "num_input_tokens_seen": 30344728, + "step": 1853, + "train_runtime": 15059.1522, + "train_tokens_per_second": 2015.036 + }, + { + "epoch": 1.1236363636363635, + "grad_norm": 0.01071660965681076, + "learning_rate": 9.738826714043354e-05, + "loss": 0.011550496332347393, + "num_input_tokens_seen": 30361104, + "step": 1854, + "train_runtime": 15067.2715, + "train_tokens_per_second": 2015.037 + }, + { + "epoch": 1.1242424242424243, + "grad_norm": 0.020824618637561798, + "learning_rate": 9.738519901931762e-05, + "loss": 0.013716255314648151, + "num_input_tokens_seen": 30377480, + "step": 1855, + "train_runtime": 15075.3926, + "train_tokens_per_second": 2015.037 + }, + { + "epoch": 1.1248484848484848, + "grad_norm": 0.010834557004272938, + "learning_rate": 9.738212914551547e-05, + "loss": 0.012743671424686909, + "num_input_tokens_seen": 30393856, + "step": 1856, + "train_runtime": 15083.5149, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.1254545454545455, + "grad_norm": 0.012935217469930649, + "learning_rate": 9.737905751914063e-05, + "loss": 0.014386632479727268, + "num_input_tokens_seen": 30410232, + "step": 1857, + "train_runtime": 15091.6366, + "train_tokens_per_second": 2015.039 + }, + { + "epoch": 1.126060606060606, + "grad_norm": 0.014889142476022243, + "learning_rate": 9.737598414030673e-05, + "loss": 0.012645886279642582, + "num_input_tokens_seen": 30426608, + "step": 1858, + "train_runtime": 15099.7572, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.1266666666666667, + "grad_norm": 0.04802517592906952, + "learning_rate": 9.737290900912743e-05, + "loss": 0.014930625446140766, + "num_input_tokens_seen": 30442984, + "step": 1859, + "train_runtime": 15107.8772, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.1272727272727272, + "grad_norm": 0.02512347884476185, + "learning_rate": 9.736983212571646e-05, + "loss": 0.013300512917339802, + "num_input_tokens_seen": 30459360, + "step": 1860, + "train_runtime": 15115.9968, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 1.127878787878788, + "grad_norm": 0.023098865523934364, + "learning_rate": 9.736675349018767e-05, + "loss": 0.01131696067750454, + "num_input_tokens_seen": 30475736, + "step": 1861, + "train_runtime": 15124.1176, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.1284848484848484, + "grad_norm": 0.014170211739838123, + "learning_rate": 9.736367310265492e-05, + "loss": 0.013279788196086884, + "num_input_tokens_seen": 30492112, + "step": 1862, + "train_runtime": 15132.2381, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.1290909090909091, + "grad_norm": 0.00675050588324666, + "learning_rate": 9.736059096323212e-05, + "loss": 0.012112347409129143, + "num_input_tokens_seen": 30508488, + "step": 1863, + "train_runtime": 15140.3587, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.1296969696969696, + "grad_norm": 0.01739928312599659, + "learning_rate": 9.735750707203331e-05, + "loss": 0.013913700357079506, + "num_input_tokens_seen": 30524864, + "step": 1864, + "train_runtime": 15148.4784, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.1303030303030304, + "grad_norm": 0.005056953988969326, + "learning_rate": 9.73544214291725e-05, + "loss": 0.010809306055307388, + "num_input_tokens_seen": 30541240, + "step": 1865, + "train_runtime": 15156.5973, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.1309090909090909, + "grad_norm": 0.01127872709184885, + "learning_rate": 9.73513340347639e-05, + "loss": 0.011840720660984516, + "num_input_tokens_seen": 30557616, + "step": 1866, + "train_runtime": 15164.7169, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.1315151515151516, + "grad_norm": 0.010311625897884369, + "learning_rate": 9.734824488892164e-05, + "loss": 0.011924706399440765, + "num_input_tokens_seen": 30573992, + "step": 1867, + "train_runtime": 15172.8357, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.132121212121212, + "grad_norm": 0.007677890360355377, + "learning_rate": 9.734515399176003e-05, + "loss": 0.012517043389379978, + "num_input_tokens_seen": 30590368, + "step": 1868, + "train_runtime": 15180.9572, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.1327272727272728, + "grad_norm": 0.004882013890892267, + "learning_rate": 9.734206134339337e-05, + "loss": 0.011892163194715977, + "num_input_tokens_seen": 30606744, + "step": 1869, + "train_runtime": 15189.0791, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 0.012440717779099941, + "learning_rate": 9.733896694393605e-05, + "loss": 0.012283443473279476, + "num_input_tokens_seen": 30623120, + "step": 1870, + "train_runtime": 15197.1978, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.133939393939394, + "grad_norm": 0.010757396928966045, + "learning_rate": 9.733587079350252e-05, + "loss": 0.011945050209760666, + "num_input_tokens_seen": 30639496, + "step": 1871, + "train_runtime": 15205.3192, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.1345454545454545, + "grad_norm": 0.028685016557574272, + "learning_rate": 9.733277289220733e-05, + "loss": 0.012834792956709862, + "num_input_tokens_seen": 30655872, + "step": 1872, + "train_runtime": 15213.4408, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.1351515151515152, + "grad_norm": 0.009486911818385124, + "learning_rate": 9.732967324016504e-05, + "loss": 0.011870292015373707, + "num_input_tokens_seen": 30672248, + "step": 1873, + "train_runtime": 15221.5618, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.1357575757575757, + "grad_norm": 0.010406363755464554, + "learning_rate": 9.732657183749029e-05, + "loss": 0.014183721505105495, + "num_input_tokens_seen": 30688624, + "step": 1874, + "train_runtime": 15229.6819, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.1363636363636362, + "grad_norm": 0.00912264920771122, + "learning_rate": 9.732346868429784e-05, + "loss": 0.012175610288977623, + "num_input_tokens_seen": 30705000, + "step": 1875, + "train_runtime": 15237.8034, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.136969696969697, + "grad_norm": 0.013175376690924168, + "learning_rate": 9.732036378070243e-05, + "loss": 0.011904444545507431, + "num_input_tokens_seen": 30721376, + "step": 1876, + "train_runtime": 15245.9237, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.1375757575757577, + "grad_norm": 0.019623400643467903, + "learning_rate": 9.731725712681892e-05, + "loss": 0.012222235091030598, + "num_input_tokens_seen": 30737752, + "step": 1877, + "train_runtime": 15254.0434, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.1381818181818182, + "grad_norm": 0.011514107696712017, + "learning_rate": 9.731414872276221e-05, + "loss": 0.012634092941880226, + "num_input_tokens_seen": 30754128, + "step": 1878, + "train_runtime": 15262.1622, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.1387878787878787, + "grad_norm": 0.010799894109368324, + "learning_rate": 9.731103856864728e-05, + "loss": 0.012497657909989357, + "num_input_tokens_seen": 30770504, + "step": 1879, + "train_runtime": 15270.2827, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.1393939393939394, + "grad_norm": 0.006618468556553125, + "learning_rate": 9.730792666458916e-05, + "loss": 0.011121107265353203, + "num_input_tokens_seen": 30786880, + "step": 1880, + "train_runtime": 15278.4036, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 0.01856573298573494, + "learning_rate": 9.730481301070298e-05, + "loss": 0.012096052058041096, + "num_input_tokens_seen": 30803256, + "step": 1881, + "train_runtime": 15286.5234, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.1406060606060606, + "grad_norm": 0.0105481231585145, + "learning_rate": 9.730169760710386e-05, + "loss": 0.012894745916128159, + "num_input_tokens_seen": 30819632, + "step": 1882, + "train_runtime": 15294.6412, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.1412121212121211, + "grad_norm": 0.006541201379150152, + "learning_rate": 9.729858045390708e-05, + "loss": 0.011877333745360374, + "num_input_tokens_seen": 30836008, + "step": 1883, + "train_runtime": 15302.7603, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.1418181818181818, + "grad_norm": 0.011717238463461399, + "learning_rate": 9.729546155122792e-05, + "loss": 0.011121803894639015, + "num_input_tokens_seen": 30852384, + "step": 1884, + "train_runtime": 15310.8784, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.1424242424242423, + "grad_norm": 0.01204200740903616, + "learning_rate": 9.729234089918173e-05, + "loss": 0.012877327390015125, + "num_input_tokens_seen": 30868760, + "step": 1885, + "train_runtime": 15318.9981, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.143030303030303, + "grad_norm": 0.009307028725743294, + "learning_rate": 9.728921849788397e-05, + "loss": 0.011277549900114536, + "num_input_tokens_seen": 30885136, + "step": 1886, + "train_runtime": 15327.1181, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.1436363636363636, + "grad_norm": 0.00815204344689846, + "learning_rate": 9.728609434745009e-05, + "loss": 0.012305430136620998, + "num_input_tokens_seen": 30901512, + "step": 1887, + "train_runtime": 15335.2383, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.1442424242424243, + "grad_norm": 0.013295816257596016, + "learning_rate": 9.728296844799567e-05, + "loss": 0.011643802747130394, + "num_input_tokens_seen": 30917888, + "step": 1888, + "train_runtime": 15343.3603, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.1448484848484848, + "grad_norm": 0.015981702134013176, + "learning_rate": 9.727984079963632e-05, + "loss": 0.011237949132919312, + "num_input_tokens_seen": 30934264, + "step": 1889, + "train_runtime": 15351.4806, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.1454545454545455, + "grad_norm": 0.014281482435762882, + "learning_rate": 9.727671140248775e-05, + "loss": 0.0122231962159276, + "num_input_tokens_seen": 30950640, + "step": 1890, + "train_runtime": 15359.6018, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.146060606060606, + "grad_norm": 0.011495551094412804, + "learning_rate": 9.727358025666568e-05, + "loss": 0.012298443354666233, + "num_input_tokens_seen": 30967016, + "step": 1891, + "train_runtime": 15367.7225, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 0.01308425236493349, + "learning_rate": 9.727044736228594e-05, + "loss": 0.013174796476960182, + "num_input_tokens_seen": 30983392, + "step": 1892, + "train_runtime": 15375.8433, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.1472727272727272, + "grad_norm": 0.01665692962706089, + "learning_rate": 9.726731271946441e-05, + "loss": 0.013839912600815296, + "num_input_tokens_seen": 30999768, + "step": 1893, + "train_runtime": 15383.9648, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.147878787878788, + "grad_norm": 0.011216187849640846, + "learning_rate": 9.726417632831701e-05, + "loss": 0.012092461809515953, + "num_input_tokens_seen": 31016144, + "step": 1894, + "train_runtime": 15392.0887, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.1484848484848484, + "grad_norm": 0.0110325887799263, + "learning_rate": 9.72610381889598e-05, + "loss": 0.013044838793575764, + "num_input_tokens_seen": 31032520, + "step": 1895, + "train_runtime": 15400.2084, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.1490909090909092, + "grad_norm": 0.011509820818901062, + "learning_rate": 9.725789830150882e-05, + "loss": 0.012543351389467716, + "num_input_tokens_seen": 31048896, + "step": 1896, + "train_runtime": 15408.3347, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.1496969696969697, + "grad_norm": 0.016707783564925194, + "learning_rate": 9.725475666608019e-05, + "loss": 0.013459472917020321, + "num_input_tokens_seen": 31065272, + "step": 1897, + "train_runtime": 15416.4555, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.1503030303030304, + "grad_norm": 0.016305526718497276, + "learning_rate": 9.725161328279016e-05, + "loss": 0.012954285368323326, + "num_input_tokens_seen": 31081648, + "step": 1898, + "train_runtime": 15424.5766, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.1509090909090909, + "grad_norm": 0.014199224300682545, + "learning_rate": 9.724846815175495e-05, + "loss": 0.01172240823507309, + "num_input_tokens_seen": 31098024, + "step": 1899, + "train_runtime": 15432.6971, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.1515151515151516, + "grad_norm": 0.014520173892378807, + "learning_rate": 9.724532127309094e-05, + "loss": 0.013250859454274178, + "num_input_tokens_seen": 31114400, + "step": 1900, + "train_runtime": 15440.8165, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.152121212121212, + "grad_norm": 0.014697631821036339, + "learning_rate": 9.724217264691448e-05, + "loss": 0.01238673273473978, + "num_input_tokens_seen": 31130776, + "step": 1901, + "train_runtime": 15449.8926, + "train_tokens_per_second": 2014.951 + }, + { + "epoch": 1.1527272727272728, + "grad_norm": 0.011663121171295643, + "learning_rate": 9.723902227334207e-05, + "loss": 0.012560025788843632, + "num_input_tokens_seen": 31147152, + "step": 1902, + "train_runtime": 15458.0103, + "train_tokens_per_second": 2014.952 + }, + { + "epoch": 1.1533333333333333, + "grad_norm": 0.013661185279488564, + "learning_rate": 9.723587015249021e-05, + "loss": 0.013027054257690907, + "num_input_tokens_seen": 31163528, + "step": 1903, + "train_runtime": 15466.1331, + "train_tokens_per_second": 2014.953 + }, + { + "epoch": 1.1539393939393938, + "grad_norm": 0.01441930141299963, + "learning_rate": 9.72327162844755e-05, + "loss": 0.012912960723042488, + "num_input_tokens_seen": 31179904, + "step": 1904, + "train_runtime": 15474.2486, + "train_tokens_per_second": 2014.954 + }, + { + "epoch": 1.1545454545454545, + "grad_norm": 0.008236902765929699, + "learning_rate": 9.72295606694146e-05, + "loss": 0.01216426957398653, + "num_input_tokens_seen": 31196280, + "step": 1905, + "train_runtime": 15482.3692, + "train_tokens_per_second": 2014.955 + }, + { + "epoch": 1.1551515151515153, + "grad_norm": 0.007718207780271769, + "learning_rate": 9.722640330742423e-05, + "loss": 0.013841992244124413, + "num_input_tokens_seen": 31212656, + "step": 1906, + "train_runtime": 15490.489, + "train_tokens_per_second": 2014.956 + }, + { + "epoch": 1.1557575757575758, + "grad_norm": 0.013511128723621368, + "learning_rate": 9.722324419862116e-05, + "loss": 0.014025630429387093, + "num_input_tokens_seen": 31229032, + "step": 1907, + "train_runtime": 15498.6085, + "train_tokens_per_second": 2014.957 + }, + { + "epoch": 1.1563636363636363, + "grad_norm": 0.01642940379679203, + "learning_rate": 9.722008334312227e-05, + "loss": 0.011956113390624523, + "num_input_tokens_seen": 31245408, + "step": 1908, + "train_runtime": 15506.7329, + "train_tokens_per_second": 2014.958 + }, + { + "epoch": 1.156969696969697, + "grad_norm": 0.0073166899383068085, + "learning_rate": 9.721692074104444e-05, + "loss": 0.011238785460591316, + "num_input_tokens_seen": 31261784, + "step": 1909, + "train_runtime": 15514.852, + "train_tokens_per_second": 2014.959 + }, + { + "epoch": 1.1575757575757575, + "grad_norm": 0.017258716747164726, + "learning_rate": 9.721375639250467e-05, + "loss": 0.012529391795396805, + "num_input_tokens_seen": 31278160, + "step": 1910, + "train_runtime": 15522.9703, + "train_tokens_per_second": 2014.96 + }, + { + "epoch": 1.1581818181818182, + "grad_norm": 0.006015890743583441, + "learning_rate": 9.721059029761999e-05, + "loss": 0.012787789106369019, + "num_input_tokens_seen": 31294536, + "step": 1911, + "train_runtime": 15531.0907, + "train_tokens_per_second": 2014.961 + }, + { + "epoch": 1.1587878787878787, + "grad_norm": 0.01181771419942379, + "learning_rate": 9.720742245650751e-05, + "loss": 0.013402738608419895, + "num_input_tokens_seen": 31310912, + "step": 1912, + "train_runtime": 15539.2103, + "train_tokens_per_second": 2014.962 + }, + { + "epoch": 1.1593939393939394, + "grad_norm": 0.014448689296841621, + "learning_rate": 9.72042528692844e-05, + "loss": 0.012921266257762909, + "num_input_tokens_seen": 31327288, + "step": 1913, + "train_runtime": 15547.3368, + "train_tokens_per_second": 2014.962 + }, + { + "epoch": 1.16, + "grad_norm": 0.01236814260482788, + "learning_rate": 9.720108153606792e-05, + "loss": 0.01249447651207447, + "num_input_tokens_seen": 31343664, + "step": 1914, + "train_runtime": 15555.455, + "train_tokens_per_second": 2014.963 + }, + { + "epoch": 1.1606060606060606, + "grad_norm": 0.012424842454493046, + "learning_rate": 9.719790845697533e-05, + "loss": 0.013374033384025097, + "num_input_tokens_seen": 31360040, + "step": 1915, + "train_runtime": 15563.5726, + "train_tokens_per_second": 2014.964 + }, + { + "epoch": 1.1612121212121211, + "grad_norm": 0.008714928291738033, + "learning_rate": 9.719473363212405e-05, + "loss": 0.013384588994085789, + "num_input_tokens_seen": 31376416, + "step": 1916, + "train_runtime": 15571.6914, + "train_tokens_per_second": 2014.965 + }, + { + "epoch": 1.1618181818181819, + "grad_norm": 0.01234753243625164, + "learning_rate": 9.719155706163145e-05, + "loss": 0.013264678418636322, + "num_input_tokens_seen": 31392792, + "step": 1917, + "train_runtime": 15579.8107, + "train_tokens_per_second": 2014.966 + }, + { + "epoch": 1.1624242424242424, + "grad_norm": 0.008164497092366219, + "learning_rate": 9.718837874561509e-05, + "loss": 0.011975225061178207, + "num_input_tokens_seen": 31409168, + "step": 1918, + "train_runtime": 15587.9331, + "train_tokens_per_second": 2014.967 + }, + { + "epoch": 1.163030303030303, + "grad_norm": 0.01258091814815998, + "learning_rate": 9.718519868419247e-05, + "loss": 0.012896685861051083, + "num_input_tokens_seen": 31425544, + "step": 1919, + "train_runtime": 15596.052, + "train_tokens_per_second": 2014.968 + }, + { + "epoch": 1.1636363636363636, + "grad_norm": 0.014153181575238705, + "learning_rate": 9.718201687748126e-05, + "loss": 0.011028273962438107, + "num_input_tokens_seen": 31441920, + "step": 1920, + "train_runtime": 15604.1723, + "train_tokens_per_second": 2014.969 + }, + { + "epoch": 1.1642424242424243, + "grad_norm": 0.010321607813239098, + "learning_rate": 9.71788333255991e-05, + "loss": 0.012135772034525871, + "num_input_tokens_seen": 31458296, + "step": 1921, + "train_runtime": 15612.2941, + "train_tokens_per_second": 2014.969 + }, + { + "epoch": 1.1648484848484848, + "grad_norm": 0.007093664258718491, + "learning_rate": 9.717564802866379e-05, + "loss": 0.012399922125041485, + "num_input_tokens_seen": 31474672, + "step": 1922, + "train_runtime": 15620.4151, + "train_tokens_per_second": 2014.97 + }, + { + "epoch": 1.1654545454545455, + "grad_norm": 0.023240283131599426, + "learning_rate": 9.717246098679313e-05, + "loss": 0.014058588072657585, + "num_input_tokens_seen": 31491048, + "step": 1923, + "train_runtime": 15628.5367, + "train_tokens_per_second": 2014.971 + }, + { + "epoch": 1.166060606060606, + "grad_norm": 0.009191809222102165, + "learning_rate": 9.716927220010499e-05, + "loss": 0.013260525651276112, + "num_input_tokens_seen": 31507424, + "step": 1924, + "train_runtime": 15636.6557, + "train_tokens_per_second": 2014.972 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 0.011490467004477978, + "learning_rate": 9.716608166871735e-05, + "loss": 0.011423847638070583, + "num_input_tokens_seen": 31523800, + "step": 1925, + "train_runtime": 15644.7767, + "train_tokens_per_second": 2014.973 + }, + { + "epoch": 1.1672727272727272, + "grad_norm": 0.01001172885298729, + "learning_rate": 9.716288939274819e-05, + "loss": 0.013747838325798512, + "num_input_tokens_seen": 31540176, + "step": 1926, + "train_runtime": 15652.8971, + "train_tokens_per_second": 2014.974 + }, + { + "epoch": 1.167878787878788, + "grad_norm": 0.016452116891741753, + "learning_rate": 9.715969537231559e-05, + "loss": 0.013036968186497688, + "num_input_tokens_seen": 31556552, + "step": 1927, + "train_runtime": 15661.0189, + "train_tokens_per_second": 2014.974 + }, + { + "epoch": 1.1684848484848485, + "grad_norm": 0.04086410999298096, + "learning_rate": 9.71564996075377e-05, + "loss": 0.011422049254179, + "num_input_tokens_seen": 31572928, + "step": 1928, + "train_runtime": 15669.1386, + "train_tokens_per_second": 2014.975 + }, + { + "epoch": 1.1690909090909092, + "grad_norm": 0.01278830785304308, + "learning_rate": 9.715330209853272e-05, + "loss": 0.012607689946889877, + "num_input_tokens_seen": 31589304, + "step": 1929, + "train_runtime": 15677.2541, + "train_tokens_per_second": 2014.977 + }, + { + "epoch": 1.1696969696969697, + "grad_norm": 0.007186457980424166, + "learning_rate": 9.715010284541894e-05, + "loss": 0.012316851876676083, + "num_input_tokens_seen": 31605680, + "step": 1930, + "train_runtime": 15685.3619, + "train_tokens_per_second": 2014.979 + }, + { + "epoch": 1.1703030303030304, + "grad_norm": 0.006031945813447237, + "learning_rate": 9.714690184831465e-05, + "loss": 0.01131855882704258, + "num_input_tokens_seen": 31622056, + "step": 1931, + "train_runtime": 15693.4757, + "train_tokens_per_second": 2014.981 + }, + { + "epoch": 1.170909090909091, + "grad_norm": 0.011418391950428486, + "learning_rate": 9.714369910733829e-05, + "loss": 0.012706535868346691, + "num_input_tokens_seen": 31638432, + "step": 1932, + "train_runtime": 15701.5872, + "train_tokens_per_second": 2014.983 + }, + { + "epoch": 1.1715151515151514, + "grad_norm": 0.019623173400759697, + "learning_rate": 9.714049462260833e-05, + "loss": 0.012899842113256454, + "num_input_tokens_seen": 31654808, + "step": 1933, + "train_runtime": 15709.6966, + "train_tokens_per_second": 2014.985 + }, + { + "epoch": 1.1721212121212121, + "grad_norm": 0.006744697690010071, + "learning_rate": 9.713728839424325e-05, + "loss": 0.011626766063272953, + "num_input_tokens_seen": 31671184, + "step": 1934, + "train_runtime": 15717.8085, + "train_tokens_per_second": 2014.987 + }, + { + "epoch": 1.1727272727272728, + "grad_norm": 0.011822863481938839, + "learning_rate": 9.713408042236166e-05, + "loss": 0.012239542789757252, + "num_input_tokens_seen": 31687560, + "step": 1935, + "train_runtime": 15725.9209, + "train_tokens_per_second": 2014.989 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 0.015078735537827015, + "learning_rate": 9.713087070708224e-05, + "loss": 0.012939630076289177, + "num_input_tokens_seen": 31703936, + "step": 1936, + "train_runtime": 15734.034, + "train_tokens_per_second": 2014.991 + }, + { + "epoch": 1.1739393939393938, + "grad_norm": 0.009683185257017612, + "learning_rate": 9.71276592485237e-05, + "loss": 0.011823873035609722, + "num_input_tokens_seen": 31720312, + "step": 1937, + "train_runtime": 15742.146, + "train_tokens_per_second": 2014.993 + }, + { + "epoch": 1.1745454545454546, + "grad_norm": 0.022101467475295067, + "learning_rate": 9.712444604680481e-05, + "loss": 0.013082625344395638, + "num_input_tokens_seen": 31736688, + "step": 1938, + "train_runtime": 15750.2574, + "train_tokens_per_second": 2014.995 + }, + { + "epoch": 1.175151515151515, + "grad_norm": 0.011323574930429459, + "learning_rate": 9.712123110204442e-05, + "loss": 0.012833611108362675, + "num_input_tokens_seen": 31753064, + "step": 1939, + "train_runtime": 15758.3713, + "train_tokens_per_second": 2014.997 + }, + { + "epoch": 1.1757575757575758, + "grad_norm": 0.008909697644412518, + "learning_rate": 9.711801441436148e-05, + "loss": 0.012643275782465935, + "num_input_tokens_seen": 31769440, + "step": 1940, + "train_runtime": 15766.4811, + "train_tokens_per_second": 2014.999 + }, + { + "epoch": 1.1763636363636363, + "grad_norm": 0.016408922150731087, + "learning_rate": 9.711479598387494e-05, + "loss": 0.011867276392877102, + "num_input_tokens_seen": 31785816, + "step": 1941, + "train_runtime": 15774.5926, + "train_tokens_per_second": 2015.001 + }, + { + "epoch": 1.176969696969697, + "grad_norm": 0.012622256763279438, + "learning_rate": 9.711157581070385e-05, + "loss": 0.011785149574279785, + "num_input_tokens_seen": 31802192, + "step": 1942, + "train_runtime": 15782.7054, + "train_tokens_per_second": 2015.003 + }, + { + "epoch": 1.1775757575757575, + "grad_norm": 0.009545985609292984, + "learning_rate": 9.71083538949673e-05, + "loss": 0.01227510068565607, + "num_input_tokens_seen": 31818568, + "step": 1943, + "train_runtime": 15790.8177, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 1.1781818181818182, + "grad_norm": 0.012797760777175426, + "learning_rate": 9.710513023678449e-05, + "loss": 0.0132676362991333, + "num_input_tokens_seen": 31834944, + "step": 1944, + "train_runtime": 15798.9326, + "train_tokens_per_second": 2015.006 + }, + { + "epoch": 1.1787878787878787, + "grad_norm": 0.005393213592469692, + "learning_rate": 9.710190483627465e-05, + "loss": 0.01224264781922102, + "num_input_tokens_seen": 31851320, + "step": 1945, + "train_runtime": 15807.0427, + "train_tokens_per_second": 2015.008 + }, + { + "epoch": 1.1793939393939394, + "grad_norm": 0.0058806887827813625, + "learning_rate": 9.709867769355707e-05, + "loss": 0.012036345899105072, + "num_input_tokens_seen": 31867696, + "step": 1946, + "train_runtime": 15815.1548, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 1.18, + "grad_norm": 0.012321457266807556, + "learning_rate": 9.709544880875113e-05, + "loss": 0.011586939916014671, + "num_input_tokens_seen": 31884072, + "step": 1947, + "train_runtime": 15823.2685, + "train_tokens_per_second": 2015.012 + }, + { + "epoch": 1.1806060606060607, + "grad_norm": 0.0094247255474329, + "learning_rate": 9.709221818197624e-05, + "loss": 0.012648900970816612, + "num_input_tokens_seen": 31900448, + "step": 1948, + "train_runtime": 15831.3802, + "train_tokens_per_second": 2015.014 + }, + { + "epoch": 1.1812121212121212, + "grad_norm": 0.01918826624751091, + "learning_rate": 9.70889858133519e-05, + "loss": 0.014765182510018349, + "num_input_tokens_seen": 31916824, + "step": 1949, + "train_runtime": 15839.4904, + "train_tokens_per_second": 2015.016 + }, + { + "epoch": 1.1818181818181819, + "grad_norm": 0.00749099301174283, + "learning_rate": 9.708575170299771e-05, + "loss": 0.012448623776435852, + "num_input_tokens_seen": 31933200, + "step": 1950, + "train_runtime": 15847.6013, + "train_tokens_per_second": 2015.018 + }, + { + "epoch": 1.1824242424242424, + "grad_norm": 0.012216299772262573, + "learning_rate": 9.708251585103322e-05, + "loss": 0.011859457939863205, + "num_input_tokens_seen": 31949576, + "step": 1951, + "train_runtime": 15855.711, + "train_tokens_per_second": 2015.02 + }, + { + "epoch": 1.183030303030303, + "grad_norm": 0.007972866296768188, + "learning_rate": 9.707927825757819e-05, + "loss": 0.013553624972701073, + "num_input_tokens_seen": 31965952, + "step": 1952, + "train_runtime": 15863.8207, + "train_tokens_per_second": 2015.022 + }, + { + "epoch": 1.1836363636363636, + "grad_norm": 0.009913386777043343, + "learning_rate": 9.707603892275233e-05, + "loss": 0.012638632208108902, + "num_input_tokens_seen": 31982328, + "step": 1953, + "train_runtime": 15871.9335, + "train_tokens_per_second": 2015.024 + }, + { + "epoch": 1.1842424242424243, + "grad_norm": 0.008947964757680893, + "learning_rate": 9.707279784667547e-05, + "loss": 0.011412415653467178, + "num_input_tokens_seen": 31998704, + "step": 1954, + "train_runtime": 15880.0463, + "train_tokens_per_second": 2015.026 + }, + { + "epoch": 1.1848484848484848, + "grad_norm": 0.011816347017884254, + "learning_rate": 9.706955502946748e-05, + "loss": 0.013120824471116066, + "num_input_tokens_seen": 32015080, + "step": 1955, + "train_runtime": 15888.1549, + "train_tokens_per_second": 2015.028 + }, + { + "epoch": 1.1854545454545455, + "grad_norm": 0.01560800801962614, + "learning_rate": 9.706631047124833e-05, + "loss": 0.013406200334429741, + "num_input_tokens_seen": 32031456, + "step": 1956, + "train_runtime": 15896.2624, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 1.186060606060606, + "grad_norm": 0.007744067348539829, + "learning_rate": 9.706306417213798e-05, + "loss": 0.011934707872569561, + "num_input_tokens_seen": 32047832, + "step": 1957, + "train_runtime": 15904.3754, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 1.1866666666666668, + "grad_norm": 0.013483401387929916, + "learning_rate": 9.705981613225656e-05, + "loss": 0.01200819294899702, + "num_input_tokens_seen": 32064208, + "step": 1958, + "train_runtime": 15912.4903, + "train_tokens_per_second": 2015.034 + }, + { + "epoch": 1.1872727272727273, + "grad_norm": 0.008434257470071316, + "learning_rate": 9.705656635172419e-05, + "loss": 0.01099243201315403, + "num_input_tokens_seen": 32080584, + "step": 1959, + "train_runtime": 15920.6011, + "train_tokens_per_second": 2015.036 + }, + { + "epoch": 1.187878787878788, + "grad_norm": 0.007052503060549498, + "learning_rate": 9.705331483066106e-05, + "loss": 0.011562798172235489, + "num_input_tokens_seen": 32096960, + "step": 1960, + "train_runtime": 15928.7124, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.1884848484848485, + "grad_norm": 0.0031905195210129023, + "learning_rate": 9.705006156918744e-05, + "loss": 0.012042374350130558, + "num_input_tokens_seen": 32113336, + "step": 1961, + "train_runtime": 15936.8214, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.189090909090909, + "grad_norm": 0.017608430236577988, + "learning_rate": 9.704680656742368e-05, + "loss": 0.012978605926036835, + "num_input_tokens_seen": 32129712, + "step": 1962, + "train_runtime": 15944.9337, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.1896969696969697, + "grad_norm": 0.011044766753911972, + "learning_rate": 9.704354982549016e-05, + "loss": 0.012821994721889496, + "num_input_tokens_seen": 32146088, + "step": 1963, + "train_runtime": 15953.0436, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.1903030303030304, + "grad_norm": 0.0026662456803023815, + "learning_rate": 9.704029134350735e-05, + "loss": 0.011127783916890621, + "num_input_tokens_seen": 32162464, + "step": 1964, + "train_runtime": 15961.1548, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.190909090909091, + "grad_norm": 0.013216360472142696, + "learning_rate": 9.703703112159576e-05, + "loss": 0.013246205635368824, + "num_input_tokens_seen": 32178840, + "step": 1965, + "train_runtime": 15969.2663, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.1915151515151514, + "grad_norm": 0.005839253775775433, + "learning_rate": 9.703376915987601e-05, + "loss": 0.011579563841223717, + "num_input_tokens_seen": 32195216, + "step": 1966, + "train_runtime": 15977.3772, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.1921212121212121, + "grad_norm": 0.014731714501976967, + "learning_rate": 9.703050545846871e-05, + "loss": 0.012472787871956825, + "num_input_tokens_seen": 32211592, + "step": 1967, + "train_runtime": 15985.4869, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.1927272727272726, + "grad_norm": 0.010836574248969555, + "learning_rate": 9.702724001749461e-05, + "loss": 0.01216835342347622, + "num_input_tokens_seen": 32227968, + "step": 1968, + "train_runtime": 15993.5976, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.1933333333333334, + "grad_norm": 0.008247988298535347, + "learning_rate": 9.702397283707448e-05, + "loss": 0.012309988029301167, + "num_input_tokens_seen": 32244344, + "step": 1969, + "train_runtime": 16001.7113, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.1939393939393939, + "grad_norm": 0.01689152605831623, + "learning_rate": 9.702070391732919e-05, + "loss": 0.013732653111219406, + "num_input_tokens_seen": 32260720, + "step": 1970, + "train_runtime": 16009.8226, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.1945454545454546, + "grad_norm": 0.013673562556505203, + "learning_rate": 9.70174332583796e-05, + "loss": 0.013782327063381672, + "num_input_tokens_seen": 32277096, + "step": 1971, + "train_runtime": 16017.9333, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.195151515151515, + "grad_norm": 0.010520472191274166, + "learning_rate": 9.701416086034672e-05, + "loss": 0.013326936401426792, + "num_input_tokens_seen": 32293472, + "step": 1972, + "train_runtime": 16026.0492, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.1957575757575758, + "grad_norm": 0.007514155004173517, + "learning_rate": 9.70108867233516e-05, + "loss": 0.012479415163397789, + "num_input_tokens_seen": 32309848, + "step": 1973, + "train_runtime": 16034.161, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.1963636363636363, + "grad_norm": 0.004866638220846653, + "learning_rate": 9.700761084751533e-05, + "loss": 0.011496278457343578, + "num_input_tokens_seen": 32326224, + "step": 1974, + "train_runtime": 16042.2706, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.196969696969697, + "grad_norm": 0.008786221966147423, + "learning_rate": 9.700433323295907e-05, + "loss": 0.012215346097946167, + "num_input_tokens_seen": 32342600, + "step": 1975, + "train_runtime": 16050.3809, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.1975757575757575, + "grad_norm": 0.01284866128116846, + "learning_rate": 9.700105387980406e-05, + "loss": 0.012052543461322784, + "num_input_tokens_seen": 32358976, + "step": 1976, + "train_runtime": 16058.492, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.1981818181818182, + "grad_norm": 0.00797280389815569, + "learning_rate": 9.699777278817161e-05, + "loss": 0.01189066469669342, + "num_input_tokens_seen": 32375352, + "step": 1977, + "train_runtime": 16066.6022, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.1987878787878787, + "grad_norm": 0.005341751966625452, + "learning_rate": 9.699448995818306e-05, + "loss": 0.01284201443195343, + "num_input_tokens_seen": 32391728, + "step": 1978, + "train_runtime": 16074.7091, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.1993939393939395, + "grad_norm": 0.008474123664200306, + "learning_rate": 9.699120538995982e-05, + "loss": 0.011298474855720997, + "num_input_tokens_seen": 32408104, + "step": 1979, + "train_runtime": 16082.8191, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.2, + "grad_norm": 0.005944432690739632, + "learning_rate": 9.698791908362344e-05, + "loss": 0.011836757883429527, + "num_input_tokens_seen": 32424480, + "step": 1980, + "train_runtime": 16090.9333, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.2006060606060607, + "grad_norm": 0.04931863769888878, + "learning_rate": 9.698463103929542e-05, + "loss": 0.013664236292243004, + "num_input_tokens_seen": 32440856, + "step": 1981, + "train_runtime": 16099.0461, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.2012121212121212, + "grad_norm": 0.017436610534787178, + "learning_rate": 9.698134125709741e-05, + "loss": 0.012293105944991112, + "num_input_tokens_seen": 32457232, + "step": 1982, + "train_runtime": 16107.1573, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.201818181818182, + "grad_norm": 0.012726429849863052, + "learning_rate": 9.697804973715106e-05, + "loss": 0.0125419395044446, + "num_input_tokens_seen": 32473608, + "step": 1983, + "train_runtime": 16115.2692, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.2024242424242424, + "grad_norm": 0.00739372568204999, + "learning_rate": 9.697475647957814e-05, + "loss": 0.011883998289704323, + "num_input_tokens_seen": 32489984, + "step": 1984, + "train_runtime": 16123.3808, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.2030303030303031, + "grad_norm": 0.009471113793551922, + "learning_rate": 9.697146148450047e-05, + "loss": 0.013606306165456772, + "num_input_tokens_seen": 32506360, + "step": 1985, + "train_runtime": 16131.4911, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.2036363636363636, + "grad_norm": 0.010123181156814098, + "learning_rate": 9.696816475203992e-05, + "loss": 0.013429714366793633, + "num_input_tokens_seen": 32522736, + "step": 1986, + "train_runtime": 16139.6034, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.2042424242424243, + "grad_norm": 0.0061989715322852135, + "learning_rate": 9.69648662823184e-05, + "loss": 0.012474027462303638, + "num_input_tokens_seen": 32539112, + "step": 1987, + "train_runtime": 16147.7149, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.2048484848484848, + "grad_norm": 0.012311330065131187, + "learning_rate": 9.696156607545795e-05, + "loss": 0.01264164038002491, + "num_input_tokens_seen": 32555488, + "step": 1988, + "train_runtime": 16155.8325, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.2054545454545456, + "grad_norm": 0.0055806501768529415, + "learning_rate": 9.69582641315806e-05, + "loss": 0.011141511611640453, + "num_input_tokens_seen": 32571864, + "step": 1989, + "train_runtime": 16163.9452, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.206060606060606, + "grad_norm": 0.005560377612709999, + "learning_rate": 9.695496045080853e-05, + "loss": 0.012061137706041336, + "num_input_tokens_seen": 32588240, + "step": 1990, + "train_runtime": 16172.0604, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.2066666666666666, + "grad_norm": 0.012639972381293774, + "learning_rate": 9.69516550332639e-05, + "loss": 0.012125247158110142, + "num_input_tokens_seen": 32604616, + "step": 1991, + "train_runtime": 16180.1724, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.2072727272727273, + "grad_norm": 0.012606188654899597, + "learning_rate": 9.6948347879069e-05, + "loss": 0.012232892215251923, + "num_input_tokens_seen": 32620992, + "step": 1992, + "train_runtime": 16188.2838, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.207878787878788, + "grad_norm": 0.008077176287770271, + "learning_rate": 9.694503898834612e-05, + "loss": 0.011687630787491798, + "num_input_tokens_seen": 32637368, + "step": 1993, + "train_runtime": 16196.3922, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.2084848484848485, + "grad_norm": 0.006857742555439472, + "learning_rate": 9.694172836121769e-05, + "loss": 0.012620335444808006, + "num_input_tokens_seen": 32653744, + "step": 1994, + "train_runtime": 16204.5031, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.209090909090909, + "grad_norm": 0.007197006605565548, + "learning_rate": 9.693841599780613e-05, + "loss": 0.012056245468556881, + "num_input_tokens_seen": 32670120, + "step": 1995, + "train_runtime": 16212.6138, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.2096969696969697, + "grad_norm": 0.00502738356590271, + "learning_rate": 9.693510189823398e-05, + "loss": 0.012053314596414566, + "num_input_tokens_seen": 32686496, + "step": 1996, + "train_runtime": 16220.723, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.2103030303030302, + "grad_norm": 0.010123145766556263, + "learning_rate": 9.69317860626238e-05, + "loss": 0.012628944590687752, + "num_input_tokens_seen": 32702872, + "step": 1997, + "train_runtime": 16228.8373, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.210909090909091, + "grad_norm": 0.00677911564707756, + "learning_rate": 9.692846849109827e-05, + "loss": 0.01243099570274353, + "num_input_tokens_seen": 32719248, + "step": 1998, + "train_runtime": 16236.9472, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.2115151515151514, + "grad_norm": 0.018878202885389328, + "learning_rate": 9.692514918378006e-05, + "loss": 0.012131169438362122, + "num_input_tokens_seen": 32735624, + "step": 1999, + "train_runtime": 16245.0577, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.2121212121212122, + "grad_norm": 0.009054052643477917, + "learning_rate": 9.692182814079197e-05, + "loss": 0.013145225122570992, + "num_input_tokens_seen": 32752000, + "step": 2000, + "train_runtime": 16253.1657, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.2127272727272727, + "grad_norm": 0.00841708667576313, + "learning_rate": 9.691850536225684e-05, + "loss": 0.013336677104234695, + "num_input_tokens_seen": 32768376, + "step": 2001, + "train_runtime": 16262.2722, + "train_tokens_per_second": 2014.994 + }, + { + "epoch": 1.2133333333333334, + "grad_norm": 0.008042370900511742, + "learning_rate": 9.691518084829756e-05, + "loss": 0.011015270836651325, + "num_input_tokens_seen": 32784752, + "step": 2002, + "train_runtime": 16270.3838, + "train_tokens_per_second": 2014.996 + }, + { + "epoch": 1.2139393939393939, + "grad_norm": 0.014253576286137104, + "learning_rate": 9.691185459903709e-05, + "loss": 0.012948616407811642, + "num_input_tokens_seen": 32801128, + "step": 2003, + "train_runtime": 16278.4941, + "train_tokens_per_second": 2014.998 + }, + { + "epoch": 1.2145454545454546, + "grad_norm": 0.010794151574373245, + "learning_rate": 9.690852661459849e-05, + "loss": 0.012411735020577908, + "num_input_tokens_seen": 32817504, + "step": 2004, + "train_runtime": 16286.6014, + "train_tokens_per_second": 2015.0 + }, + { + "epoch": 1.215151515151515, + "grad_norm": 0.018177302554249763, + "learning_rate": 9.690519689510484e-05, + "loss": 0.011491055600345135, + "num_input_tokens_seen": 32833880, + "step": 2005, + "train_runtime": 16294.7116, + "train_tokens_per_second": 2015.002 + }, + { + "epoch": 1.2157575757575758, + "grad_norm": 0.006788145750761032, + "learning_rate": 9.69018654406793e-05, + "loss": 0.012544289231300354, + "num_input_tokens_seen": 32850256, + "step": 2006, + "train_runtime": 16302.8206, + "train_tokens_per_second": 2015.004 + }, + { + "epoch": 1.2163636363636363, + "grad_norm": 0.010082833468914032, + "learning_rate": 9.68985322514451e-05, + "loss": 0.011602875776588917, + "num_input_tokens_seen": 32866632, + "step": 2007, + "train_runtime": 16310.9318, + "train_tokens_per_second": 2015.006 + }, + { + "epoch": 1.216969696969697, + "grad_norm": 0.005194054916501045, + "learning_rate": 9.689519732752552e-05, + "loss": 0.012436242774128914, + "num_input_tokens_seen": 32883008, + "step": 2008, + "train_runtime": 16319.0428, + "train_tokens_per_second": 2015.008 + }, + { + "epoch": 1.2175757575757575, + "grad_norm": 0.011215485632419586, + "learning_rate": 9.68918606690439e-05, + "loss": 0.012597991153597832, + "num_input_tokens_seen": 32899384, + "step": 2009, + "train_runtime": 16327.1604, + "train_tokens_per_second": 2015.01 + }, + { + "epoch": 1.2181818181818183, + "grad_norm": 0.007134478073567152, + "learning_rate": 9.688852227612369e-05, + "loss": 0.0119805708527565, + "num_input_tokens_seen": 32915760, + "step": 2010, + "train_runtime": 16335.2778, + "train_tokens_per_second": 2015.011 + }, + { + "epoch": 1.2187878787878788, + "grad_norm": 0.005966852884739637, + "learning_rate": 9.688518214888836e-05, + "loss": 0.012521771714091301, + "num_input_tokens_seen": 32932136, + "step": 2011, + "train_runtime": 16343.3899, + "train_tokens_per_second": 2015.013 + }, + { + "epoch": 1.2193939393939395, + "grad_norm": 0.011345195583999157, + "learning_rate": 9.688184028746141e-05, + "loss": 0.012418065220117569, + "num_input_tokens_seen": 32948512, + "step": 2012, + "train_runtime": 16351.5105, + "train_tokens_per_second": 2015.013 + }, + { + "epoch": 1.22, + "grad_norm": 0.10804049670696259, + "learning_rate": 9.687849669196652e-05, + "loss": 0.013680079020559788, + "num_input_tokens_seen": 32964888, + "step": 2013, + "train_runtime": 16359.6328, + "train_tokens_per_second": 2015.014 + }, + { + "epoch": 1.2206060606060607, + "grad_norm": 0.03245476633310318, + "learning_rate": 9.687515136252731e-05, + "loss": 0.013701889663934708, + "num_input_tokens_seen": 32981264, + "step": 2014, + "train_runtime": 16367.748, + "train_tokens_per_second": 2015.015 + }, + { + "epoch": 1.2212121212121212, + "grad_norm": 0.011148846708238125, + "learning_rate": 9.687180429926754e-05, + "loss": 0.01261454913765192, + "num_input_tokens_seen": 32997640, + "step": 2015, + "train_runtime": 16375.8622, + "train_tokens_per_second": 2015.017 + }, + { + "epoch": 1.221818181818182, + "grad_norm": 0.01723579317331314, + "learning_rate": 9.686845550231102e-05, + "loss": 0.013511408120393753, + "num_input_tokens_seen": 33014016, + "step": 2016, + "train_runtime": 16383.9829, + "train_tokens_per_second": 2015.018 + }, + { + "epoch": 1.2224242424242424, + "grad_norm": 0.008297464810311794, + "learning_rate": 9.68651049717816e-05, + "loss": 0.012829539366066456, + "num_input_tokens_seen": 33030392, + "step": 2017, + "train_runtime": 16392.0973, + "train_tokens_per_second": 2015.019 + }, + { + "epoch": 1.2230303030303031, + "grad_norm": 0.008225811645388603, + "learning_rate": 9.68617527078032e-05, + "loss": 0.01188596710562706, + "num_input_tokens_seen": 33046768, + "step": 2018, + "train_runtime": 16400.2172, + "train_tokens_per_second": 2015.02 + }, + { + "epoch": 1.2236363636363636, + "grad_norm": 0.01628193072974682, + "learning_rate": 9.685839871049984e-05, + "loss": 0.012169532477855682, + "num_input_tokens_seen": 33063144, + "step": 2019, + "train_runtime": 16408.3356, + "train_tokens_per_second": 2015.021 + }, + { + "epoch": 1.2242424242424241, + "grad_norm": 0.021080298349261284, + "learning_rate": 9.685504297999556e-05, + "loss": 0.012592184357345104, + "num_input_tokens_seen": 33079520, + "step": 2020, + "train_runtime": 16416.4533, + "train_tokens_per_second": 2015.022 + }, + { + "epoch": 1.2248484848484849, + "grad_norm": 0.013393021188676357, + "learning_rate": 9.685168551641448e-05, + "loss": 0.01185669656842947, + "num_input_tokens_seen": 33095896, + "step": 2021, + "train_runtime": 16424.5679, + "train_tokens_per_second": 2015.024 + }, + { + "epoch": 1.2254545454545456, + "grad_norm": 0.011576077900826931, + "learning_rate": 9.68483263198808e-05, + "loss": 0.012582024559378624, + "num_input_tokens_seen": 33112272, + "step": 2022, + "train_runtime": 16432.6854, + "train_tokens_per_second": 2015.025 + }, + { + "epoch": 1.226060606060606, + "grad_norm": 0.015389593318104744, + "learning_rate": 9.684496539051874e-05, + "loss": 0.012190048582851887, + "num_input_tokens_seen": 33128648, + "step": 2023, + "train_runtime": 16440.8056, + "train_tokens_per_second": 2015.026 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 0.005747777409851551, + "learning_rate": 9.684160272845267e-05, + "loss": 0.011490939185023308, + "num_input_tokens_seen": 33145024, + "step": 2024, + "train_runtime": 16448.9214, + "train_tokens_per_second": 2015.027 + }, + { + "epoch": 1.2272727272727273, + "grad_norm": 0.013231366872787476, + "learning_rate": 9.683823833380692e-05, + "loss": 0.01152926217764616, + "num_input_tokens_seen": 33161400, + "step": 2025, + "train_runtime": 16457.0388, + "train_tokens_per_second": 2015.028 + }, + { + "epoch": 1.2278787878787878, + "grad_norm": 0.007870008237659931, + "learning_rate": 9.683487220670595e-05, + "loss": 0.012311085127294064, + "num_input_tokens_seen": 33177776, + "step": 2026, + "train_runtime": 16465.1579, + "train_tokens_per_second": 2015.029 + }, + { + "epoch": 1.2284848484848485, + "grad_norm": 0.0100321713835001, + "learning_rate": 9.683150434727427e-05, + "loss": 0.013347048312425613, + "num_input_tokens_seen": 33194152, + "step": 2027, + "train_runtime": 16473.2735, + "train_tokens_per_second": 2015.031 + }, + { + "epoch": 1.229090909090909, + "grad_norm": 0.00871365051716566, + "learning_rate": 9.682813475563643e-05, + "loss": 0.012250279076397419, + "num_input_tokens_seen": 33210528, + "step": 2028, + "train_runtime": 16481.3886, + "train_tokens_per_second": 2015.032 + }, + { + "epoch": 1.2296969696969697, + "grad_norm": 0.010484444908797741, + "learning_rate": 9.682476343191708e-05, + "loss": 0.012891886755824089, + "num_input_tokens_seen": 33226904, + "step": 2029, + "train_runtime": 16489.5073, + "train_tokens_per_second": 2015.033 + }, + { + "epoch": 1.2303030303030302, + "grad_norm": 0.015102782286703587, + "learning_rate": 9.682139037624092e-05, + "loss": 0.011657091788947582, + "num_input_tokens_seen": 33243280, + "step": 2030, + "train_runtime": 16497.6346, + "train_tokens_per_second": 2015.033 + }, + { + "epoch": 1.230909090909091, + "grad_norm": 0.015282537788152695, + "learning_rate": 9.681801558873272e-05, + "loss": 0.012187018990516663, + "num_input_tokens_seen": 33259656, + "step": 2031, + "train_runtime": 16505.7523, + "train_tokens_per_second": 2015.034 + }, + { + "epoch": 1.2315151515151515, + "grad_norm": 0.008355207741260529, + "learning_rate": 9.681463906951729e-05, + "loss": 0.011612944304943085, + "num_input_tokens_seen": 33276032, + "step": 2032, + "train_runtime": 16513.8704, + "train_tokens_per_second": 2015.035 + }, + { + "epoch": 1.2321212121212122, + "grad_norm": 0.01107091549783945, + "learning_rate": 9.681126081871955e-05, + "loss": 0.011706216260790825, + "num_input_tokens_seen": 33292408, + "step": 2033, + "train_runtime": 16521.9865, + "train_tokens_per_second": 2015.037 + }, + { + "epoch": 1.2327272727272727, + "grad_norm": 0.0022801703307777643, + "learning_rate": 9.680788083646439e-05, + "loss": 0.011544807814061642, + "num_input_tokens_seen": 33308784, + "step": 2034, + "train_runtime": 16530.102, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.2333333333333334, + "grad_norm": 0.008891911245882511, + "learning_rate": 9.68044991228769e-05, + "loss": 0.01236899383366108, + "num_input_tokens_seen": 33325160, + "step": 2035, + "train_runtime": 16538.2109, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.233939393939394, + "grad_norm": 0.011175681836903095, + "learning_rate": 9.680111567808213e-05, + "loss": 0.011853158473968506, + "num_input_tokens_seen": 33341536, + "step": 2036, + "train_runtime": 16546.3354, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.2345454545454546, + "grad_norm": 0.008549856953322887, + "learning_rate": 9.679773050220524e-05, + "loss": 0.011179388500750065, + "num_input_tokens_seen": 33357912, + "step": 2037, + "train_runtime": 16554.4491, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.2351515151515151, + "grad_norm": 0.005819539073854685, + "learning_rate": 9.67943435953714e-05, + "loss": 0.01257402915507555, + "num_input_tokens_seen": 33374288, + "step": 2038, + "train_runtime": 16562.563, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.2357575757575758, + "grad_norm": 0.01801511086523533, + "learning_rate": 9.679095495770596e-05, + "loss": 0.01320955716073513, + "num_input_tokens_seen": 33390664, + "step": 2039, + "train_runtime": 16570.6848, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.2363636363636363, + "grad_norm": 0.009865757077932358, + "learning_rate": 9.67875645893342e-05, + "loss": 0.012449276633560658, + "num_input_tokens_seen": 33407040, + "step": 2040, + "train_runtime": 16578.8018, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.236969696969697, + "grad_norm": 0.008288033306598663, + "learning_rate": 9.678417249038154e-05, + "loss": 0.01053472887724638, + "num_input_tokens_seen": 33423416, + "step": 2041, + "train_runtime": 16586.9196, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.2375757575757576, + "grad_norm": 0.02465786226093769, + "learning_rate": 9.678077866097345e-05, + "loss": 0.012921673245728016, + "num_input_tokens_seen": 33439792, + "step": 2042, + "train_runtime": 16595.0371, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.2381818181818183, + "grad_norm": 0.013770204037427902, + "learning_rate": 9.677738310123545e-05, + "loss": 0.012475434690713882, + "num_input_tokens_seen": 33456168, + "step": 2043, + "train_runtime": 16603.154, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.2387878787878788, + "grad_norm": 0.01958361268043518, + "learning_rate": 9.677398581129316e-05, + "loss": 0.01197823602706194, + "num_input_tokens_seen": 33472544, + "step": 2044, + "train_runtime": 16611.2666, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.2393939393939393, + "grad_norm": 0.004778360947966576, + "learning_rate": 9.67705867912722e-05, + "loss": 0.012999579310417175, + "num_input_tokens_seen": 33488920, + "step": 2045, + "train_runtime": 16619.3844, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.24, + "grad_norm": 0.015011285431683064, + "learning_rate": 9.676718604129832e-05, + "loss": 0.014176999218761921, + "num_input_tokens_seen": 33505296, + "step": 2046, + "train_runtime": 16627.5012, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.2406060606060607, + "grad_norm": 0.019506732001900673, + "learning_rate": 9.676378356149734e-05, + "loss": 0.012224535457789898, + "num_input_tokens_seen": 33521672, + "step": 2047, + "train_runtime": 16635.6217, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.2412121212121212, + "grad_norm": 0.13394935429096222, + "learning_rate": 9.676037935199505e-05, + "loss": 0.01198052242398262, + "num_input_tokens_seen": 33538048, + "step": 2048, + "train_runtime": 16643.7418, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.2418181818181817, + "grad_norm": 0.010586493648588657, + "learning_rate": 9.675697341291738e-05, + "loss": 0.013710709288716316, + "num_input_tokens_seen": 33554424, + "step": 2049, + "train_runtime": 16651.8585, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.2424242424242424, + "grad_norm": 0.007696137297898531, + "learning_rate": 9.675356574439031e-05, + "loss": 0.013701122254133224, + "num_input_tokens_seen": 33570800, + "step": 2050, + "train_runtime": 16659.9803, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.2430303030303032, + "grad_norm": 0.012039069086313248, + "learning_rate": 9.675015634653992e-05, + "loss": 0.011533101089298725, + "num_input_tokens_seen": 33587176, + "step": 2051, + "train_runtime": 16668.0889, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.2436363636363637, + "grad_norm": 0.011677329428493977, + "learning_rate": 9.674674521949227e-05, + "loss": 0.013059742748737335, + "num_input_tokens_seen": 33603552, + "step": 2052, + "train_runtime": 16676.2054, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.2442424242424241, + "grad_norm": 0.009239686653017998, + "learning_rate": 9.674333236337356e-05, + "loss": 0.012059221975505352, + "num_input_tokens_seen": 33619928, + "step": 2053, + "train_runtime": 16684.3227, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.2448484848484849, + "grad_norm": 0.09366417676210403, + "learning_rate": 9.673991777831001e-05, + "loss": 0.012007784098386765, + "num_input_tokens_seen": 33636304, + "step": 2054, + "train_runtime": 16692.4412, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.2454545454545454, + "grad_norm": 0.009940850548446178, + "learning_rate": 9.673650146442791e-05, + "loss": 0.012801412492990494, + "num_input_tokens_seen": 33652680, + "step": 2055, + "train_runtime": 16700.5558, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.246060606060606, + "grad_norm": 0.011787704192101955, + "learning_rate": 9.673308342185365e-05, + "loss": 0.013499357737600803, + "num_input_tokens_seen": 33669056, + "step": 2056, + "train_runtime": 16708.6666, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.2466666666666666, + "grad_norm": 0.012068502604961395, + "learning_rate": 9.672966365071365e-05, + "loss": 0.013464034534990788, + "num_input_tokens_seen": 33685432, + "step": 2057, + "train_runtime": 16716.7761, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.2472727272727273, + "grad_norm": 0.007820342667400837, + "learning_rate": 9.67262421511344e-05, + "loss": 0.011818510480225086, + "num_input_tokens_seen": 33701808, + "step": 2058, + "train_runtime": 16724.8854, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.2478787878787878, + "grad_norm": 0.011603613384068012, + "learning_rate": 9.672281892324242e-05, + "loss": 0.01426640897989273, + "num_input_tokens_seen": 33718184, + "step": 2059, + "train_runtime": 16732.9925, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.2484848484848485, + "grad_norm": 0.015823999419808388, + "learning_rate": 9.671939396716436e-05, + "loss": 0.013109918683767319, + "num_input_tokens_seen": 33734560, + "step": 2060, + "train_runtime": 16741.1026, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.249090909090909, + "grad_norm": 0.00919675175100565, + "learning_rate": 9.671596728302692e-05, + "loss": 0.01303145196288824, + "num_input_tokens_seen": 33750936, + "step": 2061, + "train_runtime": 16749.2125, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.2496969696969698, + "grad_norm": 0.00808015652000904, + "learning_rate": 9.671253887095681e-05, + "loss": 0.013077626936137676, + "num_input_tokens_seen": 33767312, + "step": 2062, + "train_runtime": 16757.3238, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.2503030303030302, + "grad_norm": 0.007350953761488199, + "learning_rate": 9.670910873108086e-05, + "loss": 0.012963245622813702, + "num_input_tokens_seen": 33783688, + "step": 2063, + "train_runtime": 16765.4356, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.250909090909091, + "grad_norm": 0.011904108338057995, + "learning_rate": 9.670567686352594e-05, + "loss": 0.012116971425712109, + "num_input_tokens_seen": 33800064, + "step": 2064, + "train_runtime": 16773.546, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.2515151515151515, + "grad_norm": 0.011483087204396725, + "learning_rate": 9.6702243268419e-05, + "loss": 0.013674170710146427, + "num_input_tokens_seen": 33816440, + "step": 2065, + "train_runtime": 16781.6557, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.2521212121212122, + "grad_norm": 0.010501409880816936, + "learning_rate": 9.669880794588701e-05, + "loss": 0.011777383275330067, + "num_input_tokens_seen": 33832816, + "step": 2066, + "train_runtime": 16789.766, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.2527272727272727, + "grad_norm": 0.009026569314301014, + "learning_rate": 9.669537089605705e-05, + "loss": 0.012171929702162743, + "num_input_tokens_seen": 33849192, + "step": 2067, + "train_runtime": 16797.8767, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 0.018056530505418777, + "learning_rate": 9.669193211905627e-05, + "loss": 0.012256179004907608, + "num_input_tokens_seen": 33865568, + "step": 2068, + "train_runtime": 16805.9871, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.253939393939394, + "grad_norm": 0.010606756433844566, + "learning_rate": 9.668849161501185e-05, + "loss": 0.012096712365746498, + "num_input_tokens_seen": 33881944, + "step": 2069, + "train_runtime": 16814.095, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.2545454545454544, + "grad_norm": 0.011599649675190449, + "learning_rate": 9.668504938405105e-05, + "loss": 0.013895167037844658, + "num_input_tokens_seen": 33898320, + "step": 2070, + "train_runtime": 16822.2019, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.2551515151515151, + "grad_norm": 0.01043979823589325, + "learning_rate": 9.668160542630118e-05, + "loss": 0.012740423902869225, + "num_input_tokens_seen": 33914696, + "step": 2071, + "train_runtime": 16830.3157, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.2557575757575759, + "grad_norm": 0.006431375164538622, + "learning_rate": 9.667815974188965e-05, + "loss": 0.01092531718313694, + "num_input_tokens_seen": 33931072, + "step": 2072, + "train_runtime": 16838.4336, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.2563636363636363, + "grad_norm": 0.012145663611590862, + "learning_rate": 9.667471233094387e-05, + "loss": 0.011573081836104393, + "num_input_tokens_seen": 33947448, + "step": 2073, + "train_runtime": 16846.5416, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.2569696969696968, + "grad_norm": 0.01263564545661211, + "learning_rate": 9.667126319359139e-05, + "loss": 0.012712339870631695, + "num_input_tokens_seen": 33963824, + "step": 2074, + "train_runtime": 16854.6507, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.2575757575757576, + "grad_norm": 0.010235338471829891, + "learning_rate": 9.666781232995976e-05, + "loss": 0.012764286249876022, + "num_input_tokens_seen": 33980200, + "step": 2075, + "train_runtime": 16862.7613, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.2581818181818183, + "grad_norm": 0.01789509318768978, + "learning_rate": 9.666435974017665e-05, + "loss": 0.012191517278552055, + "num_input_tokens_seen": 33996576, + "step": 2076, + "train_runtime": 16870.8737, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.2587878787878788, + "grad_norm": 0.03132858872413635, + "learning_rate": 9.666090542436974e-05, + "loss": 0.013369154185056686, + "num_input_tokens_seen": 34012952, + "step": 2077, + "train_runtime": 16878.986, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.2593939393939393, + "grad_norm": 0.010015154257416725, + "learning_rate": 9.665744938266681e-05, + "loss": 0.011515887454152107, + "num_input_tokens_seen": 34029328, + "step": 2078, + "train_runtime": 16887.0989, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.26, + "grad_norm": 0.014227609150111675, + "learning_rate": 9.665399161519569e-05, + "loss": 0.011430085636675358, + "num_input_tokens_seen": 34045704, + "step": 2079, + "train_runtime": 16895.2111, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.2606060606060607, + "grad_norm": 0.010706653818488121, + "learning_rate": 9.665053212208426e-05, + "loss": 0.012273017317056656, + "num_input_tokens_seen": 34062080, + "step": 2080, + "train_runtime": 16903.3202, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.2612121212121212, + "grad_norm": 0.011419372633099556, + "learning_rate": 9.66470709034605e-05, + "loss": 0.012077580206096172, + "num_input_tokens_seen": 34078456, + "step": 2081, + "train_runtime": 16911.4345, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.2618181818181817, + "grad_norm": 0.004964667372405529, + "learning_rate": 9.664360795945244e-05, + "loss": 0.011858327314257622, + "num_input_tokens_seen": 34094832, + "step": 2082, + "train_runtime": 16919.5448, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.2624242424242424, + "grad_norm": 0.01158229261636734, + "learning_rate": 9.664014329018813e-05, + "loss": 0.012356961145997047, + "num_input_tokens_seen": 34111208, + "step": 2083, + "train_runtime": 16927.655, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.263030303030303, + "grad_norm": 0.007866512052714825, + "learning_rate": 9.663667689579578e-05, + "loss": 0.012617571279406548, + "num_input_tokens_seen": 34127584, + "step": 2084, + "train_runtime": 16935.7655, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.2636363636363637, + "grad_norm": 0.010511146858334541, + "learning_rate": 9.663320877640355e-05, + "loss": 0.013274503871798515, + "num_input_tokens_seen": 34143960, + "step": 2085, + "train_runtime": 16943.8757, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.2642424242424242, + "grad_norm": 0.010922242887318134, + "learning_rate": 9.662973893213976e-05, + "loss": 0.012496921233832836, + "num_input_tokens_seen": 34160336, + "step": 2086, + "train_runtime": 16951.9865, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.2648484848484849, + "grad_norm": 0.013786193914711475, + "learning_rate": 9.662626736313271e-05, + "loss": 0.011950873769819736, + "num_input_tokens_seen": 34176712, + "step": 2087, + "train_runtime": 16960.1051, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.2654545454545454, + "grad_norm": 0.008805682882666588, + "learning_rate": 9.662279406951084e-05, + "loss": 0.011753477156162262, + "num_input_tokens_seen": 34193088, + "step": 2088, + "train_runtime": 16968.2158, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.266060606060606, + "grad_norm": 0.006958288606256247, + "learning_rate": 9.661931905140263e-05, + "loss": 0.01254260540008545, + "num_input_tokens_seen": 34209464, + "step": 2089, + "train_runtime": 16976.3341, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 0.015437749214470387, + "learning_rate": 9.661584230893657e-05, + "loss": 0.012971932999789715, + "num_input_tokens_seen": 34225840, + "step": 2090, + "train_runtime": 16984.4439, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.2672727272727273, + "grad_norm": 0.014704076573252678, + "learning_rate": 9.661236384224129e-05, + "loss": 0.013508946634829044, + "num_input_tokens_seen": 34242216, + "step": 2091, + "train_runtime": 16992.5556, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.2678787878787878, + "grad_norm": 0.006109852343797684, + "learning_rate": 9.660888365144545e-05, + "loss": 0.012193357571959496, + "num_input_tokens_seen": 34258592, + "step": 2092, + "train_runtime": 17000.6672, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.2684848484848485, + "grad_norm": 0.008939354680478573, + "learning_rate": 9.660540173667778e-05, + "loss": 0.01164991408586502, + "num_input_tokens_seen": 34274968, + "step": 2093, + "train_runtime": 17008.7775, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.269090909090909, + "grad_norm": 0.012699137441813946, + "learning_rate": 9.660191809806705e-05, + "loss": 0.012199487537145615, + "num_input_tokens_seen": 34291344, + "step": 2094, + "train_runtime": 17016.8855, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.2696969696969698, + "grad_norm": 0.01226895023137331, + "learning_rate": 9.659843273574212e-05, + "loss": 0.0135191073641181, + "num_input_tokens_seen": 34307720, + "step": 2095, + "train_runtime": 17024.9979, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.2703030303030303, + "grad_norm": 0.0103456387296319, + "learning_rate": 9.659494564983191e-05, + "loss": 0.01234687864780426, + "num_input_tokens_seen": 34324096, + "step": 2096, + "train_runtime": 17033.1086, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.270909090909091, + "grad_norm": 0.012149964459240437, + "learning_rate": 9.65914568404654e-05, + "loss": 0.01230283547192812, + "num_input_tokens_seen": 34340472, + "step": 2097, + "train_runtime": 17041.2202, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.2715151515151515, + "grad_norm": 0.012203685007989407, + "learning_rate": 9.658796630777162e-05, + "loss": 0.011648830026388168, + "num_input_tokens_seen": 34356848, + "step": 2098, + "train_runtime": 17049.3365, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.272121212121212, + "grad_norm": 0.015927039086818695, + "learning_rate": 9.658447405187971e-05, + "loss": 0.010989967733621597, + "num_input_tokens_seen": 34373224, + "step": 2099, + "train_runtime": 17057.4473, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.2727272727272727, + "grad_norm": 0.008946129120886326, + "learning_rate": 9.658098007291883e-05, + "loss": 0.012766811065375805, + "num_input_tokens_seen": 34389600, + "step": 2100, + "train_runtime": 17065.5571, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.2733333333333334, + "grad_norm": 0.008856577798724174, + "learning_rate": 9.65774843710182e-05, + "loss": 0.012293383479118347, + "num_input_tokens_seen": 34405976, + "step": 2101, + "train_runtime": 17074.6331, + "train_tokens_per_second": 2015.035 + }, + { + "epoch": 1.273939393939394, + "grad_norm": 0.010618072003126144, + "learning_rate": 9.657398694630712e-05, + "loss": 0.01199729647487402, + "num_input_tokens_seen": 34422352, + "step": 2102, + "train_runtime": 17082.7419, + "train_tokens_per_second": 2015.037 + }, + { + "epoch": 1.2745454545454544, + "grad_norm": 0.0050176153890788555, + "learning_rate": 9.657048779891498e-05, + "loss": 0.012176436372101307, + "num_input_tokens_seen": 34438728, + "step": 2103, + "train_runtime": 17090.8518, + "train_tokens_per_second": 2015.039 + }, + { + "epoch": 1.2751515151515151, + "grad_norm": 0.014950458891689777, + "learning_rate": 9.656698692897117e-05, + "loss": 0.01280341949313879, + "num_input_tokens_seen": 34455104, + "step": 2104, + "train_runtime": 17098.9578, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 1.2757575757575759, + "grad_norm": 0.010291986167430878, + "learning_rate": 9.656348433660521e-05, + "loss": 0.012015961110591888, + "num_input_tokens_seen": 34471480, + "step": 2105, + "train_runtime": 17107.0671, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.2763636363636364, + "grad_norm": 0.01197121199220419, + "learning_rate": 9.655998002194663e-05, + "loss": 0.013888024725019932, + "num_input_tokens_seen": 34487856, + "step": 2106, + "train_runtime": 17115.1785, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.2769696969696969, + "grad_norm": 0.006403345614671707, + "learning_rate": 9.655647398512509e-05, + "loss": 0.01155434362590313, + "num_input_tokens_seen": 34504232, + "step": 2107, + "train_runtime": 17123.2944, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.2775757575757576, + "grad_norm": 0.009825125336647034, + "learning_rate": 9.655296622627021e-05, + "loss": 0.012322339229285717, + "num_input_tokens_seen": 34520608, + "step": 2108, + "train_runtime": 17131.4068, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.2781818181818183, + "grad_norm": 0.014537754468619823, + "learning_rate": 9.654945674551177e-05, + "loss": 0.0121865663677454, + "num_input_tokens_seen": 34536984, + "step": 2109, + "train_runtime": 17139.5156, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.2787878787878788, + "grad_norm": 0.01560883317142725, + "learning_rate": 9.65459455429796e-05, + "loss": 0.013027477078139782, + "num_input_tokens_seen": 34553360, + "step": 2110, + "train_runtime": 17147.6232, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.2793939393939393, + "grad_norm": 0.01559220440685749, + "learning_rate": 9.654243261880353e-05, + "loss": 0.012413599528372288, + "num_input_tokens_seen": 34569736, + "step": 2111, + "train_runtime": 17155.7332, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.28, + "grad_norm": 0.009660288691520691, + "learning_rate": 9.653891797311351e-05, + "loss": 0.012785199098289013, + "num_input_tokens_seen": 34586112, + "step": 2112, + "train_runtime": 17163.8439, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.2806060606060605, + "grad_norm": 0.006946479436010122, + "learning_rate": 9.653540160603956e-05, + "loss": 0.01120313722640276, + "num_input_tokens_seen": 34602488, + "step": 2113, + "train_runtime": 17171.9524, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.2812121212121212, + "grad_norm": 0.01230864692479372, + "learning_rate": 9.653188351771172e-05, + "loss": 0.012665695510804653, + "num_input_tokens_seen": 34618864, + "step": 2114, + "train_runtime": 17180.0636, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.2818181818181817, + "grad_norm": 0.011574783362448215, + "learning_rate": 9.652836370826013e-05, + "loss": 0.01255100592970848, + "num_input_tokens_seen": 34635240, + "step": 2115, + "train_runtime": 17188.1736, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.2824242424242425, + "grad_norm": 0.008793197572231293, + "learning_rate": 9.652484217781497e-05, + "loss": 0.013284552842378616, + "num_input_tokens_seen": 34651616, + "step": 2116, + "train_runtime": 17196.2832, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.283030303030303, + "grad_norm": 0.006198557559400797, + "learning_rate": 9.652131892650651e-05, + "loss": 0.010948103852570057, + "num_input_tokens_seen": 34667992, + "step": 2117, + "train_runtime": 17204.394, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.2836363636363637, + "grad_norm": 0.01315808854997158, + "learning_rate": 9.651779395446505e-05, + "loss": 0.012427638284862041, + "num_input_tokens_seen": 34684368, + "step": 2118, + "train_runtime": 17212.5026, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.2842424242424242, + "grad_norm": 0.0191126000136137, + "learning_rate": 9.651426726182098e-05, + "loss": 0.013659548945724964, + "num_input_tokens_seen": 34700744, + "step": 2119, + "train_runtime": 17220.6124, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.284848484848485, + "grad_norm": 0.009408008307218552, + "learning_rate": 9.651073884870473e-05, + "loss": 0.012401404790580273, + "num_input_tokens_seen": 34717120, + "step": 2120, + "train_runtime": 17228.7217, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.2854545454545454, + "grad_norm": 0.011676698923110962, + "learning_rate": 9.650720871524686e-05, + "loss": 0.011847556568682194, + "num_input_tokens_seen": 34733496, + "step": 2121, + "train_runtime": 17236.8342, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.2860606060606061, + "grad_norm": 0.008365709334611893, + "learning_rate": 9.65036768615779e-05, + "loss": 0.011720732785761356, + "num_input_tokens_seen": 34749872, + "step": 2122, + "train_runtime": 17244.9443, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.2866666666666666, + "grad_norm": 0.008611305616796017, + "learning_rate": 9.650014328782848e-05, + "loss": 0.012757069431245327, + "num_input_tokens_seen": 34766248, + "step": 2123, + "train_runtime": 17253.0952, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.2872727272727273, + "grad_norm": 0.015551083721220493, + "learning_rate": 9.649660799412933e-05, + "loss": 0.01262652687728405, + "num_input_tokens_seen": 34782624, + "step": 2124, + "train_runtime": 17261.2065, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.2878787878787878, + "grad_norm": 0.01336025632917881, + "learning_rate": 9.649307098061119e-05, + "loss": 0.013620062731206417, + "num_input_tokens_seen": 34799000, + "step": 2125, + "train_runtime": 17269.3148, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.2884848484848486, + "grad_norm": 0.01245210412889719, + "learning_rate": 9.648953224740489e-05, + "loss": 0.013339771889150143, + "num_input_tokens_seen": 34815376, + "step": 2126, + "train_runtime": 17277.4317, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.289090909090909, + "grad_norm": 0.009113944135606289, + "learning_rate": 9.648599179464134e-05, + "loss": 0.0135605214163661, + "num_input_tokens_seen": 34831752, + "step": 2127, + "train_runtime": 17285.5412, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.2896969696969696, + "grad_norm": 0.009461781941354275, + "learning_rate": 9.648244962245148e-05, + "loss": 0.012702050618827343, + "num_input_tokens_seen": 34848128, + "step": 2128, + "train_runtime": 17293.6537, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.2903030303030303, + "grad_norm": 0.007549144793301821, + "learning_rate": 9.647890573096632e-05, + "loss": 0.011274856515228748, + "num_input_tokens_seen": 34864504, + "step": 2129, + "train_runtime": 17301.7635, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.290909090909091, + "grad_norm": 0.010565084405243397, + "learning_rate": 9.647536012031695e-05, + "loss": 0.011094843037426472, + "num_input_tokens_seen": 34880880, + "step": 2130, + "train_runtime": 17309.8729, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.2915151515151515, + "grad_norm": 0.015499581582844257, + "learning_rate": 9.647181279063453e-05, + "loss": 0.01364554651081562, + "num_input_tokens_seen": 34897256, + "step": 2131, + "train_runtime": 17317.9876, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.292121212121212, + "grad_norm": 0.008567445911467075, + "learning_rate": 9.646826374205022e-05, + "loss": 0.012525910511612892, + "num_input_tokens_seen": 34913632, + "step": 2132, + "train_runtime": 17326.0958, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.2927272727272727, + "grad_norm": 0.013958334922790527, + "learning_rate": 9.646471297469537e-05, + "loss": 0.012882156297564507, + "num_input_tokens_seen": 34930008, + "step": 2133, + "train_runtime": 17334.2117, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.2933333333333334, + "grad_norm": 0.007428262382745743, + "learning_rate": 9.646116048870124e-05, + "loss": 0.011848744936287403, + "num_input_tokens_seen": 34946384, + "step": 2134, + "train_runtime": 17342.3224, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.293939393939394, + "grad_norm": 0.009584290906786919, + "learning_rate": 9.645760628419929e-05, + "loss": 0.011593570932745934, + "num_input_tokens_seen": 34962760, + "step": 2135, + "train_runtime": 17350.435, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.2945454545454544, + "grad_norm": 0.016960185021162033, + "learning_rate": 9.645405036132093e-05, + "loss": 0.013664147816598415, + "num_input_tokens_seen": 34979136, + "step": 2136, + "train_runtime": 17358.55, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.2951515151515152, + "grad_norm": 0.01113409735262394, + "learning_rate": 9.645049272019773e-05, + "loss": 0.013035980984568596, + "num_input_tokens_seen": 34995512, + "step": 2137, + "train_runtime": 17366.6598, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.2957575757575759, + "grad_norm": 0.02672557160258293, + "learning_rate": 9.644693336096125e-05, + "loss": 0.01212849747389555, + "num_input_tokens_seen": 35011888, + "step": 2138, + "train_runtime": 17374.7703, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.2963636363636364, + "grad_norm": 0.015280626714229584, + "learning_rate": 9.644337228374318e-05, + "loss": 0.012423450127243996, + "num_input_tokens_seen": 35028264, + "step": 2139, + "train_runtime": 17382.8807, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.2969696969696969, + "grad_norm": 0.014273280277848244, + "learning_rate": 9.643980948867519e-05, + "loss": 0.011721854098141193, + "num_input_tokens_seen": 35044640, + "step": 2140, + "train_runtime": 17390.9881, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.2975757575757576, + "grad_norm": 0.005597109440714121, + "learning_rate": 9.643624497588908e-05, + "loss": 0.011341876350343227, + "num_input_tokens_seen": 35061016, + "step": 2141, + "train_runtime": 17399.099, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.298181818181818, + "grad_norm": 0.009911877103149891, + "learning_rate": 9.643267874551671e-05, + "loss": 0.013796377927064896, + "num_input_tokens_seen": 35077392, + "step": 2142, + "train_runtime": 17407.2072, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.2987878787878788, + "grad_norm": 0.008413155563175678, + "learning_rate": 9.642911079768999e-05, + "loss": 0.01139981858432293, + "num_input_tokens_seen": 35093768, + "step": 2143, + "train_runtime": 17415.3192, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.2993939393939393, + "grad_norm": 0.00816043745726347, + "learning_rate": 9.642554113254085e-05, + "loss": 0.012404600158333778, + "num_input_tokens_seen": 35110144, + "step": 2144, + "train_runtime": 17423.4327, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.3, + "grad_norm": 0.009998388588428497, + "learning_rate": 9.642196975020137e-05, + "loss": 0.013593346811830997, + "num_input_tokens_seen": 35126520, + "step": 2145, + "train_runtime": 17431.5439, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.3006060606060605, + "grad_norm": 0.008478298783302307, + "learning_rate": 9.641839665080363e-05, + "loss": 0.012044758535921574, + "num_input_tokens_seen": 35142896, + "step": 2146, + "train_runtime": 17439.6567, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.3012121212121213, + "grad_norm": 0.01786281354725361, + "learning_rate": 9.64148218344798e-05, + "loss": 0.01225617527961731, + "num_input_tokens_seen": 35159272, + "step": 2147, + "train_runtime": 17447.7674, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.3018181818181818, + "grad_norm": 0.004462715703994036, + "learning_rate": 9.641124530136209e-05, + "loss": 0.01188221201300621, + "num_input_tokens_seen": 35175648, + "step": 2148, + "train_runtime": 17455.8753, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.3024242424242425, + "grad_norm": 0.0029105639550834894, + "learning_rate": 9.64076670515828e-05, + "loss": 0.012195127084851265, + "num_input_tokens_seen": 35192024, + "step": 2149, + "train_runtime": 17463.9854, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.303030303030303, + "grad_norm": 0.00953491497784853, + "learning_rate": 9.640408708527429e-05, + "loss": 0.01310074981302023, + "num_input_tokens_seen": 35208400, + "step": 2150, + "train_runtime": 17472.0957, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.3036363636363637, + "grad_norm": 0.018413005396723747, + "learning_rate": 9.640050540256896e-05, + "loss": 0.014166397973895073, + "num_input_tokens_seen": 35224776, + "step": 2151, + "train_runtime": 17480.2096, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.3042424242424242, + "grad_norm": 0.010966022498905659, + "learning_rate": 9.639692200359929e-05, + "loss": 0.013556400313973427, + "num_input_tokens_seen": 35241152, + "step": 2152, + "train_runtime": 17488.3185, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.304848484848485, + "grad_norm": 0.008611932396888733, + "learning_rate": 9.639333688849784e-05, + "loss": 0.010546581819653511, + "num_input_tokens_seen": 35257528, + "step": 2153, + "train_runtime": 17496.4338, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.3054545454545454, + "grad_norm": 0.01247915904968977, + "learning_rate": 9.638975005739719e-05, + "loss": 0.013438762165606022, + "num_input_tokens_seen": 35273904, + "step": 2154, + "train_runtime": 17504.547, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.3060606060606061, + "grad_norm": 0.011569264344871044, + "learning_rate": 9.638616151043003e-05, + "loss": 0.012859106063842773, + "num_input_tokens_seen": 35290280, + "step": 2155, + "train_runtime": 17512.6539, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 0.013723514974117279, + "learning_rate": 9.638257124772909e-05, + "loss": 0.012551365420222282, + "num_input_tokens_seen": 35306656, + "step": 2156, + "train_runtime": 17520.7657, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.3072727272727271, + "grad_norm": 0.008372720330953598, + "learning_rate": 9.637897926942716e-05, + "loss": 0.012505918741226196, + "num_input_tokens_seen": 35323032, + "step": 2157, + "train_runtime": 17528.874, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.3078787878787879, + "grad_norm": 0.008815966546535492, + "learning_rate": 9.637538557565712e-05, + "loss": 0.011104857549071312, + "num_input_tokens_seen": 35339408, + "step": 2158, + "train_runtime": 17536.9823, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.3084848484848486, + "grad_norm": 0.01211005449295044, + "learning_rate": 9.637179016655186e-05, + "loss": 0.01384238712489605, + "num_input_tokens_seen": 35355784, + "step": 2159, + "train_runtime": 17545.0948, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.309090909090909, + "grad_norm": 0.017194336280226707, + "learning_rate": 9.63681930422444e-05, + "loss": 0.014862093143165112, + "num_input_tokens_seen": 35372160, + "step": 2160, + "train_runtime": 17553.2169, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.3096969696969696, + "grad_norm": 0.010664109140634537, + "learning_rate": 9.636459420286779e-05, + "loss": 0.012079211883246899, + "num_input_tokens_seen": 35388536, + "step": 2161, + "train_runtime": 17561.3338, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.3103030303030303, + "grad_norm": 0.01663060300052166, + "learning_rate": 9.636099364855511e-05, + "loss": 0.013289893046021461, + "num_input_tokens_seen": 35404912, + "step": 2162, + "train_runtime": 17569.4461, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.310909090909091, + "grad_norm": 0.012000697664916515, + "learning_rate": 9.635739137943957e-05, + "loss": 0.01403304748237133, + "num_input_tokens_seen": 35421288, + "step": 2163, + "train_runtime": 17577.5567, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.3115151515151515, + "grad_norm": 0.012029038742184639, + "learning_rate": 9.635378739565439e-05, + "loss": 0.013772612437605858, + "num_input_tokens_seen": 35437664, + "step": 2164, + "train_runtime": 17585.6679, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.312121212121212, + "grad_norm": 0.018417729064822197, + "learning_rate": 9.63501816973329e-05, + "loss": 0.013312135823071003, + "num_input_tokens_seen": 35454040, + "step": 2165, + "train_runtime": 17593.7809, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.3127272727272727, + "grad_norm": 0.009397652000188828, + "learning_rate": 9.634657428460844e-05, + "loss": 0.01353902742266655, + "num_input_tokens_seen": 35470416, + "step": 2166, + "train_runtime": 17601.8913, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.3133333333333335, + "grad_norm": 0.01506609097123146, + "learning_rate": 9.634296515761445e-05, + "loss": 0.012555200606584549, + "num_input_tokens_seen": 35486792, + "step": 2167, + "train_runtime": 17610.0027, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.313939393939394, + "grad_norm": 0.008759674616158009, + "learning_rate": 9.633935431648444e-05, + "loss": 0.012441650964319706, + "num_input_tokens_seen": 35503168, + "step": 2168, + "train_runtime": 17618.1138, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.3145454545454545, + "grad_norm": 0.006139460019767284, + "learning_rate": 9.633574176135194e-05, + "loss": 0.011962692253291607, + "num_input_tokens_seen": 35519544, + "step": 2169, + "train_runtime": 17626.2224, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 1.3151515151515152, + "grad_norm": 0.012904582545161247, + "learning_rate": 9.63321274923506e-05, + "loss": 0.013941573910415173, + "num_input_tokens_seen": 35535920, + "step": 2170, + "train_runtime": 17634.3357, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 1.3157575757575757, + "grad_norm": 0.014602463692426682, + "learning_rate": 9.632851150961409e-05, + "loss": 0.01265255268663168, + "num_input_tokens_seen": 35552296, + "step": 2171, + "train_runtime": 17642.4467, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 1.3163636363636364, + "grad_norm": 0.02269907295703888, + "learning_rate": 9.632489381327617e-05, + "loss": 0.012979868799448013, + "num_input_tokens_seen": 35568672, + "step": 2172, + "train_runtime": 17650.5584, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 1.316969696969697, + "grad_norm": 0.005800630897283554, + "learning_rate": 9.632127440347062e-05, + "loss": 0.013988342136144638, + "num_input_tokens_seen": 35585048, + "step": 2173, + "train_runtime": 17658.667, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 1.3175757575757576, + "grad_norm": 0.01643582247197628, + "learning_rate": 9.631765328033134e-05, + "loss": 0.013193177059292793, + "num_input_tokens_seen": 35601424, + "step": 2174, + "train_runtime": 17666.7787, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 1.3181818181818181, + "grad_norm": 0.012858077883720398, + "learning_rate": 9.631403044399227e-05, + "loss": 0.011166015639901161, + "num_input_tokens_seen": 35617800, + "step": 2175, + "train_runtime": 17674.8914, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 1.3187878787878788, + "grad_norm": 0.013520823791623116, + "learning_rate": 9.631040589458741e-05, + "loss": 0.011491977609694004, + "num_input_tokens_seen": 35634176, + "step": 2176, + "train_runtime": 17683.0014, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 1.3193939393939393, + "grad_norm": 0.010789001360535622, + "learning_rate": 9.630677963225082e-05, + "loss": 0.012689574621617794, + "num_input_tokens_seen": 35650552, + "step": 2177, + "train_runtime": 17691.1129, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 1.32, + "grad_norm": 0.02198723889887333, + "learning_rate": 9.630315165711664e-05, + "loss": 0.01258945930749178, + "num_input_tokens_seen": 35666928, + "step": 2178, + "train_runtime": 17699.2219, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 1.3206060606060606, + "grad_norm": 0.009862944483757019, + "learning_rate": 9.629952196931901e-05, + "loss": 0.013526612892746925, + "num_input_tokens_seen": 35683304, + "step": 2179, + "train_runtime": 17707.3338, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 1.3212121212121213, + "grad_norm": 0.010745448991656303, + "learning_rate": 9.629589056899226e-05, + "loss": 0.011776023544371128, + "num_input_tokens_seen": 35699680, + "step": 2180, + "train_runtime": 17715.4454, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 1.3218181818181818, + "grad_norm": 0.011397488415241241, + "learning_rate": 9.629225745627069e-05, + "loss": 0.011914942413568497, + "num_input_tokens_seen": 35716056, + "step": 2181, + "train_runtime": 17723.5593, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 1.3224242424242425, + "grad_norm": 0.006203577853739262, + "learning_rate": 9.628862263128863e-05, + "loss": 0.011155352927744389, + "num_input_tokens_seen": 35732432, + "step": 2182, + "train_runtime": 17731.6685, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 1.323030303030303, + "grad_norm": 0.02225896529853344, + "learning_rate": 9.628498609418058e-05, + "loss": 0.01270115002989769, + "num_input_tokens_seen": 35748808, + "step": 2183, + "train_runtime": 17739.7798, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 1.3236363636363637, + "grad_norm": 0.008858407847583294, + "learning_rate": 9.628134784508104e-05, + "loss": 0.011755815707147121, + "num_input_tokens_seen": 35765184, + "step": 2184, + "train_runtime": 17747.8901, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 1.3242424242424242, + "grad_norm": 0.008333653211593628, + "learning_rate": 9.627770788412455e-05, + "loss": 0.011658458970487118, + "num_input_tokens_seen": 35781560, + "step": 2185, + "train_runtime": 17755.9976, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 1.3248484848484847, + "grad_norm": 0.008924845606088638, + "learning_rate": 9.627406621144578e-05, + "loss": 0.013182871043682098, + "num_input_tokens_seen": 35797936, + "step": 2186, + "train_runtime": 17764.1073, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 1.3254545454545454, + "grad_norm": 0.008099894039332867, + "learning_rate": 9.627042282717942e-05, + "loss": 0.012596973218023777, + "num_input_tokens_seen": 35814312, + "step": 2187, + "train_runtime": 17772.2161, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 1.3260606060606062, + "grad_norm": 0.01450270600616932, + "learning_rate": 9.626677773146022e-05, + "loss": 0.012211315333843231, + "num_input_tokens_seen": 35830688, + "step": 2188, + "train_runtime": 17780.3344, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 1.3266666666666667, + "grad_norm": 0.011908031068742275, + "learning_rate": 9.6263130924423e-05, + "loss": 0.011781557463109493, + "num_input_tokens_seen": 35847064, + "step": 2189, + "train_runtime": 17788.4466, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 1.3272727272727272, + "grad_norm": 0.01242706086486578, + "learning_rate": 9.625948240620269e-05, + "loss": 0.012625298462808132, + "num_input_tokens_seen": 35863440, + "step": 2190, + "train_runtime": 17796.5592, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 1.3278787878787879, + "grad_norm": 0.010798187926411629, + "learning_rate": 9.62558321769342e-05, + "loss": 0.012144774198532104, + "num_input_tokens_seen": 35879816, + "step": 2191, + "train_runtime": 17804.6702, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 1.3284848484848486, + "grad_norm": 0.010587851516902447, + "learning_rate": 9.625218023675255e-05, + "loss": 0.012753032147884369, + "num_input_tokens_seen": 35896192, + "step": 2192, + "train_runtime": 17812.7787, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 1.329090909090909, + "grad_norm": 0.006857162807136774, + "learning_rate": 9.624852658579282e-05, + "loss": 0.012723483145236969, + "num_input_tokens_seen": 35912568, + "step": 2193, + "train_runtime": 17820.8934, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 1.3296969696969696, + "grad_norm": 0.04555951803922653, + "learning_rate": 9.624487122419017e-05, + "loss": 0.011125890538096428, + "num_input_tokens_seen": 35928944, + "step": 2194, + "train_runtime": 17829.0031, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 1.3303030303030303, + "grad_norm": 0.009600553661584854, + "learning_rate": 9.624121415207978e-05, + "loss": 0.01181505061686039, + "num_input_tokens_seen": 35945320, + "step": 2195, + "train_runtime": 17837.1166, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 1.330909090909091, + "grad_norm": 0.006707084830850363, + "learning_rate": 9.623755536959693e-05, + "loss": 0.012222236022353172, + "num_input_tokens_seen": 35961696, + "step": 2196, + "train_runtime": 17845.2322, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 1.3315151515151515, + "grad_norm": 0.04006510227918625, + "learning_rate": 9.623389487687696e-05, + "loss": 0.01126607321202755, + "num_input_tokens_seen": 35978072, + "step": 2197, + "train_runtime": 17853.3441, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 1.332121212121212, + "grad_norm": 0.010973289608955383, + "learning_rate": 9.623023267405525e-05, + "loss": 0.012974373064935207, + "num_input_tokens_seen": 35994448, + "step": 2198, + "train_runtime": 17861.4569, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 1.3327272727272728, + "grad_norm": 0.010835564695298672, + "learning_rate": 9.622656876126726e-05, + "loss": 0.012828577309846878, + "num_input_tokens_seen": 36010824, + "step": 2199, + "train_runtime": 17869.5656, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 0.04496414214372635, + "learning_rate": 9.622290313864852e-05, + "loss": 0.012135835364460945, + "num_input_tokens_seen": 36027200, + "step": 2200, + "train_runtime": 17877.6728, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 1.333939393939394, + "grad_norm": 0.014242942444980145, + "learning_rate": 9.62192358063346e-05, + "loss": 0.013479562476277351, + "num_input_tokens_seen": 36043576, + "step": 2201, + "train_runtime": 17886.7409, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.3345454545454545, + "grad_norm": 0.013382949866354465, + "learning_rate": 9.621556676446117e-05, + "loss": 0.013178217224776745, + "num_input_tokens_seen": 36059952, + "step": 2202, + "train_runtime": 17894.8493, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.3351515151515152, + "grad_norm": 0.0031381326261907816, + "learning_rate": 9.621189601316391e-05, + "loss": 0.011984724551439285, + "num_input_tokens_seen": 36076328, + "step": 2203, + "train_runtime": 17903.1459, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.3357575757575757, + "grad_norm": 0.01637393608689308, + "learning_rate": 9.620822355257861e-05, + "loss": 0.013623081147670746, + "num_input_tokens_seen": 36092704, + "step": 2204, + "train_runtime": 17911.2629, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.3363636363636364, + "grad_norm": 0.008263975381851196, + "learning_rate": 9.620454938284112e-05, + "loss": 0.012027481570839882, + "num_input_tokens_seen": 36109080, + "step": 2205, + "train_runtime": 17919.3786, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.336969696969697, + "grad_norm": 0.006873487960547209, + "learning_rate": 9.620087350408732e-05, + "loss": 0.011758643202483654, + "num_input_tokens_seen": 36125456, + "step": 2206, + "train_runtime": 17927.4869, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.3375757575757576, + "grad_norm": 0.024665268138051033, + "learning_rate": 9.619719591645317e-05, + "loss": 0.013782843947410583, + "num_input_tokens_seen": 36141832, + "step": 2207, + "train_runtime": 17935.6003, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.3381818181818181, + "grad_norm": 0.02779507078230381, + "learning_rate": 9.619351662007473e-05, + "loss": 0.012481366284191608, + "num_input_tokens_seen": 36158208, + "step": 2208, + "train_runtime": 17943.7156, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.3387878787878789, + "grad_norm": 0.013757690787315369, + "learning_rate": 9.618983561508805e-05, + "loss": 0.01145961880683899, + "num_input_tokens_seen": 36174584, + "step": 2209, + "train_runtime": 17951.8332, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.3393939393939394, + "grad_norm": 0.010363428853452206, + "learning_rate": 9.618615290162931e-05, + "loss": 0.012521384283900261, + "num_input_tokens_seen": 36190960, + "step": 2210, + "train_runtime": 17959.9421, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.34, + "grad_norm": 0.009606797248125076, + "learning_rate": 9.618246847983471e-05, + "loss": 0.011278870515525341, + "num_input_tokens_seen": 36207336, + "step": 2211, + "train_runtime": 17968.0555, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.3406060606060606, + "grad_norm": 0.01277079712599516, + "learning_rate": 9.617878234984055e-05, + "loss": 0.013770547695457935, + "num_input_tokens_seen": 36223712, + "step": 2212, + "train_runtime": 17976.1678, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.3412121212121213, + "grad_norm": 0.03445848822593689, + "learning_rate": 9.617509451178317e-05, + "loss": 0.013981115072965622, + "num_input_tokens_seen": 36240088, + "step": 2213, + "train_runtime": 17984.2782, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.3418181818181818, + "grad_norm": 0.014536652714014053, + "learning_rate": 9.617140496579896e-05, + "loss": 0.012220301665365696, + "num_input_tokens_seen": 36256464, + "step": 2214, + "train_runtime": 17992.387, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.3424242424242423, + "grad_norm": 0.01611790619790554, + "learning_rate": 9.616771371202437e-05, + "loss": 0.013024747371673584, + "num_input_tokens_seen": 36272840, + "step": 2215, + "train_runtime": 18000.4999, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.343030303030303, + "grad_norm": 0.01158247608691454, + "learning_rate": 9.616402075059597e-05, + "loss": 0.012004833668470383, + "num_input_tokens_seen": 36289216, + "step": 2216, + "train_runtime": 18008.6151, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.3436363636363637, + "grad_norm": 0.023385636508464813, + "learning_rate": 9.616032608165034e-05, + "loss": 0.012171284295618534, + "num_input_tokens_seen": 36305592, + "step": 2217, + "train_runtime": 18016.7328, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.3442424242424242, + "grad_norm": 0.0019628936424851418, + "learning_rate": 9.615662970532416e-05, + "loss": 0.011382883414626122, + "num_input_tokens_seen": 36321968, + "step": 2218, + "train_runtime": 18024.8441, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.3448484848484847, + "grad_norm": 0.006762090139091015, + "learning_rate": 9.615293162175412e-05, + "loss": 0.012919168919324875, + "num_input_tokens_seen": 36338344, + "step": 2219, + "train_runtime": 18032.9565, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.3454545454545455, + "grad_norm": 0.009785478003323078, + "learning_rate": 9.6149231831077e-05, + "loss": 0.012674746103584766, + "num_input_tokens_seen": 36354720, + "step": 2220, + "train_runtime": 18041.066, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.3460606060606062, + "grad_norm": 0.010461698286235332, + "learning_rate": 9.614553033342969e-05, + "loss": 0.011975216679275036, + "num_input_tokens_seen": 36371096, + "step": 2221, + "train_runtime": 18049.1775, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.3466666666666667, + "grad_norm": 0.008190472610294819, + "learning_rate": 9.614182712894907e-05, + "loss": 0.012141593731939793, + "num_input_tokens_seen": 36387472, + "step": 2222, + "train_runtime": 18057.2852, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.3472727272727272, + "grad_norm": 0.008826551958918571, + "learning_rate": 9.613812221777212e-05, + "loss": 0.012635836377739906, + "num_input_tokens_seen": 36403848, + "step": 2223, + "train_runtime": 18065.3978, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.347878787878788, + "grad_norm": 0.010394547134637833, + "learning_rate": 9.613441560003588e-05, + "loss": 0.01198198739439249, + "num_input_tokens_seen": 36420224, + "step": 2224, + "train_runtime": 18073.5128, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.3484848484848486, + "grad_norm": 0.010476038791239262, + "learning_rate": 9.613070727587745e-05, + "loss": 0.012501906603574753, + "num_input_tokens_seen": 36436600, + "step": 2225, + "train_runtime": 18081.6231, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.3490909090909091, + "grad_norm": 0.010277483612298965, + "learning_rate": 9.6126997245434e-05, + "loss": 0.01279398612678051, + "num_input_tokens_seen": 36452976, + "step": 2226, + "train_runtime": 18089.7358, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.3496969696969696, + "grad_norm": 0.022543100640177727, + "learning_rate": 9.612328550884274e-05, + "loss": 0.012489145621657372, + "num_input_tokens_seen": 36469352, + "step": 2227, + "train_runtime": 18097.8572, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.3503030303030303, + "grad_norm": 0.012405750341713428, + "learning_rate": 9.611957206624098e-05, + "loss": 0.011780200526118279, + "num_input_tokens_seen": 36485728, + "step": 2228, + "train_runtime": 18105.9862, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.3509090909090908, + "grad_norm": 0.024704037234187126, + "learning_rate": 9.611585691776606e-05, + "loss": 0.012659060768783092, + "num_input_tokens_seen": 36502104, + "step": 2229, + "train_runtime": 18114.1069, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.3515151515151516, + "grad_norm": 0.004990878514945507, + "learning_rate": 9.61121400635554e-05, + "loss": 0.010893161408603191, + "num_input_tokens_seen": 36518480, + "step": 2230, + "train_runtime": 18122.234, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.352121212121212, + "grad_norm": 0.011252249591052532, + "learning_rate": 9.610842150374647e-05, + "loss": 0.013744168914854527, + "num_input_tokens_seen": 36534856, + "step": 2231, + "train_runtime": 18130.3576, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.3527272727272728, + "grad_norm": 0.009260553866624832, + "learning_rate": 9.610470123847682e-05, + "loss": 0.011988443322479725, + "num_input_tokens_seen": 36551232, + "step": 2232, + "train_runtime": 18138.4799, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.3533333333333333, + "grad_norm": 0.011449086479842663, + "learning_rate": 9.610097926788406e-05, + "loss": 0.012442278675734997, + "num_input_tokens_seen": 36567608, + "step": 2233, + "train_runtime": 18146.6009, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.353939393939394, + "grad_norm": 0.012972496449947357, + "learning_rate": 9.609725559210586e-05, + "loss": 0.012432006187736988, + "num_input_tokens_seen": 36583984, + "step": 2234, + "train_runtime": 18154.7212, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.3545454545454545, + "grad_norm": 0.010363031178712845, + "learning_rate": 9.609353021127994e-05, + "loss": 0.011616806499660015, + "num_input_tokens_seen": 36600360, + "step": 2235, + "train_runtime": 18162.8432, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.3551515151515152, + "grad_norm": 0.008639562875032425, + "learning_rate": 9.60898031255441e-05, + "loss": 0.012379256077110767, + "num_input_tokens_seen": 36616736, + "step": 2236, + "train_runtime": 18170.964, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.3557575757575757, + "grad_norm": 0.010502860881388187, + "learning_rate": 9.60860743350362e-05, + "loss": 0.011975327506661415, + "num_input_tokens_seen": 36633112, + "step": 2237, + "train_runtime": 18179.0854, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.3563636363636364, + "grad_norm": 0.007005107123404741, + "learning_rate": 9.608234383989416e-05, + "loss": 0.011983048170804977, + "num_input_tokens_seen": 36649488, + "step": 2238, + "train_runtime": 18187.2075, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.356969696969697, + "grad_norm": 0.005663185380399227, + "learning_rate": 9.607861164025596e-05, + "loss": 0.011925374157726765, + "num_input_tokens_seen": 36665864, + "step": 2239, + "train_runtime": 18195.3344, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.3575757575757577, + "grad_norm": 0.011430823244154453, + "learning_rate": 9.607487773625967e-05, + "loss": 0.011923066340386868, + "num_input_tokens_seen": 36682240, + "step": 2240, + "train_runtime": 18203.4567, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.3581818181818182, + "grad_norm": 0.015044212341308594, + "learning_rate": 9.607114212804335e-05, + "loss": 0.01411314494907856, + "num_input_tokens_seen": 36698616, + "step": 2241, + "train_runtime": 18211.5792, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.3587878787878789, + "grad_norm": 0.01229457464069128, + "learning_rate": 9.606740481574522e-05, + "loss": 0.013101696036756039, + "num_input_tokens_seen": 36714992, + "step": 2242, + "train_runtime": 18219.7, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.3593939393939394, + "grad_norm": 0.010342378169298172, + "learning_rate": 9.606366579950348e-05, + "loss": 0.012473942711949348, + "num_input_tokens_seen": 36731368, + "step": 2243, + "train_runtime": 18227.8219, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 0.010652092285454273, + "learning_rate": 9.605992507945647e-05, + "loss": 0.012655368074774742, + "num_input_tokens_seen": 36747744, + "step": 2244, + "train_runtime": 18235.9442, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.3606060606060606, + "grad_norm": 0.005535294767469168, + "learning_rate": 9.60561826557425e-05, + "loss": 0.01122352760285139, + "num_input_tokens_seen": 36764120, + "step": 2245, + "train_runtime": 18244.067, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.3612121212121213, + "grad_norm": 0.01733524352312088, + "learning_rate": 9.605243852850006e-05, + "loss": 0.013020082376897335, + "num_input_tokens_seen": 36780496, + "step": 2246, + "train_runtime": 18252.1891, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.3618181818181818, + "grad_norm": 0.018927304074168205, + "learning_rate": 9.604869269786758e-05, + "loss": 0.013990607112646103, + "num_input_tokens_seen": 36796872, + "step": 2247, + "train_runtime": 18260.3112, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.3624242424242423, + "grad_norm": 0.009154443629086018, + "learning_rate": 9.604494516398364e-05, + "loss": 0.010833939537405968, + "num_input_tokens_seen": 36813248, + "step": 2248, + "train_runtime": 18268.4336, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.363030303030303, + "grad_norm": 0.010260475799441338, + "learning_rate": 9.604119592698684e-05, + "loss": 0.011943116784095764, + "num_input_tokens_seen": 36829624, + "step": 2249, + "train_runtime": 18276.5587, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.3636363636363638, + "grad_norm": 0.010942655615508556, + "learning_rate": 9.603744498701585e-05, + "loss": 0.01344153843820095, + "num_input_tokens_seen": 36846000, + "step": 2250, + "train_runtime": 18284.6808, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.3642424242424243, + "grad_norm": 0.01657605543732643, + "learning_rate": 9.603369234420945e-05, + "loss": 0.01295737735927105, + "num_input_tokens_seen": 36862376, + "step": 2251, + "train_runtime": 18292.8034, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.3648484848484848, + "grad_norm": 0.013552656397223473, + "learning_rate": 9.602993799870642e-05, + "loss": 0.011583628132939339, + "num_input_tokens_seen": 36878752, + "step": 2252, + "train_runtime": 18300.9349, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.3654545454545455, + "grad_norm": 0.008103492669761181, + "learning_rate": 9.602618195064558e-05, + "loss": 0.01070526335388422, + "num_input_tokens_seen": 36895128, + "step": 2253, + "train_runtime": 18309.0571, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.3660606060606062, + "grad_norm": 0.01473317015916109, + "learning_rate": 9.602242420016594e-05, + "loss": 0.014281337149441242, + "num_input_tokens_seen": 36911504, + "step": 2254, + "train_runtime": 18317.1808, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.3666666666666667, + "grad_norm": 0.013630642555654049, + "learning_rate": 9.601866474740645e-05, + "loss": 0.013544391840696335, + "num_input_tokens_seen": 36927880, + "step": 2255, + "train_runtime": 18325.3025, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.3672727272727272, + "grad_norm": 0.010557189583778381, + "learning_rate": 9.601490359250615e-05, + "loss": 0.011742150411009789, + "num_input_tokens_seen": 36944256, + "step": 2256, + "train_runtime": 18333.4243, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.367878787878788, + "grad_norm": 0.0053065926767885685, + "learning_rate": 9.60111407356042e-05, + "loss": 0.011019930243492126, + "num_input_tokens_seen": 36960632, + "step": 2257, + "train_runtime": 18341.5509, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.3684848484848484, + "grad_norm": 0.010890625417232513, + "learning_rate": 9.600737617683975e-05, + "loss": 0.01254215557128191, + "num_input_tokens_seen": 36977008, + "step": 2258, + "train_runtime": 18349.6736, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.3690909090909091, + "grad_norm": 0.0021913303062319756, + "learning_rate": 9.600360991635204e-05, + "loss": 0.011373097077012062, + "num_input_tokens_seen": 36993384, + "step": 2259, + "train_runtime": 18357.7964, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.3696969696969696, + "grad_norm": 0.015858152881264687, + "learning_rate": 9.59998419542804e-05, + "loss": 0.01217947993427515, + "num_input_tokens_seen": 37009760, + "step": 2260, + "train_runtime": 18365.9166, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.3703030303030304, + "grad_norm": 0.01206044852733612, + "learning_rate": 9.599607229076418e-05, + "loss": 0.012296038679778576, + "num_input_tokens_seen": 37026136, + "step": 2261, + "train_runtime": 18374.0366, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.3709090909090909, + "grad_norm": 0.016060523688793182, + "learning_rate": 9.599230092594283e-05, + "loss": 0.012836135923862457, + "num_input_tokens_seen": 37042512, + "step": 2262, + "train_runtime": 18382.157, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.3715151515151516, + "grad_norm": 0.012261572293937206, + "learning_rate": 9.598852785995581e-05, + "loss": 0.013203001581132412, + "num_input_tokens_seen": 37058888, + "step": 2263, + "train_runtime": 18390.2794, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.372121212121212, + "grad_norm": 0.011548114009201527, + "learning_rate": 9.598475309294272e-05, + "loss": 0.012114378623664379, + "num_input_tokens_seen": 37075264, + "step": 2264, + "train_runtime": 18398.4008, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.3727272727272728, + "grad_norm": 0.007986263372004032, + "learning_rate": 9.598097662504315e-05, + "loss": 0.012728969566524029, + "num_input_tokens_seen": 37091640, + "step": 2265, + "train_runtime": 18406.5207, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.3733333333333333, + "grad_norm": 0.010519546456634998, + "learning_rate": 9.597719845639682e-05, + "loss": 0.012828621082007885, + "num_input_tokens_seen": 37108016, + "step": 2266, + "train_runtime": 18414.6424, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.373939393939394, + "grad_norm": 0.013495163060724735, + "learning_rate": 9.597341858714343e-05, + "loss": 0.01284085027873516, + "num_input_tokens_seen": 37124392, + "step": 2267, + "train_runtime": 18422.7642, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.3745454545454545, + "grad_norm": 0.016053834930062294, + "learning_rate": 9.596963701742285e-05, + "loss": 0.013004240579903126, + "num_input_tokens_seen": 37140768, + "step": 2268, + "train_runtime": 18430.8856, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.375151515151515, + "grad_norm": 0.012462491169571877, + "learning_rate": 9.59658537473749e-05, + "loss": 0.012112741358578205, + "num_input_tokens_seen": 37157144, + "step": 2269, + "train_runtime": 18439.0076, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.3757575757575757, + "grad_norm": 0.008608606643974781, + "learning_rate": 9.596206877713953e-05, + "loss": 0.011361487209796906, + "num_input_tokens_seen": 37173520, + "step": 2270, + "train_runtime": 18447.1339, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.3763636363636365, + "grad_norm": 0.01020008884370327, + "learning_rate": 9.595828210685675e-05, + "loss": 0.011964777484536171, + "num_input_tokens_seen": 37189896, + "step": 2271, + "train_runtime": 18455.2562, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.376969696969697, + "grad_norm": 0.00379827618598938, + "learning_rate": 9.59544937366666e-05, + "loss": 0.011180602014064789, + "num_input_tokens_seen": 37206272, + "step": 2272, + "train_runtime": 18463.378, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.3775757575757575, + "grad_norm": 0.016575191169977188, + "learning_rate": 9.595070366670924e-05, + "loss": 0.014480410143733025, + "num_input_tokens_seen": 37222648, + "step": 2273, + "train_runtime": 18471.4999, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.3781818181818182, + "grad_norm": 0.013181117363274097, + "learning_rate": 9.594691189712482e-05, + "loss": 0.012310860678553581, + "num_input_tokens_seen": 37239024, + "step": 2274, + "train_runtime": 18479.6203, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.378787878787879, + "grad_norm": 0.013198458589613438, + "learning_rate": 9.594311842805362e-05, + "loss": 0.012114742770791054, + "num_input_tokens_seen": 37255400, + "step": 2275, + "train_runtime": 18487.7432, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.3793939393939394, + "grad_norm": 0.013458278961479664, + "learning_rate": 9.593932325963593e-05, + "loss": 0.013166999444365501, + "num_input_tokens_seen": 37271776, + "step": 2276, + "train_runtime": 18495.8672, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.38, + "grad_norm": 0.003906270023435354, + "learning_rate": 9.593552639201213e-05, + "loss": 0.01128899771720171, + "num_input_tokens_seen": 37288152, + "step": 2277, + "train_runtime": 18503.9911, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.3806060606060606, + "grad_norm": 0.02250552736222744, + "learning_rate": 9.593172782532268e-05, + "loss": 0.012319961562752724, + "num_input_tokens_seen": 37304528, + "step": 2278, + "train_runtime": 18512.1132, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.3812121212121213, + "grad_norm": 0.009309303015470505, + "learning_rate": 9.592792755970806e-05, + "loss": 0.012765881605446339, + "num_input_tokens_seen": 37320904, + "step": 2279, + "train_runtime": 18520.2349, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.3818181818181818, + "grad_norm": 0.010714930482208729, + "learning_rate": 9.592412559530884e-05, + "loss": 0.012022463604807854, + "num_input_tokens_seen": 37337280, + "step": 2280, + "train_runtime": 18528.3557, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.3824242424242423, + "grad_norm": 0.011457400396466255, + "learning_rate": 9.592032193226564e-05, + "loss": 0.01279627624899149, + "num_input_tokens_seen": 37353656, + "step": 2281, + "train_runtime": 18536.4775, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.383030303030303, + "grad_norm": 0.01243502926081419, + "learning_rate": 9.591651657071916e-05, + "loss": 0.011872101575136185, + "num_input_tokens_seen": 37370032, + "step": 2282, + "train_runtime": 18544.6001, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.3836363636363636, + "grad_norm": 0.005702585447579622, + "learning_rate": 9.591270951081016e-05, + "loss": 0.01134478859603405, + "num_input_tokens_seen": 37386408, + "step": 2283, + "train_runtime": 18552.7226, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.3842424242424243, + "grad_norm": 0.021376250311732292, + "learning_rate": 9.590890075267943e-05, + "loss": 0.011881144717335701, + "num_input_tokens_seen": 37402784, + "step": 2284, + "train_runtime": 18560.8455, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.3848484848484848, + "grad_norm": 0.008453167043626308, + "learning_rate": 9.590509029646788e-05, + "loss": 0.012073191814124584, + "num_input_tokens_seen": 37419160, + "step": 2285, + "train_runtime": 18568.9689, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.3854545454545455, + "grad_norm": 0.008096368052065372, + "learning_rate": 9.590127814231642e-05, + "loss": 0.011614611372351646, + "num_input_tokens_seen": 37435536, + "step": 2286, + "train_runtime": 18577.09, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.386060606060606, + "grad_norm": 0.006734437309205532, + "learning_rate": 9.589746429036609e-05, + "loss": 0.012934810481965542, + "num_input_tokens_seen": 37451912, + "step": 2287, + "train_runtime": 18585.2112, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 0.01690874621272087, + "learning_rate": 9.589364874075793e-05, + "loss": 0.012905376963317394, + "num_input_tokens_seen": 37468288, + "step": 2288, + "train_runtime": 18593.3338, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.3872727272727272, + "grad_norm": 0.011003658175468445, + "learning_rate": 9.588983149363306e-05, + "loss": 0.014468666166067123, + "num_input_tokens_seen": 37484664, + "step": 2289, + "train_runtime": 18601.4552, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.387878787878788, + "grad_norm": 0.011732536368072033, + "learning_rate": 9.588601254913272e-05, + "loss": 0.013740056194365025, + "num_input_tokens_seen": 37501040, + "step": 2290, + "train_runtime": 18609.5761, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.3884848484848484, + "grad_norm": 0.005165507551282644, + "learning_rate": 9.588219190739811e-05, + "loss": 0.01104014739394188, + "num_input_tokens_seen": 37517416, + "step": 2291, + "train_runtime": 18617.6996, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.3890909090909092, + "grad_norm": 0.008920264430344105, + "learning_rate": 9.587836956857059e-05, + "loss": 0.011560751125216484, + "num_input_tokens_seen": 37533792, + "step": 2292, + "train_runtime": 18625.8217, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.3896969696969697, + "grad_norm": 0.014716854318976402, + "learning_rate": 9.587454553279152e-05, + "loss": 0.012428359128534794, + "num_input_tokens_seen": 37550168, + "step": 2293, + "train_runtime": 18633.9439, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.3903030303030304, + "grad_norm": 0.005737116560339928, + "learning_rate": 9.587071980020233e-05, + "loss": 0.011969603598117828, + "num_input_tokens_seen": 37566544, + "step": 2294, + "train_runtime": 18642.0653, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.3909090909090909, + "grad_norm": 0.009410570375621319, + "learning_rate": 9.586689237094455e-05, + "loss": 0.012249289080500603, + "num_input_tokens_seen": 37582920, + "step": 2295, + "train_runtime": 18650.1861, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.3915151515151516, + "grad_norm": 0.0060779326595366, + "learning_rate": 9.586306324515976e-05, + "loss": 0.011125440709292889, + "num_input_tokens_seen": 37599296, + "step": 2296, + "train_runtime": 18658.3067, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.392121212121212, + "grad_norm": 0.006624804809689522, + "learning_rate": 9.585923242298955e-05, + "loss": 0.01228361390531063, + "num_input_tokens_seen": 37615672, + "step": 2297, + "train_runtime": 18666.4325, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.3927272727272726, + "grad_norm": 0.009875877760350704, + "learning_rate": 9.585539990457566e-05, + "loss": 0.012501105666160583, + "num_input_tokens_seen": 37632048, + "step": 2298, + "train_runtime": 18674.5538, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.3933333333333333, + "grad_norm": 0.009113671258091927, + "learning_rate": 9.585156569005982e-05, + "loss": 0.011152594350278378, + "num_input_tokens_seen": 37648424, + "step": 2299, + "train_runtime": 18682.676, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.393939393939394, + "grad_norm": 0.007954055443406105, + "learning_rate": 9.584772977958386e-05, + "loss": 0.011372476816177368, + "num_input_tokens_seen": 37664800, + "step": 2300, + "train_runtime": 18690.7976, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.3945454545454545, + "grad_norm": 0.002810339443385601, + "learning_rate": 9.584389217328966e-05, + "loss": 0.012347941286861897, + "num_input_tokens_seen": 37681176, + "step": 2301, + "train_runtime": 18699.9071, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.395151515151515, + "grad_norm": 0.01631283573806286, + "learning_rate": 9.584005287131917e-05, + "loss": 0.01252490933984518, + "num_input_tokens_seen": 37697552, + "step": 2302, + "train_runtime": 18708.0344, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.3957575757575758, + "grad_norm": 0.0038756714202463627, + "learning_rate": 9.583621187381437e-05, + "loss": 0.010715951211750507, + "num_input_tokens_seen": 37713928, + "step": 2303, + "train_runtime": 18716.1541, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.3963636363636365, + "grad_norm": 0.007086980622261763, + "learning_rate": 9.583236918091738e-05, + "loss": 0.012314547784626484, + "num_input_tokens_seen": 37730304, + "step": 2304, + "train_runtime": 18724.2742, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.396969696969697, + "grad_norm": 0.005804756656289101, + "learning_rate": 9.58285247927703e-05, + "loss": 0.010950103402137756, + "num_input_tokens_seen": 37746680, + "step": 2305, + "train_runtime": 18732.3971, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.3975757575757575, + "grad_norm": 0.009648945182561874, + "learning_rate": 9.582467870951533e-05, + "loss": 0.0125240758061409, + "num_input_tokens_seen": 37763056, + "step": 2306, + "train_runtime": 18740.5161, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.3981818181818182, + "grad_norm": 0.009216394275426865, + "learning_rate": 9.582083093129473e-05, + "loss": 0.01159854419529438, + "num_input_tokens_seen": 37779432, + "step": 2307, + "train_runtime": 18748.6358, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.398787878787879, + "grad_norm": 0.007218767423182726, + "learning_rate": 9.581698145825084e-05, + "loss": 0.0115420613437891, + "num_input_tokens_seen": 37795808, + "step": 2308, + "train_runtime": 18756.7561, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.3993939393939394, + "grad_norm": 0.009544998407363892, + "learning_rate": 9.581313029052602e-05, + "loss": 0.01166750118136406, + "num_input_tokens_seen": 37812184, + "step": 2309, + "train_runtime": 18764.8776, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.4, + "grad_norm": 0.006320256739854813, + "learning_rate": 9.580927742826274e-05, + "loss": 0.012228483334183693, + "num_input_tokens_seen": 37828560, + "step": 2310, + "train_runtime": 18773.0022, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.4006060606060606, + "grad_norm": 0.006051518488675356, + "learning_rate": 9.580542287160348e-05, + "loss": 0.013318963348865509, + "num_input_tokens_seen": 37844936, + "step": 2311, + "train_runtime": 18781.1243, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.4012121212121211, + "grad_norm": 0.006172268185764551, + "learning_rate": 9.580156662069084e-05, + "loss": 0.013921252451837063, + "num_input_tokens_seen": 37861312, + "step": 2312, + "train_runtime": 18789.2452, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.4018181818181819, + "grad_norm": 0.011184340342879295, + "learning_rate": 9.579770867566744e-05, + "loss": 0.01270650140941143, + "num_input_tokens_seen": 37877688, + "step": 2313, + "train_runtime": 18797.3677, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.4024242424242424, + "grad_norm": 0.009068330749869347, + "learning_rate": 9.5793849036676e-05, + "loss": 0.012799710035324097, + "num_input_tokens_seen": 37894064, + "step": 2314, + "train_runtime": 18805.4876, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.403030303030303, + "grad_norm": 0.011376739479601383, + "learning_rate": 9.578998770385925e-05, + "loss": 0.012034818530082703, + "num_input_tokens_seen": 37910440, + "step": 2315, + "train_runtime": 18813.609, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.4036363636363636, + "grad_norm": 0.008086096495389938, + "learning_rate": 9.578612467736004e-05, + "loss": 0.01248577143996954, + "num_input_tokens_seen": 37926816, + "step": 2316, + "train_runtime": 18821.7325, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.4042424242424243, + "grad_norm": 0.009844251908361912, + "learning_rate": 9.578225995732123e-05, + "loss": 0.010496463626623154, + "num_input_tokens_seen": 37943192, + "step": 2317, + "train_runtime": 18829.8558, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.4048484848484848, + "grad_norm": 0.013504873029887676, + "learning_rate": 9.577839354388577e-05, + "loss": 0.013046303763985634, + "num_input_tokens_seen": 37959568, + "step": 2318, + "train_runtime": 18837.9767, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.4054545454545455, + "grad_norm": 0.008937161415815353, + "learning_rate": 9.577452543719669e-05, + "loss": 0.01245005801320076, + "num_input_tokens_seen": 37975944, + "step": 2319, + "train_runtime": 18846.0993, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.406060606060606, + "grad_norm": 0.00886614341288805, + "learning_rate": 9.577065563739706e-05, + "loss": 0.012116841971874237, + "num_input_tokens_seen": 37992320, + "step": 2320, + "train_runtime": 18854.221, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.4066666666666667, + "grad_norm": 0.01636064611375332, + "learning_rate": 9.576678414463001e-05, + "loss": 0.012819192372262478, + "num_input_tokens_seen": 38008696, + "step": 2321, + "train_runtime": 18862.3418, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.4072727272727272, + "grad_norm": 0.008886885829269886, + "learning_rate": 9.576291095903875e-05, + "loss": 0.012500411830842495, + "num_input_tokens_seen": 38025072, + "step": 2322, + "train_runtime": 18870.4635, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.407878787878788, + "grad_norm": 0.009175439365208149, + "learning_rate": 9.575903608076652e-05, + "loss": 0.011257543228566647, + "num_input_tokens_seen": 38041448, + "step": 2323, + "train_runtime": 18878.5841, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.4084848484848485, + "grad_norm": 0.008139233104884624, + "learning_rate": 9.575515950995666e-05, + "loss": 0.012008091434836388, + "num_input_tokens_seen": 38057824, + "step": 2324, + "train_runtime": 18886.7035, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.4090909090909092, + "grad_norm": 0.010456080548465252, + "learning_rate": 9.575128124675257e-05, + "loss": 0.012284350581467152, + "num_input_tokens_seen": 38074200, + "step": 2325, + "train_runtime": 18894.8317, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.4096969696969697, + "grad_norm": 0.010791217908263206, + "learning_rate": 9.574740129129767e-05, + "loss": 0.012802988290786743, + "num_input_tokens_seen": 38090576, + "step": 2326, + "train_runtime": 18902.9531, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.4103030303030302, + "grad_norm": 0.010405773296952248, + "learning_rate": 9.574351964373548e-05, + "loss": 0.013286586850881577, + "num_input_tokens_seen": 38106952, + "step": 2327, + "train_runtime": 18911.0757, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.410909090909091, + "grad_norm": 0.013858595862984657, + "learning_rate": 9.573963630420958e-05, + "loss": 0.011597267352044582, + "num_input_tokens_seen": 38123328, + "step": 2328, + "train_runtime": 18919.1967, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.4115151515151516, + "grad_norm": 0.0037960167974233627, + "learning_rate": 9.573575127286361e-05, + "loss": 0.01243551168590784, + "num_input_tokens_seen": 38139704, + "step": 2329, + "train_runtime": 18927.3156, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.412121212121212, + "grad_norm": 0.01110562589019537, + "learning_rate": 9.573186454984127e-05, + "loss": 0.012111995369195938, + "num_input_tokens_seen": 38156080, + "step": 2330, + "train_runtime": 18935.4381, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.4127272727272726, + "grad_norm": 0.013541216030716896, + "learning_rate": 9.572797613528633e-05, + "loss": 0.012958360835909843, + "num_input_tokens_seen": 38172456, + "step": 2331, + "train_runtime": 18943.5591, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 0.008804292418062687, + "learning_rate": 9.572408602934258e-05, + "loss": 0.012310491874814034, + "num_input_tokens_seen": 38188832, + "step": 2332, + "train_runtime": 18951.6803, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.413939393939394, + "grad_norm": 0.00845362152904272, + "learning_rate": 9.572019423215395e-05, + "loss": 0.011901391670107841, + "num_input_tokens_seen": 38205208, + "step": 2333, + "train_runtime": 18959.8011, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.4145454545454546, + "grad_norm": 0.012662705034017563, + "learning_rate": 9.571630074386436e-05, + "loss": 0.01295614056289196, + "num_input_tokens_seen": 38221584, + "step": 2334, + "train_runtime": 18967.9217, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.415151515151515, + "grad_norm": 0.01324890460819006, + "learning_rate": 9.571240556461784e-05, + "loss": 0.012609636411070824, + "num_input_tokens_seen": 38237960, + "step": 2335, + "train_runtime": 18976.0431, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.4157575757575758, + "grad_norm": 0.009312913753092289, + "learning_rate": 9.570850869455845e-05, + "loss": 0.01243036799132824, + "num_input_tokens_seen": 38254336, + "step": 2336, + "train_runtime": 18984.1637, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.4163636363636365, + "grad_norm": 0.009475616738200188, + "learning_rate": 9.570461013383036e-05, + "loss": 0.011987818405032158, + "num_input_tokens_seen": 38270712, + "step": 2337, + "train_runtime": 18992.2853, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.416969696969697, + "grad_norm": 0.017518380656838417, + "learning_rate": 9.570070988257772e-05, + "loss": 0.012605913914740086, + "num_input_tokens_seen": 38287088, + "step": 2338, + "train_runtime": 19000.4061, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.4175757575757575, + "grad_norm": 0.013496988452970982, + "learning_rate": 9.569680794094483e-05, + "loss": 0.012542678974568844, + "num_input_tokens_seen": 38303464, + "step": 2339, + "train_runtime": 19008.5325, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.4181818181818182, + "grad_norm": 0.01436201948672533, + "learning_rate": 9.5692904309076e-05, + "loss": 0.011861974373459816, + "num_input_tokens_seen": 38319840, + "step": 2340, + "train_runtime": 19016.6545, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.4187878787878787, + "grad_norm": 0.009680923074483871, + "learning_rate": 9.568899898711563e-05, + "loss": 0.013890981674194336, + "num_input_tokens_seen": 38336216, + "step": 2341, + "train_runtime": 19024.7747, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.4193939393939394, + "grad_norm": 0.00575306685641408, + "learning_rate": 9.568509197520816e-05, + "loss": 0.01138025987893343, + "num_input_tokens_seen": 38352592, + "step": 2342, + "train_runtime": 19032.8957, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.42, + "grad_norm": 0.008749360218644142, + "learning_rate": 9.568118327349811e-05, + "loss": 0.012537163682281971, + "num_input_tokens_seen": 38368968, + "step": 2343, + "train_runtime": 19041.0165, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.4206060606060606, + "grad_norm": 0.006181587930768728, + "learning_rate": 9.567727288213005e-05, + "loss": 0.011792626231908798, + "num_input_tokens_seen": 38385344, + "step": 2344, + "train_runtime": 19049.1379, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.4212121212121211, + "grad_norm": 0.004062063992023468, + "learning_rate": 9.567336080124861e-05, + "loss": 0.011532892473042011, + "num_input_tokens_seen": 38401720, + "step": 2345, + "train_runtime": 19057.2588, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.4218181818181819, + "grad_norm": 0.007840434089303017, + "learning_rate": 9.566944703099852e-05, + "loss": 0.012082960456609726, + "num_input_tokens_seen": 38418096, + "step": 2346, + "train_runtime": 19065.38, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.4224242424242424, + "grad_norm": 0.006259352900087833, + "learning_rate": 9.56655315715245e-05, + "loss": 0.011257804930210114, + "num_input_tokens_seen": 38434472, + "step": 2347, + "train_runtime": 19073.5015, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.423030303030303, + "grad_norm": 0.018244467675685883, + "learning_rate": 9.56616144229714e-05, + "loss": 0.01377645879983902, + "num_input_tokens_seen": 38450848, + "step": 2348, + "train_runtime": 19081.6232, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.4236363636363636, + "grad_norm": 0.007569839712232351, + "learning_rate": 9.565769558548409e-05, + "loss": 0.010858017019927502, + "num_input_tokens_seen": 38467224, + "step": 2349, + "train_runtime": 19089.7453, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.4242424242424243, + "grad_norm": 0.009098520502448082, + "learning_rate": 9.565377505920756e-05, + "loss": 0.012635212391614914, + "num_input_tokens_seen": 38483600, + "step": 2350, + "train_runtime": 19097.8661, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.4248484848484848, + "grad_norm": 0.011632603593170643, + "learning_rate": 9.564985284428679e-05, + "loss": 0.012759126722812653, + "num_input_tokens_seen": 38499976, + "step": 2351, + "train_runtime": 19105.9874, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.4254545454545455, + "grad_norm": 0.010547060519456863, + "learning_rate": 9.564592894086685e-05, + "loss": 0.01271246001124382, + "num_input_tokens_seen": 38516352, + "step": 2352, + "train_runtime": 19114.1093, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.426060606060606, + "grad_norm": 0.013726288452744484, + "learning_rate": 9.564200334909292e-05, + "loss": 0.013344586826860905, + "num_input_tokens_seen": 38532728, + "step": 2353, + "train_runtime": 19122.2325, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.4266666666666667, + "grad_norm": 0.013462604023516178, + "learning_rate": 9.563807606911015e-05, + "loss": 0.013579159043729305, + "num_input_tokens_seen": 38549104, + "step": 2354, + "train_runtime": 19130.3458, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.4272727272727272, + "grad_norm": 0.008283951319754124, + "learning_rate": 9.563414710106382e-05, + "loss": 0.01217900775372982, + "num_input_tokens_seen": 38565480, + "step": 2355, + "train_runtime": 19138.4556, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.4278787878787877, + "grad_norm": 0.011906127445399761, + "learning_rate": 9.563021644509926e-05, + "loss": 0.012697991915047169, + "num_input_tokens_seen": 38581856, + "step": 2356, + "train_runtime": 19146.5656, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.4284848484848485, + "grad_norm": 0.009647993370890617, + "learning_rate": 9.562628410136186e-05, + "loss": 0.012488002888858318, + "num_input_tokens_seen": 38598232, + "step": 2357, + "train_runtime": 19154.6797, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.4290909090909092, + "grad_norm": 0.00915882084518671, + "learning_rate": 9.562235006999705e-05, + "loss": 0.01235243584960699, + "num_input_tokens_seen": 38614608, + "step": 2358, + "train_runtime": 19162.7899, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.4296969696969697, + "grad_norm": 0.009903871454298496, + "learning_rate": 9.561841435115037e-05, + "loss": 0.012734384275972843, + "num_input_tokens_seen": 38630984, + "step": 2359, + "train_runtime": 19170.901, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.4303030303030302, + "grad_norm": 0.013122179545462132, + "learning_rate": 9.561447694496736e-05, + "loss": 0.01317589357495308, + "num_input_tokens_seen": 38647360, + "step": 2360, + "train_runtime": 19179.0112, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.430909090909091, + "grad_norm": 0.006084715947508812, + "learning_rate": 9.561053785159371e-05, + "loss": 0.011836802586913109, + "num_input_tokens_seen": 38663736, + "step": 2361, + "train_runtime": 19187.1227, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.4315151515151516, + "grad_norm": 0.009425677359104156, + "learning_rate": 9.560659707117507e-05, + "loss": 0.012720235623419285, + "num_input_tokens_seen": 38680112, + "step": 2362, + "train_runtime": 19195.2334, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.4321212121212121, + "grad_norm": 0.01125511433929205, + "learning_rate": 9.560265460385723e-05, + "loss": 0.01049058698117733, + "num_input_tokens_seen": 38696488, + "step": 2363, + "train_runtime": 19203.3454, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.4327272727272726, + "grad_norm": 0.008309018798172474, + "learning_rate": 9.559871044978598e-05, + "loss": 0.012103556655347347, + "num_input_tokens_seen": 38712864, + "step": 2364, + "train_runtime": 19211.4583, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.4333333333333333, + "grad_norm": 0.010823550634086132, + "learning_rate": 9.559476460910725e-05, + "loss": 0.013532055541872978, + "num_input_tokens_seen": 38729240, + "step": 2365, + "train_runtime": 19219.5713, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.433939393939394, + "grad_norm": 0.007804887369275093, + "learning_rate": 9.559081708196696e-05, + "loss": 0.011757384985685349, + "num_input_tokens_seen": 38745616, + "step": 2366, + "train_runtime": 19227.6813, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.4345454545454546, + "grad_norm": 0.011105155572295189, + "learning_rate": 9.558686786851115e-05, + "loss": 0.013056900352239609, + "num_input_tokens_seen": 38761992, + "step": 2367, + "train_runtime": 19235.7941, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.435151515151515, + "grad_norm": 0.006022731773555279, + "learning_rate": 9.558291696888584e-05, + "loss": 0.011985806748270988, + "num_input_tokens_seen": 38778368, + "step": 2368, + "train_runtime": 19243.9058, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.4357575757575758, + "grad_norm": 0.021359071135520935, + "learning_rate": 9.55789643832372e-05, + "loss": 0.013741593807935715, + "num_input_tokens_seen": 38794744, + "step": 2369, + "train_runtime": 19252.0165, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.4363636363636363, + "grad_norm": 0.016318688169121742, + "learning_rate": 9.557501011171145e-05, + "loss": 0.01426868885755539, + "num_input_tokens_seen": 38811120, + "step": 2370, + "train_runtime": 19260.1321, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.436969696969697, + "grad_norm": 0.03291529044508934, + "learning_rate": 9.557105415445484e-05, + "loss": 0.012934263795614243, + "num_input_tokens_seen": 38827496, + "step": 2371, + "train_runtime": 19268.2464, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.4375757575757575, + "grad_norm": 0.007644362282007933, + "learning_rate": 9.556709651161366e-05, + "loss": 0.011257193982601166, + "num_input_tokens_seen": 38843872, + "step": 2372, + "train_runtime": 19276.359, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.4381818181818182, + "grad_norm": 0.009007222019135952, + "learning_rate": 9.556313718333433e-05, + "loss": 0.011448862962424755, + "num_input_tokens_seen": 38860248, + "step": 2373, + "train_runtime": 19284.4725, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.4387878787878787, + "grad_norm": 0.0117417573928833, + "learning_rate": 9.555917616976329e-05, + "loss": 0.013442277908325195, + "num_input_tokens_seen": 38876624, + "step": 2374, + "train_runtime": 19292.5852, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.4393939393939394, + "grad_norm": 0.0057337055914103985, + "learning_rate": 9.555521347104703e-05, + "loss": 0.01220876444131136, + "num_input_tokens_seen": 38893000, + "step": 2375, + "train_runtime": 19300.7, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.44, + "grad_norm": 0.006500967778265476, + "learning_rate": 9.555124908733215e-05, + "loss": 0.013221386820077896, + "num_input_tokens_seen": 38909376, + "step": 2376, + "train_runtime": 19308.8126, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.4406060606060607, + "grad_norm": 0.013432295992970467, + "learning_rate": 9.554728301876526e-05, + "loss": 0.013204741291701794, + "num_input_tokens_seen": 38925752, + "step": 2377, + "train_runtime": 19316.9331, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.4412121212121212, + "grad_norm": 0.015894871205091476, + "learning_rate": 9.554331526549308e-05, + "loss": 0.013291300274431705, + "num_input_tokens_seen": 38942128, + "step": 2378, + "train_runtime": 19325.046, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.4418181818181819, + "grad_norm": 0.020607370883226395, + "learning_rate": 9.553934582766235e-05, + "loss": 0.012713750824332237, + "num_input_tokens_seen": 38958504, + "step": 2379, + "train_runtime": 19333.1603, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.4424242424242424, + "grad_norm": 0.012630708515644073, + "learning_rate": 9.553537470541992e-05, + "loss": 0.011279501020908356, + "num_input_tokens_seen": 38974880, + "step": 2380, + "train_runtime": 19341.2724, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.443030303030303, + "grad_norm": 0.011622477322816849, + "learning_rate": 9.553140189891266e-05, + "loss": 0.011939273215830326, + "num_input_tokens_seen": 38991256, + "step": 2381, + "train_runtime": 19349.3856, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.4436363636363636, + "grad_norm": 0.007888108491897583, + "learning_rate": 9.552742740828748e-05, + "loss": 0.01125436369329691, + "num_input_tokens_seen": 39007632, + "step": 2382, + "train_runtime": 19357.4988, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.4442424242424243, + "grad_norm": 0.012038582935929298, + "learning_rate": 9.552345123369144e-05, + "loss": 0.01231908705085516, + "num_input_tokens_seen": 39024008, + "step": 2383, + "train_runtime": 19365.612, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.4448484848484848, + "grad_norm": 0.006818312220275402, + "learning_rate": 9.551947337527159e-05, + "loss": 0.012319983914494514, + "num_input_tokens_seen": 39040384, + "step": 2384, + "train_runtime": 19373.7348, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.4454545454545453, + "grad_norm": 0.012328113429248333, + "learning_rate": 9.551549383317506e-05, + "loss": 0.012599589303135872, + "num_input_tokens_seen": 39056760, + "step": 2385, + "train_runtime": 19381.851, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.446060606060606, + "grad_norm": 0.00953622069209814, + "learning_rate": 9.551151260754907e-05, + "loss": 0.012549671344459057, + "num_input_tokens_seen": 39073136, + "step": 2386, + "train_runtime": 19389.9639, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.4466666666666668, + "grad_norm": 0.011133255437016487, + "learning_rate": 9.550752969854084e-05, + "loss": 0.013944639824330807, + "num_input_tokens_seen": 39089512, + "step": 2387, + "train_runtime": 19398.0778, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.4472727272727273, + "grad_norm": 0.00964092556387186, + "learning_rate": 9.55035451062977e-05, + "loss": 0.012482261285185814, + "num_input_tokens_seen": 39105888, + "step": 2388, + "train_runtime": 19406.1887, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.4478787878787878, + "grad_norm": 0.006806143093854189, + "learning_rate": 9.549955883096706e-05, + "loss": 0.01177418977022171, + "num_input_tokens_seen": 39122264, + "step": 2389, + "train_runtime": 19414.3007, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.4484848484848485, + "grad_norm": 0.006465624086558819, + "learning_rate": 9.549557087269634e-05, + "loss": 0.013276521116495132, + "num_input_tokens_seen": 39138640, + "step": 2390, + "train_runtime": 19422.414, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.4490909090909092, + "grad_norm": 0.008866474032402039, + "learning_rate": 9.549158123163305e-05, + "loss": 0.012841928750276566, + "num_input_tokens_seen": 39155016, + "step": 2391, + "train_runtime": 19430.5325, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.4496969696969697, + "grad_norm": 0.010618673637509346, + "learning_rate": 9.548758990792477e-05, + "loss": 0.012568192556500435, + "num_input_tokens_seen": 39171392, + "step": 2392, + "train_runtime": 19438.6455, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.4503030303030302, + "grad_norm": 0.008561650291085243, + "learning_rate": 9.548359690171911e-05, + "loss": 0.012789330445230007, + "num_input_tokens_seen": 39187768, + "step": 2393, + "train_runtime": 19446.7579, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.450909090909091, + "grad_norm": 0.0074139004573225975, + "learning_rate": 9.547960221316379e-05, + "loss": 0.012125402688980103, + "num_input_tokens_seen": 39204144, + "step": 2394, + "train_runtime": 19454.8715, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.4515151515151516, + "grad_norm": 0.008938332088291645, + "learning_rate": 9.547560584240653e-05, + "loss": 0.012520406395196915, + "num_input_tokens_seen": 39220520, + "step": 2395, + "train_runtime": 19462.9863, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.4521212121212121, + "grad_norm": 0.010951995849609375, + "learning_rate": 9.547160778959519e-05, + "loss": 0.013598313555121422, + "num_input_tokens_seen": 39236896, + "step": 2396, + "train_runtime": 19471.1016, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.4527272727272726, + "grad_norm": 0.010507755912840366, + "learning_rate": 9.546760805487762e-05, + "loss": 0.011697047390043736, + "num_input_tokens_seen": 39253272, + "step": 2397, + "train_runtime": 19479.2149, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.4533333333333334, + "grad_norm": 0.008784889243543148, + "learning_rate": 9.546360663840177e-05, + "loss": 0.01291731558740139, + "num_input_tokens_seen": 39269648, + "step": 2398, + "train_runtime": 19487.3345, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.4539393939393939, + "grad_norm": 0.008387868292629719, + "learning_rate": 9.545960354031565e-05, + "loss": 0.011412294581532478, + "num_input_tokens_seen": 39286024, + "step": 2399, + "train_runtime": 19495.4456, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.4545454545454546, + "grad_norm": 0.013577899895608425, + "learning_rate": 9.545559876076733e-05, + "loss": 0.012150186114013195, + "num_input_tokens_seen": 39302400, + "step": 2400, + "train_runtime": 19503.5605, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.455151515151515, + "grad_norm": 0.00847565196454525, + "learning_rate": 9.545159229990493e-05, + "loss": 0.011293401941657066, + "num_input_tokens_seen": 39318776, + "step": 2401, + "train_runtime": 19512.6323, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.4557575757575758, + "grad_norm": 0.010008870624005795, + "learning_rate": 9.544758415787662e-05, + "loss": 0.012785600498318672, + "num_input_tokens_seen": 39335152, + "step": 2402, + "train_runtime": 19520.7422, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.4563636363636363, + "grad_norm": 0.007814590819180012, + "learning_rate": 9.544357433483071e-05, + "loss": 0.012637192383408546, + "num_input_tokens_seen": 39351528, + "step": 2403, + "train_runtime": 19528.8556, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.456969696969697, + "grad_norm": 0.006621070671826601, + "learning_rate": 9.543956283091546e-05, + "loss": 0.011230424046516418, + "num_input_tokens_seen": 39367904, + "step": 2404, + "train_runtime": 19536.9652, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.4575757575757575, + "grad_norm": 0.01204211637377739, + "learning_rate": 9.54355496462793e-05, + "loss": 0.012431148439645767, + "num_input_tokens_seen": 39384280, + "step": 2405, + "train_runtime": 19545.0747, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.4581818181818182, + "grad_norm": 0.007015693001449108, + "learning_rate": 9.543153478107061e-05, + "loss": 0.011989517137408257, + "num_input_tokens_seen": 39400656, + "step": 2406, + "train_runtime": 19553.1823, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.4587878787878787, + "grad_norm": 0.011309216730296612, + "learning_rate": 9.542751823543793e-05, + "loss": 0.011417721398174763, + "num_input_tokens_seen": 39417032, + "step": 2407, + "train_runtime": 19561.2927, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.4593939393939395, + "grad_norm": 0.015342672355473042, + "learning_rate": 9.542350000952982e-05, + "loss": 0.013073185458779335, + "num_input_tokens_seen": 39433408, + "step": 2408, + "train_runtime": 19569.4017, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.46, + "grad_norm": 0.019160790368914604, + "learning_rate": 9.541948010349491e-05, + "loss": 0.01287474948912859, + "num_input_tokens_seen": 39449784, + "step": 2409, + "train_runtime": 19577.5113, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.4606060606060607, + "grad_norm": 0.007335372269153595, + "learning_rate": 9.541545851748186e-05, + "loss": 0.012530960142612457, + "num_input_tokens_seen": 39466160, + "step": 2410, + "train_runtime": 19585.6206, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.4612121212121212, + "grad_norm": 0.012553319334983826, + "learning_rate": 9.541143525163946e-05, + "loss": 0.013056598603725433, + "num_input_tokens_seen": 39482536, + "step": 2411, + "train_runtime": 19593.735, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.461818181818182, + "grad_norm": 0.007112372200936079, + "learning_rate": 9.54074103061165e-05, + "loss": 0.012453190051019192, + "num_input_tokens_seen": 39498912, + "step": 2412, + "train_runtime": 19601.8436, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.4624242424242424, + "grad_norm": 0.0041340249590575695, + "learning_rate": 9.540338368106185e-05, + "loss": 0.010426213033497334, + "num_input_tokens_seen": 39515288, + "step": 2413, + "train_runtime": 19609.9527, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.463030303030303, + "grad_norm": 0.010998896323144436, + "learning_rate": 9.539935537662448e-05, + "loss": 0.012171028181910515, + "num_input_tokens_seen": 39531664, + "step": 2414, + "train_runtime": 19618.0619, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.4636363636363636, + "grad_norm": 0.030140312388539314, + "learning_rate": 9.539532539295335e-05, + "loss": 0.012536111287772655, + "num_input_tokens_seen": 39548040, + "step": 2415, + "train_runtime": 19626.1713, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.4642424242424243, + "grad_norm": 0.009954207576811314, + "learning_rate": 9.539129373019754e-05, + "loss": 0.012501617893576622, + "num_input_tokens_seen": 39564416, + "step": 2416, + "train_runtime": 19634.2817, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.4648484848484848, + "grad_norm": 0.014923661015927792, + "learning_rate": 9.538726038850617e-05, + "loss": 0.013546659611165524, + "num_input_tokens_seen": 39580792, + "step": 2417, + "train_runtime": 19642.3932, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.4654545454545453, + "grad_norm": 0.01247413083910942, + "learning_rate": 9.538322536802842e-05, + "loss": 0.012776615098118782, + "num_input_tokens_seen": 39597168, + "step": 2418, + "train_runtime": 19650.5062, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.466060606060606, + "grad_norm": 0.2203642725944519, + "learning_rate": 9.537918866891355e-05, + "loss": 0.01299387775361538, + "num_input_tokens_seen": 39613544, + "step": 2419, + "train_runtime": 19658.6181, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.010403799824416637, + "learning_rate": 9.537515029131086e-05, + "loss": 0.012151832692325115, + "num_input_tokens_seen": 39629920, + "step": 2420, + "train_runtime": 19666.7347, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.4672727272727273, + "grad_norm": 0.010242112912237644, + "learning_rate": 9.537111023536973e-05, + "loss": 0.012043867260217667, + "num_input_tokens_seen": 39646296, + "step": 2421, + "train_runtime": 19674.8455, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.4678787878787878, + "grad_norm": 0.01728324219584465, + "learning_rate": 9.53670685012396e-05, + "loss": 0.01141743827611208, + "num_input_tokens_seen": 39662672, + "step": 2422, + "train_runtime": 19682.9588, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.4684848484848485, + "grad_norm": 0.011234860867261887, + "learning_rate": 9.536302508906993e-05, + "loss": 0.011003116145730019, + "num_input_tokens_seen": 39679048, + "step": 2423, + "train_runtime": 19691.0777, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.4690909090909092, + "grad_norm": 0.016217637807130814, + "learning_rate": 9.535897999901032e-05, + "loss": 0.012819355353713036, + "num_input_tokens_seen": 39695424, + "step": 2424, + "train_runtime": 19699.1987, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.4696969696969697, + "grad_norm": 0.02879735641181469, + "learning_rate": 9.535493323121036e-05, + "loss": 0.012496593408286572, + "num_input_tokens_seen": 39711800, + "step": 2425, + "train_runtime": 19707.3223, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.4703030303030302, + "grad_norm": 0.00734851835295558, + "learning_rate": 9.535088478581975e-05, + "loss": 0.011950269341468811, + "num_input_tokens_seen": 39728176, + "step": 2426, + "train_runtime": 19715.4383, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.470909090909091, + "grad_norm": 0.009754326194524765, + "learning_rate": 9.534683466298823e-05, + "loss": 0.012536708265542984, + "num_input_tokens_seen": 39744552, + "step": 2427, + "train_runtime": 19723.5544, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.4715151515151514, + "grad_norm": 0.009746896103024483, + "learning_rate": 9.53427828628656e-05, + "loss": 0.01178042497485876, + "num_input_tokens_seen": 39760928, + "step": 2428, + "train_runtime": 19731.672, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.4721212121212122, + "grad_norm": 0.00775569211691618, + "learning_rate": 9.533872938560174e-05, + "loss": 0.011952969245612621, + "num_input_tokens_seen": 39777304, + "step": 2429, + "train_runtime": 19739.7884, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.4727272727272727, + "grad_norm": 0.00642067426815629, + "learning_rate": 9.533467423134657e-05, + "loss": 0.013351340778172016, + "num_input_tokens_seen": 39793680, + "step": 2430, + "train_runtime": 19747.9027, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.4733333333333334, + "grad_norm": 0.00757970055565238, + "learning_rate": 9.533061740025008e-05, + "loss": 0.012188777327537537, + "num_input_tokens_seen": 39810056, + "step": 2431, + "train_runtime": 19756.0198, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.4739393939393939, + "grad_norm": 0.010369901545345783, + "learning_rate": 9.532655889246234e-05, + "loss": 0.01212995033711195, + "num_input_tokens_seen": 39826432, + "step": 2432, + "train_runtime": 19764.139, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.4745454545454546, + "grad_norm": 0.0055848038755357265, + "learning_rate": 9.532249870813344e-05, + "loss": 0.012233348563313484, + "num_input_tokens_seen": 39842808, + "step": 2433, + "train_runtime": 19772.2691, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.475151515151515, + "grad_norm": 0.018506374210119247, + "learning_rate": 9.53184368474136e-05, + "loss": 0.01355830393731594, + "num_input_tokens_seen": 39859184, + "step": 2434, + "train_runtime": 19780.4062, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.4757575757575758, + "grad_norm": 0.007064046338200569, + "learning_rate": 9.531437331045301e-05, + "loss": 0.011627360247075558, + "num_input_tokens_seen": 39875560, + "step": 2435, + "train_runtime": 19788.5498, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.4763636363636363, + "grad_norm": 0.013803046196699142, + "learning_rate": 9.5310308097402e-05, + "loss": 0.01383510883897543, + "num_input_tokens_seen": 39891936, + "step": 2436, + "train_runtime": 19796.6824, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.476969696969697, + "grad_norm": 0.006459993310272694, + "learning_rate": 9.530624120841094e-05, + "loss": 0.011402672156691551, + "num_input_tokens_seen": 39908312, + "step": 2437, + "train_runtime": 19804.8017, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.4775757575757575, + "grad_norm": 0.013386828824877739, + "learning_rate": 9.530217264363024e-05, + "loss": 0.012710126116871834, + "num_input_tokens_seen": 39924688, + "step": 2438, + "train_runtime": 19812.9203, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.4781818181818183, + "grad_norm": 0.008386164903640747, + "learning_rate": 9.52981024032104e-05, + "loss": 0.01170976459980011, + "num_input_tokens_seen": 39941064, + "step": 2439, + "train_runtime": 19821.0403, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.4787878787878788, + "grad_norm": 0.019332820549607277, + "learning_rate": 9.529403048730197e-05, + "loss": 0.014887764118611813, + "num_input_tokens_seen": 39957440, + "step": 2440, + "train_runtime": 19829.157, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.4793939393939395, + "grad_norm": 0.01192883588373661, + "learning_rate": 9.528995689605556e-05, + "loss": 0.012767164967954159, + "num_input_tokens_seen": 39973816, + "step": 2441, + "train_runtime": 19837.2767, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.48, + "grad_norm": 0.007754697930067778, + "learning_rate": 9.528588162962184e-05, + "loss": 0.01059720292687416, + "num_input_tokens_seen": 39990192, + "step": 2442, + "train_runtime": 19845.3992, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.4806060606060605, + "grad_norm": 0.0071412501856684685, + "learning_rate": 9.528180468815155e-05, + "loss": 0.0120925884693861, + "num_input_tokens_seen": 40006568, + "step": 2443, + "train_runtime": 19853.5324, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.4812121212121212, + "grad_norm": 0.0038404990918934345, + "learning_rate": 9.527772607179548e-05, + "loss": 0.011214636266231537, + "num_input_tokens_seen": 40022944, + "step": 2444, + "train_runtime": 19861.6572, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.481818181818182, + "grad_norm": 0.006489252671599388, + "learning_rate": 9.52736457807045e-05, + "loss": 0.012799175456166267, + "num_input_tokens_seen": 40039320, + "step": 2445, + "train_runtime": 19869.7777, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.4824242424242424, + "grad_norm": 0.007905665785074234, + "learning_rate": 9.526956381502953e-05, + "loss": 0.01181122101843357, + "num_input_tokens_seen": 40055696, + "step": 2446, + "train_runtime": 19877.9055, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.483030303030303, + "grad_norm": 0.006114371120929718, + "learning_rate": 9.526548017492156e-05, + "loss": 0.01095847599208355, + "num_input_tokens_seen": 40072072, + "step": 2447, + "train_runtime": 19886.0326, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.4836363636363636, + "grad_norm": 0.007537721656262875, + "learning_rate": 9.526139486053162e-05, + "loss": 0.012018000707030296, + "num_input_tokens_seen": 40088448, + "step": 2448, + "train_runtime": 19894.1525, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.4842424242424244, + "grad_norm": 0.01275183167308569, + "learning_rate": 9.525730787201083e-05, + "loss": 0.01350702065974474, + "num_input_tokens_seen": 40104824, + "step": 2449, + "train_runtime": 19902.2701, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.4848484848484849, + "grad_norm": 0.006207689177244902, + "learning_rate": 9.525321920951034e-05, + "loss": 0.011248544789850712, + "num_input_tokens_seen": 40121200, + "step": 2450, + "train_runtime": 19910.3904, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.4854545454545454, + "grad_norm": 0.0031499108299613, + "learning_rate": 9.524912887318142e-05, + "loss": 0.011422288604080677, + "num_input_tokens_seen": 40137576, + "step": 2451, + "train_runtime": 19918.5126, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.486060606060606, + "grad_norm": 0.006611378397792578, + "learning_rate": 9.524503686317534e-05, + "loss": 0.012991274707019329, + "num_input_tokens_seen": 40153952, + "step": 2452, + "train_runtime": 19926.6344, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.4866666666666668, + "grad_norm": 0.01270908024162054, + "learning_rate": 9.524094317964345e-05, + "loss": 0.013357768766582012, + "num_input_tokens_seen": 40170328, + "step": 2453, + "train_runtime": 19934.758, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.4872727272727273, + "grad_norm": 0.009194605052471161, + "learning_rate": 9.523684782273718e-05, + "loss": 0.012534737586975098, + "num_input_tokens_seen": 40186704, + "step": 2454, + "train_runtime": 19942.8783, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.4878787878787878, + "grad_norm": 0.006802633870393038, + "learning_rate": 9.523275079260799e-05, + "loss": 0.012076673097908497, + "num_input_tokens_seen": 40203080, + "step": 2455, + "train_runtime": 19951.0021, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.4884848484848485, + "grad_norm": 0.01247409638017416, + "learning_rate": 9.522865208940745e-05, + "loss": 0.012032316997647285, + "num_input_tokens_seen": 40219456, + "step": 2456, + "train_runtime": 19959.1325, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.489090909090909, + "grad_norm": 0.02582702599465847, + "learning_rate": 9.522455171328715e-05, + "loss": 0.011424973607063293, + "num_input_tokens_seen": 40235832, + "step": 2457, + "train_runtime": 19967.2572, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.4896969696969697, + "grad_norm": 0.011868856847286224, + "learning_rate": 9.522044966439873e-05, + "loss": 0.012090028263628483, + "num_input_tokens_seen": 40252208, + "step": 2458, + "train_runtime": 19975.3808, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.4903030303030302, + "grad_norm": 0.009827052243053913, + "learning_rate": 9.521634594289396e-05, + "loss": 0.012720966711640358, + "num_input_tokens_seen": 40268584, + "step": 2459, + "train_runtime": 19983.5029, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.490909090909091, + "grad_norm": 0.007020117249339819, + "learning_rate": 9.52122405489246e-05, + "loss": 0.012172859162092209, + "num_input_tokens_seen": 40284960, + "step": 2460, + "train_runtime": 19991.6205, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.4915151515151515, + "grad_norm": 0.004183088894933462, + "learning_rate": 9.520813348264252e-05, + "loss": 0.012092591263353825, + "num_input_tokens_seen": 40301336, + "step": 2461, + "train_runtime": 19999.7464, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.4921212121212122, + "grad_norm": 0.004259482026100159, + "learning_rate": 9.52040247441996e-05, + "loss": 0.010745976120233536, + "num_input_tokens_seen": 40317712, + "step": 2462, + "train_runtime": 20007.8652, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.4927272727272727, + "grad_norm": 0.010508165694773197, + "learning_rate": 9.519991433374787e-05, + "loss": 0.012759631499648094, + "num_input_tokens_seen": 40334088, + "step": 2463, + "train_runtime": 20015.9813, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 0.008700689300894737, + "learning_rate": 9.51958022514393e-05, + "loss": 0.013472158461809158, + "num_input_tokens_seen": 40350464, + "step": 2464, + "train_runtime": 20024.0943, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.493939393939394, + "grad_norm": 0.007512248121201992, + "learning_rate": 9.519168849742604e-05, + "loss": 0.011116516776382923, + "num_input_tokens_seen": 40366840, + "step": 2465, + "train_runtime": 20032.2166, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.4945454545454546, + "grad_norm": 0.006751928012818098, + "learning_rate": 9.518757307186021e-05, + "loss": 0.012299465015530586, + "num_input_tokens_seen": 40383216, + "step": 2466, + "train_runtime": 20040.3361, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.4951515151515151, + "grad_norm": 0.012251977808773518, + "learning_rate": 9.518345597489406e-05, + "loss": 0.012819000519812107, + "num_input_tokens_seen": 40399592, + "step": 2467, + "train_runtime": 20048.4565, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.4957575757575756, + "grad_norm": 0.016346946358680725, + "learning_rate": 9.517933720667986e-05, + "loss": 0.013819322921335697, + "num_input_tokens_seen": 40415968, + "step": 2468, + "train_runtime": 20056.5803, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.4963636363636363, + "grad_norm": 0.013435586355626583, + "learning_rate": 9.517521676736997e-05, + "loss": 0.011113706976175308, + "num_input_tokens_seen": 40432344, + "step": 2469, + "train_runtime": 20064.702, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.496969696969697, + "grad_norm": 0.010519376955926418, + "learning_rate": 9.517109465711678e-05, + "loss": 0.013021894730627537, + "num_input_tokens_seen": 40448720, + "step": 2470, + "train_runtime": 20072.834, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.4975757575757576, + "grad_norm": 0.007130353711545467, + "learning_rate": 9.516697087607276e-05, + "loss": 0.01177508756518364, + "num_input_tokens_seen": 40465096, + "step": 2471, + "train_runtime": 20080.9578, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.498181818181818, + "grad_norm": 0.0072803315706551075, + "learning_rate": 9.516284542439047e-05, + "loss": 0.012394273653626442, + "num_input_tokens_seen": 40481472, + "step": 2472, + "train_runtime": 20089.0934, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.4987878787878788, + "grad_norm": 0.02864505536854267, + "learning_rate": 9.515871830222244e-05, + "loss": 0.012685790657997131, + "num_input_tokens_seen": 40497848, + "step": 2473, + "train_runtime": 20097.2121, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.4993939393939395, + "grad_norm": 0.007369753438979387, + "learning_rate": 9.51545895097214e-05, + "loss": 0.012503450736403465, + "num_input_tokens_seen": 40514224, + "step": 2474, + "train_runtime": 20105.3349, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.5, + "grad_norm": 0.012454242445528507, + "learning_rate": 9.515045904704001e-05, + "loss": 0.011608883738517761, + "num_input_tokens_seen": 40530600, + "step": 2475, + "train_runtime": 20113.4635, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.5006060606060605, + "grad_norm": 0.007108752615749836, + "learning_rate": 9.514632691433107e-05, + "loss": 0.012201216071844101, + "num_input_tokens_seen": 40546976, + "step": 2476, + "train_runtime": 20121.5906, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.5012121212121212, + "grad_norm": 0.010609873570501804, + "learning_rate": 9.514219311174741e-05, + "loss": 0.011614919640123844, + "num_input_tokens_seen": 40563352, + "step": 2477, + "train_runtime": 20129.7127, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.501818181818182, + "grad_norm": 0.036003436893224716, + "learning_rate": 9.513805763944195e-05, + "loss": 0.014732426032423973, + "num_input_tokens_seen": 40579728, + "step": 2478, + "train_runtime": 20137.8216, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.5024242424242424, + "grad_norm": 0.0038061749655753374, + "learning_rate": 9.513392049756761e-05, + "loss": 0.011091469787061214, + "num_input_tokens_seen": 40596104, + "step": 2479, + "train_runtime": 20145.9325, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.503030303030303, + "grad_norm": 0.016383551061153412, + "learning_rate": 9.512978168627749e-05, + "loss": 0.013081599958240986, + "num_input_tokens_seen": 40612480, + "step": 2480, + "train_runtime": 20154.0449, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.5036363636363637, + "grad_norm": 0.006661078426986933, + "learning_rate": 9.51256412057246e-05, + "loss": 0.012471092864871025, + "num_input_tokens_seen": 40628856, + "step": 2481, + "train_runtime": 20162.1558, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.5042424242424244, + "grad_norm": 0.00888329278677702, + "learning_rate": 9.512149905606213e-05, + "loss": 0.013931803405284882, + "num_input_tokens_seen": 40645232, + "step": 2482, + "train_runtime": 20170.2657, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.5048484848484849, + "grad_norm": 0.009034967981278896, + "learning_rate": 9.511735523744328e-05, + "loss": 0.011597779579460621, + "num_input_tokens_seen": 40661608, + "step": 2483, + "train_runtime": 20178.3784, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.5054545454545454, + "grad_norm": 0.003994882106781006, + "learning_rate": 9.511320975002132e-05, + "loss": 0.012429807335138321, + "num_input_tokens_seen": 40677984, + "step": 2484, + "train_runtime": 20186.489, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.506060606060606, + "grad_norm": 0.005746257957071066, + "learning_rate": 9.510906259394958e-05, + "loss": 0.01171612273901701, + "num_input_tokens_seen": 40694360, + "step": 2485, + "train_runtime": 20194.5999, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.5066666666666668, + "grad_norm": 0.008891885168850422, + "learning_rate": 9.510491376938147e-05, + "loss": 0.01219995692372322, + "num_input_tokens_seen": 40710736, + "step": 2486, + "train_runtime": 20202.7112, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.5072727272727273, + "grad_norm": 0.0058426158502697945, + "learning_rate": 9.510076327647042e-05, + "loss": 0.012936384417116642, + "num_input_tokens_seen": 40727112, + "step": 2487, + "train_runtime": 20210.8213, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.5078787878787878, + "grad_norm": 0.009390783496201038, + "learning_rate": 9.509661111536998e-05, + "loss": 0.012800348922610283, + "num_input_tokens_seen": 40743488, + "step": 2488, + "train_runtime": 20218.932, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.5084848484848485, + "grad_norm": 0.007901819422841072, + "learning_rate": 9.509245728623373e-05, + "loss": 0.012779071927070618, + "num_input_tokens_seen": 40759864, + "step": 2489, + "train_runtime": 20227.0427, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.509090909090909, + "grad_norm": 0.014118324033915997, + "learning_rate": 9.508830178921529e-05, + "loss": 0.013085502199828625, + "num_input_tokens_seen": 40776240, + "step": 2490, + "train_runtime": 20235.1526, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.5096969696969698, + "grad_norm": 0.009893806651234627, + "learning_rate": 9.508414462446835e-05, + "loss": 0.012658249586820602, + "num_input_tokens_seen": 40792616, + "step": 2491, + "train_runtime": 20243.2643, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.5103030303030303, + "grad_norm": 0.009940304793417454, + "learning_rate": 9.507998579214671e-05, + "loss": 0.013717295601963997, + "num_input_tokens_seen": 40808992, + "step": 2492, + "train_runtime": 20251.3756, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.5109090909090908, + "grad_norm": 0.008184517733752728, + "learning_rate": 9.50758252924042e-05, + "loss": 0.01362772099673748, + "num_input_tokens_seen": 40825368, + "step": 2493, + "train_runtime": 20259.4871, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.5115151515151515, + "grad_norm": 0.008952822536230087, + "learning_rate": 9.507166312539468e-05, + "loss": 0.012840436771512032, + "num_input_tokens_seen": 40841744, + "step": 2494, + "train_runtime": 20267.5958, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.5121212121212122, + "grad_norm": 0.006644078996032476, + "learning_rate": 9.506749929127212e-05, + "loss": 0.012481370940804482, + "num_input_tokens_seen": 40858120, + "step": 2495, + "train_runtime": 20275.7082, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.5127272727272727, + "grad_norm": 0.012844868935644627, + "learning_rate": 9.506333379019052e-05, + "loss": 0.012236448004841805, + "num_input_tokens_seen": 40874496, + "step": 2496, + "train_runtime": 20283.8171, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.5133333333333332, + "grad_norm": 0.017544275149703026, + "learning_rate": 9.505916662230397e-05, + "loss": 0.012658393010497093, + "num_input_tokens_seen": 40890872, + "step": 2497, + "train_runtime": 20291.9328, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.513939393939394, + "grad_norm": 0.01012002769857645, + "learning_rate": 9.505499778776658e-05, + "loss": 0.012574484571814537, + "num_input_tokens_seen": 40907248, + "step": 2498, + "train_runtime": 20300.0437, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.5145454545454546, + "grad_norm": 0.018078140914440155, + "learning_rate": 9.505082728673257e-05, + "loss": 0.013375327922403812, + "num_input_tokens_seen": 40923624, + "step": 2499, + "train_runtime": 20308.1567, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.5151515151515151, + "grad_norm": 0.007978398352861404, + "learning_rate": 9.50466551193562e-05, + "loss": 0.01290303748100996, + "num_input_tokens_seen": 40940000, + "step": 2500, + "train_runtime": 20316.2678, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.5157575757575756, + "grad_norm": 0.006939285434782505, + "learning_rate": 9.504248128579177e-05, + "loss": 0.011965984478592873, + "num_input_tokens_seen": 40956376, + "step": 2501, + "train_runtime": 20325.3605, + "train_tokens_per_second": 2015.038 + }, + { + "epoch": 1.5163636363636364, + "grad_norm": 0.007044460158795118, + "learning_rate": 9.503830578619368e-05, + "loss": 0.011356959119439125, + "num_input_tokens_seen": 40972752, + "step": 2502, + "train_runtime": 20333.468, + "train_tokens_per_second": 2015.04 + }, + { + "epoch": 1.516969696969697, + "grad_norm": 0.007603634148836136, + "learning_rate": 9.503412862071637e-05, + "loss": 0.011854066513478756, + "num_input_tokens_seen": 40989128, + "step": 2503, + "train_runtime": 20341.576, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.5175757575757576, + "grad_norm": 0.009264685213565826, + "learning_rate": 9.502994978951435e-05, + "loss": 0.012205901555716991, + "num_input_tokens_seen": 41005504, + "step": 2504, + "train_runtime": 20349.6894, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.518181818181818, + "grad_norm": 0.004056988749653101, + "learning_rate": 9.502576929274214e-05, + "loss": 0.01200732309371233, + "num_input_tokens_seen": 41021880, + "step": 2505, + "train_runtime": 20357.7988, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.5187878787878788, + "grad_norm": 0.009046165272593498, + "learning_rate": 9.502158713055444e-05, + "loss": 0.012369451113045216, + "num_input_tokens_seen": 41038256, + "step": 2506, + "train_runtime": 20365.9067, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.5193939393939395, + "grad_norm": 0.0077380104921758175, + "learning_rate": 9.50174033031059e-05, + "loss": 0.012663315050303936, + "num_input_tokens_seen": 41054632, + "step": 2507, + "train_runtime": 20374.0154, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.52, + "grad_norm": 0.012925632297992706, + "learning_rate": 9.501321781055129e-05, + "loss": 0.013452775776386261, + "num_input_tokens_seen": 41071008, + "step": 2508, + "train_runtime": 20382.1317, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.5206060606060605, + "grad_norm": 0.024989936500787735, + "learning_rate": 9.50090306530454e-05, + "loss": 0.012328572571277618, + "num_input_tokens_seen": 41087384, + "step": 2509, + "train_runtime": 20390.2429, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.5212121212121212, + "grad_norm": 0.010371995158493519, + "learning_rate": 9.500484183074312e-05, + "loss": 0.013627678155899048, + "num_input_tokens_seen": 41103760, + "step": 2510, + "train_runtime": 20398.3582, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.521818181818182, + "grad_norm": 0.006825145334005356, + "learning_rate": 9.500065134379939e-05, + "loss": 0.012711216695606709, + "num_input_tokens_seen": 41120136, + "step": 2511, + "train_runtime": 20406.48, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.5224242424242425, + "grad_norm": 0.009489607997238636, + "learning_rate": 9.49964591923692e-05, + "loss": 0.013465148396790028, + "num_input_tokens_seen": 41136512, + "step": 2512, + "train_runtime": 20414.5979, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.523030303030303, + "grad_norm": 0.006394787225872278, + "learning_rate": 9.49922653766076e-05, + "loss": 0.010701936669647694, + "num_input_tokens_seen": 41152888, + "step": 2513, + "train_runtime": 20422.7123, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.5236363636363637, + "grad_norm": 0.029152654111385345, + "learning_rate": 9.498806989666972e-05, + "loss": 0.010741956532001495, + "num_input_tokens_seen": 41169264, + "step": 2514, + "train_runtime": 20430.8349, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.5242424242424244, + "grad_norm": 0.017097560688853264, + "learning_rate": 9.498387275271074e-05, + "loss": 0.011944272555410862, + "num_input_tokens_seen": 41185640, + "step": 2515, + "train_runtime": 20438.952, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.524848484848485, + "grad_norm": 0.008915740065276623, + "learning_rate": 9.497967394488594e-05, + "loss": 0.012338991276919842, + "num_input_tokens_seen": 41202016, + "step": 2516, + "train_runtime": 20447.0655, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.5254545454545454, + "grad_norm": 0.01135164313018322, + "learning_rate": 9.497547347335058e-05, + "loss": 0.012441879138350487, + "num_input_tokens_seen": 41218392, + "step": 2517, + "train_runtime": 20455.1815, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.526060606060606, + "grad_norm": 0.00980446208268404, + "learning_rate": 9.497127133826003e-05, + "loss": 0.011328864842653275, + "num_input_tokens_seen": 41234768, + "step": 2518, + "train_runtime": 20463.2979, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.5266666666666666, + "grad_norm": 0.010182627476751804, + "learning_rate": 9.496706753976974e-05, + "loss": 0.012514796108007431, + "num_input_tokens_seen": 41251144, + "step": 2519, + "train_runtime": 20471.4184, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.5272727272727273, + "grad_norm": 0.012182634323835373, + "learning_rate": 9.49628620780352e-05, + "loss": 0.01241758931428194, + "num_input_tokens_seen": 41267520, + "step": 2520, + "train_runtime": 20479.541, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.5278787878787878, + "grad_norm": 0.0064279730431735516, + "learning_rate": 9.495865495321194e-05, + "loss": 0.010764975100755692, + "num_input_tokens_seen": 41283896, + "step": 2521, + "train_runtime": 20487.6618, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.5284848484848483, + "grad_norm": 0.009709006175398827, + "learning_rate": 9.495444616545559e-05, + "loss": 0.012598009780049324, + "num_input_tokens_seen": 41300272, + "step": 2522, + "train_runtime": 20495.7827, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.529090909090909, + "grad_norm": 0.01092903409153223, + "learning_rate": 9.495023571492181e-05, + "loss": 0.012402733787894249, + "num_input_tokens_seen": 41316648, + "step": 2523, + "train_runtime": 20503.8968, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.5296969696969698, + "grad_norm": 0.009195779450237751, + "learning_rate": 9.494602360176637e-05, + "loss": 0.012332282029092312, + "num_input_tokens_seen": 41333024, + "step": 2524, + "train_runtime": 20512.0154, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.5303030303030303, + "grad_norm": 0.007992182858288288, + "learning_rate": 9.494180982614502e-05, + "loss": 0.012078803032636642, + "num_input_tokens_seen": 41349400, + "step": 2525, + "train_runtime": 20520.134, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.5309090909090908, + "grad_norm": 0.00906646903604269, + "learning_rate": 9.493759438821366e-05, + "loss": 0.01147826574742794, + "num_input_tokens_seen": 41365776, + "step": 2526, + "train_runtime": 20528.2519, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.5315151515151515, + "grad_norm": 0.012746486812829971, + "learning_rate": 9.49333772881282e-05, + "loss": 0.013307849876582623, + "num_input_tokens_seen": 41382152, + "step": 2527, + "train_runtime": 20536.3683, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.5321212121212122, + "grad_norm": 0.02986880950629711, + "learning_rate": 9.49291585260446e-05, + "loss": 0.012137986719608307, + "num_input_tokens_seen": 41398528, + "step": 2528, + "train_runtime": 20544.4827, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.5327272727272727, + "grad_norm": 0.006111033260822296, + "learning_rate": 9.492493810211895e-05, + "loss": 0.012053197249770164, + "num_input_tokens_seen": 41414904, + "step": 2529, + "train_runtime": 20552.6098, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 0.005969110410660505, + "learning_rate": 9.492071601650731e-05, + "loss": 0.012252414599061012, + "num_input_tokens_seen": 41431280, + "step": 2530, + "train_runtime": 20560.7227, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.533939393939394, + "grad_norm": 0.004376005847007036, + "learning_rate": 9.491649226936585e-05, + "loss": 0.011911889538168907, + "num_input_tokens_seen": 41447656, + "step": 2531, + "train_runtime": 20568.8377, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.5345454545454547, + "grad_norm": 0.005644640419632196, + "learning_rate": 9.491226686085084e-05, + "loss": 0.01167337503284216, + "num_input_tokens_seen": 41464032, + "step": 2532, + "train_runtime": 20576.9582, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.5351515151515152, + "grad_norm": 0.007492442615330219, + "learning_rate": 9.490803979111851e-05, + "loss": 0.011597873643040657, + "num_input_tokens_seen": 41480408, + "step": 2533, + "train_runtime": 20585.0778, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.5357575757575757, + "grad_norm": 0.010739155113697052, + "learning_rate": 9.490381106032526e-05, + "loss": 0.0118883540853858, + "num_input_tokens_seen": 41496784, + "step": 2534, + "train_runtime": 20593.1973, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.5363636363636364, + "grad_norm": 0.010975335724651814, + "learning_rate": 9.48995806686275e-05, + "loss": 0.011686563491821289, + "num_input_tokens_seen": 41513160, + "step": 2535, + "train_runtime": 20601.31, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.536969696969697, + "grad_norm": 0.015743060037493706, + "learning_rate": 9.489534861618166e-05, + "loss": 0.014532624743878841, + "num_input_tokens_seen": 41529536, + "step": 2536, + "train_runtime": 20609.4325, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.5375757575757576, + "grad_norm": 0.006268225610256195, + "learning_rate": 9.489111490314433e-05, + "loss": 0.011723088100552559, + "num_input_tokens_seen": 41545912, + "step": 2537, + "train_runtime": 20617.55, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.538181818181818, + "grad_norm": 0.008138432167470455, + "learning_rate": 9.488687952967207e-05, + "loss": 0.0119707603007555, + "num_input_tokens_seen": 41562288, + "step": 2538, + "train_runtime": 20625.6618, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.5387878787878788, + "grad_norm": 0.010901181027293205, + "learning_rate": 9.488264249592154e-05, + "loss": 0.01287322398275137, + "num_input_tokens_seen": 41578664, + "step": 2539, + "train_runtime": 20633.7742, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.5393939393939395, + "grad_norm": 0.008760156109929085, + "learning_rate": 9.487840380204949e-05, + "loss": 0.012112857773900032, + "num_input_tokens_seen": 41595040, + "step": 2540, + "train_runtime": 20641.8924, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.54, + "grad_norm": 0.014003272168338299, + "learning_rate": 9.487416344821267e-05, + "loss": 0.012452900409698486, + "num_input_tokens_seen": 41611416, + "step": 2541, + "train_runtime": 20650.0123, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.5406060606060605, + "grad_norm": 0.01951734907925129, + "learning_rate": 9.486992143456792e-05, + "loss": 0.01286186370998621, + "num_input_tokens_seen": 41627792, + "step": 2542, + "train_runtime": 20658.1339, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.5412121212121213, + "grad_norm": 0.004000355023890734, + "learning_rate": 9.486567776127218e-05, + "loss": 0.011163925752043724, + "num_input_tokens_seen": 41644168, + "step": 2543, + "train_runtime": 20666.2487, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.541818181818182, + "grad_norm": 0.015947144478559494, + "learning_rate": 9.486143242848238e-05, + "loss": 0.011878188699483871, + "num_input_tokens_seen": 41660544, + "step": 2544, + "train_runtime": 20674.3655, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.5424242424242425, + "grad_norm": 0.01202548760920763, + "learning_rate": 9.485718543635555e-05, + "loss": 0.013055860064923763, + "num_input_tokens_seen": 41676920, + "step": 2545, + "train_runtime": 20682.4826, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.543030303030303, + "grad_norm": 0.02771756984293461, + "learning_rate": 9.485293678504879e-05, + "loss": 0.012415559962391853, + "num_input_tokens_seen": 41693296, + "step": 2546, + "train_runtime": 20690.6025, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.5436363636363635, + "grad_norm": 0.0019140188815072179, + "learning_rate": 9.484868647471926e-05, + "loss": 0.010037734173238277, + "num_input_tokens_seen": 41709672, + "step": 2547, + "train_runtime": 20698.72, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.5442424242424242, + "grad_norm": 0.00840219296514988, + "learning_rate": 9.484443450552413e-05, + "loss": 0.012736542150378227, + "num_input_tokens_seen": 41726048, + "step": 2548, + "train_runtime": 20706.8338, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.544848484848485, + "grad_norm": 0.025348152965307236, + "learning_rate": 9.484018087762072e-05, + "loss": 0.013136149384081364, + "num_input_tokens_seen": 41742424, + "step": 2549, + "train_runtime": 20714.95, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.5454545454545454, + "grad_norm": 0.01935630850493908, + "learning_rate": 9.483592559116633e-05, + "loss": 0.012145849876105785, + "num_input_tokens_seen": 41758800, + "step": 2550, + "train_runtime": 20723.0714, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.546060606060606, + "grad_norm": 0.015744099393486977, + "learning_rate": 9.483166864631837e-05, + "loss": 0.011445660144090652, + "num_input_tokens_seen": 41775176, + "step": 2551, + "train_runtime": 20731.188, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 0.009685778059065342, + "learning_rate": 9.48274100432343e-05, + "loss": 0.013055415824055672, + "num_input_tokens_seen": 41791552, + "step": 2552, + "train_runtime": 20739.3039, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.5472727272727274, + "grad_norm": 0.010554308071732521, + "learning_rate": 9.48231497820716e-05, + "loss": 0.01200939528644085, + "num_input_tokens_seen": 41807928, + "step": 2553, + "train_runtime": 20747.4234, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.5478787878787879, + "grad_norm": 0.0028376460541039705, + "learning_rate": 9.481888786298791e-05, + "loss": 0.011462630704045296, + "num_input_tokens_seen": 41824304, + "step": 2554, + "train_runtime": 20755.5419, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.5484848484848484, + "grad_norm": 0.009041995741426945, + "learning_rate": 9.481462428614083e-05, + "loss": 0.012170386500656605, + "num_input_tokens_seen": 41840680, + "step": 2555, + "train_runtime": 20763.6649, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.549090909090909, + "grad_norm": 0.015133941546082497, + "learning_rate": 9.481035905168808e-05, + "loss": 0.01340141985565424, + "num_input_tokens_seen": 41857056, + "step": 2556, + "train_runtime": 20771.7807, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.5496969696969698, + "grad_norm": 0.01299117412418127, + "learning_rate": 9.48060921597874e-05, + "loss": 0.012615041807293892, + "num_input_tokens_seen": 41873432, + "step": 2557, + "train_runtime": 20779.8958, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.5503030303030303, + "grad_norm": 0.013336027972400188, + "learning_rate": 9.480182361059662e-05, + "loss": 0.013393765315413475, + "num_input_tokens_seen": 41889808, + "step": 2558, + "train_runtime": 20788.0093, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.5509090909090908, + "grad_norm": 0.001979593886062503, + "learning_rate": 9.479755340427365e-05, + "loss": 0.011380261741578579, + "num_input_tokens_seen": 41906184, + "step": 2559, + "train_runtime": 20796.1337, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.5515151515151515, + "grad_norm": 0.011270053684711456, + "learning_rate": 9.479328154097642e-05, + "loss": 0.012305236421525478, + "num_input_tokens_seen": 41922560, + "step": 2560, + "train_runtime": 20804.2513, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.5521212121212122, + "grad_norm": 0.0068170237354934216, + "learning_rate": 9.478900802086292e-05, + "loss": 0.011324869468808174, + "num_input_tokens_seen": 41938936, + "step": 2561, + "train_runtime": 20812.3727, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.5527272727272727, + "grad_norm": 0.022808684036135674, + "learning_rate": 9.478473284409124e-05, + "loss": 0.01196727342903614, + "num_input_tokens_seen": 41955312, + "step": 2562, + "train_runtime": 20820.4878, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.5533333333333332, + "grad_norm": 0.007863566279411316, + "learning_rate": 9.478045601081952e-05, + "loss": 0.012731033377349377, + "num_input_tokens_seen": 41971688, + "step": 2563, + "train_runtime": 20828.6152, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.553939393939394, + "grad_norm": 0.007482618093490601, + "learning_rate": 9.477617752120593e-05, + "loss": 0.012603063136339188, + "num_input_tokens_seen": 41988064, + "step": 2564, + "train_runtime": 20836.7363, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.5545454545454547, + "grad_norm": 0.03489001467823982, + "learning_rate": 9.477189737540873e-05, + "loss": 0.012344112619757652, + "num_input_tokens_seen": 42004440, + "step": 2565, + "train_runtime": 20844.8571, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.5551515151515152, + "grad_norm": 0.011057870462536812, + "learning_rate": 9.476761557358623e-05, + "loss": 0.011534559540450573, + "num_input_tokens_seen": 42020816, + "step": 2566, + "train_runtime": 20852.9755, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.5557575757575757, + "grad_norm": 0.009332367219030857, + "learning_rate": 9.476333211589682e-05, + "loss": 0.013054633513092995, + "num_input_tokens_seen": 42037192, + "step": 2567, + "train_runtime": 20861.091, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.5563636363636364, + "grad_norm": 0.006530660204589367, + "learning_rate": 9.475904700249892e-05, + "loss": 0.010158851742744446, + "num_input_tokens_seen": 42053568, + "step": 2568, + "train_runtime": 20869.2092, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.5569696969696971, + "grad_norm": 0.010101820342242718, + "learning_rate": 9.475476023355103e-05, + "loss": 0.012717381119728088, + "num_input_tokens_seen": 42069944, + "step": 2569, + "train_runtime": 20877.3339, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.5575757575757576, + "grad_norm": 0.010927229188382626, + "learning_rate": 9.475047180921172e-05, + "loss": 0.011622841469943523, + "num_input_tokens_seen": 42086320, + "step": 2570, + "train_runtime": 20885.4524, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.5581818181818181, + "grad_norm": 0.006744182202965021, + "learning_rate": 9.474618172963963e-05, + "loss": 0.012462071143090725, + "num_input_tokens_seen": 42102696, + "step": 2571, + "train_runtime": 20893.5671, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.5587878787878788, + "grad_norm": 0.0053069149143993855, + "learning_rate": 9.474188999499339e-05, + "loss": 0.009743815287947655, + "num_input_tokens_seen": 42119072, + "step": 2572, + "train_runtime": 20901.6827, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.5593939393939396, + "grad_norm": 0.009455588646233082, + "learning_rate": 9.473759660543178e-05, + "loss": 0.012206991203129292, + "num_input_tokens_seen": 42135448, + "step": 2573, + "train_runtime": 20909.8014, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.56, + "grad_norm": 0.015813136473298073, + "learning_rate": 9.473330156111358e-05, + "loss": 0.012782268226146698, + "num_input_tokens_seen": 42151824, + "step": 2574, + "train_runtime": 20917.9217, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.5606060606060606, + "grad_norm": 0.011628096923232079, + "learning_rate": 9.472900486219769e-05, + "loss": 0.012355889193713665, + "num_input_tokens_seen": 42168200, + "step": 2575, + "train_runtime": 20926.038, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.561212121212121, + "grad_norm": 0.007116169203072786, + "learning_rate": 9.4724706508843e-05, + "loss": 0.012335915118455887, + "num_input_tokens_seen": 42184576, + "step": 2576, + "train_runtime": 20934.1567, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.5618181818181818, + "grad_norm": 0.007721391040831804, + "learning_rate": 9.472040650120852e-05, + "loss": 0.010860590264201164, + "num_input_tokens_seen": 42200952, + "step": 2577, + "train_runtime": 20942.2738, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.5624242424242425, + "grad_norm": 0.010738436132669449, + "learning_rate": 9.471610483945329e-05, + "loss": 0.012578247115015984, + "num_input_tokens_seen": 42217328, + "step": 2578, + "train_runtime": 20950.3912, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.563030303030303, + "grad_norm": 0.007581173907965422, + "learning_rate": 9.471180152373642e-05, + "loss": 0.011820399202406406, + "num_input_tokens_seen": 42233704, + "step": 2579, + "train_runtime": 20958.5067, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.5636363636363635, + "grad_norm": 0.013069345615804195, + "learning_rate": 9.47074965542171e-05, + "loss": 0.011658655479550362, + "num_input_tokens_seen": 42250080, + "step": 2580, + "train_runtime": 20966.6228, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.5642424242424242, + "grad_norm": 0.012723376974463463, + "learning_rate": 9.470318993105453e-05, + "loss": 0.014138157479465008, + "num_input_tokens_seen": 42266456, + "step": 2581, + "train_runtime": 20974.7366, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.564848484848485, + "grad_norm": 0.010134165175259113, + "learning_rate": 9.469888165440803e-05, + "loss": 0.012676380574703217, + "num_input_tokens_seen": 42282832, + "step": 2582, + "train_runtime": 20982.8521, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.5654545454545454, + "grad_norm": 0.02984500490128994, + "learning_rate": 9.469457172443694e-05, + "loss": 0.013629116117954254, + "num_input_tokens_seen": 42299208, + "step": 2583, + "train_runtime": 20990.9659, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.566060606060606, + "grad_norm": 0.006516328081488609, + "learning_rate": 9.469026014130068e-05, + "loss": 0.012136437930166721, + "num_input_tokens_seen": 42315584, + "step": 2584, + "train_runtime": 20999.0825, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.5666666666666667, + "grad_norm": 0.012075986713171005, + "learning_rate": 9.468594690515873e-05, + "loss": 0.011823137290775776, + "num_input_tokens_seen": 42331960, + "step": 2585, + "train_runtime": 21007.1994, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.5672727272727274, + "grad_norm": 0.018787235021591187, + "learning_rate": 9.468163201617062e-05, + "loss": 0.013657747767865658, + "num_input_tokens_seen": 42348336, + "step": 2586, + "train_runtime": 21015.3203, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.5678787878787879, + "grad_norm": 0.0069130174815654755, + "learning_rate": 9.467731547449596e-05, + "loss": 0.013095945119857788, + "num_input_tokens_seen": 42364712, + "step": 2587, + "train_runtime": 21023.4359, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.5684848484848484, + "grad_norm": 0.01700754649937153, + "learning_rate": 9.46729972802944e-05, + "loss": 0.012791439890861511, + "num_input_tokens_seen": 42381088, + "step": 2588, + "train_runtime": 21031.5541, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.569090909090909, + "grad_norm": 0.009264576248824596, + "learning_rate": 9.466867743372567e-05, + "loss": 0.012931954115629196, + "num_input_tokens_seen": 42397464, + "step": 2589, + "train_runtime": 21039.669, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.5696969696969698, + "grad_norm": 0.009157332591712475, + "learning_rate": 9.466435593494955e-05, + "loss": 0.01335228979587555, + "num_input_tokens_seen": 42413840, + "step": 2590, + "train_runtime": 21047.7829, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.5703030303030303, + "grad_norm": 0.006586118135601282, + "learning_rate": 9.46600327841259e-05, + "loss": 0.011794445104897022, + "num_input_tokens_seen": 42430216, + "step": 2591, + "train_runtime": 21055.8959, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.5709090909090908, + "grad_norm": 0.028860818594694138, + "learning_rate": 9.465570798141459e-05, + "loss": 0.012518493458628654, + "num_input_tokens_seen": 42446592, + "step": 2592, + "train_runtime": 21064.0134, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.5715151515151515, + "grad_norm": 0.008049269206821918, + "learning_rate": 9.46513815269756e-05, + "loss": 0.01254999078810215, + "num_input_tokens_seen": 42462968, + "step": 2593, + "train_runtime": 21072.1324, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.5721212121212123, + "grad_norm": 0.01140893530100584, + "learning_rate": 9.464705342096897e-05, + "loss": 0.012186221778392792, + "num_input_tokens_seen": 42479344, + "step": 2594, + "train_runtime": 21080.2464, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.5727272727272728, + "grad_norm": 0.009714704938232899, + "learning_rate": 9.464272366355479e-05, + "loss": 0.012780411168932915, + "num_input_tokens_seen": 42495720, + "step": 2595, + "train_runtime": 21088.3621, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 0.012686456553637981, + "learning_rate": 9.46383922548932e-05, + "loss": 0.011896700598299503, + "num_input_tokens_seen": 42512096, + "step": 2596, + "train_runtime": 21096.4739, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.573939393939394, + "grad_norm": 0.006144341081380844, + "learning_rate": 9.463405919514438e-05, + "loss": 0.011786655522882938, + "num_input_tokens_seen": 42528472, + "step": 2597, + "train_runtime": 21104.5883, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.5745454545454547, + "grad_norm": 0.006458967924118042, + "learning_rate": 9.462972448446865e-05, + "loss": 0.011068484745919704, + "num_input_tokens_seen": 42544848, + "step": 2598, + "train_runtime": 21112.6999, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.5751515151515152, + "grad_norm": 0.009129184298217297, + "learning_rate": 9.462538812302634e-05, + "loss": 0.011313150636851788, + "num_input_tokens_seen": 42561224, + "step": 2599, + "train_runtime": 21120.8129, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.5757575757575757, + "grad_norm": 0.01139355730265379, + "learning_rate": 9.462105011097781e-05, + "loss": 0.013862118124961853, + "num_input_tokens_seen": 42577600, + "step": 2600, + "train_runtime": 21128.933, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.5763636363636364, + "grad_norm": 0.008616439066827297, + "learning_rate": 9.461671044848352e-05, + "loss": 0.012551181018352509, + "num_input_tokens_seen": 42593976, + "step": 2601, + "train_runtime": 21138.0207, + "train_tokens_per_second": 2015.041 + }, + { + "epoch": 1.5769696969696971, + "grad_norm": 0.00892567913979292, + "learning_rate": 9.461236913570403e-05, + "loss": 0.013059779070317745, + "num_input_tokens_seen": 42610352, + "step": 2602, + "train_runtime": 21146.1334, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.5775757575757576, + "grad_norm": 0.009504597634077072, + "learning_rate": 9.460802617279988e-05, + "loss": 0.011425955221056938, + "num_input_tokens_seen": 42626728, + "step": 2603, + "train_runtime": 21154.2477, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.5781818181818181, + "grad_norm": 0.010583021678030491, + "learning_rate": 9.460368155993169e-05, + "loss": 0.011836562305688858, + "num_input_tokens_seen": 42643104, + "step": 2604, + "train_runtime": 21162.3587, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.5787878787878786, + "grad_norm": 0.008107494562864304, + "learning_rate": 9.459933529726018e-05, + "loss": 0.011621729470789433, + "num_input_tokens_seen": 42659480, + "step": 2605, + "train_runtime": 21170.4693, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.5793939393939394, + "grad_norm": 0.004669983871281147, + "learning_rate": 9.459498738494613e-05, + "loss": 0.011748522520065308, + "num_input_tokens_seen": 42675856, + "step": 2606, + "train_runtime": 21178.5793, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.58, + "grad_norm": 0.008583268150687218, + "learning_rate": 9.459063782315032e-05, + "loss": 0.011582738719880581, + "num_input_tokens_seen": 42692232, + "step": 2607, + "train_runtime": 21186.6935, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.5806060606060606, + "grad_norm": 0.0053719752468168736, + "learning_rate": 9.458628661203367e-05, + "loss": 0.011640815064311028, + "num_input_tokens_seen": 42708608, + "step": 2608, + "train_runtime": 21194.8072, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.581212121212121, + "grad_norm": 0.008548380807042122, + "learning_rate": 9.45819337517571e-05, + "loss": 0.013440142385661602, + "num_input_tokens_seen": 42724984, + "step": 2609, + "train_runtime": 21202.923, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.5818181818181818, + "grad_norm": 0.009923234581947327, + "learning_rate": 9.457757924248163e-05, + "loss": 0.012190381065011024, + "num_input_tokens_seen": 42741360, + "step": 2610, + "train_runtime": 21211.0385, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.5824242424242425, + "grad_norm": 0.007502941880375147, + "learning_rate": 9.457322308436828e-05, + "loss": 0.012137582525610924, + "num_input_tokens_seen": 42757736, + "step": 2611, + "train_runtime": 21219.1524, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.583030303030303, + "grad_norm": 0.009094790555536747, + "learning_rate": 9.456886527757825e-05, + "loss": 0.012535467743873596, + "num_input_tokens_seen": 42774112, + "step": 2612, + "train_runtime": 21227.262, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.5836363636363635, + "grad_norm": 0.014347701333463192, + "learning_rate": 9.456450582227267e-05, + "loss": 0.013268754817545414, + "num_input_tokens_seen": 42790488, + "step": 2613, + "train_runtime": 21235.3767, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.5842424242424242, + "grad_norm": 0.01596418023109436, + "learning_rate": 9.456014471861281e-05, + "loss": 0.013029556721448898, + "num_input_tokens_seen": 42806864, + "step": 2614, + "train_runtime": 21243.49, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.584848484848485, + "grad_norm": 0.00878889486193657, + "learning_rate": 9.455578196675999e-05, + "loss": 0.01179637759923935, + "num_input_tokens_seen": 42823240, + "step": 2615, + "train_runtime": 21251.6075, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.5854545454545454, + "grad_norm": 0.0020696879364550114, + "learning_rate": 9.455141756687554e-05, + "loss": 0.012115818448364735, + "num_input_tokens_seen": 42839616, + "step": 2616, + "train_runtime": 21259.7328, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.586060606060606, + "grad_norm": 0.01816830411553383, + "learning_rate": 9.454705151912091e-05, + "loss": 0.012639476917684078, + "num_input_tokens_seen": 42855992, + "step": 2617, + "train_runtime": 21267.8554, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.5866666666666667, + "grad_norm": 0.011715560220181942, + "learning_rate": 9.45426838236576e-05, + "loss": 0.01289452612400055, + "num_input_tokens_seen": 42872368, + "step": 2618, + "train_runtime": 21275.9757, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.5872727272727274, + "grad_norm": 0.00640740105882287, + "learning_rate": 9.453831448064717e-05, + "loss": 0.011819720268249512, + "num_input_tokens_seen": 42888744, + "step": 2619, + "train_runtime": 21284.1025, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.587878787878788, + "grad_norm": 0.00489066680893302, + "learning_rate": 9.453394349025122e-05, + "loss": 0.0112529331818223, + "num_input_tokens_seen": 42905120, + "step": 2620, + "train_runtime": 21292.2323, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.5884848484848484, + "grad_norm": 0.010229917243123055, + "learning_rate": 9.452957085263142e-05, + "loss": 0.01201337669044733, + "num_input_tokens_seen": 42921496, + "step": 2621, + "train_runtime": 21300.3542, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.589090909090909, + "grad_norm": 0.007779464591294527, + "learning_rate": 9.452519656794952e-05, + "loss": 0.012388527393341064, + "num_input_tokens_seen": 42937872, + "step": 2622, + "train_runtime": 21308.4726, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.5896969696969698, + "grad_norm": 0.01498553529381752, + "learning_rate": 9.452082063636729e-05, + "loss": 0.012615111656486988, + "num_input_tokens_seen": 42954248, + "step": 2623, + "train_runtime": 21316.591, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.5903030303030303, + "grad_norm": 0.01064207497984171, + "learning_rate": 9.45164430580466e-05, + "loss": 0.012389152310788631, + "num_input_tokens_seen": 42970624, + "step": 2624, + "train_runtime": 21324.7033, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.5909090909090908, + "grad_norm": 0.019471580162644386, + "learning_rate": 9.451206383314941e-05, + "loss": 0.014277242124080658, + "num_input_tokens_seen": 42987000, + "step": 2625, + "train_runtime": 21332.8185, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.5915151515151515, + "grad_norm": 0.012485023587942123, + "learning_rate": 9.450768296183765e-05, + "loss": 0.011769906617701054, + "num_input_tokens_seen": 43003376, + "step": 2626, + "train_runtime": 21340.9508, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.5921212121212123, + "grad_norm": 0.006313348188996315, + "learning_rate": 9.450330044427336e-05, + "loss": 0.010830282233655453, + "num_input_tokens_seen": 43019752, + "step": 2627, + "train_runtime": 21349.0674, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.5927272727272728, + "grad_norm": 0.005766854155808687, + "learning_rate": 9.449891628061864e-05, + "loss": 0.012779559940099716, + "num_input_tokens_seen": 43036128, + "step": 2628, + "train_runtime": 21357.1866, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.5933333333333333, + "grad_norm": 0.022942470386624336, + "learning_rate": 9.449453047103569e-05, + "loss": 0.012017364613711834, + "num_input_tokens_seen": 43052504, + "step": 2629, + "train_runtime": 21365.3011, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.593939393939394, + "grad_norm": 0.005800590384751558, + "learning_rate": 9.449014301568671e-05, + "loss": 0.012153777293860912, + "num_input_tokens_seen": 43068880, + "step": 2630, + "train_runtime": 21373.4177, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.5945454545454547, + "grad_norm": 0.014468961395323277, + "learning_rate": 9.448575391473396e-05, + "loss": 0.012839552015066147, + "num_input_tokens_seen": 43085256, + "step": 2631, + "train_runtime": 21381.5335, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.5951515151515152, + "grad_norm": 0.020431358367204666, + "learning_rate": 9.448136316833981e-05, + "loss": 0.01263053435832262, + "num_input_tokens_seen": 43101632, + "step": 2632, + "train_runtime": 21389.6484, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.5957575757575757, + "grad_norm": 0.028599435463547707, + "learning_rate": 9.447697077666666e-05, + "loss": 0.013531115837395191, + "num_input_tokens_seen": 43118008, + "step": 2633, + "train_runtime": 21397.7637, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.5963636363636362, + "grad_norm": 0.004123292397707701, + "learning_rate": 9.447257673987697e-05, + "loss": 0.011205705814063549, + "num_input_tokens_seen": 43134384, + "step": 2634, + "train_runtime": 21405.8785, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.596969696969697, + "grad_norm": 0.009270669892430305, + "learning_rate": 9.44681810581333e-05, + "loss": 0.01207477692514658, + "num_input_tokens_seen": 43150760, + "step": 2635, + "train_runtime": 21413.992, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.5975757575757576, + "grad_norm": 0.0058461823500692844, + "learning_rate": 9.446378373159818e-05, + "loss": 0.012760588899254799, + "num_input_tokens_seen": 43167136, + "step": 2636, + "train_runtime": 21422.1076, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.5981818181818181, + "grad_norm": 0.009135591797530651, + "learning_rate": 9.44593847604343e-05, + "loss": 0.012779445387423038, + "num_input_tokens_seen": 43183512, + "step": 2637, + "train_runtime": 21430.2211, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.5987878787878786, + "grad_norm": 0.007859868928790092, + "learning_rate": 9.445498414480436e-05, + "loss": 0.012065579183399677, + "num_input_tokens_seen": 43199888, + "step": 2638, + "train_runtime": 21438.3375, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.5993939393939394, + "grad_norm": 0.0413634367287159, + "learning_rate": 9.445058188487113e-05, + "loss": 0.011168006807565689, + "num_input_tokens_seen": 43216264, + "step": 2639, + "train_runtime": 21446.4552, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.6, + "grad_norm": 0.006517525762319565, + "learning_rate": 9.444617798079744e-05, + "loss": 0.011061888188123703, + "num_input_tokens_seen": 43232640, + "step": 2640, + "train_runtime": 21454.5714, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.6006060606060606, + "grad_norm": 0.007652346510440111, + "learning_rate": 9.444177243274618e-05, + "loss": 0.01161403302103281, + "num_input_tokens_seen": 43249016, + "step": 2641, + "train_runtime": 21462.6866, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.601212121212121, + "grad_norm": 0.008789447136223316, + "learning_rate": 9.44373652408803e-05, + "loss": 0.012821921147406101, + "num_input_tokens_seen": 43265392, + "step": 2642, + "train_runtime": 21470.8059, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.6018181818181818, + "grad_norm": 0.012086639180779457, + "learning_rate": 9.443295640536283e-05, + "loss": 0.011262400075793266, + "num_input_tokens_seen": 43281768, + "step": 2643, + "train_runtime": 21478.9332, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.6024242424242425, + "grad_norm": 0.011036599986255169, + "learning_rate": 9.442854592635681e-05, + "loss": 0.011876849457621574, + "num_input_tokens_seen": 43298144, + "step": 2644, + "train_runtime": 21487.0481, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.603030303030303, + "grad_norm": 0.011125577613711357, + "learning_rate": 9.442413380402541e-05, + "loss": 0.012037638574838638, + "num_input_tokens_seen": 43314520, + "step": 2645, + "train_runtime": 21495.1688, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.6036363636363635, + "grad_norm": 0.007470986340194941, + "learning_rate": 9.441972003853181e-05, + "loss": 0.011867578141391277, + "num_input_tokens_seen": 43330896, + "step": 2646, + "train_runtime": 21503.286, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.6042424242424242, + "grad_norm": 0.009798098355531693, + "learning_rate": 9.441530463003928e-05, + "loss": 0.011247251182794571, + "num_input_tokens_seen": 43347272, + "step": 2647, + "train_runtime": 21511.4064, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.604848484848485, + "grad_norm": 0.05813064053654671, + "learning_rate": 9.441088757871112e-05, + "loss": 0.011350232176482677, + "num_input_tokens_seen": 43363648, + "step": 2648, + "train_runtime": 21519.5234, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.6054545454545455, + "grad_norm": 0.009082259610295296, + "learning_rate": 9.440646888471071e-05, + "loss": 0.013249941170215607, + "num_input_tokens_seen": 43380024, + "step": 2649, + "train_runtime": 21527.6397, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.606060606060606, + "grad_norm": 0.013390403240919113, + "learning_rate": 9.440204854820149e-05, + "loss": 0.01255058590322733, + "num_input_tokens_seen": 43396400, + "step": 2650, + "train_runtime": 21535.7563, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.6066666666666667, + "grad_norm": 0.007771987933665514, + "learning_rate": 9.439762656934698e-05, + "loss": 0.012910917401313782, + "num_input_tokens_seen": 43412776, + "step": 2651, + "train_runtime": 21543.8783, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.6072727272727274, + "grad_norm": 0.010885335505008698, + "learning_rate": 9.43932029483107e-05, + "loss": 0.011177360080182552, + "num_input_tokens_seen": 43429152, + "step": 2652, + "train_runtime": 21551.994, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.607878787878788, + "grad_norm": 0.022055966779589653, + "learning_rate": 9.438877768525631e-05, + "loss": 0.01327504776418209, + "num_input_tokens_seen": 43445528, + "step": 2653, + "train_runtime": 21560.1083, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.6084848484848484, + "grad_norm": 0.007112656719982624, + "learning_rate": 9.438435078034749e-05, + "loss": 0.012204596772789955, + "num_input_tokens_seen": 43461904, + "step": 2654, + "train_runtime": 21568.235, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.6090909090909091, + "grad_norm": 0.010424617677927017, + "learning_rate": 9.437992223374794e-05, + "loss": 0.012944593094289303, + "num_input_tokens_seen": 43478280, + "step": 2655, + "train_runtime": 21576.3518, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.6096969696969698, + "grad_norm": 0.007999579422175884, + "learning_rate": 9.437549204562151e-05, + "loss": 0.011439507827162743, + "num_input_tokens_seen": 43494656, + "step": 2656, + "train_runtime": 21584.4663, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.6103030303030303, + "grad_norm": 0.009420707821846008, + "learning_rate": 9.437106021613204e-05, + "loss": 0.010938690043985844, + "num_input_tokens_seen": 43511032, + "step": 2657, + "train_runtime": 21592.5817, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.6109090909090908, + "grad_norm": 0.010780644603073597, + "learning_rate": 9.436662674544348e-05, + "loss": 0.013350578024983406, + "num_input_tokens_seen": 43527408, + "step": 2658, + "train_runtime": 21600.699, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.6115151515151516, + "grad_norm": 0.013084693811833858, + "learning_rate": 9.436219163371977e-05, + "loss": 0.013172319158911705, + "num_input_tokens_seen": 43543784, + "step": 2659, + "train_runtime": 21608.814, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.612121212121212, + "grad_norm": 0.012968843802809715, + "learning_rate": 9.4357754881125e-05, + "loss": 0.012867764569818974, + "num_input_tokens_seen": 43560160, + "step": 2660, + "train_runtime": 21616.9359, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.6127272727272728, + "grad_norm": 0.019006604328751564, + "learning_rate": 9.435331648782324e-05, + "loss": 0.011904004961252213, + "num_input_tokens_seen": 43576536, + "step": 2661, + "train_runtime": 21625.0507, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.6133333333333333, + "grad_norm": 0.0019992270972579718, + "learning_rate": 9.43488764539787e-05, + "loss": 0.013505300506949425, + "num_input_tokens_seen": 43592912, + "step": 2662, + "train_runtime": 21633.1713, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.6139393939393938, + "grad_norm": 0.01781928539276123, + "learning_rate": 9.434443477975558e-05, + "loss": 0.012036633677780628, + "num_input_tokens_seen": 43609288, + "step": 2663, + "train_runtime": 21641.2871, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.6145454545454545, + "grad_norm": 0.009235371835529804, + "learning_rate": 9.433999146531815e-05, + "loss": 0.012760656885802746, + "num_input_tokens_seen": 43625664, + "step": 2664, + "train_runtime": 21649.4077, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.6151515151515152, + "grad_norm": 0.010370719246566296, + "learning_rate": 9.433554651083082e-05, + "loss": 0.011637035757303238, + "num_input_tokens_seen": 43642040, + "step": 2665, + "train_runtime": 21657.5339, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.6157575757575757, + "grad_norm": 0.036511119455099106, + "learning_rate": 9.433109991645795e-05, + "loss": 0.011988237500190735, + "num_input_tokens_seen": 43658416, + "step": 2666, + "train_runtime": 21665.6475, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.6163636363636362, + "grad_norm": 0.013008903712034225, + "learning_rate": 9.432665168236401e-05, + "loss": 0.01194752287119627, + "num_input_tokens_seen": 43674792, + "step": 2667, + "train_runtime": 21673.7643, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.616969696969697, + "grad_norm": 0.012565176002681255, + "learning_rate": 9.432220180871358e-05, + "loss": 0.012999428436160088, + "num_input_tokens_seen": 43691168, + "step": 2668, + "train_runtime": 21681.8784, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.6175757575757577, + "grad_norm": 0.005679360590875149, + "learning_rate": 9.43177502956712e-05, + "loss": 0.012297439388930798, + "num_input_tokens_seen": 43707544, + "step": 2669, + "train_runtime": 21689.994, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.6181818181818182, + "grad_norm": 0.015765322372317314, + "learning_rate": 9.431329714340154e-05, + "loss": 0.012844718061387539, + "num_input_tokens_seen": 43723920, + "step": 2670, + "train_runtime": 21698.1079, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.6187878787878787, + "grad_norm": 0.005528231151401997, + "learning_rate": 9.43088423520693e-05, + "loss": 0.012213103473186493, + "num_input_tokens_seen": 43740296, + "step": 2671, + "train_runtime": 21706.2193, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.6193939393939394, + "grad_norm": 0.020404066890478134, + "learning_rate": 9.430438592183928e-05, + "loss": 0.011327740736305714, + "num_input_tokens_seen": 43756672, + "step": 2672, + "train_runtime": 21714.334, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.62, + "grad_norm": 0.00987694226205349, + "learning_rate": 9.429992785287632e-05, + "loss": 0.010421988554298878, + "num_input_tokens_seen": 43773048, + "step": 2673, + "train_runtime": 21722.4492, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.6206060606060606, + "grad_norm": 0.0061499676667153835, + "learning_rate": 9.429546814534529e-05, + "loss": 0.011812202632427216, + "num_input_tokens_seen": 43789424, + "step": 2674, + "train_runtime": 21730.5672, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.621212121212121, + "grad_norm": 0.00769574660807848, + "learning_rate": 9.429100679941114e-05, + "loss": 0.010980362072587013, + "num_input_tokens_seen": 43805800, + "step": 2675, + "train_runtime": 21738.6884, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.6218181818181818, + "grad_norm": 0.011081526055932045, + "learning_rate": 9.428654381523892e-05, + "loss": 0.012496310286223888, + "num_input_tokens_seen": 43822176, + "step": 2676, + "train_runtime": 21746.807, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.6224242424242425, + "grad_norm": 0.013930363580584526, + "learning_rate": 9.428207919299368e-05, + "loss": 0.013375709764659405, + "num_input_tokens_seen": 43838552, + "step": 2677, + "train_runtime": 21754.9233, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.623030303030303, + "grad_norm": 0.016915637999773026, + "learning_rate": 9.427761293284057e-05, + "loss": 0.014272555708885193, + "num_input_tokens_seen": 43854928, + "step": 2678, + "train_runtime": 21763.0382, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.6236363636363635, + "grad_norm": 0.011937938630580902, + "learning_rate": 9.427314503494477e-05, + "loss": 0.012509403750300407, + "num_input_tokens_seen": 43871304, + "step": 2679, + "train_runtime": 21771.1558, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.6242424242424243, + "grad_norm": 0.014566425234079361, + "learning_rate": 9.426867549947158e-05, + "loss": 0.012007796205580235, + "num_input_tokens_seen": 43887680, + "step": 2680, + "train_runtime": 21779.2721, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.624848484848485, + "grad_norm": 0.011659136973321438, + "learning_rate": 9.426420432658627e-05, + "loss": 0.012796234339475632, + "num_input_tokens_seen": 43904056, + "step": 2681, + "train_runtime": 21787.388, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.6254545454545455, + "grad_norm": 0.007594785187393427, + "learning_rate": 9.425973151645426e-05, + "loss": 0.012028402648866177, + "num_input_tokens_seen": 43920432, + "step": 2682, + "train_runtime": 21795.5065, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.626060606060606, + "grad_norm": 0.017702842131257057, + "learning_rate": 9.425525706924096e-05, + "loss": 0.01287344191223383, + "num_input_tokens_seen": 43936808, + "step": 2683, + "train_runtime": 21803.6224, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 0.010299012064933777, + "learning_rate": 9.425078098511188e-05, + "loss": 0.01260048896074295, + "num_input_tokens_seen": 43953184, + "step": 2684, + "train_runtime": 21811.7422, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.6272727272727274, + "grad_norm": 0.009039688855409622, + "learning_rate": 9.424630326423259e-05, + "loss": 0.011654467321932316, + "num_input_tokens_seen": 43969560, + "step": 2685, + "train_runtime": 21819.8614, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.627878787878788, + "grad_norm": 0.006915550213307142, + "learning_rate": 9.424182390676872e-05, + "loss": 0.011898837052285671, + "num_input_tokens_seen": 43985936, + "step": 2686, + "train_runtime": 21827.9726, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.6284848484848484, + "grad_norm": 0.01088861282914877, + "learning_rate": 9.423734291288592e-05, + "loss": 0.012327456846833229, + "num_input_tokens_seen": 44002312, + "step": 2687, + "train_runtime": 21836.0845, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.6290909090909091, + "grad_norm": 0.012999316677451134, + "learning_rate": 9.423286028274997e-05, + "loss": 0.013050897978246212, + "num_input_tokens_seen": 44018688, + "step": 2688, + "train_runtime": 21844.1972, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.6296969696969696, + "grad_norm": 0.009379571303725243, + "learning_rate": 9.422837601652665e-05, + "loss": 0.012682823464274406, + "num_input_tokens_seen": 44035064, + "step": 2689, + "train_runtime": 21852.3117, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.6303030303030304, + "grad_norm": 0.00579515565186739, + "learning_rate": 9.422389011438184e-05, + "loss": 0.01177308801561594, + "num_input_tokens_seen": 44051440, + "step": 2690, + "train_runtime": 21860.4334, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.6309090909090909, + "grad_norm": 0.0018515904666855931, + "learning_rate": 9.421940257648146e-05, + "loss": 0.012456430122256279, + "num_input_tokens_seen": 44067816, + "step": 2691, + "train_runtime": 21868.5457, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.6315151515151514, + "grad_norm": 0.013134176842868328, + "learning_rate": 9.421491340299148e-05, + "loss": 0.012923019006848335, + "num_input_tokens_seen": 44084192, + "step": 2692, + "train_runtime": 21876.6591, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.632121212121212, + "grad_norm": 0.007743083406239748, + "learning_rate": 9.421042259407796e-05, + "loss": 0.011915619485080242, + "num_input_tokens_seen": 44100568, + "step": 2693, + "train_runtime": 21884.772, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.6327272727272728, + "grad_norm": 0.005690127145498991, + "learning_rate": 9.4205930149907e-05, + "loss": 0.010516472160816193, + "num_input_tokens_seen": 44116944, + "step": 2694, + "train_runtime": 21892.8858, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.6333333333333333, + "grad_norm": 0.011323309503495693, + "learning_rate": 9.420143607064478e-05, + "loss": 0.012086287140846252, + "num_input_tokens_seen": 44133320, + "step": 2695, + "train_runtime": 21901.0625, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.6339393939393938, + "grad_norm": 0.007330689113587141, + "learning_rate": 9.419694035645751e-05, + "loss": 0.012611635960638523, + "num_input_tokens_seen": 44149696, + "step": 2696, + "train_runtime": 21909.176, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.6345454545454545, + "grad_norm": 0.006508952938020229, + "learning_rate": 9.41924430075115e-05, + "loss": 0.010217411443591118, + "num_input_tokens_seen": 44166072, + "step": 2697, + "train_runtime": 21917.2911, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.6351515151515152, + "grad_norm": 0.009744730778038502, + "learning_rate": 9.418794402397307e-05, + "loss": 0.01241858210414648, + "num_input_tokens_seen": 44182448, + "step": 2698, + "train_runtime": 21925.4067, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.6357575757575757, + "grad_norm": 0.011063654907047749, + "learning_rate": 9.418344340600865e-05, + "loss": 0.011393502354621887, + "num_input_tokens_seen": 44198824, + "step": 2699, + "train_runtime": 21933.5244, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.6363636363636362, + "grad_norm": 0.006254155188798904, + "learning_rate": 9.41789411537847e-05, + "loss": 0.01240509282797575, + "num_input_tokens_seen": 44215200, + "step": 2700, + "train_runtime": 21941.6385, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.636969696969697, + "grad_norm": 0.01012837328016758, + "learning_rate": 9.417443726746776e-05, + "loss": 0.011583870276808739, + "num_input_tokens_seen": 44231576, + "step": 2701, + "train_runtime": 21950.696, + "train_tokens_per_second": 2015.042 + }, + { + "epoch": 1.6375757575757577, + "grad_norm": 0.013250237330794334, + "learning_rate": 9.416993174722439e-05, + "loss": 0.012408467009663582, + "num_input_tokens_seen": 44247952, + "step": 2702, + "train_runtime": 21958.8131, + "train_tokens_per_second": 2015.043 + }, + { + "epoch": 1.6381818181818182, + "grad_norm": 0.006648391485214233, + "learning_rate": 9.416542459322129e-05, + "loss": 0.013180596753954887, + "num_input_tokens_seen": 44264328, + "step": 2703, + "train_runtime": 21966.9333, + "train_tokens_per_second": 2015.044 + }, + { + "epoch": 1.6387878787878787, + "grad_norm": 0.009948927909135818, + "learning_rate": 9.416091580562512e-05, + "loss": 0.012322509661316872, + "num_input_tokens_seen": 44280704, + "step": 2704, + "train_runtime": 21975.0494, + "train_tokens_per_second": 2015.045 + }, + { + "epoch": 1.6393939393939394, + "grad_norm": 0.0077480534091591835, + "learning_rate": 9.415640538460267e-05, + "loss": 0.011874118819832802, + "num_input_tokens_seen": 44297080, + "step": 2705, + "train_runtime": 21983.1643, + "train_tokens_per_second": 2015.046 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 0.009989479556679726, + "learning_rate": 9.41518933303208e-05, + "loss": 0.013704563491046429, + "num_input_tokens_seen": 44313456, + "step": 2706, + "train_runtime": 21991.2775, + "train_tokens_per_second": 2015.047 + }, + { + "epoch": 1.6406060606060606, + "grad_norm": 0.00627144007012248, + "learning_rate": 9.414737964294636e-05, + "loss": 0.01262811291962862, + "num_input_tokens_seen": 44329832, + "step": 2707, + "train_runtime": 21999.3947, + "train_tokens_per_second": 2015.048 + }, + { + "epoch": 1.6412121212121211, + "grad_norm": 0.009063741192221642, + "learning_rate": 9.414286432264631e-05, + "loss": 0.013369398191571236, + "num_input_tokens_seen": 44346208, + "step": 2708, + "train_runtime": 22007.5139, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.6418181818181818, + "grad_norm": 0.007614328060299158, + "learning_rate": 9.413834736958768e-05, + "loss": 0.012139086611568928, + "num_input_tokens_seen": 44362584, + "step": 2709, + "train_runtime": 22015.6353, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.6424242424242426, + "grad_norm": 0.007654812186956406, + "learning_rate": 9.413382878393754e-05, + "loss": 0.011492074467241764, + "num_input_tokens_seen": 44378960, + "step": 2710, + "train_runtime": 22023.752, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.643030303030303, + "grad_norm": 0.007990519516170025, + "learning_rate": 9.412930856586304e-05, + "loss": 0.012617778033018112, + "num_input_tokens_seen": 44395336, + "step": 2711, + "train_runtime": 22031.8714, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.6436363636363636, + "grad_norm": 0.004427951294928789, + "learning_rate": 9.412478671553134e-05, + "loss": 0.011064354330301285, + "num_input_tokens_seen": 44411712, + "step": 2712, + "train_runtime": 22039.991, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.6442424242424243, + "grad_norm": 0.011509922333061695, + "learning_rate": 9.41202632331097e-05, + "loss": 0.012677370570600033, + "num_input_tokens_seen": 44428088, + "step": 2713, + "train_runtime": 22048.1121, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.644848484848485, + "grad_norm": 0.004903316497802734, + "learning_rate": 9.411573811876544e-05, + "loss": 0.011655345559120178, + "num_input_tokens_seen": 44444464, + "step": 2714, + "train_runtime": 22056.234, + "train_tokens_per_second": 2015.052 + }, + { + "epoch": 1.6454545454545455, + "grad_norm": 0.004879241809248924, + "learning_rate": 9.411121137266595e-05, + "loss": 0.011117602698504925, + "num_input_tokens_seen": 44460840, + "step": 2715, + "train_runtime": 22064.3531, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.646060606060606, + "grad_norm": 0.014790347777307034, + "learning_rate": 9.410668299497864e-05, + "loss": 0.013863930478692055, + "num_input_tokens_seen": 44477216, + "step": 2716, + "train_runtime": 22072.4672, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.6466666666666665, + "grad_norm": 0.01018503587692976, + "learning_rate": 9.410215298587104e-05, + "loss": 0.010815615765750408, + "num_input_tokens_seen": 44493592, + "step": 2717, + "train_runtime": 22080.5833, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.6472727272727272, + "grad_norm": 0.005130484700202942, + "learning_rate": 9.409762134551068e-05, + "loss": 0.012508288025856018, + "num_input_tokens_seen": 44509968, + "step": 2718, + "train_runtime": 22088.7033, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.647878787878788, + "grad_norm": 0.007447066716849804, + "learning_rate": 9.409308807406518e-05, + "loss": 0.011768568307161331, + "num_input_tokens_seen": 44526344, + "step": 2719, + "train_runtime": 22096.8327, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.6484848484848484, + "grad_norm": 0.007597601041197777, + "learning_rate": 9.408855317170222e-05, + "loss": 0.011454282328486443, + "num_input_tokens_seen": 44542720, + "step": 2720, + "train_runtime": 22104.9518, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.649090909090909, + "grad_norm": 0.00861985981464386, + "learning_rate": 9.408401663858953e-05, + "loss": 0.011637267656624317, + "num_input_tokens_seen": 44559096, + "step": 2721, + "train_runtime": 22113.0731, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.6496969696969697, + "grad_norm": 0.008903435431420803, + "learning_rate": 9.407947847489494e-05, + "loss": 0.01246644090861082, + "num_input_tokens_seen": 44575472, + "step": 2722, + "train_runtime": 22121.1881, + "train_tokens_per_second": 2015.058 + }, + { + "epoch": 1.6503030303030304, + "grad_norm": 0.013762188144028187, + "learning_rate": 9.407493868078625e-05, + "loss": 0.012603108771145344, + "num_input_tokens_seen": 44591848, + "step": 2723, + "train_runtime": 22129.3025, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.6509090909090909, + "grad_norm": 0.007037809584289789, + "learning_rate": 9.407039725643142e-05, + "loss": 0.011482897214591503, + "num_input_tokens_seen": 44608224, + "step": 2724, + "train_runtime": 22137.4172, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.6515151515151514, + "grad_norm": 0.0225937832146883, + "learning_rate": 9.406585420199843e-05, + "loss": 0.011697374284267426, + "num_input_tokens_seen": 44624600, + "step": 2725, + "train_runtime": 22145.5353, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.652121212121212, + "grad_norm": 0.004367986228317022, + "learning_rate": 9.406130951765529e-05, + "loss": 0.01129780150949955, + "num_input_tokens_seen": 44640976, + "step": 2726, + "train_runtime": 22153.6512, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.6527272727272728, + "grad_norm": 0.008844173513352871, + "learning_rate": 9.405676320357013e-05, + "loss": 0.010827938094735146, + "num_input_tokens_seen": 44657352, + "step": 2727, + "train_runtime": 22161.7685, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 0.009946335107088089, + "learning_rate": 9.405221525991108e-05, + "loss": 0.01152572687715292, + "num_input_tokens_seen": 44673728, + "step": 2728, + "train_runtime": 22169.8861, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.6539393939393938, + "grad_norm": 0.00656983582302928, + "learning_rate": 9.40476656868464e-05, + "loss": 0.012073036283254623, + "num_input_tokens_seen": 44690104, + "step": 2729, + "train_runtime": 22178.0059, + "train_tokens_per_second": 2015.064 + }, + { + "epoch": 1.6545454545454545, + "grad_norm": 0.008812750689685345, + "learning_rate": 9.404311448454433e-05, + "loss": 0.011705856770277023, + "num_input_tokens_seen": 44706480, + "step": 2730, + "train_runtime": 22186.1189, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.6551515151515153, + "grad_norm": 0.0035067375283688307, + "learning_rate": 9.403856165317321e-05, + "loss": 0.012019923888146877, + "num_input_tokens_seen": 44722856, + "step": 2731, + "train_runtime": 22194.2334, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.6557575757575758, + "grad_norm": 0.015511504374444485, + "learning_rate": 9.403400719290147e-05, + "loss": 0.01346611324697733, + "num_input_tokens_seen": 44739232, + "step": 2732, + "train_runtime": 22202.352, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.6563636363636363, + "grad_norm": 0.007372966967523098, + "learning_rate": 9.402945110389757e-05, + "loss": 0.012887794524431229, + "num_input_tokens_seen": 44755608, + "step": 2733, + "train_runtime": 22210.4716, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.656969696969697, + "grad_norm": 0.023236919194459915, + "learning_rate": 9.402489338633001e-05, + "loss": 0.01442014705389738, + "num_input_tokens_seen": 44771984, + "step": 2734, + "train_runtime": 22218.5872, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.6575757575757577, + "grad_norm": 0.007083198521286249, + "learning_rate": 9.402033404036736e-05, + "loss": 0.012199325487017632, + "num_input_tokens_seen": 44788360, + "step": 2735, + "train_runtime": 22226.7057, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.6581818181818182, + "grad_norm": 0.008631768636405468, + "learning_rate": 9.40157730661783e-05, + "loss": 0.01288297027349472, + "num_input_tokens_seen": 44804736, + "step": 2736, + "train_runtime": 22234.834, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.6587878787878787, + "grad_norm": 0.009771174751222134, + "learning_rate": 9.401121046393151e-05, + "loss": 0.01390773430466652, + "num_input_tokens_seen": 44821112, + "step": 2737, + "train_runtime": 22242.9515, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.6593939393939394, + "grad_norm": 0.007200576830655336, + "learning_rate": 9.400664623379573e-05, + "loss": 0.01203729398548603, + "num_input_tokens_seen": 44837488, + "step": 2738, + "train_runtime": 22251.0674, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 0.006853716913610697, + "learning_rate": 9.400208037593983e-05, + "loss": 0.012496777810156345, + "num_input_tokens_seen": 44853864, + "step": 2739, + "train_runtime": 22259.1856, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.6606060606060606, + "grad_norm": 0.017664391547441483, + "learning_rate": 9.399751289053267e-05, + "loss": 0.013294186443090439, + "num_input_tokens_seen": 44870240, + "step": 2740, + "train_runtime": 22267.3004, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.6612121212121211, + "grad_norm": 0.005868007894605398, + "learning_rate": 9.399294377774318e-05, + "loss": 0.012370945885777473, + "num_input_tokens_seen": 44886616, + "step": 2741, + "train_runtime": 22275.4172, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.6618181818181819, + "grad_norm": 0.014271008782088757, + "learning_rate": 9.398837303774037e-05, + "loss": 0.01195800956338644, + "num_input_tokens_seen": 44902992, + "step": 2742, + "train_runtime": 22283.5344, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.6624242424242426, + "grad_norm": 0.012290251441299915, + "learning_rate": 9.39838006706933e-05, + "loss": 0.0112903518602252, + "num_input_tokens_seen": 44919368, + "step": 2743, + "train_runtime": 22291.6532, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.663030303030303, + "grad_norm": 0.006131591275334358, + "learning_rate": 9.39792266767711e-05, + "loss": 0.012527124024927616, + "num_input_tokens_seen": 44935744, + "step": 2744, + "train_runtime": 22299.7686, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.6636363636363636, + "grad_norm": 0.019052183255553246, + "learning_rate": 9.397465105614296e-05, + "loss": 0.013172638602554798, + "num_input_tokens_seen": 44952120, + "step": 2745, + "train_runtime": 22307.8823, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.664242424242424, + "grad_norm": 0.013884824700653553, + "learning_rate": 9.39700738089781e-05, + "loss": 0.012267852202057838, + "num_input_tokens_seen": 44968496, + "step": 2746, + "train_runtime": 22315.9938, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.6648484848484848, + "grad_norm": 0.00940402876585722, + "learning_rate": 9.396549493544584e-05, + "loss": 0.011282133869826794, + "num_input_tokens_seen": 44984872, + "step": 2747, + "train_runtime": 22324.1106, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.6654545454545455, + "grad_norm": 0.004152586217969656, + "learning_rate": 9.396091443571555e-05, + "loss": 0.010489017702639103, + "num_input_tokens_seen": 45001248, + "step": 2748, + "train_runtime": 22332.2249, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.666060606060606, + "grad_norm": 0.010575785301625729, + "learning_rate": 9.395633230995664e-05, + "loss": 0.011070530861616135, + "num_input_tokens_seen": 45017624, + "step": 2749, + "train_runtime": 22340.3421, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 0.010641394183039665, + "learning_rate": 9.39517485583386e-05, + "loss": 0.012648802250623703, + "num_input_tokens_seen": 45034000, + "step": 2750, + "train_runtime": 22348.4522, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.6672727272727272, + "grad_norm": 0.008935361169278622, + "learning_rate": 9.394716318103098e-05, + "loss": 0.01304717268794775, + "num_input_tokens_seen": 45050376, + "step": 2751, + "train_runtime": 22356.568, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.667878787878788, + "grad_norm": 0.009867693297564983, + "learning_rate": 9.394257617820336e-05, + "loss": 0.011446228250861168, + "num_input_tokens_seen": 45066752, + "step": 2752, + "train_runtime": 22364.6824, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.6684848484848485, + "grad_norm": 0.007289520464837551, + "learning_rate": 9.393798755002544e-05, + "loss": 0.012150555849075317, + "num_input_tokens_seen": 45083128, + "step": 2753, + "train_runtime": 22372.7957, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.669090909090909, + "grad_norm": 0.01773468591272831, + "learning_rate": 9.393339729666693e-05, + "loss": 0.01132100261747837, + "num_input_tokens_seen": 45099504, + "step": 2754, + "train_runtime": 22380.9119, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.6696969696969697, + "grad_norm": 0.01178825180977583, + "learning_rate": 9.392880541829758e-05, + "loss": 0.013354834169149399, + "num_input_tokens_seen": 45115880, + "step": 2755, + "train_runtime": 22389.0244, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.6703030303030304, + "grad_norm": 0.012685943394899368, + "learning_rate": 9.392421191508729e-05, + "loss": 0.012555164285004139, + "num_input_tokens_seen": 45132256, + "step": 2756, + "train_runtime": 22397.1411, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.670909090909091, + "grad_norm": 0.008436055853962898, + "learning_rate": 9.391961678720594e-05, + "loss": 0.012797040864825249, + "num_input_tokens_seen": 45148632, + "step": 2757, + "train_runtime": 22405.2557, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.6715151515151514, + "grad_norm": 0.011204235255718231, + "learning_rate": 9.391502003482349e-05, + "loss": 0.01226828433573246, + "num_input_tokens_seen": 45165008, + "step": 2758, + "train_runtime": 22413.372, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.6721212121212121, + "grad_norm": 0.007430667523294687, + "learning_rate": 9.391042165810996e-05, + "loss": 0.013038146309554577, + "num_input_tokens_seen": 45181384, + "step": 2759, + "train_runtime": 22421.4906, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.6727272727272728, + "grad_norm": 0.012358262203633785, + "learning_rate": 9.390582165723544e-05, + "loss": 0.012622443027794361, + "num_input_tokens_seen": 45197760, + "step": 2760, + "train_runtime": 22429.6043, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.6733333333333333, + "grad_norm": 0.009910723194479942, + "learning_rate": 9.39012200323701e-05, + "loss": 0.013371050357818604, + "num_input_tokens_seen": 45214136, + "step": 2761, + "train_runtime": 22437.7158, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.6739393939393938, + "grad_norm": 0.006673166062682867, + "learning_rate": 9.389661678368413e-05, + "loss": 0.012958286330103874, + "num_input_tokens_seen": 45230512, + "step": 2762, + "train_runtime": 22445.8348, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.6745454545454546, + "grad_norm": 0.010125633329153061, + "learning_rate": 9.389201191134776e-05, + "loss": 0.011834094300866127, + "num_input_tokens_seen": 45246888, + "step": 2763, + "train_runtime": 22453.9495, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.6751515151515153, + "grad_norm": 0.00810938235372305, + "learning_rate": 9.388740541553138e-05, + "loss": 0.012278936803340912, + "num_input_tokens_seen": 45263264, + "step": 2764, + "train_runtime": 22462.0643, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.6757575757575758, + "grad_norm": 0.009514245204627514, + "learning_rate": 9.388279729640531e-05, + "loss": 0.011925340630114079, + "num_input_tokens_seen": 45279640, + "step": 2765, + "train_runtime": 22470.1772, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.6763636363636363, + "grad_norm": 0.015000533312559128, + "learning_rate": 9.387818755414004e-05, + "loss": 0.011756815947592258, + "num_input_tokens_seen": 45296016, + "step": 2766, + "train_runtime": 22478.2921, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.676969696969697, + "grad_norm": 0.008925878442823887, + "learning_rate": 9.387357618890606e-05, + "loss": 0.011443368159234524, + "num_input_tokens_seen": 45312392, + "step": 2767, + "train_runtime": 22486.4077, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.6775757575757577, + "grad_norm": 0.008592803962528706, + "learning_rate": 9.386896320087392e-05, + "loss": 0.012534118257462978, + "num_input_tokens_seen": 45328768, + "step": 2768, + "train_runtime": 22494.5243, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.6781818181818182, + "grad_norm": 0.007586160209029913, + "learning_rate": 9.386434859021429e-05, + "loss": 0.011447837576270103, + "num_input_tokens_seen": 45345144, + "step": 2769, + "train_runtime": 22502.6471, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.6787878787878787, + "grad_norm": 0.004868637304753065, + "learning_rate": 9.385973235709781e-05, + "loss": 0.011121487244963646, + "num_input_tokens_seen": 45361520, + "step": 2770, + "train_runtime": 22510.7612, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.6793939393939394, + "grad_norm": 0.01013186015188694, + "learning_rate": 9.385511450169525e-05, + "loss": 0.012332988902926445, + "num_input_tokens_seen": 45377896, + "step": 2771, + "train_runtime": 22518.8756, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 0.013302835635840893, + "learning_rate": 9.385049502417742e-05, + "loss": 0.012212036177515984, + "num_input_tokens_seen": 45394272, + "step": 2772, + "train_runtime": 22526.9886, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.6806060606060607, + "grad_norm": 0.01163007877767086, + "learning_rate": 9.384587392471515e-05, + "loss": 0.011997763998806477, + "num_input_tokens_seen": 45410648, + "step": 2773, + "train_runtime": 22535.105, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.6812121212121212, + "grad_norm": 0.010505879297852516, + "learning_rate": 9.38412512034794e-05, + "loss": 0.013251842930912971, + "num_input_tokens_seen": 45427024, + "step": 2774, + "train_runtime": 22543.222, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.6818181818181817, + "grad_norm": 0.009793641045689583, + "learning_rate": 9.383662686064114e-05, + "loss": 0.012996641919016838, + "num_input_tokens_seen": 45443400, + "step": 2775, + "train_runtime": 22551.3383, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.6824242424242424, + "grad_norm": 0.010234086774289608, + "learning_rate": 9.383200089637143e-05, + "loss": 0.01099113654345274, + "num_input_tokens_seen": 45459776, + "step": 2776, + "train_runtime": 22559.4538, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.683030303030303, + "grad_norm": 0.013903598301112652, + "learning_rate": 9.382737331084137e-05, + "loss": 0.01228981651365757, + "num_input_tokens_seen": 45476152, + "step": 2777, + "train_runtime": 22567.5733, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.6836363636363636, + "grad_norm": 0.010938679799437523, + "learning_rate": 9.382274410422211e-05, + "loss": 0.012848911806941032, + "num_input_tokens_seen": 45492528, + "step": 2778, + "train_runtime": 22575.6879, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.684242424242424, + "grad_norm": 0.022132035344839096, + "learning_rate": 9.381811327668488e-05, + "loss": 0.012318627908825874, + "num_input_tokens_seen": 45508904, + "step": 2779, + "train_runtime": 22583.8014, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.6848484848484848, + "grad_norm": 0.011289408430457115, + "learning_rate": 9.381348082840098e-05, + "loss": 0.011903293430805206, + "num_input_tokens_seen": 45525280, + "step": 2780, + "train_runtime": 22591.9158, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.6854545454545455, + "grad_norm": 0.019458260387182236, + "learning_rate": 9.380884675954176e-05, + "loss": 0.013369610533118248, + "num_input_tokens_seen": 45541656, + "step": 2781, + "train_runtime": 22600.0616, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.686060606060606, + "grad_norm": 0.014678013511002064, + "learning_rate": 9.380421107027859e-05, + "loss": 0.012422224506735802, + "num_input_tokens_seen": 45558032, + "step": 2782, + "train_runtime": 22608.1756, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.6866666666666665, + "grad_norm": 0.008445504121482372, + "learning_rate": 9.379957376078297e-05, + "loss": 0.011338223703205585, + "num_input_tokens_seen": 45574408, + "step": 2783, + "train_runtime": 22616.2904, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.6872727272727273, + "grad_norm": 0.01064318511635065, + "learning_rate": 9.379493483122642e-05, + "loss": 0.01236021425575018, + "num_input_tokens_seen": 45590784, + "step": 2784, + "train_runtime": 22624.4065, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.687878787878788, + "grad_norm": 0.006612302735447884, + "learning_rate": 9.37902942817805e-05, + "loss": 0.011895343661308289, + "num_input_tokens_seen": 45607160, + "step": 2785, + "train_runtime": 22632.5329, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.6884848484848485, + "grad_norm": 0.030377233400940895, + "learning_rate": 9.378565211261687e-05, + "loss": 0.013250184245407581, + "num_input_tokens_seen": 45623536, + "step": 2786, + "train_runtime": 22640.6471, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.689090909090909, + "grad_norm": 0.01889094151556492, + "learning_rate": 9.378100832390727e-05, + "loss": 0.014152915216982365, + "num_input_tokens_seen": 45639912, + "step": 2787, + "train_runtime": 22648.7656, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.6896969696969697, + "grad_norm": 0.008286499418318272, + "learning_rate": 9.377636291582339e-05, + "loss": 0.01310395635664463, + "num_input_tokens_seen": 45656288, + "step": 2788, + "train_runtime": 22656.882, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.6903030303030304, + "grad_norm": 0.01981155388057232, + "learning_rate": 9.377171588853712e-05, + "loss": 0.011624631471931934, + "num_input_tokens_seen": 45672664, + "step": 2789, + "train_runtime": 22664.9985, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.690909090909091, + "grad_norm": 0.007989699020981789, + "learning_rate": 9.376706724222031e-05, + "loss": 0.012425190769135952, + "num_input_tokens_seen": 45689040, + "step": 2790, + "train_runtime": 22673.1109, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.6915151515151514, + "grad_norm": 0.01827121339738369, + "learning_rate": 9.376241697704493e-05, + "loss": 0.012519946321845055, + "num_input_tokens_seen": 45705416, + "step": 2791, + "train_runtime": 22681.2318, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.6921212121212121, + "grad_norm": 0.032639652490615845, + "learning_rate": 9.375776509318296e-05, + "loss": 0.012380331754684448, + "num_input_tokens_seen": 45721792, + "step": 2792, + "train_runtime": 22689.3466, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.6927272727272729, + "grad_norm": 0.02035784162580967, + "learning_rate": 9.375311159080647e-05, + "loss": 0.011968818493187428, + "num_input_tokens_seen": 45738168, + "step": 2793, + "train_runtime": 22697.4604, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.6933333333333334, + "grad_norm": 0.025810981169342995, + "learning_rate": 9.374845647008758e-05, + "loss": 0.012797437608242035, + "num_input_tokens_seen": 45754544, + "step": 2794, + "train_runtime": 22705.5736, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.6939393939393939, + "grad_norm": 0.01363008376210928, + "learning_rate": 9.37437997311985e-05, + "loss": 0.012758086435496807, + "num_input_tokens_seen": 45770920, + "step": 2795, + "train_runtime": 22713.6878, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.6945454545454546, + "grad_norm": 0.013838708400726318, + "learning_rate": 9.373914137431146e-05, + "loss": 0.011740066111087799, + "num_input_tokens_seen": 45787296, + "step": 2796, + "train_runtime": 22721.8028, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.6951515151515153, + "grad_norm": 0.013491389341652393, + "learning_rate": 9.373448139959873e-05, + "loss": 0.011969367042183876, + "num_input_tokens_seen": 45803672, + "step": 2797, + "train_runtime": 22729.921, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.6957575757575758, + "grad_norm": 0.006902933586388826, + "learning_rate": 9.372981980723272e-05, + "loss": 0.01120357122272253, + "num_input_tokens_seen": 45820048, + "step": 2798, + "train_runtime": 22738.0404, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.6963636363636363, + "grad_norm": 0.01884661242365837, + "learning_rate": 9.372515659738583e-05, + "loss": 0.012923416681587696, + "num_input_tokens_seen": 45836424, + "step": 2799, + "train_runtime": 22746.156, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.696969696969697, + "grad_norm": 0.009512597694993019, + "learning_rate": 9.372049177023055e-05, + "loss": 0.012344890274107456, + "num_input_tokens_seen": 45852800, + "step": 2800, + "train_runtime": 22754.2693, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.6975757575757577, + "grad_norm": 0.016322260722517967, + "learning_rate": 9.371582532593943e-05, + "loss": 0.013447512872517109, + "num_input_tokens_seen": 45869176, + "step": 2801, + "train_runtime": 22763.3025, + "train_tokens_per_second": 2015.049 + }, + { + "epoch": 1.6981818181818182, + "grad_norm": 0.01216021366417408, + "learning_rate": 9.371115726468505e-05, + "loss": 0.013008292764425278, + "num_input_tokens_seen": 45885552, + "step": 2802, + "train_runtime": 22771.4191, + "train_tokens_per_second": 2015.05 + }, + { + "epoch": 1.6987878787878787, + "grad_norm": 0.039111893624067307, + "learning_rate": 9.37064875866401e-05, + "loss": 0.014293879270553589, + "num_input_tokens_seen": 45901928, + "step": 2803, + "train_runtime": 22779.5344, + "train_tokens_per_second": 2015.051 + }, + { + "epoch": 1.6993939393939392, + "grad_norm": 0.0064725009724497795, + "learning_rate": 9.370181629197728e-05, + "loss": 0.011476884596049786, + "num_input_tokens_seen": 45918304, + "step": 2804, + "train_runtime": 22787.6455, + "train_tokens_per_second": 2015.053 + }, + { + "epoch": 1.7, + "grad_norm": 0.010711527429521084, + "learning_rate": 9.369714338086939e-05, + "loss": 0.012465567328035831, + "num_input_tokens_seen": 45934680, + "step": 2805, + "train_runtime": 22795.7584, + "train_tokens_per_second": 2015.054 + }, + { + "epoch": 1.7006060606060607, + "grad_norm": 0.006154636852443218, + "learning_rate": 9.369246885348926e-05, + "loss": 0.012477940879762173, + "num_input_tokens_seen": 45951056, + "step": 2806, + "train_runtime": 22803.872, + "train_tokens_per_second": 2015.055 + }, + { + "epoch": 1.7012121212121212, + "grad_norm": 0.008175252936780453, + "learning_rate": 9.368779271000978e-05, + "loss": 0.011836701072752476, + "num_input_tokens_seen": 45967432, + "step": 2807, + "train_runtime": 22811.9839, + "train_tokens_per_second": 2015.056 + }, + { + "epoch": 1.7018181818181817, + "grad_norm": 0.00616717291995883, + "learning_rate": 9.368311495060393e-05, + "loss": 0.010793658904731274, + "num_input_tokens_seen": 45983808, + "step": 2808, + "train_runtime": 22820.0984, + "train_tokens_per_second": 2015.057 + }, + { + "epoch": 1.7024242424242424, + "grad_norm": 0.03607097640633583, + "learning_rate": 9.367843557544474e-05, + "loss": 0.014611356891691685, + "num_input_tokens_seen": 46000184, + "step": 2809, + "train_runtime": 22828.2107, + "train_tokens_per_second": 2015.059 + }, + { + "epoch": 1.7030303030303031, + "grad_norm": 0.018071550875902176, + "learning_rate": 9.367375458470526e-05, + "loss": 0.01233680546283722, + "num_input_tokens_seen": 46016560, + "step": 2810, + "train_runtime": 22836.3246, + "train_tokens_per_second": 2015.06 + }, + { + "epoch": 1.7036363636363636, + "grad_norm": 0.00596799748018384, + "learning_rate": 9.366907197855868e-05, + "loss": 0.011606480926275253, + "num_input_tokens_seen": 46032936, + "step": 2811, + "train_runtime": 22844.4382, + "train_tokens_per_second": 2015.061 + }, + { + "epoch": 1.7042424242424241, + "grad_norm": 0.007533122319728136, + "learning_rate": 9.366438775717814e-05, + "loss": 0.012357478961348534, + "num_input_tokens_seen": 46049312, + "step": 2812, + "train_runtime": 22852.5509, + "train_tokens_per_second": 2015.062 + }, + { + "epoch": 1.7048484848484848, + "grad_norm": 0.01864629052579403, + "learning_rate": 9.365970192073694e-05, + "loss": 0.013025326654314995, + "num_input_tokens_seen": 46065688, + "step": 2813, + "train_runtime": 22860.6639, + "train_tokens_per_second": 2015.063 + }, + { + "epoch": 1.7054545454545456, + "grad_norm": 0.008015927858650684, + "learning_rate": 9.365501446940839e-05, + "loss": 0.01284027099609375, + "num_input_tokens_seen": 46082064, + "step": 2814, + "train_runtime": 22868.7759, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.706060606060606, + "grad_norm": 0.010607431642711163, + "learning_rate": 9.365032540336587e-05, + "loss": 0.012594718486070633, + "num_input_tokens_seen": 46098440, + "step": 2815, + "train_runtime": 22876.8894, + "train_tokens_per_second": 2015.066 + }, + { + "epoch": 1.7066666666666666, + "grad_norm": 0.007729121949523687, + "learning_rate": 9.36456347227828e-05, + "loss": 0.013601492159068584, + "num_input_tokens_seen": 46114816, + "step": 2816, + "train_runtime": 22885.0026, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.7072727272727273, + "grad_norm": 0.010761508718132973, + "learning_rate": 9.364094242783272e-05, + "loss": 0.011926738545298576, + "num_input_tokens_seen": 46131192, + "step": 2817, + "train_runtime": 22893.1206, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.707878787878788, + "grad_norm": 0.014684220775961876, + "learning_rate": 9.363624851868916e-05, + "loss": 0.013269368559122086, + "num_input_tokens_seen": 46147568, + "step": 2818, + "train_runtime": 22901.2337, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.7084848484848485, + "grad_norm": 0.008672056719660759, + "learning_rate": 9.363155299552573e-05, + "loss": 0.012723954394459724, + "num_input_tokens_seen": 46163944, + "step": 2819, + "train_runtime": 22909.3467, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.709090909090909, + "grad_norm": 0.007565699517726898, + "learning_rate": 9.362685585851614e-05, + "loss": 0.011728525161743164, + "num_input_tokens_seen": 46180320, + "step": 2820, + "train_runtime": 22917.4588, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.7096969696969697, + "grad_norm": 0.013421568088233471, + "learning_rate": 9.362215710783411e-05, + "loss": 0.010953246615827084, + "num_input_tokens_seen": 46196696, + "step": 2821, + "train_runtime": 22925.5745, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.7103030303030304, + "grad_norm": 0.014821798540651798, + "learning_rate": 9.361745674365345e-05, + "loss": 0.014481105841696262, + "num_input_tokens_seen": 46213072, + "step": 2822, + "train_runtime": 22933.6913, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.710909090909091, + "grad_norm": 0.01066959835588932, + "learning_rate": 9.361275476614798e-05, + "loss": 0.012770322151482105, + "num_input_tokens_seen": 46229448, + "step": 2823, + "train_runtime": 22941.8058, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.7115151515151514, + "grad_norm": 0.02900104783475399, + "learning_rate": 9.360805117549165e-05, + "loss": 0.013311613351106644, + "num_input_tokens_seen": 46245824, + "step": 2824, + "train_runtime": 22949.9216, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.7121212121212122, + "grad_norm": 0.00498326076194644, + "learning_rate": 9.360334597185845e-05, + "loss": 0.011704309843480587, + "num_input_tokens_seen": 46262200, + "step": 2825, + "train_runtime": 22958.0391, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.7127272727272729, + "grad_norm": 0.005870001390576363, + "learning_rate": 9.359863915542238e-05, + "loss": 0.010895316489040852, + "num_input_tokens_seen": 46278576, + "step": 2826, + "train_runtime": 22966.1548, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.7133333333333334, + "grad_norm": 0.008924623019993305, + "learning_rate": 9.359393072635755e-05, + "loss": 0.011977889575064182, + "num_input_tokens_seen": 46294952, + "step": 2827, + "train_runtime": 22974.268, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.7139393939393939, + "grad_norm": 0.006065470166504383, + "learning_rate": 9.358922068483812e-05, + "loss": 0.011861737817525864, + "num_input_tokens_seen": 46311328, + "step": 2828, + "train_runtime": 22982.3833, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.7145454545454546, + "grad_norm": 0.005640857852995396, + "learning_rate": 9.35845090310383e-05, + "loss": 0.011305807158350945, + "num_input_tokens_seen": 46327704, + "step": 2829, + "train_runtime": 22990.5002, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.7151515151515153, + "grad_norm": 0.023728247731924057, + "learning_rate": 9.357979576513238e-05, + "loss": 0.01281740888953209, + "num_input_tokens_seen": 46344080, + "step": 2830, + "train_runtime": 22998.6172, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.7157575757575758, + "grad_norm": 0.006878760643303394, + "learning_rate": 9.357508088729468e-05, + "loss": 0.011280113831162453, + "num_input_tokens_seen": 46360456, + "step": 2831, + "train_runtime": 23006.735, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.7163636363636363, + "grad_norm": 0.012969830073416233, + "learning_rate": 9.35703643976996e-05, + "loss": 0.013124781660735607, + "num_input_tokens_seen": 46376832, + "step": 2832, + "train_runtime": 23014.8512, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.7169696969696968, + "grad_norm": 0.00749437790364027, + "learning_rate": 9.356564629652158e-05, + "loss": 0.011703899130225182, + "num_input_tokens_seen": 46393208, + "step": 2833, + "train_runtime": 23022.9653, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.7175757575757575, + "grad_norm": 0.015355957671999931, + "learning_rate": 9.356092658393514e-05, + "loss": 0.011799749918282032, + "num_input_tokens_seen": 46409584, + "step": 2834, + "train_runtime": 23031.0777, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.7181818181818183, + "grad_norm": 0.006823359522968531, + "learning_rate": 9.355620526011486e-05, + "loss": 0.01280257198959589, + "num_input_tokens_seen": 46425960, + "step": 2835, + "train_runtime": 23039.1904, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.7187878787878788, + "grad_norm": 0.009748456999659538, + "learning_rate": 9.355148232523537e-05, + "loss": 0.013985298573970795, + "num_input_tokens_seen": 46442336, + "step": 2836, + "train_runtime": 23047.3039, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.7193939393939393, + "grad_norm": 0.011919494718313217, + "learning_rate": 9.354675777947138e-05, + "loss": 0.012215827591717243, + "num_input_tokens_seen": 46458712, + "step": 2837, + "train_runtime": 23055.4193, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.72, + "grad_norm": 0.00848611444234848, + "learning_rate": 9.354203162299759e-05, + "loss": 0.012414870783686638, + "num_input_tokens_seen": 46475088, + "step": 2838, + "train_runtime": 23063.5341, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.7206060606060607, + "grad_norm": 0.008148525841534138, + "learning_rate": 9.353730385598887e-05, + "loss": 0.013450459577143192, + "num_input_tokens_seen": 46491464, + "step": 2839, + "train_runtime": 23071.6495, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.7212121212121212, + "grad_norm": 0.006279889028519392, + "learning_rate": 9.353257447862005e-05, + "loss": 0.011724742129445076, + "num_input_tokens_seen": 46507840, + "step": 2840, + "train_runtime": 23079.7611, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.7218181818181817, + "grad_norm": 0.008062539622187614, + "learning_rate": 9.352784349106608e-05, + "loss": 0.01200947817414999, + "num_input_tokens_seen": 46524216, + "step": 2841, + "train_runtime": 23087.8723, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.7224242424242424, + "grad_norm": 0.020648252218961716, + "learning_rate": 9.352311089350195e-05, + "loss": 0.014192345552146435, + "num_input_tokens_seen": 46540592, + "step": 2842, + "train_runtime": 23095.987, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.7230303030303031, + "grad_norm": 0.007957853376865387, + "learning_rate": 9.35183766861027e-05, + "loss": 0.012997648678719997, + "num_input_tokens_seen": 46556968, + "step": 2843, + "train_runtime": 23104.1043, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.7236363636363636, + "grad_norm": 0.011467264033854008, + "learning_rate": 9.351364086904345e-05, + "loss": 0.012560134753584862, + "num_input_tokens_seen": 46573344, + "step": 2844, + "train_runtime": 23112.2178, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.7242424242424241, + "grad_norm": 0.008995643816888332, + "learning_rate": 9.350890344249936e-05, + "loss": 0.012183014303445816, + "num_input_tokens_seen": 46589720, + "step": 2845, + "train_runtime": 23120.3346, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.7248484848484849, + "grad_norm": 0.010562472976744175, + "learning_rate": 9.350416440664566e-05, + "loss": 0.01177982147783041, + "num_input_tokens_seen": 46606096, + "step": 2846, + "train_runtime": 23128.4506, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.7254545454545456, + "grad_norm": 0.009001946076750755, + "learning_rate": 9.349942376165766e-05, + "loss": 0.012112541124224663, + "num_input_tokens_seen": 46622472, + "step": 2847, + "train_runtime": 23136.5642, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.726060606060606, + "grad_norm": 0.009399345144629478, + "learning_rate": 9.349468150771065e-05, + "loss": 0.012279201298952103, + "num_input_tokens_seen": 46638848, + "step": 2848, + "train_runtime": 23144.6777, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.7266666666666666, + "grad_norm": 0.010582569986581802, + "learning_rate": 9.34899376449801e-05, + "loss": 0.012267028912901878, + "num_input_tokens_seen": 46655224, + "step": 2849, + "train_runtime": 23152.7895, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.7272727272727273, + "grad_norm": 0.015244287438690662, + "learning_rate": 9.348519217364145e-05, + "loss": 0.013822423294186592, + "num_input_tokens_seen": 46671600, + "step": 2850, + "train_runtime": 23160.9084, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.727878787878788, + "grad_norm": 0.006491030566394329, + "learning_rate": 9.34804450938702e-05, + "loss": 0.012164949439466, + "num_input_tokens_seen": 46687976, + "step": 2851, + "train_runtime": 23169.0229, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.7284848484848485, + "grad_norm": 0.0091371675953269, + "learning_rate": 9.347569640584198e-05, + "loss": 0.012326296418905258, + "num_input_tokens_seen": 46704352, + "step": 2852, + "train_runtime": 23177.1375, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.729090909090909, + "grad_norm": 0.007827811874449253, + "learning_rate": 9.347094610973241e-05, + "loss": 0.010904887691140175, + "num_input_tokens_seen": 46720728, + "step": 2853, + "train_runtime": 23185.2512, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.7296969696969697, + "grad_norm": 0.01404674630612135, + "learning_rate": 9.346619420571721e-05, + "loss": 0.012772872112691402, + "num_input_tokens_seen": 46737104, + "step": 2854, + "train_runtime": 23193.3674, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.7303030303030305, + "grad_norm": 0.008301572874188423, + "learning_rate": 9.346144069397211e-05, + "loss": 0.011670062318444252, + "num_input_tokens_seen": 46753480, + "step": 2855, + "train_runtime": 23201.4816, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.730909090909091, + "grad_norm": 0.06437455862760544, + "learning_rate": 9.345668557467298e-05, + "loss": 0.014140639454126358, + "num_input_tokens_seen": 46769856, + "step": 2856, + "train_runtime": 23209.596, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.7315151515151515, + "grad_norm": 0.014902097173035145, + "learning_rate": 9.345192884799567e-05, + "loss": 0.014665037393569946, + "num_input_tokens_seen": 46786232, + "step": 2857, + "train_runtime": 23217.7111, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.7321212121212122, + "grad_norm": 0.014375180006027222, + "learning_rate": 9.344717051411612e-05, + "loss": 0.01194059569388628, + "num_input_tokens_seen": 46802608, + "step": 2858, + "train_runtime": 23225.8241, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.732727272727273, + "grad_norm": 0.009062650613486767, + "learning_rate": 9.344241057321035e-05, + "loss": 0.01136862114071846, + "num_input_tokens_seen": 46818984, + "step": 2859, + "train_runtime": 23233.938, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.010465494357049465, + "learning_rate": 9.343764902545443e-05, + "loss": 0.011493357829749584, + "num_input_tokens_seen": 46835360, + "step": 2860, + "train_runtime": 23242.0502, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.733939393939394, + "grad_norm": 0.009279366582632065, + "learning_rate": 9.343288587102443e-05, + "loss": 0.011399534530937672, + "num_input_tokens_seen": 46851736, + "step": 2861, + "train_runtime": 23250.1646, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.7345454545454544, + "grad_norm": 0.007747293449938297, + "learning_rate": 9.342812111009658e-05, + "loss": 0.012958042323589325, + "num_input_tokens_seen": 46868112, + "step": 2862, + "train_runtime": 23258.284, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.7351515151515151, + "grad_norm": 0.008354146964848042, + "learning_rate": 9.342335474284711e-05, + "loss": 0.012031560763716698, + "num_input_tokens_seen": 46884488, + "step": 2863, + "train_runtime": 23266.3974, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.7357575757575758, + "grad_norm": 0.00816110149025917, + "learning_rate": 9.34185867694523e-05, + "loss": 0.01127876527607441, + "num_input_tokens_seen": 46900864, + "step": 2864, + "train_runtime": 23274.5083, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.7363636363636363, + "grad_norm": 0.007726035080850124, + "learning_rate": 9.341381719008853e-05, + "loss": 0.013300550170242786, + "num_input_tokens_seen": 46917240, + "step": 2865, + "train_runtime": 23282.6218, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.7369696969696968, + "grad_norm": 0.008016044273972511, + "learning_rate": 9.34090460049322e-05, + "loss": 0.011594901792705059, + "num_input_tokens_seen": 46933616, + "step": 2866, + "train_runtime": 23290.7358, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.7375757575757576, + "grad_norm": 0.013905017636716366, + "learning_rate": 9.340427321415978e-05, + "loss": 0.013001223094761372, + "num_input_tokens_seen": 46949992, + "step": 2867, + "train_runtime": 23298.8518, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.7381818181818183, + "grad_norm": 0.01734462007880211, + "learning_rate": 9.339949881794785e-05, + "loss": 0.013187142089009285, + "num_input_tokens_seen": 46966368, + "step": 2868, + "train_runtime": 23306.9631, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.7387878787878788, + "grad_norm": 0.012361729517579079, + "learning_rate": 9.339472281647294e-05, + "loss": 0.0126962810754776, + "num_input_tokens_seen": 46982744, + "step": 2869, + "train_runtime": 23315.0772, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.7393939393939393, + "grad_norm": 0.009342917241156101, + "learning_rate": 9.338994520991177e-05, + "loss": 0.012031439691781998, + "num_input_tokens_seen": 46999120, + "step": 2870, + "train_runtime": 23323.1954, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.74, + "grad_norm": 0.00756025267764926, + "learning_rate": 9.338516599844101e-05, + "loss": 0.011724259704351425, + "num_input_tokens_seen": 47015496, + "step": 2871, + "train_runtime": 23331.311, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.7406060606060607, + "grad_norm": 0.004233363550156355, + "learning_rate": 9.338038518223747e-05, + "loss": 0.011528807692229748, + "num_input_tokens_seen": 47031872, + "step": 2872, + "train_runtime": 23339.4332, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.7412121212121212, + "grad_norm": 0.0077259112149477005, + "learning_rate": 9.337560276147793e-05, + "loss": 0.011708910576999187, + "num_input_tokens_seen": 47048248, + "step": 2873, + "train_runtime": 23347.5464, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.7418181818181817, + "grad_norm": 0.011294611729681492, + "learning_rate": 9.337081873633934e-05, + "loss": 0.012407934293150902, + "num_input_tokens_seen": 47064624, + "step": 2874, + "train_runtime": 23355.6638, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.7424242424242424, + "grad_norm": 0.007208488415926695, + "learning_rate": 9.33660331069986e-05, + "loss": 0.012143628671765327, + "num_input_tokens_seen": 47081000, + "step": 2875, + "train_runtime": 23363.7804, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.7430303030303032, + "grad_norm": 0.008625108748674393, + "learning_rate": 9.336124587363278e-05, + "loss": 0.012219018302857876, + "num_input_tokens_seen": 47097376, + "step": 2876, + "train_runtime": 23371.9001, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.7436363636363637, + "grad_norm": 0.003514588577672839, + "learning_rate": 9.335645703641889e-05, + "loss": 0.011641733348369598, + "num_input_tokens_seen": 47113752, + "step": 2877, + "train_runtime": 23380.0173, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.7442424242424241, + "grad_norm": 0.00892610289156437, + "learning_rate": 9.33516665955341e-05, + "loss": 0.013232930563390255, + "num_input_tokens_seen": 47130128, + "step": 2878, + "train_runtime": 23388.1336, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.7448484848484849, + "grad_norm": 0.015292099677026272, + "learning_rate": 9.334687455115559e-05, + "loss": 0.012574004009366035, + "num_input_tokens_seen": 47146504, + "step": 2879, + "train_runtime": 23396.2437, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.7454545454545456, + "grad_norm": 0.007786950096487999, + "learning_rate": 9.334208090346058e-05, + "loss": 0.013052877970039845, + "num_input_tokens_seen": 47162880, + "step": 2880, + "train_runtime": 23404.3577, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.746060606060606, + "grad_norm": 0.008695406839251518, + "learning_rate": 9.333728565262642e-05, + "loss": 0.011388403363525867, + "num_input_tokens_seen": 47179256, + "step": 2881, + "train_runtime": 23412.4708, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.7466666666666666, + "grad_norm": 0.01445862464606762, + "learning_rate": 9.333248879883045e-05, + "loss": 0.01251003984361887, + "num_input_tokens_seen": 47195632, + "step": 2882, + "train_runtime": 23420.587, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.7472727272727273, + "grad_norm": 0.00993890967220068, + "learning_rate": 9.332769034225012e-05, + "loss": 0.013060958124697208, + "num_input_tokens_seen": 47212008, + "step": 2883, + "train_runtime": 23428.6997, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.747878787878788, + "grad_norm": 0.007343083154410124, + "learning_rate": 9.332289028306289e-05, + "loss": 0.011927951127290726, + "num_input_tokens_seen": 47228384, + "step": 2884, + "train_runtime": 23436.8135, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.7484848484848485, + "grad_norm": 0.012586996890604496, + "learning_rate": 9.331808862144633e-05, + "loss": 0.014243541285395622, + "num_input_tokens_seen": 47244760, + "step": 2885, + "train_runtime": 23444.9342, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.749090909090909, + "grad_norm": 0.007525481283664703, + "learning_rate": 9.331328535757801e-05, + "loss": 0.012809276580810547, + "num_input_tokens_seen": 47261136, + "step": 2886, + "train_runtime": 23453.0478, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.7496969696969698, + "grad_norm": 0.008755379356443882, + "learning_rate": 9.330848049163562e-05, + "loss": 0.012331864796578884, + "num_input_tokens_seen": 47277512, + "step": 2887, + "train_runtime": 23461.1604, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.7503030303030302, + "grad_norm": 0.007854153402149677, + "learning_rate": 9.33036740237969e-05, + "loss": 0.011172168888151646, + "num_input_tokens_seen": 47293888, + "step": 2888, + "train_runtime": 23469.2714, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.750909090909091, + "grad_norm": 0.007569638080894947, + "learning_rate": 9.329886595423958e-05, + "loss": 0.011403515934944153, + "num_input_tokens_seen": 47310264, + "step": 2889, + "train_runtime": 23477.3853, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.7515151515151515, + "grad_norm": 0.006328298710286617, + "learning_rate": 9.329405628314152e-05, + "loss": 0.011399973183870316, + "num_input_tokens_seen": 47326640, + "step": 2890, + "train_runtime": 23485.5008, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.752121212121212, + "grad_norm": 0.00960509479045868, + "learning_rate": 9.328924501068066e-05, + "loss": 0.013432014733552933, + "num_input_tokens_seen": 47343016, + "step": 2891, + "train_runtime": 23493.6174, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.7527272727272727, + "grad_norm": 0.006631573662161827, + "learning_rate": 9.32844321370349e-05, + "loss": 0.012809229083359241, + "num_input_tokens_seen": 47359392, + "step": 2892, + "train_runtime": 23501.7329, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.7533333333333334, + "grad_norm": 0.008058437146246433, + "learning_rate": 9.327961766238231e-05, + "loss": 0.011357891373336315, + "num_input_tokens_seen": 47375768, + "step": 2893, + "train_runtime": 23509.8459, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.753939393939394, + "grad_norm": 0.004802866373211145, + "learning_rate": 9.327480158690094e-05, + "loss": 0.011158658191561699, + "num_input_tokens_seen": 47392144, + "step": 2894, + "train_runtime": 23517.9586, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.7545454545454544, + "grad_norm": 0.018760213628411293, + "learning_rate": 9.326998391076893e-05, + "loss": 0.01469513401389122, + "num_input_tokens_seen": 47408520, + "step": 2895, + "train_runtime": 23526.0771, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.7551515151515151, + "grad_norm": 0.011593660339713097, + "learning_rate": 9.326516463416448e-05, + "loss": 0.012592260725796223, + "num_input_tokens_seen": 47424896, + "step": 2896, + "train_runtime": 23534.1892, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.7557575757575759, + "grad_norm": 0.00899164192378521, + "learning_rate": 9.326034375726586e-05, + "loss": 0.012454191222786903, + "num_input_tokens_seen": 47441272, + "step": 2897, + "train_runtime": 23542.3037, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.7563636363636363, + "grad_norm": 0.007111812941730022, + "learning_rate": 9.325552128025135e-05, + "loss": 0.012857899069786072, + "num_input_tokens_seen": 47457648, + "step": 2898, + "train_runtime": 23550.4184, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.7569696969696968, + "grad_norm": 0.006486005615442991, + "learning_rate": 9.325069720329936e-05, + "loss": 0.011890999972820282, + "num_input_tokens_seen": 47474024, + "step": 2899, + "train_runtime": 23558.5372, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.7575757575757576, + "grad_norm": 0.004689642693847418, + "learning_rate": 9.324587152658828e-05, + "loss": 0.013131581246852875, + "num_input_tokens_seen": 47490400, + "step": 2900, + "train_runtime": 23566.6511, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 1.7581818181818183, + "grad_norm": 0.008016454055905342, + "learning_rate": 9.324104425029665e-05, + "loss": 0.01247711107134819, + "num_input_tokens_seen": 47506776, + "step": 2901, + "train_runtime": 23575.6961, + "train_tokens_per_second": 2015.074 + }, + { + "epoch": 1.7587878787878788, + "grad_norm": 0.008500835858285427, + "learning_rate": 9.323621537460301e-05, + "loss": 0.011386177502572536, + "num_input_tokens_seen": 47523152, + "step": 2902, + "train_runtime": 23583.8132, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.7593939393939393, + "grad_norm": 0.006690427660942078, + "learning_rate": 9.323138489968595e-05, + "loss": 0.012762854807078838, + "num_input_tokens_seen": 47539528, + "step": 2903, + "train_runtime": 23591.933, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.76, + "grad_norm": 0.016449101269245148, + "learning_rate": 9.322655282572414e-05, + "loss": 0.0136633962392807, + "num_input_tokens_seen": 47555904, + "step": 2904, + "train_runtime": 23600.0479, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.7606060606060607, + "grad_norm": 0.00727116409689188, + "learning_rate": 9.322171915289635e-05, + "loss": 0.011916939169168472, + "num_input_tokens_seen": 47572280, + "step": 2905, + "train_runtime": 23608.163, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.7612121212121212, + "grad_norm": 0.008331136777997017, + "learning_rate": 9.321688388138132e-05, + "loss": 0.011783263646066189, + "num_input_tokens_seen": 47588656, + "step": 2906, + "train_runtime": 23616.2789, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.7618181818181817, + "grad_norm": 0.020789558067917824, + "learning_rate": 9.32120470113579e-05, + "loss": 0.013218401931226254, + "num_input_tokens_seen": 47605032, + "step": 2907, + "train_runtime": 23624.391, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.7624242424242424, + "grad_norm": 0.00496008712798357, + "learning_rate": 9.320720854300504e-05, + "loss": 0.012365386821329594, + "num_input_tokens_seen": 47621408, + "step": 2908, + "train_runtime": 23632.5039, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.7630303030303032, + "grad_norm": 0.006191062740981579, + "learning_rate": 9.320236847650168e-05, + "loss": 0.012062267400324345, + "num_input_tokens_seen": 47637784, + "step": 2909, + "train_runtime": 23640.6186, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.7636363636363637, + "grad_norm": 0.006267681252211332, + "learning_rate": 9.319752681202683e-05, + "loss": 0.01223327498883009, + "num_input_tokens_seen": 47654160, + "step": 2910, + "train_runtime": 23648.7416, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.7642424242424242, + "grad_norm": 0.01227263081818819, + "learning_rate": 9.319268354975959e-05, + "loss": 0.013540641404688358, + "num_input_tokens_seen": 47670536, + "step": 2911, + "train_runtime": 23656.8565, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.7648484848484849, + "grad_norm": 0.005225658882409334, + "learning_rate": 9.31878386898791e-05, + "loss": 0.01202879287302494, + "num_input_tokens_seen": 47686912, + "step": 2912, + "train_runtime": 23664.972, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.7654545454545456, + "grad_norm": 0.02399802766740322, + "learning_rate": 9.318299223256456e-05, + "loss": 0.012255542911589146, + "num_input_tokens_seen": 47703288, + "step": 2913, + "train_runtime": 23673.0935, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.766060606060606, + "grad_norm": 0.014451933093369007, + "learning_rate": 9.317814417799523e-05, + "loss": 0.013469447381794453, + "num_input_tokens_seen": 47719664, + "step": 2914, + "train_runtime": 23681.2101, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.7666666666666666, + "grad_norm": 0.009129231795668602, + "learning_rate": 9.317329452635044e-05, + "loss": 0.011540930718183517, + "num_input_tokens_seen": 47736040, + "step": 2915, + "train_runtime": 23689.3237, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.767272727272727, + "grad_norm": 0.008276116102933884, + "learning_rate": 9.316844327780955e-05, + "loss": 0.012826155871152878, + "num_input_tokens_seen": 47752416, + "step": 2916, + "train_runtime": 23697.4395, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.7678787878787878, + "grad_norm": 0.01835167407989502, + "learning_rate": 9.316359043255201e-05, + "loss": 0.013342682272195816, + "num_input_tokens_seen": 47768792, + "step": 2917, + "train_runtime": 23705.5611, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.7684848484848485, + "grad_norm": 0.009004290215671062, + "learning_rate": 9.315873599075733e-05, + "loss": 0.01241071242839098, + "num_input_tokens_seen": 47785168, + "step": 2918, + "train_runtime": 23713.6782, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.769090909090909, + "grad_norm": 0.018462710082530975, + "learning_rate": 9.315387995260505e-05, + "loss": 0.011892465874552727, + "num_input_tokens_seen": 47801544, + "step": 2919, + "train_runtime": 23721.7922, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.7696969696969695, + "grad_norm": 0.010308179073035717, + "learning_rate": 9.314902231827478e-05, + "loss": 0.012521905824542046, + "num_input_tokens_seen": 47817920, + "step": 2920, + "train_runtime": 23729.9118, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.7703030303030303, + "grad_norm": 0.005858840420842171, + "learning_rate": 9.314416308794621e-05, + "loss": 0.0120368218049407, + "num_input_tokens_seen": 47834296, + "step": 2921, + "train_runtime": 23738.0341, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.770909090909091, + "grad_norm": 0.025842219591140747, + "learning_rate": 9.313930226179908e-05, + "loss": 0.012849163264036179, + "num_input_tokens_seen": 47850672, + "step": 2922, + "train_runtime": 23746.1488, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.7715151515151515, + "grad_norm": 0.006477030925452709, + "learning_rate": 9.313443984001315e-05, + "loss": 0.011147328652441502, + "num_input_tokens_seen": 47867048, + "step": 2923, + "train_runtime": 23754.2656, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.772121212121212, + "grad_norm": 0.00806950218975544, + "learning_rate": 9.312957582276829e-05, + "loss": 0.012119542807340622, + "num_input_tokens_seen": 47883424, + "step": 2924, + "train_runtime": 23762.385, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.7727272727272727, + "grad_norm": 0.010666623711585999, + "learning_rate": 9.312471021024443e-05, + "loss": 0.013124541379511356, + "num_input_tokens_seen": 47899800, + "step": 2925, + "train_runtime": 23770.5067, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.7733333333333334, + "grad_norm": 0.008134602569043636, + "learning_rate": 9.31198430026215e-05, + "loss": 0.012375653721392155, + "num_input_tokens_seen": 47916176, + "step": 2926, + "train_runtime": 23778.6358, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.773939393939394, + "grad_norm": 0.007719589862972498, + "learning_rate": 9.311497420007955e-05, + "loss": 0.011732139624655247, + "num_input_tokens_seen": 47932552, + "step": 2927, + "train_runtime": 23786.7556, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.7745454545454544, + "grad_norm": 0.01229825522750616, + "learning_rate": 9.311010380279868e-05, + "loss": 0.01175294816493988, + "num_input_tokens_seen": 47948928, + "step": 2928, + "train_runtime": 23794.8768, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.7751515151515151, + "grad_norm": 0.0076052104122936726, + "learning_rate": 9.310523181095903e-05, + "loss": 0.012578996829688549, + "num_input_tokens_seen": 47965304, + "step": 2929, + "train_runtime": 23802.9912, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.7757575757575759, + "grad_norm": 0.008468554355204105, + "learning_rate": 9.310035822474076e-05, + "loss": 0.011648007668554783, + "num_input_tokens_seen": 47981680, + "step": 2930, + "train_runtime": 23811.1077, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.7763636363636364, + "grad_norm": 0.010106262750923634, + "learning_rate": 9.309548304432421e-05, + "loss": 0.012974138371646404, + "num_input_tokens_seen": 47998056, + "step": 2931, + "train_runtime": 23819.2234, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.7769696969696969, + "grad_norm": 0.008024870418012142, + "learning_rate": 9.309060626988966e-05, + "loss": 0.0120691554620862, + "num_input_tokens_seen": 48014432, + "step": 2932, + "train_runtime": 23827.3433, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.7775757575757576, + "grad_norm": 0.006827709265053272, + "learning_rate": 9.30857279016175e-05, + "loss": 0.012827214784920216, + "num_input_tokens_seen": 48030808, + "step": 2933, + "train_runtime": 23835.4651, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.7781818181818183, + "grad_norm": 0.005936678033322096, + "learning_rate": 9.308084793968816e-05, + "loss": 0.011023994535207748, + "num_input_tokens_seen": 48047184, + "step": 2934, + "train_runtime": 23843.5821, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.7787878787878788, + "grad_norm": 0.1119927316904068, + "learning_rate": 9.307596638428217e-05, + "loss": 0.011640896089375019, + "num_input_tokens_seen": 48063560, + "step": 2935, + "train_runtime": 23851.6986, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.7793939393939393, + "grad_norm": 0.012935559265315533, + "learning_rate": 9.307108323558005e-05, + "loss": 0.01166920829564333, + "num_input_tokens_seen": 48079936, + "step": 2936, + "train_runtime": 23859.8191, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.78, + "grad_norm": 0.016682934015989304, + "learning_rate": 9.306619849376245e-05, + "loss": 0.012249463237822056, + "num_input_tokens_seen": 48096312, + "step": 2937, + "train_runtime": 23867.9395, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.7806060606060607, + "grad_norm": 0.01278962567448616, + "learning_rate": 9.306131215901003e-05, + "loss": 0.012957882136106491, + "num_input_tokens_seen": 48112688, + "step": 2938, + "train_runtime": 23876.0537, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.7812121212121212, + "grad_norm": 0.009745283052325249, + "learning_rate": 9.305642423150353e-05, + "loss": 0.014513827860355377, + "num_input_tokens_seen": 48129064, + "step": 2939, + "train_runtime": 23884.1711, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.7818181818181817, + "grad_norm": 0.010113997384905815, + "learning_rate": 9.305153471142377e-05, + "loss": 0.0118255615234375, + "num_input_tokens_seen": 48145440, + "step": 2940, + "train_runtime": 23892.2873, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.7824242424242425, + "grad_norm": 0.009374349378049374, + "learning_rate": 9.304664359895155e-05, + "loss": 0.012293674983084202, + "num_input_tokens_seen": 48161816, + "step": 2941, + "train_runtime": 23900.4033, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.7830303030303032, + "grad_norm": 0.021635450422763824, + "learning_rate": 9.30417508942678e-05, + "loss": 0.012411084957420826, + "num_input_tokens_seen": 48178192, + "step": 2942, + "train_runtime": 23908.5186, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.7836363636363637, + "grad_norm": 0.005758213810622692, + "learning_rate": 9.303685659755354e-05, + "loss": 0.011029191315174103, + "num_input_tokens_seen": 48194568, + "step": 2943, + "train_runtime": 23916.6336, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.7842424242424242, + "grad_norm": 0.007983488030731678, + "learning_rate": 9.303196070898975e-05, + "loss": 0.012368512339890003, + "num_input_tokens_seen": 48210944, + "step": 2944, + "train_runtime": 23924.7466, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.7848484848484847, + "grad_norm": 0.011937152594327927, + "learning_rate": 9.302706322875753e-05, + "loss": 0.011813906021416187, + "num_input_tokens_seen": 48227320, + "step": 2945, + "train_runtime": 23932.8634, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.7854545454545454, + "grad_norm": 0.01328189205378294, + "learning_rate": 9.302216415703805e-05, + "loss": 0.01319885067641735, + "num_input_tokens_seen": 48243696, + "step": 2946, + "train_runtime": 23940.9812, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.7860606060606061, + "grad_norm": 0.007090682163834572, + "learning_rate": 9.301726349401249e-05, + "loss": 0.01120240893214941, + "num_input_tokens_seen": 48260072, + "step": 2947, + "train_runtime": 23949.1004, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.7866666666666666, + "grad_norm": 0.012933997437357903, + "learning_rate": 9.301236123986212e-05, + "loss": 0.01253314595669508, + "num_input_tokens_seen": 48276448, + "step": 2948, + "train_runtime": 23957.2143, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.7872727272727271, + "grad_norm": 0.015164041891694069, + "learning_rate": 9.300745739476829e-05, + "loss": 0.012761669233441353, + "num_input_tokens_seen": 48292824, + "step": 2949, + "train_runtime": 23965.3347, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.7878787878787878, + "grad_norm": 0.009341921657323837, + "learning_rate": 9.300255195891233e-05, + "loss": 0.013164439238607883, + "num_input_tokens_seen": 48309200, + "step": 2950, + "train_runtime": 23973.4578, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.7884848484848486, + "grad_norm": 0.010138064622879028, + "learning_rate": 9.299764493247574e-05, + "loss": 0.013836441561579704, + "num_input_tokens_seen": 48325576, + "step": 2951, + "train_runtime": 23981.5762, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.789090909090909, + "grad_norm": 0.009686525911092758, + "learning_rate": 9.299273631563998e-05, + "loss": 0.011630890890955925, + "num_input_tokens_seen": 48341952, + "step": 2952, + "train_runtime": 23989.6919, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.7896969696969696, + "grad_norm": 0.017342818900942802, + "learning_rate": 9.298782610858664e-05, + "loss": 0.013579259626567364, + "num_input_tokens_seen": 48358328, + "step": 2953, + "train_runtime": 23997.8112, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.7903030303030303, + "grad_norm": 0.009860479272902012, + "learning_rate": 9.298291431149733e-05, + "loss": 0.013241427019238472, + "num_input_tokens_seen": 48374704, + "step": 2954, + "train_runtime": 24005.9337, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.790909090909091, + "grad_norm": 0.007178580854088068, + "learning_rate": 9.297800092455373e-05, + "loss": 0.011353488080203533, + "num_input_tokens_seen": 48391080, + "step": 2955, + "train_runtime": 24014.0491, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.7915151515151515, + "grad_norm": 0.007200692314654589, + "learning_rate": 9.297308594793756e-05, + "loss": 0.012314318679273129, + "num_input_tokens_seen": 48407456, + "step": 2956, + "train_runtime": 24022.1626, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.792121212121212, + "grad_norm": 0.006994608324021101, + "learning_rate": 9.296816938183063e-05, + "loss": 0.011929539032280445, + "num_input_tokens_seen": 48423832, + "step": 2957, + "train_runtime": 24030.2772, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.7927272727272727, + "grad_norm": 0.01123084407299757, + "learning_rate": 9.29632512264148e-05, + "loss": 0.01273421198129654, + "num_input_tokens_seen": 48440208, + "step": 2958, + "train_runtime": 24038.3949, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.7933333333333334, + "grad_norm": 0.00830906443297863, + "learning_rate": 9.295833148187197e-05, + "loss": 0.012322529219090939, + "num_input_tokens_seen": 48456584, + "step": 2959, + "train_runtime": 24046.5123, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.793939393939394, + "grad_norm": 0.010710365138947964, + "learning_rate": 9.295341014838412e-05, + "loss": 0.011278321035206318, + "num_input_tokens_seen": 48472960, + "step": 2960, + "train_runtime": 24054.6331, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.7945454545454544, + "grad_norm": 0.010728915221989155, + "learning_rate": 9.294848722613326e-05, + "loss": 0.010784944519400597, + "num_input_tokens_seen": 48489336, + "step": 2961, + "train_runtime": 24062.7468, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.7951515151515152, + "grad_norm": 0.007336590439081192, + "learning_rate": 9.294356271530151e-05, + "loss": 0.011799480766057968, + "num_input_tokens_seen": 48505712, + "step": 2962, + "train_runtime": 24070.8603, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.7957575757575759, + "grad_norm": 0.00758334482088685, + "learning_rate": 9.2938636616071e-05, + "loss": 0.01114749163389206, + "num_input_tokens_seen": 48522088, + "step": 2963, + "train_runtime": 24078.9751, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.7963636363636364, + "grad_norm": 0.010609005577862263, + "learning_rate": 9.293370892862395e-05, + "loss": 0.012277994304895401, + "num_input_tokens_seen": 48538464, + "step": 2964, + "train_runtime": 24087.0912, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.7969696969696969, + "grad_norm": 0.007987729273736477, + "learning_rate": 9.29287796531426e-05, + "loss": 0.012755843810737133, + "num_input_tokens_seen": 48554840, + "step": 2965, + "train_runtime": 24095.2123, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.7975757575757576, + "grad_norm": 0.00924461055546999, + "learning_rate": 9.29238487898093e-05, + "loss": 0.013270684517920017, + "num_input_tokens_seen": 48571216, + "step": 2966, + "train_runtime": 24103.3345, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.7981818181818183, + "grad_norm": 0.0071054198779165745, + "learning_rate": 9.291891633880642e-05, + "loss": 0.012391364201903343, + "num_input_tokens_seen": 48587592, + "step": 2967, + "train_runtime": 24111.4534, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.7987878787878788, + "grad_norm": 0.009188920259475708, + "learning_rate": 9.29139823003164e-05, + "loss": 0.011726969853043556, + "num_input_tokens_seen": 48603968, + "step": 2968, + "train_runtime": 24119.5686, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.7993939393939393, + "grad_norm": 0.005871398374438286, + "learning_rate": 9.290904667452177e-05, + "loss": 0.01141232531517744, + "num_input_tokens_seen": 48620344, + "step": 2969, + "train_runtime": 24127.6861, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.8, + "grad_norm": 0.010933088138699532, + "learning_rate": 9.290410946160504e-05, + "loss": 0.012397650629281998, + "num_input_tokens_seen": 48636720, + "step": 2970, + "train_runtime": 24135.8006, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.8006060606060608, + "grad_norm": 0.008161036297678947, + "learning_rate": 9.289917066174886e-05, + "loss": 0.012152892537415028, + "num_input_tokens_seen": 48653096, + "step": 2971, + "train_runtime": 24143.9187, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.8012121212121213, + "grad_norm": 0.007746783550828695, + "learning_rate": 9.28942302751359e-05, + "loss": 0.013096818700432777, + "num_input_tokens_seen": 48669472, + "step": 2972, + "train_runtime": 24152.0341, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.8018181818181818, + "grad_norm": 0.011512810364365578, + "learning_rate": 9.28892883019489e-05, + "loss": 0.012248929589986801, + "num_input_tokens_seen": 48685848, + "step": 2973, + "train_runtime": 24160.1509, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.8024242424242423, + "grad_norm": 0.009246395900845528, + "learning_rate": 9.288434474237064e-05, + "loss": 0.011867566034197807, + "num_input_tokens_seen": 48702224, + "step": 2974, + "train_runtime": 24168.2635, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.803030303030303, + "grad_norm": 0.017433451488614082, + "learning_rate": 9.287939959658399e-05, + "loss": 0.013736135326325893, + "num_input_tokens_seen": 48718600, + "step": 2975, + "train_runtime": 24176.3771, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.8036363636363637, + "grad_norm": 0.010817728005349636, + "learning_rate": 9.287445286477184e-05, + "loss": 0.011506814509630203, + "num_input_tokens_seen": 48734976, + "step": 2976, + "train_runtime": 24184.4902, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.8042424242424242, + "grad_norm": 0.009098121896386147, + "learning_rate": 9.286950454711717e-05, + "loss": 0.012756666168570518, + "num_input_tokens_seen": 48751352, + "step": 2977, + "train_runtime": 24192.6057, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.8048484848484847, + "grad_norm": 0.009739307686686516, + "learning_rate": 9.286455464380304e-05, + "loss": 0.013063987717032433, + "num_input_tokens_seen": 48767728, + "step": 2978, + "train_runtime": 24200.7221, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.8054545454545454, + "grad_norm": 0.014028403908014297, + "learning_rate": 9.285960315501248e-05, + "loss": 0.01358321774750948, + "num_input_tokens_seen": 48784104, + "step": 2979, + "train_runtime": 24208.8405, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.8060606060606061, + "grad_norm": 0.008561541326344013, + "learning_rate": 9.285465008092868e-05, + "loss": 0.0112238060683012, + "num_input_tokens_seen": 48800480, + "step": 2980, + "train_runtime": 24216.9613, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.8066666666666666, + "grad_norm": 0.014710756950080395, + "learning_rate": 9.284969542173482e-05, + "loss": 0.012869363650679588, + "num_input_tokens_seen": 48816856, + "step": 2981, + "train_runtime": 24225.0787, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.8072727272727271, + "grad_norm": 0.004121546167880297, + "learning_rate": 9.284473917761419e-05, + "loss": 0.011222013272345066, + "num_input_tokens_seen": 48833232, + "step": 2982, + "train_runtime": 24233.1928, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.8078787878787879, + "grad_norm": 0.016711929813027382, + "learning_rate": 9.283978134875006e-05, + "loss": 0.013897864148020744, + "num_input_tokens_seen": 48849608, + "step": 2983, + "train_runtime": 24241.3078, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.8084848484848486, + "grad_norm": 0.010518312454223633, + "learning_rate": 9.283482193532587e-05, + "loss": 0.011868288740515709, + "num_input_tokens_seen": 48865984, + "step": 2984, + "train_runtime": 24249.4339, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.809090909090909, + "grad_norm": 0.006691992282867432, + "learning_rate": 9.282986093752504e-05, + "loss": 0.010689632967114449, + "num_input_tokens_seen": 48882360, + "step": 2985, + "train_runtime": 24257.5472, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.8096969696969696, + "grad_norm": 0.008037862367928028, + "learning_rate": 9.282489835553106e-05, + "loss": 0.011833954602479935, + "num_input_tokens_seen": 48898736, + "step": 2986, + "train_runtime": 24265.6632, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.8103030303030303, + "grad_norm": 0.009776163846254349, + "learning_rate": 9.281993418952746e-05, + "loss": 0.012045754119753838, + "num_input_tokens_seen": 48915112, + "step": 2987, + "train_runtime": 24273.7804, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.810909090909091, + "grad_norm": 0.012512095272541046, + "learning_rate": 9.28149684396979e-05, + "loss": 0.012259161099791527, + "num_input_tokens_seen": 48931488, + "step": 2988, + "train_runtime": 24281.8933, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.8115151515151515, + "grad_norm": 0.010951677337288857, + "learning_rate": 9.281000110622605e-05, + "loss": 0.012102634645998478, + "num_input_tokens_seen": 48947864, + "step": 2989, + "train_runtime": 24290.0107, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.812121212121212, + "grad_norm": 0.004510990809649229, + "learning_rate": 9.28050321892956e-05, + "loss": 0.012496976181864738, + "num_input_tokens_seen": 48964240, + "step": 2990, + "train_runtime": 24298.1351, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.8127272727272727, + "grad_norm": 0.007676657754927874, + "learning_rate": 9.280006168909039e-05, + "loss": 0.011141535826027393, + "num_input_tokens_seen": 48980616, + "step": 2991, + "train_runtime": 24306.2519, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 0.004773823544383049, + "learning_rate": 9.279508960579424e-05, + "loss": 0.011702566407620907, + "num_input_tokens_seen": 48996992, + "step": 2992, + "train_runtime": 24314.3665, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.813939393939394, + "grad_norm": 0.013620059005916119, + "learning_rate": 9.279011593959106e-05, + "loss": 0.013101032935082912, + "num_input_tokens_seen": 49013368, + "step": 2993, + "train_runtime": 24322.4857, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.8145454545454545, + "grad_norm": 0.00769168371334672, + "learning_rate": 9.278514069066483e-05, + "loss": 0.012206891551613808, + "num_input_tokens_seen": 49029744, + "step": 2994, + "train_runtime": 24330.6009, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.8151515151515152, + "grad_norm": 0.017870178446173668, + "learning_rate": 9.278016385919957e-05, + "loss": 0.013382025063037872, + "num_input_tokens_seen": 49046120, + "step": 2995, + "train_runtime": 24338.7174, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.815757575757576, + "grad_norm": 0.024471454322338104, + "learning_rate": 9.277518544537934e-05, + "loss": 0.012662556953728199, + "num_input_tokens_seen": 49062496, + "step": 2996, + "train_runtime": 24346.8339, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.8163636363636364, + "grad_norm": 0.0086158262565732, + "learning_rate": 9.277020544938832e-05, + "loss": 0.012152843177318573, + "num_input_tokens_seen": 49078872, + "step": 2997, + "train_runtime": 24354.9513, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.816969696969697, + "grad_norm": 0.025600312277674675, + "learning_rate": 9.276522387141068e-05, + "loss": 0.015133306384086609, + "num_input_tokens_seen": 49095248, + "step": 2998, + "train_runtime": 24363.07, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.8175757575757576, + "grad_norm": 0.010355425998568535, + "learning_rate": 9.27602407116307e-05, + "loss": 0.012976177968084812, + "num_input_tokens_seen": 49111624, + "step": 2999, + "train_runtime": 24371.1852, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.8181818181818183, + "grad_norm": 0.01108719315379858, + "learning_rate": 9.275525597023267e-05, + "loss": 0.01246652752161026, + "num_input_tokens_seen": 49128000, + "step": 3000, + "train_runtime": 24379.3032, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.8187878787878788, + "grad_norm": 0.010925337672233582, + "learning_rate": 9.275026964740101e-05, + "loss": 0.01247719768434763, + "num_input_tokens_seen": 49144376, + "step": 3001, + "train_runtime": 24388.476, + "train_tokens_per_second": 2015.065 + }, + { + "epoch": 1.8193939393939393, + "grad_norm": 0.005827899090945721, + "learning_rate": 9.274528174332011e-05, + "loss": 0.011851150542497635, + "num_input_tokens_seen": 49160752, + "step": 3002, + "train_runtime": 24396.5903, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 0.006017337553203106, + "learning_rate": 9.274029225817449e-05, + "loss": 0.012704812921583652, + "num_input_tokens_seen": 49177128, + "step": 3003, + "train_runtime": 24404.7062, + "train_tokens_per_second": 2015.067 + }, + { + "epoch": 1.8206060606060606, + "grad_norm": 0.010842292569577694, + "learning_rate": 9.273530119214868e-05, + "loss": 0.012414321303367615, + "num_input_tokens_seen": 49193504, + "step": 3004, + "train_runtime": 24412.8222, + "train_tokens_per_second": 2015.068 + }, + { + "epoch": 1.8212121212121213, + "grad_norm": 0.013193870894610882, + "learning_rate": 9.27303085454273e-05, + "loss": 0.013441788032650948, + "num_input_tokens_seen": 49209880, + "step": 3005, + "train_runtime": 24420.9392, + "train_tokens_per_second": 2015.069 + }, + { + "epoch": 1.8218181818181818, + "grad_norm": 0.009797339327633381, + "learning_rate": 9.272531431819504e-05, + "loss": 0.012301875278353691, + "num_input_tokens_seen": 49226256, + "step": 3006, + "train_runtime": 24429.0554, + "train_tokens_per_second": 2015.07 + }, + { + "epoch": 1.8224242424242423, + "grad_norm": 0.010019945912063122, + "learning_rate": 9.27203185106366e-05, + "loss": 0.01279345341026783, + "num_input_tokens_seen": 49242632, + "step": 3007, + "train_runtime": 24437.167, + "train_tokens_per_second": 2015.071 + }, + { + "epoch": 1.823030303030303, + "grad_norm": 0.011813540011644363, + "learning_rate": 9.271532112293678e-05, + "loss": 0.012321519665420055, + "num_input_tokens_seen": 49259008, + "step": 3008, + "train_runtime": 24445.2808, + "train_tokens_per_second": 2015.072 + }, + { + "epoch": 1.8236363636363637, + "grad_norm": 0.010466402396559715, + "learning_rate": 9.27103221552804e-05, + "loss": 0.012350622564554214, + "num_input_tokens_seen": 49275384, + "step": 3009, + "train_runtime": 24453.3942, + "train_tokens_per_second": 2015.073 + }, + { + "epoch": 1.8242424242424242, + "grad_norm": 0.016839874908328056, + "learning_rate": 9.270532160785238e-05, + "loss": 0.013571641407907009, + "num_input_tokens_seen": 49291760, + "step": 3010, + "train_runtime": 24461.5065, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.8248484848484847, + "grad_norm": 0.009407256729900837, + "learning_rate": 9.270031948083769e-05, + "loss": 0.011951828375458717, + "num_input_tokens_seen": 49308136, + "step": 3011, + "train_runtime": 24469.6211, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.8254545454545454, + "grad_norm": 0.01694776676595211, + "learning_rate": 9.269531577442132e-05, + "loss": 0.012847152538597584, + "num_input_tokens_seen": 49324512, + "step": 3012, + "train_runtime": 24477.7371, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.8260606060606062, + "grad_norm": 0.012286297976970673, + "learning_rate": 9.269031048878838e-05, + "loss": 0.01283589843660593, + "num_input_tokens_seen": 49340888, + "step": 3013, + "train_runtime": 24485.8537, + "train_tokens_per_second": 2015.077 + }, + { + "epoch": 1.8266666666666667, + "grad_norm": 0.003697991603985429, + "learning_rate": 9.268530362412398e-05, + "loss": 0.01160636730492115, + "num_input_tokens_seen": 49357264, + "step": 3014, + "train_runtime": 24493.9669, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.8272727272727272, + "grad_norm": 0.011356630362570286, + "learning_rate": 9.268029518061334e-05, + "loss": 0.013032147660851479, + "num_input_tokens_seen": 49373640, + "step": 3015, + "train_runtime": 24502.0825, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.8278787878787879, + "grad_norm": 0.0060284812934696674, + "learning_rate": 9.267528515844168e-05, + "loss": 0.01141420565545559, + "num_input_tokens_seen": 49390016, + "step": 3016, + "train_runtime": 24510.2002, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.8284848484848486, + "grad_norm": 0.0051521072164177895, + "learning_rate": 9.267027355779434e-05, + "loss": 0.011409430764615536, + "num_input_tokens_seen": 49406392, + "step": 3017, + "train_runtime": 24518.3136, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.829090909090909, + "grad_norm": 0.005750606767833233, + "learning_rate": 9.266526037885668e-05, + "loss": 0.011167693883180618, + "num_input_tokens_seen": 49422768, + "step": 3018, + "train_runtime": 24526.4326, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.8296969696969696, + "grad_norm": 0.011581099592149258, + "learning_rate": 9.26602456218141e-05, + "loss": 0.013885931111872196, + "num_input_tokens_seen": 49439144, + "step": 3019, + "train_runtime": 24534.5512, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.8303030303030303, + "grad_norm": 0.006689704954624176, + "learning_rate": 9.265522928685215e-05, + "loss": 0.011876719072461128, + "num_input_tokens_seen": 49455520, + "step": 3020, + "train_runtime": 24542.6651, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.830909090909091, + "grad_norm": 0.008976902812719345, + "learning_rate": 9.26502113741563e-05, + "loss": 0.0122674610465765, + "num_input_tokens_seen": 49471896, + "step": 3021, + "train_runtime": 24550.7824, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.8315151515151515, + "grad_norm": 0.005990789737552404, + "learning_rate": 9.26451918839122e-05, + "loss": 0.012194567359983921, + "num_input_tokens_seen": 49488272, + "step": 3022, + "train_runtime": 24558.9059, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.832121212121212, + "grad_norm": 0.007173141464591026, + "learning_rate": 9.264017081630551e-05, + "loss": 0.013572081923484802, + "num_input_tokens_seen": 49504648, + "step": 3023, + "train_runtime": 24567.0235, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.8327272727272728, + "grad_norm": 0.008800432085990906, + "learning_rate": 9.263514817152195e-05, + "loss": 0.011637565679848194, + "num_input_tokens_seen": 49521024, + "step": 3024, + "train_runtime": 24575.1469, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 0.015690628439188004, + "learning_rate": 9.263012394974726e-05, + "loss": 0.012906611897051334, + "num_input_tokens_seen": 49537400, + "step": 3025, + "train_runtime": 24583.2632, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.833939393939394, + "grad_norm": 0.013949367217719555, + "learning_rate": 9.262509815116732e-05, + "loss": 0.011235008016228676, + "num_input_tokens_seen": 49553776, + "step": 3026, + "train_runtime": 24591.3813, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.8345454545454545, + "grad_norm": 0.012325744144618511, + "learning_rate": 9.262007077596799e-05, + "loss": 0.012805722653865814, + "num_input_tokens_seen": 49570152, + "step": 3027, + "train_runtime": 24599.4999, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.8351515151515152, + "grad_norm": 0.009210403077304363, + "learning_rate": 9.261504182433528e-05, + "loss": 0.012145797722041607, + "num_input_tokens_seen": 49586528, + "step": 3028, + "train_runtime": 24607.6184, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.835757575757576, + "grad_norm": 0.00966776255518198, + "learning_rate": 9.261001129645513e-05, + "loss": 0.012410152703523636, + "num_input_tokens_seen": 49602904, + "step": 3029, + "train_runtime": 24615.7372, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.8363636363636364, + "grad_norm": 0.008486943319439888, + "learning_rate": 9.260497919251364e-05, + "loss": 0.01206645742058754, + "num_input_tokens_seen": 49619280, + "step": 3030, + "train_runtime": 24623.8556, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.836969696969697, + "grad_norm": 0.007263388019055128, + "learning_rate": 9.259994551269694e-05, + "loss": 0.011400827206671238, + "num_input_tokens_seen": 49635656, + "step": 3031, + "train_runtime": 24631.9724, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.8375757575757574, + "grad_norm": 0.005560369696468115, + "learning_rate": 9.259491025719122e-05, + "loss": 0.012198167853057384, + "num_input_tokens_seen": 49652032, + "step": 3032, + "train_runtime": 24640.0936, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.8381818181818181, + "grad_norm": 0.008768472820520401, + "learning_rate": 9.258987342618273e-05, + "loss": 0.012726176530122757, + "num_input_tokens_seen": 49668408, + "step": 3033, + "train_runtime": 24648.2093, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.8387878787878789, + "grad_norm": 0.009570861235260963, + "learning_rate": 9.258483501985775e-05, + "loss": 0.011579609476029873, + "num_input_tokens_seen": 49684784, + "step": 3034, + "train_runtime": 24656.324, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.8393939393939394, + "grad_norm": 0.009371986612677574, + "learning_rate": 9.257979503840266e-05, + "loss": 0.012336795218288898, + "num_input_tokens_seen": 49701160, + "step": 3035, + "train_runtime": 24664.4461, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 0.009010751731693745, + "learning_rate": 9.257475348200387e-05, + "loss": 0.011590475216507912, + "num_input_tokens_seen": 49717536, + "step": 3036, + "train_runtime": 24672.5666, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.8406060606060606, + "grad_norm": 0.008704320527613163, + "learning_rate": 9.256971035084785e-05, + "loss": 0.012646995484828949, + "num_input_tokens_seen": 49733912, + "step": 3037, + "train_runtime": 24680.6929, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.8412121212121213, + "grad_norm": 0.011614583432674408, + "learning_rate": 9.256466564512115e-05, + "loss": 0.012255294248461723, + "num_input_tokens_seen": 49750288, + "step": 3038, + "train_runtime": 24688.816, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.8418181818181818, + "grad_norm": 0.009356755763292313, + "learning_rate": 9.255961936501036e-05, + "loss": 0.012481609359383583, + "num_input_tokens_seen": 49766664, + "step": 3039, + "train_runtime": 24696.9324, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.8424242424242423, + "grad_norm": 0.008717809803783894, + "learning_rate": 9.255457151070213e-05, + "loss": 0.013015971519052982, + "num_input_tokens_seen": 49783040, + "step": 3040, + "train_runtime": 24705.0478, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.843030303030303, + "grad_norm": 0.008320217952132225, + "learning_rate": 9.254952208238318e-05, + "loss": 0.01091097667813301, + "num_input_tokens_seen": 49799416, + "step": 3041, + "train_runtime": 24713.1648, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.8436363636363637, + "grad_norm": 0.003688125405460596, + "learning_rate": 9.254447108024026e-05, + "loss": 0.012057192623615265, + "num_input_tokens_seen": 49815792, + "step": 3042, + "train_runtime": 24721.277, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.8442424242424242, + "grad_norm": 0.004650153685361147, + "learning_rate": 9.25394185044602e-05, + "loss": 0.011981946416199207, + "num_input_tokens_seen": 49832168, + "step": 3043, + "train_runtime": 24729.391, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.8448484848484847, + "grad_norm": 0.006913701072335243, + "learning_rate": 9.253436435522991e-05, + "loss": 0.011715936474502087, + "num_input_tokens_seen": 49848544, + "step": 3044, + "train_runtime": 24737.5075, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.8454545454545455, + "grad_norm": 0.008736771531403065, + "learning_rate": 9.25293086327363e-05, + "loss": 0.012155945412814617, + "num_input_tokens_seen": 49864920, + "step": 3045, + "train_runtime": 24745.6316, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.8460606060606062, + "grad_norm": 0.021866118535399437, + "learning_rate": 9.25242513371664e-05, + "loss": 0.013389071449637413, + "num_input_tokens_seen": 49881296, + "step": 3046, + "train_runtime": 24753.7474, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.8466666666666667, + "grad_norm": 0.009575455449521542, + "learning_rate": 9.251919246870724e-05, + "loss": 0.011815833859145641, + "num_input_tokens_seen": 49897672, + "step": 3047, + "train_runtime": 24761.8582, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.8472727272727272, + "grad_norm": 0.012831861153244972, + "learning_rate": 9.251413202754595e-05, + "loss": 0.012752903625369072, + "num_input_tokens_seen": 49914048, + "step": 3048, + "train_runtime": 24769.9707, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.847878787878788, + "grad_norm": 0.009517887607216835, + "learning_rate": 9.250907001386972e-05, + "loss": 0.011513019911944866, + "num_input_tokens_seen": 49930424, + "step": 3049, + "train_runtime": 24778.082, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.8484848484848486, + "grad_norm": 0.013358705677092075, + "learning_rate": 9.250400642786576e-05, + "loss": 0.012704689055681229, + "num_input_tokens_seen": 49946800, + "step": 3050, + "train_runtime": 24786.1962, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.8490909090909091, + "grad_norm": 0.011109757237136364, + "learning_rate": 9.24989412697214e-05, + "loss": 0.012690899893641472, + "num_input_tokens_seen": 49963176, + "step": 3051, + "train_runtime": 24794.3077, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.8496969696969696, + "grad_norm": 0.0029274390544742346, + "learning_rate": 9.249387453962394e-05, + "loss": 0.011420530267059803, + "num_input_tokens_seen": 49979552, + "step": 3052, + "train_runtime": 24802.4218, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.8503030303030303, + "grad_norm": 0.006933121010661125, + "learning_rate": 9.248880623776081e-05, + "loss": 0.011758833192288876, + "num_input_tokens_seen": 49995928, + "step": 3053, + "train_runtime": 24810.5355, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.850909090909091, + "grad_norm": 0.008510863408446312, + "learning_rate": 9.248373636431951e-05, + "loss": 0.011804106645286083, + "num_input_tokens_seen": 50012304, + "step": 3054, + "train_runtime": 24818.65, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.8515151515151516, + "grad_norm": 0.00596174830570817, + "learning_rate": 9.247866491948752e-05, + "loss": 0.01262554433196783, + "num_input_tokens_seen": 50028680, + "step": 3055, + "train_runtime": 24826.7631, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.852121212121212, + "grad_norm": 0.011812315322458744, + "learning_rate": 9.247359190345243e-05, + "loss": 0.012343344278633595, + "num_input_tokens_seen": 50045056, + "step": 3056, + "train_runtime": 24834.8763, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.8527272727272728, + "grad_norm": 0.010871347971260548, + "learning_rate": 9.24685173164019e-05, + "loss": 0.01399338711053133, + "num_input_tokens_seen": 50061432, + "step": 3057, + "train_runtime": 24842.9898, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.8533333333333335, + "grad_norm": 0.01032578106969595, + "learning_rate": 9.246344115852361e-05, + "loss": 0.011646384373307228, + "num_input_tokens_seen": 50077808, + "step": 3058, + "train_runtime": 24851.1039, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.853939393939394, + "grad_norm": 0.009668087586760521, + "learning_rate": 9.245836343000533e-05, + "loss": 0.01203220710158348, + "num_input_tokens_seen": 50094184, + "step": 3059, + "train_runtime": 24859.2162, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.8545454545454545, + "grad_norm": 0.0030508043710142374, + "learning_rate": 9.245328413103488e-05, + "loss": 0.011559851467609406, + "num_input_tokens_seen": 50110560, + "step": 3060, + "train_runtime": 24867.3359, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.855151515151515, + "grad_norm": 0.012229938060045242, + "learning_rate": 9.244820326180011e-05, + "loss": 0.012419401668012142, + "num_input_tokens_seen": 50126936, + "step": 3061, + "train_runtime": 24875.4521, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.8557575757575757, + "grad_norm": 0.01794649288058281, + "learning_rate": 9.244312082248897e-05, + "loss": 0.013206228613853455, + "num_input_tokens_seen": 50143312, + "step": 3062, + "train_runtime": 24883.5707, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.8563636363636364, + "grad_norm": 0.007639274932444096, + "learning_rate": 9.243803681328943e-05, + "loss": 0.011870847083628178, + "num_input_tokens_seen": 50159688, + "step": 3063, + "train_runtime": 24891.6848, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.856969696969697, + "grad_norm": 0.0070930058136582375, + "learning_rate": 9.243295123438958e-05, + "loss": 0.010470615699887276, + "num_input_tokens_seen": 50176064, + "step": 3064, + "train_runtime": 24899.7968, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.8575757575757574, + "grad_norm": 0.007529653608798981, + "learning_rate": 9.24278640859775e-05, + "loss": 0.013072483241558075, + "num_input_tokens_seen": 50192440, + "step": 3065, + "train_runtime": 24907.9126, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.8581818181818182, + "grad_norm": 0.005360973067581654, + "learning_rate": 9.242277536824134e-05, + "loss": 0.011755731888115406, + "num_input_tokens_seen": 50208816, + "step": 3066, + "train_runtime": 24916.0331, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.8587878787878789, + "grad_norm": 0.00501141045242548, + "learning_rate": 9.241768508136933e-05, + "loss": 0.011487549170851707, + "num_input_tokens_seen": 50225192, + "step": 3067, + "train_runtime": 24924.1492, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.8593939393939394, + "grad_norm": 0.01034004520624876, + "learning_rate": 9.241259322554973e-05, + "loss": 0.012985237874090672, + "num_input_tokens_seen": 50241568, + "step": 3068, + "train_runtime": 24932.2668, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 0.007243257015943527, + "learning_rate": 9.240749980097094e-05, + "loss": 0.012754643335938454, + "num_input_tokens_seen": 50257944, + "step": 3069, + "train_runtime": 24940.3821, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.8606060606060606, + "grad_norm": 0.01349063403904438, + "learning_rate": 9.24024048078213e-05, + "loss": 0.012604167684912682, + "num_input_tokens_seen": 50274320, + "step": 3070, + "train_runtime": 24948.5006, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.8612121212121213, + "grad_norm": 0.004877285100519657, + "learning_rate": 9.23973082462893e-05, + "loss": 0.011970577761530876, + "num_input_tokens_seen": 50290696, + "step": 3071, + "train_runtime": 24956.6188, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.8618181818181818, + "grad_norm": 0.009895937517285347, + "learning_rate": 9.239221011656341e-05, + "loss": 0.01223594881594181, + "num_input_tokens_seen": 50307072, + "step": 3072, + "train_runtime": 24964.7356, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.8624242424242423, + "grad_norm": 0.006175011862069368, + "learning_rate": 9.238711041883222e-05, + "loss": 0.012292975559830666, + "num_input_tokens_seen": 50323448, + "step": 3073, + "train_runtime": 24972.8571, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.863030303030303, + "grad_norm": 0.008000146597623825, + "learning_rate": 9.238200915328438e-05, + "loss": 0.01205262541770935, + "num_input_tokens_seen": 50339824, + "step": 3074, + "train_runtime": 24980.9726, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.8636363636363638, + "grad_norm": 0.007993191480636597, + "learning_rate": 9.237690632010853e-05, + "loss": 0.013416312634944916, + "num_input_tokens_seen": 50356200, + "step": 3075, + "train_runtime": 24989.091, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.8642424242424243, + "grad_norm": 0.005173394922167063, + "learning_rate": 9.237180191949347e-05, + "loss": 0.012492675334215164, + "num_input_tokens_seen": 50372576, + "step": 3076, + "train_runtime": 24997.2059, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.8648484848484848, + "grad_norm": 0.008313057012856007, + "learning_rate": 9.236669595162797e-05, + "loss": 0.012614956125617027, + "num_input_tokens_seen": 50388952, + "step": 3077, + "train_runtime": 25005.3215, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.8654545454545455, + "grad_norm": 0.00968491192907095, + "learning_rate": 9.236158841670088e-05, + "loss": 0.012565825134515762, + "num_input_tokens_seen": 50405328, + "step": 3078, + "train_runtime": 25013.4393, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.8660606060606062, + "grad_norm": 0.007664080243557692, + "learning_rate": 9.235647931490112e-05, + "loss": 0.011506912298500538, + "num_input_tokens_seen": 50421704, + "step": 3079, + "train_runtime": 25021.5561, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 0.00756417540833354, + "learning_rate": 9.23513686464177e-05, + "loss": 0.01151568628847599, + "num_input_tokens_seen": 50438080, + "step": 3080, + "train_runtime": 25029.6711, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.8672727272727272, + "grad_norm": 0.008650683797895908, + "learning_rate": 9.23462564114396e-05, + "loss": 0.01214786246418953, + "num_input_tokens_seen": 50454456, + "step": 3081, + "train_runtime": 25037.7853, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.867878787878788, + "grad_norm": 0.007602212950587273, + "learning_rate": 9.234114261015597e-05, + "loss": 0.012525323778390884, + "num_input_tokens_seen": 50470832, + "step": 3082, + "train_runtime": 25045.9002, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.8684848484848486, + "grad_norm": 0.011407344602048397, + "learning_rate": 9.233602724275592e-05, + "loss": 0.012371896766126156, + "num_input_tokens_seen": 50487208, + "step": 3083, + "train_runtime": 25054.0193, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.8690909090909091, + "grad_norm": 0.006255041342228651, + "learning_rate": 9.233091030942866e-05, + "loss": 0.012409602291882038, + "num_input_tokens_seen": 50503584, + "step": 3084, + "train_runtime": 25062.1368, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.8696969696969696, + "grad_norm": 0.010671233758330345, + "learning_rate": 9.232579181036347e-05, + "loss": 0.012695337645709515, + "num_input_tokens_seen": 50519960, + "step": 3085, + "train_runtime": 25070.2486, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.8703030303030304, + "grad_norm": 0.009065535850822926, + "learning_rate": 9.232067174574968e-05, + "loss": 0.011437319219112396, + "num_input_tokens_seen": 50536336, + "step": 3086, + "train_runtime": 25078.3618, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.8709090909090909, + "grad_norm": 0.005685082171112299, + "learning_rate": 9.231555011577663e-05, + "loss": 0.011578761972486973, + "num_input_tokens_seen": 50552712, + "step": 3087, + "train_runtime": 25086.4723, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.8715151515151516, + "grad_norm": 0.0075594717636704445, + "learning_rate": 9.23104269206338e-05, + "loss": 0.011918365955352783, + "num_input_tokens_seen": 50569088, + "step": 3088, + "train_runtime": 25094.5849, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.872121212121212, + "grad_norm": 0.006498353555798531, + "learning_rate": 9.230530216051069e-05, + "loss": 0.012700119987130165, + "num_input_tokens_seen": 50585464, + "step": 3089, + "train_runtime": 25102.7055, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.8727272727272726, + "grad_norm": 0.006765829864889383, + "learning_rate": 9.230017583559682e-05, + "loss": 0.011392736807465553, + "num_input_tokens_seen": 50601840, + "step": 3090, + "train_runtime": 25110.8334, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.8733333333333333, + "grad_norm": 0.011798511259257793, + "learning_rate": 9.229504794608182e-05, + "loss": 0.013159212656319141, + "num_input_tokens_seen": 50618216, + "step": 3091, + "train_runtime": 25118.9529, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.873939393939394, + "grad_norm": 0.008059892803430557, + "learning_rate": 9.228991849215538e-05, + "loss": 0.012083720415830612, + "num_input_tokens_seen": 50634592, + "step": 3092, + "train_runtime": 25127.0722, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.8745454545454545, + "grad_norm": 0.006497847847640514, + "learning_rate": 9.22847874740072e-05, + "loss": 0.01243192795664072, + "num_input_tokens_seen": 50650968, + "step": 3093, + "train_runtime": 25135.1934, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.875151515151515, + "grad_norm": 0.008748437277972698, + "learning_rate": 9.227965489182708e-05, + "loss": 0.011547371745109558, + "num_input_tokens_seen": 50667344, + "step": 3094, + "train_runtime": 25143.3106, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.8757575757575757, + "grad_norm": 0.009610083885490894, + "learning_rate": 9.227452074580485e-05, + "loss": 0.01293950341641903, + "num_input_tokens_seen": 50683720, + "step": 3095, + "train_runtime": 25151.4328, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.8763636363636365, + "grad_norm": 0.006703491788357496, + "learning_rate": 9.226938503613043e-05, + "loss": 0.012840256094932556, + "num_input_tokens_seen": 50700096, + "step": 3096, + "train_runtime": 25159.546, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.876969696969697, + "grad_norm": 0.006835185922682285, + "learning_rate": 9.226424776299378e-05, + "loss": 0.012015492655336857, + "num_input_tokens_seen": 50716472, + "step": 3097, + "train_runtime": 25167.6608, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.8775757575757575, + "grad_norm": 0.014235780574381351, + "learning_rate": 9.22591089265849e-05, + "loss": 0.01358760241419077, + "num_input_tokens_seen": 50732848, + "step": 3098, + "train_runtime": 25175.7736, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.8781818181818182, + "grad_norm": 0.007752373814582825, + "learning_rate": 9.225396852709389e-05, + "loss": 0.01264961063861847, + "num_input_tokens_seen": 50749224, + "step": 3099, + "train_runtime": 25183.896, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.878787878787879, + "grad_norm": 0.00946936383843422, + "learning_rate": 9.224882656471086e-05, + "loss": 0.013263813219964504, + "num_input_tokens_seen": 50765600, + "step": 3100, + "train_runtime": 25192.0111, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.8793939393939394, + "grad_norm": 0.00941481627523899, + "learning_rate": 9.2243683039626e-05, + "loss": 0.013559294864535332, + "num_input_tokens_seen": 50781976, + "step": 3101, + "train_runtime": 25201.0372, + "train_tokens_per_second": 2015.075 + }, + { + "epoch": 1.88, + "grad_norm": 0.0050283982418477535, + "learning_rate": 9.22385379520296e-05, + "loss": 0.012278541922569275, + "num_input_tokens_seen": 50798352, + "step": 3102, + "train_runtime": 25209.1467, + "train_tokens_per_second": 2015.076 + }, + { + "epoch": 1.8806060606060606, + "grad_norm": 0.034983959048986435, + "learning_rate": 9.223339130211192e-05, + "loss": 0.013488172553479671, + "num_input_tokens_seen": 50814728, + "step": 3103, + "train_runtime": 25217.2561, + "train_tokens_per_second": 2015.078 + }, + { + "epoch": 1.8812121212121213, + "grad_norm": 0.006300953682512045, + "learning_rate": 9.222824309006335e-05, + "loss": 0.011517742648720741, + "num_input_tokens_seen": 50831104, + "step": 3104, + "train_runtime": 25225.3653, + "train_tokens_per_second": 2015.079 + }, + { + "epoch": 1.8818181818181818, + "grad_norm": 0.009675579145550728, + "learning_rate": 9.222309331607428e-05, + "loss": 0.011878578923642635, + "num_input_tokens_seen": 50847480, + "step": 3105, + "train_runtime": 25233.4811, + "train_tokens_per_second": 2015.08 + }, + { + "epoch": 1.8824242424242423, + "grad_norm": 0.010816797614097595, + "learning_rate": 9.221794198033525e-05, + "loss": 0.011553348042070866, + "num_input_tokens_seen": 50863856, + "step": 3106, + "train_runtime": 25241.5952, + "train_tokens_per_second": 2015.081 + }, + { + "epoch": 1.883030303030303, + "grad_norm": 0.007883163169026375, + "learning_rate": 9.221278908303674e-05, + "loss": 0.012542840093374252, + "num_input_tokens_seen": 50880232, + "step": 3107, + "train_runtime": 25249.7106, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.8836363636363638, + "grad_norm": 0.009454211220145226, + "learning_rate": 9.220763462436937e-05, + "loss": 0.01177777536213398, + "num_input_tokens_seen": 50896608, + "step": 3108, + "train_runtime": 25257.8324, + "train_tokens_per_second": 2015.082 + }, + { + "epoch": 1.8842424242424243, + "grad_norm": 0.012845884077250957, + "learning_rate": 9.220247860452378e-05, + "loss": 0.011665490455925465, + "num_input_tokens_seen": 50912984, + "step": 3109, + "train_runtime": 25265.9507, + "train_tokens_per_second": 2015.083 + }, + { + "epoch": 1.8848484848484848, + "grad_norm": 0.008703289553523064, + "learning_rate": 9.21973210236907e-05, + "loss": 0.012141491286456585, + "num_input_tokens_seen": 50929360, + "step": 3110, + "train_runtime": 25274.0675, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.8854545454545455, + "grad_norm": 0.011421225033700466, + "learning_rate": 9.21921618820609e-05, + "loss": 0.013285665772855282, + "num_input_tokens_seen": 50945736, + "step": 3111, + "train_runtime": 25282.1874, + "train_tokens_per_second": 2015.084 + }, + { + "epoch": 1.8860606060606062, + "grad_norm": 0.009302772581577301, + "learning_rate": 9.218700117982519e-05, + "loss": 0.011570766568183899, + "num_input_tokens_seen": 50962112, + "step": 3112, + "train_runtime": 25290.3063, + "train_tokens_per_second": 2015.085 + }, + { + "epoch": 1.8866666666666667, + "grad_norm": 0.012814724817872047, + "learning_rate": 9.218183891717447e-05, + "loss": 0.011747146025300026, + "num_input_tokens_seen": 50978488, + "step": 3113, + "train_runtime": 25298.4184, + "train_tokens_per_second": 2015.086 + }, + { + "epoch": 1.8872727272727272, + "grad_norm": 0.011627566069364548, + "learning_rate": 9.217667509429966e-05, + "loss": 0.011807992123067379, + "num_input_tokens_seen": 50994864, + "step": 3114, + "train_runtime": 25306.5341, + "train_tokens_per_second": 2015.087 + }, + { + "epoch": 1.887878787878788, + "grad_norm": 0.010376955382525921, + "learning_rate": 9.217150971139178e-05, + "loss": 0.01147371158003807, + "num_input_tokens_seen": 51011240, + "step": 3115, + "train_runtime": 25314.6486, + "train_tokens_per_second": 2015.088 + }, + { + "epoch": 1.8884848484848484, + "grad_norm": 0.012348284013569355, + "learning_rate": 9.216634276864188e-05, + "loss": 0.011648658663034439, + "num_input_tokens_seen": 51027616, + "step": 3116, + "train_runtime": 25322.7595, + "train_tokens_per_second": 2015.089 + }, + { + "epoch": 1.8890909090909092, + "grad_norm": 0.009763936512172222, + "learning_rate": 9.216117426624107e-05, + "loss": 0.013211511075496674, + "num_input_tokens_seen": 51043992, + "step": 3117, + "train_runtime": 25330.8803, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.8896969696969697, + "grad_norm": 0.008509074337780476, + "learning_rate": 9.215600420438054e-05, + "loss": 0.013487070798873901, + "num_input_tokens_seen": 51060368, + "step": 3118, + "train_runtime": 25338.9955, + "train_tokens_per_second": 2015.09 + }, + { + "epoch": 1.8903030303030302, + "grad_norm": 0.0092775272205472, + "learning_rate": 9.215083258325152e-05, + "loss": 0.012261521071195602, + "num_input_tokens_seen": 51076744, + "step": 3119, + "train_runtime": 25347.1108, + "train_tokens_per_second": 2015.091 + }, + { + "epoch": 1.8909090909090909, + "grad_norm": 0.007059205323457718, + "learning_rate": 9.214565940304528e-05, + "loss": 0.011692165397107601, + "num_input_tokens_seen": 51093120, + "step": 3120, + "train_runtime": 25355.2335, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.8915151515151516, + "grad_norm": 0.01579814963042736, + "learning_rate": 9.214048466395316e-05, + "loss": 0.012515694834291935, + "num_input_tokens_seen": 51109496, + "step": 3121, + "train_runtime": 25363.3518, + "train_tokens_per_second": 2015.092 + }, + { + "epoch": 1.892121212121212, + "grad_norm": 0.008557470515370369, + "learning_rate": 9.213530836616657e-05, + "loss": 0.01166028343141079, + "num_input_tokens_seen": 51125872, + "step": 3122, + "train_runtime": 25371.4695, + "train_tokens_per_second": 2015.093 + }, + { + "epoch": 1.8927272727272726, + "grad_norm": 0.009341249242424965, + "learning_rate": 9.2130130509877e-05, + "loss": 0.01161861326545477, + "num_input_tokens_seen": 51142248, + "step": 3123, + "train_runtime": 25379.5857, + "train_tokens_per_second": 2015.094 + }, + { + "epoch": 1.8933333333333333, + "grad_norm": 0.009486070834100246, + "learning_rate": 9.212495109527594e-05, + "loss": 0.012822091579437256, + "num_input_tokens_seen": 51158624, + "step": 3124, + "train_runtime": 25387.6977, + "train_tokens_per_second": 2015.095 + }, + { + "epoch": 1.893939393939394, + "grad_norm": 0.004312561824917793, + "learning_rate": 9.211977012255498e-05, + "loss": 0.012744562700390816, + "num_input_tokens_seen": 51175000, + "step": 3125, + "train_runtime": 25395.8131, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.8945454545454545, + "grad_norm": 0.008629312738776207, + "learning_rate": 9.211458759190573e-05, + "loss": 0.012353931553661823, + "num_input_tokens_seen": 51191376, + "step": 3126, + "train_runtime": 25403.9329, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.895151515151515, + "grad_norm": 0.005605903919786215, + "learning_rate": 9.210940350351991e-05, + "loss": 0.01195718813687563, + "num_input_tokens_seen": 51207752, + "step": 3127, + "train_runtime": 25412.0511, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.8957575757575758, + "grad_norm": 0.010096116922795773, + "learning_rate": 9.210421785758927e-05, + "loss": 0.01376781240105629, + "num_input_tokens_seen": 51224128, + "step": 3128, + "train_runtime": 25420.1712, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.8963636363636365, + "grad_norm": 0.010645140893757343, + "learning_rate": 9.209903065430558e-05, + "loss": 0.012773418799042702, + "num_input_tokens_seen": 51240504, + "step": 3129, + "train_runtime": 25428.2855, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.896969696969697, + "grad_norm": 0.012686858884990215, + "learning_rate": 9.209384189386075e-05, + "loss": 0.01278688758611679, + "num_input_tokens_seen": 51256880, + "step": 3130, + "train_runtime": 25436.403, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.8975757575757575, + "grad_norm": 0.0073151239193975925, + "learning_rate": 9.208865157644668e-05, + "loss": 0.012328793294727802, + "num_input_tokens_seen": 51273256, + "step": 3131, + "train_runtime": 25444.5158, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.8981818181818182, + "grad_norm": 0.009652036242187023, + "learning_rate": 9.208345970225535e-05, + "loss": 0.012286683544516563, + "num_input_tokens_seen": 51289632, + "step": 3132, + "train_runtime": 25452.6326, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.898787878787879, + "grad_norm": 0.008292704820632935, + "learning_rate": 9.207826627147879e-05, + "loss": 0.01197260431945324, + "num_input_tokens_seen": 51306008, + "step": 3133, + "train_runtime": 25460.7457, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.8993939393939394, + "grad_norm": 0.014060231857001781, + "learning_rate": 9.207307128430913e-05, + "loss": 0.013115906156599522, + "num_input_tokens_seen": 51322384, + "step": 3134, + "train_runtime": 25468.8578, + "train_tokens_per_second": 2015.103 + }, + { + "epoch": 1.9, + "grad_norm": 0.011946297250688076, + "learning_rate": 9.206787474093848e-05, + "loss": 0.011841751635074615, + "num_input_tokens_seen": 51338760, + "step": 3135, + "train_runtime": 25476.9731, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.9006060606060606, + "grad_norm": 0.007455665152519941, + "learning_rate": 9.206267664155907e-05, + "loss": 0.012197930365800858, + "num_input_tokens_seen": 51355136, + "step": 3136, + "train_runtime": 25485.0917, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.9012121212121214, + "grad_norm": 0.026311933994293213, + "learning_rate": 9.205747698636316e-05, + "loss": 0.012256120331585407, + "num_input_tokens_seen": 51371512, + "step": 3137, + "train_runtime": 25493.2028, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.9018181818181819, + "grad_norm": 0.004334684461355209, + "learning_rate": 9.205227577554307e-05, + "loss": 0.011752675287425518, + "num_input_tokens_seen": 51387888, + "step": 3138, + "train_runtime": 25501.3209, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.9024242424242424, + "grad_norm": 0.028817525133490562, + "learning_rate": 9.204707300929121e-05, + "loss": 0.01371039915829897, + "num_input_tokens_seen": 51404264, + "step": 3139, + "train_runtime": 25509.437, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.903030303030303, + "grad_norm": 0.012218418531119823, + "learning_rate": 9.204186868779999e-05, + "loss": 0.012458586134016514, + "num_input_tokens_seen": 51420640, + "step": 3140, + "train_runtime": 25517.548, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.9036363636363638, + "grad_norm": 0.010442850179970264, + "learning_rate": 9.203666281126193e-05, + "loss": 0.012032663449645042, + "num_input_tokens_seen": 51437016, + "step": 3141, + "train_runtime": 25525.6624, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 1.9042424242424243, + "grad_norm": 0.010629463009536266, + "learning_rate": 9.203145537986957e-05, + "loss": 0.012171917594969273, + "num_input_tokens_seen": 51453392, + "step": 3142, + "train_runtime": 25533.7739, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.9048484848484848, + "grad_norm": 0.004095530137419701, + "learning_rate": 9.202624639381552e-05, + "loss": 0.011937003582715988, + "num_input_tokens_seen": 51469768, + "step": 3143, + "train_runtime": 25541.8863, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.9054545454545453, + "grad_norm": 0.00807954091578722, + "learning_rate": 9.202103585329247e-05, + "loss": 0.012175404466688633, + "num_input_tokens_seen": 51486144, + "step": 3144, + "train_runtime": 25550.002, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.906060606060606, + "grad_norm": 0.012023149989545345, + "learning_rate": 9.201582375849313e-05, + "loss": 0.013452206738293171, + "num_input_tokens_seen": 51502520, + "step": 3145, + "train_runtime": 25558.118, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.9066666666666667, + "grad_norm": 0.006550980266183615, + "learning_rate": 9.20106101096103e-05, + "loss": 0.011639823205769062, + "num_input_tokens_seen": 51518896, + "step": 3146, + "train_runtime": 25566.2333, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.9072727272727272, + "grad_norm": 0.011515556834638119, + "learning_rate": 9.200539490683682e-05, + "loss": 0.0110179977491498, + "num_input_tokens_seen": 51535272, + "step": 3147, + "train_runtime": 25574.3546, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.9078787878787877, + "grad_norm": 0.014057625085115433, + "learning_rate": 9.200017815036557e-05, + "loss": 0.014424681663513184, + "num_input_tokens_seen": 51551648, + "step": 3148, + "train_runtime": 25582.4699, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.9084848484848485, + "grad_norm": 0.00884437095373869, + "learning_rate": 9.199495984038953e-05, + "loss": 0.011430526152253151, + "num_input_tokens_seen": 51568024, + "step": 3149, + "train_runtime": 25590.5833, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.9090909090909092, + "grad_norm": 0.007343432400375605, + "learning_rate": 9.198973997710169e-05, + "loss": 0.01341752428561449, + "num_input_tokens_seen": 51584400, + "step": 3150, + "train_runtime": 25598.6979, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.9096969696969697, + "grad_norm": 0.011537443846464157, + "learning_rate": 9.198451856069515e-05, + "loss": 0.012639664113521576, + "num_input_tokens_seen": 51600776, + "step": 3151, + "train_runtime": 25606.8086, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.9103030303030302, + "grad_norm": 0.006157017312943935, + "learning_rate": 9.197929559136304e-05, + "loss": 0.012154512107372284, + "num_input_tokens_seen": 51617152, + "step": 3152, + "train_runtime": 25614.9229, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.910909090909091, + "grad_norm": 0.014012346975505352, + "learning_rate": 9.197407106929851e-05, + "loss": 0.014261198230087757, + "num_input_tokens_seen": 51633528, + "step": 3153, + "train_runtime": 25623.042, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.9115151515151516, + "grad_norm": 0.011342213489115238, + "learning_rate": 9.196884499469486e-05, + "loss": 0.012025252915918827, + "num_input_tokens_seen": 51649904, + "step": 3154, + "train_runtime": 25631.1577, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.912121212121212, + "grad_norm": 0.008528700098395348, + "learning_rate": 9.196361736774535e-05, + "loss": 0.01199396327137947, + "num_input_tokens_seen": 51666280, + "step": 3155, + "train_runtime": 25639.2724, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.9127272727272726, + "grad_norm": 0.00821691658347845, + "learning_rate": 9.195838818864337e-05, + "loss": 0.012443341314792633, + "num_input_tokens_seen": 51682656, + "step": 3156, + "train_runtime": 25647.3857, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.9133333333333333, + "grad_norm": 0.010422799736261368, + "learning_rate": 9.19531574575823e-05, + "loss": 0.011776940897107124, + "num_input_tokens_seen": 51699032, + "step": 3157, + "train_runtime": 25655.4992, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 1.913939393939394, + "grad_norm": 0.012758140452206135, + "learning_rate": 9.194792517475565e-05, + "loss": 0.013998530805110931, + "num_input_tokens_seen": 51715408, + "step": 3158, + "train_runtime": 25663.6124, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.9145454545454546, + "grad_norm": 0.0021320863161236048, + "learning_rate": 9.194269134035692e-05, + "loss": 0.011572036892175674, + "num_input_tokens_seen": 51731784, + "step": 3159, + "train_runtime": 25671.7345, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.915151515151515, + "grad_norm": 0.005653473548591137, + "learning_rate": 9.193745595457974e-05, + "loss": 0.01234716922044754, + "num_input_tokens_seen": 51748160, + "step": 3160, + "train_runtime": 25679.8465, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.9157575757575758, + "grad_norm": 0.00827194843441248, + "learning_rate": 9.193221901761772e-05, + "loss": 0.011896461248397827, + "num_input_tokens_seen": 51764536, + "step": 3161, + "train_runtime": 25687.958, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.9163636363636365, + "grad_norm": 0.00640900107100606, + "learning_rate": 9.192698052966457e-05, + "loss": 0.011466245166957378, + "num_input_tokens_seen": 51780912, + "step": 3162, + "train_runtime": 25696.0738, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.916969696969697, + "grad_norm": 0.0063836583867669106, + "learning_rate": 9.192174049091407e-05, + "loss": 0.011264004744589329, + "num_input_tokens_seen": 51797288, + "step": 3163, + "train_runtime": 25704.1858, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.9175757575757575, + "grad_norm": 0.011097591370344162, + "learning_rate": 9.191649890156003e-05, + "loss": 0.014016124419867992, + "num_input_tokens_seen": 51813664, + "step": 3164, + "train_runtime": 25712.3007, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.9181818181818182, + "grad_norm": 0.008603382855653763, + "learning_rate": 9.191125576179634e-05, + "loss": 0.011245546862483025, + "num_input_tokens_seen": 51830040, + "step": 3165, + "train_runtime": 25720.4175, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.918787878787879, + "grad_norm": 0.007882286794483662, + "learning_rate": 9.19060110718169e-05, + "loss": 0.011330833658576012, + "num_input_tokens_seen": 51846416, + "step": 3166, + "train_runtime": 25728.533, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.9193939393939394, + "grad_norm": 0.009839195758104324, + "learning_rate": 9.190076483181572e-05, + "loss": 0.012397222220897675, + "num_input_tokens_seen": 51862792, + "step": 3167, + "train_runtime": 25736.6463, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.92, + "grad_norm": 0.021807054057717323, + "learning_rate": 9.189551704198683e-05, + "loss": 0.014394733123481274, + "num_input_tokens_seen": 51879168, + "step": 3168, + "train_runtime": 25744.7632, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.9206060606060606, + "grad_norm": 0.02324414625763893, + "learning_rate": 9.189026770252436e-05, + "loss": 0.013463632203638554, + "num_input_tokens_seen": 51895544, + "step": 3169, + "train_runtime": 25752.8788, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.9212121212121214, + "grad_norm": 0.008596673607826233, + "learning_rate": 9.18850168136225e-05, + "loss": 0.012573868036270142, + "num_input_tokens_seen": 51911920, + "step": 3170, + "train_runtime": 25760.9907, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.9218181818181819, + "grad_norm": 0.008097982965409756, + "learning_rate": 9.187976437547538e-05, + "loss": 0.013436982408165932, + "num_input_tokens_seen": 51928296, + "step": 3171, + "train_runtime": 25769.11, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.9224242424242424, + "grad_norm": 0.01106534618884325, + "learning_rate": 9.187451038827737e-05, + "loss": 0.012379739433526993, + "num_input_tokens_seen": 51944672, + "step": 3172, + "train_runtime": 25777.2322, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.9230303030303029, + "grad_norm": 0.011437936685979366, + "learning_rate": 9.186925485222276e-05, + "loss": 0.01257653534412384, + "num_input_tokens_seen": 51961048, + "step": 3173, + "train_runtime": 25785.3456, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.9236363636363636, + "grad_norm": 0.0026785824447870255, + "learning_rate": 9.186399776750596e-05, + "loss": 0.010649677366018295, + "num_input_tokens_seen": 51977424, + "step": 3174, + "train_runtime": 25793.4554, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.9242424242424243, + "grad_norm": 0.004529865458607674, + "learning_rate": 9.185873913432139e-05, + "loss": 0.01199475396424532, + "num_input_tokens_seen": 51993800, + "step": 3175, + "train_runtime": 25801.5682, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.9248484848484848, + "grad_norm": 0.008403346873819828, + "learning_rate": 9.185347895286358e-05, + "loss": 0.011764888651669025, + "num_input_tokens_seen": 52010176, + "step": 3176, + "train_runtime": 25809.6829, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.9254545454545453, + "grad_norm": 0.005153805483132601, + "learning_rate": 9.18482172233271e-05, + "loss": 0.011884771287441254, + "num_input_tokens_seen": 52026552, + "step": 3177, + "train_runtime": 25817.798, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.926060606060606, + "grad_norm": 0.008104590699076653, + "learning_rate": 9.184295394590655e-05, + "loss": 0.012589624151587486, + "num_input_tokens_seen": 52042928, + "step": 3178, + "train_runtime": 25825.9099, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.9266666666666667, + "grad_norm": 0.004483804106712341, + "learning_rate": 9.183768912079662e-05, + "loss": 0.011564332991838455, + "num_input_tokens_seen": 52059304, + "step": 3179, + "train_runtime": 25834.0235, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.9272727272727272, + "grad_norm": 0.007661936338990927, + "learning_rate": 9.183242274819205e-05, + "loss": 0.011691214516758919, + "num_input_tokens_seen": 52075680, + "step": 3180, + "train_runtime": 25842.1332, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 1.9278787878787877, + "grad_norm": 0.006727028172463179, + "learning_rate": 9.182715482828763e-05, + "loss": 0.011927913874387741, + "num_input_tokens_seen": 52092056, + "step": 3181, + "train_runtime": 25850.2436, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.9284848484848485, + "grad_norm": 0.007131942082196474, + "learning_rate": 9.18218853612782e-05, + "loss": 0.01202892605215311, + "num_input_tokens_seen": 52108432, + "step": 3182, + "train_runtime": 25858.354, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.9290909090909092, + "grad_norm": 0.018185211345553398, + "learning_rate": 9.181661434735867e-05, + "loss": 0.011910402216017246, + "num_input_tokens_seen": 52124808, + "step": 3183, + "train_runtime": 25866.4679, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.9296969696969697, + "grad_norm": 0.013236770406365395, + "learning_rate": 9.181134178672401e-05, + "loss": 0.011599000543355942, + "num_input_tokens_seen": 52141184, + "step": 3184, + "train_runtime": 25874.5825, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.9303030303030302, + "grad_norm": 0.007080638315528631, + "learning_rate": 9.180606767956925e-05, + "loss": 0.011230498552322388, + "num_input_tokens_seen": 52157560, + "step": 3185, + "train_runtime": 25882.7011, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.930909090909091, + "grad_norm": 0.006421273574233055, + "learning_rate": 9.180079202608947e-05, + "loss": 0.012318437919020653, + "num_input_tokens_seen": 52173936, + "step": 3186, + "train_runtime": 25890.8138, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.9315151515151516, + "grad_norm": 0.006406648550182581, + "learning_rate": 9.179551482647978e-05, + "loss": 0.012266881763935089, + "num_input_tokens_seen": 52190312, + "step": 3187, + "train_runtime": 25898.9317, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 1.9321212121212121, + "grad_norm": 0.008923706598579884, + "learning_rate": 9.17902360809354e-05, + "loss": 0.011904872953891754, + "num_input_tokens_seen": 52206688, + "step": 3188, + "train_runtime": 25907.0441, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 1.9327272727272726, + "grad_norm": 0.009994618594646454, + "learning_rate": 9.178495578965157e-05, + "loss": 0.01313931867480278, + "num_input_tokens_seen": 52223064, + "step": 3189, + "train_runtime": 25915.1572, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 0.010433097369968891, + "learning_rate": 9.177967395282359e-05, + "loss": 0.011210711672902107, + "num_input_tokens_seen": 52239440, + "step": 3190, + "train_runtime": 25923.2678, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 1.933939393939394, + "grad_norm": 0.01240938063710928, + "learning_rate": 9.177439057064683e-05, + "loss": 0.012704628519713879, + "num_input_tokens_seen": 52255816, + "step": 3191, + "train_runtime": 25931.3805, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 1.9345454545454546, + "grad_norm": 0.011206640861928463, + "learning_rate": 9.176910564331671e-05, + "loss": 0.012657486833631992, + "num_input_tokens_seen": 52272192, + "step": 3192, + "train_runtime": 25939.4924, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 1.935151515151515, + "grad_norm": 0.01022346131503582, + "learning_rate": 9.176381917102873e-05, + "loss": 0.011253394186496735, + "num_input_tokens_seen": 52288568, + "step": 3193, + "train_runtime": 25947.6051, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 1.9357575757575758, + "grad_norm": 0.01788550242781639, + "learning_rate": 9.17585311539784e-05, + "loss": 0.013412839733064175, + "num_input_tokens_seen": 52304944, + "step": 3194, + "train_runtime": 25955.7207, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 1.9363636363636365, + "grad_norm": 0.01697622984647751, + "learning_rate": 9.175324159236132e-05, + "loss": 0.01345901656895876, + "num_input_tokens_seen": 52321320, + "step": 3195, + "train_runtime": 25963.8327, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 1.936969696969697, + "grad_norm": 0.009710061363875866, + "learning_rate": 9.174795048637316e-05, + "loss": 0.012749395333230495, + "num_input_tokens_seen": 52337696, + "step": 3196, + "train_runtime": 25971.9426, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 1.9375757575757575, + "grad_norm": 0.011495089158415794, + "learning_rate": 9.174265783620961e-05, + "loss": 0.0130428122356534, + "num_input_tokens_seen": 52354072, + "step": 3197, + "train_runtime": 25980.0524, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 1.9381818181818182, + "grad_norm": 0.006617996841669083, + "learning_rate": 9.173736364206642e-05, + "loss": 0.01320140715688467, + "num_input_tokens_seen": 52370448, + "step": 3198, + "train_runtime": 25988.1659, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 1.938787878787879, + "grad_norm": 0.006989014334976673, + "learning_rate": 9.173206790413945e-05, + "loss": 0.011760172434151173, + "num_input_tokens_seen": 52386824, + "step": 3199, + "train_runtime": 25996.2801, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 1.9393939393939394, + "grad_norm": 0.035555772483348846, + "learning_rate": 9.172677062262453e-05, + "loss": 0.01268516480922699, + "num_input_tokens_seen": 52403200, + "step": 3200, + "train_runtime": 26004.3932, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 1.94, + "grad_norm": 0.007417343556880951, + "learning_rate": 9.172147179771765e-05, + "loss": 0.011998838745057583, + "num_input_tokens_seen": 52419576, + "step": 3201, + "train_runtime": 26013.4365, + "train_tokens_per_second": 2015.096 + }, + { + "epoch": 1.9406060606060604, + "grad_norm": 0.007505638990551233, + "learning_rate": 9.171617142961477e-05, + "loss": 0.012394100427627563, + "num_input_tokens_seen": 52435952, + "step": 3202, + "train_runtime": 26021.5523, + "train_tokens_per_second": 2015.097 + }, + { + "epoch": 1.9412121212121212, + "grad_norm": 0.012211352586746216, + "learning_rate": 9.171086951851194e-05, + "loss": 0.012769252061843872, + "num_input_tokens_seen": 52452328, + "step": 3203, + "train_runtime": 26029.6684, + "train_tokens_per_second": 2015.098 + }, + { + "epoch": 1.9418181818181819, + "grad_norm": 0.007897526025772095, + "learning_rate": 9.170556606460527e-05, + "loss": 0.011707163415849209, + "num_input_tokens_seen": 52468704, + "step": 3204, + "train_runtime": 26037.7862, + "train_tokens_per_second": 2015.099 + }, + { + "epoch": 1.9424242424242424, + "grad_norm": 0.012640303000807762, + "learning_rate": 9.170026106809095e-05, + "loss": 0.012289149686694145, + "num_input_tokens_seen": 52485080, + "step": 3205, + "train_runtime": 26045.8995, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.9430303030303029, + "grad_norm": 0.006511483807116747, + "learning_rate": 9.169495452916516e-05, + "loss": 0.01183705311268568, + "num_input_tokens_seen": 52501456, + "step": 3206, + "train_runtime": 26054.0156, + "train_tokens_per_second": 2015.1 + }, + { + "epoch": 1.9436363636363636, + "grad_norm": 0.013782680965960026, + "learning_rate": 9.168964644802422e-05, + "loss": 0.011273516342043877, + "num_input_tokens_seen": 52517832, + "step": 3207, + "train_runtime": 26062.134, + "train_tokens_per_second": 2015.101 + }, + { + "epoch": 1.9442424242424243, + "grad_norm": 0.008283359929919243, + "learning_rate": 9.168433682486444e-05, + "loss": 0.012856019660830498, + "num_input_tokens_seen": 52534208, + "step": 3208, + "train_runtime": 26070.2527, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.9448484848484848, + "grad_norm": 0.006199698895215988, + "learning_rate": 9.16790256598822e-05, + "loss": 0.011374854482710361, + "num_input_tokens_seen": 52550584, + "step": 3209, + "train_runtime": 26078.3698, + "train_tokens_per_second": 2015.102 + }, + { + "epoch": 1.9454545454545453, + "grad_norm": 0.004435536917299032, + "learning_rate": 9.167371295327399e-05, + "loss": 0.011426806449890137, + "num_input_tokens_seen": 52566960, + "step": 3210, + "train_runtime": 26086.4805, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 1.946060606060606, + "grad_norm": 0.00908664334565401, + "learning_rate": 9.166839870523627e-05, + "loss": 0.011732470244169235, + "num_input_tokens_seen": 52583336, + "step": 3211, + "train_runtime": 26094.5916, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 1.9466666666666668, + "grad_norm": 0.007592847105115652, + "learning_rate": 9.166308291596563e-05, + "loss": 0.012461712583899498, + "num_input_tokens_seen": 52599712, + "step": 3212, + "train_runtime": 26102.7074, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 1.9472727272727273, + "grad_norm": 0.013892276212573051, + "learning_rate": 9.16577655856587e-05, + "loss": 0.012362895533442497, + "num_input_tokens_seen": 52616088, + "step": 3213, + "train_runtime": 26110.8219, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.9478787878787878, + "grad_norm": 0.006196176167577505, + "learning_rate": 9.165244671451214e-05, + "loss": 0.011770393699407578, + "num_input_tokens_seen": 52632464, + "step": 3214, + "train_runtime": 26118.9426, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 1.9484848484848485, + "grad_norm": 0.007588802836835384, + "learning_rate": 9.16471263027227e-05, + "loss": 0.011993967927992344, + "num_input_tokens_seen": 52648840, + "step": 3215, + "train_runtime": 26127.0587, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 1.9490909090909092, + "grad_norm": 0.006540258880704641, + "learning_rate": 9.164180435048715e-05, + "loss": 0.012559541501104832, + "num_input_tokens_seen": 52665216, + "step": 3216, + "train_runtime": 26135.1757, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.9496969696969697, + "grad_norm": 0.007406895514577627, + "learning_rate": 9.163648085800236e-05, + "loss": 0.012432373128831387, + "num_input_tokens_seen": 52681592, + "step": 3217, + "train_runtime": 26143.2906, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 1.9503030303030302, + "grad_norm": 0.006238951813429594, + "learning_rate": 9.163115582546522e-05, + "loss": 0.012486881576478481, + "num_input_tokens_seen": 52697968, + "step": 3218, + "train_runtime": 26151.4015, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 1.950909090909091, + "grad_norm": 0.005955249071121216, + "learning_rate": 9.162582925307271e-05, + "loss": 0.012666616588830948, + "num_input_tokens_seen": 52714344, + "step": 3219, + "train_runtime": 26159.5147, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.9515151515151516, + "grad_norm": 0.01560470461845398, + "learning_rate": 9.162050114102184e-05, + "loss": 0.01188839040696621, + "num_input_tokens_seen": 52730720, + "step": 3220, + "train_runtime": 26167.6327, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 1.9521212121212121, + "grad_norm": 0.006458323448896408, + "learning_rate": 9.161517148950967e-05, + "loss": 0.011296706274151802, + "num_input_tokens_seen": 52747096, + "step": 3221, + "train_runtime": 26175.7479, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 1.9527272727272726, + "grad_norm": 0.006805418990552425, + "learning_rate": 9.160984029873334e-05, + "loss": 0.012141970917582512, + "num_input_tokens_seen": 52763472, + "step": 3222, + "train_runtime": 26183.8621, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 1.9533333333333334, + "grad_norm": 0.007566072512418032, + "learning_rate": 9.160450756889006e-05, + "loss": 0.01246306486427784, + "num_input_tokens_seen": 52779848, + "step": 3223, + "train_runtime": 26191.9843, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 1.953939393939394, + "grad_norm": 0.008537397719919682, + "learning_rate": 9.159917330017707e-05, + "loss": 0.012608175165951252, + "num_input_tokens_seen": 52796224, + "step": 3224, + "train_runtime": 26200.0967, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.9545454545454546, + "grad_norm": 0.008435552939772606, + "learning_rate": 9.159383749279167e-05, + "loss": 0.012503480538725853, + "num_input_tokens_seen": 52812600, + "step": 3225, + "train_runtime": 26208.214, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 1.955151515151515, + "grad_norm": 0.00943230465054512, + "learning_rate": 9.158850014693123e-05, + "loss": 0.012536080554127693, + "num_input_tokens_seen": 52828976, + "step": 3226, + "train_runtime": 26216.3339, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 1.9557575757575758, + "grad_norm": 0.010770905762910843, + "learning_rate": 9.158316126279314e-05, + "loss": 0.012846671044826508, + "num_input_tokens_seen": 52845352, + "step": 3227, + "train_runtime": 26224.4505, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.9563636363636365, + "grad_norm": 0.00734669528901577, + "learning_rate": 9.157782084057491e-05, + "loss": 0.011868518777191639, + "num_input_tokens_seen": 52861728, + "step": 3228, + "train_runtime": 26232.5655, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 1.956969696969697, + "grad_norm": 0.00950642116367817, + "learning_rate": 9.157247888047405e-05, + "loss": 0.011426198296248913, + "num_input_tokens_seen": 52878104, + "step": 3229, + "train_runtime": 26240.6832, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 1.9575757575757575, + "grad_norm": 0.0105692557990551, + "learning_rate": 9.156713538268815e-05, + "loss": 0.01276457030326128, + "num_input_tokens_seen": 52894480, + "step": 3230, + "train_runtime": 26248.7985, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 1.958181818181818, + "grad_norm": 0.012945852242410183, + "learning_rate": 9.156179034741486e-05, + "loss": 0.012674668803811073, + "num_input_tokens_seen": 52910856, + "step": 3231, + "train_runtime": 26256.9107, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 1.9587878787878787, + "grad_norm": 0.011738558299839497, + "learning_rate": 9.155644377485188e-05, + "loss": 0.012540457770228386, + "num_input_tokens_seen": 52927232, + "step": 3232, + "train_runtime": 26265.0321, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 1.9593939393939395, + "grad_norm": 0.007134211249649525, + "learning_rate": 9.1551095665197e-05, + "loss": 0.013350131921470165, + "num_input_tokens_seen": 52943608, + "step": 3233, + "train_runtime": 26273.1455, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 1.96, + "grad_norm": 0.004556073807179928, + "learning_rate": 9.154574601864799e-05, + "loss": 0.012090170755982399, + "num_input_tokens_seen": 52959984, + "step": 3234, + "train_runtime": 26281.2589, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.9606060606060605, + "grad_norm": 0.0194314606487751, + "learning_rate": 9.154039483540273e-05, + "loss": 0.013460388407111168, + "num_input_tokens_seen": 52976360, + "step": 3235, + "train_runtime": 26289.3732, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 1.9612121212121212, + "grad_norm": 0.011464192532002926, + "learning_rate": 9.153504211565917e-05, + "loss": 0.009950187988579273, + "num_input_tokens_seen": 52992736, + "step": 3236, + "train_runtime": 26297.4864, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 1.961818181818182, + "grad_norm": 0.00886337086558342, + "learning_rate": 9.152968785961529e-05, + "loss": 0.012892349623143673, + "num_input_tokens_seen": 53009112, + "step": 3237, + "train_runtime": 26305.6, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.9624242424242424, + "grad_norm": 0.008062895387411118, + "learning_rate": 9.152433206746913e-05, + "loss": 0.012557323090732098, + "num_input_tokens_seen": 53025488, + "step": 3238, + "train_runtime": 26313.7176, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 1.963030303030303, + "grad_norm": 0.009354766458272934, + "learning_rate": 9.151897473941879e-05, + "loss": 0.012314814142882824, + "num_input_tokens_seen": 53041864, + "step": 3239, + "train_runtime": 26321.8323, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 1.9636363636363636, + "grad_norm": 0.007434674073010683, + "learning_rate": 9.151361587566246e-05, + "loss": 0.012359886430203915, + "num_input_tokens_seen": 53058240, + "step": 3240, + "train_runtime": 26329.9456, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 1.9642424242424243, + "grad_norm": 0.010088525712490082, + "learning_rate": 9.150825547639827e-05, + "loss": 0.013497358188033104, + "num_input_tokens_seen": 53074616, + "step": 3241, + "train_runtime": 26338.0619, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 1.9648484848484848, + "grad_norm": 0.010220595635473728, + "learning_rate": 9.150289354182458e-05, + "loss": 0.01281267125159502, + "num_input_tokens_seen": 53090992, + "step": 3242, + "train_runtime": 26346.1755, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 1.9654545454545453, + "grad_norm": 0.007582931779325008, + "learning_rate": 9.149753007213966e-05, + "loss": 0.011935632675886154, + "num_input_tokens_seen": 53107368, + "step": 3243, + "train_runtime": 26354.2901, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 1.966060606060606, + "grad_norm": 0.03495870903134346, + "learning_rate": 9.149216506754192e-05, + "loss": 0.013317975215613842, + "num_input_tokens_seen": 53123744, + "step": 3244, + "train_runtime": 26362.4037, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.9666666666666668, + "grad_norm": 0.009395863860845566, + "learning_rate": 9.148679852822981e-05, + "loss": 0.012221533805131912, + "num_input_tokens_seen": 53140120, + "step": 3245, + "train_runtime": 26370.5241, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 1.9672727272727273, + "grad_norm": 0.011900022625923157, + "learning_rate": 9.14814304544018e-05, + "loss": 0.011450408957898617, + "num_input_tokens_seen": 53156496, + "step": 3246, + "train_runtime": 26378.6407, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 1.9678787878787878, + "grad_norm": 0.004700829740613699, + "learning_rate": 9.147606084625648e-05, + "loss": 0.012666188180446625, + "num_input_tokens_seen": 53172872, + "step": 3247, + "train_runtime": 26386.7572, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 1.9684848484848485, + "grad_norm": 0.00924025196582079, + "learning_rate": 9.147068970399242e-05, + "loss": 0.013658061623573303, + "num_input_tokens_seen": 53189248, + "step": 3248, + "train_runtime": 26394.8721, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 1.9690909090909092, + "grad_norm": 0.009269666858017445, + "learning_rate": 9.146531702780832e-05, + "loss": 0.012272707186639309, + "num_input_tokens_seen": 53205624, + "step": 3249, + "train_runtime": 26402.9859, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 1.9696969696969697, + "grad_norm": 0.00729750981554389, + "learning_rate": 9.145994281790287e-05, + "loss": 0.011746074073016644, + "num_input_tokens_seen": 53222000, + "step": 3250, + "train_runtime": 26411.1001, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.9703030303030302, + "grad_norm": 0.011676258407533169, + "learning_rate": 9.145456707447491e-05, + "loss": 0.01279502548277378, + "num_input_tokens_seen": 53238376, + "step": 3251, + "train_runtime": 26419.2149, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 1.970909090909091, + "grad_norm": 0.005959291011095047, + "learning_rate": 9.144918979772322e-05, + "loss": 0.011548520997166634, + "num_input_tokens_seen": 53254752, + "step": 3252, + "train_runtime": 26427.3347, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 1.9715151515151517, + "grad_norm": 0.010743229649960995, + "learning_rate": 9.144381098784671e-05, + "loss": 0.011153536848723888, + "num_input_tokens_seen": 53271128, + "step": 3253, + "train_runtime": 26435.4522, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.9721212121212122, + "grad_norm": 0.011109214276075363, + "learning_rate": 9.143843064504437e-05, + "loss": 0.011543774977326393, + "num_input_tokens_seen": 53287504, + "step": 3254, + "train_runtime": 26443.568, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 1.9727272727272727, + "grad_norm": 0.012104131281375885, + "learning_rate": 9.143304876951515e-05, + "loss": 0.012650061398744583, + "num_input_tokens_seen": 53303880, + "step": 3255, + "train_runtime": 26451.684, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 1.9733333333333334, + "grad_norm": 0.007510365452617407, + "learning_rate": 9.142766536145815e-05, + "loss": 0.01217754278331995, + "num_input_tokens_seen": 53320256, + "step": 3256, + "train_runtime": 26459.8042, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 1.973939393939394, + "grad_norm": 0.011189469136297703, + "learning_rate": 9.142228042107248e-05, + "loss": 0.012512904591858387, + "num_input_tokens_seen": 53336632, + "step": 3257, + "train_runtime": 26467.918, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.9745454545454546, + "grad_norm": 0.015042346902191639, + "learning_rate": 9.141689394855734e-05, + "loss": 0.012815610505640507, + "num_input_tokens_seen": 53353008, + "step": 3258, + "train_runtime": 26476.0346, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 1.975151515151515, + "grad_norm": 0.0030142359901219606, + "learning_rate": 9.141150594411195e-05, + "loss": 0.012168426997959614, + "num_input_tokens_seen": 53369384, + "step": 3259, + "train_runtime": 26484.1476, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 1.9757575757575756, + "grad_norm": 0.0037197330966591835, + "learning_rate": 9.140611640793558e-05, + "loss": 0.012724127620458603, + "num_input_tokens_seen": 53385760, + "step": 3260, + "train_runtime": 26492.262, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 1.9763636363636363, + "grad_norm": 0.009197385981678963, + "learning_rate": 9.14007253402276e-05, + "loss": 0.011984776705503464, + "num_input_tokens_seen": 53402136, + "step": 3261, + "train_runtime": 26500.3733, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.976969696969697, + "grad_norm": 0.00908108614385128, + "learning_rate": 9.139533274118743e-05, + "loss": 0.013469175435602665, + "num_input_tokens_seen": 53418512, + "step": 3262, + "train_runtime": 26508.4878, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 1.9775757575757575, + "grad_norm": 0.008263330906629562, + "learning_rate": 9.138993861101452e-05, + "loss": 0.012428317219018936, + "num_input_tokens_seen": 53434888, + "step": 3263, + "train_runtime": 26516.6014, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 1.978181818181818, + "grad_norm": 0.00935580488294363, + "learning_rate": 9.138454294990837e-05, + "loss": 0.012690423987805843, + "num_input_tokens_seen": 53451264, + "step": 3264, + "train_runtime": 26524.7165, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 1.9787878787878788, + "grad_norm": 0.006904932204633951, + "learning_rate": 9.137914575806856e-05, + "loss": 0.013632209971547127, + "num_input_tokens_seen": 53467640, + "step": 3265, + "train_runtime": 26532.8344, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 1.9793939393939395, + "grad_norm": 0.010423954576253891, + "learning_rate": 9.137374703569475e-05, + "loss": 0.012165880762040615, + "num_input_tokens_seen": 53484016, + "step": 3266, + "train_runtime": 26540.948, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 1.98, + "grad_norm": 0.011065114289522171, + "learning_rate": 9.13683467829866e-05, + "loss": 0.01318406593054533, + "num_input_tokens_seen": 53500392, + "step": 3267, + "train_runtime": 26549.0617, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 1.9806060606060605, + "grad_norm": 0.007728222757577896, + "learning_rate": 9.136294500014386e-05, + "loss": 0.013370378874242306, + "num_input_tokens_seen": 53516768, + "step": 3268, + "train_runtime": 26557.1762, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 1.9812121212121212, + "grad_norm": 0.010369036346673965, + "learning_rate": 9.135754168736635e-05, + "loss": 0.01195678859949112, + "num_input_tokens_seen": 53533144, + "step": 3269, + "train_runtime": 26565.2909, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 1.981818181818182, + "grad_norm": 0.007326615508645773, + "learning_rate": 9.135213684485389e-05, + "loss": 0.0117134815081954, + "num_input_tokens_seen": 53549520, + "step": 3270, + "train_runtime": 26573.4064, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 1.9824242424242424, + "grad_norm": 0.006437429692596197, + "learning_rate": 9.134673047280645e-05, + "loss": 0.011881757527589798, + "num_input_tokens_seen": 53565896, + "step": 3271, + "train_runtime": 26581.5229, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 1.983030303030303, + "grad_norm": 0.011852375231683254, + "learning_rate": 9.134132257142394e-05, + "loss": 0.013348350301384926, + "num_input_tokens_seen": 53582272, + "step": 3272, + "train_runtime": 26589.6392, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 1.9836363636363636, + "grad_norm": 0.008073097094893456, + "learning_rate": 9.133591314090643e-05, + "loss": 0.011144507676362991, + "num_input_tokens_seen": 53598648, + "step": 3273, + "train_runtime": 26597.7518, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 1.9842424242424244, + "grad_norm": 0.006733953952789307, + "learning_rate": 9.133050218145398e-05, + "loss": 0.012771431356668472, + "num_input_tokens_seen": 53615024, + "step": 3274, + "train_runtime": 26605.8616, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 1.9848484848484849, + "grad_norm": 0.005549773573875427, + "learning_rate": 9.132508969326675e-05, + "loss": 0.012543238699436188, + "num_input_tokens_seen": 53631400, + "step": 3275, + "train_runtime": 26613.98, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 1.9854545454545454, + "grad_norm": 0.006168725900352001, + "learning_rate": 9.131967567654493e-05, + "loss": 0.011709775775671005, + "num_input_tokens_seen": 53647776, + "step": 3276, + "train_runtime": 26622.0908, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 1.986060606060606, + "grad_norm": 0.008207867853343487, + "learning_rate": 9.131426013148876e-05, + "loss": 0.011277429759502411, + "num_input_tokens_seen": 53664152, + "step": 3277, + "train_runtime": 26630.2042, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 1.9866666666666668, + "grad_norm": 0.014408214017748833, + "learning_rate": 9.130884305829859e-05, + "loss": 0.01298760250210762, + "num_input_tokens_seen": 53680528, + "step": 3278, + "train_runtime": 26638.321, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 1.9872727272727273, + "grad_norm": 0.003766058012843132, + "learning_rate": 9.130342445717475e-05, + "loss": 0.011248024180531502, + "num_input_tokens_seen": 53696904, + "step": 3279, + "train_runtime": 26646.4374, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 1.9878787878787878, + "grad_norm": 0.006661458872258663, + "learning_rate": 9.129800432831767e-05, + "loss": 0.013307865709066391, + "num_input_tokens_seen": 53713280, + "step": 3280, + "train_runtime": 26654.5509, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 1.9884848484848485, + "grad_norm": 0.02135315351188183, + "learning_rate": 9.129258267192783e-05, + "loss": 0.012892954051494598, + "num_input_tokens_seen": 53729656, + "step": 3281, + "train_runtime": 26662.6636, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 1.9890909090909092, + "grad_norm": 0.01414970587939024, + "learning_rate": 9.128715948820576e-05, + "loss": 0.01163790188729763, + "num_input_tokens_seen": 53746032, + "step": 3282, + "train_runtime": 26670.7762, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 1.9896969696969697, + "grad_norm": 0.007704513147473335, + "learning_rate": 9.128173477735209e-05, + "loss": 0.01249206718057394, + "num_input_tokens_seen": 53762408, + "step": 3283, + "train_runtime": 26678.8915, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 1.9903030303030302, + "grad_norm": 0.01047444436699152, + "learning_rate": 9.127630853956743e-05, + "loss": 0.011483356356620789, + "num_input_tokens_seen": 53778784, + "step": 3284, + "train_runtime": 26687.0032, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 1.990909090909091, + "grad_norm": 0.013243939727544785, + "learning_rate": 9.12708807750525e-05, + "loss": 0.012570211663842201, + "num_input_tokens_seen": 53795160, + "step": 3285, + "train_runtime": 26695.1198, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 1.9915151515151515, + "grad_norm": 0.008042296394705772, + "learning_rate": 9.126545148400807e-05, + "loss": 0.011773718520998955, + "num_input_tokens_seen": 53811536, + "step": 3286, + "train_runtime": 26703.2386, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 1.9921212121212122, + "grad_norm": 0.010223199613392353, + "learning_rate": 9.126002066663492e-05, + "loss": 0.013065483421087265, + "num_input_tokens_seen": 53827912, + "step": 3287, + "train_runtime": 26711.3547, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 1.9927272727272727, + "grad_norm": 0.007627638056874275, + "learning_rate": 9.125458832313399e-05, + "loss": 0.012292512692511082, + "num_input_tokens_seen": 53844288, + "step": 3288, + "train_runtime": 26719.4688, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 1.9933333333333332, + "grad_norm": 0.006064161658287048, + "learning_rate": 9.124915445370617e-05, + "loss": 0.01392812468111515, + "num_input_tokens_seen": 53860664, + "step": 3289, + "train_runtime": 26727.5878, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 1.993939393939394, + "grad_norm": 0.007689913269132376, + "learning_rate": 9.124371905855244e-05, + "loss": 0.012879272922873497, + "num_input_tokens_seen": 53877040, + "step": 3290, + "train_runtime": 26735.7022, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 1.9945454545454546, + "grad_norm": 0.00797196477651596, + "learning_rate": 9.123828213787389e-05, + "loss": 0.013234483078122139, + "num_input_tokens_seen": 53893416, + "step": 3291, + "train_runtime": 26743.8161, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 1.9951515151515151, + "grad_norm": 0.007768760900944471, + "learning_rate": 9.123284369187157e-05, + "loss": 0.011896961368620396, + "num_input_tokens_seen": 53909792, + "step": 3292, + "train_runtime": 26751.9335, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 1.9957575757575756, + "grad_norm": 0.008472788147628307, + "learning_rate": 9.122740372074665e-05, + "loss": 0.012093445286154747, + "num_input_tokens_seen": 53926168, + "step": 3293, + "train_runtime": 26760.0478, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 1.9963636363636363, + "grad_norm": 0.013480499386787415, + "learning_rate": 9.122196222470036e-05, + "loss": 0.012600832618772984, + "num_input_tokens_seen": 53942544, + "step": 3294, + "train_runtime": 26768.1606, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 1.996969696969697, + "grad_norm": 0.018711727112531662, + "learning_rate": 9.121651920393399e-05, + "loss": 0.012485871091485023, + "num_input_tokens_seen": 53958920, + "step": 3295, + "train_runtime": 26776.2771, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 1.9975757575757576, + "grad_norm": 0.007433328777551651, + "learning_rate": 9.121107465864882e-05, + "loss": 0.01180267333984375, + "num_input_tokens_seen": 53975296, + "step": 3296, + "train_runtime": 26784.391, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 1.998181818181818, + "grad_norm": 0.0183311365544796, + "learning_rate": 9.120562858904624e-05, + "loss": 0.013096505776047707, + "num_input_tokens_seen": 53991672, + "step": 3297, + "train_runtime": 26792.504, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 1.9987878787878788, + "grad_norm": 0.010210291482508183, + "learning_rate": 9.120018099532773e-05, + "loss": 0.012559071183204651, + "num_input_tokens_seen": 54008048, + "step": 3298, + "train_runtime": 26800.6169, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 1.9993939393939395, + "grad_norm": 0.01029531005769968, + "learning_rate": 9.119473187769474e-05, + "loss": 0.011382916942238808, + "num_input_tokens_seen": 54024424, + "step": 3299, + "train_runtime": 26808.7324, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.0, + "grad_norm": 0.011802438646554947, + "learning_rate": 9.118928123634885e-05, + "loss": 0.011201423592865467, + "num_input_tokens_seen": 54040800, + "step": 3300, + "train_runtime": 26816.8424, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.0006060606060605, + "grad_norm": 0.007276556454598904, + "learning_rate": 9.118382907149165e-05, + "loss": 0.012055739760398865, + "num_input_tokens_seen": 54057176, + "step": 3301, + "train_runtime": 26825.9949, + "train_tokens_per_second": 2015.104 + }, + { + "epoch": 2.001212121212121, + "grad_norm": 0.010496939532458782, + "learning_rate": 9.117837538332481e-05, + "loss": 0.012779267504811287, + "num_input_tokens_seen": 54073552, + "step": 3302, + "train_runtime": 26834.1084, + "train_tokens_per_second": 2015.105 + }, + { + "epoch": 2.001818181818182, + "grad_norm": 0.008761009201407433, + "learning_rate": 9.117292017205007e-05, + "loss": 0.012465574778616428, + "num_input_tokens_seen": 54089928, + "step": 3303, + "train_runtime": 26842.2204, + "train_tokens_per_second": 2015.106 + }, + { + "epoch": 2.0024242424242424, + "grad_norm": 0.02535530924797058, + "learning_rate": 9.116746343786919e-05, + "loss": 0.013483214192092419, + "num_input_tokens_seen": 54106304, + "step": 3304, + "train_runtime": 26850.336, + "train_tokens_per_second": 2015.107 + }, + { + "epoch": 2.003030303030303, + "grad_norm": 0.009476523846387863, + "learning_rate": 9.1162005180984e-05, + "loss": 0.011629991233348846, + "num_input_tokens_seen": 54122680, + "step": 3305, + "train_runtime": 26858.4529, + "train_tokens_per_second": 2015.108 + }, + { + "epoch": 2.0036363636363634, + "grad_norm": 0.006627608090639114, + "learning_rate": 9.115654540159641e-05, + "loss": 0.012818768620491028, + "num_input_tokens_seen": 54139056, + "step": 3306, + "train_runtime": 26866.565, + "train_tokens_per_second": 2015.109 + }, + { + "epoch": 2.0042424242424244, + "grad_norm": 0.004400262143462896, + "learning_rate": 9.115108409990833e-05, + "loss": 0.01134046632796526, + "num_input_tokens_seen": 54155432, + "step": 3307, + "train_runtime": 26874.676, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 2.004848484848485, + "grad_norm": 0.0066299899481236935, + "learning_rate": 9.114562127612181e-05, + "loss": 0.011135777458548546, + "num_input_tokens_seen": 54171808, + "step": 3308, + "train_runtime": 26882.7915, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 2.0054545454545454, + "grad_norm": 0.02751355618238449, + "learning_rate": 9.11401569304389e-05, + "loss": 0.01189148798584938, + "num_input_tokens_seen": 54188184, + "step": 3309, + "train_runtime": 26890.9074, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 2.006060606060606, + "grad_norm": 0.008675160817801952, + "learning_rate": 9.113469106306167e-05, + "loss": 0.01138359121978283, + "num_input_tokens_seen": 54204560, + "step": 3310, + "train_runtime": 26899.0207, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 2.006666666666667, + "grad_norm": 0.018375243991613388, + "learning_rate": 9.112922367419234e-05, + "loss": 0.01198117621243, + "num_input_tokens_seen": 54220936, + "step": 3311, + "train_runtime": 26907.1346, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 2.0072727272727273, + "grad_norm": 0.01007237657904625, + "learning_rate": 9.112375476403312e-05, + "loss": 0.011535527184605598, + "num_input_tokens_seen": 54237312, + "step": 3312, + "train_runtime": 26915.2469, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 2.007878787878788, + "grad_norm": 0.010128041729331017, + "learning_rate": 9.111828433278628e-05, + "loss": 0.012508670799434185, + "num_input_tokens_seen": 54253688, + "step": 3313, + "train_runtime": 26923.3578, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 2.0084848484848483, + "grad_norm": 0.026225844398140907, + "learning_rate": 9.11128123806542e-05, + "loss": 0.011080056428909302, + "num_input_tokens_seen": 54270064, + "step": 3314, + "train_runtime": 26931.4726, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 2.0090909090909093, + "grad_norm": 0.00960629153996706, + "learning_rate": 9.110733890783925e-05, + "loss": 0.012581977993249893, + "num_input_tokens_seen": 54286440, + "step": 3315, + "train_runtime": 26939.5865, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 2.0096969696969698, + "grad_norm": 0.005847670137882233, + "learning_rate": 9.110186391454389e-05, + "loss": 0.011724804528057575, + "num_input_tokens_seen": 54302816, + "step": 3316, + "train_runtime": 26947.6966, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 2.0103030303030303, + "grad_norm": 0.026506055146455765, + "learning_rate": 9.109638740097062e-05, + "loss": 0.011649670079350471, + "num_input_tokens_seen": 54319192, + "step": 3317, + "train_runtime": 26955.8123, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 2.0109090909090908, + "grad_norm": 0.009331312030553818, + "learning_rate": 9.1090909367322e-05, + "loss": 0.01132029015570879, + "num_input_tokens_seen": 54335568, + "step": 3318, + "train_runtime": 26963.9355, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 2.0115151515151517, + "grad_norm": 0.007260499056428671, + "learning_rate": 9.108542981380067e-05, + "loss": 0.01217691320925951, + "num_input_tokens_seen": 54351944, + "step": 3319, + "train_runtime": 26972.0564, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 2.012121212121212, + "grad_norm": 0.00819828175008297, + "learning_rate": 9.10799487406093e-05, + "loss": 0.011852500028908253, + "num_input_tokens_seen": 54368320, + "step": 3320, + "train_runtime": 26980.1718, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 2.0127272727272727, + "grad_norm": 0.015189445577561855, + "learning_rate": 9.107446614795063e-05, + "loss": 0.013060184195637703, + "num_input_tokens_seen": 54384696, + "step": 3321, + "train_runtime": 26988.2907, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.013333333333333, + "grad_norm": 0.007456401828676462, + "learning_rate": 9.106898203602745e-05, + "loss": 0.012429913505911827, + "num_input_tokens_seen": 54401072, + "step": 3322, + "train_runtime": 26996.4101, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.013939393939394, + "grad_norm": 0.009956259280443192, + "learning_rate": 9.10634964050426e-05, + "loss": 0.011429233476519585, + "num_input_tokens_seen": 54417448, + "step": 3323, + "train_runtime": 27004.5332, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 2.0145454545454546, + "grad_norm": 0.024274544790387154, + "learning_rate": 9.105800925519898e-05, + "loss": 0.01382430363446474, + "num_input_tokens_seen": 54433824, + "step": 3324, + "train_runtime": 27012.6509, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 2.015151515151515, + "grad_norm": 0.007276281714439392, + "learning_rate": 9.105252058669957e-05, + "loss": 0.012992753647267818, + "num_input_tokens_seen": 54450200, + "step": 3325, + "train_runtime": 27020.7716, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.0157575757575756, + "grad_norm": 0.006913206540048122, + "learning_rate": 9.104703039974736e-05, + "loss": 0.012510275468230247, + "num_input_tokens_seen": 54466576, + "step": 3326, + "train_runtime": 27028.8892, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.0163636363636366, + "grad_norm": 0.011480524204671383, + "learning_rate": 9.104153869454543e-05, + "loss": 0.01283776294440031, + "num_input_tokens_seen": 54482952, + "step": 3327, + "train_runtime": 27037.0067, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.016969696969697, + "grad_norm": 0.006622251123189926, + "learning_rate": 9.10360454712969e-05, + "loss": 0.011710776016116142, + "num_input_tokens_seen": 54499328, + "step": 3328, + "train_runtime": 27045.1199, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.0175757575757576, + "grad_norm": 0.006461408920586109, + "learning_rate": 9.103055073020497e-05, + "loss": 0.011920584365725517, + "num_input_tokens_seen": 54515704, + "step": 3329, + "train_runtime": 27053.2408, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.018181818181818, + "grad_norm": 0.0003032787353731692, + "learning_rate": 9.102505447147287e-05, + "loss": 0.012296212837100029, + "num_input_tokens_seen": 54532080, + "step": 3330, + "train_runtime": 27061.3579, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.0187878787878786, + "grad_norm": 0.009282547980546951, + "learning_rate": 9.101955669530391e-05, + "loss": 0.012170149944722652, + "num_input_tokens_seen": 54548456, + "step": 3331, + "train_runtime": 27069.4762, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.0193939393939395, + "grad_norm": 0.007768985815346241, + "learning_rate": 9.101405740190141e-05, + "loss": 0.011895807459950447, + "num_input_tokens_seen": 54564832, + "step": 3332, + "train_runtime": 27077.5966, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.02, + "grad_norm": 0.024832775816321373, + "learning_rate": 9.10085565914688e-05, + "loss": 0.013282284140586853, + "num_input_tokens_seen": 54581208, + "step": 3333, + "train_runtime": 27085.7138, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.0206060606060605, + "grad_norm": 0.007753964047878981, + "learning_rate": 9.100305426420956e-05, + "loss": 0.012050673365592957, + "num_input_tokens_seen": 54597584, + "step": 3334, + "train_runtime": 27093.833, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.021212121212121, + "grad_norm": 0.04361976683139801, + "learning_rate": 9.099755042032718e-05, + "loss": 0.012513071298599243, + "num_input_tokens_seen": 54613960, + "step": 3335, + "train_runtime": 27101.9434, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.021818181818182, + "grad_norm": 0.010623808018863201, + "learning_rate": 9.099204506002525e-05, + "loss": 0.01084177102893591, + "num_input_tokens_seen": 54630336, + "step": 3336, + "train_runtime": 27110.0619, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.0224242424242425, + "grad_norm": 0.011434576474130154, + "learning_rate": 9.09865381835074e-05, + "loss": 0.012685502879321575, + "num_input_tokens_seen": 54646712, + "step": 3337, + "train_runtime": 27118.1774, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.023030303030303, + "grad_norm": 0.005738785490393639, + "learning_rate": 9.098102979097733e-05, + "loss": 0.011825205758213997, + "num_input_tokens_seen": 54663088, + "step": 3338, + "train_runtime": 27126.296, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.0236363636363635, + "grad_norm": 0.01584089733660221, + "learning_rate": 9.097551988263877e-05, + "loss": 0.013741337694227695, + "num_input_tokens_seen": 54679464, + "step": 3339, + "train_runtime": 27134.4219, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.0242424242424244, + "grad_norm": 0.0060920617543160915, + "learning_rate": 9.097000845869553e-05, + "loss": 0.012348243035376072, + "num_input_tokens_seen": 54695840, + "step": 3340, + "train_runtime": 27142.5426, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.024848484848485, + "grad_norm": 0.005727679468691349, + "learning_rate": 9.096449551935144e-05, + "loss": 0.011096533387899399, + "num_input_tokens_seen": 54712216, + "step": 3341, + "train_runtime": 27150.6608, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.0254545454545454, + "grad_norm": 0.0022178071085363626, + "learning_rate": 9.095898106481045e-05, + "loss": 0.011531295254826546, + "num_input_tokens_seen": 54728592, + "step": 3342, + "train_runtime": 27158.7798, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.026060606060606, + "grad_norm": 0.003265876555815339, + "learning_rate": 9.095346509527652e-05, + "loss": 0.012010122649371624, + "num_input_tokens_seen": 54744968, + "step": 3343, + "train_runtime": 27166.8861, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 0.00931676384061575, + "learning_rate": 9.094794761095366e-05, + "loss": 0.011602475307881832, + "num_input_tokens_seen": 54761344, + "step": 3344, + "train_runtime": 27174.9978, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.0272727272727273, + "grad_norm": 0.029080282896757126, + "learning_rate": 9.094242861204599e-05, + "loss": 0.012969661504030228, + "num_input_tokens_seen": 54777720, + "step": 3345, + "train_runtime": 27183.1125, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.027878787878788, + "grad_norm": 0.013951561413705349, + "learning_rate": 9.093690809875758e-05, + "loss": 0.013166350312530994, + "num_input_tokens_seen": 54794096, + "step": 3346, + "train_runtime": 27191.2325, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.0284848484848483, + "grad_norm": 0.006821845192462206, + "learning_rate": 9.093138607129268e-05, + "loss": 0.011342315934598446, + "num_input_tokens_seen": 54810472, + "step": 3347, + "train_runtime": 27199.35, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 2.0290909090909093, + "grad_norm": 0.010899233631789684, + "learning_rate": 9.092586252985551e-05, + "loss": 0.012500293552875519, + "num_input_tokens_seen": 54826848, + "step": 3348, + "train_runtime": 27207.4661, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.0296969696969698, + "grad_norm": 0.007849505171179771, + "learning_rate": 9.092033747465039e-05, + "loss": 0.012547525577247143, + "num_input_tokens_seen": 54843224, + "step": 3349, + "train_runtime": 27215.5858, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.0303030303030303, + "grad_norm": 0.007524041458964348, + "learning_rate": 9.091481090588166e-05, + "loss": 0.011828714981675148, + "num_input_tokens_seen": 54859600, + "step": 3350, + "train_runtime": 27223.7056, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.0309090909090908, + "grad_norm": 0.009752164594829082, + "learning_rate": 9.090928282375378e-05, + "loss": 0.011578820645809174, + "num_input_tokens_seen": 54875976, + "step": 3351, + "train_runtime": 27231.8332, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.0315151515151517, + "grad_norm": 0.06355761736631393, + "learning_rate": 9.090375322847118e-05, + "loss": 0.011825401335954666, + "num_input_tokens_seen": 54892352, + "step": 3352, + "train_runtime": 27239.9532, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.032121212121212, + "grad_norm": 0.007345013786107302, + "learning_rate": 9.089822212023839e-05, + "loss": 0.011034861207008362, + "num_input_tokens_seen": 54908728, + "step": 3353, + "train_runtime": 27248.0725, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.0327272727272727, + "grad_norm": 0.0073317899368703365, + "learning_rate": 9.089268949926004e-05, + "loss": 0.01266011968255043, + "num_input_tokens_seen": 54925104, + "step": 3354, + "train_runtime": 27256.1914, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.033333333333333, + "grad_norm": 0.0070832595229148865, + "learning_rate": 9.088715536574071e-05, + "loss": 0.011928196996450424, + "num_input_tokens_seen": 54941480, + "step": 3355, + "train_runtime": 27264.3078, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.033939393939394, + "grad_norm": 0.011500328779220581, + "learning_rate": 9.088161971988516e-05, + "loss": 0.011790191754698753, + "num_input_tokens_seen": 54957856, + "step": 3356, + "train_runtime": 27272.4332, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.0345454545454547, + "grad_norm": 0.00981878861784935, + "learning_rate": 9.087608256189808e-05, + "loss": 0.012370465323328972, + "num_input_tokens_seen": 54974232, + "step": 3357, + "train_runtime": 27280.5519, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.035151515151515, + "grad_norm": 0.011719790287315845, + "learning_rate": 9.087054389198432e-05, + "loss": 0.012797150760889053, + "num_input_tokens_seen": 54990608, + "step": 3358, + "train_runtime": 27288.6627, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.0357575757575757, + "grad_norm": 0.005948623642325401, + "learning_rate": 9.086500371034874e-05, + "loss": 0.012527494691312313, + "num_input_tokens_seen": 55006984, + "step": 3359, + "train_runtime": 27296.776, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.036363636363636, + "grad_norm": 0.009340680204331875, + "learning_rate": 9.085946201719625e-05, + "loss": 0.011543444357812405, + "num_input_tokens_seen": 55023360, + "step": 3360, + "train_runtime": 27304.8956, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.036969696969697, + "grad_norm": 0.007699036505073309, + "learning_rate": 9.085391881273182e-05, + "loss": 0.011673328466713428, + "num_input_tokens_seen": 55039736, + "step": 3361, + "train_runtime": 27313.0081, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.0375757575757576, + "grad_norm": 0.008742819540202618, + "learning_rate": 9.084837409716051e-05, + "loss": 0.012066803872585297, + "num_input_tokens_seen": 55056112, + "step": 3362, + "train_runtime": 27321.1237, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.038181818181818, + "grad_norm": 0.006048336159437895, + "learning_rate": 9.084282787068739e-05, + "loss": 0.012774009257555008, + "num_input_tokens_seen": 55072488, + "step": 3363, + "train_runtime": 27329.2452, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.0387878787878786, + "grad_norm": 0.007238797843456268, + "learning_rate": 9.083728013351758e-05, + "loss": 0.011799037456512451, + "num_input_tokens_seen": 55088864, + "step": 3364, + "train_runtime": 27337.3642, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.0393939393939395, + "grad_norm": 0.021580960601568222, + "learning_rate": 9.083173088585632e-05, + "loss": 0.011892813257873058, + "num_input_tokens_seen": 55105240, + "step": 3365, + "train_runtime": 27345.484, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.04, + "grad_norm": 0.009234655648469925, + "learning_rate": 9.082618012790886e-05, + "loss": 0.011318245902657509, + "num_input_tokens_seen": 55121616, + "step": 3366, + "train_runtime": 27353.6041, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.0406060606060605, + "grad_norm": 0.012906300835311413, + "learning_rate": 9.082062785988049e-05, + "loss": 0.012823051773011684, + "num_input_tokens_seen": 55137992, + "step": 3367, + "train_runtime": 27361.723, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.041212121212121, + "grad_norm": 0.0036449427716434, + "learning_rate": 9.08150740819766e-05, + "loss": 0.01129306573420763, + "num_input_tokens_seen": 55154368, + "step": 3368, + "train_runtime": 27369.8385, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.041818181818182, + "grad_norm": 0.012543014250695705, + "learning_rate": 9.08095187944026e-05, + "loss": 0.013440349139273167, + "num_input_tokens_seen": 55170744, + "step": 3369, + "train_runtime": 27377.953, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.0424242424242425, + "grad_norm": 0.0104695875197649, + "learning_rate": 9.080396199736396e-05, + "loss": 0.012156671844422817, + "num_input_tokens_seen": 55187120, + "step": 3370, + "train_runtime": 27386.0679, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.043030303030303, + "grad_norm": 0.0225234292447567, + "learning_rate": 9.079840369106625e-05, + "loss": 0.011554519645869732, + "num_input_tokens_seen": 55203496, + "step": 3371, + "train_runtime": 27394.1921, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.0436363636363635, + "grad_norm": 0.017583031207323074, + "learning_rate": 9.079284387571503e-05, + "loss": 0.012483416125178337, + "num_input_tokens_seen": 55219872, + "step": 3372, + "train_runtime": 27402.313, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.0442424242424244, + "grad_norm": 0.00885077379643917, + "learning_rate": 9.078728255151594e-05, + "loss": 0.011692331172525883, + "num_input_tokens_seen": 55236248, + "step": 3373, + "train_runtime": 27410.433, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.044848484848485, + "grad_norm": 0.004110273905098438, + "learning_rate": 9.078171971867471e-05, + "loss": 0.012116055004298687, + "num_input_tokens_seen": 55252624, + "step": 3374, + "train_runtime": 27418.5477, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.0454545454545454, + "grad_norm": 0.011093060486018658, + "learning_rate": 9.077615537739709e-05, + "loss": 0.01290032360702753, + "num_input_tokens_seen": 55269000, + "step": 3375, + "train_runtime": 27426.6586, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.046060606060606, + "grad_norm": 0.007847296074032784, + "learning_rate": 9.077058952788888e-05, + "loss": 0.013083033263683319, + "num_input_tokens_seen": 55285376, + "step": 3376, + "train_runtime": 27434.772, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.046666666666667, + "grad_norm": 0.006762146949768066, + "learning_rate": 9.076502217035597e-05, + "loss": 0.013601238839328289, + "num_input_tokens_seen": 55301752, + "step": 3377, + "train_runtime": 27442.8879, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.0472727272727274, + "grad_norm": 0.006784932222217321, + "learning_rate": 9.075945330500428e-05, + "loss": 0.01189483143389225, + "num_input_tokens_seen": 55318128, + "step": 3378, + "train_runtime": 27451.0075, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.047878787878788, + "grad_norm": 0.007549419533461332, + "learning_rate": 9.075388293203978e-05, + "loss": 0.01299357507377863, + "num_input_tokens_seen": 55334504, + "step": 3379, + "train_runtime": 27459.1348, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.0484848484848484, + "grad_norm": 0.006521687377244234, + "learning_rate": 9.074831105166852e-05, + "loss": 0.013047239743173122, + "num_input_tokens_seen": 55350880, + "step": 3380, + "train_runtime": 27467.2501, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.0490909090909093, + "grad_norm": 0.008688780479133129, + "learning_rate": 9.074273766409657e-05, + "loss": 0.013033932074904442, + "num_input_tokens_seen": 55367256, + "step": 3381, + "train_runtime": 27475.3662, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.04969696969697, + "grad_norm": 0.005272239912301302, + "learning_rate": 9.073716276953012e-05, + "loss": 0.013080219738185406, + "num_input_tokens_seen": 55383632, + "step": 3382, + "train_runtime": 27483.4813, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.0503030303030303, + "grad_norm": 0.008483667857944965, + "learning_rate": 9.073158636817535e-05, + "loss": 0.010860172100365162, + "num_input_tokens_seen": 55400008, + "step": 3383, + "train_runtime": 27491.6, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.050909090909091, + "grad_norm": 0.0050791422836482525, + "learning_rate": 9.07260084602385e-05, + "loss": 0.011222576722502708, + "num_input_tokens_seen": 55416384, + "step": 3384, + "train_runtime": 27499.717, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.0515151515151517, + "grad_norm": 0.00709482142701745, + "learning_rate": 9.072042904592593e-05, + "loss": 0.011624621227383614, + "num_input_tokens_seen": 55432760, + "step": 3385, + "train_runtime": 27507.8357, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.0521212121212122, + "grad_norm": 0.004702972248196602, + "learning_rate": 9.071484812544398e-05, + "loss": 0.011560735292732716, + "num_input_tokens_seen": 55449136, + "step": 3386, + "train_runtime": 27515.9538, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.0527272727272727, + "grad_norm": 0.008912491612136364, + "learning_rate": 9.070926569899909e-05, + "loss": 0.011354231275618076, + "num_input_tokens_seen": 55465512, + "step": 3387, + "train_runtime": 27524.0687, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.0533333333333332, + "grad_norm": 0.0077764480374753475, + "learning_rate": 9.070368176679774e-05, + "loss": 0.012655006721615791, + "num_input_tokens_seen": 55481888, + "step": 3388, + "train_runtime": 27532.1814, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.0539393939393937, + "grad_norm": 0.015883052721619606, + "learning_rate": 9.069809632904646e-05, + "loss": 0.012338697910308838, + "num_input_tokens_seen": 55498264, + "step": 3389, + "train_runtime": 27540.3008, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.0545454545454547, + "grad_norm": 0.01027887687087059, + "learning_rate": 9.069250938595185e-05, + "loss": 0.012151487171649933, + "num_input_tokens_seen": 55514640, + "step": 3390, + "train_runtime": 27548.4228, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.055151515151515, + "grad_norm": 0.0059898607432842255, + "learning_rate": 9.068692093772058e-05, + "loss": 0.011957871727645397, + "num_input_tokens_seen": 55531016, + "step": 3391, + "train_runtime": 27556.5422, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.0557575757575757, + "grad_norm": 0.01387920044362545, + "learning_rate": 9.068133098455932e-05, + "loss": 0.01215735636651516, + "num_input_tokens_seen": 55547392, + "step": 3392, + "train_runtime": 27564.6577, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.056363636363636, + "grad_norm": 0.006400204263627529, + "learning_rate": 9.067573952667486e-05, + "loss": 0.012007832527160645, + "num_input_tokens_seen": 55563768, + "step": 3393, + "train_runtime": 27572.7756, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.056969696969697, + "grad_norm": 0.009694559499621391, + "learning_rate": 9.067014656427401e-05, + "loss": 0.011804431676864624, + "num_input_tokens_seen": 55580144, + "step": 3394, + "train_runtime": 27580.8899, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.0575757575757576, + "grad_norm": 0.010956378653645515, + "learning_rate": 9.066455209756364e-05, + "loss": 0.012839428149163723, + "num_input_tokens_seen": 55596520, + "step": 3395, + "train_runtime": 27589.0075, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.058181818181818, + "grad_norm": 0.008832655847072601, + "learning_rate": 9.065895612675066e-05, + "loss": 0.011447408236563206, + "num_input_tokens_seen": 55612896, + "step": 3396, + "train_runtime": 27597.1214, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.0587878787878786, + "grad_norm": 0.013783660717308521, + "learning_rate": 9.06533586520421e-05, + "loss": 0.012833976186811924, + "num_input_tokens_seen": 55629272, + "step": 3397, + "train_runtime": 27605.2398, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.0593939393939396, + "grad_norm": 0.00604918971657753, + "learning_rate": 9.064775967364495e-05, + "loss": 0.010695607401430607, + "num_input_tokens_seen": 55645648, + "step": 3398, + "train_runtime": 27613.3528, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.06, + "grad_norm": 0.010326673276722431, + "learning_rate": 9.064215919176634e-05, + "loss": 0.01307748258113861, + "num_input_tokens_seen": 55662024, + "step": 3399, + "train_runtime": 27621.4707, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.0606060606060606, + "grad_norm": 0.00769348070025444, + "learning_rate": 9.06365572066134e-05, + "loss": 0.011743209324777126, + "num_input_tokens_seen": 55678400, + "step": 3400, + "train_runtime": 27629.5868, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.061212121212121, + "grad_norm": 0.019033854827284813, + "learning_rate": 9.063095371839337e-05, + "loss": 0.012079644948244095, + "num_input_tokens_seen": 55694776, + "step": 3401, + "train_runtime": 27638.5839, + "train_tokens_per_second": 2015.11 + }, + { + "epoch": 2.061818181818182, + "grad_norm": 0.008042428642511368, + "learning_rate": 9.062534872731346e-05, + "loss": 0.011539160273969173, + "num_input_tokens_seen": 55711152, + "step": 3402, + "train_runtime": 27646.693, + "train_tokens_per_second": 2015.111 + }, + { + "epoch": 2.0624242424242425, + "grad_norm": 0.007993514649569988, + "learning_rate": 9.061974223358101e-05, + "loss": 0.012027964927256107, + "num_input_tokens_seen": 55727528, + "step": 3403, + "train_runtime": 27654.8055, + "train_tokens_per_second": 2015.112 + }, + { + "epoch": 2.063030303030303, + "grad_norm": 0.0052061243914067745, + "learning_rate": 9.061413423740342e-05, + "loss": 0.011442933231592178, + "num_input_tokens_seen": 55743904, + "step": 3404, + "train_runtime": 27662.9148, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 2.0636363636363635, + "grad_norm": 0.010407760739326477, + "learning_rate": 9.060852473898808e-05, + "loss": 0.012942980974912643, + "num_input_tokens_seen": 55760280, + "step": 3405, + "train_runtime": 27671.0408, + "train_tokens_per_second": 2015.113 + }, + { + "epoch": 2.0642424242424244, + "grad_norm": 0.012654558755457401, + "learning_rate": 9.060291373854251e-05, + "loss": 0.013148204423487186, + "num_input_tokens_seen": 55776656, + "step": 3406, + "train_runtime": 27679.1555, + "train_tokens_per_second": 2015.114 + }, + { + "epoch": 2.064848484848485, + "grad_norm": 0.012586663477122784, + "learning_rate": 9.05973012362742e-05, + "loss": 0.012279224582016468, + "num_input_tokens_seen": 55793032, + "step": 3407, + "train_runtime": 27687.2688, + "train_tokens_per_second": 2015.115 + }, + { + "epoch": 2.0654545454545454, + "grad_norm": 0.011190484277904034, + "learning_rate": 9.059168723239081e-05, + "loss": 0.011963452212512493, + "num_input_tokens_seen": 55809408, + "step": 3408, + "train_runtime": 27695.3819, + "train_tokens_per_second": 2015.116 + }, + { + "epoch": 2.066060606060606, + "grad_norm": 0.009318447671830654, + "learning_rate": 9.058607172709994e-05, + "loss": 0.011981315910816193, + "num_input_tokens_seen": 55825784, + "step": 3409, + "train_runtime": 27703.4932, + "train_tokens_per_second": 2015.117 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 0.009665393270552158, + "learning_rate": 9.058045472060931e-05, + "loss": 0.011912458576261997, + "num_input_tokens_seen": 55842160, + "step": 3410, + "train_runtime": 27711.6106, + "train_tokens_per_second": 2015.118 + }, + { + "epoch": 2.0672727272727274, + "grad_norm": 0.005949472542852163, + "learning_rate": 9.057483621312671e-05, + "loss": 0.012273924425244331, + "num_input_tokens_seen": 55858536, + "step": 3411, + "train_runtime": 27719.7219, + "train_tokens_per_second": 2015.119 + }, + { + "epoch": 2.067878787878788, + "grad_norm": 0.008634460158646107, + "learning_rate": 9.056921620485992e-05, + "loss": 0.01283192541450262, + "num_input_tokens_seen": 55874912, + "step": 3412, + "train_runtime": 27727.8332, + "train_tokens_per_second": 2015.12 + }, + { + "epoch": 2.0684848484848484, + "grad_norm": 0.0065974947065114975, + "learning_rate": 9.056359469601683e-05, + "loss": 0.012668941169977188, + "num_input_tokens_seen": 55891288, + "step": 3413, + "train_runtime": 27735.9456, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 2.0690909090909093, + "grad_norm": 0.007294429000467062, + "learning_rate": 9.055797168680538e-05, + "loss": 0.01187070831656456, + "num_input_tokens_seen": 55907664, + "step": 3414, + "train_runtime": 27744.0589, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.06969696969697, + "grad_norm": 0.009185160510241985, + "learning_rate": 9.055234717743351e-05, + "loss": 0.012446683831512928, + "num_input_tokens_seen": 55924040, + "step": 3415, + "train_runtime": 27752.1797, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.0703030303030303, + "grad_norm": 0.006855017505586147, + "learning_rate": 9.054672116810932e-05, + "loss": 0.011901344172656536, + "num_input_tokens_seen": 55940416, + "step": 3416, + "train_runtime": 27760.2968, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 2.070909090909091, + "grad_norm": 0.011837942525744438, + "learning_rate": 9.054109365904085e-05, + "loss": 0.012001942843198776, + "num_input_tokens_seen": 55956792, + "step": 3417, + "train_runtime": 27768.4105, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.0715151515151513, + "grad_norm": 0.0068351225927472115, + "learning_rate": 9.053546465043629e-05, + "loss": 0.01226651668548584, + "num_input_tokens_seen": 55973168, + "step": 3418, + "train_runtime": 27776.5321, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.0721212121212123, + "grad_norm": 0.007168815471231937, + "learning_rate": 9.052983414250382e-05, + "loss": 0.01221819780766964, + "num_input_tokens_seen": 55989544, + "step": 3419, + "train_runtime": 27784.6461, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.0727272727272728, + "grad_norm": 0.0102562690153718, + "learning_rate": 9.052420213545172e-05, + "loss": 0.014302713796496391, + "num_input_tokens_seen": 56005920, + "step": 3420, + "train_runtime": 27792.7619, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.0733333333333333, + "grad_norm": 0.009133314713835716, + "learning_rate": 9.05185686294883e-05, + "loss": 0.012644648551940918, + "num_input_tokens_seen": 56022296, + "step": 3421, + "train_runtime": 27800.8763, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.0739393939393937, + "grad_norm": 0.0054561379365623, + "learning_rate": 9.051293362482193e-05, + "loss": 0.010259821079671383, + "num_input_tokens_seen": 56038672, + "step": 3422, + "train_runtime": 27808.987, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.0745454545454547, + "grad_norm": 0.015878645703196526, + "learning_rate": 9.050729712166105e-05, + "loss": 0.012173894792795181, + "num_input_tokens_seen": 56055048, + "step": 3423, + "train_runtime": 27817.1057, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.075151515151515, + "grad_norm": 0.007376739289611578, + "learning_rate": 9.050165912021413e-05, + "loss": 0.011926956474781036, + "num_input_tokens_seen": 56071424, + "step": 3424, + "train_runtime": 27825.2176, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.0757575757575757, + "grad_norm": 0.008722481317818165, + "learning_rate": 9.049601962068971e-05, + "loss": 0.010563036426901817, + "num_input_tokens_seen": 56087800, + "step": 3425, + "train_runtime": 27833.3333, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.076363636363636, + "grad_norm": 0.018847903236746788, + "learning_rate": 9.04903786232964e-05, + "loss": 0.012597802095115185, + "num_input_tokens_seen": 56104176, + "step": 3426, + "train_runtime": 27841.5441, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.076969696969697, + "grad_norm": 0.005797175224870443, + "learning_rate": 9.048473612824282e-05, + "loss": 0.012497548013925552, + "num_input_tokens_seen": 56120552, + "step": 3427, + "train_runtime": 27849.6592, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.0775757575757576, + "grad_norm": 0.010824366472661495, + "learning_rate": 9.047909213573769e-05, + "loss": 0.01156754419207573, + "num_input_tokens_seen": 56136928, + "step": 3428, + "train_runtime": 27857.7764, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.078181818181818, + "grad_norm": 0.007866271771490574, + "learning_rate": 9.047344664598978e-05, + "loss": 0.011103162541985512, + "num_input_tokens_seen": 56153304, + "step": 3429, + "train_runtime": 27865.8902, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.0787878787878786, + "grad_norm": 0.009463231079280376, + "learning_rate": 9.046779965920788e-05, + "loss": 0.012735102325677872, + "num_input_tokens_seen": 56169680, + "step": 3430, + "train_runtime": 27874.0089, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.0793939393939396, + "grad_norm": 0.013803756795823574, + "learning_rate": 9.04621511756009e-05, + "loss": 0.012847594916820526, + "num_input_tokens_seen": 56186056, + "step": 3431, + "train_runtime": 27882.1324, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.08, + "grad_norm": 0.00846054870635271, + "learning_rate": 9.045650119537774e-05, + "loss": 0.01081385649740696, + "num_input_tokens_seen": 56202432, + "step": 3432, + "train_runtime": 27890.2506, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.0806060606060606, + "grad_norm": 0.008346304297447205, + "learning_rate": 9.045084971874738e-05, + "loss": 0.012644726783037186, + "num_input_tokens_seen": 56218808, + "step": 3433, + "train_runtime": 27898.3623, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.081212121212121, + "grad_norm": 0.012914376333355904, + "learning_rate": 9.044519674591887e-05, + "loss": 0.012044238857924938, + "num_input_tokens_seen": 56235184, + "step": 3434, + "train_runtime": 27906.4762, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.081818181818182, + "grad_norm": 0.00625306461006403, + "learning_rate": 9.043954227710128e-05, + "loss": 0.009518924169242382, + "num_input_tokens_seen": 56251560, + "step": 3435, + "train_runtime": 27914.5917, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.0824242424242425, + "grad_norm": 0.00748471962288022, + "learning_rate": 9.04338863125038e-05, + "loss": 0.012188691645860672, + "num_input_tokens_seen": 56267936, + "step": 3436, + "train_runtime": 27922.7087, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.083030303030303, + "grad_norm": 0.0038603260181844234, + "learning_rate": 9.042822885233557e-05, + "loss": 0.011931288056075573, + "num_input_tokens_seen": 56284312, + "step": 3437, + "train_runtime": 27930.8323, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.0836363636363635, + "grad_norm": 0.005839409306645393, + "learning_rate": 9.04225698968059e-05, + "loss": 0.011982793919742107, + "num_input_tokens_seen": 56300688, + "step": 3438, + "train_runtime": 27938.9479, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.0842424242424245, + "grad_norm": 0.008254511281847954, + "learning_rate": 9.04169094461241e-05, + "loss": 0.011427072808146477, + "num_input_tokens_seen": 56317064, + "step": 3439, + "train_runtime": 27947.0623, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.084848484848485, + "grad_norm": 0.00821908749639988, + "learning_rate": 9.041124750049955e-05, + "loss": 0.012207668274641037, + "num_input_tokens_seen": 56333440, + "step": 3440, + "train_runtime": 27955.1756, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.0854545454545454, + "grad_norm": 0.021660784259438515, + "learning_rate": 9.040558406014161e-05, + "loss": 0.01299472339451313, + "num_input_tokens_seen": 56349816, + "step": 3441, + "train_runtime": 27963.2951, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.086060606060606, + "grad_norm": 0.007126884069293737, + "learning_rate": 9.039991912525983e-05, + "loss": 0.010887030512094498, + "num_input_tokens_seen": 56366192, + "step": 3442, + "train_runtime": 27971.4076, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.086666666666667, + "grad_norm": 0.0057904645800590515, + "learning_rate": 9.03942526960637e-05, + "loss": 0.011882147751748562, + "num_input_tokens_seen": 56382568, + "step": 3443, + "train_runtime": 27979.5347, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.0872727272727274, + "grad_norm": 0.014943573623895645, + "learning_rate": 9.038858477276282e-05, + "loss": 0.013633402064442635, + "num_input_tokens_seen": 56398944, + "step": 3444, + "train_runtime": 27987.649, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.087878787878788, + "grad_norm": 0.02003590203821659, + "learning_rate": 9.038291535556686e-05, + "loss": 0.011881126090884209, + "num_input_tokens_seen": 56415320, + "step": 3445, + "train_runtime": 27995.7668, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.0884848484848484, + "grad_norm": 0.005910848267376423, + "learning_rate": 9.03772444446855e-05, + "loss": 0.012205943465232849, + "num_input_tokens_seen": 56431696, + "step": 3446, + "train_runtime": 28003.8788, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 2.089090909090909, + "grad_norm": 0.005043689161539078, + "learning_rate": 9.037157204032848e-05, + "loss": 0.01205090619623661, + "num_input_tokens_seen": 56448072, + "step": 3447, + "train_runtime": 28011.9913, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.08969696969697, + "grad_norm": 0.012684042565524578, + "learning_rate": 9.036589814270565e-05, + "loss": 0.012548624537885189, + "num_input_tokens_seen": 56464448, + "step": 3448, + "train_runtime": 28020.1027, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.0903030303030303, + "grad_norm": 0.011274142190814018, + "learning_rate": 9.036022275202686e-05, + "loss": 0.01294254045933485, + "num_input_tokens_seen": 56480824, + "step": 3449, + "train_runtime": 28028.2168, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.090909090909091, + "grad_norm": 0.007525157183408737, + "learning_rate": 9.035454586850202e-05, + "loss": 0.012265852652490139, + "num_input_tokens_seen": 56497200, + "step": 3450, + "train_runtime": 28036.3356, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.0915151515151513, + "grad_norm": 0.007187213283032179, + "learning_rate": 9.034886749234111e-05, + "loss": 0.011971338652074337, + "num_input_tokens_seen": 56513576, + "step": 3451, + "train_runtime": 28044.4535, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.0921212121212123, + "grad_norm": 0.00701161241158843, + "learning_rate": 9.034318762375418e-05, + "loss": 0.012336109764873981, + "num_input_tokens_seen": 56529952, + "step": 3452, + "train_runtime": 28052.5663, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.0927272727272728, + "grad_norm": 0.009412666782736778, + "learning_rate": 9.03375062629513e-05, + "loss": 0.012127682566642761, + "num_input_tokens_seen": 56546328, + "step": 3453, + "train_runtime": 28060.6789, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.0933333333333333, + "grad_norm": 0.00752132898196578, + "learning_rate": 9.033182341014261e-05, + "loss": 0.011383445002138615, + "num_input_tokens_seen": 56562704, + "step": 3454, + "train_runtime": 28068.7885, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.0939393939393938, + "grad_norm": 0.010009681805968285, + "learning_rate": 9.032613906553833e-05, + "loss": 0.01266009733080864, + "num_input_tokens_seen": 56579080, + "step": 3455, + "train_runtime": 28076.9019, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.0945454545454547, + "grad_norm": 0.004542776383459568, + "learning_rate": 9.032045322934868e-05, + "loss": 0.013468949124217033, + "num_input_tokens_seen": 56595456, + "step": 3456, + "train_runtime": 28085.0148, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.095151515151515, + "grad_norm": 0.015904322266578674, + "learning_rate": 9.031476590178399e-05, + "loss": 0.012843945994973183, + "num_input_tokens_seen": 56611832, + "step": 3457, + "train_runtime": 28093.1335, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.0957575757575757, + "grad_norm": 0.007006669882684946, + "learning_rate": 9.030907708305463e-05, + "loss": 0.012989908456802368, + "num_input_tokens_seen": 56628208, + "step": 3458, + "train_runtime": 28101.2497, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.096363636363636, + "grad_norm": 0.004681939259171486, + "learning_rate": 9.0303386773371e-05, + "loss": 0.012774134986102581, + "num_input_tokens_seen": 56644584, + "step": 3459, + "train_runtime": 28109.3682, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.096969696969697, + "grad_norm": 0.004613004624843597, + "learning_rate": 9.029769497294358e-05, + "loss": 0.012811033055186272, + "num_input_tokens_seen": 56660960, + "step": 3460, + "train_runtime": 28117.4807, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.0975757575757576, + "grad_norm": 0.006610489450395107, + "learning_rate": 9.029200168198289e-05, + "loss": 0.012730253860354424, + "num_input_tokens_seen": 56677336, + "step": 3461, + "train_runtime": 28125.5904, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.098181818181818, + "grad_norm": 0.011558209545910358, + "learning_rate": 9.028630690069954e-05, + "loss": 0.013134753331542015, + "num_input_tokens_seen": 56693712, + "step": 3462, + "train_runtime": 28133.7036, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.0987878787878786, + "grad_norm": 0.013601330108940601, + "learning_rate": 9.028061062930414e-05, + "loss": 0.012436976656317711, + "num_input_tokens_seen": 56710088, + "step": 3463, + "train_runtime": 28141.8175, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.0993939393939396, + "grad_norm": 0.01037578471004963, + "learning_rate": 9.02749128680074e-05, + "loss": 0.013326249085366726, + "num_input_tokens_seen": 56726464, + "step": 3464, + "train_runtime": 28149.9323, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.1, + "grad_norm": 0.007674772758036852, + "learning_rate": 9.026921361702007e-05, + "loss": 0.011600622907280922, + "num_input_tokens_seen": 56742840, + "step": 3465, + "train_runtime": 28158.0456, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.1006060606060606, + "grad_norm": 0.014724268577992916, + "learning_rate": 9.026351287655294e-05, + "loss": 0.012907741591334343, + "num_input_tokens_seen": 56759216, + "step": 3466, + "train_runtime": 28166.1554, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.101212121212121, + "grad_norm": 0.00393960764631629, + "learning_rate": 9.025781064681687e-05, + "loss": 0.011136265471577644, + "num_input_tokens_seen": 56775592, + "step": 3467, + "train_runtime": 28174.2678, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.101818181818182, + "grad_norm": 0.01141727901995182, + "learning_rate": 9.02521069280228e-05, + "loss": 0.01337271649390459, + "num_input_tokens_seen": 56791968, + "step": 3468, + "train_runtime": 28182.3794, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.1024242424242425, + "grad_norm": 0.021951353177428246, + "learning_rate": 9.024640172038168e-05, + "loss": 0.012599781155586243, + "num_input_tokens_seen": 56808344, + "step": 3469, + "train_runtime": 28190.4956, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.103030303030303, + "grad_norm": 0.005962764844298363, + "learning_rate": 9.024069502410453e-05, + "loss": 0.01175682246685028, + "num_input_tokens_seen": 56824720, + "step": 3470, + "train_runtime": 28198.606, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.1036363636363635, + "grad_norm": 0.008232819847762585, + "learning_rate": 9.023498683940243e-05, + "loss": 0.01175486296415329, + "num_input_tokens_seen": 56841096, + "step": 3471, + "train_runtime": 28206.717, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.1042424242424245, + "grad_norm": 0.00799788348376751, + "learning_rate": 9.022927716648653e-05, + "loss": 0.013452235609292984, + "num_input_tokens_seen": 56857472, + "step": 3472, + "train_runtime": 28214.8326, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.104848484848485, + "grad_norm": 0.005621060729026794, + "learning_rate": 9.022356600556801e-05, + "loss": 0.011244012042880058, + "num_input_tokens_seen": 56873848, + "step": 3473, + "train_runtime": 28222.9449, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.1054545454545455, + "grad_norm": 0.005224290303885937, + "learning_rate": 9.021785335685813e-05, + "loss": 0.012814436107873917, + "num_input_tokens_seen": 56890224, + "step": 3474, + "train_runtime": 28231.0612, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.106060606060606, + "grad_norm": 0.010674373246729374, + "learning_rate": 9.021213922056815e-05, + "loss": 0.0114644356071949, + "num_input_tokens_seen": 56906600, + "step": 3475, + "train_runtime": 28239.1737, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.1066666666666665, + "grad_norm": 0.0121085736900568, + "learning_rate": 9.020642359690947e-05, + "loss": 0.012265300378203392, + "num_input_tokens_seen": 56922976, + "step": 3476, + "train_runtime": 28247.2869, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.1072727272727274, + "grad_norm": 0.009924935176968575, + "learning_rate": 9.020070648609347e-05, + "loss": 0.013388853520154953, + "num_input_tokens_seen": 56939352, + "step": 3477, + "train_runtime": 28255.3966, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.107878787878788, + "grad_norm": 0.005500199273228645, + "learning_rate": 9.019498788833161e-05, + "loss": 0.011340290307998657, + "num_input_tokens_seen": 56955728, + "step": 3478, + "train_runtime": 28263.5114, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.1084848484848484, + "grad_norm": 0.008234964683651924, + "learning_rate": 9.018926780383545e-05, + "loss": 0.012281153351068497, + "num_input_tokens_seen": 56972104, + "step": 3479, + "train_runtime": 28271.6223, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.109090909090909, + "grad_norm": 0.008555219508707523, + "learning_rate": 9.018354623281653e-05, + "loss": 0.012110285460948944, + "num_input_tokens_seen": 56988480, + "step": 3480, + "train_runtime": 28279.7386, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.10969696969697, + "grad_norm": 0.012038682587444782, + "learning_rate": 9.017782317548649e-05, + "loss": 0.013431099243462086, + "num_input_tokens_seen": 57004856, + "step": 3481, + "train_runtime": 28287.8525, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.1103030303030303, + "grad_norm": 0.010675543919205666, + "learning_rate": 9.0172098632057e-05, + "loss": 0.01298239640891552, + "num_input_tokens_seen": 57021232, + "step": 3482, + "train_runtime": 28295.966, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.110909090909091, + "grad_norm": 0.010105367749929428, + "learning_rate": 9.016637260273983e-05, + "loss": 0.012679103761911392, + "num_input_tokens_seen": 57037608, + "step": 3483, + "train_runtime": 28304.0783, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.1115151515151513, + "grad_norm": 0.006353117059916258, + "learning_rate": 9.016064508774675e-05, + "loss": 0.012139725498855114, + "num_input_tokens_seen": 57053984, + "step": 3484, + "train_runtime": 28312.1911, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.1121212121212123, + "grad_norm": 0.006390564609318972, + "learning_rate": 9.015491608728961e-05, + "loss": 0.012158969417214394, + "num_input_tokens_seen": 57070360, + "step": 3485, + "train_runtime": 28320.303, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.112727272727273, + "grad_norm": 0.006470012944191694, + "learning_rate": 9.014918560158035e-05, + "loss": 0.012223651632666588, + "num_input_tokens_seen": 57086736, + "step": 3486, + "train_runtime": 28328.4147, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.1133333333333333, + "grad_norm": 0.011723276227712631, + "learning_rate": 9.014345363083086e-05, + "loss": 0.012710933573544025, + "num_input_tokens_seen": 57103112, + "step": 3487, + "train_runtime": 28336.5331, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.113939393939394, + "grad_norm": 0.01011462602764368, + "learning_rate": 9.013772017525322e-05, + "loss": 0.011596627533435822, + "num_input_tokens_seen": 57119488, + "step": 3488, + "train_runtime": 28344.6508, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.1145454545454547, + "grad_norm": 0.010281038470566273, + "learning_rate": 9.013198523505948e-05, + "loss": 0.011673036962747574, + "num_input_tokens_seen": 57135864, + "step": 3489, + "train_runtime": 28352.7677, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.1151515151515152, + "grad_norm": 0.01556433830410242, + "learning_rate": 9.012624881046176e-05, + "loss": 0.0127674276009202, + "num_input_tokens_seen": 57152240, + "step": 3490, + "train_runtime": 28360.8807, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.1157575757575757, + "grad_norm": 0.0046404823660850525, + "learning_rate": 9.012051090167222e-05, + "loss": 0.012291817925870419, + "num_input_tokens_seen": 57168616, + "step": 3491, + "train_runtime": 28368.9922, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.1163636363636362, + "grad_norm": 0.010019625537097454, + "learning_rate": 9.011477150890313e-05, + "loss": 0.012228306382894516, + "num_input_tokens_seen": 57184992, + "step": 3492, + "train_runtime": 28377.1083, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.116969696969697, + "grad_norm": 0.006875072605907917, + "learning_rate": 9.010903063236675e-05, + "loss": 0.011554446071386337, + "num_input_tokens_seen": 57201368, + "step": 3493, + "train_runtime": 28385.2224, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.1175757575757577, + "grad_norm": 0.009701329283416271, + "learning_rate": 9.010328827227545e-05, + "loss": 0.012353150174021721, + "num_input_tokens_seen": 57217744, + "step": 3494, + "train_runtime": 28393.3452, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.118181818181818, + "grad_norm": 0.006121743004769087, + "learning_rate": 9.00975444288416e-05, + "loss": 0.011772389523684978, + "num_input_tokens_seen": 57234120, + "step": 3495, + "train_runtime": 28401.4597, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.1187878787878787, + "grad_norm": 0.00822470337152481, + "learning_rate": 9.009179910227768e-05, + "loss": 0.012995040975511074, + "num_input_tokens_seen": 57250496, + "step": 3496, + "train_runtime": 28409.5721, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.1193939393939396, + "grad_norm": 0.01121497992426157, + "learning_rate": 9.008605229279618e-05, + "loss": 0.012147591449320316, + "num_input_tokens_seen": 57266872, + "step": 3497, + "train_runtime": 28417.6857, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.12, + "grad_norm": 0.012420396320521832, + "learning_rate": 9.008030400060967e-05, + "loss": 0.01183453667908907, + "num_input_tokens_seen": 57283248, + "step": 3498, + "train_runtime": 28425.7985, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.1206060606060606, + "grad_norm": 0.009392550215125084, + "learning_rate": 9.007455422593077e-05, + "loss": 0.012206954881548882, + "num_input_tokens_seen": 57299624, + "step": 3499, + "train_runtime": 28433.9121, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.121212121212121, + "grad_norm": 0.011173112317919731, + "learning_rate": 9.006880296897215e-05, + "loss": 0.013555949553847313, + "num_input_tokens_seen": 57316000, + "step": 3500, + "train_runtime": 28442.0324, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.1218181818181816, + "grad_norm": 0.008368341252207756, + "learning_rate": 9.006305022994654e-05, + "loss": 0.013381442055106163, + "num_input_tokens_seen": 57332376, + "step": 3501, + "train_runtime": 28451.0449, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.1224242424242425, + "grad_norm": 0.004341749008744955, + "learning_rate": 9.005729600906671e-05, + "loss": 0.01216307282447815, + "num_input_tokens_seen": 57348752, + "step": 3502, + "train_runtime": 28459.1587, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.123030303030303, + "grad_norm": 0.00595821114256978, + "learning_rate": 9.005154030654553e-05, + "loss": 0.01141276303678751, + "num_input_tokens_seen": 57365128, + "step": 3503, + "train_runtime": 28467.2732, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.1236363636363635, + "grad_norm": 0.010522489435970783, + "learning_rate": 9.004578312259586e-05, + "loss": 0.014437702484428883, + "num_input_tokens_seen": 57381504, + "step": 3504, + "train_runtime": 28475.3897, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.124242424242424, + "grad_norm": 0.008232937194406986, + "learning_rate": 9.004002445743065e-05, + "loss": 0.011311432346701622, + "num_input_tokens_seen": 57397880, + "step": 3505, + "train_runtime": 28483.5045, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.124848484848485, + "grad_norm": 0.01173914410173893, + "learning_rate": 9.003426431126291e-05, + "loss": 0.011294864118099213, + "num_input_tokens_seen": 57414256, + "step": 3506, + "train_runtime": 28491.6177, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.1254545454545455, + "grad_norm": 0.011773685924708843, + "learning_rate": 9.002850268430572e-05, + "loss": 0.012857058085501194, + "num_input_tokens_seen": 57430632, + "step": 3507, + "train_runtime": 28499.7337, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.126060606060606, + "grad_norm": 0.0063151223585009575, + "learning_rate": 9.002273957677214e-05, + "loss": 0.011314822360873222, + "num_input_tokens_seen": 57447008, + "step": 3508, + "train_runtime": 28507.8479, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.1266666666666665, + "grad_norm": 0.010733768343925476, + "learning_rate": 9.001697498887537e-05, + "loss": 0.01355978474020958, + "num_input_tokens_seen": 57463384, + "step": 3509, + "train_runtime": 28515.9634, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.1272727272727274, + "grad_norm": 0.006729649379849434, + "learning_rate": 9.001120892082864e-05, + "loss": 0.012324851006269455, + "num_input_tokens_seen": 57479760, + "step": 3510, + "train_runtime": 28524.0773, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.127878787878788, + "grad_norm": 0.008031347766518593, + "learning_rate": 9.000544137284519e-05, + "loss": 0.012918076477944851, + "num_input_tokens_seen": 57496136, + "step": 3511, + "train_runtime": 28532.1947, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.1284848484848484, + "grad_norm": 0.00953582301735878, + "learning_rate": 8.999967234513838e-05, + "loss": 0.013045232743024826, + "num_input_tokens_seen": 57512512, + "step": 3512, + "train_runtime": 28540.3099, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.129090909090909, + "grad_norm": 0.005362562369555235, + "learning_rate": 8.999390183792159e-05, + "loss": 0.012626387178897858, + "num_input_tokens_seen": 57528888, + "step": 3513, + "train_runtime": 28548.4227, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.12969696969697, + "grad_norm": 0.008026414550840855, + "learning_rate": 8.998812985140825e-05, + "loss": 0.012113875709474087, + "num_input_tokens_seen": 57545264, + "step": 3514, + "train_runtime": 28556.5366, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.1303030303030304, + "grad_norm": 0.009107707999646664, + "learning_rate": 8.998235638581186e-05, + "loss": 0.012264639139175415, + "num_input_tokens_seen": 57561640, + "step": 3515, + "train_runtime": 28564.651, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.130909090909091, + "grad_norm": 0.008588920347392559, + "learning_rate": 8.997658144134598e-05, + "loss": 0.01172946859151125, + "num_input_tokens_seen": 57578016, + "step": 3516, + "train_runtime": 28572.7705, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.1315151515151514, + "grad_norm": 0.008900281973183155, + "learning_rate": 8.99708050182242e-05, + "loss": 0.013760112226009369, + "num_input_tokens_seen": 57594392, + "step": 3517, + "train_runtime": 28580.8907, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.1321212121212123, + "grad_norm": 0.009173940867185593, + "learning_rate": 8.996502711666016e-05, + "loss": 0.012618775479495525, + "num_input_tokens_seen": 57610768, + "step": 3518, + "train_runtime": 28589.0102, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.132727272727273, + "grad_norm": 0.004674053750932217, + "learning_rate": 8.995924773686761e-05, + "loss": 0.012420371174812317, + "num_input_tokens_seen": 57627144, + "step": 3519, + "train_runtime": 28597.1324, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 0.015587940812110901, + "learning_rate": 8.99534668790603e-05, + "loss": 0.01104898750782013, + "num_input_tokens_seen": 57643520, + "step": 3520, + "train_runtime": 28605.2483, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.133939393939394, + "grad_norm": 0.008624122478067875, + "learning_rate": 8.994768454345206e-05, + "loss": 0.011609626933932304, + "num_input_tokens_seen": 57659896, + "step": 3521, + "train_runtime": 28613.361, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 2.1345454545454547, + "grad_norm": 0.0049206861294806, + "learning_rate": 8.994190073025676e-05, + "loss": 0.011751390993595123, + "num_input_tokens_seen": 57676272, + "step": 3522, + "train_runtime": 28621.4759, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.1351515151515152, + "grad_norm": 0.01566511020064354, + "learning_rate": 8.993611543968835e-05, + "loss": 0.012831299565732479, + "num_input_tokens_seen": 57692648, + "step": 3523, + "train_runtime": 28629.5909, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.1357575757575757, + "grad_norm": 0.008315403945744038, + "learning_rate": 8.99303286719608e-05, + "loss": 0.012952431105077267, + "num_input_tokens_seen": 57709024, + "step": 3524, + "train_runtime": 28637.7062, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.1363636363636362, + "grad_norm": 0.009655642323195934, + "learning_rate": 8.992454042728813e-05, + "loss": 0.01324331946671009, + "num_input_tokens_seen": 57725400, + "step": 3525, + "train_runtime": 28645.8234, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.1369696969696967, + "grad_norm": 0.006624852307140827, + "learning_rate": 8.991875070588447e-05, + "loss": 0.01158294826745987, + "num_input_tokens_seen": 57741776, + "step": 3526, + "train_runtime": 28653.9459, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.1375757575757577, + "grad_norm": 0.015377560630440712, + "learning_rate": 8.991295950796397e-05, + "loss": 0.013609301298856735, + "num_input_tokens_seen": 57758152, + "step": 3527, + "train_runtime": 28662.0625, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.138181818181818, + "grad_norm": 0.0044901263900101185, + "learning_rate": 8.990716683374082e-05, + "loss": 0.010975447483360767, + "num_input_tokens_seen": 57774528, + "step": 3528, + "train_runtime": 28670.177, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.1387878787878787, + "grad_norm": 0.003714088350534439, + "learning_rate": 8.990137268342929e-05, + "loss": 0.012609384953975677, + "num_input_tokens_seen": 57790904, + "step": 3529, + "train_runtime": 28678.2931, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.1393939393939396, + "grad_norm": 0.007226061541587114, + "learning_rate": 8.989557705724367e-05, + "loss": 0.011608580127358437, + "num_input_tokens_seen": 57807280, + "step": 3530, + "train_runtime": 28686.4065, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.14, + "grad_norm": 0.01645052060484886, + "learning_rate": 8.988977995539837e-05, + "loss": 0.012488780543208122, + "num_input_tokens_seen": 57823656, + "step": 3531, + "train_runtime": 28694.5227, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.1406060606060606, + "grad_norm": 0.0056457314640283585, + "learning_rate": 8.988398137810777e-05, + "loss": 0.012707125395536423, + "num_input_tokens_seen": 57840032, + "step": 3532, + "train_runtime": 28702.6373, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.141212121212121, + "grad_norm": 0.011652040295302868, + "learning_rate": 8.987818132558639e-05, + "loss": 0.012485072016716003, + "num_input_tokens_seen": 57856408, + "step": 3533, + "train_runtime": 28710.7566, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.1418181818181816, + "grad_norm": 0.00488140806555748, + "learning_rate": 8.987237979804872e-05, + "loss": 0.011395161971449852, + "num_input_tokens_seen": 57872784, + "step": 3534, + "train_runtime": 28718.8711, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.1424242424242426, + "grad_norm": 0.006278018932789564, + "learning_rate": 8.986657679570938e-05, + "loss": 0.011353380978107452, + "num_input_tokens_seen": 57889160, + "step": 3535, + "train_runtime": 28726.9829, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.143030303030303, + "grad_norm": 0.012279496528208256, + "learning_rate": 8.9860772318783e-05, + "loss": 0.011512484401464462, + "num_input_tokens_seen": 57905536, + "step": 3536, + "train_runtime": 28735.0966, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.1436363636363636, + "grad_norm": 0.009725527837872505, + "learning_rate": 8.985496636748428e-05, + "loss": 0.012157324701547623, + "num_input_tokens_seen": 57921912, + "step": 3537, + "train_runtime": 28743.2101, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.144242424242424, + "grad_norm": 0.08072280138731003, + "learning_rate": 8.984915894202797e-05, + "loss": 0.01255282387137413, + "num_input_tokens_seen": 57938288, + "step": 3538, + "train_runtime": 28751.3233, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.144848484848485, + "grad_norm": 0.011321510188281536, + "learning_rate": 8.984335004262888e-05, + "loss": 0.012012355960905552, + "num_input_tokens_seen": 57954664, + "step": 3539, + "train_runtime": 28759.4365, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.1454545454545455, + "grad_norm": 0.015953831374645233, + "learning_rate": 8.983753966950185e-05, + "loss": 0.012593870982527733, + "num_input_tokens_seen": 57971040, + "step": 3540, + "train_runtime": 28767.5752, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.146060606060606, + "grad_norm": 0.0007118682260625064, + "learning_rate": 8.98317278228618e-05, + "loss": 0.010947332717478275, + "num_input_tokens_seen": 57987416, + "step": 3541, + "train_runtime": 28775.6895, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.1466666666666665, + "grad_norm": 0.007488769944757223, + "learning_rate": 8.982591450292372e-05, + "loss": 0.011720303446054459, + "num_input_tokens_seen": 58003792, + "step": 3542, + "train_runtime": 28783.8035, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.1472727272727274, + "grad_norm": 0.006082794163376093, + "learning_rate": 8.982009970990261e-05, + "loss": 0.011088498868048191, + "num_input_tokens_seen": 58020168, + "step": 3543, + "train_runtime": 28791.9189, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.147878787878788, + "grad_norm": 0.012090684846043587, + "learning_rate": 8.981428344401359e-05, + "loss": 0.012264646589756012, + "num_input_tokens_seen": 58036544, + "step": 3544, + "train_runtime": 28800.0347, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.1484848484848484, + "grad_norm": 0.006332604214549065, + "learning_rate": 8.980846570547172e-05, + "loss": 0.011855223216116428, + "num_input_tokens_seen": 58052920, + "step": 3545, + "train_runtime": 28808.1468, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.149090909090909, + "grad_norm": 0.00799900759011507, + "learning_rate": 8.980264649449225e-05, + "loss": 0.012117343954741955, + "num_input_tokens_seen": 58069296, + "step": 3546, + "train_runtime": 28816.2577, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.14969696969697, + "grad_norm": 0.006898942403495312, + "learning_rate": 8.979682581129038e-05, + "loss": 0.011957213282585144, + "num_input_tokens_seen": 58085672, + "step": 3547, + "train_runtime": 28824.3734, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.1503030303030304, + "grad_norm": 0.017603494226932526, + "learning_rate": 8.979100365608144e-05, + "loss": 0.012435558252036572, + "num_input_tokens_seen": 58102048, + "step": 3548, + "train_runtime": 28832.4891, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.150909090909091, + "grad_norm": 0.009896202012896538, + "learning_rate": 8.978518002908076e-05, + "loss": 0.01290203258395195, + "num_input_tokens_seen": 58118424, + "step": 3549, + "train_runtime": 28840.6049, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.1515151515151514, + "grad_norm": 0.0274839848279953, + "learning_rate": 8.977935493050375e-05, + "loss": 0.01113799400627613, + "num_input_tokens_seen": 58134800, + "step": 3550, + "train_runtime": 28848.7218, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.1521212121212123, + "grad_norm": 0.012439846992492676, + "learning_rate": 8.977352836056587e-05, + "loss": 0.013506392948329449, + "num_input_tokens_seen": 58151176, + "step": 3551, + "train_runtime": 28856.8415, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.152727272727273, + "grad_norm": 0.0051726060919463634, + "learning_rate": 8.976770031948263e-05, + "loss": 0.011873546056449413, + "num_input_tokens_seen": 58167552, + "step": 3552, + "train_runtime": 28864.9598, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.1533333333333333, + "grad_norm": 0.00895746424794197, + "learning_rate": 8.97618708074696e-05, + "loss": 0.012198572047054768, + "num_input_tokens_seen": 58183928, + "step": 3553, + "train_runtime": 28873.0733, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.153939393939394, + "grad_norm": 0.01121628936380148, + "learning_rate": 8.97560398247424e-05, + "loss": 0.012298112735152245, + "num_input_tokens_seen": 58200304, + "step": 3554, + "train_runtime": 28881.1851, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.1545454545454543, + "grad_norm": 0.007435557898133993, + "learning_rate": 8.975020737151669e-05, + "loss": 0.010877593420445919, + "num_input_tokens_seen": 58216680, + "step": 3555, + "train_runtime": 28889.2984, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.1551515151515153, + "grad_norm": 0.00940319150686264, + "learning_rate": 8.974437344800825e-05, + "loss": 0.012261370196938515, + "num_input_tokens_seen": 58233056, + "step": 3556, + "train_runtime": 28897.4122, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.1557575757575758, + "grad_norm": 0.0037964419461786747, + "learning_rate": 8.973853805443282e-05, + "loss": 0.011090653017163277, + "num_input_tokens_seen": 58249432, + "step": 3557, + "train_runtime": 28905.5327, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.1563636363636363, + "grad_norm": 0.007811735384166241, + "learning_rate": 8.973270119100625e-05, + "loss": 0.012524952180683613, + "num_input_tokens_seen": 58265808, + "step": 3558, + "train_runtime": 28913.6435, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.156969696969697, + "grad_norm": 0.008808665908873081, + "learning_rate": 8.972686285794445e-05, + "loss": 0.013249467127025127, + "num_input_tokens_seen": 58282184, + "step": 3559, + "train_runtime": 28921.7604, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.1575757575757577, + "grad_norm": 0.0032920176163315773, + "learning_rate": 8.972102305546334e-05, + "loss": 0.011471050791442394, + "num_input_tokens_seen": 58298560, + "step": 3560, + "train_runtime": 28929.8725, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.158181818181818, + "grad_norm": 0.008728522807359695, + "learning_rate": 8.971518178377895e-05, + "loss": 0.01315800566226244, + "num_input_tokens_seen": 58314936, + "step": 3561, + "train_runtime": 28937.9839, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.1587878787878787, + "grad_norm": 0.010329267010092735, + "learning_rate": 8.970933904310734e-05, + "loss": 0.012310674414038658, + "num_input_tokens_seen": 58331312, + "step": 3562, + "train_runtime": 28946.0971, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.159393939393939, + "grad_norm": 0.007029344793409109, + "learning_rate": 8.970349483366461e-05, + "loss": 0.011882564052939415, + "num_input_tokens_seen": 58347688, + "step": 3563, + "train_runtime": 28954.2127, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.16, + "grad_norm": 0.009087336249649525, + "learning_rate": 8.96976491556669e-05, + "loss": 0.013293171301484108, + "num_input_tokens_seen": 58364064, + "step": 3564, + "train_runtime": 28962.3346, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.1606060606060606, + "grad_norm": 0.005902925040572882, + "learning_rate": 8.969180200933047e-05, + "loss": 0.012850413098931313, + "num_input_tokens_seen": 58380440, + "step": 3565, + "train_runtime": 28970.451, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.161212121212121, + "grad_norm": 0.00861071515828371, + "learning_rate": 8.968595339487157e-05, + "loss": 0.012767734937369823, + "num_input_tokens_seen": 58396816, + "step": 3566, + "train_runtime": 28978.5649, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.1618181818181816, + "grad_norm": 0.006146362982690334, + "learning_rate": 8.968010331250656e-05, + "loss": 0.011280233040452003, + "num_input_tokens_seen": 58413192, + "step": 3567, + "train_runtime": 28986.6774, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.1624242424242426, + "grad_norm": 0.008694643154740334, + "learning_rate": 8.967425176245178e-05, + "loss": 0.010438431054353714, + "num_input_tokens_seen": 58429568, + "step": 3568, + "train_runtime": 28994.7878, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.163030303030303, + "grad_norm": 0.0005360030918382108, + "learning_rate": 8.966839874492371e-05, + "loss": 0.012298746034502983, + "num_input_tokens_seen": 58445944, + "step": 3569, + "train_runtime": 29002.9014, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.1636363636363636, + "grad_norm": 0.005812949035316706, + "learning_rate": 8.96625442601388e-05, + "loss": 0.011884803883731365, + "num_input_tokens_seen": 58462320, + "step": 3570, + "train_runtime": 29011.0144, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.164242424242424, + "grad_norm": 0.005894954781979322, + "learning_rate": 8.965668830831364e-05, + "loss": 0.013101841323077679, + "num_input_tokens_seen": 58478696, + "step": 3571, + "train_runtime": 29019.1364, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.164848484848485, + "grad_norm": 0.006240121088922024, + "learning_rate": 8.96508308896648e-05, + "loss": 0.013054500333964825, + "num_input_tokens_seen": 58495072, + "step": 3572, + "train_runtime": 29027.2486, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.1654545454545455, + "grad_norm": 0.007768931332975626, + "learning_rate": 8.964497200440894e-05, + "loss": 0.011665784753859043, + "num_input_tokens_seen": 58511448, + "step": 3573, + "train_runtime": 29035.3681, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.166060606060606, + "grad_norm": 0.008867294527590275, + "learning_rate": 8.963911165276275e-05, + "loss": 0.011086254380643368, + "num_input_tokens_seen": 58527824, + "step": 3574, + "train_runtime": 29043.4821, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 0.008764234371483326, + "learning_rate": 8.963324983494303e-05, + "loss": 0.012631715275347233, + "num_input_tokens_seen": 58544200, + "step": 3575, + "train_runtime": 29051.5973, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.1672727272727275, + "grad_norm": 0.01382221095263958, + "learning_rate": 8.962738655116658e-05, + "loss": 0.013242697343230247, + "num_input_tokens_seen": 58560576, + "step": 3576, + "train_runtime": 29059.7068, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.167878787878788, + "grad_norm": 0.019054502248764038, + "learning_rate": 8.962152180165028e-05, + "loss": 0.01292281411588192, + "num_input_tokens_seen": 58576952, + "step": 3577, + "train_runtime": 29067.8199, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.1684848484848485, + "grad_norm": 0.002178309252485633, + "learning_rate": 8.961565558661104e-05, + "loss": 0.011553612537682056, + "num_input_tokens_seen": 58593328, + "step": 3578, + "train_runtime": 29075.9338, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.169090909090909, + "grad_norm": 0.016054967418313026, + "learning_rate": 8.960978790626587e-05, + "loss": 0.011835544370114803, + "num_input_tokens_seen": 58609704, + "step": 3579, + "train_runtime": 29084.0467, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.16969696969697, + "grad_norm": 0.024970337748527527, + "learning_rate": 8.960391876083174e-05, + "loss": 0.012018397450447083, + "num_input_tokens_seen": 58626080, + "step": 3580, + "train_runtime": 29092.1608, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.1703030303030304, + "grad_norm": 0.007381356321275234, + "learning_rate": 8.959804815052582e-05, + "loss": 0.011703860014677048, + "num_input_tokens_seen": 58642456, + "step": 3581, + "train_runtime": 29100.2831, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.170909090909091, + "grad_norm": 0.008867908269166946, + "learning_rate": 8.959217607556519e-05, + "loss": 0.012440843507647514, + "num_input_tokens_seen": 58658832, + "step": 3582, + "train_runtime": 29108.4007, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.1715151515151514, + "grad_norm": 0.008855480700731277, + "learning_rate": 8.958630253616706e-05, + "loss": 0.01147475279867649, + "num_input_tokens_seen": 58675208, + "step": 3583, + "train_runtime": 29116.5152, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.172121212121212, + "grad_norm": 0.005014322232455015, + "learning_rate": 8.958042753254872e-05, + "loss": 0.011361206881701946, + "num_input_tokens_seen": 58691584, + "step": 3584, + "train_runtime": 29124.6325, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.172727272727273, + "grad_norm": 0.005862353835254908, + "learning_rate": 8.957455106492742e-05, + "loss": 0.012513482943177223, + "num_input_tokens_seen": 58707960, + "step": 3585, + "train_runtime": 29132.8568, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.1733333333333333, + "grad_norm": 0.010451595298945904, + "learning_rate": 8.956867313352056e-05, + "loss": 0.012588118202984333, + "num_input_tokens_seen": 58724336, + "step": 3586, + "train_runtime": 29140.9745, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.173939393939394, + "grad_norm": 0.007304052356630564, + "learning_rate": 8.956279373854552e-05, + "loss": 0.012194668874144554, + "num_input_tokens_seen": 58740712, + "step": 3587, + "train_runtime": 29149.0938, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.174545454545455, + "grad_norm": 0.0036269514821469784, + "learning_rate": 8.95569128802198e-05, + "loss": 0.011555613949894905, + "num_input_tokens_seen": 58757088, + "step": 3588, + "train_runtime": 29157.2151, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.1751515151515153, + "grad_norm": 0.0072369822300970554, + "learning_rate": 8.95510305587609e-05, + "loss": 0.010729311965405941, + "num_input_tokens_seen": 58773464, + "step": 3589, + "train_runtime": 29165.3345, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.175757575757576, + "grad_norm": 0.006867996882647276, + "learning_rate": 8.95451467743864e-05, + "loss": 0.012114688754081726, + "num_input_tokens_seen": 58789840, + "step": 3590, + "train_runtime": 29173.4604, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.1763636363636363, + "grad_norm": 0.0026318137533962727, + "learning_rate": 8.953926152731394e-05, + "loss": 0.012799869291484356, + "num_input_tokens_seen": 58806216, + "step": 3591, + "train_runtime": 29181.5691, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.1769696969696968, + "grad_norm": 0.006512695923447609, + "learning_rate": 8.953337481776119e-05, + "loss": 0.01212363876402378, + "num_input_tokens_seen": 58822592, + "step": 3592, + "train_runtime": 29189.6855, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.1775757575757577, + "grad_norm": 0.02227167598903179, + "learning_rate": 8.95274866459459e-05, + "loss": 0.012120643630623817, + "num_input_tokens_seen": 58838968, + "step": 3593, + "train_runtime": 29197.8002, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.178181818181818, + "grad_norm": 0.006759993266314268, + "learning_rate": 8.952159701208584e-05, + "loss": 0.012031888589262962, + "num_input_tokens_seen": 58855344, + "step": 3594, + "train_runtime": 29205.9148, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.1787878787878787, + "grad_norm": 0.014940389432013035, + "learning_rate": 8.951570591639889e-05, + "loss": 0.012228570878505707, + "num_input_tokens_seen": 58871720, + "step": 3595, + "train_runtime": 29214.036, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.179393939393939, + "grad_norm": 0.009364038705825806, + "learning_rate": 8.950981335910291e-05, + "loss": 0.012020081281661987, + "num_input_tokens_seen": 58888096, + "step": 3596, + "train_runtime": 29222.1557, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.18, + "grad_norm": 0.006528595462441444, + "learning_rate": 8.950391934041589e-05, + "loss": 0.012315641157329082, + "num_input_tokens_seen": 58904472, + "step": 3597, + "train_runtime": 29230.2688, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.1806060606060607, + "grad_norm": 0.01194236520677805, + "learning_rate": 8.949802386055581e-05, + "loss": 0.013185814023017883, + "num_input_tokens_seen": 58920848, + "step": 3598, + "train_runtime": 29238.3888, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.181212121212121, + "grad_norm": 0.008188650012016296, + "learning_rate": 8.949212691974077e-05, + "loss": 0.011144852265715599, + "num_input_tokens_seen": 58937224, + "step": 3599, + "train_runtime": 29246.5055, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.1818181818181817, + "grad_norm": 0.008416100405156612, + "learning_rate": 8.948622851818885e-05, + "loss": 0.012679114006459713, + "num_input_tokens_seen": 58953600, + "step": 3600, + "train_runtime": 29254.633, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.1824242424242426, + "grad_norm": 0.004923258442431688, + "learning_rate": 8.948032865611822e-05, + "loss": 0.01115406770259142, + "num_input_tokens_seen": 58969976, + "step": 3601, + "train_runtime": 29263.7377, + "train_tokens_per_second": 2015.121 + }, + { + "epoch": 2.183030303030303, + "grad_norm": 0.004624438937753439, + "learning_rate": 8.947442733374714e-05, + "loss": 0.011263374239206314, + "num_input_tokens_seen": 58986352, + "step": 3602, + "train_runtime": 29271.8564, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.1836363636363636, + "grad_norm": 0.0119725801050663, + "learning_rate": 8.946852455129384e-05, + "loss": 0.01155043113976717, + "num_input_tokens_seen": 59002728, + "step": 3603, + "train_runtime": 29279.9724, + "train_tokens_per_second": 2015.122 + }, + { + "epoch": 2.184242424242424, + "grad_norm": 0.01406806893646717, + "learning_rate": 8.94626203089767e-05, + "loss": 0.011634095571935177, + "num_input_tokens_seen": 59019104, + "step": 3604, + "train_runtime": 29288.0849, + "train_tokens_per_second": 2015.123 + }, + { + "epoch": 2.184848484848485, + "grad_norm": 0.0035006983671337366, + "learning_rate": 8.945671460701408e-05, + "loss": 0.011123578995466232, + "num_input_tokens_seen": 59035480, + "step": 3605, + "train_runtime": 29296.2014, + "train_tokens_per_second": 2015.124 + }, + { + "epoch": 2.1854545454545455, + "grad_norm": 0.0063458941876888275, + "learning_rate": 8.945080744562442e-05, + "loss": 0.012211378663778305, + "num_input_tokens_seen": 59051856, + "step": 3606, + "train_runtime": 29304.3199, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.186060606060606, + "grad_norm": 0.009857217781245708, + "learning_rate": 8.944489882502623e-05, + "loss": 0.012334001250565052, + "num_input_tokens_seen": 59068232, + "step": 3607, + "train_runtime": 29312.4364, + "train_tokens_per_second": 2015.125 + }, + { + "epoch": 2.1866666666666665, + "grad_norm": 0.013009733520448208, + "learning_rate": 8.943898874543803e-05, + "loss": 0.013556170277297497, + "num_input_tokens_seen": 59084608, + "step": 3608, + "train_runtime": 29320.5544, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.1872727272727275, + "grad_norm": 0.01466745138168335, + "learning_rate": 8.943307720707845e-05, + "loss": 0.011487782001495361, + "num_input_tokens_seen": 59100984, + "step": 3609, + "train_runtime": 29328.6739, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.187878787878788, + "grad_norm": 0.011151660233736038, + "learning_rate": 8.942716421016614e-05, + "loss": 0.012820694595575333, + "num_input_tokens_seen": 59117360, + "step": 3610, + "train_runtime": 29336.7895, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.1884848484848485, + "grad_norm": 0.004317841026932001, + "learning_rate": 8.942124975491981e-05, + "loss": 0.01183843333274126, + "num_input_tokens_seen": 59133736, + "step": 3611, + "train_runtime": 29344.9083, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.189090909090909, + "grad_norm": 0.014631208963692188, + "learning_rate": 8.941533384155822e-05, + "loss": 0.012403767555952072, + "num_input_tokens_seen": 59150112, + "step": 3612, + "train_runtime": 29353.0328, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.1896969696969695, + "grad_norm": 0.004952683579176664, + "learning_rate": 8.940941647030019e-05, + "loss": 0.01236814446747303, + "num_input_tokens_seen": 59166488, + "step": 3613, + "train_runtime": 29361.1816, + "train_tokens_per_second": 2015.126 + }, + { + "epoch": 2.1903030303030304, + "grad_norm": 0.009116880595684052, + "learning_rate": 8.940349764136457e-05, + "loss": 0.013562958687543869, + "num_input_tokens_seen": 59182864, + "step": 3614, + "train_runtime": 29369.301, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.190909090909091, + "grad_norm": 0.0036380335222929716, + "learning_rate": 8.939757735497034e-05, + "loss": 0.011842243373394012, + "num_input_tokens_seen": 59199240, + "step": 3615, + "train_runtime": 29377.4179, + "train_tokens_per_second": 2015.127 + }, + { + "epoch": 2.1915151515151514, + "grad_norm": 0.008418438956141472, + "learning_rate": 8.939165561133642e-05, + "loss": 0.011070848442614079, + "num_input_tokens_seen": 59215616, + "step": 3616, + "train_runtime": 29385.5329, + "train_tokens_per_second": 2015.128 + }, + { + "epoch": 2.192121212121212, + "grad_norm": 0.00455420883372426, + "learning_rate": 8.938573241068189e-05, + "loss": 0.01256749127060175, + "num_input_tokens_seen": 59231992, + "step": 3617, + "train_runtime": 29393.6487, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.192727272727273, + "grad_norm": 0.004541611764580011, + "learning_rate": 8.937980775322581e-05, + "loss": 0.010654402896761894, + "num_input_tokens_seen": 59248368, + "step": 3618, + "train_runtime": 29401.767, + "train_tokens_per_second": 2015.129 + }, + { + "epoch": 2.1933333333333334, + "grad_norm": 0.012203039601445198, + "learning_rate": 8.937388163918731e-05, + "loss": 0.012502270750701427, + "num_input_tokens_seen": 59264744, + "step": 3619, + "train_runtime": 29409.8869, + "train_tokens_per_second": 2015.13 + }, + { + "epoch": 2.193939393939394, + "grad_norm": 0.007232667412608862, + "learning_rate": 8.936795406878564e-05, + "loss": 0.011787505820393562, + "num_input_tokens_seen": 59281120, + "step": 3620, + "train_runtime": 29417.9999, + "train_tokens_per_second": 2015.131 + }, + { + "epoch": 2.1945454545454544, + "grad_norm": 0.011393684893846512, + "learning_rate": 8.936202504224e-05, + "loss": 0.013321079313755035, + "num_input_tokens_seen": 59297496, + "step": 3621, + "train_runtime": 29426.1152, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.1951515151515153, + "grad_norm": 0.01522404607385397, + "learning_rate": 8.93560945597697e-05, + "loss": 0.011272291652858257, + "num_input_tokens_seen": 59313872, + "step": 3622, + "train_runtime": 29434.2338, + "train_tokens_per_second": 2015.132 + }, + { + "epoch": 2.195757575757576, + "grad_norm": 0.006869960110634565, + "learning_rate": 8.935016262159412e-05, + "loss": 0.012107600457966328, + "num_input_tokens_seen": 59330248, + "step": 3623, + "train_runtime": 29442.3517, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.1963636363636363, + "grad_norm": 0.00808816310018301, + "learning_rate": 8.934422922793265e-05, + "loss": 0.011816064827144146, + "num_input_tokens_seen": 59346624, + "step": 3624, + "train_runtime": 29450.463, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.196969696969697, + "grad_norm": 0.0004978812648914754, + "learning_rate": 8.933829437900475e-05, + "loss": 0.011826543137431145, + "num_input_tokens_seen": 59363000, + "step": 3625, + "train_runtime": 29458.577, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.1975757575757577, + "grad_norm": 0.006791422143578529, + "learning_rate": 8.933235807502996e-05, + "loss": 0.012391680851578712, + "num_input_tokens_seen": 59379376, + "step": 3626, + "train_runtime": 29466.6925, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.1981818181818182, + "grad_norm": 0.0055072130635380745, + "learning_rate": 8.932642031622783e-05, + "loss": 0.011894084513187408, + "num_input_tokens_seen": 59395752, + "step": 3627, + "train_runtime": 29474.8059, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.1987878787878787, + "grad_norm": 0.008963002823293209, + "learning_rate": 8.9320481102818e-05, + "loss": 0.012350327335298061, + "num_input_tokens_seen": 59412128, + "step": 3628, + "train_runtime": 29482.9188, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.1993939393939392, + "grad_norm": 0.010187532752752304, + "learning_rate": 8.931454043502016e-05, + "loss": 0.011566300876438618, + "num_input_tokens_seen": 59428504, + "step": 3629, + "train_runtime": 29491.033, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.2, + "grad_norm": 0.014371749944984913, + "learning_rate": 8.930859831305401e-05, + "loss": 0.011745520867407322, + "num_input_tokens_seen": 59444880, + "step": 3630, + "train_runtime": 29499.148, + "train_tokens_per_second": 2015.139 + }, + { + "epoch": 2.2006060606060607, + "grad_norm": 0.004343628883361816, + "learning_rate": 8.930265473713938e-05, + "loss": 0.012125818058848381, + "num_input_tokens_seen": 59461256, + "step": 3631, + "train_runtime": 29507.262, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.201212121212121, + "grad_norm": 0.00850563496351242, + "learning_rate": 8.929670970749608e-05, + "loss": 0.012195354327559471, + "num_input_tokens_seen": 59477632, + "step": 3632, + "train_runtime": 29515.3773, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.2018181818181817, + "grad_norm": 0.01019760686904192, + "learning_rate": 8.929076322434402e-05, + "loss": 0.013076315633952618, + "num_input_tokens_seen": 59494008, + "step": 3633, + "train_runtime": 29523.4923, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.2024242424242426, + "grad_norm": 0.006664637941867113, + "learning_rate": 8.928481528790313e-05, + "loss": 0.012966278940439224, + "num_input_tokens_seen": 59510384, + "step": 3634, + "train_runtime": 29531.6111, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.203030303030303, + "grad_norm": 0.008315959945321083, + "learning_rate": 8.927886589839344e-05, + "loss": 0.012466199696063995, + "num_input_tokens_seen": 59526760, + "step": 3635, + "train_runtime": 29539.7326, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.2036363636363636, + "grad_norm": 0.006912038661539555, + "learning_rate": 8.9272915056035e-05, + "loss": 0.013427576050162315, + "num_input_tokens_seen": 59543136, + "step": 3636, + "train_runtime": 29547.8478, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.204242424242424, + "grad_norm": 0.006813558284193277, + "learning_rate": 8.92669627610479e-05, + "loss": 0.012301689945161343, + "num_input_tokens_seen": 59559512, + "step": 3637, + "train_runtime": 29555.9654, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.204848484848485, + "grad_norm": 0.0047012618742883205, + "learning_rate": 8.92610090136523e-05, + "loss": 0.012508450075984001, + "num_input_tokens_seen": 59575888, + "step": 3638, + "train_runtime": 29564.0806, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.2054545454545456, + "grad_norm": 0.006925490219146013, + "learning_rate": 8.925505381406845e-05, + "loss": 0.012498512864112854, + "num_input_tokens_seen": 59592264, + "step": 3639, + "train_runtime": 29572.1905, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.206060606060606, + "grad_norm": 0.005478878039866686, + "learning_rate": 8.924909716251661e-05, + "loss": 0.011745494790375233, + "num_input_tokens_seen": 59608640, + "step": 3640, + "train_runtime": 29580.3016, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.2066666666666666, + "grad_norm": 0.00699197594076395, + "learning_rate": 8.924313905921709e-05, + "loss": 0.012664221227169037, + "num_input_tokens_seen": 59625016, + "step": 3641, + "train_runtime": 29588.4179, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.207272727272727, + "grad_norm": 0.012702935375273228, + "learning_rate": 8.923717950439029e-05, + "loss": 0.013789419084787369, + "num_input_tokens_seen": 59641392, + "step": 3642, + "train_runtime": 29596.5348, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.207878787878788, + "grad_norm": 0.006319780368357897, + "learning_rate": 8.923121849825662e-05, + "loss": 0.012776189483702183, + "num_input_tokens_seen": 59657768, + "step": 3643, + "train_runtime": 29604.6513, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.2084848484848485, + "grad_norm": 0.0030518770217895508, + "learning_rate": 8.922525604103659e-05, + "loss": 0.011326993815600872, + "num_input_tokens_seen": 59674144, + "step": 3644, + "train_runtime": 29612.7676, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.209090909090909, + "grad_norm": 0.006767469458281994, + "learning_rate": 8.921929213295071e-05, + "loss": 0.012699131853878498, + "num_input_tokens_seen": 59690520, + "step": 3645, + "train_runtime": 29620.8821, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.2096969696969695, + "grad_norm": 0.005254995543509722, + "learning_rate": 8.921332677421961e-05, + "loss": 0.011081083677709103, + "num_input_tokens_seen": 59706896, + "step": 3646, + "train_runtime": 29628.9966, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.2103030303030304, + "grad_norm": 0.007556076627224684, + "learning_rate": 8.92073599650639e-05, + "loss": 0.013109242543578148, + "num_input_tokens_seen": 59723272, + "step": 3647, + "train_runtime": 29637.1096, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.210909090909091, + "grad_norm": 0.010643698275089264, + "learning_rate": 8.920139170570429e-05, + "loss": 0.011104393750429153, + "num_input_tokens_seen": 59739648, + "step": 3648, + "train_runtime": 29645.2332, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.2115151515151514, + "grad_norm": 0.006891094613820314, + "learning_rate": 8.919542199636158e-05, + "loss": 0.012059992179274559, + "num_input_tokens_seen": 59756024, + "step": 3649, + "train_runtime": 29653.3446, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.212121212121212, + "grad_norm": 0.005681055597960949, + "learning_rate": 8.91894508372565e-05, + "loss": 0.012363753281533718, + "num_input_tokens_seen": 59772400, + "step": 3650, + "train_runtime": 29661.4574, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.212727272727273, + "grad_norm": 0.006800381001085043, + "learning_rate": 8.918347822860997e-05, + "loss": 0.012068090960383415, + "num_input_tokens_seen": 59788776, + "step": 3651, + "train_runtime": 29669.5712, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.2133333333333334, + "grad_norm": 0.007353676483035088, + "learning_rate": 8.917750417064289e-05, + "loss": 0.012048767879605293, + "num_input_tokens_seen": 59805152, + "step": 3652, + "train_runtime": 29677.6856, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.213939393939394, + "grad_norm": 0.009312749840319157, + "learning_rate": 8.91715286635762e-05, + "loss": 0.01307837013155222, + "num_input_tokens_seen": 59821528, + "step": 3653, + "train_runtime": 29685.8003, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.2145454545454544, + "grad_norm": 0.005366871133446693, + "learning_rate": 8.916555170763099e-05, + "loss": 0.012519482523202896, + "num_input_tokens_seen": 59837904, + "step": 3654, + "train_runtime": 29693.9134, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.2151515151515153, + "grad_norm": 0.025069156661629677, + "learning_rate": 8.915957330302827e-05, + "loss": 0.013411092571914196, + "num_input_tokens_seen": 59854280, + "step": 3655, + "train_runtime": 29702.0319, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.215757575757576, + "grad_norm": 0.011153807863593102, + "learning_rate": 8.915359344998919e-05, + "loss": 0.013310923241078854, + "num_input_tokens_seen": 59870656, + "step": 3656, + "train_runtime": 29710.1442, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.2163636363636363, + "grad_norm": 0.008448402397334576, + "learning_rate": 8.914761214873493e-05, + "loss": 0.012956599704921246, + "num_input_tokens_seen": 59887032, + "step": 3657, + "train_runtime": 29718.2542, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.216969696969697, + "grad_norm": 0.005610155873000622, + "learning_rate": 8.914162939948676e-05, + "loss": 0.0120665542781353, + "num_input_tokens_seen": 59903408, + "step": 3658, + "train_runtime": 29726.3677, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.2175757575757578, + "grad_norm": 0.009017917327582836, + "learning_rate": 8.913564520246592e-05, + "loss": 0.010684678331017494, + "num_input_tokens_seen": 59919784, + "step": 3659, + "train_runtime": 29734.4873, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.2181818181818183, + "grad_norm": 0.0062376754358410835, + "learning_rate": 8.912965955789378e-05, + "loss": 0.012134547345340252, + "num_input_tokens_seen": 59936160, + "step": 3660, + "train_runtime": 29742.6015, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.2187878787878788, + "grad_norm": 0.02972797304391861, + "learning_rate": 8.912367246599175e-05, + "loss": 0.013637243770062923, + "num_input_tokens_seen": 59952536, + "step": 3661, + "train_runtime": 29750.7121, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.2193939393939393, + "grad_norm": 0.008331749588251114, + "learning_rate": 8.911768392698126e-05, + "loss": 0.011612921953201294, + "num_input_tokens_seen": 59968912, + "step": 3662, + "train_runtime": 29758.8326, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.22, + "grad_norm": 0.006937834434211254, + "learning_rate": 8.91116939410838e-05, + "loss": 0.011773437261581421, + "num_input_tokens_seen": 59985288, + "step": 3663, + "train_runtime": 29766.9488, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.2206060606060607, + "grad_norm": 0.011741766706109047, + "learning_rate": 8.910570250852097e-05, + "loss": 0.014320777729153633, + "num_input_tokens_seen": 60001664, + "step": 3664, + "train_runtime": 29775.063, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.221212121212121, + "grad_norm": 0.00781282875686884, + "learning_rate": 8.909970962951435e-05, + "loss": 0.011964559555053711, + "num_input_tokens_seen": 60018040, + "step": 3665, + "train_runtime": 29783.1785, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.2218181818181817, + "grad_norm": 0.007946248166263103, + "learning_rate": 8.909371530428561e-05, + "loss": 0.012657862156629562, + "num_input_tokens_seen": 60034416, + "step": 3666, + "train_runtime": 29791.2958, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.2224242424242426, + "grad_norm": 0.010118436068296432, + "learning_rate": 8.908771953305648e-05, + "loss": 0.012623686343431473, + "num_input_tokens_seen": 60050792, + "step": 3667, + "train_runtime": 29799.4144, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.223030303030303, + "grad_norm": 0.010533769614994526, + "learning_rate": 8.908172231604873e-05, + "loss": 0.012056194245815277, + "num_input_tokens_seen": 60067168, + "step": 3668, + "train_runtime": 29807.5331, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.2236363636363636, + "grad_norm": 0.030545897781848907, + "learning_rate": 8.907572365348416e-05, + "loss": 0.012916878797113895, + "num_input_tokens_seen": 60083544, + "step": 3669, + "train_runtime": 29815.6463, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.224242424242424, + "grad_norm": 0.005548179615288973, + "learning_rate": 8.906972354558469e-05, + "loss": 0.011496108956634998, + "num_input_tokens_seen": 60099920, + "step": 3670, + "train_runtime": 29823.7611, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.2248484848484846, + "grad_norm": 0.007247095461934805, + "learning_rate": 8.906372199257223e-05, + "loss": 0.01363338902592659, + "num_input_tokens_seen": 60116296, + "step": 3671, + "train_runtime": 29831.875, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.2254545454545456, + "grad_norm": 0.01965804398059845, + "learning_rate": 8.905771899466875e-05, + "loss": 0.01304022315889597, + "num_input_tokens_seen": 60132672, + "step": 3672, + "train_runtime": 29839.99, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.226060606060606, + "grad_norm": 0.014896688051521778, + "learning_rate": 8.905171455209631e-05, + "loss": 0.012952609919011593, + "num_input_tokens_seen": 60149048, + "step": 3673, + "train_runtime": 29848.1095, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.2266666666666666, + "grad_norm": 0.008357114158570766, + "learning_rate": 8.9045708665077e-05, + "loss": 0.011900687590241432, + "num_input_tokens_seen": 60165424, + "step": 3674, + "train_runtime": 29856.2329, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.227272727272727, + "grad_norm": 0.013724857941269875, + "learning_rate": 8.903970133383297e-05, + "loss": 0.011536635458469391, + "num_input_tokens_seen": 60181800, + "step": 3675, + "train_runtime": 29864.3458, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.227878787878788, + "grad_norm": 0.006729908287525177, + "learning_rate": 8.90336925585864e-05, + "loss": 0.012406526133418083, + "num_input_tokens_seen": 60198176, + "step": 3676, + "train_runtime": 29872.4534, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.2284848484848485, + "grad_norm": 0.00586884468793869, + "learning_rate": 8.902768233955958e-05, + "loss": 0.011684720404446125, + "num_input_tokens_seen": 60214552, + "step": 3677, + "train_runtime": 29880.565, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.229090909090909, + "grad_norm": 0.00459505058825016, + "learning_rate": 8.902167067697477e-05, + "loss": 0.012015356682240963, + "num_input_tokens_seen": 60230928, + "step": 3678, + "train_runtime": 29888.6788, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.2296969696969695, + "grad_norm": 0.006937180645763874, + "learning_rate": 8.901565757105437e-05, + "loss": 0.012867008335888386, + "num_input_tokens_seen": 60247304, + "step": 3679, + "train_runtime": 29896.7935, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.2303030303030305, + "grad_norm": 0.008642815984785557, + "learning_rate": 8.900964302202078e-05, + "loss": 0.012027869001030922, + "num_input_tokens_seen": 60263680, + "step": 3680, + "train_runtime": 29904.9075, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.230909090909091, + "grad_norm": 0.009581885300576687, + "learning_rate": 8.900362703009644e-05, + "loss": 0.012776635587215424, + "num_input_tokens_seen": 60280056, + "step": 3681, + "train_runtime": 29913.0217, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.2315151515151515, + "grad_norm": 0.01875944249331951, + "learning_rate": 8.899760959550389e-05, + "loss": 0.013734135776758194, + "num_input_tokens_seen": 60296432, + "step": 3682, + "train_runtime": 29921.1351, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.232121212121212, + "grad_norm": 0.008612217381596565, + "learning_rate": 8.899159071846575e-05, + "loss": 0.012646391056478024, + "num_input_tokens_seen": 60312808, + "step": 3683, + "train_runtime": 29929.2454, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.232727272727273, + "grad_norm": 0.011928489431738853, + "learning_rate": 8.898557039920457e-05, + "loss": 0.011622844263911247, + "num_input_tokens_seen": 60329184, + "step": 3684, + "train_runtime": 29937.3581, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.2333333333333334, + "grad_norm": 0.011898646131157875, + "learning_rate": 8.897954863794305e-05, + "loss": 0.010517679154872894, + "num_input_tokens_seen": 60345560, + "step": 3685, + "train_runtime": 29945.469, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.233939393939394, + "grad_norm": 0.011300486512482166, + "learning_rate": 8.897352543490395e-05, + "loss": 0.013875912874937057, + "num_input_tokens_seen": 60361936, + "step": 3686, + "train_runtime": 29953.5902, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.2345454545454544, + "grad_norm": 0.0071717859245836735, + "learning_rate": 8.896750079031005e-05, + "loss": 0.011511060409247875, + "num_input_tokens_seen": 60378312, + "step": 3687, + "train_runtime": 29961.7012, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.2351515151515153, + "grad_norm": 0.005728852469474077, + "learning_rate": 8.896147470438416e-05, + "loss": 0.012795530259609222, + "num_input_tokens_seen": 60394688, + "step": 3688, + "train_runtime": 29969.8163, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.235757575757576, + "grad_norm": 0.004077422432601452, + "learning_rate": 8.89554471773492e-05, + "loss": 0.01207432895898819, + "num_input_tokens_seen": 60411064, + "step": 3689, + "train_runtime": 29977.9329, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.2363636363636363, + "grad_norm": 0.010106992907822132, + "learning_rate": 8.894941820942813e-05, + "loss": 0.012829555198550224, + "num_input_tokens_seen": 60427440, + "step": 3690, + "train_runtime": 29986.0439, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.236969696969697, + "grad_norm": 0.011969654820859432, + "learning_rate": 8.894338780084392e-05, + "loss": 0.010625853203237057, + "num_input_tokens_seen": 60443816, + "step": 3691, + "train_runtime": 29994.1554, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.2375757575757578, + "grad_norm": 0.011020442470908165, + "learning_rate": 8.893735595181962e-05, + "loss": 0.012517699040472507, + "num_input_tokens_seen": 60460192, + "step": 3692, + "train_runtime": 30002.2734, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.2381818181818183, + "grad_norm": 0.009888879954814911, + "learning_rate": 8.893132266257837e-05, + "loss": 0.011072501540184021, + "num_input_tokens_seen": 60476568, + "step": 3693, + "train_runtime": 30010.3922, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.2387878787878788, + "grad_norm": 0.0004854231374338269, + "learning_rate": 8.89252879333433e-05, + "loss": 0.01135720033198595, + "num_input_tokens_seen": 60492944, + "step": 3694, + "train_runtime": 30018.5065, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.2393939393939393, + "grad_norm": 0.0065515311434865, + "learning_rate": 8.891925176433764e-05, + "loss": 0.011831994168460369, + "num_input_tokens_seen": 60509320, + "step": 3695, + "train_runtime": 30026.6188, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.24, + "grad_norm": 0.006119894795119762, + "learning_rate": 8.891321415578464e-05, + "loss": 0.011481634341180325, + "num_input_tokens_seen": 60525696, + "step": 3696, + "train_runtime": 30034.7353, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.2406060606060607, + "grad_norm": 0.006571260746568441, + "learning_rate": 8.890717510790763e-05, + "loss": 0.013411670923233032, + "num_input_tokens_seen": 60542072, + "step": 3697, + "train_runtime": 30042.8507, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.241212121212121, + "grad_norm": 0.008368059061467648, + "learning_rate": 8.890113462093e-05, + "loss": 0.011599770747125149, + "num_input_tokens_seen": 60558448, + "step": 3698, + "train_runtime": 30050.965, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.2418181818181817, + "grad_norm": 0.0047463481314480305, + "learning_rate": 8.889509269507514e-05, + "loss": 0.010899157263338566, + "num_input_tokens_seen": 60574824, + "step": 3699, + "train_runtime": 30059.0815, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.242424242424242, + "grad_norm": 0.007197657600045204, + "learning_rate": 8.888904933056654e-05, + "loss": 0.012830094434320927, + "num_input_tokens_seen": 60591200, + "step": 3700, + "train_runtime": 30067.1921, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.243030303030303, + "grad_norm": 0.004900816362351179, + "learning_rate": 8.888300452762774e-05, + "loss": 0.011792563833296299, + "num_input_tokens_seen": 60607576, + "step": 3701, + "train_runtime": 30076.1536, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.2436363636363637, + "grad_norm": 0.008804868906736374, + "learning_rate": 8.887695828648232e-05, + "loss": 0.011721835471689701, + "num_input_tokens_seen": 60623952, + "step": 3702, + "train_runtime": 30084.2656, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.244242424242424, + "grad_norm": 0.010449771769344807, + "learning_rate": 8.887091060735395e-05, + "loss": 0.0124953743070364, + "num_input_tokens_seen": 60640328, + "step": 3703, + "train_runtime": 30092.4732, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.2448484848484846, + "grad_norm": 0.008419829420745373, + "learning_rate": 8.886486149046627e-05, + "loss": 0.01311418879777193, + "num_input_tokens_seen": 60656704, + "step": 3704, + "train_runtime": 30100.593, + "train_tokens_per_second": 2015.133 + }, + { + "epoch": 2.2454545454545456, + "grad_norm": 0.0049407086335122585, + "learning_rate": 8.885881093604306e-05, + "loss": 0.012327872216701508, + "num_input_tokens_seen": 60673080, + "step": 3705, + "train_runtime": 30108.7078, + "train_tokens_per_second": 2015.134 + }, + { + "epoch": 2.246060606060606, + "grad_norm": 0.004161890130490065, + "learning_rate": 8.88527589443081e-05, + "loss": 0.011168462224304676, + "num_input_tokens_seen": 60689456, + "step": 3706, + "train_runtime": 30116.8229, + "train_tokens_per_second": 2015.135 + }, + { + "epoch": 2.2466666666666666, + "grad_norm": 0.004563709255307913, + "learning_rate": 8.884670551548525e-05, + "loss": 0.012025438249111176, + "num_input_tokens_seen": 60705832, + "step": 3707, + "train_runtime": 30124.9366, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.247272727272727, + "grad_norm": 0.009686720557510853, + "learning_rate": 8.884065064979841e-05, + "loss": 0.012253142893314362, + "num_input_tokens_seen": 60722208, + "step": 3708, + "train_runtime": 30133.0553, + "train_tokens_per_second": 2015.136 + }, + { + "epoch": 2.247878787878788, + "grad_norm": 0.01068910863250494, + "learning_rate": 8.883459434747154e-05, + "loss": 0.012575153261423111, + "num_input_tokens_seen": 60738584, + "step": 3709, + "train_runtime": 30141.1687, + "train_tokens_per_second": 2015.137 + }, + { + "epoch": 2.2484848484848485, + "grad_norm": 0.006846324075013399, + "learning_rate": 8.882853660872867e-05, + "loss": 0.012148548848927021, + "num_input_tokens_seen": 60754960, + "step": 3710, + "train_runtime": 30149.287, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.249090909090909, + "grad_norm": 0.0058296844363212585, + "learning_rate": 8.882247743379383e-05, + "loss": 0.013228103518486023, + "num_input_tokens_seen": 60771336, + "step": 3711, + "train_runtime": 30157.3998, + "train_tokens_per_second": 2015.138 + }, + { + "epoch": 2.2496969696969695, + "grad_norm": 0.005055832210928202, + "learning_rate": 8.881641682289117e-05, + "loss": 0.01328389160335064, + "num_input_tokens_seen": 60787712, + "step": 3712, + "train_runtime": 30165.5105, + "train_tokens_per_second": 2015.14 + }, + { + "epoch": 2.2503030303030305, + "grad_norm": 0.006351431831717491, + "learning_rate": 8.881035477624483e-05, + "loss": 0.011203351430594921, + "num_input_tokens_seen": 60804088, + "step": 3713, + "train_runtime": 30173.6214, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.250909090909091, + "grad_norm": 0.005891186185181141, + "learning_rate": 8.880429129407904e-05, + "loss": 0.012171884998679161, + "num_input_tokens_seen": 60820464, + "step": 3714, + "train_runtime": 30181.7337, + "train_tokens_per_second": 2015.141 + }, + { + "epoch": 2.2515151515151515, + "grad_norm": 0.010121798142790794, + "learning_rate": 8.879822637661809e-05, + "loss": 0.011959838680922985, + "num_input_tokens_seen": 60836840, + "step": 3715, + "train_runtime": 30189.8449, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.252121212121212, + "grad_norm": 0.008183280937373638, + "learning_rate": 8.879216002408631e-05, + "loss": 0.013505983166396618, + "num_input_tokens_seen": 60853216, + "step": 3716, + "train_runtime": 30197.9576, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.252727272727273, + "grad_norm": 0.006862631533294916, + "learning_rate": 8.878609223670806e-05, + "loss": 0.012125739827752113, + "num_input_tokens_seen": 60869592, + "step": 3717, + "train_runtime": 30206.0714, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.2533333333333334, + "grad_norm": 0.007013807073235512, + "learning_rate": 8.87800230147078e-05, + "loss": 0.012417798861861229, + "num_input_tokens_seen": 60885968, + "step": 3718, + "train_runtime": 30214.1929, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.253939393939394, + "grad_norm": 0.0071312859654426575, + "learning_rate": 8.877395235831001e-05, + "loss": 0.012290849350392818, + "num_input_tokens_seen": 60902344, + "step": 3719, + "train_runtime": 30222.3058, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.2545454545454544, + "grad_norm": 0.009357710368931293, + "learning_rate": 8.876788026773922e-05, + "loss": 0.01159263588488102, + "num_input_tokens_seen": 60918720, + "step": 3720, + "train_runtime": 30230.4202, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.255151515151515, + "grad_norm": 0.016083406284451485, + "learning_rate": 8.876180674322005e-05, + "loss": 0.011838029138743877, + "num_input_tokens_seen": 60935096, + "step": 3721, + "train_runtime": 30238.5359, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.255757575757576, + "grad_norm": 0.0025645827408879995, + "learning_rate": 8.875573178497714e-05, + "loss": 0.010505922138690948, + "num_input_tokens_seen": 60951472, + "step": 3722, + "train_runtime": 30246.6542, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.2563636363636363, + "grad_norm": 0.005660255905240774, + "learning_rate": 8.874965539323517e-05, + "loss": 0.012189293280243874, + "num_input_tokens_seen": 60967848, + "step": 3723, + "train_runtime": 30254.7693, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.256969696969697, + "grad_norm": 0.010649233125150204, + "learning_rate": 8.87435775682189e-05, + "loss": 0.012099821120500565, + "num_input_tokens_seen": 60984224, + "step": 3724, + "train_runtime": 30262.8821, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.257575757575758, + "grad_norm": 0.005941980052739382, + "learning_rate": 8.873749831015315e-05, + "loss": 0.011310269124805927, + "num_input_tokens_seen": 61000600, + "step": 3725, + "train_runtime": 30270.9994, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.2581818181818183, + "grad_norm": 0.010598689317703247, + "learning_rate": 8.87314176192628e-05, + "loss": 0.012694881297647953, + "num_input_tokens_seen": 61016976, + "step": 3726, + "train_runtime": 30279.1172, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.258787878787879, + "grad_norm": 0.01025476586073637, + "learning_rate": 8.872533549577271e-05, + "loss": 0.012136287987232208, + "num_input_tokens_seen": 61033352, + "step": 3727, + "train_runtime": 30287.2334, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.2593939393939393, + "grad_norm": 0.006790067069232464, + "learning_rate": 8.871925193990789e-05, + "loss": 0.013514727354049683, + "num_input_tokens_seen": 61049728, + "step": 3728, + "train_runtime": 30295.3472, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.26, + "grad_norm": 0.007537974044680595, + "learning_rate": 8.871316695189334e-05, + "loss": 0.012649727053940296, + "num_input_tokens_seen": 61066104, + "step": 3729, + "train_runtime": 30303.4656, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.2606060606060607, + "grad_norm": 0.0073478384874761105, + "learning_rate": 8.870708053195413e-05, + "loss": 0.011982829309999943, + "num_input_tokens_seen": 61082480, + "step": 3730, + "train_runtime": 30311.5858, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.2612121212121212, + "grad_norm": 0.028094328939914703, + "learning_rate": 8.87009926803154e-05, + "loss": 0.012677352875471115, + "num_input_tokens_seen": 61098856, + "step": 3731, + "train_runtime": 30319.7025, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.2618181818181817, + "grad_norm": 0.011618182994425297, + "learning_rate": 8.86949033972023e-05, + "loss": 0.012250116094946861, + "num_input_tokens_seen": 61115232, + "step": 3732, + "train_runtime": 30327.8211, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.2624242424242427, + "grad_norm": 0.010365051217377186, + "learning_rate": 8.868881268284008e-05, + "loss": 0.011696823872625828, + "num_input_tokens_seen": 61131608, + "step": 3733, + "train_runtime": 30335.9403, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.263030303030303, + "grad_norm": 0.006496044807136059, + "learning_rate": 8.868272053745403e-05, + "loss": 0.011718837544322014, + "num_input_tokens_seen": 61147984, + "step": 3734, + "train_runtime": 30344.0549, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.2636363636363637, + "grad_norm": 0.005344794597476721, + "learning_rate": 8.867662696126948e-05, + "loss": 0.010989891365170479, + "num_input_tokens_seen": 61164360, + "step": 3735, + "train_runtime": 30352.1698, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.264242424242424, + "grad_norm": 0.012590425089001656, + "learning_rate": 8.867053195451183e-05, + "loss": 0.013337450101971626, + "num_input_tokens_seen": 61180736, + "step": 3736, + "train_runtime": 30360.2893, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.2648484848484847, + "grad_norm": 0.010968165472149849, + "learning_rate": 8.866443551740648e-05, + "loss": 0.013668050989508629, + "num_input_tokens_seen": 61197112, + "step": 3737, + "train_runtime": 30368.4071, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.2654545454545456, + "grad_norm": 0.00999407097697258, + "learning_rate": 8.865833765017899e-05, + "loss": 0.01268429309129715, + "num_input_tokens_seen": 61213488, + "step": 3738, + "train_runtime": 30376.5202, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.266060606060606, + "grad_norm": 0.012314529158174992, + "learning_rate": 8.865223835305485e-05, + "loss": 0.01283974852412939, + "num_input_tokens_seen": 61229864, + "step": 3739, + "train_runtime": 30384.635, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 0.0056611280888319016, + "learning_rate": 8.864613762625969e-05, + "loss": 0.01165764406323433, + "num_input_tokens_seen": 61246240, + "step": 3740, + "train_runtime": 30392.7519, + "train_tokens_per_second": 2015.159 + }, + { + "epoch": 2.267272727272727, + "grad_norm": 0.0059904055669903755, + "learning_rate": 8.864003547001915e-05, + "loss": 0.011943137273192406, + "num_input_tokens_seen": 61262616, + "step": 3741, + "train_runtime": 30400.8654, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.267878787878788, + "grad_norm": 0.00643067667260766, + "learning_rate": 8.863393188455897e-05, + "loss": 0.01149215642362833, + "num_input_tokens_seen": 61278992, + "step": 3742, + "train_runtime": 30408.9806, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.2684848484848485, + "grad_norm": 0.007379885762929916, + "learning_rate": 8.862782687010487e-05, + "loss": 0.012658249586820602, + "num_input_tokens_seen": 61295368, + "step": 3743, + "train_runtime": 30417.0971, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.269090909090909, + "grad_norm": 0.007340370211750269, + "learning_rate": 8.862172042688268e-05, + "loss": 0.01166062243282795, + "num_input_tokens_seen": 61311744, + "step": 3744, + "train_runtime": 30425.213, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.2696969696969695, + "grad_norm": 0.009426895529031754, + "learning_rate": 8.861561255511826e-05, + "loss": 0.010667637921869755, + "num_input_tokens_seen": 61328120, + "step": 3745, + "train_runtime": 30433.3335, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.2703030303030305, + "grad_norm": 0.009414087980985641, + "learning_rate": 8.860950325503754e-05, + "loss": 0.011794875375926495, + "num_input_tokens_seen": 61344496, + "step": 3746, + "train_runtime": 30441.4479, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.270909090909091, + "grad_norm": 0.003789094975218177, + "learning_rate": 8.860339252686648e-05, + "loss": 0.011640205979347229, + "num_input_tokens_seen": 61360872, + "step": 3747, + "train_runtime": 30449.5634, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.2715151515151515, + "grad_norm": 0.008663547225296497, + "learning_rate": 8.85972803708311e-05, + "loss": 0.011976547539234161, + "num_input_tokens_seen": 61377248, + "step": 3748, + "train_runtime": 30457.6768, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.272121212121212, + "grad_norm": 0.005016832146793604, + "learning_rate": 8.859116678715751e-05, + "loss": 0.011901703663170338, + "num_input_tokens_seen": 61393624, + "step": 3749, + "train_runtime": 30465.787, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.2727272727272725, + "grad_norm": 0.0046637230552732944, + "learning_rate": 8.85850517760718e-05, + "loss": 0.011695494875311852, + "num_input_tokens_seen": 61410000, + "step": 3750, + "train_runtime": 30473.9004, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.2733333333333334, + "grad_norm": 0.007660789415240288, + "learning_rate": 8.857893533780015e-05, + "loss": 0.011524790897965431, + "num_input_tokens_seen": 61426376, + "step": 3751, + "train_runtime": 30482.0201, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.273939393939394, + "grad_norm": 0.005929175764322281, + "learning_rate": 8.857281747256882e-05, + "loss": 0.01320036593824625, + "num_input_tokens_seen": 61442752, + "step": 3752, + "train_runtime": 30490.1361, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.2745454545454544, + "grad_norm": 0.013882546685636044, + "learning_rate": 8.856669818060409e-05, + "loss": 0.01192145049571991, + "num_input_tokens_seen": 61459128, + "step": 3753, + "train_runtime": 30498.2521, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.2751515151515154, + "grad_norm": 0.003313496010378003, + "learning_rate": 8.85605774621323e-05, + "loss": 0.011736012995243073, + "num_input_tokens_seen": 61475504, + "step": 3754, + "train_runtime": 30506.3624, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.275757575757576, + "grad_norm": 0.005379652138799429, + "learning_rate": 8.855445531737985e-05, + "loss": 0.012325488962233067, + "num_input_tokens_seen": 61491880, + "step": 3755, + "train_runtime": 30514.4757, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.2763636363636364, + "grad_norm": 0.012600576505064964, + "learning_rate": 8.854833174657317e-05, + "loss": 0.012256315909326077, + "num_input_tokens_seen": 61508256, + "step": 3756, + "train_runtime": 30522.5913, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.276969696969697, + "grad_norm": 0.011063162237405777, + "learning_rate": 8.854220674993876e-05, + "loss": 0.013093437068164349, + "num_input_tokens_seen": 61524632, + "step": 3757, + "train_runtime": 30530.7003, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.2775757575757574, + "grad_norm": 0.011599640361964703, + "learning_rate": 8.85360803277032e-05, + "loss": 0.011047718115150928, + "num_input_tokens_seen": 61541008, + "step": 3758, + "train_runtime": 30538.8161, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.2781818181818183, + "grad_norm": 0.011033455841243267, + "learning_rate": 8.852995248009305e-05, + "loss": 0.012573250569403172, + "num_input_tokens_seen": 61557384, + "step": 3759, + "train_runtime": 30546.9326, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.278787878787879, + "grad_norm": 0.01250431314110756, + "learning_rate": 8.852382320733501e-05, + "loss": 0.011132653802633286, + "num_input_tokens_seen": 61573760, + "step": 3760, + "train_runtime": 30555.0493, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.2793939393939393, + "grad_norm": 0.008397900499403477, + "learning_rate": 8.851769250965577e-05, + "loss": 0.012294886633753777, + "num_input_tokens_seen": 61590136, + "step": 3761, + "train_runtime": 30563.1648, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 0.0026903818361461163, + "learning_rate": 8.851156038728209e-05, + "loss": 0.012650152668356895, + "num_input_tokens_seen": 61606512, + "step": 3762, + "train_runtime": 30571.2819, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.2806060606060607, + "grad_norm": 0.006723749917000532, + "learning_rate": 8.850542684044078e-05, + "loss": 0.011322797276079655, + "num_input_tokens_seen": 61622888, + "step": 3763, + "train_runtime": 30579.3977, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.2812121212121212, + "grad_norm": 0.006619950756430626, + "learning_rate": 8.849929186935874e-05, + "loss": 0.012670768424868584, + "num_input_tokens_seen": 61639264, + "step": 3764, + "train_runtime": 30587.5168, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.2818181818181817, + "grad_norm": 0.004861629568040371, + "learning_rate": 8.849315547426284e-05, + "loss": 0.011259309016168118, + "num_input_tokens_seen": 61655640, + "step": 3765, + "train_runtime": 30595.6326, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.2824242424242422, + "grad_norm": 0.008465851657092571, + "learning_rate": 8.84870176553801e-05, + "loss": 0.010890805162489414, + "num_input_tokens_seen": 61672016, + "step": 3766, + "train_runtime": 30603.7507, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.283030303030303, + "grad_norm": 0.01856720820069313, + "learning_rate": 8.848087841293753e-05, + "loss": 0.011901823803782463, + "num_input_tokens_seen": 61688392, + "step": 3767, + "train_runtime": 30611.8653, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.2836363636363637, + "grad_norm": 0.006428460590541363, + "learning_rate": 8.84747377471622e-05, + "loss": 0.013656743802130222, + "num_input_tokens_seen": 61704768, + "step": 3768, + "train_runtime": 30619.9787, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.284242424242424, + "grad_norm": 0.008296936750411987, + "learning_rate": 8.846859565828124e-05, + "loss": 0.012134167365729809, + "num_input_tokens_seen": 61721144, + "step": 3769, + "train_runtime": 30628.0958, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.2848484848484847, + "grad_norm": 0.006756153889000416, + "learning_rate": 8.846245214652185e-05, + "loss": 0.011300654150545597, + "num_input_tokens_seen": 61737520, + "step": 3770, + "train_runtime": 30636.2146, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.2854545454545456, + "grad_norm": 0.007698638364672661, + "learning_rate": 8.845630721211124e-05, + "loss": 0.011657550930976868, + "num_input_tokens_seen": 61753896, + "step": 3771, + "train_runtime": 30644.3343, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.286060606060606, + "grad_norm": 0.00994476955384016, + "learning_rate": 8.845016085527673e-05, + "loss": 0.010930661112070084, + "num_input_tokens_seen": 61770272, + "step": 3772, + "train_runtime": 30652.4485, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.2866666666666666, + "grad_norm": 0.009451497346162796, + "learning_rate": 8.844401307624566e-05, + "loss": 0.012202934361994267, + "num_input_tokens_seen": 61786648, + "step": 3773, + "train_runtime": 30660.5626, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.287272727272727, + "grad_norm": 0.010669391602277756, + "learning_rate": 8.84378638752454e-05, + "loss": 0.013062708079814911, + "num_input_tokens_seen": 61803024, + "step": 3774, + "train_runtime": 30668.6819, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.287878787878788, + "grad_norm": 0.005233460105955601, + "learning_rate": 8.843171325250341e-05, + "loss": 0.011414062231779099, + "num_input_tokens_seen": 61819400, + "step": 3775, + "train_runtime": 30676.7958, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.2884848484848486, + "grad_norm": 0.006945099215954542, + "learning_rate": 8.842556120824719e-05, + "loss": 0.013735410757362843, + "num_input_tokens_seen": 61835776, + "step": 3776, + "train_runtime": 30684.9131, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.289090909090909, + "grad_norm": 0.011563980020582676, + "learning_rate": 8.841940774270429e-05, + "loss": 0.011850640177726746, + "num_input_tokens_seen": 61852152, + "step": 3777, + "train_runtime": 30693.0323, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.2896969696969696, + "grad_norm": 0.019558526575565338, + "learning_rate": 8.841325285610232e-05, + "loss": 0.012019994668662548, + "num_input_tokens_seen": 61868528, + "step": 3778, + "train_runtime": 30701.1474, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.29030303030303, + "grad_norm": 0.0072617363184690475, + "learning_rate": 8.840709654866892e-05, + "loss": 0.012482079677283764, + "num_input_tokens_seen": 61884904, + "step": 3779, + "train_runtime": 30709.2577, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.290909090909091, + "grad_norm": 0.008295083418488503, + "learning_rate": 8.840093882063182e-05, + "loss": 0.012043890543282032, + "num_input_tokens_seen": 61901280, + "step": 3780, + "train_runtime": 30717.3684, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.2915151515151515, + "grad_norm": 0.010095818899571896, + "learning_rate": 8.839477967221879e-05, + "loss": 0.0124919218942523, + "num_input_tokens_seen": 61917656, + "step": 3781, + "train_runtime": 30725.4828, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.292121212121212, + "grad_norm": 0.004255611915141344, + "learning_rate": 8.838861910365762e-05, + "loss": 0.01309207733720541, + "num_input_tokens_seen": 61934032, + "step": 3782, + "train_runtime": 30733.5973, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.292727272727273, + "grad_norm": 0.00850534439086914, + "learning_rate": 8.838245711517618e-05, + "loss": 0.012254266068339348, + "num_input_tokens_seen": 61950408, + "step": 3783, + "train_runtime": 30741.7101, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.2933333333333334, + "grad_norm": 0.00831502303481102, + "learning_rate": 8.83762937070024e-05, + "loss": 0.01236623153090477, + "num_input_tokens_seen": 61966784, + "step": 3784, + "train_runtime": 30749.8321, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.293939393939394, + "grad_norm": 0.00466503482311964, + "learning_rate": 8.837012887936426e-05, + "loss": 0.012684816494584084, + "num_input_tokens_seen": 61983160, + "step": 3785, + "train_runtime": 30757.9435, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.2945454545454544, + "grad_norm": 0.007791228126734495, + "learning_rate": 8.836396263248976e-05, + "loss": 0.012480277568101883, + "num_input_tokens_seen": 61999536, + "step": 3786, + "train_runtime": 30766.0566, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.295151515151515, + "grad_norm": 0.008186898194253445, + "learning_rate": 8.835779496660701e-05, + "loss": 0.012753861956298351, + "num_input_tokens_seen": 62015912, + "step": 3787, + "train_runtime": 30774.1735, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.295757575757576, + "grad_norm": 0.008967457339167595, + "learning_rate": 8.835162588194411e-05, + "loss": 0.011731700040400028, + "num_input_tokens_seen": 62032288, + "step": 3788, + "train_runtime": 30782.2879, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.2963636363636364, + "grad_norm": 0.003565243910998106, + "learning_rate": 8.834545537872925e-05, + "loss": 0.013852463103830814, + "num_input_tokens_seen": 62048664, + "step": 3789, + "train_runtime": 30790.4021, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.296969696969697, + "grad_norm": 0.0027453871443867683, + "learning_rate": 8.833928345719069e-05, + "loss": 0.011183127760887146, + "num_input_tokens_seen": 62065040, + "step": 3790, + "train_runtime": 30798.518, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.2975757575757574, + "grad_norm": 0.007668246980756521, + "learning_rate": 8.833311011755668e-05, + "loss": 0.011989946477115154, + "num_input_tokens_seen": 62081416, + "step": 3791, + "train_runtime": 30806.6335, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.2981818181818183, + "grad_norm": 0.025066649541258812, + "learning_rate": 8.832693536005558e-05, + "loss": 0.014158163219690323, + "num_input_tokens_seen": 62097792, + "step": 3792, + "train_runtime": 30814.7498, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 2.298787878787879, + "grad_norm": 0.017292816191911697, + "learning_rate": 8.832075918491579e-05, + "loss": 0.011756017804145813, + "num_input_tokens_seen": 62114168, + "step": 3793, + "train_runtime": 30822.8675, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.2993939393939393, + "grad_norm": 0.0034284384455531836, + "learning_rate": 8.831458159236575e-05, + "loss": 0.012419110164046288, + "num_input_tokens_seen": 62130544, + "step": 3794, + "train_runtime": 30830.9797, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.3, + "grad_norm": 0.00656497897580266, + "learning_rate": 8.830840258263393e-05, + "loss": 0.012269002385437489, + "num_input_tokens_seen": 62146920, + "step": 3795, + "train_runtime": 30839.0917, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 2.3006060606060608, + "grad_norm": 0.009967640973627567, + "learning_rate": 8.83022221559489e-05, + "loss": 0.012140346691012383, + "num_input_tokens_seen": 62163296, + "step": 3796, + "train_runtime": 30847.2027, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 2.3012121212121213, + "grad_norm": 0.007896405644714832, + "learning_rate": 8.829604031253929e-05, + "loss": 0.013535144738852978, + "num_input_tokens_seen": 62179672, + "step": 3797, + "train_runtime": 30855.3143, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.3018181818181818, + "grad_norm": 0.011223090812563896, + "learning_rate": 8.828985705263369e-05, + "loss": 0.012020010501146317, + "num_input_tokens_seen": 62196048, + "step": 3798, + "train_runtime": 30863.4314, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 2.3024242424242423, + "grad_norm": 0.006788720842450857, + "learning_rate": 8.828367237646087e-05, + "loss": 0.012599104084074497, + "num_input_tokens_seen": 62212424, + "step": 3799, + "train_runtime": 30871.5457, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 2.303030303030303, + "grad_norm": 0.006533706095069647, + "learning_rate": 8.827748628424956e-05, + "loss": 0.012083176523447037, + "num_input_tokens_seen": 62228800, + "step": 3800, + "train_runtime": 30879.6599, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.3036363636363637, + "grad_norm": 0.008081979118287563, + "learning_rate": 8.827129877622857e-05, + "loss": 0.012634792365133762, + "num_input_tokens_seen": 62245176, + "step": 3801, + "train_runtime": 30888.7316, + "train_tokens_per_second": 2015.142 + }, + { + "epoch": 2.304242424242424, + "grad_norm": 0.0023193880915641785, + "learning_rate": 8.826510985262677e-05, + "loss": 0.011524361558258533, + "num_input_tokens_seen": 62261552, + "step": 3802, + "train_runtime": 30896.8426, + "train_tokens_per_second": 2015.143 + }, + { + "epoch": 2.3048484848484847, + "grad_norm": 0.006892868783324957, + "learning_rate": 8.825891951367307e-05, + "loss": 0.011532147414982319, + "num_input_tokens_seen": 62277928, + "step": 3803, + "train_runtime": 30904.9545, + "train_tokens_per_second": 2015.144 + }, + { + "epoch": 2.3054545454545456, + "grad_norm": 0.0047895400784909725, + "learning_rate": 8.825272775959644e-05, + "loss": 0.013725175522267818, + "num_input_tokens_seen": 62294304, + "step": 3804, + "train_runtime": 30913.0669, + "train_tokens_per_second": 2015.145 + }, + { + "epoch": 2.306060606060606, + "grad_norm": 0.0064620282500982285, + "learning_rate": 8.824653459062591e-05, + "loss": 0.012814794667065144, + "num_input_tokens_seen": 62310680, + "step": 3805, + "train_runtime": 30921.1796, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.3066666666666666, + "grad_norm": 0.008356164209544659, + "learning_rate": 8.824034000699055e-05, + "loss": 0.01243899017572403, + "num_input_tokens_seen": 62327056, + "step": 3806, + "train_runtime": 30929.2943, + "train_tokens_per_second": 2015.146 + }, + { + "epoch": 2.307272727272727, + "grad_norm": 0.004934507422149181, + "learning_rate": 8.823414400891948e-05, + "loss": 0.011569508351385593, + "num_input_tokens_seen": 62343432, + "step": 3807, + "train_runtime": 30937.41, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.3078787878787876, + "grad_norm": 0.031424786895513535, + "learning_rate": 8.822794659664187e-05, + "loss": 0.01256850641220808, + "num_input_tokens_seen": 62359808, + "step": 3808, + "train_runtime": 30945.532, + "train_tokens_per_second": 2015.147 + }, + { + "epoch": 2.3084848484848486, + "grad_norm": 0.011678419075906277, + "learning_rate": 8.822174777038697e-05, + "loss": 0.012379190884530544, + "num_input_tokens_seen": 62376184, + "step": 3809, + "train_runtime": 30953.6454, + "train_tokens_per_second": 2015.148 + }, + { + "epoch": 2.309090909090909, + "grad_norm": 0.009252636693418026, + "learning_rate": 8.821554753038406e-05, + "loss": 0.012269056402146816, + "num_input_tokens_seen": 62392560, + "step": 3810, + "train_runtime": 30961.7611, + "train_tokens_per_second": 2015.149 + }, + { + "epoch": 2.3096969696969696, + "grad_norm": 0.012772388756275177, + "learning_rate": 8.820934587686247e-05, + "loss": 0.013819447718560696, + "num_input_tokens_seen": 62408936, + "step": 3811, + "train_runtime": 30969.8761, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.3103030303030305, + "grad_norm": 0.016963202506303787, + "learning_rate": 8.820314281005158e-05, + "loss": 0.013393501751124859, + "num_input_tokens_seen": 62425312, + "step": 3812, + "train_runtime": 30977.9916, + "train_tokens_per_second": 2015.15 + }, + { + "epoch": 2.310909090909091, + "grad_norm": 0.008827922865748405, + "learning_rate": 8.819693833018083e-05, + "loss": 0.011720137670636177, + "num_input_tokens_seen": 62441688, + "step": 3813, + "train_runtime": 30986.1051, + "train_tokens_per_second": 2015.151 + }, + { + "epoch": 2.3115151515151515, + "grad_norm": 0.012870227918028831, + "learning_rate": 8.81907324374797e-05, + "loss": 0.0119565948843956, + "num_input_tokens_seen": 62458064, + "step": 3814, + "train_runtime": 30994.2197, + "train_tokens_per_second": 2015.152 + }, + { + "epoch": 2.312121212121212, + "grad_norm": 0.010535592213273048, + "learning_rate": 8.818452513217778e-05, + "loss": 0.011577087454497814, + "num_input_tokens_seen": 62474440, + "step": 3815, + "train_runtime": 31002.3329, + "train_tokens_per_second": 2015.153 + }, + { + "epoch": 2.3127272727272725, + "grad_norm": 0.008618311025202274, + "learning_rate": 8.817831641450462e-05, + "loss": 0.011819293722510338, + "num_input_tokens_seen": 62490816, + "step": 3816, + "train_runtime": 31010.4426, + "train_tokens_per_second": 2015.154 + }, + { + "epoch": 2.3133333333333335, + "grad_norm": 0.006642166990786791, + "learning_rate": 8.817210628468991e-05, + "loss": 0.012905126437544823, + "num_input_tokens_seen": 62507192, + "step": 3817, + "train_runtime": 31018.5576, + "train_tokens_per_second": 2015.155 + }, + { + "epoch": 2.313939393939394, + "grad_norm": 0.005279494449496269, + "learning_rate": 8.81658947429633e-05, + "loss": 0.011396235786378384, + "num_input_tokens_seen": 62523568, + "step": 3818, + "train_runtime": 31026.6711, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.3145454545454545, + "grad_norm": 0.005412065424025059, + "learning_rate": 8.815968178955456e-05, + "loss": 0.01221628300845623, + "num_input_tokens_seen": 62539944, + "step": 3819, + "train_runtime": 31034.7855, + "train_tokens_per_second": 2015.156 + }, + { + "epoch": 2.315151515151515, + "grad_norm": 0.010122607462108135, + "learning_rate": 8.815346742469352e-05, + "loss": 0.013018102385103703, + "num_input_tokens_seen": 62556320, + "step": 3820, + "train_runtime": 31042.8967, + "train_tokens_per_second": 2015.157 + }, + { + "epoch": 2.315757575757576, + "grad_norm": 0.0067726317793130875, + "learning_rate": 8.814725164861001e-05, + "loss": 0.011113530024886131, + "num_input_tokens_seen": 62572696, + "step": 3821, + "train_runtime": 31051.0128, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.3163636363636364, + "grad_norm": 0.007007678970694542, + "learning_rate": 8.814103446153396e-05, + "loss": 0.011365599930286407, + "num_input_tokens_seen": 62589072, + "step": 3822, + "train_runtime": 31059.1319, + "train_tokens_per_second": 2015.158 + }, + { + "epoch": 2.316969696969697, + "grad_norm": 0.006877017207443714, + "learning_rate": 8.813481586369532e-05, + "loss": 0.013382461853325367, + "num_input_tokens_seen": 62605448, + "step": 3823, + "train_runtime": 31067.2418, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.3175757575757574, + "grad_norm": 0.009172811172902584, + "learning_rate": 8.812859585532411e-05, + "loss": 0.011877622455358505, + "num_input_tokens_seen": 62621824, + "step": 3824, + "train_runtime": 31075.3563, + "train_tokens_per_second": 2015.16 + }, + { + "epoch": 2.3181818181818183, + "grad_norm": 0.0055466280318796635, + "learning_rate": 8.81223744366504e-05, + "loss": 0.011979153379797935, + "num_input_tokens_seen": 62638200, + "step": 3825, + "train_runtime": 31083.4744, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.318787878787879, + "grad_norm": 0.011318989098072052, + "learning_rate": 8.811615160790427e-05, + "loss": 0.012014471925795078, + "num_input_tokens_seen": 62654576, + "step": 3826, + "train_runtime": 31091.593, + "train_tokens_per_second": 2015.161 + }, + { + "epoch": 2.3193939393939393, + "grad_norm": 0.005611430387943983, + "learning_rate": 8.810992736931594e-05, + "loss": 0.012370433658361435, + "num_input_tokens_seen": 62670952, + "step": 3827, + "train_runtime": 31099.7092, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.32, + "grad_norm": 0.005884826648980379, + "learning_rate": 8.810370172111559e-05, + "loss": 0.011997217312455177, + "num_input_tokens_seen": 62687328, + "step": 3828, + "train_runtime": 31107.8319, + "train_tokens_per_second": 2015.162 + }, + { + "epoch": 2.320606060606061, + "grad_norm": 0.008681188337504864, + "learning_rate": 8.809747466353356e-05, + "loss": 0.011951385997235775, + "num_input_tokens_seen": 62703704, + "step": 3829, + "train_runtime": 31115.946, + "train_tokens_per_second": 2015.163 + }, + { + "epoch": 2.3212121212121213, + "grad_norm": 0.006343021057546139, + "learning_rate": 8.80912461968001e-05, + "loss": 0.012676788493990898, + "num_input_tokens_seen": 62720080, + "step": 3830, + "train_runtime": 31124.0602, + "train_tokens_per_second": 2015.164 + }, + { + "epoch": 2.321818181818182, + "grad_norm": 0.02809157408773899, + "learning_rate": 8.808501632114563e-05, + "loss": 0.012784118764102459, + "num_input_tokens_seen": 62736456, + "step": 3831, + "train_runtime": 31132.1724, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.3224242424242423, + "grad_norm": 0.007597580552101135, + "learning_rate": 8.807878503680056e-05, + "loss": 0.012820257805287838, + "num_input_tokens_seen": 62752832, + "step": 3832, + "train_runtime": 31140.2903, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.3230303030303032, + "grad_norm": 0.011442799121141434, + "learning_rate": 8.80725523439954e-05, + "loss": 0.01274331659078598, + "num_input_tokens_seen": 62769208, + "step": 3833, + "train_runtime": 31148.4048, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.3236363636363637, + "grad_norm": 0.020038971677422523, + "learning_rate": 8.806631824296068e-05, + "loss": 0.012703890912234783, + "num_input_tokens_seen": 62785584, + "step": 3834, + "train_runtime": 31156.521, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.324242424242424, + "grad_norm": 0.005542220082134008, + "learning_rate": 8.806008273392698e-05, + "loss": 0.01139106322079897, + "num_input_tokens_seen": 62801960, + "step": 3835, + "train_runtime": 31164.6348, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.3248484848484847, + "grad_norm": 0.003856452414765954, + "learning_rate": 8.805384581712492e-05, + "loss": 0.011459710076451302, + "num_input_tokens_seen": 62818336, + "step": 3836, + "train_runtime": 31172.7486, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.325454545454545, + "grad_norm": 0.008005725219845772, + "learning_rate": 8.804760749278522e-05, + "loss": 0.01386493630707264, + "num_input_tokens_seen": 62834712, + "step": 3837, + "train_runtime": 31180.8619, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.326060606060606, + "grad_norm": 0.007413184270262718, + "learning_rate": 8.80413677611386e-05, + "loss": 0.012458150275051594, + "num_input_tokens_seen": 62851088, + "step": 3838, + "train_runtime": 31188.9752, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.3266666666666667, + "grad_norm": 0.006525777745991945, + "learning_rate": 8.803512662241589e-05, + "loss": 0.01186311710625887, + "num_input_tokens_seen": 62867464, + "step": 3839, + "train_runtime": 31197.0905, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.327272727272727, + "grad_norm": 0.007110640872269869, + "learning_rate": 8.802888407684791e-05, + "loss": 0.011905853636562824, + "num_input_tokens_seen": 62883840, + "step": 3840, + "train_runtime": 31205.2066, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.327878787878788, + "grad_norm": 0.006256428547203541, + "learning_rate": 8.802264012466557e-05, + "loss": 0.011172623373568058, + "num_input_tokens_seen": 62900216, + "step": 3841, + "train_runtime": 31213.3215, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.3284848484848486, + "grad_norm": 0.008018501102924347, + "learning_rate": 8.801639476609979e-05, + "loss": 0.012479901313781738, + "num_input_tokens_seen": 62916592, + "step": 3842, + "train_runtime": 31221.4325, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.329090909090909, + "grad_norm": 0.01011030375957489, + "learning_rate": 8.801014800138164e-05, + "loss": 0.013398240320384502, + "num_input_tokens_seen": 62932968, + "step": 3843, + "train_runtime": 31229.5477, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.3296969696969696, + "grad_norm": 0.01473653968423605, + "learning_rate": 8.800389983074211e-05, + "loss": 0.01274619810283184, + "num_input_tokens_seen": 62949344, + "step": 3844, + "train_runtime": 31237.6629, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.33030303030303, + "grad_norm": 0.006297265645116568, + "learning_rate": 8.799765025441235e-05, + "loss": 0.011753606610000134, + "num_input_tokens_seen": 62965720, + "step": 3845, + "train_runtime": 31245.7774, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.330909090909091, + "grad_norm": 0.009894706308841705, + "learning_rate": 8.79913992726235e-05, + "loss": 0.013425085693597794, + "num_input_tokens_seen": 62982096, + "step": 3846, + "train_runtime": 31253.8878, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.3315151515151515, + "grad_norm": 0.015656357631087303, + "learning_rate": 8.798514688560678e-05, + "loss": 0.012578295543789864, + "num_input_tokens_seen": 62998472, + "step": 3847, + "train_runtime": 31262.0009, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.332121212121212, + "grad_norm": 0.009474585764110088, + "learning_rate": 8.797889309359343e-05, + "loss": 0.012363191694021225, + "num_input_tokens_seen": 63014848, + "step": 3848, + "train_runtime": 31270.1147, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.3327272727272725, + "grad_norm": 0.007632074877619743, + "learning_rate": 8.79726378968148e-05, + "loss": 0.01290170382708311, + "num_input_tokens_seen": 63031224, + "step": 3849, + "train_runtime": 31278.2312, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 0.015381796285510063, + "learning_rate": 8.796638129550223e-05, + "loss": 0.013437875546514988, + "num_input_tokens_seen": 63047600, + "step": 3850, + "train_runtime": 31286.345, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.333939393939394, + "grad_norm": 0.0024572880938649178, + "learning_rate": 8.796012328988716e-05, + "loss": 0.012945571914315224, + "num_input_tokens_seen": 63063976, + "step": 3851, + "train_runtime": 31294.4586, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.3345454545454545, + "grad_norm": 0.03190493956208229, + "learning_rate": 8.795386388020106e-05, + "loss": 0.013913745060563087, + "num_input_tokens_seen": 63080352, + "step": 3852, + "train_runtime": 31302.5703, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.335151515151515, + "grad_norm": 0.007880356162786484, + "learning_rate": 8.794760306667544e-05, + "loss": 0.012863239273428917, + "num_input_tokens_seen": 63096728, + "step": 3853, + "train_runtime": 31310.6847, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.335757575757576, + "grad_norm": 0.00890274252742529, + "learning_rate": 8.794134084954189e-05, + "loss": 0.014049514196813107, + "num_input_tokens_seen": 63113104, + "step": 3854, + "train_runtime": 31318.7996, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.3363636363636364, + "grad_norm": 0.010360688902437687, + "learning_rate": 8.793507722903203e-05, + "loss": 0.013474004343152046, + "num_input_tokens_seen": 63129480, + "step": 3855, + "train_runtime": 31326.9153, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.336969696969697, + "grad_norm": 0.009520480409264565, + "learning_rate": 8.792881220537751e-05, + "loss": 0.012026038020849228, + "num_input_tokens_seen": 63145856, + "step": 3856, + "train_runtime": 31335.0317, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.3375757575757574, + "grad_norm": 0.004477637819945812, + "learning_rate": 8.792254577881012e-05, + "loss": 0.011705001816153526, + "num_input_tokens_seen": 63162232, + "step": 3857, + "train_runtime": 31343.1442, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.3381818181818184, + "grad_norm": 0.014203597791492939, + "learning_rate": 8.79162779495616e-05, + "loss": 0.01088397391140461, + "num_input_tokens_seen": 63178608, + "step": 3858, + "train_runtime": 31351.2613, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.338787878787879, + "grad_norm": 0.01523826364427805, + "learning_rate": 8.791000871786381e-05, + "loss": 0.012338997796177864, + "num_input_tokens_seen": 63194984, + "step": 3859, + "train_runtime": 31359.3787, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.3393939393939394, + "grad_norm": 0.009995555505156517, + "learning_rate": 8.790373808394862e-05, + "loss": 0.012309486977756023, + "num_input_tokens_seen": 63211360, + "step": 3860, + "train_runtime": 31367.4914, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.34, + "grad_norm": 0.0040956162847578526, + "learning_rate": 8.789746604804796e-05, + "loss": 0.011944938451051712, + "num_input_tokens_seen": 63227736, + "step": 3861, + "train_runtime": 31375.6055, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.340606060606061, + "grad_norm": 0.00751397805288434, + "learning_rate": 8.789119261039385e-05, + "loss": 0.011225864291191101, + "num_input_tokens_seen": 63244112, + "step": 3862, + "train_runtime": 31383.7176, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.3412121212121213, + "grad_norm": 0.007151484955102205, + "learning_rate": 8.78849177712183e-05, + "loss": 0.011669758707284927, + "num_input_tokens_seen": 63260488, + "step": 3863, + "train_runtime": 31391.8328, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.341818181818182, + "grad_norm": 0.0053044590167701244, + "learning_rate": 8.787864153075342e-05, + "loss": 0.011644741520285606, + "num_input_tokens_seen": 63276864, + "step": 3864, + "train_runtime": 31399.9456, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.3424242424242423, + "grad_norm": 0.003996069077402353, + "learning_rate": 8.787236388923137e-05, + "loss": 0.01181547436863184, + "num_input_tokens_seen": 63293240, + "step": 3865, + "train_runtime": 31408.0627, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.343030303030303, + "grad_norm": 0.009208021685481071, + "learning_rate": 8.786608484688432e-05, + "loss": 0.011496206745505333, + "num_input_tokens_seen": 63309616, + "step": 3866, + "train_runtime": 31416.1761, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.3436363636363637, + "grad_norm": 0.003765063127502799, + "learning_rate": 8.785980440394454e-05, + "loss": 0.012080052867531776, + "num_input_tokens_seen": 63325992, + "step": 3867, + "train_runtime": 31424.2906, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.3442424242424242, + "grad_norm": 0.013859529979526997, + "learning_rate": 8.785352256064432e-05, + "loss": 0.013082655146718025, + "num_input_tokens_seen": 63342368, + "step": 3868, + "train_runtime": 31432.4026, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.3448484848484847, + "grad_norm": 0.006903677247464657, + "learning_rate": 8.784723931721602e-05, + "loss": 0.011762754060328007, + "num_input_tokens_seen": 63358744, + "step": 3869, + "train_runtime": 31440.5151, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.3454545454545457, + "grad_norm": 0.011766036041080952, + "learning_rate": 8.784095467389202e-05, + "loss": 0.011996396817266941, + "num_input_tokens_seen": 63375120, + "step": 3870, + "train_runtime": 31448.632, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.346060606060606, + "grad_norm": 0.006888863630592823, + "learning_rate": 8.783466863090482e-05, + "loss": 0.01209091953933239, + "num_input_tokens_seen": 63391496, + "step": 3871, + "train_runtime": 31456.7487, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 0.005084891337901354, + "learning_rate": 8.78283811884869e-05, + "loss": 0.012265229597687721, + "num_input_tokens_seen": 63407872, + "step": 3872, + "train_runtime": 31464.8643, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.347272727272727, + "grad_norm": 0.00594416493549943, + "learning_rate": 8.782209234687083e-05, + "loss": 0.013370493426918983, + "num_input_tokens_seen": 63424248, + "step": 3873, + "train_runtime": 31472.9765, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 2.3478787878787877, + "grad_norm": 0.007754097227007151, + "learning_rate": 8.781580210628922e-05, + "loss": 0.011244947090744972, + "num_input_tokens_seen": 63440624, + "step": 3874, + "train_runtime": 31481.0909, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.3484848484848486, + "grad_norm": 0.006137073040008545, + "learning_rate": 8.780951046697475e-05, + "loss": 0.011433717794716358, + "num_input_tokens_seen": 63457000, + "step": 3875, + "train_runtime": 31489.2069, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.349090909090909, + "grad_norm": 0.008919673040509224, + "learning_rate": 8.780321742916008e-05, + "loss": 0.012265768833458424, + "num_input_tokens_seen": 63473376, + "step": 3876, + "train_runtime": 31497.3208, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 2.3496969696969696, + "grad_norm": 0.0061377896927297115, + "learning_rate": 8.779692299307804e-05, + "loss": 0.012009664438664913, + "num_input_tokens_seen": 63489752, + "step": 3877, + "train_runtime": 31505.4386, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 2.35030303030303, + "grad_norm": 0.009156587533652782, + "learning_rate": 8.779062715896143e-05, + "loss": 0.012589774094522, + "num_input_tokens_seen": 63506128, + "step": 3878, + "train_runtime": 31513.5564, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 2.350909090909091, + "grad_norm": 0.01206312794238329, + "learning_rate": 8.778432992704311e-05, + "loss": 0.011919211596250534, + "num_input_tokens_seen": 63522504, + "step": 3879, + "train_runtime": 31521.6703, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.3515151515151516, + "grad_norm": 0.003118017455562949, + "learning_rate": 8.777803129755599e-05, + "loss": 0.011527287773787975, + "num_input_tokens_seen": 63538880, + "step": 3880, + "train_runtime": 31529.7853, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 2.352121212121212, + "grad_norm": 0.008360499516129494, + "learning_rate": 8.777173127073308e-05, + "loss": 0.012660115957260132, + "num_input_tokens_seen": 63555256, + "step": 3881, + "train_runtime": 31537.8983, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 2.3527272727272726, + "grad_norm": 0.010115530341863632, + "learning_rate": 8.776542984680738e-05, + "loss": 0.012301959097385406, + "num_input_tokens_seen": 63571632, + "step": 3882, + "train_runtime": 31546.0083, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.3533333333333335, + "grad_norm": 0.00837163906544447, + "learning_rate": 8.7759127026012e-05, + "loss": 0.013043307699263096, + "num_input_tokens_seen": 63588008, + "step": 3883, + "train_runtime": 31554.1174, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.353939393939394, + "grad_norm": 0.006768459919840097, + "learning_rate": 8.775282280858e-05, + "loss": 0.012747212313115597, + "num_input_tokens_seen": 63604384, + "step": 3884, + "train_runtime": 31562.2343, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.3545454545454545, + "grad_norm": 0.007784388028085232, + "learning_rate": 8.774651719474463e-05, + "loss": 0.01133689470589161, + "num_input_tokens_seen": 63620760, + "step": 3885, + "train_runtime": 31570.3465, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 2.355151515151515, + "grad_norm": 0.0044767530634999275, + "learning_rate": 8.77402101847391e-05, + "loss": 0.011376718059182167, + "num_input_tokens_seen": 63637136, + "step": 3886, + "train_runtime": 31578.4583, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 2.355757575757576, + "grad_norm": 0.004322875756770372, + "learning_rate": 8.773390177879668e-05, + "loss": 0.010949796997010708, + "num_input_tokens_seen": 63653512, + "step": 3887, + "train_runtime": 31586.5693, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 2.3563636363636364, + "grad_norm": 0.008982558734714985, + "learning_rate": 8.772759197715073e-05, + "loss": 0.013297686353325844, + "num_input_tokens_seen": 63669888, + "step": 3888, + "train_runtime": 31594.6844, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 2.356969696969697, + "grad_norm": 0.005320236552506685, + "learning_rate": 8.772128078003461e-05, + "loss": 0.011769617907702923, + "num_input_tokens_seen": 63686264, + "step": 3889, + "train_runtime": 31602.7968, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 2.3575757575757574, + "grad_norm": 0.008895252831280231, + "learning_rate": 8.771496818768177e-05, + "loss": 0.011837522499263287, + "num_input_tokens_seen": 63702640, + "step": 3890, + "train_runtime": 31610.9094, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 2.3581818181818184, + "grad_norm": 0.00486038438975811, + "learning_rate": 8.770865420032571e-05, + "loss": 0.011740483343601227, + "num_input_tokens_seen": 63719016, + "step": 3891, + "train_runtime": 31619.0347, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 2.358787878787879, + "grad_norm": 0.004801337141543627, + "learning_rate": 8.770233881819997e-05, + "loss": 0.011615416966378689, + "num_input_tokens_seen": 63735392, + "step": 3892, + "train_runtime": 31627.1484, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 2.3593939393939394, + "grad_norm": 0.007693479303270578, + "learning_rate": 8.769602204153813e-05, + "loss": 0.01139110792428255, + "num_input_tokens_seen": 63751768, + "step": 3893, + "train_runtime": 31635.2584, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 2.36, + "grad_norm": 0.00855227466672659, + "learning_rate": 8.768970387057385e-05, + "loss": 0.012721371836960316, + "num_input_tokens_seen": 63768144, + "step": 3894, + "train_runtime": 31643.3682, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.3606060606060604, + "grad_norm": 0.008411828428506851, + "learning_rate": 8.768338430554082e-05, + "loss": 0.012851890176534653, + "num_input_tokens_seen": 63784520, + "step": 3895, + "train_runtime": 31651.4806, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.3612121212121213, + "grad_norm": 0.009419060312211514, + "learning_rate": 8.767706334667279e-05, + "loss": 0.011950638145208359, + "num_input_tokens_seen": 63800896, + "step": 3896, + "train_runtime": 31659.5904, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 2.361818181818182, + "grad_norm": 0.005601715762168169, + "learning_rate": 8.767074099420356e-05, + "loss": 0.011967363767325878, + "num_input_tokens_seen": 63817272, + "step": 3897, + "train_runtime": 31667.7024, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 2.3624242424242423, + "grad_norm": 0.014286424033343792, + "learning_rate": 8.766441724836698e-05, + "loss": 0.011659272015094757, + "num_input_tokens_seen": 63833648, + "step": 3898, + "train_runtime": 31675.8155, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 2.3630303030303033, + "grad_norm": 0.00412064278498292, + "learning_rate": 8.765809210939697e-05, + "loss": 0.011589322239160538, + "num_input_tokens_seen": 63850024, + "step": 3899, + "train_runtime": 31683.9353, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.3636363636363638, + "grad_norm": 0.007143765222281218, + "learning_rate": 8.765176557752744e-05, + "loss": 0.012061070650815964, + "num_input_tokens_seen": 63866400, + "step": 3900, + "train_runtime": 31692.046, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 2.3642424242424243, + "grad_norm": 0.004888126160949469, + "learning_rate": 8.764543765299245e-05, + "loss": 0.011578274890780449, + "num_input_tokens_seen": 63882776, + "step": 3901, + "train_runtime": 31701.0096, + "train_tokens_per_second": 2015.165 + }, + { + "epoch": 2.3648484848484848, + "grad_norm": 0.005478084087371826, + "learning_rate": 8.763910833602601e-05, + "loss": 0.011829855851829052, + "num_input_tokens_seen": 63899152, + "step": 3902, + "train_runtime": 31709.1321, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.3654545454545453, + "grad_norm": 0.007182782515883446, + "learning_rate": 8.763277762686227e-05, + "loss": 0.0127269197255373, + "num_input_tokens_seen": 63915528, + "step": 3903, + "train_runtime": 31717.2513, + "train_tokens_per_second": 2015.166 + }, + { + "epoch": 2.366060606060606, + "grad_norm": 0.007466602139174938, + "learning_rate": 8.762644552573535e-05, + "loss": 0.012824708595871925, + "num_input_tokens_seen": 63931904, + "step": 3904, + "train_runtime": 31725.3651, + "train_tokens_per_second": 2015.167 + }, + { + "epoch": 2.3666666666666667, + "grad_norm": 0.005637146532535553, + "learning_rate": 8.76201120328795e-05, + "loss": 0.011652861721813679, + "num_input_tokens_seen": 63948280, + "step": 3905, + "train_runtime": 31733.4766, + "train_tokens_per_second": 2015.168 + }, + { + "epoch": 2.367272727272727, + "grad_norm": 0.005748794414103031, + "learning_rate": 8.761377714852899e-05, + "loss": 0.011319580487906933, + "num_input_tokens_seen": 63964656, + "step": 3906, + "train_runtime": 31741.5911, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.3678787878787877, + "grad_norm": 0.00832221657037735, + "learning_rate": 8.760744087291808e-05, + "loss": 0.012169033288955688, + "num_input_tokens_seen": 63981032, + "step": 3907, + "train_runtime": 31749.7061, + "train_tokens_per_second": 2015.169 + }, + { + "epoch": 2.3684848484848486, + "grad_norm": 0.018167784437537193, + "learning_rate": 8.760110320628118e-05, + "loss": 0.012270736508071423, + "num_input_tokens_seen": 63997408, + "step": 3908, + "train_runtime": 31757.8211, + "train_tokens_per_second": 2015.17 + }, + { + "epoch": 2.369090909090909, + "grad_norm": 0.010734605602920055, + "learning_rate": 8.759476414885269e-05, + "loss": 0.012558269314467907, + "num_input_tokens_seen": 64013784, + "step": 3909, + "train_runtime": 31765.9363, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.3696969696969696, + "grad_norm": 0.023609034717082977, + "learning_rate": 8.758842370086709e-05, + "loss": 0.012377963401377201, + "num_input_tokens_seen": 64030160, + "step": 3910, + "train_runtime": 31774.0521, + "train_tokens_per_second": 2015.171 + }, + { + "epoch": 2.37030303030303, + "grad_norm": 0.008125863038003445, + "learning_rate": 8.75820818625589e-05, + "loss": 0.012392617762088776, + "num_input_tokens_seen": 64046536, + "step": 3911, + "train_runtime": 31782.1664, + "train_tokens_per_second": 2015.172 + }, + { + "epoch": 2.370909090909091, + "grad_norm": 0.005616354290395975, + "learning_rate": 8.757573863416269e-05, + "loss": 0.011652743443846703, + "num_input_tokens_seen": 64062912, + "step": 3912, + "train_runtime": 31790.2815, + "train_tokens_per_second": 2015.173 + }, + { + "epoch": 2.3715151515151516, + "grad_norm": 0.008864682167768478, + "learning_rate": 8.756939401591309e-05, + "loss": 0.012643544003367424, + "num_input_tokens_seen": 64079288, + "step": 3913, + "train_runtime": 31798.3948, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.372121212121212, + "grad_norm": 0.008480784483253956, + "learning_rate": 8.756304800804475e-05, + "loss": 0.011656440794467926, + "num_input_tokens_seen": 64095664, + "step": 3914, + "train_runtime": 31806.5085, + "train_tokens_per_second": 2015.174 + }, + { + "epoch": 2.3727272727272726, + "grad_norm": 0.001905617187730968, + "learning_rate": 8.755670061079244e-05, + "loss": 0.01306787971407175, + "num_input_tokens_seen": 64112040, + "step": 3915, + "train_runtime": 31814.6328, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.3733333333333335, + "grad_norm": 0.006388600450009108, + "learning_rate": 8.755035182439088e-05, + "loss": 0.011712341569364071, + "num_input_tokens_seen": 64128416, + "step": 3916, + "train_runtime": 31822.7462, + "train_tokens_per_second": 2015.175 + }, + { + "epoch": 2.373939393939394, + "grad_norm": 0.008756665512919426, + "learning_rate": 8.754400164907497e-05, + "loss": 0.012182825244963169, + "num_input_tokens_seen": 64144792, + "step": 3917, + "train_runtime": 31830.8643, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.3745454545454545, + "grad_norm": 0.008914709091186523, + "learning_rate": 8.753765008507953e-05, + "loss": 0.012191008776426315, + "num_input_tokens_seen": 64161168, + "step": 3918, + "train_runtime": 31838.9795, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.375151515151515, + "grad_norm": 0.022416841238737106, + "learning_rate": 8.753129713263951e-05, + "loss": 0.014600102789700031, + "num_input_tokens_seen": 64177544, + "step": 3919, + "train_runtime": 31847.0936, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.375757575757576, + "grad_norm": 0.007959865033626556, + "learning_rate": 8.75249427919899e-05, + "loss": 0.012604182586073875, + "num_input_tokens_seen": 64193920, + "step": 3920, + "train_runtime": 31855.2065, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.3763636363636365, + "grad_norm": 0.007637340109795332, + "learning_rate": 8.751858706336576e-05, + "loss": 0.01179521530866623, + "num_input_tokens_seen": 64210296, + "step": 3921, + "train_runtime": 31863.3188, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.376969696969697, + "grad_norm": 0.007538910489529371, + "learning_rate": 8.751222994700213e-05, + "loss": 0.012452198192477226, + "num_input_tokens_seen": 64226672, + "step": 3922, + "train_runtime": 31871.4351, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.3775757575757575, + "grad_norm": 0.011203823611140251, + "learning_rate": 8.750587144313416e-05, + "loss": 0.012804090976715088, + "num_input_tokens_seen": 64243048, + "step": 3923, + "train_runtime": 31879.5506, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.378181818181818, + "grad_norm": 0.00984243955463171, + "learning_rate": 8.749951155199703e-05, + "loss": 0.011516422033309937, + "num_input_tokens_seen": 64259424, + "step": 3924, + "train_runtime": 31887.6672, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.378787878787879, + "grad_norm": 0.007776329293847084, + "learning_rate": 8.749315027382601e-05, + "loss": 0.012981178238987923, + "num_input_tokens_seen": 64275800, + "step": 3925, + "train_runtime": 31895.7817, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.3793939393939394, + "grad_norm": 0.007024808786809444, + "learning_rate": 8.748678760885638e-05, + "loss": 0.012982901185750961, + "num_input_tokens_seen": 64292176, + "step": 3926, + "train_runtime": 31903.8951, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.38, + "grad_norm": 0.0047778841108083725, + "learning_rate": 8.748042355732349e-05, + "loss": 0.01131907757371664, + "num_input_tokens_seen": 64308552, + "step": 3927, + "train_runtime": 31912.0105, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.380606060606061, + "grad_norm": 0.004458183888345957, + "learning_rate": 8.74740581194627e-05, + "loss": 0.011565866880118847, + "num_input_tokens_seen": 64324928, + "step": 3928, + "train_runtime": 31920.1334, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.3812121212121213, + "grad_norm": 0.0061905584298074245, + "learning_rate": 8.746769129550949e-05, + "loss": 0.011856907047331333, + "num_input_tokens_seen": 64341304, + "step": 3929, + "train_runtime": 31928.2494, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.381818181818182, + "grad_norm": 0.008414180018007755, + "learning_rate": 8.746132308569934e-05, + "loss": 0.011779951862990856, + "num_input_tokens_seen": 64357680, + "step": 3930, + "train_runtime": 31936.3652, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.3824242424242423, + "grad_norm": 0.017430568113923073, + "learning_rate": 8.745495349026781e-05, + "loss": 0.013200096786022186, + "num_input_tokens_seen": 64374056, + "step": 3931, + "train_runtime": 31944.4848, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.383030303030303, + "grad_norm": 0.007815414108335972, + "learning_rate": 8.744858250945049e-05, + "loss": 0.011149406433105469, + "num_input_tokens_seen": 64390432, + "step": 3932, + "train_runtime": 31952.6034, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.3836363636363638, + "grad_norm": 0.011867988854646683, + "learning_rate": 8.744221014348301e-05, + "loss": 0.012829601764678955, + "num_input_tokens_seen": 64406808, + "step": 3933, + "train_runtime": 31960.7217, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.3842424242424243, + "grad_norm": 0.01321521494537592, + "learning_rate": 8.743583639260111e-05, + "loss": 0.011815814301371574, + "num_input_tokens_seen": 64423184, + "step": 3934, + "train_runtime": 31968.8435, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.3848484848484848, + "grad_norm": 0.010845907032489777, + "learning_rate": 8.742946125704052e-05, + "loss": 0.012657510116696358, + "num_input_tokens_seen": 64439560, + "step": 3935, + "train_runtime": 31976.9608, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.3854545454545453, + "grad_norm": 0.007816660217940807, + "learning_rate": 8.742308473703706e-05, + "loss": 0.01170468982309103, + "num_input_tokens_seen": 64455936, + "step": 3936, + "train_runtime": 31985.0806, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.386060606060606, + "grad_norm": 0.005382678937166929, + "learning_rate": 8.741670683282655e-05, + "loss": 0.0122041841968894, + "num_input_tokens_seen": 64472312, + "step": 3937, + "train_runtime": 31993.1961, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.3866666666666667, + "grad_norm": 0.004945850465446711, + "learning_rate": 8.741032754464494e-05, + "loss": 0.011137978173792362, + "num_input_tokens_seen": 64488688, + "step": 3938, + "train_runtime": 32001.3096, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.387272727272727, + "grad_norm": 0.004918837919831276, + "learning_rate": 8.740394687272816e-05, + "loss": 0.011163354851305485, + "num_input_tokens_seen": 64505064, + "step": 3939, + "train_runtime": 32009.4312, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.3878787878787877, + "grad_norm": 0.006628453731536865, + "learning_rate": 8.739756481731223e-05, + "loss": 0.012198293581604958, + "num_input_tokens_seen": 64521440, + "step": 3940, + "train_runtime": 32017.5454, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.3884848484848487, + "grad_norm": 0.006536687724292278, + "learning_rate": 8.73911813786332e-05, + "loss": 0.011898152530193329, + "num_input_tokens_seen": 64537816, + "step": 3941, + "train_runtime": 32025.6611, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.389090909090909, + "grad_norm": 0.011323172599077225, + "learning_rate": 8.738479655692719e-05, + "loss": 0.013106940314173698, + "num_input_tokens_seen": 64554192, + "step": 3942, + "train_runtime": 32033.7775, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.3896969696969697, + "grad_norm": 0.010536973364651203, + "learning_rate": 8.737841035243036e-05, + "loss": 0.012583276256918907, + "num_input_tokens_seen": 64570568, + "step": 3943, + "train_runtime": 32041.8972, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.39030303030303, + "grad_norm": 0.007271362002938986, + "learning_rate": 8.737202276537891e-05, + "loss": 0.012286387383937836, + "num_input_tokens_seen": 64586944, + "step": 3944, + "train_runtime": 32050.0111, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.390909090909091, + "grad_norm": 0.009338966570794582, + "learning_rate": 8.736563379600913e-05, + "loss": 0.012851104140281677, + "num_input_tokens_seen": 64603320, + "step": 3945, + "train_runtime": 32058.1218, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.3915151515151516, + "grad_norm": 0.008454913273453712, + "learning_rate": 8.735924344455732e-05, + "loss": 0.012162717990577221, + "num_input_tokens_seen": 64619696, + "step": 3946, + "train_runtime": 32066.2372, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.392121212121212, + "grad_norm": 0.0019281277200207114, + "learning_rate": 8.735285171125986e-05, + "loss": 0.010228649713099003, + "num_input_tokens_seen": 64636072, + "step": 3947, + "train_runtime": 32074.3517, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.3927272727272726, + "grad_norm": 0.0058963717892766, + "learning_rate": 8.734645859635313e-05, + "loss": 0.011077743954956532, + "num_input_tokens_seen": 64652448, + "step": 3948, + "train_runtime": 32082.4649, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.3933333333333335, + "grad_norm": 0.01275597233325243, + "learning_rate": 8.734006410007365e-05, + "loss": 0.013113114051520824, + "num_input_tokens_seen": 64668824, + "step": 3949, + "train_runtime": 32090.5816, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.393939393939394, + "grad_norm": 0.008671462535858154, + "learning_rate": 8.73336682226579e-05, + "loss": 0.012027603574097157, + "num_input_tokens_seen": 64685200, + "step": 3950, + "train_runtime": 32098.6971, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 2.3945454545454545, + "grad_norm": 0.0048281666822731495, + "learning_rate": 8.732727096434247e-05, + "loss": 0.011461691930890083, + "num_input_tokens_seen": 64701576, + "step": 3951, + "train_runtime": 32106.817, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 2.395151515151515, + "grad_norm": 0.0087870042771101, + "learning_rate": 8.732087232536399e-05, + "loss": 0.011858628131449223, + "num_input_tokens_seen": 64717952, + "step": 3952, + "train_runtime": 32114.933, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.3957575757575755, + "grad_norm": 0.023671137169003487, + "learning_rate": 8.731447230595911e-05, + "loss": 0.013579844497144222, + "num_input_tokens_seen": 64734328, + "step": 3953, + "train_runtime": 32123.0491, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 2.3963636363636365, + "grad_norm": 0.00703487079590559, + "learning_rate": 8.730807090636457e-05, + "loss": 0.011315067298710346, + "num_input_tokens_seen": 64750704, + "step": 3954, + "train_runtime": 32131.1664, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 2.396969696969697, + "grad_norm": 0.009797473438084126, + "learning_rate": 8.730166812681713e-05, + "loss": 0.01242794282734394, + "num_input_tokens_seen": 64767080, + "step": 3955, + "train_runtime": 32139.2798, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 2.3975757575757575, + "grad_norm": 0.009140574373304844, + "learning_rate": 8.729526396755365e-05, + "loss": 0.011765132658183575, + "num_input_tokens_seen": 64783456, + "step": 3956, + "train_runtime": 32147.3949, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.3981818181818184, + "grad_norm": 0.002114366739988327, + "learning_rate": 8.728885842881095e-05, + "loss": 0.01141907088458538, + "num_input_tokens_seen": 64799832, + "step": 3957, + "train_runtime": 32155.5104, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.398787878787879, + "grad_norm": 0.010168755427002907, + "learning_rate": 8.728245151082604e-05, + "loss": 0.012520655058324337, + "num_input_tokens_seen": 64816208, + "step": 3958, + "train_runtime": 32163.6325, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 2.3993939393939394, + "grad_norm": 0.01424007210880518, + "learning_rate": 8.727604321383583e-05, + "loss": 0.013034731149673462, + "num_input_tokens_seen": 64832584, + "step": 3959, + "train_runtime": 32171.7515, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 2.4, + "grad_norm": 0.0068917106837034225, + "learning_rate": 8.726963353807735e-05, + "loss": 0.011889351531863213, + "num_input_tokens_seen": 64848960, + "step": 3960, + "train_runtime": 32179.8659, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 2.4006060606060604, + "grad_norm": 0.009508705697953701, + "learning_rate": 8.726322248378775e-05, + "loss": 0.01212370302528143, + "num_input_tokens_seen": 64865336, + "step": 3961, + "train_runtime": 32187.9824, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.4012121212121214, + "grad_norm": 0.00829673558473587, + "learning_rate": 8.725681005120409e-05, + "loss": 0.011778369545936584, + "num_input_tokens_seen": 64881712, + "step": 3962, + "train_runtime": 32196.0985, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.401818181818182, + "grad_norm": 0.005323054734617472, + "learning_rate": 8.725039624056359e-05, + "loss": 0.012644222006201744, + "num_input_tokens_seen": 64898088, + "step": 3963, + "train_runtime": 32204.2147, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.4024242424242424, + "grad_norm": 0.008517703972756863, + "learning_rate": 8.724398105210345e-05, + "loss": 0.01162803266197443, + "num_input_tokens_seen": 64914464, + "step": 3964, + "train_runtime": 32212.3337, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.403030303030303, + "grad_norm": 0.010649501346051693, + "learning_rate": 8.723756448606101e-05, + "loss": 0.011956385336816311, + "num_input_tokens_seen": 64930840, + "step": 3965, + "train_runtime": 32220.4482, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 2.403636363636364, + "grad_norm": 0.009511959739029408, + "learning_rate": 8.723114654267356e-05, + "loss": 0.011983465403318405, + "num_input_tokens_seen": 64947216, + "step": 3966, + "train_runtime": 32228.5616, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 2.4042424242424243, + "grad_norm": 0.006440349854528904, + "learning_rate": 8.722472722217852e-05, + "loss": 0.012858221307396889, + "num_input_tokens_seen": 64963592, + "step": 3967, + "train_runtime": 32236.6751, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 2.404848484848485, + "grad_norm": 0.007032300345599651, + "learning_rate": 8.721830652481328e-05, + "loss": 0.013298509642481804, + "num_input_tokens_seen": 64979968, + "step": 3968, + "train_runtime": 32244.7854, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 2.4054545454545453, + "grad_norm": 0.005947112571448088, + "learning_rate": 8.72118844508154e-05, + "loss": 0.012420369312167168, + "num_input_tokens_seen": 64996344, + "step": 3969, + "train_runtime": 32252.903, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 2.4060606060606062, + "grad_norm": 0.009514856152236462, + "learning_rate": 8.720546100042235e-05, + "loss": 0.012390246614813805, + "num_input_tokens_seen": 65012720, + "step": 3970, + "train_runtime": 32261.0208, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 2.4066666666666667, + "grad_norm": 0.010288123041391373, + "learning_rate": 8.719903617387178e-05, + "loss": 0.011466922238469124, + "num_input_tokens_seen": 65029096, + "step": 3971, + "train_runtime": 32269.1401, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 2.4072727272727272, + "grad_norm": 0.008854511193931103, + "learning_rate": 8.719260997140128e-05, + "loss": 0.013790170662105083, + "num_input_tokens_seen": 65045472, + "step": 3972, + "train_runtime": 32277.2523, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 2.4078787878787877, + "grad_norm": 0.00761467544361949, + "learning_rate": 8.718618239324858e-05, + "loss": 0.012384867295622826, + "num_input_tokens_seen": 65061848, + "step": 3973, + "train_runtime": 32285.3664, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 2.4084848484848487, + "grad_norm": 0.014865431934595108, + "learning_rate": 8.717975343965141e-05, + "loss": 0.012042179703712463, + "num_input_tokens_seen": 65078224, + "step": 3974, + "train_runtime": 32293.4804, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 2.409090909090909, + "grad_norm": 0.012648499570786953, + "learning_rate": 8.717332311084755e-05, + "loss": 0.013338636606931686, + "num_input_tokens_seen": 65094600, + "step": 3975, + "train_runtime": 32301.595, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 2.4096969696969697, + "grad_norm": 0.010478436946868896, + "learning_rate": 8.716689140707488e-05, + "loss": 0.01220523752272129, + "num_input_tokens_seen": 65110976, + "step": 3976, + "train_runtime": 32309.7103, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.41030303030303, + "grad_norm": 0.005855999421328306, + "learning_rate": 8.716045832857128e-05, + "loss": 0.012362138368189335, + "num_input_tokens_seen": 65127352, + "step": 3977, + "train_runtime": 32317.833, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.410909090909091, + "grad_norm": 0.008465795777738094, + "learning_rate": 8.715402387557467e-05, + "loss": 0.012066630646586418, + "num_input_tokens_seen": 65143728, + "step": 3978, + "train_runtime": 32325.9436, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 2.4115151515151516, + "grad_norm": 0.004939177073538303, + "learning_rate": 8.714758804832309e-05, + "loss": 0.011245203204452991, + "num_input_tokens_seen": 65160104, + "step": 3979, + "train_runtime": 32334.0546, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 2.412121212121212, + "grad_norm": 0.008906640112400055, + "learning_rate": 8.714115084705454e-05, + "loss": 0.012994782999157906, + "num_input_tokens_seen": 65176480, + "step": 3980, + "train_runtime": 32342.1684, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 2.4127272727272726, + "grad_norm": 0.006914692930877209, + "learning_rate": 8.713471227200719e-05, + "loss": 0.011876557022333145, + "num_input_tokens_seen": 65192856, + "step": 3981, + "train_runtime": 32350.2791, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.413333333333333, + "grad_norm": 0.007783967535942793, + "learning_rate": 8.712827232341911e-05, + "loss": 0.01124640740454197, + "num_input_tokens_seen": 65209232, + "step": 3982, + "train_runtime": 32358.3956, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.413939393939394, + "grad_norm": 0.009643170982599258, + "learning_rate": 8.712183100152857e-05, + "loss": 0.01266550924628973, + "num_input_tokens_seen": 65225608, + "step": 3983, + "train_runtime": 32366.5081, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 2.4145454545454546, + "grad_norm": 0.008697593584656715, + "learning_rate": 8.711538830657378e-05, + "loss": 0.011208837851881981, + "num_input_tokens_seen": 65241984, + "step": 3984, + "train_runtime": 32374.6203, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 2.415151515151515, + "grad_norm": 0.008631549775600433, + "learning_rate": 8.710894423879305e-05, + "loss": 0.012429659254848957, + "num_input_tokens_seen": 65258360, + "step": 3985, + "train_runtime": 32382.7329, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 2.415757575757576, + "grad_norm": 0.017001524567604065, + "learning_rate": 8.710249879842476e-05, + "loss": 0.012807359918951988, + "num_input_tokens_seen": 65274736, + "step": 3986, + "train_runtime": 32390.8453, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.4163636363636365, + "grad_norm": 0.008777068927884102, + "learning_rate": 8.709605198570728e-05, + "loss": 0.010934505611658096, + "num_input_tokens_seen": 65291112, + "step": 3987, + "train_runtime": 32398.9606, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.416969696969697, + "grad_norm": 0.005198657512664795, + "learning_rate": 8.708960380087907e-05, + "loss": 0.01222632359713316, + "num_input_tokens_seen": 65307488, + "step": 3988, + "train_runtime": 32407.0754, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 2.4175757575757575, + "grad_norm": 0.010032770223915577, + "learning_rate": 8.708315424417866e-05, + "loss": 0.012328274548053741, + "num_input_tokens_seen": 65323864, + "step": 3989, + "train_runtime": 32415.1872, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 2.418181818181818, + "grad_norm": 0.009887870401144028, + "learning_rate": 8.707670331584459e-05, + "loss": 0.012643869034945965, + "num_input_tokens_seen": 65340240, + "step": 3990, + "train_runtime": 32423.3028, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 2.418787878787879, + "grad_norm": 0.009102854877710342, + "learning_rate": 8.707025101611545e-05, + "loss": 0.012734930962324142, + "num_input_tokens_seen": 65356616, + "step": 3991, + "train_runtime": 32431.4161, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 2.4193939393939394, + "grad_norm": 0.025766532868146896, + "learning_rate": 8.706379734522994e-05, + "loss": 0.01270482037216425, + "num_input_tokens_seen": 65372992, + "step": 3992, + "train_runtime": 32439.5348, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 2.42, + "grad_norm": 0.006967430934309959, + "learning_rate": 8.705734230342672e-05, + "loss": 0.01074296422302723, + "num_input_tokens_seen": 65389368, + "step": 3993, + "train_runtime": 32447.6469, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 2.4206060606060604, + "grad_norm": 0.011303038336336613, + "learning_rate": 8.705088589094459e-05, + "loss": 0.013301991857588291, + "num_input_tokens_seen": 65405744, + "step": 3994, + "train_runtime": 32455.7634, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 2.4212121212121214, + "grad_norm": 0.014830484986305237, + "learning_rate": 8.704442810802234e-05, + "loss": 0.013416048139333725, + "num_input_tokens_seen": 65422120, + "step": 3995, + "train_runtime": 32463.8793, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 2.421818181818182, + "grad_norm": 0.0049521829932928085, + "learning_rate": 8.703796895489883e-05, + "loss": 0.013172317296266556, + "num_input_tokens_seen": 65438496, + "step": 3996, + "train_runtime": 32471.9961, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 2.4224242424242424, + "grad_norm": 0.0015984463971108198, + "learning_rate": 8.7031508431813e-05, + "loss": 0.011939289048314095, + "num_input_tokens_seen": 65454872, + "step": 3997, + "train_runtime": 32480.1114, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 2.423030303030303, + "grad_norm": 0.0038701631128787994, + "learning_rate": 8.702504653900376e-05, + "loss": 0.012130429968237877, + "num_input_tokens_seen": 65471248, + "step": 3998, + "train_runtime": 32488.2337, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 2.423636363636364, + "grad_norm": 0.007171195466071367, + "learning_rate": 8.701858327671016e-05, + "loss": 0.01233917847275734, + "num_input_tokens_seen": 65487624, + "step": 3999, + "train_runtime": 32496.3526, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 2.4242424242424243, + "grad_norm": 0.0014149510534480214, + "learning_rate": 8.701211864517126e-05, + "loss": 0.010504164732992649, + "num_input_tokens_seen": 65504000, + "step": 4000, + "train_runtime": 32504.4642, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 2.424848484848485, + "grad_norm": 0.0008805702673271298, + "learning_rate": 8.700565264462617e-05, + "loss": 0.011478891596198082, + "num_input_tokens_seen": 65520376, + "step": 4001, + "train_runtime": 32513.4696, + "train_tokens_per_second": 2015.176 + }, + { + "epoch": 2.4254545454545453, + "grad_norm": 0.007924284785985947, + "learning_rate": 8.699918527531404e-05, + "loss": 0.011402283795177937, + "num_input_tokens_seen": 65536752, + "step": 4002, + "train_runtime": 32521.5831, + "train_tokens_per_second": 2015.177 + }, + { + "epoch": 2.4260606060606063, + "grad_norm": 0.009870833717286587, + "learning_rate": 8.699271653747411e-05, + "loss": 0.01156836748123169, + "num_input_tokens_seen": 65553128, + "step": 4003, + "train_runtime": 32529.6943, + "train_tokens_per_second": 2015.178 + }, + { + "epoch": 2.4266666666666667, + "grad_norm": 0.011118307709693909, + "learning_rate": 8.698624643134564e-05, + "loss": 0.013963157311081886, + "num_input_tokens_seen": 65569504, + "step": 4004, + "train_runtime": 32537.8051, + "train_tokens_per_second": 2015.179 + }, + { + "epoch": 2.4272727272727272, + "grad_norm": 0.021435871720314026, + "learning_rate": 8.697977495716793e-05, + "loss": 0.012135976925492287, + "num_input_tokens_seen": 65585880, + "step": 4005, + "train_runtime": 32545.9167, + "train_tokens_per_second": 2015.18 + }, + { + "epoch": 2.4278787878787877, + "grad_norm": 0.009903870522975922, + "learning_rate": 8.697330211518038e-05, + "loss": 0.012505902908742428, + "num_input_tokens_seen": 65602256, + "step": 4006, + "train_runtime": 32554.0318, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.4284848484848487, + "grad_norm": 0.008014468476176262, + "learning_rate": 8.696682790562236e-05, + "loss": 0.012186196632683277, + "num_input_tokens_seen": 65618632, + "step": 4007, + "train_runtime": 32562.1483, + "train_tokens_per_second": 2015.181 + }, + { + "epoch": 2.429090909090909, + "grad_norm": 0.007817745208740234, + "learning_rate": 8.696035232873339e-05, + "loss": 0.012502459809184074, + "num_input_tokens_seen": 65635008, + "step": 4008, + "train_runtime": 32570.2616, + "train_tokens_per_second": 2015.182 + }, + { + "epoch": 2.4296969696969697, + "grad_norm": 0.01054783258587122, + "learning_rate": 8.695387538475295e-05, + "loss": 0.012050812132656574, + "num_input_tokens_seen": 65651384, + "step": 4009, + "train_runtime": 32578.3709, + "train_tokens_per_second": 2015.183 + }, + { + "epoch": 2.43030303030303, + "grad_norm": 0.010385658591985703, + "learning_rate": 8.694739707392063e-05, + "loss": 0.012348880060017109, + "num_input_tokens_seen": 65667760, + "step": 4010, + "train_runtime": 32586.4871, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.4309090909090907, + "grad_norm": 0.006136162206530571, + "learning_rate": 8.694091739647602e-05, + "loss": 0.01128119695931673, + "num_input_tokens_seen": 65684136, + "step": 4011, + "train_runtime": 32594.6042, + "train_tokens_per_second": 2015.184 + }, + { + "epoch": 2.4315151515151516, + "grad_norm": 0.16773830354213715, + "learning_rate": 8.693443635265884e-05, + "loss": 0.019942179322242737, + "num_input_tokens_seen": 65700512, + "step": 4012, + "train_runtime": 32602.7162, + "train_tokens_per_second": 2015.185 + }, + { + "epoch": 2.432121212121212, + "grad_norm": 0.007391555700451136, + "learning_rate": 8.692795394270878e-05, + "loss": 0.011346128769218922, + "num_input_tokens_seen": 65716888, + "step": 4013, + "train_runtime": 32610.8327, + "train_tokens_per_second": 2015.186 + }, + { + "epoch": 2.4327272727272726, + "grad_norm": 0.008623134344816208, + "learning_rate": 8.692147016686562e-05, + "loss": 0.011837894096970558, + "num_input_tokens_seen": 65733264, + "step": 4014, + "train_runtime": 32618.9456, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.4333333333333336, + "grad_norm": 0.017599230632185936, + "learning_rate": 8.691498502536919e-05, + "loss": 0.013225538656115532, + "num_input_tokens_seen": 65749640, + "step": 4015, + "train_runtime": 32627.0611, + "train_tokens_per_second": 2015.187 + }, + { + "epoch": 2.433939393939394, + "grad_norm": 0.008119662292301655, + "learning_rate": 8.690849851845933e-05, + "loss": 0.013446874916553497, + "num_input_tokens_seen": 65766016, + "step": 4016, + "train_runtime": 32635.1768, + "train_tokens_per_second": 2015.188 + }, + { + "epoch": 2.4345454545454546, + "grad_norm": 1.2305080890655518, + "learning_rate": 8.6902010646376e-05, + "loss": 0.02664513699710369, + "num_input_tokens_seen": 65782392, + "step": 4017, + "train_runtime": 32643.2922, + "train_tokens_per_second": 2015.189 + }, + { + "epoch": 2.435151515151515, + "grad_norm": 0.004301746841520071, + "learning_rate": 8.689552140935914e-05, + "loss": 0.011035654693841934, + "num_input_tokens_seen": 65798768, + "step": 4018, + "train_runtime": 32651.4034, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.4357575757575756, + "grad_norm": 0.004877461586147547, + "learning_rate": 8.688903080764883e-05, + "loss": 0.011010750196874142, + "num_input_tokens_seen": 65815144, + "step": 4019, + "train_runtime": 32659.5176, + "train_tokens_per_second": 2015.19 + }, + { + "epoch": 2.4363636363636365, + "grad_norm": 0.00430849427357316, + "learning_rate": 8.688253884148509e-05, + "loss": 0.012497692368924618, + "num_input_tokens_seen": 65831520, + "step": 4020, + "train_runtime": 32667.6349, + "train_tokens_per_second": 2015.191 + }, + { + "epoch": 2.436969696969697, + "grad_norm": 0.04685498774051666, + "learning_rate": 8.687604551110807e-05, + "loss": 0.013098020106554031, + "num_input_tokens_seen": 65847896, + "step": 4021, + "train_runtime": 32675.7503, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.4375757575757575, + "grad_norm": 0.007516586687415838, + "learning_rate": 8.686955081675791e-05, + "loss": 0.011286056600511074, + "num_input_tokens_seen": 65864272, + "step": 4022, + "train_runtime": 32683.8619, + "train_tokens_per_second": 2015.192 + }, + { + "epoch": 2.438181818181818, + "grad_norm": 0.006795146968215704, + "learning_rate": 8.68630547586749e-05, + "loss": 0.011966537684202194, + "num_input_tokens_seen": 65880648, + "step": 4023, + "train_runtime": 32691.9751, + "train_tokens_per_second": 2015.193 + }, + { + "epoch": 2.438787878787879, + "grad_norm": 0.0067265634424984455, + "learning_rate": 8.685655733709928e-05, + "loss": 0.012211540713906288, + "num_input_tokens_seen": 65897024, + "step": 4024, + "train_runtime": 32700.0912, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.4393939393939394, + "grad_norm": 0.08641108870506287, + "learning_rate": 8.685005855227135e-05, + "loss": 0.017108287662267685, + "num_input_tokens_seen": 65913400, + "step": 4025, + "train_runtime": 32708.2163, + "train_tokens_per_second": 2015.194 + }, + { + "epoch": 2.44, + "grad_norm": 0.011075939051806927, + "learning_rate": 8.684355840443155e-05, + "loss": 0.011944500729441643, + "num_input_tokens_seen": 65929776, + "step": 4026, + "train_runtime": 32716.3331, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.4406060606060604, + "grad_norm": 0.069160595536232, + "learning_rate": 8.683705689382024e-05, + "loss": 0.017644496634602547, + "num_input_tokens_seen": 65946152, + "step": 4027, + "train_runtime": 32724.4465, + "train_tokens_per_second": 2015.195 + }, + { + "epoch": 2.4412121212121214, + "grad_norm": 0.009637738578021526, + "learning_rate": 8.683055402067797e-05, + "loss": 0.012353415600955486, + "num_input_tokens_seen": 65962528, + "step": 4028, + "train_runtime": 32732.5617, + "train_tokens_per_second": 2015.196 + }, + { + "epoch": 2.441818181818182, + "grad_norm": 0.005617988295853138, + "learning_rate": 8.682404978524522e-05, + "loss": 0.012186834588646889, + "num_input_tokens_seen": 65978904, + "step": 4029, + "train_runtime": 32740.6745, + "train_tokens_per_second": 2015.197 + }, + { + "epoch": 2.4424242424242424, + "grad_norm": 0.010201000608503819, + "learning_rate": 8.681754418776255e-05, + "loss": 0.011051755398511887, + "num_input_tokens_seen": 65995280, + "step": 4030, + "train_runtime": 32748.7901, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.443030303030303, + "grad_norm": 0.008390221744775772, + "learning_rate": 8.681103722847065e-05, + "loss": 0.011525029316544533, + "num_input_tokens_seen": 66011656, + "step": 4031, + "train_runtime": 32756.905, + "train_tokens_per_second": 2015.198 + }, + { + "epoch": 2.443636363636364, + "grad_norm": 0.008890388533473015, + "learning_rate": 8.680452890761016e-05, + "loss": 0.011630352586507797, + "num_input_tokens_seen": 66028032, + "step": 4032, + "train_runtime": 32765.017, + "train_tokens_per_second": 2015.199 + }, + { + "epoch": 2.4442424242424243, + "grad_norm": 0.034759897738695145, + "learning_rate": 8.679801922542182e-05, + "loss": 0.014052574522793293, + "num_input_tokens_seen": 66044408, + "step": 4033, + "train_runtime": 32773.1319, + "train_tokens_per_second": 2015.2 + }, + { + "epoch": 2.444848484848485, + "grad_norm": 0.007354637607932091, + "learning_rate": 8.67915081821464e-05, + "loss": 0.012096257880330086, + "num_input_tokens_seen": 66060784, + "step": 4034, + "train_runtime": 32781.2426, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.4454545454545453, + "grad_norm": 0.010367256589233875, + "learning_rate": 8.678499577802476e-05, + "loss": 0.012405885383486748, + "num_input_tokens_seen": 66077160, + "step": 4035, + "train_runtime": 32789.3584, + "train_tokens_per_second": 2015.201 + }, + { + "epoch": 2.4460606060606063, + "grad_norm": 0.007225418463349342, + "learning_rate": 8.677848201329774e-05, + "loss": 0.011580531485378742, + "num_input_tokens_seen": 66093536, + "step": 4036, + "train_runtime": 32797.4719, + "train_tokens_per_second": 2015.202 + }, + { + "epoch": 2.4466666666666668, + "grad_norm": 0.007067396771162748, + "learning_rate": 8.677196688820631e-05, + "loss": 0.011616021394729614, + "num_input_tokens_seen": 66109912, + "step": 4037, + "train_runtime": 32805.585, + "train_tokens_per_second": 2015.203 + }, + { + "epoch": 2.4472727272727273, + "grad_norm": 0.007955334149301052, + "learning_rate": 8.676545040299145e-05, + "loss": 0.01117919571697712, + "num_input_tokens_seen": 66126288, + "step": 4038, + "train_runtime": 32813.6965, + "train_tokens_per_second": 2015.204 + }, + { + "epoch": 2.4478787878787878, + "grad_norm": 0.0061789704486727715, + "learning_rate": 8.675893255789413e-05, + "loss": 0.01187138445675373, + "num_input_tokens_seen": 66142664, + "step": 4039, + "train_runtime": 32821.8121, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.4484848484848483, + "grad_norm": 0.009460385888814926, + "learning_rate": 8.675241335315551e-05, + "loss": 0.0124445129185915, + "num_input_tokens_seen": 66159040, + "step": 4040, + "train_runtime": 32829.9313, + "train_tokens_per_second": 2015.205 + }, + { + "epoch": 2.449090909090909, + "grad_norm": 0.008185412734746933, + "learning_rate": 8.67458927890167e-05, + "loss": 0.011840671300888062, + "num_input_tokens_seen": 66175416, + "step": 4041, + "train_runtime": 32838.0484, + "train_tokens_per_second": 2015.206 + }, + { + "epoch": 2.4496969696969697, + "grad_norm": 0.03455425798892975, + "learning_rate": 8.673937086571886e-05, + "loss": 0.015499784611165524, + "num_input_tokens_seen": 66191792, + "step": 4042, + "train_runtime": 32846.1583, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 2.45030303030303, + "grad_norm": 0.006477878894656897, + "learning_rate": 8.673284758350324e-05, + "loss": 0.011704593896865845, + "num_input_tokens_seen": 66208168, + "step": 4043, + "train_runtime": 32854.2708, + "train_tokens_per_second": 2015.207 + }, + { + "epoch": 2.450909090909091, + "grad_norm": 0.004375692456960678, + "learning_rate": 8.672632294261114e-05, + "loss": 0.011329500935971737, + "num_input_tokens_seen": 66224544, + "step": 4044, + "train_runtime": 32862.3824, + "train_tokens_per_second": 2015.208 + }, + { + "epoch": 2.4515151515151516, + "grad_norm": 0.004360073246061802, + "learning_rate": 8.671979694328385e-05, + "loss": 0.012417464517056942, + "num_input_tokens_seen": 66240920, + "step": 4045, + "train_runtime": 32870.4933, + "train_tokens_per_second": 2015.209 + }, + { + "epoch": 2.452121212121212, + "grad_norm": 0.011735597625374794, + "learning_rate": 8.671326958576279e-05, + "loss": 0.01322389580309391, + "num_input_tokens_seen": 66257296, + "step": 4046, + "train_runtime": 32878.6066, + "train_tokens_per_second": 2015.21 + }, + { + "epoch": 2.4527272727272726, + "grad_norm": 0.011418039910495281, + "learning_rate": 8.670674087028939e-05, + "loss": 0.012756542302668095, + "num_input_tokens_seen": 66273672, + "step": 4047, + "train_runtime": 32886.7165, + "train_tokens_per_second": 2015.211 + }, + { + "epoch": 2.453333333333333, + "grad_norm": 0.032791610807180405, + "learning_rate": 8.67002107971051e-05, + "loss": 0.014766186475753784, + "num_input_tokens_seen": 66290048, + "step": 4048, + "train_runtime": 32894.8314, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 2.453939393939394, + "grad_norm": 0.0517447367310524, + "learning_rate": 8.669367936645151e-05, + "loss": 0.017308732494711876, + "num_input_tokens_seen": 66306424, + "step": 4049, + "train_runtime": 32902.9453, + "train_tokens_per_second": 2015.212 + }, + { + "epoch": 2.4545454545454546, + "grad_norm": 0.03526085987687111, + "learning_rate": 8.668714657857018e-05, + "loss": 0.018967142328619957, + "num_input_tokens_seen": 66322800, + "step": 4050, + "train_runtime": 32911.0585, + "train_tokens_per_second": 2015.213 + }, + { + "epoch": 2.455151515151515, + "grad_norm": 0.00729320477694273, + "learning_rate": 8.668061243370274e-05, + "loss": 0.012054507620632648, + "num_input_tokens_seen": 66339176, + "step": 4051, + "train_runtime": 32919.1705, + "train_tokens_per_second": 2015.214 + }, + { + "epoch": 2.4557575757575756, + "grad_norm": 0.03337623551487923, + "learning_rate": 8.667407693209087e-05, + "loss": 0.015036912634968758, + "num_input_tokens_seen": 66355552, + "step": 4052, + "train_runtime": 32927.2799, + "train_tokens_per_second": 2015.215 + }, + { + "epoch": 2.4563636363636365, + "grad_norm": 0.015217745676636696, + "learning_rate": 8.666754007397632e-05, + "loss": 0.012794998474419117, + "num_input_tokens_seen": 66371928, + "step": 4053, + "train_runtime": 32935.3918, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 2.456969696969697, + "grad_norm": 0.017872435972094536, + "learning_rate": 8.666100185960087e-05, + "loss": 0.017377035692334175, + "num_input_tokens_seen": 66388304, + "step": 4054, + "train_runtime": 32943.5101, + "train_tokens_per_second": 2015.216 + }, + { + "epoch": 2.4575757575757575, + "grad_norm": 0.023252859711647034, + "learning_rate": 8.665446228920635e-05, + "loss": 0.013442318886518478, + "num_input_tokens_seen": 66404680, + "step": 4055, + "train_runtime": 32951.6321, + "train_tokens_per_second": 2015.217 + }, + { + "epoch": 2.458181818181818, + "grad_norm": 0.0032380518969148397, + "learning_rate": 8.664792136303465e-05, + "loss": 0.010896595194935799, + "num_input_tokens_seen": 66421056, + "step": 4056, + "train_runtime": 32959.7444, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.458787878787879, + "grad_norm": 0.01565861888229847, + "learning_rate": 8.664137908132772e-05, + "loss": 0.01155568566173315, + "num_input_tokens_seen": 66437432, + "step": 4057, + "train_runtime": 32967.8557, + "train_tokens_per_second": 2015.218 + }, + { + "epoch": 2.4593939393939395, + "grad_norm": 0.013617758639156818, + "learning_rate": 8.663483544432751e-05, + "loss": 0.013444559648633003, + "num_input_tokens_seen": 66453808, + "step": 4058, + "train_runtime": 32975.9744, + "train_tokens_per_second": 2015.219 + }, + { + "epoch": 2.46, + "grad_norm": 0.009455419145524502, + "learning_rate": 8.662829045227609e-05, + "loss": 0.010722637176513672, + "num_input_tokens_seen": 66470184, + "step": 4059, + "train_runtime": 32984.0875, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 2.4606060606060605, + "grad_norm": 0.00824559573084116, + "learning_rate": 8.662174410541555e-05, + "loss": 0.011758264154195786, + "num_input_tokens_seen": 66486560, + "step": 4060, + "train_runtime": 32992.2057, + "train_tokens_per_second": 2015.22 + }, + { + "epoch": 2.461212121212121, + "grad_norm": 0.007522704545408487, + "learning_rate": 8.661519640398801e-05, + "loss": 0.011111623607575893, + "num_input_tokens_seen": 66502936, + "step": 4061, + "train_runtime": 33000.317, + "train_tokens_per_second": 2015.221 + }, + { + "epoch": 2.461818181818182, + "grad_norm": 0.003967296797782183, + "learning_rate": 8.660864734823564e-05, + "loss": 0.011284412816166878, + "num_input_tokens_seen": 66519312, + "step": 4062, + "train_runtime": 33008.4349, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.4624242424242424, + "grad_norm": 0.005898671690374613, + "learning_rate": 8.660209693840072e-05, + "loss": 0.012734920717775822, + "num_input_tokens_seen": 66535688, + "step": 4063, + "train_runtime": 33016.5501, + "train_tokens_per_second": 2015.222 + }, + { + "epoch": 2.463030303030303, + "grad_norm": 0.008629771880805492, + "learning_rate": 8.65955451747255e-05, + "loss": 0.012427698820829391, + "num_input_tokens_seen": 66552064, + "step": 4064, + "train_runtime": 33024.6649, + "train_tokens_per_second": 2015.223 + }, + { + "epoch": 2.463636363636364, + "grad_norm": 0.006099363323301077, + "learning_rate": 8.658899205745235e-05, + "loss": 0.011295391246676445, + "num_input_tokens_seen": 66568440, + "step": 4065, + "train_runtime": 33032.7814, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 2.4642424242424243, + "grad_norm": 0.008219312876462936, + "learning_rate": 8.658243758682361e-05, + "loss": 0.011767336167395115, + "num_input_tokens_seen": 66584816, + "step": 4066, + "train_runtime": 33040.8935, + "train_tokens_per_second": 2015.224 + }, + { + "epoch": 2.464848484848485, + "grad_norm": 0.009053852409124374, + "learning_rate": 8.657588176308176e-05, + "loss": 0.012841441668570042, + "num_input_tokens_seen": 66601192, + "step": 4067, + "train_runtime": 33049.0074, + "train_tokens_per_second": 2015.225 + }, + { + "epoch": 2.4654545454545453, + "grad_norm": 0.09069479256868362, + "learning_rate": 8.656932458646927e-05, + "loss": 0.013467703014612198, + "num_input_tokens_seen": 66617568, + "step": 4068, + "train_runtime": 33057.121, + "train_tokens_per_second": 2015.226 + }, + { + "epoch": 2.466060606060606, + "grad_norm": 0.009245152585208416, + "learning_rate": 8.656276605722868e-05, + "loss": 0.012219304218888283, + "num_input_tokens_seen": 66633944, + "step": 4069, + "train_runtime": 33065.2343, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 0.008828767575323582, + "learning_rate": 8.655620617560257e-05, + "loss": 0.012695424258708954, + "num_input_tokens_seen": 66650320, + "step": 4070, + "train_runtime": 33073.3479, + "train_tokens_per_second": 2015.227 + }, + { + "epoch": 2.4672727272727273, + "grad_norm": 0.005800986662507057, + "learning_rate": 8.654964494183358e-05, + "loss": 0.012122102081775665, + "num_input_tokens_seen": 66666696, + "step": 4071, + "train_runtime": 33081.4597, + "train_tokens_per_second": 2015.228 + }, + { + "epoch": 2.467878787878788, + "grad_norm": 0.005396679975092411, + "learning_rate": 8.654308235616442e-05, + "loss": 0.012633191421627998, + "num_input_tokens_seen": 66683072, + "step": 4072, + "train_runtime": 33089.5751, + "train_tokens_per_second": 2015.229 + }, + { + "epoch": 2.4684848484848487, + "grad_norm": 0.007578667718917131, + "learning_rate": 8.653651841883779e-05, + "loss": 0.011809978634119034, + "num_input_tokens_seen": 66699448, + "step": 4073, + "train_runtime": 33097.6868, + "train_tokens_per_second": 2015.23 + }, + { + "epoch": 2.4690909090909092, + "grad_norm": 0.0044122799299657345, + "learning_rate": 8.65299531300965e-05, + "loss": 0.011763782240450382, + "num_input_tokens_seen": 66715824, + "step": 4074, + "train_runtime": 33105.7999, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 2.4696969696969697, + "grad_norm": 0.0043112775310873985, + "learning_rate": 8.652338649018339e-05, + "loss": 0.012082591652870178, + "num_input_tokens_seen": 66732200, + "step": 4075, + "train_runtime": 33113.9159, + "train_tokens_per_second": 2015.231 + }, + { + "epoch": 2.4703030303030302, + "grad_norm": 0.28410032391548157, + "learning_rate": 8.651681849934134e-05, + "loss": 0.01635185442864895, + "num_input_tokens_seen": 66748576, + "step": 4076, + "train_runtime": 33122.0317, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 2.4709090909090907, + "grad_norm": 0.12259082496166229, + "learning_rate": 8.651024915781327e-05, + "loss": 0.01743101142346859, + "num_input_tokens_seen": 66764952, + "step": 4077, + "train_runtime": 33130.1487, + "train_tokens_per_second": 2015.232 + }, + { + "epoch": 2.4715151515151517, + "grad_norm": 0.005474293604493141, + "learning_rate": 8.650367846584219e-05, + "loss": 0.012096352875232697, + "num_input_tokens_seen": 66781328, + "step": 4078, + "train_runtime": 33138.2613, + "train_tokens_per_second": 2015.233 + }, + { + "epoch": 2.472121212121212, + "grad_norm": 0.006114025134593248, + "learning_rate": 8.649710642367115e-05, + "loss": 0.013284233398735523, + "num_input_tokens_seen": 66797704, + "step": 4079, + "train_runtime": 33146.3753, + "train_tokens_per_second": 2015.234 + }, + { + "epoch": 2.4727272727272727, + "grad_norm": 0.011463556438684464, + "learning_rate": 8.64905330315432e-05, + "loss": 0.012340724468231201, + "num_input_tokens_seen": 66814080, + "step": 4080, + "train_runtime": 33154.491, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 2.473333333333333, + "grad_norm": 0.00733643164858222, + "learning_rate": 8.64839582897015e-05, + "loss": 0.011145420372486115, + "num_input_tokens_seen": 66830456, + "step": 4081, + "train_runtime": 33162.609, + "train_tokens_per_second": 2015.235 + }, + { + "epoch": 2.473939393939394, + "grad_norm": 0.007582388818264008, + "learning_rate": 8.647738219838924e-05, + "loss": 0.011645256541669369, + "num_input_tokens_seen": 66846832, + "step": 4082, + "train_runtime": 33170.7236, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 2.4745454545454546, + "grad_norm": 0.006805262062698603, + "learning_rate": 8.647080475784964e-05, + "loss": 0.012549490667879581, + "num_input_tokens_seen": 66863208, + "step": 4083, + "train_runtime": 33178.8401, + "train_tokens_per_second": 2015.236 + }, + { + "epoch": 2.475151515151515, + "grad_norm": 0.008055298589169979, + "learning_rate": 8.646422596832599e-05, + "loss": 0.012534864246845245, + "num_input_tokens_seen": 66879584, + "step": 4084, + "train_runtime": 33186.9521, + "train_tokens_per_second": 2015.237 + }, + { + "epoch": 2.4757575757575756, + "grad_norm": 0.008632096461951733, + "learning_rate": 8.645764583006165e-05, + "loss": 0.011780787259340286, + "num_input_tokens_seen": 66895960, + "step": 4085, + "train_runtime": 33195.0702, + "train_tokens_per_second": 2015.238 + }, + { + "epoch": 2.4763636363636365, + "grad_norm": 0.010974240489304066, + "learning_rate": 8.645106434329996e-05, + "loss": 0.01195211336016655, + "num_input_tokens_seen": 66912336, + "step": 4086, + "train_runtime": 33203.1836, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 2.476969696969697, + "grad_norm": 0.017518963664770126, + "learning_rate": 8.644448150828442e-05, + "loss": 0.012673698365688324, + "num_input_tokens_seen": 66928712, + "step": 4087, + "train_runtime": 33211.2984, + "train_tokens_per_second": 2015.239 + }, + { + "epoch": 2.4775757575757575, + "grad_norm": 0.008905385620892048, + "learning_rate": 8.643789732525846e-05, + "loss": 0.012078780680894852, + "num_input_tokens_seen": 66945088, + "step": 4088, + "train_runtime": 33219.4156, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 2.478181818181818, + "grad_norm": 0.004331223201006651, + "learning_rate": 8.643131179446564e-05, + "loss": 0.011904648505151272, + "num_input_tokens_seen": 66961464, + "step": 4089, + "train_runtime": 33227.5336, + "train_tokens_per_second": 2015.24 + }, + { + "epoch": 2.4787878787878785, + "grad_norm": 0.0028569402638822794, + "learning_rate": 8.642472491614954e-05, + "loss": 0.011711115948855877, + "num_input_tokens_seen": 66977840, + "step": 4090, + "train_runtime": 33235.6505, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 2.4793939393939395, + "grad_norm": 0.008105210028588772, + "learning_rate": 8.641813669055381e-05, + "loss": 0.011557997204363346, + "num_input_tokens_seen": 66994216, + "step": 4091, + "train_runtime": 33243.7668, + "train_tokens_per_second": 2015.241 + }, + { + "epoch": 2.48, + "grad_norm": 0.008874750696122646, + "learning_rate": 8.641154711792212e-05, + "loss": 0.0119530213996768, + "num_input_tokens_seen": 67010592, + "step": 4092, + "train_runtime": 33251.8847, + "train_tokens_per_second": 2015.242 + }, + { + "epoch": 2.4806060606060605, + "grad_norm": 0.011586075648665428, + "learning_rate": 8.640495619849821e-05, + "loss": 0.012206172570586205, + "num_input_tokens_seen": 67026968, + "step": 4093, + "train_runtime": 33260.0003, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 2.4812121212121214, + "grad_norm": 0.013051996007561684, + "learning_rate": 8.639836393252587e-05, + "loss": 0.012065069749951363, + "num_input_tokens_seen": 67043344, + "step": 4094, + "train_runtime": 33268.1136, + "train_tokens_per_second": 2015.243 + }, + { + "epoch": 2.481818181818182, + "grad_norm": 0.01702691800892353, + "learning_rate": 8.639177032024892e-05, + "loss": 0.011707555502653122, + "num_input_tokens_seen": 67059720, + "step": 4095, + "train_runtime": 33276.2314, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 2.4824242424242424, + "grad_norm": 0.004312279634177685, + "learning_rate": 8.638517536191127e-05, + "loss": 0.012810110114514828, + "num_input_tokens_seen": 67076096, + "step": 4096, + "train_runtime": 33284.348, + "train_tokens_per_second": 2015.244 + }, + { + "epoch": 2.483030303030303, + "grad_norm": 0.004693899769335985, + "learning_rate": 8.637857905775684e-05, + "loss": 0.011241926811635494, + "num_input_tokens_seen": 67092472, + "step": 4097, + "train_runtime": 33292.4626, + "train_tokens_per_second": 2015.245 + }, + { + "epoch": 2.4836363636363634, + "grad_norm": 0.010167837142944336, + "learning_rate": 8.63719814080296e-05, + "loss": 0.012888771481812, + "num_input_tokens_seen": 67108848, + "step": 4098, + "train_runtime": 33300.5761, + "train_tokens_per_second": 2015.246 + }, + { + "epoch": 2.4842424242424244, + "grad_norm": 0.008203212171792984, + "learning_rate": 8.63653824129736e-05, + "loss": 0.01142559852451086, + "num_input_tokens_seen": 67125224, + "step": 4099, + "train_runtime": 33308.688, + "train_tokens_per_second": 2015.247 + }, + { + "epoch": 2.484848484848485, + "grad_norm": 0.006197168026119471, + "learning_rate": 8.635878207283293e-05, + "loss": 0.012334875762462616, + "num_input_tokens_seen": 67141600, + "step": 4100, + "train_runtime": 33316.8014, + "train_tokens_per_second": 2015.247 + } + ], + "logging_steps": 1, + "max_steps": 16500, + "num_input_tokens_seen": 67141600, + "num_train_epochs": 10, + "save_steps": 100, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 3.0840954635698176e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}