{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2000, "global_step": 100000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 5.4375, "learning_rate": 2.0000000000000003e-06, "loss": 2.3544, "step": 1000 }, { "epoch": 0.02, "grad_norm": 5.46875, "learning_rate": 4.000000000000001e-06, "loss": 2.1179, "step": 2000 }, { "epoch": 0.02, "eval_loss": 2.0292959213256836, "eval_runtime": 91.9825, "eval_samples_per_second": 53.869, "eval_steps_per_second": 13.47, "step": 2000 }, { "epoch": 0.03, "grad_norm": 6.3125, "learning_rate": 6e-06, "loss": 1.9323, "step": 3000 }, { "epoch": 0.04, "grad_norm": 7.25, "learning_rate": 8.000000000000001e-06, "loss": 1.7895, "step": 4000 }, { "epoch": 0.04, "eval_loss": 1.7362630367279053, "eval_runtime": 92.5723, "eval_samples_per_second": 53.526, "eval_steps_per_second": 13.384, "step": 4000 }, { "epoch": 0.05, "grad_norm": 7.0625, "learning_rate": 1e-05, "loss": 1.667, "step": 5000 }, { "epoch": 0.06, "grad_norm": 7.75, "learning_rate": 9.99726628670463e-06, "loss": 1.556, "step": 6000 }, { "epoch": 0.06, "eval_loss": 1.5130035877227783, "eval_runtime": 92.1543, "eval_samples_per_second": 53.768, "eval_steps_per_second": 13.445, "step": 6000 }, { "epoch": 0.07, "grad_norm": 7.5625, "learning_rate": 9.989068136093873e-06, "loss": 1.4736, "step": 7000 }, { "epoch": 0.08, "grad_norm": 8.75, "learning_rate": 9.975414512725058e-06, "loss": 1.4118, "step": 8000 }, { "epoch": 0.08, "eval_loss": 1.3977240324020386, "eval_runtime": 92.0357, "eval_samples_per_second": 53.838, "eval_steps_per_second": 13.462, "step": 8000 }, { "epoch": 0.09, "grad_norm": 7.8125, "learning_rate": 9.956320346634877e-06, "loss": 1.3674, "step": 9000 }, { "epoch": 0.1, "grad_norm": 7.40625, "learning_rate": 9.931806517013612e-06, "loss": 1.3417, "step": 10000 }, { "epoch": 0.1, "eval_loss": 1.3288121223449707, "eval_runtime": 92.0559, "eval_samples_per_second": 53.826, "eval_steps_per_second": 13.459, "step": 10000 }, { "epoch": 0.11, "grad_norm": 7.75, "learning_rate": 9.901899829374048e-06, "loss": 1.3169, "step": 11000 }, { "epoch": 0.12, "grad_norm": 8.0625, "learning_rate": 9.86663298624003e-06, "loss": 1.2918, "step": 12000 }, { "epoch": 0.12, "eval_loss": 1.287561058998108, "eval_runtime": 92.0116, "eval_samples_per_second": 53.852, "eval_steps_per_second": 13.466, "step": 12000 }, { "epoch": 0.13, "grad_norm": 8.3125, "learning_rate": 9.826044551386743e-06, "loss": 1.2723, "step": 13000 }, { "epoch": 0.14, "grad_norm": 8.625, "learning_rate": 9.780178907671788e-06, "loss": 1.2511, "step": 14000 }, { "epoch": 0.14, "eval_loss": 1.2546547651290894, "eval_runtime": 91.8637, "eval_samples_per_second": 53.939, "eval_steps_per_second": 13.487, "step": 14000 }, { "epoch": 0.15, "grad_norm": 8.5625, "learning_rate": 9.729086208503174e-06, "loss": 1.2465, "step": 15000 }, { "epoch": 0.16, "grad_norm": 7.6875, "learning_rate": 9.672822322997305e-06, "loss": 1.2293, "step": 16000 }, { "epoch": 0.16, "eval_loss": 1.2318358421325684, "eval_runtime": 92.2103, "eval_samples_per_second": 53.736, "eval_steps_per_second": 13.437, "step": 16000 }, { "epoch": 0.17, "grad_norm": 7.5625, "learning_rate": 9.611448774886925e-06, "loss": 1.2212, "step": 17000 }, { "epoch": 0.18, "grad_norm": 7.625, "learning_rate": 9.545032675245814e-06, "loss": 1.2145, "step": 18000 }, { "epoch": 0.18, "eval_loss": 1.2164242267608643, "eval_runtime": 92.4452, "eval_samples_per_second": 53.599, "eval_steps_per_second": 13.403, "step": 18000 }, { "epoch": 0.19, "grad_norm": 8.375, "learning_rate": 9.473646649103819e-06, "loss": 1.202, "step": 19000 }, { "epoch": 0.2, "grad_norm": 8.9375, "learning_rate": 9.397368756032445e-06, "loss": 1.1945, "step": 20000 }, { "epoch": 0.2, "eval_loss": 1.2037837505340576, "eval_runtime": 92.0761, "eval_samples_per_second": 53.814, "eval_steps_per_second": 13.456, "step": 20000 }, { "epoch": 0.21, "grad_norm": 9.0625, "learning_rate": 9.31628240478787e-06, "loss": 1.1909, "step": 21000 }, { "epoch": 0.22, "grad_norm": 8.25, "learning_rate": 9.230476262104678e-06, "loss": 1.1871, "step": 22000 }, { "epoch": 0.22, "eval_loss": 1.1938320398330688, "eval_runtime": 91.9107, "eval_samples_per_second": 53.911, "eval_steps_per_second": 13.48, "step": 22000 }, { "epoch": 0.23, "grad_norm": 9.25, "learning_rate": 9.140044155740102e-06, "loss": 1.177, "step": 23000 }, { "epoch": 0.24, "grad_norm": 9.5625, "learning_rate": 9.045084971874738e-06, "loss": 1.1825, "step": 24000 }, { "epoch": 0.24, "eval_loss": 1.185314655303955, "eval_runtime": 92.1065, "eval_samples_per_second": 53.796, "eval_steps_per_second": 13.452, "step": 24000 }, { "epoch": 0.25, "grad_norm": 8.875, "learning_rate": 8.94570254698197e-06, "loss": 1.1751, "step": 25000 }, { "epoch": 0.26, "grad_norm": 8.375, "learning_rate": 8.842005554284296e-06, "loss": 1.1735, "step": 26000 }, { "epoch": 0.26, "eval_loss": 1.1786526441574097, "eval_runtime": 91.9617, "eval_samples_per_second": 53.881, "eval_steps_per_second": 13.473, "step": 26000 }, { "epoch": 0.27, "grad_norm": 8.75, "learning_rate": 8.734107384920771e-06, "loss": 1.1704, "step": 27000 }, { "epoch": 0.28, "grad_norm": 7.46875, "learning_rate": 8.622126023955446e-06, "loss": 1.1684, "step": 28000 }, { "epoch": 0.28, "eval_loss": 1.1743632555007935, "eval_runtime": 94.6714, "eval_samples_per_second": 52.339, "eval_steps_per_second": 13.087, "step": 28000 }, { "epoch": 0.29, "grad_norm": 8.5, "learning_rate": 8.506183921362443e-06, "loss": 1.1664, "step": 29000 }, { "epoch": 0.3, "grad_norm": 8.125, "learning_rate": 8.386407858128707e-06, "loss": 1.1627, "step": 30000 }, { "epoch": 0.3, "eval_loss": 1.1703976392745972, "eval_runtime": 91.9503, "eval_samples_per_second": 53.888, "eval_steps_per_second": 13.475, "step": 30000 }, { "epoch": 0.31, "grad_norm": 8.375, "learning_rate": 8.262928807620843e-06, "loss": 1.1658, "step": 31000 }, { "epoch": 0.32, "grad_norm": 8.25, "learning_rate": 8.135881792367686e-06, "loss": 1.1637, "step": 32000 }, { "epoch": 0.32, "eval_loss": 1.168877363204956, "eval_runtime": 92.267, "eval_samples_per_second": 53.703, "eval_steps_per_second": 13.428, "step": 32000 }, { "epoch": 0.33, "grad_norm": 9.625, "learning_rate": 8.005405736415127e-06, "loss": 1.1603, "step": 33000 }, { "epoch": 0.34, "grad_norm": 9.3125, "learning_rate": 7.871643313414718e-06, "loss": 1.156, "step": 34000 }, { "epoch": 0.34, "eval_loss": 1.164509654045105, "eval_runtime": 91.883, "eval_samples_per_second": 53.927, "eval_steps_per_second": 13.485, "step": 34000 }, { "epoch": 0.35, "grad_norm": 9.375, "learning_rate": 7.734740790612137e-06, "loss": 1.1559, "step": 35000 }, { "epoch": 0.36, "grad_norm": 8.875, "learning_rate": 7.594847868906076e-06, "loss": 1.1513, "step": 36000 }, { "epoch": 0.36, "eval_loss": 1.1633930206298828, "eval_runtime": 92.0359, "eval_samples_per_second": 53.838, "eval_steps_per_second": 13.462, "step": 36000 }, { "epoch": 0.37, "grad_norm": 7.4375, "learning_rate": 7.452117519152542e-06, "loss": 1.1524, "step": 37000 }, { "epoch": 0.38, "grad_norm": 9.125, "learning_rate": 7.30670581489344e-06, "loss": 1.1517, "step": 38000 }, { "epoch": 0.38, "eval_loss": 1.1613472700119019, "eval_runtime": 92.9351, "eval_samples_per_second": 53.317, "eval_steps_per_second": 13.332, "step": 38000 }, { "epoch": 0.39, "grad_norm": 8.6875, "learning_rate": 7.158771761692464e-06, "loss": 1.1486, "step": 39000 }, { "epoch": 0.4, "grad_norm": 9.75, "learning_rate": 7.008477123264849e-06, "loss": 1.1487, "step": 40000 }, { "epoch": 0.4, "eval_loss": 1.1614651679992676, "eval_runtime": 92.598, "eval_samples_per_second": 53.511, "eval_steps_per_second": 13.38, "step": 40000 }, { "epoch": 0.41, "grad_norm": 9.25, "learning_rate": 6.855986244591104e-06, "loss": 1.139, "step": 41000 }, { "epoch": 0.42, "grad_norm": 8.5625, "learning_rate": 6.701465872208216e-06, "loss": 1.1503, "step": 42000 }, { "epoch": 0.42, "eval_loss": 1.1591981649398804, "eval_runtime": 92.6092, "eval_samples_per_second": 53.504, "eval_steps_per_second": 13.379, "step": 42000 }, { "epoch": 0.43, "grad_norm": 8.0, "learning_rate": 6.545084971874738e-06, "loss": 1.1443, "step": 43000 }, { "epoch": 0.44, "grad_norm": 9.0625, "learning_rate": 6.387014543809224e-06, "loss": 1.1502, "step": 44000 }, { "epoch": 0.44, "eval_loss": 1.1588733196258545, "eval_runtime": 92.6849, "eval_samples_per_second": 53.461, "eval_steps_per_second": 13.368, "step": 44000 }, { "epoch": 0.45, "grad_norm": 10.1875, "learning_rate": 6.227427435703997e-06, "loss": 1.1496, "step": 45000 }, { "epoch": 0.46, "grad_norm": 9.625, "learning_rate": 6.066498153718735e-06, "loss": 1.1427, "step": 46000 }, { "epoch": 0.46, "eval_loss": 1.1595520973205566, "eval_runtime": 93.5685, "eval_samples_per_second": 52.956, "eval_steps_per_second": 13.242, "step": 46000 }, { "epoch": 0.47, "grad_norm": 9.8125, "learning_rate": 5.904402671660551e-06, "loss": 1.1444, "step": 47000 }, { "epoch": 0.48, "grad_norm": 10.375, "learning_rate": 5.74131823855921e-06, "loss": 1.1433, "step": 48000 }, { "epoch": 0.48, "eval_loss": 1.1582742929458618, "eval_runtime": 92.9369, "eval_samples_per_second": 53.316, "eval_steps_per_second": 13.332, "step": 48000 }, { "epoch": 0.49, "grad_norm": 9.6875, "learning_rate": 5.577423184847932e-06, "loss": 1.1437, "step": 49000 }, { "epoch": 0.5, "grad_norm": 8.375, "learning_rate": 5.412896727361663e-06, "loss": 1.1483, "step": 50000 }, { "epoch": 0.5, "eval_loss": 1.1573219299316406, "eval_runtime": 93.0508, "eval_samples_per_second": 53.25, "eval_steps_per_second": 13.315, "step": 50000 }, { "epoch": 0.51, "grad_norm": 8.25, "learning_rate": 5.247918773366112e-06, "loss": 1.1408, "step": 51000 }, { "epoch": 0.52, "grad_norm": 9.625, "learning_rate": 5.082669723831793e-06, "loss": 1.1444, "step": 52000 }, { "epoch": 0.52, "eval_loss": 1.1576628684997559, "eval_runtime": 92.489, "eval_samples_per_second": 53.574, "eval_steps_per_second": 13.396, "step": 52000 }, { "epoch": 0.53, "grad_norm": 8.1875, "learning_rate": 4.917330276168208e-06, "loss": 1.1405, "step": 53000 }, { "epoch": 0.54, "grad_norm": 9.0625, "learning_rate": 4.752081226633888e-06, "loss": 1.142, "step": 54000 }, { "epoch": 0.54, "eval_loss": 1.1567600965499878, "eval_runtime": 92.8548, "eval_samples_per_second": 53.363, "eval_steps_per_second": 13.343, "step": 54000 }, { "epoch": 0.55, "grad_norm": 10.1875, "learning_rate": 4.587103272638339e-06, "loss": 1.1428, "step": 55000 }, { "epoch": 0.56, "grad_norm": 9.5, "learning_rate": 4.42257681515207e-06, "loss": 1.1405, "step": 56000 }, { "epoch": 0.56, "eval_loss": 1.1568113565444946, "eval_runtime": 92.8975, "eval_samples_per_second": 53.338, "eval_steps_per_second": 13.337, "step": 56000 }, { "epoch": 0.57, "grad_norm": 10.1875, "learning_rate": 4.25868176144079e-06, "loss": 1.1475, "step": 57000 }, { "epoch": 0.58, "grad_norm": 8.75, "learning_rate": 4.0955973283394525e-06, "loss": 1.1428, "step": 58000 }, { "epoch": 0.58, "eval_loss": 1.1565525531768799, "eval_runtime": 92.7278, "eval_samples_per_second": 53.436, "eval_steps_per_second": 13.362, "step": 58000 }, { "epoch": 0.59, "grad_norm": 9.4375, "learning_rate": 3.9335018462812664e-06, "loss": 1.1465, "step": 59000 }, { "epoch": 0.6, "grad_norm": 8.8125, "learning_rate": 3.7725725642960047e-06, "loss": 1.1465, "step": 60000 }, { "epoch": 0.6, "eval_loss": 1.1557884216308594, "eval_runtime": 92.8391, "eval_samples_per_second": 53.372, "eval_steps_per_second": 13.346, "step": 60000 }, { "epoch": 0.61, "grad_norm": 7.84375, "learning_rate": 3.6129854561907786e-06, "loss": 1.1454, "step": 61000 }, { "epoch": 0.62, "grad_norm": 10.125, "learning_rate": 3.4549150281252635e-06, "loss": 1.145, "step": 62000 }, { "epoch": 0.62, "eval_loss": 1.1553471088409424, "eval_runtime": 92.7621, "eval_samples_per_second": 53.416, "eval_steps_per_second": 13.357, "step": 62000 }, { "epoch": 0.63, "grad_norm": 9.75, "learning_rate": 3.298534127791785e-06, "loss": 1.1459, "step": 63000 }, { "epoch": 0.64, "grad_norm": 9.625, "learning_rate": 3.1440137554088957e-06, "loss": 1.1495, "step": 64000 }, { "epoch": 0.64, "eval_loss": 1.1568386554718018, "eval_runtime": 92.8646, "eval_samples_per_second": 53.357, "eval_steps_per_second": 13.342, "step": 64000 }, { "epoch": 0.65, "grad_norm": 8.5, "learning_rate": 2.991522876735154e-06, "loss": 1.1448, "step": 65000 }, { "epoch": 0.66, "grad_norm": 9.75, "learning_rate": 2.8412282383075362e-06, "loss": 1.1425, "step": 66000 }, { "epoch": 0.66, "eval_loss": 1.1566816568374634, "eval_runtime": 93.131, "eval_samples_per_second": 53.205, "eval_steps_per_second": 13.304, "step": 66000 }, { "epoch": 0.67, "grad_norm": 8.3125, "learning_rate": 2.693294185106562e-06, "loss": 1.1466, "step": 67000 }, { "epoch": 0.68, "grad_norm": 9.0, "learning_rate": 2.5478824808474613e-06, "loss": 1.1377, "step": 68000 }, { "epoch": 0.68, "eval_loss": 1.1565214395523071, "eval_runtime": 92.804, "eval_samples_per_second": 53.392, "eval_steps_per_second": 13.351, "step": 68000 }, { "epoch": 0.69, "grad_norm": 8.875, "learning_rate": 2.4051521310939258e-06, "loss": 1.1434, "step": 69000 }, { "epoch": 0.7, "grad_norm": 10.0, "learning_rate": 2.265259209387867e-06, "loss": 1.1389, "step": 70000 }, { "epoch": 0.7, "eval_loss": 1.156681776046753, "eval_runtime": 92.6941, "eval_samples_per_second": 53.455, "eval_steps_per_second": 13.367, "step": 70000 }, { "epoch": 0.71, "grad_norm": 11.5625, "learning_rate": 2.1283566865852824e-06, "loss": 1.144, "step": 71000 }, { "epoch": 0.72, "grad_norm": 9.625, "learning_rate": 1.9945942635848745e-06, "loss": 1.1387, "step": 72000 }, { "epoch": 0.72, "eval_loss": 1.15574312210083, "eval_runtime": 92.8587, "eval_samples_per_second": 53.361, "eval_steps_per_second": 13.343, "step": 72000 }, { "epoch": 0.73, "grad_norm": 9.0, "learning_rate": 1.864118207632315e-06, "loss": 1.1346, "step": 73000 }, { "epoch": 0.74, "grad_norm": 8.4375, "learning_rate": 1.7370711923791567e-06, "loss": 1.1418, "step": 74000 }, { "epoch": 0.74, "eval_loss": 1.1560354232788086, "eval_runtime": 92.8217, "eval_samples_per_second": 53.382, "eval_steps_per_second": 13.348, "step": 74000 }, { "epoch": 0.75, "grad_norm": 8.875, "learning_rate": 1.6135921418712959e-06, "loss": 1.1375, "step": 75000 }, { "epoch": 0.76, "grad_norm": 8.625, "learning_rate": 1.4938160786375571e-06, "loss": 1.143, "step": 76000 }, { "epoch": 0.76, "eval_loss": 1.1558743715286255, "eval_runtime": 93.1651, "eval_samples_per_second": 53.185, "eval_steps_per_second": 13.299, "step": 76000 }, { "epoch": 0.77, "grad_norm": 9.625, "learning_rate": 1.3778739760445552e-06, "loss": 1.1455, "step": 77000 }, { "epoch": 0.78, "grad_norm": 8.875, "learning_rate": 1.2658926150792321e-06, "loss": 1.1401, "step": 78000 }, { "epoch": 0.78, "eval_loss": 1.1553114652633667, "eval_runtime": 93.7037, "eval_samples_per_second": 52.879, "eval_steps_per_second": 13.223, "step": 78000 }, { "epoch": 0.79, "grad_norm": 9.25, "learning_rate": 1.157994445715706e-06, "loss": 1.14, "step": 79000 }, { "epoch": 0.8, "grad_norm": 10.9375, "learning_rate": 1.0542974530180327e-06, "loss": 1.1444, "step": 80000 }, { "epoch": 0.8, "eval_loss": 1.155322551727295, "eval_runtime": 93.8365, "eval_samples_per_second": 52.805, "eval_steps_per_second": 13.204, "step": 80000 }, { "epoch": 0.81, "grad_norm": 8.4375, "learning_rate": 9.549150281252633e-07, "loss": 1.1428, "step": 81000 }, { "epoch": 0.82, "grad_norm": 8.4375, "learning_rate": 8.599558442598998e-07, "loss": 1.1484, "step": 82000 }, { "epoch": 0.82, "eval_loss": 1.1569843292236328, "eval_runtime": 93.6505, "eval_samples_per_second": 52.909, "eval_steps_per_second": 13.23, "step": 82000 }, { "epoch": 0.83, "grad_norm": 9.75, "learning_rate": 7.695237378953224e-07, "loss": 1.1438, "step": 83000 }, { "epoch": 0.84, "grad_norm": 9.25, "learning_rate": 6.837175952121305e-07, "loss": 1.1369, "step": 84000 }, { "epoch": 0.84, "eval_loss": 1.155900478363037, "eval_runtime": 97.8841, "eval_samples_per_second": 50.621, "eval_steps_per_second": 12.658, "step": 84000 }, { "epoch": 0.85, "grad_norm": 8.625, "learning_rate": 6.026312439675553e-07, "loss": 1.1379, "step": 85000 }, { "epoch": 0.86, "grad_norm": 8.375, "learning_rate": 5.263533508961827e-07, "loss": 1.1417, "step": 86000 }, { "epoch": 0.86, "eval_loss": 1.156594157218933, "eval_runtime": 93.5258, "eval_samples_per_second": 52.98, "eval_steps_per_second": 13.248, "step": 86000 }, { "epoch": 0.87, "grad_norm": 9.1875, "learning_rate": 4.549673247541875e-07, "loss": 1.1395, "step": 87000 }, { "epoch": 0.88, "grad_norm": 8.9375, "learning_rate": 3.885512251130763e-07, "loss": 1.1416, "step": 88000 }, { "epoch": 0.88, "eval_loss": 1.1570854187011719, "eval_runtime": 94.1553, "eval_samples_per_second": 52.626, "eval_steps_per_second": 13.159, "step": 88000 }, { "epoch": 0.89, "grad_norm": 9.1875, "learning_rate": 3.271776770026963e-07, "loss": 1.1422, "step": 89000 }, { "epoch": 0.9, "grad_norm": 9.5625, "learning_rate": 2.7091379149682683e-07, "loss": 1.1433, "step": 90000 }, { "epoch": 0.9, "eval_loss": 1.1558341979980469, "eval_runtime": 93.8721, "eval_samples_per_second": 52.785, "eval_steps_per_second": 13.199, "step": 90000 }, { "epoch": 0.91, "grad_norm": 9.25, "learning_rate": 2.198210923282118e-07, "loss": 1.1465, "step": 91000 }, { "epoch": 0.92, "grad_norm": 9.125, "learning_rate": 1.7395544861325718e-07, "loss": 1.1395, "step": 92000 }, { "epoch": 0.92, "eval_loss": 1.155996561050415, "eval_runtime": 97.435, "eval_samples_per_second": 50.854, "eval_steps_per_second": 12.716, "step": 92000 }, { "epoch": 0.93, "grad_norm": 8.25, "learning_rate": 1.333670137599713e-07, "loss": 1.147, "step": 93000 }, { "epoch": 0.94, "grad_norm": 8.4375, "learning_rate": 9.810017062595322e-08, "loss": 1.1465, "step": 94000 }, { "epoch": 0.94, "eval_loss": 1.1561691761016846, "eval_runtime": 93.8214, "eval_samples_per_second": 52.813, "eval_steps_per_second": 13.206, "step": 94000 }, { "epoch": 0.95, "grad_norm": 9.625, "learning_rate": 6.819348298638839e-08, "loss": 1.1458, "step": 95000 }, { "epoch": 0.96, "grad_norm": 7.5, "learning_rate": 4.367965336512403e-08, "loss": 1.1474, "step": 96000 }, { "epoch": 0.96, "eval_loss": 1.1555160284042358, "eval_runtime": 93.7343, "eval_samples_per_second": 52.862, "eval_steps_per_second": 13.218, "step": 96000 }, { "epoch": 0.97, "grad_norm": 10.4375, "learning_rate": 2.4585487274942922e-08, "loss": 1.145, "step": 97000 }, { "epoch": 0.98, "grad_norm": 8.6875, "learning_rate": 1.0931863906127327e-08, "loss": 1.1431, "step": 98000 }, { "epoch": 0.98, "eval_loss": 1.1566122770309448, "eval_runtime": 92.9031, "eval_samples_per_second": 53.335, "eval_steps_per_second": 13.336, "step": 98000 }, { "epoch": 0.99, "grad_norm": 8.0, "learning_rate": 2.7337132953697555e-09, "loss": 1.1458, "step": 99000 }, { "epoch": 1.0, "grad_norm": 8.25, "learning_rate": 0.0, "loss": 1.1395, "step": 100000 }, { "epoch": 1.0, "eval_loss": 1.155640959739685, "eval_runtime": 93.0092, "eval_samples_per_second": 53.274, "eval_steps_per_second": 13.321, "step": 100000 } ], "logging_steps": 1000, "max_steps": 100000, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.24764153413632e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null }