| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 2000, | |
| "global_step": 100000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.01, | |
| "grad_norm": 5.4375, | |
| "learning_rate": 2.0000000000000003e-06, | |
| "loss": 2.3544, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "grad_norm": 5.46875, | |
| "learning_rate": 4.000000000000001e-06, | |
| "loss": 2.1179, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.02, | |
| "eval_loss": 2.0292959213256836, | |
| "eval_runtime": 91.9825, | |
| "eval_samples_per_second": 53.869, | |
| "eval_steps_per_second": 13.47, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.03, | |
| "grad_norm": 6.3125, | |
| "learning_rate": 6e-06, | |
| "loss": 1.9323, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "grad_norm": 7.25, | |
| "learning_rate": 8.000000000000001e-06, | |
| "loss": 1.7895, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.04, | |
| "eval_loss": 1.7362630367279053, | |
| "eval_runtime": 92.5723, | |
| "eval_samples_per_second": 53.526, | |
| "eval_steps_per_second": 13.384, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.05, | |
| "grad_norm": 7.0625, | |
| "learning_rate": 1e-05, | |
| "loss": 1.667, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "grad_norm": 7.75, | |
| "learning_rate": 9.99726628670463e-06, | |
| "loss": 1.556, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.06, | |
| "eval_loss": 1.5130035877227783, | |
| "eval_runtime": 92.1543, | |
| "eval_samples_per_second": 53.768, | |
| "eval_steps_per_second": 13.445, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.07, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 9.989068136093873e-06, | |
| "loss": 1.4736, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "grad_norm": 8.75, | |
| "learning_rate": 9.975414512725058e-06, | |
| "loss": 1.4118, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.08, | |
| "eval_loss": 1.3977240324020386, | |
| "eval_runtime": 92.0357, | |
| "eval_samples_per_second": 53.838, | |
| "eval_steps_per_second": 13.462, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 0.09, | |
| "grad_norm": 7.8125, | |
| "learning_rate": 9.956320346634877e-06, | |
| "loss": 1.3674, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "grad_norm": 7.40625, | |
| "learning_rate": 9.931806517013612e-06, | |
| "loss": 1.3417, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.1, | |
| "eval_loss": 1.3288121223449707, | |
| "eval_runtime": 92.0559, | |
| "eval_samples_per_second": 53.826, | |
| "eval_steps_per_second": 13.459, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.11, | |
| "grad_norm": 7.75, | |
| "learning_rate": 9.901899829374048e-06, | |
| "loss": 1.3169, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "grad_norm": 8.0625, | |
| "learning_rate": 9.86663298624003e-06, | |
| "loss": 1.2918, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.12, | |
| "eval_loss": 1.287561058998108, | |
| "eval_runtime": 92.0116, | |
| "eval_samples_per_second": 53.852, | |
| "eval_steps_per_second": 13.466, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.13, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 9.826044551386743e-06, | |
| "loss": 1.2723, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "grad_norm": 8.625, | |
| "learning_rate": 9.780178907671788e-06, | |
| "loss": 1.2511, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.14, | |
| "eval_loss": 1.2546547651290894, | |
| "eval_runtime": 91.8637, | |
| "eval_samples_per_second": 53.939, | |
| "eval_steps_per_second": 13.487, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.15, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 9.729086208503174e-06, | |
| "loss": 1.2465, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "grad_norm": 7.6875, | |
| "learning_rate": 9.672822322997305e-06, | |
| "loss": 1.2293, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.16, | |
| "eval_loss": 1.2318358421325684, | |
| "eval_runtime": 92.2103, | |
| "eval_samples_per_second": 53.736, | |
| "eval_steps_per_second": 13.437, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.17, | |
| "grad_norm": 7.5625, | |
| "learning_rate": 9.611448774886925e-06, | |
| "loss": 1.2212, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "grad_norm": 7.625, | |
| "learning_rate": 9.545032675245814e-06, | |
| "loss": 1.2145, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.18, | |
| "eval_loss": 1.2164242267608643, | |
| "eval_runtime": 92.4452, | |
| "eval_samples_per_second": 53.599, | |
| "eval_steps_per_second": 13.403, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.19, | |
| "grad_norm": 8.375, | |
| "learning_rate": 9.473646649103819e-06, | |
| "loss": 1.202, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 9.397368756032445e-06, | |
| "loss": 1.1945, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.2, | |
| "eval_loss": 1.2037837505340576, | |
| "eval_runtime": 92.0761, | |
| "eval_samples_per_second": 53.814, | |
| "eval_steps_per_second": 13.456, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.21, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 9.31628240478787e-06, | |
| "loss": 1.1909, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "grad_norm": 8.25, | |
| "learning_rate": 9.230476262104678e-06, | |
| "loss": 1.1871, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.22, | |
| "eval_loss": 1.1938320398330688, | |
| "eval_runtime": 91.9107, | |
| "eval_samples_per_second": 53.911, | |
| "eval_steps_per_second": 13.48, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.23, | |
| "grad_norm": 9.25, | |
| "learning_rate": 9.140044155740102e-06, | |
| "loss": 1.177, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 9.045084971874738e-06, | |
| "loss": 1.1825, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.24, | |
| "eval_loss": 1.185314655303955, | |
| "eval_runtime": 92.1065, | |
| "eval_samples_per_second": 53.796, | |
| "eval_steps_per_second": 13.452, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.25, | |
| "grad_norm": 8.875, | |
| "learning_rate": 8.94570254698197e-06, | |
| "loss": 1.1751, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "grad_norm": 8.375, | |
| "learning_rate": 8.842005554284296e-06, | |
| "loss": 1.1735, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.26, | |
| "eval_loss": 1.1786526441574097, | |
| "eval_runtime": 91.9617, | |
| "eval_samples_per_second": 53.881, | |
| "eval_steps_per_second": 13.473, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.27, | |
| "grad_norm": 8.75, | |
| "learning_rate": 8.734107384920771e-06, | |
| "loss": 1.1704, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "grad_norm": 7.46875, | |
| "learning_rate": 8.622126023955446e-06, | |
| "loss": 1.1684, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.28, | |
| "eval_loss": 1.1743632555007935, | |
| "eval_runtime": 94.6714, | |
| "eval_samples_per_second": 52.339, | |
| "eval_steps_per_second": 13.087, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.29, | |
| "grad_norm": 8.5, | |
| "learning_rate": 8.506183921362443e-06, | |
| "loss": 1.1664, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "grad_norm": 8.125, | |
| "learning_rate": 8.386407858128707e-06, | |
| "loss": 1.1627, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.3, | |
| "eval_loss": 1.1703976392745972, | |
| "eval_runtime": 91.9503, | |
| "eval_samples_per_second": 53.888, | |
| "eval_steps_per_second": 13.475, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.31, | |
| "grad_norm": 8.375, | |
| "learning_rate": 8.262928807620843e-06, | |
| "loss": 1.1658, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "grad_norm": 8.25, | |
| "learning_rate": 8.135881792367686e-06, | |
| "loss": 1.1637, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.32, | |
| "eval_loss": 1.168877363204956, | |
| "eval_runtime": 92.267, | |
| "eval_samples_per_second": 53.703, | |
| "eval_steps_per_second": 13.428, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.33, | |
| "grad_norm": 9.625, | |
| "learning_rate": 8.005405736415127e-06, | |
| "loss": 1.1603, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "grad_norm": 9.3125, | |
| "learning_rate": 7.871643313414718e-06, | |
| "loss": 1.156, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.34, | |
| "eval_loss": 1.164509654045105, | |
| "eval_runtime": 91.883, | |
| "eval_samples_per_second": 53.927, | |
| "eval_steps_per_second": 13.485, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.35, | |
| "grad_norm": 9.375, | |
| "learning_rate": 7.734740790612137e-06, | |
| "loss": 1.1559, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "grad_norm": 8.875, | |
| "learning_rate": 7.594847868906076e-06, | |
| "loss": 1.1513, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.36, | |
| "eval_loss": 1.1633930206298828, | |
| "eval_runtime": 92.0359, | |
| "eval_samples_per_second": 53.838, | |
| "eval_steps_per_second": 13.462, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.37, | |
| "grad_norm": 7.4375, | |
| "learning_rate": 7.452117519152542e-06, | |
| "loss": 1.1524, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "grad_norm": 9.125, | |
| "learning_rate": 7.30670581489344e-06, | |
| "loss": 1.1517, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.38, | |
| "eval_loss": 1.1613472700119019, | |
| "eval_runtime": 92.9351, | |
| "eval_samples_per_second": 53.317, | |
| "eval_steps_per_second": 13.332, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.39, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 7.158771761692464e-06, | |
| "loss": 1.1486, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "grad_norm": 9.75, | |
| "learning_rate": 7.008477123264849e-06, | |
| "loss": 1.1487, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.4, | |
| "eval_loss": 1.1614651679992676, | |
| "eval_runtime": 92.598, | |
| "eval_samples_per_second": 53.511, | |
| "eval_steps_per_second": 13.38, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.41, | |
| "grad_norm": 9.25, | |
| "learning_rate": 6.855986244591104e-06, | |
| "loss": 1.139, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "grad_norm": 8.5625, | |
| "learning_rate": 6.701465872208216e-06, | |
| "loss": 1.1503, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.42, | |
| "eval_loss": 1.1591981649398804, | |
| "eval_runtime": 92.6092, | |
| "eval_samples_per_second": 53.504, | |
| "eval_steps_per_second": 13.379, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.43, | |
| "grad_norm": 8.0, | |
| "learning_rate": 6.545084971874738e-06, | |
| "loss": 1.1443, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 6.387014543809224e-06, | |
| "loss": 1.1502, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.44, | |
| "eval_loss": 1.1588733196258545, | |
| "eval_runtime": 92.6849, | |
| "eval_samples_per_second": 53.461, | |
| "eval_steps_per_second": 13.368, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.45, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 6.227427435703997e-06, | |
| "loss": 1.1496, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "grad_norm": 9.625, | |
| "learning_rate": 6.066498153718735e-06, | |
| "loss": 1.1427, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.46, | |
| "eval_loss": 1.1595520973205566, | |
| "eval_runtime": 93.5685, | |
| "eval_samples_per_second": 52.956, | |
| "eval_steps_per_second": 13.242, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.47, | |
| "grad_norm": 9.8125, | |
| "learning_rate": 5.904402671660551e-06, | |
| "loss": 1.1444, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "grad_norm": 10.375, | |
| "learning_rate": 5.74131823855921e-06, | |
| "loss": 1.1433, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.48, | |
| "eval_loss": 1.1582742929458618, | |
| "eval_runtime": 92.9369, | |
| "eval_samples_per_second": 53.316, | |
| "eval_steps_per_second": 13.332, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.49, | |
| "grad_norm": 9.6875, | |
| "learning_rate": 5.577423184847932e-06, | |
| "loss": 1.1437, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "grad_norm": 8.375, | |
| "learning_rate": 5.412896727361663e-06, | |
| "loss": 1.1483, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.5, | |
| "eval_loss": 1.1573219299316406, | |
| "eval_runtime": 93.0508, | |
| "eval_samples_per_second": 53.25, | |
| "eval_steps_per_second": 13.315, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.51, | |
| "grad_norm": 8.25, | |
| "learning_rate": 5.247918773366112e-06, | |
| "loss": 1.1408, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "grad_norm": 9.625, | |
| "learning_rate": 5.082669723831793e-06, | |
| "loss": 1.1444, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.52, | |
| "eval_loss": 1.1576628684997559, | |
| "eval_runtime": 92.489, | |
| "eval_samples_per_second": 53.574, | |
| "eval_steps_per_second": 13.396, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.53, | |
| "grad_norm": 8.1875, | |
| "learning_rate": 4.917330276168208e-06, | |
| "loss": 1.1405, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "grad_norm": 9.0625, | |
| "learning_rate": 4.752081226633888e-06, | |
| "loss": 1.142, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.54, | |
| "eval_loss": 1.1567600965499878, | |
| "eval_runtime": 92.8548, | |
| "eval_samples_per_second": 53.363, | |
| "eval_steps_per_second": 13.343, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.55, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 4.587103272638339e-06, | |
| "loss": 1.1428, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "grad_norm": 9.5, | |
| "learning_rate": 4.42257681515207e-06, | |
| "loss": 1.1405, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.56, | |
| "eval_loss": 1.1568113565444946, | |
| "eval_runtime": 92.8975, | |
| "eval_samples_per_second": 53.338, | |
| "eval_steps_per_second": 13.337, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.57, | |
| "grad_norm": 10.1875, | |
| "learning_rate": 4.25868176144079e-06, | |
| "loss": 1.1475, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "grad_norm": 8.75, | |
| "learning_rate": 4.0955973283394525e-06, | |
| "loss": 1.1428, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.58, | |
| "eval_loss": 1.1565525531768799, | |
| "eval_runtime": 92.7278, | |
| "eval_samples_per_second": 53.436, | |
| "eval_steps_per_second": 13.362, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.59, | |
| "grad_norm": 9.4375, | |
| "learning_rate": 3.9335018462812664e-06, | |
| "loss": 1.1465, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "grad_norm": 8.8125, | |
| "learning_rate": 3.7725725642960047e-06, | |
| "loss": 1.1465, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.6, | |
| "eval_loss": 1.1557884216308594, | |
| "eval_runtime": 92.8391, | |
| "eval_samples_per_second": 53.372, | |
| "eval_steps_per_second": 13.346, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.61, | |
| "grad_norm": 7.84375, | |
| "learning_rate": 3.6129854561907786e-06, | |
| "loss": 1.1454, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "grad_norm": 10.125, | |
| "learning_rate": 3.4549150281252635e-06, | |
| "loss": 1.145, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 0.62, | |
| "eval_loss": 1.1553471088409424, | |
| "eval_runtime": 92.7621, | |
| "eval_samples_per_second": 53.416, | |
| "eval_steps_per_second": 13.357, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 0.63, | |
| "grad_norm": 9.75, | |
| "learning_rate": 3.298534127791785e-06, | |
| "loss": 1.1459, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "grad_norm": 9.625, | |
| "learning_rate": 3.1440137554088957e-06, | |
| "loss": 1.1495, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 0.64, | |
| "eval_loss": 1.1568386554718018, | |
| "eval_runtime": 92.8646, | |
| "eval_samples_per_second": 53.357, | |
| "eval_steps_per_second": 13.342, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 0.65, | |
| "grad_norm": 8.5, | |
| "learning_rate": 2.991522876735154e-06, | |
| "loss": 1.1448, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "grad_norm": 9.75, | |
| "learning_rate": 2.8412282383075362e-06, | |
| "loss": 1.1425, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 0.66, | |
| "eval_loss": 1.1566816568374634, | |
| "eval_runtime": 93.131, | |
| "eval_samples_per_second": 53.205, | |
| "eval_steps_per_second": 13.304, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 0.67, | |
| "grad_norm": 8.3125, | |
| "learning_rate": 2.693294185106562e-06, | |
| "loss": 1.1466, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "grad_norm": 9.0, | |
| "learning_rate": 2.5478824808474613e-06, | |
| "loss": 1.1377, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 0.68, | |
| "eval_loss": 1.1565214395523071, | |
| "eval_runtime": 92.804, | |
| "eval_samples_per_second": 53.392, | |
| "eval_steps_per_second": 13.351, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 0.69, | |
| "grad_norm": 8.875, | |
| "learning_rate": 2.4051521310939258e-06, | |
| "loss": 1.1434, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "grad_norm": 10.0, | |
| "learning_rate": 2.265259209387867e-06, | |
| "loss": 1.1389, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.7, | |
| "eval_loss": 1.156681776046753, | |
| "eval_runtime": 92.6941, | |
| "eval_samples_per_second": 53.455, | |
| "eval_steps_per_second": 13.367, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.71, | |
| "grad_norm": 11.5625, | |
| "learning_rate": 2.1283566865852824e-06, | |
| "loss": 1.144, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "grad_norm": 9.625, | |
| "learning_rate": 1.9945942635848745e-06, | |
| "loss": 1.1387, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 0.72, | |
| "eval_loss": 1.15574312210083, | |
| "eval_runtime": 92.8587, | |
| "eval_samples_per_second": 53.361, | |
| "eval_steps_per_second": 13.343, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 0.73, | |
| "grad_norm": 9.0, | |
| "learning_rate": 1.864118207632315e-06, | |
| "loss": 1.1346, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 1.7370711923791567e-06, | |
| "loss": 1.1418, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 0.74, | |
| "eval_loss": 1.1560354232788086, | |
| "eval_runtime": 92.8217, | |
| "eval_samples_per_second": 53.382, | |
| "eval_steps_per_second": 13.348, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 0.75, | |
| "grad_norm": 8.875, | |
| "learning_rate": 1.6135921418712959e-06, | |
| "loss": 1.1375, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "grad_norm": 8.625, | |
| "learning_rate": 1.4938160786375571e-06, | |
| "loss": 1.143, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 0.76, | |
| "eval_loss": 1.1558743715286255, | |
| "eval_runtime": 93.1651, | |
| "eval_samples_per_second": 53.185, | |
| "eval_steps_per_second": 13.299, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 0.77, | |
| "grad_norm": 9.625, | |
| "learning_rate": 1.3778739760445552e-06, | |
| "loss": 1.1455, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "grad_norm": 8.875, | |
| "learning_rate": 1.2658926150792321e-06, | |
| "loss": 1.1401, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 0.78, | |
| "eval_loss": 1.1553114652633667, | |
| "eval_runtime": 93.7037, | |
| "eval_samples_per_second": 52.879, | |
| "eval_steps_per_second": 13.223, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 0.79, | |
| "grad_norm": 9.25, | |
| "learning_rate": 1.157994445715706e-06, | |
| "loss": 1.14, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "grad_norm": 10.9375, | |
| "learning_rate": 1.0542974530180327e-06, | |
| "loss": 1.1444, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.8, | |
| "eval_loss": 1.155322551727295, | |
| "eval_runtime": 93.8365, | |
| "eval_samples_per_second": 52.805, | |
| "eval_steps_per_second": 13.204, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.81, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 9.549150281252633e-07, | |
| "loss": 1.1428, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 8.599558442598998e-07, | |
| "loss": 1.1484, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 0.82, | |
| "eval_loss": 1.1569843292236328, | |
| "eval_runtime": 93.6505, | |
| "eval_samples_per_second": 52.909, | |
| "eval_steps_per_second": 13.23, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 0.83, | |
| "grad_norm": 9.75, | |
| "learning_rate": 7.695237378953224e-07, | |
| "loss": 1.1438, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "grad_norm": 9.25, | |
| "learning_rate": 6.837175952121305e-07, | |
| "loss": 1.1369, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 0.84, | |
| "eval_loss": 1.155900478363037, | |
| "eval_runtime": 97.8841, | |
| "eval_samples_per_second": 50.621, | |
| "eval_steps_per_second": 12.658, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 0.85, | |
| "grad_norm": 8.625, | |
| "learning_rate": 6.026312439675553e-07, | |
| "loss": 1.1379, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "grad_norm": 8.375, | |
| "learning_rate": 5.263533508961827e-07, | |
| "loss": 1.1417, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 0.86, | |
| "eval_loss": 1.156594157218933, | |
| "eval_runtime": 93.5258, | |
| "eval_samples_per_second": 52.98, | |
| "eval_steps_per_second": 13.248, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 0.87, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 4.549673247541875e-07, | |
| "loss": 1.1395, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "grad_norm": 8.9375, | |
| "learning_rate": 3.885512251130763e-07, | |
| "loss": 1.1416, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 0.88, | |
| "eval_loss": 1.1570854187011719, | |
| "eval_runtime": 94.1553, | |
| "eval_samples_per_second": 52.626, | |
| "eval_steps_per_second": 13.159, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 0.89, | |
| "grad_norm": 9.1875, | |
| "learning_rate": 3.271776770026963e-07, | |
| "loss": 1.1422, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "grad_norm": 9.5625, | |
| "learning_rate": 2.7091379149682683e-07, | |
| "loss": 1.1433, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.9, | |
| "eval_loss": 1.1558341979980469, | |
| "eval_runtime": 93.8721, | |
| "eval_samples_per_second": 52.785, | |
| "eval_steps_per_second": 13.199, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.91, | |
| "grad_norm": 9.25, | |
| "learning_rate": 2.198210923282118e-07, | |
| "loss": 1.1465, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "grad_norm": 9.125, | |
| "learning_rate": 1.7395544861325718e-07, | |
| "loss": 1.1395, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 0.92, | |
| "eval_loss": 1.155996561050415, | |
| "eval_runtime": 97.435, | |
| "eval_samples_per_second": 50.854, | |
| "eval_steps_per_second": 12.716, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 0.93, | |
| "grad_norm": 8.25, | |
| "learning_rate": 1.333670137599713e-07, | |
| "loss": 1.147, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "grad_norm": 8.4375, | |
| "learning_rate": 9.810017062595322e-08, | |
| "loss": 1.1465, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 0.94, | |
| "eval_loss": 1.1561691761016846, | |
| "eval_runtime": 93.8214, | |
| "eval_samples_per_second": 52.813, | |
| "eval_steps_per_second": 13.206, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 0.95, | |
| "grad_norm": 9.625, | |
| "learning_rate": 6.819348298638839e-08, | |
| "loss": 1.1458, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "grad_norm": 7.5, | |
| "learning_rate": 4.367965336512403e-08, | |
| "loss": 1.1474, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 0.96, | |
| "eval_loss": 1.1555160284042358, | |
| "eval_runtime": 93.7343, | |
| "eval_samples_per_second": 52.862, | |
| "eval_steps_per_second": 13.218, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 0.97, | |
| "grad_norm": 10.4375, | |
| "learning_rate": 2.4585487274942922e-08, | |
| "loss": 1.145, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "grad_norm": 8.6875, | |
| "learning_rate": 1.0931863906127327e-08, | |
| "loss": 1.1431, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 0.98, | |
| "eval_loss": 1.1566122770309448, | |
| "eval_runtime": 92.9031, | |
| "eval_samples_per_second": 53.335, | |
| "eval_steps_per_second": 13.336, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 0.99, | |
| "grad_norm": 8.0, | |
| "learning_rate": 2.7337132953697555e-09, | |
| "loss": 1.1458, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 8.25, | |
| "learning_rate": 0.0, | |
| "loss": 1.1395, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "eval_loss": 1.155640959739685, | |
| "eval_runtime": 93.0092, | |
| "eval_samples_per_second": 53.274, | |
| "eval_steps_per_second": 13.321, | |
| "step": 100000 | |
| } | |
| ], | |
| "logging_steps": 1000, | |
| "max_steps": 100000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 2000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.24764153413632e+19, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |