| { |
| "best_global_step": 290, |
| "best_metric": 0.2338075339794159, |
| "best_model_checkpoint": "saves_stability/prompt-tuning/llama-3-8b-instruct/train_cb_456_1757596103/checkpoint-290", |
| "epoch": 10.0, |
| "eval_steps": 29, |
| "global_step": 570, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.08771929824561403, |
| "grad_norm": 15.25, |
| "learning_rate": 3.5087719298245615e-06, |
| "loss": 0.8073, |
| "num_input_tokens_seen": 3232, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.17543859649122806, |
| "grad_norm": 13.625, |
| "learning_rate": 7.894736842105263e-06, |
| "loss": 0.7545, |
| "num_input_tokens_seen": 6272, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.2631578947368421, |
| "grad_norm": 16.25, |
| "learning_rate": 1.2280701754385964e-05, |
| "loss": 0.7277, |
| "num_input_tokens_seen": 9088, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.3508771929824561, |
| "grad_norm": 13.1875, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 0.7708, |
| "num_input_tokens_seen": 12480, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.43859649122807015, |
| "grad_norm": 40.75, |
| "learning_rate": 2.105263157894737e-05, |
| "loss": 0.8625, |
| "num_input_tokens_seen": 15232, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.5087719298245614, |
| "eval_loss": 0.6373762488365173, |
| "eval_runtime": 0.8706, |
| "eval_samples_per_second": 28.716, |
| "eval_steps_per_second": 8.04, |
| "num_input_tokens_seen": 18048, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.5263157894736842, |
| "grad_norm": 96.5, |
| "learning_rate": 2.5438596491228074e-05, |
| "loss": 0.6894, |
| "num_input_tokens_seen": 18496, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.6140350877192983, |
| "grad_norm": 9.625, |
| "learning_rate": 2.9824561403508772e-05, |
| "loss": 0.4332, |
| "num_input_tokens_seen": 21632, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.7017543859649122, |
| "grad_norm": 239.0, |
| "learning_rate": 3.421052631578947e-05, |
| "loss": 0.4532, |
| "num_input_tokens_seen": 25312, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.7894736842105263, |
| "grad_norm": 17.375, |
| "learning_rate": 3.859649122807018e-05, |
| "loss": 0.3069, |
| "num_input_tokens_seen": 29024, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.8771929824561403, |
| "grad_norm": 15.875, |
| "learning_rate": 4.298245614035088e-05, |
| "loss": 0.1583, |
| "num_input_tokens_seen": 32096, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.9649122807017544, |
| "grad_norm": 46.5, |
| "learning_rate": 4.736842105263158e-05, |
| "loss": 0.2666, |
| "num_input_tokens_seen": 35232, |
| "step": 55 |
| }, |
| { |
| "epoch": 1.0175438596491229, |
| "eval_loss": 0.2585161626338959, |
| "eval_runtime": 0.8614, |
| "eval_samples_per_second": 29.021, |
| "eval_steps_per_second": 8.126, |
| "num_input_tokens_seen": 36328, |
| "step": 58 |
| }, |
| { |
| "epoch": 1.0526315789473684, |
| "grad_norm": 43.75, |
| "learning_rate": 4.999812487773597e-05, |
| "loss": 0.2788, |
| "num_input_tokens_seen": 37288, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.1403508771929824, |
| "grad_norm": 28.75, |
| "learning_rate": 4.997703298253406e-05, |
| "loss": 0.1076, |
| "num_input_tokens_seen": 41256, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.2280701754385965, |
| "grad_norm": 34.5, |
| "learning_rate": 4.993252512887069e-05, |
| "loss": 0.1633, |
| "num_input_tokens_seen": 44680, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.3157894736842106, |
| "grad_norm": 72.5, |
| "learning_rate": 4.986464304284091e-05, |
| "loss": 0.3199, |
| "num_input_tokens_seen": 48424, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.4035087719298245, |
| "grad_norm": 5.65625, |
| "learning_rate": 4.977345036387331e-05, |
| "loss": 0.0449, |
| "num_input_tokens_seen": 51880, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.4912280701754386, |
| "grad_norm": 50.25, |
| "learning_rate": 4.965903258506806e-05, |
| "loss": 0.1799, |
| "num_input_tokens_seen": 55176, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.526315789473684, |
| "eval_loss": 0.3019123077392578, |
| "eval_runtime": 0.8638, |
| "eval_samples_per_second": 28.943, |
| "eval_steps_per_second": 8.104, |
| "num_input_tokens_seen": 56168, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.5789473684210527, |
| "grad_norm": 15.8125, |
| "learning_rate": 4.952149697304716e-05, |
| "loss": 0.1852, |
| "num_input_tokens_seen": 57640, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.7734375, |
| "learning_rate": 4.9360972467392056e-05, |
| "loss": 0.0961, |
| "num_input_tokens_seen": 61352, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.7543859649122808, |
| "grad_norm": 29.0, |
| "learning_rate": 4.917760955976277e-05, |
| "loss": 0.3273, |
| "num_input_tokens_seen": 64488, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.8421052631578947, |
| "grad_norm": 7.875, |
| "learning_rate": 4.897158015281209e-05, |
| "loss": 0.2378, |
| "num_input_tokens_seen": 67400, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.9298245614035088, |
| "grad_norm": 50.25, |
| "learning_rate": 4.874307739902689e-05, |
| "loss": 0.2151, |
| "num_input_tokens_seen": 70280, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.017543859649123, |
| "grad_norm": 17.125, |
| "learning_rate": 4.849231551964771e-05, |
| "loss": 0.1285, |
| "num_input_tokens_seen": 73184, |
| "step": 115 |
| }, |
| { |
| "epoch": 2.0350877192982457, |
| "eval_loss": 0.3430290222167969, |
| "eval_runtime": 0.8645, |
| "eval_samples_per_second": 28.918, |
| "eval_steps_per_second": 8.097, |
| "num_input_tokens_seen": 73792, |
| "step": 116 |
| }, |
| { |
| "epoch": 2.1052631578947367, |
| "grad_norm": 10.875, |
| "learning_rate": 4.821952960383649e-05, |
| "loss": 0.1257, |
| "num_input_tokens_seen": 76256, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.192982456140351, |
| "grad_norm": 40.0, |
| "learning_rate": 4.7924975388280524e-05, |
| "loss": 0.1998, |
| "num_input_tokens_seen": 79200, |
| "step": 125 |
| }, |
| { |
| "epoch": 2.280701754385965, |
| "grad_norm": 20.0, |
| "learning_rate": 4.760892901743944e-05, |
| "loss": 0.1488, |
| "num_input_tokens_seen": 82272, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.3684210526315788, |
| "grad_norm": 8.0, |
| "learning_rate": 4.727168678465988e-05, |
| "loss": 0.1076, |
| "num_input_tokens_seen": 86624, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.456140350877193, |
| "grad_norm": 34.25, |
| "learning_rate": 4.6913564854400595e-05, |
| "loss": 0.1963, |
| "num_input_tokens_seen": 89568, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.543859649122807, |
| "grad_norm": 44.25, |
| "learning_rate": 4.6534898965828405e-05, |
| "loss": 0.279, |
| "num_input_tokens_seen": 92768, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.543859649122807, |
| "eval_loss": 0.2732478976249695, |
| "eval_runtime": 0.863, |
| "eval_samples_per_second": 28.968, |
| "eval_steps_per_second": 8.111, |
| "num_input_tokens_seen": 92768, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.6315789473684212, |
| "grad_norm": 16.125, |
| "learning_rate": 4.613604411806285e-05, |
| "loss": 0.1346, |
| "num_input_tokens_seen": 96160, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.719298245614035, |
| "grad_norm": 2.140625, |
| "learning_rate": 4.5717374237364665e-05, |
| "loss": 0.1928, |
| "num_input_tokens_seen": 99328, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.807017543859649, |
| "grad_norm": 27.25, |
| "learning_rate": 4.5279281826580056e-05, |
| "loss": 0.194, |
| "num_input_tokens_seen": 102368, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.8947368421052633, |
| "grad_norm": 28.25, |
| "learning_rate": 4.482217759716946e-05, |
| "loss": 0.0586, |
| "num_input_tokens_seen": 105184, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.982456140350877, |
| "grad_norm": 2.25, |
| "learning_rate": 4.434649008416565e-05, |
| "loss": 0.0709, |
| "num_input_tokens_seen": 108320, |
| "step": 170 |
| }, |
| { |
| "epoch": 3.0526315789473686, |
| "eval_loss": 0.26425841450691223, |
| "eval_runtime": 0.8666, |
| "eval_samples_per_second": 28.848, |
| "eval_steps_per_second": 8.077, |
| "num_input_tokens_seen": 110064, |
| "step": 174 |
| }, |
| { |
| "epoch": 3.0701754385964914, |
| "grad_norm": 36.25, |
| "learning_rate": 4.385266524442241e-05, |
| "loss": 0.0307, |
| "num_input_tokens_seen": 110640, |
| "step": 175 |
| }, |
| { |
| "epoch": 3.1578947368421053, |
| "grad_norm": 70.0, |
| "learning_rate": 4.334116603853007e-05, |
| "loss": 0.1662, |
| "num_input_tokens_seen": 114128, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.245614035087719, |
| "grad_norm": 40.5, |
| "learning_rate": 4.2812471996790206e-05, |
| "loss": 0.1356, |
| "num_input_tokens_seen": 116976, |
| "step": 185 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 8.5, |
| "learning_rate": 4.226707876965611e-05, |
| "loss": 0.4012, |
| "num_input_tokens_seen": 120656, |
| "step": 190 |
| }, |
| { |
| "epoch": 3.4210526315789473, |
| "grad_norm": 3.078125, |
| "learning_rate": 4.1705497663060767e-05, |
| "loss": 0.1468, |
| "num_input_tokens_seen": 124432, |
| "step": 195 |
| }, |
| { |
| "epoch": 3.5087719298245617, |
| "grad_norm": 5.5, |
| "learning_rate": 4.1128255159067665e-05, |
| "loss": 0.0766, |
| "num_input_tokens_seen": 127728, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.56140350877193, |
| "eval_loss": 0.24596631526947021, |
| "eval_runtime": 0.8653, |
| "eval_samples_per_second": 28.892, |
| "eval_steps_per_second": 8.09, |
| "num_input_tokens_seen": 129808, |
| "step": 203 |
| }, |
| { |
| "epoch": 3.5964912280701755, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.053589242229412e-05, |
| "loss": 0.0812, |
| "num_input_tokens_seen": 131024, |
| "step": 205 |
| }, |
| { |
| "epoch": 3.6842105263157894, |
| "grad_norm": 17.125, |
| "learning_rate": 3.9928964792569655e-05, |
| "loss": 0.0511, |
| "num_input_tokens_seen": 134000, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.7719298245614032, |
| "grad_norm": 3.53125, |
| "learning_rate": 3.930804126430513e-05, |
| "loss": 0.1721, |
| "num_input_tokens_seen": 136880, |
| "step": 215 |
| }, |
| { |
| "epoch": 3.8596491228070176, |
| "grad_norm": 10.9375, |
| "learning_rate": 3.867370395306068e-05, |
| "loss": 0.2545, |
| "num_input_tokens_seen": 139952, |
| "step": 220 |
| }, |
| { |
| "epoch": 3.9473684210526314, |
| "grad_norm": 2.109375, |
| "learning_rate": 3.8026547549812665e-05, |
| "loss": 0.0915, |
| "num_input_tokens_seen": 143408, |
| "step": 225 |
| }, |
| { |
| "epoch": 4.035087719298246, |
| "grad_norm": 0.71875, |
| "learning_rate": 3.736717876343106e-05, |
| "loss": 0.5976, |
| "num_input_tokens_seen": 146088, |
| "step": 230 |
| }, |
| { |
| "epoch": 4.0701754385964914, |
| "eval_loss": 0.2900010049343109, |
| "eval_runtime": 0.8705, |
| "eval_samples_per_second": 28.72, |
| "eval_steps_per_second": 8.042, |
| "num_input_tokens_seen": 147240, |
| "step": 232 |
| }, |
| { |
| "epoch": 4.12280701754386, |
| "grad_norm": 46.75, |
| "learning_rate": 3.66962157518902e-05, |
| "loss": 0.1247, |
| "num_input_tokens_seen": 149288, |
| "step": 235 |
| }, |
| { |
| "epoch": 4.2105263157894735, |
| "grad_norm": 17.75, |
| "learning_rate": 3.601428754274584e-05, |
| "loss": 0.1385, |
| "num_input_tokens_seen": 152200, |
| "step": 240 |
| }, |
| { |
| "epoch": 4.298245614035087, |
| "grad_norm": 4.5, |
| "learning_rate": 3.532203344342212e-05, |
| "loss": 0.1421, |
| "num_input_tokens_seen": 155560, |
| "step": 245 |
| }, |
| { |
| "epoch": 4.385964912280702, |
| "grad_norm": 109.0, |
| "learning_rate": 3.4620102441861143e-05, |
| "loss": 0.0635, |
| "num_input_tokens_seen": 158760, |
| "step": 250 |
| }, |
| { |
| "epoch": 4.473684210526316, |
| "grad_norm": 1.1875, |
| "learning_rate": 3.390915259809696e-05, |
| "loss": 0.027, |
| "num_input_tokens_seen": 161384, |
| "step": 255 |
| }, |
| { |
| "epoch": 4.56140350877193, |
| "grad_norm": 35.25, |
| "learning_rate": 3.318985042732461e-05, |
| "loss": 0.1025, |
| "num_input_tokens_seen": 163976, |
| "step": 260 |
| }, |
| { |
| "epoch": 4.578947368421053, |
| "eval_loss": 0.30449309945106506, |
| "eval_runtime": 0.8679, |
| "eval_samples_per_second": 28.804, |
| "eval_steps_per_second": 8.065, |
| "num_input_tokens_seen": 164744, |
| "step": 261 |
| }, |
| { |
| "epoch": 4.649122807017544, |
| "grad_norm": 2.28125, |
| "learning_rate": 3.246287027504237e-05, |
| "loss": 0.1271, |
| "num_input_tokens_seen": 167368, |
| "step": 265 |
| }, |
| { |
| "epoch": 4.7368421052631575, |
| "grad_norm": 36.25, |
| "learning_rate": 3.172889368485311e-05, |
| "loss": 0.2534, |
| "num_input_tokens_seen": 171240, |
| "step": 270 |
| }, |
| { |
| "epoch": 4.824561403508772, |
| "grad_norm": 2.875, |
| "learning_rate": 3.0988608759517475e-05, |
| "loss": 0.1075, |
| "num_input_tokens_seen": 174728, |
| "step": 275 |
| }, |
| { |
| "epoch": 4.912280701754386, |
| "grad_norm": 66.0, |
| "learning_rate": 3.0242709515857758e-05, |
| "loss": 0.1823, |
| "num_input_tokens_seen": 178216, |
| "step": 280 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.06982421875, |
| "learning_rate": 2.949189523411747e-05, |
| "loss": 0.1201, |
| "num_input_tokens_seen": 180696, |
| "step": 285 |
| }, |
| { |
| "epoch": 5.087719298245614, |
| "grad_norm": 0.07861328125, |
| "learning_rate": 2.8736869802386364e-05, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 184440, |
| "step": 290 |
| }, |
| { |
| "epoch": 5.087719298245614, |
| "eval_loss": 0.2338075339794159, |
| "eval_runtime": 0.8735, |
| "eval_samples_per_second": 28.619, |
| "eval_steps_per_second": 8.013, |
| "num_input_tokens_seen": 184440, |
| "step": 290 |
| }, |
| { |
| "epoch": 5.175438596491228, |
| "grad_norm": 10.1875, |
| "learning_rate": 2.797834105670559e-05, |
| "loss": 0.1191, |
| "num_input_tokens_seen": 186968, |
| "step": 295 |
| }, |
| { |
| "epoch": 5.2631578947368425, |
| "grad_norm": 65.0, |
| "learning_rate": 2.7217020117471793e-05, |
| "loss": 0.2727, |
| "num_input_tokens_seen": 190808, |
| "step": 300 |
| }, |
| { |
| "epoch": 5.350877192982456, |
| "grad_norm": 8.0625, |
| "learning_rate": 2.6453620722761896e-05, |
| "loss": 0.016, |
| "num_input_tokens_seen": 194168, |
| "step": 305 |
| }, |
| { |
| "epoch": 5.43859649122807, |
| "grad_norm": 0.498046875, |
| "learning_rate": 2.5688858559204053e-05, |
| "loss": 0.102, |
| "num_input_tokens_seen": 197176, |
| "step": 310 |
| }, |
| { |
| "epoch": 5.526315789473684, |
| "grad_norm": 1.7734375, |
| "learning_rate": 2.492345059102164e-05, |
| "loss": 0.0101, |
| "num_input_tokens_seen": 199992, |
| "step": 315 |
| }, |
| { |
| "epoch": 5.5964912280701755, |
| "eval_loss": 0.2630562484264374, |
| "eval_runtime": 0.8728, |
| "eval_samples_per_second": 28.642, |
| "eval_steps_per_second": 8.02, |
| "num_input_tokens_seen": 202456, |
| "step": 319 |
| }, |
| { |
| "epoch": 5.614035087719298, |
| "grad_norm": 0.3671875, |
| "learning_rate": 2.4158114387879616e-05, |
| "loss": 0.0917, |
| "num_input_tokens_seen": 203096, |
| "step": 320 |
| }, |
| { |
| "epoch": 5.701754385964913, |
| "grad_norm": 24.0, |
| "learning_rate": 2.3393567452163252e-05, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 206616, |
| "step": 325 |
| }, |
| { |
| "epoch": 5.7894736842105265, |
| "grad_norm": 18.0, |
| "learning_rate": 2.2630526546319914e-05, |
| "loss": 0.1046, |
| "num_input_tokens_seen": 210040, |
| "step": 330 |
| }, |
| { |
| "epoch": 5.87719298245614, |
| "grad_norm": 19.625, |
| "learning_rate": 2.186970702089457e-05, |
| "loss": 0.2209, |
| "num_input_tokens_seen": 212952, |
| "step": 335 |
| }, |
| { |
| "epoch": 5.964912280701754, |
| "grad_norm": 4.21875, |
| "learning_rate": 2.111182214388893e-05, |
| "loss": 0.0028, |
| "num_input_tokens_seen": 216248, |
| "step": 340 |
| }, |
| { |
| "epoch": 6.052631578947368, |
| "grad_norm": 2.625, |
| "learning_rate": 2.0357582432072957e-05, |
| "loss": 0.002, |
| "num_input_tokens_seen": 218656, |
| "step": 345 |
| }, |
| { |
| "epoch": 6.105263157894737, |
| "eval_loss": 0.27883315086364746, |
| "eval_runtime": 1.6886, |
| "eval_samples_per_second": 14.805, |
| "eval_steps_per_second": 4.145, |
| "num_input_tokens_seen": 220288, |
| "step": 348 |
| }, |
| { |
| "epoch": 6.140350877192983, |
| "grad_norm": 28.5, |
| "learning_rate": 1.9607694984875754e-05, |
| "loss": 0.0599, |
| "num_input_tokens_seen": 221760, |
| "step": 350 |
| }, |
| { |
| "epoch": 6.228070175438597, |
| "grad_norm": 75.0, |
| "learning_rate": 1.8862862821480025e-05, |
| "loss": 0.1713, |
| "num_input_tokens_seen": 224672, |
| "step": 355 |
| }, |
| { |
| "epoch": 6.315789473684211, |
| "grad_norm": 7.0, |
| "learning_rate": 1.8123784221741964e-05, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 227872, |
| "step": 360 |
| }, |
| { |
| "epoch": 6.4035087719298245, |
| "grad_norm": 0.062255859375, |
| "learning_rate": 1.73911520715541e-05, |
| "loss": 0.0573, |
| "num_input_tokens_seen": 230528, |
| "step": 365 |
| }, |
| { |
| "epoch": 6.491228070175438, |
| "grad_norm": 0.431640625, |
| "learning_rate": 1.666565321326512e-05, |
| "loss": 0.0808, |
| "num_input_tokens_seen": 234880, |
| "step": 370 |
| }, |
| { |
| "epoch": 6.578947368421053, |
| "grad_norm": 7.3125, |
| "learning_rate": 1.5947967801765345e-05, |
| "loss": 0.172, |
| "num_input_tokens_seen": 237792, |
| "step": 375 |
| }, |
| { |
| "epoch": 6.614035087719298, |
| "eval_loss": 0.2708320915699005, |
| "eval_runtime": 0.8648, |
| "eval_samples_per_second": 28.909, |
| "eval_steps_per_second": 8.095, |
| "num_input_tokens_seen": 239200, |
| "step": 377 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 0.06591796875, |
| "learning_rate": 1.5238768666841907e-05, |
| "loss": 0.0057, |
| "num_input_tokens_seen": 240736, |
| "step": 380 |
| }, |
| { |
| "epoch": 6.754385964912281, |
| "grad_norm": 29.75, |
| "learning_rate": 1.4538720682400969e-05, |
| "loss": 0.0099, |
| "num_input_tokens_seen": 243744, |
| "step": 385 |
| }, |
| { |
| "epoch": 6.842105263157895, |
| "grad_norm": 0.7578125, |
| "learning_rate": 1.3848480143148839e-05, |
| "loss": 0.2363, |
| "num_input_tokens_seen": 247008, |
| "step": 390 |
| }, |
| { |
| "epoch": 6.9298245614035086, |
| "grad_norm": 0.07177734375, |
| "learning_rate": 1.3168694149315796e-05, |
| "loss": 0.1729, |
| "num_input_tokens_seen": 249984, |
| "step": 395 |
| }, |
| { |
| "epoch": 7.017543859649122, |
| "grad_norm": 8.625, |
| "learning_rate": 1.2500000000000006e-05, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 252936, |
| "step": 400 |
| }, |
| { |
| "epoch": 7.105263157894737, |
| "grad_norm": 6.9375, |
| "learning_rate": 1.1843024595699805e-05, |
| "loss": 0.1097, |
| "num_input_tokens_seen": 255880, |
| "step": 405 |
| }, |
| { |
| "epoch": 7.12280701754386, |
| "eval_loss": 0.2558472156524658, |
| "eval_runtime": 0.869, |
| "eval_samples_per_second": 28.767, |
| "eval_steps_per_second": 8.055, |
| "num_input_tokens_seen": 256296, |
| "step": 406 |
| }, |
| { |
| "epoch": 7.192982456140351, |
| "grad_norm": 0.77734375, |
| "learning_rate": 1.1198383850594758e-05, |
| "loss": 0.0113, |
| "num_input_tokens_seen": 259240, |
| "step": 410 |
| }, |
| { |
| "epoch": 7.280701754385965, |
| "grad_norm": 0.46875, |
| "learning_rate": 1.0566682115126344e-05, |
| "loss": 0.0609, |
| "num_input_tokens_seen": 262632, |
| "step": 415 |
| }, |
| { |
| "epoch": 7.368421052631579, |
| "grad_norm": 0.369140625, |
| "learning_rate": 9.948511609419675e-06, |
| "loss": 0.1146, |
| "num_input_tokens_seen": 265704, |
| "step": 420 |
| }, |
| { |
| "epoch": 7.456140350877193, |
| "grad_norm": 19.875, |
| "learning_rate": 9.344451868077353e-06, |
| "loss": 0.0692, |
| "num_input_tokens_seen": 269480, |
| "step": 425 |
| }, |
| { |
| "epoch": 7.543859649122807, |
| "grad_norm": 0.09423828125, |
| "learning_rate": 8.755069196866014e-06, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 272648, |
| "step": 430 |
| }, |
| { |
| "epoch": 7.631578947368421, |
| "grad_norm": 1.390625, |
| "learning_rate": 8.180916141804906e-06, |
| "loss": 0.009, |
| "num_input_tokens_seen": 275688, |
| "step": 435 |
| }, |
| { |
| "epoch": 7.631578947368421, |
| "eval_loss": 0.25770363211631775, |
| "eval_runtime": 0.8623, |
| "eval_samples_per_second": 28.992, |
| "eval_steps_per_second": 8.118, |
| "num_input_tokens_seen": 275688, |
| "step": 435 |
| }, |
| { |
| "epoch": 7.719298245614035, |
| "grad_norm": 37.5, |
| "learning_rate": 7.622530971154199e-06, |
| "loss": 0.1372, |
| "num_input_tokens_seen": 279400, |
| "step": 440 |
| }, |
| { |
| "epoch": 7.807017543859649, |
| "grad_norm": 0.039306640625, |
| "learning_rate": 7.080437170788723e-06, |
| "loss": 0.0864, |
| "num_input_tokens_seen": 282504, |
| "step": 445 |
| }, |
| { |
| "epoch": 7.894736842105263, |
| "grad_norm": 28.25, |
| "learning_rate": 6.555142953430158e-06, |
| "loss": 0.2614, |
| "num_input_tokens_seen": 285448, |
| "step": 450 |
| }, |
| { |
| "epoch": 7.982456140350877, |
| "grad_norm": 2.34375, |
| "learning_rate": 6.0471407821978135e-06, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 288680, |
| "step": 455 |
| }, |
| { |
| "epoch": 8.070175438596491, |
| "grad_norm": 44.25, |
| "learning_rate": 5.556906908924655e-06, |
| "loss": 0.2481, |
| "num_input_tokens_seen": 291728, |
| "step": 460 |
| }, |
| { |
| "epoch": 8.140350877192983, |
| "eval_loss": 0.25524288415908813, |
| "eval_runtime": 0.8698, |
| "eval_samples_per_second": 28.743, |
| "eval_steps_per_second": 8.048, |
| "num_input_tokens_seen": 294608, |
| "step": 464 |
| }, |
| { |
| "epoch": 8.157894736842104, |
| "grad_norm": 13.9375, |
| "learning_rate": 5.084900927671393e-06, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 295216, |
| "step": 465 |
| }, |
| { |
| "epoch": 8.24561403508772, |
| "grad_norm": 0.59375, |
| "learning_rate": 4.631565343857239e-06, |
| "loss": 0.0047, |
| "num_input_tokens_seen": 297552, |
| "step": 470 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 36.75, |
| "learning_rate": 4.19732515941125e-06, |
| "loss": 0.0674, |
| "num_input_tokens_seen": 300688, |
| "step": 475 |
| }, |
| { |
| "epoch": 8.421052631578947, |
| "grad_norm": 0.40625, |
| "learning_rate": 3.7825874743331907e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 303664, |
| "step": 480 |
| }, |
| { |
| "epoch": 8.508771929824562, |
| "grad_norm": 0.130859375, |
| "learning_rate": 3.3877411050374424e-06, |
| "loss": 0.0808, |
| "num_input_tokens_seen": 307120, |
| "step": 485 |
| }, |
| { |
| "epoch": 8.596491228070175, |
| "grad_norm": 0.07421875, |
| "learning_rate": 3.013156219837776e-06, |
| "loss": 0.1663, |
| "num_input_tokens_seen": 310352, |
| "step": 490 |
| }, |
| { |
| "epoch": 8.649122807017545, |
| "eval_loss": 0.255894273519516, |
| "eval_runtime": 0.8672, |
| "eval_samples_per_second": 28.828, |
| "eval_steps_per_second": 8.072, |
| "num_input_tokens_seen": 312144, |
| "step": 493 |
| }, |
| { |
| "epoch": 8.68421052631579, |
| "grad_norm": 0.2255859375, |
| "learning_rate": 2.659183991914696e-06, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 313328, |
| "step": 495 |
| }, |
| { |
| "epoch": 8.771929824561404, |
| "grad_norm": 1.5859375, |
| "learning_rate": 2.326156270090735e-06, |
| "loss": 0.1523, |
| "num_input_tokens_seen": 316688, |
| "step": 500 |
| }, |
| { |
| "epoch": 8.859649122807017, |
| "grad_norm": 0.06640625, |
| "learning_rate": 2.0143852677223075e-06, |
| "loss": 0.1186, |
| "num_input_tokens_seen": 319472, |
| "step": 505 |
| }, |
| { |
| "epoch": 8.947368421052632, |
| "grad_norm": 78.5, |
| "learning_rate": 1.7241632699998123e-06, |
| "loss": 0.011, |
| "num_input_tokens_seen": 322896, |
| "step": 510 |
| }, |
| { |
| "epoch": 9.035087719298245, |
| "grad_norm": 0.62890625, |
| "learning_rate": 1.4557623599303903e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 325320, |
| "step": 515 |
| }, |
| { |
| "epoch": 9.12280701754386, |
| "grad_norm": 0.08349609375, |
| "learning_rate": 1.2094341632602064e-06, |
| "loss": 0.0393, |
| "num_input_tokens_seen": 328936, |
| "step": 520 |
| }, |
| { |
| "epoch": 9.157894736842104, |
| "eval_loss": 0.2579546868801117, |
| "eval_runtime": 0.8687, |
| "eval_samples_per_second": 28.779, |
| "eval_steps_per_second": 8.058, |
| "num_input_tokens_seen": 330152, |
| "step": 522 |
| }, |
| { |
| "epoch": 9.210526315789474, |
| "grad_norm": 3.828125, |
| "learning_rate": 9.85409612575411e-07, |
| "loss": 0.049, |
| "num_input_tokens_seen": 331944, |
| "step": 525 |
| }, |
| { |
| "epoch": 9.298245614035087, |
| "grad_norm": 38.75, |
| "learning_rate": 7.838987308029427e-07, |
| "loss": 0.1322, |
| "num_input_tokens_seen": 334952, |
| "step": 530 |
| }, |
| { |
| "epoch": 9.385964912280702, |
| "grad_norm": 0.71875, |
| "learning_rate": 6.050904343141095e-07, |
| "loss": 0.0047, |
| "num_input_tokens_seen": 337992, |
| "step": 535 |
| }, |
| { |
| "epoch": 9.473684210526315, |
| "grad_norm": 13.1875, |
| "learning_rate": 4.491523558155714e-07, |
| "loss": 0.138, |
| "num_input_tokens_seen": 340456, |
| "step": 540 |
| }, |
| { |
| "epoch": 9.56140350877193, |
| "grad_norm": 11.25, |
| "learning_rate": 3.162306871937387e-07, |
| "loss": 0.0646, |
| "num_input_tokens_seen": 344360, |
| "step": 545 |
| }, |
| { |
| "epoch": 9.649122807017545, |
| "grad_norm": 0.7890625, |
| "learning_rate": 2.064500424599436e-07, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 347368, |
| "step": 550 |
| }, |
| { |
| "epoch": 9.666666666666666, |
| "eval_loss": 0.2559719383716583, |
| "eval_runtime": 0.8719, |
| "eval_samples_per_second": 28.675, |
| "eval_steps_per_second": 8.029, |
| "num_input_tokens_seen": 347976, |
| "step": 551 |
| }, |
| { |
| "epoch": 9.736842105263158, |
| "grad_norm": 0.5078125, |
| "learning_rate": 1.1991334092484318e-07, |
| "loss": 0.1687, |
| "num_input_tokens_seen": 350664, |
| "step": 555 |
| }, |
| { |
| "epoch": 9.824561403508772, |
| "grad_norm": 0.482421875, |
| "learning_rate": 5.6701710711626334e-08, |
| "loss": 0.1055, |
| "num_input_tokens_seen": 353832, |
| "step": 560 |
| }, |
| { |
| "epoch": 9.912280701754385, |
| "grad_norm": 0.25, |
| "learning_rate": 1.6874412698408836e-08, |
| "loss": 0.0957, |
| "num_input_tokens_seen": 356872, |
| "step": 565 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.7109375, |
| "learning_rate": 4.687849611939576e-10, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 359688, |
| "step": 570 |
| }, |
| { |
| "epoch": 10.0, |
| "num_input_tokens_seen": 359688, |
| "step": 570, |
| "total_flos": 1.6196588072534016e+16, |
| "train_loss": 0.16148895006422606, |
| "train_runtime": 163.8336, |
| "train_samples_per_second": 13.733, |
| "train_steps_per_second": 3.479 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 570, |
| "num_input_tokens_seen": 359688, |
| "num_train_epochs": 10, |
| "save_steps": 29, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.6196588072534016e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|