| { |
| "best_global_step": 174, |
| "best_metric": 0.1527228057384491, |
| "best_model_checkpoint": "saves_stability/lntuning/llama-3-8b-instruct/train_cb_1757340194/checkpoint-174", |
| "epoch": 10.0, |
| "eval_steps": 29, |
| "global_step": 570, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.08771929824561403, |
| "grad_norm": 8.722345352172852, |
| "learning_rate": 3.5087719298245615e-06, |
| "loss": 1.1591, |
| "num_input_tokens_seen": 3552, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.17543859649122806, |
| "grad_norm": 7.453003406524658, |
| "learning_rate": 7.894736842105263e-06, |
| "loss": 1.2768, |
| "num_input_tokens_seen": 7264, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.2631578947368421, |
| "grad_norm": 7.4476189613342285, |
| "learning_rate": 1.2280701754385964e-05, |
| "loss": 1.1102, |
| "num_input_tokens_seen": 10528, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.3508771929824561, |
| "grad_norm": 6.995124340057373, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 0.9473, |
| "num_input_tokens_seen": 14720, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.43859649122807015, |
| "grad_norm": 7.862689018249512, |
| "learning_rate": 2.105263157894737e-05, |
| "loss": 0.9966, |
| "num_input_tokens_seen": 18016, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.5087719298245614, |
| "eval_loss": 0.7891942262649536, |
| "eval_runtime": 0.6287, |
| "eval_samples_per_second": 39.766, |
| "eval_steps_per_second": 11.135, |
| "num_input_tokens_seen": 20064, |
| "step": 29 |
| }, |
| { |
| "epoch": 0.5263157894736842, |
| "grad_norm": 5.243933200836182, |
| "learning_rate": 2.5438596491228074e-05, |
| "loss": 0.9601, |
| "num_input_tokens_seen": 20640, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.6140350877192983, |
| "grad_norm": 6.814531326293945, |
| "learning_rate": 2.9824561403508772e-05, |
| "loss": 0.6907, |
| "num_input_tokens_seen": 24800, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.7017543859649122, |
| "grad_norm": 6.561820983886719, |
| "learning_rate": 3.421052631578947e-05, |
| "loss": 0.567, |
| "num_input_tokens_seen": 28064, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.7894736842105263, |
| "grad_norm": 3.355044364929199, |
| "learning_rate": 3.859649122807018e-05, |
| "loss": 0.6682, |
| "num_input_tokens_seen": 30944, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.8771929824561403, |
| "grad_norm": 3.2820112705230713, |
| "learning_rate": 4.298245614035088e-05, |
| "loss": 0.2974, |
| "num_input_tokens_seen": 33664, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.9649122807017544, |
| "grad_norm": 2.775641441345215, |
| "learning_rate": 4.736842105263158e-05, |
| "loss": 0.2673, |
| "num_input_tokens_seen": 36320, |
| "step": 55 |
| }, |
| { |
| "epoch": 1.0175438596491229, |
| "eval_loss": 0.2178267389535904, |
| "eval_runtime": 0.6233, |
| "eval_samples_per_second": 40.106, |
| "eval_steps_per_second": 11.23, |
| "num_input_tokens_seen": 37832, |
| "step": 58 |
| }, |
| { |
| "epoch": 1.0526315789473684, |
| "grad_norm": 2.8043813705444336, |
| "learning_rate": 4.999812487773597e-05, |
| "loss": 0.0722, |
| "num_input_tokens_seen": 39080, |
| "step": 60 |
| }, |
| { |
| "epoch": 1.1403508771929824, |
| "grad_norm": 6.980294227600098, |
| "learning_rate": 4.997703298253406e-05, |
| "loss": 0.2437, |
| "num_input_tokens_seen": 42536, |
| "step": 65 |
| }, |
| { |
| "epoch": 1.2280701754385965, |
| "grad_norm": 0.5775801539421082, |
| "learning_rate": 4.993252512887069e-05, |
| "loss": 0.2792, |
| "num_input_tokens_seen": 45608, |
| "step": 70 |
| }, |
| { |
| "epoch": 1.3157894736842106, |
| "grad_norm": 2.4865057468414307, |
| "learning_rate": 4.986464304284091e-05, |
| "loss": 0.1133, |
| "num_input_tokens_seen": 49352, |
| "step": 75 |
| }, |
| { |
| "epoch": 1.4035087719298245, |
| "grad_norm": 8.858036041259766, |
| "learning_rate": 4.977345036387331e-05, |
| "loss": 0.2279, |
| "num_input_tokens_seen": 52328, |
| "step": 80 |
| }, |
| { |
| "epoch": 1.4912280701754386, |
| "grad_norm": 2.412734270095825, |
| "learning_rate": 4.965903258506806e-05, |
| "loss": 0.1677, |
| "num_input_tokens_seen": 56328, |
| "step": 85 |
| }, |
| { |
| "epoch": 1.526315789473684, |
| "eval_loss": 0.1670335978269577, |
| "eval_runtime": 0.6305, |
| "eval_samples_per_second": 39.651, |
| "eval_steps_per_second": 11.102, |
| "num_input_tokens_seen": 57288, |
| "step": 87 |
| }, |
| { |
| "epoch": 1.5789473684210527, |
| "grad_norm": 6.0660223960876465, |
| "learning_rate": 4.952149697304716e-05, |
| "loss": 0.3608, |
| "num_input_tokens_seen": 59048, |
| "step": 90 |
| }, |
| { |
| "epoch": 1.6666666666666665, |
| "grad_norm": 0.5810133218765259, |
| "learning_rate": 4.9360972467392056e-05, |
| "loss": 0.051, |
| "num_input_tokens_seen": 62504, |
| "step": 95 |
| }, |
| { |
| "epoch": 1.7543859649122808, |
| "grad_norm": 0.18257273733615875, |
| "learning_rate": 4.917760955976277e-05, |
| "loss": 0.1483, |
| "num_input_tokens_seen": 65832, |
| "step": 100 |
| }, |
| { |
| "epoch": 1.8421052631578947, |
| "grad_norm": 5.053678035736084, |
| "learning_rate": 4.897158015281209e-05, |
| "loss": 0.2659, |
| "num_input_tokens_seen": 68808, |
| "step": 105 |
| }, |
| { |
| "epoch": 1.9298245614035088, |
| "grad_norm": 0.13577021658420563, |
| "learning_rate": 4.874307739902689e-05, |
| "loss": 0.1155, |
| "num_input_tokens_seen": 71848, |
| "step": 110 |
| }, |
| { |
| "epoch": 2.017543859649123, |
| "grad_norm": 2.2010562419891357, |
| "learning_rate": 4.849231551964771e-05, |
| "loss": 0.0877, |
| "num_input_tokens_seen": 74040, |
| "step": 115 |
| }, |
| { |
| "epoch": 2.0350877192982457, |
| "eval_loss": 0.15614792704582214, |
| "eval_runtime": 0.6288, |
| "eval_samples_per_second": 39.757, |
| "eval_steps_per_second": 11.132, |
| "num_input_tokens_seen": 74520, |
| "step": 116 |
| }, |
| { |
| "epoch": 2.1052631578947367, |
| "grad_norm": 5.3920722007751465, |
| "learning_rate": 4.821952960383649e-05, |
| "loss": 0.0371, |
| "num_input_tokens_seen": 77272, |
| "step": 120 |
| }, |
| { |
| "epoch": 2.192982456140351, |
| "grad_norm": 0.12697073817253113, |
| "learning_rate": 4.7924975388280524e-05, |
| "loss": 0.0994, |
| "num_input_tokens_seen": 80280, |
| "step": 125 |
| }, |
| { |
| "epoch": 2.280701754385965, |
| "grad_norm": 0.2578938603401184, |
| "learning_rate": 4.760892901743944e-05, |
| "loss": 0.0593, |
| "num_input_tokens_seen": 83480, |
| "step": 130 |
| }, |
| { |
| "epoch": 2.3684210526315788, |
| "grad_norm": 3.375352621078491, |
| "learning_rate": 4.727168678465988e-05, |
| "loss": 0.2741, |
| "num_input_tokens_seen": 86232, |
| "step": 135 |
| }, |
| { |
| "epoch": 2.456140350877193, |
| "grad_norm": 4.711446285247803, |
| "learning_rate": 4.6913564854400595e-05, |
| "loss": 0.1137, |
| "num_input_tokens_seen": 89656, |
| "step": 140 |
| }, |
| { |
| "epoch": 2.543859649122807, |
| "grad_norm": 0.10915286839008331, |
| "learning_rate": 4.6534898965828405e-05, |
| "loss": 0.5623, |
| "num_input_tokens_seen": 93080, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.543859649122807, |
| "eval_loss": 0.16362537443637848, |
| "eval_runtime": 0.6244, |
| "eval_samples_per_second": 40.038, |
| "eval_steps_per_second": 11.211, |
| "num_input_tokens_seen": 93080, |
| "step": 145 |
| }, |
| { |
| "epoch": 2.6315789473684212, |
| "grad_norm": 4.58852481842041, |
| "learning_rate": 4.613604411806285e-05, |
| "loss": 0.1534, |
| "num_input_tokens_seen": 96440, |
| "step": 150 |
| }, |
| { |
| "epoch": 2.719298245614035, |
| "grad_norm": 6.296114921569824, |
| "learning_rate": 4.5717374237364665e-05, |
| "loss": 0.0729, |
| "num_input_tokens_seen": 100280, |
| "step": 155 |
| }, |
| { |
| "epoch": 2.807017543859649, |
| "grad_norm": 0.1973930299282074, |
| "learning_rate": 4.5279281826580056e-05, |
| "loss": 0.1374, |
| "num_input_tokens_seen": 103512, |
| "step": 160 |
| }, |
| { |
| "epoch": 2.8947368421052633, |
| "grad_norm": 2.7350099086761475, |
| "learning_rate": 4.482217759716946e-05, |
| "loss": 0.1528, |
| "num_input_tokens_seen": 106168, |
| "step": 165 |
| }, |
| { |
| "epoch": 2.982456140350877, |
| "grad_norm": 0.7484534978866577, |
| "learning_rate": 4.434649008416565e-05, |
| "loss": 0.1167, |
| "num_input_tokens_seen": 109624, |
| "step": 170 |
| }, |
| { |
| "epoch": 3.0526315789473686, |
| "eval_loss": 0.1527228057384491, |
| "eval_runtime": 0.6297, |
| "eval_samples_per_second": 39.7, |
| "eval_steps_per_second": 11.116, |
| "num_input_tokens_seen": 111928, |
| "step": 174 |
| }, |
| { |
| "epoch": 3.0701754385964914, |
| "grad_norm": 2.574030876159668, |
| "learning_rate": 4.385266524442241e-05, |
| "loss": 0.1149, |
| "num_input_tokens_seen": 112472, |
| "step": 175 |
| }, |
| { |
| "epoch": 3.1578947368421053, |
| "grad_norm": 3.066100597381592, |
| "learning_rate": 4.334116603853007e-05, |
| "loss": 0.0558, |
| "num_input_tokens_seen": 115576, |
| "step": 180 |
| }, |
| { |
| "epoch": 3.245614035087719, |
| "grad_norm": 0.6165286898612976, |
| "learning_rate": 4.2812471996790206e-05, |
| "loss": 0.0348, |
| "num_input_tokens_seen": 119000, |
| "step": 185 |
| }, |
| { |
| "epoch": 3.3333333333333335, |
| "grad_norm": 3.5882608890533447, |
| "learning_rate": 4.226707876965611e-05, |
| "loss": 0.027, |
| "num_input_tokens_seen": 122360, |
| "step": 190 |
| }, |
| { |
| "epoch": 3.4210526315789473, |
| "grad_norm": 0.05624282732605934, |
| "learning_rate": 4.1705497663060767e-05, |
| "loss": 0.2526, |
| "num_input_tokens_seen": 125496, |
| "step": 195 |
| }, |
| { |
| "epoch": 3.5087719298245617, |
| "grad_norm": 4.903688430786133, |
| "learning_rate": 4.1128255159067665e-05, |
| "loss": 0.2432, |
| "num_input_tokens_seen": 128760, |
| "step": 200 |
| }, |
| { |
| "epoch": 3.56140350877193, |
| "eval_loss": 0.15741029381752014, |
| "eval_runtime": 0.625, |
| "eval_samples_per_second": 40.0, |
| "eval_steps_per_second": 11.2, |
| "num_input_tokens_seen": 131160, |
| "step": 203 |
| }, |
| { |
| "epoch": 3.5964912280701755, |
| "grad_norm": 2.2508561611175537, |
| "learning_rate": 4.053589242229412e-05, |
| "loss": 0.0905, |
| "num_input_tokens_seen": 132248, |
| "step": 205 |
| }, |
| { |
| "epoch": 3.6842105263157894, |
| "grad_norm": 4.933653831481934, |
| "learning_rate": 3.9928964792569655e-05, |
| "loss": 0.0684, |
| "num_input_tokens_seen": 135160, |
| "step": 210 |
| }, |
| { |
| "epoch": 3.7719298245614032, |
| "grad_norm": 0.29671263694763184, |
| "learning_rate": 3.930804126430513e-05, |
| "loss": 0.2413, |
| "num_input_tokens_seen": 138200, |
| "step": 215 |
| }, |
| { |
| "epoch": 3.8596491228070176, |
| "grad_norm": 1.6507714986801147, |
| "learning_rate": 3.867370395306068e-05, |
| "loss": 0.0422, |
| "num_input_tokens_seen": 141752, |
| "step": 220 |
| }, |
| { |
| "epoch": 3.9473684210526314, |
| "grad_norm": 3.61600923538208, |
| "learning_rate": 3.8026547549812665e-05, |
| "loss": 0.138, |
| "num_input_tokens_seen": 146008, |
| "step": 225 |
| }, |
| { |
| "epoch": 4.035087719298246, |
| "grad_norm": 0.08082824945449829, |
| "learning_rate": 3.736717876343106e-05, |
| "loss": 0.1046, |
| "num_input_tokens_seen": 148648, |
| "step": 230 |
| }, |
| { |
| "epoch": 4.0701754385964914, |
| "eval_loss": 0.15744303166866302, |
| "eval_runtime": 0.6266, |
| "eval_samples_per_second": 39.9, |
| "eval_steps_per_second": 11.172, |
| "num_input_tokens_seen": 150056, |
| "step": 232 |
| }, |
| { |
| "epoch": 4.12280701754386, |
| "grad_norm": 3.7673180103302, |
| "learning_rate": 3.66962157518902e-05, |
| "loss": 0.1353, |
| "num_input_tokens_seen": 151656, |
| "step": 235 |
| }, |
| { |
| "epoch": 4.2105263157894735, |
| "grad_norm": 8.22121810913086, |
| "learning_rate": 3.601428754274584e-05, |
| "loss": 0.2446, |
| "num_input_tokens_seen": 154280, |
| "step": 240 |
| }, |
| { |
| "epoch": 4.298245614035087, |
| "grad_norm": 2.0769782066345215, |
| "learning_rate": 3.532203344342212e-05, |
| "loss": 0.1445, |
| "num_input_tokens_seen": 157160, |
| "step": 245 |
| }, |
| { |
| "epoch": 4.385964912280702, |
| "grad_norm": 3.6512463092803955, |
| "learning_rate": 3.4620102441861143e-05, |
| "loss": 0.113, |
| "num_input_tokens_seen": 160072, |
| "step": 250 |
| }, |
| { |
| "epoch": 4.473684210526316, |
| "grad_norm": 0.4918835759162903, |
| "learning_rate": 3.390915259809696e-05, |
| "loss": 0.0421, |
| "num_input_tokens_seen": 163752, |
| "step": 255 |
| }, |
| { |
| "epoch": 4.56140350877193, |
| "grad_norm": 1.2423542737960815, |
| "learning_rate": 3.318985042732461e-05, |
| "loss": 0.0209, |
| "num_input_tokens_seen": 166600, |
| "step": 260 |
| }, |
| { |
| "epoch": 4.578947368421053, |
| "eval_loss": 0.16166813671588898, |
| "eval_runtime": 0.6244, |
| "eval_samples_per_second": 40.039, |
| "eval_steps_per_second": 11.211, |
| "num_input_tokens_seen": 167208, |
| "step": 261 |
| }, |
| { |
| "epoch": 4.649122807017544, |
| "grad_norm": 0.30374425649642944, |
| "learning_rate": 3.246287027504237e-05, |
| "loss": 0.1004, |
| "num_input_tokens_seen": 169704, |
| "step": 265 |
| }, |
| { |
| "epoch": 4.7368421052631575, |
| "grad_norm": 5.962034702301025, |
| "learning_rate": 3.172889368485311e-05, |
| "loss": 0.1107, |
| "num_input_tokens_seen": 173000, |
| "step": 270 |
| }, |
| { |
| "epoch": 4.824561403508772, |
| "grad_norm": 0.9729006886482239, |
| "learning_rate": 3.0988608759517475e-05, |
| "loss": 0.1636, |
| "num_input_tokens_seen": 177128, |
| "step": 275 |
| }, |
| { |
| "epoch": 4.912280701754386, |
| "grad_norm": 2.61875319480896, |
| "learning_rate": 3.0242709515857758e-05, |
| "loss": 0.0385, |
| "num_input_tokens_seen": 180680, |
| "step": 280 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.008530108258128166, |
| "learning_rate": 2.949189523411747e-05, |
| "loss": 0.0176, |
| "num_input_tokens_seen": 183280, |
| "step": 285 |
| }, |
| { |
| "epoch": 5.087719298245614, |
| "grad_norm": 2.3798036575317383, |
| "learning_rate": 2.8736869802386364e-05, |
| "loss": 0.0522, |
| "num_input_tokens_seen": 186160, |
| "step": 290 |
| }, |
| { |
| "epoch": 5.087719298245614, |
| "eval_loss": 0.15992014110088348, |
| "eval_runtime": 0.6279, |
| "eval_samples_per_second": 39.817, |
| "eval_steps_per_second": 11.149, |
| "num_input_tokens_seen": 186160, |
| "step": 290 |
| }, |
| { |
| "epoch": 5.175438596491228, |
| "grad_norm": 4.368771076202393, |
| "learning_rate": 2.797834105670559e-05, |
| "loss": 0.0762, |
| "num_input_tokens_seen": 189584, |
| "step": 295 |
| }, |
| { |
| "epoch": 5.2631578947368425, |
| "grad_norm": 2.463073492050171, |
| "learning_rate": 2.7217020117471793e-05, |
| "loss": 0.0862, |
| "num_input_tokens_seen": 193296, |
| "step": 300 |
| }, |
| { |
| "epoch": 5.350877192982456, |
| "grad_norm": 3.5835349559783936, |
| "learning_rate": 2.6453620722761896e-05, |
| "loss": 0.1214, |
| "num_input_tokens_seen": 197328, |
| "step": 305 |
| }, |
| { |
| "epoch": 5.43859649122807, |
| "grad_norm": 0.493797242641449, |
| "learning_rate": 2.5688858559204053e-05, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 200560, |
| "step": 310 |
| }, |
| { |
| "epoch": 5.526315789473684, |
| "grad_norm": 0.5083137154579163, |
| "learning_rate": 2.492345059102164e-05, |
| "loss": 0.0172, |
| "num_input_tokens_seen": 203632, |
| "step": 315 |
| }, |
| { |
| "epoch": 5.5964912280701755, |
| "eval_loss": 0.1626342236995697, |
| "eval_runtime": 0.6255, |
| "eval_samples_per_second": 39.968, |
| "eval_steps_per_second": 11.191, |
| "num_input_tokens_seen": 206000, |
| "step": 319 |
| }, |
| { |
| "epoch": 5.614035087719298, |
| "grad_norm": 3.622645854949951, |
| "learning_rate": 2.4158114387879616e-05, |
| "loss": 0.1651, |
| "num_input_tokens_seen": 206480, |
| "step": 320 |
| }, |
| { |
| "epoch": 5.701754385964913, |
| "grad_norm": 2.6484100818634033, |
| "learning_rate": 2.3393567452163252e-05, |
| "loss": 0.2633, |
| "num_input_tokens_seen": 209232, |
| "step": 325 |
| }, |
| { |
| "epoch": 5.7894736842105265, |
| "grad_norm": 3.7363364696502686, |
| "learning_rate": 2.2630526546319914e-05, |
| "loss": 0.1361, |
| "num_input_tokens_seen": 213168, |
| "step": 330 |
| }, |
| { |
| "epoch": 5.87719298245614, |
| "grad_norm": 3.393617630004883, |
| "learning_rate": 2.186970702089457e-05, |
| "loss": 0.109, |
| "num_input_tokens_seen": 216752, |
| "step": 335 |
| }, |
| { |
| "epoch": 5.964912280701754, |
| "grad_norm": 0.14371931552886963, |
| "learning_rate": 2.111182214388893e-05, |
| "loss": 0.0477, |
| "num_input_tokens_seen": 219536, |
| "step": 340 |
| }, |
| { |
| "epoch": 6.052631578947368, |
| "grad_norm": 0.11867135018110275, |
| "learning_rate": 2.0357582432072957e-05, |
| "loss": 0.1588, |
| "num_input_tokens_seen": 222272, |
| "step": 345 |
| }, |
| { |
| "epoch": 6.105263157894737, |
| "eval_loss": 0.15937849879264832, |
| "eval_runtime": 0.619, |
| "eval_samples_per_second": 40.389, |
| "eval_steps_per_second": 11.309, |
| "num_input_tokens_seen": 224064, |
| "step": 348 |
| }, |
| { |
| "epoch": 6.140350877192983, |
| "grad_norm": 1.5079829692840576, |
| "learning_rate": 1.9607694984875754e-05, |
| "loss": 0.1563, |
| "num_input_tokens_seen": 225664, |
| "step": 350 |
| }, |
| { |
| "epoch": 6.228070175438597, |
| "grad_norm": 3.4089200496673584, |
| "learning_rate": 1.8862862821480025e-05, |
| "loss": 0.1281, |
| "num_input_tokens_seen": 229408, |
| "step": 355 |
| }, |
| { |
| "epoch": 6.315789473684211, |
| "grad_norm": 1.7254390716552734, |
| "learning_rate": 1.8123784221741964e-05, |
| "loss": 0.2271, |
| "num_input_tokens_seen": 232832, |
| "step": 360 |
| }, |
| { |
| "epoch": 6.4035087719298245, |
| "grad_norm": 0.4033445715904236, |
| "learning_rate": 1.73911520715541e-05, |
| "loss": 0.0431, |
| "num_input_tokens_seen": 236288, |
| "step": 365 |
| }, |
| { |
| "epoch": 6.491228070175438, |
| "grad_norm": 0.5499050617218018, |
| "learning_rate": 1.666565321326512e-05, |
| "loss": 0.0094, |
| "num_input_tokens_seen": 239296, |
| "step": 370 |
| }, |
| { |
| "epoch": 6.578947368421053, |
| "grad_norm": 2.251795768737793, |
| "learning_rate": 1.5947967801765345e-05, |
| "loss": 0.1067, |
| "num_input_tokens_seen": 242848, |
| "step": 375 |
| }, |
| { |
| "epoch": 6.614035087719298, |
| "eval_loss": 0.16075924038887024, |
| "eval_runtime": 0.6242, |
| "eval_samples_per_second": 40.05, |
| "eval_steps_per_second": 11.214, |
| "num_input_tokens_seen": 243840, |
| "step": 377 |
| }, |
| { |
| "epoch": 6.666666666666667, |
| "grad_norm": 0.15847764909267426, |
| "learning_rate": 1.5238768666841907e-05, |
| "loss": 0.1003, |
| "num_input_tokens_seen": 245344, |
| "step": 380 |
| }, |
| { |
| "epoch": 6.754385964912281, |
| "grad_norm": 0.2301883101463318, |
| "learning_rate": 1.4538720682400969e-05, |
| "loss": 0.0209, |
| "num_input_tokens_seen": 248256, |
| "step": 385 |
| }, |
| { |
| "epoch": 6.842105263157895, |
| "grad_norm": 0.41148290038108826, |
| "learning_rate": 1.3848480143148839e-05, |
| "loss": 0.0439, |
| "num_input_tokens_seen": 250976, |
| "step": 390 |
| }, |
| { |
| "epoch": 6.9298245614035086, |
| "grad_norm": 0.8497050404548645, |
| "learning_rate": 1.3168694149315796e-05, |
| "loss": 0.0176, |
| "num_input_tokens_seen": 254944, |
| "step": 395 |
| }, |
| { |
| "epoch": 7.017543859649122, |
| "grad_norm": 0.3716928958892822, |
| "learning_rate": 1.2500000000000006e-05, |
| "loss": 0.0392, |
| "num_input_tokens_seen": 257888, |
| "step": 400 |
| }, |
| { |
| "epoch": 7.105263157894737, |
| "grad_norm": 0.051173288375139236, |
| "learning_rate": 1.1843024595699805e-05, |
| "loss": 0.0126, |
| "num_input_tokens_seen": 261088, |
| "step": 405 |
| }, |
| { |
| "epoch": 7.12280701754386, |
| "eval_loss": 0.16655094921588898, |
| "eval_runtime": 0.6224, |
| "eval_samples_per_second": 40.167, |
| "eval_steps_per_second": 11.247, |
| "num_input_tokens_seen": 261504, |
| "step": 406 |
| }, |
| { |
| "epoch": 7.192982456140351, |
| "grad_norm": 0.2367202639579773, |
| "learning_rate": 1.1198383850594758e-05, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 263840, |
| "step": 410 |
| }, |
| { |
| "epoch": 7.280701754385965, |
| "grad_norm": 6.449048042297363, |
| "learning_rate": 1.0566682115126344e-05, |
| "loss": 0.1585, |
| "num_input_tokens_seen": 267424, |
| "step": 415 |
| }, |
| { |
| "epoch": 7.368421052631579, |
| "grad_norm": 3.6663174629211426, |
| "learning_rate": 9.948511609419675e-06, |
| "loss": 0.1895, |
| "num_input_tokens_seen": 270720, |
| "step": 420 |
| }, |
| { |
| "epoch": 7.456140350877193, |
| "grad_norm": 0.47472232580184937, |
| "learning_rate": 9.344451868077353e-06, |
| "loss": 0.1807, |
| "num_input_tokens_seen": 273504, |
| "step": 425 |
| }, |
| { |
| "epoch": 7.543859649122807, |
| "grad_norm": 2.3296167850494385, |
| "learning_rate": 8.755069196866014e-06, |
| "loss": 0.0791, |
| "num_input_tokens_seen": 276608, |
| "step": 430 |
| }, |
| { |
| "epoch": 7.631578947368421, |
| "grad_norm": 0.2966596782207489, |
| "learning_rate": 8.180916141804906e-06, |
| "loss": 0.1272, |
| "num_input_tokens_seen": 280352, |
| "step": 435 |
| }, |
| { |
| "epoch": 7.631578947368421, |
| "eval_loss": 0.1653868407011032, |
| "eval_runtime": 0.6224, |
| "eval_samples_per_second": 40.168, |
| "eval_steps_per_second": 11.247, |
| "num_input_tokens_seen": 280352, |
| "step": 435 |
| }, |
| { |
| "epoch": 7.719298245614035, |
| "grad_norm": 0.9520754814147949, |
| "learning_rate": 7.622530971154199e-06, |
| "loss": 0.0304, |
| "num_input_tokens_seen": 283808, |
| "step": 440 |
| }, |
| { |
| "epoch": 7.807017543859649, |
| "grad_norm": 2.3160340785980225, |
| "learning_rate": 7.080437170788723e-06, |
| "loss": 0.0661, |
| "num_input_tokens_seen": 286688, |
| "step": 445 |
| }, |
| { |
| "epoch": 7.894736842105263, |
| "grad_norm": 0.5872668027877808, |
| "learning_rate": 6.555142953430158e-06, |
| "loss": 0.0959, |
| "num_input_tokens_seen": 290240, |
| "step": 450 |
| }, |
| { |
| "epoch": 7.982456140350877, |
| "grad_norm": 1.039448618888855, |
| "learning_rate": 6.0471407821978135e-06, |
| "loss": 0.0529, |
| "num_input_tokens_seen": 293568, |
| "step": 455 |
| }, |
| { |
| "epoch": 8.070175438596491, |
| "grad_norm": 0.09243475645780563, |
| "learning_rate": 5.556906908924655e-06, |
| "loss": 0.0081, |
| "num_input_tokens_seen": 296720, |
| "step": 460 |
| }, |
| { |
| "epoch": 8.140350877192983, |
| "eval_loss": 0.16728512942790985, |
| "eval_runtime": 0.6258, |
| "eval_samples_per_second": 39.951, |
| "eval_steps_per_second": 11.186, |
| "num_input_tokens_seen": 299344, |
| "step": 464 |
| }, |
| { |
| "epoch": 8.157894736842104, |
| "grad_norm": 0.158346489071846, |
| "learning_rate": 5.084900927671393e-06, |
| "loss": 0.1124, |
| "num_input_tokens_seen": 300112, |
| "step": 465 |
| }, |
| { |
| "epoch": 8.24561403508772, |
| "grad_norm": 4.965975761413574, |
| "learning_rate": 4.631565343857239e-06, |
| "loss": 0.0589, |
| "num_input_tokens_seen": 303664, |
| "step": 470 |
| }, |
| { |
| "epoch": 8.333333333333334, |
| "grad_norm": 0.4916265308856964, |
| "learning_rate": 4.19732515941125e-06, |
| "loss": 0.0644, |
| "num_input_tokens_seen": 306480, |
| "step": 475 |
| }, |
| { |
| "epoch": 8.421052631578947, |
| "grad_norm": 3.4331796169281006, |
| "learning_rate": 3.7825874743331907e-06, |
| "loss": 0.1192, |
| "num_input_tokens_seen": 309744, |
| "step": 480 |
| }, |
| { |
| "epoch": 8.508771929824562, |
| "grad_norm": 0.8256147503852844, |
| "learning_rate": 3.3877411050374424e-06, |
| "loss": 0.076, |
| "num_input_tokens_seen": 313360, |
| "step": 485 |
| }, |
| { |
| "epoch": 8.596491228070175, |
| "grad_norm": 0.791651725769043, |
| "learning_rate": 3.013156219837776e-06, |
| "loss": 0.2357, |
| "num_input_tokens_seen": 316784, |
| "step": 490 |
| }, |
| { |
| "epoch": 8.649122807017545, |
| "eval_loss": 0.1685744822025299, |
| "eval_runtime": 0.6236, |
| "eval_samples_per_second": 40.088, |
| "eval_steps_per_second": 11.225, |
| "num_input_tokens_seen": 318672, |
| "step": 493 |
| }, |
| { |
| "epoch": 8.68421052631579, |
| "grad_norm": 0.21202699840068817, |
| "learning_rate": 2.659183991914696e-06, |
| "loss": 0.0072, |
| "num_input_tokens_seen": 319760, |
| "step": 495 |
| }, |
| { |
| "epoch": 8.771929824561404, |
| "grad_norm": 0.08041726052761078, |
| "learning_rate": 2.326156270090735e-06, |
| "loss": 0.0537, |
| "num_input_tokens_seen": 322512, |
| "step": 500 |
| }, |
| { |
| "epoch": 8.859649122807017, |
| "grad_norm": 0.1982102245092392, |
| "learning_rate": 2.0143852677223075e-06, |
| "loss": 0.0531, |
| "num_input_tokens_seen": 325936, |
| "step": 505 |
| }, |
| { |
| "epoch": 8.947368421052632, |
| "grad_norm": 0.7879493236541748, |
| "learning_rate": 1.7241632699998123e-06, |
| "loss": 0.0596, |
| "num_input_tokens_seen": 329360, |
| "step": 510 |
| }, |
| { |
| "epoch": 9.035087719298245, |
| "grad_norm": 0.848185658454895, |
| "learning_rate": 1.4557623599303903e-06, |
| "loss": 0.1259, |
| "num_input_tokens_seen": 332168, |
| "step": 515 |
| }, |
| { |
| "epoch": 9.12280701754386, |
| "grad_norm": 0.2810516059398651, |
| "learning_rate": 1.2094341632602064e-06, |
| "loss": 0.0518, |
| "num_input_tokens_seen": 336296, |
| "step": 520 |
| }, |
| { |
| "epoch": 9.157894736842104, |
| "eval_loss": 0.16631467640399933, |
| "eval_runtime": 0.6255, |
| "eval_samples_per_second": 39.967, |
| "eval_steps_per_second": 11.191, |
| "num_input_tokens_seen": 337480, |
| "step": 522 |
| }, |
| { |
| "epoch": 9.210526315789474, |
| "grad_norm": 0.24005332589149475, |
| "learning_rate": 9.85409612575411e-07, |
| "loss": 0.1521, |
| "num_input_tokens_seen": 339080, |
| "step": 525 |
| }, |
| { |
| "epoch": 9.298245614035087, |
| "grad_norm": 0.04881466552615166, |
| "learning_rate": 7.838987308029427e-07, |
| "loss": 0.0416, |
| "num_input_tokens_seen": 342568, |
| "step": 530 |
| }, |
| { |
| "epoch": 9.385964912280702, |
| "grad_norm": 0.5010470151901245, |
| "learning_rate": 6.050904343141095e-07, |
| "loss": 0.0651, |
| "num_input_tokens_seen": 345576, |
| "step": 535 |
| }, |
| { |
| "epoch": 9.473684210526315, |
| "grad_norm": 0.8071462512016296, |
| "learning_rate": 4.491523558155714e-07, |
| "loss": 0.0202, |
| "num_input_tokens_seen": 348392, |
| "step": 540 |
| }, |
| { |
| "epoch": 9.56140350877193, |
| "grad_norm": 1.663230299949646, |
| "learning_rate": 3.162306871937387e-07, |
| "loss": 0.0309, |
| "num_input_tokens_seen": 351912, |
| "step": 545 |
| }, |
| { |
| "epoch": 9.649122807017545, |
| "grad_norm": 0.24659748375415802, |
| "learning_rate": 2.064500424599436e-07, |
| "loss": 0.0621, |
| "num_input_tokens_seen": 355432, |
| "step": 550 |
| }, |
| { |
| "epoch": 9.666666666666666, |
| "eval_loss": 0.16462185978889465, |
| "eval_runtime": 0.6213, |
| "eval_samples_per_second": 40.239, |
| "eval_steps_per_second": 11.267, |
| "num_input_tokens_seen": 356456, |
| "step": 551 |
| }, |
| { |
| "epoch": 9.736842105263158, |
| "grad_norm": 0.29752203822135925, |
| "learning_rate": 1.1991334092484318e-07, |
| "loss": 0.0782, |
| "num_input_tokens_seen": 358760, |
| "step": 555 |
| }, |
| { |
| "epoch": 9.824561403508772, |
| "grad_norm": 0.029567603021860123, |
| "learning_rate": 5.6701710711626334e-08, |
| "loss": 0.1437, |
| "num_input_tokens_seen": 362088, |
| "step": 560 |
| }, |
| { |
| "epoch": 9.912280701754385, |
| "grad_norm": 0.5893417596817017, |
| "learning_rate": 1.6874412698408836e-08, |
| "loss": 0.2064, |
| "num_input_tokens_seen": 365544, |
| "step": 565 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.15692667663097382, |
| "learning_rate": 4.687849611939576e-10, |
| "loss": 0.1124, |
| "num_input_tokens_seen": 367864, |
| "step": 570 |
| }, |
| { |
| "epoch": 10.0, |
| "num_input_tokens_seen": 367864, |
| "step": 570, |
| "total_flos": 1.6565337297911808e+16, |
| "train_loss": 0.17970421045626464, |
| "train_runtime": 155.0387, |
| "train_samples_per_second": 14.513, |
| "train_steps_per_second": 3.677 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 570, |
| "num_input_tokens_seen": 367864, |
| "num_train_epochs": 10, |
| "save_steps": 29, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.6565337297911808e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|