| { |
| "best_global_step": 948, |
| "best_metric": 0.07719651609659195, |
| "best_model_checkpoint": "saves/p-tuning/llama-3-8b-instruct/train_svamp_1754652179/checkpoint-948", |
| "epoch": 10.0, |
| "eval_steps": 79, |
| "global_step": 1580, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.03164556962025317, |
| "grad_norm": 57.304134368896484, |
| "learning_rate": 1.2658227848101265e-06, |
| "loss": 2.5423, |
| "num_input_tokens_seen": 2272, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.06329113924050633, |
| "grad_norm": 31.999879837036133, |
| "learning_rate": 2.848101265822785e-06, |
| "loss": 2.316, |
| "num_input_tokens_seen": 4640, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0949367088607595, |
| "grad_norm": 29.859228134155273, |
| "learning_rate": 4.430379746835443e-06, |
| "loss": 2.0954, |
| "num_input_tokens_seen": 6848, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.12658227848101267, |
| "grad_norm": 30.689695358276367, |
| "learning_rate": 6.012658227848101e-06, |
| "loss": 1.8232, |
| "num_input_tokens_seen": 9056, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.15822784810126583, |
| "grad_norm": 16.21649169921875, |
| "learning_rate": 7.5949367088607605e-06, |
| "loss": 1.5523, |
| "num_input_tokens_seen": 11296, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.189873417721519, |
| "grad_norm": 31.955402374267578, |
| "learning_rate": 9.177215189873418e-06, |
| "loss": 1.2851, |
| "num_input_tokens_seen": 13568, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.22151898734177214, |
| "grad_norm": 43.333091735839844, |
| "learning_rate": 1.0759493670886076e-05, |
| "loss": 1.0306, |
| "num_input_tokens_seen": 15840, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.25316455696202533, |
| "grad_norm": 32.48151397705078, |
| "learning_rate": 1.2341772151898735e-05, |
| "loss": 0.7859, |
| "num_input_tokens_seen": 18240, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.2848101265822785, |
| "grad_norm": 30.37523651123047, |
| "learning_rate": 1.3924050632911393e-05, |
| "loss": 0.4745, |
| "num_input_tokens_seen": 20480, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.31645569620253167, |
| "grad_norm": 57.03865051269531, |
| "learning_rate": 1.550632911392405e-05, |
| "loss": 0.3407, |
| "num_input_tokens_seen": 22624, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.34810126582278483, |
| "grad_norm": 36.00380325317383, |
| "learning_rate": 1.7088607594936708e-05, |
| "loss": 0.2401, |
| "num_input_tokens_seen": 24928, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.379746835443038, |
| "grad_norm": 21.761754989624023, |
| "learning_rate": 1.8670886075949368e-05, |
| "loss": 0.279, |
| "num_input_tokens_seen": 27264, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.41139240506329117, |
| "grad_norm": 15.638097763061523, |
| "learning_rate": 2.0253164556962025e-05, |
| "loss": 0.232, |
| "num_input_tokens_seen": 29504, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.4430379746835443, |
| "grad_norm": 16.34490966796875, |
| "learning_rate": 2.1835443037974685e-05, |
| "loss": 0.4193, |
| "num_input_tokens_seen": 31776, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.47468354430379744, |
| "grad_norm": 38.84067153930664, |
| "learning_rate": 2.341772151898734e-05, |
| "loss": 0.2917, |
| "num_input_tokens_seen": 34016, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.5, |
| "eval_loss": 0.26484471559524536, |
| "eval_runtime": 1.7772, |
| "eval_samples_per_second": 39.387, |
| "eval_steps_per_second": 10.128, |
| "num_input_tokens_seen": 35776, |
| "step": 79 |
| }, |
| { |
| "epoch": 0.5063291139240507, |
| "grad_norm": 29.9153995513916, |
| "learning_rate": 2.5e-05, |
| "loss": 0.2394, |
| "num_input_tokens_seen": 36224, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.5379746835443038, |
| "grad_norm": 45.06348419189453, |
| "learning_rate": 2.6582278481012658e-05, |
| "loss": 0.1805, |
| "num_input_tokens_seen": 38400, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.569620253164557, |
| "grad_norm": 223.73489379882812, |
| "learning_rate": 2.8164556962025318e-05, |
| "loss": 0.3283, |
| "num_input_tokens_seen": 40704, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.6012658227848101, |
| "grad_norm": 12.492780685424805, |
| "learning_rate": 2.9746835443037974e-05, |
| "loss": 0.1008, |
| "num_input_tokens_seen": 42848, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.6329113924050633, |
| "grad_norm": 156.92747497558594, |
| "learning_rate": 3.132911392405064e-05, |
| "loss": 0.4571, |
| "num_input_tokens_seen": 45056, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.6645569620253164, |
| "grad_norm": 14.881486892700195, |
| "learning_rate": 3.291139240506329e-05, |
| "loss": 0.3661, |
| "num_input_tokens_seen": 47296, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.6962025316455697, |
| "grad_norm": 9.662066459655762, |
| "learning_rate": 3.449367088607595e-05, |
| "loss": 0.1226, |
| "num_input_tokens_seen": 49536, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.7278481012658228, |
| "grad_norm": 17.59064483642578, |
| "learning_rate": 3.607594936708861e-05, |
| "loss": 0.221, |
| "num_input_tokens_seen": 51744, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.759493670886076, |
| "grad_norm": 14.502843856811523, |
| "learning_rate": 3.765822784810127e-05, |
| "loss": 0.1247, |
| "num_input_tokens_seen": 53888, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.7911392405063291, |
| "grad_norm": 94.63813018798828, |
| "learning_rate": 3.924050632911392e-05, |
| "loss": 0.2585, |
| "num_input_tokens_seen": 56128, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.8227848101265823, |
| "grad_norm": 12.14871597290039, |
| "learning_rate": 4.0822784810126584e-05, |
| "loss": 0.1239, |
| "num_input_tokens_seen": 58400, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.8544303797468354, |
| "grad_norm": 19.46076202392578, |
| "learning_rate": 4.240506329113924e-05, |
| "loss": 0.2201, |
| "num_input_tokens_seen": 60640, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.8860759493670886, |
| "grad_norm": 35.38789749145508, |
| "learning_rate": 4.3987341772151904e-05, |
| "loss": 0.2173, |
| "num_input_tokens_seen": 62880, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.9177215189873418, |
| "grad_norm": 24.81549835205078, |
| "learning_rate": 4.556962025316456e-05, |
| "loss": 0.2943, |
| "num_input_tokens_seen": 65088, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.9493670886075949, |
| "grad_norm": 25.282819747924805, |
| "learning_rate": 4.715189873417722e-05, |
| "loss": 0.2326, |
| "num_input_tokens_seen": 67296, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.9810126582278481, |
| "grad_norm": 62.82623291015625, |
| "learning_rate": 4.8734177215189874e-05, |
| "loss": 0.1311, |
| "num_input_tokens_seen": 69632, |
| "step": 155 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.6598067879676819, |
| "eval_runtime": 1.7725, |
| "eval_samples_per_second": 39.491, |
| "eval_steps_per_second": 10.155, |
| "num_input_tokens_seen": 70672, |
| "step": 158 |
| }, |
| { |
| "epoch": 1.0126582278481013, |
| "grad_norm": 59.08744430541992, |
| "learning_rate": 4.999993898868453e-05, |
| "loss": 0.3077, |
| "num_input_tokens_seen": 71504, |
| "step": 160 |
| }, |
| { |
| "epoch": 1.0443037974683544, |
| "grad_norm": 12.225666046142578, |
| "learning_rate": 4.999780362391087e-05, |
| "loss": 0.2211, |
| "num_input_tokens_seen": 73744, |
| "step": 165 |
| }, |
| { |
| "epoch": 1.0759493670886076, |
| "grad_norm": 19.726652145385742, |
| "learning_rate": 4.999261799114754e-05, |
| "loss": 0.2402, |
| "num_input_tokens_seen": 75888, |
| "step": 170 |
| }, |
| { |
| "epoch": 1.1075949367088607, |
| "grad_norm": 9.605252265930176, |
| "learning_rate": 4.9984382723152935e-05, |
| "loss": 0.4507, |
| "num_input_tokens_seen": 78288, |
| "step": 175 |
| }, |
| { |
| "epoch": 1.139240506329114, |
| "grad_norm": 11.414168357849121, |
| "learning_rate": 4.99730988248063e-05, |
| "loss": 0.3113, |
| "num_input_tokens_seen": 80496, |
| "step": 180 |
| }, |
| { |
| "epoch": 1.1708860759493671, |
| "grad_norm": 2.0183305740356445, |
| "learning_rate": 4.9958767672985176e-05, |
| "loss": 0.1182, |
| "num_input_tokens_seen": 82800, |
| "step": 185 |
| }, |
| { |
| "epoch": 1.2025316455696202, |
| "grad_norm": 3.184451103210449, |
| "learning_rate": 4.994139101639732e-05, |
| "loss": 0.0623, |
| "num_input_tokens_seen": 84976, |
| "step": 190 |
| }, |
| { |
| "epoch": 1.2341772151898733, |
| "grad_norm": 3.47723388671875, |
| "learning_rate": 4.99209709753674e-05, |
| "loss": 0.1357, |
| "num_input_tokens_seen": 87280, |
| "step": 195 |
| }, |
| { |
| "epoch": 1.2658227848101267, |
| "grad_norm": 3.638543128967285, |
| "learning_rate": 4.989751004157822e-05, |
| "loss": 0.12, |
| "num_input_tokens_seen": 89584, |
| "step": 200 |
| }, |
| { |
| "epoch": 1.2974683544303798, |
| "grad_norm": 2.2222177982330322, |
| "learning_rate": 4.98710110777667e-05, |
| "loss": 0.0584, |
| "num_input_tokens_seen": 91856, |
| "step": 205 |
| }, |
| { |
| "epoch": 1.3291139240506329, |
| "grad_norm": 26.8077392578125, |
| "learning_rate": 4.984147731737455e-05, |
| "loss": 0.2019, |
| "num_input_tokens_seen": 94064, |
| "step": 210 |
| }, |
| { |
| "epoch": 1.360759493670886, |
| "grad_norm": 7.961758136749268, |
| "learning_rate": 4.980891236415376e-05, |
| "loss": 0.2216, |
| "num_input_tokens_seen": 96304, |
| "step": 215 |
| }, |
| { |
| "epoch": 1.3924050632911391, |
| "grad_norm": 45.82145690917969, |
| "learning_rate": 4.9773320191726776e-05, |
| "loss": 0.1724, |
| "num_input_tokens_seen": 98576, |
| "step": 220 |
| }, |
| { |
| "epoch": 1.4240506329113924, |
| "grad_norm": 13.437238693237305, |
| "learning_rate": 4.973470514310175e-05, |
| "loss": 0.2237, |
| "num_input_tokens_seen": 100688, |
| "step": 225 |
| }, |
| { |
| "epoch": 1.4556962025316456, |
| "grad_norm": 4.555171489715576, |
| "learning_rate": 4.969307193014248e-05, |
| "loss": 0.2799, |
| "num_input_tokens_seen": 102864, |
| "step": 230 |
| }, |
| { |
| "epoch": 1.4873417721518987, |
| "grad_norm": 0.40021178126335144, |
| "learning_rate": 4.964842563299359e-05, |
| "loss": 0.0572, |
| "num_input_tokens_seen": 104944, |
| "step": 235 |
| }, |
| { |
| "epoch": 1.5, |
| "eval_loss": 0.14671920239925385, |
| "eval_runtime": 1.7757, |
| "eval_samples_per_second": 39.421, |
| "eval_steps_per_second": 10.137, |
| "num_input_tokens_seen": 105904, |
| "step": 237 |
| }, |
| { |
| "epoch": 1.518987341772152, |
| "grad_norm": 2.4955692291259766, |
| "learning_rate": 4.960077169946052e-05, |
| "loss": 0.1011, |
| "num_input_tokens_seen": 107248, |
| "step": 240 |
| }, |
| { |
| "epoch": 1.5506329113924051, |
| "grad_norm": 0.9815319180488586, |
| "learning_rate": 4.9550115944344855e-05, |
| "loss": 0.0578, |
| "num_input_tokens_seen": 109360, |
| "step": 245 |
| }, |
| { |
| "epoch": 1.5822784810126582, |
| "grad_norm": 46.03337860107422, |
| "learning_rate": 4.949646454873477e-05, |
| "loss": 0.1606, |
| "num_input_tokens_seen": 111568, |
| "step": 250 |
| }, |
| { |
| "epoch": 1.6139240506329116, |
| "grad_norm": 9.345287322998047, |
| "learning_rate": 4.9439824059250794e-05, |
| "loss": 0.2063, |
| "num_input_tokens_seen": 113936, |
| "step": 255 |
| }, |
| { |
| "epoch": 1.6455696202531644, |
| "grad_norm": 27.278141021728516, |
| "learning_rate": 4.938020138724702e-05, |
| "loss": 0.2002, |
| "num_input_tokens_seen": 116304, |
| "step": 260 |
| }, |
| { |
| "epoch": 1.6772151898734178, |
| "grad_norm": 3.0424633026123047, |
| "learning_rate": 4.9317603807967705e-05, |
| "loss": 0.1304, |
| "num_input_tokens_seen": 118608, |
| "step": 265 |
| }, |
| { |
| "epoch": 1.7088607594936709, |
| "grad_norm": 3.5982022285461426, |
| "learning_rate": 4.925203895965963e-05, |
| "loss": 0.128, |
| "num_input_tokens_seen": 120784, |
| "step": 270 |
| }, |
| { |
| "epoch": 1.740506329113924, |
| "grad_norm": 2.8051130771636963, |
| "learning_rate": 4.918351484263997e-05, |
| "loss": 0.12, |
| "num_input_tokens_seen": 122960, |
| "step": 275 |
| }, |
| { |
| "epoch": 1.7721518987341773, |
| "grad_norm": 5.42772102355957, |
| "learning_rate": 4.911203981832013e-05, |
| "loss": 0.0565, |
| "num_input_tokens_seen": 125232, |
| "step": 280 |
| }, |
| { |
| "epoch": 1.8037974683544302, |
| "grad_norm": 5.5666937828063965, |
| "learning_rate": 4.903762260818551e-05, |
| "loss": 0.2118, |
| "num_input_tokens_seen": 127504, |
| "step": 285 |
| }, |
| { |
| "epoch": 1.8354430379746836, |
| "grad_norm": 2.1527256965637207, |
| "learning_rate": 4.896027229273123e-05, |
| "loss": 0.1114, |
| "num_input_tokens_seen": 129776, |
| "step": 290 |
| }, |
| { |
| "epoch": 1.8670886075949367, |
| "grad_norm": 2.752041816711426, |
| "learning_rate": 4.887999831035414e-05, |
| "loss": 0.1067, |
| "num_input_tokens_seen": 131984, |
| "step": 295 |
| }, |
| { |
| "epoch": 1.8987341772151898, |
| "grad_norm": 5.814232349395752, |
| "learning_rate": 4.879681045620115e-05, |
| "loss": 0.2103, |
| "num_input_tokens_seen": 134288, |
| "step": 300 |
| }, |
| { |
| "epoch": 1.9303797468354431, |
| "grad_norm": 6.459428310394287, |
| "learning_rate": 4.8710718880974e-05, |
| "loss": 0.1299, |
| "num_input_tokens_seen": 136496, |
| "step": 305 |
| }, |
| { |
| "epoch": 1.9620253164556962, |
| "grad_norm": 11.473212242126465, |
| "learning_rate": 4.862173408969066e-05, |
| "loss": 0.0624, |
| "num_input_tokens_seen": 138896, |
| "step": 310 |
| }, |
| { |
| "epoch": 1.9936708860759493, |
| "grad_norm": 25.566421508789062, |
| "learning_rate": 4.852986694040347e-05, |
| "loss": 0.1331, |
| "num_input_tokens_seen": 141136, |
| "step": 315 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.15266959369182587, |
| "eval_runtime": 1.7734, |
| "eval_samples_per_second": 39.471, |
| "eval_steps_per_second": 10.15, |
| "num_input_tokens_seen": 141328, |
| "step": 316 |
| }, |
| { |
| "epoch": 2.0253164556962027, |
| "grad_norm": 2.049661159515381, |
| "learning_rate": 4.843512864287425e-05, |
| "loss": 0.1069, |
| "num_input_tokens_seen": 143152, |
| "step": 320 |
| }, |
| { |
| "epoch": 2.0569620253164556, |
| "grad_norm": 1.3843505382537842, |
| "learning_rate": 4.833753075720647e-05, |
| "loss": 0.0599, |
| "num_input_tokens_seen": 145328, |
| "step": 325 |
| }, |
| { |
| "epoch": 2.088607594936709, |
| "grad_norm": 1.9298903942108154, |
| "learning_rate": 4.8237085192434676e-05, |
| "loss": 0.0544, |
| "num_input_tokens_seen": 147632, |
| "step": 330 |
| }, |
| { |
| "epoch": 2.1202531645569622, |
| "grad_norm": 1.791174054145813, |
| "learning_rate": 4.8133804205071285e-05, |
| "loss": 0.0435, |
| "num_input_tokens_seen": 149680, |
| "step": 335 |
| }, |
| { |
| "epoch": 2.151898734177215, |
| "grad_norm": 14.78237533569336, |
| "learning_rate": 4.802770039761108e-05, |
| "loss": 0.216, |
| "num_input_tokens_seen": 151952, |
| "step": 340 |
| }, |
| { |
| "epoch": 2.1835443037974684, |
| "grad_norm": 0.16718049347400665, |
| "learning_rate": 4.791878671699342e-05, |
| "loss": 0.1604, |
| "num_input_tokens_seen": 154224, |
| "step": 345 |
| }, |
| { |
| "epoch": 2.2151898734177213, |
| "grad_norm": 0.8592212796211243, |
| "learning_rate": 4.7807076453022425e-05, |
| "loss": 0.0732, |
| "num_input_tokens_seen": 156400, |
| "step": 350 |
| }, |
| { |
| "epoch": 2.2468354430379747, |
| "grad_norm": 1.9021673202514648, |
| "learning_rate": 4.7692583236745334e-05, |
| "loss": 0.2025, |
| "num_input_tokens_seen": 158576, |
| "step": 355 |
| }, |
| { |
| "epoch": 2.278481012658228, |
| "grad_norm": 3.9820752143859863, |
| "learning_rate": 4.757532103878925e-05, |
| "loss": 0.1819, |
| "num_input_tokens_seen": 160944, |
| "step": 360 |
| }, |
| { |
| "epoch": 2.310126582278481, |
| "grad_norm": 0.6237077116966248, |
| "learning_rate": 4.745530416765641e-05, |
| "loss": 0.1606, |
| "num_input_tokens_seen": 163248, |
| "step": 365 |
| }, |
| { |
| "epoch": 2.3417721518987342, |
| "grad_norm": 5.74051570892334, |
| "learning_rate": 4.733254726797821e-05, |
| "loss": 0.1306, |
| "num_input_tokens_seen": 165584, |
| "step": 370 |
| }, |
| { |
| "epoch": 2.3734177215189876, |
| "grad_norm": 0.508920431137085, |
| "learning_rate": 4.72070653187283e-05, |
| "loss": 0.0429, |
| "num_input_tokens_seen": 167824, |
| "step": 375 |
| }, |
| { |
| "epoch": 2.4050632911392404, |
| "grad_norm": 2.0854220390319824, |
| "learning_rate": 4.707887363139479e-05, |
| "loss": 0.1268, |
| "num_input_tokens_seen": 170000, |
| "step": 380 |
| }, |
| { |
| "epoch": 2.4367088607594938, |
| "grad_norm": 0.6447995901107788, |
| "learning_rate": 4.6947987848111955e-05, |
| "loss": 0.0904, |
| "num_input_tokens_seen": 172208, |
| "step": 385 |
| }, |
| { |
| "epoch": 2.4683544303797467, |
| "grad_norm": 2.10355806350708, |
| "learning_rate": 4.68144239397515e-05, |
| "loss": 0.0635, |
| "num_input_tokens_seen": 174480, |
| "step": 390 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 2.1555063724517822, |
| "learning_rate": 4.667819820397382e-05, |
| "loss": 0.1217, |
| "num_input_tokens_seen": 176752, |
| "step": 395 |
| }, |
| { |
| "epoch": 2.5, |
| "eval_loss": 0.09895941615104675, |
| "eval_runtime": 1.778, |
| "eval_samples_per_second": 39.371, |
| "eval_steps_per_second": 10.124, |
| "num_input_tokens_seen": 176752, |
| "step": 395 |
| }, |
| { |
| "epoch": 2.5316455696202533, |
| "grad_norm": 3.2891459465026855, |
| "learning_rate": 4.653932726323935e-05, |
| "loss": 0.1123, |
| "num_input_tokens_seen": 178864, |
| "step": 400 |
| }, |
| { |
| "epoch": 2.5632911392405062, |
| "grad_norm": 0.8346641063690186, |
| "learning_rate": 4.639782806278021e-05, |
| "loss": 0.0482, |
| "num_input_tokens_seen": 181104, |
| "step": 405 |
| }, |
| { |
| "epoch": 2.5949367088607596, |
| "grad_norm": 16.31033706665039, |
| "learning_rate": 4.625371786853258e-05, |
| "loss": 0.2108, |
| "num_input_tokens_seen": 183472, |
| "step": 410 |
| }, |
| { |
| "epoch": 2.6265822784810124, |
| "grad_norm": 1.3053120374679565, |
| "learning_rate": 4.610701426502984e-05, |
| "loss": 0.1201, |
| "num_input_tokens_seen": 185776, |
| "step": 415 |
| }, |
| { |
| "epoch": 2.6582278481012658, |
| "grad_norm": 1.6082241535186768, |
| "learning_rate": 4.5957735153256915e-05, |
| "loss": 0.1283, |
| "num_input_tokens_seen": 188080, |
| "step": 420 |
| }, |
| { |
| "epoch": 2.689873417721519, |
| "grad_norm": 1.7989009618759155, |
| "learning_rate": 4.5805898748465955e-05, |
| "loss": 0.0637, |
| "num_input_tokens_seen": 190320, |
| "step": 425 |
| }, |
| { |
| "epoch": 2.721518987341772, |
| "grad_norm": 2.346414804458618, |
| "learning_rate": 4.565152357795368e-05, |
| "loss": 0.1731, |
| "num_input_tokens_seen": 192496, |
| "step": 430 |
| }, |
| { |
| "epoch": 2.7531645569620253, |
| "grad_norm": 0.5701020359992981, |
| "learning_rate": 4.549462847880066e-05, |
| "loss": 0.0451, |
| "num_input_tokens_seen": 194704, |
| "step": 435 |
| }, |
| { |
| "epoch": 2.7848101265822782, |
| "grad_norm": 2.4175257682800293, |
| "learning_rate": 4.53352325955728e-05, |
| "loss": 0.0796, |
| "num_input_tokens_seen": 196976, |
| "step": 440 |
| }, |
| { |
| "epoch": 2.8164556962025316, |
| "grad_norm": 1.1178758144378662, |
| "learning_rate": 4.517335537798526e-05, |
| "loss": 0.1103, |
| "num_input_tokens_seen": 199312, |
| "step": 445 |
| }, |
| { |
| "epoch": 2.848101265822785, |
| "grad_norm": 1.3870896100997925, |
| "learning_rate": 4.5009016578529194e-05, |
| "loss": 0.072, |
| "num_input_tokens_seen": 201584, |
| "step": 450 |
| }, |
| { |
| "epoch": 2.879746835443038, |
| "grad_norm": 1.876051664352417, |
| "learning_rate": 4.4842236250061535e-05, |
| "loss": 0.13, |
| "num_input_tokens_seen": 203760, |
| "step": 455 |
| }, |
| { |
| "epoch": 2.911392405063291, |
| "grad_norm": 2.2049477100372314, |
| "learning_rate": 4.467303474335809e-05, |
| "loss": 0.083, |
| "num_input_tokens_seen": 205968, |
| "step": 460 |
| }, |
| { |
| "epoch": 2.9430379746835444, |
| "grad_norm": 1.2528291940689087, |
| "learning_rate": 4.4501432704630305e-05, |
| "loss": 0.1533, |
| "num_input_tokens_seen": 208240, |
| "step": 465 |
| }, |
| { |
| "epoch": 2.9746835443037973, |
| "grad_norm": 1.5981637239456177, |
| "learning_rate": 4.432745107300603e-05, |
| "loss": 0.0748, |
| "num_input_tokens_seen": 210416, |
| "step": 470 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.10051123797893524, |
| "eval_runtime": 1.7672, |
| "eval_samples_per_second": 39.611, |
| "eval_steps_per_second": 10.186, |
| "num_input_tokens_seen": 211808, |
| "step": 474 |
| }, |
| { |
| "epoch": 3.0063291139240507, |
| "grad_norm": 2.3567254543304443, |
| "learning_rate": 4.415111107797445e-05, |
| "loss": 0.0729, |
| "num_input_tokens_seen": 212288, |
| "step": 475 |
| }, |
| { |
| "epoch": 3.037974683544304, |
| "grad_norm": 2.7346839904785156, |
| "learning_rate": 4.3972434236795656e-05, |
| "loss": 0.1298, |
| "num_input_tokens_seen": 214400, |
| "step": 480 |
| }, |
| { |
| "epoch": 3.069620253164557, |
| "grad_norm": 0.6904764771461487, |
| "learning_rate": 4.379144235187505e-05, |
| "loss": 0.0523, |
| "num_input_tokens_seen": 216736, |
| "step": 485 |
| }, |
| { |
| "epoch": 3.1012658227848102, |
| "grad_norm": 1.1353236436843872, |
| "learning_rate": 4.360815750810302e-05, |
| "loss": 0.0871, |
| "num_input_tokens_seen": 218976, |
| "step": 490 |
| }, |
| { |
| "epoch": 3.132911392405063, |
| "grad_norm": 1.1411477327346802, |
| "learning_rate": 4.342260207016012e-05, |
| "loss": 0.0731, |
| "num_input_tokens_seen": 221216, |
| "step": 495 |
| }, |
| { |
| "epoch": 3.1645569620253164, |
| "grad_norm": 0.23975726962089539, |
| "learning_rate": 4.3234798679788016e-05, |
| "loss": 0.076, |
| "num_input_tokens_seen": 223520, |
| "step": 500 |
| }, |
| { |
| "epoch": 3.1962025316455698, |
| "grad_norm": 3.357827663421631, |
| "learning_rate": 4.304477025302681e-05, |
| "loss": 0.0982, |
| "num_input_tokens_seen": 225824, |
| "step": 505 |
| }, |
| { |
| "epoch": 3.2278481012658227, |
| "grad_norm": 1.5692389011383057, |
| "learning_rate": 4.285253997741875e-05, |
| "loss": 0.0788, |
| "num_input_tokens_seen": 228000, |
| "step": 510 |
| }, |
| { |
| "epoch": 3.259493670886076, |
| "grad_norm": 0.5077994465827942, |
| "learning_rate": 4.2658131309178805e-05, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 230144, |
| "step": 515 |
| }, |
| { |
| "epoch": 3.291139240506329, |
| "grad_norm": 1.5440895557403564, |
| "learning_rate": 4.246156797033261e-05, |
| "loss": 0.0495, |
| "num_input_tokens_seen": 232512, |
| "step": 520 |
| }, |
| { |
| "epoch": 3.3227848101265822, |
| "grad_norm": 0.6975259780883789, |
| "learning_rate": 4.2262873945821754e-05, |
| "loss": 0.0446, |
| "num_input_tokens_seen": 234752, |
| "step": 525 |
| }, |
| { |
| "epoch": 3.3544303797468356, |
| "grad_norm": 1.3985381126403809, |
| "learning_rate": 4.206207348057721e-05, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 236960, |
| "step": 530 |
| }, |
| { |
| "epoch": 3.3860759493670884, |
| "grad_norm": 0.12185825407505035, |
| "learning_rate": 4.1859191076560844e-05, |
| "loss": 0.0489, |
| "num_input_tokens_seen": 239200, |
| "step": 535 |
| }, |
| { |
| "epoch": 3.4177215189873418, |
| "grad_norm": 0.8711693286895752, |
| "learning_rate": 4.165425148977571e-05, |
| "loss": 0.0636, |
| "num_input_tokens_seen": 241344, |
| "step": 540 |
| }, |
| { |
| "epoch": 3.449367088607595, |
| "grad_norm": 1.7671948671340942, |
| "learning_rate": 4.144727972724524e-05, |
| "loss": 0.1113, |
| "num_input_tokens_seen": 243648, |
| "step": 545 |
| }, |
| { |
| "epoch": 3.481012658227848, |
| "grad_norm": 3.0171236991882324, |
| "learning_rate": 4.123830104396189e-05, |
| "loss": 0.0843, |
| "num_input_tokens_seen": 245792, |
| "step": 550 |
| }, |
| { |
| "epoch": 3.5, |
| "eval_loss": 0.08688981086015701, |
| "eval_runtime": 1.7725, |
| "eval_samples_per_second": 39.492, |
| "eval_steps_per_second": 10.155, |
| "num_input_tokens_seen": 247104, |
| "step": 553 |
| }, |
| { |
| "epoch": 3.5126582278481013, |
| "grad_norm": 0.3095618486404419, |
| "learning_rate": 4.10273409398055e-05, |
| "loss": 0.1233, |
| "num_input_tokens_seen": 248000, |
| "step": 555 |
| }, |
| { |
| "epoch": 3.5443037974683547, |
| "grad_norm": 1.4407626390457153, |
| "learning_rate": 4.0814425156431713e-05, |
| "loss": 0.0679, |
| "num_input_tokens_seen": 250144, |
| "step": 560 |
| }, |
| { |
| "epoch": 3.5759493670886076, |
| "grad_norm": 0.7778800129890442, |
| "learning_rate": 4.0599579674130964e-05, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 252288, |
| "step": 565 |
| }, |
| { |
| "epoch": 3.607594936708861, |
| "grad_norm": 1.3945868015289307, |
| "learning_rate": 4.0382830708658316e-05, |
| "loss": 0.0948, |
| "num_input_tokens_seen": 254496, |
| "step": 570 |
| }, |
| { |
| "epoch": 3.6392405063291138, |
| "grad_norm": 0.7456892132759094, |
| "learning_rate": 4.016420470803464e-05, |
| "loss": 0.0113, |
| "num_input_tokens_seen": 256672, |
| "step": 575 |
| }, |
| { |
| "epoch": 3.670886075949367, |
| "grad_norm": 1.2941418886184692, |
| "learning_rate": 3.9943728349319286e-05, |
| "loss": 0.0971, |
| "num_input_tokens_seen": 258944, |
| "step": 580 |
| }, |
| { |
| "epoch": 3.7025316455696204, |
| "grad_norm": 0.5528960824012756, |
| "learning_rate": 3.972142853535499e-05, |
| "loss": 0.0909, |
| "num_input_tokens_seen": 261216, |
| "step": 585 |
| }, |
| { |
| "epoch": 3.7341772151898733, |
| "grad_norm": 1.220745325088501, |
| "learning_rate": 3.949733239148512e-05, |
| "loss": 0.1046, |
| "num_input_tokens_seen": 263552, |
| "step": 590 |
| }, |
| { |
| "epoch": 3.7658227848101267, |
| "grad_norm": 1.580295443534851, |
| "learning_rate": 3.927146726224379e-05, |
| "loss": 0.0776, |
| "num_input_tokens_seen": 265792, |
| "step": 595 |
| }, |
| { |
| "epoch": 3.7974683544303796, |
| "grad_norm": 1.8577332496643066, |
| "learning_rate": 3.9043860708019273e-05, |
| "loss": 0.0815, |
| "num_input_tokens_seen": 267968, |
| "step": 600 |
| }, |
| { |
| "epoch": 3.829113924050633, |
| "grad_norm": 2.401762008666992, |
| "learning_rate": 3.881454050169102e-05, |
| "loss": 0.0598, |
| "num_input_tokens_seen": 270112, |
| "step": 605 |
| }, |
| { |
| "epoch": 3.8607594936708862, |
| "grad_norm": 3.1154701709747314, |
| "learning_rate": 3.8583534625240786e-05, |
| "loss": 0.1975, |
| "num_input_tokens_seen": 272352, |
| "step": 610 |
| }, |
| { |
| "epoch": 3.892405063291139, |
| "grad_norm": 1.4128812551498413, |
| "learning_rate": 3.835087126633821e-05, |
| "loss": 0.1122, |
| "num_input_tokens_seen": 274688, |
| "step": 615 |
| }, |
| { |
| "epoch": 3.9240506329113924, |
| "grad_norm": 1.2004035711288452, |
| "learning_rate": 3.8116578814901315e-05, |
| "loss": 0.1368, |
| "num_input_tokens_seen": 276864, |
| "step": 620 |
| }, |
| { |
| "epoch": 3.9556962025316453, |
| "grad_norm": 0.5005134344100952, |
| "learning_rate": 3.788068585963238e-05, |
| "loss": 0.0475, |
| "num_input_tokens_seen": 279104, |
| "step": 625 |
| }, |
| { |
| "epoch": 3.9873417721518987, |
| "grad_norm": 0.2701457440853119, |
| "learning_rate": 3.764322118452943e-05, |
| "loss": 0.1127, |
| "num_input_tokens_seen": 281344, |
| "step": 630 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.08055847138166428, |
| "eval_runtime": 1.77, |
| "eval_samples_per_second": 39.549, |
| "eval_steps_per_second": 10.17, |
| "num_input_tokens_seen": 282048, |
| "step": 632 |
| }, |
| { |
| "epoch": 4.018987341772152, |
| "grad_norm": 0.7883828282356262, |
| "learning_rate": 3.740421376537402e-05, |
| "loss": 0.0492, |
| "num_input_tokens_seen": 283456, |
| "step": 635 |
| }, |
| { |
| "epoch": 4.050632911392405, |
| "grad_norm": 2.189941883087158, |
| "learning_rate": 3.7163692766195554e-05, |
| "loss": 0.0655, |
| "num_input_tokens_seen": 285600, |
| "step": 640 |
| }, |
| { |
| "epoch": 4.082278481012658, |
| "grad_norm": 0.2087947428226471, |
| "learning_rate": 3.6921687535712656e-05, |
| "loss": 0.0519, |
| "num_input_tokens_seen": 287808, |
| "step": 645 |
| }, |
| { |
| "epoch": 4.113924050632911, |
| "grad_norm": 1.8172799348831177, |
| "learning_rate": 3.667822760375198e-05, |
| "loss": 0.0551, |
| "num_input_tokens_seen": 290048, |
| "step": 650 |
| }, |
| { |
| "epoch": 4.1455696202531644, |
| "grad_norm": 0.24938087165355682, |
| "learning_rate": 3.643334267764495e-05, |
| "loss": 0.011, |
| "num_input_tokens_seen": 292320, |
| "step": 655 |
| }, |
| { |
| "epoch": 4.177215189873418, |
| "grad_norm": 1.2110445499420166, |
| "learning_rate": 3.618706263860284e-05, |
| "loss": 0.0557, |
| "num_input_tokens_seen": 294560, |
| "step": 660 |
| }, |
| { |
| "epoch": 4.208860759493671, |
| "grad_norm": 0.8580341339111328, |
| "learning_rate": 3.5939417538070595e-05, |
| "loss": 0.0788, |
| "num_input_tokens_seen": 296832, |
| "step": 665 |
| }, |
| { |
| "epoch": 4.2405063291139244, |
| "grad_norm": 0.9276352524757385, |
| "learning_rate": 3.569043759405995e-05, |
| "loss": 0.0306, |
| "num_input_tokens_seen": 299168, |
| "step": 670 |
| }, |
| { |
| "epoch": 4.272151898734177, |
| "grad_norm": 1.2783339023590088, |
| "learning_rate": 3.5440153187462145e-05, |
| "loss": 0.0951, |
| "num_input_tokens_seen": 301312, |
| "step": 675 |
| }, |
| { |
| "epoch": 4.30379746835443, |
| "grad_norm": 1.2173526287078857, |
| "learning_rate": 3.518859485834082e-05, |
| "loss": 0.0305, |
| "num_input_tokens_seen": 303520, |
| "step": 680 |
| }, |
| { |
| "epoch": 4.3354430379746836, |
| "grad_norm": 8.077330589294434, |
| "learning_rate": 3.49357933022055e-05, |
| "loss": 0.1058, |
| "num_input_tokens_seen": 305792, |
| "step": 685 |
| }, |
| { |
| "epoch": 4.367088607594937, |
| "grad_norm": 0.9942968487739563, |
| "learning_rate": 3.468177936626603e-05, |
| "loss": 0.0481, |
| "num_input_tokens_seen": 307968, |
| "step": 690 |
| }, |
| { |
| "epoch": 4.39873417721519, |
| "grad_norm": 0.5724031329154968, |
| "learning_rate": 3.442658404566861e-05, |
| "loss": 0.056, |
| "num_input_tokens_seen": 310240, |
| "step": 695 |
| }, |
| { |
| "epoch": 4.430379746835443, |
| "grad_norm": 0.6137570738792419, |
| "learning_rate": 3.417023847971368e-05, |
| "loss": 0.0666, |
| "num_input_tokens_seen": 312416, |
| "step": 700 |
| }, |
| { |
| "epoch": 4.462025316455696, |
| "grad_norm": 0.5140513181686401, |
| "learning_rate": 3.3912773948056274e-05, |
| "loss": 0.1092, |
| "num_input_tokens_seen": 314656, |
| "step": 705 |
| }, |
| { |
| "epoch": 4.493670886075949, |
| "grad_norm": 1.4817256927490234, |
| "learning_rate": 3.3654221866889254e-05, |
| "loss": 0.0631, |
| "num_input_tokens_seen": 316768, |
| "step": 710 |
| }, |
| { |
| "epoch": 4.5, |
| "eval_loss": 0.08643601834774017, |
| "eval_runtime": 1.784, |
| "eval_samples_per_second": 39.237, |
| "eval_steps_per_second": 10.09, |
| "num_input_tokens_seen": 317248, |
| "step": 711 |
| }, |
| { |
| "epoch": 4.525316455696203, |
| "grad_norm": 0.3269573748111725, |
| "learning_rate": 3.339461378510982e-05, |
| "loss": 0.0729, |
| "num_input_tokens_seen": 318912, |
| "step": 715 |
| }, |
| { |
| "epoch": 4.556962025316456, |
| "grad_norm": 1.8290832042694092, |
| "learning_rate": 3.313398138046987e-05, |
| "loss": 0.1263, |
| "num_input_tokens_seen": 321312, |
| "step": 720 |
| }, |
| { |
| "epoch": 4.588607594936709, |
| "grad_norm": 1.4354349374771118, |
| "learning_rate": 3.287235645571071e-05, |
| "loss": 0.1048, |
| "num_input_tokens_seen": 323488, |
| "step": 725 |
| }, |
| { |
| "epoch": 4.620253164556962, |
| "grad_norm": 1.9681675434112549, |
| "learning_rate": 3.26097709346823e-05, |
| "loss": 0.0817, |
| "num_input_tokens_seen": 325824, |
| "step": 730 |
| }, |
| { |
| "epoch": 4.651898734177215, |
| "grad_norm": 1.6938735246658325, |
| "learning_rate": 3.234625685844803e-05, |
| "loss": 0.0464, |
| "num_input_tokens_seen": 328064, |
| "step": 735 |
| }, |
| { |
| "epoch": 4.6835443037974684, |
| "grad_norm": 2.0596401691436768, |
| "learning_rate": 3.208184638137484e-05, |
| "loss": 0.1184, |
| "num_input_tokens_seen": 330208, |
| "step": 740 |
| }, |
| { |
| "epoch": 4.715189873417722, |
| "grad_norm": 0.32809868454933167, |
| "learning_rate": 3.181657176720987e-05, |
| "loss": 0.0494, |
| "num_input_tokens_seen": 332512, |
| "step": 745 |
| }, |
| { |
| "epoch": 4.746835443037975, |
| "grad_norm": 1.6838154792785645, |
| "learning_rate": 3.1550465385143455e-05, |
| "loss": 0.1421, |
| "num_input_tokens_seen": 334752, |
| "step": 750 |
| }, |
| { |
| "epoch": 4.7784810126582276, |
| "grad_norm": 0.052991144359111786, |
| "learning_rate": 3.128355970585949e-05, |
| "loss": 0.0342, |
| "num_input_tokens_seen": 337024, |
| "step": 755 |
| }, |
| { |
| "epoch": 4.810126582278481, |
| "grad_norm": 0.853191077709198, |
| "learning_rate": 3.101588729757323e-05, |
| "loss": 0.0806, |
| "num_input_tokens_seen": 339264, |
| "step": 760 |
| }, |
| { |
| "epoch": 4.841772151898734, |
| "grad_norm": 0.7384766340255737, |
| "learning_rate": 3.074748082205734e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 341536, |
| "step": 765 |
| }, |
| { |
| "epoch": 4.8734177215189876, |
| "grad_norm": 1.546053171157837, |
| "learning_rate": 3.0478373030656405e-05, |
| "loss": 0.0363, |
| "num_input_tokens_seen": 343872, |
| "step": 770 |
| }, |
| { |
| "epoch": 4.905063291139241, |
| "grad_norm": 2.8107569217681885, |
| "learning_rate": 3.0208596760290587e-05, |
| "loss": 0.0848, |
| "num_input_tokens_seen": 346176, |
| "step": 775 |
| }, |
| { |
| "epoch": 4.936708860759493, |
| "grad_norm": 0.4796382486820221, |
| "learning_rate": 2.993818492944882e-05, |
| "loss": 0.0604, |
| "num_input_tokens_seen": 348384, |
| "step": 780 |
| }, |
| { |
| "epoch": 4.968354430379747, |
| "grad_norm": 0.3333726227283478, |
| "learning_rate": 2.9667170534172035e-05, |
| "loss": 0.0462, |
| "num_input_tokens_seen": 350560, |
| "step": 785 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.2007966786623001, |
| "learning_rate": 2.9395586644026945e-05, |
| "loss": 0.0826, |
| "num_input_tokens_seen": 352592, |
| "step": 790 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.08093634247779846, |
| "eval_runtime": 1.7738, |
| "eval_samples_per_second": 39.464, |
| "eval_steps_per_second": 10.148, |
| "num_input_tokens_seen": 352592, |
| "step": 790 |
| }, |
| { |
| "epoch": 5.031645569620253, |
| "grad_norm": 0.4830271005630493, |
| "learning_rate": 2.9123466398070852e-05, |
| "loss": 0.0523, |
| "num_input_tokens_seen": 354736, |
| "step": 795 |
| }, |
| { |
| "epoch": 5.063291139240507, |
| "grad_norm": 0.9163768291473389, |
| "learning_rate": 2.8850843000807964e-05, |
| "loss": 0.0531, |
| "num_input_tokens_seen": 357072, |
| "step": 800 |
| }, |
| { |
| "epoch": 5.094936708860759, |
| "grad_norm": 1.5045676231384277, |
| "learning_rate": 2.8577749718137724e-05, |
| "loss": 0.0262, |
| "num_input_tokens_seen": 359376, |
| "step": 805 |
| }, |
| { |
| "epoch": 5.1265822784810124, |
| "grad_norm": 0.9244067668914795, |
| "learning_rate": 2.830421987329569e-05, |
| "loss": 0.0479, |
| "num_input_tokens_seen": 361744, |
| "step": 810 |
| }, |
| { |
| "epoch": 5.158227848101266, |
| "grad_norm": 0.04344862326979637, |
| "learning_rate": 2.803028684278735e-05, |
| "loss": 0.1163, |
| "num_input_tokens_seen": 363952, |
| "step": 815 |
| }, |
| { |
| "epoch": 5.189873417721519, |
| "grad_norm": 5.920812129974365, |
| "learning_rate": 2.775598405231549e-05, |
| "loss": 0.0831, |
| "num_input_tokens_seen": 366224, |
| "step": 820 |
| }, |
| { |
| "epoch": 5.2215189873417724, |
| "grad_norm": 3.040343761444092, |
| "learning_rate": 2.7481344972701545e-05, |
| "loss": 0.077, |
| "num_input_tokens_seen": 368496, |
| "step": 825 |
| }, |
| { |
| "epoch": 5.253164556962025, |
| "grad_norm": 0.3769160509109497, |
| "learning_rate": 2.7206403115801427e-05, |
| "loss": 0.039, |
| "num_input_tokens_seen": 370800, |
| "step": 830 |
| }, |
| { |
| "epoch": 5.284810126582278, |
| "grad_norm": 0.4965035021305084, |
| "learning_rate": 2.6931192030416362e-05, |
| "loss": 0.0347, |
| "num_input_tokens_seen": 373040, |
| "step": 835 |
| }, |
| { |
| "epoch": 5.3164556962025316, |
| "grad_norm": 1.6629559993743896, |
| "learning_rate": 2.6655745298199253e-05, |
| "loss": 0.067, |
| "num_input_tokens_seen": 375280, |
| "step": 840 |
| }, |
| { |
| "epoch": 5.348101265822785, |
| "grad_norm": 0.16424624621868134, |
| "learning_rate": 2.6380096529556936e-05, |
| "loss": 0.0659, |
| "num_input_tokens_seen": 377520, |
| "step": 845 |
| }, |
| { |
| "epoch": 5.379746835443038, |
| "grad_norm": 1.512812852859497, |
| "learning_rate": 2.6104279359549043e-05, |
| "loss": 0.0597, |
| "num_input_tokens_seen": 379696, |
| "step": 850 |
| }, |
| { |
| "epoch": 5.4113924050632916, |
| "grad_norm": 27.909957885742188, |
| "learning_rate": 2.582832744378377e-05, |
| "loss": 0.1124, |
| "num_input_tokens_seen": 381904, |
| "step": 855 |
| }, |
| { |
| "epoch": 5.443037974683544, |
| "grad_norm": 0.5168792605400085, |
| "learning_rate": 2.555227445431119e-05, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 384112, |
| "step": 860 |
| }, |
| { |
| "epoch": 5.474683544303797, |
| "grad_norm": 0.2397191971540451, |
| "learning_rate": 2.5276154075514547e-05, |
| "loss": 0.0446, |
| "num_input_tokens_seen": 386416, |
| "step": 865 |
| }, |
| { |
| "epoch": 5.5, |
| "eval_loss": 0.09230098128318787, |
| "eval_runtime": 1.7721, |
| "eval_samples_per_second": 39.501, |
| "eval_steps_per_second": 10.157, |
| "num_input_tokens_seen": 388176, |
| "step": 869 |
| }, |
| { |
| "epoch": 5.506329113924051, |
| "grad_norm": 0.22594130039215088, |
| "learning_rate": 2.5e-05, |
| "loss": 0.0487, |
| "num_input_tokens_seen": 388624, |
| "step": 870 |
| }, |
| { |
| "epoch": 5.537974683544304, |
| "grad_norm": 1.0183571577072144, |
| "learning_rate": 2.472384592448546e-05, |
| "loss": 0.0566, |
| "num_input_tokens_seen": 390864, |
| "step": 875 |
| }, |
| { |
| "epoch": 5.569620253164557, |
| "grad_norm": 0.8194377422332764, |
| "learning_rate": 2.4447725545688814e-05, |
| "loss": 0.0534, |
| "num_input_tokens_seen": 393104, |
| "step": 880 |
| }, |
| { |
| "epoch": 5.60126582278481, |
| "grad_norm": 0.9605931639671326, |
| "learning_rate": 2.4171672556216234e-05, |
| "loss": 0.0834, |
| "num_input_tokens_seen": 395440, |
| "step": 885 |
| }, |
| { |
| "epoch": 5.632911392405063, |
| "grad_norm": 0.7708854675292969, |
| "learning_rate": 2.3895720640450963e-05, |
| "loss": 0.0516, |
| "num_input_tokens_seen": 397776, |
| "step": 890 |
| }, |
| { |
| "epoch": 5.6645569620253164, |
| "grad_norm": 0.2176842838525772, |
| "learning_rate": 2.3619903470443067e-05, |
| "loss": 0.0432, |
| "num_input_tokens_seen": 399984, |
| "step": 895 |
| }, |
| { |
| "epoch": 5.69620253164557, |
| "grad_norm": 2.65143084526062, |
| "learning_rate": 2.3344254701800756e-05, |
| "loss": 0.0658, |
| "num_input_tokens_seen": 402256, |
| "step": 900 |
| }, |
| { |
| "epoch": 5.727848101265823, |
| "grad_norm": 0.07710819691419601, |
| "learning_rate": 2.306880796958364e-05, |
| "loss": 0.0064, |
| "num_input_tokens_seen": 404400, |
| "step": 905 |
| }, |
| { |
| "epoch": 5.759493670886076, |
| "grad_norm": 0.05985208973288536, |
| "learning_rate": 2.279359688419858e-05, |
| "loss": 0.0102, |
| "num_input_tokens_seen": 406512, |
| "step": 910 |
| }, |
| { |
| "epoch": 5.791139240506329, |
| "grad_norm": 2.6067276000976562, |
| "learning_rate": 2.2518655027298464e-05, |
| "loss": 0.0914, |
| "num_input_tokens_seen": 408752, |
| "step": 915 |
| }, |
| { |
| "epoch": 5.822784810126582, |
| "grad_norm": 0.6947869062423706, |
| "learning_rate": 2.2244015947684514e-05, |
| "loss": 0.0731, |
| "num_input_tokens_seen": 411088, |
| "step": 920 |
| }, |
| { |
| "epoch": 5.8544303797468356, |
| "grad_norm": 1.428839087486267, |
| "learning_rate": 2.1969713157212655e-05, |
| "loss": 0.061, |
| "num_input_tokens_seen": 413200, |
| "step": 925 |
| }, |
| { |
| "epoch": 5.886075949367089, |
| "grad_norm": 2.066138982772827, |
| "learning_rate": 2.169578012670431e-05, |
| "loss": 0.0674, |
| "num_input_tokens_seen": 415440, |
| "step": 930 |
| }, |
| { |
| "epoch": 5.917721518987342, |
| "grad_norm": 0.2917608618736267, |
| "learning_rate": 2.142225028186228e-05, |
| "loss": 0.0796, |
| "num_input_tokens_seen": 417744, |
| "step": 935 |
| }, |
| { |
| "epoch": 5.949367088607595, |
| "grad_norm": 1.0031896829605103, |
| "learning_rate": 2.1149156999192042e-05, |
| "loss": 0.0833, |
| "num_input_tokens_seen": 419952, |
| "step": 940 |
| }, |
| { |
| "epoch": 5.981012658227848, |
| "grad_norm": 1.1027328968048096, |
| "learning_rate": 2.087653360192915e-05, |
| "loss": 0.0774, |
| "num_input_tokens_seen": 422160, |
| "step": 945 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.07719651609659195, |
| "eval_runtime": 1.7754, |
| "eval_samples_per_second": 39.428, |
| "eval_steps_per_second": 10.139, |
| "num_input_tokens_seen": 423184, |
| "step": 948 |
| }, |
| { |
| "epoch": 6.012658227848101, |
| "grad_norm": 0.5562558174133301, |
| "learning_rate": 2.0604413355973054e-05, |
| "loss": 0.0359, |
| "num_input_tokens_seen": 424176, |
| "step": 950 |
| }, |
| { |
| "epoch": 6.044303797468355, |
| "grad_norm": 0.7119388580322266, |
| "learning_rate": 2.033282946582797e-05, |
| "loss": 0.0186, |
| "num_input_tokens_seen": 426352, |
| "step": 955 |
| }, |
| { |
| "epoch": 6.075949367088608, |
| "grad_norm": 1.0250580310821533, |
| "learning_rate": 2.0061815070551186e-05, |
| "loss": 0.045, |
| "num_input_tokens_seen": 428560, |
| "step": 960 |
| }, |
| { |
| "epoch": 6.1075949367088604, |
| "grad_norm": 1.0718591213226318, |
| "learning_rate": 1.9791403239709416e-05, |
| "loss": 0.016, |
| "num_input_tokens_seen": 430832, |
| "step": 965 |
| }, |
| { |
| "epoch": 6.139240506329114, |
| "grad_norm": 3.242114543914795, |
| "learning_rate": 1.9521626969343608e-05, |
| "loss": 0.038, |
| "num_input_tokens_seen": 432976, |
| "step": 970 |
| }, |
| { |
| "epoch": 6.170886075949367, |
| "grad_norm": 2.3887522220611572, |
| "learning_rate": 1.9252519177942667e-05, |
| "loss": 0.0514, |
| "num_input_tokens_seen": 435216, |
| "step": 975 |
| }, |
| { |
| "epoch": 6.2025316455696204, |
| "grad_norm": 2.0255088806152344, |
| "learning_rate": 1.8984112702426774e-05, |
| "loss": 0.0561, |
| "num_input_tokens_seen": 437584, |
| "step": 980 |
| }, |
| { |
| "epoch": 6.234177215189874, |
| "grad_norm": 1.4888111352920532, |
| "learning_rate": 1.8716440294140518e-05, |
| "loss": 0.0444, |
| "num_input_tokens_seen": 439760, |
| "step": 985 |
| }, |
| { |
| "epoch": 6.265822784810126, |
| "grad_norm": 0.281730592250824, |
| "learning_rate": 1.8449534614856558e-05, |
| "loss": 0.0424, |
| "num_input_tokens_seen": 442000, |
| "step": 990 |
| }, |
| { |
| "epoch": 6.2974683544303796, |
| "grad_norm": 0.588544487953186, |
| "learning_rate": 1.818342823279014e-05, |
| "loss": 0.0307, |
| "num_input_tokens_seen": 444208, |
| "step": 995 |
| }, |
| { |
| "epoch": 6.329113924050633, |
| "grad_norm": 0.21011468768119812, |
| "learning_rate": 1.7918153618625162e-05, |
| "loss": 0.0459, |
| "num_input_tokens_seen": 446480, |
| "step": 1000 |
| }, |
| { |
| "epoch": 6.360759493670886, |
| "grad_norm": 1.0189317464828491, |
| "learning_rate": 1.7653743141551983e-05, |
| "loss": 0.1302, |
| "num_input_tokens_seen": 448624, |
| "step": 1005 |
| }, |
| { |
| "epoch": 6.3924050632911396, |
| "grad_norm": 0.2481241524219513, |
| "learning_rate": 1.7390229065317696e-05, |
| "loss": 0.0243, |
| "num_input_tokens_seen": 450960, |
| "step": 1010 |
| }, |
| { |
| "epoch": 6.424050632911392, |
| "grad_norm": 0.10512924939393997, |
| "learning_rate": 1.7127643544289297e-05, |
| "loss": 0.053, |
| "num_input_tokens_seen": 453264, |
| "step": 1015 |
| }, |
| { |
| "epoch": 6.455696202531645, |
| "grad_norm": 0.8155413866043091, |
| "learning_rate": 1.6866018619530123e-05, |
| "loss": 0.0309, |
| "num_input_tokens_seen": 455472, |
| "step": 1020 |
| }, |
| { |
| "epoch": 6.487341772151899, |
| "grad_norm": 1.3210691213607788, |
| "learning_rate": 1.660538621489019e-05, |
| "loss": 0.0554, |
| "num_input_tokens_seen": 457744, |
| "step": 1025 |
| }, |
| { |
| "epoch": 6.5, |
| "eval_loss": 0.10978475958108902, |
| "eval_runtime": 1.7716, |
| "eval_samples_per_second": 39.512, |
| "eval_steps_per_second": 10.16, |
| "num_input_tokens_seen": 458640, |
| "step": 1027 |
| }, |
| { |
| "epoch": 6.518987341772152, |
| "grad_norm": 0.3908012807369232, |
| "learning_rate": 1.6345778133110752e-05, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 459984, |
| "step": 1030 |
| }, |
| { |
| "epoch": 6.550632911392405, |
| "grad_norm": 2.6782333850860596, |
| "learning_rate": 1.6087226051943728e-05, |
| "loss": 0.0748, |
| "num_input_tokens_seen": 462224, |
| "step": 1035 |
| }, |
| { |
| "epoch": 6.582278481012658, |
| "grad_norm": 0.06257260590791702, |
| "learning_rate": 1.5829761520286334e-05, |
| "loss": 0.0517, |
| "num_input_tokens_seen": 464304, |
| "step": 1040 |
| }, |
| { |
| "epoch": 6.613924050632911, |
| "grad_norm": 0.9672399759292603, |
| "learning_rate": 1.5573415954331398e-05, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 466576, |
| "step": 1045 |
| }, |
| { |
| "epoch": 6.6455696202531644, |
| "grad_norm": 1.3417441844940186, |
| "learning_rate": 1.5318220633733978e-05, |
| "loss": 0.0753, |
| "num_input_tokens_seen": 468720, |
| "step": 1050 |
| }, |
| { |
| "epoch": 6.677215189873418, |
| "grad_norm": 0.6716601252555847, |
| "learning_rate": 1.5064206697794509e-05, |
| "loss": 0.0394, |
| "num_input_tokens_seen": 470992, |
| "step": 1055 |
| }, |
| { |
| "epoch": 6.708860759493671, |
| "grad_norm": 0.5984333157539368, |
| "learning_rate": 1.481140514165919e-05, |
| "loss": 0.0369, |
| "num_input_tokens_seen": 473264, |
| "step": 1060 |
| }, |
| { |
| "epoch": 6.740506329113924, |
| "grad_norm": 1.1648204326629639, |
| "learning_rate": 1.455984681253787e-05, |
| "loss": 0.0522, |
| "num_input_tokens_seen": 475504, |
| "step": 1065 |
| }, |
| { |
| "epoch": 6.772151898734177, |
| "grad_norm": 0.014528759755194187, |
| "learning_rate": 1.4309562405940052e-05, |
| "loss": 0.0625, |
| "num_input_tokens_seen": 477776, |
| "step": 1070 |
| }, |
| { |
| "epoch": 6.80379746835443, |
| "grad_norm": 0.037693943828344345, |
| "learning_rate": 1.40605824619294e-05, |
| "loss": 0.0443, |
| "num_input_tokens_seen": 480016, |
| "step": 1075 |
| }, |
| { |
| "epoch": 6.8354430379746836, |
| "grad_norm": 0.6183596253395081, |
| "learning_rate": 1.3812937361397165e-05, |
| "loss": 0.0506, |
| "num_input_tokens_seen": 482256, |
| "step": 1080 |
| }, |
| { |
| "epoch": 6.867088607594937, |
| "grad_norm": 0.3752747178077698, |
| "learning_rate": 1.3566657322355058e-05, |
| "loss": 0.0775, |
| "num_input_tokens_seen": 484432, |
| "step": 1085 |
| }, |
| { |
| "epoch": 6.89873417721519, |
| "grad_norm": 0.7681966423988342, |
| "learning_rate": 1.3321772396248029e-05, |
| "loss": 0.0336, |
| "num_input_tokens_seen": 486608, |
| "step": 1090 |
| }, |
| { |
| "epoch": 6.930379746835443, |
| "grad_norm": 0.9100373387336731, |
| "learning_rate": 1.3078312464287353e-05, |
| "loss": 0.0285, |
| "num_input_tokens_seen": 488784, |
| "step": 1095 |
| }, |
| { |
| "epoch": 6.962025316455696, |
| "grad_norm": 3.5124988555908203, |
| "learning_rate": 1.2836307233804448e-05, |
| "loss": 0.0453, |
| "num_input_tokens_seen": 491024, |
| "step": 1100 |
| }, |
| { |
| "epoch": 6.993670886075949, |
| "grad_norm": 1.1614471673965454, |
| "learning_rate": 1.2595786234625989e-05, |
| "loss": 0.046, |
| "num_input_tokens_seen": 493232, |
| "step": 1105 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.09714015573263168, |
| "eval_runtime": 1.7702, |
| "eval_samples_per_second": 39.543, |
| "eval_steps_per_second": 10.168, |
| "num_input_tokens_seen": 493440, |
| "step": 1106 |
| }, |
| { |
| "epoch": 7.025316455696203, |
| "grad_norm": 0.7261226773262024, |
| "learning_rate": 1.2356778815470573e-05, |
| "loss": 0.0066, |
| "num_input_tokens_seen": 495200, |
| "step": 1110 |
| }, |
| { |
| "epoch": 7.056962025316456, |
| "grad_norm": 1.433316946029663, |
| "learning_rate": 1.2119314140367628e-05, |
| "loss": 0.0224, |
| "num_input_tokens_seen": 497504, |
| "step": 1115 |
| }, |
| { |
| "epoch": 7.0886075949367084, |
| "grad_norm": 0.3160828948020935, |
| "learning_rate": 1.1883421185098684e-05, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 499744, |
| "step": 1120 |
| }, |
| { |
| "epoch": 7.120253164556962, |
| "grad_norm": 5.301418304443359, |
| "learning_rate": 1.1649128733661802e-05, |
| "loss": 0.0706, |
| "num_input_tokens_seen": 501952, |
| "step": 1125 |
| }, |
| { |
| "epoch": 7.151898734177215, |
| "grad_norm": 1.4883524179458618, |
| "learning_rate": 1.1416465374759222e-05, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 504128, |
| "step": 1130 |
| }, |
| { |
| "epoch": 7.1835443037974684, |
| "grad_norm": 0.2804189920425415, |
| "learning_rate": 1.118545949830898e-05, |
| "loss": 0.0176, |
| "num_input_tokens_seen": 506368, |
| "step": 1135 |
| }, |
| { |
| "epoch": 7.215189873417722, |
| "grad_norm": 1.2443137168884277, |
| "learning_rate": 1.0956139291980727e-05, |
| "loss": 0.0342, |
| "num_input_tokens_seen": 508576, |
| "step": 1140 |
| }, |
| { |
| "epoch": 7.246835443037975, |
| "grad_norm": 0.038201794028282166, |
| "learning_rate": 1.0728532737756217e-05, |
| "loss": 0.038, |
| "num_input_tokens_seen": 510848, |
| "step": 1145 |
| }, |
| { |
| "epoch": 7.2784810126582276, |
| "grad_norm": 0.007585880812257528, |
| "learning_rate": 1.050266760851489e-05, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 513056, |
| "step": 1150 |
| }, |
| { |
| "epoch": 7.310126582278481, |
| "grad_norm": 2.4454050064086914, |
| "learning_rate": 1.0278571464645011e-05, |
| "loss": 0.0579, |
| "num_input_tokens_seen": 515360, |
| "step": 1155 |
| }, |
| { |
| "epoch": 7.341772151898734, |
| "grad_norm": 0.8718327283859253, |
| "learning_rate": 1.0056271650680713e-05, |
| "loss": 0.0696, |
| "num_input_tokens_seen": 517632, |
| "step": 1160 |
| }, |
| { |
| "epoch": 7.3734177215189876, |
| "grad_norm": 1.197663426399231, |
| "learning_rate": 9.835795291965372e-06, |
| "loss": 0.0513, |
| "num_input_tokens_seen": 519872, |
| "step": 1165 |
| }, |
| { |
| "epoch": 7.405063291139241, |
| "grad_norm": 2.151942253112793, |
| "learning_rate": 9.617169291341689e-06, |
| "loss": 0.0362, |
| "num_input_tokens_seen": 522048, |
| "step": 1170 |
| }, |
| { |
| "epoch": 7.436708860759493, |
| "grad_norm": 0.21822425723075867, |
| "learning_rate": 9.400420325869047e-06, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 524256, |
| "step": 1175 |
| }, |
| { |
| "epoch": 7.468354430379747, |
| "grad_norm": 0.42767906188964844, |
| "learning_rate": 9.185574843568292e-06, |
| "loss": 0.0254, |
| "num_input_tokens_seen": 526528, |
| "step": 1180 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 0.6816165447235107, |
| "learning_rate": 8.972659060194506e-06, |
| "loss": 0.1462, |
| "num_input_tokens_seen": 528768, |
| "step": 1185 |
| }, |
| { |
| "epoch": 7.5, |
| "eval_loss": 0.10821857303380966, |
| "eval_runtime": 1.7807, |
| "eval_samples_per_second": 39.311, |
| "eval_steps_per_second": 10.109, |
| "num_input_tokens_seen": 528768, |
| "step": 1185 |
| }, |
| { |
| "epoch": 7.531645569620253, |
| "grad_norm": 0.052816856652498245, |
| "learning_rate": 8.761698956038111e-06, |
| "loss": 0.0325, |
| "num_input_tokens_seen": 531040, |
| "step": 1190 |
| }, |
| { |
| "epoch": 7.563291139240507, |
| "grad_norm": 1.8085730075836182, |
| "learning_rate": 8.552720272754767e-06, |
| "loss": 0.0397, |
| "num_input_tokens_seen": 533312, |
| "step": 1195 |
| }, |
| { |
| "epoch": 7.594936708860759, |
| "grad_norm": 4.758277893066406, |
| "learning_rate": 8.345748510224299e-06, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 535520, |
| "step": 1200 |
| }, |
| { |
| "epoch": 7.6265822784810124, |
| "grad_norm": 2.2314627170562744, |
| "learning_rate": 8.140808923439162e-06, |
| "loss": 0.0192, |
| "num_input_tokens_seen": 537792, |
| "step": 1205 |
| }, |
| { |
| "epoch": 7.658227848101266, |
| "grad_norm": 0.8286902904510498, |
| "learning_rate": 7.937926519422789e-06, |
| "loss": 0.0445, |
| "num_input_tokens_seen": 539968, |
| "step": 1210 |
| }, |
| { |
| "epoch": 7.689873417721519, |
| "grad_norm": 2.621687650680542, |
| "learning_rate": 7.737126054178237e-06, |
| "loss": 0.0544, |
| "num_input_tokens_seen": 542208, |
| "step": 1215 |
| }, |
| { |
| "epoch": 7.7215189873417724, |
| "grad_norm": 0.02005881257355213, |
| "learning_rate": 7.538432029667395e-06, |
| "loss": 0.0199, |
| "num_input_tokens_seen": 544416, |
| "step": 1220 |
| }, |
| { |
| "epoch": 7.753164556962025, |
| "grad_norm": 0.35898730158805847, |
| "learning_rate": 7.341868690821199e-06, |
| "loss": 0.0619, |
| "num_input_tokens_seen": 546816, |
| "step": 1225 |
| }, |
| { |
| "epoch": 7.784810126582278, |
| "grad_norm": 0.4341179430484772, |
| "learning_rate": 7.147460022581257e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 549120, |
| "step": 1230 |
| }, |
| { |
| "epoch": 7.8164556962025316, |
| "grad_norm": 1.9629868268966675, |
| "learning_rate": 6.955229746973185e-06, |
| "loss": 0.0194, |
| "num_input_tokens_seen": 551328, |
| "step": 1235 |
| }, |
| { |
| "epoch": 7.848101265822785, |
| "grad_norm": 0.34731948375701904, |
| "learning_rate": 6.765201320211987e-06, |
| "loss": 0.0091, |
| "num_input_tokens_seen": 553504, |
| "step": 1240 |
| }, |
| { |
| "epoch": 7.879746835443038, |
| "grad_norm": 5.475011348724365, |
| "learning_rate": 6.57739792983989e-06, |
| "loss": 0.1217, |
| "num_input_tokens_seen": 555680, |
| "step": 1245 |
| }, |
| { |
| "epoch": 7.911392405063291, |
| "grad_norm": 0.41866156458854675, |
| "learning_rate": 6.391842491896977e-06, |
| "loss": 0.0086, |
| "num_input_tokens_seen": 557824, |
| "step": 1250 |
| }, |
| { |
| "epoch": 7.943037974683544, |
| "grad_norm": 0.4846521317958832, |
| "learning_rate": 6.2085576481249575e-06, |
| "loss": 0.0197, |
| "num_input_tokens_seen": 560032, |
| "step": 1255 |
| }, |
| { |
| "epoch": 7.974683544303797, |
| "grad_norm": 0.3908102810382843, |
| "learning_rate": 6.0275657632043485e-06, |
| "loss": 0.0111, |
| "num_input_tokens_seen": 562304, |
| "step": 1260 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.12035088986158371, |
| "eval_runtime": 1.7665, |
| "eval_samples_per_second": 39.626, |
| "eval_steps_per_second": 10.19, |
| "num_input_tokens_seen": 563872, |
| "step": 1264 |
| }, |
| { |
| "epoch": 8.00632911392405, |
| "grad_norm": 0.4609631896018982, |
| "learning_rate": 5.848888922025553e-06, |
| "loss": 0.0333, |
| "num_input_tokens_seen": 564320, |
| "step": 1265 |
| }, |
| { |
| "epoch": 8.037974683544304, |
| "grad_norm": 0.19440504908561707, |
| "learning_rate": 5.6725489269939704e-06, |
| "loss": 0.003, |
| "num_input_tokens_seen": 566432, |
| "step": 1270 |
| }, |
| { |
| "epoch": 8.069620253164556, |
| "grad_norm": 1.0085276365280151, |
| "learning_rate": 5.4985672953697e-06, |
| "loss": 0.0435, |
| "num_input_tokens_seen": 568704, |
| "step": 1275 |
| }, |
| { |
| "epoch": 8.10126582278481, |
| "grad_norm": 0.4334363639354706, |
| "learning_rate": 5.3269652566419165e-06, |
| "loss": 0.0496, |
| "num_input_tokens_seen": 570912, |
| "step": 1280 |
| }, |
| { |
| "epoch": 8.132911392405063, |
| "grad_norm": 1.7338502407073975, |
| "learning_rate": 5.157763749938468e-06, |
| "loss": 0.0107, |
| "num_input_tokens_seen": 573184, |
| "step": 1285 |
| }, |
| { |
| "epoch": 8.164556962025316, |
| "grad_norm": 2.3482325077056885, |
| "learning_rate": 4.990983421470813e-06, |
| "loss": 0.0322, |
| "num_input_tokens_seen": 575424, |
| "step": 1290 |
| }, |
| { |
| "epoch": 8.19620253164557, |
| "grad_norm": 0.08308505266904831, |
| "learning_rate": 4.826644622014748e-06, |
| "loss": 0.0251, |
| "num_input_tokens_seen": 577664, |
| "step": 1295 |
| }, |
| { |
| "epoch": 8.227848101265822, |
| "grad_norm": 0.07593591511249542, |
| "learning_rate": 4.664767404427201e-06, |
| "loss": 0.0081, |
| "num_input_tokens_seen": 579968, |
| "step": 1300 |
| }, |
| { |
| "epoch": 8.259493670886076, |
| "grad_norm": 4.225083351135254, |
| "learning_rate": 4.505371521199342e-06, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 582208, |
| "step": 1305 |
| }, |
| { |
| "epoch": 8.291139240506329, |
| "grad_norm": 1.984985589981079, |
| "learning_rate": 4.3484764220463295e-06, |
| "loss": 0.0402, |
| "num_input_tokens_seen": 584512, |
| "step": 1310 |
| }, |
| { |
| "epoch": 8.322784810126583, |
| "grad_norm": 1.2428722381591797, |
| "learning_rate": 4.19410125153405e-06, |
| "loss": 0.0378, |
| "num_input_tokens_seen": 586848, |
| "step": 1315 |
| }, |
| { |
| "epoch": 8.354430379746836, |
| "grad_norm": 0.7120981216430664, |
| "learning_rate": 4.042264846743085e-06, |
| "loss": 0.051, |
| "num_input_tokens_seen": 589120, |
| "step": 1320 |
| }, |
| { |
| "epoch": 8.386075949367088, |
| "grad_norm": 0.08994811773300171, |
| "learning_rate": 3.892985734970162e-06, |
| "loss": 0.0074, |
| "num_input_tokens_seen": 591232, |
| "step": 1325 |
| }, |
| { |
| "epoch": 8.417721518987342, |
| "grad_norm": 0.10305431485176086, |
| "learning_rate": 3.746282131467424e-06, |
| "loss": 0.0883, |
| "num_input_tokens_seen": 593408, |
| "step": 1330 |
| }, |
| { |
| "epoch": 8.449367088607595, |
| "grad_norm": 0.4406466782093048, |
| "learning_rate": 3.602171937219789e-06, |
| "loss": 0.0104, |
| "num_input_tokens_seen": 595648, |
| "step": 1335 |
| }, |
| { |
| "epoch": 8.481012658227849, |
| "grad_norm": 0.7376933693885803, |
| "learning_rate": 3.4606727367606508e-06, |
| "loss": 0.0084, |
| "num_input_tokens_seen": 597920, |
| "step": 1340 |
| }, |
| { |
| "epoch": 8.5, |
| "eval_loss": 0.12124493718147278, |
| "eval_runtime": 1.7759, |
| "eval_samples_per_second": 39.417, |
| "eval_steps_per_second": 10.136, |
| "num_input_tokens_seen": 599232, |
| "step": 1343 |
| }, |
| { |
| "epoch": 8.512658227848101, |
| "grad_norm": 1.2201383113861084, |
| "learning_rate": 3.321801796026189e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 600224, |
| "step": 1345 |
| }, |
| { |
| "epoch": 8.544303797468354, |
| "grad_norm": 1.9123611450195312, |
| "learning_rate": 3.185576060248513e-06, |
| "loss": 0.0314, |
| "num_input_tokens_seen": 602432, |
| "step": 1350 |
| }, |
| { |
| "epoch": 8.575949367088608, |
| "grad_norm": 4.122493743896484, |
| "learning_rate": 3.0520121518880534e-06, |
| "loss": 0.0547, |
| "num_input_tokens_seen": 604736, |
| "step": 1355 |
| }, |
| { |
| "epoch": 8.60759493670886, |
| "grad_norm": 0.40070468187332153, |
| "learning_rate": 2.9211263686052065e-06, |
| "loss": 0.043, |
| "num_input_tokens_seen": 606976, |
| "step": 1360 |
| }, |
| { |
| "epoch": 8.639240506329115, |
| "grad_norm": 6.602436065673828, |
| "learning_rate": 2.792934681271708e-06, |
| "loss": 0.0443, |
| "num_input_tokens_seen": 609120, |
| "step": 1365 |
| }, |
| { |
| "epoch": 8.670886075949367, |
| "grad_norm": 0.46621787548065186, |
| "learning_rate": 2.667452732021794e-06, |
| "loss": 0.0092, |
| "num_input_tokens_seen": 611296, |
| "step": 1370 |
| }, |
| { |
| "epoch": 8.70253164556962, |
| "grad_norm": 0.17130400240421295, |
| "learning_rate": 2.544695832343591e-06, |
| "loss": 0.0349, |
| "num_input_tokens_seen": 613568, |
| "step": 1375 |
| }, |
| { |
| "epoch": 8.734177215189874, |
| "grad_norm": 0.0715862512588501, |
| "learning_rate": 2.424678961210747e-06, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 615712, |
| "step": 1380 |
| }, |
| { |
| "epoch": 8.765822784810126, |
| "grad_norm": 2.6348278522491455, |
| "learning_rate": 2.307416763254666e-06, |
| "loss": 0.0323, |
| "num_input_tokens_seen": 618048, |
| "step": 1385 |
| }, |
| { |
| "epoch": 8.79746835443038, |
| "grad_norm": 1.5881404876708984, |
| "learning_rate": 2.1929235469775767e-06, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 620384, |
| "step": 1390 |
| }, |
| { |
| "epoch": 8.829113924050633, |
| "grad_norm": 0.0131320646032691, |
| "learning_rate": 2.081213283006575e-06, |
| "loss": 0.0413, |
| "num_input_tokens_seen": 622656, |
| "step": 1395 |
| }, |
| { |
| "epoch": 8.860759493670885, |
| "grad_norm": 2.6171875, |
| "learning_rate": 1.9722996023889162e-06, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 624928, |
| "step": 1400 |
| }, |
| { |
| "epoch": 8.89240506329114, |
| "grad_norm": 0.9136054515838623, |
| "learning_rate": 1.8661957949287157e-06, |
| "loss": 0.0439, |
| "num_input_tokens_seen": 627200, |
| "step": 1405 |
| }, |
| { |
| "epoch": 8.924050632911392, |
| "grad_norm": 1.1005070209503174, |
| "learning_rate": 1.7629148075653245e-06, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 629376, |
| "step": 1410 |
| }, |
| { |
| "epoch": 8.955696202531646, |
| "grad_norm": 1.8107990026474, |
| "learning_rate": 1.6624692427935295e-06, |
| "loss": 0.0289, |
| "num_input_tokens_seen": 631552, |
| "step": 1415 |
| }, |
| { |
| "epoch": 8.987341772151899, |
| "grad_norm": 1.783125400543213, |
| "learning_rate": 1.564871357125755e-06, |
| "loss": 0.0727, |
| "num_input_tokens_seen": 633856, |
| "step": 1420 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.12704677879810333, |
| "eval_runtime": 1.7742, |
| "eval_samples_per_second": 39.454, |
| "eval_steps_per_second": 10.145, |
| "num_input_tokens_seen": 634544, |
| "step": 1422 |
| }, |
| { |
| "epoch": 9.018987341772151, |
| "grad_norm": 0.2849749028682709, |
| "learning_rate": 1.4701330595965402e-06, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 635952, |
| "step": 1425 |
| }, |
| { |
| "epoch": 9.050632911392405, |
| "grad_norm": 0.537175178527832, |
| "learning_rate": 1.3782659103093426e-06, |
| "loss": 0.0831, |
| "num_input_tokens_seen": 638224, |
| "step": 1430 |
| }, |
| { |
| "epoch": 9.082278481012658, |
| "grad_norm": 2.8358240127563477, |
| "learning_rate": 1.2892811190259979e-06, |
| "loss": 0.011, |
| "num_input_tokens_seen": 640496, |
| "step": 1435 |
| }, |
| { |
| "epoch": 9.113924050632912, |
| "grad_norm": 2.4374454021453857, |
| "learning_rate": 1.2031895437988493e-06, |
| "loss": 0.0455, |
| "num_input_tokens_seen": 642800, |
| "step": 1440 |
| }, |
| { |
| "epoch": 9.145569620253164, |
| "grad_norm": 0.006563578732311726, |
| "learning_rate": 1.1200016896458636e-06, |
| "loss": 0.0241, |
| "num_input_tokens_seen": 645040, |
| "step": 1445 |
| }, |
| { |
| "epoch": 9.177215189873417, |
| "grad_norm": 0.4594639539718628, |
| "learning_rate": 1.0397277072687729e-06, |
| "loss": 0.0562, |
| "num_input_tokens_seen": 647184, |
| "step": 1450 |
| }, |
| { |
| "epoch": 9.208860759493671, |
| "grad_norm": 1.900719404220581, |
| "learning_rate": 9.623773918144897e-07, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 649360, |
| "step": 1455 |
| }, |
| { |
| "epoch": 9.240506329113924, |
| "grad_norm": 0.4248761832714081, |
| "learning_rate": 8.879601816798699e-07, |
| "loss": 0.0207, |
| "num_input_tokens_seen": 651600, |
| "step": 1460 |
| }, |
| { |
| "epoch": 9.272151898734178, |
| "grad_norm": 1.5442092418670654, |
| "learning_rate": 8.164851573600391e-07, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 653936, |
| "step": 1465 |
| }, |
| { |
| "epoch": 9.30379746835443, |
| "grad_norm": 0.35519126057624817, |
| "learning_rate": 7.479610403403781e-07, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 656176, |
| "step": 1470 |
| }, |
| { |
| "epoch": 9.335443037974684, |
| "grad_norm": 0.621478796005249, |
| "learning_rate": 6.823961920322974e-07, |
| "loss": 0.0648, |
| "num_input_tokens_seen": 658256, |
| "step": 1475 |
| }, |
| { |
| "epoch": 9.367088607594937, |
| "grad_norm": 0.49386000633239746, |
| "learning_rate": 6.197986127529892e-07, |
| "loss": 0.0054, |
| "num_input_tokens_seen": 660496, |
| "step": 1480 |
| }, |
| { |
| "epoch": 9.39873417721519, |
| "grad_norm": 0.17025251686573029, |
| "learning_rate": 5.601759407492107e-07, |
| "loss": 0.0254, |
| "num_input_tokens_seen": 662800, |
| "step": 1485 |
| }, |
| { |
| "epoch": 9.430379746835444, |
| "grad_norm": 2.2807698249816895, |
| "learning_rate": 5.035354512652385e-07, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 665008, |
| "step": 1490 |
| }, |
| { |
| "epoch": 9.462025316455696, |
| "grad_norm": 0.8504540920257568, |
| "learning_rate": 4.4988405565515033e-07, |
| "loss": 0.0476, |
| "num_input_tokens_seen": 667280, |
| "step": 1495 |
| }, |
| { |
| "epoch": 9.49367088607595, |
| "grad_norm": 1.3653701543807983, |
| "learning_rate": 3.992283005394837e-07, |
| "loss": 0.0517, |
| "num_input_tokens_seen": 669584, |
| "step": 1500 |
| }, |
| { |
| "epoch": 9.5, |
| "eval_loss": 0.1276809424161911, |
| "eval_runtime": 1.7754, |
| "eval_samples_per_second": 39.427, |
| "eval_steps_per_second": 10.138, |
| "num_input_tokens_seen": 670064, |
| "step": 1501 |
| }, |
| { |
| "epoch": 9.525316455696203, |
| "grad_norm": 0.9614121913909912, |
| "learning_rate": 3.515743670064159e-07, |
| "loss": 0.0264, |
| "num_input_tokens_seen": 671952, |
| "step": 1505 |
| }, |
| { |
| "epoch": 9.556962025316455, |
| "grad_norm": 0.009983611293137074, |
| "learning_rate": 3.0692806985752053e-07, |
| "loss": 0.0289, |
| "num_input_tokens_seen": 674192, |
| "step": 1510 |
| }, |
| { |
| "epoch": 9.58860759493671, |
| "grad_norm": 0.43508023023605347, |
| "learning_rate": 2.6529485689825997e-07, |
| "loss": 0.0332, |
| "num_input_tokens_seen": 676400, |
| "step": 1515 |
| }, |
| { |
| "epoch": 9.620253164556962, |
| "grad_norm": 0.21629108488559723, |
| "learning_rate": 2.266798082732252e-07, |
| "loss": 0.0642, |
| "num_input_tokens_seen": 678640, |
| "step": 1520 |
| }, |
| { |
| "epoch": 9.651898734177216, |
| "grad_norm": 0.09474813938140869, |
| "learning_rate": 1.9108763584624878e-07, |
| "loss": 0.0021, |
| "num_input_tokens_seen": 680848, |
| "step": 1525 |
| }, |
| { |
| "epoch": 9.683544303797468, |
| "grad_norm": 0.7714481949806213, |
| "learning_rate": 1.5852268262544768e-07, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 683152, |
| "step": 1530 |
| }, |
| { |
| "epoch": 9.715189873417721, |
| "grad_norm": 0.5944305062294006, |
| "learning_rate": 1.2898892223330294e-07, |
| "loss": 0.0108, |
| "num_input_tokens_seen": 685328, |
| "step": 1535 |
| }, |
| { |
| "epoch": 9.746835443037975, |
| "grad_norm": 0.011387128382921219, |
| "learning_rate": 1.0248995842178366e-07, |
| "loss": 0.0088, |
| "num_input_tokens_seen": 687632, |
| "step": 1540 |
| }, |
| { |
| "epoch": 9.778481012658228, |
| "grad_norm": 0.8135620951652527, |
| "learning_rate": 7.90290246326042e-08, |
| "loss": 0.0077, |
| "num_input_tokens_seen": 689776, |
| "step": 1545 |
| }, |
| { |
| "epoch": 9.810126582278482, |
| "grad_norm": 0.3980692923069, |
| "learning_rate": 5.86089836026843e-08, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 691984, |
| "step": 1550 |
| }, |
| { |
| "epoch": 9.841772151898734, |
| "grad_norm": 0.014442904852330685, |
| "learning_rate": 4.1232327014831266e-08, |
| "loss": 0.018, |
| "num_input_tokens_seen": 694128, |
| "step": 1555 |
| }, |
| { |
| "epoch": 9.873417721518987, |
| "grad_norm": 0.6166678071022034, |
| "learning_rate": 2.6901175193699835e-08, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 696304, |
| "step": 1560 |
| }, |
| { |
| "epoch": 9.905063291139241, |
| "grad_norm": 0.7606804966926575, |
| "learning_rate": 1.5617276847068598e-08, |
| "loss": 0.0087, |
| "num_input_tokens_seen": 698608, |
| "step": 1565 |
| }, |
| { |
| "epoch": 9.936708860759493, |
| "grad_norm": 0.15441037714481354, |
| "learning_rate": 7.382008852463429e-09, |
| "loss": 0.032, |
| "num_input_tokens_seen": 700816, |
| "step": 1570 |
| }, |
| { |
| "epoch": 9.968354430379748, |
| "grad_norm": 0.7360374927520752, |
| "learning_rate": 2.1963760891391407e-09, |
| "loss": 0.0257, |
| "num_input_tokens_seen": 703184, |
| "step": 1575 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 0.013925778679549694, |
| "learning_rate": 6.101131547198157e-11, |
| "loss": 0.0049, |
| "num_input_tokens_seen": 705184, |
| "step": 1580 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.12569361925125122, |
| "eval_runtime": 1.7778, |
| "eval_samples_per_second": 39.374, |
| "eval_steps_per_second": 10.125, |
| "num_input_tokens_seen": 705184, |
| "step": 1580 |
| }, |
| { |
| "epoch": 10.0, |
| "num_input_tokens_seen": 705184, |
| "step": 1580, |
| "total_flos": 3.175411679939789e+16, |
| "train_loss": 0.12395850797714311, |
| "train_runtime": 397.268, |
| "train_samples_per_second": 15.858, |
| "train_steps_per_second": 3.977 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1580, |
| "num_input_tokens_seen": 705184, |
| "num_train_epochs": 10, |
| "save_steps": 79, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.175411679939789e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|