| { |
| "best_global_step": 1608, |
| "best_metric": 0.574218213558197, |
| "best_model_checkpoint": "saves_multiple/lora/llama-3-8b-instruct/train_conala_123_1760637666/checkpoint-1608", |
| "epoch": 20.0, |
| "eval_steps": 536, |
| "global_step": 10720, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.009328358208955223, |
| "grad_norm": 3.2155842781066895, |
| "learning_rate": 1.8656716417910447e-07, |
| "loss": 2.9974, |
| "num_input_tokens_seen": 1216, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.018656716417910446, |
| "grad_norm": 2.3419840335845947, |
| "learning_rate": 4.197761194029851e-07, |
| "loss": 2.8191, |
| "num_input_tokens_seen": 2528, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.027985074626865673, |
| "grad_norm": 2.8622381687164307, |
| "learning_rate": 6.529850746268657e-07, |
| "loss": 2.9078, |
| "num_input_tokens_seen": 4160, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.03731343283582089, |
| "grad_norm": 3.2237131595611572, |
| "learning_rate": 8.861940298507463e-07, |
| "loss": 2.9569, |
| "num_input_tokens_seen": 5504, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04664179104477612, |
| "grad_norm": 2.3724913597106934, |
| "learning_rate": 1.119402985074627e-06, |
| "loss": 2.929, |
| "num_input_tokens_seen": 6912, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.055970149253731345, |
| "grad_norm": 3.4295129776000977, |
| "learning_rate": 1.3526119402985074e-06, |
| "loss": 2.5339, |
| "num_input_tokens_seen": 8544, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06529850746268656, |
| "grad_norm": 3.0244059562683105, |
| "learning_rate": 1.585820895522388e-06, |
| "loss": 3.2228, |
| "num_input_tokens_seen": 9696, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.07462686567164178, |
| "grad_norm": 3.697382688522339, |
| "learning_rate": 1.8190298507462688e-06, |
| "loss": 3.1979, |
| "num_input_tokens_seen": 10976, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08395522388059702, |
| "grad_norm": 2.5094709396362305, |
| "learning_rate": 2.0522388059701493e-06, |
| "loss": 2.9701, |
| "num_input_tokens_seen": 12320, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.09328358208955224, |
| "grad_norm": 3.7412240505218506, |
| "learning_rate": 2.28544776119403e-06, |
| "loss": 2.4346, |
| "num_input_tokens_seen": 13920, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.10261194029850747, |
| "grad_norm": 1.986556053161621, |
| "learning_rate": 2.5186567164179106e-06, |
| "loss": 2.6466, |
| "num_input_tokens_seen": 15456, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.11194029850746269, |
| "grad_norm": 1.948824167251587, |
| "learning_rate": 2.7518656716417913e-06, |
| "loss": 2.7756, |
| "num_input_tokens_seen": 16864, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.12126865671641791, |
| "grad_norm": 2.767160654067993, |
| "learning_rate": 2.9850746268656716e-06, |
| "loss": 2.6556, |
| "num_input_tokens_seen": 18336, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.13059701492537312, |
| "grad_norm": 3.187908887863159, |
| "learning_rate": 3.2182835820895527e-06, |
| "loss": 2.6107, |
| "num_input_tokens_seen": 19712, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.13992537313432835, |
| "grad_norm": 2.7606003284454346, |
| "learning_rate": 3.4514925373134334e-06, |
| "loss": 2.6895, |
| "num_input_tokens_seen": 21152, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.14925373134328357, |
| "grad_norm": 4.205873012542725, |
| "learning_rate": 3.684701492537314e-06, |
| "loss": 2.5966, |
| "num_input_tokens_seen": 22560, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.15858208955223882, |
| "grad_norm": 3.8996758460998535, |
| "learning_rate": 3.917910447761194e-06, |
| "loss": 2.5544, |
| "num_input_tokens_seen": 23968, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.16791044776119404, |
| "grad_norm": 3.294313907623291, |
| "learning_rate": 4.151119402985075e-06, |
| "loss": 1.9073, |
| "num_input_tokens_seen": 25600, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.17723880597014927, |
| "grad_norm": 4.302979946136475, |
| "learning_rate": 4.384328358208956e-06, |
| "loss": 2.1112, |
| "num_input_tokens_seen": 26912, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.1865671641791045, |
| "grad_norm": 4.676641941070557, |
| "learning_rate": 4.617537313432836e-06, |
| "loss": 2.1064, |
| "num_input_tokens_seen": 28512, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1958955223880597, |
| "grad_norm": 8.729401588439941, |
| "learning_rate": 4.850746268656717e-06, |
| "loss": 2.5016, |
| "num_input_tokens_seen": 29888, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.20522388059701493, |
| "grad_norm": 4.578638553619385, |
| "learning_rate": 5.083955223880597e-06, |
| "loss": 2.0246, |
| "num_input_tokens_seen": 31296, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.21455223880597016, |
| "grad_norm": 4.33447790145874, |
| "learning_rate": 5.3171641791044776e-06, |
| "loss": 1.7017, |
| "num_input_tokens_seen": 32736, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.22388059701492538, |
| "grad_norm": 4.826817989349365, |
| "learning_rate": 5.550373134328359e-06, |
| "loss": 1.9632, |
| "num_input_tokens_seen": 34240, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2332089552238806, |
| "grad_norm": 4.4384050369262695, |
| "learning_rate": 5.783582089552239e-06, |
| "loss": 1.6287, |
| "num_input_tokens_seen": 35648, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.24253731343283583, |
| "grad_norm": 3.766493797302246, |
| "learning_rate": 6.01679104477612e-06, |
| "loss": 1.4407, |
| "num_input_tokens_seen": 36992, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.251865671641791, |
| "grad_norm": 3.4355289936065674, |
| "learning_rate": 6.25e-06, |
| "loss": 1.1156, |
| "num_input_tokens_seen": 38528, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.26119402985074625, |
| "grad_norm": 3.5228874683380127, |
| "learning_rate": 6.4832089552238806e-06, |
| "loss": 1.3216, |
| "num_input_tokens_seen": 39840, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.27052238805970147, |
| "grad_norm": 3.495731830596924, |
| "learning_rate": 6.716417910447762e-06, |
| "loss": 1.0644, |
| "num_input_tokens_seen": 41216, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.2798507462686567, |
| "grad_norm": 3.4281909465789795, |
| "learning_rate": 6.949626865671642e-06, |
| "loss": 0.9045, |
| "num_input_tokens_seen": 42624, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2891791044776119, |
| "grad_norm": 2.5741677284240723, |
| "learning_rate": 7.182835820895523e-06, |
| "loss": 0.8779, |
| "num_input_tokens_seen": 44192, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.29850746268656714, |
| "grad_norm": 2.587193250656128, |
| "learning_rate": 7.416044776119403e-06, |
| "loss": 1.1593, |
| "num_input_tokens_seen": 45504, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.30783582089552236, |
| "grad_norm": 2.819118022918701, |
| "learning_rate": 7.649253731343284e-06, |
| "loss": 0.6896, |
| "num_input_tokens_seen": 47360, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.31716417910447764, |
| "grad_norm": 3.108252763748169, |
| "learning_rate": 7.882462686567164e-06, |
| "loss": 0.9767, |
| "num_input_tokens_seen": 48640, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.32649253731343286, |
| "grad_norm": 3.960571765899658, |
| "learning_rate": 8.115671641791045e-06, |
| "loss": 1.0548, |
| "num_input_tokens_seen": 50016, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.3358208955223881, |
| "grad_norm": 4.127678871154785, |
| "learning_rate": 8.348880597014926e-06, |
| "loss": 1.1732, |
| "num_input_tokens_seen": 51296, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3451492537313433, |
| "grad_norm": 4.358732223510742, |
| "learning_rate": 8.582089552238805e-06, |
| "loss": 1.2371, |
| "num_input_tokens_seen": 52576, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.35447761194029853, |
| "grad_norm": 1.7422306537628174, |
| "learning_rate": 8.815298507462687e-06, |
| "loss": 0.6859, |
| "num_input_tokens_seen": 54080, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.36380597014925375, |
| "grad_norm": 3.3703925609588623, |
| "learning_rate": 9.048507462686568e-06, |
| "loss": 1.2788, |
| "num_input_tokens_seen": 55392, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.373134328358209, |
| "grad_norm": 3.525810480117798, |
| "learning_rate": 9.281716417910449e-06, |
| "loss": 0.8965, |
| "num_input_tokens_seen": 56864, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3824626865671642, |
| "grad_norm": 3.0293571949005127, |
| "learning_rate": 9.514925373134328e-06, |
| "loss": 0.8913, |
| "num_input_tokens_seen": 58368, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.3917910447761194, |
| "grad_norm": 2.385465145111084, |
| "learning_rate": 9.74813432835821e-06, |
| "loss": 1.1208, |
| "num_input_tokens_seen": 59744, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.40111940298507465, |
| "grad_norm": 3.6248044967651367, |
| "learning_rate": 9.98134328358209e-06, |
| "loss": 0.7233, |
| "num_input_tokens_seen": 61152, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.41044776119402987, |
| "grad_norm": 3.5897328853607178, |
| "learning_rate": 1.021455223880597e-05, |
| "loss": 0.8246, |
| "num_input_tokens_seen": 62496, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.4197761194029851, |
| "grad_norm": 3.7831625938415527, |
| "learning_rate": 1.0447761194029851e-05, |
| "loss": 0.7249, |
| "num_input_tokens_seen": 63808, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.4291044776119403, |
| "grad_norm": 3.4816417694091797, |
| "learning_rate": 1.0680970149253732e-05, |
| "loss": 0.7725, |
| "num_input_tokens_seen": 65088, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.43843283582089554, |
| "grad_norm": 3.978632688522339, |
| "learning_rate": 1.0914179104477611e-05, |
| "loss": 0.8411, |
| "num_input_tokens_seen": 66496, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.44776119402985076, |
| "grad_norm": 2.229631185531616, |
| "learning_rate": 1.1147388059701493e-05, |
| "loss": 0.7178, |
| "num_input_tokens_seen": 67872, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.457089552238806, |
| "grad_norm": 3.3504767417907715, |
| "learning_rate": 1.1380597014925374e-05, |
| "loss": 0.9885, |
| "num_input_tokens_seen": 69216, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.4664179104477612, |
| "grad_norm": 2.3532721996307373, |
| "learning_rate": 1.1613805970149253e-05, |
| "loss": 0.9294, |
| "num_input_tokens_seen": 70816, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.47574626865671643, |
| "grad_norm": 2.348051071166992, |
| "learning_rate": 1.1847014925373134e-05, |
| "loss": 0.7236, |
| "num_input_tokens_seen": 72224, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.48507462686567165, |
| "grad_norm": 2.541053056716919, |
| "learning_rate": 1.2080223880597015e-05, |
| "loss": 1.1469, |
| "num_input_tokens_seen": 73600, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4944029850746269, |
| "grad_norm": 2.5242364406585693, |
| "learning_rate": 1.2313432835820896e-05, |
| "loss": 0.7002, |
| "num_input_tokens_seen": 75168, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.503731343283582, |
| "grad_norm": 2.9303579330444336, |
| "learning_rate": 1.2546641791044777e-05, |
| "loss": 0.7882, |
| "num_input_tokens_seen": 76544, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5130597014925373, |
| "grad_norm": 2.201392889022827, |
| "learning_rate": 1.2779850746268657e-05, |
| "loss": 0.6852, |
| "num_input_tokens_seen": 78080, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.5223880597014925, |
| "grad_norm": 3.580583095550537, |
| "learning_rate": 1.3013059701492538e-05, |
| "loss": 0.6647, |
| "num_input_tokens_seen": 79520, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.5317164179104478, |
| "grad_norm": 2.079744577407837, |
| "learning_rate": 1.3246268656716417e-05, |
| "loss": 0.9595, |
| "num_input_tokens_seen": 80992, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.5410447761194029, |
| "grad_norm": 2.789869546890259, |
| "learning_rate": 1.34794776119403e-05, |
| "loss": 0.7462, |
| "num_input_tokens_seen": 82496, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5503731343283582, |
| "grad_norm": 2.199842929840088, |
| "learning_rate": 1.371268656716418e-05, |
| "loss": 0.63, |
| "num_input_tokens_seen": 83712, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.5597014925373134, |
| "grad_norm": 2.0063514709472656, |
| "learning_rate": 1.394589552238806e-05, |
| "loss": 0.7378, |
| "num_input_tokens_seen": 85120, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5690298507462687, |
| "grad_norm": 3.1145708560943604, |
| "learning_rate": 1.417910447761194e-05, |
| "loss": 0.7882, |
| "num_input_tokens_seen": 86560, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.5783582089552238, |
| "grad_norm": 2.9709813594818115, |
| "learning_rate": 1.4412313432835823e-05, |
| "loss": 0.6043, |
| "num_input_tokens_seen": 87936, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5876865671641791, |
| "grad_norm": 2.8750739097595215, |
| "learning_rate": 1.46455223880597e-05, |
| "loss": 0.7181, |
| "num_input_tokens_seen": 89312, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.5970149253731343, |
| "grad_norm": 4.352518081665039, |
| "learning_rate": 1.4878731343283583e-05, |
| "loss": 0.7264, |
| "num_input_tokens_seen": 90560, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6063432835820896, |
| "grad_norm": 2.636489152908325, |
| "learning_rate": 1.5111940298507463e-05, |
| "loss": 0.908, |
| "num_input_tokens_seen": 91936, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.6156716417910447, |
| "grad_norm": 2.8545966148376465, |
| "learning_rate": 1.5345149253731346e-05, |
| "loss": 0.523, |
| "num_input_tokens_seen": 93344, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 2.488724708557129, |
| "learning_rate": 1.5578358208955223e-05, |
| "loss": 0.8352, |
| "num_input_tokens_seen": 94560, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.6343283582089553, |
| "grad_norm": 2.4599735736846924, |
| "learning_rate": 1.5811567164179105e-05, |
| "loss": 0.5659, |
| "num_input_tokens_seen": 96032, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.6436567164179104, |
| "grad_norm": 1.9525706768035889, |
| "learning_rate": 1.6044776119402986e-05, |
| "loss": 0.833, |
| "num_input_tokens_seen": 97504, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.6529850746268657, |
| "grad_norm": 2.631007194519043, |
| "learning_rate": 1.6277985074626867e-05, |
| "loss": 0.8833, |
| "num_input_tokens_seen": 98720, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.6623134328358209, |
| "grad_norm": 4.358158588409424, |
| "learning_rate": 1.6511194029850744e-05, |
| "loss": 0.6994, |
| "num_input_tokens_seen": 100064, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.6716417910447762, |
| "grad_norm": 3.1363024711608887, |
| "learning_rate": 1.674440298507463e-05, |
| "loss": 0.6367, |
| "num_input_tokens_seen": 101536, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6809701492537313, |
| "grad_norm": 2.199251651763916, |
| "learning_rate": 1.6977611940298507e-05, |
| "loss": 0.7608, |
| "num_input_tokens_seen": 103072, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.6902985074626866, |
| "grad_norm": 3.813891887664795, |
| "learning_rate": 1.7210820895522388e-05, |
| "loss": 0.9477, |
| "num_input_tokens_seen": 104416, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.6996268656716418, |
| "grad_norm": 2.2994332313537598, |
| "learning_rate": 1.744402985074627e-05, |
| "loss": 0.6536, |
| "num_input_tokens_seen": 105952, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.7089552238805971, |
| "grad_norm": 2.796607732772827, |
| "learning_rate": 1.767723880597015e-05, |
| "loss": 0.7488, |
| "num_input_tokens_seen": 107520, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.7182835820895522, |
| "grad_norm": 2.5300981998443604, |
| "learning_rate": 1.791044776119403e-05, |
| "loss": 0.8754, |
| "num_input_tokens_seen": 109248, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.7276119402985075, |
| "grad_norm": 4.521425724029541, |
| "learning_rate": 1.8143656716417912e-05, |
| "loss": 0.7731, |
| "num_input_tokens_seen": 110752, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.7369402985074627, |
| "grad_norm": 3.1729986667633057, |
| "learning_rate": 1.837686567164179e-05, |
| "loss": 0.5673, |
| "num_input_tokens_seen": 112320, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.746268656716418, |
| "grad_norm": 2.2666549682617188, |
| "learning_rate": 1.8610074626865674e-05, |
| "loss": 0.8216, |
| "num_input_tokens_seen": 113792, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7555970149253731, |
| "grad_norm": 3.2176127433776855, |
| "learning_rate": 1.8843283582089552e-05, |
| "loss": 0.6449, |
| "num_input_tokens_seen": 115328, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.7649253731343284, |
| "grad_norm": 3.751394510269165, |
| "learning_rate": 1.9076492537313433e-05, |
| "loss": 0.518, |
| "num_input_tokens_seen": 116800, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.7742537313432836, |
| "grad_norm": 2.6425139904022217, |
| "learning_rate": 1.9309701492537314e-05, |
| "loss": 0.6036, |
| "num_input_tokens_seen": 118592, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.7835820895522388, |
| "grad_norm": 5.427814483642578, |
| "learning_rate": 1.9542910447761195e-05, |
| "loss": 0.6243, |
| "num_input_tokens_seen": 119936, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.792910447761194, |
| "grad_norm": 3.0762557983398438, |
| "learning_rate": 1.9776119402985073e-05, |
| "loss": 0.8678, |
| "num_input_tokens_seen": 121280, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.8022388059701493, |
| "grad_norm": 2.287429094314575, |
| "learning_rate": 2.0009328358208958e-05, |
| "loss": 0.6082, |
| "num_input_tokens_seen": 122944, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.8115671641791045, |
| "grad_norm": 2.8894004821777344, |
| "learning_rate": 2.0242537313432835e-05, |
| "loss": 0.5936, |
| "num_input_tokens_seen": 124320, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.8208955223880597, |
| "grad_norm": 3.3376758098602295, |
| "learning_rate": 2.0475746268656717e-05, |
| "loss": 0.7032, |
| "num_input_tokens_seen": 125888, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.8302238805970149, |
| "grad_norm": 2.812182903289795, |
| "learning_rate": 2.0708955223880598e-05, |
| "loss": 0.6494, |
| "num_input_tokens_seen": 127296, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.8395522388059702, |
| "grad_norm": 3.458345413208008, |
| "learning_rate": 2.094216417910448e-05, |
| "loss": 0.6661, |
| "num_input_tokens_seen": 128512, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.8488805970149254, |
| "grad_norm": 1.4977647066116333, |
| "learning_rate": 2.1175373134328356e-05, |
| "loss": 0.6084, |
| "num_input_tokens_seen": 130048, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.8582089552238806, |
| "grad_norm": 3.7985336780548096, |
| "learning_rate": 2.140858208955224e-05, |
| "loss": 0.6004, |
| "num_input_tokens_seen": 131424, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.8675373134328358, |
| "grad_norm": 2.201958179473877, |
| "learning_rate": 2.164179104477612e-05, |
| "loss": 0.5567, |
| "num_input_tokens_seen": 132704, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.8768656716417911, |
| "grad_norm": 4.443638324737549, |
| "learning_rate": 2.1875e-05, |
| "loss": 0.7844, |
| "num_input_tokens_seen": 134048, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.8861940298507462, |
| "grad_norm": 2.422429323196411, |
| "learning_rate": 2.2108208955223884e-05, |
| "loss": 0.592, |
| "num_input_tokens_seen": 135392, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.8955223880597015, |
| "grad_norm": 2.377540111541748, |
| "learning_rate": 2.2341417910447762e-05, |
| "loss": 0.9044, |
| "num_input_tokens_seen": 136832, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.9048507462686567, |
| "grad_norm": 4.504119873046875, |
| "learning_rate": 2.2574626865671643e-05, |
| "loss": 0.7338, |
| "num_input_tokens_seen": 138336, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.914179104477612, |
| "grad_norm": 3.372610092163086, |
| "learning_rate": 2.2807835820895524e-05, |
| "loss": 0.5419, |
| "num_input_tokens_seen": 139840, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.9235074626865671, |
| "grad_norm": 3.999953269958496, |
| "learning_rate": 2.3041044776119405e-05, |
| "loss": 0.5545, |
| "num_input_tokens_seen": 141344, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.9328358208955224, |
| "grad_norm": 3.0528242588043213, |
| "learning_rate": 2.3274253731343283e-05, |
| "loss": 0.5944, |
| "num_input_tokens_seen": 142848, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.9421641791044776, |
| "grad_norm": 2.436511516571045, |
| "learning_rate": 2.3507462686567168e-05, |
| "loss": 0.564, |
| "num_input_tokens_seen": 144384, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.9514925373134329, |
| "grad_norm": 3.3364241123199463, |
| "learning_rate": 2.3740671641791045e-05, |
| "loss": 0.5367, |
| "num_input_tokens_seen": 145760, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.960820895522388, |
| "grad_norm": 2.424896001815796, |
| "learning_rate": 2.3973880597014926e-05, |
| "loss": 0.4441, |
| "num_input_tokens_seen": 147168, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.9701492537313433, |
| "grad_norm": 2.6522722244262695, |
| "learning_rate": 2.4207089552238807e-05, |
| "loss": 0.374, |
| "num_input_tokens_seen": 148480, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.9794776119402985, |
| "grad_norm": 2.7472426891326904, |
| "learning_rate": 2.444029850746269e-05, |
| "loss": 0.578, |
| "num_input_tokens_seen": 149920, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.9888059701492538, |
| "grad_norm": 3.616218090057373, |
| "learning_rate": 2.467350746268657e-05, |
| "loss": 0.8165, |
| "num_input_tokens_seen": 151264, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.9981343283582089, |
| "grad_norm": 3.1265878677368164, |
| "learning_rate": 2.490671641791045e-05, |
| "loss": 0.8413, |
| "num_input_tokens_seen": 152608, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.6011408567428589, |
| "eval_runtime": 2.9145, |
| "eval_samples_per_second": 81.661, |
| "eval_steps_per_second": 20.587, |
| "num_input_tokens_seen": 152672, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.007462686567164, |
| "grad_norm": 2.0888760089874268, |
| "learning_rate": 2.5139925373134332e-05, |
| "loss": 0.7721, |
| "num_input_tokens_seen": 153760, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.0167910447761195, |
| "grad_norm": 3.7932422161102295, |
| "learning_rate": 2.537313432835821e-05, |
| "loss": 0.7126, |
| "num_input_tokens_seen": 155104, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.0261194029850746, |
| "grad_norm": 2.8085086345672607, |
| "learning_rate": 2.560634328358209e-05, |
| "loss": 0.8332, |
| "num_input_tokens_seen": 156416, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.0354477611940298, |
| "grad_norm": 2.4556965827941895, |
| "learning_rate": 2.583955223880597e-05, |
| "loss": 0.7536, |
| "num_input_tokens_seen": 157888, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.044776119402985, |
| "grad_norm": 3.240560531616211, |
| "learning_rate": 2.6072761194029853e-05, |
| "loss": 0.5994, |
| "num_input_tokens_seen": 159392, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.0541044776119404, |
| "grad_norm": 3.46124267578125, |
| "learning_rate": 2.6305970149253734e-05, |
| "loss": 0.6847, |
| "num_input_tokens_seen": 160864, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.0634328358208955, |
| "grad_norm": 4.8628926277160645, |
| "learning_rate": 2.6539179104477612e-05, |
| "loss": 0.786, |
| "num_input_tokens_seen": 162240, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.0727611940298507, |
| "grad_norm": 2.554652214050293, |
| "learning_rate": 2.6772388059701493e-05, |
| "loss": 0.6826, |
| "num_input_tokens_seen": 163616, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.0820895522388059, |
| "grad_norm": 4.17311429977417, |
| "learning_rate": 2.7005597014925377e-05, |
| "loss": 0.893, |
| "num_input_tokens_seen": 165152, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.0914179104477613, |
| "grad_norm": 3.1866776943206787, |
| "learning_rate": 2.7238805970149255e-05, |
| "loss": 0.5222, |
| "num_input_tokens_seen": 166560, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.1007462686567164, |
| "grad_norm": 3.497123956680298, |
| "learning_rate": 2.7472014925373136e-05, |
| "loss": 0.599, |
| "num_input_tokens_seen": 167904, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.1100746268656716, |
| "grad_norm": 5.333121299743652, |
| "learning_rate": 2.7705223880597014e-05, |
| "loss": 0.7263, |
| "num_input_tokens_seen": 169280, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.1194029850746268, |
| "grad_norm": 3.0324625968933105, |
| "learning_rate": 2.79384328358209e-05, |
| "loss": 0.5613, |
| "num_input_tokens_seen": 170656, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.1287313432835822, |
| "grad_norm": 3.550719976425171, |
| "learning_rate": 2.817164179104478e-05, |
| "loss": 0.6407, |
| "num_input_tokens_seen": 172352, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.1380597014925373, |
| "grad_norm": 3.053112268447876, |
| "learning_rate": 2.8404850746268657e-05, |
| "loss": 0.4429, |
| "num_input_tokens_seen": 173888, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.1473880597014925, |
| "grad_norm": 2.621811628341675, |
| "learning_rate": 2.863805970149254e-05, |
| "loss": 0.8407, |
| "num_input_tokens_seen": 175040, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.1567164179104479, |
| "grad_norm": 2.650109052658081, |
| "learning_rate": 2.8871268656716423e-05, |
| "loss": 0.5142, |
| "num_input_tokens_seen": 176480, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.166044776119403, |
| "grad_norm": 3.527712821960449, |
| "learning_rate": 2.91044776119403e-05, |
| "loss": 0.5449, |
| "num_input_tokens_seen": 177952, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.1753731343283582, |
| "grad_norm": 4.733219146728516, |
| "learning_rate": 2.9337686567164178e-05, |
| "loss": 0.8772, |
| "num_input_tokens_seen": 179552, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.1847014925373134, |
| "grad_norm": 3.7956483364105225, |
| "learning_rate": 2.957089552238806e-05, |
| "loss": 0.6542, |
| "num_input_tokens_seen": 181120, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.1940298507462686, |
| "grad_norm": 2.491182565689087, |
| "learning_rate": 2.9804104477611944e-05, |
| "loss": 0.4873, |
| "num_input_tokens_seen": 182560, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.203358208955224, |
| "grad_norm": 3.6816084384918213, |
| "learning_rate": 3.003731343283582e-05, |
| "loss": 0.5275, |
| "num_input_tokens_seen": 184064, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.212686567164179, |
| "grad_norm": 2.040377140045166, |
| "learning_rate": 3.0270522388059703e-05, |
| "loss": 0.4792, |
| "num_input_tokens_seen": 185600, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.2220149253731343, |
| "grad_norm": 2.9813172817230225, |
| "learning_rate": 3.050373134328358e-05, |
| "loss": 0.5482, |
| "num_input_tokens_seen": 186848, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.2313432835820897, |
| "grad_norm": 3.298184633255005, |
| "learning_rate": 3.0736940298507465e-05, |
| "loss": 0.4932, |
| "num_input_tokens_seen": 188224, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.2406716417910448, |
| "grad_norm": 3.5363686084747314, |
| "learning_rate": 3.0970149253731346e-05, |
| "loss": 0.553, |
| "num_input_tokens_seen": 189696, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 3.1834492683410645, |
| "learning_rate": 3.120335820895523e-05, |
| "loss": 0.6838, |
| "num_input_tokens_seen": 191360, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.2593283582089552, |
| "grad_norm": 3.2151713371276855, |
| "learning_rate": 3.14365671641791e-05, |
| "loss": 0.5203, |
| "num_input_tokens_seen": 192800, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.2686567164179103, |
| "grad_norm": 3.447000741958618, |
| "learning_rate": 3.166977611940299e-05, |
| "loss": 0.8755, |
| "num_input_tokens_seen": 194208, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.2779850746268657, |
| "grad_norm": 3.3230972290039062, |
| "learning_rate": 3.190298507462687e-05, |
| "loss": 0.6844, |
| "num_input_tokens_seen": 195552, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.287313432835821, |
| "grad_norm": 2.030858039855957, |
| "learning_rate": 3.2136194029850745e-05, |
| "loss": 0.4773, |
| "num_input_tokens_seen": 196832, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.296641791044776, |
| "grad_norm": 2.7665631771087646, |
| "learning_rate": 3.2369402985074626e-05, |
| "loss": 0.5726, |
| "num_input_tokens_seen": 198112, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.3059701492537314, |
| "grad_norm": 3.8224518299102783, |
| "learning_rate": 3.2602611940298514e-05, |
| "loss": 0.6753, |
| "num_input_tokens_seen": 199680, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.3152985074626866, |
| "grad_norm": 2.3687782287597656, |
| "learning_rate": 3.283582089552239e-05, |
| "loss": 0.5688, |
| "num_input_tokens_seen": 200960, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.3246268656716418, |
| "grad_norm": 2.6985511779785156, |
| "learning_rate": 3.306902985074627e-05, |
| "loss": 0.5203, |
| "num_input_tokens_seen": 202368, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.333955223880597, |
| "grad_norm": 3.449769973754883, |
| "learning_rate": 3.330223880597015e-05, |
| "loss": 0.6356, |
| "num_input_tokens_seen": 203744, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.3432835820895521, |
| "grad_norm": 3.68648362159729, |
| "learning_rate": 3.353544776119403e-05, |
| "loss": 0.5851, |
| "num_input_tokens_seen": 205056, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.3526119402985075, |
| "grad_norm": 2.581493854522705, |
| "learning_rate": 3.376865671641791e-05, |
| "loss": 0.5889, |
| "num_input_tokens_seen": 206432, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.3619402985074627, |
| "grad_norm": 3.659214496612549, |
| "learning_rate": 3.4001865671641794e-05, |
| "loss": 0.8588, |
| "num_input_tokens_seen": 207712, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.3712686567164178, |
| "grad_norm": 2.3766164779663086, |
| "learning_rate": 3.423507462686567e-05, |
| "loss": 0.5765, |
| "num_input_tokens_seen": 208992, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.3805970149253732, |
| "grad_norm": 3.7465713024139404, |
| "learning_rate": 3.4468283582089556e-05, |
| "loss": 0.7431, |
| "num_input_tokens_seen": 210400, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.3899253731343284, |
| "grad_norm": 4.194060802459717, |
| "learning_rate": 3.470149253731344e-05, |
| "loss": 0.6413, |
| "num_input_tokens_seen": 211936, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.3992537313432836, |
| "grad_norm": 3.062356948852539, |
| "learning_rate": 3.493470149253731e-05, |
| "loss": 0.5948, |
| "num_input_tokens_seen": 213376, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.4085820895522387, |
| "grad_norm": 4.362344264984131, |
| "learning_rate": 3.516791044776119e-05, |
| "loss": 0.7868, |
| "num_input_tokens_seen": 214784, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.417910447761194, |
| "grad_norm": 4.349671840667725, |
| "learning_rate": 3.540111940298508e-05, |
| "loss": 0.509, |
| "num_input_tokens_seen": 216224, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.4272388059701493, |
| "grad_norm": 3.4866678714752197, |
| "learning_rate": 3.5634328358208955e-05, |
| "loss": 0.7522, |
| "num_input_tokens_seen": 217600, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.4365671641791045, |
| "grad_norm": 2.970207691192627, |
| "learning_rate": 3.5867537313432836e-05, |
| "loss": 0.6378, |
| "num_input_tokens_seen": 219040, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.4458955223880596, |
| "grad_norm": 5.6999287605285645, |
| "learning_rate": 3.610074626865672e-05, |
| "loss": 0.7619, |
| "num_input_tokens_seen": 220480, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.455223880597015, |
| "grad_norm": 2.8607735633850098, |
| "learning_rate": 3.63339552238806e-05, |
| "loss": 0.4045, |
| "num_input_tokens_seen": 222208, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.4645522388059702, |
| "grad_norm": 3.6118452548980713, |
| "learning_rate": 3.656716417910448e-05, |
| "loss": 0.4685, |
| "num_input_tokens_seen": 223648, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.4738805970149254, |
| "grad_norm": 2.9141979217529297, |
| "learning_rate": 3.680037313432836e-05, |
| "loss": 0.5132, |
| "num_input_tokens_seen": 225152, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.4832089552238805, |
| "grad_norm": 3.447788715362549, |
| "learning_rate": 3.7033582089552234e-05, |
| "loss": 0.6576, |
| "num_input_tokens_seen": 226528, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.4925373134328357, |
| "grad_norm": 4.151407241821289, |
| "learning_rate": 3.726679104477612e-05, |
| "loss": 0.58, |
| "num_input_tokens_seen": 228000, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.501865671641791, |
| "grad_norm": 2.671635150909424, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.4283, |
| "num_input_tokens_seen": 229600, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.5111940298507462, |
| "grad_norm": 2.491536855697632, |
| "learning_rate": 3.773320895522388e-05, |
| "loss": 0.5973, |
| "num_input_tokens_seen": 230912, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.5205223880597014, |
| "grad_norm": 2.8027684688568115, |
| "learning_rate": 3.7966417910447766e-05, |
| "loss": 0.5326, |
| "num_input_tokens_seen": 232256, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.5298507462686568, |
| "grad_norm": 2.925844192504883, |
| "learning_rate": 3.819962686567165e-05, |
| "loss": 0.4365, |
| "num_input_tokens_seen": 233728, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.539179104477612, |
| "grad_norm": 2.696824789047241, |
| "learning_rate": 3.843283582089552e-05, |
| "loss": 0.4974, |
| "num_input_tokens_seen": 235072, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.5485074626865671, |
| "grad_norm": 2.4541735649108887, |
| "learning_rate": 3.86660447761194e-05, |
| "loss": 0.6577, |
| "num_input_tokens_seen": 236512, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.5578358208955225, |
| "grad_norm": 2.5119307041168213, |
| "learning_rate": 3.889925373134329e-05, |
| "loss": 0.4667, |
| "num_input_tokens_seen": 237952, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.5671641791044775, |
| "grad_norm": 2.563211441040039, |
| "learning_rate": 3.9132462686567164e-05, |
| "loss": 0.5993, |
| "num_input_tokens_seen": 239328, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.5764925373134329, |
| "grad_norm": 2.452744245529175, |
| "learning_rate": 3.9365671641791046e-05, |
| "loss": 0.4841, |
| "num_input_tokens_seen": 240736, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.585820895522388, |
| "grad_norm": 3.0569136142730713, |
| "learning_rate": 3.959888059701493e-05, |
| "loss": 0.6129, |
| "num_input_tokens_seen": 241984, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.5951492537313432, |
| "grad_norm": 5.140933036804199, |
| "learning_rate": 3.983208955223881e-05, |
| "loss": 0.6906, |
| "num_input_tokens_seen": 243296, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.6044776119402986, |
| "grad_norm": 3.8570048809051514, |
| "learning_rate": 4.006529850746269e-05, |
| "loss": 0.5247, |
| "num_input_tokens_seen": 244960, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.6138059701492538, |
| "grad_norm": 3.173438310623169, |
| "learning_rate": 4.029850746268657e-05, |
| "loss": 0.6009, |
| "num_input_tokens_seen": 246304, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.623134328358209, |
| "grad_norm": 6.471933841705322, |
| "learning_rate": 4.053171641791045e-05, |
| "loss": 0.6328, |
| "num_input_tokens_seen": 247680, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.6324626865671643, |
| "grad_norm": 3.5459954738616943, |
| "learning_rate": 4.076492537313433e-05, |
| "loss": 0.625, |
| "num_input_tokens_seen": 249152, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.6417910447761193, |
| "grad_norm": 2.7012815475463867, |
| "learning_rate": 4.099813432835821e-05, |
| "loss": 0.601, |
| "num_input_tokens_seen": 250496, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.6511194029850746, |
| "grad_norm": 3.285043954849243, |
| "learning_rate": 4.1231343283582094e-05, |
| "loss": 0.5433, |
| "num_input_tokens_seen": 251872, |
| "step": 885 |
| }, |
| { |
| "epoch": 1.6604477611940298, |
| "grad_norm": 3.363837480545044, |
| "learning_rate": 4.146455223880597e-05, |
| "loss": 0.4303, |
| "num_input_tokens_seen": 253440, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.669776119402985, |
| "grad_norm": 3.7926578521728516, |
| "learning_rate": 4.169776119402986e-05, |
| "loss": 0.5429, |
| "num_input_tokens_seen": 254720, |
| "step": 895 |
| }, |
| { |
| "epoch": 1.6791044776119404, |
| "grad_norm": 7.360354423522949, |
| "learning_rate": 4.193097014925373e-05, |
| "loss": 0.764, |
| "num_input_tokens_seen": 256320, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.6884328358208955, |
| "grad_norm": 5.666825771331787, |
| "learning_rate": 4.216417910447761e-05, |
| "loss": 0.5493, |
| "num_input_tokens_seen": 257600, |
| "step": 905 |
| }, |
| { |
| "epoch": 1.6977611940298507, |
| "grad_norm": 2.6743109226226807, |
| "learning_rate": 4.239738805970149e-05, |
| "loss": 0.3762, |
| "num_input_tokens_seen": 259008, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.707089552238806, |
| "grad_norm": 3.6945884227752686, |
| "learning_rate": 4.2630597014925374e-05, |
| "loss": 0.4236, |
| "num_input_tokens_seen": 260448, |
| "step": 915 |
| }, |
| { |
| "epoch": 1.716417910447761, |
| "grad_norm": 3.7885358333587646, |
| "learning_rate": 4.2863805970149255e-05, |
| "loss": 0.7139, |
| "num_input_tokens_seen": 261888, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.7257462686567164, |
| "grad_norm": 2.1902737617492676, |
| "learning_rate": 4.3097014925373137e-05, |
| "loss": 0.5821, |
| "num_input_tokens_seen": 263328, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.7350746268656716, |
| "grad_norm": 3.4764726161956787, |
| "learning_rate": 4.333022388059702e-05, |
| "loss": 0.5165, |
| "num_input_tokens_seen": 264832, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.7444029850746268, |
| "grad_norm": 3.5427489280700684, |
| "learning_rate": 4.35634328358209e-05, |
| "loss": 0.8928, |
| "num_input_tokens_seen": 266368, |
| "step": 935 |
| }, |
| { |
| "epoch": 1.7537313432835822, |
| "grad_norm": 3.624504566192627, |
| "learning_rate": 4.379664179104478e-05, |
| "loss": 0.7654, |
| "num_input_tokens_seen": 267904, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.7630597014925373, |
| "grad_norm": 2.536579132080078, |
| "learning_rate": 4.402985074626866e-05, |
| "loss": 0.3961, |
| "num_input_tokens_seen": 269504, |
| "step": 945 |
| }, |
| { |
| "epoch": 1.7723880597014925, |
| "grad_norm": 2.0857138633728027, |
| "learning_rate": 4.4263059701492535e-05, |
| "loss": 0.4261, |
| "num_input_tokens_seen": 270912, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.7817164179104479, |
| "grad_norm": 3.7415146827697754, |
| "learning_rate": 4.449626865671642e-05, |
| "loss": 0.5659, |
| "num_input_tokens_seen": 272384, |
| "step": 955 |
| }, |
| { |
| "epoch": 1.7910447761194028, |
| "grad_norm": 3.1793229579925537, |
| "learning_rate": 4.4729477611940304e-05, |
| "loss": 0.5779, |
| "num_input_tokens_seen": 273760, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.8003731343283582, |
| "grad_norm": 1.8699004650115967, |
| "learning_rate": 4.496268656716418e-05, |
| "loss": 0.3813, |
| "num_input_tokens_seen": 275424, |
| "step": 965 |
| }, |
| { |
| "epoch": 1.8097014925373134, |
| "grad_norm": 7.255972862243652, |
| "learning_rate": 4.519589552238806e-05, |
| "loss": 0.8168, |
| "num_input_tokens_seen": 276832, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.8190298507462686, |
| "grad_norm": 3.1394460201263428, |
| "learning_rate": 4.542910447761195e-05, |
| "loss": 0.8514, |
| "num_input_tokens_seen": 278336, |
| "step": 975 |
| }, |
| { |
| "epoch": 1.828358208955224, |
| "grad_norm": 3.5392227172851562, |
| "learning_rate": 4.566231343283582e-05, |
| "loss": 0.5994, |
| "num_input_tokens_seen": 279712, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.837686567164179, |
| "grad_norm": 3.2066144943237305, |
| "learning_rate": 4.58955223880597e-05, |
| "loss": 0.6509, |
| "num_input_tokens_seen": 281152, |
| "step": 985 |
| }, |
| { |
| "epoch": 1.8470149253731343, |
| "grad_norm": 4.86817741394043, |
| "learning_rate": 4.6128731343283584e-05, |
| "loss": 0.693, |
| "num_input_tokens_seen": 282560, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.8563432835820897, |
| "grad_norm": 2.7181270122528076, |
| "learning_rate": 4.6361940298507465e-05, |
| "loss": 0.5695, |
| "num_input_tokens_seen": 283968, |
| "step": 995 |
| }, |
| { |
| "epoch": 1.8656716417910446, |
| "grad_norm": 3.693114757537842, |
| "learning_rate": 4.6595149253731346e-05, |
| "loss": 0.5724, |
| "num_input_tokens_seen": 285440, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.875, |
| "grad_norm": 2.446584463119507, |
| "learning_rate": 4.682835820895523e-05, |
| "loss": 0.4546, |
| "num_input_tokens_seen": 286944, |
| "step": 1005 |
| }, |
| { |
| "epoch": 1.8843283582089554, |
| "grad_norm": 3.6125104427337646, |
| "learning_rate": 4.70615671641791e-05, |
| "loss": 0.6626, |
| "num_input_tokens_seen": 288320, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.8936567164179103, |
| "grad_norm": 4.2701568603515625, |
| "learning_rate": 4.729477611940299e-05, |
| "loss": 0.5084, |
| "num_input_tokens_seen": 289568, |
| "step": 1015 |
| }, |
| { |
| "epoch": 1.9029850746268657, |
| "grad_norm": 3.699568748474121, |
| "learning_rate": 4.752798507462687e-05, |
| "loss": 0.791, |
| "num_input_tokens_seen": 290976, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.912313432835821, |
| "grad_norm": 3.4514174461364746, |
| "learning_rate": 4.7761194029850745e-05, |
| "loss": 0.6876, |
| "num_input_tokens_seen": 292576, |
| "step": 1025 |
| }, |
| { |
| "epoch": 1.921641791044776, |
| "grad_norm": 3.7477970123291016, |
| "learning_rate": 4.7994402985074626e-05, |
| "loss": 0.6096, |
| "num_input_tokens_seen": 293952, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.9309701492537314, |
| "grad_norm": 3.3548812866210938, |
| "learning_rate": 4.8227611940298514e-05, |
| "loss": 0.5653, |
| "num_input_tokens_seen": 295488, |
| "step": 1035 |
| }, |
| { |
| "epoch": 1.9402985074626866, |
| "grad_norm": 6.471683502197266, |
| "learning_rate": 4.846082089552239e-05, |
| "loss": 0.6284, |
| "num_input_tokens_seen": 296768, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.9496268656716418, |
| "grad_norm": 2.4862189292907715, |
| "learning_rate": 4.869402985074627e-05, |
| "loss": 0.464, |
| "num_input_tokens_seen": 298240, |
| "step": 1045 |
| }, |
| { |
| "epoch": 1.9589552238805972, |
| "grad_norm": 2.749873399734497, |
| "learning_rate": 4.892723880597015e-05, |
| "loss": 0.5278, |
| "num_input_tokens_seen": 299712, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.9682835820895521, |
| "grad_norm": 3.4777090549468994, |
| "learning_rate": 4.916044776119403e-05, |
| "loss": 0.5709, |
| "num_input_tokens_seen": 300864, |
| "step": 1055 |
| }, |
| { |
| "epoch": 1.9776119402985075, |
| "grad_norm": 4.218428134918213, |
| "learning_rate": 4.939365671641791e-05, |
| "loss": 0.5777, |
| "num_input_tokens_seen": 302336, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.9869402985074627, |
| "grad_norm": 3.2908074855804443, |
| "learning_rate": 4.9626865671641794e-05, |
| "loss": 0.6037, |
| "num_input_tokens_seen": 303584, |
| "step": 1065 |
| }, |
| { |
| "epoch": 1.9962686567164178, |
| "grad_norm": 2.5879201889038086, |
| "learning_rate": 4.986007462686567e-05, |
| "loss": 0.6224, |
| "num_input_tokens_seen": 304960, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.5773242712020874, |
| "eval_runtime": 2.8941, |
| "eval_samples_per_second": 82.237, |
| "eval_steps_per_second": 20.732, |
| "num_input_tokens_seen": 305288, |
| "step": 1072 |
| }, |
| { |
| "epoch": 2.0055970149253732, |
| "grad_norm": 3.076622724533081, |
| "learning_rate": 4.999999469854421e-05, |
| "loss": 0.4418, |
| "num_input_tokens_seen": 306152, |
| "step": 1075 |
| }, |
| { |
| "epoch": 2.014925373134328, |
| "grad_norm": 3.1399412155151367, |
| "learning_rate": 4.999993505719229e-05, |
| "loss": 0.6218, |
| "num_input_tokens_seen": 307464, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.0242537313432836, |
| "grad_norm": 1.369956612586975, |
| "learning_rate": 4.9999809147827334e-05, |
| "loss": 0.5342, |
| "num_input_tokens_seen": 308968, |
| "step": 1085 |
| }, |
| { |
| "epoch": 2.033582089552239, |
| "grad_norm": 3.571125030517578, |
| "learning_rate": 4.9999616970783104e-05, |
| "loss": 0.5617, |
| "num_input_tokens_seen": 310440, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.042910447761194, |
| "grad_norm": 2.698582172393799, |
| "learning_rate": 4.999935852656898e-05, |
| "loss": 0.6012, |
| "num_input_tokens_seen": 312008, |
| "step": 1095 |
| }, |
| { |
| "epoch": 2.0522388059701493, |
| "grad_norm": 3.306818962097168, |
| "learning_rate": 4.9999033815870047e-05, |
| "loss": 0.5944, |
| "num_input_tokens_seen": 313416, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.0615671641791047, |
| "grad_norm": 2.9419455528259277, |
| "learning_rate": 4.999864283954701e-05, |
| "loss": 0.5887, |
| "num_input_tokens_seen": 314728, |
| "step": 1105 |
| }, |
| { |
| "epoch": 2.0708955223880596, |
| "grad_norm": 2.3801064491271973, |
| "learning_rate": 4.999818559863626e-05, |
| "loss": 0.6053, |
| "num_input_tokens_seen": 316072, |
| "step": 1110 |
| }, |
| { |
| "epoch": 2.080223880597015, |
| "grad_norm": 3.8569228649139404, |
| "learning_rate": 4.99976620943498e-05, |
| "loss": 0.5197, |
| "num_input_tokens_seen": 317384, |
| "step": 1115 |
| }, |
| { |
| "epoch": 2.08955223880597, |
| "grad_norm": 2.539523124694824, |
| "learning_rate": 4.9997072328075315e-05, |
| "loss": 0.4471, |
| "num_input_tokens_seen": 318696, |
| "step": 1120 |
| }, |
| { |
| "epoch": 2.0988805970149254, |
| "grad_norm": 3.7452080249786377, |
| "learning_rate": 4.999641630137609e-05, |
| "loss": 0.4622, |
| "num_input_tokens_seen": 320008, |
| "step": 1125 |
| }, |
| { |
| "epoch": 2.1082089552238807, |
| "grad_norm": 2.360616445541382, |
| "learning_rate": 4.9995694015991105e-05, |
| "loss": 0.4203, |
| "num_input_tokens_seen": 321480, |
| "step": 1130 |
| }, |
| { |
| "epoch": 2.1175373134328357, |
| "grad_norm": 3.3733887672424316, |
| "learning_rate": 4.9994905473834905e-05, |
| "loss": 0.4315, |
| "num_input_tokens_seen": 323016, |
| "step": 1135 |
| }, |
| { |
| "epoch": 2.126865671641791, |
| "grad_norm": 2.697133779525757, |
| "learning_rate": 4.999405067699773e-05, |
| "loss": 0.3873, |
| "num_input_tokens_seen": 324456, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.1361940298507465, |
| "grad_norm": 1.9267628192901611, |
| "learning_rate": 4.99931296277454e-05, |
| "loss": 0.2808, |
| "num_input_tokens_seen": 326120, |
| "step": 1145 |
| }, |
| { |
| "epoch": 2.1455223880597014, |
| "grad_norm": 2.748382568359375, |
| "learning_rate": 4.999214232851937e-05, |
| "loss": 0.625, |
| "num_input_tokens_seen": 327624, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.154850746268657, |
| "grad_norm": 4.176105976104736, |
| "learning_rate": 4.99910887819367e-05, |
| "loss": 0.4299, |
| "num_input_tokens_seen": 329160, |
| "step": 1155 |
| }, |
| { |
| "epoch": 2.1641791044776117, |
| "grad_norm": 4.723660945892334, |
| "learning_rate": 4.998996899079005e-05, |
| "loss": 0.7819, |
| "num_input_tokens_seen": 330472, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.173507462686567, |
| "grad_norm": 3.9092824459075928, |
| "learning_rate": 4.998878295804768e-05, |
| "loss": 0.311, |
| "num_input_tokens_seen": 332008, |
| "step": 1165 |
| }, |
| { |
| "epoch": 2.1828358208955225, |
| "grad_norm": 3.1630048751831055, |
| "learning_rate": 4.998753068685346e-05, |
| "loss": 0.3946, |
| "num_input_tokens_seen": 333384, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.1921641791044775, |
| "grad_norm": 4.498344421386719, |
| "learning_rate": 4.998621218052679e-05, |
| "loss": 0.533, |
| "num_input_tokens_seen": 334696, |
| "step": 1175 |
| }, |
| { |
| "epoch": 2.201492537313433, |
| "grad_norm": 3.314195394515991, |
| "learning_rate": 4.99848274425627e-05, |
| "loss": 0.4473, |
| "num_input_tokens_seen": 336232, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.2108208955223883, |
| "grad_norm": 3.516960859298706, |
| "learning_rate": 4.998337647663173e-05, |
| "loss": 0.4742, |
| "num_input_tokens_seen": 337512, |
| "step": 1185 |
| }, |
| { |
| "epoch": 2.220149253731343, |
| "grad_norm": 2.3780012130737305, |
| "learning_rate": 4.998185928658e-05, |
| "loss": 0.4866, |
| "num_input_tokens_seen": 339080, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.2294776119402986, |
| "grad_norm": 4.294321060180664, |
| "learning_rate": 4.998027587642917e-05, |
| "loss": 0.6281, |
| "num_input_tokens_seen": 340456, |
| "step": 1195 |
| }, |
| { |
| "epoch": 2.2388059701492535, |
| "grad_norm": 0.9334647059440613, |
| "learning_rate": 4.9978626250376435e-05, |
| "loss": 0.4126, |
| "num_input_tokens_seen": 341864, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.248134328358209, |
| "grad_norm": 3.350092649459839, |
| "learning_rate": 4.99769104127945e-05, |
| "loss": 0.567, |
| "num_input_tokens_seen": 343240, |
| "step": 1205 |
| }, |
| { |
| "epoch": 2.2574626865671643, |
| "grad_norm": 2.1908762454986572, |
| "learning_rate": 4.9975128368231574e-05, |
| "loss": 0.6061, |
| "num_input_tokens_seen": 344808, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.2667910447761193, |
| "grad_norm": 4.0524773597717285, |
| "learning_rate": 4.997328012141138e-05, |
| "loss": 0.4128, |
| "num_input_tokens_seen": 346152, |
| "step": 1215 |
| }, |
| { |
| "epoch": 2.2761194029850746, |
| "grad_norm": 3.479384660720825, |
| "learning_rate": 4.997136567723311e-05, |
| "loss": 0.4272, |
| "num_input_tokens_seen": 347528, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.28544776119403, |
| "grad_norm": 3.9866700172424316, |
| "learning_rate": 4.9969385040771445e-05, |
| "loss": 0.4487, |
| "num_input_tokens_seen": 348776, |
| "step": 1225 |
| }, |
| { |
| "epoch": 2.294776119402985, |
| "grad_norm": 4.011523723602295, |
| "learning_rate": 4.99673382172765e-05, |
| "loss": 0.4096, |
| "num_input_tokens_seen": 350088, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.3041044776119404, |
| "grad_norm": 4.297852039337158, |
| "learning_rate": 4.996522521217386e-05, |
| "loss": 0.5967, |
| "num_input_tokens_seen": 351528, |
| "step": 1235 |
| }, |
| { |
| "epoch": 2.3134328358208958, |
| "grad_norm": 2.498403310775757, |
| "learning_rate": 4.996304603106451e-05, |
| "loss": 0.6526, |
| "num_input_tokens_seen": 352840, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.3227611940298507, |
| "grad_norm": 1.636733055114746, |
| "learning_rate": 4.996080067972487e-05, |
| "loss": 0.506, |
| "num_input_tokens_seen": 354152, |
| "step": 1245 |
| }, |
| { |
| "epoch": 2.332089552238806, |
| "grad_norm": 4.06488561630249, |
| "learning_rate": 4.995848916410677e-05, |
| "loss": 0.5328, |
| "num_input_tokens_seen": 355496, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.341417910447761, |
| "grad_norm": 2.9285223484039307, |
| "learning_rate": 4.995611149033739e-05, |
| "loss": 0.3659, |
| "num_input_tokens_seen": 357096, |
| "step": 1255 |
| }, |
| { |
| "epoch": 2.3507462686567164, |
| "grad_norm": 3.7447516918182373, |
| "learning_rate": 4.995366766471929e-05, |
| "loss": 0.4156, |
| "num_input_tokens_seen": 358312, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.360074626865672, |
| "grad_norm": 4.072560787200928, |
| "learning_rate": 4.99511576937304e-05, |
| "loss": 0.638, |
| "num_input_tokens_seen": 359752, |
| "step": 1265 |
| }, |
| { |
| "epoch": 2.3694029850746268, |
| "grad_norm": 3.8190813064575195, |
| "learning_rate": 4.9948581584023965e-05, |
| "loss": 0.4103, |
| "num_input_tokens_seen": 361128, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.378731343283582, |
| "grad_norm": 3.4160633087158203, |
| "learning_rate": 4.994593934242855e-05, |
| "loss": 0.3994, |
| "num_input_tokens_seen": 362600, |
| "step": 1275 |
| }, |
| { |
| "epoch": 2.388059701492537, |
| "grad_norm": 3.4851877689361572, |
| "learning_rate": 4.9943230975948016e-05, |
| "loss": 0.6019, |
| "num_input_tokens_seen": 364072, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.3973880597014925, |
| "grad_norm": 3.7114920616149902, |
| "learning_rate": 4.99404564917615e-05, |
| "loss": 0.6478, |
| "num_input_tokens_seen": 365576, |
| "step": 1285 |
| }, |
| { |
| "epoch": 2.406716417910448, |
| "grad_norm": 4.34594202041626, |
| "learning_rate": 4.993761589722341e-05, |
| "loss": 0.4604, |
| "num_input_tokens_seen": 367208, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.416044776119403, |
| "grad_norm": 3.012343168258667, |
| "learning_rate": 4.9934709199863386e-05, |
| "loss": 0.6368, |
| "num_input_tokens_seen": 368680, |
| "step": 1295 |
| }, |
| { |
| "epoch": 2.425373134328358, |
| "grad_norm": 1.8549182415008545, |
| "learning_rate": 4.993173640738629e-05, |
| "loss": 0.6399, |
| "num_input_tokens_seen": 370408, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.4347014925373136, |
| "grad_norm": 3.024909257888794, |
| "learning_rate": 4.992869752767218e-05, |
| "loss": 0.3907, |
| "num_input_tokens_seen": 371784, |
| "step": 1305 |
| }, |
| { |
| "epoch": 2.4440298507462686, |
| "grad_norm": 3.6774892807006836, |
| "learning_rate": 4.99255925687763e-05, |
| "loss": 0.4805, |
| "num_input_tokens_seen": 373256, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.453358208955224, |
| "grad_norm": 1.8474637269973755, |
| "learning_rate": 4.992242153892906e-05, |
| "loss": 0.4132, |
| "num_input_tokens_seen": 374760, |
| "step": 1315 |
| }, |
| { |
| "epoch": 2.4626865671641793, |
| "grad_norm": 2.9736571311950684, |
| "learning_rate": 4.991918444653598e-05, |
| "loss": 0.484, |
| "num_input_tokens_seen": 376360, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.4720149253731343, |
| "grad_norm": 3.373361349105835, |
| "learning_rate": 4.9915881300177725e-05, |
| "loss": 0.4176, |
| "num_input_tokens_seen": 377960, |
| "step": 1325 |
| }, |
| { |
| "epoch": 2.4813432835820897, |
| "grad_norm": 3.993251323699951, |
| "learning_rate": 4.991251210861002e-05, |
| "loss": 0.6992, |
| "num_input_tokens_seen": 379208, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.4906716417910446, |
| "grad_norm": 3.271693706512451, |
| "learning_rate": 4.9909076880763684e-05, |
| "loss": 0.5246, |
| "num_input_tokens_seen": 380744, |
| "step": 1335 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 3.4205448627471924, |
| "learning_rate": 4.9905575625744564e-05, |
| "loss": 0.5214, |
| "num_input_tokens_seen": 382120, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.5093283582089554, |
| "grad_norm": 4.5667877197265625, |
| "learning_rate": 4.990200835283353e-05, |
| "loss": 0.6159, |
| "num_input_tokens_seen": 383400, |
| "step": 1345 |
| }, |
| { |
| "epoch": 2.5186567164179103, |
| "grad_norm": 3.362797975540161, |
| "learning_rate": 4.9898375071486465e-05, |
| "loss": 0.5133, |
| "num_input_tokens_seen": 384680, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.5279850746268657, |
| "grad_norm": 4.309556007385254, |
| "learning_rate": 4.989467579133419e-05, |
| "loss": 0.5545, |
| "num_input_tokens_seen": 385992, |
| "step": 1355 |
| }, |
| { |
| "epoch": 2.5373134328358207, |
| "grad_norm": 3.0154802799224854, |
| "learning_rate": 4.98909105221825e-05, |
| "loss": 0.4216, |
| "num_input_tokens_seen": 387368, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.546641791044776, |
| "grad_norm": 3.6007492542266846, |
| "learning_rate": 4.98870792740121e-05, |
| "loss": 0.664, |
| "num_input_tokens_seen": 388776, |
| "step": 1365 |
| }, |
| { |
| "epoch": 2.5559701492537314, |
| "grad_norm": 2.9543521404266357, |
| "learning_rate": 4.988318205697856e-05, |
| "loss": 0.5082, |
| "num_input_tokens_seen": 390440, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.5652985074626864, |
| "grad_norm": 4.019232749938965, |
| "learning_rate": 4.9879218881412366e-05, |
| "loss": 0.5942, |
| "num_input_tokens_seen": 391720, |
| "step": 1375 |
| }, |
| { |
| "epoch": 2.574626865671642, |
| "grad_norm": 3.4183766841888428, |
| "learning_rate": 4.9875189757818805e-05, |
| "loss": 0.5922, |
| "num_input_tokens_seen": 393032, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.583955223880597, |
| "grad_norm": 2.1808016300201416, |
| "learning_rate": 4.9871094696877995e-05, |
| "loss": 0.5059, |
| "num_input_tokens_seen": 394312, |
| "step": 1385 |
| }, |
| { |
| "epoch": 2.593283582089552, |
| "grad_norm": 3.1968891620635986, |
| "learning_rate": 4.986693370944481e-05, |
| "loss": 0.652, |
| "num_input_tokens_seen": 395656, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.6026119402985075, |
| "grad_norm": 1.5971628427505493, |
| "learning_rate": 4.986270680654891e-05, |
| "loss": 0.5223, |
| "num_input_tokens_seen": 397288, |
| "step": 1395 |
| }, |
| { |
| "epoch": 2.611940298507463, |
| "grad_norm": 2.5914952754974365, |
| "learning_rate": 4.985841399939465e-05, |
| "loss": 0.6079, |
| "num_input_tokens_seen": 398600, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.621268656716418, |
| "grad_norm": 4.969552040100098, |
| "learning_rate": 4.98540552993611e-05, |
| "loss": 0.6091, |
| "num_input_tokens_seen": 400168, |
| "step": 1405 |
| }, |
| { |
| "epoch": 2.6305970149253732, |
| "grad_norm": 6.151612758636475, |
| "learning_rate": 4.9849630718001986e-05, |
| "loss": 0.6323, |
| "num_input_tokens_seen": 401544, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.6399253731343286, |
| "grad_norm": 3.6210825443267822, |
| "learning_rate": 4.9845140267045654e-05, |
| "loss": 0.4287, |
| "num_input_tokens_seen": 402984, |
| "step": 1415 |
| }, |
| { |
| "epoch": 2.6492537313432836, |
| "grad_norm": 4.254519939422607, |
| "learning_rate": 4.984058395839508e-05, |
| "loss": 0.6632, |
| "num_input_tokens_seen": 404328, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.658582089552239, |
| "grad_norm": 3.8595051765441895, |
| "learning_rate": 4.983596180412778e-05, |
| "loss": 0.5151, |
| "num_input_tokens_seen": 405608, |
| "step": 1425 |
| }, |
| { |
| "epoch": 2.667910447761194, |
| "grad_norm": 3.393634080886841, |
| "learning_rate": 4.983127381649585e-05, |
| "loss": 0.3853, |
| "num_input_tokens_seen": 407208, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.6772388059701493, |
| "grad_norm": 2.2769412994384766, |
| "learning_rate": 4.9826520007925846e-05, |
| "loss": 0.4185, |
| "num_input_tokens_seen": 408776, |
| "step": 1435 |
| }, |
| { |
| "epoch": 2.6865671641791042, |
| "grad_norm": 3.912013530731201, |
| "learning_rate": 4.982170039101883e-05, |
| "loss": 0.546, |
| "num_input_tokens_seen": 410120, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.6958955223880596, |
| "grad_norm": 3.378753900527954, |
| "learning_rate": 4.981681497855029e-05, |
| "loss": 0.4098, |
| "num_input_tokens_seen": 411624, |
| "step": 1445 |
| }, |
| { |
| "epoch": 2.705223880597015, |
| "grad_norm": 3.542613983154297, |
| "learning_rate": 4.981186378347011e-05, |
| "loss": 0.3672, |
| "num_input_tokens_seen": 413096, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.71455223880597, |
| "grad_norm": 6.728061199188232, |
| "learning_rate": 4.9806846818902575e-05, |
| "loss": 0.7264, |
| "num_input_tokens_seen": 414472, |
| "step": 1455 |
| }, |
| { |
| "epoch": 2.7238805970149254, |
| "grad_norm": 2.363213062286377, |
| "learning_rate": 4.980176409814629e-05, |
| "loss": 0.5699, |
| "num_input_tokens_seen": 416072, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.7332089552238807, |
| "grad_norm": 3.582933187484741, |
| "learning_rate": 4.9796615634674155e-05, |
| "loss": 0.6766, |
| "num_input_tokens_seen": 417640, |
| "step": 1465 |
| }, |
| { |
| "epoch": 2.7425373134328357, |
| "grad_norm": 3.859849691390991, |
| "learning_rate": 4.9791401442133345e-05, |
| "loss": 0.397, |
| "num_input_tokens_seen": 419048, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.751865671641791, |
| "grad_norm": 2.291114330291748, |
| "learning_rate": 4.9786121534345265e-05, |
| "loss": 0.4941, |
| "num_input_tokens_seen": 420520, |
| "step": 1475 |
| }, |
| { |
| "epoch": 2.7611940298507465, |
| "grad_norm": 3.24870228767395, |
| "learning_rate": 4.978077592530551e-05, |
| "loss": 0.5484, |
| "num_input_tokens_seen": 421960, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.7705223880597014, |
| "grad_norm": 3.271296739578247, |
| "learning_rate": 4.977536462918383e-05, |
| "loss": 0.5362, |
| "num_input_tokens_seen": 423304, |
| "step": 1485 |
| }, |
| { |
| "epoch": 2.779850746268657, |
| "grad_norm": 2.682514190673828, |
| "learning_rate": 4.9769887660324094e-05, |
| "loss": 0.5723, |
| "num_input_tokens_seen": 424552, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.789179104477612, |
| "grad_norm": 2.4217052459716797, |
| "learning_rate": 4.9764345033244265e-05, |
| "loss": 0.4591, |
| "num_input_tokens_seen": 425896, |
| "step": 1495 |
| }, |
| { |
| "epoch": 2.798507462686567, |
| "grad_norm": 3.089872360229492, |
| "learning_rate": 4.9758736762636326e-05, |
| "loss": 0.4165, |
| "num_input_tokens_seen": 427208, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.8078358208955225, |
| "grad_norm": 2.684887170791626, |
| "learning_rate": 4.9753062863366276e-05, |
| "loss": 0.4558, |
| "num_input_tokens_seen": 428392, |
| "step": 1505 |
| }, |
| { |
| "epoch": 2.8171641791044775, |
| "grad_norm": 2.7765729427337646, |
| "learning_rate": 4.974732335047408e-05, |
| "loss": 0.4961, |
| "num_input_tokens_seen": 429928, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.826492537313433, |
| "grad_norm": 3.068272352218628, |
| "learning_rate": 4.974151823917361e-05, |
| "loss": 0.5264, |
| "num_input_tokens_seen": 431464, |
| "step": 1515 |
| }, |
| { |
| "epoch": 2.835820895522388, |
| "grad_norm": 2.947761058807373, |
| "learning_rate": 4.973564754485265e-05, |
| "loss": 0.4414, |
| "num_input_tokens_seen": 432968, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.845149253731343, |
| "grad_norm": 5.090349197387695, |
| "learning_rate": 4.97297112830728e-05, |
| "loss": 0.5657, |
| "num_input_tokens_seen": 434472, |
| "step": 1525 |
| }, |
| { |
| "epoch": 2.8544776119402986, |
| "grad_norm": 4.936018943786621, |
| "learning_rate": 4.9723709469569476e-05, |
| "loss": 0.4971, |
| "num_input_tokens_seen": 435944, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.8638059701492535, |
| "grad_norm": 3.9170398712158203, |
| "learning_rate": 4.9717642120251846e-05, |
| "loss": 0.4385, |
| "num_input_tokens_seen": 437608, |
| "step": 1535 |
| }, |
| { |
| "epoch": 2.873134328358209, |
| "grad_norm": 5.93184757232666, |
| "learning_rate": 4.97115092512028e-05, |
| "loss": 0.5671, |
| "num_input_tokens_seen": 439176, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.8824626865671643, |
| "grad_norm": 3.2376952171325684, |
| "learning_rate": 4.97053108786789e-05, |
| "loss": 0.9106, |
| "num_input_tokens_seen": 440520, |
| "step": 1545 |
| }, |
| { |
| "epoch": 2.8917910447761193, |
| "grad_norm": 3.3897287845611572, |
| "learning_rate": 4.9699047019110346e-05, |
| "loss": 0.6528, |
| "num_input_tokens_seen": 441896, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.9011194029850746, |
| "grad_norm": 5.194111347198486, |
| "learning_rate": 4.969271768910093e-05, |
| "loss": 0.7076, |
| "num_input_tokens_seen": 443176, |
| "step": 1555 |
| }, |
| { |
| "epoch": 2.91044776119403, |
| "grad_norm": 2.646667957305908, |
| "learning_rate": 4.968632290542796e-05, |
| "loss": 0.4641, |
| "num_input_tokens_seen": 444680, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.919776119402985, |
| "grad_norm": 2.4147565364837646, |
| "learning_rate": 4.9679862685042275e-05, |
| "loss": 0.5277, |
| "num_input_tokens_seen": 446024, |
| "step": 1565 |
| }, |
| { |
| "epoch": 2.9291044776119404, |
| "grad_norm": 2.3430593013763428, |
| "learning_rate": 4.967333704506817e-05, |
| "loss": 0.4, |
| "num_input_tokens_seen": 447656, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.9384328358208958, |
| "grad_norm": 2.9953832626342773, |
| "learning_rate": 4.966674600280332e-05, |
| "loss": 0.5838, |
| "num_input_tokens_seen": 448904, |
| "step": 1575 |
| }, |
| { |
| "epoch": 2.9477611940298507, |
| "grad_norm": 3.121331214904785, |
| "learning_rate": 4.9660089575718786e-05, |
| "loss": 0.4421, |
| "num_input_tokens_seen": 450216, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.957089552238806, |
| "grad_norm": 3.0576043128967285, |
| "learning_rate": 4.965336778145895e-05, |
| "loss": 0.5014, |
| "num_input_tokens_seen": 451592, |
| "step": 1585 |
| }, |
| { |
| "epoch": 2.966417910447761, |
| "grad_norm": 2.610929250717163, |
| "learning_rate": 4.964658063784144e-05, |
| "loss": 0.7541, |
| "num_input_tokens_seen": 453032, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.9757462686567164, |
| "grad_norm": 3.6712334156036377, |
| "learning_rate": 4.963972816285715e-05, |
| "loss": 0.4498, |
| "num_input_tokens_seen": 454408, |
| "step": 1595 |
| }, |
| { |
| "epoch": 2.9850746268656714, |
| "grad_norm": 4.737540245056152, |
| "learning_rate": 4.9632810374670094e-05, |
| "loss": 0.5783, |
| "num_input_tokens_seen": 455912, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.9944029850746268, |
| "grad_norm": 3.4872634410858154, |
| "learning_rate": 4.9625827291617454e-05, |
| "loss": 0.4317, |
| "num_input_tokens_seen": 457256, |
| "step": 1605 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.574218213558197, |
| "eval_runtime": 2.8992, |
| "eval_samples_per_second": 82.092, |
| "eval_steps_per_second": 20.695, |
| "num_input_tokens_seen": 457952, |
| "step": 1608 |
| }, |
| { |
| "epoch": 3.003731343283582, |
| "grad_norm": 2.9664394855499268, |
| "learning_rate": 4.961877893220949e-05, |
| "loss": 0.5241, |
| "num_input_tokens_seen": 458464, |
| "step": 1610 |
| }, |
| { |
| "epoch": 3.013059701492537, |
| "grad_norm": 1.7026567459106445, |
| "learning_rate": 4.961166531512947e-05, |
| "loss": 0.342, |
| "num_input_tokens_seen": 459872, |
| "step": 1615 |
| }, |
| { |
| "epoch": 3.0223880597014925, |
| "grad_norm": 2.048572540283203, |
| "learning_rate": 4.9604486459233655e-05, |
| "loss": 0.2007, |
| "num_input_tokens_seen": 461408, |
| "step": 1620 |
| }, |
| { |
| "epoch": 3.031716417910448, |
| "grad_norm": 2.2478630542755127, |
| "learning_rate": 4.959724238355123e-05, |
| "loss": 0.5383, |
| "num_input_tokens_seen": 462624, |
| "step": 1625 |
| }, |
| { |
| "epoch": 3.041044776119403, |
| "grad_norm": 4.3553643226623535, |
| "learning_rate": 4.9589933107284286e-05, |
| "loss": 0.3753, |
| "num_input_tokens_seen": 464096, |
| "step": 1630 |
| }, |
| { |
| "epoch": 3.050373134328358, |
| "grad_norm": 3.8952770233154297, |
| "learning_rate": 4.9582558649807696e-05, |
| "loss": 0.4181, |
| "num_input_tokens_seen": 465408, |
| "step": 1635 |
| }, |
| { |
| "epoch": 3.0597014925373136, |
| "grad_norm": 4.306611061096191, |
| "learning_rate": 4.9575119030669157e-05, |
| "loss": 0.4972, |
| "num_input_tokens_seen": 466880, |
| "step": 1640 |
| }, |
| { |
| "epoch": 3.0690298507462686, |
| "grad_norm": 2.1740450859069824, |
| "learning_rate": 4.956761426958906e-05, |
| "loss": 0.2782, |
| "num_input_tokens_seen": 468416, |
| "step": 1645 |
| }, |
| { |
| "epoch": 3.078358208955224, |
| "grad_norm": 3.596029281616211, |
| "learning_rate": 4.956004438646048e-05, |
| "loss": 0.3836, |
| "num_input_tokens_seen": 469952, |
| "step": 1650 |
| }, |
| { |
| "epoch": 3.0876865671641793, |
| "grad_norm": 2.3618667125701904, |
| "learning_rate": 4.955240940134912e-05, |
| "loss": 0.4361, |
| "num_input_tokens_seen": 471264, |
| "step": 1655 |
| }, |
| { |
| "epoch": 3.0970149253731343, |
| "grad_norm": 2.5000288486480713, |
| "learning_rate": 4.954470933449324e-05, |
| "loss": 0.5823, |
| "num_input_tokens_seen": 472640, |
| "step": 1660 |
| }, |
| { |
| "epoch": 3.1063432835820897, |
| "grad_norm": 1.5906955003738403, |
| "learning_rate": 4.953694420630361e-05, |
| "loss": 0.4308, |
| "num_input_tokens_seen": 473984, |
| "step": 1665 |
| }, |
| { |
| "epoch": 3.1156716417910446, |
| "grad_norm": 3.7199573516845703, |
| "learning_rate": 4.9529114037363496e-05, |
| "loss": 0.4204, |
| "num_input_tokens_seen": 475488, |
| "step": 1670 |
| }, |
| { |
| "epoch": 3.125, |
| "grad_norm": 4.51806640625, |
| "learning_rate": 4.952121884842851e-05, |
| "loss": 0.4236, |
| "num_input_tokens_seen": 476864, |
| "step": 1675 |
| }, |
| { |
| "epoch": 3.1343283582089554, |
| "grad_norm": 3.3115038871765137, |
| "learning_rate": 4.951325866042667e-05, |
| "loss": 0.3054, |
| "num_input_tokens_seen": 478272, |
| "step": 1680 |
| }, |
| { |
| "epoch": 3.1436567164179103, |
| "grad_norm": 3.5390982627868652, |
| "learning_rate": 4.950523349445824e-05, |
| "loss": 0.4668, |
| "num_input_tokens_seen": 479648, |
| "step": 1685 |
| }, |
| { |
| "epoch": 3.1529850746268657, |
| "grad_norm": 2.7740437984466553, |
| "learning_rate": 4.949714337179577e-05, |
| "loss": 0.2554, |
| "num_input_tokens_seen": 481152, |
| "step": 1690 |
| }, |
| { |
| "epoch": 3.1623134328358207, |
| "grad_norm": 3.017284870147705, |
| "learning_rate": 4.9488988313883956e-05, |
| "loss": 0.4682, |
| "num_input_tokens_seen": 482624, |
| "step": 1695 |
| }, |
| { |
| "epoch": 3.171641791044776, |
| "grad_norm": 2.4900712966918945, |
| "learning_rate": 4.9480768342339636e-05, |
| "loss": 0.3134, |
| "num_input_tokens_seen": 484032, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.1809701492537314, |
| "grad_norm": 3.6495842933654785, |
| "learning_rate": 4.947248347895172e-05, |
| "loss": 0.4432, |
| "num_input_tokens_seen": 485440, |
| "step": 1705 |
| }, |
| { |
| "epoch": 3.1902985074626864, |
| "grad_norm": 2.3622589111328125, |
| "learning_rate": 4.946413374568112e-05, |
| "loss": 0.3341, |
| "num_input_tokens_seen": 486848, |
| "step": 1710 |
| }, |
| { |
| "epoch": 3.199626865671642, |
| "grad_norm": 3.6242547035217285, |
| "learning_rate": 4.94557191646607e-05, |
| "loss": 0.4302, |
| "num_input_tokens_seen": 488192, |
| "step": 1715 |
| }, |
| { |
| "epoch": 3.208955223880597, |
| "grad_norm": 4.912967205047607, |
| "learning_rate": 4.944723975819522e-05, |
| "loss": 0.4853, |
| "num_input_tokens_seen": 489696, |
| "step": 1720 |
| }, |
| { |
| "epoch": 3.218283582089552, |
| "grad_norm": 5.233360767364502, |
| "learning_rate": 4.943869554876127e-05, |
| "loss": 0.6421, |
| "num_input_tokens_seen": 491008, |
| "step": 1725 |
| }, |
| { |
| "epoch": 3.2276119402985075, |
| "grad_norm": 3.4871621131896973, |
| "learning_rate": 4.9430086559007235e-05, |
| "loss": 0.3641, |
| "num_input_tokens_seen": 492576, |
| "step": 1730 |
| }, |
| { |
| "epoch": 3.236940298507463, |
| "grad_norm": 3.3779678344726562, |
| "learning_rate": 4.94214128117532e-05, |
| "loss": 0.6013, |
| "num_input_tokens_seen": 493824, |
| "step": 1735 |
| }, |
| { |
| "epoch": 3.246268656716418, |
| "grad_norm": 3.3412258625030518, |
| "learning_rate": 4.941267432999088e-05, |
| "loss": 0.3786, |
| "num_input_tokens_seen": 495200, |
| "step": 1740 |
| }, |
| { |
| "epoch": 3.2555970149253732, |
| "grad_norm": 2.4862897396087646, |
| "learning_rate": 4.940387113688363e-05, |
| "loss": 0.5212, |
| "num_input_tokens_seen": 496576, |
| "step": 1745 |
| }, |
| { |
| "epoch": 3.264925373134328, |
| "grad_norm": 2.420630931854248, |
| "learning_rate": 4.9395003255766313e-05, |
| "loss": 0.4904, |
| "num_input_tokens_seen": 498112, |
| "step": 1750 |
| }, |
| { |
| "epoch": 3.2742537313432836, |
| "grad_norm": 2.542557716369629, |
| "learning_rate": 4.938607071014526e-05, |
| "loss": 0.3714, |
| "num_input_tokens_seen": 499552, |
| "step": 1755 |
| }, |
| { |
| "epoch": 3.283582089552239, |
| "grad_norm": 3.03322696685791, |
| "learning_rate": 4.937707352369822e-05, |
| "loss": 0.2615, |
| "num_input_tokens_seen": 500960, |
| "step": 1760 |
| }, |
| { |
| "epoch": 3.292910447761194, |
| "grad_norm": 4.63538122177124, |
| "learning_rate": 4.936801172027428e-05, |
| "loss": 0.2741, |
| "num_input_tokens_seen": 502208, |
| "step": 1765 |
| }, |
| { |
| "epoch": 3.3022388059701493, |
| "grad_norm": 3.04461669921875, |
| "learning_rate": 4.93588853238938e-05, |
| "loss": 0.3964, |
| "num_input_tokens_seen": 503808, |
| "step": 1770 |
| }, |
| { |
| "epoch": 3.3115671641791042, |
| "grad_norm": 2.3168342113494873, |
| "learning_rate": 4.934969435874838e-05, |
| "loss": 0.2913, |
| "num_input_tokens_seen": 505248, |
| "step": 1775 |
| }, |
| { |
| "epoch": 3.3208955223880596, |
| "grad_norm": 2.5833566188812256, |
| "learning_rate": 4.934043884920076e-05, |
| "loss": 0.2517, |
| "num_input_tokens_seen": 507072, |
| "step": 1780 |
| }, |
| { |
| "epoch": 3.330223880597015, |
| "grad_norm": 6.644166469573975, |
| "learning_rate": 4.9331118819784773e-05, |
| "loss": 0.4327, |
| "num_input_tokens_seen": 508576, |
| "step": 1785 |
| }, |
| { |
| "epoch": 3.33955223880597, |
| "grad_norm": 3.8258185386657715, |
| "learning_rate": 4.932173429520528e-05, |
| "loss": 0.5081, |
| "num_input_tokens_seen": 509824, |
| "step": 1790 |
| }, |
| { |
| "epoch": 3.3488805970149254, |
| "grad_norm": 2.836574077606201, |
| "learning_rate": 4.931228530033809e-05, |
| "loss": 0.3434, |
| "num_input_tokens_seen": 511392, |
| "step": 1795 |
| }, |
| { |
| "epoch": 3.3582089552238807, |
| "grad_norm": 6.194258213043213, |
| "learning_rate": 4.930277186022992e-05, |
| "loss": 0.439, |
| "num_input_tokens_seen": 512672, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.3675373134328357, |
| "grad_norm": 3.2283599376678467, |
| "learning_rate": 4.929319400009831e-05, |
| "loss": 0.3252, |
| "num_input_tokens_seen": 514144, |
| "step": 1805 |
| }, |
| { |
| "epoch": 3.376865671641791, |
| "grad_norm": 5.301681041717529, |
| "learning_rate": 4.9283551745331534e-05, |
| "loss": 0.4213, |
| "num_input_tokens_seen": 515584, |
| "step": 1810 |
| }, |
| { |
| "epoch": 3.3861940298507465, |
| "grad_norm": 3.6308445930480957, |
| "learning_rate": 4.927384512148861e-05, |
| "loss": 0.338, |
| "num_input_tokens_seen": 516928, |
| "step": 1815 |
| }, |
| { |
| "epoch": 3.3955223880597014, |
| "grad_norm": 3.7607781887054443, |
| "learning_rate": 4.926407415429913e-05, |
| "loss": 0.3232, |
| "num_input_tokens_seen": 518432, |
| "step": 1820 |
| }, |
| { |
| "epoch": 3.404850746268657, |
| "grad_norm": 3.6675467491149902, |
| "learning_rate": 4.925423886966328e-05, |
| "loss": 0.4096, |
| "num_input_tokens_seen": 519904, |
| "step": 1825 |
| }, |
| { |
| "epoch": 3.4141791044776117, |
| "grad_norm": 4.842457294464111, |
| "learning_rate": 4.924433929365171e-05, |
| "loss": 0.3551, |
| "num_input_tokens_seen": 521472, |
| "step": 1830 |
| }, |
| { |
| "epoch": 3.423507462686567, |
| "grad_norm": 3.20354962348938, |
| "learning_rate": 4.92343754525055e-05, |
| "loss": 0.4844, |
| "num_input_tokens_seen": 522848, |
| "step": 1835 |
| }, |
| { |
| "epoch": 3.4328358208955225, |
| "grad_norm": 5.549618244171143, |
| "learning_rate": 4.922434737263607e-05, |
| "loss": 0.4643, |
| "num_input_tokens_seen": 524160, |
| "step": 1840 |
| }, |
| { |
| "epoch": 3.4421641791044775, |
| "grad_norm": 4.986070156097412, |
| "learning_rate": 4.921425508062514e-05, |
| "loss": 0.5012, |
| "num_input_tokens_seen": 525504, |
| "step": 1845 |
| }, |
| { |
| "epoch": 3.451492537313433, |
| "grad_norm": 3.1051692962646484, |
| "learning_rate": 4.920409860322461e-05, |
| "loss": 0.3257, |
| "num_input_tokens_seen": 526816, |
| "step": 1850 |
| }, |
| { |
| "epoch": 3.4608208955223883, |
| "grad_norm": 4.339359283447266, |
| "learning_rate": 4.919387796735655e-05, |
| "loss": 0.621, |
| "num_input_tokens_seen": 528224, |
| "step": 1855 |
| }, |
| { |
| "epoch": 3.470149253731343, |
| "grad_norm": 2.9170286655426025, |
| "learning_rate": 4.918359320011306e-05, |
| "loss": 0.4601, |
| "num_input_tokens_seen": 529504, |
| "step": 1860 |
| }, |
| { |
| "epoch": 3.4794776119402986, |
| "grad_norm": 2.5929200649261475, |
| "learning_rate": 4.917324432875627e-05, |
| "loss": 0.5417, |
| "num_input_tokens_seen": 531072, |
| "step": 1865 |
| }, |
| { |
| "epoch": 3.4888059701492535, |
| "grad_norm": 2.438239336013794, |
| "learning_rate": 4.916283138071821e-05, |
| "loss": 0.308, |
| "num_input_tokens_seen": 532832, |
| "step": 1870 |
| }, |
| { |
| "epoch": 3.498134328358209, |
| "grad_norm": 2.8249967098236084, |
| "learning_rate": 4.9152354383600766e-05, |
| "loss": 0.3958, |
| "num_input_tokens_seen": 534272, |
| "step": 1875 |
| }, |
| { |
| "epoch": 3.5074626865671643, |
| "grad_norm": 3.8364765644073486, |
| "learning_rate": 4.91418133651756e-05, |
| "loss": 0.4657, |
| "num_input_tokens_seen": 535872, |
| "step": 1880 |
| }, |
| { |
| "epoch": 3.5167910447761193, |
| "grad_norm": 4.824006080627441, |
| "learning_rate": 4.913120835338409e-05, |
| "loss": 0.4225, |
| "num_input_tokens_seen": 537376, |
| "step": 1885 |
| }, |
| { |
| "epoch": 3.5261194029850746, |
| "grad_norm": 3.998689651489258, |
| "learning_rate": 4.912053937633722e-05, |
| "loss": 0.6766, |
| "num_input_tokens_seen": 538880, |
| "step": 1890 |
| }, |
| { |
| "epoch": 3.53544776119403, |
| "grad_norm": 4.104874610900879, |
| "learning_rate": 4.910980646231554e-05, |
| "loss": 0.3169, |
| "num_input_tokens_seen": 540512, |
| "step": 1895 |
| }, |
| { |
| "epoch": 3.544776119402985, |
| "grad_norm": 4.822265625, |
| "learning_rate": 4.9099009639769084e-05, |
| "loss": 0.4353, |
| "num_input_tokens_seen": 541920, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.5541044776119404, |
| "grad_norm": 6.059118270874023, |
| "learning_rate": 4.908814893731728e-05, |
| "loss": 0.4843, |
| "num_input_tokens_seen": 543328, |
| "step": 1905 |
| }, |
| { |
| "epoch": 3.5634328358208958, |
| "grad_norm": 5.0667195320129395, |
| "learning_rate": 4.90772243837489e-05, |
| "loss": 0.3783, |
| "num_input_tokens_seen": 544800, |
| "step": 1910 |
| }, |
| { |
| "epoch": 3.5727611940298507, |
| "grad_norm": 6.243183612823486, |
| "learning_rate": 4.906623600802195e-05, |
| "loss": 0.5315, |
| "num_input_tokens_seen": 546208, |
| "step": 1915 |
| }, |
| { |
| "epoch": 3.582089552238806, |
| "grad_norm": 3.933440685272217, |
| "learning_rate": 4.905518383926362e-05, |
| "loss": 0.3139, |
| "num_input_tokens_seen": 547616, |
| "step": 1920 |
| }, |
| { |
| "epoch": 3.591417910447761, |
| "grad_norm": 5.931534767150879, |
| "learning_rate": 4.90440679067702e-05, |
| "loss": 0.5035, |
| "num_input_tokens_seen": 549088, |
| "step": 1925 |
| }, |
| { |
| "epoch": 3.6007462686567164, |
| "grad_norm": 2.1538681983947754, |
| "learning_rate": 4.903288824000698e-05, |
| "loss": 0.351, |
| "num_input_tokens_seen": 550464, |
| "step": 1930 |
| }, |
| { |
| "epoch": 3.6100746268656714, |
| "grad_norm": 6.328001976013184, |
| "learning_rate": 4.902164486860823e-05, |
| "loss": 0.3204, |
| "num_input_tokens_seen": 552032, |
| "step": 1935 |
| }, |
| { |
| "epoch": 3.6194029850746268, |
| "grad_norm": 5.8199334144592285, |
| "learning_rate": 4.901033782237706e-05, |
| "loss": 0.5541, |
| "num_input_tokens_seen": 553344, |
| "step": 1940 |
| }, |
| { |
| "epoch": 3.628731343283582, |
| "grad_norm": 3.8570632934570312, |
| "learning_rate": 4.8998967131285356e-05, |
| "loss": 0.4219, |
| "num_input_tokens_seen": 554592, |
| "step": 1945 |
| }, |
| { |
| "epoch": 3.638059701492537, |
| "grad_norm": 3.0721070766448975, |
| "learning_rate": 4.898753282547373e-05, |
| "loss": 0.4132, |
| "num_input_tokens_seen": 556064, |
| "step": 1950 |
| }, |
| { |
| "epoch": 3.6473880597014925, |
| "grad_norm": 2.4061243534088135, |
| "learning_rate": 4.89760349352514e-05, |
| "loss": 0.2998, |
| "num_input_tokens_seen": 557536, |
| "step": 1955 |
| }, |
| { |
| "epoch": 3.656716417910448, |
| "grad_norm": 5.237555980682373, |
| "learning_rate": 4.896447349109616e-05, |
| "loss": 0.3841, |
| "num_input_tokens_seen": 558976, |
| "step": 1960 |
| }, |
| { |
| "epoch": 3.666044776119403, |
| "grad_norm": 3.542304754257202, |
| "learning_rate": 4.895284852365422e-05, |
| "loss": 0.3726, |
| "num_input_tokens_seen": 560192, |
| "step": 1965 |
| }, |
| { |
| "epoch": 3.675373134328358, |
| "grad_norm": 6.639720916748047, |
| "learning_rate": 4.894116006374022e-05, |
| "loss": 0.4313, |
| "num_input_tokens_seen": 561472, |
| "step": 1970 |
| }, |
| { |
| "epoch": 3.6847014925373136, |
| "grad_norm": 8.025578498840332, |
| "learning_rate": 4.8929408142337064e-05, |
| "loss": 0.5376, |
| "num_input_tokens_seen": 562912, |
| "step": 1975 |
| }, |
| { |
| "epoch": 3.6940298507462686, |
| "grad_norm": 4.457633972167969, |
| "learning_rate": 4.891759279059591e-05, |
| "loss": 0.3574, |
| "num_input_tokens_seen": 564288, |
| "step": 1980 |
| }, |
| { |
| "epoch": 3.703358208955224, |
| "grad_norm": 3.9600882530212402, |
| "learning_rate": 4.890571403983603e-05, |
| "loss": 0.4557, |
| "num_input_tokens_seen": 565664, |
| "step": 1985 |
| }, |
| { |
| "epoch": 3.7126865671641793, |
| "grad_norm": 4.22388219833374, |
| "learning_rate": 4.889377192154474e-05, |
| "loss": 0.5508, |
| "num_input_tokens_seen": 567040, |
| "step": 1990 |
| }, |
| { |
| "epoch": 3.7220149253731343, |
| "grad_norm": 3.2904796600341797, |
| "learning_rate": 4.888176646737735e-05, |
| "loss": 0.4001, |
| "num_input_tokens_seen": 568320, |
| "step": 1995 |
| }, |
| { |
| "epoch": 3.7313432835820897, |
| "grad_norm": 2.939121961593628, |
| "learning_rate": 4.8869697709157047e-05, |
| "loss": 0.556, |
| "num_input_tokens_seen": 569696, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.7406716417910446, |
| "grad_norm": 2.685490369796753, |
| "learning_rate": 4.8857565678874826e-05, |
| "loss": 0.3689, |
| "num_input_tokens_seen": 571296, |
| "step": 2005 |
| }, |
| { |
| "epoch": 3.75, |
| "grad_norm": 3.7600741386413574, |
| "learning_rate": 4.884537040868938e-05, |
| "loss": 0.4361, |
| "num_input_tokens_seen": 572768, |
| "step": 2010 |
| }, |
| { |
| "epoch": 3.7593283582089554, |
| "grad_norm": 3.3371317386627197, |
| "learning_rate": 4.8833111930927056e-05, |
| "loss": 0.2978, |
| "num_input_tokens_seen": 574080, |
| "step": 2015 |
| }, |
| { |
| "epoch": 3.7686567164179103, |
| "grad_norm": 8.625703811645508, |
| "learning_rate": 4.882079027808173e-05, |
| "loss": 0.3756, |
| "num_input_tokens_seen": 575456, |
| "step": 2020 |
| }, |
| { |
| "epoch": 3.7779850746268657, |
| "grad_norm": 4.930742263793945, |
| "learning_rate": 4.880840548281475e-05, |
| "loss": 0.3726, |
| "num_input_tokens_seen": 576864, |
| "step": 2025 |
| }, |
| { |
| "epoch": 3.7873134328358207, |
| "grad_norm": 4.922532081604004, |
| "learning_rate": 4.879595757795482e-05, |
| "loss": 0.347, |
| "num_input_tokens_seen": 578304, |
| "step": 2030 |
| }, |
| { |
| "epoch": 3.796641791044776, |
| "grad_norm": 2.6869003772735596, |
| "learning_rate": 4.878344659649796e-05, |
| "loss": 0.1693, |
| "num_input_tokens_seen": 579744, |
| "step": 2035 |
| }, |
| { |
| "epoch": 3.8059701492537314, |
| "grad_norm": 6.64580774307251, |
| "learning_rate": 4.8770872571607365e-05, |
| "loss": 0.4665, |
| "num_input_tokens_seen": 581088, |
| "step": 2040 |
| }, |
| { |
| "epoch": 3.8152985074626864, |
| "grad_norm": 6.879859447479248, |
| "learning_rate": 4.875823553661334e-05, |
| "loss": 0.4603, |
| "num_input_tokens_seen": 582560, |
| "step": 2045 |
| }, |
| { |
| "epoch": 3.824626865671642, |
| "grad_norm": 3.580334424972534, |
| "learning_rate": 4.874553552501323e-05, |
| "loss": 0.3539, |
| "num_input_tokens_seen": 584000, |
| "step": 2050 |
| }, |
| { |
| "epoch": 3.833955223880597, |
| "grad_norm": 5.23954439163208, |
| "learning_rate": 4.8732772570471295e-05, |
| "loss": 0.6248, |
| "num_input_tokens_seen": 585248, |
| "step": 2055 |
| }, |
| { |
| "epoch": 3.843283582089552, |
| "grad_norm": 4.419898986816406, |
| "learning_rate": 4.871994670681865e-05, |
| "loss": 0.5401, |
| "num_input_tokens_seen": 586720, |
| "step": 2060 |
| }, |
| { |
| "epoch": 3.8526119402985075, |
| "grad_norm": 4.50137186050415, |
| "learning_rate": 4.8707057968053175e-05, |
| "loss": 0.367, |
| "num_input_tokens_seen": 588352, |
| "step": 2065 |
| }, |
| { |
| "epoch": 3.861940298507463, |
| "grad_norm": 5.150646686553955, |
| "learning_rate": 4.8694106388339393e-05, |
| "loss": 0.4141, |
| "num_input_tokens_seen": 589632, |
| "step": 2070 |
| }, |
| { |
| "epoch": 3.871268656716418, |
| "grad_norm": 1.639139175415039, |
| "learning_rate": 4.868109200200841e-05, |
| "loss": 0.3656, |
| "num_input_tokens_seen": 591072, |
| "step": 2075 |
| }, |
| { |
| "epoch": 3.8805970149253732, |
| "grad_norm": 6.142917633056641, |
| "learning_rate": 4.866801484355782e-05, |
| "loss": 0.3362, |
| "num_input_tokens_seen": 592544, |
| "step": 2080 |
| }, |
| { |
| "epoch": 3.8899253731343286, |
| "grad_norm": 4.8141303062438965, |
| "learning_rate": 4.865487494765161e-05, |
| "loss": 0.3554, |
| "num_input_tokens_seen": 594112, |
| "step": 2085 |
| }, |
| { |
| "epoch": 3.8992537313432836, |
| "grad_norm": 4.472612380981445, |
| "learning_rate": 4.864167234912005e-05, |
| "loss": 0.3505, |
| "num_input_tokens_seen": 595744, |
| "step": 2090 |
| }, |
| { |
| "epoch": 3.908582089552239, |
| "grad_norm": 4.838069438934326, |
| "learning_rate": 4.8628407082959636e-05, |
| "loss": 0.4871, |
| "num_input_tokens_seen": 597184, |
| "step": 2095 |
| }, |
| { |
| "epoch": 3.917910447761194, |
| "grad_norm": 3.0444555282592773, |
| "learning_rate": 4.861507918433298e-05, |
| "loss": 0.4196, |
| "num_input_tokens_seen": 598816, |
| "step": 2100 |
| }, |
| { |
| "epoch": 3.9272388059701493, |
| "grad_norm": 4.453574180603027, |
| "learning_rate": 4.8601688688568695e-05, |
| "loss": 0.5709, |
| "num_input_tokens_seen": 600192, |
| "step": 2105 |
| }, |
| { |
| "epoch": 3.9365671641791042, |
| "grad_norm": 2.8588922023773193, |
| "learning_rate": 4.858823563116135e-05, |
| "loss": 0.3683, |
| "num_input_tokens_seen": 601472, |
| "step": 2110 |
| }, |
| { |
| "epoch": 3.9458955223880596, |
| "grad_norm": 3.220468759536743, |
| "learning_rate": 4.857472004777132e-05, |
| "loss": 0.399, |
| "num_input_tokens_seen": 602880, |
| "step": 2115 |
| }, |
| { |
| "epoch": 3.955223880597015, |
| "grad_norm": 6.396601676940918, |
| "learning_rate": 4.8561141974224726e-05, |
| "loss": 0.5529, |
| "num_input_tokens_seen": 604416, |
| "step": 2120 |
| }, |
| { |
| "epoch": 3.96455223880597, |
| "grad_norm": 3.129733085632324, |
| "learning_rate": 4.854750144651337e-05, |
| "loss": 0.5352, |
| "num_input_tokens_seen": 605696, |
| "step": 2125 |
| }, |
| { |
| "epoch": 3.9738805970149254, |
| "grad_norm": 2.5826094150543213, |
| "learning_rate": 4.8533798500794536e-05, |
| "loss": 0.3624, |
| "num_input_tokens_seen": 607040, |
| "step": 2130 |
| }, |
| { |
| "epoch": 3.9832089552238807, |
| "grad_norm": 2.7732017040252686, |
| "learning_rate": 4.852003317339102e-05, |
| "loss": 0.4075, |
| "num_input_tokens_seen": 608608, |
| "step": 2135 |
| }, |
| { |
| "epoch": 3.9925373134328357, |
| "grad_norm": 3.7826075553894043, |
| "learning_rate": 4.8506205500790944e-05, |
| "loss": 0.2788, |
| "num_input_tokens_seen": 609984, |
| "step": 2140 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.5962114334106445, |
| "eval_runtime": 2.9013, |
| "eval_samples_per_second": 82.031, |
| "eval_steps_per_second": 20.68, |
| "num_input_tokens_seen": 610944, |
| "step": 2144 |
| }, |
| { |
| "epoch": 4.001865671641791, |
| "grad_norm": 1.562759280204773, |
| "learning_rate": 4.849231551964771e-05, |
| "loss": 0.2205, |
| "num_input_tokens_seen": 611200, |
| "step": 2145 |
| }, |
| { |
| "epoch": 4.0111940298507465, |
| "grad_norm": 2.054766893386841, |
| "learning_rate": 4.847836326677986e-05, |
| "loss": 0.1987, |
| "num_input_tokens_seen": 612512, |
| "step": 2150 |
| }, |
| { |
| "epoch": 4.020522388059701, |
| "grad_norm": 1.9645274877548218, |
| "learning_rate": 4.846434877917102e-05, |
| "loss": 0.323, |
| "num_input_tokens_seen": 613920, |
| "step": 2155 |
| }, |
| { |
| "epoch": 4.029850746268656, |
| "grad_norm": 3.4472787380218506, |
| "learning_rate": 4.845027209396977e-05, |
| "loss": 0.2619, |
| "num_input_tokens_seen": 615264, |
| "step": 2160 |
| }, |
| { |
| "epoch": 4.039179104477612, |
| "grad_norm": 3.1487178802490234, |
| "learning_rate": 4.8436133248489576e-05, |
| "loss": 0.4537, |
| "num_input_tokens_seen": 616576, |
| "step": 2165 |
| }, |
| { |
| "epoch": 4.048507462686567, |
| "grad_norm": 3.712383985519409, |
| "learning_rate": 4.842193228020865e-05, |
| "loss": 0.2538, |
| "num_input_tokens_seen": 617984, |
| "step": 2170 |
| }, |
| { |
| "epoch": 4.057835820895522, |
| "grad_norm": 4.641477108001709, |
| "learning_rate": 4.840766922676989e-05, |
| "loss": 0.1984, |
| "num_input_tokens_seen": 619520, |
| "step": 2175 |
| }, |
| { |
| "epoch": 4.067164179104478, |
| "grad_norm": 3.4469504356384277, |
| "learning_rate": 4.839334412598077e-05, |
| "loss": 0.2977, |
| "num_input_tokens_seen": 620864, |
| "step": 2180 |
| }, |
| { |
| "epoch": 4.076492537313433, |
| "grad_norm": 5.328348636627197, |
| "learning_rate": 4.8378957015813225e-05, |
| "loss": 0.3704, |
| "num_input_tokens_seen": 622272, |
| "step": 2185 |
| }, |
| { |
| "epoch": 4.085820895522388, |
| "grad_norm": 5.046899795532227, |
| "learning_rate": 4.8364507934403556e-05, |
| "loss": 0.1858, |
| "num_input_tokens_seen": 623840, |
| "step": 2190 |
| }, |
| { |
| "epoch": 4.095149253731344, |
| "grad_norm": 5.377172470092773, |
| "learning_rate": 4.8349996920052356e-05, |
| "loss": 0.3418, |
| "num_input_tokens_seen": 625216, |
| "step": 2195 |
| }, |
| { |
| "epoch": 4.104477611940299, |
| "grad_norm": 3.5020639896392822, |
| "learning_rate": 4.833542401122434e-05, |
| "loss": 0.262, |
| "num_input_tokens_seen": 626592, |
| "step": 2200 |
| }, |
| { |
| "epoch": 4.1138059701492535, |
| "grad_norm": 3.710578680038452, |
| "learning_rate": 4.832078924654835e-05, |
| "loss": 0.2471, |
| "num_input_tokens_seen": 627968, |
| "step": 2205 |
| }, |
| { |
| "epoch": 4.123134328358209, |
| "grad_norm": 3.740708827972412, |
| "learning_rate": 4.830609266481713e-05, |
| "loss": 0.3679, |
| "num_input_tokens_seen": 629248, |
| "step": 2210 |
| }, |
| { |
| "epoch": 4.132462686567164, |
| "grad_norm": 2.8835160732269287, |
| "learning_rate": 4.829133430498732e-05, |
| "loss": 0.2888, |
| "num_input_tokens_seen": 631040, |
| "step": 2215 |
| }, |
| { |
| "epoch": 4.141791044776119, |
| "grad_norm": 5.505259990692139, |
| "learning_rate": 4.827651420617932e-05, |
| "loss": 0.1953, |
| "num_input_tokens_seen": 632320, |
| "step": 2220 |
| }, |
| { |
| "epoch": 4.151119402985074, |
| "grad_norm": 4.913973331451416, |
| "learning_rate": 4.8261632407677174e-05, |
| "loss": 0.3872, |
| "num_input_tokens_seen": 633792, |
| "step": 2225 |
| }, |
| { |
| "epoch": 4.16044776119403, |
| "grad_norm": 3.7683751583099365, |
| "learning_rate": 4.824668894892844e-05, |
| "loss": 0.2387, |
| "num_input_tokens_seen": 635328, |
| "step": 2230 |
| }, |
| { |
| "epoch": 4.169776119402985, |
| "grad_norm": 14.22788143157959, |
| "learning_rate": 4.82316838695442e-05, |
| "loss": 0.2696, |
| "num_input_tokens_seen": 636864, |
| "step": 2235 |
| }, |
| { |
| "epoch": 4.17910447761194, |
| "grad_norm": 3.2358357906341553, |
| "learning_rate": 4.821661720929879e-05, |
| "loss": 0.2464, |
| "num_input_tokens_seen": 638112, |
| "step": 2240 |
| }, |
| { |
| "epoch": 4.188432835820896, |
| "grad_norm": 3.298790216445923, |
| "learning_rate": 4.820148900812984e-05, |
| "loss": 0.2978, |
| "num_input_tokens_seen": 639456, |
| "step": 2245 |
| }, |
| { |
| "epoch": 4.197761194029851, |
| "grad_norm": 4.4499125480651855, |
| "learning_rate": 4.8186299306138084e-05, |
| "loss": 0.3351, |
| "num_input_tokens_seen": 640736, |
| "step": 2250 |
| }, |
| { |
| "epoch": 4.207089552238806, |
| "grad_norm": 4.803750514984131, |
| "learning_rate": 4.817104814358728e-05, |
| "loss": 0.3212, |
| "num_input_tokens_seen": 642208, |
| "step": 2255 |
| }, |
| { |
| "epoch": 4.2164179104477615, |
| "grad_norm": 3.6328041553497314, |
| "learning_rate": 4.8155735560904106e-05, |
| "loss": 0.1945, |
| "num_input_tokens_seen": 643616, |
| "step": 2260 |
| }, |
| { |
| "epoch": 4.225746268656716, |
| "grad_norm": 4.701669216156006, |
| "learning_rate": 4.814036159867803e-05, |
| "loss": 0.309, |
| "num_input_tokens_seen": 645024, |
| "step": 2265 |
| }, |
| { |
| "epoch": 4.235074626865671, |
| "grad_norm": 4.907166481018066, |
| "learning_rate": 4.812492629766126e-05, |
| "loss": 0.348, |
| "num_input_tokens_seen": 646464, |
| "step": 2270 |
| }, |
| { |
| "epoch": 4.244402985074627, |
| "grad_norm": 2.9512081146240234, |
| "learning_rate": 4.810942969876855e-05, |
| "loss": 0.3241, |
| "num_input_tokens_seen": 647968, |
| "step": 2275 |
| }, |
| { |
| "epoch": 4.253731343283582, |
| "grad_norm": 4.267170429229736, |
| "learning_rate": 4.8093871843077166e-05, |
| "loss": 0.2996, |
| "num_input_tokens_seen": 649312, |
| "step": 2280 |
| }, |
| { |
| "epoch": 4.263059701492537, |
| "grad_norm": 5.341393947601318, |
| "learning_rate": 4.807825277182675e-05, |
| "loss": 0.5046, |
| "num_input_tokens_seen": 650656, |
| "step": 2285 |
| }, |
| { |
| "epoch": 4.272388059701493, |
| "grad_norm": 3.1686532497406006, |
| "learning_rate": 4.806257252641919e-05, |
| "loss": 0.3679, |
| "num_input_tokens_seen": 652032, |
| "step": 2290 |
| }, |
| { |
| "epoch": 4.281716417910448, |
| "grad_norm": 2.6950771808624268, |
| "learning_rate": 4.804683114841855e-05, |
| "loss": 0.2329, |
| "num_input_tokens_seen": 653504, |
| "step": 2295 |
| }, |
| { |
| "epoch": 4.291044776119403, |
| "grad_norm": 4.497025489807129, |
| "learning_rate": 4.803102867955093e-05, |
| "loss": 0.2163, |
| "num_input_tokens_seen": 654912, |
| "step": 2300 |
| }, |
| { |
| "epoch": 4.300373134328359, |
| "grad_norm": 3.6295406818389893, |
| "learning_rate": 4.8015165161704375e-05, |
| "loss": 0.3932, |
| "num_input_tokens_seen": 656256, |
| "step": 2305 |
| }, |
| { |
| "epoch": 4.309701492537314, |
| "grad_norm": 5.509720325469971, |
| "learning_rate": 4.799924063692873e-05, |
| "loss": 0.2342, |
| "num_input_tokens_seen": 657792, |
| "step": 2310 |
| }, |
| { |
| "epoch": 4.3190298507462686, |
| "grad_norm": 2.10560941696167, |
| "learning_rate": 4.798325514743558e-05, |
| "loss": 0.2148, |
| "num_input_tokens_seen": 659264, |
| "step": 2315 |
| }, |
| { |
| "epoch": 4.3283582089552235, |
| "grad_norm": 5.997559547424316, |
| "learning_rate": 4.7967208735598105e-05, |
| "loss": 0.4313, |
| "num_input_tokens_seen": 660640, |
| "step": 2320 |
| }, |
| { |
| "epoch": 4.337686567164179, |
| "grad_norm": 3.5801284313201904, |
| "learning_rate": 4.795110144395096e-05, |
| "loss": 0.3437, |
| "num_input_tokens_seen": 662016, |
| "step": 2325 |
| }, |
| { |
| "epoch": 4.347014925373134, |
| "grad_norm": 4.251504421234131, |
| "learning_rate": 4.793493331519018e-05, |
| "loss": 0.2484, |
| "num_input_tokens_seen": 663392, |
| "step": 2330 |
| }, |
| { |
| "epoch": 4.356343283582089, |
| "grad_norm": 7.537037372589111, |
| "learning_rate": 4.791870439217308e-05, |
| "loss": 0.3917, |
| "num_input_tokens_seen": 664928, |
| "step": 2335 |
| }, |
| { |
| "epoch": 4.365671641791045, |
| "grad_norm": 5.8342084884643555, |
| "learning_rate": 4.7902414717918105e-05, |
| "loss": 0.455, |
| "num_input_tokens_seen": 666176, |
| "step": 2340 |
| }, |
| { |
| "epoch": 4.375, |
| "grad_norm": 2.365814208984375, |
| "learning_rate": 4.788606433560474e-05, |
| "loss": 0.3125, |
| "num_input_tokens_seen": 667552, |
| "step": 2345 |
| }, |
| { |
| "epoch": 4.384328358208955, |
| "grad_norm": 3.6924357414245605, |
| "learning_rate": 4.786965328857339e-05, |
| "loss": 0.3538, |
| "num_input_tokens_seen": 669056, |
| "step": 2350 |
| }, |
| { |
| "epoch": 4.393656716417911, |
| "grad_norm": 2.19561767578125, |
| "learning_rate": 4.785318162032527e-05, |
| "loss": 0.2779, |
| "num_input_tokens_seen": 670592, |
| "step": 2355 |
| }, |
| { |
| "epoch": 4.402985074626866, |
| "grad_norm": 2.06274676322937, |
| "learning_rate": 4.783664937452228e-05, |
| "loss": 0.1065, |
| "num_input_tokens_seen": 672096, |
| "step": 2360 |
| }, |
| { |
| "epoch": 4.412313432835821, |
| "grad_norm": 6.070491313934326, |
| "learning_rate": 4.78200565949869e-05, |
| "loss": 0.5488, |
| "num_input_tokens_seen": 673568, |
| "step": 2365 |
| }, |
| { |
| "epoch": 4.4216417910447765, |
| "grad_norm": 3.15374493598938, |
| "learning_rate": 4.780340332570207e-05, |
| "loss": 0.281, |
| "num_input_tokens_seen": 674976, |
| "step": 2370 |
| }, |
| { |
| "epoch": 4.4309701492537314, |
| "grad_norm": 4.8091325759887695, |
| "learning_rate": 4.7786689610811055e-05, |
| "loss": 0.4135, |
| "num_input_tokens_seen": 676288, |
| "step": 2375 |
| }, |
| { |
| "epoch": 4.440298507462686, |
| "grad_norm": 3.43695330619812, |
| "learning_rate": 4.776991549461737e-05, |
| "loss": 0.2368, |
| "num_input_tokens_seen": 677888, |
| "step": 2380 |
| }, |
| { |
| "epoch": 4.449626865671641, |
| "grad_norm": 4.201904773712158, |
| "learning_rate": 4.775308102158461e-05, |
| "loss": 0.2904, |
| "num_input_tokens_seen": 679328, |
| "step": 2385 |
| }, |
| { |
| "epoch": 4.458955223880597, |
| "grad_norm": 2.595428466796875, |
| "learning_rate": 4.773618623633639e-05, |
| "loss": 0.3137, |
| "num_input_tokens_seen": 680736, |
| "step": 2390 |
| }, |
| { |
| "epoch": 4.468283582089552, |
| "grad_norm": 2.0509350299835205, |
| "learning_rate": 4.771923118365617e-05, |
| "loss": 0.1901, |
| "num_input_tokens_seen": 682304, |
| "step": 2395 |
| }, |
| { |
| "epoch": 4.477611940298507, |
| "grad_norm": 5.0024895668029785, |
| "learning_rate": 4.770221590848718e-05, |
| "loss": 0.406, |
| "num_input_tokens_seen": 683936, |
| "step": 2400 |
| }, |
| { |
| "epoch": 4.486940298507463, |
| "grad_norm": 3.0875470638275146, |
| "learning_rate": 4.7685140455932267e-05, |
| "loss": 0.2891, |
| "num_input_tokens_seen": 685312, |
| "step": 2405 |
| }, |
| { |
| "epoch": 4.496268656716418, |
| "grad_norm": 4.056346416473389, |
| "learning_rate": 4.76680048712538e-05, |
| "loss": 0.2005, |
| "num_input_tokens_seen": 686752, |
| "step": 2410 |
| }, |
| { |
| "epoch": 4.505597014925373, |
| "grad_norm": 3.0078048706054688, |
| "learning_rate": 4.765080919987356e-05, |
| "loss": 0.3014, |
| "num_input_tokens_seen": 688096, |
| "step": 2415 |
| }, |
| { |
| "epoch": 4.514925373134329, |
| "grad_norm": 3.3334035873413086, |
| "learning_rate": 4.763355348737257e-05, |
| "loss": 0.3126, |
| "num_input_tokens_seen": 689664, |
| "step": 2420 |
| }, |
| { |
| "epoch": 4.524253731343284, |
| "grad_norm": 6.881554126739502, |
| "learning_rate": 4.761623777949102e-05, |
| "loss": 0.5296, |
| "num_input_tokens_seen": 690912, |
| "step": 2425 |
| }, |
| { |
| "epoch": 4.5335820895522385, |
| "grad_norm": 5.216423511505127, |
| "learning_rate": 4.759886212212814e-05, |
| "loss": 0.1929, |
| "num_input_tokens_seen": 692320, |
| "step": 2430 |
| }, |
| { |
| "epoch": 4.542910447761194, |
| "grad_norm": 3.3800344467163086, |
| "learning_rate": 4.758142656134205e-05, |
| "loss": 0.2823, |
| "num_input_tokens_seen": 693568, |
| "step": 2435 |
| }, |
| { |
| "epoch": 4.552238805970149, |
| "grad_norm": 5.284757137298584, |
| "learning_rate": 4.756393114334968e-05, |
| "loss": 0.2364, |
| "num_input_tokens_seen": 695168, |
| "step": 2440 |
| }, |
| { |
| "epoch": 4.561567164179104, |
| "grad_norm": 17.076181411743164, |
| "learning_rate": 4.754637591452661e-05, |
| "loss": 0.3559, |
| "num_input_tokens_seen": 696384, |
| "step": 2445 |
| }, |
| { |
| "epoch": 4.57089552238806, |
| "grad_norm": 4.131042957305908, |
| "learning_rate": 4.7528760921406955e-05, |
| "loss": 0.2729, |
| "num_input_tokens_seen": 697792, |
| "step": 2450 |
| }, |
| { |
| "epoch": 4.580223880597015, |
| "grad_norm": 5.547695636749268, |
| "learning_rate": 4.751108621068328e-05, |
| "loss": 0.2547, |
| "num_input_tokens_seen": 699232, |
| "step": 2455 |
| }, |
| { |
| "epoch": 4.58955223880597, |
| "grad_norm": 2.796549081802368, |
| "learning_rate": 4.7493351829206415e-05, |
| "loss": 0.2788, |
| "num_input_tokens_seen": 700704, |
| "step": 2460 |
| }, |
| { |
| "epoch": 4.598880597014926, |
| "grad_norm": 6.005579471588135, |
| "learning_rate": 4.747555782398537e-05, |
| "loss": 0.3636, |
| "num_input_tokens_seen": 702016, |
| "step": 2465 |
| }, |
| { |
| "epoch": 4.608208955223881, |
| "grad_norm": 2.630554676055908, |
| "learning_rate": 4.74577042421872e-05, |
| "loss": 0.3592, |
| "num_input_tokens_seen": 703456, |
| "step": 2470 |
| }, |
| { |
| "epoch": 4.617537313432836, |
| "grad_norm": 2.7301981449127197, |
| "learning_rate": 4.743979113113689e-05, |
| "loss": 0.3199, |
| "num_input_tokens_seen": 704896, |
| "step": 2475 |
| }, |
| { |
| "epoch": 4.6268656716417915, |
| "grad_norm": 2.7551496028900146, |
| "learning_rate": 4.742181853831721e-05, |
| "loss": 0.3089, |
| "num_input_tokens_seen": 706272, |
| "step": 2480 |
| }, |
| { |
| "epoch": 4.6361940298507465, |
| "grad_norm": 6.6818528175354, |
| "learning_rate": 4.740378651136861e-05, |
| "loss": 0.2037, |
| "num_input_tokens_seen": 707744, |
| "step": 2485 |
| }, |
| { |
| "epoch": 4.645522388059701, |
| "grad_norm": 3.114124059677124, |
| "learning_rate": 4.738569509808907e-05, |
| "loss": 0.4182, |
| "num_input_tokens_seen": 709120, |
| "step": 2490 |
| }, |
| { |
| "epoch": 4.654850746268656, |
| "grad_norm": 3.9045345783233643, |
| "learning_rate": 4.7367544346434e-05, |
| "loss": 0.338, |
| "num_input_tokens_seen": 710592, |
| "step": 2495 |
| }, |
| { |
| "epoch": 4.664179104477612, |
| "grad_norm": 5.1609907150268555, |
| "learning_rate": 4.7349334304516094e-05, |
| "loss": 0.4376, |
| "num_input_tokens_seen": 712000, |
| "step": 2500 |
| }, |
| { |
| "epoch": 4.673507462686567, |
| "grad_norm": 5.8798980712890625, |
| "learning_rate": 4.7331065020605204e-05, |
| "loss": 0.2303, |
| "num_input_tokens_seen": 713408, |
| "step": 2505 |
| }, |
| { |
| "epoch": 4.682835820895522, |
| "grad_norm": 1.563310980796814, |
| "learning_rate": 4.7312736543128236e-05, |
| "loss": 0.294, |
| "num_input_tokens_seen": 714688, |
| "step": 2510 |
| }, |
| { |
| "epoch": 4.692164179104478, |
| "grad_norm": 6.273819446563721, |
| "learning_rate": 4.7294348920668974e-05, |
| "loss": 0.3264, |
| "num_input_tokens_seen": 716128, |
| "step": 2515 |
| }, |
| { |
| "epoch": 4.701492537313433, |
| "grad_norm": 7.700003147125244, |
| "learning_rate": 4.727590220196799e-05, |
| "loss": 0.3835, |
| "num_input_tokens_seen": 717632, |
| "step": 2520 |
| }, |
| { |
| "epoch": 4.710820895522388, |
| "grad_norm": 6.855437278747559, |
| "learning_rate": 4.7257396435922526e-05, |
| "loss": 0.4013, |
| "num_input_tokens_seen": 719264, |
| "step": 2525 |
| }, |
| { |
| "epoch": 4.720149253731344, |
| "grad_norm": 4.01498556137085, |
| "learning_rate": 4.7238831671586305e-05, |
| "loss": 0.311, |
| "num_input_tokens_seen": 720832, |
| "step": 2530 |
| }, |
| { |
| "epoch": 4.729477611940299, |
| "grad_norm": 8.66020679473877, |
| "learning_rate": 4.722020795816947e-05, |
| "loss": 0.3767, |
| "num_input_tokens_seen": 722240, |
| "step": 2535 |
| }, |
| { |
| "epoch": 4.7388059701492535, |
| "grad_norm": 4.132741451263428, |
| "learning_rate": 4.720152534503839e-05, |
| "loss": 0.4671, |
| "num_input_tokens_seen": 723552, |
| "step": 2540 |
| }, |
| { |
| "epoch": 4.7481343283582085, |
| "grad_norm": 4.426851749420166, |
| "learning_rate": 4.71827838817156e-05, |
| "loss": 0.3964, |
| "num_input_tokens_seen": 724864, |
| "step": 2545 |
| }, |
| { |
| "epoch": 4.757462686567164, |
| "grad_norm": 2.8156912326812744, |
| "learning_rate": 4.716398361787959e-05, |
| "loss": 0.2392, |
| "num_input_tokens_seen": 726176, |
| "step": 2550 |
| }, |
| { |
| "epoch": 4.766791044776119, |
| "grad_norm": 3.6759626865386963, |
| "learning_rate": 4.714512460336475e-05, |
| "loss": 0.3435, |
| "num_input_tokens_seen": 727680, |
| "step": 2555 |
| }, |
| { |
| "epoch": 4.776119402985074, |
| "grad_norm": 5.040331840515137, |
| "learning_rate": 4.7126206888161194e-05, |
| "loss": 0.4074, |
| "num_input_tokens_seen": 729024, |
| "step": 2560 |
| }, |
| { |
| "epoch": 4.78544776119403, |
| "grad_norm": 6.2503204345703125, |
| "learning_rate": 4.7107230522414615e-05, |
| "loss": 0.2751, |
| "num_input_tokens_seen": 730304, |
| "step": 2565 |
| }, |
| { |
| "epoch": 4.794776119402985, |
| "grad_norm": 5.034648418426514, |
| "learning_rate": 4.7088195556426176e-05, |
| "loss": 0.2499, |
| "num_input_tokens_seen": 731776, |
| "step": 2570 |
| }, |
| { |
| "epoch": 4.80410447761194, |
| "grad_norm": 1.7610746622085571, |
| "learning_rate": 4.70691020406524e-05, |
| "loss": 0.2449, |
| "num_input_tokens_seen": 733152, |
| "step": 2575 |
| }, |
| { |
| "epoch": 4.813432835820896, |
| "grad_norm": 3.7510221004486084, |
| "learning_rate": 4.704995002570499e-05, |
| "loss": 0.4952, |
| "num_input_tokens_seen": 734432, |
| "step": 2580 |
| }, |
| { |
| "epoch": 4.822761194029851, |
| "grad_norm": 3.396411180496216, |
| "learning_rate": 4.7030739562350713e-05, |
| "loss": 0.3823, |
| "num_input_tokens_seen": 735808, |
| "step": 2585 |
| }, |
| { |
| "epoch": 4.832089552238806, |
| "grad_norm": 4.5465006828308105, |
| "learning_rate": 4.701147070151127e-05, |
| "loss": 0.4112, |
| "num_input_tokens_seen": 737088, |
| "step": 2590 |
| }, |
| { |
| "epoch": 4.8414179104477615, |
| "grad_norm": 3.1737515926361084, |
| "learning_rate": 4.6992143494263164e-05, |
| "loss": 0.3256, |
| "num_input_tokens_seen": 738528, |
| "step": 2595 |
| }, |
| { |
| "epoch": 4.850746268656716, |
| "grad_norm": 3.1744399070739746, |
| "learning_rate": 4.697275799183755e-05, |
| "loss": 0.3559, |
| "num_input_tokens_seen": 740128, |
| "step": 2600 |
| }, |
| { |
| "epoch": 4.860074626865671, |
| "grad_norm": 6.2648468017578125, |
| "learning_rate": 4.695331424562011e-05, |
| "loss": 0.3428, |
| "num_input_tokens_seen": 741760, |
| "step": 2605 |
| }, |
| { |
| "epoch": 4.869402985074627, |
| "grad_norm": 6.683523178100586, |
| "learning_rate": 4.693381230715091e-05, |
| "loss": 0.3457, |
| "num_input_tokens_seen": 743136, |
| "step": 2610 |
| }, |
| { |
| "epoch": 4.878731343283582, |
| "grad_norm": 4.396423816680908, |
| "learning_rate": 4.6914252228124277e-05, |
| "loss": 0.2386, |
| "num_input_tokens_seen": 744384, |
| "step": 2615 |
| }, |
| { |
| "epoch": 4.888059701492537, |
| "grad_norm": 3.4351277351379395, |
| "learning_rate": 4.689463406038866e-05, |
| "loss": 0.3794, |
| "num_input_tokens_seen": 745888, |
| "step": 2620 |
| }, |
| { |
| "epoch": 4.897388059701493, |
| "grad_norm": 4.846048355102539, |
| "learning_rate": 4.6874957855946455e-05, |
| "loss": 0.3445, |
| "num_input_tokens_seen": 747360, |
| "step": 2625 |
| }, |
| { |
| "epoch": 4.906716417910448, |
| "grad_norm": 5.725438117980957, |
| "learning_rate": 4.685522366695393e-05, |
| "loss": 0.4863, |
| "num_input_tokens_seen": 748576, |
| "step": 2630 |
| }, |
| { |
| "epoch": 4.916044776119403, |
| "grad_norm": 3.7922306060791016, |
| "learning_rate": 4.6835431545721034e-05, |
| "loss": 0.2485, |
| "num_input_tokens_seen": 749792, |
| "step": 2635 |
| }, |
| { |
| "epoch": 4.925373134328359, |
| "grad_norm": 3.4977328777313232, |
| "learning_rate": 4.6815581544711284e-05, |
| "loss": 0.2238, |
| "num_input_tokens_seen": 751200, |
| "step": 2640 |
| }, |
| { |
| "epoch": 4.934701492537314, |
| "grad_norm": 4.492105007171631, |
| "learning_rate": 4.679567371654162e-05, |
| "loss": 0.2627, |
| "num_input_tokens_seen": 752608, |
| "step": 2645 |
| }, |
| { |
| "epoch": 4.9440298507462686, |
| "grad_norm": 4.780972003936768, |
| "learning_rate": 4.677570811398228e-05, |
| "loss": 0.3113, |
| "num_input_tokens_seen": 754144, |
| "step": 2650 |
| }, |
| { |
| "epoch": 4.9533582089552235, |
| "grad_norm": 4.835292816162109, |
| "learning_rate": 4.675568478995662e-05, |
| "loss": 0.3541, |
| "num_input_tokens_seen": 755584, |
| "step": 2655 |
| }, |
| { |
| "epoch": 4.962686567164179, |
| "grad_norm": 4.228946208953857, |
| "learning_rate": 4.673560379754103e-05, |
| "loss": 0.3011, |
| "num_input_tokens_seen": 757152, |
| "step": 2660 |
| }, |
| { |
| "epoch": 4.972014925373134, |
| "grad_norm": 6.853209972381592, |
| "learning_rate": 4.6715465189964724e-05, |
| "loss": 0.2382, |
| "num_input_tokens_seen": 758528, |
| "step": 2665 |
| }, |
| { |
| "epoch": 4.981343283582089, |
| "grad_norm": 4.100311279296875, |
| "learning_rate": 4.6695269020609676e-05, |
| "loss": 0.3625, |
| "num_input_tokens_seen": 759904, |
| "step": 2670 |
| }, |
| { |
| "epoch": 4.990671641791045, |
| "grad_norm": 3.425264835357666, |
| "learning_rate": 4.667501534301043e-05, |
| "loss": 0.2996, |
| "num_input_tokens_seen": 761344, |
| "step": 2675 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 9.684795379638672, |
| "learning_rate": 4.665470421085395e-05, |
| "loss": 0.4801, |
| "num_input_tokens_seen": 762440, |
| "step": 2680 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.6492366194725037, |
| "eval_runtime": 2.8744, |
| "eval_samples_per_second": 82.799, |
| "eval_steps_per_second": 20.874, |
| "num_input_tokens_seen": 762440, |
| "step": 2680 |
| }, |
| { |
| "epoch": 5.009328358208955, |
| "grad_norm": 2.5784518718719482, |
| "learning_rate": 4.663433567797952e-05, |
| "loss": 0.203, |
| "num_input_tokens_seen": 763656, |
| "step": 2685 |
| }, |
| { |
| "epoch": 5.018656716417911, |
| "grad_norm": 2.988851308822632, |
| "learning_rate": 4.661390979837858e-05, |
| "loss": 0.3063, |
| "num_input_tokens_seen": 764968, |
| "step": 2690 |
| }, |
| { |
| "epoch": 5.027985074626866, |
| "grad_norm": 2.519805431365967, |
| "learning_rate": 4.659342662619454e-05, |
| "loss": 0.1515, |
| "num_input_tokens_seen": 766440, |
| "step": 2695 |
| }, |
| { |
| "epoch": 5.037313432835821, |
| "grad_norm": 4.73142671585083, |
| "learning_rate": 4.657288621572273e-05, |
| "loss": 0.1811, |
| "num_input_tokens_seen": 767784, |
| "step": 2700 |
| }, |
| { |
| "epoch": 5.0466417910447765, |
| "grad_norm": 5.220130920410156, |
| "learning_rate": 4.655228862141017e-05, |
| "loss": 0.2581, |
| "num_input_tokens_seen": 769128, |
| "step": 2705 |
| }, |
| { |
| "epoch": 5.0559701492537314, |
| "grad_norm": 5.714827537536621, |
| "learning_rate": 4.653163389785546e-05, |
| "loss": 0.264, |
| "num_input_tokens_seen": 770536, |
| "step": 2710 |
| }, |
| { |
| "epoch": 5.065298507462686, |
| "grad_norm": 7.098995685577393, |
| "learning_rate": 4.651092209980866e-05, |
| "loss": 0.1395, |
| "num_input_tokens_seen": 772040, |
| "step": 2715 |
| }, |
| { |
| "epoch": 5.074626865671641, |
| "grad_norm": 5.708429336547852, |
| "learning_rate": 4.649015328217108e-05, |
| "loss": 0.2017, |
| "num_input_tokens_seen": 773480, |
| "step": 2720 |
| }, |
| { |
| "epoch": 5.083955223880597, |
| "grad_norm": 3.187607526779175, |
| "learning_rate": 4.6469327499995214e-05, |
| "loss": 0.1783, |
| "num_input_tokens_seen": 774888, |
| "step": 2725 |
| }, |
| { |
| "epoch": 5.093283582089552, |
| "grad_norm": 8.77532958984375, |
| "learning_rate": 4.644844480848452e-05, |
| "loss": 0.2536, |
| "num_input_tokens_seen": 776296, |
| "step": 2730 |
| }, |
| { |
| "epoch": 5.102611940298507, |
| "grad_norm": 3.24391508102417, |
| "learning_rate": 4.642750526299333e-05, |
| "loss": 0.1455, |
| "num_input_tokens_seen": 777928, |
| "step": 2735 |
| }, |
| { |
| "epoch": 5.111940298507463, |
| "grad_norm": 2.8618836402893066, |
| "learning_rate": 4.640650891902666e-05, |
| "loss": 0.1552, |
| "num_input_tokens_seen": 779304, |
| "step": 2740 |
| }, |
| { |
| "epoch": 5.121268656716418, |
| "grad_norm": 5.80402135848999, |
| "learning_rate": 4.638545583224011e-05, |
| "loss": 0.2426, |
| "num_input_tokens_seen": 780456, |
| "step": 2745 |
| }, |
| { |
| "epoch": 5.130597014925373, |
| "grad_norm": 4.056662082672119, |
| "learning_rate": 4.6364346058439654e-05, |
| "loss": 0.2236, |
| "num_input_tokens_seen": 781800, |
| "step": 2750 |
| }, |
| { |
| "epoch": 5.139925373134329, |
| "grad_norm": 2.6546411514282227, |
| "learning_rate": 4.634317965358157e-05, |
| "loss": 0.2282, |
| "num_input_tokens_seen": 783176, |
| "step": 2755 |
| }, |
| { |
| "epoch": 5.149253731343284, |
| "grad_norm": 3.019340753555298, |
| "learning_rate": 4.63219566737722e-05, |
| "loss": 0.2001, |
| "num_input_tokens_seen": 784488, |
| "step": 2760 |
| }, |
| { |
| "epoch": 5.1585820895522385, |
| "grad_norm": 2.4385337829589844, |
| "learning_rate": 4.6300677175267914e-05, |
| "loss": 0.1353, |
| "num_input_tokens_seen": 785928, |
| "step": 2765 |
| }, |
| { |
| "epoch": 5.167910447761194, |
| "grad_norm": 2.303985595703125, |
| "learning_rate": 4.6279341214474836e-05, |
| "loss": 0.1592, |
| "num_input_tokens_seen": 787368, |
| "step": 2770 |
| }, |
| { |
| "epoch": 5.177238805970149, |
| "grad_norm": 3.8047120571136475, |
| "learning_rate": 4.625794884794879e-05, |
| "loss": 0.1639, |
| "num_input_tokens_seen": 788744, |
| "step": 2775 |
| }, |
| { |
| "epoch": 5.186567164179104, |
| "grad_norm": 3.399069309234619, |
| "learning_rate": 4.62365001323951e-05, |
| "loss": 0.2306, |
| "num_input_tokens_seen": 790216, |
| "step": 2780 |
| }, |
| { |
| "epoch": 5.19589552238806, |
| "grad_norm": 2.5097756385803223, |
| "learning_rate": 4.621499512466847e-05, |
| "loss": 0.1911, |
| "num_input_tokens_seen": 791656, |
| "step": 2785 |
| }, |
| { |
| "epoch": 5.205223880597015, |
| "grad_norm": 4.051816940307617, |
| "learning_rate": 4.6193433881772825e-05, |
| "loss": 0.2641, |
| "num_input_tokens_seen": 793288, |
| "step": 2790 |
| }, |
| { |
| "epoch": 5.21455223880597, |
| "grad_norm": 3.782460927963257, |
| "learning_rate": 4.617181646086112e-05, |
| "loss": 0.1565, |
| "num_input_tokens_seen": 794984, |
| "step": 2795 |
| }, |
| { |
| "epoch": 5.223880597014926, |
| "grad_norm": 3.4350125789642334, |
| "learning_rate": 4.615014291923527e-05, |
| "loss": 0.2084, |
| "num_input_tokens_seen": 796328, |
| "step": 2800 |
| }, |
| { |
| "epoch": 5.233208955223881, |
| "grad_norm": 5.9458160400390625, |
| "learning_rate": 4.61284133143459e-05, |
| "loss": 0.3348, |
| "num_input_tokens_seen": 797704, |
| "step": 2805 |
| }, |
| { |
| "epoch": 5.242537313432836, |
| "grad_norm": 3.3840394020080566, |
| "learning_rate": 4.6106627703792294e-05, |
| "loss": 0.206, |
| "num_input_tokens_seen": 798984, |
| "step": 2810 |
| }, |
| { |
| "epoch": 5.251865671641791, |
| "grad_norm": 3.8996832370758057, |
| "learning_rate": 4.608478614532215e-05, |
| "loss": 0.2047, |
| "num_input_tokens_seen": 800392, |
| "step": 2815 |
| }, |
| { |
| "epoch": 5.2611940298507465, |
| "grad_norm": 3.301448345184326, |
| "learning_rate": 4.6062888696831484e-05, |
| "loss": 0.1893, |
| "num_input_tokens_seen": 801832, |
| "step": 2820 |
| }, |
| { |
| "epoch": 5.270522388059701, |
| "grad_norm": 2.653963327407837, |
| "learning_rate": 4.604093541636447e-05, |
| "loss": 0.3095, |
| "num_input_tokens_seen": 803272, |
| "step": 2825 |
| }, |
| { |
| "epoch": 5.279850746268656, |
| "grad_norm": 6.305687427520752, |
| "learning_rate": 4.601892636211328e-05, |
| "loss": 0.2604, |
| "num_input_tokens_seen": 804872, |
| "step": 2830 |
| }, |
| { |
| "epoch": 5.289179104477612, |
| "grad_norm": 4.2062225341796875, |
| "learning_rate": 4.5996861592417906e-05, |
| "loss": 0.243, |
| "num_input_tokens_seen": 806312, |
| "step": 2835 |
| }, |
| { |
| "epoch": 5.298507462686567, |
| "grad_norm": 1.1816065311431885, |
| "learning_rate": 4.597474116576603e-05, |
| "loss": 0.1419, |
| "num_input_tokens_seen": 807944, |
| "step": 2840 |
| }, |
| { |
| "epoch": 5.307835820895522, |
| "grad_norm": 6.607985496520996, |
| "learning_rate": 4.595256514079289e-05, |
| "loss": 0.2469, |
| "num_input_tokens_seen": 809320, |
| "step": 2845 |
| }, |
| { |
| "epoch": 5.317164179104478, |
| "grad_norm": 3.8233156204223633, |
| "learning_rate": 4.593033357628107e-05, |
| "loss": 0.1489, |
| "num_input_tokens_seen": 810952, |
| "step": 2850 |
| }, |
| { |
| "epoch": 5.326492537313433, |
| "grad_norm": 5.152980804443359, |
| "learning_rate": 4.5908046531160396e-05, |
| "loss": 0.1587, |
| "num_input_tokens_seen": 812264, |
| "step": 2855 |
| }, |
| { |
| "epoch": 5.335820895522388, |
| "grad_norm": 3.260324478149414, |
| "learning_rate": 4.588570406450774e-05, |
| "loss": 0.2434, |
| "num_input_tokens_seen": 813704, |
| "step": 2860 |
| }, |
| { |
| "epoch": 5.345149253731344, |
| "grad_norm": 2.429797887802124, |
| "learning_rate": 4.586330623554691e-05, |
| "loss": 0.2071, |
| "num_input_tokens_seen": 815272, |
| "step": 2865 |
| }, |
| { |
| "epoch": 5.354477611940299, |
| "grad_norm": 3.600126028060913, |
| "learning_rate": 4.5840853103648415e-05, |
| "loss": 0.2119, |
| "num_input_tokens_seen": 816904, |
| "step": 2870 |
| }, |
| { |
| "epoch": 5.3638059701492535, |
| "grad_norm": 4.991045951843262, |
| "learning_rate": 4.581834472832942e-05, |
| "loss": 0.221, |
| "num_input_tokens_seen": 818440, |
| "step": 2875 |
| }, |
| { |
| "epoch": 5.373134328358209, |
| "grad_norm": 1.5262837409973145, |
| "learning_rate": 4.579578116925347e-05, |
| "loss": 0.2948, |
| "num_input_tokens_seen": 819752, |
| "step": 2880 |
| }, |
| { |
| "epoch": 5.382462686567164, |
| "grad_norm": 8.558094024658203, |
| "learning_rate": 4.577316248623041e-05, |
| "loss": 0.4095, |
| "num_input_tokens_seen": 820936, |
| "step": 2885 |
| }, |
| { |
| "epoch": 5.391791044776119, |
| "grad_norm": 4.643557071685791, |
| "learning_rate": 4.575048873921621e-05, |
| "loss": 0.2326, |
| "num_input_tokens_seen": 822408, |
| "step": 2890 |
| }, |
| { |
| "epoch": 5.401119402985074, |
| "grad_norm": 5.719346523284912, |
| "learning_rate": 4.57277599883128e-05, |
| "loss": 0.3012, |
| "num_input_tokens_seen": 823784, |
| "step": 2895 |
| }, |
| { |
| "epoch": 5.41044776119403, |
| "grad_norm": 6.364695072174072, |
| "learning_rate": 4.570497629376789e-05, |
| "loss": 0.1917, |
| "num_input_tokens_seen": 825064, |
| "step": 2900 |
| }, |
| { |
| "epoch": 5.419776119402985, |
| "grad_norm": 6.654156684875488, |
| "learning_rate": 4.5682137715974835e-05, |
| "loss": 0.188, |
| "num_input_tokens_seen": 826600, |
| "step": 2905 |
| }, |
| { |
| "epoch": 5.42910447761194, |
| "grad_norm": 2.860431671142578, |
| "learning_rate": 4.565924431547251e-05, |
| "loss": 0.1602, |
| "num_input_tokens_seen": 828104, |
| "step": 2910 |
| }, |
| { |
| "epoch": 5.438432835820896, |
| "grad_norm": 4.316542625427246, |
| "learning_rate": 4.563629615294505e-05, |
| "loss": 0.2745, |
| "num_input_tokens_seen": 829640, |
| "step": 2915 |
| }, |
| { |
| "epoch": 5.447761194029851, |
| "grad_norm": 6.560685157775879, |
| "learning_rate": 4.561329328922179e-05, |
| "loss": 0.1843, |
| "num_input_tokens_seen": 831112, |
| "step": 2920 |
| }, |
| { |
| "epoch": 5.457089552238806, |
| "grad_norm": 2.6812708377838135, |
| "learning_rate": 4.559023578527706e-05, |
| "loss": 0.261, |
| "num_input_tokens_seen": 832744, |
| "step": 2925 |
| }, |
| { |
| "epoch": 5.4664179104477615, |
| "grad_norm": 3.056436061859131, |
| "learning_rate": 4.5567123702230004e-05, |
| "loss": 0.1651, |
| "num_input_tokens_seen": 834248, |
| "step": 2930 |
| }, |
| { |
| "epoch": 5.475746268656716, |
| "grad_norm": 6.146241188049316, |
| "learning_rate": 4.554395710134446e-05, |
| "loss": 0.3091, |
| "num_input_tokens_seen": 835752, |
| "step": 2935 |
| }, |
| { |
| "epoch": 5.485074626865671, |
| "grad_norm": 3.1170620918273926, |
| "learning_rate": 4.5520736044028764e-05, |
| "loss": 0.2881, |
| "num_input_tokens_seen": 837064, |
| "step": 2940 |
| }, |
| { |
| "epoch": 5.494402985074627, |
| "grad_norm": 5.3431077003479, |
| "learning_rate": 4.5497460591835615e-05, |
| "loss": 0.1938, |
| "num_input_tokens_seen": 838440, |
| "step": 2945 |
| }, |
| { |
| "epoch": 5.503731343283582, |
| "grad_norm": 3.4499943256378174, |
| "learning_rate": 4.54741308064619e-05, |
| "loss": 0.1767, |
| "num_input_tokens_seen": 840232, |
| "step": 2950 |
| }, |
| { |
| "epoch": 5.513059701492537, |
| "grad_norm": 4.75957727432251, |
| "learning_rate": 4.545074674974851e-05, |
| "loss": 0.2097, |
| "num_input_tokens_seen": 841576, |
| "step": 2955 |
| }, |
| { |
| "epoch": 5.522388059701493, |
| "grad_norm": 9.220419883728027, |
| "learning_rate": 4.54273084836802e-05, |
| "loss": 0.2368, |
| "num_input_tokens_seen": 842984, |
| "step": 2960 |
| }, |
| { |
| "epoch": 5.531716417910448, |
| "grad_norm": 3.5264084339141846, |
| "learning_rate": 4.540381607038544e-05, |
| "loss": 0.217, |
| "num_input_tokens_seen": 844232, |
| "step": 2965 |
| }, |
| { |
| "epoch": 5.541044776119403, |
| "grad_norm": 5.536301612854004, |
| "learning_rate": 4.538026957213621e-05, |
| "loss": 0.3384, |
| "num_input_tokens_seen": 845576, |
| "step": 2970 |
| }, |
| { |
| "epoch": 5.550373134328359, |
| "grad_norm": 2.957416534423828, |
| "learning_rate": 4.535666905134784e-05, |
| "loss": 0.2334, |
| "num_input_tokens_seen": 846984, |
| "step": 2975 |
| }, |
| { |
| "epoch": 5.559701492537314, |
| "grad_norm": 10.137619972229004, |
| "learning_rate": 4.533301457057891e-05, |
| "loss": 0.2847, |
| "num_input_tokens_seen": 848328, |
| "step": 2980 |
| }, |
| { |
| "epoch": 5.5690298507462686, |
| "grad_norm": 6.274179935455322, |
| "learning_rate": 4.530930619253097e-05, |
| "loss": 0.1838, |
| "num_input_tokens_seen": 849800, |
| "step": 2985 |
| }, |
| { |
| "epoch": 5.5783582089552235, |
| "grad_norm": 5.382236480712891, |
| "learning_rate": 4.5285543980048484e-05, |
| "loss": 0.2166, |
| "num_input_tokens_seen": 851176, |
| "step": 2990 |
| }, |
| { |
| "epoch": 5.587686567164179, |
| "grad_norm": 0.20699748396873474, |
| "learning_rate": 4.5261727996118594e-05, |
| "loss": 0.1783, |
| "num_input_tokens_seen": 852776, |
| "step": 2995 |
| }, |
| { |
| "epoch": 5.597014925373134, |
| "grad_norm": 2.9458279609680176, |
| "learning_rate": 4.5237858303870984e-05, |
| "loss": 0.232, |
| "num_input_tokens_seen": 854184, |
| "step": 3000 |
| }, |
| { |
| "epoch": 5.606343283582089, |
| "grad_norm": 3.901914358139038, |
| "learning_rate": 4.52139349665777e-05, |
| "loss": 0.2589, |
| "num_input_tokens_seen": 855496, |
| "step": 3005 |
| }, |
| { |
| "epoch": 5.615671641791045, |
| "grad_norm": 4.247676849365234, |
| "learning_rate": 4.518995804765298e-05, |
| "loss": 0.2254, |
| "num_input_tokens_seen": 856840, |
| "step": 3010 |
| }, |
| { |
| "epoch": 5.625, |
| "grad_norm": 7.148390769958496, |
| "learning_rate": 4.516592761065311e-05, |
| "loss": 0.2237, |
| "num_input_tokens_seen": 858184, |
| "step": 3015 |
| }, |
| { |
| "epoch": 5.634328358208955, |
| "grad_norm": 5.459695339202881, |
| "learning_rate": 4.5141843719276235e-05, |
| "loss": 0.2594, |
| "num_input_tokens_seen": 859624, |
| "step": 3020 |
| }, |
| { |
| "epoch": 5.643656716417911, |
| "grad_norm": 2.143315076828003, |
| "learning_rate": 4.5117706437362176e-05, |
| "loss": 0.2665, |
| "num_input_tokens_seen": 860968, |
| "step": 3025 |
| }, |
| { |
| "epoch": 5.652985074626866, |
| "grad_norm": 6.014386177062988, |
| "learning_rate": 4.5093515828892285e-05, |
| "loss": 0.2849, |
| "num_input_tokens_seen": 862472, |
| "step": 3030 |
| }, |
| { |
| "epoch": 5.662313432835821, |
| "grad_norm": 5.910489082336426, |
| "learning_rate": 4.5069271957989276e-05, |
| "loss": 0.1755, |
| "num_input_tokens_seen": 863880, |
| "step": 3035 |
| }, |
| { |
| "epoch": 5.6716417910447765, |
| "grad_norm": 4.93767786026001, |
| "learning_rate": 4.5044974888917035e-05, |
| "loss": 0.1953, |
| "num_input_tokens_seen": 865288, |
| "step": 3040 |
| }, |
| { |
| "epoch": 5.6809701492537314, |
| "grad_norm": 1.0762439966201782, |
| "learning_rate": 4.5020624686080485e-05, |
| "loss": 0.0787, |
| "num_input_tokens_seen": 866824, |
| "step": 3045 |
| }, |
| { |
| "epoch": 5.690298507462686, |
| "grad_norm": 3.0029850006103516, |
| "learning_rate": 4.499622141402536e-05, |
| "loss": 0.1532, |
| "num_input_tokens_seen": 868104, |
| "step": 3050 |
| }, |
| { |
| "epoch": 5.699626865671641, |
| "grad_norm": 7.032677173614502, |
| "learning_rate": 4.497176513743808e-05, |
| "loss": 0.1664, |
| "num_input_tokens_seen": 869512, |
| "step": 3055 |
| }, |
| { |
| "epoch": 5.708955223880597, |
| "grad_norm": 9.000031471252441, |
| "learning_rate": 4.494725592114559e-05, |
| "loss": 0.2156, |
| "num_input_tokens_seen": 871048, |
| "step": 3060 |
| }, |
| { |
| "epoch": 5.718283582089552, |
| "grad_norm": 4.98378849029541, |
| "learning_rate": 4.492269383011512e-05, |
| "loss": 0.2735, |
| "num_input_tokens_seen": 872424, |
| "step": 3065 |
| }, |
| { |
| "epoch": 5.727611940298507, |
| "grad_norm": 3.4251692295074463, |
| "learning_rate": 4.489807892945409e-05, |
| "loss": 0.193, |
| "num_input_tokens_seen": 873992, |
| "step": 3070 |
| }, |
| { |
| "epoch": 5.736940298507463, |
| "grad_norm": 5.345053195953369, |
| "learning_rate": 4.487341128440987e-05, |
| "loss": 0.1249, |
| "num_input_tokens_seen": 875368, |
| "step": 3075 |
| }, |
| { |
| "epoch": 5.746268656716418, |
| "grad_norm": 5.85477876663208, |
| "learning_rate": 4.484869096036969e-05, |
| "loss": 0.1864, |
| "num_input_tokens_seen": 876808, |
| "step": 3080 |
| }, |
| { |
| "epoch": 5.755597014925373, |
| "grad_norm": 7.493655204772949, |
| "learning_rate": 4.482391802286038e-05, |
| "loss": 0.3327, |
| "num_input_tokens_seen": 878152, |
| "step": 3085 |
| }, |
| { |
| "epoch": 5.764925373134329, |
| "grad_norm": 5.178029537200928, |
| "learning_rate": 4.479909253754825e-05, |
| "loss": 0.1986, |
| "num_input_tokens_seen": 879496, |
| "step": 3090 |
| }, |
| { |
| "epoch": 5.774253731343284, |
| "grad_norm": 7.651886940002441, |
| "learning_rate": 4.4774214570238884e-05, |
| "loss": 0.3702, |
| "num_input_tokens_seen": 880904, |
| "step": 3095 |
| }, |
| { |
| "epoch": 5.7835820895522385, |
| "grad_norm": 5.524927139282227, |
| "learning_rate": 4.474928418687699e-05, |
| "loss": 0.3232, |
| "num_input_tokens_seen": 882152, |
| "step": 3100 |
| }, |
| { |
| "epoch": 5.792910447761194, |
| "grad_norm": 5.230311393737793, |
| "learning_rate": 4.472430145354622e-05, |
| "loss": 0.2397, |
| "num_input_tokens_seen": 883400, |
| "step": 3105 |
| }, |
| { |
| "epoch": 5.802238805970149, |
| "grad_norm": 2.379998207092285, |
| "learning_rate": 4.469926643646901e-05, |
| "loss": 0.1465, |
| "num_input_tokens_seen": 884968, |
| "step": 3110 |
| }, |
| { |
| "epoch": 5.811567164179104, |
| "grad_norm": 3.1780765056610107, |
| "learning_rate": 4.467417920200635e-05, |
| "loss": 0.1917, |
| "num_input_tokens_seen": 886312, |
| "step": 3115 |
| }, |
| { |
| "epoch": 5.82089552238806, |
| "grad_norm": 3.0208687782287598, |
| "learning_rate": 4.4649039816657654e-05, |
| "loss": 0.1574, |
| "num_input_tokens_seen": 887752, |
| "step": 3120 |
| }, |
| { |
| "epoch": 5.830223880597015, |
| "grad_norm": 7.244109630584717, |
| "learning_rate": 4.462384834706058e-05, |
| "loss": 0.2744, |
| "num_input_tokens_seen": 889096, |
| "step": 3125 |
| }, |
| { |
| "epoch": 5.83955223880597, |
| "grad_norm": 7.628950119018555, |
| "learning_rate": 4.459860485999086e-05, |
| "loss": 0.2572, |
| "num_input_tokens_seen": 890408, |
| "step": 3130 |
| }, |
| { |
| "epoch": 5.848880597014926, |
| "grad_norm": 4.650967121124268, |
| "learning_rate": 4.457330942236209e-05, |
| "loss": 0.1781, |
| "num_input_tokens_seen": 892072, |
| "step": 3135 |
| }, |
| { |
| "epoch": 5.858208955223881, |
| "grad_norm": 2.999238967895508, |
| "learning_rate": 4.4547962101225584e-05, |
| "loss": 0.2597, |
| "num_input_tokens_seen": 893608, |
| "step": 3140 |
| }, |
| { |
| "epoch": 5.867537313432836, |
| "grad_norm": 3.6925904750823975, |
| "learning_rate": 4.452256296377017e-05, |
| "loss": 0.2861, |
| "num_input_tokens_seen": 894920, |
| "step": 3145 |
| }, |
| { |
| "epoch": 5.8768656716417915, |
| "grad_norm": 4.159998416900635, |
| "learning_rate": 4.4497112077322044e-05, |
| "loss": 0.2415, |
| "num_input_tokens_seen": 896328, |
| "step": 3150 |
| }, |
| { |
| "epoch": 5.8861940298507465, |
| "grad_norm": 0.6956812143325806, |
| "learning_rate": 4.4471609509344575e-05, |
| "loss": 0.2141, |
| "num_input_tokens_seen": 897896, |
| "step": 3155 |
| }, |
| { |
| "epoch": 5.895522388059701, |
| "grad_norm": 2.0524721145629883, |
| "learning_rate": 4.444605532743811e-05, |
| "loss": 0.2226, |
| "num_input_tokens_seen": 899272, |
| "step": 3160 |
| }, |
| { |
| "epoch": 5.904850746268656, |
| "grad_norm": 2.6856722831726074, |
| "learning_rate": 4.442044959933982e-05, |
| "loss": 0.1491, |
| "num_input_tokens_seen": 900584, |
| "step": 3165 |
| }, |
| { |
| "epoch": 5.914179104477612, |
| "grad_norm": 1.81888747215271, |
| "learning_rate": 4.439479239292351e-05, |
| "loss": 0.1252, |
| "num_input_tokens_seen": 901992, |
| "step": 3170 |
| }, |
| { |
| "epoch": 5.923507462686567, |
| "grad_norm": 7.098339557647705, |
| "learning_rate": 4.4369083776199446e-05, |
| "loss": 0.3237, |
| "num_input_tokens_seen": 903304, |
| "step": 3175 |
| }, |
| { |
| "epoch": 5.932835820895522, |
| "grad_norm": 9.416855812072754, |
| "learning_rate": 4.434332381731416e-05, |
| "loss": 0.2266, |
| "num_input_tokens_seen": 904712, |
| "step": 3180 |
| }, |
| { |
| "epoch": 5.942164179104478, |
| "grad_norm": 3.284900426864624, |
| "learning_rate": 4.431751258455029e-05, |
| "loss": 0.1936, |
| "num_input_tokens_seen": 906120, |
| "step": 3185 |
| }, |
| { |
| "epoch": 5.951492537313433, |
| "grad_norm": 5.820497989654541, |
| "learning_rate": 4.4291650146326356e-05, |
| "loss": 0.2717, |
| "num_input_tokens_seen": 907464, |
| "step": 3190 |
| }, |
| { |
| "epoch": 5.960820895522388, |
| "grad_norm": 6.642111301422119, |
| "learning_rate": 4.4265736571196645e-05, |
| "loss": 0.2223, |
| "num_input_tokens_seen": 909000, |
| "step": 3195 |
| }, |
| { |
| "epoch": 5.970149253731344, |
| "grad_norm": 4.0866594314575195, |
| "learning_rate": 4.423977192785099e-05, |
| "loss": 0.24, |
| "num_input_tokens_seen": 910280, |
| "step": 3200 |
| }, |
| { |
| "epoch": 5.979477611940299, |
| "grad_norm": 5.743937969207764, |
| "learning_rate": 4.421375628511456e-05, |
| "loss": 0.2924, |
| "num_input_tokens_seen": 911816, |
| "step": 3205 |
| }, |
| { |
| "epoch": 5.9888059701492535, |
| "grad_norm": 5.13763952255249, |
| "learning_rate": 4.4187689711947754e-05, |
| "loss": 0.3452, |
| "num_input_tokens_seen": 913320, |
| "step": 3210 |
| }, |
| { |
| "epoch": 5.9981343283582085, |
| "grad_norm": 5.567590713500977, |
| "learning_rate": 4.416157227744594e-05, |
| "loss": 0.2656, |
| "num_input_tokens_seen": 914856, |
| "step": 3215 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.7368558049201965, |
| "eval_runtime": 2.9099, |
| "eval_samples_per_second": 81.791, |
| "eval_steps_per_second": 20.619, |
| "num_input_tokens_seen": 914920, |
| "step": 3216 |
| }, |
| { |
| "epoch": 6.007462686567164, |
| "grad_norm": 1.9122474193572998, |
| "learning_rate": 4.4135404050839326e-05, |
| "loss": 0.1506, |
| "num_input_tokens_seen": 916136, |
| "step": 3220 |
| }, |
| { |
| "epoch": 6.016791044776119, |
| "grad_norm": 0.874113142490387, |
| "learning_rate": 4.4109185101492735e-05, |
| "loss": 0.1008, |
| "num_input_tokens_seen": 917544, |
| "step": 3225 |
| }, |
| { |
| "epoch": 6.026119402985074, |
| "grad_norm": 2.3850107192993164, |
| "learning_rate": 4.408291549890546e-05, |
| "loss": 0.1239, |
| "num_input_tokens_seen": 919080, |
| "step": 3230 |
| }, |
| { |
| "epoch": 6.03544776119403, |
| "grad_norm": 7.043910503387451, |
| "learning_rate": 4.4056595312711066e-05, |
| "loss": 0.1211, |
| "num_input_tokens_seen": 920840, |
| "step": 3235 |
| }, |
| { |
| "epoch": 6.044776119402985, |
| "grad_norm": 0.7303712964057922, |
| "learning_rate": 4.403022461267718e-05, |
| "loss": 0.0899, |
| "num_input_tokens_seen": 922248, |
| "step": 3240 |
| }, |
| { |
| "epoch": 6.05410447761194, |
| "grad_norm": 1.4832550287246704, |
| "learning_rate": 4.400380346870534e-05, |
| "loss": 0.1651, |
| "num_input_tokens_seen": 923912, |
| "step": 3245 |
| }, |
| { |
| "epoch": 6.063432835820896, |
| "grad_norm": 4.453665256500244, |
| "learning_rate": 4.3977331950830805e-05, |
| "loss": 0.1476, |
| "num_input_tokens_seen": 925352, |
| "step": 3250 |
| }, |
| { |
| "epoch": 6.072761194029851, |
| "grad_norm": 3.9456849098205566, |
| "learning_rate": 4.395081012922235e-05, |
| "loss": 0.1176, |
| "num_input_tokens_seen": 926856, |
| "step": 3255 |
| }, |
| { |
| "epoch": 6.082089552238806, |
| "grad_norm": 2.822026491165161, |
| "learning_rate": 4.392423807418209e-05, |
| "loss": 0.2562, |
| "num_input_tokens_seen": 928168, |
| "step": 3260 |
| }, |
| { |
| "epoch": 6.0914179104477615, |
| "grad_norm": 7.243401050567627, |
| "learning_rate": 4.38976158561453e-05, |
| "loss": 0.1968, |
| "num_input_tokens_seen": 929544, |
| "step": 3265 |
| }, |
| { |
| "epoch": 6.100746268656716, |
| "grad_norm": 2.664954423904419, |
| "learning_rate": 4.3870943545680225e-05, |
| "loss": 0.1393, |
| "num_input_tokens_seen": 930792, |
| "step": 3270 |
| }, |
| { |
| "epoch": 6.110074626865671, |
| "grad_norm": 10.708439826965332, |
| "learning_rate": 4.384422121348789e-05, |
| "loss": 0.1656, |
| "num_input_tokens_seen": 932072, |
| "step": 3275 |
| }, |
| { |
| "epoch": 6.119402985074627, |
| "grad_norm": 4.780562400817871, |
| "learning_rate": 4.381744893040192e-05, |
| "loss": 0.1996, |
| "num_input_tokens_seen": 933384, |
| "step": 3280 |
| }, |
| { |
| "epoch": 6.128731343283582, |
| "grad_norm": 4.849841594696045, |
| "learning_rate": 4.379062676738832e-05, |
| "loss": 0.1228, |
| "num_input_tokens_seen": 934888, |
| "step": 3285 |
| }, |
| { |
| "epoch": 6.138059701492537, |
| "grad_norm": 1.2637876272201538, |
| "learning_rate": 4.3763754795545355e-05, |
| "loss": 0.0812, |
| "num_input_tokens_seen": 936328, |
| "step": 3290 |
| }, |
| { |
| "epoch": 6.147388059701493, |
| "grad_norm": 4.03655481338501, |
| "learning_rate": 4.373683308610328e-05, |
| "loss": 0.1417, |
| "num_input_tokens_seen": 937864, |
| "step": 3295 |
| }, |
| { |
| "epoch": 6.156716417910448, |
| "grad_norm": 5.6414031982421875, |
| "learning_rate": 4.370986171042422e-05, |
| "loss": 0.125, |
| "num_input_tokens_seen": 939464, |
| "step": 3300 |
| }, |
| { |
| "epoch": 6.166044776119403, |
| "grad_norm": 1.8103418350219727, |
| "learning_rate": 4.368284074000193e-05, |
| "loss": 0.0828, |
| "num_input_tokens_seen": 940968, |
| "step": 3305 |
| }, |
| { |
| "epoch": 6.175373134328359, |
| "grad_norm": 1.9451091289520264, |
| "learning_rate": 4.3655770246461645e-05, |
| "loss": 0.2064, |
| "num_input_tokens_seen": 942152, |
| "step": 3310 |
| }, |
| { |
| "epoch": 6.184701492537314, |
| "grad_norm": 2.288334846496582, |
| "learning_rate": 4.362865030155986e-05, |
| "loss": 0.1366, |
| "num_input_tokens_seen": 943720, |
| "step": 3315 |
| }, |
| { |
| "epoch": 6.1940298507462686, |
| "grad_norm": 0.7296946048736572, |
| "learning_rate": 4.360148097718416e-05, |
| "loss": 0.2007, |
| "num_input_tokens_seen": 944904, |
| "step": 3320 |
| }, |
| { |
| "epoch": 6.2033582089552235, |
| "grad_norm": 5.528839588165283, |
| "learning_rate": 4.3574262345353015e-05, |
| "loss": 0.1247, |
| "num_input_tokens_seen": 946344, |
| "step": 3325 |
| }, |
| { |
| "epoch": 6.212686567164179, |
| "grad_norm": 1.019931435585022, |
| "learning_rate": 4.354699447821558e-05, |
| "loss": 0.1255, |
| "num_input_tokens_seen": 947816, |
| "step": 3330 |
| }, |
| { |
| "epoch": 6.222014925373134, |
| "grad_norm": 4.42401647567749, |
| "learning_rate": 4.3519677448051576e-05, |
| "loss": 0.1238, |
| "num_input_tokens_seen": 949320, |
| "step": 3335 |
| }, |
| { |
| "epoch": 6.231343283582089, |
| "grad_norm": 2.5572609901428223, |
| "learning_rate": 4.3492311327270987e-05, |
| "loss": 0.0847, |
| "num_input_tokens_seen": 950664, |
| "step": 3340 |
| }, |
| { |
| "epoch": 6.240671641791045, |
| "grad_norm": 9.500044822692871, |
| "learning_rate": 4.346489618841393e-05, |
| "loss": 0.2054, |
| "num_input_tokens_seen": 952328, |
| "step": 3345 |
| }, |
| { |
| "epoch": 6.25, |
| "grad_norm": 8.229207038879395, |
| "learning_rate": 4.3437432104150466e-05, |
| "loss": 0.1975, |
| "num_input_tokens_seen": 953736, |
| "step": 3350 |
| }, |
| { |
| "epoch": 6.259328358208955, |
| "grad_norm": 1.4741579294204712, |
| "learning_rate": 4.34099191472804e-05, |
| "loss": 0.0514, |
| "num_input_tokens_seen": 955272, |
| "step": 3355 |
| }, |
| { |
| "epoch": 6.268656716417911, |
| "grad_norm": 6.567641735076904, |
| "learning_rate": 4.338235739073309e-05, |
| "loss": 0.0993, |
| "num_input_tokens_seen": 956808, |
| "step": 3360 |
| }, |
| { |
| "epoch": 6.277985074626866, |
| "grad_norm": 8.205889701843262, |
| "learning_rate": 4.335474690756722e-05, |
| "loss": 0.2836, |
| "num_input_tokens_seen": 958152, |
| "step": 3365 |
| }, |
| { |
| "epoch": 6.287313432835821, |
| "grad_norm": 10.514901161193848, |
| "learning_rate": 4.3327087770970674e-05, |
| "loss": 0.1402, |
| "num_input_tokens_seen": 959560, |
| "step": 3370 |
| }, |
| { |
| "epoch": 6.2966417910447765, |
| "grad_norm": 4.420656681060791, |
| "learning_rate": 4.329938005426027e-05, |
| "loss": 0.1495, |
| "num_input_tokens_seen": 960776, |
| "step": 3375 |
| }, |
| { |
| "epoch": 6.3059701492537314, |
| "grad_norm": 2.2710464000701904, |
| "learning_rate": 4.3271623830881625e-05, |
| "loss": 0.1127, |
| "num_input_tokens_seen": 962280, |
| "step": 3380 |
| }, |
| { |
| "epoch": 6.315298507462686, |
| "grad_norm": 5.242120742797852, |
| "learning_rate": 4.324381917440891e-05, |
| "loss": 0.067, |
| "num_input_tokens_seen": 963752, |
| "step": 3385 |
| }, |
| { |
| "epoch": 6.324626865671641, |
| "grad_norm": 2.9965147972106934, |
| "learning_rate": 4.321596615854469e-05, |
| "loss": 0.0801, |
| "num_input_tokens_seen": 965096, |
| "step": 3390 |
| }, |
| { |
| "epoch": 6.333955223880597, |
| "grad_norm": 3.3844876289367676, |
| "learning_rate": 4.318806485711972e-05, |
| "loss": 0.1405, |
| "num_input_tokens_seen": 966600, |
| "step": 3395 |
| }, |
| { |
| "epoch": 6.343283582089552, |
| "grad_norm": 5.05831241607666, |
| "learning_rate": 4.316011534409275e-05, |
| "loss": 0.1742, |
| "num_input_tokens_seen": 967976, |
| "step": 3400 |
| }, |
| { |
| "epoch": 6.352611940298507, |
| "grad_norm": 5.531286239624023, |
| "learning_rate": 4.313211769355031e-05, |
| "loss": 0.2262, |
| "num_input_tokens_seen": 969256, |
| "step": 3405 |
| }, |
| { |
| "epoch": 6.361940298507463, |
| "grad_norm": 6.615349292755127, |
| "learning_rate": 4.310407197970655e-05, |
| "loss": 0.1478, |
| "num_input_tokens_seen": 970696, |
| "step": 3410 |
| }, |
| { |
| "epoch": 6.371268656716418, |
| "grad_norm": 1.9675002098083496, |
| "learning_rate": 4.3075978276903e-05, |
| "loss": 0.1559, |
| "num_input_tokens_seen": 972072, |
| "step": 3415 |
| }, |
| { |
| "epoch": 6.380597014925373, |
| "grad_norm": 4.417579174041748, |
| "learning_rate": 4.304783665960842e-05, |
| "loss": 0.1082, |
| "num_input_tokens_seen": 973736, |
| "step": 3420 |
| }, |
| { |
| "epoch": 6.389925373134329, |
| "grad_norm": 10.713567733764648, |
| "learning_rate": 4.3019647202418566e-05, |
| "loss": 0.1759, |
| "num_input_tokens_seen": 975144, |
| "step": 3425 |
| }, |
| { |
| "epoch": 6.399253731343284, |
| "grad_norm": 2.2406625747680664, |
| "learning_rate": 4.2991409980055996e-05, |
| "loss": 0.1705, |
| "num_input_tokens_seen": 976712, |
| "step": 3430 |
| }, |
| { |
| "epoch": 6.4085820895522385, |
| "grad_norm": 10.277525901794434, |
| "learning_rate": 4.2963125067369894e-05, |
| "loss": 0.2392, |
| "num_input_tokens_seen": 978088, |
| "step": 3435 |
| }, |
| { |
| "epoch": 6.417910447761194, |
| "grad_norm": 9.7416353225708, |
| "learning_rate": 4.293479253933584e-05, |
| "loss": 0.0722, |
| "num_input_tokens_seen": 979560, |
| "step": 3440 |
| }, |
| { |
| "epoch": 6.427238805970149, |
| "grad_norm": 2.5611801147460938, |
| "learning_rate": 4.2906412471055675e-05, |
| "loss": 0.1689, |
| "num_input_tokens_seen": 981000, |
| "step": 3445 |
| }, |
| { |
| "epoch": 6.436567164179104, |
| "grad_norm": 5.433265209197998, |
| "learning_rate": 4.287798493775719e-05, |
| "loss": 0.2323, |
| "num_input_tokens_seen": 982568, |
| "step": 3450 |
| }, |
| { |
| "epoch": 6.44589552238806, |
| "grad_norm": 4.462908744812012, |
| "learning_rate": 4.2849510014794045e-05, |
| "loss": 0.1644, |
| "num_input_tokens_seen": 984040, |
| "step": 3455 |
| }, |
| { |
| "epoch": 6.455223880597015, |
| "grad_norm": 3.476276397705078, |
| "learning_rate": 4.2820987777645506e-05, |
| "loss": 0.1183, |
| "num_input_tokens_seen": 985384, |
| "step": 3460 |
| }, |
| { |
| "epoch": 6.46455223880597, |
| "grad_norm": 6.266582489013672, |
| "learning_rate": 4.2792418301916224e-05, |
| "loss": 0.1227, |
| "num_input_tokens_seen": 986696, |
| "step": 3465 |
| }, |
| { |
| "epoch": 6.473880597014926, |
| "grad_norm": 3.5238277912139893, |
| "learning_rate": 4.2763801663336114e-05, |
| "loss": 0.0819, |
| "num_input_tokens_seen": 988136, |
| "step": 3470 |
| }, |
| { |
| "epoch": 6.483208955223881, |
| "grad_norm": 0.9531679749488831, |
| "learning_rate": 4.273513793776006e-05, |
| "loss": 0.219, |
| "num_input_tokens_seen": 989608, |
| "step": 3475 |
| }, |
| { |
| "epoch": 6.492537313432836, |
| "grad_norm": 5.307140350341797, |
| "learning_rate": 4.2706427201167806e-05, |
| "loss": 0.0749, |
| "num_input_tokens_seen": 990984, |
| "step": 3480 |
| }, |
| { |
| "epoch": 6.5018656716417915, |
| "grad_norm": 3.4653844833374023, |
| "learning_rate": 4.267766952966369e-05, |
| "loss": 0.175, |
| "num_input_tokens_seen": 992360, |
| "step": 3485 |
| }, |
| { |
| "epoch": 6.5111940298507465, |
| "grad_norm": 3.472562313079834, |
| "learning_rate": 4.264886499947645e-05, |
| "loss": 0.2199, |
| "num_input_tokens_seen": 993864, |
| "step": 3490 |
| }, |
| { |
| "epoch": 6.520522388059701, |
| "grad_norm": 2.70194411277771, |
| "learning_rate": 4.262001368695904e-05, |
| "loss": 0.1452, |
| "num_input_tokens_seen": 995176, |
| "step": 3495 |
| }, |
| { |
| "epoch": 6.529850746268656, |
| "grad_norm": 3.1895768642425537, |
| "learning_rate": 4.2591115668588436e-05, |
| "loss": 0.1765, |
| "num_input_tokens_seen": 996680, |
| "step": 3500 |
| }, |
| { |
| "epoch": 6.539179104477612, |
| "grad_norm": 2.95902681350708, |
| "learning_rate": 4.25621710209654e-05, |
| "loss": 0.136, |
| "num_input_tokens_seen": 998024, |
| "step": 3505 |
| }, |
| { |
| "epoch": 6.548507462686567, |
| "grad_norm": 10.467219352722168, |
| "learning_rate": 4.25331798208143e-05, |
| "loss": 0.2019, |
| "num_input_tokens_seen": 999432, |
| "step": 3510 |
| }, |
| { |
| "epoch": 6.557835820895522, |
| "grad_norm": 14.205561637878418, |
| "learning_rate": 4.2504142144982916e-05, |
| "loss": 0.1812, |
| "num_input_tokens_seen": 1000712, |
| "step": 3515 |
| }, |
| { |
| "epoch": 6.567164179104478, |
| "grad_norm": 4.784630298614502, |
| "learning_rate": 4.2475058070442195e-05, |
| "loss": 0.1781, |
| "num_input_tokens_seen": 1002056, |
| "step": 3520 |
| }, |
| { |
| "epoch": 6.576492537313433, |
| "grad_norm": 5.7503180503845215, |
| "learning_rate": 4.2445927674286114e-05, |
| "loss": 0.1499, |
| "num_input_tokens_seen": 1003464, |
| "step": 3525 |
| }, |
| { |
| "epoch": 6.585820895522388, |
| "grad_norm": 2.5332953929901123, |
| "learning_rate": 4.241675103373139e-05, |
| "loss": 0.1008, |
| "num_input_tokens_seen": 1004808, |
| "step": 3530 |
| }, |
| { |
| "epoch": 6.595149253731344, |
| "grad_norm": 5.649632930755615, |
| "learning_rate": 4.238752822611735e-05, |
| "loss": 0.0991, |
| "num_input_tokens_seen": 1006120, |
| "step": 3535 |
| }, |
| { |
| "epoch": 6.604477611940299, |
| "grad_norm": 6.08969783782959, |
| "learning_rate": 4.2358259328905704e-05, |
| "loss": 0.2049, |
| "num_input_tokens_seen": 1007624, |
| "step": 3540 |
| }, |
| { |
| "epoch": 6.6138059701492535, |
| "grad_norm": 5.0536699295043945, |
| "learning_rate": 4.23289444196803e-05, |
| "loss": 0.2635, |
| "num_input_tokens_seen": 1009000, |
| "step": 3545 |
| }, |
| { |
| "epoch": 6.6231343283582085, |
| "grad_norm": 1.46120023727417, |
| "learning_rate": 4.2299583576146984e-05, |
| "loss": 0.1067, |
| "num_input_tokens_seen": 1010376, |
| "step": 3550 |
| }, |
| { |
| "epoch": 6.632462686567164, |
| "grad_norm": 8.65711498260498, |
| "learning_rate": 4.227017687613335e-05, |
| "loss": 0.1305, |
| "num_input_tokens_seen": 1011816, |
| "step": 3555 |
| }, |
| { |
| "epoch": 6.641791044776119, |
| "grad_norm": 3.085489273071289, |
| "learning_rate": 4.224072439758853e-05, |
| "loss": 0.1417, |
| "num_input_tokens_seen": 1013416, |
| "step": 3560 |
| }, |
| { |
| "epoch": 6.651119402985074, |
| "grad_norm": 4.197227954864502, |
| "learning_rate": 4.2211226218583037e-05, |
| "loss": 0.1199, |
| "num_input_tokens_seen": 1014728, |
| "step": 3565 |
| }, |
| { |
| "epoch": 6.66044776119403, |
| "grad_norm": 6.127635478973389, |
| "learning_rate": 4.2181682417308475e-05, |
| "loss": 0.1733, |
| "num_input_tokens_seen": 1016136, |
| "step": 3570 |
| }, |
| { |
| "epoch": 6.669776119402985, |
| "grad_norm": 3.593193531036377, |
| "learning_rate": 4.2152093072077435e-05, |
| "loss": 0.152, |
| "num_input_tokens_seen": 1017416, |
| "step": 3575 |
| }, |
| { |
| "epoch": 6.67910447761194, |
| "grad_norm": 4.765720367431641, |
| "learning_rate": 4.21224582613232e-05, |
| "loss": 0.1451, |
| "num_input_tokens_seen": 1019048, |
| "step": 3580 |
| }, |
| { |
| "epoch": 6.688432835820896, |
| "grad_norm": 1.5482021570205688, |
| "learning_rate": 4.2092778063599555e-05, |
| "loss": 0.0975, |
| "num_input_tokens_seen": 1020552, |
| "step": 3585 |
| }, |
| { |
| "epoch": 6.697761194029851, |
| "grad_norm": 3.191892147064209, |
| "learning_rate": 4.206305255758063e-05, |
| "loss": 0.0905, |
| "num_input_tokens_seen": 1021896, |
| "step": 3590 |
| }, |
| { |
| "epoch": 6.707089552238806, |
| "grad_norm": 0.8187964558601379, |
| "learning_rate": 4.203328182206064e-05, |
| "loss": 0.1474, |
| "num_input_tokens_seen": 1023240, |
| "step": 3595 |
| }, |
| { |
| "epoch": 6.7164179104477615, |
| "grad_norm": 5.285727500915527, |
| "learning_rate": 4.200346593595368e-05, |
| "loss": 0.1141, |
| "num_input_tokens_seen": 1024712, |
| "step": 3600 |
| }, |
| { |
| "epoch": 6.725746268656716, |
| "grad_norm": 2.864414930343628, |
| "learning_rate": 4.197360497829355e-05, |
| "loss": 0.0777, |
| "num_input_tokens_seen": 1026216, |
| "step": 3605 |
| }, |
| { |
| "epoch": 6.735074626865671, |
| "grad_norm": 2.3210506439208984, |
| "learning_rate": 4.19436990282335e-05, |
| "loss": 0.2146, |
| "num_input_tokens_seen": 1027592, |
| "step": 3610 |
| }, |
| { |
| "epoch": 6.744402985074627, |
| "grad_norm": 3.9772017002105713, |
| "learning_rate": 4.191374816504605e-05, |
| "loss": 0.0775, |
| "num_input_tokens_seen": 1029064, |
| "step": 3615 |
| }, |
| { |
| "epoch": 6.753731343283582, |
| "grad_norm": 1.6916282176971436, |
| "learning_rate": 4.1883752468122776e-05, |
| "loss": 0.1185, |
| "num_input_tokens_seen": 1030632, |
| "step": 3620 |
| }, |
| { |
| "epoch": 6.763059701492537, |
| "grad_norm": 7.303528785705566, |
| "learning_rate": 4.18537120169741e-05, |
| "loss": 0.1638, |
| "num_input_tokens_seen": 1032072, |
| "step": 3625 |
| }, |
| { |
| "epoch": 6.772388059701493, |
| "grad_norm": 4.329573154449463, |
| "learning_rate": 4.1823626891229055e-05, |
| "loss": 0.1872, |
| "num_input_tokens_seen": 1033480, |
| "step": 3630 |
| }, |
| { |
| "epoch": 6.781716417910448, |
| "grad_norm": 6.078740119934082, |
| "learning_rate": 4.179349717063512e-05, |
| "loss": 0.1924, |
| "num_input_tokens_seen": 1034792, |
| "step": 3635 |
| }, |
| { |
| "epoch": 6.791044776119403, |
| "grad_norm": 2.0301928520202637, |
| "learning_rate": 4.1763322935057974e-05, |
| "loss": 0.1588, |
| "num_input_tokens_seen": 1036200, |
| "step": 3640 |
| }, |
| { |
| "epoch": 6.800373134328359, |
| "grad_norm": 2.8798141479492188, |
| "learning_rate": 4.173310426448128e-05, |
| "loss": 0.1484, |
| "num_input_tokens_seen": 1037608, |
| "step": 3645 |
| }, |
| { |
| "epoch": 6.809701492537314, |
| "grad_norm": 7.207148551940918, |
| "learning_rate": 4.1702841239006496e-05, |
| "loss": 0.3446, |
| "num_input_tokens_seen": 1038952, |
| "step": 3650 |
| }, |
| { |
| "epoch": 6.8190298507462686, |
| "grad_norm": 3.570129871368408, |
| "learning_rate": 4.167253393885266e-05, |
| "loss": 0.1422, |
| "num_input_tokens_seen": 1040584, |
| "step": 3655 |
| }, |
| { |
| "epoch": 6.8283582089552235, |
| "grad_norm": 14.99905776977539, |
| "learning_rate": 4.164218244435617e-05, |
| "loss": 0.1712, |
| "num_input_tokens_seen": 1041960, |
| "step": 3660 |
| }, |
| { |
| "epoch": 6.837686567164179, |
| "grad_norm": 5.363063812255859, |
| "learning_rate": 4.161178683597054e-05, |
| "loss": 0.1252, |
| "num_input_tokens_seen": 1043336, |
| "step": 3665 |
| }, |
| { |
| "epoch": 6.847014925373134, |
| "grad_norm": 10.54794979095459, |
| "learning_rate": 4.158134719426627e-05, |
| "loss": 0.2996, |
| "num_input_tokens_seen": 1044712, |
| "step": 3670 |
| }, |
| { |
| "epoch": 6.856343283582089, |
| "grad_norm": 3.19741153717041, |
| "learning_rate": 4.155086359993054e-05, |
| "loss": 0.1253, |
| "num_input_tokens_seen": 1046120, |
| "step": 3675 |
| }, |
| { |
| "epoch": 6.865671641791045, |
| "grad_norm": 5.277954578399658, |
| "learning_rate": 4.152033613376704e-05, |
| "loss": 0.1648, |
| "num_input_tokens_seen": 1047592, |
| "step": 3680 |
| }, |
| { |
| "epoch": 6.875, |
| "grad_norm": 2.524416446685791, |
| "learning_rate": 4.1489764876695775e-05, |
| "loss": 0.2349, |
| "num_input_tokens_seen": 1048936, |
| "step": 3685 |
| }, |
| { |
| "epoch": 6.884328358208955, |
| "grad_norm": 2.8591644763946533, |
| "learning_rate": 4.145914990975281e-05, |
| "loss": 0.1575, |
| "num_input_tokens_seen": 1050376, |
| "step": 3690 |
| }, |
| { |
| "epoch": 6.893656716417911, |
| "grad_norm": 6.1221923828125, |
| "learning_rate": 4.1428491314090076e-05, |
| "loss": 0.2159, |
| "num_input_tokens_seen": 1051816, |
| "step": 3695 |
| }, |
| { |
| "epoch": 6.902985074626866, |
| "grad_norm": 2.083186388015747, |
| "learning_rate": 4.1397789170975154e-05, |
| "loss": 0.2511, |
| "num_input_tokens_seen": 1053128, |
| "step": 3700 |
| }, |
| { |
| "epoch": 6.912313432835821, |
| "grad_norm": 23.185707092285156, |
| "learning_rate": 4.1367043561791055e-05, |
| "loss": 0.1561, |
| "num_input_tokens_seen": 1054440, |
| "step": 3705 |
| }, |
| { |
| "epoch": 6.9216417910447765, |
| "grad_norm": 6.234776496887207, |
| "learning_rate": 4.1336254568036e-05, |
| "loss": 0.2811, |
| "num_input_tokens_seen": 1055816, |
| "step": 3710 |
| }, |
| { |
| "epoch": 6.9309701492537314, |
| "grad_norm": 0.9195011854171753, |
| "learning_rate": 4.130542227132323e-05, |
| "loss": 0.1156, |
| "num_input_tokens_seen": 1057288, |
| "step": 3715 |
| }, |
| { |
| "epoch": 6.940298507462686, |
| "grad_norm": 1.783787727355957, |
| "learning_rate": 4.127454675338076e-05, |
| "loss": 0.1161, |
| "num_input_tokens_seen": 1058888, |
| "step": 3720 |
| }, |
| { |
| "epoch": 6.949626865671641, |
| "grad_norm": 4.195655345916748, |
| "learning_rate": 4.124362809605117e-05, |
| "loss": 0.2369, |
| "num_input_tokens_seen": 1060296, |
| "step": 3725 |
| }, |
| { |
| "epoch": 6.958955223880597, |
| "grad_norm": 3.6299350261688232, |
| "learning_rate": 4.1212666381291386e-05, |
| "loss": 0.1201, |
| "num_input_tokens_seen": 1061640, |
| "step": 3730 |
| }, |
| { |
| "epoch": 6.968283582089552, |
| "grad_norm": 0.7743578553199768, |
| "learning_rate": 4.118166169117248e-05, |
| "loss": 0.126, |
| "num_input_tokens_seen": 1062888, |
| "step": 3735 |
| }, |
| { |
| "epoch": 6.977611940298507, |
| "grad_norm": 3.4953243732452393, |
| "learning_rate": 4.115061410787943e-05, |
| "loss": 0.1541, |
| "num_input_tokens_seen": 1064328, |
| "step": 3740 |
| }, |
| { |
| "epoch": 6.986940298507463, |
| "grad_norm": 2.799726724624634, |
| "learning_rate": 4.1119523713710904e-05, |
| "loss": 0.081, |
| "num_input_tokens_seen": 1065928, |
| "step": 3745 |
| }, |
| { |
| "epoch": 6.996268656716418, |
| "grad_norm": 1.76080322265625, |
| "learning_rate": 4.1088390591079097e-05, |
| "loss": 0.1983, |
| "num_input_tokens_seen": 1067176, |
| "step": 3750 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.8697178959846497, |
| "eval_runtime": 3.2119, |
| "eval_samples_per_second": 74.1, |
| "eval_steps_per_second": 18.681, |
| "num_input_tokens_seen": 1067520, |
| "step": 3752 |
| }, |
| { |
| "epoch": 7.005597014925373, |
| "grad_norm": 3.5381438732147217, |
| "learning_rate": 4.1057214822509394e-05, |
| "loss": 0.1155, |
| "num_input_tokens_seen": 1068192, |
| "step": 3755 |
| }, |
| { |
| "epoch": 7.014925373134329, |
| "grad_norm": 3.3822264671325684, |
| "learning_rate": 4.1025996490640276e-05, |
| "loss": 0.1101, |
| "num_input_tokens_seen": 1069600, |
| "step": 3760 |
| }, |
| { |
| "epoch": 7.024253731343284, |
| "grad_norm": 3.9424831867218018, |
| "learning_rate": 4.099473567822303e-05, |
| "loss": 0.105, |
| "num_input_tokens_seen": 1070880, |
| "step": 3765 |
| }, |
| { |
| "epoch": 7.0335820895522385, |
| "grad_norm": 2.811324119567871, |
| "learning_rate": 4.096343246812155e-05, |
| "loss": 0.0438, |
| "num_input_tokens_seen": 1072416, |
| "step": 3770 |
| }, |
| { |
| "epoch": 7.042910447761194, |
| "grad_norm": 5.017489433288574, |
| "learning_rate": 4.093208694331211e-05, |
| "loss": 0.0922, |
| "num_input_tokens_seen": 1073824, |
| "step": 3775 |
| }, |
| { |
| "epoch": 7.052238805970149, |
| "grad_norm": 4.754647254943848, |
| "learning_rate": 4.090069918688315e-05, |
| "loss": 0.0392, |
| "num_input_tokens_seen": 1074976, |
| "step": 3780 |
| }, |
| { |
| "epoch": 7.061567164179104, |
| "grad_norm": 4.402016639709473, |
| "learning_rate": 4.0869269282035057e-05, |
| "loss": 0.053, |
| "num_input_tokens_seen": 1076416, |
| "step": 3785 |
| }, |
| { |
| "epoch": 7.07089552238806, |
| "grad_norm": 2.5235650539398193, |
| "learning_rate": 4.083779731207994e-05, |
| "loss": 0.0806, |
| "num_input_tokens_seen": 1077920, |
| "step": 3790 |
| }, |
| { |
| "epoch": 7.080223880597015, |
| "grad_norm": 3.4549667835235596, |
| "learning_rate": 4.080628336044142e-05, |
| "loss": 0.0718, |
| "num_input_tokens_seen": 1079776, |
| "step": 3795 |
| }, |
| { |
| "epoch": 7.08955223880597, |
| "grad_norm": 7.606328010559082, |
| "learning_rate": 4.077472751065439e-05, |
| "loss": 0.0882, |
| "num_input_tokens_seen": 1081088, |
| "step": 3800 |
| }, |
| { |
| "epoch": 7.098880597014926, |
| "grad_norm": 5.059791564941406, |
| "learning_rate": 4.074312984636479e-05, |
| "loss": 0.0872, |
| "num_input_tokens_seen": 1082624, |
| "step": 3805 |
| }, |
| { |
| "epoch": 7.108208955223881, |
| "grad_norm": 1.167157530784607, |
| "learning_rate": 4.0711490451329435e-05, |
| "loss": 0.0438, |
| "num_input_tokens_seen": 1084000, |
| "step": 3810 |
| }, |
| { |
| "epoch": 7.117537313432836, |
| "grad_norm": 4.197689533233643, |
| "learning_rate": 4.0679809409415734e-05, |
| "loss": 0.0682, |
| "num_input_tokens_seen": 1085504, |
| "step": 3815 |
| }, |
| { |
| "epoch": 7.126865671641791, |
| "grad_norm": 1.019307255744934, |
| "learning_rate": 4.064808680460148e-05, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 1086848, |
| "step": 3820 |
| }, |
| { |
| "epoch": 7.1361940298507465, |
| "grad_norm": 2.008424758911133, |
| "learning_rate": 4.0616322720974664e-05, |
| "loss": 0.0757, |
| "num_input_tokens_seen": 1088160, |
| "step": 3825 |
| }, |
| { |
| "epoch": 7.145522388059701, |
| "grad_norm": 4.1604743003845215, |
| "learning_rate": 4.05845172427332e-05, |
| "loss": 0.1263, |
| "num_input_tokens_seen": 1089568, |
| "step": 3830 |
| }, |
| { |
| "epoch": 7.154850746268656, |
| "grad_norm": 4.1245269775390625, |
| "learning_rate": 4.055267045418476e-05, |
| "loss": 0.1347, |
| "num_input_tokens_seen": 1090752, |
| "step": 3835 |
| }, |
| { |
| "epoch": 7.164179104477612, |
| "grad_norm": 4.09068489074707, |
| "learning_rate": 4.052078243974648e-05, |
| "loss": 0.064, |
| "num_input_tokens_seen": 1092128, |
| "step": 3840 |
| }, |
| { |
| "epoch": 7.173507462686567, |
| "grad_norm": 2.140700101852417, |
| "learning_rate": 4.0488853283944806e-05, |
| "loss": 0.0842, |
| "num_input_tokens_seen": 1093376, |
| "step": 3845 |
| }, |
| { |
| "epoch": 7.182835820895522, |
| "grad_norm": 2.3313193321228027, |
| "learning_rate": 4.045688307141523e-05, |
| "loss": 0.0676, |
| "num_input_tokens_seen": 1094848, |
| "step": 3850 |
| }, |
| { |
| "epoch": 7.192164179104478, |
| "grad_norm": 5.560648441314697, |
| "learning_rate": 4.042487188690205e-05, |
| "loss": 0.1219, |
| "num_input_tokens_seen": 1096352, |
| "step": 3855 |
| }, |
| { |
| "epoch": 7.201492537313433, |
| "grad_norm": 4.215773582458496, |
| "learning_rate": 4.039281981525821e-05, |
| "loss": 0.079, |
| "num_input_tokens_seen": 1097824, |
| "step": 3860 |
| }, |
| { |
| "epoch": 7.210820895522388, |
| "grad_norm": 24.869749069213867, |
| "learning_rate": 4.036072694144501e-05, |
| "loss": 0.1014, |
| "num_input_tokens_seen": 1099136, |
| "step": 3865 |
| }, |
| { |
| "epoch": 7.220149253731344, |
| "grad_norm": 2.4490253925323486, |
| "learning_rate": 4.032859335053189e-05, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 1100512, |
| "step": 3870 |
| }, |
| { |
| "epoch": 7.229477611940299, |
| "grad_norm": 9.224404335021973, |
| "learning_rate": 4.029641912769628e-05, |
| "loss": 0.1759, |
| "num_input_tokens_seen": 1101856, |
| "step": 3875 |
| }, |
| { |
| "epoch": 7.2388059701492535, |
| "grad_norm": 5.647240161895752, |
| "learning_rate": 4.026420435822325e-05, |
| "loss": 0.1105, |
| "num_input_tokens_seen": 1103264, |
| "step": 3880 |
| }, |
| { |
| "epoch": 7.248134328358209, |
| "grad_norm": 23.588088989257812, |
| "learning_rate": 4.0231949127505365e-05, |
| "loss": 0.3376, |
| "num_input_tokens_seen": 1104736, |
| "step": 3885 |
| }, |
| { |
| "epoch": 7.257462686567164, |
| "grad_norm": 1.3037164211273193, |
| "learning_rate": 4.0199653521042464e-05, |
| "loss": 0.0495, |
| "num_input_tokens_seen": 1106336, |
| "step": 3890 |
| }, |
| { |
| "epoch": 7.266791044776119, |
| "grad_norm": 3.902587652206421, |
| "learning_rate": 4.0167317624441393e-05, |
| "loss": 0.1494, |
| "num_input_tokens_seen": 1107808, |
| "step": 3895 |
| }, |
| { |
| "epoch": 7.276119402985074, |
| "grad_norm": 3.893819808959961, |
| "learning_rate": 4.013494152341579e-05, |
| "loss": 0.1113, |
| "num_input_tokens_seen": 1109184, |
| "step": 3900 |
| }, |
| { |
| "epoch": 7.28544776119403, |
| "grad_norm": 3.1408629417419434, |
| "learning_rate": 4.010252530378589e-05, |
| "loss": 0.0933, |
| "num_input_tokens_seen": 1110720, |
| "step": 3905 |
| }, |
| { |
| "epoch": 7.294776119402985, |
| "grad_norm": 1.87326180934906, |
| "learning_rate": 4.007006905147824e-05, |
| "loss": 0.045, |
| "num_input_tokens_seen": 1112096, |
| "step": 3910 |
| }, |
| { |
| "epoch": 7.30410447761194, |
| "grad_norm": 4.533140182495117, |
| "learning_rate": 4.003757285252554e-05, |
| "loss": 0.077, |
| "num_input_tokens_seen": 1113408, |
| "step": 3915 |
| }, |
| { |
| "epoch": 7.313432835820896, |
| "grad_norm": 4.174347400665283, |
| "learning_rate": 4.0005036793066316e-05, |
| "loss": 0.1107, |
| "num_input_tokens_seen": 1114784, |
| "step": 3920 |
| }, |
| { |
| "epoch": 7.322761194029851, |
| "grad_norm": 1.7349988222122192, |
| "learning_rate": 3.997246095934483e-05, |
| "loss": 0.0541, |
| "num_input_tokens_seen": 1116416, |
| "step": 3925 |
| }, |
| { |
| "epoch": 7.332089552238806, |
| "grad_norm": 8.258286476135254, |
| "learning_rate": 3.993984543771071e-05, |
| "loss": 0.0673, |
| "num_input_tokens_seen": 1117824, |
| "step": 3930 |
| }, |
| { |
| "epoch": 7.3414179104477615, |
| "grad_norm": 4.800844669342041, |
| "learning_rate": 3.990719031461884e-05, |
| "loss": 0.1271, |
| "num_input_tokens_seen": 1119136, |
| "step": 3935 |
| }, |
| { |
| "epoch": 7.350746268656716, |
| "grad_norm": 2.7747137546539307, |
| "learning_rate": 3.987449567662902e-05, |
| "loss": 0.1284, |
| "num_input_tokens_seen": 1120480, |
| "step": 3940 |
| }, |
| { |
| "epoch": 7.360074626865671, |
| "grad_norm": 4.568089008331299, |
| "learning_rate": 3.9841761610405845e-05, |
| "loss": 0.063, |
| "num_input_tokens_seen": 1121856, |
| "step": 3945 |
| }, |
| { |
| "epoch": 7.369402985074627, |
| "grad_norm": 1.3297767639160156, |
| "learning_rate": 3.980898820271839e-05, |
| "loss": 0.1009, |
| "num_input_tokens_seen": 1123136, |
| "step": 3950 |
| }, |
| { |
| "epoch": 7.378731343283582, |
| "grad_norm": 2.215484142303467, |
| "learning_rate": 3.977617554044004e-05, |
| "loss": 0.0751, |
| "num_input_tokens_seen": 1124480, |
| "step": 3955 |
| }, |
| { |
| "epoch": 7.388059701492537, |
| "grad_norm": 3.5656235218048096, |
| "learning_rate": 3.9743323710548196e-05, |
| "loss": 0.2743, |
| "num_input_tokens_seen": 1125856, |
| "step": 3960 |
| }, |
| { |
| "epoch": 7.397388059701493, |
| "grad_norm": 3.2043938636779785, |
| "learning_rate": 3.971043280012413e-05, |
| "loss": 0.1174, |
| "num_input_tokens_seen": 1127264, |
| "step": 3965 |
| }, |
| { |
| "epoch": 7.406716417910448, |
| "grad_norm": 2.498500347137451, |
| "learning_rate": 3.967750289635266e-05, |
| "loss": 0.0755, |
| "num_input_tokens_seen": 1128640, |
| "step": 3970 |
| }, |
| { |
| "epoch": 7.416044776119403, |
| "grad_norm": 5.654205322265625, |
| "learning_rate": 3.9644534086521986e-05, |
| "loss": 0.2114, |
| "num_input_tokens_seen": 1129984, |
| "step": 3975 |
| }, |
| { |
| "epoch": 7.425373134328359, |
| "grad_norm": 2.302154064178467, |
| "learning_rate": 3.961152645802343e-05, |
| "loss": 0.0905, |
| "num_input_tokens_seen": 1131552, |
| "step": 3980 |
| }, |
| { |
| "epoch": 7.434701492537314, |
| "grad_norm": 7.422224521636963, |
| "learning_rate": 3.9578480098351244e-05, |
| "loss": 0.1493, |
| "num_input_tokens_seen": 1132928, |
| "step": 3985 |
| }, |
| { |
| "epoch": 7.4440298507462686, |
| "grad_norm": 1.3771966695785522, |
| "learning_rate": 3.95453950951023e-05, |
| "loss": 0.0741, |
| "num_input_tokens_seen": 1134336, |
| "step": 3990 |
| }, |
| { |
| "epoch": 7.4533582089552235, |
| "grad_norm": 1.76166832447052, |
| "learning_rate": 3.951227153597592e-05, |
| "loss": 0.0858, |
| "num_input_tokens_seen": 1135840, |
| "step": 3995 |
| }, |
| { |
| "epoch": 7.462686567164179, |
| "grad_norm": 5.828932762145996, |
| "learning_rate": 3.947910950877364e-05, |
| "loss": 0.0865, |
| "num_input_tokens_seen": 1137216, |
| "step": 4000 |
| }, |
| { |
| "epoch": 7.472014925373134, |
| "grad_norm": 0.807457447052002, |
| "learning_rate": 3.944590910139895e-05, |
| "loss": 0.1167, |
| "num_input_tokens_seen": 1138528, |
| "step": 4005 |
| }, |
| { |
| "epoch": 7.481343283582089, |
| "grad_norm": 34.976524353027344, |
| "learning_rate": 3.941267040185707e-05, |
| "loss": 0.1341, |
| "num_input_tokens_seen": 1139872, |
| "step": 4010 |
| }, |
| { |
| "epoch": 7.490671641791045, |
| "grad_norm": 5.210699558258057, |
| "learning_rate": 3.937939349825475e-05, |
| "loss": 0.0804, |
| "num_input_tokens_seen": 1141408, |
| "step": 4015 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 7.250500202178955, |
| "learning_rate": 3.934607847879999e-05, |
| "loss": 0.1551, |
| "num_input_tokens_seen": 1142912, |
| "step": 4020 |
| }, |
| { |
| "epoch": 7.509328358208955, |
| "grad_norm": 1.241698145866394, |
| "learning_rate": 3.93127254318018e-05, |
| "loss": 0.0766, |
| "num_input_tokens_seen": 1144256, |
| "step": 4025 |
| }, |
| { |
| "epoch": 7.518656716417911, |
| "grad_norm": 6.604357719421387, |
| "learning_rate": 3.927933444567004e-05, |
| "loss": 0.1487, |
| "num_input_tokens_seen": 1145568, |
| "step": 4030 |
| }, |
| { |
| "epoch": 7.527985074626866, |
| "grad_norm": 7.191267013549805, |
| "learning_rate": 3.924590560891511e-05, |
| "loss": 0.0922, |
| "num_input_tokens_seen": 1147008, |
| "step": 4035 |
| }, |
| { |
| "epoch": 7.537313432835821, |
| "grad_norm": 4.750164031982422, |
| "learning_rate": 3.921243901014773e-05, |
| "loss": 0.0476, |
| "num_input_tokens_seen": 1148416, |
| "step": 4040 |
| }, |
| { |
| "epoch": 7.5466417910447765, |
| "grad_norm": 2.237189769744873, |
| "learning_rate": 3.9178934738078745e-05, |
| "loss": 0.0554, |
| "num_input_tokens_seen": 1149888, |
| "step": 4045 |
| }, |
| { |
| "epoch": 7.5559701492537314, |
| "grad_norm": 4.023770809173584, |
| "learning_rate": 3.914539288151884e-05, |
| "loss": 0.1127, |
| "num_input_tokens_seen": 1151360, |
| "step": 4050 |
| }, |
| { |
| "epoch": 7.565298507462686, |
| "grad_norm": 1.2493197917938232, |
| "learning_rate": 3.9111813529378336e-05, |
| "loss": 0.1327, |
| "num_input_tokens_seen": 1152800, |
| "step": 4055 |
| }, |
| { |
| "epoch": 7.574626865671641, |
| "grad_norm": 4.745022296905518, |
| "learning_rate": 3.907819677066694e-05, |
| "loss": 0.0573, |
| "num_input_tokens_seen": 1154272, |
| "step": 4060 |
| }, |
| { |
| "epoch": 7.583955223880597, |
| "grad_norm": 3.730696678161621, |
| "learning_rate": 3.904454269449351e-05, |
| "loss": 0.1078, |
| "num_input_tokens_seen": 1155584, |
| "step": 4065 |
| }, |
| { |
| "epoch": 7.593283582089552, |
| "grad_norm": 3.649820327758789, |
| "learning_rate": 3.901085139006585e-05, |
| "loss": 0.1143, |
| "num_input_tokens_seen": 1157152, |
| "step": 4070 |
| }, |
| { |
| "epoch": 7.602611940298507, |
| "grad_norm": 7.420780181884766, |
| "learning_rate": 3.8977122946690395e-05, |
| "loss": 0.1089, |
| "num_input_tokens_seen": 1158432, |
| "step": 4075 |
| }, |
| { |
| "epoch": 7.611940298507463, |
| "grad_norm": 2.127516984939575, |
| "learning_rate": 3.894335745377208e-05, |
| "loss": 0.057, |
| "num_input_tokens_seen": 1160032, |
| "step": 4080 |
| }, |
| { |
| "epoch": 7.621268656716418, |
| "grad_norm": 4.309483051300049, |
| "learning_rate": 3.8909555000814e-05, |
| "loss": 0.1039, |
| "num_input_tokens_seen": 1161536, |
| "step": 4085 |
| }, |
| { |
| "epoch": 7.630597014925373, |
| "grad_norm": 4.552899360656738, |
| "learning_rate": 3.8875715677417255e-05, |
| "loss": 0.1482, |
| "num_input_tokens_seen": 1163072, |
| "step": 4090 |
| }, |
| { |
| "epoch": 7.639925373134329, |
| "grad_norm": 2.146493911743164, |
| "learning_rate": 3.884183957328067e-05, |
| "loss": 0.0429, |
| "num_input_tokens_seen": 1164384, |
| "step": 4095 |
| }, |
| { |
| "epoch": 7.649253731343284, |
| "grad_norm": 5.189268589019775, |
| "learning_rate": 3.880792677820056e-05, |
| "loss": 0.1095, |
| "num_input_tokens_seen": 1165728, |
| "step": 4100 |
| }, |
| { |
| "epoch": 7.6585820895522385, |
| "grad_norm": 2.359233856201172, |
| "learning_rate": 3.87739773820705e-05, |
| "loss": 0.0744, |
| "num_input_tokens_seen": 1167264, |
| "step": 4105 |
| }, |
| { |
| "epoch": 7.667910447761194, |
| "grad_norm": 2.3464906215667725, |
| "learning_rate": 3.873999147488108e-05, |
| "loss": 0.156, |
| "num_input_tokens_seen": 1168800, |
| "step": 4110 |
| }, |
| { |
| "epoch": 7.677238805970149, |
| "grad_norm": 2.5241103172302246, |
| "learning_rate": 3.8705969146719686e-05, |
| "loss": 0.1229, |
| "num_input_tokens_seen": 1170240, |
| "step": 4115 |
| }, |
| { |
| "epoch": 7.686567164179104, |
| "grad_norm": 8.90768814086914, |
| "learning_rate": 3.8671910487770223e-05, |
| "loss": 0.0701, |
| "num_input_tokens_seen": 1171936, |
| "step": 4120 |
| }, |
| { |
| "epoch": 7.69589552238806, |
| "grad_norm": 4.894184589385986, |
| "learning_rate": 3.863781558831292e-05, |
| "loss": 0.1053, |
| "num_input_tokens_seen": 1173216, |
| "step": 4125 |
| }, |
| { |
| "epoch": 7.705223880597015, |
| "grad_norm": 2.496595621109009, |
| "learning_rate": 3.8603684538724055e-05, |
| "loss": 0.1036, |
| "num_input_tokens_seen": 1174496, |
| "step": 4130 |
| }, |
| { |
| "epoch": 7.71455223880597, |
| "grad_norm": 4.656245708465576, |
| "learning_rate": 3.856951742947574e-05, |
| "loss": 0.096, |
| "num_input_tokens_seen": 1176032, |
| "step": 4135 |
| }, |
| { |
| "epoch": 7.723880597014926, |
| "grad_norm": 2.24885892868042, |
| "learning_rate": 3.8535314351135674e-05, |
| "loss": 0.1538, |
| "num_input_tokens_seen": 1177472, |
| "step": 4140 |
| }, |
| { |
| "epoch": 7.733208955223881, |
| "grad_norm": 6.146028518676758, |
| "learning_rate": 3.850107539436689e-05, |
| "loss": 0.0775, |
| "num_input_tokens_seen": 1178976, |
| "step": 4145 |
| }, |
| { |
| "epoch": 7.742537313432836, |
| "grad_norm": 2.241814374923706, |
| "learning_rate": 3.8466800649927536e-05, |
| "loss": 0.1338, |
| "num_input_tokens_seen": 1180448, |
| "step": 4150 |
| }, |
| { |
| "epoch": 7.7518656716417915, |
| "grad_norm": 9.829728126525879, |
| "learning_rate": 3.84324902086706e-05, |
| "loss": 0.0978, |
| "num_input_tokens_seen": 1181888, |
| "step": 4155 |
| }, |
| { |
| "epoch": 7.7611940298507465, |
| "grad_norm": 1.3478586673736572, |
| "learning_rate": 3.839814416154371e-05, |
| "loss": 0.0809, |
| "num_input_tokens_seen": 1183456, |
| "step": 4160 |
| }, |
| { |
| "epoch": 7.770522388059701, |
| "grad_norm": 7.280726432800293, |
| "learning_rate": 3.83637625995889e-05, |
| "loss": 0.1153, |
| "num_input_tokens_seen": 1184800, |
| "step": 4165 |
| }, |
| { |
| "epoch": 7.779850746268656, |
| "grad_norm": 5.220284938812256, |
| "learning_rate": 3.832934561394229e-05, |
| "loss": 0.1406, |
| "num_input_tokens_seen": 1186304, |
| "step": 4170 |
| }, |
| { |
| "epoch": 7.789179104477612, |
| "grad_norm": 2.703441858291626, |
| "learning_rate": 3.829489329583394e-05, |
| "loss": 0.1835, |
| "num_input_tokens_seen": 1187744, |
| "step": 4175 |
| }, |
| { |
| "epoch": 7.798507462686567, |
| "grad_norm": 2.567797899246216, |
| "learning_rate": 3.8260405736587546e-05, |
| "loss": 0.0903, |
| "num_input_tokens_seen": 1189376, |
| "step": 4180 |
| }, |
| { |
| "epoch": 7.807835820895522, |
| "grad_norm": 5.175423622131348, |
| "learning_rate": 3.822588302762024e-05, |
| "loss": 0.1082, |
| "num_input_tokens_seen": 1190912, |
| "step": 4185 |
| }, |
| { |
| "epoch": 7.817164179104478, |
| "grad_norm": 2.9097766876220703, |
| "learning_rate": 3.81913252604423e-05, |
| "loss": 0.1852, |
| "num_input_tokens_seen": 1192416, |
| "step": 4190 |
| }, |
| { |
| "epoch": 7.826492537313433, |
| "grad_norm": 5.9409565925598145, |
| "learning_rate": 3.815673252665696e-05, |
| "loss": 0.1253, |
| "num_input_tokens_seen": 1193792, |
| "step": 4195 |
| }, |
| { |
| "epoch": 7.835820895522388, |
| "grad_norm": 1.2797726392745972, |
| "learning_rate": 3.812210491796011e-05, |
| "loss": 0.0628, |
| "num_input_tokens_seen": 1195200, |
| "step": 4200 |
| }, |
| { |
| "epoch": 7.845149253731344, |
| "grad_norm": 3.7713513374328613, |
| "learning_rate": 3.808744252614012e-05, |
| "loss": 0.0882, |
| "num_input_tokens_seen": 1196672, |
| "step": 4205 |
| }, |
| { |
| "epoch": 7.854477611940299, |
| "grad_norm": 4.551257610321045, |
| "learning_rate": 3.805274544307752e-05, |
| "loss": 0.1047, |
| "num_input_tokens_seen": 1198336, |
| "step": 4210 |
| }, |
| { |
| "epoch": 7.8638059701492535, |
| "grad_norm": 2.525516986846924, |
| "learning_rate": 3.8018013760744844e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 1199712, |
| "step": 4215 |
| }, |
| { |
| "epoch": 7.8731343283582085, |
| "grad_norm": 1.092524528503418, |
| "learning_rate": 3.798324757120629e-05, |
| "loss": 0.0592, |
| "num_input_tokens_seen": 1201088, |
| "step": 4220 |
| }, |
| { |
| "epoch": 7.882462686567164, |
| "grad_norm": 17.674110412597656, |
| "learning_rate": 3.794844696661757e-05, |
| "loss": 0.1366, |
| "num_input_tokens_seen": 1202432, |
| "step": 4225 |
| }, |
| { |
| "epoch": 7.891791044776119, |
| "grad_norm": 4.679252624511719, |
| "learning_rate": 3.7913612039225596e-05, |
| "loss": 0.0941, |
| "num_input_tokens_seen": 1203872, |
| "step": 4230 |
| }, |
| { |
| "epoch": 7.901119402985074, |
| "grad_norm": 9.335509300231934, |
| "learning_rate": 3.787874288136824e-05, |
| "loss": 0.0787, |
| "num_input_tokens_seen": 1205312, |
| "step": 4235 |
| }, |
| { |
| "epoch": 7.91044776119403, |
| "grad_norm": 2.389305353164673, |
| "learning_rate": 3.7843839585474186e-05, |
| "loss": 0.1002, |
| "num_input_tokens_seen": 1206688, |
| "step": 4240 |
| }, |
| { |
| "epoch": 7.919776119402985, |
| "grad_norm": 4.4145894050598145, |
| "learning_rate": 3.78089022440625e-05, |
| "loss": 0.1364, |
| "num_input_tokens_seen": 1208064, |
| "step": 4245 |
| }, |
| { |
| "epoch": 7.92910447761194, |
| "grad_norm": 5.871304988861084, |
| "learning_rate": 3.777393094974259e-05, |
| "loss": 0.0953, |
| "num_input_tokens_seen": 1209600, |
| "step": 4250 |
| }, |
| { |
| "epoch": 7.938432835820896, |
| "grad_norm": 0.7353957295417786, |
| "learning_rate": 3.773892579521381e-05, |
| "loss": 0.1098, |
| "num_input_tokens_seen": 1210976, |
| "step": 4255 |
| }, |
| { |
| "epoch": 7.947761194029851, |
| "grad_norm": 7.335037708282471, |
| "learning_rate": 3.7703886873265285e-05, |
| "loss": 0.1243, |
| "num_input_tokens_seen": 1212416, |
| "step": 4260 |
| }, |
| { |
| "epoch": 7.957089552238806, |
| "grad_norm": 2.7760443687438965, |
| "learning_rate": 3.766881427677563e-05, |
| "loss": 0.0685, |
| "num_input_tokens_seen": 1213920, |
| "step": 4265 |
| }, |
| { |
| "epoch": 7.9664179104477615, |
| "grad_norm": 5.166417121887207, |
| "learning_rate": 3.7633708098712766e-05, |
| "loss": 0.0913, |
| "num_input_tokens_seen": 1215360, |
| "step": 4270 |
| }, |
| { |
| "epoch": 7.975746268656716, |
| "grad_norm": 5.738154888153076, |
| "learning_rate": 3.7598568432133586e-05, |
| "loss": 0.0571, |
| "num_input_tokens_seen": 1216736, |
| "step": 4275 |
| }, |
| { |
| "epoch": 7.985074626865671, |
| "grad_norm": 0.8824310302734375, |
| "learning_rate": 3.7563395370183755e-05, |
| "loss": 0.1583, |
| "num_input_tokens_seen": 1218240, |
| "step": 4280 |
| }, |
| { |
| "epoch": 7.994402985074627, |
| "grad_norm": 4.588912487030029, |
| "learning_rate": 3.75281890060975e-05, |
| "loss": 0.1768, |
| "num_input_tokens_seen": 1219488, |
| "step": 4285 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.9656381607055664, |
| "eval_runtime": 2.9202, |
| "eval_samples_per_second": 81.5, |
| "eval_steps_per_second": 20.546, |
| "num_input_tokens_seen": 1220200, |
| "step": 4288 |
| }, |
| { |
| "epoch": 8.003731343283581, |
| "grad_norm": 9.171119689941406, |
| "learning_rate": 3.749294943319728e-05, |
| "loss": 0.2059, |
| "num_input_tokens_seen": 1220808, |
| "step": 4290 |
| }, |
| { |
| "epoch": 8.013059701492537, |
| "grad_norm": 2.7020506858825684, |
| "learning_rate": 3.7457676744893594e-05, |
| "loss": 0.1027, |
| "num_input_tokens_seen": 1222280, |
| "step": 4295 |
| }, |
| { |
| "epoch": 8.022388059701493, |
| "grad_norm": 0.8913432359695435, |
| "learning_rate": 3.7422371034684735e-05, |
| "loss": 0.0788, |
| "num_input_tokens_seen": 1223688, |
| "step": 4300 |
| }, |
| { |
| "epoch": 8.031716417910447, |
| "grad_norm": 16.711389541625977, |
| "learning_rate": 3.73870323961565e-05, |
| "loss": 0.0888, |
| "num_input_tokens_seen": 1225352, |
| "step": 4305 |
| }, |
| { |
| "epoch": 8.041044776119403, |
| "grad_norm": 3.333115816116333, |
| "learning_rate": 3.735166092298199e-05, |
| "loss": 0.0604, |
| "num_input_tokens_seen": 1226824, |
| "step": 4310 |
| }, |
| { |
| "epoch": 8.050373134328359, |
| "grad_norm": 3.606842279434204, |
| "learning_rate": 3.731625670892135e-05, |
| "loss": 0.0283, |
| "num_input_tokens_seen": 1228232, |
| "step": 4315 |
| }, |
| { |
| "epoch": 8.059701492537313, |
| "grad_norm": 3.3428258895874023, |
| "learning_rate": 3.7280819847821504e-05, |
| "loss": 0.0289, |
| "num_input_tokens_seen": 1229672, |
| "step": 4320 |
| }, |
| { |
| "epoch": 8.069029850746269, |
| "grad_norm": 3.6926119327545166, |
| "learning_rate": 3.724535043361589e-05, |
| "loss": 0.0964, |
| "num_input_tokens_seen": 1230920, |
| "step": 4325 |
| }, |
| { |
| "epoch": 8.078358208955224, |
| "grad_norm": 0.509421169757843, |
| "learning_rate": 3.720984856032428e-05, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 1232232, |
| "step": 4330 |
| }, |
| { |
| "epoch": 8.087686567164178, |
| "grad_norm": 0.8331343531608582, |
| "learning_rate": 3.717431432205244e-05, |
| "loss": 0.0548, |
| "num_input_tokens_seen": 1233640, |
| "step": 4335 |
| }, |
| { |
| "epoch": 8.097014925373134, |
| "grad_norm": 11.62999153137207, |
| "learning_rate": 3.713874781299196e-05, |
| "loss": 0.0728, |
| "num_input_tokens_seen": 1235048, |
| "step": 4340 |
| }, |
| { |
| "epoch": 8.10634328358209, |
| "grad_norm": 9.598188400268555, |
| "learning_rate": 3.710314912741997e-05, |
| "loss": 0.0338, |
| "num_input_tokens_seen": 1236552, |
| "step": 4345 |
| }, |
| { |
| "epoch": 8.115671641791044, |
| "grad_norm": 0.6860648393630981, |
| "learning_rate": 3.7067518359698856e-05, |
| "loss": 0.083, |
| "num_input_tokens_seen": 1237896, |
| "step": 4350 |
| }, |
| { |
| "epoch": 8.125, |
| "grad_norm": 5.933894157409668, |
| "learning_rate": 3.70318556042761e-05, |
| "loss": 0.0514, |
| "num_input_tokens_seen": 1239112, |
| "step": 4355 |
| }, |
| { |
| "epoch": 8.134328358208956, |
| "grad_norm": 4.2529401779174805, |
| "learning_rate": 3.6996160955683924e-05, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 1240744, |
| "step": 4360 |
| }, |
| { |
| "epoch": 8.14365671641791, |
| "grad_norm": 2.004309892654419, |
| "learning_rate": 3.696043450853912e-05, |
| "loss": 0.0484, |
| "num_input_tokens_seen": 1242152, |
| "step": 4365 |
| }, |
| { |
| "epoch": 8.152985074626866, |
| "grad_norm": 4.186731815338135, |
| "learning_rate": 3.692467635754276e-05, |
| "loss": 0.0251, |
| "num_input_tokens_seen": 1243496, |
| "step": 4370 |
| }, |
| { |
| "epoch": 8.162313432835822, |
| "grad_norm": 1.1354295015335083, |
| "learning_rate": 3.688888659747995e-05, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 1245064, |
| "step": 4375 |
| }, |
| { |
| "epoch": 8.171641791044776, |
| "grad_norm": 8.906183242797852, |
| "learning_rate": 3.6853065323219606e-05, |
| "loss": 0.0722, |
| "num_input_tokens_seen": 1246408, |
| "step": 4380 |
| }, |
| { |
| "epoch": 8.180970149253731, |
| "grad_norm": 3.374791383743286, |
| "learning_rate": 3.681721262971413e-05, |
| "loss": 0.129, |
| "num_input_tokens_seen": 1247784, |
| "step": 4385 |
| }, |
| { |
| "epoch": 8.190298507462687, |
| "grad_norm": 1.3529633283615112, |
| "learning_rate": 3.678132861199927e-05, |
| "loss": 0.0649, |
| "num_input_tokens_seen": 1249192, |
| "step": 4390 |
| }, |
| { |
| "epoch": 8.199626865671641, |
| "grad_norm": 2.120964527130127, |
| "learning_rate": 3.674541336519376e-05, |
| "loss": 0.0444, |
| "num_input_tokens_seen": 1250664, |
| "step": 4395 |
| }, |
| { |
| "epoch": 8.208955223880597, |
| "grad_norm": 4.210252285003662, |
| "learning_rate": 3.670946698449912e-05, |
| "loss": 0.1187, |
| "num_input_tokens_seen": 1252072, |
| "step": 4400 |
| }, |
| { |
| "epoch": 8.218283582089553, |
| "grad_norm": 8.924422264099121, |
| "learning_rate": 3.667348956519943e-05, |
| "loss": 0.1183, |
| "num_input_tokens_seen": 1253416, |
| "step": 4405 |
| }, |
| { |
| "epoch": 8.227611940298507, |
| "grad_norm": 1.896562933921814, |
| "learning_rate": 3.6637481202661006e-05, |
| "loss": 0.0809, |
| "num_input_tokens_seen": 1254728, |
| "step": 4410 |
| }, |
| { |
| "epoch": 8.236940298507463, |
| "grad_norm": 3.677365303039551, |
| "learning_rate": 3.660144199233221e-05, |
| "loss": 0.0278, |
| "num_input_tokens_seen": 1255976, |
| "step": 4415 |
| }, |
| { |
| "epoch": 8.246268656716419, |
| "grad_norm": 4.029061794281006, |
| "learning_rate": 3.656537202974315e-05, |
| "loss": 0.0586, |
| "num_input_tokens_seen": 1257288, |
| "step": 4420 |
| }, |
| { |
| "epoch": 8.255597014925373, |
| "grad_norm": 0.9684045314788818, |
| "learning_rate": 3.652927141050548e-05, |
| "loss": 0.0351, |
| "num_input_tokens_seen": 1258664, |
| "step": 4425 |
| }, |
| { |
| "epoch": 8.264925373134329, |
| "grad_norm": 2.5019989013671875, |
| "learning_rate": 3.649314023031208e-05, |
| "loss": 0.0531, |
| "num_input_tokens_seen": 1260104, |
| "step": 4430 |
| }, |
| { |
| "epoch": 8.274253731343283, |
| "grad_norm": 7.469059467315674, |
| "learning_rate": 3.64569785849369e-05, |
| "loss": 0.0904, |
| "num_input_tokens_seen": 1261448, |
| "step": 4435 |
| }, |
| { |
| "epoch": 8.283582089552239, |
| "grad_norm": 2.52644419670105, |
| "learning_rate": 3.642078657023456e-05, |
| "loss": 0.0491, |
| "num_input_tokens_seen": 1263048, |
| "step": 4440 |
| }, |
| { |
| "epoch": 8.292910447761194, |
| "grad_norm": 1.1150394678115845, |
| "learning_rate": 3.638456428214024e-05, |
| "loss": 0.0517, |
| "num_input_tokens_seen": 1264360, |
| "step": 4445 |
| }, |
| { |
| "epoch": 8.302238805970148, |
| "grad_norm": 2.5041840076446533, |
| "learning_rate": 3.6348311816669366e-05, |
| "loss": 0.0461, |
| "num_input_tokens_seen": 1265640, |
| "step": 4450 |
| }, |
| { |
| "epoch": 8.311567164179104, |
| "grad_norm": 3.7155001163482666, |
| "learning_rate": 3.6312029269917335e-05, |
| "loss": 0.0548, |
| "num_input_tokens_seen": 1267016, |
| "step": 4455 |
| }, |
| { |
| "epoch": 8.32089552238806, |
| "grad_norm": 2.7697818279266357, |
| "learning_rate": 3.627571673805927e-05, |
| "loss": 0.0356, |
| "num_input_tokens_seen": 1268744, |
| "step": 4460 |
| }, |
| { |
| "epoch": 8.330223880597014, |
| "grad_norm": 7.993197441101074, |
| "learning_rate": 3.623937431734982e-05, |
| "loss": 0.0194, |
| "num_input_tokens_seen": 1270120, |
| "step": 4465 |
| }, |
| { |
| "epoch": 8.33955223880597, |
| "grad_norm": 2.1708481311798096, |
| "learning_rate": 3.6203002104122824e-05, |
| "loss": 0.0611, |
| "num_input_tokens_seen": 1271656, |
| "step": 4470 |
| }, |
| { |
| "epoch": 8.348880597014926, |
| "grad_norm": 1.8060230016708374, |
| "learning_rate": 3.61666001947911e-05, |
| "loss": 0.0693, |
| "num_input_tokens_seen": 1273128, |
| "step": 4475 |
| }, |
| { |
| "epoch": 8.35820895522388, |
| "grad_norm": 0.8724203109741211, |
| "learning_rate": 3.61301686858462e-05, |
| "loss": 0.1094, |
| "num_input_tokens_seen": 1274632, |
| "step": 4480 |
| }, |
| { |
| "epoch": 8.367537313432836, |
| "grad_norm": 22.017351150512695, |
| "learning_rate": 3.609370767385811e-05, |
| "loss": 0.1008, |
| "num_input_tokens_seen": 1275848, |
| "step": 4485 |
| }, |
| { |
| "epoch": 8.376865671641792, |
| "grad_norm": 3.6052303314208984, |
| "learning_rate": 3.6057217255475034e-05, |
| "loss": 0.096, |
| "num_input_tokens_seen": 1277224, |
| "step": 4490 |
| }, |
| { |
| "epoch": 8.386194029850746, |
| "grad_norm": 8.40788459777832, |
| "learning_rate": 3.6020697527423134e-05, |
| "loss": 0.131, |
| "num_input_tokens_seen": 1278504, |
| "step": 4495 |
| }, |
| { |
| "epoch": 8.395522388059701, |
| "grad_norm": 4.599893093109131, |
| "learning_rate": 3.598414858650625e-05, |
| "loss": 0.0596, |
| "num_input_tokens_seen": 1279784, |
| "step": 4500 |
| }, |
| { |
| "epoch": 8.404850746268657, |
| "grad_norm": 2.25706148147583, |
| "learning_rate": 3.594757052960566e-05, |
| "loss": 0.0629, |
| "num_input_tokens_seen": 1281256, |
| "step": 4505 |
| }, |
| { |
| "epoch": 8.414179104477611, |
| "grad_norm": 0.5025781393051147, |
| "learning_rate": 3.591096345367982e-05, |
| "loss": 0.0705, |
| "num_input_tokens_seen": 1282792, |
| "step": 4510 |
| }, |
| { |
| "epoch": 8.423507462686567, |
| "grad_norm": 0.2689116299152374, |
| "learning_rate": 3.587432745576411e-05, |
| "loss": 0.0351, |
| "num_input_tokens_seen": 1284232, |
| "step": 4515 |
| }, |
| { |
| "epoch": 8.432835820895523, |
| "grad_norm": 4.30033016204834, |
| "learning_rate": 3.583766263297058e-05, |
| "loss": 0.0526, |
| "num_input_tokens_seen": 1285448, |
| "step": 4520 |
| }, |
| { |
| "epoch": 8.442164179104477, |
| "grad_norm": 6.63388204574585, |
| "learning_rate": 3.580096908248768e-05, |
| "loss": 0.1096, |
| "num_input_tokens_seen": 1286824, |
| "step": 4525 |
| }, |
| { |
| "epoch": 8.451492537313433, |
| "grad_norm": 4.595589637756348, |
| "learning_rate": 3.576424690158e-05, |
| "loss": 0.1086, |
| "num_input_tokens_seen": 1288136, |
| "step": 4530 |
| }, |
| { |
| "epoch": 8.460820895522389, |
| "grad_norm": 0.34528595209121704, |
| "learning_rate": 3.572749618758804e-05, |
| "loss": 0.0096, |
| "num_input_tokens_seen": 1289512, |
| "step": 4535 |
| }, |
| { |
| "epoch": 8.470149253731343, |
| "grad_norm": 1.5462981462478638, |
| "learning_rate": 3.5690717037927926e-05, |
| "loss": 0.0591, |
| "num_input_tokens_seen": 1291144, |
| "step": 4540 |
| }, |
| { |
| "epoch": 8.479477611940299, |
| "grad_norm": 6.226982593536377, |
| "learning_rate": 3.565390955009113e-05, |
| "loss": 0.085, |
| "num_input_tokens_seen": 1292680, |
| "step": 4545 |
| }, |
| { |
| "epoch": 8.488805970149254, |
| "grad_norm": 3.3477790355682373, |
| "learning_rate": 3.561707382164432e-05, |
| "loss": 0.0863, |
| "num_input_tokens_seen": 1294056, |
| "step": 4550 |
| }, |
| { |
| "epoch": 8.498134328358208, |
| "grad_norm": 3.2216649055480957, |
| "learning_rate": 3.5580209950228936e-05, |
| "loss": 0.096, |
| "num_input_tokens_seen": 1295400, |
| "step": 4555 |
| }, |
| { |
| "epoch": 8.507462686567164, |
| "grad_norm": 1.6915512084960938, |
| "learning_rate": 3.554331803356107e-05, |
| "loss": 0.0849, |
| "num_input_tokens_seen": 1296872, |
| "step": 4560 |
| }, |
| { |
| "epoch": 8.51679104477612, |
| "grad_norm": 1.9123958349227905, |
| "learning_rate": 3.550639816943111e-05, |
| "loss": 0.0644, |
| "num_input_tokens_seen": 1298408, |
| "step": 4565 |
| }, |
| { |
| "epoch": 8.526119402985074, |
| "grad_norm": 0.4139266908168793, |
| "learning_rate": 3.546945045570358e-05, |
| "loss": 0.1078, |
| "num_input_tokens_seen": 1299848, |
| "step": 4570 |
| }, |
| { |
| "epoch": 8.53544776119403, |
| "grad_norm": 2.9418513774871826, |
| "learning_rate": 3.543247499031679e-05, |
| "loss": 0.0933, |
| "num_input_tokens_seen": 1301224, |
| "step": 4575 |
| }, |
| { |
| "epoch": 8.544776119402986, |
| "grad_norm": 2.208789110183716, |
| "learning_rate": 3.5395471871282604e-05, |
| "loss": 0.0663, |
| "num_input_tokens_seen": 1302856, |
| "step": 4580 |
| }, |
| { |
| "epoch": 8.55410447761194, |
| "grad_norm": 9.38367748260498, |
| "learning_rate": 3.535844119668622e-05, |
| "loss": 0.0907, |
| "num_input_tokens_seen": 1304168, |
| "step": 4585 |
| }, |
| { |
| "epoch": 8.563432835820896, |
| "grad_norm": 0.8789964914321899, |
| "learning_rate": 3.532138306468586e-05, |
| "loss": 0.0739, |
| "num_input_tokens_seen": 1305704, |
| "step": 4590 |
| }, |
| { |
| "epoch": 8.572761194029852, |
| "grad_norm": 3.687695026397705, |
| "learning_rate": 3.528429757351253e-05, |
| "loss": 0.0432, |
| "num_input_tokens_seen": 1307080, |
| "step": 4595 |
| }, |
| { |
| "epoch": 8.582089552238806, |
| "grad_norm": 4.179150104522705, |
| "learning_rate": 3.524718482146975e-05, |
| "loss": 0.0802, |
| "num_input_tokens_seen": 1308296, |
| "step": 4600 |
| }, |
| { |
| "epoch": 8.591417910447761, |
| "grad_norm": 0.3509630858898163, |
| "learning_rate": 3.521004490693331e-05, |
| "loss": 0.0635, |
| "num_input_tokens_seen": 1309832, |
| "step": 4605 |
| }, |
| { |
| "epoch": 8.600746268656717, |
| "grad_norm": 0.3683817684650421, |
| "learning_rate": 3.5172877928351024e-05, |
| "loss": 0.1923, |
| "num_input_tokens_seen": 1311368, |
| "step": 4610 |
| }, |
| { |
| "epoch": 8.610074626865671, |
| "grad_norm": 2.846247911453247, |
| "learning_rate": 3.513568398424239e-05, |
| "loss": 0.1391, |
| "num_input_tokens_seen": 1312680, |
| "step": 4615 |
| }, |
| { |
| "epoch": 8.619402985074627, |
| "grad_norm": 2.936321258544922, |
| "learning_rate": 3.509846317319841e-05, |
| "loss": 0.0304, |
| "num_input_tokens_seen": 1314152, |
| "step": 4620 |
| }, |
| { |
| "epoch": 8.628731343283581, |
| "grad_norm": 3.5020575523376465, |
| "learning_rate": 3.5061215593881345e-05, |
| "loss": 0.1018, |
| "num_input_tokens_seen": 1315496, |
| "step": 4625 |
| }, |
| { |
| "epoch": 8.638059701492537, |
| "grad_norm": 0.962485134601593, |
| "learning_rate": 3.502394134502435e-05, |
| "loss": 0.0958, |
| "num_input_tokens_seen": 1317192, |
| "step": 4630 |
| }, |
| { |
| "epoch": 8.647388059701493, |
| "grad_norm": 0.6979961395263672, |
| "learning_rate": 3.4986640525431286e-05, |
| "loss": 0.0515, |
| "num_input_tokens_seen": 1318632, |
| "step": 4635 |
| }, |
| { |
| "epoch": 8.656716417910447, |
| "grad_norm": 0.6442877650260925, |
| "learning_rate": 3.494931323397649e-05, |
| "loss": 0.078, |
| "num_input_tokens_seen": 1320232, |
| "step": 4640 |
| }, |
| { |
| "epoch": 8.666044776119403, |
| "grad_norm": 12.476752281188965, |
| "learning_rate": 3.491195956960441e-05, |
| "loss": 0.0637, |
| "num_input_tokens_seen": 1321768, |
| "step": 4645 |
| }, |
| { |
| "epoch": 8.675373134328359, |
| "grad_norm": 1.2380638122558594, |
| "learning_rate": 3.4874579631329443e-05, |
| "loss": 0.0328, |
| "num_input_tokens_seen": 1323272, |
| "step": 4650 |
| }, |
| { |
| "epoch": 8.684701492537313, |
| "grad_norm": 11.049072265625, |
| "learning_rate": 3.483717351823561e-05, |
| "loss": 0.0963, |
| "num_input_tokens_seen": 1324680, |
| "step": 4655 |
| }, |
| { |
| "epoch": 8.694029850746269, |
| "grad_norm": 6.148132801055908, |
| "learning_rate": 3.479974132947632e-05, |
| "loss": 0.0676, |
| "num_input_tokens_seen": 1325992, |
| "step": 4660 |
| }, |
| { |
| "epoch": 8.703358208955224, |
| "grad_norm": 0.5680158138275146, |
| "learning_rate": 3.47622831642741e-05, |
| "loss": 0.1154, |
| "num_input_tokens_seen": 1327400, |
| "step": 4665 |
| }, |
| { |
| "epoch": 8.712686567164178, |
| "grad_norm": 0.5640116930007935, |
| "learning_rate": 3.472479912192034e-05, |
| "loss": 0.0845, |
| "num_input_tokens_seen": 1328744, |
| "step": 4670 |
| }, |
| { |
| "epoch": 8.722014925373134, |
| "grad_norm": 1.8546143770217896, |
| "learning_rate": 3.468728930177501e-05, |
| "loss": 0.1032, |
| "num_input_tokens_seen": 1330280, |
| "step": 4675 |
| }, |
| { |
| "epoch": 8.73134328358209, |
| "grad_norm": 5.749939441680908, |
| "learning_rate": 3.464975380326643e-05, |
| "loss": 0.0902, |
| "num_input_tokens_seen": 1331784, |
| "step": 4680 |
| }, |
| { |
| "epoch": 8.740671641791044, |
| "grad_norm": 1.7578449249267578, |
| "learning_rate": 3.461219272589097e-05, |
| "loss": 0.0637, |
| "num_input_tokens_seen": 1333128, |
| "step": 4685 |
| }, |
| { |
| "epoch": 8.75, |
| "grad_norm": 1.2080515623092651, |
| "learning_rate": 3.45746061692128e-05, |
| "loss": 0.0546, |
| "num_input_tokens_seen": 1334696, |
| "step": 4690 |
| }, |
| { |
| "epoch": 8.759328358208956, |
| "grad_norm": 2.6123621463775635, |
| "learning_rate": 3.4536994232863637e-05, |
| "loss": 0.0416, |
| "num_input_tokens_seen": 1336264, |
| "step": 4695 |
| }, |
| { |
| "epoch": 8.76865671641791, |
| "grad_norm": 6.658786773681641, |
| "learning_rate": 3.4499357016542485e-05, |
| "loss": 0.1156, |
| "num_input_tokens_seen": 1337608, |
| "step": 4700 |
| }, |
| { |
| "epoch": 8.777985074626866, |
| "grad_norm": 5.080148220062256, |
| "learning_rate": 3.446169462001534e-05, |
| "loss": 0.0658, |
| "num_input_tokens_seen": 1339144, |
| "step": 4705 |
| }, |
| { |
| "epoch": 8.787313432835822, |
| "grad_norm": 2.1031360626220703, |
| "learning_rate": 3.4424007143114944e-05, |
| "loss": 0.1065, |
| "num_input_tokens_seen": 1340488, |
| "step": 4710 |
| }, |
| { |
| "epoch": 8.796641791044776, |
| "grad_norm": 2.633694887161255, |
| "learning_rate": 3.438629468574052e-05, |
| "loss": 0.1384, |
| "num_input_tokens_seen": 1341960, |
| "step": 4715 |
| }, |
| { |
| "epoch": 8.805970149253731, |
| "grad_norm": 0.4308924674987793, |
| "learning_rate": 3.4348557347857526e-05, |
| "loss": 0.1195, |
| "num_input_tokens_seen": 1343624, |
| "step": 4720 |
| }, |
| { |
| "epoch": 8.815298507462687, |
| "grad_norm": 0.669029712677002, |
| "learning_rate": 3.431079522949734e-05, |
| "loss": 0.0699, |
| "num_input_tokens_seen": 1344936, |
| "step": 4725 |
| }, |
| { |
| "epoch": 8.824626865671641, |
| "grad_norm": 4.409765720367432, |
| "learning_rate": 3.427300843075706e-05, |
| "loss": 0.1596, |
| "num_input_tokens_seen": 1346408, |
| "step": 4730 |
| }, |
| { |
| "epoch": 8.833955223880597, |
| "grad_norm": 4.513115882873535, |
| "learning_rate": 3.423519705179918e-05, |
| "loss": 0.0727, |
| "num_input_tokens_seen": 1347784, |
| "step": 4735 |
| }, |
| { |
| "epoch": 8.843283582089553, |
| "grad_norm": 3.0124709606170654, |
| "learning_rate": 3.419736119285136e-05, |
| "loss": 0.0373, |
| "num_input_tokens_seen": 1349128, |
| "step": 4740 |
| }, |
| { |
| "epoch": 8.852611940298507, |
| "grad_norm": 7.9226813316345215, |
| "learning_rate": 3.415950095420616e-05, |
| "loss": 0.1135, |
| "num_input_tokens_seen": 1350312, |
| "step": 4745 |
| }, |
| { |
| "epoch": 8.861940298507463, |
| "grad_norm": 5.010641098022461, |
| "learning_rate": 3.4121616436220734e-05, |
| "loss": 0.0481, |
| "num_input_tokens_seen": 1351784, |
| "step": 4750 |
| }, |
| { |
| "epoch": 8.871268656716419, |
| "grad_norm": 5.822708606719971, |
| "learning_rate": 3.4083707739316614e-05, |
| "loss": 0.0587, |
| "num_input_tokens_seen": 1353416, |
| "step": 4755 |
| }, |
| { |
| "epoch": 8.880597014925373, |
| "grad_norm": 1.4002076387405396, |
| "learning_rate": 3.404577496397944e-05, |
| "loss": 0.086, |
| "num_input_tokens_seen": 1354760, |
| "step": 4760 |
| }, |
| { |
| "epoch": 8.889925373134329, |
| "grad_norm": 18.01910972595215, |
| "learning_rate": 3.400781821075865e-05, |
| "loss": 0.0878, |
| "num_input_tokens_seen": 1355976, |
| "step": 4765 |
| }, |
| { |
| "epoch": 8.899253731343283, |
| "grad_norm": 2.283363103866577, |
| "learning_rate": 3.396983758026724e-05, |
| "loss": 0.0227, |
| "num_input_tokens_seen": 1357416, |
| "step": 4770 |
| }, |
| { |
| "epoch": 8.908582089552239, |
| "grad_norm": 4.490248680114746, |
| "learning_rate": 3.393183317318151e-05, |
| "loss": 0.0814, |
| "num_input_tokens_seen": 1359080, |
| "step": 4775 |
| }, |
| { |
| "epoch": 8.917910447761194, |
| "grad_norm": 0.5415623784065247, |
| "learning_rate": 3.389380509024081e-05, |
| "loss": 0.0697, |
| "num_input_tokens_seen": 1360552, |
| "step": 4780 |
| }, |
| { |
| "epoch": 8.927238805970148, |
| "grad_norm": 4.2293853759765625, |
| "learning_rate": 3.385575343224718e-05, |
| "loss": 0.0951, |
| "num_input_tokens_seen": 1361960, |
| "step": 4785 |
| }, |
| { |
| "epoch": 8.936567164179104, |
| "grad_norm": 7.095118045806885, |
| "learning_rate": 3.381767830006522e-05, |
| "loss": 0.1561, |
| "num_input_tokens_seen": 1363368, |
| "step": 4790 |
| }, |
| { |
| "epoch": 8.94589552238806, |
| "grad_norm": 3.372957468032837, |
| "learning_rate": 3.377957979462172e-05, |
| "loss": 0.0639, |
| "num_input_tokens_seen": 1364840, |
| "step": 4795 |
| }, |
| { |
| "epoch": 8.955223880597014, |
| "grad_norm": 5.323475360870361, |
| "learning_rate": 3.3741458016905436e-05, |
| "loss": 0.0479, |
| "num_input_tokens_seen": 1366152, |
| "step": 4800 |
| }, |
| { |
| "epoch": 8.96455223880597, |
| "grad_norm": 1.735262155532837, |
| "learning_rate": 3.37033130679668e-05, |
| "loss": 0.0467, |
| "num_input_tokens_seen": 1367528, |
| "step": 4805 |
| }, |
| { |
| "epoch": 8.973880597014926, |
| "grad_norm": 2.7886672019958496, |
| "learning_rate": 3.366514504891769e-05, |
| "loss": 0.0234, |
| "num_input_tokens_seen": 1368968, |
| "step": 4810 |
| }, |
| { |
| "epoch": 8.98320895522388, |
| "grad_norm": 0.41698721051216125, |
| "learning_rate": 3.36269540609311e-05, |
| "loss": 0.0837, |
| "num_input_tokens_seen": 1370248, |
| "step": 4815 |
| }, |
| { |
| "epoch": 8.992537313432836, |
| "grad_norm": 9.545626640319824, |
| "learning_rate": 3.358874020524094e-05, |
| "loss": 0.11, |
| "num_input_tokens_seen": 1371496, |
| "step": 4820 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 1.0390026569366455, |
| "eval_runtime": 2.8867, |
| "eval_samples_per_second": 82.447, |
| "eval_steps_per_second": 20.785, |
| "num_input_tokens_seen": 1372560, |
| "step": 4824 |
| }, |
| { |
| "epoch": 9.001865671641792, |
| "grad_norm": 2.0261106491088867, |
| "learning_rate": 3.355050358314172e-05, |
| "loss": 0.0429, |
| "num_input_tokens_seen": 1372784, |
| "step": 4825 |
| }, |
| { |
| "epoch": 9.011194029850746, |
| "grad_norm": 0.7471911907196045, |
| "learning_rate": 3.3512244295988306e-05, |
| "loss": 0.0437, |
| "num_input_tokens_seen": 1374160, |
| "step": 4830 |
| }, |
| { |
| "epoch": 9.020522388059701, |
| "grad_norm": 3.745771646499634, |
| "learning_rate": 3.3473962445195646e-05, |
| "loss": 0.0523, |
| "num_input_tokens_seen": 1375472, |
| "step": 4835 |
| }, |
| { |
| "epoch": 9.029850746268657, |
| "grad_norm": 1.724047064781189, |
| "learning_rate": 3.343565813223847e-05, |
| "loss": 0.0755, |
| "num_input_tokens_seen": 1376976, |
| "step": 4840 |
| }, |
| { |
| "epoch": 9.039179104477611, |
| "grad_norm": 3.4658963680267334, |
| "learning_rate": 3.339733145865109e-05, |
| "loss": 0.0196, |
| "num_input_tokens_seen": 1378256, |
| "step": 4845 |
| }, |
| { |
| "epoch": 9.048507462686567, |
| "grad_norm": 0.3133438527584076, |
| "learning_rate": 3.3358982526027065e-05, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 1379856, |
| "step": 4850 |
| }, |
| { |
| "epoch": 9.057835820895523, |
| "grad_norm": 0.4704878628253937, |
| "learning_rate": 3.3320611436018955e-05, |
| "loss": 0.0096, |
| "num_input_tokens_seen": 1381584, |
| "step": 4855 |
| }, |
| { |
| "epoch": 9.067164179104477, |
| "grad_norm": 1.7006503343582153, |
| "learning_rate": 3.328221829033807e-05, |
| "loss": 0.0915, |
| "num_input_tokens_seen": 1382896, |
| "step": 4860 |
| }, |
| { |
| "epoch": 9.076492537313433, |
| "grad_norm": 5.024104595184326, |
| "learning_rate": 3.324380319075416e-05, |
| "loss": 0.0461, |
| "num_input_tokens_seen": 1384240, |
| "step": 4865 |
| }, |
| { |
| "epoch": 9.085820895522389, |
| "grad_norm": 2.18302583694458, |
| "learning_rate": 3.32053662390952e-05, |
| "loss": 0.0457, |
| "num_input_tokens_seen": 1385808, |
| "step": 4870 |
| }, |
| { |
| "epoch": 9.095149253731343, |
| "grad_norm": 1.6838951110839844, |
| "learning_rate": 3.316690753724706e-05, |
| "loss": 0.0079, |
| "num_input_tokens_seen": 1387312, |
| "step": 4875 |
| }, |
| { |
| "epoch": 9.104477611940299, |
| "grad_norm": 3.802182197570801, |
| "learning_rate": 3.312842718715328e-05, |
| "loss": 0.0659, |
| "num_input_tokens_seen": 1388688, |
| "step": 4880 |
| }, |
| { |
| "epoch": 9.113805970149254, |
| "grad_norm": 1.426719069480896, |
| "learning_rate": 3.308992529081477e-05, |
| "loss": 0.0479, |
| "num_input_tokens_seen": 1390000, |
| "step": 4885 |
| }, |
| { |
| "epoch": 9.123134328358208, |
| "grad_norm": 0.44229546189308167, |
| "learning_rate": 3.3051401950289566e-05, |
| "loss": 0.1324, |
| "num_input_tokens_seen": 1391408, |
| "step": 4890 |
| }, |
| { |
| "epoch": 9.132462686567164, |
| "grad_norm": 3.7299137115478516, |
| "learning_rate": 3.301285726769255e-05, |
| "loss": 0.0548, |
| "num_input_tokens_seen": 1392848, |
| "step": 4895 |
| }, |
| { |
| "epoch": 9.14179104477612, |
| "grad_norm": 0.46122056245803833, |
| "learning_rate": 3.297429134519516e-05, |
| "loss": 0.0249, |
| "num_input_tokens_seen": 1394320, |
| "step": 4900 |
| }, |
| { |
| "epoch": 9.151119402985074, |
| "grad_norm": 1.4727210998535156, |
| "learning_rate": 3.293570428502515e-05, |
| "loss": 0.0272, |
| "num_input_tokens_seen": 1395504, |
| "step": 4905 |
| }, |
| { |
| "epoch": 9.16044776119403, |
| "grad_norm": 0.4588846266269684, |
| "learning_rate": 3.2897096189466284e-05, |
| "loss": 0.0743, |
| "num_input_tokens_seen": 1396880, |
| "step": 4910 |
| }, |
| { |
| "epoch": 9.169776119402986, |
| "grad_norm": 2.191148042678833, |
| "learning_rate": 3.2858467160858116e-05, |
| "loss": 0.0468, |
| "num_input_tokens_seen": 1398064, |
| "step": 4915 |
| }, |
| { |
| "epoch": 9.17910447761194, |
| "grad_norm": 3.2878174781799316, |
| "learning_rate": 3.281981730159567e-05, |
| "loss": 0.0565, |
| "num_input_tokens_seen": 1399504, |
| "step": 4920 |
| }, |
| { |
| "epoch": 9.188432835820896, |
| "grad_norm": 1.8859481811523438, |
| "learning_rate": 3.278114671412917e-05, |
| "loss": 0.0624, |
| "num_input_tokens_seen": 1400848, |
| "step": 4925 |
| }, |
| { |
| "epoch": 9.197761194029852, |
| "grad_norm": 1.705236792564392, |
| "learning_rate": 3.274245550096382e-05, |
| "loss": 0.0367, |
| "num_input_tokens_seen": 1402128, |
| "step": 4930 |
| }, |
| { |
| "epoch": 9.207089552238806, |
| "grad_norm": 0.7123275995254517, |
| "learning_rate": 3.2703743764659475e-05, |
| "loss": 0.0298, |
| "num_input_tokens_seen": 1403632, |
| "step": 4935 |
| }, |
| { |
| "epoch": 9.216417910447761, |
| "grad_norm": 0.6261950731277466, |
| "learning_rate": 3.266501160783039e-05, |
| "loss": 0.0411, |
| "num_input_tokens_seen": 1405072, |
| "step": 4940 |
| }, |
| { |
| "epoch": 9.225746268656717, |
| "grad_norm": 8.514904022216797, |
| "learning_rate": 3.262625913314496e-05, |
| "loss": 0.0517, |
| "num_input_tokens_seen": 1406480, |
| "step": 4945 |
| }, |
| { |
| "epoch": 9.235074626865671, |
| "grad_norm": 1.461437702178955, |
| "learning_rate": 3.2587486443325424e-05, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 1407792, |
| "step": 4950 |
| }, |
| { |
| "epoch": 9.244402985074627, |
| "grad_norm": 0.9036996960639954, |
| "learning_rate": 3.254869364114763e-05, |
| "loss": 0.0584, |
| "num_input_tokens_seen": 1409072, |
| "step": 4955 |
| }, |
| { |
| "epoch": 9.253731343283581, |
| "grad_norm": 8.54856014251709, |
| "learning_rate": 3.25098808294407e-05, |
| "loss": 0.0535, |
| "num_input_tokens_seen": 1410576, |
| "step": 4960 |
| }, |
| { |
| "epoch": 9.263059701492537, |
| "grad_norm": 2.066290855407715, |
| "learning_rate": 3.2471048111086825e-05, |
| "loss": 0.074, |
| "num_input_tokens_seen": 1411856, |
| "step": 4965 |
| }, |
| { |
| "epoch": 9.272388059701493, |
| "grad_norm": 3.616798162460327, |
| "learning_rate": 3.2432195589020976e-05, |
| "loss": 0.0799, |
| "num_input_tokens_seen": 1413200, |
| "step": 4970 |
| }, |
| { |
| "epoch": 9.281716417910447, |
| "grad_norm": 0.3872012495994568, |
| "learning_rate": 3.2393323366230575e-05, |
| "loss": 0.0282, |
| "num_input_tokens_seen": 1414576, |
| "step": 4975 |
| }, |
| { |
| "epoch": 9.291044776119403, |
| "grad_norm": 0.9040723443031311, |
| "learning_rate": 3.2354431545755296e-05, |
| "loss": 0.01, |
| "num_input_tokens_seen": 1415952, |
| "step": 4980 |
| }, |
| { |
| "epoch": 9.300373134328359, |
| "grad_norm": 1.6493005752563477, |
| "learning_rate": 3.231552023068675e-05, |
| "loss": 0.0576, |
| "num_input_tokens_seen": 1417136, |
| "step": 4985 |
| }, |
| { |
| "epoch": 9.309701492537313, |
| "grad_norm": 1.7681238651275635, |
| "learning_rate": 3.227658952416822e-05, |
| "loss": 0.025, |
| "num_input_tokens_seen": 1418736, |
| "step": 4990 |
| }, |
| { |
| "epoch": 9.319029850746269, |
| "grad_norm": 1.3551405668258667, |
| "learning_rate": 3.223763952939442e-05, |
| "loss": 0.0629, |
| "num_input_tokens_seen": 1420144, |
| "step": 4995 |
| }, |
| { |
| "epoch": 9.328358208955224, |
| "grad_norm": 1.195322036743164, |
| "learning_rate": 3.219867034961114e-05, |
| "loss": 0.0861, |
| "num_input_tokens_seen": 1421552, |
| "step": 5000 |
| }, |
| { |
| "epoch": 9.337686567164178, |
| "grad_norm": 20.339828491210938, |
| "learning_rate": 3.215968208811508e-05, |
| "loss": 0.0491, |
| "num_input_tokens_seen": 1423024, |
| "step": 5005 |
| }, |
| { |
| "epoch": 9.347014925373134, |
| "grad_norm": 2.1667182445526123, |
| "learning_rate": 3.2120674848253475e-05, |
| "loss": 0.0341, |
| "num_input_tokens_seen": 1424208, |
| "step": 5010 |
| }, |
| { |
| "epoch": 9.35634328358209, |
| "grad_norm": 4.788270950317383, |
| "learning_rate": 3.2081648733423893e-05, |
| "loss": 0.0366, |
| "num_input_tokens_seen": 1425648, |
| "step": 5015 |
| }, |
| { |
| "epoch": 9.365671641791044, |
| "grad_norm": 2.3559982776641846, |
| "learning_rate": 3.204260384707393e-05, |
| "loss": 0.0447, |
| "num_input_tokens_seen": 1426896, |
| "step": 5020 |
| }, |
| { |
| "epoch": 9.375, |
| "grad_norm": 1.1366091966629028, |
| "learning_rate": 3.200354029270091e-05, |
| "loss": 0.0228, |
| "num_input_tokens_seen": 1428336, |
| "step": 5025 |
| }, |
| { |
| "epoch": 9.384328358208956, |
| "grad_norm": 1.1767058372497559, |
| "learning_rate": 3.196445817385171e-05, |
| "loss": 0.0552, |
| "num_input_tokens_seen": 1429712, |
| "step": 5030 |
| }, |
| { |
| "epoch": 9.39365671641791, |
| "grad_norm": 0.8556516766548157, |
| "learning_rate": 3.192535759412233e-05, |
| "loss": 0.0988, |
| "num_input_tokens_seen": 1430928, |
| "step": 5035 |
| }, |
| { |
| "epoch": 9.402985074626866, |
| "grad_norm": 3.0809617042541504, |
| "learning_rate": 3.188623865715778e-05, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 1432272, |
| "step": 5040 |
| }, |
| { |
| "epoch": 9.412313432835822, |
| "grad_norm": 4.3652215003967285, |
| "learning_rate": 3.184710146665169e-05, |
| "loss": 0.1227, |
| "num_input_tokens_seen": 1433616, |
| "step": 5045 |
| }, |
| { |
| "epoch": 9.421641791044776, |
| "grad_norm": 0.824479341506958, |
| "learning_rate": 3.180794612634608e-05, |
| "loss": 0.0444, |
| "num_input_tokens_seen": 1435216, |
| "step": 5050 |
| }, |
| { |
| "epoch": 9.430970149253731, |
| "grad_norm": 0.4703706204891205, |
| "learning_rate": 3.176877274003108e-05, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 1436432, |
| "step": 5055 |
| }, |
| { |
| "epoch": 9.440298507462687, |
| "grad_norm": 1.6509876251220703, |
| "learning_rate": 3.172958141154466e-05, |
| "loss": 0.0671, |
| "num_input_tokens_seen": 1437808, |
| "step": 5060 |
| }, |
| { |
| "epoch": 9.449626865671641, |
| "grad_norm": 0.35234540700912476, |
| "learning_rate": 3.1690372244772356e-05, |
| "loss": 0.0666, |
| "num_input_tokens_seen": 1439152, |
| "step": 5065 |
| }, |
| { |
| "epoch": 9.458955223880597, |
| "grad_norm": 0.18852519989013672, |
| "learning_rate": 3.165114534364698e-05, |
| "loss": 0.0215, |
| "num_input_tokens_seen": 1440752, |
| "step": 5070 |
| }, |
| { |
| "epoch": 9.468283582089553, |
| "grad_norm": 8.738310813903809, |
| "learning_rate": 3.161190081214835e-05, |
| "loss": 0.0434, |
| "num_input_tokens_seen": 1442128, |
| "step": 5075 |
| }, |
| { |
| "epoch": 9.477611940298507, |
| "grad_norm": 3.2702159881591797, |
| "learning_rate": 3.157263875430302e-05, |
| "loss": 0.0912, |
| "num_input_tokens_seen": 1443728, |
| "step": 5080 |
| }, |
| { |
| "epoch": 9.486940298507463, |
| "grad_norm": 1.0829740762710571, |
| "learning_rate": 3.1533359274184e-05, |
| "loss": 0.0176, |
| "num_input_tokens_seen": 1445136, |
| "step": 5085 |
| }, |
| { |
| "epoch": 9.496268656716419, |
| "grad_norm": 1.3217264413833618, |
| "learning_rate": 3.14940624759105e-05, |
| "loss": 0.0671, |
| "num_input_tokens_seen": 1446736, |
| "step": 5090 |
| }, |
| { |
| "epoch": 9.505597014925373, |
| "grad_norm": 4.323888778686523, |
| "learning_rate": 3.145474846364761e-05, |
| "loss": 0.0423, |
| "num_input_tokens_seen": 1448112, |
| "step": 5095 |
| }, |
| { |
| "epoch": 9.514925373134329, |
| "grad_norm": 4.108525276184082, |
| "learning_rate": 3.1415417341606054e-05, |
| "loss": 0.0449, |
| "num_input_tokens_seen": 1449744, |
| "step": 5100 |
| }, |
| { |
| "epoch": 9.524253731343283, |
| "grad_norm": 2.8086485862731934, |
| "learning_rate": 3.1376069214041913e-05, |
| "loss": 0.1606, |
| "num_input_tokens_seen": 1451024, |
| "step": 5105 |
| }, |
| { |
| "epoch": 9.533582089552239, |
| "grad_norm": 1.6963788270950317, |
| "learning_rate": 3.1336704185256363e-05, |
| "loss": 0.0735, |
| "num_input_tokens_seen": 1452304, |
| "step": 5110 |
| }, |
| { |
| "epoch": 9.542910447761194, |
| "grad_norm": 1.075136423110962, |
| "learning_rate": 3.129732235959535e-05, |
| "loss": 0.0282, |
| "num_input_tokens_seen": 1453968, |
| "step": 5115 |
| }, |
| { |
| "epoch": 9.552238805970148, |
| "grad_norm": 0.7162839770317078, |
| "learning_rate": 3.1257923841449374e-05, |
| "loss": 0.0319, |
| "num_input_tokens_seen": 1455376, |
| "step": 5120 |
| }, |
| { |
| "epoch": 9.561567164179104, |
| "grad_norm": 0.8711545467376709, |
| "learning_rate": 3.121850873525315e-05, |
| "loss": 0.0359, |
| "num_input_tokens_seen": 1456816, |
| "step": 5125 |
| }, |
| { |
| "epoch": 9.57089552238806, |
| "grad_norm": 0.5436994433403015, |
| "learning_rate": 3.1179077145485395e-05, |
| "loss": 0.0449, |
| "num_input_tokens_seen": 1458320, |
| "step": 5130 |
| }, |
| { |
| "epoch": 9.580223880597014, |
| "grad_norm": 1.2748678922653198, |
| "learning_rate": 3.1139629176668496e-05, |
| "loss": 0.0424, |
| "num_input_tokens_seen": 1459632, |
| "step": 5135 |
| }, |
| { |
| "epoch": 9.58955223880597, |
| "grad_norm": 3.645084857940674, |
| "learning_rate": 3.1100164933368263e-05, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 1461008, |
| "step": 5140 |
| }, |
| { |
| "epoch": 9.598880597014926, |
| "grad_norm": 5.222319602966309, |
| "learning_rate": 3.106068452019365e-05, |
| "loss": 0.0778, |
| "num_input_tokens_seen": 1462224, |
| "step": 5145 |
| }, |
| { |
| "epoch": 9.60820895522388, |
| "grad_norm": 2.273914098739624, |
| "learning_rate": 3.1021188041796476e-05, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 1463664, |
| "step": 5150 |
| }, |
| { |
| "epoch": 9.617537313432836, |
| "grad_norm": 0.5332937240600586, |
| "learning_rate": 3.098167560287113e-05, |
| "loss": 0.0109, |
| "num_input_tokens_seen": 1465168, |
| "step": 5155 |
| }, |
| { |
| "epoch": 9.626865671641792, |
| "grad_norm": 6.937219619750977, |
| "learning_rate": 3.094214730815433e-05, |
| "loss": 0.0537, |
| "num_input_tokens_seen": 1466544, |
| "step": 5160 |
| }, |
| { |
| "epoch": 9.636194029850746, |
| "grad_norm": 1.4001652002334595, |
| "learning_rate": 3.09026032624248e-05, |
| "loss": 0.1148, |
| "num_input_tokens_seen": 1468176, |
| "step": 5165 |
| }, |
| { |
| "epoch": 9.645522388059701, |
| "grad_norm": 6.2254815101623535, |
| "learning_rate": 3.086304357050302e-05, |
| "loss": 0.0871, |
| "num_input_tokens_seen": 1469680, |
| "step": 5170 |
| }, |
| { |
| "epoch": 9.654850746268657, |
| "grad_norm": 6.455643177032471, |
| "learning_rate": 3.082346833725095e-05, |
| "loss": 0.0803, |
| "num_input_tokens_seen": 1471024, |
| "step": 5175 |
| }, |
| { |
| "epoch": 9.664179104477611, |
| "grad_norm": 3.4399030208587646, |
| "learning_rate": 3.078387766757177e-05, |
| "loss": 0.0567, |
| "num_input_tokens_seen": 1472464, |
| "step": 5180 |
| }, |
| { |
| "epoch": 9.673507462686567, |
| "grad_norm": 0.42754167318344116, |
| "learning_rate": 3.0744271666409524e-05, |
| "loss": 0.0387, |
| "num_input_tokens_seen": 1473904, |
| "step": 5185 |
| }, |
| { |
| "epoch": 9.682835820895523, |
| "grad_norm": 1.32429838180542, |
| "learning_rate": 3.0704650438748946e-05, |
| "loss": 0.0603, |
| "num_input_tokens_seen": 1475408, |
| "step": 5190 |
| }, |
| { |
| "epoch": 9.692164179104477, |
| "grad_norm": 0.4274611473083496, |
| "learning_rate": 3.066501408961509e-05, |
| "loss": 0.0398, |
| "num_input_tokens_seen": 1476784, |
| "step": 5195 |
| }, |
| { |
| "epoch": 9.701492537313433, |
| "grad_norm": 1.966681957244873, |
| "learning_rate": 3.062536272407313e-05, |
| "loss": 0.0672, |
| "num_input_tokens_seen": 1478160, |
| "step": 5200 |
| }, |
| { |
| "epoch": 9.710820895522389, |
| "grad_norm": 4.4761762619018555, |
| "learning_rate": 3.0585696447228006e-05, |
| "loss": 0.057, |
| "num_input_tokens_seen": 1479600, |
| "step": 5205 |
| }, |
| { |
| "epoch": 9.720149253731343, |
| "grad_norm": 5.236489295959473, |
| "learning_rate": 3.054601536422423e-05, |
| "loss": 0.0211, |
| "num_input_tokens_seen": 1480944, |
| "step": 5210 |
| }, |
| { |
| "epoch": 9.729477611940299, |
| "grad_norm": 3.1009409427642822, |
| "learning_rate": 3.0506319580245536e-05, |
| "loss": 0.0567, |
| "num_input_tokens_seen": 1482448, |
| "step": 5215 |
| }, |
| { |
| "epoch": 9.738805970149254, |
| "grad_norm": 0.8744551539421082, |
| "learning_rate": 3.0466609200514605e-05, |
| "loss": 0.1318, |
| "num_input_tokens_seen": 1483920, |
| "step": 5220 |
| }, |
| { |
| "epoch": 9.748134328358208, |
| "grad_norm": 0.3664301633834839, |
| "learning_rate": 3.0426884330292842e-05, |
| "loss": 0.0493, |
| "num_input_tokens_seen": 1485232, |
| "step": 5225 |
| }, |
| { |
| "epoch": 9.757462686567164, |
| "grad_norm": 1.084946632385254, |
| "learning_rate": 3.0387145074880053e-05, |
| "loss": 0.0481, |
| "num_input_tokens_seen": 1486640, |
| "step": 5230 |
| }, |
| { |
| "epoch": 9.76679104477612, |
| "grad_norm": 2.7911949157714844, |
| "learning_rate": 3.0347391539614156e-05, |
| "loss": 0.0341, |
| "num_input_tokens_seen": 1488080, |
| "step": 5235 |
| }, |
| { |
| "epoch": 9.776119402985074, |
| "grad_norm": 1.6702381372451782, |
| "learning_rate": 3.0307623829870952e-05, |
| "loss": 0.0308, |
| "num_input_tokens_seen": 1489456, |
| "step": 5240 |
| }, |
| { |
| "epoch": 9.78544776119403, |
| "grad_norm": 2.5625648498535156, |
| "learning_rate": 3.0267842051063795e-05, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 1491024, |
| "step": 5245 |
| }, |
| { |
| "epoch": 9.794776119402986, |
| "grad_norm": 1.3376870155334473, |
| "learning_rate": 3.022804630864333e-05, |
| "loss": 0.0575, |
| "num_input_tokens_seen": 1492432, |
| "step": 5250 |
| }, |
| { |
| "epoch": 9.80410447761194, |
| "grad_norm": 0.9751670956611633, |
| "learning_rate": 3.018823670809724e-05, |
| "loss": 0.0526, |
| "num_input_tokens_seen": 1494064, |
| "step": 5255 |
| }, |
| { |
| "epoch": 9.813432835820896, |
| "grad_norm": 5.751134395599365, |
| "learning_rate": 3.0148413354949902e-05, |
| "loss": 0.0915, |
| "num_input_tokens_seen": 1495536, |
| "step": 5260 |
| }, |
| { |
| "epoch": 9.822761194029852, |
| "grad_norm": 1.9670438766479492, |
| "learning_rate": 3.0108576354762175e-05, |
| "loss": 0.0462, |
| "num_input_tokens_seen": 1496848, |
| "step": 5265 |
| }, |
| { |
| "epoch": 9.832089552238806, |
| "grad_norm": 0.9635236263275146, |
| "learning_rate": 3.0068725813131098e-05, |
| "loss": 0.0404, |
| "num_input_tokens_seen": 1498352, |
| "step": 5270 |
| }, |
| { |
| "epoch": 9.841417910447761, |
| "grad_norm": 1.1133617162704468, |
| "learning_rate": 3.0028861835689587e-05, |
| "loss": 0.0344, |
| "num_input_tokens_seen": 1499824, |
| "step": 5275 |
| }, |
| { |
| "epoch": 9.850746268656717, |
| "grad_norm": 8.596325874328613, |
| "learning_rate": 2.9988984528106177e-05, |
| "loss": 0.1146, |
| "num_input_tokens_seen": 1501328, |
| "step": 5280 |
| }, |
| { |
| "epoch": 9.860074626865671, |
| "grad_norm": 5.395959377288818, |
| "learning_rate": 2.9949093996084747e-05, |
| "loss": 0.0728, |
| "num_input_tokens_seen": 1502640, |
| "step": 5285 |
| }, |
| { |
| "epoch": 9.869402985074627, |
| "grad_norm": 2.9841554164886475, |
| "learning_rate": 2.9909190345364217e-05, |
| "loss": 0.0689, |
| "num_input_tokens_seen": 1504144, |
| "step": 5290 |
| }, |
| { |
| "epoch": 9.878731343283581, |
| "grad_norm": 3.151871681213379, |
| "learning_rate": 2.986927368171829e-05, |
| "loss": 0.0315, |
| "num_input_tokens_seen": 1505584, |
| "step": 5295 |
| }, |
| { |
| "epoch": 9.888059701492537, |
| "grad_norm": 0.5696917176246643, |
| "learning_rate": 2.982934411095516e-05, |
| "loss": 0.0397, |
| "num_input_tokens_seen": 1507184, |
| "step": 5300 |
| }, |
| { |
| "epoch": 9.897388059701493, |
| "grad_norm": 0.5546368956565857, |
| "learning_rate": 2.9789401738917244e-05, |
| "loss": 0.1047, |
| "num_input_tokens_seen": 1508656, |
| "step": 5305 |
| }, |
| { |
| "epoch": 9.906716417910447, |
| "grad_norm": 2.911978244781494, |
| "learning_rate": 2.9749446671480862e-05, |
| "loss": 0.0793, |
| "num_input_tokens_seen": 1510192, |
| "step": 5310 |
| }, |
| { |
| "epoch": 9.916044776119403, |
| "grad_norm": 5.4225616455078125, |
| "learning_rate": 2.9709479014556024e-05, |
| "loss": 0.059, |
| "num_input_tokens_seen": 1511632, |
| "step": 5315 |
| }, |
| { |
| "epoch": 9.925373134328359, |
| "grad_norm": 2.0258290767669678, |
| "learning_rate": 2.966949887408608e-05, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 1513008, |
| "step": 5320 |
| }, |
| { |
| "epoch": 9.934701492537313, |
| "grad_norm": 3.264925003051758, |
| "learning_rate": 2.96295063560475e-05, |
| "loss": 0.0479, |
| "num_input_tokens_seen": 1514416, |
| "step": 5325 |
| }, |
| { |
| "epoch": 9.944029850746269, |
| "grad_norm": 1.7954421043395996, |
| "learning_rate": 2.9589501566449534e-05, |
| "loss": 0.0781, |
| "num_input_tokens_seen": 1515856, |
| "step": 5330 |
| }, |
| { |
| "epoch": 9.953358208955224, |
| "grad_norm": 3.470778226852417, |
| "learning_rate": 2.9549484611333983e-05, |
| "loss": 0.0601, |
| "num_input_tokens_seen": 1517328, |
| "step": 5335 |
| }, |
| { |
| "epoch": 9.962686567164178, |
| "grad_norm": 0.1933203786611557, |
| "learning_rate": 2.950945559677488e-05, |
| "loss": 0.0348, |
| "num_input_tokens_seen": 1518800, |
| "step": 5340 |
| }, |
| { |
| "epoch": 9.972014925373134, |
| "grad_norm": 2.8057632446289062, |
| "learning_rate": 2.946941462887824e-05, |
| "loss": 0.0571, |
| "num_input_tokens_seen": 1520208, |
| "step": 5345 |
| }, |
| { |
| "epoch": 9.98134328358209, |
| "grad_norm": 0.44213059544563293, |
| "learning_rate": 2.942936181378174e-05, |
| "loss": 0.0617, |
| "num_input_tokens_seen": 1521616, |
| "step": 5350 |
| }, |
| { |
| "epoch": 9.990671641791044, |
| "grad_norm": 6.331246852874756, |
| "learning_rate": 2.9389297257654482e-05, |
| "loss": 0.0729, |
| "num_input_tokens_seen": 1522992, |
| "step": 5355 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 39.72452926635742, |
| "learning_rate": 2.9349221066696693e-05, |
| "loss": 0.0316, |
| "num_input_tokens_seen": 1524216, |
| "step": 5360 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 1.0801044702529907, |
| "eval_runtime": 2.9086, |
| "eval_samples_per_second": 81.827, |
| "eval_steps_per_second": 20.629, |
| "num_input_tokens_seen": 1524216, |
| "step": 5360 |
| }, |
| { |
| "epoch": 10.009328358208956, |
| "grad_norm": 2.584059476852417, |
| "learning_rate": 2.9309133347139412e-05, |
| "loss": 0.0608, |
| "num_input_tokens_seen": 1525560, |
| "step": 5365 |
| }, |
| { |
| "epoch": 10.01865671641791, |
| "grad_norm": 0.5620616674423218, |
| "learning_rate": 2.9269034205244272e-05, |
| "loss": 0.0067, |
| "num_input_tokens_seen": 1526872, |
| "step": 5370 |
| }, |
| { |
| "epoch": 10.027985074626866, |
| "grad_norm": 0.24928458034992218, |
| "learning_rate": 2.922892374730316e-05, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 1528440, |
| "step": 5375 |
| }, |
| { |
| "epoch": 10.037313432835822, |
| "grad_norm": 0.2981283664703369, |
| "learning_rate": 2.9188802079637966e-05, |
| "loss": 0.0189, |
| "num_input_tokens_seen": 1529912, |
| "step": 5380 |
| }, |
| { |
| "epoch": 10.046641791044776, |
| "grad_norm": 0.32724976539611816, |
| "learning_rate": 2.9148669308600296e-05, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 1531256, |
| "step": 5385 |
| }, |
| { |
| "epoch": 10.055970149253731, |
| "grad_norm": 6.156827926635742, |
| "learning_rate": 2.910852554057118e-05, |
| "loss": 0.0506, |
| "num_input_tokens_seen": 1532696, |
| "step": 5390 |
| }, |
| { |
| "epoch": 10.065298507462687, |
| "grad_norm": 2.0076308250427246, |
| "learning_rate": 2.9068370881960817e-05, |
| "loss": 0.0786, |
| "num_input_tokens_seen": 1534072, |
| "step": 5395 |
| }, |
| { |
| "epoch": 10.074626865671641, |
| "grad_norm": 1.6561648845672607, |
| "learning_rate": 2.902820543920825e-05, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 1535480, |
| "step": 5400 |
| }, |
| { |
| "epoch": 10.083955223880597, |
| "grad_norm": 1.1208945512771606, |
| "learning_rate": 2.8988029318781124e-05, |
| "loss": 0.035, |
| "num_input_tokens_seen": 1537016, |
| "step": 5405 |
| }, |
| { |
| "epoch": 10.093283582089553, |
| "grad_norm": 3.6789183616638184, |
| "learning_rate": 2.894784262717538e-05, |
| "loss": 0.0876, |
| "num_input_tokens_seen": 1538488, |
| "step": 5410 |
| }, |
| { |
| "epoch": 10.102611940298507, |
| "grad_norm": 1.9673398733139038, |
| "learning_rate": 2.8907645470914978e-05, |
| "loss": 0.068, |
| "num_input_tokens_seen": 1539864, |
| "step": 5415 |
| }, |
| { |
| "epoch": 10.111940298507463, |
| "grad_norm": 0.3734869062900543, |
| "learning_rate": 2.886743795655164e-05, |
| "loss": 0.0447, |
| "num_input_tokens_seen": 1541208, |
| "step": 5420 |
| }, |
| { |
| "epoch": 10.121268656716419, |
| "grad_norm": 0.11753521114587784, |
| "learning_rate": 2.8827220190664506e-05, |
| "loss": 0.0037, |
| "num_input_tokens_seen": 1542584, |
| "step": 5425 |
| }, |
| { |
| "epoch": 10.130597014925373, |
| "grad_norm": 0.471003919839859, |
| "learning_rate": 2.8786992279859922e-05, |
| "loss": 0.024, |
| "num_input_tokens_seen": 1544024, |
| "step": 5430 |
| }, |
| { |
| "epoch": 10.139925373134329, |
| "grad_norm": 7.359152317047119, |
| "learning_rate": 2.8746754330771104e-05, |
| "loss": 0.0321, |
| "num_input_tokens_seen": 1545176, |
| "step": 5435 |
| }, |
| { |
| "epoch": 10.149253731343283, |
| "grad_norm": 0.15038806200027466, |
| "learning_rate": 2.8706506450057902e-05, |
| "loss": 0.0346, |
| "num_input_tokens_seen": 1546520, |
| "step": 5440 |
| }, |
| { |
| "epoch": 10.158582089552239, |
| "grad_norm": 0.13734470307826996, |
| "learning_rate": 2.8666248744406454e-05, |
| "loss": 0.0111, |
| "num_input_tokens_seen": 1547928, |
| "step": 5445 |
| }, |
| { |
| "epoch": 10.167910447761194, |
| "grad_norm": 0.9978963732719421, |
| "learning_rate": 2.862598132052898e-05, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 1549528, |
| "step": 5450 |
| }, |
| { |
| "epoch": 10.177238805970148, |
| "grad_norm": 3.8069307804107666, |
| "learning_rate": 2.8585704285163454e-05, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 1550840, |
| "step": 5455 |
| }, |
| { |
| "epoch": 10.186567164179104, |
| "grad_norm": 0.20897459983825684, |
| "learning_rate": 2.8545417745073294e-05, |
| "loss": 0.0384, |
| "num_input_tokens_seen": 1552184, |
| "step": 5460 |
| }, |
| { |
| "epoch": 10.19589552238806, |
| "grad_norm": 0.43274596333503723, |
| "learning_rate": 2.850512180704715e-05, |
| "loss": 0.0461, |
| "num_input_tokens_seen": 1553528, |
| "step": 5465 |
| }, |
| { |
| "epoch": 10.205223880597014, |
| "grad_norm": 6.514866352081299, |
| "learning_rate": 2.846481657789856e-05, |
| "loss": 0.1777, |
| "num_input_tokens_seen": 1554776, |
| "step": 5470 |
| }, |
| { |
| "epoch": 10.21455223880597, |
| "grad_norm": 1.5861339569091797, |
| "learning_rate": 2.8424502164465705e-05, |
| "loss": 0.0452, |
| "num_input_tokens_seen": 1556120, |
| "step": 5475 |
| }, |
| { |
| "epoch": 10.223880597014926, |
| "grad_norm": 0.17910319566726685, |
| "learning_rate": 2.838417867361111e-05, |
| "loss": 0.0057, |
| "num_input_tokens_seen": 1557624, |
| "step": 5480 |
| }, |
| { |
| "epoch": 10.23320895522388, |
| "grad_norm": 1.9948867559432983, |
| "learning_rate": 2.8343846212221354e-05, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 1559000, |
| "step": 5485 |
| }, |
| { |
| "epoch": 10.242537313432836, |
| "grad_norm": 0.32342636585235596, |
| "learning_rate": 2.8303504887206794e-05, |
| "loss": 0.0595, |
| "num_input_tokens_seen": 1560312, |
| "step": 5490 |
| }, |
| { |
| "epoch": 10.251865671641792, |
| "grad_norm": 0.17355474829673767, |
| "learning_rate": 2.8263154805501297e-05, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 1561880, |
| "step": 5495 |
| }, |
| { |
| "epoch": 10.261194029850746, |
| "grad_norm": 0.3676762878894806, |
| "learning_rate": 2.8222796074061907e-05, |
| "loss": 0.0039, |
| "num_input_tokens_seen": 1563384, |
| "step": 5500 |
| }, |
| { |
| "epoch": 10.270522388059701, |
| "grad_norm": 12.582818031311035, |
| "learning_rate": 2.8182428799868645e-05, |
| "loss": 0.03, |
| "num_input_tokens_seen": 1564824, |
| "step": 5505 |
| }, |
| { |
| "epoch": 10.279850746268657, |
| "grad_norm": 1.4991917610168457, |
| "learning_rate": 2.8142053089924142e-05, |
| "loss": 0.0866, |
| "num_input_tokens_seen": 1566168, |
| "step": 5510 |
| }, |
| { |
| "epoch": 10.289179104477611, |
| "grad_norm": 2.561047077178955, |
| "learning_rate": 2.8101669051253392e-05, |
| "loss": 0.0678, |
| "num_input_tokens_seen": 1567576, |
| "step": 5515 |
| }, |
| { |
| "epoch": 10.298507462686567, |
| "grad_norm": 0.6250370144844055, |
| "learning_rate": 2.806127679090349e-05, |
| "loss": 0.0265, |
| "num_input_tokens_seen": 1569112, |
| "step": 5520 |
| }, |
| { |
| "epoch": 10.307835820895523, |
| "grad_norm": 1.1823796033859253, |
| "learning_rate": 2.8020876415943287e-05, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 1570616, |
| "step": 5525 |
| }, |
| { |
| "epoch": 10.317164179104477, |
| "grad_norm": 9.904471397399902, |
| "learning_rate": 2.7980468033463175e-05, |
| "loss": 0.0603, |
| "num_input_tokens_seen": 1571928, |
| "step": 5530 |
| }, |
| { |
| "epoch": 10.326492537313433, |
| "grad_norm": 2.228588342666626, |
| "learning_rate": 2.7940051750574763e-05, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 1573496, |
| "step": 5535 |
| }, |
| { |
| "epoch": 10.335820895522389, |
| "grad_norm": 0.10949017852544785, |
| "learning_rate": 2.7899627674410587e-05, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 1575128, |
| "step": 5540 |
| }, |
| { |
| "epoch": 10.345149253731343, |
| "grad_norm": 1.7777019739151, |
| "learning_rate": 2.7859195912123874e-05, |
| "loss": 0.0978, |
| "num_input_tokens_seen": 1576568, |
| "step": 5545 |
| }, |
| { |
| "epoch": 10.354477611940299, |
| "grad_norm": 0.2502020299434662, |
| "learning_rate": 2.7818756570888194e-05, |
| "loss": 0.0076, |
| "num_input_tokens_seen": 1578008, |
| "step": 5550 |
| }, |
| { |
| "epoch": 10.363805970149254, |
| "grad_norm": 5.7101898193359375, |
| "learning_rate": 2.7778309757897213e-05, |
| "loss": 0.0398, |
| "num_input_tokens_seen": 1579576, |
| "step": 5555 |
| }, |
| { |
| "epoch": 10.373134328358208, |
| "grad_norm": 1.5410045385360718, |
| "learning_rate": 2.77378555803644e-05, |
| "loss": 0.0357, |
| "num_input_tokens_seen": 1581016, |
| "step": 5560 |
| }, |
| { |
| "epoch": 10.382462686567164, |
| "grad_norm": 5.816549777984619, |
| "learning_rate": 2.7697394145522775e-05, |
| "loss": 0.0462, |
| "num_input_tokens_seen": 1582360, |
| "step": 5565 |
| }, |
| { |
| "epoch": 10.39179104477612, |
| "grad_norm": 3.148904800415039, |
| "learning_rate": 2.765692556062456e-05, |
| "loss": 0.0579, |
| "num_input_tokens_seen": 1583704, |
| "step": 5570 |
| }, |
| { |
| "epoch": 10.401119402985074, |
| "grad_norm": 0.7911895513534546, |
| "learning_rate": 2.7616449932940942e-05, |
| "loss": 0.0078, |
| "num_input_tokens_seen": 1585080, |
| "step": 5575 |
| }, |
| { |
| "epoch": 10.41044776119403, |
| "grad_norm": 2.137512445449829, |
| "learning_rate": 2.7575967369761775e-05, |
| "loss": 0.0287, |
| "num_input_tokens_seen": 1586520, |
| "step": 5580 |
| }, |
| { |
| "epoch": 10.419776119402986, |
| "grad_norm": 1.473531723022461, |
| "learning_rate": 2.7535477978395297e-05, |
| "loss": 0.0344, |
| "num_input_tokens_seen": 1588088, |
| "step": 5585 |
| }, |
| { |
| "epoch": 10.42910447761194, |
| "grad_norm": 0.8847230672836304, |
| "learning_rate": 2.749498186616785e-05, |
| "loss": 0.0226, |
| "num_input_tokens_seen": 1589688, |
| "step": 5590 |
| }, |
| { |
| "epoch": 10.438432835820896, |
| "grad_norm": 1.4490429162979126, |
| "learning_rate": 2.745447914042359e-05, |
| "loss": 0.0069, |
| "num_input_tokens_seen": 1591416, |
| "step": 5595 |
| }, |
| { |
| "epoch": 10.447761194029852, |
| "grad_norm": 2.4925129413604736, |
| "learning_rate": 2.7413969908524206e-05, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 1592952, |
| "step": 5600 |
| }, |
| { |
| "epoch": 10.457089552238806, |
| "grad_norm": 2.0698132514953613, |
| "learning_rate": 2.7373454277848622e-05, |
| "loss": 0.0361, |
| "num_input_tokens_seen": 1594424, |
| "step": 5605 |
| }, |
| { |
| "epoch": 10.466417910447761, |
| "grad_norm": 2.106790542602539, |
| "learning_rate": 2.733293235579274e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 1595672, |
| "step": 5610 |
| }, |
| { |
| "epoch": 10.475746268656717, |
| "grad_norm": 1.7414101362228394, |
| "learning_rate": 2.729240424976911e-05, |
| "loss": 0.0523, |
| "num_input_tokens_seen": 1597016, |
| "step": 5615 |
| }, |
| { |
| "epoch": 10.485074626865671, |
| "grad_norm": 1.2917792797088623, |
| "learning_rate": 2.7251870067206715e-05, |
| "loss": 0.0519, |
| "num_input_tokens_seen": 1598488, |
| "step": 5620 |
| }, |
| { |
| "epoch": 10.494402985074627, |
| "grad_norm": 2.051412343978882, |
| "learning_rate": 2.7211329915550615e-05, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 1599864, |
| "step": 5625 |
| }, |
| { |
| "epoch": 10.503731343283581, |
| "grad_norm": 6.700649738311768, |
| "learning_rate": 2.7170783902261692e-05, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 1601176, |
| "step": 5630 |
| }, |
| { |
| "epoch": 10.513059701492537, |
| "grad_norm": 3.2577128410339355, |
| "learning_rate": 2.7130232134816397e-05, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 1602776, |
| "step": 5635 |
| }, |
| { |
| "epoch": 10.522388059701493, |
| "grad_norm": 0.15221533179283142, |
| "learning_rate": 2.7089674720706387e-05, |
| "loss": 0.0289, |
| "num_input_tokens_seen": 1604632, |
| "step": 5640 |
| }, |
| { |
| "epoch": 10.531716417910447, |
| "grad_norm": 0.8759858012199402, |
| "learning_rate": 2.704911176743833e-05, |
| "loss": 0.0267, |
| "num_input_tokens_seen": 1606104, |
| "step": 5645 |
| }, |
| { |
| "epoch": 10.541044776119403, |
| "grad_norm": 0.6482117772102356, |
| "learning_rate": 2.7008543382533545e-05, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 1607512, |
| "step": 5650 |
| }, |
| { |
| "epoch": 10.550373134328359, |
| "grad_norm": 0.7962613701820374, |
| "learning_rate": 2.6967969673527764e-05, |
| "loss": 0.0491, |
| "num_input_tokens_seen": 1608920, |
| "step": 5655 |
| }, |
| { |
| "epoch": 10.559701492537313, |
| "grad_norm": 0.31499433517456055, |
| "learning_rate": 2.6927390747970843e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 1610392, |
| "step": 5660 |
| }, |
| { |
| "epoch": 10.569029850746269, |
| "grad_norm": 4.366092205047607, |
| "learning_rate": 2.6886806713426434e-05, |
| "loss": 0.0625, |
| "num_input_tokens_seen": 1611768, |
| "step": 5665 |
| }, |
| { |
| "epoch": 10.578358208955224, |
| "grad_norm": 0.16790539026260376, |
| "learning_rate": 2.6846217677471765e-05, |
| "loss": 0.0621, |
| "num_input_tokens_seen": 1613080, |
| "step": 5670 |
| }, |
| { |
| "epoch": 10.587686567164178, |
| "grad_norm": 18.218826293945312, |
| "learning_rate": 2.6805623747697283e-05, |
| "loss": 0.0636, |
| "num_input_tokens_seen": 1614424, |
| "step": 5675 |
| }, |
| { |
| "epoch": 10.597014925373134, |
| "grad_norm": 0.43069151043891907, |
| "learning_rate": 2.6765025031706453e-05, |
| "loss": 0.0042, |
| "num_input_tokens_seen": 1615768, |
| "step": 5680 |
| }, |
| { |
| "epoch": 10.60634328358209, |
| "grad_norm": 1.3923701047897339, |
| "learning_rate": 2.67244216371154e-05, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 1617016, |
| "step": 5685 |
| }, |
| { |
| "epoch": 10.615671641791044, |
| "grad_norm": 3.978377103805542, |
| "learning_rate": 2.668381367155265e-05, |
| "loss": 0.1198, |
| "num_input_tokens_seen": 1618328, |
| "step": 5690 |
| }, |
| { |
| "epoch": 10.625, |
| "grad_norm": 0.179210364818573, |
| "learning_rate": 2.664320124265885e-05, |
| "loss": 0.0068, |
| "num_input_tokens_seen": 1619768, |
| "step": 5695 |
| }, |
| { |
| "epoch": 10.634328358208956, |
| "grad_norm": 4.74570369720459, |
| "learning_rate": 2.660258445808648e-05, |
| "loss": 0.0448, |
| "num_input_tokens_seen": 1621080, |
| "step": 5700 |
| }, |
| { |
| "epoch": 10.64365671641791, |
| "grad_norm": 0.41598185896873474, |
| "learning_rate": 2.6561963425499574e-05, |
| "loss": 0.0392, |
| "num_input_tokens_seen": 1622488, |
| "step": 5705 |
| }, |
| { |
| "epoch": 10.652985074626866, |
| "grad_norm": 0.270162433385849, |
| "learning_rate": 2.652133825257339e-05, |
| "loss": 0.044, |
| "num_input_tokens_seen": 1623960, |
| "step": 5710 |
| }, |
| { |
| "epoch": 10.662313432835822, |
| "grad_norm": 0.48668423295021057, |
| "learning_rate": 2.6480709046994218e-05, |
| "loss": 0.0222, |
| "num_input_tokens_seen": 1625592, |
| "step": 5715 |
| }, |
| { |
| "epoch": 10.671641791044776, |
| "grad_norm": 0.2706654369831085, |
| "learning_rate": 2.6440075916458982e-05, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 1627224, |
| "step": 5720 |
| }, |
| { |
| "epoch": 10.680970149253731, |
| "grad_norm": 2.869346857070923, |
| "learning_rate": 2.6399438968675055e-05, |
| "loss": 0.0633, |
| "num_input_tokens_seen": 1628952, |
| "step": 5725 |
| }, |
| { |
| "epoch": 10.690298507462687, |
| "grad_norm": 3.2426533699035645, |
| "learning_rate": 2.635879831135989e-05, |
| "loss": 0.0855, |
| "num_input_tokens_seen": 1630392, |
| "step": 5730 |
| }, |
| { |
| "epoch": 10.699626865671641, |
| "grad_norm": 0.86636883020401, |
| "learning_rate": 2.6318154052240807e-05, |
| "loss": 0.0477, |
| "num_input_tokens_seen": 1631832, |
| "step": 5735 |
| }, |
| { |
| "epoch": 10.708955223880597, |
| "grad_norm": 2.171286106109619, |
| "learning_rate": 2.6277506299054645e-05, |
| "loss": 0.0416, |
| "num_input_tokens_seen": 1633144, |
| "step": 5740 |
| }, |
| { |
| "epoch": 10.718283582089553, |
| "grad_norm": 6.782168388366699, |
| "learning_rate": 2.6236855159547525e-05, |
| "loss": 0.0534, |
| "num_input_tokens_seen": 1634616, |
| "step": 5745 |
| }, |
| { |
| "epoch": 10.727611940298507, |
| "grad_norm": 6.185320854187012, |
| "learning_rate": 2.6196200741474534e-05, |
| "loss": 0.0833, |
| "num_input_tokens_seen": 1635800, |
| "step": 5750 |
| }, |
| { |
| "epoch": 10.736940298507463, |
| "grad_norm": 0.20130524039268494, |
| "learning_rate": 2.6155543152599455e-05, |
| "loss": 0.0631, |
| "num_input_tokens_seen": 1637208, |
| "step": 5755 |
| }, |
| { |
| "epoch": 10.746268656716419, |
| "grad_norm": 0.1830572783946991, |
| "learning_rate": 2.611488250069447e-05, |
| "loss": 0.0549, |
| "num_input_tokens_seen": 1638488, |
| "step": 5760 |
| }, |
| { |
| "epoch": 10.755597014925373, |
| "grad_norm": 0.5310367345809937, |
| "learning_rate": 2.6074218893539885e-05, |
| "loss": 0.0257, |
| "num_input_tokens_seen": 1640088, |
| "step": 5765 |
| }, |
| { |
| "epoch": 10.764925373134329, |
| "grad_norm": 0.07111919671297073, |
| "learning_rate": 2.6033552438923837e-05, |
| "loss": 0.0092, |
| "num_input_tokens_seen": 1641368, |
| "step": 5770 |
| }, |
| { |
| "epoch": 10.774253731343283, |
| "grad_norm": 0.26063576340675354, |
| "learning_rate": 2.5992883244642014e-05, |
| "loss": 0.0315, |
| "num_input_tokens_seen": 1642712, |
| "step": 5775 |
| }, |
| { |
| "epoch": 10.783582089552239, |
| "grad_norm": 1.7227648496627808, |
| "learning_rate": 2.5952211418497358e-05, |
| "loss": 0.0471, |
| "num_input_tokens_seen": 1644088, |
| "step": 5780 |
| }, |
| { |
| "epoch": 10.792910447761194, |
| "grad_norm": 0.1042187288403511, |
| "learning_rate": 2.5911537068299802e-05, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 1645496, |
| "step": 5785 |
| }, |
| { |
| "epoch": 10.802238805970148, |
| "grad_norm": 0.4890771210193634, |
| "learning_rate": 2.5870860301865944e-05, |
| "loss": 0.0581, |
| "num_input_tokens_seen": 1646872, |
| "step": 5790 |
| }, |
| { |
| "epoch": 10.811567164179104, |
| "grad_norm": 0.8559497594833374, |
| "learning_rate": 2.5830181227018833e-05, |
| "loss": 0.0063, |
| "num_input_tokens_seen": 1648280, |
| "step": 5795 |
| }, |
| { |
| "epoch": 10.82089552238806, |
| "grad_norm": 4.908860683441162, |
| "learning_rate": 2.5789499951587575e-05, |
| "loss": 0.0881, |
| "num_input_tokens_seen": 1649656, |
| "step": 5800 |
| }, |
| { |
| "epoch": 10.830223880597014, |
| "grad_norm": 0.6003031134605408, |
| "learning_rate": 2.5748816583407163e-05, |
| "loss": 0.0665, |
| "num_input_tokens_seen": 1651064, |
| "step": 5805 |
| }, |
| { |
| "epoch": 10.83955223880597, |
| "grad_norm": 0.7351614236831665, |
| "learning_rate": 2.570813123031811e-05, |
| "loss": 0.011, |
| "num_input_tokens_seen": 1652408, |
| "step": 5810 |
| }, |
| { |
| "epoch": 10.848880597014926, |
| "grad_norm": 0.17029403150081635, |
| "learning_rate": 2.5667444000166203e-05, |
| "loss": 0.0298, |
| "num_input_tokens_seen": 1653944, |
| "step": 5815 |
| }, |
| { |
| "epoch": 10.85820895522388, |
| "grad_norm": 3.4544613361358643, |
| "learning_rate": 2.5626755000802188e-05, |
| "loss": 0.081, |
| "num_input_tokens_seen": 1655192, |
| "step": 5820 |
| }, |
| { |
| "epoch": 10.867537313432836, |
| "grad_norm": 3.627028226852417, |
| "learning_rate": 2.5586064340081516e-05, |
| "loss": 0.0765, |
| "num_input_tokens_seen": 1656472, |
| "step": 5825 |
| }, |
| { |
| "epoch": 10.876865671641792, |
| "grad_norm": 1.8129554986953735, |
| "learning_rate": 2.5545372125864032e-05, |
| "loss": 0.061, |
| "num_input_tokens_seen": 1657656, |
| "step": 5830 |
| }, |
| { |
| "epoch": 10.886194029850746, |
| "grad_norm": 4.25105094909668, |
| "learning_rate": 2.5504678466013705e-05, |
| "loss": 0.0237, |
| "num_input_tokens_seen": 1659096, |
| "step": 5835 |
| }, |
| { |
| "epoch": 10.895522388059701, |
| "grad_norm": 0.14929920434951782, |
| "learning_rate": 2.546398346839834e-05, |
| "loss": 0.0577, |
| "num_input_tokens_seen": 1660344, |
| "step": 5840 |
| }, |
| { |
| "epoch": 10.904850746268657, |
| "grad_norm": 1.4634943008422852, |
| "learning_rate": 2.5423287240889277e-05, |
| "loss": 0.006, |
| "num_input_tokens_seen": 1662008, |
| "step": 5845 |
| }, |
| { |
| "epoch": 10.914179104477611, |
| "grad_norm": 0.15748544037342072, |
| "learning_rate": 2.5382589891361125e-05, |
| "loss": 0.0706, |
| "num_input_tokens_seen": 1663448, |
| "step": 5850 |
| }, |
| { |
| "epoch": 10.923507462686567, |
| "grad_norm": 0.32828202843666077, |
| "learning_rate": 2.5341891527691457e-05, |
| "loss": 0.0331, |
| "num_input_tokens_seen": 1665048, |
| "step": 5855 |
| }, |
| { |
| "epoch": 10.932835820895523, |
| "grad_norm": 0.12406160682439804, |
| "learning_rate": 2.5301192257760555e-05, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 1666392, |
| "step": 5860 |
| }, |
| { |
| "epoch": 10.942164179104477, |
| "grad_norm": 5.545308589935303, |
| "learning_rate": 2.5260492189451073e-05, |
| "loss": 0.1397, |
| "num_input_tokens_seen": 1667704, |
| "step": 5865 |
| }, |
| { |
| "epoch": 10.951492537313433, |
| "grad_norm": 0.9126232862472534, |
| "learning_rate": 2.521979143064781e-05, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 1669080, |
| "step": 5870 |
| }, |
| { |
| "epoch": 10.960820895522389, |
| "grad_norm": 6.081330299377441, |
| "learning_rate": 2.5179090089237378e-05, |
| "loss": 0.0797, |
| "num_input_tokens_seen": 1670296, |
| "step": 5875 |
| }, |
| { |
| "epoch": 10.970149253731343, |
| "grad_norm": 0.4514051079750061, |
| "learning_rate": 2.5138388273107932e-05, |
| "loss": 0.0418, |
| "num_input_tokens_seen": 1671704, |
| "step": 5880 |
| }, |
| { |
| "epoch": 10.979477611940299, |
| "grad_norm": 1.0167171955108643, |
| "learning_rate": 2.5097686090148904e-05, |
| "loss": 0.0478, |
| "num_input_tokens_seen": 1673176, |
| "step": 5885 |
| }, |
| { |
| "epoch": 10.988805970149254, |
| "grad_norm": 0.7070131897926331, |
| "learning_rate": 2.5056983648250677e-05, |
| "loss": 0.0252, |
| "num_input_tokens_seen": 1674520, |
| "step": 5890 |
| }, |
| { |
| "epoch": 10.998134328358208, |
| "grad_norm": 0.38246408104896545, |
| "learning_rate": 2.501628105530433e-05, |
| "loss": 0.0699, |
| "num_input_tokens_seen": 1675832, |
| "step": 5895 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 1.204637885093689, |
| "eval_runtime": 2.8813, |
| "eval_samples_per_second": 82.602, |
| "eval_steps_per_second": 20.824, |
| "num_input_tokens_seen": 1675880, |
| "step": 5896 |
| }, |
| { |
| "epoch": 11.007462686567164, |
| "grad_norm": 0.4067462384700775, |
| "learning_rate": 2.4975578419201347e-05, |
| "loss": 0.0033, |
| "num_input_tokens_seen": 1677096, |
| "step": 5900 |
| }, |
| { |
| "epoch": 11.01679104477612, |
| "grad_norm": 0.4786495864391327, |
| "learning_rate": 2.4934875847833308e-05, |
| "loss": 0.0082, |
| "num_input_tokens_seen": 1678440, |
| "step": 5905 |
| }, |
| { |
| "epoch": 11.026119402985074, |
| "grad_norm": 0.26938506960868835, |
| "learning_rate": 2.489417344909166e-05, |
| "loss": 0.0263, |
| "num_input_tokens_seen": 1679752, |
| "step": 5910 |
| }, |
| { |
| "epoch": 11.03544776119403, |
| "grad_norm": 0.2655721604824066, |
| "learning_rate": 2.4853471330867335e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 1681160, |
| "step": 5915 |
| }, |
| { |
| "epoch": 11.044776119402986, |
| "grad_norm": 0.08613605797290802, |
| "learning_rate": 2.4812769601050587e-05, |
| "loss": 0.0043, |
| "num_input_tokens_seen": 1682440, |
| "step": 5920 |
| }, |
| { |
| "epoch": 11.05410447761194, |
| "grad_norm": 0.9604107737541199, |
| "learning_rate": 2.477206836753057e-05, |
| "loss": 0.0644, |
| "num_input_tokens_seen": 1683912, |
| "step": 5925 |
| }, |
| { |
| "epoch": 11.063432835820896, |
| "grad_norm": 0.10745811462402344, |
| "learning_rate": 2.4731367738195195e-05, |
| "loss": 0.0065, |
| "num_input_tokens_seen": 1685384, |
| "step": 5930 |
| }, |
| { |
| "epoch": 11.072761194029852, |
| "grad_norm": 0.9260890483856201, |
| "learning_rate": 2.4690667820930706e-05, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 1686728, |
| "step": 5935 |
| }, |
| { |
| "epoch": 11.082089552238806, |
| "grad_norm": 1.8620458841323853, |
| "learning_rate": 2.4649968723621502e-05, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 1688200, |
| "step": 5940 |
| }, |
| { |
| "epoch": 11.091417910447761, |
| "grad_norm": 0.9683085083961487, |
| "learning_rate": 2.460927055414981e-05, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 1689640, |
| "step": 5945 |
| }, |
| { |
| "epoch": 11.100746268656716, |
| "grad_norm": 3.614527702331543, |
| "learning_rate": 2.4568573420395354e-05, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 1691144, |
| "step": 5950 |
| }, |
| { |
| "epoch": 11.110074626865671, |
| "grad_norm": 1.4966660737991333, |
| "learning_rate": 2.452787743023517e-05, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 1692616, |
| "step": 5955 |
| }, |
| { |
| "epoch": 11.119402985074627, |
| "grad_norm": 1.9839037656784058, |
| "learning_rate": 2.4487182691543206e-05, |
| "loss": 0.0562, |
| "num_input_tokens_seen": 1693928, |
| "step": 5960 |
| }, |
| { |
| "epoch": 11.128731343283581, |
| "grad_norm": 1.2990456819534302, |
| "learning_rate": 2.4446489312190144e-05, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 1695432, |
| "step": 5965 |
| }, |
| { |
| "epoch": 11.138059701492537, |
| "grad_norm": 0.14193353056907654, |
| "learning_rate": 2.4405797400043034e-05, |
| "loss": 0.0406, |
| "num_input_tokens_seen": 1696712, |
| "step": 5970 |
| }, |
| { |
| "epoch": 11.147388059701493, |
| "grad_norm": 1.5708144903182983, |
| "learning_rate": 2.436510706296504e-05, |
| "loss": 0.0094, |
| "num_input_tokens_seen": 1698312, |
| "step": 5975 |
| }, |
| { |
| "epoch": 11.156716417910447, |
| "grad_norm": 0.6482675075531006, |
| "learning_rate": 2.432441840881516e-05, |
| "loss": 0.0522, |
| "num_input_tokens_seen": 1699624, |
| "step": 5980 |
| }, |
| { |
| "epoch": 11.166044776119403, |
| "grad_norm": 1.3017324209213257, |
| "learning_rate": 2.428373154544791e-05, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 1701160, |
| "step": 5985 |
| }, |
| { |
| "epoch": 11.175373134328359, |
| "grad_norm": 0.06528612971305847, |
| "learning_rate": 2.424304658071309e-05, |
| "loss": 0.0343, |
| "num_input_tokens_seen": 1702472, |
| "step": 5990 |
| }, |
| { |
| "epoch": 11.184701492537313, |
| "grad_norm": 0.08959075063467026, |
| "learning_rate": 2.4202363622455442e-05, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 1703944, |
| "step": 5995 |
| }, |
| { |
| "epoch": 11.194029850746269, |
| "grad_norm": 0.15471197664737701, |
| "learning_rate": 2.4161682778514403e-05, |
| "loss": 0.0038, |
| "num_input_tokens_seen": 1705320, |
| "step": 6000 |
| }, |
| { |
| "epoch": 11.203358208955224, |
| "grad_norm": 0.19762662053108215, |
| "learning_rate": 2.4121004156723802e-05, |
| "loss": 0.0244, |
| "num_input_tokens_seen": 1706952, |
| "step": 6005 |
| }, |
| { |
| "epoch": 11.212686567164178, |
| "grad_norm": 4.914639949798584, |
| "learning_rate": 2.4080327864911567e-05, |
| "loss": 0.0547, |
| "num_input_tokens_seen": 1708392, |
| "step": 6010 |
| }, |
| { |
| "epoch": 11.222014925373134, |
| "grad_norm": 8.532730102539062, |
| "learning_rate": 2.403965401089947e-05, |
| "loss": 0.0449, |
| "num_input_tokens_seen": 1709864, |
| "step": 6015 |
| }, |
| { |
| "epoch": 11.23134328358209, |
| "grad_norm": 0.31478214263916016, |
| "learning_rate": 2.3998982702502807e-05, |
| "loss": 0.0235, |
| "num_input_tokens_seen": 1711368, |
| "step": 6020 |
| }, |
| { |
| "epoch": 11.240671641791044, |
| "grad_norm": 0.13404163718223572, |
| "learning_rate": 2.3958314047530125e-05, |
| "loss": 0.0113, |
| "num_input_tokens_seen": 1712840, |
| "step": 6025 |
| }, |
| { |
| "epoch": 11.25, |
| "grad_norm": 0.06108865886926651, |
| "learning_rate": 2.3917648153782956e-05, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 1714440, |
| "step": 6030 |
| }, |
| { |
| "epoch": 11.259328358208956, |
| "grad_norm": 0.2284984141588211, |
| "learning_rate": 2.3876985129055486e-05, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 1715848, |
| "step": 6035 |
| }, |
| { |
| "epoch": 11.26865671641791, |
| "grad_norm": 0.1380099207162857, |
| "learning_rate": 2.3836325081134314e-05, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 1717512, |
| "step": 6040 |
| }, |
| { |
| "epoch": 11.277985074626866, |
| "grad_norm": 0.22938765585422516, |
| "learning_rate": 2.3795668117798138e-05, |
| "loss": 0.0114, |
| "num_input_tokens_seen": 1718888, |
| "step": 6045 |
| }, |
| { |
| "epoch": 11.287313432835822, |
| "grad_norm": 0.5782793164253235, |
| "learning_rate": 2.37550143468175e-05, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 1720104, |
| "step": 6050 |
| }, |
| { |
| "epoch": 11.296641791044776, |
| "grad_norm": 1.1648858785629272, |
| "learning_rate": 2.3714363875954447e-05, |
| "loss": 0.0534, |
| "num_input_tokens_seen": 1721352, |
| "step": 6055 |
| }, |
| { |
| "epoch": 11.305970149253731, |
| "grad_norm": 0.1358286291360855, |
| "learning_rate": 2.36737168129623e-05, |
| "loss": 0.0556, |
| "num_input_tokens_seen": 1723016, |
| "step": 6060 |
| }, |
| { |
| "epoch": 11.315298507462687, |
| "grad_norm": 0.4931541681289673, |
| "learning_rate": 2.3633073265585356e-05, |
| "loss": 0.0491, |
| "num_input_tokens_seen": 1724392, |
| "step": 6065 |
| }, |
| { |
| "epoch": 11.324626865671641, |
| "grad_norm": 0.5054898262023926, |
| "learning_rate": 2.3592433341558563e-05, |
| "loss": 0.116, |
| "num_input_tokens_seen": 1726024, |
| "step": 6070 |
| }, |
| { |
| "epoch": 11.333955223880597, |
| "grad_norm": 3.418457508087158, |
| "learning_rate": 2.3551797148607298e-05, |
| "loss": 0.034, |
| "num_input_tokens_seen": 1727432, |
| "step": 6075 |
| }, |
| { |
| "epoch": 11.343283582089553, |
| "grad_norm": 0.3684387803077698, |
| "learning_rate": 2.3511164794447015e-05, |
| "loss": 0.0344, |
| "num_input_tokens_seen": 1728680, |
| "step": 6080 |
| }, |
| { |
| "epoch": 11.352611940298507, |
| "grad_norm": 1.9423009157180786, |
| "learning_rate": 2.347053638678302e-05, |
| "loss": 0.0055, |
| "num_input_tokens_seen": 1730152, |
| "step": 6085 |
| }, |
| { |
| "epoch": 11.361940298507463, |
| "grad_norm": 1.5360335111618042, |
| "learning_rate": 2.3429912033310143e-05, |
| "loss": 0.035, |
| "num_input_tokens_seen": 1731400, |
| "step": 6090 |
| }, |
| { |
| "epoch": 11.371268656716419, |
| "grad_norm": 0.9016702771186829, |
| "learning_rate": 2.338929184171247e-05, |
| "loss": 0.0247, |
| "num_input_tokens_seen": 1732744, |
| "step": 6095 |
| }, |
| { |
| "epoch": 11.380597014925373, |
| "grad_norm": 0.03430149331688881, |
| "learning_rate": 2.3348675919663065e-05, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 1734344, |
| "step": 6100 |
| }, |
| { |
| "epoch": 11.389925373134329, |
| "grad_norm": 0.5950490832328796, |
| "learning_rate": 2.330806437482365e-05, |
| "loss": 0.0269, |
| "num_input_tokens_seen": 1735784, |
| "step": 6105 |
| }, |
| { |
| "epoch": 11.399253731343283, |
| "grad_norm": 1.337599754333496, |
| "learning_rate": 2.3267457314844372e-05, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 1737064, |
| "step": 6110 |
| }, |
| { |
| "epoch": 11.408582089552239, |
| "grad_norm": 0.21439221501350403, |
| "learning_rate": 2.3226854847363473e-05, |
| "loss": 0.0284, |
| "num_input_tokens_seen": 1738504, |
| "step": 6115 |
| }, |
| { |
| "epoch": 11.417910447761194, |
| "grad_norm": 4.238887310028076, |
| "learning_rate": 2.3186257080007016e-05, |
| "loss": 0.0043, |
| "num_input_tokens_seen": 1739944, |
| "step": 6120 |
| }, |
| { |
| "epoch": 11.427238805970148, |
| "grad_norm": 0.1856064349412918, |
| "learning_rate": 2.314566412038865e-05, |
| "loss": 0.0312, |
| "num_input_tokens_seen": 1741480, |
| "step": 6125 |
| }, |
| { |
| "epoch": 11.436567164179104, |
| "grad_norm": 0.6954842209815979, |
| "learning_rate": 2.31050760761092e-05, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 1742888, |
| "step": 6130 |
| }, |
| { |
| "epoch": 11.44589552238806, |
| "grad_norm": 0.45111992955207825, |
| "learning_rate": 2.306449305475655e-05, |
| "loss": 0.1042, |
| "num_input_tokens_seen": 1744200, |
| "step": 6135 |
| }, |
| { |
| "epoch": 11.455223880597014, |
| "grad_norm": 1.9380782842636108, |
| "learning_rate": 2.3023915163905198e-05, |
| "loss": 0.0073, |
| "num_input_tokens_seen": 1745672, |
| "step": 6140 |
| }, |
| { |
| "epoch": 11.46455223880597, |
| "grad_norm": 0.14225023984909058, |
| "learning_rate": 2.298334251111607e-05, |
| "loss": 0.021, |
| "num_input_tokens_seen": 1747272, |
| "step": 6145 |
| }, |
| { |
| "epoch": 11.473880597014926, |
| "grad_norm": 1.8557679653167725, |
| "learning_rate": 2.2942775203936238e-05, |
| "loss": 0.0326, |
| "num_input_tokens_seen": 1748712, |
| "step": 6150 |
| }, |
| { |
| "epoch": 11.48320895522388, |
| "grad_norm": 0.12226130813360214, |
| "learning_rate": 2.290221334989854e-05, |
| "loss": 0.0221, |
| "num_input_tokens_seen": 1750152, |
| "step": 6155 |
| }, |
| { |
| "epoch": 11.492537313432836, |
| "grad_norm": 0.5441533327102661, |
| "learning_rate": 2.286165705652143e-05, |
| "loss": 0.0511, |
| "num_input_tokens_seen": 1751560, |
| "step": 6160 |
| }, |
| { |
| "epoch": 11.501865671641792, |
| "grad_norm": 0.08601506799459457, |
| "learning_rate": 2.2821106431308544e-05, |
| "loss": 0.0109, |
| "num_input_tokens_seen": 1753032, |
| "step": 6165 |
| }, |
| { |
| "epoch": 11.511194029850746, |
| "grad_norm": 1.0348618030548096, |
| "learning_rate": 2.278056158174858e-05, |
| "loss": 0.0044, |
| "num_input_tokens_seen": 1754408, |
| "step": 6170 |
| }, |
| { |
| "epoch": 11.520522388059701, |
| "grad_norm": 0.6561064124107361, |
| "learning_rate": 2.274002261531484e-05, |
| "loss": 0.0061, |
| "num_input_tokens_seen": 1755784, |
| "step": 6175 |
| }, |
| { |
| "epoch": 11.529850746268657, |
| "grad_norm": 0.5152919292449951, |
| "learning_rate": 2.2699489639465103e-05, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 1757192, |
| "step": 6180 |
| }, |
| { |
| "epoch": 11.539179104477611, |
| "grad_norm": 2.713244676589966, |
| "learning_rate": 2.2658962761641232e-05, |
| "loss": 0.0583, |
| "num_input_tokens_seen": 1758568, |
| "step": 6185 |
| }, |
| { |
| "epoch": 11.548507462686567, |
| "grad_norm": 1.1397849321365356, |
| "learning_rate": 2.2618442089268926e-05, |
| "loss": 0.0441, |
| "num_input_tokens_seen": 1759944, |
| "step": 6190 |
| }, |
| { |
| "epoch": 11.557835820895523, |
| "grad_norm": 0.19099606573581696, |
| "learning_rate": 2.2577927729757458e-05, |
| "loss": 0.0885, |
| "num_input_tokens_seen": 1761160, |
| "step": 6195 |
| }, |
| { |
| "epoch": 11.567164179104477, |
| "grad_norm": 3.7199039459228516, |
| "learning_rate": 2.2537419790499323e-05, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 1762600, |
| "step": 6200 |
| }, |
| { |
| "epoch": 11.576492537313433, |
| "grad_norm": 2.9567983150482178, |
| "learning_rate": 2.249691837887005e-05, |
| "loss": 0.0751, |
| "num_input_tokens_seen": 1763976, |
| "step": 6205 |
| }, |
| { |
| "epoch": 11.585820895522389, |
| "grad_norm": 0.0750470757484436, |
| "learning_rate": 2.2456423602227835e-05, |
| "loss": 0.0082, |
| "num_input_tokens_seen": 1765480, |
| "step": 6210 |
| }, |
| { |
| "epoch": 11.595149253731343, |
| "grad_norm": 0.06895919889211655, |
| "learning_rate": 2.2415935567913286e-05, |
| "loss": 0.0297, |
| "num_input_tokens_seen": 1766760, |
| "step": 6215 |
| }, |
| { |
| "epoch": 11.604477611940299, |
| "grad_norm": 0.052315544337034225, |
| "learning_rate": 2.2375454383249154e-05, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 1768232, |
| "step": 6220 |
| }, |
| { |
| "epoch": 11.613805970149254, |
| "grad_norm": 1.2783445119857788, |
| "learning_rate": 2.233498015554002e-05, |
| "loss": 0.0257, |
| "num_input_tokens_seen": 1769576, |
| "step": 6225 |
| }, |
| { |
| "epoch": 11.623134328358208, |
| "grad_norm": 5.374032974243164, |
| "learning_rate": 2.229451299207203e-05, |
| "loss": 0.102, |
| "num_input_tokens_seen": 1770984, |
| "step": 6230 |
| }, |
| { |
| "epoch": 11.632462686567164, |
| "grad_norm": 0.43714866042137146, |
| "learning_rate": 2.2254053000112597e-05, |
| "loss": 0.0079, |
| "num_input_tokens_seen": 1772392, |
| "step": 6235 |
| }, |
| { |
| "epoch": 11.64179104477612, |
| "grad_norm": 0.1446562111377716, |
| "learning_rate": 2.2213600286910134e-05, |
| "loss": 0.0044, |
| "num_input_tokens_seen": 1773832, |
| "step": 6240 |
| }, |
| { |
| "epoch": 11.651119402985074, |
| "grad_norm": 0.16494818031787872, |
| "learning_rate": 2.217315495969377e-05, |
| "loss": 0.0277, |
| "num_input_tokens_seen": 1775272, |
| "step": 6245 |
| }, |
| { |
| "epoch": 11.66044776119403, |
| "grad_norm": 0.20410147309303284, |
| "learning_rate": 2.2132717125673024e-05, |
| "loss": 0.025, |
| "num_input_tokens_seen": 1776520, |
| "step": 6250 |
| }, |
| { |
| "epoch": 11.669776119402986, |
| "grad_norm": 1.881449580192566, |
| "learning_rate": 2.209228689203758e-05, |
| "loss": 0.0348, |
| "num_input_tokens_seen": 1777928, |
| "step": 6255 |
| }, |
| { |
| "epoch": 11.67910447761194, |
| "grad_norm": 4.987156391143799, |
| "learning_rate": 2.205186436595696e-05, |
| "loss": 0.0538, |
| "num_input_tokens_seen": 1779464, |
| "step": 6260 |
| }, |
| { |
| "epoch": 11.688432835820896, |
| "grad_norm": 0.445840448141098, |
| "learning_rate": 2.2011449654580266e-05, |
| "loss": 0.0044, |
| "num_input_tokens_seen": 1781064, |
| "step": 6265 |
| }, |
| { |
| "epoch": 11.697761194029852, |
| "grad_norm": 0.259150892496109, |
| "learning_rate": 2.197104286503589e-05, |
| "loss": 0.0467, |
| "num_input_tokens_seen": 1782344, |
| "step": 6270 |
| }, |
| { |
| "epoch": 11.707089552238806, |
| "grad_norm": 0.06959406286478043, |
| "learning_rate": 2.1930644104431197e-05, |
| "loss": 0.0098, |
| "num_input_tokens_seen": 1783720, |
| "step": 6275 |
| }, |
| { |
| "epoch": 11.716417910447761, |
| "grad_norm": 0.12992776930332184, |
| "learning_rate": 2.1890253479852307e-05, |
| "loss": 0.0045, |
| "num_input_tokens_seen": 1785000, |
| "step": 6280 |
| }, |
| { |
| "epoch": 11.725746268656717, |
| "grad_norm": 0.06712110340595245, |
| "learning_rate": 2.184987109836374e-05, |
| "loss": 0.0067, |
| "num_input_tokens_seen": 1786344, |
| "step": 6285 |
| }, |
| { |
| "epoch": 11.735074626865671, |
| "grad_norm": 0.824114978313446, |
| "learning_rate": 2.18094970670082e-05, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 1787752, |
| "step": 6290 |
| }, |
| { |
| "epoch": 11.744402985074627, |
| "grad_norm": 0.1707543283700943, |
| "learning_rate": 2.176913149280622e-05, |
| "loss": 0.0261, |
| "num_input_tokens_seen": 1789256, |
| "step": 6295 |
| }, |
| { |
| "epoch": 11.753731343283581, |
| "grad_norm": 0.10945512354373932, |
| "learning_rate": 2.1728774482755942e-05, |
| "loss": 0.0243, |
| "num_input_tokens_seen": 1790632, |
| "step": 6300 |
| }, |
| { |
| "epoch": 11.763059701492537, |
| "grad_norm": 0.21477095782756805, |
| "learning_rate": 2.1688426143832803e-05, |
| "loss": 0.0255, |
| "num_input_tokens_seen": 1791944, |
| "step": 6305 |
| }, |
| { |
| "epoch": 11.772388059701493, |
| "grad_norm": 1.978134036064148, |
| "learning_rate": 2.1648086582989242e-05, |
| "loss": 0.1017, |
| "num_input_tokens_seen": 1793320, |
| "step": 6310 |
| }, |
| { |
| "epoch": 11.781716417910447, |
| "grad_norm": 0.7244099378585815, |
| "learning_rate": 2.160775590715445e-05, |
| "loss": 0.0154, |
| "num_input_tokens_seen": 1794728, |
| "step": 6315 |
| }, |
| { |
| "epoch": 11.791044776119403, |
| "grad_norm": 0.6030241250991821, |
| "learning_rate": 2.1567434223234038e-05, |
| "loss": 0.1024, |
| "num_input_tokens_seen": 1796328, |
| "step": 6320 |
| }, |
| { |
| "epoch": 11.800373134328359, |
| "grad_norm": 13.344270706176758, |
| "learning_rate": 2.152712163810981e-05, |
| "loss": 0.0371, |
| "num_input_tokens_seen": 1797768, |
| "step": 6325 |
| }, |
| { |
| "epoch": 11.809701492537313, |
| "grad_norm": 0.030756594613194466, |
| "learning_rate": 2.1486818258639445e-05, |
| "loss": 0.1188, |
| "num_input_tokens_seen": 1799112, |
| "step": 6330 |
| }, |
| { |
| "epoch": 11.819029850746269, |
| "grad_norm": 0.101422518491745, |
| "learning_rate": 2.1446524191656205e-05, |
| "loss": 0.0051, |
| "num_input_tokens_seen": 1800776, |
| "step": 6335 |
| }, |
| { |
| "epoch": 11.828358208955224, |
| "grad_norm": 0.3077526092529297, |
| "learning_rate": 2.1406239543968687e-05, |
| "loss": 0.0715, |
| "num_input_tokens_seen": 1802280, |
| "step": 6340 |
| }, |
| { |
| "epoch": 11.837686567164178, |
| "grad_norm": 0.7722053527832031, |
| "learning_rate": 2.1365964422360497e-05, |
| "loss": 0.0331, |
| "num_input_tokens_seen": 1803656, |
| "step": 6345 |
| }, |
| { |
| "epoch": 11.847014925373134, |
| "grad_norm": 0.07878338545560837, |
| "learning_rate": 2.132569893359002e-05, |
| "loss": 0.0342, |
| "num_input_tokens_seen": 1805192, |
| "step": 6350 |
| }, |
| { |
| "epoch": 11.85634328358209, |
| "grad_norm": 0.41953355073928833, |
| "learning_rate": 2.1285443184390076e-05, |
| "loss": 0.0973, |
| "num_input_tokens_seen": 1806600, |
| "step": 6355 |
| }, |
| { |
| "epoch": 11.865671641791044, |
| "grad_norm": 0.1671505570411682, |
| "learning_rate": 2.1245197281467686e-05, |
| "loss": 0.0023, |
| "num_input_tokens_seen": 1807976, |
| "step": 6360 |
| }, |
| { |
| "epoch": 11.875, |
| "grad_norm": 0.41215357184410095, |
| "learning_rate": 2.1204961331503787e-05, |
| "loss": 0.0723, |
| "num_input_tokens_seen": 1809192, |
| "step": 6365 |
| }, |
| { |
| "epoch": 11.884328358208956, |
| "grad_norm": 2.25791335105896, |
| "learning_rate": 2.1164735441152882e-05, |
| "loss": 0.0203, |
| "num_input_tokens_seen": 1810440, |
| "step": 6370 |
| }, |
| { |
| "epoch": 11.89365671641791, |
| "grad_norm": 0.04752112552523613, |
| "learning_rate": 2.1124519717042873e-05, |
| "loss": 0.0018, |
| "num_input_tokens_seen": 1811784, |
| "step": 6375 |
| }, |
| { |
| "epoch": 11.902985074626866, |
| "grad_norm": 5.445159912109375, |
| "learning_rate": 2.108431426577466e-05, |
| "loss": 0.0279, |
| "num_input_tokens_seen": 1813064, |
| "step": 6380 |
| }, |
| { |
| "epoch": 11.912313432835822, |
| "grad_norm": 1.1380277872085571, |
| "learning_rate": 2.104411919392193e-05, |
| "loss": 0.0293, |
| "num_input_tokens_seen": 1814440, |
| "step": 6385 |
| }, |
| { |
| "epoch": 11.921641791044776, |
| "grad_norm": 0.05503709241747856, |
| "learning_rate": 2.1003934608030895e-05, |
| "loss": 0.0635, |
| "num_input_tokens_seen": 1816104, |
| "step": 6390 |
| }, |
| { |
| "epoch": 11.930970149253731, |
| "grad_norm": 1.0821571350097656, |
| "learning_rate": 2.0963760614619898e-05, |
| "loss": 0.0274, |
| "num_input_tokens_seen": 1817640, |
| "step": 6395 |
| }, |
| { |
| "epoch": 11.940298507462687, |
| "grad_norm": 2.46370005607605, |
| "learning_rate": 2.092359732017928e-05, |
| "loss": 0.0253, |
| "num_input_tokens_seen": 1819336, |
| "step": 6400 |
| }, |
| { |
| "epoch": 11.949626865671641, |
| "grad_norm": 2.830101490020752, |
| "learning_rate": 2.0883444831170952e-05, |
| "loss": 0.0507, |
| "num_input_tokens_seen": 1821192, |
| "step": 6405 |
| }, |
| { |
| "epoch": 11.958955223880597, |
| "grad_norm": 1.9961892366409302, |
| "learning_rate": 2.0843303254028253e-05, |
| "loss": 0.0407, |
| "num_input_tokens_seen": 1822568, |
| "step": 6410 |
| }, |
| { |
| "epoch": 11.968283582089553, |
| "grad_norm": 0.49591681361198425, |
| "learning_rate": 2.0803172695155526e-05, |
| "loss": 0.0638, |
| "num_input_tokens_seen": 1824040, |
| "step": 6415 |
| }, |
| { |
| "epoch": 11.977611940298507, |
| "grad_norm": 2.071251392364502, |
| "learning_rate": 2.076305326092796e-05, |
| "loss": 0.0516, |
| "num_input_tokens_seen": 1825416, |
| "step": 6420 |
| }, |
| { |
| "epoch": 11.986940298507463, |
| "grad_norm": 3.519652843475342, |
| "learning_rate": 2.0722945057691252e-05, |
| "loss": 0.0484, |
| "num_input_tokens_seen": 1826760, |
| "step": 6425 |
| }, |
| { |
| "epoch": 11.996268656716419, |
| "grad_norm": 0.592960774898529, |
| "learning_rate": 2.0682848191761296e-05, |
| "loss": 0.0556, |
| "num_input_tokens_seen": 1828072, |
| "step": 6430 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 1.1798110008239746, |
| "eval_runtime": 2.917, |
| "eval_samples_per_second": 81.591, |
| "eval_steps_per_second": 20.569, |
| "num_input_tokens_seen": 1828344, |
| "step": 6432 |
| }, |
| { |
| "epoch": 12.005597014925373, |
| "grad_norm": 13.00069808959961, |
| "learning_rate": 2.0642762769423968e-05, |
| "loss": 0.0912, |
| "num_input_tokens_seen": 1829272, |
| "step": 6435 |
| }, |
| { |
| "epoch": 12.014925373134329, |
| "grad_norm": 0.05859753116965294, |
| "learning_rate": 2.060268889693477e-05, |
| "loss": 0.0604, |
| "num_input_tokens_seen": 1830808, |
| "step": 6440 |
| }, |
| { |
| "epoch": 12.024253731343284, |
| "grad_norm": 0.14973847568035126, |
| "learning_rate": 2.056262668051864e-05, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 1832376, |
| "step": 6445 |
| }, |
| { |
| "epoch": 12.033582089552239, |
| "grad_norm": 0.49489596486091614, |
| "learning_rate": 2.0522576226369592e-05, |
| "loss": 0.015, |
| "num_input_tokens_seen": 1833944, |
| "step": 6450 |
| }, |
| { |
| "epoch": 12.042910447761194, |
| "grad_norm": 0.09353584796190262, |
| "learning_rate": 2.048253764065045e-05, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 1835320, |
| "step": 6455 |
| }, |
| { |
| "epoch": 12.052238805970148, |
| "grad_norm": 0.09948700666427612, |
| "learning_rate": 2.04425110294926e-05, |
| "loss": 0.0574, |
| "num_input_tokens_seen": 1836824, |
| "step": 6460 |
| }, |
| { |
| "epoch": 12.061567164179104, |
| "grad_norm": 1.9585660696029663, |
| "learning_rate": 2.0402496498995667e-05, |
| "loss": 0.0887, |
| "num_input_tokens_seen": 1838072, |
| "step": 6465 |
| }, |
| { |
| "epoch": 12.07089552238806, |
| "grad_norm": 0.19126485288143158, |
| "learning_rate": 2.0362494155227275e-05, |
| "loss": 0.0499, |
| "num_input_tokens_seen": 1839384, |
| "step": 6470 |
| }, |
| { |
| "epoch": 12.080223880597014, |
| "grad_norm": 0.06036051735281944, |
| "learning_rate": 2.0322504104222723e-05, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 1840760, |
| "step": 6475 |
| }, |
| { |
| "epoch": 12.08955223880597, |
| "grad_norm": 0.06464926898479462, |
| "learning_rate": 2.028252645198474e-05, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 1842168, |
| "step": 6480 |
| }, |
| { |
| "epoch": 12.098880597014926, |
| "grad_norm": 2.0668959617614746, |
| "learning_rate": 2.024256130448319e-05, |
| "loss": 0.0409, |
| "num_input_tokens_seen": 1843608, |
| "step": 6485 |
| }, |
| { |
| "epoch": 12.10820895522388, |
| "grad_norm": 2.8032543659210205, |
| "learning_rate": 2.0202608767654773e-05, |
| "loss": 0.0083, |
| "num_input_tokens_seen": 1845112, |
| "step": 6490 |
| }, |
| { |
| "epoch": 12.117537313432836, |
| "grad_norm": 0.14933788776397705, |
| "learning_rate": 2.016266894740279e-05, |
| "loss": 0.028, |
| "num_input_tokens_seen": 1846456, |
| "step": 6495 |
| }, |
| { |
| "epoch": 12.126865671641792, |
| "grad_norm": 0.4332592785358429, |
| "learning_rate": 2.0122741949596797e-05, |
| "loss": 0.0806, |
| "num_input_tokens_seen": 1847960, |
| "step": 6500 |
| }, |
| { |
| "epoch": 12.136194029850746, |
| "grad_norm": 0.14129140973091125, |
| "learning_rate": 2.008282788007239e-05, |
| "loss": 0.0406, |
| "num_input_tokens_seen": 1849240, |
| "step": 6505 |
| }, |
| { |
| "epoch": 12.145522388059701, |
| "grad_norm": 0.04044075682759285, |
| "learning_rate": 2.0042926844630896e-05, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 1850680, |
| "step": 6510 |
| }, |
| { |
| "epoch": 12.154850746268657, |
| "grad_norm": 0.07222745567560196, |
| "learning_rate": 2.000303894903907e-05, |
| "loss": 0.009, |
| "num_input_tokens_seen": 1851928, |
| "step": 6515 |
| }, |
| { |
| "epoch": 12.164179104477611, |
| "grad_norm": 0.22499555349349976, |
| "learning_rate": 1.9963164299028865e-05, |
| "loss": 0.0372, |
| "num_input_tokens_seen": 1853208, |
| "step": 6520 |
| }, |
| { |
| "epoch": 12.173507462686567, |
| "grad_norm": 2.6657187938690186, |
| "learning_rate": 1.992330300029709e-05, |
| "loss": 0.0303, |
| "num_input_tokens_seen": 1854744, |
| "step": 6525 |
| }, |
| { |
| "epoch": 12.182835820895523, |
| "grad_norm": 0.23226457834243774, |
| "learning_rate": 1.988345515850519e-05, |
| "loss": 0.0239, |
| "num_input_tokens_seen": 1856056, |
| "step": 6530 |
| }, |
| { |
| "epoch": 12.192164179104477, |
| "grad_norm": 0.11811388283967972, |
| "learning_rate": 1.984362087927894e-05, |
| "loss": 0.0547, |
| "num_input_tokens_seen": 1857560, |
| "step": 6535 |
| }, |
| { |
| "epoch": 12.201492537313433, |
| "grad_norm": 0.10323216766119003, |
| "learning_rate": 1.9803800268208146e-05, |
| "loss": 0.021, |
| "num_input_tokens_seen": 1859000, |
| "step": 6540 |
| }, |
| { |
| "epoch": 12.210820895522389, |
| "grad_norm": 6.4913835525512695, |
| "learning_rate": 1.9763993430846395e-05, |
| "loss": 0.0702, |
| "num_input_tokens_seen": 1860344, |
| "step": 6545 |
| }, |
| { |
| "epoch": 12.220149253731343, |
| "grad_norm": 5.01142692565918, |
| "learning_rate": 1.972420047271076e-05, |
| "loss": 0.0236, |
| "num_input_tokens_seen": 1861688, |
| "step": 6550 |
| }, |
| { |
| "epoch": 12.229477611940299, |
| "grad_norm": 0.08434057980775833, |
| "learning_rate": 1.9684421499281533e-05, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 1863192, |
| "step": 6555 |
| }, |
| { |
| "epoch": 12.238805970149254, |
| "grad_norm": 0.08729685097932816, |
| "learning_rate": 1.964465661600192e-05, |
| "loss": 0.0035, |
| "num_input_tokens_seen": 1864760, |
| "step": 6560 |
| }, |
| { |
| "epoch": 12.248134328358208, |
| "grad_norm": 1.1227364540100098, |
| "learning_rate": 1.9604905928277782e-05, |
| "loss": 0.0253, |
| "num_input_tokens_seen": 1866104, |
| "step": 6565 |
| }, |
| { |
| "epoch": 12.257462686567164, |
| "grad_norm": 1.5903313159942627, |
| "learning_rate": 1.9565169541477388e-05, |
| "loss": 0.0243, |
| "num_input_tokens_seen": 1867384, |
| "step": 6570 |
| }, |
| { |
| "epoch": 12.26679104477612, |
| "grad_norm": 2.8971734046936035, |
| "learning_rate": 1.9525447560931028e-05, |
| "loss": 0.0577, |
| "num_input_tokens_seen": 1868728, |
| "step": 6575 |
| }, |
| { |
| "epoch": 12.276119402985074, |
| "grad_norm": 0.06916771084070206, |
| "learning_rate": 1.948574009193087e-05, |
| "loss": 0.0636, |
| "num_input_tokens_seen": 1870168, |
| "step": 6580 |
| }, |
| { |
| "epoch": 12.28544776119403, |
| "grad_norm": 1.005316138267517, |
| "learning_rate": 1.944604723973058e-05, |
| "loss": 0.0532, |
| "num_input_tokens_seen": 1871736, |
| "step": 6585 |
| }, |
| { |
| "epoch": 12.294776119402986, |
| "grad_norm": 1.2696455717086792, |
| "learning_rate": 1.940636910954508e-05, |
| "loss": 0.0246, |
| "num_input_tokens_seen": 1873080, |
| "step": 6590 |
| }, |
| { |
| "epoch": 12.30410447761194, |
| "grad_norm": 0.08529263734817505, |
| "learning_rate": 1.9366705806550308e-05, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1874488, |
| "step": 6595 |
| }, |
| { |
| "epoch": 12.313432835820896, |
| "grad_norm": 12.988985061645508, |
| "learning_rate": 1.9327057435882835e-05, |
| "loss": 0.0094, |
| "num_input_tokens_seen": 1876056, |
| "step": 6600 |
| }, |
| { |
| "epoch": 12.322761194029852, |
| "grad_norm": 0.11772555857896805, |
| "learning_rate": 1.928742410263971e-05, |
| "loss": 0.0016, |
| "num_input_tokens_seen": 1877528, |
| "step": 6605 |
| }, |
| { |
| "epoch": 12.332089552238806, |
| "grad_norm": 1.6249841451644897, |
| "learning_rate": 1.9247805911878065e-05, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 1878808, |
| "step": 6610 |
| }, |
| { |
| "epoch": 12.341417910447761, |
| "grad_norm": 1.6250699758529663, |
| "learning_rate": 1.920820296861496e-05, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 1880248, |
| "step": 6615 |
| }, |
| { |
| "epoch": 12.350746268656717, |
| "grad_norm": 0.01850873790681362, |
| "learning_rate": 1.916861537782697e-05, |
| "loss": 0.0057, |
| "num_input_tokens_seen": 1881592, |
| "step": 6620 |
| }, |
| { |
| "epoch": 12.360074626865671, |
| "grad_norm": 1.3370493650436401, |
| "learning_rate": 1.9129043244450026e-05, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 1882936, |
| "step": 6625 |
| }, |
| { |
| "epoch": 12.369402985074627, |
| "grad_norm": 0.08863592147827148, |
| "learning_rate": 1.9089486673379073e-05, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 1884504, |
| "step": 6630 |
| }, |
| { |
| "epoch": 12.378731343283581, |
| "grad_norm": 2.6097774505615234, |
| "learning_rate": 1.904994576946777e-05, |
| "loss": 0.0286, |
| "num_input_tokens_seen": 1885848, |
| "step": 6635 |
| }, |
| { |
| "epoch": 12.388059701492537, |
| "grad_norm": 0.8592023253440857, |
| "learning_rate": 1.9010420637528308e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 1887256, |
| "step": 6640 |
| }, |
| { |
| "epoch": 12.397388059701493, |
| "grad_norm": 0.056543003767728806, |
| "learning_rate": 1.8970911382331006e-05, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 1888792, |
| "step": 6645 |
| }, |
| { |
| "epoch": 12.406716417910447, |
| "grad_norm": 1.3999927043914795, |
| "learning_rate": 1.8931418108604148e-05, |
| "loss": 0.0107, |
| "num_input_tokens_seen": 1890456, |
| "step": 6650 |
| }, |
| { |
| "epoch": 12.416044776119403, |
| "grad_norm": 0.06248648837208748, |
| "learning_rate": 1.889194092103364e-05, |
| "loss": 0.0488, |
| "num_input_tokens_seen": 1891800, |
| "step": 6655 |
| }, |
| { |
| "epoch": 12.425373134328359, |
| "grad_norm": 3.594332695007324, |
| "learning_rate": 1.8852479924262733e-05, |
| "loss": 0.0649, |
| "num_input_tokens_seen": 1893208, |
| "step": 6660 |
| }, |
| { |
| "epoch": 12.434701492537313, |
| "grad_norm": 0.04901442304253578, |
| "learning_rate": 1.8813035222891784e-05, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 1894648, |
| "step": 6665 |
| }, |
| { |
| "epoch": 12.444029850746269, |
| "grad_norm": 0.750293493270874, |
| "learning_rate": 1.8773606921477933e-05, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 1895992, |
| "step": 6670 |
| }, |
| { |
| "epoch": 12.453358208955224, |
| "grad_norm": 0.24152354896068573, |
| "learning_rate": 1.873419512453487e-05, |
| "loss": 0.0135, |
| "num_input_tokens_seen": 1897592, |
| "step": 6675 |
| }, |
| { |
| "epoch": 12.462686567164178, |
| "grad_norm": 1.9488557577133179, |
| "learning_rate": 1.869479993653252e-05, |
| "loss": 0.0722, |
| "num_input_tokens_seen": 1898840, |
| "step": 6680 |
| }, |
| { |
| "epoch": 12.472014925373134, |
| "grad_norm": 2.3710365295410156, |
| "learning_rate": 1.865542146189678e-05, |
| "loss": 0.0103, |
| "num_input_tokens_seen": 1900312, |
| "step": 6685 |
| }, |
| { |
| "epoch": 12.48134328358209, |
| "grad_norm": 0.0495469905436039, |
| "learning_rate": 1.861605980500927e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 1901752, |
| "step": 6690 |
| }, |
| { |
| "epoch": 12.490671641791044, |
| "grad_norm": 3.8302056789398193, |
| "learning_rate": 1.8576715070206992e-05, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 1903192, |
| "step": 6695 |
| }, |
| { |
| "epoch": 12.5, |
| "grad_norm": 4.029078483581543, |
| "learning_rate": 1.853738736178213e-05, |
| "loss": 0.0876, |
| "num_input_tokens_seen": 1904536, |
| "step": 6700 |
| }, |
| { |
| "epoch": 12.509328358208956, |
| "grad_norm": 0.05670896917581558, |
| "learning_rate": 1.849807678398171e-05, |
| "loss": 0.0153, |
| "num_input_tokens_seen": 1906040, |
| "step": 6705 |
| }, |
| { |
| "epoch": 12.51865671641791, |
| "grad_norm": 0.3168881833553314, |
| "learning_rate": 1.845878344100736e-05, |
| "loss": 0.0075, |
| "num_input_tokens_seen": 1907576, |
| "step": 6710 |
| }, |
| { |
| "epoch": 12.527985074626866, |
| "grad_norm": 0.07527896761894226, |
| "learning_rate": 1.8419507437015022e-05, |
| "loss": 0.0272, |
| "num_input_tokens_seen": 1908920, |
| "step": 6715 |
| }, |
| { |
| "epoch": 12.537313432835822, |
| "grad_norm": 2.2421786785125732, |
| "learning_rate": 1.838024887611467e-05, |
| "loss": 0.0064, |
| "num_input_tokens_seen": 1910328, |
| "step": 6720 |
| }, |
| { |
| "epoch": 12.546641791044776, |
| "grad_norm": 1.9473927021026611, |
| "learning_rate": 1.8341007862370056e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 1911736, |
| "step": 6725 |
| }, |
| { |
| "epoch": 12.555970149253731, |
| "grad_norm": 0.22508692741394043, |
| "learning_rate": 1.830178449979841e-05, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 1913176, |
| "step": 6730 |
| }, |
| { |
| "epoch": 12.565298507462687, |
| "grad_norm": 0.1505061239004135, |
| "learning_rate": 1.826257889237017e-05, |
| "loss": 0.015, |
| "num_input_tokens_seen": 1914584, |
| "step": 6735 |
| }, |
| { |
| "epoch": 12.574626865671641, |
| "grad_norm": 0.13239292800426483, |
| "learning_rate": 1.822339114400871e-05, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 1916024, |
| "step": 6740 |
| }, |
| { |
| "epoch": 12.583955223880597, |
| "grad_norm": 0.12179084122180939, |
| "learning_rate": 1.818422135859008e-05, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 1917496, |
| "step": 6745 |
| }, |
| { |
| "epoch": 12.593283582089553, |
| "grad_norm": 0.25675007700920105, |
| "learning_rate": 1.8145069639942697e-05, |
| "loss": 0.0275, |
| "num_input_tokens_seen": 1918968, |
| "step": 6750 |
| }, |
| { |
| "epoch": 12.602611940298507, |
| "grad_norm": 0.5979077219963074, |
| "learning_rate": 1.8105936091847093e-05, |
| "loss": 0.0235, |
| "num_input_tokens_seen": 1920312, |
| "step": 6755 |
| }, |
| { |
| "epoch": 12.611940298507463, |
| "grad_norm": 0.854958176612854, |
| "learning_rate": 1.806682081803564e-05, |
| "loss": 0.0522, |
| "num_input_tokens_seen": 1921656, |
| "step": 6760 |
| }, |
| { |
| "epoch": 12.621268656716419, |
| "grad_norm": 0.11890462040901184, |
| "learning_rate": 1.8027723922192264e-05, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 1922936, |
| "step": 6765 |
| }, |
| { |
| "epoch": 12.630597014925373, |
| "grad_norm": 0.04574445262551308, |
| "learning_rate": 1.798864550795218e-05, |
| "loss": 0.0107, |
| "num_input_tokens_seen": 1924472, |
| "step": 6770 |
| }, |
| { |
| "epoch": 12.639925373134329, |
| "grad_norm": 0.05469915270805359, |
| "learning_rate": 1.7949585678901614e-05, |
| "loss": 0.0217, |
| "num_input_tokens_seen": 1926008, |
| "step": 6775 |
| }, |
| { |
| "epoch": 12.649253731343283, |
| "grad_norm": 0.16914303600788116, |
| "learning_rate": 1.7910544538577522e-05, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 1927448, |
| "step": 6780 |
| }, |
| { |
| "epoch": 12.658582089552239, |
| "grad_norm": 1.6153976917266846, |
| "learning_rate": 1.787152219046733e-05, |
| "loss": 0.0076, |
| "num_input_tokens_seen": 1928856, |
| "step": 6785 |
| }, |
| { |
| "epoch": 12.667910447761194, |
| "grad_norm": 0.2268504798412323, |
| "learning_rate": 1.783251873800863e-05, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 1930200, |
| "step": 6790 |
| }, |
| { |
| "epoch": 12.677238805970148, |
| "grad_norm": 0.23461073637008667, |
| "learning_rate": 1.779353428458896e-05, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 1931512, |
| "step": 6795 |
| }, |
| { |
| "epoch": 12.686567164179104, |
| "grad_norm": 0.545695960521698, |
| "learning_rate": 1.775456893354547e-05, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 1932888, |
| "step": 6800 |
| }, |
| { |
| "epoch": 12.69589552238806, |
| "grad_norm": 2.6102206707000732, |
| "learning_rate": 1.7715622788164683e-05, |
| "loss": 0.0407, |
| "num_input_tokens_seen": 1934424, |
| "step": 6805 |
| }, |
| { |
| "epoch": 12.705223880597014, |
| "grad_norm": 0.6711713075637817, |
| "learning_rate": 1.767669595168223e-05, |
| "loss": 0.0294, |
| "num_input_tokens_seen": 1935960, |
| "step": 6810 |
| }, |
| { |
| "epoch": 12.71455223880597, |
| "grad_norm": 0.21352359652519226, |
| "learning_rate": 1.7637788527282522e-05, |
| "loss": 0.0027, |
| "num_input_tokens_seen": 1937272, |
| "step": 6815 |
| }, |
| { |
| "epoch": 12.723880597014926, |
| "grad_norm": 0.08048262447118759, |
| "learning_rate": 1.7598900618098548e-05, |
| "loss": 0.0088, |
| "num_input_tokens_seen": 1938712, |
| "step": 6820 |
| }, |
| { |
| "epoch": 12.73320895522388, |
| "grad_norm": 4.67939567565918, |
| "learning_rate": 1.7560032327211544e-05, |
| "loss": 0.0666, |
| "num_input_tokens_seen": 1939928, |
| "step": 6825 |
| }, |
| { |
| "epoch": 12.742537313432836, |
| "grad_norm": 0.786533772945404, |
| "learning_rate": 1.7521183757650762e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 1941368, |
| "step": 6830 |
| }, |
| { |
| "epoch": 12.751865671641792, |
| "grad_norm": 3.0949418544769287, |
| "learning_rate": 1.7482355012393177e-05, |
| "loss": 0.0368, |
| "num_input_tokens_seen": 1942968, |
| "step": 6835 |
| }, |
| { |
| "epoch": 12.761194029850746, |
| "grad_norm": 0.1724870800971985, |
| "learning_rate": 1.7443546194363188e-05, |
| "loss": 0.0371, |
| "num_input_tokens_seen": 1944600, |
| "step": 6840 |
| }, |
| { |
| "epoch": 12.770522388059701, |
| "grad_norm": 0.6978046298027039, |
| "learning_rate": 1.740475740643242e-05, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 1945976, |
| "step": 6845 |
| }, |
| { |
| "epoch": 12.779850746268657, |
| "grad_norm": 0.6531873345375061, |
| "learning_rate": 1.7365988751419343e-05, |
| "loss": 0.0481, |
| "num_input_tokens_seen": 1947480, |
| "step": 6850 |
| }, |
| { |
| "epoch": 12.789179104477611, |
| "grad_norm": 1.6857367753982544, |
| "learning_rate": 1.7327240332089128e-05, |
| "loss": 0.0473, |
| "num_input_tokens_seen": 1948792, |
| "step": 6855 |
| }, |
| { |
| "epoch": 12.798507462686567, |
| "grad_norm": 0.03528475761413574, |
| "learning_rate": 1.7288512251153243e-05, |
| "loss": 0.0361, |
| "num_input_tokens_seen": 1950360, |
| "step": 6860 |
| }, |
| { |
| "epoch": 12.807835820895523, |
| "grad_norm": 0.19496102631092072, |
| "learning_rate": 1.7249804611269288e-05, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 1951672, |
| "step": 6865 |
| }, |
| { |
| "epoch": 12.817164179104477, |
| "grad_norm": 0.1142788901925087, |
| "learning_rate": 1.7211117515040676e-05, |
| "loss": 0.0111, |
| "num_input_tokens_seen": 1953016, |
| "step": 6870 |
| }, |
| { |
| "epoch": 12.826492537313433, |
| "grad_norm": 0.1901704967021942, |
| "learning_rate": 1.717245106501632e-05, |
| "loss": 0.0373, |
| "num_input_tokens_seen": 1954552, |
| "step": 6875 |
| }, |
| { |
| "epoch": 12.835820895522389, |
| "grad_norm": 1.7312973737716675, |
| "learning_rate": 1.7133805363690478e-05, |
| "loss": 0.019, |
| "num_input_tokens_seen": 1955800, |
| "step": 6880 |
| }, |
| { |
| "epoch": 12.845149253731343, |
| "grad_norm": 0.1131213903427124, |
| "learning_rate": 1.7095180513502334e-05, |
| "loss": 0.0303, |
| "num_input_tokens_seen": 1957240, |
| "step": 6885 |
| }, |
| { |
| "epoch": 12.854477611940299, |
| "grad_norm": 0.9880719184875488, |
| "learning_rate": 1.705657661683586e-05, |
| "loss": 0.013, |
| "num_input_tokens_seen": 1958712, |
| "step": 6890 |
| }, |
| { |
| "epoch": 12.863805970149254, |
| "grad_norm": 0.34949660301208496, |
| "learning_rate": 1.701799377601946e-05, |
| "loss": 0.0087, |
| "num_input_tokens_seen": 1959992, |
| "step": 6895 |
| }, |
| { |
| "epoch": 12.873134328358208, |
| "grad_norm": 0.08685438334941864, |
| "learning_rate": 1.697943209332572e-05, |
| "loss": 0.0046, |
| "num_input_tokens_seen": 1961304, |
| "step": 6900 |
| }, |
| { |
| "epoch": 12.882462686567164, |
| "grad_norm": 1.894152283668518, |
| "learning_rate": 1.694089167097116e-05, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 1962648, |
| "step": 6905 |
| }, |
| { |
| "epoch": 12.89179104477612, |
| "grad_norm": 0.030265873298048973, |
| "learning_rate": 1.6902372611115917e-05, |
| "loss": 0.04, |
| "num_input_tokens_seen": 1963928, |
| "step": 6910 |
| }, |
| { |
| "epoch": 12.901119402985074, |
| "grad_norm": 0.3923504054546356, |
| "learning_rate": 1.686387501586354e-05, |
| "loss": 0.015, |
| "num_input_tokens_seen": 1965656, |
| "step": 6915 |
| }, |
| { |
| "epoch": 12.91044776119403, |
| "grad_norm": 0.0862976461648941, |
| "learning_rate": 1.6825398987260642e-05, |
| "loss": 0.0546, |
| "num_input_tokens_seen": 1967064, |
| "step": 6920 |
| }, |
| { |
| "epoch": 12.919776119402986, |
| "grad_norm": 0.6729326844215393, |
| "learning_rate": 1.678694462729669e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 1968568, |
| "step": 6925 |
| }, |
| { |
| "epoch": 12.92910447761194, |
| "grad_norm": 0.4637919068336487, |
| "learning_rate": 1.6748512037903725e-05, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 1970200, |
| "step": 6930 |
| }, |
| { |
| "epoch": 12.938432835820896, |
| "grad_norm": 0.26998990774154663, |
| "learning_rate": 1.671010132095604e-05, |
| "loss": 0.0147, |
| "num_input_tokens_seen": 1971736, |
| "step": 6935 |
| }, |
| { |
| "epoch": 12.947761194029852, |
| "grad_norm": 0.5264920592308044, |
| "learning_rate": 1.6671712578269997e-05, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 1972984, |
| "step": 6940 |
| }, |
| { |
| "epoch": 12.957089552238806, |
| "grad_norm": 0.21926730871200562, |
| "learning_rate": 1.663334591160368e-05, |
| "loss": 0.0243, |
| "num_input_tokens_seen": 1974552, |
| "step": 6945 |
| }, |
| { |
| "epoch": 12.966417910447761, |
| "grad_norm": 0.1628131866455078, |
| "learning_rate": 1.659500142265666e-05, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 1975864, |
| "step": 6950 |
| }, |
| { |
| "epoch": 12.975746268656717, |
| "grad_norm": 0.08070550113916397, |
| "learning_rate": 1.655667921306973e-05, |
| "loss": 0.0304, |
| "num_input_tokens_seen": 1977240, |
| "step": 6955 |
| }, |
| { |
| "epoch": 12.985074626865671, |
| "grad_norm": 0.154098778963089, |
| "learning_rate": 1.6518379384424616e-05, |
| "loss": 0.011, |
| "num_input_tokens_seen": 1978584, |
| "step": 6960 |
| }, |
| { |
| "epoch": 12.994402985074627, |
| "grad_norm": 2.0057332515716553, |
| "learning_rate": 1.6480102038243735e-05, |
| "loss": 0.0768, |
| "num_input_tokens_seen": 1979832, |
| "step": 6965 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 1.2252000570297241, |
| "eval_runtime": 2.9008, |
| "eval_samples_per_second": 82.046, |
| "eval_steps_per_second": 20.684, |
| "num_input_tokens_seen": 1980376, |
| "step": 6968 |
| }, |
| { |
| "epoch": 13.003731343283581, |
| "grad_norm": 0.1637083888053894, |
| "learning_rate": 1.644184727598988e-05, |
| "loss": 0.0477, |
| "num_input_tokens_seen": 1981048, |
| "step": 6970 |
| }, |
| { |
| "epoch": 13.013059701492537, |
| "grad_norm": 0.8645164370536804, |
| "learning_rate": 1.640361519906602e-05, |
| "loss": 0.0773, |
| "num_input_tokens_seen": 1982328, |
| "step": 6975 |
| }, |
| { |
| "epoch": 13.022388059701493, |
| "grad_norm": 0.15362635254859924, |
| "learning_rate": 1.6365405908814946e-05, |
| "loss": 0.0575, |
| "num_input_tokens_seen": 1983640, |
| "step": 6980 |
| }, |
| { |
| "epoch": 13.031716417910447, |
| "grad_norm": 0.10971387475728989, |
| "learning_rate": 1.6327219506519083e-05, |
| "loss": 0.0017, |
| "num_input_tokens_seen": 1985048, |
| "step": 6985 |
| }, |
| { |
| "epoch": 13.041044776119403, |
| "grad_norm": 2.1165037155151367, |
| "learning_rate": 1.6289056093400178e-05, |
| "loss": 0.0275, |
| "num_input_tokens_seen": 1986392, |
| "step": 6990 |
| }, |
| { |
| "epoch": 13.050373134328359, |
| "grad_norm": 0.2167270928621292, |
| "learning_rate": 1.6250915770619023e-05, |
| "loss": 0.0567, |
| "num_input_tokens_seen": 1987704, |
| "step": 6995 |
| }, |
| { |
| "epoch": 13.059701492537313, |
| "grad_norm": 0.19175602495670319, |
| "learning_rate": 1.6212798639275233e-05, |
| "loss": 0.0035, |
| "num_input_tokens_seen": 1989144, |
| "step": 7000 |
| }, |
| { |
| "epoch": 13.069029850746269, |
| "grad_norm": 0.13278843462467194, |
| "learning_rate": 1.617470480040692e-05, |
| "loss": 0.022, |
| "num_input_tokens_seen": 1990520, |
| "step": 7005 |
| }, |
| { |
| "epoch": 13.078358208955224, |
| "grad_norm": 2.491051435470581, |
| "learning_rate": 1.613663435499047e-05, |
| "loss": 0.0108, |
| "num_input_tokens_seen": 1992248, |
| "step": 7010 |
| }, |
| { |
| "epoch": 13.087686567164178, |
| "grad_norm": 0.10960175096988678, |
| "learning_rate": 1.609858740394026e-05, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 1993720, |
| "step": 7015 |
| }, |
| { |
| "epoch": 13.097014925373134, |
| "grad_norm": 0.3000943660736084, |
| "learning_rate": 1.6060564048108383e-05, |
| "loss": 0.0244, |
| "num_input_tokens_seen": 1995160, |
| "step": 7020 |
| }, |
| { |
| "epoch": 13.10634328358209, |
| "grad_norm": 0.09903822094202042, |
| "learning_rate": 1.6022564388284392e-05, |
| "loss": 0.0027, |
| "num_input_tokens_seen": 1996728, |
| "step": 7025 |
| }, |
| { |
| "epoch": 13.115671641791044, |
| "grad_norm": 0.011722357012331486, |
| "learning_rate": 1.598458852519502e-05, |
| "loss": 0.0024, |
| "num_input_tokens_seen": 1998168, |
| "step": 7030 |
| }, |
| { |
| "epoch": 13.125, |
| "grad_norm": 0.1223590299487114, |
| "learning_rate": 1.594663655950394e-05, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 1999640, |
| "step": 7035 |
| }, |
| { |
| "epoch": 13.134328358208956, |
| "grad_norm": 0.26209792494773865, |
| "learning_rate": 1.590870859181146e-05, |
| "loss": 0.0016, |
| "num_input_tokens_seen": 2000888, |
| "step": 7040 |
| }, |
| { |
| "epoch": 13.14365671641791, |
| "grad_norm": 0.07664523273706436, |
| "learning_rate": 1.5870804722654275e-05, |
| "loss": 0.0044, |
| "num_input_tokens_seen": 2002232, |
| "step": 7045 |
| }, |
| { |
| "epoch": 13.152985074626866, |
| "grad_norm": 2.964474678039551, |
| "learning_rate": 1.5832925052505235e-05, |
| "loss": 0.0126, |
| "num_input_tokens_seen": 2003480, |
| "step": 7050 |
| }, |
| { |
| "epoch": 13.162313432835822, |
| "grad_norm": 0.07859363406896591, |
| "learning_rate": 1.579506968177299e-05, |
| "loss": 0.0074, |
| "num_input_tokens_seen": 2004696, |
| "step": 7055 |
| }, |
| { |
| "epoch": 13.171641791044776, |
| "grad_norm": 0.055025290697813034, |
| "learning_rate": 1.575723871080184e-05, |
| "loss": 0.043, |
| "num_input_tokens_seen": 2006040, |
| "step": 7060 |
| }, |
| { |
| "epoch": 13.180970149253731, |
| "grad_norm": 0.03810274973511696, |
| "learning_rate": 1.5719432239871347e-05, |
| "loss": 0.0019, |
| "num_input_tokens_seen": 2007608, |
| "step": 7065 |
| }, |
| { |
| "epoch": 13.190298507462687, |
| "grad_norm": 0.11981242150068283, |
| "learning_rate": 1.5681650369196165e-05, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 2009112, |
| "step": 7070 |
| }, |
| { |
| "epoch": 13.199626865671641, |
| "grad_norm": 2.8463268280029297, |
| "learning_rate": 1.5643893198925764e-05, |
| "loss": 0.0421, |
| "num_input_tokens_seen": 2010424, |
| "step": 7075 |
| }, |
| { |
| "epoch": 13.208955223880597, |
| "grad_norm": 0.06994317471981049, |
| "learning_rate": 1.5606160829144066e-05, |
| "loss": 0.0377, |
| "num_input_tokens_seen": 2011768, |
| "step": 7080 |
| }, |
| { |
| "epoch": 13.218283582089553, |
| "grad_norm": 3.862454891204834, |
| "learning_rate": 1.5568453359869334e-05, |
| "loss": 0.0884, |
| "num_input_tokens_seen": 2013208, |
| "step": 7085 |
| }, |
| { |
| "epoch": 13.227611940298507, |
| "grad_norm": 0.056930139660835266, |
| "learning_rate": 1.5530770891053763e-05, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 2014616, |
| "step": 7090 |
| }, |
| { |
| "epoch": 13.236940298507463, |
| "grad_norm": 0.08053652942180634, |
| "learning_rate": 1.5493113522583334e-05, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 2016088, |
| "step": 7095 |
| }, |
| { |
| "epoch": 13.246268656716419, |
| "grad_norm": 0.1629336178302765, |
| "learning_rate": 1.5455481354277435e-05, |
| "loss": 0.0403, |
| "num_input_tokens_seen": 2017368, |
| "step": 7100 |
| }, |
| { |
| "epoch": 13.255597014925373, |
| "grad_norm": 1.2709959745407104, |
| "learning_rate": 1.5417874485888706e-05, |
| "loss": 0.0171, |
| "num_input_tokens_seen": 2018680, |
| "step": 7105 |
| }, |
| { |
| "epoch": 13.264925373134329, |
| "grad_norm": 0.9837348461151123, |
| "learning_rate": 1.53802930171027e-05, |
| "loss": 0.0254, |
| "num_input_tokens_seen": 2020248, |
| "step": 7110 |
| }, |
| { |
| "epoch": 13.274253731343283, |
| "grad_norm": 1.2628322839736938, |
| "learning_rate": 1.5342737047537643e-05, |
| "loss": 0.007, |
| "num_input_tokens_seen": 2021560, |
| "step": 7115 |
| }, |
| { |
| "epoch": 13.283582089552239, |
| "grad_norm": 0.13153661787509918, |
| "learning_rate": 1.5305206676744187e-05, |
| "loss": 0.0017, |
| "num_input_tokens_seen": 2023032, |
| "step": 7120 |
| }, |
| { |
| "epoch": 13.292910447761194, |
| "grad_norm": 2.554349184036255, |
| "learning_rate": 1.5267702004205083e-05, |
| "loss": 0.011, |
| "num_input_tokens_seen": 2024600, |
| "step": 7125 |
| }, |
| { |
| "epoch": 13.302238805970148, |
| "grad_norm": 0.06674165278673172, |
| "learning_rate": 1.5230223129335019e-05, |
| "loss": 0.0334, |
| "num_input_tokens_seen": 2025976, |
| "step": 7130 |
| }, |
| { |
| "epoch": 13.311567164179104, |
| "grad_norm": 0.6052500605583191, |
| "learning_rate": 1.5192770151480278e-05, |
| "loss": 0.0546, |
| "num_input_tokens_seen": 2027576, |
| "step": 7135 |
| }, |
| { |
| "epoch": 13.32089552238806, |
| "grad_norm": 0.9598552584648132, |
| "learning_rate": 1.5155343169918485e-05, |
| "loss": 0.025, |
| "num_input_tokens_seen": 2029112, |
| "step": 7140 |
| }, |
| { |
| "epoch": 13.330223880597014, |
| "grad_norm": 0.09493947774171829, |
| "learning_rate": 1.5117942283858369e-05, |
| "loss": 0.0373, |
| "num_input_tokens_seen": 2030552, |
| "step": 7145 |
| }, |
| { |
| "epoch": 13.33955223880597, |
| "grad_norm": 0.055504944175481796, |
| "learning_rate": 1.5080567592439479e-05, |
| "loss": 0.0304, |
| "num_input_tokens_seen": 2031864, |
| "step": 7150 |
| }, |
| { |
| "epoch": 13.348880597014926, |
| "grad_norm": 0.05839948356151581, |
| "learning_rate": 1.5043219194731945e-05, |
| "loss": 0.0305, |
| "num_input_tokens_seen": 2033240, |
| "step": 7155 |
| }, |
| { |
| "epoch": 13.35820895522388, |
| "grad_norm": 1.0808312892913818, |
| "learning_rate": 1.5005897189736173e-05, |
| "loss": 0.0557, |
| "num_input_tokens_seen": 2034552, |
| "step": 7160 |
| }, |
| { |
| "epoch": 13.367537313432836, |
| "grad_norm": 0.08113997429609299, |
| "learning_rate": 1.4968601676382634e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 2035992, |
| "step": 7165 |
| }, |
| { |
| "epoch": 13.376865671641792, |
| "grad_norm": 0.1014963835477829, |
| "learning_rate": 1.4931332753531574e-05, |
| "loss": 0.0142, |
| "num_input_tokens_seen": 2037272, |
| "step": 7170 |
| }, |
| { |
| "epoch": 13.386194029850746, |
| "grad_norm": 6.284354209899902, |
| "learning_rate": 1.4894090519972731e-05, |
| "loss": 0.0516, |
| "num_input_tokens_seen": 2038776, |
| "step": 7175 |
| }, |
| { |
| "epoch": 13.395522388059701, |
| "grad_norm": 0.07496251165866852, |
| "learning_rate": 1.4856875074425131e-05, |
| "loss": 0.0546, |
| "num_input_tokens_seen": 2040152, |
| "step": 7180 |
| }, |
| { |
| "epoch": 13.404850746268657, |
| "grad_norm": 0.04527842998504639, |
| "learning_rate": 1.4819686515536762e-05, |
| "loss": 0.0107, |
| "num_input_tokens_seen": 2041592, |
| "step": 7185 |
| }, |
| { |
| "epoch": 13.414179104477611, |
| "grad_norm": 0.10157701373100281, |
| "learning_rate": 1.4782524941884366e-05, |
| "loss": 0.0246, |
| "num_input_tokens_seen": 2042936, |
| "step": 7190 |
| }, |
| { |
| "epoch": 13.423507462686567, |
| "grad_norm": 0.050505075603723526, |
| "learning_rate": 1.4745390451973146e-05, |
| "loss": 0.049, |
| "num_input_tokens_seen": 2044344, |
| "step": 7195 |
| }, |
| { |
| "epoch": 13.432835820895523, |
| "grad_norm": 1.9053412675857544, |
| "learning_rate": 1.4708283144236498e-05, |
| "loss": 0.0263, |
| "num_input_tokens_seen": 2045816, |
| "step": 7200 |
| }, |
| { |
| "epoch": 13.442164179104477, |
| "grad_norm": 0.045376043766736984, |
| "learning_rate": 1.4671203117035799e-05, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 2047160, |
| "step": 7205 |
| }, |
| { |
| "epoch": 13.451492537313433, |
| "grad_norm": 0.03770196810364723, |
| "learning_rate": 1.4634150468660073e-05, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 2048408, |
| "step": 7210 |
| }, |
| { |
| "epoch": 13.460820895522389, |
| "grad_norm": 0.04605880379676819, |
| "learning_rate": 1.4597125297325811e-05, |
| "loss": 0.0066, |
| "num_input_tokens_seen": 2049752, |
| "step": 7215 |
| }, |
| { |
| "epoch": 13.470149253731343, |
| "grad_norm": 0.1393289417028427, |
| "learning_rate": 1.4560127701176635e-05, |
| "loss": 0.0239, |
| "num_input_tokens_seen": 2051192, |
| "step": 7220 |
| }, |
| { |
| "epoch": 13.479477611940299, |
| "grad_norm": 0.12368310987949371, |
| "learning_rate": 1.452315777828308e-05, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 2052760, |
| "step": 7225 |
| }, |
| { |
| "epoch": 13.488805970149254, |
| "grad_norm": 0.6569920182228088, |
| "learning_rate": 1.4486215626642364e-05, |
| "loss": 0.0473, |
| "num_input_tokens_seen": 2054328, |
| "step": 7230 |
| }, |
| { |
| "epoch": 13.498134328358208, |
| "grad_norm": 0.6476588845252991, |
| "learning_rate": 1.4449301344178037e-05, |
| "loss": 0.0426, |
| "num_input_tokens_seen": 2055736, |
| "step": 7235 |
| }, |
| { |
| "epoch": 13.507462686567164, |
| "grad_norm": 0.022660747170448303, |
| "learning_rate": 1.4412415028739806e-05, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 2057112, |
| "step": 7240 |
| }, |
| { |
| "epoch": 13.51679104477612, |
| "grad_norm": 0.71531081199646, |
| "learning_rate": 1.4375556778103227e-05, |
| "loss": 0.0019, |
| "num_input_tokens_seen": 2058616, |
| "step": 7245 |
| }, |
| { |
| "epoch": 13.526119402985074, |
| "grad_norm": 0.05053507536649704, |
| "learning_rate": 1.4338726689969495e-05, |
| "loss": 0.051, |
| "num_input_tokens_seen": 2060024, |
| "step": 7250 |
| }, |
| { |
| "epoch": 13.53544776119403, |
| "grad_norm": 0.0691198855638504, |
| "learning_rate": 1.4301924861965122e-05, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 2061208, |
| "step": 7255 |
| }, |
| { |
| "epoch": 13.544776119402986, |
| "grad_norm": 1.6907533407211304, |
| "learning_rate": 1.4265151391641719e-05, |
| "loss": 0.0302, |
| "num_input_tokens_seen": 2062808, |
| "step": 7260 |
| }, |
| { |
| "epoch": 13.55410447761194, |
| "grad_norm": 0.7497798204421997, |
| "learning_rate": 1.4228406376475742e-05, |
| "loss": 0.047, |
| "num_input_tokens_seen": 2064312, |
| "step": 7265 |
| }, |
| { |
| "epoch": 13.563432835820896, |
| "grad_norm": 1.0596312284469604, |
| "learning_rate": 1.4191689913868206e-05, |
| "loss": 0.029, |
| "num_input_tokens_seen": 2065816, |
| "step": 7270 |
| }, |
| { |
| "epoch": 13.572761194029852, |
| "grad_norm": 0.1084161102771759, |
| "learning_rate": 1.4155002101144443e-05, |
| "loss": 0.0096, |
| "num_input_tokens_seen": 2067352, |
| "step": 7275 |
| }, |
| { |
| "epoch": 13.582089552238806, |
| "grad_norm": 0.06317305564880371, |
| "learning_rate": 1.4118343035553836e-05, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2068856, |
| "step": 7280 |
| }, |
| { |
| "epoch": 13.591417910447761, |
| "grad_norm": 1.4339714050292969, |
| "learning_rate": 1.4081712814269593e-05, |
| "loss": 0.0095, |
| "num_input_tokens_seen": 2070200, |
| "step": 7285 |
| }, |
| { |
| "epoch": 13.600746268656717, |
| "grad_norm": 0.13171721994876862, |
| "learning_rate": 1.4045111534388433e-05, |
| "loss": 0.0245, |
| "num_input_tokens_seen": 2071608, |
| "step": 7290 |
| }, |
| { |
| "epoch": 13.610074626865671, |
| "grad_norm": 1.2379400730133057, |
| "learning_rate": 1.4008539292930368e-05, |
| "loss": 0.0852, |
| "num_input_tokens_seen": 2073080, |
| "step": 7295 |
| }, |
| { |
| "epoch": 13.619402985074627, |
| "grad_norm": 0.30336299538612366, |
| "learning_rate": 1.3971996186838451e-05, |
| "loss": 0.02, |
| "num_input_tokens_seen": 2074616, |
| "step": 7300 |
| }, |
| { |
| "epoch": 13.628731343283581, |
| "grad_norm": 0.16188789904117584, |
| "learning_rate": 1.3935482312978492e-05, |
| "loss": 0.0126, |
| "num_input_tokens_seen": 2075992, |
| "step": 7305 |
| }, |
| { |
| "epoch": 13.638059701492537, |
| "grad_norm": 0.11073967814445496, |
| "learning_rate": 1.389899776813881e-05, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 2077336, |
| "step": 7310 |
| }, |
| { |
| "epoch": 13.647388059701493, |
| "grad_norm": 0.08337308466434479, |
| "learning_rate": 1.3862542649030002e-05, |
| "loss": 0.0472, |
| "num_input_tokens_seen": 2078808, |
| "step": 7315 |
| }, |
| { |
| "epoch": 13.656716417910447, |
| "grad_norm": 0.06680534034967422, |
| "learning_rate": 1.3826117052284646e-05, |
| "loss": 0.0017, |
| "num_input_tokens_seen": 2080248, |
| "step": 7320 |
| }, |
| { |
| "epoch": 13.666044776119403, |
| "grad_norm": 1.4123318195343018, |
| "learning_rate": 1.3789721074457063e-05, |
| "loss": 0.024, |
| "num_input_tokens_seen": 2081624, |
| "step": 7325 |
| }, |
| { |
| "epoch": 13.675373134328359, |
| "grad_norm": 0.08048944920301437, |
| "learning_rate": 1.3753354812023067e-05, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 2083192, |
| "step": 7330 |
| }, |
| { |
| "epoch": 13.684701492537313, |
| "grad_norm": 0.23001337051391602, |
| "learning_rate": 1.3717018361379719e-05, |
| "loss": 0.0079, |
| "num_input_tokens_seen": 2084696, |
| "step": 7335 |
| }, |
| { |
| "epoch": 13.694029850746269, |
| "grad_norm": 0.14615000784397125, |
| "learning_rate": 1.3680711818845039e-05, |
| "loss": 0.0146, |
| "num_input_tokens_seen": 2086104, |
| "step": 7340 |
| }, |
| { |
| "epoch": 13.703358208955224, |
| "grad_norm": 2.0742828845977783, |
| "learning_rate": 1.3644435280657764e-05, |
| "loss": 0.0317, |
| "num_input_tokens_seen": 2087416, |
| "step": 7345 |
| }, |
| { |
| "epoch": 13.712686567164178, |
| "grad_norm": 0.03604760766029358, |
| "learning_rate": 1.3608188842977127e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 2088760, |
| "step": 7350 |
| }, |
| { |
| "epoch": 13.722014925373134, |
| "grad_norm": 0.3438225984573364, |
| "learning_rate": 1.3571972601882544e-05, |
| "loss": 0.0075, |
| "num_input_tokens_seen": 2090200, |
| "step": 7355 |
| }, |
| { |
| "epoch": 13.73134328358209, |
| "grad_norm": 0.08444015681743622, |
| "learning_rate": 1.3535786653373397e-05, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 2091768, |
| "step": 7360 |
| }, |
| { |
| "epoch": 13.740671641791044, |
| "grad_norm": 1.4621909856796265, |
| "learning_rate": 1.3499631093368764e-05, |
| "loss": 0.007, |
| "num_input_tokens_seen": 2093400, |
| "step": 7365 |
| }, |
| { |
| "epoch": 13.75, |
| "grad_norm": 1.5778751373291016, |
| "learning_rate": 1.3463506017707197e-05, |
| "loss": 0.0449, |
| "num_input_tokens_seen": 2094680, |
| "step": 7370 |
| }, |
| { |
| "epoch": 13.759328358208956, |
| "grad_norm": 0.13752609491348267, |
| "learning_rate": 1.3427411522146416e-05, |
| "loss": 0.0385, |
| "num_input_tokens_seen": 2096152, |
| "step": 7375 |
| }, |
| { |
| "epoch": 13.76865671641791, |
| "grad_norm": 0.7393186688423157, |
| "learning_rate": 1.3391347702363078e-05, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 2097784, |
| "step": 7380 |
| }, |
| { |
| "epoch": 13.777985074626866, |
| "grad_norm": 2.2789297103881836, |
| "learning_rate": 1.3355314653952555e-05, |
| "loss": 0.0635, |
| "num_input_tokens_seen": 2099000, |
| "step": 7385 |
| }, |
| { |
| "epoch": 13.787313432835822, |
| "grad_norm": 0.37225502729415894, |
| "learning_rate": 1.3319312472428636e-05, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 2100504, |
| "step": 7390 |
| }, |
| { |
| "epoch": 13.796641791044776, |
| "grad_norm": 0.06163875013589859, |
| "learning_rate": 1.328334125322328e-05, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 2101784, |
| "step": 7395 |
| }, |
| { |
| "epoch": 13.805970149253731, |
| "grad_norm": 1.3379021883010864, |
| "learning_rate": 1.3247401091686379e-05, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 2103192, |
| "step": 7400 |
| }, |
| { |
| "epoch": 13.815298507462687, |
| "grad_norm": 1.2568506002426147, |
| "learning_rate": 1.3211492083085511e-05, |
| "loss": 0.0298, |
| "num_input_tokens_seen": 2104504, |
| "step": 7405 |
| }, |
| { |
| "epoch": 13.824626865671641, |
| "grad_norm": 0.10658665001392365, |
| "learning_rate": 1.317561432260569e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 2105912, |
| "step": 7410 |
| }, |
| { |
| "epoch": 13.833955223880597, |
| "grad_norm": 2.1878490447998047, |
| "learning_rate": 1.3139767905349044e-05, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 2107320, |
| "step": 7415 |
| }, |
| { |
| "epoch": 13.843283582089553, |
| "grad_norm": 0.08159002661705017, |
| "learning_rate": 1.3103952926334678e-05, |
| "loss": 0.0396, |
| "num_input_tokens_seen": 2108728, |
| "step": 7420 |
| }, |
| { |
| "epoch": 13.852611940298507, |
| "grad_norm": 0.21240752935409546, |
| "learning_rate": 1.3068169480498333e-05, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 2109944, |
| "step": 7425 |
| }, |
| { |
| "epoch": 13.861940298507463, |
| "grad_norm": 5.079789161682129, |
| "learning_rate": 1.3032417662692153e-05, |
| "loss": 0.1076, |
| "num_input_tokens_seen": 2111384, |
| "step": 7430 |
| }, |
| { |
| "epoch": 13.871268656716419, |
| "grad_norm": 0.08404181897640228, |
| "learning_rate": 1.2996697567684485e-05, |
| "loss": 0.0062, |
| "num_input_tokens_seen": 2112792, |
| "step": 7435 |
| }, |
| { |
| "epoch": 13.880597014925373, |
| "grad_norm": 0.07029449939727783, |
| "learning_rate": 1.2961009290159549e-05, |
| "loss": 0.0301, |
| "num_input_tokens_seen": 2114552, |
| "step": 7440 |
| }, |
| { |
| "epoch": 13.889925373134329, |
| "grad_norm": 0.08538229763507843, |
| "learning_rate": 1.292535292471726e-05, |
| "loss": 0.0047, |
| "num_input_tokens_seen": 2115928, |
| "step": 7445 |
| }, |
| { |
| "epoch": 13.899253731343283, |
| "grad_norm": 0.04317712038755417, |
| "learning_rate": 1.2889728565872888e-05, |
| "loss": 0.021, |
| "num_input_tokens_seen": 2117368, |
| "step": 7450 |
| }, |
| { |
| "epoch": 13.908582089552239, |
| "grad_norm": 0.23353944718837738, |
| "learning_rate": 1.2854136308056933e-05, |
| "loss": 0.0056, |
| "num_input_tokens_seen": 2118584, |
| "step": 7455 |
| }, |
| { |
| "epoch": 13.917910447761194, |
| "grad_norm": 0.10192212462425232, |
| "learning_rate": 1.281857624561474e-05, |
| "loss": 0.0255, |
| "num_input_tokens_seen": 2120056, |
| "step": 7460 |
| }, |
| { |
| "epoch": 13.927238805970148, |
| "grad_norm": 0.1365603506565094, |
| "learning_rate": 1.2783048472806363e-05, |
| "loss": 0.024, |
| "num_input_tokens_seen": 2121528, |
| "step": 7465 |
| }, |
| { |
| "epoch": 13.936567164179104, |
| "grad_norm": 1.0852069854736328, |
| "learning_rate": 1.274755308380624e-05, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 2123000, |
| "step": 7470 |
| }, |
| { |
| "epoch": 13.94589552238806, |
| "grad_norm": 2.4422950744628906, |
| "learning_rate": 1.2712090172702951e-05, |
| "loss": 0.0428, |
| "num_input_tokens_seen": 2124504, |
| "step": 7475 |
| }, |
| { |
| "epoch": 13.955223880597014, |
| "grad_norm": 0.10498291999101639, |
| "learning_rate": 1.2676659833499044e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 2125816, |
| "step": 7480 |
| }, |
| { |
| "epoch": 13.96455223880597, |
| "grad_norm": 0.05799134075641632, |
| "learning_rate": 1.2641262160110645e-05, |
| "loss": 0.0476, |
| "num_input_tokens_seen": 2127096, |
| "step": 7485 |
| }, |
| { |
| "epoch": 13.973880597014926, |
| "grad_norm": 2.333284854888916, |
| "learning_rate": 1.260589724636736e-05, |
| "loss": 0.0267, |
| "num_input_tokens_seen": 2128472, |
| "step": 7490 |
| }, |
| { |
| "epoch": 13.98320895522388, |
| "grad_norm": 0.11944572627544403, |
| "learning_rate": 1.2570565186011946e-05, |
| "loss": 0.0319, |
| "num_input_tokens_seen": 2130040, |
| "step": 7495 |
| }, |
| { |
| "epoch": 13.992537313432836, |
| "grad_norm": 1.0807158946990967, |
| "learning_rate": 1.2535266072700052e-05, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 2131608, |
| "step": 7500 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 1.2389403581619263, |
| "eval_runtime": 2.9122, |
| "eval_samples_per_second": 81.726, |
| "eval_steps_per_second": 20.603, |
| "num_input_tokens_seen": 2132544, |
| "step": 7504 |
| }, |
| { |
| "epoch": 14.001865671641792, |
| "grad_norm": 0.3519318103790283, |
| "learning_rate": 1.2500000000000006e-05, |
| "loss": 0.0073, |
| "num_input_tokens_seen": 2132800, |
| "step": 7505 |
| }, |
| { |
| "epoch": 14.011194029850746, |
| "grad_norm": 0.08003352582454681, |
| "learning_rate": 1.2464767061392543e-05, |
| "loss": 0.0548, |
| "num_input_tokens_seen": 2134080, |
| "step": 7510 |
| }, |
| { |
| "epoch": 14.020522388059701, |
| "grad_norm": 0.12272654473781586, |
| "learning_rate": 1.2429567350270596e-05, |
| "loss": 0.0284, |
| "num_input_tokens_seen": 2135520, |
| "step": 7515 |
| }, |
| { |
| "epoch": 14.029850746268657, |
| "grad_norm": 0.36942487955093384, |
| "learning_rate": 1.2394400959939001e-05, |
| "loss": 0.0021, |
| "num_input_tokens_seen": 2136864, |
| "step": 7520 |
| }, |
| { |
| "epoch": 14.039179104477611, |
| "grad_norm": 0.06986375153064728, |
| "learning_rate": 1.2359267983614256e-05, |
| "loss": 0.018, |
| "num_input_tokens_seen": 2138176, |
| "step": 7525 |
| }, |
| { |
| "epoch": 14.048507462686567, |
| "grad_norm": 0.8765741586685181, |
| "learning_rate": 1.2324168514424328e-05, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 2139520, |
| "step": 7530 |
| }, |
| { |
| "epoch": 14.057835820895523, |
| "grad_norm": 0.2672477066516876, |
| "learning_rate": 1.2289102645408329e-05, |
| "loss": 0.0041, |
| "num_input_tokens_seen": 2141344, |
| "step": 7535 |
| }, |
| { |
| "epoch": 14.067164179104477, |
| "grad_norm": 0.06005933880805969, |
| "learning_rate": 1.2254070469516324e-05, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 2142720, |
| "step": 7540 |
| }, |
| { |
| "epoch": 14.076492537313433, |
| "grad_norm": 0.43129757046699524, |
| "learning_rate": 1.2219072079609045e-05, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 2144032, |
| "step": 7545 |
| }, |
| { |
| "epoch": 14.085820895522389, |
| "grad_norm": 0.0365731418132782, |
| "learning_rate": 1.2184107568457709e-05, |
| "loss": 0.0237, |
| "num_input_tokens_seen": 2145408, |
| "step": 7550 |
| }, |
| { |
| "epoch": 14.095149253731343, |
| "grad_norm": 0.12061981856822968, |
| "learning_rate": 1.2149177028743691e-05, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 2146784, |
| "step": 7555 |
| }, |
| { |
| "epoch": 14.104477611940299, |
| "grad_norm": 0.08856409043073654, |
| "learning_rate": 1.2114280553058324e-05, |
| "loss": 0.0058, |
| "num_input_tokens_seen": 2148128, |
| "step": 7560 |
| }, |
| { |
| "epoch": 14.113805970149254, |
| "grad_norm": 2.4051401615142822, |
| "learning_rate": 1.2079418233902667e-05, |
| "loss": 0.0175, |
| "num_input_tokens_seen": 2149600, |
| "step": 7565 |
| }, |
| { |
| "epoch": 14.123134328358208, |
| "grad_norm": 0.04482206329703331, |
| "learning_rate": 1.2044590163687219e-05, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2151168, |
| "step": 7570 |
| }, |
| { |
| "epoch": 14.132462686567164, |
| "grad_norm": 1.0668768882751465, |
| "learning_rate": 1.2009796434731688e-05, |
| "loss": 0.0382, |
| "num_input_tokens_seen": 2152544, |
| "step": 7575 |
| }, |
| { |
| "epoch": 14.14179104477612, |
| "grad_norm": 0.8667140603065491, |
| "learning_rate": 1.1975037139264782e-05, |
| "loss": 0.0473, |
| "num_input_tokens_seen": 2153984, |
| "step": 7580 |
| }, |
| { |
| "epoch": 14.151119402985074, |
| "grad_norm": 0.05873038247227669, |
| "learning_rate": 1.1940312369423917e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 2155200, |
| "step": 7585 |
| }, |
| { |
| "epoch": 14.16044776119403, |
| "grad_norm": 0.8484371304512024, |
| "learning_rate": 1.1905622217254986e-05, |
| "loss": 0.0136, |
| "num_input_tokens_seen": 2156544, |
| "step": 7590 |
| }, |
| { |
| "epoch": 14.169776119402986, |
| "grad_norm": 0.08751197904348373, |
| "learning_rate": 1.1870966774712114e-05, |
| "loss": 0.0152, |
| "num_input_tokens_seen": 2157952, |
| "step": 7595 |
| }, |
| { |
| "epoch": 14.17910447761194, |
| "grad_norm": 0.08112747222185135, |
| "learning_rate": 1.1836346133657459e-05, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 2159616, |
| "step": 7600 |
| }, |
| { |
| "epoch": 14.188432835820896, |
| "grad_norm": 1.1953579187393188, |
| "learning_rate": 1.1801760385860885e-05, |
| "loss": 0.0268, |
| "num_input_tokens_seen": 2161280, |
| "step": 7605 |
| }, |
| { |
| "epoch": 14.197761194029852, |
| "grad_norm": 0.0357501357793808, |
| "learning_rate": 1.1767209622999781e-05, |
| "loss": 0.0316, |
| "num_input_tokens_seen": 2162624, |
| "step": 7610 |
| }, |
| { |
| "epoch": 14.207089552238806, |
| "grad_norm": 0.366653710603714, |
| "learning_rate": 1.1732693936658814e-05, |
| "loss": 0.0261, |
| "num_input_tokens_seen": 2164128, |
| "step": 7615 |
| }, |
| { |
| "epoch": 14.216417910447761, |
| "grad_norm": 1.4579626321792603, |
| "learning_rate": 1.169821341832965e-05, |
| "loss": 0.0239, |
| "num_input_tokens_seen": 2165632, |
| "step": 7620 |
| }, |
| { |
| "epoch": 14.225746268656717, |
| "grad_norm": 0.07542331516742706, |
| "learning_rate": 1.1663768159410748e-05, |
| "loss": 0.0327, |
| "num_input_tokens_seen": 2166944, |
| "step": 7625 |
| }, |
| { |
| "epoch": 14.235074626865671, |
| "grad_norm": 0.04812522232532501, |
| "learning_rate": 1.162935825120709e-05, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 2168448, |
| "step": 7630 |
| }, |
| { |
| "epoch": 14.244402985074627, |
| "grad_norm": 0.06067129597067833, |
| "learning_rate": 1.1594983784929969e-05, |
| "loss": 0.0168, |
| "num_input_tokens_seen": 2169888, |
| "step": 7635 |
| }, |
| { |
| "epoch": 14.253731343283581, |
| "grad_norm": 0.22783125936985016, |
| "learning_rate": 1.1560644851696744e-05, |
| "loss": 0.0306, |
| "num_input_tokens_seen": 2171424, |
| "step": 7640 |
| }, |
| { |
| "epoch": 14.263059701492537, |
| "grad_norm": 0.03479655832052231, |
| "learning_rate": 1.152634154253053e-05, |
| "loss": 0.0053, |
| "num_input_tokens_seen": 2172896, |
| "step": 7645 |
| }, |
| { |
| "epoch": 14.272388059701493, |
| "grad_norm": 0.018343765288591385, |
| "learning_rate": 1.149207394836008e-05, |
| "loss": 0.026, |
| "num_input_tokens_seen": 2174336, |
| "step": 7650 |
| }, |
| { |
| "epoch": 14.281716417910447, |
| "grad_norm": 0.8041732907295227, |
| "learning_rate": 1.1457842160019419e-05, |
| "loss": 0.004, |
| "num_input_tokens_seen": 2175680, |
| "step": 7655 |
| }, |
| { |
| "epoch": 14.291044776119403, |
| "grad_norm": 0.07619409263134003, |
| "learning_rate": 1.1423646268247722e-05, |
| "loss": 0.0389, |
| "num_input_tokens_seen": 2177248, |
| "step": 7660 |
| }, |
| { |
| "epoch": 14.300373134328359, |
| "grad_norm": 0.056708406656980515, |
| "learning_rate": 1.1389486363688934e-05, |
| "loss": 0.0252, |
| "num_input_tokens_seen": 2178688, |
| "step": 7665 |
| }, |
| { |
| "epoch": 14.309701492537313, |
| "grad_norm": 0.14702460169792175, |
| "learning_rate": 1.1355362536891673e-05, |
| "loss": 0.0042, |
| "num_input_tokens_seen": 2180192, |
| "step": 7670 |
| }, |
| { |
| "epoch": 14.319029850746269, |
| "grad_norm": 0.06439556926488876, |
| "learning_rate": 1.1321274878308919e-05, |
| "loss": 0.0045, |
| "num_input_tokens_seen": 2181600, |
| "step": 7675 |
| }, |
| { |
| "epoch": 14.328358208955224, |
| "grad_norm": 0.1918197125196457, |
| "learning_rate": 1.1287223478297724e-05, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 2183168, |
| "step": 7680 |
| }, |
| { |
| "epoch": 14.337686567164178, |
| "grad_norm": 0.016894975677132607, |
| "learning_rate": 1.1253208427119094e-05, |
| "loss": 0.0086, |
| "num_input_tokens_seen": 2184608, |
| "step": 7685 |
| }, |
| { |
| "epoch": 14.347014925373134, |
| "grad_norm": 0.061480239033699036, |
| "learning_rate": 1.1219229814937646e-05, |
| "loss": 0.1233, |
| "num_input_tokens_seen": 2186048, |
| "step": 7690 |
| }, |
| { |
| "epoch": 14.35634328358209, |
| "grad_norm": 2.6116342544555664, |
| "learning_rate": 1.1185287731821429e-05, |
| "loss": 0.0381, |
| "num_input_tokens_seen": 2187328, |
| "step": 7695 |
| }, |
| { |
| "epoch": 14.365671641791044, |
| "grad_norm": 0.12151702493429184, |
| "learning_rate": 1.1151382267741647e-05, |
| "loss": 0.0041, |
| "num_input_tokens_seen": 2188672, |
| "step": 7700 |
| }, |
| { |
| "epoch": 14.375, |
| "grad_norm": 0.19662828743457794, |
| "learning_rate": 1.1117513512572436e-05, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 2190016, |
| "step": 7705 |
| }, |
| { |
| "epoch": 14.384328358208956, |
| "grad_norm": 0.09480198472738266, |
| "learning_rate": 1.108368155609065e-05, |
| "loss": 0.024, |
| "num_input_tokens_seen": 2191488, |
| "step": 7710 |
| }, |
| { |
| "epoch": 14.39365671641791, |
| "grad_norm": 0.07968703657388687, |
| "learning_rate": 1.1049886487975572e-05, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 2192992, |
| "step": 7715 |
| }, |
| { |
| "epoch": 14.402985074626866, |
| "grad_norm": 1.8118599653244019, |
| "learning_rate": 1.1016128397808716e-05, |
| "loss": 0.0067, |
| "num_input_tokens_seen": 2194496, |
| "step": 7720 |
| }, |
| { |
| "epoch": 14.412313432835822, |
| "grad_norm": 0.10556568950414658, |
| "learning_rate": 1.0982407375073573e-05, |
| "loss": 0.0273, |
| "num_input_tokens_seen": 2195936, |
| "step": 7725 |
| }, |
| { |
| "epoch": 14.421641791044776, |
| "grad_norm": 0.03794952109456062, |
| "learning_rate": 1.0948723509155393e-05, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2197216, |
| "step": 7730 |
| }, |
| { |
| "epoch": 14.430970149253731, |
| "grad_norm": 0.5993940234184265, |
| "learning_rate": 1.0915076889340917e-05, |
| "loss": 0.0046, |
| "num_input_tokens_seen": 2198848, |
| "step": 7735 |
| }, |
| { |
| "epoch": 14.440298507462687, |
| "grad_norm": 0.08497757464647293, |
| "learning_rate": 1.0881467604818154e-05, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2200256, |
| "step": 7740 |
| }, |
| { |
| "epoch": 14.449626865671641, |
| "grad_norm": 2.5819053649902344, |
| "learning_rate": 1.0847895744676173e-05, |
| "loss": 0.0672, |
| "num_input_tokens_seen": 2201568, |
| "step": 7745 |
| }, |
| { |
| "epoch": 14.458955223880597, |
| "grad_norm": 0.11283766478300095, |
| "learning_rate": 1.0814361397904812e-05, |
| "loss": 0.0049, |
| "num_input_tokens_seen": 2203040, |
| "step": 7750 |
| }, |
| { |
| "epoch": 14.468283582089553, |
| "grad_norm": 0.062314633280038834, |
| "learning_rate": 1.0780864653394478e-05, |
| "loss": 0.001, |
| "num_input_tokens_seen": 2204384, |
| "step": 7755 |
| }, |
| { |
| "epoch": 14.477611940298507, |
| "grad_norm": 2.370483875274658, |
| "learning_rate": 1.0747405599935929e-05, |
| "loss": 0.0112, |
| "num_input_tokens_seen": 2205600, |
| "step": 7760 |
| }, |
| { |
| "epoch": 14.486940298507463, |
| "grad_norm": 0.6928048133850098, |
| "learning_rate": 1.0713984326219978e-05, |
| "loss": 0.0285, |
| "num_input_tokens_seen": 2207008, |
| "step": 7765 |
| }, |
| { |
| "epoch": 14.496268656716419, |
| "grad_norm": 0.16109126806259155, |
| "learning_rate": 1.0680600920837319e-05, |
| "loss": 0.0144, |
| "num_input_tokens_seen": 2208448, |
| "step": 7770 |
| }, |
| { |
| "epoch": 14.505597014925373, |
| "grad_norm": 1.4980695247650146, |
| "learning_rate": 1.064725547227825e-05, |
| "loss": 0.017, |
| "num_input_tokens_seen": 2209856, |
| "step": 7775 |
| }, |
| { |
| "epoch": 14.514925373134329, |
| "grad_norm": 0.06491298973560333, |
| "learning_rate": 1.0613948068932481e-05, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 2211232, |
| "step": 7780 |
| }, |
| { |
| "epoch": 14.524253731343283, |
| "grad_norm": 0.02034805715084076, |
| "learning_rate": 1.0580678799088848e-05, |
| "loss": 0.0059, |
| "num_input_tokens_seen": 2212672, |
| "step": 7785 |
| }, |
| { |
| "epoch": 14.533582089552239, |
| "grad_norm": 1.9371212720870972, |
| "learning_rate": 1.054744775093511e-05, |
| "loss": 0.0066, |
| "num_input_tokens_seen": 2214112, |
| "step": 7790 |
| }, |
| { |
| "epoch": 14.542910447761194, |
| "grad_norm": 0.9395687580108643, |
| "learning_rate": 1.0514255012557738e-05, |
| "loss": 0.027, |
| "num_input_tokens_seen": 2215584, |
| "step": 7795 |
| }, |
| { |
| "epoch": 14.552238805970148, |
| "grad_norm": 1.229207992553711, |
| "learning_rate": 1.0481100671941618e-05, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 2216992, |
| "step": 7800 |
| }, |
| { |
| "epoch": 14.561567164179104, |
| "grad_norm": 0.03663705661892891, |
| "learning_rate": 1.0447984816969874e-05, |
| "loss": 0.0461, |
| "num_input_tokens_seen": 2218432, |
| "step": 7805 |
| }, |
| { |
| "epoch": 14.57089552238806, |
| "grad_norm": 1.0331707000732422, |
| "learning_rate": 1.0414907535423602e-05, |
| "loss": 0.0034, |
| "num_input_tokens_seen": 2220000, |
| "step": 7810 |
| }, |
| { |
| "epoch": 14.580223880597014, |
| "grad_norm": 2.9640111923217773, |
| "learning_rate": 1.0381868914981673e-05, |
| "loss": 0.0782, |
| "num_input_tokens_seen": 2221280, |
| "step": 7815 |
| }, |
| { |
| "epoch": 14.58955223880597, |
| "grad_norm": 0.12030115723609924, |
| "learning_rate": 1.0348869043220458e-05, |
| "loss": 0.019, |
| "num_input_tokens_seen": 2222656, |
| "step": 7820 |
| }, |
| { |
| "epoch": 14.598880597014926, |
| "grad_norm": 0.09528183192014694, |
| "learning_rate": 1.0315908007613609e-05, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 2224064, |
| "step": 7825 |
| }, |
| { |
| "epoch": 14.60820895522388, |
| "grad_norm": 2.188758373260498, |
| "learning_rate": 1.0282985895531865e-05, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 2225376, |
| "step": 7830 |
| }, |
| { |
| "epoch": 14.617537313432836, |
| "grad_norm": 0.04384111985564232, |
| "learning_rate": 1.0250102794242767e-05, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2226944, |
| "step": 7835 |
| }, |
| { |
| "epoch": 14.626865671641792, |
| "grad_norm": 1.0109224319458008, |
| "learning_rate": 1.0217258790910448e-05, |
| "loss": 0.0561, |
| "num_input_tokens_seen": 2228384, |
| "step": 7840 |
| }, |
| { |
| "epoch": 14.636194029850746, |
| "grad_norm": 0.035871949046850204, |
| "learning_rate": 1.01844539725954e-05, |
| "loss": 0.0327, |
| "num_input_tokens_seen": 2229952, |
| "step": 7845 |
| }, |
| { |
| "epoch": 14.645522388059701, |
| "grad_norm": 0.202130526304245, |
| "learning_rate": 1.0151688426254264e-05, |
| "loss": 0.0118, |
| "num_input_tokens_seen": 2231296, |
| "step": 7850 |
| }, |
| { |
| "epoch": 14.654850746268657, |
| "grad_norm": 2.4090077877044678, |
| "learning_rate": 1.0118962238739586e-05, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 2232704, |
| "step": 7855 |
| }, |
| { |
| "epoch": 14.664179104477611, |
| "grad_norm": 0.39101025462150574, |
| "learning_rate": 1.0086275496799535e-05, |
| "loss": 0.0108, |
| "num_input_tokens_seen": 2234144, |
| "step": 7860 |
| }, |
| { |
| "epoch": 14.673507462686567, |
| "grad_norm": 0.06438241899013519, |
| "learning_rate": 1.0053628287077782e-05, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 2235584, |
| "step": 7865 |
| }, |
| { |
| "epoch": 14.682835820895523, |
| "grad_norm": 0.23394466936588287, |
| "learning_rate": 1.002102069611317e-05, |
| "loss": 0.0072, |
| "num_input_tokens_seen": 2237024, |
| "step": 7870 |
| }, |
| { |
| "epoch": 14.692164179104477, |
| "grad_norm": 0.05994849279522896, |
| "learning_rate": 9.988452810339527e-06, |
| "loss": 0.006, |
| "num_input_tokens_seen": 2238400, |
| "step": 7875 |
| }, |
| { |
| "epoch": 14.701492537313433, |
| "grad_norm": 0.8589292764663696, |
| "learning_rate": 9.955924716085455e-06, |
| "loss": 0.0439, |
| "num_input_tokens_seen": 2239712, |
| "step": 7880 |
| }, |
| { |
| "epoch": 14.710820895522389, |
| "grad_norm": 0.03968377038836479, |
| "learning_rate": 9.923436499574046e-06, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 2241376, |
| "step": 7885 |
| }, |
| { |
| "epoch": 14.720149253731343, |
| "grad_norm": 0.03648172318935394, |
| "learning_rate": 9.89098824692274e-06, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 2242848, |
| "step": 7890 |
| }, |
| { |
| "epoch": 14.729477611940299, |
| "grad_norm": 0.06013834476470947, |
| "learning_rate": 9.858580044142966e-06, |
| "loss": 0.0072, |
| "num_input_tokens_seen": 2244416, |
| "step": 7895 |
| }, |
| { |
| "epoch": 14.738805970149254, |
| "grad_norm": 0.6766325831413269, |
| "learning_rate": 9.826211977140065e-06, |
| "loss": 0.0048, |
| "num_input_tokens_seen": 2245952, |
| "step": 7900 |
| }, |
| { |
| "epoch": 14.748134328358208, |
| "grad_norm": 0.01058275531977415, |
| "learning_rate": 9.793884131712943e-06, |
| "loss": 0.0587, |
| "num_input_tokens_seen": 2247424, |
| "step": 7905 |
| }, |
| { |
| "epoch": 14.757462686567164, |
| "grad_norm": 0.05173706263303757, |
| "learning_rate": 9.761596593553924e-06, |
| "loss": 0.0236, |
| "num_input_tokens_seen": 2248768, |
| "step": 7910 |
| }, |
| { |
| "epoch": 14.76679104477612, |
| "grad_norm": 1.2847206592559814, |
| "learning_rate": 9.72934944824846e-06, |
| "loss": 0.0083, |
| "num_input_tokens_seen": 2250176, |
| "step": 7915 |
| }, |
| { |
| "epoch": 14.776119402985074, |
| "grad_norm": 0.02579890564084053, |
| "learning_rate": 9.69714278127493e-06, |
| "loss": 0.0049, |
| "num_input_tokens_seen": 2251744, |
| "step": 7920 |
| }, |
| { |
| "epoch": 14.78544776119403, |
| "grad_norm": 0.0510295070707798, |
| "learning_rate": 9.664976678004464e-06, |
| "loss": 0.028, |
| "num_input_tokens_seen": 2252992, |
| "step": 7925 |
| }, |
| { |
| "epoch": 14.794776119402986, |
| "grad_norm": 0.8147083520889282, |
| "learning_rate": 9.632851223700593e-06, |
| "loss": 0.0053, |
| "num_input_tokens_seen": 2254336, |
| "step": 7930 |
| }, |
| { |
| "epoch": 14.80410447761194, |
| "grad_norm": 0.027658287435770035, |
| "learning_rate": 9.600766503519154e-06, |
| "loss": 0.0243, |
| "num_input_tokens_seen": 2255616, |
| "step": 7935 |
| }, |
| { |
| "epoch": 14.813432835820896, |
| "grad_norm": 0.02234121784567833, |
| "learning_rate": 9.568722602508009e-06, |
| "loss": 0.0372, |
| "num_input_tokens_seen": 2256896, |
| "step": 7940 |
| }, |
| { |
| "epoch": 14.822761194029852, |
| "grad_norm": 0.06608214229345322, |
| "learning_rate": 9.536719605606795e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2258400, |
| "step": 7945 |
| }, |
| { |
| "epoch": 14.832089552238806, |
| "grad_norm": 0.06717480719089508, |
| "learning_rate": 9.50475759764673e-06, |
| "loss": 0.0347, |
| "num_input_tokens_seen": 2259808, |
| "step": 7950 |
| }, |
| { |
| "epoch": 14.841417910447761, |
| "grad_norm": 0.05631548538804054, |
| "learning_rate": 9.472836663350377e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2261216, |
| "step": 7955 |
| }, |
| { |
| "epoch": 14.850746268656717, |
| "grad_norm": 0.13347527384757996, |
| "learning_rate": 9.440956887331446e-06, |
| "loss": 0.0542, |
| "num_input_tokens_seen": 2262656, |
| "step": 7960 |
| }, |
| { |
| "epoch": 14.860074626865671, |
| "grad_norm": 0.9020296931266785, |
| "learning_rate": 9.40911835409453e-06, |
| "loss": 0.0122, |
| "num_input_tokens_seen": 2264224, |
| "step": 7965 |
| }, |
| { |
| "epoch": 14.869402985074627, |
| "grad_norm": 0.15232621133327484, |
| "learning_rate": 9.37732114803489e-06, |
| "loss": 0.0153, |
| "num_input_tokens_seen": 2265504, |
| "step": 7970 |
| }, |
| { |
| "epoch": 14.878731343283581, |
| "grad_norm": 0.067230224609375, |
| "learning_rate": 9.345565353438268e-06, |
| "loss": 0.005, |
| "num_input_tokens_seen": 2266912, |
| "step": 7975 |
| }, |
| { |
| "epoch": 14.888059701492537, |
| "grad_norm": 0.021031435579061508, |
| "learning_rate": 9.313851054480614e-06, |
| "loss": 0.0455, |
| "num_input_tokens_seen": 2268320, |
| "step": 7980 |
| }, |
| { |
| "epoch": 14.897388059701493, |
| "grad_norm": 1.4675151109695435, |
| "learning_rate": 9.282178335227884e-06, |
| "loss": 0.0646, |
| "num_input_tokens_seen": 2269728, |
| "step": 7985 |
| }, |
| { |
| "epoch": 14.906716417910447, |
| "grad_norm": 0.7634879946708679, |
| "learning_rate": 9.250547279635818e-06, |
| "loss": 0.0378, |
| "num_input_tokens_seen": 2271232, |
| "step": 7990 |
| }, |
| { |
| "epoch": 14.916044776119403, |
| "grad_norm": 1.56741201877594, |
| "learning_rate": 9.218957971549742e-06, |
| "loss": 0.0105, |
| "num_input_tokens_seen": 2272800, |
| "step": 7995 |
| }, |
| { |
| "epoch": 14.925373134328359, |
| "grad_norm": 0.054754115641117096, |
| "learning_rate": 9.187410494704286e-06, |
| "loss": 0.011, |
| "num_input_tokens_seen": 2274048, |
| "step": 8000 |
| }, |
| { |
| "epoch": 14.934701492537313, |
| "grad_norm": 1.0253604650497437, |
| "learning_rate": 9.155904932723202e-06, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 2275296, |
| "step": 8005 |
| }, |
| { |
| "epoch": 14.944029850746269, |
| "grad_norm": 2.142385482788086, |
| "learning_rate": 9.124441369119171e-06, |
| "loss": 0.0657, |
| "num_input_tokens_seen": 2276736, |
| "step": 8010 |
| }, |
| { |
| "epoch": 14.953358208955224, |
| "grad_norm": 2.3308064937591553, |
| "learning_rate": 9.093019887293514e-06, |
| "loss": 0.048, |
| "num_input_tokens_seen": 2278176, |
| "step": 8015 |
| }, |
| { |
| "epoch": 14.962686567164178, |
| "grad_norm": 1.4632657766342163, |
| "learning_rate": 9.061640570536007e-06, |
| "loss": 0.0205, |
| "num_input_tokens_seen": 2279488, |
| "step": 8020 |
| }, |
| { |
| "epoch": 14.972014925373134, |
| "grad_norm": 0.13808825612068176, |
| "learning_rate": 9.030303502024661e-06, |
| "loss": 0.024, |
| "num_input_tokens_seen": 2280768, |
| "step": 8025 |
| }, |
| { |
| "epoch": 14.98134328358209, |
| "grad_norm": 2.6733450889587402, |
| "learning_rate": 8.99900876482552e-06, |
| "loss": 0.0828, |
| "num_input_tokens_seen": 2281984, |
| "step": 8030 |
| }, |
| { |
| "epoch": 14.990671641791044, |
| "grad_norm": 0.7640068531036377, |
| "learning_rate": 8.967756441892395e-06, |
| "loss": 0.0352, |
| "num_input_tokens_seen": 2283296, |
| "step": 8035 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 0.36165326833724976, |
| "learning_rate": 8.93654661606666e-06, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 2284440, |
| "step": 8040 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 1.362017035484314, |
| "eval_runtime": 3.3655, |
| "eval_samples_per_second": 70.717, |
| "eval_steps_per_second": 17.828, |
| "num_input_tokens_seen": 2284440, |
| "step": 8040 |
| }, |
| { |
| "epoch": 15.009328358208956, |
| "grad_norm": 1.36297607421875, |
| "learning_rate": 8.905379370077077e-06, |
| "loss": 0.0421, |
| "num_input_tokens_seen": 2285944, |
| "step": 8045 |
| }, |
| { |
| "epoch": 15.01865671641791, |
| "grad_norm": 0.8399866819381714, |
| "learning_rate": 8.87425478653951e-06, |
| "loss": 0.0463, |
| "num_input_tokens_seen": 2287320, |
| "step": 8050 |
| }, |
| { |
| "epoch": 15.027985074626866, |
| "grad_norm": 0.0843312069773674, |
| "learning_rate": 8.843172947956737e-06, |
| "loss": 0.0038, |
| "num_input_tokens_seen": 2288792, |
| "step": 8055 |
| }, |
| { |
| "epoch": 15.037313432835822, |
| "grad_norm": 0.043519098311662674, |
| "learning_rate": 8.81213393671826e-06, |
| "loss": 0.0016, |
| "num_input_tokens_seen": 2290232, |
| "step": 8060 |
| }, |
| { |
| "epoch": 15.046641791044776, |
| "grad_norm": 0.5979039072990417, |
| "learning_rate": 8.78113783510002e-06, |
| "loss": 0.009, |
| "num_input_tokens_seen": 2291800, |
| "step": 8065 |
| }, |
| { |
| "epoch": 15.055970149253731, |
| "grad_norm": 0.0802803561091423, |
| "learning_rate": 8.75018472526424e-06, |
| "loss": 0.0037, |
| "num_input_tokens_seen": 2293112, |
| "step": 8070 |
| }, |
| { |
| "epoch": 15.065298507462687, |
| "grad_norm": 0.06343238800764084, |
| "learning_rate": 8.719274689259166e-06, |
| "loss": 0.0322, |
| "num_input_tokens_seen": 2294648, |
| "step": 8075 |
| }, |
| { |
| "epoch": 15.074626865671641, |
| "grad_norm": 1.6169513463974, |
| "learning_rate": 8.688407809018895e-06, |
| "loss": 0.0547, |
| "num_input_tokens_seen": 2295928, |
| "step": 8080 |
| }, |
| { |
| "epoch": 15.083955223880597, |
| "grad_norm": 0.04254364222288132, |
| "learning_rate": 8.657584166363103e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 2297368, |
| "step": 8085 |
| }, |
| { |
| "epoch": 15.093283582089553, |
| "grad_norm": 1.4507561922073364, |
| "learning_rate": 8.626803842996856e-06, |
| "loss": 0.0246, |
| "num_input_tokens_seen": 2298808, |
| "step": 8090 |
| }, |
| { |
| "epoch": 15.102611940298507, |
| "grad_norm": 0.05373208224773407, |
| "learning_rate": 8.596066920510417e-06, |
| "loss": 0.0188, |
| "num_input_tokens_seen": 2300216, |
| "step": 8095 |
| }, |
| { |
| "epoch": 15.111940298507463, |
| "grad_norm": 0.6847307682037354, |
| "learning_rate": 8.565373480378977e-06, |
| "loss": 0.0034, |
| "num_input_tokens_seen": 2301656, |
| "step": 8100 |
| }, |
| { |
| "epoch": 15.121268656716419, |
| "grad_norm": 0.03840490058064461, |
| "learning_rate": 8.534723603962497e-06, |
| "loss": 0.0431, |
| "num_input_tokens_seen": 2303192, |
| "step": 8105 |
| }, |
| { |
| "epoch": 15.130597014925373, |
| "grad_norm": 0.08182709664106369, |
| "learning_rate": 8.504117372505416e-06, |
| "loss": 0.0468, |
| "num_input_tokens_seen": 2304504, |
| "step": 8110 |
| }, |
| { |
| "epoch": 15.139925373134329, |
| "grad_norm": 0.7403824925422668, |
| "learning_rate": 8.473554867136524e-06, |
| "loss": 0.0048, |
| "num_input_tokens_seen": 2305720, |
| "step": 8115 |
| }, |
| { |
| "epoch": 15.149253731343283, |
| "grad_norm": 3.1654574871063232, |
| "learning_rate": 8.443036168868709e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 2307160, |
| "step": 8120 |
| }, |
| { |
| "epoch": 15.158582089552239, |
| "grad_norm": 0.14238913357257843, |
| "learning_rate": 8.412561358598693e-06, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 2308536, |
| "step": 8125 |
| }, |
| { |
| "epoch": 15.167910447761194, |
| "grad_norm": 0.2234521061182022, |
| "learning_rate": 8.382130517106907e-06, |
| "loss": 0.0019, |
| "num_input_tokens_seen": 2310168, |
| "step": 8130 |
| }, |
| { |
| "epoch": 15.177238805970148, |
| "grad_norm": 2.252610921859741, |
| "learning_rate": 8.351743725057204e-06, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 2311448, |
| "step": 8135 |
| }, |
| { |
| "epoch": 15.186567164179104, |
| "grad_norm": 0.09281405061483383, |
| "learning_rate": 8.321401062996714e-06, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 2312888, |
| "step": 8140 |
| }, |
| { |
| "epoch": 15.19589552238806, |
| "grad_norm": 0.06658900529146194, |
| "learning_rate": 8.291102611355525e-06, |
| "loss": 0.0317, |
| "num_input_tokens_seen": 2314328, |
| "step": 8145 |
| }, |
| { |
| "epoch": 15.205223880597014, |
| "grad_norm": 0.046269021928310394, |
| "learning_rate": 8.260848450446596e-06, |
| "loss": 0.0157, |
| "num_input_tokens_seen": 2315640, |
| "step": 8150 |
| }, |
| { |
| "epoch": 15.21455223880597, |
| "grad_norm": 0.4967081546783447, |
| "learning_rate": 8.230638660465461e-06, |
| "loss": 0.0347, |
| "num_input_tokens_seen": 2316984, |
| "step": 8155 |
| }, |
| { |
| "epoch": 15.223880597014926, |
| "grad_norm": 0.31126704812049866, |
| "learning_rate": 8.200473321490035e-06, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 2318232, |
| "step": 8160 |
| }, |
| { |
| "epoch": 15.23320895522388, |
| "grad_norm": 0.056040745228528976, |
| "learning_rate": 8.170352513480408e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2319704, |
| "step": 8165 |
| }, |
| { |
| "epoch": 15.242537313432836, |
| "grad_norm": 0.0655110627412796, |
| "learning_rate": 8.140276316278623e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 2321304, |
| "step": 8170 |
| }, |
| { |
| "epoch": 15.251865671641792, |
| "grad_norm": 0.3595825135707855, |
| "learning_rate": 8.110244809608495e-06, |
| "loss": 0.0178, |
| "num_input_tokens_seen": 2322456, |
| "step": 8175 |
| }, |
| { |
| "epoch": 15.261194029850746, |
| "grad_norm": 0.03200814872980118, |
| "learning_rate": 8.080258073075357e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2324024, |
| "step": 8180 |
| }, |
| { |
| "epoch": 15.270522388059701, |
| "grad_norm": 0.021252451464533806, |
| "learning_rate": 8.050316186165863e-06, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 2325496, |
| "step": 8185 |
| }, |
| { |
| "epoch": 15.279850746268657, |
| "grad_norm": 1.3189091682434082, |
| "learning_rate": 8.020419228247807e-06, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 2326904, |
| "step": 8190 |
| }, |
| { |
| "epoch": 15.289179104477611, |
| "grad_norm": 0.7680203914642334, |
| "learning_rate": 7.990567278569872e-06, |
| "loss": 0.0302, |
| "num_input_tokens_seen": 2328184, |
| "step": 8195 |
| }, |
| { |
| "epoch": 15.298507462686567, |
| "grad_norm": 0.05537838861346245, |
| "learning_rate": 7.960760416261437e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2329592, |
| "step": 8200 |
| }, |
| { |
| "epoch": 15.307835820895523, |
| "grad_norm": 0.07239200919866562, |
| "learning_rate": 7.930998720332358e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2331000, |
| "step": 8205 |
| }, |
| { |
| "epoch": 15.317164179104477, |
| "grad_norm": 0.045129068195819855, |
| "learning_rate": 7.901282269672799e-06, |
| "loss": 0.0346, |
| "num_input_tokens_seen": 2332216, |
| "step": 8210 |
| }, |
| { |
| "epoch": 15.326492537313433, |
| "grad_norm": 0.8535535931587219, |
| "learning_rate": 7.87161114305296e-06, |
| "loss": 0.0041, |
| "num_input_tokens_seen": 2333592, |
| "step": 8215 |
| }, |
| { |
| "epoch": 15.335820895522389, |
| "grad_norm": 0.04702851548790932, |
| "learning_rate": 7.8419854191229e-06, |
| "loss": 0.023, |
| "num_input_tokens_seen": 2335064, |
| "step": 8220 |
| }, |
| { |
| "epoch": 15.345149253731343, |
| "grad_norm": 0.05041104182600975, |
| "learning_rate": 7.812405176412355e-06, |
| "loss": 0.0503, |
| "num_input_tokens_seen": 2336632, |
| "step": 8225 |
| }, |
| { |
| "epoch": 15.354477611940299, |
| "grad_norm": 0.05126981437206268, |
| "learning_rate": 7.782870493330473e-06, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 2337880, |
| "step": 8230 |
| }, |
| { |
| "epoch": 15.363805970149254, |
| "grad_norm": 0.02263198420405388, |
| "learning_rate": 7.75338144816565e-06, |
| "loss": 0.0375, |
| "num_input_tokens_seen": 2339224, |
| "step": 8235 |
| }, |
| { |
| "epoch": 15.373134328358208, |
| "grad_norm": 7.4993510246276855, |
| "learning_rate": 7.723938119085314e-06, |
| "loss": 0.0808, |
| "num_input_tokens_seen": 2340632, |
| "step": 8240 |
| }, |
| { |
| "epoch": 15.382462686567164, |
| "grad_norm": 0.7374345660209656, |
| "learning_rate": 7.694540584135696e-06, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 2342008, |
| "step": 8245 |
| }, |
| { |
| "epoch": 15.39179104477612, |
| "grad_norm": 0.06797999143600464, |
| "learning_rate": 7.665188921241654e-06, |
| "loss": 0.003, |
| "num_input_tokens_seen": 2343480, |
| "step": 8250 |
| }, |
| { |
| "epoch": 15.401119402985074, |
| "grad_norm": 0.607535719871521, |
| "learning_rate": 7.635883208206429e-06, |
| "loss": 0.0121, |
| "num_input_tokens_seen": 2345016, |
| "step": 8255 |
| }, |
| { |
| "epoch": 15.41044776119403, |
| "grad_norm": 0.08975960314273834, |
| "learning_rate": 7.606623522711498e-06, |
| "loss": 0.0221, |
| "num_input_tokens_seen": 2346328, |
| "step": 8260 |
| }, |
| { |
| "epoch": 15.419776119402986, |
| "grad_norm": 0.04766280949115753, |
| "learning_rate": 7.5774099423163045e-06, |
| "loss": 0.0191, |
| "num_input_tokens_seen": 2347608, |
| "step": 8265 |
| }, |
| { |
| "epoch": 15.42910447761194, |
| "grad_norm": 1.7337509393692017, |
| "learning_rate": 7.548242544458078e-06, |
| "loss": 0.035, |
| "num_input_tokens_seen": 2348888, |
| "step": 8270 |
| }, |
| { |
| "epoch": 15.438432835820896, |
| "grad_norm": 0.05372800678014755, |
| "learning_rate": 7.519121406451657e-06, |
| "loss": 0.0062, |
| "num_input_tokens_seen": 2350392, |
| "step": 8275 |
| }, |
| { |
| "epoch": 15.447761194029852, |
| "grad_norm": 0.026804322376847267, |
| "learning_rate": 7.490046605489226e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2351800, |
| "step": 8280 |
| }, |
| { |
| "epoch": 15.457089552238806, |
| "grad_norm": 2.7270753383636475, |
| "learning_rate": 7.461018218640162e-06, |
| "loss": 0.0292, |
| "num_input_tokens_seen": 2353176, |
| "step": 8285 |
| }, |
| { |
| "epoch": 15.466417910447761, |
| "grad_norm": 0.0315847247838974, |
| "learning_rate": 7.432036322850797e-06, |
| "loss": 0.0213, |
| "num_input_tokens_seen": 2354616, |
| "step": 8290 |
| }, |
| { |
| "epoch": 15.475746268656717, |
| "grad_norm": 0.04154283553361893, |
| "learning_rate": 7.403100994944251e-06, |
| "loss": 0.0096, |
| "num_input_tokens_seen": 2356120, |
| "step": 8295 |
| }, |
| { |
| "epoch": 15.485074626865671, |
| "grad_norm": 0.9504329562187195, |
| "learning_rate": 7.374212311620182e-06, |
| "loss": 0.009, |
| "num_input_tokens_seen": 2357624, |
| "step": 8300 |
| }, |
| { |
| "epoch": 15.494402985074627, |
| "grad_norm": 1.2491915225982666, |
| "learning_rate": 7.345370349454611e-06, |
| "loss": 0.0062, |
| "num_input_tokens_seen": 2359096, |
| "step": 8305 |
| }, |
| { |
| "epoch": 15.503731343283581, |
| "grad_norm": 1.3733524084091187, |
| "learning_rate": 7.3165751848997296e-06, |
| "loss": 0.0654, |
| "num_input_tokens_seen": 2360504, |
| "step": 8310 |
| }, |
| { |
| "epoch": 15.513059701492537, |
| "grad_norm": 0.050251707434654236, |
| "learning_rate": 7.287826894283664e-06, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 2361944, |
| "step": 8315 |
| }, |
| { |
| "epoch": 15.522388059701493, |
| "grad_norm": 1.194697380065918, |
| "learning_rate": 7.259125553810295e-06, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 2363448, |
| "step": 8320 |
| }, |
| { |
| "epoch": 15.531716417910447, |
| "grad_norm": 1.301934838294983, |
| "learning_rate": 7.230471239559042e-06, |
| "loss": 0.0838, |
| "num_input_tokens_seen": 2364792, |
| "step": 8325 |
| }, |
| { |
| "epoch": 15.541044776119403, |
| "grad_norm": 0.10775480419397354, |
| "learning_rate": 7.201864027484695e-06, |
| "loss": 0.0238, |
| "num_input_tokens_seen": 2366232, |
| "step": 8330 |
| }, |
| { |
| "epoch": 15.550373134328359, |
| "grad_norm": 0.07272236049175262, |
| "learning_rate": 7.173303993417185e-06, |
| "loss": 0.0525, |
| "num_input_tokens_seen": 2367544, |
| "step": 8335 |
| }, |
| { |
| "epoch": 15.559701492537313, |
| "grad_norm": 0.04792439937591553, |
| "learning_rate": 7.144791213061347e-06, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 2369016, |
| "step": 8340 |
| }, |
| { |
| "epoch": 15.569029850746269, |
| "grad_norm": 0.047780051827430725, |
| "learning_rate": 7.116325761996817e-06, |
| "loss": 0.0063, |
| "num_input_tokens_seen": 2370744, |
| "step": 8345 |
| }, |
| { |
| "epoch": 15.578358208955224, |
| "grad_norm": 1.0226856470108032, |
| "learning_rate": 7.087907715677733e-06, |
| "loss": 0.0128, |
| "num_input_tokens_seen": 2372056, |
| "step": 8350 |
| }, |
| { |
| "epoch": 15.587686567164178, |
| "grad_norm": 0.07632843405008316, |
| "learning_rate": 7.059537149432582e-06, |
| "loss": 0.001, |
| "num_input_tokens_seen": 2373496, |
| "step": 8355 |
| }, |
| { |
| "epoch": 15.597014925373134, |
| "grad_norm": 0.054843612015247345, |
| "learning_rate": 7.031214138464023e-06, |
| "loss": 0.0051, |
| "num_input_tokens_seen": 2375096, |
| "step": 8360 |
| }, |
| { |
| "epoch": 15.60634328358209, |
| "grad_norm": 0.06572072952985764, |
| "learning_rate": 7.0029387578486146e-06, |
| "loss": 0.0032, |
| "num_input_tokens_seen": 2376408, |
| "step": 8365 |
| }, |
| { |
| "epoch": 15.615671641791044, |
| "grad_norm": 0.05430394038558006, |
| "learning_rate": 6.974711082536711e-06, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 2377816, |
| "step": 8370 |
| }, |
| { |
| "epoch": 15.625, |
| "grad_norm": 1.977893590927124, |
| "learning_rate": 6.946531187352154e-06, |
| "loss": 0.0467, |
| "num_input_tokens_seen": 2379160, |
| "step": 8375 |
| }, |
| { |
| "epoch": 15.634328358208956, |
| "grad_norm": 0.07812154293060303, |
| "learning_rate": 6.918399146992183e-06, |
| "loss": 0.0566, |
| "num_input_tokens_seen": 2380632, |
| "step": 8380 |
| }, |
| { |
| "epoch": 15.64365671641791, |
| "grad_norm": 0.02741282619535923, |
| "learning_rate": 6.8903150360271565e-06, |
| "loss": 0.0047, |
| "num_input_tokens_seen": 2382072, |
| "step": 8385 |
| }, |
| { |
| "epoch": 15.652985074626866, |
| "grad_norm": 0.019559860229492188, |
| "learning_rate": 6.862278928900412e-06, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 2383544, |
| "step": 8390 |
| }, |
| { |
| "epoch": 15.662313432835822, |
| "grad_norm": 0.09155000001192093, |
| "learning_rate": 6.8342908999280195e-06, |
| "loss": 0.0187, |
| "num_input_tokens_seen": 2384920, |
| "step": 8395 |
| }, |
| { |
| "epoch": 15.671641791044776, |
| "grad_norm": 0.03511262312531471, |
| "learning_rate": 6.806351023298604e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2386392, |
| "step": 8400 |
| }, |
| { |
| "epoch": 15.680970149253731, |
| "grad_norm": 0.8765169382095337, |
| "learning_rate": 6.77845937307319e-06, |
| "loss": 0.0405, |
| "num_input_tokens_seen": 2387768, |
| "step": 8405 |
| }, |
| { |
| "epoch": 15.690298507462687, |
| "grad_norm": 0.8757217526435852, |
| "learning_rate": 6.750616023184905e-06, |
| "loss": 0.0446, |
| "num_input_tokens_seen": 2389240, |
| "step": 8410 |
| }, |
| { |
| "epoch": 15.699626865671641, |
| "grad_norm": 0.01548250112682581, |
| "learning_rate": 6.722821047438896e-06, |
| "loss": 0.0332, |
| "num_input_tokens_seen": 2390776, |
| "step": 8415 |
| }, |
| { |
| "epoch": 15.708955223880597, |
| "grad_norm": 0.32023245096206665, |
| "learning_rate": 6.695074519512076e-06, |
| "loss": 0.002, |
| "num_input_tokens_seen": 2392024, |
| "step": 8420 |
| }, |
| { |
| "epoch": 15.718283582089553, |
| "grad_norm": 2.9007318019866943, |
| "learning_rate": 6.667376512952919e-06, |
| "loss": 0.0353, |
| "num_input_tokens_seen": 2393432, |
| "step": 8425 |
| }, |
| { |
| "epoch": 15.727611940298507, |
| "grad_norm": 0.06238672137260437, |
| "learning_rate": 6.639727101181286e-06, |
| "loss": 0.001, |
| "num_input_tokens_seen": 2394744, |
| "step": 8430 |
| }, |
| { |
| "epoch": 15.736940298507463, |
| "grad_norm": 0.07146617025136948, |
| "learning_rate": 6.612126357488229e-06, |
| "loss": 0.0451, |
| "num_input_tokens_seen": 2396216, |
| "step": 8435 |
| }, |
| { |
| "epoch": 15.746268656716419, |
| "grad_norm": 0.034194137901067734, |
| "learning_rate": 6.584574355035808e-06, |
| "loss": 0.0092, |
| "num_input_tokens_seen": 2397496, |
| "step": 8440 |
| }, |
| { |
| "epoch": 15.755597014925373, |
| "grad_norm": 0.018550215288996696, |
| "learning_rate": 6.557071166856862e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2398936, |
| "step": 8445 |
| }, |
| { |
| "epoch": 15.764925373134329, |
| "grad_norm": 0.38596251606941223, |
| "learning_rate": 6.529616865854843e-06, |
| "loss": 0.0671, |
| "num_input_tokens_seen": 2400440, |
| "step": 8450 |
| }, |
| { |
| "epoch": 15.774253731343283, |
| "grad_norm": 0.04764106869697571, |
| "learning_rate": 6.502211524803628e-06, |
| "loss": 0.0064, |
| "num_input_tokens_seen": 2402008, |
| "step": 8455 |
| }, |
| { |
| "epoch": 15.783582089552239, |
| "grad_norm": 0.048227131366729736, |
| "learning_rate": 6.4748552163473e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 2403800, |
| "step": 8460 |
| }, |
| { |
| "epoch": 15.792910447761194, |
| "grad_norm": 0.0581585168838501, |
| "learning_rate": 6.44754801299998e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 2405272, |
| "step": 8465 |
| }, |
| { |
| "epoch": 15.802238805970148, |
| "grad_norm": 1.3411815166473389, |
| "learning_rate": 6.420289987145609e-06, |
| "loss": 0.0299, |
| "num_input_tokens_seen": 2406648, |
| "step": 8470 |
| }, |
| { |
| "epoch": 15.811567164179104, |
| "grad_norm": 0.19821366667747498, |
| "learning_rate": 6.393081211037799e-06, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 2408024, |
| "step": 8475 |
| }, |
| { |
| "epoch": 15.82089552238806, |
| "grad_norm": 1.3007615804672241, |
| "learning_rate": 6.365921756799589e-06, |
| "loss": 0.0063, |
| "num_input_tokens_seen": 2409400, |
| "step": 8480 |
| }, |
| { |
| "epoch": 15.830223880597014, |
| "grad_norm": 1.9445916414260864, |
| "learning_rate": 6.338811696423283e-06, |
| "loss": 0.0537, |
| "num_input_tokens_seen": 2410616, |
| "step": 8485 |
| }, |
| { |
| "epoch": 15.83955223880597, |
| "grad_norm": 0.0262154508382082, |
| "learning_rate": 6.311751101770277e-06, |
| "loss": 0.0044, |
| "num_input_tokens_seen": 2412216, |
| "step": 8490 |
| }, |
| { |
| "epoch": 15.848880597014926, |
| "grad_norm": 0.04396534338593483, |
| "learning_rate": 6.284740044570825e-06, |
| "loss": 0.0251, |
| "num_input_tokens_seen": 2413592, |
| "step": 8495 |
| }, |
| { |
| "epoch": 15.85820895522388, |
| "grad_norm": 0.032748475670814514, |
| "learning_rate": 6.257778596423869e-06, |
| "loss": 0.0582, |
| "num_input_tokens_seen": 2414968, |
| "step": 8500 |
| }, |
| { |
| "epoch": 15.867537313432836, |
| "grad_norm": 0.0600719079375267, |
| "learning_rate": 6.230866828796861e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2416536, |
| "step": 8505 |
| }, |
| { |
| "epoch": 15.876865671641792, |
| "grad_norm": 0.0341942198574543, |
| "learning_rate": 6.204004813025568e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2418040, |
| "step": 8510 |
| }, |
| { |
| "epoch": 15.886194029850746, |
| "grad_norm": 1.7425408363342285, |
| "learning_rate": 6.177192620313868e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 2419512, |
| "step": 8515 |
| }, |
| { |
| "epoch": 15.895522388059701, |
| "grad_norm": 1.8912273645401, |
| "learning_rate": 6.150430321733566e-06, |
| "loss": 0.029, |
| "num_input_tokens_seen": 2420824, |
| "step": 8520 |
| }, |
| { |
| "epoch": 15.904850746268657, |
| "grad_norm": 0.031218253076076508, |
| "learning_rate": 6.123717988224237e-06, |
| "loss": 0.0346, |
| "num_input_tokens_seen": 2422264, |
| "step": 8525 |
| }, |
| { |
| "epoch": 15.914179104477611, |
| "grad_norm": 1.9380316734313965, |
| "learning_rate": 6.097055690592987e-06, |
| "loss": 0.0246, |
| "num_input_tokens_seen": 2423864, |
| "step": 8530 |
| }, |
| { |
| "epoch": 15.923507462686567, |
| "grad_norm": 0.04504099488258362, |
| "learning_rate": 6.070443499514292e-06, |
| "loss": 0.0306, |
| "num_input_tokens_seen": 2425240, |
| "step": 8535 |
| }, |
| { |
| "epoch": 15.932835820895523, |
| "grad_norm": 0.04207184165716171, |
| "learning_rate": 6.043881485529831e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2426648, |
| "step": 8540 |
| }, |
| { |
| "epoch": 15.942164179104477, |
| "grad_norm": 1.802028775215149, |
| "learning_rate": 6.017369719048255e-06, |
| "loss": 0.0084, |
| "num_input_tokens_seen": 2428056, |
| "step": 8545 |
| }, |
| { |
| "epoch": 15.951492537313433, |
| "grad_norm": 2.1756534576416016, |
| "learning_rate": 5.990908270345031e-06, |
| "loss": 0.016, |
| "num_input_tokens_seen": 2429336, |
| "step": 8550 |
| }, |
| { |
| "epoch": 15.960820895522389, |
| "grad_norm": 0.15736201405525208, |
| "learning_rate": 5.964497209562234e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2430712, |
| "step": 8555 |
| }, |
| { |
| "epoch": 15.970149253731343, |
| "grad_norm": 1.6881898641586304, |
| "learning_rate": 5.9381366067084e-06, |
| "loss": 0.0077, |
| "num_input_tokens_seen": 2432184, |
| "step": 8560 |
| }, |
| { |
| "epoch": 15.979477611940299, |
| "grad_norm": 0.12998859584331512, |
| "learning_rate": 5.911826531658315e-06, |
| "loss": 0.0111, |
| "num_input_tokens_seen": 2433592, |
| "step": 8565 |
| }, |
| { |
| "epoch": 15.988805970149254, |
| "grad_norm": 2.8887104988098145, |
| "learning_rate": 5.885567054152785e-06, |
| "loss": 0.0639, |
| "num_input_tokens_seen": 2435000, |
| "step": 8570 |
| }, |
| { |
| "epoch": 15.998134328358208, |
| "grad_norm": 0.08808637410402298, |
| "learning_rate": 5.859358243798549e-06, |
| "loss": 0.018, |
| "num_input_tokens_seen": 2436472, |
| "step": 8575 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 1.4279977083206177, |
| "eval_runtime": 2.918, |
| "eval_samples_per_second": 81.562, |
| "eval_steps_per_second": 20.562, |
| "num_input_tokens_seen": 2436520, |
| "step": 8576 |
| }, |
| { |
| "epoch": 16.007462686567163, |
| "grad_norm": 0.7371411323547363, |
| "learning_rate": 5.8332001700680065e-06, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 2437736, |
| "step": 8580 |
| }, |
| { |
| "epoch": 16.01679104477612, |
| "grad_norm": 0.06622138619422913, |
| "learning_rate": 5.8070929022991e-06, |
| "loss": 0.0172, |
| "num_input_tokens_seen": 2439176, |
| "step": 8585 |
| }, |
| { |
| "epoch": 16.026119402985074, |
| "grad_norm": 0.8311758637428284, |
| "learning_rate": 5.781036509695048e-06, |
| "loss": 0.0277, |
| "num_input_tokens_seen": 2440456, |
| "step": 8590 |
| }, |
| { |
| "epoch": 16.03544776119403, |
| "grad_norm": 1.4553191661834717, |
| "learning_rate": 5.755031061324264e-06, |
| "loss": 0.0063, |
| "num_input_tokens_seen": 2441864, |
| "step": 8595 |
| }, |
| { |
| "epoch": 16.044776119402986, |
| "grad_norm": 0.013120700605213642, |
| "learning_rate": 5.7290766261201165e-06, |
| "loss": 0.002, |
| "num_input_tokens_seen": 2443176, |
| "step": 8600 |
| }, |
| { |
| "epoch": 16.05410447761194, |
| "grad_norm": 2.280221700668335, |
| "learning_rate": 5.703173272880707e-06, |
| "loss": 0.0277, |
| "num_input_tokens_seen": 2444424, |
| "step": 8605 |
| }, |
| { |
| "epoch": 16.063432835820894, |
| "grad_norm": 1.0285797119140625, |
| "learning_rate": 5.67732107026879e-06, |
| "loss": 0.0058, |
| "num_input_tokens_seen": 2445960, |
| "step": 8610 |
| }, |
| { |
| "epoch": 16.07276119402985, |
| "grad_norm": 0.6588021516799927, |
| "learning_rate": 5.6515200868114875e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 2447624, |
| "step": 8615 |
| }, |
| { |
| "epoch": 16.082089552238806, |
| "grad_norm": 0.2523604929447174, |
| "learning_rate": 5.625770390900189e-06, |
| "loss": 0.0018, |
| "num_input_tokens_seen": 2449000, |
| "step": 8620 |
| }, |
| { |
| "epoch": 16.09141791044776, |
| "grad_norm": 1.1427383422851562, |
| "learning_rate": 5.600072050790317e-06, |
| "loss": 0.0741, |
| "num_input_tokens_seen": 2450408, |
| "step": 8625 |
| }, |
| { |
| "epoch": 16.100746268656717, |
| "grad_norm": 0.8434598445892334, |
| "learning_rate": 5.574425134601152e-06, |
| "loss": 0.0076, |
| "num_input_tokens_seen": 2451848, |
| "step": 8630 |
| }, |
| { |
| "epoch": 16.11007462686567, |
| "grad_norm": 0.04192919284105301, |
| "learning_rate": 5.548829710315695e-06, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 2453448, |
| "step": 8635 |
| }, |
| { |
| "epoch": 16.119402985074625, |
| "grad_norm": 0.038911547511816025, |
| "learning_rate": 5.523285845780432e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2454920, |
| "step": 8640 |
| }, |
| { |
| "epoch": 16.128731343283583, |
| "grad_norm": 0.040439728647470474, |
| "learning_rate": 5.497793608705184e-06, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 2456328, |
| "step": 8645 |
| }, |
| { |
| "epoch": 16.138059701492537, |
| "grad_norm": 0.05301021784543991, |
| "learning_rate": 5.472353066662916e-06, |
| "loss": 0.0223, |
| "num_input_tokens_seen": 2457864, |
| "step": 8650 |
| }, |
| { |
| "epoch": 16.14738805970149, |
| "grad_norm": 0.06302767246961594, |
| "learning_rate": 5.446964287089587e-06, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 2459272, |
| "step": 8655 |
| }, |
| { |
| "epoch": 16.15671641791045, |
| "grad_norm": 0.13890638947486877, |
| "learning_rate": 5.421627337283916e-06, |
| "loss": 0.001, |
| "num_input_tokens_seen": 2460712, |
| "step": 8660 |
| }, |
| { |
| "epoch": 16.166044776119403, |
| "grad_norm": 0.8767593502998352, |
| "learning_rate": 5.396342284407252e-06, |
| "loss": 0.0058, |
| "num_input_tokens_seen": 2462152, |
| "step": 8665 |
| }, |
| { |
| "epoch": 16.175373134328357, |
| "grad_norm": 0.558708906173706, |
| "learning_rate": 5.3711091954833845e-06, |
| "loss": 0.035, |
| "num_input_tokens_seen": 2463560, |
| "step": 8670 |
| }, |
| { |
| "epoch": 16.184701492537314, |
| "grad_norm": 1.342138409614563, |
| "learning_rate": 5.345928137398351e-06, |
| "loss": 0.0062, |
| "num_input_tokens_seen": 2464904, |
| "step": 8675 |
| }, |
| { |
| "epoch": 16.19402985074627, |
| "grad_norm": 0.07958870381116867, |
| "learning_rate": 5.320799176900265e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2466248, |
| "step": 8680 |
| }, |
| { |
| "epoch": 16.203358208955223, |
| "grad_norm": 1.3327081203460693, |
| "learning_rate": 5.295722380599166e-06, |
| "loss": 0.0222, |
| "num_input_tokens_seen": 2467592, |
| "step": 8685 |
| }, |
| { |
| "epoch": 16.21268656716418, |
| "grad_norm": 0.0555197075009346, |
| "learning_rate": 5.270697814966793e-06, |
| "loss": 0.0466, |
| "num_input_tokens_seen": 2468872, |
| "step": 8690 |
| }, |
| { |
| "epoch": 16.222014925373134, |
| "grad_norm": 0.07982856780290604, |
| "learning_rate": 5.245725546336452e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2470184, |
| "step": 8695 |
| }, |
| { |
| "epoch": 16.23134328358209, |
| "grad_norm": 0.07369718700647354, |
| "learning_rate": 5.220805640902815e-06, |
| "loss": 0.0014, |
| "num_input_tokens_seen": 2471656, |
| "step": 8700 |
| }, |
| { |
| "epoch": 16.240671641791046, |
| "grad_norm": 1.4571572542190552, |
| "learning_rate": 5.1959381647217666e-06, |
| "loss": 0.0717, |
| "num_input_tokens_seen": 2473096, |
| "step": 8705 |
| }, |
| { |
| "epoch": 16.25, |
| "grad_norm": 0.06650495529174805, |
| "learning_rate": 5.171123183710205e-06, |
| "loss": 0.0237, |
| "num_input_tokens_seen": 2474344, |
| "step": 8710 |
| }, |
| { |
| "epoch": 16.259328358208954, |
| "grad_norm": 0.2796175479888916, |
| "learning_rate": 5.1463607636458716e-06, |
| "loss": 0.0012, |
| "num_input_tokens_seen": 2475976, |
| "step": 8715 |
| }, |
| { |
| "epoch": 16.26865671641791, |
| "grad_norm": 0.06426035612821579, |
| "learning_rate": 5.121650970167208e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2477288, |
| "step": 8720 |
| }, |
| { |
| "epoch": 16.277985074626866, |
| "grad_norm": 0.6642663478851318, |
| "learning_rate": 5.096993868773131e-06, |
| "loss": 0.0055, |
| "num_input_tokens_seen": 2478824, |
| "step": 8725 |
| }, |
| { |
| "epoch": 16.28731343283582, |
| "grad_norm": 0.0488874688744545, |
| "learning_rate": 5.0723895248228955e-06, |
| "loss": 0.0013, |
| "num_input_tokens_seen": 2480232, |
| "step": 8730 |
| }, |
| { |
| "epoch": 16.296641791044777, |
| "grad_norm": 0.061697665601968765, |
| "learning_rate": 5.047838003535904e-06, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 2481608, |
| "step": 8735 |
| }, |
| { |
| "epoch": 16.30597014925373, |
| "grad_norm": 2.4327104091644287, |
| "learning_rate": 5.0233393699915645e-06, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 2483080, |
| "step": 8740 |
| }, |
| { |
| "epoch": 16.315298507462686, |
| "grad_norm": 0.5413012504577637, |
| "learning_rate": 4.998893689129061e-06, |
| "loss": 0.0049, |
| "num_input_tokens_seen": 2484616, |
| "step": 8745 |
| }, |
| { |
| "epoch": 16.324626865671643, |
| "grad_norm": 1.7092820405960083, |
| "learning_rate": 4.974501025747233e-06, |
| "loss": 0.0523, |
| "num_input_tokens_seen": 2485960, |
| "step": 8750 |
| }, |
| { |
| "epoch": 16.333955223880597, |
| "grad_norm": 0.030523357912898064, |
| "learning_rate": 4.950161444504386e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 2487432, |
| "step": 8755 |
| }, |
| { |
| "epoch": 16.34328358208955, |
| "grad_norm": 0.0935099646449089, |
| "learning_rate": 4.925875009918116e-06, |
| "loss": 0.0068, |
| "num_input_tokens_seen": 2488968, |
| "step": 8760 |
| }, |
| { |
| "epoch": 16.35261194029851, |
| "grad_norm": 1.29145085811615, |
| "learning_rate": 4.901641786365135e-06, |
| "loss": 0.0589, |
| "num_input_tokens_seen": 2490376, |
| "step": 8765 |
| }, |
| { |
| "epoch": 16.361940298507463, |
| "grad_norm": 1.3844202756881714, |
| "learning_rate": 4.8774618380811095e-06, |
| "loss": 0.0054, |
| "num_input_tokens_seen": 2492040, |
| "step": 8770 |
| }, |
| { |
| "epoch": 16.371268656716417, |
| "grad_norm": 1.3486878871917725, |
| "learning_rate": 4.853335229160497e-06, |
| "loss": 0.0048, |
| "num_input_tokens_seen": 2493352, |
| "step": 8775 |
| }, |
| { |
| "epoch": 16.380597014925375, |
| "grad_norm": 0.029819438233971596, |
| "learning_rate": 4.829262023556375e-06, |
| "loss": 0.0035, |
| "num_input_tokens_seen": 2494760, |
| "step": 8780 |
| }, |
| { |
| "epoch": 16.38992537313433, |
| "grad_norm": 0.06936067342758179, |
| "learning_rate": 4.805242285080222e-06, |
| "loss": 0.0092, |
| "num_input_tokens_seen": 2496296, |
| "step": 8785 |
| }, |
| { |
| "epoch": 16.399253731343283, |
| "grad_norm": 0.024990657344460487, |
| "learning_rate": 4.7812760774018365e-06, |
| "loss": 0.0151, |
| "num_input_tokens_seen": 2497800, |
| "step": 8790 |
| }, |
| { |
| "epoch": 16.40858208955224, |
| "grad_norm": 0.060628414154052734, |
| "learning_rate": 4.757363464049094e-06, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 2499368, |
| "step": 8795 |
| }, |
| { |
| "epoch": 16.417910447761194, |
| "grad_norm": 0.10507309436798096, |
| "learning_rate": 4.733504508407813e-06, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 2500808, |
| "step": 8800 |
| }, |
| { |
| "epoch": 16.42723880597015, |
| "grad_norm": 0.05006544291973114, |
| "learning_rate": 4.7096992737215876e-06, |
| "loss": 0.0123, |
| "num_input_tokens_seen": 2502216, |
| "step": 8805 |
| }, |
| { |
| "epoch": 16.436567164179106, |
| "grad_norm": 2.649094581604004, |
| "learning_rate": 4.6859478230916e-06, |
| "loss": 0.0185, |
| "num_input_tokens_seen": 2503464, |
| "step": 8810 |
| }, |
| { |
| "epoch": 16.44589552238806, |
| "grad_norm": 0.06191951408982277, |
| "learning_rate": 4.6622502194764825e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2504744, |
| "step": 8815 |
| }, |
| { |
| "epoch": 16.455223880597014, |
| "grad_norm": 0.06118141859769821, |
| "learning_rate": 4.6386065256921045e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2506248, |
| "step": 8820 |
| }, |
| { |
| "epoch": 16.46455223880597, |
| "grad_norm": 0.025624345988035202, |
| "learning_rate": 4.615016804411465e-06, |
| "loss": 0.0234, |
| "num_input_tokens_seen": 2507560, |
| "step": 8825 |
| }, |
| { |
| "epoch": 16.473880597014926, |
| "grad_norm": 0.02961966022849083, |
| "learning_rate": 4.591481118164479e-06, |
| "loss": 0.0228, |
| "num_input_tokens_seen": 2508904, |
| "step": 8830 |
| }, |
| { |
| "epoch": 16.48320895522388, |
| "grad_norm": 0.8041293621063232, |
| "learning_rate": 4.567999529337844e-06, |
| "loss": 0.0417, |
| "num_input_tokens_seen": 2510440, |
| "step": 8835 |
| }, |
| { |
| "epoch": 16.492537313432837, |
| "grad_norm": 0.04186255484819412, |
| "learning_rate": 4.544572100174843e-06, |
| "loss": 0.0091, |
| "num_input_tokens_seen": 2512040, |
| "step": 8840 |
| }, |
| { |
| "epoch": 16.50186567164179, |
| "grad_norm": 0.04514903575181961, |
| "learning_rate": 4.521198892775203e-06, |
| "loss": 0.0234, |
| "num_input_tokens_seen": 2513576, |
| "step": 8845 |
| }, |
| { |
| "epoch": 16.511194029850746, |
| "grad_norm": 0.08915029466152191, |
| "learning_rate": 4.4978799690949425e-06, |
| "loss": 0.0433, |
| "num_input_tokens_seen": 2514920, |
| "step": 8850 |
| }, |
| { |
| "epoch": 16.520522388059703, |
| "grad_norm": 0.6367937922477722, |
| "learning_rate": 4.474615390946144e-06, |
| "loss": 0.0304, |
| "num_input_tokens_seen": 2516328, |
| "step": 8855 |
| }, |
| { |
| "epoch": 16.529850746268657, |
| "grad_norm": 0.03587573766708374, |
| "learning_rate": 4.451405219996876e-06, |
| "loss": 0.0119, |
| "num_input_tokens_seen": 2517704, |
| "step": 8860 |
| }, |
| { |
| "epoch": 16.53917910447761, |
| "grad_norm": 0.058651890605688095, |
| "learning_rate": 4.428249517770986e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2519112, |
| "step": 8865 |
| }, |
| { |
| "epoch": 16.548507462686565, |
| "grad_norm": 0.03816595301032066, |
| "learning_rate": 4.405148345647914e-06, |
| "loss": 0.0281, |
| "num_input_tokens_seen": 2520552, |
| "step": 8870 |
| }, |
| { |
| "epoch": 16.557835820895523, |
| "grad_norm": 0.03874373808503151, |
| "learning_rate": 4.382101764862576e-06, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 2522088, |
| "step": 8875 |
| }, |
| { |
| "epoch": 16.567164179104477, |
| "grad_norm": 0.6240473389625549, |
| "learning_rate": 4.359109836505165e-06, |
| "loss": 0.0086, |
| "num_input_tokens_seen": 2523592, |
| "step": 8880 |
| }, |
| { |
| "epoch": 16.576492537313435, |
| "grad_norm": 0.9810454845428467, |
| "learning_rate": 4.336172621521034e-06, |
| "loss": 0.0133, |
| "num_input_tokens_seen": 2525096, |
| "step": 8885 |
| }, |
| { |
| "epoch": 16.58582089552239, |
| "grad_norm": 0.029001150280237198, |
| "learning_rate": 4.313290180710478e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2526472, |
| "step": 8890 |
| }, |
| { |
| "epoch": 16.595149253731343, |
| "grad_norm": 2.0481338500976562, |
| "learning_rate": 4.290462574728599e-06, |
| "loss": 0.0395, |
| "num_input_tokens_seen": 2527816, |
| "step": 8895 |
| }, |
| { |
| "epoch": 16.604477611940297, |
| "grad_norm": 1.243726372718811, |
| "learning_rate": 4.267689864085178e-06, |
| "loss": 0.0069, |
| "num_input_tokens_seen": 2529288, |
| "step": 8900 |
| }, |
| { |
| "epoch": 16.613805970149254, |
| "grad_norm": 2.7746329307556152, |
| "learning_rate": 4.244972109144454e-06, |
| "loss": 0.0358, |
| "num_input_tokens_seen": 2530440, |
| "step": 8905 |
| }, |
| { |
| "epoch": 16.62313432835821, |
| "grad_norm": 0.06856881827116013, |
| "learning_rate": 4.222309370124999e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 2531752, |
| "step": 8910 |
| }, |
| { |
| "epoch": 16.632462686567163, |
| "grad_norm": 0.838624894618988, |
| "learning_rate": 4.199701707099557e-06, |
| "loss": 0.0049, |
| "num_input_tokens_seen": 2533128, |
| "step": 8915 |
| }, |
| { |
| "epoch": 16.64179104477612, |
| "grad_norm": 0.04727941006422043, |
| "learning_rate": 4.1771491799948885e-06, |
| "loss": 0.0344, |
| "num_input_tokens_seen": 2534472, |
| "step": 8920 |
| }, |
| { |
| "epoch": 16.651119402985074, |
| "grad_norm": 0.034551363438367844, |
| "learning_rate": 4.15465184859159e-06, |
| "loss": 0.0196, |
| "num_input_tokens_seen": 2535880, |
| "step": 8925 |
| }, |
| { |
| "epoch": 16.66044776119403, |
| "grad_norm": 0.0369015596807003, |
| "learning_rate": 4.132209772523945e-06, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 2537384, |
| "step": 8930 |
| }, |
| { |
| "epoch": 16.669776119402986, |
| "grad_norm": 0.8165420293807983, |
| "learning_rate": 4.1098230112797984e-06, |
| "loss": 0.0247, |
| "num_input_tokens_seen": 2539016, |
| "step": 8935 |
| }, |
| { |
| "epoch": 16.67910447761194, |
| "grad_norm": 0.025570623576641083, |
| "learning_rate": 4.087491624200337e-06, |
| "loss": 0.0288, |
| "num_input_tokens_seen": 2540328, |
| "step": 8940 |
| }, |
| { |
| "epoch": 16.688432835820894, |
| "grad_norm": 0.05779852345585823, |
| "learning_rate": 4.065215670479991e-06, |
| "loss": 0.017, |
| "num_input_tokens_seen": 2541768, |
| "step": 8945 |
| }, |
| { |
| "epoch": 16.69776119402985, |
| "grad_norm": 0.04519381746649742, |
| "learning_rate": 4.042995209166225e-06, |
| "loss": 0.0289, |
| "num_input_tokens_seen": 2543016, |
| "step": 8950 |
| }, |
| { |
| "epoch": 16.707089552238806, |
| "grad_norm": 0.1660148650407791, |
| "learning_rate": 4.020830299159445e-06, |
| "loss": 0.0143, |
| "num_input_tokens_seen": 2544520, |
| "step": 8955 |
| }, |
| { |
| "epoch": 16.71641791044776, |
| "grad_norm": 0.02256009727716446, |
| "learning_rate": 3.998720999212776e-06, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 2545928, |
| "step": 8960 |
| }, |
| { |
| "epoch": 16.725746268656717, |
| "grad_norm": 0.049024246633052826, |
| "learning_rate": 3.976667367931941e-06, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 2547528, |
| "step": 8965 |
| }, |
| { |
| "epoch": 16.73507462686567, |
| "grad_norm": 1.2392404079437256, |
| "learning_rate": 3.9546694637751125e-06, |
| "loss": 0.0083, |
| "num_input_tokens_seen": 2548968, |
| "step": 8970 |
| }, |
| { |
| "epoch": 16.744402985074625, |
| "grad_norm": 0.7506499886512756, |
| "learning_rate": 3.932727345052736e-06, |
| "loss": 0.0186, |
| "num_input_tokens_seen": 2550568, |
| "step": 8975 |
| }, |
| { |
| "epoch": 16.753731343283583, |
| "grad_norm": 0.9621379971504211, |
| "learning_rate": 3.910841069927379e-06, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 2551976, |
| "step": 8980 |
| }, |
| { |
| "epoch": 16.763059701492537, |
| "grad_norm": 0.025292817503213882, |
| "learning_rate": 3.8890106964136056e-06, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 2553416, |
| "step": 8985 |
| }, |
| { |
| "epoch": 16.77238805970149, |
| "grad_norm": 0.5844005942344666, |
| "learning_rate": 3.867236282377776e-06, |
| "loss": 0.0482, |
| "num_input_tokens_seen": 2554888, |
| "step": 8990 |
| }, |
| { |
| "epoch": 16.78171641791045, |
| "grad_norm": 0.01892007328569889, |
| "learning_rate": 3.845517885537927e-06, |
| "loss": 0.0116, |
| "num_input_tokens_seen": 2556360, |
| "step": 8995 |
| }, |
| { |
| "epoch": 16.791044776119403, |
| "grad_norm": 0.052818454802036285, |
| "learning_rate": 3.823855563463605e-06, |
| "loss": 0.023, |
| "num_input_tokens_seen": 2557640, |
| "step": 9000 |
| }, |
| { |
| "epoch": 16.800373134328357, |
| "grad_norm": 1.1880854368209839, |
| "learning_rate": 3.802249373575728e-06, |
| "loss": 0.0209, |
| "num_input_tokens_seen": 2559048, |
| "step": 9005 |
| }, |
| { |
| "epoch": 16.809701492537314, |
| "grad_norm": 0.9076938629150391, |
| "learning_rate": 3.7806993731464154e-06, |
| "loss": 0.0171, |
| "num_input_tokens_seen": 2560456, |
| "step": 9010 |
| }, |
| { |
| "epoch": 16.81902985074627, |
| "grad_norm": 0.06037643924355507, |
| "learning_rate": 3.7592056192988333e-06, |
| "loss": 0.0389, |
| "num_input_tokens_seen": 2561800, |
| "step": 9015 |
| }, |
| { |
| "epoch": 16.828358208955223, |
| "grad_norm": 0.034027956426143646, |
| "learning_rate": 3.7377681690070775e-06, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 2563176, |
| "step": 9020 |
| }, |
| { |
| "epoch": 16.83768656716418, |
| "grad_norm": 0.03610417619347572, |
| "learning_rate": 3.716387079095973e-06, |
| "loss": 0.0061, |
| "num_input_tokens_seen": 2564584, |
| "step": 9025 |
| }, |
| { |
| "epoch": 16.847014925373134, |
| "grad_norm": 1.5323071479797363, |
| "learning_rate": 3.695062406240979e-06, |
| "loss": 0.0257, |
| "num_input_tokens_seen": 2566088, |
| "step": 9030 |
| }, |
| { |
| "epoch": 16.85634328358209, |
| "grad_norm": 2.1922688484191895, |
| "learning_rate": 3.6737942069679675e-06, |
| "loss": 0.0448, |
| "num_input_tokens_seen": 2567400, |
| "step": 9035 |
| }, |
| { |
| "epoch": 16.865671641791046, |
| "grad_norm": 0.0320160873234272, |
| "learning_rate": 3.6525825376531484e-06, |
| "loss": 0.0743, |
| "num_input_tokens_seen": 2568808, |
| "step": 9040 |
| }, |
| { |
| "epoch": 16.875, |
| "grad_norm": 0.05016082152724266, |
| "learning_rate": 3.631427454522887e-06, |
| "loss": 0.0073, |
| "num_input_tokens_seen": 2570120, |
| "step": 9045 |
| }, |
| { |
| "epoch": 16.884328358208954, |
| "grad_norm": 0.0560573972761631, |
| "learning_rate": 3.610329013653518e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2571496, |
| "step": 9050 |
| }, |
| { |
| "epoch": 16.89365671641791, |
| "grad_norm": 0.06031356006860733, |
| "learning_rate": 3.5892872709712726e-06, |
| "loss": 0.0418, |
| "num_input_tokens_seen": 2572872, |
| "step": 9055 |
| }, |
| { |
| "epoch": 16.902985074626866, |
| "grad_norm": 0.04734707996249199, |
| "learning_rate": 3.5683022822520675e-06, |
| "loss": 0.0219, |
| "num_input_tokens_seen": 2574376, |
| "step": 9060 |
| }, |
| { |
| "epoch": 16.91231343283582, |
| "grad_norm": 0.32867783308029175, |
| "learning_rate": 3.547374103121398e-06, |
| "loss": 0.0221, |
| "num_input_tokens_seen": 2575720, |
| "step": 9065 |
| }, |
| { |
| "epoch": 16.921641791044777, |
| "grad_norm": 0.04076541215181351, |
| "learning_rate": 3.526502789054148e-06, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 2577192, |
| "step": 9070 |
| }, |
| { |
| "epoch": 16.93097014925373, |
| "grad_norm": 0.41653311252593994, |
| "learning_rate": 3.5056883953744844e-06, |
| "loss": 0.0062, |
| "num_input_tokens_seen": 2578792, |
| "step": 9075 |
| }, |
| { |
| "epoch": 16.940298507462686, |
| "grad_norm": 0.036007676273584366, |
| "learning_rate": 3.4849309772557043e-06, |
| "loss": 0.045, |
| "num_input_tokens_seen": 2580328, |
| "step": 9080 |
| }, |
| { |
| "epoch": 16.949626865671643, |
| "grad_norm": 0.03823821619153023, |
| "learning_rate": 3.4642305897200548e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2581640, |
| "step": 9085 |
| }, |
| { |
| "epoch": 16.958955223880597, |
| "grad_norm": 0.03576407581567764, |
| "learning_rate": 3.4435872876386193e-06, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 2583016, |
| "step": 9090 |
| }, |
| { |
| "epoch": 16.96828358208955, |
| "grad_norm": 0.08179601281881332, |
| "learning_rate": 3.4230011257311625e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 2584424, |
| "step": 9095 |
| }, |
| { |
| "epoch": 16.97761194029851, |
| "grad_norm": 0.03792068734765053, |
| "learning_rate": 3.4024721585659976e-06, |
| "loss": 0.0107, |
| "num_input_tokens_seen": 2585896, |
| "step": 9100 |
| }, |
| { |
| "epoch": 16.986940298507463, |
| "grad_norm": 0.16742895543575287, |
| "learning_rate": 3.3820004405598154e-06, |
| "loss": 0.0235, |
| "num_input_tokens_seen": 2587400, |
| "step": 9105 |
| }, |
| { |
| "epoch": 16.996268656716417, |
| "grad_norm": 0.025154493749141693, |
| "learning_rate": 3.3615860259775507e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 2588808, |
| "step": 9110 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 1.4675872325897217, |
| "eval_runtime": 2.9239, |
| "eval_samples_per_second": 81.397, |
| "eval_steps_per_second": 20.52, |
| "num_input_tokens_seen": 2589096, |
| "step": 9112 |
| }, |
| { |
| "epoch": 17.005597014925375, |
| "grad_norm": 0.060653235763311386, |
| "learning_rate": 3.3412289689322694e-06, |
| "loss": 0.0249, |
| "num_input_tokens_seen": 2589928, |
| "step": 9115 |
| }, |
| { |
| "epoch": 17.01492537313433, |
| "grad_norm": 1.8787555694580078, |
| "learning_rate": 3.3209293233849693e-06, |
| "loss": 0.0394, |
| "num_input_tokens_seen": 2591336, |
| "step": 9120 |
| }, |
| { |
| "epoch": 17.024253731343283, |
| "grad_norm": 0.9568975567817688, |
| "learning_rate": 3.300687143144482e-06, |
| "loss": 0.02, |
| "num_input_tokens_seen": 2592808, |
| "step": 9125 |
| }, |
| { |
| "epoch": 17.03358208955224, |
| "grad_norm": 0.04066040739417076, |
| "learning_rate": 3.280502481867298e-06, |
| "loss": 0.0402, |
| "num_input_tokens_seen": 2594408, |
| "step": 9130 |
| }, |
| { |
| "epoch": 17.042910447761194, |
| "grad_norm": 0.027916472405195236, |
| "learning_rate": 3.260375393057469e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2595880, |
| "step": 9135 |
| }, |
| { |
| "epoch": 17.05223880597015, |
| "grad_norm": 0.8156128525733948, |
| "learning_rate": 3.240305930066412e-06, |
| "loss": 0.0185, |
| "num_input_tokens_seen": 2597320, |
| "step": 9140 |
| }, |
| { |
| "epoch": 17.061567164179106, |
| "grad_norm": 0.042358048260211945, |
| "learning_rate": 3.2202941460927976e-06, |
| "loss": 0.0098, |
| "num_input_tokens_seen": 2598824, |
| "step": 9145 |
| }, |
| { |
| "epoch": 17.07089552238806, |
| "grad_norm": 0.0342281199991703, |
| "learning_rate": 3.2003400941824217e-06, |
| "loss": 0.0062, |
| "num_input_tokens_seen": 2600296, |
| "step": 9150 |
| }, |
| { |
| "epoch": 17.080223880597014, |
| "grad_norm": 1.3356010913848877, |
| "learning_rate": 3.180443827228033e-06, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 2601672, |
| "step": 9155 |
| }, |
| { |
| "epoch": 17.08955223880597, |
| "grad_norm": 0.05003998801112175, |
| "learning_rate": 3.160605397969202e-06, |
| "loss": 0.0306, |
| "num_input_tokens_seen": 2603048, |
| "step": 9160 |
| }, |
| { |
| "epoch": 17.098880597014926, |
| "grad_norm": 0.8239122629165649, |
| "learning_rate": 3.1408248589922083e-06, |
| "loss": 0.0135, |
| "num_input_tokens_seen": 2604360, |
| "step": 9165 |
| }, |
| { |
| "epoch": 17.10820895522388, |
| "grad_norm": 0.032905213534832, |
| "learning_rate": 3.1211022627298692e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2605736, |
| "step": 9170 |
| }, |
| { |
| "epoch": 17.117537313432837, |
| "grad_norm": 0.044976215809583664, |
| "learning_rate": 3.1014376614614036e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2607176, |
| "step": 9175 |
| }, |
| { |
| "epoch": 17.12686567164179, |
| "grad_norm": 0.04005315899848938, |
| "learning_rate": 3.081831107312308e-06, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 2608424, |
| "step": 9180 |
| }, |
| { |
| "epoch": 17.136194029850746, |
| "grad_norm": 0.08549685031175613, |
| "learning_rate": 3.0622826522542196e-06, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 2609864, |
| "step": 9185 |
| }, |
| { |
| "epoch": 17.145522388059703, |
| "grad_norm": 1.0654666423797607, |
| "learning_rate": 3.0427923481047645e-06, |
| "loss": 0.0535, |
| "num_input_tokens_seen": 2611112, |
| "step": 9190 |
| }, |
| { |
| "epoch": 17.154850746268657, |
| "grad_norm": 0.07777464389801025, |
| "learning_rate": 3.023360246527418e-06, |
| "loss": 0.0011, |
| "num_input_tokens_seen": 2612680, |
| "step": 9195 |
| }, |
| { |
| "epoch": 17.16417910447761, |
| "grad_norm": 0.01853298954665661, |
| "learning_rate": 3.0039863990313917e-06, |
| "loss": 0.03, |
| "num_input_tokens_seen": 2614088, |
| "step": 9200 |
| }, |
| { |
| "epoch": 17.17350746268657, |
| "grad_norm": 0.7376510500907898, |
| "learning_rate": 2.984670856971475e-06, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 2615592, |
| "step": 9205 |
| }, |
| { |
| "epoch": 17.182835820895523, |
| "grad_norm": 1.9682793617248535, |
| "learning_rate": 2.965413671547901e-06, |
| "loss": 0.031, |
| "num_input_tokens_seen": 2616904, |
| "step": 9210 |
| }, |
| { |
| "epoch": 17.192164179104477, |
| "grad_norm": 0.05809041112661362, |
| "learning_rate": 2.9462148938062123e-06, |
| "loss": 0.0279, |
| "num_input_tokens_seen": 2618280, |
| "step": 9215 |
| }, |
| { |
| "epoch": 17.20149253731343, |
| "grad_norm": 0.04442291706800461, |
| "learning_rate": 2.927074574637148e-06, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 2619848, |
| "step": 9220 |
| }, |
| { |
| "epoch": 17.21082089552239, |
| "grad_norm": 0.02212505042552948, |
| "learning_rate": 2.907992764776471e-06, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 2621256, |
| "step": 9225 |
| }, |
| { |
| "epoch": 17.220149253731343, |
| "grad_norm": 0.041076451539993286, |
| "learning_rate": 2.888969514804854e-06, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 2622984, |
| "step": 9230 |
| }, |
| { |
| "epoch": 17.229477611940297, |
| "grad_norm": 0.03341635689139366, |
| "learning_rate": 2.8700048751477527e-06, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 2624296, |
| "step": 9235 |
| }, |
| { |
| "epoch": 17.238805970149254, |
| "grad_norm": 2.1263813972473145, |
| "learning_rate": 2.8510988960752575e-06, |
| "loss": 0.0185, |
| "num_input_tokens_seen": 2625448, |
| "step": 9240 |
| }, |
| { |
| "epoch": 17.24813432835821, |
| "grad_norm": 2.638948440551758, |
| "learning_rate": 2.8322516277019624e-06, |
| "loss": 0.03, |
| "num_input_tokens_seen": 2626856, |
| "step": 9245 |
| }, |
| { |
| "epoch": 17.257462686567163, |
| "grad_norm": 0.7286362648010254, |
| "learning_rate": 2.813463119986834e-06, |
| "loss": 0.0252, |
| "num_input_tokens_seen": 2628392, |
| "step": 9250 |
| }, |
| { |
| "epoch": 17.26679104477612, |
| "grad_norm": 0.02920675463974476, |
| "learning_rate": 2.7947334227330897e-06, |
| "loss": 0.017, |
| "num_input_tokens_seen": 2629800, |
| "step": 9255 |
| }, |
| { |
| "epoch": 17.276119402985074, |
| "grad_norm": 0.10381735861301422, |
| "learning_rate": 2.776062585588063e-06, |
| "loss": 0.0084, |
| "num_input_tokens_seen": 2631048, |
| "step": 9260 |
| }, |
| { |
| "epoch": 17.28544776119403, |
| "grad_norm": 0.051370952278375626, |
| "learning_rate": 2.7574506580430287e-06, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 2632424, |
| "step": 9265 |
| }, |
| { |
| "epoch": 17.294776119402986, |
| "grad_norm": 0.01324853952974081, |
| "learning_rate": 2.7388976894331537e-06, |
| "loss": 0.0236, |
| "num_input_tokens_seen": 2634056, |
| "step": 9270 |
| }, |
| { |
| "epoch": 17.30410447761194, |
| "grad_norm": 0.04250672459602356, |
| "learning_rate": 2.72040372893729e-06, |
| "loss": 0.0241, |
| "num_input_tokens_seen": 2635496, |
| "step": 9275 |
| }, |
| { |
| "epoch": 17.313432835820894, |
| "grad_norm": 0.03932982310652733, |
| "learning_rate": 2.7019688255778857e-06, |
| "loss": 0.0038, |
| "num_input_tokens_seen": 2636872, |
| "step": 9280 |
| }, |
| { |
| "epoch": 17.32276119402985, |
| "grad_norm": 0.061921052634716034, |
| "learning_rate": 2.6835930282208517e-06, |
| "loss": 0.0441, |
| "num_input_tokens_seen": 2638184, |
| "step": 9285 |
| }, |
| { |
| "epoch": 17.332089552238806, |
| "grad_norm": 0.0441136471927166, |
| "learning_rate": 2.6652763855754106e-06, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 2639560, |
| "step": 9290 |
| }, |
| { |
| "epoch": 17.34141791044776, |
| "grad_norm": 1.9832870960235596, |
| "learning_rate": 2.647018946193999e-06, |
| "loss": 0.0435, |
| "num_input_tokens_seen": 2640904, |
| "step": 9295 |
| }, |
| { |
| "epoch": 17.350746268656717, |
| "grad_norm": 0.046195998787879944, |
| "learning_rate": 2.628820758472095e-06, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 2642248, |
| "step": 9300 |
| }, |
| { |
| "epoch": 17.36007462686567, |
| "grad_norm": 0.5949422717094421, |
| "learning_rate": 2.610681870648149e-06, |
| "loss": 0.0161, |
| "num_input_tokens_seen": 2643784, |
| "step": 9305 |
| }, |
| { |
| "epoch": 17.369402985074625, |
| "grad_norm": 1.7056877613067627, |
| "learning_rate": 2.5926023308033952e-06, |
| "loss": 0.0084, |
| "num_input_tokens_seen": 2645416, |
| "step": 9310 |
| }, |
| { |
| "epoch": 17.378731343283583, |
| "grad_norm": 0.05675986036658287, |
| "learning_rate": 2.5745821868617792e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2646696, |
| "step": 9315 |
| }, |
| { |
| "epoch": 17.388059701492537, |
| "grad_norm": 1.5419673919677734, |
| "learning_rate": 2.556621486589783e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 2647976, |
| "step": 9320 |
| }, |
| { |
| "epoch": 17.39738805970149, |
| "grad_norm": 1.4380757808685303, |
| "learning_rate": 2.5387202775963236e-06, |
| "loss": 0.0231, |
| "num_input_tokens_seen": 2649352, |
| "step": 9325 |
| }, |
| { |
| "epoch": 17.40671641791045, |
| "grad_norm": 0.06628582626581192, |
| "learning_rate": 2.520878607332641e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2650888, |
| "step": 9330 |
| }, |
| { |
| "epoch": 17.416044776119403, |
| "grad_norm": 0.04775846004486084, |
| "learning_rate": 2.5030965230921186e-06, |
| "loss": 0.0082, |
| "num_input_tokens_seen": 2652328, |
| "step": 9335 |
| }, |
| { |
| "epoch": 17.425373134328357, |
| "grad_norm": 0.056970998644828796, |
| "learning_rate": 2.485374072010224e-06, |
| "loss": 0.0321, |
| "num_input_tokens_seen": 2653800, |
| "step": 9340 |
| }, |
| { |
| "epoch": 17.434701492537314, |
| "grad_norm": 0.03184131905436516, |
| "learning_rate": 2.4677113010643486e-06, |
| "loss": 0.0313, |
| "num_input_tokens_seen": 2655336, |
| "step": 9345 |
| }, |
| { |
| "epoch": 17.44402985074627, |
| "grad_norm": 0.6344913840293884, |
| "learning_rate": 2.450108257073683e-06, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 2657000, |
| "step": 9350 |
| }, |
| { |
| "epoch": 17.453358208955223, |
| "grad_norm": 0.04712509363889694, |
| "learning_rate": 2.4325649866990928e-06, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 2658376, |
| "step": 9355 |
| }, |
| { |
| "epoch": 17.46268656716418, |
| "grad_norm": 0.3967978358268738, |
| "learning_rate": 2.4150815364430036e-06, |
| "loss": 0.0054, |
| "num_input_tokens_seen": 2659944, |
| "step": 9360 |
| }, |
| { |
| "epoch": 17.472014925373134, |
| "grad_norm": 0.026971952989697456, |
| "learning_rate": 2.397657952649285e-06, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 2661448, |
| "step": 9365 |
| }, |
| { |
| "epoch": 17.48134328358209, |
| "grad_norm": 1.0097392797470093, |
| "learning_rate": 2.380294281503104e-06, |
| "loss": 0.0328, |
| "num_input_tokens_seen": 2662984, |
| "step": 9370 |
| }, |
| { |
| "epoch": 17.490671641791046, |
| "grad_norm": 0.04540817812085152, |
| "learning_rate": 2.3629905690308126e-06, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 2664360, |
| "step": 9375 |
| }, |
| { |
| "epoch": 17.5, |
| "grad_norm": 1.3904001712799072, |
| "learning_rate": 2.3457468610998486e-06, |
| "loss": 0.0085, |
| "num_input_tokens_seen": 2665768, |
| "step": 9380 |
| }, |
| { |
| "epoch": 17.509328358208954, |
| "grad_norm": 0.061172667890787125, |
| "learning_rate": 2.328563203418574e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2667112, |
| "step": 9385 |
| }, |
| { |
| "epoch": 17.51865671641791, |
| "grad_norm": 0.04892322048544884, |
| "learning_rate": 2.311439641536184e-06, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 2668616, |
| "step": 9390 |
| }, |
| { |
| "epoch": 17.527985074626866, |
| "grad_norm": 0.07648860663175583, |
| "learning_rate": 2.2943762208425646e-06, |
| "loss": 0.0199, |
| "num_input_tokens_seen": 2669992, |
| "step": 9395 |
| }, |
| { |
| "epoch": 17.53731343283582, |
| "grad_norm": 0.10180746018886566, |
| "learning_rate": 2.2773729865682046e-06, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 2671400, |
| "step": 9400 |
| }, |
| { |
| "epoch": 17.546641791044777, |
| "grad_norm": 1.2862694263458252, |
| "learning_rate": 2.2604299837840374e-06, |
| "loss": 0.0089, |
| "num_input_tokens_seen": 2672936, |
| "step": 9405 |
| }, |
| { |
| "epoch": 17.55597014925373, |
| "grad_norm": 3.372100591659546, |
| "learning_rate": 2.2435472574013433e-06, |
| "loss": 0.0557, |
| "num_input_tokens_seen": 2674280, |
| "step": 9410 |
| }, |
| { |
| "epoch": 17.565298507462686, |
| "grad_norm": 0.04837459325790405, |
| "learning_rate": 2.2267248521716327e-06, |
| "loss": 0.031, |
| "num_input_tokens_seen": 2675656, |
| "step": 9415 |
| }, |
| { |
| "epoch": 17.574626865671643, |
| "grad_norm": 0.07434497028589249, |
| "learning_rate": 2.209962812686514e-06, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 2677064, |
| "step": 9420 |
| }, |
| { |
| "epoch": 17.583955223880597, |
| "grad_norm": 0.23598632216453552, |
| "learning_rate": 2.1932611833775846e-06, |
| "loss": 0.024, |
| "num_input_tokens_seen": 2678504, |
| "step": 9425 |
| }, |
| { |
| "epoch": 17.59328358208955, |
| "grad_norm": 1.3149594068527222, |
| "learning_rate": 2.1766200085163058e-06, |
| "loss": 0.0516, |
| "num_input_tokens_seen": 2679752, |
| "step": 9430 |
| }, |
| { |
| "epoch": 17.60261194029851, |
| "grad_norm": 0.838098406791687, |
| "learning_rate": 2.1600393322139034e-06, |
| "loss": 0.014, |
| "num_input_tokens_seen": 2681032, |
| "step": 9435 |
| }, |
| { |
| "epoch": 17.611940298507463, |
| "grad_norm": 0.0411720871925354, |
| "learning_rate": 2.1435191984212315e-06, |
| "loss": 0.0048, |
| "num_input_tokens_seen": 2682376, |
| "step": 9440 |
| }, |
| { |
| "epoch": 17.621268656716417, |
| "grad_norm": 0.0346861258149147, |
| "learning_rate": 2.1270596509286504e-06, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 2683816, |
| "step": 9445 |
| }, |
| { |
| "epoch": 17.630597014925375, |
| "grad_norm": 1.3915529251098633, |
| "learning_rate": 2.1106607333659463e-06, |
| "loss": 0.0201, |
| "num_input_tokens_seen": 2685192, |
| "step": 9450 |
| }, |
| { |
| "epoch": 17.63992537313433, |
| "grad_norm": 0.030525896698236465, |
| "learning_rate": 2.0943224892021746e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 2686664, |
| "step": 9455 |
| }, |
| { |
| "epoch": 17.649253731343283, |
| "grad_norm": 0.04545668885111809, |
| "learning_rate": 2.078044961745562e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 2687912, |
| "step": 9460 |
| }, |
| { |
| "epoch": 17.65858208955224, |
| "grad_norm": 0.8304170370101929, |
| "learning_rate": 2.061828194143406e-06, |
| "loss": 0.0414, |
| "num_input_tokens_seen": 2689256, |
| "step": 9465 |
| }, |
| { |
| "epoch": 17.667910447761194, |
| "grad_norm": 0.027987072244286537, |
| "learning_rate": 2.0456722293819315e-06, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 2690888, |
| "step": 9470 |
| }, |
| { |
| "epoch": 17.67723880597015, |
| "grad_norm": 0.042125120759010315, |
| "learning_rate": 2.0295771102861987e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2692296, |
| "step": 9475 |
| }, |
| { |
| "epoch": 17.686567164179106, |
| "grad_norm": 0.07713036984205246, |
| "learning_rate": 2.013542879519975e-06, |
| "loss": 0.0071, |
| "num_input_tokens_seen": 2693640, |
| "step": 9480 |
| }, |
| { |
| "epoch": 17.69589552238806, |
| "grad_norm": 0.016053033992648125, |
| "learning_rate": 1.997569579585648e-06, |
| "loss": 0.0115, |
| "num_input_tokens_seen": 2694984, |
| "step": 9485 |
| }, |
| { |
| "epoch": 17.705223880597014, |
| "grad_norm": 1.3871374130249023, |
| "learning_rate": 1.9816572528240707e-06, |
| "loss": 0.0041, |
| "num_input_tokens_seen": 2696456, |
| "step": 9490 |
| }, |
| { |
| "epoch": 17.71455223880597, |
| "grad_norm": 0.05942324176430702, |
| "learning_rate": 1.9658059414144834e-06, |
| "loss": 0.0076, |
| "num_input_tokens_seen": 2697896, |
| "step": 9495 |
| }, |
| { |
| "epoch": 17.723880597014926, |
| "grad_norm": 0.028521442785859108, |
| "learning_rate": 1.9500156873743985e-06, |
| "loss": 0.0093, |
| "num_input_tokens_seen": 2699336, |
| "step": 9500 |
| }, |
| { |
| "epoch": 17.73320895522388, |
| "grad_norm": 0.7255182266235352, |
| "learning_rate": 1.934286532559468e-06, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 2700680, |
| "step": 9505 |
| }, |
| { |
| "epoch": 17.742537313432837, |
| "grad_norm": 1.8643840551376343, |
| "learning_rate": 1.9186185186634066e-06, |
| "loss": 0.0064, |
| "num_input_tokens_seen": 2702152, |
| "step": 9510 |
| }, |
| { |
| "epoch": 17.75186567164179, |
| "grad_norm": 0.05329243466258049, |
| "learning_rate": 1.9030116872178316e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2703528, |
| "step": 9515 |
| }, |
| { |
| "epoch": 17.761194029850746, |
| "grad_norm": 0.021340399980545044, |
| "learning_rate": 1.8874660795922067e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2704904, |
| "step": 9520 |
| }, |
| { |
| "epoch": 17.770522388059703, |
| "grad_norm": 0.8271576762199402, |
| "learning_rate": 1.8719817369937082e-06, |
| "loss": 0.0168, |
| "num_input_tokens_seen": 2706376, |
| "step": 9525 |
| }, |
| { |
| "epoch": 17.779850746268657, |
| "grad_norm": 1.2568962574005127, |
| "learning_rate": 1.8565587004670898e-06, |
| "loss": 0.0318, |
| "num_input_tokens_seen": 2708072, |
| "step": 9530 |
| }, |
| { |
| "epoch": 17.78917910447761, |
| "grad_norm": 0.05921648070216179, |
| "learning_rate": 1.8411970108946296e-06, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 2709448, |
| "step": 9535 |
| }, |
| { |
| "epoch": 17.798507462686565, |
| "grad_norm": 0.054840583354234695, |
| "learning_rate": 1.8258967089959749e-06, |
| "loss": 0.0047, |
| "num_input_tokens_seen": 2710856, |
| "step": 9540 |
| }, |
| { |
| "epoch": 17.807835820895523, |
| "grad_norm": 1.218041181564331, |
| "learning_rate": 1.8106578353280585e-06, |
| "loss": 0.0172, |
| "num_input_tokens_seen": 2712232, |
| "step": 9545 |
| }, |
| { |
| "epoch": 17.817164179104477, |
| "grad_norm": 0.029290800914168358, |
| "learning_rate": 1.7954804302849793e-06, |
| "loss": 0.018, |
| "num_input_tokens_seen": 2713800, |
| "step": 9550 |
| }, |
| { |
| "epoch": 17.826492537313435, |
| "grad_norm": 0.057863421738147736, |
| "learning_rate": 1.7803645340978948e-06, |
| "loss": 0.0348, |
| "num_input_tokens_seen": 2715240, |
| "step": 9555 |
| }, |
| { |
| "epoch": 17.83582089552239, |
| "grad_norm": 0.007874362170696259, |
| "learning_rate": 1.7653101868349343e-06, |
| "loss": 0.0065, |
| "num_input_tokens_seen": 2716712, |
| "step": 9560 |
| }, |
| { |
| "epoch": 17.845149253731343, |
| "grad_norm": 0.030309578403830528, |
| "learning_rate": 1.750317428401066e-06, |
| "loss": 0.0303, |
| "num_input_tokens_seen": 2718152, |
| "step": 9565 |
| }, |
| { |
| "epoch": 17.854477611940297, |
| "grad_norm": 0.06596005707979202, |
| "learning_rate": 1.7353862985380027e-06, |
| "loss": 0.0085, |
| "num_input_tokens_seen": 2719400, |
| "step": 9570 |
| }, |
| { |
| "epoch": 17.863805970149254, |
| "grad_norm": 0.024357857182621956, |
| "learning_rate": 1.7205168368240986e-06, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 2721224, |
| "step": 9575 |
| }, |
| { |
| "epoch": 17.87313432835821, |
| "grad_norm": 0.05525515601038933, |
| "learning_rate": 1.7057090826742505e-06, |
| "loss": 0.0376, |
| "num_input_tokens_seen": 2722632, |
| "step": 9580 |
| }, |
| { |
| "epoch": 17.882462686567163, |
| "grad_norm": 0.03141052648425102, |
| "learning_rate": 1.6909630753397716e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2724200, |
| "step": 9585 |
| }, |
| { |
| "epoch": 17.89179104477612, |
| "grad_norm": 0.07031945139169693, |
| "learning_rate": 1.6762788539083086e-06, |
| "loss": 0.0009, |
| "num_input_tokens_seen": 2725704, |
| "step": 9590 |
| }, |
| { |
| "epoch": 17.901119402985074, |
| "grad_norm": 1.3148480653762817, |
| "learning_rate": 1.6616564573037342e-06, |
| "loss": 0.0314, |
| "num_input_tokens_seen": 2727144, |
| "step": 9595 |
| }, |
| { |
| "epoch": 17.91044776119403, |
| "grad_norm": 0.07452524453401566, |
| "learning_rate": 1.6470959242860296e-06, |
| "loss": 0.0054, |
| "num_input_tokens_seen": 2728488, |
| "step": 9600 |
| }, |
| { |
| "epoch": 17.919776119402986, |
| "grad_norm": 3.458937168121338, |
| "learning_rate": 1.6325972934512018e-06, |
| "loss": 0.0421, |
| "num_input_tokens_seen": 2729864, |
| "step": 9605 |
| }, |
| { |
| "epoch": 17.92910447761194, |
| "grad_norm": 0.9662806391716003, |
| "learning_rate": 1.6181606032311696e-06, |
| "loss": 0.0254, |
| "num_input_tokens_seen": 2731368, |
| "step": 9610 |
| }, |
| { |
| "epoch": 17.938432835820894, |
| "grad_norm": 1.094976782798767, |
| "learning_rate": 1.6037858918936638e-06, |
| "loss": 0.0261, |
| "num_input_tokens_seen": 2732744, |
| "step": 9615 |
| }, |
| { |
| "epoch": 17.94776119402985, |
| "grad_norm": 0.05033053085207939, |
| "learning_rate": 1.589473197542124e-06, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2734152, |
| "step": 9620 |
| }, |
| { |
| "epoch": 17.957089552238806, |
| "grad_norm": 0.06351548433303833, |
| "learning_rate": 1.5752225581155993e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 2735720, |
| "step": 9625 |
| }, |
| { |
| "epoch": 17.96641791044776, |
| "grad_norm": 1.6464263200759888, |
| "learning_rate": 1.5610340113886568e-06, |
| "loss": 0.0466, |
| "num_input_tokens_seen": 2737032, |
| "step": 9630 |
| }, |
| { |
| "epoch": 17.975746268656717, |
| "grad_norm": 0.05575360730290413, |
| "learning_rate": 1.5469075949712613e-06, |
| "loss": 0.007, |
| "num_input_tokens_seen": 2738568, |
| "step": 9635 |
| }, |
| { |
| "epoch": 17.98507462686567, |
| "grad_norm": 1.8744620084762573, |
| "learning_rate": 1.5328433463086904e-06, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 2739912, |
| "step": 9640 |
| }, |
| { |
| "epoch": 17.994402985074625, |
| "grad_norm": 0.5996155738830566, |
| "learning_rate": 1.5188413026814396e-06, |
| "loss": 0.0019, |
| "num_input_tokens_seen": 2741256, |
| "step": 9645 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 1.5183556079864502, |
| "eval_runtime": 2.9485, |
| "eval_samples_per_second": 80.719, |
| "eval_steps_per_second": 20.349, |
| "num_input_tokens_seen": 2741936, |
| "step": 9648 |
| }, |
| { |
| "epoch": 18.003731343283583, |
| "grad_norm": 0.04537326097488403, |
| "learning_rate": 1.5049015012051055e-06, |
| "loss": 0.0184, |
| "num_input_tokens_seen": 2742640, |
| "step": 9650 |
| }, |
| { |
| "epoch": 18.013059701492537, |
| "grad_norm": 0.02327471785247326, |
| "learning_rate": 1.4910239788303027e-06, |
| "loss": 0.0056, |
| "num_input_tokens_seen": 2743888, |
| "step": 9655 |
| }, |
| { |
| "epoch": 18.02238805970149, |
| "grad_norm": 0.5093995928764343, |
| "learning_rate": 1.4772087723425559e-06, |
| "loss": 0.0195, |
| "num_input_tokens_seen": 2745424, |
| "step": 9660 |
| }, |
| { |
| "epoch": 18.03171641791045, |
| "grad_norm": 0.9184834361076355, |
| "learning_rate": 1.4634559183622193e-06, |
| "loss": 0.0111, |
| "num_input_tokens_seen": 2746896, |
| "step": 9665 |
| }, |
| { |
| "epoch": 18.041044776119403, |
| "grad_norm": 0.055679481476545334, |
| "learning_rate": 1.4497654533443538e-06, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2748272, |
| "step": 9670 |
| }, |
| { |
| "epoch": 18.050373134328357, |
| "grad_norm": 0.023909028619527817, |
| "learning_rate": 1.436137413578653e-06, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 2749616, |
| "step": 9675 |
| }, |
| { |
| "epoch": 18.059701492537314, |
| "grad_norm": 0.029305724427103996, |
| "learning_rate": 1.4225718351893374e-06, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 2751088, |
| "step": 9680 |
| }, |
| { |
| "epoch": 18.06902985074627, |
| "grad_norm": 0.9868423342704773, |
| "learning_rate": 1.4090687541350538e-06, |
| "loss": 0.0324, |
| "num_input_tokens_seen": 2752496, |
| "step": 9685 |
| }, |
| { |
| "epoch": 18.078358208955223, |
| "grad_norm": 0.07312313467264175, |
| "learning_rate": 1.3956282062087933e-06, |
| "loss": 0.0044, |
| "num_input_tokens_seen": 2753872, |
| "step": 9690 |
| }, |
| { |
| "epoch": 18.08768656716418, |
| "grad_norm": 1.1943861246109009, |
| "learning_rate": 1.3822502270377762e-06, |
| "loss": 0.0418, |
| "num_input_tokens_seen": 2755216, |
| "step": 9695 |
| }, |
| { |
| "epoch": 18.097014925373134, |
| "grad_norm": 0.03826881945133209, |
| "learning_rate": 1.368934852083384e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2756624, |
| "step": 9700 |
| }, |
| { |
| "epoch": 18.10634328358209, |
| "grad_norm": 0.009288906119763851, |
| "learning_rate": 1.3556821166410522e-06, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 2758160, |
| "step": 9705 |
| }, |
| { |
| "epoch": 18.115671641791046, |
| "grad_norm": 0.0451342947781086, |
| "learning_rate": 1.3424920558401611e-06, |
| "loss": 0.0158, |
| "num_input_tokens_seen": 2759568, |
| "step": 9710 |
| }, |
| { |
| "epoch": 18.125, |
| "grad_norm": 0.0310677457600832, |
| "learning_rate": 1.3293647046439678e-06, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 2761072, |
| "step": 9715 |
| }, |
| { |
| "epoch": 18.134328358208954, |
| "grad_norm": 0.07583888620138168, |
| "learning_rate": 1.3163000978495072e-06, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 2762576, |
| "step": 9720 |
| }, |
| { |
| "epoch": 18.14365671641791, |
| "grad_norm": 0.7294575572013855, |
| "learning_rate": 1.3032982700874802e-06, |
| "loss": 0.0138, |
| "num_input_tokens_seen": 2764112, |
| "step": 9725 |
| }, |
| { |
| "epoch": 18.152985074626866, |
| "grad_norm": 0.7203591465950012, |
| "learning_rate": 1.2903592558222016e-06, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 2765584, |
| "step": 9730 |
| }, |
| { |
| "epoch": 18.16231343283582, |
| "grad_norm": 0.940115213394165, |
| "learning_rate": 1.2774830893514583e-06, |
| "loss": 0.031, |
| "num_input_tokens_seen": 2766960, |
| "step": 9735 |
| }, |
| { |
| "epoch": 18.171641791044777, |
| "grad_norm": 0.09588240832090378, |
| "learning_rate": 1.2646698048064703e-06, |
| "loss": 0.0407, |
| "num_input_tokens_seen": 2768464, |
| "step": 9740 |
| }, |
| { |
| "epoch": 18.18097014925373, |
| "grad_norm": 0.20360387861728668, |
| "learning_rate": 1.2519194361517466e-06, |
| "loss": 0.0083, |
| "num_input_tokens_seen": 2769840, |
| "step": 9745 |
| }, |
| { |
| "epoch": 18.190298507462686, |
| "grad_norm": 0.0818498507142067, |
| "learning_rate": 1.2392320171850546e-06, |
| "loss": 0.0027, |
| "num_input_tokens_seen": 2771376, |
| "step": 9750 |
| }, |
| { |
| "epoch": 18.199626865671643, |
| "grad_norm": 1.647507905960083, |
| "learning_rate": 1.2266075815372701e-06, |
| "loss": 0.0211, |
| "num_input_tokens_seen": 2772752, |
| "step": 9755 |
| }, |
| { |
| "epoch": 18.208955223880597, |
| "grad_norm": 0.03543038293719292, |
| "learning_rate": 1.2140461626723414e-06, |
| "loss": 0.001, |
| "num_input_tokens_seen": 2774192, |
| "step": 9760 |
| }, |
| { |
| "epoch": 18.21828358208955, |
| "grad_norm": 0.044983524829149246, |
| "learning_rate": 1.2015477938871617e-06, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 2775760, |
| "step": 9765 |
| }, |
| { |
| "epoch": 18.22761194029851, |
| "grad_norm": 0.04682010039687157, |
| "learning_rate": 1.1891125083114962e-06, |
| "loss": 0.0093, |
| "num_input_tokens_seen": 2777008, |
| "step": 9770 |
| }, |
| { |
| "epoch": 18.236940298507463, |
| "grad_norm": 0.06852086633443832, |
| "learning_rate": 1.1767403389079057e-06, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2778448, |
| "step": 9775 |
| }, |
| { |
| "epoch": 18.246268656716417, |
| "grad_norm": 0.03673939406871796, |
| "learning_rate": 1.164431318471626e-06, |
| "loss": 0.052, |
| "num_input_tokens_seen": 2780016, |
| "step": 9780 |
| }, |
| { |
| "epoch": 18.255597014925375, |
| "grad_norm": 0.06711920350790024, |
| "learning_rate": 1.1521854796305242e-06, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2781392, |
| "step": 9785 |
| }, |
| { |
| "epoch": 18.26492537313433, |
| "grad_norm": 1.4102665185928345, |
| "learning_rate": 1.1400028548449821e-06, |
| "loss": 0.0168, |
| "num_input_tokens_seen": 2782768, |
| "step": 9790 |
| }, |
| { |
| "epoch": 18.274253731343283, |
| "grad_norm": 0.03481665626168251, |
| "learning_rate": 1.1278834764078123e-06, |
| "loss": 0.0289, |
| "num_input_tokens_seen": 2784336, |
| "step": 9795 |
| }, |
| { |
| "epoch": 18.28358208955224, |
| "grad_norm": 0.024769533425569534, |
| "learning_rate": 1.1158273764441868e-06, |
| "loss": 0.0022, |
| "num_input_tokens_seen": 2785808, |
| "step": 9800 |
| }, |
| { |
| "epoch": 18.292910447761194, |
| "grad_norm": 1.2971240282058716, |
| "learning_rate": 1.103834586911534e-06, |
| "loss": 0.0364, |
| "num_input_tokens_seen": 2787216, |
| "step": 9805 |
| }, |
| { |
| "epoch": 18.30223880597015, |
| "grad_norm": 1.2479645013809204, |
| "learning_rate": 1.0919051395994778e-06, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 2788688, |
| "step": 9810 |
| }, |
| { |
| "epoch": 18.311567164179106, |
| "grad_norm": 0.0322304293513298, |
| "learning_rate": 1.0800390661297261e-06, |
| "loss": 0.048, |
| "num_input_tokens_seen": 2789936, |
| "step": 9815 |
| }, |
| { |
| "epoch": 18.32089552238806, |
| "grad_norm": 0.02541501261293888, |
| "learning_rate": 1.0682363979560046e-06, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 2791344, |
| "step": 9820 |
| }, |
| { |
| "epoch": 18.330223880597014, |
| "grad_norm": 0.2888713479042053, |
| "learning_rate": 1.056497166363976e-06, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 2792944, |
| "step": 9825 |
| }, |
| { |
| "epoch": 18.33955223880597, |
| "grad_norm": 1.8631011247634888, |
| "learning_rate": 1.0448214024711384e-06, |
| "loss": 0.0211, |
| "num_input_tokens_seen": 2794480, |
| "step": 9830 |
| }, |
| { |
| "epoch": 18.348880597014926, |
| "grad_norm": 1.3589205741882324, |
| "learning_rate": 1.0332091372267566e-06, |
| "loss": 0.0078, |
| "num_input_tokens_seen": 2795856, |
| "step": 9835 |
| }, |
| { |
| "epoch": 18.35820895522388, |
| "grad_norm": 0.04251015558838844, |
| "learning_rate": 1.0216604014117837e-06, |
| "loss": 0.005, |
| "num_input_tokens_seen": 2797168, |
| "step": 9840 |
| }, |
| { |
| "epoch": 18.367537313432837, |
| "grad_norm": 0.6733338236808777, |
| "learning_rate": 1.0101752256387682e-06, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 2798544, |
| "step": 9845 |
| }, |
| { |
| "epoch": 18.37686567164179, |
| "grad_norm": 0.036090996116399765, |
| "learning_rate": 9.98753640351785e-07, |
| "loss": 0.0258, |
| "num_input_tokens_seen": 2799856, |
| "step": 9850 |
| }, |
| { |
| "epoch": 18.386194029850746, |
| "grad_norm": 0.03340015187859535, |
| "learning_rate": 9.873956758263359e-07, |
| "loss": 0.0194, |
| "num_input_tokens_seen": 2801360, |
| "step": 9855 |
| }, |
| { |
| "epoch": 18.395522388059703, |
| "grad_norm": 0.07597798854112625, |
| "learning_rate": 9.76101362169296e-07, |
| "loss": 0.0039, |
| "num_input_tokens_seen": 2802736, |
| "step": 9860 |
| }, |
| { |
| "epoch": 18.404850746268657, |
| "grad_norm": 0.03966192528605461, |
| "learning_rate": 9.64870729318809e-07, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2804208, |
| "step": 9865 |
| }, |
| { |
| "epoch": 18.41417910447761, |
| "grad_norm": 0.015840215608477592, |
| "learning_rate": 9.537038070442206e-07, |
| "loss": 0.0135, |
| "num_input_tokens_seen": 2805744, |
| "step": 9870 |
| }, |
| { |
| "epoch": 18.423507462686565, |
| "grad_norm": 0.05047855153679848, |
| "learning_rate": 9.42600624945994e-07, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2807088, |
| "step": 9875 |
| }, |
| { |
| "epoch": 18.432835820895523, |
| "grad_norm": 0.09986899048089981, |
| "learning_rate": 9.315612124556477e-07, |
| "loss": 0.0037, |
| "num_input_tokens_seen": 2808432, |
| "step": 9880 |
| }, |
| { |
| "epoch": 18.442164179104477, |
| "grad_norm": 1.9718987941741943, |
| "learning_rate": 9.205855988356466e-07, |
| "loss": 0.0542, |
| "num_input_tokens_seen": 2809936, |
| "step": 9885 |
| }, |
| { |
| "epoch": 18.451492537313435, |
| "grad_norm": 0.901888906955719, |
| "learning_rate": 9.096738131793542e-07, |
| "loss": 0.0081, |
| "num_input_tokens_seen": 2811312, |
| "step": 9890 |
| }, |
| { |
| "epoch": 18.46082089552239, |
| "grad_norm": 0.061976321041584015, |
| "learning_rate": 8.988258844109393e-07, |
| "loss": 0.0295, |
| "num_input_tokens_seen": 2812656, |
| "step": 9895 |
| }, |
| { |
| "epoch": 18.470149253731343, |
| "grad_norm": 0.01533551700413227, |
| "learning_rate": 8.880418412853059e-07, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 2814320, |
| "step": 9900 |
| }, |
| { |
| "epoch": 18.479477611940297, |
| "grad_norm": 0.031214183196425438, |
| "learning_rate": 8.773217123880073e-07, |
| "loss": 0.0055, |
| "num_input_tokens_seen": 2815824, |
| "step": 9905 |
| }, |
| { |
| "epoch": 18.488805970149254, |
| "grad_norm": 2.1086349487304688, |
| "learning_rate": 8.666655261351908e-07, |
| "loss": 0.0087, |
| "num_input_tokens_seen": 2817296, |
| "step": 9910 |
| }, |
| { |
| "epoch": 18.49813432835821, |
| "grad_norm": 0.0364716574549675, |
| "learning_rate": 8.560733107734947e-07, |
| "loss": 0.0058, |
| "num_input_tokens_seen": 2819024, |
| "step": 9915 |
| }, |
| { |
| "epoch": 18.507462686567163, |
| "grad_norm": 0.044736701995134354, |
| "learning_rate": 8.455450943799958e-07, |
| "loss": 0.0032, |
| "num_input_tokens_seen": 2820592, |
| "step": 9920 |
| }, |
| { |
| "epoch": 18.51679104477612, |
| "grad_norm": 1.2729895114898682, |
| "learning_rate": 8.35080904862126e-07, |
| "loss": 0.0118, |
| "num_input_tokens_seen": 2822064, |
| "step": 9925 |
| }, |
| { |
| "epoch": 18.526119402985074, |
| "grad_norm": 0.018470635637640953, |
| "learning_rate": 8.246807699576032e-07, |
| "loss": 0.0446, |
| "num_input_tokens_seen": 2823440, |
| "step": 9930 |
| }, |
| { |
| "epoch": 18.53544776119403, |
| "grad_norm": 0.04572044685482979, |
| "learning_rate": 8.143447172343471e-07, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2824848, |
| "step": 9935 |
| }, |
| { |
| "epoch": 18.544776119402986, |
| "grad_norm": 0.05050905421376228, |
| "learning_rate": 8.040727740904113e-07, |
| "loss": 0.0097, |
| "num_input_tokens_seen": 2826128, |
| "step": 9940 |
| }, |
| { |
| "epoch": 18.55410447761194, |
| "grad_norm": 0.0652928501367569, |
| "learning_rate": 7.938649677539267e-07, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 2827664, |
| "step": 9945 |
| }, |
| { |
| "epoch": 18.563432835820894, |
| "grad_norm": 0.04034271836280823, |
| "learning_rate": 7.837213252829989e-07, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 2829136, |
| "step": 9950 |
| }, |
| { |
| "epoch": 18.57276119402985, |
| "grad_norm": 0.02220243401825428, |
| "learning_rate": 7.736418735656586e-07, |
| "loss": 0.0177, |
| "num_input_tokens_seen": 2830672, |
| "step": 9955 |
| }, |
| { |
| "epoch": 18.582089552238806, |
| "grad_norm": 0.03507387638092041, |
| "learning_rate": 7.636266393197866e-07, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2832112, |
| "step": 9960 |
| }, |
| { |
| "epoch": 18.59141791044776, |
| "grad_norm": 0.034595634788274765, |
| "learning_rate": 7.536756490930358e-07, |
| "loss": 0.0003, |
| "num_input_tokens_seen": 2833648, |
| "step": 9965 |
| }, |
| { |
| "epoch": 18.600746268656717, |
| "grad_norm": 0.698068380355835, |
| "learning_rate": 7.437889292627787e-07, |
| "loss": 0.0045, |
| "num_input_tokens_seen": 2834896, |
| "step": 9970 |
| }, |
| { |
| "epoch": 18.61007462686567, |
| "grad_norm": 1.3435269594192505, |
| "learning_rate": 7.339665060360018e-07, |
| "loss": 0.0562, |
| "num_input_tokens_seen": 2836240, |
| "step": 9975 |
| }, |
| { |
| "epoch": 18.619402985074625, |
| "grad_norm": 0.533918559551239, |
| "learning_rate": 7.24208405449281e-07, |
| "loss": 0.029, |
| "num_input_tokens_seen": 2837680, |
| "step": 9980 |
| }, |
| { |
| "epoch": 18.628731343283583, |
| "grad_norm": 0.02629079855978489, |
| "learning_rate": 7.145146533686725e-07, |
| "loss": 0.0394, |
| "num_input_tokens_seen": 2839056, |
| "step": 9985 |
| }, |
| { |
| "epoch": 18.638059701492537, |
| "grad_norm": 0.08451460301876068, |
| "learning_rate": 7.048852754896806e-07, |
| "loss": 0.0218, |
| "num_input_tokens_seen": 2840304, |
| "step": 9990 |
| }, |
| { |
| "epoch": 18.64738805970149, |
| "grad_norm": 0.09654192626476288, |
| "learning_rate": 6.953202973371514e-07, |
| "loss": 0.0045, |
| "num_input_tokens_seen": 2841840, |
| "step": 9995 |
| }, |
| { |
| "epoch": 18.65671641791045, |
| "grad_norm": 0.05838331952691078, |
| "learning_rate": 6.858197442652369e-07, |
| "loss": 0.0041, |
| "num_input_tokens_seen": 2843344, |
| "step": 10000 |
| }, |
| { |
| "epoch": 18.666044776119403, |
| "grad_norm": 1.2025679349899292, |
| "learning_rate": 6.763836414573232e-07, |
| "loss": 0.0164, |
| "num_input_tokens_seen": 2844848, |
| "step": 10005 |
| }, |
| { |
| "epoch": 18.675373134328357, |
| "grad_norm": 0.03383278846740723, |
| "learning_rate": 6.670120139259328e-07, |
| "loss": 0.0431, |
| "num_input_tokens_seen": 2846192, |
| "step": 10010 |
| }, |
| { |
| "epoch": 18.684701492537314, |
| "grad_norm": 2.6112592220306396, |
| "learning_rate": 6.577048865127028e-07, |
| "loss": 0.0394, |
| "num_input_tokens_seen": 2847504, |
| "step": 10015 |
| }, |
| { |
| "epoch": 18.69402985074627, |
| "grad_norm": 0.03963230177760124, |
| "learning_rate": 6.484622838882903e-07, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2848784, |
| "step": 10020 |
| }, |
| { |
| "epoch": 18.703358208955223, |
| "grad_norm": 0.034403346478939056, |
| "learning_rate": 6.392842305523172e-07, |
| "loss": 0.0046, |
| "num_input_tokens_seen": 2850192, |
| "step": 10025 |
| }, |
| { |
| "epoch": 18.71268656716418, |
| "grad_norm": 0.035918716341257095, |
| "learning_rate": 6.301707508332977e-07, |
| "loss": 0.0476, |
| "num_input_tokens_seen": 2851536, |
| "step": 10030 |
| }, |
| { |
| "epoch": 18.722014925373134, |
| "grad_norm": 0.7790325880050659, |
| "learning_rate": 6.2112186888858e-07, |
| "loss": 0.0025, |
| "num_input_tokens_seen": 2853200, |
| "step": 10035 |
| }, |
| { |
| "epoch": 18.73134328358209, |
| "grad_norm": 0.04096542298793793, |
| "learning_rate": 6.121376087042913e-07, |
| "loss": 0.04, |
| "num_input_tokens_seen": 2854608, |
| "step": 10040 |
| }, |
| { |
| "epoch": 18.740671641791046, |
| "grad_norm": 0.06973053514957428, |
| "learning_rate": 6.032179940952509e-07, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2855984, |
| "step": 10045 |
| }, |
| { |
| "epoch": 18.75, |
| "grad_norm": 0.04260961338877678, |
| "learning_rate": 5.943630487049295e-07, |
| "loss": 0.017, |
| "num_input_tokens_seen": 2857424, |
| "step": 10050 |
| }, |
| { |
| "epoch": 18.759328358208954, |
| "grad_norm": 0.06968097388744354, |
| "learning_rate": 5.855727960053653e-07, |
| "loss": 0.0297, |
| "num_input_tokens_seen": 2858832, |
| "step": 10055 |
| }, |
| { |
| "epoch": 18.76865671641791, |
| "grad_norm": 1.3283379077911377, |
| "learning_rate": 5.768472592971308e-07, |
| "loss": 0.017, |
| "num_input_tokens_seen": 2860240, |
| "step": 10060 |
| }, |
| { |
| "epoch": 18.777985074626866, |
| "grad_norm": 0.0491754449903965, |
| "learning_rate": 5.681864617092414e-07, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 2861616, |
| "step": 10065 |
| }, |
| { |
| "epoch": 18.78731343283582, |
| "grad_norm": 0.05519312620162964, |
| "learning_rate": 5.595904261991109e-07, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2862928, |
| "step": 10070 |
| }, |
| { |
| "epoch": 18.796641791044777, |
| "grad_norm": 0.05045041814446449, |
| "learning_rate": 5.510591755524874e-07, |
| "loss": 0.0039, |
| "num_input_tokens_seen": 2864496, |
| "step": 10075 |
| }, |
| { |
| "epoch": 18.80597014925373, |
| "grad_norm": 0.06302618235349655, |
| "learning_rate": 5.425927323833902e-07, |
| "loss": 0.0068, |
| "num_input_tokens_seen": 2865712, |
| "step": 10080 |
| }, |
| { |
| "epoch": 18.815298507462686, |
| "grad_norm": 0.058925140649080276, |
| "learning_rate": 5.341911191340504e-07, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2867184, |
| "step": 10085 |
| }, |
| { |
| "epoch": 18.824626865671643, |
| "grad_norm": 0.676299512386322, |
| "learning_rate": 5.258543580748565e-07, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 2868624, |
| "step": 10090 |
| }, |
| { |
| "epoch": 18.833955223880597, |
| "grad_norm": 0.809614896774292, |
| "learning_rate": 5.175824713042926e-07, |
| "loss": 0.0139, |
| "num_input_tokens_seen": 2870000, |
| "step": 10095 |
| }, |
| { |
| "epoch": 18.84328358208955, |
| "grad_norm": 0.6781437993049622, |
| "learning_rate": 5.093754807488693e-07, |
| "loss": 0.0096, |
| "num_input_tokens_seen": 2871536, |
| "step": 10100 |
| }, |
| { |
| "epoch": 18.85261194029851, |
| "grad_norm": 0.044449422508478165, |
| "learning_rate": 5.01233408163082e-07, |
| "loss": 0.0183, |
| "num_input_tokens_seen": 2872944, |
| "step": 10105 |
| }, |
| { |
| "epoch": 18.861940298507463, |
| "grad_norm": 0.03643164038658142, |
| "learning_rate": 4.931562751293528e-07, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 2874288, |
| "step": 10110 |
| }, |
| { |
| "epoch": 18.871268656716417, |
| "grad_norm": 0.043168120086193085, |
| "learning_rate": 4.851441030579523e-07, |
| "loss": 0.0026, |
| "num_input_tokens_seen": 2875728, |
| "step": 10115 |
| }, |
| { |
| "epoch": 18.880597014925375, |
| "grad_norm": 0.04298883676528931, |
| "learning_rate": 4.771969131869669e-07, |
| "loss": 0.021, |
| "num_input_tokens_seen": 2877136, |
| "step": 10120 |
| }, |
| { |
| "epoch": 18.88992537313433, |
| "grad_norm": 1.2369238138198853, |
| "learning_rate": 4.6931472658223176e-07, |
| "loss": 0.018, |
| "num_input_tokens_seen": 2878640, |
| "step": 10125 |
| }, |
| { |
| "epoch": 18.899253731343283, |
| "grad_norm": 1.134828805923462, |
| "learning_rate": 4.614975641372754e-07, |
| "loss": 0.0441, |
| "num_input_tokens_seen": 2879824, |
| "step": 10130 |
| }, |
| { |
| "epoch": 18.90858208955224, |
| "grad_norm": 1.2059731483459473, |
| "learning_rate": 4.5374544657326157e-07, |
| "loss": 0.0101, |
| "num_input_tokens_seen": 2881232, |
| "step": 10135 |
| }, |
| { |
| "epoch": 18.917910447761194, |
| "grad_norm": 0.13986089825630188, |
| "learning_rate": 4.460583944389418e-07, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2882640, |
| "step": 10140 |
| }, |
| { |
| "epoch": 18.92723880597015, |
| "grad_norm": 0.06102216988801956, |
| "learning_rate": 4.3843642811059737e-07, |
| "loss": 0.0031, |
| "num_input_tokens_seen": 2883920, |
| "step": 10145 |
| }, |
| { |
| "epoch": 18.936567164179106, |
| "grad_norm": 1.5771390199661255, |
| "learning_rate": 4.3087956779198356e-07, |
| "loss": 0.0304, |
| "num_input_tokens_seen": 2885456, |
| "step": 10150 |
| }, |
| { |
| "epoch": 18.94589552238806, |
| "grad_norm": 2.980865716934204, |
| "learning_rate": 4.2338783351427156e-07, |
| "loss": 0.0573, |
| "num_input_tokens_seen": 2887056, |
| "step": 10155 |
| }, |
| { |
| "epoch": 18.955223880597014, |
| "grad_norm": 0.7480371594429016, |
| "learning_rate": 4.159612451360151e-07, |
| "loss": 0.0284, |
| "num_input_tokens_seen": 2888368, |
| "step": 10160 |
| }, |
| { |
| "epoch": 18.96455223880597, |
| "grad_norm": 0.02631082572042942, |
| "learning_rate": 4.085998223430698e-07, |
| "loss": 0.0037, |
| "num_input_tokens_seen": 2889744, |
| "step": 10165 |
| }, |
| { |
| "epoch": 18.973880597014926, |
| "grad_norm": 0.08783715963363647, |
| "learning_rate": 4.013035846485658e-07, |
| "loss": 0.0242, |
| "num_input_tokens_seen": 2891024, |
| "step": 10170 |
| }, |
| { |
| "epoch": 18.98320895522388, |
| "grad_norm": 0.8408134579658508, |
| "learning_rate": 3.940725513928323e-07, |
| "loss": 0.0148, |
| "num_input_tokens_seen": 2892656, |
| "step": 10175 |
| }, |
| { |
| "epoch": 18.992537313432837, |
| "grad_norm": 0.08102292567491531, |
| "learning_rate": 3.8690674174337305e-07, |
| "loss": 0.0222, |
| "num_input_tokens_seen": 2893968, |
| "step": 10180 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 1.5373179912567139, |
| "eval_runtime": 2.8908, |
| "eval_samples_per_second": 82.329, |
| "eval_steps_per_second": 20.755, |
| "num_input_tokens_seen": 2894976, |
| "step": 10184 |
| }, |
| { |
| "epoch": 19.00186567164179, |
| "grad_norm": 0.02344077080488205, |
| "learning_rate": 3.7980617469479953e-07, |
| "loss": 0.0483, |
| "num_input_tokens_seen": 2895264, |
| "step": 10185 |
| }, |
| { |
| "epoch": 19.011194029850746, |
| "grad_norm": 1.1383206844329834, |
| "learning_rate": 3.7277086906877256e-07, |
| "loss": 0.0284, |
| "num_input_tokens_seen": 2896672, |
| "step": 10190 |
| }, |
| { |
| "epoch": 19.020522388059703, |
| "grad_norm": 0.03940989822149277, |
| "learning_rate": 3.65800843513972e-07, |
| "loss": 0.0063, |
| "num_input_tokens_seen": 2898176, |
| "step": 10195 |
| }, |
| { |
| "epoch": 19.029850746268657, |
| "grad_norm": 0.5624316930770874, |
| "learning_rate": 3.588961165060356e-07, |
| "loss": 0.0407, |
| "num_input_tokens_seen": 2899520, |
| "step": 10200 |
| }, |
| { |
| "epoch": 19.03917910447761, |
| "grad_norm": 0.08106131851673126, |
| "learning_rate": 3.5205670634751163e-07, |
| "loss": 0.0077, |
| "num_input_tokens_seen": 2900672, |
| "step": 10205 |
| }, |
| { |
| "epoch": 19.04850746268657, |
| "grad_norm": 1.2663943767547607, |
| "learning_rate": 3.452826311678148e-07, |
| "loss": 0.0076, |
| "num_input_tokens_seen": 2902144, |
| "step": 10210 |
| }, |
| { |
| "epoch": 19.057835820895523, |
| "grad_norm": 0.049268901348114014, |
| "learning_rate": 3.3857390892316764e-07, |
| "loss": 0.0169, |
| "num_input_tokens_seen": 2903648, |
| "step": 10215 |
| }, |
| { |
| "epoch": 19.067164179104477, |
| "grad_norm": 0.7177232503890991, |
| "learning_rate": 3.319305573965703e-07, |
| "loss": 0.0336, |
| "num_input_tokens_seen": 2905120, |
| "step": 10220 |
| }, |
| { |
| "epoch": 19.07649253731343, |
| "grad_norm": 0.03332367539405823, |
| "learning_rate": 3.253525941977309e-07, |
| "loss": 0.0162, |
| "num_input_tokens_seen": 2906752, |
| "step": 10225 |
| }, |
| { |
| "epoch": 19.08582089552239, |
| "grad_norm": 0.07545983046293259, |
| "learning_rate": 3.1884003676303786e-07, |
| "loss": 0.0303, |
| "num_input_tokens_seen": 2907936, |
| "step": 10230 |
| }, |
| { |
| "epoch": 19.095149253731343, |
| "grad_norm": 0.04152274504303932, |
| "learning_rate": 3.1239290235550724e-07, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2909248, |
| "step": 10235 |
| }, |
| { |
| "epoch": 19.104477611940297, |
| "grad_norm": 0.010615754872560501, |
| "learning_rate": 3.0601120806473535e-07, |
| "loss": 0.0124, |
| "num_input_tokens_seen": 2911008, |
| "step": 10240 |
| }, |
| { |
| "epoch": 19.113805970149254, |
| "grad_norm": 1.3244807720184326, |
| "learning_rate": 2.9969497080685196e-07, |
| "loss": 0.0396, |
| "num_input_tokens_seen": 2912192, |
| "step": 10245 |
| }, |
| { |
| "epoch": 19.12313432835821, |
| "grad_norm": 0.8979429602622986, |
| "learning_rate": 2.934442073244809e-07, |
| "loss": 0.005, |
| "num_input_tokens_seen": 2913568, |
| "step": 10250 |
| }, |
| { |
| "epoch": 19.132462686567163, |
| "grad_norm": 0.07357241213321686, |
| "learning_rate": 2.87258934186696e-07, |
| "loss": 0.0029, |
| "num_input_tokens_seen": 2914944, |
| "step": 10255 |
| }, |
| { |
| "epoch": 19.14179104477612, |
| "grad_norm": 0.03743238374590874, |
| "learning_rate": 2.8113916778896575e-07, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2916320, |
| "step": 10260 |
| }, |
| { |
| "epoch": 19.151119402985074, |
| "grad_norm": 0.48300549387931824, |
| "learning_rate": 2.750849243531223e-07, |
| "loss": 0.0259, |
| "num_input_tokens_seen": 2917760, |
| "step": 10265 |
| }, |
| { |
| "epoch": 19.16044776119403, |
| "grad_norm": 0.8661367297172546, |
| "learning_rate": 2.6909621992731726e-07, |
| "loss": 0.0407, |
| "num_input_tokens_seen": 2919104, |
| "step": 10270 |
| }, |
| { |
| "epoch": 19.169776119402986, |
| "grad_norm": 0.04742630943655968, |
| "learning_rate": 2.6317307038597196e-07, |
| "loss": 0.0092, |
| "num_input_tokens_seen": 2920480, |
| "step": 10275 |
| }, |
| { |
| "epoch": 19.17910447761194, |
| "grad_norm": 0.0385856106877327, |
| "learning_rate": 2.573154914297438e-07, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 2921824, |
| "step": 10280 |
| }, |
| { |
| "epoch": 19.188432835820894, |
| "grad_norm": 0.7155548334121704, |
| "learning_rate": 2.515234985854736e-07, |
| "loss": 0.0293, |
| "num_input_tokens_seen": 2923200, |
| "step": 10285 |
| }, |
| { |
| "epoch": 19.19776119402985, |
| "grad_norm": 0.802899956703186, |
| "learning_rate": 2.45797107206161e-07, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 2924512, |
| "step": 10290 |
| }, |
| { |
| "epoch": 19.207089552238806, |
| "grad_norm": 1.6426715850830078, |
| "learning_rate": 2.401363324709055e-07, |
| "loss": 0.0061, |
| "num_input_tokens_seen": 2926048, |
| "step": 10295 |
| }, |
| { |
| "epoch": 19.21641791044776, |
| "grad_norm": 0.04436664283275604, |
| "learning_rate": 2.3454118938487367e-07, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 2927456, |
| "step": 10300 |
| }, |
| { |
| "epoch": 19.225746268656717, |
| "grad_norm": 0.023140206933021545, |
| "learning_rate": 2.2901169277927127e-07, |
| "loss": 0.0532, |
| "num_input_tokens_seen": 2928896, |
| "step": 10305 |
| }, |
| { |
| "epoch": 19.23507462686567, |
| "grad_norm": 0.012807963415980339, |
| "learning_rate": 2.2354785731128482e-07, |
| "loss": 0.021, |
| "num_input_tokens_seen": 2930208, |
| "step": 10310 |
| }, |
| { |
| "epoch": 19.244402985074625, |
| "grad_norm": 0.029781557619571686, |
| "learning_rate": 2.18149697464054e-07, |
| "loss": 0.0185, |
| "num_input_tokens_seen": 2931648, |
| "step": 10315 |
| }, |
| { |
| "epoch": 19.253731343283583, |
| "grad_norm": 1.614999771118164, |
| "learning_rate": 2.128172275466217e-07, |
| "loss": 0.0311, |
| "num_input_tokens_seen": 2932960, |
| "step": 10320 |
| }, |
| { |
| "epoch": 19.263059701492537, |
| "grad_norm": 0.044670864939689636, |
| "learning_rate": 2.0755046169392e-07, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2934400, |
| "step": 10325 |
| }, |
| { |
| "epoch": 19.27238805970149, |
| "grad_norm": 0.1330539584159851, |
| "learning_rate": 2.0234941386670925e-07, |
| "loss": 0.0236, |
| "num_input_tokens_seen": 2935680, |
| "step": 10330 |
| }, |
| { |
| "epoch": 19.28171641791045, |
| "grad_norm": 0.0322183296084404, |
| "learning_rate": 1.9721409785154466e-07, |
| "loss": 0.015, |
| "num_input_tokens_seen": 2937056, |
| "step": 10335 |
| }, |
| { |
| "epoch": 19.291044776119403, |
| "grad_norm": 0.7474008798599243, |
| "learning_rate": 1.9214452726075137e-07, |
| "loss": 0.0287, |
| "num_input_tokens_seen": 2938432, |
| "step": 10340 |
| }, |
| { |
| "epoch": 19.300373134328357, |
| "grad_norm": 0.042100176215171814, |
| "learning_rate": 1.871407155323801e-07, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 2939776, |
| "step": 10345 |
| }, |
| { |
| "epoch": 19.309701492537314, |
| "grad_norm": 0.5275018811225891, |
| "learning_rate": 1.8220267593017092e-07, |
| "loss": 0.0329, |
| "num_input_tokens_seen": 2941152, |
| "step": 10350 |
| }, |
| { |
| "epoch": 19.31902985074627, |
| "grad_norm": 0.03239260986447334, |
| "learning_rate": 1.7733042154352008e-07, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2942560, |
| "step": 10355 |
| }, |
| { |
| "epoch": 19.328358208955223, |
| "grad_norm": 0.027028344571590424, |
| "learning_rate": 1.7252396528744663e-07, |
| "loss": 0.0045, |
| "num_input_tokens_seen": 2944128, |
| "step": 10360 |
| }, |
| { |
| "epoch": 19.33768656716418, |
| "grad_norm": 1.7077810764312744, |
| "learning_rate": 1.6778331990255914e-07, |
| "loss": 0.049, |
| "num_input_tokens_seen": 2945632, |
| "step": 10365 |
| }, |
| { |
| "epoch": 19.347014925373134, |
| "grad_norm": 0.032773226499557495, |
| "learning_rate": 1.6310849795500848e-07, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 2946976, |
| "step": 10370 |
| }, |
| { |
| "epoch": 19.35634328358209, |
| "grad_norm": 1.1679803133010864, |
| "learning_rate": 1.584995118364796e-07, |
| "loss": 0.0131, |
| "num_input_tokens_seen": 2948352, |
| "step": 10375 |
| }, |
| { |
| "epoch": 19.365671641791046, |
| "grad_norm": 0.07310190051794052, |
| "learning_rate": 1.5395637376413585e-07, |
| "loss": 0.01, |
| "num_input_tokens_seen": 2949536, |
| "step": 10380 |
| }, |
| { |
| "epoch": 19.375, |
| "grad_norm": 0.023872822523117065, |
| "learning_rate": 1.4947909578059971e-07, |
| "loss": 0.0396, |
| "num_input_tokens_seen": 2950784, |
| "step": 10385 |
| }, |
| { |
| "epoch": 19.384328358208954, |
| "grad_norm": 0.6308934092521667, |
| "learning_rate": 1.4506768975391382e-07, |
| "loss": 0.0069, |
| "num_input_tokens_seen": 2952288, |
| "step": 10390 |
| }, |
| { |
| "epoch": 19.39365671641791, |
| "grad_norm": 0.02815130352973938, |
| "learning_rate": 1.4072216737751055e-07, |
| "loss": 0.0063, |
| "num_input_tokens_seen": 2953728, |
| "step": 10395 |
| }, |
| { |
| "epoch": 19.402985074626866, |
| "grad_norm": 0.04811060428619385, |
| "learning_rate": 1.3644254017018964e-07, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 2955264, |
| "step": 10400 |
| }, |
| { |
| "epoch": 19.41231343283582, |
| "grad_norm": 0.03747585043311119, |
| "learning_rate": 1.3222881947607123e-07, |
| "loss": 0.0237, |
| "num_input_tokens_seen": 2956672, |
| "step": 10405 |
| }, |
| { |
| "epoch": 19.421641791044777, |
| "grad_norm": 0.05800265818834305, |
| "learning_rate": 1.280810164645846e-07, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 2958208, |
| "step": 10410 |
| }, |
| { |
| "epoch": 19.43097014925373, |
| "grad_norm": 1.4601998329162598, |
| "learning_rate": 1.2399914213042373e-07, |
| "loss": 0.0037, |
| "num_input_tokens_seen": 2959680, |
| "step": 10415 |
| }, |
| { |
| "epoch": 19.440298507462686, |
| "grad_norm": 0.050306838005781174, |
| "learning_rate": 1.1998320729352252e-07, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2961152, |
| "step": 10420 |
| }, |
| { |
| "epoch": 19.449626865671643, |
| "grad_norm": 0.02892743982374668, |
| "learning_rate": 1.160332225990296e-07, |
| "loss": 0.0074, |
| "num_input_tokens_seen": 2962592, |
| "step": 10425 |
| }, |
| { |
| "epoch": 19.458955223880597, |
| "grad_norm": 0.018832534551620483, |
| "learning_rate": 1.1214919851728068e-07, |
| "loss": 0.0285, |
| "num_input_tokens_seen": 2964032, |
| "step": 10430 |
| }, |
| { |
| "epoch": 19.46828358208955, |
| "grad_norm": 0.5395162105560303, |
| "learning_rate": 1.0833114534376798e-07, |
| "loss": 0.0234, |
| "num_input_tokens_seen": 2965568, |
| "step": 10435 |
| }, |
| { |
| "epoch": 19.47761194029851, |
| "grad_norm": 0.041747547686100006, |
| "learning_rate": 1.0457907319909865e-07, |
| "loss": 0.0004, |
| "num_input_tokens_seen": 2966976, |
| "step": 10440 |
| }, |
| { |
| "epoch": 19.486940298507463, |
| "grad_norm": 0.04969732463359833, |
| "learning_rate": 1.0089299202900304e-07, |
| "loss": 0.0055, |
| "num_input_tokens_seen": 2968672, |
| "step": 10445 |
| }, |
| { |
| "epoch": 19.496268656716417, |
| "grad_norm": 0.03835641220211983, |
| "learning_rate": 9.727291160427366e-08, |
| "loss": 0.0105, |
| "num_input_tokens_seen": 2970400, |
| "step": 10450 |
| }, |
| { |
| "epoch": 19.505597014925375, |
| "grad_norm": 0.03803160414099693, |
| "learning_rate": 9.371884152075683e-08, |
| "loss": 0.0064, |
| "num_input_tokens_seen": 2972064, |
| "step": 10455 |
| }, |
| { |
| "epoch": 19.51492537313433, |
| "grad_norm": 0.04522380232810974, |
| "learning_rate": 9.023079119932498e-08, |
| "loss": 0.0182, |
| "num_input_tokens_seen": 2973600, |
| "step": 10460 |
| }, |
| { |
| "epoch": 19.524253731343283, |
| "grad_norm": 0.061548732221126556, |
| "learning_rate": 8.680876988584608e-08, |
| "loss": 0.0021, |
| "num_input_tokens_seen": 2974944, |
| "step": 10465 |
| }, |
| { |
| "epoch": 19.53358208955224, |
| "grad_norm": 0.04232119768857956, |
| "learning_rate": 8.345278665116974e-08, |
| "loss": 0.0262, |
| "num_input_tokens_seen": 2976352, |
| "step": 10470 |
| }, |
| { |
| "epoch": 19.542910447761194, |
| "grad_norm": 0.11064521223306656, |
| "learning_rate": 8.01628503910884e-08, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 2977920, |
| "step": 10475 |
| }, |
| { |
| "epoch": 19.55223880597015, |
| "grad_norm": 0.06205377355217934, |
| "learning_rate": 7.693896982632898e-08, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 2979456, |
| "step": 10480 |
| }, |
| { |
| "epoch": 19.561567164179106, |
| "grad_norm": 0.06822652369737625, |
| "learning_rate": 7.378115350251957e-08, |
| "loss": 0.0198, |
| "num_input_tokens_seen": 2980832, |
| "step": 10485 |
| }, |
| { |
| "epoch": 19.57089552238806, |
| "grad_norm": 0.027772486209869385, |
| "learning_rate": 7.068940979017003e-08, |
| "loss": 0.0051, |
| "num_input_tokens_seen": 2982336, |
| "step": 10490 |
| }, |
| { |
| "epoch": 19.580223880597014, |
| "grad_norm": 0.7043355107307434, |
| "learning_rate": 6.766374688464971e-08, |
| "loss": 0.0273, |
| "num_input_tokens_seen": 2983648, |
| "step": 10495 |
| }, |
| { |
| "epoch": 19.58955223880597, |
| "grad_norm": 0.025868261232972145, |
| "learning_rate": 6.470417280616814e-08, |
| "loss": 0.042, |
| "num_input_tokens_seen": 2985216, |
| "step": 10500 |
| }, |
| { |
| "epoch": 19.598880597014926, |
| "grad_norm": 0.04729580134153366, |
| "learning_rate": 6.181069539974716e-08, |
| "loss": 0.0008, |
| "num_input_tokens_seen": 2986592, |
| "step": 10505 |
| }, |
| { |
| "epoch": 19.60820895522388, |
| "grad_norm": 0.05596684664487839, |
| "learning_rate": 5.898332233520987e-08, |
| "loss": 0.0038, |
| "num_input_tokens_seen": 2988000, |
| "step": 10510 |
| }, |
| { |
| "epoch": 19.617537313432837, |
| "grad_norm": 0.007715870626270771, |
| "learning_rate": 5.622206110714734e-08, |
| "loss": 0.0033, |
| "num_input_tokens_seen": 2989280, |
| "step": 10515 |
| }, |
| { |
| "epoch": 19.62686567164179, |
| "grad_norm": 0.03997211158275604, |
| "learning_rate": 5.352691903491303e-08, |
| "loss": 0.0132, |
| "num_input_tokens_seen": 2990784, |
| "step": 10520 |
| }, |
| { |
| "epoch": 19.636194029850746, |
| "grad_norm": 2.087615966796875, |
| "learning_rate": 5.089790326259225e-08, |
| "loss": 0.0427, |
| "num_input_tokens_seen": 2992288, |
| "step": 10525 |
| }, |
| { |
| "epoch": 19.645522388059703, |
| "grad_norm": 0.02270590141415596, |
| "learning_rate": 4.83350207589911e-08, |
| "loss": 0.0208, |
| "num_input_tokens_seen": 2993760, |
| "step": 10530 |
| }, |
| { |
| "epoch": 19.654850746268657, |
| "grad_norm": 0.05315188318490982, |
| "learning_rate": 4.583827831761145e-08, |
| "loss": 0.0007, |
| "num_input_tokens_seen": 2995232, |
| "step": 10535 |
| }, |
| { |
| "epoch": 19.66417910447761, |
| "grad_norm": 0.9703123569488525, |
| "learning_rate": 4.340768255663708e-08, |
| "loss": 0.0456, |
| "num_input_tokens_seen": 2996640, |
| "step": 10540 |
| }, |
| { |
| "epoch": 19.673507462686565, |
| "grad_norm": 1.0346511602401733, |
| "learning_rate": 4.1043239918914233e-08, |
| "loss": 0.036, |
| "num_input_tokens_seen": 2998016, |
| "step": 10545 |
| }, |
| { |
| "epoch": 19.682835820895523, |
| "grad_norm": 0.8127740025520325, |
| "learning_rate": 3.8744956671937784e-08, |
| "loss": 0.0028, |
| "num_input_tokens_seen": 2999328, |
| "step": 10550 |
| }, |
| { |
| "epoch": 19.692164179104477, |
| "grad_norm": 1.9517704248428345, |
| "learning_rate": 3.6512838907828974e-08, |
| "loss": 0.0424, |
| "num_input_tokens_seen": 3000768, |
| "step": 10555 |
| }, |
| { |
| "epoch": 19.701492537313435, |
| "grad_norm": 0.05239935591816902, |
| "learning_rate": 3.4346892543321576e-08, |
| "loss": 0.004, |
| "num_input_tokens_seen": 3002112, |
| "step": 10560 |
| }, |
| { |
| "epoch": 19.71082089552239, |
| "grad_norm": 0.5744187235832214, |
| "learning_rate": 3.224712331975077e-08, |
| "loss": 0.0098, |
| "num_input_tokens_seen": 3003616, |
| "step": 10565 |
| }, |
| { |
| "epoch": 19.720149253731343, |
| "grad_norm": 0.045157551765441895, |
| "learning_rate": 3.021353680303096e-08, |
| "loss": 0.0335, |
| "num_input_tokens_seen": 3005120, |
| "step": 10570 |
| }, |
| { |
| "epoch": 19.729477611940297, |
| "grad_norm": 0.020377272740006447, |
| "learning_rate": 2.82461383836502e-08, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 3006464, |
| "step": 10575 |
| }, |
| { |
| "epoch": 19.738805970149254, |
| "grad_norm": 0.05177824944257736, |
| "learning_rate": 2.634493327663967e-08, |
| "loss": 0.0042, |
| "num_input_tokens_seen": 3008032, |
| "step": 10580 |
| }, |
| { |
| "epoch": 19.74813432835821, |
| "grad_norm": 1.0635018348693848, |
| "learning_rate": 2.450992652157924e-08, |
| "loss": 0.005, |
| "num_input_tokens_seen": 3009440, |
| "step": 10585 |
| }, |
| { |
| "epoch": 19.757462686567163, |
| "grad_norm": 0.03223692253232002, |
| "learning_rate": 2.2741122982569694e-08, |
| "loss": 0.0005, |
| "num_input_tokens_seen": 3011040, |
| "step": 10590 |
| }, |
| { |
| "epoch": 19.76679104477612, |
| "grad_norm": 0.06035694479942322, |
| "learning_rate": 2.1038527348229974e-08, |
| "loss": 0.0125, |
| "num_input_tokens_seen": 3012384, |
| "step": 10595 |
| }, |
| { |
| "epoch": 19.776119402985074, |
| "grad_norm": 0.029151590541005135, |
| "learning_rate": 1.940214413167496e-08, |
| "loss": 0.0225, |
| "num_input_tokens_seen": 3013824, |
| "step": 10600 |
| }, |
| { |
| "epoch": 19.78544776119403, |
| "grad_norm": 0.06548058241605759, |
| "learning_rate": 1.7831977670507148e-08, |
| "loss": 0.0156, |
| "num_input_tokens_seen": 3015168, |
| "step": 10605 |
| }, |
| { |
| "epoch": 19.794776119402986, |
| "grad_norm": 0.024946341291069984, |
| "learning_rate": 1.632803212681666e-08, |
| "loss": 0.0048, |
| "num_input_tokens_seen": 3016544, |
| "step": 10610 |
| }, |
| { |
| "epoch": 19.80410447761194, |
| "grad_norm": 0.7831483483314514, |
| "learning_rate": 1.4890311487150698e-08, |
| "loss": 0.0141, |
| "num_input_tokens_seen": 3017920, |
| "step": 10615 |
| }, |
| { |
| "epoch": 19.813432835820894, |
| "grad_norm": 0.04067152366042137, |
| "learning_rate": 1.3518819562510776e-08, |
| "loss": 0.0045, |
| "num_input_tokens_seen": 3019232, |
| "step": 10620 |
| }, |
| { |
| "epoch": 19.82276119402985, |
| "grad_norm": 0.05499504134058952, |
| "learning_rate": 1.221355998835272e-08, |
| "loss": 0.0163, |
| "num_input_tokens_seen": 3020640, |
| "step": 10625 |
| }, |
| { |
| "epoch": 19.832089552238806, |
| "grad_norm": 0.8861346244812012, |
| "learning_rate": 1.0974536224561682e-08, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 3022080, |
| "step": 10630 |
| }, |
| { |
| "epoch": 19.84141791044776, |
| "grad_norm": 0.061029836535453796, |
| "learning_rate": 9.801751555452154e-09, |
| "loss": 0.0073, |
| "num_input_tokens_seen": 3023520, |
| "step": 10635 |
| }, |
| { |
| "epoch": 19.850746268656717, |
| "grad_norm": 0.030993487685918808, |
| "learning_rate": 8.695209089759626e-09, |
| "loss": 0.0006, |
| "num_input_tokens_seen": 3024896, |
| "step": 10640 |
| }, |
| { |
| "epoch": 19.86007462686567, |
| "grad_norm": 0.9736220836639404, |
| "learning_rate": 7.654911760621163e-09, |
| "loss": 0.0123, |
| "num_input_tokens_seen": 3026176, |
| "step": 10645 |
| }, |
| { |
| "epoch": 19.869402985074625, |
| "grad_norm": 0.046350885182619095, |
| "learning_rate": 6.680862325583736e-09, |
| "loss": 0.0076, |
| "num_input_tokens_seen": 3027584, |
| "step": 10650 |
| }, |
| { |
| "epoch": 19.878731343283583, |
| "grad_norm": 0.045127496123313904, |
| "learning_rate": 5.7730633665903365e-09, |
| "loss": 0.004, |
| "num_input_tokens_seen": 3028960, |
| "step": 10655 |
| }, |
| { |
| "epoch": 19.888059701492537, |
| "grad_norm": 0.0691378191113472, |
| "learning_rate": 4.931517289963328e-09, |
| "loss": 0.004, |
| "num_input_tokens_seen": 3030560, |
| "step": 10660 |
| }, |
| { |
| "epoch": 19.89738805970149, |
| "grad_norm": 1.4448070526123047, |
| "learning_rate": 4.156226326415547e-09, |
| "loss": 0.0054, |
| "num_input_tokens_seen": 3031872, |
| "step": 10665 |
| }, |
| { |
| "epoch": 19.90671641791045, |
| "grad_norm": 0.0396936871111393, |
| "learning_rate": 3.4471925310280985e-09, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 3033248, |
| "step": 10670 |
| }, |
| { |
| "epoch": 19.916044776119403, |
| "grad_norm": 0.06104220077395439, |
| "learning_rate": 2.804417783261459e-09, |
| "loss": 0.0347, |
| "num_input_tokens_seen": 3034912, |
| "step": 10675 |
| }, |
| { |
| "epoch": 19.925373134328357, |
| "grad_norm": 0.02694948948919773, |
| "learning_rate": 2.2279037869304964e-09, |
| "loss": 0.0036, |
| "num_input_tokens_seen": 3036352, |
| "step": 10680 |
| }, |
| { |
| "epoch": 19.934701492537314, |
| "grad_norm": 0.07582668960094452, |
| "learning_rate": 1.7176520702238964e-09, |
| "loss": 0.0016, |
| "num_input_tokens_seen": 3037888, |
| "step": 10685 |
| }, |
| { |
| "epoch": 19.94402985074627, |
| "grad_norm": 0.03105338290333748, |
| "learning_rate": 1.2736639856736344e-09, |
| "loss": 0.0355, |
| "num_input_tokens_seen": 3039136, |
| "step": 10690 |
| }, |
| { |
| "epoch": 19.953358208955223, |
| "grad_norm": 0.07373655587434769, |
| "learning_rate": 8.959407101716277e-10, |
| "loss": 0.0214, |
| "num_input_tokens_seen": 3040416, |
| "step": 10695 |
| }, |
| { |
| "epoch": 19.96268656716418, |
| "grad_norm": 0.07853496819734573, |
| "learning_rate": 5.844832449641846e-10, |
| "loss": 0.0319, |
| "num_input_tokens_seen": 3041760, |
| "step": 10700 |
| }, |
| { |
| "epoch": 19.972014925373134, |
| "grad_norm": 0.045687563717365265, |
| "learning_rate": 3.3929241563535053e-10, |
| "loss": 0.0045, |
| "num_input_tokens_seen": 3043520, |
| "step": 10705 |
| }, |
| { |
| "epoch": 19.98134328358209, |
| "grad_norm": 0.7027135491371155, |
| "learning_rate": 1.6036887212078634e-10, |
| "loss": 0.0145, |
| "num_input_tokens_seen": 3044928, |
| "step": 10710 |
| }, |
| { |
| "epoch": 19.990671641791046, |
| "grad_norm": 0.7830105423927307, |
| "learning_rate": 4.771308869666591e-11, |
| "loss": 0.0043, |
| "num_input_tokens_seen": 3046464, |
| "step": 10715 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 3.6271634101867676, |
| "learning_rate": 1.3253639852273126e-12, |
| "loss": 0.0093, |
| "num_input_tokens_seen": 3047552, |
| "step": 10720 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 1.5435822010040283, |
| "eval_runtime": 2.9017, |
| "eval_samples_per_second": 82.021, |
| "eval_steps_per_second": 20.678, |
| "num_input_tokens_seen": 3047552, |
| "step": 10720 |
| }, |
| { |
| "epoch": 20.0, |
| "num_input_tokens_seen": 3047552, |
| "step": 10720, |
| "total_flos": 1.3749831871443763e+17, |
| "train_loss": 0.19485118995551384, |
| "train_runtime": 2150.065, |
| "train_samples_per_second": 19.916, |
| "train_steps_per_second": 4.986 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 10720, |
| "num_input_tokens_seen": 3047552, |
| "num_train_epochs": 20, |
| "save_steps": 536, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.3749831871443763e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|