| { |
| "best_global_step": 3216, |
| "best_metric": 0.6421719789505005, |
| "best_model_checkpoint": "saves_multiple/p-tuning/llama-3-8b-instruct/train_conala_42_1760637549/checkpoint-3216", |
| "epoch": 20.0, |
| "eval_steps": 536, |
| "global_step": 10720, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.009328358208955223, |
| "grad_norm": 50.2962646484375, |
| "learning_rate": 3.7313432835820897e-06, |
| "loss": 5.9132, |
| "num_input_tokens_seen": 1504, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.018656716417910446, |
| "grad_norm": 62.8128662109375, |
| "learning_rate": 8.395522388059701e-06, |
| "loss": 5.341, |
| "num_input_tokens_seen": 2976, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.027985074626865673, |
| "grad_norm": 33.016021728515625, |
| "learning_rate": 1.3059701492537313e-05, |
| "loss": 4.1119, |
| "num_input_tokens_seen": 4480, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.03731343283582089, |
| "grad_norm": 29.6210994720459, |
| "learning_rate": 1.7723880597014924e-05, |
| "loss": 2.4265, |
| "num_input_tokens_seen": 5984, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04664179104477612, |
| "grad_norm": 24.01149559020996, |
| "learning_rate": 2.2388059701492536e-05, |
| "loss": 1.7477, |
| "num_input_tokens_seen": 7392, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.055970149253731345, |
| "grad_norm": 10.972054481506348, |
| "learning_rate": 2.7052238805970147e-05, |
| "loss": 1.2305, |
| "num_input_tokens_seen": 8864, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06529850746268656, |
| "grad_norm": 10.251397132873535, |
| "learning_rate": 3.171641791044776e-05, |
| "loss": 1.0503, |
| "num_input_tokens_seen": 10272, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.07462686567164178, |
| "grad_norm": 10.270315170288086, |
| "learning_rate": 3.638059701492537e-05, |
| "loss": 1.1064, |
| "num_input_tokens_seen": 11584, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08395522388059702, |
| "grad_norm": 9.021138191223145, |
| "learning_rate": 4.104477611940299e-05, |
| "loss": 1.0329, |
| "num_input_tokens_seen": 13120, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.09328358208955224, |
| "grad_norm": 8.116998672485352, |
| "learning_rate": 4.5708955223880595e-05, |
| "loss": 0.8612, |
| "num_input_tokens_seen": 14624, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.10261194029850747, |
| "grad_norm": 7.769300937652588, |
| "learning_rate": 5.037313432835821e-05, |
| "loss": 0.8652, |
| "num_input_tokens_seen": 16224, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.11194029850746269, |
| "grad_norm": 3.922929048538208, |
| "learning_rate": 5.503731343283582e-05, |
| "loss": 0.5218, |
| "num_input_tokens_seen": 17568, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.12126865671641791, |
| "grad_norm": 2.2006962299346924, |
| "learning_rate": 5.9701492537313435e-05, |
| "loss": 1.0326, |
| "num_input_tokens_seen": 19072, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.13059701492537312, |
| "grad_norm": 7.339227676391602, |
| "learning_rate": 6.436567164179105e-05, |
| "loss": 0.8905, |
| "num_input_tokens_seen": 20448, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.13992537313432835, |
| "grad_norm": 6.341765403747559, |
| "learning_rate": 6.902985074626866e-05, |
| "loss": 0.8382, |
| "num_input_tokens_seen": 21920, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.14925373134328357, |
| "grad_norm": 2.7121095657348633, |
| "learning_rate": 7.369402985074628e-05, |
| "loss": 0.6957, |
| "num_input_tokens_seen": 23360, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.15858208955223882, |
| "grad_norm": 16.42344856262207, |
| "learning_rate": 7.835820895522389e-05, |
| "loss": 1.0716, |
| "num_input_tokens_seen": 24864, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.16791044776119404, |
| "grad_norm": 4.9739580154418945, |
| "learning_rate": 8.30223880597015e-05, |
| "loss": 0.5727, |
| "num_input_tokens_seen": 26592, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.17723880597014927, |
| "grad_norm": 6.865658283233643, |
| "learning_rate": 8.76865671641791e-05, |
| "loss": 0.7009, |
| "num_input_tokens_seen": 27936, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.1865671641791045, |
| "grad_norm": 3.1465818881988525, |
| "learning_rate": 9.235074626865672e-05, |
| "loss": 0.9568, |
| "num_input_tokens_seen": 29440, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1958955223880597, |
| "grad_norm": 3.4791672229766846, |
| "learning_rate": 9.701492537313434e-05, |
| "loss": 1.0771, |
| "num_input_tokens_seen": 31040, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.20522388059701493, |
| "grad_norm": 6.314712047576904, |
| "learning_rate": 0.00010167910447761195, |
| "loss": 0.9265, |
| "num_input_tokens_seen": 32512, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.21455223880597016, |
| "grad_norm": 3.4345123767852783, |
| "learning_rate": 0.00010634328358208955, |
| "loss": 0.8327, |
| "num_input_tokens_seen": 33952, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.22388059701492538, |
| "grad_norm": 1.887539267539978, |
| "learning_rate": 0.00011100746268656716, |
| "loss": 0.5377, |
| "num_input_tokens_seen": 35520, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2332089552238806, |
| "grad_norm": 3.8296196460723877, |
| "learning_rate": 0.00011567164179104479, |
| "loss": 1.0696, |
| "num_input_tokens_seen": 37024, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.24253731343283583, |
| "grad_norm": 17.493654251098633, |
| "learning_rate": 0.0001203358208955224, |
| "loss": 0.8046, |
| "num_input_tokens_seen": 38752, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.251865671641791, |
| "grad_norm": 3.7208962440490723, |
| "learning_rate": 0.000125, |
| "loss": 1.2193, |
| "num_input_tokens_seen": 40064, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.26119402985074625, |
| "grad_norm": 3.16337513923645, |
| "learning_rate": 0.00012966417910447762, |
| "loss": 0.5599, |
| "num_input_tokens_seen": 41600, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.27052238805970147, |
| "grad_norm": 2.210693120956421, |
| "learning_rate": 0.00013432835820895522, |
| "loss": 0.8478, |
| "num_input_tokens_seen": 42880, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.2798507462686567, |
| "grad_norm": 3.8227005004882812, |
| "learning_rate": 0.00013899253731343284, |
| "loss": 0.8006, |
| "num_input_tokens_seen": 44320, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2891791044776119, |
| "grad_norm": 3.232940196990967, |
| "learning_rate": 0.00014365671641791044, |
| "loss": 0.8651, |
| "num_input_tokens_seen": 45664, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.29850746268656714, |
| "grad_norm": 3.337759017944336, |
| "learning_rate": 0.00014832089552238806, |
| "loss": 1.3061, |
| "num_input_tokens_seen": 47008, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.30783582089552236, |
| "grad_norm": 2.434713363647461, |
| "learning_rate": 0.00015298507462686568, |
| "loss": 0.6944, |
| "num_input_tokens_seen": 48416, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.31716417910447764, |
| "grad_norm": 4.333398342132568, |
| "learning_rate": 0.00015764925373134328, |
| "loss": 0.8114, |
| "num_input_tokens_seen": 49984, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.32649253731343286, |
| "grad_norm": 2.168666124343872, |
| "learning_rate": 0.0001623134328358209, |
| "loss": 0.7556, |
| "num_input_tokens_seen": 51264, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.3358208955223881, |
| "grad_norm": 1.2355871200561523, |
| "learning_rate": 0.00016697761194029852, |
| "loss": 0.8644, |
| "num_input_tokens_seen": 52704, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3451492537313433, |
| "grad_norm": 1.1702848672866821, |
| "learning_rate": 0.00017164179104477612, |
| "loss": 0.6404, |
| "num_input_tokens_seen": 54112, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.35447761194029853, |
| "grad_norm": 1.418177604675293, |
| "learning_rate": 0.00017630597014925374, |
| "loss": 0.8023, |
| "num_input_tokens_seen": 55456, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.36380597014925375, |
| "grad_norm": 0.9577491283416748, |
| "learning_rate": 0.00018097014925373133, |
| "loss": 0.6875, |
| "num_input_tokens_seen": 56800, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.373134328358209, |
| "grad_norm": 1.9909303188323975, |
| "learning_rate": 0.00018563432835820896, |
| "loss": 0.7119, |
| "num_input_tokens_seen": 58240, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3824626865671642, |
| "grad_norm": 5.038618087768555, |
| "learning_rate": 0.00019029850746268658, |
| "loss": 0.6525, |
| "num_input_tokens_seen": 59552, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.3917910447761194, |
| "grad_norm": 1.5434777736663818, |
| "learning_rate": 0.00019496268656716417, |
| "loss": 0.6378, |
| "num_input_tokens_seen": 60832, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.40111940298507465, |
| "grad_norm": 1.161812424659729, |
| "learning_rate": 0.0001996268656716418, |
| "loss": 0.95, |
| "num_input_tokens_seen": 62208, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.41044776119402987, |
| "grad_norm": 0.9651714563369751, |
| "learning_rate": 0.0002042910447761194, |
| "loss": 0.7952, |
| "num_input_tokens_seen": 63680, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.4197761194029851, |
| "grad_norm": 1.8524196147918701, |
| "learning_rate": 0.000208955223880597, |
| "loss": 0.6494, |
| "num_input_tokens_seen": 65120, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.4291044776119403, |
| "grad_norm": 3.1311075687408447, |
| "learning_rate": 0.00021361940298507463, |
| "loss": 0.9792, |
| "num_input_tokens_seen": 66624, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.43843283582089554, |
| "grad_norm": 1.3214956521987915, |
| "learning_rate": 0.00021828358208955223, |
| "loss": 0.8154, |
| "num_input_tokens_seen": 68192, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.44776119402985076, |
| "grad_norm": 1.1524676084518433, |
| "learning_rate": 0.00022294776119402985, |
| "loss": 1.0075, |
| "num_input_tokens_seen": 69376, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.457089552238806, |
| "grad_norm": 0.8888850212097168, |
| "learning_rate": 0.00022761194029850745, |
| "loss": 0.7885, |
| "num_input_tokens_seen": 70656, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.4664179104477612, |
| "grad_norm": 0.7135015726089478, |
| "learning_rate": 0.00023227611940298507, |
| "loss": 0.7681, |
| "num_input_tokens_seen": 72032, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.47574626865671643, |
| "grad_norm": 0.7615672945976257, |
| "learning_rate": 0.0002369402985074627, |
| "loss": 0.601, |
| "num_input_tokens_seen": 73600, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.48507462686567165, |
| "grad_norm": 0.9831456542015076, |
| "learning_rate": 0.00024160447761194029, |
| "loss": 0.6599, |
| "num_input_tokens_seen": 74944, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4944029850746269, |
| "grad_norm": 0.826670229434967, |
| "learning_rate": 0.0002462686567164179, |
| "loss": 0.7404, |
| "num_input_tokens_seen": 76288, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.503731343283582, |
| "grad_norm": 1.3727489709854126, |
| "learning_rate": 0.00025093283582089556, |
| "loss": 0.7172, |
| "num_input_tokens_seen": 77536, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5130597014925373, |
| "grad_norm": 0.6129507422447205, |
| "learning_rate": 0.00025559701492537315, |
| "loss": 0.6915, |
| "num_input_tokens_seen": 78944, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.5223880597014925, |
| "grad_norm": 0.9328954219818115, |
| "learning_rate": 0.00026026119402985075, |
| "loss": 0.855, |
| "num_input_tokens_seen": 80224, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.5317164179104478, |
| "grad_norm": 1.9826902151107788, |
| "learning_rate": 0.00026492537313432834, |
| "loss": 0.8697, |
| "num_input_tokens_seen": 81568, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.5410447761194029, |
| "grad_norm": 1.385163426399231, |
| "learning_rate": 0.000269589552238806, |
| "loss": 0.7755, |
| "num_input_tokens_seen": 82912, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5503731343283582, |
| "grad_norm": 1.0601534843444824, |
| "learning_rate": 0.0002742537313432836, |
| "loss": 0.6186, |
| "num_input_tokens_seen": 84448, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.5597014925373134, |
| "grad_norm": 0.6051679849624634, |
| "learning_rate": 0.00027891791044776124, |
| "loss": 0.7983, |
| "num_input_tokens_seen": 85792, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5690298507462687, |
| "grad_norm": 0.9398823380470276, |
| "learning_rate": 0.0002835820895522388, |
| "loss": 0.8253, |
| "num_input_tokens_seen": 87200, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.5783582089552238, |
| "grad_norm": 0.9723435044288635, |
| "learning_rate": 0.0002882462686567164, |
| "loss": 0.7626, |
| "num_input_tokens_seen": 88608, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5876865671641791, |
| "grad_norm": 1.2982566356658936, |
| "learning_rate": 0.000292910447761194, |
| "loss": 0.9217, |
| "num_input_tokens_seen": 90176, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.5970149253731343, |
| "grad_norm": 0.6757448315620422, |
| "learning_rate": 0.00029757462686567167, |
| "loss": 0.8694, |
| "num_input_tokens_seen": 91424, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6063432835820896, |
| "grad_norm": 0.8649131059646606, |
| "learning_rate": 0.00030223880597014926, |
| "loss": 0.8126, |
| "num_input_tokens_seen": 92832, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.6156716417910447, |
| "grad_norm": 1.049835205078125, |
| "learning_rate": 0.00030690298507462686, |
| "loss": 0.9724, |
| "num_input_tokens_seen": 94240, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 0.9085893630981445, |
| "learning_rate": 0.00031156716417910445, |
| "loss": 0.9384, |
| "num_input_tokens_seen": 95584, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.6343283582089553, |
| "grad_norm": 0.6684736013412476, |
| "learning_rate": 0.0003162313432835821, |
| "loss": 0.5239, |
| "num_input_tokens_seen": 97056, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.6436567164179104, |
| "grad_norm": 0.5452622175216675, |
| "learning_rate": 0.0003208955223880597, |
| "loss": 0.7606, |
| "num_input_tokens_seen": 98464, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.6529850746268657, |
| "grad_norm": 0.5732467770576477, |
| "learning_rate": 0.00032555970149253735, |
| "loss": 0.6014, |
| "num_input_tokens_seen": 99968, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.6623134328358209, |
| "grad_norm": 1.0637229681015015, |
| "learning_rate": 0.0003302238805970149, |
| "loss": 0.858, |
| "num_input_tokens_seen": 101536, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.6716417910447762, |
| "grad_norm": 0.574134349822998, |
| "learning_rate": 0.00033488805970149254, |
| "loss": 0.5917, |
| "num_input_tokens_seen": 102976, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6809701492537313, |
| "grad_norm": 0.9447924494743347, |
| "learning_rate": 0.00033955223880597013, |
| "loss": 0.6635, |
| "num_input_tokens_seen": 104512, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.6902985074626866, |
| "grad_norm": 0.7179959416389465, |
| "learning_rate": 0.0003442164179104478, |
| "loss": 0.6203, |
| "num_input_tokens_seen": 105888, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.6996268656716418, |
| "grad_norm": 1.140496015548706, |
| "learning_rate": 0.0003488805970149254, |
| "loss": 0.7066, |
| "num_input_tokens_seen": 107232, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.7089552238805971, |
| "grad_norm": 0.5980051159858704, |
| "learning_rate": 0.000353544776119403, |
| "loss": 0.8239, |
| "num_input_tokens_seen": 108704, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.7182835820895522, |
| "grad_norm": 0.7040757536888123, |
| "learning_rate": 0.00035820895522388057, |
| "loss": 0.6021, |
| "num_input_tokens_seen": 110016, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.7276119402985075, |
| "grad_norm": 0.6738182902336121, |
| "learning_rate": 0.0003628731343283582, |
| "loss": 0.5962, |
| "num_input_tokens_seen": 111520, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.7369402985074627, |
| "grad_norm": 0.8782476186752319, |
| "learning_rate": 0.0003675373134328358, |
| "loss": 0.8944, |
| "num_input_tokens_seen": 112800, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.746268656716418, |
| "grad_norm": 0.72857666015625, |
| "learning_rate": 0.00037220149253731346, |
| "loss": 0.8248, |
| "num_input_tokens_seen": 114304, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7555970149253731, |
| "grad_norm": 0.6835526823997498, |
| "learning_rate": 0.00037686567164179106, |
| "loss": 0.6764, |
| "num_input_tokens_seen": 115648, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.7649253731343284, |
| "grad_norm": 0.6340111494064331, |
| "learning_rate": 0.00038152985074626865, |
| "loss": 0.4727, |
| "num_input_tokens_seen": 117088, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.7742537313432836, |
| "grad_norm": 0.5367112159729004, |
| "learning_rate": 0.00038619402985074625, |
| "loss": 0.5831, |
| "num_input_tokens_seen": 118560, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.7835820895522388, |
| "grad_norm": 0.8845934271812439, |
| "learning_rate": 0.0003908582089552239, |
| "loss": 0.6996, |
| "num_input_tokens_seen": 119968, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.792910447761194, |
| "grad_norm": 0.5628993511199951, |
| "learning_rate": 0.0003955223880597015, |
| "loss": 0.525, |
| "num_input_tokens_seen": 121184, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.8022388059701493, |
| "grad_norm": 0.44992223381996155, |
| "learning_rate": 0.00040018656716417914, |
| "loss": 0.6806, |
| "num_input_tokens_seen": 122592, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.8115671641791045, |
| "grad_norm": 0.3055427670478821, |
| "learning_rate": 0.0004048507462686567, |
| "loss": 0.667, |
| "num_input_tokens_seen": 123936, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.8208955223880597, |
| "grad_norm": 0.6931546926498413, |
| "learning_rate": 0.00040951492537313433, |
| "loss": 0.8071, |
| "num_input_tokens_seen": 125344, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.8302238805970149, |
| "grad_norm": 0.5060365796089172, |
| "learning_rate": 0.0004141791044776119, |
| "loss": 0.632, |
| "num_input_tokens_seen": 126848, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.8395522388059702, |
| "grad_norm": 0.4852692484855652, |
| "learning_rate": 0.0004188432835820896, |
| "loss": 0.7955, |
| "num_input_tokens_seen": 128160, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.8488805970149254, |
| "grad_norm": 0.4757196009159088, |
| "learning_rate": 0.00042350746268656717, |
| "loss": 0.669, |
| "num_input_tokens_seen": 129728, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.8582089552238806, |
| "grad_norm": 0.4002057611942291, |
| "learning_rate": 0.00042817164179104476, |
| "loss": 0.5291, |
| "num_input_tokens_seen": 131232, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.8675373134328358, |
| "grad_norm": 1.0518105030059814, |
| "learning_rate": 0.00043283582089552236, |
| "loss": 0.9762, |
| "num_input_tokens_seen": 132576, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.8768656716417911, |
| "grad_norm": 0.5036869645118713, |
| "learning_rate": 0.0004375, |
| "loss": 0.6403, |
| "num_input_tokens_seen": 134080, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.8861940298507462, |
| "grad_norm": 0.6456478238105774, |
| "learning_rate": 0.00044216417910447766, |
| "loss": 0.5029, |
| "num_input_tokens_seen": 135584, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.8955223880597015, |
| "grad_norm": 0.6417862772941589, |
| "learning_rate": 0.00044682835820895525, |
| "loss": 0.6198, |
| "num_input_tokens_seen": 137248, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.9048507462686567, |
| "grad_norm": 0.6095438003540039, |
| "learning_rate": 0.00045149253731343285, |
| "loss": 0.7217, |
| "num_input_tokens_seen": 138656, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.914179104477612, |
| "grad_norm": 0.5077698826789856, |
| "learning_rate": 0.00045615671641791044, |
| "loss": 0.6754, |
| "num_input_tokens_seen": 140224, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.9235074626865671, |
| "grad_norm": 0.41700974106788635, |
| "learning_rate": 0.0004608208955223881, |
| "loss": 0.6219, |
| "num_input_tokens_seen": 141792, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.9328358208955224, |
| "grad_norm": 0.33780136704444885, |
| "learning_rate": 0.0004654850746268657, |
| "loss": 0.5651, |
| "num_input_tokens_seen": 143200, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.9421641791044776, |
| "grad_norm": 0.9682436585426331, |
| "learning_rate": 0.00047014925373134334, |
| "loss": 0.6506, |
| "num_input_tokens_seen": 144608, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.9514925373134329, |
| "grad_norm": 0.5964924693107605, |
| "learning_rate": 0.0004748134328358209, |
| "loss": 0.5424, |
| "num_input_tokens_seen": 146048, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.960820895522388, |
| "grad_norm": 0.6313313245773315, |
| "learning_rate": 0.00047947761194029853, |
| "loss": 0.8197, |
| "num_input_tokens_seen": 147520, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.9701492537313433, |
| "grad_norm": 0.45782962441444397, |
| "learning_rate": 0.0004841417910447761, |
| "loss": 0.739, |
| "num_input_tokens_seen": 149184, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.9794776119402985, |
| "grad_norm": 1.2939541339874268, |
| "learning_rate": 0.0004888059701492537, |
| "loss": 0.645, |
| "num_input_tokens_seen": 150592, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.9888059701492538, |
| "grad_norm": 0.4261542558670044, |
| "learning_rate": 0.0004934701492537313, |
| "loss": 0.8326, |
| "num_input_tokens_seen": 151936, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.9981343283582089, |
| "grad_norm": 0.6485226154327393, |
| "learning_rate": 0.000498134328358209, |
| "loss": 0.8366, |
| "num_input_tokens_seen": 153280, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.6933413147926331, |
| "eval_runtime": 4.1732, |
| "eval_samples_per_second": 57.031, |
| "eval_steps_per_second": 14.377, |
| "num_input_tokens_seen": 153352, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.007462686567164, |
| "grad_norm": 0.599692165851593, |
| "learning_rate": 0.0005027985074626866, |
| "loss": 0.7255, |
| "num_input_tokens_seen": 154408, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.0167910447761195, |
| "grad_norm": 0.867232620716095, |
| "learning_rate": 0.0005074626865671642, |
| "loss": 0.6433, |
| "num_input_tokens_seen": 155976, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.0261194029850746, |
| "grad_norm": 0.3278610110282898, |
| "learning_rate": 0.0005121268656716418, |
| "loss": 0.4283, |
| "num_input_tokens_seen": 157320, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.0354477611940298, |
| "grad_norm": 0.5500964522361755, |
| "learning_rate": 0.0005167910447761194, |
| "loss": 0.682, |
| "num_input_tokens_seen": 158920, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.044776119402985, |
| "grad_norm": 0.5103652477264404, |
| "learning_rate": 0.0005214552238805971, |
| "loss": 0.5198, |
| "num_input_tokens_seen": 160264, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.0541044776119404, |
| "grad_norm": 0.47425708174705505, |
| "learning_rate": 0.0005261194029850747, |
| "loss": 0.7302, |
| "num_input_tokens_seen": 161768, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.0634328358208955, |
| "grad_norm": 0.5512856245040894, |
| "learning_rate": 0.0005307835820895523, |
| "loss": 0.7863, |
| "num_input_tokens_seen": 163336, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.0727611940298507, |
| "grad_norm": 1.0708633661270142, |
| "learning_rate": 0.0005354477611940298, |
| "loss": 0.7078, |
| "num_input_tokens_seen": 164808, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.0820895522388059, |
| "grad_norm": 0.3448982536792755, |
| "learning_rate": 0.0005401119402985075, |
| "loss": 0.8767, |
| "num_input_tokens_seen": 166440, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.0914179104477613, |
| "grad_norm": 0.5863126516342163, |
| "learning_rate": 0.0005447761194029851, |
| "loss": 0.5437, |
| "num_input_tokens_seen": 167784, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.1007462686567164, |
| "grad_norm": 0.4399169981479645, |
| "learning_rate": 0.0005494402985074627, |
| "loss": 0.5878, |
| "num_input_tokens_seen": 169256, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.1100746268656716, |
| "grad_norm": 0.5453768372535706, |
| "learning_rate": 0.0005541044776119403, |
| "loss": 0.6676, |
| "num_input_tokens_seen": 170824, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.1194029850746268, |
| "grad_norm": 0.506891667842865, |
| "learning_rate": 0.000558768656716418, |
| "loss": 0.7238, |
| "num_input_tokens_seen": 172232, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.1287313432835822, |
| "grad_norm": 1.1439929008483887, |
| "learning_rate": 0.0005634328358208956, |
| "loss": 0.8913, |
| "num_input_tokens_seen": 173576, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.1380597014925373, |
| "grad_norm": 0.836676299571991, |
| "learning_rate": 0.0005680970149253732, |
| "loss": 0.6187, |
| "num_input_tokens_seen": 174824, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.1473880597014925, |
| "grad_norm": 1.3345333337783813, |
| "learning_rate": 0.0005727611940298508, |
| "loss": 0.6908, |
| "num_input_tokens_seen": 176360, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.1567164179104479, |
| "grad_norm": 0.5238152742385864, |
| "learning_rate": 0.0005774253731343285, |
| "loss": 0.7062, |
| "num_input_tokens_seen": 177864, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.166044776119403, |
| "grad_norm": 0.42747148871421814, |
| "learning_rate": 0.0005820895522388059, |
| "loss": 0.6453, |
| "num_input_tokens_seen": 179304, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.1753731343283582, |
| "grad_norm": 0.5538851618766785, |
| "learning_rate": 0.0005867537313432835, |
| "loss": 0.476, |
| "num_input_tokens_seen": 181000, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.1847014925373134, |
| "grad_norm": 0.34586551785469055, |
| "learning_rate": 0.0005914179104477611, |
| "loss": 0.526, |
| "num_input_tokens_seen": 182376, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.1940298507462686, |
| "grad_norm": 0.6016301512718201, |
| "learning_rate": 0.0005960820895522388, |
| "loss": 0.7885, |
| "num_input_tokens_seen": 183784, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.203358208955224, |
| "grad_norm": 0.4861624240875244, |
| "learning_rate": 0.0006007462686567164, |
| "loss": 0.6053, |
| "num_input_tokens_seen": 185192, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.212686567164179, |
| "grad_norm": 0.7675857543945312, |
| "learning_rate": 0.000605410447761194, |
| "loss": 0.4889, |
| "num_input_tokens_seen": 186504, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.2220149253731343, |
| "grad_norm": 0.33078664541244507, |
| "learning_rate": 0.0006100746268656716, |
| "loss": 0.7418, |
| "num_input_tokens_seen": 187912, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.2313432835820897, |
| "grad_norm": 0.4360729455947876, |
| "learning_rate": 0.0006147388059701493, |
| "loss": 0.7477, |
| "num_input_tokens_seen": 189256, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.2406716417910448, |
| "grad_norm": 0.5418694615364075, |
| "learning_rate": 0.0006194029850746269, |
| "loss": 0.6469, |
| "num_input_tokens_seen": 190568, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 0.4630480408668518, |
| "learning_rate": 0.0006240671641791045, |
| "loss": 0.737, |
| "num_input_tokens_seen": 191880, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.2593283582089552, |
| "grad_norm": 0.5126736164093018, |
| "learning_rate": 0.0006287313432835821, |
| "loss": 0.6417, |
| "num_input_tokens_seen": 193128, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.2686567164179103, |
| "grad_norm": 0.47021928429603577, |
| "learning_rate": 0.0006333955223880597, |
| "loss": 0.4621, |
| "num_input_tokens_seen": 194376, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.2779850746268657, |
| "grad_norm": 0.4174472987651825, |
| "learning_rate": 0.0006380597014925373, |
| "loss": 0.4478, |
| "num_input_tokens_seen": 195880, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.287313432835821, |
| "grad_norm": 0.4990299344062805, |
| "learning_rate": 0.0006427238805970149, |
| "loss": 0.4875, |
| "num_input_tokens_seen": 197128, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.296641791044776, |
| "grad_norm": 0.5962094664573669, |
| "learning_rate": 0.0006473880597014925, |
| "loss": 0.8478, |
| "num_input_tokens_seen": 198504, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.3059701492537314, |
| "grad_norm": 0.7166761755943298, |
| "learning_rate": 0.0006520522388059702, |
| "loss": 0.9623, |
| "num_input_tokens_seen": 199848, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.3152985074626866, |
| "grad_norm": 0.4672403633594513, |
| "learning_rate": 0.0006567164179104478, |
| "loss": 0.6889, |
| "num_input_tokens_seen": 201160, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.3246268656716418, |
| "grad_norm": 0.2829459607601166, |
| "learning_rate": 0.0006613805970149254, |
| "loss": 0.7543, |
| "num_input_tokens_seen": 202504, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.333955223880597, |
| "grad_norm": 0.6832309365272522, |
| "learning_rate": 0.000666044776119403, |
| "loss": 0.6392, |
| "num_input_tokens_seen": 203784, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.3432835820895521, |
| "grad_norm": 0.31956401467323303, |
| "learning_rate": 0.0006707089552238807, |
| "loss": 0.7686, |
| "num_input_tokens_seen": 205256, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.3526119402985075, |
| "grad_norm": 0.5048463940620422, |
| "learning_rate": 0.0006753731343283583, |
| "loss": 0.6293, |
| "num_input_tokens_seen": 206856, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.3619402985074627, |
| "grad_norm": 0.45297038555145264, |
| "learning_rate": 0.0006800373134328358, |
| "loss": 0.4723, |
| "num_input_tokens_seen": 208232, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.3712686567164178, |
| "grad_norm": 0.40795764327049255, |
| "learning_rate": 0.0006847014925373134, |
| "loss": 0.7259, |
| "num_input_tokens_seen": 209576, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.3805970149253732, |
| "grad_norm": 0.4087425470352173, |
| "learning_rate": 0.0006893656716417911, |
| "loss": 0.8533, |
| "num_input_tokens_seen": 210920, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.3899253731343284, |
| "grad_norm": 0.3699188828468323, |
| "learning_rate": 0.0006940298507462687, |
| "loss": 0.4452, |
| "num_input_tokens_seen": 212424, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.3992537313432836, |
| "grad_norm": 0.5148809552192688, |
| "learning_rate": 0.0006986940298507463, |
| "loss": 0.6277, |
| "num_input_tokens_seen": 213736, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.4085820895522387, |
| "grad_norm": 0.314248651266098, |
| "learning_rate": 0.0007033582089552238, |
| "loss": 0.8832, |
| "num_input_tokens_seen": 215048, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.417910447761194, |
| "grad_norm": 0.2668759226799011, |
| "learning_rate": 0.0007080223880597016, |
| "loss": 0.6837, |
| "num_input_tokens_seen": 216456, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.4272388059701493, |
| "grad_norm": 0.5210652947425842, |
| "learning_rate": 0.0007126865671641791, |
| "loss": 0.5989, |
| "num_input_tokens_seen": 217896, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.4365671641791045, |
| "grad_norm": 0.3388369083404541, |
| "learning_rate": 0.0007173507462686567, |
| "loss": 0.8363, |
| "num_input_tokens_seen": 219240, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.4458955223880596, |
| "grad_norm": 0.45436587929725647, |
| "learning_rate": 0.0007220149253731343, |
| "loss": 0.5282, |
| "num_input_tokens_seen": 220904, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.455223880597015, |
| "grad_norm": 0.6213339567184448, |
| "learning_rate": 0.0007266791044776119, |
| "loss": 0.8923, |
| "num_input_tokens_seen": 222184, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.4645522388059702, |
| "grad_norm": 0.4394315183162689, |
| "learning_rate": 0.0007313432835820895, |
| "loss": 0.7169, |
| "num_input_tokens_seen": 223656, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.4738805970149254, |
| "grad_norm": 0.3317413032054901, |
| "learning_rate": 0.0007360074626865671, |
| "loss": 0.4891, |
| "num_input_tokens_seen": 225064, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.4832089552238805, |
| "grad_norm": 0.3865710496902466, |
| "learning_rate": 0.0007406716417910447, |
| "loss": 0.7401, |
| "num_input_tokens_seen": 226536, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.4925373134328357, |
| "grad_norm": 0.28780487179756165, |
| "learning_rate": 0.0007453358208955224, |
| "loss": 0.5946, |
| "num_input_tokens_seen": 227816, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.501865671641791, |
| "grad_norm": 0.3245728015899658, |
| "learning_rate": 0.00075, |
| "loss": 0.751, |
| "num_input_tokens_seen": 229288, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.5111940298507462, |
| "grad_norm": 0.20647266507148743, |
| "learning_rate": 0.0007546641791044776, |
| "loss": 0.4726, |
| "num_input_tokens_seen": 230728, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.5205223880597014, |
| "grad_norm": 0.41778045892715454, |
| "learning_rate": 0.0007593283582089553, |
| "loss": 0.4172, |
| "num_input_tokens_seen": 232104, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.5298507462686568, |
| "grad_norm": 0.5311048626899719, |
| "learning_rate": 0.0007639925373134329, |
| "loss": 0.6319, |
| "num_input_tokens_seen": 233512, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.539179104477612, |
| "grad_norm": 0.5205721855163574, |
| "learning_rate": 0.0007686567164179105, |
| "loss": 0.5626, |
| "num_input_tokens_seen": 234792, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.5485074626865671, |
| "grad_norm": 0.43273499608039856, |
| "learning_rate": 0.0007733208955223881, |
| "loss": 0.6892, |
| "num_input_tokens_seen": 236168, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.5578358208955225, |
| "grad_norm": 0.3638947606086731, |
| "learning_rate": 0.0007779850746268657, |
| "loss": 0.7235, |
| "num_input_tokens_seen": 237704, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.5671641791044775, |
| "grad_norm": 0.26659175753593445, |
| "learning_rate": 0.0007826492537313433, |
| "loss": 0.6181, |
| "num_input_tokens_seen": 239272, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.5764925373134329, |
| "grad_norm": 0.399054616689682, |
| "learning_rate": 0.0007873134328358209, |
| "loss": 0.6014, |
| "num_input_tokens_seen": 240552, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.585820895522388, |
| "grad_norm": 0.8174265623092651, |
| "learning_rate": 0.0007919776119402985, |
| "loss": 0.6507, |
| "num_input_tokens_seen": 242024, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.5951492537313432, |
| "grad_norm": 0.43701648712158203, |
| "learning_rate": 0.0007966417910447762, |
| "loss": 0.6974, |
| "num_input_tokens_seen": 243400, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.6044776119402986, |
| "grad_norm": 0.5361718535423279, |
| "learning_rate": 0.0008013059701492538, |
| "loss": 1.2414, |
| "num_input_tokens_seen": 244744, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.6138059701492538, |
| "grad_norm": 0.36682432889938354, |
| "learning_rate": 0.0008059701492537314, |
| "loss": 0.6403, |
| "num_input_tokens_seen": 246312, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.623134328358209, |
| "grad_norm": 0.48592299222946167, |
| "learning_rate": 0.000810634328358209, |
| "loss": 0.6154, |
| "num_input_tokens_seen": 247944, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.6324626865671643, |
| "grad_norm": 0.1437925547361374, |
| "learning_rate": 0.0008152985074626867, |
| "loss": 0.6444, |
| "num_input_tokens_seen": 249544, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.6417910447761193, |
| "grad_norm": 0.3561725914478302, |
| "learning_rate": 0.0008199626865671643, |
| "loss": 0.637, |
| "num_input_tokens_seen": 250920, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.6511194029850746, |
| "grad_norm": 0.36449792981147766, |
| "learning_rate": 0.0008246268656716418, |
| "loss": 0.6722, |
| "num_input_tokens_seen": 252392, |
| "step": 885 |
| }, |
| { |
| "epoch": 1.6604477611940298, |
| "grad_norm": 0.2657022178173065, |
| "learning_rate": 0.0008292910447761193, |
| "loss": 0.6982, |
| "num_input_tokens_seen": 254056, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.669776119402985, |
| "grad_norm": 0.3086923658847809, |
| "learning_rate": 0.000833955223880597, |
| "loss": 0.6361, |
| "num_input_tokens_seen": 255528, |
| "step": 895 |
| }, |
| { |
| "epoch": 1.6791044776119404, |
| "grad_norm": 0.4550116956233978, |
| "learning_rate": 0.0008386194029850746, |
| "loss": 0.7733, |
| "num_input_tokens_seen": 257096, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.6884328358208955, |
| "grad_norm": 0.3251428008079529, |
| "learning_rate": 0.0008432835820895522, |
| "loss": 0.4253, |
| "num_input_tokens_seen": 258536, |
| "step": 905 |
| }, |
| { |
| "epoch": 1.6977611940298507, |
| "grad_norm": 0.18625348806381226, |
| "learning_rate": 0.0008479477611940298, |
| "loss": 0.5241, |
| "num_input_tokens_seen": 260136, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.707089552238806, |
| "grad_norm": 0.4147641062736511, |
| "learning_rate": 0.0008526119402985075, |
| "loss": 0.4974, |
| "num_input_tokens_seen": 261384, |
| "step": 915 |
| }, |
| { |
| "epoch": 1.716417910447761, |
| "grad_norm": 0.4366629123687744, |
| "learning_rate": 0.0008572761194029851, |
| "loss": 0.5932, |
| "num_input_tokens_seen": 262952, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.7257462686567164, |
| "grad_norm": 0.47915467619895935, |
| "learning_rate": 0.0008619402985074627, |
| "loss": 0.6001, |
| "num_input_tokens_seen": 264328, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.7350746268656716, |
| "grad_norm": 0.2741464376449585, |
| "learning_rate": 0.0008666044776119403, |
| "loss": 0.8693, |
| "num_input_tokens_seen": 265672, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.7444029850746268, |
| "grad_norm": 0.2753334939479828, |
| "learning_rate": 0.0008712686567164179, |
| "loss": 0.6336, |
| "num_input_tokens_seen": 267080, |
| "step": 935 |
| }, |
| { |
| "epoch": 1.7537313432835822, |
| "grad_norm": 0.4339545667171478, |
| "learning_rate": 0.0008759328358208955, |
| "loss": 0.5896, |
| "num_input_tokens_seen": 268648, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.7630597014925373, |
| "grad_norm": 0.46501514315605164, |
| "learning_rate": 0.0008805970149253731, |
| "loss": 0.5796, |
| "num_input_tokens_seen": 269960, |
| "step": 945 |
| }, |
| { |
| "epoch": 1.7723880597014925, |
| "grad_norm": 0.31456464529037476, |
| "learning_rate": 0.0008852611940298507, |
| "loss": 0.7311, |
| "num_input_tokens_seen": 271272, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.7817164179104479, |
| "grad_norm": 0.19265945255756378, |
| "learning_rate": 0.0008899253731343284, |
| "loss": 0.6366, |
| "num_input_tokens_seen": 272488, |
| "step": 955 |
| }, |
| { |
| "epoch": 1.7910447761194028, |
| "grad_norm": 0.3629007935523987, |
| "learning_rate": 0.000894589552238806, |
| "loss": 0.9754, |
| "num_input_tokens_seen": 274024, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.8003731343283582, |
| "grad_norm": 0.3736506402492523, |
| "learning_rate": 0.0008992537313432836, |
| "loss": 0.5409, |
| "num_input_tokens_seen": 275336, |
| "step": 965 |
| }, |
| { |
| "epoch": 1.8097014925373134, |
| "grad_norm": 0.262616902589798, |
| "learning_rate": 0.0009039179104477612, |
| "loss": 0.5686, |
| "num_input_tokens_seen": 276584, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.8190298507462686, |
| "grad_norm": 0.41028597950935364, |
| "learning_rate": 0.0009085820895522389, |
| "loss": 0.8302, |
| "num_input_tokens_seen": 277960, |
| "step": 975 |
| }, |
| { |
| "epoch": 1.828358208955224, |
| "grad_norm": 0.3309934139251709, |
| "learning_rate": 0.0009132462686567165, |
| "loss": 0.4458, |
| "num_input_tokens_seen": 279400, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.837686567164179, |
| "grad_norm": 0.25409263372421265, |
| "learning_rate": 0.0009179104477611941, |
| "loss": 0.4202, |
| "num_input_tokens_seen": 281064, |
| "step": 985 |
| }, |
| { |
| "epoch": 1.8470149253731343, |
| "grad_norm": 0.48292016983032227, |
| "learning_rate": 0.0009225746268656716, |
| "loss": 0.7663, |
| "num_input_tokens_seen": 282312, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.8563432835820897, |
| "grad_norm": 0.3051922023296356, |
| "learning_rate": 0.0009272388059701493, |
| "loss": 0.4506, |
| "num_input_tokens_seen": 283688, |
| "step": 995 |
| }, |
| { |
| "epoch": 1.8656716417910446, |
| "grad_norm": 0.20524144172668457, |
| "learning_rate": 0.0009319029850746269, |
| "loss": 0.7163, |
| "num_input_tokens_seen": 285192, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.875, |
| "grad_norm": 0.2460353970527649, |
| "learning_rate": 0.0009365671641791045, |
| "loss": 0.5831, |
| "num_input_tokens_seen": 286664, |
| "step": 1005 |
| }, |
| { |
| "epoch": 1.8843283582089554, |
| "grad_norm": 0.4078156650066376, |
| "learning_rate": 0.0009412313432835821, |
| "loss": 0.5654, |
| "num_input_tokens_seen": 287976, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.8936567164179103, |
| "grad_norm": 0.3027799427509308, |
| "learning_rate": 0.0009458955223880598, |
| "loss": 0.6781, |
| "num_input_tokens_seen": 289256, |
| "step": 1015 |
| }, |
| { |
| "epoch": 1.9029850746268657, |
| "grad_norm": 0.4628511965274811, |
| "learning_rate": 0.0009505597014925374, |
| "loss": 0.7969, |
| "num_input_tokens_seen": 290792, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.912313432835821, |
| "grad_norm": 0.29008105397224426, |
| "learning_rate": 0.000955223880597015, |
| "loss": 0.7692, |
| "num_input_tokens_seen": 292104, |
| "step": 1025 |
| }, |
| { |
| "epoch": 1.921641791044776, |
| "grad_norm": 0.2522449791431427, |
| "learning_rate": 0.0009598880597014926, |
| "loss": 0.6877, |
| "num_input_tokens_seen": 293672, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.9309701492537314, |
| "grad_norm": 0.28948765993118286, |
| "learning_rate": 0.0009645522388059703, |
| "loss": 0.6873, |
| "num_input_tokens_seen": 295048, |
| "step": 1035 |
| }, |
| { |
| "epoch": 1.9402985074626866, |
| "grad_norm": 0.6912420392036438, |
| "learning_rate": 0.0009692164179104477, |
| "loss": 0.6167, |
| "num_input_tokens_seen": 296584, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.9496268656716418, |
| "grad_norm": 0.17778247594833374, |
| "learning_rate": 0.0009738805970149253, |
| "loss": 0.5713, |
| "num_input_tokens_seen": 297928, |
| "step": 1045 |
| }, |
| { |
| "epoch": 1.9589552238805972, |
| "grad_norm": 1.7690346240997314, |
| "learning_rate": 0.000978544776119403, |
| "loss": 0.9329, |
| "num_input_tokens_seen": 299336, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.9682835820895521, |
| "grad_norm": 0.19372034072875977, |
| "learning_rate": 0.0009832089552238806, |
| "loss": 0.4327, |
| "num_input_tokens_seen": 300776, |
| "step": 1055 |
| }, |
| { |
| "epoch": 1.9776119402985075, |
| "grad_norm": 0.5918903350830078, |
| "learning_rate": 0.0009878731343283583, |
| "loss": 0.8278, |
| "num_input_tokens_seen": 302216, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.9869402985074627, |
| "grad_norm": 0.3756718635559082, |
| "learning_rate": 0.0009925373134328358, |
| "loss": 0.6511, |
| "num_input_tokens_seen": 303560, |
| "step": 1065 |
| }, |
| { |
| "epoch": 1.9962686567164178, |
| "grad_norm": 0.23910151422023773, |
| "learning_rate": 0.0009972014925373133, |
| "loss": 0.6292, |
| "num_input_tokens_seen": 305128, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.6899347901344299, |
| "eval_runtime": 4.1904, |
| "eval_samples_per_second": 56.797, |
| "eval_steps_per_second": 14.318, |
| "num_input_tokens_seen": 305496, |
| "step": 1072 |
| }, |
| { |
| "epoch": 2.0055970149253732, |
| "grad_norm": 0.2828496992588043, |
| "learning_rate": 0.0009999998939708842, |
| "loss": 0.8802, |
| "num_input_tokens_seen": 306456, |
| "step": 1075 |
| }, |
| { |
| "epoch": 2.014925373134328, |
| "grad_norm": 0.32462286949157715, |
| "learning_rate": 0.0009999987011438459, |
| "loss": 0.5237, |
| "num_input_tokens_seen": 308216, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.0242537313432836, |
| "grad_norm": 0.27709144353866577, |
| "learning_rate": 0.0009999961829565468, |
| "loss": 0.5552, |
| "num_input_tokens_seen": 309688, |
| "step": 1085 |
| }, |
| { |
| "epoch": 2.033582089552239, |
| "grad_norm": 0.1804264932870865, |
| "learning_rate": 0.0009999923394156621, |
| "loss": 0.7655, |
| "num_input_tokens_seen": 311096, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.042910447761194, |
| "grad_norm": 0.40875178575515747, |
| "learning_rate": 0.0009999871705313795, |
| "loss": 0.6567, |
| "num_input_tokens_seen": 312728, |
| "step": 1095 |
| }, |
| { |
| "epoch": 2.0522388059701493, |
| "grad_norm": 0.4328378140926361, |
| "learning_rate": 0.0009999806763174009, |
| "loss": 0.6443, |
| "num_input_tokens_seen": 314200, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.0615671641791047, |
| "grad_norm": 0.28709733486175537, |
| "learning_rate": 0.0009999728567909403, |
| "loss": 0.5532, |
| "num_input_tokens_seen": 315608, |
| "step": 1105 |
| }, |
| { |
| "epoch": 2.0708955223880596, |
| "grad_norm": 0.3292154371738434, |
| "learning_rate": 0.0009999637119727251, |
| "loss": 0.5018, |
| "num_input_tokens_seen": 317080, |
| "step": 1110 |
| }, |
| { |
| "epoch": 2.080223880597015, |
| "grad_norm": 0.4961127042770386, |
| "learning_rate": 0.000999953241886996, |
| "loss": 0.5222, |
| "num_input_tokens_seen": 318648, |
| "step": 1115 |
| }, |
| { |
| "epoch": 2.08955223880597, |
| "grad_norm": 0.30666232109069824, |
| "learning_rate": 0.0009999414465615062, |
| "loss": 0.6002, |
| "num_input_tokens_seen": 320248, |
| "step": 1120 |
| }, |
| { |
| "epoch": 2.0988805970149254, |
| "grad_norm": 0.37278977036476135, |
| "learning_rate": 0.0009999283260275218, |
| "loss": 0.5347, |
| "num_input_tokens_seen": 321656, |
| "step": 1125 |
| }, |
| { |
| "epoch": 2.1082089552238807, |
| "grad_norm": 0.206303671002388, |
| "learning_rate": 0.000999913880319822, |
| "loss": 0.6089, |
| "num_input_tokens_seen": 323256, |
| "step": 1130 |
| }, |
| { |
| "epoch": 2.1175373134328357, |
| "grad_norm": 0.23221929371356964, |
| "learning_rate": 0.000999898109476698, |
| "loss": 0.6503, |
| "num_input_tokens_seen": 324632, |
| "step": 1135 |
| }, |
| { |
| "epoch": 2.126865671641791, |
| "grad_norm": 0.36873659491539, |
| "learning_rate": 0.0009998810135399545, |
| "loss": 0.806, |
| "num_input_tokens_seen": 325816, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.1361940298507465, |
| "grad_norm": 0.21523557603359222, |
| "learning_rate": 0.000999862592554908, |
| "loss": 0.4014, |
| "num_input_tokens_seen": 327288, |
| "step": 1145 |
| }, |
| { |
| "epoch": 2.1455223880597014, |
| "grad_norm": 0.3459737002849579, |
| "learning_rate": 0.0009998428465703873, |
| "loss": 0.6605, |
| "num_input_tokens_seen": 328632, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.154850746268657, |
| "grad_norm": 0.22625266015529633, |
| "learning_rate": 0.000999821775638734, |
| "loss": 0.7785, |
| "num_input_tokens_seen": 329976, |
| "step": 1155 |
| }, |
| { |
| "epoch": 2.1641791044776117, |
| "grad_norm": 0.38777706027030945, |
| "learning_rate": 0.000999799379815801, |
| "loss": 0.647, |
| "num_input_tokens_seen": 331192, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.173507462686567, |
| "grad_norm": 0.7987584471702576, |
| "learning_rate": 0.0009997756591609537, |
| "loss": 0.6472, |
| "num_input_tokens_seen": 332504, |
| "step": 1165 |
| }, |
| { |
| "epoch": 2.1828358208955225, |
| "grad_norm": 0.2811796963214874, |
| "learning_rate": 0.0009997506137370692, |
| "loss": 0.5644, |
| "num_input_tokens_seen": 333816, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.1921641791044775, |
| "grad_norm": 0.350900262594223, |
| "learning_rate": 0.0009997242436105358, |
| "loss": 0.5828, |
| "num_input_tokens_seen": 335224, |
| "step": 1175 |
| }, |
| { |
| "epoch": 2.201492537313433, |
| "grad_norm": 0.24604664742946625, |
| "learning_rate": 0.000999696548851254, |
| "loss": 0.5264, |
| "num_input_tokens_seen": 336504, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.2108208955223883, |
| "grad_norm": 0.13010980188846588, |
| "learning_rate": 0.0009996675295326344, |
| "loss": 0.5302, |
| "num_input_tokens_seen": 337784, |
| "step": 1185 |
| }, |
| { |
| "epoch": 2.220149253731343, |
| "grad_norm": 0.30489736795425415, |
| "learning_rate": 0.0009996371857316, |
| "loss": 0.7664, |
| "num_input_tokens_seen": 339096, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.2294776119402986, |
| "grad_norm": 0.3614640533924103, |
| "learning_rate": 0.0009996055175285833, |
| "loss": 0.4912, |
| "num_input_tokens_seen": 340376, |
| "step": 1195 |
| }, |
| { |
| "epoch": 2.2388059701492535, |
| "grad_norm": 0.20211651921272278, |
| "learning_rate": 0.0009995725250075288, |
| "loss": 0.6022, |
| "num_input_tokens_seen": 341880, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.248134328358209, |
| "grad_norm": 0.1927761435508728, |
| "learning_rate": 0.0009995382082558899, |
| "loss": 0.5555, |
| "num_input_tokens_seen": 343416, |
| "step": 1205 |
| }, |
| { |
| "epoch": 2.2574626865671643, |
| "grad_norm": 0.33579009771347046, |
| "learning_rate": 0.0009995025673646314, |
| "loss": 0.8326, |
| "num_input_tokens_seen": 344920, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.2667910447761193, |
| "grad_norm": 0.13861723244190216, |
| "learning_rate": 0.0009994656024282277, |
| "loss": 0.5842, |
| "num_input_tokens_seen": 346424, |
| "step": 1215 |
| }, |
| { |
| "epoch": 2.2761194029850746, |
| "grad_norm": 0.2502031624317169, |
| "learning_rate": 0.0009994273135446622, |
| "loss": 0.621, |
| "num_input_tokens_seen": 347960, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.28544776119403, |
| "grad_norm": 0.17798258364200592, |
| "learning_rate": 0.000999387700815429, |
| "loss": 0.486, |
| "num_input_tokens_seen": 349400, |
| "step": 1225 |
| }, |
| { |
| "epoch": 2.294776119402985, |
| "grad_norm": 0.19423194229602814, |
| "learning_rate": 0.0009993467643455301, |
| "loss": 0.6475, |
| "num_input_tokens_seen": 350744, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.3041044776119404, |
| "grad_norm": 0.10956660658121109, |
| "learning_rate": 0.0009993045042434772, |
| "loss": 0.8843, |
| "num_input_tokens_seen": 352152, |
| "step": 1235 |
| }, |
| { |
| "epoch": 2.3134328358208958, |
| "grad_norm": 0.21084345877170563, |
| "learning_rate": 0.0009992609206212902, |
| "loss": 0.6028, |
| "num_input_tokens_seen": 353560, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.3227611940298507, |
| "grad_norm": 0.23705990612506866, |
| "learning_rate": 0.0009992160135944975, |
| "loss": 0.6813, |
| "num_input_tokens_seen": 354808, |
| "step": 1245 |
| }, |
| { |
| "epoch": 2.332089552238806, |
| "grad_norm": 0.26078465580940247, |
| "learning_rate": 0.0009991697832821354, |
| "loss": 0.7028, |
| "num_input_tokens_seen": 356120, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.341417910447761, |
| "grad_norm": 0.4466816186904907, |
| "learning_rate": 0.0009991222298067477, |
| "loss": 0.6778, |
| "num_input_tokens_seen": 357400, |
| "step": 1255 |
| }, |
| { |
| "epoch": 2.3507462686567164, |
| "grad_norm": 0.2676059603691101, |
| "learning_rate": 0.0009990733532943858, |
| "loss": 0.4555, |
| "num_input_tokens_seen": 358584, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.360074626865672, |
| "grad_norm": 0.28492385149002075, |
| "learning_rate": 0.0009990231538746079, |
| "loss": 0.8428, |
| "num_input_tokens_seen": 360088, |
| "step": 1265 |
| }, |
| { |
| "epoch": 2.3694029850746268, |
| "grad_norm": 0.324629545211792, |
| "learning_rate": 0.0009989716316804794, |
| "loss": 0.4129, |
| "num_input_tokens_seen": 361496, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.378731343283582, |
| "grad_norm": 0.13705921173095703, |
| "learning_rate": 0.000998918786848571, |
| "loss": 0.459, |
| "num_input_tokens_seen": 362968, |
| "step": 1275 |
| }, |
| { |
| "epoch": 2.388059701492537, |
| "grad_norm": 0.2732636034488678, |
| "learning_rate": 0.0009988646195189601, |
| "loss": 0.7371, |
| "num_input_tokens_seen": 364312, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.3973880597014925, |
| "grad_norm": 0.21165116131305695, |
| "learning_rate": 0.00099880912983523, |
| "loss": 0.4329, |
| "num_input_tokens_seen": 365752, |
| "step": 1285 |
| }, |
| { |
| "epoch": 2.406716417910448, |
| "grad_norm": 0.2917303144931793, |
| "learning_rate": 0.0009987523179444682, |
| "loss": 0.5815, |
| "num_input_tokens_seen": 367576, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.416044776119403, |
| "grad_norm": 0.22703352570533752, |
| "learning_rate": 0.0009986941839972676, |
| "loss": 0.5994, |
| "num_input_tokens_seen": 369016, |
| "step": 1295 |
| }, |
| { |
| "epoch": 2.425373134328358, |
| "grad_norm": 0.2888859212398529, |
| "learning_rate": 0.0009986347281477257, |
| "loss": 0.5273, |
| "num_input_tokens_seen": 370584, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.4347014925373136, |
| "grad_norm": 0.25431835651397705, |
| "learning_rate": 0.0009985739505534437, |
| "loss": 0.6897, |
| "num_input_tokens_seen": 371992, |
| "step": 1305 |
| }, |
| { |
| "epoch": 2.4440298507462686, |
| "grad_norm": 0.28907614946365356, |
| "learning_rate": 0.000998511851375526, |
| "loss": 0.4778, |
| "num_input_tokens_seen": 373304, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.453358208955224, |
| "grad_norm": 0.2507428228855133, |
| "learning_rate": 0.000998448430778581, |
| "loss": 0.8246, |
| "num_input_tokens_seen": 374776, |
| "step": 1315 |
| }, |
| { |
| "epoch": 2.4626865671641793, |
| "grad_norm": 0.24830341339111328, |
| "learning_rate": 0.0009983836889307196, |
| "loss": 0.6537, |
| "num_input_tokens_seen": 376248, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.4720149253731343, |
| "grad_norm": 0.2691262364387512, |
| "learning_rate": 0.0009983176260035544, |
| "loss": 0.5673, |
| "num_input_tokens_seen": 377560, |
| "step": 1325 |
| }, |
| { |
| "epoch": 2.4813432835820897, |
| "grad_norm": 0.44198179244995117, |
| "learning_rate": 0.0009982502421722005, |
| "loss": 0.7322, |
| "num_input_tokens_seen": 378904, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.4906716417910446, |
| "grad_norm": 0.3275417983531952, |
| "learning_rate": 0.0009981815376152736, |
| "loss": 0.6478, |
| "num_input_tokens_seen": 380312, |
| "step": 1335 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 0.2878626585006714, |
| "learning_rate": 0.000998111512514891, |
| "loss": 0.5703, |
| "num_input_tokens_seen": 381784, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.5093283582089554, |
| "grad_norm": 0.15107418596744537, |
| "learning_rate": 0.0009980401670566705, |
| "loss": 0.3825, |
| "num_input_tokens_seen": 383256, |
| "step": 1345 |
| }, |
| { |
| "epoch": 2.5186567164179103, |
| "grad_norm": 0.23873454332351685, |
| "learning_rate": 0.0009979675014297293, |
| "loss": 0.7657, |
| "num_input_tokens_seen": 384536, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.5279850746268657, |
| "grad_norm": 0.22512106597423553, |
| "learning_rate": 0.0009978935158266838, |
| "loss": 0.5868, |
| "num_input_tokens_seen": 385880, |
| "step": 1355 |
| }, |
| { |
| "epoch": 2.5373134328358207, |
| "grad_norm": 0.1494188755750656, |
| "learning_rate": 0.00099781821044365, |
| "loss": 0.5919, |
| "num_input_tokens_seen": 387576, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.546641791044776, |
| "grad_norm": 0.29194745421409607, |
| "learning_rate": 0.0009977415854802419, |
| "loss": 0.652, |
| "num_input_tokens_seen": 389112, |
| "step": 1365 |
| }, |
| { |
| "epoch": 2.5559701492537314, |
| "grad_norm": 0.25149044394493103, |
| "learning_rate": 0.000997663641139571, |
| "loss": 0.5783, |
| "num_input_tokens_seen": 390456, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.5652985074626864, |
| "grad_norm": 0.2535479962825775, |
| "learning_rate": 0.0009975843776282472, |
| "loss": 0.6898, |
| "num_input_tokens_seen": 392024, |
| "step": 1375 |
| }, |
| { |
| "epoch": 2.574626865671642, |
| "grad_norm": 0.3264197111129761, |
| "learning_rate": 0.0009975037951563761, |
| "loss": 0.6649, |
| "num_input_tokens_seen": 393176, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.583955223880597, |
| "grad_norm": 0.15236614644527435, |
| "learning_rate": 0.00099742189393756, |
| "loss": 0.4409, |
| "num_input_tokens_seen": 394680, |
| "step": 1385 |
| }, |
| { |
| "epoch": 2.593283582089552, |
| "grad_norm": 0.2783982455730438, |
| "learning_rate": 0.0009973386741888963, |
| "loss": 0.6724, |
| "num_input_tokens_seen": 396024, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.6026119402985075, |
| "grad_norm": 0.3226190507411957, |
| "learning_rate": 0.0009972541361309782, |
| "loss": 0.6453, |
| "num_input_tokens_seen": 397560, |
| "step": 1395 |
| }, |
| { |
| "epoch": 2.611940298507463, |
| "grad_norm": 0.3189884424209595, |
| "learning_rate": 0.000997168279987893, |
| "loss": 0.6401, |
| "num_input_tokens_seen": 398776, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.621268656716418, |
| "grad_norm": 0.1870376318693161, |
| "learning_rate": 0.000997081105987222, |
| "loss": 0.5131, |
| "num_input_tokens_seen": 400376, |
| "step": 1405 |
| }, |
| { |
| "epoch": 2.6305970149253732, |
| "grad_norm": 0.2934994697570801, |
| "learning_rate": 0.0009969926143600396, |
| "loss": 0.773, |
| "num_input_tokens_seen": 401624, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.6399253731343286, |
| "grad_norm": 0.27938956022262573, |
| "learning_rate": 0.0009969028053409131, |
| "loss": 0.7983, |
| "num_input_tokens_seen": 402968, |
| "step": 1415 |
| }, |
| { |
| "epoch": 2.6492537313432836, |
| "grad_norm": 0.3684418797492981, |
| "learning_rate": 0.0009968116791679014, |
| "loss": 0.7426, |
| "num_input_tokens_seen": 404440, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.658582089552239, |
| "grad_norm": 0.21254830062389374, |
| "learning_rate": 0.0009967192360825557, |
| "loss": 0.5771, |
| "num_input_tokens_seen": 405752, |
| "step": 1425 |
| }, |
| { |
| "epoch": 2.667910447761194, |
| "grad_norm": 0.3110397160053253, |
| "learning_rate": 0.000996625476329917, |
| "loss": 0.6822, |
| "num_input_tokens_seen": 407064, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.6772388059701493, |
| "grad_norm": 0.38691264390945435, |
| "learning_rate": 0.000996530400158517, |
| "loss": 0.5575, |
| "num_input_tokens_seen": 408536, |
| "step": 1435 |
| }, |
| { |
| "epoch": 2.6865671641791042, |
| "grad_norm": 0.28269466757774353, |
| "learning_rate": 0.0009964340078203765, |
| "loss": 0.817, |
| "num_input_tokens_seen": 409912, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.6958955223880596, |
| "grad_norm": 0.28892797231674194, |
| "learning_rate": 0.0009963362995710056, |
| "loss": 0.7048, |
| "num_input_tokens_seen": 411192, |
| "step": 1445 |
| }, |
| { |
| "epoch": 2.705223880597015, |
| "grad_norm": 0.4032808840274811, |
| "learning_rate": 0.0009962372756694023, |
| "loss": 0.5718, |
| "num_input_tokens_seen": 412632, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.71455223880597, |
| "grad_norm": 0.28502070903778076, |
| "learning_rate": 0.0009961369363780514, |
| "loss": 0.6646, |
| "num_input_tokens_seen": 413944, |
| "step": 1455 |
| }, |
| { |
| "epoch": 2.7238805970149254, |
| "grad_norm": 0.3142834007740021, |
| "learning_rate": 0.0009960352819629258, |
| "loss": 0.7401, |
| "num_input_tokens_seen": 415352, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.7332089552238807, |
| "grad_norm": 0.24872903525829315, |
| "learning_rate": 0.000995932312693483, |
| "loss": 0.6132, |
| "num_input_tokens_seen": 416536, |
| "step": 1465 |
| }, |
| { |
| "epoch": 2.7425373134328357, |
| "grad_norm": 0.2512160539627075, |
| "learning_rate": 0.0009958280288426668, |
| "loss": 0.6953, |
| "num_input_tokens_seen": 418040, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.751865671641791, |
| "grad_norm": 0.2220417857170105, |
| "learning_rate": 0.0009957224306869053, |
| "loss": 0.5511, |
| "num_input_tokens_seen": 419448, |
| "step": 1475 |
| }, |
| { |
| "epoch": 2.7611940298507465, |
| "grad_norm": 0.22836509346961975, |
| "learning_rate": 0.00099561551850611, |
| "loss": 0.5046, |
| "num_input_tokens_seen": 420856, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.7705223880597014, |
| "grad_norm": 0.2554475963115692, |
| "learning_rate": 0.0009955072925836765, |
| "loss": 0.7276, |
| "num_input_tokens_seen": 422264, |
| "step": 1485 |
| }, |
| { |
| "epoch": 2.779850746268657, |
| "grad_norm": 0.3000124990940094, |
| "learning_rate": 0.0009953977532064819, |
| "loss": 0.7986, |
| "num_input_tokens_seen": 423800, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.789179104477612, |
| "grad_norm": 0.34656092524528503, |
| "learning_rate": 0.0009952869006648853, |
| "loss": 0.5285, |
| "num_input_tokens_seen": 425496, |
| "step": 1495 |
| }, |
| { |
| "epoch": 2.798507462686567, |
| "grad_norm": 0.23471760749816895, |
| "learning_rate": 0.0009951747352527265, |
| "loss": 0.5433, |
| "num_input_tokens_seen": 426872, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.8078358208955225, |
| "grad_norm": 0.2542097270488739, |
| "learning_rate": 0.0009950612572673255, |
| "loss": 0.8049, |
| "num_input_tokens_seen": 428248, |
| "step": 1505 |
| }, |
| { |
| "epoch": 2.8171641791044775, |
| "grad_norm": 0.2707230746746063, |
| "learning_rate": 0.0009949464670094815, |
| "loss": 0.6151, |
| "num_input_tokens_seen": 429624, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.826492537313433, |
| "grad_norm": 0.6066590547561646, |
| "learning_rate": 0.0009948303647834722, |
| "loss": 0.6492, |
| "num_input_tokens_seen": 431000, |
| "step": 1515 |
| }, |
| { |
| "epoch": 2.835820895522388, |
| "grad_norm": 0.24002781510353088, |
| "learning_rate": 0.000994712950897053, |
| "loss": 0.5351, |
| "num_input_tokens_seen": 432760, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.845149253731343, |
| "grad_norm": 0.2488313466310501, |
| "learning_rate": 0.000994594225661456, |
| "loss": 0.7667, |
| "num_input_tokens_seen": 434232, |
| "step": 1525 |
| }, |
| { |
| "epoch": 2.8544776119402986, |
| "grad_norm": 0.22874920070171356, |
| "learning_rate": 0.0009944741893913895, |
| "loss": 0.8033, |
| "num_input_tokens_seen": 435608, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.8638059701492535, |
| "grad_norm": 0.1811210960149765, |
| "learning_rate": 0.0009943528424050368, |
| "loss": 0.5726, |
| "num_input_tokens_seen": 436984, |
| "step": 1535 |
| }, |
| { |
| "epoch": 2.873134328358209, |
| "grad_norm": 0.1760341376066208, |
| "learning_rate": 0.000994230185024056, |
| "loss": 0.6519, |
| "num_input_tokens_seen": 438424, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.8824626865671643, |
| "grad_norm": 0.12924505770206451, |
| "learning_rate": 0.000994106217573578, |
| "loss": 0.4132, |
| "num_input_tokens_seen": 439928, |
| "step": 1545 |
| }, |
| { |
| "epoch": 2.8917910447761193, |
| "grad_norm": 0.3055063784122467, |
| "learning_rate": 0.0009939809403822068, |
| "loss": 0.5335, |
| "num_input_tokens_seen": 441496, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.9011194029850746, |
| "grad_norm": 0.2536128759384155, |
| "learning_rate": 0.0009938543537820184, |
| "loss": 0.5995, |
| "num_input_tokens_seen": 442872, |
| "step": 1555 |
| }, |
| { |
| "epoch": 2.91044776119403, |
| "grad_norm": 0.20395897328853607, |
| "learning_rate": 0.0009937264581085592, |
| "loss": 0.5315, |
| "num_input_tokens_seen": 444312, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.919776119402985, |
| "grad_norm": 0.20518085360527039, |
| "learning_rate": 0.0009935972537008456, |
| "loss": 0.5239, |
| "num_input_tokens_seen": 445592, |
| "step": 1565 |
| }, |
| { |
| "epoch": 2.9291044776119404, |
| "grad_norm": 0.21364372968673706, |
| "learning_rate": 0.0009934667409013634, |
| "loss": 0.8367, |
| "num_input_tokens_seen": 446968, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.9384328358208958, |
| "grad_norm": 0.24295471608638763, |
| "learning_rate": 0.0009933349200560665, |
| "loss": 0.4123, |
| "num_input_tokens_seen": 448408, |
| "step": 1575 |
| }, |
| { |
| "epoch": 2.9477611940298507, |
| "grad_norm": 0.13642080128192902, |
| "learning_rate": 0.0009932017915143757, |
| "loss": 0.6884, |
| "num_input_tokens_seen": 449816, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.957089552238806, |
| "grad_norm": 0.28196004033088684, |
| "learning_rate": 0.000993067355629179, |
| "loss": 0.4606, |
| "num_input_tokens_seen": 451128, |
| "step": 1585 |
| }, |
| { |
| "epoch": 2.966417910447761, |
| "grad_norm": 0.3152237832546234, |
| "learning_rate": 0.0009929316127568288, |
| "loss": 0.6885, |
| "num_input_tokens_seen": 452728, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.9757462686567164, |
| "grad_norm": 0.1925637274980545, |
| "learning_rate": 0.000992794563257143, |
| "loss": 0.5397, |
| "num_input_tokens_seen": 454328, |
| "step": 1595 |
| }, |
| { |
| "epoch": 2.9850746268656714, |
| "grad_norm": 0.3836488723754883, |
| "learning_rate": 0.0009926562074934018, |
| "loss": 0.4762, |
| "num_input_tokens_seen": 455800, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.9944029850746268, |
| "grad_norm": 0.17350491881370544, |
| "learning_rate": 0.000992516545832349, |
| "loss": 0.4186, |
| "num_input_tokens_seen": 457560, |
| "step": 1605 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.6456666588783264, |
| "eval_runtime": 4.1922, |
| "eval_samples_per_second": 56.772, |
| "eval_steps_per_second": 14.312, |
| "num_input_tokens_seen": 458160, |
| "step": 1608 |
| }, |
| { |
| "epoch": 3.003731343283582, |
| "grad_norm": 0.23106440901756287, |
| "learning_rate": 0.0009923755786441896, |
| "loss": 0.6499, |
| "num_input_tokens_seen": 458736, |
| "step": 1610 |
| }, |
| { |
| "epoch": 3.013059701492537, |
| "grad_norm": 0.155120387673378, |
| "learning_rate": 0.0009922333063025893, |
| "loss": 0.6578, |
| "num_input_tokens_seen": 460080, |
| "step": 1615 |
| }, |
| { |
| "epoch": 3.0223880597014925, |
| "grad_norm": 0.23484502732753754, |
| "learning_rate": 0.0009920897291846732, |
| "loss": 0.5039, |
| "num_input_tokens_seen": 461488, |
| "step": 1620 |
| }, |
| { |
| "epoch": 3.031716417910448, |
| "grad_norm": 0.19638964533805847, |
| "learning_rate": 0.0009919448476710248, |
| "loss": 0.5332, |
| "num_input_tokens_seen": 462992, |
| "step": 1625 |
| }, |
| { |
| "epoch": 3.041044776119403, |
| "grad_norm": 0.1650891751050949, |
| "learning_rate": 0.0009917986621456856, |
| "loss": 0.6471, |
| "num_input_tokens_seen": 464304, |
| "step": 1630 |
| }, |
| { |
| "epoch": 3.050373134328358, |
| "grad_norm": 0.1567000150680542, |
| "learning_rate": 0.000991651172996154, |
| "loss": 0.318, |
| "num_input_tokens_seen": 465968, |
| "step": 1635 |
| }, |
| { |
| "epoch": 3.0597014925373136, |
| "grad_norm": 0.23875364661216736, |
| "learning_rate": 0.0009915023806133833, |
| "loss": 0.7304, |
| "num_input_tokens_seen": 467344, |
| "step": 1640 |
| }, |
| { |
| "epoch": 3.0690298507462686, |
| "grad_norm": 0.32178816199302673, |
| "learning_rate": 0.0009913522853917812, |
| "loss": 0.4617, |
| "num_input_tokens_seen": 468816, |
| "step": 1645 |
| }, |
| { |
| "epoch": 3.078358208955224, |
| "grad_norm": 0.24563711881637573, |
| "learning_rate": 0.0009912008877292096, |
| "loss": 0.6088, |
| "num_input_tokens_seen": 470416, |
| "step": 1650 |
| }, |
| { |
| "epoch": 3.0876865671641793, |
| "grad_norm": 0.26710382103919983, |
| "learning_rate": 0.0009910481880269825, |
| "loss": 0.4255, |
| "num_input_tokens_seen": 471760, |
| "step": 1655 |
| }, |
| { |
| "epoch": 3.0970149253731343, |
| "grad_norm": 0.25559887290000916, |
| "learning_rate": 0.0009908941866898647, |
| "loss": 0.4412, |
| "num_input_tokens_seen": 473104, |
| "step": 1660 |
| }, |
| { |
| "epoch": 3.1063432835820897, |
| "grad_norm": 0.2960032820701599, |
| "learning_rate": 0.0009907388841260722, |
| "loss": 0.6419, |
| "num_input_tokens_seen": 474320, |
| "step": 1665 |
| }, |
| { |
| "epoch": 3.1156716417910446, |
| "grad_norm": 0.25373607873916626, |
| "learning_rate": 0.0009905822807472699, |
| "loss": 0.5873, |
| "num_input_tokens_seen": 475632, |
| "step": 1670 |
| }, |
| { |
| "epoch": 3.125, |
| "grad_norm": 0.2537466287612915, |
| "learning_rate": 0.0009904243769685702, |
| "loss": 0.5751, |
| "num_input_tokens_seen": 477168, |
| "step": 1675 |
| }, |
| { |
| "epoch": 3.1343283582089554, |
| "grad_norm": 0.25388938188552856, |
| "learning_rate": 0.0009902651732085332, |
| "loss": 0.8109, |
| "num_input_tokens_seen": 478544, |
| "step": 1680 |
| }, |
| { |
| "epoch": 3.1436567164179103, |
| "grad_norm": 0.13722610473632812, |
| "learning_rate": 0.0009901046698891649, |
| "loss": 0.4441, |
| "num_input_tokens_seen": 479952, |
| "step": 1685 |
| }, |
| { |
| "epoch": 3.1529850746268657, |
| "grad_norm": 0.23463381826877594, |
| "learning_rate": 0.0009899428674359154, |
| "loss": 0.4207, |
| "num_input_tokens_seen": 481392, |
| "step": 1690 |
| }, |
| { |
| "epoch": 3.1623134328358207, |
| "grad_norm": 0.19632478058338165, |
| "learning_rate": 0.000989779766277679, |
| "loss": 0.4384, |
| "num_input_tokens_seen": 482992, |
| "step": 1695 |
| }, |
| { |
| "epoch": 3.171641791044776, |
| "grad_norm": 0.2727997899055481, |
| "learning_rate": 0.0009896153668467926, |
| "loss": 0.5026, |
| "num_input_tokens_seen": 484304, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.1809701492537314, |
| "grad_norm": 1.4269156455993652, |
| "learning_rate": 0.0009894496695790345, |
| "loss": 0.5492, |
| "num_input_tokens_seen": 485808, |
| "step": 1705 |
| }, |
| { |
| "epoch": 3.1902985074626864, |
| "grad_norm": 0.25974732637405396, |
| "learning_rate": 0.0009892826749136224, |
| "loss": 0.6403, |
| "num_input_tokens_seen": 487344, |
| "step": 1710 |
| }, |
| { |
| "epoch": 3.199626865671642, |
| "grad_norm": 0.2388739287853241, |
| "learning_rate": 0.000989114383293214, |
| "loss": 0.6338, |
| "num_input_tokens_seen": 488848, |
| "step": 1715 |
| }, |
| { |
| "epoch": 3.208955223880597, |
| "grad_norm": 0.21609841287136078, |
| "learning_rate": 0.0009889447951639044, |
| "loss": 0.6226, |
| "num_input_tokens_seen": 490288, |
| "step": 1720 |
| }, |
| { |
| "epoch": 3.218283582089552, |
| "grad_norm": 0.38346755504608154, |
| "learning_rate": 0.0009887739109752255, |
| "loss": 0.7313, |
| "num_input_tokens_seen": 491728, |
| "step": 1725 |
| }, |
| { |
| "epoch": 3.2276119402985075, |
| "grad_norm": 0.20293354988098145, |
| "learning_rate": 0.0009886017311801448, |
| "loss": 0.3952, |
| "num_input_tokens_seen": 493200, |
| "step": 1730 |
| }, |
| { |
| "epoch": 3.236940298507463, |
| "grad_norm": 0.26398634910583496, |
| "learning_rate": 0.000988428256235064, |
| "loss": 0.7915, |
| "num_input_tokens_seen": 494480, |
| "step": 1735 |
| }, |
| { |
| "epoch": 3.246268656716418, |
| "grad_norm": 0.505710244178772, |
| "learning_rate": 0.0009882534865998176, |
| "loss": 0.4743, |
| "num_input_tokens_seen": 495696, |
| "step": 1740 |
| }, |
| { |
| "epoch": 3.2555970149253732, |
| "grad_norm": 0.22293971478939056, |
| "learning_rate": 0.0009880774227376727, |
| "loss": 0.5284, |
| "num_input_tokens_seen": 497136, |
| "step": 1745 |
| }, |
| { |
| "epoch": 3.264925373134328, |
| "grad_norm": 0.2277284413576126, |
| "learning_rate": 0.0009879000651153262, |
| "loss": 0.3545, |
| "num_input_tokens_seen": 498672, |
| "step": 1750 |
| }, |
| { |
| "epoch": 3.2742537313432836, |
| "grad_norm": 0.24438872933387756, |
| "learning_rate": 0.0009877214142029053, |
| "loss": 0.595, |
| "num_input_tokens_seen": 500016, |
| "step": 1755 |
| }, |
| { |
| "epoch": 3.283582089552239, |
| "grad_norm": 0.2326008379459381, |
| "learning_rate": 0.0009875414704739645, |
| "loss": 0.5767, |
| "num_input_tokens_seen": 501360, |
| "step": 1760 |
| }, |
| { |
| "epoch": 3.292910447761194, |
| "grad_norm": 0.22996485233306885, |
| "learning_rate": 0.0009873602344054855, |
| "loss": 0.6522, |
| "num_input_tokens_seen": 502768, |
| "step": 1765 |
| }, |
| { |
| "epoch": 3.3022388059701493, |
| "grad_norm": 0.24486055970191956, |
| "learning_rate": 0.0009871777064778759, |
| "loss": 0.5828, |
| "num_input_tokens_seen": 504176, |
| "step": 1770 |
| }, |
| { |
| "epoch": 3.3115671641791042, |
| "grad_norm": 0.24386072158813477, |
| "learning_rate": 0.0009869938871749674, |
| "loss": 0.4539, |
| "num_input_tokens_seen": 505424, |
| "step": 1775 |
| }, |
| { |
| "epoch": 3.3208955223880596, |
| "grad_norm": 0.1616196483373642, |
| "learning_rate": 0.0009868087769840151, |
| "loss": 0.4915, |
| "num_input_tokens_seen": 507024, |
| "step": 1780 |
| }, |
| { |
| "epoch": 3.330223880597015, |
| "grad_norm": 0.18312668800354004, |
| "learning_rate": 0.0009866223763956954, |
| "loss": 0.8632, |
| "num_input_tokens_seen": 508432, |
| "step": 1785 |
| }, |
| { |
| "epoch": 3.33955223880597, |
| "grad_norm": 0.28520822525024414, |
| "learning_rate": 0.0009864346859041057, |
| "loss": 0.728, |
| "num_input_tokens_seen": 510000, |
| "step": 1790 |
| }, |
| { |
| "epoch": 3.3488805970149254, |
| "grad_norm": 0.19970983266830444, |
| "learning_rate": 0.0009862457060067617, |
| "loss": 0.9002, |
| "num_input_tokens_seen": 511376, |
| "step": 1795 |
| }, |
| { |
| "epoch": 3.3582089552238807, |
| "grad_norm": 0.23394840955734253, |
| "learning_rate": 0.0009860554372045985, |
| "loss": 0.5357, |
| "num_input_tokens_seen": 512688, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.3675373134328357, |
| "grad_norm": 0.23587185144424438, |
| "learning_rate": 0.000985863880001966, |
| "loss": 0.7726, |
| "num_input_tokens_seen": 514000, |
| "step": 1805 |
| }, |
| { |
| "epoch": 3.376865671641791, |
| "grad_norm": 0.25410711765289307, |
| "learning_rate": 0.0009856710349066308, |
| "loss": 0.5922, |
| "num_input_tokens_seen": 515312, |
| "step": 1810 |
| }, |
| { |
| "epoch": 3.3861940298507465, |
| "grad_norm": 0.3701179623603821, |
| "learning_rate": 0.000985476902429772, |
| "loss": 0.7119, |
| "num_input_tokens_seen": 516528, |
| "step": 1815 |
| }, |
| { |
| "epoch": 3.3955223880597014, |
| "grad_norm": 0.19945183396339417, |
| "learning_rate": 0.0009852814830859826, |
| "loss": 0.5055, |
| "num_input_tokens_seen": 517936, |
| "step": 1820 |
| }, |
| { |
| "epoch": 3.404850746268657, |
| "grad_norm": 0.40914735198020935, |
| "learning_rate": 0.0009850847773932656, |
| "loss": 0.4616, |
| "num_input_tokens_seen": 519376, |
| "step": 1825 |
| }, |
| { |
| "epoch": 3.4141791044776117, |
| "grad_norm": 0.244802787899971, |
| "learning_rate": 0.000984886785873034, |
| "loss": 0.6121, |
| "num_input_tokens_seen": 521168, |
| "step": 1830 |
| }, |
| { |
| "epoch": 3.423507462686567, |
| "grad_norm": 0.48571139574050903, |
| "learning_rate": 0.00098468750905011, |
| "loss": 0.9497, |
| "num_input_tokens_seen": 522512, |
| "step": 1835 |
| }, |
| { |
| "epoch": 3.4328358208955225, |
| "grad_norm": 0.284146249294281, |
| "learning_rate": 0.0009844869474527214, |
| "loss": 0.7253, |
| "num_input_tokens_seen": 523888, |
| "step": 1840 |
| }, |
| { |
| "epoch": 3.4421641791044775, |
| "grad_norm": 0.2026161402463913, |
| "learning_rate": 0.0009842851016125028, |
| "loss": 0.6453, |
| "num_input_tokens_seen": 525360, |
| "step": 1845 |
| }, |
| { |
| "epoch": 3.451492537313433, |
| "grad_norm": 0.26175108551979065, |
| "learning_rate": 0.0009840819720644922, |
| "loss": 0.6997, |
| "num_input_tokens_seen": 526928, |
| "step": 1850 |
| }, |
| { |
| "epoch": 3.4608208955223883, |
| "grad_norm": 0.18297512829303741, |
| "learning_rate": 0.0009838775593471309, |
| "loss": 0.561, |
| "num_input_tokens_seen": 528272, |
| "step": 1855 |
| }, |
| { |
| "epoch": 3.470149253731343, |
| "grad_norm": 0.24992622435092926, |
| "learning_rate": 0.0009836718640022612, |
| "loss": 0.3125, |
| "num_input_tokens_seen": 529584, |
| "step": 1860 |
| }, |
| { |
| "epoch": 3.4794776119402986, |
| "grad_norm": 0.3266121745109558, |
| "learning_rate": 0.0009834648865751252, |
| "loss": 0.6182, |
| "num_input_tokens_seen": 530896, |
| "step": 1865 |
| }, |
| { |
| "epoch": 3.4888059701492535, |
| "grad_norm": 0.24236100912094116, |
| "learning_rate": 0.0009832566276143642, |
| "loss": 0.6259, |
| "num_input_tokens_seen": 532304, |
| "step": 1870 |
| }, |
| { |
| "epoch": 3.498134328358209, |
| "grad_norm": 0.20327317714691162, |
| "learning_rate": 0.0009830470876720152, |
| "loss": 0.5588, |
| "num_input_tokens_seen": 533808, |
| "step": 1875 |
| }, |
| { |
| "epoch": 3.5074626865671643, |
| "grad_norm": 0.18417012691497803, |
| "learning_rate": 0.000982836267303512, |
| "loss": 0.5219, |
| "num_input_tokens_seen": 535216, |
| "step": 1880 |
| }, |
| { |
| "epoch": 3.5167910447761193, |
| "grad_norm": 0.32585784792900085, |
| "learning_rate": 0.0009826241670676816, |
| "loss": 0.5187, |
| "num_input_tokens_seen": 536720, |
| "step": 1885 |
| }, |
| { |
| "epoch": 3.5261194029850746, |
| "grad_norm": 0.31505173444747925, |
| "learning_rate": 0.0009824107875267443, |
| "loss": 0.6881, |
| "num_input_tokens_seen": 538064, |
| "step": 1890 |
| }, |
| { |
| "epoch": 3.53544776119403, |
| "grad_norm": 0.3329700529575348, |
| "learning_rate": 0.0009821961292463108, |
| "loss": 0.4131, |
| "num_input_tokens_seen": 539600, |
| "step": 1895 |
| }, |
| { |
| "epoch": 3.544776119402985, |
| "grad_norm": 0.1608249843120575, |
| "learning_rate": 0.0009819801927953816, |
| "loss": 0.6528, |
| "num_input_tokens_seen": 540976, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.5541044776119404, |
| "grad_norm": 0.24903835356235504, |
| "learning_rate": 0.0009817629787463456, |
| "loss": 0.6215, |
| "num_input_tokens_seen": 542416, |
| "step": 1905 |
| }, |
| { |
| "epoch": 3.5634328358208958, |
| "grad_norm": 0.21984891593456268, |
| "learning_rate": 0.0009815444876749779, |
| "loss": 0.5424, |
| "num_input_tokens_seen": 543792, |
| "step": 1910 |
| }, |
| { |
| "epoch": 3.5727611940298507, |
| "grad_norm": 0.37518739700317383, |
| "learning_rate": 0.0009813247201604389, |
| "loss": 0.6319, |
| "num_input_tokens_seen": 545328, |
| "step": 1915 |
| }, |
| { |
| "epoch": 3.582089552238806, |
| "grad_norm": 0.16793659329414368, |
| "learning_rate": 0.0009811036767852725, |
| "loss": 0.4634, |
| "num_input_tokens_seen": 546672, |
| "step": 1920 |
| }, |
| { |
| "epoch": 3.591417910447761, |
| "grad_norm": 0.17305253446102142, |
| "learning_rate": 0.000980881358135404, |
| "loss": 0.4199, |
| "num_input_tokens_seen": 548080, |
| "step": 1925 |
| }, |
| { |
| "epoch": 3.6007462686567164, |
| "grad_norm": 0.13368631899356842, |
| "learning_rate": 0.0009806577648001397, |
| "loss": 0.4275, |
| "num_input_tokens_seen": 549680, |
| "step": 1930 |
| }, |
| { |
| "epoch": 3.6100746268656714, |
| "grad_norm": 0.3820650577545166, |
| "learning_rate": 0.0009804328973721645, |
| "loss": 0.6723, |
| "num_input_tokens_seen": 550992, |
| "step": 1935 |
| }, |
| { |
| "epoch": 3.6194029850746268, |
| "grad_norm": 0.18812960386276245, |
| "learning_rate": 0.0009802067564475413, |
| "loss": 0.7015, |
| "num_input_tokens_seen": 552496, |
| "step": 1940 |
| }, |
| { |
| "epoch": 3.628731343283582, |
| "grad_norm": 0.31182393431663513, |
| "learning_rate": 0.000979979342625707, |
| "loss": 0.5029, |
| "num_input_tokens_seen": 553840, |
| "step": 1945 |
| }, |
| { |
| "epoch": 3.638059701492537, |
| "grad_norm": 0.1738317906856537, |
| "learning_rate": 0.0009797506565094745, |
| "loss": 0.733, |
| "num_input_tokens_seen": 555248, |
| "step": 1950 |
| }, |
| { |
| "epoch": 3.6473880597014925, |
| "grad_norm": 0.2425081580877304, |
| "learning_rate": 0.000979520698705028, |
| "loss": 0.5905, |
| "num_input_tokens_seen": 556624, |
| "step": 1955 |
| }, |
| { |
| "epoch": 3.656716417910448, |
| "grad_norm": 0.23325420916080475, |
| "learning_rate": 0.000979289469821923, |
| "loss": 1.0098, |
| "num_input_tokens_seen": 558096, |
| "step": 1960 |
| }, |
| { |
| "epoch": 3.666044776119403, |
| "grad_norm": 0.3119276463985443, |
| "learning_rate": 0.0009790569704730843, |
| "loss": 0.7166, |
| "num_input_tokens_seen": 559376, |
| "step": 1965 |
| }, |
| { |
| "epoch": 3.675373134328358, |
| "grad_norm": 0.26881784200668335, |
| "learning_rate": 0.0009788232012748043, |
| "loss": 0.7225, |
| "num_input_tokens_seen": 560784, |
| "step": 1970 |
| }, |
| { |
| "epoch": 3.6847014925373136, |
| "grad_norm": 0.3153090476989746, |
| "learning_rate": 0.0009785881628467412, |
| "loss": 0.3893, |
| "num_input_tokens_seen": 562416, |
| "step": 1975 |
| }, |
| { |
| "epoch": 3.6940298507462686, |
| "grad_norm": 0.26809364557266235, |
| "learning_rate": 0.0009783518558119182, |
| "loss": 0.6695, |
| "num_input_tokens_seen": 563920, |
| "step": 1980 |
| }, |
| { |
| "epoch": 3.703358208955224, |
| "grad_norm": 0.3261995017528534, |
| "learning_rate": 0.0009781142807967205, |
| "loss": 0.5614, |
| "num_input_tokens_seen": 565424, |
| "step": 1985 |
| }, |
| { |
| "epoch": 3.7126865671641793, |
| "grad_norm": 0.23048445582389832, |
| "learning_rate": 0.0009778754384308947, |
| "loss": 0.4495, |
| "num_input_tokens_seen": 566896, |
| "step": 1990 |
| }, |
| { |
| "epoch": 3.7220149253731343, |
| "grad_norm": 0.2346111238002777, |
| "learning_rate": 0.000977635329347547, |
| "loss": 0.7348, |
| "num_input_tokens_seen": 568176, |
| "step": 1995 |
| }, |
| { |
| "epoch": 3.7313432835820897, |
| "grad_norm": 0.4382629990577698, |
| "learning_rate": 0.000977393954183141, |
| "loss": 0.7043, |
| "num_input_tokens_seen": 569616, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.7406716417910446, |
| "grad_norm": 0.28776127099990845, |
| "learning_rate": 0.0009771513135774965, |
| "loss": 0.3675, |
| "num_input_tokens_seen": 571024, |
| "step": 2005 |
| }, |
| { |
| "epoch": 3.75, |
| "grad_norm": 0.24283809959888458, |
| "learning_rate": 0.0009769074081737877, |
| "loss": 0.7819, |
| "num_input_tokens_seen": 572432, |
| "step": 2010 |
| }, |
| { |
| "epoch": 3.7593283582089554, |
| "grad_norm": 0.299737811088562, |
| "learning_rate": 0.000976662238618541, |
| "loss": 0.5218, |
| "num_input_tokens_seen": 573712, |
| "step": 2015 |
| }, |
| { |
| "epoch": 3.7686567164179103, |
| "grad_norm": 0.26282361149787903, |
| "learning_rate": 0.0009764158055616346, |
| "loss": 0.6876, |
| "num_input_tokens_seen": 575184, |
| "step": 2020 |
| }, |
| { |
| "epoch": 3.7779850746268657, |
| "grad_norm": 0.13650095462799072, |
| "learning_rate": 0.0009761681096562949, |
| "loss": 0.5098, |
| "num_input_tokens_seen": 576688, |
| "step": 2025 |
| }, |
| { |
| "epoch": 3.7873134328358207, |
| "grad_norm": 0.2913561463356018, |
| "learning_rate": 0.0009759191515590963, |
| "loss": 0.8325, |
| "num_input_tokens_seen": 578000, |
| "step": 2030 |
| }, |
| { |
| "epoch": 3.796641791044776, |
| "grad_norm": 0.23576973378658295, |
| "learning_rate": 0.0009756689319299592, |
| "loss": 0.5476, |
| "num_input_tokens_seen": 579632, |
| "step": 2035 |
| }, |
| { |
| "epoch": 3.8059701492537314, |
| "grad_norm": 0.14105857908725739, |
| "learning_rate": 0.0009754174514321472, |
| "loss": 0.5916, |
| "num_input_tokens_seen": 581104, |
| "step": 2040 |
| }, |
| { |
| "epoch": 3.8152985074626864, |
| "grad_norm": 0.3116491436958313, |
| "learning_rate": 0.0009751647107322667, |
| "loss": 0.8013, |
| "num_input_tokens_seen": 582512, |
| "step": 2045 |
| }, |
| { |
| "epoch": 3.824626865671642, |
| "grad_norm": 0.3280572295188904, |
| "learning_rate": 0.0009749107105002646, |
| "loss": 0.5183, |
| "num_input_tokens_seen": 583920, |
| "step": 2050 |
| }, |
| { |
| "epoch": 3.833955223880597, |
| "grad_norm": 0.2682742178440094, |
| "learning_rate": 0.000974655451409426, |
| "loss": 0.7085, |
| "num_input_tokens_seen": 585328, |
| "step": 2055 |
| }, |
| { |
| "epoch": 3.843283582089552, |
| "grad_norm": 0.18694724142551422, |
| "learning_rate": 0.0009743989341363731, |
| "loss": 0.6983, |
| "num_input_tokens_seen": 586608, |
| "step": 2060 |
| }, |
| { |
| "epoch": 3.8526119402985075, |
| "grad_norm": 0.19222024083137512, |
| "learning_rate": 0.0009741411593610635, |
| "loss": 0.5223, |
| "num_input_tokens_seen": 588080, |
| "step": 2065 |
| }, |
| { |
| "epoch": 3.861940298507463, |
| "grad_norm": 0.19455499947071075, |
| "learning_rate": 0.0009738821277667878, |
| "loss": 0.4412, |
| "num_input_tokens_seen": 589552, |
| "step": 2070 |
| }, |
| { |
| "epoch": 3.871268656716418, |
| "grad_norm": 0.14917276799678802, |
| "learning_rate": 0.0009736218400401682, |
| "loss": 0.5749, |
| "num_input_tokens_seen": 590928, |
| "step": 2075 |
| }, |
| { |
| "epoch": 3.8805970149253732, |
| "grad_norm": 0.1883125603199005, |
| "learning_rate": 0.0009733602968711565, |
| "loss": 0.6998, |
| "num_input_tokens_seen": 592560, |
| "step": 2080 |
| }, |
| { |
| "epoch": 3.8899253731343286, |
| "grad_norm": 0.4482322335243225, |
| "learning_rate": 0.0009730974989530321, |
| "loss": 0.5617, |
| "num_input_tokens_seen": 594032, |
| "step": 2085 |
| }, |
| { |
| "epoch": 3.8992537313432836, |
| "grad_norm": 0.22975589334964752, |
| "learning_rate": 0.000972833446982401, |
| "loss": 0.6922, |
| "num_input_tokens_seen": 595472, |
| "step": 2090 |
| }, |
| { |
| "epoch": 3.908582089552239, |
| "grad_norm": 0.5198706388473511, |
| "learning_rate": 0.0009725681416591927, |
| "loss": 0.7359, |
| "num_input_tokens_seen": 596976, |
| "step": 2095 |
| }, |
| { |
| "epoch": 3.917910447761194, |
| "grad_norm": 0.2125498652458191, |
| "learning_rate": 0.0009723015836866595, |
| "loss": 0.5097, |
| "num_input_tokens_seen": 598256, |
| "step": 2100 |
| }, |
| { |
| "epoch": 3.9272388059701493, |
| "grad_norm": 0.1989123821258545, |
| "learning_rate": 0.0009720337737713739, |
| "loss": 0.5684, |
| "num_input_tokens_seen": 599568, |
| "step": 2105 |
| }, |
| { |
| "epoch": 3.9365671641791042, |
| "grad_norm": 0.2720445990562439, |
| "learning_rate": 0.000971764712623227, |
| "loss": 0.5866, |
| "num_input_tokens_seen": 601008, |
| "step": 2110 |
| }, |
| { |
| "epoch": 3.9458955223880596, |
| "grad_norm": 0.23459851741790771, |
| "learning_rate": 0.0009714944009554262, |
| "loss": 0.5404, |
| "num_input_tokens_seen": 602256, |
| "step": 2115 |
| }, |
| { |
| "epoch": 3.955223880597015, |
| "grad_norm": 0.17958712577819824, |
| "learning_rate": 0.0009712228394844945, |
| "loss": 0.4168, |
| "num_input_tokens_seen": 603760, |
| "step": 2120 |
| }, |
| { |
| "epoch": 3.96455223880597, |
| "grad_norm": 0.13713163137435913, |
| "learning_rate": 0.0009709500289302673, |
| "loss": 0.5269, |
| "num_input_tokens_seen": 605232, |
| "step": 2125 |
| }, |
| { |
| "epoch": 3.9738805970149254, |
| "grad_norm": 0.15691228210926056, |
| "learning_rate": 0.0009706759700158907, |
| "loss": 0.5853, |
| "num_input_tokens_seen": 606448, |
| "step": 2130 |
| }, |
| { |
| "epoch": 3.9832089552238807, |
| "grad_norm": 0.15463533997535706, |
| "learning_rate": 0.0009704006634678205, |
| "loss": 0.3447, |
| "num_input_tokens_seen": 608144, |
| "step": 2135 |
| }, |
| { |
| "epoch": 3.9925373134328357, |
| "grad_norm": 0.20043453574180603, |
| "learning_rate": 0.0009701241100158189, |
| "loss": 0.5617, |
| "num_input_tokens_seen": 609744, |
| "step": 2140 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.64982670545578, |
| "eval_runtime": 4.1877, |
| "eval_samples_per_second": 56.834, |
| "eval_steps_per_second": 14.328, |
| "num_input_tokens_seen": 610584, |
| "step": 2144 |
| }, |
| { |
| "epoch": 4.001865671641791, |
| "grad_norm": 0.22188594937324524, |
| "learning_rate": 0.0009698463103929542, |
| "loss": 0.4556, |
| "num_input_tokens_seen": 610808, |
| "step": 2145 |
| }, |
| { |
| "epoch": 4.0111940298507465, |
| "grad_norm": 0.151460200548172, |
| "learning_rate": 0.0009695672653355972, |
| "loss": 0.387, |
| "num_input_tokens_seen": 612280, |
| "step": 2150 |
| }, |
| { |
| "epoch": 4.020522388059701, |
| "grad_norm": 0.1626012623310089, |
| "learning_rate": 0.0009692869755834203, |
| "loss": 0.6283, |
| "num_input_tokens_seen": 613816, |
| "step": 2155 |
| }, |
| { |
| "epoch": 4.029850746268656, |
| "grad_norm": 0.21569664776325226, |
| "learning_rate": 0.0009690054418793955, |
| "loss": 0.3275, |
| "num_input_tokens_seen": 615224, |
| "step": 2160 |
| }, |
| { |
| "epoch": 4.039179104477612, |
| "grad_norm": 0.2591455280780792, |
| "learning_rate": 0.0009687226649697915, |
| "loss": 0.5641, |
| "num_input_tokens_seen": 616568, |
| "step": 2165 |
| }, |
| { |
| "epoch": 4.048507462686567, |
| "grad_norm": 0.24344514310359955, |
| "learning_rate": 0.000968438645604173, |
| "loss": 0.3863, |
| "num_input_tokens_seen": 618008, |
| "step": 2170 |
| }, |
| { |
| "epoch": 4.057835820895522, |
| "grad_norm": 0.32639652490615845, |
| "learning_rate": 0.0009681533845353978, |
| "loss": 0.5677, |
| "num_input_tokens_seen": 619576, |
| "step": 2175 |
| }, |
| { |
| "epoch": 4.067164179104478, |
| "grad_norm": 0.3175128400325775, |
| "learning_rate": 0.0009678668825196154, |
| "loss": 0.6541, |
| "num_input_tokens_seen": 620920, |
| "step": 2180 |
| }, |
| { |
| "epoch": 4.076492537313433, |
| "grad_norm": 0.327414333820343, |
| "learning_rate": 0.0009675791403162645, |
| "loss": 0.5617, |
| "num_input_tokens_seen": 622296, |
| "step": 2185 |
| }, |
| { |
| "epoch": 4.085820895522388, |
| "grad_norm": 0.1868485063314438, |
| "learning_rate": 0.0009672901586880711, |
| "loss": 0.2805, |
| "num_input_tokens_seen": 623800, |
| "step": 2190 |
| }, |
| { |
| "epoch": 4.095149253731344, |
| "grad_norm": 0.2809881567955017, |
| "learning_rate": 0.000966999938401047, |
| "loss": 0.4671, |
| "num_input_tokens_seen": 625208, |
| "step": 2195 |
| }, |
| { |
| "epoch": 4.104477611940299, |
| "grad_norm": 0.21550977230072021, |
| "learning_rate": 0.0009667084802244868, |
| "loss": 0.585, |
| "num_input_tokens_seen": 626584, |
| "step": 2200 |
| }, |
| { |
| "epoch": 4.1138059701492535, |
| "grad_norm": 0.27427732944488525, |
| "learning_rate": 0.0009664157849309669, |
| "loss": 0.6408, |
| "num_input_tokens_seen": 627960, |
| "step": 2205 |
| }, |
| { |
| "epoch": 4.123134328358209, |
| "grad_norm": 0.23494300246238708, |
| "learning_rate": 0.0009661218532963426, |
| "loss": 0.5565, |
| "num_input_tokens_seen": 629368, |
| "step": 2210 |
| }, |
| { |
| "epoch": 4.132462686567164, |
| "grad_norm": 0.26595717668533325, |
| "learning_rate": 0.0009658266860997465, |
| "loss": 0.6724, |
| "num_input_tokens_seen": 630776, |
| "step": 2215 |
| }, |
| { |
| "epoch": 4.141791044776119, |
| "grad_norm": 0.18775132298469543, |
| "learning_rate": 0.0009655302841235865, |
| "loss": 0.4481, |
| "num_input_tokens_seen": 632120, |
| "step": 2220 |
| }, |
| { |
| "epoch": 4.151119402985074, |
| "grad_norm": 0.13259465992450714, |
| "learning_rate": 0.0009652326481535434, |
| "loss": 0.4325, |
| "num_input_tokens_seen": 633784, |
| "step": 2225 |
| }, |
| { |
| "epoch": 4.16044776119403, |
| "grad_norm": 0.1564403623342514, |
| "learning_rate": 0.0009649337789785688, |
| "loss": 0.6364, |
| "num_input_tokens_seen": 635224, |
| "step": 2230 |
| }, |
| { |
| "epoch": 4.169776119402985, |
| "grad_norm": 0.326425701379776, |
| "learning_rate": 0.000964633677390884, |
| "loss": 0.5656, |
| "num_input_tokens_seen": 636504, |
| "step": 2235 |
| }, |
| { |
| "epoch": 4.17910447761194, |
| "grad_norm": 0.18858228623867035, |
| "learning_rate": 0.0009643323441859757, |
| "loss": 0.4936, |
| "num_input_tokens_seen": 638232, |
| "step": 2240 |
| }, |
| { |
| "epoch": 4.188432835820896, |
| "grad_norm": 0.21565784513950348, |
| "learning_rate": 0.0009640297801625968, |
| "loss": 0.595, |
| "num_input_tokens_seen": 639608, |
| "step": 2245 |
| }, |
| { |
| "epoch": 4.197761194029851, |
| "grad_norm": 0.15167202055454254, |
| "learning_rate": 0.0009637259861227616, |
| "loss": 0.5027, |
| "num_input_tokens_seen": 641112, |
| "step": 2250 |
| }, |
| { |
| "epoch": 4.207089552238806, |
| "grad_norm": 0.24145975708961487, |
| "learning_rate": 0.0009634209628717455, |
| "loss": 0.577, |
| "num_input_tokens_seen": 642488, |
| "step": 2255 |
| }, |
| { |
| "epoch": 4.2164179104477615, |
| "grad_norm": 0.32112327218055725, |
| "learning_rate": 0.000963114711218082, |
| "loss": 0.6006, |
| "num_input_tokens_seen": 643800, |
| "step": 2260 |
| }, |
| { |
| "epoch": 4.225746268656716, |
| "grad_norm": 0.3594324290752411, |
| "learning_rate": 0.0009628072319735606, |
| "loss": 0.7394, |
| "num_input_tokens_seen": 645304, |
| "step": 2265 |
| }, |
| { |
| "epoch": 4.235074626865671, |
| "grad_norm": 0.26933395862579346, |
| "learning_rate": 0.0009624985259532251, |
| "loss": 0.5461, |
| "num_input_tokens_seen": 646648, |
| "step": 2270 |
| }, |
| { |
| "epoch": 4.244402985074627, |
| "grad_norm": 0.2904312014579773, |
| "learning_rate": 0.000962188593975371, |
| "loss": 0.6195, |
| "num_input_tokens_seen": 648216, |
| "step": 2275 |
| }, |
| { |
| "epoch": 4.253731343283582, |
| "grad_norm": 0.12113097310066223, |
| "learning_rate": 0.0009618774368615432, |
| "loss": 0.6924, |
| "num_input_tokens_seen": 649592, |
| "step": 2280 |
| }, |
| { |
| "epoch": 4.263059701492537, |
| "grad_norm": 0.32047057151794434, |
| "learning_rate": 0.000961565055436535, |
| "loss": 0.468, |
| "num_input_tokens_seen": 651096, |
| "step": 2285 |
| }, |
| { |
| "epoch": 4.272388059701493, |
| "grad_norm": 0.2290755957365036, |
| "learning_rate": 0.0009612514505283838, |
| "loss": 0.4576, |
| "num_input_tokens_seen": 652536, |
| "step": 2290 |
| }, |
| { |
| "epoch": 4.281716417910448, |
| "grad_norm": 0.19104692339897156, |
| "learning_rate": 0.000960936622968371, |
| "loss": 0.7115, |
| "num_input_tokens_seen": 653976, |
| "step": 2295 |
| }, |
| { |
| "epoch": 4.291044776119403, |
| "grad_norm": 0.2492237538099289, |
| "learning_rate": 0.0009606205735910186, |
| "loss": 0.5699, |
| "num_input_tokens_seen": 655320, |
| "step": 2300 |
| }, |
| { |
| "epoch": 4.300373134328359, |
| "grad_norm": 0.19649381935596466, |
| "learning_rate": 0.0009603033032340874, |
| "loss": 0.7391, |
| "num_input_tokens_seen": 656760, |
| "step": 2305 |
| }, |
| { |
| "epoch": 4.309701492537314, |
| "grad_norm": 0.20256049931049347, |
| "learning_rate": 0.0009599848127385747, |
| "loss": 0.6327, |
| "num_input_tokens_seen": 658200, |
| "step": 2310 |
| }, |
| { |
| "epoch": 4.3190298507462686, |
| "grad_norm": 0.22700995206832886, |
| "learning_rate": 0.0009596651029487116, |
| "loss": 0.6798, |
| "num_input_tokens_seen": 659544, |
| "step": 2315 |
| }, |
| { |
| "epoch": 4.3283582089552235, |
| "grad_norm": 0.1987559199333191, |
| "learning_rate": 0.000959344174711962, |
| "loss": 0.5871, |
| "num_input_tokens_seen": 660952, |
| "step": 2320 |
| }, |
| { |
| "epoch": 4.337686567164179, |
| "grad_norm": 0.21659596264362335, |
| "learning_rate": 0.0009590220288790191, |
| "loss": 0.5206, |
| "num_input_tokens_seen": 662424, |
| "step": 2325 |
| }, |
| { |
| "epoch": 4.347014925373134, |
| "grad_norm": 0.20847119390964508, |
| "learning_rate": 0.0009586986663038035, |
| "loss": 0.341, |
| "num_input_tokens_seen": 663832, |
| "step": 2330 |
| }, |
| { |
| "epoch": 4.356343283582089, |
| "grad_norm": 0.2969812750816345, |
| "learning_rate": 0.0009583740878434616, |
| "loss": 0.6927, |
| "num_input_tokens_seen": 665112, |
| "step": 2335 |
| }, |
| { |
| "epoch": 4.365671641791045, |
| "grad_norm": 0.33004283905029297, |
| "learning_rate": 0.0009580482943583621, |
| "loss": 0.7178, |
| "num_input_tokens_seen": 666712, |
| "step": 2340 |
| }, |
| { |
| "epoch": 4.375, |
| "grad_norm": 0.2885778248310089, |
| "learning_rate": 0.0009577212867120946, |
| "loss": 0.6155, |
| "num_input_tokens_seen": 668248, |
| "step": 2345 |
| }, |
| { |
| "epoch": 4.384328358208955, |
| "grad_norm": 0.15386725962162018, |
| "learning_rate": 0.0009573930657714678, |
| "loss": 0.5363, |
| "num_input_tokens_seen": 669816, |
| "step": 2350 |
| }, |
| { |
| "epoch": 4.393656716417911, |
| "grad_norm": 0.19172047078609467, |
| "learning_rate": 0.0009570636324065054, |
| "loss": 0.7379, |
| "num_input_tokens_seen": 671448, |
| "step": 2355 |
| }, |
| { |
| "epoch": 4.402985074626866, |
| "grad_norm": 0.14004682004451752, |
| "learning_rate": 0.0009567329874904456, |
| "loss": 0.5496, |
| "num_input_tokens_seen": 673016, |
| "step": 2360 |
| }, |
| { |
| "epoch": 4.412313432835821, |
| "grad_norm": 0.24465550482273102, |
| "learning_rate": 0.0009564011318997379, |
| "loss": 0.4256, |
| "num_input_tokens_seen": 674456, |
| "step": 2365 |
| }, |
| { |
| "epoch": 4.4216417910447765, |
| "grad_norm": 0.2554193437099457, |
| "learning_rate": 0.0009560680665140414, |
| "loss": 0.619, |
| "num_input_tokens_seen": 676056, |
| "step": 2370 |
| }, |
| { |
| "epoch": 4.4309701492537314, |
| "grad_norm": 0.16861599683761597, |
| "learning_rate": 0.0009557337922162211, |
| "loss": 0.327, |
| "num_input_tokens_seen": 677784, |
| "step": 2375 |
| }, |
| { |
| "epoch": 4.440298507462686, |
| "grad_norm": 0.21215523779392242, |
| "learning_rate": 0.0009553983098923473, |
| "loss": 0.5506, |
| "num_input_tokens_seen": 678968, |
| "step": 2380 |
| }, |
| { |
| "epoch": 4.449626865671641, |
| "grad_norm": 0.23762239515781403, |
| "learning_rate": 0.0009550616204316922, |
| "loss": 0.672, |
| "num_input_tokens_seen": 680504, |
| "step": 2385 |
| }, |
| { |
| "epoch": 4.458955223880597, |
| "grad_norm": 0.3079289197921753, |
| "learning_rate": 0.0009547237247267277, |
| "loss": 0.6254, |
| "num_input_tokens_seen": 681816, |
| "step": 2390 |
| }, |
| { |
| "epoch": 4.468283582089552, |
| "grad_norm": 0.37273162603378296, |
| "learning_rate": 0.0009543846236731234, |
| "loss": 0.6398, |
| "num_input_tokens_seen": 683256, |
| "step": 2395 |
| }, |
| { |
| "epoch": 4.477611940298507, |
| "grad_norm": 0.18848223984241486, |
| "learning_rate": 0.0009540443181697436, |
| "loss": 0.4413, |
| "num_input_tokens_seen": 684888, |
| "step": 2400 |
| }, |
| { |
| "epoch": 4.486940298507463, |
| "grad_norm": 0.22393417358398438, |
| "learning_rate": 0.0009537028091186453, |
| "loss": 0.3668, |
| "num_input_tokens_seen": 686296, |
| "step": 2405 |
| }, |
| { |
| "epoch": 4.496268656716418, |
| "grad_norm": 0.22919961810112, |
| "learning_rate": 0.000953360097425076, |
| "loss": 0.5036, |
| "num_input_tokens_seen": 687736, |
| "step": 2410 |
| }, |
| { |
| "epoch": 4.505597014925373, |
| "grad_norm": 0.15756945312023163, |
| "learning_rate": 0.0009530161839974711, |
| "loss": 0.6575, |
| "num_input_tokens_seen": 689240, |
| "step": 2415 |
| }, |
| { |
| "epoch": 4.514925373134329, |
| "grad_norm": 0.1780344396829605, |
| "learning_rate": 0.0009526710697474513, |
| "loss": 0.7309, |
| "num_input_tokens_seen": 690616, |
| "step": 2420 |
| }, |
| { |
| "epoch": 4.524253731343284, |
| "grad_norm": 0.2617693245410919, |
| "learning_rate": 0.0009523247555898205, |
| "loss": 0.6014, |
| "num_input_tokens_seen": 691960, |
| "step": 2425 |
| }, |
| { |
| "epoch": 4.5335820895522385, |
| "grad_norm": 0.23155321180820465, |
| "learning_rate": 0.0009519772424425628, |
| "loss": 0.5385, |
| "num_input_tokens_seen": 693592, |
| "step": 2430 |
| }, |
| { |
| "epoch": 4.542910447761194, |
| "grad_norm": 0.2970794141292572, |
| "learning_rate": 0.000951628531226841, |
| "loss": 0.513, |
| "num_input_tokens_seen": 695128, |
| "step": 2435 |
| }, |
| { |
| "epoch": 4.552238805970149, |
| "grad_norm": 0.14807343482971191, |
| "learning_rate": 0.0009512786228669936, |
| "loss": 0.3699, |
| "num_input_tokens_seen": 696504, |
| "step": 2440 |
| }, |
| { |
| "epoch": 4.561567164179104, |
| "grad_norm": 0.26394209265708923, |
| "learning_rate": 0.0009509275182905322, |
| "loss": 0.6715, |
| "num_input_tokens_seen": 697880, |
| "step": 2445 |
| }, |
| { |
| "epoch": 4.57089552238806, |
| "grad_norm": 0.29761648178100586, |
| "learning_rate": 0.0009505752184281391, |
| "loss": 0.6177, |
| "num_input_tokens_seen": 699352, |
| "step": 2450 |
| }, |
| { |
| "epoch": 4.580223880597015, |
| "grad_norm": 0.24216167628765106, |
| "learning_rate": 0.0009502217242136656, |
| "loss": 0.609, |
| "num_input_tokens_seen": 700824, |
| "step": 2455 |
| }, |
| { |
| "epoch": 4.58955223880597, |
| "grad_norm": 0.2688663899898529, |
| "learning_rate": 0.0009498670365841282, |
| "loss": 0.4861, |
| "num_input_tokens_seen": 702232, |
| "step": 2460 |
| }, |
| { |
| "epoch": 4.598880597014926, |
| "grad_norm": 0.22293132543563843, |
| "learning_rate": 0.0009495111564797073, |
| "loss": 0.345, |
| "num_input_tokens_seen": 703640, |
| "step": 2465 |
| }, |
| { |
| "epoch": 4.608208955223881, |
| "grad_norm": 0.21038322150707245, |
| "learning_rate": 0.000949154084843744, |
| "loss": 0.6915, |
| "num_input_tokens_seen": 704952, |
| "step": 2470 |
| }, |
| { |
| "epoch": 4.617537313432836, |
| "grad_norm": 0.21407586336135864, |
| "learning_rate": 0.0009487958226227378, |
| "loss": 0.4968, |
| "num_input_tokens_seen": 706328, |
| "step": 2475 |
| }, |
| { |
| "epoch": 4.6268656716417915, |
| "grad_norm": 0.2326938360929489, |
| "learning_rate": 0.0009484363707663442, |
| "loss": 0.5528, |
| "num_input_tokens_seen": 707800, |
| "step": 2480 |
| }, |
| { |
| "epoch": 4.6361940298507465, |
| "grad_norm": 0.23766377568244934, |
| "learning_rate": 0.0009480757302273721, |
| "loss": 0.5268, |
| "num_input_tokens_seen": 709144, |
| "step": 2485 |
| }, |
| { |
| "epoch": 4.645522388059701, |
| "grad_norm": 0.1962820440530777, |
| "learning_rate": 0.0009477139019617813, |
| "loss": 0.4697, |
| "num_input_tokens_seen": 710488, |
| "step": 2490 |
| }, |
| { |
| "epoch": 4.654850746268656, |
| "grad_norm": 0.29607799649238586, |
| "learning_rate": 0.00094735088692868, |
| "loss": 0.5732, |
| "num_input_tokens_seen": 711736, |
| "step": 2495 |
| }, |
| { |
| "epoch": 4.664179104477612, |
| "grad_norm": 0.26132312417030334, |
| "learning_rate": 0.0009469866860903217, |
| "loss": 0.7312, |
| "num_input_tokens_seen": 712888, |
| "step": 2500 |
| }, |
| { |
| "epoch": 4.673507462686567, |
| "grad_norm": 0.29316362738609314, |
| "learning_rate": 0.0009466213004121041, |
| "loss": 0.4659, |
| "num_input_tokens_seen": 714168, |
| "step": 2505 |
| }, |
| { |
| "epoch": 4.682835820895522, |
| "grad_norm": 0.21937410533428192, |
| "learning_rate": 0.0009462547308625647, |
| "loss": 0.2729, |
| "num_input_tokens_seen": 715544, |
| "step": 2510 |
| }, |
| { |
| "epoch": 4.692164179104478, |
| "grad_norm": 0.15683762729167938, |
| "learning_rate": 0.0009458869784133795, |
| "loss": 0.4786, |
| "num_input_tokens_seen": 716952, |
| "step": 2515 |
| }, |
| { |
| "epoch": 4.701492537313433, |
| "grad_norm": 0.377902626991272, |
| "learning_rate": 0.0009455180440393598, |
| "loss": 0.4655, |
| "num_input_tokens_seen": 718264, |
| "step": 2520 |
| }, |
| { |
| "epoch": 4.710820895522388, |
| "grad_norm": 0.21586859226226807, |
| "learning_rate": 0.0009451479287184505, |
| "loss": 0.4381, |
| "num_input_tokens_seen": 719704, |
| "step": 2525 |
| }, |
| { |
| "epoch": 4.720149253731344, |
| "grad_norm": 0.12033271789550781, |
| "learning_rate": 0.000944776633431726, |
| "loss": 0.5281, |
| "num_input_tokens_seen": 721112, |
| "step": 2530 |
| }, |
| { |
| "epoch": 4.729477611940299, |
| "grad_norm": 0.29399579763412476, |
| "learning_rate": 0.0009444041591633893, |
| "loss": 0.7112, |
| "num_input_tokens_seen": 722456, |
| "step": 2535 |
| }, |
| { |
| "epoch": 4.7388059701492535, |
| "grad_norm": 0.21166181564331055, |
| "learning_rate": 0.0009440305069007678, |
| "loss": 0.6444, |
| "num_input_tokens_seen": 723864, |
| "step": 2540 |
| }, |
| { |
| "epoch": 4.7481343283582085, |
| "grad_norm": 0.30777788162231445, |
| "learning_rate": 0.0009436556776343119, |
| "loss": 0.6277, |
| "num_input_tokens_seen": 725272, |
| "step": 2545 |
| }, |
| { |
| "epoch": 4.757462686567164, |
| "grad_norm": 0.5244273543357849, |
| "learning_rate": 0.0009432796723575918, |
| "loss": 0.6372, |
| "num_input_tokens_seen": 726648, |
| "step": 2550 |
| }, |
| { |
| "epoch": 4.766791044776119, |
| "grad_norm": 0.28162702918052673, |
| "learning_rate": 0.000942902492067295, |
| "loss": 0.7612, |
| "num_input_tokens_seen": 727928, |
| "step": 2555 |
| }, |
| { |
| "epoch": 4.776119402985074, |
| "grad_norm": 0.23690469563007355, |
| "learning_rate": 0.0009425241377632239, |
| "loss": 0.7744, |
| "num_input_tokens_seen": 729208, |
| "step": 2560 |
| }, |
| { |
| "epoch": 4.78544776119403, |
| "grad_norm": 0.26138120889663696, |
| "learning_rate": 0.0009421446104482923, |
| "loss": 0.6066, |
| "num_input_tokens_seen": 730616, |
| "step": 2565 |
| }, |
| { |
| "epoch": 4.794776119402985, |
| "grad_norm": 0.24936138093471527, |
| "learning_rate": 0.0009417639111285234, |
| "loss": 0.54, |
| "num_input_tokens_seen": 732120, |
| "step": 2570 |
| }, |
| { |
| "epoch": 4.80410447761194, |
| "grad_norm": 0.23255106806755066, |
| "learning_rate": 0.000941382040813048, |
| "loss": 0.4528, |
| "num_input_tokens_seen": 733560, |
| "step": 2575 |
| }, |
| { |
| "epoch": 4.813432835820896, |
| "grad_norm": 0.19354680180549622, |
| "learning_rate": 0.0009409990005140998, |
| "loss": 0.496, |
| "num_input_tokens_seen": 735000, |
| "step": 2580 |
| }, |
| { |
| "epoch": 4.822761194029851, |
| "grad_norm": 0.1485157459974289, |
| "learning_rate": 0.0009406147912470142, |
| "loss": 0.5483, |
| "num_input_tokens_seen": 736440, |
| "step": 2585 |
| }, |
| { |
| "epoch": 4.832089552238806, |
| "grad_norm": 0.3111797273159027, |
| "learning_rate": 0.0009402294140302255, |
| "loss": 0.8823, |
| "num_input_tokens_seen": 738008, |
| "step": 2590 |
| }, |
| { |
| "epoch": 4.8414179104477615, |
| "grad_norm": 0.26995381712913513, |
| "learning_rate": 0.0009398428698852632, |
| "loss": 0.6301, |
| "num_input_tokens_seen": 739320, |
| "step": 2595 |
| }, |
| { |
| "epoch": 4.850746268656716, |
| "grad_norm": 0.30793821811676025, |
| "learning_rate": 0.0009394551598367509, |
| "loss": 0.539, |
| "num_input_tokens_seen": 740792, |
| "step": 2600 |
| }, |
| { |
| "epoch": 4.860074626865671, |
| "grad_norm": 0.2184814214706421, |
| "learning_rate": 0.0009390662849124021, |
| "loss": 0.4266, |
| "num_input_tokens_seen": 742296, |
| "step": 2605 |
| }, |
| { |
| "epoch": 4.869402985074627, |
| "grad_norm": 0.26626819372177124, |
| "learning_rate": 0.0009386762461430182, |
| "loss": 0.7834, |
| "num_input_tokens_seen": 743928, |
| "step": 2610 |
| }, |
| { |
| "epoch": 4.878731343283582, |
| "grad_norm": 0.16901420056819916, |
| "learning_rate": 0.0009382850445624855, |
| "loss": 0.4878, |
| "num_input_tokens_seen": 745304, |
| "step": 2615 |
| }, |
| { |
| "epoch": 4.888059701492537, |
| "grad_norm": 0.16570930182933807, |
| "learning_rate": 0.0009378926812077732, |
| "loss": 0.3941, |
| "num_input_tokens_seen": 746648, |
| "step": 2620 |
| }, |
| { |
| "epoch": 4.897388059701493, |
| "grad_norm": 0.21604560315608978, |
| "learning_rate": 0.000937499157118929, |
| "loss": 0.6682, |
| "num_input_tokens_seen": 748152, |
| "step": 2625 |
| }, |
| { |
| "epoch": 4.906716417910448, |
| "grad_norm": 0.15624657273292542, |
| "learning_rate": 0.0009371044733390786, |
| "loss": 0.648, |
| "num_input_tokens_seen": 749560, |
| "step": 2630 |
| }, |
| { |
| "epoch": 4.916044776119403, |
| "grad_norm": 0.24081504344940186, |
| "learning_rate": 0.0009367086309144206, |
| "loss": 0.5101, |
| "num_input_tokens_seen": 751000, |
| "step": 2635 |
| }, |
| { |
| "epoch": 4.925373134328359, |
| "grad_norm": 0.17723239958286285, |
| "learning_rate": 0.0009363116308942256, |
| "loss": 0.547, |
| "num_input_tokens_seen": 752408, |
| "step": 2640 |
| }, |
| { |
| "epoch": 4.934701492537314, |
| "grad_norm": 0.26823535561561584, |
| "learning_rate": 0.0009359134743308324, |
| "loss": 0.5798, |
| "num_input_tokens_seen": 753784, |
| "step": 2645 |
| }, |
| { |
| "epoch": 4.9440298507462686, |
| "grad_norm": 0.2015918642282486, |
| "learning_rate": 0.0009355141622796455, |
| "loss": 0.757, |
| "num_input_tokens_seen": 755224, |
| "step": 2650 |
| }, |
| { |
| "epoch": 4.9533582089552235, |
| "grad_norm": 0.2711903154850006, |
| "learning_rate": 0.0009351136957991324, |
| "loss": 0.5665, |
| "num_input_tokens_seen": 756600, |
| "step": 2655 |
| }, |
| { |
| "epoch": 4.962686567164179, |
| "grad_norm": 0.3151332139968872, |
| "learning_rate": 0.0009347120759508205, |
| "loss": 0.7391, |
| "num_input_tokens_seen": 758008, |
| "step": 2660 |
| }, |
| { |
| "epoch": 4.972014925373134, |
| "grad_norm": 0.2473651021718979, |
| "learning_rate": 0.0009343093037992945, |
| "loss": 0.6037, |
| "num_input_tokens_seen": 759448, |
| "step": 2665 |
| }, |
| { |
| "epoch": 4.981343283582089, |
| "grad_norm": 0.1524503082036972, |
| "learning_rate": 0.0009339053804121936, |
| "loss": 0.6259, |
| "num_input_tokens_seen": 760696, |
| "step": 2670 |
| }, |
| { |
| "epoch": 4.990671641791045, |
| "grad_norm": 0.20363134145736694, |
| "learning_rate": 0.0009335003068602086, |
| "loss": 0.5679, |
| "num_input_tokens_seen": 762104, |
| "step": 2675 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 0.4046460688114166, |
| "learning_rate": 0.0009330940842170789, |
| "loss": 0.5602, |
| "num_input_tokens_seen": 763216, |
| "step": 2680 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.6487875580787659, |
| "eval_runtime": 4.2039, |
| "eval_samples_per_second": 56.614, |
| "eval_steps_per_second": 14.272, |
| "num_input_tokens_seen": 763216, |
| "step": 2680 |
| }, |
| { |
| "epoch": 5.009328358208955, |
| "grad_norm": 0.20314471423625946, |
| "learning_rate": 0.0009326867135595905, |
| "loss": 0.4152, |
| "num_input_tokens_seen": 764624, |
| "step": 2685 |
| }, |
| { |
| "epoch": 5.018656716417911, |
| "grad_norm": 0.11228202283382416, |
| "learning_rate": 0.0009322781959675714, |
| "loss": 0.4049, |
| "num_input_tokens_seen": 766128, |
| "step": 2690 |
| }, |
| { |
| "epoch": 5.027985074626866, |
| "grad_norm": 0.19546857476234436, |
| "learning_rate": 0.0009318685325238908, |
| "loss": 0.5314, |
| "num_input_tokens_seen": 767536, |
| "step": 2695 |
| }, |
| { |
| "epoch": 5.037313432835821, |
| "grad_norm": 0.1725783497095108, |
| "learning_rate": 0.0009314577243144546, |
| "loss": 0.3877, |
| "num_input_tokens_seen": 769008, |
| "step": 2700 |
| }, |
| { |
| "epoch": 5.0466417910447765, |
| "grad_norm": 0.2179887890815735, |
| "learning_rate": 0.0009310457724282034, |
| "loss": 0.4164, |
| "num_input_tokens_seen": 770544, |
| "step": 2705 |
| }, |
| { |
| "epoch": 5.0559701492537314, |
| "grad_norm": 0.27726832032203674, |
| "learning_rate": 0.0009306326779571092, |
| "loss": 0.5732, |
| "num_input_tokens_seen": 771792, |
| "step": 2710 |
| }, |
| { |
| "epoch": 5.065298507462686, |
| "grad_norm": 0.23128509521484375, |
| "learning_rate": 0.0009302184419961731, |
| "loss": 0.5697, |
| "num_input_tokens_seen": 773328, |
| "step": 2715 |
| }, |
| { |
| "epoch": 5.074626865671641, |
| "grad_norm": 0.2378668338060379, |
| "learning_rate": 0.0009298030656434216, |
| "loss": 0.5977, |
| "num_input_tokens_seen": 774992, |
| "step": 2720 |
| }, |
| { |
| "epoch": 5.083955223880597, |
| "grad_norm": 0.24161554872989655, |
| "learning_rate": 0.0009293865499999043, |
| "loss": 0.6412, |
| "num_input_tokens_seen": 776368, |
| "step": 2725 |
| }, |
| { |
| "epoch": 5.093283582089552, |
| "grad_norm": 0.16764408349990845, |
| "learning_rate": 0.0009289688961696904, |
| "loss": 0.4418, |
| "num_input_tokens_seen": 777808, |
| "step": 2730 |
| }, |
| { |
| "epoch": 5.102611940298507, |
| "grad_norm": 0.14279451966285706, |
| "learning_rate": 0.0009285501052598666, |
| "loss": 0.5859, |
| "num_input_tokens_seen": 779376, |
| "step": 2735 |
| }, |
| { |
| "epoch": 5.111940298507463, |
| "grad_norm": 0.2730609178543091, |
| "learning_rate": 0.0009281301783805331, |
| "loss": 0.8133, |
| "num_input_tokens_seen": 780688, |
| "step": 2740 |
| }, |
| { |
| "epoch": 5.121268656716418, |
| "grad_norm": 0.310469388961792, |
| "learning_rate": 0.0009277091166448022, |
| "loss": 0.5642, |
| "num_input_tokens_seen": 782000, |
| "step": 2745 |
| }, |
| { |
| "epoch": 5.130597014925373, |
| "grad_norm": 0.283130407333374, |
| "learning_rate": 0.0009272869211687931, |
| "loss": 0.5266, |
| "num_input_tokens_seen": 783216, |
| "step": 2750 |
| }, |
| { |
| "epoch": 5.139925373134329, |
| "grad_norm": 0.3301401138305664, |
| "learning_rate": 0.0009268635930716314, |
| "loss": 0.5039, |
| "num_input_tokens_seen": 784592, |
| "step": 2755 |
| }, |
| { |
| "epoch": 5.149253731343284, |
| "grad_norm": 0.31221431493759155, |
| "learning_rate": 0.0009264391334754441, |
| "loss": 0.5843, |
| "num_input_tokens_seen": 785968, |
| "step": 2760 |
| }, |
| { |
| "epoch": 5.1585820895522385, |
| "grad_norm": 0.19616518914699554, |
| "learning_rate": 0.0009260135435053583, |
| "loss": 0.3343, |
| "num_input_tokens_seen": 787536, |
| "step": 2765 |
| }, |
| { |
| "epoch": 5.167910447761194, |
| "grad_norm": 0.370902419090271, |
| "learning_rate": 0.0009255868242894967, |
| "loss": 0.7573, |
| "num_input_tokens_seen": 788880, |
| "step": 2770 |
| }, |
| { |
| "epoch": 5.177238805970149, |
| "grad_norm": 0.25199368596076965, |
| "learning_rate": 0.0009251589769589757, |
| "loss": 0.5911, |
| "num_input_tokens_seen": 790192, |
| "step": 2775 |
| }, |
| { |
| "epoch": 5.186567164179104, |
| "grad_norm": 0.24278584122657776, |
| "learning_rate": 0.000924730002647902, |
| "loss": 0.7491, |
| "num_input_tokens_seen": 791504, |
| "step": 2780 |
| }, |
| { |
| "epoch": 5.19589552238806, |
| "grad_norm": 0.33345553278923035, |
| "learning_rate": 0.0009242999024933694, |
| "loss": 0.6366, |
| "num_input_tokens_seen": 792880, |
| "step": 2785 |
| }, |
| { |
| "epoch": 5.205223880597015, |
| "grad_norm": 0.24725379049777985, |
| "learning_rate": 0.0009238686776354564, |
| "loss": 0.6177, |
| "num_input_tokens_seen": 794288, |
| "step": 2790 |
| }, |
| { |
| "epoch": 5.21455223880597, |
| "grad_norm": 0.25025907158851624, |
| "learning_rate": 0.0009234363292172224, |
| "loss": 0.6038, |
| "num_input_tokens_seen": 795536, |
| "step": 2795 |
| }, |
| { |
| "epoch": 5.223880597014926, |
| "grad_norm": 0.2058946043252945, |
| "learning_rate": 0.0009230028583847054, |
| "loss": 0.4427, |
| "num_input_tokens_seen": 796944, |
| "step": 2800 |
| }, |
| { |
| "epoch": 5.233208955223881, |
| "grad_norm": 0.31456539034843445, |
| "learning_rate": 0.000922568266286918, |
| "loss": 0.494, |
| "num_input_tokens_seen": 798384, |
| "step": 2805 |
| }, |
| { |
| "epoch": 5.242537313432836, |
| "grad_norm": 0.30209773778915405, |
| "learning_rate": 0.0009221325540758458, |
| "loss": 0.6301, |
| "num_input_tokens_seen": 799792, |
| "step": 2810 |
| }, |
| { |
| "epoch": 5.251865671641791, |
| "grad_norm": 0.25161316990852356, |
| "learning_rate": 0.0009216957229064429, |
| "loss": 0.7827, |
| "num_input_tokens_seen": 801200, |
| "step": 2815 |
| }, |
| { |
| "epoch": 5.2611940298507465, |
| "grad_norm": 0.3006390631198883, |
| "learning_rate": 0.0009212577739366297, |
| "loss": 0.4305, |
| "num_input_tokens_seen": 802640, |
| "step": 2820 |
| }, |
| { |
| "epoch": 5.270522388059701, |
| "grad_norm": 0.20886574685573578, |
| "learning_rate": 0.0009208187083272894, |
| "loss": 0.6315, |
| "num_input_tokens_seen": 804080, |
| "step": 2825 |
| }, |
| { |
| "epoch": 5.279850746268656, |
| "grad_norm": 0.2783293128013611, |
| "learning_rate": 0.0009203785272422656, |
| "loss": 0.3979, |
| "num_input_tokens_seen": 805616, |
| "step": 2830 |
| }, |
| { |
| "epoch": 5.289179104477612, |
| "grad_norm": 0.18563398718833923, |
| "learning_rate": 0.0009199372318483581, |
| "loss": 0.5115, |
| "num_input_tokens_seen": 807088, |
| "step": 2835 |
| }, |
| { |
| "epoch": 5.298507462686567, |
| "grad_norm": 0.265781432390213, |
| "learning_rate": 0.0009194948233153206, |
| "loss": 0.5374, |
| "num_input_tokens_seen": 808464, |
| "step": 2840 |
| }, |
| { |
| "epoch": 5.307835820895522, |
| "grad_norm": 0.3264658451080322, |
| "learning_rate": 0.0009190513028158578, |
| "loss": 0.5647, |
| "num_input_tokens_seen": 810064, |
| "step": 2845 |
| }, |
| { |
| "epoch": 5.317164179104478, |
| "grad_norm": 0.28654745221138, |
| "learning_rate": 0.0009186066715256213, |
| "loss": 0.5609, |
| "num_input_tokens_seen": 811312, |
| "step": 2850 |
| }, |
| { |
| "epoch": 5.326492537313433, |
| "grad_norm": 0.2565745413303375, |
| "learning_rate": 0.000918160930623208, |
| "loss": 0.4579, |
| "num_input_tokens_seen": 812656, |
| "step": 2855 |
| }, |
| { |
| "epoch": 5.335820895522388, |
| "grad_norm": 0.15798066556453705, |
| "learning_rate": 0.0009177140812901549, |
| "loss": 0.3762, |
| "num_input_tokens_seen": 814128, |
| "step": 2860 |
| }, |
| { |
| "epoch": 5.345149253731344, |
| "grad_norm": 0.23694878816604614, |
| "learning_rate": 0.0009172661247109382, |
| "loss": 0.7089, |
| "num_input_tokens_seen": 815440, |
| "step": 2865 |
| }, |
| { |
| "epoch": 5.354477611940299, |
| "grad_norm": 0.1648920327425003, |
| "learning_rate": 0.0009168170620729683, |
| "loss": 0.7812, |
| "num_input_tokens_seen": 816848, |
| "step": 2870 |
| }, |
| { |
| "epoch": 5.3638059701492535, |
| "grad_norm": 0.22573807835578918, |
| "learning_rate": 0.0009163668945665884, |
| "loss": 0.3872, |
| "num_input_tokens_seen": 818352, |
| "step": 2875 |
| }, |
| { |
| "epoch": 5.373134328358209, |
| "grad_norm": 0.2857038974761963, |
| "learning_rate": 0.0009159156233850693, |
| "loss": 0.6371, |
| "num_input_tokens_seen": 819696, |
| "step": 2880 |
| }, |
| { |
| "epoch": 5.382462686567164, |
| "grad_norm": 0.325327068567276, |
| "learning_rate": 0.0009154632497246081, |
| "loss": 0.461, |
| "num_input_tokens_seen": 821040, |
| "step": 2885 |
| }, |
| { |
| "epoch": 5.391791044776119, |
| "grad_norm": 0.10212195664644241, |
| "learning_rate": 0.0009150097747843242, |
| "loss": 0.4968, |
| "num_input_tokens_seen": 822832, |
| "step": 2890 |
| }, |
| { |
| "epoch": 5.401119402985074, |
| "grad_norm": 0.2704038619995117, |
| "learning_rate": 0.0009145551997662559, |
| "loss": 0.6225, |
| "num_input_tokens_seen": 824304, |
| "step": 2895 |
| }, |
| { |
| "epoch": 5.41044776119403, |
| "grad_norm": 0.14122045040130615, |
| "learning_rate": 0.0009140995258753577, |
| "loss": 0.4699, |
| "num_input_tokens_seen": 825968, |
| "step": 2900 |
| }, |
| { |
| "epoch": 5.419776119402985, |
| "grad_norm": 0.23320503532886505, |
| "learning_rate": 0.0009136427543194967, |
| "loss": 0.3793, |
| "num_input_tokens_seen": 827248, |
| "step": 2905 |
| }, |
| { |
| "epoch": 5.42910447761194, |
| "grad_norm": 0.20187321305274963, |
| "learning_rate": 0.0009131848863094501, |
| "loss": 0.7002, |
| "num_input_tokens_seen": 828656, |
| "step": 2910 |
| }, |
| { |
| "epoch": 5.438432835820896, |
| "grad_norm": 0.25212332606315613, |
| "learning_rate": 0.000912725923058901, |
| "loss": 0.6236, |
| "num_input_tokens_seen": 830160, |
| "step": 2915 |
| }, |
| { |
| "epoch": 5.447761194029851, |
| "grad_norm": 0.2680214047431946, |
| "learning_rate": 0.0009122658657844358, |
| "loss": 0.5077, |
| "num_input_tokens_seen": 831792, |
| "step": 2920 |
| }, |
| { |
| "epoch": 5.457089552238806, |
| "grad_norm": 0.19384333491325378, |
| "learning_rate": 0.0009118047157055412, |
| "loss": 0.4084, |
| "num_input_tokens_seen": 833424, |
| "step": 2925 |
| }, |
| { |
| "epoch": 5.4664179104477615, |
| "grad_norm": 0.2156229168176651, |
| "learning_rate": 0.0009113424740446, |
| "loss": 0.5477, |
| "num_input_tokens_seen": 834896, |
| "step": 2930 |
| }, |
| { |
| "epoch": 5.475746268656716, |
| "grad_norm": 0.11520472913980484, |
| "learning_rate": 0.0009108791420268891, |
| "loss": 0.379, |
| "num_input_tokens_seen": 836208, |
| "step": 2935 |
| }, |
| { |
| "epoch": 5.485074626865671, |
| "grad_norm": 0.29559290409088135, |
| "learning_rate": 0.0009104147208805753, |
| "loss": 0.6259, |
| "num_input_tokens_seen": 837712, |
| "step": 2940 |
| }, |
| { |
| "epoch": 5.494402985074627, |
| "grad_norm": 0.25696009397506714, |
| "learning_rate": 0.0009099492118367123, |
| "loss": 0.5686, |
| "num_input_tokens_seen": 838992, |
| "step": 2945 |
| }, |
| { |
| "epoch": 5.503731343283582, |
| "grad_norm": 0.2539997398853302, |
| "learning_rate": 0.000909482616129238, |
| "loss": 0.3924, |
| "num_input_tokens_seen": 840464, |
| "step": 2950 |
| }, |
| { |
| "epoch": 5.513059701492537, |
| "grad_norm": 0.1732897013425827, |
| "learning_rate": 0.0009090149349949701, |
| "loss": 0.6351, |
| "num_input_tokens_seen": 841968, |
| "step": 2955 |
| }, |
| { |
| "epoch": 5.522388059701493, |
| "grad_norm": 0.22714626789093018, |
| "learning_rate": 0.000908546169673604, |
| "loss": 0.6026, |
| "num_input_tokens_seen": 843280, |
| "step": 2960 |
| }, |
| { |
| "epoch": 5.531716417910448, |
| "grad_norm": 0.15474703907966614, |
| "learning_rate": 0.0009080763214077088, |
| "loss": 0.685, |
| "num_input_tokens_seen": 844848, |
| "step": 2965 |
| }, |
| { |
| "epoch": 5.541044776119403, |
| "grad_norm": 0.183040589094162, |
| "learning_rate": 0.0009076053914427242, |
| "loss": 0.5363, |
| "num_input_tokens_seen": 846224, |
| "step": 2970 |
| }, |
| { |
| "epoch": 5.550373134328359, |
| "grad_norm": 0.2342386096715927, |
| "learning_rate": 0.0009071333810269569, |
| "loss": 0.5128, |
| "num_input_tokens_seen": 847760, |
| "step": 2975 |
| }, |
| { |
| "epoch": 5.559701492537314, |
| "grad_norm": 0.3226813077926636, |
| "learning_rate": 0.0009066602914115781, |
| "loss": 0.5095, |
| "num_input_tokens_seen": 849232, |
| "step": 2980 |
| }, |
| { |
| "epoch": 5.5690298507462686, |
| "grad_norm": 0.17433519661426544, |
| "learning_rate": 0.0009061861238506193, |
| "loss": 0.4562, |
| "num_input_tokens_seen": 850480, |
| "step": 2985 |
| }, |
| { |
| "epoch": 5.5783582089552235, |
| "grad_norm": 0.2655240297317505, |
| "learning_rate": 0.0009057108796009696, |
| "loss": 0.539, |
| "num_input_tokens_seen": 851792, |
| "step": 2990 |
| }, |
| { |
| "epoch": 5.587686567164179, |
| "grad_norm": 0.23230163753032684, |
| "learning_rate": 0.0009052345599223719, |
| "loss": 0.6974, |
| "num_input_tokens_seen": 853136, |
| "step": 2995 |
| }, |
| { |
| "epoch": 5.597014925373134, |
| "grad_norm": 0.369926393032074, |
| "learning_rate": 0.0009047571660774197, |
| "loss": 0.95, |
| "num_input_tokens_seen": 854512, |
| "step": 3000 |
| }, |
| { |
| "epoch": 5.606343283582089, |
| "grad_norm": 0.18043026328086853, |
| "learning_rate": 0.0009042786993315539, |
| "loss": 0.4186, |
| "num_input_tokens_seen": 856112, |
| "step": 3005 |
| }, |
| { |
| "epoch": 5.615671641791045, |
| "grad_norm": 0.24363280832767487, |
| "learning_rate": 0.0009037991609530596, |
| "loss": 0.5985, |
| "num_input_tokens_seen": 857456, |
| "step": 3010 |
| }, |
| { |
| "epoch": 5.625, |
| "grad_norm": 0.19951696693897247, |
| "learning_rate": 0.0009033185522130622, |
| "loss": 0.5111, |
| "num_input_tokens_seen": 859056, |
| "step": 3015 |
| }, |
| { |
| "epoch": 5.634328358208955, |
| "grad_norm": 0.14212666451931, |
| "learning_rate": 0.0009028368743855247, |
| "loss": 0.7327, |
| "num_input_tokens_seen": 860560, |
| "step": 3020 |
| }, |
| { |
| "epoch": 5.643656716417911, |
| "grad_norm": 0.2475363165140152, |
| "learning_rate": 0.0009023541287472434, |
| "loss": 0.55, |
| "num_input_tokens_seen": 862064, |
| "step": 3025 |
| }, |
| { |
| "epoch": 5.652985074626866, |
| "grad_norm": 0.2791047990322113, |
| "learning_rate": 0.0009018703165778457, |
| "loss": 0.5895, |
| "num_input_tokens_seen": 863536, |
| "step": 3030 |
| }, |
| { |
| "epoch": 5.662313432835821, |
| "grad_norm": 0.28386175632476807, |
| "learning_rate": 0.0009013854391597856, |
| "loss": 0.4768, |
| "num_input_tokens_seen": 864880, |
| "step": 3035 |
| }, |
| { |
| "epoch": 5.6716417910447765, |
| "grad_norm": 0.274679571390152, |
| "learning_rate": 0.0009008994977783407, |
| "loss": 0.6087, |
| "num_input_tokens_seen": 866288, |
| "step": 3040 |
| }, |
| { |
| "epoch": 5.6809701492537314, |
| "grad_norm": 0.3296756148338318, |
| "learning_rate": 0.0009004124937216096, |
| "loss": 0.6623, |
| "num_input_tokens_seen": 867664, |
| "step": 3045 |
| }, |
| { |
| "epoch": 5.690298507462686, |
| "grad_norm": 0.24241182208061218, |
| "learning_rate": 0.0008999244282805072, |
| "loss": 0.6314, |
| "num_input_tokens_seen": 869040, |
| "step": 3050 |
| }, |
| { |
| "epoch": 5.699626865671641, |
| "grad_norm": 0.27372506260871887, |
| "learning_rate": 0.0008994353027487616, |
| "loss": 0.416, |
| "num_input_tokens_seen": 870608, |
| "step": 3055 |
| }, |
| { |
| "epoch": 5.708955223880597, |
| "grad_norm": 0.2752893269062042, |
| "learning_rate": 0.0008989451184229118, |
| "loss": 0.7361, |
| "num_input_tokens_seen": 871824, |
| "step": 3060 |
| }, |
| { |
| "epoch": 5.718283582089552, |
| "grad_norm": 0.3607504963874817, |
| "learning_rate": 0.0008984538766023024, |
| "loss": 0.5149, |
| "num_input_tokens_seen": 873168, |
| "step": 3065 |
| }, |
| { |
| "epoch": 5.727611940298507, |
| "grad_norm": 0.2767292559146881, |
| "learning_rate": 0.0008979615785890817, |
| "loss": 0.5195, |
| "num_input_tokens_seen": 874640, |
| "step": 3070 |
| }, |
| { |
| "epoch": 5.736940298507463, |
| "grad_norm": 0.2679109275341034, |
| "learning_rate": 0.0008974682256881974, |
| "loss": 0.4831, |
| "num_input_tokens_seen": 876080, |
| "step": 3075 |
| }, |
| { |
| "epoch": 5.746268656716418, |
| "grad_norm": 0.18482770025730133, |
| "learning_rate": 0.0008969738192073939, |
| "loss": 0.3598, |
| "num_input_tokens_seen": 877680, |
| "step": 3080 |
| }, |
| { |
| "epoch": 5.755597014925373, |
| "grad_norm": 0.23143534362316132, |
| "learning_rate": 0.0008964783604572076, |
| "loss": 0.4919, |
| "num_input_tokens_seen": 879056, |
| "step": 3085 |
| }, |
| { |
| "epoch": 5.764925373134329, |
| "grad_norm": 0.2972404360771179, |
| "learning_rate": 0.0008959818507509649, |
| "loss": 0.5199, |
| "num_input_tokens_seen": 880368, |
| "step": 3090 |
| }, |
| { |
| "epoch": 5.774253731343284, |
| "grad_norm": 0.31133216619491577, |
| "learning_rate": 0.0008954842914047776, |
| "loss": 0.6719, |
| "num_input_tokens_seen": 881648, |
| "step": 3095 |
| }, |
| { |
| "epoch": 5.7835820895522385, |
| "grad_norm": 0.16302241384983063, |
| "learning_rate": 0.0008949856837375397, |
| "loss": 0.4121, |
| "num_input_tokens_seen": 883024, |
| "step": 3100 |
| }, |
| { |
| "epoch": 5.792910447761194, |
| "grad_norm": 0.3164055049419403, |
| "learning_rate": 0.0008944860290709245, |
| "loss": 0.4792, |
| "num_input_tokens_seen": 884240, |
| "step": 3105 |
| }, |
| { |
| "epoch": 5.802238805970149, |
| "grad_norm": 0.23580315709114075, |
| "learning_rate": 0.0008939853287293802, |
| "loss": 0.5963, |
| "num_input_tokens_seen": 885744, |
| "step": 3110 |
| }, |
| { |
| "epoch": 5.811567164179104, |
| "grad_norm": 0.22375214099884033, |
| "learning_rate": 0.000893483584040127, |
| "loss": 0.5519, |
| "num_input_tokens_seen": 887440, |
| "step": 3115 |
| }, |
| { |
| "epoch": 5.82089552238806, |
| "grad_norm": 0.17884212732315063, |
| "learning_rate": 0.000892980796333153, |
| "loss": 0.5346, |
| "num_input_tokens_seen": 888720, |
| "step": 3120 |
| }, |
| { |
| "epoch": 5.830223880597015, |
| "grad_norm": 0.21421754360198975, |
| "learning_rate": 0.0008924769669412116, |
| "loss": 0.4967, |
| "num_input_tokens_seen": 890288, |
| "step": 3125 |
| }, |
| { |
| "epoch": 5.83955223880597, |
| "grad_norm": 0.21498635411262512, |
| "learning_rate": 0.0008919720971998172, |
| "loss": 0.681, |
| "num_input_tokens_seen": 891760, |
| "step": 3130 |
| }, |
| { |
| "epoch": 5.848880597014926, |
| "grad_norm": 0.30558252334594727, |
| "learning_rate": 0.0008914661884472418, |
| "loss": 0.5239, |
| "num_input_tokens_seen": 893072, |
| "step": 3135 |
| }, |
| { |
| "epoch": 5.858208955223881, |
| "grad_norm": 0.2122422456741333, |
| "learning_rate": 0.0008909592420245116, |
| "loss": 0.3839, |
| "num_input_tokens_seen": 894544, |
| "step": 3140 |
| }, |
| { |
| "epoch": 5.867537313432836, |
| "grad_norm": 0.23761402070522308, |
| "learning_rate": 0.0008904512592754033, |
| "loss": 0.5714, |
| "num_input_tokens_seen": 895824, |
| "step": 3145 |
| }, |
| { |
| "epoch": 5.8768656716417915, |
| "grad_norm": 0.26812535524368286, |
| "learning_rate": 0.0008899422415464408, |
| "loss": 0.4689, |
| "num_input_tokens_seen": 897296, |
| "step": 3150 |
| }, |
| { |
| "epoch": 5.8861940298507465, |
| "grad_norm": 0.2236464023590088, |
| "learning_rate": 0.0008894321901868915, |
| "loss": 0.4325, |
| "num_input_tokens_seen": 898608, |
| "step": 3155 |
| }, |
| { |
| "epoch": 5.895522388059701, |
| "grad_norm": 0.2537100315093994, |
| "learning_rate": 0.0008889211065487621, |
| "loss": 0.4664, |
| "num_input_tokens_seen": 899952, |
| "step": 3160 |
| }, |
| { |
| "epoch": 5.904850746268656, |
| "grad_norm": 0.2743018865585327, |
| "learning_rate": 0.0008884089919867963, |
| "loss": 0.5646, |
| "num_input_tokens_seen": 901296, |
| "step": 3165 |
| }, |
| { |
| "epoch": 5.914179104477612, |
| "grad_norm": 0.1934853196144104, |
| "learning_rate": 0.0008878958478584703, |
| "loss": 0.4589, |
| "num_input_tokens_seen": 902768, |
| "step": 3170 |
| }, |
| { |
| "epoch": 5.923507462686567, |
| "grad_norm": 0.28554612398147583, |
| "learning_rate": 0.000887381675523989, |
| "loss": 0.8204, |
| "num_input_tokens_seen": 904112, |
| "step": 3175 |
| }, |
| { |
| "epoch": 5.932835820895522, |
| "grad_norm": 0.2392246276140213, |
| "learning_rate": 0.0008868664763462832, |
| "loss": 0.3573, |
| "num_input_tokens_seen": 905680, |
| "step": 3180 |
| }, |
| { |
| "epoch": 5.942164179104478, |
| "grad_norm": 0.23579584062099457, |
| "learning_rate": 0.0008863502516910058, |
| "loss": 0.5725, |
| "num_input_tokens_seen": 907024, |
| "step": 3185 |
| }, |
| { |
| "epoch": 5.951492537313433, |
| "grad_norm": 0.1369100958108902, |
| "learning_rate": 0.0008858330029265271, |
| "loss": 0.5443, |
| "num_input_tokens_seen": 908368, |
| "step": 3190 |
| }, |
| { |
| "epoch": 5.960820895522388, |
| "grad_norm": 0.19882099330425262, |
| "learning_rate": 0.0008853147314239329, |
| "loss": 0.5867, |
| "num_input_tokens_seen": 909808, |
| "step": 3195 |
| }, |
| { |
| "epoch": 5.970149253731344, |
| "grad_norm": 0.23294775187969208, |
| "learning_rate": 0.0008847954385570198, |
| "loss": 0.6957, |
| "num_input_tokens_seen": 911344, |
| "step": 3200 |
| }, |
| { |
| "epoch": 5.979477611940299, |
| "grad_norm": 0.18562950193881989, |
| "learning_rate": 0.0008842751257022911, |
| "loss": 0.4195, |
| "num_input_tokens_seen": 912752, |
| "step": 3205 |
| }, |
| { |
| "epoch": 5.9888059701492535, |
| "grad_norm": 0.2487240731716156, |
| "learning_rate": 0.0008837537942389551, |
| "loss": 0.7351, |
| "num_input_tokens_seen": 914032, |
| "step": 3210 |
| }, |
| { |
| "epoch": 5.9981343283582085, |
| "grad_norm": 0.12646648287773132, |
| "learning_rate": 0.0008832314455489188, |
| "loss": 0.4184, |
| "num_input_tokens_seen": 915472, |
| "step": 3215 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.6421719789505005, |
| "eval_runtime": 4.1775, |
| "eval_samples_per_second": 56.972, |
| "eval_steps_per_second": 14.363, |
| "num_input_tokens_seen": 915528, |
| "step": 3216 |
| }, |
| { |
| "epoch": 6.007462686567164, |
| "grad_norm": 0.2584002912044525, |
| "learning_rate": 0.0008827080810167864, |
| "loss": 0.5532, |
| "num_input_tokens_seen": 916680, |
| "step": 3220 |
| }, |
| { |
| "epoch": 6.016791044776119, |
| "grad_norm": 0.23565295338630676, |
| "learning_rate": 0.0008821837020298546, |
| "loss": 0.603, |
| "num_input_tokens_seen": 918120, |
| "step": 3225 |
| }, |
| { |
| "epoch": 6.026119402985074, |
| "grad_norm": 0.17623180150985718, |
| "learning_rate": 0.0008816583099781093, |
| "loss": 0.5479, |
| "num_input_tokens_seen": 919496, |
| "step": 3230 |
| }, |
| { |
| "epoch": 6.03544776119403, |
| "grad_norm": 0.22531640529632568, |
| "learning_rate": 0.0008811319062542214, |
| "loss": 0.6126, |
| "num_input_tokens_seen": 920808, |
| "step": 3235 |
| }, |
| { |
| "epoch": 6.044776119402985, |
| "grad_norm": 0.27955806255340576, |
| "learning_rate": 0.0008806044922535436, |
| "loss": 0.4747, |
| "num_input_tokens_seen": 922024, |
| "step": 3240 |
| }, |
| { |
| "epoch": 6.05410447761194, |
| "grad_norm": 0.20974354445934296, |
| "learning_rate": 0.0008800760693741068, |
| "loss": 0.5458, |
| "num_input_tokens_seen": 923432, |
| "step": 3245 |
| }, |
| { |
| "epoch": 6.063432835820896, |
| "grad_norm": 0.14437653124332428, |
| "learning_rate": 0.0008795466390166161, |
| "loss": 0.4203, |
| "num_input_tokens_seen": 924936, |
| "step": 3250 |
| }, |
| { |
| "epoch": 6.072761194029851, |
| "grad_norm": 0.2256515473127365, |
| "learning_rate": 0.000879016202584447, |
| "loss": 0.4165, |
| "num_input_tokens_seen": 926408, |
| "step": 3255 |
| }, |
| { |
| "epoch": 6.082089552238806, |
| "grad_norm": 0.16263511776924133, |
| "learning_rate": 0.0008784847614836418, |
| "loss": 0.4719, |
| "num_input_tokens_seen": 927944, |
| "step": 3260 |
| }, |
| { |
| "epoch": 6.0914179104477615, |
| "grad_norm": 0.19972696900367737, |
| "learning_rate": 0.000877952317122906, |
| "loss": 0.7397, |
| "num_input_tokens_seen": 929416, |
| "step": 3265 |
| }, |
| { |
| "epoch": 6.100746268656716, |
| "grad_norm": 0.2780134975910187, |
| "learning_rate": 0.0008774188709136045, |
| "loss": 0.5968, |
| "num_input_tokens_seen": 930632, |
| "step": 3270 |
| }, |
| { |
| "epoch": 6.110074626865671, |
| "grad_norm": 0.38921645283699036, |
| "learning_rate": 0.0008768844242697578, |
| "loss": 0.7472, |
| "num_input_tokens_seen": 932168, |
| "step": 3275 |
| }, |
| { |
| "epoch": 6.119402985074627, |
| "grad_norm": 0.2642660439014435, |
| "learning_rate": 0.0008763489786080383, |
| "loss": 0.5761, |
| "num_input_tokens_seen": 933416, |
| "step": 3280 |
| }, |
| { |
| "epoch": 6.128731343283582, |
| "grad_norm": 0.3693527579307556, |
| "learning_rate": 0.0008758125353477663, |
| "loss": 0.6748, |
| "num_input_tokens_seen": 934824, |
| "step": 3285 |
| }, |
| { |
| "epoch": 6.138059701492537, |
| "grad_norm": 0.15553449094295502, |
| "learning_rate": 0.000875275095910907, |
| "loss": 0.6219, |
| "num_input_tokens_seen": 936200, |
| "step": 3290 |
| }, |
| { |
| "epoch": 6.147388059701493, |
| "grad_norm": 0.16091737151145935, |
| "learning_rate": 0.0008747366617220656, |
| "loss": 0.4781, |
| "num_input_tokens_seen": 937704, |
| "step": 3295 |
| }, |
| { |
| "epoch": 6.156716417910448, |
| "grad_norm": 0.23516695201396942, |
| "learning_rate": 0.0008741972342084843, |
| "loss": 0.452, |
| "num_input_tokens_seen": 939272, |
| "step": 3300 |
| }, |
| { |
| "epoch": 6.166044776119403, |
| "grad_norm": 0.2708747684955597, |
| "learning_rate": 0.0008736568148000385, |
| "loss": 0.4932, |
| "num_input_tokens_seen": 940520, |
| "step": 3305 |
| }, |
| { |
| "epoch": 6.175373134328359, |
| "grad_norm": 0.3128451108932495, |
| "learning_rate": 0.0008731154049292329, |
| "loss": 0.3687, |
| "num_input_tokens_seen": 942120, |
| "step": 3310 |
| }, |
| { |
| "epoch": 6.184701492537314, |
| "grad_norm": 0.39994189143180847, |
| "learning_rate": 0.0008725730060311972, |
| "loss": 0.4669, |
| "num_input_tokens_seen": 943528, |
| "step": 3315 |
| }, |
| { |
| "epoch": 6.1940298507462686, |
| "grad_norm": 0.11239203810691833, |
| "learning_rate": 0.0008720296195436831, |
| "loss": 0.3533, |
| "num_input_tokens_seen": 945096, |
| "step": 3320 |
| }, |
| { |
| "epoch": 6.2033582089552235, |
| "grad_norm": 0.26304325461387634, |
| "learning_rate": 0.0008714852469070602, |
| "loss": 0.5458, |
| "num_input_tokens_seen": 946472, |
| "step": 3325 |
| }, |
| { |
| "epoch": 6.212686567164179, |
| "grad_norm": 0.13661259412765503, |
| "learning_rate": 0.0008709398895643117, |
| "loss": 0.4306, |
| "num_input_tokens_seen": 948072, |
| "step": 3330 |
| }, |
| { |
| "epoch": 6.222014925373134, |
| "grad_norm": 0.23913314938545227, |
| "learning_rate": 0.0008703935489610315, |
| "loss": 0.5885, |
| "num_input_tokens_seen": 949512, |
| "step": 3335 |
| }, |
| { |
| "epoch": 6.231343283582089, |
| "grad_norm": 0.4055419862270355, |
| "learning_rate": 0.0008698462265454197, |
| "loss": 0.5071, |
| "num_input_tokens_seen": 950888, |
| "step": 3340 |
| }, |
| { |
| "epoch": 6.240671641791045, |
| "grad_norm": 0.1768474131822586, |
| "learning_rate": 0.0008692979237682786, |
| "loss": 0.3251, |
| "num_input_tokens_seen": 952392, |
| "step": 3345 |
| }, |
| { |
| "epoch": 6.25, |
| "grad_norm": 0.30177706480026245, |
| "learning_rate": 0.0008687486420830093, |
| "loss": 0.756, |
| "num_input_tokens_seen": 953736, |
| "step": 3350 |
| }, |
| { |
| "epoch": 6.259328358208955, |
| "grad_norm": 0.23593708872795105, |
| "learning_rate": 0.000868198382945608, |
| "loss": 0.6532, |
| "num_input_tokens_seen": 955272, |
| "step": 3355 |
| }, |
| { |
| "epoch": 6.268656716417911, |
| "grad_norm": 0.2411566525697708, |
| "learning_rate": 0.0008676471478146617, |
| "loss": 0.3248, |
| "num_input_tokens_seen": 956648, |
| "step": 3360 |
| }, |
| { |
| "epoch": 6.277985074626866, |
| "grad_norm": 0.18759582936763763, |
| "learning_rate": 0.0008670949381513445, |
| "loss": 0.4224, |
| "num_input_tokens_seen": 958024, |
| "step": 3365 |
| }, |
| { |
| "epoch": 6.287313432835821, |
| "grad_norm": 0.2575382888317108, |
| "learning_rate": 0.0008665417554194135, |
| "loss": 0.5043, |
| "num_input_tokens_seen": 959400, |
| "step": 3370 |
| }, |
| { |
| "epoch": 6.2966417910447765, |
| "grad_norm": 0.32555249333381653, |
| "learning_rate": 0.0008659876010852055, |
| "loss": 0.6098, |
| "num_input_tokens_seen": 960872, |
| "step": 3375 |
| }, |
| { |
| "epoch": 6.3059701492537314, |
| "grad_norm": 0.24410775303840637, |
| "learning_rate": 0.0008654324766176325, |
| "loss": 0.7446, |
| "num_input_tokens_seen": 962184, |
| "step": 3380 |
| }, |
| { |
| "epoch": 6.315298507462686, |
| "grad_norm": 0.1657802164554596, |
| "learning_rate": 0.000864876383488178, |
| "loss": 0.5946, |
| "num_input_tokens_seen": 963496, |
| "step": 3385 |
| }, |
| { |
| "epoch": 6.324626865671641, |
| "grad_norm": 0.22825486958026886, |
| "learning_rate": 0.0008643193231708937, |
| "loss": 0.4938, |
| "num_input_tokens_seen": 965032, |
| "step": 3390 |
| }, |
| { |
| "epoch": 6.333955223880597, |
| "grad_norm": 0.11497091501951218, |
| "learning_rate": 0.0008637612971423943, |
| "loss": 0.4575, |
| "num_input_tokens_seen": 966696, |
| "step": 3395 |
| }, |
| { |
| "epoch": 6.343283582089552, |
| "grad_norm": 0.11938580870628357, |
| "learning_rate": 0.000863202306881855, |
| "loss": 0.3512, |
| "num_input_tokens_seen": 968200, |
| "step": 3400 |
| }, |
| { |
| "epoch": 6.352611940298507, |
| "grad_norm": 0.2507960796356201, |
| "learning_rate": 0.0008626423538710062, |
| "loss": 0.4401, |
| "num_input_tokens_seen": 969672, |
| "step": 3405 |
| }, |
| { |
| "epoch": 6.361940298507463, |
| "grad_norm": 0.1698283851146698, |
| "learning_rate": 0.000862081439594131, |
| "loss": 0.4485, |
| "num_input_tokens_seen": 971112, |
| "step": 3410 |
| }, |
| { |
| "epoch": 6.371268656716418, |
| "grad_norm": 0.12110260128974915, |
| "learning_rate": 0.00086151956553806, |
| "loss": 0.5036, |
| "num_input_tokens_seen": 972456, |
| "step": 3415 |
| }, |
| { |
| "epoch": 6.380597014925373, |
| "grad_norm": 0.27388572692871094, |
| "learning_rate": 0.0008609567331921684, |
| "loss": 0.4426, |
| "num_input_tokens_seen": 973992, |
| "step": 3420 |
| }, |
| { |
| "epoch": 6.389925373134329, |
| "grad_norm": 0.3161713778972626, |
| "learning_rate": 0.0008603929440483713, |
| "loss": 0.5486, |
| "num_input_tokens_seen": 975464, |
| "step": 3425 |
| }, |
| { |
| "epoch": 6.399253731343284, |
| "grad_norm": 0.35460421442985535, |
| "learning_rate": 0.0008598281996011199, |
| "loss": 0.5463, |
| "num_input_tokens_seen": 976840, |
| "step": 3430 |
| }, |
| { |
| "epoch": 6.4085820895522385, |
| "grad_norm": 0.29080653190612793, |
| "learning_rate": 0.0008592625013473978, |
| "loss": 0.6731, |
| "num_input_tokens_seen": 978344, |
| "step": 3435 |
| }, |
| { |
| "epoch": 6.417910447761194, |
| "grad_norm": 0.27483072876930237, |
| "learning_rate": 0.0008586958507867168, |
| "loss": 0.7566, |
| "num_input_tokens_seen": 979752, |
| "step": 3440 |
| }, |
| { |
| "epoch": 6.427238805970149, |
| "grad_norm": 0.22223156690597534, |
| "learning_rate": 0.0008581282494211134, |
| "loss": 0.4567, |
| "num_input_tokens_seen": 981192, |
| "step": 3445 |
| }, |
| { |
| "epoch": 6.436567164179104, |
| "grad_norm": 0.23036664724349976, |
| "learning_rate": 0.0008575596987551438, |
| "loss": 0.5825, |
| "num_input_tokens_seen": 982536, |
| "step": 3450 |
| }, |
| { |
| "epoch": 6.44589552238806, |
| "grad_norm": 0.19232529401779175, |
| "learning_rate": 0.000856990200295881, |
| "loss": 0.4377, |
| "num_input_tokens_seen": 984008, |
| "step": 3455 |
| }, |
| { |
| "epoch": 6.455223880597015, |
| "grad_norm": 0.28331199288368225, |
| "learning_rate": 0.00085641975555291, |
| "loss": 0.5706, |
| "num_input_tokens_seen": 985448, |
| "step": 3460 |
| }, |
| { |
| "epoch": 6.46455223880597, |
| "grad_norm": 0.24698172509670258, |
| "learning_rate": 0.0008558483660383245, |
| "loss": 0.4667, |
| "num_input_tokens_seen": 986920, |
| "step": 3465 |
| }, |
| { |
| "epoch": 6.473880597014926, |
| "grad_norm": 0.23428291082382202, |
| "learning_rate": 0.0008552760332667223, |
| "loss": 0.5937, |
| "num_input_tokens_seen": 988520, |
| "step": 3470 |
| }, |
| { |
| "epoch": 6.483208955223881, |
| "grad_norm": 0.32097363471984863, |
| "learning_rate": 0.0008547027587552012, |
| "loss": 0.6805, |
| "num_input_tokens_seen": 989928, |
| "step": 3475 |
| }, |
| { |
| "epoch": 6.492537313432836, |
| "grad_norm": 0.3465026915073395, |
| "learning_rate": 0.0008541285440233562, |
| "loss": 0.4623, |
| "num_input_tokens_seen": 991368, |
| "step": 3480 |
| }, |
| { |
| "epoch": 6.5018656716417915, |
| "grad_norm": 0.19996623694896698, |
| "learning_rate": 0.0008535533905932737, |
| "loss": 0.4461, |
| "num_input_tokens_seen": 992712, |
| "step": 3485 |
| }, |
| { |
| "epoch": 6.5111940298507465, |
| "grad_norm": 0.20951132476329803, |
| "learning_rate": 0.0008529772999895289, |
| "loss": 0.5706, |
| "num_input_tokens_seen": 994024, |
| "step": 3490 |
| }, |
| { |
| "epoch": 6.520522388059701, |
| "grad_norm": 0.2945927083492279, |
| "learning_rate": 0.0008524002737391807, |
| "loss": 0.67, |
| "num_input_tokens_seen": 995400, |
| "step": 3495 |
| }, |
| { |
| "epoch": 6.529850746268656, |
| "grad_norm": 0.23618575930595398, |
| "learning_rate": 0.0008518223133717687, |
| "loss": 0.4631, |
| "num_input_tokens_seen": 997032, |
| "step": 3500 |
| }, |
| { |
| "epoch": 6.539179104477612, |
| "grad_norm": 0.19505491852760315, |
| "learning_rate": 0.0008512434204193079, |
| "loss": 0.4377, |
| "num_input_tokens_seen": 998408, |
| "step": 3505 |
| }, |
| { |
| "epoch": 6.548507462686567, |
| "grad_norm": 0.26609039306640625, |
| "learning_rate": 0.000850663596416286, |
| "loss": 0.522, |
| "num_input_tokens_seen": 999944, |
| "step": 3510 |
| }, |
| { |
| "epoch": 6.557835820895522, |
| "grad_norm": 0.2637292444705963, |
| "learning_rate": 0.0008500828428996583, |
| "loss": 0.293, |
| "num_input_tokens_seen": 1001480, |
| "step": 3515 |
| }, |
| { |
| "epoch": 6.567164179104478, |
| "grad_norm": 0.3107680082321167, |
| "learning_rate": 0.0008495011614088439, |
| "loss": 0.6297, |
| "num_input_tokens_seen": 1002792, |
| "step": 3520 |
| }, |
| { |
| "epoch": 6.576492537313433, |
| "grad_norm": 0.20006173849105835, |
| "learning_rate": 0.0008489185534857223, |
| "loss": 0.3466, |
| "num_input_tokens_seen": 1004264, |
| "step": 3525 |
| }, |
| { |
| "epoch": 6.585820895522388, |
| "grad_norm": 0.19617074728012085, |
| "learning_rate": 0.0008483350206746278, |
| "loss": 0.4365, |
| "num_input_tokens_seen": 1005704, |
| "step": 3530 |
| }, |
| { |
| "epoch": 6.595149253731344, |
| "grad_norm": 0.10148897767066956, |
| "learning_rate": 0.000847750564522347, |
| "loss": 0.4371, |
| "num_input_tokens_seen": 1007240, |
| "step": 3535 |
| }, |
| { |
| "epoch": 6.604477611940299, |
| "grad_norm": 0.42178183794021606, |
| "learning_rate": 0.000847165186578114, |
| "loss": 0.4784, |
| "num_input_tokens_seen": 1008648, |
| "step": 3540 |
| }, |
| { |
| "epoch": 6.6138059701492535, |
| "grad_norm": 0.41911423206329346, |
| "learning_rate": 0.0008465788883936059, |
| "loss": 0.5173, |
| "num_input_tokens_seen": 1010088, |
| "step": 3545 |
| }, |
| { |
| "epoch": 6.6231343283582085, |
| "grad_norm": 0.1649949848651886, |
| "learning_rate": 0.0008459916715229396, |
| "loss": 0.39, |
| "num_input_tokens_seen": 1011432, |
| "step": 3550 |
| }, |
| { |
| "epoch": 6.632462686567164, |
| "grad_norm": 0.3317459225654602, |
| "learning_rate": 0.000845403537522667, |
| "loss": 0.6234, |
| "num_input_tokens_seen": 1012840, |
| "step": 3555 |
| }, |
| { |
| "epoch": 6.641791044776119, |
| "grad_norm": 0.16076388955116272, |
| "learning_rate": 0.0008448144879517705, |
| "loss": 0.5754, |
| "num_input_tokens_seen": 1014216, |
| "step": 3560 |
| }, |
| { |
| "epoch": 6.651119402985074, |
| "grad_norm": 0.2902985215187073, |
| "learning_rate": 0.0008442245243716606, |
| "loss": 0.5202, |
| "num_input_tokens_seen": 1015560, |
| "step": 3565 |
| }, |
| { |
| "epoch": 6.66044776119403, |
| "grad_norm": 0.22482368350028992, |
| "learning_rate": 0.0008436336483461695, |
| "loss": 0.6555, |
| "num_input_tokens_seen": 1017064, |
| "step": 3570 |
| }, |
| { |
| "epoch": 6.669776119402985, |
| "grad_norm": 0.2745371162891388, |
| "learning_rate": 0.0008430418614415487, |
| "loss": 0.496, |
| "num_input_tokens_seen": 1018312, |
| "step": 3575 |
| }, |
| { |
| "epoch": 6.67910447761194, |
| "grad_norm": 0.15380549430847168, |
| "learning_rate": 0.0008424491652264639, |
| "loss": 0.5961, |
| "num_input_tokens_seen": 1019688, |
| "step": 3580 |
| }, |
| { |
| "epoch": 6.688432835820896, |
| "grad_norm": 0.18027140200138092, |
| "learning_rate": 0.000841855561271991, |
| "loss": 0.6227, |
| "num_input_tokens_seen": 1021128, |
| "step": 3585 |
| }, |
| { |
| "epoch": 6.697761194029851, |
| "grad_norm": 0.2426953762769699, |
| "learning_rate": 0.0008412610511516125, |
| "loss": 0.4379, |
| "num_input_tokens_seen": 1022504, |
| "step": 3590 |
| }, |
| { |
| "epoch": 6.707089552238806, |
| "grad_norm": 0.23552139103412628, |
| "learning_rate": 0.0008406656364412128, |
| "loss": 0.7039, |
| "num_input_tokens_seen": 1023752, |
| "step": 3595 |
| }, |
| { |
| "epoch": 6.7164179104477615, |
| "grad_norm": 0.25078070163726807, |
| "learning_rate": 0.0008400693187190736, |
| "loss": 0.4357, |
| "num_input_tokens_seen": 1025224, |
| "step": 3600 |
| }, |
| { |
| "epoch": 6.725746268656716, |
| "grad_norm": 0.2658351957798004, |
| "learning_rate": 0.000839472099565871, |
| "loss": 0.8229, |
| "num_input_tokens_seen": 1026568, |
| "step": 3605 |
| }, |
| { |
| "epoch": 6.735074626865671, |
| "grad_norm": 0.25117790699005127, |
| "learning_rate": 0.00083887398056467, |
| "loss": 0.7487, |
| "num_input_tokens_seen": 1027912, |
| "step": 3610 |
| }, |
| { |
| "epoch": 6.744402985074627, |
| "grad_norm": 0.1720697283744812, |
| "learning_rate": 0.000838274963300921, |
| "loss": 0.4305, |
| "num_input_tokens_seen": 1029384, |
| "step": 3615 |
| }, |
| { |
| "epoch": 6.753731343283582, |
| "grad_norm": 0.21772412955760956, |
| "learning_rate": 0.0008376750493624555, |
| "loss": 0.5213, |
| "num_input_tokens_seen": 1030952, |
| "step": 3620 |
| }, |
| { |
| "epoch": 6.763059701492537, |
| "grad_norm": 0.21206441521644592, |
| "learning_rate": 0.000837074240339482, |
| "loss": 0.6029, |
| "num_input_tokens_seen": 1032264, |
| "step": 3625 |
| }, |
| { |
| "epoch": 6.772388059701493, |
| "grad_norm": 0.19085289537906647, |
| "learning_rate": 0.0008364725378245811, |
| "loss": 0.5669, |
| "num_input_tokens_seen": 1033544, |
| "step": 3630 |
| }, |
| { |
| "epoch": 6.781716417910448, |
| "grad_norm": 0.2896425724029541, |
| "learning_rate": 0.0008358699434127024, |
| "loss": 0.6797, |
| "num_input_tokens_seen": 1035112, |
| "step": 3635 |
| }, |
| { |
| "epoch": 6.791044776119403, |
| "grad_norm": 0.18138831853866577, |
| "learning_rate": 0.0008352664587011595, |
| "loss": 0.42, |
| "num_input_tokens_seen": 1036488, |
| "step": 3640 |
| }, |
| { |
| "epoch": 6.800373134328359, |
| "grad_norm": 0.36892759799957275, |
| "learning_rate": 0.0008346620852896256, |
| "loss": 0.6121, |
| "num_input_tokens_seen": 1037960, |
| "step": 3645 |
| }, |
| { |
| "epoch": 6.809701492537314, |
| "grad_norm": 0.24577677249908447, |
| "learning_rate": 0.00083405682478013, |
| "loss": 0.6551, |
| "num_input_tokens_seen": 1039304, |
| "step": 3650 |
| }, |
| { |
| "epoch": 6.8190298507462686, |
| "grad_norm": 0.2634584903717041, |
| "learning_rate": 0.0008334506787770532, |
| "loss": 0.5676, |
| "num_input_tokens_seen": 1040616, |
| "step": 3655 |
| }, |
| { |
| "epoch": 6.8283582089552235, |
| "grad_norm": 0.3486064076423645, |
| "learning_rate": 0.0008328436488871234, |
| "loss": 0.4678, |
| "num_input_tokens_seen": 1042056, |
| "step": 3660 |
| }, |
| { |
| "epoch": 6.837686567164179, |
| "grad_norm": 0.14382286369800568, |
| "learning_rate": 0.0008322357367194109, |
| "loss": 0.474, |
| "num_input_tokens_seen": 1043528, |
| "step": 3665 |
| }, |
| { |
| "epoch": 6.847014925373134, |
| "grad_norm": 0.19818229973316193, |
| "learning_rate": 0.0008316269438853255, |
| "loss": 0.5245, |
| "num_input_tokens_seen": 1044904, |
| "step": 3670 |
| }, |
| { |
| "epoch": 6.856343283582089, |
| "grad_norm": 0.36218205094337463, |
| "learning_rate": 0.0008310172719986108, |
| "loss": 0.5821, |
| "num_input_tokens_seen": 1046408, |
| "step": 3675 |
| }, |
| { |
| "epoch": 6.865671641791045, |
| "grad_norm": 0.22650587558746338, |
| "learning_rate": 0.0008304067226753408, |
| "loss": 0.4627, |
| "num_input_tokens_seen": 1047912, |
| "step": 3680 |
| }, |
| { |
| "epoch": 6.875, |
| "grad_norm": 0.23026876151561737, |
| "learning_rate": 0.0008297952975339155, |
| "loss": 0.7357, |
| "num_input_tokens_seen": 1049320, |
| "step": 3685 |
| }, |
| { |
| "epoch": 6.884328358208955, |
| "grad_norm": 0.2109512984752655, |
| "learning_rate": 0.0008291829981950562, |
| "loss": 0.4451, |
| "num_input_tokens_seen": 1050632, |
| "step": 3690 |
| }, |
| { |
| "epoch": 6.893656716417911, |
| "grad_norm": 0.1196155846118927, |
| "learning_rate": 0.0008285698262818016, |
| "loss": 0.4494, |
| "num_input_tokens_seen": 1051976, |
| "step": 3695 |
| }, |
| { |
| "epoch": 6.902985074626866, |
| "grad_norm": 0.1713961511850357, |
| "learning_rate": 0.0008279557834195031, |
| "loss": 0.6417, |
| "num_input_tokens_seen": 1053224, |
| "step": 3700 |
| }, |
| { |
| "epoch": 6.912313432835821, |
| "grad_norm": 0.3117183446884155, |
| "learning_rate": 0.000827340871235821, |
| "loss": 0.4689, |
| "num_input_tokens_seen": 1054568, |
| "step": 3705 |
| }, |
| { |
| "epoch": 6.9216417910447765, |
| "grad_norm": 0.2057737559080124, |
| "learning_rate": 0.00082672509136072, |
| "loss": 0.5804, |
| "num_input_tokens_seen": 1055944, |
| "step": 3710 |
| }, |
| { |
| "epoch": 6.9309701492537314, |
| "grad_norm": 0.21782958507537842, |
| "learning_rate": 0.0008261084454264647, |
| "loss": 0.3641, |
| "num_input_tokens_seen": 1057384, |
| "step": 3715 |
| }, |
| { |
| "epoch": 6.940298507462686, |
| "grad_norm": 0.33613821864128113, |
| "learning_rate": 0.0008254909350676151, |
| "loss": 0.6094, |
| "num_input_tokens_seen": 1058920, |
| "step": 3720 |
| }, |
| { |
| "epoch": 6.949626865671641, |
| "grad_norm": 0.480368435382843, |
| "learning_rate": 0.0008248725619210233, |
| "loss": 0.4046, |
| "num_input_tokens_seen": 1060424, |
| "step": 3725 |
| }, |
| { |
| "epoch": 6.958955223880597, |
| "grad_norm": 0.36097168922424316, |
| "learning_rate": 0.0008242533276258277, |
| "loss": 0.2856, |
| "num_input_tokens_seen": 1061832, |
| "step": 3730 |
| }, |
| { |
| "epoch": 6.968283582089552, |
| "grad_norm": 0.1994985193014145, |
| "learning_rate": 0.0008236332338234496, |
| "loss": 0.6103, |
| "num_input_tokens_seen": 1063176, |
| "step": 3735 |
| }, |
| { |
| "epoch": 6.977611940298507, |
| "grad_norm": 0.15217134356498718, |
| "learning_rate": 0.0008230122821575884, |
| "loss": 0.3072, |
| "num_input_tokens_seen": 1064616, |
| "step": 3740 |
| }, |
| { |
| "epoch": 6.986940298507463, |
| "grad_norm": 0.33062049746513367, |
| "learning_rate": 0.0008223904742742181, |
| "loss": 0.3627, |
| "num_input_tokens_seen": 1066152, |
| "step": 3745 |
| }, |
| { |
| "epoch": 6.996268656716418, |
| "grad_norm": 0.30312541127204895, |
| "learning_rate": 0.0008217678118215819, |
| "loss": 0.5468, |
| "num_input_tokens_seen": 1067592, |
| "step": 3750 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.6684337258338928, |
| "eval_runtime": 4.1842, |
| "eval_samples_per_second": 56.881, |
| "eval_steps_per_second": 14.34, |
| "num_input_tokens_seen": 1067904, |
| "step": 3752 |
| }, |
| { |
| "epoch": 7.005597014925373, |
| "grad_norm": 0.30474305152893066, |
| "learning_rate": 0.0008211442964501879, |
| "loss": 0.5487, |
| "num_input_tokens_seen": 1068736, |
| "step": 3755 |
| }, |
| { |
| "epoch": 7.014925373134329, |
| "grad_norm": 0.44127199053764343, |
| "learning_rate": 0.0008205199298128055, |
| "loss": 0.6161, |
| "num_input_tokens_seen": 1070176, |
| "step": 3760 |
| }, |
| { |
| "epoch": 7.024253731343284, |
| "grad_norm": 0.2390872836112976, |
| "learning_rate": 0.0008198947135644606, |
| "loss": 0.5455, |
| "num_input_tokens_seen": 1071584, |
| "step": 3765 |
| }, |
| { |
| "epoch": 7.0335820895522385, |
| "grad_norm": 0.27098581194877625, |
| "learning_rate": 0.000819268649362431, |
| "loss": 0.6568, |
| "num_input_tokens_seen": 1072896, |
| "step": 3770 |
| }, |
| { |
| "epoch": 7.042910447761194, |
| "grad_norm": 0.3743234872817993, |
| "learning_rate": 0.0008186417388662421, |
| "loss": 0.4295, |
| "num_input_tokens_seen": 1074336, |
| "step": 3775 |
| }, |
| { |
| "epoch": 7.052238805970149, |
| "grad_norm": 0.16391004621982574, |
| "learning_rate": 0.000818013983737663, |
| "loss": 0.5123, |
| "num_input_tokens_seen": 1075712, |
| "step": 3780 |
| }, |
| { |
| "epoch": 7.061567164179104, |
| "grad_norm": 0.29341673851013184, |
| "learning_rate": 0.0008173853856407011, |
| "loss": 0.3093, |
| "num_input_tokens_seen": 1077152, |
| "step": 3785 |
| }, |
| { |
| "epoch": 7.07089552238806, |
| "grad_norm": 0.14143411815166473, |
| "learning_rate": 0.0008167559462415988, |
| "loss": 0.5145, |
| "num_input_tokens_seen": 1078496, |
| "step": 3790 |
| }, |
| { |
| "epoch": 7.080223880597015, |
| "grad_norm": 0.18919670581817627, |
| "learning_rate": 0.0008161256672088285, |
| "loss": 0.5812, |
| "num_input_tokens_seen": 1079680, |
| "step": 3795 |
| }, |
| { |
| "epoch": 7.08955223880597, |
| "grad_norm": 0.30277538299560547, |
| "learning_rate": 0.0008154945502130877, |
| "loss": 0.5403, |
| "num_input_tokens_seen": 1080864, |
| "step": 3800 |
| }, |
| { |
| "epoch": 7.098880597014926, |
| "grad_norm": 0.24299748241901398, |
| "learning_rate": 0.0008148625969272959, |
| "loss": 0.4128, |
| "num_input_tokens_seen": 1082208, |
| "step": 3805 |
| }, |
| { |
| "epoch": 7.108208955223881, |
| "grad_norm": 0.24601735174655914, |
| "learning_rate": 0.0008142298090265887, |
| "loss": 0.5396, |
| "num_input_tokens_seen": 1083680, |
| "step": 3810 |
| }, |
| { |
| "epoch": 7.117537313432836, |
| "grad_norm": 0.29024389386177063, |
| "learning_rate": 0.0008135961881883146, |
| "loss": 0.4562, |
| "num_input_tokens_seen": 1085152, |
| "step": 3815 |
| }, |
| { |
| "epoch": 7.126865671641791, |
| "grad_norm": 0.33937326073646545, |
| "learning_rate": 0.0008129617360920296, |
| "loss": 0.3553, |
| "num_input_tokens_seen": 1086752, |
| "step": 3820 |
| }, |
| { |
| "epoch": 7.1361940298507465, |
| "grad_norm": 0.20014850795269012, |
| "learning_rate": 0.0008123264544194933, |
| "loss": 0.4393, |
| "num_input_tokens_seen": 1088064, |
| "step": 3825 |
| }, |
| { |
| "epoch": 7.145522388059701, |
| "grad_norm": 0.2823147475719452, |
| "learning_rate": 0.0008116903448546639, |
| "loss": 0.5611, |
| "num_input_tokens_seen": 1089568, |
| "step": 3830 |
| }, |
| { |
| "epoch": 7.154850746268656, |
| "grad_norm": 0.2024276703596115, |
| "learning_rate": 0.0008110534090836951, |
| "loss": 0.5713, |
| "num_input_tokens_seen": 1091136, |
| "step": 3835 |
| }, |
| { |
| "epoch": 7.164179104477612, |
| "grad_norm": 0.2638244926929474, |
| "learning_rate": 0.0008104156487949297, |
| "loss": 0.3811, |
| "num_input_tokens_seen": 1092704, |
| "step": 3840 |
| }, |
| { |
| "epoch": 7.173507462686567, |
| "grad_norm": 0.21746188402175903, |
| "learning_rate": 0.000809777065678896, |
| "loss": 0.9308, |
| "num_input_tokens_seen": 1093952, |
| "step": 3845 |
| }, |
| { |
| "epoch": 7.182835820895522, |
| "grad_norm": 0.41835376620292664, |
| "learning_rate": 0.0008091376614283045, |
| "loss": 0.587, |
| "num_input_tokens_seen": 1095264, |
| "step": 3850 |
| }, |
| { |
| "epoch": 7.192164179104478, |
| "grad_norm": 0.41872432827949524, |
| "learning_rate": 0.0008084974377380409, |
| "loss": 0.4512, |
| "num_input_tokens_seen": 1096576, |
| "step": 3855 |
| }, |
| { |
| "epoch": 7.201492537313433, |
| "grad_norm": 0.19008269906044006, |
| "learning_rate": 0.0008078563963051642, |
| "loss": 0.6325, |
| "num_input_tokens_seen": 1097920, |
| "step": 3860 |
| }, |
| { |
| "epoch": 7.210820895522388, |
| "grad_norm": 0.19446614384651184, |
| "learning_rate": 0.0008072145388289, |
| "loss": 0.3391, |
| "num_input_tokens_seen": 1099424, |
| "step": 3865 |
| }, |
| { |
| "epoch": 7.220149253731344, |
| "grad_norm": 0.3054392635822296, |
| "learning_rate": 0.0008065718670106379, |
| "loss": 0.4429, |
| "num_input_tokens_seen": 1100864, |
| "step": 3870 |
| }, |
| { |
| "epoch": 7.229477611940299, |
| "grad_norm": 0.23451483249664307, |
| "learning_rate": 0.0008059283825539256, |
| "loss": 0.4245, |
| "num_input_tokens_seen": 1102272, |
| "step": 3875 |
| }, |
| { |
| "epoch": 7.2388059701492535, |
| "grad_norm": 0.30054786801338196, |
| "learning_rate": 0.0008052840871644649, |
| "loss": 0.4085, |
| "num_input_tokens_seen": 1103680, |
| "step": 3880 |
| }, |
| { |
| "epoch": 7.248134328358209, |
| "grad_norm": 0.2466873675584793, |
| "learning_rate": 0.0008046389825501072, |
| "loss": 0.3337, |
| "num_input_tokens_seen": 1104960, |
| "step": 3885 |
| }, |
| { |
| "epoch": 7.257462686567164, |
| "grad_norm": 0.1704123169183731, |
| "learning_rate": 0.0008039930704208492, |
| "loss": 0.6229, |
| "num_input_tokens_seen": 1106368, |
| "step": 3890 |
| }, |
| { |
| "epoch": 7.266791044776119, |
| "grad_norm": 0.19805479049682617, |
| "learning_rate": 0.0008033463524888278, |
| "loss": 0.4459, |
| "num_input_tokens_seen": 1107680, |
| "step": 3895 |
| }, |
| { |
| "epoch": 7.276119402985074, |
| "grad_norm": 0.3426274061203003, |
| "learning_rate": 0.0008026988304683158, |
| "loss": 0.5647, |
| "num_input_tokens_seen": 1109184, |
| "step": 3900 |
| }, |
| { |
| "epoch": 7.28544776119403, |
| "grad_norm": 0.18383286893367767, |
| "learning_rate": 0.0008020505060757178, |
| "loss": 0.3523, |
| "num_input_tokens_seen": 1110656, |
| "step": 3905 |
| }, |
| { |
| "epoch": 7.294776119402985, |
| "grad_norm": 0.22394180297851562, |
| "learning_rate": 0.0008014013810295649, |
| "loss": 0.4805, |
| "num_input_tokens_seen": 1112256, |
| "step": 3910 |
| }, |
| { |
| "epoch": 7.30410447761194, |
| "grad_norm": 0.1889072060585022, |
| "learning_rate": 0.0008007514570505107, |
| "loss": 0.4508, |
| "num_input_tokens_seen": 1113696, |
| "step": 3915 |
| }, |
| { |
| "epoch": 7.313432835820896, |
| "grad_norm": 0.1642770767211914, |
| "learning_rate": 0.0008001007358613263, |
| "loss": 0.5743, |
| "num_input_tokens_seen": 1114976, |
| "step": 3920 |
| }, |
| { |
| "epoch": 7.322761194029851, |
| "grad_norm": 0.32939183712005615, |
| "learning_rate": 0.0007994492191868965, |
| "loss": 0.4055, |
| "num_input_tokens_seen": 1116288, |
| "step": 3925 |
| }, |
| { |
| "epoch": 7.332089552238806, |
| "grad_norm": 0.32331785559654236, |
| "learning_rate": 0.0007987969087542142, |
| "loss": 0.4768, |
| "num_input_tokens_seen": 1117760, |
| "step": 3930 |
| }, |
| { |
| "epoch": 7.3414179104477615, |
| "grad_norm": 0.19969119131565094, |
| "learning_rate": 0.0007981438062923767, |
| "loss": 0.6326, |
| "num_input_tokens_seen": 1119072, |
| "step": 3935 |
| }, |
| { |
| "epoch": 7.350746268656716, |
| "grad_norm": 0.31672680377960205, |
| "learning_rate": 0.0007974899135325804, |
| "loss": 0.6273, |
| "num_input_tokens_seen": 1120448, |
| "step": 3940 |
| }, |
| { |
| "epoch": 7.360074626865671, |
| "grad_norm": 0.19286391139030457, |
| "learning_rate": 0.000796835232208117, |
| "loss": 0.3779, |
| "num_input_tokens_seen": 1121952, |
| "step": 3945 |
| }, |
| { |
| "epoch": 7.369402985074627, |
| "grad_norm": 0.1753414422273636, |
| "learning_rate": 0.0007961797640543678, |
| "loss": 0.4282, |
| "num_input_tokens_seen": 1123424, |
| "step": 3950 |
| }, |
| { |
| "epoch": 7.378731343283582, |
| "grad_norm": 0.1815296709537506, |
| "learning_rate": 0.0007955235108088008, |
| "loss": 0.4562, |
| "num_input_tokens_seen": 1124800, |
| "step": 3955 |
| }, |
| { |
| "epoch": 7.388059701492537, |
| "grad_norm": 0.22495396435260773, |
| "learning_rate": 0.0007948664742109639, |
| "loss": 0.5935, |
| "num_input_tokens_seen": 1126208, |
| "step": 3960 |
| }, |
| { |
| "epoch": 7.397388059701493, |
| "grad_norm": 0.2588635981082916, |
| "learning_rate": 0.0007942086560024826, |
| "loss": 0.5487, |
| "num_input_tokens_seen": 1127488, |
| "step": 3965 |
| }, |
| { |
| "epoch": 7.406716417910448, |
| "grad_norm": 0.459797203540802, |
| "learning_rate": 0.0007935500579270532, |
| "loss": 0.8277, |
| "num_input_tokens_seen": 1128864, |
| "step": 3970 |
| }, |
| { |
| "epoch": 7.416044776119403, |
| "grad_norm": 0.398291677236557, |
| "learning_rate": 0.0007928906817304397, |
| "loss": 0.5892, |
| "num_input_tokens_seen": 1130368, |
| "step": 3975 |
| }, |
| { |
| "epoch": 7.425373134328359, |
| "grad_norm": 0.20469245314598083, |
| "learning_rate": 0.0007922305291604687, |
| "loss": 0.4956, |
| "num_input_tokens_seen": 1131872, |
| "step": 3980 |
| }, |
| { |
| "epoch": 7.434701492537314, |
| "grad_norm": 0.36543020606040955, |
| "learning_rate": 0.0007915696019670248, |
| "loss": 0.6108, |
| "num_input_tokens_seen": 1133440, |
| "step": 3985 |
| }, |
| { |
| "epoch": 7.4440298507462686, |
| "grad_norm": 0.29691100120544434, |
| "learning_rate": 0.000790907901902046, |
| "loss": 0.6105, |
| "num_input_tokens_seen": 1134720, |
| "step": 3990 |
| }, |
| { |
| "epoch": 7.4533582089552235, |
| "grad_norm": 0.23733581602573395, |
| "learning_rate": 0.0007902454307195184, |
| "loss": 0.3708, |
| "num_input_tokens_seen": 1136224, |
| "step": 3995 |
| }, |
| { |
| "epoch": 7.462686567164179, |
| "grad_norm": 0.19936566054821014, |
| "learning_rate": 0.0007895821901754727, |
| "loss": 0.5386, |
| "num_input_tokens_seen": 1137600, |
| "step": 4000 |
| }, |
| { |
| "epoch": 7.472014925373134, |
| "grad_norm": 0.21452738344669342, |
| "learning_rate": 0.000788918182027979, |
| "loss": 0.5008, |
| "num_input_tokens_seen": 1139040, |
| "step": 4005 |
| }, |
| { |
| "epoch": 7.481343283582089, |
| "grad_norm": 0.2815008759498596, |
| "learning_rate": 0.0007882534080371414, |
| "loss": 0.3934, |
| "num_input_tokens_seen": 1140576, |
| "step": 4010 |
| }, |
| { |
| "epoch": 7.490671641791045, |
| "grad_norm": 0.28204086422920227, |
| "learning_rate": 0.000787587869965095, |
| "loss": 0.4209, |
| "num_input_tokens_seen": 1142080, |
| "step": 4015 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 0.2625596523284912, |
| "learning_rate": 0.0007869215695759996, |
| "loss": 0.6684, |
| "num_input_tokens_seen": 1143680, |
| "step": 4020 |
| }, |
| { |
| "epoch": 7.509328358208955, |
| "grad_norm": 0.25634852051734924, |
| "learning_rate": 0.000786254508636036, |
| "loss": 0.5474, |
| "num_input_tokens_seen": 1145120, |
| "step": 4025 |
| }, |
| { |
| "epoch": 7.518656716417911, |
| "grad_norm": 0.2722490727901459, |
| "learning_rate": 0.0007855866889134008, |
| "loss": 0.848, |
| "num_input_tokens_seen": 1146496, |
| "step": 4030 |
| }, |
| { |
| "epoch": 7.527985074626866, |
| "grad_norm": 0.3221842348575592, |
| "learning_rate": 0.0007849181121783021, |
| "loss": 0.4516, |
| "num_input_tokens_seen": 1147808, |
| "step": 4035 |
| }, |
| { |
| "epoch": 7.537313432835821, |
| "grad_norm": 0.22305116057395935, |
| "learning_rate": 0.0007842487802029545, |
| "loss": 0.6108, |
| "num_input_tokens_seen": 1149344, |
| "step": 4040 |
| }, |
| { |
| "epoch": 7.5466417910447765, |
| "grad_norm": 0.23894543945789337, |
| "learning_rate": 0.0007835786947615748, |
| "loss": 0.3998, |
| "num_input_tokens_seen": 1150880, |
| "step": 4045 |
| }, |
| { |
| "epoch": 7.5559701492537314, |
| "grad_norm": 0.30003124475479126, |
| "learning_rate": 0.0007829078576303768, |
| "loss": 0.5354, |
| "num_input_tokens_seen": 1152352, |
| "step": 4050 |
| }, |
| { |
| "epoch": 7.565298507462686, |
| "grad_norm": 0.344744473695755, |
| "learning_rate": 0.0007822362705875667, |
| "loss": 0.6149, |
| "num_input_tokens_seen": 1153856, |
| "step": 4055 |
| }, |
| { |
| "epoch": 7.574626865671641, |
| "grad_norm": 0.20434407889842987, |
| "learning_rate": 0.0007815639354133388, |
| "loss": 0.3679, |
| "num_input_tokens_seen": 1155424, |
| "step": 4060 |
| }, |
| { |
| "epoch": 7.583955223880597, |
| "grad_norm": 0.3096279203891754, |
| "learning_rate": 0.0007808908538898703, |
| "loss": 0.4134, |
| "num_input_tokens_seen": 1156992, |
| "step": 4065 |
| }, |
| { |
| "epoch": 7.593283582089552, |
| "grad_norm": 0.12501931190490723, |
| "learning_rate": 0.000780217027801317, |
| "loss": 0.3932, |
| "num_input_tokens_seen": 1158624, |
| "step": 4070 |
| }, |
| { |
| "epoch": 7.602611940298507, |
| "grad_norm": 0.18204405903816223, |
| "learning_rate": 0.0007795424589338079, |
| "loss": 0.4619, |
| "num_input_tokens_seen": 1159968, |
| "step": 4075 |
| }, |
| { |
| "epoch": 7.611940298507463, |
| "grad_norm": 0.3062838912010193, |
| "learning_rate": 0.0007788671490754416, |
| "loss": 0.6422, |
| "num_input_tokens_seen": 1161216, |
| "step": 4080 |
| }, |
| { |
| "epoch": 7.621268656716418, |
| "grad_norm": 0.3464128375053406, |
| "learning_rate": 0.00077819110001628, |
| "loss": 0.4757, |
| "num_input_tokens_seen": 1162592, |
| "step": 4085 |
| }, |
| { |
| "epoch": 7.630597014925373, |
| "grad_norm": 0.1720285266637802, |
| "learning_rate": 0.0007775143135483451, |
| "loss": 0.3046, |
| "num_input_tokens_seen": 1164288, |
| "step": 4090 |
| }, |
| { |
| "epoch": 7.639925373134329, |
| "grad_norm": 0.32936620712280273, |
| "learning_rate": 0.0007768367914656135, |
| "loss": 0.4966, |
| "num_input_tokens_seen": 1165824, |
| "step": 4095 |
| }, |
| { |
| "epoch": 7.649253731343284, |
| "grad_norm": 0.3390771448612213, |
| "learning_rate": 0.0007761585355640112, |
| "loss": 0.5416, |
| "num_input_tokens_seen": 1167264, |
| "step": 4100 |
| }, |
| { |
| "epoch": 7.6585820895522385, |
| "grad_norm": 0.23222362995147705, |
| "learning_rate": 0.00077547954764141, |
| "loss": 0.4355, |
| "num_input_tokens_seen": 1168672, |
| "step": 4105 |
| }, |
| { |
| "epoch": 7.667910447761194, |
| "grad_norm": 0.2488616555929184, |
| "learning_rate": 0.0007747998294976216, |
| "loss": 0.4442, |
| "num_input_tokens_seen": 1170400, |
| "step": 4110 |
| }, |
| { |
| "epoch": 7.677238805970149, |
| "grad_norm": 0.33894675970077515, |
| "learning_rate": 0.0007741193829343937, |
| "loss": 0.5229, |
| "num_input_tokens_seen": 1172000, |
| "step": 4115 |
| }, |
| { |
| "epoch": 7.686567164179104, |
| "grad_norm": 0.1934259831905365, |
| "learning_rate": 0.0007734382097554044, |
| "loss": 0.3793, |
| "num_input_tokens_seen": 1173568, |
| "step": 4120 |
| }, |
| { |
| "epoch": 7.69589552238806, |
| "grad_norm": 0.3763267993927002, |
| "learning_rate": 0.0007727563117662584, |
| "loss": 0.5349, |
| "num_input_tokens_seen": 1175040, |
| "step": 4125 |
| }, |
| { |
| "epoch": 7.705223880597015, |
| "grad_norm": 0.28987762331962585, |
| "learning_rate": 0.0007720736907744811, |
| "loss": 0.402, |
| "num_input_tokens_seen": 1176480, |
| "step": 4130 |
| }, |
| { |
| "epoch": 7.71455223880597, |
| "grad_norm": 0.3133031129837036, |
| "learning_rate": 0.0007713903485895148, |
| "loss": 0.449, |
| "num_input_tokens_seen": 1177856, |
| "step": 4135 |
| }, |
| { |
| "epoch": 7.723880597014926, |
| "grad_norm": 0.26172906160354614, |
| "learning_rate": 0.0007707062870227136, |
| "loss": 0.341, |
| "num_input_tokens_seen": 1179072, |
| "step": 4140 |
| }, |
| { |
| "epoch": 7.733208955223881, |
| "grad_norm": 0.23562432825565338, |
| "learning_rate": 0.0007700215078873378, |
| "loss": 0.3482, |
| "num_input_tokens_seen": 1180640, |
| "step": 4145 |
| }, |
| { |
| "epoch": 7.742537313432836, |
| "grad_norm": 0.213342547416687, |
| "learning_rate": 0.0007693360129985507, |
| "loss": 0.5387, |
| "num_input_tokens_seen": 1181920, |
| "step": 4150 |
| }, |
| { |
| "epoch": 7.7518656716417915, |
| "grad_norm": 0.3449512720108032, |
| "learning_rate": 0.000768649804173412, |
| "loss": 0.3724, |
| "num_input_tokens_seen": 1183488, |
| "step": 4155 |
| }, |
| { |
| "epoch": 7.7611940298507465, |
| "grad_norm": 0.3592718243598938, |
| "learning_rate": 0.0007679628832308743, |
| "loss": 0.5064, |
| "num_input_tokens_seen": 1184864, |
| "step": 4160 |
| }, |
| { |
| "epoch": 7.770522388059701, |
| "grad_norm": 0.32927563786506653, |
| "learning_rate": 0.0007672752519917779, |
| "loss": 0.4903, |
| "num_input_tokens_seen": 1186240, |
| "step": 4165 |
| }, |
| { |
| "epoch": 7.779850746268656, |
| "grad_norm": 0.18627624213695526, |
| "learning_rate": 0.0007665869122788458, |
| "loss": 0.6339, |
| "num_input_tokens_seen": 1187488, |
| "step": 4170 |
| }, |
| { |
| "epoch": 7.789179104477612, |
| "grad_norm": 0.42838889360427856, |
| "learning_rate": 0.0007658978659166787, |
| "loss": 0.769, |
| "num_input_tokens_seen": 1188800, |
| "step": 4175 |
| }, |
| { |
| "epoch": 7.798507462686567, |
| "grad_norm": 0.21956242620944977, |
| "learning_rate": 0.0007652081147317509, |
| "loss": 0.4466, |
| "num_input_tokens_seen": 1190272, |
| "step": 4180 |
| }, |
| { |
| "epoch": 7.807835820895522, |
| "grad_norm": 0.40260300040245056, |
| "learning_rate": 0.0007645176605524049, |
| "loss": 0.5467, |
| "num_input_tokens_seen": 1191648, |
| "step": 4185 |
| }, |
| { |
| "epoch": 7.817164179104478, |
| "grad_norm": 0.34386584162712097, |
| "learning_rate": 0.000763826505208846, |
| "loss": 0.7075, |
| "num_input_tokens_seen": 1193024, |
| "step": 4190 |
| }, |
| { |
| "epoch": 7.826492537313433, |
| "grad_norm": 0.45418819785118103, |
| "learning_rate": 0.0007631346505331391, |
| "loss": 0.5423, |
| "num_input_tokens_seen": 1194272, |
| "step": 4195 |
| }, |
| { |
| "epoch": 7.835820895522388, |
| "grad_norm": 0.36769580841064453, |
| "learning_rate": 0.0007624420983592022, |
| "loss": 0.4375, |
| "num_input_tokens_seen": 1195680, |
| "step": 4200 |
| }, |
| { |
| "epoch": 7.845149253731344, |
| "grad_norm": 0.20569390058517456, |
| "learning_rate": 0.0007617488505228023, |
| "loss": 0.4004, |
| "num_input_tokens_seen": 1197280, |
| "step": 4205 |
| }, |
| { |
| "epoch": 7.854477611940299, |
| "grad_norm": 0.2689351439476013, |
| "learning_rate": 0.0007610549088615504, |
| "loss": 0.5249, |
| "num_input_tokens_seen": 1198656, |
| "step": 4210 |
| }, |
| { |
| "epoch": 7.8638059701492535, |
| "grad_norm": 0.2242366224527359, |
| "learning_rate": 0.0007603602752148968, |
| "loss": 0.5177, |
| "num_input_tokens_seen": 1200160, |
| "step": 4215 |
| }, |
| { |
| "epoch": 7.8731343283582085, |
| "grad_norm": 0.20943698287010193, |
| "learning_rate": 0.0007596649514241259, |
| "loss": 0.3971, |
| "num_input_tokens_seen": 1201536, |
| "step": 4220 |
| }, |
| { |
| "epoch": 7.882462686567164, |
| "grad_norm": 0.19763123989105225, |
| "learning_rate": 0.0007589689393323513, |
| "loss": 0.4378, |
| "num_input_tokens_seen": 1203104, |
| "step": 4225 |
| }, |
| { |
| "epoch": 7.891791044776119, |
| "grad_norm": 0.06624174118041992, |
| "learning_rate": 0.0007582722407845118, |
| "loss": 0.4475, |
| "num_input_tokens_seen": 1204640, |
| "step": 4230 |
| }, |
| { |
| "epoch": 7.901119402985074, |
| "grad_norm": 0.17526885867118835, |
| "learning_rate": 0.0007575748576273649, |
| "loss": 0.5821, |
| "num_input_tokens_seen": 1206176, |
| "step": 4235 |
| }, |
| { |
| "epoch": 7.91044776119403, |
| "grad_norm": 0.22568243741989136, |
| "learning_rate": 0.0007568767917094836, |
| "loss": 0.465, |
| "num_input_tokens_seen": 1207680, |
| "step": 4240 |
| }, |
| { |
| "epoch": 7.919776119402985, |
| "grad_norm": 0.17621439695358276, |
| "learning_rate": 0.0007561780448812501, |
| "loss": 0.4096, |
| "num_input_tokens_seen": 1208960, |
| "step": 4245 |
| }, |
| { |
| "epoch": 7.92910447761194, |
| "grad_norm": 0.26744747161865234, |
| "learning_rate": 0.0007554786189948518, |
| "loss": 0.5476, |
| "num_input_tokens_seen": 1210400, |
| "step": 4250 |
| }, |
| { |
| "epoch": 7.938432835820896, |
| "grad_norm": 0.12365785986185074, |
| "learning_rate": 0.0007547785159042761, |
| "loss": 0.2842, |
| "num_input_tokens_seen": 1211904, |
| "step": 4255 |
| }, |
| { |
| "epoch": 7.947761194029851, |
| "grad_norm": 0.21446040272712708, |
| "learning_rate": 0.0007540777374653056, |
| "loss": 0.524, |
| "num_input_tokens_seen": 1213344, |
| "step": 4260 |
| }, |
| { |
| "epoch": 7.957089552238806, |
| "grad_norm": 0.21890805661678314, |
| "learning_rate": 0.0007533762855355126, |
| "loss": 0.4233, |
| "num_input_tokens_seen": 1214592, |
| "step": 4265 |
| }, |
| { |
| "epoch": 7.9664179104477615, |
| "grad_norm": 0.284196674823761, |
| "learning_rate": 0.0007526741619742553, |
| "loss": 0.4797, |
| "num_input_tokens_seen": 1215936, |
| "step": 4270 |
| }, |
| { |
| "epoch": 7.975746268656716, |
| "grad_norm": 0.30007484555244446, |
| "learning_rate": 0.0007519713686426717, |
| "loss": 0.7494, |
| "num_input_tokens_seen": 1217504, |
| "step": 4275 |
| }, |
| { |
| "epoch": 7.985074626865671, |
| "grad_norm": 0.17375683784484863, |
| "learning_rate": 0.0007512679074036751, |
| "loss": 0.7121, |
| "num_input_tokens_seen": 1218880, |
| "step": 4280 |
| }, |
| { |
| "epoch": 7.994402985074627, |
| "grad_norm": 0.22325703501701355, |
| "learning_rate": 0.00075056378012195, |
| "loss": 0.429, |
| "num_input_tokens_seen": 1220320, |
| "step": 4285 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.6669880747795105, |
| "eval_runtime": 4.1872, |
| "eval_samples_per_second": 56.84, |
| "eval_steps_per_second": 14.329, |
| "num_input_tokens_seen": 1221016, |
| "step": 4288 |
| }, |
| { |
| "epoch": 8.003731343283581, |
| "grad_norm": 0.34321409463882446, |
| "learning_rate": 0.0007498589886639457, |
| "loss": 0.5687, |
| "num_input_tokens_seen": 1221560, |
| "step": 4290 |
| }, |
| { |
| "epoch": 8.013059701492537, |
| "grad_norm": 0.2419702708721161, |
| "learning_rate": 0.0007491535348978719, |
| "loss": 0.2726, |
| "num_input_tokens_seen": 1223224, |
| "step": 4295 |
| }, |
| { |
| "epoch": 8.022388059701493, |
| "grad_norm": 0.25188514590263367, |
| "learning_rate": 0.0007484474206936947, |
| "loss": 0.5986, |
| "num_input_tokens_seen": 1224600, |
| "step": 4300 |
| }, |
| { |
| "epoch": 8.031716417910447, |
| "grad_norm": 0.30116868019104004, |
| "learning_rate": 0.0007477406479231299, |
| "loss": 0.339, |
| "num_input_tokens_seen": 1225880, |
| "step": 4305 |
| }, |
| { |
| "epoch": 8.041044776119403, |
| "grad_norm": 0.21877850592136383, |
| "learning_rate": 0.0007470332184596398, |
| "loss": 0.3826, |
| "num_input_tokens_seen": 1227288, |
| "step": 4310 |
| }, |
| { |
| "epoch": 8.050373134328359, |
| "grad_norm": 0.3433838486671448, |
| "learning_rate": 0.0007463251341784271, |
| "loss": 0.3924, |
| "num_input_tokens_seen": 1228600, |
| "step": 4315 |
| }, |
| { |
| "epoch": 8.059701492537313, |
| "grad_norm": 0.23504002392292023, |
| "learning_rate": 0.00074561639695643, |
| "loss": 0.4071, |
| "num_input_tokens_seen": 1230008, |
| "step": 4320 |
| }, |
| { |
| "epoch": 8.069029850746269, |
| "grad_norm": 0.344004362821579, |
| "learning_rate": 0.0007449070086723178, |
| "loss": 0.6863, |
| "num_input_tokens_seen": 1231320, |
| "step": 4325 |
| }, |
| { |
| "epoch": 8.078358208955224, |
| "grad_norm": 0.6055997610092163, |
| "learning_rate": 0.0007441969712064856, |
| "loss": 0.5183, |
| "num_input_tokens_seen": 1232536, |
| "step": 4330 |
| }, |
| { |
| "epoch": 8.087686567164178, |
| "grad_norm": 0.20780882239341736, |
| "learning_rate": 0.0007434862864410487, |
| "loss": 0.4671, |
| "num_input_tokens_seen": 1233880, |
| "step": 4335 |
| }, |
| { |
| "epoch": 8.097014925373134, |
| "grad_norm": 0.17353133857250214, |
| "learning_rate": 0.0007427749562598392, |
| "loss": 0.4412, |
| "num_input_tokens_seen": 1235416, |
| "step": 4340 |
| }, |
| { |
| "epoch": 8.10634328358209, |
| "grad_norm": 0.2616088390350342, |
| "learning_rate": 0.0007420629825483993, |
| "loss": 0.4073, |
| "num_input_tokens_seen": 1236664, |
| "step": 4345 |
| }, |
| { |
| "epoch": 8.115671641791044, |
| "grad_norm": 0.3216690123081207, |
| "learning_rate": 0.000741350367193977, |
| "loss": 0.4714, |
| "num_input_tokens_seen": 1238136, |
| "step": 4350 |
| }, |
| { |
| "epoch": 8.125, |
| "grad_norm": 0.19009575247764587, |
| "learning_rate": 0.000740637112085522, |
| "loss": 0.4795, |
| "num_input_tokens_seen": 1239768, |
| "step": 4355 |
| }, |
| { |
| "epoch": 8.134328358208956, |
| "grad_norm": 0.3063131868839264, |
| "learning_rate": 0.0007399232191136785, |
| "loss": 0.3927, |
| "num_input_tokens_seen": 1241240, |
| "step": 4360 |
| }, |
| { |
| "epoch": 8.14365671641791, |
| "grad_norm": 0.2866196930408478, |
| "learning_rate": 0.0007392086901707824, |
| "loss": 0.4687, |
| "num_input_tokens_seen": 1242712, |
| "step": 4365 |
| }, |
| { |
| "epoch": 8.152985074626866, |
| "grad_norm": 0.19563937187194824, |
| "learning_rate": 0.0007384935271508552, |
| "loss": 0.3187, |
| "num_input_tokens_seen": 1244088, |
| "step": 4370 |
| }, |
| { |
| "epoch": 8.162313432835822, |
| "grad_norm": 0.3950585722923279, |
| "learning_rate": 0.000737777731949599, |
| "loss": 0.3728, |
| "num_input_tokens_seen": 1245496, |
| "step": 4375 |
| }, |
| { |
| "epoch": 8.171641791044776, |
| "grad_norm": 0.18759894371032715, |
| "learning_rate": 0.0007370613064643921, |
| "loss": 0.3397, |
| "num_input_tokens_seen": 1247096, |
| "step": 4380 |
| }, |
| { |
| "epoch": 8.180970149253731, |
| "grad_norm": 0.2706758677959442, |
| "learning_rate": 0.0007363442525942826, |
| "loss": 0.4425, |
| "num_input_tokens_seen": 1248600, |
| "step": 4385 |
| }, |
| { |
| "epoch": 8.190298507462687, |
| "grad_norm": 0.1951591670513153, |
| "learning_rate": 0.0007356265722399854, |
| "loss": 0.4339, |
| "num_input_tokens_seen": 1250008, |
| "step": 4390 |
| }, |
| { |
| "epoch": 8.199626865671641, |
| "grad_norm": 0.2725299894809723, |
| "learning_rate": 0.0007349082673038752, |
| "loss": 0.4047, |
| "num_input_tokens_seen": 1251512, |
| "step": 4395 |
| }, |
| { |
| "epoch": 8.208955223880597, |
| "grad_norm": 0.38873162865638733, |
| "learning_rate": 0.0007341893396899825, |
| "loss": 0.4714, |
| "num_input_tokens_seen": 1252952, |
| "step": 4400 |
| }, |
| { |
| "epoch": 8.218283582089553, |
| "grad_norm": 0.24204622209072113, |
| "learning_rate": 0.0007334697913039885, |
| "loss": 0.3452, |
| "num_input_tokens_seen": 1254392, |
| "step": 4405 |
| }, |
| { |
| "epoch": 8.227611940298507, |
| "grad_norm": 0.3463059067726135, |
| "learning_rate": 0.0007327496240532201, |
| "loss": 0.5566, |
| "num_input_tokens_seen": 1255576, |
| "step": 4410 |
| }, |
| { |
| "epoch": 8.236940298507463, |
| "grad_norm": 0.2980581223964691, |
| "learning_rate": 0.0007320288398466442, |
| "loss": 0.4341, |
| "num_input_tokens_seen": 1257080, |
| "step": 4415 |
| }, |
| { |
| "epoch": 8.246268656716419, |
| "grad_norm": 0.18802469968795776, |
| "learning_rate": 0.0007313074405948629, |
| "loss": 0.5806, |
| "num_input_tokens_seen": 1258520, |
| "step": 4420 |
| }, |
| { |
| "epoch": 8.255597014925373, |
| "grad_norm": 0.20121391117572784, |
| "learning_rate": 0.0007305854282101097, |
| "loss": 0.3982, |
| "num_input_tokens_seen": 1259992, |
| "step": 4425 |
| }, |
| { |
| "epoch": 8.264925373134329, |
| "grad_norm": 0.28050583600997925, |
| "learning_rate": 0.0007298628046062416, |
| "loss": 0.5442, |
| "num_input_tokens_seen": 1261400, |
| "step": 4430 |
| }, |
| { |
| "epoch": 8.274253731343283, |
| "grad_norm": 0.218626469373703, |
| "learning_rate": 0.0007291395716987379, |
| "loss": 0.5104, |
| "num_input_tokens_seen": 1262776, |
| "step": 4435 |
| }, |
| { |
| "epoch": 8.283582089552239, |
| "grad_norm": 0.2403896003961563, |
| "learning_rate": 0.0007284157314046911, |
| "loss": 0.3303, |
| "num_input_tokens_seen": 1264184, |
| "step": 4440 |
| }, |
| { |
| "epoch": 8.292910447761194, |
| "grad_norm": 0.24433453381061554, |
| "learning_rate": 0.0007276912856428048, |
| "loss": 0.4662, |
| "num_input_tokens_seen": 1265528, |
| "step": 4445 |
| }, |
| { |
| "epoch": 8.302238805970148, |
| "grad_norm": 0.25048547983169556, |
| "learning_rate": 0.0007269662363333873, |
| "loss": 0.4738, |
| "num_input_tokens_seen": 1266808, |
| "step": 4450 |
| }, |
| { |
| "epoch": 8.311567164179104, |
| "grad_norm": 0.33556827902793884, |
| "learning_rate": 0.0007262405853983467, |
| "loss": 0.5143, |
| "num_input_tokens_seen": 1268120, |
| "step": 4455 |
| }, |
| { |
| "epoch": 8.32089552238806, |
| "grad_norm": 0.2557665705680847, |
| "learning_rate": 0.0007255143347611855, |
| "loss": 0.5515, |
| "num_input_tokens_seen": 1269464, |
| "step": 4460 |
| }, |
| { |
| "epoch": 8.330223880597014, |
| "grad_norm": 0.2448815554380417, |
| "learning_rate": 0.0007247874863469963, |
| "loss": 0.4604, |
| "num_input_tokens_seen": 1271032, |
| "step": 4465 |
| }, |
| { |
| "epoch": 8.33955223880597, |
| "grad_norm": 0.2030094712972641, |
| "learning_rate": 0.0007240600420824564, |
| "loss": 0.3656, |
| "num_input_tokens_seen": 1272280, |
| "step": 4470 |
| }, |
| { |
| "epoch": 8.348880597014926, |
| "grad_norm": 0.3827509582042694, |
| "learning_rate": 0.000723332003895822, |
| "loss": 0.5106, |
| "num_input_tokens_seen": 1273848, |
| "step": 4475 |
| }, |
| { |
| "epoch": 8.35820895522388, |
| "grad_norm": 0.2691296637058258, |
| "learning_rate": 0.000722603373716924, |
| "loss": 0.3502, |
| "num_input_tokens_seen": 1275384, |
| "step": 4480 |
| }, |
| { |
| "epoch": 8.367537313432836, |
| "grad_norm": 0.29246485233306885, |
| "learning_rate": 0.0007218741534771621, |
| "loss": 0.4031, |
| "num_input_tokens_seen": 1276728, |
| "step": 4485 |
| }, |
| { |
| "epoch": 8.376865671641792, |
| "grad_norm": 0.5716930031776428, |
| "learning_rate": 0.0007211443451095007, |
| "loss": 0.4505, |
| "num_input_tokens_seen": 1278328, |
| "step": 4490 |
| }, |
| { |
| "epoch": 8.386194029850746, |
| "grad_norm": 0.29046720266342163, |
| "learning_rate": 0.0007204139505484627, |
| "loss": 0.6272, |
| "num_input_tokens_seen": 1279832, |
| "step": 4495 |
| }, |
| { |
| "epoch": 8.395522388059701, |
| "grad_norm": 0.24031990766525269, |
| "learning_rate": 0.000719682971730125, |
| "loss": 0.498, |
| "num_input_tokens_seen": 1281176, |
| "step": 4500 |
| }, |
| { |
| "epoch": 8.404850746268657, |
| "grad_norm": 0.3898731470108032, |
| "learning_rate": 0.0007189514105921133, |
| "loss": 0.5826, |
| "num_input_tokens_seen": 1282648, |
| "step": 4505 |
| }, |
| { |
| "epoch": 8.414179104477611, |
| "grad_norm": 0.2248714119195938, |
| "learning_rate": 0.0007182192690735964, |
| "loss": 0.4553, |
| "num_input_tokens_seen": 1283928, |
| "step": 4510 |
| }, |
| { |
| "epoch": 8.423507462686567, |
| "grad_norm": 0.3099049925804138, |
| "learning_rate": 0.0007174865491152823, |
| "loss": 0.6155, |
| "num_input_tokens_seen": 1285272, |
| "step": 4515 |
| }, |
| { |
| "epoch": 8.432835820895523, |
| "grad_norm": 0.3284352719783783, |
| "learning_rate": 0.0007167532526594115, |
| "loss": 0.5621, |
| "num_input_tokens_seen": 1286616, |
| "step": 4520 |
| }, |
| { |
| "epoch": 8.442164179104477, |
| "grad_norm": 0.28215911984443665, |
| "learning_rate": 0.0007160193816497536, |
| "loss": 0.2902, |
| "num_input_tokens_seen": 1288088, |
| "step": 4525 |
| }, |
| { |
| "epoch": 8.451492537313433, |
| "grad_norm": 0.2650202810764313, |
| "learning_rate": 0.0007152849380315999, |
| "loss": 0.5968, |
| "num_input_tokens_seen": 1289528, |
| "step": 4530 |
| }, |
| { |
| "epoch": 8.460820895522389, |
| "grad_norm": 0.2579214572906494, |
| "learning_rate": 0.0007145499237517607, |
| "loss": 0.4103, |
| "num_input_tokens_seen": 1291032, |
| "step": 4535 |
| }, |
| { |
| "epoch": 8.470149253731343, |
| "grad_norm": 0.24751406908035278, |
| "learning_rate": 0.0007138143407585584, |
| "loss": 0.4289, |
| "num_input_tokens_seen": 1292472, |
| "step": 4540 |
| }, |
| { |
| "epoch": 8.479477611940299, |
| "grad_norm": 0.23992621898651123, |
| "learning_rate": 0.0007130781910018227, |
| "loss": 0.6279, |
| "num_input_tokens_seen": 1293944, |
| "step": 4545 |
| }, |
| { |
| "epoch": 8.488805970149254, |
| "grad_norm": 0.12086621671915054, |
| "learning_rate": 0.0007123414764328864, |
| "loss": 0.3467, |
| "num_input_tokens_seen": 1295384, |
| "step": 4550 |
| }, |
| { |
| "epoch": 8.498134328358208, |
| "grad_norm": 0.3661694824695587, |
| "learning_rate": 0.0007116041990045788, |
| "loss": 0.584, |
| "num_input_tokens_seen": 1296792, |
| "step": 4555 |
| }, |
| { |
| "epoch": 8.507462686567164, |
| "grad_norm": 0.44709137082099915, |
| "learning_rate": 0.0007108663606712214, |
| "loss": 0.4874, |
| "num_input_tokens_seen": 1298104, |
| "step": 4560 |
| }, |
| { |
| "epoch": 8.51679104477612, |
| "grad_norm": 0.47159019112586975, |
| "learning_rate": 0.0007101279633886222, |
| "loss": 0.5383, |
| "num_input_tokens_seen": 1299608, |
| "step": 4565 |
| }, |
| { |
| "epoch": 8.526119402985074, |
| "grad_norm": 0.3675750494003296, |
| "learning_rate": 0.0007093890091140716, |
| "loss": 0.4992, |
| "num_input_tokens_seen": 1300888, |
| "step": 4570 |
| }, |
| { |
| "epoch": 8.53544776119403, |
| "grad_norm": 0.4664657413959503, |
| "learning_rate": 0.0007086494998063357, |
| "loss": 0.5515, |
| "num_input_tokens_seen": 1302296, |
| "step": 4575 |
| }, |
| { |
| "epoch": 8.544776119402986, |
| "grad_norm": 0.25660377740859985, |
| "learning_rate": 0.0007079094374256521, |
| "loss": 0.5217, |
| "num_input_tokens_seen": 1303768, |
| "step": 4580 |
| }, |
| { |
| "epoch": 8.55410447761194, |
| "grad_norm": 0.21756531298160553, |
| "learning_rate": 0.0007071688239337244, |
| "loss": 0.5465, |
| "num_input_tokens_seen": 1305208, |
| "step": 4585 |
| }, |
| { |
| "epoch": 8.563432835820896, |
| "grad_norm": 0.3067342936992645, |
| "learning_rate": 0.0007064276612937172, |
| "loss": 0.5361, |
| "num_input_tokens_seen": 1306584, |
| "step": 4590 |
| }, |
| { |
| "epoch": 8.572761194029852, |
| "grad_norm": 0.169167622923851, |
| "learning_rate": 0.0007056859514702506, |
| "loss": 0.4106, |
| "num_input_tokens_seen": 1308056, |
| "step": 4595 |
| }, |
| { |
| "epoch": 8.582089552238806, |
| "grad_norm": 0.2943738102912903, |
| "learning_rate": 0.0007049436964293949, |
| "loss": 0.4275, |
| "num_input_tokens_seen": 1309624, |
| "step": 4600 |
| }, |
| { |
| "epoch": 8.591417910447761, |
| "grad_norm": 0.2510972023010254, |
| "learning_rate": 0.0007042008981386663, |
| "loss": 0.4938, |
| "num_input_tokens_seen": 1311160, |
| "step": 4605 |
| }, |
| { |
| "epoch": 8.600746268656717, |
| "grad_norm": 0.45341455936431885, |
| "learning_rate": 0.0007034575585670204, |
| "loss": 0.383, |
| "num_input_tokens_seen": 1312504, |
| "step": 4610 |
| }, |
| { |
| "epoch": 8.610074626865671, |
| "grad_norm": 0.2252267301082611, |
| "learning_rate": 0.0007027136796848477, |
| "loss": 0.3927, |
| "num_input_tokens_seen": 1314104, |
| "step": 4615 |
| }, |
| { |
| "epoch": 8.619402985074627, |
| "grad_norm": 0.2824556827545166, |
| "learning_rate": 0.0007019692634639683, |
| "loss": 0.4237, |
| "num_input_tokens_seen": 1315544, |
| "step": 4620 |
| }, |
| { |
| "epoch": 8.628731343283581, |
| "grad_norm": 0.2865005433559418, |
| "learning_rate": 0.0007012243118776269, |
| "loss": 0.4228, |
| "num_input_tokens_seen": 1317016, |
| "step": 4625 |
| }, |
| { |
| "epoch": 8.638059701492537, |
| "grad_norm": 0.32809343934059143, |
| "learning_rate": 0.0007004788269004869, |
| "loss": 0.4611, |
| "num_input_tokens_seen": 1318392, |
| "step": 4630 |
| }, |
| { |
| "epoch": 8.647388059701493, |
| "grad_norm": 0.38598981499671936, |
| "learning_rate": 0.0006997328105086257, |
| "loss": 0.5493, |
| "num_input_tokens_seen": 1319768, |
| "step": 4635 |
| }, |
| { |
| "epoch": 8.656716417910447, |
| "grad_norm": 0.32761213183403015, |
| "learning_rate": 0.0006989862646795298, |
| "loss": 0.4471, |
| "num_input_tokens_seen": 1321336, |
| "step": 4640 |
| }, |
| { |
| "epoch": 8.666044776119403, |
| "grad_norm": 0.21694421768188477, |
| "learning_rate": 0.0006982391913920883, |
| "loss": 0.2965, |
| "num_input_tokens_seen": 1322712, |
| "step": 4645 |
| }, |
| { |
| "epoch": 8.675373134328359, |
| "grad_norm": 0.21429555118083954, |
| "learning_rate": 0.0006974915926265889, |
| "loss": 0.5602, |
| "num_input_tokens_seen": 1324216, |
| "step": 4650 |
| }, |
| { |
| "epoch": 8.684701492537313, |
| "grad_norm": 0.4526868462562561, |
| "learning_rate": 0.0006967434703647122, |
| "loss": 0.7087, |
| "num_input_tokens_seen": 1325656, |
| "step": 4655 |
| }, |
| { |
| "epoch": 8.694029850746269, |
| "grad_norm": 0.28775081038475037, |
| "learning_rate": 0.0006959948265895264, |
| "loss": 0.5634, |
| "num_input_tokens_seen": 1327032, |
| "step": 4660 |
| }, |
| { |
| "epoch": 8.703358208955224, |
| "grad_norm": 0.4699382185935974, |
| "learning_rate": 0.000695245663285482, |
| "loss": 0.3949, |
| "num_input_tokens_seen": 1328184, |
| "step": 4665 |
| }, |
| { |
| "epoch": 8.712686567164178, |
| "grad_norm": 0.3626880645751953, |
| "learning_rate": 0.0006944959824384067, |
| "loss": 0.5689, |
| "num_input_tokens_seen": 1329592, |
| "step": 4670 |
| }, |
| { |
| "epoch": 8.722014925373134, |
| "grad_norm": 0.2578609883785248, |
| "learning_rate": 0.0006937457860355002, |
| "loss": 0.4704, |
| "num_input_tokens_seen": 1330904, |
| "step": 4675 |
| }, |
| { |
| "epoch": 8.73134328358209, |
| "grad_norm": 0.3120516538619995, |
| "learning_rate": 0.0006929950760653285, |
| "loss": 0.4856, |
| "num_input_tokens_seen": 1332440, |
| "step": 4680 |
| }, |
| { |
| "epoch": 8.740671641791044, |
| "grad_norm": 0.24091090261936188, |
| "learning_rate": 0.0006922438545178194, |
| "loss": 0.5936, |
| "num_input_tokens_seen": 1333784, |
| "step": 4685 |
| }, |
| { |
| "epoch": 8.75, |
| "grad_norm": 0.4123125970363617, |
| "learning_rate": 0.000691492123384256, |
| "loss": 0.5579, |
| "num_input_tokens_seen": 1335128, |
| "step": 4690 |
| }, |
| { |
| "epoch": 8.759328358208956, |
| "grad_norm": 0.19290226697921753, |
| "learning_rate": 0.0006907398846572728, |
| "loss": 0.396, |
| "num_input_tokens_seen": 1336568, |
| "step": 4695 |
| }, |
| { |
| "epoch": 8.76865671641791, |
| "grad_norm": 0.2879132032394409, |
| "learning_rate": 0.0006899871403308498, |
| "loss": 0.3578, |
| "num_input_tokens_seen": 1338040, |
| "step": 4700 |
| }, |
| { |
| "epoch": 8.777985074626866, |
| "grad_norm": 0.23420406877994537, |
| "learning_rate": 0.0006892338924003068, |
| "loss": 0.4722, |
| "num_input_tokens_seen": 1339608, |
| "step": 4705 |
| }, |
| { |
| "epoch": 8.787313432835822, |
| "grad_norm": 0.3197929561138153, |
| "learning_rate": 0.0006884801428622989, |
| "loss": 0.5515, |
| "num_input_tokens_seen": 1340920, |
| "step": 4710 |
| }, |
| { |
| "epoch": 8.796641791044776, |
| "grad_norm": 0.3196072578430176, |
| "learning_rate": 0.0006877258937148103, |
| "loss": 0.3438, |
| "num_input_tokens_seen": 1342328, |
| "step": 4715 |
| }, |
| { |
| "epoch": 8.805970149253731, |
| "grad_norm": 0.3184308409690857, |
| "learning_rate": 0.0006869711469571504, |
| "loss": 0.4048, |
| "num_input_tokens_seen": 1343672, |
| "step": 4720 |
| }, |
| { |
| "epoch": 8.815298507462687, |
| "grad_norm": 0.16743049025535583, |
| "learning_rate": 0.0006862159045899468, |
| "loss": 0.4086, |
| "num_input_tokens_seen": 1345176, |
| "step": 4725 |
| }, |
| { |
| "epoch": 8.824626865671641, |
| "grad_norm": 0.250658243894577, |
| "learning_rate": 0.0006854601686151412, |
| "loss": 0.4755, |
| "num_input_tokens_seen": 1346552, |
| "step": 4730 |
| }, |
| { |
| "epoch": 8.833955223880597, |
| "grad_norm": 0.3016541302204132, |
| "learning_rate": 0.0006847039410359837, |
| "loss": 0.4492, |
| "num_input_tokens_seen": 1348088, |
| "step": 4735 |
| }, |
| { |
| "epoch": 8.843283582089553, |
| "grad_norm": 0.1681380718946457, |
| "learning_rate": 0.0006839472238570273, |
| "loss": 0.4924, |
| "num_input_tokens_seen": 1349432, |
| "step": 4740 |
| }, |
| { |
| "epoch": 8.852611940298507, |
| "grad_norm": 0.32491400837898254, |
| "learning_rate": 0.0006831900190841231, |
| "loss": 0.5161, |
| "num_input_tokens_seen": 1350712, |
| "step": 4745 |
| }, |
| { |
| "epoch": 8.861940298507463, |
| "grad_norm": 0.4035620391368866, |
| "learning_rate": 0.0006824323287244146, |
| "loss": 0.6008, |
| "num_input_tokens_seen": 1351992, |
| "step": 4750 |
| }, |
| { |
| "epoch": 8.871268656716419, |
| "grad_norm": 0.3141079843044281, |
| "learning_rate": 0.0006816741547863324, |
| "loss": 0.3931, |
| "num_input_tokens_seen": 1353496, |
| "step": 4755 |
| }, |
| { |
| "epoch": 8.880597014925373, |
| "grad_norm": 0.23624996840953827, |
| "learning_rate": 0.0006809154992795887, |
| "loss": 0.4218, |
| "num_input_tokens_seen": 1354904, |
| "step": 4760 |
| }, |
| { |
| "epoch": 8.889925373134329, |
| "grad_norm": 0.3903117775917053, |
| "learning_rate": 0.0006801563642151729, |
| "loss": 0.6213, |
| "num_input_tokens_seen": 1356280, |
| "step": 4765 |
| }, |
| { |
| "epoch": 8.899253731343283, |
| "grad_norm": 0.4300483465194702, |
| "learning_rate": 0.0006793967516053448, |
| "loss": 0.6809, |
| "num_input_tokens_seen": 1357720, |
| "step": 4770 |
| }, |
| { |
| "epoch": 8.908582089552239, |
| "grad_norm": 0.2975708246231079, |
| "learning_rate": 0.0006786366634636303, |
| "loss": 0.9333, |
| "num_input_tokens_seen": 1359096, |
| "step": 4775 |
| }, |
| { |
| "epoch": 8.917910447761194, |
| "grad_norm": 0.2819536626338959, |
| "learning_rate": 0.0006778761018048161, |
| "loss": 0.6371, |
| "num_input_tokens_seen": 1360472, |
| "step": 4780 |
| }, |
| { |
| "epoch": 8.927238805970148, |
| "grad_norm": 0.3771061599254608, |
| "learning_rate": 0.0006771150686449435, |
| "loss": 0.5515, |
| "num_input_tokens_seen": 1362008, |
| "step": 4785 |
| }, |
| { |
| "epoch": 8.936567164179104, |
| "grad_norm": 0.37994271516799927, |
| "learning_rate": 0.0006763535660013044, |
| "loss": 0.4953, |
| "num_input_tokens_seen": 1363384, |
| "step": 4790 |
| }, |
| { |
| "epoch": 8.94589552238806, |
| "grad_norm": 0.2501105070114136, |
| "learning_rate": 0.0006755915958924344, |
| "loss": 0.4797, |
| "num_input_tokens_seen": 1364728, |
| "step": 4795 |
| }, |
| { |
| "epoch": 8.955223880597014, |
| "grad_norm": 0.2391573041677475, |
| "learning_rate": 0.0006748291603381087, |
| "loss": 0.3529, |
| "num_input_tokens_seen": 1366136, |
| "step": 4800 |
| }, |
| { |
| "epoch": 8.96455223880597, |
| "grad_norm": 0.19431819021701813, |
| "learning_rate": 0.000674066261359336, |
| "loss": 0.4215, |
| "num_input_tokens_seen": 1367448, |
| "step": 4805 |
| }, |
| { |
| "epoch": 8.973880597014926, |
| "grad_norm": 0.23345661163330078, |
| "learning_rate": 0.0006733029009783537, |
| "loss": 0.5026, |
| "num_input_tokens_seen": 1369208, |
| "step": 4810 |
| }, |
| { |
| "epoch": 8.98320895522388, |
| "grad_norm": 0.3160540759563446, |
| "learning_rate": 0.000672539081218622, |
| "loss": 0.2601, |
| "num_input_tokens_seen": 1370744, |
| "step": 4815 |
| }, |
| { |
| "epoch": 8.992537313432836, |
| "grad_norm": 0.2732608914375305, |
| "learning_rate": 0.0006717748041048187, |
| "loss": 0.4701, |
| "num_input_tokens_seen": 1372056, |
| "step": 4820 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 0.6892353296279907, |
| "eval_runtime": 4.1958, |
| "eval_samples_per_second": 56.724, |
| "eval_steps_per_second": 14.3, |
| "num_input_tokens_seen": 1373032, |
| "step": 4824 |
| }, |
| { |
| "epoch": 9.001865671641792, |
| "grad_norm": 0.3725232779979706, |
| "learning_rate": 0.0006710100716628344, |
| "loss": 0.6341, |
| "num_input_tokens_seen": 1373416, |
| "step": 4825 |
| }, |
| { |
| "epoch": 9.011194029850746, |
| "grad_norm": 0.3368667960166931, |
| "learning_rate": 0.0006702448859197661, |
| "loss": 0.574, |
| "num_input_tokens_seen": 1374888, |
| "step": 4830 |
| }, |
| { |
| "epoch": 9.020522388059701, |
| "grad_norm": 0.21971359848976135, |
| "learning_rate": 0.0006694792489039128, |
| "loss": 0.4461, |
| "num_input_tokens_seen": 1376392, |
| "step": 4835 |
| }, |
| { |
| "epoch": 9.029850746268657, |
| "grad_norm": 0.36806461215019226, |
| "learning_rate": 0.0006687131626447694, |
| "loss": 0.5298, |
| "num_input_tokens_seen": 1377992, |
| "step": 4840 |
| }, |
| { |
| "epoch": 9.039179104477611, |
| "grad_norm": 0.24120855331420898, |
| "learning_rate": 0.0006679466291730218, |
| "loss": 0.4954, |
| "num_input_tokens_seen": 1379400, |
| "step": 4845 |
| }, |
| { |
| "epoch": 9.048507462686567, |
| "grad_norm": 0.16234050691127777, |
| "learning_rate": 0.0006671796505205414, |
| "loss": 0.4344, |
| "num_input_tokens_seen": 1380872, |
| "step": 4850 |
| }, |
| { |
| "epoch": 9.057835820895523, |
| "grad_norm": 0.2468133419752121, |
| "learning_rate": 0.0006664122287203791, |
| "loss": 0.6219, |
| "num_input_tokens_seen": 1382184, |
| "step": 4855 |
| }, |
| { |
| "epoch": 9.067164179104477, |
| "grad_norm": 0.288142591714859, |
| "learning_rate": 0.0006656443658067615, |
| "loss": 0.3348, |
| "num_input_tokens_seen": 1383816, |
| "step": 4860 |
| }, |
| { |
| "epoch": 9.076492537313433, |
| "grad_norm": 0.5281508564949036, |
| "learning_rate": 0.0006648760638150832, |
| "loss": 0.5361, |
| "num_input_tokens_seen": 1385128, |
| "step": 4865 |
| }, |
| { |
| "epoch": 9.085820895522389, |
| "grad_norm": 0.3075248897075653, |
| "learning_rate": 0.0006641073247819041, |
| "loss": 0.4782, |
| "num_input_tokens_seen": 1386440, |
| "step": 4870 |
| }, |
| { |
| "epoch": 9.095149253731343, |
| "grad_norm": 0.18158727884292603, |
| "learning_rate": 0.0006633381507449412, |
| "loss": 0.4621, |
| "num_input_tokens_seen": 1387944, |
| "step": 4875 |
| }, |
| { |
| "epoch": 9.104477611940299, |
| "grad_norm": 0.4284612238407135, |
| "learning_rate": 0.0006625685437430655, |
| "loss": 0.5046, |
| "num_input_tokens_seen": 1389416, |
| "step": 4880 |
| }, |
| { |
| "epoch": 9.113805970149254, |
| "grad_norm": 0.2957424521446228, |
| "learning_rate": 0.0006617985058162953, |
| "loss": 0.3839, |
| "num_input_tokens_seen": 1390632, |
| "step": 4885 |
| }, |
| { |
| "epoch": 9.123134328358208, |
| "grad_norm": 0.23130932450294495, |
| "learning_rate": 0.0006610280390057914, |
| "loss": 0.3038, |
| "num_input_tokens_seen": 1392200, |
| "step": 4890 |
| }, |
| { |
| "epoch": 9.132462686567164, |
| "grad_norm": 0.20099236071109772, |
| "learning_rate": 0.000660257145353851, |
| "loss": 0.2988, |
| "num_input_tokens_seen": 1393576, |
| "step": 4895 |
| }, |
| { |
| "epoch": 9.14179104477612, |
| "grad_norm": 0.37872520089149475, |
| "learning_rate": 0.0006594858269039032, |
| "loss": 0.3758, |
| "num_input_tokens_seen": 1394824, |
| "step": 4900 |
| }, |
| { |
| "epoch": 9.151119402985074, |
| "grad_norm": 0.28783118724823, |
| "learning_rate": 0.0006587140857005029, |
| "loss": 0.4344, |
| "num_input_tokens_seen": 1396136, |
| "step": 4905 |
| }, |
| { |
| "epoch": 9.16044776119403, |
| "grad_norm": 0.5245941877365112, |
| "learning_rate": 0.0006579419237893256, |
| "loss": 0.6692, |
| "num_input_tokens_seen": 1397384, |
| "step": 4910 |
| }, |
| { |
| "epoch": 9.169776119402986, |
| "grad_norm": 0.38943246006965637, |
| "learning_rate": 0.0006571693432171624, |
| "loss": 0.4255, |
| "num_input_tokens_seen": 1399016, |
| "step": 4915 |
| }, |
| { |
| "epoch": 9.17910447761194, |
| "grad_norm": 0.3359316289424896, |
| "learning_rate": 0.0006563963460319134, |
| "loss": 0.4888, |
| "num_input_tokens_seen": 1400424, |
| "step": 4920 |
| }, |
| { |
| "epoch": 9.188432835820896, |
| "grad_norm": 0.2700708210468292, |
| "learning_rate": 0.0006556229342825835, |
| "loss": 0.461, |
| "num_input_tokens_seen": 1401992, |
| "step": 4925 |
| }, |
| { |
| "epoch": 9.197761194029852, |
| "grad_norm": 0.2669214606285095, |
| "learning_rate": 0.0006548491100192763, |
| "loss": 0.4753, |
| "num_input_tokens_seen": 1403336, |
| "step": 4930 |
| }, |
| { |
| "epoch": 9.207089552238806, |
| "grad_norm": 0.3095604181289673, |
| "learning_rate": 0.0006540748752931894, |
| "loss": 0.4736, |
| "num_input_tokens_seen": 1404840, |
| "step": 4935 |
| }, |
| { |
| "epoch": 9.216417910447761, |
| "grad_norm": 0.18496310710906982, |
| "learning_rate": 0.0006533002321566078, |
| "loss": 0.4155, |
| "num_input_tokens_seen": 1406216, |
| "step": 4940 |
| }, |
| { |
| "epoch": 9.225746268656717, |
| "grad_norm": 0.34469074010849, |
| "learning_rate": 0.0006525251826628991, |
| "loss": 0.3456, |
| "num_input_tokens_seen": 1407656, |
| "step": 4945 |
| }, |
| { |
| "epoch": 9.235074626865671, |
| "grad_norm": 0.21673814952373505, |
| "learning_rate": 0.0006517497288665086, |
| "loss": 0.1967, |
| "num_input_tokens_seen": 1409448, |
| "step": 4950 |
| }, |
| { |
| "epoch": 9.244402985074627, |
| "grad_norm": 0.357761025428772, |
| "learning_rate": 0.0006509738728229525, |
| "loss": 0.4642, |
| "num_input_tokens_seen": 1410792, |
| "step": 4955 |
| }, |
| { |
| "epoch": 9.253731343283581, |
| "grad_norm": 0.3709312975406647, |
| "learning_rate": 0.000650197616588814, |
| "loss": 0.449, |
| "num_input_tokens_seen": 1412296, |
| "step": 4960 |
| }, |
| { |
| "epoch": 9.263059701492537, |
| "grad_norm": 0.4108433723449707, |
| "learning_rate": 0.0006494209622217365, |
| "loss": 0.4615, |
| "num_input_tokens_seen": 1413800, |
| "step": 4965 |
| }, |
| { |
| "epoch": 9.272388059701493, |
| "grad_norm": 0.26729562878608704, |
| "learning_rate": 0.0006486439117804195, |
| "loss": 0.495, |
| "num_input_tokens_seen": 1415176, |
| "step": 4970 |
| }, |
| { |
| "epoch": 9.281716417910447, |
| "grad_norm": 0.2769668996334076, |
| "learning_rate": 0.0006478664673246115, |
| "loss": 0.327, |
| "num_input_tokens_seen": 1416616, |
| "step": 4975 |
| }, |
| { |
| "epoch": 9.291044776119403, |
| "grad_norm": 0.31615519523620605, |
| "learning_rate": 0.0006470886309151058, |
| "loss": 0.5333, |
| "num_input_tokens_seen": 1417960, |
| "step": 4980 |
| }, |
| { |
| "epoch": 9.300373134328359, |
| "grad_norm": 0.39532470703125, |
| "learning_rate": 0.0006463104046137349, |
| "loss": 0.5181, |
| "num_input_tokens_seen": 1419176, |
| "step": 4985 |
| }, |
| { |
| "epoch": 9.309701492537313, |
| "grad_norm": 0.27754735946655273, |
| "learning_rate": 0.0006455317904833644, |
| "loss": 0.3109, |
| "num_input_tokens_seen": 1420584, |
| "step": 4990 |
| }, |
| { |
| "epoch": 9.319029850746269, |
| "grad_norm": 0.24021460115909576, |
| "learning_rate": 0.0006447527905878883, |
| "loss": 0.4653, |
| "num_input_tokens_seen": 1422024, |
| "step": 4995 |
| }, |
| { |
| "epoch": 9.328358208955224, |
| "grad_norm": 0.35687342286109924, |
| "learning_rate": 0.0006439734069922229, |
| "loss": 0.4767, |
| "num_input_tokens_seen": 1423336, |
| "step": 5000 |
| }, |
| { |
| "epoch": 9.337686567164178, |
| "grad_norm": 0.32762643694877625, |
| "learning_rate": 0.0006431936417623016, |
| "loss": 0.4609, |
| "num_input_tokens_seen": 1424680, |
| "step": 5005 |
| }, |
| { |
| "epoch": 9.347014925373134, |
| "grad_norm": 0.31598585844039917, |
| "learning_rate": 0.0006424134969650695, |
| "loss": 0.4259, |
| "num_input_tokens_seen": 1426024, |
| "step": 5010 |
| }, |
| { |
| "epoch": 9.35634328358209, |
| "grad_norm": 0.6430713534355164, |
| "learning_rate": 0.0006416329746684779, |
| "loss": 0.685, |
| "num_input_tokens_seen": 1427304, |
| "step": 5015 |
| }, |
| { |
| "epoch": 9.365671641791044, |
| "grad_norm": 0.25953567028045654, |
| "learning_rate": 0.0006408520769414785, |
| "loss": 0.3353, |
| "num_input_tokens_seen": 1428936, |
| "step": 5020 |
| }, |
| { |
| "epoch": 9.375, |
| "grad_norm": 0.22472409904003143, |
| "learning_rate": 0.0006400708058540182, |
| "loss": 0.3017, |
| "num_input_tokens_seen": 1430344, |
| "step": 5025 |
| }, |
| { |
| "epoch": 9.384328358208956, |
| "grad_norm": 0.32781997323036194, |
| "learning_rate": 0.0006392891634770341, |
| "loss": 0.3861, |
| "num_input_tokens_seen": 1431656, |
| "step": 5030 |
| }, |
| { |
| "epoch": 9.39365671641791, |
| "grad_norm": 0.3750348389148712, |
| "learning_rate": 0.0006385071518824467, |
| "loss": 0.5049, |
| "num_input_tokens_seen": 1433032, |
| "step": 5035 |
| }, |
| { |
| "epoch": 9.402985074626866, |
| "grad_norm": 0.3276551365852356, |
| "learning_rate": 0.0006377247731431557, |
| "loss": 0.4301, |
| "num_input_tokens_seen": 1434440, |
| "step": 5040 |
| }, |
| { |
| "epoch": 9.412313432835822, |
| "grad_norm": 0.3144753873348236, |
| "learning_rate": 0.0006369420293330338, |
| "loss": 0.5351, |
| "num_input_tokens_seen": 1436072, |
| "step": 5045 |
| }, |
| { |
| "epoch": 9.421641791044776, |
| "grad_norm": 0.21689414978027344, |
| "learning_rate": 0.0006361589225269216, |
| "loss": 0.2511, |
| "num_input_tokens_seen": 1437448, |
| "step": 5050 |
| }, |
| { |
| "epoch": 9.430970149253731, |
| "grad_norm": 0.2835860252380371, |
| "learning_rate": 0.0006353754548006215, |
| "loss": 0.4159, |
| "num_input_tokens_seen": 1438792, |
| "step": 5055 |
| }, |
| { |
| "epoch": 9.440298507462687, |
| "grad_norm": 0.2518909275531769, |
| "learning_rate": 0.0006345916282308932, |
| "loss": 0.4009, |
| "num_input_tokens_seen": 1440168, |
| "step": 5060 |
| }, |
| { |
| "epoch": 9.449626865671641, |
| "grad_norm": 0.2714730203151703, |
| "learning_rate": 0.0006338074448954472, |
| "loss": 0.4108, |
| "num_input_tokens_seen": 1441640, |
| "step": 5065 |
| }, |
| { |
| "epoch": 9.458955223880597, |
| "grad_norm": 0.4161335527896881, |
| "learning_rate": 0.0006330229068729396, |
| "loss": 0.3473, |
| "num_input_tokens_seen": 1443048, |
| "step": 5070 |
| }, |
| { |
| "epoch": 9.468283582089553, |
| "grad_norm": 0.38253721594810486, |
| "learning_rate": 0.0006322380162429671, |
| "loss": 0.48, |
| "num_input_tokens_seen": 1444360, |
| "step": 5075 |
| }, |
| { |
| "epoch": 9.477611940298507, |
| "grad_norm": 0.21055766940116882, |
| "learning_rate": 0.0006314527750860603, |
| "loss": 0.5905, |
| "num_input_tokens_seen": 1445864, |
| "step": 5080 |
| }, |
| { |
| "epoch": 9.486940298507463, |
| "grad_norm": 0.5116686820983887, |
| "learning_rate": 0.0006306671854836801, |
| "loss": 0.4005, |
| "num_input_tokens_seen": 1447240, |
| "step": 5085 |
| }, |
| { |
| "epoch": 9.496268656716419, |
| "grad_norm": 0.38486605882644653, |
| "learning_rate": 0.00062988124951821, |
| "loss": 0.6322, |
| "num_input_tokens_seen": 1448616, |
| "step": 5090 |
| }, |
| { |
| "epoch": 9.505597014925373, |
| "grad_norm": 0.3872508704662323, |
| "learning_rate": 0.0006290949692729521, |
| "loss": 0.3415, |
| "num_input_tokens_seen": 1450056, |
| "step": 5095 |
| }, |
| { |
| "epoch": 9.514925373134329, |
| "grad_norm": 0.21692124009132385, |
| "learning_rate": 0.000628308346832121, |
| "loss": 0.4015, |
| "num_input_tokens_seen": 1451496, |
| "step": 5100 |
| }, |
| { |
| "epoch": 9.524253731343283, |
| "grad_norm": 0.19488053023815155, |
| "learning_rate": 0.0006275213842808383, |
| "loss": 0.3756, |
| "num_input_tokens_seen": 1453288, |
| "step": 5105 |
| }, |
| { |
| "epoch": 9.533582089552239, |
| "grad_norm": 0.18054017424583435, |
| "learning_rate": 0.0006267340837051273, |
| "loss": 0.5908, |
| "num_input_tokens_seen": 1454792, |
| "step": 5110 |
| }, |
| { |
| "epoch": 9.542910447761194, |
| "grad_norm": 0.26882755756378174, |
| "learning_rate": 0.0006259464471919069, |
| "loss": 0.4475, |
| "num_input_tokens_seen": 1456136, |
| "step": 5115 |
| }, |
| { |
| "epoch": 9.552238805970148, |
| "grad_norm": 0.21234282851219177, |
| "learning_rate": 0.0006251584768289874, |
| "loss": 0.4061, |
| "num_input_tokens_seen": 1457672, |
| "step": 5120 |
| }, |
| { |
| "epoch": 9.561567164179104, |
| "grad_norm": 0.33642059564590454, |
| "learning_rate": 0.000624370174705063, |
| "loss": 0.7005, |
| "num_input_tokens_seen": 1459080, |
| "step": 5125 |
| }, |
| { |
| "epoch": 9.57089552238806, |
| "grad_norm": 0.1804080754518509, |
| "learning_rate": 0.000623581542909708, |
| "loss": 0.452, |
| "num_input_tokens_seen": 1460552, |
| "step": 5130 |
| }, |
| { |
| "epoch": 9.580223880597014, |
| "grad_norm": 0.3603133261203766, |
| "learning_rate": 0.0006227925835333699, |
| "loss": 0.4513, |
| "num_input_tokens_seen": 1461832, |
| "step": 5135 |
| }, |
| { |
| "epoch": 9.58955223880597, |
| "grad_norm": 0.18820270895957947, |
| "learning_rate": 0.0006220032986673652, |
| "loss": 0.3246, |
| "num_input_tokens_seen": 1463176, |
| "step": 5140 |
| }, |
| { |
| "epoch": 9.598880597014926, |
| "grad_norm": 0.4032508432865143, |
| "learning_rate": 0.000621213690403873, |
| "loss": 0.486, |
| "num_input_tokens_seen": 1464488, |
| "step": 5145 |
| }, |
| { |
| "epoch": 9.60820895522388, |
| "grad_norm": 0.4418434202671051, |
| "learning_rate": 0.0006204237608359296, |
| "loss": 0.4409, |
| "num_input_tokens_seen": 1465928, |
| "step": 5150 |
| }, |
| { |
| "epoch": 9.617537313432836, |
| "grad_norm": 0.4267770051956177, |
| "learning_rate": 0.0006196335120574227, |
| "loss": 0.3294, |
| "num_input_tokens_seen": 1467432, |
| "step": 5155 |
| }, |
| { |
| "epoch": 9.626865671641792, |
| "grad_norm": 0.2499537467956543, |
| "learning_rate": 0.0006188429461630866, |
| "loss": 0.3229, |
| "num_input_tokens_seen": 1468840, |
| "step": 5160 |
| }, |
| { |
| "epoch": 9.636194029850746, |
| "grad_norm": 0.3874378204345703, |
| "learning_rate": 0.000618052065248496, |
| "loss": 0.4701, |
| "num_input_tokens_seen": 1470280, |
| "step": 5165 |
| }, |
| { |
| "epoch": 9.645522388059701, |
| "grad_norm": 0.25871914625167847, |
| "learning_rate": 0.0006172608714100603, |
| "loss": 0.3273, |
| "num_input_tokens_seen": 1471880, |
| "step": 5170 |
| }, |
| { |
| "epoch": 9.654850746268657, |
| "grad_norm": 0.2216385304927826, |
| "learning_rate": 0.000616469366745019, |
| "loss": 0.4356, |
| "num_input_tokens_seen": 1473224, |
| "step": 5175 |
| }, |
| { |
| "epoch": 9.664179104477611, |
| "grad_norm": 0.26175910234451294, |
| "learning_rate": 0.0006156775533514353, |
| "loss": 0.4981, |
| "num_input_tokens_seen": 1474504, |
| "step": 5180 |
| }, |
| { |
| "epoch": 9.673507462686567, |
| "grad_norm": 0.22560495138168335, |
| "learning_rate": 0.0006148854333281905, |
| "loss": 0.386, |
| "num_input_tokens_seen": 1476008, |
| "step": 5185 |
| }, |
| { |
| "epoch": 9.682835820895523, |
| "grad_norm": 0.2675865888595581, |
| "learning_rate": 0.0006140930087749789, |
| "loss": 0.4949, |
| "num_input_tokens_seen": 1477416, |
| "step": 5190 |
| }, |
| { |
| "epoch": 9.692164179104477, |
| "grad_norm": 0.41903549432754517, |
| "learning_rate": 0.0006133002817923018, |
| "loss": 0.5435, |
| "num_input_tokens_seen": 1478696, |
| "step": 5195 |
| }, |
| { |
| "epoch": 9.701492537313433, |
| "grad_norm": 0.3527034819126129, |
| "learning_rate": 0.0006125072544814625, |
| "loss": 0.5474, |
| "num_input_tokens_seen": 1480040, |
| "step": 5200 |
| }, |
| { |
| "epoch": 9.710820895522389, |
| "grad_norm": 0.4132210910320282, |
| "learning_rate": 0.0006117139289445601, |
| "loss": 0.4877, |
| "num_input_tokens_seen": 1481384, |
| "step": 5205 |
| }, |
| { |
| "epoch": 9.720149253731343, |
| "grad_norm": 0.3663824498653412, |
| "learning_rate": 0.0006109203072844847, |
| "loss": 0.7451, |
| "num_input_tokens_seen": 1482760, |
| "step": 5210 |
| }, |
| { |
| "epoch": 9.729477611940299, |
| "grad_norm": 0.29538971185684204, |
| "learning_rate": 0.0006101263916049107, |
| "loss": 0.4493, |
| "num_input_tokens_seen": 1484136, |
| "step": 5215 |
| }, |
| { |
| "epoch": 9.738805970149254, |
| "grad_norm": 0.341337114572525, |
| "learning_rate": 0.0006093321840102921, |
| "loss": 0.5761, |
| "num_input_tokens_seen": 1485672, |
| "step": 5220 |
| }, |
| { |
| "epoch": 9.748134328358208, |
| "grad_norm": 0.17743420600891113, |
| "learning_rate": 0.0006085376866058568, |
| "loss": 0.2284, |
| "num_input_tokens_seen": 1487304, |
| "step": 5225 |
| }, |
| { |
| "epoch": 9.757462686567164, |
| "grad_norm": 0.2598201632499695, |
| "learning_rate": 0.000607742901497601, |
| "loss": 0.4622, |
| "num_input_tokens_seen": 1488808, |
| "step": 5230 |
| }, |
| { |
| "epoch": 9.76679104477612, |
| "grad_norm": 0.2551731467247009, |
| "learning_rate": 0.0006069478307922831, |
| "loss": 0.61, |
| "num_input_tokens_seen": 1490280, |
| "step": 5235 |
| }, |
| { |
| "epoch": 9.776119402985074, |
| "grad_norm": 0.4407395124435425, |
| "learning_rate": 0.000606152476597419, |
| "loss": 0.3799, |
| "num_input_tokens_seen": 1491656, |
| "step": 5240 |
| }, |
| { |
| "epoch": 9.78544776119403, |
| "grad_norm": 0.41758984327316284, |
| "learning_rate": 0.0006053568410212759, |
| "loss": 0.4126, |
| "num_input_tokens_seen": 1493160, |
| "step": 5245 |
| }, |
| { |
| "epoch": 9.794776119402986, |
| "grad_norm": 0.3324006497859955, |
| "learning_rate": 0.0006045609261728667, |
| "loss": 0.4813, |
| "num_input_tokens_seen": 1494504, |
| "step": 5250 |
| }, |
| { |
| "epoch": 9.80410447761194, |
| "grad_norm": 0.2460772544145584, |
| "learning_rate": 0.0006037647341619448, |
| "loss": 0.3861, |
| "num_input_tokens_seen": 1495816, |
| "step": 5255 |
| }, |
| { |
| "epoch": 9.813432835820896, |
| "grad_norm": 0.2575085759162903, |
| "learning_rate": 0.000602968267098998, |
| "loss": 0.4007, |
| "num_input_tokens_seen": 1497288, |
| "step": 5260 |
| }, |
| { |
| "epoch": 9.822761194029852, |
| "grad_norm": 0.2791326940059662, |
| "learning_rate": 0.0006021715270952435, |
| "loss": 0.4939, |
| "num_input_tokens_seen": 1498792, |
| "step": 5265 |
| }, |
| { |
| "epoch": 9.832089552238806, |
| "grad_norm": 0.38074228167533875, |
| "learning_rate": 0.000601374516262622, |
| "loss": 0.4303, |
| "num_input_tokens_seen": 1500520, |
| "step": 5270 |
| }, |
| { |
| "epoch": 9.841417910447761, |
| "grad_norm": 0.34724968671798706, |
| "learning_rate": 0.0006005772367137917, |
| "loss": 0.2854, |
| "num_input_tokens_seen": 1501832, |
| "step": 5275 |
| }, |
| { |
| "epoch": 9.850746268656717, |
| "grad_norm": 0.27803465723991394, |
| "learning_rate": 0.0005997796905621236, |
| "loss": 0.4868, |
| "num_input_tokens_seen": 1503112, |
| "step": 5280 |
| }, |
| { |
| "epoch": 9.860074626865671, |
| "grad_norm": 0.31478098034858704, |
| "learning_rate": 0.0005989818799216949, |
| "loss": 0.4081, |
| "num_input_tokens_seen": 1504392, |
| "step": 5285 |
| }, |
| { |
| "epoch": 9.869402985074627, |
| "grad_norm": 0.3727143108844757, |
| "learning_rate": 0.0005981838069072843, |
| "loss": 0.5072, |
| "num_input_tokens_seen": 1505640, |
| "step": 5290 |
| }, |
| { |
| "epoch": 9.878731343283581, |
| "grad_norm": 0.2286718338727951, |
| "learning_rate": 0.0005973854736343658, |
| "loss": 0.5946, |
| "num_input_tokens_seen": 1506824, |
| "step": 5295 |
| }, |
| { |
| "epoch": 9.888059701492537, |
| "grad_norm": 0.24771994352340698, |
| "learning_rate": 0.0005965868822191032, |
| "loss": 0.3597, |
| "num_input_tokens_seen": 1508264, |
| "step": 5300 |
| }, |
| { |
| "epoch": 9.897388059701493, |
| "grad_norm": 0.2050355076789856, |
| "learning_rate": 0.0005957880347783449, |
| "loss": 0.4493, |
| "num_input_tokens_seen": 1509704, |
| "step": 5305 |
| }, |
| { |
| "epoch": 9.906716417910447, |
| "grad_norm": 0.35616564750671387, |
| "learning_rate": 0.0005949889334296172, |
| "loss": 0.4813, |
| "num_input_tokens_seen": 1511304, |
| "step": 5310 |
| }, |
| { |
| "epoch": 9.916044776119403, |
| "grad_norm": 0.3519928455352783, |
| "learning_rate": 0.0005941895802911205, |
| "loss": 0.288, |
| "num_input_tokens_seen": 1512712, |
| "step": 5315 |
| }, |
| { |
| "epoch": 9.925373134328359, |
| "grad_norm": 0.31984153389930725, |
| "learning_rate": 0.0005933899774817216, |
| "loss": 0.4164, |
| "num_input_tokens_seen": 1514184, |
| "step": 5320 |
| }, |
| { |
| "epoch": 9.934701492537313, |
| "grad_norm": 0.34073641896247864, |
| "learning_rate": 0.00059259012712095, |
| "loss": 0.4098, |
| "num_input_tokens_seen": 1515464, |
| "step": 5325 |
| }, |
| { |
| "epoch": 9.944029850746269, |
| "grad_norm": 0.17610888183116913, |
| "learning_rate": 0.0005917900313289906, |
| "loss": 0.4047, |
| "num_input_tokens_seen": 1516968, |
| "step": 5330 |
| }, |
| { |
| "epoch": 9.953358208955224, |
| "grad_norm": 0.38012248277664185, |
| "learning_rate": 0.0005909896922266796, |
| "loss": 0.3608, |
| "num_input_tokens_seen": 1518504, |
| "step": 5335 |
| }, |
| { |
| "epoch": 9.962686567164178, |
| "grad_norm": 0.21262069046497345, |
| "learning_rate": 0.0005901891119354976, |
| "loss": 0.3746, |
| "num_input_tokens_seen": 1519880, |
| "step": 5340 |
| }, |
| { |
| "epoch": 9.972014925373134, |
| "grad_norm": 0.20688098669052124, |
| "learning_rate": 0.0005893882925775648, |
| "loss": 0.4088, |
| "num_input_tokens_seen": 1521320, |
| "step": 5345 |
| }, |
| { |
| "epoch": 9.98134328358209, |
| "grad_norm": 0.3373405635356903, |
| "learning_rate": 0.0005885872362756348, |
| "loss": 0.4424, |
| "num_input_tokens_seen": 1522696, |
| "step": 5350 |
| }, |
| { |
| "epoch": 9.990671641791044, |
| "grad_norm": 0.2352280467748642, |
| "learning_rate": 0.0005877859451530895, |
| "loss": 0.4812, |
| "num_input_tokens_seen": 1524072, |
| "step": 5355 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 1.3264073133468628, |
| "learning_rate": 0.0005869844213339338, |
| "loss": 0.9467, |
| "num_input_tokens_seen": 1525104, |
| "step": 5360 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 0.6917274594306946, |
| "eval_runtime": 4.1989, |
| "eval_samples_per_second": 56.681, |
| "eval_steps_per_second": 14.289, |
| "num_input_tokens_seen": 1525104, |
| "step": 5360 |
| }, |
| { |
| "epoch": 10.009328358208956, |
| "grad_norm": 0.391521155834198, |
| "learning_rate": 0.0005861826669427882, |
| "loss": 0.4883, |
| "num_input_tokens_seen": 1526608, |
| "step": 5365 |
| }, |
| { |
| "epoch": 10.01865671641791, |
| "grad_norm": 0.3746884763240814, |
| "learning_rate": 0.0005853806841048854, |
| "loss": 0.4612, |
| "num_input_tokens_seen": 1528048, |
| "step": 5370 |
| }, |
| { |
| "epoch": 10.027985074626866, |
| "grad_norm": 0.44840776920318604, |
| "learning_rate": 0.0005845784749460631, |
| "loss": 0.567, |
| "num_input_tokens_seen": 1529360, |
| "step": 5375 |
| }, |
| { |
| "epoch": 10.037313432835822, |
| "grad_norm": 0.4189889430999756, |
| "learning_rate": 0.0005837760415927593, |
| "loss": 0.4075, |
| "num_input_tokens_seen": 1530672, |
| "step": 5380 |
| }, |
| { |
| "epoch": 10.046641791044776, |
| "grad_norm": 0.37005850672721863, |
| "learning_rate": 0.0005829733861720059, |
| "loss": 0.4669, |
| "num_input_tokens_seen": 1532176, |
| "step": 5385 |
| }, |
| { |
| "epoch": 10.055970149253731, |
| "grad_norm": 0.3520726263523102, |
| "learning_rate": 0.0005821705108114236, |
| "loss": 0.348, |
| "num_input_tokens_seen": 1533680, |
| "step": 5390 |
| }, |
| { |
| "epoch": 10.065298507462687, |
| "grad_norm": 0.3111060559749603, |
| "learning_rate": 0.0005813674176392163, |
| "loss": 0.6895, |
| "num_input_tokens_seen": 1535024, |
| "step": 5395 |
| }, |
| { |
| "epoch": 10.074626865671641, |
| "grad_norm": 0.1819819211959839, |
| "learning_rate": 0.0005805641087841649, |
| "loss": 0.3598, |
| "num_input_tokens_seen": 1536624, |
| "step": 5400 |
| }, |
| { |
| "epoch": 10.083955223880597, |
| "grad_norm": 0.3409096896648407, |
| "learning_rate": 0.0005797605863756224, |
| "loss": 0.4161, |
| "num_input_tokens_seen": 1538000, |
| "step": 5405 |
| }, |
| { |
| "epoch": 10.093283582089553, |
| "grad_norm": 0.3817724883556366, |
| "learning_rate": 0.0005789568525435076, |
| "loss": 0.3222, |
| "num_input_tokens_seen": 1539440, |
| "step": 5410 |
| }, |
| { |
| "epoch": 10.102611940298507, |
| "grad_norm": 0.3032921254634857, |
| "learning_rate": 0.0005781529094182995, |
| "loss": 0.4074, |
| "num_input_tokens_seen": 1540848, |
| "step": 5415 |
| }, |
| { |
| "epoch": 10.111940298507463, |
| "grad_norm": 0.514035165309906, |
| "learning_rate": 0.0005773487591310328, |
| "loss": 0.4141, |
| "num_input_tokens_seen": 1542352, |
| "step": 5420 |
| }, |
| { |
| "epoch": 10.121268656716419, |
| "grad_norm": 0.46226271986961365, |
| "learning_rate": 0.0005765444038132901, |
| "loss": 0.3154, |
| "num_input_tokens_seen": 1543856, |
| "step": 5425 |
| }, |
| { |
| "epoch": 10.130597014925373, |
| "grad_norm": 0.39483413100242615, |
| "learning_rate": 0.0005757398455971984, |
| "loss": 0.2744, |
| "num_input_tokens_seen": 1545264, |
| "step": 5430 |
| }, |
| { |
| "epoch": 10.139925373134329, |
| "grad_norm": 0.3189958333969116, |
| "learning_rate": 0.000574935086615422, |
| "loss": 0.3179, |
| "num_input_tokens_seen": 1546544, |
| "step": 5435 |
| }, |
| { |
| "epoch": 10.149253731343283, |
| "grad_norm": 0.21321584284305573, |
| "learning_rate": 0.000574130129001158, |
| "loss": 0.3314, |
| "num_input_tokens_seen": 1547856, |
| "step": 5440 |
| }, |
| { |
| "epoch": 10.158582089552239, |
| "grad_norm": 0.30174720287323, |
| "learning_rate": 0.000573324974888129, |
| "loss": 0.303, |
| "num_input_tokens_seen": 1549264, |
| "step": 5445 |
| }, |
| { |
| "epoch": 10.167910447761194, |
| "grad_norm": 0.22235773503780365, |
| "learning_rate": 0.0005725196264105796, |
| "loss": 0.4078, |
| "num_input_tokens_seen": 1550576, |
| "step": 5450 |
| }, |
| { |
| "epoch": 10.177238805970148, |
| "grad_norm": 0.3746367394924164, |
| "learning_rate": 0.0005717140857032691, |
| "loss": 0.2964, |
| "num_input_tokens_seen": 1552080, |
| "step": 5455 |
| }, |
| { |
| "epoch": 10.186567164179104, |
| "grad_norm": 0.319317102432251, |
| "learning_rate": 0.0005709083549014658, |
| "loss": 0.2896, |
| "num_input_tokens_seen": 1553488, |
| "step": 5460 |
| }, |
| { |
| "epoch": 10.19589552238806, |
| "grad_norm": 0.534639298915863, |
| "learning_rate": 0.0005701024361409431, |
| "loss": 0.4107, |
| "num_input_tokens_seen": 1554960, |
| "step": 5465 |
| }, |
| { |
| "epoch": 10.205223880597014, |
| "grad_norm": 0.45749685168266296, |
| "learning_rate": 0.0005692963315579712, |
| "loss": 0.2126, |
| "num_input_tokens_seen": 1556464, |
| "step": 5470 |
| }, |
| { |
| "epoch": 10.21455223880597, |
| "grad_norm": 0.6257716417312622, |
| "learning_rate": 0.0005684900432893141, |
| "loss": 0.6183, |
| "num_input_tokens_seen": 1557872, |
| "step": 5475 |
| }, |
| { |
| "epoch": 10.223880597014926, |
| "grad_norm": 0.3536997437477112, |
| "learning_rate": 0.0005676835734722222, |
| "loss": 0.4375, |
| "num_input_tokens_seen": 1559152, |
| "step": 5480 |
| }, |
| { |
| "epoch": 10.23320895522388, |
| "grad_norm": 0.4197767376899719, |
| "learning_rate": 0.0005668769242444271, |
| "loss": 0.4315, |
| "num_input_tokens_seen": 1560592, |
| "step": 5485 |
| }, |
| { |
| "epoch": 10.242537313432836, |
| "grad_norm": 0.4398539662361145, |
| "learning_rate": 0.0005660700977441358, |
| "loss": 0.6232, |
| "num_input_tokens_seen": 1561872, |
| "step": 5490 |
| }, |
| { |
| "epoch": 10.251865671641792, |
| "grad_norm": 0.3463667035102844, |
| "learning_rate": 0.000565263096110026, |
| "loss": 0.3988, |
| "num_input_tokens_seen": 1563280, |
| "step": 5495 |
| }, |
| { |
| "epoch": 10.261194029850746, |
| "grad_norm": 0.3945913314819336, |
| "learning_rate": 0.0005644559214812382, |
| "loss": 0.4993, |
| "num_input_tokens_seen": 1564464, |
| "step": 5500 |
| }, |
| { |
| "epoch": 10.270522388059701, |
| "grad_norm": 0.34219738841056824, |
| "learning_rate": 0.0005636485759973729, |
| "loss": 0.4785, |
| "num_input_tokens_seen": 1565648, |
| "step": 5505 |
| }, |
| { |
| "epoch": 10.279850746268657, |
| "grad_norm": 0.2330724447965622, |
| "learning_rate": 0.0005628410617984828, |
| "loss": 0.4828, |
| "num_input_tokens_seen": 1567248, |
| "step": 5510 |
| }, |
| { |
| "epoch": 10.289179104477611, |
| "grad_norm": 0.4671902060508728, |
| "learning_rate": 0.0005620333810250678, |
| "loss": 0.4126, |
| "num_input_tokens_seen": 1568784, |
| "step": 5515 |
| }, |
| { |
| "epoch": 10.298507462686567, |
| "grad_norm": 0.2536733150482178, |
| "learning_rate": 0.0005612255358180698, |
| "loss": 0.3553, |
| "num_input_tokens_seen": 1570288, |
| "step": 5520 |
| }, |
| { |
| "epoch": 10.307835820895523, |
| "grad_norm": 0.3621693551540375, |
| "learning_rate": 0.0005604175283188658, |
| "loss": 0.4296, |
| "num_input_tokens_seen": 1571632, |
| "step": 5525 |
| }, |
| { |
| "epoch": 10.317164179104477, |
| "grad_norm": 0.42798948287963867, |
| "learning_rate": 0.0005596093606692635, |
| "loss": 0.4056, |
| "num_input_tokens_seen": 1572944, |
| "step": 5530 |
| }, |
| { |
| "epoch": 10.326492537313433, |
| "grad_norm": 0.23219768702983856, |
| "learning_rate": 0.0005588010350114953, |
| "loss": 0.6229, |
| "num_input_tokens_seen": 1574480, |
| "step": 5535 |
| }, |
| { |
| "epoch": 10.335820895522389, |
| "grad_norm": 0.4013402760028839, |
| "learning_rate": 0.0005579925534882117, |
| "loss": 0.4501, |
| "num_input_tokens_seen": 1575952, |
| "step": 5540 |
| }, |
| { |
| "epoch": 10.345149253731343, |
| "grad_norm": 0.39546915888786316, |
| "learning_rate": 0.0005571839182424775, |
| "loss": 0.4323, |
| "num_input_tokens_seen": 1577360, |
| "step": 5545 |
| }, |
| { |
| "epoch": 10.354477611940299, |
| "grad_norm": 0.42694559693336487, |
| "learning_rate": 0.0005563751314177638, |
| "loss": 0.5057, |
| "num_input_tokens_seen": 1578864, |
| "step": 5550 |
| }, |
| { |
| "epoch": 10.363805970149254, |
| "grad_norm": 0.46801120042800903, |
| "learning_rate": 0.0005555661951579442, |
| "loss": 0.4682, |
| "num_input_tokens_seen": 1580176, |
| "step": 5555 |
| }, |
| { |
| "epoch": 10.373134328358208, |
| "grad_norm": 0.5218855142593384, |
| "learning_rate": 0.000554757111607288, |
| "loss": 0.3993, |
| "num_input_tokens_seen": 1581776, |
| "step": 5560 |
| }, |
| { |
| "epoch": 10.382462686567164, |
| "grad_norm": 0.37519174814224243, |
| "learning_rate": 0.0005539478829104555, |
| "loss": 0.261, |
| "num_input_tokens_seen": 1583248, |
| "step": 5565 |
| }, |
| { |
| "epoch": 10.39179104477612, |
| "grad_norm": 0.5104201436042786, |
| "learning_rate": 0.0005531385112124912, |
| "loss": 0.3501, |
| "num_input_tokens_seen": 1584592, |
| "step": 5570 |
| }, |
| { |
| "epoch": 10.401119402985074, |
| "grad_norm": 0.25808826088905334, |
| "learning_rate": 0.0005523289986588188, |
| "loss": 0.295, |
| "num_input_tokens_seen": 1585840, |
| "step": 5575 |
| }, |
| { |
| "epoch": 10.41044776119403, |
| "grad_norm": 0.4015122652053833, |
| "learning_rate": 0.0005515193473952355, |
| "loss": 0.4941, |
| "num_input_tokens_seen": 1587216, |
| "step": 5580 |
| }, |
| { |
| "epoch": 10.419776119402986, |
| "grad_norm": 0.2906988859176636, |
| "learning_rate": 0.0005507095595679059, |
| "loss": 0.5156, |
| "num_input_tokens_seen": 1588560, |
| "step": 5585 |
| }, |
| { |
| "epoch": 10.42910447761194, |
| "grad_norm": 0.31299829483032227, |
| "learning_rate": 0.000549899637323357, |
| "loss": 0.3407, |
| "num_input_tokens_seen": 1590128, |
| "step": 5590 |
| }, |
| { |
| "epoch": 10.438432835820896, |
| "grad_norm": 0.31594422459602356, |
| "learning_rate": 0.0005490895828084717, |
| "loss": 0.528, |
| "num_input_tokens_seen": 1591568, |
| "step": 5595 |
| }, |
| { |
| "epoch": 10.447761194029852, |
| "grad_norm": 0.30748066306114197, |
| "learning_rate": 0.0005482793981704841, |
| "loss": 0.4311, |
| "num_input_tokens_seen": 1592944, |
| "step": 5600 |
| }, |
| { |
| "epoch": 10.457089552238806, |
| "grad_norm": 0.4373854100704193, |
| "learning_rate": 0.0005474690855569724, |
| "loss": 0.5543, |
| "num_input_tokens_seen": 1594416, |
| "step": 5605 |
| }, |
| { |
| "epoch": 10.466417910447761, |
| "grad_norm": 0.318352073431015, |
| "learning_rate": 0.0005466586471158548, |
| "loss": 0.618, |
| "num_input_tokens_seen": 1595824, |
| "step": 5610 |
| }, |
| { |
| "epoch": 10.475746268656717, |
| "grad_norm": 0.3729168772697449, |
| "learning_rate": 0.0005458480849953822, |
| "loss": 0.3934, |
| "num_input_tokens_seen": 1597168, |
| "step": 5615 |
| }, |
| { |
| "epoch": 10.485074626865671, |
| "grad_norm": 0.4165967106819153, |
| "learning_rate": 0.0005450374013441343, |
| "loss": 0.3776, |
| "num_input_tokens_seen": 1598768, |
| "step": 5620 |
| }, |
| { |
| "epoch": 10.494402985074627, |
| "grad_norm": 0.4932622015476227, |
| "learning_rate": 0.0005442265983110123, |
| "loss": 0.3508, |
| "num_input_tokens_seen": 1600592, |
| "step": 5625 |
| }, |
| { |
| "epoch": 10.503731343283581, |
| "grad_norm": 0.3454512059688568, |
| "learning_rate": 0.0005434156780452339, |
| "loss": 0.4834, |
| "num_input_tokens_seen": 1602064, |
| "step": 5630 |
| }, |
| { |
| "epoch": 10.513059701492537, |
| "grad_norm": 0.4346556067466736, |
| "learning_rate": 0.0005426046426963279, |
| "loss": 0.6184, |
| "num_input_tokens_seen": 1603568, |
| "step": 5635 |
| }, |
| { |
| "epoch": 10.522388059701493, |
| "grad_norm": 0.24719765782356262, |
| "learning_rate": 0.0005417934944141277, |
| "loss": 0.4562, |
| "num_input_tokens_seen": 1604912, |
| "step": 5640 |
| }, |
| { |
| "epoch": 10.531716417910447, |
| "grad_norm": 0.3545502722263336, |
| "learning_rate": 0.0005409822353487666, |
| "loss": 0.5075, |
| "num_input_tokens_seen": 1606288, |
| "step": 5645 |
| }, |
| { |
| "epoch": 10.541044776119403, |
| "grad_norm": 0.4384237825870514, |
| "learning_rate": 0.0005401708676506709, |
| "loss": 0.353, |
| "num_input_tokens_seen": 1607728, |
| "step": 5650 |
| }, |
| { |
| "epoch": 10.550373134328359, |
| "grad_norm": 0.22765681147575378, |
| "learning_rate": 0.0005393593934705553, |
| "loss": 0.4498, |
| "num_input_tokens_seen": 1609168, |
| "step": 5655 |
| }, |
| { |
| "epoch": 10.559701492537313, |
| "grad_norm": 0.37284964323043823, |
| "learning_rate": 0.0005385478149594168, |
| "loss": 0.4036, |
| "num_input_tokens_seen": 1610416, |
| "step": 5660 |
| }, |
| { |
| "epoch": 10.569029850746269, |
| "grad_norm": 0.376591295003891, |
| "learning_rate": 0.0005377361342685286, |
| "loss": 0.4248, |
| "num_input_tokens_seen": 1611824, |
| "step": 5665 |
| }, |
| { |
| "epoch": 10.578358208955224, |
| "grad_norm": 0.30000317096710205, |
| "learning_rate": 0.0005369243535494352, |
| "loss": 0.3881, |
| "num_input_tokens_seen": 1613200, |
| "step": 5670 |
| }, |
| { |
| "epoch": 10.587686567164178, |
| "grad_norm": 0.4292525351047516, |
| "learning_rate": 0.0005361124749539457, |
| "loss": 0.4555, |
| "num_input_tokens_seen": 1614480, |
| "step": 5675 |
| }, |
| { |
| "epoch": 10.597014925373134, |
| "grad_norm": 0.3540624678134918, |
| "learning_rate": 0.000535300500634129, |
| "loss": 0.4955, |
| "num_input_tokens_seen": 1615792, |
| "step": 5680 |
| }, |
| { |
| "epoch": 10.60634328358209, |
| "grad_norm": 0.361230731010437, |
| "learning_rate": 0.000534488432742308, |
| "loss": 0.4688, |
| "num_input_tokens_seen": 1617168, |
| "step": 5685 |
| }, |
| { |
| "epoch": 10.615671641791044, |
| "grad_norm": 0.4050374925136566, |
| "learning_rate": 0.0005336762734310529, |
| "loss": 0.325, |
| "num_input_tokens_seen": 1618800, |
| "step": 5690 |
| }, |
| { |
| "epoch": 10.625, |
| "grad_norm": 0.3878132700920105, |
| "learning_rate": 0.000532864024853177, |
| "loss": 0.2777, |
| "num_input_tokens_seen": 1620304, |
| "step": 5695 |
| }, |
| { |
| "epoch": 10.634328358208956, |
| "grad_norm": 0.3716028928756714, |
| "learning_rate": 0.0005320516891617296, |
| "loss": 0.5114, |
| "num_input_tokens_seen": 1621808, |
| "step": 5700 |
| }, |
| { |
| "epoch": 10.64365671641791, |
| "grad_norm": 0.3676183819770813, |
| "learning_rate": 0.0005312392685099914, |
| "loss": 0.3349, |
| "num_input_tokens_seen": 1623248, |
| "step": 5705 |
| }, |
| { |
| "epoch": 10.652985074626866, |
| "grad_norm": 0.4085880517959595, |
| "learning_rate": 0.0005304267650514678, |
| "loss": 0.3084, |
| "num_input_tokens_seen": 1624656, |
| "step": 5710 |
| }, |
| { |
| "epoch": 10.662313432835822, |
| "grad_norm": 0.49978184700012207, |
| "learning_rate": 0.0005296141809398844, |
| "loss": 0.456, |
| "num_input_tokens_seen": 1626032, |
| "step": 5715 |
| }, |
| { |
| "epoch": 10.671641791044776, |
| "grad_norm": 0.21887758374214172, |
| "learning_rate": 0.0005288015183291797, |
| "loss": 0.3885, |
| "num_input_tokens_seen": 1627440, |
| "step": 5720 |
| }, |
| { |
| "epoch": 10.680970149253731, |
| "grad_norm": 0.42853760719299316, |
| "learning_rate": 0.0005279887793735011, |
| "loss": 0.3648, |
| "num_input_tokens_seen": 1628784, |
| "step": 5725 |
| }, |
| { |
| "epoch": 10.690298507462687, |
| "grad_norm": 0.32524973154067993, |
| "learning_rate": 0.0005271759662271978, |
| "loss": 0.4533, |
| "num_input_tokens_seen": 1630032, |
| "step": 5730 |
| }, |
| { |
| "epoch": 10.699626865671641, |
| "grad_norm": 0.3890216052532196, |
| "learning_rate": 0.0005263630810448161, |
| "loss": 0.307, |
| "num_input_tokens_seen": 1631504, |
| "step": 5735 |
| }, |
| { |
| "epoch": 10.708955223880597, |
| "grad_norm": 0.37297531962394714, |
| "learning_rate": 0.0005255501259810929, |
| "loss": 0.3934, |
| "num_input_tokens_seen": 1632944, |
| "step": 5740 |
| }, |
| { |
| "epoch": 10.718283582089553, |
| "grad_norm": 0.28741270303726196, |
| "learning_rate": 0.0005247371031909505, |
| "loss": 0.2608, |
| "num_input_tokens_seen": 1634480, |
| "step": 5745 |
| }, |
| { |
| "epoch": 10.727611940298507, |
| "grad_norm": 0.23218028247356415, |
| "learning_rate": 0.0005239240148294907, |
| "loss": 0.4041, |
| "num_input_tokens_seen": 1636048, |
| "step": 5750 |
| }, |
| { |
| "epoch": 10.736940298507463, |
| "grad_norm": 0.47181567549705505, |
| "learning_rate": 0.0005231108630519891, |
| "loss": 0.6394, |
| "num_input_tokens_seen": 1637392, |
| "step": 5755 |
| }, |
| { |
| "epoch": 10.746268656716419, |
| "grad_norm": 0.4819324016571045, |
| "learning_rate": 0.0005222976500138894, |
| "loss": 0.4447, |
| "num_input_tokens_seen": 1638800, |
| "step": 5760 |
| }, |
| { |
| "epoch": 10.755597014925373, |
| "grad_norm": 0.3169451355934143, |
| "learning_rate": 0.0005214843778707977, |
| "loss": 0.4645, |
| "num_input_tokens_seen": 1640112, |
| "step": 5765 |
| }, |
| { |
| "epoch": 10.764925373134329, |
| "grad_norm": 0.2931522727012634, |
| "learning_rate": 0.0005206710487784767, |
| "loss": 0.4478, |
| "num_input_tokens_seen": 1641488, |
| "step": 5770 |
| }, |
| { |
| "epoch": 10.774253731343283, |
| "grad_norm": 0.4820563793182373, |
| "learning_rate": 0.0005198576648928402, |
| "loss": 0.3516, |
| "num_input_tokens_seen": 1642832, |
| "step": 5775 |
| }, |
| { |
| "epoch": 10.783582089552239, |
| "grad_norm": 0.3501931428909302, |
| "learning_rate": 0.0005190442283699472, |
| "loss": 0.2941, |
| "num_input_tokens_seen": 1644368, |
| "step": 5780 |
| }, |
| { |
| "epoch": 10.792910447761194, |
| "grad_norm": 0.32435059547424316, |
| "learning_rate": 0.000518230741365996, |
| "loss": 0.4281, |
| "num_input_tokens_seen": 1645840, |
| "step": 5785 |
| }, |
| { |
| "epoch": 10.802238805970148, |
| "grad_norm": 0.39015549421310425, |
| "learning_rate": 0.0005174172060373189, |
| "loss": 0.3721, |
| "num_input_tokens_seen": 1647408, |
| "step": 5790 |
| }, |
| { |
| "epoch": 10.811567164179104, |
| "grad_norm": 0.4212459325790405, |
| "learning_rate": 0.0005166036245403767, |
| "loss": 0.7593, |
| "num_input_tokens_seen": 1648688, |
| "step": 5795 |
| }, |
| { |
| "epoch": 10.82089552238806, |
| "grad_norm": 0.42889150977134705, |
| "learning_rate": 0.0005157899990317515, |
| "loss": 0.5395, |
| "num_input_tokens_seen": 1650224, |
| "step": 5800 |
| }, |
| { |
| "epoch": 10.830223880597014, |
| "grad_norm": 0.2492639124393463, |
| "learning_rate": 0.0005149763316681433, |
| "loss": 0.3117, |
| "num_input_tokens_seen": 1651696, |
| "step": 5805 |
| }, |
| { |
| "epoch": 10.83955223880597, |
| "grad_norm": 0.389493465423584, |
| "learning_rate": 0.0005141626246063622, |
| "loss": 0.4391, |
| "num_input_tokens_seen": 1653200, |
| "step": 5810 |
| }, |
| { |
| "epoch": 10.848880597014926, |
| "grad_norm": 0.1207047626376152, |
| "learning_rate": 0.0005133488800033241, |
| "loss": 0.2615, |
| "num_input_tokens_seen": 1654896, |
| "step": 5815 |
| }, |
| { |
| "epoch": 10.85820895522388, |
| "grad_norm": 0.36992502212524414, |
| "learning_rate": 0.0005125351000160438, |
| "loss": 0.367, |
| "num_input_tokens_seen": 1656240, |
| "step": 5820 |
| }, |
| { |
| "epoch": 10.867537313432836, |
| "grad_norm": 0.31403908133506775, |
| "learning_rate": 0.0005117212868016303, |
| "loss": 0.3378, |
| "num_input_tokens_seen": 1657744, |
| "step": 5825 |
| }, |
| { |
| "epoch": 10.876865671641792, |
| "grad_norm": 0.28204184770584106, |
| "learning_rate": 0.0005109074425172806, |
| "loss": 0.3401, |
| "num_input_tokens_seen": 1658960, |
| "step": 5830 |
| }, |
| { |
| "epoch": 10.886194029850746, |
| "grad_norm": 0.5115631222724915, |
| "learning_rate": 0.0005100935693202741, |
| "loss": 0.4407, |
| "num_input_tokens_seen": 1660240, |
| "step": 5835 |
| }, |
| { |
| "epoch": 10.895522388059701, |
| "grad_norm": 0.45001569390296936, |
| "learning_rate": 0.0005092796693679667, |
| "loss": 0.4221, |
| "num_input_tokens_seen": 1661680, |
| "step": 5840 |
| }, |
| { |
| "epoch": 10.904850746268657, |
| "grad_norm": 0.44775334000587463, |
| "learning_rate": 0.0005084657448177855, |
| "loss": 0.4631, |
| "num_input_tokens_seen": 1663056, |
| "step": 5845 |
| }, |
| { |
| "epoch": 10.914179104477611, |
| "grad_norm": 0.18757286667823792, |
| "learning_rate": 0.0005076517978272225, |
| "loss": 0.389, |
| "num_input_tokens_seen": 1664400, |
| "step": 5850 |
| }, |
| { |
| "epoch": 10.923507462686567, |
| "grad_norm": 0.3951946794986725, |
| "learning_rate": 0.0005068378305538292, |
| "loss": 0.3844, |
| "num_input_tokens_seen": 1665968, |
| "step": 5855 |
| }, |
| { |
| "epoch": 10.932835820895523, |
| "grad_norm": 0.23343099653720856, |
| "learning_rate": 0.0005060238451552111, |
| "loss": 0.4433, |
| "num_input_tokens_seen": 1667280, |
| "step": 5860 |
| }, |
| { |
| "epoch": 10.942164179104477, |
| "grad_norm": 0.3723389804363251, |
| "learning_rate": 0.0005052098437890215, |
| "loss": 0.547, |
| "num_input_tokens_seen": 1669008, |
| "step": 5865 |
| }, |
| { |
| "epoch": 10.951492537313433, |
| "grad_norm": 0.37627390027046204, |
| "learning_rate": 0.0005043958286129562, |
| "loss": 0.3503, |
| "num_input_tokens_seen": 1670512, |
| "step": 5870 |
| }, |
| { |
| "epoch": 10.960820895522389, |
| "grad_norm": 0.21539832651615143, |
| "learning_rate": 0.0005035818017847476, |
| "loss": 0.2571, |
| "num_input_tokens_seen": 1672080, |
| "step": 5875 |
| }, |
| { |
| "epoch": 10.970149253731343, |
| "grad_norm": 0.38906389474868774, |
| "learning_rate": 0.0005027677654621586, |
| "loss": 0.3463, |
| "num_input_tokens_seen": 1673552, |
| "step": 5880 |
| }, |
| { |
| "epoch": 10.979477611940299, |
| "grad_norm": 0.5205991268157959, |
| "learning_rate": 0.000501953721802978, |
| "loss": 0.3297, |
| "num_input_tokens_seen": 1674896, |
| "step": 5885 |
| }, |
| { |
| "epoch": 10.988805970149254, |
| "grad_norm": 0.2763025164604187, |
| "learning_rate": 0.0005011396729650135, |
| "loss": 0.4295, |
| "num_input_tokens_seen": 1676272, |
| "step": 5890 |
| }, |
| { |
| "epoch": 10.998134328358208, |
| "grad_norm": 0.32854244112968445, |
| "learning_rate": 0.0005003256211060866, |
| "loss": 0.5869, |
| "num_input_tokens_seen": 1677616, |
| "step": 5895 |
| }, |
| { |
| "epoch": 11.0, |
| "eval_loss": 0.7297408580780029, |
| "eval_runtime": 4.1905, |
| "eval_samples_per_second": 56.795, |
| "eval_steps_per_second": 14.318, |
| "num_input_tokens_seen": 1677680, |
| "step": 5896 |
| }, |
| { |
| "epoch": 11.007462686567164, |
| "grad_norm": 0.2975498139858246, |
| "learning_rate": 0.0004995115683840269, |
| "loss": 0.2877, |
| "num_input_tokens_seen": 1678960, |
| "step": 5900 |
| }, |
| { |
| "epoch": 11.01679104477612, |
| "grad_norm": 0.3614169955253601, |
| "learning_rate": 0.0004986975169566662, |
| "loss": 0.4449, |
| "num_input_tokens_seen": 1680496, |
| "step": 5905 |
| }, |
| { |
| "epoch": 11.026119402985074, |
| "grad_norm": 0.31577494740486145, |
| "learning_rate": 0.0004978834689818331, |
| "loss": 0.2121, |
| "num_input_tokens_seen": 1681744, |
| "step": 5910 |
| }, |
| { |
| "epoch": 11.03544776119403, |
| "grad_norm": 0.2219756692647934, |
| "learning_rate": 0.0004970694266173466, |
| "loss": 0.3117, |
| "num_input_tokens_seen": 1683248, |
| "step": 5915 |
| }, |
| { |
| "epoch": 11.044776119402986, |
| "grad_norm": 0.4874166250228882, |
| "learning_rate": 0.0004962553920210117, |
| "loss": 0.492, |
| "num_input_tokens_seen": 1684528, |
| "step": 5920 |
| }, |
| { |
| "epoch": 11.05410447761194, |
| "grad_norm": 0.4654216468334198, |
| "learning_rate": 0.0004954413673506114, |
| "loss": 0.299, |
| "num_input_tokens_seen": 1686064, |
| "step": 5925 |
| }, |
| { |
| "epoch": 11.063432835820896, |
| "grad_norm": 0.311125785112381, |
| "learning_rate": 0.0004946273547639039, |
| "loss": 0.2558, |
| "num_input_tokens_seen": 1687440, |
| "step": 5930 |
| }, |
| { |
| "epoch": 11.072761194029852, |
| "grad_norm": 0.26367080211639404, |
| "learning_rate": 0.0004938133564186141, |
| "loss": 0.3663, |
| "num_input_tokens_seen": 1688976, |
| "step": 5935 |
| }, |
| { |
| "epoch": 11.082089552238806, |
| "grad_norm": 0.5018680691719055, |
| "learning_rate": 0.00049299937447243, |
| "loss": 0.2736, |
| "num_input_tokens_seen": 1690512, |
| "step": 5940 |
| }, |
| { |
| "epoch": 11.091417910447761, |
| "grad_norm": 0.5152081847190857, |
| "learning_rate": 0.0004921854110829962, |
| "loss": 0.4058, |
| "num_input_tokens_seen": 1691920, |
| "step": 5945 |
| }, |
| { |
| "epoch": 11.100746268656716, |
| "grad_norm": 0.5324569344520569, |
| "learning_rate": 0.0004913714684079071, |
| "loss": 0.4961, |
| "num_input_tokens_seen": 1693392, |
| "step": 5950 |
| }, |
| { |
| "epoch": 11.110074626865671, |
| "grad_norm": 0.48775115609169006, |
| "learning_rate": 0.0004905575486047034, |
| "loss": 0.4429, |
| "num_input_tokens_seen": 1694768, |
| "step": 5955 |
| }, |
| { |
| "epoch": 11.119402985074627, |
| "grad_norm": 0.4510795474052429, |
| "learning_rate": 0.0004897436538308641, |
| "loss": 0.3833, |
| "num_input_tokens_seen": 1696272, |
| "step": 5960 |
| }, |
| { |
| "epoch": 11.128731343283581, |
| "grad_norm": 0.4744063913822174, |
| "learning_rate": 0.0004889297862438028, |
| "loss": 0.4359, |
| "num_input_tokens_seen": 1697584, |
| "step": 5965 |
| }, |
| { |
| "epoch": 11.138059701492537, |
| "grad_norm": 0.357310026884079, |
| "learning_rate": 0.00048811594800086066, |
| "loss": 0.266, |
| "num_input_tokens_seen": 1699216, |
| "step": 5970 |
| }, |
| { |
| "epoch": 11.147388059701493, |
| "grad_norm": 0.39351770281791687, |
| "learning_rate": 0.00048730214125930076, |
| "loss": 0.286, |
| "num_input_tokens_seen": 1700432, |
| "step": 5975 |
| }, |
| { |
| "epoch": 11.156716417910447, |
| "grad_norm": 0.3268272876739502, |
| "learning_rate": 0.0004864883681763032, |
| "loss": 0.4033, |
| "num_input_tokens_seen": 1701840, |
| "step": 5980 |
| }, |
| { |
| "epoch": 11.166044776119403, |
| "grad_norm": 0.31125694513320923, |
| "learning_rate": 0.0004856746309089582, |
| "loss": 0.3183, |
| "num_input_tokens_seen": 1703216, |
| "step": 5985 |
| }, |
| { |
| "epoch": 11.175373134328359, |
| "grad_norm": 0.5361759662628174, |
| "learning_rate": 0.0004848609316142618, |
| "loss": 0.4384, |
| "num_input_tokens_seen": 1704496, |
| "step": 5990 |
| }, |
| { |
| "epoch": 11.184701492537313, |
| "grad_norm": 0.3227105736732483, |
| "learning_rate": 0.00048404727244910883, |
| "loss": 0.3747, |
| "num_input_tokens_seen": 1705808, |
| "step": 5995 |
| }, |
| { |
| "epoch": 11.194029850746269, |
| "grad_norm": 0.5162398815155029, |
| "learning_rate": 0.000483233655570288, |
| "loss": 0.4446, |
| "num_input_tokens_seen": 1707280, |
| "step": 6000 |
| }, |
| { |
| "epoch": 11.203358208955224, |
| "grad_norm": 0.2446233034133911, |
| "learning_rate": 0.000482420083134476, |
| "loss": 0.357, |
| "num_input_tokens_seen": 1708848, |
| "step": 6005 |
| }, |
| { |
| "epoch": 11.212686567164178, |
| "grad_norm": 0.5056248307228088, |
| "learning_rate": 0.0004816065572982313, |
| "loss": 0.4111, |
| "num_input_tokens_seen": 1710288, |
| "step": 6010 |
| }, |
| { |
| "epoch": 11.222014925373134, |
| "grad_norm": 0.5253577828407288, |
| "learning_rate": 0.0004807930802179894, |
| "loss": 0.4007, |
| "num_input_tokens_seen": 1711600, |
| "step": 6015 |
| }, |
| { |
| "epoch": 11.23134328358209, |
| "grad_norm": 0.4653439521789551, |
| "learning_rate": 0.0004799796540500561, |
| "loss": 0.356, |
| "num_input_tokens_seen": 1712912, |
| "step": 6020 |
| }, |
| { |
| "epoch": 11.240671641791044, |
| "grad_norm": 0.3523794412612915, |
| "learning_rate": 0.0004791662809506025, |
| "loss": 0.2683, |
| "num_input_tokens_seen": 1714256, |
| "step": 6025 |
| }, |
| { |
| "epoch": 11.25, |
| "grad_norm": 0.5544191002845764, |
| "learning_rate": 0.00047835296307565913, |
| "loss": 0.7138, |
| "num_input_tokens_seen": 1715664, |
| "step": 6030 |
| }, |
| { |
| "epoch": 11.259328358208956, |
| "grad_norm": 0.42940109968185425, |
| "learning_rate": 0.0004775397025811097, |
| "loss": 0.5004, |
| "num_input_tokens_seen": 1717040, |
| "step": 6035 |
| }, |
| { |
| "epoch": 11.26865671641791, |
| "grad_norm": 0.48940351605415344, |
| "learning_rate": 0.0004767265016226863, |
| "loss": 0.6169, |
| "num_input_tokens_seen": 1718480, |
| "step": 6040 |
| }, |
| { |
| "epoch": 11.277985074626866, |
| "grad_norm": 0.4342253804206848, |
| "learning_rate": 0.0004759133623559628, |
| "loss": 0.4248, |
| "num_input_tokens_seen": 1719728, |
| "step": 6045 |
| }, |
| { |
| "epoch": 11.287313432835822, |
| "grad_norm": 0.30282506346702576, |
| "learning_rate": 0.00047510028693634995, |
| "loss": 0.4106, |
| "num_input_tokens_seen": 1721264, |
| "step": 6050 |
| }, |
| { |
| "epoch": 11.296641791044776, |
| "grad_norm": 0.3388311564922333, |
| "learning_rate": 0.0004742872775190889, |
| "loss": 0.552, |
| "num_input_tokens_seen": 1722896, |
| "step": 6055 |
| }, |
| { |
| "epoch": 11.305970149253731, |
| "grad_norm": 0.4148770868778229, |
| "learning_rate": 0.000473474336259246, |
| "loss": 0.306, |
| "num_input_tokens_seen": 1724240, |
| "step": 6060 |
| }, |
| { |
| "epoch": 11.315298507462687, |
| "grad_norm": 0.19478370249271393, |
| "learning_rate": 0.0004726614653117071, |
| "loss": 0.3752, |
| "num_input_tokens_seen": 1725680, |
| "step": 6065 |
| }, |
| { |
| "epoch": 11.324626865671641, |
| "grad_norm": 0.4652278423309326, |
| "learning_rate": 0.00047184866683117125, |
| "loss": 0.4032, |
| "num_input_tokens_seen": 1727056, |
| "step": 6070 |
| }, |
| { |
| "epoch": 11.333955223880597, |
| "grad_norm": 0.4905243217945099, |
| "learning_rate": 0.00047103594297214597, |
| "loss": 0.4775, |
| "num_input_tokens_seen": 1728496, |
| "step": 6075 |
| }, |
| { |
| "epoch": 11.343283582089553, |
| "grad_norm": 0.38213881850242615, |
| "learning_rate": 0.00047022329588894033, |
| "loss": 0.3003, |
| "num_input_tokens_seen": 1729936, |
| "step": 6080 |
| }, |
| { |
| "epoch": 11.352611940298507, |
| "grad_norm": 0.45340460538864136, |
| "learning_rate": 0.0004694107277356604, |
| "loss": 0.2487, |
| "num_input_tokens_seen": 1731536, |
| "step": 6085 |
| }, |
| { |
| "epoch": 11.361940298507463, |
| "grad_norm": 0.385991632938385, |
| "learning_rate": 0.00046859824066620287, |
| "loss": 0.3439, |
| "num_input_tokens_seen": 1733072, |
| "step": 6090 |
| }, |
| { |
| "epoch": 11.371268656716419, |
| "grad_norm": 0.36053866147994995, |
| "learning_rate": 0.00046778583683424943, |
| "loss": 0.3811, |
| "num_input_tokens_seen": 1734640, |
| "step": 6095 |
| }, |
| { |
| "epoch": 11.380597014925373, |
| "grad_norm": 0.4456652104854584, |
| "learning_rate": 0.0004669735183932613, |
| "loss": 0.3588, |
| "num_input_tokens_seen": 1735984, |
| "step": 6100 |
| }, |
| { |
| "epoch": 11.389925373134329, |
| "grad_norm": 0.21310646831989288, |
| "learning_rate": 0.00046616128749647296, |
| "loss": 0.2416, |
| "num_input_tokens_seen": 1737488, |
| "step": 6105 |
| }, |
| { |
| "epoch": 11.399253731343283, |
| "grad_norm": 0.36799484491348267, |
| "learning_rate": 0.00046534914629688747, |
| "loss": 0.5418, |
| "num_input_tokens_seen": 1738960, |
| "step": 6110 |
| }, |
| { |
| "epoch": 11.408582089552239, |
| "grad_norm": 0.30316370725631714, |
| "learning_rate": 0.00046453709694726944, |
| "loss": 0.3266, |
| "num_input_tokens_seen": 1740464, |
| "step": 6115 |
| }, |
| { |
| "epoch": 11.417910447761194, |
| "grad_norm": 0.3485303521156311, |
| "learning_rate": 0.00046372514160014037, |
| "loss": 0.3988, |
| "num_input_tokens_seen": 1741904, |
| "step": 6120 |
| }, |
| { |
| "epoch": 11.427238805970148, |
| "grad_norm": 0.41022706031799316, |
| "learning_rate": 0.00046291328240777297, |
| "loss": 0.4461, |
| "num_input_tokens_seen": 1743216, |
| "step": 6125 |
| }, |
| { |
| "epoch": 11.436567164179104, |
| "grad_norm": 0.4416004419326782, |
| "learning_rate": 0.00046210152152218397, |
| "loss": 0.2416, |
| "num_input_tokens_seen": 1744816, |
| "step": 6130 |
| }, |
| { |
| "epoch": 11.44589552238806, |
| "grad_norm": 0.541028618812561, |
| "learning_rate": 0.000461289861095131, |
| "loss": 0.4478, |
| "num_input_tokens_seen": 1746064, |
| "step": 6135 |
| }, |
| { |
| "epoch": 11.455223880597014, |
| "grad_norm": 0.2696289122104645, |
| "learning_rate": 0.0004604783032781039, |
| "loss": 0.3888, |
| "num_input_tokens_seen": 1747728, |
| "step": 6140 |
| }, |
| { |
| "epoch": 11.46455223880597, |
| "grad_norm": 0.2709604799747467, |
| "learning_rate": 0.00045966685022232143, |
| "loss": 0.4124, |
| "num_input_tokens_seen": 1749104, |
| "step": 6145 |
| }, |
| { |
| "epoch": 11.473880597014926, |
| "grad_norm": 0.3468097448348999, |
| "learning_rate": 0.00045885550407872476, |
| "loss": 0.3091, |
| "num_input_tokens_seen": 1750416, |
| "step": 6150 |
| }, |
| { |
| "epoch": 11.48320895522388, |
| "grad_norm": 0.38475626707077026, |
| "learning_rate": 0.0004580442669979708, |
| "loss": 0.4098, |
| "num_input_tokens_seen": 1751952, |
| "step": 6155 |
| }, |
| { |
| "epoch": 11.492537313432836, |
| "grad_norm": 0.4953322112560272, |
| "learning_rate": 0.00045723314113042856, |
| "loss": 0.4645, |
| "num_input_tokens_seen": 1753328, |
| "step": 6160 |
| }, |
| { |
| "epoch": 11.501865671641792, |
| "grad_norm": 0.6726153492927551, |
| "learning_rate": 0.00045642212862617086, |
| "loss": 0.5969, |
| "num_input_tokens_seen": 1754672, |
| "step": 6165 |
| }, |
| { |
| "epoch": 11.511194029850746, |
| "grad_norm": 0.33583176136016846, |
| "learning_rate": 0.0004556112316349716, |
| "loss": 0.4025, |
| "num_input_tokens_seen": 1756080, |
| "step": 6170 |
| }, |
| { |
| "epoch": 11.520522388059701, |
| "grad_norm": 0.29343482851982117, |
| "learning_rate": 0.0004548004523062968, |
| "loss": 0.2802, |
| "num_input_tokens_seen": 1757456, |
| "step": 6175 |
| }, |
| { |
| "epoch": 11.529850746268657, |
| "grad_norm": 0.2839649021625519, |
| "learning_rate": 0.000453989792789302, |
| "loss": 0.4035, |
| "num_input_tokens_seen": 1758960, |
| "step": 6180 |
| }, |
| { |
| "epoch": 11.539179104477611, |
| "grad_norm": 0.4980579614639282, |
| "learning_rate": 0.0004531792552328247, |
| "loss": 0.3924, |
| "num_input_tokens_seen": 1760368, |
| "step": 6185 |
| }, |
| { |
| "epoch": 11.548507462686567, |
| "grad_norm": 0.43809887766838074, |
| "learning_rate": 0.0004523688417853785, |
| "loss": 0.3805, |
| "num_input_tokens_seen": 1761744, |
| "step": 6190 |
| }, |
| { |
| "epoch": 11.557835820895523, |
| "grad_norm": 0.16008871793746948, |
| "learning_rate": 0.00045155855459514917, |
| "loss": 0.2367, |
| "num_input_tokens_seen": 1763472, |
| "step": 6195 |
| }, |
| { |
| "epoch": 11.567164179104477, |
| "grad_norm": 0.406055748462677, |
| "learning_rate": 0.00045074839580998646, |
| "loss": 0.5474, |
| "num_input_tokens_seen": 1764848, |
| "step": 6200 |
| }, |
| { |
| "epoch": 11.576492537313433, |
| "grad_norm": 0.5940313935279846, |
| "learning_rate": 0.00044993836757740096, |
| "loss": 0.4387, |
| "num_input_tokens_seen": 1766160, |
| "step": 6205 |
| }, |
| { |
| "epoch": 11.585820895522389, |
| "grad_norm": 0.5240064859390259, |
| "learning_rate": 0.0004491284720445567, |
| "loss": 0.2951, |
| "num_input_tokens_seen": 1767664, |
| "step": 6210 |
| }, |
| { |
| "epoch": 11.595149253731343, |
| "grad_norm": 0.4598220884799957, |
| "learning_rate": 0.00044831871135826576, |
| "loss": 0.4724, |
| "num_input_tokens_seen": 1769040, |
| "step": 6215 |
| }, |
| { |
| "epoch": 11.604477611940299, |
| "grad_norm": 0.40932193398475647, |
| "learning_rate": 0.0004475090876649831, |
| "loss": 0.3878, |
| "num_input_tokens_seen": 1770480, |
| "step": 6220 |
| }, |
| { |
| "epoch": 11.613805970149254, |
| "grad_norm": 0.2749176323413849, |
| "learning_rate": 0.0004466996031108004, |
| "loss": 0.2885, |
| "num_input_tokens_seen": 1771952, |
| "step": 6225 |
| }, |
| { |
| "epoch": 11.623134328358208, |
| "grad_norm": 0.6056463718414307, |
| "learning_rate": 0.00044589025984144063, |
| "loss": 0.2995, |
| "num_input_tokens_seen": 1773392, |
| "step": 6230 |
| }, |
| { |
| "epoch": 11.632462686567164, |
| "grad_norm": 0.3814938962459564, |
| "learning_rate": 0.0004450810600022519, |
| "loss": 0.4098, |
| "num_input_tokens_seen": 1774800, |
| "step": 6235 |
| }, |
| { |
| "epoch": 11.64179104477612, |
| "grad_norm": 0.41231343150138855, |
| "learning_rate": 0.0004442720057382027, |
| "loss": 0.5592, |
| "num_input_tokens_seen": 1776176, |
| "step": 6240 |
| }, |
| { |
| "epoch": 11.651119402985074, |
| "grad_norm": 0.25101757049560547, |
| "learning_rate": 0.0004434630991938754, |
| "loss": 0.2176, |
| "num_input_tokens_seen": 1778000, |
| "step": 6245 |
| }, |
| { |
| "epoch": 11.66044776119403, |
| "grad_norm": 0.4763393998146057, |
| "learning_rate": 0.0004426543425134604, |
| "loss": 0.2743, |
| "num_input_tokens_seen": 1779536, |
| "step": 6250 |
| }, |
| { |
| "epoch": 11.669776119402986, |
| "grad_norm": 0.3026311695575714, |
| "learning_rate": 0.0004418457378407516, |
| "loss": 0.3232, |
| "num_input_tokens_seen": 1780912, |
| "step": 6255 |
| }, |
| { |
| "epoch": 11.67910447761194, |
| "grad_norm": 0.5905582904815674, |
| "learning_rate": 0.00044103728731913916, |
| "loss": 0.3738, |
| "num_input_tokens_seen": 1782256, |
| "step": 6260 |
| }, |
| { |
| "epoch": 11.688432835820896, |
| "grad_norm": 0.2815220355987549, |
| "learning_rate": 0.0004402289930916053, |
| "loss": 0.3236, |
| "num_input_tokens_seen": 1783696, |
| "step": 6265 |
| }, |
| { |
| "epoch": 11.697761194029852, |
| "grad_norm": 0.5030792355537415, |
| "learning_rate": 0.0004394208573007177, |
| "loss": 0.5156, |
| "num_input_tokens_seen": 1785040, |
| "step": 6270 |
| }, |
| { |
| "epoch": 11.707089552238806, |
| "grad_norm": 0.4295044541358948, |
| "learning_rate": 0.00043861288208862394, |
| "loss": 0.3153, |
| "num_input_tokens_seen": 1786544, |
| "step": 6275 |
| }, |
| { |
| "epoch": 11.716417910447761, |
| "grad_norm": 0.49271804094314575, |
| "learning_rate": 0.00043780506959704616, |
| "loss": 0.4076, |
| "num_input_tokens_seen": 1788080, |
| "step": 6280 |
| }, |
| { |
| "epoch": 11.725746268656717, |
| "grad_norm": 0.3802368640899658, |
| "learning_rate": 0.0004369974219672748, |
| "loss": 0.4979, |
| "num_input_tokens_seen": 1789424, |
| "step": 6285 |
| }, |
| { |
| "epoch": 11.735074626865671, |
| "grad_norm": 0.4424251317977905, |
| "learning_rate": 0.000436189941340164, |
| "loss": 0.4163, |
| "num_input_tokens_seen": 1790992, |
| "step": 6290 |
| }, |
| { |
| "epoch": 11.744402985074627, |
| "grad_norm": 0.28739726543426514, |
| "learning_rate": 0.00043538262985612445, |
| "loss": 0.5469, |
| "num_input_tokens_seen": 1792464, |
| "step": 6295 |
| }, |
| { |
| "epoch": 11.753731343283581, |
| "grad_norm": 0.40592968463897705, |
| "learning_rate": 0.00043457548965511884, |
| "loss": 0.4155, |
| "num_input_tokens_seen": 1793872, |
| "step": 6300 |
| }, |
| { |
| "epoch": 11.763059701492537, |
| "grad_norm": 0.4930996000766754, |
| "learning_rate": 0.0004337685228766561, |
| "loss": 0.3816, |
| "num_input_tokens_seen": 1795344, |
| "step": 6305 |
| }, |
| { |
| "epoch": 11.772388059701493, |
| "grad_norm": 0.34781700372695923, |
| "learning_rate": 0.0004329617316597849, |
| "loss": 0.2512, |
| "num_input_tokens_seen": 1796848, |
| "step": 6310 |
| }, |
| { |
| "epoch": 11.781716417910447, |
| "grad_norm": 0.5414386987686157, |
| "learning_rate": 0.000432155118143089, |
| "loss": 0.4204, |
| "num_input_tokens_seen": 1798192, |
| "step": 6315 |
| }, |
| { |
| "epoch": 11.791044776119403, |
| "grad_norm": 0.41075921058654785, |
| "learning_rate": 0.0004313486844646808, |
| "loss": 0.5099, |
| "num_input_tokens_seen": 1799664, |
| "step": 6320 |
| }, |
| { |
| "epoch": 11.800373134328359, |
| "grad_norm": 0.40663543343544006, |
| "learning_rate": 0.0004305424327621962, |
| "loss": 0.4131, |
| "num_input_tokens_seen": 1801072, |
| "step": 6325 |
| }, |
| { |
| "epoch": 11.809701492537313, |
| "grad_norm": 0.47907477617263794, |
| "learning_rate": 0.00042973636517278893, |
| "loss": 0.4387, |
| "num_input_tokens_seen": 1802640, |
| "step": 6330 |
| }, |
| { |
| "epoch": 11.819029850746269, |
| "grad_norm": 0.510122537612915, |
| "learning_rate": 0.0004289304838331241, |
| "loss": 0.4236, |
| "num_input_tokens_seen": 1804048, |
| "step": 6335 |
| }, |
| { |
| "epoch": 11.828358208955224, |
| "grad_norm": 0.42072373628616333, |
| "learning_rate": 0.0004281247908793737, |
| "loss": 0.2858, |
| "num_input_tokens_seen": 1805296, |
| "step": 6340 |
| }, |
| { |
| "epoch": 11.837686567164178, |
| "grad_norm": 0.509661078453064, |
| "learning_rate": 0.0004273192884472099, |
| "loss": 0.5232, |
| "num_input_tokens_seen": 1806640, |
| "step": 6345 |
| }, |
| { |
| "epoch": 11.847014925373134, |
| "grad_norm": 0.24327515065670013, |
| "learning_rate": 0.0004265139786718004, |
| "loss": 0.2774, |
| "num_input_tokens_seen": 1808112, |
| "step": 6350 |
| }, |
| { |
| "epoch": 11.85634328358209, |
| "grad_norm": 0.38800907135009766, |
| "learning_rate": 0.0004257088636878015, |
| "loss": 0.2649, |
| "num_input_tokens_seen": 1809360, |
| "step": 6355 |
| }, |
| { |
| "epoch": 11.865671641791044, |
| "grad_norm": 0.3098767399787903, |
| "learning_rate": 0.0004249039456293537, |
| "loss": 0.3889, |
| "num_input_tokens_seen": 1810832, |
| "step": 6360 |
| }, |
| { |
| "epoch": 11.875, |
| "grad_norm": 0.4385940730571747, |
| "learning_rate": 0.0004240992266300757, |
| "loss": 0.3961, |
| "num_input_tokens_seen": 1812240, |
| "step": 6365 |
| }, |
| { |
| "epoch": 11.884328358208956, |
| "grad_norm": 0.30191636085510254, |
| "learning_rate": 0.00042329470882305765, |
| "loss": 0.2755, |
| "num_input_tokens_seen": 1813584, |
| "step": 6370 |
| }, |
| { |
| "epoch": 11.89365671641791, |
| "grad_norm": 0.4465119540691376, |
| "learning_rate": 0.00042249039434085747, |
| "loss": 0.7074, |
| "num_input_tokens_seen": 1815024, |
| "step": 6375 |
| }, |
| { |
| "epoch": 11.902985074626866, |
| "grad_norm": 0.7072200179100037, |
| "learning_rate": 0.0004216862853154932, |
| "loss": 0.52, |
| "num_input_tokens_seen": 1816400, |
| "step": 6380 |
| }, |
| { |
| "epoch": 11.912313432835822, |
| "grad_norm": 0.360170841217041, |
| "learning_rate": 0.0004208823838784386, |
| "loss": 0.4577, |
| "num_input_tokens_seen": 1817840, |
| "step": 6385 |
| }, |
| { |
| "epoch": 11.921641791044776, |
| "grad_norm": 0.37331485748291016, |
| "learning_rate": 0.0004200786921606179, |
| "loss": 0.4316, |
| "num_input_tokens_seen": 1819152, |
| "step": 6390 |
| }, |
| { |
| "epoch": 11.930970149253731, |
| "grad_norm": 0.5474173426628113, |
| "learning_rate": 0.00041927521229239795, |
| "loss": 0.5153, |
| "num_input_tokens_seen": 1820592, |
| "step": 6395 |
| }, |
| { |
| "epoch": 11.940298507462687, |
| "grad_norm": 0.38393107056617737, |
| "learning_rate": 0.0004184719464035856, |
| "loss": 0.3978, |
| "num_input_tokens_seen": 1821872, |
| "step": 6400 |
| }, |
| { |
| "epoch": 11.949626865671641, |
| "grad_norm": 0.17455793917179108, |
| "learning_rate": 0.00041766889662341907, |
| "loss": 0.3079, |
| "num_input_tokens_seen": 1823152, |
| "step": 6405 |
| }, |
| { |
| "epoch": 11.958955223880597, |
| "grad_norm": 0.6789186596870422, |
| "learning_rate": 0.000416866065080565, |
| "loss": 0.427, |
| "num_input_tokens_seen": 1824464, |
| "step": 6410 |
| }, |
| { |
| "epoch": 11.968283582089553, |
| "grad_norm": 0.44379931688308716, |
| "learning_rate": 0.0004160634539031105, |
| "loss": 0.3356, |
| "num_input_tokens_seen": 1825872, |
| "step": 6415 |
| }, |
| { |
| "epoch": 11.977611940298507, |
| "grad_norm": 0.37370961904525757, |
| "learning_rate": 0.0004152610652185592, |
| "loss": 0.463, |
| "num_input_tokens_seen": 1827280, |
| "step": 6420 |
| }, |
| { |
| "epoch": 11.986940298507463, |
| "grad_norm": 0.7289624810218811, |
| "learning_rate": 0.000414458901153825, |
| "loss": 0.4434, |
| "num_input_tokens_seen": 1828624, |
| "step": 6425 |
| }, |
| { |
| "epoch": 11.996268656716419, |
| "grad_norm": 0.41602692008018494, |
| "learning_rate": 0.00041365696383522586, |
| "loss": 0.3809, |
| "num_input_tokens_seen": 1829872, |
| "step": 6430 |
| }, |
| { |
| "epoch": 12.0, |
| "eval_loss": 0.7218378782272339, |
| "eval_runtime": 4.2039, |
| "eval_samples_per_second": 56.614, |
| "eval_steps_per_second": 14.272, |
| "num_input_tokens_seen": 1830200, |
| "step": 6432 |
| }, |
| { |
| "epoch": 12.005597014925373, |
| "grad_norm": 0.3600406348705292, |
| "learning_rate": 0.00041285525538847936, |
| "loss": 0.3121, |
| "num_input_tokens_seen": 1831032, |
| "step": 6435 |
| }, |
| { |
| "epoch": 12.014925373134329, |
| "grad_norm": 0.19030925631523132, |
| "learning_rate": 0.0004120537779386954, |
| "loss": 0.3586, |
| "num_input_tokens_seen": 1832472, |
| "step": 6440 |
| }, |
| { |
| "epoch": 12.024253731343284, |
| "grad_norm": 0.4201837182044983, |
| "learning_rate": 0.00041125253361037277, |
| "loss": 0.5458, |
| "num_input_tokens_seen": 1833752, |
| "step": 6445 |
| }, |
| { |
| "epoch": 12.033582089552239, |
| "grad_norm": 0.2953159809112549, |
| "learning_rate": 0.00041045152452739183, |
| "loss": 0.2556, |
| "num_input_tokens_seen": 1835192, |
| "step": 6450 |
| }, |
| { |
| "epoch": 12.042910447761194, |
| "grad_norm": 0.4765187203884125, |
| "learning_rate": 0.00040965075281300893, |
| "loss": 0.2821, |
| "num_input_tokens_seen": 1836632, |
| "step": 6455 |
| }, |
| { |
| "epoch": 12.052238805970148, |
| "grad_norm": 0.44841447472572327, |
| "learning_rate": 0.00040885022058985193, |
| "loss": 0.3791, |
| "num_input_tokens_seen": 1837976, |
| "step": 6460 |
| }, |
| { |
| "epoch": 12.061567164179104, |
| "grad_norm": 0.6252573728561401, |
| "learning_rate": 0.0004080499299799133, |
| "loss": 0.4758, |
| "num_input_tokens_seen": 1839384, |
| "step": 6465 |
| }, |
| { |
| "epoch": 12.07089552238806, |
| "grad_norm": 0.3894113004207611, |
| "learning_rate": 0.0004072498831045455, |
| "loss": 0.2906, |
| "num_input_tokens_seen": 1840696, |
| "step": 6470 |
| }, |
| { |
| "epoch": 12.080223880597014, |
| "grad_norm": 0.4687945544719696, |
| "learning_rate": 0.00040645008208445445, |
| "loss": 0.2959, |
| "num_input_tokens_seen": 1842328, |
| "step": 6475 |
| }, |
| { |
| "epoch": 12.08955223880597, |
| "grad_norm": 0.2961972653865814, |
| "learning_rate": 0.0004056505290396948, |
| "loss": 0.2436, |
| "num_input_tokens_seen": 1843992, |
| "step": 6480 |
| }, |
| { |
| "epoch": 12.098880597014926, |
| "grad_norm": 0.4359433948993683, |
| "learning_rate": 0.00040485122608966377, |
| "loss": 0.2145, |
| "num_input_tokens_seen": 1845464, |
| "step": 6485 |
| }, |
| { |
| "epoch": 12.10820895522388, |
| "grad_norm": 0.38550934195518494, |
| "learning_rate": 0.00040405217535309545, |
| "loss": 0.2177, |
| "num_input_tokens_seen": 1847000, |
| "step": 6490 |
| }, |
| { |
| "epoch": 12.117537313432836, |
| "grad_norm": 0.4485433101654053, |
| "learning_rate": 0.0004032533789480557, |
| "loss": 0.5714, |
| "num_input_tokens_seen": 1848344, |
| "step": 6495 |
| }, |
| { |
| "epoch": 12.126865671641792, |
| "grad_norm": 0.33935850858688354, |
| "learning_rate": 0.00040245483899193594, |
| "loss": 0.2848, |
| "num_input_tokens_seen": 1849784, |
| "step": 6500 |
| }, |
| { |
| "epoch": 12.136194029850746, |
| "grad_norm": 0.38977697491645813, |
| "learning_rate": 0.00040165655760144783, |
| "loss": 0.4378, |
| "num_input_tokens_seen": 1851256, |
| "step": 6505 |
| }, |
| { |
| "epoch": 12.145522388059701, |
| "grad_norm": 0.5800938606262207, |
| "learning_rate": 0.0004008585368926179, |
| "loss": 0.2898, |
| "num_input_tokens_seen": 1852760, |
| "step": 6510 |
| }, |
| { |
| "epoch": 12.154850746268657, |
| "grad_norm": 0.430899977684021, |
| "learning_rate": 0.0004000607789807814, |
| "loss": 0.5703, |
| "num_input_tokens_seen": 1854264, |
| "step": 6515 |
| }, |
| { |
| "epoch": 12.164179104477611, |
| "grad_norm": 0.45637640357017517, |
| "learning_rate": 0.0003992632859805773, |
| "loss": 0.3081, |
| "num_input_tokens_seen": 1855736, |
| "step": 6520 |
| }, |
| { |
| "epoch": 12.173507462686567, |
| "grad_norm": 0.4487372040748596, |
| "learning_rate": 0.0003984660600059418, |
| "loss": 0.3069, |
| "num_input_tokens_seen": 1857112, |
| "step": 6525 |
| }, |
| { |
| "epoch": 12.182835820895523, |
| "grad_norm": 0.24957291781902313, |
| "learning_rate": 0.00039766910317010377, |
| "loss": 0.4203, |
| "num_input_tokens_seen": 1858712, |
| "step": 6530 |
| }, |
| { |
| "epoch": 12.192164179104477, |
| "grad_norm": 0.44331094622612, |
| "learning_rate": 0.0003968724175855788, |
| "loss": 0.2864, |
| "num_input_tokens_seen": 1860280, |
| "step": 6535 |
| }, |
| { |
| "epoch": 12.201492537313433, |
| "grad_norm": 0.37186896800994873, |
| "learning_rate": 0.00039607600536416287, |
| "loss": 0.2347, |
| "num_input_tokens_seen": 1861816, |
| "step": 6540 |
| }, |
| { |
| "epoch": 12.210820895522389, |
| "grad_norm": 0.5465824604034424, |
| "learning_rate": 0.0003952798686169279, |
| "loss": 0.2855, |
| "num_input_tokens_seen": 1863160, |
| "step": 6545 |
| }, |
| { |
| "epoch": 12.220149253731343, |
| "grad_norm": 0.422227144241333, |
| "learning_rate": 0.0003944840094542152, |
| "loss": 0.3494, |
| "num_input_tokens_seen": 1864504, |
| "step": 6550 |
| }, |
| { |
| "epoch": 12.229477611940299, |
| "grad_norm": 0.31042227149009705, |
| "learning_rate": 0.00039368842998563065, |
| "loss": 0.2654, |
| "num_input_tokens_seen": 1866104, |
| "step": 6555 |
| }, |
| { |
| "epoch": 12.238805970149254, |
| "grad_norm": 0.6005743741989136, |
| "learning_rate": 0.0003928931323200384, |
| "loss": 0.3776, |
| "num_input_tokens_seen": 1867480, |
| "step": 6560 |
| }, |
| { |
| "epoch": 12.248134328358208, |
| "grad_norm": 0.39204689860343933, |
| "learning_rate": 0.00039209811856555566, |
| "loss": 0.332, |
| "num_input_tokens_seen": 1869144, |
| "step": 6565 |
| }, |
| { |
| "epoch": 12.257462686567164, |
| "grad_norm": 0.26447024941444397, |
| "learning_rate": 0.0003913033908295477, |
| "loss": 0.326, |
| "num_input_tokens_seen": 1870552, |
| "step": 6570 |
| }, |
| { |
| "epoch": 12.26679104477612, |
| "grad_norm": 0.4324093759059906, |
| "learning_rate": 0.00039050895121862055, |
| "loss": 0.2876, |
| "num_input_tokens_seen": 1872216, |
| "step": 6575 |
| }, |
| { |
| "epoch": 12.276119402985074, |
| "grad_norm": 0.5774677395820618, |
| "learning_rate": 0.0003897148018386174, |
| "loss": 0.345, |
| "num_input_tokens_seen": 1873432, |
| "step": 6580 |
| }, |
| { |
| "epoch": 12.28544776119403, |
| "grad_norm": 0.3903844356536865, |
| "learning_rate": 0.0003889209447946116, |
| "loss": 0.5506, |
| "num_input_tokens_seen": 1874776, |
| "step": 6585 |
| }, |
| { |
| "epoch": 12.294776119402986, |
| "grad_norm": 0.39230290055274963, |
| "learning_rate": 0.0003881273821909016, |
| "loss": 0.3959, |
| "num_input_tokens_seen": 1876216, |
| "step": 6590 |
| }, |
| { |
| "epoch": 12.30410447761194, |
| "grad_norm": 0.49180158972740173, |
| "learning_rate": 0.00038733411613100615, |
| "loss": 0.3796, |
| "num_input_tokens_seen": 1877528, |
| "step": 6595 |
| }, |
| { |
| "epoch": 12.313432835820896, |
| "grad_norm": 0.5546965003013611, |
| "learning_rate": 0.0003865411487176567, |
| "loss": 0.3909, |
| "num_input_tokens_seen": 1879064, |
| "step": 6600 |
| }, |
| { |
| "epoch": 12.322761194029852, |
| "grad_norm": 0.4393283724784851, |
| "learning_rate": 0.00038574848205279416, |
| "loss": 0.3867, |
| "num_input_tokens_seen": 1880504, |
| "step": 6605 |
| }, |
| { |
| "epoch": 12.332089552238806, |
| "grad_norm": 0.4588424563407898, |
| "learning_rate": 0.0003849561182375613, |
| "loss": 0.3882, |
| "num_input_tokens_seen": 1881944, |
| "step": 6610 |
| }, |
| { |
| "epoch": 12.341417910447761, |
| "grad_norm": 0.30848169326782227, |
| "learning_rate": 0.0003841640593722992, |
| "loss": 0.3721, |
| "num_input_tokens_seen": 1883352, |
| "step": 6615 |
| }, |
| { |
| "epoch": 12.350746268656717, |
| "grad_norm": 0.5776231288909912, |
| "learning_rate": 0.0003833723075565394, |
| "loss": 0.4433, |
| "num_input_tokens_seen": 1884888, |
| "step": 6620 |
| }, |
| { |
| "epoch": 12.360074626865671, |
| "grad_norm": 0.45106711983680725, |
| "learning_rate": 0.0003825808648890005, |
| "loss": 0.4721, |
| "num_input_tokens_seen": 1886456, |
| "step": 6625 |
| }, |
| { |
| "epoch": 12.369402985074627, |
| "grad_norm": 0.4222325086593628, |
| "learning_rate": 0.00038178973346758143, |
| "loss": 0.5654, |
| "num_input_tokens_seen": 1887832, |
| "step": 6630 |
| }, |
| { |
| "epoch": 12.378731343283581, |
| "grad_norm": 0.3949355185031891, |
| "learning_rate": 0.00038099891538935537, |
| "loss": 0.4811, |
| "num_input_tokens_seen": 1889144, |
| "step": 6635 |
| }, |
| { |
| "epoch": 12.388059701492537, |
| "grad_norm": 0.48827362060546875, |
| "learning_rate": 0.0003802084127505662, |
| "loss": 0.3642, |
| "num_input_tokens_seen": 1890616, |
| "step": 6640 |
| }, |
| { |
| "epoch": 12.397388059701493, |
| "grad_norm": 0.41914358735084534, |
| "learning_rate": 0.0003794182276466201, |
| "loss": 0.3448, |
| "num_input_tokens_seen": 1892248, |
| "step": 6645 |
| }, |
| { |
| "epoch": 12.406716417910447, |
| "grad_norm": 0.41748374700546265, |
| "learning_rate": 0.00037862836217208295, |
| "loss": 0.3126, |
| "num_input_tokens_seen": 1893688, |
| "step": 6650 |
| }, |
| { |
| "epoch": 12.416044776119403, |
| "grad_norm": 0.3538048565387726, |
| "learning_rate": 0.0003778388184206728, |
| "loss": 0.3595, |
| "num_input_tokens_seen": 1895096, |
| "step": 6655 |
| }, |
| { |
| "epoch": 12.425373134328359, |
| "grad_norm": 0.6328346729278564, |
| "learning_rate": 0.00037704959848525464, |
| "loss": 0.3776, |
| "num_input_tokens_seen": 1896312, |
| "step": 6660 |
| }, |
| { |
| "epoch": 12.434701492537313, |
| "grad_norm": 0.5902320146560669, |
| "learning_rate": 0.00037626070445783566, |
| "loss": 0.4868, |
| "num_input_tokens_seen": 1897496, |
| "step": 6665 |
| }, |
| { |
| "epoch": 12.444029850746269, |
| "grad_norm": 0.4881054162979126, |
| "learning_rate": 0.0003754721384295587, |
| "loss": 0.3676, |
| "num_input_tokens_seen": 1898968, |
| "step": 6670 |
| }, |
| { |
| "epoch": 12.453358208955224, |
| "grad_norm": 0.3948395848274231, |
| "learning_rate": 0.0003746839024906974, |
| "loss": 0.4843, |
| "num_input_tokens_seen": 1900248, |
| "step": 6675 |
| }, |
| { |
| "epoch": 12.462686567164178, |
| "grad_norm": 0.41477930545806885, |
| "learning_rate": 0.00037389599873065033, |
| "loss": 0.3943, |
| "num_input_tokens_seen": 1901592, |
| "step": 6680 |
| }, |
| { |
| "epoch": 12.472014925373134, |
| "grad_norm": 0.4800880253314972, |
| "learning_rate": 0.0003731084292379356, |
| "loss": 0.4224, |
| "num_input_tokens_seen": 1902904, |
| "step": 6685 |
| }, |
| { |
| "epoch": 12.48134328358209, |
| "grad_norm": 0.5596414804458618, |
| "learning_rate": 0.00037232119610018535, |
| "loss": 0.3543, |
| "num_input_tokens_seen": 1904248, |
| "step": 6690 |
| }, |
| { |
| "epoch": 12.490671641791044, |
| "grad_norm": 0.4646502435207367, |
| "learning_rate": 0.00037153430140413984, |
| "loss": 0.3899, |
| "num_input_tokens_seen": 1905688, |
| "step": 6695 |
| }, |
| { |
| "epoch": 12.5, |
| "grad_norm": 0.7304385304450989, |
| "learning_rate": 0.00037074774723564266, |
| "loss": 0.5413, |
| "num_input_tokens_seen": 1906840, |
| "step": 6700 |
| }, |
| { |
| "epoch": 12.509328358208956, |
| "grad_norm": 0.4171794056892395, |
| "learning_rate": 0.0003699615356796342, |
| "loss": 0.2608, |
| "num_input_tokens_seen": 1908184, |
| "step": 6705 |
| }, |
| { |
| "epoch": 12.51865671641791, |
| "grad_norm": 0.4009363055229187, |
| "learning_rate": 0.0003691756688201471, |
| "loss": 0.3174, |
| "num_input_tokens_seen": 1909528, |
| "step": 6710 |
| }, |
| { |
| "epoch": 12.527985074626866, |
| "grad_norm": 0.5148348808288574, |
| "learning_rate": 0.0003683901487403004, |
| "loss": 0.3957, |
| "num_input_tokens_seen": 1910968, |
| "step": 6715 |
| }, |
| { |
| "epoch": 12.537313432835822, |
| "grad_norm": 0.3038680851459503, |
| "learning_rate": 0.00036760497752229336, |
| "loss": 0.2928, |
| "num_input_tokens_seen": 1912472, |
| "step": 6720 |
| }, |
| { |
| "epoch": 12.546641791044776, |
| "grad_norm": 0.5594529509544373, |
| "learning_rate": 0.00036682015724740115, |
| "loss": 0.3539, |
| "num_input_tokens_seen": 1913912, |
| "step": 6725 |
| }, |
| { |
| "epoch": 12.555970149253731, |
| "grad_norm": 0.35502153635025024, |
| "learning_rate": 0.00036603568999596815, |
| "loss": 0.3463, |
| "num_input_tokens_seen": 1915512, |
| "step": 6730 |
| }, |
| { |
| "epoch": 12.565298507462687, |
| "grad_norm": 0.547687828540802, |
| "learning_rate": 0.00036525157784740337, |
| "loss": 0.4171, |
| "num_input_tokens_seen": 1916856, |
| "step": 6735 |
| }, |
| { |
| "epoch": 12.574626865671641, |
| "grad_norm": 0.3124009072780609, |
| "learning_rate": 0.0003644678228801742, |
| "loss": 0.2618, |
| "num_input_tokens_seen": 1918360, |
| "step": 6740 |
| }, |
| { |
| "epoch": 12.583955223880597, |
| "grad_norm": 0.5247648358345032, |
| "learning_rate": 0.00036368442717180154, |
| "loss": 0.3337, |
| "num_input_tokens_seen": 1919864, |
| "step": 6745 |
| }, |
| { |
| "epoch": 12.593283582089553, |
| "grad_norm": 0.41606518626213074, |
| "learning_rate": 0.00036290139279885394, |
| "loss": 0.3932, |
| "num_input_tokens_seen": 1921176, |
| "step": 6750 |
| }, |
| { |
| "epoch": 12.602611940298507, |
| "grad_norm": 0.4646458327770233, |
| "learning_rate": 0.0003621187218369418, |
| "loss": 0.5042, |
| "num_input_tokens_seen": 1922424, |
| "step": 6755 |
| }, |
| { |
| "epoch": 12.611940298507463, |
| "grad_norm": 0.322963148355484, |
| "learning_rate": 0.0003613364163607128, |
| "loss": 0.388, |
| "num_input_tokens_seen": 1923736, |
| "step": 6760 |
| }, |
| { |
| "epoch": 12.621268656716419, |
| "grad_norm": 0.4273810088634491, |
| "learning_rate": 0.00036055447844384527, |
| "loss": 0.3713, |
| "num_input_tokens_seen": 1925112, |
| "step": 6765 |
| }, |
| { |
| "epoch": 12.630597014925373, |
| "grad_norm": 0.4464828073978424, |
| "learning_rate": 0.0003597729101590436, |
| "loss": 0.3738, |
| "num_input_tokens_seen": 1926680, |
| "step": 6770 |
| }, |
| { |
| "epoch": 12.639925373134329, |
| "grad_norm": 0.5656095743179321, |
| "learning_rate": 0.0003589917135780323, |
| "loss": 0.2075, |
| "num_input_tokens_seen": 1928024, |
| "step": 6775 |
| }, |
| { |
| "epoch": 12.649253731343283, |
| "grad_norm": 0.5868489146232605, |
| "learning_rate": 0.00035821089077155046, |
| "loss": 0.2905, |
| "num_input_tokens_seen": 1929592, |
| "step": 6780 |
| }, |
| { |
| "epoch": 12.658582089552239, |
| "grad_norm": 0.5153266191482544, |
| "learning_rate": 0.00035743044380934653, |
| "loss": 0.4824, |
| "num_input_tokens_seen": 1930840, |
| "step": 6785 |
| }, |
| { |
| "epoch": 12.667910447761194, |
| "grad_norm": 0.632239818572998, |
| "learning_rate": 0.00035665037476017257, |
| "loss": 0.3472, |
| "num_input_tokens_seen": 1932312, |
| "step": 6790 |
| }, |
| { |
| "epoch": 12.677238805970148, |
| "grad_norm": 0.4063834547996521, |
| "learning_rate": 0.00035587068569177923, |
| "loss": 0.4525, |
| "num_input_tokens_seen": 1933784, |
| "step": 6795 |
| }, |
| { |
| "epoch": 12.686567164179104, |
| "grad_norm": 0.44790780544281006, |
| "learning_rate": 0.0003550913786709094, |
| "loss": 0.3497, |
| "num_input_tokens_seen": 1935224, |
| "step": 6800 |
| }, |
| { |
| "epoch": 12.69589552238806, |
| "grad_norm": 0.4837767481803894, |
| "learning_rate": 0.0003543124557632936, |
| "loss": 0.4892, |
| "num_input_tokens_seen": 1936504, |
| "step": 6805 |
| }, |
| { |
| "epoch": 12.705223880597014, |
| "grad_norm": 0.48883289098739624, |
| "learning_rate": 0.0003535339190336446, |
| "loss": 0.433, |
| "num_input_tokens_seen": 1937912, |
| "step": 6810 |
| }, |
| { |
| "epoch": 12.71455223880597, |
| "grad_norm": 0.3776569664478302, |
| "learning_rate": 0.00035275577054565046, |
| "loss": 0.3781, |
| "num_input_tokens_seen": 1939128, |
| "step": 6815 |
| }, |
| { |
| "epoch": 12.723880597014926, |
| "grad_norm": 0.4895813763141632, |
| "learning_rate": 0.0003519780123619709, |
| "loss": 0.3092, |
| "num_input_tokens_seen": 1940600, |
| "step": 6820 |
| }, |
| { |
| "epoch": 12.73320895522388, |
| "grad_norm": 0.3760162591934204, |
| "learning_rate": 0.0003512006465442309, |
| "loss": 0.4304, |
| "num_input_tokens_seen": 1942008, |
| "step": 6825 |
| }, |
| { |
| "epoch": 12.742537313432836, |
| "grad_norm": 0.31054630875587463, |
| "learning_rate": 0.0003504236751530152, |
| "loss": 0.2892, |
| "num_input_tokens_seen": 1943352, |
| "step": 6830 |
| }, |
| { |
| "epoch": 12.751865671641792, |
| "grad_norm": 0.36416080594062805, |
| "learning_rate": 0.0003496471002478635, |
| "loss": 0.2315, |
| "num_input_tokens_seen": 1944792, |
| "step": 6835 |
| }, |
| { |
| "epoch": 12.761194029850746, |
| "grad_norm": 0.22087052464485168, |
| "learning_rate": 0.0003488709238872637, |
| "loss": 0.194, |
| "num_input_tokens_seen": 1946392, |
| "step": 6840 |
| }, |
| { |
| "epoch": 12.770522388059701, |
| "grad_norm": 0.4641207754611969, |
| "learning_rate": 0.0003480951481286484, |
| "loss": 0.2972, |
| "num_input_tokens_seen": 1947896, |
| "step": 6845 |
| }, |
| { |
| "epoch": 12.779850746268657, |
| "grad_norm": 0.3787482678890228, |
| "learning_rate": 0.00034731977502838686, |
| "loss": 0.2635, |
| "num_input_tokens_seen": 1949304, |
| "step": 6850 |
| }, |
| { |
| "epoch": 12.789179104477611, |
| "grad_norm": 0.3138027787208557, |
| "learning_rate": 0.00034654480664178257, |
| "loss": 0.1856, |
| "num_input_tokens_seen": 1950744, |
| "step": 6855 |
| }, |
| { |
| "epoch": 12.798507462686567, |
| "grad_norm": 0.34298160672187805, |
| "learning_rate": 0.00034577024502306484, |
| "loss": 0.4447, |
| "num_input_tokens_seen": 1952184, |
| "step": 6860 |
| }, |
| { |
| "epoch": 12.807835820895523, |
| "grad_norm": 0.7594728469848633, |
| "learning_rate": 0.0003449960922253857, |
| "loss": 0.4111, |
| "num_input_tokens_seen": 1953400, |
| "step": 6865 |
| }, |
| { |
| "epoch": 12.817164179104477, |
| "grad_norm": 0.41920387744903564, |
| "learning_rate": 0.0003442223503008135, |
| "loss": 0.4234, |
| "num_input_tokens_seen": 1955032, |
| "step": 6870 |
| }, |
| { |
| "epoch": 12.826492537313433, |
| "grad_norm": 0.30781400203704834, |
| "learning_rate": 0.0003434490213003264, |
| "loss": 0.3907, |
| "num_input_tokens_seen": 1956600, |
| "step": 6875 |
| }, |
| { |
| "epoch": 12.835820895522389, |
| "grad_norm": 0.42326509952545166, |
| "learning_rate": 0.00034267610727380956, |
| "loss": 0.4245, |
| "num_input_tokens_seen": 1958392, |
| "step": 6880 |
| }, |
| { |
| "epoch": 12.845149253731343, |
| "grad_norm": 0.3201337456703186, |
| "learning_rate": 0.0003419036102700467, |
| "loss": 0.3024, |
| "num_input_tokens_seen": 1959864, |
| "step": 6885 |
| }, |
| { |
| "epoch": 12.854477611940299, |
| "grad_norm": 0.5356371998786926, |
| "learning_rate": 0.0003411315323367172, |
| "loss": 0.5257, |
| "num_input_tokens_seen": 1961240, |
| "step": 6890 |
| }, |
| { |
| "epoch": 12.863805970149254, |
| "grad_norm": 0.5199465751647949, |
| "learning_rate": 0.00034035987552038914, |
| "loss": 0.3615, |
| "num_input_tokens_seen": 1962584, |
| "step": 6895 |
| }, |
| { |
| "epoch": 12.873134328358208, |
| "grad_norm": 0.3426276445388794, |
| "learning_rate": 0.0003395886418665144, |
| "loss": 0.3051, |
| "num_input_tokens_seen": 1964056, |
| "step": 6900 |
| }, |
| { |
| "epoch": 12.882462686567164, |
| "grad_norm": 0.45455190539360046, |
| "learning_rate": 0.0003388178334194232, |
| "loss": 0.3607, |
| "num_input_tokens_seen": 1965560, |
| "step": 6905 |
| }, |
| { |
| "epoch": 12.89179104477612, |
| "grad_norm": 0.40248537063598633, |
| "learning_rate": 0.00033804745222231836, |
| "loss": 0.3731, |
| "num_input_tokens_seen": 1967000, |
| "step": 6910 |
| }, |
| { |
| "epoch": 12.901119402985074, |
| "grad_norm": 0.24892118573188782, |
| "learning_rate": 0.00033727750031727077, |
| "loss": 0.2977, |
| "num_input_tokens_seen": 1968376, |
| "step": 6915 |
| }, |
| { |
| "epoch": 12.91044776119403, |
| "grad_norm": 0.2123814970254898, |
| "learning_rate": 0.00033650797974521285, |
| "loss": 0.372, |
| "num_input_tokens_seen": 1969752, |
| "step": 6920 |
| }, |
| { |
| "epoch": 12.919776119402986, |
| "grad_norm": 0.6029353141784668, |
| "learning_rate": 0.00033573889254593384, |
| "loss": 0.6293, |
| "num_input_tokens_seen": 1971192, |
| "step": 6925 |
| }, |
| { |
| "epoch": 12.92910447761194, |
| "grad_norm": 0.32942214608192444, |
| "learning_rate": 0.0003349702407580745, |
| "loss": 0.4528, |
| "num_input_tokens_seen": 1972664, |
| "step": 6930 |
| }, |
| { |
| "epoch": 12.938432835820896, |
| "grad_norm": 0.40889158844947815, |
| "learning_rate": 0.0003342020264191208, |
| "loss": 0.4118, |
| "num_input_tokens_seen": 1974104, |
| "step": 6935 |
| }, |
| { |
| "epoch": 12.947761194029852, |
| "grad_norm": 0.38301944732666016, |
| "learning_rate": 0.0003334342515654, |
| "loss": 0.3288, |
| "num_input_tokens_seen": 1975512, |
| "step": 6940 |
| }, |
| { |
| "epoch": 12.957089552238806, |
| "grad_norm": 0.6120052337646484, |
| "learning_rate": 0.00033266691823207356, |
| "loss": 0.4245, |
| "num_input_tokens_seen": 1976760, |
| "step": 6945 |
| }, |
| { |
| "epoch": 12.966417910447761, |
| "grad_norm": 0.34194818139076233, |
| "learning_rate": 0.0003319000284531332, |
| "loss": 0.2904, |
| "num_input_tokens_seen": 1978072, |
| "step": 6950 |
| }, |
| { |
| "epoch": 12.975746268656717, |
| "grad_norm": 0.31473881006240845, |
| "learning_rate": 0.00033113358426139464, |
| "loss": 0.248, |
| "num_input_tokens_seen": 1979512, |
| "step": 6955 |
| }, |
| { |
| "epoch": 12.985074626865671, |
| "grad_norm": 0.6300244331359863, |
| "learning_rate": 0.0003303675876884923, |
| "loss": 0.4075, |
| "num_input_tokens_seen": 1980792, |
| "step": 6960 |
| }, |
| { |
| "epoch": 12.994402985074627, |
| "grad_norm": 0.463429719209671, |
| "learning_rate": 0.0003296020407648747, |
| "loss": 0.301, |
| "num_input_tokens_seen": 1982104, |
| "step": 6965 |
| }, |
| { |
| "epoch": 13.0, |
| "eval_loss": 0.763629674911499, |
| "eval_runtime": 4.2048, |
| "eval_samples_per_second": 56.602, |
| "eval_steps_per_second": 14.27, |
| "num_input_tokens_seen": 1982664, |
| "step": 6968 |
| }, |
| { |
| "epoch": 13.003731343283581, |
| "grad_norm": 0.36986637115478516, |
| "learning_rate": 0.00032883694551979765, |
| "loss": 0.4503, |
| "num_input_tokens_seen": 1983272, |
| "step": 6970 |
| }, |
| { |
| "epoch": 13.013059701492537, |
| "grad_norm": 0.5193461179733276, |
| "learning_rate": 0.00032807230398132037, |
| "loss": 0.3963, |
| "num_input_tokens_seen": 1984744, |
| "step": 6975 |
| }, |
| { |
| "epoch": 13.022388059701493, |
| "grad_norm": 0.4553769528865814, |
| "learning_rate": 0.0003273081181762989, |
| "loss": 0.2928, |
| "num_input_tokens_seen": 1986088, |
| "step": 6980 |
| }, |
| { |
| "epoch": 13.031716417910447, |
| "grad_norm": 0.30947038531303406, |
| "learning_rate": 0.0003265443901303816, |
| "loss": 0.2241, |
| "num_input_tokens_seen": 1987496, |
| "step": 6985 |
| }, |
| { |
| "epoch": 13.041044776119403, |
| "grad_norm": 0.4447937607765198, |
| "learning_rate": 0.0003257811218680035, |
| "loss": 0.3132, |
| "num_input_tokens_seen": 1988904, |
| "step": 6990 |
| }, |
| { |
| "epoch": 13.050373134328359, |
| "grad_norm": 0.7096430659294128, |
| "learning_rate": 0.00032501831541238046, |
| "loss": 0.5288, |
| "num_input_tokens_seen": 1990280, |
| "step": 6995 |
| }, |
| { |
| "epoch": 13.059701492537313, |
| "grad_norm": 0.4876936972141266, |
| "learning_rate": 0.0003242559727855047, |
| "loss": 0.3568, |
| "num_input_tokens_seen": 1991720, |
| "step": 7000 |
| }, |
| { |
| "epoch": 13.069029850746269, |
| "grad_norm": 0.5312484502792358, |
| "learning_rate": 0.0003234940960081384, |
| "loss": 0.4548, |
| "num_input_tokens_seen": 1993128, |
| "step": 7005 |
| }, |
| { |
| "epoch": 13.078358208955224, |
| "grad_norm": 0.4942055344581604, |
| "learning_rate": 0.00032273268709980934, |
| "loss": 0.2092, |
| "num_input_tokens_seen": 1994696, |
| "step": 7010 |
| }, |
| { |
| "epoch": 13.087686567164178, |
| "grad_norm": 0.5409295558929443, |
| "learning_rate": 0.0003219717480788052, |
| "loss": 0.4487, |
| "num_input_tokens_seen": 1996072, |
| "step": 7015 |
| }, |
| { |
| "epoch": 13.097014925373134, |
| "grad_norm": 0.49344658851623535, |
| "learning_rate": 0.0003212112809621676, |
| "loss": 0.365, |
| "num_input_tokens_seen": 1997448, |
| "step": 7020 |
| }, |
| { |
| "epoch": 13.10634328358209, |
| "grad_norm": 0.2911165654659271, |
| "learning_rate": 0.0003204512877656878, |
| "loss": 0.3687, |
| "num_input_tokens_seen": 1999016, |
| "step": 7025 |
| }, |
| { |
| "epoch": 13.115671641791044, |
| "grad_norm": 0.596234142780304, |
| "learning_rate": 0.0003196917705039004, |
| "loss": 0.3475, |
| "num_input_tokens_seen": 2000136, |
| "step": 7030 |
| }, |
| { |
| "epoch": 13.125, |
| "grad_norm": 0.4349397122859955, |
| "learning_rate": 0.0003189327311900788, |
| "loss": 0.333, |
| "num_input_tokens_seen": 2001576, |
| "step": 7035 |
| }, |
| { |
| "epoch": 13.134328358208956, |
| "grad_norm": 0.5499020218849182, |
| "learning_rate": 0.00031817417183622917, |
| "loss": 0.2503, |
| "num_input_tokens_seen": 2002920, |
| "step": 7040 |
| }, |
| { |
| "epoch": 13.14365671641791, |
| "grad_norm": 0.5799429416656494, |
| "learning_rate": 0.0003174160944530855, |
| "loss": 0.3933, |
| "num_input_tokens_seen": 2004424, |
| "step": 7045 |
| }, |
| { |
| "epoch": 13.152985074626866, |
| "grad_norm": 0.5225064754486084, |
| "learning_rate": 0.00031665850105010466, |
| "loss": 0.3231, |
| "num_input_tokens_seen": 2005864, |
| "step": 7050 |
| }, |
| { |
| "epoch": 13.162313432835822, |
| "grad_norm": 0.4829569160938263, |
| "learning_rate": 0.0003159013936354598, |
| "loss": 0.405, |
| "num_input_tokens_seen": 2007432, |
| "step": 7055 |
| }, |
| { |
| "epoch": 13.171641791044776, |
| "grad_norm": 0.47015833854675293, |
| "learning_rate": 0.00031514477421603677, |
| "loss": 0.3184, |
| "num_input_tokens_seen": 2008808, |
| "step": 7060 |
| }, |
| { |
| "epoch": 13.180970149253731, |
| "grad_norm": 0.5161189436912537, |
| "learning_rate": 0.0003143886447974269, |
| "loss": 0.3284, |
| "num_input_tokens_seen": 2010184, |
| "step": 7065 |
| }, |
| { |
| "epoch": 13.190298507462687, |
| "grad_norm": 0.5534188151359558, |
| "learning_rate": 0.0003136330073839233, |
| "loss": 0.2741, |
| "num_input_tokens_seen": 2011816, |
| "step": 7070 |
| }, |
| { |
| "epoch": 13.199626865671641, |
| "grad_norm": 0.37270939350128174, |
| "learning_rate": 0.00031287786397851523, |
| "loss": 0.3485, |
| "num_input_tokens_seen": 2013448, |
| "step": 7075 |
| }, |
| { |
| "epoch": 13.208955223880597, |
| "grad_norm": 0.35400187969207764, |
| "learning_rate": 0.0003121232165828813, |
| "loss": 0.3619, |
| "num_input_tokens_seen": 2014632, |
| "step": 7080 |
| }, |
| { |
| "epoch": 13.218283582089553, |
| "grad_norm": 0.7645230293273926, |
| "learning_rate": 0.0003113690671973867, |
| "loss": 0.335, |
| "num_input_tokens_seen": 2015944, |
| "step": 7085 |
| }, |
| { |
| "epoch": 13.227611940298507, |
| "grad_norm": 0.6307098865509033, |
| "learning_rate": 0.0003106154178210753, |
| "loss": 0.3954, |
| "num_input_tokens_seen": 2017288, |
| "step": 7090 |
| }, |
| { |
| "epoch": 13.236940298507463, |
| "grad_norm": 0.4696156680583954, |
| "learning_rate": 0.0003098622704516667, |
| "loss": 0.3492, |
| "num_input_tokens_seen": 2018728, |
| "step": 7095 |
| }, |
| { |
| "epoch": 13.246268656716419, |
| "grad_norm": 0.3370290994644165, |
| "learning_rate": 0.0003091096270855487, |
| "loss": 0.3904, |
| "num_input_tokens_seen": 2020136, |
| "step": 7100 |
| }, |
| { |
| "epoch": 13.255597014925373, |
| "grad_norm": 0.5213427543640137, |
| "learning_rate": 0.00030835748971777413, |
| "loss": 0.2854, |
| "num_input_tokens_seen": 2021384, |
| "step": 7105 |
| }, |
| { |
| "epoch": 13.264925373134329, |
| "grad_norm": 0.6416372060775757, |
| "learning_rate": 0.000307605860342054, |
| "loss": 0.2781, |
| "num_input_tokens_seen": 2022728, |
| "step": 7110 |
| }, |
| { |
| "epoch": 13.274253731343283, |
| "grad_norm": 0.40849006175994873, |
| "learning_rate": 0.0003068547409507528, |
| "loss": 0.2897, |
| "num_input_tokens_seen": 2024168, |
| "step": 7115 |
| }, |
| { |
| "epoch": 13.283582089552239, |
| "grad_norm": 0.4013388156890869, |
| "learning_rate": 0.0003061041335348837, |
| "loss": 0.2546, |
| "num_input_tokens_seen": 2025576, |
| "step": 7120 |
| }, |
| { |
| "epoch": 13.292910447761194, |
| "grad_norm": 0.3964715301990509, |
| "learning_rate": 0.00030535404008410165, |
| "loss": 0.227, |
| "num_input_tokens_seen": 2026984, |
| "step": 7125 |
| }, |
| { |
| "epoch": 13.302238805970148, |
| "grad_norm": 0.49855029582977295, |
| "learning_rate": 0.0003046044625867004, |
| "loss": 0.4356, |
| "num_input_tokens_seen": 2028584, |
| "step": 7130 |
| }, |
| { |
| "epoch": 13.311567164179104, |
| "grad_norm": 0.22731898725032806, |
| "learning_rate": 0.0003038554030296056, |
| "loss": 0.4009, |
| "num_input_tokens_seen": 2030088, |
| "step": 7135 |
| }, |
| { |
| "epoch": 13.32089552238806, |
| "grad_norm": 0.43297654390335083, |
| "learning_rate": 0.0003031068633983697, |
| "loss": 0.259, |
| "num_input_tokens_seen": 2031304, |
| "step": 7140 |
| }, |
| { |
| "epoch": 13.330223880597014, |
| "grad_norm": 0.3339049220085144, |
| "learning_rate": 0.00030235884567716737, |
| "loss": 0.2448, |
| "num_input_tokens_seen": 2032648, |
| "step": 7145 |
| }, |
| { |
| "epoch": 13.33955223880597, |
| "grad_norm": 0.4528850317001343, |
| "learning_rate": 0.00030161135184878955, |
| "loss": 0.4167, |
| "num_input_tokens_seen": 2034024, |
| "step": 7150 |
| }, |
| { |
| "epoch": 13.348880597014926, |
| "grad_norm": 0.367870032787323, |
| "learning_rate": 0.00030086438389463887, |
| "loss": 0.3501, |
| "num_input_tokens_seen": 2035656, |
| "step": 7155 |
| }, |
| { |
| "epoch": 13.35820895522388, |
| "grad_norm": 0.49384191632270813, |
| "learning_rate": 0.00030011794379472344, |
| "loss": 0.3107, |
| "num_input_tokens_seen": 2037256, |
| "step": 7160 |
| }, |
| { |
| "epoch": 13.367537313432836, |
| "grad_norm": 0.578643798828125, |
| "learning_rate": 0.00029937203352765267, |
| "loss": 0.3913, |
| "num_input_tokens_seen": 2038888, |
| "step": 7165 |
| }, |
| { |
| "epoch": 13.376865671641792, |
| "grad_norm": 0.4546909034252167, |
| "learning_rate": 0.00029862665507063144, |
| "loss": 0.2388, |
| "num_input_tokens_seen": 2040456, |
| "step": 7170 |
| }, |
| { |
| "epoch": 13.386194029850746, |
| "grad_norm": 0.39690783619880676, |
| "learning_rate": 0.00029788181039945463, |
| "loss": 0.2372, |
| "num_input_tokens_seen": 2041896, |
| "step": 7175 |
| }, |
| { |
| "epoch": 13.395522388059701, |
| "grad_norm": 0.3378133773803711, |
| "learning_rate": 0.0002971375014885026, |
| "loss": 0.3537, |
| "num_input_tokens_seen": 2043368, |
| "step": 7180 |
| }, |
| { |
| "epoch": 13.404850746268657, |
| "grad_norm": 0.5262777209281921, |
| "learning_rate": 0.0002963937303107352, |
| "loss": 0.3643, |
| "num_input_tokens_seen": 2044680, |
| "step": 7185 |
| }, |
| { |
| "epoch": 13.414179104477611, |
| "grad_norm": 0.506253182888031, |
| "learning_rate": 0.0002956504988376873, |
| "loss": 0.3046, |
| "num_input_tokens_seen": 2046120, |
| "step": 7190 |
| }, |
| { |
| "epoch": 13.423507462686567, |
| "grad_norm": 0.4321569502353668, |
| "learning_rate": 0.0002949078090394629, |
| "loss": 0.3583, |
| "num_input_tokens_seen": 2047528, |
| "step": 7195 |
| }, |
| { |
| "epoch": 13.432835820895523, |
| "grad_norm": 0.6814833879470825, |
| "learning_rate": 0.00029416566288472995, |
| "loss": 0.3434, |
| "num_input_tokens_seen": 2048968, |
| "step": 7200 |
| }, |
| { |
| "epoch": 13.442164179104477, |
| "grad_norm": 0.1856795698404312, |
| "learning_rate": 0.00029342406234071595, |
| "loss": 0.3247, |
| "num_input_tokens_seen": 2050312, |
| "step": 7205 |
| }, |
| { |
| "epoch": 13.451492537313433, |
| "grad_norm": 0.43723657727241516, |
| "learning_rate": 0.00029268300937320145, |
| "loss": 0.2912, |
| "num_input_tokens_seen": 2051848, |
| "step": 7210 |
| }, |
| { |
| "epoch": 13.460820895522389, |
| "grad_norm": 0.6841850280761719, |
| "learning_rate": 0.00029194250594651624, |
| "loss": 0.3761, |
| "num_input_tokens_seen": 2053160, |
| "step": 7215 |
| }, |
| { |
| "epoch": 13.470149253731343, |
| "grad_norm": 0.3979460895061493, |
| "learning_rate": 0.0002912025540235327, |
| "loss": 0.4691, |
| "num_input_tokens_seen": 2054600, |
| "step": 7220 |
| }, |
| { |
| "epoch": 13.479477611940299, |
| "grad_norm": 0.33426252007484436, |
| "learning_rate": 0.0002904631555656616, |
| "loss": 0.3804, |
| "num_input_tokens_seen": 2055944, |
| "step": 7225 |
| }, |
| { |
| "epoch": 13.488805970149254, |
| "grad_norm": 0.28705891966819763, |
| "learning_rate": 0.00028972431253284725, |
| "loss": 0.3625, |
| "num_input_tokens_seen": 2057384, |
| "step": 7230 |
| }, |
| { |
| "epoch": 13.498134328358208, |
| "grad_norm": 0.49933427572250366, |
| "learning_rate": 0.0002889860268835607, |
| "loss": 0.297, |
| "num_input_tokens_seen": 2058888, |
| "step": 7235 |
| }, |
| { |
| "epoch": 13.507462686567164, |
| "grad_norm": 0.5333276391029358, |
| "learning_rate": 0.00028824830057479613, |
| "loss": 0.1839, |
| "num_input_tokens_seen": 2060200, |
| "step": 7240 |
| }, |
| { |
| "epoch": 13.51679104477612, |
| "grad_norm": 0.3615100085735321, |
| "learning_rate": 0.00028751113556206456, |
| "loss": 0.244, |
| "num_input_tokens_seen": 2061640, |
| "step": 7245 |
| }, |
| { |
| "epoch": 13.526119402985074, |
| "grad_norm": 0.38745981454849243, |
| "learning_rate": 0.0002867745337993899, |
| "loss": 0.4342, |
| "num_input_tokens_seen": 2063112, |
| "step": 7250 |
| }, |
| { |
| "epoch": 13.53544776119403, |
| "grad_norm": 0.5390931367874146, |
| "learning_rate": 0.00028603849723930243, |
| "loss": 0.3696, |
| "num_input_tokens_seen": 2064488, |
| "step": 7255 |
| }, |
| { |
| "epoch": 13.544776119402986, |
| "grad_norm": 0.4872875511646271, |
| "learning_rate": 0.00028530302783283433, |
| "loss": 0.4132, |
| "num_input_tokens_seen": 2065864, |
| "step": 7260 |
| }, |
| { |
| "epoch": 13.55410447761194, |
| "grad_norm": 0.6258790493011475, |
| "learning_rate": 0.00028456812752951485, |
| "loss": 0.3688, |
| "num_input_tokens_seen": 2067304, |
| "step": 7265 |
| }, |
| { |
| "epoch": 13.563432835820896, |
| "grad_norm": 0.4587050974369049, |
| "learning_rate": 0.0002838337982773641, |
| "loss": 0.3483, |
| "num_input_tokens_seen": 2068680, |
| "step": 7270 |
| }, |
| { |
| "epoch": 13.572761194029852, |
| "grad_norm": 0.5654541850090027, |
| "learning_rate": 0.00028310004202288885, |
| "loss": 0.6093, |
| "num_input_tokens_seen": 2070120, |
| "step": 7275 |
| }, |
| { |
| "epoch": 13.582089552238806, |
| "grad_norm": 0.4311787784099579, |
| "learning_rate": 0.0002823668607110767, |
| "loss": 0.2606, |
| "num_input_tokens_seen": 2071496, |
| "step": 7280 |
| }, |
| { |
| "epoch": 13.591417910447761, |
| "grad_norm": 0.4221719205379486, |
| "learning_rate": 0.00028163425628539184, |
| "loss": 0.3933, |
| "num_input_tokens_seen": 2072840, |
| "step": 7285 |
| }, |
| { |
| "epoch": 13.600746268656717, |
| "grad_norm": 0.4586186110973358, |
| "learning_rate": 0.00028090223068776867, |
| "loss": 0.4825, |
| "num_input_tokens_seen": 2074152, |
| "step": 7290 |
| }, |
| { |
| "epoch": 13.610074626865671, |
| "grad_norm": 0.6966432332992554, |
| "learning_rate": 0.00028017078585860735, |
| "loss": 0.371, |
| "num_input_tokens_seen": 2075528, |
| "step": 7295 |
| }, |
| { |
| "epoch": 13.619402985074627, |
| "grad_norm": 0.5013579726219177, |
| "learning_rate": 0.000279439923736769, |
| "loss": 0.3927, |
| "num_input_tokens_seen": 2077000, |
| "step": 7300 |
| }, |
| { |
| "epoch": 13.628731343283581, |
| "grad_norm": 0.4000113904476166, |
| "learning_rate": 0.00027870964625956987, |
| "loss": 0.4186, |
| "num_input_tokens_seen": 2078440, |
| "step": 7305 |
| }, |
| { |
| "epoch": 13.638059701492537, |
| "grad_norm": 0.4626561105251312, |
| "learning_rate": 0.0002779799553627762, |
| "loss": 0.3657, |
| "num_input_tokens_seen": 2079848, |
| "step": 7310 |
| }, |
| { |
| "epoch": 13.647388059701493, |
| "grad_norm": 0.289797306060791, |
| "learning_rate": 0.00027725085298060004, |
| "loss": 0.2702, |
| "num_input_tokens_seen": 2081416, |
| "step": 7315 |
| }, |
| { |
| "epoch": 13.656716417910447, |
| "grad_norm": 0.23923040926456451, |
| "learning_rate": 0.0002765223410456929, |
| "loss": 0.3342, |
| "num_input_tokens_seen": 2082888, |
| "step": 7320 |
| }, |
| { |
| "epoch": 13.666044776119403, |
| "grad_norm": 0.5279592871665955, |
| "learning_rate": 0.0002757944214891412, |
| "loss": 0.2363, |
| "num_input_tokens_seen": 2084264, |
| "step": 7325 |
| }, |
| { |
| "epoch": 13.675373134328359, |
| "grad_norm": 0.4002044200897217, |
| "learning_rate": 0.00027506709624046133, |
| "loss": 0.2882, |
| "num_input_tokens_seen": 2085640, |
| "step": 7330 |
| }, |
| { |
| "epoch": 13.684701492537313, |
| "grad_norm": 0.4956214725971222, |
| "learning_rate": 0.00027434036722759434, |
| "loss": 0.3519, |
| "num_input_tokens_seen": 2087048, |
| "step": 7335 |
| }, |
| { |
| "epoch": 13.694029850746269, |
| "grad_norm": 0.4935523271560669, |
| "learning_rate": 0.00027361423637690073, |
| "loss": 0.4493, |
| "num_input_tokens_seen": 2088456, |
| "step": 7340 |
| }, |
| { |
| "epoch": 13.703358208955224, |
| "grad_norm": 0.5428826808929443, |
| "learning_rate": 0.00027288870561315525, |
| "loss": 0.2448, |
| "num_input_tokens_seen": 2090120, |
| "step": 7345 |
| }, |
| { |
| "epoch": 13.712686567164178, |
| "grad_norm": 0.5399242043495178, |
| "learning_rate": 0.00027216377685954253, |
| "loss": 0.4441, |
| "num_input_tokens_seen": 2091592, |
| "step": 7350 |
| }, |
| { |
| "epoch": 13.722014925373134, |
| "grad_norm": 0.397484689950943, |
| "learning_rate": 0.00027143945203765086, |
| "loss": 0.3017, |
| "num_input_tokens_seen": 2093192, |
| "step": 7355 |
| }, |
| { |
| "epoch": 13.73134328358209, |
| "grad_norm": 0.4469533860683441, |
| "learning_rate": 0.00027071573306746793, |
| "loss": 0.4668, |
| "num_input_tokens_seen": 2094632, |
| "step": 7360 |
| }, |
| { |
| "epoch": 13.740671641791044, |
| "grad_norm": 0.538241982460022, |
| "learning_rate": 0.0002699926218673753, |
| "loss": 0.2292, |
| "num_input_tokens_seen": 2096200, |
| "step": 7365 |
| }, |
| { |
| "epoch": 13.75, |
| "grad_norm": 0.6859297752380371, |
| "learning_rate": 0.00026927012035414397, |
| "loss": 0.2249, |
| "num_input_tokens_seen": 2097736, |
| "step": 7370 |
| }, |
| { |
| "epoch": 13.759328358208956, |
| "grad_norm": 0.5205817818641663, |
| "learning_rate": 0.0002685482304429283, |
| "loss": 0.3923, |
| "num_input_tokens_seen": 2099016, |
| "step": 7375 |
| }, |
| { |
| "epoch": 13.76865671641791, |
| "grad_norm": 0.5006594061851501, |
| "learning_rate": 0.00026782695404726153, |
| "loss": 0.2378, |
| "num_input_tokens_seen": 2100360, |
| "step": 7380 |
| }, |
| { |
| "epoch": 13.777985074626866, |
| "grad_norm": 0.45096078515052795, |
| "learning_rate": 0.00026710629307905107, |
| "loss": 0.2475, |
| "num_input_tokens_seen": 2101832, |
| "step": 7385 |
| }, |
| { |
| "epoch": 13.787313432835822, |
| "grad_norm": 0.3145253360271454, |
| "learning_rate": 0.0002663862494485727, |
| "loss": 0.2111, |
| "num_input_tokens_seen": 2103208, |
| "step": 7390 |
| }, |
| { |
| "epoch": 13.796641791044776, |
| "grad_norm": 0.739738404750824, |
| "learning_rate": 0.0002656668250644656, |
| "loss": 0.251, |
| "num_input_tokens_seen": 2104648, |
| "step": 7395 |
| }, |
| { |
| "epoch": 13.805970149253731, |
| "grad_norm": 0.4297608733177185, |
| "learning_rate": 0.0002649480218337276, |
| "loss": 0.3109, |
| "num_input_tokens_seen": 2105992, |
| "step": 7400 |
| }, |
| { |
| "epoch": 13.815298507462687, |
| "grad_norm": 0.4921467900276184, |
| "learning_rate": 0.0002642298416617102, |
| "loss": 0.3678, |
| "num_input_tokens_seen": 2107368, |
| "step": 7405 |
| }, |
| { |
| "epoch": 13.824626865671641, |
| "grad_norm": 0.35035428404808044, |
| "learning_rate": 0.0002635122864521138, |
| "loss": 0.3889, |
| "num_input_tokens_seen": 2108744, |
| "step": 7410 |
| }, |
| { |
| "epoch": 13.833955223880597, |
| "grad_norm": 0.467009961605072, |
| "learning_rate": 0.00026279535810698083, |
| "loss": 0.1599, |
| "num_input_tokens_seen": 2110184, |
| "step": 7415 |
| }, |
| { |
| "epoch": 13.843283582089553, |
| "grad_norm": 0.5431391000747681, |
| "learning_rate": 0.00026207905852669355, |
| "loss": 0.316, |
| "num_input_tokens_seen": 2111656, |
| "step": 7420 |
| }, |
| { |
| "epoch": 13.852611940298507, |
| "grad_norm": 0.5196968913078308, |
| "learning_rate": 0.00026136338960996666, |
| "loss": 0.3132, |
| "num_input_tokens_seen": 2113064, |
| "step": 7425 |
| }, |
| { |
| "epoch": 13.861940298507463, |
| "grad_norm": 0.8434855341911316, |
| "learning_rate": 0.00026064835325384305, |
| "loss": 0.3459, |
| "num_input_tokens_seen": 2114536, |
| "step": 7430 |
| }, |
| { |
| "epoch": 13.871268656716419, |
| "grad_norm": 0.6276556849479675, |
| "learning_rate": 0.0002599339513536897, |
| "loss": 0.3708, |
| "num_input_tokens_seen": 2115912, |
| "step": 7435 |
| }, |
| { |
| "epoch": 13.880597014925373, |
| "grad_norm": 0.5806488394737244, |
| "learning_rate": 0.000259220185803191, |
| "loss": 0.3929, |
| "num_input_tokens_seen": 2117352, |
| "step": 7440 |
| }, |
| { |
| "epoch": 13.889925373134329, |
| "grad_norm": 0.45292994379997253, |
| "learning_rate": 0.0002585070584943452, |
| "loss": 0.3945, |
| "num_input_tokens_seen": 2118664, |
| "step": 7445 |
| }, |
| { |
| "epoch": 13.899253731343283, |
| "grad_norm": 0.45331817865371704, |
| "learning_rate": 0.00025779457131745774, |
| "loss": 0.4171, |
| "num_input_tokens_seen": 2120008, |
| "step": 7450 |
| }, |
| { |
| "epoch": 13.908582089552239, |
| "grad_norm": 0.23430407047271729, |
| "learning_rate": 0.00025708272616113866, |
| "loss": 0.2375, |
| "num_input_tokens_seen": 2121480, |
| "step": 7455 |
| }, |
| { |
| "epoch": 13.917910447761194, |
| "grad_norm": 0.47137734293937683, |
| "learning_rate": 0.0002563715249122948, |
| "loss": 0.3517, |
| "num_input_tokens_seen": 2122920, |
| "step": 7460 |
| }, |
| { |
| "epoch": 13.927238805970148, |
| "grad_norm": 0.6656373739242554, |
| "learning_rate": 0.00025566096945612725, |
| "loss": 0.4133, |
| "num_input_tokens_seen": 2124264, |
| "step": 7465 |
| }, |
| { |
| "epoch": 13.936567164179104, |
| "grad_norm": 0.6758831739425659, |
| "learning_rate": 0.0002549510616761248, |
| "loss": 0.3163, |
| "num_input_tokens_seen": 2125832, |
| "step": 7470 |
| }, |
| { |
| "epoch": 13.94589552238806, |
| "grad_norm": 0.6634296178817749, |
| "learning_rate": 0.00025424180345405903, |
| "loss": 0.2261, |
| "num_input_tokens_seen": 2127336, |
| "step": 7475 |
| }, |
| { |
| "epoch": 13.955223880597014, |
| "grad_norm": 0.6755518317222595, |
| "learning_rate": 0.0002535331966699809, |
| "loss": 0.3299, |
| "num_input_tokens_seen": 2128616, |
| "step": 7480 |
| }, |
| { |
| "epoch": 13.96455223880597, |
| "grad_norm": 0.5297658443450928, |
| "learning_rate": 0.0002528252432022129, |
| "loss": 0.2549, |
| "num_input_tokens_seen": 2129928, |
| "step": 7485 |
| }, |
| { |
| "epoch": 13.973880597014926, |
| "grad_norm": 0.8608595132827759, |
| "learning_rate": 0.0002521179449273472, |
| "loss": 0.2852, |
| "num_input_tokens_seen": 2131304, |
| "step": 7490 |
| }, |
| { |
| "epoch": 13.98320895522388, |
| "grad_norm": 0.5724624395370483, |
| "learning_rate": 0.0002514113037202389, |
| "loss": 0.4458, |
| "num_input_tokens_seen": 2132808, |
| "step": 7495 |
| }, |
| { |
| "epoch": 13.992537313432836, |
| "grad_norm": 0.27308163046836853, |
| "learning_rate": 0.00025070532145400105, |
| "loss": 0.1776, |
| "num_input_tokens_seen": 2134184, |
| "step": 7500 |
| }, |
| { |
| "epoch": 14.0, |
| "eval_loss": 0.7891493439674377, |
| "eval_runtime": 4.213, |
| "eval_samples_per_second": 56.491, |
| "eval_steps_per_second": 14.242, |
| "num_input_tokens_seen": 2135168, |
| "step": 7504 |
| }, |
| { |
| "epoch": 14.001865671641792, |
| "grad_norm": 0.3923545777797699, |
| "learning_rate": 0.0002500000000000001, |
| "loss": 0.5102, |
| "num_input_tokens_seen": 2135456, |
| "step": 7505 |
| }, |
| { |
| "epoch": 14.011194029850746, |
| "grad_norm": 0.5965924263000488, |
| "learning_rate": 0.00024929534122785084, |
| "loss": 0.2351, |
| "num_input_tokens_seen": 2136960, |
| "step": 7510 |
| }, |
| { |
| "epoch": 14.020522388059701, |
| "grad_norm": 0.32224705815315247, |
| "learning_rate": 0.0002485913470054119, |
| "loss": 0.2127, |
| "num_input_tokens_seen": 2138272, |
| "step": 7515 |
| }, |
| { |
| "epoch": 14.029850746268657, |
| "grad_norm": 0.3885835111141205, |
| "learning_rate": 0.00024788801919878, |
| "loss": 0.2403, |
| "num_input_tokens_seen": 2139680, |
| "step": 7520 |
| }, |
| { |
| "epoch": 14.039179104477611, |
| "grad_norm": 0.42298731207847595, |
| "learning_rate": 0.0002471853596722851, |
| "loss": 0.2951, |
| "num_input_tokens_seen": 2141056, |
| "step": 7525 |
| }, |
| { |
| "epoch": 14.048507462686567, |
| "grad_norm": 0.46374988555908203, |
| "learning_rate": 0.00024648337028848654, |
| "loss": 0.1905, |
| "num_input_tokens_seen": 2142784, |
| "step": 7530 |
| }, |
| { |
| "epoch": 14.057835820895523, |
| "grad_norm": 0.5345138311386108, |
| "learning_rate": 0.00024578205290816656, |
| "loss": 0.2886, |
| "num_input_tokens_seen": 2144192, |
| "step": 7535 |
| }, |
| { |
| "epoch": 14.067164179104477, |
| "grad_norm": 0.5211803913116455, |
| "learning_rate": 0.00024508140939032646, |
| "loss": 0.3581, |
| "num_input_tokens_seen": 2145600, |
| "step": 7540 |
| }, |
| { |
| "epoch": 14.076492537313433, |
| "grad_norm": 0.365950345993042, |
| "learning_rate": 0.0002443814415921809, |
| "loss": 0.2361, |
| "num_input_tokens_seen": 2146976, |
| "step": 7545 |
| }, |
| { |
| "epoch": 14.085820895522389, |
| "grad_norm": 0.5437518954277039, |
| "learning_rate": 0.00024368215136915417, |
| "loss": 0.3863, |
| "num_input_tokens_seen": 2148352, |
| "step": 7550 |
| }, |
| { |
| "epoch": 14.095149253731343, |
| "grad_norm": 0.415061891078949, |
| "learning_rate": 0.00024298354057487382, |
| "loss": 0.3582, |
| "num_input_tokens_seen": 2149664, |
| "step": 7555 |
| }, |
| { |
| "epoch": 14.104477611940299, |
| "grad_norm": 0.6573102474212646, |
| "learning_rate": 0.00024228561106116647, |
| "loss": 0.2205, |
| "num_input_tokens_seen": 2151520, |
| "step": 7560 |
| }, |
| { |
| "epoch": 14.113805970149254, |
| "grad_norm": 0.3543594181537628, |
| "learning_rate": 0.00024158836467805334, |
| "loss": 0.2546, |
| "num_input_tokens_seen": 2152864, |
| "step": 7565 |
| }, |
| { |
| "epoch": 14.123134328358208, |
| "grad_norm": 0.43706467747688293, |
| "learning_rate": 0.0002408918032737444, |
| "loss": 0.1902, |
| "num_input_tokens_seen": 2154368, |
| "step": 7570 |
| }, |
| { |
| "epoch": 14.132462686567164, |
| "grad_norm": 0.6588314771652222, |
| "learning_rate": 0.00024019592869463374, |
| "loss": 0.1914, |
| "num_input_tokens_seen": 2155712, |
| "step": 7575 |
| }, |
| { |
| "epoch": 14.14179104477612, |
| "grad_norm": 0.3580341935157776, |
| "learning_rate": 0.00023950074278529567, |
| "loss": 0.2419, |
| "num_input_tokens_seen": 2157280, |
| "step": 7580 |
| }, |
| { |
| "epoch": 14.151119402985074, |
| "grad_norm": 0.7395253777503967, |
| "learning_rate": 0.00023880624738847835, |
| "loss": 0.34, |
| "num_input_tokens_seen": 2158656, |
| "step": 7585 |
| }, |
| { |
| "epoch": 14.16044776119403, |
| "grad_norm": 0.49231547117233276, |
| "learning_rate": 0.0002381124443450997, |
| "loss": 0.2136, |
| "num_input_tokens_seen": 2160160, |
| "step": 7590 |
| }, |
| { |
| "epoch": 14.169776119402986, |
| "grad_norm": 0.5264060497283936, |
| "learning_rate": 0.00023741933549424228, |
| "loss": 0.3088, |
| "num_input_tokens_seen": 2161600, |
| "step": 7595 |
| }, |
| { |
| "epoch": 14.17910447761194, |
| "grad_norm": 0.6888109445571899, |
| "learning_rate": 0.00023672692267314916, |
| "loss": 0.3481, |
| "num_input_tokens_seen": 2162912, |
| "step": 7600 |
| }, |
| { |
| "epoch": 14.188432835820896, |
| "grad_norm": 0.6409774422645569, |
| "learning_rate": 0.0002360352077172177, |
| "loss": 0.3394, |
| "num_input_tokens_seen": 2164320, |
| "step": 7605 |
| }, |
| { |
| "epoch": 14.197761194029852, |
| "grad_norm": 0.3647206127643585, |
| "learning_rate": 0.0002353441924599956, |
| "loss": 0.1627, |
| "num_input_tokens_seen": 2165888, |
| "step": 7610 |
| }, |
| { |
| "epoch": 14.207089552238806, |
| "grad_norm": 0.48043763637542725, |
| "learning_rate": 0.0002346538787331763, |
| "loss": 0.4696, |
| "num_input_tokens_seen": 2167296, |
| "step": 7615 |
| }, |
| { |
| "epoch": 14.216417910447761, |
| "grad_norm": 0.5464047193527222, |
| "learning_rate": 0.00023396426836659303, |
| "loss": 0.2985, |
| "num_input_tokens_seen": 2168672, |
| "step": 7620 |
| }, |
| { |
| "epoch": 14.225746268656717, |
| "grad_norm": 0.688077986240387, |
| "learning_rate": 0.00023327536318821495, |
| "loss": 0.3632, |
| "num_input_tokens_seen": 2170048, |
| "step": 7625 |
| }, |
| { |
| "epoch": 14.235074626865671, |
| "grad_norm": 0.6989650130271912, |
| "learning_rate": 0.0002325871650241418, |
| "loss": 0.3588, |
| "num_input_tokens_seen": 2171264, |
| "step": 7630 |
| }, |
| { |
| "epoch": 14.244402985074627, |
| "grad_norm": 0.999320387840271, |
| "learning_rate": 0.00023189967569859938, |
| "loss": 0.2911, |
| "num_input_tokens_seen": 2172800, |
| "step": 7635 |
| }, |
| { |
| "epoch": 14.253731343283581, |
| "grad_norm": 0.3599291443824768, |
| "learning_rate": 0.00023121289703393488, |
| "loss": 0.2441, |
| "num_input_tokens_seen": 2174208, |
| "step": 7640 |
| }, |
| { |
| "epoch": 14.263059701492537, |
| "grad_norm": 0.8607851266860962, |
| "learning_rate": 0.0002305268308506106, |
| "loss": 0.3268, |
| "num_input_tokens_seen": 2175584, |
| "step": 7645 |
| }, |
| { |
| "epoch": 14.272388059701493, |
| "grad_norm": 0.4545849859714508, |
| "learning_rate": 0.0002298414789672016, |
| "loss": 0.2989, |
| "num_input_tokens_seen": 2176960, |
| "step": 7650 |
| }, |
| { |
| "epoch": 14.281716417910447, |
| "grad_norm": 0.482704758644104, |
| "learning_rate": 0.00022915684320038836, |
| "loss": 0.3393, |
| "num_input_tokens_seen": 2178400, |
| "step": 7655 |
| }, |
| { |
| "epoch": 14.291044776119403, |
| "grad_norm": 0.49524471163749695, |
| "learning_rate": 0.00022847292536495447, |
| "loss": 0.3255, |
| "num_input_tokens_seen": 2179648, |
| "step": 7660 |
| }, |
| { |
| "epoch": 14.300373134328359, |
| "grad_norm": 0.39817991852760315, |
| "learning_rate": 0.00022778972727377866, |
| "loss": 0.3488, |
| "num_input_tokens_seen": 2181088, |
| "step": 7665 |
| }, |
| { |
| "epoch": 14.309701492537313, |
| "grad_norm": 0.3302770256996155, |
| "learning_rate": 0.00022710725073783346, |
| "loss": 0.2023, |
| "num_input_tokens_seen": 2182464, |
| "step": 7670 |
| }, |
| { |
| "epoch": 14.319029850746269, |
| "grad_norm": 0.6790076494216919, |
| "learning_rate": 0.00022642549756617835, |
| "loss": 0.2749, |
| "num_input_tokens_seen": 2183968, |
| "step": 7675 |
| }, |
| { |
| "epoch": 14.328358208955224, |
| "grad_norm": 0.4351758062839508, |
| "learning_rate": 0.00022574446956595445, |
| "loss": 0.2023, |
| "num_input_tokens_seen": 2185248, |
| "step": 7680 |
| }, |
| { |
| "epoch": 14.337686567164178, |
| "grad_norm": 0.3751530349254608, |
| "learning_rate": 0.00022506416854238187, |
| "loss": 0.2206, |
| "num_input_tokens_seen": 2186848, |
| "step": 7685 |
| }, |
| { |
| "epoch": 14.347014925373134, |
| "grad_norm": 0.3495856821537018, |
| "learning_rate": 0.00022438459629875291, |
| "loss": 0.2509, |
| "num_input_tokens_seen": 2188320, |
| "step": 7690 |
| }, |
| { |
| "epoch": 14.35634328358209, |
| "grad_norm": 0.6211624145507812, |
| "learning_rate": 0.00022370575463642856, |
| "loss": 0.3539, |
| "num_input_tokens_seen": 2189792, |
| "step": 7695 |
| }, |
| { |
| "epoch": 14.365671641791044, |
| "grad_norm": 0.56031733751297, |
| "learning_rate": 0.00022302764535483293, |
| "loss": 0.2784, |
| "num_input_tokens_seen": 2191232, |
| "step": 7700 |
| }, |
| { |
| "epoch": 14.375, |
| "grad_norm": 0.24593977630138397, |
| "learning_rate": 0.00022235027025144873, |
| "loss": 0.3143, |
| "num_input_tokens_seen": 2192672, |
| "step": 7705 |
| }, |
| { |
| "epoch": 14.384328358208956, |
| "grad_norm": 0.7664890885353088, |
| "learning_rate": 0.000221673631121813, |
| "loss": 0.4823, |
| "num_input_tokens_seen": 2193984, |
| "step": 7710 |
| }, |
| { |
| "epoch": 14.39365671641791, |
| "grad_norm": 0.4323963522911072, |
| "learning_rate": 0.00022099772975951143, |
| "loss": 0.3216, |
| "num_input_tokens_seen": 2195424, |
| "step": 7715 |
| }, |
| { |
| "epoch": 14.402985074626866, |
| "grad_norm": 0.6344655156135559, |
| "learning_rate": 0.00022032256795617434, |
| "loss": 0.4039, |
| "num_input_tokens_seen": 2196736, |
| "step": 7720 |
| }, |
| { |
| "epoch": 14.412313432835822, |
| "grad_norm": 0.4761766493320465, |
| "learning_rate": 0.00021964814750147143, |
| "loss": 0.3493, |
| "num_input_tokens_seen": 2198080, |
| "step": 7725 |
| }, |
| { |
| "epoch": 14.421641791044776, |
| "grad_norm": 0.5730311274528503, |
| "learning_rate": 0.00021897447018310784, |
| "loss": 0.326, |
| "num_input_tokens_seen": 2199328, |
| "step": 7730 |
| }, |
| { |
| "epoch": 14.430970149253731, |
| "grad_norm": 0.5259273648262024, |
| "learning_rate": 0.00021830153778681832, |
| "loss": 0.2736, |
| "num_input_tokens_seen": 2200768, |
| "step": 7735 |
| }, |
| { |
| "epoch": 14.440298507462687, |
| "grad_norm": 0.3467364013195038, |
| "learning_rate": 0.00021762935209636308, |
| "loss": 0.2123, |
| "num_input_tokens_seen": 2202240, |
| "step": 7740 |
| }, |
| { |
| "epoch": 14.449626865671641, |
| "grad_norm": 0.6609408259391785, |
| "learning_rate": 0.00021695791489352345, |
| "loss": 0.3752, |
| "num_input_tokens_seen": 2203584, |
| "step": 7745 |
| }, |
| { |
| "epoch": 14.458955223880597, |
| "grad_norm": 0.27350056171417236, |
| "learning_rate": 0.00021628722795809623, |
| "loss": 0.252, |
| "num_input_tokens_seen": 2204928, |
| "step": 7750 |
| }, |
| { |
| "epoch": 14.468283582089553, |
| "grad_norm": 0.7308428883552551, |
| "learning_rate": 0.00021561729306788957, |
| "loss": 0.3164, |
| "num_input_tokens_seen": 2206464, |
| "step": 7755 |
| }, |
| { |
| "epoch": 14.477611940298507, |
| "grad_norm": 0.4643590450286865, |
| "learning_rate": 0.00021494811199871856, |
| "loss": 0.3272, |
| "num_input_tokens_seen": 2207904, |
| "step": 7760 |
| }, |
| { |
| "epoch": 14.486940298507463, |
| "grad_norm": 0.49550262093544006, |
| "learning_rate": 0.00021427968652439956, |
| "loss": 0.2784, |
| "num_input_tokens_seen": 2209376, |
| "step": 7765 |
| }, |
| { |
| "epoch": 14.496268656716419, |
| "grad_norm": 0.5133547186851501, |
| "learning_rate": 0.00021361201841674639, |
| "loss": 0.3686, |
| "num_input_tokens_seen": 2210912, |
| "step": 7770 |
| }, |
| { |
| "epoch": 14.505597014925373, |
| "grad_norm": 0.275790810585022, |
| "learning_rate": 0.000212945109445565, |
| "loss": 0.3955, |
| "num_input_tokens_seen": 2212288, |
| "step": 7775 |
| }, |
| { |
| "epoch": 14.514925373134329, |
| "grad_norm": 0.3647710382938385, |
| "learning_rate": 0.0002122789613786496, |
| "loss": 0.2778, |
| "num_input_tokens_seen": 2213664, |
| "step": 7780 |
| }, |
| { |
| "epoch": 14.524253731343283, |
| "grad_norm": 0.5007681250572205, |
| "learning_rate": 0.00021161357598177693, |
| "loss": 0.3911, |
| "num_input_tokens_seen": 2214976, |
| "step": 7785 |
| }, |
| { |
| "epoch": 14.533582089552239, |
| "grad_norm": 0.4205113351345062, |
| "learning_rate": 0.0002109489550187022, |
| "loss": 0.2883, |
| "num_input_tokens_seen": 2216512, |
| "step": 7790 |
| }, |
| { |
| "epoch": 14.542910447761194, |
| "grad_norm": 0.3494776487350464, |
| "learning_rate": 0.00021028510025115476, |
| "loss": 0.2344, |
| "num_input_tokens_seen": 2217920, |
| "step": 7795 |
| }, |
| { |
| "epoch": 14.552238805970148, |
| "grad_norm": 0.7673720717430115, |
| "learning_rate": 0.00020962201343883237, |
| "loss": 0.4006, |
| "num_input_tokens_seen": 2219296, |
| "step": 7800 |
| }, |
| { |
| "epoch": 14.561567164179104, |
| "grad_norm": 0.5976295471191406, |
| "learning_rate": 0.00020895969633939748, |
| "loss": 0.3326, |
| "num_input_tokens_seen": 2220864, |
| "step": 7805 |
| }, |
| { |
| "epoch": 14.57089552238806, |
| "grad_norm": 0.5417135953903198, |
| "learning_rate": 0.00020829815070847203, |
| "loss": 0.3475, |
| "num_input_tokens_seen": 2222208, |
| "step": 7810 |
| }, |
| { |
| "epoch": 14.580223880597014, |
| "grad_norm": 0.8238497972488403, |
| "learning_rate": 0.00020763737829963347, |
| "loss": 0.4032, |
| "num_input_tokens_seen": 2223456, |
| "step": 7815 |
| }, |
| { |
| "epoch": 14.58955223880597, |
| "grad_norm": 0.2751303017139435, |
| "learning_rate": 0.00020697738086440914, |
| "loss": 0.2232, |
| "num_input_tokens_seen": 2224928, |
| "step": 7820 |
| }, |
| { |
| "epoch": 14.598880597014926, |
| "grad_norm": 0.6023114919662476, |
| "learning_rate": 0.00020631816015227218, |
| "loss": 0.37, |
| "num_input_tokens_seen": 2226464, |
| "step": 7825 |
| }, |
| { |
| "epoch": 14.60820895522388, |
| "grad_norm": 0.9885722994804382, |
| "learning_rate": 0.00020565971791063731, |
| "loss": 0.3756, |
| "num_input_tokens_seen": 2227968, |
| "step": 7830 |
| }, |
| { |
| "epoch": 14.617537313432836, |
| "grad_norm": 0.5508349537849426, |
| "learning_rate": 0.0002050020558848553, |
| "loss": 0.3164, |
| "num_input_tokens_seen": 2229312, |
| "step": 7835 |
| }, |
| { |
| "epoch": 14.626865671641792, |
| "grad_norm": 0.3190430998802185, |
| "learning_rate": 0.00020434517581820893, |
| "loss": 0.2365, |
| "num_input_tokens_seen": 2230752, |
| "step": 7840 |
| }, |
| { |
| "epoch": 14.636194029850746, |
| "grad_norm": 0.46599534153938293, |
| "learning_rate": 0.000203689079451908, |
| "loss": 0.219, |
| "num_input_tokens_seen": 2232096, |
| "step": 7845 |
| }, |
| { |
| "epoch": 14.645522388059701, |
| "grad_norm": 0.5045431852340698, |
| "learning_rate": 0.00020303376852508526, |
| "loss": 0.2736, |
| "num_input_tokens_seen": 2233504, |
| "step": 7850 |
| }, |
| { |
| "epoch": 14.654850746268657, |
| "grad_norm": 0.7143300771713257, |
| "learning_rate": 0.0002023792447747917, |
| "loss": 0.4204, |
| "num_input_tokens_seen": 2234848, |
| "step": 7855 |
| }, |
| { |
| "epoch": 14.664179104477611, |
| "grad_norm": 0.4074963927268982, |
| "learning_rate": 0.00020172550993599072, |
| "loss": 0.2737, |
| "num_input_tokens_seen": 2236192, |
| "step": 7860 |
| }, |
| { |
| "epoch": 14.673507462686567, |
| "grad_norm": 0.38568681478500366, |
| "learning_rate": 0.00020107256574155563, |
| "loss": 0.3055, |
| "num_input_tokens_seen": 2237696, |
| "step": 7865 |
| }, |
| { |
| "epoch": 14.682835820895523, |
| "grad_norm": 0.47734951972961426, |
| "learning_rate": 0.0002004204139222634, |
| "loss": 0.3397, |
| "num_input_tokens_seen": 2239104, |
| "step": 7870 |
| }, |
| { |
| "epoch": 14.692164179104477, |
| "grad_norm": 0.708835244178772, |
| "learning_rate": 0.00019976905620679053, |
| "loss": 0.5271, |
| "num_input_tokens_seen": 2240480, |
| "step": 7875 |
| }, |
| { |
| "epoch": 14.701492537313433, |
| "grad_norm": 0.508047342300415, |
| "learning_rate": 0.00019911849432170908, |
| "loss": 0.385, |
| "num_input_tokens_seen": 2241984, |
| "step": 7880 |
| }, |
| { |
| "epoch": 14.710820895522389, |
| "grad_norm": 0.7358197569847107, |
| "learning_rate": 0.0001984687299914809, |
| "loss": 0.4814, |
| "num_input_tokens_seen": 2243328, |
| "step": 7885 |
| }, |
| { |
| "epoch": 14.720149253731343, |
| "grad_norm": 0.5231989622116089, |
| "learning_rate": 0.00019781976493845477, |
| "loss": 0.2839, |
| "num_input_tokens_seen": 2244832, |
| "step": 7890 |
| }, |
| { |
| "epoch": 14.729477611940299, |
| "grad_norm": 0.4327712655067444, |
| "learning_rate": 0.00019717160088285928, |
| "loss": 0.1759, |
| "num_input_tokens_seen": 2246240, |
| "step": 7895 |
| }, |
| { |
| "epoch": 14.738805970149254, |
| "grad_norm": 0.5407418608665466, |
| "learning_rate": 0.0001965242395428013, |
| "loss": 0.3704, |
| "num_input_tokens_seen": 2247456, |
| "step": 7900 |
| }, |
| { |
| "epoch": 14.748134328358208, |
| "grad_norm": 0.5258449912071228, |
| "learning_rate": 0.00019587768263425886, |
| "loss": 0.3365, |
| "num_input_tokens_seen": 2248768, |
| "step": 7905 |
| }, |
| { |
| "epoch": 14.757462686567164, |
| "grad_norm": 0.29008418321609497, |
| "learning_rate": 0.00019523193187107846, |
| "loss": 0.2287, |
| "num_input_tokens_seen": 2250112, |
| "step": 7910 |
| }, |
| { |
| "epoch": 14.76679104477612, |
| "grad_norm": 0.4476899802684784, |
| "learning_rate": 0.00019458698896496917, |
| "loss": 0.3182, |
| "num_input_tokens_seen": 2251776, |
| "step": 7915 |
| }, |
| { |
| "epoch": 14.776119402985074, |
| "grad_norm": 0.35547712445259094, |
| "learning_rate": 0.00019394285562549863, |
| "loss": 0.2906, |
| "num_input_tokens_seen": 2253184, |
| "step": 7920 |
| }, |
| { |
| "epoch": 14.78544776119403, |
| "grad_norm": 0.5664058327674866, |
| "learning_rate": 0.00019329953356008928, |
| "loss": 0.3694, |
| "num_input_tokens_seen": 2254560, |
| "step": 7925 |
| }, |
| { |
| "epoch": 14.794776119402986, |
| "grad_norm": 0.6664396524429321, |
| "learning_rate": 0.00019265702447401184, |
| "loss": 0.4551, |
| "num_input_tokens_seen": 2255904, |
| "step": 7930 |
| }, |
| { |
| "epoch": 14.80410447761194, |
| "grad_norm": 0.39799797534942627, |
| "learning_rate": 0.00019201533007038308, |
| "loss": 0.2978, |
| "num_input_tokens_seen": 2257280, |
| "step": 7935 |
| }, |
| { |
| "epoch": 14.813432835820896, |
| "grad_norm": 0.5523400902748108, |
| "learning_rate": 0.00019137445205016018, |
| "loss": 0.2565, |
| "num_input_tokens_seen": 2258624, |
| "step": 7940 |
| }, |
| { |
| "epoch": 14.822761194029852, |
| "grad_norm": 0.5098326206207275, |
| "learning_rate": 0.00019073439211213589, |
| "loss": 0.2085, |
| "num_input_tokens_seen": 2260096, |
| "step": 7945 |
| }, |
| { |
| "epoch": 14.832089552238806, |
| "grad_norm": 0.6128082871437073, |
| "learning_rate": 0.0001900951519529346, |
| "loss": 0.2444, |
| "num_input_tokens_seen": 2261696, |
| "step": 7950 |
| }, |
| { |
| "epoch": 14.841417910447761, |
| "grad_norm": 0.6016358137130737, |
| "learning_rate": 0.0001894567332670075, |
| "loss": 0.3776, |
| "num_input_tokens_seen": 2263040, |
| "step": 7955 |
| }, |
| { |
| "epoch": 14.850746268656717, |
| "grad_norm": 0.5598629117012024, |
| "learning_rate": 0.0001888191377466289, |
| "loss": 0.3781, |
| "num_input_tokens_seen": 2264320, |
| "step": 7960 |
| }, |
| { |
| "epoch": 14.860074626865671, |
| "grad_norm": 0.39370617270469666, |
| "learning_rate": 0.00018818236708189058, |
| "loss": 0.2014, |
| "num_input_tokens_seen": 2265792, |
| "step": 7965 |
| }, |
| { |
| "epoch": 14.869402985074627, |
| "grad_norm": 0.5136460661888123, |
| "learning_rate": 0.0001875464229606978, |
| "loss": 0.3423, |
| "num_input_tokens_seen": 2267520, |
| "step": 7970 |
| }, |
| { |
| "epoch": 14.878731343283581, |
| "grad_norm": 0.44041234254837036, |
| "learning_rate": 0.00018691130706876535, |
| "loss": 0.2944, |
| "num_input_tokens_seen": 2268928, |
| "step": 7975 |
| }, |
| { |
| "epoch": 14.888059701492537, |
| "grad_norm": 0.6629489064216614, |
| "learning_rate": 0.00018627702108961225, |
| "loss": 0.2556, |
| "num_input_tokens_seen": 2270240, |
| "step": 7980 |
| }, |
| { |
| "epoch": 14.897388059701493, |
| "grad_norm": 0.4910067319869995, |
| "learning_rate": 0.00018564356670455767, |
| "loss": 0.3722, |
| "num_input_tokens_seen": 2271776, |
| "step": 7985 |
| }, |
| { |
| "epoch": 14.906716417910447, |
| "grad_norm": 0.5173795223236084, |
| "learning_rate": 0.00018501094559271637, |
| "loss": 0.2558, |
| "num_input_tokens_seen": 2273248, |
| "step": 7990 |
| }, |
| { |
| "epoch": 14.916044776119403, |
| "grad_norm": 0.589825451374054, |
| "learning_rate": 0.0001843791594309948, |
| "loss": 0.2848, |
| "num_input_tokens_seen": 2274816, |
| "step": 7995 |
| }, |
| { |
| "epoch": 14.925373134328359, |
| "grad_norm": 0.34794145822525024, |
| "learning_rate": 0.0001837482098940857, |
| "loss": 0.2368, |
| "num_input_tokens_seen": 2276192, |
| "step": 8000 |
| }, |
| { |
| "epoch": 14.934701492537313, |
| "grad_norm": 0.8483067750930786, |
| "learning_rate": 0.00018311809865446404, |
| "loss": 0.3701, |
| "num_input_tokens_seen": 2277408, |
| "step": 8005 |
| }, |
| { |
| "epoch": 14.944029850746269, |
| "grad_norm": 0.38415467739105225, |
| "learning_rate": 0.00018248882738238344, |
| "loss": 0.4439, |
| "num_input_tokens_seen": 2278784, |
| "step": 8010 |
| }, |
| { |
| "epoch": 14.953358208955224, |
| "grad_norm": 0.4851486384868622, |
| "learning_rate": 0.00018186039774587025, |
| "loss": 0.3857, |
| "num_input_tokens_seen": 2280416, |
| "step": 8015 |
| }, |
| { |
| "epoch": 14.962686567164178, |
| "grad_norm": 0.3671543002128601, |
| "learning_rate": 0.0001812328114107201, |
| "loss": 0.4152, |
| "num_input_tokens_seen": 2281856, |
| "step": 8020 |
| }, |
| { |
| "epoch": 14.972014925373134, |
| "grad_norm": 0.6492642760276794, |
| "learning_rate": 0.00018060607004049322, |
| "loss": 0.5156, |
| "num_input_tokens_seen": 2283200, |
| "step": 8025 |
| }, |
| { |
| "epoch": 14.98134328358209, |
| "grad_norm": 0.44069990515708923, |
| "learning_rate": 0.0001799801752965104, |
| "loss": 0.3393, |
| "num_input_tokens_seen": 2284736, |
| "step": 8030 |
| }, |
| { |
| "epoch": 14.990671641791044, |
| "grad_norm": 0.676559329032898, |
| "learning_rate": 0.00017935512883784788, |
| "loss": 0.4805, |
| "num_input_tokens_seen": 2285984, |
| "step": 8035 |
| }, |
| { |
| "epoch": 15.0, |
| "grad_norm": 1.6291553974151611, |
| "learning_rate": 0.0001787309323213332, |
| "loss": 0.3594, |
| "num_input_tokens_seen": 2287232, |
| "step": 8040 |
| }, |
| { |
| "epoch": 15.0, |
| "eval_loss": 0.8014525175094604, |
| "eval_runtime": 4.2002, |
| "eval_samples_per_second": 56.664, |
| "eval_steps_per_second": 14.285, |
| "num_input_tokens_seen": 2287232, |
| "step": 8040 |
| }, |
| { |
| "epoch": 15.009328358208956, |
| "grad_norm": 0.4468965232372284, |
| "learning_rate": 0.00017810758740154155, |
| "loss": 0.2567, |
| "num_input_tokens_seen": 2288576, |
| "step": 8045 |
| }, |
| { |
| "epoch": 15.01865671641791, |
| "grad_norm": 0.4501148760318756, |
| "learning_rate": 0.0001774850957307902, |
| "loss": 0.365, |
| "num_input_tokens_seen": 2289856, |
| "step": 8050 |
| }, |
| { |
| "epoch": 15.027985074626866, |
| "grad_norm": 0.8252508044242859, |
| "learning_rate": 0.00017686345895913475, |
| "loss": 0.3132, |
| "num_input_tokens_seen": 2291232, |
| "step": 8055 |
| }, |
| { |
| "epoch": 15.037313432835822, |
| "grad_norm": 0.631758987903595, |
| "learning_rate": 0.00017624267873436516, |
| "loss": 0.3493, |
| "num_input_tokens_seen": 2292576, |
| "step": 8060 |
| }, |
| { |
| "epoch": 15.046641791044776, |
| "grad_norm": 0.3204401731491089, |
| "learning_rate": 0.0001756227567020004, |
| "loss": 0.3052, |
| "num_input_tokens_seen": 2294368, |
| "step": 8065 |
| }, |
| { |
| "epoch": 15.055970149253731, |
| "grad_norm": 0.5686541199684143, |
| "learning_rate": 0.00017500369450528482, |
| "loss": 0.2585, |
| "num_input_tokens_seen": 2295936, |
| "step": 8070 |
| }, |
| { |
| "epoch": 15.065298507462687, |
| "grad_norm": 0.5454219579696655, |
| "learning_rate": 0.0001743854937851833, |
| "loss": 0.3738, |
| "num_input_tokens_seen": 2297280, |
| "step": 8075 |
| }, |
| { |
| "epoch": 15.074626865671641, |
| "grad_norm": 0.8047488927841187, |
| "learning_rate": 0.00017376815618037788, |
| "loss": 0.3455, |
| "num_input_tokens_seen": 2298720, |
| "step": 8080 |
| }, |
| { |
| "epoch": 15.083955223880597, |
| "grad_norm": 0.6854057312011719, |
| "learning_rate": 0.00017315168332726207, |
| "loss": 0.2792, |
| "num_input_tokens_seen": 2300192, |
| "step": 8085 |
| }, |
| { |
| "epoch": 15.093283582089553, |
| "grad_norm": 0.6454561352729797, |
| "learning_rate": 0.0001725360768599371, |
| "loss": 0.3133, |
| "num_input_tokens_seen": 2301472, |
| "step": 8090 |
| }, |
| { |
| "epoch": 15.102611940298507, |
| "grad_norm": 0.6675901412963867, |
| "learning_rate": 0.00017192133841020834, |
| "loss": 0.2859, |
| "num_input_tokens_seen": 2303104, |
| "step": 8095 |
| }, |
| { |
| "epoch": 15.111940298507463, |
| "grad_norm": 0.48817259073257446, |
| "learning_rate": 0.00017130746960757954, |
| "loss": 0.3838, |
| "num_input_tokens_seen": 2304480, |
| "step": 8100 |
| }, |
| { |
| "epoch": 15.121268656716419, |
| "grad_norm": 0.6950250864028931, |
| "learning_rate": 0.00017069447207924992, |
| "loss": 0.3662, |
| "num_input_tokens_seen": 2305792, |
| "step": 8105 |
| }, |
| { |
| "epoch": 15.130597014925373, |
| "grad_norm": 0.48446905612945557, |
| "learning_rate": 0.00017008234745010832, |
| "loss": 0.4478, |
| "num_input_tokens_seen": 2307328, |
| "step": 8110 |
| }, |
| { |
| "epoch": 15.139925373134329, |
| "grad_norm": 0.4077727198600769, |
| "learning_rate": 0.00016947109734273048, |
| "loss": 0.1662, |
| "num_input_tokens_seen": 2308896, |
| "step": 8115 |
| }, |
| { |
| "epoch": 15.149253731343283, |
| "grad_norm": 0.3715053200721741, |
| "learning_rate": 0.00016886072337737417, |
| "loss": 0.3857, |
| "num_input_tokens_seen": 2310464, |
| "step": 8120 |
| }, |
| { |
| "epoch": 15.158582089552239, |
| "grad_norm": 0.24879996478557587, |
| "learning_rate": 0.00016825122717197382, |
| "loss": 0.138, |
| "num_input_tokens_seen": 2311872, |
| "step": 8125 |
| }, |
| { |
| "epoch": 15.167910447761194, |
| "grad_norm": 0.46096736192703247, |
| "learning_rate": 0.00016764261034213812, |
| "loss": 0.3597, |
| "num_input_tokens_seen": 2313312, |
| "step": 8130 |
| }, |
| { |
| "epoch": 15.177238805970148, |
| "grad_norm": 0.5743706822395325, |
| "learning_rate": 0.00016703487450114407, |
| "loss": 0.3697, |
| "num_input_tokens_seen": 2314560, |
| "step": 8135 |
| }, |
| { |
| "epoch": 15.186567164179104, |
| "grad_norm": 0.6948251128196716, |
| "learning_rate": 0.00016642802125993428, |
| "loss": 0.2639, |
| "num_input_tokens_seen": 2315840, |
| "step": 8140 |
| }, |
| { |
| "epoch": 15.19589552238806, |
| "grad_norm": 0.37598899006843567, |
| "learning_rate": 0.0001658220522271105, |
| "loss": 0.3013, |
| "num_input_tokens_seen": 2317152, |
| "step": 8145 |
| }, |
| { |
| "epoch": 15.205223880597014, |
| "grad_norm": 0.5866940021514893, |
| "learning_rate": 0.00016521696900893192, |
| "loss": 0.273, |
| "num_input_tokens_seen": 2318720, |
| "step": 8150 |
| }, |
| { |
| "epoch": 15.21455223880597, |
| "grad_norm": 0.5363821983337402, |
| "learning_rate": 0.00016461277320930923, |
| "loss": 0.2352, |
| "num_input_tokens_seen": 2320192, |
| "step": 8155 |
| }, |
| { |
| "epoch": 15.223880597014926, |
| "grad_norm": 0.6275938749313354, |
| "learning_rate": 0.0001640094664298007, |
| "loss": 0.3144, |
| "num_input_tokens_seen": 2321536, |
| "step": 8160 |
| }, |
| { |
| "epoch": 15.23320895522388, |
| "grad_norm": 0.47853443026542664, |
| "learning_rate": 0.00016340705026960818, |
| "loss": 0.2924, |
| "num_input_tokens_seen": 2322976, |
| "step": 8165 |
| }, |
| { |
| "epoch": 15.242537313432836, |
| "grad_norm": 0.6668580770492554, |
| "learning_rate": 0.00016280552632557245, |
| "loss": 0.2733, |
| "num_input_tokens_seen": 2324448, |
| "step": 8170 |
| }, |
| { |
| "epoch": 15.251865671641792, |
| "grad_norm": 0.6049531102180481, |
| "learning_rate": 0.0001622048961921699, |
| "loss": 0.4943, |
| "num_input_tokens_seen": 2325760, |
| "step": 8175 |
| }, |
| { |
| "epoch": 15.261194029850746, |
| "grad_norm": 0.47579655051231384, |
| "learning_rate": 0.0001616051614615071, |
| "loss": 0.2544, |
| "num_input_tokens_seen": 2327328, |
| "step": 8180 |
| }, |
| { |
| "epoch": 15.270522388059701, |
| "grad_norm": 0.6176946759223938, |
| "learning_rate": 0.00016100632372331725, |
| "loss": 0.2634, |
| "num_input_tokens_seen": 2328672, |
| "step": 8185 |
| }, |
| { |
| "epoch": 15.279850746268657, |
| "grad_norm": 0.6476304531097412, |
| "learning_rate": 0.00016040838456495615, |
| "loss": 0.3383, |
| "num_input_tokens_seen": 2330176, |
| "step": 8190 |
| }, |
| { |
| "epoch": 15.289179104477611, |
| "grad_norm": 0.3113108277320862, |
| "learning_rate": 0.00015981134557139742, |
| "loss": 0.2696, |
| "num_input_tokens_seen": 2331584, |
| "step": 8195 |
| }, |
| { |
| "epoch": 15.298507462686567, |
| "grad_norm": 0.4137718379497528, |
| "learning_rate": 0.00015921520832522874, |
| "loss": 0.3759, |
| "num_input_tokens_seen": 2332832, |
| "step": 8200 |
| }, |
| { |
| "epoch": 15.307835820895523, |
| "grad_norm": 1.1625181436538696, |
| "learning_rate": 0.00015861997440664717, |
| "loss": 0.3662, |
| "num_input_tokens_seen": 2333984, |
| "step": 8205 |
| }, |
| { |
| "epoch": 15.317164179104477, |
| "grad_norm": 0.5340922474861145, |
| "learning_rate": 0.000158025645393456, |
| "loss": 0.2759, |
| "num_input_tokens_seen": 2335232, |
| "step": 8210 |
| }, |
| { |
| "epoch": 15.326492537313433, |
| "grad_norm": 0.46783941984176636, |
| "learning_rate": 0.0001574322228610592, |
| "loss": 0.2871, |
| "num_input_tokens_seen": 2336512, |
| "step": 8215 |
| }, |
| { |
| "epoch": 15.335820895522389, |
| "grad_norm": 0.6034365296363831, |
| "learning_rate": 0.00015683970838245798, |
| "loss": 0.305, |
| "num_input_tokens_seen": 2338112, |
| "step": 8220 |
| }, |
| { |
| "epoch": 15.345149253731343, |
| "grad_norm": 0.3258196711540222, |
| "learning_rate": 0.0001562481035282471, |
| "loss": 0.2678, |
| "num_input_tokens_seen": 2339744, |
| "step": 8225 |
| }, |
| { |
| "epoch": 15.354477611940299, |
| "grad_norm": 0.4759286344051361, |
| "learning_rate": 0.00015565740986660947, |
| "loss": 0.3317, |
| "num_input_tokens_seen": 2341088, |
| "step": 8230 |
| }, |
| { |
| "epoch": 15.363805970149254, |
| "grad_norm": 0.8320894241333008, |
| "learning_rate": 0.000155067628963313, |
| "loss": 0.2662, |
| "num_input_tokens_seen": 2342592, |
| "step": 8235 |
| }, |
| { |
| "epoch": 15.373134328358208, |
| "grad_norm": 0.5550680160522461, |
| "learning_rate": 0.00015447876238170626, |
| "loss": 0.2799, |
| "num_input_tokens_seen": 2344128, |
| "step": 8240 |
| }, |
| { |
| "epoch": 15.382462686567164, |
| "grad_norm": 0.5990989804267883, |
| "learning_rate": 0.0001538908116827139, |
| "loss": 0.2282, |
| "num_input_tokens_seen": 2345472, |
| "step": 8245 |
| }, |
| { |
| "epoch": 15.39179104477612, |
| "grad_norm": 0.791718602180481, |
| "learning_rate": 0.00015330377842483306, |
| "loss": 0.2224, |
| "num_input_tokens_seen": 2346944, |
| "step": 8250 |
| }, |
| { |
| "epoch": 15.401119402985074, |
| "grad_norm": 0.5611154437065125, |
| "learning_rate": 0.00015271766416412858, |
| "loss": 0.2715, |
| "num_input_tokens_seen": 2348288, |
| "step": 8255 |
| }, |
| { |
| "epoch": 15.41044776119403, |
| "grad_norm": 0.4410983920097351, |
| "learning_rate": 0.00015213247045422996, |
| "loss": 0.2581, |
| "num_input_tokens_seen": 2349728, |
| "step": 8260 |
| }, |
| { |
| "epoch": 15.419776119402986, |
| "grad_norm": 0.6476612091064453, |
| "learning_rate": 0.0001515481988463261, |
| "loss": 0.3037, |
| "num_input_tokens_seen": 2351008, |
| "step": 8265 |
| }, |
| { |
| "epoch": 15.42910447761194, |
| "grad_norm": 0.831028938293457, |
| "learning_rate": 0.00015096485088916155, |
| "loss": 0.2987, |
| "num_input_tokens_seen": 2352384, |
| "step": 8270 |
| }, |
| { |
| "epoch": 15.438432835820896, |
| "grad_norm": 0.22620081901550293, |
| "learning_rate": 0.00015038242812903313, |
| "loss": 0.3147, |
| "num_input_tokens_seen": 2353920, |
| "step": 8275 |
| }, |
| { |
| "epoch": 15.447761194029852, |
| "grad_norm": 0.4023994207382202, |
| "learning_rate": 0.00014980093210978452, |
| "loss": 0.3046, |
| "num_input_tokens_seen": 2355392, |
| "step": 8280 |
| }, |
| { |
| "epoch": 15.457089552238806, |
| "grad_norm": 0.6006656885147095, |
| "learning_rate": 0.00014922036437280324, |
| "loss": 0.3329, |
| "num_input_tokens_seen": 2356800, |
| "step": 8285 |
| }, |
| { |
| "epoch": 15.466417910447761, |
| "grad_norm": 0.6616522669792175, |
| "learning_rate": 0.00014864072645701592, |
| "loss": 0.205, |
| "num_input_tokens_seen": 2358208, |
| "step": 8290 |
| }, |
| { |
| "epoch": 15.475746268656717, |
| "grad_norm": 0.5346037745475769, |
| "learning_rate": 0.00014806201989888502, |
| "loss": 0.3307, |
| "num_input_tokens_seen": 2359552, |
| "step": 8295 |
| }, |
| { |
| "epoch": 15.485074626865671, |
| "grad_norm": 0.4655405282974243, |
| "learning_rate": 0.00014748424623240363, |
| "loss": 0.3352, |
| "num_input_tokens_seen": 2360960, |
| "step": 8300 |
| }, |
| { |
| "epoch": 15.494402985074627, |
| "grad_norm": 1.0552483797073364, |
| "learning_rate": 0.00014690740698909222, |
| "loss": 0.2739, |
| "num_input_tokens_seen": 2362400, |
| "step": 8305 |
| }, |
| { |
| "epoch": 15.503731343283581, |
| "grad_norm": 0.5363913178443909, |
| "learning_rate": 0.0001463315036979946, |
| "loss": 0.3144, |
| "num_input_tokens_seen": 2363808, |
| "step": 8310 |
| }, |
| { |
| "epoch": 15.513059701492537, |
| "grad_norm": 0.4912722408771515, |
| "learning_rate": 0.0001457565378856733, |
| "loss": 0.2818, |
| "num_input_tokens_seen": 2365344, |
| "step": 8315 |
| }, |
| { |
| "epoch": 15.522388059701493, |
| "grad_norm": 0.2873976230621338, |
| "learning_rate": 0.0001451825110762059, |
| "loss": 0.1968, |
| "num_input_tokens_seen": 2366976, |
| "step": 8320 |
| }, |
| { |
| "epoch": 15.531716417910447, |
| "grad_norm": 0.6075371503829956, |
| "learning_rate": 0.00014460942479118083, |
| "loss": 0.3125, |
| "num_input_tokens_seen": 2368384, |
| "step": 8325 |
| }, |
| { |
| "epoch": 15.541044776119403, |
| "grad_norm": 0.42490240931510925, |
| "learning_rate": 0.0001440372805496939, |
| "loss": 0.3007, |
| "num_input_tokens_seen": 2369952, |
| "step": 8330 |
| }, |
| { |
| "epoch": 15.550373134328359, |
| "grad_norm": 0.40874677896499634, |
| "learning_rate": 0.0001434660798683437, |
| "loss": 0.2063, |
| "num_input_tokens_seen": 2371360, |
| "step": 8335 |
| }, |
| { |
| "epoch": 15.559701492537313, |
| "grad_norm": 0.3459983766078949, |
| "learning_rate": 0.00014289582426122693, |
| "loss": 0.2497, |
| "num_input_tokens_seen": 2372928, |
| "step": 8340 |
| }, |
| { |
| "epoch": 15.569029850746269, |
| "grad_norm": 0.4452391564846039, |
| "learning_rate": 0.00014232651523993635, |
| "loss": 0.2508, |
| "num_input_tokens_seen": 2374400, |
| "step": 8345 |
| }, |
| { |
| "epoch": 15.578358208955224, |
| "grad_norm": 0.6100215911865234, |
| "learning_rate": 0.00014175815431355466, |
| "loss": 0.2734, |
| "num_input_tokens_seen": 2375904, |
| "step": 8350 |
| }, |
| { |
| "epoch": 15.587686567164178, |
| "grad_norm": 0.8004417419433594, |
| "learning_rate": 0.00014119074298865164, |
| "loss": 0.402, |
| "num_input_tokens_seen": 2377056, |
| "step": 8355 |
| }, |
| { |
| "epoch": 15.597014925373134, |
| "grad_norm": 0.3961009979248047, |
| "learning_rate": 0.00014062428276928046, |
| "loss": 0.1442, |
| "num_input_tokens_seen": 2378432, |
| "step": 8360 |
| }, |
| { |
| "epoch": 15.60634328358209, |
| "grad_norm": 0.6664312481880188, |
| "learning_rate": 0.0001400587751569723, |
| "loss": 0.3781, |
| "num_input_tokens_seen": 2379872, |
| "step": 8365 |
| }, |
| { |
| "epoch": 15.615671641791044, |
| "grad_norm": 0.46110233664512634, |
| "learning_rate": 0.00013949422165073421, |
| "loss": 0.1627, |
| "num_input_tokens_seen": 2381376, |
| "step": 8370 |
| }, |
| { |
| "epoch": 15.625, |
| "grad_norm": 0.5933917760848999, |
| "learning_rate": 0.00013893062374704308, |
| "loss": 0.4071, |
| "num_input_tokens_seen": 2382752, |
| "step": 8375 |
| }, |
| { |
| "epoch": 15.634328358208956, |
| "grad_norm": 0.5305347442626953, |
| "learning_rate": 0.00013836798293984364, |
| "loss": 0.2568, |
| "num_input_tokens_seen": 2384000, |
| "step": 8380 |
| }, |
| { |
| "epoch": 15.64365671641791, |
| "grad_norm": 0.3536977171897888, |
| "learning_rate": 0.00013780630072054311, |
| "loss": 0.1958, |
| "num_input_tokens_seen": 2385344, |
| "step": 8385 |
| }, |
| { |
| "epoch": 15.652985074626866, |
| "grad_norm": 0.46019840240478516, |
| "learning_rate": 0.00013724557857800824, |
| "loss": 0.2185, |
| "num_input_tokens_seen": 2386784, |
| "step": 8390 |
| }, |
| { |
| "epoch": 15.662313432835822, |
| "grad_norm": 0.7752018570899963, |
| "learning_rate": 0.0001366858179985604, |
| "loss": 0.2764, |
| "num_input_tokens_seen": 2388192, |
| "step": 8395 |
| }, |
| { |
| "epoch": 15.671641791044776, |
| "grad_norm": 0.6549847722053528, |
| "learning_rate": 0.0001361270204659721, |
| "loss": 0.2833, |
| "num_input_tokens_seen": 2389664, |
| "step": 8400 |
| }, |
| { |
| "epoch": 15.680970149253731, |
| "grad_norm": 0.2715495228767395, |
| "learning_rate": 0.0001355691874614638, |
| "loss": 0.3665, |
| "num_input_tokens_seen": 2391136, |
| "step": 8405 |
| }, |
| { |
| "epoch": 15.690298507462687, |
| "grad_norm": 0.46817731857299805, |
| "learning_rate": 0.00013501232046369811, |
| "loss": 0.235, |
| "num_input_tokens_seen": 2392448, |
| "step": 8410 |
| }, |
| { |
| "epoch": 15.699626865671641, |
| "grad_norm": 0.9094480276107788, |
| "learning_rate": 0.00013445642094877793, |
| "loss": 0.4955, |
| "num_input_tokens_seen": 2393888, |
| "step": 8415 |
| }, |
| { |
| "epoch": 15.708955223880597, |
| "grad_norm": 0.4685889780521393, |
| "learning_rate": 0.0001339014903902415, |
| "loss": 0.3565, |
| "num_input_tokens_seen": 2395264, |
| "step": 8420 |
| }, |
| { |
| "epoch": 15.718283582089553, |
| "grad_norm": 1.049347162246704, |
| "learning_rate": 0.0001333475302590584, |
| "loss": 0.3012, |
| "num_input_tokens_seen": 2396608, |
| "step": 8425 |
| }, |
| { |
| "epoch": 15.727611940298507, |
| "grad_norm": 0.5767874121665955, |
| "learning_rate": 0.00013279454202362573, |
| "loss": 0.2502, |
| "num_input_tokens_seen": 2398080, |
| "step": 8430 |
| }, |
| { |
| "epoch": 15.736940298507463, |
| "grad_norm": 0.589856743812561, |
| "learning_rate": 0.0001322425271497646, |
| "loss": 0.2351, |
| "num_input_tokens_seen": 2399616, |
| "step": 8435 |
| }, |
| { |
| "epoch": 15.746268656716419, |
| "grad_norm": 0.5481240749359131, |
| "learning_rate": 0.00013169148710071615, |
| "loss": 0.2441, |
| "num_input_tokens_seen": 2401056, |
| "step": 8440 |
| }, |
| { |
| "epoch": 15.755597014925373, |
| "grad_norm": 0.5046172738075256, |
| "learning_rate": 0.00013114142333713725, |
| "loss": 0.2152, |
| "num_input_tokens_seen": 2402528, |
| "step": 8445 |
| }, |
| { |
| "epoch": 15.764925373134329, |
| "grad_norm": 0.7722668647766113, |
| "learning_rate": 0.00013059233731709685, |
| "loss": 0.226, |
| "num_input_tokens_seen": 2403872, |
| "step": 8450 |
| }, |
| { |
| "epoch": 15.774253731343283, |
| "grad_norm": 0.6092769503593445, |
| "learning_rate": 0.00013004423049607256, |
| "loss": 0.2227, |
| "num_input_tokens_seen": 2405376, |
| "step": 8455 |
| }, |
| { |
| "epoch": 15.783582089552239, |
| "grad_norm": 0.6560283899307251, |
| "learning_rate": 0.000129497104326946, |
| "loss": 0.261, |
| "num_input_tokens_seen": 2406656, |
| "step": 8460 |
| }, |
| { |
| "epoch": 15.792910447761194, |
| "grad_norm": 0.4380224049091339, |
| "learning_rate": 0.00012895096025999957, |
| "loss": 0.4086, |
| "num_input_tokens_seen": 2408256, |
| "step": 8465 |
| }, |
| { |
| "epoch": 15.802238805970148, |
| "grad_norm": 0.28187674283981323, |
| "learning_rate": 0.00012840579974291217, |
| "loss": 0.3477, |
| "num_input_tokens_seen": 2409568, |
| "step": 8470 |
| }, |
| { |
| "epoch": 15.811567164179104, |
| "grad_norm": 0.4487137794494629, |
| "learning_rate": 0.00012786162422075598, |
| "loss": 0.1889, |
| "num_input_tokens_seen": 2410912, |
| "step": 8475 |
| }, |
| { |
| "epoch": 15.82089552238806, |
| "grad_norm": 0.5683954358100891, |
| "learning_rate": 0.0001273184351359918, |
| "loss": 0.2914, |
| "num_input_tokens_seen": 2412416, |
| "step": 8480 |
| }, |
| { |
| "epoch": 15.830223880597014, |
| "grad_norm": 0.37054821848869324, |
| "learning_rate": 0.00012677623392846565, |
| "loss": 0.2331, |
| "num_input_tokens_seen": 2413792, |
| "step": 8485 |
| }, |
| { |
| "epoch": 15.83955223880597, |
| "grad_norm": 0.5374351143836975, |
| "learning_rate": 0.00012623502203540555, |
| "loss": 0.2678, |
| "num_input_tokens_seen": 2415232, |
| "step": 8490 |
| }, |
| { |
| "epoch": 15.848880597014926, |
| "grad_norm": 0.6811636090278625, |
| "learning_rate": 0.0001256948008914165, |
| "loss": 0.2646, |
| "num_input_tokens_seen": 2416544, |
| "step": 8495 |
| }, |
| { |
| "epoch": 15.85820895522388, |
| "grad_norm": 0.5923631191253662, |
| "learning_rate": 0.00012515557192847737, |
| "loss": 0.3662, |
| "num_input_tokens_seen": 2418176, |
| "step": 8500 |
| }, |
| { |
| "epoch": 15.867537313432836, |
| "grad_norm": 0.5537749528884888, |
| "learning_rate": 0.00012461733657593722, |
| "loss": 0.2539, |
| "num_input_tokens_seen": 2419616, |
| "step": 8505 |
| }, |
| { |
| "epoch": 15.876865671641792, |
| "grad_norm": 0.5480448603630066, |
| "learning_rate": 0.00012408009626051135, |
| "loss": 0.2163, |
| "num_input_tokens_seen": 2420928, |
| "step": 8510 |
| }, |
| { |
| "epoch": 15.886194029850746, |
| "grad_norm": 0.8457586169242859, |
| "learning_rate": 0.00012354385240627736, |
| "loss": 0.1695, |
| "num_input_tokens_seen": 2422336, |
| "step": 8515 |
| }, |
| { |
| "epoch": 15.895522388059701, |
| "grad_norm": 0.3043023645877838, |
| "learning_rate": 0.00012300860643467133, |
| "loss": 0.2807, |
| "num_input_tokens_seen": 2423584, |
| "step": 8520 |
| }, |
| { |
| "epoch": 15.904850746268657, |
| "grad_norm": 0.822465717792511, |
| "learning_rate": 0.00012247435976448474, |
| "loss": 0.2682, |
| "num_input_tokens_seen": 2424928, |
| "step": 8525 |
| }, |
| { |
| "epoch": 15.914179104477611, |
| "grad_norm": 0.5391532182693481, |
| "learning_rate": 0.00012194111381185973, |
| "loss": 0.2548, |
| "num_input_tokens_seen": 2426400, |
| "step": 8530 |
| }, |
| { |
| "epoch": 15.923507462686567, |
| "grad_norm": 0.737484335899353, |
| "learning_rate": 0.00012140886999028583, |
| "loss": 0.1762, |
| "num_input_tokens_seen": 2427872, |
| "step": 8535 |
| }, |
| { |
| "epoch": 15.932835820895523, |
| "grad_norm": 0.6602703928947449, |
| "learning_rate": 0.00012087762971059663, |
| "loss": 0.2668, |
| "num_input_tokens_seen": 2429120, |
| "step": 8540 |
| }, |
| { |
| "epoch": 15.942164179104477, |
| "grad_norm": 0.5175513029098511, |
| "learning_rate": 0.00012034739438096509, |
| "loss": 0.2865, |
| "num_input_tokens_seen": 2430368, |
| "step": 8545 |
| }, |
| { |
| "epoch": 15.951492537313433, |
| "grad_norm": 0.7112910747528076, |
| "learning_rate": 0.0001198181654069006, |
| "loss": 0.3435, |
| "num_input_tokens_seen": 2431712, |
| "step": 8550 |
| }, |
| { |
| "epoch": 15.960820895522389, |
| "grad_norm": 0.46317917108535767, |
| "learning_rate": 0.00011928994419124467, |
| "loss": 0.2369, |
| "num_input_tokens_seen": 2433152, |
| "step": 8555 |
| }, |
| { |
| "epoch": 15.970149253731343, |
| "grad_norm": 0.8226314783096313, |
| "learning_rate": 0.000118762732134168, |
| "loss": 0.317, |
| "num_input_tokens_seen": 2434752, |
| "step": 8560 |
| }, |
| { |
| "epoch": 15.979477611940299, |
| "grad_norm": 0.6281651258468628, |
| "learning_rate": 0.00011823653063316631, |
| "loss": 0.3671, |
| "num_input_tokens_seen": 2436128, |
| "step": 8565 |
| }, |
| { |
| "epoch": 15.988805970149254, |
| "grad_norm": 0.4451799988746643, |
| "learning_rate": 0.0001177113410830557, |
| "loss": 0.3262, |
| "num_input_tokens_seen": 2437536, |
| "step": 8570 |
| }, |
| { |
| "epoch": 15.998134328358208, |
| "grad_norm": 0.43823346495628357, |
| "learning_rate": 0.00011718716487597098, |
| "loss": 0.2563, |
| "num_input_tokens_seen": 2438944, |
| "step": 8575 |
| }, |
| { |
| "epoch": 16.0, |
| "eval_loss": 0.8285130858421326, |
| "eval_runtime": 4.2024, |
| "eval_samples_per_second": 56.635, |
| "eval_steps_per_second": 14.278, |
| "num_input_tokens_seen": 2438992, |
| "step": 8576 |
| }, |
| { |
| "epoch": 16.007462686567163, |
| "grad_norm": 0.6276515126228333, |
| "learning_rate": 0.00011666400340136013, |
| "loss": 0.3762, |
| "num_input_tokens_seen": 2439984, |
| "step": 8580 |
| }, |
| { |
| "epoch": 16.01679104477612, |
| "grad_norm": 0.732320249080658, |
| "learning_rate": 0.00011614185804598199, |
| "loss": 0.3555, |
| "num_input_tokens_seen": 2441232, |
| "step": 8585 |
| }, |
| { |
| "epoch": 16.026119402985074, |
| "grad_norm": 0.41469866037368774, |
| "learning_rate": 0.00011562073019390096, |
| "loss": 0.225, |
| "num_input_tokens_seen": 2442512, |
| "step": 8590 |
| }, |
| { |
| "epoch": 16.03544776119403, |
| "grad_norm": 0.2994535565376282, |
| "learning_rate": 0.00011510062122648528, |
| "loss": 0.1708, |
| "num_input_tokens_seen": 2444176, |
| "step": 8595 |
| }, |
| { |
| "epoch": 16.044776119402986, |
| "grad_norm": 0.6539736986160278, |
| "learning_rate": 0.00011458153252240233, |
| "loss": 0.2227, |
| "num_input_tokens_seen": 2445584, |
| "step": 8600 |
| }, |
| { |
| "epoch": 16.05410447761194, |
| "grad_norm": 0.6162667274475098, |
| "learning_rate": 0.00011406346545761415, |
| "loss": 0.3296, |
| "num_input_tokens_seen": 2446928, |
| "step": 8605 |
| }, |
| { |
| "epoch": 16.063432835820894, |
| "grad_norm": 0.5158094763755798, |
| "learning_rate": 0.0001135464214053758, |
| "loss": 0.3577, |
| "num_input_tokens_seen": 2448272, |
| "step": 8610 |
| }, |
| { |
| "epoch": 16.07276119402985, |
| "grad_norm": 0.6340551376342773, |
| "learning_rate": 0.00011303040173622975, |
| "loss": 0.2312, |
| "num_input_tokens_seen": 2449744, |
| "step": 8615 |
| }, |
| { |
| "epoch": 16.082089552238806, |
| "grad_norm": 0.5297074913978577, |
| "learning_rate": 0.00011251540781800379, |
| "loss": 0.2522, |
| "num_input_tokens_seen": 2451184, |
| "step": 8620 |
| }, |
| { |
| "epoch": 16.09141791044776, |
| "grad_norm": 0.5866245031356812, |
| "learning_rate": 0.00011200144101580634, |
| "loss": 0.3074, |
| "num_input_tokens_seen": 2452496, |
| "step": 8625 |
| }, |
| { |
| "epoch": 16.100746268656717, |
| "grad_norm": 0.6043202877044678, |
| "learning_rate": 0.00011148850269202305, |
| "loss": 0.2295, |
| "num_input_tokens_seen": 2454000, |
| "step": 8630 |
| }, |
| { |
| "epoch": 16.11007462686567, |
| "grad_norm": 0.5400373935699463, |
| "learning_rate": 0.0001109765942063139, |
| "loss": 0.3557, |
| "num_input_tokens_seen": 2455440, |
| "step": 8635 |
| }, |
| { |
| "epoch": 16.119402985074625, |
| "grad_norm": 0.48669207096099854, |
| "learning_rate": 0.00011046571691560863, |
| "loss": 0.2135, |
| "num_input_tokens_seen": 2456784, |
| "step": 8640 |
| }, |
| { |
| "epoch": 16.128731343283583, |
| "grad_norm": 0.8610177636146545, |
| "learning_rate": 0.00010995587217410369, |
| "loss": 0.2616, |
| "num_input_tokens_seen": 2458224, |
| "step": 8645 |
| }, |
| { |
| "epoch": 16.138059701492537, |
| "grad_norm": 0.47535207867622375, |
| "learning_rate": 0.00010944706133325832, |
| "loss": 0.3128, |
| "num_input_tokens_seen": 2459824, |
| "step": 8650 |
| }, |
| { |
| "epoch": 16.14738805970149, |
| "grad_norm": 0.7235194444656372, |
| "learning_rate": 0.00010893928574179174, |
| "loss": 0.2185, |
| "num_input_tokens_seen": 2461264, |
| "step": 8655 |
| }, |
| { |
| "epoch": 16.15671641791045, |
| "grad_norm": 0.6055487990379333, |
| "learning_rate": 0.00010843254674567832, |
| "loss": 0.4397, |
| "num_input_tokens_seen": 2462576, |
| "step": 8660 |
| }, |
| { |
| "epoch": 16.166044776119403, |
| "grad_norm": 0.4858103096485138, |
| "learning_rate": 0.00010792684568814504, |
| "loss": 0.3052, |
| "num_input_tokens_seen": 2464144, |
| "step": 8665 |
| }, |
| { |
| "epoch": 16.175373134328357, |
| "grad_norm": 0.6048887372016907, |
| "learning_rate": 0.00010742218390966768, |
| "loss": 0.2574, |
| "num_input_tokens_seen": 2465584, |
| "step": 8670 |
| }, |
| { |
| "epoch": 16.184701492537314, |
| "grad_norm": 0.4468727707862854, |
| "learning_rate": 0.00010691856274796702, |
| "loss": 0.1812, |
| "num_input_tokens_seen": 2467056, |
| "step": 8675 |
| }, |
| { |
| "epoch": 16.19402985074627, |
| "grad_norm": 0.7549290657043457, |
| "learning_rate": 0.0001064159835380053, |
| "loss": 0.2291, |
| "num_input_tokens_seen": 2468464, |
| "step": 8680 |
| }, |
| { |
| "epoch": 16.203358208955223, |
| "grad_norm": 0.5962735414505005, |
| "learning_rate": 0.00010591444761198332, |
| "loss": 0.3033, |
| "num_input_tokens_seen": 2469968, |
| "step": 8685 |
| }, |
| { |
| "epoch": 16.21268656716418, |
| "grad_norm": 0.7374847531318665, |
| "learning_rate": 0.00010541395629933586, |
| "loss": 0.3931, |
| "num_input_tokens_seen": 2471280, |
| "step": 8690 |
| }, |
| { |
| "epoch": 16.222014925373134, |
| "grad_norm": 0.3907533884048462, |
| "learning_rate": 0.00010491451092672904, |
| "loss": 0.1632, |
| "num_input_tokens_seen": 2472560, |
| "step": 8695 |
| }, |
| { |
| "epoch": 16.23134328358209, |
| "grad_norm": 0.4274299740791321, |
| "learning_rate": 0.0001044161128180563, |
| "loss": 0.2025, |
| "num_input_tokens_seen": 2474032, |
| "step": 8700 |
| }, |
| { |
| "epoch": 16.240671641791046, |
| "grad_norm": 0.4550669491291046, |
| "learning_rate": 0.00010391876329443534, |
| "loss": 0.3309, |
| "num_input_tokens_seen": 2475472, |
| "step": 8705 |
| }, |
| { |
| "epoch": 16.25, |
| "grad_norm": 0.5084713697433472, |
| "learning_rate": 0.00010342246367420411, |
| "loss": 0.2977, |
| "num_input_tokens_seen": 2476912, |
| "step": 8710 |
| }, |
| { |
| "epoch": 16.259328358208954, |
| "grad_norm": 0.5619415044784546, |
| "learning_rate": 0.00010292721527291742, |
| "loss": 0.2917, |
| "num_input_tokens_seen": 2478256, |
| "step": 8715 |
| }, |
| { |
| "epoch": 16.26865671641791, |
| "grad_norm": 0.574165403842926, |
| "learning_rate": 0.00010243301940334415, |
| "loss": 0.2578, |
| "num_input_tokens_seen": 2479696, |
| "step": 8720 |
| }, |
| { |
| "epoch": 16.277985074626866, |
| "grad_norm": 0.6561511754989624, |
| "learning_rate": 0.00010193987737546262, |
| "loss": 0.2869, |
| "num_input_tokens_seen": 2481136, |
| "step": 8725 |
| }, |
| { |
| "epoch": 16.28731343283582, |
| "grad_norm": 0.5067024827003479, |
| "learning_rate": 0.00010144779049645792, |
| "loss": 0.1726, |
| "num_input_tokens_seen": 2482416, |
| "step": 8730 |
| }, |
| { |
| "epoch": 16.296641791044777, |
| "grad_norm": 0.6899192333221436, |
| "learning_rate": 0.00010095676007071808, |
| "loss": 0.2841, |
| "num_input_tokens_seen": 2483792, |
| "step": 8735 |
| }, |
| { |
| "epoch": 16.30597014925373, |
| "grad_norm": 0.6847012042999268, |
| "learning_rate": 0.00010046678739983129, |
| "loss": 0.2843, |
| "num_input_tokens_seen": 2485136, |
| "step": 8740 |
| }, |
| { |
| "epoch": 16.315298507462686, |
| "grad_norm": 0.4542698264122009, |
| "learning_rate": 9.997787378258122e-05, |
| "loss": 0.3108, |
| "num_input_tokens_seen": 2486640, |
| "step": 8745 |
| }, |
| { |
| "epoch": 16.324626865671643, |
| "grad_norm": 0.4527323842048645, |
| "learning_rate": 9.949002051494465e-05, |
| "loss": 0.2882, |
| "num_input_tokens_seen": 2488176, |
| "step": 8750 |
| }, |
| { |
| "epoch": 16.333955223880597, |
| "grad_norm": 0.596170961856842, |
| "learning_rate": 9.900322889008772e-05, |
| "loss": 0.2263, |
| "num_input_tokens_seen": 2489648, |
| "step": 8755 |
| }, |
| { |
| "epoch": 16.34328358208955, |
| "grad_norm": 0.9109885096549988, |
| "learning_rate": 9.851750019836231e-05, |
| "loss": 0.3221, |
| "num_input_tokens_seen": 2491120, |
| "step": 8760 |
| }, |
| { |
| "epoch": 16.35261194029851, |
| "grad_norm": 0.5357970595359802, |
| "learning_rate": 9.803283572730271e-05, |
| "loss": 0.2394, |
| "num_input_tokens_seen": 2492656, |
| "step": 8765 |
| }, |
| { |
| "epoch": 16.361940298507463, |
| "grad_norm": 0.705450177192688, |
| "learning_rate": 9.75492367616222e-05, |
| "loss": 0.2454, |
| "num_input_tokens_seen": 2493904, |
| "step": 8770 |
| }, |
| { |
| "epoch": 16.371268656716417, |
| "grad_norm": 0.6140018105506897, |
| "learning_rate": 9.706670458320993e-05, |
| "loss": 0.2821, |
| "num_input_tokens_seen": 2495536, |
| "step": 8775 |
| }, |
| { |
| "epoch": 16.380597014925375, |
| "grad_norm": 0.545140266418457, |
| "learning_rate": 9.658524047112749e-05, |
| "loss": 0.303, |
| "num_input_tokens_seen": 2496816, |
| "step": 8780 |
| }, |
| { |
| "epoch": 16.38992537313433, |
| "grad_norm": 0.6749919652938843, |
| "learning_rate": 9.610484570160444e-05, |
| "loss": 0.2499, |
| "num_input_tokens_seen": 2498224, |
| "step": 8785 |
| }, |
| { |
| "epoch": 16.399253731343283, |
| "grad_norm": 0.5334325432777405, |
| "learning_rate": 9.562552154803673e-05, |
| "loss": 0.3425, |
| "num_input_tokens_seen": 2499760, |
| "step": 8790 |
| }, |
| { |
| "epoch": 16.40858208955224, |
| "grad_norm": 0.3113941252231598, |
| "learning_rate": 9.514726928098189e-05, |
| "loss": 0.2433, |
| "num_input_tokens_seen": 2501040, |
| "step": 8795 |
| }, |
| { |
| "epoch": 16.417910447761194, |
| "grad_norm": 0.39453864097595215, |
| "learning_rate": 9.467009016815625e-05, |
| "loss": 0.2447, |
| "num_input_tokens_seen": 2502672, |
| "step": 8800 |
| }, |
| { |
| "epoch": 16.42723880597015, |
| "grad_norm": 0.7549194693565369, |
| "learning_rate": 9.419398547443175e-05, |
| "loss": 0.3651, |
| "num_input_tokens_seen": 2504048, |
| "step": 8805 |
| }, |
| { |
| "epoch": 16.436567164179106, |
| "grad_norm": 0.4813898801803589, |
| "learning_rate": 9.371895646183199e-05, |
| "loss": 0.4382, |
| "num_input_tokens_seen": 2505488, |
| "step": 8810 |
| }, |
| { |
| "epoch": 16.44589552238806, |
| "grad_norm": 0.6909576654434204, |
| "learning_rate": 9.324500438952965e-05, |
| "loss": 0.2031, |
| "num_input_tokens_seen": 2506800, |
| "step": 8815 |
| }, |
| { |
| "epoch": 16.455223880597014, |
| "grad_norm": 0.31445515155792236, |
| "learning_rate": 9.27721305138421e-05, |
| "loss": 0.1622, |
| "num_input_tokens_seen": 2508144, |
| "step": 8820 |
| }, |
| { |
| "epoch": 16.46455223880597, |
| "grad_norm": 0.4787421226501465, |
| "learning_rate": 9.23003360882293e-05, |
| "loss": 0.2795, |
| "num_input_tokens_seen": 2509456, |
| "step": 8825 |
| }, |
| { |
| "epoch": 16.473880597014926, |
| "grad_norm": 0.610919713973999, |
| "learning_rate": 9.182962236328957e-05, |
| "loss": 0.3738, |
| "num_input_tokens_seen": 2510864, |
| "step": 8830 |
| }, |
| { |
| "epoch": 16.48320895522388, |
| "grad_norm": 0.5633202791213989, |
| "learning_rate": 9.135999058675687e-05, |
| "loss": 0.1995, |
| "num_input_tokens_seen": 2512144, |
| "step": 8835 |
| }, |
| { |
| "epoch": 16.492537313432837, |
| "grad_norm": 0.5101869106292725, |
| "learning_rate": 9.089144200349685e-05, |
| "loss": 0.3134, |
| "num_input_tokens_seen": 2513488, |
| "step": 8840 |
| }, |
| { |
| "epoch": 16.50186567164179, |
| "grad_norm": 0.38828736543655396, |
| "learning_rate": 9.042397785550405e-05, |
| "loss": 0.1767, |
| "num_input_tokens_seen": 2514768, |
| "step": 8845 |
| }, |
| { |
| "epoch": 16.511194029850746, |
| "grad_norm": 0.5241112112998962, |
| "learning_rate": 8.995759938189884e-05, |
| "loss": 0.3342, |
| "num_input_tokens_seen": 2516560, |
| "step": 8850 |
| }, |
| { |
| "epoch": 16.520522388059703, |
| "grad_norm": 0.47178515791893005, |
| "learning_rate": 8.949230781892287e-05, |
| "loss": 0.1936, |
| "num_input_tokens_seen": 2518064, |
| "step": 8855 |
| }, |
| { |
| "epoch": 16.529850746268657, |
| "grad_norm": 0.7868510484695435, |
| "learning_rate": 8.902810439993752e-05, |
| "loss": 0.3258, |
| "num_input_tokens_seen": 2519568, |
| "step": 8860 |
| }, |
| { |
| "epoch": 16.53917910447761, |
| "grad_norm": 0.44424474239349365, |
| "learning_rate": 8.85649903554197e-05, |
| "loss": 0.177, |
| "num_input_tokens_seen": 2520912, |
| "step": 8865 |
| }, |
| { |
| "epoch": 16.548507462686565, |
| "grad_norm": 0.6200153231620789, |
| "learning_rate": 8.810296691295827e-05, |
| "loss": 0.3229, |
| "num_input_tokens_seen": 2522320, |
| "step": 8870 |
| }, |
| { |
| "epoch": 16.557835820895523, |
| "grad_norm": 0.6333581209182739, |
| "learning_rate": 8.764203529725152e-05, |
| "loss": 0.213, |
| "num_input_tokens_seen": 2523728, |
| "step": 8875 |
| }, |
| { |
| "epoch": 16.567164179104477, |
| "grad_norm": 0.40464335680007935, |
| "learning_rate": 8.71821967301033e-05, |
| "loss": 0.2427, |
| "num_input_tokens_seen": 2525104, |
| "step": 8880 |
| }, |
| { |
| "epoch": 16.576492537313435, |
| "grad_norm": 0.5083310604095459, |
| "learning_rate": 8.672345243042069e-05, |
| "loss": 0.2852, |
| "num_input_tokens_seen": 2526544, |
| "step": 8885 |
| }, |
| { |
| "epoch": 16.58582089552239, |
| "grad_norm": 0.3773159682750702, |
| "learning_rate": 8.626580361420955e-05, |
| "loss": 0.4596, |
| "num_input_tokens_seen": 2527952, |
| "step": 8890 |
| }, |
| { |
| "epoch": 16.595149253731343, |
| "grad_norm": 0.7448423504829407, |
| "learning_rate": 8.580925149457197e-05, |
| "loss": 0.1847, |
| "num_input_tokens_seen": 2529328, |
| "step": 8895 |
| }, |
| { |
| "epoch": 16.604477611940297, |
| "grad_norm": 0.6146315336227417, |
| "learning_rate": 8.535379728170356e-05, |
| "loss": 0.2781, |
| "num_input_tokens_seen": 2530736, |
| "step": 8900 |
| }, |
| { |
| "epoch": 16.613805970149254, |
| "grad_norm": 0.4092324376106262, |
| "learning_rate": 8.489944218288909e-05, |
| "loss": 0.2197, |
| "num_input_tokens_seen": 2532208, |
| "step": 8905 |
| }, |
| { |
| "epoch": 16.62313432835821, |
| "grad_norm": 0.4566566050052643, |
| "learning_rate": 8.444618740249998e-05, |
| "loss": 0.177, |
| "num_input_tokens_seen": 2533616, |
| "step": 8910 |
| }, |
| { |
| "epoch": 16.632462686567163, |
| "grad_norm": 0.7507036924362183, |
| "learning_rate": 8.399403414199114e-05, |
| "loss": 0.2945, |
| "num_input_tokens_seen": 2535024, |
| "step": 8915 |
| }, |
| { |
| "epoch": 16.64179104477612, |
| "grad_norm": 0.49269193410873413, |
| "learning_rate": 8.354298359989776e-05, |
| "loss": 0.2004, |
| "num_input_tokens_seen": 2536368, |
| "step": 8920 |
| }, |
| { |
| "epoch": 16.651119402985074, |
| "grad_norm": 0.5462902188301086, |
| "learning_rate": 8.309303697183179e-05, |
| "loss": 0.2524, |
| "num_input_tokens_seen": 2537712, |
| "step": 8925 |
| }, |
| { |
| "epoch": 16.66044776119403, |
| "grad_norm": 0.7016326785087585, |
| "learning_rate": 8.264419545047891e-05, |
| "loss": 0.2144, |
| "num_input_tokens_seen": 2539024, |
| "step": 8930 |
| }, |
| { |
| "epoch": 16.669776119402986, |
| "grad_norm": 0.8711559176445007, |
| "learning_rate": 8.219646022559597e-05, |
| "loss": 0.2867, |
| "num_input_tokens_seen": 2540240, |
| "step": 8935 |
| }, |
| { |
| "epoch": 16.67910447761194, |
| "grad_norm": 0.4443308115005493, |
| "learning_rate": 8.174983248400674e-05, |
| "loss": 0.3432, |
| "num_input_tokens_seen": 2541584, |
| "step": 8940 |
| }, |
| { |
| "epoch": 16.688432835820894, |
| "grad_norm": 0.44244229793548584, |
| "learning_rate": 8.130431340959982e-05, |
| "loss": 0.2659, |
| "num_input_tokens_seen": 2543024, |
| "step": 8945 |
| }, |
| { |
| "epoch": 16.69776119402985, |
| "grad_norm": 0.4805716574192047, |
| "learning_rate": 8.08599041833245e-05, |
| "loss": 0.274, |
| "num_input_tokens_seen": 2544528, |
| "step": 8950 |
| }, |
| { |
| "epoch": 16.707089552238806, |
| "grad_norm": 0.35907986760139465, |
| "learning_rate": 8.041660598318889e-05, |
| "loss": 0.238, |
| "num_input_tokens_seen": 2546064, |
| "step": 8955 |
| }, |
| { |
| "epoch": 16.71641791044776, |
| "grad_norm": 0.597048282623291, |
| "learning_rate": 7.997441998425553e-05, |
| "loss": 0.3557, |
| "num_input_tokens_seen": 2547472, |
| "step": 8960 |
| }, |
| { |
| "epoch": 16.725746268656717, |
| "grad_norm": 0.500659167766571, |
| "learning_rate": 7.953334735863881e-05, |
| "loss": 0.3487, |
| "num_input_tokens_seen": 2549040, |
| "step": 8965 |
| }, |
| { |
| "epoch": 16.73507462686567, |
| "grad_norm": 0.5837845206260681, |
| "learning_rate": 7.909338927550225e-05, |
| "loss": 0.1703, |
| "num_input_tokens_seen": 2550480, |
| "step": 8970 |
| }, |
| { |
| "epoch": 16.744402985074625, |
| "grad_norm": 0.4150075912475586, |
| "learning_rate": 7.865454690105473e-05, |
| "loss": 0.2964, |
| "num_input_tokens_seen": 2551888, |
| "step": 8975 |
| }, |
| { |
| "epoch": 16.753731343283583, |
| "grad_norm": 0.616115927696228, |
| "learning_rate": 7.821682139854758e-05, |
| "loss": 0.2685, |
| "num_input_tokens_seen": 2553232, |
| "step": 8980 |
| }, |
| { |
| "epoch": 16.763059701492537, |
| "grad_norm": 0.3962574899196625, |
| "learning_rate": 7.778021392827211e-05, |
| "loss": 0.3528, |
| "num_input_tokens_seen": 2554640, |
| "step": 8985 |
| }, |
| { |
| "epoch": 16.77238805970149, |
| "grad_norm": 0.5545009970664978, |
| "learning_rate": 7.734472564755551e-05, |
| "loss": 0.231, |
| "num_input_tokens_seen": 2556112, |
| "step": 8990 |
| }, |
| { |
| "epoch": 16.78171641791045, |
| "grad_norm": 0.46232175827026367, |
| "learning_rate": 7.691035771075855e-05, |
| "loss": 0.3658, |
| "num_input_tokens_seen": 2557584, |
| "step": 8995 |
| }, |
| { |
| "epoch": 16.791044776119403, |
| "grad_norm": 0.6394046545028687, |
| "learning_rate": 7.64771112692721e-05, |
| "loss": 0.2446, |
| "num_input_tokens_seen": 2559184, |
| "step": 9000 |
| }, |
| { |
| "epoch": 16.800373134328357, |
| "grad_norm": 0.34025850892066956, |
| "learning_rate": 7.604498747151456e-05, |
| "loss": 0.2375, |
| "num_input_tokens_seen": 2560688, |
| "step": 9005 |
| }, |
| { |
| "epoch": 16.809701492537314, |
| "grad_norm": 0.42679092288017273, |
| "learning_rate": 7.56139874629283e-05, |
| "loss": 0.2571, |
| "num_input_tokens_seen": 2562224, |
| "step": 9010 |
| }, |
| { |
| "epoch": 16.81902985074627, |
| "grad_norm": 0.512681245803833, |
| "learning_rate": 7.518411238597667e-05, |
| "loss": 0.1278, |
| "num_input_tokens_seen": 2563568, |
| "step": 9015 |
| }, |
| { |
| "epoch": 16.828358208955223, |
| "grad_norm": 0.8411496877670288, |
| "learning_rate": 7.475536338014155e-05, |
| "loss": 0.262, |
| "num_input_tokens_seen": 2565104, |
| "step": 9020 |
| }, |
| { |
| "epoch": 16.83768656716418, |
| "grad_norm": 0.45846620202064514, |
| "learning_rate": 7.432774158191946e-05, |
| "loss": 0.2383, |
| "num_input_tokens_seen": 2566512, |
| "step": 9025 |
| }, |
| { |
| "epoch": 16.847014925373134, |
| "grad_norm": 0.6833055019378662, |
| "learning_rate": 7.390124812481957e-05, |
| "loss": 0.2417, |
| "num_input_tokens_seen": 2568048, |
| "step": 9030 |
| }, |
| { |
| "epoch": 16.85634328358209, |
| "grad_norm": 0.5997596383094788, |
| "learning_rate": 7.347588413935935e-05, |
| "loss": 0.3637, |
| "num_input_tokens_seen": 2569392, |
| "step": 9035 |
| }, |
| { |
| "epoch": 16.865671641791046, |
| "grad_norm": 0.6235828995704651, |
| "learning_rate": 7.305165075306297e-05, |
| "loss": 0.3092, |
| "num_input_tokens_seen": 2570704, |
| "step": 9040 |
| }, |
| { |
| "epoch": 16.875, |
| "grad_norm": 0.5785413980484009, |
| "learning_rate": 7.262854909045774e-05, |
| "loss": 0.325, |
| "num_input_tokens_seen": 2572336, |
| "step": 9045 |
| }, |
| { |
| "epoch": 16.884328358208954, |
| "grad_norm": 0.4815627336502075, |
| "learning_rate": 7.220658027307036e-05, |
| "loss": 0.2405, |
| "num_input_tokens_seen": 2573680, |
| "step": 9050 |
| }, |
| { |
| "epoch": 16.89365671641791, |
| "grad_norm": 0.2667815089225769, |
| "learning_rate": 7.178574541942545e-05, |
| "loss": 0.3088, |
| "num_input_tokens_seen": 2575120, |
| "step": 9055 |
| }, |
| { |
| "epoch": 16.902985074626866, |
| "grad_norm": 0.5672833323478699, |
| "learning_rate": 7.136604564504134e-05, |
| "loss": 0.2136, |
| "num_input_tokens_seen": 2576400, |
| "step": 9060 |
| }, |
| { |
| "epoch": 16.91231343283582, |
| "grad_norm": 0.7024040818214417, |
| "learning_rate": 7.094748206242796e-05, |
| "loss": 0.316, |
| "num_input_tokens_seen": 2578320, |
| "step": 9065 |
| }, |
| { |
| "epoch": 16.921641791044777, |
| "grad_norm": 0.6449324488639832, |
| "learning_rate": 7.053005578108296e-05, |
| "loss": 0.2807, |
| "num_input_tokens_seen": 2579888, |
| "step": 9070 |
| }, |
| { |
| "epoch": 16.93097014925373, |
| "grad_norm": 0.6066656112670898, |
| "learning_rate": 7.01137679074897e-05, |
| "loss": 0.2749, |
| "num_input_tokens_seen": 2581296, |
| "step": 9075 |
| }, |
| { |
| "epoch": 16.940298507462686, |
| "grad_norm": 0.7407752871513367, |
| "learning_rate": 6.969861954511409e-05, |
| "loss": 0.2132, |
| "num_input_tokens_seen": 2582768, |
| "step": 9080 |
| }, |
| { |
| "epoch": 16.949626865671643, |
| "grad_norm": 0.3702264130115509, |
| "learning_rate": 6.928461179440109e-05, |
| "loss": 0.1839, |
| "num_input_tokens_seen": 2584144, |
| "step": 9085 |
| }, |
| { |
| "epoch": 16.958955223880597, |
| "grad_norm": 0.6591739654541016, |
| "learning_rate": 6.887174575277239e-05, |
| "loss": 0.2496, |
| "num_input_tokens_seen": 2585456, |
| "step": 9090 |
| }, |
| { |
| "epoch": 16.96828358208955, |
| "grad_norm": 0.8507482409477234, |
| "learning_rate": 6.846002251462324e-05, |
| "loss": 0.306, |
| "num_input_tokens_seen": 2586768, |
| "step": 9095 |
| }, |
| { |
| "epoch": 16.97761194029851, |
| "grad_norm": 0.6380784511566162, |
| "learning_rate": 6.804944317131995e-05, |
| "loss": 0.4314, |
| "num_input_tokens_seen": 2588016, |
| "step": 9100 |
| }, |
| { |
| "epoch": 16.986940298507463, |
| "grad_norm": 0.3737356662750244, |
| "learning_rate": 6.76400088111963e-05, |
| "loss": 0.3087, |
| "num_input_tokens_seen": 2589520, |
| "step": 9105 |
| }, |
| { |
| "epoch": 16.996268656716417, |
| "grad_norm": 0.6403619050979614, |
| "learning_rate": 6.723172051955101e-05, |
| "loss": 0.124, |
| "num_input_tokens_seen": 2591120, |
| "step": 9110 |
| }, |
| { |
| "epoch": 17.0, |
| "eval_loss": 0.8633560538291931, |
| "eval_runtime": 4.6195, |
| "eval_samples_per_second": 51.521, |
| "eval_steps_per_second": 12.988, |
| "num_input_tokens_seen": 2591432, |
| "step": 9112 |
| }, |
| { |
| "epoch": 17.005597014925375, |
| "grad_norm": 0.4501326084136963, |
| "learning_rate": 6.682457937864538e-05, |
| "loss": 0.3698, |
| "num_input_tokens_seen": 2592296, |
| "step": 9115 |
| }, |
| { |
| "epoch": 17.01492537313433, |
| "grad_norm": 0.48867860436439514, |
| "learning_rate": 6.641858646769938e-05, |
| "loss": 0.3062, |
| "num_input_tokens_seen": 2593704, |
| "step": 9120 |
| }, |
| { |
| "epoch": 17.024253731343283, |
| "grad_norm": 0.6671390533447266, |
| "learning_rate": 6.601374286288963e-05, |
| "loss": 0.2267, |
| "num_input_tokens_seen": 2595272, |
| "step": 9125 |
| }, |
| { |
| "epoch": 17.03358208955224, |
| "grad_norm": 0.4498516619205475, |
| "learning_rate": 6.561004963734595e-05, |
| "loss": 0.2633, |
| "num_input_tokens_seen": 2596712, |
| "step": 9130 |
| }, |
| { |
| "epoch": 17.042910447761194, |
| "grad_norm": 0.5411965250968933, |
| "learning_rate": 6.520750786114938e-05, |
| "loss": 0.3891, |
| "num_input_tokens_seen": 2598152, |
| "step": 9135 |
| }, |
| { |
| "epoch": 17.05223880597015, |
| "grad_norm": 0.28718677163124084, |
| "learning_rate": 6.480611860132824e-05, |
| "loss": 0.3361, |
| "num_input_tokens_seen": 2599624, |
| "step": 9140 |
| }, |
| { |
| "epoch": 17.061567164179106, |
| "grad_norm": 1.2912830114364624, |
| "learning_rate": 6.440588292185595e-05, |
| "loss": 0.211, |
| "num_input_tokens_seen": 2601160, |
| "step": 9145 |
| }, |
| { |
| "epoch": 17.07089552238806, |
| "grad_norm": 0.38549405336380005, |
| "learning_rate": 6.400680188364844e-05, |
| "loss": 0.2075, |
| "num_input_tokens_seen": 2602536, |
| "step": 9150 |
| }, |
| { |
| "epoch": 17.080223880597014, |
| "grad_norm": 0.8170197010040283, |
| "learning_rate": 6.360887654456065e-05, |
| "loss": 0.2086, |
| "num_input_tokens_seen": 2604040, |
| "step": 9155 |
| }, |
| { |
| "epoch": 17.08955223880597, |
| "grad_norm": 0.3383115530014038, |
| "learning_rate": 6.321210795938403e-05, |
| "loss": 0.1943, |
| "num_input_tokens_seen": 2605352, |
| "step": 9160 |
| }, |
| { |
| "epoch": 17.098880597014926, |
| "grad_norm": 0.7286571264266968, |
| "learning_rate": 6.281649717984417e-05, |
| "loss": 0.2104, |
| "num_input_tokens_seen": 2606696, |
| "step": 9165 |
| }, |
| { |
| "epoch": 17.10820895522388, |
| "grad_norm": 0.5768951773643494, |
| "learning_rate": 6.242204525459738e-05, |
| "loss": 0.2729, |
| "num_input_tokens_seen": 2608168, |
| "step": 9170 |
| }, |
| { |
| "epoch": 17.117537313432837, |
| "grad_norm": 0.5091217756271362, |
| "learning_rate": 6.202875322922808e-05, |
| "loss": 0.3402, |
| "num_input_tokens_seen": 2609608, |
| "step": 9175 |
| }, |
| { |
| "epoch": 17.12686567164179, |
| "grad_norm": 0.4321063160896301, |
| "learning_rate": 6.163662214624616e-05, |
| "loss": 0.2309, |
| "num_input_tokens_seen": 2610920, |
| "step": 9180 |
| }, |
| { |
| "epoch": 17.136194029850746, |
| "grad_norm": 0.7079524993896484, |
| "learning_rate": 6.12456530450844e-05, |
| "loss": 0.1885, |
| "num_input_tokens_seen": 2612200, |
| "step": 9185 |
| }, |
| { |
| "epoch": 17.145522388059703, |
| "grad_norm": 0.5813358426094055, |
| "learning_rate": 6.0855846962095285e-05, |
| "loss": 0.245, |
| "num_input_tokens_seen": 2613544, |
| "step": 9190 |
| }, |
| { |
| "epoch": 17.154850746268657, |
| "grad_norm": 0.5503816604614258, |
| "learning_rate": 6.0467204930548357e-05, |
| "loss": 0.2032, |
| "num_input_tokens_seen": 2614888, |
| "step": 9195 |
| }, |
| { |
| "epoch": 17.16417910447761, |
| "grad_norm": 0.43504875898361206, |
| "learning_rate": 6.007972798062783e-05, |
| "loss": 0.1711, |
| "num_input_tokens_seen": 2616488, |
| "step": 9200 |
| }, |
| { |
| "epoch": 17.17350746268657, |
| "grad_norm": 0.41562536358833313, |
| "learning_rate": 5.96934171394295e-05, |
| "loss": 0.2347, |
| "num_input_tokens_seen": 2617800, |
| "step": 9205 |
| }, |
| { |
| "epoch": 17.182835820895523, |
| "grad_norm": 0.33625268936157227, |
| "learning_rate": 5.930827343095801e-05, |
| "loss": 0.2238, |
| "num_input_tokens_seen": 2619080, |
| "step": 9210 |
| }, |
| { |
| "epoch": 17.192164179104477, |
| "grad_norm": 0.608429491519928, |
| "learning_rate": 5.8924297876124246e-05, |
| "loss": 0.3715, |
| "num_input_tokens_seen": 2620520, |
| "step": 9215 |
| }, |
| { |
| "epoch": 17.20149253731343, |
| "grad_norm": 0.8513853549957275, |
| "learning_rate": 5.854149149274296e-05, |
| "loss": 0.3546, |
| "num_input_tokens_seen": 2621896, |
| "step": 9220 |
| }, |
| { |
| "epoch": 17.21082089552239, |
| "grad_norm": 0.42540302872657776, |
| "learning_rate": 5.815985529552942e-05, |
| "loss": 0.2143, |
| "num_input_tokens_seen": 2623176, |
| "step": 9225 |
| }, |
| { |
| "epoch": 17.220149253731343, |
| "grad_norm": 0.7629372477531433, |
| "learning_rate": 5.777939029609708e-05, |
| "loss": 0.2484, |
| "num_input_tokens_seen": 2624584, |
| "step": 9230 |
| }, |
| { |
| "epoch": 17.229477611940297, |
| "grad_norm": 0.6999621391296387, |
| "learning_rate": 5.740009750295505e-05, |
| "loss": 0.3354, |
| "num_input_tokens_seen": 2626120, |
| "step": 9235 |
| }, |
| { |
| "epoch": 17.238805970149254, |
| "grad_norm": 1.2121412754058838, |
| "learning_rate": 5.7021977921505156e-05, |
| "loss": 0.1779, |
| "num_input_tokens_seen": 2627624, |
| "step": 9240 |
| }, |
| { |
| "epoch": 17.24813432835821, |
| "grad_norm": 0.6345969438552856, |
| "learning_rate": 5.664503255403924e-05, |
| "loss": 0.3501, |
| "num_input_tokens_seen": 2629000, |
| "step": 9245 |
| }, |
| { |
| "epoch": 17.257462686567163, |
| "grad_norm": 0.36961647868156433, |
| "learning_rate": 5.626926239973668e-05, |
| "loss": 0.1767, |
| "num_input_tokens_seen": 2630504, |
| "step": 9250 |
| }, |
| { |
| "epoch": 17.26679104477612, |
| "grad_norm": 0.4116383194923401, |
| "learning_rate": 5.589466845466179e-05, |
| "loss": 0.2798, |
| "num_input_tokens_seen": 2631880, |
| "step": 9255 |
| }, |
| { |
| "epoch": 17.276119402985074, |
| "grad_norm": 0.8709968328475952, |
| "learning_rate": 5.5521251711761256e-05, |
| "loss": 0.2116, |
| "num_input_tokens_seen": 2633288, |
| "step": 9260 |
| }, |
| { |
| "epoch": 17.28544776119403, |
| "grad_norm": 0.567836344242096, |
| "learning_rate": 5.5149013160860575e-05, |
| "loss": 0.2808, |
| "num_input_tokens_seen": 2634824, |
| "step": 9265 |
| }, |
| { |
| "epoch": 17.294776119402986, |
| "grad_norm": 0.6120123267173767, |
| "learning_rate": 5.477795378866307e-05, |
| "loss": 0.2145, |
| "num_input_tokens_seen": 2636136, |
| "step": 9270 |
| }, |
| { |
| "epoch": 17.30410447761194, |
| "grad_norm": 0.6449649930000305, |
| "learning_rate": 5.4408074578745806e-05, |
| "loss": 0.2289, |
| "num_input_tokens_seen": 2637480, |
| "step": 9275 |
| }, |
| { |
| "epoch": 17.313432835820894, |
| "grad_norm": 0.38337478041648865, |
| "learning_rate": 5.403937651155771e-05, |
| "loss": 0.229, |
| "num_input_tokens_seen": 2639080, |
| "step": 9280 |
| }, |
| { |
| "epoch": 17.32276119402985, |
| "grad_norm": 0.49158769845962524, |
| "learning_rate": 5.367186056441703e-05, |
| "loss": 0.2575, |
| "num_input_tokens_seen": 2640616, |
| "step": 9285 |
| }, |
| { |
| "epoch": 17.332089552238806, |
| "grad_norm": 0.3472841680049896, |
| "learning_rate": 5.3305527711508205e-05, |
| "loss": 0.1758, |
| "num_input_tokens_seen": 2642184, |
| "step": 9290 |
| }, |
| { |
| "epoch": 17.34141791044776, |
| "grad_norm": 0.29027411341667175, |
| "learning_rate": 5.294037892387998e-05, |
| "loss": 0.1465, |
| "num_input_tokens_seen": 2643784, |
| "step": 9295 |
| }, |
| { |
| "epoch": 17.350746268656717, |
| "grad_norm": 0.4464806616306305, |
| "learning_rate": 5.2576415169441895e-05, |
| "loss": 0.2601, |
| "num_input_tokens_seen": 2645160, |
| "step": 9300 |
| }, |
| { |
| "epoch": 17.36007462686567, |
| "grad_norm": 0.34677013754844666, |
| "learning_rate": 5.221363741296298e-05, |
| "loss": 0.2752, |
| "num_input_tokens_seen": 2646760, |
| "step": 9305 |
| }, |
| { |
| "epoch": 17.369402985074625, |
| "grad_norm": 0.7161241173744202, |
| "learning_rate": 5.18520466160679e-05, |
| "loss": 0.2292, |
| "num_input_tokens_seen": 2648264, |
| "step": 9310 |
| }, |
| { |
| "epoch": 17.378731343283583, |
| "grad_norm": 0.3472307622432709, |
| "learning_rate": 5.149164373723558e-05, |
| "loss": 0.1797, |
| "num_input_tokens_seen": 2649864, |
| "step": 9315 |
| }, |
| { |
| "epoch": 17.388059701492537, |
| "grad_norm": 0.6642903089523315, |
| "learning_rate": 5.113242973179566e-05, |
| "loss": 0.3569, |
| "num_input_tokens_seen": 2651272, |
| "step": 9320 |
| }, |
| { |
| "epoch": 17.39738805970149, |
| "grad_norm": 0.6346752047538757, |
| "learning_rate": 5.077440555192647e-05, |
| "loss": 0.2407, |
| "num_input_tokens_seen": 2652552, |
| "step": 9325 |
| }, |
| { |
| "epoch": 17.40671641791045, |
| "grad_norm": 0.7212795615196228, |
| "learning_rate": 5.0417572146652825e-05, |
| "loss": 0.1955, |
| "num_input_tokens_seen": 2654056, |
| "step": 9330 |
| }, |
| { |
| "epoch": 17.416044776119403, |
| "grad_norm": 0.5936789512634277, |
| "learning_rate": 5.0061930461842375e-05, |
| "loss": 0.2506, |
| "num_input_tokens_seen": 2655592, |
| "step": 9335 |
| }, |
| { |
| "epoch": 17.425373134328357, |
| "grad_norm": 0.45224127173423767, |
| "learning_rate": 4.9707481440204486e-05, |
| "loss": 0.1789, |
| "num_input_tokens_seen": 2657064, |
| "step": 9340 |
| }, |
| { |
| "epoch": 17.434701492537314, |
| "grad_norm": 0.6721286773681641, |
| "learning_rate": 4.9354226021286975e-05, |
| "loss": 0.2685, |
| "num_input_tokens_seen": 2658664, |
| "step": 9345 |
| }, |
| { |
| "epoch": 17.44402985074627, |
| "grad_norm": 0.2122386246919632, |
| "learning_rate": 4.900216514147365e-05, |
| "loss": 0.3693, |
| "num_input_tokens_seen": 2660232, |
| "step": 9350 |
| }, |
| { |
| "epoch": 17.453358208955223, |
| "grad_norm": 0.29205867648124695, |
| "learning_rate": 4.8651299733981855e-05, |
| "loss": 0.2109, |
| "num_input_tokens_seen": 2661704, |
| "step": 9355 |
| }, |
| { |
| "epoch": 17.46268656716418, |
| "grad_norm": 1.0239243507385254, |
| "learning_rate": 4.830163072886007e-05, |
| "loss": 0.2698, |
| "num_input_tokens_seen": 2662984, |
| "step": 9360 |
| }, |
| { |
| "epoch": 17.472014925373134, |
| "grad_norm": 0.6300740242004395, |
| "learning_rate": 4.7953159052985693e-05, |
| "loss": 0.3608, |
| "num_input_tokens_seen": 2664392, |
| "step": 9365 |
| }, |
| { |
| "epoch": 17.48134328358209, |
| "grad_norm": 0.7175711989402771, |
| "learning_rate": 4.760588563006207e-05, |
| "loss": 0.1967, |
| "num_input_tokens_seen": 2665960, |
| "step": 9370 |
| }, |
| { |
| "epoch": 17.490671641791046, |
| "grad_norm": 0.47732552886009216, |
| "learning_rate": 4.725981138061625e-05, |
| "loss": 0.257, |
| "num_input_tokens_seen": 2667528, |
| "step": 9375 |
| }, |
| { |
| "epoch": 17.5, |
| "grad_norm": 0.7046051025390625, |
| "learning_rate": 4.691493722199697e-05, |
| "loss": 0.2495, |
| "num_input_tokens_seen": 2668904, |
| "step": 9380 |
| }, |
| { |
| "epoch": 17.509328358208954, |
| "grad_norm": 0.5751326680183411, |
| "learning_rate": 4.657126406837148e-05, |
| "loss": 0.2772, |
| "num_input_tokens_seen": 2670536, |
| "step": 9385 |
| }, |
| { |
| "epoch": 17.51865671641791, |
| "grad_norm": 0.437429279088974, |
| "learning_rate": 4.622879283072368e-05, |
| "loss": 0.2323, |
| "num_input_tokens_seen": 2671816, |
| "step": 9390 |
| }, |
| { |
| "epoch": 17.527985074626866, |
| "grad_norm": 0.35734742879867554, |
| "learning_rate": 4.588752441685129e-05, |
| "loss": 0.1012, |
| "num_input_tokens_seen": 2673288, |
| "step": 9395 |
| }, |
| { |
| "epoch": 17.53731343283582, |
| "grad_norm": 0.6184195280075073, |
| "learning_rate": 4.554745973136409e-05, |
| "loss": 0.3341, |
| "num_input_tokens_seen": 2674664, |
| "step": 9400 |
| }, |
| { |
| "epoch": 17.546641791044777, |
| "grad_norm": 0.9945972561836243, |
| "learning_rate": 4.5208599675680754e-05, |
| "loss": 0.2572, |
| "num_input_tokens_seen": 2676040, |
| "step": 9405 |
| }, |
| { |
| "epoch": 17.55597014925373, |
| "grad_norm": 0.6488131880760193, |
| "learning_rate": 4.487094514802686e-05, |
| "loss": 0.1918, |
| "num_input_tokens_seen": 2677416, |
| "step": 9410 |
| }, |
| { |
| "epoch": 17.565298507462686, |
| "grad_norm": 0.5945156216621399, |
| "learning_rate": 4.4534497043432655e-05, |
| "loss": 0.2109, |
| "num_input_tokens_seen": 2678792, |
| "step": 9415 |
| }, |
| { |
| "epoch": 17.574626865671643, |
| "grad_norm": 0.5408649444580078, |
| "learning_rate": 4.419925625373028e-05, |
| "loss": 0.1932, |
| "num_input_tokens_seen": 2680232, |
| "step": 9420 |
| }, |
| { |
| "epoch": 17.583955223880597, |
| "grad_norm": 0.7728055119514465, |
| "learning_rate": 4.386522366755169e-05, |
| "loss": 0.3155, |
| "num_input_tokens_seen": 2681640, |
| "step": 9425 |
| }, |
| { |
| "epoch": 17.59328358208955, |
| "grad_norm": 0.6577746272087097, |
| "learning_rate": 4.353240017032611e-05, |
| "loss": 0.3126, |
| "num_input_tokens_seen": 2682984, |
| "step": 9430 |
| }, |
| { |
| "epoch": 17.60261194029851, |
| "grad_norm": 0.6137169599533081, |
| "learning_rate": 4.3200786644278064e-05, |
| "loss": 0.3288, |
| "num_input_tokens_seen": 2684456, |
| "step": 9435 |
| }, |
| { |
| "epoch": 17.611940298507463, |
| "grad_norm": 0.4128672778606415, |
| "learning_rate": 4.287038396842463e-05, |
| "loss": 0.1679, |
| "num_input_tokens_seen": 2685992, |
| "step": 9440 |
| }, |
| { |
| "epoch": 17.621268656716417, |
| "grad_norm": 0.5442803502082825, |
| "learning_rate": 4.254119301857301e-05, |
| "loss": 0.3345, |
| "num_input_tokens_seen": 2687368, |
| "step": 9445 |
| }, |
| { |
| "epoch": 17.630597014925375, |
| "grad_norm": 0.412265807390213, |
| "learning_rate": 4.2213214667318925e-05, |
| "loss": 0.1516, |
| "num_input_tokens_seen": 2688840, |
| "step": 9450 |
| }, |
| { |
| "epoch": 17.63992537313433, |
| "grad_norm": 0.7147823572158813, |
| "learning_rate": 4.188644978404349e-05, |
| "loss": 0.2244, |
| "num_input_tokens_seen": 2690280, |
| "step": 9455 |
| }, |
| { |
| "epoch": 17.649253731343283, |
| "grad_norm": 0.6293306350708008, |
| "learning_rate": 4.156089923491124e-05, |
| "loss": 0.2742, |
| "num_input_tokens_seen": 2691720, |
| "step": 9460 |
| }, |
| { |
| "epoch": 17.65858208955224, |
| "grad_norm": 0.7018571496009827, |
| "learning_rate": 4.1236563882868116e-05, |
| "loss": 0.2319, |
| "num_input_tokens_seen": 2693064, |
| "step": 9465 |
| }, |
| { |
| "epoch": 17.667910447761194, |
| "grad_norm": 0.5599743723869324, |
| "learning_rate": 4.091344458763863e-05, |
| "loss": 0.3065, |
| "num_input_tokens_seen": 2694344, |
| "step": 9470 |
| }, |
| { |
| "epoch": 17.67723880597015, |
| "grad_norm": 0.3867090046405792, |
| "learning_rate": 4.0591542205723975e-05, |
| "loss": 0.3248, |
| "num_input_tokens_seen": 2695816, |
| "step": 9475 |
| }, |
| { |
| "epoch": 17.686567164179106, |
| "grad_norm": 0.7426350712776184, |
| "learning_rate": 4.02708575903995e-05, |
| "loss": 0.3151, |
| "num_input_tokens_seen": 2697192, |
| "step": 9480 |
| }, |
| { |
| "epoch": 17.69589552238806, |
| "grad_norm": 0.5198386907577515, |
| "learning_rate": 3.995139159171296e-05, |
| "loss": 0.2335, |
| "num_input_tokens_seen": 2698696, |
| "step": 9485 |
| }, |
| { |
| "epoch": 17.705223880597014, |
| "grad_norm": 0.5265973210334778, |
| "learning_rate": 3.963314505648141e-05, |
| "loss": 0.3741, |
| "num_input_tokens_seen": 2700200, |
| "step": 9490 |
| }, |
| { |
| "epoch": 17.71455223880597, |
| "grad_norm": 0.4689077138900757, |
| "learning_rate": 3.931611882828967e-05, |
| "loss": 0.1904, |
| "num_input_tokens_seen": 2701800, |
| "step": 9495 |
| }, |
| { |
| "epoch": 17.723880597014926, |
| "grad_norm": 0.6163126826286316, |
| "learning_rate": 3.900031374748797e-05, |
| "loss": 0.1411, |
| "num_input_tokens_seen": 2703208, |
| "step": 9500 |
| }, |
| { |
| "epoch": 17.73320895522388, |
| "grad_norm": 0.7090937495231628, |
| "learning_rate": 3.868573065118935e-05, |
| "loss": 0.3838, |
| "num_input_tokens_seen": 2704744, |
| "step": 9505 |
| }, |
| { |
| "epoch": 17.742537313432837, |
| "grad_norm": 0.4497239589691162, |
| "learning_rate": 3.837237037326813e-05, |
| "loss": 0.3361, |
| "num_input_tokens_seen": 2706120, |
| "step": 9510 |
| }, |
| { |
| "epoch": 17.75186567164179, |
| "grad_norm": 0.4925253391265869, |
| "learning_rate": 3.806023374435663e-05, |
| "loss": 0.2392, |
| "num_input_tokens_seen": 2707656, |
| "step": 9515 |
| }, |
| { |
| "epoch": 17.761194029850746, |
| "grad_norm": 0.22657965123653412, |
| "learning_rate": 3.774932159184413e-05, |
| "loss": 0.156, |
| "num_input_tokens_seen": 2709320, |
| "step": 9520 |
| }, |
| { |
| "epoch": 17.770522388059703, |
| "grad_norm": 0.8829783201217651, |
| "learning_rate": 3.7439634739874165e-05, |
| "loss": 0.3334, |
| "num_input_tokens_seen": 2710728, |
| "step": 9525 |
| }, |
| { |
| "epoch": 17.779850746268657, |
| "grad_norm": 0.6137654781341553, |
| "learning_rate": 3.7131174009341794e-05, |
| "loss": 0.2255, |
| "num_input_tokens_seen": 2712136, |
| "step": 9530 |
| }, |
| { |
| "epoch": 17.78917910447761, |
| "grad_norm": 0.8342494964599609, |
| "learning_rate": 3.682394021789259e-05, |
| "loss": 0.3316, |
| "num_input_tokens_seen": 2713672, |
| "step": 9535 |
| }, |
| { |
| "epoch": 17.798507462686565, |
| "grad_norm": 0.3486193120479584, |
| "learning_rate": 3.65179341799195e-05, |
| "loss": 0.3042, |
| "num_input_tokens_seen": 2715048, |
| "step": 9540 |
| }, |
| { |
| "epoch": 17.807835820895523, |
| "grad_norm": 0.6700887680053711, |
| "learning_rate": 3.6213156706561166e-05, |
| "loss": 0.3171, |
| "num_input_tokens_seen": 2716360, |
| "step": 9545 |
| }, |
| { |
| "epoch": 17.817164179104477, |
| "grad_norm": 0.6130694150924683, |
| "learning_rate": 3.590960860569959e-05, |
| "loss": 0.3269, |
| "num_input_tokens_seen": 2717928, |
| "step": 9550 |
| }, |
| { |
| "epoch": 17.826492537313435, |
| "grad_norm": 0.5257990956306458, |
| "learning_rate": 3.5607290681957894e-05, |
| "loss": 0.2473, |
| "num_input_tokens_seen": 2719304, |
| "step": 9555 |
| }, |
| { |
| "epoch": 17.83582089552239, |
| "grad_norm": 0.7420773506164551, |
| "learning_rate": 3.5306203736698685e-05, |
| "loss": 0.3058, |
| "num_input_tokens_seen": 2720680, |
| "step": 9560 |
| }, |
| { |
| "epoch": 17.845149253731343, |
| "grad_norm": 0.6225281953811646, |
| "learning_rate": 3.500634856802132e-05, |
| "loss": 0.2302, |
| "num_input_tokens_seen": 2722056, |
| "step": 9565 |
| }, |
| { |
| "epoch": 17.854477611940297, |
| "grad_norm": 0.4176914095878601, |
| "learning_rate": 3.4707725970760054e-05, |
| "loss": 0.1842, |
| "num_input_tokens_seen": 2723592, |
| "step": 9570 |
| }, |
| { |
| "epoch": 17.863805970149254, |
| "grad_norm": 0.6423101425170898, |
| "learning_rate": 3.441033673648197e-05, |
| "loss": 0.1773, |
| "num_input_tokens_seen": 2724808, |
| "step": 9575 |
| }, |
| { |
| "epoch": 17.87313432835821, |
| "grad_norm": 0.6204359531402588, |
| "learning_rate": 3.411418165348501e-05, |
| "loss": 0.2692, |
| "num_input_tokens_seen": 2726216, |
| "step": 9580 |
| }, |
| { |
| "epoch": 17.882462686567163, |
| "grad_norm": 0.7418746948242188, |
| "learning_rate": 3.381926150679543e-05, |
| "loss": 0.2128, |
| "num_input_tokens_seen": 2727528, |
| "step": 9585 |
| }, |
| { |
| "epoch": 17.89179104477612, |
| "grad_norm": 0.49704796075820923, |
| "learning_rate": 3.352557707816617e-05, |
| "loss": 0.1486, |
| "num_input_tokens_seen": 2728936, |
| "step": 9590 |
| }, |
| { |
| "epoch": 17.901119402985074, |
| "grad_norm": 0.4309185743331909, |
| "learning_rate": 3.323312914607468e-05, |
| "loss": 0.1939, |
| "num_input_tokens_seen": 2730600, |
| "step": 9595 |
| }, |
| { |
| "epoch": 17.91044776119403, |
| "grad_norm": 0.7381911277770996, |
| "learning_rate": 3.294191848572059e-05, |
| "loss": 0.2325, |
| "num_input_tokens_seen": 2732072, |
| "step": 9600 |
| }, |
| { |
| "epoch": 17.919776119402986, |
| "grad_norm": 0.5855839848518372, |
| "learning_rate": 3.265194586902404e-05, |
| "loss": 0.3232, |
| "num_input_tokens_seen": 2733704, |
| "step": 9605 |
| }, |
| { |
| "epoch": 17.92910447761194, |
| "grad_norm": 0.5614606142044067, |
| "learning_rate": 3.236321206462339e-05, |
| "loss": 0.2282, |
| "num_input_tokens_seen": 2735016, |
| "step": 9610 |
| }, |
| { |
| "epoch": 17.938432835820894, |
| "grad_norm": 0.8269302248954773, |
| "learning_rate": 3.207571783787328e-05, |
| "loss": 0.1947, |
| "num_input_tokens_seen": 2736424, |
| "step": 9615 |
| }, |
| { |
| "epoch": 17.94776119402985, |
| "grad_norm": 0.8580751419067383, |
| "learning_rate": 3.1789463950842476e-05, |
| "loss": 0.2183, |
| "num_input_tokens_seen": 2737864, |
| "step": 9620 |
| }, |
| { |
| "epoch": 17.957089552238806, |
| "grad_norm": 0.8197803497314453, |
| "learning_rate": 3.1504451162311986e-05, |
| "loss": 0.2768, |
| "num_input_tokens_seen": 2739016, |
| "step": 9625 |
| }, |
| { |
| "epoch": 17.96641791044776, |
| "grad_norm": 0.5299542546272278, |
| "learning_rate": 3.122068022777313e-05, |
| "loss": 0.3969, |
| "num_input_tokens_seen": 2740520, |
| "step": 9630 |
| }, |
| { |
| "epoch": 17.975746268656717, |
| "grad_norm": 0.42745083570480347, |
| "learning_rate": 3.093815189942523e-05, |
| "loss": 0.2228, |
| "num_input_tokens_seen": 2741800, |
| "step": 9635 |
| }, |
| { |
| "epoch": 17.98507462686567, |
| "grad_norm": 0.6629648804664612, |
| "learning_rate": 3.065686692617381e-05, |
| "loss": 0.2194, |
| "num_input_tokens_seen": 2743208, |
| "step": 9640 |
| }, |
| { |
| "epoch": 17.994402985074625, |
| "grad_norm": 0.4685826897621155, |
| "learning_rate": 3.037682605362879e-05, |
| "loss": 0.3369, |
| "num_input_tokens_seen": 2744424, |
| "step": 9645 |
| }, |
| { |
| "epoch": 18.0, |
| "eval_loss": 0.8991429805755615, |
| "eval_runtime": 4.2043, |
| "eval_samples_per_second": 56.609, |
| "eval_steps_per_second": 14.271, |
| "num_input_tokens_seen": 2744944, |
| "step": 9648 |
| }, |
| { |
| "epoch": 18.003731343283583, |
| "grad_norm": 0.5331962704658508, |
| "learning_rate": 3.0098030024102107e-05, |
| "loss": 0.1665, |
| "num_input_tokens_seen": 2745456, |
| "step": 9650 |
| }, |
| { |
| "epoch": 18.013059701492537, |
| "grad_norm": 0.40830346941947937, |
| "learning_rate": 2.9820479576606054e-05, |
| "loss": 0.2144, |
| "num_input_tokens_seen": 2747120, |
| "step": 9655 |
| }, |
| { |
| "epoch": 18.02238805970149, |
| "grad_norm": 0.6302569508552551, |
| "learning_rate": 2.954417544685112e-05, |
| "loss": 0.1607, |
| "num_input_tokens_seen": 2748528, |
| "step": 9660 |
| }, |
| { |
| "epoch": 18.03171641791045, |
| "grad_norm": 0.5230251550674438, |
| "learning_rate": 2.9269118367244385e-05, |
| "loss": 0.2282, |
| "num_input_tokens_seen": 2749744, |
| "step": 9665 |
| }, |
| { |
| "epoch": 18.041044776119403, |
| "grad_norm": 0.31676530838012695, |
| "learning_rate": 2.8995309066887076e-05, |
| "loss": 0.2259, |
| "num_input_tokens_seen": 2751280, |
| "step": 9670 |
| }, |
| { |
| "epoch": 18.050373134328357, |
| "grad_norm": 0.7035130858421326, |
| "learning_rate": 2.8722748271573064e-05, |
| "loss": 0.2814, |
| "num_input_tokens_seen": 2752656, |
| "step": 9675 |
| }, |
| { |
| "epoch": 18.059701492537314, |
| "grad_norm": 0.6157044172286987, |
| "learning_rate": 2.845143670378675e-05, |
| "loss": 0.2271, |
| "num_input_tokens_seen": 2754000, |
| "step": 9680 |
| }, |
| { |
| "epoch": 18.06902985074627, |
| "grad_norm": 0.22312885522842407, |
| "learning_rate": 2.8181375082701077e-05, |
| "loss": 0.241, |
| "num_input_tokens_seen": 2755440, |
| "step": 9685 |
| }, |
| { |
| "epoch": 18.078358208955223, |
| "grad_norm": 0.7368862628936768, |
| "learning_rate": 2.7912564124175866e-05, |
| "loss": 0.2336, |
| "num_input_tokens_seen": 2757008, |
| "step": 9690 |
| }, |
| { |
| "epoch": 18.08768656716418, |
| "grad_norm": 0.48901888728141785, |
| "learning_rate": 2.7645004540755525e-05, |
| "loss": 0.3062, |
| "num_input_tokens_seen": 2758224, |
| "step": 9695 |
| }, |
| { |
| "epoch": 18.097014925373134, |
| "grad_norm": 0.5629950165748596, |
| "learning_rate": 2.7378697041667676e-05, |
| "loss": 0.2175, |
| "num_input_tokens_seen": 2759600, |
| "step": 9700 |
| }, |
| { |
| "epoch": 18.10634328358209, |
| "grad_norm": 0.5760655999183655, |
| "learning_rate": 2.7113642332821043e-05, |
| "loss": 0.2869, |
| "num_input_tokens_seen": 2761072, |
| "step": 9705 |
| }, |
| { |
| "epoch": 18.115671641791046, |
| "grad_norm": 0.5357734560966492, |
| "learning_rate": 2.6849841116803218e-05, |
| "loss": 0.2062, |
| "num_input_tokens_seen": 2762608, |
| "step": 9710 |
| }, |
| { |
| "epoch": 18.125, |
| "grad_norm": 0.4605900049209595, |
| "learning_rate": 2.6587294092879354e-05, |
| "loss": 0.206, |
| "num_input_tokens_seen": 2764144, |
| "step": 9715 |
| }, |
| { |
| "epoch": 18.134328358208954, |
| "grad_norm": 0.8400527834892273, |
| "learning_rate": 2.632600195699014e-05, |
| "loss": 0.2989, |
| "num_input_tokens_seen": 2765424, |
| "step": 9720 |
| }, |
| { |
| "epoch": 18.14365671641791, |
| "grad_norm": 0.7730468511581421, |
| "learning_rate": 2.6065965401749602e-05, |
| "loss": 0.2935, |
| "num_input_tokens_seen": 2767024, |
| "step": 9725 |
| }, |
| { |
| "epoch": 18.152985074626866, |
| "grad_norm": 0.6380898952484131, |
| "learning_rate": 2.5807185116444033e-05, |
| "loss": 0.4573, |
| "num_input_tokens_seen": 2768432, |
| "step": 9730 |
| }, |
| { |
| "epoch": 18.16231343283582, |
| "grad_norm": 0.5300564765930176, |
| "learning_rate": 2.5549661787029167e-05, |
| "loss": 0.2651, |
| "num_input_tokens_seen": 2769968, |
| "step": 9735 |
| }, |
| { |
| "epoch": 18.171641791044777, |
| "grad_norm": 0.5630916357040405, |
| "learning_rate": 2.5293396096129406e-05, |
| "loss": 0.2353, |
| "num_input_tokens_seen": 2771280, |
| "step": 9740 |
| }, |
| { |
| "epoch": 18.18097014925373, |
| "grad_norm": 0.4602615237236023, |
| "learning_rate": 2.503838872303493e-05, |
| "loss": 0.2609, |
| "num_input_tokens_seen": 2772784, |
| "step": 9745 |
| }, |
| { |
| "epoch": 18.190298507462686, |
| "grad_norm": 0.33134764432907104, |
| "learning_rate": 2.4784640343701094e-05, |
| "loss": 0.1409, |
| "num_input_tokens_seen": 2774256, |
| "step": 9750 |
| }, |
| { |
| "epoch": 18.199626865671643, |
| "grad_norm": 0.597190797328949, |
| "learning_rate": 2.4532151630745403e-05, |
| "loss": 0.2962, |
| "num_input_tokens_seen": 2775856, |
| "step": 9755 |
| }, |
| { |
| "epoch": 18.208955223880597, |
| "grad_norm": 0.30851173400878906, |
| "learning_rate": 2.428092325344683e-05, |
| "loss": 0.1645, |
| "num_input_tokens_seen": 2777456, |
| "step": 9760 |
| }, |
| { |
| "epoch": 18.21828358208955, |
| "grad_norm": 0.5781135559082031, |
| "learning_rate": 2.4030955877743232e-05, |
| "loss": 0.2795, |
| "num_input_tokens_seen": 2778896, |
| "step": 9765 |
| }, |
| { |
| "epoch": 18.22761194029851, |
| "grad_norm": 0.6469405293464661, |
| "learning_rate": 2.3782250166229925e-05, |
| "loss": 0.274, |
| "num_input_tokens_seen": 2780400, |
| "step": 9770 |
| }, |
| { |
| "epoch": 18.236940298507463, |
| "grad_norm": 0.2876856327056885, |
| "learning_rate": 2.3534806778158113e-05, |
| "loss": 0.2551, |
| "num_input_tokens_seen": 2781968, |
| "step": 9775 |
| }, |
| { |
| "epoch": 18.246268656716417, |
| "grad_norm": 0.4758549630641937, |
| "learning_rate": 2.328862636943252e-05, |
| "loss": 0.1693, |
| "num_input_tokens_seen": 2783376, |
| "step": 9780 |
| }, |
| { |
| "epoch": 18.255597014925375, |
| "grad_norm": 0.8363102078437805, |
| "learning_rate": 2.3043709592610483e-05, |
| "loss": 0.1628, |
| "num_input_tokens_seen": 2784624, |
| "step": 9785 |
| }, |
| { |
| "epoch": 18.26492537313433, |
| "grad_norm": 0.6684785485267639, |
| "learning_rate": 2.280005709689964e-05, |
| "loss": 0.2323, |
| "num_input_tokens_seen": 2786128, |
| "step": 9790 |
| }, |
| { |
| "epoch": 18.274253731343283, |
| "grad_norm": 0.5245328545570374, |
| "learning_rate": 2.2557669528156245e-05, |
| "loss": 0.1595, |
| "num_input_tokens_seen": 2787440, |
| "step": 9795 |
| }, |
| { |
| "epoch": 18.28358208955224, |
| "grad_norm": 0.839958667755127, |
| "learning_rate": 2.2316547528883734e-05, |
| "loss": 0.4074, |
| "num_input_tokens_seen": 2788848, |
| "step": 9800 |
| }, |
| { |
| "epoch": 18.292910447761194, |
| "grad_norm": 0.4217190444469452, |
| "learning_rate": 2.207669173823068e-05, |
| "loss": 0.1433, |
| "num_input_tokens_seen": 2790448, |
| "step": 9805 |
| }, |
| { |
| "epoch": 18.30223880597015, |
| "grad_norm": 0.745557963848114, |
| "learning_rate": 2.1838102791989557e-05, |
| "loss": 0.2321, |
| "num_input_tokens_seen": 2791984, |
| "step": 9810 |
| }, |
| { |
| "epoch": 18.311567164179106, |
| "grad_norm": 0.7656036019325256, |
| "learning_rate": 2.160078132259452e-05, |
| "loss": 0.2635, |
| "num_input_tokens_seen": 2793360, |
| "step": 9815 |
| }, |
| { |
| "epoch": 18.32089552238806, |
| "grad_norm": 0.6078481078147888, |
| "learning_rate": 2.1364727959120088e-05, |
| "loss": 0.3943, |
| "num_input_tokens_seen": 2794608, |
| "step": 9820 |
| }, |
| { |
| "epoch": 18.330223880597014, |
| "grad_norm": 0.6271487474441528, |
| "learning_rate": 2.112994332727952e-05, |
| "loss": 0.23, |
| "num_input_tokens_seen": 2796048, |
| "step": 9825 |
| }, |
| { |
| "epoch": 18.33955223880597, |
| "grad_norm": 0.9702181220054626, |
| "learning_rate": 2.0896428049422765e-05, |
| "loss": 0.208, |
| "num_input_tokens_seen": 2797488, |
| "step": 9830 |
| }, |
| { |
| "epoch": 18.348880597014926, |
| "grad_norm": 0.5267967581748962, |
| "learning_rate": 2.0664182744535132e-05, |
| "loss": 0.2428, |
| "num_input_tokens_seen": 2798864, |
| "step": 9835 |
| }, |
| { |
| "epoch": 18.35820895522388, |
| "grad_norm": 0.4196302890777588, |
| "learning_rate": 2.0433208028235672e-05, |
| "loss": 0.1731, |
| "num_input_tokens_seen": 2800304, |
| "step": 9840 |
| }, |
| { |
| "epoch": 18.367537313432837, |
| "grad_norm": 0.8147212862968445, |
| "learning_rate": 2.020350451277536e-05, |
| "loss": 0.1959, |
| "num_input_tokens_seen": 2801584, |
| "step": 9845 |
| }, |
| { |
| "epoch": 18.37686567164179, |
| "grad_norm": 1.0722787380218506, |
| "learning_rate": 1.99750728070357e-05, |
| "loss": 0.4208, |
| "num_input_tokens_seen": 2802960, |
| "step": 9850 |
| }, |
| { |
| "epoch": 18.386194029850746, |
| "grad_norm": 0.6142206788063049, |
| "learning_rate": 1.9747913516526715e-05, |
| "loss": 0.3148, |
| "num_input_tokens_seen": 2804272, |
| "step": 9855 |
| }, |
| { |
| "epoch": 18.395522388059703, |
| "grad_norm": 0.9456634521484375, |
| "learning_rate": 1.952202724338592e-05, |
| "loss": 0.3173, |
| "num_input_tokens_seen": 2805520, |
| "step": 9860 |
| }, |
| { |
| "epoch": 18.404850746268657, |
| "grad_norm": 0.8432918787002563, |
| "learning_rate": 1.9297414586376184e-05, |
| "loss": 0.2247, |
| "num_input_tokens_seen": 2806960, |
| "step": 9865 |
| }, |
| { |
| "epoch": 18.41417910447761, |
| "grad_norm": 0.683684766292572, |
| "learning_rate": 1.907407614088441e-05, |
| "loss": 0.2114, |
| "num_input_tokens_seen": 2808432, |
| "step": 9870 |
| }, |
| { |
| "epoch": 18.423507462686565, |
| "grad_norm": 0.9373521208763123, |
| "learning_rate": 1.885201249891988e-05, |
| "loss": 0.2357, |
| "num_input_tokens_seen": 2810000, |
| "step": 9875 |
| }, |
| { |
| "epoch": 18.432835820895523, |
| "grad_norm": 0.38543543219566345, |
| "learning_rate": 1.8631224249112953e-05, |
| "loss": 0.2834, |
| "num_input_tokens_seen": 2811568, |
| "step": 9880 |
| }, |
| { |
| "epoch": 18.442164179104477, |
| "grad_norm": 0.4837745428085327, |
| "learning_rate": 1.841171197671293e-05, |
| "loss": 0.2146, |
| "num_input_tokens_seen": 2812816, |
| "step": 9885 |
| }, |
| { |
| "epoch": 18.451492537313435, |
| "grad_norm": 0.30426064133644104, |
| "learning_rate": 1.8193476263587084e-05, |
| "loss": 0.2971, |
| "num_input_tokens_seen": 2814480, |
| "step": 9890 |
| }, |
| { |
| "epoch": 18.46082089552239, |
| "grad_norm": 0.5340366959571838, |
| "learning_rate": 1.7976517688218786e-05, |
| "loss": 0.1861, |
| "num_input_tokens_seen": 2815856, |
| "step": 9895 |
| }, |
| { |
| "epoch": 18.470149253731343, |
| "grad_norm": 0.672731876373291, |
| "learning_rate": 1.7760836825706117e-05, |
| "loss": 0.328, |
| "num_input_tokens_seen": 2817328, |
| "step": 9900 |
| }, |
| { |
| "epoch": 18.479477611940297, |
| "grad_norm": 0.6022140383720398, |
| "learning_rate": 1.7546434247760147e-05, |
| "loss": 0.2673, |
| "num_input_tokens_seen": 2818608, |
| "step": 9905 |
| }, |
| { |
| "epoch": 18.488805970149254, |
| "grad_norm": 0.6115740537643433, |
| "learning_rate": 1.7333310522703814e-05, |
| "loss": 0.3055, |
| "num_input_tokens_seen": 2820080, |
| "step": 9910 |
| }, |
| { |
| "epoch": 18.49813432835821, |
| "grad_norm": 0.43116679787635803, |
| "learning_rate": 1.7121466215469893e-05, |
| "loss": 0.1638, |
| "num_input_tokens_seen": 2821296, |
| "step": 9915 |
| }, |
| { |
| "epoch": 18.507462686567163, |
| "grad_norm": 0.8029485940933228, |
| "learning_rate": 1.6910901887599917e-05, |
| "loss": 0.3756, |
| "num_input_tokens_seen": 2822704, |
| "step": 9920 |
| }, |
| { |
| "epoch": 18.51679104477612, |
| "grad_norm": 0.3002271354198456, |
| "learning_rate": 1.6701618097242522e-05, |
| "loss": 0.2539, |
| "num_input_tokens_seen": 2824432, |
| "step": 9925 |
| }, |
| { |
| "epoch": 18.526119402985074, |
| "grad_norm": 0.35039186477661133, |
| "learning_rate": 1.649361539915206e-05, |
| "loss": 0.1985, |
| "num_input_tokens_seen": 2825872, |
| "step": 9930 |
| }, |
| { |
| "epoch": 18.53544776119403, |
| "grad_norm": 0.6990351676940918, |
| "learning_rate": 1.628689434468694e-05, |
| "loss": 0.2503, |
| "num_input_tokens_seen": 2827184, |
| "step": 9935 |
| }, |
| { |
| "epoch": 18.544776119402986, |
| "grad_norm": 0.5490748882293701, |
| "learning_rate": 1.6081455481808226e-05, |
| "loss": 0.2516, |
| "num_input_tokens_seen": 2828720, |
| "step": 9940 |
| }, |
| { |
| "epoch": 18.55410447761194, |
| "grad_norm": 0.48887062072753906, |
| "learning_rate": 1.5877299355078534e-05, |
| "loss": 0.2402, |
| "num_input_tokens_seen": 2829968, |
| "step": 9945 |
| }, |
| { |
| "epoch": 18.563432835820894, |
| "grad_norm": 0.36090975999832153, |
| "learning_rate": 1.567442650565998e-05, |
| "loss": 0.1997, |
| "num_input_tokens_seen": 2831504, |
| "step": 9950 |
| }, |
| { |
| "epoch": 18.57276119402985, |
| "grad_norm": 0.32836994528770447, |
| "learning_rate": 1.5472837471313174e-05, |
| "loss": 0.1798, |
| "num_input_tokens_seen": 2832816, |
| "step": 9955 |
| }, |
| { |
| "epoch": 18.582089552238806, |
| "grad_norm": 0.5255143642425537, |
| "learning_rate": 1.5272532786395733e-05, |
| "loss": 0.1834, |
| "num_input_tokens_seen": 2834352, |
| "step": 9960 |
| }, |
| { |
| "epoch": 18.59141791044776, |
| "grad_norm": 0.6501192450523376, |
| "learning_rate": 1.5073512981860716e-05, |
| "loss": 0.2873, |
| "num_input_tokens_seen": 2835856, |
| "step": 9965 |
| }, |
| { |
| "epoch": 18.600746268656717, |
| "grad_norm": 0.6325224041938782, |
| "learning_rate": 1.4875778585255572e-05, |
| "loss": 0.2008, |
| "num_input_tokens_seen": 2837200, |
| "step": 9970 |
| }, |
| { |
| "epoch": 18.61007462686567, |
| "grad_norm": 0.42452865839004517, |
| "learning_rate": 1.4679330120720036e-05, |
| "loss": 0.1948, |
| "num_input_tokens_seen": 2838512, |
| "step": 9975 |
| }, |
| { |
| "epoch": 18.619402985074625, |
| "grad_norm": 0.2303168773651123, |
| "learning_rate": 1.4484168108985619e-05, |
| "loss": 0.1488, |
| "num_input_tokens_seen": 2840048, |
| "step": 9980 |
| }, |
| { |
| "epoch": 18.628731343283583, |
| "grad_norm": 0.7124575972557068, |
| "learning_rate": 1.429029306737345e-05, |
| "loss": 0.2161, |
| "num_input_tokens_seen": 2841360, |
| "step": 9985 |
| }, |
| { |
| "epoch": 18.638059701492537, |
| "grad_norm": 0.6218158006668091, |
| "learning_rate": 1.4097705509793612e-05, |
| "loss": 0.1754, |
| "num_input_tokens_seen": 2842672, |
| "step": 9990 |
| }, |
| { |
| "epoch": 18.64738805970149, |
| "grad_norm": 0.4380490183830261, |
| "learning_rate": 1.3906405946743028e-05, |
| "loss": 0.1943, |
| "num_input_tokens_seen": 2844144, |
| "step": 9995 |
| }, |
| { |
| "epoch": 18.65671641791045, |
| "grad_norm": 0.7543335556983948, |
| "learning_rate": 1.371639488530474e-05, |
| "loss": 0.2378, |
| "num_input_tokens_seen": 2845616, |
| "step": 10000 |
| }, |
| { |
| "epoch": 18.666044776119403, |
| "grad_norm": 0.6214406490325928, |
| "learning_rate": 1.3527672829146465e-05, |
| "loss": 0.3568, |
| "num_input_tokens_seen": 2847152, |
| "step": 10005 |
| }, |
| { |
| "epoch": 18.675373134328357, |
| "grad_norm": 0.8499645590782166, |
| "learning_rate": 1.3340240278518656e-05, |
| "loss": 0.1902, |
| "num_input_tokens_seen": 2848464, |
| "step": 10010 |
| }, |
| { |
| "epoch": 18.684701492537314, |
| "grad_norm": 0.31233519315719604, |
| "learning_rate": 1.3154097730254055e-05, |
| "loss": 0.1963, |
| "num_input_tokens_seen": 2850000, |
| "step": 10015 |
| }, |
| { |
| "epoch": 18.69402985074627, |
| "grad_norm": 0.5519477725028992, |
| "learning_rate": 1.2969245677765806e-05, |
| "loss": 0.2531, |
| "num_input_tokens_seen": 2851376, |
| "step": 10020 |
| }, |
| { |
| "epoch": 18.703358208955223, |
| "grad_norm": 0.6598090529441833, |
| "learning_rate": 1.2785684611046344e-05, |
| "loss": 0.1737, |
| "num_input_tokens_seen": 2852816, |
| "step": 10025 |
| }, |
| { |
| "epoch": 18.71268656716418, |
| "grad_norm": 0.41588467359542847, |
| "learning_rate": 1.2603415016665954e-05, |
| "loss": 0.2888, |
| "num_input_tokens_seen": 2854224, |
| "step": 10030 |
| }, |
| { |
| "epoch": 18.722014925373134, |
| "grad_norm": 0.9295200109481812, |
| "learning_rate": 1.24224373777716e-05, |
| "loss": 0.2034, |
| "num_input_tokens_seen": 2855536, |
| "step": 10035 |
| }, |
| { |
| "epoch": 18.73134328358209, |
| "grad_norm": 0.4858807623386383, |
| "learning_rate": 1.2242752174085824e-05, |
| "loss": 0.2194, |
| "num_input_tokens_seen": 2856976, |
| "step": 10040 |
| }, |
| { |
| "epoch": 18.740671641791046, |
| "grad_norm": 0.8125934600830078, |
| "learning_rate": 1.2064359881905018e-05, |
| "loss": 0.3345, |
| "num_input_tokens_seen": 2858480, |
| "step": 10045 |
| }, |
| { |
| "epoch": 18.75, |
| "grad_norm": 0.5733120441436768, |
| "learning_rate": 1.188726097409859e-05, |
| "loss": 0.194, |
| "num_input_tokens_seen": 2859888, |
| "step": 10050 |
| }, |
| { |
| "epoch": 18.759328358208954, |
| "grad_norm": 0.4118148982524872, |
| "learning_rate": 1.1711455920107306e-05, |
| "loss": 0.2242, |
| "num_input_tokens_seen": 2861488, |
| "step": 10055 |
| }, |
| { |
| "epoch": 18.76865671641791, |
| "grad_norm": 0.5921308994293213, |
| "learning_rate": 1.1536945185942615e-05, |
| "loss": 0.2137, |
| "num_input_tokens_seen": 2862864, |
| "step": 10060 |
| }, |
| { |
| "epoch": 18.777985074626866, |
| "grad_norm": 0.41935235261917114, |
| "learning_rate": 1.1363729234184827e-05, |
| "loss": 0.2219, |
| "num_input_tokens_seen": 2864400, |
| "step": 10065 |
| }, |
| { |
| "epoch": 18.78731343283582, |
| "grad_norm": 0.9974322319030762, |
| "learning_rate": 1.1191808523982217e-05, |
| "loss": 0.2545, |
| "num_input_tokens_seen": 2865648, |
| "step": 10070 |
| }, |
| { |
| "epoch": 18.796641791044777, |
| "grad_norm": 0.46737658977508545, |
| "learning_rate": 1.1021183511049748e-05, |
| "loss": 0.2224, |
| "num_input_tokens_seen": 2867120, |
| "step": 10075 |
| }, |
| { |
| "epoch": 18.80597014925373, |
| "grad_norm": 0.35736414790153503, |
| "learning_rate": 1.0851854647667803e-05, |
| "loss": 0.2584, |
| "num_input_tokens_seen": 2868464, |
| "step": 10080 |
| }, |
| { |
| "epoch": 18.815298507462686, |
| "grad_norm": 0.7535758018493652, |
| "learning_rate": 1.0683822382681008e-05, |
| "loss": 0.275, |
| "num_input_tokens_seen": 2869904, |
| "step": 10085 |
| }, |
| { |
| "epoch": 18.824626865671643, |
| "grad_norm": 0.32048988342285156, |
| "learning_rate": 1.051708716149713e-05, |
| "loss": 0.1825, |
| "num_input_tokens_seen": 2871248, |
| "step": 10090 |
| }, |
| { |
| "epoch": 18.833955223880597, |
| "grad_norm": 0.43274596333503723, |
| "learning_rate": 1.0351649426085852e-05, |
| "loss": 0.2755, |
| "num_input_tokens_seen": 2872496, |
| "step": 10095 |
| }, |
| { |
| "epoch": 18.84328358208955, |
| "grad_norm": 0.6226127743721008, |
| "learning_rate": 1.0187509614977387e-05, |
| "loss": 0.3063, |
| "num_input_tokens_seen": 2873872, |
| "step": 10100 |
| }, |
| { |
| "epoch": 18.85261194029851, |
| "grad_norm": 0.39895033836364746, |
| "learning_rate": 1.0024668163261641e-05, |
| "loss": 0.2383, |
| "num_input_tokens_seen": 2875376, |
| "step": 10105 |
| }, |
| { |
| "epoch": 18.861940298507463, |
| "grad_norm": 0.6880218982696533, |
| "learning_rate": 9.863125502587056e-06, |
| "loss": 0.277, |
| "num_input_tokens_seen": 2876848, |
| "step": 10110 |
| }, |
| { |
| "epoch": 18.871268656716417, |
| "grad_norm": 0.7496880292892456, |
| "learning_rate": 9.702882061159046e-06, |
| "loss": 0.1987, |
| "num_input_tokens_seen": 2878128, |
| "step": 10115 |
| }, |
| { |
| "epoch": 18.880597014925375, |
| "grad_norm": 0.6792168617248535, |
| "learning_rate": 9.543938263739338e-06, |
| "loss": 0.1663, |
| "num_input_tokens_seen": 2879728, |
| "step": 10120 |
| }, |
| { |
| "epoch": 18.88992537313433, |
| "grad_norm": 0.8453506827354431, |
| "learning_rate": 9.386294531644634e-06, |
| "loss": 0.2542, |
| "num_input_tokens_seen": 2881104, |
| "step": 10125 |
| }, |
| { |
| "epoch": 18.899253731343283, |
| "grad_norm": 0.7367790937423706, |
| "learning_rate": 9.229951282745507e-06, |
| "loss": 0.1282, |
| "num_input_tokens_seen": 2882576, |
| "step": 10130 |
| }, |
| { |
| "epoch": 18.90858208955224, |
| "grad_norm": 0.5763761401176453, |
| "learning_rate": 9.07490893146523e-06, |
| "loss": 0.2398, |
| "num_input_tokens_seen": 2884112, |
| "step": 10135 |
| }, |
| { |
| "epoch": 18.917910447761194, |
| "grad_norm": 0.7230084538459778, |
| "learning_rate": 8.921167888778836e-06, |
| "loss": 0.2872, |
| "num_input_tokens_seen": 2885360, |
| "step": 10140 |
| }, |
| { |
| "epoch": 18.92723880597015, |
| "grad_norm": 0.5383572578430176, |
| "learning_rate": 8.768728562211947e-06, |
| "loss": 0.1634, |
| "num_input_tokens_seen": 2886896, |
| "step": 10145 |
| }, |
| { |
| "epoch": 18.936567164179106, |
| "grad_norm": 0.20805750787258148, |
| "learning_rate": 8.617591355839672e-06, |
| "loss": 0.1707, |
| "num_input_tokens_seen": 2888240, |
| "step": 10150 |
| }, |
| { |
| "epoch": 18.94589552238806, |
| "grad_norm": 0.4978165328502655, |
| "learning_rate": 8.467756670285432e-06, |
| "loss": 0.3313, |
| "num_input_tokens_seen": 2889616, |
| "step": 10155 |
| }, |
| { |
| "epoch": 18.955223880597014, |
| "grad_norm": 0.7477753162384033, |
| "learning_rate": 8.319224902720302e-06, |
| "loss": 0.2436, |
| "num_input_tokens_seen": 2890960, |
| "step": 10160 |
| }, |
| { |
| "epoch": 18.96455223880597, |
| "grad_norm": 0.6375783681869507, |
| "learning_rate": 8.171996446861396e-06, |
| "loss": 0.4074, |
| "num_input_tokens_seen": 2892368, |
| "step": 10165 |
| }, |
| { |
| "epoch": 18.973880597014926, |
| "grad_norm": 0.7193611860275269, |
| "learning_rate": 8.026071692971315e-06, |
| "loss": 0.248, |
| "num_input_tokens_seen": 2893808, |
| "step": 10170 |
| }, |
| { |
| "epoch": 18.98320895522388, |
| "grad_norm": 0.4070175886154175, |
| "learning_rate": 7.881451027856645e-06, |
| "loss": 0.2247, |
| "num_input_tokens_seen": 2895280, |
| "step": 10175 |
| }, |
| { |
| "epoch": 18.992537313432837, |
| "grad_norm": 0.9722506999969482, |
| "learning_rate": 7.738134834867461e-06, |
| "loss": 0.3073, |
| "num_input_tokens_seen": 2896720, |
| "step": 10180 |
| }, |
| { |
| "epoch": 19.0, |
| "eval_loss": 0.9123162627220154, |
| "eval_runtime": 4.1942, |
| "eval_samples_per_second": 56.745, |
| "eval_steps_per_second": 14.305, |
| "num_input_tokens_seen": 2897552, |
| "step": 10184 |
| }, |
| { |
| "epoch": 19.00186567164179, |
| "grad_norm": 0.4594847559928894, |
| "learning_rate": 7.59612349389599e-06, |
| "loss": 0.1506, |
| "num_input_tokens_seen": 2897840, |
| "step": 10185 |
| }, |
| { |
| "epoch": 19.011194029850746, |
| "grad_norm": 0.45773735642433167, |
| "learning_rate": 7.455417381375451e-06, |
| "loss": 0.1724, |
| "num_input_tokens_seen": 2899248, |
| "step": 10190 |
| }, |
| { |
| "epoch": 19.020522388059703, |
| "grad_norm": 0.9145358204841614, |
| "learning_rate": 7.316016870279441e-06, |
| "loss": 0.1997, |
| "num_input_tokens_seen": 2900464, |
| "step": 10195 |
| }, |
| { |
| "epoch": 19.029850746268657, |
| "grad_norm": 0.5567941069602966, |
| "learning_rate": 7.177922330120712e-06, |
| "loss": 0.1263, |
| "num_input_tokens_seen": 2901840, |
| "step": 10200 |
| }, |
| { |
| "epoch": 19.03917910447761, |
| "grad_norm": 0.6734683513641357, |
| "learning_rate": 7.041134126950233e-06, |
| "loss": 0.2547, |
| "num_input_tokens_seen": 2903248, |
| "step": 10205 |
| }, |
| { |
| "epoch": 19.04850746268657, |
| "grad_norm": 0.7478652000427246, |
| "learning_rate": 6.9056526233562955e-06, |
| "loss": 0.3607, |
| "num_input_tokens_seen": 2904944, |
| "step": 10210 |
| }, |
| { |
| "epoch": 19.057835820895523, |
| "grad_norm": 1.0115318298339844, |
| "learning_rate": 6.771478178463353e-06, |
| "loss": 0.2637, |
| "num_input_tokens_seen": 2906256, |
| "step": 10215 |
| }, |
| { |
| "epoch": 19.067164179104477, |
| "grad_norm": 0.5048120021820068, |
| "learning_rate": 6.638611147931406e-06, |
| "loss": 0.1724, |
| "num_input_tokens_seen": 2907760, |
| "step": 10220 |
| }, |
| { |
| "epoch": 19.07649253731343, |
| "grad_norm": 0.8928444385528564, |
| "learning_rate": 6.507051883954618e-06, |
| "loss": 0.2378, |
| "num_input_tokens_seen": 2909040, |
| "step": 10225 |
| }, |
| { |
| "epoch": 19.08582089552239, |
| "grad_norm": 0.5665672421455383, |
| "learning_rate": 6.376800735260757e-06, |
| "loss": 0.2593, |
| "num_input_tokens_seen": 2910448, |
| "step": 10230 |
| }, |
| { |
| "epoch": 19.095149253731343, |
| "grad_norm": 0.5140485763549805, |
| "learning_rate": 6.247858047110145e-06, |
| "loss": 0.1728, |
| "num_input_tokens_seen": 2911792, |
| "step": 10235 |
| }, |
| { |
| "epoch": 19.104477611940297, |
| "grad_norm": 0.5509104132652283, |
| "learning_rate": 6.1202241612947075e-06, |
| "loss": 0.2249, |
| "num_input_tokens_seen": 2913072, |
| "step": 10240 |
| }, |
| { |
| "epoch": 19.113805970149254, |
| "grad_norm": 0.6295739412307739, |
| "learning_rate": 5.993899416137039e-06, |
| "loss": 0.1709, |
| "num_input_tokens_seen": 2914480, |
| "step": 10245 |
| }, |
| { |
| "epoch": 19.12313432835821, |
| "grad_norm": 0.8722971081733704, |
| "learning_rate": 5.868884146489617e-06, |
| "loss": 0.5177, |
| "num_input_tokens_seen": 2915920, |
| "step": 10250 |
| }, |
| { |
| "epoch": 19.132462686567163, |
| "grad_norm": 0.5105932354927063, |
| "learning_rate": 5.7451786837339205e-06, |
| "loss": 0.2387, |
| "num_input_tokens_seen": 2917360, |
| "step": 10255 |
| }, |
| { |
| "epoch": 19.14179104477612, |
| "grad_norm": 0.5840516090393066, |
| "learning_rate": 5.622783355779315e-06, |
| "loss": 0.2359, |
| "num_input_tokens_seen": 2918896, |
| "step": 10260 |
| }, |
| { |
| "epoch": 19.151119402985074, |
| "grad_norm": 0.5876781940460205, |
| "learning_rate": 5.501698487062445e-06, |
| "loss": 0.1998, |
| "num_input_tokens_seen": 2920368, |
| "step": 10265 |
| }, |
| { |
| "epoch": 19.16044776119403, |
| "grad_norm": 0.38335317373275757, |
| "learning_rate": 5.3819243985463454e-06, |
| "loss": 0.1661, |
| "num_input_tokens_seen": 2921872, |
| "step": 10270 |
| }, |
| { |
| "epoch": 19.169776119402986, |
| "grad_norm": 0.7055832147598267, |
| "learning_rate": 5.263461407719438e-06, |
| "loss": 0.2888, |
| "num_input_tokens_seen": 2923312, |
| "step": 10275 |
| }, |
| { |
| "epoch": 19.17910447761194, |
| "grad_norm": 0.8590061664581299, |
| "learning_rate": 5.146309828594875e-06, |
| "loss": 0.3048, |
| "num_input_tokens_seen": 2924912, |
| "step": 10280 |
| }, |
| { |
| "epoch": 19.188432835820894, |
| "grad_norm": 0.5439656376838684, |
| "learning_rate": 5.030469971709472e-06, |
| "loss": 0.4523, |
| "num_input_tokens_seen": 2926416, |
| "step": 10285 |
| }, |
| { |
| "epoch": 19.19776119402985, |
| "grad_norm": 0.44218385219573975, |
| "learning_rate": 4.91594214412322e-06, |
| "loss": 0.1621, |
| "num_input_tokens_seen": 2927696, |
| "step": 10290 |
| }, |
| { |
| "epoch": 19.207089552238806, |
| "grad_norm": 0.523102879524231, |
| "learning_rate": 4.80272664941811e-06, |
| "loss": 0.1128, |
| "num_input_tokens_seen": 2929232, |
| "step": 10295 |
| }, |
| { |
| "epoch": 19.21641791044776, |
| "grad_norm": 0.8231872320175171, |
| "learning_rate": 4.690823787697473e-06, |
| "loss": 0.2353, |
| "num_input_tokens_seen": 2930608, |
| "step": 10300 |
| }, |
| { |
| "epoch": 19.225746268656717, |
| "grad_norm": 0.4466846287250519, |
| "learning_rate": 4.5802338555854254e-06, |
| "loss": 0.1663, |
| "num_input_tokens_seen": 2932240, |
| "step": 10305 |
| }, |
| { |
| "epoch": 19.23507462686567, |
| "grad_norm": 0.4112730920314789, |
| "learning_rate": 4.4709571462256956e-06, |
| "loss": 0.2029, |
| "num_input_tokens_seen": 2933712, |
| "step": 10310 |
| }, |
| { |
| "epoch": 19.244402985074625, |
| "grad_norm": 0.6375637650489807, |
| "learning_rate": 4.36299394928108e-06, |
| "loss": 0.173, |
| "num_input_tokens_seen": 2934960, |
| "step": 10315 |
| }, |
| { |
| "epoch": 19.253731343283583, |
| "grad_norm": 0.3967975080013275, |
| "learning_rate": 4.256344550932434e-06, |
| "loss": 0.2052, |
| "num_input_tokens_seen": 2936368, |
| "step": 10320 |
| }, |
| { |
| "epoch": 19.263059701492537, |
| "grad_norm": 0.41094884276390076, |
| "learning_rate": 4.1510092338784e-06, |
| "loss": 0.1815, |
| "num_input_tokens_seen": 2937936, |
| "step": 10325 |
| }, |
| { |
| "epoch": 19.27238805970149, |
| "grad_norm": 0.48285865783691406, |
| "learning_rate": 4.046988277334185e-06, |
| "loss": 0.3622, |
| "num_input_tokens_seen": 2939248, |
| "step": 10330 |
| }, |
| { |
| "epoch": 19.28171641791045, |
| "grad_norm": 0.43332603573799133, |
| "learning_rate": 3.944281957030893e-06, |
| "loss": 0.1298, |
| "num_input_tokens_seen": 2940688, |
| "step": 10335 |
| }, |
| { |
| "epoch": 19.291044776119403, |
| "grad_norm": 0.716364324092865, |
| "learning_rate": 3.842890545215028e-06, |
| "loss": 0.1903, |
| "num_input_tokens_seen": 2942032, |
| "step": 10340 |
| }, |
| { |
| "epoch": 19.300373134328357, |
| "grad_norm": 0.7156405448913574, |
| "learning_rate": 3.742814310647602e-06, |
| "loss": 0.2553, |
| "num_input_tokens_seen": 2943472, |
| "step": 10345 |
| }, |
| { |
| "epoch": 19.309701492537314, |
| "grad_norm": 0.7371711134910583, |
| "learning_rate": 3.6440535186034184e-06, |
| "loss": 0.3324, |
| "num_input_tokens_seen": 2944880, |
| "step": 10350 |
| }, |
| { |
| "epoch": 19.31902985074627, |
| "grad_norm": 0.6731292605400085, |
| "learning_rate": 3.5466084308704017e-06, |
| "loss": 0.2659, |
| "num_input_tokens_seen": 2946352, |
| "step": 10355 |
| }, |
| { |
| "epoch": 19.328358208955223, |
| "grad_norm": 0.7745460867881775, |
| "learning_rate": 3.4504793057489326e-06, |
| "loss": 0.3356, |
| "num_input_tokens_seen": 2947568, |
| "step": 10360 |
| }, |
| { |
| "epoch": 19.33768656716418, |
| "grad_norm": 0.33525073528289795, |
| "learning_rate": 3.3556663980511826e-06, |
| "loss": 0.1609, |
| "num_input_tokens_seen": 2949104, |
| "step": 10365 |
| }, |
| { |
| "epoch": 19.347014925373134, |
| "grad_norm": 0.3880627155303955, |
| "learning_rate": 3.2621699591001695e-06, |
| "loss": 0.2441, |
| "num_input_tokens_seen": 2950448, |
| "step": 10370 |
| }, |
| { |
| "epoch": 19.35634328358209, |
| "grad_norm": 0.885238766670227, |
| "learning_rate": 3.1699902367295917e-06, |
| "loss": 0.2596, |
| "num_input_tokens_seen": 2951760, |
| "step": 10375 |
| }, |
| { |
| "epoch": 19.365671641791046, |
| "grad_norm": 0.6533879041671753, |
| "learning_rate": 3.079127475282717e-06, |
| "loss": 0.2539, |
| "num_input_tokens_seen": 2953200, |
| "step": 10380 |
| }, |
| { |
| "epoch": 19.375, |
| "grad_norm": 0.48250091075897217, |
| "learning_rate": 2.9895819156119943e-06, |
| "loss": 0.1706, |
| "num_input_tokens_seen": 2954512, |
| "step": 10385 |
| }, |
| { |
| "epoch": 19.384328358208954, |
| "grad_norm": 0.6290906071662903, |
| "learning_rate": 2.9013537950782765e-06, |
| "loss": 0.238, |
| "num_input_tokens_seen": 2955856, |
| "step": 10390 |
| }, |
| { |
| "epoch": 19.39365671641791, |
| "grad_norm": 0.40463876724243164, |
| "learning_rate": 2.8144433475502105e-06, |
| "loss": 0.2206, |
| "num_input_tokens_seen": 2957328, |
| "step": 10395 |
| }, |
| { |
| "epoch": 19.402985074626866, |
| "grad_norm": 0.5499265193939209, |
| "learning_rate": 2.728850803403793e-06, |
| "loss": 0.1399, |
| "num_input_tokens_seen": 2958768, |
| "step": 10400 |
| }, |
| { |
| "epoch": 19.41231343283582, |
| "grad_norm": 0.7918092012405396, |
| "learning_rate": 2.644576389521425e-06, |
| "loss": 0.3264, |
| "num_input_tokens_seen": 2960272, |
| "step": 10405 |
| }, |
| { |
| "epoch": 19.421641791044777, |
| "grad_norm": 0.38886696100234985, |
| "learning_rate": 2.5616203292916916e-06, |
| "loss": 0.2346, |
| "num_input_tokens_seen": 2961648, |
| "step": 10410 |
| }, |
| { |
| "epoch": 19.43097014925373, |
| "grad_norm": 0.4961099624633789, |
| "learning_rate": 2.479982842608475e-06, |
| "loss": 0.091, |
| "num_input_tokens_seen": 2963088, |
| "step": 10415 |
| }, |
| { |
| "epoch": 19.440298507462686, |
| "grad_norm": 0.9479005336761475, |
| "learning_rate": 2.3996641458704504e-06, |
| "loss": 0.371, |
| "num_input_tokens_seen": 2964432, |
| "step": 10420 |
| }, |
| { |
| "epoch": 19.449626865671643, |
| "grad_norm": 0.568725049495697, |
| "learning_rate": 2.320664451980592e-06, |
| "loss": 0.2244, |
| "num_input_tokens_seen": 2965808, |
| "step": 10425 |
| }, |
| { |
| "epoch": 19.458955223880597, |
| "grad_norm": 0.46647632122039795, |
| "learning_rate": 2.2429839703456136e-06, |
| "loss": 0.2795, |
| "num_input_tokens_seen": 2967248, |
| "step": 10430 |
| }, |
| { |
| "epoch": 19.46828358208955, |
| "grad_norm": 0.4957118034362793, |
| "learning_rate": 2.1666229068753594e-06, |
| "loss": 0.2252, |
| "num_input_tokens_seen": 2968528, |
| "step": 10435 |
| }, |
| { |
| "epoch": 19.47761194029851, |
| "grad_norm": 0.8878422975540161, |
| "learning_rate": 2.091581463981973e-06, |
| "loss": 0.2318, |
| "num_input_tokens_seen": 2969968, |
| "step": 10440 |
| }, |
| { |
| "epoch": 19.486940298507463, |
| "grad_norm": 0.5628931522369385, |
| "learning_rate": 2.0178598405800606e-06, |
| "loss": 0.2825, |
| "num_input_tokens_seen": 2971472, |
| "step": 10445 |
| }, |
| { |
| "epoch": 19.496268656716417, |
| "grad_norm": 0.4205546975135803, |
| "learning_rate": 1.945458232085473e-06, |
| "loss": 0.228, |
| "num_input_tokens_seen": 2972880, |
| "step": 10450 |
| }, |
| { |
| "epoch": 19.505597014925375, |
| "grad_norm": 0.8694109320640564, |
| "learning_rate": 1.8743768304151366e-06, |
| "loss": 0.434, |
| "num_input_tokens_seen": 2974192, |
| "step": 10455 |
| }, |
| { |
| "epoch": 19.51492537313433, |
| "grad_norm": 0.941114604473114, |
| "learning_rate": 1.8046158239864996e-06, |
| "loss": 0.1921, |
| "num_input_tokens_seen": 2975632, |
| "step": 10460 |
| }, |
| { |
| "epoch": 19.524253731343283, |
| "grad_norm": 0.6050335764884949, |
| "learning_rate": 1.7361753977169215e-06, |
| "loss": 0.2097, |
| "num_input_tokens_seen": 2977168, |
| "step": 10465 |
| }, |
| { |
| "epoch": 19.53358208955224, |
| "grad_norm": 0.6744943857192993, |
| "learning_rate": 1.6690557330233947e-06, |
| "loss": 0.3264, |
| "num_input_tokens_seen": 2978736, |
| "step": 10470 |
| }, |
| { |
| "epoch": 19.542910447761194, |
| "grad_norm": 0.43842124938964844, |
| "learning_rate": 1.6032570078217678e-06, |
| "loss": 0.2519, |
| "num_input_tokens_seen": 2980176, |
| "step": 10475 |
| }, |
| { |
| "epoch": 19.55223880597015, |
| "grad_norm": 0.395082026720047, |
| "learning_rate": 1.5387793965265794e-06, |
| "loss": 0.2272, |
| "num_input_tokens_seen": 2981488, |
| "step": 10480 |
| }, |
| { |
| "epoch": 19.561567164179106, |
| "grad_norm": 0.28870466351509094, |
| "learning_rate": 1.4756230700503914e-06, |
| "loss": 0.1393, |
| "num_input_tokens_seen": 2982800, |
| "step": 10485 |
| }, |
| { |
| "epoch": 19.57089552238806, |
| "grad_norm": 0.4630226492881775, |
| "learning_rate": 1.4137881958034006e-06, |
| "loss": 0.1739, |
| "num_input_tokens_seen": 2984208, |
| "step": 10490 |
| }, |
| { |
| "epoch": 19.580223880597014, |
| "grad_norm": 0.7136488556861877, |
| "learning_rate": 1.3532749376929944e-06, |
| "loss": 0.3847, |
| "num_input_tokens_seen": 2985616, |
| "step": 10495 |
| }, |
| { |
| "epoch": 19.58955223880597, |
| "grad_norm": 0.5953083634376526, |
| "learning_rate": 1.2940834561233627e-06, |
| "loss": 0.2892, |
| "num_input_tokens_seen": 2987056, |
| "step": 10500 |
| }, |
| { |
| "epoch": 19.598880597014926, |
| "grad_norm": 0.5428503155708313, |
| "learning_rate": 1.236213907994943e-06, |
| "loss": 0.2513, |
| "num_input_tokens_seen": 2988592, |
| "step": 10505 |
| }, |
| { |
| "epoch": 19.60820895522388, |
| "grad_norm": 0.8605926632881165, |
| "learning_rate": 1.1796664467041973e-06, |
| "loss": 0.225, |
| "num_input_tokens_seen": 2989904, |
| "step": 10510 |
| }, |
| { |
| "epoch": 19.617537313432837, |
| "grad_norm": 0.4664938449859619, |
| "learning_rate": 1.1244412221429468e-06, |
| "loss": 0.1427, |
| "num_input_tokens_seen": 2991440, |
| "step": 10515 |
| }, |
| { |
| "epoch": 19.62686567164179, |
| "grad_norm": 0.7807358503341675, |
| "learning_rate": 1.0705383806982606e-06, |
| "loss": 0.2393, |
| "num_input_tokens_seen": 2992848, |
| "step": 10520 |
| }, |
| { |
| "epoch": 19.636194029850746, |
| "grad_norm": 0.6703472137451172, |
| "learning_rate": 1.017958065251845e-06, |
| "loss": 0.4173, |
| "num_input_tokens_seen": 2994096, |
| "step": 10525 |
| }, |
| { |
| "epoch": 19.645522388059703, |
| "grad_norm": 0.5756611227989197, |
| "learning_rate": 9.66700415179822e-07, |
| "loss": 0.2562, |
| "num_input_tokens_seen": 2995440, |
| "step": 10530 |
| }, |
| { |
| "epoch": 19.654850746268657, |
| "grad_norm": 0.7052452564239502, |
| "learning_rate": 9.16765566352229e-07, |
| "loss": 0.3594, |
| "num_input_tokens_seen": 2996816, |
| "step": 10535 |
| }, |
| { |
| "epoch": 19.66417910447761, |
| "grad_norm": 0.4399753212928772, |
| "learning_rate": 8.681536511327415e-07, |
| "loss": 0.2093, |
| "num_input_tokens_seen": 2998352, |
| "step": 10540 |
| }, |
| { |
| "epoch": 19.673507462686565, |
| "grad_norm": 0.439879834651947, |
| "learning_rate": 8.208647983782846e-07, |
| "loss": 0.2758, |
| "num_input_tokens_seen": 2999792, |
| "step": 10545 |
| }, |
| { |
| "epoch": 19.682835820895523, |
| "grad_norm": 0.4704674482345581, |
| "learning_rate": 7.748991334387557e-07, |
| "loss": 0.1757, |
| "num_input_tokens_seen": 3001072, |
| "step": 10550 |
| }, |
| { |
| "epoch": 19.692164179104477, |
| "grad_norm": 0.5811406970024109, |
| "learning_rate": 7.302567781565794e-07, |
| "loss": 0.2236, |
| "num_input_tokens_seen": 3002576, |
| "step": 10555 |
| }, |
| { |
| "epoch": 19.701492537313435, |
| "grad_norm": 0.28484269976615906, |
| "learning_rate": 6.869378508664315e-07, |
| "loss": 0.1104, |
| "num_input_tokens_seen": 3003984, |
| "step": 10560 |
| }, |
| { |
| "epoch": 19.71082089552239, |
| "grad_norm": 0.7560864686965942, |
| "learning_rate": 6.449424663950155e-07, |
| "loss": 0.1164, |
| "num_input_tokens_seen": 3005392, |
| "step": 10565 |
| }, |
| { |
| "epoch": 19.720149253731343, |
| "grad_norm": 0.6477477550506592, |
| "learning_rate": 6.042707360606192e-07, |
| "loss": 0.3706, |
| "num_input_tokens_seen": 3006896, |
| "step": 10570 |
| }, |
| { |
| "epoch": 19.729477611940297, |
| "grad_norm": 0.6175541281700134, |
| "learning_rate": 5.64922767673004e-07, |
| "loss": 0.2673, |
| "num_input_tokens_seen": 3008240, |
| "step": 10575 |
| }, |
| { |
| "epoch": 19.738805970149254, |
| "grad_norm": 0.3923323452472687, |
| "learning_rate": 5.268986655327934e-07, |
| "loss": 0.2049, |
| "num_input_tokens_seen": 3010000, |
| "step": 10580 |
| }, |
| { |
| "epoch": 19.74813432835821, |
| "grad_norm": 0.3566945195198059, |
| "learning_rate": 4.901985304315848e-07, |
| "loss": 0.2348, |
| "num_input_tokens_seen": 3011344, |
| "step": 10585 |
| }, |
| { |
| "epoch": 19.757462686567163, |
| "grad_norm": 0.57565838098526, |
| "learning_rate": 4.548224596513939e-07, |
| "loss": 0.3252, |
| "num_input_tokens_seen": 3012880, |
| "step": 10590 |
| }, |
| { |
| "epoch": 19.76679104477612, |
| "grad_norm": 0.3473455309867859, |
| "learning_rate": 4.207705469645995e-07, |
| "loss": 0.2089, |
| "num_input_tokens_seen": 3014448, |
| "step": 10595 |
| }, |
| { |
| "epoch": 19.776119402985074, |
| "grad_norm": 0.44248923659324646, |
| "learning_rate": 3.8804288263349917e-07, |
| "loss": 0.1524, |
| "num_input_tokens_seen": 3015984, |
| "step": 10600 |
| }, |
| { |
| "epoch": 19.78544776119403, |
| "grad_norm": 0.729060709476471, |
| "learning_rate": 3.56639553410143e-07, |
| "loss": 0.1925, |
| "num_input_tokens_seen": 3017392, |
| "step": 10605 |
| }, |
| { |
| "epoch": 19.794776119402986, |
| "grad_norm": 0.7432746291160583, |
| "learning_rate": 3.265606425363332e-07, |
| "loss": 0.3039, |
| "num_input_tokens_seen": 3018896, |
| "step": 10610 |
| }, |
| { |
| "epoch": 19.80410447761194, |
| "grad_norm": 0.34453150629997253, |
| "learning_rate": 2.97806229743014e-07, |
| "loss": 0.0832, |
| "num_input_tokens_seen": 3020496, |
| "step": 10615 |
| }, |
| { |
| "epoch": 19.813432835820894, |
| "grad_norm": 1.0268018245697021, |
| "learning_rate": 2.703763912502155e-07, |
| "loss": 0.1898, |
| "num_input_tokens_seen": 3021840, |
| "step": 10620 |
| }, |
| { |
| "epoch": 19.82276119402985, |
| "grad_norm": 0.6820981502532959, |
| "learning_rate": 2.4427119976705436e-07, |
| "loss": 0.2509, |
| "num_input_tokens_seen": 3023216, |
| "step": 10625 |
| }, |
| { |
| "epoch": 19.832089552238806, |
| "grad_norm": 0.4682190716266632, |
| "learning_rate": 2.1949072449123363e-07, |
| "loss": 0.1979, |
| "num_input_tokens_seen": 3024720, |
| "step": 10630 |
| }, |
| { |
| "epoch": 19.84141791044776, |
| "grad_norm": 0.5495747923851013, |
| "learning_rate": 1.9603503110904308e-07, |
| "loss": 0.268, |
| "num_input_tokens_seen": 3026160, |
| "step": 10635 |
| }, |
| { |
| "epoch": 19.850746268656717, |
| "grad_norm": 0.9342331886291504, |
| "learning_rate": 1.739041817951925e-07, |
| "loss": 0.3326, |
| "num_input_tokens_seen": 3027632, |
| "step": 10640 |
| }, |
| { |
| "epoch": 19.86007462686567, |
| "grad_norm": 0.5672308802604675, |
| "learning_rate": 1.5309823521242328e-07, |
| "loss": 0.1947, |
| "num_input_tokens_seen": 3028848, |
| "step": 10645 |
| }, |
| { |
| "epoch": 19.869402985074625, |
| "grad_norm": 0.5239127278327942, |
| "learning_rate": 1.3361724651167473e-07, |
| "loss": 0.2189, |
| "num_input_tokens_seen": 3030288, |
| "step": 10650 |
| }, |
| { |
| "epoch": 19.878731343283583, |
| "grad_norm": 0.5062621235847473, |
| "learning_rate": 1.1546126733180673e-07, |
| "loss": 0.1734, |
| "num_input_tokens_seen": 3031696, |
| "step": 10655 |
| }, |
| { |
| "epoch": 19.888059701492537, |
| "grad_norm": 0.8205133676528931, |
| "learning_rate": 9.863034579926655e-08, |
| "loss": 0.1619, |
| "num_input_tokens_seen": 3032976, |
| "step": 10660 |
| }, |
| { |
| "epoch": 19.89738805970149, |
| "grad_norm": 0.5459089875221252, |
| "learning_rate": 8.312452652831093e-08, |
| "loss": 0.2341, |
| "num_input_tokens_seen": 3034512, |
| "step": 10665 |
| }, |
| { |
| "epoch": 19.90671641791045, |
| "grad_norm": 0.5834176540374756, |
| "learning_rate": 6.894385062056197e-08, |
| "loss": 0.1828, |
| "num_input_tokens_seen": 3036016, |
| "step": 10670 |
| }, |
| { |
| "epoch": 19.916044776119403, |
| "grad_norm": 0.5604172348976135, |
| "learning_rate": 5.6088355665229187e-08, |
| "loss": 0.1543, |
| "num_input_tokens_seen": 3037296, |
| "step": 10675 |
| }, |
| { |
| "epoch": 19.925373134328357, |
| "grad_norm": 0.8893586993217468, |
| "learning_rate": 4.4558075738609926e-08, |
| "loss": 0.2943, |
| "num_input_tokens_seen": 3038736, |
| "step": 10680 |
| }, |
| { |
| "epoch": 19.934701492537314, |
| "grad_norm": 0.833718478679657, |
| "learning_rate": 3.4353041404477926e-08, |
| "loss": 0.2967, |
| "num_input_tokens_seen": 3040048, |
| "step": 10685 |
| }, |
| { |
| "epoch": 19.94402985074627, |
| "grad_norm": 0.4309261739253998, |
| "learning_rate": 2.5473279713472685e-08, |
| "loss": 0.1819, |
| "num_input_tokens_seen": 3041424, |
| "step": 10690 |
| }, |
| { |
| "epoch": 19.953358208955223, |
| "grad_norm": 0.5390483736991882, |
| "learning_rate": 1.7918814203432555e-08, |
| "loss": 0.2043, |
| "num_input_tokens_seen": 3042960, |
| "step": 10695 |
| }, |
| { |
| "epoch": 19.96268656716418, |
| "grad_norm": 0.4581194818019867, |
| "learning_rate": 1.1689664899283691e-08, |
| "loss": 0.2257, |
| "num_input_tokens_seen": 3044336, |
| "step": 10700 |
| }, |
| { |
| "epoch": 19.972014925373134, |
| "grad_norm": 0.27400490641593933, |
| "learning_rate": 6.78584831270701e-09, |
| "loss": 0.1333, |
| "num_input_tokens_seen": 3045904, |
| "step": 10705 |
| }, |
| { |
| "epoch": 19.98134328358209, |
| "grad_norm": 0.8952294588088989, |
| "learning_rate": 3.2073774424157263e-09, |
| "loss": 0.5205, |
| "num_input_tokens_seen": 3047248, |
| "step": 10710 |
| }, |
| { |
| "epoch": 19.990671641791046, |
| "grad_norm": 0.7515770196914673, |
| "learning_rate": 9.54261773933318e-10, |
| "loss": 0.2663, |
| "num_input_tokens_seen": 3048848, |
| "step": 10715 |
| }, |
| { |
| "epoch": 20.0, |
| "grad_norm": 0.20985476672649384, |
| "learning_rate": 2.650727970454625e-11, |
| "loss": 0.2325, |
| "num_input_tokens_seen": 3049984, |
| "step": 10720 |
| }, |
| { |
| "epoch": 20.0, |
| "eval_loss": 0.9178647398948669, |
| "eval_runtime": 4.1975, |
| "eval_samples_per_second": 56.701, |
| "eval_steps_per_second": 14.294, |
| "num_input_tokens_seen": 3049984, |
| "step": 10720 |
| }, |
| { |
| "epoch": 20.0, |
| "num_input_tokens_seen": 3049984, |
| "step": 10720, |
| "total_flos": 1.3733940102483149e+17, |
| "train_loss": 0.44957640529957726, |
| "train_runtime": 1754.1922, |
| "train_samples_per_second": 24.41, |
| "train_steps_per_second": 6.111 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 10720, |
| "num_input_tokens_seen": 3049984, |
| "num_train_epochs": 20, |
| "save_steps": 536, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.3733940102483149e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|