| { |
| "best_global_step": 1072, |
| "best_metric": 0.5668801665306091, |
| "best_model_checkpoint": "saves/lora/llama-3-8b-instruct/train_conala_1754507516/checkpoint-1072", |
| "epoch": 10.0, |
| "eval_steps": 268, |
| "global_step": 5360, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.009328358208955223, |
| "grad_norm": 3.3039438724517822, |
| "learning_rate": 3.7313432835820895e-07, |
| "loss": 2.9969, |
| "num_input_tokens_seen": 1216, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.018656716417910446, |
| "grad_norm": 2.344093084335327, |
| "learning_rate": 8.395522388059702e-07, |
| "loss": 2.8165, |
| "num_input_tokens_seen": 2528, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.027985074626865673, |
| "grad_norm": 2.859999179840088, |
| "learning_rate": 1.3059701492537314e-06, |
| "loss": 2.9084, |
| "num_input_tokens_seen": 4160, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.03731343283582089, |
| "grad_norm": 3.245666027069092, |
| "learning_rate": 1.7723880597014925e-06, |
| "loss": 2.9545, |
| "num_input_tokens_seen": 5504, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.04664179104477612, |
| "grad_norm": 2.4401042461395264, |
| "learning_rate": 2.238805970149254e-06, |
| "loss": 2.9278, |
| "num_input_tokens_seen": 6912, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.055970149253731345, |
| "grad_norm": 3.616495132446289, |
| "learning_rate": 2.705223880597015e-06, |
| "loss": 2.5233, |
| "num_input_tokens_seen": 8544, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.06529850746268656, |
| "grad_norm": 3.2376859188079834, |
| "learning_rate": 3.171641791044776e-06, |
| "loss": 3.2097, |
| "num_input_tokens_seen": 9696, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.07462686567164178, |
| "grad_norm": 4.033511161804199, |
| "learning_rate": 3.6380597014925376e-06, |
| "loss": 3.1844, |
| "num_input_tokens_seen": 10976, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.08395522388059702, |
| "grad_norm": 2.769594192504883, |
| "learning_rate": 4.1044776119402985e-06, |
| "loss": 2.9462, |
| "num_input_tokens_seen": 12320, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.09328358208955224, |
| "grad_norm": 4.4679951667785645, |
| "learning_rate": 4.57089552238806e-06, |
| "loss": 2.3983, |
| "num_input_tokens_seen": 13920, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.10261194029850747, |
| "grad_norm": 2.432633638381958, |
| "learning_rate": 5.037313432835821e-06, |
| "loss": 2.5835, |
| "num_input_tokens_seen": 15456, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.11194029850746269, |
| "grad_norm": 2.4959990978240967, |
| "learning_rate": 5.503731343283583e-06, |
| "loss": 2.6624, |
| "num_input_tokens_seen": 16864, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.12126865671641791, |
| "grad_norm": 3.6922364234924316, |
| "learning_rate": 5.970149253731343e-06, |
| "loss": 2.4356, |
| "num_input_tokens_seen": 18336, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.13059701492537312, |
| "grad_norm": 4.336235523223877, |
| "learning_rate": 6.436567164179105e-06, |
| "loss": 2.3425, |
| "num_input_tokens_seen": 19712, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.13992537313432835, |
| "grad_norm": 4.3843512535095215, |
| "learning_rate": 6.902985074626867e-06, |
| "loss": 2.138, |
| "num_input_tokens_seen": 21152, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.14925373134328357, |
| "grad_norm": 5.932689189910889, |
| "learning_rate": 7.369402985074628e-06, |
| "loss": 1.8921, |
| "num_input_tokens_seen": 22560, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.15858208955223882, |
| "grad_norm": 5.42878532409668, |
| "learning_rate": 7.835820895522389e-06, |
| "loss": 1.7154, |
| "num_input_tokens_seen": 23968, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.16791044776119404, |
| "grad_norm": 3.9380810260772705, |
| "learning_rate": 8.30223880597015e-06, |
| "loss": 1.2577, |
| "num_input_tokens_seen": 25600, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.17723880597014927, |
| "grad_norm": 4.0588459968566895, |
| "learning_rate": 8.768656716417911e-06, |
| "loss": 1.2147, |
| "num_input_tokens_seen": 26912, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.1865671641791045, |
| "grad_norm": 6.410980224609375, |
| "learning_rate": 9.235074626865672e-06, |
| "loss": 1.1433, |
| "num_input_tokens_seen": 28512, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.1958955223880597, |
| "grad_norm": 6.021280288696289, |
| "learning_rate": 9.701492537313434e-06, |
| "loss": 1.4927, |
| "num_input_tokens_seen": 29888, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.20522388059701493, |
| "grad_norm": 4.134380340576172, |
| "learning_rate": 1.0167910447761195e-05, |
| "loss": 1.1815, |
| "num_input_tokens_seen": 31296, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.21455223880597016, |
| "grad_norm": 3.0576136112213135, |
| "learning_rate": 1.0634328358208955e-05, |
| "loss": 0.9471, |
| "num_input_tokens_seen": 32736, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.22388059701492538, |
| "grad_norm": 3.78719425201416, |
| "learning_rate": 1.1100746268656717e-05, |
| "loss": 1.2346, |
| "num_input_tokens_seen": 34240, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.2332089552238806, |
| "grad_norm": 3.5856218338012695, |
| "learning_rate": 1.1567164179104478e-05, |
| "loss": 1.0844, |
| "num_input_tokens_seen": 35648, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.24253731343283583, |
| "grad_norm": 3.030383348464966, |
| "learning_rate": 1.203358208955224e-05, |
| "loss": 1.0139, |
| "num_input_tokens_seen": 36992, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.251865671641791, |
| "grad_norm": 2.8542256355285645, |
| "learning_rate": 1.25e-05, |
| "loss": 0.7634, |
| "num_input_tokens_seen": 38528, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.26119402985074625, |
| "grad_norm": 3.358400344848633, |
| "learning_rate": 1.2966417910447761e-05, |
| "loss": 1.0169, |
| "num_input_tokens_seen": 39840, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.27052238805970147, |
| "grad_norm": 3.1415629386901855, |
| "learning_rate": 1.3432835820895523e-05, |
| "loss": 0.7976, |
| "num_input_tokens_seen": 41216, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.2798507462686567, |
| "grad_norm": 2.910473346710205, |
| "learning_rate": 1.3899253731343284e-05, |
| "loss": 0.7605, |
| "num_input_tokens_seen": 42624, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.2891791044776119, |
| "grad_norm": 2.240058183670044, |
| "learning_rate": 1.4365671641791046e-05, |
| "loss": 0.6211, |
| "num_input_tokens_seen": 44192, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.29850746268656714, |
| "grad_norm": 2.0009899139404297, |
| "learning_rate": 1.4832089552238807e-05, |
| "loss": 0.91, |
| "num_input_tokens_seen": 45504, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.30783582089552236, |
| "grad_norm": 2.056093692779541, |
| "learning_rate": 1.529850746268657e-05, |
| "loss": 0.5572, |
| "num_input_tokens_seen": 47360, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.31716417910447764, |
| "grad_norm": 2.57108736038208, |
| "learning_rate": 1.5764925373134328e-05, |
| "loss": 0.7706, |
| "num_input_tokens_seen": 48640, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.32649253731343286, |
| "grad_norm": 3.3332395553588867, |
| "learning_rate": 1.623134328358209e-05, |
| "loss": 0.8388, |
| "num_input_tokens_seen": 50016, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.3358208955223881, |
| "grad_norm": 3.2255704402923584, |
| "learning_rate": 1.6697761194029852e-05, |
| "loss": 0.928, |
| "num_input_tokens_seen": 51296, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.3451492537313433, |
| "grad_norm": 3.521524667739868, |
| "learning_rate": 1.716417910447761e-05, |
| "loss": 1.022, |
| "num_input_tokens_seen": 52576, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.35447761194029853, |
| "grad_norm": 1.4818261861801147, |
| "learning_rate": 1.7630597014925373e-05, |
| "loss": 0.604, |
| "num_input_tokens_seen": 54080, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.36380597014925375, |
| "grad_norm": 3.140883207321167, |
| "learning_rate": 1.8097014925373135e-05, |
| "loss": 1.0696, |
| "num_input_tokens_seen": 55392, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.373134328358209, |
| "grad_norm": 3.2878828048706055, |
| "learning_rate": 1.8563432835820898e-05, |
| "loss": 0.7571, |
| "num_input_tokens_seen": 56864, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.3824626865671642, |
| "grad_norm": 2.6628379821777344, |
| "learning_rate": 1.9029850746268656e-05, |
| "loss": 0.7685, |
| "num_input_tokens_seen": 58368, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.3917910447761194, |
| "grad_norm": 2.1965172290802, |
| "learning_rate": 1.949626865671642e-05, |
| "loss": 0.9628, |
| "num_input_tokens_seen": 59744, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.40111940298507465, |
| "grad_norm": 3.5606253147125244, |
| "learning_rate": 1.996268656716418e-05, |
| "loss": 0.6318, |
| "num_input_tokens_seen": 61152, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.41044776119402987, |
| "grad_norm": 3.8230621814727783, |
| "learning_rate": 2.042910447761194e-05, |
| "loss": 0.7146, |
| "num_input_tokens_seen": 62496, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.4197761194029851, |
| "grad_norm": 3.6789462566375732, |
| "learning_rate": 2.0895522388059702e-05, |
| "loss": 0.6272, |
| "num_input_tokens_seen": 63808, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.4291044776119403, |
| "grad_norm": 3.801642656326294, |
| "learning_rate": 2.1361940298507464e-05, |
| "loss": 0.6831, |
| "num_input_tokens_seen": 65088, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.43843283582089554, |
| "grad_norm": 3.4296956062316895, |
| "learning_rate": 2.1828358208955223e-05, |
| "loss": 0.7715, |
| "num_input_tokens_seen": 66496, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.44776119402985076, |
| "grad_norm": 2.187211751937866, |
| "learning_rate": 2.2294776119402985e-05, |
| "loss": 0.6603, |
| "num_input_tokens_seen": 67872, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.457089552238806, |
| "grad_norm": 2.6200978755950928, |
| "learning_rate": 2.2761194029850747e-05, |
| "loss": 0.877, |
| "num_input_tokens_seen": 69216, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.4664179104477612, |
| "grad_norm": 2.322495698928833, |
| "learning_rate": 2.3227611940298506e-05, |
| "loss": 0.8545, |
| "num_input_tokens_seen": 70816, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.47574626865671643, |
| "grad_norm": 2.191216230392456, |
| "learning_rate": 2.369402985074627e-05, |
| "loss": 0.661, |
| "num_input_tokens_seen": 72224, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.48507462686567165, |
| "grad_norm": 2.5148890018463135, |
| "learning_rate": 2.416044776119403e-05, |
| "loss": 1.0506, |
| "num_input_tokens_seen": 73600, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.4944029850746269, |
| "grad_norm": 2.9455857276916504, |
| "learning_rate": 2.4626865671641793e-05, |
| "loss": 0.6251, |
| "num_input_tokens_seen": 75168, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.5, |
| "eval_loss": 0.6427921652793884, |
| "eval_runtime": 2.8361, |
| "eval_samples_per_second": 83.917, |
| "eval_steps_per_second": 21.156, |
| "num_input_tokens_seen": 75936, |
| "step": 268 |
| }, |
| { |
| "epoch": 0.503731343283582, |
| "grad_norm": 3.210460662841797, |
| "learning_rate": 2.5093283582089555e-05, |
| "loss": 0.7251, |
| "num_input_tokens_seen": 76544, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.5130597014925373, |
| "grad_norm": 1.885954737663269, |
| "learning_rate": 2.5559701492537314e-05, |
| "loss": 0.5867, |
| "num_input_tokens_seen": 78080, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.5223880597014925, |
| "grad_norm": 4.004876613616943, |
| "learning_rate": 2.6026119402985076e-05, |
| "loss": 0.6115, |
| "num_input_tokens_seen": 79520, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.5317164179104478, |
| "grad_norm": 2.1550943851470947, |
| "learning_rate": 2.6492537313432835e-05, |
| "loss": 0.8722, |
| "num_input_tokens_seen": 80992, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.5410447761194029, |
| "grad_norm": 2.8556361198425293, |
| "learning_rate": 2.69589552238806e-05, |
| "loss": 0.717, |
| "num_input_tokens_seen": 82496, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.5503731343283582, |
| "grad_norm": 2.432227849960327, |
| "learning_rate": 2.742537313432836e-05, |
| "loss": 0.5667, |
| "num_input_tokens_seen": 83712, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.5597014925373134, |
| "grad_norm": 2.1414365768432617, |
| "learning_rate": 2.789179104477612e-05, |
| "loss": 0.6744, |
| "num_input_tokens_seen": 85120, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.5690298507462687, |
| "grad_norm": 3.6815836429595947, |
| "learning_rate": 2.835820895522388e-05, |
| "loss": 0.7555, |
| "num_input_tokens_seen": 86560, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.5783582089552238, |
| "grad_norm": 2.964207649230957, |
| "learning_rate": 2.8824626865671646e-05, |
| "loss": 0.5645, |
| "num_input_tokens_seen": 87936, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.5876865671641791, |
| "grad_norm": 2.524484395980835, |
| "learning_rate": 2.92910447761194e-05, |
| "loss": 0.639, |
| "num_input_tokens_seen": 89312, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.5970149253731343, |
| "grad_norm": 4.354158401489258, |
| "learning_rate": 2.9757462686567167e-05, |
| "loss": 0.6987, |
| "num_input_tokens_seen": 90560, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.6063432835820896, |
| "grad_norm": 2.2236135005950928, |
| "learning_rate": 3.0223880597014926e-05, |
| "loss": 0.8123, |
| "num_input_tokens_seen": 91936, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.6156716417910447, |
| "grad_norm": 2.927922248840332, |
| "learning_rate": 3.069029850746269e-05, |
| "loss": 0.4901, |
| "num_input_tokens_seen": 93344, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.625, |
| "grad_norm": 2.627314805984497, |
| "learning_rate": 3.115671641791045e-05, |
| "loss": 0.8231, |
| "num_input_tokens_seen": 94560, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.6343283582089553, |
| "grad_norm": 2.316772699356079, |
| "learning_rate": 3.162313432835821e-05, |
| "loss": 0.5376, |
| "num_input_tokens_seen": 96032, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.6436567164179104, |
| "grad_norm": 1.9249050617218018, |
| "learning_rate": 3.208955223880597e-05, |
| "loss": 0.7791, |
| "num_input_tokens_seen": 97504, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.6529850746268657, |
| "grad_norm": 2.550285816192627, |
| "learning_rate": 3.2555970149253733e-05, |
| "loss": 0.8282, |
| "num_input_tokens_seen": 98720, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.6623134328358209, |
| "grad_norm": 4.007699966430664, |
| "learning_rate": 3.302238805970149e-05, |
| "loss": 0.6517, |
| "num_input_tokens_seen": 100064, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.6716417910447762, |
| "grad_norm": 2.926208257675171, |
| "learning_rate": 3.348880597014926e-05, |
| "loss": 0.5923, |
| "num_input_tokens_seen": 101536, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.6809701492537313, |
| "grad_norm": 2.118560791015625, |
| "learning_rate": 3.395522388059701e-05, |
| "loss": 0.7388, |
| "num_input_tokens_seen": 103072, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.6902985074626866, |
| "grad_norm": 4.18084192276001, |
| "learning_rate": 3.4421641791044776e-05, |
| "loss": 0.9092, |
| "num_input_tokens_seen": 104416, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.6996268656716418, |
| "grad_norm": 2.2685937881469727, |
| "learning_rate": 3.488805970149254e-05, |
| "loss": 0.6046, |
| "num_input_tokens_seen": 105952, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.7089552238805971, |
| "grad_norm": 2.770325183868408, |
| "learning_rate": 3.53544776119403e-05, |
| "loss": 0.7064, |
| "num_input_tokens_seen": 107520, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.7182835820895522, |
| "grad_norm": 2.5074832439422607, |
| "learning_rate": 3.582089552238806e-05, |
| "loss": 0.8754, |
| "num_input_tokens_seen": 109248, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.7276119402985075, |
| "grad_norm": 4.230381488800049, |
| "learning_rate": 3.6287313432835824e-05, |
| "loss": 0.7411, |
| "num_input_tokens_seen": 110752, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.7369402985074627, |
| "grad_norm": 2.8285441398620605, |
| "learning_rate": 3.675373134328358e-05, |
| "loss": 0.5648, |
| "num_input_tokens_seen": 112320, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.746268656716418, |
| "grad_norm": 2.2659432888031006, |
| "learning_rate": 3.722014925373135e-05, |
| "loss": 0.7996, |
| "num_input_tokens_seen": 113792, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.7555970149253731, |
| "grad_norm": 2.894287586212158, |
| "learning_rate": 3.7686567164179104e-05, |
| "loss": 0.6009, |
| "num_input_tokens_seen": 115328, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.7649253731343284, |
| "grad_norm": 3.6457371711730957, |
| "learning_rate": 3.8152985074626867e-05, |
| "loss": 0.4918, |
| "num_input_tokens_seen": 116800, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.7742537313432836, |
| "grad_norm": 2.8405661582946777, |
| "learning_rate": 3.861940298507463e-05, |
| "loss": 0.5695, |
| "num_input_tokens_seen": 118592, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.7835820895522388, |
| "grad_norm": 5.264738082885742, |
| "learning_rate": 3.908582089552239e-05, |
| "loss": 0.5826, |
| "num_input_tokens_seen": 119936, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.792910447761194, |
| "grad_norm": 3.0115413665771484, |
| "learning_rate": 3.9552238805970146e-05, |
| "loss": 0.8592, |
| "num_input_tokens_seen": 121280, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.8022388059701493, |
| "grad_norm": 1.878330111503601, |
| "learning_rate": 4.0018656716417915e-05, |
| "loss": 0.5756, |
| "num_input_tokens_seen": 122944, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.8115671641791045, |
| "grad_norm": 2.4789698123931885, |
| "learning_rate": 4.048507462686567e-05, |
| "loss": 0.5513, |
| "num_input_tokens_seen": 124320, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.8208955223880597, |
| "grad_norm": 2.8399338722229004, |
| "learning_rate": 4.095149253731343e-05, |
| "loss": 0.6889, |
| "num_input_tokens_seen": 125888, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.8302238805970149, |
| "grad_norm": 2.9758617877960205, |
| "learning_rate": 4.1417910447761195e-05, |
| "loss": 0.5956, |
| "num_input_tokens_seen": 127296, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.8395522388059702, |
| "grad_norm": 3.118220567703247, |
| "learning_rate": 4.188432835820896e-05, |
| "loss": 0.6192, |
| "num_input_tokens_seen": 128512, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.8488805970149254, |
| "grad_norm": 1.7349121570587158, |
| "learning_rate": 4.235074626865671e-05, |
| "loss": 0.5956, |
| "num_input_tokens_seen": 130048, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.8582089552238806, |
| "grad_norm": 3.488770008087158, |
| "learning_rate": 4.281716417910448e-05, |
| "loss": 0.5617, |
| "num_input_tokens_seen": 131424, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.8675373134328358, |
| "grad_norm": 2.119168996810913, |
| "learning_rate": 4.328358208955224e-05, |
| "loss": 0.5395, |
| "num_input_tokens_seen": 132704, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.8768656716417911, |
| "grad_norm": 4.024372100830078, |
| "learning_rate": 4.375e-05, |
| "loss": 0.7535, |
| "num_input_tokens_seen": 134048, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.8861940298507462, |
| "grad_norm": 2.1798813343048096, |
| "learning_rate": 4.421641791044777e-05, |
| "loss": 0.568, |
| "num_input_tokens_seen": 135392, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.8955223880597015, |
| "grad_norm": 2.1467628479003906, |
| "learning_rate": 4.4682835820895524e-05, |
| "loss": 0.8717, |
| "num_input_tokens_seen": 136832, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.9048507462686567, |
| "grad_norm": 4.489499092102051, |
| "learning_rate": 4.5149253731343286e-05, |
| "loss": 0.738, |
| "num_input_tokens_seen": 138336, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.914179104477612, |
| "grad_norm": 3.1459500789642334, |
| "learning_rate": 4.561567164179105e-05, |
| "loss": 0.5385, |
| "num_input_tokens_seen": 139840, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.9235074626865671, |
| "grad_norm": 3.7459237575531006, |
| "learning_rate": 4.608208955223881e-05, |
| "loss": 0.5492, |
| "num_input_tokens_seen": 141344, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.9328358208955224, |
| "grad_norm": 2.8763620853424072, |
| "learning_rate": 4.6548507462686566e-05, |
| "loss": 0.5847, |
| "num_input_tokens_seen": 142848, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.9421641791044776, |
| "grad_norm": 2.4034414291381836, |
| "learning_rate": 4.7014925373134335e-05, |
| "loss": 0.538, |
| "num_input_tokens_seen": 144384, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.9514925373134329, |
| "grad_norm": 3.182269334793091, |
| "learning_rate": 4.748134328358209e-05, |
| "loss": 0.5046, |
| "num_input_tokens_seen": 145760, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.960820895522388, |
| "grad_norm": 2.230666399002075, |
| "learning_rate": 4.794776119402985e-05, |
| "loss": 0.4282, |
| "num_input_tokens_seen": 147168, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.9701492537313433, |
| "grad_norm": 2.445336103439331, |
| "learning_rate": 4.8414179104477615e-05, |
| "loss": 0.3602, |
| "num_input_tokens_seen": 148480, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.9794776119402985, |
| "grad_norm": 2.517080307006836, |
| "learning_rate": 4.888059701492538e-05, |
| "loss": 0.5692, |
| "num_input_tokens_seen": 149920, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.9888059701492538, |
| "grad_norm": 3.1791677474975586, |
| "learning_rate": 4.934701492537314e-05, |
| "loss": 0.7844, |
| "num_input_tokens_seen": 151264, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.9981343283582089, |
| "grad_norm": 2.657611131668091, |
| "learning_rate": 4.98134328358209e-05, |
| "loss": 0.8223, |
| "num_input_tokens_seen": 152608, |
| "step": 535 |
| }, |
| { |
| "epoch": 1.0, |
| "eval_loss": 0.5907514095306396, |
| "eval_runtime": 2.8334, |
| "eval_samples_per_second": 83.997, |
| "eval_steps_per_second": 21.176, |
| "num_input_tokens_seen": 152672, |
| "step": 536 |
| }, |
| { |
| "epoch": 1.007462686567164, |
| "grad_norm": 1.8119145631790161, |
| "learning_rate": 4.999995228691131e-05, |
| "loss": 0.7166, |
| "num_input_tokens_seen": 153760, |
| "step": 540 |
| }, |
| { |
| "epoch": 1.0167910447761195, |
| "grad_norm": 3.3788795471191406, |
| "learning_rate": 4.999966070758437e-05, |
| "loss": 0.6792, |
| "num_input_tokens_seen": 155104, |
| "step": 545 |
| }, |
| { |
| "epoch": 1.0261194029850746, |
| "grad_norm": 2.5415327548980713, |
| "learning_rate": 4.999910405928983e-05, |
| "loss": 0.7882, |
| "num_input_tokens_seen": 156416, |
| "step": 550 |
| }, |
| { |
| "epoch": 1.0354477611940298, |
| "grad_norm": 1.9734182357788086, |
| "learning_rate": 4.9998282347929784e-05, |
| "loss": 0.7253, |
| "num_input_tokens_seen": 157888, |
| "step": 555 |
| }, |
| { |
| "epoch": 1.044776119402985, |
| "grad_norm": 2.9964828491210938, |
| "learning_rate": 4.999719558221674e-05, |
| "loss": 0.5567, |
| "num_input_tokens_seen": 159392, |
| "step": 560 |
| }, |
| { |
| "epoch": 1.0541044776119404, |
| "grad_norm": 3.4440269470214844, |
| "learning_rate": 4.999584377367359e-05, |
| "loss": 0.6371, |
| "num_input_tokens_seen": 160864, |
| "step": 565 |
| }, |
| { |
| "epoch": 1.0634328358208955, |
| "grad_norm": 4.4826765060424805, |
| "learning_rate": 4.9994226936633415e-05, |
| "loss": 0.745, |
| "num_input_tokens_seen": 162240, |
| "step": 570 |
| }, |
| { |
| "epoch": 1.0727611940298507, |
| "grad_norm": 2.468851089477539, |
| "learning_rate": 4.999234508823938e-05, |
| "loss": 0.6465, |
| "num_input_tokens_seen": 163616, |
| "step": 575 |
| }, |
| { |
| "epoch": 1.0820895522388059, |
| "grad_norm": 4.831507205963135, |
| "learning_rate": 4.999019824844455e-05, |
| "loss": 0.8606, |
| "num_input_tokens_seen": 165152, |
| "step": 580 |
| }, |
| { |
| "epoch": 1.0914179104477613, |
| "grad_norm": 3.0804178714752197, |
| "learning_rate": 4.998778644001165e-05, |
| "loss": 0.4977, |
| "num_input_tokens_seen": 166560, |
| "step": 585 |
| }, |
| { |
| "epoch": 1.1007462686567164, |
| "grad_norm": 3.1055033206939697, |
| "learning_rate": 4.9985109688512854e-05, |
| "loss": 0.5725, |
| "num_input_tokens_seen": 167904, |
| "step": 590 |
| }, |
| { |
| "epoch": 1.1100746268656716, |
| "grad_norm": 4.078853607177734, |
| "learning_rate": 4.998216802232949e-05, |
| "loss": 0.6868, |
| "num_input_tokens_seen": 169280, |
| "step": 595 |
| }, |
| { |
| "epoch": 1.1194029850746268, |
| "grad_norm": 2.6483054161071777, |
| "learning_rate": 4.9978961472651774e-05, |
| "loss": 0.5227, |
| "num_input_tokens_seen": 170656, |
| "step": 600 |
| }, |
| { |
| "epoch": 1.1287313432835822, |
| "grad_norm": 3.426713466644287, |
| "learning_rate": 4.997549007347842e-05, |
| "loss": 0.5781, |
| "num_input_tokens_seen": 172352, |
| "step": 605 |
| }, |
| { |
| "epoch": 1.1380597014925373, |
| "grad_norm": 2.8154170513153076, |
| "learning_rate": 4.9971753861616354e-05, |
| "loss": 0.4025, |
| "num_input_tokens_seen": 173888, |
| "step": 610 |
| }, |
| { |
| "epoch": 1.1473880597014925, |
| "grad_norm": 2.37202787399292, |
| "learning_rate": 4.9967752876680254e-05, |
| "loss": 0.8043, |
| "num_input_tokens_seen": 175040, |
| "step": 615 |
| }, |
| { |
| "epoch": 1.1567164179104479, |
| "grad_norm": 2.483752965927124, |
| "learning_rate": 4.996348716109217e-05, |
| "loss": 0.494, |
| "num_input_tokens_seen": 176480, |
| "step": 620 |
| }, |
| { |
| "epoch": 1.166044776119403, |
| "grad_norm": 3.4618771076202393, |
| "learning_rate": 4.9958956760081085e-05, |
| "loss": 0.5055, |
| "num_input_tokens_seen": 177952, |
| "step": 625 |
| }, |
| { |
| "epoch": 1.1753731343283582, |
| "grad_norm": 3.647676706314087, |
| "learning_rate": 4.995416172168239e-05, |
| "loss": 0.8458, |
| "num_input_tokens_seen": 179552, |
| "step": 630 |
| }, |
| { |
| "epoch": 1.1847014925373134, |
| "grad_norm": 3.57362961769104, |
| "learning_rate": 4.994910209673741e-05, |
| "loss": 0.6153, |
| "num_input_tokens_seen": 181120, |
| "step": 635 |
| }, |
| { |
| "epoch": 1.1940298507462686, |
| "grad_norm": 2.355445146560669, |
| "learning_rate": 4.9943777938892855e-05, |
| "loss": 0.4659, |
| "num_input_tokens_seen": 182560, |
| "step": 640 |
| }, |
| { |
| "epoch": 1.203358208955224, |
| "grad_norm": 3.3932909965515137, |
| "learning_rate": 4.993818930460026e-05, |
| "loss": 0.5026, |
| "num_input_tokens_seen": 184064, |
| "step": 645 |
| }, |
| { |
| "epoch": 1.212686567164179, |
| "grad_norm": 1.7333723306655884, |
| "learning_rate": 4.9932336253115354e-05, |
| "loss": 0.446, |
| "num_input_tokens_seen": 185600, |
| "step": 650 |
| }, |
| { |
| "epoch": 1.2220149253731343, |
| "grad_norm": 2.949510097503662, |
| "learning_rate": 4.9926218846497486e-05, |
| "loss": 0.5243, |
| "num_input_tokens_seen": 186848, |
| "step": 655 |
| }, |
| { |
| "epoch": 1.2313432835820897, |
| "grad_norm": 2.7718191146850586, |
| "learning_rate": 4.991983714960892e-05, |
| "loss": 0.4645, |
| "num_input_tokens_seen": 188224, |
| "step": 660 |
| }, |
| { |
| "epoch": 1.2406716417910448, |
| "grad_norm": 3.2659573554992676, |
| "learning_rate": 4.9913191230114156e-05, |
| "loss": 0.5369, |
| "num_input_tokens_seen": 189696, |
| "step": 665 |
| }, |
| { |
| "epoch": 1.25, |
| "grad_norm": 3.1746866703033447, |
| "learning_rate": 4.990628115847924e-05, |
| "loss": 0.6512, |
| "num_input_tokens_seen": 191360, |
| "step": 670 |
| }, |
| { |
| "epoch": 1.2593283582089552, |
| "grad_norm": 2.9308531284332275, |
| "learning_rate": 4.9899107007971004e-05, |
| "loss": 0.4969, |
| "num_input_tokens_seen": 192800, |
| "step": 675 |
| }, |
| { |
| "epoch": 1.2686567164179103, |
| "grad_norm": 3.0414493083953857, |
| "learning_rate": 4.989166885465624e-05, |
| "loss": 0.8361, |
| "num_input_tokens_seen": 194208, |
| "step": 680 |
| }, |
| { |
| "epoch": 1.2779850746268657, |
| "grad_norm": 3.008410692214966, |
| "learning_rate": 4.988396677740097e-05, |
| "loss": 0.6767, |
| "num_input_tokens_seen": 195552, |
| "step": 685 |
| }, |
| { |
| "epoch": 1.287313432835821, |
| "grad_norm": 1.6996742486953735, |
| "learning_rate": 4.9876000857869583e-05, |
| "loss": 0.46, |
| "num_input_tokens_seen": 196832, |
| "step": 690 |
| }, |
| { |
| "epoch": 1.296641791044776, |
| "grad_norm": 2.3557729721069336, |
| "learning_rate": 4.986777118052393e-05, |
| "loss": 0.5388, |
| "num_input_tokens_seen": 198112, |
| "step": 695 |
| }, |
| { |
| "epoch": 1.3059701492537314, |
| "grad_norm": 3.6261024475097656, |
| "learning_rate": 4.9859277832622494e-05, |
| "loss": 0.635, |
| "num_input_tokens_seen": 199680, |
| "step": 700 |
| }, |
| { |
| "epoch": 1.3152985074626866, |
| "grad_norm": 2.118464231491089, |
| "learning_rate": 4.98505209042194e-05, |
| "loss": 0.5433, |
| "num_input_tokens_seen": 200960, |
| "step": 705 |
| }, |
| { |
| "epoch": 1.3246268656716418, |
| "grad_norm": 2.204644203186035, |
| "learning_rate": 4.9841500488163526e-05, |
| "loss": 0.484, |
| "num_input_tokens_seen": 202368, |
| "step": 710 |
| }, |
| { |
| "epoch": 1.333955223880597, |
| "grad_norm": 3.5416007041931152, |
| "learning_rate": 4.983221668009744e-05, |
| "loss": 0.5999, |
| "num_input_tokens_seen": 203744, |
| "step": 715 |
| }, |
| { |
| "epoch": 1.3432835820895521, |
| "grad_norm": 3.1746866703033447, |
| "learning_rate": 4.982266957845648e-05, |
| "loss": 0.5582, |
| "num_input_tokens_seen": 205056, |
| "step": 720 |
| }, |
| { |
| "epoch": 1.3526119402985075, |
| "grad_norm": 2.6072187423706055, |
| "learning_rate": 4.981285928446762e-05, |
| "loss": 0.5673, |
| "num_input_tokens_seen": 206432, |
| "step": 725 |
| }, |
| { |
| "epoch": 1.3619402985074627, |
| "grad_norm": 3.2916784286499023, |
| "learning_rate": 4.9802785902148455e-05, |
| "loss": 0.8181, |
| "num_input_tokens_seen": 207712, |
| "step": 730 |
| }, |
| { |
| "epoch": 1.3712686567164178, |
| "grad_norm": 2.1889536380767822, |
| "learning_rate": 4.979244953830608e-05, |
| "loss": 0.5568, |
| "num_input_tokens_seen": 208992, |
| "step": 735 |
| }, |
| { |
| "epoch": 1.3805970149253732, |
| "grad_norm": 3.0767509937286377, |
| "learning_rate": 4.9781850302535945e-05, |
| "loss": 0.702, |
| "num_input_tokens_seen": 210400, |
| "step": 740 |
| }, |
| { |
| "epoch": 1.3899253731343284, |
| "grad_norm": 3.937638998031616, |
| "learning_rate": 4.9770988307220736e-05, |
| "loss": 0.5878, |
| "num_input_tokens_seen": 211936, |
| "step": 745 |
| }, |
| { |
| "epoch": 1.3992537313432836, |
| "grad_norm": 2.7709805965423584, |
| "learning_rate": 4.97598636675291e-05, |
| "loss": 0.553, |
| "num_input_tokens_seen": 213376, |
| "step": 750 |
| }, |
| { |
| "epoch": 1.4085820895522387, |
| "grad_norm": 4.210634231567383, |
| "learning_rate": 4.974847650141453e-05, |
| "loss": 0.7493, |
| "num_input_tokens_seen": 214784, |
| "step": 755 |
| }, |
| { |
| "epoch": 1.417910447761194, |
| "grad_norm": 4.1886305809021, |
| "learning_rate": 4.973682692961403e-05, |
| "loss": 0.4791, |
| "num_input_tokens_seen": 216224, |
| "step": 760 |
| }, |
| { |
| "epoch": 1.4272388059701493, |
| "grad_norm": 2.7188637256622314, |
| "learning_rate": 4.972491507564688e-05, |
| "loss": 0.7112, |
| "num_input_tokens_seen": 217600, |
| "step": 765 |
| }, |
| { |
| "epoch": 1.4365671641791045, |
| "grad_norm": 2.6320688724517822, |
| "learning_rate": 4.971274106581331e-05, |
| "loss": 0.6073, |
| "num_input_tokens_seen": 219040, |
| "step": 770 |
| }, |
| { |
| "epoch": 1.4458955223880596, |
| "grad_norm": 4.980727195739746, |
| "learning_rate": 4.970030502919315e-05, |
| "loss": 0.7264, |
| "num_input_tokens_seen": 220480, |
| "step": 775 |
| }, |
| { |
| "epoch": 1.455223880597015, |
| "grad_norm": 2.803880453109741, |
| "learning_rate": 4.9687607097644495e-05, |
| "loss": 0.3849, |
| "num_input_tokens_seen": 222208, |
| "step": 780 |
| }, |
| { |
| "epoch": 1.4645522388059702, |
| "grad_norm": 3.2096080780029297, |
| "learning_rate": 4.967464740580227e-05, |
| "loss": 0.4493, |
| "num_input_tokens_seen": 223648, |
| "step": 785 |
| }, |
| { |
| "epoch": 1.4738805970149254, |
| "grad_norm": 2.615246057510376, |
| "learning_rate": 4.9661426091076834e-05, |
| "loss": 0.4907, |
| "num_input_tokens_seen": 225152, |
| "step": 790 |
| }, |
| { |
| "epoch": 1.4832089552238805, |
| "grad_norm": 3.244617223739624, |
| "learning_rate": 4.9647943293652486e-05, |
| "loss": 0.6406, |
| "num_input_tokens_seen": 226528, |
| "step": 795 |
| }, |
| { |
| "epoch": 1.4925373134328357, |
| "grad_norm": 5.02903938293457, |
| "learning_rate": 4.963419915648603e-05, |
| "loss": 0.5594, |
| "num_input_tokens_seen": 228000, |
| "step": 800 |
| }, |
| { |
| "epoch": 1.5, |
| "eval_loss": 0.5868470668792725, |
| "eval_runtime": 2.8376, |
| "eval_samples_per_second": 83.874, |
| "eval_steps_per_second": 21.145, |
| "num_input_tokens_seen": 229344, |
| "step": 804 |
| }, |
| { |
| "epoch": 1.501865671641791, |
| "grad_norm": 2.3755788803100586, |
| "learning_rate": 4.962019382530521e-05, |
| "loss": 0.4216, |
| "num_input_tokens_seen": 229600, |
| "step": 805 |
| }, |
| { |
| "epoch": 1.5111940298507462, |
| "grad_norm": 2.2762041091918945, |
| "learning_rate": 4.960592744860717e-05, |
| "loss": 0.5793, |
| "num_input_tokens_seen": 230912, |
| "step": 810 |
| }, |
| { |
| "epoch": 1.5205223880597014, |
| "grad_norm": 2.5347917079925537, |
| "learning_rate": 4.9591400177656935e-05, |
| "loss": 0.5173, |
| "num_input_tokens_seen": 232256, |
| "step": 815 |
| }, |
| { |
| "epoch": 1.5298507462686568, |
| "grad_norm": 2.4342548847198486, |
| "learning_rate": 4.957661216648573e-05, |
| "loss": 0.4191, |
| "num_input_tokens_seen": 233728, |
| "step": 820 |
| }, |
| { |
| "epoch": 1.539179104477612, |
| "grad_norm": 2.5100491046905518, |
| "learning_rate": 4.95615635718894e-05, |
| "loss": 0.4628, |
| "num_input_tokens_seen": 235072, |
| "step": 825 |
| }, |
| { |
| "epoch": 1.5485074626865671, |
| "grad_norm": 2.098355770111084, |
| "learning_rate": 4.954625455342674e-05, |
| "loss": 0.6258, |
| "num_input_tokens_seen": 236512, |
| "step": 830 |
| }, |
| { |
| "epoch": 1.5578358208955225, |
| "grad_norm": 2.1511592864990234, |
| "learning_rate": 4.953068527341777e-05, |
| "loss": 0.4474, |
| "num_input_tokens_seen": 237952, |
| "step": 835 |
| }, |
| { |
| "epoch": 1.5671641791044775, |
| "grad_norm": 2.2810299396514893, |
| "learning_rate": 4.9514855896942046e-05, |
| "loss": 0.594, |
| "num_input_tokens_seen": 239328, |
| "step": 840 |
| }, |
| { |
| "epoch": 1.5764925373134329, |
| "grad_norm": 2.053398370742798, |
| "learning_rate": 4.949876659183692e-05, |
| "loss": 0.4474, |
| "num_input_tokens_seen": 240736, |
| "step": 845 |
| }, |
| { |
| "epoch": 1.585820895522388, |
| "grad_norm": 2.715132236480713, |
| "learning_rate": 4.948241752869571e-05, |
| "loss": 0.5873, |
| "num_input_tokens_seen": 241984, |
| "step": 850 |
| }, |
| { |
| "epoch": 1.5951492537313432, |
| "grad_norm": 3.5476458072662354, |
| "learning_rate": 4.946580888086595e-05, |
| "loss": 0.6303, |
| "num_input_tokens_seen": 243296, |
| "step": 855 |
| }, |
| { |
| "epoch": 1.6044776119402986, |
| "grad_norm": 3.9557089805603027, |
| "learning_rate": 4.9448940824447515e-05, |
| "loss": 0.493, |
| "num_input_tokens_seen": 244960, |
| "step": 860 |
| }, |
| { |
| "epoch": 1.6138059701492538, |
| "grad_norm": 2.8011562824249268, |
| "learning_rate": 4.943181353829076e-05, |
| "loss": 0.5564, |
| "num_input_tokens_seen": 246304, |
| "step": 865 |
| }, |
| { |
| "epoch": 1.623134328358209, |
| "grad_norm": 6.440000534057617, |
| "learning_rate": 4.941442720399464e-05, |
| "loss": 0.6063, |
| "num_input_tokens_seen": 247680, |
| "step": 870 |
| }, |
| { |
| "epoch": 1.6324626865671643, |
| "grad_norm": 3.128471851348877, |
| "learning_rate": 4.939678200590475e-05, |
| "loss": 0.604, |
| "num_input_tokens_seen": 249152, |
| "step": 875 |
| }, |
| { |
| "epoch": 1.6417910447761193, |
| "grad_norm": 2.410419464111328, |
| "learning_rate": 4.937887813111142e-05, |
| "loss": 0.57, |
| "num_input_tokens_seen": 250496, |
| "step": 880 |
| }, |
| { |
| "epoch": 1.6511194029850746, |
| "grad_norm": 2.878894329071045, |
| "learning_rate": 4.936071576944769e-05, |
| "loss": 0.5103, |
| "num_input_tokens_seen": 251872, |
| "step": 885 |
| }, |
| { |
| "epoch": 1.6604477611940298, |
| "grad_norm": 3.006802797317505, |
| "learning_rate": 4.93422951134873e-05, |
| "loss": 0.4058, |
| "num_input_tokens_seen": 253440, |
| "step": 890 |
| }, |
| { |
| "epoch": 1.669776119402985, |
| "grad_norm": 3.3054444789886475, |
| "learning_rate": 4.932361635854268e-05, |
| "loss": 0.5208, |
| "num_input_tokens_seen": 254720, |
| "step": 895 |
| }, |
| { |
| "epoch": 1.6791044776119404, |
| "grad_norm": 5.9087629318237305, |
| "learning_rate": 4.9304679702662854e-05, |
| "loss": 0.7451, |
| "num_input_tokens_seen": 256320, |
| "step": 900 |
| }, |
| { |
| "epoch": 1.6884328358208955, |
| "grad_norm": 5.368732452392578, |
| "learning_rate": 4.9285485346631334e-05, |
| "loss": 0.5319, |
| "num_input_tokens_seen": 257600, |
| "step": 905 |
| }, |
| { |
| "epoch": 1.6977611940298507, |
| "grad_norm": 2.286201238632202, |
| "learning_rate": 4.9266033493964e-05, |
| "loss": 0.3497, |
| "num_input_tokens_seen": 259008, |
| "step": 910 |
| }, |
| { |
| "epoch": 1.707089552238806, |
| "grad_norm": 3.3814620971679688, |
| "learning_rate": 4.924632435090696e-05, |
| "loss": 0.4123, |
| "num_input_tokens_seen": 260448, |
| "step": 915 |
| }, |
| { |
| "epoch": 1.716417910447761, |
| "grad_norm": 3.644437789916992, |
| "learning_rate": 4.922635812643434e-05, |
| "loss": 0.6904, |
| "num_input_tokens_seen": 261888, |
| "step": 920 |
| }, |
| { |
| "epoch": 1.7257462686567164, |
| "grad_norm": 2.0309898853302, |
| "learning_rate": 4.920613503224608e-05, |
| "loss": 0.5573, |
| "num_input_tokens_seen": 263328, |
| "step": 925 |
| }, |
| { |
| "epoch": 1.7350746268656716, |
| "grad_norm": 2.886786699295044, |
| "learning_rate": 4.9185655282765655e-05, |
| "loss": 0.4878, |
| "num_input_tokens_seen": 264832, |
| "step": 930 |
| }, |
| { |
| "epoch": 1.7444029850746268, |
| "grad_norm": 3.0554866790771484, |
| "learning_rate": 4.916491909513787e-05, |
| "loss": 0.8702, |
| "num_input_tokens_seen": 266368, |
| "step": 935 |
| }, |
| { |
| "epoch": 1.7537313432835822, |
| "grad_norm": 3.1428325176239014, |
| "learning_rate": 4.914392668922651e-05, |
| "loss": 0.727, |
| "num_input_tokens_seen": 267904, |
| "step": 940 |
| }, |
| { |
| "epoch": 1.7630597014925373, |
| "grad_norm": 2.113858461380005, |
| "learning_rate": 4.912267828761199e-05, |
| "loss": 0.3656, |
| "num_input_tokens_seen": 269504, |
| "step": 945 |
| }, |
| { |
| "epoch": 1.7723880597014925, |
| "grad_norm": 1.717736840248108, |
| "learning_rate": 4.910117411558906e-05, |
| "loss": 0.4073, |
| "num_input_tokens_seen": 270912, |
| "step": 950 |
| }, |
| { |
| "epoch": 1.7817164179104479, |
| "grad_norm": 2.0296521186828613, |
| "learning_rate": 4.907941440116436e-05, |
| "loss": 0.5451, |
| "num_input_tokens_seen": 272384, |
| "step": 955 |
| }, |
| { |
| "epoch": 1.7910447761194028, |
| "grad_norm": 3.1257433891296387, |
| "learning_rate": 4.905739937505401e-05, |
| "loss": 0.5503, |
| "num_input_tokens_seen": 273760, |
| "step": 960 |
| }, |
| { |
| "epoch": 1.8003731343283582, |
| "grad_norm": 1.63154137134552, |
| "learning_rate": 4.9035129270681196e-05, |
| "loss": 0.3515, |
| "num_input_tokens_seen": 275424, |
| "step": 965 |
| }, |
| { |
| "epoch": 1.8097014925373134, |
| "grad_norm": 6.433821201324463, |
| "learning_rate": 4.901260432417367e-05, |
| "loss": 0.7626, |
| "num_input_tokens_seen": 276832, |
| "step": 970 |
| }, |
| { |
| "epoch": 1.8190298507462686, |
| "grad_norm": 2.9503695964813232, |
| "learning_rate": 4.8989824774361236e-05, |
| "loss": 0.8191, |
| "num_input_tokens_seen": 278336, |
| "step": 975 |
| }, |
| { |
| "epoch": 1.828358208955224, |
| "grad_norm": 3.2087488174438477, |
| "learning_rate": 4.896679086277325e-05, |
| "loss": 0.5652, |
| "num_input_tokens_seen": 279712, |
| "step": 980 |
| }, |
| { |
| "epoch": 1.837686567164179, |
| "grad_norm": 3.4168965816497803, |
| "learning_rate": 4.8943502833636026e-05, |
| "loss": 0.6386, |
| "num_input_tokens_seen": 281152, |
| "step": 985 |
| }, |
| { |
| "epoch": 1.8470149253731343, |
| "grad_norm": 2.7717483043670654, |
| "learning_rate": 4.891996093387028e-05, |
| "loss": 0.6585, |
| "num_input_tokens_seen": 282560, |
| "step": 990 |
| }, |
| { |
| "epoch": 1.8563432835820897, |
| "grad_norm": 2.669891119003296, |
| "learning_rate": 4.889616541308847e-05, |
| "loss": 0.5547, |
| "num_input_tokens_seen": 283968, |
| "step": 995 |
| }, |
| { |
| "epoch": 1.8656716417910446, |
| "grad_norm": 3.489332675933838, |
| "learning_rate": 4.8872116523592196e-05, |
| "loss": 0.5445, |
| "num_input_tokens_seen": 285440, |
| "step": 1000 |
| }, |
| { |
| "epoch": 1.875, |
| "grad_norm": 2.2695631980895996, |
| "learning_rate": 4.8847814520369475e-05, |
| "loss": 0.4197, |
| "num_input_tokens_seen": 286944, |
| "step": 1005 |
| }, |
| { |
| "epoch": 1.8843283582089554, |
| "grad_norm": 3.5801985263824463, |
| "learning_rate": 4.8823259661092104e-05, |
| "loss": 0.6542, |
| "num_input_tokens_seen": 288320, |
| "step": 1010 |
| }, |
| { |
| "epoch": 1.8936567164179103, |
| "grad_norm": 5.4535231590271, |
| "learning_rate": 4.879845220611284e-05, |
| "loss": 0.4567, |
| "num_input_tokens_seen": 289568, |
| "step": 1015 |
| }, |
| { |
| "epoch": 1.9029850746268657, |
| "grad_norm": 3.336360216140747, |
| "learning_rate": 4.877339241846273e-05, |
| "loss": 0.7613, |
| "num_input_tokens_seen": 290976, |
| "step": 1020 |
| }, |
| { |
| "epoch": 1.912313432835821, |
| "grad_norm": 3.318920135498047, |
| "learning_rate": 4.874808056384825e-05, |
| "loss": 0.6616, |
| "num_input_tokens_seen": 292576, |
| "step": 1025 |
| }, |
| { |
| "epoch": 1.921641791044776, |
| "grad_norm": 3.1449263095855713, |
| "learning_rate": 4.872251691064854e-05, |
| "loss": 0.5958, |
| "num_input_tokens_seen": 293952, |
| "step": 1030 |
| }, |
| { |
| "epoch": 1.9309701492537314, |
| "grad_norm": 3.4713878631591797, |
| "learning_rate": 4.869670172991252e-05, |
| "loss": 0.5301, |
| "num_input_tokens_seen": 295488, |
| "step": 1035 |
| }, |
| { |
| "epoch": 1.9402985074626866, |
| "grad_norm": 3.4399540424346924, |
| "learning_rate": 4.8670635295356035e-05, |
| "loss": 0.5952, |
| "num_input_tokens_seen": 296768, |
| "step": 1040 |
| }, |
| { |
| "epoch": 1.9496268656716418, |
| "grad_norm": 1.9748096466064453, |
| "learning_rate": 4.8644317883358956e-05, |
| "loss": 0.4382, |
| "num_input_tokens_seen": 298240, |
| "step": 1045 |
| }, |
| { |
| "epoch": 1.9589552238805972, |
| "grad_norm": 2.6479525566101074, |
| "learning_rate": 4.861774977296223e-05, |
| "loss": 0.5005, |
| "num_input_tokens_seen": 299712, |
| "step": 1050 |
| }, |
| { |
| "epoch": 1.9682835820895521, |
| "grad_norm": 3.0531985759735107, |
| "learning_rate": 4.8590931245864954e-05, |
| "loss": 0.5483, |
| "num_input_tokens_seen": 300864, |
| "step": 1055 |
| }, |
| { |
| "epoch": 1.9776119402985075, |
| "grad_norm": 2.208282947540283, |
| "learning_rate": 4.856386258642135e-05, |
| "loss": 0.5512, |
| "num_input_tokens_seen": 302336, |
| "step": 1060 |
| }, |
| { |
| "epoch": 1.9869402985074627, |
| "grad_norm": 2.8091766834259033, |
| "learning_rate": 4.8536544081637787e-05, |
| "loss": 0.555, |
| "num_input_tokens_seen": 303584, |
| "step": 1065 |
| }, |
| { |
| "epoch": 1.9962686567164178, |
| "grad_norm": 2.3788318634033203, |
| "learning_rate": 4.8508976021169705e-05, |
| "loss": 0.5917, |
| "num_input_tokens_seen": 304960, |
| "step": 1070 |
| }, |
| { |
| "epoch": 2.0, |
| "eval_loss": 0.5668801665306091, |
| "eval_runtime": 2.8383, |
| "eval_samples_per_second": 83.852, |
| "eval_steps_per_second": 21.139, |
| "num_input_tokens_seen": 305288, |
| "step": 1072 |
| }, |
| { |
| "epoch": 2.0055970149253732, |
| "grad_norm": 2.5771918296813965, |
| "learning_rate": 4.8481158697318564e-05, |
| "loss": 0.3916, |
| "num_input_tokens_seen": 306152, |
| "step": 1075 |
| }, |
| { |
| "epoch": 2.014925373134328, |
| "grad_norm": 2.9171202182769775, |
| "learning_rate": 4.845309240502874e-05, |
| "loss": 0.552, |
| "num_input_tokens_seen": 307464, |
| "step": 1080 |
| }, |
| { |
| "epoch": 2.0242537313432836, |
| "grad_norm": 1.2176073789596558, |
| "learning_rate": 4.8424777441884405e-05, |
| "loss": 0.4889, |
| "num_input_tokens_seen": 308968, |
| "step": 1085 |
| }, |
| { |
| "epoch": 2.033582089552239, |
| "grad_norm": 3.1968235969543457, |
| "learning_rate": 4.839621410810634e-05, |
| "loss": 0.5083, |
| "num_input_tokens_seen": 310440, |
| "step": 1090 |
| }, |
| { |
| "epoch": 2.042910447761194, |
| "grad_norm": 2.3309972286224365, |
| "learning_rate": 4.8367402706548805e-05, |
| "loss": 0.5588, |
| "num_input_tokens_seen": 312008, |
| "step": 1095 |
| }, |
| { |
| "epoch": 2.0522388059701493, |
| "grad_norm": 3.1228597164154053, |
| "learning_rate": 4.8338343542696275e-05, |
| "loss": 0.5096, |
| "num_input_tokens_seen": 313416, |
| "step": 1100 |
| }, |
| { |
| "epoch": 2.0615671641791047, |
| "grad_norm": 2.475100040435791, |
| "learning_rate": 4.830903692466024e-05, |
| "loss": 0.5368, |
| "num_input_tokens_seen": 314728, |
| "step": 1105 |
| }, |
| { |
| "epoch": 2.0708955223880596, |
| "grad_norm": 2.120330572128296, |
| "learning_rate": 4.82794831631759e-05, |
| "loss": 0.5446, |
| "num_input_tokens_seen": 316072, |
| "step": 1110 |
| }, |
| { |
| "epoch": 2.080223880597015, |
| "grad_norm": 3.5224862098693848, |
| "learning_rate": 4.8249682571598945e-05, |
| "loss": 0.4733, |
| "num_input_tokens_seen": 317384, |
| "step": 1115 |
| }, |
| { |
| "epoch": 2.08955223880597, |
| "grad_norm": 2.3685243129730225, |
| "learning_rate": 4.821963546590211e-05, |
| "loss": 0.3946, |
| "num_input_tokens_seen": 318696, |
| "step": 1120 |
| }, |
| { |
| "epoch": 2.0988805970149254, |
| "grad_norm": 3.419828414916992, |
| "learning_rate": 4.8189342164671944e-05, |
| "loss": 0.4263, |
| "num_input_tokens_seen": 320008, |
| "step": 1125 |
| }, |
| { |
| "epoch": 2.1082089552238807, |
| "grad_norm": 1.8945624828338623, |
| "learning_rate": 4.815880298910537e-05, |
| "loss": 0.3674, |
| "num_input_tokens_seen": 321480, |
| "step": 1130 |
| }, |
| { |
| "epoch": 2.1175373134328357, |
| "grad_norm": 3.2473342418670654, |
| "learning_rate": 4.8128018263006305e-05, |
| "loss": 0.3894, |
| "num_input_tokens_seen": 323016, |
| "step": 1135 |
| }, |
| { |
| "epoch": 2.126865671641791, |
| "grad_norm": 2.2384839057922363, |
| "learning_rate": 4.8096988312782174e-05, |
| "loss": 0.3365, |
| "num_input_tokens_seen": 324456, |
| "step": 1140 |
| }, |
| { |
| "epoch": 2.1361940298507465, |
| "grad_norm": 1.5901774168014526, |
| "learning_rate": 4.806571346744053e-05, |
| "loss": 0.249, |
| "num_input_tokens_seen": 326120, |
| "step": 1145 |
| }, |
| { |
| "epoch": 2.1455223880597014, |
| "grad_norm": 2.6024768352508545, |
| "learning_rate": 4.803419405858553e-05, |
| "loss": 0.558, |
| "num_input_tokens_seen": 327624, |
| "step": 1150 |
| }, |
| { |
| "epoch": 2.154850746268657, |
| "grad_norm": 4.870899200439453, |
| "learning_rate": 4.8002430420414356e-05, |
| "loss": 0.3841, |
| "num_input_tokens_seen": 329160, |
| "step": 1155 |
| }, |
| { |
| "epoch": 2.1641791044776117, |
| "grad_norm": 4.261692047119141, |
| "learning_rate": 4.79704228897138e-05, |
| "loss": 0.6926, |
| "num_input_tokens_seen": 330472, |
| "step": 1160 |
| }, |
| { |
| "epoch": 2.173507462686567, |
| "grad_norm": 3.7260003089904785, |
| "learning_rate": 4.79381718058566e-05, |
| "loss": 0.2736, |
| "num_input_tokens_seen": 332008, |
| "step": 1165 |
| }, |
| { |
| "epoch": 2.1828358208955225, |
| "grad_norm": 3.4958226680755615, |
| "learning_rate": 4.790567751079783e-05, |
| "loss": 0.3372, |
| "num_input_tokens_seen": 333384, |
| "step": 1170 |
| }, |
| { |
| "epoch": 2.1921641791044775, |
| "grad_norm": 4.219935417175293, |
| "learning_rate": 4.787294034907135e-05, |
| "loss": 0.474, |
| "num_input_tokens_seen": 334696, |
| "step": 1175 |
| }, |
| { |
| "epoch": 2.201492537313433, |
| "grad_norm": 3.23542857170105, |
| "learning_rate": 4.78399606677861e-05, |
| "loss": 0.4129, |
| "num_input_tokens_seen": 336232, |
| "step": 1180 |
| }, |
| { |
| "epoch": 2.2108208955223883, |
| "grad_norm": 3.0439059734344482, |
| "learning_rate": 4.780673881662242e-05, |
| "loss": 0.4226, |
| "num_input_tokens_seen": 337512, |
| "step": 1185 |
| }, |
| { |
| "epoch": 2.220149253731343, |
| "grad_norm": 2.0258500576019287, |
| "learning_rate": 4.777327514782837e-05, |
| "loss": 0.4282, |
| "num_input_tokens_seen": 339080, |
| "step": 1190 |
| }, |
| { |
| "epoch": 2.2294776119402986, |
| "grad_norm": 4.752810955047607, |
| "learning_rate": 4.773957001621597e-05, |
| "loss": 0.5652, |
| "num_input_tokens_seen": 340456, |
| "step": 1195 |
| }, |
| { |
| "epoch": 2.2388059701492535, |
| "grad_norm": 0.7080585956573486, |
| "learning_rate": 4.7705623779157435e-05, |
| "loss": 0.3826, |
| "num_input_tokens_seen": 341864, |
| "step": 1200 |
| }, |
| { |
| "epoch": 2.248134328358209, |
| "grad_norm": 3.4947409629821777, |
| "learning_rate": 4.7671436796581426e-05, |
| "loss": 0.519, |
| "num_input_tokens_seen": 343240, |
| "step": 1205 |
| }, |
| { |
| "epoch": 2.2574626865671643, |
| "grad_norm": 2.21931529045105, |
| "learning_rate": 4.7637009430969194e-05, |
| "loss": 0.5512, |
| "num_input_tokens_seen": 344808, |
| "step": 1210 |
| }, |
| { |
| "epoch": 2.2667910447761193, |
| "grad_norm": 3.669142484664917, |
| "learning_rate": 4.760234204735072e-05, |
| "loss": 0.3782, |
| "num_input_tokens_seen": 346152, |
| "step": 1215 |
| }, |
| { |
| "epoch": 2.2761194029850746, |
| "grad_norm": 3.0081098079681396, |
| "learning_rate": 4.756743501330091e-05, |
| "loss": 0.3546, |
| "num_input_tokens_seen": 347528, |
| "step": 1220 |
| }, |
| { |
| "epoch": 2.28544776119403, |
| "grad_norm": 4.164097309112549, |
| "learning_rate": 4.753228869893566e-05, |
| "loss": 0.3816, |
| "num_input_tokens_seen": 348776, |
| "step": 1225 |
| }, |
| { |
| "epoch": 2.294776119402985, |
| "grad_norm": 2.5778393745422363, |
| "learning_rate": 4.7496903476907885e-05, |
| "loss": 0.3566, |
| "num_input_tokens_seen": 350088, |
| "step": 1230 |
| }, |
| { |
| "epoch": 2.3041044776119404, |
| "grad_norm": 4.034834861755371, |
| "learning_rate": 4.746127972240367e-05, |
| "loss": 0.5524, |
| "num_input_tokens_seen": 351528, |
| "step": 1235 |
| }, |
| { |
| "epoch": 2.3134328358208958, |
| "grad_norm": 2.3974575996398926, |
| "learning_rate": 4.742541781313822e-05, |
| "loss": 0.5909, |
| "num_input_tokens_seen": 352840, |
| "step": 1240 |
| }, |
| { |
| "epoch": 2.3227611940298507, |
| "grad_norm": 1.4837062358856201, |
| "learning_rate": 4.738931812935186e-05, |
| "loss": 0.4514, |
| "num_input_tokens_seen": 354152, |
| "step": 1245 |
| }, |
| { |
| "epoch": 2.332089552238806, |
| "grad_norm": 4.002533435821533, |
| "learning_rate": 4.735298105380601e-05, |
| "loss": 0.4815, |
| "num_input_tokens_seen": 355496, |
| "step": 1250 |
| }, |
| { |
| "epoch": 2.341417910447761, |
| "grad_norm": 3.0783839225769043, |
| "learning_rate": 4.7316406971779145e-05, |
| "loss": 0.3368, |
| "num_input_tokens_seen": 357096, |
| "step": 1255 |
| }, |
| { |
| "epoch": 2.3507462686567164, |
| "grad_norm": 3.411510944366455, |
| "learning_rate": 4.7279596271062716e-05, |
| "loss": 0.3544, |
| "num_input_tokens_seen": 358312, |
| "step": 1260 |
| }, |
| { |
| "epoch": 2.360074626865672, |
| "grad_norm": 3.8714399337768555, |
| "learning_rate": 4.724254934195697e-05, |
| "loss": 0.5672, |
| "num_input_tokens_seen": 359752, |
| "step": 1265 |
| }, |
| { |
| "epoch": 2.3694029850746268, |
| "grad_norm": 3.677072763442993, |
| "learning_rate": 4.720526657726691e-05, |
| "loss": 0.37, |
| "num_input_tokens_seen": 361128, |
| "step": 1270 |
| }, |
| { |
| "epoch": 2.378731343283582, |
| "grad_norm": 3.104233741760254, |
| "learning_rate": 4.716774837229804e-05, |
| "loss": 0.3489, |
| "num_input_tokens_seen": 362600, |
| "step": 1275 |
| }, |
| { |
| "epoch": 2.388059701492537, |
| "grad_norm": 3.0244579315185547, |
| "learning_rate": 4.712999512485225e-05, |
| "loss": 0.543, |
| "num_input_tokens_seen": 364072, |
| "step": 1280 |
| }, |
| { |
| "epoch": 2.3973880597014925, |
| "grad_norm": 2.737908124923706, |
| "learning_rate": 4.709200723522353e-05, |
| "loss": 0.5813, |
| "num_input_tokens_seen": 365576, |
| "step": 1285 |
| }, |
| { |
| "epoch": 2.406716417910448, |
| "grad_norm": 4.255146503448486, |
| "learning_rate": 4.7053785106193793e-05, |
| "loss": 0.4065, |
| "num_input_tokens_seen": 367208, |
| "step": 1290 |
| }, |
| { |
| "epoch": 2.416044776119403, |
| "grad_norm": 3.175067901611328, |
| "learning_rate": 4.701532914302853e-05, |
| "loss": 0.5858, |
| "num_input_tokens_seen": 368680, |
| "step": 1295 |
| }, |
| { |
| "epoch": 2.425373134328358, |
| "grad_norm": 1.7548394203186035, |
| "learning_rate": 4.697663975347258e-05, |
| "loss": 0.5877, |
| "num_input_tokens_seen": 370408, |
| "step": 1300 |
| }, |
| { |
| "epoch": 2.4347014925373136, |
| "grad_norm": 3.2105941772460938, |
| "learning_rate": 4.693771734774578e-05, |
| "loss": 0.3376, |
| "num_input_tokens_seen": 371784, |
| "step": 1305 |
| }, |
| { |
| "epoch": 2.4440298507462686, |
| "grad_norm": 3.962158679962158, |
| "learning_rate": 4.6898562338538606e-05, |
| "loss": 0.4392, |
| "num_input_tokens_seen": 373256, |
| "step": 1310 |
| }, |
| { |
| "epoch": 2.453358208955224, |
| "grad_norm": 1.7701318264007568, |
| "learning_rate": 4.6859175141007796e-05, |
| "loss": 0.3501, |
| "num_input_tokens_seen": 374760, |
| "step": 1315 |
| }, |
| { |
| "epoch": 2.4626865671641793, |
| "grad_norm": 2.577972650527954, |
| "learning_rate": 4.6819556172771974e-05, |
| "loss": 0.4175, |
| "num_input_tokens_seen": 376360, |
| "step": 1320 |
| }, |
| { |
| "epoch": 2.4720149253731343, |
| "grad_norm": 3.9498510360717773, |
| "learning_rate": 4.6779705853907205e-05, |
| "loss": 0.372, |
| "num_input_tokens_seen": 377960, |
| "step": 1325 |
| }, |
| { |
| "epoch": 2.4813432835820897, |
| "grad_norm": 4.253821849822998, |
| "learning_rate": 4.673962460694254e-05, |
| "loss": 0.6105, |
| "num_input_tokens_seen": 379208, |
| "step": 1330 |
| }, |
| { |
| "epoch": 2.4906716417910446, |
| "grad_norm": 3.2186644077301025, |
| "learning_rate": 4.669931285685553e-05, |
| "loss": 0.4713, |
| "num_input_tokens_seen": 380744, |
| "step": 1335 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 3.395958662033081, |
| "learning_rate": 4.6658771031067734e-05, |
| "loss": 0.4615, |
| "num_input_tokens_seen": 382120, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.5, |
| "eval_loss": 0.5915355682373047, |
| "eval_runtime": 2.8529, |
| "eval_samples_per_second": 83.423, |
| "eval_steps_per_second": 21.031, |
| "num_input_tokens_seen": 382120, |
| "step": 1340 |
| }, |
| { |
| "epoch": 2.5093283582089554, |
| "grad_norm": 4.6183271408081055, |
| "learning_rate": 4.6617999559440187e-05, |
| "loss": 0.5498, |
| "num_input_tokens_seen": 383400, |
| "step": 1345 |
| }, |
| { |
| "epoch": 2.5186567164179103, |
| "grad_norm": 3.489081859588623, |
| "learning_rate": 4.657699887426884e-05, |
| "loss": 0.4757, |
| "num_input_tokens_seen": 384680, |
| "step": 1350 |
| }, |
| { |
| "epoch": 2.5279850746268657, |
| "grad_norm": 4.734048366546631, |
| "learning_rate": 4.653576941027995e-05, |
| "loss": 0.4859, |
| "num_input_tokens_seen": 385992, |
| "step": 1355 |
| }, |
| { |
| "epoch": 2.5373134328358207, |
| "grad_norm": 2.875990152359009, |
| "learning_rate": 4.649431160462552e-05, |
| "loss": 0.3726, |
| "num_input_tokens_seen": 387368, |
| "step": 1360 |
| }, |
| { |
| "epoch": 2.546641791044776, |
| "grad_norm": 3.749927520751953, |
| "learning_rate": 4.645262589687861e-05, |
| "loss": 0.6045, |
| "num_input_tokens_seen": 388776, |
| "step": 1365 |
| }, |
| { |
| "epoch": 2.5559701492537314, |
| "grad_norm": 2.7659881114959717, |
| "learning_rate": 4.6410712729028734e-05, |
| "loss": 0.4556, |
| "num_input_tokens_seen": 390440, |
| "step": 1370 |
| }, |
| { |
| "epoch": 2.5652985074626864, |
| "grad_norm": 4.874058723449707, |
| "learning_rate": 4.636857254547712e-05, |
| "loss": 0.5299, |
| "num_input_tokens_seen": 391720, |
| "step": 1375 |
| }, |
| { |
| "epoch": 2.574626865671642, |
| "grad_norm": 3.4538872241973877, |
| "learning_rate": 4.632620579303203e-05, |
| "loss": 0.531, |
| "num_input_tokens_seen": 393032, |
| "step": 1380 |
| }, |
| { |
| "epoch": 2.583955223880597, |
| "grad_norm": 2.4599039554595947, |
| "learning_rate": 4.628361292090403e-05, |
| "loss": 0.4497, |
| "num_input_tokens_seen": 394312, |
| "step": 1385 |
| }, |
| { |
| "epoch": 2.593283582089552, |
| "grad_norm": 3.1702048778533936, |
| "learning_rate": 4.624079438070117e-05, |
| "loss": 0.5636, |
| "num_input_tokens_seen": 395656, |
| "step": 1390 |
| }, |
| { |
| "epoch": 2.6026119402985075, |
| "grad_norm": 1.6631338596343994, |
| "learning_rate": 4.6197750626424277e-05, |
| "loss": 0.4542, |
| "num_input_tokens_seen": 397288, |
| "step": 1395 |
| }, |
| { |
| "epoch": 2.611940298507463, |
| "grad_norm": 2.4974026679992676, |
| "learning_rate": 4.615448211446208e-05, |
| "loss": 0.559, |
| "num_input_tokens_seen": 398600, |
| "step": 1400 |
| }, |
| { |
| "epoch": 2.621268656716418, |
| "grad_norm": 5.367152690887451, |
| "learning_rate": 4.6110989303586396e-05, |
| "loss": 0.555, |
| "num_input_tokens_seen": 400168, |
| "step": 1405 |
| }, |
| { |
| "epoch": 2.6305970149253732, |
| "grad_norm": 7.613901138305664, |
| "learning_rate": 4.606727265494727e-05, |
| "loss": 0.5897, |
| "num_input_tokens_seen": 401544, |
| "step": 1410 |
| }, |
| { |
| "epoch": 2.6399253731343286, |
| "grad_norm": 3.7300751209259033, |
| "learning_rate": 4.602333263206806e-05, |
| "loss": 0.3904, |
| "num_input_tokens_seen": 402984, |
| "step": 1415 |
| }, |
| { |
| "epoch": 2.6492537313432836, |
| "grad_norm": 4.695628643035889, |
| "learning_rate": 4.597916970084056e-05, |
| "loss": 0.6003, |
| "num_input_tokens_seen": 404328, |
| "step": 1420 |
| }, |
| { |
| "epoch": 2.658582089552239, |
| "grad_norm": 4.135203838348389, |
| "learning_rate": 4.593478432952002e-05, |
| "loss": 0.484, |
| "num_input_tokens_seen": 405608, |
| "step": 1425 |
| }, |
| { |
| "epoch": 2.667910447761194, |
| "grad_norm": 3.4883320331573486, |
| "learning_rate": 4.5890176988720205e-05, |
| "loss": 0.339, |
| "num_input_tokens_seen": 407208, |
| "step": 1430 |
| }, |
| { |
| "epoch": 2.6772388059701493, |
| "grad_norm": 2.4935200214385986, |
| "learning_rate": 4.584534815140842e-05, |
| "loss": 0.3702, |
| "num_input_tokens_seen": 408776, |
| "step": 1435 |
| }, |
| { |
| "epoch": 2.6865671641791042, |
| "grad_norm": 3.4878594875335693, |
| "learning_rate": 4.5800298292900446e-05, |
| "loss": 0.498, |
| "num_input_tokens_seen": 410120, |
| "step": 1440 |
| }, |
| { |
| "epoch": 2.6958955223880596, |
| "grad_norm": 2.8144829273223877, |
| "learning_rate": 4.575502789085555e-05, |
| "loss": 0.3603, |
| "num_input_tokens_seen": 411624, |
| "step": 1445 |
| }, |
| { |
| "epoch": 2.705223880597015, |
| "grad_norm": 3.9806501865386963, |
| "learning_rate": 4.57095374252714e-05, |
| "loss": 0.3537, |
| "num_input_tokens_seen": 413096, |
| "step": 1450 |
| }, |
| { |
| "epoch": 2.71455223880597, |
| "grad_norm": 6.080500602722168, |
| "learning_rate": 4.5663827378478975e-05, |
| "loss": 0.6438, |
| "num_input_tokens_seen": 414472, |
| "step": 1455 |
| }, |
| { |
| "epoch": 2.7238805970149254, |
| "grad_norm": 2.3082258701324463, |
| "learning_rate": 4.561789823513743e-05, |
| "loss": 0.4969, |
| "num_input_tokens_seen": 416072, |
| "step": 1460 |
| }, |
| { |
| "epoch": 2.7332089552238807, |
| "grad_norm": 3.060985803604126, |
| "learning_rate": 4.5571750482229016e-05, |
| "loss": 0.6129, |
| "num_input_tokens_seen": 417640, |
| "step": 1465 |
| }, |
| { |
| "epoch": 2.7425373134328357, |
| "grad_norm": 3.2514724731445312, |
| "learning_rate": 4.552538460905386e-05, |
| "loss": 0.3567, |
| "num_input_tokens_seen": 419048, |
| "step": 1470 |
| }, |
| { |
| "epoch": 2.751865671641791, |
| "grad_norm": 2.2998499870300293, |
| "learning_rate": 4.54788011072248e-05, |
| "loss": 0.4313, |
| "num_input_tokens_seen": 420520, |
| "step": 1475 |
| }, |
| { |
| "epoch": 2.7611940298507465, |
| "grad_norm": 2.7978920936584473, |
| "learning_rate": 4.543200047066216e-05, |
| "loss": 0.4962, |
| "num_input_tokens_seen": 421960, |
| "step": 1480 |
| }, |
| { |
| "epoch": 2.7705223880597014, |
| "grad_norm": 3.3027796745300293, |
| "learning_rate": 4.538498319558854e-05, |
| "loss": 0.4794, |
| "num_input_tokens_seen": 423304, |
| "step": 1485 |
| }, |
| { |
| "epoch": 2.779850746268657, |
| "grad_norm": 2.6871962547302246, |
| "learning_rate": 4.5337749780523526e-05, |
| "loss": 0.5021, |
| "num_input_tokens_seen": 424552, |
| "step": 1490 |
| }, |
| { |
| "epoch": 2.789179104477612, |
| "grad_norm": 2.6993939876556396, |
| "learning_rate": 4.5290300726278415e-05, |
| "loss": 0.4109, |
| "num_input_tokens_seen": 425896, |
| "step": 1495 |
| }, |
| { |
| "epoch": 2.798507462686567, |
| "grad_norm": 3.0125951766967773, |
| "learning_rate": 4.5242636535950913e-05, |
| "loss": 0.3658, |
| "num_input_tokens_seen": 427208, |
| "step": 1500 |
| }, |
| { |
| "epoch": 2.8078358208955225, |
| "grad_norm": 3.0026652812957764, |
| "learning_rate": 4.519475771491978e-05, |
| "loss": 0.4149, |
| "num_input_tokens_seen": 428392, |
| "step": 1505 |
| }, |
| { |
| "epoch": 2.8171641791044775, |
| "grad_norm": 2.5900356769561768, |
| "learning_rate": 4.5146664770839495e-05, |
| "loss": 0.4586, |
| "num_input_tokens_seen": 429928, |
| "step": 1510 |
| }, |
| { |
| "epoch": 2.826492537313433, |
| "grad_norm": 3.02093243598938, |
| "learning_rate": 4.5098358213634876e-05, |
| "loss": 0.4759, |
| "num_input_tokens_seen": 431464, |
| "step": 1515 |
| }, |
| { |
| "epoch": 2.835820895522388, |
| "grad_norm": 3.1159753799438477, |
| "learning_rate": 4.504983855549562e-05, |
| "loss": 0.3903, |
| "num_input_tokens_seen": 432968, |
| "step": 1520 |
| }, |
| { |
| "epoch": 2.845149253731343, |
| "grad_norm": 5.831302642822266, |
| "learning_rate": 4.5001106310870946e-05, |
| "loss": 0.5089, |
| "num_input_tokens_seen": 434472, |
| "step": 1525 |
| }, |
| { |
| "epoch": 2.8544776119402986, |
| "grad_norm": 4.163346767425537, |
| "learning_rate": 4.49521619964641e-05, |
| "loss": 0.4205, |
| "num_input_tokens_seen": 435944, |
| "step": 1530 |
| }, |
| { |
| "epoch": 2.8638059701492535, |
| "grad_norm": 4.286188125610352, |
| "learning_rate": 4.4903006131226874e-05, |
| "loss": 0.3572, |
| "num_input_tokens_seen": 437608, |
| "step": 1535 |
| }, |
| { |
| "epoch": 2.873134328358209, |
| "grad_norm": 6.123046398162842, |
| "learning_rate": 4.485363923635413e-05, |
| "loss": 0.5312, |
| "num_input_tokens_seen": 439176, |
| "step": 1540 |
| }, |
| { |
| "epoch": 2.8824626865671643, |
| "grad_norm": 3.077726125717163, |
| "learning_rate": 4.480406183527823e-05, |
| "loss": 0.8189, |
| "num_input_tokens_seen": 440520, |
| "step": 1545 |
| }, |
| { |
| "epoch": 2.8917910447761193, |
| "grad_norm": 3.5009377002716064, |
| "learning_rate": 4.475427445366355e-05, |
| "loss": 0.6008, |
| "num_input_tokens_seen": 441896, |
| "step": 1550 |
| }, |
| { |
| "epoch": 2.9011194029850746, |
| "grad_norm": 5.455629348754883, |
| "learning_rate": 4.4704277619400834e-05, |
| "loss": 0.6319, |
| "num_input_tokens_seen": 443176, |
| "step": 1555 |
| }, |
| { |
| "epoch": 2.91044776119403, |
| "grad_norm": 2.8272933959960938, |
| "learning_rate": 4.4654071862601654e-05, |
| "loss": 0.4157, |
| "num_input_tokens_seen": 444680, |
| "step": 1560 |
| }, |
| { |
| "epoch": 2.919776119402985, |
| "grad_norm": 2.7164626121520996, |
| "learning_rate": 4.460365771559275e-05, |
| "loss": 0.463, |
| "num_input_tokens_seen": 446024, |
| "step": 1565 |
| }, |
| { |
| "epoch": 2.9291044776119404, |
| "grad_norm": 2.233797311782837, |
| "learning_rate": 4.455303571291042e-05, |
| "loss": 0.3601, |
| "num_input_tokens_seen": 447656, |
| "step": 1570 |
| }, |
| { |
| "epoch": 2.9384328358208958, |
| "grad_norm": 3.2844743728637695, |
| "learning_rate": 4.4502206391294824e-05, |
| "loss": 0.5214, |
| "num_input_tokens_seen": 448904, |
| "step": 1575 |
| }, |
| { |
| "epoch": 2.9477611940298507, |
| "grad_norm": 3.05997371673584, |
| "learning_rate": 4.445117028968431e-05, |
| "loss": 0.3986, |
| "num_input_tokens_seen": 450216, |
| "step": 1580 |
| }, |
| { |
| "epoch": 2.957089552238806, |
| "grad_norm": 2.920384168624878, |
| "learning_rate": 4.439992794920969e-05, |
| "loss": 0.4488, |
| "num_input_tokens_seen": 451592, |
| "step": 1585 |
| }, |
| { |
| "epoch": 2.966417910447761, |
| "grad_norm": 2.8971283435821533, |
| "learning_rate": 4.434847991318851e-05, |
| "loss": 0.6949, |
| "num_input_tokens_seen": 453032, |
| "step": 1590 |
| }, |
| { |
| "epoch": 2.9757462686567164, |
| "grad_norm": 3.9646215438842773, |
| "learning_rate": 4.4296826727119296e-05, |
| "loss": 0.4116, |
| "num_input_tokens_seen": 454408, |
| "step": 1595 |
| }, |
| { |
| "epoch": 2.9850746268656714, |
| "grad_norm": 4.827686309814453, |
| "learning_rate": 4.424496893867573e-05, |
| "loss": 0.5092, |
| "num_input_tokens_seen": 455912, |
| "step": 1600 |
| }, |
| { |
| "epoch": 2.9944029850746268, |
| "grad_norm": 3.855867624282837, |
| "learning_rate": 4.419290709770091e-05, |
| "loss": 0.3834, |
| "num_input_tokens_seen": 457256, |
| "step": 1605 |
| }, |
| { |
| "epoch": 3.0, |
| "eval_loss": 0.5813223719596863, |
| "eval_runtime": 2.8305, |
| "eval_samples_per_second": 84.083, |
| "eval_steps_per_second": 21.197, |
| "num_input_tokens_seen": 457952, |
| "step": 1608 |
| }, |
| { |
| "epoch": 3.003731343283582, |
| "grad_norm": 3.147845983505249, |
| "learning_rate": 4.414064175620146e-05, |
| "loss": 0.4672, |
| "num_input_tokens_seen": 458464, |
| "step": 1610 |
| }, |
| { |
| "epoch": 3.013059701492537, |
| "grad_norm": 1.6751502752304077, |
| "learning_rate": 4.408817346834169e-05, |
| "loss": 0.2935, |
| "num_input_tokens_seen": 459872, |
| "step": 1615 |
| }, |
| { |
| "epoch": 3.0223880597014925, |
| "grad_norm": 1.2061867713928223, |
| "learning_rate": 4.4035502790437764e-05, |
| "loss": 0.1711, |
| "num_input_tokens_seen": 461408, |
| "step": 1620 |
| }, |
| { |
| "epoch": 3.031716417910448, |
| "grad_norm": 3.155708074569702, |
| "learning_rate": 4.398263028095175e-05, |
| "loss": 0.4812, |
| "num_input_tokens_seen": 462624, |
| "step": 1625 |
| }, |
| { |
| "epoch": 3.041044776119403, |
| "grad_norm": 4.356350898742676, |
| "learning_rate": 4.392955650048571e-05, |
| "loss": 0.2992, |
| "num_input_tokens_seen": 464096, |
| "step": 1630 |
| }, |
| { |
| "epoch": 3.050373134328358, |
| "grad_norm": 4.092106342315674, |
| "learning_rate": 4.387628201177577e-05, |
| "loss": 0.3587, |
| "num_input_tokens_seen": 465408, |
| "step": 1635 |
| }, |
| { |
| "epoch": 3.0597014925373136, |
| "grad_norm": 4.664444446563721, |
| "learning_rate": 4.382280737968614e-05, |
| "loss": 0.413, |
| "num_input_tokens_seen": 466880, |
| "step": 1640 |
| }, |
| { |
| "epoch": 3.0690298507462686, |
| "grad_norm": 2.144272804260254, |
| "learning_rate": 4.3769133171203144e-05, |
| "loss": 0.2475, |
| "num_input_tokens_seen": 468416, |
| "step": 1645 |
| }, |
| { |
| "epoch": 3.078358208955224, |
| "grad_norm": 3.4352564811706543, |
| "learning_rate": 4.371525995542918e-05, |
| "loss": 0.3375, |
| "num_input_tokens_seen": 469952, |
| "step": 1650 |
| }, |
| { |
| "epoch": 3.0876865671641793, |
| "grad_norm": 2.480738401412964, |
| "learning_rate": 4.366118830357672e-05, |
| "loss": 0.3945, |
| "num_input_tokens_seen": 471264, |
| "step": 1655 |
| }, |
| { |
| "epoch": 3.0970149253731343, |
| "grad_norm": 3.0115928649902344, |
| "learning_rate": 4.3606918788962205e-05, |
| "loss": 0.5361, |
| "num_input_tokens_seen": 472640, |
| "step": 1660 |
| }, |
| { |
| "epoch": 3.1063432835820897, |
| "grad_norm": 1.617244839668274, |
| "learning_rate": 4.355245198700003e-05, |
| "loss": 0.3582, |
| "num_input_tokens_seen": 473984, |
| "step": 1665 |
| }, |
| { |
| "epoch": 3.1156716417910446, |
| "grad_norm": 3.321077823638916, |
| "learning_rate": 4.3497788475196376e-05, |
| "loss": 0.3491, |
| "num_input_tokens_seen": 475488, |
| "step": 1670 |
| }, |
| { |
| "epoch": 3.125, |
| "grad_norm": 4.451226234436035, |
| "learning_rate": 4.3442928833143145e-05, |
| "loss": 0.3885, |
| "num_input_tokens_seen": 476864, |
| "step": 1675 |
| }, |
| { |
| "epoch": 3.1343283582089554, |
| "grad_norm": 3.2839231491088867, |
| "learning_rate": 4.338787364251177e-05, |
| "loss": 0.2521, |
| "num_input_tokens_seen": 478272, |
| "step": 1680 |
| }, |
| { |
| "epoch": 3.1436567164179103, |
| "grad_norm": 3.267918109893799, |
| "learning_rate": 4.3332623487047084e-05, |
| "loss": 0.389, |
| "num_input_tokens_seen": 479648, |
| "step": 1685 |
| }, |
| { |
| "epoch": 3.1529850746268657, |
| "grad_norm": 3.0583980083465576, |
| "learning_rate": 4.32771789525611e-05, |
| "loss": 0.2113, |
| "num_input_tokens_seen": 481152, |
| "step": 1690 |
| }, |
| { |
| "epoch": 3.1623134328358207, |
| "grad_norm": 3.1691653728485107, |
| "learning_rate": 4.3221540626926824e-05, |
| "loss": 0.4001, |
| "num_input_tokens_seen": 482624, |
| "step": 1695 |
| }, |
| { |
| "epoch": 3.171641791044776, |
| "grad_norm": 2.5445826053619385, |
| "learning_rate": 4.3165709100071986e-05, |
| "loss": 0.2876, |
| "num_input_tokens_seen": 484032, |
| "step": 1700 |
| }, |
| { |
| "epoch": 3.1809701492537314, |
| "grad_norm": 3.640956401824951, |
| "learning_rate": 4.310968496397284e-05, |
| "loss": 0.3981, |
| "num_input_tokens_seen": 485440, |
| "step": 1705 |
| }, |
| { |
| "epoch": 3.1902985074626864, |
| "grad_norm": 2.416405439376831, |
| "learning_rate": 4.305346881264785e-05, |
| "loss": 0.3107, |
| "num_input_tokens_seen": 486848, |
| "step": 1710 |
| }, |
| { |
| "epoch": 3.199626865671642, |
| "grad_norm": 3.496523857116699, |
| "learning_rate": 4.299706124215138e-05, |
| "loss": 0.374, |
| "num_input_tokens_seen": 488192, |
| "step": 1715 |
| }, |
| { |
| "epoch": 3.208955223880597, |
| "grad_norm": 5.093874454498291, |
| "learning_rate": 4.294046285056742e-05, |
| "loss": 0.4224, |
| "num_input_tokens_seen": 489696, |
| "step": 1720 |
| }, |
| { |
| "epoch": 3.218283582089552, |
| "grad_norm": 5.35825777053833, |
| "learning_rate": 4.288367423800319e-05, |
| "loss": 0.5943, |
| "num_input_tokens_seen": 491008, |
| "step": 1725 |
| }, |
| { |
| "epoch": 3.2276119402985075, |
| "grad_norm": 2.8824002742767334, |
| "learning_rate": 4.2826696006582825e-05, |
| "loss": 0.2812, |
| "num_input_tokens_seen": 492576, |
| "step": 1730 |
| }, |
| { |
| "epoch": 3.236940298507463, |
| "grad_norm": 3.4041173458099365, |
| "learning_rate": 4.276952876044096e-05, |
| "loss": 0.5292, |
| "num_input_tokens_seen": 493824, |
| "step": 1735 |
| }, |
| { |
| "epoch": 3.246268656716418, |
| "grad_norm": 2.9648215770721436, |
| "learning_rate": 4.2712173105716346e-05, |
| "loss": 0.3369, |
| "num_input_tokens_seen": 495200, |
| "step": 1740 |
| }, |
| { |
| "epoch": 3.2555970149253732, |
| "grad_norm": 2.6128084659576416, |
| "learning_rate": 4.265462965054539e-05, |
| "loss": 0.4209, |
| "num_input_tokens_seen": 496576, |
| "step": 1745 |
| }, |
| { |
| "epoch": 3.264925373134328, |
| "grad_norm": 2.278860330581665, |
| "learning_rate": 4.259689900505576e-05, |
| "loss": 0.4414, |
| "num_input_tokens_seen": 498112, |
| "step": 1750 |
| }, |
| { |
| "epoch": 3.2742537313432836, |
| "grad_norm": 2.604008197784424, |
| "learning_rate": 4.253898178135985e-05, |
| "loss": 0.3195, |
| "num_input_tokens_seen": 499552, |
| "step": 1755 |
| }, |
| { |
| "epoch": 3.283582089552239, |
| "grad_norm": 2.744481086730957, |
| "learning_rate": 4.2480878593548344e-05, |
| "loss": 0.2242, |
| "num_input_tokens_seen": 500960, |
| "step": 1760 |
| }, |
| { |
| "epoch": 3.292910447761194, |
| "grad_norm": 5.318899631500244, |
| "learning_rate": 4.24225900576837e-05, |
| "loss": 0.2398, |
| "num_input_tokens_seen": 502208, |
| "step": 1765 |
| }, |
| { |
| "epoch": 3.3022388059701493, |
| "grad_norm": 2.088782548904419, |
| "learning_rate": 4.236411679179357e-05, |
| "loss": 0.3478, |
| "num_input_tokens_seen": 503808, |
| "step": 1770 |
| }, |
| { |
| "epoch": 3.3115671641791042, |
| "grad_norm": 2.437868356704712, |
| "learning_rate": 4.230545941586431e-05, |
| "loss": 0.2478, |
| "num_input_tokens_seen": 505248, |
| "step": 1775 |
| }, |
| { |
| "epoch": 3.3208955223880596, |
| "grad_norm": 2.555248260498047, |
| "learning_rate": 4.224661855183435e-05, |
| "loss": 0.1955, |
| "num_input_tokens_seen": 507072, |
| "step": 1780 |
| }, |
| { |
| "epoch": 3.330223880597015, |
| "grad_norm": 7.7683539390563965, |
| "learning_rate": 4.218759482358765e-05, |
| "loss": 0.3727, |
| "num_input_tokens_seen": 508576, |
| "step": 1785 |
| }, |
| { |
| "epoch": 3.33955223880597, |
| "grad_norm": 4.428984642028809, |
| "learning_rate": 4.212838885694705e-05, |
| "loss": 0.4217, |
| "num_input_tokens_seen": 509824, |
| "step": 1790 |
| }, |
| { |
| "epoch": 3.3488805970149254, |
| "grad_norm": 3.0158886909484863, |
| "learning_rate": 4.206900127966764e-05, |
| "loss": 0.2859, |
| "num_input_tokens_seen": 511392, |
| "step": 1795 |
| }, |
| { |
| "epoch": 3.3582089552238807, |
| "grad_norm": 7.137477874755859, |
| "learning_rate": 4.200943272143013e-05, |
| "loss": 0.3853, |
| "num_input_tokens_seen": 512672, |
| "step": 1800 |
| }, |
| { |
| "epoch": 3.3675373134328357, |
| "grad_norm": 2.7308247089385986, |
| "learning_rate": 4.194968381383414e-05, |
| "loss": 0.2724, |
| "num_input_tokens_seen": 514144, |
| "step": 1805 |
| }, |
| { |
| "epoch": 3.376865671641791, |
| "grad_norm": 5.041729927062988, |
| "learning_rate": 4.188975519039151e-05, |
| "loss": 0.3753, |
| "num_input_tokens_seen": 515584, |
| "step": 1810 |
| }, |
| { |
| "epoch": 3.3861940298507465, |
| "grad_norm": 3.7106144428253174, |
| "learning_rate": 4.1829647486519596e-05, |
| "loss": 0.296, |
| "num_input_tokens_seen": 516928, |
| "step": 1815 |
| }, |
| { |
| "epoch": 3.3955223880597014, |
| "grad_norm": 4.017701148986816, |
| "learning_rate": 4.176936133953454e-05, |
| "loss": 0.2825, |
| "num_input_tokens_seen": 518432, |
| "step": 1820 |
| }, |
| { |
| "epoch": 3.404850746268657, |
| "grad_norm": 3.9543960094451904, |
| "learning_rate": 4.170889738864448e-05, |
| "loss": 0.3754, |
| "num_input_tokens_seen": 519904, |
| "step": 1825 |
| }, |
| { |
| "epoch": 3.4141791044776117, |
| "grad_norm": 6.449535369873047, |
| "learning_rate": 4.16482562749428e-05, |
| "loss": 0.3, |
| "num_input_tokens_seen": 521472, |
| "step": 1830 |
| }, |
| { |
| "epoch": 3.423507462686567, |
| "grad_norm": 3.069199800491333, |
| "learning_rate": 4.158743864140131e-05, |
| "loss": 0.4347, |
| "num_input_tokens_seen": 522848, |
| "step": 1835 |
| }, |
| { |
| "epoch": 3.4328358208955225, |
| "grad_norm": 6.249277114868164, |
| "learning_rate": 4.152644513286348e-05, |
| "loss": 0.4153, |
| "num_input_tokens_seen": 524160, |
| "step": 1840 |
| }, |
| { |
| "epoch": 3.4421641791044775, |
| "grad_norm": 5.4783782958984375, |
| "learning_rate": 4.146527639603751e-05, |
| "loss": 0.4303, |
| "num_input_tokens_seen": 525504, |
| "step": 1845 |
| }, |
| { |
| "epoch": 3.451492537313433, |
| "grad_norm": 3.1311559677124023, |
| "learning_rate": 4.1403933079489585e-05, |
| "loss": 0.2777, |
| "num_input_tokens_seen": 526816, |
| "step": 1850 |
| }, |
| { |
| "epoch": 3.4608208955223883, |
| "grad_norm": 21.318811416625977, |
| "learning_rate": 4.1342415833636904e-05, |
| "loss": 0.5387, |
| "num_input_tokens_seen": 528224, |
| "step": 1855 |
| }, |
| { |
| "epoch": 3.470149253731343, |
| "grad_norm": 3.1834213733673096, |
| "learning_rate": 4.128072531074084e-05, |
| "loss": 0.4099, |
| "num_input_tokens_seen": 529504, |
| "step": 1860 |
| }, |
| { |
| "epoch": 3.4794776119402986, |
| "grad_norm": 3.155311107635498, |
| "learning_rate": 4.121886216489998e-05, |
| "loss": 0.4934, |
| "num_input_tokens_seen": 531072, |
| "step": 1865 |
| }, |
| { |
| "epoch": 3.4888059701492535, |
| "grad_norm": 2.414945602416992, |
| "learning_rate": 4.115682705204326e-05, |
| "loss": 0.2416, |
| "num_input_tokens_seen": 532832, |
| "step": 1870 |
| }, |
| { |
| "epoch": 3.498134328358209, |
| "grad_norm": 3.1543750762939453, |
| "learning_rate": 4.109462062992293e-05, |
| "loss": 0.3483, |
| "num_input_tokens_seen": 534272, |
| "step": 1875 |
| }, |
| { |
| "epoch": 3.5, |
| "eval_loss": 0.6248273253440857, |
| "eval_runtime": 2.846, |
| "eval_samples_per_second": 83.626, |
| "eval_steps_per_second": 21.082, |
| "num_input_tokens_seen": 534688, |
| "step": 1876 |
| }, |
| { |
| "epoch": 3.5074626865671643, |
| "grad_norm": 3.662583589553833, |
| "learning_rate": 4.103224355810761e-05, |
| "loss": 0.4059, |
| "num_input_tokens_seen": 535872, |
| "step": 1880 |
| }, |
| { |
| "epoch": 3.5167910447761193, |
| "grad_norm": 6.081570625305176, |
| "learning_rate": 4.096969649797534e-05, |
| "loss": 0.3749, |
| "num_input_tokens_seen": 537376, |
| "step": 1885 |
| }, |
| { |
| "epoch": 3.5261194029850746, |
| "grad_norm": 3.688330888748169, |
| "learning_rate": 4.0906980112706494e-05, |
| "loss": 0.5712, |
| "num_input_tokens_seen": 538880, |
| "step": 1890 |
| }, |
| { |
| "epoch": 3.53544776119403, |
| "grad_norm": 10.374156951904297, |
| "learning_rate": 4.08440950672768e-05, |
| "loss": 0.2894, |
| "num_input_tokens_seen": 540512, |
| "step": 1895 |
| }, |
| { |
| "epoch": 3.544776119402985, |
| "grad_norm": 4.767528057098389, |
| "learning_rate": 4.078104202845027e-05, |
| "loss": 0.3879, |
| "num_input_tokens_seen": 541920, |
| "step": 1900 |
| }, |
| { |
| "epoch": 3.5541044776119404, |
| "grad_norm": 6.066161632537842, |
| "learning_rate": 4.071782166477213e-05, |
| "loss": 0.4177, |
| "num_input_tokens_seen": 543328, |
| "step": 1905 |
| }, |
| { |
| "epoch": 3.5634328358208958, |
| "grad_norm": 5.299854278564453, |
| "learning_rate": 4.065443464656174e-05, |
| "loss": 0.3341, |
| "num_input_tokens_seen": 544800, |
| "step": 1910 |
| }, |
| { |
| "epoch": 3.5727611940298507, |
| "grad_norm": 6.5161051750183105, |
| "learning_rate": 4.0590881645905475e-05, |
| "loss": 0.4716, |
| "num_input_tokens_seen": 546208, |
| "step": 1915 |
| }, |
| { |
| "epoch": 3.582089552238806, |
| "grad_norm": 4.15543270111084, |
| "learning_rate": 4.052716333664963e-05, |
| "loss": 0.265, |
| "num_input_tokens_seen": 547616, |
| "step": 1920 |
| }, |
| { |
| "epoch": 3.591417910447761, |
| "grad_norm": 5.431297302246094, |
| "learning_rate": 4.046328039439321e-05, |
| "loss": 0.4432, |
| "num_input_tokens_seen": 549088, |
| "step": 1925 |
| }, |
| { |
| "epoch": 3.6007462686567164, |
| "grad_norm": 2.523219347000122, |
| "learning_rate": 4.039923349648084e-05, |
| "loss": 0.3204, |
| "num_input_tokens_seen": 550464, |
| "step": 1930 |
| }, |
| { |
| "epoch": 3.6100746268656714, |
| "grad_norm": 7.0382513999938965, |
| "learning_rate": 4.0335023321995545e-05, |
| "loss": 0.2728, |
| "num_input_tokens_seen": 552032, |
| "step": 1935 |
| }, |
| { |
| "epoch": 3.6194029850746268, |
| "grad_norm": 5.659396648406982, |
| "learning_rate": 4.0270650551751546e-05, |
| "loss": 0.4712, |
| "num_input_tokens_seen": 553344, |
| "step": 1940 |
| }, |
| { |
| "epoch": 3.628731343283582, |
| "grad_norm": 4.22886848449707, |
| "learning_rate": 4.020611586828705e-05, |
| "loss": 0.344, |
| "num_input_tokens_seen": 554592, |
| "step": 1945 |
| }, |
| { |
| "epoch": 3.638059701492537, |
| "grad_norm": 3.0496978759765625, |
| "learning_rate": 4.0141419955857044e-05, |
| "loss": 0.3534, |
| "num_input_tokens_seen": 556064, |
| "step": 1950 |
| }, |
| { |
| "epoch": 3.6473880597014925, |
| "grad_norm": 2.605456829071045, |
| "learning_rate": 4.007656350042595e-05, |
| "loss": 0.2536, |
| "num_input_tokens_seen": 557536, |
| "step": 1955 |
| }, |
| { |
| "epoch": 3.656716417910448, |
| "grad_norm": 4.017327308654785, |
| "learning_rate": 4.001154718966048e-05, |
| "loss": 0.3313, |
| "num_input_tokens_seen": 558976, |
| "step": 1960 |
| }, |
| { |
| "epoch": 3.666044776119403, |
| "grad_norm": 3.715014934539795, |
| "learning_rate": 3.994637171292223e-05, |
| "loss": 0.3304, |
| "num_input_tokens_seen": 560192, |
| "step": 1965 |
| }, |
| { |
| "epoch": 3.675373134328358, |
| "grad_norm": 7.917237758636475, |
| "learning_rate": 3.988103776126042e-05, |
| "loss": 0.36, |
| "num_input_tokens_seen": 561472, |
| "step": 1970 |
| }, |
| { |
| "epoch": 3.6847014925373136, |
| "grad_norm": 6.506331443786621, |
| "learning_rate": 3.9815546027404603e-05, |
| "loss": 0.4706, |
| "num_input_tokens_seen": 562912, |
| "step": 1975 |
| }, |
| { |
| "epoch": 3.6940298507462686, |
| "grad_norm": 4.329020023345947, |
| "learning_rate": 3.974989720575724e-05, |
| "loss": 0.3231, |
| "num_input_tokens_seen": 564288, |
| "step": 1980 |
| }, |
| { |
| "epoch": 3.703358208955224, |
| "grad_norm": 3.0138943195343018, |
| "learning_rate": 3.9684091992386393e-05, |
| "loss": 0.3898, |
| "num_input_tokens_seen": 565664, |
| "step": 1985 |
| }, |
| { |
| "epoch": 3.7126865671641793, |
| "grad_norm": 4.0429534912109375, |
| "learning_rate": 3.961813108501833e-05, |
| "loss": 0.488, |
| "num_input_tokens_seen": 567040, |
| "step": 1990 |
| }, |
| { |
| "epoch": 3.7220149253731343, |
| "grad_norm": 3.257197380065918, |
| "learning_rate": 3.9552015183030136e-05, |
| "loss": 0.3438, |
| "num_input_tokens_seen": 568320, |
| "step": 1995 |
| }, |
| { |
| "epoch": 3.7313432835820897, |
| "grad_norm": 3.7157187461853027, |
| "learning_rate": 3.9485744987442266e-05, |
| "loss": 0.4853, |
| "num_input_tokens_seen": 569696, |
| "step": 2000 |
| }, |
| { |
| "epoch": 3.7406716417910446, |
| "grad_norm": 2.6363704204559326, |
| "learning_rate": 3.9419321200911155e-05, |
| "loss": 0.3174, |
| "num_input_tokens_seen": 571296, |
| "step": 2005 |
| }, |
| { |
| "epoch": 3.75, |
| "grad_norm": 4.168699741363525, |
| "learning_rate": 3.9352744527721754e-05, |
| "loss": 0.3712, |
| "num_input_tokens_seen": 572768, |
| "step": 2010 |
| }, |
| { |
| "epoch": 3.7593283582089554, |
| "grad_norm": 3.1416618824005127, |
| "learning_rate": 3.928601567378003e-05, |
| "loss": 0.2522, |
| "num_input_tokens_seen": 574080, |
| "step": 2015 |
| }, |
| { |
| "epoch": 3.7686567164179103, |
| "grad_norm": 8.181133270263672, |
| "learning_rate": 3.921913534660552e-05, |
| "loss": 0.3402, |
| "num_input_tokens_seen": 575456, |
| "step": 2020 |
| }, |
| { |
| "epoch": 3.7779850746268657, |
| "grad_norm": 5.146003246307373, |
| "learning_rate": 3.915210425532383e-05, |
| "loss": 0.3121, |
| "num_input_tokens_seen": 576864, |
| "step": 2025 |
| }, |
| { |
| "epoch": 3.7873134328358207, |
| "grad_norm": 5.798357963562012, |
| "learning_rate": 3.908492311065909e-05, |
| "loss": 0.2784, |
| "num_input_tokens_seen": 578304, |
| "step": 2030 |
| }, |
| { |
| "epoch": 3.796641791044776, |
| "grad_norm": 2.587034225463867, |
| "learning_rate": 3.901759262492643e-05, |
| "loss": 0.1437, |
| "num_input_tokens_seen": 579744, |
| "step": 2035 |
| }, |
| { |
| "epoch": 3.8059701492537314, |
| "grad_norm": 6.835939407348633, |
| "learning_rate": 3.895011351202443e-05, |
| "loss": 0.4146, |
| "num_input_tokens_seen": 581088, |
| "step": 2040 |
| }, |
| { |
| "epoch": 3.8152985074626864, |
| "grad_norm": 7.292059421539307, |
| "learning_rate": 3.888248648742756e-05, |
| "loss": 0.372, |
| "num_input_tokens_seen": 582560, |
| "step": 2045 |
| }, |
| { |
| "epoch": 3.824626865671642, |
| "grad_norm": 3.776172399520874, |
| "learning_rate": 3.881471226817858e-05, |
| "loss": 0.3083, |
| "num_input_tokens_seen": 584000, |
| "step": 2050 |
| }, |
| { |
| "epoch": 3.833955223880597, |
| "grad_norm": 6.265244960784912, |
| "learning_rate": 3.874679157288092e-05, |
| "loss": 0.5771, |
| "num_input_tokens_seen": 585248, |
| "step": 2055 |
| }, |
| { |
| "epoch": 3.843283582089552, |
| "grad_norm": 4.3855085372924805, |
| "learning_rate": 3.86787251216911e-05, |
| "loss": 0.4541, |
| "num_input_tokens_seen": 586720, |
| "step": 2060 |
| }, |
| { |
| "epoch": 3.8526119402985075, |
| "grad_norm": 4.705705165863037, |
| "learning_rate": 3.8610513636311073e-05, |
| "loss": 0.303, |
| "num_input_tokens_seen": 588352, |
| "step": 2065 |
| }, |
| { |
| "epoch": 3.861940298507463, |
| "grad_norm": 5.55418062210083, |
| "learning_rate": 3.854215783998058e-05, |
| "loss": 0.3534, |
| "num_input_tokens_seen": 589632, |
| "step": 2070 |
| }, |
| { |
| "epoch": 3.871268656716418, |
| "grad_norm": 1.596548318862915, |
| "learning_rate": 3.8473658457469466e-05, |
| "loss": 0.3093, |
| "num_input_tokens_seen": 591072, |
| "step": 2075 |
| }, |
| { |
| "epoch": 3.8805970149253732, |
| "grad_norm": 3.642246723175049, |
| "learning_rate": 3.840501621507003e-05, |
| "loss": 0.2841, |
| "num_input_tokens_seen": 592544, |
| "step": 2080 |
| }, |
| { |
| "epoch": 3.8899253731343286, |
| "grad_norm": 5.0441484451293945, |
| "learning_rate": 3.833623184058926e-05, |
| "loss": 0.2935, |
| "num_input_tokens_seen": 594112, |
| "step": 2085 |
| }, |
| { |
| "epoch": 3.8992537313432836, |
| "grad_norm": 4.8261895179748535, |
| "learning_rate": 3.826730606334119e-05, |
| "loss": 0.2972, |
| "num_input_tokens_seen": 595744, |
| "step": 2090 |
| }, |
| { |
| "epoch": 3.908582089552239, |
| "grad_norm": 4.6991119384765625, |
| "learning_rate": 3.819823961413912e-05, |
| "loss": 0.4074, |
| "num_input_tokens_seen": 597184, |
| "step": 2095 |
| }, |
| { |
| "epoch": 3.917910447761194, |
| "grad_norm": 3.095750331878662, |
| "learning_rate": 3.812903322528789e-05, |
| "loss": 0.3549, |
| "num_input_tokens_seen": 598816, |
| "step": 2100 |
| }, |
| { |
| "epoch": 3.9272388059701493, |
| "grad_norm": 4.424613952636719, |
| "learning_rate": 3.805968763057609e-05, |
| "loss": 0.5034, |
| "num_input_tokens_seen": 600192, |
| "step": 2105 |
| }, |
| { |
| "epoch": 3.9365671641791042, |
| "grad_norm": 3.4246561527252197, |
| "learning_rate": 3.7990203565268314e-05, |
| "loss": 0.3196, |
| "num_input_tokens_seen": 601472, |
| "step": 2110 |
| }, |
| { |
| "epoch": 3.9458955223880596, |
| "grad_norm": 3.3944056034088135, |
| "learning_rate": 3.792058176609734e-05, |
| "loss": 0.3427, |
| "num_input_tokens_seen": 602880, |
| "step": 2115 |
| }, |
| { |
| "epoch": 3.955223880597015, |
| "grad_norm": 7.415956497192383, |
| "learning_rate": 3.785082297125631e-05, |
| "loss": 0.4636, |
| "num_input_tokens_seen": 604416, |
| "step": 2120 |
| }, |
| { |
| "epoch": 3.96455223880597, |
| "grad_norm": 3.31691312789917, |
| "learning_rate": 3.7780927920390964e-05, |
| "loss": 0.4663, |
| "num_input_tokens_seen": 605696, |
| "step": 2125 |
| }, |
| { |
| "epoch": 3.9738805970149254, |
| "grad_norm": 2.998872756958008, |
| "learning_rate": 3.771089735459168e-05, |
| "loss": 0.29, |
| "num_input_tokens_seen": 607040, |
| "step": 2130 |
| }, |
| { |
| "epoch": 3.9832089552238807, |
| "grad_norm": 2.857823371887207, |
| "learning_rate": 3.7640732016385745e-05, |
| "loss": 0.351, |
| "num_input_tokens_seen": 608608, |
| "step": 2135 |
| }, |
| { |
| "epoch": 3.9925373134328357, |
| "grad_norm": 4.320472717285156, |
| "learning_rate": 3.757043264972941e-05, |
| "loss": 0.2366, |
| "num_input_tokens_seen": 609984, |
| "step": 2140 |
| }, |
| { |
| "epoch": 4.0, |
| "eval_loss": 0.6150075197219849, |
| "eval_runtime": 2.841, |
| "eval_samples_per_second": 83.773, |
| "eval_steps_per_second": 21.119, |
| "num_input_tokens_seen": 610944, |
| "step": 2144 |
| }, |
| { |
| "epoch": 4.001865671641791, |
| "grad_norm": 2.3480653762817383, |
| "learning_rate": 3.7500000000000003e-05, |
| "loss": 0.1939, |
| "num_input_tokens_seen": 611200, |
| "step": 2145 |
| }, |
| { |
| "epoch": 4.0111940298507465, |
| "grad_norm": 2.5952515602111816, |
| "learning_rate": 3.742943481398805e-05, |
| "loss": 0.1706, |
| "num_input_tokens_seen": 612512, |
| "step": 2150 |
| }, |
| { |
| "epoch": 4.020522388059701, |
| "grad_norm": 1.5225565433502197, |
| "learning_rate": 3.7358737839889356e-05, |
| "loss": 0.2817, |
| "num_input_tokens_seen": 613920, |
| "step": 2155 |
| }, |
| { |
| "epoch": 4.029850746268656, |
| "grad_norm": 3.9663429260253906, |
| "learning_rate": 3.728790982729705e-05, |
| "loss": 0.2205, |
| "num_input_tokens_seen": 615264, |
| "step": 2160 |
| }, |
| { |
| "epoch": 4.039179104477612, |
| "grad_norm": 3.3339385986328125, |
| "learning_rate": 3.721695152719364e-05, |
| "loss": 0.3802, |
| "num_input_tokens_seen": 616576, |
| "step": 2165 |
| }, |
| { |
| "epoch": 4.048507462686567, |
| "grad_norm": 4.832662105560303, |
| "learning_rate": 3.7145863691943076e-05, |
| "loss": 0.2195, |
| "num_input_tokens_seen": 617984, |
| "step": 2170 |
| }, |
| { |
| "epoch": 4.057835820895522, |
| "grad_norm": 5.448497772216797, |
| "learning_rate": 3.707464707528275e-05, |
| "loss": 0.1588, |
| "num_input_tokens_seen": 619520, |
| "step": 2175 |
| }, |
| { |
| "epoch": 4.067164179104478, |
| "grad_norm": 3.924748182296753, |
| "learning_rate": 3.700330243231552e-05, |
| "loss": 0.2589, |
| "num_input_tokens_seen": 620864, |
| "step": 2180 |
| }, |
| { |
| "epoch": 4.076492537313433, |
| "grad_norm": 5.021126747131348, |
| "learning_rate": 3.6931830519501685e-05, |
| "loss": 0.3013, |
| "num_input_tokens_seen": 622272, |
| "step": 2185 |
| }, |
| { |
| "epoch": 4.085820895522388, |
| "grad_norm": 4.3957366943359375, |
| "learning_rate": 3.686023209465096e-05, |
| "loss": 0.135, |
| "num_input_tokens_seen": 623840, |
| "step": 2190 |
| }, |
| { |
| "epoch": 4.095149253731344, |
| "grad_norm": 5.3109917640686035, |
| "learning_rate": 3.678850791691448e-05, |
| "loss": 0.2931, |
| "num_input_tokens_seen": 625216, |
| "step": 2195 |
| }, |
| { |
| "epoch": 4.104477611940299, |
| "grad_norm": 3.735299587249756, |
| "learning_rate": 3.671665874677673e-05, |
| "loss": 0.2086, |
| "num_input_tokens_seen": 626592, |
| "step": 2200 |
| }, |
| { |
| "epoch": 4.1138059701492535, |
| "grad_norm": 4.386044979095459, |
| "learning_rate": 3.664468534604745e-05, |
| "loss": 0.2092, |
| "num_input_tokens_seen": 627968, |
| "step": 2205 |
| }, |
| { |
| "epoch": 4.123134328358209, |
| "grad_norm": 4.557720184326172, |
| "learning_rate": 3.65725884778536e-05, |
| "loss": 0.2979, |
| "num_input_tokens_seen": 629248, |
| "step": 2210 |
| }, |
| { |
| "epoch": 4.132462686567164, |
| "grad_norm": 4.342172622680664, |
| "learning_rate": 3.650036890663124e-05, |
| "loss": 0.2427, |
| "num_input_tokens_seen": 631040, |
| "step": 2215 |
| }, |
| { |
| "epoch": 4.141791044776119, |
| "grad_norm": 5.317047119140625, |
| "learning_rate": 3.642802739811747e-05, |
| "loss": 0.1704, |
| "num_input_tokens_seen": 632320, |
| "step": 2220 |
| }, |
| { |
| "epoch": 4.151119402985074, |
| "grad_norm": 4.969810962677002, |
| "learning_rate": 3.635556471934224e-05, |
| "loss": 0.3303, |
| "num_input_tokens_seen": 633792, |
| "step": 2225 |
| }, |
| { |
| "epoch": 4.16044776119403, |
| "grad_norm": 3.9854063987731934, |
| "learning_rate": 3.628298163862029e-05, |
| "loss": 0.1994, |
| "num_input_tokens_seen": 635328, |
| "step": 2230 |
| }, |
| { |
| "epoch": 4.169776119402985, |
| "grad_norm": 3.2962820529937744, |
| "learning_rate": 3.621027892554295e-05, |
| "loss": 0.2023, |
| "num_input_tokens_seen": 636864, |
| "step": 2235 |
| }, |
| { |
| "epoch": 4.17910447761194, |
| "grad_norm": 2.729600429534912, |
| "learning_rate": 3.613745735096999e-05, |
| "loss": 0.1903, |
| "num_input_tokens_seen": 638112, |
| "step": 2240 |
| }, |
| { |
| "epoch": 4.188432835820896, |
| "grad_norm": 2.658935308456421, |
| "learning_rate": 3.606451768702151e-05, |
| "loss": 0.2606, |
| "num_input_tokens_seen": 639456, |
| "step": 2245 |
| }, |
| { |
| "epoch": 4.197761194029851, |
| "grad_norm": 5.2288289070129395, |
| "learning_rate": 3.599146070706964e-05, |
| "loss": 0.2769, |
| "num_input_tokens_seen": 640736, |
| "step": 2250 |
| }, |
| { |
| "epoch": 4.207089552238806, |
| "grad_norm": 4.973333835601807, |
| "learning_rate": 3.5918287185730414e-05, |
| "loss": 0.2722, |
| "num_input_tokens_seen": 642208, |
| "step": 2255 |
| }, |
| { |
| "epoch": 4.2164179104477615, |
| "grad_norm": 3.9337849617004395, |
| "learning_rate": 3.5844997898855566e-05, |
| "loss": 0.1483, |
| "num_input_tokens_seen": 643616, |
| "step": 2260 |
| }, |
| { |
| "epoch": 4.225746268656716, |
| "grad_norm": 5.327485084533691, |
| "learning_rate": 3.5771593623524265e-05, |
| "loss": 0.2416, |
| "num_input_tokens_seen": 645024, |
| "step": 2265 |
| }, |
| { |
| "epoch": 4.235074626865671, |
| "grad_norm": 4.1554484367370605, |
| "learning_rate": 3.569807513803488e-05, |
| "loss": 0.2972, |
| "num_input_tokens_seen": 646464, |
| "step": 2270 |
| }, |
| { |
| "epoch": 4.244402985074627, |
| "grad_norm": 3.438372850418091, |
| "learning_rate": 3.5624443221896776e-05, |
| "loss": 0.2882, |
| "num_input_tokens_seen": 647968, |
| "step": 2275 |
| }, |
| { |
| "epoch": 4.253731343283582, |
| "grad_norm": 4.879497051239014, |
| "learning_rate": 3.555069865582197e-05, |
| "loss": 0.259, |
| "num_input_tokens_seen": 649312, |
| "step": 2280 |
| }, |
| { |
| "epoch": 4.263059701492537, |
| "grad_norm": 5.402458667755127, |
| "learning_rate": 3.547684222171692e-05, |
| "loss": 0.4159, |
| "num_input_tokens_seen": 650656, |
| "step": 2285 |
| }, |
| { |
| "epoch": 4.272388059701493, |
| "grad_norm": 3.2078697681427, |
| "learning_rate": 3.54028747026742e-05, |
| "loss": 0.3347, |
| "num_input_tokens_seen": 652032, |
| "step": 2290 |
| }, |
| { |
| "epoch": 4.281716417910448, |
| "grad_norm": 2.500237464904785, |
| "learning_rate": 3.532879688296421e-05, |
| "loss": 0.1921, |
| "num_input_tokens_seen": 653504, |
| "step": 2295 |
| }, |
| { |
| "epoch": 4.291044776119403, |
| "grad_norm": 5.173084259033203, |
| "learning_rate": 3.5254609548026865e-05, |
| "loss": 0.1822, |
| "num_input_tokens_seen": 654912, |
| "step": 2300 |
| }, |
| { |
| "epoch": 4.300373134328359, |
| "grad_norm": 3.935835599899292, |
| "learning_rate": 3.518031348446324e-05, |
| "loss": 0.3313, |
| "num_input_tokens_seen": 656256, |
| "step": 2305 |
| }, |
| { |
| "epoch": 4.309701492537314, |
| "grad_norm": 7.697382926940918, |
| "learning_rate": 3.5105909480027276e-05, |
| "loss": 0.1935, |
| "num_input_tokens_seen": 657792, |
| "step": 2310 |
| }, |
| { |
| "epoch": 4.3190298507462686, |
| "grad_norm": 1.638433575630188, |
| "learning_rate": 3.5031398323617366e-05, |
| "loss": 0.1713, |
| "num_input_tokens_seen": 659264, |
| "step": 2315 |
| }, |
| { |
| "epoch": 4.3283582089552235, |
| "grad_norm": 6.272957801818848, |
| "learning_rate": 3.4956780805268066e-05, |
| "loss": 0.3794, |
| "num_input_tokens_seen": 660640, |
| "step": 2320 |
| }, |
| { |
| "epoch": 4.337686567164179, |
| "grad_norm": 4.078192234039307, |
| "learning_rate": 3.4882057716141635e-05, |
| "loss": 0.2836, |
| "num_input_tokens_seen": 662016, |
| "step": 2325 |
| }, |
| { |
| "epoch": 4.347014925373134, |
| "grad_norm": 4.388688564300537, |
| "learning_rate": 3.480722984851972e-05, |
| "loss": 0.2175, |
| "num_input_tokens_seen": 663392, |
| "step": 2330 |
| }, |
| { |
| "epoch": 4.356343283582089, |
| "grad_norm": 7.101146221160889, |
| "learning_rate": 3.473229799579492e-05, |
| "loss": 0.31, |
| "num_input_tokens_seen": 664928, |
| "step": 2335 |
| }, |
| { |
| "epoch": 4.365671641791045, |
| "grad_norm": 6.327828884124756, |
| "learning_rate": 3.465726295246236e-05, |
| "loss": 0.3994, |
| "num_input_tokens_seen": 666176, |
| "step": 2340 |
| }, |
| { |
| "epoch": 4.375, |
| "grad_norm": 2.0350537300109863, |
| "learning_rate": 3.45821255141113e-05, |
| "loss": 0.2578, |
| "num_input_tokens_seen": 667552, |
| "step": 2345 |
| }, |
| { |
| "epoch": 4.384328358208955, |
| "grad_norm": 4.2583794593811035, |
| "learning_rate": 3.450688647741668e-05, |
| "loss": 0.3207, |
| "num_input_tokens_seen": 669056, |
| "step": 2350 |
| }, |
| { |
| "epoch": 4.393656716417911, |
| "grad_norm": 1.5456268787384033, |
| "learning_rate": 3.443154664013067e-05, |
| "loss": 0.2498, |
| "num_input_tokens_seen": 670592, |
| "step": 2355 |
| }, |
| { |
| "epoch": 4.402985074626866, |
| "grad_norm": 2.0810458660125732, |
| "learning_rate": 3.4356106801074245e-05, |
| "loss": 0.088, |
| "num_input_tokens_seen": 672096, |
| "step": 2360 |
| }, |
| { |
| "epoch": 4.412313432835821, |
| "grad_norm": 4.0025506019592285, |
| "learning_rate": 3.4280567760128656e-05, |
| "loss": 0.5051, |
| "num_input_tokens_seen": 673568, |
| "step": 2365 |
| }, |
| { |
| "epoch": 4.4216417910447765, |
| "grad_norm": 3.0601840019226074, |
| "learning_rate": 3.4204930318227016e-05, |
| "loss": 0.2056, |
| "num_input_tokens_seen": 674976, |
| "step": 2370 |
| }, |
| { |
| "epoch": 4.4309701492537314, |
| "grad_norm": 7.199167251586914, |
| "learning_rate": 3.4129195277345724e-05, |
| "loss": 0.3745, |
| "num_input_tokens_seen": 676288, |
| "step": 2375 |
| }, |
| { |
| "epoch": 4.440298507462686, |
| "grad_norm": 3.746263027191162, |
| "learning_rate": 3.405336344049607e-05, |
| "loss": 0.197, |
| "num_input_tokens_seen": 677888, |
| "step": 2380 |
| }, |
| { |
| "epoch": 4.449626865671641, |
| "grad_norm": 3.992307424545288, |
| "learning_rate": 3.397743561171562e-05, |
| "loss": 0.2493, |
| "num_input_tokens_seen": 679328, |
| "step": 2385 |
| }, |
| { |
| "epoch": 4.458955223880597, |
| "grad_norm": 2.753150701522827, |
| "learning_rate": 3.390141259605975e-05, |
| "loss": 0.2671, |
| "num_input_tokens_seen": 680736, |
| "step": 2390 |
| }, |
| { |
| "epoch": 4.468283582089552, |
| "grad_norm": 1.8495646715164185, |
| "learning_rate": 3.3825295199593084e-05, |
| "loss": 0.1628, |
| "num_input_tokens_seen": 682304, |
| "step": 2395 |
| }, |
| { |
| "epoch": 4.477611940298507, |
| "grad_norm": 5.084019660949707, |
| "learning_rate": 3.3749084229380976e-05, |
| "loss": 0.3361, |
| "num_input_tokens_seen": 683936, |
| "step": 2400 |
| }, |
| { |
| "epoch": 4.486940298507463, |
| "grad_norm": 3.3552839756011963, |
| "learning_rate": 3.367278049348093e-05, |
| "loss": 0.2479, |
| "num_input_tokens_seen": 685312, |
| "step": 2405 |
| }, |
| { |
| "epoch": 4.496268656716418, |
| "grad_norm": 4.067103385925293, |
| "learning_rate": 3.3596384800934e-05, |
| "loss": 0.1721, |
| "num_input_tokens_seen": 686752, |
| "step": 2410 |
| }, |
| { |
| "epoch": 4.5, |
| "eval_loss": 0.6774219274520874, |
| "eval_runtime": 2.8338, |
| "eval_samples_per_second": 83.987, |
| "eval_steps_per_second": 21.173, |
| "num_input_tokens_seen": 687328, |
| "step": 2412 |
| }, |
| { |
| "epoch": 4.505597014925373, |
| "grad_norm": 2.315110683441162, |
| "learning_rate": 3.351989796175628e-05, |
| "loss": 0.2477, |
| "num_input_tokens_seen": 688096, |
| "step": 2415 |
| }, |
| { |
| "epoch": 4.514925373134329, |
| "grad_norm": 3.236654043197632, |
| "learning_rate": 3.3443320786930275e-05, |
| "loss": 0.2619, |
| "num_input_tokens_seen": 689664, |
| "step": 2420 |
| }, |
| { |
| "epoch": 4.524253731343284, |
| "grad_norm": 6.749086856842041, |
| "learning_rate": 3.3366654088396326e-05, |
| "loss": 0.4531, |
| "num_input_tokens_seen": 690912, |
| "step": 2425 |
| }, |
| { |
| "epoch": 4.5335820895522385, |
| "grad_norm": 5.3623223304748535, |
| "learning_rate": 3.328989867904396e-05, |
| "loss": 0.1507, |
| "num_input_tokens_seen": 692320, |
| "step": 2430 |
| }, |
| { |
| "epoch": 4.542910447761194, |
| "grad_norm": 3.595451831817627, |
| "learning_rate": 3.3213055372703305e-05, |
| "loss": 0.2201, |
| "num_input_tokens_seen": 693568, |
| "step": 2435 |
| }, |
| { |
| "epoch": 4.552238805970149, |
| "grad_norm": 5.594698429107666, |
| "learning_rate": 3.313612498413646e-05, |
| "loss": 0.185, |
| "num_input_tokens_seen": 695168, |
| "step": 2440 |
| }, |
| { |
| "epoch": 4.561567164179104, |
| "grad_norm": 7.6441802978515625, |
| "learning_rate": 3.305910832902884e-05, |
| "loss": 0.2157, |
| "num_input_tokens_seen": 696384, |
| "step": 2445 |
| }, |
| { |
| "epoch": 4.57089552238806, |
| "grad_norm": 3.2429683208465576, |
| "learning_rate": 3.298200622398054e-05, |
| "loss": 0.2227, |
| "num_input_tokens_seen": 697792, |
| "step": 2450 |
| }, |
| { |
| "epoch": 4.580223880597015, |
| "grad_norm": 5.2971367835998535, |
| "learning_rate": 3.290481948649767e-05, |
| "loss": 0.2154, |
| "num_input_tokens_seen": 699232, |
| "step": 2455 |
| }, |
| { |
| "epoch": 4.58955223880597, |
| "grad_norm": 3.51114559173584, |
| "learning_rate": 3.282754893498369e-05, |
| "loss": 0.2471, |
| "num_input_tokens_seen": 700704, |
| "step": 2460 |
| }, |
| { |
| "epoch": 4.598880597014926, |
| "grad_norm": 6.7201313972473145, |
| "learning_rate": 3.275019538873071e-05, |
| "loss": 0.3039, |
| "num_input_tokens_seen": 702016, |
| "step": 2465 |
| }, |
| { |
| "epoch": 4.608208955223881, |
| "grad_norm": 2.5125515460968018, |
| "learning_rate": 3.267275966791088e-05, |
| "loss": 0.3126, |
| "num_input_tokens_seen": 703456, |
| "step": 2470 |
| }, |
| { |
| "epoch": 4.617537313432836, |
| "grad_norm": 3.4876139163970947, |
| "learning_rate": 3.259524259356759e-05, |
| "loss": 0.2698, |
| "num_input_tokens_seen": 704896, |
| "step": 2475 |
| }, |
| { |
| "epoch": 4.6268656716417915, |
| "grad_norm": 3.5510451793670654, |
| "learning_rate": 3.251764498760683e-05, |
| "loss": 0.2716, |
| "num_input_tokens_seen": 706272, |
| "step": 2480 |
| }, |
| { |
| "epoch": 4.6361940298507465, |
| "grad_norm": 6.643056869506836, |
| "learning_rate": 3.243996767278846e-05, |
| "loss": 0.1564, |
| "num_input_tokens_seen": 707744, |
| "step": 2485 |
| }, |
| { |
| "epoch": 4.645522388059701, |
| "grad_norm": 3.5300536155700684, |
| "learning_rate": 3.2362211472717484e-05, |
| "loss": 0.3767, |
| "num_input_tokens_seen": 709120, |
| "step": 2490 |
| }, |
| { |
| "epoch": 4.654850746268656, |
| "grad_norm": 3.97326397895813, |
| "learning_rate": 3.228437721183531e-05, |
| "loss": 0.2985, |
| "num_input_tokens_seen": 710592, |
| "step": 2495 |
| }, |
| { |
| "epoch": 4.664179104477612, |
| "grad_norm": 5.200433254241943, |
| "learning_rate": 3.220646571541105e-05, |
| "loss": 0.3666, |
| "num_input_tokens_seen": 712000, |
| "step": 2500 |
| }, |
| { |
| "epoch": 4.673507462686567, |
| "grad_norm": 5.719107627868652, |
| "learning_rate": 3.2128477809532684e-05, |
| "loss": 0.1744, |
| "num_input_tokens_seen": 713408, |
| "step": 2505 |
| }, |
| { |
| "epoch": 4.682835820895522, |
| "grad_norm": 2.513913631439209, |
| "learning_rate": 3.2050414321098385e-05, |
| "loss": 0.2601, |
| "num_input_tokens_seen": 714688, |
| "step": 2510 |
| }, |
| { |
| "epoch": 4.692164179104478, |
| "grad_norm": 6.390174388885498, |
| "learning_rate": 3.197227607780774e-05, |
| "loss": 0.2791, |
| "num_input_tokens_seen": 716128, |
| "step": 2515 |
| }, |
| { |
| "epoch": 4.701492537313433, |
| "grad_norm": 4.924427032470703, |
| "learning_rate": 3.1894063908152916e-05, |
| "loss": 0.3237, |
| "num_input_tokens_seen": 717632, |
| "step": 2520 |
| }, |
| { |
| "epoch": 4.710820895522388, |
| "grad_norm": 7.218212604522705, |
| "learning_rate": 3.181577864140992e-05, |
| "loss": 0.333, |
| "num_input_tokens_seen": 719264, |
| "step": 2525 |
| }, |
| { |
| "epoch": 4.720149253731344, |
| "grad_norm": 3.707828998565674, |
| "learning_rate": 3.173742110762984e-05, |
| "loss": 0.2671, |
| "num_input_tokens_seen": 720832, |
| "step": 2530 |
| }, |
| { |
| "epoch": 4.729477611940299, |
| "grad_norm": 7.342929840087891, |
| "learning_rate": 3.165899213762995e-05, |
| "loss": 0.3259, |
| "num_input_tokens_seen": 722240, |
| "step": 2535 |
| }, |
| { |
| "epoch": 4.7388059701492535, |
| "grad_norm": 3.9566140174865723, |
| "learning_rate": 3.158049256298499e-05, |
| "loss": 0.3972, |
| "num_input_tokens_seen": 723552, |
| "step": 2540 |
| }, |
| { |
| "epoch": 4.7481343283582085, |
| "grad_norm": 5.2011613845825195, |
| "learning_rate": 3.15019232160183e-05, |
| "loss": 0.3483, |
| "num_input_tokens_seen": 724864, |
| "step": 2545 |
| }, |
| { |
| "epoch": 4.757462686567164, |
| "grad_norm": 6.435481548309326, |
| "learning_rate": 3.142328492979301e-05, |
| "loss": 0.1986, |
| "num_input_tokens_seen": 726176, |
| "step": 2550 |
| }, |
| { |
| "epoch": 4.766791044776119, |
| "grad_norm": 3.969621419906616, |
| "learning_rate": 3.134457853810322e-05, |
| "loss": 0.2841, |
| "num_input_tokens_seen": 727680, |
| "step": 2555 |
| }, |
| { |
| "epoch": 4.776119402985074, |
| "grad_norm": 5.346433639526367, |
| "learning_rate": 3.126580487546513e-05, |
| "loss": 0.3486, |
| "num_input_tokens_seen": 729024, |
| "step": 2560 |
| }, |
| { |
| "epoch": 4.78544776119403, |
| "grad_norm": 6.051307201385498, |
| "learning_rate": 3.1186964777108215e-05, |
| "loss": 0.2384, |
| "num_input_tokens_seen": 730304, |
| "step": 2565 |
| }, |
| { |
| "epoch": 4.794776119402985, |
| "grad_norm": 5.0239033699035645, |
| "learning_rate": 3.110805907896637e-05, |
| "loss": 0.201, |
| "num_input_tokens_seen": 731776, |
| "step": 2570 |
| }, |
| { |
| "epoch": 4.80410447761194, |
| "grad_norm": 1.9890495538711548, |
| "learning_rate": 3.1029088617669e-05, |
| "loss": 0.2111, |
| "num_input_tokens_seen": 733152, |
| "step": 2575 |
| }, |
| { |
| "epoch": 4.813432835820896, |
| "grad_norm": 3.9248900413513184, |
| "learning_rate": 3.0950054230532235e-05, |
| "loss": 0.4165, |
| "num_input_tokens_seen": 734432, |
| "step": 2580 |
| }, |
| { |
| "epoch": 4.822761194029851, |
| "grad_norm": 3.7893781661987305, |
| "learning_rate": 3.0870956755549976e-05, |
| "loss": 0.3464, |
| "num_input_tokens_seen": 735808, |
| "step": 2585 |
| }, |
| { |
| "epoch": 4.832089552238806, |
| "grad_norm": 7.637203693389893, |
| "learning_rate": 3.079179703138505e-05, |
| "loss": 0.3467, |
| "num_input_tokens_seen": 737088, |
| "step": 2590 |
| }, |
| { |
| "epoch": 4.8414179104477615, |
| "grad_norm": 3.6080985069274902, |
| "learning_rate": 3.0712575897360304e-05, |
| "loss": 0.26, |
| "num_input_tokens_seen": 738528, |
| "step": 2595 |
| }, |
| { |
| "epoch": 4.850746268656716, |
| "grad_norm": 3.8839356899261475, |
| "learning_rate": 3.0633294193449695e-05, |
| "loss": 0.3099, |
| "num_input_tokens_seen": 740128, |
| "step": 2600 |
| }, |
| { |
| "epoch": 4.860074626865671, |
| "grad_norm": 3.8288557529449463, |
| "learning_rate": 3.0553952760269426e-05, |
| "loss": 0.2701, |
| "num_input_tokens_seen": 741760, |
| "step": 2605 |
| }, |
| { |
| "epoch": 4.869402985074627, |
| "grad_norm": 6.6330366134643555, |
| "learning_rate": 3.0474552439068978e-05, |
| "loss": 0.2656, |
| "num_input_tokens_seen": 743136, |
| "step": 2610 |
| }, |
| { |
| "epoch": 4.878731343283582, |
| "grad_norm": 4.981152057647705, |
| "learning_rate": 3.039509407172222e-05, |
| "loss": 0.2156, |
| "num_input_tokens_seen": 744384, |
| "step": 2615 |
| }, |
| { |
| "epoch": 4.888059701492537, |
| "grad_norm": 4.150975227355957, |
| "learning_rate": 3.0315578500718476e-05, |
| "loss": 0.3084, |
| "num_input_tokens_seen": 745888, |
| "step": 2620 |
| }, |
| { |
| "epoch": 4.897388059701493, |
| "grad_norm": 5.988077640533447, |
| "learning_rate": 3.0236006569153617e-05, |
| "loss": 0.3035, |
| "num_input_tokens_seen": 747360, |
| "step": 2625 |
| }, |
| { |
| "epoch": 4.906716417910448, |
| "grad_norm": 6.701168060302734, |
| "learning_rate": 3.0156379120721068e-05, |
| "loss": 0.4183, |
| "num_input_tokens_seen": 748576, |
| "step": 2630 |
| }, |
| { |
| "epoch": 4.916044776119403, |
| "grad_norm": 4.422663688659668, |
| "learning_rate": 3.0076696999702913e-05, |
| "loss": 0.2044, |
| "num_input_tokens_seen": 749792, |
| "step": 2635 |
| }, |
| { |
| "epoch": 4.925373134328359, |
| "grad_norm": 8.353050231933594, |
| "learning_rate": 2.9996961050960932e-05, |
| "loss": 0.1865, |
| "num_input_tokens_seen": 751200, |
| "step": 2640 |
| }, |
| { |
| "epoch": 4.934701492537314, |
| "grad_norm": 5.12870454788208, |
| "learning_rate": 2.9917172119927606e-05, |
| "loss": 0.2289, |
| "num_input_tokens_seen": 752608, |
| "step": 2645 |
| }, |
| { |
| "epoch": 4.9440298507462686, |
| "grad_norm": 4.996561527252197, |
| "learning_rate": 2.9837331052597224e-05, |
| "loss": 0.2604, |
| "num_input_tokens_seen": 754144, |
| "step": 2650 |
| }, |
| { |
| "epoch": 4.9533582089552235, |
| "grad_norm": 3.875826835632324, |
| "learning_rate": 2.9757438695516816e-05, |
| "loss": 0.2741, |
| "num_input_tokens_seen": 755584, |
| "step": 2655 |
| }, |
| { |
| "epoch": 4.962686567164179, |
| "grad_norm": 4.2091755867004395, |
| "learning_rate": 2.9677495895777286e-05, |
| "loss": 0.2509, |
| "num_input_tokens_seen": 757152, |
| "step": 2660 |
| }, |
| { |
| "epoch": 4.972014925373134, |
| "grad_norm": 13.200364112854004, |
| "learning_rate": 2.9597503501004343e-05, |
| "loss": 0.1927, |
| "num_input_tokens_seen": 758528, |
| "step": 2665 |
| }, |
| { |
| "epoch": 4.981343283582089, |
| "grad_norm": 4.190772533416748, |
| "learning_rate": 2.9517462359349553e-05, |
| "loss": 0.2796, |
| "num_input_tokens_seen": 759904, |
| "step": 2670 |
| }, |
| { |
| "epoch": 4.990671641791045, |
| "grad_norm": 3.4300618171691895, |
| "learning_rate": 2.943737331948136e-05, |
| "loss": 0.2621, |
| "num_input_tokens_seen": 761344, |
| "step": 2675 |
| }, |
| { |
| "epoch": 5.0, |
| "grad_norm": 11.973982810974121, |
| "learning_rate": 2.9357237230576045e-05, |
| "loss": 0.3979, |
| "num_input_tokens_seen": 762440, |
| "step": 2680 |
| }, |
| { |
| "epoch": 5.0, |
| "eval_loss": 0.6893951296806335, |
| "eval_runtime": 2.8462, |
| "eval_samples_per_second": 83.619, |
| "eval_steps_per_second": 21.081, |
| "num_input_tokens_seen": 762440, |
| "step": 2680 |
| }, |
| { |
| "epoch": 5.009328358208955, |
| "grad_norm": 3.1089532375335693, |
| "learning_rate": 2.927705494230875e-05, |
| "loss": 0.1512, |
| "num_input_tokens_seen": 763656, |
| "step": 2685 |
| }, |
| { |
| "epoch": 5.018656716417911, |
| "grad_norm": 4.243420124053955, |
| "learning_rate": 2.9196827304844483e-05, |
| "loss": 0.2745, |
| "num_input_tokens_seen": 764968, |
| "step": 2690 |
| }, |
| { |
| "epoch": 5.027985074626866, |
| "grad_norm": 3.036355495452881, |
| "learning_rate": 2.911655516882905e-05, |
| "loss": 0.1327, |
| "num_input_tokens_seen": 766440, |
| "step": 2695 |
| }, |
| { |
| "epoch": 5.037313432835821, |
| "grad_norm": 5.335038661956787, |
| "learning_rate": 2.9036239385380098e-05, |
| "loss": 0.15, |
| "num_input_tokens_seen": 767784, |
| "step": 2700 |
| }, |
| { |
| "epoch": 5.0466417910447765, |
| "grad_norm": 4.391361713409424, |
| "learning_rate": 2.8955880806078068e-05, |
| "loss": 0.2069, |
| "num_input_tokens_seen": 769128, |
| "step": 2705 |
| }, |
| { |
| "epoch": 5.0559701492537314, |
| "grad_norm": 6.702354431152344, |
| "learning_rate": 2.8875480282957133e-05, |
| "loss": 0.2142, |
| "num_input_tokens_seen": 770536, |
| "step": 2710 |
| }, |
| { |
| "epoch": 5.065298507462686, |
| "grad_norm": 9.222060203552246, |
| "learning_rate": 2.8795038668496222e-05, |
| "loss": 0.1254, |
| "num_input_tokens_seen": 772040, |
| "step": 2715 |
| }, |
| { |
| "epoch": 5.074626865671641, |
| "grad_norm": 6.100081443786621, |
| "learning_rate": 2.8714556815609926e-05, |
| "loss": 0.1724, |
| "num_input_tokens_seen": 773480, |
| "step": 2720 |
| }, |
| { |
| "epoch": 5.083955223880597, |
| "grad_norm": 2.687472343444824, |
| "learning_rate": 2.8634035577639505e-05, |
| "loss": 0.1623, |
| "num_input_tokens_seen": 774888, |
| "step": 2725 |
| }, |
| { |
| "epoch": 5.093283582089552, |
| "grad_norm": 6.644227504730225, |
| "learning_rate": 2.8553475808343798e-05, |
| "loss": 0.1983, |
| "num_input_tokens_seen": 776296, |
| "step": 2730 |
| }, |
| { |
| "epoch": 5.102611940298507, |
| "grad_norm": 3.4679696559906006, |
| "learning_rate": 2.847287836189019e-05, |
| "loss": 0.1264, |
| "num_input_tokens_seen": 777928, |
| "step": 2735 |
| }, |
| { |
| "epoch": 5.111940298507463, |
| "grad_norm": 3.4555349349975586, |
| "learning_rate": 2.839224409284556e-05, |
| "loss": 0.1328, |
| "num_input_tokens_seen": 779304, |
| "step": 2740 |
| }, |
| { |
| "epoch": 5.121268656716418, |
| "grad_norm": 4.143601417541504, |
| "learning_rate": 2.83115738561672e-05, |
| "loss": 0.2062, |
| "num_input_tokens_seen": 780456, |
| "step": 2745 |
| }, |
| { |
| "epoch": 5.130597014925373, |
| "grad_norm": 4.653928279876709, |
| "learning_rate": 2.8230868507193785e-05, |
| "loss": 0.1811, |
| "num_input_tokens_seen": 781800, |
| "step": 2750 |
| }, |
| { |
| "epoch": 5.139925373134329, |
| "grad_norm": 2.0559017658233643, |
| "learning_rate": 2.8150128901636262e-05, |
| "loss": 0.1975, |
| "num_input_tokens_seen": 783176, |
| "step": 2755 |
| }, |
| { |
| "epoch": 5.149253731343284, |
| "grad_norm": 3.160630226135254, |
| "learning_rate": 2.8069355895568805e-05, |
| "loss": 0.155, |
| "num_input_tokens_seen": 784488, |
| "step": 2760 |
| }, |
| { |
| "epoch": 5.1585820895522385, |
| "grad_norm": 2.787592887878418, |
| "learning_rate": 2.7988550345419733e-05, |
| "loss": 0.1237, |
| "num_input_tokens_seen": 785928, |
| "step": 2765 |
| }, |
| { |
| "epoch": 5.167910447761194, |
| "grad_norm": 2.561467409133911, |
| "learning_rate": 2.790771310796243e-05, |
| "loss": 0.1287, |
| "num_input_tokens_seen": 787368, |
| "step": 2770 |
| }, |
| { |
| "epoch": 5.177238805970149, |
| "grad_norm": 3.929704189300537, |
| "learning_rate": 2.7826845040306238e-05, |
| "loss": 0.1418, |
| "num_input_tokens_seen": 788744, |
| "step": 2775 |
| }, |
| { |
| "epoch": 5.186567164179104, |
| "grad_norm": 3.3693349361419678, |
| "learning_rate": 2.7745946999887406e-05, |
| "loss": 0.1844, |
| "num_input_tokens_seen": 790216, |
| "step": 2780 |
| }, |
| { |
| "epoch": 5.19589552238806, |
| "grad_norm": 2.9200971126556396, |
| "learning_rate": 2.766501984445999e-05, |
| "loss": 0.1688, |
| "num_input_tokens_seen": 791656, |
| "step": 2785 |
| }, |
| { |
| "epoch": 5.205223880597015, |
| "grad_norm": 4.284022331237793, |
| "learning_rate": 2.7584064432086716e-05, |
| "loss": 0.241, |
| "num_input_tokens_seen": 793288, |
| "step": 2790 |
| }, |
| { |
| "epoch": 5.21455223880597, |
| "grad_norm": 2.3091182708740234, |
| "learning_rate": 2.750308162112995e-05, |
| "loss": 0.1195, |
| "num_input_tokens_seen": 794984, |
| "step": 2795 |
| }, |
| { |
| "epoch": 5.223880597014926, |
| "grad_norm": 2.9606106281280518, |
| "learning_rate": 2.7422072270242548e-05, |
| "loss": 0.1857, |
| "num_input_tokens_seen": 796328, |
| "step": 2800 |
| }, |
| { |
| "epoch": 5.233208955223881, |
| "grad_norm": 6.744553565979004, |
| "learning_rate": 2.7341037238358774e-05, |
| "loss": 0.2755, |
| "num_input_tokens_seen": 797704, |
| "step": 2805 |
| }, |
| { |
| "epoch": 5.242537313432836, |
| "grad_norm": 4.244629859924316, |
| "learning_rate": 2.7259977384685163e-05, |
| "loss": 0.1671, |
| "num_input_tokens_seen": 798984, |
| "step": 2810 |
| }, |
| { |
| "epoch": 5.251865671641791, |
| "grad_norm": 3.4353435039520264, |
| "learning_rate": 2.717889356869146e-05, |
| "loss": 0.1936, |
| "num_input_tokens_seen": 800392, |
| "step": 2815 |
| }, |
| { |
| "epoch": 5.2611940298507465, |
| "grad_norm": 2.8996193408966064, |
| "learning_rate": 2.7097786650101458e-05, |
| "loss": 0.1628, |
| "num_input_tokens_seen": 801832, |
| "step": 2820 |
| }, |
| { |
| "epoch": 5.270522388059701, |
| "grad_norm": 2.4687931537628174, |
| "learning_rate": 2.7016657488883928e-05, |
| "loss": 0.2559, |
| "num_input_tokens_seen": 803272, |
| "step": 2825 |
| }, |
| { |
| "epoch": 5.279850746268656, |
| "grad_norm": 7.177276134490967, |
| "learning_rate": 2.693550694524346e-05, |
| "loss": 0.2615, |
| "num_input_tokens_seen": 804872, |
| "step": 2830 |
| }, |
| { |
| "epoch": 5.289179104477612, |
| "grad_norm": 3.9573192596435547, |
| "learning_rate": 2.685433587961136e-05, |
| "loss": 0.1751, |
| "num_input_tokens_seen": 806312, |
| "step": 2835 |
| }, |
| { |
| "epoch": 5.298507462686567, |
| "grad_norm": 0.8054132461547852, |
| "learning_rate": 2.6773145152636536e-05, |
| "loss": 0.1009, |
| "num_input_tokens_seen": 807944, |
| "step": 2840 |
| }, |
| { |
| "epoch": 5.307835820895522, |
| "grad_norm": 6.455924987792969, |
| "learning_rate": 2.6691935625176357e-05, |
| "loss": 0.1956, |
| "num_input_tokens_seen": 809320, |
| "step": 2845 |
| }, |
| { |
| "epoch": 5.317164179104478, |
| "grad_norm": 9.160236358642578, |
| "learning_rate": 2.6610708158287535e-05, |
| "loss": 0.1495, |
| "num_input_tokens_seen": 810952, |
| "step": 2850 |
| }, |
| { |
| "epoch": 5.326492537313433, |
| "grad_norm": 7.734012603759766, |
| "learning_rate": 2.6529463613216986e-05, |
| "loss": 0.1283, |
| "num_input_tokens_seen": 812264, |
| "step": 2855 |
| }, |
| { |
| "epoch": 5.335820895522388, |
| "grad_norm": 2.5043344497680664, |
| "learning_rate": 2.644820285139271e-05, |
| "loss": 0.2011, |
| "num_input_tokens_seen": 813704, |
| "step": 2860 |
| }, |
| { |
| "epoch": 5.345149253731344, |
| "grad_norm": 2.219484329223633, |
| "learning_rate": 2.636692673441465e-05, |
| "loss": 0.1501, |
| "num_input_tokens_seen": 815272, |
| "step": 2865 |
| }, |
| { |
| "epoch": 5.354477611940299, |
| "grad_norm": 4.445641040802002, |
| "learning_rate": 2.628563612404556e-05, |
| "loss": 0.1918, |
| "num_input_tokens_seen": 816904, |
| "step": 2870 |
| }, |
| { |
| "epoch": 5.3638059701492535, |
| "grad_norm": 4.634942531585693, |
| "learning_rate": 2.6204331882201864e-05, |
| "loss": 0.1762, |
| "num_input_tokens_seen": 818440, |
| "step": 2875 |
| }, |
| { |
| "epoch": 5.373134328358209, |
| "grad_norm": 1.1625159978866577, |
| "learning_rate": 2.6123014870944517e-05, |
| "loss": 0.2302, |
| "num_input_tokens_seen": 819752, |
| "step": 2880 |
| }, |
| { |
| "epoch": 5.382462686567164, |
| "grad_norm": 10.424229621887207, |
| "learning_rate": 2.6041685952469875e-05, |
| "loss": 0.3692, |
| "num_input_tokens_seen": 820936, |
| "step": 2885 |
| }, |
| { |
| "epoch": 5.391791044776119, |
| "grad_norm": 5.409058570861816, |
| "learning_rate": 2.596034598910053e-05, |
| "loss": 0.2003, |
| "num_input_tokens_seen": 822408, |
| "step": 2890 |
| }, |
| { |
| "epoch": 5.401119402985074, |
| "grad_norm": 5.473613739013672, |
| "learning_rate": 2.5878995843276204e-05, |
| "loss": 0.2755, |
| "num_input_tokens_seen": 823784, |
| "step": 2895 |
| }, |
| { |
| "epoch": 5.41044776119403, |
| "grad_norm": 7.402808666229248, |
| "learning_rate": 2.5797636377544564e-05, |
| "loss": 0.1741, |
| "num_input_tokens_seen": 825064, |
| "step": 2900 |
| }, |
| { |
| "epoch": 5.419776119402985, |
| "grad_norm": 8.776235580444336, |
| "learning_rate": 2.5716268454552095e-05, |
| "loss": 0.1554, |
| "num_input_tokens_seen": 826600, |
| "step": 2905 |
| }, |
| { |
| "epoch": 5.42910447761194, |
| "grad_norm": 2.3435590267181396, |
| "learning_rate": 2.563489293703496e-05, |
| "loss": 0.1413, |
| "num_input_tokens_seen": 828104, |
| "step": 2910 |
| }, |
| { |
| "epoch": 5.438432835820896, |
| "grad_norm": 4.115440368652344, |
| "learning_rate": 2.5553510687809855e-05, |
| "loss": 0.2164, |
| "num_input_tokens_seen": 829640, |
| "step": 2915 |
| }, |
| { |
| "epoch": 5.447761194029851, |
| "grad_norm": 5.793088436126709, |
| "learning_rate": 2.547212256976484e-05, |
| "loss": 0.1486, |
| "num_input_tokens_seen": 831112, |
| "step": 2920 |
| }, |
| { |
| "epoch": 5.457089552238806, |
| "grad_norm": 2.8201024532318115, |
| "learning_rate": 2.5390729445850198e-05, |
| "loss": 0.2202, |
| "num_input_tokens_seen": 832744, |
| "step": 2925 |
| }, |
| { |
| "epoch": 5.4664179104477615, |
| "grad_norm": 3.4384210109710693, |
| "learning_rate": 2.53093321790693e-05, |
| "loss": 0.1257, |
| "num_input_tokens_seen": 834248, |
| "step": 2930 |
| }, |
| { |
| "epoch": 5.475746268656716, |
| "grad_norm": 5.487016677856445, |
| "learning_rate": 2.5227931632469437e-05, |
| "loss": 0.2742, |
| "num_input_tokens_seen": 835752, |
| "step": 2935 |
| }, |
| { |
| "epoch": 5.485074626865671, |
| "grad_norm": 3.396284341812134, |
| "learning_rate": 2.5146528669132664e-05, |
| "loss": 0.2555, |
| "num_input_tokens_seen": 837064, |
| "step": 2940 |
| }, |
| { |
| "epoch": 5.494402985074627, |
| "grad_norm": 9.393867492675781, |
| "learning_rate": 2.506512415216669e-05, |
| "loss": 0.1644, |
| "num_input_tokens_seen": 838440, |
| "step": 2945 |
| }, |
| { |
| "epoch": 5.5, |
| "eval_loss": 0.7815956473350525, |
| "eval_runtime": 2.8388, |
| "eval_samples_per_second": 83.838, |
| "eval_steps_per_second": 21.136, |
| "num_input_tokens_seen": 839656, |
| "step": 2948 |
| }, |
| { |
| "epoch": 5.503731343283582, |
| "grad_norm": 5.140641689300537, |
| "learning_rate": 2.4983718944695672e-05, |
| "loss": 0.1517, |
| "num_input_tokens_seen": 840232, |
| "step": 2950 |
| }, |
| { |
| "epoch": 5.513059701492537, |
| "grad_norm": 5.742760181427002, |
| "learning_rate": 2.49023139098511e-05, |
| "loss": 0.1567, |
| "num_input_tokens_seen": 841576, |
| "step": 2955 |
| }, |
| { |
| "epoch": 5.522388059701493, |
| "grad_norm": 9.388687133789062, |
| "learning_rate": 2.4820909910762628e-05, |
| "loss": 0.1811, |
| "num_input_tokens_seen": 842984, |
| "step": 2960 |
| }, |
| { |
| "epoch": 5.531716417910448, |
| "grad_norm": 2.2369930744171143, |
| "learning_rate": 2.473950781054893e-05, |
| "loss": 0.1632, |
| "num_input_tokens_seen": 844232, |
| "step": 2965 |
| }, |
| { |
| "epoch": 5.541044776119403, |
| "grad_norm": 6.7049150466918945, |
| "learning_rate": 2.4658108472308545e-05, |
| "loss": 0.2983, |
| "num_input_tokens_seen": 845576, |
| "step": 2970 |
| }, |
| { |
| "epoch": 5.550373134328359, |
| "grad_norm": 3.308563232421875, |
| "learning_rate": 2.4576712759110728e-05, |
| "loss": 0.2053, |
| "num_input_tokens_seen": 846984, |
| "step": 2975 |
| }, |
| { |
| "epoch": 5.559701492537314, |
| "grad_norm": 11.623724937438965, |
| "learning_rate": 2.4495321533986297e-05, |
| "loss": 0.2027, |
| "num_input_tokens_seen": 848328, |
| "step": 2980 |
| }, |
| { |
| "epoch": 5.5690298507462686, |
| "grad_norm": 5.230249881744385, |
| "learning_rate": 2.441393565991849e-05, |
| "loss": 0.1416, |
| "num_input_tokens_seen": 849800, |
| "step": 2985 |
| }, |
| { |
| "epoch": 5.5783582089552235, |
| "grad_norm": 4.813288688659668, |
| "learning_rate": 2.43325559998338e-05, |
| "loss": 0.1608, |
| "num_input_tokens_seen": 851176, |
| "step": 2990 |
| }, |
| { |
| "epoch": 5.587686567164179, |
| "grad_norm": 1.506446361541748, |
| "learning_rate": 2.425118341659284e-05, |
| "loss": 0.1256, |
| "num_input_tokens_seen": 852776, |
| "step": 2995 |
| }, |
| { |
| "epoch": 5.597014925373134, |
| "grad_norm": 1.2224841117858887, |
| "learning_rate": 2.416981877298118e-05, |
| "loss": 0.1861, |
| "num_input_tokens_seen": 854184, |
| "step": 3000 |
| }, |
| { |
| "epoch": 5.606343283582089, |
| "grad_norm": 4.181064605712891, |
| "learning_rate": 2.4088462931700214e-05, |
| "loss": 0.2261, |
| "num_input_tokens_seen": 855496, |
| "step": 3005 |
| }, |
| { |
| "epoch": 5.615671641791045, |
| "grad_norm": 4.503936767578125, |
| "learning_rate": 2.4007116755357995e-05, |
| "loss": 0.1758, |
| "num_input_tokens_seen": 856840, |
| "step": 3010 |
| }, |
| { |
| "epoch": 5.625, |
| "grad_norm": 7.320756435394287, |
| "learning_rate": 2.392578110646012e-05, |
| "loss": 0.1809, |
| "num_input_tokens_seen": 858184, |
| "step": 3015 |
| }, |
| { |
| "epoch": 5.634328358208955, |
| "grad_norm": 6.467454433441162, |
| "learning_rate": 2.384445684740055e-05, |
| "loss": 0.2259, |
| "num_input_tokens_seen": 859624, |
| "step": 3020 |
| }, |
| { |
| "epoch": 5.643656716417911, |
| "grad_norm": 2.777430295944214, |
| "learning_rate": 2.376314484045248e-05, |
| "loss": 0.2234, |
| "num_input_tokens_seen": 860968, |
| "step": 3025 |
| }, |
| { |
| "epoch": 5.652985074626866, |
| "grad_norm": 7.616672039031982, |
| "learning_rate": 2.3681845947759206e-05, |
| "loss": 0.2319, |
| "num_input_tokens_seen": 862472, |
| "step": 3030 |
| }, |
| { |
| "epoch": 5.662313432835821, |
| "grad_norm": 9.492351531982422, |
| "learning_rate": 2.3600561031324958e-05, |
| "loss": 0.1348, |
| "num_input_tokens_seen": 863880, |
| "step": 3035 |
| }, |
| { |
| "epoch": 5.6716417910447765, |
| "grad_norm": 4.998366832733154, |
| "learning_rate": 2.3519290953005784e-05, |
| "loss": 0.1646, |
| "num_input_tokens_seen": 865288, |
| "step": 3040 |
| }, |
| { |
| "epoch": 5.6809701492537314, |
| "grad_norm": 0.7700138688087463, |
| "learning_rate": 2.3438036574500432e-05, |
| "loss": 0.0566, |
| "num_input_tokens_seen": 866824, |
| "step": 3045 |
| }, |
| { |
| "epoch": 5.690298507462686, |
| "grad_norm": 2.6870310306549072, |
| "learning_rate": 2.3356798757341155e-05, |
| "loss": 0.1444, |
| "num_input_tokens_seen": 868104, |
| "step": 3050 |
| }, |
| { |
| "epoch": 5.699626865671641, |
| "grad_norm": 7.493394374847412, |
| "learning_rate": 2.327557836288461e-05, |
| "loss": 0.1025, |
| "num_input_tokens_seen": 869512, |
| "step": 3055 |
| }, |
| { |
| "epoch": 5.708955223880597, |
| "grad_norm": 14.742531776428223, |
| "learning_rate": 2.3194376252302723e-05, |
| "loss": 0.1756, |
| "num_input_tokens_seen": 871048, |
| "step": 3060 |
| }, |
| { |
| "epoch": 5.718283582089552, |
| "grad_norm": 5.719645977020264, |
| "learning_rate": 2.311319328657358e-05, |
| "loss": 0.2248, |
| "num_input_tokens_seen": 872424, |
| "step": 3065 |
| }, |
| { |
| "epoch": 5.727611940298507, |
| "grad_norm": 4.379673480987549, |
| "learning_rate": 2.3032030326472235e-05, |
| "loss": 0.1617, |
| "num_input_tokens_seen": 873992, |
| "step": 3070 |
| }, |
| { |
| "epoch": 5.736940298507463, |
| "grad_norm": 8.308077812194824, |
| "learning_rate": 2.2950888232561672e-05, |
| "loss": 0.1124, |
| "num_input_tokens_seen": 875368, |
| "step": 3075 |
| }, |
| { |
| "epoch": 5.746268656716418, |
| "grad_norm": 5.258336067199707, |
| "learning_rate": 2.2869767865183612e-05, |
| "loss": 0.1254, |
| "num_input_tokens_seen": 876808, |
| "step": 3080 |
| }, |
| { |
| "epoch": 5.755597014925373, |
| "grad_norm": 6.911777019500732, |
| "learning_rate": 2.2788670084449394e-05, |
| "loss": 0.2631, |
| "num_input_tokens_seen": 878152, |
| "step": 3085 |
| }, |
| { |
| "epoch": 5.764925373134329, |
| "grad_norm": 2.7381227016448975, |
| "learning_rate": 2.2707595750230894e-05, |
| "loss": 0.1523, |
| "num_input_tokens_seen": 879496, |
| "step": 3090 |
| }, |
| { |
| "epoch": 5.774253731343284, |
| "grad_norm": 8.662060737609863, |
| "learning_rate": 2.2626545722151384e-05, |
| "loss": 0.313, |
| "num_input_tokens_seen": 880904, |
| "step": 3095 |
| }, |
| { |
| "epoch": 5.7835820895522385, |
| "grad_norm": 8.383820533752441, |
| "learning_rate": 2.254552085957641e-05, |
| "loss": 0.2706, |
| "num_input_tokens_seen": 882152, |
| "step": 3100 |
| }, |
| { |
| "epoch": 5.792910447761194, |
| "grad_norm": 6.075637340545654, |
| "learning_rate": 2.246452202160471e-05, |
| "loss": 0.2054, |
| "num_input_tokens_seen": 883400, |
| "step": 3105 |
| }, |
| { |
| "epoch": 5.802238805970149, |
| "grad_norm": 2.7444355487823486, |
| "learning_rate": 2.2383550067059063e-05, |
| "loss": 0.1254, |
| "num_input_tokens_seen": 884968, |
| "step": 3110 |
| }, |
| { |
| "epoch": 5.811567164179104, |
| "grad_norm": 6.638121128082275, |
| "learning_rate": 2.2302605854477228e-05, |
| "loss": 0.1277, |
| "num_input_tokens_seen": 886312, |
| "step": 3115 |
| }, |
| { |
| "epoch": 5.82089552238806, |
| "grad_norm": 1.8276153802871704, |
| "learning_rate": 2.22216902421028e-05, |
| "loss": 0.1038, |
| "num_input_tokens_seen": 887752, |
| "step": 3120 |
| }, |
| { |
| "epoch": 5.830223880597015, |
| "grad_norm": 7.056869029998779, |
| "learning_rate": 2.2140804087876132e-05, |
| "loss": 0.2267, |
| "num_input_tokens_seen": 889096, |
| "step": 3125 |
| }, |
| { |
| "epoch": 5.83955223880597, |
| "grad_norm": 4.616489887237549, |
| "learning_rate": 2.2059948249425243e-05, |
| "loss": 0.1902, |
| "num_input_tokens_seen": 890408, |
| "step": 3130 |
| }, |
| { |
| "epoch": 5.848880597014926, |
| "grad_norm": 5.256584644317627, |
| "learning_rate": 2.197912358405672e-05, |
| "loss": 0.1469, |
| "num_input_tokens_seen": 892072, |
| "step": 3135 |
| }, |
| { |
| "epoch": 5.858208955223881, |
| "grad_norm": 3.31722354888916, |
| "learning_rate": 2.189833094874661e-05, |
| "loss": 0.1961, |
| "num_input_tokens_seen": 893608, |
| "step": 3140 |
| }, |
| { |
| "epoch": 5.867537313432836, |
| "grad_norm": 4.113776683807373, |
| "learning_rate": 2.1817571200131358e-05, |
| "loss": 0.2628, |
| "num_input_tokens_seen": 894920, |
| "step": 3145 |
| }, |
| { |
| "epoch": 5.8768656716417915, |
| "grad_norm": 5.336353778839111, |
| "learning_rate": 2.173684519449872e-05, |
| "loss": 0.1827, |
| "num_input_tokens_seen": 896328, |
| "step": 3150 |
| }, |
| { |
| "epoch": 5.8861940298507465, |
| "grad_norm": 0.9053072333335876, |
| "learning_rate": 2.1656153787778645e-05, |
| "loss": 0.1567, |
| "num_input_tokens_seen": 897896, |
| "step": 3155 |
| }, |
| { |
| "epoch": 5.895522388059701, |
| "grad_norm": 4.398068428039551, |
| "learning_rate": 2.157549783553429e-05, |
| "loss": 0.1489, |
| "num_input_tokens_seen": 899272, |
| "step": 3160 |
| }, |
| { |
| "epoch": 5.904850746268656, |
| "grad_norm": 2.8485960960388184, |
| "learning_rate": 2.1494878192952855e-05, |
| "loss": 0.1038, |
| "num_input_tokens_seen": 900584, |
| "step": 3165 |
| }, |
| { |
| "epoch": 5.914179104477612, |
| "grad_norm": 1.8162635564804077, |
| "learning_rate": 2.141429571483655e-05, |
| "loss": 0.0956, |
| "num_input_tokens_seen": 901992, |
| "step": 3170 |
| }, |
| { |
| "epoch": 5.923507462686567, |
| "grad_norm": 7.570062637329102, |
| "learning_rate": 2.1333751255593552e-05, |
| "loss": 0.2342, |
| "num_input_tokens_seen": 903304, |
| "step": 3175 |
| }, |
| { |
| "epoch": 5.932835820895522, |
| "grad_norm": 9.095661163330078, |
| "learning_rate": 2.1253245669228905e-05, |
| "loss": 0.1784, |
| "num_input_tokens_seen": 904712, |
| "step": 3180 |
| }, |
| { |
| "epoch": 5.942164179104478, |
| "grad_norm": 3.692988872528076, |
| "learning_rate": 2.1172779809335496e-05, |
| "loss": 0.165, |
| "num_input_tokens_seen": 906120, |
| "step": 3185 |
| }, |
| { |
| "epoch": 5.951492537313433, |
| "grad_norm": 3.7332866191864014, |
| "learning_rate": 2.109235452908502e-05, |
| "loss": 0.1954, |
| "num_input_tokens_seen": 907464, |
| "step": 3190 |
| }, |
| { |
| "epoch": 5.960820895522388, |
| "grad_norm": 6.348042964935303, |
| "learning_rate": 2.101197068121888e-05, |
| "loss": 0.1819, |
| "num_input_tokens_seen": 909000, |
| "step": 3195 |
| }, |
| { |
| "epoch": 5.970149253731344, |
| "grad_norm": 5.93782901763916, |
| "learning_rate": 2.0931629118039185e-05, |
| "loss": 0.1695, |
| "num_input_tokens_seen": 910280, |
| "step": 3200 |
| }, |
| { |
| "epoch": 5.979477611940299, |
| "grad_norm": 6.746236324310303, |
| "learning_rate": 2.0851330691399713e-05, |
| "loss": 0.2628, |
| "num_input_tokens_seen": 911816, |
| "step": 3205 |
| }, |
| { |
| "epoch": 5.9888059701492535, |
| "grad_norm": 5.286591053009033, |
| "learning_rate": 2.0771076252696846e-05, |
| "loss": 0.2808, |
| "num_input_tokens_seen": 913320, |
| "step": 3210 |
| }, |
| { |
| "epoch": 5.9981343283582085, |
| "grad_norm": 7.335351943969727, |
| "learning_rate": 2.0690866652860584e-05, |
| "loss": 0.2128, |
| "num_input_tokens_seen": 914856, |
| "step": 3215 |
| }, |
| { |
| "epoch": 6.0, |
| "eval_loss": 0.7775281071662903, |
| "eval_runtime": 2.8424, |
| "eval_samples_per_second": 83.733, |
| "eval_steps_per_second": 21.109, |
| "num_input_tokens_seen": 914920, |
| "step": 3216 |
| }, |
| { |
| "epoch": 6.007462686567164, |
| "grad_norm": 2.354477882385254, |
| "learning_rate": 2.0610702742345517e-05, |
| "loss": 0.1133, |
| "num_input_tokens_seen": 916136, |
| "step": 3220 |
| }, |
| { |
| "epoch": 6.016791044776119, |
| "grad_norm": 0.44997233152389526, |
| "learning_rate": 2.053058537112177e-05, |
| "loss": 0.0791, |
| "num_input_tokens_seen": 917544, |
| "step": 3225 |
| }, |
| { |
| "epoch": 6.026119402985074, |
| "grad_norm": 1.4087895154953003, |
| "learning_rate": 2.0450515388666022e-05, |
| "loss": 0.0836, |
| "num_input_tokens_seen": 919080, |
| "step": 3230 |
| }, |
| { |
| "epoch": 6.03544776119403, |
| "grad_norm": 5.186655521392822, |
| "learning_rate": 2.0370493643952507e-05, |
| "loss": 0.0919, |
| "num_input_tokens_seen": 920840, |
| "step": 3235 |
| }, |
| { |
| "epoch": 6.044776119402985, |
| "grad_norm": 0.6414034962654114, |
| "learning_rate": 2.0290520985443985e-05, |
| "loss": 0.0641, |
| "num_input_tokens_seen": 922248, |
| "step": 3240 |
| }, |
| { |
| "epoch": 6.05410447761194, |
| "grad_norm": 1.245631456375122, |
| "learning_rate": 2.0210598261082765e-05, |
| "loss": 0.1351, |
| "num_input_tokens_seen": 923912, |
| "step": 3245 |
| }, |
| { |
| "epoch": 6.063432835820896, |
| "grad_norm": 5.61036491394043, |
| "learning_rate": 2.013072631828171e-05, |
| "loss": 0.1321, |
| "num_input_tokens_seen": 925352, |
| "step": 3250 |
| }, |
| { |
| "epoch": 6.072761194029851, |
| "grad_norm": 5.4239277839660645, |
| "learning_rate": 2.005090600391526e-05, |
| "loss": 0.0974, |
| "num_input_tokens_seen": 926856, |
| "step": 3255 |
| }, |
| { |
| "epoch": 6.082089552238806, |
| "grad_norm": 5.463709354400635, |
| "learning_rate": 1.9971138164310422e-05, |
| "loss": 0.272, |
| "num_input_tokens_seen": 928168, |
| "step": 3260 |
| }, |
| { |
| "epoch": 6.0914179104477615, |
| "grad_norm": 3.6790611743927, |
| "learning_rate": 1.9891423645237834e-05, |
| "loss": 0.199, |
| "num_input_tokens_seen": 929544, |
| "step": 3265 |
| }, |
| { |
| "epoch": 6.100746268656716, |
| "grad_norm": 3.5772032737731934, |
| "learning_rate": 1.9811763291902774e-05, |
| "loss": 0.1227, |
| "num_input_tokens_seen": 930792, |
| "step": 3270 |
| }, |
| { |
| "epoch": 6.110074626865671, |
| "grad_norm": 5.671499252319336, |
| "learning_rate": 1.9732157948936204e-05, |
| "loss": 0.1362, |
| "num_input_tokens_seen": 932072, |
| "step": 3275 |
| }, |
| { |
| "epoch": 6.119402985074627, |
| "grad_norm": 5.0299391746521, |
| "learning_rate": 1.9652608460385843e-05, |
| "loss": 0.1832, |
| "num_input_tokens_seen": 933384, |
| "step": 3280 |
| }, |
| { |
| "epoch": 6.128731343283582, |
| "grad_norm": 3.0618503093719482, |
| "learning_rate": 1.957311566970716e-05, |
| "loss": 0.0931, |
| "num_input_tokens_seen": 934888, |
| "step": 3285 |
| }, |
| { |
| "epoch": 6.138059701492537, |
| "grad_norm": 1.4679239988327026, |
| "learning_rate": 1.949368041975447e-05, |
| "loss": 0.0627, |
| "num_input_tokens_seen": 936328, |
| "step": 3290 |
| }, |
| { |
| "epoch": 6.147388059701493, |
| "grad_norm": 3.217824935913086, |
| "learning_rate": 1.9414303552771997e-05, |
| "loss": 0.1307, |
| "num_input_tokens_seen": 937864, |
| "step": 3295 |
| }, |
| { |
| "epoch": 6.156716417910448, |
| "grad_norm": 7.090113639831543, |
| "learning_rate": 1.9334985910384923e-05, |
| "loss": 0.096, |
| "num_input_tokens_seen": 939464, |
| "step": 3300 |
| }, |
| { |
| "epoch": 6.166044776119403, |
| "grad_norm": 1.2847950458526611, |
| "learning_rate": 1.9255728333590475e-05, |
| "loss": 0.0558, |
| "num_input_tokens_seen": 940968, |
| "step": 3305 |
| }, |
| { |
| "epoch": 6.175373134328359, |
| "grad_norm": 1.8112127780914307, |
| "learning_rate": 1.9176531662749044e-05, |
| "loss": 0.1536, |
| "num_input_tokens_seen": 942152, |
| "step": 3310 |
| }, |
| { |
| "epoch": 6.184701492537314, |
| "grad_norm": 3.3246536254882812, |
| "learning_rate": 1.9097396737575206e-05, |
| "loss": 0.098, |
| "num_input_tokens_seen": 943720, |
| "step": 3315 |
| }, |
| { |
| "epoch": 6.1940298507462686, |
| "grad_norm": 0.9060310125350952, |
| "learning_rate": 1.9018324397128866e-05, |
| "loss": 0.1522, |
| "num_input_tokens_seen": 944904, |
| "step": 3320 |
| }, |
| { |
| "epoch": 6.2033582089552235, |
| "grad_norm": 3.79464054107666, |
| "learning_rate": 1.893931547980635e-05, |
| "loss": 0.0886, |
| "num_input_tokens_seen": 946344, |
| "step": 3325 |
| }, |
| { |
| "epoch": 6.212686567164179, |
| "grad_norm": 1.9358528852462769, |
| "learning_rate": 1.8860370823331513e-05, |
| "loss": 0.1089, |
| "num_input_tokens_seen": 947816, |
| "step": 3330 |
| }, |
| { |
| "epoch": 6.222014925373134, |
| "grad_norm": 10.025459289550781, |
| "learning_rate": 1.878149126474685e-05, |
| "loss": 0.1417, |
| "num_input_tokens_seen": 949320, |
| "step": 3335 |
| }, |
| { |
| "epoch": 6.231343283582089, |
| "grad_norm": 7.90490198135376, |
| "learning_rate": 1.870267764040465e-05, |
| "loss": 0.0677, |
| "num_input_tokens_seen": 950664, |
| "step": 3340 |
| }, |
| { |
| "epoch": 6.240671641791045, |
| "grad_norm": 5.662836074829102, |
| "learning_rate": 1.8623930785958092e-05, |
| "loss": 0.1749, |
| "num_input_tokens_seen": 952328, |
| "step": 3345 |
| }, |
| { |
| "epoch": 6.25, |
| "grad_norm": 6.037897109985352, |
| "learning_rate": 1.8545251536352402e-05, |
| "loss": 0.1317, |
| "num_input_tokens_seen": 953736, |
| "step": 3350 |
| }, |
| { |
| "epoch": 6.259328358208955, |
| "grad_norm": 1.9377055168151855, |
| "learning_rate": 1.8466640725816e-05, |
| "loss": 0.0369, |
| "num_input_tokens_seen": 955272, |
| "step": 3355 |
| }, |
| { |
| "epoch": 6.268656716417911, |
| "grad_norm": 7.1008830070495605, |
| "learning_rate": 1.838809918785166e-05, |
| "loss": 0.067, |
| "num_input_tokens_seen": 956808, |
| "step": 3360 |
| }, |
| { |
| "epoch": 6.277985074626866, |
| "grad_norm": 5.102013111114502, |
| "learning_rate": 1.8309627755227643e-05, |
| "loss": 0.2614, |
| "num_input_tokens_seen": 958152, |
| "step": 3365 |
| }, |
| { |
| "epoch": 6.287313432835821, |
| "grad_norm": 8.426166534423828, |
| "learning_rate": 1.8231227259968926e-05, |
| "loss": 0.1055, |
| "num_input_tokens_seen": 959560, |
| "step": 3370 |
| }, |
| { |
| "epoch": 6.2966417910447765, |
| "grad_norm": 3.8010165691375732, |
| "learning_rate": 1.8152898533348317e-05, |
| "loss": 0.1154, |
| "num_input_tokens_seen": 960776, |
| "step": 3375 |
| }, |
| { |
| "epoch": 6.3059701492537314, |
| "grad_norm": 3.010805606842041, |
| "learning_rate": 1.8074642405877673e-05, |
| "loss": 0.0929, |
| "num_input_tokens_seen": 962280, |
| "step": 3380 |
| }, |
| { |
| "epoch": 6.315298507462686, |
| "grad_norm": 3.829035520553589, |
| "learning_rate": 1.799645970729909e-05, |
| "loss": 0.0522, |
| "num_input_tokens_seen": 963752, |
| "step": 3385 |
| }, |
| { |
| "epoch": 6.324626865671641, |
| "grad_norm": 1.9314767122268677, |
| "learning_rate": 1.7918351266576112e-05, |
| "loss": 0.0582, |
| "num_input_tokens_seen": 965096, |
| "step": 3390 |
| }, |
| { |
| "epoch": 6.333955223880597, |
| "grad_norm": 2.5984270572662354, |
| "learning_rate": 1.784031791188492e-05, |
| "loss": 0.1246, |
| "num_input_tokens_seen": 966600, |
| "step": 3395 |
| }, |
| { |
| "epoch": 6.343283582089552, |
| "grad_norm": 4.988873481750488, |
| "learning_rate": 1.776236047060558e-05, |
| "loss": 0.1138, |
| "num_input_tokens_seen": 967976, |
| "step": 3400 |
| }, |
| { |
| "epoch": 6.352611940298507, |
| "grad_norm": 8.279336929321289, |
| "learning_rate": 1.768447976931326e-05, |
| "loss": 0.2019, |
| "num_input_tokens_seen": 969256, |
| "step": 3405 |
| }, |
| { |
| "epoch": 6.361940298507463, |
| "grad_norm": 1.1861586570739746, |
| "learning_rate": 1.760667663376943e-05, |
| "loss": 0.1032, |
| "num_input_tokens_seen": 970696, |
| "step": 3410 |
| }, |
| { |
| "epoch": 6.371268656716418, |
| "grad_norm": 2.52764892578125, |
| "learning_rate": 1.7528951888913177e-05, |
| "loss": 0.1149, |
| "num_input_tokens_seen": 972072, |
| "step": 3415 |
| }, |
| { |
| "epoch": 6.380597014925373, |
| "grad_norm": 4.8980841636657715, |
| "learning_rate": 1.7451306358852386e-05, |
| "loss": 0.0872, |
| "num_input_tokens_seen": 973736, |
| "step": 3420 |
| }, |
| { |
| "epoch": 6.389925373134329, |
| "grad_norm": 8.90312385559082, |
| "learning_rate": 1.737374086685504e-05, |
| "loss": 0.1471, |
| "num_input_tokens_seen": 975144, |
| "step": 3425 |
| }, |
| { |
| "epoch": 6.399253731343284, |
| "grad_norm": 0.9255650043487549, |
| "learning_rate": 1.729625623534053e-05, |
| "loss": 0.1376, |
| "num_input_tokens_seen": 976712, |
| "step": 3430 |
| }, |
| { |
| "epoch": 6.4085820895522385, |
| "grad_norm": 10.34676456451416, |
| "learning_rate": 1.721885328587083e-05, |
| "loss": 0.1997, |
| "num_input_tokens_seen": 978088, |
| "step": 3435 |
| }, |
| { |
| "epoch": 6.417910447761194, |
| "grad_norm": 2.8841071128845215, |
| "learning_rate": 1.714153283914189e-05, |
| "loss": 0.0611, |
| "num_input_tokens_seen": 979560, |
| "step": 3440 |
| }, |
| { |
| "epoch": 6.427238805970149, |
| "grad_norm": 4.697388648986816, |
| "learning_rate": 1.706429571497486e-05, |
| "loss": 0.1438, |
| "num_input_tokens_seen": 981000, |
| "step": 3445 |
| }, |
| { |
| "epoch": 6.436567164179104, |
| "grad_norm": 6.702895164489746, |
| "learning_rate": 1.698714273230746e-05, |
| "loss": 0.1977, |
| "num_input_tokens_seen": 982568, |
| "step": 3450 |
| }, |
| { |
| "epoch": 6.44589552238806, |
| "grad_norm": 4.011035442352295, |
| "learning_rate": 1.6910074709185235e-05, |
| "loss": 0.1248, |
| "num_input_tokens_seen": 984040, |
| "step": 3455 |
| }, |
| { |
| "epoch": 6.455223880597015, |
| "grad_norm": 3.284123420715332, |
| "learning_rate": 1.6833092462752942e-05, |
| "loss": 0.0852, |
| "num_input_tokens_seen": 985384, |
| "step": 3460 |
| }, |
| { |
| "epoch": 6.46455223880597, |
| "grad_norm": 6.607419490814209, |
| "learning_rate": 1.6756196809245838e-05, |
| "loss": 0.0708, |
| "num_input_tokens_seen": 986696, |
| "step": 3465 |
| }, |
| { |
| "epoch": 6.473880597014926, |
| "grad_norm": 4.480660438537598, |
| "learning_rate": 1.667938856398105e-05, |
| "loss": 0.0761, |
| "num_input_tokens_seen": 988136, |
| "step": 3470 |
| }, |
| { |
| "epoch": 6.483208955223881, |
| "grad_norm": 0.6971399784088135, |
| "learning_rate": 1.6602668541348916e-05, |
| "loss": 0.1867, |
| "num_input_tokens_seen": 989608, |
| "step": 3475 |
| }, |
| { |
| "epoch": 6.492537313432836, |
| "grad_norm": 5.047956943511963, |
| "learning_rate": 1.6526037554804367e-05, |
| "loss": 0.0629, |
| "num_input_tokens_seen": 990984, |
| "step": 3480 |
| }, |
| { |
| "epoch": 6.5, |
| "eval_loss": 0.8907724618911743, |
| "eval_runtime": 2.8286, |
| "eval_samples_per_second": 84.14, |
| "eval_steps_per_second": 21.212, |
| "num_input_tokens_seen": 992104, |
| "step": 3484 |
| }, |
| { |
| "epoch": 6.5018656716417915, |
| "grad_norm": 8.58009147644043, |
| "learning_rate": 1.6449496416858284e-05, |
| "loss": 0.1048, |
| "num_input_tokens_seen": 992360, |
| "step": 3485 |
| }, |
| { |
| "epoch": 6.5111940298507465, |
| "grad_norm": 4.399396896362305, |
| "learning_rate": 1.637304593906891e-05, |
| "loss": 0.2073, |
| "num_input_tokens_seen": 993864, |
| "step": 3490 |
| }, |
| { |
| "epoch": 6.520522388059701, |
| "grad_norm": 2.828930377960205, |
| "learning_rate": 1.6296686932033203e-05, |
| "loss": 0.1103, |
| "num_input_tokens_seen": 995176, |
| "step": 3495 |
| }, |
| { |
| "epoch": 6.529850746268656, |
| "grad_norm": 2.121316909790039, |
| "learning_rate": 1.6220420205378282e-05, |
| "loss": 0.1479, |
| "num_input_tokens_seen": 996680, |
| "step": 3500 |
| }, |
| { |
| "epoch": 6.539179104477612, |
| "grad_norm": 3.9069769382476807, |
| "learning_rate": 1.614424656775283e-05, |
| "loss": 0.1313, |
| "num_input_tokens_seen": 998024, |
| "step": 3505 |
| }, |
| { |
| "epoch": 6.548507462686567, |
| "grad_norm": 8.533858299255371, |
| "learning_rate": 1.6068166826818492e-05, |
| "loss": 0.1362, |
| "num_input_tokens_seen": 999432, |
| "step": 3510 |
| }, |
| { |
| "epoch": 6.557835820895522, |
| "grad_norm": 12.1807222366333, |
| "learning_rate": 1.5992181789241353e-05, |
| "loss": 0.1531, |
| "num_input_tokens_seen": 1000712, |
| "step": 3515 |
| }, |
| { |
| "epoch": 6.567164179104478, |
| "grad_norm": 8.048212051391602, |
| "learning_rate": 1.591629226068338e-05, |
| "loss": 0.1318, |
| "num_input_tokens_seen": 1002056, |
| "step": 3520 |
| }, |
| { |
| "epoch": 6.576492537313433, |
| "grad_norm": 7.229369163513184, |
| "learning_rate": 1.5840499045793843e-05, |
| "loss": 0.1279, |
| "num_input_tokens_seen": 1003464, |
| "step": 3525 |
| }, |
| { |
| "epoch": 6.585820895522388, |
| "grad_norm": 5.163874626159668, |
| "learning_rate": 1.5764802948200825e-05, |
| "loss": 0.0779, |
| "num_input_tokens_seen": 1004808, |
| "step": 3530 |
| }, |
| { |
| "epoch": 6.595149253731344, |
| "grad_norm": 0.4225695729255676, |
| "learning_rate": 1.5689204770502666e-05, |
| "loss": 0.0755, |
| "num_input_tokens_seen": 1006120, |
| "step": 3535 |
| }, |
| { |
| "epoch": 6.604477611940299, |
| "grad_norm": 3.0213398933410645, |
| "learning_rate": 1.5613705314259492e-05, |
| "loss": 0.1819, |
| "num_input_tokens_seen": 1007624, |
| "step": 3540 |
| }, |
| { |
| "epoch": 6.6138059701492535, |
| "grad_norm": 5.0767412185668945, |
| "learning_rate": 1.553830537998466e-05, |
| "loss": 0.2494, |
| "num_input_tokens_seen": 1009000, |
| "step": 3545 |
| }, |
| { |
| "epoch": 6.6231343283582085, |
| "grad_norm": 1.984185814857483, |
| "learning_rate": 1.5463005767136356e-05, |
| "loss": 0.1003, |
| "num_input_tokens_seen": 1010376, |
| "step": 3550 |
| }, |
| { |
| "epoch": 6.632462686567164, |
| "grad_norm": 5.121238708496094, |
| "learning_rate": 1.5387807274109038e-05, |
| "loss": 0.0881, |
| "num_input_tokens_seen": 1011816, |
| "step": 3555 |
| }, |
| { |
| "epoch": 6.641791044776119, |
| "grad_norm": 4.423469543457031, |
| "learning_rate": 1.531271069822499e-05, |
| "loss": 0.1076, |
| "num_input_tokens_seen": 1013416, |
| "step": 3560 |
| }, |
| { |
| "epoch": 6.651119402985074, |
| "grad_norm": 4.872020244598389, |
| "learning_rate": 1.5237716835725906e-05, |
| "loss": 0.0858, |
| "num_input_tokens_seen": 1014728, |
| "step": 3565 |
| }, |
| { |
| "epoch": 6.66044776119403, |
| "grad_norm": 4.624637126922607, |
| "learning_rate": 1.5162826481764398e-05, |
| "loss": 0.1641, |
| "num_input_tokens_seen": 1016136, |
| "step": 3570 |
| }, |
| { |
| "epoch": 6.669776119402985, |
| "grad_norm": 2.2791385650634766, |
| "learning_rate": 1.5088040430395589e-05, |
| "loss": 0.114, |
| "num_input_tokens_seen": 1017416, |
| "step": 3575 |
| }, |
| { |
| "epoch": 6.67910447761194, |
| "grad_norm": 5.884322643280029, |
| "learning_rate": 1.5013359474568712e-05, |
| "loss": 0.0883, |
| "num_input_tokens_seen": 1019048, |
| "step": 3580 |
| }, |
| { |
| "epoch": 6.688432835820896, |
| "grad_norm": 2.3762447834014893, |
| "learning_rate": 1.493878440611866e-05, |
| "loss": 0.0628, |
| "num_input_tokens_seen": 1020552, |
| "step": 3585 |
| }, |
| { |
| "epoch": 6.697761194029851, |
| "grad_norm": 1.0577960014343262, |
| "learning_rate": 1.4864316015757623e-05, |
| "loss": 0.0718, |
| "num_input_tokens_seen": 1021896, |
| "step": 3590 |
| }, |
| { |
| "epoch": 6.707089552238806, |
| "grad_norm": 0.3185429275035858, |
| "learning_rate": 1.478995509306669e-05, |
| "loss": 0.109, |
| "num_input_tokens_seen": 1023240, |
| "step": 3595 |
| }, |
| { |
| "epoch": 6.7164179104477615, |
| "grad_norm": 7.734789848327637, |
| "learning_rate": 1.4715702426487482e-05, |
| "loss": 0.0972, |
| "num_input_tokens_seen": 1024712, |
| "step": 3600 |
| }, |
| { |
| "epoch": 6.725746268656716, |
| "grad_norm": 3.9840004444122314, |
| "learning_rate": 1.4641558803313781e-05, |
| "loss": 0.058, |
| "num_input_tokens_seen": 1026216, |
| "step": 3605 |
| }, |
| { |
| "epoch": 6.735074626865671, |
| "grad_norm": 2.4233551025390625, |
| "learning_rate": 1.4567525009683219e-05, |
| "loss": 0.1402, |
| "num_input_tokens_seen": 1027592, |
| "step": 3610 |
| }, |
| { |
| "epoch": 6.744402985074627, |
| "grad_norm": 3.210083246231079, |
| "learning_rate": 1.4493601830568887e-05, |
| "loss": 0.0479, |
| "num_input_tokens_seen": 1029064, |
| "step": 3615 |
| }, |
| { |
| "epoch": 6.753731343283582, |
| "grad_norm": 0.9049201011657715, |
| "learning_rate": 1.4419790049771068e-05, |
| "loss": 0.1016, |
| "num_input_tokens_seen": 1030632, |
| "step": 3620 |
| }, |
| { |
| "epoch": 6.763059701492537, |
| "grad_norm": 8.050421714782715, |
| "learning_rate": 1.434609044990886e-05, |
| "loss": 0.1442, |
| "num_input_tokens_seen": 1032072, |
| "step": 3625 |
| }, |
| { |
| "epoch": 6.772388059701493, |
| "grad_norm": 4.569393634796143, |
| "learning_rate": 1.4272503812411974e-05, |
| "loss": 0.1531, |
| "num_input_tokens_seen": 1033480, |
| "step": 3630 |
| }, |
| { |
| "epoch": 6.781716417910448, |
| "grad_norm": 5.774075984954834, |
| "learning_rate": 1.4199030917512329e-05, |
| "loss": 0.1576, |
| "num_input_tokens_seen": 1034792, |
| "step": 3635 |
| }, |
| { |
| "epoch": 6.791044776119403, |
| "grad_norm": 2.3532445430755615, |
| "learning_rate": 1.412567254423589e-05, |
| "loss": 0.1353, |
| "num_input_tokens_seen": 1036200, |
| "step": 3640 |
| }, |
| { |
| "epoch": 6.800373134328359, |
| "grad_norm": 3.6300628185272217, |
| "learning_rate": 1.4052429470394352e-05, |
| "loss": 0.1005, |
| "num_input_tokens_seen": 1037608, |
| "step": 3645 |
| }, |
| { |
| "epoch": 6.809701492537314, |
| "grad_norm": 8.160791397094727, |
| "learning_rate": 1.3979302472576868e-05, |
| "loss": 0.3113, |
| "num_input_tokens_seen": 1038952, |
| "step": 3650 |
| }, |
| { |
| "epoch": 6.8190298507462686, |
| "grad_norm": 2.6540005207061768, |
| "learning_rate": 1.3906292326141904e-05, |
| "loss": 0.1477, |
| "num_input_tokens_seen": 1040584, |
| "step": 3655 |
| }, |
| { |
| "epoch": 6.8283582089552235, |
| "grad_norm": 5.503115653991699, |
| "learning_rate": 1.3833399805208904e-05, |
| "loss": 0.1157, |
| "num_input_tokens_seen": 1041960, |
| "step": 3660 |
| }, |
| { |
| "epoch": 6.837686567164179, |
| "grad_norm": 5.932112216949463, |
| "learning_rate": 1.376062568265018e-05, |
| "loss": 0.0719, |
| "num_input_tokens_seen": 1043336, |
| "step": 3665 |
| }, |
| { |
| "epoch": 6.847014925373134, |
| "grad_norm": 4.92405891418457, |
| "learning_rate": 1.3687970730082678e-05, |
| "loss": 0.1961, |
| "num_input_tokens_seen": 1044712, |
| "step": 3670 |
| }, |
| { |
| "epoch": 6.856343283582089, |
| "grad_norm": 4.606871604919434, |
| "learning_rate": 1.361543571785976e-05, |
| "loss": 0.0742, |
| "num_input_tokens_seen": 1046120, |
| "step": 3675 |
| }, |
| { |
| "epoch": 6.865671641791045, |
| "grad_norm": 4.565592288970947, |
| "learning_rate": 1.3543021415063117e-05, |
| "loss": 0.1448, |
| "num_input_tokens_seen": 1047592, |
| "step": 3680 |
| }, |
| { |
| "epoch": 6.875, |
| "grad_norm": 8.16478443145752, |
| "learning_rate": 1.347072858949453e-05, |
| "loss": 0.1418, |
| "num_input_tokens_seen": 1048936, |
| "step": 3685 |
| }, |
| { |
| "epoch": 6.884328358208955, |
| "grad_norm": 3.465632200241089, |
| "learning_rate": 1.3398558007667806e-05, |
| "loss": 0.106, |
| "num_input_tokens_seen": 1050376, |
| "step": 3690 |
| }, |
| { |
| "epoch": 6.893656716417911, |
| "grad_norm": 4.966838836669922, |
| "learning_rate": 1.3326510434800566e-05, |
| "loss": 0.1917, |
| "num_input_tokens_seen": 1051816, |
| "step": 3695 |
| }, |
| { |
| "epoch": 6.902985074626866, |
| "grad_norm": 5.525684356689453, |
| "learning_rate": 1.3254586634806249e-05, |
| "loss": 0.1972, |
| "num_input_tokens_seen": 1053128, |
| "step": 3700 |
| }, |
| { |
| "epoch": 6.912313432835821, |
| "grad_norm": 3.0775158405303955, |
| "learning_rate": 1.3182787370285865e-05, |
| "loss": 0.1311, |
| "num_input_tokens_seen": 1054440, |
| "step": 3705 |
| }, |
| { |
| "epoch": 6.9216417910447765, |
| "grad_norm": 4.294163227081299, |
| "learning_rate": 1.3111113402520053e-05, |
| "loss": 0.2328, |
| "num_input_tokens_seen": 1055816, |
| "step": 3710 |
| }, |
| { |
| "epoch": 6.9309701492537314, |
| "grad_norm": 1.8091331720352173, |
| "learning_rate": 1.3039565491460882e-05, |
| "loss": 0.0862, |
| "num_input_tokens_seen": 1057288, |
| "step": 3715 |
| }, |
| { |
| "epoch": 6.940298507462686, |
| "grad_norm": 2.1181790828704834, |
| "learning_rate": 1.2968144395723914e-05, |
| "loss": 0.1018, |
| "num_input_tokens_seen": 1058888, |
| "step": 3720 |
| }, |
| { |
| "epoch": 6.949626865671641, |
| "grad_norm": 4.534855365753174, |
| "learning_rate": 1.289685087258004e-05, |
| "loss": 0.2043, |
| "num_input_tokens_seen": 1060296, |
| "step": 3725 |
| }, |
| { |
| "epoch": 6.958955223880597, |
| "grad_norm": 4.624890327453613, |
| "learning_rate": 1.2825685677947563e-05, |
| "loss": 0.0744, |
| "num_input_tokens_seen": 1061640, |
| "step": 3730 |
| }, |
| { |
| "epoch": 6.968283582089552, |
| "grad_norm": 0.9943703413009644, |
| "learning_rate": 1.2754649566384114e-05, |
| "loss": 0.0967, |
| "num_input_tokens_seen": 1062888, |
| "step": 3735 |
| }, |
| { |
| "epoch": 6.977611940298507, |
| "grad_norm": 3.311591148376465, |
| "learning_rate": 1.2683743291078649e-05, |
| "loss": 0.1132, |
| "num_input_tokens_seen": 1064328, |
| "step": 3740 |
| }, |
| { |
| "epoch": 6.986940298507463, |
| "grad_norm": 3.9562103748321533, |
| "learning_rate": 1.2612967603843512e-05, |
| "loss": 0.0571, |
| "num_input_tokens_seen": 1065928, |
| "step": 3745 |
| }, |
| { |
| "epoch": 6.996268656716418, |
| "grad_norm": 2.187641143798828, |
| "learning_rate": 1.2542323255106408e-05, |
| "loss": 0.1303, |
| "num_input_tokens_seen": 1067176, |
| "step": 3750 |
| }, |
| { |
| "epoch": 7.0, |
| "eval_loss": 0.8885967135429382, |
| "eval_runtime": 2.845, |
| "eval_samples_per_second": 83.655, |
| "eval_steps_per_second": 21.089, |
| "num_input_tokens_seen": 1067520, |
| "step": 3752 |
| }, |
| { |
| "epoch": 7.005597014925373, |
| "grad_norm": 4.070071220397949, |
| "learning_rate": 1.24718109939025e-05, |
| "loss": 0.1253, |
| "num_input_tokens_seen": 1068192, |
| "step": 3755 |
| }, |
| { |
| "epoch": 7.014925373134329, |
| "grad_norm": 3.5709125995635986, |
| "learning_rate": 1.2401431567866425e-05, |
| "loss": 0.0977, |
| "num_input_tokens_seen": 1069600, |
| "step": 3760 |
| }, |
| { |
| "epoch": 7.024253731343284, |
| "grad_norm": 5.1813178062438965, |
| "learning_rate": 1.233118572322437e-05, |
| "loss": 0.1069, |
| "num_input_tokens_seen": 1070880, |
| "step": 3765 |
| }, |
| { |
| "epoch": 7.0335820895522385, |
| "grad_norm": 0.7963029146194458, |
| "learning_rate": 1.22610742047862e-05, |
| "loss": 0.0372, |
| "num_input_tokens_seen": 1072416, |
| "step": 3770 |
| }, |
| { |
| "epoch": 7.042910447761194, |
| "grad_norm": 5.064619064331055, |
| "learning_rate": 1.21910977559375e-05, |
| "loss": 0.0819, |
| "num_input_tokens_seen": 1073824, |
| "step": 3775 |
| }, |
| { |
| "epoch": 7.052238805970149, |
| "grad_norm": 4.033430099487305, |
| "learning_rate": 1.212125711863176e-05, |
| "loss": 0.0361, |
| "num_input_tokens_seen": 1074976, |
| "step": 3780 |
| }, |
| { |
| "epoch": 7.061567164179104, |
| "grad_norm": 1.5920814275741577, |
| "learning_rate": 1.2051553033382425e-05, |
| "loss": 0.0392, |
| "num_input_tokens_seen": 1076416, |
| "step": 3785 |
| }, |
| { |
| "epoch": 7.07089552238806, |
| "grad_norm": 1.3038798570632935, |
| "learning_rate": 1.1981986239255164e-05, |
| "loss": 0.0998, |
| "num_input_tokens_seen": 1077920, |
| "step": 3790 |
| }, |
| { |
| "epoch": 7.080223880597015, |
| "grad_norm": 2.997217893600464, |
| "learning_rate": 1.1912557473859895e-05, |
| "loss": 0.0555, |
| "num_input_tokens_seen": 1079776, |
| "step": 3795 |
| }, |
| { |
| "epoch": 7.08955223880597, |
| "grad_norm": 8.95091438293457, |
| "learning_rate": 1.1843267473343048e-05, |
| "loss": 0.1004, |
| "num_input_tokens_seen": 1081088, |
| "step": 3800 |
| }, |
| { |
| "epoch": 7.098880597014926, |
| "grad_norm": 6.654694557189941, |
| "learning_rate": 1.177411697237977e-05, |
| "loss": 0.0711, |
| "num_input_tokens_seen": 1082624, |
| "step": 3805 |
| }, |
| { |
| "epoch": 7.108208955223881, |
| "grad_norm": 1.1965399980545044, |
| "learning_rate": 1.1705106704166069e-05, |
| "loss": 0.0429, |
| "num_input_tokens_seen": 1084000, |
| "step": 3810 |
| }, |
| { |
| "epoch": 7.117537313432836, |
| "grad_norm": 4.996100902557373, |
| "learning_rate": 1.1636237400411107e-05, |
| "loss": 0.0688, |
| "num_input_tokens_seen": 1085504, |
| "step": 3815 |
| }, |
| { |
| "epoch": 7.126865671641791, |
| "grad_norm": 1.3035321235656738, |
| "learning_rate": 1.1567509791329401e-05, |
| "loss": 0.0243, |
| "num_input_tokens_seen": 1086848, |
| "step": 3820 |
| }, |
| { |
| "epoch": 7.1361940298507465, |
| "grad_norm": 3.5921273231506348, |
| "learning_rate": 1.1498924605633111e-05, |
| "loss": 0.065, |
| "num_input_tokens_seen": 1088160, |
| "step": 3825 |
| }, |
| { |
| "epoch": 7.145522388059701, |
| "grad_norm": 4.819706439971924, |
| "learning_rate": 1.1430482570524253e-05, |
| "loss": 0.1322, |
| "num_input_tokens_seen": 1089568, |
| "step": 3830 |
| }, |
| { |
| "epoch": 7.154850746268656, |
| "grad_norm": 3.4641623497009277, |
| "learning_rate": 1.136218441168709e-05, |
| "loss": 0.1218, |
| "num_input_tokens_seen": 1090752, |
| "step": 3835 |
| }, |
| { |
| "epoch": 7.164179104477612, |
| "grad_norm": 6.5249409675598145, |
| "learning_rate": 1.1294030853280321e-05, |
| "loss": 0.0636, |
| "num_input_tokens_seen": 1092128, |
| "step": 3840 |
| }, |
| { |
| "epoch": 7.173507462686567, |
| "grad_norm": 1.9167776107788086, |
| "learning_rate": 1.12260226179295e-05, |
| "loss": 0.0745, |
| "num_input_tokens_seen": 1093376, |
| "step": 3845 |
| }, |
| { |
| "epoch": 7.182835820895522, |
| "grad_norm": 2.50765061378479, |
| "learning_rate": 1.115816042671933e-05, |
| "loss": 0.0663, |
| "num_input_tokens_seen": 1094848, |
| "step": 3850 |
| }, |
| { |
| "epoch": 7.192164179104478, |
| "grad_norm": 6.417735576629639, |
| "learning_rate": 1.1090444999186e-05, |
| "loss": 0.1031, |
| "num_input_tokens_seen": 1096352, |
| "step": 3855 |
| }, |
| { |
| "epoch": 7.201492537313433, |
| "grad_norm": 4.3002119064331055, |
| "learning_rate": 1.102287705330961e-05, |
| "loss": 0.0699, |
| "num_input_tokens_seen": 1097824, |
| "step": 3860 |
| }, |
| { |
| "epoch": 7.210820895522388, |
| "grad_norm": 2.75150728225708, |
| "learning_rate": 1.095545730550649e-05, |
| "loss": 0.0319, |
| "num_input_tokens_seen": 1099136, |
| "step": 3865 |
| }, |
| { |
| "epoch": 7.220149253731344, |
| "grad_norm": 2.7232816219329834, |
| "learning_rate": 1.0888186470621675e-05, |
| "loss": 0.0223, |
| "num_input_tokens_seen": 1100512, |
| "step": 3870 |
| }, |
| { |
| "epoch": 7.229477611940299, |
| "grad_norm": 4.735564708709717, |
| "learning_rate": 1.0821065261921262e-05, |
| "loss": 0.1111, |
| "num_input_tokens_seen": 1101856, |
| "step": 3875 |
| }, |
| { |
| "epoch": 7.2388059701492535, |
| "grad_norm": 4.048469066619873, |
| "learning_rate": 1.0754094391084896e-05, |
| "loss": 0.0785, |
| "num_input_tokens_seen": 1103264, |
| "step": 3880 |
| }, |
| { |
| "epoch": 7.248134328358209, |
| "grad_norm": 14.058972358703613, |
| "learning_rate": 1.0687274568198208e-05, |
| "loss": 0.2413, |
| "num_input_tokens_seen": 1104736, |
| "step": 3885 |
| }, |
| { |
| "epoch": 7.257462686567164, |
| "grad_norm": 1.591977596282959, |
| "learning_rate": 1.0620606501745251e-05, |
| "loss": 0.0251, |
| "num_input_tokens_seen": 1106336, |
| "step": 3890 |
| }, |
| { |
| "epoch": 7.266791044776119, |
| "grad_norm": 6.045614242553711, |
| "learning_rate": 1.0554090898601062e-05, |
| "loss": 0.1345, |
| "num_input_tokens_seen": 1107808, |
| "step": 3895 |
| }, |
| { |
| "epoch": 7.276119402985074, |
| "grad_norm": 4.331659317016602, |
| "learning_rate": 1.0487728464024086e-05, |
| "loss": 0.1, |
| "num_input_tokens_seen": 1109184, |
| "step": 3900 |
| }, |
| { |
| "epoch": 7.28544776119403, |
| "grad_norm": 4.281707763671875, |
| "learning_rate": 1.0421519901648758e-05, |
| "loss": 0.0997, |
| "num_input_tokens_seen": 1110720, |
| "step": 3905 |
| }, |
| { |
| "epoch": 7.294776119402985, |
| "grad_norm": 1.8491820096969604, |
| "learning_rate": 1.0355465913478019e-05, |
| "loss": 0.0359, |
| "num_input_tokens_seen": 1112096, |
| "step": 3910 |
| }, |
| { |
| "epoch": 7.30410447761194, |
| "grad_norm": 2.129868507385254, |
| "learning_rate": 1.0289567199875878e-05, |
| "loss": 0.0603, |
| "num_input_tokens_seen": 1113408, |
| "step": 3915 |
| }, |
| { |
| "epoch": 7.313432835820896, |
| "grad_norm": 4.202950954437256, |
| "learning_rate": 1.0223824459559971e-05, |
| "loss": 0.0771, |
| "num_input_tokens_seen": 1114784, |
| "step": 3920 |
| }, |
| { |
| "epoch": 7.322761194029851, |
| "grad_norm": 1.0255919694900513, |
| "learning_rate": 1.0158238389594164e-05, |
| "loss": 0.0394, |
| "num_input_tokens_seen": 1116416, |
| "step": 3925 |
| }, |
| { |
| "epoch": 7.332089552238806, |
| "grad_norm": 2.7475836277008057, |
| "learning_rate": 1.009280968538118e-05, |
| "loss": 0.0344, |
| "num_input_tokens_seen": 1117824, |
| "step": 3930 |
| }, |
| { |
| "epoch": 7.3414179104477615, |
| "grad_norm": 6.523220539093018, |
| "learning_rate": 1.0027539040655173e-05, |
| "loss": 0.0992, |
| "num_input_tokens_seen": 1119136, |
| "step": 3935 |
| }, |
| { |
| "epoch": 7.350746268656716, |
| "grad_norm": 3.7833569049835205, |
| "learning_rate": 9.962427147474474e-06, |
| "loss": 0.108, |
| "num_input_tokens_seen": 1120480, |
| "step": 3940 |
| }, |
| { |
| "epoch": 7.360074626865671, |
| "grad_norm": 1.2123339176177979, |
| "learning_rate": 9.89747469621411e-06, |
| "loss": 0.0323, |
| "num_input_tokens_seen": 1121856, |
| "step": 3945 |
| }, |
| { |
| "epoch": 7.369402985074627, |
| "grad_norm": 1.6664719581604004, |
| "learning_rate": 9.832682375558614e-06, |
| "loss": 0.0888, |
| "num_input_tokens_seen": 1123136, |
| "step": 3950 |
| }, |
| { |
| "epoch": 7.378731343283582, |
| "grad_norm": 4.599035263061523, |
| "learning_rate": 9.768050872494638e-06, |
| "loss": 0.071, |
| "num_input_tokens_seen": 1124480, |
| "step": 3955 |
| }, |
| { |
| "epoch": 7.388059701492537, |
| "grad_norm": 5.097347736358643, |
| "learning_rate": 9.70358087230373e-06, |
| "loss": 0.2264, |
| "num_input_tokens_seen": 1125856, |
| "step": 3960 |
| }, |
| { |
| "epoch": 7.397388059701493, |
| "grad_norm": 3.7319228649139404, |
| "learning_rate": 9.639273058555004e-06, |
| "loss": 0.0666, |
| "num_input_tokens_seen": 1127264, |
| "step": 3965 |
| }, |
| { |
| "epoch": 7.406716417910448, |
| "grad_norm": 1.4606637954711914, |
| "learning_rate": 9.575128113097952e-06, |
| "loss": 0.0958, |
| "num_input_tokens_seen": 1128640, |
| "step": 3970 |
| }, |
| { |
| "epoch": 7.416044776119403, |
| "grad_norm": 6.494813442230225, |
| "learning_rate": 9.5111467160552e-06, |
| "loss": 0.2061, |
| "num_input_tokens_seen": 1129984, |
| "step": 3975 |
| }, |
| { |
| "epoch": 7.425373134328359, |
| "grad_norm": 1.749520182609558, |
| "learning_rate": 9.447329545815246e-06, |
| "loss": 0.0526, |
| "num_input_tokens_seen": 1131552, |
| "step": 3980 |
| }, |
| { |
| "epoch": 7.434701492537314, |
| "grad_norm": 5.3069586753845215, |
| "learning_rate": 9.383677279025347e-06, |
| "loss": 0.1195, |
| "num_input_tokens_seen": 1132928, |
| "step": 3985 |
| }, |
| { |
| "epoch": 7.4440298507462686, |
| "grad_norm": 1.512669324874878, |
| "learning_rate": 9.320190590584273e-06, |
| "loss": 0.0523, |
| "num_input_tokens_seen": 1134336, |
| "step": 3990 |
| }, |
| { |
| "epoch": 7.4533582089552235, |
| "grad_norm": 0.5251164436340332, |
| "learning_rate": 9.256870153635208e-06, |
| "loss": 0.0662, |
| "num_input_tokens_seen": 1135840, |
| "step": 3995 |
| }, |
| { |
| "epoch": 7.462686567164179, |
| "grad_norm": 11.922141075134277, |
| "learning_rate": 9.193716639558587e-06, |
| "loss": 0.0691, |
| "num_input_tokens_seen": 1137216, |
| "step": 4000 |
| }, |
| { |
| "epoch": 7.472014925373134, |
| "grad_norm": 1.0020458698272705, |
| "learning_rate": 9.130730717964948e-06, |
| "loss": 0.0889, |
| "num_input_tokens_seen": 1138528, |
| "step": 4005 |
| }, |
| { |
| "epoch": 7.481343283582089, |
| "grad_norm": 5.1921210289001465, |
| "learning_rate": 9.067913056687899e-06, |
| "loss": 0.1071, |
| "num_input_tokens_seen": 1139872, |
| "step": 4010 |
| }, |
| { |
| "epoch": 7.490671641791045, |
| "grad_norm": 3.8372581005096436, |
| "learning_rate": 9.005264321776974e-06, |
| "loss": 0.0785, |
| "num_input_tokens_seen": 1141408, |
| "step": 4015 |
| }, |
| { |
| "epoch": 7.5, |
| "grad_norm": 3.8975751399993896, |
| "learning_rate": 8.942785177490617e-06, |
| "loss": 0.1196, |
| "num_input_tokens_seen": 1142912, |
| "step": 4020 |
| }, |
| { |
| "epoch": 7.5, |
| "eval_loss": 1.002477765083313, |
| "eval_runtime": 2.8707, |
| "eval_samples_per_second": 82.906, |
| "eval_steps_per_second": 20.901, |
| "num_input_tokens_seen": 1142912, |
| "step": 4020 |
| }, |
| { |
| "epoch": 7.509328358208955, |
| "grad_norm": 1.9084550142288208, |
| "learning_rate": 8.880476286289091e-06, |
| "loss": 0.0674, |
| "num_input_tokens_seen": 1144256, |
| "step": 4025 |
| }, |
| { |
| "epoch": 7.518656716417911, |
| "grad_norm": 5.356456756591797, |
| "learning_rate": 8.818338308827529e-06, |
| "loss": 0.1112, |
| "num_input_tokens_seen": 1145568, |
| "step": 4030 |
| }, |
| { |
| "epoch": 7.527985074626866, |
| "grad_norm": 5.473300933837891, |
| "learning_rate": 8.75637190394884e-06, |
| "loss": 0.0651, |
| "num_input_tokens_seen": 1147008, |
| "step": 4035 |
| }, |
| { |
| "epoch": 7.537313432835821, |
| "grad_norm": 8.976759910583496, |
| "learning_rate": 8.69457772867677e-06, |
| "loss": 0.0424, |
| "num_input_tokens_seen": 1148416, |
| "step": 4040 |
| }, |
| { |
| "epoch": 7.5466417910447765, |
| "grad_norm": 0.3374466300010681, |
| "learning_rate": 8.632956438208961e-06, |
| "loss": 0.0379, |
| "num_input_tokens_seen": 1149888, |
| "step": 4045 |
| }, |
| { |
| "epoch": 7.5559701492537314, |
| "grad_norm": 3.9886317253112793, |
| "learning_rate": 8.571508685909931e-06, |
| "loss": 0.0699, |
| "num_input_tokens_seen": 1151360, |
| "step": 4050 |
| }, |
| { |
| "epoch": 7.565298507462686, |
| "grad_norm": 1.0209050178527832, |
| "learning_rate": 8.510235123304227e-06, |
| "loss": 0.0997, |
| "num_input_tokens_seen": 1152800, |
| "step": 4055 |
| }, |
| { |
| "epoch": 7.574626865671641, |
| "grad_norm": 3.5404539108276367, |
| "learning_rate": 8.449136400069464e-06, |
| "loss": 0.0373, |
| "num_input_tokens_seen": 1154272, |
| "step": 4060 |
| }, |
| { |
| "epoch": 7.583955223880597, |
| "grad_norm": 5.972469806671143, |
| "learning_rate": 8.38821316402946e-06, |
| "loss": 0.0632, |
| "num_input_tokens_seen": 1155584, |
| "step": 4065 |
| }, |
| { |
| "epoch": 7.593283582089552, |
| "grad_norm": 5.776495933532715, |
| "learning_rate": 8.327466061147337e-06, |
| "loss": 0.0832, |
| "num_input_tokens_seen": 1157152, |
| "step": 4070 |
| }, |
| { |
| "epoch": 7.602611940298507, |
| "grad_norm": 8.755820274353027, |
| "learning_rate": 8.266895735518729e-06, |
| "loss": 0.0913, |
| "num_input_tokens_seen": 1158432, |
| "step": 4075 |
| }, |
| { |
| "epoch": 7.611940298507463, |
| "grad_norm": 3.1021695137023926, |
| "learning_rate": 8.20650282936488e-06, |
| "loss": 0.048, |
| "num_input_tokens_seen": 1160032, |
| "step": 4080 |
| }, |
| { |
| "epoch": 7.621268656716418, |
| "grad_norm": 3.7271265983581543, |
| "learning_rate": 8.146287983025903e-06, |
| "loss": 0.0599, |
| "num_input_tokens_seen": 1161536, |
| "step": 4085 |
| }, |
| { |
| "epoch": 7.630597014925373, |
| "grad_norm": 4.845669269561768, |
| "learning_rate": 8.086251834953953e-06, |
| "loss": 0.1202, |
| "num_input_tokens_seen": 1163072, |
| "step": 4090 |
| }, |
| { |
| "epoch": 7.639925373134329, |
| "grad_norm": 0.8477360010147095, |
| "learning_rate": 8.02639502170645e-06, |
| "loss": 0.0307, |
| "num_input_tokens_seen": 1164384, |
| "step": 4095 |
| }, |
| { |
| "epoch": 7.649253731343284, |
| "grad_norm": 5.028936862945557, |
| "learning_rate": 7.966718177939366e-06, |
| "loss": 0.1139, |
| "num_input_tokens_seen": 1165728, |
| "step": 4100 |
| }, |
| { |
| "epoch": 7.6585820895522385, |
| "grad_norm": 2.090909481048584, |
| "learning_rate": 7.907221936400453e-06, |
| "loss": 0.0573, |
| "num_input_tokens_seen": 1167264, |
| "step": 4105 |
| }, |
| { |
| "epoch": 7.667910447761194, |
| "grad_norm": 4.771596908569336, |
| "learning_rate": 7.847906927922574e-06, |
| "loss": 0.0854, |
| "num_input_tokens_seen": 1168800, |
| "step": 4110 |
| }, |
| { |
| "epoch": 7.677238805970149, |
| "grad_norm": 1.967151165008545, |
| "learning_rate": 7.788773781416974e-06, |
| "loss": 0.0879, |
| "num_input_tokens_seen": 1170240, |
| "step": 4115 |
| }, |
| { |
| "epoch": 7.686567164179104, |
| "grad_norm": 3.251784086227417, |
| "learning_rate": 7.729823123866656e-06, |
| "loss": 0.0482, |
| "num_input_tokens_seen": 1171936, |
| "step": 4120 |
| }, |
| { |
| "epoch": 7.69589552238806, |
| "grad_norm": 4.973485469818115, |
| "learning_rate": 7.671055580319706e-06, |
| "loss": 0.1193, |
| "num_input_tokens_seen": 1173216, |
| "step": 4125 |
| }, |
| { |
| "epoch": 7.705223880597015, |
| "grad_norm": 1.899452567100525, |
| "learning_rate": 7.612471773882651e-06, |
| "loss": 0.0967, |
| "num_input_tokens_seen": 1174496, |
| "step": 4130 |
| }, |
| { |
| "epoch": 7.71455223880597, |
| "grad_norm": 3.737504720687866, |
| "learning_rate": 7.554072325713896e-06, |
| "loss": 0.0755, |
| "num_input_tokens_seen": 1176032, |
| "step": 4135 |
| }, |
| { |
| "epoch": 7.723880597014926, |
| "grad_norm": 1.9849438667297363, |
| "learning_rate": 7.495857855017091e-06, |
| "loss": 0.1349, |
| "num_input_tokens_seen": 1177472, |
| "step": 4140 |
| }, |
| { |
| "epoch": 7.733208955223881, |
| "grad_norm": 5.063560485839844, |
| "learning_rate": 7.437828979034606e-06, |
| "loss": 0.0485, |
| "num_input_tokens_seen": 1178976, |
| "step": 4145 |
| }, |
| { |
| "epoch": 7.742537313432836, |
| "grad_norm": 3.42476749420166, |
| "learning_rate": 7.379986313040959e-06, |
| "loss": 0.1142, |
| "num_input_tokens_seen": 1180448, |
| "step": 4150 |
| }, |
| { |
| "epoch": 7.7518656716417915, |
| "grad_norm": 7.727182865142822, |
| "learning_rate": 7.3223304703363135e-06, |
| "loss": 0.0715, |
| "num_input_tokens_seen": 1181888, |
| "step": 4155 |
| }, |
| { |
| "epoch": 7.7611940298507465, |
| "grad_norm": 1.7799769639968872, |
| "learning_rate": 7.264862062239947e-06, |
| "loss": 0.04, |
| "num_input_tokens_seen": 1183456, |
| "step": 4160 |
| }, |
| { |
| "epoch": 7.770522388059701, |
| "grad_norm": 2.8688454627990723, |
| "learning_rate": 7.2075816980837814e-06, |
| "loss": 0.1056, |
| "num_input_tokens_seen": 1184800, |
| "step": 4165 |
| }, |
| { |
| "epoch": 7.779850746268656, |
| "grad_norm": 4.925065517425537, |
| "learning_rate": 7.150489985205952e-06, |
| "loss": 0.0945, |
| "num_input_tokens_seen": 1186304, |
| "step": 4170 |
| }, |
| { |
| "epoch": 7.789179104477612, |
| "grad_norm": 2.0020103454589844, |
| "learning_rate": 7.093587528944326e-06, |
| "loss": 0.1462, |
| "num_input_tokens_seen": 1187744, |
| "step": 4175 |
| }, |
| { |
| "epoch": 7.798507462686567, |
| "grad_norm": 2.420915365219116, |
| "learning_rate": 7.036874932630117e-06, |
| "loss": 0.0798, |
| "num_input_tokens_seen": 1189376, |
| "step": 4180 |
| }, |
| { |
| "epoch": 7.807835820895522, |
| "grad_norm": 2.1272621154785156, |
| "learning_rate": 6.980352797581438e-06, |
| "loss": 0.0591, |
| "num_input_tokens_seen": 1190912, |
| "step": 4185 |
| }, |
| { |
| "epoch": 7.817164179104478, |
| "grad_norm": 3.962641954421997, |
| "learning_rate": 6.924021723097004e-06, |
| "loss": 0.1434, |
| "num_input_tokens_seen": 1192416, |
| "step": 4190 |
| }, |
| { |
| "epoch": 7.826492537313433, |
| "grad_norm": 4.562009334564209, |
| "learning_rate": 6.867882306449694e-06, |
| "loss": 0.0957, |
| "num_input_tokens_seen": 1193792, |
| "step": 4195 |
| }, |
| { |
| "epoch": 7.835820895522388, |
| "grad_norm": 1.5632730722427368, |
| "learning_rate": 6.8119351428802796e-06, |
| "loss": 0.0744, |
| "num_input_tokens_seen": 1195200, |
| "step": 4200 |
| }, |
| { |
| "epoch": 7.845149253731344, |
| "grad_norm": 2.4257190227508545, |
| "learning_rate": 6.756180825591099e-06, |
| "loss": 0.0499, |
| "num_input_tokens_seen": 1196672, |
| "step": 4205 |
| }, |
| { |
| "epoch": 7.854477611940299, |
| "grad_norm": 4.7365617752075195, |
| "learning_rate": 6.700619945739728e-06, |
| "loss": 0.0978, |
| "num_input_tokens_seen": 1198336, |
| "step": 4210 |
| }, |
| { |
| "epoch": 7.8638059701492535, |
| "grad_norm": 1.2376832962036133, |
| "learning_rate": 6.645253092432785e-06, |
| "loss": 0.0117, |
| "num_input_tokens_seen": 1199712, |
| "step": 4215 |
| }, |
| { |
| "epoch": 7.8731343283582085, |
| "grad_norm": 1.8205276727676392, |
| "learning_rate": 6.590080852719602e-06, |
| "loss": 0.041, |
| "num_input_tokens_seen": 1201088, |
| "step": 4220 |
| }, |
| { |
| "epoch": 7.882462686567164, |
| "grad_norm": 10.758254051208496, |
| "learning_rate": 6.535103811586085e-06, |
| "loss": 0.1037, |
| "num_input_tokens_seen": 1202432, |
| "step": 4225 |
| }, |
| { |
| "epoch": 7.891791044776119, |
| "grad_norm": 2.6332898139953613, |
| "learning_rate": 6.480322551948426e-06, |
| "loss": 0.0793, |
| "num_input_tokens_seen": 1203872, |
| "step": 4230 |
| }, |
| { |
| "epoch": 7.901119402985074, |
| "grad_norm": 8.059739112854004, |
| "learning_rate": 6.425737654646993e-06, |
| "loss": 0.0765, |
| "num_input_tokens_seen": 1205312, |
| "step": 4235 |
| }, |
| { |
| "epoch": 7.91044776119403, |
| "grad_norm": 2.998669385910034, |
| "learning_rate": 6.371349698440149e-06, |
| "loss": 0.1025, |
| "num_input_tokens_seen": 1206688, |
| "step": 4240 |
| }, |
| { |
| "epoch": 7.919776119402985, |
| "grad_norm": 2.7940337657928467, |
| "learning_rate": 6.317159259998073e-06, |
| "loss": 0.0948, |
| "num_input_tokens_seen": 1208064, |
| "step": 4245 |
| }, |
| { |
| "epoch": 7.92910447761194, |
| "grad_norm": 2.4969735145568848, |
| "learning_rate": 6.2631669138967285e-06, |
| "loss": 0.0764, |
| "num_input_tokens_seen": 1209600, |
| "step": 4250 |
| }, |
| { |
| "epoch": 7.938432835820896, |
| "grad_norm": 1.7211167812347412, |
| "learning_rate": 6.209373232611682e-06, |
| "loss": 0.056, |
| "num_input_tokens_seen": 1210976, |
| "step": 4255 |
| }, |
| { |
| "epoch": 7.947761194029851, |
| "grad_norm": 8.207921028137207, |
| "learning_rate": 6.155778786512109e-06, |
| "loss": 0.0935, |
| "num_input_tokens_seen": 1212416, |
| "step": 4260 |
| }, |
| { |
| "epoch": 7.957089552238806, |
| "grad_norm": 5.8965935707092285, |
| "learning_rate": 6.102384143854698e-06, |
| "loss": 0.0747, |
| "num_input_tokens_seen": 1213920, |
| "step": 4265 |
| }, |
| { |
| "epoch": 7.9664179104477615, |
| "grad_norm": 5.342392444610596, |
| "learning_rate": 6.049189870777652e-06, |
| "loss": 0.0898, |
| "num_input_tokens_seen": 1215360, |
| "step": 4270 |
| }, |
| { |
| "epoch": 7.975746268656716, |
| "grad_norm": 6.013731956481934, |
| "learning_rate": 5.996196531294657e-06, |
| "loss": 0.0349, |
| "num_input_tokens_seen": 1216736, |
| "step": 4275 |
| }, |
| { |
| "epoch": 7.985074626865671, |
| "grad_norm": 0.6840583086013794, |
| "learning_rate": 5.943404687288939e-06, |
| "loss": 0.1189, |
| "num_input_tokens_seen": 1218240, |
| "step": 4280 |
| }, |
| { |
| "epoch": 7.994402985074627, |
| "grad_norm": 0.9111075401306152, |
| "learning_rate": 5.890814898507277e-06, |
| "loss": 0.1486, |
| "num_input_tokens_seen": 1219488, |
| "step": 4285 |
| }, |
| { |
| "epoch": 8.0, |
| "eval_loss": 0.989206850528717, |
| "eval_runtime": 2.8406, |
| "eval_samples_per_second": 83.787, |
| "eval_steps_per_second": 21.123, |
| "num_input_tokens_seen": 1220200, |
| "step": 4288 |
| }, |
| { |
| "epoch": 8.003731343283581, |
| "grad_norm": 3.8169362545013428, |
| "learning_rate": 5.838427722554057e-06, |
| "loss": 0.1003, |
| "num_input_tokens_seen": 1220808, |
| "step": 4290 |
| }, |
| { |
| "epoch": 8.013059701492537, |
| "grad_norm": 3.055495023727417, |
| "learning_rate": 5.786243714885442e-06, |
| "loss": 0.0973, |
| "num_input_tokens_seen": 1222280, |
| "step": 4295 |
| }, |
| { |
| "epoch": 8.022388059701493, |
| "grad_norm": 1.732284665107727, |
| "learning_rate": 5.734263428803352e-06, |
| "loss": 0.0957, |
| "num_input_tokens_seen": 1223688, |
| "step": 4300 |
| }, |
| { |
| "epoch": 8.031716417910447, |
| "grad_norm": 7.572145462036133, |
| "learning_rate": 5.6824874154497194e-06, |
| "loss": 0.0993, |
| "num_input_tokens_seen": 1225352, |
| "step": 4305 |
| }, |
| { |
| "epoch": 8.041044776119403, |
| "grad_norm": 3.91949462890625, |
| "learning_rate": 5.63091622380055e-06, |
| "loss": 0.051, |
| "num_input_tokens_seen": 1226824, |
| "step": 4310 |
| }, |
| { |
| "epoch": 8.050373134328359, |
| "grad_norm": 3.6691184043884277, |
| "learning_rate": 5.5795504006601855e-06, |
| "loss": 0.0212, |
| "num_input_tokens_seen": 1228232, |
| "step": 4315 |
| }, |
| { |
| "epoch": 8.059701492537313, |
| "grad_norm": 4.257026672363281, |
| "learning_rate": 5.528390490655428e-06, |
| "loss": 0.0401, |
| "num_input_tokens_seen": 1229672, |
| "step": 4320 |
| }, |
| { |
| "epoch": 8.069029850746269, |
| "grad_norm": 1.5009533166885376, |
| "learning_rate": 5.477437036229833e-06, |
| "loss": 0.0669, |
| "num_input_tokens_seen": 1230920, |
| "step": 4325 |
| }, |
| { |
| "epoch": 8.078358208955224, |
| "grad_norm": 0.6105980277061462, |
| "learning_rate": 5.426690577637913e-06, |
| "loss": 0.0118, |
| "num_input_tokens_seen": 1232232, |
| "step": 4330 |
| }, |
| { |
| "epoch": 8.087686567164178, |
| "grad_norm": 2.6570894718170166, |
| "learning_rate": 5.37615165293942e-06, |
| "loss": 0.0893, |
| "num_input_tokens_seen": 1233640, |
| "step": 4335 |
| }, |
| { |
| "epoch": 8.097014925373134, |
| "grad_norm": 4.449321269989014, |
| "learning_rate": 5.325820797993661e-06, |
| "loss": 0.052, |
| "num_input_tokens_seen": 1235048, |
| "step": 4340 |
| }, |
| { |
| "epoch": 8.10634328358209, |
| "grad_norm": 0.9205982685089111, |
| "learning_rate": 5.275698546453775e-06, |
| "loss": 0.046, |
| "num_input_tokens_seen": 1236552, |
| "step": 4345 |
| }, |
| { |
| "epoch": 8.115671641791044, |
| "grad_norm": 0.7229433655738831, |
| "learning_rate": 5.225785429761124e-06, |
| "loss": 0.0889, |
| "num_input_tokens_seen": 1237896, |
| "step": 4350 |
| }, |
| { |
| "epoch": 8.125, |
| "grad_norm": 1.545849084854126, |
| "learning_rate": 5.176081977139621e-06, |
| "loss": 0.0465, |
| "num_input_tokens_seen": 1239112, |
| "step": 4355 |
| }, |
| { |
| "epoch": 8.134328358208956, |
| "grad_norm": 0.5773685574531555, |
| "learning_rate": 5.12658871559013e-06, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 1240744, |
| "step": 4360 |
| }, |
| { |
| "epoch": 8.14365671641791, |
| "grad_norm": 1.7627652883529663, |
| "learning_rate": 5.077306169884888e-06, |
| "loss": 0.0375, |
| "num_input_tokens_seen": 1242152, |
| "step": 4365 |
| }, |
| { |
| "epoch": 8.152985074626866, |
| "grad_norm": 2.992675304412842, |
| "learning_rate": 5.0282348625619175e-06, |
| "loss": 0.022, |
| "num_input_tokens_seen": 1243496, |
| "step": 4370 |
| }, |
| { |
| "epoch": 8.162313432835822, |
| "grad_norm": 1.1340343952178955, |
| "learning_rate": 4.979375313919526e-06, |
| "loss": 0.0248, |
| "num_input_tokens_seen": 1245064, |
| "step": 4375 |
| }, |
| { |
| "epoch": 8.171641791044776, |
| "grad_norm": 4.9795637130737305, |
| "learning_rate": 4.930728042010724e-06, |
| "loss": 0.03, |
| "num_input_tokens_seen": 1246408, |
| "step": 4380 |
| }, |
| { |
| "epoch": 8.180970149253731, |
| "grad_norm": 3.1528468132019043, |
| "learning_rate": 4.882293562637827e-06, |
| "loss": 0.0996, |
| "num_input_tokens_seen": 1247784, |
| "step": 4385 |
| }, |
| { |
| "epoch": 8.190298507462687, |
| "grad_norm": 2.4150609970092773, |
| "learning_rate": 4.834072389346883e-06, |
| "loss": 0.0576, |
| "num_input_tokens_seen": 1249192, |
| "step": 4390 |
| }, |
| { |
| "epoch": 8.199626865671641, |
| "grad_norm": 1.7107328176498413, |
| "learning_rate": 4.78606503342231e-06, |
| "loss": 0.0262, |
| "num_input_tokens_seen": 1250664, |
| "step": 4395 |
| }, |
| { |
| "epoch": 8.208955223880597, |
| "grad_norm": 2.542297601699829, |
| "learning_rate": 4.738272003881417e-06, |
| "loss": 0.1036, |
| "num_input_tokens_seen": 1252072, |
| "step": 4400 |
| }, |
| { |
| "epoch": 8.218283582089553, |
| "grad_norm": 8.60915756225586, |
| "learning_rate": 4.690693807469035e-06, |
| "loss": 0.1197, |
| "num_input_tokens_seen": 1253416, |
| "step": 4405 |
| }, |
| { |
| "epoch": 8.227611940298507, |
| "grad_norm": 4.1545023918151855, |
| "learning_rate": 4.643330948652155e-06, |
| "loss": 0.1054, |
| "num_input_tokens_seen": 1254728, |
| "step": 4410 |
| }, |
| { |
| "epoch": 8.236940298507463, |
| "grad_norm": 4.680455684661865, |
| "learning_rate": 4.596183929614559e-06, |
| "loss": 0.044, |
| "num_input_tokens_seen": 1255976, |
| "step": 4415 |
| }, |
| { |
| "epoch": 8.246268656716419, |
| "grad_norm": 4.429104804992676, |
| "learning_rate": 4.549253250251498e-06, |
| "loss": 0.0391, |
| "num_input_tokens_seen": 1257288, |
| "step": 4420 |
| }, |
| { |
| "epoch": 8.255597014925373, |
| "grad_norm": 0.6497788429260254, |
| "learning_rate": 4.502539408164386e-06, |
| "loss": 0.0472, |
| "num_input_tokens_seen": 1258664, |
| "step": 4425 |
| }, |
| { |
| "epoch": 8.264925373134329, |
| "grad_norm": 0.19893686473369598, |
| "learning_rate": 4.4560428986555516e-06, |
| "loss": 0.0358, |
| "num_input_tokens_seen": 1260104, |
| "step": 4430 |
| }, |
| { |
| "epoch": 8.274253731343283, |
| "grad_norm": 4.880557060241699, |
| "learning_rate": 4.409764214722945e-06, |
| "loss": 0.0644, |
| "num_input_tokens_seen": 1261448, |
| "step": 4435 |
| }, |
| { |
| "epoch": 8.283582089552239, |
| "grad_norm": 2.664081335067749, |
| "learning_rate": 4.363703847054948e-06, |
| "loss": 0.0292, |
| "num_input_tokens_seen": 1263048, |
| "step": 4440 |
| }, |
| { |
| "epoch": 8.292910447761194, |
| "grad_norm": 0.40079203248023987, |
| "learning_rate": 4.317862284025165e-06, |
| "loss": 0.0477, |
| "num_input_tokens_seen": 1264360, |
| "step": 4445 |
| }, |
| { |
| "epoch": 8.302238805970148, |
| "grad_norm": 1.9124809503555298, |
| "learning_rate": 4.272240011687206e-06, |
| "loss": 0.0577, |
| "num_input_tokens_seen": 1265640, |
| "step": 4450 |
| }, |
| { |
| "epoch": 8.311567164179104, |
| "grad_norm": 2.0922703742980957, |
| "learning_rate": 4.226837513769596e-06, |
| "loss": 0.0273, |
| "num_input_tokens_seen": 1267016, |
| "step": 4455 |
| }, |
| { |
| "epoch": 8.32089552238806, |
| "grad_norm": 6.26092529296875, |
| "learning_rate": 4.181655271670587e-06, |
| "loss": 0.0576, |
| "num_input_tokens_seen": 1268744, |
| "step": 4460 |
| }, |
| { |
| "epoch": 8.330223880597014, |
| "grad_norm": 0.2856329679489136, |
| "learning_rate": 4.1366937644531e-06, |
| "loss": 0.0167, |
| "num_input_tokens_seen": 1270120, |
| "step": 4465 |
| }, |
| { |
| "epoch": 8.33955223880597, |
| "grad_norm": 0.9626179933547974, |
| "learning_rate": 4.091953468839607e-06, |
| "loss": 0.0808, |
| "num_input_tokens_seen": 1271656, |
| "step": 4470 |
| }, |
| { |
| "epoch": 8.348880597014926, |
| "grad_norm": 1.031617283821106, |
| "learning_rate": 4.047434859207114e-06, |
| "loss": 0.0466, |
| "num_input_tokens_seen": 1273128, |
| "step": 4475 |
| }, |
| { |
| "epoch": 8.35820895522388, |
| "grad_norm": 1.6361671686172485, |
| "learning_rate": 4.003138407582102e-06, |
| "loss": 0.0772, |
| "num_input_tokens_seen": 1274632, |
| "step": 4480 |
| }, |
| { |
| "epoch": 8.367537313432836, |
| "grad_norm": 6.7830071449279785, |
| "learning_rate": 3.959064583635527e-06, |
| "loss": 0.05, |
| "num_input_tokens_seen": 1275848, |
| "step": 4485 |
| }, |
| { |
| "epoch": 8.376865671641792, |
| "grad_norm": 1.5918464660644531, |
| "learning_rate": 3.9152138546778625e-06, |
| "loss": 0.0817, |
| "num_input_tokens_seen": 1277224, |
| "step": 4490 |
| }, |
| { |
| "epoch": 8.386194029850746, |
| "grad_norm": 2.880215644836426, |
| "learning_rate": 3.871586685654102e-06, |
| "loss": 0.1527, |
| "num_input_tokens_seen": 1278504, |
| "step": 4495 |
| }, |
| { |
| "epoch": 8.395522388059701, |
| "grad_norm": 2.9496569633483887, |
| "learning_rate": 3.82818353913888e-06, |
| "loss": 0.0752, |
| "num_input_tokens_seen": 1279784, |
| "step": 4500 |
| }, |
| { |
| "epoch": 8.404850746268657, |
| "grad_norm": 2.991889476776123, |
| "learning_rate": 3.785004875331527e-06, |
| "loss": 0.0481, |
| "num_input_tokens_seen": 1281256, |
| "step": 4505 |
| }, |
| { |
| "epoch": 8.414179104477611, |
| "grad_norm": 0.3692754805088043, |
| "learning_rate": 3.742051152051221e-06, |
| "loss": 0.0779, |
| "num_input_tokens_seen": 1282792, |
| "step": 4510 |
| }, |
| { |
| "epoch": 8.423507462686567, |
| "grad_norm": 0.9901413917541504, |
| "learning_rate": 3.6993228247320877e-06, |
| "loss": 0.0226, |
| "num_input_tokens_seen": 1284232, |
| "step": 4515 |
| }, |
| { |
| "epoch": 8.432835820895523, |
| "grad_norm": 1.377192497253418, |
| "learning_rate": 3.6568203464184383e-06, |
| "loss": 0.0282, |
| "num_input_tokens_seen": 1285448, |
| "step": 4520 |
| }, |
| { |
| "epoch": 8.442164179104477, |
| "grad_norm": 6.552360534667969, |
| "learning_rate": 3.614544167759901e-06, |
| "loss": 0.1244, |
| "num_input_tokens_seen": 1286824, |
| "step": 4525 |
| }, |
| { |
| "epoch": 8.451492537313433, |
| "grad_norm": 4.57367467880249, |
| "learning_rate": 3.5724947370066708e-06, |
| "loss": 0.0978, |
| "num_input_tokens_seen": 1288136, |
| "step": 4530 |
| }, |
| { |
| "epoch": 8.460820895522389, |
| "grad_norm": 0.2352503538131714, |
| "learning_rate": 3.5306725000047918e-06, |
| "loss": 0.0084, |
| "num_input_tokens_seen": 1289512, |
| "step": 4535 |
| }, |
| { |
| "epoch": 8.470149253731343, |
| "grad_norm": 1.2270539999008179, |
| "learning_rate": 3.4890779001913452e-06, |
| "loss": 0.0404, |
| "num_input_tokens_seen": 1291144, |
| "step": 4540 |
| }, |
| { |
| "epoch": 8.479477611940299, |
| "grad_norm": 4.6860127449035645, |
| "learning_rate": 3.4477113785898407e-06, |
| "loss": 0.0543, |
| "num_input_tokens_seen": 1292680, |
| "step": 4545 |
| }, |
| { |
| "epoch": 8.488805970149254, |
| "grad_norm": 1.3688758611679077, |
| "learning_rate": 3.4065733738054606e-06, |
| "loss": 0.0709, |
| "num_input_tokens_seen": 1294056, |
| "step": 4550 |
| }, |
| { |
| "epoch": 8.498134328358208, |
| "grad_norm": 2.114455223083496, |
| "learning_rate": 3.3656643220204785e-06, |
| "loss": 0.1137, |
| "num_input_tokens_seen": 1295400, |
| "step": 4555 |
| }, |
| { |
| "epoch": 8.5, |
| "eval_loss": 1.0715049505233765, |
| "eval_runtime": 2.8681, |
| "eval_samples_per_second": 82.983, |
| "eval_steps_per_second": 20.92, |
| "num_input_tokens_seen": 1295720, |
| "step": 4556 |
| }, |
| { |
| "epoch": 8.507462686567164, |
| "grad_norm": 3.474656820297241, |
| "learning_rate": 3.3249846569895744e-06, |
| "loss": 0.0608, |
| "num_input_tokens_seen": 1296872, |
| "step": 4560 |
| }, |
| { |
| "epoch": 8.51679104477612, |
| "grad_norm": 3.5808937549591064, |
| "learning_rate": 3.284534810035278e-06, |
| "loss": 0.0579, |
| "num_input_tokens_seen": 1298408, |
| "step": 4565 |
| }, |
| { |
| "epoch": 8.526119402985074, |
| "grad_norm": 0.6513265371322632, |
| "learning_rate": 3.2443152100433832e-06, |
| "loss": 0.1137, |
| "num_input_tokens_seen": 1299848, |
| "step": 4570 |
| }, |
| { |
| "epoch": 8.53544776119403, |
| "grad_norm": 5.376895427703857, |
| "learning_rate": 3.204326283458381e-06, |
| "loss": 0.0555, |
| "num_input_tokens_seen": 1301224, |
| "step": 4575 |
| }, |
| { |
| "epoch": 8.544776119402986, |
| "grad_norm": 3.2730259895324707, |
| "learning_rate": 3.1645684542789765e-06, |
| "loss": 0.0765, |
| "num_input_tokens_seen": 1302856, |
| "step": 4580 |
| }, |
| { |
| "epoch": 8.55410447761194, |
| "grad_norm": 0.9466462731361389, |
| "learning_rate": 3.12504214405355e-06, |
| "loss": 0.0706, |
| "num_input_tokens_seen": 1304168, |
| "step": 4585 |
| }, |
| { |
| "epoch": 8.563432835820896, |
| "grad_norm": 0.6005380749702454, |
| "learning_rate": 3.0857477718757187e-06, |
| "loss": 0.0415, |
| "num_input_tokens_seen": 1305704, |
| "step": 4590 |
| }, |
| { |
| "epoch": 8.572761194029852, |
| "grad_norm": 4.953082084655762, |
| "learning_rate": 3.0466857543798966e-06, |
| "loss": 0.0547, |
| "num_input_tokens_seen": 1307080, |
| "step": 4595 |
| }, |
| { |
| "epoch": 8.582089552238806, |
| "grad_norm": 9.258735656738281, |
| "learning_rate": 3.007856505736836e-06, |
| "loss": 0.0695, |
| "num_input_tokens_seen": 1308296, |
| "step": 4600 |
| }, |
| { |
| "epoch": 8.591417910447761, |
| "grad_norm": 0.6133414506912231, |
| "learning_rate": 2.969260437649293e-06, |
| "loss": 0.0229, |
| "num_input_tokens_seen": 1309832, |
| "step": 4605 |
| }, |
| { |
| "epoch": 8.600746268656717, |
| "grad_norm": 1.7344980239868164, |
| "learning_rate": 2.9308979593476004e-06, |
| "loss": 0.1455, |
| "num_input_tokens_seen": 1311368, |
| "step": 4610 |
| }, |
| { |
| "epoch": 8.610074626865671, |
| "grad_norm": 4.416316986083984, |
| "learning_rate": 2.892769477585397e-06, |
| "loss": 0.101, |
| "num_input_tokens_seen": 1312680, |
| "step": 4615 |
| }, |
| { |
| "epoch": 8.619402985074627, |
| "grad_norm": 3.205669403076172, |
| "learning_rate": 2.8548753966352426e-06, |
| "loss": 0.0302, |
| "num_input_tokens_seen": 1314152, |
| "step": 4620 |
| }, |
| { |
| "epoch": 8.628731343283581, |
| "grad_norm": 0.5664398074150085, |
| "learning_rate": 2.8172161182844075e-06, |
| "loss": 0.0906, |
| "num_input_tokens_seen": 1315496, |
| "step": 4625 |
| }, |
| { |
| "epoch": 8.638059701492537, |
| "grad_norm": 0.44297924637794495, |
| "learning_rate": 2.779792041830537e-06, |
| "loss": 0.0642, |
| "num_input_tokens_seen": 1317192, |
| "step": 4630 |
| }, |
| { |
| "epoch": 8.647388059701493, |
| "grad_norm": 1.3405535221099854, |
| "learning_rate": 2.742603564077478e-06, |
| "loss": 0.0349, |
| "num_input_tokens_seen": 1318632, |
| "step": 4635 |
| }, |
| { |
| "epoch": 8.656716417910447, |
| "grad_norm": 1.4588735103607178, |
| "learning_rate": 2.7056510793310314e-06, |
| "loss": 0.0819, |
| "num_input_tokens_seen": 1320232, |
| "step": 4640 |
| }, |
| { |
| "epoch": 8.666044776119403, |
| "grad_norm": 3.1087090969085693, |
| "learning_rate": 2.6689349793947994e-06, |
| "loss": 0.0577, |
| "num_input_tokens_seen": 1321768, |
| "step": 4645 |
| }, |
| { |
| "epoch": 8.675373134328359, |
| "grad_norm": 0.8451427817344666, |
| "learning_rate": 2.6324556535660045e-06, |
| "loss": 0.0422, |
| "num_input_tokens_seen": 1323272, |
| "step": 4650 |
| }, |
| { |
| "epoch": 8.684701492537313, |
| "grad_norm": 6.077998638153076, |
| "learning_rate": 2.5962134886313936e-06, |
| "loss": 0.046, |
| "num_input_tokens_seen": 1324680, |
| "step": 4655 |
| }, |
| { |
| "epoch": 8.694029850746269, |
| "grad_norm": 5.307253360748291, |
| "learning_rate": 2.5602088688631148e-06, |
| "loss": 0.058, |
| "num_input_tokens_seen": 1325992, |
| "step": 4660 |
| }, |
| { |
| "epoch": 8.703358208955224, |
| "grad_norm": 0.48778557777404785, |
| "learning_rate": 2.5244421760146355e-06, |
| "loss": 0.096, |
| "num_input_tokens_seen": 1327400, |
| "step": 4665 |
| }, |
| { |
| "epoch": 8.712686567164178, |
| "grad_norm": 0.23075968027114868, |
| "learning_rate": 2.488913789316724e-06, |
| "loss": 0.0656, |
| "num_input_tokens_seen": 1328744, |
| "step": 4670 |
| }, |
| { |
| "epoch": 8.722014925373134, |
| "grad_norm": 0.8861811757087708, |
| "learning_rate": 2.4536240854733967e-06, |
| "loss": 0.0934, |
| "num_input_tokens_seen": 1330280, |
| "step": 4675 |
| }, |
| { |
| "epoch": 8.73134328358209, |
| "grad_norm": 0.8622788190841675, |
| "learning_rate": 2.4185734386579506e-06, |
| "loss": 0.0772, |
| "num_input_tokens_seen": 1331784, |
| "step": 4680 |
| }, |
| { |
| "epoch": 8.740671641791044, |
| "grad_norm": 0.8652991056442261, |
| "learning_rate": 2.383762220508984e-06, |
| "loss": 0.0419, |
| "num_input_tokens_seen": 1333128, |
| "step": 4685 |
| }, |
| { |
| "epoch": 8.75, |
| "grad_norm": 1.206106424331665, |
| "learning_rate": 2.349190800126444e-06, |
| "loss": 0.0313, |
| "num_input_tokens_seen": 1334696, |
| "step": 4690 |
| }, |
| { |
| "epoch": 8.759328358208956, |
| "grad_norm": 6.369112968444824, |
| "learning_rate": 2.3148595440677405e-06, |
| "loss": 0.0279, |
| "num_input_tokens_seen": 1336264, |
| "step": 4695 |
| }, |
| { |
| "epoch": 8.76865671641791, |
| "grad_norm": 1.2947605848312378, |
| "learning_rate": 2.280768816343834e-06, |
| "loss": 0.08, |
| "num_input_tokens_seen": 1337608, |
| "step": 4700 |
| }, |
| { |
| "epoch": 8.777985074626866, |
| "grad_norm": 2.03328013420105, |
| "learning_rate": 2.246918978415394e-06, |
| "loss": 0.0522, |
| "num_input_tokens_seen": 1339144, |
| "step": 4705 |
| }, |
| { |
| "epoch": 8.787313432835822, |
| "grad_norm": 1.3140164613723755, |
| "learning_rate": 2.2133103891889438e-06, |
| "loss": 0.0668, |
| "num_input_tokens_seen": 1340488, |
| "step": 4710 |
| }, |
| { |
| "epoch": 8.796641791044776, |
| "grad_norm": 3.0681769847869873, |
| "learning_rate": 2.1799434050131018e-06, |
| "loss": 0.0935, |
| "num_input_tokens_seen": 1341960, |
| "step": 4715 |
| }, |
| { |
| "epoch": 8.805970149253731, |
| "grad_norm": 0.45719027519226074, |
| "learning_rate": 2.1468183796747365e-06, |
| "loss": 0.1279, |
| "num_input_tokens_seen": 1343624, |
| "step": 4720 |
| }, |
| { |
| "epoch": 8.815298507462687, |
| "grad_norm": 0.30700984597206116, |
| "learning_rate": 2.1139356643952666e-06, |
| "loss": 0.0479, |
| "num_input_tokens_seen": 1344936, |
| "step": 4725 |
| }, |
| { |
| "epoch": 8.824626865671641, |
| "grad_norm": 7.035800933837891, |
| "learning_rate": 2.0812956078269275e-06, |
| "loss": 0.169, |
| "num_input_tokens_seen": 1346408, |
| "step": 4730 |
| }, |
| { |
| "epoch": 8.833955223880597, |
| "grad_norm": 6.293770790100098, |
| "learning_rate": 2.0488985560490477e-06, |
| "loss": 0.0542, |
| "num_input_tokens_seen": 1347784, |
| "step": 4735 |
| }, |
| { |
| "epoch": 8.843283582089553, |
| "grad_norm": 1.7941235303878784, |
| "learning_rate": 2.01674485256442e-06, |
| "loss": 0.0438, |
| "num_input_tokens_seen": 1349128, |
| "step": 4740 |
| }, |
| { |
| "epoch": 8.852611940298507, |
| "grad_norm": 5.885344982147217, |
| "learning_rate": 1.9848348382956293e-06, |
| "loss": 0.0847, |
| "num_input_tokens_seen": 1350312, |
| "step": 4745 |
| }, |
| { |
| "epoch": 8.861940298507463, |
| "grad_norm": 5.088900566101074, |
| "learning_rate": 1.953168851581452e-06, |
| "loss": 0.0228, |
| "num_input_tokens_seen": 1351784, |
| "step": 4750 |
| }, |
| { |
| "epoch": 8.871268656716419, |
| "grad_norm": 4.313127040863037, |
| "learning_rate": 1.921747228173254e-06, |
| "loss": 0.039, |
| "num_input_tokens_seen": 1353416, |
| "step": 4755 |
| }, |
| { |
| "epoch": 8.880597014925373, |
| "grad_norm": 4.173219203948975, |
| "learning_rate": 1.8905703012314563e-06, |
| "loss": 0.0546, |
| "num_input_tokens_seen": 1354760, |
| "step": 4760 |
| }, |
| { |
| "epoch": 8.889925373134329, |
| "grad_norm": 1.8911163806915283, |
| "learning_rate": 1.8596384013219725e-06, |
| "loss": 0.0127, |
| "num_input_tokens_seen": 1355976, |
| "step": 4765 |
| }, |
| { |
| "epoch": 8.899253731343283, |
| "grad_norm": 2.033926010131836, |
| "learning_rate": 1.8289518564127223e-06, |
| "loss": 0.0129, |
| "num_input_tokens_seen": 1357416, |
| "step": 4770 |
| }, |
| { |
| "epoch": 8.908582089552239, |
| "grad_norm": 4.268761157989502, |
| "learning_rate": 1.7985109918701643e-06, |
| "loss": 0.0522, |
| "num_input_tokens_seen": 1359080, |
| "step": 4775 |
| }, |
| { |
| "epoch": 8.917910447761194, |
| "grad_norm": 4.991677761077881, |
| "learning_rate": 1.7683161304558076e-06, |
| "loss": 0.0884, |
| "num_input_tokens_seen": 1360552, |
| "step": 4780 |
| }, |
| { |
| "epoch": 8.927238805970148, |
| "grad_norm": 5.955003261566162, |
| "learning_rate": 1.738367592322837e-06, |
| "loss": 0.0635, |
| "num_input_tokens_seen": 1361960, |
| "step": 4785 |
| }, |
| { |
| "epoch": 8.936567164179104, |
| "grad_norm": 2.555006742477417, |
| "learning_rate": 1.708665695012676e-06, |
| "loss": 0.1217, |
| "num_input_tokens_seen": 1363368, |
| "step": 4790 |
| }, |
| { |
| "epoch": 8.94589552238806, |
| "grad_norm": 0.8509443402290344, |
| "learning_rate": 1.6792107534516571e-06, |
| "loss": 0.0507, |
| "num_input_tokens_seen": 1364840, |
| "step": 4795 |
| }, |
| { |
| "epoch": 8.955223880597014, |
| "grad_norm": 1.2004653215408325, |
| "learning_rate": 1.6500030799476513e-06, |
| "loss": 0.0284, |
| "num_input_tokens_seen": 1366152, |
| "step": 4800 |
| }, |
| { |
| "epoch": 8.96455223880597, |
| "grad_norm": 0.9058286547660828, |
| "learning_rate": 1.621042984186777e-06, |
| "loss": 0.0362, |
| "num_input_tokens_seen": 1367528, |
| "step": 4805 |
| }, |
| { |
| "epoch": 8.973880597014926, |
| "grad_norm": 2.2216506004333496, |
| "learning_rate": 1.5923307732301136e-06, |
| "loss": 0.0165, |
| "num_input_tokens_seen": 1368968, |
| "step": 4810 |
| }, |
| { |
| "epoch": 8.98320895522388, |
| "grad_norm": 1.0829942226409912, |
| "learning_rate": 1.5638667515104288e-06, |
| "loss": 0.0831, |
| "num_input_tokens_seen": 1370248, |
| "step": 4815 |
| }, |
| { |
| "epoch": 8.992537313432836, |
| "grad_norm": 1.8271247148513794, |
| "learning_rate": 1.5356512208289846e-06, |
| "loss": 0.1189, |
| "num_input_tokens_seen": 1371496, |
| "step": 4820 |
| }, |
| { |
| "epoch": 9.0, |
| "eval_loss": 1.0934998989105225, |
| "eval_runtime": 2.9047, |
| "eval_samples_per_second": 81.936, |
| "eval_steps_per_second": 20.656, |
| "num_input_tokens_seen": 1372560, |
| "step": 4824 |
| }, |
| { |
| "epoch": 9.001865671641792, |
| "grad_norm": 3.900106430053711, |
| "learning_rate": 1.5076844803522922e-06, |
| "loss": 0.0342, |
| "num_input_tokens_seen": 1372784, |
| "step": 4825 |
| }, |
| { |
| "epoch": 9.011194029850746, |
| "grad_norm": 1.2447712421417236, |
| "learning_rate": 1.4799668266089834e-06, |
| "loss": 0.0629, |
| "num_input_tokens_seen": 1374160, |
| "step": 4830 |
| }, |
| { |
| "epoch": 9.020522388059701, |
| "grad_norm": 11.13252067565918, |
| "learning_rate": 1.452498553486642e-06, |
| "loss": 0.0588, |
| "num_input_tokens_seen": 1375472, |
| "step": 4835 |
| }, |
| { |
| "epoch": 9.029850746268657, |
| "grad_norm": 3.7272777557373047, |
| "learning_rate": 1.4252799522286892e-06, |
| "loss": 0.0933, |
| "num_input_tokens_seen": 1376976, |
| "step": 4840 |
| }, |
| { |
| "epoch": 9.039179104477611, |
| "grad_norm": 7.738912105560303, |
| "learning_rate": 1.398311311431308e-06, |
| "loss": 0.0386, |
| "num_input_tokens_seen": 1378256, |
| "step": 4845 |
| }, |
| { |
| "epoch": 9.048507462686567, |
| "grad_norm": 0.2231058031320572, |
| "learning_rate": 1.3715929170403623e-06, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 1379856, |
| "step": 4850 |
| }, |
| { |
| "epoch": 9.057835820895523, |
| "grad_norm": 0.46430137753486633, |
| "learning_rate": 1.3451250523483976e-06, |
| "loss": 0.0174, |
| "num_input_tokens_seen": 1381584, |
| "step": 4855 |
| }, |
| { |
| "epoch": 9.067164179104477, |
| "grad_norm": 0.7592486143112183, |
| "learning_rate": 1.3189079979915864e-06, |
| "loss": 0.0612, |
| "num_input_tokens_seen": 1382896, |
| "step": 4860 |
| }, |
| { |
| "epoch": 9.076492537313433, |
| "grad_norm": 3.734837293624878, |
| "learning_rate": 1.2929420319468255e-06, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 1384240, |
| "step": 4865 |
| }, |
| { |
| "epoch": 9.085820895522389, |
| "grad_norm": 2.6872589588165283, |
| "learning_rate": 1.2672274295287057e-06, |
| "loss": 0.0602, |
| "num_input_tokens_seen": 1385808, |
| "step": 4870 |
| }, |
| { |
| "epoch": 9.095149253731343, |
| "grad_norm": 3.77540922164917, |
| "learning_rate": 1.2417644633866632e-06, |
| "loss": 0.0118, |
| "num_input_tokens_seen": 1387312, |
| "step": 4875 |
| }, |
| { |
| "epoch": 9.104477611940299, |
| "grad_norm": 5.3559770584106445, |
| "learning_rate": 1.2165534035020409e-06, |
| "loss": 0.0889, |
| "num_input_tokens_seen": 1388688, |
| "step": 4880 |
| }, |
| { |
| "epoch": 9.113805970149254, |
| "grad_norm": 1.3853775262832642, |
| "learning_rate": 1.1915945171852572e-06, |
| "loss": 0.0509, |
| "num_input_tokens_seen": 1390000, |
| "step": 4885 |
| }, |
| { |
| "epoch": 9.123134328358208, |
| "grad_norm": 0.5135497450828552, |
| "learning_rate": 1.1668880690729467e-06, |
| "loss": 0.1112, |
| "num_input_tokens_seen": 1391408, |
| "step": 4890 |
| }, |
| { |
| "epoch": 9.132462686567164, |
| "grad_norm": 0.50111323595047, |
| "learning_rate": 1.142434321125177e-06, |
| "loss": 0.0332, |
| "num_input_tokens_seen": 1392848, |
| "step": 4895 |
| }, |
| { |
| "epoch": 9.14179104477612, |
| "grad_norm": 0.5052019357681274, |
| "learning_rate": 1.1182335326226533e-06, |
| "loss": 0.0618, |
| "num_input_tokens_seen": 1394320, |
| "step": 4900 |
| }, |
| { |
| "epoch": 9.151119402985074, |
| "grad_norm": 1.7275025844573975, |
| "learning_rate": 1.0942859601639794e-06, |
| "loss": 0.0221, |
| "num_input_tokens_seen": 1395504, |
| "step": 4905 |
| }, |
| { |
| "epoch": 9.16044776119403, |
| "grad_norm": 2.4492604732513428, |
| "learning_rate": 1.0705918576629364e-06, |
| "loss": 0.0678, |
| "num_input_tokens_seen": 1396880, |
| "step": 4910 |
| }, |
| { |
| "epoch": 9.169776119402986, |
| "grad_norm": 4.030468940734863, |
| "learning_rate": 1.0471514763457814e-06, |
| "loss": 0.0707, |
| "num_input_tokens_seen": 1398064, |
| "step": 4915 |
| }, |
| { |
| "epoch": 9.17910447761194, |
| "grad_norm": 1.5302867889404297, |
| "learning_rate": 1.023965064748597e-06, |
| "loss": 0.085, |
| "num_input_tokens_seen": 1399504, |
| "step": 4920 |
| }, |
| { |
| "epoch": 9.188432835820896, |
| "grad_norm": 3.870030164718628, |
| "learning_rate": 1.0010328687146464e-06, |
| "loss": 0.0628, |
| "num_input_tokens_seen": 1400848, |
| "step": 4925 |
| }, |
| { |
| "epoch": 9.197761194029852, |
| "grad_norm": 2.3980712890625, |
| "learning_rate": 9.783551313917699e-07, |
| "loss": 0.0371, |
| "num_input_tokens_seen": 1402128, |
| "step": 4930 |
| }, |
| { |
| "epoch": 9.207089552238806, |
| "grad_norm": 1.7022837400436401, |
| "learning_rate": 9.559320932298111e-07, |
| "loss": 0.0495, |
| "num_input_tokens_seen": 1403632, |
| "step": 4935 |
| }, |
| { |
| "epoch": 9.216417910447761, |
| "grad_norm": 4.417207717895508, |
| "learning_rate": 9.337639919780539e-07, |
| "loss": 0.0689, |
| "num_input_tokens_seen": 1405072, |
| "step": 4940 |
| }, |
| { |
| "epoch": 9.225746268656717, |
| "grad_norm": 0.48929741978645325, |
| "learning_rate": 9.118510626827198e-07, |
| "loss": 0.0672, |
| "num_input_tokens_seen": 1406480, |
| "step": 4945 |
| }, |
| { |
| "epoch": 9.235074626865671, |
| "grad_norm": 2.055109977722168, |
| "learning_rate": 8.901935376844611e-07, |
| "loss": 0.0545, |
| "num_input_tokens_seen": 1407792, |
| "step": 4950 |
| }, |
| { |
| "epoch": 9.244402985074627, |
| "grad_norm": 1.1414642333984375, |
| "learning_rate": 8.687916466159157e-07, |
| "loss": 0.0825, |
| "num_input_tokens_seen": 1409072, |
| "step": 4955 |
| }, |
| { |
| "epoch": 9.253731343283581, |
| "grad_norm": 0.5905304551124573, |
| "learning_rate": 8.476456163992397e-07, |
| "loss": 0.0579, |
| "num_input_tokens_seen": 1410576, |
| "step": 4960 |
| }, |
| { |
| "epoch": 9.263059701492537, |
| "grad_norm": 3.927872896194458, |
| "learning_rate": 8.267556712437341e-07, |
| "loss": 0.064, |
| "num_input_tokens_seen": 1411856, |
| "step": 4965 |
| }, |
| { |
| "epoch": 9.272388059701493, |
| "grad_norm": 4.378158092498779, |
| "learning_rate": 8.061220326434582e-07, |
| "loss": 0.1059, |
| "num_input_tokens_seen": 1413200, |
| "step": 4970 |
| }, |
| { |
| "epoch": 9.281716417910447, |
| "grad_norm": 1.4743947982788086, |
| "learning_rate": 7.857449193748645e-07, |
| "loss": 0.03, |
| "num_input_tokens_seen": 1414576, |
| "step": 4975 |
| }, |
| { |
| "epoch": 9.291044776119403, |
| "grad_norm": 5.256697654724121, |
| "learning_rate": 7.656245474945034e-07, |
| "loss": 0.0289, |
| "num_input_tokens_seen": 1415952, |
| "step": 4980 |
| }, |
| { |
| "epoch": 9.300373134328359, |
| "grad_norm": 3.6384172439575195, |
| "learning_rate": 7.457611303367196e-07, |
| "loss": 0.0882, |
| "num_input_tokens_seen": 1417136, |
| "step": 4985 |
| }, |
| { |
| "epoch": 9.309701492537313, |
| "grad_norm": 0.5115447640419006, |
| "learning_rate": 7.261548785113925e-07, |
| "loss": 0.035, |
| "num_input_tokens_seen": 1418736, |
| "step": 4990 |
| }, |
| { |
| "epoch": 9.319029850746269, |
| "grad_norm": 7.139561176300049, |
| "learning_rate": 7.068059999016969e-07, |
| "loss": 0.0518, |
| "num_input_tokens_seen": 1420144, |
| "step": 4995 |
| }, |
| { |
| "epoch": 9.328358208955224, |
| "grad_norm": 1.7403579950332642, |
| "learning_rate": 6.877146996619122e-07, |
| "loss": 0.0781, |
| "num_input_tokens_seen": 1421552, |
| "step": 5000 |
| }, |
| { |
| "epoch": 9.337686567164178, |
| "grad_norm": 9.996808052062988, |
| "learning_rate": 6.688811802152279e-07, |
| "loss": 0.0536, |
| "num_input_tokens_seen": 1423024, |
| "step": 5005 |
| }, |
| { |
| "epoch": 9.347014925373134, |
| "grad_norm": 5.569436073303223, |
| "learning_rate": 6.503056412516223e-07, |
| "loss": 0.034, |
| "num_input_tokens_seen": 1424208, |
| "step": 5010 |
| }, |
| { |
| "epoch": 9.35634328358209, |
| "grad_norm": 5.359943866729736, |
| "learning_rate": 6.31988279725726e-07, |
| "loss": 0.0423, |
| "num_input_tokens_seen": 1425648, |
| "step": 5015 |
| }, |
| { |
| "epoch": 9.365671641791044, |
| "grad_norm": 1.2342942953109741, |
| "learning_rate": 6.139292898547366e-07, |
| "loss": 0.0291, |
| "num_input_tokens_seen": 1426896, |
| "step": 5020 |
| }, |
| { |
| "epoch": 9.375, |
| "grad_norm": 4.85499382019043, |
| "learning_rate": 5.961288631163687e-07, |
| "loss": 0.0374, |
| "num_input_tokens_seen": 1428336, |
| "step": 5025 |
| }, |
| { |
| "epoch": 9.384328358208956, |
| "grad_norm": 7.676522254943848, |
| "learning_rate": 5.785871882468069e-07, |
| "loss": 0.0771, |
| "num_input_tokens_seen": 1429712, |
| "step": 5030 |
| }, |
| { |
| "epoch": 9.39365671641791, |
| "grad_norm": 0.8172091245651245, |
| "learning_rate": 5.613044512387283e-07, |
| "loss": 0.0179, |
| "num_input_tokens_seen": 1430928, |
| "step": 5035 |
| }, |
| { |
| "epoch": 9.402985074626866, |
| "grad_norm": 5.501680374145508, |
| "learning_rate": 5.442808353393059e-07, |
| "loss": 0.0209, |
| "num_input_tokens_seen": 1432272, |
| "step": 5040 |
| }, |
| { |
| "epoch": 9.412313432835822, |
| "grad_norm": 6.81107759475708, |
| "learning_rate": 5.275165210482824e-07, |
| "loss": 0.0866, |
| "num_input_tokens_seen": 1433616, |
| "step": 5045 |
| }, |
| { |
| "epoch": 9.421641791044776, |
| "grad_norm": 1.5359623432159424, |
| "learning_rate": 5.110116861160502e-07, |
| "loss": 0.0631, |
| "num_input_tokens_seen": 1435216, |
| "step": 5050 |
| }, |
| { |
| "epoch": 9.430970149253731, |
| "grad_norm": 0.578716516494751, |
| "learning_rate": 4.947665055417605e-07, |
| "loss": 0.0134, |
| "num_input_tokens_seen": 1436432, |
| "step": 5055 |
| }, |
| { |
| "epoch": 9.440298507462687, |
| "grad_norm": 2.1612274646759033, |
| "learning_rate": 4.787811515714919e-07, |
| "loss": 0.0471, |
| "num_input_tokens_seen": 1437808, |
| "step": 5060 |
| }, |
| { |
| "epoch": 9.449626865671641, |
| "grad_norm": 1.1400558948516846, |
| "learning_rate": 4.6305579369638475e-07, |
| "loss": 0.0854, |
| "num_input_tokens_seen": 1439152, |
| "step": 5065 |
| }, |
| { |
| "epoch": 9.458955223880597, |
| "grad_norm": 2.896364450454712, |
| "learning_rate": 4.4759059865088494e-07, |
| "loss": 0.0528, |
| "num_input_tokens_seen": 1440752, |
| "step": 5070 |
| }, |
| { |
| "epoch": 9.468283582089553, |
| "grad_norm": 0.8584596514701843, |
| "learning_rate": 4.323857304109419e-07, |
| "loss": 0.0149, |
| "num_input_tokens_seen": 1442128, |
| "step": 5075 |
| }, |
| { |
| "epoch": 9.477611940298507, |
| "grad_norm": 6.571222305297852, |
| "learning_rate": 4.1744135019230193e-07, |
| "loss": 0.0709, |
| "num_input_tokens_seen": 1443728, |
| "step": 5080 |
| }, |
| { |
| "epoch": 9.486940298507463, |
| "grad_norm": 4.060767650604248, |
| "learning_rate": 4.0275761644876787e-07, |
| "loss": 0.0426, |
| "num_input_tokens_seen": 1445136, |
| "step": 5085 |
| }, |
| { |
| "epoch": 9.496268656716419, |
| "grad_norm": 2.0186190605163574, |
| "learning_rate": 3.883346848705338e-07, |
| "loss": 0.064, |
| "num_input_tokens_seen": 1446736, |
| "step": 5090 |
| }, |
| { |
| "epoch": 9.5, |
| "eval_loss": 1.1125332117080688, |
| "eval_runtime": 2.9081, |
| "eval_samples_per_second": 81.84, |
| "eval_steps_per_second": 20.632, |
| "num_input_tokens_seen": 1447376, |
| "step": 5092 |
| }, |
| { |
| "epoch": 9.505597014925373, |
| "grad_norm": 2.9371654987335205, |
| "learning_rate": 3.741727083825475e-07, |
| "loss": 0.0181, |
| "num_input_tokens_seen": 1448112, |
| "step": 5095 |
| }, |
| { |
| "epoch": 9.514925373134329, |
| "grad_norm": 3.3434977531433105, |
| "learning_rate": 3.6027183714285595e-07, |
| "loss": 0.0247, |
| "num_input_tokens_seen": 1449744, |
| "step": 5100 |
| }, |
| { |
| "epoch": 9.524253731343283, |
| "grad_norm": 5.2155022621154785, |
| "learning_rate": 3.4663221854105423e-07, |
| "loss": 0.1933, |
| "num_input_tokens_seen": 1451024, |
| "step": 5105 |
| }, |
| { |
| "epoch": 9.533582089552239, |
| "grad_norm": 5.120900630950928, |
| "learning_rate": 3.332539971966836e-07, |
| "loss": 0.0678, |
| "num_input_tokens_seen": 1452304, |
| "step": 5110 |
| }, |
| { |
| "epoch": 9.542910447761194, |
| "grad_norm": 1.6553268432617188, |
| "learning_rate": 3.201373149577247e-07, |
| "loss": 0.0107, |
| "num_input_tokens_seen": 1453968, |
| "step": 5115 |
| }, |
| { |
| "epoch": 9.552238805970148, |
| "grad_norm": 1.2175383567810059, |
| "learning_rate": 3.0728231089907634e-07, |
| "loss": 0.0232, |
| "num_input_tokens_seen": 1455376, |
| "step": 5120 |
| }, |
| { |
| "epoch": 9.561567164179104, |
| "grad_norm": 1.4662021398544312, |
| "learning_rate": 2.946891213211012e-07, |
| "loss": 0.0155, |
| "num_input_tokens_seen": 1456816, |
| "step": 5125 |
| }, |
| { |
| "epoch": 9.57089552238806, |
| "grad_norm": 1.416324257850647, |
| "learning_rate": 2.823578797481574e-07, |
| "loss": 0.0557, |
| "num_input_tokens_seen": 1458320, |
| "step": 5130 |
| }, |
| { |
| "epoch": 9.580223880597014, |
| "grad_norm": 2.3085410594940186, |
| "learning_rate": 2.7028871692720003e-07, |
| "loss": 0.0216, |
| "num_input_tokens_seen": 1459632, |
| "step": 5135 |
| }, |
| { |
| "epoch": 9.58955223880597, |
| "grad_norm": 1.184879183769226, |
| "learning_rate": 2.5848176082639007e-07, |
| "loss": 0.0153, |
| "num_input_tokens_seen": 1461008, |
| "step": 5140 |
| }, |
| { |
| "epoch": 9.598880597014926, |
| "grad_norm": 4.239015102386475, |
| "learning_rate": 2.4693713663372644e-07, |
| "loss": 0.0264, |
| "num_input_tokens_seen": 1462224, |
| "step": 5145 |
| }, |
| { |
| "epoch": 9.60820895522388, |
| "grad_norm": 1.2997117042541504, |
| "learning_rate": 2.3565496675574118e-07, |
| "loss": 0.0579, |
| "num_input_tokens_seen": 1463664, |
| "step": 5150 |
| }, |
| { |
| "epoch": 9.617537313432836, |
| "grad_norm": 1.3154493570327759, |
| "learning_rate": 2.246353708161758e-07, |
| "loss": 0.0204, |
| "num_input_tokens_seen": 1465168, |
| "step": 5155 |
| }, |
| { |
| "epoch": 9.626865671641792, |
| "grad_norm": 0.9623708724975586, |
| "learning_rate": 2.1387846565474045e-07, |
| "loss": 0.0356, |
| "num_input_tokens_seen": 1466544, |
| "step": 5160 |
| }, |
| { |
| "epoch": 9.636194029850746, |
| "grad_norm": 2.9215333461761475, |
| "learning_rate": 2.0338436532584827e-07, |
| "loss": 0.1265, |
| "num_input_tokens_seen": 1468176, |
| "step": 5165 |
| }, |
| { |
| "epoch": 9.645522388059701, |
| "grad_norm": 1.9559794664382935, |
| "learning_rate": 1.9315318109742465e-07, |
| "loss": 0.1062, |
| "num_input_tokens_seen": 1469680, |
| "step": 5170 |
| }, |
| { |
| "epoch": 9.654850746268657, |
| "grad_norm": 5.172410488128662, |
| "learning_rate": 1.831850214497194e-07, |
| "loss": 0.1171, |
| "num_input_tokens_seen": 1471024, |
| "step": 5175 |
| }, |
| { |
| "epoch": 9.664179104477611, |
| "grad_norm": 2.4927995204925537, |
| "learning_rate": 1.7347999207415478e-07, |
| "loss": 0.0173, |
| "num_input_tokens_seen": 1472464, |
| "step": 5180 |
| }, |
| { |
| "epoch": 9.673507462686567, |
| "grad_norm": 0.5646374821662903, |
| "learning_rate": 1.6403819587221814e-07, |
| "loss": 0.0451, |
| "num_input_tokens_seen": 1473904, |
| "step": 5185 |
| }, |
| { |
| "epoch": 9.682835820895523, |
| "grad_norm": 0.573626697063446, |
| "learning_rate": 1.5485973295434885e-07, |
| "loss": 0.0635, |
| "num_input_tokens_seen": 1475408, |
| "step": 5190 |
| }, |
| { |
| "epoch": 9.692164179104477, |
| "grad_norm": 0.5452666878700256, |
| "learning_rate": 1.4594470063890308e-07, |
| "loss": 0.036, |
| "num_input_tokens_seen": 1476784, |
| "step": 5195 |
| }, |
| { |
| "epoch": 9.701492537313433, |
| "grad_norm": 3.4904301166534424, |
| "learning_rate": 1.3729319345109348e-07, |
| "loss": 0.0753, |
| "num_input_tokens_seen": 1478160, |
| "step": 5200 |
| }, |
| { |
| "epoch": 9.710820895522389, |
| "grad_norm": 5.300086975097656, |
| "learning_rate": 1.2890530312200945e-07, |
| "loss": 0.033, |
| "num_input_tokens_seen": 1479600, |
| "step": 5205 |
| }, |
| { |
| "epoch": 9.720149253731343, |
| "grad_norm": 4.462669372558594, |
| "learning_rate": 1.207811185876373e-07, |
| "loss": 0.0104, |
| "num_input_tokens_seen": 1480944, |
| "step": 5210 |
| }, |
| { |
| "epoch": 9.729477611940299, |
| "grad_norm": 8.35970687866211, |
| "learning_rate": 1.1292072598791114e-07, |
| "loss": 0.0233, |
| "num_input_tokens_seen": 1482448, |
| "step": 5215 |
| }, |
| { |
| "epoch": 9.738805970149254, |
| "grad_norm": 2.1812546253204346, |
| "learning_rate": 1.0532420866581072e-07, |
| "loss": 0.0917, |
| "num_input_tokens_seen": 1483920, |
| "step": 5220 |
| }, |
| { |
| "epoch": 9.748134328358208, |
| "grad_norm": 0.512653648853302, |
| "learning_rate": 9.799164716646769e-08, |
| "loss": 0.0277, |
| "num_input_tokens_seen": 1485232, |
| "step": 5225 |
| }, |
| { |
| "epoch": 9.757462686567164, |
| "grad_norm": 2.9472293853759766, |
| "learning_rate": 9.092311923632191e-08, |
| "loss": 0.0345, |
| "num_input_tokens_seen": 1486640, |
| "step": 5230 |
| }, |
| { |
| "epoch": 9.76679104477612, |
| "grad_norm": 1.5441632270812988, |
| "learning_rate": 8.411869982228038e-08, |
| "loss": 0.0261, |
| "num_input_tokens_seen": 1488080, |
| "step": 5235 |
| }, |
| { |
| "epoch": 9.776119402985074, |
| "grad_norm": 1.129857063293457, |
| "learning_rate": 7.757846107094291e-08, |
| "loss": 0.0113, |
| "num_input_tokens_seen": 1489456, |
| "step": 5240 |
| }, |
| { |
| "epoch": 9.78544776119403, |
| "grad_norm": 2.280777931213379, |
| "learning_rate": 7.130247232782216e-08, |
| "loss": 0.012, |
| "num_input_tokens_seen": 1491024, |
| "step": 5245 |
| }, |
| { |
| "epoch": 9.794776119402986, |
| "grad_norm": 1.9997568130493164, |
| "learning_rate": 6.529080013661648e-08, |
| "loss": 0.0495, |
| "num_input_tokens_seen": 1492432, |
| "step": 5250 |
| }, |
| { |
| "epoch": 9.80410447761194, |
| "grad_norm": 0.8336589336395264, |
| "learning_rate": 5.954350823850208e-08, |
| "loss": 0.0478, |
| "num_input_tokens_seen": 1494064, |
| "step": 5255 |
| }, |
| { |
| "epoch": 9.813432835820896, |
| "grad_norm": 8.102984428405762, |
| "learning_rate": 5.4060657571453064e-08, |
| "loss": 0.0894, |
| "num_input_tokens_seen": 1495536, |
| "step": 5260 |
| }, |
| { |
| "epoch": 9.822761194029852, |
| "grad_norm": 6.9452080726623535, |
| "learning_rate": 4.884230626960307e-08, |
| "loss": 0.0761, |
| "num_input_tokens_seen": 1496848, |
| "step": 5265 |
| }, |
| { |
| "epoch": 9.832089552238806, |
| "grad_norm": 2.261775016784668, |
| "learning_rate": 4.388850966261793e-08, |
| "loss": 0.0502, |
| "num_input_tokens_seen": 1498352, |
| "step": 5270 |
| }, |
| { |
| "epoch": 9.841417910447761, |
| "grad_norm": 0.4817403256893158, |
| "learning_rate": 3.919932027512674e-08, |
| "loss": 0.0313, |
| "num_input_tokens_seen": 1499824, |
| "step": 5275 |
| }, |
| { |
| "epoch": 9.850746268656717, |
| "grad_norm": 4.4961934089660645, |
| "learning_rate": 3.477478782614452e-08, |
| "loss": 0.0669, |
| "num_input_tokens_seen": 1501328, |
| "step": 5280 |
| }, |
| { |
| "epoch": 9.860074626865671, |
| "grad_norm": 1.8667352199554443, |
| "learning_rate": 3.061495922855873e-08, |
| "loss": 0.0758, |
| "num_input_tokens_seen": 1502640, |
| "step": 5285 |
| }, |
| { |
| "epoch": 9.869402985074627, |
| "grad_norm": 4.105433464050293, |
| "learning_rate": 2.67198785886269e-08, |
| "loss": 0.0583, |
| "num_input_tokens_seen": 1504144, |
| "step": 5290 |
| }, |
| { |
| "epoch": 9.878731343283581, |
| "grad_norm": 3.6148746013641357, |
| "learning_rate": 2.3089587205507578e-08, |
| "loss": 0.0424, |
| "num_input_tokens_seen": 1505584, |
| "step": 5295 |
| }, |
| { |
| "epoch": 9.888059701492537, |
| "grad_norm": 1.5156457424163818, |
| "learning_rate": 1.972412357083009e-08, |
| "loss": 0.0236, |
| "num_input_tokens_seen": 1507184, |
| "step": 5300 |
| }, |
| { |
| "epoch": 9.897388059701493, |
| "grad_norm": 2.752258777618408, |
| "learning_rate": 1.662352336827544e-08, |
| "loss": 0.0838, |
| "num_input_tokens_seen": 1508656, |
| "step": 5305 |
| }, |
| { |
| "epoch": 9.906716417910447, |
| "grad_norm": 5.198145389556885, |
| "learning_rate": 1.3787819473207176e-08, |
| "loss": 0.1115, |
| "num_input_tokens_seen": 1510192, |
| "step": 5310 |
| }, |
| { |
| "epoch": 9.916044776119403, |
| "grad_norm": 5.04305362701416, |
| "learning_rate": 1.1217041952313323e-08, |
| "loss": 0.0159, |
| "num_input_tokens_seen": 1511632, |
| "step": 5315 |
| }, |
| { |
| "epoch": 9.925373134328359, |
| "grad_norm": 0.9993813633918762, |
| "learning_rate": 8.91121806330386e-09, |
| "loss": 0.0262, |
| "num_input_tokens_seen": 1513008, |
| "step": 5320 |
| }, |
| { |
| "epoch": 9.934701492537313, |
| "grad_norm": 2.5751473903656006, |
| "learning_rate": 6.870372254602631e-09, |
| "loss": 0.0262, |
| "num_input_tokens_seen": 1514416, |
| "step": 5325 |
| }, |
| { |
| "epoch": 9.944029850746269, |
| "grad_norm": 2.5701522827148438, |
| "learning_rate": 5.09452616509476e-09, |
| "loss": 0.0792, |
| "num_input_tokens_seen": 1515856, |
| "step": 5330 |
| }, |
| { |
| "epoch": 9.953358208955224, |
| "grad_norm": 2.827641487121582, |
| "learning_rate": 3.58369862391017e-09, |
| "loss": 0.0246, |
| "num_input_tokens_seen": 1517328, |
| "step": 5335 |
| }, |
| { |
| "epoch": 9.962686567164178, |
| "grad_norm": 0.461745947599411, |
| "learning_rate": 2.3379056502015327e-09, |
| "loss": 0.0383, |
| "num_input_tokens_seen": 1518800, |
| "step": 5340 |
| }, |
| { |
| "epoch": 9.972014925373134, |
| "grad_norm": 7.578468322753906, |
| "learning_rate": 1.357160452988837e-09, |
| "loss": 0.0562, |
| "num_input_tokens_seen": 1520208, |
| "step": 5345 |
| }, |
| { |
| "epoch": 9.98134328358209, |
| "grad_norm": 0.6003517508506775, |
| "learning_rate": 6.414734310233872e-10, |
| "loss": 0.0526, |
| "num_input_tokens_seen": 1521616, |
| "step": 5350 |
| }, |
| { |
| "epoch": 9.990671641791044, |
| "grad_norm": 3.2531113624572754, |
| "learning_rate": 1.9085217266290312e-10, |
| "loss": 0.036, |
| "num_input_tokens_seen": 1522992, |
| "step": 5355 |
| }, |
| { |
| "epoch": 10.0, |
| "grad_norm": 3.1259007453918457, |
| "learning_rate": 5.3014557993558144e-12, |
| "loss": 0.0079, |
| "num_input_tokens_seen": 1524216, |
| "step": 5360 |
| }, |
| { |
| "epoch": 10.0, |
| "eval_loss": 1.1125046014785767, |
| "eval_runtime": 2.9087, |
| "eval_samples_per_second": 81.824, |
| "eval_steps_per_second": 20.628, |
| "num_input_tokens_seen": 1524216, |
| "step": 5360 |
| }, |
| { |
| "epoch": 10.0, |
| "num_input_tokens_seen": 1524216, |
| "step": 5360, |
| "total_flos": 6.876901111372186e+16, |
| "train_loss": 0.32153739333430775, |
| "train_runtime": 1122.489, |
| "train_samples_per_second": 19.074, |
| "train_steps_per_second": 4.775 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 5360, |
| "num_input_tokens_seen": 1524216, |
| "num_train_epochs": 10, |
| "save_steps": 268, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 6.876901111372186e+16, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|