{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 1642, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0060901339829476245, "grad_norm": 17.5, "learning_rate": 4.9999267881610354e-05, "loss": 6.1919, "num_input_tokens_seen": 655360, "step": 5, "train_runtime": 22.3092, "train_tokens_per_second": 29376.191 }, { "epoch": 0.012180267965895249, "grad_norm": 3.9375, "learning_rate": 4.9996293724142536e-05, "loss": 4.6261, "num_input_tokens_seen": 1310720, "step": 10, "train_runtime": 44.0485, "train_tokens_per_second": 29756.318 }, { "epoch": 0.018270401948842874, "grad_norm": 5.84375, "learning_rate": 4.9991032042166476e-05, "loss": 4.0276, "num_input_tokens_seen": 1966080, "step": 15, "train_runtime": 65.7637, "train_tokens_per_second": 29896.111 }, { "epoch": 0.024360535931790498, "grad_norm": 7.28125, "learning_rate": 4.998348331720263e-05, "loss": 3.5105, "num_input_tokens_seen": 2621440, "step": 20, "train_runtime": 87.5389, "train_tokens_per_second": 29945.994 }, { "epoch": 0.030450669914738125, "grad_norm": 5.46875, "learning_rate": 4.997364824006915e-05, "loss": 2.9731, "num_input_tokens_seen": 3276800, "step": 25, "train_runtime": 109.5185, "train_tokens_per_second": 29920.05 }, { "epoch": 0.03654080389768575, "grad_norm": 4.96875, "learning_rate": 4.996152771081866e-05, "loss": 2.5386, "num_input_tokens_seen": 3932160, "step": 30, "train_runtime": 131.3684, "train_tokens_per_second": 29932.308 }, { "epoch": 0.04263093788063337, "grad_norm": 4.0625, "learning_rate": 4.9947122838655915e-05, "loss": 2.1857, "num_input_tokens_seen": 4587520, "step": 35, "train_runtime": 153.1526, "train_tokens_per_second": 29953.909 }, { "epoch": 0.048721071863580996, "grad_norm": 5.03125, "learning_rate": 4.993043494183627e-05, "loss": 1.9064, "num_input_tokens_seen": 5242880, "step": 40, "train_runtime": 174.9243, "train_tokens_per_second": 29972.285 }, { "epoch": 0.05481120584652863, "grad_norm": 4.0, "learning_rate": 4.9911465547545044e-05, "loss": 1.7284, "num_input_tokens_seen": 5898240, "step": 45, "train_runtime": 196.7433, "train_tokens_per_second": 29979.364 }, { "epoch": 0.06090133982947625, "grad_norm": 3.453125, "learning_rate": 4.989021639175778e-05, "loss": 1.5783, "num_input_tokens_seen": 6553600, "step": 50, "train_runtime": 218.5219, "train_tokens_per_second": 29990.583 }, { "epoch": 0.06699147381242387, "grad_norm": 2.84375, "learning_rate": 4.986668941908136e-05, "loss": 1.4457, "num_input_tokens_seen": 7208960, "step": 55, "train_runtime": 240.2767, "train_tokens_per_second": 30002.749 }, { "epoch": 0.0730816077953715, "grad_norm": 2.6875, "learning_rate": 4.9840886782576024e-05, "loss": 1.3575, "num_input_tokens_seen": 7864320, "step": 60, "train_runtime": 262.1559, "train_tokens_per_second": 29998.643 }, { "epoch": 0.07917174177831912, "grad_norm": 2.390625, "learning_rate": 4.981281084355839e-05, "loss": 1.2639, "num_input_tokens_seen": 8519680, "step": 65, "train_runtime": 283.9622, "train_tokens_per_second": 30002.866 }, { "epoch": 0.08526187576126674, "grad_norm": 2.296875, "learning_rate": 4.97824641713853e-05, "loss": 1.2196, "num_input_tokens_seen": 9175040, "step": 70, "train_runtime": 305.7342, "train_tokens_per_second": 30009.862 }, { "epoch": 0.09135200974421437, "grad_norm": 2.25, "learning_rate": 4.974984954321873e-05, "loss": 1.1541, "num_input_tokens_seen": 9830400, "step": 75, "train_runtime": 327.4637, "train_tokens_per_second": 30019.82 }, { "epoch": 0.09744214372716199, "grad_norm": 2.3125, "learning_rate": 4.971496994377163e-05, "loss": 1.1022, "num_input_tokens_seen": 10485760, "step": 80, "train_runtime": 349.2345, "train_tokens_per_second": 30024.981 }, { "epoch": 0.10353227771010962, "grad_norm": 1.9140625, "learning_rate": 4.967782856503473e-05, "loss": 1.0584, "num_input_tokens_seen": 11141120, "step": 85, "train_runtime": 370.9965, "train_tokens_per_second": 30030.259 }, { "epoch": 0.10962241169305725, "grad_norm": 2.125, "learning_rate": 4.963842880598453e-05, "loss": 1.0431, "num_input_tokens_seen": 11796480, "step": 90, "train_runtime": 392.6965, "train_tokens_per_second": 30039.688 }, { "epoch": 0.11571254567600488, "grad_norm": 1.9375, "learning_rate": 4.9596774272272115e-05, "loss": 0.9951, "num_input_tokens_seen": 12451840, "step": 95, "train_runtime": 414.5171, "train_tokens_per_second": 30039.386 }, { "epoch": 0.1218026796589525, "grad_norm": 1.890625, "learning_rate": 4.955286877589331e-05, "loss": 0.9762, "num_input_tokens_seen": 13107200, "step": 100, "train_runtime": 436.3192, "train_tokens_per_second": 30040.394 }, { "epoch": 0.1278928136419001, "grad_norm": 1.75, "learning_rate": 4.9506716334839756e-05, "loss": 0.9444, "num_input_tokens_seen": 13762560, "step": 105, "train_runtime": 458.1573, "train_tokens_per_second": 30038.943 }, { "epoch": 0.13398294762484775, "grad_norm": 1.9453125, "learning_rate": 4.945832117273118e-05, "loss": 0.9425, "num_input_tokens_seen": 14417920, "step": 110, "train_runtime": 479.9278, "train_tokens_per_second": 30041.851 }, { "epoch": 0.14007308160779536, "grad_norm": 1.734375, "learning_rate": 4.940768771842896e-05, "loss": 0.907, "num_input_tokens_seen": 15073280, "step": 115, "train_runtime": 501.7348, "train_tokens_per_second": 30042.328 }, { "epoch": 0.146163215590743, "grad_norm": 1.8515625, "learning_rate": 4.9354820605630745e-05, "loss": 0.8877, "num_input_tokens_seen": 15728640, "step": 120, "train_runtime": 523.4795, "train_tokens_per_second": 30046.335 }, { "epoch": 0.15225334957369063, "grad_norm": 1.78125, "learning_rate": 4.929972467244645e-05, "loss": 0.9025, "num_input_tokens_seen": 16384000, "step": 125, "train_runtime": 545.2706, "train_tokens_per_second": 30047.468 }, { "epoch": 0.15834348355663824, "grad_norm": 1.6796875, "learning_rate": 4.9242404960955456e-05, "loss": 0.8531, "num_input_tokens_seen": 17039360, "step": 130, "train_runtime": 567.0521, "train_tokens_per_second": 30049.021 }, { "epoch": 0.16443361753958588, "grad_norm": 1.703125, "learning_rate": 4.918286671674523e-05, "loss": 0.8443, "num_input_tokens_seen": 17694720, "step": 135, "train_runtime": 588.838, "train_tokens_per_second": 30050.236 }, { "epoch": 0.1705237515225335, "grad_norm": 1.6953125, "learning_rate": 4.912111538843124e-05, "loss": 0.8392, "num_input_tokens_seen": 18350080, "step": 140, "train_runtime": 610.6656, "train_tokens_per_second": 30049.309 }, { "epoch": 0.17661388550548113, "grad_norm": 1.65625, "learning_rate": 4.905715662715835e-05, "loss": 0.8256, "num_input_tokens_seen": 19005440, "step": 145, "train_runtime": 632.4494, "train_tokens_per_second": 30050.53 }, { "epoch": 0.18270401948842874, "grad_norm": 1.6953125, "learning_rate": 4.899099628608365e-05, "loss": 0.819, "num_input_tokens_seen": 19660800, "step": 150, "train_runtime": 654.24, "train_tokens_per_second": 30051.355 }, { "epoch": 0.18879415347137637, "grad_norm": 1.546875, "learning_rate": 4.8922640419840826e-05, "loss": 0.8083, "num_input_tokens_seen": 20316160, "step": 155, "train_runtime": 675.9786, "train_tokens_per_second": 30054.443 }, { "epoch": 0.19488428745432398, "grad_norm": 1.6484375, "learning_rate": 4.885209528398603e-05, "loss": 0.7974, "num_input_tokens_seen": 20971520, "step": 160, "train_runtime": 697.7598, "train_tokens_per_second": 30055.5 }, { "epoch": 0.20097442143727162, "grad_norm": 1.7109375, "learning_rate": 4.8779367334425466e-05, "loss": 0.7856, "num_input_tokens_seen": 21626880, "step": 165, "train_runtime": 719.5124, "train_tokens_per_second": 30057.687 }, { "epoch": 0.20706455542021923, "grad_norm": 1.5078125, "learning_rate": 4.87044632268245e-05, "loss": 0.7741, "num_input_tokens_seen": 22282240, "step": 170, "train_runtime": 741.2635, "train_tokens_per_second": 30059.812 }, { "epoch": 0.21315468940316687, "grad_norm": 1.6171875, "learning_rate": 4.8627389815998654e-05, "loss": 0.762, "num_input_tokens_seen": 22937600, "step": 175, "train_runtime": 763.0585, "train_tokens_per_second": 30060.082 }, { "epoch": 0.2192448233861145, "grad_norm": 1.65625, "learning_rate": 4.854815415528624e-05, "loss": 0.7554, "num_input_tokens_seen": 23592960, "step": 180, "train_runtime": 784.8921, "train_tokens_per_second": 30058.857 }, { "epoch": 0.22533495736906212, "grad_norm": 1.5859375, "learning_rate": 4.8466763495902886e-05, "loss": 0.7566, "num_input_tokens_seen": 24248320, "step": 185, "train_runtime": 806.6931, "train_tokens_per_second": 30058.917 }, { "epoch": 0.23142509135200975, "grad_norm": 1.609375, "learning_rate": 4.838322528627796e-05, "loss": 0.7454, "num_input_tokens_seen": 24903680, "step": 190, "train_runtime": 828.4828, "train_tokens_per_second": 30059.381 }, { "epoch": 0.23751522533495736, "grad_norm": 1.546875, "learning_rate": 4.829754717137291e-05, "loss": 0.7475, "num_input_tokens_seen": 25559040, "step": 195, "train_runtime": 850.3012, "train_tokens_per_second": 30058.808 }, { "epoch": 0.243605359317905, "grad_norm": 1.5390625, "learning_rate": 4.820973699198164e-05, "loss": 0.7259, "num_input_tokens_seen": 26214400, "step": 200, "train_runtime": 872.1297, "train_tokens_per_second": 30057.916 }, { "epoch": 0.2496954933008526, "grad_norm": 1.4765625, "learning_rate": 4.811980278401299e-05, "loss": 0.7284, "num_input_tokens_seen": 26869760, "step": 205, "train_runtime": 893.9692, "train_tokens_per_second": 30056.694 }, { "epoch": 0.2557856272838002, "grad_norm": 1.4921875, "learning_rate": 4.802775277775529e-05, "loss": 0.7169, "num_input_tokens_seen": 27525120, "step": 210, "train_runtime": 915.7965, "train_tokens_per_second": 30055.935 }, { "epoch": 0.2618757612667479, "grad_norm": 1.5, "learning_rate": 4.793359539712322e-05, "loss": 0.7164, "num_input_tokens_seen": 28180480, "step": 215, "train_runtime": 937.6276, "train_tokens_per_second": 30055.088 }, { "epoch": 0.2679658952496955, "grad_norm": 1.3984375, "learning_rate": 4.783733925888685e-05, "loss": 0.7133, "num_input_tokens_seen": 28835840, "step": 220, "train_runtime": 959.4192, "train_tokens_per_second": 30055.516 }, { "epoch": 0.2740560292326431, "grad_norm": 1.5078125, "learning_rate": 4.773899317188311e-05, "loss": 0.7116, "num_input_tokens_seen": 29491200, "step": 225, "train_runtime": 981.209, "train_tokens_per_second": 30055.981 }, { "epoch": 0.2801461632155907, "grad_norm": 1.4140625, "learning_rate": 4.763856613620965e-05, "loss": 0.7029, "num_input_tokens_seen": 30146560, "step": 230, "train_runtime": 1002.9822, "train_tokens_per_second": 30056.926 }, { "epoch": 0.2862362971985384, "grad_norm": 1.4375, "learning_rate": 4.7536067342401194e-05, "loss": 0.6875, "num_input_tokens_seen": 30801920, "step": 235, "train_runtime": 1024.7415, "train_tokens_per_second": 30058.234 }, { "epoch": 0.292326431181486, "grad_norm": 1.625, "learning_rate": 4.7431506170588456e-05, "loss": 0.6949, "num_input_tokens_seen": 31457280, "step": 240, "train_runtime": 1046.5332, "train_tokens_per_second": 30058.558 }, { "epoch": 0.2984165651644336, "grad_norm": 1.5078125, "learning_rate": 4.732489218963978e-05, "loss": 0.6828, "num_input_tokens_seen": 32112640, "step": 245, "train_runtime": 1068.248, "train_tokens_per_second": 30061.035 }, { "epoch": 0.30450669914738127, "grad_norm": 1.4453125, "learning_rate": 4.721623515628537e-05, "loss": 0.6958, "num_input_tokens_seen": 32768000, "step": 250, "train_runtime": 1090.0205, "train_tokens_per_second": 30061.819 }, { "epoch": 0.3105968331303289, "grad_norm": 1.3671875, "learning_rate": 4.710554501422447e-05, "loss": 0.6947, "num_input_tokens_seen": 33423360, "step": 255, "train_runtime": 1111.7983, "train_tokens_per_second": 30062.432 }, { "epoch": 0.3166869671132765, "grad_norm": 1.546875, "learning_rate": 4.6992831893215325e-05, "loss": 0.6836, "num_input_tokens_seen": 34078720, "step": 260, "train_runtime": 1133.6327, "train_tokens_per_second": 30061.518 }, { "epoch": 0.3227771010962241, "grad_norm": 1.390625, "learning_rate": 4.6878106108148215e-05, "loss": 0.6701, "num_input_tokens_seen": 34734080, "step": 265, "train_runtime": 1155.4501, "train_tokens_per_second": 30061.081 }, { "epoch": 0.32886723507917176, "grad_norm": 1.3203125, "learning_rate": 4.676137815810142e-05, "loss": 0.6729, "num_input_tokens_seen": 35389440, "step": 270, "train_runtime": 1177.285, "train_tokens_per_second": 30060.214 }, { "epoch": 0.33495736906211937, "grad_norm": 1.3984375, "learning_rate": 4.664265872538048e-05, "loss": 0.6687, "num_input_tokens_seen": 36044800, "step": 275, "train_runtime": 1199.1576, "train_tokens_per_second": 30058.434 }, { "epoch": 0.341047503045067, "grad_norm": 1.4765625, "learning_rate": 4.6521958674540554e-05, "loss": 0.669, "num_input_tokens_seen": 36700160, "step": 280, "train_runtime": 1220.9936, "train_tokens_per_second": 30057.62 }, { "epoch": 0.3471376370280146, "grad_norm": 1.375, "learning_rate": 4.639928905139216e-05, "loss": 0.6637, "num_input_tokens_seen": 37355520, "step": 285, "train_runtime": 1242.9648, "train_tokens_per_second": 30053.563 }, { "epoch": 0.35322777101096225, "grad_norm": 1.3359375, "learning_rate": 4.627466108199037e-05, "loss": 0.659, "num_input_tokens_seen": 38010880, "step": 290, "train_runtime": 1264.7781, "train_tokens_per_second": 30053.398 }, { "epoch": 0.35931790499390986, "grad_norm": 1.3671875, "learning_rate": 4.614808617160737e-05, "loss": 0.6573, "num_input_tokens_seen": 38666240, "step": 295, "train_runtime": 1286.5591, "train_tokens_per_second": 30053.995 }, { "epoch": 0.3654080389768575, "grad_norm": 1.359375, "learning_rate": 4.601957590368884e-05, "loss": 0.6545, "num_input_tokens_seen": 39321600, "step": 300, "train_runtime": 1308.3044, "train_tokens_per_second": 30055.39 }, { "epoch": 0.37149817295980514, "grad_norm": 1.3046875, "learning_rate": 4.5889142038793766e-05, "loss": 0.6364, "num_input_tokens_seen": 39976960, "step": 305, "train_runtime": 1330.1444, "train_tokens_per_second": 30054.601 }, { "epoch": 0.37758830694275275, "grad_norm": 1.3125, "learning_rate": 4.5756796513518276e-05, "loss": 0.6487, "num_input_tokens_seen": 40632320, "step": 310, "train_runtime": 1351.8973, "train_tokens_per_second": 30055.773 }, { "epoch": 0.38367844092570036, "grad_norm": 1.34375, "learning_rate": 4.5622551439403226e-05, "loss": 0.6375, "num_input_tokens_seen": 41287680, "step": 315, "train_runtime": 1373.641, "train_tokens_per_second": 30057.111 }, { "epoch": 0.38976857490864797, "grad_norm": 1.4609375, "learning_rate": 4.548641910182582e-05, "loss": 0.6449, "num_input_tokens_seen": 41943040, "step": 320, "train_runtime": 1395.3852, "train_tokens_per_second": 30058.395 }, { "epoch": 0.39585870889159563, "grad_norm": 1.3203125, "learning_rate": 4.534841195887531e-05, "loss": 0.6377, "num_input_tokens_seen": 42598400, "step": 325, "train_runtime": 1417.1617, "train_tokens_per_second": 30058.955 }, { "epoch": 0.40194884287454324, "grad_norm": 1.3671875, "learning_rate": 4.520854264021296e-05, "loss": 0.6312, "num_input_tokens_seen": 43253760, "step": 330, "train_runtime": 1439.0046, "train_tokens_per_second": 30058.111 }, { "epoch": 0.40803897685749085, "grad_norm": 1.2890625, "learning_rate": 4.506682394591614e-05, "loss": 0.625, "num_input_tokens_seen": 43909120, "step": 335, "train_runtime": 1460.7409, "train_tokens_per_second": 30059.485 }, { "epoch": 0.41412911084043846, "grad_norm": 1.375, "learning_rate": 4.492326884530705e-05, "loss": 0.6168, "num_input_tokens_seen": 44564480, "step": 340, "train_runtime": 1482.5072, "train_tokens_per_second": 30060.212 }, { "epoch": 0.42021924482338613, "grad_norm": 1.2890625, "learning_rate": 4.477789047576574e-05, "loss": 0.6228, "num_input_tokens_seen": 45219840, "step": 345, "train_runtime": 1504.2816, "train_tokens_per_second": 30060.754 }, { "epoch": 0.42630937880633374, "grad_norm": 1.390625, "learning_rate": 4.463070214152791e-05, "loss": 0.62, "num_input_tokens_seen": 45875200, "step": 350, "train_runtime": 1526.0478, "train_tokens_per_second": 30061.443 }, { "epoch": 0.43239951278928135, "grad_norm": 1.359375, "learning_rate": 4.448171731246736e-05, "loss": 0.625, "num_input_tokens_seen": 46530560, "step": 355, "train_runtime": 1547.7838, "train_tokens_per_second": 30062.7 }, { "epoch": 0.438489646772229, "grad_norm": 1.53125, "learning_rate": 4.4330949622863306e-05, "loss": 0.6146, "num_input_tokens_seen": 47185920, "step": 360, "train_runtime": 1569.5543, "train_tokens_per_second": 30063.261 }, { "epoch": 0.4445797807551766, "grad_norm": 1.2265625, "learning_rate": 4.417841287015263e-05, "loss": 0.6044, "num_input_tokens_seen": 47841280, "step": 365, "train_runtime": 1591.3459, "train_tokens_per_second": 30063.407 }, { "epoch": 0.45066991473812423, "grad_norm": 1.2578125, "learning_rate": 4.402412101366722e-05, "loss": 0.6129, "num_input_tokens_seen": 48496640, "step": 370, "train_runtime": 1613.1368, "train_tokens_per_second": 30063.563 }, { "epoch": 0.45676004872107184, "grad_norm": 1.75, "learning_rate": 4.38680881733565e-05, "loss": 0.6078, "num_input_tokens_seen": 49152000, "step": 375, "train_runtime": 1634.9595, "train_tokens_per_second": 30063.131 }, { "epoch": 0.4628501827040195, "grad_norm": 1.34375, "learning_rate": 4.371032862849525e-05, "loss": 0.606, "num_input_tokens_seen": 49807360, "step": 380, "train_runtime": 1656.7683, "train_tokens_per_second": 30062.96 }, { "epoch": 0.4689403166869671, "grad_norm": 1.453125, "learning_rate": 4.3550856816376815e-05, "loss": 0.6063, "num_input_tokens_seen": 50462720, "step": 385, "train_runtime": 1678.5702, "train_tokens_per_second": 30062.919 }, { "epoch": 0.47503045066991473, "grad_norm": 1.3125, "learning_rate": 4.3389687330991914e-05, "loss": 0.6039, "num_input_tokens_seen": 51118080, "step": 390, "train_runtime": 1700.3779, "train_tokens_per_second": 30062.776 }, { "epoch": 0.48112058465286234, "grad_norm": 1.2421875, "learning_rate": 4.3226834921693064e-05, "loss": 0.5973, "num_input_tokens_seen": 51773440, "step": 395, "train_runtime": 1722.1624, "train_tokens_per_second": 30063.042 }, { "epoch": 0.48721071863581, "grad_norm": 1.3046875, "learning_rate": 4.306231449184481e-05, "loss": 0.5986, "num_input_tokens_seen": 52428800, "step": 400, "train_runtime": 1743.9777, "train_tokens_per_second": 30062.77 }, { "epoch": 0.4933008526187576, "grad_norm": 1.265625, "learning_rate": 4.289614109745984e-05, "loss": 0.5919, "num_input_tokens_seen": 53084160, "step": 405, "train_runtime": 1765.7704, "train_tokens_per_second": 30062.889 }, { "epoch": 0.4993909866017052, "grad_norm": 1.2734375, "learning_rate": 4.272832994582112e-05, "loss": 0.6017, "num_input_tokens_seen": 53739520, "step": 410, "train_runtime": 1787.5322, "train_tokens_per_second": 30063.526 }, { "epoch": 0.5054811205846529, "grad_norm": 1.3125, "learning_rate": 4.255889639409028e-05, "loss": 0.5838, "num_input_tokens_seen": 54394880, "step": 415, "train_runtime": 1809.3305, "train_tokens_per_second": 30063.54 }, { "epoch": 0.5115712545676004, "grad_norm": 1.34375, "learning_rate": 4.23878559479021e-05, "loss": 0.5906, "num_input_tokens_seen": 55050240, "step": 420, "train_runtime": 1831.1085, "train_tokens_per_second": 30063.888 }, { "epoch": 0.5176613885505481, "grad_norm": 1.875, "learning_rate": 4.221522425994563e-05, "loss": 0.5879, "num_input_tokens_seen": 55705600, "step": 425, "train_runtime": 1852.9092, "train_tokens_per_second": 30063.859 }, { "epoch": 0.5237515225334958, "grad_norm": 1.3515625, "learning_rate": 4.2041017128531665e-05, "loss": 0.5895, "num_input_tokens_seen": 56360960, "step": 430, "train_runtime": 1874.732, "train_tokens_per_second": 30063.476 }, { "epoch": 0.5298416565164433, "grad_norm": 1.359375, "learning_rate": 4.186525049614699e-05, "loss": 0.5794, "num_input_tokens_seen": 57016320, "step": 435, "train_runtime": 1896.6145, "train_tokens_per_second": 30062.155 }, { "epoch": 0.535931790499391, "grad_norm": 1.2734375, "learning_rate": 4.168794044799544e-05, "loss": 0.5833, "num_input_tokens_seen": 57671680, "step": 440, "train_runtime": 1918.4108, "train_tokens_per_second": 30062.216 }, { "epoch": 0.5420219244823387, "grad_norm": 1.1875, "learning_rate": 4.150910321052584e-05, "loss": 0.5748, "num_input_tokens_seen": 58327040, "step": 445, "train_runtime": 1940.1777, "train_tokens_per_second": 30062.731 }, { "epoch": 0.5481120584652862, "grad_norm": 1.2578125, "learning_rate": 4.132875514994701e-05, "loss": 0.5874, "num_input_tokens_seen": 58982400, "step": 450, "train_runtime": 1961.9296, "train_tokens_per_second": 30063.464 }, { "epoch": 0.5542021924482339, "grad_norm": 1.2265625, "learning_rate": 4.114691277073013e-05, "loss": 0.5807, "num_input_tokens_seen": 59637760, "step": 455, "train_runtime": 1983.7172, "train_tokens_per_second": 30063.639 }, { "epoch": 0.5602923264311814, "grad_norm": 1.265625, "learning_rate": 4.096359271409822e-05, "loss": 0.5897, "num_input_tokens_seen": 60293120, "step": 460, "train_runtime": 2005.5211, "train_tokens_per_second": 30063.569 }, { "epoch": 0.5663824604141291, "grad_norm": 1.2109375, "learning_rate": 4.077881175650332e-05, "loss": 0.5829, "num_input_tokens_seen": 60948480, "step": 465, "train_runtime": 2027.4851, "train_tokens_per_second": 30061.124 }, { "epoch": 0.5724725943970768, "grad_norm": 1.84375, "learning_rate": 4.059258680809114e-05, "loss": 0.568, "num_input_tokens_seen": 61603840, "step": 470, "train_runtime": 2049.3409, "train_tokens_per_second": 30060.319 }, { "epoch": 0.5785627283800243, "grad_norm": 1.9453125, "learning_rate": 4.040493491115355e-05, "loss": 0.5716, "num_input_tokens_seen": 62259200, "step": 475, "train_runtime": 2071.288, "train_tokens_per_second": 30058.205 }, { "epoch": 0.584652862362972, "grad_norm": 1.3671875, "learning_rate": 4.0215873238568986e-05, "loss": 0.577, "num_input_tokens_seen": 62914560, "step": 480, "train_runtime": 2093.1653, "train_tokens_per_second": 30057.139 }, { "epoch": 0.5907429963459196, "grad_norm": 1.25, "learning_rate": 4.002541909223084e-05, "loss": 0.5727, "num_input_tokens_seen": 63569920, "step": 485, "train_runtime": 2115.077, "train_tokens_per_second": 30055.605 }, { "epoch": 0.5968331303288672, "grad_norm": 1.234375, "learning_rate": 3.983358990146415e-05, "loss": 0.5732, "num_input_tokens_seen": 64225280, "step": 490, "train_runtime": 2137.0427, "train_tokens_per_second": 30053.344 }, { "epoch": 0.6029232643118149, "grad_norm": 1.1796875, "learning_rate": 3.964040322143049e-05, "loss": 0.5649, "num_input_tokens_seen": 64880640, "step": 495, "train_runtime": 2158.8505, "train_tokens_per_second": 30053.327 }, { "epoch": 0.6090133982947625, "grad_norm": 1.1796875, "learning_rate": 3.9445876731521433e-05, "loss": 0.5743, "num_input_tokens_seen": 65536000, "step": 500, "train_runtime": 2180.692, "train_tokens_per_second": 30052.846 }, { "epoch": 0.6151035322777101, "grad_norm": 1.1953125, "learning_rate": 3.925002823374071e-05, "loss": 0.5682, "num_input_tokens_seen": 66191360, "step": 505, "train_runtime": 2210.8877, "train_tokens_per_second": 29938.816 }, { "epoch": 0.6211936662606578, "grad_norm": 1.2578125, "learning_rate": 3.9052875651074936e-05, "loss": 0.5651, "num_input_tokens_seen": 66846720, "step": 510, "train_runtime": 2232.8769, "train_tokens_per_second": 29937.486 }, { "epoch": 0.6272838002436053, "grad_norm": 1.34375, "learning_rate": 3.8854437025853505e-05, "loss": 0.5674, "num_input_tokens_seen": 67502080, "step": 515, "train_runtime": 2254.7216, "train_tokens_per_second": 29938.1 }, { "epoch": 0.633373934226553, "grad_norm": 1.28125, "learning_rate": 3.86547305180974e-05, "loss": 0.5636, "num_input_tokens_seen": 68157440, "step": 520, "train_runtime": 2276.679, "train_tokens_per_second": 29937.22 }, { "epoch": 0.6394640682095006, "grad_norm": 1.46875, "learning_rate": 3.845377440385731e-05, "loss": 0.5706, "num_input_tokens_seen": 68812800, "step": 525, "train_runtime": 2298.5683, "train_tokens_per_second": 29937.244 }, { "epoch": 0.6455542021924482, "grad_norm": 1.1875, "learning_rate": 3.825158707354108e-05, "loss": 0.5576, "num_input_tokens_seen": 69468160, "step": 530, "train_runtime": 2320.5639, "train_tokens_per_second": 29935.897 }, { "epoch": 0.6516443361753959, "grad_norm": 1.640625, "learning_rate": 3.8048187030230745e-05, "loss": 0.5558, "num_input_tokens_seen": 70123520, "step": 535, "train_runtime": 2342.5476, "train_tokens_per_second": 29934.726 }, { "epoch": 0.6577344701583435, "grad_norm": 1.296875, "learning_rate": 3.784359288798921e-05, "loss": 0.5547, "num_input_tokens_seen": 70778880, "step": 540, "train_runtime": 2364.3333, "train_tokens_per_second": 29936.084 }, { "epoch": 0.6638246041412911, "grad_norm": 1.328125, "learning_rate": 3.763782337015683e-05, "loss": 0.5675, "num_input_tokens_seen": 71434240, "step": 545, "train_runtime": 2386.1478, "train_tokens_per_second": 29937.056 }, { "epoch": 0.6699147381242387, "grad_norm": 1.15625, "learning_rate": 3.743089730763792e-05, "loss": 0.5597, "num_input_tokens_seen": 72089600, "step": 550, "train_runtime": 2407.9479, "train_tokens_per_second": 29938.189 }, { "epoch": 0.6760048721071864, "grad_norm": 1.1953125, "learning_rate": 3.722283363717743e-05, "loss": 0.5529, "num_input_tokens_seen": 72744960, "step": 555, "train_runtime": 2429.7703, "train_tokens_per_second": 29939.027 }, { "epoch": 0.682095006090134, "grad_norm": 1.171875, "learning_rate": 3.7013651399628004e-05, "loss": 0.5622, "num_input_tokens_seen": 73400320, "step": 560, "train_runtime": 2451.4626, "train_tokens_per_second": 29941.44 }, { "epoch": 0.6881851400730816, "grad_norm": 1.21875, "learning_rate": 3.6803369738207444e-05, "loss": 0.5582, "num_input_tokens_seen": 74055680, "step": 565, "train_runtime": 2473.2169, "train_tokens_per_second": 29943.06 }, { "epoch": 0.6942752740560292, "grad_norm": 1.25, "learning_rate": 3.6592007896746846e-05, "loss": 0.551, "num_input_tokens_seen": 74711040, "step": 570, "train_runtime": 2494.9949, "train_tokens_per_second": 29944.366 }, { "epoch": 0.7003654080389768, "grad_norm": 1.1875, "learning_rate": 3.6379585217929474e-05, "loss": 0.5601, "num_input_tokens_seen": 75366400, "step": 575, "train_runtime": 2516.7415, "train_tokens_per_second": 29946.024 }, { "epoch": 0.7064555420219245, "grad_norm": 1.171875, "learning_rate": 3.6166121141520655e-05, "loss": 0.5487, "num_input_tokens_seen": 76021760, "step": 580, "train_runtime": 2538.5185, "train_tokens_per_second": 29947.294 }, { "epoch": 0.7125456760048721, "grad_norm": 1.234375, "learning_rate": 3.595163520258873e-05, "loss": 0.5604, "num_input_tokens_seen": 76677120, "step": 585, "train_runtime": 2560.24, "train_tokens_per_second": 29949.192 }, { "epoch": 0.7186358099878197, "grad_norm": 1.34375, "learning_rate": 3.573614702971735e-05, "loss": 0.5521, "num_input_tokens_seen": 77332480, "step": 590, "train_runtime": 2582.0217, "train_tokens_per_second": 29950.36 }, { "epoch": 0.7247259439707674, "grad_norm": 1.1875, "learning_rate": 3.551967634320911e-05, "loss": 0.5472, "num_input_tokens_seen": 77987840, "step": 595, "train_runtime": 2603.7692, "train_tokens_per_second": 29951.902 }, { "epoch": 0.730816077953715, "grad_norm": 1.2734375, "learning_rate": 3.530224295328096e-05, "loss": 0.5447, "num_input_tokens_seen": 78643200, "step": 600, "train_runtime": 2625.5133, "train_tokens_per_second": 29953.457 }, { "epoch": 0.7369062119366626, "grad_norm": 1.21875, "learning_rate": 3.508386675825116e-05, "loss": 0.5441, "num_input_tokens_seen": 79298560, "step": 605, "train_runtime": 2647.2508, "train_tokens_per_second": 29955.061 }, { "epoch": 0.7429963459196103, "grad_norm": 1.1328125, "learning_rate": 3.486456774271837e-05, "loss": 0.5417, "num_input_tokens_seen": 79953920, "step": 610, "train_runtime": 2668.9717, "train_tokens_per_second": 29956.825 }, { "epoch": 0.7490864799025578, "grad_norm": 1.1953125, "learning_rate": 3.464436597573276e-05, "loss": 0.5495, "num_input_tokens_seen": 80609280, "step": 615, "train_runtime": 2690.7063, "train_tokens_per_second": 29958.409 }, { "epoch": 0.7551766138855055, "grad_norm": 1.2890625, "learning_rate": 3.4423281608959376e-05, "loss": 0.5388, "num_input_tokens_seen": 81264640, "step": 620, "train_runtime": 2712.4728, "train_tokens_per_second": 29959.615 }, { "epoch": 0.761266747868453, "grad_norm": 1.171875, "learning_rate": 3.420133487483402e-05, "loss": 0.5358, "num_input_tokens_seen": 81920000, "step": 625, "train_runtime": 2734.2134, "train_tokens_per_second": 29961.085 }, { "epoch": 0.7673568818514007, "grad_norm": 1.1484375, "learning_rate": 3.3978546084711595e-05, "loss": 0.5433, "num_input_tokens_seen": 82575360, "step": 630, "train_runtime": 2755.9692, "train_tokens_per_second": 29962.367 }, { "epoch": 0.7734470158343484, "grad_norm": 1.15625, "learning_rate": 3.375493562700742e-05, "loss": 0.5464, "num_input_tokens_seen": 83230720, "step": 635, "train_runtime": 2777.7109, "train_tokens_per_second": 29963.78 }, { "epoch": 0.7795371498172959, "grad_norm": 1.21875, "learning_rate": 3.353052396533133e-05, "loss": 0.5404, "num_input_tokens_seen": 83886080, "step": 640, "train_runtime": 2799.4496, "train_tokens_per_second": 29965.204 }, { "epoch": 0.7856272838002436, "grad_norm": 1.1875, "learning_rate": 3.330533163661501e-05, "loss": 0.5427, "num_input_tokens_seen": 84541440, "step": 645, "train_runtime": 2821.1758, "train_tokens_per_second": 29966.739 }, { "epoch": 0.7917174177831913, "grad_norm": 1.1875, "learning_rate": 3.3079379249232475e-05, "loss": 0.5393, "num_input_tokens_seen": 85196800, "step": 650, "train_runtime": 2842.9286, "train_tokens_per_second": 29967.971 }, { "epoch": 0.7978075517661388, "grad_norm": 1.1328125, "learning_rate": 3.2852687481114235e-05, "loss": 0.5404, "num_input_tokens_seen": 85852160, "step": 655, "train_runtime": 2864.6768, "train_tokens_per_second": 29969.23 }, { "epoch": 0.8038976857490865, "grad_norm": 1.6171875, "learning_rate": 3.2625277077854855e-05, "loss": 0.5407, "num_input_tokens_seen": 86507520, "step": 660, "train_runtime": 2886.4158, "train_tokens_per_second": 29970.567 }, { "epoch": 0.8099878197320342, "grad_norm": 1.09375, "learning_rate": 3.239716885081446e-05, "loss": 0.5304, "num_input_tokens_seen": 87162880, "step": 665, "train_runtime": 2908.1158, "train_tokens_per_second": 29972.287 }, { "epoch": 0.8160779537149817, "grad_norm": 1.2109375, "learning_rate": 3.216838367521424e-05, "loss": 0.5397, "num_input_tokens_seen": 87818240, "step": 670, "train_runtime": 2929.8531, "train_tokens_per_second": 29973.598 }, { "epoch": 0.8221680876979294, "grad_norm": 1.2265625, "learning_rate": 3.193894248822599e-05, "loss": 0.5362, "num_input_tokens_seen": 88473600, "step": 675, "train_runtime": 2951.6047, "train_tokens_per_second": 29974.745 }, { "epoch": 0.8282582216808769, "grad_norm": 1.25, "learning_rate": 3.17088662870561e-05, "loss": 0.5333, "num_input_tokens_seen": 89128960, "step": 680, "train_runtime": 2973.4611, "train_tokens_per_second": 29974.82 }, { "epoch": 0.8343483556638246, "grad_norm": 1.2265625, "learning_rate": 3.147817612702403e-05, "loss": 0.5333, "num_input_tokens_seen": 89784320, "step": 685, "train_runtime": 2995.2224, "train_tokens_per_second": 29975.844 }, { "epoch": 0.8404384896467723, "grad_norm": 1.1640625, "learning_rate": 3.124689311963535e-05, "loss": 0.5239, "num_input_tokens_seen": 90439680, "step": 690, "train_runtime": 3017.0068, "train_tokens_per_second": 29976.625 }, { "epoch": 0.8465286236297198, "grad_norm": 1.140625, "learning_rate": 3.101503843064981e-05, "loss": 0.5356, "num_input_tokens_seen": 91095040, "step": 695, "train_runtime": 3038.7428, "train_tokens_per_second": 29977.872 }, { "epoch": 0.8526187576126675, "grad_norm": 1.125, "learning_rate": 3.078263327814438e-05, "loss": 0.5301, "num_input_tokens_seen": 91750400, "step": 700, "train_runtime": 3060.5883, "train_tokens_per_second": 29978.028 }, { "epoch": 0.8587088915956151, "grad_norm": 1.1484375, "learning_rate": 3.0549698930571386e-05, "loss": 0.5336, "num_input_tokens_seen": 92405760, "step": 705, "train_runtime": 3082.32, "train_tokens_per_second": 29979.288 }, { "epoch": 0.8647990255785627, "grad_norm": 1.0546875, "learning_rate": 3.0316256704812252e-05, "loss": 0.5262, "num_input_tokens_seen": 93061120, "step": 710, "train_runtime": 3104.0532, "train_tokens_per_second": 29980.517 }, { "epoch": 0.8708891595615104, "grad_norm": 1.1015625, "learning_rate": 3.0082327964226615e-05, "loss": 0.5249, "num_input_tokens_seen": 93716480, "step": 715, "train_runtime": 3125.8473, "train_tokens_per_second": 29981.145 }, { "epoch": 0.876979293544458, "grad_norm": 1.34375, "learning_rate": 2.9847934116697307e-05, "loss": 0.5313, "num_input_tokens_seen": 94371840, "step": 720, "train_runtime": 3147.5696, "train_tokens_per_second": 29982.448 }, { "epoch": 0.8830694275274056, "grad_norm": 1.2109375, "learning_rate": 2.9613096612671225e-05, "loss": 0.5308, "num_input_tokens_seen": 95027200, "step": 725, "train_runtime": 3169.2945, "train_tokens_per_second": 29983.708 }, { "epoch": 0.8891595615103532, "grad_norm": 1.1484375, "learning_rate": 2.9377836943196256e-05, "loss": 0.5318, "num_input_tokens_seen": 95682560, "step": 730, "train_runtime": 3191.0555, "train_tokens_per_second": 29984.611 }, { "epoch": 0.8952496954933008, "grad_norm": 1.3828125, "learning_rate": 2.91421766379546e-05, "loss": 0.5383, "num_input_tokens_seen": 96337920, "step": 735, "train_runtime": 3212.7812, "train_tokens_per_second": 29985.833 }, { "epoch": 0.9013398294762485, "grad_norm": 1.4296875, "learning_rate": 2.8906137263292442e-05, "loss": 0.532, "num_input_tokens_seen": 96993280, "step": 740, "train_runtime": 3234.5148, "train_tokens_per_second": 29986.965 }, { "epoch": 0.9074299634591961, "grad_norm": 1.2578125, "learning_rate": 2.8669740420246334e-05, "loss": 0.5222, "num_input_tokens_seen": 97648640, "step": 745, "train_runtime": 3256.233, "train_tokens_per_second": 29988.222 }, { "epoch": 0.9135200974421437, "grad_norm": 1.0859375, "learning_rate": 2.843300774256638e-05, "loss": 0.52, "num_input_tokens_seen": 98304000, "step": 750, "train_runtime": 3277.9714, "train_tokens_per_second": 29989.279 }, { "epoch": 0.9196102314250914, "grad_norm": 1.1171875, "learning_rate": 2.819596089473646e-05, "loss": 0.5194, "num_input_tokens_seen": 98959360, "step": 755, "train_runtime": 3299.7126, "train_tokens_per_second": 29990.297 }, { "epoch": 0.925700365408039, "grad_norm": 1.0859375, "learning_rate": 2.795862156999157e-05, "loss": 0.5278, "num_input_tokens_seen": 99614720, "step": 760, "train_runtime": 3321.428, "train_tokens_per_second": 29991.534 }, { "epoch": 0.9317904993909866, "grad_norm": 1.1015625, "learning_rate": 2.7721011488332615e-05, "loss": 0.5221, "num_input_tokens_seen": 100270080, "step": 765, "train_runtime": 3343.2094, "train_tokens_per_second": 29992.163 }, { "epoch": 0.9378806333739342, "grad_norm": 1.0703125, "learning_rate": 2.748315239453868e-05, "loss": 0.5159, "num_input_tokens_seen": 100925440, "step": 770, "train_runtime": 3364.9146, "train_tokens_per_second": 29993.463 }, { "epoch": 0.9439707673568819, "grad_norm": 1.25, "learning_rate": 2.7245066056177093e-05, "loss": 0.5135, "num_input_tokens_seen": 101580800, "step": 775, "train_runtime": 3386.6156, "train_tokens_per_second": 29994.783 }, { "epoch": 0.9500609013398295, "grad_norm": 1.125, "learning_rate": 2.7006774261611373e-05, "loss": 0.5237, "num_input_tokens_seen": 102236160, "step": 780, "train_runtime": 3408.3514, "train_tokens_per_second": 29995.78 }, { "epoch": 0.9561510353227771, "grad_norm": 1.078125, "learning_rate": 2.6768298818007253e-05, "loss": 0.5154, "num_input_tokens_seen": 102891520, "step": 785, "train_runtime": 3430.0863, "train_tokens_per_second": 29996.773 }, { "epoch": 0.9622411693057247, "grad_norm": 1.1171875, "learning_rate": 2.6529661549337032e-05, "loss": 0.5177, "num_input_tokens_seen": 103546880, "step": 790, "train_runtime": 3451.8095, "train_tokens_per_second": 29997.854 }, { "epoch": 0.9683313032886723, "grad_norm": 1.0703125, "learning_rate": 2.6290884294382366e-05, "loss": 0.5142, "num_input_tokens_seen": 104202240, "step": 795, "train_runtime": 3473.5565, "train_tokens_per_second": 29998.717 }, { "epoch": 0.97442143727162, "grad_norm": 1.1328125, "learning_rate": 2.6051988904735686e-05, "loss": 0.5138, "num_input_tokens_seen": 104857600, "step": 800, "train_runtime": 3495.316, "train_tokens_per_second": 29999.462 }, { "epoch": 0.9805115712545676, "grad_norm": 1.0859375, "learning_rate": 2.5812997242800456e-05, "loss": 0.5225, "num_input_tokens_seen": 105512960, "step": 805, "train_runtime": 3517.0562, "train_tokens_per_second": 30000.362 }, { "epoch": 0.9866017052375152, "grad_norm": 1.2421875, "learning_rate": 2.5573931179790472e-05, "loss": 0.5116, "num_input_tokens_seen": 106168320, "step": 810, "train_runtime": 3538.7869, "train_tokens_per_second": 30001.331 }, { "epoch": 0.9926918392204629, "grad_norm": 1.109375, "learning_rate": 2.5334812593728296e-05, "loss": 0.526, "num_input_tokens_seen": 106823680, "step": 815, "train_runtime": 3560.5431, "train_tokens_per_second": 30002.074 }, { "epoch": 0.9987819732034104, "grad_norm": 1.1640625, "learning_rate": 2.5095663367443123e-05, "loss": 0.5278, "num_input_tokens_seen": 107479040, "step": 820, "train_runtime": 3582.2344, "train_tokens_per_second": 30003.352 }, { "epoch": 1.004872107186358, "grad_norm": 1.0859375, "learning_rate": 2.485650538656817e-05, "loss": 0.454, "num_input_tokens_seen": 108103680, "step": 825, "train_runtime": 3603.2471, "train_tokens_per_second": 30001.74 }, { "epoch": 1.0109622411693058, "grad_norm": 1.171875, "learning_rate": 2.461736053753783e-05, "loss": 0.44, "num_input_tokens_seen": 108759040, "step": 830, "train_runtime": 3624.9733, "train_tokens_per_second": 30002.714 }, { "epoch": 1.0170523751522533, "grad_norm": 1.1015625, "learning_rate": 2.4378250705584737e-05, "loss": 0.4402, "num_input_tokens_seen": 109414400, "step": 835, "train_runtime": 3646.7181, "train_tokens_per_second": 30003.526 }, { "epoch": 1.0231425091352009, "grad_norm": 1.0859375, "learning_rate": 2.4139197772736942e-05, "loss": 0.4341, "num_input_tokens_seen": 110069760, "step": 840, "train_runtime": 3668.5013, "train_tokens_per_second": 30004.013 }, { "epoch": 1.0292326431181487, "grad_norm": 1.0390625, "learning_rate": 2.3900223615815438e-05, "loss": 0.4492, "num_input_tokens_seen": 110725120, "step": 845, "train_runtime": 3690.286, "train_tokens_per_second": 30004.482 }, { "epoch": 1.0353227771010962, "grad_norm": 1.1015625, "learning_rate": 2.3661350104432037e-05, "loss": 0.4401, "num_input_tokens_seen": 111380480, "step": 850, "train_runtime": 3712.0285, "train_tokens_per_second": 30005.287 }, { "epoch": 1.0414129110840438, "grad_norm": 1.1015625, "learning_rate": 2.3422599098988023e-05, "loss": 0.4402, "num_input_tokens_seen": 112035840, "step": 855, "train_runtime": 3733.7851, "train_tokens_per_second": 30005.969 }, { "epoch": 1.0475030450669915, "grad_norm": 1.0859375, "learning_rate": 2.3183992448673615e-05, "loss": 0.4383, "num_input_tokens_seen": 112691200, "step": 860, "train_runtime": 3755.54, "train_tokens_per_second": 30006.657 }, { "epoch": 1.053593179049939, "grad_norm": 1.109375, "learning_rate": 2.294555198946845e-05, "loss": 0.4408, "num_input_tokens_seen": 113346560, "step": 865, "train_runtime": 3777.2755, "train_tokens_per_second": 30007.491 }, { "epoch": 1.0596833130328867, "grad_norm": 1.1328125, "learning_rate": 2.270729954214324e-05, "loss": 0.4344, "num_input_tokens_seen": 114001920, "step": 870, "train_runtime": 3799.0222, "train_tokens_per_second": 30008.227 }, { "epoch": 1.0657734470158344, "grad_norm": 1.1484375, "learning_rate": 2.2469256910262877e-05, "loss": 0.4417, "num_input_tokens_seen": 114657280, "step": 875, "train_runtime": 3820.7855, "train_tokens_per_second": 30008.824 }, { "epoch": 1.071863580998782, "grad_norm": 1.046875, "learning_rate": 2.2231445878191107e-05, "loss": 0.4379, "num_input_tokens_seen": 115312640, "step": 880, "train_runtime": 3842.4905, "train_tokens_per_second": 30009.87 }, { "epoch": 1.0779537149817295, "grad_norm": 1.0703125, "learning_rate": 2.1993888209096897e-05, "loss": 0.4367, "num_input_tokens_seen": 115968000, "step": 885, "train_runtime": 3864.1859, "train_tokens_per_second": 30010.979 }, { "epoch": 1.0840438489646773, "grad_norm": 1.0546875, "learning_rate": 2.1756605642962827e-05, "loss": 0.439, "num_input_tokens_seen": 116623360, "step": 890, "train_runtime": 3885.9318, "train_tokens_per_second": 30011.685 }, { "epoch": 1.0901339829476249, "grad_norm": 1.125, "learning_rate": 2.1519619894595567e-05, "loss": 0.4357, "num_input_tokens_seen": 117278720, "step": 895, "train_runtime": 3907.6564, "train_tokens_per_second": 30012.547 }, { "epoch": 1.0962241169305724, "grad_norm": 1.0625, "learning_rate": 2.1282952651638626e-05, "loss": 0.4365, "num_input_tokens_seen": 117934080, "step": 900, "train_runtime": 3929.4342, "train_tokens_per_second": 30012.993 }, { "epoch": 1.1023142509135202, "grad_norm": 1.0625, "learning_rate": 2.1046625572587633e-05, "loss": 0.4301, "num_input_tokens_seen": 118589440, "step": 905, "train_runtime": 3951.2394, "train_tokens_per_second": 30013.225 }, { "epoch": 1.1084043848964678, "grad_norm": 1.1015625, "learning_rate": 2.0810660284808297e-05, "loss": 0.4309, "num_input_tokens_seen": 119244800, "step": 910, "train_runtime": 3972.9809, "train_tokens_per_second": 30013.937 }, { "epoch": 1.1144945188794153, "grad_norm": 1.125, "learning_rate": 2.0575078382557137e-05, "loss": 0.4336, "num_input_tokens_seen": 119900160, "step": 915, "train_runtime": 3994.6574, "train_tokens_per_second": 30015.129 }, { "epoch": 1.1205846528623629, "grad_norm": 1.109375, "learning_rate": 2.0339901425005315e-05, "loss": 0.4329, "num_input_tokens_seen": 120555520, "step": 920, "train_runtime": 4016.3768, "train_tokens_per_second": 30015.989 }, { "epoch": 1.1266747868453106, "grad_norm": 1.078125, "learning_rate": 2.0105150934265687e-05, "loss": 0.4377, "num_input_tokens_seen": 121210880, "step": 925, "train_runtime": 4038.1192, "train_tokens_per_second": 30016.667 }, { "epoch": 1.1327649208282582, "grad_norm": 1.171875, "learning_rate": 1.9870848393423176e-05, "loss": 0.4414, "num_input_tokens_seen": 121866240, "step": 930, "train_runtime": 4059.8399, "train_tokens_per_second": 30017.499 }, { "epoch": 1.1388550548112057, "grad_norm": 1.0546875, "learning_rate": 1.963701524456877e-05, "loss": 0.4327, "num_input_tokens_seen": 122521600, "step": 935, "train_runtime": 4081.578, "train_tokens_per_second": 30018.194 }, { "epoch": 1.1449451887941535, "grad_norm": 1.109375, "learning_rate": 1.9403672886837264e-05, "loss": 0.4283, "num_input_tokens_seen": 123176960, "step": 940, "train_runtime": 4103.2994, "train_tokens_per_second": 30019.004 }, { "epoch": 1.151035322777101, "grad_norm": 1.0703125, "learning_rate": 1.9170842674448942e-05, "loss": 0.4207, "num_input_tokens_seen": 123832320, "step": 945, "train_runtime": 4125.0267, "train_tokens_per_second": 30019.762 }, { "epoch": 1.1571254567600486, "grad_norm": 1.109375, "learning_rate": 1.89385459147553e-05, "loss": 0.437, "num_input_tokens_seen": 124487680, "step": 950, "train_runtime": 4146.7599, "train_tokens_per_second": 30020.47 }, { "epoch": 1.1632155907429964, "grad_norm": 1.1640625, "learning_rate": 1.8706803866289208e-05, "loss": 0.4381, "num_input_tokens_seen": 125143040, "step": 955, "train_runtime": 4168.4938, "train_tokens_per_second": 30021.165 }, { "epoch": 1.169305724725944, "grad_norm": 1.109375, "learning_rate": 1.8475637736819335e-05, "loss": 0.4272, "num_input_tokens_seen": 125798400, "step": 960, "train_runtime": 4190.2193, "train_tokens_per_second": 30021.913 }, { "epoch": 1.1753958587088915, "grad_norm": 1.0625, "learning_rate": 1.824506868140942e-05, "loss": 0.4248, "num_input_tokens_seen": 126453760, "step": 965, "train_runtime": 4211.9556, "train_tokens_per_second": 30022.577 }, { "epoch": 1.1814859926918393, "grad_norm": 1.0625, "learning_rate": 1.801511780048221e-05, "loss": 0.429, "num_input_tokens_seen": 127109120, "step": 970, "train_runtime": 4233.6883, "train_tokens_per_second": 30023.259 }, { "epoch": 1.1875761266747868, "grad_norm": 1.125, "learning_rate": 1.778580613788853e-05, "loss": 0.4305, "num_input_tokens_seen": 127764480, "step": 975, "train_runtime": 4255.3913, "train_tokens_per_second": 30024.144 }, { "epoch": 1.1936662606577344, "grad_norm": 1.0625, "learning_rate": 1.755715467898139e-05, "loss": 0.4307, "num_input_tokens_seen": 128419840, "step": 980, "train_runtime": 4277.133, "train_tokens_per_second": 30024.748 }, { "epoch": 1.1997563946406822, "grad_norm": 1.0625, "learning_rate": 1.7329184348695586e-05, "loss": 0.4238, "num_input_tokens_seen": 129075200, "step": 985, "train_runtime": 4298.8319, "train_tokens_per_second": 30025.645 }, { "epoch": 1.2058465286236297, "grad_norm": 1.0859375, "learning_rate": 1.7101916009632733e-05, "loss": 0.4402, "num_input_tokens_seen": 129730560, "step": 990, "train_runtime": 4320.5575, "train_tokens_per_second": 30026.347 }, { "epoch": 1.2119366626065773, "grad_norm": 1.0703125, "learning_rate": 1.6875370460152023e-05, "loss": 0.4324, "num_input_tokens_seen": 130385920, "step": 995, "train_runtime": 4342.3036, "train_tokens_per_second": 30026.901 }, { "epoch": 1.218026796589525, "grad_norm": 1.21875, "learning_rate": 1.6649568432466884e-05, "loss": 0.4349, "num_input_tokens_seen": 131041280, "step": 1000, "train_runtime": 4364.0234, "train_tokens_per_second": 30027.63 }, { "epoch": 1.2241169305724726, "grad_norm": 1.109375, "learning_rate": 1.6424530590747724e-05, "loss": 0.4318, "num_input_tokens_seen": 131696640, "step": 1005, "train_runtime": 4393.7613, "train_tokens_per_second": 29973.553 }, { "epoch": 1.2302070645554202, "grad_norm": 1.171875, "learning_rate": 1.6200277529230768e-05, "loss": 0.4475, "num_input_tokens_seen": 132352000, "step": 1010, "train_runtime": 4415.4584, "train_tokens_per_second": 29974.69 }, { "epoch": 1.236297198538368, "grad_norm": 1.1015625, "learning_rate": 1.5976829770333452e-05, "loss": 0.4415, "num_input_tokens_seen": 133007360, "step": 1015, "train_runtime": 4437.1808, "train_tokens_per_second": 29975.646 }, { "epoch": 1.2423873325213155, "grad_norm": 1.09375, "learning_rate": 1.5754207762776325e-05, "loss": 0.4288, "num_input_tokens_seen": 133662720, "step": 1020, "train_runtime": 4458.8792, "train_tokens_per_second": 29976.753 }, { "epoch": 1.248477466504263, "grad_norm": 1.0546875, "learning_rate": 1.5532431879711657e-05, "loss": 0.4289, "num_input_tokens_seen": 134318080, "step": 1025, "train_runtime": 4480.5616, "train_tokens_per_second": 29977.956 }, { "epoch": 1.2545676004872108, "grad_norm": 1.0546875, "learning_rate": 1.5311522416859016e-05, "loss": 0.4246, "num_input_tokens_seen": 134973440, "step": 1030, "train_runtime": 4502.2815, "train_tokens_per_second": 29978.898 }, { "epoch": 1.2606577344701584, "grad_norm": 1.125, "learning_rate": 1.5091499590647936e-05, "loss": 0.432, "num_input_tokens_seen": 135628800, "step": 1035, "train_runtime": 4524.0863, "train_tokens_per_second": 29979.269 }, { "epoch": 1.266747868453106, "grad_norm": 1.0703125, "learning_rate": 1.4872383536367785e-05, "loss": 0.4333, "num_input_tokens_seen": 136284160, "step": 1040, "train_runtime": 4545.9201, "train_tokens_per_second": 29979.444 }, { "epoch": 1.2728380024360537, "grad_norm": 1.078125, "learning_rate": 1.4654194306325093e-05, "loss": 0.4282, "num_input_tokens_seen": 136939520, "step": 1045, "train_runtime": 4567.69, "train_tokens_per_second": 29980.038 }, { "epoch": 1.2789281364190013, "grad_norm": 1.0625, "learning_rate": 1.4436951868008536e-05, "loss": 0.4307, "num_input_tokens_seen": 137594880, "step": 1050, "train_runtime": 4589.5037, "train_tokens_per_second": 29980.34 }, { "epoch": 1.2850182704019488, "grad_norm": 1.078125, "learning_rate": 1.4220676102261532e-05, "loss": 0.4323, "num_input_tokens_seen": 138250240, "step": 1055, "train_runtime": 4611.2636, "train_tokens_per_second": 29980.988 }, { "epoch": 1.2911084043848966, "grad_norm": 1.0625, "learning_rate": 1.4005386801462896e-05, "loss": 0.428, "num_input_tokens_seen": 138905600, "step": 1060, "train_runtime": 4633.0271, "train_tokens_per_second": 29981.607 }, { "epoch": 1.2971985383678442, "grad_norm": 1.1015625, "learning_rate": 1.3791103667715577e-05, "loss": 0.4226, "num_input_tokens_seen": 139560960, "step": 1065, "train_runtime": 4654.7487, "train_tokens_per_second": 29982.491 }, { "epoch": 1.3032886723507917, "grad_norm": 1.109375, "learning_rate": 1.3577846311043593e-05, "loss": 0.4332, "num_input_tokens_seen": 140216320, "step": 1070, "train_runtime": 4676.4958, "train_tokens_per_second": 29983.203 }, { "epoch": 1.3093788063337393, "grad_norm": 1.1015625, "learning_rate": 1.3365634247597415e-05, "loss": 0.426, "num_input_tokens_seen": 140871680, "step": 1075, "train_runtime": 4698.2878, "train_tokens_per_second": 29983.621 }, { "epoch": 1.315468940316687, "grad_norm": 1.03125, "learning_rate": 1.3154486897867996e-05, "loss": 0.4302, "num_input_tokens_seen": 141527040, "step": 1080, "train_runtime": 4720.0824, "train_tokens_per_second": 29984.019 }, { "epoch": 1.3215590742996346, "grad_norm": 1.0859375, "learning_rate": 1.2944423584909502e-05, "loss": 0.4306, "num_input_tokens_seen": 142182400, "step": 1085, "train_runtime": 4741.7882, "train_tokens_per_second": 29984.975 }, { "epoch": 1.3276492082825821, "grad_norm": 1.09375, "learning_rate": 1.273546353257096e-05, "loss": 0.4204, "num_input_tokens_seen": 142837760, "step": 1090, "train_runtime": 4763.5593, "train_tokens_per_second": 29985.511 }, { "epoch": 1.3337393422655297, "grad_norm": 1.046875, "learning_rate": 1.2527625863736981e-05, "loss": 0.4253, "num_input_tokens_seen": 143493120, "step": 1095, "train_runtime": 4785.3061, "train_tokens_per_second": 29986.195 }, { "epoch": 1.3398294762484775, "grad_norm": 1.0546875, "learning_rate": 1.2320929598577777e-05, "loss": 0.4353, "num_input_tokens_seen": 144148480, "step": 1100, "train_runtime": 4807.0374, "train_tokens_per_second": 29986.969 }, { "epoch": 1.345919610231425, "grad_norm": 1.09375, "learning_rate": 1.2115393652808526e-05, "loss": 0.4358, "num_input_tokens_seen": 144803840, "step": 1105, "train_runtime": 4828.7956, "train_tokens_per_second": 29987.569 }, { "epoch": 1.3520097442143726, "grad_norm": 1.0546875, "learning_rate": 1.1911036835958274e-05, "loss": 0.4386, "num_input_tokens_seen": 145459200, "step": 1110, "train_runtime": 4850.5882, "train_tokens_per_second": 29987.951 }, { "epoch": 1.3580998781973204, "grad_norm": 1.03125, "learning_rate": 1.1707877849648643e-05, "loss": 0.4304, "num_input_tokens_seen": 146114560, "step": 1115, "train_runtime": 4872.3825, "train_tokens_per_second": 29988.319 }, { "epoch": 1.364190012180268, "grad_norm": 1.09375, "learning_rate": 1.1505935285882336e-05, "loss": 0.4327, "num_input_tokens_seen": 146769920, "step": 1120, "train_runtime": 4894.2345, "train_tokens_per_second": 29988.33 }, { "epoch": 1.3702801461632155, "grad_norm": 1.0703125, "learning_rate": 1.1305227625341657e-05, "loss": 0.4316, "num_input_tokens_seen": 147425280, "step": 1125, "train_runtime": 4915.9875, "train_tokens_per_second": 29988.945 }, { "epoch": 1.3763702801461632, "grad_norm": 1.078125, "learning_rate": 1.1105773235697376e-05, "loss": 0.4247, "num_input_tokens_seen": 148080640, "step": 1130, "train_runtime": 4937.7365, "train_tokens_per_second": 29989.579 }, { "epoch": 1.3824604141291108, "grad_norm": 1.0859375, "learning_rate": 1.0907590369927674e-05, "loss": 0.4298, "num_input_tokens_seen": 148736000, "step": 1135, "train_runtime": 4959.4689, "train_tokens_per_second": 29990.308 }, { "epoch": 1.3885505481120584, "grad_norm": 1.03125, "learning_rate": 1.0710697164647807e-05, "loss": 0.431, "num_input_tokens_seen": 149391360, "step": 1140, "train_runtime": 4981.2189, "train_tokens_per_second": 29990.925 }, { "epoch": 1.3946406820950061, "grad_norm": 1.0546875, "learning_rate": 1.0515111638450395e-05, "loss": 0.4236, "num_input_tokens_seen": 150046720, "step": 1145, "train_runtime": 5002.928, "train_tokens_per_second": 29991.781 }, { "epoch": 1.4007308160779537, "grad_norm": 1.0703125, "learning_rate": 1.0320851690256324e-05, "loss": 0.4318, "num_input_tokens_seen": 150702080, "step": 1150, "train_runtime": 5024.6238, "train_tokens_per_second": 29992.709 }, { "epoch": 1.4068209500609012, "grad_norm": 1.0546875, "learning_rate": 1.0127935097676855e-05, "loss": 0.4371, "num_input_tokens_seen": 151357440, "step": 1155, "train_runtime": 5046.36, "train_tokens_per_second": 29993.389 }, { "epoch": 1.412911084043849, "grad_norm": 1.0546875, "learning_rate": 9.936379515386663e-06, "loss": 0.4213, "num_input_tokens_seen": 152012800, "step": 1160, "train_runtime": 5068.1066, "train_tokens_per_second": 29994.002 }, { "epoch": 1.4190012180267966, "grad_norm": 1.078125, "learning_rate": 9.74620247350815e-06, "loss": 0.4245, "num_input_tokens_seen": 152668160, "step": 1165, "train_runtime": 5089.847, "train_tokens_per_second": 29994.646 }, { "epoch": 1.4250913520097441, "grad_norm": 1.140625, "learning_rate": 9.557421376007258e-06, "loss": 0.4272, "num_input_tokens_seen": 153323520, "step": 1170, "train_runtime": 5111.5715, "train_tokens_per_second": 29995.378 }, { "epoch": 1.431181485992692, "grad_norm": 1.09375, "learning_rate": 9.370053499100698e-06, "loss": 0.418, "num_input_tokens_seen": 153978880, "step": 1175, "train_runtime": 5133.3011, "train_tokens_per_second": 29996.074 }, { "epoch": 1.4372716199756395, "grad_norm": 1.0625, "learning_rate": 9.184115989674913e-06, "loss": 0.4314, "num_input_tokens_seen": 154634240, "step": 1180, "train_runtime": 5155.025, "train_tokens_per_second": 29996.797 }, { "epoch": 1.443361753958587, "grad_norm": 1.046875, "learning_rate": 8.999625863716951e-06, "loss": 0.4283, "num_input_tokens_seen": 155289600, "step": 1185, "train_runtime": 5176.7613, "train_tokens_per_second": 29997.443 }, { "epoch": 1.4494518879415348, "grad_norm": 1.0234375, "learning_rate": 8.816600004757175e-06, "loss": 0.4367, "num_input_tokens_seen": 155944960, "step": 1190, "train_runtime": 5198.4684, "train_tokens_per_second": 29998.251 }, { "epoch": 1.4555420219244823, "grad_norm": 1.0390625, "learning_rate": 8.635055162324276e-06, "loss": 0.416, "num_input_tokens_seen": 156600320, "step": 1195, "train_runtime": 5220.1609, "train_tokens_per_second": 29999.137 }, { "epoch": 1.46163215590743, "grad_norm": 1.0546875, "learning_rate": 8.455007950412324e-06, "loss": 0.4317, "num_input_tokens_seen": 157255680, "step": 1200, "train_runtime": 5241.8684, "train_tokens_per_second": 29999.929 }, { "epoch": 1.4677222898903777, "grad_norm": 1.734375, "learning_rate": 8.276474845960448e-06, "loss": 0.4237, "num_input_tokens_seen": 157911040, "step": 1205, "train_runtime": 5263.5912, "train_tokens_per_second": 30000.628 }, { "epoch": 1.4738124238733252, "grad_norm": 1.0546875, "learning_rate": 8.099472187344914e-06, "loss": 0.4356, "num_input_tokens_seen": 158566400, "step": 1210, "train_runtime": 5285.317, "train_tokens_per_second": 30001.304 }, { "epoch": 1.4799025578562728, "grad_norm": 1.1015625, "learning_rate": 7.924016172883908e-06, "loss": 0.4297, "num_input_tokens_seen": 159221760, "step": 1215, "train_runtime": 5307.0638, "train_tokens_per_second": 30001.855 }, { "epoch": 1.4859926918392206, "grad_norm": 1.078125, "learning_rate": 7.750122859355199e-06, "loss": 0.4317, "num_input_tokens_seen": 159877120, "step": 1220, "train_runtime": 5328.8039, "train_tokens_per_second": 30002.44 }, { "epoch": 1.4920828258221681, "grad_norm": 1.0625, "learning_rate": 7.577808160526692e-06, "loss": 0.4311, "num_input_tokens_seen": 160532480, "step": 1225, "train_runtime": 5350.5736, "train_tokens_per_second": 30002.854 }, { "epoch": 1.4981729598051157, "grad_norm": 1.078125, "learning_rate": 7.40708784570005e-06, "loss": 0.4269, "num_input_tokens_seen": 161187840, "step": 1230, "train_runtime": 5372.3219, "train_tokens_per_second": 30003.385 }, { "epoch": 1.5042630937880634, "grad_norm": 1.03125, "learning_rate": 7.2379775382676375e-06, "loss": 0.4268, "num_input_tokens_seen": 161843200, "step": 1235, "train_runtime": 5394.073, "train_tokens_per_second": 30003.895 }, { "epoch": 1.510353227771011, "grad_norm": 1.0546875, "learning_rate": 7.070492714282706e-06, "loss": 0.4243, "num_input_tokens_seen": 162498560, "step": 1240, "train_runtime": 5415.8136, "train_tokens_per_second": 30004.459 }, { "epoch": 1.5164433617539586, "grad_norm": 1.0859375, "learning_rate": 6.904648701043137e-06, "loss": 0.4237, "num_input_tokens_seen": 163153920, "step": 1245, "train_runtime": 5437.5717, "train_tokens_per_second": 30004.923 }, { "epoch": 1.5225334957369063, "grad_norm": 1.0546875, "learning_rate": 6.740460675688734e-06, "loss": 0.4214, "num_input_tokens_seen": 163809280, "step": 1250, "train_runtime": 5459.3212, "train_tokens_per_second": 30005.43 }, { "epoch": 1.5286236297198539, "grad_norm": 1.2421875, "learning_rate": 6.577943663812344e-06, "loss": 0.4331, "num_input_tokens_seen": 164464640, "step": 1255, "train_runtime": 5481.0582, "train_tokens_per_second": 30006.001 }, { "epoch": 1.5347137637028014, "grad_norm": 1.0390625, "learning_rate": 6.417112538084771e-06, "loss": 0.4269, "num_input_tokens_seen": 165120000, "step": 1260, "train_runtime": 5502.8244, "train_tokens_per_second": 30006.409 }, { "epoch": 1.5408038976857492, "grad_norm": 1.015625, "learning_rate": 6.257982016893685e-06, "loss": 0.4197, "num_input_tokens_seen": 165775360, "step": 1265, "train_runtime": 5524.4904, "train_tokens_per_second": 30007.358 }, { "epoch": 1.5468940316686965, "grad_norm": 1.0859375, "learning_rate": 6.100566662996732e-06, "loss": 0.4407, "num_input_tokens_seen": 166430720, "step": 1270, "train_runtime": 5546.194, "train_tokens_per_second": 30008.096 }, { "epoch": 1.5529841656516443, "grad_norm": 1.0390625, "learning_rate": 5.944880882188786e-06, "loss": 0.4268, "num_input_tokens_seen": 167086080, "step": 1275, "train_runtime": 5567.9166, "train_tokens_per_second": 30008.725 }, { "epoch": 1.559074299634592, "grad_norm": 1.0625, "learning_rate": 5.790938921983608e-06, "loss": 0.4275, "num_input_tokens_seen": 167741440, "step": 1280, "train_runtime": 5589.613, "train_tokens_per_second": 30009.491 }, { "epoch": 1.5651644336175394, "grad_norm": 1.0703125, "learning_rate": 5.638754870310042e-06, "loss": 0.4291, "num_input_tokens_seen": 168396800, "step": 1285, "train_runtime": 5611.3139, "train_tokens_per_second": 30010.227 }, { "epoch": 1.5712545676004872, "grad_norm": 1.046875, "learning_rate": 5.488342654222695e-06, "loss": 0.4283, "num_input_tokens_seen": 169052160, "step": 1290, "train_runtime": 5633.0305, "train_tokens_per_second": 30010.872 }, { "epoch": 1.577344701583435, "grad_norm": 1.25, "learning_rate": 5.33971603862746e-06, "loss": 0.4282, "num_input_tokens_seen": 169707520, "step": 1295, "train_runtime": 5654.7734, "train_tokens_per_second": 30011.374 }, { "epoch": 1.5834348355663823, "grad_norm": 1.1015625, "learning_rate": 5.192888625021794e-06, "loss": 0.438, "num_input_tokens_seen": 170362880, "step": 1300, "train_runtime": 5676.6131, "train_tokens_per_second": 30011.36 }, { "epoch": 1.58952496954933, "grad_norm": 1.046875, "learning_rate": 5.047873850250012e-06, "loss": 0.4227, "num_input_tokens_seen": 171018240, "step": 1305, "train_runtime": 5698.3158, "train_tokens_per_second": 30012.068 }, { "epoch": 1.5956151035322779, "grad_norm": 1.03125, "learning_rate": 4.9046849852736085e-06, "loss": 0.4339, "num_input_tokens_seen": 171673600, "step": 1310, "train_runtime": 5720.0011, "train_tokens_per_second": 30012.862 }, { "epoch": 1.6017052375152252, "grad_norm": 1.09375, "learning_rate": 4.763335133956751e-06, "loss": 0.4233, "num_input_tokens_seen": 172328960, "step": 1315, "train_runtime": 5741.7125, "train_tokens_per_second": 30013.512 }, { "epoch": 1.607795371498173, "grad_norm": 1.140625, "learning_rate": 4.6238372318671175e-06, "loss": 0.4293, "num_input_tokens_seen": 172984320, "step": 1320, "train_runtime": 5763.4052, "train_tokens_per_second": 30014.256 }, { "epoch": 1.6138855054811205, "grad_norm": 1.078125, "learning_rate": 4.486204045092102e-06, "loss": 0.422, "num_input_tokens_seen": 173639680, "step": 1325, "train_runtime": 5785.115, "train_tokens_per_second": 30014.906 }, { "epoch": 1.619975639464068, "grad_norm": 1.015625, "learning_rate": 4.350448169070481e-06, "loss": 0.4234, "num_input_tokens_seen": 174295040, "step": 1330, "train_runtime": 5806.8095, "train_tokens_per_second": 30015.629 }, { "epoch": 1.6260657734470159, "grad_norm": 1.015625, "learning_rate": 4.2165820274398444e-06, "loss": 0.4258, "num_input_tokens_seen": 174950400, "step": 1335, "train_runtime": 5828.5149, "train_tokens_per_second": 30016.291 }, { "epoch": 1.6321559074299634, "grad_norm": 1.0234375, "learning_rate": 4.084617870899546e-06, "loss": 0.4212, "num_input_tokens_seen": 175605760, "step": 1340, "train_runtime": 5850.239, "train_tokens_per_second": 30016.852 }, { "epoch": 1.638246041412911, "grad_norm": 1.0546875, "learning_rate": 3.954567776089643e-06, "loss": 0.4218, "num_input_tokens_seen": 176261120, "step": 1345, "train_runtime": 5872.0406, "train_tokens_per_second": 30017.013 }, { "epoch": 1.6443361753958587, "grad_norm": 1.046875, "learning_rate": 3.826443644485731e-06, "loss": 0.4322, "num_input_tokens_seen": 176916480, "step": 1350, "train_runtime": 5893.7566, "train_tokens_per_second": 30017.609 }, { "epoch": 1.6504263093788063, "grad_norm": 1.046875, "learning_rate": 3.7002572013097147e-06, "loss": 0.4234, "num_input_tokens_seen": 177571840, "step": 1355, "train_runtime": 5915.4571, "train_tokens_per_second": 30018.279 }, { "epoch": 1.6565164433617539, "grad_norm": 1.09375, "learning_rate": 3.5760199944568418e-06, "loss": 0.4241, "num_input_tokens_seen": 178227200, "step": 1360, "train_runtime": 5937.1539, "train_tokens_per_second": 30018.963 }, { "epoch": 1.6626065773447016, "grad_norm": 1.109375, "learning_rate": 3.4537433934388798e-06, "loss": 0.4313, "num_input_tokens_seen": 178882560, "step": 1365, "train_runtime": 5958.8509, "train_tokens_per_second": 30019.64 }, { "epoch": 1.6686967113276492, "grad_norm": 1.0546875, "learning_rate": 3.333438588343624e-06, "loss": 0.4224, "num_input_tokens_seen": 179537920, "step": 1370, "train_runtime": 5980.5729, "train_tokens_per_second": 30020.187 }, { "epoch": 1.6747868453105967, "grad_norm": 1.1796875, "learning_rate": 3.2151165888108765e-06, "loss": 0.4228, "num_input_tokens_seen": 180193280, "step": 1375, "train_runtime": 6002.3024, "train_tokens_per_second": 30020.693 }, { "epoch": 1.6808769792935445, "grad_norm": 1.046875, "learning_rate": 3.0987882230248816e-06, "loss": 0.4335, "num_input_tokens_seen": 180848640, "step": 1380, "train_runtime": 6024.0561, "train_tokens_per_second": 30021.075 }, { "epoch": 1.686967113276492, "grad_norm": 1.078125, "learning_rate": 2.9844641367233834e-06, "loss": 0.4241, "num_input_tokens_seen": 181504000, "step": 1385, "train_runtime": 6045.7757, "train_tokens_per_second": 30021.623 }, { "epoch": 1.6930572472594396, "grad_norm": 1.0546875, "learning_rate": 2.8721547922234055e-06, "loss": 0.4206, "num_input_tokens_seen": 182159360, "step": 1390, "train_runtime": 6067.6225, "train_tokens_per_second": 30021.538 }, { "epoch": 1.6991473812423874, "grad_norm": 1.046875, "learning_rate": 2.761870467463784e-06, "loss": 0.4284, "num_input_tokens_seen": 182814720, "step": 1395, "train_runtime": 6089.3947, "train_tokens_per_second": 30021.822 }, { "epoch": 1.705237515225335, "grad_norm": 1.1171875, "learning_rate": 2.6536212550645977e-06, "loss": 0.4234, "num_input_tokens_seen": 183470080, "step": 1400, "train_runtime": 6111.127, "train_tokens_per_second": 30022.299 }, { "epoch": 1.7113276492082825, "grad_norm": 1.0703125, "learning_rate": 2.547417061403523e-06, "loss": 0.4351, "num_input_tokens_seen": 184125440, "step": 1405, "train_runtime": 6132.8723, "train_tokens_per_second": 30022.709 }, { "epoch": 1.7174177831912303, "grad_norm": 1.03125, "learning_rate": 2.4432676057092818e-06, "loss": 0.42, "num_input_tokens_seen": 184780800, "step": 1410, "train_runtime": 6154.6229, "train_tokens_per_second": 30023.091 }, { "epoch": 1.7235079171741778, "grad_norm": 1.0703125, "learning_rate": 2.3411824191721887e-06, "loss": 0.4214, "num_input_tokens_seen": 185436160, "step": 1415, "train_runtime": 6176.372, "train_tokens_per_second": 30023.476 }, { "epoch": 1.7295980511571254, "grad_norm": 1.0859375, "learning_rate": 2.24117084407188e-06, "loss": 0.4281, "num_input_tokens_seen": 186091520, "step": 1420, "train_runtime": 6198.2101, "train_tokens_per_second": 30023.429 }, { "epoch": 1.7356881851400732, "grad_norm": 1.0234375, "learning_rate": 2.143242032922396e-06, "loss": 0.4217, "num_input_tokens_seen": 186746880, "step": 1425, "train_runtime": 6219.929, "train_tokens_per_second": 30023.957 }, { "epoch": 1.7417783191230207, "grad_norm": 1.0546875, "learning_rate": 2.0474049476345737e-06, "loss": 0.4236, "num_input_tokens_seen": 187402240, "step": 1430, "train_runtime": 6241.6962, "train_tokens_per_second": 30024.249 }, { "epoch": 1.7478684531059683, "grad_norm": 1.0859375, "learning_rate": 1.953668358695901e-06, "loss": 0.4193, "num_input_tokens_seen": 188057600, "step": 1435, "train_runtime": 6263.445, "train_tokens_per_second": 30024.627 }, { "epoch": 1.753958587088916, "grad_norm": 1.0625, "learning_rate": 1.8620408443678904e-06, "loss": 0.4328, "num_input_tokens_seen": 188712960, "step": 1440, "train_runtime": 6285.263, "train_tokens_per_second": 30024.672 }, { "epoch": 1.7600487210718636, "grad_norm": 1.078125, "learning_rate": 1.7725307899010586e-06, "loss": 0.4322, "num_input_tokens_seen": 189368320, "step": 1445, "train_runtime": 6306.9813, "train_tokens_per_second": 30025.192 }, { "epoch": 1.7661388550548112, "grad_norm": 1.0625, "learning_rate": 1.6851463867675305e-06, "loss": 0.4276, "num_input_tokens_seen": 190023680, "step": 1450, "train_runtime": 6328.7332, "train_tokens_per_second": 30025.548 }, { "epoch": 1.772228989037759, "grad_norm": 1.0703125, "learning_rate": 1.599895631911405e-06, "loss": 0.4266, "num_input_tokens_seen": 190679040, "step": 1455, "train_runtime": 6350.4722, "train_tokens_per_second": 30025.962 }, { "epoch": 1.7783191230207065, "grad_norm": 1.046875, "learning_rate": 1.5167863270169448e-06, "loss": 0.4233, "num_input_tokens_seen": 191334400, "step": 1460, "train_runtime": 6372.1834, "train_tokens_per_second": 30026.506 }, { "epoch": 1.784409257003654, "grad_norm": 1.046875, "learning_rate": 1.435826077794572e-06, "loss": 0.4202, "num_input_tokens_seen": 191989760, "step": 1465, "train_runtime": 6393.9217, "train_tokens_per_second": 30026.918 }, { "epoch": 1.7904993909866018, "grad_norm": 1.09375, "learning_rate": 1.3570222932848514e-06, "loss": 0.429, "num_input_tokens_seen": 192645120, "step": 1470, "train_runtime": 6415.6575, "train_tokens_per_second": 30027.339 }, { "epoch": 1.7965895249695494, "grad_norm": 1.0546875, "learning_rate": 1.2803821851804677e-06, "loss": 0.4373, "num_input_tokens_seen": 193300480, "step": 1475, "train_runtime": 6437.4201, "train_tokens_per_second": 30027.632 }, { "epoch": 1.802679658952497, "grad_norm": 1.1953125, "learning_rate": 1.2059127671662285e-06, "loss": 0.4318, "num_input_tokens_seen": 193955840, "step": 1480, "train_runtime": 6459.1614, "train_tokens_per_second": 30028.022 }, { "epoch": 1.8087697929354447, "grad_norm": 1.046875, "learning_rate": 1.1336208542772147e-06, "loss": 0.4266, "num_input_tokens_seen": 194611200, "step": 1485, "train_runtime": 6480.9309, "train_tokens_per_second": 30028.279 }, { "epoch": 1.814859926918392, "grad_norm": 1.0234375, "learning_rate": 1.0635130622751343e-06, "loss": 0.4203, "num_input_tokens_seen": 195266560, "step": 1490, "train_runtime": 6502.6538, "train_tokens_per_second": 30028.749 }, { "epoch": 1.8209500609013398, "grad_norm": 1.078125, "learning_rate": 9.955958070428344e-07, "loss": 0.4189, "num_input_tokens_seen": 195921920, "step": 1495, "train_runtime": 6524.4119, "train_tokens_per_second": 30029.055 }, { "epoch": 1.8270401948842876, "grad_norm": 1.0703125, "learning_rate": 9.298753039971964e-07, "loss": 0.431, "num_input_tokens_seen": 196577280, "step": 1500, "train_runtime": 6546.1214, "train_tokens_per_second": 30029.58 }, { "epoch": 1.833130328867235, "grad_norm": 1.0390625, "learning_rate": 8.663575675203151e-07, "loss": 0.4204, "num_input_tokens_seen": 197232640, "step": 1505, "train_runtime": 6575.1193, "train_tokens_per_second": 29996.815 }, { "epoch": 1.8392204628501827, "grad_norm": 1.0859375, "learning_rate": 8.050484104090927e-07, "loss": 0.4226, "num_input_tokens_seen": 197888000, "step": 1510, "train_runtime": 6596.8218, "train_tokens_per_second": 29997.475 }, { "epoch": 1.8453105968331305, "grad_norm": 1.0625, "learning_rate": 7.459534433433085e-07, "loss": 0.4262, "num_input_tokens_seen": 198543360, "step": 1515, "train_runtime": 6618.5414, "train_tokens_per_second": 29998.054 }, { "epoch": 1.8514007308160778, "grad_norm": 1.0546875, "learning_rate": 6.890780743721209e-07, "loss": 0.4272, "num_input_tokens_seen": 199198720, "step": 1520, "train_runtime": 6640.1962, "train_tokens_per_second": 29998.921 }, { "epoch": 1.8574908647990256, "grad_norm": 1.0703125, "learning_rate": 6.344275084191886e-07, "loss": 0.4257, "num_input_tokens_seen": 199854080, "step": 1525, "train_runtime": 6661.9288, "train_tokens_per_second": 29999.432 }, { "epoch": 1.8635809987819734, "grad_norm": 1.0546875, "learning_rate": 5.820067468063212e-07, "loss": 0.4249, "num_input_tokens_seen": 200509440, "step": 1530, "train_runtime": 6683.6664, "train_tokens_per_second": 29999.917 }, { "epoch": 1.8696711327649207, "grad_norm": 1.0390625, "learning_rate": 5.318205867957893e-07, "loss": 0.4277, "num_input_tokens_seen": 201164800, "step": 1535, "train_runtime": 6705.4025, "train_tokens_per_second": 30000.406 }, { "epoch": 1.8757612667478685, "grad_norm": 1.1015625, "learning_rate": 4.838736211513233e-07, "loss": 0.4282, "num_input_tokens_seen": 201820160, "step": 1540, "train_runtime": 6727.1629, "train_tokens_per_second": 30000.784 }, { "epoch": 1.881851400730816, "grad_norm": 1.0859375, "learning_rate": 4.3817023771778596e-07, "loss": 0.4298, "num_input_tokens_seen": 202475520, "step": 1545, "train_runtime": 6749.1883, "train_tokens_per_second": 29999.981 }, { "epoch": 1.8879415347137636, "grad_norm": 1.0625, "learning_rate": 3.947146190196205e-07, "loss": 0.4261, "num_input_tokens_seen": 203130880, "step": 1550, "train_runtime": 6771.0741, "train_tokens_per_second": 29999.802 }, { "epoch": 1.8940316686967114, "grad_norm": 1.0546875, "learning_rate": 3.5351074187811586e-07, "loss": 0.4294, "num_input_tokens_seen": 203786240, "step": 1555, "train_runtime": 6793.1191, "train_tokens_per_second": 29998.921 }, { "epoch": 1.900121802679659, "grad_norm": 1.0859375, "learning_rate": 3.145623770474365e-07, "loss": 0.4286, "num_input_tokens_seen": 204441600, "step": 1560, "train_runtime": 6815.1261, "train_tokens_per_second": 29998.212 }, { "epoch": 1.9062119366626065, "grad_norm": 1.0703125, "learning_rate": 2.778730888695652e-07, "loss": 0.4269, "num_input_tokens_seen": 205096960, "step": 1565, "train_runtime": 6836.9748, "train_tokens_per_second": 29998.203 }, { "epoch": 1.9123020706455542, "grad_norm": 1.0546875, "learning_rate": 2.4344623494810814e-07, "loss": 0.4283, "num_input_tokens_seen": 205752320, "step": 1570, "train_runtime": 6859.0238, "train_tokens_per_second": 29997.318 }, { "epoch": 1.9183922046285018, "grad_norm": 1.0859375, "learning_rate": 2.1128496584102154e-07, "loss": 0.4308, "num_input_tokens_seen": 206407680, "step": 1575, "train_runtime": 6880.8878, "train_tokens_per_second": 29997.245 }, { "epoch": 1.9244823386114494, "grad_norm": 1.09375, "learning_rate": 1.8139222477229212e-07, "loss": 0.4435, "num_input_tokens_seen": 207063040, "step": 1580, "train_runtime": 6902.6722, "train_tokens_per_second": 29997.519 }, { "epoch": 1.9305724725943971, "grad_norm": 1.0546875, "learning_rate": 1.5377074736259155e-07, "loss": 0.427, "num_input_tokens_seen": 207718400, "step": 1585, "train_runtime": 6924.4409, "train_tokens_per_second": 29997.859 }, { "epoch": 1.9366626065773447, "grad_norm": 1.0625, "learning_rate": 1.2842306137892392e-07, "loss": 0.4222, "num_input_tokens_seen": 208373760, "step": 1590, "train_runtime": 6946.2191, "train_tokens_per_second": 29998.156 }, { "epoch": 1.9427527405602922, "grad_norm": 1.046875, "learning_rate": 1.0535148650330228e-07, "loss": 0.4234, "num_input_tokens_seen": 209029120, "step": 1595, "train_runtime": 6968.3008, "train_tokens_per_second": 29997.144 }, { "epoch": 1.94884287454324, "grad_norm": 1.1484375, "learning_rate": 8.455813412046042e-08, "loss": 0.4268, "num_input_tokens_seen": 209684480, "step": 1600, "train_runtime": 6990.0361, "train_tokens_per_second": 29997.625 }, { "epoch": 1.9549330085261876, "grad_norm": 1.078125, "learning_rate": 6.604490712463218e-08, "loss": 0.4307, "num_input_tokens_seen": 210339840, "step": 1605, "train_runtime": 7011.7811, "train_tokens_per_second": 29998.062 }, { "epoch": 1.9610231425091351, "grad_norm": 1.1015625, "learning_rate": 4.98134997454075e-08, "loss": 0.4234, "num_input_tokens_seen": 210995200, "step": 1610, "train_runtime": 7033.5419, "train_tokens_per_second": 29998.428 }, { "epoch": 1.967113276492083, "grad_norm": 1.078125, "learning_rate": 3.5865397392684244e-08, "loss": 0.4349, "num_input_tokens_seen": 211650560, "step": 1615, "train_runtime": 7055.3037, "train_tokens_per_second": 29998.788 }, { "epoch": 1.9732034104750305, "grad_norm": 1.0703125, "learning_rate": 2.420187652074357e-08, "loss": 0.4288, "num_input_tokens_seen": 212305920, "step": 1620, "train_runtime": 7077.0456, "train_tokens_per_second": 29999.23 }, { "epoch": 1.979293544457978, "grad_norm": 1.0859375, "learning_rate": 1.4824004511415634e-08, "loss": 0.4133, "num_input_tokens_seen": 212961280, "step": 1625, "train_runtime": 7098.7568, "train_tokens_per_second": 29999.799 }, { "epoch": 1.9853836784409258, "grad_norm": 1.078125, "learning_rate": 7.732639576413236e-09, "loss": 0.4267, "num_input_tokens_seen": 213616640, "step": 1630, "train_runtime": 7120.4765, "train_tokens_per_second": 30000.329 }, { "epoch": 1.9914738124238733, "grad_norm": 1.125, "learning_rate": 2.9284306787918937e-09, "loss": 0.4308, "num_input_tokens_seen": 214272000, "step": 1635, "train_runtime": 7142.2064, "train_tokens_per_second": 30000.813 }, { "epoch": 1.997563946406821, "grad_norm": 1.0703125, "learning_rate": 4.118174735529001e-10, "loss": 0.4244, "num_input_tokens_seen": 214927360, "step": 1640, "train_runtime": 7163.9456, "train_tokens_per_second": 30001.255 }, { "epoch": 2.0, "num_input_tokens_seen": 215158784, "step": 1642, "total_flos": 9.00950810931757e+17, "train_loss": 0.6156596739239872, "train_runtime": 7180.2112, "train_samples_per_second": 14.632, "train_steps_per_second": 0.229 } ], "logging_steps": 5, "max_steps": 1642, "num_input_tokens_seen": 215158784, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.00950810931757e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }