{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 1642,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0060901339829476245,
"grad_norm": 17.5,
"learning_rate": 4.9999267881610354e-05,
"loss": 6.1919,
"num_input_tokens_seen": 655360,
"step": 5,
"train_runtime": 22.3092,
"train_tokens_per_second": 29376.191
},
{
"epoch": 0.012180267965895249,
"grad_norm": 3.9375,
"learning_rate": 4.9996293724142536e-05,
"loss": 4.6261,
"num_input_tokens_seen": 1310720,
"step": 10,
"train_runtime": 44.0485,
"train_tokens_per_second": 29756.318
},
{
"epoch": 0.018270401948842874,
"grad_norm": 5.84375,
"learning_rate": 4.9991032042166476e-05,
"loss": 4.0276,
"num_input_tokens_seen": 1966080,
"step": 15,
"train_runtime": 65.7637,
"train_tokens_per_second": 29896.111
},
{
"epoch": 0.024360535931790498,
"grad_norm": 7.28125,
"learning_rate": 4.998348331720263e-05,
"loss": 3.5105,
"num_input_tokens_seen": 2621440,
"step": 20,
"train_runtime": 87.5389,
"train_tokens_per_second": 29945.994
},
{
"epoch": 0.030450669914738125,
"grad_norm": 5.46875,
"learning_rate": 4.997364824006915e-05,
"loss": 2.9731,
"num_input_tokens_seen": 3276800,
"step": 25,
"train_runtime": 109.5185,
"train_tokens_per_second": 29920.05
},
{
"epoch": 0.03654080389768575,
"grad_norm": 4.96875,
"learning_rate": 4.996152771081866e-05,
"loss": 2.5386,
"num_input_tokens_seen": 3932160,
"step": 30,
"train_runtime": 131.3684,
"train_tokens_per_second": 29932.308
},
{
"epoch": 0.04263093788063337,
"grad_norm": 4.0625,
"learning_rate": 4.9947122838655915e-05,
"loss": 2.1857,
"num_input_tokens_seen": 4587520,
"step": 35,
"train_runtime": 153.1526,
"train_tokens_per_second": 29953.909
},
{
"epoch": 0.048721071863580996,
"grad_norm": 5.03125,
"learning_rate": 4.993043494183627e-05,
"loss": 1.9064,
"num_input_tokens_seen": 5242880,
"step": 40,
"train_runtime": 174.9243,
"train_tokens_per_second": 29972.285
},
{
"epoch": 0.05481120584652863,
"grad_norm": 4.0,
"learning_rate": 4.9911465547545044e-05,
"loss": 1.7284,
"num_input_tokens_seen": 5898240,
"step": 45,
"train_runtime": 196.7433,
"train_tokens_per_second": 29979.364
},
{
"epoch": 0.06090133982947625,
"grad_norm": 3.453125,
"learning_rate": 4.989021639175778e-05,
"loss": 1.5783,
"num_input_tokens_seen": 6553600,
"step": 50,
"train_runtime": 218.5219,
"train_tokens_per_second": 29990.583
},
{
"epoch": 0.06699147381242387,
"grad_norm": 2.84375,
"learning_rate": 4.986668941908136e-05,
"loss": 1.4457,
"num_input_tokens_seen": 7208960,
"step": 55,
"train_runtime": 240.2767,
"train_tokens_per_second": 30002.749
},
{
"epoch": 0.0730816077953715,
"grad_norm": 2.6875,
"learning_rate": 4.9840886782576024e-05,
"loss": 1.3575,
"num_input_tokens_seen": 7864320,
"step": 60,
"train_runtime": 262.1559,
"train_tokens_per_second": 29998.643
},
{
"epoch": 0.07917174177831912,
"grad_norm": 2.390625,
"learning_rate": 4.981281084355839e-05,
"loss": 1.2639,
"num_input_tokens_seen": 8519680,
"step": 65,
"train_runtime": 283.9622,
"train_tokens_per_second": 30002.866
},
{
"epoch": 0.08526187576126674,
"grad_norm": 2.296875,
"learning_rate": 4.97824641713853e-05,
"loss": 1.2196,
"num_input_tokens_seen": 9175040,
"step": 70,
"train_runtime": 305.7342,
"train_tokens_per_second": 30009.862
},
{
"epoch": 0.09135200974421437,
"grad_norm": 2.25,
"learning_rate": 4.974984954321873e-05,
"loss": 1.1541,
"num_input_tokens_seen": 9830400,
"step": 75,
"train_runtime": 327.4637,
"train_tokens_per_second": 30019.82
},
{
"epoch": 0.09744214372716199,
"grad_norm": 2.3125,
"learning_rate": 4.971496994377163e-05,
"loss": 1.1022,
"num_input_tokens_seen": 10485760,
"step": 80,
"train_runtime": 349.2345,
"train_tokens_per_second": 30024.981
},
{
"epoch": 0.10353227771010962,
"grad_norm": 1.9140625,
"learning_rate": 4.967782856503473e-05,
"loss": 1.0584,
"num_input_tokens_seen": 11141120,
"step": 85,
"train_runtime": 370.9965,
"train_tokens_per_second": 30030.259
},
{
"epoch": 0.10962241169305725,
"grad_norm": 2.125,
"learning_rate": 4.963842880598453e-05,
"loss": 1.0431,
"num_input_tokens_seen": 11796480,
"step": 90,
"train_runtime": 392.6965,
"train_tokens_per_second": 30039.688
},
{
"epoch": 0.11571254567600488,
"grad_norm": 1.9375,
"learning_rate": 4.9596774272272115e-05,
"loss": 0.9951,
"num_input_tokens_seen": 12451840,
"step": 95,
"train_runtime": 414.5171,
"train_tokens_per_second": 30039.386
},
{
"epoch": 0.1218026796589525,
"grad_norm": 1.890625,
"learning_rate": 4.955286877589331e-05,
"loss": 0.9762,
"num_input_tokens_seen": 13107200,
"step": 100,
"train_runtime": 436.3192,
"train_tokens_per_second": 30040.394
},
{
"epoch": 0.1278928136419001,
"grad_norm": 1.75,
"learning_rate": 4.9506716334839756e-05,
"loss": 0.9444,
"num_input_tokens_seen": 13762560,
"step": 105,
"train_runtime": 458.1573,
"train_tokens_per_second": 30038.943
},
{
"epoch": 0.13398294762484775,
"grad_norm": 1.9453125,
"learning_rate": 4.945832117273118e-05,
"loss": 0.9425,
"num_input_tokens_seen": 14417920,
"step": 110,
"train_runtime": 479.9278,
"train_tokens_per_second": 30041.851
},
{
"epoch": 0.14007308160779536,
"grad_norm": 1.734375,
"learning_rate": 4.940768771842896e-05,
"loss": 0.907,
"num_input_tokens_seen": 15073280,
"step": 115,
"train_runtime": 501.7348,
"train_tokens_per_second": 30042.328
},
{
"epoch": 0.146163215590743,
"grad_norm": 1.8515625,
"learning_rate": 4.9354820605630745e-05,
"loss": 0.8877,
"num_input_tokens_seen": 15728640,
"step": 120,
"train_runtime": 523.4795,
"train_tokens_per_second": 30046.335
},
{
"epoch": 0.15225334957369063,
"grad_norm": 1.78125,
"learning_rate": 4.929972467244645e-05,
"loss": 0.9025,
"num_input_tokens_seen": 16384000,
"step": 125,
"train_runtime": 545.2706,
"train_tokens_per_second": 30047.468
},
{
"epoch": 0.15834348355663824,
"grad_norm": 1.6796875,
"learning_rate": 4.9242404960955456e-05,
"loss": 0.8531,
"num_input_tokens_seen": 17039360,
"step": 130,
"train_runtime": 567.0521,
"train_tokens_per_second": 30049.021
},
{
"epoch": 0.16443361753958588,
"grad_norm": 1.703125,
"learning_rate": 4.918286671674523e-05,
"loss": 0.8443,
"num_input_tokens_seen": 17694720,
"step": 135,
"train_runtime": 588.838,
"train_tokens_per_second": 30050.236
},
{
"epoch": 0.1705237515225335,
"grad_norm": 1.6953125,
"learning_rate": 4.912111538843124e-05,
"loss": 0.8392,
"num_input_tokens_seen": 18350080,
"step": 140,
"train_runtime": 610.6656,
"train_tokens_per_second": 30049.309
},
{
"epoch": 0.17661388550548113,
"grad_norm": 1.65625,
"learning_rate": 4.905715662715835e-05,
"loss": 0.8256,
"num_input_tokens_seen": 19005440,
"step": 145,
"train_runtime": 632.4494,
"train_tokens_per_second": 30050.53
},
{
"epoch": 0.18270401948842874,
"grad_norm": 1.6953125,
"learning_rate": 4.899099628608365e-05,
"loss": 0.819,
"num_input_tokens_seen": 19660800,
"step": 150,
"train_runtime": 654.24,
"train_tokens_per_second": 30051.355
},
{
"epoch": 0.18879415347137637,
"grad_norm": 1.546875,
"learning_rate": 4.8922640419840826e-05,
"loss": 0.8083,
"num_input_tokens_seen": 20316160,
"step": 155,
"train_runtime": 675.9786,
"train_tokens_per_second": 30054.443
},
{
"epoch": 0.19488428745432398,
"grad_norm": 1.6484375,
"learning_rate": 4.885209528398603e-05,
"loss": 0.7974,
"num_input_tokens_seen": 20971520,
"step": 160,
"train_runtime": 697.7598,
"train_tokens_per_second": 30055.5
},
{
"epoch": 0.20097442143727162,
"grad_norm": 1.7109375,
"learning_rate": 4.8779367334425466e-05,
"loss": 0.7856,
"num_input_tokens_seen": 21626880,
"step": 165,
"train_runtime": 719.5124,
"train_tokens_per_second": 30057.687
},
{
"epoch": 0.20706455542021923,
"grad_norm": 1.5078125,
"learning_rate": 4.87044632268245e-05,
"loss": 0.7741,
"num_input_tokens_seen": 22282240,
"step": 170,
"train_runtime": 741.2635,
"train_tokens_per_second": 30059.812
},
{
"epoch": 0.21315468940316687,
"grad_norm": 1.6171875,
"learning_rate": 4.8627389815998654e-05,
"loss": 0.762,
"num_input_tokens_seen": 22937600,
"step": 175,
"train_runtime": 763.0585,
"train_tokens_per_second": 30060.082
},
{
"epoch": 0.2192448233861145,
"grad_norm": 1.65625,
"learning_rate": 4.854815415528624e-05,
"loss": 0.7554,
"num_input_tokens_seen": 23592960,
"step": 180,
"train_runtime": 784.8921,
"train_tokens_per_second": 30058.857
},
{
"epoch": 0.22533495736906212,
"grad_norm": 1.5859375,
"learning_rate": 4.8466763495902886e-05,
"loss": 0.7566,
"num_input_tokens_seen": 24248320,
"step": 185,
"train_runtime": 806.6931,
"train_tokens_per_second": 30058.917
},
{
"epoch": 0.23142509135200975,
"grad_norm": 1.609375,
"learning_rate": 4.838322528627796e-05,
"loss": 0.7454,
"num_input_tokens_seen": 24903680,
"step": 190,
"train_runtime": 828.4828,
"train_tokens_per_second": 30059.381
},
{
"epoch": 0.23751522533495736,
"grad_norm": 1.546875,
"learning_rate": 4.829754717137291e-05,
"loss": 0.7475,
"num_input_tokens_seen": 25559040,
"step": 195,
"train_runtime": 850.3012,
"train_tokens_per_second": 30058.808
},
{
"epoch": 0.243605359317905,
"grad_norm": 1.5390625,
"learning_rate": 4.820973699198164e-05,
"loss": 0.7259,
"num_input_tokens_seen": 26214400,
"step": 200,
"train_runtime": 872.1297,
"train_tokens_per_second": 30057.916
},
{
"epoch": 0.2496954933008526,
"grad_norm": 1.4765625,
"learning_rate": 4.811980278401299e-05,
"loss": 0.7284,
"num_input_tokens_seen": 26869760,
"step": 205,
"train_runtime": 893.9692,
"train_tokens_per_second": 30056.694
},
{
"epoch": 0.2557856272838002,
"grad_norm": 1.4921875,
"learning_rate": 4.802775277775529e-05,
"loss": 0.7169,
"num_input_tokens_seen": 27525120,
"step": 210,
"train_runtime": 915.7965,
"train_tokens_per_second": 30055.935
},
{
"epoch": 0.2618757612667479,
"grad_norm": 1.5,
"learning_rate": 4.793359539712322e-05,
"loss": 0.7164,
"num_input_tokens_seen": 28180480,
"step": 215,
"train_runtime": 937.6276,
"train_tokens_per_second": 30055.088
},
{
"epoch": 0.2679658952496955,
"grad_norm": 1.3984375,
"learning_rate": 4.783733925888685e-05,
"loss": 0.7133,
"num_input_tokens_seen": 28835840,
"step": 220,
"train_runtime": 959.4192,
"train_tokens_per_second": 30055.516
},
{
"epoch": 0.2740560292326431,
"grad_norm": 1.5078125,
"learning_rate": 4.773899317188311e-05,
"loss": 0.7116,
"num_input_tokens_seen": 29491200,
"step": 225,
"train_runtime": 981.209,
"train_tokens_per_second": 30055.981
},
{
"epoch": 0.2801461632155907,
"grad_norm": 1.4140625,
"learning_rate": 4.763856613620965e-05,
"loss": 0.7029,
"num_input_tokens_seen": 30146560,
"step": 230,
"train_runtime": 1002.9822,
"train_tokens_per_second": 30056.926
},
{
"epoch": 0.2862362971985384,
"grad_norm": 1.4375,
"learning_rate": 4.7536067342401194e-05,
"loss": 0.6875,
"num_input_tokens_seen": 30801920,
"step": 235,
"train_runtime": 1024.7415,
"train_tokens_per_second": 30058.234
},
{
"epoch": 0.292326431181486,
"grad_norm": 1.625,
"learning_rate": 4.7431506170588456e-05,
"loss": 0.6949,
"num_input_tokens_seen": 31457280,
"step": 240,
"train_runtime": 1046.5332,
"train_tokens_per_second": 30058.558
},
{
"epoch": 0.2984165651644336,
"grad_norm": 1.5078125,
"learning_rate": 4.732489218963978e-05,
"loss": 0.6828,
"num_input_tokens_seen": 32112640,
"step": 245,
"train_runtime": 1068.248,
"train_tokens_per_second": 30061.035
},
{
"epoch": 0.30450669914738127,
"grad_norm": 1.4453125,
"learning_rate": 4.721623515628537e-05,
"loss": 0.6958,
"num_input_tokens_seen": 32768000,
"step": 250,
"train_runtime": 1090.0205,
"train_tokens_per_second": 30061.819
},
{
"epoch": 0.3105968331303289,
"grad_norm": 1.3671875,
"learning_rate": 4.710554501422447e-05,
"loss": 0.6947,
"num_input_tokens_seen": 33423360,
"step": 255,
"train_runtime": 1111.7983,
"train_tokens_per_second": 30062.432
},
{
"epoch": 0.3166869671132765,
"grad_norm": 1.546875,
"learning_rate": 4.6992831893215325e-05,
"loss": 0.6836,
"num_input_tokens_seen": 34078720,
"step": 260,
"train_runtime": 1133.6327,
"train_tokens_per_second": 30061.518
},
{
"epoch": 0.3227771010962241,
"grad_norm": 1.390625,
"learning_rate": 4.6878106108148215e-05,
"loss": 0.6701,
"num_input_tokens_seen": 34734080,
"step": 265,
"train_runtime": 1155.4501,
"train_tokens_per_second": 30061.081
},
{
"epoch": 0.32886723507917176,
"grad_norm": 1.3203125,
"learning_rate": 4.676137815810142e-05,
"loss": 0.6729,
"num_input_tokens_seen": 35389440,
"step": 270,
"train_runtime": 1177.285,
"train_tokens_per_second": 30060.214
},
{
"epoch": 0.33495736906211937,
"grad_norm": 1.3984375,
"learning_rate": 4.664265872538048e-05,
"loss": 0.6687,
"num_input_tokens_seen": 36044800,
"step": 275,
"train_runtime": 1199.1576,
"train_tokens_per_second": 30058.434
},
{
"epoch": 0.341047503045067,
"grad_norm": 1.4765625,
"learning_rate": 4.6521958674540554e-05,
"loss": 0.669,
"num_input_tokens_seen": 36700160,
"step": 280,
"train_runtime": 1220.9936,
"train_tokens_per_second": 30057.62
},
{
"epoch": 0.3471376370280146,
"grad_norm": 1.375,
"learning_rate": 4.639928905139216e-05,
"loss": 0.6637,
"num_input_tokens_seen": 37355520,
"step": 285,
"train_runtime": 1242.9648,
"train_tokens_per_second": 30053.563
},
{
"epoch": 0.35322777101096225,
"grad_norm": 1.3359375,
"learning_rate": 4.627466108199037e-05,
"loss": 0.659,
"num_input_tokens_seen": 38010880,
"step": 290,
"train_runtime": 1264.7781,
"train_tokens_per_second": 30053.398
},
{
"epoch": 0.35931790499390986,
"grad_norm": 1.3671875,
"learning_rate": 4.614808617160737e-05,
"loss": 0.6573,
"num_input_tokens_seen": 38666240,
"step": 295,
"train_runtime": 1286.5591,
"train_tokens_per_second": 30053.995
},
{
"epoch": 0.3654080389768575,
"grad_norm": 1.359375,
"learning_rate": 4.601957590368884e-05,
"loss": 0.6545,
"num_input_tokens_seen": 39321600,
"step": 300,
"train_runtime": 1308.3044,
"train_tokens_per_second": 30055.39
},
{
"epoch": 0.37149817295980514,
"grad_norm": 1.3046875,
"learning_rate": 4.5889142038793766e-05,
"loss": 0.6364,
"num_input_tokens_seen": 39976960,
"step": 305,
"train_runtime": 1330.1444,
"train_tokens_per_second": 30054.601
},
{
"epoch": 0.37758830694275275,
"grad_norm": 1.3125,
"learning_rate": 4.5756796513518276e-05,
"loss": 0.6487,
"num_input_tokens_seen": 40632320,
"step": 310,
"train_runtime": 1351.8973,
"train_tokens_per_second": 30055.773
},
{
"epoch": 0.38367844092570036,
"grad_norm": 1.34375,
"learning_rate": 4.5622551439403226e-05,
"loss": 0.6375,
"num_input_tokens_seen": 41287680,
"step": 315,
"train_runtime": 1373.641,
"train_tokens_per_second": 30057.111
},
{
"epoch": 0.38976857490864797,
"grad_norm": 1.4609375,
"learning_rate": 4.548641910182582e-05,
"loss": 0.6449,
"num_input_tokens_seen": 41943040,
"step": 320,
"train_runtime": 1395.3852,
"train_tokens_per_second": 30058.395
},
{
"epoch": 0.39585870889159563,
"grad_norm": 1.3203125,
"learning_rate": 4.534841195887531e-05,
"loss": 0.6377,
"num_input_tokens_seen": 42598400,
"step": 325,
"train_runtime": 1417.1617,
"train_tokens_per_second": 30058.955
},
{
"epoch": 0.40194884287454324,
"grad_norm": 1.3671875,
"learning_rate": 4.520854264021296e-05,
"loss": 0.6312,
"num_input_tokens_seen": 43253760,
"step": 330,
"train_runtime": 1439.0046,
"train_tokens_per_second": 30058.111
},
{
"epoch": 0.40803897685749085,
"grad_norm": 1.2890625,
"learning_rate": 4.506682394591614e-05,
"loss": 0.625,
"num_input_tokens_seen": 43909120,
"step": 335,
"train_runtime": 1460.7409,
"train_tokens_per_second": 30059.485
},
{
"epoch": 0.41412911084043846,
"grad_norm": 1.375,
"learning_rate": 4.492326884530705e-05,
"loss": 0.6168,
"num_input_tokens_seen": 44564480,
"step": 340,
"train_runtime": 1482.5072,
"train_tokens_per_second": 30060.212
},
{
"epoch": 0.42021924482338613,
"grad_norm": 1.2890625,
"learning_rate": 4.477789047576574e-05,
"loss": 0.6228,
"num_input_tokens_seen": 45219840,
"step": 345,
"train_runtime": 1504.2816,
"train_tokens_per_second": 30060.754
},
{
"epoch": 0.42630937880633374,
"grad_norm": 1.390625,
"learning_rate": 4.463070214152791e-05,
"loss": 0.62,
"num_input_tokens_seen": 45875200,
"step": 350,
"train_runtime": 1526.0478,
"train_tokens_per_second": 30061.443
},
{
"epoch": 0.43239951278928135,
"grad_norm": 1.359375,
"learning_rate": 4.448171731246736e-05,
"loss": 0.625,
"num_input_tokens_seen": 46530560,
"step": 355,
"train_runtime": 1547.7838,
"train_tokens_per_second": 30062.7
},
{
"epoch": 0.438489646772229,
"grad_norm": 1.53125,
"learning_rate": 4.4330949622863306e-05,
"loss": 0.6146,
"num_input_tokens_seen": 47185920,
"step": 360,
"train_runtime": 1569.5543,
"train_tokens_per_second": 30063.261
},
{
"epoch": 0.4445797807551766,
"grad_norm": 1.2265625,
"learning_rate": 4.417841287015263e-05,
"loss": 0.6044,
"num_input_tokens_seen": 47841280,
"step": 365,
"train_runtime": 1591.3459,
"train_tokens_per_second": 30063.407
},
{
"epoch": 0.45066991473812423,
"grad_norm": 1.2578125,
"learning_rate": 4.402412101366722e-05,
"loss": 0.6129,
"num_input_tokens_seen": 48496640,
"step": 370,
"train_runtime": 1613.1368,
"train_tokens_per_second": 30063.563
},
{
"epoch": 0.45676004872107184,
"grad_norm": 1.75,
"learning_rate": 4.38680881733565e-05,
"loss": 0.6078,
"num_input_tokens_seen": 49152000,
"step": 375,
"train_runtime": 1634.9595,
"train_tokens_per_second": 30063.131
},
{
"epoch": 0.4628501827040195,
"grad_norm": 1.34375,
"learning_rate": 4.371032862849525e-05,
"loss": 0.606,
"num_input_tokens_seen": 49807360,
"step": 380,
"train_runtime": 1656.7683,
"train_tokens_per_second": 30062.96
},
{
"epoch": 0.4689403166869671,
"grad_norm": 1.453125,
"learning_rate": 4.3550856816376815e-05,
"loss": 0.6063,
"num_input_tokens_seen": 50462720,
"step": 385,
"train_runtime": 1678.5702,
"train_tokens_per_second": 30062.919
},
{
"epoch": 0.47503045066991473,
"grad_norm": 1.3125,
"learning_rate": 4.3389687330991914e-05,
"loss": 0.6039,
"num_input_tokens_seen": 51118080,
"step": 390,
"train_runtime": 1700.3779,
"train_tokens_per_second": 30062.776
},
{
"epoch": 0.48112058465286234,
"grad_norm": 1.2421875,
"learning_rate": 4.3226834921693064e-05,
"loss": 0.5973,
"num_input_tokens_seen": 51773440,
"step": 395,
"train_runtime": 1722.1624,
"train_tokens_per_second": 30063.042
},
{
"epoch": 0.48721071863581,
"grad_norm": 1.3046875,
"learning_rate": 4.306231449184481e-05,
"loss": 0.5986,
"num_input_tokens_seen": 52428800,
"step": 400,
"train_runtime": 1743.9777,
"train_tokens_per_second": 30062.77
},
{
"epoch": 0.4933008526187576,
"grad_norm": 1.265625,
"learning_rate": 4.289614109745984e-05,
"loss": 0.5919,
"num_input_tokens_seen": 53084160,
"step": 405,
"train_runtime": 1765.7704,
"train_tokens_per_second": 30062.889
},
{
"epoch": 0.4993909866017052,
"grad_norm": 1.2734375,
"learning_rate": 4.272832994582112e-05,
"loss": 0.6017,
"num_input_tokens_seen": 53739520,
"step": 410,
"train_runtime": 1787.5322,
"train_tokens_per_second": 30063.526
},
{
"epoch": 0.5054811205846529,
"grad_norm": 1.3125,
"learning_rate": 4.255889639409028e-05,
"loss": 0.5838,
"num_input_tokens_seen": 54394880,
"step": 415,
"train_runtime": 1809.3305,
"train_tokens_per_second": 30063.54
},
{
"epoch": 0.5115712545676004,
"grad_norm": 1.34375,
"learning_rate": 4.23878559479021e-05,
"loss": 0.5906,
"num_input_tokens_seen": 55050240,
"step": 420,
"train_runtime": 1831.1085,
"train_tokens_per_second": 30063.888
},
{
"epoch": 0.5176613885505481,
"grad_norm": 1.875,
"learning_rate": 4.221522425994563e-05,
"loss": 0.5879,
"num_input_tokens_seen": 55705600,
"step": 425,
"train_runtime": 1852.9092,
"train_tokens_per_second": 30063.859
},
{
"epoch": 0.5237515225334958,
"grad_norm": 1.3515625,
"learning_rate": 4.2041017128531665e-05,
"loss": 0.5895,
"num_input_tokens_seen": 56360960,
"step": 430,
"train_runtime": 1874.732,
"train_tokens_per_second": 30063.476
},
{
"epoch": 0.5298416565164433,
"grad_norm": 1.359375,
"learning_rate": 4.186525049614699e-05,
"loss": 0.5794,
"num_input_tokens_seen": 57016320,
"step": 435,
"train_runtime": 1896.6145,
"train_tokens_per_second": 30062.155
},
{
"epoch": 0.535931790499391,
"grad_norm": 1.2734375,
"learning_rate": 4.168794044799544e-05,
"loss": 0.5833,
"num_input_tokens_seen": 57671680,
"step": 440,
"train_runtime": 1918.4108,
"train_tokens_per_second": 30062.216
},
{
"epoch": 0.5420219244823387,
"grad_norm": 1.1875,
"learning_rate": 4.150910321052584e-05,
"loss": 0.5748,
"num_input_tokens_seen": 58327040,
"step": 445,
"train_runtime": 1940.1777,
"train_tokens_per_second": 30062.731
},
{
"epoch": 0.5481120584652862,
"grad_norm": 1.2578125,
"learning_rate": 4.132875514994701e-05,
"loss": 0.5874,
"num_input_tokens_seen": 58982400,
"step": 450,
"train_runtime": 1961.9296,
"train_tokens_per_second": 30063.464
},
{
"epoch": 0.5542021924482339,
"grad_norm": 1.2265625,
"learning_rate": 4.114691277073013e-05,
"loss": 0.5807,
"num_input_tokens_seen": 59637760,
"step": 455,
"train_runtime": 1983.7172,
"train_tokens_per_second": 30063.639
},
{
"epoch": 0.5602923264311814,
"grad_norm": 1.265625,
"learning_rate": 4.096359271409822e-05,
"loss": 0.5897,
"num_input_tokens_seen": 60293120,
"step": 460,
"train_runtime": 2005.5211,
"train_tokens_per_second": 30063.569
},
{
"epoch": 0.5663824604141291,
"grad_norm": 1.2109375,
"learning_rate": 4.077881175650332e-05,
"loss": 0.5829,
"num_input_tokens_seen": 60948480,
"step": 465,
"train_runtime": 2027.4851,
"train_tokens_per_second": 30061.124
},
{
"epoch": 0.5724725943970768,
"grad_norm": 1.84375,
"learning_rate": 4.059258680809114e-05,
"loss": 0.568,
"num_input_tokens_seen": 61603840,
"step": 470,
"train_runtime": 2049.3409,
"train_tokens_per_second": 30060.319
},
{
"epoch": 0.5785627283800243,
"grad_norm": 1.9453125,
"learning_rate": 4.040493491115355e-05,
"loss": 0.5716,
"num_input_tokens_seen": 62259200,
"step": 475,
"train_runtime": 2071.288,
"train_tokens_per_second": 30058.205
},
{
"epoch": 0.584652862362972,
"grad_norm": 1.3671875,
"learning_rate": 4.0215873238568986e-05,
"loss": 0.577,
"num_input_tokens_seen": 62914560,
"step": 480,
"train_runtime": 2093.1653,
"train_tokens_per_second": 30057.139
},
{
"epoch": 0.5907429963459196,
"grad_norm": 1.25,
"learning_rate": 4.002541909223084e-05,
"loss": 0.5727,
"num_input_tokens_seen": 63569920,
"step": 485,
"train_runtime": 2115.077,
"train_tokens_per_second": 30055.605
},
{
"epoch": 0.5968331303288672,
"grad_norm": 1.234375,
"learning_rate": 3.983358990146415e-05,
"loss": 0.5732,
"num_input_tokens_seen": 64225280,
"step": 490,
"train_runtime": 2137.0427,
"train_tokens_per_second": 30053.344
},
{
"epoch": 0.6029232643118149,
"grad_norm": 1.1796875,
"learning_rate": 3.964040322143049e-05,
"loss": 0.5649,
"num_input_tokens_seen": 64880640,
"step": 495,
"train_runtime": 2158.8505,
"train_tokens_per_second": 30053.327
},
{
"epoch": 0.6090133982947625,
"grad_norm": 1.1796875,
"learning_rate": 3.9445876731521433e-05,
"loss": 0.5743,
"num_input_tokens_seen": 65536000,
"step": 500,
"train_runtime": 2180.692,
"train_tokens_per_second": 30052.846
},
{
"epoch": 0.6151035322777101,
"grad_norm": 1.1953125,
"learning_rate": 3.925002823374071e-05,
"loss": 0.5682,
"num_input_tokens_seen": 66191360,
"step": 505,
"train_runtime": 2210.8877,
"train_tokens_per_second": 29938.816
},
{
"epoch": 0.6211936662606578,
"grad_norm": 1.2578125,
"learning_rate": 3.9052875651074936e-05,
"loss": 0.5651,
"num_input_tokens_seen": 66846720,
"step": 510,
"train_runtime": 2232.8769,
"train_tokens_per_second": 29937.486
},
{
"epoch": 0.6272838002436053,
"grad_norm": 1.34375,
"learning_rate": 3.8854437025853505e-05,
"loss": 0.5674,
"num_input_tokens_seen": 67502080,
"step": 515,
"train_runtime": 2254.7216,
"train_tokens_per_second": 29938.1
},
{
"epoch": 0.633373934226553,
"grad_norm": 1.28125,
"learning_rate": 3.86547305180974e-05,
"loss": 0.5636,
"num_input_tokens_seen": 68157440,
"step": 520,
"train_runtime": 2276.679,
"train_tokens_per_second": 29937.22
},
{
"epoch": 0.6394640682095006,
"grad_norm": 1.46875,
"learning_rate": 3.845377440385731e-05,
"loss": 0.5706,
"num_input_tokens_seen": 68812800,
"step": 525,
"train_runtime": 2298.5683,
"train_tokens_per_second": 29937.244
},
{
"epoch": 0.6455542021924482,
"grad_norm": 1.1875,
"learning_rate": 3.825158707354108e-05,
"loss": 0.5576,
"num_input_tokens_seen": 69468160,
"step": 530,
"train_runtime": 2320.5639,
"train_tokens_per_second": 29935.897
},
{
"epoch": 0.6516443361753959,
"grad_norm": 1.640625,
"learning_rate": 3.8048187030230745e-05,
"loss": 0.5558,
"num_input_tokens_seen": 70123520,
"step": 535,
"train_runtime": 2342.5476,
"train_tokens_per_second": 29934.726
},
{
"epoch": 0.6577344701583435,
"grad_norm": 1.296875,
"learning_rate": 3.784359288798921e-05,
"loss": 0.5547,
"num_input_tokens_seen": 70778880,
"step": 540,
"train_runtime": 2364.3333,
"train_tokens_per_second": 29936.084
},
{
"epoch": 0.6638246041412911,
"grad_norm": 1.328125,
"learning_rate": 3.763782337015683e-05,
"loss": 0.5675,
"num_input_tokens_seen": 71434240,
"step": 545,
"train_runtime": 2386.1478,
"train_tokens_per_second": 29937.056
},
{
"epoch": 0.6699147381242387,
"grad_norm": 1.15625,
"learning_rate": 3.743089730763792e-05,
"loss": 0.5597,
"num_input_tokens_seen": 72089600,
"step": 550,
"train_runtime": 2407.9479,
"train_tokens_per_second": 29938.189
},
{
"epoch": 0.6760048721071864,
"grad_norm": 1.1953125,
"learning_rate": 3.722283363717743e-05,
"loss": 0.5529,
"num_input_tokens_seen": 72744960,
"step": 555,
"train_runtime": 2429.7703,
"train_tokens_per_second": 29939.027
},
{
"epoch": 0.682095006090134,
"grad_norm": 1.171875,
"learning_rate": 3.7013651399628004e-05,
"loss": 0.5622,
"num_input_tokens_seen": 73400320,
"step": 560,
"train_runtime": 2451.4626,
"train_tokens_per_second": 29941.44
},
{
"epoch": 0.6881851400730816,
"grad_norm": 1.21875,
"learning_rate": 3.6803369738207444e-05,
"loss": 0.5582,
"num_input_tokens_seen": 74055680,
"step": 565,
"train_runtime": 2473.2169,
"train_tokens_per_second": 29943.06
},
{
"epoch": 0.6942752740560292,
"grad_norm": 1.25,
"learning_rate": 3.6592007896746846e-05,
"loss": 0.551,
"num_input_tokens_seen": 74711040,
"step": 570,
"train_runtime": 2494.9949,
"train_tokens_per_second": 29944.366
},
{
"epoch": 0.7003654080389768,
"grad_norm": 1.1875,
"learning_rate": 3.6379585217929474e-05,
"loss": 0.5601,
"num_input_tokens_seen": 75366400,
"step": 575,
"train_runtime": 2516.7415,
"train_tokens_per_second": 29946.024
},
{
"epoch": 0.7064555420219245,
"grad_norm": 1.171875,
"learning_rate": 3.6166121141520655e-05,
"loss": 0.5487,
"num_input_tokens_seen": 76021760,
"step": 580,
"train_runtime": 2538.5185,
"train_tokens_per_second": 29947.294
},
{
"epoch": 0.7125456760048721,
"grad_norm": 1.234375,
"learning_rate": 3.595163520258873e-05,
"loss": 0.5604,
"num_input_tokens_seen": 76677120,
"step": 585,
"train_runtime": 2560.24,
"train_tokens_per_second": 29949.192
},
{
"epoch": 0.7186358099878197,
"grad_norm": 1.34375,
"learning_rate": 3.573614702971735e-05,
"loss": 0.5521,
"num_input_tokens_seen": 77332480,
"step": 590,
"train_runtime": 2582.0217,
"train_tokens_per_second": 29950.36
},
{
"epoch": 0.7247259439707674,
"grad_norm": 1.1875,
"learning_rate": 3.551967634320911e-05,
"loss": 0.5472,
"num_input_tokens_seen": 77987840,
"step": 595,
"train_runtime": 2603.7692,
"train_tokens_per_second": 29951.902
},
{
"epoch": 0.730816077953715,
"grad_norm": 1.2734375,
"learning_rate": 3.530224295328096e-05,
"loss": 0.5447,
"num_input_tokens_seen": 78643200,
"step": 600,
"train_runtime": 2625.5133,
"train_tokens_per_second": 29953.457
},
{
"epoch": 0.7369062119366626,
"grad_norm": 1.21875,
"learning_rate": 3.508386675825116e-05,
"loss": 0.5441,
"num_input_tokens_seen": 79298560,
"step": 605,
"train_runtime": 2647.2508,
"train_tokens_per_second": 29955.061
},
{
"epoch": 0.7429963459196103,
"grad_norm": 1.1328125,
"learning_rate": 3.486456774271837e-05,
"loss": 0.5417,
"num_input_tokens_seen": 79953920,
"step": 610,
"train_runtime": 2668.9717,
"train_tokens_per_second": 29956.825
},
{
"epoch": 0.7490864799025578,
"grad_norm": 1.1953125,
"learning_rate": 3.464436597573276e-05,
"loss": 0.5495,
"num_input_tokens_seen": 80609280,
"step": 615,
"train_runtime": 2690.7063,
"train_tokens_per_second": 29958.409
},
{
"epoch": 0.7551766138855055,
"grad_norm": 1.2890625,
"learning_rate": 3.4423281608959376e-05,
"loss": 0.5388,
"num_input_tokens_seen": 81264640,
"step": 620,
"train_runtime": 2712.4728,
"train_tokens_per_second": 29959.615
},
{
"epoch": 0.761266747868453,
"grad_norm": 1.171875,
"learning_rate": 3.420133487483402e-05,
"loss": 0.5358,
"num_input_tokens_seen": 81920000,
"step": 625,
"train_runtime": 2734.2134,
"train_tokens_per_second": 29961.085
},
{
"epoch": 0.7673568818514007,
"grad_norm": 1.1484375,
"learning_rate": 3.3978546084711595e-05,
"loss": 0.5433,
"num_input_tokens_seen": 82575360,
"step": 630,
"train_runtime": 2755.9692,
"train_tokens_per_second": 29962.367
},
{
"epoch": 0.7734470158343484,
"grad_norm": 1.15625,
"learning_rate": 3.375493562700742e-05,
"loss": 0.5464,
"num_input_tokens_seen": 83230720,
"step": 635,
"train_runtime": 2777.7109,
"train_tokens_per_second": 29963.78
},
{
"epoch": 0.7795371498172959,
"grad_norm": 1.21875,
"learning_rate": 3.353052396533133e-05,
"loss": 0.5404,
"num_input_tokens_seen": 83886080,
"step": 640,
"train_runtime": 2799.4496,
"train_tokens_per_second": 29965.204
},
{
"epoch": 0.7856272838002436,
"grad_norm": 1.1875,
"learning_rate": 3.330533163661501e-05,
"loss": 0.5427,
"num_input_tokens_seen": 84541440,
"step": 645,
"train_runtime": 2821.1758,
"train_tokens_per_second": 29966.739
},
{
"epoch": 0.7917174177831913,
"grad_norm": 1.1875,
"learning_rate": 3.3079379249232475e-05,
"loss": 0.5393,
"num_input_tokens_seen": 85196800,
"step": 650,
"train_runtime": 2842.9286,
"train_tokens_per_second": 29967.971
},
{
"epoch": 0.7978075517661388,
"grad_norm": 1.1328125,
"learning_rate": 3.2852687481114235e-05,
"loss": 0.5404,
"num_input_tokens_seen": 85852160,
"step": 655,
"train_runtime": 2864.6768,
"train_tokens_per_second": 29969.23
},
{
"epoch": 0.8038976857490865,
"grad_norm": 1.6171875,
"learning_rate": 3.2625277077854855e-05,
"loss": 0.5407,
"num_input_tokens_seen": 86507520,
"step": 660,
"train_runtime": 2886.4158,
"train_tokens_per_second": 29970.567
},
{
"epoch": 0.8099878197320342,
"grad_norm": 1.09375,
"learning_rate": 3.239716885081446e-05,
"loss": 0.5304,
"num_input_tokens_seen": 87162880,
"step": 665,
"train_runtime": 2908.1158,
"train_tokens_per_second": 29972.287
},
{
"epoch": 0.8160779537149817,
"grad_norm": 1.2109375,
"learning_rate": 3.216838367521424e-05,
"loss": 0.5397,
"num_input_tokens_seen": 87818240,
"step": 670,
"train_runtime": 2929.8531,
"train_tokens_per_second": 29973.598
},
{
"epoch": 0.8221680876979294,
"grad_norm": 1.2265625,
"learning_rate": 3.193894248822599e-05,
"loss": 0.5362,
"num_input_tokens_seen": 88473600,
"step": 675,
"train_runtime": 2951.6047,
"train_tokens_per_second": 29974.745
},
{
"epoch": 0.8282582216808769,
"grad_norm": 1.25,
"learning_rate": 3.17088662870561e-05,
"loss": 0.5333,
"num_input_tokens_seen": 89128960,
"step": 680,
"train_runtime": 2973.4611,
"train_tokens_per_second": 29974.82
},
{
"epoch": 0.8343483556638246,
"grad_norm": 1.2265625,
"learning_rate": 3.147817612702403e-05,
"loss": 0.5333,
"num_input_tokens_seen": 89784320,
"step": 685,
"train_runtime": 2995.2224,
"train_tokens_per_second": 29975.844
},
{
"epoch": 0.8404384896467723,
"grad_norm": 1.1640625,
"learning_rate": 3.124689311963535e-05,
"loss": 0.5239,
"num_input_tokens_seen": 90439680,
"step": 690,
"train_runtime": 3017.0068,
"train_tokens_per_second": 29976.625
},
{
"epoch": 0.8465286236297198,
"grad_norm": 1.140625,
"learning_rate": 3.101503843064981e-05,
"loss": 0.5356,
"num_input_tokens_seen": 91095040,
"step": 695,
"train_runtime": 3038.7428,
"train_tokens_per_second": 29977.872
},
{
"epoch": 0.8526187576126675,
"grad_norm": 1.125,
"learning_rate": 3.078263327814438e-05,
"loss": 0.5301,
"num_input_tokens_seen": 91750400,
"step": 700,
"train_runtime": 3060.5883,
"train_tokens_per_second": 29978.028
},
{
"epoch": 0.8587088915956151,
"grad_norm": 1.1484375,
"learning_rate": 3.0549698930571386e-05,
"loss": 0.5336,
"num_input_tokens_seen": 92405760,
"step": 705,
"train_runtime": 3082.32,
"train_tokens_per_second": 29979.288
},
{
"epoch": 0.8647990255785627,
"grad_norm": 1.0546875,
"learning_rate": 3.0316256704812252e-05,
"loss": 0.5262,
"num_input_tokens_seen": 93061120,
"step": 710,
"train_runtime": 3104.0532,
"train_tokens_per_second": 29980.517
},
{
"epoch": 0.8708891595615104,
"grad_norm": 1.1015625,
"learning_rate": 3.0082327964226615e-05,
"loss": 0.5249,
"num_input_tokens_seen": 93716480,
"step": 715,
"train_runtime": 3125.8473,
"train_tokens_per_second": 29981.145
},
{
"epoch": 0.876979293544458,
"grad_norm": 1.34375,
"learning_rate": 2.9847934116697307e-05,
"loss": 0.5313,
"num_input_tokens_seen": 94371840,
"step": 720,
"train_runtime": 3147.5696,
"train_tokens_per_second": 29982.448
},
{
"epoch": 0.8830694275274056,
"grad_norm": 1.2109375,
"learning_rate": 2.9613096612671225e-05,
"loss": 0.5308,
"num_input_tokens_seen": 95027200,
"step": 725,
"train_runtime": 3169.2945,
"train_tokens_per_second": 29983.708
},
{
"epoch": 0.8891595615103532,
"grad_norm": 1.1484375,
"learning_rate": 2.9377836943196256e-05,
"loss": 0.5318,
"num_input_tokens_seen": 95682560,
"step": 730,
"train_runtime": 3191.0555,
"train_tokens_per_second": 29984.611
},
{
"epoch": 0.8952496954933008,
"grad_norm": 1.3828125,
"learning_rate": 2.91421766379546e-05,
"loss": 0.5383,
"num_input_tokens_seen": 96337920,
"step": 735,
"train_runtime": 3212.7812,
"train_tokens_per_second": 29985.833
},
{
"epoch": 0.9013398294762485,
"grad_norm": 1.4296875,
"learning_rate": 2.8906137263292442e-05,
"loss": 0.532,
"num_input_tokens_seen": 96993280,
"step": 740,
"train_runtime": 3234.5148,
"train_tokens_per_second": 29986.965
},
{
"epoch": 0.9074299634591961,
"grad_norm": 1.2578125,
"learning_rate": 2.8669740420246334e-05,
"loss": 0.5222,
"num_input_tokens_seen": 97648640,
"step": 745,
"train_runtime": 3256.233,
"train_tokens_per_second": 29988.222
},
{
"epoch": 0.9135200974421437,
"grad_norm": 1.0859375,
"learning_rate": 2.843300774256638e-05,
"loss": 0.52,
"num_input_tokens_seen": 98304000,
"step": 750,
"train_runtime": 3277.9714,
"train_tokens_per_second": 29989.279
},
{
"epoch": 0.9196102314250914,
"grad_norm": 1.1171875,
"learning_rate": 2.819596089473646e-05,
"loss": 0.5194,
"num_input_tokens_seen": 98959360,
"step": 755,
"train_runtime": 3299.7126,
"train_tokens_per_second": 29990.297
},
{
"epoch": 0.925700365408039,
"grad_norm": 1.0859375,
"learning_rate": 2.795862156999157e-05,
"loss": 0.5278,
"num_input_tokens_seen": 99614720,
"step": 760,
"train_runtime": 3321.428,
"train_tokens_per_second": 29991.534
},
{
"epoch": 0.9317904993909866,
"grad_norm": 1.1015625,
"learning_rate": 2.7721011488332615e-05,
"loss": 0.5221,
"num_input_tokens_seen": 100270080,
"step": 765,
"train_runtime": 3343.2094,
"train_tokens_per_second": 29992.163
},
{
"epoch": 0.9378806333739342,
"grad_norm": 1.0703125,
"learning_rate": 2.748315239453868e-05,
"loss": 0.5159,
"num_input_tokens_seen": 100925440,
"step": 770,
"train_runtime": 3364.9146,
"train_tokens_per_second": 29993.463
},
{
"epoch": 0.9439707673568819,
"grad_norm": 1.25,
"learning_rate": 2.7245066056177093e-05,
"loss": 0.5135,
"num_input_tokens_seen": 101580800,
"step": 775,
"train_runtime": 3386.6156,
"train_tokens_per_second": 29994.783
},
{
"epoch": 0.9500609013398295,
"grad_norm": 1.125,
"learning_rate": 2.7006774261611373e-05,
"loss": 0.5237,
"num_input_tokens_seen": 102236160,
"step": 780,
"train_runtime": 3408.3514,
"train_tokens_per_second": 29995.78
},
{
"epoch": 0.9561510353227771,
"grad_norm": 1.078125,
"learning_rate": 2.6768298818007253e-05,
"loss": 0.5154,
"num_input_tokens_seen": 102891520,
"step": 785,
"train_runtime": 3430.0863,
"train_tokens_per_second": 29996.773
},
{
"epoch": 0.9622411693057247,
"grad_norm": 1.1171875,
"learning_rate": 2.6529661549337032e-05,
"loss": 0.5177,
"num_input_tokens_seen": 103546880,
"step": 790,
"train_runtime": 3451.8095,
"train_tokens_per_second": 29997.854
},
{
"epoch": 0.9683313032886723,
"grad_norm": 1.0703125,
"learning_rate": 2.6290884294382366e-05,
"loss": 0.5142,
"num_input_tokens_seen": 104202240,
"step": 795,
"train_runtime": 3473.5565,
"train_tokens_per_second": 29998.717
},
{
"epoch": 0.97442143727162,
"grad_norm": 1.1328125,
"learning_rate": 2.6051988904735686e-05,
"loss": 0.5138,
"num_input_tokens_seen": 104857600,
"step": 800,
"train_runtime": 3495.316,
"train_tokens_per_second": 29999.462
},
{
"epoch": 0.9805115712545676,
"grad_norm": 1.0859375,
"learning_rate": 2.5812997242800456e-05,
"loss": 0.5225,
"num_input_tokens_seen": 105512960,
"step": 805,
"train_runtime": 3517.0562,
"train_tokens_per_second": 30000.362
},
{
"epoch": 0.9866017052375152,
"grad_norm": 1.2421875,
"learning_rate": 2.5573931179790472e-05,
"loss": 0.5116,
"num_input_tokens_seen": 106168320,
"step": 810,
"train_runtime": 3538.7869,
"train_tokens_per_second": 30001.331
},
{
"epoch": 0.9926918392204629,
"grad_norm": 1.109375,
"learning_rate": 2.5334812593728296e-05,
"loss": 0.526,
"num_input_tokens_seen": 106823680,
"step": 815,
"train_runtime": 3560.5431,
"train_tokens_per_second": 30002.074
},
{
"epoch": 0.9987819732034104,
"grad_norm": 1.1640625,
"learning_rate": 2.5095663367443123e-05,
"loss": 0.5278,
"num_input_tokens_seen": 107479040,
"step": 820,
"train_runtime": 3582.2344,
"train_tokens_per_second": 30003.352
},
{
"epoch": 1.004872107186358,
"grad_norm": 1.0859375,
"learning_rate": 2.485650538656817e-05,
"loss": 0.454,
"num_input_tokens_seen": 108103680,
"step": 825,
"train_runtime": 3603.2471,
"train_tokens_per_second": 30001.74
},
{
"epoch": 1.0109622411693058,
"grad_norm": 1.171875,
"learning_rate": 2.461736053753783e-05,
"loss": 0.44,
"num_input_tokens_seen": 108759040,
"step": 830,
"train_runtime": 3624.9733,
"train_tokens_per_second": 30002.714
},
{
"epoch": 1.0170523751522533,
"grad_norm": 1.1015625,
"learning_rate": 2.4378250705584737e-05,
"loss": 0.4402,
"num_input_tokens_seen": 109414400,
"step": 835,
"train_runtime": 3646.7181,
"train_tokens_per_second": 30003.526
},
{
"epoch": 1.0231425091352009,
"grad_norm": 1.0859375,
"learning_rate": 2.4139197772736942e-05,
"loss": 0.4341,
"num_input_tokens_seen": 110069760,
"step": 840,
"train_runtime": 3668.5013,
"train_tokens_per_second": 30004.013
},
{
"epoch": 1.0292326431181487,
"grad_norm": 1.0390625,
"learning_rate": 2.3900223615815438e-05,
"loss": 0.4492,
"num_input_tokens_seen": 110725120,
"step": 845,
"train_runtime": 3690.286,
"train_tokens_per_second": 30004.482
},
{
"epoch": 1.0353227771010962,
"grad_norm": 1.1015625,
"learning_rate": 2.3661350104432037e-05,
"loss": 0.4401,
"num_input_tokens_seen": 111380480,
"step": 850,
"train_runtime": 3712.0285,
"train_tokens_per_second": 30005.287
},
{
"epoch": 1.0414129110840438,
"grad_norm": 1.1015625,
"learning_rate": 2.3422599098988023e-05,
"loss": 0.4402,
"num_input_tokens_seen": 112035840,
"step": 855,
"train_runtime": 3733.7851,
"train_tokens_per_second": 30005.969
},
{
"epoch": 1.0475030450669915,
"grad_norm": 1.0859375,
"learning_rate": 2.3183992448673615e-05,
"loss": 0.4383,
"num_input_tokens_seen": 112691200,
"step": 860,
"train_runtime": 3755.54,
"train_tokens_per_second": 30006.657
},
{
"epoch": 1.053593179049939,
"grad_norm": 1.109375,
"learning_rate": 2.294555198946845e-05,
"loss": 0.4408,
"num_input_tokens_seen": 113346560,
"step": 865,
"train_runtime": 3777.2755,
"train_tokens_per_second": 30007.491
},
{
"epoch": 1.0596833130328867,
"grad_norm": 1.1328125,
"learning_rate": 2.270729954214324e-05,
"loss": 0.4344,
"num_input_tokens_seen": 114001920,
"step": 870,
"train_runtime": 3799.0222,
"train_tokens_per_second": 30008.227
},
{
"epoch": 1.0657734470158344,
"grad_norm": 1.1484375,
"learning_rate": 2.2469256910262877e-05,
"loss": 0.4417,
"num_input_tokens_seen": 114657280,
"step": 875,
"train_runtime": 3820.7855,
"train_tokens_per_second": 30008.824
},
{
"epoch": 1.071863580998782,
"grad_norm": 1.046875,
"learning_rate": 2.2231445878191107e-05,
"loss": 0.4379,
"num_input_tokens_seen": 115312640,
"step": 880,
"train_runtime": 3842.4905,
"train_tokens_per_second": 30009.87
},
{
"epoch": 1.0779537149817295,
"grad_norm": 1.0703125,
"learning_rate": 2.1993888209096897e-05,
"loss": 0.4367,
"num_input_tokens_seen": 115968000,
"step": 885,
"train_runtime": 3864.1859,
"train_tokens_per_second": 30010.979
},
{
"epoch": 1.0840438489646773,
"grad_norm": 1.0546875,
"learning_rate": 2.1756605642962827e-05,
"loss": 0.439,
"num_input_tokens_seen": 116623360,
"step": 890,
"train_runtime": 3885.9318,
"train_tokens_per_second": 30011.685
},
{
"epoch": 1.0901339829476249,
"grad_norm": 1.125,
"learning_rate": 2.1519619894595567e-05,
"loss": 0.4357,
"num_input_tokens_seen": 117278720,
"step": 895,
"train_runtime": 3907.6564,
"train_tokens_per_second": 30012.547
},
{
"epoch": 1.0962241169305724,
"grad_norm": 1.0625,
"learning_rate": 2.1282952651638626e-05,
"loss": 0.4365,
"num_input_tokens_seen": 117934080,
"step": 900,
"train_runtime": 3929.4342,
"train_tokens_per_second": 30012.993
},
{
"epoch": 1.1023142509135202,
"grad_norm": 1.0625,
"learning_rate": 2.1046625572587633e-05,
"loss": 0.4301,
"num_input_tokens_seen": 118589440,
"step": 905,
"train_runtime": 3951.2394,
"train_tokens_per_second": 30013.225
},
{
"epoch": 1.1084043848964678,
"grad_norm": 1.1015625,
"learning_rate": 2.0810660284808297e-05,
"loss": 0.4309,
"num_input_tokens_seen": 119244800,
"step": 910,
"train_runtime": 3972.9809,
"train_tokens_per_second": 30013.937
},
{
"epoch": 1.1144945188794153,
"grad_norm": 1.125,
"learning_rate": 2.0575078382557137e-05,
"loss": 0.4336,
"num_input_tokens_seen": 119900160,
"step": 915,
"train_runtime": 3994.6574,
"train_tokens_per_second": 30015.129
},
{
"epoch": 1.1205846528623629,
"grad_norm": 1.109375,
"learning_rate": 2.0339901425005315e-05,
"loss": 0.4329,
"num_input_tokens_seen": 120555520,
"step": 920,
"train_runtime": 4016.3768,
"train_tokens_per_second": 30015.989
},
{
"epoch": 1.1266747868453106,
"grad_norm": 1.078125,
"learning_rate": 2.0105150934265687e-05,
"loss": 0.4377,
"num_input_tokens_seen": 121210880,
"step": 925,
"train_runtime": 4038.1192,
"train_tokens_per_second": 30016.667
},
{
"epoch": 1.1327649208282582,
"grad_norm": 1.171875,
"learning_rate": 1.9870848393423176e-05,
"loss": 0.4414,
"num_input_tokens_seen": 121866240,
"step": 930,
"train_runtime": 4059.8399,
"train_tokens_per_second": 30017.499
},
{
"epoch": 1.1388550548112057,
"grad_norm": 1.0546875,
"learning_rate": 1.963701524456877e-05,
"loss": 0.4327,
"num_input_tokens_seen": 122521600,
"step": 935,
"train_runtime": 4081.578,
"train_tokens_per_second": 30018.194
},
{
"epoch": 1.1449451887941535,
"grad_norm": 1.109375,
"learning_rate": 1.9403672886837264e-05,
"loss": 0.4283,
"num_input_tokens_seen": 123176960,
"step": 940,
"train_runtime": 4103.2994,
"train_tokens_per_second": 30019.004
},
{
"epoch": 1.151035322777101,
"grad_norm": 1.0703125,
"learning_rate": 1.9170842674448942e-05,
"loss": 0.4207,
"num_input_tokens_seen": 123832320,
"step": 945,
"train_runtime": 4125.0267,
"train_tokens_per_second": 30019.762
},
{
"epoch": 1.1571254567600486,
"grad_norm": 1.109375,
"learning_rate": 1.89385459147553e-05,
"loss": 0.437,
"num_input_tokens_seen": 124487680,
"step": 950,
"train_runtime": 4146.7599,
"train_tokens_per_second": 30020.47
},
{
"epoch": 1.1632155907429964,
"grad_norm": 1.1640625,
"learning_rate": 1.8706803866289208e-05,
"loss": 0.4381,
"num_input_tokens_seen": 125143040,
"step": 955,
"train_runtime": 4168.4938,
"train_tokens_per_second": 30021.165
},
{
"epoch": 1.169305724725944,
"grad_norm": 1.109375,
"learning_rate": 1.8475637736819335e-05,
"loss": 0.4272,
"num_input_tokens_seen": 125798400,
"step": 960,
"train_runtime": 4190.2193,
"train_tokens_per_second": 30021.913
},
{
"epoch": 1.1753958587088915,
"grad_norm": 1.0625,
"learning_rate": 1.824506868140942e-05,
"loss": 0.4248,
"num_input_tokens_seen": 126453760,
"step": 965,
"train_runtime": 4211.9556,
"train_tokens_per_second": 30022.577
},
{
"epoch": 1.1814859926918393,
"grad_norm": 1.0625,
"learning_rate": 1.801511780048221e-05,
"loss": 0.429,
"num_input_tokens_seen": 127109120,
"step": 970,
"train_runtime": 4233.6883,
"train_tokens_per_second": 30023.259
},
{
"epoch": 1.1875761266747868,
"grad_norm": 1.125,
"learning_rate": 1.778580613788853e-05,
"loss": 0.4305,
"num_input_tokens_seen": 127764480,
"step": 975,
"train_runtime": 4255.3913,
"train_tokens_per_second": 30024.144
},
{
"epoch": 1.1936662606577344,
"grad_norm": 1.0625,
"learning_rate": 1.755715467898139e-05,
"loss": 0.4307,
"num_input_tokens_seen": 128419840,
"step": 980,
"train_runtime": 4277.133,
"train_tokens_per_second": 30024.748
},
{
"epoch": 1.1997563946406822,
"grad_norm": 1.0625,
"learning_rate": 1.7329184348695586e-05,
"loss": 0.4238,
"num_input_tokens_seen": 129075200,
"step": 985,
"train_runtime": 4298.8319,
"train_tokens_per_second": 30025.645
},
{
"epoch": 1.2058465286236297,
"grad_norm": 1.0859375,
"learning_rate": 1.7101916009632733e-05,
"loss": 0.4402,
"num_input_tokens_seen": 129730560,
"step": 990,
"train_runtime": 4320.5575,
"train_tokens_per_second": 30026.347
},
{
"epoch": 1.2119366626065773,
"grad_norm": 1.0703125,
"learning_rate": 1.6875370460152023e-05,
"loss": 0.4324,
"num_input_tokens_seen": 130385920,
"step": 995,
"train_runtime": 4342.3036,
"train_tokens_per_second": 30026.901
},
{
"epoch": 1.218026796589525,
"grad_norm": 1.21875,
"learning_rate": 1.6649568432466884e-05,
"loss": 0.4349,
"num_input_tokens_seen": 131041280,
"step": 1000,
"train_runtime": 4364.0234,
"train_tokens_per_second": 30027.63
},
{
"epoch": 1.2241169305724726,
"grad_norm": 1.109375,
"learning_rate": 1.6424530590747724e-05,
"loss": 0.4318,
"num_input_tokens_seen": 131696640,
"step": 1005,
"train_runtime": 4393.7613,
"train_tokens_per_second": 29973.553
},
{
"epoch": 1.2302070645554202,
"grad_norm": 1.171875,
"learning_rate": 1.6200277529230768e-05,
"loss": 0.4475,
"num_input_tokens_seen": 132352000,
"step": 1010,
"train_runtime": 4415.4584,
"train_tokens_per_second": 29974.69
},
{
"epoch": 1.236297198538368,
"grad_norm": 1.1015625,
"learning_rate": 1.5976829770333452e-05,
"loss": 0.4415,
"num_input_tokens_seen": 133007360,
"step": 1015,
"train_runtime": 4437.1808,
"train_tokens_per_second": 29975.646
},
{
"epoch": 1.2423873325213155,
"grad_norm": 1.09375,
"learning_rate": 1.5754207762776325e-05,
"loss": 0.4288,
"num_input_tokens_seen": 133662720,
"step": 1020,
"train_runtime": 4458.8792,
"train_tokens_per_second": 29976.753
},
{
"epoch": 1.248477466504263,
"grad_norm": 1.0546875,
"learning_rate": 1.5532431879711657e-05,
"loss": 0.4289,
"num_input_tokens_seen": 134318080,
"step": 1025,
"train_runtime": 4480.5616,
"train_tokens_per_second": 29977.956
},
{
"epoch": 1.2545676004872108,
"grad_norm": 1.0546875,
"learning_rate": 1.5311522416859016e-05,
"loss": 0.4246,
"num_input_tokens_seen": 134973440,
"step": 1030,
"train_runtime": 4502.2815,
"train_tokens_per_second": 29978.898
},
{
"epoch": 1.2606577344701584,
"grad_norm": 1.125,
"learning_rate": 1.5091499590647936e-05,
"loss": 0.432,
"num_input_tokens_seen": 135628800,
"step": 1035,
"train_runtime": 4524.0863,
"train_tokens_per_second": 29979.269
},
{
"epoch": 1.266747868453106,
"grad_norm": 1.0703125,
"learning_rate": 1.4872383536367785e-05,
"loss": 0.4333,
"num_input_tokens_seen": 136284160,
"step": 1040,
"train_runtime": 4545.9201,
"train_tokens_per_second": 29979.444
},
{
"epoch": 1.2728380024360537,
"grad_norm": 1.078125,
"learning_rate": 1.4654194306325093e-05,
"loss": 0.4282,
"num_input_tokens_seen": 136939520,
"step": 1045,
"train_runtime": 4567.69,
"train_tokens_per_second": 29980.038
},
{
"epoch": 1.2789281364190013,
"grad_norm": 1.0625,
"learning_rate": 1.4436951868008536e-05,
"loss": 0.4307,
"num_input_tokens_seen": 137594880,
"step": 1050,
"train_runtime": 4589.5037,
"train_tokens_per_second": 29980.34
},
{
"epoch": 1.2850182704019488,
"grad_norm": 1.078125,
"learning_rate": 1.4220676102261532e-05,
"loss": 0.4323,
"num_input_tokens_seen": 138250240,
"step": 1055,
"train_runtime": 4611.2636,
"train_tokens_per_second": 29980.988
},
{
"epoch": 1.2911084043848966,
"grad_norm": 1.0625,
"learning_rate": 1.4005386801462896e-05,
"loss": 0.428,
"num_input_tokens_seen": 138905600,
"step": 1060,
"train_runtime": 4633.0271,
"train_tokens_per_second": 29981.607
},
{
"epoch": 1.2971985383678442,
"grad_norm": 1.1015625,
"learning_rate": 1.3791103667715577e-05,
"loss": 0.4226,
"num_input_tokens_seen": 139560960,
"step": 1065,
"train_runtime": 4654.7487,
"train_tokens_per_second": 29982.491
},
{
"epoch": 1.3032886723507917,
"grad_norm": 1.109375,
"learning_rate": 1.3577846311043593e-05,
"loss": 0.4332,
"num_input_tokens_seen": 140216320,
"step": 1070,
"train_runtime": 4676.4958,
"train_tokens_per_second": 29983.203
},
{
"epoch": 1.3093788063337393,
"grad_norm": 1.1015625,
"learning_rate": 1.3365634247597415e-05,
"loss": 0.426,
"num_input_tokens_seen": 140871680,
"step": 1075,
"train_runtime": 4698.2878,
"train_tokens_per_second": 29983.621
},
{
"epoch": 1.315468940316687,
"grad_norm": 1.03125,
"learning_rate": 1.3154486897867996e-05,
"loss": 0.4302,
"num_input_tokens_seen": 141527040,
"step": 1080,
"train_runtime": 4720.0824,
"train_tokens_per_second": 29984.019
},
{
"epoch": 1.3215590742996346,
"grad_norm": 1.0859375,
"learning_rate": 1.2944423584909502e-05,
"loss": 0.4306,
"num_input_tokens_seen": 142182400,
"step": 1085,
"train_runtime": 4741.7882,
"train_tokens_per_second": 29984.975
},
{
"epoch": 1.3276492082825821,
"grad_norm": 1.09375,
"learning_rate": 1.273546353257096e-05,
"loss": 0.4204,
"num_input_tokens_seen": 142837760,
"step": 1090,
"train_runtime": 4763.5593,
"train_tokens_per_second": 29985.511
},
{
"epoch": 1.3337393422655297,
"grad_norm": 1.046875,
"learning_rate": 1.2527625863736981e-05,
"loss": 0.4253,
"num_input_tokens_seen": 143493120,
"step": 1095,
"train_runtime": 4785.3061,
"train_tokens_per_second": 29986.195
},
{
"epoch": 1.3398294762484775,
"grad_norm": 1.0546875,
"learning_rate": 1.2320929598577777e-05,
"loss": 0.4353,
"num_input_tokens_seen": 144148480,
"step": 1100,
"train_runtime": 4807.0374,
"train_tokens_per_second": 29986.969
},
{
"epoch": 1.345919610231425,
"grad_norm": 1.09375,
"learning_rate": 1.2115393652808526e-05,
"loss": 0.4358,
"num_input_tokens_seen": 144803840,
"step": 1105,
"train_runtime": 4828.7956,
"train_tokens_per_second": 29987.569
},
{
"epoch": 1.3520097442143726,
"grad_norm": 1.0546875,
"learning_rate": 1.1911036835958274e-05,
"loss": 0.4386,
"num_input_tokens_seen": 145459200,
"step": 1110,
"train_runtime": 4850.5882,
"train_tokens_per_second": 29987.951
},
{
"epoch": 1.3580998781973204,
"grad_norm": 1.03125,
"learning_rate": 1.1707877849648643e-05,
"loss": 0.4304,
"num_input_tokens_seen": 146114560,
"step": 1115,
"train_runtime": 4872.3825,
"train_tokens_per_second": 29988.319
},
{
"epoch": 1.364190012180268,
"grad_norm": 1.09375,
"learning_rate": 1.1505935285882336e-05,
"loss": 0.4327,
"num_input_tokens_seen": 146769920,
"step": 1120,
"train_runtime": 4894.2345,
"train_tokens_per_second": 29988.33
},
{
"epoch": 1.3702801461632155,
"grad_norm": 1.0703125,
"learning_rate": 1.1305227625341657e-05,
"loss": 0.4316,
"num_input_tokens_seen": 147425280,
"step": 1125,
"train_runtime": 4915.9875,
"train_tokens_per_second": 29988.945
},
{
"epoch": 1.3763702801461632,
"grad_norm": 1.078125,
"learning_rate": 1.1105773235697376e-05,
"loss": 0.4247,
"num_input_tokens_seen": 148080640,
"step": 1130,
"train_runtime": 4937.7365,
"train_tokens_per_second": 29989.579
},
{
"epoch": 1.3824604141291108,
"grad_norm": 1.0859375,
"learning_rate": 1.0907590369927674e-05,
"loss": 0.4298,
"num_input_tokens_seen": 148736000,
"step": 1135,
"train_runtime": 4959.4689,
"train_tokens_per_second": 29990.308
},
{
"epoch": 1.3885505481120584,
"grad_norm": 1.03125,
"learning_rate": 1.0710697164647807e-05,
"loss": 0.431,
"num_input_tokens_seen": 149391360,
"step": 1140,
"train_runtime": 4981.2189,
"train_tokens_per_second": 29990.925
},
{
"epoch": 1.3946406820950061,
"grad_norm": 1.0546875,
"learning_rate": 1.0515111638450395e-05,
"loss": 0.4236,
"num_input_tokens_seen": 150046720,
"step": 1145,
"train_runtime": 5002.928,
"train_tokens_per_second": 29991.781
},
{
"epoch": 1.4007308160779537,
"grad_norm": 1.0703125,
"learning_rate": 1.0320851690256324e-05,
"loss": 0.4318,
"num_input_tokens_seen": 150702080,
"step": 1150,
"train_runtime": 5024.6238,
"train_tokens_per_second": 29992.709
},
{
"epoch": 1.4068209500609012,
"grad_norm": 1.0546875,
"learning_rate": 1.0127935097676855e-05,
"loss": 0.4371,
"num_input_tokens_seen": 151357440,
"step": 1155,
"train_runtime": 5046.36,
"train_tokens_per_second": 29993.389
},
{
"epoch": 1.412911084043849,
"grad_norm": 1.0546875,
"learning_rate": 9.936379515386663e-06,
"loss": 0.4213,
"num_input_tokens_seen": 152012800,
"step": 1160,
"train_runtime": 5068.1066,
"train_tokens_per_second": 29994.002
},
{
"epoch": 1.4190012180267966,
"grad_norm": 1.078125,
"learning_rate": 9.74620247350815e-06,
"loss": 0.4245,
"num_input_tokens_seen": 152668160,
"step": 1165,
"train_runtime": 5089.847,
"train_tokens_per_second": 29994.646
},
{
"epoch": 1.4250913520097441,
"grad_norm": 1.140625,
"learning_rate": 9.557421376007258e-06,
"loss": 0.4272,
"num_input_tokens_seen": 153323520,
"step": 1170,
"train_runtime": 5111.5715,
"train_tokens_per_second": 29995.378
},
{
"epoch": 1.431181485992692,
"grad_norm": 1.09375,
"learning_rate": 9.370053499100698e-06,
"loss": 0.418,
"num_input_tokens_seen": 153978880,
"step": 1175,
"train_runtime": 5133.3011,
"train_tokens_per_second": 29996.074
},
{
"epoch": 1.4372716199756395,
"grad_norm": 1.0625,
"learning_rate": 9.184115989674913e-06,
"loss": 0.4314,
"num_input_tokens_seen": 154634240,
"step": 1180,
"train_runtime": 5155.025,
"train_tokens_per_second": 29996.797
},
{
"epoch": 1.443361753958587,
"grad_norm": 1.046875,
"learning_rate": 8.999625863716951e-06,
"loss": 0.4283,
"num_input_tokens_seen": 155289600,
"step": 1185,
"train_runtime": 5176.7613,
"train_tokens_per_second": 29997.443
},
{
"epoch": 1.4494518879415348,
"grad_norm": 1.0234375,
"learning_rate": 8.816600004757175e-06,
"loss": 0.4367,
"num_input_tokens_seen": 155944960,
"step": 1190,
"train_runtime": 5198.4684,
"train_tokens_per_second": 29998.251
},
{
"epoch": 1.4555420219244823,
"grad_norm": 1.0390625,
"learning_rate": 8.635055162324276e-06,
"loss": 0.416,
"num_input_tokens_seen": 156600320,
"step": 1195,
"train_runtime": 5220.1609,
"train_tokens_per_second": 29999.137
},
{
"epoch": 1.46163215590743,
"grad_norm": 1.0546875,
"learning_rate": 8.455007950412324e-06,
"loss": 0.4317,
"num_input_tokens_seen": 157255680,
"step": 1200,
"train_runtime": 5241.8684,
"train_tokens_per_second": 29999.929
},
{
"epoch": 1.4677222898903777,
"grad_norm": 1.734375,
"learning_rate": 8.276474845960448e-06,
"loss": 0.4237,
"num_input_tokens_seen": 157911040,
"step": 1205,
"train_runtime": 5263.5912,
"train_tokens_per_second": 30000.628
},
{
"epoch": 1.4738124238733252,
"grad_norm": 1.0546875,
"learning_rate": 8.099472187344914e-06,
"loss": 0.4356,
"num_input_tokens_seen": 158566400,
"step": 1210,
"train_runtime": 5285.317,
"train_tokens_per_second": 30001.304
},
{
"epoch": 1.4799025578562728,
"grad_norm": 1.1015625,
"learning_rate": 7.924016172883908e-06,
"loss": 0.4297,
"num_input_tokens_seen": 159221760,
"step": 1215,
"train_runtime": 5307.0638,
"train_tokens_per_second": 30001.855
},
{
"epoch": 1.4859926918392206,
"grad_norm": 1.078125,
"learning_rate": 7.750122859355199e-06,
"loss": 0.4317,
"num_input_tokens_seen": 159877120,
"step": 1220,
"train_runtime": 5328.8039,
"train_tokens_per_second": 30002.44
},
{
"epoch": 1.4920828258221681,
"grad_norm": 1.0625,
"learning_rate": 7.577808160526692e-06,
"loss": 0.4311,
"num_input_tokens_seen": 160532480,
"step": 1225,
"train_runtime": 5350.5736,
"train_tokens_per_second": 30002.854
},
{
"epoch": 1.4981729598051157,
"grad_norm": 1.078125,
"learning_rate": 7.40708784570005e-06,
"loss": 0.4269,
"num_input_tokens_seen": 161187840,
"step": 1230,
"train_runtime": 5372.3219,
"train_tokens_per_second": 30003.385
},
{
"epoch": 1.5042630937880634,
"grad_norm": 1.03125,
"learning_rate": 7.2379775382676375e-06,
"loss": 0.4268,
"num_input_tokens_seen": 161843200,
"step": 1235,
"train_runtime": 5394.073,
"train_tokens_per_second": 30003.895
},
{
"epoch": 1.510353227771011,
"grad_norm": 1.0546875,
"learning_rate": 7.070492714282706e-06,
"loss": 0.4243,
"num_input_tokens_seen": 162498560,
"step": 1240,
"train_runtime": 5415.8136,
"train_tokens_per_second": 30004.459
},
{
"epoch": 1.5164433617539586,
"grad_norm": 1.0859375,
"learning_rate": 6.904648701043137e-06,
"loss": 0.4237,
"num_input_tokens_seen": 163153920,
"step": 1245,
"train_runtime": 5437.5717,
"train_tokens_per_second": 30004.923
},
{
"epoch": 1.5225334957369063,
"grad_norm": 1.0546875,
"learning_rate": 6.740460675688734e-06,
"loss": 0.4214,
"num_input_tokens_seen": 163809280,
"step": 1250,
"train_runtime": 5459.3212,
"train_tokens_per_second": 30005.43
},
{
"epoch": 1.5286236297198539,
"grad_norm": 1.2421875,
"learning_rate": 6.577943663812344e-06,
"loss": 0.4331,
"num_input_tokens_seen": 164464640,
"step": 1255,
"train_runtime": 5481.0582,
"train_tokens_per_second": 30006.001
},
{
"epoch": 1.5347137637028014,
"grad_norm": 1.0390625,
"learning_rate": 6.417112538084771e-06,
"loss": 0.4269,
"num_input_tokens_seen": 165120000,
"step": 1260,
"train_runtime": 5502.8244,
"train_tokens_per_second": 30006.409
},
{
"epoch": 1.5408038976857492,
"grad_norm": 1.015625,
"learning_rate": 6.257982016893685e-06,
"loss": 0.4197,
"num_input_tokens_seen": 165775360,
"step": 1265,
"train_runtime": 5524.4904,
"train_tokens_per_second": 30007.358
},
{
"epoch": 1.5468940316686965,
"grad_norm": 1.0859375,
"learning_rate": 6.100566662996732e-06,
"loss": 0.4407,
"num_input_tokens_seen": 166430720,
"step": 1270,
"train_runtime": 5546.194,
"train_tokens_per_second": 30008.096
},
{
"epoch": 1.5529841656516443,
"grad_norm": 1.0390625,
"learning_rate": 5.944880882188786e-06,
"loss": 0.4268,
"num_input_tokens_seen": 167086080,
"step": 1275,
"train_runtime": 5567.9166,
"train_tokens_per_second": 30008.725
},
{
"epoch": 1.559074299634592,
"grad_norm": 1.0625,
"learning_rate": 5.790938921983608e-06,
"loss": 0.4275,
"num_input_tokens_seen": 167741440,
"step": 1280,
"train_runtime": 5589.613,
"train_tokens_per_second": 30009.491
},
{
"epoch": 1.5651644336175394,
"grad_norm": 1.0703125,
"learning_rate": 5.638754870310042e-06,
"loss": 0.4291,
"num_input_tokens_seen": 168396800,
"step": 1285,
"train_runtime": 5611.3139,
"train_tokens_per_second": 30010.227
},
{
"epoch": 1.5712545676004872,
"grad_norm": 1.046875,
"learning_rate": 5.488342654222695e-06,
"loss": 0.4283,
"num_input_tokens_seen": 169052160,
"step": 1290,
"train_runtime": 5633.0305,
"train_tokens_per_second": 30010.872
},
{
"epoch": 1.577344701583435,
"grad_norm": 1.25,
"learning_rate": 5.33971603862746e-06,
"loss": 0.4282,
"num_input_tokens_seen": 169707520,
"step": 1295,
"train_runtime": 5654.7734,
"train_tokens_per_second": 30011.374
},
{
"epoch": 1.5834348355663823,
"grad_norm": 1.1015625,
"learning_rate": 5.192888625021794e-06,
"loss": 0.438,
"num_input_tokens_seen": 170362880,
"step": 1300,
"train_runtime": 5676.6131,
"train_tokens_per_second": 30011.36
},
{
"epoch": 1.58952496954933,
"grad_norm": 1.046875,
"learning_rate": 5.047873850250012e-06,
"loss": 0.4227,
"num_input_tokens_seen": 171018240,
"step": 1305,
"train_runtime": 5698.3158,
"train_tokens_per_second": 30012.068
},
{
"epoch": 1.5956151035322779,
"grad_norm": 1.03125,
"learning_rate": 4.9046849852736085e-06,
"loss": 0.4339,
"num_input_tokens_seen": 171673600,
"step": 1310,
"train_runtime": 5720.0011,
"train_tokens_per_second": 30012.862
},
{
"epoch": 1.6017052375152252,
"grad_norm": 1.09375,
"learning_rate": 4.763335133956751e-06,
"loss": 0.4233,
"num_input_tokens_seen": 172328960,
"step": 1315,
"train_runtime": 5741.7125,
"train_tokens_per_second": 30013.512
},
{
"epoch": 1.607795371498173,
"grad_norm": 1.140625,
"learning_rate": 4.6238372318671175e-06,
"loss": 0.4293,
"num_input_tokens_seen": 172984320,
"step": 1320,
"train_runtime": 5763.4052,
"train_tokens_per_second": 30014.256
},
{
"epoch": 1.6138855054811205,
"grad_norm": 1.078125,
"learning_rate": 4.486204045092102e-06,
"loss": 0.422,
"num_input_tokens_seen": 173639680,
"step": 1325,
"train_runtime": 5785.115,
"train_tokens_per_second": 30014.906
},
{
"epoch": 1.619975639464068,
"grad_norm": 1.015625,
"learning_rate": 4.350448169070481e-06,
"loss": 0.4234,
"num_input_tokens_seen": 174295040,
"step": 1330,
"train_runtime": 5806.8095,
"train_tokens_per_second": 30015.629
},
{
"epoch": 1.6260657734470159,
"grad_norm": 1.015625,
"learning_rate": 4.2165820274398444e-06,
"loss": 0.4258,
"num_input_tokens_seen": 174950400,
"step": 1335,
"train_runtime": 5828.5149,
"train_tokens_per_second": 30016.291
},
{
"epoch": 1.6321559074299634,
"grad_norm": 1.0234375,
"learning_rate": 4.084617870899546e-06,
"loss": 0.4212,
"num_input_tokens_seen": 175605760,
"step": 1340,
"train_runtime": 5850.239,
"train_tokens_per_second": 30016.852
},
{
"epoch": 1.638246041412911,
"grad_norm": 1.0546875,
"learning_rate": 3.954567776089643e-06,
"loss": 0.4218,
"num_input_tokens_seen": 176261120,
"step": 1345,
"train_runtime": 5872.0406,
"train_tokens_per_second": 30017.013
},
{
"epoch": 1.6443361753958587,
"grad_norm": 1.046875,
"learning_rate": 3.826443644485731e-06,
"loss": 0.4322,
"num_input_tokens_seen": 176916480,
"step": 1350,
"train_runtime": 5893.7566,
"train_tokens_per_second": 30017.609
},
{
"epoch": 1.6504263093788063,
"grad_norm": 1.046875,
"learning_rate": 3.7002572013097147e-06,
"loss": 0.4234,
"num_input_tokens_seen": 177571840,
"step": 1355,
"train_runtime": 5915.4571,
"train_tokens_per_second": 30018.279
},
{
"epoch": 1.6565164433617539,
"grad_norm": 1.09375,
"learning_rate": 3.5760199944568418e-06,
"loss": 0.4241,
"num_input_tokens_seen": 178227200,
"step": 1360,
"train_runtime": 5937.1539,
"train_tokens_per_second": 30018.963
},
{
"epoch": 1.6626065773447016,
"grad_norm": 1.109375,
"learning_rate": 3.4537433934388798e-06,
"loss": 0.4313,
"num_input_tokens_seen": 178882560,
"step": 1365,
"train_runtime": 5958.8509,
"train_tokens_per_second": 30019.64
},
{
"epoch": 1.6686967113276492,
"grad_norm": 1.0546875,
"learning_rate": 3.333438588343624e-06,
"loss": 0.4224,
"num_input_tokens_seen": 179537920,
"step": 1370,
"train_runtime": 5980.5729,
"train_tokens_per_second": 30020.187
},
{
"epoch": 1.6747868453105967,
"grad_norm": 1.1796875,
"learning_rate": 3.2151165888108765e-06,
"loss": 0.4228,
"num_input_tokens_seen": 180193280,
"step": 1375,
"train_runtime": 6002.3024,
"train_tokens_per_second": 30020.693
},
{
"epoch": 1.6808769792935445,
"grad_norm": 1.046875,
"learning_rate": 3.0987882230248816e-06,
"loss": 0.4335,
"num_input_tokens_seen": 180848640,
"step": 1380,
"train_runtime": 6024.0561,
"train_tokens_per_second": 30021.075
},
{
"epoch": 1.686967113276492,
"grad_norm": 1.078125,
"learning_rate": 2.9844641367233834e-06,
"loss": 0.4241,
"num_input_tokens_seen": 181504000,
"step": 1385,
"train_runtime": 6045.7757,
"train_tokens_per_second": 30021.623
},
{
"epoch": 1.6930572472594396,
"grad_norm": 1.0546875,
"learning_rate": 2.8721547922234055e-06,
"loss": 0.4206,
"num_input_tokens_seen": 182159360,
"step": 1390,
"train_runtime": 6067.6225,
"train_tokens_per_second": 30021.538
},
{
"epoch": 1.6991473812423874,
"grad_norm": 1.046875,
"learning_rate": 2.761870467463784e-06,
"loss": 0.4284,
"num_input_tokens_seen": 182814720,
"step": 1395,
"train_runtime": 6089.3947,
"train_tokens_per_second": 30021.822
},
{
"epoch": 1.705237515225335,
"grad_norm": 1.1171875,
"learning_rate": 2.6536212550645977e-06,
"loss": 0.4234,
"num_input_tokens_seen": 183470080,
"step": 1400,
"train_runtime": 6111.127,
"train_tokens_per_second": 30022.299
},
{
"epoch": 1.7113276492082825,
"grad_norm": 1.0703125,
"learning_rate": 2.547417061403523e-06,
"loss": 0.4351,
"num_input_tokens_seen": 184125440,
"step": 1405,
"train_runtime": 6132.8723,
"train_tokens_per_second": 30022.709
},
{
"epoch": 1.7174177831912303,
"grad_norm": 1.03125,
"learning_rate": 2.4432676057092818e-06,
"loss": 0.42,
"num_input_tokens_seen": 184780800,
"step": 1410,
"train_runtime": 6154.6229,
"train_tokens_per_second": 30023.091
},
{
"epoch": 1.7235079171741778,
"grad_norm": 1.0703125,
"learning_rate": 2.3411824191721887e-06,
"loss": 0.4214,
"num_input_tokens_seen": 185436160,
"step": 1415,
"train_runtime": 6176.372,
"train_tokens_per_second": 30023.476
},
{
"epoch": 1.7295980511571254,
"grad_norm": 1.0859375,
"learning_rate": 2.24117084407188e-06,
"loss": 0.4281,
"num_input_tokens_seen": 186091520,
"step": 1420,
"train_runtime": 6198.2101,
"train_tokens_per_second": 30023.429
},
{
"epoch": 1.7356881851400732,
"grad_norm": 1.0234375,
"learning_rate": 2.143242032922396e-06,
"loss": 0.4217,
"num_input_tokens_seen": 186746880,
"step": 1425,
"train_runtime": 6219.929,
"train_tokens_per_second": 30023.957
},
{
"epoch": 1.7417783191230207,
"grad_norm": 1.0546875,
"learning_rate": 2.0474049476345737e-06,
"loss": 0.4236,
"num_input_tokens_seen": 187402240,
"step": 1430,
"train_runtime": 6241.6962,
"train_tokens_per_second": 30024.249
},
{
"epoch": 1.7478684531059683,
"grad_norm": 1.0859375,
"learning_rate": 1.953668358695901e-06,
"loss": 0.4193,
"num_input_tokens_seen": 188057600,
"step": 1435,
"train_runtime": 6263.445,
"train_tokens_per_second": 30024.627
},
{
"epoch": 1.753958587088916,
"grad_norm": 1.0625,
"learning_rate": 1.8620408443678904e-06,
"loss": 0.4328,
"num_input_tokens_seen": 188712960,
"step": 1440,
"train_runtime": 6285.263,
"train_tokens_per_second": 30024.672
},
{
"epoch": 1.7600487210718636,
"grad_norm": 1.078125,
"learning_rate": 1.7725307899010586e-06,
"loss": 0.4322,
"num_input_tokens_seen": 189368320,
"step": 1445,
"train_runtime": 6306.9813,
"train_tokens_per_second": 30025.192
},
{
"epoch": 1.7661388550548112,
"grad_norm": 1.0625,
"learning_rate": 1.6851463867675305e-06,
"loss": 0.4276,
"num_input_tokens_seen": 190023680,
"step": 1450,
"train_runtime": 6328.7332,
"train_tokens_per_second": 30025.548
},
{
"epoch": 1.772228989037759,
"grad_norm": 1.0703125,
"learning_rate": 1.599895631911405e-06,
"loss": 0.4266,
"num_input_tokens_seen": 190679040,
"step": 1455,
"train_runtime": 6350.4722,
"train_tokens_per_second": 30025.962
},
{
"epoch": 1.7783191230207065,
"grad_norm": 1.046875,
"learning_rate": 1.5167863270169448e-06,
"loss": 0.4233,
"num_input_tokens_seen": 191334400,
"step": 1460,
"train_runtime": 6372.1834,
"train_tokens_per_second": 30026.506
},
{
"epoch": 1.784409257003654,
"grad_norm": 1.046875,
"learning_rate": 1.435826077794572e-06,
"loss": 0.4202,
"num_input_tokens_seen": 191989760,
"step": 1465,
"train_runtime": 6393.9217,
"train_tokens_per_second": 30026.918
},
{
"epoch": 1.7904993909866018,
"grad_norm": 1.09375,
"learning_rate": 1.3570222932848514e-06,
"loss": 0.429,
"num_input_tokens_seen": 192645120,
"step": 1470,
"train_runtime": 6415.6575,
"train_tokens_per_second": 30027.339
},
{
"epoch": 1.7965895249695494,
"grad_norm": 1.0546875,
"learning_rate": 1.2803821851804677e-06,
"loss": 0.4373,
"num_input_tokens_seen": 193300480,
"step": 1475,
"train_runtime": 6437.4201,
"train_tokens_per_second": 30027.632
},
{
"epoch": 1.802679658952497,
"grad_norm": 1.1953125,
"learning_rate": 1.2059127671662285e-06,
"loss": 0.4318,
"num_input_tokens_seen": 193955840,
"step": 1480,
"train_runtime": 6459.1614,
"train_tokens_per_second": 30028.022
},
{
"epoch": 1.8087697929354447,
"grad_norm": 1.046875,
"learning_rate": 1.1336208542772147e-06,
"loss": 0.4266,
"num_input_tokens_seen": 194611200,
"step": 1485,
"train_runtime": 6480.9309,
"train_tokens_per_second": 30028.279
},
{
"epoch": 1.814859926918392,
"grad_norm": 1.0234375,
"learning_rate": 1.0635130622751343e-06,
"loss": 0.4203,
"num_input_tokens_seen": 195266560,
"step": 1490,
"train_runtime": 6502.6538,
"train_tokens_per_second": 30028.749
},
{
"epoch": 1.8209500609013398,
"grad_norm": 1.078125,
"learning_rate": 9.955958070428344e-07,
"loss": 0.4189,
"num_input_tokens_seen": 195921920,
"step": 1495,
"train_runtime": 6524.4119,
"train_tokens_per_second": 30029.055
},
{
"epoch": 1.8270401948842876,
"grad_norm": 1.0703125,
"learning_rate": 9.298753039971964e-07,
"loss": 0.431,
"num_input_tokens_seen": 196577280,
"step": 1500,
"train_runtime": 6546.1214,
"train_tokens_per_second": 30029.58
},
{
"epoch": 1.833130328867235,
"grad_norm": 1.0390625,
"learning_rate": 8.663575675203151e-07,
"loss": 0.4204,
"num_input_tokens_seen": 197232640,
"step": 1505,
"train_runtime": 6575.1193,
"train_tokens_per_second": 29996.815
},
{
"epoch": 1.8392204628501827,
"grad_norm": 1.0859375,
"learning_rate": 8.050484104090927e-07,
"loss": 0.4226,
"num_input_tokens_seen": 197888000,
"step": 1510,
"train_runtime": 6596.8218,
"train_tokens_per_second": 29997.475
},
{
"epoch": 1.8453105968331305,
"grad_norm": 1.0625,
"learning_rate": 7.459534433433085e-07,
"loss": 0.4262,
"num_input_tokens_seen": 198543360,
"step": 1515,
"train_runtime": 6618.5414,
"train_tokens_per_second": 29998.054
},
{
"epoch": 1.8514007308160778,
"grad_norm": 1.0546875,
"learning_rate": 6.890780743721209e-07,
"loss": 0.4272,
"num_input_tokens_seen": 199198720,
"step": 1520,
"train_runtime": 6640.1962,
"train_tokens_per_second": 29998.921
},
{
"epoch": 1.8574908647990256,
"grad_norm": 1.0703125,
"learning_rate": 6.344275084191886e-07,
"loss": 0.4257,
"num_input_tokens_seen": 199854080,
"step": 1525,
"train_runtime": 6661.9288,
"train_tokens_per_second": 29999.432
},
{
"epoch": 1.8635809987819734,
"grad_norm": 1.0546875,
"learning_rate": 5.820067468063212e-07,
"loss": 0.4249,
"num_input_tokens_seen": 200509440,
"step": 1530,
"train_runtime": 6683.6664,
"train_tokens_per_second": 29999.917
},
{
"epoch": 1.8696711327649207,
"grad_norm": 1.0390625,
"learning_rate": 5.318205867957893e-07,
"loss": 0.4277,
"num_input_tokens_seen": 201164800,
"step": 1535,
"train_runtime": 6705.4025,
"train_tokens_per_second": 30000.406
},
{
"epoch": 1.8757612667478685,
"grad_norm": 1.1015625,
"learning_rate": 4.838736211513233e-07,
"loss": 0.4282,
"num_input_tokens_seen": 201820160,
"step": 1540,
"train_runtime": 6727.1629,
"train_tokens_per_second": 30000.784
},
{
"epoch": 1.881851400730816,
"grad_norm": 1.0859375,
"learning_rate": 4.3817023771778596e-07,
"loss": 0.4298,
"num_input_tokens_seen": 202475520,
"step": 1545,
"train_runtime": 6749.1883,
"train_tokens_per_second": 29999.981
},
{
"epoch": 1.8879415347137636,
"grad_norm": 1.0625,
"learning_rate": 3.947146190196205e-07,
"loss": 0.4261,
"num_input_tokens_seen": 203130880,
"step": 1550,
"train_runtime": 6771.0741,
"train_tokens_per_second": 29999.802
},
{
"epoch": 1.8940316686967114,
"grad_norm": 1.0546875,
"learning_rate": 3.5351074187811586e-07,
"loss": 0.4294,
"num_input_tokens_seen": 203786240,
"step": 1555,
"train_runtime": 6793.1191,
"train_tokens_per_second": 29998.921
},
{
"epoch": 1.900121802679659,
"grad_norm": 1.0859375,
"learning_rate": 3.145623770474365e-07,
"loss": 0.4286,
"num_input_tokens_seen": 204441600,
"step": 1560,
"train_runtime": 6815.1261,
"train_tokens_per_second": 29998.212
},
{
"epoch": 1.9062119366626065,
"grad_norm": 1.0703125,
"learning_rate": 2.778730888695652e-07,
"loss": 0.4269,
"num_input_tokens_seen": 205096960,
"step": 1565,
"train_runtime": 6836.9748,
"train_tokens_per_second": 29998.203
},
{
"epoch": 1.9123020706455542,
"grad_norm": 1.0546875,
"learning_rate": 2.4344623494810814e-07,
"loss": 0.4283,
"num_input_tokens_seen": 205752320,
"step": 1570,
"train_runtime": 6859.0238,
"train_tokens_per_second": 29997.318
},
{
"epoch": 1.9183922046285018,
"grad_norm": 1.0859375,
"learning_rate": 2.1128496584102154e-07,
"loss": 0.4308,
"num_input_tokens_seen": 206407680,
"step": 1575,
"train_runtime": 6880.8878,
"train_tokens_per_second": 29997.245
},
{
"epoch": 1.9244823386114494,
"grad_norm": 1.09375,
"learning_rate": 1.8139222477229212e-07,
"loss": 0.4435,
"num_input_tokens_seen": 207063040,
"step": 1580,
"train_runtime": 6902.6722,
"train_tokens_per_second": 29997.519
},
{
"epoch": 1.9305724725943971,
"grad_norm": 1.0546875,
"learning_rate": 1.5377074736259155e-07,
"loss": 0.427,
"num_input_tokens_seen": 207718400,
"step": 1585,
"train_runtime": 6924.4409,
"train_tokens_per_second": 29997.859
},
{
"epoch": 1.9366626065773447,
"grad_norm": 1.0625,
"learning_rate": 1.2842306137892392e-07,
"loss": 0.4222,
"num_input_tokens_seen": 208373760,
"step": 1590,
"train_runtime": 6946.2191,
"train_tokens_per_second": 29998.156
},
{
"epoch": 1.9427527405602922,
"grad_norm": 1.046875,
"learning_rate": 1.0535148650330228e-07,
"loss": 0.4234,
"num_input_tokens_seen": 209029120,
"step": 1595,
"train_runtime": 6968.3008,
"train_tokens_per_second": 29997.144
},
{
"epoch": 1.94884287454324,
"grad_norm": 1.1484375,
"learning_rate": 8.455813412046042e-08,
"loss": 0.4268,
"num_input_tokens_seen": 209684480,
"step": 1600,
"train_runtime": 6990.0361,
"train_tokens_per_second": 29997.625
},
{
"epoch": 1.9549330085261876,
"grad_norm": 1.078125,
"learning_rate": 6.604490712463218e-08,
"loss": 0.4307,
"num_input_tokens_seen": 210339840,
"step": 1605,
"train_runtime": 7011.7811,
"train_tokens_per_second": 29998.062
},
{
"epoch": 1.9610231425091351,
"grad_norm": 1.1015625,
"learning_rate": 4.98134997454075e-08,
"loss": 0.4234,
"num_input_tokens_seen": 210995200,
"step": 1610,
"train_runtime": 7033.5419,
"train_tokens_per_second": 29998.428
},
{
"epoch": 1.967113276492083,
"grad_norm": 1.078125,
"learning_rate": 3.5865397392684244e-08,
"loss": 0.4349,
"num_input_tokens_seen": 211650560,
"step": 1615,
"train_runtime": 7055.3037,
"train_tokens_per_second": 29998.788
},
{
"epoch": 1.9732034104750305,
"grad_norm": 1.0703125,
"learning_rate": 2.420187652074357e-08,
"loss": 0.4288,
"num_input_tokens_seen": 212305920,
"step": 1620,
"train_runtime": 7077.0456,
"train_tokens_per_second": 29999.23
},
{
"epoch": 1.979293544457978,
"grad_norm": 1.0859375,
"learning_rate": 1.4824004511415634e-08,
"loss": 0.4133,
"num_input_tokens_seen": 212961280,
"step": 1625,
"train_runtime": 7098.7568,
"train_tokens_per_second": 29999.799
},
{
"epoch": 1.9853836784409258,
"grad_norm": 1.078125,
"learning_rate": 7.732639576413236e-09,
"loss": 0.4267,
"num_input_tokens_seen": 213616640,
"step": 1630,
"train_runtime": 7120.4765,
"train_tokens_per_second": 30000.329
},
{
"epoch": 1.9914738124238733,
"grad_norm": 1.125,
"learning_rate": 2.9284306787918937e-09,
"loss": 0.4308,
"num_input_tokens_seen": 214272000,
"step": 1635,
"train_runtime": 7142.2064,
"train_tokens_per_second": 30000.813
},
{
"epoch": 1.997563946406821,
"grad_norm": 1.0703125,
"learning_rate": 4.118174735529001e-10,
"loss": 0.4244,
"num_input_tokens_seen": 214927360,
"step": 1640,
"train_runtime": 7163.9456,
"train_tokens_per_second": 30001.255
},
{
"epoch": 2.0,
"num_input_tokens_seen": 215158784,
"step": 1642,
"total_flos": 9.00950810931757e+17,
"train_loss": 0.6156596739239872,
"train_runtime": 7180.2112,
"train_samples_per_second": 14.632,
"train_steps_per_second": 0.229
}
],
"logging_steps": 5,
"max_steps": 1642,
"num_input_tokens_seen": 215158784,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 9.00950810931757e+17,
"train_batch_size": 16,
"trial_name": null,
"trial_params": null
}