Llama8B_mathinstruct_SFT / trainer_state.json
Changahou's picture
Upload model
05dbd61 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 500,
"global_step": 6252,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0023999520009599807,
"grad_norm": 0.49803411960601807,
"learning_rate": 4.999994949996767e-05,
"loss": 0.9716,
"num_input_tokens_seen": 54328,
"step": 5,
"train_runtime": 8.3772,
"train_tokens_per_second": 6485.207
},
{
"epoch": 0.004799904001919961,
"grad_norm": 0.4587724804878235,
"learning_rate": 4.9999744343936e-05,
"loss": 0.9705,
"num_input_tokens_seen": 108376,
"step": 10,
"train_runtime": 16.0234,
"train_tokens_per_second": 6763.598
},
{
"epoch": 0.007199856002879942,
"grad_norm": 0.3823186159133911,
"learning_rate": 4.999938137694701e-05,
"loss": 0.8622,
"num_input_tokens_seen": 163816,
"step": 15,
"train_runtime": 24.1319,
"train_tokens_per_second": 6788.347
},
{
"epoch": 0.009599808003839923,
"grad_norm": 0.3449329733848572,
"learning_rate": 4.999886060129194e-05,
"loss": 0.8309,
"num_input_tokens_seen": 225216,
"step": 20,
"train_runtime": 33.0832,
"train_tokens_per_second": 6807.559
},
{
"epoch": 0.011999760004799903,
"grad_norm": 0.4700082242488861,
"learning_rate": 4.999818202025819e-05,
"loss": 0.8118,
"num_input_tokens_seen": 279480,
"step": 25,
"train_runtime": 40.5384,
"train_tokens_per_second": 6894.209
},
{
"epoch": 0.014399712005759884,
"grad_norm": 0.3817142844200134,
"learning_rate": 4.999734563812929e-05,
"loss": 0.7763,
"num_input_tokens_seen": 337816,
"step": 30,
"train_runtime": 48.8916,
"train_tokens_per_second": 6909.488
},
{
"epoch": 0.016799664006719867,
"grad_norm": 0.33569249510765076,
"learning_rate": 4.9996351460184923e-05,
"loss": 0.7919,
"num_input_tokens_seen": 394952,
"step": 35,
"train_runtime": 57.353,
"train_tokens_per_second": 6886.34
},
{
"epoch": 0.019199616007679846,
"grad_norm": 0.29491692781448364,
"learning_rate": 4.9995199492700826e-05,
"loss": 0.7095,
"num_input_tokens_seen": 454608,
"step": 40,
"train_runtime": 65.8361,
"train_tokens_per_second": 6905.143
},
{
"epoch": 0.021599568008639828,
"grad_norm": 0.3096805810928345,
"learning_rate": 4.9993889742948806e-05,
"loss": 0.7347,
"num_input_tokens_seen": 510256,
"step": 45,
"train_runtime": 74.0444,
"train_tokens_per_second": 6891.219
},
{
"epoch": 0.023999520009599807,
"grad_norm": 0.3358306884765625,
"learning_rate": 4.9992422219196656e-05,
"loss": 0.7461,
"num_input_tokens_seen": 568112,
"step": 50,
"train_runtime": 82.4236,
"train_tokens_per_second": 6892.587
},
{
"epoch": 0.02639947201055979,
"grad_norm": 0.3951747417449951,
"learning_rate": 4.9990796930708125e-05,
"loss": 0.7168,
"num_input_tokens_seen": 625896,
"step": 55,
"train_runtime": 90.7851,
"train_tokens_per_second": 6894.26
},
{
"epoch": 0.02879942401151977,
"grad_norm": 0.3642365038394928,
"learning_rate": 4.9989013887742856e-05,
"loss": 0.7117,
"num_input_tokens_seen": 677856,
"step": 60,
"train_runtime": 98.2631,
"train_tokens_per_second": 6898.377
},
{
"epoch": 0.03119937601247975,
"grad_norm": 0.3379388153553009,
"learning_rate": 4.998707310155631e-05,
"loss": 0.6441,
"num_input_tokens_seen": 740792,
"step": 65,
"train_runtime": 107.5229,
"train_tokens_per_second": 6889.624
},
{
"epoch": 0.03359932801343973,
"grad_norm": 0.46827253699302673,
"learning_rate": 4.99849745843997e-05,
"loss": 0.617,
"num_input_tokens_seen": 795784,
"step": 70,
"train_runtime": 115.3119,
"train_tokens_per_second": 6901.144
},
{
"epoch": 0.03599928001439971,
"grad_norm": 0.46408799290657043,
"learning_rate": 4.998271834951993e-05,
"loss": 0.685,
"num_input_tokens_seen": 852016,
"step": 75,
"train_runtime": 123.2024,
"train_tokens_per_second": 6915.578
},
{
"epoch": 0.03839923201535969,
"grad_norm": 0.4789453148841858,
"learning_rate": 4.998030441115949e-05,
"loss": 0.6505,
"num_input_tokens_seen": 909224,
"step": 80,
"train_runtime": 131.6277,
"train_tokens_per_second": 6907.542
},
{
"epoch": 0.04079918401631968,
"grad_norm": 0.40359923243522644,
"learning_rate": 4.9977732784556355e-05,
"loss": 0.6212,
"num_input_tokens_seen": 959568,
"step": 85,
"train_runtime": 139.0702,
"train_tokens_per_second": 6899.88
},
{
"epoch": 0.043199136017279656,
"grad_norm": 0.452709436416626,
"learning_rate": 4.997500348594394e-05,
"loss": 0.6978,
"num_input_tokens_seen": 1010696,
"step": 90,
"train_runtime": 146.8828,
"train_tokens_per_second": 6880.967
},
{
"epoch": 0.045599088018239635,
"grad_norm": 0.4287179112434387,
"learning_rate": 4.997211653255096e-05,
"loss": 0.6212,
"num_input_tokens_seen": 1067912,
"step": 95,
"train_runtime": 155.302,
"train_tokens_per_second": 6876.357
},
{
"epoch": 0.047999040019199614,
"grad_norm": 0.5242288112640381,
"learning_rate": 4.996907194260129e-05,
"loss": 0.6182,
"num_input_tokens_seen": 1127264,
"step": 100,
"train_runtime": 164.3956,
"train_tokens_per_second": 6857.02
},
{
"epoch": 0.0503989920201596,
"grad_norm": 0.31285974383354187,
"learning_rate": 4.996586973531394e-05,
"loss": 0.6254,
"num_input_tokens_seen": 1183208,
"step": 105,
"train_runtime": 172.5905,
"train_tokens_per_second": 6855.582
},
{
"epoch": 0.05279894402111958,
"grad_norm": 0.30165454745292664,
"learning_rate": 4.9962509930902836e-05,
"loss": 0.5758,
"num_input_tokens_seen": 1243128,
"step": 110,
"train_runtime": 181.2314,
"train_tokens_per_second": 6859.34
},
{
"epoch": 0.05519889602207956,
"grad_norm": 0.3959163725376129,
"learning_rate": 4.9958992550576754e-05,
"loss": 0.6427,
"num_input_tokens_seen": 1294648,
"step": 115,
"train_runtime": 188.848,
"train_tokens_per_second": 6855.503
},
{
"epoch": 0.05759884802303954,
"grad_norm": 0.44546279311180115,
"learning_rate": 4.9955317616539174e-05,
"loss": 0.6416,
"num_input_tokens_seen": 1349136,
"step": 120,
"train_runtime": 196.8055,
"train_tokens_per_second": 6855.173
},
{
"epoch": 0.05999880002399952,
"grad_norm": 0.5479788184165955,
"learning_rate": 4.9951485151988126e-05,
"loss": 0.6039,
"num_input_tokens_seen": 1403304,
"step": 125,
"train_runtime": 204.9341,
"train_tokens_per_second": 6847.585
},
{
"epoch": 0.0623987520249595,
"grad_norm": 0.46208852529525757,
"learning_rate": 4.994749518111604e-05,
"loss": 0.6365,
"num_input_tokens_seen": 1460712,
"step": 130,
"train_runtime": 212.8501,
"train_tokens_per_second": 6862.633
},
{
"epoch": 0.06479870402591949,
"grad_norm": 0.5154985189437866,
"learning_rate": 4.9943347729109646e-05,
"loss": 0.5757,
"num_input_tokens_seen": 1516920,
"step": 135,
"train_runtime": 221.1296,
"train_tokens_per_second": 6859.868
},
{
"epoch": 0.06719865602687947,
"grad_norm": 0.4509885311126709,
"learning_rate": 4.993904282214972e-05,
"loss": 0.6484,
"num_input_tokens_seen": 1569296,
"step": 140,
"train_runtime": 228.9245,
"train_tokens_per_second": 6855.081
},
{
"epoch": 0.06959860802783945,
"grad_norm": 0.47324448823928833,
"learning_rate": 4.993458048741102e-05,
"loss": 0.5967,
"num_input_tokens_seen": 1627720,
"step": 145,
"train_runtime": 237.6306,
"train_tokens_per_second": 6849.792
},
{
"epoch": 0.07199856002879942,
"grad_norm": 0.4491414427757263,
"learning_rate": 4.992996075306203e-05,
"loss": 0.6705,
"num_input_tokens_seen": 1680600,
"step": 150,
"train_runtime": 245.5875,
"train_tokens_per_second": 6843.181
},
{
"epoch": 0.0743985120297594,
"grad_norm": 0.5371958613395691,
"learning_rate": 4.992518364826484e-05,
"loss": 0.5925,
"num_input_tokens_seen": 1732368,
"step": 155,
"train_runtime": 253.2225,
"train_tokens_per_second": 6841.288
},
{
"epoch": 0.07679846403071938,
"grad_norm": 0.44730937480926514,
"learning_rate": 4.9920249203174945e-05,
"loss": 0.5695,
"num_input_tokens_seen": 1794680,
"step": 160,
"train_runtime": 262.1391,
"train_tokens_per_second": 6846.289
},
{
"epoch": 0.07919841603167936,
"grad_norm": 0.4398422837257385,
"learning_rate": 4.9915157448941044e-05,
"loss": 0.5549,
"num_input_tokens_seen": 1854040,
"step": 165,
"train_runtime": 270.3386,
"train_tokens_per_second": 6858.214
},
{
"epoch": 0.08159836803263935,
"grad_norm": 0.5156921148300171,
"learning_rate": 4.9909908417704835e-05,
"loss": 0.5701,
"num_input_tokens_seen": 1908808,
"step": 170,
"train_runtime": 278.3256,
"train_tokens_per_second": 6858.183
},
{
"epoch": 0.08399832003359933,
"grad_norm": 0.40140026807785034,
"learning_rate": 4.990450214260086e-05,
"loss": 0.5478,
"num_input_tokens_seen": 1966184,
"step": 175,
"train_runtime": 286.7588,
"train_tokens_per_second": 6856.577
},
{
"epoch": 0.08639827203455931,
"grad_norm": 0.5238102674484253,
"learning_rate": 4.9898938657756234e-05,
"loss": 0.5816,
"num_input_tokens_seen": 2023280,
"step": 180,
"train_runtime": 295.3163,
"train_tokens_per_second": 6851.231
},
{
"epoch": 0.08879822403551929,
"grad_norm": 0.4058316648006439,
"learning_rate": 4.989321799829048e-05,
"loss": 0.6243,
"num_input_tokens_seen": 2079160,
"step": 185,
"train_runtime": 303.4477,
"train_tokens_per_second": 6851.791
},
{
"epoch": 0.09119817603647927,
"grad_norm": 0.48315656185150146,
"learning_rate": 4.988734020031527e-05,
"loss": 0.5903,
"num_input_tokens_seen": 2129480,
"step": 190,
"train_runtime": 310.6729,
"train_tokens_per_second": 6854.413
},
{
"epoch": 0.09359812803743925,
"grad_norm": 0.49458763003349304,
"learning_rate": 4.9881305300934225e-05,
"loss": 0.5232,
"num_input_tokens_seen": 2189160,
"step": 195,
"train_runtime": 319.1985,
"train_tokens_per_second": 6858.302
},
{
"epoch": 0.09599808003839923,
"grad_norm": 0.3490532338619232,
"learning_rate": 4.987511333824266e-05,
"loss": 0.5846,
"num_input_tokens_seen": 2247704,
"step": 200,
"train_runtime": 327.3152,
"train_tokens_per_second": 6867.093
},
{
"epoch": 0.0983980320393592,
"grad_norm": 0.41308099031448364,
"learning_rate": 4.986876435132736e-05,
"loss": 0.589,
"num_input_tokens_seen": 2307040,
"step": 205,
"train_runtime": 335.8335,
"train_tokens_per_second": 6869.595
},
{
"epoch": 0.1007979840403192,
"grad_norm": 0.4715804159641266,
"learning_rate": 4.9862258380266325e-05,
"loss": 0.5737,
"num_input_tokens_seen": 2357488,
"step": 210,
"train_runtime": 343.3468,
"train_tokens_per_second": 6866.201
},
{
"epoch": 0.10319793604127918,
"grad_norm": 0.35753390192985535,
"learning_rate": 4.985559546612851e-05,
"loss": 0.6138,
"num_input_tokens_seen": 2414064,
"step": 215,
"train_runtime": 351.1882,
"train_tokens_per_second": 6873.99
},
{
"epoch": 0.10559788804223916,
"grad_norm": 0.44587111473083496,
"learning_rate": 4.984877565097359e-05,
"loss": 0.5923,
"num_input_tokens_seen": 2472744,
"step": 220,
"train_runtime": 359.483,
"train_tokens_per_second": 6878.611
},
{
"epoch": 0.10799784004319914,
"grad_norm": 0.47545069456100464,
"learning_rate": 4.984179897785166e-05,
"loss": 0.55,
"num_input_tokens_seen": 2529024,
"step": 225,
"train_runtime": 367.4645,
"train_tokens_per_second": 6882.364
},
{
"epoch": 0.11039779204415912,
"grad_norm": 0.5392165184020996,
"learning_rate": 4.983466549080299e-05,
"loss": 0.618,
"num_input_tokens_seen": 2584864,
"step": 230,
"train_runtime": 375.6998,
"train_tokens_per_second": 6880.132
},
{
"epoch": 0.1127977440451191,
"grad_norm": 0.6994487047195435,
"learning_rate": 4.9827375234857735e-05,
"loss": 0.6055,
"num_input_tokens_seen": 2638696,
"step": 235,
"train_runtime": 383.3187,
"train_tokens_per_second": 6883.818
},
{
"epoch": 0.11519769604607907,
"grad_norm": 0.5480724573135376,
"learning_rate": 4.981992825603566e-05,
"loss": 0.5962,
"num_input_tokens_seen": 2699160,
"step": 240,
"train_runtime": 391.6683,
"train_tokens_per_second": 6891.444
},
{
"epoch": 0.11759764804703907,
"grad_norm": 0.49630582332611084,
"learning_rate": 4.981232460134584e-05,
"loss": 0.5556,
"num_input_tokens_seen": 2756440,
"step": 245,
"train_runtime": 400.0318,
"train_tokens_per_second": 6890.551
},
{
"epoch": 0.11999760004799905,
"grad_norm": 0.48846226930618286,
"learning_rate": 4.980456431878636e-05,
"loss": 0.6064,
"num_input_tokens_seen": 2811584,
"step": 250,
"train_runtime": 408.3053,
"train_tokens_per_second": 6885.985
},
{
"epoch": 0.12239755204895902,
"grad_norm": 0.7514108419418335,
"learning_rate": 4.9796647457344034e-05,
"loss": 0.622,
"num_input_tokens_seen": 2864600,
"step": 255,
"train_runtime": 416.1151,
"train_tokens_per_second": 6884.152
},
{
"epoch": 0.124797504049919,
"grad_norm": 0.45766520500183105,
"learning_rate": 4.9788574066994074e-05,
"loss": 0.5792,
"num_input_tokens_seen": 2920384,
"step": 260,
"train_runtime": 424.193,
"train_tokens_per_second": 6884.564
},
{
"epoch": 0.12719745605087898,
"grad_norm": 0.40784621238708496,
"learning_rate": 4.978034419869977e-05,
"loss": 0.5464,
"num_input_tokens_seen": 2976400,
"step": 265,
"train_runtime": 432.5949,
"train_tokens_per_second": 6880.34
},
{
"epoch": 0.12959740805183897,
"grad_norm": 0.5691152811050415,
"learning_rate": 4.977195790441219e-05,
"loss": 0.5769,
"num_input_tokens_seen": 3031640,
"step": 270,
"train_runtime": 440.75,
"train_tokens_per_second": 6878.366
},
{
"epoch": 0.13199736005279894,
"grad_norm": 0.509024977684021,
"learning_rate": 4.976341523706986e-05,
"loss": 0.5853,
"num_input_tokens_seen": 3088304,
"step": 275,
"train_runtime": 448.5804,
"train_tokens_per_second": 6884.616
},
{
"epoch": 0.13439731205375893,
"grad_norm": 0.5476660132408142,
"learning_rate": 4.975471625059837e-05,
"loss": 0.5715,
"num_input_tokens_seen": 3146984,
"step": 280,
"train_runtime": 456.8318,
"train_tokens_per_second": 6888.716
},
{
"epoch": 0.1367972640547189,
"grad_norm": 0.41494348645210266,
"learning_rate": 4.9745860999910093e-05,
"loss": 0.5492,
"num_input_tokens_seen": 3206416,
"step": 285,
"train_runtime": 465.4275,
"train_tokens_per_second": 6889.185
},
{
"epoch": 0.1391972160556789,
"grad_norm": 0.4294047951698303,
"learning_rate": 4.973684954090384e-05,
"loss": 0.6008,
"num_input_tokens_seen": 3263920,
"step": 290,
"train_runtime": 473.6647,
"train_tokens_per_second": 6890.782
},
{
"epoch": 0.14159716805663886,
"grad_norm": 0.673201858997345,
"learning_rate": 4.972768193046446e-05,
"loss": 0.5588,
"num_input_tokens_seen": 3318032,
"step": 295,
"train_runtime": 481.6549,
"train_tokens_per_second": 6888.816
},
{
"epoch": 0.14399712005759885,
"grad_norm": 0.6196733117103577,
"learning_rate": 4.971835822646254e-05,
"loss": 0.5692,
"num_input_tokens_seen": 3373136,
"step": 300,
"train_runtime": 489.8925,
"train_tokens_per_second": 6885.461
},
{
"epoch": 0.14639707205855884,
"grad_norm": 0.5182610154151917,
"learning_rate": 4.9708878487753976e-05,
"loss": 0.5801,
"num_input_tokens_seen": 3428032,
"step": 305,
"train_runtime": 497.9042,
"train_tokens_per_second": 6884.923
},
{
"epoch": 0.1487970240595188,
"grad_norm": 0.5710193514823914,
"learning_rate": 4.969924277417963e-05,
"loss": 0.5601,
"num_input_tokens_seen": 3482432,
"step": 310,
"train_runtime": 505.9162,
"train_tokens_per_second": 6883.416
},
{
"epoch": 0.1511969760604788,
"grad_norm": 0.5431010127067566,
"learning_rate": 4.968945114656499e-05,
"loss": 0.6167,
"num_input_tokens_seen": 3540200,
"step": 315,
"train_runtime": 513.822,
"train_tokens_per_second": 6889.935
},
{
"epoch": 0.15359692806143876,
"grad_norm": 0.5962916016578674,
"learning_rate": 4.967950366671973e-05,
"loss": 0.5528,
"num_input_tokens_seen": 3590376,
"step": 320,
"train_runtime": 521.033,
"train_tokens_per_second": 6890.88
},
{
"epoch": 0.15599688006239876,
"grad_norm": 0.43872061371803284,
"learning_rate": 4.966940039743734e-05,
"loss": 0.582,
"num_input_tokens_seen": 3650392,
"step": 325,
"train_runtime": 529.3248,
"train_tokens_per_second": 6896.317
},
{
"epoch": 0.15839683206335872,
"grad_norm": 0.6549321413040161,
"learning_rate": 4.965914140249475e-05,
"loss": 0.6262,
"num_input_tokens_seen": 3700960,
"step": 330,
"train_runtime": 537.0024,
"train_tokens_per_second": 6891.887
},
{
"epoch": 0.16079678406431872,
"grad_norm": 0.49688732624053955,
"learning_rate": 4.9648726746651875e-05,
"loss": 0.555,
"num_input_tokens_seen": 3757192,
"step": 335,
"train_runtime": 544.9852,
"train_tokens_per_second": 6894.117
},
{
"epoch": 0.1631967360652787,
"grad_norm": 0.600683331489563,
"learning_rate": 4.9638156495651265e-05,
"loss": 0.5747,
"num_input_tokens_seen": 3812168,
"step": 340,
"train_runtime": 552.5883,
"train_tokens_per_second": 6898.749
},
{
"epoch": 0.16559668806623867,
"grad_norm": 0.506166398525238,
"learning_rate": 4.9627430716217674e-05,
"loss": 0.562,
"num_input_tokens_seen": 3873432,
"step": 345,
"train_runtime": 561.3753,
"train_tokens_per_second": 6899.898
},
{
"epoch": 0.16799664006719867,
"grad_norm": 0.5933504104614258,
"learning_rate": 4.96165494760576e-05,
"loss": 0.5751,
"num_input_tokens_seen": 3928216,
"step": 350,
"train_runtime": 569.6058,
"train_tokens_per_second": 6896.377
},
{
"epoch": 0.17039659206815863,
"grad_norm": 0.7012840509414673,
"learning_rate": 4.96055128438589e-05,
"loss": 0.5283,
"num_input_tokens_seen": 3985672,
"step": 355,
"train_runtime": 578.024,
"train_tokens_per_second": 6895.339
},
{
"epoch": 0.17279654406911862,
"grad_norm": 0.5886171460151672,
"learning_rate": 4.959432088929036e-05,
"loss": 0.5688,
"num_input_tokens_seen": 4042336,
"step": 360,
"train_runtime": 586.1104,
"train_tokens_per_second": 6896.885
},
{
"epoch": 0.1751964960700786,
"grad_norm": 0.6454927325248718,
"learning_rate": 4.958297368300122e-05,
"loss": 0.5236,
"num_input_tokens_seen": 4097248,
"step": 365,
"train_runtime": 594.2204,
"train_tokens_per_second": 6895.165
},
{
"epoch": 0.17759644807103858,
"grad_norm": 0.48636892437934875,
"learning_rate": 4.957147129662074e-05,
"loss": 0.5569,
"num_input_tokens_seen": 4152816,
"step": 370,
"train_runtime": 602.2577,
"train_tokens_per_second": 6895.413
},
{
"epoch": 0.17999640007199855,
"grad_norm": 0.5636932253837585,
"learning_rate": 4.9559813802757785e-05,
"loss": 0.5558,
"num_input_tokens_seen": 4210824,
"step": 375,
"train_runtime": 610.6583,
"train_tokens_per_second": 6895.549
},
{
"epoch": 0.18239635207295854,
"grad_norm": 0.4750101864337921,
"learning_rate": 4.954800127500031e-05,
"loss": 0.5055,
"num_input_tokens_seen": 4263672,
"step": 380,
"train_runtime": 618.8445,
"train_tokens_per_second": 6889.73
},
{
"epoch": 0.18479630407391853,
"grad_norm": 0.6123194694519043,
"learning_rate": 4.953603378791493e-05,
"loss": 0.5524,
"num_input_tokens_seen": 4319024,
"step": 385,
"train_runtime": 626.7904,
"train_tokens_per_second": 6890.699
},
{
"epoch": 0.1871962560748785,
"grad_norm": 0.49063947796821594,
"learning_rate": 4.952391141704644e-05,
"loss": 0.5653,
"num_input_tokens_seen": 4377064,
"step": 390,
"train_runtime": 634.9033,
"train_tokens_per_second": 6894.064
},
{
"epoch": 0.1895962080758385,
"grad_norm": 0.5559214949607849,
"learning_rate": 4.951163423891735e-05,
"loss": 0.6034,
"num_input_tokens_seen": 4434984,
"step": 395,
"train_runtime": 643.2672,
"train_tokens_per_second": 6894.466
},
{
"epoch": 0.19199616007679846,
"grad_norm": 0.3978354334831238,
"learning_rate": 4.949920233102736e-05,
"loss": 0.5667,
"num_input_tokens_seen": 4492368,
"step": 400,
"train_runtime": 651.0435,
"train_tokens_per_second": 6900.258
},
{
"epoch": 0.19439611207775845,
"grad_norm": 0.5354523658752441,
"learning_rate": 4.948661577185295e-05,
"loss": 0.5445,
"num_input_tokens_seen": 4549008,
"step": 405,
"train_runtime": 659.5277,
"train_tokens_per_second": 6897.372
},
{
"epoch": 0.1967960640787184,
"grad_norm": 0.4975457191467285,
"learning_rate": 4.947387464084679e-05,
"loss": 0.5462,
"num_input_tokens_seen": 4609072,
"step": 410,
"train_runtime": 668.316,
"train_tokens_per_second": 6896.546
},
{
"epoch": 0.1991960160796784,
"grad_norm": 0.5424690246582031,
"learning_rate": 4.9460979018437314e-05,
"loss": 0.5855,
"num_input_tokens_seen": 4662560,
"step": 415,
"train_runtime": 676.3317,
"train_tokens_per_second": 6893.895
},
{
"epoch": 0.2015959680806384,
"grad_norm": 0.5656135678291321,
"learning_rate": 4.944792898602818e-05,
"loss": 0.5909,
"num_input_tokens_seen": 4719248,
"step": 420,
"train_runtime": 684.592,
"train_tokens_per_second": 6893.519
},
{
"epoch": 0.20399592008159836,
"grad_norm": 0.4792700409889221,
"learning_rate": 4.943472462599775e-05,
"loss": 0.5211,
"num_input_tokens_seen": 4774096,
"step": 425,
"train_runtime": 692.7133,
"train_tokens_per_second": 6891.879
},
{
"epoch": 0.20639587208255836,
"grad_norm": 0.5212066173553467,
"learning_rate": 4.942136602169858e-05,
"loss": 0.5245,
"num_input_tokens_seen": 4832616,
"step": 430,
"train_runtime": 700.7277,
"train_tokens_per_second": 6896.567
},
{
"epoch": 0.20879582408351832,
"grad_norm": 0.5669515132904053,
"learning_rate": 4.94078532574569e-05,
"loss": 0.5304,
"num_input_tokens_seen": 4887208,
"step": 435,
"train_runtime": 708.8314,
"train_tokens_per_second": 6894.74
},
{
"epoch": 0.21119577608447831,
"grad_norm": 0.6369892358779907,
"learning_rate": 4.939418641857209e-05,
"loss": 0.5879,
"num_input_tokens_seen": 4942504,
"step": 440,
"train_runtime": 716.88,
"train_tokens_per_second": 6894.464
},
{
"epoch": 0.21359572808543828,
"grad_norm": 0.5132316946983337,
"learning_rate": 4.938036559131608e-05,
"loss": 0.5854,
"num_input_tokens_seen": 4997880,
"step": 445,
"train_runtime": 724.9035,
"train_tokens_per_second": 6894.546
},
{
"epoch": 0.21599568008639827,
"grad_norm": 0.5846990942955017,
"learning_rate": 4.9366390862932896e-05,
"loss": 0.5545,
"num_input_tokens_seen": 5060096,
"step": 450,
"train_runtime": 733.2702,
"train_tokens_per_second": 6900.725
},
{
"epoch": 0.21839563208735827,
"grad_norm": 0.5361617803573608,
"learning_rate": 4.9352262321638056e-05,
"loss": 0.528,
"num_input_tokens_seen": 5120168,
"step": 455,
"train_runtime": 741.6463,
"train_tokens_per_second": 6903.787
},
{
"epoch": 0.22079558408831823,
"grad_norm": 0.6068050265312195,
"learning_rate": 4.9337980056618006e-05,
"loss": 0.5462,
"num_input_tokens_seen": 5175776,
"step": 460,
"train_runtime": 750.017,
"train_tokens_per_second": 6900.878
},
{
"epoch": 0.22319553608927822,
"grad_norm": 0.6304349899291992,
"learning_rate": 4.932354415802959e-05,
"loss": 0.5399,
"num_input_tokens_seen": 5232032,
"step": 465,
"train_runtime": 758.2013,
"train_tokens_per_second": 6900.584
},
{
"epoch": 0.2255954880902382,
"grad_norm": 0.5615517497062683,
"learning_rate": 4.9308954716999464e-05,
"loss": 0.5224,
"num_input_tokens_seen": 5292688,
"step": 470,
"train_runtime": 766.6597,
"train_tokens_per_second": 6903.569
},
{
"epoch": 0.22799544009119818,
"grad_norm": 0.7061598896980286,
"learning_rate": 4.92942118256235e-05,
"loss": 0.5335,
"num_input_tokens_seen": 5353096,
"step": 475,
"train_runtime": 775.3136,
"train_tokens_per_second": 6904.427
},
{
"epoch": 0.23039539209215815,
"grad_norm": 0.6964676976203918,
"learning_rate": 4.9279315576966265e-05,
"loss": 0.4755,
"num_input_tokens_seen": 5412360,
"step": 480,
"train_runtime": 784.1147,
"train_tokens_per_second": 6902.511
},
{
"epoch": 0.23279534409311814,
"grad_norm": 0.6583765745162964,
"learning_rate": 4.926426606506036e-05,
"loss": 0.5725,
"num_input_tokens_seen": 5466664,
"step": 485,
"train_runtime": 792.1938,
"train_tokens_per_second": 6900.665
},
{
"epoch": 0.23519529609407813,
"grad_norm": 0.6751510500907898,
"learning_rate": 4.924906338490586e-05,
"loss": 0.5181,
"num_input_tokens_seen": 5526480,
"step": 490,
"train_runtime": 800.9788,
"train_tokens_per_second": 6899.658
},
{
"epoch": 0.2375952480950381,
"grad_norm": 0.5503116250038147,
"learning_rate": 4.9233707632469746e-05,
"loss": 0.5586,
"num_input_tokens_seen": 5579704,
"step": 495,
"train_runtime": 808.8081,
"train_tokens_per_second": 6898.674
},
{
"epoch": 0.2399952000959981,
"grad_norm": 0.5688736438751221,
"learning_rate": 4.921819890468523e-05,
"loss": 0.5465,
"num_input_tokens_seen": 5633168,
"step": 500,
"train_runtime": 816.7042,
"train_tokens_per_second": 6897.44
},
{
"epoch": 0.24239515209695806,
"grad_norm": 0.48173242807388306,
"learning_rate": 4.9202537299451215e-05,
"loss": 0.488,
"num_input_tokens_seen": 5692232,
"step": 505,
"train_runtime": 825.1134,
"train_tokens_per_second": 6898.727
},
{
"epoch": 0.24479510409791805,
"grad_norm": 0.5660738945007324,
"learning_rate": 4.9186722915631626e-05,
"loss": 0.5354,
"num_input_tokens_seen": 5751464,
"step": 510,
"train_runtime": 833.8275,
"train_tokens_per_second": 6897.666
},
{
"epoch": 0.247195056098878,
"grad_norm": 0.5903744697570801,
"learning_rate": 4.9170755853054806e-05,
"loss": 0.6093,
"num_input_tokens_seen": 5811696,
"step": 515,
"train_runtime": 842.0821,
"train_tokens_per_second": 6901.579
},
{
"epoch": 0.249595008099838,
"grad_norm": 0.6396485567092896,
"learning_rate": 4.915463621251287e-05,
"loss": 0.5436,
"num_input_tokens_seen": 5867200,
"step": 520,
"train_runtime": 850.626,
"train_tokens_per_second": 6897.509
},
{
"epoch": 0.25199496010079797,
"grad_norm": 0.5617818236351013,
"learning_rate": 4.913836409576112e-05,
"loss": 0.5537,
"num_input_tokens_seen": 5924320,
"step": 525,
"train_runtime": 858.7807,
"train_tokens_per_second": 6898.525
},
{
"epoch": 0.25439491210175796,
"grad_norm": 0.6151410937309265,
"learning_rate": 4.912193960551732e-05,
"loss": 0.5392,
"num_input_tokens_seen": 5979680,
"step": 530,
"train_runtime": 866.7277,
"train_tokens_per_second": 6899.145
},
{
"epoch": 0.25679486410271796,
"grad_norm": 0.6780862808227539,
"learning_rate": 4.9105362845461114e-05,
"loss": 0.5776,
"num_input_tokens_seen": 6037568,
"step": 535,
"train_runtime": 874.7881,
"train_tokens_per_second": 6901.749
},
{
"epoch": 0.25919481610367795,
"grad_norm": 0.6386091113090515,
"learning_rate": 4.9088633920233345e-05,
"loss": 0.5463,
"num_input_tokens_seen": 6092712,
"step": 540,
"train_runtime": 883.1921,
"train_tokens_per_second": 6898.513
},
{
"epoch": 0.2615947681046379,
"grad_norm": 0.49828580021858215,
"learning_rate": 4.907175293543541e-05,
"loss": 0.6055,
"num_input_tokens_seen": 6147664,
"step": 545,
"train_runtime": 891.085,
"train_tokens_per_second": 6899.077
},
{
"epoch": 0.2639947201055979,
"grad_norm": 0.5254030227661133,
"learning_rate": 4.905471999762857e-05,
"loss": 0.6124,
"num_input_tokens_seen": 6199352,
"step": 550,
"train_runtime": 898.7767,
"train_tokens_per_second": 6897.544
},
{
"epoch": 0.2663946721065579,
"grad_norm": 0.519650936126709,
"learning_rate": 4.9037535214333287e-05,
"loss": 0.5247,
"num_input_tokens_seen": 6255144,
"step": 555,
"train_runtime": 906.8762,
"train_tokens_per_second": 6897.462
},
{
"epoch": 0.26879462410751787,
"grad_norm": 0.568850040435791,
"learning_rate": 4.9020198694028565e-05,
"loss": 0.5647,
"num_input_tokens_seen": 6306704,
"step": 560,
"train_runtime": 914.4502,
"train_tokens_per_second": 6896.717
},
{
"epoch": 0.2711945761084778,
"grad_norm": 0.47335347533226013,
"learning_rate": 4.900271054615123e-05,
"loss": 0.4978,
"num_input_tokens_seen": 6366360,
"step": 565,
"train_runtime": 923.5165,
"train_tokens_per_second": 6893.607
},
{
"epoch": 0.2735945281094378,
"grad_norm": 0.6809021830558777,
"learning_rate": 4.898507088109527e-05,
"loss": 0.545,
"num_input_tokens_seen": 6421288,
"step": 570,
"train_runtime": 931.4592,
"train_tokens_per_second": 6893.794
},
{
"epoch": 0.2759944801103978,
"grad_norm": 0.41399407386779785,
"learning_rate": 4.8967279810211114e-05,
"loss": 0.5454,
"num_input_tokens_seen": 6479424,
"step": 575,
"train_runtime": 939.8206,
"train_tokens_per_second": 6894.32
},
{
"epoch": 0.2783944321113578,
"grad_norm": 0.6248930096626282,
"learning_rate": 4.894933744580496e-05,
"loss": 0.5506,
"num_input_tokens_seen": 6534464,
"step": 580,
"train_runtime": 947.6162,
"train_tokens_per_second": 6895.686
},
{
"epoch": 0.2807943841123178,
"grad_norm": 0.5835601687431335,
"learning_rate": 4.893124390113802e-05,
"loss": 0.5536,
"num_input_tokens_seen": 6587088,
"step": 585,
"train_runtime": 955.2033,
"train_tokens_per_second": 6896.006
},
{
"epoch": 0.2831943361132777,
"grad_norm": 0.6930661797523499,
"learning_rate": 4.8912999290425854e-05,
"loss": 0.5646,
"num_input_tokens_seen": 6641552,
"step": 590,
"train_runtime": 963.1053,
"train_tokens_per_second": 6895.977
},
{
"epoch": 0.2855942881142377,
"grad_norm": 0.6734236478805542,
"learning_rate": 4.889460372883762e-05,
"loss": 0.5492,
"num_input_tokens_seen": 6695296,
"step": 595,
"train_runtime": 971.3483,
"train_tokens_per_second": 6892.786
},
{
"epoch": 0.2879942401151977,
"grad_norm": 0.5208594799041748,
"learning_rate": 4.887605733249535e-05,
"loss": 0.5629,
"num_input_tokens_seen": 6753000,
"step": 600,
"train_runtime": 979.3691,
"train_tokens_per_second": 6895.255
},
{
"epoch": 0.2903941921161577,
"grad_norm": 0.5543494820594788,
"learning_rate": 4.885736021847322e-05,
"loss": 0.5165,
"num_input_tokens_seen": 6808816,
"step": 605,
"train_runtime": 987.597,
"train_tokens_per_second": 6894.326
},
{
"epoch": 0.2927941441171177,
"grad_norm": 0.4651249051094055,
"learning_rate": 4.883851250479682e-05,
"loss": 0.5292,
"num_input_tokens_seen": 6866616,
"step": 610,
"train_runtime": 995.7307,
"train_tokens_per_second": 6896.057
},
{
"epoch": 0.2951940961180776,
"grad_norm": 0.6964675188064575,
"learning_rate": 4.881951431044241e-05,
"loss": 0.5368,
"num_input_tokens_seen": 6926136,
"step": 615,
"train_runtime": 1004.3343,
"train_tokens_per_second": 6896.246
},
{
"epoch": 0.2975940481190376,
"grad_norm": 0.5867466330528259,
"learning_rate": 4.8800365755336114e-05,
"loss": 0.5104,
"num_input_tokens_seen": 6982680,
"step": 620,
"train_runtime": 1012.6109,
"train_tokens_per_second": 6895.719
},
{
"epoch": 0.2999940001199976,
"grad_norm": 0.7193952202796936,
"learning_rate": 4.8781066960353264e-05,
"loss": 0.5729,
"num_input_tokens_seen": 7035152,
"step": 625,
"train_runtime": 1020.3607,
"train_tokens_per_second": 6894.77
},
{
"epoch": 0.3023939521209576,
"grad_norm": 0.6436483860015869,
"learning_rate": 4.876161804731756e-05,
"loss": 0.5777,
"num_input_tokens_seen": 7085976,
"step": 630,
"train_runtime": 1028.043,
"train_tokens_per_second": 6892.684
},
{
"epoch": 0.30479390412191754,
"grad_norm": 0.8330582976341248,
"learning_rate": 4.87420191390003e-05,
"loss": 0.5729,
"num_input_tokens_seen": 7139560,
"step": 635,
"train_runtime": 1035.6955,
"train_tokens_per_second": 6893.493
},
{
"epoch": 0.30719385612287753,
"grad_norm": 0.5311642289161682,
"learning_rate": 4.872227035911967e-05,
"loss": 0.5212,
"num_input_tokens_seen": 7194016,
"step": 640,
"train_runtime": 1043.4188,
"train_tokens_per_second": 6894.658
},
{
"epoch": 0.3095938081238375,
"grad_norm": 0.5079819560050964,
"learning_rate": 4.87023718323399e-05,
"loss": 0.5227,
"num_input_tokens_seen": 7249352,
"step": 645,
"train_runtime": 1051.5718,
"train_tokens_per_second": 6893.825
},
{
"epoch": 0.3119937601247975,
"grad_norm": 0.5671476721763611,
"learning_rate": 4.868232368427048e-05,
"loss": 0.5057,
"num_input_tokens_seen": 7312768,
"step": 650,
"train_runtime": 1060.8218,
"train_tokens_per_second": 6893.494
},
{
"epoch": 0.3143937121257575,
"grad_norm": 0.5429338216781616,
"learning_rate": 4.8662126041465414e-05,
"loss": 0.522,
"num_input_tokens_seen": 7371440,
"step": 655,
"train_runtime": 1068.911,
"train_tokens_per_second": 6896.215
},
{
"epoch": 0.31679366412671744,
"grad_norm": 0.5430482625961304,
"learning_rate": 4.864177903142237e-05,
"loss": 0.5712,
"num_input_tokens_seen": 7428856,
"step": 660,
"train_runtime": 1077.1698,
"train_tokens_per_second": 6896.643
},
{
"epoch": 0.31919361612767744,
"grad_norm": 0.5577422380447388,
"learning_rate": 4.862128278258191e-05,
"loss": 0.5763,
"num_input_tokens_seen": 7482928,
"step": 665,
"train_runtime": 1085.0793,
"train_tokens_per_second": 6896.204
},
{
"epoch": 0.32159356812863743,
"grad_norm": 0.8080245852470398,
"learning_rate": 4.8600637424326676e-05,
"loss": 0.5921,
"num_input_tokens_seen": 7537000,
"step": 670,
"train_runtime": 1092.9929,
"train_tokens_per_second": 6895.744
},
{
"epoch": 0.3239935201295974,
"grad_norm": 0.5444366931915283,
"learning_rate": 4.8579843086980536e-05,
"loss": 0.5164,
"num_input_tokens_seen": 7600512,
"step": 675,
"train_runtime": 1102.2798,
"train_tokens_per_second": 6895.266
},
{
"epoch": 0.3263934721305574,
"grad_norm": 0.7307661771774292,
"learning_rate": 4.855889990180781e-05,
"loss": 0.4963,
"num_input_tokens_seen": 7655032,
"step": 680,
"train_runtime": 1110.5484,
"train_tokens_per_second": 6893.02
},
{
"epoch": 0.32879342413151735,
"grad_norm": 0.5061231851577759,
"learning_rate": 4.853780800101241e-05,
"loss": 0.5042,
"num_input_tokens_seen": 7709432,
"step": 685,
"train_runtime": 1118.5898,
"train_tokens_per_second": 6892.099
},
{
"epoch": 0.33119337613247735,
"grad_norm": 0.5457553863525391,
"learning_rate": 4.851656751773702e-05,
"loss": 0.5505,
"num_input_tokens_seen": 7768248,
"step": 690,
"train_runtime": 1127.0452,
"train_tokens_per_second": 6892.579
},
{
"epoch": 0.33359332813343734,
"grad_norm": 0.581109881401062,
"learning_rate": 4.849517858606225e-05,
"loss": 0.5219,
"num_input_tokens_seen": 7821976,
"step": 695,
"train_runtime": 1135.0848,
"train_tokens_per_second": 6891.094
},
{
"epoch": 0.33599328013439733,
"grad_norm": 0.6451846361160278,
"learning_rate": 4.84736413410058e-05,
"loss": 0.541,
"num_input_tokens_seen": 7875264,
"step": 700,
"train_runtime": 1143.0269,
"train_tokens_per_second": 6889.833
},
{
"epoch": 0.33839323213535727,
"grad_norm": 0.48146116733551025,
"learning_rate": 4.8451955918521586e-05,
"loss": 0.5666,
"num_input_tokens_seen": 7929656,
"step": 705,
"train_runtime": 1150.8799,
"train_tokens_per_second": 6890.081
},
{
"epoch": 0.34079318413631726,
"grad_norm": 0.5709965825080872,
"learning_rate": 4.84301224554989e-05,
"loss": 0.5295,
"num_input_tokens_seen": 7980872,
"step": 710,
"train_runtime": 1158.4978,
"train_tokens_per_second": 6888.983
},
{
"epoch": 0.34319313613727725,
"grad_norm": 0.6052954196929932,
"learning_rate": 4.840814108976154e-05,
"loss": 0.5509,
"num_input_tokens_seen": 8037376,
"step": 715,
"train_runtime": 1166.5335,
"train_tokens_per_second": 6889.966
},
{
"epoch": 0.34559308813823725,
"grad_norm": 0.5755806565284729,
"learning_rate": 4.838601196006694e-05,
"loss": 0.54,
"num_input_tokens_seen": 8094024,
"step": 720,
"train_runtime": 1175.0556,
"train_tokens_per_second": 6888.205
},
{
"epoch": 0.34799304013919724,
"grad_norm": 0.5676959753036499,
"learning_rate": 4.8363735206105276e-05,
"loss": 0.5663,
"num_input_tokens_seen": 8152456,
"step": 725,
"train_runtime": 1183.2718,
"train_tokens_per_second": 6889.758
},
{
"epoch": 0.3503929921401572,
"grad_norm": 0.7371501922607422,
"learning_rate": 4.8341310968498656e-05,
"loss": 0.5171,
"num_input_tokens_seen": 8206424,
"step": 730,
"train_runtime": 1191.1851,
"train_tokens_per_second": 6889.294
},
{
"epoch": 0.35279294414111717,
"grad_norm": 0.6847190260887146,
"learning_rate": 4.831873938880012e-05,
"loss": 0.5407,
"num_input_tokens_seen": 8262160,
"step": 735,
"train_runtime": 1199.2457,
"train_tokens_per_second": 6889.464
},
{
"epoch": 0.35519289614207716,
"grad_norm": 0.5282928347587585,
"learning_rate": 4.829602060949282e-05,
"loss": 0.5729,
"num_input_tokens_seen": 8316480,
"step": 740,
"train_runtime": 1207.1347,
"train_tokens_per_second": 6889.438
},
{
"epoch": 0.35759284814303716,
"grad_norm": 0.39273539185523987,
"learning_rate": 4.827315477398914e-05,
"loss": 0.4796,
"num_input_tokens_seen": 8379024,
"step": 745,
"train_runtime": 1216.4818,
"train_tokens_per_second": 6887.916
},
{
"epoch": 0.3599928001439971,
"grad_norm": 0.442878395318985,
"learning_rate": 4.825014202662972e-05,
"loss": 0.5178,
"num_input_tokens_seen": 8436408,
"step": 750,
"train_runtime": 1224.6516,
"train_tokens_per_second": 6888.823
},
{
"epoch": 0.3623927521449571,
"grad_norm": 0.5023097395896912,
"learning_rate": 4.82269825126826e-05,
"loss": 0.5436,
"num_input_tokens_seen": 8494184,
"step": 755,
"train_runtime": 1233.1902,
"train_tokens_per_second": 6887.975
},
{
"epoch": 0.3647927041459171,
"grad_norm": 0.6507300138473511,
"learning_rate": 4.8203676378342263e-05,
"loss": 0.5761,
"num_input_tokens_seen": 8543600,
"step": 760,
"train_runtime": 1240.5356,
"train_tokens_per_second": 6887.025
},
{
"epoch": 0.36719265614687707,
"grad_norm": 0.6500417590141296,
"learning_rate": 4.818022377072876e-05,
"loss": 0.5519,
"num_input_tokens_seen": 8601672,
"step": 765,
"train_runtime": 1249.3942,
"train_tokens_per_second": 6884.674
},
{
"epoch": 0.36959260814783707,
"grad_norm": 0.720543384552002,
"learning_rate": 4.8156624837886744e-05,
"loss": 0.5488,
"num_input_tokens_seen": 8654824,
"step": 770,
"train_runtime": 1257.4098,
"train_tokens_per_second": 6883.058
},
{
"epoch": 0.371992560148797,
"grad_norm": 0.5728187561035156,
"learning_rate": 4.813287972878454e-05,
"loss": 0.5093,
"num_input_tokens_seen": 8709032,
"step": 775,
"train_runtime": 1265.5372,
"train_tokens_per_second": 6881.688
},
{
"epoch": 0.374392512149757,
"grad_norm": 0.6271533966064453,
"learning_rate": 4.810898859331322e-05,
"loss": 0.5451,
"num_input_tokens_seen": 8766264,
"step": 780,
"train_runtime": 1273.7019,
"train_tokens_per_second": 6882.508
},
{
"epoch": 0.376792464150717,
"grad_norm": 0.5907756686210632,
"learning_rate": 4.8084951582285634e-05,
"loss": 0.4965,
"num_input_tokens_seen": 8820344,
"step": 785,
"train_runtime": 1281.2863,
"train_tokens_per_second": 6883.976
},
{
"epoch": 0.379192416151677,
"grad_norm": 0.5380600094795227,
"learning_rate": 4.80607688474355e-05,
"loss": 0.5298,
"num_input_tokens_seen": 8881528,
"step": 790,
"train_runtime": 1289.7202,
"train_tokens_per_second": 6886.399
},
{
"epoch": 0.381592368152637,
"grad_norm": 0.6812204718589783,
"learning_rate": 4.803644054141639e-05,
"loss": 0.5278,
"num_input_tokens_seen": 8939712,
"step": 795,
"train_runtime": 1298.1298,
"train_tokens_per_second": 6886.609
},
{
"epoch": 0.3839923201535969,
"grad_norm": 0.8065762519836426,
"learning_rate": 4.8011966817800804e-05,
"loss": 0.548,
"num_input_tokens_seen": 8994888,
"step": 800,
"train_runtime": 1306.2424,
"train_tokens_per_second": 6886.078
},
{
"epoch": 0.3863922721545569,
"grad_norm": 0.7721138596534729,
"learning_rate": 4.79873478310792e-05,
"loss": 0.5446,
"num_input_tokens_seen": 9052200,
"step": 805,
"train_runtime": 1314.0422,
"train_tokens_per_second": 6888.82
},
{
"epoch": 0.3887922241555169,
"grad_norm": 0.7508792281150818,
"learning_rate": 4.796258373665899e-05,
"loss": 0.5531,
"num_input_tokens_seen": 9106936,
"step": 810,
"train_runtime": 1322.1708,
"train_tokens_per_second": 6887.867
},
{
"epoch": 0.3911921761564769,
"grad_norm": 0.7303242087364197,
"learning_rate": 4.793767469086361e-05,
"loss": 0.5786,
"num_input_tokens_seen": 9158400,
"step": 815,
"train_runtime": 1329.9099,
"train_tokens_per_second": 6886.482
},
{
"epoch": 0.3935921281574368,
"grad_norm": 0.5493381023406982,
"learning_rate": 4.791262085093147e-05,
"loss": 0.5285,
"num_input_tokens_seen": 9218552,
"step": 820,
"train_runtime": 1338.4057,
"train_tokens_per_second": 6887.711
},
{
"epoch": 0.3959920801583968,
"grad_norm": 0.5721644163131714,
"learning_rate": 4.788742237501499e-05,
"loss": 0.5481,
"num_input_tokens_seen": 9272768,
"step": 825,
"train_runtime": 1346.3952,
"train_tokens_per_second": 6887.107
},
{
"epoch": 0.3983920321593568,
"grad_norm": 0.5689188241958618,
"learning_rate": 4.786207942217965e-05,
"loss": 0.5457,
"num_input_tokens_seen": 9327048,
"step": 830,
"train_runtime": 1354.3004,
"train_tokens_per_second": 6886.986
},
{
"epoch": 0.4007919841603168,
"grad_norm": 0.48985663056373596,
"learning_rate": 4.783659215240289e-05,
"loss": 0.5067,
"num_input_tokens_seen": 9389344,
"step": 835,
"train_runtime": 1363.2987,
"train_tokens_per_second": 6887.224
},
{
"epoch": 0.4031919361612768,
"grad_norm": 0.7661736011505127,
"learning_rate": 4.78109607265732e-05,
"loss": 0.5576,
"num_input_tokens_seen": 9444656,
"step": 840,
"train_runtime": 1371.5402,
"train_tokens_per_second": 6886.168
},
{
"epoch": 0.40559188816223674,
"grad_norm": 0.6617030501365662,
"learning_rate": 4.778518530648899e-05,
"loss": 0.5566,
"num_input_tokens_seen": 9499464,
"step": 845,
"train_runtime": 1379.7517,
"train_tokens_per_second": 6884.908
},
{
"epoch": 0.40799184016319673,
"grad_norm": 0.6450020670890808,
"learning_rate": 4.77592660548577e-05,
"loss": 0.5486,
"num_input_tokens_seen": 9553432,
"step": 850,
"train_runtime": 1387.7923,
"train_tokens_per_second": 6883.906
},
{
"epoch": 0.4103917921641567,
"grad_norm": 0.6538447737693787,
"learning_rate": 4.7733203135294676e-05,
"loss": 0.5289,
"num_input_tokens_seen": 9608536,
"step": 855,
"train_runtime": 1396.0755,
"train_tokens_per_second": 6882.533
},
{
"epoch": 0.4127917441651167,
"grad_norm": 0.5988488793373108,
"learning_rate": 4.770699671232216e-05,
"loss": 0.5261,
"num_input_tokens_seen": 9661208,
"step": 860,
"train_runtime": 1403.8269,
"train_tokens_per_second": 6882.051
},
{
"epoch": 0.41519169616607665,
"grad_norm": 0.5807068347930908,
"learning_rate": 4.768064695136829e-05,
"loss": 0.5306,
"num_input_tokens_seen": 9721752,
"step": 865,
"train_runtime": 1412.2285,
"train_tokens_per_second": 6883.98
},
{
"epoch": 0.41759164816703664,
"grad_norm": 0.48121166229248047,
"learning_rate": 4.765415401876599e-05,
"loss": 0.5549,
"num_input_tokens_seen": 9779768,
"step": 870,
"train_runtime": 1420.4407,
"train_tokens_per_second": 6885.024
},
{
"epoch": 0.41999160016799664,
"grad_norm": 0.565889835357666,
"learning_rate": 4.7627518081751975e-05,
"loss": 0.5355,
"num_input_tokens_seen": 9835272,
"step": 875,
"train_runtime": 1428.7292,
"train_tokens_per_second": 6883.93
},
{
"epoch": 0.42239155216895663,
"grad_norm": 0.7845768928527832,
"learning_rate": 4.760073930846569e-05,
"loss": 0.5411,
"num_input_tokens_seen": 9890512,
"step": 880,
"train_runtime": 1436.5286,
"train_tokens_per_second": 6885.009
},
{
"epoch": 0.4247915041699166,
"grad_norm": 0.6052142381668091,
"learning_rate": 4.75738178679482e-05,
"loss": 0.5432,
"num_input_tokens_seen": 9944392,
"step": 885,
"train_runtime": 1444.2701,
"train_tokens_per_second": 6885.41
},
{
"epoch": 0.42719145617087656,
"grad_norm": 0.6109101176261902,
"learning_rate": 4.754675393014117e-05,
"loss": 0.4997,
"num_input_tokens_seen": 9999080,
"step": 890,
"train_runtime": 1452.4138,
"train_tokens_per_second": 6884.457
},
{
"epoch": 0.42959140817183655,
"grad_norm": 0.8205054998397827,
"learning_rate": 4.751954766588581e-05,
"loss": 0.5276,
"num_input_tokens_seen": 10053320,
"step": 895,
"train_runtime": 1460.9561,
"train_tokens_per_second": 6881.329
},
{
"epoch": 0.43199136017279655,
"grad_norm": 0.6081852316856384,
"learning_rate": 4.749219924692172e-05,
"loss": 0.4801,
"num_input_tokens_seen": 10112592,
"step": 900,
"train_runtime": 1469.5246,
"train_tokens_per_second": 6881.54
},
{
"epoch": 0.43439131217375654,
"grad_norm": 0.6444746851921082,
"learning_rate": 4.7464708845885877e-05,
"loss": 0.4976,
"num_input_tokens_seen": 10168072,
"step": 905,
"train_runtime": 1477.8554,
"train_tokens_per_second": 6880.289
},
{
"epoch": 0.43679126417471653,
"grad_norm": 0.591349184513092,
"learning_rate": 4.7437076636311514e-05,
"loss": 0.5343,
"num_input_tokens_seen": 10221648,
"step": 910,
"train_runtime": 1486.1153,
"train_tokens_per_second": 6878.099
},
{
"epoch": 0.43919121617567647,
"grad_norm": 0.6491187810897827,
"learning_rate": 4.7409302792627044e-05,
"loss": 0.4946,
"num_input_tokens_seen": 10284000,
"step": 915,
"train_runtime": 1494.8745,
"train_tokens_per_second": 6879.507
},
{
"epoch": 0.44159116817663646,
"grad_norm": 0.6963967084884644,
"learning_rate": 4.738138749015492e-05,
"loss": 0.5109,
"num_input_tokens_seen": 10340768,
"step": 920,
"train_runtime": 1502.9982,
"train_tokens_per_second": 6880.094
},
{
"epoch": 0.44399112017759645,
"grad_norm": 0.4319298267364502,
"learning_rate": 4.735333090511056e-05,
"loss": 0.5082,
"num_input_tokens_seen": 10400928,
"step": 925,
"train_runtime": 1511.7616,
"train_tokens_per_second": 6880.006
},
{
"epoch": 0.44639107217855645,
"grad_norm": 0.6248960494995117,
"learning_rate": 4.732513321460127e-05,
"loss": 0.5612,
"num_input_tokens_seen": 10456208,
"step": 930,
"train_runtime": 1519.9104,
"train_tokens_per_second": 6879.49
},
{
"epoch": 0.4487910241795164,
"grad_norm": 0.7751626372337341,
"learning_rate": 4.729679459662502e-05,
"loss": 0.5253,
"num_input_tokens_seen": 10513352,
"step": 935,
"train_runtime": 1528.6579,
"train_tokens_per_second": 6877.505
},
{
"epoch": 0.4511909761804764,
"grad_norm": 0.5862913131713867,
"learning_rate": 4.726831523006944e-05,
"loss": 0.5403,
"num_input_tokens_seen": 10568872,
"step": 940,
"train_runtime": 1537.6257,
"train_tokens_per_second": 6873.501
},
{
"epoch": 0.45359092818143637,
"grad_norm": 0.7188037633895874,
"learning_rate": 4.7239695294710586e-05,
"loss": 0.5332,
"num_input_tokens_seen": 10623984,
"step": 945,
"train_runtime": 1545.7364,
"train_tokens_per_second": 6873.089
},
{
"epoch": 0.45599088018239636,
"grad_norm": 0.7903031706809998,
"learning_rate": 4.7210934971211906e-05,
"loss": 0.572,
"num_input_tokens_seen": 10675064,
"step": 950,
"train_runtime": 1553.3218,
"train_tokens_per_second": 6872.41
},
{
"epoch": 0.45839083218335636,
"grad_norm": 0.5360180139541626,
"learning_rate": 4.718203444112301e-05,
"loss": 0.4812,
"num_input_tokens_seen": 10735624,
"step": 955,
"train_runtime": 1561.8181,
"train_tokens_per_second": 6873.799
},
{
"epoch": 0.4607907841843163,
"grad_norm": 0.6711071133613586,
"learning_rate": 4.7152993886878585e-05,
"loss": 0.4681,
"num_input_tokens_seen": 10790472,
"step": 960,
"train_runtime": 1569.6307,
"train_tokens_per_second": 6874.529
},
{
"epoch": 0.4631907361852763,
"grad_norm": 0.6165657043457031,
"learning_rate": 4.712381349179721e-05,
"loss": 0.508,
"num_input_tokens_seen": 10844896,
"step": 965,
"train_runtime": 1577.4118,
"train_tokens_per_second": 6875.12
},
{
"epoch": 0.4655906881862363,
"grad_norm": 0.6834767460823059,
"learning_rate": 4.709449344008021e-05,
"loss": 0.4988,
"num_input_tokens_seen": 10902552,
"step": 970,
"train_runtime": 1586.0811,
"train_tokens_per_second": 6873.893
},
{
"epoch": 0.46799064018719627,
"grad_norm": 0.7366524338722229,
"learning_rate": 4.706503391681049e-05,
"loss": 0.5755,
"num_input_tokens_seen": 10956224,
"step": 975,
"train_runtime": 1593.6535,
"train_tokens_per_second": 6874.91
},
{
"epoch": 0.47039059218815626,
"grad_norm": 0.5903698205947876,
"learning_rate": 4.7035435107951384e-05,
"loss": 0.5283,
"num_input_tokens_seen": 11011072,
"step": 980,
"train_runtime": 1601.6979,
"train_tokens_per_second": 6874.625
},
{
"epoch": 0.4727905441891162,
"grad_norm": 0.631288468837738,
"learning_rate": 4.700569720034545e-05,
"loss": 0.4954,
"num_input_tokens_seen": 11066344,
"step": 985,
"train_runtime": 1609.8085,
"train_tokens_per_second": 6874.323
},
{
"epoch": 0.4751904961900762,
"grad_norm": 0.5448499917984009,
"learning_rate": 4.697582038171332e-05,
"loss": 0.5431,
"num_input_tokens_seen": 11121472,
"step": 990,
"train_runtime": 1618.0718,
"train_tokens_per_second": 6873.287
},
{
"epoch": 0.4775904481910362,
"grad_norm": 0.5397956967353821,
"learning_rate": 4.694580484065248e-05,
"loss": 0.4836,
"num_input_tokens_seen": 11181736,
"step": 995,
"train_runtime": 1627.2301,
"train_tokens_per_second": 6871.638
},
{
"epoch": 0.4799904001919962,
"grad_norm": 0.7059435248374939,
"learning_rate": 4.6915650766636156e-05,
"loss": 0.4765,
"num_input_tokens_seen": 11241912,
"step": 1000,
"train_runtime": 1635.6606,
"train_tokens_per_second": 6873.01
},
{
"epoch": 0.4823903521929561,
"grad_norm": 0.5551899075508118,
"learning_rate": 4.6885358350011986e-05,
"loss": 0.523,
"num_input_tokens_seen": 11296568,
"step": 1005,
"train_runtime": 1644.0041,
"train_tokens_per_second": 6871.375
},
{
"epoch": 0.4847903041939161,
"grad_norm": 0.659951388835907,
"learning_rate": 4.6854927782000954e-05,
"loss": 0.4891,
"num_input_tokens_seen": 11351944,
"step": 1010,
"train_runtime": 1652.1239,
"train_tokens_per_second": 6871.121
},
{
"epoch": 0.4871902561948761,
"grad_norm": 0.6763627529144287,
"learning_rate": 4.6824359254696105e-05,
"loss": 0.502,
"num_input_tokens_seen": 11410584,
"step": 1015,
"train_runtime": 1661.21,
"train_tokens_per_second": 6868.839
},
{
"epoch": 0.4895902081958361,
"grad_norm": 0.49618440866470337,
"learning_rate": 4.6793652961061364e-05,
"loss": 0.5451,
"num_input_tokens_seen": 11465560,
"step": 1020,
"train_runtime": 1669.6454,
"train_tokens_per_second": 6867.063
},
{
"epoch": 0.4919901601967961,
"grad_norm": 0.6427881717681885,
"learning_rate": 4.676280909493028e-05,
"loss": 0.5277,
"num_input_tokens_seen": 11523960,
"step": 1025,
"train_runtime": 1678.2247,
"train_tokens_per_second": 6866.756
},
{
"epoch": 0.494390112197756,
"grad_norm": 0.7086818218231201,
"learning_rate": 4.673182785100485e-05,
"loss": 0.4885,
"num_input_tokens_seen": 11584904,
"step": 1030,
"train_runtime": 1687.3682,
"train_tokens_per_second": 6865.665
},
{
"epoch": 0.496790064198716,
"grad_norm": 0.5998096466064453,
"learning_rate": 4.6700709424854274e-05,
"loss": 0.5266,
"num_input_tokens_seen": 11642456,
"step": 1035,
"train_runtime": 1696.3396,
"train_tokens_per_second": 6863.281
},
{
"epoch": 0.499190016199676,
"grad_norm": 0.6782186627388,
"learning_rate": 4.66694540129137e-05,
"loss": 0.5813,
"num_input_tokens_seen": 11696912,
"step": 1040,
"train_runtime": 1704.0809,
"train_tokens_per_second": 6864.059
},
{
"epoch": 0.501589968200636,
"grad_norm": 0.541053056716919,
"learning_rate": 4.6638061812483005e-05,
"loss": 0.4875,
"num_input_tokens_seen": 11755104,
"step": 1045,
"train_runtime": 1712.4044,
"train_tokens_per_second": 6864.677
},
{
"epoch": 0.5039899202015959,
"grad_norm": 0.6517828106880188,
"learning_rate": 4.660653302172554e-05,
"loss": 0.5367,
"num_input_tokens_seen": 11810984,
"step": 1050,
"train_runtime": 1720.4999,
"train_tokens_per_second": 6864.856
},
{
"epoch": 0.5063898722025559,
"grad_norm": 0.6961039900779724,
"learning_rate": 4.6574867839666895e-05,
"loss": 0.5314,
"num_input_tokens_seen": 11862312,
"step": 1055,
"train_runtime": 1728.5831,
"train_tokens_per_second": 6862.448
},
{
"epoch": 0.5087898242035159,
"grad_norm": 0.7300373911857605,
"learning_rate": 4.654306646619361e-05,
"loss": 0.5496,
"num_input_tokens_seen": 11923072,
"step": 1060,
"train_runtime": 1737.4285,
"train_tokens_per_second": 6862.482
},
{
"epoch": 0.5111897762044759,
"grad_norm": 0.7295413017272949,
"learning_rate": 4.6511129102051954e-05,
"loss": 0.5205,
"num_input_tokens_seen": 11978568,
"step": 1065,
"train_runtime": 1745.4719,
"train_tokens_per_second": 6862.653
},
{
"epoch": 0.5135897282054359,
"grad_norm": 0.5520017743110657,
"learning_rate": 4.647905594884663e-05,
"loss": 0.4768,
"num_input_tokens_seen": 12035000,
"step": 1070,
"train_runtime": 1754.3574,
"train_tokens_per_second": 6860.062
},
{
"epoch": 0.5159896802063959,
"grad_norm": 0.5629371404647827,
"learning_rate": 4.6446847209039504e-05,
"loss": 0.5136,
"num_input_tokens_seen": 12096040,
"step": 1075,
"train_runtime": 1762.6751,
"train_tokens_per_second": 6862.32
},
{
"epoch": 0.5183896322073559,
"grad_norm": 0.750357449054718,
"learning_rate": 4.6414503085948334e-05,
"loss": 0.5022,
"num_input_tokens_seen": 12148448,
"step": 1080,
"train_runtime": 1770.3881,
"train_tokens_per_second": 6862.025
},
{
"epoch": 0.5207895842083158,
"grad_norm": 0.9546124339103699,
"learning_rate": 4.63820237837455e-05,
"loss": 0.5196,
"num_input_tokens_seen": 12207120,
"step": 1085,
"train_runtime": 1778.4216,
"train_tokens_per_second": 6864.019
},
{
"epoch": 0.5231895362092758,
"grad_norm": 0.6891536712646484,
"learning_rate": 4.634940950745668e-05,
"loss": 0.5566,
"num_input_tokens_seen": 12261136,
"step": 1090,
"train_runtime": 1786.6486,
"train_tokens_per_second": 6862.646
},
{
"epoch": 0.5255894882102358,
"grad_norm": 0.7175304889678955,
"learning_rate": 4.631666046295959e-05,
"loss": 0.5483,
"num_input_tokens_seen": 12313856,
"step": 1095,
"train_runtime": 1794.8084,
"train_tokens_per_second": 6860.819
},
{
"epoch": 0.5279894402111958,
"grad_norm": 0.7148723602294922,
"learning_rate": 4.628377685698268e-05,
"loss": 0.5072,
"num_input_tokens_seen": 12367984,
"step": 1100,
"train_runtime": 1802.927,
"train_tokens_per_second": 6859.947
},
{
"epoch": 0.5303893922121558,
"grad_norm": 0.6276180148124695,
"learning_rate": 4.6250758897103775e-05,
"loss": 0.5316,
"num_input_tokens_seen": 12422128,
"step": 1105,
"train_runtime": 1810.8688,
"train_tokens_per_second": 6859.761
},
{
"epoch": 0.5327893442131157,
"grad_norm": 0.5570586919784546,
"learning_rate": 4.621760679174887e-05,
"loss": 0.4781,
"num_input_tokens_seen": 12477576,
"step": 1110,
"train_runtime": 1818.8781,
"train_tokens_per_second": 6860.04
},
{
"epoch": 0.5351892962140757,
"grad_norm": 0.46177980303764343,
"learning_rate": 4.618432075019071e-05,
"loss": 0.5028,
"num_input_tokens_seen": 12536840,
"step": 1115,
"train_runtime": 1827.02,
"train_tokens_per_second": 6861.906
},
{
"epoch": 0.5375892482150357,
"grad_norm": 0.8723595142364502,
"learning_rate": 4.615090098254753e-05,
"loss": 0.5637,
"num_input_tokens_seen": 12592424,
"step": 1120,
"train_runtime": 1835.1133,
"train_tokens_per_second": 6861.933
},
{
"epoch": 0.5399892002159957,
"grad_norm": 0.5950156450271606,
"learning_rate": 4.6117347699781706e-05,
"loss": 0.5276,
"num_input_tokens_seen": 12650424,
"step": 1125,
"train_runtime": 1843.7548,
"train_tokens_per_second": 6861.229
},
{
"epoch": 0.5423891522169556,
"grad_norm": 0.7282635569572449,
"learning_rate": 4.608366111369843e-05,
"loss": 0.518,
"num_input_tokens_seen": 12706224,
"step": 1130,
"train_runtime": 1851.8221,
"train_tokens_per_second": 6861.471
},
{
"epoch": 0.5447891042179156,
"grad_norm": 0.5508381724357605,
"learning_rate": 4.6049841436944385e-05,
"loss": 0.4956,
"num_input_tokens_seen": 12767096,
"step": 1135,
"train_runtime": 1860.911,
"train_tokens_per_second": 6860.67
},
{
"epoch": 0.5471890562188756,
"grad_norm": 0.57481849193573,
"learning_rate": 4.6015888883006364e-05,
"loss": 0.539,
"num_input_tokens_seen": 12821808,
"step": 1140,
"train_runtime": 1868.849,
"train_tokens_per_second": 6860.805
},
{
"epoch": 0.5495890082198356,
"grad_norm": 0.4912041425704956,
"learning_rate": 4.598180366620996e-05,
"loss": 0.5163,
"num_input_tokens_seen": 12874928,
"step": 1145,
"train_runtime": 1876.9045,
"train_tokens_per_second": 6859.661
},
{
"epoch": 0.5519889602207956,
"grad_norm": 0.666242778301239,
"learning_rate": 4.594758600171821e-05,
"loss": 0.5662,
"num_input_tokens_seen": 12927848,
"step": 1150,
"train_runtime": 1884.5707,
"train_tokens_per_second": 6859.837
},
{
"epoch": 0.5543889122217556,
"grad_norm": 0.6598814129829407,
"learning_rate": 4.591323610553021e-05,
"loss": 0.493,
"num_input_tokens_seen": 12985640,
"step": 1155,
"train_runtime": 1892.6667,
"train_tokens_per_second": 6861.028
},
{
"epoch": 0.5567888642227156,
"grad_norm": 0.8162060379981995,
"learning_rate": 4.587875419447979e-05,
"loss": 0.5289,
"num_input_tokens_seen": 13041608,
"step": 1160,
"train_runtime": 1901.1294,
"train_tokens_per_second": 6859.927
},
{
"epoch": 0.5591888162236756,
"grad_norm": 0.7061068415641785,
"learning_rate": 4.5844140486234086e-05,
"loss": 0.4997,
"num_input_tokens_seen": 13094240,
"step": 1165,
"train_runtime": 1909.2382,
"train_tokens_per_second": 6858.358
},
{
"epoch": 0.5615887682246355,
"grad_norm": 0.5444318056106567,
"learning_rate": 4.580939519929226e-05,
"loss": 0.5155,
"num_input_tokens_seen": 13150544,
"step": 1170,
"train_runtime": 1917.2941,
"train_tokens_per_second": 6858.908
},
{
"epoch": 0.5639887202255955,
"grad_norm": 0.5705589652061462,
"learning_rate": 4.577451855298402e-05,
"loss": 0.4927,
"num_input_tokens_seen": 13211016,
"step": 1175,
"train_runtime": 1925.8239,
"train_tokens_per_second": 6859.93
},
{
"epoch": 0.5663886722265554,
"grad_norm": 0.6715133190155029,
"learning_rate": 4.5739510767468295e-05,
"loss": 0.5525,
"num_input_tokens_seen": 13269168,
"step": 1180,
"train_runtime": 1934.5386,
"train_tokens_per_second": 6859.087
},
{
"epoch": 0.5687886242275154,
"grad_norm": 0.5893720388412476,
"learning_rate": 4.570437206373183e-05,
"loss": 0.5094,
"num_input_tokens_seen": 13326336,
"step": 1185,
"train_runtime": 1942.8203,
"train_tokens_per_second": 6859.274
},
{
"epoch": 0.5711885762284754,
"grad_norm": 0.5553702116012573,
"learning_rate": 4.5669102663587795e-05,
"loss": 0.5036,
"num_input_tokens_seen": 13382784,
"step": 1190,
"train_runtime": 1950.7367,
"train_tokens_per_second": 6860.374
},
{
"epoch": 0.5735885282294354,
"grad_norm": 0.9842544198036194,
"learning_rate": 4.563370278967437e-05,
"loss": 0.523,
"num_input_tokens_seen": 13438016,
"step": 1195,
"train_runtime": 1958.7567,
"train_tokens_per_second": 6860.482
},
{
"epoch": 0.5759884802303954,
"grad_norm": 0.7406736612319946,
"learning_rate": 4.559817266545337e-05,
"loss": 0.562,
"num_input_tokens_seen": 13492904,
"step": 1200,
"train_runtime": 1966.6536,
"train_tokens_per_second": 6860.844
},
{
"epoch": 0.5783884322313554,
"grad_norm": 0.6010822057723999,
"learning_rate": 4.5562512515208816e-05,
"loss": 0.5257,
"num_input_tokens_seen": 13546992,
"step": 1205,
"train_runtime": 1974.2569,
"train_tokens_per_second": 6861.818
},
{
"epoch": 0.5807883842323154,
"grad_norm": 0.5682114362716675,
"learning_rate": 4.5526722564045486e-05,
"loss": 0.5234,
"num_input_tokens_seen": 13599704,
"step": 1210,
"train_runtime": 1982.0432,
"train_tokens_per_second": 6861.457
},
{
"epoch": 0.5831883362332754,
"grad_norm": 0.7476803064346313,
"learning_rate": 4.5490803037887556e-05,
"loss": 0.4522,
"num_input_tokens_seen": 13658840,
"step": 1215,
"train_runtime": 1990.3973,
"train_tokens_per_second": 6862.369
},
{
"epoch": 0.5855882882342354,
"grad_norm": 0.8684011697769165,
"learning_rate": 4.545475416347714e-05,
"loss": 0.504,
"num_input_tokens_seen": 13712920,
"step": 1220,
"train_runtime": 1998.5695,
"train_tokens_per_second": 6861.367
},
{
"epoch": 0.5879882402351952,
"grad_norm": 0.6915135383605957,
"learning_rate": 4.5418576168372864e-05,
"loss": 0.5473,
"num_input_tokens_seen": 13768056,
"step": 1225,
"train_runtime": 2006.2278,
"train_tokens_per_second": 6862.658
},
{
"epoch": 0.5903881922361552,
"grad_norm": 0.6309444308280945,
"learning_rate": 4.538226928094841e-05,
"loss": 0.5321,
"num_input_tokens_seen": 13826288,
"step": 1230,
"train_runtime": 2014.608,
"train_tokens_per_second": 6863.016
},
{
"epoch": 0.5927881442371152,
"grad_norm": 0.7776080965995789,
"learning_rate": 4.534583373039112e-05,
"loss": 0.5578,
"num_input_tokens_seen": 13880688,
"step": 1235,
"train_runtime": 2022.5528,
"train_tokens_per_second": 6862.955
},
{
"epoch": 0.5951880962380752,
"grad_norm": 0.5800984501838684,
"learning_rate": 4.530926974670052e-05,
"loss": 0.5097,
"num_input_tokens_seen": 13937072,
"step": 1240,
"train_runtime": 2030.7522,
"train_tokens_per_second": 6863.01
},
{
"epoch": 0.5975880482390352,
"grad_norm": 0.6254319548606873,
"learning_rate": 4.5272577560686834e-05,
"loss": 0.5038,
"num_input_tokens_seen": 13990528,
"step": 1245,
"train_runtime": 2038.6265,
"train_tokens_per_second": 6862.723
},
{
"epoch": 0.5999880002399952,
"grad_norm": 0.7174450755119324,
"learning_rate": 4.523575740396962e-05,
"loss": 0.5304,
"num_input_tokens_seen": 14044296,
"step": 1250,
"train_runtime": 2046.8343,
"train_tokens_per_second": 6861.472
},
{
"epoch": 0.6023879522409552,
"grad_norm": 0.7481257915496826,
"learning_rate": 4.5198809508976206e-05,
"loss": 0.4927,
"num_input_tokens_seen": 14102520,
"step": 1255,
"train_runtime": 2055.3394,
"train_tokens_per_second": 6861.407
},
{
"epoch": 0.6047879042419152,
"grad_norm": 0.909005343914032,
"learning_rate": 4.516173410894028e-05,
"loss": 0.5067,
"num_input_tokens_seen": 14153848,
"step": 1260,
"train_runtime": 2062.8941,
"train_tokens_per_second": 6861.161
},
{
"epoch": 0.6071878562428752,
"grad_norm": 0.674818754196167,
"learning_rate": 4.512453143790042e-05,
"loss": 0.528,
"num_input_tokens_seen": 14210416,
"step": 1265,
"train_runtime": 2071.062,
"train_tokens_per_second": 6861.415
},
{
"epoch": 0.6095878082438351,
"grad_norm": 0.7137752771377563,
"learning_rate": 4.508720173069859e-05,
"loss": 0.5395,
"num_input_tokens_seen": 14263360,
"step": 1270,
"train_runtime": 2079.097,
"train_tokens_per_second": 6860.363
},
{
"epoch": 0.6119877602447951,
"grad_norm": 0.5564314723014832,
"learning_rate": 4.5049745222978665e-05,
"loss": 0.522,
"num_input_tokens_seen": 14320200,
"step": 1275,
"train_runtime": 2087.1564,
"train_tokens_per_second": 6861.105
},
{
"epoch": 0.6143877122457551,
"grad_norm": 0.7505349516868591,
"learning_rate": 4.501216215118498e-05,
"loss": 0.5303,
"num_input_tokens_seen": 14376904,
"step": 1280,
"train_runtime": 2095.304,
"train_tokens_per_second": 6861.488
},
{
"epoch": 0.616787664246715,
"grad_norm": 0.6077600121498108,
"learning_rate": 4.497445275256076e-05,
"loss": 0.5027,
"num_input_tokens_seen": 14434888,
"step": 1285,
"train_runtime": 2103.8746,
"train_tokens_per_second": 6861.097
},
{
"epoch": 0.619187616247675,
"grad_norm": 0.6120113730430603,
"learning_rate": 4.4936617265146696e-05,
"loss": 0.5192,
"num_input_tokens_seen": 14489232,
"step": 1290,
"train_runtime": 2112.087,
"train_tokens_per_second": 6860.149
},
{
"epoch": 0.621587568248635,
"grad_norm": 0.7720391750335693,
"learning_rate": 4.489865592777941e-05,
"loss": 0.5137,
"num_input_tokens_seen": 14543200,
"step": 1295,
"train_runtime": 2119.9779,
"train_tokens_per_second": 6860.072
},
{
"epoch": 0.623987520249595,
"grad_norm": 0.8337739706039429,
"learning_rate": 4.486056898008996e-05,
"loss": 0.5647,
"num_input_tokens_seen": 14597160,
"step": 1300,
"train_runtime": 2127.8662,
"train_tokens_per_second": 6859.999
},
{
"epoch": 0.626387472250555,
"grad_norm": 0.6936734914779663,
"learning_rate": 4.48223566625023e-05,
"loss": 0.5372,
"num_input_tokens_seen": 14656120,
"step": 1305,
"train_runtime": 2136.0775,
"train_tokens_per_second": 6861.23
},
{
"epoch": 0.628787424251515,
"grad_norm": 0.42849820852279663,
"learning_rate": 4.47840192162318e-05,
"loss": 0.4987,
"num_input_tokens_seen": 14715168,
"step": 1310,
"train_runtime": 2144.4803,
"train_tokens_per_second": 6861.881
},
{
"epoch": 0.6311873762524749,
"grad_norm": 0.6073727607727051,
"learning_rate": 4.47455568832837e-05,
"loss": 0.5242,
"num_input_tokens_seen": 14771992,
"step": 1315,
"train_runtime": 2152.4662,
"train_tokens_per_second": 6862.822
},
{
"epoch": 0.6335873282534349,
"grad_norm": 0.81267911195755,
"learning_rate": 4.470696990645158e-05,
"loss": 0.5488,
"num_input_tokens_seen": 14827224,
"step": 1320,
"train_runtime": 2160.85,
"train_tokens_per_second": 6861.755
},
{
"epoch": 0.6359872802543949,
"grad_norm": 0.9082570672035217,
"learning_rate": 4.4668258529315855e-05,
"loss": 0.5578,
"num_input_tokens_seen": 14880216,
"step": 1325,
"train_runtime": 2168.5347,
"train_tokens_per_second": 6861.876
},
{
"epoch": 0.6383872322553549,
"grad_norm": 0.4958833158016205,
"learning_rate": 4.462942299624219e-05,
"loss": 0.4897,
"num_input_tokens_seen": 14938264,
"step": 1330,
"train_runtime": 2176.7759,
"train_tokens_per_second": 6862.564
},
{
"epoch": 0.6407871842563149,
"grad_norm": 0.5597286224365234,
"learning_rate": 4.459046355238e-05,
"loss": 0.5071,
"num_input_tokens_seen": 14996424,
"step": 1335,
"train_runtime": 2184.9625,
"train_tokens_per_second": 6863.47
},
{
"epoch": 0.6431871362572749,
"grad_norm": 0.5538758635520935,
"learning_rate": 4.455138044366088e-05,
"loss": 0.5117,
"num_input_tokens_seen": 15054880,
"step": 1340,
"train_runtime": 2193.2667,
"train_tokens_per_second": 6864.136
},
{
"epoch": 0.6455870882582349,
"grad_norm": 0.6640130877494812,
"learning_rate": 4.4512173916797085e-05,
"loss": 0.4721,
"num_input_tokens_seen": 15117888,
"step": 1345,
"train_runtime": 2202.1591,
"train_tokens_per_second": 6865.03
},
{
"epoch": 0.6479870402591948,
"grad_norm": 0.8442539572715759,
"learning_rate": 4.447284421927991e-05,
"loss": 0.554,
"num_input_tokens_seen": 15175016,
"step": 1350,
"train_runtime": 2211.0135,
"train_tokens_per_second": 6863.376
},
{
"epoch": 0.6503869922601548,
"grad_norm": 0.7367165684700012,
"learning_rate": 4.443339159937818e-05,
"loss": 0.5125,
"num_input_tokens_seen": 15230944,
"step": 1355,
"train_runtime": 2219.076,
"train_tokens_per_second": 6863.642
},
{
"epoch": 0.6527869442611148,
"grad_norm": 0.6845333576202393,
"learning_rate": 4.439381630613668e-05,
"loss": 0.5286,
"num_input_tokens_seen": 15287896,
"step": 1360,
"train_runtime": 2227.4192,
"train_tokens_per_second": 6863.502
},
{
"epoch": 0.6551868962620747,
"grad_norm": 0.6416659355163574,
"learning_rate": 4.435411858937456e-05,
"loss": 0.6131,
"num_input_tokens_seen": 15342584,
"step": 1365,
"train_runtime": 2235.1031,
"train_tokens_per_second": 6864.374
},
{
"epoch": 0.6575868482630347,
"grad_norm": 0.5809879302978516,
"learning_rate": 4.431429869968378e-05,
"loss": 0.5062,
"num_input_tokens_seen": 15404096,
"step": 1370,
"train_runtime": 2243.5171,
"train_tokens_per_second": 6866.048
},
{
"epoch": 0.6599868002639947,
"grad_norm": 0.6339114308357239,
"learning_rate": 4.427435688842748e-05,
"loss": 0.4943,
"num_input_tokens_seen": 15462616,
"step": 1375,
"train_runtime": 2251.8474,
"train_tokens_per_second": 6866.636
},
{
"epoch": 0.6623867522649547,
"grad_norm": 0.4654648005962372,
"learning_rate": 4.423429340773847e-05,
"loss": 0.5096,
"num_input_tokens_seen": 15519912,
"step": 1380,
"train_runtime": 2260.1318,
"train_tokens_per_second": 6866.817
},
{
"epoch": 0.6647867042659147,
"grad_norm": 0.6752036809921265,
"learning_rate": 4.41941085105176e-05,
"loss": 0.5394,
"num_input_tokens_seen": 15576136,
"step": 1385,
"train_runtime": 2268.2935,
"train_tokens_per_second": 6866.896
},
{
"epoch": 0.6671866562668747,
"grad_norm": 0.5208489894866943,
"learning_rate": 4.415380245043213e-05,
"loss": 0.4537,
"num_input_tokens_seen": 15633480,
"step": 1390,
"train_runtime": 2276.6508,
"train_tokens_per_second": 6866.877
},
{
"epoch": 0.6695866082678347,
"grad_norm": 0.6454225778579712,
"learning_rate": 4.4113375481914186e-05,
"loss": 0.5155,
"num_input_tokens_seen": 15688200,
"step": 1395,
"train_runtime": 2284.8437,
"train_tokens_per_second": 6866.203
},
{
"epoch": 0.6719865602687947,
"grad_norm": 0.5845027565956116,
"learning_rate": 4.407282786015913e-05,
"loss": 0.5255,
"num_input_tokens_seen": 15742392,
"step": 1400,
"train_runtime": 2292.7215,
"train_tokens_per_second": 6866.247
},
{
"epoch": 0.6743865122697547,
"grad_norm": 0.9591690301895142,
"learning_rate": 4.403215984112392e-05,
"loss": 0.5122,
"num_input_tokens_seen": 15799472,
"step": 1405,
"train_runtime": 2301.0926,
"train_tokens_per_second": 6866.074
},
{
"epoch": 0.6767864642707145,
"grad_norm": 0.6333798766136169,
"learning_rate": 4.3991371681525556e-05,
"loss": 0.511,
"num_input_tokens_seen": 15858960,
"step": 1410,
"train_runtime": 2309.9976,
"train_tokens_per_second": 6865.358
},
{
"epoch": 0.6791864162716745,
"grad_norm": 0.5859664082527161,
"learning_rate": 4.395046363883941e-05,
"loss": 0.5375,
"num_input_tokens_seen": 15915472,
"step": 1415,
"train_runtime": 2317.9598,
"train_tokens_per_second": 6866.155
},
{
"epoch": 0.6815863682726345,
"grad_norm": 0.6732012629508972,
"learning_rate": 4.390943597129761e-05,
"loss": 0.5682,
"num_input_tokens_seen": 15970752,
"step": 1420,
"train_runtime": 2325.6345,
"train_tokens_per_second": 6867.267
},
{
"epoch": 0.6839863202735945,
"grad_norm": 0.7597581148147583,
"learning_rate": 4.3868288937887445e-05,
"loss": 0.5164,
"num_input_tokens_seen": 16025456,
"step": 1425,
"train_runtime": 2333.8264,
"train_tokens_per_second": 6866.601
},
{
"epoch": 0.6863862722745545,
"grad_norm": 0.7212057113647461,
"learning_rate": 4.382702279834965e-05,
"loss": 0.5524,
"num_input_tokens_seen": 16075744,
"step": 1430,
"train_runtime": 2341.4051,
"train_tokens_per_second": 6865.853
},
{
"epoch": 0.6887862242755145,
"grad_norm": 0.58528733253479,
"learning_rate": 4.378563781317687e-05,
"loss": 0.497,
"num_input_tokens_seen": 16137672,
"step": 1435,
"train_runtime": 2350.3848,
"train_tokens_per_second": 6865.97
},
{
"epoch": 0.6911861762764745,
"grad_norm": 0.570091962814331,
"learning_rate": 4.374413424361195e-05,
"loss": 0.4888,
"num_input_tokens_seen": 16199088,
"step": 1440,
"train_runtime": 2358.886,
"train_tokens_per_second": 6867.262
},
{
"epoch": 0.6935861282774345,
"grad_norm": 0.768666684627533,
"learning_rate": 4.370251235164625e-05,
"loss": 0.5343,
"num_input_tokens_seen": 16253792,
"step": 1445,
"train_runtime": 2367.1689,
"train_tokens_per_second": 6866.342
},
{
"epoch": 0.6959860802783945,
"grad_norm": 0.6287879347801208,
"learning_rate": 4.366077240001813e-05,
"loss": 0.4848,
"num_input_tokens_seen": 16316608,
"step": 1450,
"train_runtime": 2376.0866,
"train_tokens_per_second": 6867.009
},
{
"epoch": 0.6983860322793544,
"grad_norm": 0.74793541431427,
"learning_rate": 4.361891465221112e-05,
"loss": 0.4847,
"num_input_tokens_seen": 16375648,
"step": 1455,
"train_runtime": 2384.4535,
"train_tokens_per_second": 6867.673
},
{
"epoch": 0.7007859842803144,
"grad_norm": 0.6209436655044556,
"learning_rate": 4.3576939372452394e-05,
"loss": 0.5295,
"num_input_tokens_seen": 16429360,
"step": 1460,
"train_runtime": 2392.6099,
"train_tokens_per_second": 6866.711
},
{
"epoch": 0.7031859362812743,
"grad_norm": 0.7456108331680298,
"learning_rate": 4.353484682571101e-05,
"loss": 0.5144,
"num_input_tokens_seen": 16480088,
"step": 1465,
"train_runtime": 2400.4701,
"train_tokens_per_second": 6865.359
},
{
"epoch": 0.7055858882822343,
"grad_norm": 0.573098361492157,
"learning_rate": 4.349263727769629e-05,
"loss": 0.4636,
"num_input_tokens_seen": 16538968,
"step": 1470,
"train_runtime": 2408.9134,
"train_tokens_per_second": 6865.738
},
{
"epoch": 0.7079858402831943,
"grad_norm": 0.6599897146224976,
"learning_rate": 4.3450310994856135e-05,
"loss": 0.5415,
"num_input_tokens_seen": 16595104,
"step": 1475,
"train_runtime": 2417.2231,
"train_tokens_per_second": 6865.359
},
{
"epoch": 0.7103857922841543,
"grad_norm": 0.9016920328140259,
"learning_rate": 4.3407868244375315e-05,
"loss": 0.5367,
"num_input_tokens_seen": 16650488,
"step": 1480,
"train_runtime": 2425.2913,
"train_tokens_per_second": 6865.356
},
{
"epoch": 0.7127857442851143,
"grad_norm": 0.7661956548690796,
"learning_rate": 4.3365309294173825e-05,
"loss": 0.4729,
"num_input_tokens_seen": 16701384,
"step": 1485,
"train_runtime": 2432.991,
"train_tokens_per_second": 6864.548
},
{
"epoch": 0.7151856962860743,
"grad_norm": 0.8703396916389465,
"learning_rate": 4.332263441290515e-05,
"loss": 0.5373,
"num_input_tokens_seen": 16754152,
"step": 1490,
"train_runtime": 2440.463,
"train_tokens_per_second": 6865.153
},
{
"epoch": 0.7175856482870343,
"grad_norm": 0.633375883102417,
"learning_rate": 4.3279843869954604e-05,
"loss": 0.5037,
"num_input_tokens_seen": 16809056,
"step": 1495,
"train_runtime": 2448.2645,
"train_tokens_per_second": 6865.703
},
{
"epoch": 0.7199856002879942,
"grad_norm": 0.7101417779922485,
"learning_rate": 4.3236937935437614e-05,
"loss": 0.5324,
"num_input_tokens_seen": 16859504,
"step": 1500,
"train_runtime": 2455.6516,
"train_tokens_per_second": 6865.593
},
{
"epoch": 0.7223855522889542,
"grad_norm": 0.6423754692077637,
"learning_rate": 4.3193916880198004e-05,
"loss": 0.5109,
"num_input_tokens_seen": 16919952,
"step": 1505,
"train_runtime": 2464.4089,
"train_tokens_per_second": 6865.724
},
{
"epoch": 0.7247855042899142,
"grad_norm": 0.7076619863510132,
"learning_rate": 4.3150780975806315e-05,
"loss": 0.5425,
"num_input_tokens_seen": 16976592,
"step": 1510,
"train_runtime": 2472.6158,
"train_tokens_per_second": 6865.843
},
{
"epoch": 0.7271854562908742,
"grad_norm": 0.5288546085357666,
"learning_rate": 4.310753049455806e-05,
"loss": 0.515,
"num_input_tokens_seen": 17034816,
"step": 1515,
"train_runtime": 2480.9341,
"train_tokens_per_second": 6866.291
},
{
"epoch": 0.7295854082918342,
"grad_norm": 0.6262106895446777,
"learning_rate": 4.3064165709472036e-05,
"loss": 0.5271,
"num_input_tokens_seen": 17088560,
"step": 1520,
"train_runtime": 2488.3235,
"train_tokens_per_second": 6867.499
},
{
"epoch": 0.7319853602927942,
"grad_norm": 0.5250151753425598,
"learning_rate": 4.3020686894288564e-05,
"loss": 0.5055,
"num_input_tokens_seen": 17144640,
"step": 1525,
"train_runtime": 2496.1311,
"train_tokens_per_second": 6868.485
},
{
"epoch": 0.7343853122937541,
"grad_norm": 0.7805795669555664,
"learning_rate": 4.2977094323467784e-05,
"loss": 0.48,
"num_input_tokens_seen": 17200416,
"step": 1530,
"train_runtime": 2504.3678,
"train_tokens_per_second": 6868.167
},
{
"epoch": 0.7367852642947141,
"grad_norm": 0.7616066336631775,
"learning_rate": 4.293338827218794e-05,
"loss": 0.4972,
"num_input_tokens_seen": 17256344,
"step": 1535,
"train_runtime": 2512.462,
"train_tokens_per_second": 6868.3
},
{
"epoch": 0.7391852162956741,
"grad_norm": 0.7682455778121948,
"learning_rate": 4.288956901634359e-05,
"loss": 0.4691,
"num_input_tokens_seen": 17314072,
"step": 1540,
"train_runtime": 2520.9232,
"train_tokens_per_second": 6868.147
},
{
"epoch": 0.741585168296634,
"grad_norm": 0.7621558308601379,
"learning_rate": 4.2845636832543914e-05,
"loss": 0.4942,
"num_input_tokens_seen": 17373728,
"step": 1545,
"train_runtime": 2529.3504,
"train_tokens_per_second": 6868.85
},
{
"epoch": 0.743985120297594,
"grad_norm": 0.6085621118545532,
"learning_rate": 4.2801591998110946e-05,
"loss": 0.5119,
"num_input_tokens_seen": 17425920,
"step": 1550,
"train_runtime": 2536.8035,
"train_tokens_per_second": 6869.243
},
{
"epoch": 0.746385072298554,
"grad_norm": 0.6101738214492798,
"learning_rate": 4.275743479107785e-05,
"loss": 0.5201,
"num_input_tokens_seen": 17480304,
"step": 1555,
"train_runtime": 2544.9492,
"train_tokens_per_second": 6868.626
},
{
"epoch": 0.748785024299514,
"grad_norm": 0.6207472085952759,
"learning_rate": 4.271316549018708e-05,
"loss": 0.517,
"num_input_tokens_seen": 17539776,
"step": 1560,
"train_runtime": 2553.0822,
"train_tokens_per_second": 6870.04
},
{
"epoch": 0.751184976300474,
"grad_norm": 0.688941478729248,
"learning_rate": 4.2668784374888756e-05,
"loss": 0.4894,
"num_input_tokens_seen": 17595928,
"step": 1565,
"train_runtime": 2561.6413,
"train_tokens_per_second": 6869.005
},
{
"epoch": 0.753584928301434,
"grad_norm": 0.9783554673194885,
"learning_rate": 4.262429172533878e-05,
"loss": 0.5213,
"num_input_tokens_seen": 17651664,
"step": 1570,
"train_runtime": 2569.4494,
"train_tokens_per_second": 6869.824
},
{
"epoch": 0.755984880302394,
"grad_norm": 0.9513911604881287,
"learning_rate": 4.257968782239714e-05,
"loss": 0.506,
"num_input_tokens_seen": 17703960,
"step": 1575,
"train_runtime": 2576.9625,
"train_tokens_per_second": 6870.088
},
{
"epoch": 0.758384832303354,
"grad_norm": 0.7099276185035706,
"learning_rate": 4.2534972947626094e-05,
"loss": 0.5073,
"num_input_tokens_seen": 17761448,
"step": 1580,
"train_runtime": 2585.427,
"train_tokens_per_second": 6869.832
},
{
"epoch": 0.760784784304314,
"grad_norm": 0.5648279786109924,
"learning_rate": 4.249014738328842e-05,
"loss": 0.5265,
"num_input_tokens_seen": 17817984,
"step": 1585,
"train_runtime": 2593.1431,
"train_tokens_per_second": 6871.192
},
{
"epoch": 0.763184736305274,
"grad_norm": 0.6818917989730835,
"learning_rate": 4.2445211412345615e-05,
"loss": 0.5244,
"num_input_tokens_seen": 17874768,
"step": 1590,
"train_runtime": 2601.224,
"train_tokens_per_second": 6871.676
},
{
"epoch": 0.7655846883062338,
"grad_norm": 0.6163448691368103,
"learning_rate": 4.240016531845612e-05,
"loss": 0.5406,
"num_input_tokens_seen": 17931864,
"step": 1595,
"train_runtime": 2609.5192,
"train_tokens_per_second": 6871.712
},
{
"epoch": 0.7679846403071938,
"grad_norm": 0.6879476308822632,
"learning_rate": 4.235500938597354e-05,
"loss": 0.4871,
"num_input_tokens_seen": 17985744,
"step": 1600,
"train_runtime": 2617.2291,
"train_tokens_per_second": 6872.056
},
{
"epoch": 0.7703845923081538,
"grad_norm": 0.5437011122703552,
"learning_rate": 4.230974389994483e-05,
"loss": 0.5015,
"num_input_tokens_seen": 18044152,
"step": 1605,
"train_runtime": 2625.4686,
"train_tokens_per_second": 6872.736
},
{
"epoch": 0.7727845443091138,
"grad_norm": 0.5755176544189453,
"learning_rate": 4.226436914610849e-05,
"loss": 0.541,
"num_input_tokens_seen": 18100976,
"step": 1610,
"train_runtime": 2633.5328,
"train_tokens_per_second": 6873.268
},
{
"epoch": 0.7751844963100738,
"grad_norm": 0.6550777554512024,
"learning_rate": 4.2218885410892785e-05,
"loss": 0.5314,
"num_input_tokens_seen": 18156240,
"step": 1615,
"train_runtime": 2641.1036,
"train_tokens_per_second": 6874.49
},
{
"epoch": 0.7775844483110338,
"grad_norm": 0.6372175216674805,
"learning_rate": 4.2173292981413914e-05,
"loss": 0.4875,
"num_input_tokens_seen": 18216472,
"step": 1620,
"train_runtime": 2649.6605,
"train_tokens_per_second": 6875.021
},
{
"epoch": 0.7799844003119938,
"grad_norm": 0.5091462731361389,
"learning_rate": 4.212759214547424e-05,
"loss": 0.4954,
"num_input_tokens_seen": 18271168,
"step": 1625,
"train_runtime": 2657.4608,
"train_tokens_per_second": 6875.423
},
{
"epoch": 0.7823843523129538,
"grad_norm": 0.6974900960922241,
"learning_rate": 4.2081783191560405e-05,
"loss": 0.4939,
"num_input_tokens_seen": 18326128,
"step": 1630,
"train_runtime": 2665.3267,
"train_tokens_per_second": 6875.753
},
{
"epoch": 0.7847843043139138,
"grad_norm": 0.5476020574569702,
"learning_rate": 4.203586640884156e-05,
"loss": 0.4995,
"num_input_tokens_seen": 18385280,
"step": 1635,
"train_runtime": 2673.6877,
"train_tokens_per_second": 6876.375
},
{
"epoch": 0.7871842563148737,
"grad_norm": 0.5772519111633301,
"learning_rate": 4.1989842087167534e-05,
"loss": 0.5198,
"num_input_tokens_seen": 18444000,
"step": 1640,
"train_runtime": 2682.4357,
"train_tokens_per_second": 6875.84
},
{
"epoch": 0.7895842083158336,
"grad_norm": 0.6971266269683838,
"learning_rate": 4.1943710517066984e-05,
"loss": 0.4696,
"num_input_tokens_seen": 18500344,
"step": 1645,
"train_runtime": 2690.5876,
"train_tokens_per_second": 6875.949
},
{
"epoch": 0.7919841603167936,
"grad_norm": 0.7783945798873901,
"learning_rate": 4.1897471989745575e-05,
"loss": 0.4777,
"num_input_tokens_seen": 18553136,
"step": 1650,
"train_runtime": 2698.2872,
"train_tokens_per_second": 6875.894
},
{
"epoch": 0.7943841123177536,
"grad_norm": 0.7614520192146301,
"learning_rate": 4.185112679708415e-05,
"loss": 0.525,
"num_input_tokens_seen": 18610264,
"step": 1655,
"train_runtime": 2706.4236,
"train_tokens_per_second": 6876.331
},
{
"epoch": 0.7967840643187136,
"grad_norm": 0.5857712626457214,
"learning_rate": 4.180467523163686e-05,
"loss": 0.4906,
"num_input_tokens_seen": 18670624,
"step": 1660,
"train_runtime": 2714.993,
"train_tokens_per_second": 6876.859
},
{
"epoch": 0.7991840163196736,
"grad_norm": 0.5816935300827026,
"learning_rate": 4.175811758662935e-05,
"loss": 0.4851,
"num_input_tokens_seen": 18727824,
"step": 1665,
"train_runtime": 2723.4951,
"train_tokens_per_second": 6876.393
},
{
"epoch": 0.8015839683206336,
"grad_norm": 0.5751060843467712,
"learning_rate": 4.1711454155956895e-05,
"loss": 0.4694,
"num_input_tokens_seen": 18785440,
"step": 1670,
"train_runtime": 2731.305,
"train_tokens_per_second": 6877.826
},
{
"epoch": 0.8039839203215936,
"grad_norm": 0.8796506524085999,
"learning_rate": 4.166468523418251e-05,
"loss": 0.5254,
"num_input_tokens_seen": 18839288,
"step": 1675,
"train_runtime": 2739.4392,
"train_tokens_per_second": 6877.06
},
{
"epoch": 0.8063838723225536,
"grad_norm": 0.6676029562950134,
"learning_rate": 4.1617811116535176e-05,
"loss": 0.5521,
"num_input_tokens_seen": 18893696,
"step": 1680,
"train_runtime": 2747.1069,
"train_tokens_per_second": 6877.67
},
{
"epoch": 0.8087838243235135,
"grad_norm": 0.8193256258964539,
"learning_rate": 4.1570832098907874e-05,
"loss": 0.5444,
"num_input_tokens_seen": 18946504,
"step": 1685,
"train_runtime": 2754.72,
"train_tokens_per_second": 6877.833
},
{
"epoch": 0.8111837763244735,
"grad_norm": 0.5464473962783813,
"learning_rate": 4.152374847785579e-05,
"loss": 0.5321,
"num_input_tokens_seen": 19003664,
"step": 1690,
"train_runtime": 2763.0844,
"train_tokens_per_second": 6877.699
},
{
"epoch": 0.8135837283254335,
"grad_norm": 0.8191189169883728,
"learning_rate": 4.1476560550594414e-05,
"loss": 0.4826,
"num_input_tokens_seen": 19056544,
"step": 1695,
"train_runtime": 2770.9361,
"train_tokens_per_second": 6877.295
},
{
"epoch": 0.8159836803263935,
"grad_norm": 0.745058000087738,
"learning_rate": 4.142926861499768e-05,
"loss": 0.5543,
"num_input_tokens_seen": 19107344,
"step": 1700,
"train_runtime": 2778.5593,
"train_tokens_per_second": 6876.709
},
{
"epoch": 0.8183836323273534,
"grad_norm": 0.6147037744522095,
"learning_rate": 4.138187296959606e-05,
"loss": 0.505,
"num_input_tokens_seen": 19162000,
"step": 1705,
"train_runtime": 2786.3906,
"train_tokens_per_second": 6876.997
},
{
"epoch": 0.8207835843283134,
"grad_norm": 0.687018632888794,
"learning_rate": 4.13343739135747e-05,
"loss": 0.522,
"num_input_tokens_seen": 19217512,
"step": 1710,
"train_runtime": 2794.2498,
"train_tokens_per_second": 6877.521
},
{
"epoch": 0.8231835363292734,
"grad_norm": 0.6172505617141724,
"learning_rate": 4.128677174677153e-05,
"loss": 0.5411,
"num_input_tokens_seen": 19276384,
"step": 1715,
"train_runtime": 2802.4832,
"train_tokens_per_second": 6878.323
},
{
"epoch": 0.8255834883302334,
"grad_norm": 0.735072135925293,
"learning_rate": 4.123906676967536e-05,
"loss": 0.513,
"num_input_tokens_seen": 19328432,
"step": 1720,
"train_runtime": 2810.1311,
"train_tokens_per_second": 6878.125
},
{
"epoch": 0.8279834403311934,
"grad_norm": 0.9113159775733948,
"learning_rate": 4.1191259283424e-05,
"loss": 0.5244,
"num_input_tokens_seen": 19384016,
"step": 1725,
"train_runtime": 2818.2045,
"train_tokens_per_second": 6878.144
},
{
"epoch": 0.8303833923321533,
"grad_norm": 0.8989443778991699,
"learning_rate": 4.1143349589802326e-05,
"loss": 0.5471,
"num_input_tokens_seen": 19442016,
"step": 1730,
"train_runtime": 2826.3519,
"train_tokens_per_second": 6878.838
},
{
"epoch": 0.8327833443331133,
"grad_norm": 0.572564423084259,
"learning_rate": 4.1095337991240436e-05,
"loss": 0.5352,
"num_input_tokens_seen": 19496880,
"step": 1735,
"train_runtime": 2834.1751,
"train_tokens_per_second": 6879.208
},
{
"epoch": 0.8351832963340733,
"grad_norm": 0.4649478793144226,
"learning_rate": 4.104722479081167e-05,
"loss": 0.4709,
"num_input_tokens_seen": 19555656,
"step": 1740,
"train_runtime": 2842.1514,
"train_tokens_per_second": 6880.582
},
{
"epoch": 0.8375832483350333,
"grad_norm": 0.6450087428092957,
"learning_rate": 4.099901029223075e-05,
"loss": 0.5104,
"num_input_tokens_seen": 19610352,
"step": 1745,
"train_runtime": 2849.9024,
"train_tokens_per_second": 6881.061
},
{
"epoch": 0.8399832003359933,
"grad_norm": 0.7608988881111145,
"learning_rate": 4.095069479985183e-05,
"loss": 0.5151,
"num_input_tokens_seen": 19666656,
"step": 1750,
"train_runtime": 2858.2857,
"train_tokens_per_second": 6880.577
},
{
"epoch": 0.8423831523369533,
"grad_norm": 0.5766634345054626,
"learning_rate": 4.090227861866659e-05,
"loss": 0.5355,
"num_input_tokens_seen": 19723528,
"step": 1755,
"train_runtime": 2866.3853,
"train_tokens_per_second": 6880.976
},
{
"epoch": 0.8447831043379133,
"grad_norm": 0.8256959915161133,
"learning_rate": 4.085376205430233e-05,
"loss": 0.5475,
"num_input_tokens_seen": 19775232,
"step": 1760,
"train_runtime": 2873.9931,
"train_tokens_per_second": 6880.751
},
{
"epoch": 0.8471830563388733,
"grad_norm": 0.6020644903182983,
"learning_rate": 4.080514541301998e-05,
"loss": 0.5043,
"num_input_tokens_seen": 19832592,
"step": 1765,
"train_runtime": 2881.8352,
"train_tokens_per_second": 6881.931
},
{
"epoch": 0.8495830083398332,
"grad_norm": 0.6027383804321289,
"learning_rate": 4.075642900171223e-05,
"loss": 0.5501,
"num_input_tokens_seen": 19886104,
"step": 1770,
"train_runtime": 2889.3788,
"train_tokens_per_second": 6882.484
},
{
"epoch": 0.8519829603407932,
"grad_norm": 0.7463006377220154,
"learning_rate": 4.070761312790157e-05,
"loss": 0.5666,
"num_input_tokens_seen": 19944808,
"step": 1775,
"train_runtime": 2897.8024,
"train_tokens_per_second": 6882.736
},
{
"epoch": 0.8543829123417531,
"grad_norm": 0.5846840143203735,
"learning_rate": 4.065869809973833e-05,
"loss": 0.5026,
"num_input_tokens_seen": 20000048,
"step": 1780,
"train_runtime": 2905.6359,
"train_tokens_per_second": 6883.191
},
{
"epoch": 0.8567828643427131,
"grad_norm": 0.6461730599403381,
"learning_rate": 4.060968422599879e-05,
"loss": 0.4991,
"num_input_tokens_seen": 20054800,
"step": 1785,
"train_runtime": 2913.7209,
"train_tokens_per_second": 6882.883
},
{
"epoch": 0.8591828163436731,
"grad_norm": 0.7940958142280579,
"learning_rate": 4.0560571816083156e-05,
"loss": 0.5496,
"num_input_tokens_seen": 20111120,
"step": 1790,
"train_runtime": 2921.8875,
"train_tokens_per_second": 6882.921
},
{
"epoch": 0.8615827683446331,
"grad_norm": 0.6765144467353821,
"learning_rate": 4.051136118001364e-05,
"loss": 0.4827,
"num_input_tokens_seen": 20165552,
"step": 1795,
"train_runtime": 2929.7258,
"train_tokens_per_second": 6883.085
},
{
"epoch": 0.8639827203455931,
"grad_norm": 0.9223127365112305,
"learning_rate": 4.046205262843254e-05,
"loss": 0.4949,
"num_input_tokens_seen": 20221072,
"step": 1800,
"train_runtime": 2938.3425,
"train_tokens_per_second": 6881.796
},
{
"epoch": 0.8663826723465531,
"grad_norm": 0.5317054390907288,
"learning_rate": 4.041264647260022e-05,
"loss": 0.4844,
"num_input_tokens_seen": 20277640,
"step": 1805,
"train_runtime": 2947.9518,
"train_tokens_per_second": 6878.552
},
{
"epoch": 0.8687826243475131,
"grad_norm": 0.5232411623001099,
"learning_rate": 4.036314302439319e-05,
"loss": 0.4938,
"num_input_tokens_seen": 20333328,
"step": 1810,
"train_runtime": 2955.884,
"train_tokens_per_second": 6878.933
},
{
"epoch": 0.8711825763484731,
"grad_norm": 0.7968527674674988,
"learning_rate": 4.031354259630209e-05,
"loss": 0.5246,
"num_input_tokens_seen": 20389752,
"step": 1815,
"train_runtime": 2963.7323,
"train_tokens_per_second": 6879.755
},
{
"epoch": 0.8735825283494331,
"grad_norm": 0.5793075561523438,
"learning_rate": 4.026384550142978e-05,
"loss": 0.5467,
"num_input_tokens_seen": 20447184,
"step": 1820,
"train_runtime": 2971.7237,
"train_tokens_per_second": 6880.58
},
{
"epoch": 0.875982480350393,
"grad_norm": 0.6629696488380432,
"learning_rate": 4.0214052053489304e-05,
"loss": 0.4753,
"num_input_tokens_seen": 20501512,
"step": 1825,
"train_runtime": 2979.5222,
"train_tokens_per_second": 6880.805
},
{
"epoch": 0.8783824323513529,
"grad_norm": 0.6974778175354004,
"learning_rate": 4.016416256680194e-05,
"loss": 0.5134,
"num_input_tokens_seen": 20556688,
"step": 1830,
"train_runtime": 2987.3905,
"train_tokens_per_second": 6881.152
},
{
"epoch": 0.8807823843523129,
"grad_norm": 0.7780594825744629,
"learning_rate": 4.011417735629522e-05,
"loss": 0.4771,
"num_input_tokens_seen": 20613504,
"step": 1835,
"train_runtime": 2995.7447,
"train_tokens_per_second": 6880.928
},
{
"epoch": 0.8831823363532729,
"grad_norm": 0.6135735511779785,
"learning_rate": 4.006409673750094e-05,
"loss": 0.4904,
"num_input_tokens_seen": 20670776,
"step": 1840,
"train_runtime": 3004.2957,
"train_tokens_per_second": 6880.407
},
{
"epoch": 0.8855822883542329,
"grad_norm": 0.6567316651344299,
"learning_rate": 4.0013921026553125e-05,
"loss": 0.5172,
"num_input_tokens_seen": 20726776,
"step": 1845,
"train_runtime": 3012.3296,
"train_tokens_per_second": 6880.647
},
{
"epoch": 0.8879822403551929,
"grad_norm": 0.733647882938385,
"learning_rate": 3.9963650540186116e-05,
"loss": 0.5168,
"num_input_tokens_seen": 20781792,
"step": 1850,
"train_runtime": 3020.8457,
"train_tokens_per_second": 6879.462
},
{
"epoch": 0.8903821923561529,
"grad_norm": 0.7651314735412598,
"learning_rate": 3.991328559573248e-05,
"loss": 0.551,
"num_input_tokens_seen": 20835512,
"step": 1855,
"train_runtime": 3028.6209,
"train_tokens_per_second": 6879.538
},
{
"epoch": 0.8927821443571129,
"grad_norm": 0.7899940013885498,
"learning_rate": 3.9862826511121085e-05,
"loss": 0.5242,
"num_input_tokens_seen": 20887216,
"step": 1860,
"train_runtime": 3036.1277,
"train_tokens_per_second": 6879.558
},
{
"epoch": 0.8951820963580729,
"grad_norm": 0.6774663329124451,
"learning_rate": 3.981227360487504e-05,
"loss": 0.5273,
"num_input_tokens_seen": 20943744,
"step": 1865,
"train_runtime": 3044.3369,
"train_tokens_per_second": 6879.575
},
{
"epoch": 0.8975820483590328,
"grad_norm": 0.6696859002113342,
"learning_rate": 3.976162719610972e-05,
"loss": 0.5006,
"num_input_tokens_seen": 20991568,
"step": 1870,
"train_runtime": 3053.2072,
"train_tokens_per_second": 6875.252
},
{
"epoch": 0.8999820003599928,
"grad_norm": 0.7721266746520996,
"learning_rate": 3.971088760453071e-05,
"loss": 0.5214,
"num_input_tokens_seen": 21047408,
"step": 1875,
"train_runtime": 3061.9813,
"train_tokens_per_second": 6873.787
},
{
"epoch": 0.9023819523609528,
"grad_norm": 0.7528117299079895,
"learning_rate": 3.966005515043183e-05,
"loss": 0.5172,
"num_input_tokens_seen": 21105344,
"step": 1880,
"train_runtime": 3070.238,
"train_tokens_per_second": 6874.172
},
{
"epoch": 0.9047819043619127,
"grad_norm": 0.7893593311309814,
"learning_rate": 3.960913015469311e-05,
"loss": 0.5581,
"num_input_tokens_seen": 21161704,
"step": 1885,
"train_runtime": 3078.4575,
"train_tokens_per_second": 6874.126
},
{
"epoch": 0.9071818563628727,
"grad_norm": 0.6411826610565186,
"learning_rate": 3.95581129387787e-05,
"loss": 0.5006,
"num_input_tokens_seen": 21220960,
"step": 1890,
"train_runtime": 3087.9925,
"train_tokens_per_second": 6872.089
},
{
"epoch": 0.9095818083638327,
"grad_norm": 0.48201116919517517,
"learning_rate": 3.950700382473494e-05,
"loss": 0.5143,
"num_input_tokens_seen": 21285456,
"step": 1895,
"train_runtime": 3097.6261,
"train_tokens_per_second": 6871.538
},
{
"epoch": 0.9119817603647927,
"grad_norm": 0.7874345779418945,
"learning_rate": 3.9455803135188265e-05,
"loss": 0.5133,
"num_input_tokens_seen": 21340656,
"step": 1900,
"train_runtime": 3105.618,
"train_tokens_per_second": 6871.629
},
{
"epoch": 0.9143817123657527,
"grad_norm": 0.8059301972389221,
"learning_rate": 3.940451119334315e-05,
"loss": 0.4716,
"num_input_tokens_seen": 21402256,
"step": 1905,
"train_runtime": 3114.7644,
"train_tokens_per_second": 6871.228
},
{
"epoch": 0.9167816643667127,
"grad_norm": 0.5982013940811157,
"learning_rate": 3.935312832298014e-05,
"loss": 0.4752,
"num_input_tokens_seen": 21456968,
"step": 1910,
"train_runtime": 3122.6252,
"train_tokens_per_second": 6871.452
},
{
"epoch": 0.9191816163676726,
"grad_norm": 0.6114861965179443,
"learning_rate": 3.9301654848453744e-05,
"loss": 0.5358,
"num_input_tokens_seen": 21510880,
"step": 1915,
"train_runtime": 3130.5306,
"train_tokens_per_second": 6871.321
},
{
"epoch": 0.9215815683686326,
"grad_norm": 0.6739422678947449,
"learning_rate": 3.9250091094690424e-05,
"loss": 0.508,
"num_input_tokens_seen": 21567176,
"step": 1920,
"train_runtime": 3139.4979,
"train_tokens_per_second": 6869.626
},
{
"epoch": 0.9239815203695926,
"grad_norm": 0.9573784470558167,
"learning_rate": 3.9198437387186514e-05,
"loss": 0.4969,
"num_input_tokens_seen": 21616728,
"step": 1925,
"train_runtime": 3147.1512,
"train_tokens_per_second": 6868.665
},
{
"epoch": 0.9263814723705526,
"grad_norm": 0.6872597336769104,
"learning_rate": 3.914669405200619e-05,
"loss": 0.5231,
"num_input_tokens_seen": 21669600,
"step": 1930,
"train_runtime": 3154.6855,
"train_tokens_per_second": 6869.021
},
{
"epoch": 0.9287814243715126,
"grad_norm": 0.5402712225914001,
"learning_rate": 3.909486141577941e-05,
"loss": 0.5557,
"num_input_tokens_seen": 21725144,
"step": 1935,
"train_runtime": 3162.9029,
"train_tokens_per_second": 6868.736
},
{
"epoch": 0.9311813763724726,
"grad_norm": 0.5620856881141663,
"learning_rate": 3.904293980569983e-05,
"loss": 0.5202,
"num_input_tokens_seen": 21780960,
"step": 1940,
"train_runtime": 3171.7075,
"train_tokens_per_second": 6867.266
},
{
"epoch": 0.9335813283734326,
"grad_norm": 0.48633241653442383,
"learning_rate": 3.899092954952276e-05,
"loss": 0.4965,
"num_input_tokens_seen": 21835904,
"step": 1945,
"train_runtime": 3180.9981,
"train_tokens_per_second": 6864.482
},
{
"epoch": 0.9359812803743925,
"grad_norm": 0.6408486366271973,
"learning_rate": 3.89388309755631e-05,
"loss": 0.5271,
"num_input_tokens_seen": 21890264,
"step": 1950,
"train_runtime": 3188.8619,
"train_tokens_per_second": 6864.601
},
{
"epoch": 0.9383812323753525,
"grad_norm": 0.6832561492919922,
"learning_rate": 3.888664441269324e-05,
"loss": 0.513,
"num_input_tokens_seen": 21943944,
"step": 1955,
"train_runtime": 3196.9004,
"train_tokens_per_second": 6864.131
},
{
"epoch": 0.9407811843763125,
"grad_norm": 0.7224368453025818,
"learning_rate": 3.8834370190341016e-05,
"loss": 0.4975,
"num_input_tokens_seen": 22000688,
"step": 1960,
"train_runtime": 3205.2356,
"train_tokens_per_second": 6863.985
},
{
"epoch": 0.9431811363772724,
"grad_norm": 0.921877384185791,
"learning_rate": 3.8782008638487585e-05,
"loss": 0.5142,
"num_input_tokens_seen": 22056928,
"step": 1965,
"train_runtime": 3213.437,
"train_tokens_per_second": 6863.968
},
{
"epoch": 0.9455810883782324,
"grad_norm": 0.8015443682670593,
"learning_rate": 3.872956008766541e-05,
"loss": 0.5345,
"num_input_tokens_seen": 22109984,
"step": 1970,
"train_runtime": 3221.3456,
"train_tokens_per_second": 6863.586
},
{
"epoch": 0.9479810403791924,
"grad_norm": 0.60637366771698,
"learning_rate": 3.867702486895611e-05,
"loss": 0.519,
"num_input_tokens_seen": 22167792,
"step": 1975,
"train_runtime": 3229.4918,
"train_tokens_per_second": 6864.173
},
{
"epoch": 0.9503809923801524,
"grad_norm": 0.6260784268379211,
"learning_rate": 3.86244033139884e-05,
"loss": 0.4549,
"num_input_tokens_seen": 22224944,
"step": 1980,
"train_runtime": 3237.4363,
"train_tokens_per_second": 6864.983
},
{
"epoch": 0.9527809443811124,
"grad_norm": 0.7488238215446472,
"learning_rate": 3.857169575493601e-05,
"loss": 0.4988,
"num_input_tokens_seen": 22280208,
"step": 1985,
"train_runtime": 3245.3144,
"train_tokens_per_second": 6865.347
},
{
"epoch": 0.9551808963820724,
"grad_norm": 1.2673466205596924,
"learning_rate": 3.851890252451553e-05,
"loss": 0.5948,
"num_input_tokens_seen": 22331688,
"step": 1990,
"train_runtime": 3252.7162,
"train_tokens_per_second": 6865.551
},
{
"epoch": 0.9575808483830324,
"grad_norm": 0.7167654633522034,
"learning_rate": 3.846602395598441e-05,
"loss": 0.4765,
"num_input_tokens_seen": 22391056,
"step": 1995,
"train_runtime": 3261.3251,
"train_tokens_per_second": 6865.631
},
{
"epoch": 0.9599808003839924,
"grad_norm": 0.7767099142074585,
"learning_rate": 3.8413060383138735e-05,
"loss": 0.5067,
"num_input_tokens_seen": 22442560,
"step": 2000,
"train_runtime": 3268.751,
"train_tokens_per_second": 6865.791
},
{
"epoch": 0.9623807523849524,
"grad_norm": 0.6243239641189575,
"learning_rate": 3.836001214031122e-05,
"loss": 0.441,
"num_input_tokens_seen": 22504640,
"step": 2005,
"train_runtime": 3277.3712,
"train_tokens_per_second": 6866.674
},
{
"epoch": 0.9647807043859122,
"grad_norm": 0.7347325086593628,
"learning_rate": 3.830687956236907e-05,
"loss": 0.4923,
"num_input_tokens_seen": 22565448,
"step": 2010,
"train_runtime": 3285.5854,
"train_tokens_per_second": 6868.014
},
{
"epoch": 0.9671806563868722,
"grad_norm": 0.7760552167892456,
"learning_rate": 3.8253662984711795e-05,
"loss": 0.4971,
"num_input_tokens_seen": 22618928,
"step": 2015,
"train_runtime": 3293.6417,
"train_tokens_per_second": 6867.453
},
{
"epoch": 0.9695806083878322,
"grad_norm": 0.6205884218215942,
"learning_rate": 3.820036274326922e-05,
"loss": 0.4979,
"num_input_tokens_seen": 22674720,
"step": 2020,
"train_runtime": 3301.4874,
"train_tokens_per_second": 6868.032
},
{
"epoch": 0.9719805603887922,
"grad_norm": 0.7021058797836304,
"learning_rate": 3.8146979174499265e-05,
"loss": 0.48,
"num_input_tokens_seen": 22734768,
"step": 2025,
"train_runtime": 3309.628,
"train_tokens_per_second": 6869.282
},
{
"epoch": 0.9743805123897522,
"grad_norm": 0.8105769753456116,
"learning_rate": 3.809351261538585e-05,
"loss": 0.4802,
"num_input_tokens_seen": 22792864,
"step": 2030,
"train_runtime": 3318.078,
"train_tokens_per_second": 6869.297
},
{
"epoch": 0.9767804643907122,
"grad_norm": 0.7583296895027161,
"learning_rate": 3.8039963403436806e-05,
"loss": 0.5393,
"num_input_tokens_seen": 22846392,
"step": 2035,
"train_runtime": 3326.837,
"train_tokens_per_second": 6867.301
},
{
"epoch": 0.9791804163916722,
"grad_norm": 0.7417272925376892,
"learning_rate": 3.798633187668166e-05,
"loss": 0.5505,
"num_input_tokens_seen": 22899608,
"step": 2040,
"train_runtime": 3337.0101,
"train_tokens_per_second": 6862.313
},
{
"epoch": 0.9815803683926322,
"grad_norm": 0.6118446588516235,
"learning_rate": 3.793261837366959e-05,
"loss": 0.4829,
"num_input_tokens_seen": 22960648,
"step": 2045,
"train_runtime": 3348.0559,
"train_tokens_per_second": 6857.905
},
{
"epoch": 0.9839803203935922,
"grad_norm": 0.6822954416275024,
"learning_rate": 3.7878823233467234e-05,
"loss": 0.5252,
"num_input_tokens_seen": 23017960,
"step": 2050,
"train_runtime": 3357.979,
"train_tokens_per_second": 6854.706
},
{
"epoch": 0.9863802723945521,
"grad_norm": 0.8443323373794556,
"learning_rate": 3.782494679565656e-05,
"loss": 0.5098,
"num_input_tokens_seen": 23073264,
"step": 2055,
"train_runtime": 3367.9787,
"train_tokens_per_second": 6850.775
},
{
"epoch": 0.988780224395512,
"grad_norm": 0.8180744647979736,
"learning_rate": 3.777098940033275e-05,
"loss": 0.4722,
"num_input_tokens_seen": 23130952,
"step": 2060,
"train_runtime": 3379.0655,
"train_tokens_per_second": 6845.37
},
{
"epoch": 0.991180176396472,
"grad_norm": 1.0012092590332031,
"learning_rate": 3.7716951388102e-05,
"loss": 0.512,
"num_input_tokens_seen": 23184912,
"step": 2065,
"train_runtime": 3390.0285,
"train_tokens_per_second": 6839.15
},
{
"epoch": 0.993580128397432,
"grad_norm": 0.8469212651252747,
"learning_rate": 3.766283310007943e-05,
"loss": 0.5002,
"num_input_tokens_seen": 23238656,
"step": 2070,
"train_runtime": 3398.1559,
"train_tokens_per_second": 6838.608
},
{
"epoch": 0.995980080398392,
"grad_norm": 0.7020851969718933,
"learning_rate": 3.7608634877886885e-05,
"loss": 0.5014,
"num_input_tokens_seen": 23293008,
"step": 2075,
"train_runtime": 3406.0069,
"train_tokens_per_second": 6838.802
},
{
"epoch": 0.998380032399352,
"grad_norm": 0.9155061841011047,
"learning_rate": 3.755435706365079e-05,
"loss": 0.4932,
"num_input_tokens_seen": 23349040,
"step": 2080,
"train_runtime": 3414.3354,
"train_tokens_per_second": 6838.531
},
{
"epoch": 1.000479990400192,
"grad_norm": 0.7089964151382446,
"learning_rate": 3.7500000000000003e-05,
"loss": 0.5376,
"num_input_tokens_seen": 23400800,
"step": 2085,
"train_runtime": 3421.5018,
"train_tokens_per_second": 6839.336
},
{
"epoch": 1.002879942401152,
"grad_norm": 0.5927316546440125,
"learning_rate": 3.7445564030063646e-05,
"loss": 0.4811,
"num_input_tokens_seen": 23456048,
"step": 2090,
"train_runtime": 3429.5202,
"train_tokens_per_second": 6839.455
},
{
"epoch": 1.005279894402112,
"grad_norm": 0.5862952470779419,
"learning_rate": 3.739104949746893e-05,
"loss": 0.4931,
"num_input_tokens_seen": 23511576,
"step": 2095,
"train_runtime": 3437.359,
"train_tokens_per_second": 6840.012
},
{
"epoch": 1.0076798464030718,
"grad_norm": 0.8004628419876099,
"learning_rate": 3.7336456746339e-05,
"loss": 0.4666,
"num_input_tokens_seen": 23567088,
"step": 2100,
"train_runtime": 3445.3003,
"train_tokens_per_second": 6840.358
},
{
"epoch": 1.010079798404032,
"grad_norm": 0.5078383088111877,
"learning_rate": 3.728178612129075e-05,
"loss": 0.4806,
"num_input_tokens_seen": 23626528,
"step": 2105,
"train_runtime": 3454.0548,
"train_tokens_per_second": 6840.23
},
{
"epoch": 1.0124797504049918,
"grad_norm": 0.8467037081718445,
"learning_rate": 3.722703796743267e-05,
"loss": 0.4856,
"num_input_tokens_seen": 23681288,
"step": 2110,
"train_runtime": 3462.258,
"train_tokens_per_second": 6839.839
},
{
"epoch": 1.014879702405952,
"grad_norm": 0.6897312998771667,
"learning_rate": 3.7172212630362627e-05,
"loss": 0.5198,
"num_input_tokens_seen": 23740272,
"step": 2115,
"train_runtime": 3470.5143,
"train_tokens_per_second": 6840.563
},
{
"epoch": 1.0172796544069118,
"grad_norm": 0.7425886392593384,
"learning_rate": 3.7117310456165696e-05,
"loss": 0.5217,
"num_input_tokens_seen": 23796168,
"step": 2120,
"train_runtime": 3478.8621,
"train_tokens_per_second": 6840.216
},
{
"epoch": 1.019679606407872,
"grad_norm": 0.7550194263458252,
"learning_rate": 3.7062331791412045e-05,
"loss": 0.5463,
"num_input_tokens_seen": 23852288,
"step": 2125,
"train_runtime": 3486.8348,
"train_tokens_per_second": 6840.67
},
{
"epoch": 1.0220795584088318,
"grad_norm": 0.5753782391548157,
"learning_rate": 3.700727698315463e-05,
"loss": 0.5069,
"num_input_tokens_seen": 23906400,
"step": 2130,
"train_runtime": 3494.7803,
"train_tokens_per_second": 6840.602
},
{
"epoch": 1.024479510409792,
"grad_norm": 0.7684709429740906,
"learning_rate": 3.6952146378927095e-05,
"loss": 0.4976,
"num_input_tokens_seen": 23966288,
"step": 2135,
"train_runtime": 3503.1065,
"train_tokens_per_second": 6841.439
},
{
"epoch": 1.0268794624107518,
"grad_norm": 0.8290258646011353,
"learning_rate": 3.689694032674153e-05,
"loss": 0.4863,
"num_input_tokens_seen": 24019784,
"step": 2140,
"train_runtime": 3511.9759,
"train_tokens_per_second": 6839.393
},
{
"epoch": 1.0292794144117117,
"grad_norm": 0.5777615904808044,
"learning_rate": 3.684165917508628e-05,
"loss": 0.5026,
"num_input_tokens_seen": 24075104,
"step": 2145,
"train_runtime": 3522.5617,
"train_tokens_per_second": 6834.544
},
{
"epoch": 1.0316793664126718,
"grad_norm": 0.8155114650726318,
"learning_rate": 3.678630327292381e-05,
"loss": 0.5197,
"num_input_tokens_seen": 24125896,
"step": 2150,
"train_runtime": 3530.4751,
"train_tokens_per_second": 6833.612
},
{
"epoch": 1.0340793184136317,
"grad_norm": 0.5378252267837524,
"learning_rate": 3.673087296968838e-05,
"loss": 0.4873,
"num_input_tokens_seen": 24182088,
"step": 2155,
"train_runtime": 3538.664,
"train_tokens_per_second": 6833.677
},
{
"epoch": 1.0364792704145918,
"grad_norm": 0.8574205040931702,
"learning_rate": 3.667536861528396e-05,
"loss": 0.515,
"num_input_tokens_seen": 24242048,
"step": 2160,
"train_runtime": 3547.103,
"train_tokens_per_second": 6834.323
},
{
"epoch": 1.0388792224155516,
"grad_norm": 0.8171690106391907,
"learning_rate": 3.661979056008191e-05,
"loss": 0.486,
"num_input_tokens_seen": 24294336,
"step": 2165,
"train_runtime": 3554.7165,
"train_tokens_per_second": 6834.395
},
{
"epoch": 1.0412791744165117,
"grad_norm": 0.7367947101593018,
"learning_rate": 3.6564139154918895e-05,
"loss": 0.5121,
"num_input_tokens_seen": 24348872,
"step": 2170,
"train_runtime": 3562.3935,
"train_tokens_per_second": 6834.975
},
{
"epoch": 1.0436791264174716,
"grad_norm": 0.718895673751831,
"learning_rate": 3.6508414751094556e-05,
"loss": 0.5462,
"num_input_tokens_seen": 24402136,
"step": 2175,
"train_runtime": 3570.1249,
"train_tokens_per_second": 6835.093
},
{
"epoch": 1.0460790784184317,
"grad_norm": 0.7847620248794556,
"learning_rate": 3.6452617700369345e-05,
"loss": 0.4975,
"num_input_tokens_seen": 24451792,
"step": 2180,
"train_runtime": 3577.4533,
"train_tokens_per_second": 6834.972
},
{
"epoch": 1.0484790304193916,
"grad_norm": 0.7218212485313416,
"learning_rate": 3.639674835496232e-05,
"loss": 0.568,
"num_input_tokens_seen": 24508800,
"step": 2185,
"train_runtime": 3585.3931,
"train_tokens_per_second": 6835.736
},
{
"epoch": 1.0508789824203515,
"grad_norm": 0.6216446161270142,
"learning_rate": 3.634080706754887e-05,
"loss": 0.5024,
"num_input_tokens_seen": 24567000,
"step": 2190,
"train_runtime": 3593.4867,
"train_tokens_per_second": 6836.536
},
{
"epoch": 1.0532789344213116,
"grad_norm": 0.7098725438117981,
"learning_rate": 3.628479419125852e-05,
"loss": 0.5057,
"num_input_tokens_seen": 24629752,
"step": 2195,
"train_runtime": 3602.2113,
"train_tokens_per_second": 6837.398
},
{
"epoch": 1.0556788864222715,
"grad_norm": 0.7154077887535095,
"learning_rate": 3.6228710079672734e-05,
"loss": 0.5329,
"num_input_tokens_seen": 24685968,
"step": 2200,
"train_runtime": 3610.3704,
"train_tokens_per_second": 6837.517
},
{
"epoch": 1.0580788384232316,
"grad_norm": 0.6186597347259521,
"learning_rate": 3.6172555086822615e-05,
"loss": 0.5114,
"num_input_tokens_seen": 24745552,
"step": 2205,
"train_runtime": 3618.4119,
"train_tokens_per_second": 6838.788
},
{
"epoch": 1.0604787904241915,
"grad_norm": 0.7932461500167847,
"learning_rate": 3.6116329567186724e-05,
"loss": 0.4939,
"num_input_tokens_seen": 24799856,
"step": 2210,
"train_runtime": 3626.1603,
"train_tokens_per_second": 6839.151
},
{
"epoch": 1.0628787424251516,
"grad_norm": 0.7647953629493713,
"learning_rate": 3.6060033875688804e-05,
"loss": 0.5289,
"num_input_tokens_seen": 24853952,
"step": 2215,
"train_runtime": 3633.6609,
"train_tokens_per_second": 6839.921
},
{
"epoch": 1.0652786944261114,
"grad_norm": 0.722197413444519,
"learning_rate": 3.600366836769557e-05,
"loss": 0.5015,
"num_input_tokens_seen": 24911328,
"step": 2220,
"train_runtime": 3641.5303,
"train_tokens_per_second": 6840.895
},
{
"epoch": 1.0676786464270716,
"grad_norm": 0.9403772354125977,
"learning_rate": 3.5947233399014444e-05,
"loss": 0.4982,
"num_input_tokens_seen": 24967496,
"step": 2225,
"train_runtime": 3649.8212,
"train_tokens_per_second": 6840.745
},
{
"epoch": 1.0700785984280314,
"grad_norm": 0.5855931639671326,
"learning_rate": 3.589072932589134e-05,
"loss": 0.4706,
"num_input_tokens_seen": 25028408,
"step": 2230,
"train_runtime": 3658.1326,
"train_tokens_per_second": 6841.854
},
{
"epoch": 1.0724785504289913,
"grad_norm": 0.7537211179733276,
"learning_rate": 3.583415650500837e-05,
"loss": 0.5351,
"num_input_tokens_seen": 25082672,
"step": 2235,
"train_runtime": 3665.8181,
"train_tokens_per_second": 6842.312
},
{
"epoch": 1.0748785024299514,
"grad_norm": 0.7052933573722839,
"learning_rate": 3.577751529348163e-05,
"loss": 0.5137,
"num_input_tokens_seen": 25138272,
"step": 2240,
"train_runtime": 3673.8839,
"train_tokens_per_second": 6842.424
},
{
"epoch": 1.0772784544309113,
"grad_norm": 0.6160354614257812,
"learning_rate": 3.572080604885894e-05,
"loss": 0.4984,
"num_input_tokens_seen": 25198880,
"step": 2245,
"train_runtime": 3682.6208,
"train_tokens_per_second": 6842.649
},
{
"epoch": 1.0796784064318714,
"grad_norm": 0.7151322960853577,
"learning_rate": 3.566402912911755e-05,
"loss": 0.4745,
"num_input_tokens_seen": 25255672,
"step": 2250,
"train_runtime": 3691.127,
"train_tokens_per_second": 6842.266
},
{
"epoch": 1.0820783584328313,
"grad_norm": 0.6750310063362122,
"learning_rate": 3.560718489266194e-05,
"loss": 0.4705,
"num_input_tokens_seen": 25310096,
"step": 2255,
"train_runtime": 3698.9218,
"train_tokens_per_second": 6842.561
},
{
"epoch": 1.0844783104337914,
"grad_norm": 0.7280714511871338,
"learning_rate": 3.555027369832151e-05,
"loss": 0.529,
"num_input_tokens_seen": 25365416,
"step": 2260,
"train_runtime": 3706.9184,
"train_tokens_per_second": 6842.723
},
{
"epoch": 1.0868782624347513,
"grad_norm": 0.7498377561569214,
"learning_rate": 3.5493295905348334e-05,
"loss": 0.4974,
"num_input_tokens_seen": 25421480,
"step": 2265,
"train_runtime": 3715.1661,
"train_tokens_per_second": 6842.623
},
{
"epoch": 1.0892782144357114,
"grad_norm": 0.7328541874885559,
"learning_rate": 3.54362518734149e-05,
"loss": 0.4618,
"num_input_tokens_seen": 25482160,
"step": 2270,
"train_runtime": 3723.7211,
"train_tokens_per_second": 6843.198
},
{
"epoch": 1.0916781664366713,
"grad_norm": 0.6172477006912231,
"learning_rate": 3.537914196261181e-05,
"loss": 0.5266,
"num_input_tokens_seen": 25538416,
"step": 2275,
"train_runtime": 3731.9378,
"train_tokens_per_second": 6843.205
},
{
"epoch": 1.0940781184376314,
"grad_norm": 0.5969734191894531,
"learning_rate": 3.5321966533445547e-05,
"loss": 0.5244,
"num_input_tokens_seen": 25594328,
"step": 2280,
"train_runtime": 3739.9474,
"train_tokens_per_second": 6843.499
},
{
"epoch": 1.0964780704385912,
"grad_norm": 0.9102872610092163,
"learning_rate": 3.526472594683617e-05,
"loss": 0.5011,
"num_input_tokens_seen": 25647608,
"step": 2285,
"train_runtime": 3747.8696,
"train_tokens_per_second": 6843.25
},
{
"epoch": 1.0988780224395511,
"grad_norm": 0.7734837532043457,
"learning_rate": 3.5207420564115045e-05,
"loss": 0.5229,
"num_input_tokens_seen": 25702960,
"step": 2290,
"train_runtime": 3755.5877,
"train_tokens_per_second": 6843.925
},
{
"epoch": 1.1012779744405112,
"grad_norm": 0.6865848898887634,
"learning_rate": 3.515005074702256e-05,
"loss": 0.5035,
"num_input_tokens_seen": 25758120,
"step": 2295,
"train_runtime": 3763.673,
"train_tokens_per_second": 6843.878
},
{
"epoch": 1.1036779264414711,
"grad_norm": 0.6671602129936218,
"learning_rate": 3.509261685770585e-05,
"loss": 0.4939,
"num_input_tokens_seen": 25817024,
"step": 2300,
"train_runtime": 3772.0902,
"train_tokens_per_second": 6844.222
},
{
"epoch": 1.1060778784424312,
"grad_norm": 0.6217396855354309,
"learning_rate": 3.5035119258716495e-05,
"loss": 0.5389,
"num_input_tokens_seen": 25876744,
"step": 2305,
"train_runtime": 3780.9145,
"train_tokens_per_second": 6844.044
},
{
"epoch": 1.108477830443391,
"grad_norm": 0.7444595098495483,
"learning_rate": 3.497755831300828e-05,
"loss": 0.49,
"num_input_tokens_seen": 25928600,
"step": 2310,
"train_runtime": 3788.9853,
"train_tokens_per_second": 6843.151
},
{
"epoch": 1.1108777824443512,
"grad_norm": 0.6591025590896606,
"learning_rate": 3.491993438393481e-05,
"loss": 0.4658,
"num_input_tokens_seen": 25985192,
"step": 2315,
"train_runtime": 3797.2779,
"train_tokens_per_second": 6843.11
},
{
"epoch": 1.113277734445311,
"grad_norm": 0.7887580394744873,
"learning_rate": 3.486224783524731e-05,
"loss": 0.5464,
"num_input_tokens_seen": 26040520,
"step": 2320,
"train_runtime": 3804.9274,
"train_tokens_per_second": 6843.894
},
{
"epoch": 1.1156776864462712,
"grad_norm": 0.8074533939361572,
"learning_rate": 3.480449903109229e-05,
"loss": 0.5227,
"num_input_tokens_seen": 26093336,
"step": 2325,
"train_runtime": 3812.5053,
"train_tokens_per_second": 6844.144
},
{
"epoch": 1.118077638447231,
"grad_norm": 0.7056359648704529,
"learning_rate": 3.474668833600923e-05,
"loss": 0.4759,
"num_input_tokens_seen": 26148320,
"step": 2330,
"train_runtime": 3820.6134,
"train_tokens_per_second": 6844.011
},
{
"epoch": 1.120477590448191,
"grad_norm": 0.841861367225647,
"learning_rate": 3.4688816114928327e-05,
"loss": 0.5181,
"num_input_tokens_seen": 26206080,
"step": 2335,
"train_runtime": 3828.5922,
"train_tokens_per_second": 6844.835
},
{
"epoch": 1.122877542449151,
"grad_norm": 0.6521568298339844,
"learning_rate": 3.4630882733168116e-05,
"loss": 0.4938,
"num_input_tokens_seen": 26262688,
"step": 2340,
"train_runtime": 3836.5264,
"train_tokens_per_second": 6845.434
},
{
"epoch": 1.125277494450111,
"grad_norm": 0.7665443420410156,
"learning_rate": 3.4572888556433246e-05,
"loss": 0.4681,
"num_input_tokens_seen": 26321160,
"step": 2345,
"train_runtime": 3844.9857,
"train_tokens_per_second": 6845.581
},
{
"epoch": 1.127677446451071,
"grad_norm": 0.616336464881897,
"learning_rate": 3.451483395081212e-05,
"loss": 0.4631,
"num_input_tokens_seen": 26378192,
"step": 2350,
"train_runtime": 3853.2119,
"train_tokens_per_second": 6845.767
},
{
"epoch": 1.130077398452031,
"grad_norm": 0.6478726863861084,
"learning_rate": 3.445671928277461e-05,
"loss": 0.4676,
"num_input_tokens_seen": 26430848,
"step": 2355,
"train_runtime": 3861.022,
"train_tokens_per_second": 6845.558
},
{
"epoch": 1.132477350452991,
"grad_norm": 0.6371597647666931,
"learning_rate": 3.4398544919169715e-05,
"loss": 0.4904,
"num_input_tokens_seen": 26489064,
"step": 2360,
"train_runtime": 3868.9291,
"train_tokens_per_second": 6846.614
},
{
"epoch": 1.134877302453951,
"grad_norm": 0.6929451823234558,
"learning_rate": 3.4340311227223273e-05,
"loss": 0.5352,
"num_input_tokens_seen": 26543528,
"step": 2365,
"train_runtime": 3877.0017,
"train_tokens_per_second": 6846.406
},
{
"epoch": 1.137277254454911,
"grad_norm": 0.9073979258537292,
"learning_rate": 3.428201857453562e-05,
"loss": 0.5051,
"num_input_tokens_seen": 26596928,
"step": 2370,
"train_runtime": 3884.7443,
"train_tokens_per_second": 6846.507
},
{
"epoch": 1.139677206455871,
"grad_norm": 0.7150000929832458,
"learning_rate": 3.422366732907931e-05,
"loss": 0.4361,
"num_input_tokens_seen": 26654072,
"step": 2375,
"train_runtime": 3893.2295,
"train_tokens_per_second": 6846.263
},
{
"epoch": 1.1420771584568308,
"grad_norm": 0.6671944260597229,
"learning_rate": 3.416525785919673e-05,
"loss": 0.488,
"num_input_tokens_seen": 26707464,
"step": 2380,
"train_runtime": 3901.0068,
"train_tokens_per_second": 6846.3
},
{
"epoch": 1.1444771104577909,
"grad_norm": 0.585337221622467,
"learning_rate": 3.410679053359784e-05,
"loss": 0.4326,
"num_input_tokens_seen": 26766704,
"step": 2385,
"train_runtime": 3909.5898,
"train_tokens_per_second": 6846.423
},
{
"epoch": 1.1468770624587508,
"grad_norm": 0.5534717440605164,
"learning_rate": 3.404826572135779e-05,
"loss": 0.4831,
"num_input_tokens_seen": 26826328,
"step": 2390,
"train_runtime": 3918.5924,
"train_tokens_per_second": 6845.909
},
{
"epoch": 1.1492770144597109,
"grad_norm": 0.5429486632347107,
"learning_rate": 3.398968379191462e-05,
"loss": 0.4909,
"num_input_tokens_seen": 26880888,
"step": 2395,
"train_runtime": 3926.453,
"train_tokens_per_second": 6846.099
},
{
"epoch": 1.1516769664606707,
"grad_norm": 0.8771390914916992,
"learning_rate": 3.393104511506694e-05,
"loss": 0.4903,
"num_input_tokens_seen": 26937800,
"step": 2400,
"train_runtime": 3934.3502,
"train_tokens_per_second": 6846.823
},
{
"epoch": 1.1540769184616309,
"grad_norm": 0.7701951861381531,
"learning_rate": 3.387235006097155e-05,
"loss": 0.4994,
"num_input_tokens_seen": 26993776,
"step": 2405,
"train_runtime": 3942.0785,
"train_tokens_per_second": 6847.6
},
{
"epoch": 1.1564768704625907,
"grad_norm": 0.5495705008506775,
"learning_rate": 3.381359900014116e-05,
"loss": 0.4745,
"num_input_tokens_seen": 27053440,
"step": 2410,
"train_runtime": 3950.7471,
"train_tokens_per_second": 6847.677
},
{
"epoch": 1.1588768224635508,
"grad_norm": 0.7725142240524292,
"learning_rate": 3.375479230344199e-05,
"loss": 0.5404,
"num_input_tokens_seen": 27104744,
"step": 2415,
"train_runtime": 3958.6488,
"train_tokens_per_second": 6846.969
},
{
"epoch": 1.1612767744645107,
"grad_norm": 1.0459918975830078,
"learning_rate": 3.369593034209149e-05,
"loss": 0.5069,
"num_input_tokens_seen": 27159864,
"step": 2420,
"train_runtime": 3967.0288,
"train_tokens_per_second": 6846.4
},
{
"epoch": 1.1636767264654706,
"grad_norm": 0.6602296829223633,
"learning_rate": 3.363701348765597e-05,
"loss": 0.4541,
"num_input_tokens_seen": 27219344,
"step": 2425,
"train_runtime": 3976.0119,
"train_tokens_per_second": 6845.891
},
{
"epoch": 1.1660766784664307,
"grad_norm": 0.5902988910675049,
"learning_rate": 3.3578042112048226e-05,
"loss": 0.4447,
"num_input_tokens_seen": 27279536,
"step": 2430,
"train_runtime": 3984.8836,
"train_tokens_per_second": 6845.755
},
{
"epoch": 1.1684766304673906,
"grad_norm": 0.9325588941574097,
"learning_rate": 3.351901658752524e-05,
"loss": 0.5227,
"num_input_tokens_seen": 27336160,
"step": 2435,
"train_runtime": 3992.8679,
"train_tokens_per_second": 6846.247
},
{
"epoch": 1.1708765824683507,
"grad_norm": 0.6601638793945312,
"learning_rate": 3.34599372866858e-05,
"loss": 0.4813,
"num_input_tokens_seen": 27393304,
"step": 2440,
"train_runtime": 4001.2293,
"train_tokens_per_second": 6846.222
},
{
"epoch": 1.1732765344693106,
"grad_norm": 0.8339878916740417,
"learning_rate": 3.3400804582468154e-05,
"loss": 0.5101,
"num_input_tokens_seen": 27444632,
"step": 2445,
"train_runtime": 4008.6642,
"train_tokens_per_second": 6846.329
},
{
"epoch": 1.1756764864702707,
"grad_norm": 0.8969867825508118,
"learning_rate": 3.334161884814769e-05,
"loss": 0.4709,
"num_input_tokens_seen": 27502576,
"step": 2450,
"train_runtime": 4016.7436,
"train_tokens_per_second": 6846.983
},
{
"epoch": 1.1780764384712306,
"grad_norm": 0.8373593091964722,
"learning_rate": 3.3282380457334505e-05,
"loss": 0.5498,
"num_input_tokens_seen": 27559352,
"step": 2455,
"train_runtime": 4024.9244,
"train_tokens_per_second": 6847.173
},
{
"epoch": 1.1804763904721907,
"grad_norm": 0.8110735416412354,
"learning_rate": 3.3223089783971114e-05,
"loss": 0.507,
"num_input_tokens_seen": 27615472,
"step": 2460,
"train_runtime": 4032.7198,
"train_tokens_per_second": 6847.853
},
{
"epoch": 1.1828763424731505,
"grad_norm": 0.7023930549621582,
"learning_rate": 3.3163747202330066e-05,
"loss": 0.498,
"num_input_tokens_seen": 27671096,
"step": 2465,
"train_runtime": 4040.3448,
"train_tokens_per_second": 6848.697
},
{
"epoch": 1.1852762944741104,
"grad_norm": 0.783581554889679,
"learning_rate": 3.310435308701156e-05,
"loss": 0.5188,
"num_input_tokens_seen": 27722512,
"step": 2470,
"train_runtime": 4048.0115,
"train_tokens_per_second": 6848.427
},
{
"epoch": 1.1876762464750705,
"grad_norm": 0.7718804478645325,
"learning_rate": 3.304490781294114e-05,
"loss": 0.4861,
"num_input_tokens_seen": 27778280,
"step": 2475,
"train_runtime": 4055.8209,
"train_tokens_per_second": 6848.991
},
{
"epoch": 1.1900761984760304,
"grad_norm": 0.5067981481552124,
"learning_rate": 3.2985411755367246e-05,
"loss": 0.4792,
"num_input_tokens_seen": 27839424,
"step": 2480,
"train_runtime": 4064.5853,
"train_tokens_per_second": 6849.266
},
{
"epoch": 1.1924761504769905,
"grad_norm": 0.7346833348274231,
"learning_rate": 3.292586528985894e-05,
"loss": 0.4599,
"num_input_tokens_seen": 27894440,
"step": 2485,
"train_runtime": 4072.5002,
"train_tokens_per_second": 6849.463
},
{
"epoch": 1.1948761024779504,
"grad_norm": 0.5885698199272156,
"learning_rate": 3.2866268792303424e-05,
"loss": 0.4936,
"num_input_tokens_seen": 27959096,
"step": 2490,
"train_runtime": 4082.1306,
"train_tokens_per_second": 6849.143
},
{
"epoch": 1.1972760544789105,
"grad_norm": 0.5944679975509644,
"learning_rate": 3.2806622638903764e-05,
"loss": 0.5008,
"num_input_tokens_seen": 28010352,
"step": 2495,
"train_runtime": 4089.5284,
"train_tokens_per_second": 6849.287
},
{
"epoch": 1.1996760064798704,
"grad_norm": 0.7197619080543518,
"learning_rate": 3.274692720617649e-05,
"loss": 0.5232,
"num_input_tokens_seen": 28067424,
"step": 2500,
"train_runtime": 4098.1617,
"train_tokens_per_second": 6848.784
},
{
"epoch": 1.2020759584808305,
"grad_norm": 0.71132493019104,
"learning_rate": 3.2687182870949185e-05,
"loss": 0.4749,
"num_input_tokens_seen": 28126704,
"step": 2505,
"train_runtime": 4106.4308,
"train_tokens_per_second": 6849.428
},
{
"epoch": 1.2044759104817904,
"grad_norm": 0.7117146849632263,
"learning_rate": 3.2627390010358133e-05,
"loss": 0.4965,
"num_input_tokens_seen": 28184072,
"step": 2510,
"train_runtime": 4114.8063,
"train_tokens_per_second": 6849.429
},
{
"epoch": 1.2068758624827503,
"grad_norm": 0.7712971568107605,
"learning_rate": 3.256754900184593e-05,
"loss": 0.489,
"num_input_tokens_seen": 28237608,
"step": 2515,
"train_runtime": 4122.4987,
"train_tokens_per_second": 6849.634
},
{
"epoch": 1.2092758144837104,
"grad_norm": 0.843129575252533,
"learning_rate": 3.2507660223159115e-05,
"loss": 0.449,
"num_input_tokens_seen": 28299544,
"step": 2520,
"train_runtime": 4131.2681,
"train_tokens_per_second": 6850.086
},
{
"epoch": 1.2116757664846702,
"grad_norm": 0.6665219068527222,
"learning_rate": 3.2447724052345786e-05,
"loss": 0.4269,
"num_input_tokens_seen": 28357640,
"step": 2525,
"train_runtime": 4139.6319,
"train_tokens_per_second": 6850.281
},
{
"epoch": 1.2140757184856303,
"grad_norm": 0.7961658835411072,
"learning_rate": 3.238774086775317e-05,
"loss": 0.4937,
"num_input_tokens_seen": 28411848,
"step": 2530,
"train_runtime": 4147.578,
"train_tokens_per_second": 6850.226
},
{
"epoch": 1.2164756704865902,
"grad_norm": 0.7647880911827087,
"learning_rate": 3.2327711048025314e-05,
"loss": 0.473,
"num_input_tokens_seen": 28465072,
"step": 2535,
"train_runtime": 4155.7446,
"train_tokens_per_second": 6849.572
},
{
"epoch": 1.2188756224875503,
"grad_norm": 0.7645636796951294,
"learning_rate": 3.226763497210061e-05,
"loss": 0.5217,
"num_input_tokens_seen": 28513584,
"step": 2540,
"train_runtime": 4162.7633,
"train_tokens_per_second": 6849.677
},
{
"epoch": 1.2212755744885102,
"grad_norm": 0.9397866725921631,
"learning_rate": 3.2207513019209455e-05,
"loss": 0.5058,
"num_input_tokens_seen": 28569888,
"step": 2545,
"train_runtime": 4170.8063,
"train_tokens_per_second": 6849.968
},
{
"epoch": 1.2236755264894703,
"grad_norm": 0.8510188460350037,
"learning_rate": 3.2147345568871874e-05,
"loss": 0.4699,
"num_input_tokens_seen": 28623888,
"step": 2550,
"train_runtime": 4178.6198,
"train_tokens_per_second": 6850.082
},
{
"epoch": 1.2260754784904302,
"grad_norm": 0.7524721622467041,
"learning_rate": 3.208713300089504e-05,
"loss": 0.4585,
"num_input_tokens_seen": 28680088,
"step": 2555,
"train_runtime": 4187.0852,
"train_tokens_per_second": 6849.655
},
{
"epoch": 1.22847543049139,
"grad_norm": 0.6238115429878235,
"learning_rate": 3.2026875695370975e-05,
"loss": 0.4872,
"num_input_tokens_seen": 28733184,
"step": 2560,
"train_runtime": 4194.4934,
"train_tokens_per_second": 6850.216
},
{
"epoch": 1.2308753824923502,
"grad_norm": 0.8195456862449646,
"learning_rate": 3.1966574032674074e-05,
"loss": 0.5134,
"num_input_tokens_seen": 28787400,
"step": 2565,
"train_runtime": 4202.0819,
"train_tokens_per_second": 6850.747
},
{
"epoch": 1.23327533449331,
"grad_norm": 0.7062321305274963,
"learning_rate": 3.190622839345878e-05,
"loss": 0.4758,
"num_input_tokens_seen": 28840944,
"step": 2570,
"train_runtime": 4209.9012,
"train_tokens_per_second": 6850.741
},
{
"epoch": 1.2356752864942702,
"grad_norm": 0.6290914416313171,
"learning_rate": 3.184583915865709e-05,
"loss": 0.5343,
"num_input_tokens_seen": 28893352,
"step": 2575,
"train_runtime": 4217.2229,
"train_tokens_per_second": 6851.275
},
{
"epoch": 1.23807523849523,
"grad_norm": 0.6599912643432617,
"learning_rate": 3.178540670947624e-05,
"loss": 0.4822,
"num_input_tokens_seen": 28952544,
"step": 2580,
"train_runtime": 4225.8796,
"train_tokens_per_second": 6851.247
},
{
"epoch": 1.2404751904961901,
"grad_norm": 0.6899898052215576,
"learning_rate": 3.172493142739622e-05,
"loss": 0.4529,
"num_input_tokens_seen": 29007344,
"step": 2585,
"train_runtime": 4233.7269,
"train_tokens_per_second": 6851.492
},
{
"epoch": 1.24287514249715,
"grad_norm": 0.8615679144859314,
"learning_rate": 3.1664413694167424e-05,
"loss": 0.5018,
"num_input_tokens_seen": 29065880,
"step": 2590,
"train_runtime": 4242.1314,
"train_tokens_per_second": 6851.716
},
{
"epoch": 1.2452750944981101,
"grad_norm": 0.829759955406189,
"learning_rate": 3.160385389180822e-05,
"loss": 0.5014,
"num_input_tokens_seen": 29120600,
"step": 2595,
"train_runtime": 4250.6385,
"train_tokens_per_second": 6850.877
},
{
"epoch": 1.24767504649907,
"grad_norm": 1.099179744720459,
"learning_rate": 3.154325240260254e-05,
"loss": 0.4823,
"num_input_tokens_seen": 29174832,
"step": 2600,
"train_runtime": 4258.7641,
"train_tokens_per_second": 6850.54
},
{
"epoch": 1.25007499850003,
"grad_norm": 0.7731813788414001,
"learning_rate": 3.148260960909745e-05,
"loss": 0.4527,
"num_input_tokens_seen": 29228680,
"step": 2605,
"train_runtime": 4266.7683,
"train_tokens_per_second": 6850.309
},
{
"epoch": 1.25247495050099,
"grad_norm": 0.7874563336372375,
"learning_rate": 3.1421925894100745e-05,
"loss": 0.5152,
"num_input_tokens_seen": 29282976,
"step": 2610,
"train_runtime": 4274.5977,
"train_tokens_per_second": 6850.464
},
{
"epoch": 1.2548749025019499,
"grad_norm": 0.6936095952987671,
"learning_rate": 3.1361201640678554e-05,
"loss": 0.5055,
"num_input_tokens_seen": 29337384,
"step": 2615,
"train_runtime": 4282.725,
"train_tokens_per_second": 6850.168
},
{
"epoch": 1.25727485450291,
"grad_norm": 0.8180893063545227,
"learning_rate": 3.130043723215291e-05,
"loss": 0.4808,
"num_input_tokens_seen": 29398256,
"step": 2620,
"train_runtime": 4291.6094,
"train_tokens_per_second": 6850.17
},
{
"epoch": 1.2596748065038699,
"grad_norm": 0.7401306629180908,
"learning_rate": 3.123963305209932e-05,
"loss": 0.5101,
"num_input_tokens_seen": 29455288,
"step": 2625,
"train_runtime": 4299.6287,
"train_tokens_per_second": 6850.659
},
{
"epoch": 1.26207475850483,
"grad_norm": 0.7376925349235535,
"learning_rate": 3.1178789484344326e-05,
"loss": 0.468,
"num_input_tokens_seen": 29513208,
"step": 2630,
"train_runtime": 4308.0487,
"train_tokens_per_second": 6850.714
},
{
"epoch": 1.2644747105057899,
"grad_norm": 0.7442266345024109,
"learning_rate": 3.1117906912963124e-05,
"loss": 0.5214,
"num_input_tokens_seen": 29566424,
"step": 2635,
"train_runtime": 4315.7814,
"train_tokens_per_second": 6850.77
},
{
"epoch": 1.26687466250675,
"grad_norm": 0.7198356986045837,
"learning_rate": 3.105698572227712e-05,
"loss": 0.5059,
"num_input_tokens_seen": 29621112,
"step": 2640,
"train_runtime": 4324.1308,
"train_tokens_per_second": 6850.189
},
{
"epoch": 1.2692746145077098,
"grad_norm": 0.6759196519851685,
"learning_rate": 3.0996026296851516e-05,
"loss": 0.4705,
"num_input_tokens_seen": 29672896,
"step": 2645,
"train_runtime": 4331.5888,
"train_tokens_per_second": 6850.349
},
{
"epoch": 1.2716745665086697,
"grad_norm": 0.659756600856781,
"learning_rate": 3.093502902149285e-05,
"loss": 0.4753,
"num_input_tokens_seen": 29724344,
"step": 2650,
"train_runtime": 4339.2532,
"train_tokens_per_second": 6850.106
},
{
"epoch": 1.2740745185096298,
"grad_norm": 0.7627817988395691,
"learning_rate": 3.087399428124659e-05,
"loss": 0.5218,
"num_input_tokens_seen": 29779744,
"step": 2655,
"train_runtime": 4347.2112,
"train_tokens_per_second": 6850.31
},
{
"epoch": 1.2764744705105897,
"grad_norm": 0.5417824387550354,
"learning_rate": 3.081292246139473e-05,
"loss": 0.4784,
"num_input_tokens_seen": 29834824,
"step": 2660,
"train_runtime": 4355.3061,
"train_tokens_per_second": 6850.224
},
{
"epoch": 1.2788744225115498,
"grad_norm": 0.7506272792816162,
"learning_rate": 3.0751813947453265e-05,
"loss": 0.4886,
"num_input_tokens_seen": 29890520,
"step": 2665,
"train_runtime": 4362.9276,
"train_tokens_per_second": 6851.024
},
{
"epoch": 1.2812743745125097,
"grad_norm": 0.6071366667747498,
"learning_rate": 3.069066912516991e-05,
"loss": 0.5277,
"num_input_tokens_seen": 29945288,
"step": 2670,
"train_runtime": 4370.6908,
"train_tokens_per_second": 6851.386
},
{
"epoch": 1.2836743265134698,
"grad_norm": 0.7744503021240234,
"learning_rate": 3.0629488380521504e-05,
"loss": 0.5158,
"num_input_tokens_seen": 30001032,
"step": 2675,
"train_runtime": 4378.8355,
"train_tokens_per_second": 6851.372
},
{
"epoch": 1.2860742785144297,
"grad_norm": 0.4839749336242676,
"learning_rate": 3.056827209971167e-05,
"loss": 0.5022,
"num_input_tokens_seen": 30057416,
"step": 2680,
"train_runtime": 4387.5074,
"train_tokens_per_second": 6850.682
},
{
"epoch": 1.2884742305153898,
"grad_norm": 0.5500566363334656,
"learning_rate": 3.0507020669168367e-05,
"loss": 0.4875,
"num_input_tokens_seen": 30113512,
"step": 2685,
"train_runtime": 4395.5794,
"train_tokens_per_second": 6850.863
},
{
"epoch": 1.2908741825163497,
"grad_norm": 0.7816157341003418,
"learning_rate": 3.044573447554141e-05,
"loss": 0.4872,
"num_input_tokens_seen": 30171064,
"step": 2690,
"train_runtime": 4404.1038,
"train_tokens_per_second": 6850.671
},
{
"epoch": 1.2932741345173095,
"grad_norm": 0.6968929767608643,
"learning_rate": 3.038441390570008e-05,
"loss": 0.4715,
"num_input_tokens_seen": 30226872,
"step": 2695,
"train_runtime": 4412.2507,
"train_tokens_per_second": 6850.67
},
{
"epoch": 1.2956740865182697,
"grad_norm": 0.8923588395118713,
"learning_rate": 3.0323059346730666e-05,
"loss": 0.5249,
"num_input_tokens_seen": 30281784,
"step": 2700,
"train_runtime": 4420.3662,
"train_tokens_per_second": 6850.515
},
{
"epoch": 1.2980740385192295,
"grad_norm": 0.9175417423248291,
"learning_rate": 3.026167118593396e-05,
"loss": 0.5334,
"num_input_tokens_seen": 30336824,
"step": 2705,
"train_runtime": 4428.4152,
"train_tokens_per_second": 6850.492
},
{
"epoch": 1.3004739905201896,
"grad_norm": 0.5945408344268799,
"learning_rate": 3.0200249810822922e-05,
"loss": 0.4795,
"num_input_tokens_seen": 30391968,
"step": 2710,
"train_runtime": 4436.7566,
"train_tokens_per_second": 6850.042
},
{
"epoch": 1.3028739425211495,
"grad_norm": 0.6741787195205688,
"learning_rate": 3.0138795609120156e-05,
"loss": 0.5054,
"num_input_tokens_seen": 30448056,
"step": 2715,
"train_runtime": 4445.0926,
"train_tokens_per_second": 6849.814
},
{
"epoch": 1.3052738945221096,
"grad_norm": 0.7565773129463196,
"learning_rate": 3.0077308968755484e-05,
"loss": 0.4871,
"num_input_tokens_seen": 30509528,
"step": 2720,
"train_runtime": 4454.1899,
"train_tokens_per_second": 6849.624
},
{
"epoch": 1.3076738465230695,
"grad_norm": 0.7174657583236694,
"learning_rate": 3.0015790277863504e-05,
"loss": 0.5235,
"num_input_tokens_seen": 30564064,
"step": 2725,
"train_runtime": 4462.4576,
"train_tokens_per_second": 6849.155
},
{
"epoch": 1.3100737985240296,
"grad_norm": 0.808497965335846,
"learning_rate": 2.9954239924781114e-05,
"loss": 0.5481,
"num_input_tokens_seen": 30617256,
"step": 2730,
"train_runtime": 4469.9742,
"train_tokens_per_second": 6849.538
},
{
"epoch": 1.3124737505249895,
"grad_norm": 0.7192595601081848,
"learning_rate": 2.9892658298045105e-05,
"loss": 0.4882,
"num_input_tokens_seen": 30676776,
"step": 2735,
"train_runtime": 4478.1351,
"train_tokens_per_second": 6850.346
},
{
"epoch": 1.3148737025259494,
"grad_norm": 0.7198320627212524,
"learning_rate": 2.983104578638966e-05,
"loss": 0.5133,
"num_input_tokens_seen": 30729600,
"step": 2740,
"train_runtime": 4486.2754,
"train_tokens_per_second": 6849.691
},
{
"epoch": 1.3172736545269095,
"grad_norm": 0.6649105548858643,
"learning_rate": 2.976940277874395e-05,
"loss": 0.4772,
"num_input_tokens_seen": 30786720,
"step": 2745,
"train_runtime": 4494.0586,
"train_tokens_per_second": 6850.538
},
{
"epoch": 1.3196736065278696,
"grad_norm": 0.8715736269950867,
"learning_rate": 2.9707729664229623e-05,
"loss": 0.5323,
"num_input_tokens_seen": 30844488,
"step": 2750,
"train_runtime": 4502.1358,
"train_tokens_per_second": 6851.079
},
{
"epoch": 1.3220735585288295,
"grad_norm": 0.7848823666572571,
"learning_rate": 2.964602683215839e-05,
"loss": 0.5318,
"num_input_tokens_seen": 30901200,
"step": 2755,
"train_runtime": 4510.5455,
"train_tokens_per_second": 6850.879
},
{
"epoch": 1.3244735105297893,
"grad_norm": 0.5609360337257385,
"learning_rate": 2.958429467202956e-05,
"loss": 0.4453,
"num_input_tokens_seen": 30957496,
"step": 2760,
"train_runtime": 4519.3334,
"train_tokens_per_second": 6850.014
},
{
"epoch": 1.3268734625307494,
"grad_norm": 0.8397387266159058,
"learning_rate": 2.9522533573527568e-05,
"loss": 0.4547,
"num_input_tokens_seen": 31014440,
"step": 2765,
"train_runtime": 4527.735,
"train_tokens_per_second": 6849.88
},
{
"epoch": 1.3292734145317093,
"grad_norm": 0.883388340473175,
"learning_rate": 2.9460743926519524e-05,
"loss": 0.4866,
"num_input_tokens_seen": 31069232,
"step": 2770,
"train_runtime": 4535.7952,
"train_tokens_per_second": 6849.787
},
{
"epoch": 1.3316733665326694,
"grad_norm": 0.6454315185546875,
"learning_rate": 2.9398926121052757e-05,
"loss": 0.4363,
"num_input_tokens_seen": 31124192,
"step": 2775,
"train_runtime": 4543.7024,
"train_tokens_per_second": 6849.963
},
{
"epoch": 1.3340733185336293,
"grad_norm": 0.8647413849830627,
"learning_rate": 2.933708054735232e-05,
"loss": 0.5387,
"num_input_tokens_seen": 31181208,
"step": 2780,
"train_runtime": 4551.829,
"train_tokens_per_second": 6850.259
},
{
"epoch": 1.3364732705345892,
"grad_norm": 0.8238906860351562,
"learning_rate": 2.9275207595818587e-05,
"loss": 0.4733,
"num_input_tokens_seen": 31238792,
"step": 2785,
"train_runtime": 4560.1671,
"train_tokens_per_second": 6850.361
},
{
"epoch": 1.3388732225355493,
"grad_norm": 0.8096624612808228,
"learning_rate": 2.9213307657024747e-05,
"loss": 0.4498,
"num_input_tokens_seen": 31293408,
"step": 2790,
"train_runtime": 4568.3465,
"train_tokens_per_second": 6850.051
},
{
"epoch": 1.3412731745365094,
"grad_norm": 0.6373225450515747,
"learning_rate": 2.9151381121714326e-05,
"loss": 0.4626,
"num_input_tokens_seen": 31351360,
"step": 2795,
"train_runtime": 4576.4713,
"train_tokens_per_second": 6850.553
},
{
"epoch": 1.3436731265374693,
"grad_norm": 0.9298360347747803,
"learning_rate": 2.9089428380798765e-05,
"loss": 0.5147,
"num_input_tokens_seen": 31408064,
"step": 2800,
"train_runtime": 4584.763,
"train_tokens_per_second": 6850.532
},
{
"epoch": 1.3460730785384292,
"grad_norm": 0.7824495434761047,
"learning_rate": 2.9027449825354914e-05,
"loss": 0.5005,
"num_input_tokens_seen": 31465944,
"step": 2805,
"train_runtime": 4593.7143,
"train_tokens_per_second": 6849.783
},
{
"epoch": 1.3484730305393893,
"grad_norm": 0.8347817063331604,
"learning_rate": 2.8965445846622575e-05,
"loss": 0.5212,
"num_input_tokens_seen": 31519296,
"step": 2810,
"train_runtime": 4601.6577,
"train_tokens_per_second": 6849.552
},
{
"epoch": 1.3508729825403492,
"grad_norm": 0.7829338312149048,
"learning_rate": 2.8903416836002046e-05,
"loss": 0.4881,
"num_input_tokens_seen": 31575040,
"step": 2815,
"train_runtime": 4609.5566,
"train_tokens_per_second": 6849.908
},
{
"epoch": 1.3532729345413093,
"grad_norm": 0.7527592182159424,
"learning_rate": 2.8841363185051627e-05,
"loss": 0.5284,
"num_input_tokens_seen": 31627864,
"step": 2820,
"train_runtime": 4617.3734,
"train_tokens_per_second": 6849.752
},
{
"epoch": 1.3556728865422691,
"grad_norm": 0.5921339988708496,
"learning_rate": 2.877928528548518e-05,
"loss": 0.5337,
"num_input_tokens_seen": 31681448,
"step": 2825,
"train_runtime": 4625.135,
"train_tokens_per_second": 6849.843
},
{
"epoch": 1.358072838543229,
"grad_norm": 0.8095146417617798,
"learning_rate": 2.871718352916961e-05,
"loss": 0.4355,
"num_input_tokens_seen": 31734720,
"step": 2830,
"train_runtime": 4632.6583,
"train_tokens_per_second": 6850.218
},
{
"epoch": 1.3604727905441891,
"grad_norm": 0.863218367099762,
"learning_rate": 2.8655058308122435e-05,
"loss": 0.522,
"num_input_tokens_seen": 31786472,
"step": 2835,
"train_runtime": 4640.2065,
"train_tokens_per_second": 6850.228
},
{
"epoch": 1.3628727425451492,
"grad_norm": 0.6763318181037903,
"learning_rate": 2.8592910014509284e-05,
"loss": 0.4825,
"num_input_tokens_seen": 31842040,
"step": 2840,
"train_runtime": 4648.7432,
"train_tokens_per_second": 6849.602
},
{
"epoch": 1.3652726945461091,
"grad_norm": 0.9902337789535522,
"learning_rate": 2.853073904064144e-05,
"loss": 0.4791,
"num_input_tokens_seen": 31901936,
"step": 2845,
"train_runtime": 4657.7444,
"train_tokens_per_second": 6849.224
},
{
"epoch": 1.367672646547069,
"grad_norm": 0.607513427734375,
"learning_rate": 2.8468545778973365e-05,
"loss": 0.4962,
"num_input_tokens_seen": 31955760,
"step": 2850,
"train_runtime": 4665.9209,
"train_tokens_per_second": 6848.757
},
{
"epoch": 1.370072598548029,
"grad_norm": 0.7585775256156921,
"learning_rate": 2.8406330622100185e-05,
"loss": 0.5193,
"num_input_tokens_seen": 32012936,
"step": 2855,
"train_runtime": 4674.1143,
"train_tokens_per_second": 6848.984
},
{
"epoch": 1.372472550548989,
"grad_norm": 0.6520575284957886,
"learning_rate": 2.834409396275526e-05,
"loss": 0.4838,
"num_input_tokens_seen": 32075400,
"step": 2860,
"train_runtime": 4683.1148,
"train_tokens_per_second": 6849.159
},
{
"epoch": 1.374872502549949,
"grad_norm": 0.7430661916732788,
"learning_rate": 2.8281836193807677e-05,
"loss": 0.5193,
"num_input_tokens_seen": 32127560,
"step": 2865,
"train_runtime": 4690.6625,
"train_tokens_per_second": 6849.258
},
{
"epoch": 1.377272454550909,
"grad_norm": 0.6538442373275757,
"learning_rate": 2.821955770825978e-05,
"loss": 0.563,
"num_input_tokens_seen": 32182368,
"step": 2870,
"train_runtime": 4698.3261,
"train_tokens_per_second": 6849.752
},
{
"epoch": 1.3796724065518688,
"grad_norm": 0.6958315968513489,
"learning_rate": 2.81572588992447e-05,
"loss": 0.4983,
"num_input_tokens_seen": 32238704,
"step": 2875,
"train_runtime": 4706.8956,
"train_tokens_per_second": 6849.25
},
{
"epoch": 1.382072358552829,
"grad_norm": 0.5171172618865967,
"learning_rate": 2.809494016002382e-05,
"loss": 0.4887,
"num_input_tokens_seen": 32299312,
"step": 2880,
"train_runtime": 4717.4351,
"train_tokens_per_second": 6846.795
},
{
"epoch": 1.384472310553789,
"grad_norm": 0.7386242151260376,
"learning_rate": 2.8032601883984373e-05,
"loss": 0.4676,
"num_input_tokens_seen": 32353968,
"step": 2885,
"train_runtime": 4727.1468,
"train_tokens_per_second": 6844.291
},
{
"epoch": 1.386872262554749,
"grad_norm": 0.6488030552864075,
"learning_rate": 2.7970244464636907e-05,
"loss": 0.5187,
"num_input_tokens_seen": 32408248,
"step": 2890,
"train_runtime": 4737.0735,
"train_tokens_per_second": 6841.407
},
{
"epoch": 1.3892722145557088,
"grad_norm": 0.7091050744056702,
"learning_rate": 2.7907868295612805e-05,
"loss": 0.5009,
"num_input_tokens_seen": 32461008,
"step": 2895,
"train_runtime": 4746.6232,
"train_tokens_per_second": 6838.758
},
{
"epoch": 1.391672166556669,
"grad_norm": 0.735463559627533,
"learning_rate": 2.7845473770661816e-05,
"loss": 0.4448,
"num_input_tokens_seen": 32519744,
"step": 2900,
"train_runtime": 4756.731,
"train_tokens_per_second": 6836.574
},
{
"epoch": 1.3940721185576288,
"grad_norm": 0.8551938533782959,
"learning_rate": 2.7783061283649547e-05,
"loss": 0.4562,
"num_input_tokens_seen": 32575104,
"step": 2905,
"train_runtime": 4767.5045,
"train_tokens_per_second": 6832.737
},
{
"epoch": 1.396472070558589,
"grad_norm": 0.8265554904937744,
"learning_rate": 2.7720631228555003e-05,
"loss": 0.4771,
"num_input_tokens_seen": 32633880,
"step": 2910,
"train_runtime": 4778.118,
"train_tokens_per_second": 6829.861
},
{
"epoch": 1.3988720225595488,
"grad_norm": 0.7008459568023682,
"learning_rate": 2.7658183999468096e-05,
"loss": 0.5213,
"num_input_tokens_seen": 32687728,
"step": 2915,
"train_runtime": 4787.6745,
"train_tokens_per_second": 6827.475
},
{
"epoch": 1.4012719745605087,
"grad_norm": 0.714462399482727,
"learning_rate": 2.759571999058712e-05,
"loss": 0.4879,
"num_input_tokens_seen": 32744776,
"step": 2920,
"train_runtime": 4798.5825,
"train_tokens_per_second": 6823.843
},
{
"epoch": 1.4036719265614688,
"grad_norm": 0.7445899248123169,
"learning_rate": 2.7533239596216326e-05,
"loss": 0.4801,
"num_input_tokens_seen": 32802640,
"step": 2925,
"train_runtime": 4809.0391,
"train_tokens_per_second": 6821.038
},
{
"epoch": 1.4060718785624289,
"grad_norm": 0.7316624522209167,
"learning_rate": 2.747074321076336e-05,
"loss": 0.4811,
"num_input_tokens_seen": 32858848,
"step": 2930,
"train_runtime": 4819.753,
"train_tokens_per_second": 6817.538
},
{
"epoch": 1.4084718305633888,
"grad_norm": 0.8229737877845764,
"learning_rate": 2.7408231228736854e-05,
"loss": 0.4749,
"num_input_tokens_seen": 32915328,
"step": 2935,
"train_runtime": 4829.6875,
"train_tokens_per_second": 6815.209
},
{
"epoch": 1.4108717825643486,
"grad_norm": 0.6625364422798157,
"learning_rate": 2.7345704044743857e-05,
"loss": 0.5214,
"num_input_tokens_seen": 32970256,
"step": 2940,
"train_runtime": 4839.5418,
"train_tokens_per_second": 6812.681
},
{
"epoch": 1.4132717345653087,
"grad_norm": 0.7320582270622253,
"learning_rate": 2.7283162053487406e-05,
"loss": 0.5137,
"num_input_tokens_seen": 33024728,
"step": 2945,
"train_runtime": 4849.3505,
"train_tokens_per_second": 6810.134
},
{
"epoch": 1.4156716865662686,
"grad_norm": 0.8458564281463623,
"learning_rate": 2.7220605649763997e-05,
"loss": 0.4864,
"num_input_tokens_seen": 33083776,
"step": 2950,
"train_runtime": 4859.7251,
"train_tokens_per_second": 6807.746
},
{
"epoch": 1.4180716385672287,
"grad_norm": 0.6681801676750183,
"learning_rate": 2.71580352284611e-05,
"loss": 0.4656,
"num_input_tokens_seen": 33141792,
"step": 2955,
"train_runtime": 4870.089,
"train_tokens_per_second": 6805.172
},
{
"epoch": 1.4204715905681886,
"grad_norm": 0.5828260779380798,
"learning_rate": 2.7095451184554684e-05,
"loss": 0.4626,
"num_input_tokens_seen": 33200320,
"step": 2960,
"train_runtime": 4879.7888,
"train_tokens_per_second": 6803.639
},
{
"epoch": 1.4228715425691485,
"grad_norm": 0.6321309208869934,
"learning_rate": 2.7032853913106702e-05,
"loss": 0.5166,
"num_input_tokens_seen": 33258192,
"step": 2965,
"train_runtime": 4889.401,
"train_tokens_per_second": 6802.1
},
{
"epoch": 1.4252714945701086,
"grad_norm": 0.5766092538833618,
"learning_rate": 2.697024380926261e-05,
"loss": 0.4709,
"num_input_tokens_seen": 33315416,
"step": 2970,
"train_runtime": 4899.761,
"train_tokens_per_second": 6799.396
},
{
"epoch": 1.4276714465710687,
"grad_norm": 0.5863097906112671,
"learning_rate": 2.6907621268248867e-05,
"loss": 0.4682,
"num_input_tokens_seen": 33374248,
"step": 2975,
"train_runtime": 4910.9171,
"train_tokens_per_second": 6795.93
},
{
"epoch": 1.4300713985720286,
"grad_norm": 0.6625893115997314,
"learning_rate": 2.6844986685370438e-05,
"loss": 0.4795,
"num_input_tokens_seen": 33430576,
"step": 2980,
"train_runtime": 4920.8367,
"train_tokens_per_second": 6793.677
},
{
"epoch": 1.4324713505729885,
"grad_norm": 0.889992356300354,
"learning_rate": 2.6782340456008304e-05,
"loss": 0.5081,
"num_input_tokens_seen": 33481872,
"step": 2985,
"train_runtime": 4930.6268,
"train_tokens_per_second": 6790.591
},
{
"epoch": 1.4348713025739486,
"grad_norm": 0.8572867512702942,
"learning_rate": 2.6719682975616972e-05,
"loss": 0.5238,
"num_input_tokens_seen": 33535608,
"step": 2990,
"train_runtime": 4940.3628,
"train_tokens_per_second": 6788.086
},
{
"epoch": 1.4372712545749085,
"grad_norm": 0.7185449600219727,
"learning_rate": 2.6657014639721963e-05,
"loss": 0.4628,
"num_input_tokens_seen": 33595176,
"step": 2995,
"train_runtime": 4950.583,
"train_tokens_per_second": 6786.105
},
{
"epoch": 1.4396712065758686,
"grad_norm": 0.6952937245368958,
"learning_rate": 2.659433584391733e-05,
"loss": 0.4726,
"num_input_tokens_seen": 33655192,
"step": 3000,
"train_runtime": 4960.7955,
"train_tokens_per_second": 6784.233
},
{
"epoch": 1.4420711585768284,
"grad_norm": 0.5073747634887695,
"learning_rate": 2.6531646983863135e-05,
"loss": 0.5086,
"num_input_tokens_seen": 33710344,
"step": 3005,
"train_runtime": 4971.2496,
"train_tokens_per_second": 6781.06
},
{
"epoch": 1.4444711105777883,
"grad_norm": 0.5523395538330078,
"learning_rate": 2.6468948455283006e-05,
"loss": 0.4855,
"num_input_tokens_seen": 33762880,
"step": 3010,
"train_runtime": 4981.002,
"train_tokens_per_second": 6778.331
},
{
"epoch": 1.4468710625787484,
"grad_norm": 0.7493255138397217,
"learning_rate": 2.6406240653961562e-05,
"loss": 0.5121,
"num_input_tokens_seen": 33814912,
"step": 3015,
"train_runtime": 4990.9252,
"train_tokens_per_second": 6775.279
},
{
"epoch": 1.4492710145797085,
"grad_norm": 0.7933918833732605,
"learning_rate": 2.6343523975741995e-05,
"loss": 0.4822,
"num_input_tokens_seen": 33869336,
"step": 3020,
"train_runtime": 5000.7837,
"train_tokens_per_second": 6772.806
},
{
"epoch": 1.4516709665806684,
"grad_norm": 0.827980101108551,
"learning_rate": 2.628079881652351e-05,
"loss": 0.5094,
"num_input_tokens_seen": 33921376,
"step": 3025,
"train_runtime": 5010.3271,
"train_tokens_per_second": 6770.292
},
{
"epoch": 1.4540709185816283,
"grad_norm": 0.7234380841255188,
"learning_rate": 2.6218065572258847e-05,
"loss": 0.4494,
"num_input_tokens_seen": 33979216,
"step": 3030,
"train_runtime": 5021.1603,
"train_tokens_per_second": 6767.204
},
{
"epoch": 1.4564708705825884,
"grad_norm": 0.6564066410064697,
"learning_rate": 2.6155324638951795e-05,
"loss": 0.5281,
"num_input_tokens_seen": 34036320,
"step": 3035,
"train_runtime": 5032.1108,
"train_tokens_per_second": 6763.826
},
{
"epoch": 1.4588708225835483,
"grad_norm": 0.9267168045043945,
"learning_rate": 2.6092576412654668e-05,
"loss": 0.5001,
"num_input_tokens_seen": 34090128,
"step": 3040,
"train_runtime": 5042.1218,
"train_tokens_per_second": 6761.068
},
{
"epoch": 1.4612707745845084,
"grad_norm": 0.6622974276542664,
"learning_rate": 2.602982128946583e-05,
"loss": 0.4876,
"num_input_tokens_seen": 34148400,
"step": 3045,
"train_runtime": 5052.2931,
"train_tokens_per_second": 6758.99
},
{
"epoch": 1.4636707265854683,
"grad_norm": 0.6938877105712891,
"learning_rate": 2.596705966552718e-05,
"loss": 0.4316,
"num_input_tokens_seen": 34205656,
"step": 3050,
"train_runtime": 5063.4654,
"train_tokens_per_second": 6755.385
},
{
"epoch": 1.4660706785864281,
"grad_norm": 1.1527178287506104,
"learning_rate": 2.5904291937021623e-05,
"loss": 0.5168,
"num_input_tokens_seen": 34256136,
"step": 3055,
"train_runtime": 5073.3962,
"train_tokens_per_second": 6752.111
},
{
"epoch": 1.4684706305873882,
"grad_norm": 0.8553231358528137,
"learning_rate": 2.5841518500170647e-05,
"loss": 0.4756,
"num_input_tokens_seen": 34311976,
"step": 3060,
"train_runtime": 5083.9773,
"train_tokens_per_second": 6749.042
},
{
"epoch": 1.4708705825883484,
"grad_norm": 0.6087079644203186,
"learning_rate": 2.5778739751231747e-05,
"loss": 0.4665,
"num_input_tokens_seen": 34370640,
"step": 3065,
"train_runtime": 5094.5141,
"train_tokens_per_second": 6746.598
},
{
"epoch": 1.4732705345893082,
"grad_norm": 0.7348918318748474,
"learning_rate": 2.5715956086495947e-05,
"loss": 0.4652,
"num_input_tokens_seen": 34421432,
"step": 3070,
"train_runtime": 5103.6348,
"train_tokens_per_second": 6744.494
},
{
"epoch": 1.4756704865902681,
"grad_norm": 1.1253235340118408,
"learning_rate": 2.565316790228532e-05,
"loss": 0.4909,
"num_input_tokens_seen": 34478304,
"step": 3075,
"train_runtime": 5113.6496,
"train_tokens_per_second": 6742.406
},
{
"epoch": 1.4780704385912282,
"grad_norm": 0.7545915842056274,
"learning_rate": 2.5590375594950443e-05,
"loss": 0.4865,
"num_input_tokens_seen": 34532640,
"step": 3080,
"train_runtime": 5123.1565,
"train_tokens_per_second": 6740.501
},
{
"epoch": 1.480470390592188,
"grad_norm": 0.8254991769790649,
"learning_rate": 2.5527579560867947e-05,
"loss": 0.503,
"num_input_tokens_seen": 34597280,
"step": 3085,
"train_runtime": 5135.0435,
"train_tokens_per_second": 6737.485
},
{
"epoch": 1.4828703425931482,
"grad_norm": 0.7427690625190735,
"learning_rate": 2.546478019643797e-05,
"loss": 0.4799,
"num_input_tokens_seen": 34654488,
"step": 3090,
"train_runtime": 5145.9423,
"train_tokens_per_second": 6734.333
},
{
"epoch": 1.485270294594108,
"grad_norm": 0.6483776569366455,
"learning_rate": 2.540197789808168e-05,
"loss": 0.4463,
"num_input_tokens_seen": 34716120,
"step": 3095,
"train_runtime": 5158.485,
"train_tokens_per_second": 6729.906
},
{
"epoch": 1.487670246595068,
"grad_norm": 0.5190485715866089,
"learning_rate": 2.5339173062238774e-05,
"loss": 0.4597,
"num_input_tokens_seen": 34777640,
"step": 3100,
"train_runtime": 5171.0585,
"train_tokens_per_second": 6725.439
},
{
"epoch": 1.490070198596028,
"grad_norm": 0.5749461054801941,
"learning_rate": 2.5276366085364937e-05,
"loss": 0.5084,
"num_input_tokens_seen": 34831992,
"step": 3105,
"train_runtime": 5181.3994,
"train_tokens_per_second": 6722.507
},
{
"epoch": 1.4924701505969882,
"grad_norm": 0.7715994119644165,
"learning_rate": 2.52135573639294e-05,
"loss": 0.4786,
"num_input_tokens_seen": 34894736,
"step": 3110,
"train_runtime": 5191.5337,
"train_tokens_per_second": 6721.47
},
{
"epoch": 1.494870102597948,
"grad_norm": 0.9101441502571106,
"learning_rate": 2.5150747294412398e-05,
"loss": 0.5175,
"num_input_tokens_seen": 34951296,
"step": 3115,
"train_runtime": 5201.6456,
"train_tokens_per_second": 6719.277
},
{
"epoch": 1.497270054598908,
"grad_norm": 0.7418543696403503,
"learning_rate": 2.508793627330267e-05,
"loss": 0.451,
"num_input_tokens_seen": 35006168,
"step": 3120,
"train_runtime": 5211.4651,
"train_tokens_per_second": 6717.145
},
{
"epoch": 1.499670006599868,
"grad_norm": 0.7147541642189026,
"learning_rate": 2.502512469709497e-05,
"loss": 0.5077,
"num_input_tokens_seen": 35059176,
"step": 3125,
"train_runtime": 5221.3263,
"train_tokens_per_second": 6714.611
},
{
"epoch": 1.5020699586008281,
"grad_norm": 0.5535465478897095,
"learning_rate": 2.4962312962287544e-05,
"loss": 0.4924,
"num_input_tokens_seen": 35114264,
"step": 3130,
"train_runtime": 5230.6201,
"train_tokens_per_second": 6713.212
},
{
"epoch": 1.504469910601788,
"grad_norm": 0.7213118672370911,
"learning_rate": 2.4899501465379644e-05,
"loss": 0.5004,
"num_input_tokens_seen": 35168424,
"step": 3135,
"train_runtime": 5241.0072,
"train_tokens_per_second": 6710.242
},
{
"epoch": 1.506869862602748,
"grad_norm": 0.7794874310493469,
"learning_rate": 2.4836690602869044e-05,
"loss": 0.5145,
"num_input_tokens_seen": 35224296,
"step": 3140,
"train_runtime": 5250.7072,
"train_tokens_per_second": 6708.486
},
{
"epoch": 1.5092698146037078,
"grad_norm": 0.9129291772842407,
"learning_rate": 2.4773880771249477e-05,
"loss": 0.4889,
"num_input_tokens_seen": 35280088,
"step": 3145,
"train_runtime": 5261.3252,
"train_tokens_per_second": 6705.552
},
{
"epoch": 1.511669766604668,
"grad_norm": 0.7600094079971313,
"learning_rate": 2.4711072367008176e-05,
"loss": 0.4967,
"num_input_tokens_seen": 35340720,
"step": 3150,
"train_runtime": 5271.563,
"train_tokens_per_second": 6704.031
},
{
"epoch": 1.514069718605628,
"grad_norm": 0.5989595055580139,
"learning_rate": 2.4648265786623388e-05,
"loss": 0.4843,
"num_input_tokens_seen": 35397240,
"step": 3155,
"train_runtime": 5282.0778,
"train_tokens_per_second": 6701.386
},
{
"epoch": 1.5164696706065879,
"grad_norm": 0.6885458827018738,
"learning_rate": 2.4585461426561818e-05,
"loss": 0.5011,
"num_input_tokens_seen": 35460504,
"step": 3160,
"train_runtime": 5293.3254,
"train_tokens_per_second": 6699.098
},
{
"epoch": 1.5188696226075478,
"grad_norm": 0.5150988698005676,
"learning_rate": 2.452265968327618e-05,
"loss": 0.512,
"num_input_tokens_seen": 35517032,
"step": 3165,
"train_runtime": 5303.2586,
"train_tokens_per_second": 6697.209
},
{
"epoch": 1.5212695746085079,
"grad_norm": 0.7029662132263184,
"learning_rate": 2.4459860953202635e-05,
"loss": 0.4807,
"num_input_tokens_seen": 35567328,
"step": 3170,
"train_runtime": 5312.0452,
"train_tokens_per_second": 6695.6
},
{
"epoch": 1.523669526609468,
"grad_norm": 0.6837257742881775,
"learning_rate": 2.4397065632758374e-05,
"loss": 0.4578,
"num_input_tokens_seen": 35622032,
"step": 3175,
"train_runtime": 5321.4999,
"train_tokens_per_second": 6693.983
},
{
"epoch": 1.5260694786104279,
"grad_norm": 0.7105430364608765,
"learning_rate": 2.4334274118339014e-05,
"loss": 0.512,
"num_input_tokens_seen": 35684184,
"step": 3180,
"train_runtime": 5331.4522,
"train_tokens_per_second": 6693.145
},
{
"epoch": 1.5284694306113877,
"grad_norm": 0.788021445274353,
"learning_rate": 2.4271486806316173e-05,
"loss": 0.5011,
"num_input_tokens_seen": 35741544,
"step": 3185,
"train_runtime": 5341.311,
"train_tokens_per_second": 6691.53
},
{
"epoch": 1.5308693826123476,
"grad_norm": 0.8190677165985107,
"learning_rate": 2.420870409303495e-05,
"loss": 0.4627,
"num_input_tokens_seen": 35797096,
"step": 3190,
"train_runtime": 5350.6319,
"train_tokens_per_second": 6690.256
},
{
"epoch": 1.5332693346133077,
"grad_norm": 0.9217768907546997,
"learning_rate": 2.4145926374811395e-05,
"loss": 0.4672,
"num_input_tokens_seen": 35849520,
"step": 3195,
"train_runtime": 5360.1483,
"train_tokens_per_second": 6688.158
},
{
"epoch": 1.5356692866142678,
"grad_norm": 0.729516327381134,
"learning_rate": 2.4083154047930014e-05,
"loss": 0.4645,
"num_input_tokens_seen": 35908672,
"step": 3200,
"train_runtime": 5371.5509,
"train_tokens_per_second": 6684.973
},
{
"epoch": 1.5380692386152277,
"grad_norm": 0.7882852554321289,
"learning_rate": 2.4020387508641322e-05,
"loss": 0.4833,
"num_input_tokens_seen": 35963328,
"step": 3205,
"train_runtime": 5382.0522,
"train_tokens_per_second": 6682.085
},
{
"epoch": 1.5404691906161876,
"grad_norm": 0.6502909660339355,
"learning_rate": 2.3957627153159277e-05,
"loss": 0.4763,
"num_input_tokens_seen": 36021192,
"step": 3210,
"train_runtime": 5392.0941,
"train_tokens_per_second": 6680.372
},
{
"epoch": 1.5428691426171477,
"grad_norm": 0.8590161204338074,
"learning_rate": 2.3894873377658788e-05,
"loss": 0.4768,
"num_input_tokens_seen": 36078448,
"step": 3215,
"train_runtime": 5402.6273,
"train_tokens_per_second": 6677.945
},
{
"epoch": 1.5452690946181078,
"grad_norm": 1.034970760345459,
"learning_rate": 2.383212657827324e-05,
"loss": 0.502,
"num_input_tokens_seen": 36132656,
"step": 3220,
"train_runtime": 5412.5546,
"train_tokens_per_second": 6675.712
},
{
"epoch": 1.5476690466190677,
"grad_norm": 0.5326734185218811,
"learning_rate": 2.3769387151092e-05,
"loss": 0.4883,
"num_input_tokens_seen": 36191712,
"step": 3225,
"train_runtime": 5422.7637,
"train_tokens_per_second": 6674.034
},
{
"epoch": 1.5500689986200276,
"grad_norm": 0.9736510515213013,
"learning_rate": 2.370665549215787e-05,
"loss": 0.5341,
"num_input_tokens_seen": 36245160,
"step": 3230,
"train_runtime": 5432.9922,
"train_tokens_per_second": 6671.307
},
{
"epoch": 1.5524689506209874,
"grad_norm": 0.6917448043823242,
"learning_rate": 2.3643931997464617e-05,
"loss": 0.4849,
"num_input_tokens_seen": 36303576,
"step": 3235,
"train_runtime": 5443.3631,
"train_tokens_per_second": 6669.328
},
{
"epoch": 1.5548689026219475,
"grad_norm": 0.9082401394844055,
"learning_rate": 2.35812170629545e-05,
"loss": 0.4583,
"num_input_tokens_seen": 36360840,
"step": 3240,
"train_runtime": 5453.245,
"train_tokens_per_second": 6667.744
},
{
"epoch": 1.5572688546229077,
"grad_norm": 0.6470857262611389,
"learning_rate": 2.351851108451571e-05,
"loss": 0.4604,
"num_input_tokens_seen": 36422200,
"step": 3245,
"train_runtime": 5463.4424,
"train_tokens_per_second": 6666.529
},
{
"epoch": 1.5596688066238675,
"grad_norm": 0.8061736822128296,
"learning_rate": 2.34558144579799e-05,
"loss": 0.5048,
"num_input_tokens_seen": 36476632,
"step": 3250,
"train_runtime": 5473.1542,
"train_tokens_per_second": 6664.645
},
{
"epoch": 1.5620687586248274,
"grad_norm": 0.7560340762138367,
"learning_rate": 2.339312757911973e-05,
"loss": 0.5113,
"num_input_tokens_seen": 36529792,
"step": 3255,
"train_runtime": 5482.3009,
"train_tokens_per_second": 6663.223
},
{
"epoch": 1.5644687106257875,
"grad_norm": 0.7179074883460999,
"learning_rate": 2.3330450843646296e-05,
"loss": 0.5005,
"num_input_tokens_seen": 36586016,
"step": 3260,
"train_runtime": 5492.5745,
"train_tokens_per_second": 6660.996
},
{
"epoch": 1.5668686626267476,
"grad_norm": 0.5973109602928162,
"learning_rate": 2.3267784647206658e-05,
"loss": 0.4804,
"num_input_tokens_seen": 36641112,
"step": 3265,
"train_runtime": 5502.2894,
"train_tokens_per_second": 6659.248
},
{
"epoch": 1.5692686146277075,
"grad_norm": 0.9687879681587219,
"learning_rate": 2.3205129385381355e-05,
"loss": 0.4928,
"num_input_tokens_seen": 36697088,
"step": 3270,
"train_runtime": 5512.5707,
"train_tokens_per_second": 6656.983
},
{
"epoch": 1.5716685666286674,
"grad_norm": 0.6984615325927734,
"learning_rate": 2.3142485453681925e-05,
"loss": 0.4872,
"num_input_tokens_seen": 36755920,
"step": 3275,
"train_runtime": 5523.731,
"train_tokens_per_second": 6654.184
},
{
"epoch": 1.5740685186296273,
"grad_norm": 0.7793405652046204,
"learning_rate": 2.307985324754835e-05,
"loss": 0.5391,
"num_input_tokens_seen": 36811304,
"step": 3280,
"train_runtime": 5534.0048,
"train_tokens_per_second": 6651.838
},
{
"epoch": 1.5764684706305874,
"grad_norm": 0.7121679782867432,
"learning_rate": 2.3017233162346608e-05,
"loss": 0.4955,
"num_input_tokens_seen": 36868680,
"step": 3285,
"train_runtime": 5543.499,
"train_tokens_per_second": 6650.796
},
{
"epoch": 1.5788684226315475,
"grad_norm": 0.9568763375282288,
"learning_rate": 2.295462559336618e-05,
"loss": 0.4775,
"num_input_tokens_seen": 36925400,
"step": 3290,
"train_runtime": 5553.1982,
"train_tokens_per_second": 6649.394
},
{
"epoch": 1.5812683746325074,
"grad_norm": 0.5952507257461548,
"learning_rate": 2.2892030935817517e-05,
"loss": 0.457,
"num_input_tokens_seen": 36984032,
"step": 3295,
"train_runtime": 5563.7199,
"train_tokens_per_second": 6647.357
},
{
"epoch": 1.5836683266334672,
"grad_norm": 0.8516509532928467,
"learning_rate": 2.2829449584829558e-05,
"loss": 0.5231,
"num_input_tokens_seen": 37038928,
"step": 3300,
"train_runtime": 5573.6606,
"train_tokens_per_second": 6645.35
},
{
"epoch": 1.5860682786344273,
"grad_norm": 0.569814920425415,
"learning_rate": 2.2766881935447275e-05,
"loss": 0.5044,
"num_input_tokens_seen": 37092208,
"step": 3305,
"train_runtime": 5583.51,
"train_tokens_per_second": 6643.17
},
{
"epoch": 1.5884682306353874,
"grad_norm": 0.8386396169662476,
"learning_rate": 2.2704328382629138e-05,
"loss": 0.4753,
"num_input_tokens_seen": 37147680,
"step": 3310,
"train_runtime": 5592.6848,
"train_tokens_per_second": 6642.191
},
{
"epoch": 1.5908681826363473,
"grad_norm": 0.7655364871025085,
"learning_rate": 2.264178932124462e-05,
"loss": 0.4796,
"num_input_tokens_seen": 37203656,
"step": 3315,
"train_runtime": 5601.9649,
"train_tokens_per_second": 6641.18
},
{
"epoch": 1.5932681346373072,
"grad_norm": 0.8739466071128845,
"learning_rate": 2.257926514607171e-05,
"loss": 0.4852,
"num_input_tokens_seen": 37263520,
"step": 3320,
"train_runtime": 5612.1576,
"train_tokens_per_second": 6639.785
},
{
"epoch": 1.595668086638267,
"grad_norm": 0.6632476449012756,
"learning_rate": 2.2516756251794463e-05,
"loss": 0.5121,
"num_input_tokens_seen": 37318192,
"step": 3325,
"train_runtime": 5621.7888,
"train_tokens_per_second": 6638.135
},
{
"epoch": 1.5980680386392272,
"grad_norm": 0.7768703699111938,
"learning_rate": 2.245426303300044e-05,
"loss": 0.5128,
"num_input_tokens_seen": 37374224,
"step": 3330,
"train_runtime": 5631.8308,
"train_tokens_per_second": 6636.248
},
{
"epoch": 1.6004679906401873,
"grad_norm": 0.7217375636100769,
"learning_rate": 2.2391785884178256e-05,
"loss": 0.4835,
"num_input_tokens_seen": 37435240,
"step": 3335,
"train_runtime": 5642.3272,
"train_tokens_per_second": 6634.716
},
{
"epoch": 1.6028679426411472,
"grad_norm": 0.5615156888961792,
"learning_rate": 2.2329325199715114e-05,
"loss": 0.4575,
"num_input_tokens_seen": 37492120,
"step": 3340,
"train_runtime": 5652.3686,
"train_tokens_per_second": 6632.993
},
{
"epoch": 1.605267894642107,
"grad_norm": 0.826392650604248,
"learning_rate": 2.226688137389425e-05,
"loss": 0.4922,
"num_input_tokens_seen": 37548408,
"step": 3345,
"train_runtime": 5662.4517,
"train_tokens_per_second": 6631.122
},
{
"epoch": 1.6076678466430672,
"grad_norm": 0.589180052280426,
"learning_rate": 2.220445480089248e-05,
"loss": 0.4807,
"num_input_tokens_seen": 37610280,
"step": 3350,
"train_runtime": 5674.3947,
"train_tokens_per_second": 6628.069
},
{
"epoch": 1.6100677986440273,
"grad_norm": 0.8704653978347778,
"learning_rate": 2.214204587477774e-05,
"loss": 0.5322,
"num_input_tokens_seen": 37668512,
"step": 3355,
"train_runtime": 5684.5435,
"train_tokens_per_second": 6626.48
},
{
"epoch": 1.6124677506449872,
"grad_norm": 0.7563439607620239,
"learning_rate": 2.207965498950655e-05,
"loss": 0.4843,
"num_input_tokens_seen": 37727112,
"step": 3360,
"train_runtime": 5694.2908,
"train_tokens_per_second": 6625.428
},
{
"epoch": 1.614867702645947,
"grad_norm": 0.7133488059043884,
"learning_rate": 2.2017282538921556e-05,
"loss": 0.4732,
"num_input_tokens_seen": 37780192,
"step": 3365,
"train_runtime": 5703.5817,
"train_tokens_per_second": 6623.942
},
{
"epoch": 1.617267654646907,
"grad_norm": 0.8156766295433044,
"learning_rate": 2.1954928916749006e-05,
"loss": 0.5115,
"num_input_tokens_seen": 37839376,
"step": 3370,
"train_runtime": 5713.8648,
"train_tokens_per_second": 6622.379
},
{
"epoch": 1.619667606647867,
"grad_norm": 0.7063591480255127,
"learning_rate": 2.1892594516596343e-05,
"loss": 0.5177,
"num_input_tokens_seen": 37894296,
"step": 3375,
"train_runtime": 5723.4986,
"train_tokens_per_second": 6620.827
},
{
"epoch": 1.6220675586488271,
"grad_norm": 0.8170085549354553,
"learning_rate": 2.183027973194964e-05,
"loss": 0.4848,
"num_input_tokens_seen": 37951552,
"step": 3380,
"train_runtime": 5733.3985,
"train_tokens_per_second": 6619.382
},
{
"epoch": 1.624467510649787,
"grad_norm": 0.6729702353477478,
"learning_rate": 2.176798495617114e-05,
"loss": 0.4927,
"num_input_tokens_seen": 38011968,
"step": 3385,
"train_runtime": 5743.0143,
"train_tokens_per_second": 6618.818
},
{
"epoch": 1.6268674626507469,
"grad_norm": 0.7593095898628235,
"learning_rate": 2.1705710582496815e-05,
"loss": 0.4888,
"num_input_tokens_seen": 38067280,
"step": 3390,
"train_runtime": 5752.7516,
"train_tokens_per_second": 6617.23
},
{
"epoch": 1.629267414651707,
"grad_norm": 1.1748439073562622,
"learning_rate": 2.1643457004033807e-05,
"loss": 0.5178,
"num_input_tokens_seen": 38124912,
"step": 3395,
"train_runtime": 5763.3474,
"train_tokens_per_second": 6615.064
},
{
"epoch": 1.631667366652667,
"grad_norm": 0.8947390913963318,
"learning_rate": 2.1581224613758005e-05,
"loss": 0.5112,
"num_input_tokens_seen": 38178808,
"step": 3400,
"train_runtime": 5772.7591,
"train_tokens_per_second": 6613.615
},
{
"epoch": 1.634067318653627,
"grad_norm": 0.702033519744873,
"learning_rate": 2.1519013804511562e-05,
"loss": 0.5106,
"num_input_tokens_seen": 38233976,
"step": 3405,
"train_runtime": 5782.5071,
"train_tokens_per_second": 6612.007
},
{
"epoch": 1.6364672706545869,
"grad_norm": 0.9868459105491638,
"learning_rate": 2.145682496900039e-05,
"loss": 0.501,
"num_input_tokens_seen": 38291736,
"step": 3410,
"train_runtime": 5792.3708,
"train_tokens_per_second": 6610.719
},
{
"epoch": 1.6388672226555467,
"grad_norm": 1.0660921335220337,
"learning_rate": 2.1394658499791684e-05,
"loss": 0.4836,
"num_input_tokens_seen": 38347056,
"step": 3415,
"train_runtime": 5800.6961,
"train_tokens_per_second": 6610.768
},
{
"epoch": 1.6412671746565068,
"grad_norm": 0.809270441532135,
"learning_rate": 2.1332514789311448e-05,
"loss": 0.5138,
"num_input_tokens_seen": 38399184,
"step": 3420,
"train_runtime": 5808.2869,
"train_tokens_per_second": 6611.103
},
{
"epoch": 1.643667126657467,
"grad_norm": 0.7200763821601868,
"learning_rate": 2.1270394229842044e-05,
"loss": 0.4522,
"num_input_tokens_seen": 38456896,
"step": 3425,
"train_runtime": 5816.1423,
"train_tokens_per_second": 6612.097
},
{
"epoch": 1.6460670786584268,
"grad_norm": 0.8460598587989807,
"learning_rate": 2.1208297213519686e-05,
"loss": 0.4847,
"num_input_tokens_seen": 38512168,
"step": 3430,
"train_runtime": 5823.8311,
"train_tokens_per_second": 6612.858
},
{
"epoch": 1.6484670306593867,
"grad_norm": 0.7235488891601562,
"learning_rate": 2.1146224132331944e-05,
"loss": 0.4733,
"num_input_tokens_seen": 38573240,
"step": 3435,
"train_runtime": 5832.0444,
"train_tokens_per_second": 6614.017
},
{
"epoch": 1.6508669826603468,
"grad_norm": 0.8452171087265015,
"learning_rate": 2.1084175378115344e-05,
"loss": 0.5236,
"num_input_tokens_seen": 38624080,
"step": 3440,
"train_runtime": 5839.2065,
"train_tokens_per_second": 6614.611
},
{
"epoch": 1.653266934661307,
"grad_norm": 0.7488996982574463,
"learning_rate": 2.1022151342552815e-05,
"loss": 0.5226,
"num_input_tokens_seen": 38679488,
"step": 3445,
"train_runtime": 5846.9076,
"train_tokens_per_second": 6615.375
},
{
"epoch": 1.6556668866622668,
"grad_norm": 0.7845451235771179,
"learning_rate": 2.0960152417171243e-05,
"loss": 0.4533,
"num_input_tokens_seen": 38736136,
"step": 3450,
"train_runtime": 5855.1703,
"train_tokens_per_second": 6615.715
},
{
"epoch": 1.6580668386632267,
"grad_norm": 0.9303568005561829,
"learning_rate": 2.089817899333904e-05,
"loss": 0.483,
"num_input_tokens_seen": 38788592,
"step": 3455,
"train_runtime": 5862.705,
"train_tokens_per_second": 6616.16
},
{
"epoch": 1.6604667906641866,
"grad_norm": 0.7032025456428528,
"learning_rate": 2.083623146226362e-05,
"loss": 0.4556,
"num_input_tokens_seen": 38846528,
"step": 3460,
"train_runtime": 5870.8119,
"train_tokens_per_second": 6616.892
},
{
"epoch": 1.6628667426651467,
"grad_norm": 1.0094935894012451,
"learning_rate": 2.0774310214988942e-05,
"loss": 0.545,
"num_input_tokens_seen": 38896768,
"step": 3465,
"train_runtime": 5879.1312,
"train_tokens_per_second": 6616.074
},
{
"epoch": 1.6652666946661068,
"grad_norm": 0.8336009979248047,
"learning_rate": 2.071241564239305e-05,
"loss": 0.4741,
"num_input_tokens_seen": 38952672,
"step": 3470,
"train_runtime": 5888.8317,
"train_tokens_per_second": 6614.669
},
{
"epoch": 1.6676666466670667,
"grad_norm": 0.6727505326271057,
"learning_rate": 2.0650548135185618e-05,
"loss": 0.4831,
"num_input_tokens_seen": 39007376,
"step": 3475,
"train_runtime": 5898.9169,
"train_tokens_per_second": 6612.634
},
{
"epoch": 1.6700665986680265,
"grad_norm": 0.7282326221466064,
"learning_rate": 2.0588708083905468e-05,
"loss": 0.5174,
"num_input_tokens_seen": 39064568,
"step": 3480,
"train_runtime": 5909.1279,
"train_tokens_per_second": 6610.886
},
{
"epoch": 1.6724665506689866,
"grad_norm": 0.6648644208908081,
"learning_rate": 2.0526895878918077e-05,
"loss": 0.5055,
"num_input_tokens_seen": 39117320,
"step": 3485,
"train_runtime": 5918.494,
"train_tokens_per_second": 6609.337
},
{
"epoch": 1.6748665026699467,
"grad_norm": 0.8427759408950806,
"learning_rate": 2.0465111910413192e-05,
"loss": 0.5316,
"num_input_tokens_seen": 39171840,
"step": 3490,
"train_runtime": 5927.2143,
"train_tokens_per_second": 6608.811
},
{
"epoch": 1.6772664546709066,
"grad_norm": 0.6149888634681702,
"learning_rate": 2.040335656840228e-05,
"loss": 0.4517,
"num_input_tokens_seen": 39226624,
"step": 3495,
"train_runtime": 5935.062,
"train_tokens_per_second": 6609.303
},
{
"epoch": 1.6796664066718665,
"grad_norm": 0.9388527870178223,
"learning_rate": 2.03416302427161e-05,
"loss": 0.5067,
"num_input_tokens_seen": 39284168,
"step": 3500,
"train_runtime": 5942.9844,
"train_tokens_per_second": 6610.175
},
{
"epoch": 1.6820663586728264,
"grad_norm": 0.8548518419265747,
"learning_rate": 2.027993332300227e-05,
"loss": 0.5064,
"num_input_tokens_seen": 39340120,
"step": 3505,
"train_runtime": 5951.1485,
"train_tokens_per_second": 6610.509
},
{
"epoch": 1.6844663106737865,
"grad_norm": 0.6581935882568359,
"learning_rate": 2.021826619872278e-05,
"loss": 0.4523,
"num_input_tokens_seen": 39399136,
"step": 3510,
"train_runtime": 5959.3451,
"train_tokens_per_second": 6611.32
},
{
"epoch": 1.6868662626747466,
"grad_norm": 0.6218190789222717,
"learning_rate": 2.0156629259151515e-05,
"loss": 0.4804,
"num_input_tokens_seen": 39456808,
"step": 3515,
"train_runtime": 5967.3525,
"train_tokens_per_second": 6612.113
},
{
"epoch": 1.6892662146757065,
"grad_norm": 0.8073654174804688,
"learning_rate": 2.0095022893371826e-05,
"loss": 0.4838,
"num_input_tokens_seen": 39516000,
"step": 3520,
"train_runtime": 5975.9682,
"train_tokens_per_second": 6612.485
},
{
"epoch": 1.6916661666766664,
"grad_norm": 0.7715812921524048,
"learning_rate": 2.0033447490274083e-05,
"loss": 0.4669,
"num_input_tokens_seen": 39569280,
"step": 3525,
"train_runtime": 5983.6596,
"train_tokens_per_second": 6612.89
},
{
"epoch": 1.6940661186776265,
"grad_norm": 0.8139777183532715,
"learning_rate": 1.99719034385532e-05,
"loss": 0.5031,
"num_input_tokens_seen": 39625464,
"step": 3530,
"train_runtime": 5991.822,
"train_tokens_per_second": 6613.258
},
{
"epoch": 1.6964660706785866,
"grad_norm": 0.7577908635139465,
"learning_rate": 1.9910391126706158e-05,
"loss": 0.4991,
"num_input_tokens_seen": 39676928,
"step": 3535,
"train_runtime": 5999.8126,
"train_tokens_per_second": 6613.028
},
{
"epoch": 1.6988660226795465,
"grad_norm": 0.5273564457893372,
"learning_rate": 1.9848910943029624e-05,
"loss": 0.4548,
"num_input_tokens_seen": 39734168,
"step": 3540,
"train_runtime": 6008.5552,
"train_tokens_per_second": 6612.932
},
{
"epoch": 1.7012659746805063,
"grad_norm": 0.8542927503585815,
"learning_rate": 1.978746327561741e-05,
"loss": 0.4886,
"num_input_tokens_seen": 39795520,
"step": 3545,
"train_runtime": 6017.0289,
"train_tokens_per_second": 6613.816
},
{
"epoch": 1.7036659266814662,
"grad_norm": 0.6213528513908386,
"learning_rate": 1.972604851235811e-05,
"loss": 0.4737,
"num_input_tokens_seen": 39851264,
"step": 3550,
"train_runtime": 6025.5762,
"train_tokens_per_second": 6613.685
},
{
"epoch": 1.7060658786824263,
"grad_norm": 0.7265267372131348,
"learning_rate": 1.9664667040932577e-05,
"loss": 0.5013,
"num_input_tokens_seen": 39904120,
"step": 3555,
"train_runtime": 6033.0567,
"train_tokens_per_second": 6614.246
},
{
"epoch": 1.7084658306833864,
"grad_norm": 0.8746877312660217,
"learning_rate": 1.9603319248811542e-05,
"loss": 0.4541,
"num_input_tokens_seen": 39957104,
"step": 3560,
"train_runtime": 6040.7403,
"train_tokens_per_second": 6614.604
},
{
"epoch": 1.7108657826843463,
"grad_norm": 0.690990686416626,
"learning_rate": 1.9542005523253103e-05,
"loss": 0.5057,
"num_input_tokens_seen": 40014640,
"step": 3565,
"train_runtime": 6048.7384,
"train_tokens_per_second": 6615.37
},
{
"epoch": 1.7132657346853062,
"grad_norm": 0.5996572375297546,
"learning_rate": 1.948072625130032e-05,
"loss": 0.5071,
"num_input_tokens_seen": 40071928,
"step": 3570,
"train_runtime": 6056.481,
"train_tokens_per_second": 6616.371
},
{
"epoch": 1.7156656866862663,
"grad_norm": 1.0447416305541992,
"learning_rate": 1.9419481819778785e-05,
"loss": 0.5099,
"num_input_tokens_seen": 40125856,
"step": 3575,
"train_runtime": 6063.7113,
"train_tokens_per_second": 6617.376
},
{
"epoch": 1.7180656386872264,
"grad_norm": 1.0107308626174927,
"learning_rate": 1.9358272615294153e-05,
"loss": 0.4823,
"num_input_tokens_seen": 40181760,
"step": 3580,
"train_runtime": 6071.812,
"train_tokens_per_second": 6617.754
},
{
"epoch": 1.7204655906881863,
"grad_norm": 0.7742976546287537,
"learning_rate": 1.9297099024229675e-05,
"loss": 0.5261,
"num_input_tokens_seen": 40236472,
"step": 3585,
"train_runtime": 6079.4422,
"train_tokens_per_second": 6618.448
},
{
"epoch": 1.7228655426891462,
"grad_norm": 0.7820068597793579,
"learning_rate": 1.923596143274385e-05,
"loss": 0.4674,
"num_input_tokens_seen": 40295104,
"step": 3590,
"train_runtime": 6087.6682,
"train_tokens_per_second": 6619.136
},
{
"epoch": 1.725265494690106,
"grad_norm": 0.6710221171379089,
"learning_rate": 1.9174860226767876e-05,
"loss": 0.5175,
"num_input_tokens_seen": 40345800,
"step": 3595,
"train_runtime": 6095.1949,
"train_tokens_per_second": 6619.28
},
{
"epoch": 1.7276654466910661,
"grad_norm": 0.7176735401153564,
"learning_rate": 1.91137957920033e-05,
"loss": 0.5171,
"num_input_tokens_seen": 40402256,
"step": 3600,
"train_runtime": 6103.2553,
"train_tokens_per_second": 6619.788
},
{
"epoch": 1.7300653986920262,
"grad_norm": 0.9111002087593079,
"learning_rate": 1.905276851391954e-05,
"loss": 0.4883,
"num_input_tokens_seen": 40458888,
"step": 3605,
"train_runtime": 6111.7843,
"train_tokens_per_second": 6619.816
},
{
"epoch": 1.7324653506929861,
"grad_norm": 0.7179924845695496,
"learning_rate": 1.899177877775146e-05,
"loss": 0.4852,
"num_input_tokens_seen": 40516112,
"step": 3610,
"train_runtime": 6120.0523,
"train_tokens_per_second": 6620.223
},
{
"epoch": 1.734865302693946,
"grad_norm": 0.7747234106063843,
"learning_rate": 1.8930826968496943e-05,
"loss": 0.5067,
"num_input_tokens_seen": 40572824,
"step": 3615,
"train_runtime": 6128.2202,
"train_tokens_per_second": 6620.654
},
{
"epoch": 1.7372652546949061,
"grad_norm": 0.7451600432395935,
"learning_rate": 1.8869913470914448e-05,
"loss": 0.4881,
"num_input_tokens_seen": 40631656,
"step": 3620,
"train_runtime": 6136.6832,
"train_tokens_per_second": 6621.11
},
{
"epoch": 1.7396652066958662,
"grad_norm": 0.9544029235839844,
"learning_rate": 1.880903866952062e-05,
"loss": 0.5206,
"num_input_tokens_seen": 40687064,
"step": 3625,
"train_runtime": 6144.9437,
"train_tokens_per_second": 6621.227
},
{
"epoch": 1.742065158696826,
"grad_norm": 0.7754983901977539,
"learning_rate": 1.8748202948587813e-05,
"loss": 0.4979,
"num_input_tokens_seen": 40743400,
"step": 3630,
"train_runtime": 6153.1589,
"train_tokens_per_second": 6621.542
},
{
"epoch": 1.744465110697786,
"grad_norm": 0.7278411388397217,
"learning_rate": 1.8687406692141673e-05,
"loss": 0.4632,
"num_input_tokens_seen": 40802376,
"step": 3635,
"train_runtime": 6161.8706,
"train_tokens_per_second": 6621.751
},
{
"epoch": 1.7468650626987459,
"grad_norm": 0.6943597793579102,
"learning_rate": 1.8626650283958762e-05,
"loss": 0.4851,
"num_input_tokens_seen": 40854616,
"step": 3640,
"train_runtime": 6169.6683,
"train_tokens_per_second": 6621.85
},
{
"epoch": 1.749265014699706,
"grad_norm": 0.8194776177406311,
"learning_rate": 1.8565934107564068e-05,
"loss": 0.4573,
"num_input_tokens_seen": 40911032,
"step": 3645,
"train_runtime": 6178.2227,
"train_tokens_per_second": 6621.812
},
{
"epoch": 1.751664966700666,
"grad_norm": 0.8596030473709106,
"learning_rate": 1.8505258546228623e-05,
"loss": 0.4862,
"num_input_tokens_seen": 40970312,
"step": 3650,
"train_runtime": 6186.5562,
"train_tokens_per_second": 6622.475
},
{
"epoch": 1.754064918701626,
"grad_norm": 0.6645076274871826,
"learning_rate": 1.8444623982967098e-05,
"loss": 0.4606,
"num_input_tokens_seen": 41028576,
"step": 3655,
"train_runtime": 6195.0286,
"train_tokens_per_second": 6622.823
},
{
"epoch": 1.7564648707025858,
"grad_norm": 0.668375551700592,
"learning_rate": 1.8384030800535332e-05,
"loss": 0.4504,
"num_input_tokens_seen": 41088352,
"step": 3660,
"train_runtime": 6203.7002,
"train_tokens_per_second": 6623.201
},
{
"epoch": 1.758864822703546,
"grad_norm": 0.6859973669052124,
"learning_rate": 1.832347938142796e-05,
"loss": 0.5408,
"num_input_tokens_seen": 41144096,
"step": 3665,
"train_runtime": 6211.4168,
"train_tokens_per_second": 6623.947
},
{
"epoch": 1.761264774704506,
"grad_norm": 0.8838623762130737,
"learning_rate": 1.8262970107875994e-05,
"loss": 0.4798,
"num_input_tokens_seen": 41199488,
"step": 3670,
"train_runtime": 6219.0044,
"train_tokens_per_second": 6624.772
},
{
"epoch": 1.763664726705466,
"grad_norm": 0.8268917202949524,
"learning_rate": 1.8202503361844393e-05,
"loss": 0.5226,
"num_input_tokens_seen": 41254392,
"step": 3675,
"train_runtime": 6226.8544,
"train_tokens_per_second": 6625.238
},
{
"epoch": 1.7660646787064258,
"grad_norm": 0.9109818339347839,
"learning_rate": 1.8142079525029672e-05,
"loss": 0.5196,
"num_input_tokens_seen": 41310952,
"step": 3680,
"train_runtime": 6234.9064,
"train_tokens_per_second": 6625.753
},
{
"epoch": 1.7684646307073857,
"grad_norm": 0.8743447661399841,
"learning_rate": 1.808169897885745e-05,
"loss": 0.4813,
"num_input_tokens_seen": 41363784,
"step": 3685,
"train_runtime": 6242.8579,
"train_tokens_per_second": 6625.777
},
{
"epoch": 1.7708645827083458,
"grad_norm": 0.8028547763824463,
"learning_rate": 1.802136210448012e-05,
"loss": 0.4864,
"num_input_tokens_seen": 41418736,
"step": 3690,
"train_runtime": 6250.665,
"train_tokens_per_second": 6626.293
},
{
"epoch": 1.773264534709306,
"grad_norm": 0.8359841108322144,
"learning_rate": 1.796106928277437e-05,
"loss": 0.451,
"num_input_tokens_seen": 41480096,
"step": 3695,
"train_runtime": 6259.4151,
"train_tokens_per_second": 6626.833
},
{
"epoch": 1.7756644867102658,
"grad_norm": 0.6087771654129028,
"learning_rate": 1.7900820894338786e-05,
"loss": 0.4405,
"num_input_tokens_seen": 41535640,
"step": 3700,
"train_runtime": 6267.1679,
"train_tokens_per_second": 6627.498
},
{
"epoch": 1.7780644387112257,
"grad_norm": 0.7156651020050049,
"learning_rate": 1.7840617319491527e-05,
"loss": 0.51,
"num_input_tokens_seen": 41592104,
"step": 3705,
"train_runtime": 6275.4346,
"train_tokens_per_second": 6627.765
},
{
"epoch": 1.7804643907121858,
"grad_norm": 0.7992216348648071,
"learning_rate": 1.7780458938267807e-05,
"loss": 0.4488,
"num_input_tokens_seen": 41649776,
"step": 3710,
"train_runtime": 6283.7454,
"train_tokens_per_second": 6628.177
},
{
"epoch": 1.7828643427131459,
"grad_norm": 0.7933105230331421,
"learning_rate": 1.772034613041758e-05,
"loss": 0.4581,
"num_input_tokens_seen": 41707280,
"step": 3715,
"train_runtime": 6291.9245,
"train_tokens_per_second": 6628.7
},
{
"epoch": 1.7852642947141057,
"grad_norm": 0.8297272324562073,
"learning_rate": 1.7660279275403124e-05,
"loss": 0.4598,
"num_input_tokens_seen": 41765768,
"step": 3720,
"train_runtime": 6300.2081,
"train_tokens_per_second": 6629.268
},
{
"epoch": 1.7876642467150656,
"grad_norm": 0.6287772059440613,
"learning_rate": 1.7600258752396626e-05,
"loss": 0.4783,
"num_input_tokens_seen": 41819576,
"step": 3725,
"train_runtime": 6308.2419,
"train_tokens_per_second": 6629.355
},
{
"epoch": 1.7900641987160257,
"grad_norm": 0.7246582508087158,
"learning_rate": 1.754028494027782e-05,
"loss": 0.4821,
"num_input_tokens_seen": 41876528,
"step": 3730,
"train_runtime": 6316.3849,
"train_tokens_per_second": 6629.825
},
{
"epoch": 1.7924641507169856,
"grad_norm": 0.752740204334259,
"learning_rate": 1.748035821763154e-05,
"loss": 0.4984,
"num_input_tokens_seen": 41933488,
"step": 3735,
"train_runtime": 6324.4895,
"train_tokens_per_second": 6630.336
},
{
"epoch": 1.7948641027179457,
"grad_norm": 0.7370868921279907,
"learning_rate": 1.7420478962745424e-05,
"loss": 0.4707,
"num_input_tokens_seen": 41989264,
"step": 3740,
"train_runtime": 6332.3923,
"train_tokens_per_second": 6630.869
},
{
"epoch": 1.7972640547189056,
"grad_norm": 0.5607179999351501,
"learning_rate": 1.736064755360742e-05,
"loss": 0.5113,
"num_input_tokens_seen": 42045264,
"step": 3745,
"train_runtime": 6340.5688,
"train_tokens_per_second": 6631.15
},
{
"epoch": 1.7996640067198655,
"grad_norm": 0.851588785648346,
"learning_rate": 1.7300864367903462e-05,
"loss": 0.4807,
"num_input_tokens_seen": 42103712,
"step": 3750,
"train_runtime": 6348.5367,
"train_tokens_per_second": 6632.034
},
{
"epoch": 1.8020639587208256,
"grad_norm": 0.6969419717788696,
"learning_rate": 1.7241129783015108e-05,
"loss": 0.5129,
"num_input_tokens_seen": 42156568,
"step": 3755,
"train_runtime": 6356.2935,
"train_tokens_per_second": 6632.256
},
{
"epoch": 1.8044639107217857,
"grad_norm": 0.705589771270752,
"learning_rate": 1.7181444176017077e-05,
"loss": 0.4709,
"num_input_tokens_seen": 42214056,
"step": 3760,
"train_runtime": 6364.5049,
"train_tokens_per_second": 6632.732
},
{
"epoch": 1.8068638627227456,
"grad_norm": 0.9332826733589172,
"learning_rate": 1.7121807923674926e-05,
"loss": 0.4609,
"num_input_tokens_seen": 42270872,
"step": 3765,
"train_runtime": 6372.8289,
"train_tokens_per_second": 6632.984
},
{
"epoch": 1.8092638147237055,
"grad_norm": 0.6459842324256897,
"learning_rate": 1.7062221402442678e-05,
"loss": 0.5136,
"num_input_tokens_seen": 42324392,
"step": 3770,
"train_runtime": 6380.6203,
"train_tokens_per_second": 6633.272
},
{
"epoch": 1.8116637667246656,
"grad_norm": 0.8273303508758545,
"learning_rate": 1.7002684988460417e-05,
"loss": 0.465,
"num_input_tokens_seen": 42381736,
"step": 3775,
"train_runtime": 6388.9298,
"train_tokens_per_second": 6633.621
},
{
"epoch": 1.8140637187256254,
"grad_norm": 0.6155418157577515,
"learning_rate": 1.694319905755193e-05,
"loss": 0.4924,
"num_input_tokens_seen": 42442312,
"step": 3780,
"train_runtime": 6399.8787,
"train_tokens_per_second": 6631.737
},
{
"epoch": 1.8164636707265855,
"grad_norm": 1.0188329219818115,
"learning_rate": 1.6883763985222305e-05,
"loss": 0.468,
"num_input_tokens_seen": 42496896,
"step": 3785,
"train_runtime": 6409.4045,
"train_tokens_per_second": 6630.397
},
{
"epoch": 1.8188636227275454,
"grad_norm": 0.604070782661438,
"learning_rate": 1.6824380146655633e-05,
"loss": 0.5271,
"num_input_tokens_seen": 42554600,
"step": 3790,
"train_runtime": 6419.249,
"train_tokens_per_second": 6629.218
},
{
"epoch": 1.8212635747285053,
"grad_norm": 0.7463460564613342,
"learning_rate": 1.6765047916712545e-05,
"loss": 0.5052,
"num_input_tokens_seen": 42611168,
"step": 3795,
"train_runtime": 6429.2745,
"train_tokens_per_second": 6627.679
},
{
"epoch": 1.8236635267294654,
"grad_norm": 0.6504276990890503,
"learning_rate": 1.6705767669927914e-05,
"loss": 0.4572,
"num_input_tokens_seen": 42668344,
"step": 3800,
"train_runtime": 6440.1221,
"train_tokens_per_second": 6625.394
},
{
"epoch": 1.8260634787304255,
"grad_norm": 0.8336795568466187,
"learning_rate": 1.6646539780508478e-05,
"loss": 0.4514,
"num_input_tokens_seen": 42725880,
"step": 3805,
"train_runtime": 6450.437,
"train_tokens_per_second": 6623.719
},
{
"epoch": 1.8284634307313854,
"grad_norm": 0.6106321215629578,
"learning_rate": 1.658736462233045e-05,
"loss": 0.4553,
"num_input_tokens_seen": 42785824,
"step": 3810,
"train_runtime": 6460.6963,
"train_tokens_per_second": 6622.479
},
{
"epoch": 1.8308633827323453,
"grad_norm": 0.9887316823005676,
"learning_rate": 1.6528242568937174e-05,
"loss": 0.5347,
"num_input_tokens_seen": 42840440,
"step": 3815,
"train_runtime": 6470.5401,
"train_tokens_per_second": 6620.845
},
{
"epoch": 1.8332633347333054,
"grad_norm": 0.6800510287284851,
"learning_rate": 1.6469173993536787e-05,
"loss": 0.5028,
"num_input_tokens_seen": 42893576,
"step": 3820,
"train_runtime": 6480.2024,
"train_tokens_per_second": 6619.172
},
{
"epoch": 1.8356632867342653,
"grad_norm": 0.5527476668357849,
"learning_rate": 1.641015926899985e-05,
"loss": 0.4997,
"num_input_tokens_seen": 42952744,
"step": 3825,
"train_runtime": 6490.1332,
"train_tokens_per_second": 6618.161
},
{
"epoch": 1.8380632387352254,
"grad_norm": 0.833662211894989,
"learning_rate": 1.6351198767856978e-05,
"loss": 0.5076,
"num_input_tokens_seen": 43010768,
"step": 3830,
"train_runtime": 6498.8469,
"train_tokens_per_second": 6618.215
},
{
"epoch": 1.8404631907361853,
"grad_norm": 0.8122771978378296,
"learning_rate": 1.6292292862296482e-05,
"loss": 0.4789,
"num_input_tokens_seen": 43067120,
"step": 3835,
"train_runtime": 6506.9502,
"train_tokens_per_second": 6618.634
},
{
"epoch": 1.8428631427371451,
"grad_norm": 0.7453281283378601,
"learning_rate": 1.6233441924162085e-05,
"loss": 0.472,
"num_input_tokens_seen": 43124944,
"step": 3840,
"train_runtime": 6514.9238,
"train_tokens_per_second": 6619.409
},
{
"epoch": 1.8452630947381052,
"grad_norm": 0.7798519730567932,
"learning_rate": 1.617464632495048e-05,
"loss": 0.4968,
"num_input_tokens_seen": 43181496,
"step": 3845,
"train_runtime": 6522.7215,
"train_tokens_per_second": 6620.165
},
{
"epoch": 1.8476630467390653,
"grad_norm": 0.770413339138031,
"learning_rate": 1.611590643580906e-05,
"loss": 0.4799,
"num_input_tokens_seen": 43236224,
"step": 3850,
"train_runtime": 6531.17,
"train_tokens_per_second": 6619.981
},
{
"epoch": 1.8500629987400252,
"grad_norm": 0.7712330222129822,
"learning_rate": 1.6057222627533554e-05,
"loss": 0.4825,
"num_input_tokens_seen": 43291464,
"step": 3855,
"train_runtime": 6539.3507,
"train_tokens_per_second": 6620.147
},
{
"epoch": 1.852462950740985,
"grad_norm": 0.667767345905304,
"learning_rate": 1.599859527056566e-05,
"loss": 0.4525,
"num_input_tokens_seen": 43349520,
"step": 3860,
"train_runtime": 6547.7333,
"train_tokens_per_second": 6620.538
},
{
"epoch": 1.8548629027419452,
"grad_norm": 0.8143635988235474,
"learning_rate": 1.594002473499073e-05,
"loss": 0.4601,
"num_input_tokens_seen": 43410208,
"step": 3865,
"train_runtime": 6556.4635,
"train_tokens_per_second": 6620.979
},
{
"epoch": 1.857262854742905,
"grad_norm": 0.6884592771530151,
"learning_rate": 1.588151139053544e-05,
"loss": 0.4458,
"num_input_tokens_seen": 43469344,
"step": 3870,
"train_runtime": 6565.0785,
"train_tokens_per_second": 6621.298
},
{
"epoch": 1.8596628067438652,
"grad_norm": 0.8038159608840942,
"learning_rate": 1.5823055606565458e-05,
"loss": 0.4859,
"num_input_tokens_seen": 43526440,
"step": 3875,
"train_runtime": 6573.964,
"train_tokens_per_second": 6621.034
},
{
"epoch": 1.862062758744825,
"grad_norm": 0.6315177083015442,
"learning_rate": 1.5764657752083072e-05,
"loss": 0.4795,
"num_input_tokens_seen": 43583936,
"step": 3880,
"train_runtime": 6582.4382,
"train_tokens_per_second": 6621.245
},
{
"epoch": 1.864462710745785,
"grad_norm": 0.7281184792518616,
"learning_rate": 1.5706318195724894e-05,
"loss": 0.4707,
"num_input_tokens_seen": 43639480,
"step": 3885,
"train_runtime": 6590.3977,
"train_tokens_per_second": 6621.676
},
{
"epoch": 1.866862662746745,
"grad_norm": 0.8681549429893494,
"learning_rate": 1.5648037305759566e-05,
"loss": 0.4557,
"num_input_tokens_seen": 43690520,
"step": 3890,
"train_runtime": 6598.0076,
"train_tokens_per_second": 6621.775
},
{
"epoch": 1.8692626147477052,
"grad_norm": 0.9573807120323181,
"learning_rate": 1.5589815450085355e-05,
"loss": 0.4621,
"num_input_tokens_seen": 43749480,
"step": 3895,
"train_runtime": 6606.515,
"train_tokens_per_second": 6622.172
},
{
"epoch": 1.871662566748665,
"grad_norm": 0.9825738072395325,
"learning_rate": 1.5531652996227885e-05,
"loss": 0.4627,
"num_input_tokens_seen": 43799824,
"step": 3900,
"train_runtime": 6614.0046,
"train_tokens_per_second": 6622.285
},
{
"epoch": 1.874062518749625,
"grad_norm": 0.8160600662231445,
"learning_rate": 1.5473550311337833e-05,
"loss": 0.4806,
"num_input_tokens_seen": 43858032,
"step": 3905,
"train_runtime": 6622.3127,
"train_tokens_per_second": 6622.767
},
{
"epoch": 1.876462470750585,
"grad_norm": 0.8037713766098022,
"learning_rate": 1.541550776218855e-05,
"loss": 0.4767,
"num_input_tokens_seen": 43914232,
"step": 3910,
"train_runtime": 6630.3703,
"train_tokens_per_second": 6623.194
},
{
"epoch": 1.878862422751545,
"grad_norm": 0.8697477579116821,
"learning_rate": 1.535752571517379e-05,
"loss": 0.4582,
"num_input_tokens_seen": 43970744,
"step": 3915,
"train_runtime": 6638.2775,
"train_tokens_per_second": 6623.818
},
{
"epoch": 1.881262374752505,
"grad_norm": 0.6897442936897278,
"learning_rate": 1.529960453630538e-05,
"loss": 0.4725,
"num_input_tokens_seen": 44028408,
"step": 3920,
"train_runtime": 6646.2538,
"train_tokens_per_second": 6624.545
},
{
"epoch": 1.883662326753465,
"grad_norm": 0.7267577052116394,
"learning_rate": 1.5241744591210954e-05,
"loss": 0.4661,
"num_input_tokens_seen": 44085968,
"step": 3925,
"train_runtime": 6654.4818,
"train_tokens_per_second": 6625.004
},
{
"epoch": 1.8860622787544248,
"grad_norm": 0.6550572514533997,
"learning_rate": 1.5183946245131563e-05,
"loss": 0.5171,
"num_input_tokens_seen": 44143360,
"step": 3930,
"train_runtime": 6662.7155,
"train_tokens_per_second": 6625.431
},
{
"epoch": 1.8884622307553849,
"grad_norm": 0.8330610394477844,
"learning_rate": 1.5126209862919427e-05,
"loss": 0.4935,
"num_input_tokens_seen": 44193864,
"step": 3935,
"train_runtime": 6669.9997,
"train_tokens_per_second": 6625.767
},
{
"epoch": 1.890862182756345,
"grad_norm": 0.8436587452888489,
"learning_rate": 1.506853580903564e-05,
"loss": 0.5181,
"num_input_tokens_seen": 44249464,
"step": 3940,
"train_runtime": 6677.7583,
"train_tokens_per_second": 6626.395
},
{
"epoch": 1.8932621347573049,
"grad_norm": 0.8945364356040955,
"learning_rate": 1.5010924447547808e-05,
"loss": 0.445,
"num_input_tokens_seen": 44306480,
"step": 3945,
"train_runtime": 6685.9167,
"train_tokens_per_second": 6626.837
},
{
"epoch": 1.8956620867582648,
"grad_norm": 0.7293525338172913,
"learning_rate": 1.4953376142127828e-05,
"loss": 0.4933,
"num_input_tokens_seen": 44363776,
"step": 3950,
"train_runtime": 6693.7184,
"train_tokens_per_second": 6627.673
},
{
"epoch": 1.8980620387592249,
"grad_norm": 0.8093637228012085,
"learning_rate": 1.4895891256049548e-05,
"loss": 0.4952,
"num_input_tokens_seen": 44419016,
"step": 3955,
"train_runtime": 6701.9236,
"train_tokens_per_second": 6627.801
},
{
"epoch": 1.900461990760185,
"grad_norm": 0.8808810710906982,
"learning_rate": 1.483847015218647e-05,
"loss": 0.5036,
"num_input_tokens_seen": 44473296,
"step": 3960,
"train_runtime": 6709.8008,
"train_tokens_per_second": 6628.11
},
{
"epoch": 1.9028619427611448,
"grad_norm": 0.606708288192749,
"learning_rate": 1.4781113193009466e-05,
"loss": 0.4709,
"num_input_tokens_seen": 44533064,
"step": 3965,
"train_runtime": 6718.435,
"train_tokens_per_second": 6628.488
},
{
"epoch": 1.9052618947621047,
"grad_norm": 0.7501396536827087,
"learning_rate": 1.472382074058451e-05,
"loss": 0.487,
"num_input_tokens_seen": 44591088,
"step": 3970,
"train_runtime": 6726.4717,
"train_tokens_per_second": 6629.194
},
{
"epoch": 1.9076618467630646,
"grad_norm": 0.7472719550132751,
"learning_rate": 1.4666593156570376e-05,
"loss": 0.4822,
"num_input_tokens_seen": 44639864,
"step": 3975,
"train_runtime": 6733.726,
"train_tokens_per_second": 6629.296
},
{
"epoch": 1.9100617987640247,
"grad_norm": 0.9028266668319702,
"learning_rate": 1.460943080221635e-05,
"loss": 0.4792,
"num_input_tokens_seen": 44697568,
"step": 3980,
"train_runtime": 6742.2116,
"train_tokens_per_second": 6629.511
},
{
"epoch": 1.9124617507649848,
"grad_norm": 0.6775950193405151,
"learning_rate": 1.4552334038359938e-05,
"loss": 0.4861,
"num_input_tokens_seen": 44750848,
"step": 3985,
"train_runtime": 6749.8459,
"train_tokens_per_second": 6629.907
},
{
"epoch": 1.9148617027659447,
"grad_norm": 0.7115968465805054,
"learning_rate": 1.4495303225424656e-05,
"loss": 0.4546,
"num_input_tokens_seen": 44804648,
"step": 3990,
"train_runtime": 6758.0597,
"train_tokens_per_second": 6629.809
},
{
"epoch": 1.9172616547669046,
"grad_norm": 0.8527563214302063,
"learning_rate": 1.4438338723417654e-05,
"loss": 0.5007,
"num_input_tokens_seen": 44860632,
"step": 3995,
"train_runtime": 6766.2816,
"train_tokens_per_second": 6630.027
},
{
"epoch": 1.9196616067678647,
"grad_norm": 0.8954775333404541,
"learning_rate": 1.4381440891927512e-05,
"loss": 0.5301,
"num_input_tokens_seen": 44913712,
"step": 4000,
"train_runtime": 6774.2353,
"train_tokens_per_second": 6630.078
},
{
"epoch": 1.9220615587688248,
"grad_norm": 0.7284995317459106,
"learning_rate": 1.432461009012196e-05,
"loss": 0.5028,
"num_input_tokens_seen": 44970992,
"step": 4005,
"train_runtime": 6782.3775,
"train_tokens_per_second": 6630.565
},
{
"epoch": 1.9244615107697847,
"grad_norm": 1.017869472503662,
"learning_rate": 1.4267846676745598e-05,
"loss": 0.4618,
"num_input_tokens_seen": 45024328,
"step": 4010,
"train_runtime": 6790.5882,
"train_tokens_per_second": 6630.402
},
{
"epoch": 1.9268614627707445,
"grad_norm": 0.7588083148002625,
"learning_rate": 1.4211151010117627e-05,
"loss": 0.5078,
"num_input_tokens_seen": 45082296,
"step": 4015,
"train_runtime": 6798.9435,
"train_tokens_per_second": 6630.78
},
{
"epoch": 1.9292614147717044,
"grad_norm": 0.66818767786026,
"learning_rate": 1.4154523448129597e-05,
"loss": 0.4823,
"num_input_tokens_seen": 45137992,
"step": 4020,
"train_runtime": 6806.9385,
"train_tokens_per_second": 6631.174
},
{
"epoch": 1.9316613667726645,
"grad_norm": 0.700678825378418,
"learning_rate": 1.4097964348243172e-05,
"loss": 0.4639,
"num_input_tokens_seen": 45197208,
"step": 4025,
"train_runtime": 6815.6104,
"train_tokens_per_second": 6631.425
},
{
"epoch": 1.9340613187736246,
"grad_norm": 0.8906050324440002,
"learning_rate": 1.4041474067487814e-05,
"loss": 0.4599,
"num_input_tokens_seen": 45256040,
"step": 4030,
"train_runtime": 6824.0323,
"train_tokens_per_second": 6631.862
},
{
"epoch": 1.9364612707745845,
"grad_norm": 0.8205930590629578,
"learning_rate": 1.3985052962458593e-05,
"loss": 0.4903,
"num_input_tokens_seen": 45311968,
"step": 4035,
"train_runtime": 6831.8772,
"train_tokens_per_second": 6632.433
},
{
"epoch": 1.9388612227755444,
"grad_norm": 0.9148489832878113,
"learning_rate": 1.3928701389313897e-05,
"loss": 0.4939,
"num_input_tokens_seen": 45361584,
"step": 4040,
"train_runtime": 6839.5045,
"train_tokens_per_second": 6632.291
},
{
"epoch": 1.9412611747765045,
"grad_norm": 1.021208643913269,
"learning_rate": 1.3872419703773187e-05,
"loss": 0.4876,
"num_input_tokens_seen": 45421616,
"step": 4045,
"train_runtime": 6848.0389,
"train_tokens_per_second": 6632.792
},
{
"epoch": 1.9436611267774646,
"grad_norm": 0.8669795393943787,
"learning_rate": 1.3816208261114755e-05,
"loss": 0.5142,
"num_input_tokens_seen": 45475784,
"step": 4050,
"train_runtime": 6855.5247,
"train_tokens_per_second": 6633.451
},
{
"epoch": 1.9460610787784245,
"grad_norm": 1.084006428718567,
"learning_rate": 1.3760067416173511e-05,
"loss": 0.4949,
"num_input_tokens_seen": 45529816,
"step": 4055,
"train_runtime": 6863.458,
"train_tokens_per_second": 6633.655
},
{
"epoch": 1.9484610307793844,
"grad_norm": 0.639717161655426,
"learning_rate": 1.3703997523338688e-05,
"loss": 0.4917,
"num_input_tokens_seen": 45585432,
"step": 4060,
"train_runtime": 6870.9893,
"train_tokens_per_second": 6634.479
},
{
"epoch": 1.9508609827803443,
"grad_norm": 0.7942274808883667,
"learning_rate": 1.3647998936551643e-05,
"loss": 0.4542,
"num_input_tokens_seen": 45642256,
"step": 4065,
"train_runtime": 6879.1089,
"train_tokens_per_second": 6634.908
},
{
"epoch": 1.9532609347813044,
"grad_norm": 0.7706002593040466,
"learning_rate": 1.3592072009303603e-05,
"loss": 0.4767,
"num_input_tokens_seen": 45700704,
"step": 4070,
"train_runtime": 6887.1919,
"train_tokens_per_second": 6635.608
},
{
"epoch": 1.9556608867822645,
"grad_norm": 0.6891798377037048,
"learning_rate": 1.3536217094633471e-05,
"loss": 0.4649,
"num_input_tokens_seen": 45754672,
"step": 4075,
"train_runtime": 6895.2959,
"train_tokens_per_second": 6635.636
},
{
"epoch": 1.9580608387832243,
"grad_norm": 0.6927337646484375,
"learning_rate": 1.3480434545125562e-05,
"loss": 0.4794,
"num_input_tokens_seen": 45805360,
"step": 4080,
"train_runtime": 6902.6999,
"train_tokens_per_second": 6635.861
},
{
"epoch": 1.9604607907841842,
"grad_norm": 0.7922900319099426,
"learning_rate": 1.3424724712907355e-05,
"loss": 0.5073,
"num_input_tokens_seen": 45859408,
"step": 4085,
"train_runtime": 6910.3792,
"train_tokens_per_second": 6636.309
},
{
"epoch": 1.9628607427851443,
"grad_norm": 0.5073052048683167,
"learning_rate": 1.3369087949647352e-05,
"loss": 0.4844,
"num_input_tokens_seen": 45915912,
"step": 4090,
"train_runtime": 6918.4066,
"train_tokens_per_second": 6636.776
},
{
"epoch": 1.9652606947861044,
"grad_norm": 0.805068850517273,
"learning_rate": 1.3313524606552763e-05,
"loss": 0.4683,
"num_input_tokens_seen": 45972424,
"step": 4095,
"train_runtime": 6926.7284,
"train_tokens_per_second": 6636.961
},
{
"epoch": 1.9676606467870643,
"grad_norm": 0.7410593628883362,
"learning_rate": 1.3258035034367338e-05,
"loss": 0.4847,
"num_input_tokens_seen": 46029616,
"step": 4100,
"train_runtime": 6934.891,
"train_tokens_per_second": 6637.396
},
{
"epoch": 1.9700605987880242,
"grad_norm": 0.9381468296051025,
"learning_rate": 1.3202619583369189e-05,
"loss": 0.5131,
"num_input_tokens_seen": 46087816,
"step": 4105,
"train_runtime": 6943.9707,
"train_tokens_per_second": 6637.098
},
{
"epoch": 1.972460550788984,
"grad_norm": 0.7725812792778015,
"learning_rate": 1.3147278603368487e-05,
"loss": 0.496,
"num_input_tokens_seen": 46141504,
"step": 4110,
"train_runtime": 6951.6679,
"train_tokens_per_second": 6637.472
},
{
"epoch": 1.9748605027899442,
"grad_norm": 0.9349031448364258,
"learning_rate": 1.3092012443705332e-05,
"loss": 0.4513,
"num_input_tokens_seen": 46202072,
"step": 4115,
"train_runtime": 6960.2643,
"train_tokens_per_second": 6637.977
},
{
"epoch": 1.9772604547909043,
"grad_norm": 0.5486748218536377,
"learning_rate": 1.3036821453247506e-05,
"loss": 0.4997,
"num_input_tokens_seen": 46258400,
"step": 4120,
"train_runtime": 6968.5186,
"train_tokens_per_second": 6638.197
},
{
"epoch": 1.9796604067918642,
"grad_norm": 0.8410947322845459,
"learning_rate": 1.2981705980388295e-05,
"loss": 0.5062,
"num_input_tokens_seen": 46309656,
"step": 4125,
"train_runtime": 6975.9975,
"train_tokens_per_second": 6638.428
},
{
"epoch": 1.982060358792824,
"grad_norm": 0.6465336680412292,
"learning_rate": 1.2926666373044294e-05,
"loss": 0.4891,
"num_input_tokens_seen": 46366888,
"step": 4130,
"train_runtime": 6984.2364,
"train_tokens_per_second": 6638.791
},
{
"epoch": 1.9844603107937842,
"grad_norm": 0.6658479571342468,
"learning_rate": 1.2871702978653163e-05,
"loss": 0.5002,
"num_input_tokens_seen": 46419304,
"step": 4135,
"train_runtime": 6991.7902,
"train_tokens_per_second": 6639.116
},
{
"epoch": 1.9868602627947443,
"grad_norm": 0.8227950930595398,
"learning_rate": 1.28168161441715e-05,
"loss": 0.5105,
"num_input_tokens_seen": 46469520,
"step": 4140,
"train_runtime": 6999.0924,
"train_tokens_per_second": 6639.364
},
{
"epoch": 1.9892602147957041,
"grad_norm": 1.1198500394821167,
"learning_rate": 1.27620062160726e-05,
"loss": 0.5154,
"num_input_tokens_seen": 46523240,
"step": 4145,
"train_runtime": 7007.0718,
"train_tokens_per_second": 6639.47
},
{
"epoch": 1.991660166796664,
"grad_norm": 0.8290591835975647,
"learning_rate": 1.2707273540344274e-05,
"loss": 0.5361,
"num_input_tokens_seen": 46577712,
"step": 4150,
"train_runtime": 7015.0992,
"train_tokens_per_second": 6639.637
},
{
"epoch": 1.994060118797624,
"grad_norm": 0.6306242346763611,
"learning_rate": 1.265261846248672e-05,
"loss": 0.4873,
"num_input_tokens_seen": 46629984,
"step": 4155,
"train_runtime": 7022.7592,
"train_tokens_per_second": 6639.838
},
{
"epoch": 1.996460070798584,
"grad_norm": 0.8492105007171631,
"learning_rate": 1.2598041327510254e-05,
"loss": 0.4779,
"num_input_tokens_seen": 46689664,
"step": 4160,
"train_runtime": 7031.9599,
"train_tokens_per_second": 6639.637
},
{
"epoch": 1.9988600227995441,
"grad_norm": 0.8231053352355957,
"learning_rate": 1.25435424799332e-05,
"loss": 0.4451,
"num_input_tokens_seen": 46752192,
"step": 4165,
"train_runtime": 7041.1099,
"train_tokens_per_second": 6639.89
},
{
"epoch": 2.000959980800384,
"grad_norm": 0.6937538385391235,
"learning_rate": 1.2489122263779684e-05,
"loss": 0.4431,
"num_input_tokens_seen": 46800120,
"step": 4170,
"train_runtime": 7048.3249,
"train_tokens_per_second": 6639.893
},
{
"epoch": 2.003359932801344,
"grad_norm": 0.5429336428642273,
"learning_rate": 1.2434781022577476e-05,
"loss": 0.4561,
"num_input_tokens_seen": 46859352,
"step": 4175,
"train_runtime": 7056.8347,
"train_tokens_per_second": 6640.279
},
{
"epoch": 2.005759884802304,
"grad_norm": 0.7788823843002319,
"learning_rate": 1.2380519099355831e-05,
"loss": 0.4531,
"num_input_tokens_seen": 46918656,
"step": 4180,
"train_runtime": 7065.0446,
"train_tokens_per_second": 6640.957
},
{
"epoch": 2.008159836803264,
"grad_norm": 0.7995026111602783,
"learning_rate": 1.2326336836643274e-05,
"loss": 0.5048,
"num_input_tokens_seen": 46976896,
"step": 4185,
"train_runtime": 7073.7177,
"train_tokens_per_second": 6641.048
},
{
"epoch": 2.010559788804224,
"grad_norm": 0.7401773929595947,
"learning_rate": 1.227223457646551e-05,
"loss": 0.4846,
"num_input_tokens_seen": 47033584,
"step": 4190,
"train_runtime": 7081.7376,
"train_tokens_per_second": 6641.532
},
{
"epoch": 2.012959740805184,
"grad_norm": 1.0051988363265991,
"learning_rate": 1.22182126603432e-05,
"loss": 0.497,
"num_input_tokens_seen": 47084560,
"step": 4195,
"train_runtime": 7089.243,
"train_tokens_per_second": 6641.691
},
{
"epoch": 2.0153596928061437,
"grad_norm": 0.7586055994033813,
"learning_rate": 1.2164271429289837e-05,
"loss": 0.4671,
"num_input_tokens_seen": 47141040,
"step": 4200,
"train_runtime": 7097.5162,
"train_tokens_per_second": 6641.907
},
{
"epoch": 2.017759644807104,
"grad_norm": 0.6509086489677429,
"learning_rate": 1.2110411223809612e-05,
"loss": 0.4329,
"num_input_tokens_seen": 47198656,
"step": 4205,
"train_runtime": 7107.6834,
"train_tokens_per_second": 6640.512
},
{
"epoch": 2.020159596808064,
"grad_norm": 0.7223982810974121,
"learning_rate": 1.2056632383895217e-05,
"loss": 0.4903,
"num_input_tokens_seen": 47255504,
"step": 4210,
"train_runtime": 7117.033,
"train_tokens_per_second": 6639.776
},
{
"epoch": 2.0225595488090238,
"grad_norm": 0.9436632990837097,
"learning_rate": 1.2002935249025732e-05,
"loss": 0.4788,
"num_input_tokens_seen": 47307728,
"step": 4215,
"train_runtime": 7126.5999,
"train_tokens_per_second": 6638.191
},
{
"epoch": 2.0249595008099837,
"grad_norm": 0.7383816838264465,
"learning_rate": 1.1949320158164466e-05,
"loss": 0.4692,
"num_input_tokens_seen": 47365504,
"step": 4220,
"train_runtime": 7136.5388,
"train_tokens_per_second": 6637.041
},
{
"epoch": 2.027359452810944,
"grad_norm": 0.8641635775566101,
"learning_rate": 1.1895787449756834e-05,
"loss": 0.4565,
"num_input_tokens_seen": 47424664,
"step": 4225,
"train_runtime": 7147.349,
"train_tokens_per_second": 6635.28
},
{
"epoch": 2.029759404811904,
"grad_norm": 0.8401957750320435,
"learning_rate": 1.1842337461728232e-05,
"loss": 0.5177,
"num_input_tokens_seen": 47482624,
"step": 4230,
"train_runtime": 7158.1241,
"train_tokens_per_second": 6633.389
},
{
"epoch": 2.0321593568128637,
"grad_norm": 0.7083563208580017,
"learning_rate": 1.1788970531481832e-05,
"loss": 0.4509,
"num_input_tokens_seen": 47541264,
"step": 4235,
"train_runtime": 7168.9418,
"train_tokens_per_second": 6631.559
},
{
"epoch": 2.0345593088138236,
"grad_norm": 0.7770140171051025,
"learning_rate": 1.1735686995896559e-05,
"loss": 0.5111,
"num_input_tokens_seen": 47596256,
"step": 4240,
"train_runtime": 7178.4941,
"train_tokens_per_second": 6630.396
},
{
"epoch": 2.0369592608147835,
"grad_norm": 0.8754630088806152,
"learning_rate": 1.1682487191324868e-05,
"loss": 0.5576,
"num_input_tokens_seen": 47649808,
"step": 4245,
"train_runtime": 7188.2139,
"train_tokens_per_second": 6628.88
},
{
"epoch": 2.039359212815744,
"grad_norm": 0.6423441767692566,
"learning_rate": 1.1629371453590671e-05,
"loss": 0.4836,
"num_input_tokens_seen": 47709328,
"step": 4250,
"train_runtime": 7198.5845,
"train_tokens_per_second": 6627.599
},
{
"epoch": 2.0417591648167037,
"grad_norm": 0.7070155143737793,
"learning_rate": 1.1576340117987233e-05,
"loss": 0.5057,
"num_input_tokens_seen": 47765800,
"step": 4255,
"train_runtime": 7209.1424,
"train_tokens_per_second": 6625.726
},
{
"epoch": 2.0441591168176636,
"grad_norm": 0.8831612467765808,
"learning_rate": 1.1523393519274996e-05,
"loss": 0.4447,
"num_input_tokens_seen": 47820320,
"step": 4260,
"train_runtime": 7218.3344,
"train_tokens_per_second": 6624.841
},
{
"epoch": 2.0465590688186235,
"grad_norm": 0.6510924100875854,
"learning_rate": 1.1470531991679523e-05,
"loss": 0.5101,
"num_input_tokens_seen": 47876928,
"step": 4265,
"train_runtime": 7228.3009,
"train_tokens_per_second": 6623.538
},
{
"epoch": 2.048959020819584,
"grad_norm": 0.6335709691047668,
"learning_rate": 1.1417755868889343e-05,
"loss": 0.4432,
"num_input_tokens_seen": 47933280,
"step": 4270,
"train_runtime": 7237.6205,
"train_tokens_per_second": 6622.795
},
{
"epoch": 2.0513589728205437,
"grad_norm": 0.7883151769638062,
"learning_rate": 1.1365065484053895e-05,
"loss": 0.4606,
"num_input_tokens_seen": 47991280,
"step": 4275,
"train_runtime": 7247.4539,
"train_tokens_per_second": 6621.812
},
{
"epoch": 2.0537589248215036,
"grad_norm": 0.8296838998794556,
"learning_rate": 1.1312461169781383e-05,
"loss": 0.4669,
"num_input_tokens_seen": 48045896,
"step": 4280,
"train_runtime": 7257.2601,
"train_tokens_per_second": 6620.391
},
{
"epoch": 2.0561588768224635,
"grad_norm": 0.8068815469741821,
"learning_rate": 1.1259943258136682e-05,
"loss": 0.4849,
"num_input_tokens_seen": 48105824,
"step": 4285,
"train_runtime": 7268.0346,
"train_tokens_per_second": 6618.822
},
{
"epoch": 2.0585588288234233,
"grad_norm": 0.977588415145874,
"learning_rate": 1.1207512080639273e-05,
"loss": 0.4956,
"num_input_tokens_seen": 48160632,
"step": 4290,
"train_runtime": 7277.9569,
"train_tokens_per_second": 6617.329
},
{
"epoch": 2.0609587808243837,
"grad_norm": 0.7364087700843811,
"learning_rate": 1.1155167968261105e-05,
"loss": 0.4357,
"num_input_tokens_seen": 48217992,
"step": 4295,
"train_runtime": 7288.3331,
"train_tokens_per_second": 6615.778
},
{
"epoch": 2.0633587328253435,
"grad_norm": 0.757265031337738,
"learning_rate": 1.1102911251424526e-05,
"loss": 0.4907,
"num_input_tokens_seen": 48276216,
"step": 4300,
"train_runtime": 7298.7103,
"train_tokens_per_second": 6614.349
},
{
"epoch": 2.0657586848263034,
"grad_norm": 0.773041844367981,
"learning_rate": 1.1050742260000226e-05,
"loss": 0.4687,
"num_input_tokens_seen": 48331296,
"step": 4305,
"train_runtime": 7308.4104,
"train_tokens_per_second": 6613.106
},
{
"epoch": 2.0681586368272633,
"grad_norm": 1.1142570972442627,
"learning_rate": 1.0998661323305107e-05,
"loss": 0.4574,
"num_input_tokens_seen": 48387368,
"step": 4310,
"train_runtime": 7317.8081,
"train_tokens_per_second": 6612.276
},
{
"epoch": 2.0705585888282236,
"grad_norm": 1.0279673337936401,
"learning_rate": 1.094666877010023e-05,
"loss": 0.5004,
"num_input_tokens_seen": 48440296,
"step": 4315,
"train_runtime": 7327.8587,
"train_tokens_per_second": 6610.43
},
{
"epoch": 2.0729585408291835,
"grad_norm": 0.9261734485626221,
"learning_rate": 1.0894764928588721e-05,
"loss": 0.4747,
"num_input_tokens_seen": 48492496,
"step": 4320,
"train_runtime": 7336.9344,
"train_tokens_per_second": 6609.368
},
{
"epoch": 2.0753584928301434,
"grad_norm": 1.1111286878585815,
"learning_rate": 1.0842950126413742e-05,
"loss": 0.5137,
"num_input_tokens_seen": 48549184,
"step": 4325,
"train_runtime": 7346.6107,
"train_tokens_per_second": 6608.378
},
{
"epoch": 2.0777584448311033,
"grad_norm": 0.8526914119720459,
"learning_rate": 1.0791224690656384e-05,
"loss": 0.4573,
"num_input_tokens_seen": 48601016,
"step": 4330,
"train_runtime": 7354.8806,
"train_tokens_per_second": 6607.995
},
{
"epoch": 2.080158396832063,
"grad_norm": 0.5850500464439392,
"learning_rate": 1.0739588947833593e-05,
"loss": 0.4814,
"num_input_tokens_seen": 48655504,
"step": 4335,
"train_runtime": 7363.7381,
"train_tokens_per_second": 6607.446
},
{
"epoch": 2.0825583488330235,
"grad_norm": 1.0572696924209595,
"learning_rate": 1.068804322389616e-05,
"loss": 0.4997,
"num_input_tokens_seen": 48708616,
"step": 4340,
"train_runtime": 7372.6454,
"train_tokens_per_second": 6606.667
},
{
"epoch": 2.0849583008339834,
"grad_norm": 0.5862051844596863,
"learning_rate": 1.06365878442266e-05,
"loss": 0.4459,
"num_input_tokens_seen": 48769440,
"step": 4345,
"train_runtime": 7382.0777,
"train_tokens_per_second": 6606.465
},
{
"epoch": 2.0873582528349433,
"grad_norm": 0.7404434680938721,
"learning_rate": 1.0585223133637143e-05,
"loss": 0.4882,
"num_input_tokens_seen": 48827720,
"step": 4350,
"train_runtime": 7391.0584,
"train_tokens_per_second": 6606.323
},
{
"epoch": 2.089758204835903,
"grad_norm": 0.7802624106407166,
"learning_rate": 1.053394941636768e-05,
"loss": 0.5322,
"num_input_tokens_seen": 48879552,
"step": 4355,
"train_runtime": 7398.7138,
"train_tokens_per_second": 6606.493
},
{
"epoch": 2.0921581568368635,
"grad_norm": 0.7315226197242737,
"learning_rate": 1.0482767016083694e-05,
"loss": 0.4515,
"num_input_tokens_seen": 48932848,
"step": 4360,
"train_runtime": 7406.1993,
"train_tokens_per_second": 6607.012
},
{
"epoch": 2.0945581088378233,
"grad_norm": 0.967128574848175,
"learning_rate": 1.0431676255874232e-05,
"loss": 0.5213,
"num_input_tokens_seen": 48989744,
"step": 4365,
"train_runtime": 7414.1239,
"train_tokens_per_second": 6607.624
},
{
"epoch": 2.0969580608387832,
"grad_norm": 0.731792151927948,
"learning_rate": 1.0380677458249852e-05,
"loss": 0.4821,
"num_input_tokens_seen": 49043888,
"step": 4370,
"train_runtime": 7421.75,
"train_tokens_per_second": 6608.13
},
{
"epoch": 2.099358012839743,
"grad_norm": 0.8551647067070007,
"learning_rate": 1.0329770945140618e-05,
"loss": 0.5018,
"num_input_tokens_seen": 49099976,
"step": 4375,
"train_runtime": 7429.6538,
"train_tokens_per_second": 6608.649
},
{
"epoch": 2.101757964840703,
"grad_norm": 0.8482736945152283,
"learning_rate": 1.0278957037894048e-05,
"loss": 0.5266,
"num_input_tokens_seen": 49158168,
"step": 4380,
"train_runtime": 7437.7108,
"train_tokens_per_second": 6609.314
},
{
"epoch": 2.1041579168416633,
"grad_norm": 0.8070186376571655,
"learning_rate": 1.0228236057273063e-05,
"loss": 0.4906,
"num_input_tokens_seen": 49209920,
"step": 4385,
"train_runtime": 7445.1797,
"train_tokens_per_second": 6609.635
},
{
"epoch": 2.106557868842623,
"grad_norm": 0.7493661046028137,
"learning_rate": 1.0177608323454008e-05,
"loss": 0.5067,
"num_input_tokens_seen": 49262384,
"step": 4390,
"train_runtime": 7452.9186,
"train_tokens_per_second": 6609.811
},
{
"epoch": 2.108957820843583,
"grad_norm": 0.7874744534492493,
"learning_rate": 1.0127074156024594e-05,
"loss": 0.4642,
"num_input_tokens_seen": 49315632,
"step": 4395,
"train_runtime": 7460.8462,
"train_tokens_per_second": 6609.925
},
{
"epoch": 2.111357772844543,
"grad_norm": 0.9224854707717896,
"learning_rate": 1.0076633873981883e-05,
"loss": 0.4984,
"num_input_tokens_seen": 49371384,
"step": 4400,
"train_runtime": 7468.9769,
"train_tokens_per_second": 6610.194
},
{
"epoch": 2.1137577248455033,
"grad_norm": 0.8540477156639099,
"learning_rate": 1.0026287795730319e-05,
"loss": 0.4767,
"num_input_tokens_seen": 49426056,
"step": 4405,
"train_runtime": 7477.3027,
"train_tokens_per_second": 6610.145
},
{
"epoch": 2.116157676846463,
"grad_norm": 1.0904680490493774,
"learning_rate": 9.976036239079656e-06,
"loss": 0.491,
"num_input_tokens_seen": 49483160,
"step": 4410,
"train_runtime": 7485.9905,
"train_tokens_per_second": 6610.102
},
{
"epoch": 2.118557628847423,
"grad_norm": 0.5771769881248474,
"learning_rate": 9.925879521242978e-06,
"loss": 0.4566,
"num_input_tokens_seen": 49537568,
"step": 4415,
"train_runtime": 7494.7254,
"train_tokens_per_second": 6609.657
},
{
"epoch": 2.120957580848383,
"grad_norm": 0.765743613243103,
"learning_rate": 9.87581795883473e-06,
"loss": 0.4878,
"num_input_tokens_seen": 49594120,
"step": 4420,
"train_runtime": 7503.2956,
"train_tokens_per_second": 6609.645
},
{
"epoch": 2.123357532849343,
"grad_norm": 0.8731431365013123,
"learning_rate": 9.825851867868646e-06,
"loss": 0.4871,
"num_input_tokens_seen": 49647944,
"step": 4425,
"train_runtime": 7511.5023,
"train_tokens_per_second": 6609.589
},
{
"epoch": 2.125757484850303,
"grad_norm": 0.9633266925811768,
"learning_rate": 9.775981563755835e-06,
"loss": 0.4747,
"num_input_tokens_seen": 49702848,
"step": 4430,
"train_runtime": 7520.7688,
"train_tokens_per_second": 6608.746
},
{
"epoch": 2.128157436851263,
"grad_norm": 0.8484842777252197,
"learning_rate": 9.726207361302716e-06,
"loss": 0.4871,
"num_input_tokens_seen": 49754336,
"step": 4435,
"train_runtime": 7530.7578,
"train_tokens_per_second": 6606.817
},
{
"epoch": 2.130557388852223,
"grad_norm": 0.6933907270431519,
"learning_rate": 9.676529574709104e-06,
"loss": 0.4813,
"num_input_tokens_seen": 49818104,
"step": 4440,
"train_runtime": 7543.2161,
"train_tokens_per_second": 6604.359
},
{
"epoch": 2.132957340853183,
"grad_norm": 0.8864620327949524,
"learning_rate": 9.62694851756616e-06,
"loss": 0.5196,
"num_input_tokens_seen": 49872640,
"step": 4445,
"train_runtime": 7553.5538,
"train_tokens_per_second": 6602.54
},
{
"epoch": 2.135357292854143,
"grad_norm": 0.6627900004386902,
"learning_rate": 9.577464502854432e-06,
"loss": 0.441,
"num_input_tokens_seen": 49929176,
"step": 4450,
"train_runtime": 7563.6578,
"train_tokens_per_second": 6601.194
},
{
"epoch": 2.137757244855103,
"grad_norm": 0.8925694823265076,
"learning_rate": 9.528077842941929e-06,
"loss": 0.4755,
"num_input_tokens_seen": 49984040,
"step": 4455,
"train_runtime": 7574.4434,
"train_tokens_per_second": 6599.038
},
{
"epoch": 2.140157196856063,
"grad_norm": 0.7881972789764404,
"learning_rate": 9.478788849582071e-06,
"loss": 0.4841,
"num_input_tokens_seen": 50036368,
"step": 4460,
"train_runtime": 7584.774,
"train_tokens_per_second": 6596.949
},
{
"epoch": 2.1425571488570228,
"grad_norm": 0.7480626106262207,
"learning_rate": 9.42959783391176e-06,
"loss": 0.4813,
"num_input_tokens_seen": 50091376,
"step": 4465,
"train_runtime": 7594.7525,
"train_tokens_per_second": 6595.524
},
{
"epoch": 2.1449571008579826,
"grad_norm": 0.8503336310386658,
"learning_rate": 9.38050510644944e-06,
"loss": 0.4844,
"num_input_tokens_seen": 50148472,
"step": 4470,
"train_runtime": 7604.4881,
"train_tokens_per_second": 6594.589
},
{
"epoch": 2.147357052858943,
"grad_norm": 0.896701991558075,
"learning_rate": 9.331510977093077e-06,
"loss": 0.4784,
"num_input_tokens_seen": 50202392,
"step": 4475,
"train_runtime": 7614.8511,
"train_tokens_per_second": 6592.695
},
{
"epoch": 2.149757004859903,
"grad_norm": 0.7483791708946228,
"learning_rate": 9.282615755118266e-06,
"loss": 0.4473,
"num_input_tokens_seen": 50262048,
"step": 4480,
"train_runtime": 7625.4864,
"train_tokens_per_second": 6591.324
},
{
"epoch": 2.1521569568608627,
"grad_norm": 0.8028972148895264,
"learning_rate": 9.23381974917622e-06,
"loss": 0.4611,
"num_input_tokens_seen": 50318512,
"step": 4485,
"train_runtime": 7635.9972,
"train_tokens_per_second": 6589.645
},
{
"epoch": 2.1545569088618226,
"grad_norm": 0.7019287347793579,
"learning_rate": 9.185123267291881e-06,
"loss": 0.4622,
"num_input_tokens_seen": 50371472,
"step": 4490,
"train_runtime": 7645.9049,
"train_tokens_per_second": 6588.033
},
{
"epoch": 2.156956860862783,
"grad_norm": 0.849296510219574,
"learning_rate": 9.136526616861921e-06,
"loss": 0.501,
"num_input_tokens_seen": 50425888,
"step": 4495,
"train_runtime": 7656.303,
"train_tokens_per_second": 6586.193
},
{
"epoch": 2.159356812863743,
"grad_norm": 0.5608788728713989,
"learning_rate": 9.088030104652829e-06,
"loss": 0.4828,
"num_input_tokens_seen": 50484136,
"step": 4500,
"train_runtime": 7666.3853,
"train_tokens_per_second": 6585.129
},
{
"epoch": 2.1617567648647027,
"grad_norm": 0.7533180713653564,
"learning_rate": 9.03963403679899e-06,
"loss": 0.463,
"num_input_tokens_seen": 50540376,
"step": 4505,
"train_runtime": 7676.9709,
"train_tokens_per_second": 6583.375
},
{
"epoch": 2.1641567168656626,
"grad_norm": 0.8343721032142639,
"learning_rate": 8.99133871880071e-06,
"loss": 0.4948,
"num_input_tokens_seen": 50594968,
"step": 4510,
"train_runtime": 7687.0369,
"train_tokens_per_second": 6581.856
},
{
"epoch": 2.1665566688666225,
"grad_norm": 1.0494121313095093,
"learning_rate": 8.943144455522314e-06,
"loss": 0.4919,
"num_input_tokens_seen": 50649296,
"step": 4515,
"train_runtime": 7697.4192,
"train_tokens_per_second": 6580.036
},
{
"epoch": 2.168956620867583,
"grad_norm": 0.8824997544288635,
"learning_rate": 8.895051551190248e-06,
"loss": 0.4279,
"num_input_tokens_seen": 50706696,
"step": 4520,
"train_runtime": 7707.9222,
"train_tokens_per_second": 6578.517
},
{
"epoch": 2.1713565728685427,
"grad_norm": 0.8693490028381348,
"learning_rate": 8.847060309391084e-06,
"loss": 0.4776,
"num_input_tokens_seen": 50758984,
"step": 4525,
"train_runtime": 7717.5559,
"train_tokens_per_second": 6577.08
},
{
"epoch": 2.1737565248695025,
"grad_norm": 0.6775808334350586,
"learning_rate": 8.799171033069695e-06,
"loss": 0.4821,
"num_input_tokens_seen": 50812536,
"step": 4530,
"train_runtime": 7727.2348,
"train_tokens_per_second": 6575.772
},
{
"epoch": 2.1761564768704624,
"grad_norm": 0.7019457817077637,
"learning_rate": 8.75138402452725e-06,
"loss": 0.4698,
"num_input_tokens_seen": 50867192,
"step": 4535,
"train_runtime": 7737.0022,
"train_tokens_per_second": 6574.535
},
{
"epoch": 2.1785564288714228,
"grad_norm": 0.6866047978401184,
"learning_rate": 8.7036995854194e-06,
"loss": 0.4612,
"num_input_tokens_seen": 50925384,
"step": 4540,
"train_runtime": 7746.4582,
"train_tokens_per_second": 6574.022
},
{
"epoch": 2.1809563808723826,
"grad_norm": 0.605133593082428,
"learning_rate": 8.656118016754292e-06,
"loss": 0.4939,
"num_input_tokens_seen": 50983216,
"step": 4545,
"train_runtime": 7757.2379,
"train_tokens_per_second": 6572.341
},
{
"epoch": 2.1833563328733425,
"grad_norm": 0.6981828212738037,
"learning_rate": 8.608639618890702e-06,
"loss": 0.5204,
"num_input_tokens_seen": 51038664,
"step": 4550,
"train_runtime": 7767.547,
"train_tokens_per_second": 6570.757
},
{
"epoch": 2.1857562848743024,
"grad_norm": 0.8705071806907654,
"learning_rate": 8.561264691536172e-06,
"loss": 0.4907,
"num_input_tokens_seen": 51096648,
"step": 4555,
"train_runtime": 7777.3381,
"train_tokens_per_second": 6569.94
},
{
"epoch": 2.1881562368752627,
"grad_norm": 0.7312107682228088,
"learning_rate": 8.51399353374506e-06,
"loss": 0.5114,
"num_input_tokens_seen": 51152456,
"step": 4560,
"train_runtime": 7787.2126,
"train_tokens_per_second": 6568.776
},
{
"epoch": 2.1905561888762226,
"grad_norm": 0.8138951063156128,
"learning_rate": 8.466826443916667e-06,
"loss": 0.4822,
"num_input_tokens_seen": 51207840,
"step": 4565,
"train_runtime": 7796.4469,
"train_tokens_per_second": 6568.1
},
{
"epoch": 2.1929561408771825,
"grad_norm": 0.6703912019729614,
"learning_rate": 8.4197637197934e-06,
"loss": 0.4849,
"num_input_tokens_seen": 51261448,
"step": 4570,
"train_runtime": 7806.184,
"train_tokens_per_second": 6566.774
},
{
"epoch": 2.1953560928781424,
"grad_norm": 0.9687227010726929,
"learning_rate": 8.37280565845884e-06,
"loss": 0.467,
"num_input_tokens_seen": 51317720,
"step": 4575,
"train_runtime": 7816.3502,
"train_tokens_per_second": 6565.433
},
{
"epoch": 2.1977560448791023,
"grad_norm": 0.8064000606536865,
"learning_rate": 8.325952556335878e-06,
"loss": 0.4851,
"num_input_tokens_seen": 51372576,
"step": 4580,
"train_runtime": 7825.9422,
"train_tokens_per_second": 6564.395
},
{
"epoch": 2.2001559968800626,
"grad_norm": 0.8729395866394043,
"learning_rate": 8.279204709184843e-06,
"loss": 0.5434,
"num_input_tokens_seen": 51422552,
"step": 4585,
"train_runtime": 7835.905,
"train_tokens_per_second": 6562.427
},
{
"epoch": 2.2025559488810225,
"grad_norm": 0.898769199848175,
"learning_rate": 8.232562412101674e-06,
"loss": 0.5217,
"num_input_tokens_seen": 51477960,
"step": 4590,
"train_runtime": 7846.3182,
"train_tokens_per_second": 6560.779
},
{
"epoch": 2.2049559008819823,
"grad_norm": 0.9951900243759155,
"learning_rate": 8.186025959515995e-06,
"loss": 0.4839,
"num_input_tokens_seen": 51537952,
"step": 4595,
"train_runtime": 7856.7191,
"train_tokens_per_second": 6559.73
},
{
"epoch": 2.2073558528829422,
"grad_norm": 0.8248569965362549,
"learning_rate": 8.139595645189282e-06,
"loss": 0.4497,
"num_input_tokens_seen": 51592688,
"step": 4600,
"train_runtime": 7866.4031,
"train_tokens_per_second": 6558.612
},
{
"epoch": 2.209755804883902,
"grad_norm": 0.8907241821289062,
"learning_rate": 8.09327176221305e-06,
"loss": 0.4774,
"num_input_tokens_seen": 51645280,
"step": 4605,
"train_runtime": 7876.1364,
"train_tokens_per_second": 6557.185
},
{
"epoch": 2.2121557568848624,
"grad_norm": 0.6718706488609314,
"learning_rate": 8.047054603006931e-06,
"loss": 0.5308,
"num_input_tokens_seen": 51698536,
"step": 4610,
"train_runtime": 7886.3852,
"train_tokens_per_second": 6555.416
},
{
"epoch": 2.2145557088858223,
"grad_norm": 0.6906898617744446,
"learning_rate": 8.000944459316864e-06,
"loss": 0.4422,
"num_input_tokens_seen": 51756256,
"step": 4615,
"train_runtime": 7897.4196,
"train_tokens_per_second": 6553.565
},
{
"epoch": 2.216955660886782,
"grad_norm": 0.7952353954315186,
"learning_rate": 7.954941622213272e-06,
"loss": 0.5049,
"num_input_tokens_seen": 51813256,
"step": 4620,
"train_runtime": 7907.2916,
"train_tokens_per_second": 6552.592
},
{
"epoch": 2.219355612887742,
"grad_norm": 0.7251629829406738,
"learning_rate": 7.909046382089203e-06,
"loss": 0.4541,
"num_input_tokens_seen": 51867560,
"step": 4625,
"train_runtime": 7917.3897,
"train_tokens_per_second": 6551.093
},
{
"epoch": 2.2217555648887024,
"grad_norm": 0.7001914978027344,
"learning_rate": 7.863259028658485e-06,
"loss": 0.4918,
"num_input_tokens_seen": 51920280,
"step": 4630,
"train_runtime": 7927.2271,
"train_tokens_per_second": 6549.614
},
{
"epoch": 2.2241555168896623,
"grad_norm": 0.722760021686554,
"learning_rate": 7.817579850953904e-06,
"loss": 0.4356,
"num_input_tokens_seen": 51975984,
"step": 4635,
"train_runtime": 7939.1498,
"train_tokens_per_second": 6546.795
},
{
"epoch": 2.226555468890622,
"grad_norm": 0.8394641876220703,
"learning_rate": 7.77200913732542e-06,
"loss": 0.5007,
"num_input_tokens_seen": 52031784,
"step": 4640,
"train_runtime": 7948.6393,
"train_tokens_per_second": 6545.999
},
{
"epoch": 2.228955420891582,
"grad_norm": 0.8581427335739136,
"learning_rate": 7.72654717543828e-06,
"loss": 0.4482,
"num_input_tokens_seen": 52086728,
"step": 4645,
"train_runtime": 7958.8142,
"train_tokens_per_second": 6544.534
},
{
"epoch": 2.2313553728925424,
"grad_norm": 0.8242650032043457,
"learning_rate": 7.681194252271242e-06,
"loss": 0.4219,
"num_input_tokens_seen": 52143544,
"step": 4650,
"train_runtime": 7968.3907,
"train_tokens_per_second": 6543.799
},
{
"epoch": 2.2337553248935023,
"grad_norm": 0.7680621147155762,
"learning_rate": 7.635950654114782e-06,
"loss": 0.4771,
"num_input_tokens_seen": 52203016,
"step": 4655,
"train_runtime": 7978.6952,
"train_tokens_per_second": 6542.801
},
{
"epoch": 2.236155276894462,
"grad_norm": 0.6597278118133545,
"learning_rate": 7.5908166665692285e-06,
"loss": 0.4791,
"num_input_tokens_seen": 52258320,
"step": 4660,
"train_runtime": 7988.4947,
"train_tokens_per_second": 6541.698
},
{
"epoch": 2.238555228895422,
"grad_norm": 0.8721866011619568,
"learning_rate": 7.545792574543003e-06,
"loss": 0.4895,
"num_input_tokens_seen": 52313336,
"step": 4665,
"train_runtime": 7998.6775,
"train_tokens_per_second": 6540.248
},
{
"epoch": 2.240955180896382,
"grad_norm": 1.1070098876953125,
"learning_rate": 7.500878662250818e-06,
"loss": 0.5019,
"num_input_tokens_seen": 52366728,
"step": 4670,
"train_runtime": 8008.484,
"train_tokens_per_second": 6538.906
},
{
"epoch": 2.2433551328973422,
"grad_norm": 0.6862952709197998,
"learning_rate": 7.456075213211883e-06,
"loss": 0.4622,
"num_input_tokens_seen": 52423136,
"step": 4675,
"train_runtime": 8018.9005,
"train_tokens_per_second": 6537.447
},
{
"epoch": 2.245755084898302,
"grad_norm": 0.7063257098197937,
"learning_rate": 7.411382510248091e-06,
"loss": 0.4422,
"num_input_tokens_seen": 52480088,
"step": 4680,
"train_runtime": 8028.8285,
"train_tokens_per_second": 6536.456
},
{
"epoch": 2.248155036899262,
"grad_norm": 0.7958875894546509,
"learning_rate": 7.366800835482246e-06,
"loss": 0.4774,
"num_input_tokens_seen": 52538696,
"step": 4685,
"train_runtime": 8038.5124,
"train_tokens_per_second": 6535.873
},
{
"epoch": 2.250554988900222,
"grad_norm": 0.7092862725257874,
"learning_rate": 7.3223304703363135e-06,
"loss": 0.4537,
"num_input_tokens_seen": 52598800,
"step": 4690,
"train_runtime": 8049.8427,
"train_tokens_per_second": 6534.14
},
{
"epoch": 2.2529549409011818,
"grad_norm": 0.6956859230995178,
"learning_rate": 7.277971695529592e-06,
"loss": 0.4435,
"num_input_tokens_seen": 52657280,
"step": 4695,
"train_runtime": 8060.2392,
"train_tokens_per_second": 6532.967
},
{
"epoch": 2.255354892902142,
"grad_norm": 0.6482681632041931,
"learning_rate": 7.233724791076968e-06,
"loss": 0.455,
"num_input_tokens_seen": 52713952,
"step": 4700,
"train_runtime": 8070.1937,
"train_tokens_per_second": 6531.931
},
{
"epoch": 2.257754844903102,
"grad_norm": 0.7593861222267151,
"learning_rate": 7.189590036287167e-06,
"loss": 0.4506,
"num_input_tokens_seen": 52772688,
"step": 4705,
"train_runtime": 8080.8866,
"train_tokens_per_second": 6530.557
},
{
"epoch": 2.260154796904062,
"grad_norm": 0.8229504823684692,
"learning_rate": 7.145567709760942e-06,
"loss": 0.4944,
"num_input_tokens_seen": 52829984,
"step": 4710,
"train_runtime": 8091.297,
"train_tokens_per_second": 6529.236
},
{
"epoch": 2.2625547489050217,
"grad_norm": 0.7563186287879944,
"learning_rate": 7.1016580893893514e-06,
"loss": 0.485,
"num_input_tokens_seen": 52888368,
"step": 4715,
"train_runtime": 8102.4796,
"train_tokens_per_second": 6527.43
},
{
"epoch": 2.264954700905982,
"grad_norm": 0.8408580422401428,
"learning_rate": 7.057861452352005e-06,
"loss": 0.4722,
"num_input_tokens_seen": 52945664,
"step": 4720,
"train_runtime": 8112.5815,
"train_tokens_per_second": 6526.364
},
{
"epoch": 2.267354652906942,
"grad_norm": 0.791147768497467,
"learning_rate": 7.014178075115305e-06,
"loss": 0.5043,
"num_input_tokens_seen": 53001096,
"step": 4725,
"train_runtime": 8122.5542,
"train_tokens_per_second": 6525.176
},
{
"epoch": 2.269754604907902,
"grad_norm": 0.8713123798370361,
"learning_rate": 6.9706082334306895e-06,
"loss": 0.4978,
"num_input_tokens_seen": 53054936,
"step": 4730,
"train_runtime": 8132.2978,
"train_tokens_per_second": 6523.979
},
{
"epoch": 2.2721545569088617,
"grad_norm": 0.9158002734184265,
"learning_rate": 6.927152202332898e-06,
"loss": 0.4493,
"num_input_tokens_seen": 53115032,
"step": 4735,
"train_runtime": 8142.6092,
"train_tokens_per_second": 6523.097
},
{
"epoch": 2.274554508909822,
"grad_norm": 0.8470547795295715,
"learning_rate": 6.883810256138268e-06,
"loss": 0.5082,
"num_input_tokens_seen": 53168048,
"step": 4740,
"train_runtime": 8152.4189,
"train_tokens_per_second": 6521.751
},
{
"epoch": 2.276954460910782,
"grad_norm": 0.8152704834938049,
"learning_rate": 6.8405826684429495e-06,
"loss": 0.4622,
"num_input_tokens_seen": 53228112,
"step": 4745,
"train_runtime": 8163.4113,
"train_tokens_per_second": 6520.327
},
{
"epoch": 2.279354412911742,
"grad_norm": 1.1918436288833618,
"learning_rate": 6.7974697121212044e-06,
"loss": 0.475,
"num_input_tokens_seen": 53282056,
"step": 4750,
"train_runtime": 8172.6885,
"train_tokens_per_second": 6519.526
},
{
"epoch": 2.2817543649127017,
"grad_norm": 0.8063285946846008,
"learning_rate": 6.754471659323708e-06,
"loss": 0.4444,
"num_input_tokens_seen": 53342728,
"step": 4755,
"train_runtime": 8181.7917,
"train_tokens_per_second": 6519.688
},
{
"epoch": 2.2841543169136616,
"grad_norm": 0.8364700078964233,
"learning_rate": 6.711588781475786e-06,
"loss": 0.4833,
"num_input_tokens_seen": 53397656,
"step": 4760,
"train_runtime": 8189.9068,
"train_tokens_per_second": 6519.935
},
{
"epoch": 2.286554268914622,
"grad_norm": 0.8302350640296936,
"learning_rate": 6.668821349275714e-06,
"loss": 0.4532,
"num_input_tokens_seen": 53452736,
"step": 4765,
"train_runtime": 8198.4472,
"train_tokens_per_second": 6519.861
},
{
"epoch": 2.2889542209155818,
"grad_norm": 0.7638778686523438,
"learning_rate": 6.626169632693041e-06,
"loss": 0.4679,
"num_input_tokens_seen": 53510640,
"step": 4770,
"train_runtime": 8207.0649,
"train_tokens_per_second": 6520.07
},
{
"epoch": 2.2913541729165416,
"grad_norm": 0.6307675242424011,
"learning_rate": 6.5836339009668564e-06,
"loss": 0.4336,
"num_input_tokens_seen": 53568536,
"step": 4775,
"train_runtime": 8216.2863,
"train_tokens_per_second": 6519.799
},
{
"epoch": 2.2937541249175015,
"grad_norm": 0.7008303999900818,
"learning_rate": 6.541214422604078e-06,
"loss": 0.4903,
"num_input_tokens_seen": 53623272,
"step": 4780,
"train_runtime": 8224.518,
"train_tokens_per_second": 6519.929
},
{
"epoch": 2.2961540769184614,
"grad_norm": 0.7568659782409668,
"learning_rate": 6.49891146537778e-06,
"loss": 0.4665,
"num_input_tokens_seen": 53680840,
"step": 4785,
"train_runtime": 8233.1619,
"train_tokens_per_second": 6520.076
},
{
"epoch": 2.2985540289194217,
"grad_norm": 0.7729014158248901,
"learning_rate": 6.456725296325511e-06,
"loss": 0.4648,
"num_input_tokens_seen": 53736888,
"step": 4790,
"train_runtime": 8241.812,
"train_tokens_per_second": 6520.033
},
{
"epoch": 2.3009539809203816,
"grad_norm": 0.8767671585083008,
"learning_rate": 6.414656181747578e-06,
"loss": 0.4426,
"num_input_tokens_seen": 53793888,
"step": 4795,
"train_runtime": 8250.382,
"train_tokens_per_second": 6520.169
},
{
"epoch": 2.3033539329213415,
"grad_norm": 0.5542830228805542,
"learning_rate": 6.3727043872053775e-06,
"loss": 0.4942,
"num_input_tokens_seen": 53853120,
"step": 4800,
"train_runtime": 8259.364,
"train_tokens_per_second": 6520.25
},
{
"epoch": 2.3057538849223014,
"grad_norm": 0.677183985710144,
"learning_rate": 6.330870177519749e-06,
"loss": 0.4601,
"num_input_tokens_seen": 53911008,
"step": 4805,
"train_runtime": 8268.0332,
"train_tokens_per_second": 6520.415
},
{
"epoch": 2.3081538369232617,
"grad_norm": 0.6295929551124573,
"learning_rate": 6.2891538167692525e-06,
"loss": 0.4975,
"num_input_tokens_seen": 53970856,
"step": 4810,
"train_runtime": 8276.673,
"train_tokens_per_second": 6520.839
},
{
"epoch": 2.3105537889242216,
"grad_norm": 0.6823136806488037,
"learning_rate": 6.247555568288524e-06,
"loss": 0.5108,
"num_input_tokens_seen": 54024760,
"step": 4815,
"train_runtime": 8284.8494,
"train_tokens_per_second": 6520.91
},
{
"epoch": 2.3129537409251815,
"grad_norm": 1.1955187320709229,
"learning_rate": 6.2060756946666385e-06,
"loss": 0.4972,
"num_input_tokens_seen": 54079992,
"step": 4820,
"train_runtime": 8293.4716,
"train_tokens_per_second": 6520.791
},
{
"epoch": 2.3153536929261413,
"grad_norm": 0.5726960301399231,
"learning_rate": 6.164714457745416e-06,
"loss": 0.4765,
"num_input_tokens_seen": 54137056,
"step": 4825,
"train_runtime": 8302.0452,
"train_tokens_per_second": 6520.93
},
{
"epoch": 2.3177536449271017,
"grad_norm": 0.8014964461326599,
"learning_rate": 6.123472118617779e-06,
"loss": 0.502,
"num_input_tokens_seen": 54187216,
"step": 4830,
"train_runtime": 8309.8904,
"train_tokens_per_second": 6520.81
},
{
"epoch": 2.3201535969280616,
"grad_norm": 0.6722724437713623,
"learning_rate": 6.082348937626103e-06,
"loss": 0.5223,
"num_input_tokens_seen": 54243408,
"step": 4835,
"train_runtime": 8318.422,
"train_tokens_per_second": 6520.877
},
{
"epoch": 2.3225535489290214,
"grad_norm": 0.7219895720481873,
"learning_rate": 6.041345174360602e-06,
"loss": 0.4379,
"num_input_tokens_seen": 54300888,
"step": 4840,
"train_runtime": 8327.0808,
"train_tokens_per_second": 6520.999
},
{
"epoch": 2.3249535009299813,
"grad_norm": 0.5452620983123779,
"learning_rate": 6.0004610876576385e-06,
"loss": 0.425,
"num_input_tokens_seen": 54359080,
"step": 4845,
"train_runtime": 8335.9015,
"train_tokens_per_second": 6521.08
},
{
"epoch": 2.327353452930941,
"grad_norm": 0.7828608751296997,
"learning_rate": 5.9596969355981165e-06,
"loss": 0.4783,
"num_input_tokens_seen": 54414784,
"step": 4850,
"train_runtime": 8343.8457,
"train_tokens_per_second": 6521.547
},
{
"epoch": 2.3297534049319015,
"grad_norm": 0.7745143175125122,
"learning_rate": 5.9190529755058786e-06,
"loss": 0.4625,
"num_input_tokens_seen": 54469544,
"step": 4855,
"train_runtime": 8352.9742,
"train_tokens_per_second": 6520.976
},
{
"epoch": 2.3321533569328614,
"grad_norm": 0.7965600490570068,
"learning_rate": 5.878529463946028e-06,
"loss": 0.4517,
"num_input_tokens_seen": 54525088,
"step": 4860,
"train_runtime": 8362.2759,
"train_tokens_per_second": 6520.365
},
{
"epoch": 2.3345533089338213,
"grad_norm": 0.7234916090965271,
"learning_rate": 5.838126656723353e-06,
"loss": 0.4848,
"num_input_tokens_seen": 54581656,
"step": 4865,
"train_runtime": 8372.4358,
"train_tokens_per_second": 6519.209
},
{
"epoch": 2.336953260934781,
"grad_norm": 0.8496655225753784,
"learning_rate": 5.797844808880681e-06,
"loss": 0.4535,
"num_input_tokens_seen": 54633656,
"step": 4870,
"train_runtime": 8381.8667,
"train_tokens_per_second": 6518.077
},
{
"epoch": 2.339353212935741,
"grad_norm": 0.8986937999725342,
"learning_rate": 5.757684174697306e-06,
"loss": 0.5149,
"num_input_tokens_seen": 54688552,
"step": 4875,
"train_runtime": 8392.2449,
"train_tokens_per_second": 6516.558
},
{
"epoch": 2.3417531649367014,
"grad_norm": 0.8993620276451111,
"learning_rate": 5.717645007687333e-06,
"loss": 0.4811,
"num_input_tokens_seen": 54745736,
"step": 4880,
"train_runtime": 8401.6978,
"train_tokens_per_second": 6516.032
},
{
"epoch": 2.3441531169376613,
"grad_norm": 0.8470688462257385,
"learning_rate": 5.677727560598117e-06,
"loss": 0.4531,
"num_input_tokens_seen": 54801056,
"step": 4885,
"train_runtime": 8411.9299,
"train_tokens_per_second": 6514.683
},
{
"epoch": 2.346553068938621,
"grad_norm": 0.7177883982658386,
"learning_rate": 5.637932085408665e-06,
"loss": 0.428,
"num_input_tokens_seen": 54862792,
"step": 4890,
"train_runtime": 8422.5464,
"train_tokens_per_second": 6513.801
},
{
"epoch": 2.348953020939581,
"grad_norm": 0.9984344840049744,
"learning_rate": 5.598258833328024e-06,
"loss": 0.5082,
"num_input_tokens_seen": 54917120,
"step": 4895,
"train_runtime": 8432.5181,
"train_tokens_per_second": 6512.541
},
{
"epoch": 2.3513529729405414,
"grad_norm": 0.7532204985618591,
"learning_rate": 5.558708054793702e-06,
"loss": 0.4747,
"num_input_tokens_seen": 54970952,
"step": 4900,
"train_runtime": 8442.5045,
"train_tokens_per_second": 6511.214
},
{
"epoch": 2.3537529249415012,
"grad_norm": 0.9301844835281372,
"learning_rate": 5.519279999470114e-06,
"loss": 0.4653,
"num_input_tokens_seen": 55030344,
"step": 4905,
"train_runtime": 8453.3379,
"train_tokens_per_second": 6509.895
},
{
"epoch": 2.356152876942461,
"grad_norm": 0.7001831531524658,
"learning_rate": 5.47997491624696e-06,
"loss": 0.4505,
"num_input_tokens_seen": 55089240,
"step": 4910,
"train_runtime": 8463.8354,
"train_tokens_per_second": 6508.78
},
{
"epoch": 2.358552828943421,
"grad_norm": 1.0007083415985107,
"learning_rate": 5.440793053237703e-06,
"loss": 0.4951,
"num_input_tokens_seen": 55145288,
"step": 4915,
"train_runtime": 8472.6075,
"train_tokens_per_second": 6508.656
},
{
"epoch": 2.3609527809443813,
"grad_norm": 0.807292103767395,
"learning_rate": 5.401734657777949e-06,
"loss": 0.4555,
"num_input_tokens_seen": 55202104,
"step": 4920,
"train_runtime": 8481.8958,
"train_tokens_per_second": 6508.227
},
{
"epoch": 2.363352732945341,
"grad_norm": 0.8415015339851379,
"learning_rate": 5.362799976423946e-06,
"loss": 0.4936,
"num_input_tokens_seen": 55259704,
"step": 4925,
"train_runtime": 8490.9011,
"train_tokens_per_second": 6508.108
},
{
"epoch": 2.365752684946301,
"grad_norm": 0.6624288558959961,
"learning_rate": 5.323989254950973e-06,
"loss": 0.4645,
"num_input_tokens_seen": 55317744,
"step": 4930,
"train_runtime": 8500.053,
"train_tokens_per_second": 6507.929
},
{
"epoch": 2.368152636947261,
"grad_norm": 0.8374559283256531,
"learning_rate": 5.285302738351813e-06,
"loss": 0.4797,
"num_input_tokens_seen": 55372296,
"step": 4935,
"train_runtime": 8507.8541,
"train_tokens_per_second": 6508.374
},
{
"epoch": 2.370552588948221,
"grad_norm": 0.5884356498718262,
"learning_rate": 5.246740670835227e-06,
"loss": 0.4606,
"num_input_tokens_seen": 55433904,
"step": 4940,
"train_runtime": 8517.3387,
"train_tokens_per_second": 6508.36
},
{
"epoch": 2.372952540949181,
"grad_norm": 0.7946999669075012,
"learning_rate": 5.208303295824368e-06,
"loss": 0.4901,
"num_input_tokens_seen": 55489480,
"step": 4945,
"train_runtime": 8525.7706,
"train_tokens_per_second": 6508.442
},
{
"epoch": 2.375352492950141,
"grad_norm": 0.8008665442466736,
"learning_rate": 5.16999085595527e-06,
"loss": 0.4489,
"num_input_tokens_seen": 55548432,
"step": 4950,
"train_runtime": 8534.5861,
"train_tokens_per_second": 6508.626
},
{
"epoch": 2.377752444951101,
"grad_norm": 0.6131346225738525,
"learning_rate": 5.1318035930753295e-06,
"loss": 0.4751,
"num_input_tokens_seen": 55606952,
"step": 4955,
"train_runtime": 8544.472,
"train_tokens_per_second": 6507.945
},
{
"epoch": 2.380152396952061,
"grad_norm": 0.6987022757530212,
"learning_rate": 5.09374174824174e-06,
"loss": 0.4716,
"num_input_tokens_seen": 55665912,
"step": 4960,
"train_runtime": 8553.5875,
"train_tokens_per_second": 6507.902
},
{
"epoch": 2.3825523489530207,
"grad_norm": 0.9554920792579651,
"learning_rate": 5.0558055617200205e-06,
"loss": 0.4208,
"num_input_tokens_seen": 55719624,
"step": 4965,
"train_runtime": 8561.4317,
"train_tokens_per_second": 6508.213
},
{
"epoch": 2.384952300953981,
"grad_norm": 0.7300603985786438,
"learning_rate": 5.0179952729824395e-06,
"loss": 0.4832,
"num_input_tokens_seen": 55774472,
"step": 4970,
"train_runtime": 8570.1123,
"train_tokens_per_second": 6508.021
},
{
"epoch": 2.387352252954941,
"grad_norm": 0.8243890404701233,
"learning_rate": 4.980311120706569e-06,
"loss": 0.5135,
"num_input_tokens_seen": 55826392,
"step": 4975,
"train_runtime": 8578.1037,
"train_tokens_per_second": 6508.011
},
{
"epoch": 2.389752204955901,
"grad_norm": 0.7249002456665039,
"learning_rate": 4.942753342773718e-06,
"loss": 0.5443,
"num_input_tokens_seen": 55880968,
"step": 4980,
"train_runtime": 8586.3873,
"train_tokens_per_second": 6508.088
},
{
"epoch": 2.3921521569568607,
"grad_norm": 0.883586585521698,
"learning_rate": 4.90532217626746e-06,
"loss": 0.4719,
"num_input_tokens_seen": 55933504,
"step": 4985,
"train_runtime": 8594.9139,
"train_tokens_per_second": 6507.745
},
{
"epoch": 2.394552108957821,
"grad_norm": 0.9183365702629089,
"learning_rate": 4.868017857472157e-06,
"loss": 0.4971,
"num_input_tokens_seen": 55986736,
"step": 4990,
"train_runtime": 8603.0537,
"train_tokens_per_second": 6507.775
},
{
"epoch": 2.396952060958781,
"grad_norm": 0.9093974232673645,
"learning_rate": 4.830840621871416e-06,
"loss": 0.471,
"num_input_tokens_seen": 56042472,
"step": 4995,
"train_runtime": 8612.3964,
"train_tokens_per_second": 6507.187
},
{
"epoch": 2.3993520129597408,
"grad_norm": 0.8658146858215332,
"learning_rate": 4.793790704146639e-06,
"loss": 0.5096,
"num_input_tokens_seen": 56094608,
"step": 5000,
"train_runtime": 8620.6801,
"train_tokens_per_second": 6506.982
},
{
"epoch": 2.4017519649607006,
"grad_norm": 0.881760835647583,
"learning_rate": 4.756868338175552e-06,
"loss": 0.4545,
"num_input_tokens_seen": 56152192,
"step": 5005,
"train_runtime": 8628.9033,
"train_tokens_per_second": 6507.454
},
{
"epoch": 2.404151916961661,
"grad_norm": 0.6396927833557129,
"learning_rate": 4.7200737570306765e-06,
"loss": 0.482,
"num_input_tokens_seen": 56209072,
"step": 5010,
"train_runtime": 8637.3318,
"train_tokens_per_second": 6507.689
},
{
"epoch": 2.406551868962621,
"grad_norm": 0.7207968831062317,
"learning_rate": 4.683407192977923e-06,
"loss": 0.4701,
"num_input_tokens_seen": 56265496,
"step": 5015,
"train_runtime": 8645.9013,
"train_tokens_per_second": 6507.765
},
{
"epoch": 2.4089518209635807,
"grad_norm": 0.6970353126525879,
"learning_rate": 4.646868877475083e-06,
"loss": 0.4906,
"num_input_tokens_seen": 56324336,
"step": 5020,
"train_runtime": 8654.8609,
"train_tokens_per_second": 6507.827
},
{
"epoch": 2.4113517729645406,
"grad_norm": 0.6664267182350159,
"learning_rate": 4.610459041170376e-06,
"loss": 0.4497,
"num_input_tokens_seen": 56387160,
"step": 5025,
"train_runtime": 8664.2456,
"train_tokens_per_second": 6508.029
},
{
"epoch": 2.4137517249655005,
"grad_norm": 0.6361657977104187,
"learning_rate": 4.574177913900992e-06,
"loss": 0.4473,
"num_input_tokens_seen": 56450040,
"step": 5030,
"train_runtime": 8672.7824,
"train_tokens_per_second": 6508.873
},
{
"epoch": 2.416151676966461,
"grad_norm": 0.9782693386077881,
"learning_rate": 4.538025724691647e-06,
"loss": 0.5403,
"num_input_tokens_seen": 56509192,
"step": 5035,
"train_runtime": 8680.8979,
"train_tokens_per_second": 6509.602
},
{
"epoch": 2.4185516289674207,
"grad_norm": 1.0109143257141113,
"learning_rate": 4.502002701753149e-06,
"loss": 0.4535,
"num_input_tokens_seen": 56564168,
"step": 5040,
"train_runtime": 8689.3056,
"train_tokens_per_second": 6509.63
},
{
"epoch": 2.4209515809683806,
"grad_norm": 0.8760951161384583,
"learning_rate": 4.4661090724809286e-06,
"loss": 0.4666,
"num_input_tokens_seen": 56619720,
"step": 5045,
"train_runtime": 8698.0152,
"train_tokens_per_second": 6509.499
},
{
"epoch": 2.4233515329693405,
"grad_norm": 0.879936933517456,
"learning_rate": 4.430345063453614e-06,
"loss": 0.4685,
"num_input_tokens_seen": 56674064,
"step": 5050,
"train_runtime": 8707.0335,
"train_tokens_per_second": 6508.998
},
{
"epoch": 2.4257514849703004,
"grad_norm": 0.5749469995498657,
"learning_rate": 4.394710900431628e-06,
"loss": 0.5077,
"num_input_tokens_seen": 56730176,
"step": 5055,
"train_runtime": 8715.7157,
"train_tokens_per_second": 6508.952
},
{
"epoch": 2.4281514369712607,
"grad_norm": 0.670002818107605,
"learning_rate": 4.359206808355715e-06,
"loss": 0.4711,
"num_input_tokens_seen": 56786912,
"step": 5060,
"train_runtime": 8724.2214,
"train_tokens_per_second": 6509.109
},
{
"epoch": 2.4305513889722206,
"grad_norm": 0.8267392516136169,
"learning_rate": 4.32383301134556e-06,
"loss": 0.468,
"num_input_tokens_seen": 56846864,
"step": 5065,
"train_runtime": 8733.1875,
"train_tokens_per_second": 6509.292
},
{
"epoch": 2.4329513409731804,
"grad_norm": 0.9042259454727173,
"learning_rate": 4.288589732698365e-06,
"loss": 0.4722,
"num_input_tokens_seen": 56903624,
"step": 5070,
"train_runtime": 8741.802,
"train_tokens_per_second": 6509.37
},
{
"epoch": 2.4353512929741403,
"grad_norm": 0.9303114414215088,
"learning_rate": 4.253477194887423e-06,
"loss": 0.4879,
"num_input_tokens_seen": 56961168,
"step": 5075,
"train_runtime": 8750.9039,
"train_tokens_per_second": 6509.175
},
{
"epoch": 2.4377512449751007,
"grad_norm": 0.8733497858047485,
"learning_rate": 4.218495619560725e-06,
"loss": 0.4762,
"num_input_tokens_seen": 57017760,
"step": 5080,
"train_runtime": 8759.4851,
"train_tokens_per_second": 6509.259
},
{
"epoch": 2.4401511969760605,
"grad_norm": 0.8203326463699341,
"learning_rate": 4.1836452275395624e-06,
"loss": 0.4934,
"num_input_tokens_seen": 57072760,
"step": 5085,
"train_runtime": 8768.1106,
"train_tokens_per_second": 6509.129
},
{
"epoch": 2.4425511489770204,
"grad_norm": 1.0363794565200806,
"learning_rate": 4.148926238817141e-06,
"loss": 0.4518,
"num_input_tokens_seen": 57128592,
"step": 5090,
"train_runtime": 8776.3031,
"train_tokens_per_second": 6509.414
},
{
"epoch": 2.4449511009779803,
"grad_norm": 0.9167368412017822,
"learning_rate": 4.114338872557175e-06,
"loss": 0.4542,
"num_input_tokens_seen": 57184720,
"step": 5095,
"train_runtime": 8784.8429,
"train_tokens_per_second": 6509.476
},
{
"epoch": 2.4473510529789406,
"grad_norm": 0.662429928779602,
"learning_rate": 4.079883347092506e-06,
"loss": 0.4811,
"num_input_tokens_seen": 57248888,
"step": 5100,
"train_runtime": 8794.2311,
"train_tokens_per_second": 6509.823
},
{
"epoch": 2.4497510049799005,
"grad_norm": 0.6756502389907837,
"learning_rate": 4.045559879923747e-06,
"loss": 0.454,
"num_input_tokens_seen": 57307744,
"step": 5105,
"train_runtime": 8803.0414,
"train_tokens_per_second": 6509.994
},
{
"epoch": 2.4521509569808604,
"grad_norm": 0.7121127843856812,
"learning_rate": 4.011368687717867e-06,
"loss": 0.4506,
"num_input_tokens_seen": 57363824,
"step": 5110,
"train_runtime": 8811.1922,
"train_tokens_per_second": 6510.336
},
{
"epoch": 2.4545509089818203,
"grad_norm": 0.764569878578186,
"learning_rate": 3.977309986306874e-06,
"loss": 0.4614,
"num_input_tokens_seen": 57422952,
"step": 5115,
"train_runtime": 8819.6634,
"train_tokens_per_second": 6510.787
},
{
"epoch": 2.45695086098278,
"grad_norm": 0.9439240097999573,
"learning_rate": 3.943383990686425e-06,
"loss": 0.5036,
"num_input_tokens_seen": 57475568,
"step": 5120,
"train_runtime": 8827.7896,
"train_tokens_per_second": 6510.754
},
{
"epoch": 2.4593508129837405,
"grad_norm": 0.7676842212677002,
"learning_rate": 3.909590915014455e-06,
"loss": 0.4741,
"num_input_tokens_seen": 57533000,
"step": 5125,
"train_runtime": 8836.2004,
"train_tokens_per_second": 6511.056
},
{
"epoch": 2.4617507649847004,
"grad_norm": 0.7224127054214478,
"learning_rate": 3.875930972609851e-06,
"loss": 0.4555,
"num_input_tokens_seen": 57591416,
"step": 5130,
"train_runtime": 8844.7508,
"train_tokens_per_second": 6511.367
},
{
"epoch": 2.4641507169856602,
"grad_norm": 0.8699045777320862,
"learning_rate": 3.842404375951089e-06,
"loss": 0.4948,
"num_input_tokens_seen": 57648120,
"step": 5135,
"train_runtime": 8853.2169,
"train_tokens_per_second": 6511.545
},
{
"epoch": 2.46655066898662,
"grad_norm": 0.8307254910469055,
"learning_rate": 3.809011336674917e-06,
"loss": 0.4747,
"num_input_tokens_seen": 57705096,
"step": 5140,
"train_runtime": 8861.9212,
"train_tokens_per_second": 6511.579
},
{
"epoch": 2.46895062098758,
"grad_norm": 1.0947297811508179,
"learning_rate": 3.7757520655749863e-06,
"loss": 0.4711,
"num_input_tokens_seen": 57760000,
"step": 5145,
"train_runtime": 8870.5168,
"train_tokens_per_second": 6511.458
},
{
"epoch": 2.4713505729885403,
"grad_norm": 0.6444729566574097,
"learning_rate": 3.7426267726005354e-06,
"loss": 0.4566,
"num_input_tokens_seen": 57814992,
"step": 5150,
"train_runtime": 8879.2323,
"train_tokens_per_second": 6511.26
},
{
"epoch": 2.4737505249895,
"grad_norm": 0.7921139001846313,
"learning_rate": 3.709635666855077e-06,
"loss": 0.4552,
"num_input_tokens_seen": 57870400,
"step": 5155,
"train_runtime": 8888.1359,
"train_tokens_per_second": 6510.972
},
{
"epoch": 2.47615047699046,
"grad_norm": 0.6223105192184448,
"learning_rate": 3.6767789565950563e-06,
"loss": 0.425,
"num_input_tokens_seen": 57932208,
"step": 5160,
"train_runtime": 8896.7689,
"train_tokens_per_second": 6511.601
},
{
"epoch": 2.4785504289914204,
"grad_norm": 0.7725955843925476,
"learning_rate": 3.64405684922855e-06,
"loss": 0.4413,
"num_input_tokens_seen": 57989280,
"step": 5165,
"train_runtime": 8905.0042,
"train_tokens_per_second": 6511.988
},
{
"epoch": 2.4809503809923803,
"grad_norm": 0.7563416361808777,
"learning_rate": 3.611469551313959e-06,
"loss": 0.521,
"num_input_tokens_seen": 58045968,
"step": 5170,
"train_runtime": 8913.261,
"train_tokens_per_second": 6512.316
},
{
"epoch": 2.48335033299334,
"grad_norm": 0.7822843790054321,
"learning_rate": 3.579017268558693e-06,
"loss": 0.4989,
"num_input_tokens_seen": 58098536,
"step": 5175,
"train_runtime": 8920.913,
"train_tokens_per_second": 6512.622
},
{
"epoch": 2.4857502849943,
"grad_norm": 0.80488520860672,
"learning_rate": 3.5467002058178764e-06,
"loss": 0.498,
"num_input_tokens_seen": 58153656,
"step": 5180,
"train_runtime": 8929.6199,
"train_tokens_per_second": 6512.445
},
{
"epoch": 2.48815023699526,
"grad_norm": 0.7986950278282166,
"learning_rate": 3.514518567093056e-06,
"loss": 0.4513,
"num_input_tokens_seen": 58208960,
"step": 5185,
"train_runtime": 8938.3362,
"train_tokens_per_second": 6512.281
},
{
"epoch": 2.4905501889962203,
"grad_norm": 0.7876197695732117,
"learning_rate": 3.4824725555309272e-06,
"loss": 0.4757,
"num_input_tokens_seen": 58268880,
"step": 5190,
"train_runtime": 8946.6352,
"train_tokens_per_second": 6512.938
},
{
"epoch": 2.49295014099718,
"grad_norm": 0.8735581040382385,
"learning_rate": 3.4505623734220226e-06,
"loss": 0.4926,
"num_input_tokens_seen": 58323184,
"step": 5195,
"train_runtime": 8954.4183,
"train_tokens_per_second": 6513.341
},
{
"epoch": 2.49535009299814,
"grad_norm": 0.8230021595954895,
"learning_rate": 3.4187882221994564e-06,
"loss": 0.5169,
"num_input_tokens_seen": 58379592,
"step": 5200,
"train_runtime": 8962.9041,
"train_tokens_per_second": 6513.468
},
{
"epoch": 2.4977500449991,
"grad_norm": 0.9317114353179932,
"learning_rate": 3.3871503024376554e-06,
"loss": 0.4625,
"num_input_tokens_seen": 58439472,
"step": 5205,
"train_runtime": 8971.3456,
"train_tokens_per_second": 6514.014
},
{
"epoch": 2.50014999700006,
"grad_norm": 0.889101505279541,
"learning_rate": 3.3556488138510674e-06,
"loss": 0.4478,
"num_input_tokens_seen": 58498776,
"step": 5210,
"train_runtime": 8980.13,
"train_tokens_per_second": 6514.246
},
{
"epoch": 2.50254994900102,
"grad_norm": 0.5332804322242737,
"learning_rate": 3.3242839552929366e-06,
"loss": 0.4552,
"num_input_tokens_seen": 58559344,
"step": 5215,
"train_runtime": 8988.8739,
"train_tokens_per_second": 6514.647
},
{
"epoch": 2.50494990100198,
"grad_norm": 0.9555898308753967,
"learning_rate": 3.2930559247540267e-06,
"loss": 0.4537,
"num_input_tokens_seen": 58614416,
"step": 5220,
"train_runtime": 8997.7825,
"train_tokens_per_second": 6514.318
},
{
"epoch": 2.50734985300294,
"grad_norm": 1.1382311582565308,
"learning_rate": 3.2619649193613626e-06,
"loss": 0.5041,
"num_input_tokens_seen": 58667216,
"step": 5225,
"train_runtime": 9006.2657,
"train_tokens_per_second": 6514.045
},
{
"epoch": 2.5097498050038998,
"grad_norm": 1.1261781454086304,
"learning_rate": 3.2310111353770045e-06,
"loss": 0.5123,
"num_input_tokens_seen": 58722648,
"step": 5230,
"train_runtime": 9014.5273,
"train_tokens_per_second": 6514.224
},
{
"epoch": 2.5121497570048597,
"grad_norm": 0.6339508295059204,
"learning_rate": 3.2001947681967987e-06,
"loss": 0.466,
"num_input_tokens_seen": 58780640,
"step": 5235,
"train_runtime": 9023.7118,
"train_tokens_per_second": 6514.02
},
{
"epoch": 2.51454970900582,
"grad_norm": 0.8819341659545898,
"learning_rate": 3.169516012349161e-06,
"loss": 0.4855,
"num_input_tokens_seen": 58839080,
"step": 5240,
"train_runtime": 9032.9027,
"train_tokens_per_second": 6513.862
},
{
"epoch": 2.51694966100678,
"grad_norm": 0.8198482394218445,
"learning_rate": 3.138975061493815e-06,
"loss": 0.5462,
"num_input_tokens_seen": 58888056,
"step": 5245,
"train_runtime": 9041.1086,
"train_tokens_per_second": 6513.367
},
{
"epoch": 2.5193496130077397,
"grad_norm": 0.7308799028396606,
"learning_rate": 3.1085721084205987e-06,
"loss": 0.4879,
"num_input_tokens_seen": 58948912,
"step": 5250,
"train_runtime": 9049.9278,
"train_tokens_per_second": 6513.744
},
{
"epoch": 2.5217495650087,
"grad_norm": 0.7503857612609863,
"learning_rate": 3.078307345048251e-06,
"loss": 0.434,
"num_input_tokens_seen": 59005656,
"step": 5255,
"train_runtime": 9058.4522,
"train_tokens_per_second": 6513.878
},
{
"epoch": 2.52414951700966,
"grad_norm": 0.7755120992660522,
"learning_rate": 3.0481809624231667e-06,
"loss": 0.4226,
"num_input_tokens_seen": 59064880,
"step": 5260,
"train_runtime": 9067.2632,
"train_tokens_per_second": 6514.08
},
{
"epoch": 2.52654946901062,
"grad_norm": 0.7984574437141418,
"learning_rate": 3.018193150718224e-06,
"loss": 0.4881,
"num_input_tokens_seen": 59122920,
"step": 5265,
"train_runtime": 9075.8636,
"train_tokens_per_second": 6514.302
},
{
"epoch": 2.5289494210115797,
"grad_norm": 0.7857392430305481,
"learning_rate": 2.9883440992315744e-06,
"loss": 0.4949,
"num_input_tokens_seen": 59180768,
"step": 5270,
"train_runtime": 9084.2259,
"train_tokens_per_second": 6514.674
},
{
"epoch": 2.5313493730125396,
"grad_norm": 0.7636000514030457,
"learning_rate": 2.9586339963854402e-06,
"loss": 0.4584,
"num_input_tokens_seen": 59236392,
"step": 5275,
"train_runtime": 9093.425,
"train_tokens_per_second": 6514.2
},
{
"epoch": 2.5337493250135,
"grad_norm": 0.7404913306236267,
"learning_rate": 2.929063029724924e-06,
"loss": 0.5001,
"num_input_tokens_seen": 59288152,
"step": 5280,
"train_runtime": 9101.2939,
"train_tokens_per_second": 6514.255
},
{
"epoch": 2.53614927701446,
"grad_norm": 0.8310667872428894,
"learning_rate": 2.8996313859168373e-06,
"loss": 0.4752,
"num_input_tokens_seen": 59350448,
"step": 5285,
"train_runtime": 9109.697,
"train_tokens_per_second": 6515.085
},
{
"epoch": 2.5385492290154197,
"grad_norm": 0.7058178782463074,
"learning_rate": 2.8703392507485244e-06,
"loss": 0.5058,
"num_input_tokens_seen": 59405224,
"step": 5290,
"train_runtime": 9118.1859,
"train_tokens_per_second": 6515.027
},
{
"epoch": 2.5409491810163796,
"grad_norm": 0.9837594628334045,
"learning_rate": 2.8411868091266614e-06,
"loss": 0.5101,
"num_input_tokens_seen": 59459408,
"step": 5295,
"train_runtime": 9125.7939,
"train_tokens_per_second": 6515.533
},
{
"epoch": 2.5433491330173394,
"grad_norm": 0.749136745929718,
"learning_rate": 2.812174245076121e-06,
"loss": 0.4509,
"num_input_tokens_seen": 59519864,
"step": 5300,
"train_runtime": 9134.5564,
"train_tokens_per_second": 6515.901
},
{
"epoch": 2.5457490850182998,
"grad_norm": 0.8679369688034058,
"learning_rate": 2.783301741738803e-06,
"loss": 0.5337,
"num_input_tokens_seen": 59575648,
"step": 5305,
"train_runtime": 9142.5914,
"train_tokens_per_second": 6516.276
},
{
"epoch": 2.5481490370192597,
"grad_norm": 0.7311270833015442,
"learning_rate": 2.75456948137246e-06,
"loss": 0.4446,
"num_input_tokens_seen": 59631568,
"step": 5310,
"train_runtime": 9150.8949,
"train_tokens_per_second": 6516.474
},
{
"epoch": 2.5505489890202195,
"grad_norm": 0.9072261452674866,
"learning_rate": 2.725977645349567e-06,
"loss": 0.4515,
"num_input_tokens_seen": 59688168,
"step": 5315,
"train_runtime": 9158.8503,
"train_tokens_per_second": 6516.993
},
{
"epoch": 2.5529489410211794,
"grad_norm": 0.7925878763198853,
"learning_rate": 2.6975264141561792e-06,
"loss": 0.4743,
"num_input_tokens_seen": 59750784,
"step": 5320,
"train_runtime": 9167.7914,
"train_tokens_per_second": 6517.468
},
{
"epoch": 2.5553488930221393,
"grad_norm": 0.7712064981460571,
"learning_rate": 2.6692159673907674e-06,
"loss": 0.4835,
"num_input_tokens_seen": 59804776,
"step": 5325,
"train_runtime": 9176.5665,
"train_tokens_per_second": 6517.119
},
{
"epoch": 2.5577488450230996,
"grad_norm": 0.9932171106338501,
"learning_rate": 2.641046483763107e-06,
"loss": 0.4954,
"num_input_tokens_seen": 59862336,
"step": 5330,
"train_runtime": 9184.9522,
"train_tokens_per_second": 6517.436
},
{
"epoch": 2.5601487970240595,
"grad_norm": 0.8807353377342224,
"learning_rate": 2.613018141093143e-06,
"loss": 0.5017,
"num_input_tokens_seen": 59920072,
"step": 5335,
"train_runtime": 9193.9014,
"train_tokens_per_second": 6517.372
},
{
"epoch": 2.5625487490250194,
"grad_norm": 0.7849051356315613,
"learning_rate": 2.585131116309872e-06,
"loss": 0.4951,
"num_input_tokens_seen": 59975568,
"step": 5340,
"train_runtime": 9202.2095,
"train_tokens_per_second": 6517.518
},
{
"epoch": 2.5649487010259797,
"grad_norm": 0.5779772400856018,
"learning_rate": 2.557385585450217e-06,
"loss": 0.4706,
"num_input_tokens_seen": 60036392,
"step": 5345,
"train_runtime": 9211.2288,
"train_tokens_per_second": 6517.74
},
{
"epoch": 2.5673486530269396,
"grad_norm": 0.9567521810531616,
"learning_rate": 2.529781723657915e-06,
"loss": 0.4893,
"num_input_tokens_seen": 60093024,
"step": 5350,
"train_runtime": 9220.1795,
"train_tokens_per_second": 6517.555
},
{
"epoch": 2.5697486050278995,
"grad_norm": 0.7940301299095154,
"learning_rate": 2.5023197051824267e-06,
"loss": 0.5055,
"num_input_tokens_seen": 60144920,
"step": 5355,
"train_runtime": 9228.2311,
"train_tokens_per_second": 6517.492
},
{
"epoch": 2.5721485570288594,
"grad_norm": 0.9344842433929443,
"learning_rate": 2.4749997033778228e-06,
"loss": 0.5167,
"num_input_tokens_seen": 60203224,
"step": 5360,
"train_runtime": 9236.6101,
"train_tokens_per_second": 6517.892
},
{
"epoch": 2.5745485090298192,
"grad_norm": 0.9174864888191223,
"learning_rate": 2.4478218907016877e-06,
"loss": 0.4896,
"num_input_tokens_seen": 60259032,
"step": 5365,
"train_runtime": 9245.2879,
"train_tokens_per_second": 6517.81
},
{
"epoch": 2.5769484610307796,
"grad_norm": 0.9624903798103333,
"learning_rate": 2.4207864387140512e-06,
"loss": 0.5132,
"num_input_tokens_seen": 60308024,
"step": 5370,
"train_runtime": 9253.8315,
"train_tokens_per_second": 6517.087
},
{
"epoch": 2.5793484130317395,
"grad_norm": 0.6800229549407959,
"learning_rate": 2.3938935180762707e-06,
"loss": 0.5086,
"num_input_tokens_seen": 60362552,
"step": 5375,
"train_runtime": 9261.5584,
"train_tokens_per_second": 6517.537
},
{
"epoch": 2.5817483650326993,
"grad_norm": 0.9939396977424622,
"learning_rate": 2.36714329854999e-06,
"loss": 0.5001,
"num_input_tokens_seen": 60415520,
"step": 5380,
"train_runtime": 9269.7261,
"train_tokens_per_second": 6517.509
},
{
"epoch": 2.584148317033659,
"grad_norm": 0.7869457602500916,
"learning_rate": 2.3405359489960365e-06,
"loss": 0.493,
"num_input_tokens_seen": 60469016,
"step": 5385,
"train_runtime": 9277.4328,
"train_tokens_per_second": 6517.861
},
{
"epoch": 2.586548269034619,
"grad_norm": 0.8779625296592712,
"learning_rate": 2.314071637373394e-06,
"loss": 0.537,
"num_input_tokens_seen": 60528736,
"step": 5390,
"train_runtime": 9286.7608,
"train_tokens_per_second": 6517.745
},
{
"epoch": 2.5889482210355794,
"grad_norm": 0.9168468713760376,
"learning_rate": 2.2877505307380976e-06,
"loss": 0.5101,
"num_input_tokens_seen": 60585352,
"step": 5395,
"train_runtime": 9294.8068,
"train_tokens_per_second": 6518.194
},
{
"epoch": 2.5913481730365393,
"grad_norm": 0.7564955353736877,
"learning_rate": 2.2615727952422033e-06,
"loss": 0.4426,
"num_input_tokens_seen": 60645192,
"step": 5400,
"train_runtime": 9303.554,
"train_tokens_per_second": 6518.497
},
{
"epoch": 2.593748125037499,
"grad_norm": 0.823637843132019,
"learning_rate": 2.235538596132747e-06,
"loss": 0.4401,
"num_input_tokens_seen": 60705872,
"step": 5405,
"train_runtime": 9314.3874,
"train_tokens_per_second": 6517.43
},
{
"epoch": 2.596148077038459,
"grad_norm": 0.5428220629692078,
"learning_rate": 2.2096480977506883e-06,
"loss": 0.466,
"num_input_tokens_seen": 60766448,
"step": 5410,
"train_runtime": 9324.731,
"train_tokens_per_second": 6516.697
},
{
"epoch": 2.598548029039419,
"grad_norm": 1.0644038915634155,
"learning_rate": 2.183901463529861e-06,
"loss": 0.4647,
"num_input_tokens_seen": 60820832,
"step": 5415,
"train_runtime": 9335.3113,
"train_tokens_per_second": 6515.137
},
{
"epoch": 2.6009479810403793,
"grad_norm": 0.7919825315475464,
"learning_rate": 2.1582988559959773e-06,
"loss": 0.4435,
"num_input_tokens_seen": 60879048,
"step": 5420,
"train_runtime": 9346.1879,
"train_tokens_per_second": 6513.784
},
{
"epoch": 2.603347933041339,
"grad_norm": 1.047285556793213,
"learning_rate": 2.132840436765568e-06,
"loss": 0.4641,
"num_input_tokens_seen": 60927720,
"step": 5425,
"train_runtime": 9355.613,
"train_tokens_per_second": 6512.424
},
{
"epoch": 2.605747885042299,
"grad_norm": 0.9616097211837769,
"learning_rate": 2.1075263665449737e-06,
"loss": 0.4677,
"num_input_tokens_seen": 60981576,
"step": 5430,
"train_runtime": 9365.6809,
"train_tokens_per_second": 6511.174
},
{
"epoch": 2.6081478370432594,
"grad_norm": 0.9964049458503723,
"learning_rate": 2.082356805129332e-06,
"loss": 0.4929,
"num_input_tokens_seen": 61039448,
"step": 5435,
"train_runtime": 9376.1343,
"train_tokens_per_second": 6510.087
},
{
"epoch": 2.6105477890442192,
"grad_norm": 0.8985645174980164,
"learning_rate": 2.0573319114015775e-06,
"loss": 0.4886,
"num_input_tokens_seen": 61093640,
"step": 5440,
"train_runtime": 9386.3154,
"train_tokens_per_second": 6508.799
},
{
"epoch": 2.612947741045179,
"grad_norm": 0.7488046884536743,
"learning_rate": 2.0324518433314206e-06,
"loss": 0.4697,
"num_input_tokens_seen": 61149808,
"step": 5445,
"train_runtime": 9396.5128,
"train_tokens_per_second": 6507.713
},
{
"epoch": 2.615347693046139,
"grad_norm": 0.7769824862480164,
"learning_rate": 2.0077167579743593e-06,
"loss": 0.4645,
"num_input_tokens_seen": 61206176,
"step": 5450,
"train_runtime": 9406.5758,
"train_tokens_per_second": 6506.743
},
{
"epoch": 2.617747645047099,
"grad_norm": 0.7720673084259033,
"learning_rate": 1.9831268114706925e-06,
"loss": 0.4667,
"num_input_tokens_seen": 61266712,
"step": 5455,
"train_runtime": 9417.2442,
"train_tokens_per_second": 6505.8
},
{
"epoch": 2.620147597048059,
"grad_norm": 0.7182523012161255,
"learning_rate": 1.958682159044531e-06,
"loss": 0.4644,
"num_input_tokens_seen": 61319856,
"step": 5460,
"train_runtime": 9426.6437,
"train_tokens_per_second": 6504.951
},
{
"epoch": 2.622547549049019,
"grad_norm": 0.8977944850921631,
"learning_rate": 1.934382955002803e-06,
"loss": 0.5007,
"num_input_tokens_seen": 61377048,
"step": 5465,
"train_runtime": 9437.2729,
"train_tokens_per_second": 6503.685
},
{
"epoch": 2.624947501049979,
"grad_norm": 0.7803311347961426,
"learning_rate": 1.9102293527343163e-06,
"loss": 0.4658,
"num_input_tokens_seen": 61434248,
"step": 5470,
"train_runtime": 9448.0138,
"train_tokens_per_second": 6502.345
},
{
"epoch": 2.627347453050939,
"grad_norm": 0.72231125831604,
"learning_rate": 1.886221504708746e-06,
"loss": 0.4968,
"num_input_tokens_seen": 61494600,
"step": 5475,
"train_runtime": 9459.0534,
"train_tokens_per_second": 6501.137
},
{
"epoch": 2.6297474050518987,
"grad_norm": 0.5621334314346313,
"learning_rate": 1.8623595624757045e-06,
"loss": 0.4606,
"num_input_tokens_seen": 61555232,
"step": 5480,
"train_runtime": 9469.3682,
"train_tokens_per_second": 6500.458
},
{
"epoch": 2.632147357052859,
"grad_norm": 0.6386857628822327,
"learning_rate": 1.8386436766637593e-06,
"loss": 0.4647,
"num_input_tokens_seen": 61610480,
"step": 5485,
"train_runtime": 9479.3329,
"train_tokens_per_second": 6499.453
},
{
"epoch": 2.634547309053819,
"grad_norm": 0.6079943776130676,
"learning_rate": 1.8150739969795245e-06,
"loss": 0.4742,
"num_input_tokens_seen": 61666936,
"step": 5490,
"train_runtime": 9489.1199,
"train_tokens_per_second": 6498.699
},
{
"epoch": 2.636947261054779,
"grad_norm": 0.6471970677375793,
"learning_rate": 1.7916506722066573e-06,
"loss": 0.5121,
"num_input_tokens_seen": 61723152,
"step": 5495,
"train_runtime": 9498.3327,
"train_tokens_per_second": 6498.314
},
{
"epoch": 2.639347213055739,
"grad_norm": 0.8927129507064819,
"learning_rate": 1.7683738502049658e-06,
"loss": 0.5282,
"num_input_tokens_seen": 61779792,
"step": 5500,
"train_runtime": 9508.4194,
"train_tokens_per_second": 6497.378
},
{
"epoch": 2.6417471650566986,
"grad_norm": 0.9175587296485901,
"learning_rate": 1.7452436779094527e-06,
"loss": 0.5226,
"num_input_tokens_seen": 61837696,
"step": 5505,
"train_runtime": 9518.2144,
"train_tokens_per_second": 6496.775
},
{
"epoch": 2.644147117057659,
"grad_norm": 0.6489665508270264,
"learning_rate": 1.7222603013294036e-06,
"loss": 0.4645,
"num_input_tokens_seen": 61896032,
"step": 5510,
"train_runtime": 9528.6748,
"train_tokens_per_second": 6495.765
},
{
"epoch": 2.646547069058619,
"grad_norm": 0.8270627856254578,
"learning_rate": 1.6994238655474394e-06,
"loss": 0.4943,
"num_input_tokens_seen": 61949384,
"step": 5515,
"train_runtime": 9538.4414,
"train_tokens_per_second": 6494.707
},
{
"epoch": 2.6489470210595787,
"grad_norm": 0.7798356413841248,
"learning_rate": 1.6767345147186336e-06,
"loss": 0.5109,
"num_input_tokens_seen": 62002592,
"step": 5520,
"train_runtime": 9548.0079,
"train_tokens_per_second": 6493.773
},
{
"epoch": 2.651346973060539,
"grad_norm": 0.8514456748962402,
"learning_rate": 1.6541923920695756e-06,
"loss": 0.4477,
"num_input_tokens_seen": 62055040,
"step": 5525,
"train_runtime": 9558.322,
"train_tokens_per_second": 6492.253
},
{
"epoch": 2.653746925061499,
"grad_norm": 1.0111453533172607,
"learning_rate": 1.6317976398974782e-06,
"loss": 0.5174,
"num_input_tokens_seen": 62109976,
"step": 5530,
"train_runtime": 9567.8838,
"train_tokens_per_second": 6491.506
},
{
"epoch": 2.6561468770624588,
"grad_norm": 0.702575147151947,
"learning_rate": 1.6095503995692762e-06,
"loss": 0.4668,
"num_input_tokens_seen": 62167376,
"step": 5535,
"train_runtime": 9577.5036,
"train_tokens_per_second": 6490.979
},
{
"epoch": 2.6585468290634187,
"grad_norm": 0.8962842226028442,
"learning_rate": 1.5874508115207408e-06,
"loss": 0.4676,
"num_input_tokens_seen": 62221488,
"step": 5540,
"train_runtime": 9587.9271,
"train_tokens_per_second": 6489.566
},
{
"epoch": 2.6609467810643785,
"grad_norm": 0.7158124446868896,
"learning_rate": 1.5654990152555837e-06,
"loss": 0.4947,
"num_input_tokens_seen": 62277176,
"step": 5545,
"train_runtime": 9597.969,
"train_tokens_per_second": 6488.579
},
{
"epoch": 2.663346733065339,
"grad_norm": 1.1132010221481323,
"learning_rate": 1.5436951493445762e-06,
"loss": 0.4875,
"num_input_tokens_seen": 62330544,
"step": 5550,
"train_runtime": 9607.4993,
"train_tokens_per_second": 6487.697
},
{
"epoch": 2.6657466850662987,
"grad_norm": 0.8258331418037415,
"learning_rate": 1.5220393514246895e-06,
"loss": 0.5035,
"num_input_tokens_seen": 62381768,
"step": 5555,
"train_runtime": 9616.8354,
"train_tokens_per_second": 6486.725
},
{
"epoch": 2.6681466370672586,
"grad_norm": 0.8152797818183899,
"learning_rate": 1.5005317581982092e-06,
"loss": 0.4839,
"num_input_tokens_seen": 62436944,
"step": 5560,
"train_runtime": 9626.6187,
"train_tokens_per_second": 6485.864
},
{
"epoch": 2.6705465890682185,
"grad_norm": 0.8248258233070374,
"learning_rate": 1.479172505431875e-06,
"loss": 0.4973,
"num_input_tokens_seen": 62491352,
"step": 5565,
"train_runtime": 9636.4281,
"train_tokens_per_second": 6484.908
},
{
"epoch": 2.6729465410691784,
"grad_norm": 1.0632202625274658,
"learning_rate": 1.4579617279560393e-06,
"loss": 0.486,
"num_input_tokens_seen": 62546464,
"step": 5570,
"train_runtime": 9646.0848,
"train_tokens_per_second": 6484.13
},
{
"epoch": 2.6753464930701387,
"grad_norm": 1.1524382829666138,
"learning_rate": 1.4368995596637902e-06,
"loss": 0.4729,
"num_input_tokens_seen": 62602496,
"step": 5575,
"train_runtime": 9656.9657,
"train_tokens_per_second": 6482.626
},
{
"epoch": 2.6777464450710986,
"grad_norm": 0.66849684715271,
"learning_rate": 1.415986133510122e-06,
"loss": 0.4894,
"num_input_tokens_seen": 62664360,
"step": 5580,
"train_runtime": 9668.3929,
"train_tokens_per_second": 6481.363
},
{
"epoch": 2.6801463970720585,
"grad_norm": 0.7072093486785889,
"learning_rate": 1.395221581511097e-06,
"loss": 0.4524,
"num_input_tokens_seen": 62721848,
"step": 5585,
"train_runtime": 9678.2677,
"train_tokens_per_second": 6480.69
},
{
"epoch": 2.682546349073019,
"grad_norm": 0.8476486802101135,
"learning_rate": 1.3746060347430118e-06,
"loss": 0.4765,
"num_input_tokens_seen": 62776544,
"step": 5590,
"train_runtime": 9687.8596,
"train_tokens_per_second": 6479.919
},
{
"epoch": 2.6849463010739782,
"grad_norm": 0.807366132736206,
"learning_rate": 1.354139623341566e-06,
"loss": 0.4656,
"num_input_tokens_seen": 62834048,
"step": 5595,
"train_runtime": 9698.4717,
"train_tokens_per_second": 6478.758
},
{
"epoch": 2.6873462530749386,
"grad_norm": 0.6468657851219177,
"learning_rate": 1.3338224765010315e-06,
"loss": 0.4573,
"num_input_tokens_seen": 62894360,
"step": 5600,
"train_runtime": 9709.7847,
"train_tokens_per_second": 6477.421
},
{
"epoch": 2.6897462050758985,
"grad_norm": 0.9837515354156494,
"learning_rate": 1.3136547224734646e-06,
"loss": 0.4944,
"num_input_tokens_seen": 62952560,
"step": 5605,
"train_runtime": 9720.79,
"train_tokens_per_second": 6476.074
},
{
"epoch": 2.6921461570768583,
"grad_norm": 0.7956768274307251,
"learning_rate": 1.2936364885678676e-06,
"loss": 0.4829,
"num_input_tokens_seen": 63006360,
"step": 5610,
"train_runtime": 9729.8891,
"train_tokens_per_second": 6475.548
},
{
"epoch": 2.6945461090778187,
"grad_norm": 0.7825217247009277,
"learning_rate": 1.2737679011493947e-06,
"loss": 0.4819,
"num_input_tokens_seen": 63065920,
"step": 5615,
"train_runtime": 9740.3812,
"train_tokens_per_second": 6474.687
},
{
"epoch": 2.6969460610787785,
"grad_norm": 0.8457074761390686,
"learning_rate": 1.2540490856385672e-06,
"loss": 0.4717,
"num_input_tokens_seen": 63121320,
"step": 5620,
"train_runtime": 9751.1742,
"train_tokens_per_second": 6473.202
},
{
"epoch": 2.6993460130797384,
"grad_norm": 0.8086642026901245,
"learning_rate": 1.23448016651046e-06,
"loss": 0.462,
"num_input_tokens_seen": 63176440,
"step": 5625,
"train_runtime": 9760.6545,
"train_tokens_per_second": 6472.562
},
{
"epoch": 2.7017459650806983,
"grad_norm": 0.6313350796699524,
"learning_rate": 1.215061267293932e-06,
"loss": 0.4332,
"num_input_tokens_seen": 63242712,
"step": 5630,
"train_runtime": 9772.2646,
"train_tokens_per_second": 6471.654
},
{
"epoch": 2.704145917081658,
"grad_norm": 1.2930268049240112,
"learning_rate": 1.195792510570834e-06,
"loss": 0.4613,
"num_input_tokens_seen": 63294640,
"step": 5635,
"train_runtime": 9782.3592,
"train_tokens_per_second": 6470.284
},
{
"epoch": 2.7065458690826185,
"grad_norm": 0.6524819731712341,
"learning_rate": 1.1766740179752572e-06,
"loss": 0.4588,
"num_input_tokens_seen": 63353040,
"step": 5640,
"train_runtime": 9793.201,
"train_tokens_per_second": 6469.084
},
{
"epoch": 2.7089458210835784,
"grad_norm": 0.9691641330718994,
"learning_rate": 1.1577059101927385e-06,
"loss": 0.5275,
"num_input_tokens_seen": 63408480,
"step": 5645,
"train_runtime": 9803.1346,
"train_tokens_per_second": 6468.184
},
{
"epoch": 2.7113457730845383,
"grad_norm": 0.7839572429656982,
"learning_rate": 1.138888306959504e-06,
"loss": 0.4728,
"num_input_tokens_seen": 63465824,
"step": 5650,
"train_runtime": 9814.8407,
"train_tokens_per_second": 6466.312
},
{
"epoch": 2.713745725085498,
"grad_norm": 0.9171317219734192,
"learning_rate": 1.1202213270617322e-06,
"loss": 0.4897,
"num_input_tokens_seen": 63518744,
"step": 5655,
"train_runtime": 9824.5678,
"train_tokens_per_second": 6465.297
},
{
"epoch": 2.716145677086458,
"grad_norm": 1.0188878774642944,
"learning_rate": 1.101705088334795e-06,
"loss": 0.4849,
"num_input_tokens_seen": 63573232,
"step": 5660,
"train_runtime": 9833.9406,
"train_tokens_per_second": 6464.675
},
{
"epoch": 2.7185456290874184,
"grad_norm": 0.811906099319458,
"learning_rate": 1.0833397076624897e-06,
"loss": 0.4778,
"num_input_tokens_seen": 63626872,
"step": 5665,
"train_runtime": 9843.8939,
"train_tokens_per_second": 6463.588
},
{
"epoch": 2.7209455810883783,
"grad_norm": 0.9648638367652893,
"learning_rate": 1.065125300976344e-06,
"loss": 0.5255,
"num_input_tokens_seen": 63680184,
"step": 5670,
"train_runtime": 9852.7656,
"train_tokens_per_second": 6463.179
},
{
"epoch": 2.723345533089338,
"grad_norm": 0.8658723831176758,
"learning_rate": 1.0470619832548461e-06,
"loss": 0.5119,
"num_input_tokens_seen": 63732752,
"step": 5675,
"train_runtime": 9861.8742,
"train_tokens_per_second": 6462.54
},
{
"epoch": 2.7257454850902985,
"grad_norm": 0.6413763761520386,
"learning_rate": 1.0291498685227441e-06,
"loss": 0.4683,
"num_input_tokens_seen": 63790384,
"step": 5680,
"train_runtime": 9873.128,
"train_tokens_per_second": 6461.011
},
{
"epoch": 2.7281454370912583,
"grad_norm": 0.9176835417747498,
"learning_rate": 1.0113890698503076e-06,
"loss": 0.4943,
"num_input_tokens_seen": 63845528,
"step": 5685,
"train_runtime": 9883.5777,
"train_tokens_per_second": 6459.759
},
{
"epoch": 2.7305453890922182,
"grad_norm": 0.8102623224258423,
"learning_rate": 9.937796993526343e-07,
"loss": 0.4989,
"num_input_tokens_seen": 63898616,
"step": 5690,
"train_runtime": 9893.716,
"train_tokens_per_second": 6458.505
},
{
"epoch": 2.732945341093178,
"grad_norm": 0.7839487195014954,
"learning_rate": 9.763218681889203e-07,
"loss": 0.4506,
"num_input_tokens_seen": 63953600,
"step": 5695,
"train_runtime": 9903.0294,
"train_tokens_per_second": 6457.983
},
{
"epoch": 2.735345293094138,
"grad_norm": 0.8236997723579407,
"learning_rate": 9.59015686561779e-07,
"loss": 0.4606,
"num_input_tokens_seen": 64012184,
"step": 5700,
"train_runtime": 9913.4852,
"train_tokens_per_second": 6457.082
},
{
"epoch": 2.7377452450950983,
"grad_norm": 0.7789479494094849,
"learning_rate": 9.418612637165286e-07,
"loss": 0.4545,
"num_input_tokens_seen": 64065248,
"step": 5705,
"train_runtime": 9924.2434,
"train_tokens_per_second": 6455.429
},
{
"epoch": 2.740145197096058,
"grad_norm": 0.890102744102478,
"learning_rate": 9.24858707940518e-07,
"loss": 0.5299,
"num_input_tokens_seen": 64120216,
"step": 5710,
"train_runtime": 9934.9595,
"train_tokens_per_second": 6453.999
},
{
"epoch": 2.742545149097018,
"grad_norm": 0.9005339741706848,
"learning_rate": 9.08008126562418e-07,
"loss": 0.4609,
"num_input_tokens_seen": 64181128,
"step": 5715,
"train_runtime": 9946.018,
"train_tokens_per_second": 6452.947
},
{
"epoch": 2.744945101097978,
"grad_norm": 0.9289687275886536,
"learning_rate": 8.913096259515835e-07,
"loss": 0.464,
"num_input_tokens_seen": 64234984,
"step": 5720,
"train_runtime": 9954.6483,
"train_tokens_per_second": 6452.763
},
{
"epoch": 2.747345053098938,
"grad_norm": 1.0818783044815063,
"learning_rate": 8.747633115173404e-07,
"loss": 0.4932,
"num_input_tokens_seen": 64290040,
"step": 5725,
"train_runtime": 9963.0154,
"train_tokens_per_second": 6452.87
},
{
"epoch": 2.749745005099898,
"grad_norm": 0.7084750533103943,
"learning_rate": 8.583692877083465e-07,
"loss": 0.4344,
"num_input_tokens_seen": 64347256,
"step": 5730,
"train_runtime": 9971.5711,
"train_tokens_per_second": 6453.071
},
{
"epoch": 2.752144957100858,
"grad_norm": 0.8155821561813354,
"learning_rate": 8.421276580119236e-07,
"loss": 0.4921,
"num_input_tokens_seen": 64401448,
"step": 5735,
"train_runtime": 9980.1585,
"train_tokens_per_second": 6452.948
},
{
"epoch": 2.754544909101818,
"grad_norm": 0.7858007550239563,
"learning_rate": 8.260385249534042e-07,
"loss": 0.4953,
"num_input_tokens_seen": 64457576,
"step": 5740,
"train_runtime": 9988.8703,
"train_tokens_per_second": 6452.94
},
{
"epoch": 2.756944861102778,
"grad_norm": 0.8042717576026917,
"learning_rate": 8.101019900954881e-07,
"loss": 0.4595,
"num_input_tokens_seen": 64515152,
"step": 5745,
"train_runtime": 9998.2113,
"train_tokens_per_second": 6452.669
},
{
"epoch": 2.7593448131037377,
"grad_norm": 0.61765056848526,
"learning_rate": 7.943181540375988e-07,
"loss": 0.4843,
"num_input_tokens_seen": 64573768,
"step": 5750,
"train_runtime": 10006.8604,
"train_tokens_per_second": 6452.95
},
{
"epoch": 2.761744765104698,
"grad_norm": 0.8006062507629395,
"learning_rate": 7.786871164152415e-07,
"loss": 0.4595,
"num_input_tokens_seen": 64626520,
"step": 5755,
"train_runtime": 10014.3267,
"train_tokens_per_second": 6453.406
},
{
"epoch": 2.764144717105658,
"grad_norm": 0.7694302797317505,
"learning_rate": 7.632089758993932e-07,
"loss": 0.4565,
"num_input_tokens_seen": 64683224,
"step": 5760,
"train_runtime": 10022.5457,
"train_tokens_per_second": 6453.772
},
{
"epoch": 2.766544669106618,
"grad_norm": 0.7269204258918762,
"learning_rate": 7.478838301958502e-07,
"loss": 0.4728,
"num_input_tokens_seen": 64738056,
"step": 5765,
"train_runtime": 10030.9759,
"train_tokens_per_second": 6453.814
},
{
"epoch": 2.768944621107578,
"grad_norm": 0.8213253021240234,
"learning_rate": 7.327117760446478e-07,
"loss": 0.4835,
"num_input_tokens_seen": 64790592,
"step": 5770,
"train_runtime": 10039.0056,
"train_tokens_per_second": 6453.885
},
{
"epoch": 2.771344573108538,
"grad_norm": 0.6208813190460205,
"learning_rate": 7.17692909219414e-07,
"loss": 0.4922,
"num_input_tokens_seen": 64844640,
"step": 5775,
"train_runtime": 10047.9962,
"train_tokens_per_second": 6453.49
},
{
"epoch": 2.773744525109498,
"grad_norm": 0.7945714592933655,
"learning_rate": 7.028273245267947e-07,
"loss": 0.4473,
"num_input_tokens_seen": 64903320,
"step": 5780,
"train_runtime": 10056.9037,
"train_tokens_per_second": 6453.609
},
{
"epoch": 2.7761444771104578,
"grad_norm": 0.6964590549468994,
"learning_rate": 6.881151158058263e-07,
"loss": 0.5196,
"num_input_tokens_seen": 64963432,
"step": 5785,
"train_runtime": 10066.3751,
"train_tokens_per_second": 6453.508
},
{
"epoch": 2.7785444291114176,
"grad_norm": 0.7940050959587097,
"learning_rate": 6.735563759273783e-07,
"loss": 0.4862,
"num_input_tokens_seen": 65020920,
"step": 5790,
"train_runtime": 10074.8972,
"train_tokens_per_second": 6453.755
},
{
"epoch": 2.780944381112378,
"grad_norm": 0.7207697033882141,
"learning_rate": 6.591511967935282e-07,
"loss": 0.4557,
"num_input_tokens_seen": 65077720,
"step": 5795,
"train_runtime": 10083.2022,
"train_tokens_per_second": 6454.073
},
{
"epoch": 2.783344333113338,
"grad_norm": 0.9495781064033508,
"learning_rate": 6.448996693370179e-07,
"loss": 0.4682,
"num_input_tokens_seen": 65133616,
"step": 5800,
"train_runtime": 10092.261,
"train_tokens_per_second": 6453.818
},
{
"epoch": 2.7857442851142977,
"grad_norm": 0.8136801719665527,
"learning_rate": 6.308018835206541e-07,
"loss": 0.4646,
"num_input_tokens_seen": 65187840,
"step": 5805,
"train_runtime": 10100.5435,
"train_tokens_per_second": 6453.894
},
{
"epoch": 2.7881442371152576,
"grad_norm": 0.6333021521568298,
"learning_rate": 6.168579283367476e-07,
"loss": 0.472,
"num_input_tokens_seen": 65240368,
"step": 5810,
"train_runtime": 10108.7592,
"train_tokens_per_second": 6453.845
},
{
"epoch": 2.7905441891162175,
"grad_norm": 1.0317847728729248,
"learning_rate": 6.030678918065552e-07,
"loss": 0.4831,
"num_input_tokens_seen": 65295184,
"step": 5815,
"train_runtime": 10117.6223,
"train_tokens_per_second": 6453.61
},
{
"epoch": 2.792944141117178,
"grad_norm": 1.2926782369613647,
"learning_rate": 5.894318609797222e-07,
"loss": 0.4951,
"num_input_tokens_seen": 65351248,
"step": 5820,
"train_runtime": 10125.5866,
"train_tokens_per_second": 6454.07
},
{
"epoch": 2.7953440931181377,
"grad_norm": 0.8632203936576843,
"learning_rate": 5.759499219337328e-07,
"loss": 0.4852,
"num_input_tokens_seen": 65405976,
"step": 5825,
"train_runtime": 10133.4185,
"train_tokens_per_second": 6454.483
},
{
"epoch": 2.7977440451190976,
"grad_norm": 0.8666356801986694,
"learning_rate": 5.626221597733655e-07,
"loss": 0.4505,
"num_input_tokens_seen": 65466136,
"step": 5830,
"train_runtime": 10141.883,
"train_tokens_per_second": 6455.028
},
{
"epoch": 2.8001439971200575,
"grad_norm": 0.894623875617981,
"learning_rate": 5.494486586301528e-07,
"loss": 0.5448,
"num_input_tokens_seen": 65518496,
"step": 5835,
"train_runtime": 10149.8014,
"train_tokens_per_second": 6455.151
},
{
"epoch": 2.8025439491210173,
"grad_norm": 0.8759870529174805,
"learning_rate": 5.364295016618643e-07,
"loss": 0.4865,
"num_input_tokens_seen": 65577616,
"step": 5840,
"train_runtime": 10157.9244,
"train_tokens_per_second": 6455.809
},
{
"epoch": 2.8049439011219777,
"grad_norm": 0.7551533579826355,
"learning_rate": 5.235647710519626e-07,
"loss": 0.4664,
"num_input_tokens_seen": 65634592,
"step": 5845,
"train_runtime": 10166.5957,
"train_tokens_per_second": 6455.907
},
{
"epoch": 2.8073438531229375,
"grad_norm": 0.7756850719451904,
"learning_rate": 5.108545480090931e-07,
"loss": 0.4649,
"num_input_tokens_seen": 65691480,
"step": 5850,
"train_runtime": 10174.9677,
"train_tokens_per_second": 6456.186
},
{
"epoch": 2.8097438051238974,
"grad_norm": 0.6903165578842163,
"learning_rate": 4.982989127665816e-07,
"loss": 0.4969,
"num_input_tokens_seen": 65745568,
"step": 5855,
"train_runtime": 10183.3283,
"train_tokens_per_second": 6456.196
},
{
"epoch": 2.8121437571248578,
"grad_norm": 0.7350341081619263,
"learning_rate": 4.858979445819089e-07,
"loss": 0.4742,
"num_input_tokens_seen": 65799784,
"step": 5860,
"train_runtime": 10190.9666,
"train_tokens_per_second": 6456.677
},
{
"epoch": 2.8145437091258176,
"grad_norm": 0.7910242676734924,
"learning_rate": 4.7365172173621796e-07,
"loss": 0.4561,
"num_input_tokens_seen": 65856528,
"step": 5865,
"train_runtime": 10199.5186,
"train_tokens_per_second": 6456.827
},
{
"epoch": 2.8169436611267775,
"grad_norm": 0.8002808094024658,
"learning_rate": 4.615603215338299e-07,
"loss": 0.4425,
"num_input_tokens_seen": 65911144,
"step": 5870,
"train_runtime": 10208.0985,
"train_tokens_per_second": 6456.75
},
{
"epoch": 2.8193436131277374,
"grad_norm": 0.6876586079597473,
"learning_rate": 4.496238203017422e-07,
"loss": 0.4873,
"num_input_tokens_seen": 65971080,
"step": 5875,
"train_runtime": 10216.3273,
"train_tokens_per_second": 6457.416
},
{
"epoch": 2.8217435651286973,
"grad_norm": 0.65282142162323,
"learning_rate": 4.3784229338915406e-07,
"loss": 0.4867,
"num_input_tokens_seen": 66026344,
"step": 5880,
"train_runtime": 10224.7475,
"train_tokens_per_second": 6457.504
},
{
"epoch": 2.8241435171296576,
"grad_norm": 0.6614166498184204,
"learning_rate": 4.262158151669804e-07,
"loss": 0.4813,
"num_input_tokens_seen": 66082360,
"step": 5885,
"train_runtime": 10233.2091,
"train_tokens_per_second": 6457.638
},
{
"epoch": 2.8265434691306175,
"grad_norm": 0.7193440794944763,
"learning_rate": 4.147444590274052e-07,
"loss": 0.4968,
"num_input_tokens_seen": 66134928,
"step": 5890,
"train_runtime": 10241.3234,
"train_tokens_per_second": 6457.654
},
{
"epoch": 2.8289434211315774,
"grad_norm": 0.7374788522720337,
"learning_rate": 4.0342829738339583e-07,
"loss": 0.4744,
"num_input_tokens_seen": 66190032,
"step": 5895,
"train_runtime": 10249.2265,
"train_tokens_per_second": 6458.051
},
{
"epoch": 2.8313433731325373,
"grad_norm": 0.9320788979530334,
"learning_rate": 3.922674016682504e-07,
"loss": 0.4819,
"num_input_tokens_seen": 66244312,
"step": 5900,
"train_runtime": 10256.9977,
"train_tokens_per_second": 6458.45
},
{
"epoch": 2.833743325133497,
"grad_norm": 0.526983916759491,
"learning_rate": 3.812618423351622e-07,
"loss": 0.4424,
"num_input_tokens_seen": 66305552,
"step": 5905,
"train_runtime": 10265.6243,
"train_tokens_per_second": 6458.989
},
{
"epoch": 2.8361432771344575,
"grad_norm": 0.9565876722335815,
"learning_rate": 3.704116888567505e-07,
"loss": 0.4926,
"num_input_tokens_seen": 66358648,
"step": 5910,
"train_runtime": 10273.7771,
"train_tokens_per_second": 6459.031
},
{
"epoch": 2.8385432291354173,
"grad_norm": 0.9867433905601501,
"learning_rate": 3.597170097246416e-07,
"loss": 0.4706,
"num_input_tokens_seen": 66417384,
"step": 5915,
"train_runtime": 10283.2277,
"train_tokens_per_second": 6458.807
},
{
"epoch": 2.8409431811363772,
"grad_norm": 0.6663256883621216,
"learning_rate": 3.4917787244902743e-07,
"loss": 0.4945,
"num_input_tokens_seen": 66477648,
"step": 5920,
"train_runtime": 10293.4798,
"train_tokens_per_second": 6458.229
},
{
"epoch": 2.843343133137337,
"grad_norm": 0.621631920337677,
"learning_rate": 3.387943435582436e-07,
"loss": 0.495,
"num_input_tokens_seen": 66532464,
"step": 5925,
"train_runtime": 10302.8802,
"train_tokens_per_second": 6457.657
},
{
"epoch": 2.845743085138297,
"grad_norm": 0.638155460357666,
"learning_rate": 3.285664885983447e-07,
"loss": 0.4263,
"num_input_tokens_seen": 66589296,
"step": 5930,
"train_runtime": 10312.6945,
"train_tokens_per_second": 6457.022
},
{
"epoch": 2.8481430371392573,
"grad_norm": 0.7790648341178894,
"learning_rate": 3.184943721326938e-07,
"loss": 0.4473,
"num_input_tokens_seen": 66648144,
"step": 5935,
"train_runtime": 10322.4204,
"train_tokens_per_second": 6456.639
},
{
"epoch": 2.850542989140217,
"grad_norm": 0.9435281753540039,
"learning_rate": 3.0857805774155423e-07,
"loss": 0.4773,
"num_input_tokens_seen": 66702560,
"step": 5940,
"train_runtime": 10331.5732,
"train_tokens_per_second": 6456.186
},
{
"epoch": 2.852942941141177,
"grad_norm": 0.7527910470962524,
"learning_rate": 2.988176080216898e-07,
"loss": 0.5113,
"num_input_tokens_seen": 66757360,
"step": 5945,
"train_runtime": 10341.338,
"train_tokens_per_second": 6455.389
},
{
"epoch": 2.8553428931421374,
"grad_norm": 0.949381411075592,
"learning_rate": 2.892130845859653e-07,
"loss": 0.5225,
"num_input_tokens_seen": 66813080,
"step": 5950,
"train_runtime": 10351.5482,
"train_tokens_per_second": 6454.405
},
{
"epoch": 2.8577428451430973,
"grad_norm": 0.682515561580658,
"learning_rate": 2.7976454806296906e-07,
"loss": 0.4474,
"num_input_tokens_seen": 66870744,
"step": 5955,
"train_runtime": 10361.7884,
"train_tokens_per_second": 6453.591
},
{
"epoch": 2.860142797144057,
"grad_norm": 0.8949669599533081,
"learning_rate": 2.7047205809660746e-07,
"loss": 0.4552,
"num_input_tokens_seen": 66926176,
"step": 5960,
"train_runtime": 10372.0384,
"train_tokens_per_second": 6452.558
},
{
"epoch": 2.862542749145017,
"grad_norm": 0.672732949256897,
"learning_rate": 2.6133567334575e-07,
"loss": 0.461,
"num_input_tokens_seen": 66982736,
"step": 5965,
"train_runtime": 10381.6755,
"train_tokens_per_second": 6452.016
},
{
"epoch": 2.864942701145977,
"grad_norm": 0.7349382638931274,
"learning_rate": 2.523554514838544e-07,
"loss": 0.4649,
"num_input_tokens_seen": 67040256,
"step": 5970,
"train_runtime": 10391.7883,
"train_tokens_per_second": 6451.272
},
{
"epoch": 2.8673426531469373,
"grad_norm": 0.7584925293922424,
"learning_rate": 2.435314491985974e-07,
"loss": 0.5227,
"num_input_tokens_seen": 67098776,
"step": 5975,
"train_runtime": 10401.6032,
"train_tokens_per_second": 6450.811
},
{
"epoch": 2.869742605147897,
"grad_norm": 0.8414415717124939,
"learning_rate": 2.3486372219151675e-07,
"loss": 0.4989,
"num_input_tokens_seen": 67151768,
"step": 5980,
"train_runtime": 10411.5952,
"train_tokens_per_second": 6449.71
},
{
"epoch": 2.872142557148857,
"grad_norm": 0.6477630734443665,
"learning_rate": 2.263523251776617e-07,
"loss": 0.4962,
"num_input_tokens_seen": 67210600,
"step": 5985,
"train_runtime": 10422.1011,
"train_tokens_per_second": 6448.853
},
{
"epoch": 2.874542509149817,
"grad_norm": 1.1014198064804077,
"learning_rate": 2.1799731188525407e-07,
"loss": 0.5162,
"num_input_tokens_seen": 67263744,
"step": 5990,
"train_runtime": 10431.8385,
"train_tokens_per_second": 6447.928
},
{
"epoch": 2.876942461150777,
"grad_norm": 0.9391694664955139,
"learning_rate": 2.0979873505533876e-07,
"loss": 0.449,
"num_input_tokens_seen": 67316560,
"step": 5995,
"train_runtime": 10441.9194,
"train_tokens_per_second": 6446.761
},
{
"epoch": 2.879342413151737,
"grad_norm": 0.8007956147193909,
"learning_rate": 2.0175664644145053e-07,
"loss": 0.4849,
"num_input_tokens_seen": 67373408,
"step": 6000,
"train_runtime": 10452.6728,
"train_tokens_per_second": 6445.568
},
{
"epoch": 2.881742365152697,
"grad_norm": 0.7711721658706665,
"learning_rate": 1.9387109680930327e-07,
"loss": 0.4332,
"num_input_tokens_seen": 67428800,
"step": 6005,
"train_runtime": 10463.242,
"train_tokens_per_second": 6444.351
},
{
"epoch": 2.884142317153657,
"grad_norm": 0.8150792121887207,
"learning_rate": 1.8614213593644846e-07,
"loss": 0.4459,
"num_input_tokens_seen": 67490440,
"step": 6010,
"train_runtime": 10473.7424,
"train_tokens_per_second": 6443.775
},
{
"epoch": 2.8865422691546168,
"grad_norm": 0.7124377489089966,
"learning_rate": 1.7856981261197002e-07,
"loss": 0.4779,
"num_input_tokens_seen": 67545608,
"step": 6015,
"train_runtime": 10483.2085,
"train_tokens_per_second": 6443.219
},
{
"epoch": 2.8889422211555766,
"grad_norm": 0.8673171997070312,
"learning_rate": 1.7115417463618722e-07,
"loss": 0.4598,
"num_input_tokens_seen": 67595400,
"step": 6020,
"train_runtime": 10492.2481,
"train_tokens_per_second": 6442.413
},
{
"epoch": 2.891342173156537,
"grad_norm": 0.7837307453155518,
"learning_rate": 1.638952688203327e-07,
"loss": 0.4797,
"num_input_tokens_seen": 67646720,
"step": 6025,
"train_runtime": 10501.2034,
"train_tokens_per_second": 6441.806
},
{
"epoch": 2.893742125157497,
"grad_norm": 0.6940703392028809,
"learning_rate": 1.567931409862694e-07,
"loss": 0.4915,
"num_input_tokens_seen": 67700752,
"step": 6030,
"train_runtime": 10511.0778,
"train_tokens_per_second": 6440.895
},
{
"epoch": 2.8961420771584567,
"grad_norm": 0.8700549602508545,
"learning_rate": 1.4984783596619922e-07,
"loss": 0.4946,
"num_input_tokens_seen": 67755144,
"step": 6035,
"train_runtime": 10520.7321,
"train_tokens_per_second": 6440.155
},
{
"epoch": 2.898542029159417,
"grad_norm": 0.7011561989784241,
"learning_rate": 1.430593976023825e-07,
"loss": 0.4919,
"num_input_tokens_seen": 67814680,
"step": 6040,
"train_runtime": 10531.5769,
"train_tokens_per_second": 6439.176
},
{
"epoch": 2.900941981160377,
"grad_norm": 0.893417477607727,
"learning_rate": 1.3642786874685233e-07,
"loss": 0.5055,
"num_input_tokens_seen": 67867648,
"step": 6045,
"train_runtime": 10541.6146,
"train_tokens_per_second": 6438.07
},
{
"epoch": 2.903341933161337,
"grad_norm": 0.7926166653633118,
"learning_rate": 1.299532912611534e-07,
"loss": 0.459,
"num_input_tokens_seen": 67922728,
"step": 6050,
"train_runtime": 10550.8628,
"train_tokens_per_second": 6437.647
},
{
"epoch": 2.9057418851622967,
"grad_norm": 0.7883651852607727,
"learning_rate": 1.2363570601608143e-07,
"loss": 0.4636,
"num_input_tokens_seen": 67975200,
"step": 6055,
"train_runtime": 10560.1447,
"train_tokens_per_second": 6436.957
},
{
"epoch": 2.9081418371632566,
"grad_norm": 0.9356446266174316,
"learning_rate": 1.1747515289140254e-07,
"loss": 0.4612,
"num_input_tokens_seen": 68029864,
"step": 6060,
"train_runtime": 10570.9284,
"train_tokens_per_second": 6435.562
},
{
"epoch": 2.910541789164217,
"grad_norm": 1.2164058685302734,
"learning_rate": 1.1147167077562859e-07,
"loss": 0.5042,
"num_input_tokens_seen": 68079824,
"step": 6065,
"train_runtime": 10580.6679,
"train_tokens_per_second": 6434.36
},
{
"epoch": 2.912941741165177,
"grad_norm": 0.9457964301109314,
"learning_rate": 1.0562529756576179e-07,
"loss": 0.4287,
"num_input_tokens_seen": 68136632,
"step": 6070,
"train_runtime": 10591.0019,
"train_tokens_per_second": 6433.445
},
{
"epoch": 2.9153416931661367,
"grad_norm": 0.7782816290855408,
"learning_rate": 9.993607016704209e-08,
"loss": 0.4994,
"num_input_tokens_seen": 68192816,
"step": 6075,
"train_runtime": 10601.2725,
"train_tokens_per_second": 6432.512
},
{
"epoch": 2.9177416451670966,
"grad_norm": 0.7655016183853149,
"learning_rate": 9.440402449274188e-08,
"loss": 0.5164,
"num_input_tokens_seen": 68244208,
"step": 6080,
"train_runtime": 10610.674,
"train_tokens_per_second": 6431.656
},
{
"epoch": 2.9201415971680564,
"grad_norm": 0.8917096257209778,
"learning_rate": 8.902919546390776e-08,
"loss": 0.4609,
"num_input_tokens_seen": 68300352,
"step": 6085,
"train_runtime": 10620.9066,
"train_tokens_per_second": 6430.746
},
{
"epoch": 2.9225415491690168,
"grad_norm": 0.940250039100647,
"learning_rate": 8.381161700916906e-08,
"loss": 0.5296,
"num_input_tokens_seen": 68350392,
"step": 6090,
"train_runtime": 10630.1557,
"train_tokens_per_second": 6429.858
},
{
"epoch": 2.9249415011699766,
"grad_norm": 0.8829488158226013,
"learning_rate": 7.87513220644992e-08,
"loss": 0.5012,
"num_input_tokens_seen": 68405152,
"step": 6095,
"train_runtime": 10639.9288,
"train_tokens_per_second": 6429.099
},
{
"epoch": 2.9273414531709365,
"grad_norm": 0.9745586514472961,
"learning_rate": 7.384834257302687e-08,
"loss": 0.5022,
"num_input_tokens_seen": 68461336,
"step": 6100,
"train_runtime": 10650.3632,
"train_tokens_per_second": 6428.075
},
{
"epoch": 2.9297414051718964,
"grad_norm": 0.9082819819450378,
"learning_rate": 6.910270948482789e-08,
"loss": 0.477,
"num_input_tokens_seen": 68512936,
"step": 6105,
"train_runtime": 10660.0699,
"train_tokens_per_second": 6427.063
},
{
"epoch": 2.9321413571728563,
"grad_norm": 0.831038773059845,
"learning_rate": 6.451445275671986e-08,
"loss": 0.4894,
"num_input_tokens_seen": 68569728,
"step": 6110,
"train_runtime": 10670.2152,
"train_tokens_per_second": 6426.274
},
{
"epoch": 2.9345413091738166,
"grad_norm": 0.7757657170295715,
"learning_rate": 6.008360135208724e-08,
"loss": 0.4685,
"num_input_tokens_seen": 68623976,
"step": 6115,
"train_runtime": 10680.1954,
"train_tokens_per_second": 6425.348
},
{
"epoch": 2.9369412611747765,
"grad_norm": 0.8630353212356567,
"learning_rate": 5.581018324069543e-08,
"loss": 0.4904,
"num_input_tokens_seen": 68679096,
"step": 6120,
"train_runtime": 10691.1399,
"train_tokens_per_second": 6423.926
},
{
"epoch": 2.9393412131757364,
"grad_norm": 0.881776750087738,
"learning_rate": 5.169422539850477e-08,
"loss": 0.4671,
"num_input_tokens_seen": 68734576,
"step": 6125,
"train_runtime": 10700.7437,
"train_tokens_per_second": 6423.346
},
{
"epoch": 2.9417411651766967,
"grad_norm": 0.8964380025863647,
"learning_rate": 4.773575380750961e-08,
"loss": 0.469,
"num_input_tokens_seen": 68793128,
"step": 6130,
"train_runtime": 10711.0036,
"train_tokens_per_second": 6422.659
},
{
"epoch": 2.9441411171776566,
"grad_norm": 0.8133379220962524,
"learning_rate": 4.393479345557727e-08,
"loss": 0.5031,
"num_input_tokens_seen": 68847592,
"step": 6135,
"train_runtime": 10721.1224,
"train_tokens_per_second": 6421.678
},
{
"epoch": 2.9465410691786165,
"grad_norm": 0.6794693470001221,
"learning_rate": 4.0291368336276e-08,
"loss": 0.4709,
"num_input_tokens_seen": 68905096,
"step": 6140,
"train_runtime": 10731.8838,
"train_tokens_per_second": 6420.597
},
{
"epoch": 2.9489410211795763,
"grad_norm": 0.8234326839447021,
"learning_rate": 3.6805501448744505e-08,
"loss": 0.4638,
"num_input_tokens_seen": 68960224,
"step": 6145,
"train_runtime": 10741.5942,
"train_tokens_per_second": 6419.924
},
{
"epoch": 2.9513409731805362,
"grad_norm": 0.8420405387878418,
"learning_rate": 3.347721479751986e-08,
"loss": 0.5143,
"num_input_tokens_seen": 69014200,
"step": 6150,
"train_runtime": 10751.3552,
"train_tokens_per_second": 6419.116
},
{
"epoch": 2.9537409251814966,
"grad_norm": 0.876466691493988,
"learning_rate": 3.0306529392426507e-08,
"loss": 0.4258,
"num_input_tokens_seen": 69071584,
"step": 6155,
"train_runtime": 10761.6029,
"train_tokens_per_second": 6418.336
},
{
"epoch": 2.9561408771824564,
"grad_norm": 0.8103510737419128,
"learning_rate": 2.72934652484208e-08,
"loss": 0.4785,
"num_input_tokens_seen": 69125824,
"step": 6160,
"train_runtime": 10771.3537,
"train_tokens_per_second": 6417.561
},
{
"epoch": 2.9585408291834163,
"grad_norm": 0.9023430347442627,
"learning_rate": 2.4438041385480003e-08,
"loss": 0.5019,
"num_input_tokens_seen": 69183992,
"step": 6165,
"train_runtime": 10782.2651,
"train_tokens_per_second": 6416.462
},
{
"epoch": 2.960940781184376,
"grad_norm": 0.9007648825645447,
"learning_rate": 2.174027582848015e-08,
"loss": 0.4764,
"num_input_tokens_seen": 69243264,
"step": 6170,
"train_runtime": 10792.8565,
"train_tokens_per_second": 6415.657
},
{
"epoch": 2.963340733185336,
"grad_norm": 0.9024353623390198,
"learning_rate": 1.92001856070656e-08,
"loss": 0.499,
"num_input_tokens_seen": 69299200,
"step": 6175,
"train_runtime": 10803.5555,
"train_tokens_per_second": 6414.481
},
{
"epoch": 2.9657406851862964,
"grad_norm": 0.7554855942726135,
"learning_rate": 1.6817786755568553e-08,
"loss": 0.4397,
"num_input_tokens_seen": 69352824,
"step": 6180,
"train_runtime": 10812.9366,
"train_tokens_per_second": 6413.875
},
{
"epoch": 2.9681406371872563,
"grad_norm": 0.7788093686103821,
"learning_rate": 1.4593094312889688e-08,
"loss": 0.452,
"num_input_tokens_seen": 69415024,
"step": 6185,
"train_runtime": 10823.0536,
"train_tokens_per_second": 6413.627
},
{
"epoch": 2.970540589188216,
"grad_norm": 0.7968340516090393,
"learning_rate": 1.2526122322401024e-08,
"loss": 0.4915,
"num_input_tokens_seen": 69471512,
"step": 6190,
"train_runtime": 10832.747,
"train_tokens_per_second": 6413.102
},
{
"epoch": 2.972940541189176,
"grad_norm": 0.7601198554039001,
"learning_rate": 1.0616883831873758e-08,
"loss": 0.4443,
"num_input_tokens_seen": 69527768,
"step": 6195,
"train_runtime": 10842.6627,
"train_tokens_per_second": 6412.426
},
{
"epoch": 2.975340493190136,
"grad_norm": 0.8078719973564148,
"learning_rate": 8.86539089338112e-09,
"loss": 0.4387,
"num_input_tokens_seen": 69583024,
"step": 6200,
"train_runtime": 10852.4744,
"train_tokens_per_second": 6411.72
},
{
"epoch": 2.9777404451910963,
"grad_norm": 1.0166022777557373,
"learning_rate": 7.271654563223429e-09,
"loss": 0.4519,
"num_input_tokens_seen": 69639080,
"step": 6205,
"train_runtime": 10863.2159,
"train_tokens_per_second": 6410.54
},
{
"epoch": 2.980140397192056,
"grad_norm": 1.051282286643982,
"learning_rate": 5.835684901869809e-09,
"loss": 0.5355,
"num_input_tokens_seen": 69695440,
"step": 6210,
"train_runtime": 10873.6609,
"train_tokens_per_second": 6409.565
},
{
"epoch": 2.982540349193016,
"grad_norm": 0.9155645966529846,
"learning_rate": 4.5574909738804735e-09,
"loss": 0.4775,
"num_input_tokens_seen": 69752488,
"step": 6215,
"train_runtime": 10884.1415,
"train_tokens_per_second": 6408.635
},
{
"epoch": 2.9849403011939764,
"grad_norm": 0.8648121356964111,
"learning_rate": 3.4370808478595417e-09,
"loss": 0.4861,
"num_input_tokens_seen": 69804712,
"step": 6220,
"train_runtime": 10894.0291,
"train_tokens_per_second": 6407.612
},
{
"epoch": 2.9873402531949362,
"grad_norm": 0.9490159153938293,
"learning_rate": 2.474461596396749e-09,
"loss": 0.4641,
"num_input_tokens_seen": 69863384,
"step": 6225,
"train_runtime": 10903.095,
"train_tokens_per_second": 6407.665
},
{
"epoch": 2.989740205195896,
"grad_norm": 0.823014497756958,
"learning_rate": 1.6696392960341423e-09,
"loss": 0.4785,
"num_input_tokens_seen": 69920712,
"step": 6230,
"train_runtime": 10911.9258,
"train_tokens_per_second": 6407.733
},
{
"epoch": 2.992140157196856,
"grad_norm": 0.9870671629905701,
"learning_rate": 1.022619027207794e-09,
"loss": 0.4529,
"num_input_tokens_seen": 69978976,
"step": 6235,
"train_runtime": 10920.4005,
"train_tokens_per_second": 6408.096
},
{
"epoch": 2.994540109197816,
"grad_norm": 0.8132453560829163,
"learning_rate": 5.334048742394737e-10,
"loss": 0.4621,
"num_input_tokens_seen": 70037816,
"step": 6240,
"train_runtime": 10929.3119,
"train_tokens_per_second": 6408.255
},
{
"epoch": 2.996940061198776,
"grad_norm": 0.9090087413787842,
"learning_rate": 2.0199992529501554e-10,
"loss": 0.4757,
"num_input_tokens_seen": 70098000,
"step": 6245,
"train_runtime": 10938.0514,
"train_tokens_per_second": 6408.637
},
{
"epoch": 2.999340013199736,
"grad_norm": 0.8769118189811707,
"learning_rate": 2.8406272370440357e-11,
"loss": 0.463,
"num_input_tokens_seen": 70153968,
"step": 6250,
"train_runtime": 10946.6798,
"train_tokens_per_second": 6408.698
},
{
"epoch": 3.0,
"num_input_tokens_seen": 70167528,
"step": 6252,
"total_flos": 3.161046812140241e+18,
"train_loss": 0.5038315440246255,
"train_runtime": 10949.1572,
"train_samples_per_second": 27.399,
"train_steps_per_second": 0.571
}
],
"logging_steps": 5,
"max_steps": 6252,
"num_input_tokens_seen": 70167528,
"num_train_epochs": 3,
"save_steps": 1500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.161046812140241e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}